# iddaai-be/ai-engine/scripts/market_calibration.py

"""
Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong.
=================================================================================
The legit, measurable version of "odds şike": pockets (leagues / teams / bands)
where the market's implied probability does NOT match realized frequency, so a
SIMPLE rule (no model) is +EV. This is pure market inefficiency — soft pricing
in obscure leagues, persistent team bias, etc.

Discipline against false 'rigged' pockets (the multiple-comparison trap):
  * split history by time into HALF-1 (discover) and HALF-2 (validate)
  * a pocket counts ONLY if it is +EV in BOTH halves with enough bets each
  * report realized-vs-implied gap (the miscalibration) + ROI

No model. Just odds vs outcomes. Read-only on the training CSV (104k matches
with odds). Forward 'suspicious line movement' detection needs odds_history
(currently empty) — separate, forward-only.

Usage: python scripts/market_calibration.py --min-bets 120 --fav-max 2.5
"""
from __future__ import annotations
import argparse, os, sys
import numpy as np, pandas as pd

# Switch stdout to UTF-8 whenever the stream can be reconfigured (the report
# prints non-ASCII characters such as the check mark). Failure is ignored on
# purpose: the script should still run on streams that refuse the change.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# AI_DIR is the directory one level above the folder that holds this script.
_THIS_FILE = os.path.abspath(__file__)
AI_DIR = os.path.dirname(os.path.dirname(_THIS_FILE))
# Training CSV (matches with odds and scores) that main() reads.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")


def league_names(ids):
    """Best-effort lookup of league names for the given league ids.

    The lookup is purely cosmetic (it only labels report rows), so it never
    raises: any import, connection or query failure yields an empty mapping.

    Args:
        ids: iterable of league ids. ``None`` entries are skipped and the
            remaining ids are passed to the query as strings.

    Returns:
        Dict mapping ``str(id)`` to the ``name`` column of the ``leagues``
        table for every row found; ``{}`` when there is nothing to look up or
        when all connection attempts fail.
    """
    attempts = 3
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            return {}  # nothing to resolve: skip the imports and the DB round trip
        if AI_DIR not in sys.path:
            # Needed so that `data.db` below can be imported; insert only once
            # instead of growing sys.path on every call.
            sys.path.insert(0, AI_DIR)
        import time
        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor
        for attempt in range(attempts):
            try:
                conn = psycopg2.connect(get_clean_dsn())
                try:
                    with conn.cursor(cursor_factory=RealDictCursor) as cur:
                        cur.execute("SELECT id,name FROM leagues WHERE id = ANY(%s)", (wanted,))
                        return {str(r["id"]): r["name"] for r in cur.fetchall()}
                finally:
                    # psycopg2's connection context manager only ends the
                    # transaction; the connection must be closed explicitly.
                    conn.close()
            except Exception:
                # Pause before the next attempt, but not after the last one.
                if attempt < attempts - 1:
                    time.sleep(1)
    except Exception:
        pass
    return {}


def team_names(ids):
    """Best-effort lookup of team names for the given team ids.

    The lookup is purely cosmetic (it only labels report rows), so it never
    raises: any import, connection or query failure yields an empty mapping.

    Args:
        ids: iterable of team ids. ``None`` entries are skipped and the
            remaining ids are passed to the query as strings.

    Returns:
        Dict mapping ``str(id)`` to the ``name`` column of the ``teams`` table
        for every row found; ``{}`` when there is nothing to look up or when
        all connection attempts fail.
    """
    attempts = 3
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            return {}  # nothing to resolve: skip the imports and the DB round trip
        if AI_DIR not in sys.path:
            # Needed so that `data.db` below can be imported; insert only once
            # instead of growing sys.path on every call.
            sys.path.insert(0, AI_DIR)
        import time
        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor
        for attempt in range(attempts):
            try:
                conn = psycopg2.connect(get_clean_dsn())
                try:
                    with conn.cursor(cursor_factory=RealDictCursor) as cur:
                        cur.execute("SELECT id,name FROM teams WHERE id = ANY(%s)", (wanted,))
                        return {str(r["id"]): r["name"] for r in cur.fetchall()}
                finally:
                    # psycopg2's connection context manager only ends the
                    # transaction; the connection must be closed explicitly.
                    conn.close()
            except Exception:
                # Pause before the next attempt, but not after the last one.
                if attempt < attempts - 1:
                    time.sleep(1)
    except Exception:
        pass
    return {}


def _pocket_rows(bets, groupcol, min_bets):
    """Summarise flat favourite bets per group, keeping groups with enough bets in each half.

    Args:
        bets: frame with ``won``, ``implied``, ``pnl`` and ``half`` (0/1) columns.
        groupcol: name of the column to group by.
        min_bets: minimum number of bets required in EACH half. Floored at 1 so
            an empty half can never produce a NaN ROI.

    Returns:
        List of ``(worst_half_roi, key, n, roi, roi_h1, roi_h2, gap, both_pos)``
        tuples sorted best-first. ROI values and ``gap`` are percentages;
        ``gap`` is realized win rate minus mean implied probability (positive
        means the market underprices the favourite).
    """
    need = max(1, min_bets)
    rows = []
    for key, grp in bets.groupby(groupcol):
        first = grp[grp["half"] == 0]
        second = grp[grp["half"] == 1]
        if len(first) < need or len(second) < need:
            continue
        roi_h1 = 100 * first["pnl"].mean()
        roi_h2 = 100 * second["pnl"].mean()
        gap = 100 * (grp["won"].mean() - grp["implied"].mean())
        both_pos = bool(roi_h1 > 0 and roi_h2 > 0)
        rows.append((min(roi_h1, roi_h2), key, len(grp), 100 * grp["pnl"].mean(),
                     roi_h1, roi_h2, gap, both_pos))
    rows.sort(reverse=True)
    return rows


def _league_rows_to_show(rows):
    """Pick the league rows to print: at most 25, and past the 20th only ✓ rows."""
    shown = []
    for row in rows:
        if len(shown) >= 20 and not row[7]:
            continue
        shown.append(row)
        if len(shown) >= 25:
            break
    return shown


def _print_table(title, first_col, rows, names):
    """Print one report table; ``names`` maps group keys to display names."""
    rule = "=" * 82
    print(f"\n{rule}\n{title}\n{rule}")
    print(f"  {first_col:<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7}  ✓")
    print("  " + "-" * 72)
    for _, key, n, roi, roi_h1, roi_h2, gap, both in rows:
        # Fall back to the raw key when no name was found (or it is empty).
        nm = (names.get(key, key) or key)[:28]
        mark = "✓" if both else ""
        print(f"  {nm:<30}{n:>6}{roi:>+7.1f}{roi_h1:>+7.1f}{roi_h2:>+7.1f}{gap:>+7.1f}  {mark}")


def main():
    """Run the market calibration scan and print the league and team reports.

    Reads the training CSV, takes the lowest-odds outcome of each match as the
    favourite, simulates a flat 1-unit bet on it, splits the time-sorted
    matches into two halves by row position, and reports per-league and
    per-home-team pockets together with their ROI in each half.

    Command-line flags: ``--min-bets`` (minimum bets per half for a league;
    teams use ``max(25, min_bets // 3)``) and ``--fav-max`` (only favourites at
    or below these odds are counted).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF")
    ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds")
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False,
                     usecols=["match_id", "league_id", "home_team_id", "away_team_id", "mst_utc",
                              "odds_ms_h", "odds_ms_d", "odds_ms_a", "score_home", "score_away"])
    df = df.sort_values("mst_utc").reset_index(drop=True)

    # Keep only matches whose two scores parse as numbers.
    score_h = pd.to_numeric(df["score_home"], errors="coerce")
    score_a = pd.to_numeric(df["score_away"], errors="coerce")
    scored = (score_h.notna() & score_a.notna()).values
    df = df[scored].reset_index(drop=True)
    score_h = score_h.values[scored]
    score_a = score_a.values[scored]

    # Missing/unparseable odds become 0.0 and are rejected by the `valid` mask.
    odds = (df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]]
            .apply(pd.to_numeric, errors="coerce").fillna(0.0).values)
    valid = (odds > 1.0).all(axis=1)
    outcome = np.where(score_h > score_a, 0, np.where(score_h == score_a, 1, 2))   # 0 home,1 draw,2 away
    fav = odds.argmin(axis=1)
    fav_odds = odds[np.arange(len(odds)), fav]
    fav_hit = fav == outcome
    fav_won = fav_hit.astype(float)
    # Rows with 0.0 odds would warn on division; they are dropped via `use` anyway.
    with np.errstate(divide="ignore"):
        fav_implied = 1.0 / fav_odds
    pnl = np.where(fav_hit, fav_odds - 1.0, -1.0)
    half = (np.arange(len(df)) >= len(df) // 2).astype(int)   # 0=first half,1=second
    use = valid & (fav_odds <= args.fav_max)

    base = pd.DataFrame({
        "league": df["league_id"].astype(str).values,
        "home": df["home_team_id"].astype(str).values,
        "fav_is_home": (fav == 0),
        "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use,
        "fav_odds": fav_odds,
    })
    b = base[base["use"]].copy()
    print(f"{len(b):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n")
    print(f"GLOBAL favourite: realized={100*b['won'].mean():.1f}% implied={100*b['implied'].mean():.1f}% "
          f"ROI={100*b['pnl'].mean():+.2f}%  (negative = vig; market roughly right)")

    # --- by league -------------------------------------------------------
    league_rows = _pocket_rows(b, "league", args.min_bets)
    league_lookup = league_names([r[1] for r in league_rows[:40]])
    _print_table("BY LEAGUE (favourite flat bet)  (✓ = +EV in BOTH halves, the only trustworthy ones)",
                 "name", _league_rows_to_show(league_rows), league_lookup)
    league_good = [r for r in league_rows if r[7]]
    print(f"\n  -> {len(league_good)} league pockets are +EV in BOTH halves "
          f"(out of {len(league_rows)} with enough data)")

    # --- by team: only when the team is the home favourite (cleanest, most samples)
    home_fav = b[b["fav_is_home"]]
    team_rows = _pocket_rows(home_fav, "home", max(25, args.min_bets // 3))
    team_lookup = team_names([r[1] for r in team_rows[:40]])
    _print_table("BY TEAM as HOME FAVOURITE  (✓ = +EV both halves)",
                 "team", team_rows[:22], team_lookup)
    team_good = [r for r in team_rows if r[7]]
    print(f"\n  -> {len(team_good)} teams +EV in BOTH halves (out of {len(team_rows)})")
    print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).")
    print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.")


# Run the scan only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()