""" Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong. ================================================================================= The legit, measurable version of "odds şike": pockets (leagues / teams / bands) where the market's implied probability does NOT match realized frequency, so a SIMPLE rule (no model) is +EV. This is pure market inefficiency — soft pricing in obscure leagues, persistent team bias, etc. Discipline against false 'rigged' pockets (the multiple-comparison trap): * split history by time into HALF-1 (discover) and HALF-2 (validate) * a pocket counts ONLY if it is +EV in BOTH halves with enough bets each * report realized-vs-implied gap (the miscalibration) + ROI No model. Just odds vs outcomes. Read-only on the training CSV (104k matches with odds). Forward 'suspicious line movement' detection needs odds_history (currently empty) — separate, forward-only. Usage: python scripts/market_calibration.py --min-bets 120 --side fav """ from __future__ import annotations import argparse, os, sys import numpy as np, pandas as pd if sys.stdout and hasattr(sys.stdout, "reconfigure"): try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv") def league_names(ids): try: sys.path.insert(0, AI_DIR) from data.db import get_clean_dsn import psycopg2 from psycopg2.extras import RealDictCursor ids = [str(i) for i in ids if i is not None] for _ in range(3): try: with psycopg2.connect(get_clean_dsn()) as c: with c.cursor(cursor_factory=RealDictCursor) as cur: cur.execute("SELECT id,name FROM leagues WHERE id = ANY(%s)", (ids,)) return {str(r["id"]): r["name"] for r in cur.fetchall()} except Exception: import time; time.sleep(1) except Exception: pass return {} def team_names(ids): try: sys.path.insert(0, AI_DIR) from data.db import get_clean_dsn import psycopg2 from psycopg2.extras import RealDictCursor ids = [str(i) for i in ids if i is not None] for _ in range(3): try: with psycopg2.connect(get_clean_dsn()) as c: with c.cursor(cursor_factory=RealDictCursor) as cur: cur.execute("SELECT id,name FROM teams WHERE id = ANY(%s)", (ids,)) return {str(r["id"]): r["name"] for r in cur.fetchall()} except Exception: import time; time.sleep(1) except Exception: pass return {} def main(): ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF") ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds") args = ap.parse_args() df = pd.read_csv(CSV, low_memory=False, usecols=["match_id","league_id","home_team_id","away_team_id","mst_utc", "odds_ms_h","odds_ms_d","odds_ms_a","score_home","score_away"]) df = df.sort_values("mst_utc").reset_index(drop=True) sh = pd.to_numeric(df["score_home"],errors="coerce"); sa = pd.to_numeric(df["score_away"],errors="coerce") ok = sh.notna()&sa.notna() df = df[ok].reset_index(drop=True); sh=sh[ok.values].values; sa=sa[ok.values].values O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric,errors="coerce").fillna(0.0).values valid = (O>1.0).all(1) outcome = np.where(sh>sa,0,np.where(sh==sa,1,2)) # 0 home,1 draw,2 away fav = O.argmin(1); fav_odds = O[np.arange(len(O)),fav] fav_won = (fav==outcome).astype(float) fav_implied = 1.0/fav_odds pnl = np.where(fav_won, fav_odds-1.0, -1.0) half = (np.arange(len(df)) >= len(df)//2).astype(int) # 0=first half,1=second use = valid & (fav_odds <= args.fav_max) base = pd.DataFrame({ "league": df["league_id"].astype(str).values, "home": df["home_team_id"].astype(str).values, "fav_is_home": (fav==0), "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use, "fav_odds": fav_odds, }) b = base[base["use"]].copy() print(f"{len(b):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n") print(f"GLOBAL favourite: realized={100*b['won'].mean():.1f}% implied={100*b['implied'].mean():.1f}% " f"ROI={100*b['pnl'].mean():+.2f}% (negative = vig; market roughly right)") def scan(groupcol, label, namefn, min_bets): rows=[] for key,d in b.groupby(groupcol): h0=d[d["half"]==0]; h1=d[d["half"]==1] if len(h0)0 and r1>0 rows.append((min(r0,r1), key, len(d), 100*d["pnl"].mean(), r0, r1, gap, both_pos)) rows.sort(reverse=True) names = namefn([r[1] for r in rows[:40]]) print(f"\n{'='*82}\n{label} (✓ = +EV in BOTH halves, the only trustworthy ones)\n{'='*82}") print(f" {'name':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7} ✓") print(" "+"-"*72) shown=0 for mn,key,n,roi,r0,r1,gap,both in rows: if shown>=20 and not both: continue nm=(names.get(key,key) or key)[:28] mark = "✓" if both else "" print(f" {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f} {mark}") shown+=1 if shown>=25: break good=[r for r in rows if r[7]] print(f"\n -> {len(good)} {label.split()[0].lower()} pockets are +EV in BOTH halves " f"(out of {len(rows)} with enough data)") return good scan("league", "BY LEAGUE (favourite flat bet)", league_names, args.min_bets) # team: only when the team is the home favourite (cleanest, most samples) bt = b[b["fav_is_home"]] globals()['b'] = bt # reuse scan on home-favourite subset # inline team scan rows=[] for key,d in bt.groupby("home"): h0=d[d["half"]==0]; h1=d[d["half"]==1] if len(h0)0 and r1>0)) rows.sort(reverse=True) tn=team_names([r[1] for r in rows[:40]]) print(f"\n{'='*82}\nBY TEAM as HOME FAVOURITE (✓ = +EV both halves)\n{'='*82}") print(f" {'team':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7} ✓") print(" "+"-"*72) for mn,key,n,roi,r0,r1,gap,both in rows[:22]: nm=(tn.get(key,key) or key)[:28]; mark="✓" if both else "" print(f" {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f} {mark}") good=[r for r in rows if r[7]] print(f"\n -> {len(good)} teams +EV in BOTH halves (out of {len(rows)})") print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).") print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.") if __name__ == "__main__": main()