gg

2026-06-06 14:08:30 +03:00
parent 9e41407cb5
commit 1c03fa5e1c
2 changed files with 345 additions and 0 deletions
@@ -0,0 +1,162 @@
+"""
+Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong.
+=================================================================================
+The legit, measurable version of "odds şike" (Turkish: rigged odds): pockets
+(leagues / teams / bands) where the market's implied probability does NOT
+match the realized frequency, so that a SIMPLE rule (no model) is +EV. This is
+pure market inefficiency — soft pricing in obscure leagues, persistent team
+bias, etc.
+
+Discipline against false 'rigged' pockets (the multiple-comparison trap):
+  * split history by time into HALF-1 (discover) and HALF-2 (validate)
+  * a pocket counts ONLY if it is +EV in BOTH halves with enough bets each
+  * report realized-vs-implied gap (the miscalibration) + ROI
+
+No model. Just odds vs outcomes. Read-only on the training CSV (104k matches
+with odds). Forward 'suspicious line movement' detection needs odds_history
+(currently empty) — separate, forward-only.
+
+Usage: python scripts/market_calibration.py --min-bets 120 --fav-max 2.5
+"""
+from __future__ import annotations
+import argparse, os, sys
+import numpy as np, pandas as pd
+
+# Switch stdout to UTF-8 where the stream supports it, so the non-ASCII
+# characters this script prints (e.g. "✓") can be encoded. Failure is ignored:
+# the script then simply keeps the stream's existing encoding.
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+# AI_DIR is the parent of the directory containing this file; the training
+# CSV is expected under <AI_DIR>/data/.
+AI_DIR = os.path.dirname(
+    os.path.dirname(os.path.abspath(__file__))
+)
+CSV = os.path.join(os.path.join(AI_DIR, "data"), "training_data_v27.csv")
+
+
+def league_names(ids):
+    """Best-effort lookup of league display names.
+
+    Args:
+        ids: iterable of league ids; ``None`` entries are dropped and the
+            rest are converted to ``str`` before querying.
+
+    Returns:
+        Dict mapping ``str(id)`` to the league name for every id found in
+        the ``leagues`` table. Returns ``{}`` when there is nothing to look
+        up or when the lookup fails (imports unavailable, or all three
+        connection attempts fail), so callers can fall back to raw ids.
+    """
+    try:
+        ids = [str(i) for i in ids if i is not None]
+        if not ids:
+            return {}  # nothing to resolve: skip the DB round-trip entirely
+        import time
+        sys.path.insert(0, AI_DIR)
+        from data.db import get_clean_dsn
+        import psycopg2
+        from psycopg2.extras import RealDictCursor
+        attempts = 3
+        for attempt in range(attempts):
+            conn = None
+            try:
+                conn = psycopg2.connect(get_clean_dsn())
+                with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                    # NOTE(review): ids are sent as strings — confirm that
+                    # leagues.id compares cleanly against text values.
+                    cur.execute("SELECT id,name FROM leagues WHERE id = ANY(%s)", (ids,))
+                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
+            except Exception:
+                # Pause before retrying, but not after the final attempt.
+                if attempt + 1 < attempts:
+                    time.sleep(1)
+            finally:
+                # `with connection:` in psycopg2 only ends the transaction;
+                # the connection must be closed explicitly or it leaks.
+                if conn is not None:
+                    conn.close()
+    except Exception:
+        # Deliberately best-effort: names are cosmetic, so any failure
+        # degrades to an empty mapping instead of aborting the scan.
+        pass
+    return {}
+
+
+def team_names(ids):
+    """Best-effort lookup of team display names.
+
+    Args:
+        ids: iterable of team ids; ``None`` entries are dropped and the rest
+            are converted to ``str`` before querying.
+
+    Returns:
+        Dict mapping ``str(id)`` to the team name for every id found in the
+        ``teams`` table. Returns ``{}`` when there is nothing to look up or
+        when the lookup fails (imports unavailable, or all three connection
+        attempts fail), so callers can fall back to raw ids.
+    """
+    try:
+        ids = [str(i) for i in ids if i is not None]
+        if not ids:
+            return {}  # nothing to resolve: skip the DB round-trip entirely
+        import time
+        sys.path.insert(0, AI_DIR)
+        from data.db import get_clean_dsn
+        import psycopg2
+        from psycopg2.extras import RealDictCursor
+        attempts = 3
+        for attempt in range(attempts):
+            conn = None
+            try:
+                conn = psycopg2.connect(get_clean_dsn())
+                with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                    # NOTE(review): ids are sent as strings — confirm that
+                    # teams.id compares cleanly against text values.
+                    cur.execute("SELECT id,name FROM teams WHERE id = ANY(%s)", (ids,))
+                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
+            except Exception:
+                # Pause before retrying, but not after the final attempt.
+                if attempt + 1 < attempts:
+                    time.sleep(1)
+            finally:
+                # `with connection:` in psycopg2 only ends the transaction;
+                # the connection must be closed explicitly or it leaks.
+                if conn is not None:
+                    conn.close()
+    except Exception:
+        # Deliberately best-effort: names are cosmetic, so any failure
+        # degrades to an empty mapping instead of aborting the scan.
+        pass
+    return {}
+
+
+def _pocket_rows(frame, groupcol, min_bets):
+    """Summarise favourite flat-bet results for every group in *frame*.
+
+    Args:
+        frame: bets table with the columns "half" (0/1), "pnl", "won" and
+            "implied", plus the grouping column.
+        groupcol: name of the column to group by.
+        min_bets: minimum number of bets a group needs in EACH half.
+
+    Returns:
+        List of tuples ``(min(r0, r1), key, n, roi, r0, r1, gap, both)``
+        sorted best-first. r0/r1 are the ROI% of the first/second half, roi
+        is the overall ROI%, gap is realized minus implied win rate in
+        percentage points, and both is true when r0 and r1 are both > 0.
+    """
+    rows = []
+    for key, d in frame.groupby(groupcol):
+        h0 = d[d["half"] == 0]
+        h1 = d[d["half"] == 1]
+        if len(h0) < min_bets or len(h1) < min_bets:
+            continue
+        r0 = 100 * h0["pnl"].mean()
+        r1 = 100 * h1["pnl"].mean()
+        # miscalibration gap: realized - implied (positive = market underprices the favourite)
+        gap = 100 * (d["won"].mean() - d["implied"].mean())
+        rows.append((min(r0, r1), key, len(d), 100 * d["pnl"].mean(), r0, r1, gap, r0 > 0 and r1 > 0))
+    # Rank by the WORSE of the two halves so one lucky half cannot carry a pocket.
+    rows.sort(reverse=True)
+    return rows
+
+
+def main():
+    """Run the calibration scan and print the league and team tables.
+
+    Reads the training CSV, keeps matches with a parsable score, and flat-bets
+    the lowest-priced 1X2 outcome of every match whose three odds are all
+    > 1.0 and whose favourite price is <= ``--fav-max``. Rows are sorted by
+    ``mst_utc`` and split by position into two halves; a group is marked "✓"
+    only if the bet has positive ROI in both halves.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF")
+    ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds")
+    args = ap.parse_args()
+
+    df = pd.read_csv(CSV, low_memory=False,
+                     usecols=["match_id","league_id","home_team_id","away_team_id","mst_utc",
+                              "odds_ms_h","odds_ms_d","odds_ms_a","score_home","score_away"])
+    df = df.sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df = df[ok].reset_index(drop=True)
+    sh = sh[ok.values].values
+    sa = sa[ok.values].values
+
+    # Missing/unparsable odds become 0.0 so they fail the `valid` test below.
+    odds_cols = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
+    O = df[odds_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0).to_numpy(dtype=float)
+    valid = (O > 1.0).all(1)
+    outcome = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))   # 0 home,1 draw,2 away
+    fav = O.argmin(1)
+    fav_odds = O[np.arange(len(O)), fav]
+    fav_hit = fav == outcome
+    fav_won = fav_hit.astype(float)
+    # Rows with a 0.0 placeholder price would warn on division; they are
+    # dropped by `use` below, so the warning is just noise.
+    with np.errstate(divide="ignore"):
+        fav_implied = 1.0 / fav_odds
+    pnl = np.where(fav_hit, fav_odds - 1.0, -1.0)
+    half = (np.arange(len(df)) >= len(df) // 2).astype(int)   # 0=first half,1=second
+    use = valid & (fav_odds <= args.fav_max)
+
+    base = pd.DataFrame({
+        "league": df["league_id"].astype(str).values,
+        "home": df["home_team_id"].astype(str).values,
+        "fav_is_home": (fav == 0),
+        "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use,
+        "fav_odds": fav_odds,
+    })
+    b = base[base["use"]].copy()
+    print(f"{len(b):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n")
+    print(f"GLOBAL favourite: realized={100*b['won'].mean():.1f}% implied={100*b['implied'].mean():.1f}% "
+          f"ROI={100*b['pnl'].mean():+.2f}%  (negative = vig; market roughly right)")
+
+    def scan(groupcol, label, namefn, min_bets):
+        """Print the per-group table for *groupcol* and return the ✓ rows."""
+        rows = _pocket_rows(b, groupcol, min_bets)
+        names = namefn([r[1] for r in rows[:40]])
+        print(f"\n{'='*82}\n{label}  (✓ = +EV in BOTH halves, the only trustworthy ones)\n{'='*82}")
+        print(f"  {'name':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7}  ✓")
+        print("  "+"-"*72)
+        shown = 0
+        for mn, key, n, roi, r0, r1, gap, both in rows:
+            # After the first 20 lines only ✓ rows are still listed (25 lines max).
+            if shown >= 20 and not both:
+                continue
+            nm = (names.get(key, key) or key)[:28]
+            mark = "✓" if both else ""
+            print(f"  {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f}  {mark}")
+            shown += 1
+            if shown >= 25:
+                break
+        good = [r for r in rows if r[7]]
+        # Use the grouping column for the noun; the first word of the label is just "BY".
+        print(f"\n  -> {len(good)} {groupcol} pockets are +EV in BOTH halves "
+              f"(out of {len(rows)} with enough data)")
+        return good
+
+    scan("league", "BY LEAGUE (favourite flat bet)", league_names, args.min_bets)
+
+    # team: only when the team is the home favourite (cleanest, most samples)
+    bt = b[b["fav_is_home"]]
+    # Teams have far fewer bets than leagues, so the per-half floor is relaxed.
+    team_min = max(25, args.min_bets // 3)
+    rows = _pocket_rows(bt, "home", team_min)
+    tn = team_names([r[1] for r in rows[:40]])
+    print(f"\n{'='*82}\nBY TEAM as HOME FAVOURITE  (✓ = +EV both halves)\n{'='*82}")
+    print(f"  {'team':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7}  ✓")
+    print("  "+"-"*72)
+    for mn, key, n, roi, r0, r1, gap, both in rows[:22]:
+        nm = (tn.get(key, key) or key)[:28]
+        mark = "✓" if both else ""
+        print(f"  {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f}  {mark}")
+    good = [r for r in rows if r[7]]
+    print(f"\n  -> {len(good)} teams +EV in BOTH halves (out of {len(rows)})")
+    print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).")
+    print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.")
+
+
+# Run the scan only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()