gg

2026-06-06 14:08:30 +03:00
parent 9e41407cb5
commit 1c03fa5e1c
2 changed files with 345 additions and 0 deletions
@@ -0,0 +1,162 @@
 """
 Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong.
 =================================================================================
 The legit, measurable version of "odds şike": pockets (leagues / teams / bands)
 where the market's implied probability does NOT match realized frequency, so a
 SIMPLE rule (no model) is +EV. This is pure market inefficiency — soft pricing
 in obscure leagues, persistent team bias, etc.
 Discipline against false 'rigged' pockets (the multiple-comparison trap):
  * split history by time into HALF-1 (discover) and HALF-2 (validate)
  * a pocket counts ONLY if it is +EV in BOTH halves with enough bets each
  * report realized-vs-implied gap (the miscalibration) + ROI
 No model. Just odds vs outcomes. Read-only on the training CSV (104k matches
 with odds). Forward 'suspicious line movement' detection needs odds_history
 (currently empty) — separate, forward-only.
 Usage: python scripts/market_calibration.py --min-bets 120 --fav-max 2.5
 """
 from __future__ import annotations
 import argparse, os, sys
 import numpy as np, pandas as pd
 # Switch stdout to UTF-8 where the stream supports it; the report prints
 # non-ASCII characters (e.g. the check mark). Failure is deliberately ignored.
 if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try: sys.stdout.reconfigure(encoding="utf-8")
    except Exception: pass
 # AI_DIR is two directory levels above this file's absolute path.
 AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 # Training CSV that main() reads (never written by this script).
 CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
 def league_names(ids):
    """Best-effort lookup of league display names.

    Args:
        ids: iterable of league ids. ``None`` entries are dropped and the
            remaining ids are stringified before being sent to the database.

    Returns:
        dict mapping ``str(id)`` to the league name. An empty dict is returned
        when there is nothing to look up, when the DB helper / driver cannot be
        imported, or when all three connection attempts fail, so callers can
        fall back to printing the raw ids.
    """
    import time
    from contextlib import closing
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            return {}   # nothing to resolve: skip the DB round-trip entirely
        if AI_DIR not in sys.path:
            sys.path.insert(0, AI_DIR)   # make `data.db` importable, once
        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor
    except Exception:
        return {}
    attempts = 3
    for attempt in range(attempts):
        try:
            # psycopg2's own `with connection:` only ends the transaction;
            # closing() guarantees the connection itself is released.
            with closing(psycopg2.connect(get_clean_dsn())) as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id,name FROM leagues WHERE id = ANY(%s)", (wanted,))
                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
        except Exception:
            # names are cosmetic: retry, but do not sleep after the last attempt
            if attempt < attempts - 1:
                time.sleep(1)
    return {}
 def team_names(ids):
    """Best-effort lookup of team display names.

    Args:
        ids: iterable of team ids. ``None`` entries are dropped and the
            remaining ids are stringified before being sent to the database.

    Returns:
        dict mapping ``str(id)`` to the team name. An empty dict is returned
        when there is nothing to look up, when the DB helper / driver cannot be
        imported, or when all three connection attempts fail, so callers can
        fall back to printing the raw ids.
    """
    import time
    from contextlib import closing
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            return {}   # nothing to resolve: skip the DB round-trip entirely
        if AI_DIR not in sys.path:
            sys.path.insert(0, AI_DIR)   # make `data.db` importable, once
        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor
    except Exception:
        return {}
    attempts = 3
    for attempt in range(attempts):
        try:
            # psycopg2's own `with connection:` only ends the transaction;
            # closing() guarantees the connection itself is released.
            with closing(psycopg2.connect(get_clean_dsn())) as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id,name FROM teams WHERE id = ANY(%s)", (wanted,))
                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
        except Exception:
            # names are cosmetic: retry, but do not sleep after the last attempt
            if attempt < attempts - 1:
                time.sleep(1)
    return {}
 def _pocket_rows(bets, groupcol, min_bets):
    """Compute time-split ROI statistics for every group of ``bets``.

    Args:
        bets: DataFrame with ``half`` (0/1), ``pnl``, ``won`` and ``implied`` columns.
        groupcol: name of the column to group by.
        min_bets: minimum number of bets required in EACH half. Floored at 1 so
            an empty half can never contribute a NaN ROI.

    Returns:
        list of ``(worst_half_roi, key, n, roi, roi_h1, roi_h2, gap, both_positive)``
        tuples (all rates in percent), best worst-half ROI first.
    """
    floor = max(1, min_bets)
    rows = []
    for key, grp in bets.groupby(groupcol):
        first, second = grp[grp["half"] == 0], grp[grp["half"] == 1]
        if len(first) < floor or len(second) < floor:
            continue
        roi_first = 100 * first["pnl"].mean()
        roi_second = 100 * second["pnl"].mean()
        # miscalibration gap: realized - implied (positive = market underprices the favourite)
        gap = 100 * (grp["won"].mean() - grp["implied"].mean())
        rows.append((min(roi_first, roi_second), key, len(grp), 100 * grp["pnl"].mean(),
                     roi_first, roi_second, gap, roi_first > 0 and roi_second > 0))
    rows.sort(reverse=True)
    return rows
 def _print_pocket_header(title, first_col):
    """Print the banner and column header of one pocket table."""
    print(f"\n{'='*82}\n{title}\n{'='*82}")
    print(f"  {first_col:<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7}  ✓")
    print("  " + "-" * 72)
 def _print_pocket_row(names, row):
    """Print one pocket row, using the resolved name when available (else the raw key)."""
    _, key, n, roi, roi_first, roi_second, gap, both = row
    nm = (names.get(key, key) or key)[:28]
    mark = "✓" if both else ""
    print(f"  {nm:<30}{n:>6}{roi:>+7.1f}{roi_first:>+7.1f}{roi_second:>+7.1f}{gap:>+7.1f}  {mark}")
 def main():
    """Scan the training CSV for favourite-bet pockets that are +EV in both time halves.

    Reads the CSV at ``CSV``, keeps matches with a numeric score, flat-bets the
    lowest-odds selection of each match whose three 1X2 odds are all > 1.0 and
    whose favourite odds are <= ``--fav-max``, splits the time-sorted matches
    into two halves by row position, and prints:
      * the global favourite calibration / ROI,
      * a per-league table (at least ``--min-bets`` bets per half),
      * a per-team table for home favourites (at least ``max(25, min_bets // 3)``
        bets per half).
    Nothing is returned; all output goes to stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF")
    ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds")
    args = ap.parse_args()
    df = pd.read_csv(CSV, low_memory=False,
                     usecols=["match_id", "league_id", "home_team_id", "away_team_id", "mst_utc",
                              "odds_ms_h", "odds_ms_d", "odds_ms_a", "score_home", "score_away"])
    df = df.sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    scored = sh.notna() & sa.notna()
    df = df[scored].reset_index(drop=True)
    sh = sh[scored.values].values
    sa = sa[scored.values].values
    # missing / unparsable odds become 0.0 and are rejected by `valid` below
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    valid = (odds > 1.0).all(axis=1)
    outcome = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))   # 0 home,1 draw,2 away
    fav = odds.argmin(axis=1)
    fav_odds = odds[np.arange(len(odds)), fav]
    fav_hit = fav == outcome
    fav_won = fav_hit.astype(float)
    # rows with 0.0 odds yield inf here; they are excluded by `use`, so silence the warning
    with np.errstate(divide="ignore"):
        fav_implied = 1.0 / fav_odds
    pnl = np.where(fav_hit, fav_odds - 1.0, -1.0)
    half = (np.arange(len(df)) >= len(df) // 2).astype(int)   # 0=first half,1=second
    use = valid & (fav_odds <= args.fav_max)
    base = pd.DataFrame({
        "league": df["league_id"].astype(str).values,
        "home": df["home_team_id"].astype(str).values,
        "fav_is_home": (fav == 0),
        "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use,
        "fav_odds": fav_odds,
    })
    bets = base[base["use"]].copy()
    print(f"{len(bets):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n")
    print(f"GLOBAL favourite: realized={100*bets['won'].mean():.1f}% implied={100*bets['implied'].mean():.1f}% "
          f"ROI={100*bets['pnl'].mean():+.2f}%  (negative = vig; market roughly right)")
    # ---- by league -------------------------------------------------------
    league_rows = _pocket_rows(bets, "league", args.min_bets)
    names = league_names([r[1] for r in league_rows[:40]])
    _print_pocket_header("BY LEAGUE (favourite flat bet)  (✓ = +EV in BOTH halves, the only trustworthy ones)", "name")
    shown = 0
    for row in league_rows:
        # after the top 20, only rows that passed both halves are still worth showing
        if shown >= 20 and not row[7]:
            continue
        _print_pocket_row(names, row)
        shown += 1
        if shown >= 25:
            break
    good = [r for r in league_rows if r[7]]
    print(f"\n  -> {len(good)} league pockets are +EV in BOTH halves "
          f"(out of {len(league_rows)} with enough data)")
    # ---- by team: only when the team is the home favourite (cleanest, most samples)
    home_fav = bets[bets["fav_is_home"]]
    team_rows = _pocket_rows(home_fav, "home", max(25, args.min_bets // 3))
    tn = team_names([r[1] for r in team_rows[:40]])
    _print_pocket_header("BY TEAM as HOME FAVOURITE  (✓ = +EV both halves)", "team")
    for row in team_rows[:22]:
        _print_pocket_row(tn, row)
    good = [r for r in team_rows if r[7]]
    print(f"\n  -> {len(good)} teams +EV in BOTH halves (out of {len(team_rows)})")
    print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).")
    print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.")
 # Run the calibration scan only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,183 @@
 """
 Odds Movement Monitor — forward steam / odds-anomaly ("şike" signal) detector.
 =============================================================================
 The only viable version of "detect odds manipulation": capture upcoming-match
 odds PERIODICALLY and flag abnormal moves (steam = a price shortening fast =
 money/information arriving, sometimes a fixed match). Retrospective detection is
 impossible here (odds_history empty); this builds the time-series going forward.
 No schema change: snapshots append to data/odds_snapshots.jsonl (reads
 live_matches.odds, which the feeder refreshes every 15 min).
 Run --snapshot every ~15-20 min (scheduler). Run --report anytime to see the
 current movement watchlist.
 For a CLOSING-time bettor the use is mainly a RISK FILTER: a match with heavy
 unexplained late steam against your pick = the market knows something you don't
 → skip it. (Profiting from steam needs betting BEFORE it, i.e. early.)
 Usage:
  python scripts/monitor_odds_movement.py --snapshot     # capture now (cron this)
  python scripts/monitor_odds_movement.py --report       # show movement watchlist
  python scripts/monitor_odds_movement.py --report --min-move 0.10
 """
 from __future__ import annotations
 import argparse, json, os, sys, time, datetime
 from collections import defaultdict
 # Switch stdout to UTF-8 where the stream supports it; the report prints
 # non-ASCII characters (arrows, Turkish market names). Failure is ignored.
 if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try: sys.stdout.reconfigure(encoding="utf-8")
    except Exception: pass
 # AI_DIR is two directory levels above this file's absolute path; it is put
 # first on sys.path so the function-level `from data.db import ...` resolves.
 AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, AI_DIR)
 # Append-only JSON-lines file: snapshot() writes it, report() reads it.
 SNAP = os.path.join(AI_DIR, "data", "odds_snapshots.jsonl")
 # markets tracked for steam (Turkish keys as stored in live_matches.odds)
 # Maps market name -> selection keys kept in each snapshot.
 TRACK = {"Maç Sonucu": ["1", "X", "2"],
         "2,5 Alt/Üst": ["Üst", "Alt"],
         "Karşılıklı Gol": ["Var", "Yok"]}
 def _conn():
    """Open a psycopg2 connection, retrying failed attempts.

    Up to three attempts are made, pausing 1.2 s between them (but not after
    the final one, so a hard failure is reported without extra delay).

    Returns:
        An open psycopg2 connection. The caller is responsible for closing it.

    Raises:
        ImportError: if ``data.db`` or ``psycopg2`` cannot be imported.
        Exception: whatever the last failed connection attempt raised.
    """
    from data.db import get_clean_dsn
    import psycopg2
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            return psycopg2.connect(get_clean_dsn())
        except Exception:
            if attempt == attempts:
                raise   # out of retries: surface the real error
            time.sleep(1.2)
 def _f(x):
    try: return float(x)
    except (TypeError, ValueError): return None
 def _compact_odds(odds):
    """Reduce a raw odds payload to the tracked markets and their usable prices.

    Accepts either a dict or its JSON text. Returns ``{market: {selection: price}}``
    restricted to ``TRACK``, keeping only selections whose price converts to a
    non-zero float; returns ``{}`` when nothing usable is found.
    """
    if isinstance(odds, str):
        try:
            odds = json.loads(odds)
        except ValueError:
            return {}
    if not isinstance(odds, dict):
        return {}
    compact = {}
    for cat, sels in TRACK.items():
        market = odds.get(cat)
        if not isinstance(market, dict):
            continue
        vals = {}
        for sel in sels:
            price = _f(market.get(sel))   # converted once, then tested and stored
            if price:
                vals[sel] = price
        if vals:
            compact[cat] = vals
    return compact
 def snapshot():
    """Append one odds snapshot per qualifying live match to ``SNAP``.

    Selects rows of ``live_matches`` that have odds and whose ``mst_utc`` is
    later than two hours before now (epoch milliseconds), compacts each odds
    payload to the tracked markets, and appends one JSON line per match with
    the keys ``ts``, ``match_id``, ``mst_utc`` and ``odds``. Matches without any
    usable tracked price are skipped. Prints a one-line summary.

    Raises:
        Whatever ``_conn()`` or the query raises; file errors from writing ``SNAP``.
    """
    from psycopg2.extras import RealDictCursor
    now_ms = int(time.time() * 1000)
    conn = _conn()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""SELECT id, mst_utc, odds FROM live_matches
                           WHERE odds IS NOT NULL AND mst_utc > %s
                           ORDER BY mst_utc ASC""", (now_ms - 2*3600*1000,))
            rows = cur.fetchall()
    finally:
        # `with connection:` in psycopg2 would not close it; release it explicitly
        conn.close()
    os.makedirs(os.path.dirname(SNAP), exist_ok=True)
    n = 0
    with open(SNAP, "a", encoding="utf-8") as f:
        for r in rows:
            compact = _compact_odds(r["odds"])
            if not compact:
                continue
            f.write(json.dumps({"ts": now_ms, "match_id": r["id"],
                                "mst_utc": r["mst_utc"], "odds": compact},
                               ensure_ascii=False) + "\n")
            n += 1
    print(f"[snapshot] {datetime.datetime.now():%Y-%m-%d %H:%M} captured {n} upcoming matches -> {SNAP}")
 def _names(ids):
    try:
        from psycopg2.extras import RealDictCursor
        ids = [str(i) for i in ids]
        if not ids: return {}
        with _conn() as c:
            with c.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""SELECT m.id, ht.name h, at.name a
                               FROM matches m JOIN teams ht ON ht.id=m.home_team_id
                               JOIN teams at ON at.id=m.away_team_id WHERE m.id = ANY(%s)""", (ids,))
                return {str(r["id"]): f"{r['h']} v {r['a']}" for r in cur.fetchall()}
    except Exception:
        return {}
 def report(min_move):
    """Print the odds-movement watchlist built from the snapshot file.

    For every match with at least two snapshots, compares the first and last
    "Maç Sonucu" prices, picks the selection with the lowest relative drift
    ``(last - open) / open`` (i.e. the most shortened one) and flags the match
    when the absolute drift is >= ``min_move``. The largest single
    snapshot-to-snapshot move on that selection is shown as ``maxStep``.

    Args:
        min_move: minimum absolute relative drift (fraction, e.g. 0.08) to flag.

    Snapshot lines that are not valid JSON objects with ``match_id``, ``ts`` and
    a dict ``odds`` are skipped instead of aborting the report.
    """
    if not os.path.exists(SNAP):
        print("No snapshots yet. Schedule '--snapshot' every ~15-20 min first."); return
    series = defaultdict(list)   # match_id -> [(ts, mst, odds_compact), ...]
    with open(SNAP, encoding="utf-8") as f:
        for line in f:
            try:
                d = json.loads(line)
                mid, ts, odds = d["match_id"], d["ts"], d["odds"]
                mst = d.get("mst_utc")
            except (ValueError, KeyError, TypeError):
                continue   # truncated write or foreign line: ignore it
            if not isinstance(odds, dict) or not isinstance(mid, (str, int)):
                continue
            series[mid].append((ts, mst, odds))
    now_ms = int(time.time()*1000)
    def ms(snap):
        # focus on MS market
        return snap[2].get("Maç Sonucu", {})
    flagged = []
    for mid, snaps in series.items():
        if len(snaps) < 2: continue
        snaps.sort(key=lambda x: x[0])
        mst = snaps[-1][1]
        op, la = ms(snaps[0]), ms(snaps[-1])
        best = None  # most-SHORTENED side = the steam (money/info) signal
        for sel in ("1", "X", "2"):
            o0, o1 = op.get(sel), la.get(sel)
            if o0 and o1 and o0 > 1.0 and o1 > 1.0:
                drift = (o1 - o0) / o0          # negative = shortened = steam
                if best is None or drift < best[3]:
                    best = (sel, o0, o1, drift)
        if best is None or abs(best[3]) < min_move:
            continue
        sel, o0, o1, drift = best
        # velocity: biggest single-step move on that selection
        steps = [p for p in (ms(s).get(sel) for s in snaps) if p]
        vmax = 0.0
        for prev, cur in zip(steps, steps[1:]):
            vmax = max(vmax, abs(cur - prev) / prev)
        flagged.append((abs(drift), mid, sel, o0, o1, drift, vmax, len(snaps), mst))
    # sort on the drift magnitude only (ids as a string tie-break) so that
    # mixed-type ids or a missing kickoff time can never be compared
    flagged.sort(key=lambda row: (row[0], str(row[1])), reverse=True)
    top = flagged[:25]
    names = _names([row[1] for row in top])
    print("="*84)
    print("ODDS MOVEMENT WATCHLIST  (MS market; drift = (last-open)/open; ↓ = shortened = steam)")
    print("="*84)
    if not flagged:
        print(f"  No matches moved >= {min_move:.0%} yet. (Need more snapshots over time;")
        print("   monitor only sees movement once it has captured several snapshots.)")
        # still show coverage
        multi = sum(1 for s in series.values() if len(s) >= 2)
        print(f"\n  coverage: {len(series)} matches tracked, {multi} with >=2 snapshots.")
        return
    print(f"  {'match':<34}{'side':>5}{'open':>7}{'last':>7}{'drift':>8}{'maxStep':>8}{'snaps':>6}")
    print("  "+"-"*78)
    for _, mid, sel, o0, o1, drift, vmax, ns, mst in top:
        # _names() keys its result by str(id); look up and slice as text so a
        # numeric match id cannot miss the lookup or break the [:32] slice
        nm = str(names.get(str(mid)) or mid)[:32]
        arrow = "↓steam" if drift < 0 else "↑drift"
        ko = ""
        if mst and isinstance(mst, (int, float)):
            mins = (mst - now_ms)/60000
            ko = f" KO~{mins/60:.1f}h" if mins > 0 else " (started)"
        print(f"  {nm:<34}{sel:>5}{o0:>7.2f}{o1:>7.2f}{100*drift:>+7.1f}%{100*vmax:>+7.1f}%{ns:>6}  {arrow}{ko}")
    print(f"\n  {len(flagged)} matches flagged (moved >= {min_move:.0%}).")
    print("  ↓steam on a side = market backing it hard (info/possible fix). As a closing")
    print("  bettor: treat heavy late steam AGAINST your pick as a reason to SKIP.")
 def main():
    """Command-line entry point.

    ``--snapshot`` captures the current odds; ``--report`` prints the watchlist.
    With no flag at all the report is printed; with both flags the snapshot is
    taken first and the report follows.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--snapshot", action="store_true")
    parser.add_argument("--report", action="store_true")
    parser.add_argument("--min-move", type=float, default=0.08, help="flag drift >= this fraction (default 0.08)")
    opts = parser.parse_args()
    take_snapshot = opts.snapshot
    if take_snapshot:
        snapshot()
    # the report is both an explicit action and the default when nothing was requested
    if not take_snapshot or opts.report:
        report(opts.min_move)
 # Run the monitor CLI only when executed as a script, not on import.
 if __name__ == "__main__":
    main()