gg3

2026-06-05 00:36:24 +03:00
parent b9700f9fda
commit 9e41407cb5
10 changed files with 1683 additions and 0 deletions
@@ -0,0 +1,181 @@
+"""
+Edge Search — is there a profitable POCKET (by league) the global model misses?
+==============================================================================
+Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
+tier leagues may be mispriced. This walks a leak-free model forward and slices
+the value-bet ROI by league-reliability quartile, by picked-odds band, and by
+their cross product (not by individual league), requiring a real sample AND
+multi-fold consistency so we don't chase one lucky window.
+
+Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
+in features (realistic). Value bet = biggest model_prob - implied edge > margin.
+
+⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
+capture, not the verified closing line. Anything flagged must be forward-
+validated with real CLV (capture_closing_odds.py) before staking.
+
+Usage: python scripts/edge_search.py --folds 6 --min-bets 150
+"""
+from __future__ import annotations
+import argparse, os, sys, time
+import numpy as np, pandas as pd, xgboost as xgb
+
+# Best-effort: switch stdout to UTF-8 when the stream supports reconfigure().
+# Failures are deliberately ignored so the script still runs on streams that
+# reject the call. (Presumably needed for non-ASCII output such as the warning
+# sign in the module docstring, which argparse prints as the description.)
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+# AI_DIR is the directory two levels above this file. It is pushed to the
+# front of sys.path so project-local imports (e.g. `data.db`, imported inside
+# league_names) can be resolved when this file is run as a script.
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, AI_DIR)
+# Training CSV read by main().
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+
+# Identifier / timestamp / score columns: never used as model features.
+META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
+        "score_home","score_away","ht_score_home","ht_score_away"}
+# Columns the module docstring describes as confirmed/suspected leakage; they
+# are dropped from the feature set in main().
+LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
+         "squad_diff","home_squad_quality","away_squad_quality",
+         "referee_home_bias","referee_avg_goals"}
+
+
+def league_names(ids):
+    """Best-effort lookup of league names by id.
+
+    Args:
+        ids: iterable of league ids. ``None`` entries are skipped; the rest
+            are stringified before being sent to the database.
+
+    Returns:
+        dict mapping ``str(id)`` -> name for every row returned from the
+        ``leagues`` table. Empty when ``ids`` has no usable entry or when all
+        three attempts failed (the last error is then reported on stderr).
+        Database errors are never raised to the caller.
+    """
+    from data.db import get_clean_dsn
+    import psycopg2
+    from psycopg2.extras import RealDictCursor
+
+    wanted = [str(i) for i in ids if i is not None]
+    if not wanted:
+        return {}
+
+    attempts = 3
+    last_err = None
+    for attempt in range(attempts):
+        conn = None
+        try:
+            conn = psycopg2.connect(get_clean_dsn())
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (wanted,))
+                return {str(r["id"]): r["name"] for r in cur.fetchall()}
+        except Exception as exc:  # deliberate best-effort: names are cosmetic
+            last_err = exc
+            # Back off only when another attempt will actually follow.
+            if attempt < attempts - 1:
+                time.sleep(1.0)
+        finally:
+            # psycopg2's `with connection:` only ends the transaction; the
+            # connection itself must be closed explicitly or it leaks.
+            if conn is not None:
+                conn.close()
+
+    print(f"league_names: lookup failed after {attempts} attempts: {last_err!r}",
+          file=sys.stderr)
+    return {}
+
+
+def _print_bucket_table(recs, title, min_bets, top=None):
+    """Print one ROI table (best ROI first) for a set of bet records.
+
+    Args:
+        recs: DataFrame with columns ``grp``, ``fold``, ``pnl``, ``win``;
+            one row per placed bet.
+        title: heading printed above the table.
+        min_bets: buckets with fewer bets than this are not listed.
+        top: when given, only the ``top`` best-ROI buckets are printed.
+
+    Nothing is printed when ``recs`` is empty. The header is still printed
+    when every bucket falls below ``min_bets``.
+    """
+    if recs.empty:
+        return
+    print(f"\n{title}")
+    print(f"  {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
+    print("  " + "-" * 54)
+    rows = []
+    for bucket, d in recs.groupby("grp"):
+        nb = len(d)
+        if nb < min_bets:
+            continue
+        roi = 100 * d["pnl"].sum() / nb
+        hit = 100 * d["win"].sum() / nb
+        # "folds+" = number of folds with positive total P&L / folds with bets.
+        fold_pnl = d.groupby("fold")["pnl"].sum()
+        rows.append((roi, bucket, nb, hit, int((fold_pnl > 0).sum()), fold_pnl.shape[0]))
+    # A slice end of None keeps every row.
+    for roi, bucket, nb, hit, pos, tot in sorted(rows, reverse=True)[:top]:
+        print(f"  {bucket:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(pos)+'/'+str(tot):>8}")
+
+
+def main():
+    """Walk a leak-free 1X2 model forward and print value-bet ROI per bucket.
+
+    Reads CSV, keeps rows with numeric scores, and uses every column that is
+    not in META, not in LEAKY and not prefixed ``label_`` as a feature. The
+    second half of the time-sorted rows is split into ``--folds`` consecutive
+    test windows; for each, an XGBoost softprob model is trained on all rows
+    before the window. Per test row the outcome with the largest
+    ``model_prob - 1/odds`` is picked and bet (flat 1 unit) when that edge
+    exceeds ``--margin``.
+
+    Results are printed globally, per league-reliability quartile, per
+    picked-odds band, and per reliability x odds cross bucket (top 15). Only
+    buckets with at least ``--min-bets`` bets are listed.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--folds", type=int, default=6)
+    ap.add_argument("--estimators", type=int, default=200)
+    ap.add_argument("--margin", type=float, default=0.0)
+    ap.add_argument("--min-bets", type=int, default=150)
+    args = ap.parse_args()
+    if args.folds < 1:
+        ap.error("--folds must be >= 1")
+
+    print(f"Loading {CSV} ...")
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
+    # Class labels: 0 = home win, 1 = draw, 2 = away win (same column order as odds).
+    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
+    # Missing / non-numeric odds become 0.0 and are treated as "no price" below.
+    odds = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    # Reliability is only used for bucketing; -1.0 marks "unknown".
+    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan]*len(df))),
+                        errors="coerce").fillna(-1.0).values
+    print(f"  {len(df):,} rows  features={len(feats)} (leak-free)  folds={args.folds}")
+
+    # The first half is always training data; the second half is cut into folds.
+    n = len(df); start = int(n * 0.5)
+    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
+    params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
+              "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+
+    # reliability quartile edges from the rows with a known score (rel>=0)
+    rv = rel[rel >= 0]
+    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]
+
+    def rel_band(x):
+        if x < 0: return "rel:unknown"
+        if x < qs[0]: return f"rel:Q1(<{qs[0]:.2f})"
+        if x < qs[1]: return "rel:Q2"
+        if x < qs[2]: return "rel:Q3"
+        return f"rel:Q4(>={qs[2]:.2f})"
+
+    def odds_band(o):
+        return ("<1.5" if o<1.5 else "1.5-2" if o<2 else "2-3" if o<3 else
+                "3-5" if o<5 else "5-8" if o<8 else "8+")
+
+    # Each bet yields three records, tagged by table kind so the tables can be
+    # separated exactly instead of by guessing from the bucket label's prefix.
+    recs = []  # (kind, group_key, fold, pnl, win)
+    glob = {"n":0,"pnl":0.0,"win":0}
+    for fi in range(args.folds):
+        te0, te1 = bounds[fi], bounds[fi+1]
+        if te1-te0 < 50: continue
+        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
+        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
+        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
+        # Zero-filled odds would trigger divide warnings; those cells are
+        # masked to NaN by the np.where anyway.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            implied = np.where(ote > 1.0, 1.0/ote, np.nan)
+        # -inf guarantees an outcome without a valid price is never bet,
+        # whatever --margin is.
+        edge = np.where(np.isnan(implied), -np.inf, proba - implied)
+        rows_idx = np.arange(len(yte))
+        pick = edge.argmax(1)
+        bet = edge[rows_idx, pick] > args.margin
+        win = (pick == yte) & bet
+        pick_odds = ote[rows_idx, pick]
+        pnl = np.where(win, pick_odds-1.0, -1.0)
+
+        glob["n"] += int(bet.sum())
+        glob["pnl"] += float(pnl[bet].sum())
+        glob["win"] += int(win.sum())
+        for r, o, p, w in zip(rte[bet], pick_odds[bet], pnl[bet], win[bet]):
+            rb, ob, w = rel_band(r), odds_band(o), int(w)
+            recs.append(("rel", rb, fi, p, w))
+            recs.append(("odds", ob, fi, p, w))
+            recs.append(("rel_x_odds", rb+" x "+ob, fi, p, w))
+        print(f"  fold {fi}: tested {len(yte):,}  bets {int(bet.sum()):,}")
+
+    print("\n"+"="*78)
+    print(f"GLOBAL leak-free: bets={glob['n']:,}  hit={100*glob['win']/max(glob['n'],1):.1f}%  "
+          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
+    print("="*78)
+
+    rdf = pd.DataFrame(recs, columns=["kind","grp","fold","pnl","win"])
+    _print_bucket_table(rdf[rdf["kind"] == "rel"],
+                        "BY LEAGUE-RELIABILITY BAND  (Q1=most obscure ... Q4=most reliable)",
+                        args.min_bets)
+    _print_bucket_table(rdf[rdf["kind"] == "odds"], "BY ODDS BAND", args.min_bets)
+    # 2D reliability x odds
+    _print_bucket_table(rdf[rdf["kind"] == "rel_x_odds"],
+                        "BY RELIABILITY x ODDS  (candidate pockets, n>=min-bets)",
+                        args.min_bets, top=15)
+    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
+    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
+    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")
+
+
+if __name__ == "__main__":
+    main()