""" Edge Search — is there a profitable POCKET (by league) the global model misses? ============================================================================== Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low- tier leagues may be mispriced. This walks a leak-free model forward and slices the value-bet ROI BY LEAGUE, requiring a real sample AND multi-fold consistency so we don't chase one lucky window. Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds in features (realistic). Value bet = biggest model_prob - implied edge > margin. ⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static capture, not the verified closing line. Anything flagged must be forward- validated with real CLV (capture_closing_odds.py) before staking. Usage: python scripts/edge_search.py --folds 6 --min-bets 150 """ from __future__ import annotations import argparse, os, sys, time import numpy as np, pandas as pd, xgboost as xgb if sys.stdout and hasattr(sys.stdout, "reconfigure"): try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_DIR) CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv") META = {"match_id","home_team_id","away_team_id","league_id","mst_utc", "score_home","score_away","ht_score_home","ht_score_away"} LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals", "squad_diff","home_squad_quality","away_squad_quality", "referee_home_bias","referee_avg_goals"} def league_names(ids): """Resilient id->name lookup.""" from data.db import get_clean_dsn import psycopg2 from psycopg2.extras import RealDictCursor out = {} ids = [str(i) for i in ids if i is not None] if not ids: return out for _ in range(3): try: with psycopg2.connect(get_clean_dsn()) as c: with c.cursor(cursor_factory=RealDictCursor) as cur: cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (ids,)) for r in cur.fetchall(): out[str(r["id"])] = r["name"] return out except Exception: time.sleep(1.0) return out def main(): ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--folds", type=int, default=6) ap.add_argument("--estimators", type=int, default=200) ap.add_argument("--margin", type=float, default=0.0) ap.add_argument("--min-bets", type=int, default=150) args = ap.parse_args() print(f"Loading {CSV} ...") df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True) sh = pd.to_numeric(df["score_home"], errors="coerce") sa = pd.to_numeric(df["score_away"], errors="coerce") ok = sh.notna() & sa.notna() df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2)) league = df["league_id"].astype(str).values odds = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY] X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan]*len(df))), errors="coerce").fillna(-1.0).values print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}") n = len(df); start = int(n * 0.5) bounds = np.linspace(start, n, args.folds + 1, dtype=int) params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05, "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0} # reliability quartile edges from the betting universe (rel>=0) rv = rel[rel >= 0] qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7] def rel_band(x): if x < 0: return "rel:unknown" if x < qs[0]: return f"rel:Q1(<{qs[0]:.2f})" if x < qs[1]: return f"rel:Q2" if x < qs[2]: return f"rel:Q3" return f"rel:Q4(>={qs[2]:.2f})" def odds_band(o): return ("<1.5" if o<1.5 else "1.5-2" if o<2 else "2-3" if o<3 else "3-5" if o<5 else "5-8" if o<8 else "8+") recs = [] # (group_key, fold, pnl, win) glob = {"n":0,"pnl":0.0,"win":0} for fi in range(args.folds): te0, te1 = bounds[fi], bounds[fi+1] if te1-te0 < 50: continue bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators) proba = bst.predict(xgb.DMatrix(X[te0:te1])) yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1] implied = np.where(ote > 1.0, 1.0/ote, np.nan) edge = np.where(np.isnan(implied), -9.0, proba - implied) pick = edge.argmax(1) bet = edge[np.arange(len(yte)), pick] > args.margin win = (pick == yte) & bet pick_odds = ote[np.arange(len(yte)), pick] pnl = np.where(win, pick_odds-1.0, -1.0) for i in range(len(yte)): if not bet[i]: continue glob["n"]+=1; glob["pnl"]+=pnl[i]; glob["win"]+=int(win[i]) recs.append((rel_band(rte[i]), fi, pnl[i], int(win[i]))) recs.append((odds_band(pick_odds[i]), fi, pnl[i], int(win[i]))) recs.append((rel_band(rte[i])+" x "+odds_band(pick_odds[i]), fi, pnl[i], int(win[i]))) print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}") print("\n"+"="*78) print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% " f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%") print("="*78) rdf = pd.DataFrame(recs, columns=["grp","fold","pnl","win"]) def report(prefix, title): sub = rdf[rdf["grp"].str.startswith(prefix)] if sub.empty: return print(f"\n{title}") print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}") print(" "+"-"*54) g = sub.groupby("grp") out=[] for k,d in g: nb=len(d) if nb < args.min_bets: continue roi=100*d["pnl"].sum()/nb; hit=100*d["win"].sum()/nb fp=d.groupby("fold")["pnl"].sum(); folds_pos=int((fp>0).sum()); ft=fp.shape[0] out.append((roi,k,nb,hit,folds_pos,ft)) for roi,k,nb,hit,fp,ft in sorted(out,reverse=True): print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fp)+'/'+str(ft):>8}") report("rel:", "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)") report(("<","1","2","3","5","8"), None) # odds bands start with digit/< # odds-band buckets begin with a digit or '<' sub = rdf[~rdf["grp"].str.startswith("rel:")] sub = sub[~sub["grp"].str.contains(" x ")] if not sub.empty: print("\nBY ODDS BAND") print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}") print(" "+"-"*54) out=[] for k,d in sub.groupby("grp"): nb=len(d) if nb0).sum()),fp.shape[0])) for roi,k,nb,hit,fpv,ft in sorted(out,reverse=True): print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}") # 2D reliability x odds sub2 = rdf[rdf["grp"].str.contains(" x ")] if not sub2.empty: print("\nBY RELIABILITY x ODDS (candidate pockets, n>=min-bets)") print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}") print(" "+"-"*54) out=[] for k,d in sub2.groupby("grp"): nb=len(d) if nb0).sum()),fp.shape[0])) for roi,k,nb,hit,fpv,ft in sorted(out,reverse=True)[:15]: print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}") print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds") print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.") print("Then forward-validate with CLV (capture_closing_odds.py) before staking.") if __name__ == "__main__": main()