@@ -0,0 +1,181 @@
|
||||
"""
|
||||
Edge Search — is there a profitable POCKET (by league-reliability band / odds band) the global model misses?
|
||||
==============================================================================
|
||||
Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
|
||||
tier leagues may be mispriced. This walks a leak-free model forward and slices
the value-bet ROI BY LEAGUE-RELIABILITY BAND, BY ODDS BAND, and by their cross,
requiring a real sample AND multi-fold consistency so we don't chase one lucky
window.
|
||||
|
||||
Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
|
||||
in features (realistic). Value bet = biggest model_prob - implied edge > margin.
|
||||
|
||||
⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
|
||||
capture, not the verified closing line. Anything flagged must be forward-
|
||||
validated with real CLV (capture_closing_odds.py) before staking.
|
||||
|
||||
Usage: python scripts/edge_search.py --folds 6 --min-bets 150
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys, time
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
# Best-effort switch of stdout to UTF-8 so non-ASCII text can be printed;
# any failure here is deliberately ignored.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# Directory two levels above this file; put first on sys.path so imports
# rooted there (e.g. ``data.db``) resolve.
AI_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
sys.path.insert(0, AI_DIR)

# Training table read by main().
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier / outcome columns that are never used as model features.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns excluded from the feature set as confirmed/suspected leakage.
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}
|
||||
|
||||
|
||||
def league_names(ids, attempts=3, delay=1.0):
    """Resilient id->name lookup against the ``leagues`` table.

    Args:
        ids: Iterable of league ids; ``None`` entries are skipped and the
            rest are converted to ``str`` before being sent to the database.
        attempts: How many times to try the query before giving up.
        delay: Seconds to sleep between failed attempts.

    Returns:
        Dict mapping ``str(id)`` to the league name. Empty when ``ids`` has
        no usable entries or when every attempt failed (best-effort: the
        failure is reported on stderr, never raised).
    """
    wanted = [str(i) for i in ids if i is not None]
    if not wanted:
        # Nothing to look up: return before importing any DB machinery.
        return {}

    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor

    last_err = None
    for attempt in range(attempts):
        try:
            conn = psycopg2.connect(get_clean_dsn())
            try:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (wanted,))
                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
            finally:
                # psycopg2's connection context manager only ends the
                # transaction; the connection has to be closed explicitly.
                conn.close()
        except Exception as exc:  # best-effort boundary: retry, then give up
            last_err = exc
            if attempt + 1 < attempts:
                time.sleep(delay)

    print(f"league_names: lookup failed after {attempts} attempts: {last_err!r}",
          file=sys.stderr)
    return {}
|
||||
|
||||
|
||||
def _odds_band(o):
    """Return the price-bucket label for decimal odds ``o``."""
    if o < 1.5:
        return "<1.5"
    if o < 2:
        return "1.5-2"
    if o < 3:
        return "2-3"
    if o < 5:
        return "3-5"
    if o < 8:
        return "5-8"
    return "8+"


def _rel_band(x, qs):
    """Return the reliability-quartile label for score ``x`` given quartile edges ``qs``."""
    if x < 0:
        return "rel:unknown"
    if x < qs[0]:
        return f"rel:Q1(<{qs[0]:.2f})"
    if x < qs[1]:
        return "rel:Q2"
    if x < qs[2]:
        return "rel:Q3"
    return f"rel:Q4(>={qs[2]:.2f})"


def _print_table(rdf, dim, title, min_bets, top=None):
    """Print bets / hit% / ROI% / positive-fold count per bucket of one dimension."""
    sub = rdf[rdf["dim"] == dim]
    if sub.empty:
        return
    print(f"\n{title}")
    print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print(" " + "-" * 54)
    rows = []
    for key, grp in sub.groupby("grp"):
        nb = len(grp)
        if nb < min_bets:
            continue
        roi = 100 * grp["pnl"].sum() / nb
        hit = 100 * grp["win"].sum() / nb
        fold_pnl = grp.groupby("fold")["pnl"].sum()
        rows.append((roi, key, nb, hit, int((fold_pnl > 0).sum()), fold_pnl.shape[0]))
    # Best ROI first; ``top=None`` keeps every qualifying bucket.
    for roi, key, nb, hit, pos, tot in sorted(rows, reverse=True)[:top]:
        print(f" {key:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(pos) + '/' + str(tot):>8}")


def main():
    """Walk a leak-free model forward and report value-bet ROI per bucket.

    Reads ``CSV``, keeps rows with numeric scores, trains an expanding-window
    XGBoost 3-class model on everything before each test fold (folds cover
    the second half of the time-sorted data), bets the outcome with the
    largest ``model_prob - implied_prob`` when it exceeds ``--margin``, and
    prints flat-stake ROI globally, by league-reliability quartile, by odds
    band, and by their cross.

    CLI flags: ``--folds``, ``--estimators``, ``--margin``, ``--min-bets``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=6)
    ap.add_argument("--estimators", type=int, default=200)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--min-bets", type=int, default=150)
    args = ap.parse_args()
    if args.folds < 1:
        ap.error("--folds must be >= 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    # Target classes: 0 = home win, 1 = draw, 2 = away win.
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    # Missing odds become 0.0 and are treated as "no price" further down.
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values

    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    # Missing reliability (or a missing column) is encoded as -1 => "rel:unknown".
    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan] * len(df))),
                        errors="coerce").fillna(-1.0).values
    print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}")

    n = len(df)
    start = int(n * 0.5)
    if start < 1:
        raise SystemExit("Not enough rows with valid scores to train a model.")
    # Test folds split the second half of the data; training is always X[:fold_start].
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
              "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}

    # reliability quartile edges from the betting universe (rel>=0)
    rv = rel[rel >= 0]
    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]

    # Each bet yields three records, tagged with the dimension it belongs to so
    # the 1D and 2D tables never get mixed by string-prefix matching.
    records = []  # (dim, group_key, fold, pnl, win)
    glob = {"n": 0, "pnl": 0.0, "win": 0}
    for fi in range(args.folds):
        te0, te1 = int(bounds[fi]), int(bounds[fi + 1])
        if te1 - te0 < 50:
            continue
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]

        # Implied probability only where a real price exists; the masked divide
        # avoids dividing by the 0.0 placeholders.
        priced = ote > 1.0
        implied = np.full(ote.shape, np.nan)
        np.divide(1.0, ote, out=implied, where=priced)
        edge = np.where(priced, proba - implied, -9.0)

        rows = np.arange(len(yte))
        pick = edge.argmax(1)
        bet = edge[rows, pick] > args.margin
        win = (pick == yte) & bet
        pick_odds = ote[rows, pick]
        pnl = np.where(win, pick_odds - 1.0, -1.0)  # flat 1-unit stake

        for i in np.flatnonzero(bet):
            p, w = float(pnl[i]), int(win[i])
            glob["n"] += 1
            glob["pnl"] += p
            glob["win"] += w
            rb, ob = _rel_band(rte[i], qs), _odds_band(pick_odds[i])
            records.append(("rel", rb, fi, p, w))
            records.append(("odds", ob, fi, p, w))
            records.append(("rel_x_odds", rb + " x " + ob, fi, p, w))
        print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}")

    print("\n" + "=" * 78)
    print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% "
          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
    print("=" * 78)

    rdf = pd.DataFrame(records, columns=["dim", "grp", "fold", "pnl", "win"])
    _print_table(rdf, "rel", "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)",
                 args.min_bets)
    _print_table(rdf, "odds", "BY ODDS BAND", args.min_bets)
    # 2D reliability x odds: only the 15 best buckets are shown.
    _print_table(rdf, "rel_x_odds", "BY RELIABILITY x ODDS (candidate pockets, n>=min-bets)",
                 args.min_bets, top=15)

    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")
||||
|
||||
|
||||
# Script entry point: run the edge search only when this file is executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user