Files
iddaai-be/ai-engine/scripts/edge_search.py
T
fahricansecer 9e41407cb5
Deploy Iddaai Backend / build-and-deploy (push) Successful in 35s
gg3
2026-06-05 00:36:24 +03:00

182 lines
8.4 KiB
Python

"""
Edge Search — is there a profitable POCKET (by league) the global model misses?
==============================================================================
Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
tier leagues may be mispriced. This walks a leak-free model forward and slices
the value-bet ROI by LEAGUE-RELIABILITY band, by ODDS band, and by the two
combined, requiring a real sample AND multi-fold consistency so we don't chase
one lucky window.
Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
in features (realistic). Value bet = biggest model_prob - implied edge > margin.
⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
capture, not the verified closing line. Anything flagged must be forward-
validated with real CLV (capture_closing_odds.py) before staking.
Usage: python scripts/edge_search.py --folds 6 --min-bets 150
"""
from __future__ import annotations
import argparse, os, sys, time
import numpy as np, pandas as pd, xgboost as xgb
# Best-effort: switch stdout to UTF-8 when the stream supports reconfigure();
# any failure is deliberately ignored so the script still runs.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try: sys.stdout.reconfigure(encoding="utf-8")
    except Exception: pass
# Parent of the directory that contains this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put AI_DIR first on the import path (presumably so `data.db`, imported inside
# league_names, resolves when this file is run directly).
sys.path.insert(0, AI_DIR)
# Training table read by main().
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
# Identifier / timestamp / score columns: never used as model features.
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
        "score_home","score_away","ht_score_home","ht_score_away"}
# Confirmed/suspected leakage columns (see module docstring): also excluded
# from the feature matrix.
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
         "squad_diff","home_squad_quality","away_squad_quality",
         "referee_home_bias","referee_avg_goals"}
def league_names(ids):
    """Resilient id->name lookup against the ``leagues`` table.

    Args:
        ids: iterable of league ids. ``None`` entries are dropped and the rest
            are stringified before being sent to the database.

    Returns:
        dict mapping ``str(id)`` to the league name for every id the query
        returned. Empty when ``ids`` has no usable entries or when every
        connection attempt fails (best-effort: database errors never
        propagate to the caller).
    """
    out = {}
    ids = [str(i) for i in ids if i is not None]
    if not ids:
        return out
    # Imported lazily, and only once there is something to look up, so an
    # empty lookup needs neither the DB driver nor the project DSN helper.
    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor

    attempts = 3
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            conn = psycopg2.connect(get_clean_dsn())
            try:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (ids,))
                    for r in cur.fetchall():
                        out[str(r["id"])] = r["name"]
            finally:
                # psycopg2's `with connection:` only ends the transaction; the
                # connection must be closed explicitly or it leaks per attempt.
                conn.close()
            return out
        except Exception as exc:
            last_error = exc
            # Back off only when another attempt will actually follow.
            if attempt < attempts:
                time.sleep(1.0)
    # Still best-effort, but say why the names are missing instead of
    # failing silently.
    print(f"league_names: lookup failed after {attempts} attempts: {last_error!r}",
          file=sys.stderr)
    return out
def _print_bucket_table(sub, title, min_bets, top=None):
    """Print one ROI table for the bet records in *sub*, one row per bucket.

    Args:
        sub: DataFrame of bet records with columns ``grp``, ``fold``, ``pnl``
            and ``win``. Nothing is printed when it is empty.
        title: heading printed above the table.
        min_bets: buckets with fewer bets than this are omitted.
        top: when given, only the ``top`` best buckets by ROI are printed.
    """
    if sub.empty:
        return
    print(f"\n{title}")
    print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print(" "+"-"*54)
    rows = []
    for key, grp in sub.groupby("grp"):
        n_bets = len(grp)
        if n_bets < min_bets:
            continue
        roi = 100 * grp["pnl"].sum() / n_bets
        hit = 100 * grp["win"].sum() / n_bets
        # "folds+" = number of folds with positive total P&L / folds with bets.
        fold_pnl = grp.groupby("fold")["pnl"].sum()
        rows.append((roi, key, n_bets, hit, int((fold_pnl > 0).sum()), fold_pnl.shape[0]))
    rows.sort(reverse=True)  # best ROI first
    for roi, key, n_bets, hit, folds_pos, folds_total in rows[:top]:
        print(f" {key:<26}{n_bets:>6}{hit:>7.1f}{roi:>8.1f}{str(folds_pos)+'/'+str(folds_total):>8}")


def main():
    """Walk a leak-free 1X2 model forward and report value-bet ROI by bucket.

    Reads the training CSV, keeps rows with a numeric final score, and sorts
    them by ``mst_utc``. The second half of the rows is split into ``--folds``
    consecutive test windows; for each window an XGBoost multi-class model is
    trained on every row before it. A flat 1-unit bet is placed on the outcome
    with the largest ``model_prob - 1/odds`` edge when that edge exceeds
    ``--margin``. Results are printed globally, by league-reliability band, by
    odds band, and by the combination of both (top 15 buckets).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=6)
    ap.add_argument("--estimators", type=int, default=200)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--min-bets", type=int, default=150)
    args = ap.parse_args()

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    # Target classes: 0 = home win, 1 = draw, 2 = away win (same order as the odds columns).
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    odds = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    # Missing / non-numeric reliability becomes -1.0 and lands in "rel:unknown".
    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan]*len(df))),
                        errors="coerce").fillna(-1.0).values
    print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}")

    # Test windows tile the second half of the time-sorted rows.
    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
              "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}

    # reliability quartile edges from the betting universe (rel>=0)
    rv = rel[rel >= 0]
    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]

    def rel_band(x):
        if x < 0: return "rel:unknown"
        if x < qs[0]: return f"rel:Q1(<{qs[0]:.2f})"
        if x < qs[1]: return "rel:Q2"
        if x < qs[2]: return "rel:Q3"
        return f"rel:Q4(>={qs[2]:.2f})"

    def odds_band(o):
        return ("<1.5" if o<1.5 else "1.5-2" if o<2 else "2-3" if o<3 else
                "3-5" if o<5 else "5-8" if o<8 else "8+")

    # One record per bet and per slicing dimension. The explicit `dim` tag
    # keeps the three tables apart: 2D keys also start with "rel:", so a
    # prefix match on the bucket name would mix them into the 1D table.
    recs = []  # (dim, group_key, fold, pnl, win)
    glob = {"n":0,"pnl":0.0,"win":0}
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi+1]
        if te1-te0 < 50:
            continue
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
        # Odds <= 1.0 (including the 0.0 fill for missing) have no usable
        # implied probability; their edge is forced far below any margin.
        implied = np.where(ote > 1.0, 1.0/ote, np.nan)
        edge = np.where(np.isnan(implied), -9.0, proba - implied)
        rows = np.arange(len(yte))
        pick = edge.argmax(1)
        bet = edge[rows, pick] > args.margin
        win = (pick == yte) & bet
        pick_odds = ote[rows, pick]
        pnl = np.where(win, pick_odds-1.0, -1.0)  # flat 1-unit stake
        for i in np.flatnonzero(bet):
            won = int(win[i])
            glob["n"] += 1
            glob["pnl"] += pnl[i]
            glob["win"] += won
            rb, ob = rel_band(rte[i]), odds_band(pick_odds[i])
            recs.append(("rel", rb, fi, pnl[i], won))
            recs.append(("odds", ob, fi, pnl[i], won))
            recs.append(("rel_x_odds", rb+" x "+ob, fi, pnl[i], won))
        print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}")

    print("\n"+"="*78)
    print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% "
          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
    print("="*78)

    rdf = pd.DataFrame(recs, columns=["dim","grp","fold","pnl","win"])
    _print_bucket_table(rdf[rdf["dim"] == "rel"],
                        "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)",
                        args.min_bets)
    _print_bucket_table(rdf[rdf["dim"] == "odds"], "BY ODDS BAND", args.min_bets)
    # 2D reliability x odds
    _print_bucket_table(rdf[rdf["dim"] == "rel_x_odds"],
                        "BY RELIABILITY x ODDS (candidate pockets, n>=min-bets)",
                        args.min_bets, top=15)

    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")
# Run the edge search only when executed as a script, not on import.
if __name__ == "__main__":
    main()