"""
Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
========================================================================
Not "play the handed main_pick". For each match, score EVERY market the model
covers, compare model prob vs market implied, and select the single best VALUE
bet across all markets. Leak-free, walk-forward, honest.

Markets (truth derived from scores, not trusted labels):
  MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.

Outputs:
  (A) per-market value ROI  -> which bet types actually carry edge
  (B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
      fold-consistency, and the model-free baseline.

⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD;
forward paper-trade with real CLV before staking.

Usage:
  python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
"""
from __future__ import annotations

import argparse
import os
import sys

import numpy as np
import pandas as pd

# Best effort only: force UTF-8 output so the non-ASCII characters in the
# report do not crash on legacy consoles. Failure here is harmless.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier / outcome columns that are never used as features.
META = {"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
        "score_home", "score_away", "ht_score_home", "ht_score_away"}
# Columns excluded from the feature set as leaky.
LEAKY = {"home_goals_form", "away_goals_form", "total_goals", "ht_total_goals",
         "squad_diff", "home_squad_quality", "away_squad_quality",
         "referee_home_bias", "referee_avg_goals"}


def _ht_missing(hh, ha) -> bool:
    """Return True when either half-time score is NaN (no half-time truth)."""
    return bool(np.isnan(hh) or np.isnan(ha))


# market -> (kind, [odds_cols aligned to classes], pick labels,
#            truth_fn(sh, sa, hh, ha) -> class idx or None)
def ou(line):
    """Build a full-time over/under truth function: 0=Over, 1=Under."""
    return lambda sh, sa, hh, ha: (0 if (sh + sa) > line else 1)


def htou(line):
    """Build a half-time over/under truth function: 0=Over, 1=Under.

    The returned function yields None when either half-time score is missing,
    so a half-known score is never scored as "Under".
    """
    return lambda sh, sa, hh, ha: (
        None if _ht_missing(hh, ha) else (0 if (hh + ha) > line else 1)
    )


def ms_truth(sh, sa, hh, ha):
    """Full-time 1X2 result: 0=home win, 1=draw, 2=away win."""
    return 0 if sh > sa else (1 if sh == sa else 2)


def ht_truth(sh, sa, hh, ha):
    """Half-time 1X2 result (0/1/2), or None if either half-time score is missing."""
    if _ht_missing(hh, ha):
        return None
    return 0 if hh > ha else (1 if hh == ha else 2)


def btts_truth(sh, sa, hh, ha):
    """Both-teams-to-score: 0=Yes, 1=No."""
    return 0 if (sh > 0 and sa > 0) else 1


MARKETS = {
    "MS": ("multi", ["odds_ms_h", "odds_ms_d", "odds_ms_a"], ["1", "X", "2"], ms_truth),
    "HT": ("multi", ["odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a"], ["1", "X", "2"], ht_truth),
    "OU05": ("binary", ["odds_ou05_o", "odds_ou05_u"], ["Üst", "Alt"], ou(0.5)),
    "OU15": ("binary", ["odds_ou15_o", "odds_ou15_u"], ["Üst", "Alt"], ou(1.5)),
    "OU25": ("binary", ["odds_ou25_o", "odds_ou25_u"], ["Üst", "Alt"], ou(2.5)),
    "OU35": ("binary", ["odds_ou35_o", "odds_ou35_u"], ["Üst", "Alt"], ou(3.5)),
    "HT_OU05": ("binary", ["odds_ht_ou05_o", "odds_ht_ou05_u"], ["Üst", "Alt"], htou(0.5)),
    "HT_OU15": ("binary", ["odds_ht_ou15_o", "odds_ht_ou15_u"], ["Üst", "Alt"], htou(1.5)),
    "BTTS": ("binary", ["odds_btts_y", "odds_btts_n"], ["Var", "Yok"], btts_truth),
}

PARAMS_M = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
            "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}
PARAMS_B = {"objective": "binary:logistic", "max_depth": 5, "eta": 0.05,
            "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}


def _new_tally() -> dict:
    """Return an empty bet accumulator (count, net units, wins)."""
    return {"n": 0, "pnl": 0.0, "win": 0}


def _record(tally: dict, odds: float, won: int) -> float:
    """Add one flat 1-unit bet to *tally* and return its profit/loss."""
    pnl = (odds - 1.0) if won else -1.0
    tally["n"] += 1
    tally["pnl"] += pnl
    tally["win"] += won
    return pnl


def _best_value_pick(probs, odds, truth, lo: float, hi: float, margin: float):
    """Return the highest-edge pick of one market for one match, or None.

    A pick qualifies when its odds are valid (> 1.0), lie in the band
    ``lo <= odds < hi`` and its edge (model prob minus implied prob 1/odds)
    exceeds *margin*. Ties keep the first pick (strict ``>``).

    Returns:
        ``(edge, pick_index, odds, won)`` with ``won`` as 0/1, or None.
    """
    best = None
    for k, (p, o) in enumerate(zip(probs, odds)):
        if o <= 1.0:  # missing odds were filled with 0.0; also guards 1/o
            continue
        edge = p - 1.0 / o
        if edge > margin and lo <= o < hi and (best is None or edge > best[0]):
            best = (edge, k, o, int(truth == k))
    return best


def _fit_predict(kind: str, x_train, y_train, x_test, rounds: int):
    """Train one market model and return class probabilities for *x_test*.

    For ``kind == "multi"`` the result is the booster's softprob output; for
    binary markets class 0 is the positive label and the result is the
    two-column matrix ``[p(class 0), p(class 1)]``.
    """
    # Imported lazily so `--help` and the pure truth helpers work without
    # loading the heavy dependency.
    import xgboost as xgb

    if kind == "multi":
        booster = xgb.train(PARAMS_M, xgb.DMatrix(x_train, label=y_train),
                            num_boost_round=rounds)
        return booster.predict(xgb.DMatrix(x_test))
    positive = (y_train == 0).astype(int)  # class 0 (Over / Yes) is the positive label
    booster = xgb.train(PARAMS_B, xgb.DMatrix(x_train, label=positive),
                        num_boost_round=rounds)
    p_pos = booster.predict(xgb.DMatrix(x_test))
    return np.column_stack([p_pos, 1.0 - p_pos])


def _fmt_line(name: str, d: dict) -> str:
    """Format one report row (bets, hit rate, ROI, net units) for a tally."""
    n = d["n"]
    roi = 100 * d["pnl"] / n if n else float("nan")
    hit = 100 * d["win"] / n if n else float("nan")
    return f" {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"


def main():
    """Run the walk-forward multi-market backtest and print the report.

    Reads the training CSV, derives truth for every market from the scores,
    then for each fold trains one model per market on all earlier rows and
    evaluates value bets on the fold's rows: (A) the best in-band value pick
    per market and (B) the single best value pick across markets per match.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=150)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.6)
    ap.add_argument("--margin", type=float, default=0.03)
    args = ap.parse_args()
    if args.folds < 1:
        ap.error("--folds must be >= 1")

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()  # keep only matches with a full-time score
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)

    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")

    # precompute truth + odds per market
    MK = {}
    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            print(f" skip {mname}: missing odds cols")
            continue
        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
        truth = np.array([tfn(a, b, c, d) for a, b, c, d in zip(SH, SA, HH, HA)],
                         dtype=object)
        # rows with usable truth (e.g. HT markets need a half-time score)
        valid = np.array([v is not None for v in truth], dtype=bool)
        MK[mname] = (kind, O, picks, truth, valid)

    # First half of the data is train-only; the rest is split into test folds.
    start = int(N * 0.5)
    bounds = np.linspace(start, N, args.folds + 1, dtype=int)

    # accumulators
    per_market = {m: _new_tally() for m in MK}   # (A) best value pick within market
    sel = {"n": 0, "pnl": 0.0, "win": 0, "fold": {}}  # (B) cross-market selector
    sel_by_mkt = {m: _new_tally() for m in MK}

    for fi in range(args.folds):
        te0, te1 = int(bounds[fi]), int(bounds[fi + 1])
        if te1 - te0 < 50:
            continue

        # train each market model on [:te0], predict test
        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
        for m, (kind, O, picks, truth, valid) in MK.items():
            valid_tr = valid[:te0]
            if not valid_tr.any():
                # nothing to train on for this market in this fold
                continue
            ytr = truth[:te0][valid_tr].astype(int)
            P = _fit_predict(kind, X[:te0][valid_tr], ytr, X[te0:te1], args.estimators)
            cand[m] = (P, O[te0:te1], truth[te0:te1])

        # iterate test matches
        for j in range(te1 - te0):
            best = None  # (edge, market, odds, won)
            for m, (P, Ot, Tt) in cand.items():
                t = Tt[j]
                if t is None:
                    continue
                pick = _best_value_pick(P[j], Ot[j], t, args.lo, args.hi, args.margin)
                if pick is None:
                    continue
                edge, _k, o, won = pick
                # (A) the market's single best in-band value pick for this match
                _record(per_market[m], o, won)
                # (B) candidate for the cross-market selector (first max wins)
                if best is None or edge > best[0]:
                    best = (edge, m, o, won)

            # selector: single best value bet across all markets for this match
            if best is not None:
                _edge, m, o, won = best
                pnl = _record(sel, o, won)
                sel["fold"][fi] = sel["fold"].get(fi, 0.0) + pnl
                _record(sel_by_mkt[m], o, won)
        print(f" fold {fi}: tested {te1-te0:,}")

    def roi_desc(m):
        # Sort key: highest ROI first; markets with no bets get key +99.
        d = per_market[m]
        return -(100 * d["pnl"] / d["n"] if d["n"] else -99)

    print("\n" + "=" * 70)
    print(f"(A) PER-MARKET value ROI (best value pick in band [{args.lo},{args.hi}], margin {args.margin})")
    print("=" * 70)
    for m in sorted(per_market, key=roi_desc):
        print(_fmt_line(m, per_market[m]))

    print("\n" + "=" * 70)
    print("(B) CROSS-MARKET SELECTOR (best value bet per match, all markets)")
    print("=" * 70)
    print(_fmt_line("SELECTOR", sel))
    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
    print(f" folds positive: {folds_pos}/{len(sel['fold'])}")
    print(" selector picks distributed across markets:")
    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]["n"]):
        if sel_by_mkt[m]["n"] > 0:
            print(" " + _fmt_line(m, sel_by_mkt[m]).strip())

    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")


if __name__ == "__main__":
    main()