""" Betting Policy — the honest, leak-free strategy the data actually supports. ========================================================================== Everything else in this repo bet UNDERDOGS (odds 6-7.5) and lost (-43.7% live). The data says the opposite: the only positive, fold-consistent, model-driven signal is MILD FAVOURITES the model rates above the market price. POLICY (MS / 1X2 only): * leak-free model (drops the result-encoding features, see LEAKY) * bet the model's single biggest value edge (model_prob - implied) ... * ONLY if the picked side's odds are in [--lo, --hi] (favourite band) * ONLY if that edge > --margin * flat 1u stake, one bet per match, never a longshot, never a parlay. Walk-forward, no leakage. Reports the policy ROI, fold consistency, drawdown, and the model-free baseline (blind favourite) so you can see the model's lift. ⚠️ HONEST CAVEAT: CSV odds are a static capture, not the verified obtainable closing line. A small backtest edge here is a LEAD, not a guarantee. Forward paper-trade with real CLV (capture_closing_odds.py) before risking money. Usage: python scripts/betting_policy.py --lo 1.5 --hi 2.2 --margin 0.0 --folds 8 """ from __future__ import annotations import argparse, os, sys import numpy as np, pandas as pd, xgboost as xgb if sys.stdout and hasattr(sys.stdout, "reconfigure"): try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv") META = {"match_id","home_team_id","away_team_id","league_id","mst_utc", "score_home","score_away","ht_score_home","ht_score_away"} LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals", "squad_diff","home_squad_quality","away_squad_quality", "referee_home_bias","referee_avg_goals"} def main(): ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--lo", type=float, default=1.5) ap.add_argument("--hi", type=float, default=2.2) ap.add_argument("--margin", type=float, default=0.0) ap.add_argument("--folds", type=int, default=8) ap.add_argument("--estimators", type=int, default=250) args = ap.parse_args() df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True) sh = pd.to_numeric(df["score_home"], errors="coerce") sa = pd.to_numeric(df["score_away"], errors="coerce") ok = sh.notna() & sa.notna() df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2)) O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY] X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values n = len(df); start = int(n*0.5) bounds = np.linspace(start, n, args.folds+1, dtype=int) params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05, "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0} print(f"POLICY: favourite band [{args.lo},{args.hi}] margin {args.margin} " f"leak-free feats={len(feats)} folds={args.folds}\n") all_pnl=[]; fold_rows=[]; base_pnl=[] for fi in range(args.folds): te0,te1 = bounds[fi], bounds[fi+1] if te1-te0 < 50: continue bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators) P = bst.predict(xgb.DMatrix(X[te0:te1])) yte, Ote = y[te0:te1], O[te0:te1] implied = np.where(Ote>1.0, 1.0/Ote, np.nan) edge = np.where(np.isnan(implied), -9.0, P-implied) pick = edge.argmax(1); pe = edge[np.arange(len(yte)),pick]; po = Ote[np.arange(len(yte)),pick] bet = (pe>args.margin) & (po>=args.lo) & (po=args.lo)&(fo1.0).all(1) bpnl=np.where(fav[bmask]==yte[bmask], fo[bmask]-1.0, -1.0) roi = 100*pnl.sum()/len(pnl) if len(pnl) else float('nan') broi= 100*bpnl.sum()/len(bpnl) if len(bpnl) else float('nan') fold_rows.append((fi, len(pnl), 100*win.sum()/max(bet.sum(),1), roi, broi)) all_pnl.extend(pnl.tolist()); base_pnl.extend(bpnl.tolist()) print(f" fold {fi}: policy_bets={len(pnl):>4} hit={100*win.sum()/max(bet.sum(),1):>5.1f}% " f"ROI={roi:>7.2f}% | baseline(blind fav) ROI={broi:>7.2f}%") a=np.array(all_pnl); b=np.array(base_pnl) print("\n"+"="*70) print("AGGREGATE") print("="*70) if len(a): cum=np.cumsum(a); peak=np.maximum.accumulate(cum); dd=(cum-peak).min() folds_pos=sum(1 for r in fold_rows if r[3]>0) print(f" POLICY: bets={len(a):>5} hit={100*(a>0).mean():.1f}% " f"ROI={100*a.mean():+.2f}% net={a.sum():+.1f}u maxDD={dd:.1f}u " f"folds+={folds_pos}/{len(fold_rows)}") if len(b): print(f" BASELINE: bets={len(b):>5} hit={100*(b>0).mean():.1f}% " f"ROI={100*b.mean():+.2f}% (blind favourite, same band)") if len(a): print(f"\n MODEL LIFT over blind favourite: " f"{100*a.mean()-100*b.mean():+.1f} percentage points") print("\nREAD: a believable system has ROI>0, folds+ near full, tolerable maxDD,") print("and clearly beats the blind-favourite baseline. Even then it's a LEAD —") print("forward paper-trade with real CLV before staking real money.") if __name__ == "__main__": main()