diff --git a/ai-engine/scripts/analyze_match_v2.py b/ai-engine/scripts/analyze_match_v2.py new file mode 100644 index 0000000..ab1e523 --- /dev/null +++ b/ai-engine/scripts/analyze_match_v2.py @@ -0,0 +1,137 @@ +""" +Analyze Match v2 — the per-match multi-market value board + disciplined pick. +=========================================================================== +Answers "for ONE match, show every bet type's probability + model signal + +market-vs-model value, and pick the right bet." Leak-free models. + +KEY HONEST RULE (proven by multi_market_edge.py): compute & SHOW value for all +markets, but only MS (1X2) carries real, fold-consistent model edge. In OU/HT/ +BTTS the market is efficient — a big model-vs-market gap there is the MODEL'S +ERROR, not value. So non-MS rows are INFO-ONLY; only an MS value bet in the +favourite band is STAKED. + +Demo: trains all market models on the first 85% of history, then prints the full +board for sample matches in the unseen last 15% (with what actually happened). 
def ou(line):
    """Build a full-time over/under labeller for goal line *line*.

    The returned callable takes ``(score_home, score_away, ht_home, ht_away)``
    and returns 0 when the full-time goal total is strictly above *line*
    (the "over" pick) and 1 otherwise (the "under" pick).
    """
    return lambda sh, sa, hh, ha: (0 if (sh + sa) > line else 1)


def htou(line):
    """Build a half-time over/under labeller for goal line *line*.

    Same contract as :func:`ou` but on the half-time scores. Returns ``None``
    when EITHER half-time score is NaN, so the row is excluded from training
    and shown as unsettled instead of being silently labelled "under"
    (``NaN > line`` is always False).
    """
    def label(sh, sa, hh, ha):
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1

    return label
# XGBoost parameters: PM for the 3-way (multi) markets, PB for the binary ones.
PM = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
      "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}
PB = {"objective": "binary:logistic", "max_depth": 5, "eta": 0.05,
      "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}


def _fmt_elo(value):
    """Format an Elo value for the match header; '?' when missing or non-numeric."""
    try:
        num = float(value)
    except (TypeError, ValueError):
        return "?"
    return "?" if np.isnan(num) else f"{num:.0f}"


def main():
    """Train one model per market on the first 85% of history and print the value board.

    Without ``--match`` the board is printed for ``--n`` matches spread evenly
    over the last 15% (the holdout). With ``--match`` the board is printed for
    that match id; if it lies inside the training window a warning is shown
    because its probabilities are in-sample.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--n", type=int, default=6, help="how many sample matches")
    ap.add_argument("--match", help="specific match_id")
    ap.add_argument("--estimators", type=int, default=250)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()  # keep only matches with a full-time score
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    cut = int(N * 0.85)
    if cut == 0:
        print(f"Not enough settled matches in {CSV} to train (rows={N}).")
        return
    print(f"Training {len(MARKETS)} leak-free market models on {cut:,} matches ...")

    models = {}
    for m, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            continue  # this market's odds are not in the CSV
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(cut)], dtype=object)
        valid = np.array([v is not None for v in truth])
        labels = truth[valid].astype(int)
        if kind == "multi":
            dtrain = xgb.DMatrix(X[:cut][valid], label=labels)
            b = xgb.train(PM, dtrain, num_boost_round=args.estimators)
        else:
            # binary target is "first pick happened", so predict() yields P(picks[0])
            dtrain = xgb.DMatrix(X[:cut][valid], label=(labels == 0).astype(int))
            b = xgb.train(PB, dtrain, num_boost_round=args.estimators)
        models[m] = (kind, ocols, picks, tfn, b)

    # choose which rows to show
    if args.match:
        rows = df.index[df["match_id"].astype(str) == str(args.match)].tolist()
        if not rows:
            print(f"match_id {args.match} not found among settled matches in {CSV}")
            return
    else:
        holdout = N - cut
        if args.n <= 0 or holdout <= 0:
            print("No holdout matches to show.")
            return
        rows = [cut + int(p) for p in np.linspace(0, holdout - 1, args.n, dtype=int)]

    for gi in rows:
        r = df.iloc[gi]
        xrow = X[gi:gi + 1]
        sh_, sa_, hh_, ha_ = SH[gi], SA[gi], HH[gi], HA[gi]
        ht = f"{int(hh_)}-{int(ha_)}" if not (np.isnan(hh_) or np.isnan(ha_)) else "?"
        print("\n" + "=" * 72)
        print(f"MATCH {r['match_id']} | elo H{_fmt_elo(r.get('home_overall_elo'))}"
              f" vs A{_fmt_elo(r.get('away_overall_elo'))}"
              f" | ACTUAL {int(sh_)}-{int(sa_)} (HT {ht})")
        if gi < cut:
            print(" WARNING: match is inside the training window; probabilities are in-sample.")
        print(f" {'market':<8}{'pick':<10}{'model%':>8}{'impl%':>7}{'edge':>7}{'odds':>7} flag result")
        print(" " + "-" * 64)
        best_ms = None
        for m, (kind, ocols, picks, tfn, b) in models.items():
            if kind == "multi":
                P = b.predict(xgb.DMatrix(xrow))[0]
            else:
                p = float(b.predict(xgb.DMatrix(xrow))[0])
                P = np.array([p, 1 - p])
            O = pd.to_numeric(r[ocols], errors="coerce").fillna(0.0).values
            truth = tfn(sh_, sa_, hh_, ha_)
            for k, label in enumerate(picks):
                o = O[k]
                if o <= 1.0:
                    continue  # missing / invalid price
                imp = 1.0 / o
                edge = P[k] - imp
                res = "—" if truth is None else ("WON" if truth == k else "lost")
                has_value = edge > STAKE_MARGIN
                # only staked markets, and only inside the favourite odds band, are bet
                staked = (m in STAKED_MARKETS) and has_value and STAKE_LO <= o < STAKE_HI
                flag = "STAKE" if staked else ("value" if has_value else "")
                print(f" {m:<8}{label:<10}{100*P[k]:>7.1f}{100*imp:>7.1f}"
                      f"{100*edge:>+7.1f}{o:>7.2f} {flag:<5} {res}")
                if staked and (best_ms is None or edge > best_ms[0]):
                    best_ms = (edge, m, label, o, res)
        print(" " + "-" * 64)
        if best_ms:
            e, m, p, o, res = best_ms
            print(f" >>> STAKE: {m} {p} @ {o:.2f} (edge +{100*e:.1f}%, favourite band) -> {res}")
        else:
            print(" >>> NO STAKE: no MS value in favourite band. (Other markets info-only —")
            print(" their 'value' is model error in efficient markets; do NOT chase it.)")
    print("\nNOTE: only MS staked (proven edge). All markets shown for transparency.")
    print("Forward-validate with CLV before real money. Static CSV odds may overstate edge.")


if __name__ == "__main__":
    main()
+ +Usage: python scripts/betting_policy.py --lo 1.5 --hi 2.2 --margin 0.0 --folds 8 +""" +from __future__ import annotations +import argparse, os, sys +import numpy as np, pandas as pd, xgboost as xgb + +if sys.stdout and hasattr(sys.stdout, "reconfigure"): + try: sys.stdout.reconfigure(encoding="utf-8") + except Exception: pass + +AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv") +META = {"match_id","home_team_id","away_team_id","league_id","mst_utc", + "score_home","score_away","ht_score_home","ht_score_away"} +LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals", + "squad_diff","home_squad_quality","away_squad_quality", + "referee_home_bias","referee_avg_goals"} + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--lo", type=float, default=1.5) + ap.add_argument("--hi", type=float, default=2.2) + ap.add_argument("--margin", type=float, default=0.0) + ap.add_argument("--folds", type=int, default=8) + ap.add_argument("--estimators", type=int, default=250) + args = ap.parse_args() + + df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True) + sh = pd.to_numeric(df["score_home"], errors="coerce") + sa = pd.to_numeric(df["score_away"], errors="coerce") + ok = sh.notna() & sa.notna() + df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values + y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2)) + O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values + feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY] + X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values + + n = len(df); start = int(n*0.5) + bounds = np.linspace(start, n, args.folds+1, dtype=int) + params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05, + 
"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0} + + print(f"POLICY: favourite band [{args.lo},{args.hi}] margin {args.margin} " + f"leak-free feats={len(feats)} folds={args.folds}\n") + all_pnl=[]; fold_rows=[]; base_pnl=[] + for fi in range(args.folds): + te0,te1 = bounds[fi], bounds[fi+1] + if te1-te0 < 50: continue + bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators) + P = bst.predict(xgb.DMatrix(X[te0:te1])) + yte, Ote = y[te0:te1], O[te0:te1] + implied = np.where(Ote>1.0, 1.0/Ote, np.nan) + edge = np.where(np.isnan(implied), -9.0, P-implied) + pick = edge.argmax(1); pe = edge[np.arange(len(yte)),pick]; po = Ote[np.arange(len(yte)),pick] + bet = (pe>args.margin) & (po>=args.lo) & (po=args.lo)&(fo1.0).all(1) + bpnl=np.where(fav[bmask]==yte[bmask], fo[bmask]-1.0, -1.0) + roi = 100*pnl.sum()/len(pnl) if len(pnl) else float('nan') + broi= 100*bpnl.sum()/len(bpnl) if len(bpnl) else float('nan') + fold_rows.append((fi, len(pnl), 100*win.sum()/max(bet.sum(),1), roi, broi)) + all_pnl.extend(pnl.tolist()); base_pnl.extend(bpnl.tolist()) + print(f" fold {fi}: policy_bets={len(pnl):>4} hit={100*win.sum()/max(bet.sum(),1):>5.1f}% " + f"ROI={roi:>7.2f}% | baseline(blind fav) ROI={broi:>7.2f}%") + + a=np.array(all_pnl); b=np.array(base_pnl) + print("\n"+"="*70) + print("AGGREGATE") + print("="*70) + if len(a): + cum=np.cumsum(a); peak=np.maximum.accumulate(cum); dd=(cum-peak).min() + folds_pos=sum(1 for r in fold_rows if r[3]>0) + print(f" POLICY: bets={len(a):>5} hit={100*(a>0).mean():.1f}% " + f"ROI={100*a.mean():+.2f}% net={a.sum():+.1f}u maxDD={dd:.1f}u " + f"folds+={folds_pos}/{len(fold_rows)}") + if len(b): + print(f" BASELINE: bets={len(b):>5} hit={100*(b>0).mean():.1f}% " + f"ROI={100*b.mean():+.2f}% (blind favourite, same band)") + if len(a): + print(f"\n MODEL LIFT over blind favourite: " + f"{100*a.mean()-100*b.mean():+.1f} percentage points") + print("\nREAD: a believable system has ROI>0, 
folds+ near full, tolerable maxDD,") + print("and clearly beats the blind-favourite baseline. Even then it's a LEAD —") + print("forward paper-trade with real CLV before staking real money.") + + +if __name__ == "__main__": + main() diff --git a/ai-engine/scripts/capture_closing_odds.py b/ai-engine/scripts/capture_closing_odds.py new file mode 100644 index 0000000..ac97966 --- /dev/null +++ b/ai-engine/scripts/capture_closing_odds.py @@ -0,0 +1,136 @@ +""" +Capture Closing Odds — snapshot #2 of the minimal 2-snapshot CLV system. +======================================================================= +WHY: CLV (closing line value) is the only reliable proof of betting edge. +This codebase never captured it: odds are stored as a single static snapshot +and `odds_history` is empty. But the live sync (DataFetcherTask CRON 1) DOES +refresh `live_matches.odds` every 15 min before kickoff, and prediction_runs +already store the bet-time odds blob (odds_snapshot.odds, source=live_match). + +This script supplies the missing half: just before kickoff it copies the +*current* live odds blob onto the match's latest prediction_run as +`odds_snapshot.closing_odds`. Later, CLV per bet = bet-time pick odds vs +closing pick odds (computed in live_scoreboard.py once enough data exists). + +Run it every ~15 min (e.g. alongside the existing sync, or its own cron): + python scripts/capture_closing_odds.py # default 25-min window + python scripts/capture_closing_odds.py --window-min 20 --dry-run + +Structure-agnostic: stores the whole live odds blob; no pick parsing here. +Idempotent: skips runs that already have closing_odds. Only ADDS a JSON key, +never deletes. Safe to run repeatedly. + +⚠️ Needs one supervised test run against a live DB with upcoming matches + before scheduling (DB was down at authoring time). 
def _as_snapshot_dict(raw):
    """Return a stored odds_snapshot as a dict ({} when NULL, unparsable or not a JSON object)."""
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
    return raw if isinstance(raw, dict) else {}


def main() -> int:
    """Copy the current live odds onto each imminent match's latest prediction run.

    Selects ``live_matches`` rows with odds whose ``mst_utc`` lies between
    now minus ``--grace-min`` and now plus ``--window-min`` (epoch milliseconds),
    and merges a ``closing_odds`` patch into ``prediction_runs.odds_snapshot``
    for the most recent run of each match. Runs that already carry
    ``closing_odds`` are skipped; ``--dry-run`` only reports.

    Returns:
        Process exit code (always 0).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--window-min", type=int, default=25,
                    help="Capture matches kicking off within the next N minutes (default 25)")
    ap.add_argument("--grace-min", type=int, default=10,
                    help="Also include matches that kicked off up to N min ago (default 10)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Report what would be captured without writing")
    args = ap.parse_args()

    now_ms = int(time.time() * 1000)
    lo_ms = now_ms - args.grace_min * 60 * 1000
    hi_ms = now_ms + args.window_min * 60 * 1000

    captured = skipped = no_run = 0
    conn = psycopg2.connect(get_clean_dsn())
    try:
        # `with conn` is only a transaction scope (commit on success, rollback
        # on error); psycopg2 does NOT close the connection there, hence the
        # explicit close() in the finally clause.
        with conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Upcoming/just-started live matches that still hold pre-kickoff odds.
                cur.execute(
                    """
                    SELECT id, mst_utc, odds
                    FROM live_matches
                    WHERE odds IS NOT NULL
                      AND mst_utc BETWEEN %s AND %s
                    ORDER BY mst_utc ASC
                    """,
                    (lo_ms, hi_ms),
                )
                matches = cur.fetchall()
                print(f"[capture_closing_odds] window={args.window_min}m grace={args.grace_min}m "
                      f"upcoming_with_odds={len(matches)} dry_run={args.dry_run}")

                for m in matches:
                    mid = m["id"]
                    cur.execute(
                        """
                        SELECT id, odds_snapshot
                        FROM prediction_runs
                        WHERE match_id = %s
                        ORDER BY generated_at DESC
                        LIMIT 1
                        """,
                        (mid,),
                    )
                    run = cur.fetchone()
                    if not run:
                        no_run += 1
                        continue
                    snap = _as_snapshot_dict(run["odds_snapshot"])
                    if snap.get("closing_odds") is not None:
                        skipped += 1  # idempotent: never overwrite an earlier capture
                        continue

                    patch = {
                        "closing_odds": m["odds"],
                        "closing_captured_at": datetime.now(timezone.utc).isoformat(),
                        "closing_mst_utc": m["mst_utc"],
                        "closing_source": "live_match",
                    }
                    if args.dry_run:
                        captured += 1
                        print(f" would capture match={mid} run_id={run['id']} mst_utc={m['mst_utc']}")
                        continue
                    # jsonb `||` merges keys, so existing snapshot content is kept
                    cur.execute(
                        """
                        UPDATE prediction_runs
                        SET odds_snapshot = COALESCE(odds_snapshot, '{}'::jsonb) || %s::jsonb
                        WHERE id = %s
                        """,
                        (json.dumps(patch, default=str), run["id"]),
                    )
                    captured += 1
    finally:
        conn.close()

    print(f"[capture_closing_odds] captured={captured} already_had={skipped} "
          f"no_prediction_run={no_run}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
# market code -> Turkish odds-category name for the over/under goal lines
OU_CATS = {"OU05": "0,5 Alt/Üst", "OU15": "1,5 Alt/Üst", "OU25": "2,5 Alt/Üst",
           "OU35": "3,5 Alt/Üst", "OU45": "4,5 Alt/Üst"}

# Accepted double-chance spellings (after upper-casing, dropping spaces, "/" -> "-").
_DC_KEYS = {"1X": "1-X", "X1": "1-X", "X2": "X-2", "2X": "X-2",
            "12": "1-2", "21": "1-2", "1-X": "1-X", "X-2": "X-2", "1-2": "1-2"}
_RESULT_KEYS = ("1", "X", "2")
_INF = float("inf")


def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
    """Convert *x* to a finite float; return *d* for None, unparsable or non-finite input."""
    if x is None:
        return d
    try:
        v = float(x)
    except (TypeError, ValueError, OverflowError):
        return d
    # NaN compares False against everything, so it would pass an `odds <= 1.0`
    # rejection test and turn every mean it touches into NaN.
    if v != v or v in (_INF, -_INF):
        return d
    return v


def _parse(j: Any) -> Dict[str, Any]:
    """Return *j* as a dict, decoding JSON text first; {} for anything that is not a JSON object."""
    if isinstance(j, (str, bytes, bytearray)):
        try:
            j = json.loads(j)
        except ValueError:  # json.JSONDecodeError / bad encoding
            return {}
    return j if isinstance(j, dict) else {}


def map_pick(market: str, pick: str) -> Optional[Tuple[str, str]]:
    """Return (category_name, selection_key) for the live-odds JSON / odd_selections.

    *market* is matched case-insensitively. *pick* accepts the Turkish or
    English wording for each side. Returns ``None`` when the market is not
    supported or the pick cannot be mapped to a selection.
    """
    m = str(market or "").upper()
    p = str(pick or "").strip()
    pu = p.upper()
    # casefold() turns Turkish dotted capital "İ" into "i" + U+0307 (combining
    # dot), which would stop "ÇİFT" from containing "çift" — drop the mark.
    pl = p.casefold().replace("\u0307", "")

    if m in ("MS", "ML", "1X2"):
        return ("Maç Sonucu", pu) if pu in _RESULT_KEYS else None
    if m == "HT":
        return ("1. Yarı Sonucu", pu) if pu in _RESULT_KEYS else None
    if m in OU_CATS:
        if "üst" in pl or "ust" in pl or "over" in pl:
            return (OU_CATS[m], "Üst")
        if "alt" in pl or "under" in pl:
            return (OU_CATS[m], "Alt")
        return None
    if m == "DC":
        norm = _DC_KEYS.get(pu.replace(" ", "").replace("/", "-"))
        return ("Çifte Şans", norm) if norm else None
    if m == "BTTS":
        if "var" in pl or "yes" in pl:
            return ("Karşılıklı Gol", "Var")
        if "yok" in pl or "no" in pl:
            return ("Karşılıklı Gol", "Yok")
        return None
    if m == "OE":
        if "tek" in pl or "odd" in pl:
            return ("Tek/Çift", "Tek")
        if "çift" in pl or "cift" in pl or "even" in pl:
            return ("Tek/Çift", "Çift")
        return None
    return None


def closing_from_blob(blob: Any, cat: str, sel: str) -> Optional[float]:
    """Look up the odds for selection *sel* in category *cat* of an odds blob.

    *blob* may be a dict or JSON text shaped ``{category: {selection: odds}}``.
    Returns ``None`` when the category/selection is absent or the value is not
    a finite number.
    """
    cat_map = _parse(blob).get(cat)
    if isinstance(cat_map, dict):
        return _f(cat_map.get(sel))
    return None
_parse(r["payload_summary"]) + mp = ps.get("main_pick") or {} + market = mp.get("market") + pick = mp.get("pick") + bet_odds = _f(mp.get("odds")) + playable = bool(mp.get("playable")) + if args.staked_only and not playable: + continue + if not market or not pick or not bet_odds or bet_odds <= 1.0: + continue + mapped = map_pick(market, pick) + if not mapped or not mapped[1]: + continue + cat, sel = mapped + + # closing line: prefer captured closing_odds, else static odd_selections + closing = closing_from_blob(snap.get("closing_odds"), cat, sel) + src = "captured" + if closing is None: + cur.execute(""" + SELECT os.odd_value FROM odd_categories oc + JOIN odd_selections os ON os.odd_category_db_id = oc.db_id + WHERE oc.match_id = %s AND oc.name = %s AND os.name = %s + LIMIT 1 + """, (r["match_id"], cat, sel)) + row = cur.fetchone() + closing = _f(row["odd_value"]) if row else None + src = "static_proxy" + if closing is None or closing <= 1.0: + continue + + clv = bet_odds / closing - 1.0 + rows_out.append({ + "market": market, "playable": playable, + "bet_odds": bet_odds, "closing": closing, "clv": clv, + "src": src, "profit": _f(r["unit_profit"], 0.0) or 0.0, + "settled": r["eventual_outcome"] is not None + and not str(r["eventual_outcome"]).startswith("NO_BET"), + }) + + if not rows_out: + print("No mappable runs with both bet-time and closing odds found.") + return 0 + + def agg(rs): + n = len(rs) + clvs = [x["clv"] for x in rs] + pos = sum(1 for c in clvs if c > 0) + return { + "n": n, + "mean_clv_pct": round(100.0 * sum(clvs) / n, 2), + "pct_positive": round(100.0 * pos / n, 1), + "captured": sum(1 for x in rs if x["src"] == "captured"), + } + + print("=" * 70) + print("CLV REPORT — did we beat the closing line? 
(the edge compass)") + print("=" * 70) + o = agg(rows_out) + print(f"runs analyzed: {o['n']} (closing source: {o['captured']} captured, " + f"{o['n'] - o['captured']} static-proxy)") + print(f"\nOVERALL mean CLV: {o['mean_clv_pct']}% " + f"bets beating close: {o['pct_positive']}%") + print(" (positive mean CLV = real edge; ~0 or negative = no edge)\n") + + staked = [x for x in rows_out if x["playable"]] + if staked: + s = agg(staked) + print(f"STAKED only: n={s['n']} mean CLV={s['mean_clv_pct']}% " + f"beating close={s['pct_positive']}%\n") + + print("BY MARKET") + by_m = defaultdict(list) + for x in rows_out: + by_m[x["market"]].append(x) + for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])): + a = agg(rs) + print(f" {m:<8} n={a['n']:>4} mean CLV={a['mean_clv_pct']:>7}% " + f"beating close={a['pct_positive']:>5}%") + + # CLV vs outcome sanity: do positive-CLV bets actually win more / lose less? + print("\nCLV vs realized P/L (settled staked)") + ss = [x for x in rows_out if x["playable"] and x["settled"]] + if ss: + posc = [x for x in ss if x["clv"] > 0] + negc = [x for x in ss if x["clv"] <= 0] + for label, grp in (("CLV>0", posc), ("CLV<=0", negc)): + if grp: + pr = sum(x["profit"] for x in grp) + print(f" {label:<7} n={len(grp):>3} profit={pr:>7.2f}u " + f"ROI(flat1u)={round(100*pr/len(grp),1)}%") + print("=" * 70) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/ai-engine/scripts/edge_search.py b/ai-engine/scripts/edge_search.py new file mode 100644 index 0000000..04b78f1 --- /dev/null +++ b/ai-engine/scripts/edge_search.py @@ -0,0 +1,181 @@ +""" +Edge Search — is there a profitable POCKET (by league) the global model misses? +============================================================================== +Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low- +tier leagues may be mispriced. 
def league_names(ids):
    """Resilient id->name lookup.

    Queries the ``leagues`` table for the given ids and returns a
    ``{str(id): name}`` dict. The lookup is best-effort: it is attempted up to
    three times with a one-second pause after a failure, and whatever was
    collected (possibly nothing) is returned if every attempt fails.
    """
    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor
    out = {}
    ids = [str(i) for i in ids if i is not None]
    if not ids:
        return out
    for _ in range(3):
        try:
            with psycopg2.connect(get_clean_dsn()) as c:
                with c.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (ids,))
                    for r in cur.fetchall():
                        out[str(r["id"])] = r["name"]
            return out
        except Exception:
            # deliberate best-effort: names are cosmetic, so retry then give up quietly
            time.sleep(1.0)
    return out


def _print_bucket_table(sub, title, min_bets, limit=None):
    """Print one ROI table: a row per bucket in *sub* that has at least *min_bets* bets.

    *sub* holds one record per bet with columns grp/fold/pnl/win. Rows are
    sorted by ROI descending; *limit* caps how many rows are printed.
    """
    if sub.empty:
        return
    print(f"\n{title}")
    print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print(" " + "-" * 54)
    rows = []
    for key, grp in sub.groupby("grp"):
        nb = len(grp)
        if nb < min_bets:
            continue
        fold_pnl = grp.groupby("fold")["pnl"].sum()
        rows.append((100 * grp["pnl"].sum() / nb, key, nb, 100 * grp["win"].sum() / nb,
                     int((fold_pnl > 0).sum()), fold_pnl.shape[0]))
    for roi, key, nb, hit, fpos, ftot in sorted(rows, reverse=True)[:limit]:
        print(f" {key:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpos)+'/'+str(ftot):>8}")


def main():
    """Walk a leak-free MS model forward and report value-bet ROI per bucket.

    The second half of the (time-sorted) data is split into ``--folds`` test
    windows; for each one a model is trained on everything before it. Every
    value bet (largest model-minus-implied edge above ``--margin``) is
    recorded under three bucket keys: reliability band, odds band, and their
    cross product. Each bucket family is then printed as its own table.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=6)
    ap.add_argument("--estimators", type=int, default=200)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--min-bets", type=int, default=150)
    args = ap.parse_args()

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    odds = (df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]]
            .apply(pd.to_numeric, errors="coerce").fillna(0.0).values)

    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    # -1 marks "reliability unknown" (column missing or value not numeric)
    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan] * len(df))),
                        errors="coerce").fillna(-1.0).values
    print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}")

    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
              "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}

    # reliability quartile edges from the betting universe (rel>=0)
    rv = rel[rel >= 0]
    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]

    def rel_band(x):
        if x < 0:
            return "rel:unknown"
        if x < qs[0]:
            return f"rel:Q1(<{qs[0]:.2f})"
        if x < qs[1]:
            return "rel:Q2"
        if x < qs[2]:
            return "rel:Q3"
        return f"rel:Q4(>={qs[2]:.2f})"

    def odds_band(o):
        return ("<1.5" if o < 1.5 else "1.5-2" if o < 2 else "2-3" if o < 3 else
                "3-5" if o < 5 else "5-8" if o < 8 else "8+")

    recs = []  # (group_key, fold, pnl, win)
    glob = {"n": 0, "pnl": 0.0, "win": 0}
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < 50:
            continue  # window too small to say anything
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]),
                        num_boost_round=args.estimators)
        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
        implied = np.where(ote > 1.0, 1.0 / ote, np.nan)
        # -9 keeps sides without a valid price from ever being the argmax
        edge = np.where(np.isnan(implied), -9.0, proba - implied)
        pick = edge.argmax(1)
        bet = edge[np.arange(len(yte)), pick] > args.margin
        win = (pick == yte) & bet
        pick_odds = ote[np.arange(len(yte)), pick]
        pnl = np.where(win, pick_odds - 1.0, -1.0)
        for i in np.flatnonzero(bet):
            glob["n"] += 1
            glob["pnl"] += pnl[i]
            glob["win"] += int(win[i])
            rb, ob = rel_band(rte[i]), odds_band(pick_odds[i])
            for key in (rb, ob, rb + " x " + ob):
                recs.append((key, fi, pnl[i], int(win[i])))
        print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}")

    print("\n" + "=" * 78)
    print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% "
          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
    print("=" * 78)

    rdf = pd.DataFrame(recs, columns=["grp", "fold", "pnl", "win"])
    if not rdf.empty:
        # Cross buckets ("rel:Q1 x 2-3") also start with "rel:", so the three
        # families must be separated explicitly or they leak into each other.
        is_cross = rdf["grp"].str.contains(" x ", regex=False)
        is_rel = rdf["grp"].str.startswith("rel:")
        _print_bucket_table(rdf[is_rel & ~is_cross],
                            "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)",
                            args.min_bets)
        _print_bucket_table(rdf[~is_rel & ~is_cross], "BY ODDS BAND", args.min_bets)
        _print_bucket_table(rdf[is_cross],
                            "BY RELIABILITY x ODDS (candidate pockets, n>=min-bets)",
                            args.min_bets, limit=15)
    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")


if __name__ == "__main__":
    main()
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODEL_DIR = os.path.join(AI_DIR, "models", "favorite_v1")
TRAIN_CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
PAPER_LOG = os.path.join(AI_DIR, "data", "paper_trades.csv")
MS_ODDS = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
MS_PICKS = ["1", "X", "2"]


def load_model():
    """Load the saved booster together with its feature list and metadata.

    Returns:
        Tuple ``(booster, feature_cols, metadata)`` read from ``model.json``,
        ``feature_cols.json`` and ``metadata.json`` under ``MODEL_DIR``.
    """
    bst = xgb.Booster()
    bst.load_model(os.path.join(MODEL_DIR, "model.json"))
    with open(os.path.join(MODEL_DIR, "feature_cols.json"), encoding="utf-8") as f:
        feats = json.load(f)
    with open(os.path.join(MODEL_DIR, "metadata.json"), encoding="utf-8") as f:
        meta = json.load(f)
    return bst, feats, meta


def pick_for_rows(df, bst, feats, lo, hi, margin):
    """Score every row of *df* and return one MS pick record per row.

    Args:
        df: matches; feature and odds columns missing from it are treated as 0.
        bst: booster whose ``predict`` yields one probability per MS class.
        feats: feature column order the booster was trained with.
        lo, hi: a pick is staked only when its odds satisfy ``lo <= odds < hi``.
        margin: minimum ``model_prob - 1/odds`` edge required to stake.

    Returns:
        List of dicts with keys ``idx``, ``pick``, ``odds``, ``model_prob``,
        ``edge`` and ``staked`` (one per row, in row order).
    """
    if len(df) == 0:
        return []
    X = df.reindex(columns=feats).apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    P = bst.predict(xgb.DMatrix(X))  # [n,3] home/draw/away
    # reindex (not df[MS_ODDS]) so an absent odds column means "no price"
    # instead of a KeyError; such rows can never be staked.
    O = (df.reindex(columns=MS_ODDS).apply(pd.to_numeric, errors="coerce")
         .fillna(0.0).to_numpy(dtype=float))
    # 1/odds only where a real price exists; avoids divide-by-zero warnings.
    implied = np.divide(1.0, O, out=np.full(O.shape, np.nan), where=O > 1.0)
    edge = np.where(np.isnan(implied), -9.0, P - implied)
    out = []
    for i, k in enumerate(edge.argmax(axis=1)):
        k = int(k)
        o = float(O[i, k])
        e = float(edge[i, k])
        staked = (e > margin) and (lo <= o < hi)
        out.append({"idx": i, "pick": MS_PICKS[k], "odds": round(o, 2),
                    "model_prob": round(float(P[i, k]), 4), "edge": round(e, 4),
                    "staked": staked})
    return out


def settle():
    """Settle open paper trades against the scores in the training CSV.

    Rows of ``PAPER_LOG`` whose ``result`` is empty are marked ``WON`` /
    ``LOST`` when both full-time scores for their ``match_id`` are present in
    ``TRAIN_CSV``; ``pnl`` is recomputed on a flat 1u basis. The log is only
    rewritten when there was something open. A summary of all settled bets is
    printed either way.
    """
    if not os.path.exists(PAPER_LOG):
        print("No paper_trades.csv yet.")
        return
    pt = pd.read_csv(PAPER_LOG)
    if "result" not in pt.columns:
        pt["result"] = np.nan
    # an all-empty column is read as float; make it able to hold "WON"/"LOST"
    pt["result"] = pt["result"].astype(object)
    open_mask = pt["result"].isna()
    had_open = bool(open_mask.any())
    if had_open:
        # settle from training CSV scores if present, else needs DB (left as note)
        src = pd.read_csv(TRAIN_CSV, low_memory=False,
                          usecols=["match_id", "score_home", "score_away"])
        # Compare ids as strings: the log stores str(match_id) but read_csv may
        # re-infer an integer dtype on either side.
        src["match_id"] = src["match_id"].astype(str)
        src = src.drop_duplicates("match_id", keep="first").set_index("match_id")
        sh = pd.to_numeric(src["score_home"], errors="coerce")
        sa = pd.to_numeric(src["score_away"], errors="coerce")
        outcome = pd.Series(np.where(sh > sa, "1", np.where(sh == sa, "X", "2")),
                            index=src.index, dtype=object)
        # a match is settled only when BOTH scores are known
        outcome = outcome[sh.notna() & sa.notna()]
        known = pt["match_id"].astype(str).map(outcome)
        settle_now = open_mask & known.notna()
        won = pt["pick"].astype(str) == known
        pt.loc[settle_now & won, "result"] = "WON"
        pt.loc[settle_now & ~won, "result"] = "LOST"
    else:
        print("No open bets to settle.")
    odds = pd.to_numeric(pt["odds"], errors="coerce")
    pt["pnl"] = np.where(pt["result"] == "WON", odds - 1.0,
                         np.where(pt["result"] == "LOST", -1.0, np.nan))
    if had_open:
        pt.to_csv(PAPER_LOG, index=False)
    s = pt.dropna(subset=["pnl"])
    if len(s):
        print(f"Settled {len(s)} bets: hit={100*(s['result']=='WON').mean():.1f}% "
              f"ROI={100*s['pnl'].sum()/len(s):+.2f}% net={s['pnl'].sum():+.1f}u")


def main():
    """CLI entry point: print today's staked MS picks, optionally log them."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--features", help="CSV of upcoming matches in training schema")
    ap.add_argument("--demo", action="store_true", help="use tail of training CSV as 'today'")
    ap.add_argument("--n", type=int, default=20)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.03)
    ap.add_argument("--settle", action="store_true")
    ap.add_argument("--log", action="store_true", help="append staked picks to paper_trades.csv")
    args = ap.parse_args()

    if args.settle:
        settle()
        return

    bst, feats, meta = load_model()
    print(f"Model {meta['version']} (trained {meta['trained_at']}, holdout "
          f"ROI {meta['holdout_eval']['roi_pct']}%) band[{args.lo},{args.hi}] margin {args.margin}\n")

    # NOTE: demo mode is selected by the ABSENCE of --features; --demo itself
    # is accepted for readability of the command line but is not required.
    if args.features:
        df = pd.read_csv(args.features, low_memory=False)
        demo = False
    else:
        df = (pd.read_csv(TRAIN_CSV, low_memory=False)
              .sort_values("mst_utc").tail(args.n).reset_index(drop=True))
        demo = True
        print("(DEMO: last matches of training CSV as stand-in for today)\n")

    picks = pick_for_rows(df, bst, feats, args.lo, args.hi, args.margin)
    staked = [p for p in picks if p["staked"]]
    print(f"{len(df)} matches scanned -> {len(staked)} STAKED MS picks\n")
    print(f" {'match_id':<28}{'pick':>5}{'odds':>7}{'model%':>8}{'edge%':>7}" + (" result" if demo else ""))
    print(" "+"-"*60)
    log_rows = []
    for p in staked:
        r = df.iloc[p["idx"]]
        mid = str(r["match_id"])
        res = ""
        if demo:
            sh, sa = r.get("score_home"), r.get("score_away")
            if pd.notna(sh):
                out = "1" if sh > sa else ("X" if sh == sa else "2")
                res = " WON" if p["pick"] == out else " lost"
        print(f" {mid:<28}{p['pick']:>5}{p['odds']:>7.2f}{100*p['model_prob']:>8.1f}{100*p['edge']:>+7.1f}{res}")
        log_rows.append({"logged_at": datetime.datetime.now().isoformat(timespec="seconds"),
                         "match_id": mid, "market": "MS", "pick": p["pick"], "odds": p["odds"],
                         "model_prob": p["model_prob"], "edge": p["edge"], "stake": 1.0,
                         "result": np.nan, "pnl": np.nan})
    if args.log and log_rows and not demo:
        new = pd.DataFrame(log_rows)
        n_before = 0
        if os.path.exists(PAPER_LOG):
            old = pd.read_csv(PAPER_LOG)
            n_before = len(old)
            new = pd.concat([old, new], ignore_index=True)
            # Re-running on the same day must not double-count a bet: keep the
            # first logged row per (match_id, market).
            key = new["match_id"].astype(str) + "|" + new["market"].astype(str)
            new = new[~key.duplicated(keep="first")]
        new.to_csv(PAPER_LOG, index=False)
        print(f"\n logged {len(new) - n_before} picks -> {PAPER_LOG}")
    elif args.log and demo:
        print("\n (--log ignored in --demo; only real upcoming picks are logged)")
    print("\nReminder: paper-trade only. Stake real money after weeks of forward")
    print("CLV>0 + ROI>0 (settle with --settle, check scoreboard/clv_report).")


if __name__ == "__main__":
    main()
# Half-open [lo, hi) odds intervals and the label each one is reported under.
ODDS_BANDS = [(0, 1.5, "<1.5"), (1.5, 2.0, "1.5-2"), (2.0, 3.0, "2-3"),
              (3.0, 5.0, "3-5"), (5.0, 6.0, "5-6"), (6.0, 7.5, "6-7.5"),
              (7.5, 999, "7.5+")]


def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
    """Return ``float(x)``, or *d* when *x* is None or not convertible."""
    if x is None:
        return d
    try:
        return float(x)
    except (TypeError, ValueError):
        return d


def _parse(j: Any) -> Dict[str, Any]:
    """Return *j* as a dict, decoding it from JSON text when necessary.

    Anything that is not (or does not decode to) a JSON object yields ``{}``,
    so callers can always use ``.get`` on the result.
    """
    if isinstance(j, (str, bytes, bytearray)):
        try:
            j = json.loads(j)
        except ValueError:  # JSONDecodeError / UnicodeDecodeError
            return {}
    # A JSON list/number/null (or any other non-dict value) is not a payload.
    return j if isinstance(j, dict) else {}


def _band(odds: Optional[float]) -> str:
    """Return the ``ODDS_BANDS`` label containing *odds*, or ``"?"`` if none."""
    if odds is None:
        return "?"
    for lo, hi, name in ODDS_BANDS:
        if lo <= odds < hi:
            return name
    return "?"
def fetch_rows(args) -> List[Dict[str, Any]]:
    """Read settled rows from ``prediction_runs`` (SELECT only).

    Args:
        args: parsed CLI namespace; ``args.version`` (engine_version filter)
            and ``args.days`` (only rows generated in the last N days) are
            applied when truthy.

    Returns:
        Rows with a non-NULL ``eventual_outcome``, oldest first, as fetched
        through a ``RealDictCursor``.
    """
    dsn = get_clean_dsn()
    where = ["eventual_outcome IS NOT NULL"]
    params: List[Any] = []
    if args.version:
        where.append("engine_version = %s")
        params.append(args.version)
    if args.days:
        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
        where.append("generated_at >= %s")
        params.append(cutoff)
    # Only fixed clause strings are interpolated; every value is a bound param.
    sql = f"""
        SELECT match_id, engine_version, generated_at, eventual_outcome,
               unit_profit, payload_summary
        FROM prediction_runs
        WHERE {' AND '.join(where)}
        ORDER BY generated_at ASC
    """
    # psycopg2's `with connection:` only commits/rolls back the transaction;
    # it does NOT close the connection, so close it explicitly.
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        conn.close()


def distill(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """One analytic record per run with the staked pick + realized P/L."""
    out = []
    for r in rows:
        ps = _parse(r["payload_summary"])
        mp = ps.get("main_pick") or {}
        if not isinstance(mp, dict):  # malformed payload -> treat as no pick
            mp = {}
        playable = bool(mp.get("playable"))
        stake = _f(mp.get("stake_units"), 0.0) or 0.0
        profit = _f(r["unit_profit"], 0.0) or 0.0
        outcome = str(r["eventual_outcome"] or "")
        staked = playable and stake > 0
        # settled stake = a real bet with a win/loss (exclude NO_BET / push)
        settled_stake = staked and not outcome.startswith(("NO_BET", "PUSH", "VOID", "CANCEL"))
        out.append({
            "match_id": r["match_id"],
            "version": r["engine_version"],
            "ts": r["generated_at"],
            "market": mp.get("market") or "?",
            "pick": mp.get("pick"),
            "odds": _f(mp.get("odds")),
            "stake": stake,
            "profit": profit,
            "outcome": outcome,
            "staked": staked,
            "settled_stake": settled_stake,
            "win": settled_stake and profit > 0,
        })
    return out


def _agg(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate count, wins, hit %, suggested stake, profit and flat-1u ROI.

    Only records with ``settled_stake`` set are counted.
    """
    # NOTE: recorded unit_profit is on a FLAT 1u basis (win=odds-1, loss=-1),
    # independent of the brain's suggested stake_units. So ROI is profit per
    # bet at 1u flat = profit / n. (Using stake_units as denominator is wrong:
    # it double-counts and produces impossible >100% losses.)
    s = [r for r in recs if r["settled_stake"]]
    n = len(s)
    wins = sum(1 for r in s if r["win"])
    sug_stake = sum(r["stake"] for r in s)
    profit = sum(r["profit"] for r in s)
    return {
        "n": n,
        "wins": wins,
        "hit_pct": round(100.0 * wins / n, 1) if n else None,
        "sug_stake": round(sug_stake, 2),
        "profit": round(profit, 2),
        "roi_pct": round(100.0 * profit / n, 1) if n else None,  # flat 1u
    }


def _line(label: str, a: Dict[str, Any]) -> str:
    """Format one report row from an ``_agg`` result ('-' for missing rates)."""
    hit = a["hit_pct"] if a["hit_pct"] is not None else "-"
    roi = a["roi_pct"] if a["roi_pct"] is not None else "-"
    return (f" {label:<14} n={a['n']:>4} hit={str(hit):>5}% "
            f"profit={a['profit']:>8.2f}u ROI(flat1u)={str(roi):>7}%")


def risk_metrics(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Max drawdown, longest losing streak and final cumulative profit.

    Computed over settled staked records in ``ts`` order; a record with
    ``profit <= 0`` extends the losing streak.
    """
    s = [r for r in sorted(recs, key=lambda x: x["ts"]) if r["settled_stake"]]
    cum = 0.0
    peak = 0.0
    max_dd = 0.0
    streak = 0
    worst_streak = 0
    for r in s:
        cum += r["profit"]
        peak = max(peak, cum)
        max_dd = min(max_dd, cum - peak)  # drawdown is <= 0 by construction
        if r["profit"] <= 0:
            streak += 1
            worst_streak = max(worst_streak, streak)
        else:
            streak = 0
    return {"max_drawdown_u": round(max_dd, 2),
            "longest_losing_streak": worst_streak,
            "final_cum_u": round(cum, 2)}


def main():
    """CLI entry point: fetch settled runs and print the scoreboard sections."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--days", type=int, default=None, help="Only last N days")
    ap.add_argument("--version", help="Filter by engine_version")
    args = ap.parse_args()

    rows = fetch_rows(args)
    recs = distill(rows)

    print("=" * 74)
    print("LIVE SCOREBOARD — realized results from prediction_runs (NOT backtest)")
    print("=" * 74)
    if recs:
        lo = min(r["ts"] for r in recs).date()
        hi = max(r["ts"] for r in recs).date()
        print(f"window: {lo} .. {hi} settled runs: {len(recs)}"
              + (f" filter: {args.version}" if args.version else ""))
    print()

    overall = _agg(recs)
    print("OVERALL (staked = playable bets only)")
    print(_line("ALL", overall))
    no_bet = sum(1 for r in recs if not r["staked"])
    print(f" (analyzed {len(recs)} matches; {overall['n']} actually staked, "
          f"{no_bet} NO_BET)")
    if overall["n"]:
        rm = risk_metrics(recs)
        print(f" max drawdown: {rm['max_drawdown_u']}u "
              f"longest losing streak: {rm['longest_losing_streak']} "
              f"net: {rm['final_cum_u']}u")
    print()

    print("BY ENGINE VERSION")
    by_v = defaultdict(list)
    for r in recs:
        by_v[r["version"]].append(r)
    for v, rs in sorted(by_v.items(), key=lambda kv: -len(kv[1])):
        print(_line(v, _agg(rs)))
    print()

    settled = [r for r in recs if r["settled_stake"]]

    print("BY MARKET (staked)")
    by_m = defaultdict(list)
    for r in settled:
        by_m[r["market"]].append(r)
    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
        print(_line(m, _agg(rs)))
    if not by_m:
        print(" (no staked settled bets in window)")
    print()

    print("BY ODDS BAND (staked)")
    by_b = defaultdict(list)
    for r in settled:
        by_b[_band(r["odds"])].append(r)
    for _, _, name in ODDS_BANDS:
        if name in by_b:
            print(_line(name, _agg(by_b[name])))
    print()

    print("WEEKLY TREND (staked)")
    by_w = defaultdict(list)
    for r in settled:
        iso = r["ts"].isocalendar()
        by_w[f"{iso[0]}-W{iso[1]:02d}"].append(r)
    for w in sorted(by_w):
        print(_line(w, _agg(by_w[w])))
    print()
    print("=" * 74)
    print("READ: ROI < 0 over a meaningful sample = the staked signals are not")
    print("profitable. 'NO_BET' rows are free (no stake). CLV is unmeasurable")
    print("until odds movement is captured (see scripts + odds_history fix).")
    print("=" * 74)


if __name__ == "__main__":
    main()
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
        "score_home","score_away","ht_score_home","ht_score_away"}
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
         "squad_diff","home_squad_quality","away_squad_quality",
         "referee_home_bias","referee_avg_goals"}


# market -> (kind, [odds_cols aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
def _ht_missing(hh, ha):
    """True when either half-time score is NaN (no HT truth can be derived)."""
    return bool(np.isnan(hh) or np.isnan(ha))


def ou(line):
    """Return a full-time over/under truth fn for *line*: 0=Over, 1=Under."""
    return lambda sh, sa, hh, ha: (0 if (sh + sa) > line else 1)


def htou(line):
    """Return a half-time over/under truth fn for *line*: 0=Over, 1=Under.

    The returned function yields None when either HT score is missing.
    """
    return lambda sh, sa, hh, ha: (
        None if _ht_missing(hh, ha) else (0 if (hh + ha) > line else 1))


def ms_truth(sh, sa, hh, ha):
    """Full-time 1X2 class: 0=home win, 1=draw, 2=away win."""
    return 0 if sh > sa else (1 if sh == sa else 2)


def ht_truth(sh, sa, hh, ha):
    """Half-time 1X2 class (0/1/2), or None when either HT score is missing."""
    if _ht_missing(hh, ha):
        return None
    return 0 if hh > ha else (1 if hh == ha else 2)


def btts_truth(sh, sa, hh, ha):
    """Both-teams-to-score class: 0=Yes, 1=No."""
    return 0 if (sh > 0 and sa > 0) else 1


MARKETS = {
    "MS":      ("multi",  ["odds_ms_h","odds_ms_d","odds_ms_a"],          ["1","X","2"], ms_truth),
    "HT":      ("multi",  ["odds_ht_ms_h","odds_ht_ms_d","odds_ht_ms_a"], ["1","X","2"], ht_truth),
    "OU05":    ("binary", ["odds_ou05_o","odds_ou05_u"],       ["Üst","Alt"], ou(0.5)),
    "OU15":    ("binary", ["odds_ou15_o","odds_ou15_u"],       ["Üst","Alt"], ou(1.5)),
    "OU25":    ("binary", ["odds_ou25_o","odds_ou25_u"],       ["Üst","Alt"], ou(2.5)),
    "OU35":    ("binary", ["odds_ou35_o","odds_ou35_u"],       ["Üst","Alt"], ou(3.5)),
    "HT_OU05": ("binary", ["odds_ht_ou05_o","odds_ht_ou05_u"], ["Üst","Alt"], htou(0.5)),
    "HT_OU15": ("binary", ["odds_ht_ou15_o","odds_ht_ou15_u"], ["Üst","Alt"], htou(1.5)),
    "BTTS":    ("binary", ["odds_btts_y","odds_btts_n"],       ["Var","Yok"], btts_truth),
}
PARAMS_M = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
            "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
PARAMS_B = {"objective":"binary:logistic","max_depth":5,"eta":0.05,
            "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}


def _best_value_pick(probs, odds, truth, lo, hi, margin):
    """Best in-band value pick of ONE market for ONE match.

    Returns ``(edge, class_idx, odds, won)`` for the class with the largest
    ``prob - 1/odds`` among classes with ``lo <= odds < hi`` and
    ``edge > margin`` (first one wins ties), or None when no class qualifies.
    """
    best = None
    for k, (p, o) in enumerate(zip(probs, odds)):
        if o <= 1.0 or not (lo <= o < hi):
            continue
        e = p - 1.0 / o
        if e > margin and (best is None or e > best[0]):
            best = (e, k, o, int(truth == k))
    return best


def main():
    """Walk-forward per-market value ROI (A) and cross-market selector (B).

    Trains one model per market on everything before each test fold, then on
    the fold (A) books each market's best in-band value pick per match and
    (B) books the single best such pick across all markets per match.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=150)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.6)
    ap.add_argument("--margin", type=float, default=0.03)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")

    # precompute truth + odds per market
    MK = {}
    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            print(f" skip {mname}: missing odds cols")
            continue
        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(N)], dtype=object)
        MK[mname] = (kind, O, picks, truth)

    start = int(N*0.5)
    bounds = np.linspace(start, N, args.folds+1, dtype=int)

    # accumulators
    per_market = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}  # (A) best value pick within market
    sel = {"n":0,"pnl":0.0,"win":0,"fold":{}}                # (B) cross-market selector
    sel_by_mkt = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}

    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi+1]
        if te1-te0 < 50:
            continue
        # train each market model on [:te0], predict test
        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
        for m, (kind, O, picks, truth) in MK.items():
            ytr_full = truth[:te0]
            # mask invalid truth (e.g., HT markets with missing HT score)
            valid_tr = np.array([v is not None for v in ytr_full])
            ytr = ytr_full[valid_tr].astype(int)
            dte = xgb.DMatrix(X[te0:te1])
            if kind == "multi":
                bst = xgb.train(PARAMS_M, xgb.DMatrix(X[:te0][valid_tr], label=ytr),
                                num_boost_round=args.estimators)
                P = bst.predict(dte)  # [n,3]
            else:
                # class 0 is the "positive" side (Over / Yes) -> binary label 1
                pos = (ytr == 0).astype(int)
                bst = xgb.train(PARAMS_B, xgb.DMatrix(X[:te0][valid_tr], label=pos),
                                num_boost_round=args.estimators)
                ppos = bst.predict(dte)
                P = np.column_stack([ppos, 1.0-ppos])  # [n,2] -> [pos,neg]
            cand[m] = (P, O[te0:te1], truth[te0:te1])

        # iterate test matches
        for j in range(te1-te0):
            best = None  # (edge, market, pickidx, odds, won)
            for m, (P, Ot, Tt) in cand.items():
                t = Tt[j]
                if t is None:
                    continue
                mb = _best_value_pick(P[j], Ot[j], t, args.lo, args.hi, args.margin)
                if mb is None:
                    continue
                e, k, o, won = mb
                # (A) the market's single best in-band pick for this match
                d = per_market[m]
                d["n"] += 1
                d["pnl"] += (o-1.0) if won else -1.0
                d["win"] += won
                # (B) candidate for the cross-market selector
                if best is None or e > best[0]:
                    best = (e, m, k, o, won)
            # selector: single best value bet across all markets for this match
            if best is not None:
                edge, m, k, o, won = best
                pnl = (o-1.0) if won else -1.0
                sel["n"] += 1
                sel["pnl"] += pnl
                sel["win"] += won
                sel["fold"][fi] = sel["fold"].get(fi, 0.0) + pnl
                d = sel_by_mkt[m]
                d["n"] += 1
                d["pnl"] += pnl
                d["win"] += won
        print(f" fold {fi}: tested {te1-te0:,}")

    def line(name, d):
        n = d["n"]
        roi = 100*d["pnl"]/n if n else float('nan')
        hit = 100*d["win"]/n if n else float('nan')
        return f" {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"

    print("\n"+"="*70)
    print(f"(A) PER-MARKET value ROI (best value pick in band [{args.lo},{args.hi}], margin {args.margin})")
    print("="*70)
    for m in sorted(per_market, key=lambda x: -(100*per_market[x]['pnl']/per_market[x]['n'] if per_market[x]['n'] else -99)):
        print(line(m, per_market[m]))

    print("\n"+"="*70)
    print("(B) CROSS-MARKET SELECTOR (best value bet per match, all markets)")
    print("="*70)
    print(line("SELECTOR", sel))
    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
    print(f" folds positive: {folds_pos}/{len(sel['fold'])}")
    print(" selector picks distributed across markets:")
    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]['n']):
        if sel_by_mkt[m]["n"] > 0:
            print(" "+line(m, sel_by_mkt[m]).strip())
    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")


if __name__ == "__main__":
    main()
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
         "squad_diff","home_squad_quality","away_squad_quality",
         "referee_home_bias","referee_avg_goals"}

PARAMS = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
          "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}


def policy_eval(P, y, O, lo, hi, margin):
    """Evaluate the flat-1u value policy on a set of matches.

    For each row the class with the largest ``P - 1/O`` edge is picked; it is
    bet only if that edge exceeds *margin* and its odds lie in ``[lo, hi)``.

    Args:
        P: [n, 3] model probabilities.
        y: [n] true class indices.
        O: [n, 3] decimal odds; values <= 1.0 mean "no price".
        lo, hi, margin: policy thresholds.

    Returns:
        Dict of plain Python numbers: ``bets``, ``hit_pct``, ``roi_pct``,
        ``net_u`` (rates are 0.0 when nothing was bet).
    """
    O = np.asarray(O, dtype=float)
    # 1/odds only where a real price exists; avoids divide-by-zero warnings
    # for the 0.0 placeholders used for missing odds.
    implied = np.divide(1.0, O, out=np.full(O.shape, np.nan), where=O > 1.0)
    edge = np.where(np.isnan(implied), -9.0, P - implied)
    rows = np.arange(len(y))
    pick = edge.argmax(1)
    pe = edge[rows, pick]
    po = O[rows, pick]
    bet = (pe > margin) & (po >= lo) & (po < hi)
    win = (pick == y) & bet
    pnl = np.where(win, po-1.0, -1.0)[bet]
    n = int(bet.sum())
    denom = max(n, 1)
    # float(...) so the dict prints and serialises as builtin numbers
    return {"bets": n,
            "hit_pct": round(float(100*win.sum()/denom), 1),
            "roi_pct": round(float(100*pnl.sum()/denom), 2),
            "net_u": round(float(pnl.sum()), 1)}


def main():
    """Holdout-evaluate the policy, then train and save the production model."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--holdout-frac", type=float, default=0.15)
    ap.add_argument("--estimators", type=int, default=300)
    args = ap.parse_args()
    if not 0.0 < args.holdout_frac < 1.0:
        # 0 leaves nothing to evaluate on, 1 leaves nothing to train on
        ap.error("--holdout-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    print(f" {len(df):,} rows, {len(feats)} leak-free features")

    # ── Honest holdout (last slice, never trained on) ──
    cut = int(len(df) * (1 - args.holdout_frac))
    bst = xgb.train(PARAMS, xgb.DMatrix(X[:cut], label=y[:cut]), num_boost_round=args.estimators)
    Ph = bst.predict(xgb.DMatrix(X[cut:]))
    acc = float((Ph.argmax(1) == y[cut:]).mean())
    hold = policy_eval(Ph, y[cut:], O[cut:], args.lo, args.hi, args.margin)
    print(f"\nHOLDOUT (last {args.holdout_frac:.0%}, {len(df)-cut:,} matches, never seen):")
    print(f" MS accuracy: {acc*100:.1f}%")
    print(f" POLICY band[{args.lo},{args.hi}] margin {args.margin}: {hold}")

    # ── Production model: retrain on ALL history ──
    print("\nTraining production model on ALL history ...")
    final = xgb.train(PARAMS, xgb.DMatrix(X, label=y), num_boost_round=args.estimators)
    os.makedirs(OUT, exist_ok=True)
    final.save_model(os.path.join(OUT, "model.json"))
    with open(os.path.join(OUT, "feature_cols.json"), "w", encoding="utf-8") as f:
        json.dump(feats, f, ensure_ascii=False, indent=2)
    meta = {
        "version": "favorite_v1",
        "trained_at": datetime.datetime.now().isoformat(timespec="seconds"),
        "market": "MS",
        "classes": {"0": "home(1)", "1": "draw(X)", "2": "away(2)"},
        "policy": {"odds_lo": args.lo, "odds_hi": args.hi, "margin": args.margin,
                   "stake": "flat 1u", "rule": "bet model's max value edge if picked odds in band",
                   "never": ["longshots odds>=hi", "parlays/combos"]},
        "n_train": len(df), "n_features": len(feats),
        "leaky_excluded": sorted(LEAKY),
        "holdout_eval": {"accuracy_pct": round(acc*100,1), **hold},
        "caveat": "CSV odds are a static capture, not verified closing. Forward paper-trade with real CLV before staking.",
    }
    with open(os.path.join(OUT, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Saved production model to {OUT}/")
    print(f" model.json, feature_cols.json ({len(feats)} feats), metadata.json")
    print("\nNEXT: serving wrapper that loads this + applies the policy to upcoming")
    print("matches, logs paper-trade picks, and we measure real forward CLV/ROI.")


if __name__ == "__main__":
    main()
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

META = {"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
        "score_home", "score_away", "ht_score_home", "ht_score_away"}

# Confirmed target leakage: *_goals_form integer-valued and ~0.63 correlated
# with THIS match's goals; their diff equals the actual goal diff 73% of the
# time. Excluded so the experiment measures genuine pre-match predictive power.
LEAKY = {
    # CONFIRMED (encode the actual match result):
    "home_goals_form", "away_goals_form",  # ~0.63 corr w/ this match's goals
    "total_goals",                         # this match's full-time total
    "ht_total_goals",                      # this match's half-time total
    # STRONG SUSPECTS (dominate importance + high outcome corr; audit extractor):
    "squad_diff", "home_squad_quality", "away_squad_quality",
    "referee_home_bias", "referee_avg_goals",
}


def is_odds_col(c: str) -> bool:
    """True when the column name contains "odds" or "implied" (any case)."""
    cl = c.lower()
    return ("odds" in cl) or ("implied" in cl)


def logloss(y: np.ndarray, p: np.ndarray) -> float:
    """Mean negative log-probability assigned to the true class.

    *y* holds integer class indices, *p* is [n, n_classes]; probabilities are
    clipped to [1e-9, 1 - 1e-9] before taking the log.
    """
    p = np.clip(p, 1e-9, 1 - 1e-9)
    return float(-np.mean(np.log(p[np.arange(len(y)), y])))


def value_sim(proba: np.ndarray, y: np.ndarray, odds: np.ndarray,
              margin: float) -> dict:
    """Bet the class with the biggest (model_prob - 1/odds) edge above margin.

    Odds <= 1.0 are treated as "no price" and can never be picked over a
    priced class. Stakes are flat 1u: a win pays ``odds - 1``, a loss ``-1``.

    Returns:
        ``{"n", "roi", "hit"}`` with ROI and hit rate in percent as plain
        floats, or ``roi``/``hit`` of None when no bet qualified.
    """
    odds = np.asarray(odds, dtype=float)
    # 1/odds only where a real price exists; avoids divide-by-zero warnings
    # for the 0.0 placeholders used for missing odds.
    implied = np.divide(1.0, odds, out=np.full(odds.shape, np.nan), where=odds > 1.0)
    # ignore classes without valid odds
    edge = np.where(np.isnan(implied), -9.0, proba - implied)
    rows = np.arange(len(y))
    pick = np.argmax(edge, axis=1)
    bet = edge[rows, pick] > margin
    n = int(bet.sum())
    if n == 0:
        return {"n": 0, "roi": None, "hit": None}
    win = (pick == y) & bet
    pnl = np.where(win, odds[rows, pick] - 1.0, -1.0)[bet]
    return {"n": n,
            "roi": round(float(100.0 * pnl.sum() / n), 2),
            "hit": round(float(100.0 * win.sum() / n), 1)}


def train_eval(Xtr, ytr, Xte, yte, odds_te, est, margins):
    """Train a 3-class booster on the train split and score the test split.

    Returns a dict with ``logloss``, ``acc`` (percent) and one
    ``val@<margin>`` entry per margin holding the ``value_sim`` result.
    """
    dtr = xgb.DMatrix(Xtr, label=ytr)
    dte = xgb.DMatrix(Xte)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5,
              "eta": 0.05, "subsample": 0.8, "colsample_bytree": 0.8,
              "tree_method": "hist", "verbosity": 0}
    booster = xgb.train(params, dtr, num_boost_round=est)
    proba = booster.predict(dte)
    out = {"logloss": round(logloss(yte, proba), 4),
           "acc": round(100.0 * (proba.argmax(1) == yte).mean(), 1)}
    for mg in margins:
        out[f"val@{mg}"] = value_sim(proba, yte, odds_te, mg)
    return out
c.startswith("label_") + and c not in LEAKY] + feat_blind = [c for c in feat_all if not is_odds_col(c)] + print(f" excluded leaky cols: {sorted(LEAKY)}") + Xall = df[feat_all].apply(pd.to_numeric, errors="coerce").fillna(0.0) + Xblind = df[feat_blind].apply(pd.to_numeric, errors="coerce").fillna(0.0) + print(f" features: ALL={len(feat_all)} BLIND={len(feat_blind)} " + f"(dropped {len(feat_all)-len(feat_blind)} odds cols)") + print(f" base rates: home={100*(y==0).mean():.1f}% draw={100*(y==1).mean():.1f}% " + f"away={100*(y==2).mean():.1f}%") + + n = len(df) + start = int(n * (1 - args.test_frac)) + bounds = np.linspace(start, n, args.folds + 1, dtype=int) + margins = [0.0, 0.05, 0.10] + + agg = {"ALL": {f"val@{m}": [] for m in margins}, "BLIND": {f"val@{m}": [] for m in margins}} + agg["ALL"]["logloss"] = []; agg["BLIND"]["logloss"] = [] + + print(f"\nWalk-forward: {args.folds} folds, train=expanding, est={args.estimators}\n") + hdr = f"{'fold':<5}{'model':<7}{'logloss':>9}{'acc%':>7}" + "".join( + f"{('val@'+str(m)):>22}" for m in margins) + print(hdr); print("-" * len(hdr)) + for i in range(args.folds): + te0, te1 = bounds[i], bounds[i + 1] + if te1 - te0 < 50: + continue + tr = slice(0, te0) + te = slice(te0, te1) + for name, X in (("ALL", Xall), ("BLIND", Xblind)): + r = train_eval(X.iloc[tr].values, y[tr], X.iloc[te].values, y[te], + odds[te], args.estimators, margins) + agg[name]["logloss"].append(r["logloss"]) + cells = "" + for m in margins: + v = r[f"val@{m}"] + agg[name][f"val@{m}"].append(v) + cells += f"{('n=' + str(v['n']) + ' roi=' + str(v['roi'])):>22}" + print(f"{i:<5}{name:<7}{r['logloss']:>9}{r['acc']:>7}{cells}") + print() + + print("=" * 70) + print("AGGREGATE (sum bets, weighted ROI across folds)") + print("=" * 70) + for name in ("ALL", "BLIND"): + ll = np.mean(agg[name]["logloss"]) if agg[name]["logloss"] else float("nan") + print(f"\n{name} mean logloss={ll:.4f}") + for m in margins: + vs = agg[name][f"val@{m}"] + tot_n = sum(v["n"] for v in 
vs) + tot_pnl = sum((v["roi"] / 100.0 * v["n"]) for v in vs if v["roi"] is not None) + roi = round(100.0 * tot_pnl / tot_n, 2) if tot_n else None + print(f" margin {m}: total_bets={tot_n:>6} ROI(flat1u)={roi}%") + print("\nREAD: BLIND ROI>0 across margins/folds = real edge. Both <=0 = no") + print("exploitable edge in MS with this data (stop staking; the -EV is the vig).") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())