gg3
Deploy Iddaai Backend / build-and-deploy (push) Successful in 35s

This commit is contained in:
2026-06-05 00:36:24 +03:00
parent b9700f9fda
commit 9e41407cb5
10 changed files with 1683 additions and 0 deletions
+137
View File
@@ -0,0 +1,137 @@
"""
Analyze Match v2 — the per-match multi-market value board + disciplined pick.
===========================================================================
Answers "for ONE match, show every bet type's probability + model signal +
market-vs-model value, and pick the right bet." Leak-free models.
KEY HONEST RULE (proven by multi_market_edge.py): compute & SHOW value for all
markets, but only MS (1X2) carries real, fold-consistent model edge. In OU/HT/
BTTS the market is efficient — a big model-vs-market gap there is the MODEL'S
ERROR, not value. So non-MS rows are INFO-ONLY; only an MS value bet in the
favourite band is STAKED.
Demo: trains all market models on the first 85% of history, then prints the full
board for sample matches in the unseen last 15% (with what actually happened).
Usage:
python scripts/analyze_match_v2.py --n 6
python scripts/analyze_match_v2.py --match <match_id>
"""
from __future__ import annotations
import argparse, os, sys
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# Engine root: the parent of the directory that holds this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Training table that every market model in this script is fitted on.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier, kickoff-time and score columns: excluded from the feature matrix.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns treated as leakage and dropped from the feature matrix.
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}

# MS favourite band that staking is allowed in: STAKE_LO <= odds < STAKE_HI.
STAKE_LO, STAKE_HI = 1.5, 2.4
# Minimum model-minus-implied probability edge for a row to be flagged.
STAKE_MARGIN = 0.03
def ou(line):
    """Build the full-time Over/Under labeller for goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` (full-time home/away and
    half-time home/away goals) and returns pick index 0 (over) when
    ``sh + sa`` exceeds ``line``, otherwise 1 (under). The half-time
    arguments are accepted for signature parity and ignored.
    """
    def label(sh, sa, hh, ha):
        total_goals = sh + sa
        if total_goals > line:
            return 0
        return 1

    return label
def htou(line):
    """Build the half-time Over/Under labeller for goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` and returns:

    * ``None`` when EITHER half-time score is NaN (no label available),
    * 0 (over) when ``hh + ha`` exceeds ``line``,
    * 1 (under) otherwise.

    Both half-time scores are checked: with only ``hh`` checked, a missing
    ``ha`` makes ``hh + ha`` NaN, the ``>`` comparison False, and the row is
    silently labelled "under".
    """
    def label(sh, sa, hh, ha):
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1

    return label
# Market registry. Each value is
#   (model kind, odds columns, pick labels, truth function)
# where the truth function maps (sh, sa, hh, ha) -> index into the pick labels,
# or None when the outcome cannot be determined (missing half-time score).
MARKETS = {
    "MS": ("multi", ["odds_ms_h", "odds_ms_d", "odds_ms_a"], ["1", "X", "2"],
           lambda sh, sa, hh, ha: 0 if sh > sa else (1 if sh == sa else 2)),
    "OU25": ("binary", ["odds_ou25_o", "odds_ou25_u"], ["2.5Üst", "2.5Alt"], ou(2.5)),
    "OU15": ("binary", ["odds_ou15_o", "odds_ou15_u"], ["1.5Üst", "1.5Alt"], ou(1.5)),
    "OU35": ("binary", ["odds_ou35_o", "odds_ou35_u"], ["3.5Üst", "3.5Alt"], ou(3.5)),
    "BTTS": ("binary", ["odds_btts_y", "odds_btts_n"], ["KG Var", "KG Yok"],
             lambda sh, sa, hh, ha: 0 if (sh > 0 and sa > 0) else 1),
    # Both half-time scores must be present: with only `hh` checked, a NaN `ha`
    # fails both `hh > ha` and `hh == ha` and is mislabelled as an away win (2).
    "HT": ("multi", ["odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a"], ["İY1", "İYX", "İY2"],
           lambda sh, sa, hh, ha: None if (np.isnan(hh) or np.isnan(ha))
           else (0 if hh > ha else (1 if hh == ha else 2))),
    "HT_OU15": ("binary", ["odds_ht_ou15_o", "odds_ht_ou15_u"], ["İY1.5Üst", "İY1.5Alt"], htou(1.5)),
}

STAKED_MARKETS = {"MS"}  # only these are bet; rest are info-only

# XGBoost parameters for the 3-class markets (PM) and the two-way markets (PB).
PM = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
      "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}
PB = {"objective": "binary:logistic", "max_depth": 5, "eta": 0.05,
      "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}
def _fmt_elo(value):
    """Render an Elo rating for the match header; "?" when missing or not numeric."""
    try:
        rating = float(value)
    except (TypeError, ValueError):
        return "?"
    return "?" if np.isnan(rating) else f"{rating:.0f}"


def main():
    """Train one model per market on the first 85% of matches and print value boards.

    For each selected match, every market row shows the model probability, the
    implied probability (1/odds), their difference (edge) and the realised
    result. Only rows in STAKED_MARKETS whose edge exceeds STAKE_MARGIN and
    whose odds lie in [STAKE_LO, STAKE_HI) are flagged as a stake; the best
    such row per match is summarised.

    Matches come either from ``--match`` (looked up among all settled rows) or
    from ``--n`` evenly spaced rows of the 15% holdout.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--n", type=int, default=6, help="how many sample matches")
    ap.add_argument("--match", help="specific match_id")
    ap.add_argument("--estimators", type=int, default=250)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()  # keep only rows with a full-time score
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)

    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    cut = int(N * 0.85)  # rows [0, cut) train, rows [cut, N) are the holdout

    print(f"Training {len(MARKETS)} leak-free market models on {cut:,} matches ...")
    models = {}
    for m, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            continue  # market's odds are not in this CSV
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(cut)], dtype=object)
        valid = np.array([v is not None for v in truth], dtype=bool)
        labels = truth[valid].astype(int)
        if kind == "multi":
            dtrain = xgb.DMatrix(X[:cut][valid], label=labels)
            b = xgb.train(PM, dtrain, num_boost_round=args.estimators)
        else:
            # Binary target is "pick index 0 happened", so predict() is P(picks[0]).
            dtrain = xgb.DMatrix(X[:cut][valid], label=(labels == 0).astype(int))
            b = xgb.train(PB, dtrain, num_boost_round=args.estimators)
        models[m] = (kind, ocols, picks, tfn, b)

    # Choose which rows of df to print.
    if args.match:
        rows = df.index[df["match_id"].astype(str) == str(args.match)].tolist()
        if not rows:
            print(f"No settled match with match_id={args.match} found in {CSV}")
            return
        if any(gi < cut for gi in rows):
            # The models were fitted on this row, so its board is in-sample.
            print(f"WARNING: match {args.match} is inside the training window; "
                  f"its probabilities are in-sample and overstate accuracy.")
    else:
        n_hold = N - cut
        if n_hold <= 0 or args.n <= 0:
            print("No holdout matches to show.")
            return
        spaced = np.linspace(0, n_hold - 1, args.n, dtype=int)
        # De-duplicate (order-preserving) when --n exceeds the holdout size.
        rows = list(dict.fromkeys(cut + int(p) for p in spaced))

    for gi in rows:
        r = df.iloc[gi]
        drow = xgb.DMatrix(X[gi:gi + 1])  # built once, shared by every market model
        sh_, sa_, hh_, ha_ = SH[gi], SA[gi], HH[gi], HA[gi]
        ht = f"{int(hh_)}-{int(ha_)}" if not (np.isnan(hh_) or np.isnan(ha_)) else "?"
        print("\n" + "=" * 72)
        print(f"MATCH {r['match_id']} | elo H{_fmt_elo(r.get('home_overall_elo'))}"
              f" vs A{_fmt_elo(r.get('away_overall_elo'))}"
              f" | ACTUAL {int(sh_)}-{int(sa_)} (HT {ht})")
        print(f" {'market':<8}{'pick':<10}{'model%':>8}{'impl%':>7}{'edge':>7}{'odds':>7} flag result")
        print(" " + "-" * 64)
        best_ms = None
        for m, (kind, ocols, picks, tfn, b) in models.items():
            if kind == "multi":
                P = b.predict(drow)[0]
            else:
                p = float(b.predict(drow)[0])
                P = np.array([p, 1 - p])
            O = pd.to_numeric(r[ocols], errors="coerce").fillna(0.0).values
            truth = tfn(sh_, sa_, hh_, ha_)
            for k in range(len(picks)):
                o = O[k]
                if o <= 1.0:
                    continue  # missing / invalid price
                imp = 1.0 / o
                edge = P[k] - imp
                res = "" if truth is None else ("WON" if truth == k else "lost")
                staked = (m in STAKED_MARKETS) and edge > STAKE_MARGIN and STAKE_LO <= o < STAKE_HI
                flag = "★BET" if staked else ("val" if edge > STAKE_MARGIN else "")
                print(f" {m:<8}{picks[k]:<10}{100*P[k]:>7.1f}{100*imp:>7.1f}{100*edge:>+7.1f}{o:>7.2f} {flag:<5} {res}")
                if staked and (best_ms is None or edge > best_ms[0]):
                    best_ms = (edge, m, picks[k], o, res)
        print(" " + "-" * 64)
        if best_ms:
            e, m, p, o, res = best_ms
            print(f" >>> STAKE: {m} {p} @ {o:.2f} (edge +{100*e:.1f}%, favourite band) -> {res}")
        else:
            print(f" >>> NO STAKE: no MS value in favourite band. (Other markets info-only —")
            print(f" their 'value' is model error in efficient markets; do NOT chase it.)")

    print("\nNOTE: only MS staked (proven edge). All markets shown for transparency.")
    print("Forward-validate with CLV before real money. Static CSV odds may overstate edge.")


if __name__ == "__main__":
    main()
+113
View File
@@ -0,0 +1,113 @@
"""
Betting Policy — the honest, leak-free strategy the data actually supports.
==========================================================================
Everything else in this repo bet UNDERDOGS (odds 6-7.5) and lost (-43.7% live).
The data says the opposite: the only positive, fold-consistent, model-driven
signal is MILD FAVOURITES the model rates above the market price.
POLICY (MS / 1X2 only):
* leak-free model (drops the result-encoding features, see LEAKY)
* bet the model's single biggest value edge (model_prob - implied) ...
* ONLY if the picked side's odds are in [--lo, --hi] (favourite band)
* ONLY if that edge > --margin
* flat 1u stake, one bet per match, never a longshot, never a parlay.
Walk-forward, no leakage. Reports the policy ROI, fold consistency, drawdown,
and the model-free baseline (blind favourite) so you can see the model's lift.
⚠️ HONEST CAVEAT: CSV odds are a static capture, not the verified obtainable
closing line. A small backtest edge here is a LEAD, not a guarantee. Forward
paper-trade with real CLV (capture_closing_odds.py) before risking money.
Usage: python scripts/betting_policy.py --lo 1.5 --hi 2.2 --margin 0.0 --folds 8
"""
from __future__ import annotations
import argparse, os, sys
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# Engine root: the parent of the directory that holds this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Historical match table the walk-forward backtest runs on.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier, kickoff-time and score columns: excluded from the feature matrix.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Result-encoding columns dropped so the model stays leak-free.
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}
def _max_drawdown(pnl):
    """Return the worst peak-to-trough drop (<= 0) of cumulative P/L.

    The running peak is floored at 0, the bankroll before the first bet, so a
    losing streak at the very start counts in full. Without the floor the
    first cumulative value becomes the "peak" and the initial loss is hidden.
    """
    cum = np.cumsum(pnl)
    peak = np.maximum(np.maximum.accumulate(cum), 0.0)
    return float((cum - peak).min())


def main():
    """Walk-forward backtest of the favourite-band value policy on MS (1X2).

    The second half of the time-sorted data is split into ``--folds`` test
    windows; for each window a model is trained on everything before it. One
    flat 1u bet per match is placed on the outcome with the largest
    model-minus-implied edge, only when that edge exceeds ``--margin`` and the
    odds lie in [--lo, --hi). A model-free baseline (back the lowest-priced
    outcome when it falls in the same band) is reported alongside.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--folds", type=int, default=8)
    ap.add_argument("--estimators", type=int, default=250)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()  # settled matches only
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))  # 0=home, 1=draw, 2=away
    O = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values

    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
              "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}
    print(f"POLICY: favourite band [{args.lo},{args.hi}] margin {args.margin} "
          f"leak-free feats={len(feats)} folds={args.folds}\n")

    all_pnl = []
    fold_rows = []
    base_pnl = []
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < 50:
            continue  # window too small to say anything
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
        P = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, Ote = y[te0:te1], O[te0:te1]
        ix = np.arange(len(yte))

        # Missing prices (<= 1.0) get an edge of -9 so they can never be picked.
        implied = np.where(Ote > 1.0, 1.0 / Ote, np.nan)
        edge = np.where(np.isnan(implied), -9.0, P - implied)
        pick = edge.argmax(1)
        pe = edge[ix, pick]
        po = Ote[ix, pick]
        bet = (pe > args.margin) & (po >= args.lo) & (po < args.hi)
        win = (pick == yte) & bet
        pnl = np.where(win, po - 1.0, -1.0)[bet]

        # model-free baseline: blind favourite in same band
        fav = Ote.argmin(1)
        fo = Ote[ix, fav]
        bmask = (fo >= args.lo) & (fo < args.hi) & (Ote > 1.0).all(1)
        bpnl = np.where(fav[bmask] == yte[bmask], fo[bmask] - 1.0, -1.0)

        roi = 100 * pnl.sum() / len(pnl) if len(pnl) else float('nan')
        broi = 100 * bpnl.sum() / len(bpnl) if len(bpnl) else float('nan')
        hit = 100 * win.sum() / max(bet.sum(), 1)
        fold_rows.append((fi, len(pnl), hit, roi, broi))
        all_pnl.extend(pnl.tolist())
        base_pnl.extend(bpnl.tolist())
        print(f" fold {fi}: policy_bets={len(pnl):>4} hit={hit:>5.1f}% "
              f"ROI={roi:>7.2f}% | baseline(blind fav) ROI={broi:>7.2f}%")

    a = np.array(all_pnl)
    b = np.array(base_pnl)
    print("\n" + "=" * 70)
    print("AGGREGATE")
    print("=" * 70)
    if len(a):
        dd = _max_drawdown(a)
        folds_pos = sum(1 for r in fold_rows if r[3] > 0)
        print(f" POLICY: bets={len(a):>5} hit={100*(a>0).mean():.1f}% "
              f"ROI={100*a.mean():+.2f}% net={a.sum():+.1f}u maxDD={dd:.1f}u "
              f"folds+={folds_pos}/{len(fold_rows)}")
    if len(b):
        print(f" BASELINE: bets={len(b):>5} hit={100*(b>0).mean():.1f}% "
              f"ROI={100*b.mean():+.2f}% (blind favourite, same band)")
    if len(a) and len(b):
        # Lift is only defined when both samples are non-empty.
        print(f"\n MODEL LIFT over blind favourite: "
              f"{100*a.mean()-100*b.mean():+.1f} percentage points")
    print("\nREAD: a believable system has ROI>0, folds+ near full, tolerable maxDD,")
    print("and clearly beats the blind-favourite baseline. Even then it's a LEAD —")
    print("forward paper-trade with real CLV before staking real money.")


if __name__ == "__main__":
    main()
+136
View File
@@ -0,0 +1,136 @@
"""
Capture Closing Odds — snapshot #2 of the minimal 2-snapshot CLV system.
=======================================================================
WHY: CLV (closing line value) is the only reliable proof of betting edge.
This codebase never captured it: odds are stored as a single static snapshot
and `odds_history` is empty. But the live sync (DataFetcherTask CRON 1) DOES
refresh `live_matches.odds` every 15 min before kickoff, and prediction_runs
already store the bet-time odds blob (odds_snapshot.odds, source=live_match).
This script supplies the missing half: just before kickoff it copies the
*current* live odds blob onto the match's latest prediction_run as
`odds_snapshot.closing_odds`. Later, CLV per bet = bet-time pick odds vs
closing pick odds (computed in live_scoreboard.py once enough data exists).
Run it every ~15 min (e.g. alongside the existing sync, or its own cron):
python scripts/capture_closing_odds.py # default 25-min window
python scripts/capture_closing_odds.py --window-min 20 --dry-run
Structure-agnostic: stores the whole live odds blob; no pick parsing here.
Idempotent: skips runs that already have closing_odds. Only ADDS a JSON key,
never deletes. Safe to run repeatedly.
⚠️ Needs one supervised test run against a live DB with upcoming matches
before scheduling (DB was down at authoring time).
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, AI_ENGINE_DIR)
from data.db import get_clean_dsn # noqa: E402
import psycopg2 # noqa: E402
from psycopg2.extras import RealDictCursor # noqa: E402
def main() -> int:
    """Attach the current live odds blob to each imminent match's latest prediction run.

    Selects ``live_matches`` rows with odds whose ``mst_utc`` lies between
    ``now - grace`` and ``now + window``, finds the most recent
    ``prediction_runs`` row per match, and merges ``closing_odds`` plus capture
    metadata into its ``odds_snapshot`` JSON. Runs that already carry
    ``closing_odds`` are skipped. With ``--dry-run`` nothing is written.

    Returns:
        Process exit code (always 0 when no exception is raised).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--window-min", type=int, default=25,
                    help="Capture matches kicking off within the next N minutes (default 25)")
    ap.add_argument("--grace-min", type=int, default=10,
                    help="Also include matches that kicked off up to N min ago (default 10)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Report what would be captured without writing")
    args = ap.parse_args()

    # NOTE(review): assumes live_matches.mst_utc is epoch milliseconds — confirm.
    now_ms = int(time.time() * 1000)
    lo_ms = now_ms - args.grace_min * 60 * 1000
    hi_ms = now_ms + args.window_min * 60 * 1000
    captured = skipped = no_run = 0

    # psycopg2's `with connection:` only ends the transaction; it does not
    # close the connection, so close it explicitly.
    conn = psycopg2.connect(get_clean_dsn())
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Upcoming/just-started live matches that still hold pre-kickoff odds.
            cur.execute(
                """
                SELECT id, mst_utc, odds
                FROM live_matches
                WHERE odds IS NOT NULL
                AND mst_utc BETWEEN %s AND %s
                ORDER BY mst_utc ASC
                """,
                (lo_ms, hi_ms),
            )
            matches = cur.fetchall()
            print(f"[capture_closing_odds] window={args.window_min}m grace={args.grace_min}m "
                  f"upcoming_with_odds={len(matches)} dry_run={args.dry_run}")

            for m in matches:
                mid = m["id"]
                cur.execute(
                    """
                    SELECT id, odds_snapshot
                    FROM prediction_runs
                    WHERE match_id = %s
                    ORDER BY generated_at DESC
                    LIMIT 1
                    """,
                    (mid,),
                )
                run = cur.fetchone()
                if not run:
                    no_run += 1
                    continue

                snap = run["odds_snapshot"] or {}
                if isinstance(snap, str):
                    # Snapshot arrived as text: decode it, treating garbage as empty.
                    try:
                        snap = json.loads(snap)
                    except ValueError:
                        snap = {}
                if snap.get("closing_odds") is not None:
                    skipped += 1  # idempotent: never overwrite an earlier capture
                    continue

                patch = {
                    "closing_odds": m["odds"],
                    "closing_captured_at": datetime.now(timezone.utc).isoformat(),
                    "closing_mst_utc": m["mst_utc"],
                    "closing_source": "live_match",
                }
                if args.dry_run:
                    captured += 1
                    print(f" would capture match={mid} run_id={run['id']} mst_utc={m['mst_utc']}")
                    continue

                # jsonb `||` merges the patch keys into the existing object.
                cur.execute(
                    """
                    UPDATE prediction_runs
                    SET odds_snapshot = COALESCE(odds_snapshot, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                    """,
                    (json.dumps(patch, default=str), run["id"]),
                )
                captured += 1

        if not args.dry_run:
            conn.commit()
        print(f"[capture_closing_odds] captured={captured} already_had={skipped} "
              f"no_prediction_run={no_run}")
    finally:
        # Closing without a commit discards any uncommitted work (error path).
        conn.close()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+224
View File
@@ -0,0 +1,224 @@
"""
CLV Report — the single most important edge metric.
===================================================
Closing Line Value = did we bet at better odds than the market's closing line?
Consistently positive CLV is the only reliable proof of a real betting edge;
negative CLV means no edge, regardless of short-term wins/losses.
This codebase stores the BET-TIME odds for ~92% of runs (prediction_runs.
odds_snapshot.source = 'live_match' with the live odds blob, and the pick's
odds in payload main_pick.odds). For the closing line we use, in order:
1. odds_snapshot.closing_odds (captured by capture_closing_odds.py, forward)
2. odd_selections current value (the static near-final capture — a proxy)
CLV per bet = bet_odds / closing_odds - 1 (positive = beat the close = good).
Read-only. SELECT only.
Usage:
python scripts/clv_report.py
python scripts/clv_report.py --staked-only
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from collections import defaultdict
from typing import Any, Dict, Optional, Tuple
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, AI_ENGINE_DIR)
from data.db import get_clean_dsn # noqa: E402
import psycopg2 # noqa: E402
from psycopg2.extras import RealDictCursor # noqa: E402
# Over/Under market code -> Turkish odds-category name as it appears in the
# odds data (category names use a decimal comma).
OU_CATS = {
    "OU05": "0,5 Alt/Üst",
    "OU15": "1,5 Alt/Üst",
    "OU25": "2,5 Alt/Üst",
    "OU35": "3,5 Alt/Üst",
    "OU45": "4,5 Alt/Üst",
}
def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
try:
return float(x) if x is not None else d
except (TypeError, ValueError):
return d
def _parse(j: Any) -> Dict[str, Any]:
if isinstance(j, str):
try:
return json.loads(j)
except Exception:
return {}
return j or {}
def map_pick(market: str, pick: str) -> Optional[Tuple[str, str]]:
    """Return (category_name, selection_key) for the live-odds JSON / odd_selections.

    ``market`` is matched case-insensitively. Result picks (1/X/2) are also
    matched case-insensitively, so "x" maps like "X", in line with the
    Double Chance branch, which already upper-cases its input. Over/Under,
    BTTS and Odd/Even picks are recognised by Turkish or English keywords
    anywhere in the pick text.

    Returns ``None`` when the market is unsupported or the pick is not
    recognised for that market.
    """
    m = (market or "").upper()
    p = (pick or "").strip()
    pl = p.casefold()   # keyword matching
    side = p.upper()    # 1 / X / 2 matching

    if m in ("MS", "ML", "1X2"):
        return ("Maç Sonucu", side) if side in ("1", "X", "2") else None
    if m == "HT":
        return ("1. Yarı Sonucu", side) if side in ("1", "X", "2") else None
    if m in OU_CATS:
        if "üst" in pl or "ust" in pl or "over" in pl:
            return (OU_CATS[m], "Üst")
        if "alt" in pl or "under" in pl:
            return (OU_CATS[m], "Alt")
        return None
    if m == "DC":
        key = side.replace(" ", "").replace("/", "-")
        norm = {"1X": "1-X", "X1": "1-X", "X2": "X-2", "2X": "X-2",
                "12": "1-2", "21": "1-2", "1-X": "1-X", "X-2": "X-2", "1-2": "1-2"}.get(key)
        return ("Çifte Şans", norm) if norm else None
    if m == "BTTS":
        if "var" in pl or "yes" in pl:
            return ("Karşılıklı Gol", "Var")
        if "yok" in pl or "no" in pl:
            return ("Karşılıklı Gol", "Yok")
        return None
    if m == "OE":
        if "tek" in pl or "odd" in pl:
            return ("Tek/Çift", "Tek")
        if "çift" in pl or "cift" in pl or "even" in pl:
            return ("Tek/Çift", "Çift")
        return None
    return None
def closing_from_blob(blob: Any, cat: str, sel: str) -> Optional[float]:
    """Look up the odds for selection ``sel`` of category ``cat`` in an odds blob.

    ``blob`` may be a dict or JSON text. The value is read from
    ``blob[cat][sel]`` and returned as a float. Returns ``None`` when the
    blob or the category entry is not a mapping, or when the selection is
    absent or not numeric.
    """
    parsed = _parse(blob)
    if not isinstance(parsed, dict):
        return None
    selections = parsed.get(cat)
    if not isinstance(selections, dict):
        return None
    return _f(selections.get(sel))
def main() -> int:
    """Print the CLV report for prediction runs that stored bet-time odds.

    For every run whose ``odds_snapshot.source`` is ``live_match`` and whose
    main pick maps to a known category/selection, the closing price is taken
    from ``odds_snapshot.closing_odds`` when present ("captured"), otherwise
    from ``odd_selections`` ("static_proxy"). CLV = bet_odds / closing - 1.
    Aggregates are printed overall, for playable picks, per market, and
    against realised P/L. Issues SELECT statements only.

    Returns:
        Process exit code (always 0 when no exception is raised).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--staked-only", action="store_true",
                    help="Only playable/staked bets (default: all picks with a mappable market)")
    args = ap.parse_args()

    rows_out = []
    static_cache = {}  # (match_id, cat, sel) -> static closing odds, avoids repeat lookups

    # psycopg2's `with connection:` does not close the connection; do it explicitly.
    conn = psycopg2.connect(get_clean_dsn())
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT match_id, engine_version, odds_snapshot, payload_summary,
                eventual_outcome, unit_profit
                FROM prediction_runs
                WHERE odds_snapshot->>'source' = 'live_match'
                ORDER BY generated_at ASC
            """)
            runs = cur.fetchall()

            for r in runs:
                snap = _parse(r["odds_snapshot"])
                ps = _parse(r["payload_summary"])
                mp = ps.get("main_pick") if isinstance(ps, dict) else None
                if not isinstance(mp, dict):
                    continue  # no (or malformed) main pick on this run
                market = mp.get("market")
                pick = mp.get("pick")
                bet_odds = _f(mp.get("odds"))
                playable = bool(mp.get("playable"))
                if args.staked_only and not playable:
                    continue
                if not market or not pick or not bet_odds or bet_odds <= 1.0:
                    continue
                mapped = map_pick(market, pick)
                if not mapped or not mapped[1]:
                    continue
                cat, sel = mapped

                # closing line: prefer captured closing_odds, else static odd_selections
                blob = snap.get("closing_odds") if isinstance(snap, dict) else None
                closing = closing_from_blob(blob, cat, sel)
                src = "captured"
                if closing is None:
                    key = (r["match_id"], cat, sel)
                    if key not in static_cache:
                        cur.execute("""
                            SELECT os.odd_value FROM odd_categories oc
                            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                            WHERE oc.match_id = %s AND oc.name = %s AND os.name = %s
                            LIMIT 1
                        """, key)
                        row = cur.fetchone()
                        static_cache[key] = _f(row["odd_value"]) if row else None
                    closing = static_cache[key]
                    src = "static_proxy"
                if closing is None or closing <= 1.0:
                    continue

                clv = bet_odds / closing - 1.0
                rows_out.append({
                    "market": market, "playable": playable,
                    "bet_odds": bet_odds, "closing": closing, "clv": clv,
                    "src": src, "profit": _f(r["unit_profit"], 0.0) or 0.0,
                    "settled": r["eventual_outcome"] is not None
                    and not str(r["eventual_outcome"]).startswith("NO_BET"),
                })
    finally:
        conn.close()

    if not rows_out:
        print("No mappable runs with both bet-time and closing odds found.")
        return 0

    def agg(rs):
        # Summary stats for a non-empty list of row dicts.
        n = len(rs)
        clvs = [x["clv"] for x in rs]
        pos = sum(1 for c in clvs if c > 0)
        return {
            "n": n,
            "mean_clv_pct": round(100.0 * sum(clvs) / n, 2),
            "pct_positive": round(100.0 * pos / n, 1),
            "captured": sum(1 for x in rs if x["src"] == "captured"),
        }

    print("=" * 70)
    print("CLV REPORT — did we beat the closing line? (the edge compass)")
    print("=" * 70)
    o = agg(rows_out)
    print(f"runs analyzed: {o['n']} (closing source: {o['captured']} captured, "
          f"{o['n'] - o['captured']} static-proxy)")
    print(f"\nOVERALL mean CLV: {o['mean_clv_pct']}% "
          f"bets beating close: {o['pct_positive']}%")
    print(" (positive mean CLV = real edge; ~0 or negative = no edge)\n")

    staked = [x for x in rows_out if x["playable"]]
    if staked:
        s = agg(staked)
        print(f"STAKED only: n={s['n']} mean CLV={s['mean_clv_pct']}% "
              f"beating close={s['pct_positive']}%\n")

    print("BY MARKET")
    by_m = defaultdict(list)
    for x in rows_out:
        by_m[x["market"]].append(x)
    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
        a = agg(rs)
        print(f" {m:<8} n={a['n']:>4} mean CLV={a['mean_clv_pct']:>7}% "
              f"beating close={a['pct_positive']:>5}%")

    # CLV vs outcome sanity: do positive-CLV bets actually win more / lose less?
    print("\nCLV vs realized P/L (settled staked)")
    ss = [x for x in rows_out if x["playable"] and x["settled"]]
    if ss:
        posc = [x for x in ss if x["clv"] > 0]
        negc = [x for x in ss if x["clv"] <= 0]
        for label, grp in (("CLV>0", posc), ("CLV<=0", negc)):
            if grp:
                pr = sum(x["profit"] for x in grp)
                print(f" {label:<7} n={len(grp):>3} profit={pr:>7.2f}u "
                      f"ROI(flat1u)={round(100*pr/len(grp),1)}%")
    print("=" * 70)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+181
View File
@@ -0,0 +1,181 @@
"""
Edge Search — is there a profitable POCKET (by league) the global model misses?
==============================================================================
Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
tier leagues may be mispriced. This walks a leak-free model forward and slices
the value-bet ROI BY LEAGUE, requiring a real sample AND multi-fold consistency
so we don't chase one lucky window.
Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
in features (realistic). Value bet = biggest model_prob - implied edge > margin.
⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
capture, not the verified closing line. Anything flagged must be forward-
validated with real CLV (capture_closing_odds.py) before staking.
Usage: python scripts/edge_search.py --folds 6 --min-bets 150
"""
from __future__ import annotations
import argparse, os, sys, time
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# Engine root: the parent of the directory that holds this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the engine root first on sys.path so `data.db` resolves when imported.
sys.path.insert(0, AI_DIR)
# Historical match table the walk-forward search runs on.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier, kickoff-time and score columns: excluded from the feature matrix.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Confirmed/suspected leakage columns, dropped from the feature matrix.
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}
def league_names(ids):
    """Best-effort lookup of league names by id.

    Args:
        ids: iterable of league ids; ``None`` entries are dropped and the
            rest are compared as strings.

    Returns:
        Dict mapping ``str(id)`` to the league name. Empty when ``ids`` has no
        usable entries or when all three connection attempts fail; this
        function deliberately never raises on database errors.
    """
    out = {}
    wanted = [str(i) for i in ids if i is not None]
    if not wanted:
        return out

    # Imported lazily so an empty lookup needs no database modules at all.
    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor

    attempts = 3
    for attempt in range(attempts):
        try:
            conn = psycopg2.connect(get_clean_dsn())
            try:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (wanted,))
                    for r in cur.fetchall():
                        out[str(r["id"])] = r["name"]
            finally:
                # `with psycopg2.connect(...)` would leave the connection open.
                conn.close()
            return out
        except Exception:
            # Deliberate best-effort: names are cosmetic. Retry after a short
            # pause, but do not sleep once no attempt remains.
            if attempt < attempts - 1:
                time.sleep(1.0)
    return out
def main():
    """Walk a leak-free MS model forward and slice value-bet ROI into buckets.

    The second half of the time-sorted data is split into ``--folds`` test
    windows, each predicted by a model trained on everything before it. A
    flat 1u value bet is placed on the outcome with the largest
    model-minus-implied edge when that edge exceeds ``--margin``. Bets are then
    grouped three ways: league-reliability quartile, odds band, and the
    cross of the two. Buckets with fewer than ``--min-bets`` bets are hidden.

    Every record carries an explicit dimension tag. Selecting by key prefix
    would fold the 2-D "rel:… x …" keys into the reliability table and print
    the odds-band table twice, once under a "None" heading.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=6)
    ap.add_argument("--estimators", type=int, default=200)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--min-bets", type=int, default=150)
    args = ap.parse_args()

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()  # settled matches only
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))  # 0=home, 1=draw, 2=away
    # NOTE(review): the module docstring promises a per-league slice, but only
    # reliability bands are reported; league_id and league_names() are unused here.
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    # Missing / unparsable reliability becomes -1 -> "rel:unknown".
    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan] * len(df))),
                        errors="coerce").fillna(-1.0).values
    print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}")

    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
              "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}

    # reliability quartile edges from the betting universe (rel>=0)
    rv = rel[rel >= 0]
    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]

    def rel_band(x):
        if x < 0:
            return "rel:unknown"
        if x < qs[0]:
            return f"rel:Q1(<{qs[0]:.2f})"
        if x < qs[1]:
            return "rel:Q2"
        if x < qs[2]:
            return "rel:Q3"
        return f"rel:Q4(>={qs[2]:.2f})"

    def odds_band(o):
        return ("<1.5" if o < 1.5 else "1.5-2" if o < 2 else "2-3" if o < 3 else
                "3-5" if o < 5 else "5-8" if o < 8 else "8+")

    recs = []  # (dimension, bucket_key, fold, pnl, win)
    glob = {"n": 0, "pnl": 0.0, "win": 0}
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < 50:
            continue  # window too small to say anything
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
        ix = np.arange(len(yte))

        # Missing prices (<= 1.0) get an edge of -9 so they can never be picked.
        implied = np.where(ote > 1.0, 1.0 / ote, np.nan)
        edge = np.where(np.isnan(implied), -9.0, proba - implied)
        pick = edge.argmax(1)
        bet = edge[ix, pick] > args.margin
        win = (pick == yte) & bet
        pick_odds = ote[ix, pick]
        pnl = np.where(win, pick_odds - 1.0, -1.0)

        for i in np.flatnonzero(bet):
            won = int(win[i])
            rb = rel_band(rte[i])
            ob = odds_band(pick_odds[i])
            glob["n"] += 1
            glob["pnl"] += pnl[i]
            glob["win"] += won
            recs.append(("rel", rb, fi, pnl[i], won))
            recs.append(("odds", ob, fi, pnl[i], won))
            recs.append(("rel_x_odds", rb + " x " + ob, fi, pnl[i], won))
        print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}")

    print("\n" + "=" * 78)
    print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% "
          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
    print("=" * 78)

    rdf = pd.DataFrame(recs, columns=["dim", "grp", "fold", "pnl", "win"])

    def report(dim, title, top=None):
        # One table per dimension, best ROI first; `top` caps the row count.
        sub = rdf[rdf["dim"] == dim]
        if sub.empty:
            return
        print(f"\n{title}")
        print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
        print(" " + "-" * 54)
        out = []
        for k, d in sub.groupby("grp"):
            nb = len(d)
            if nb < args.min_bets:
                continue
            roi = 100 * d["pnl"].sum() / nb
            hit = 100 * d["win"].sum() / nb
            fp = d.groupby("fold")["pnl"].sum()  # per-fold net, for consistency
            out.append((roi, k, nb, hit, int((fp > 0).sum()), fp.shape[0]))
        for roi, k, nb, hit, fpv, ft in sorted(out, reverse=True)[:top]:
            print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}")

    report("rel", "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)")
    report("odds", "BY ODDS BAND")
    # 2D reliability x odds
    report("rel_x_odds", "BY RELIABILITY x ODDS (candidate pockets, n>=min-bets)", top=15)

    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")


if __name__ == "__main__":
    main()
+154
View File
@@ -0,0 +1,154 @@
"""
Generate Daily Picks — the serving picker for the validated favourite policy.
============================================================================
Loads the saved leak-free MS model (models/favorite_v1) and applies the
favourite-band value policy to a set of matches, emitting the day's STAKED
picks and logging them for forward paper-trade settlement.
Train/serve consistency: features MUST come from the SAME extractor that built
training_data_v27.csv. Production path = run the extractor nightly INCLUDING
upcoming (status NS) matches, then point this script at that CSV. Demo path =
use the tail of the training CSV as stand-in "today" matches (with the real
result shown, since those are settled).
Policy: bet the MS side with the biggest model_prob - implied edge, ONLY if
odds in [--lo,--hi] and edge>--margin. Flat 1u. No longshots, no parlays.
Non-MS markets are NOT staked (efficient -> model error). One bet per match.
Usage:
python scripts/generate_daily_picks.py --demo --n 20 # see it work now
python scripts/generate_daily_picks.py --features today.csv # production
python scripts/generate_daily_picks.py --settle # settle paper log
"""
from __future__ import annotations
import argparse, json, os, sys, datetime
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# All paths are resolved relative to the directory one level above this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Saved artifacts read by load_model(): model.json, feature_cols.json, metadata.json.
MODEL_DIR = os.path.join(AI_DIR, "models", "favorite_v1")
# Source of the demo matches in main() and of the final scores used by settle().
TRAIN_CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
# Paper-trade log: appended to by --log, rewritten by --settle.
PAPER_LOG = os.path.join(AI_DIR, "data", "paper_trades.csv")
# 1X2 odds columns and their pick labels, index-aligned (home, draw, away):
# MS_PICKS[k] is the label of the outcome priced by MS_ODDS[k].
MS_ODDS = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
MS_PICKS = ["1", "X", "2"]
def load_model():
    """Load the saved favourite-policy artifacts from ``MODEL_DIR``.

    Returns:
        tuple: ``(booster, feature_cols, metadata)`` -- the XGBoost booster
        restored from ``model.json`` and the parsed contents of
        ``feature_cols.json`` and ``metadata.json``.
    """
    def read_json(filename):
        # Both side-car files are UTF-8 JSON written next to the model.
        with open(os.path.join(MODEL_DIR, filename), encoding="utf-8") as fh:
            return json.load(fh)

    booster = xgb.Booster()
    booster.load_model(os.path.join(MODEL_DIR, "model.json"))
    feature_cols = read_json("feature_cols.json")
    metadata = read_json("metadata.json")
    return booster, feature_cols, metadata
def pick_for_rows(df, bst, feats, lo, hi, margin):
    """Score every row of ``df`` and return the best-edge MS pick per row.

    Args:
        df: DataFrame holding the feature columns and the ``MS_ODDS`` columns.
        bst: booster whose ``predict`` yields one probability per MS outcome.
        feats: feature column order expected by the booster; columns missing
            from ``df`` are created by ``reindex`` and filled with 0.0.
        lo, hi: odds band; a pick is staked only when ``lo <= odds < hi``.
        margin: minimum ``model_prob - 1/odds`` edge required to stake.

    Returns:
        list[dict]: one dict per row with keys ``idx``, ``pick``, ``odds``,
        ``model_prob``, ``edge`` and ``staked``.
    """
    features = df.reindex(columns=feats).apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    probs = bst.predict(xgb.DMatrix(features))  # [n,3] home/draw/away
    odds = df[MS_ODDS].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    implied = np.where(odds > 1.0, 1.0/odds, np.nan)
    # Outcomes without usable odds get a sentinel edge so they are never chosen
    # over a priced outcome.
    edges = np.where(np.isnan(implied), -9.0, probs - implied)

    picks = []
    for row in range(len(df)):
        col = int(np.argmax(edges[row]))
        price = float(odds[row, col])
        value = float(edges[row, col])
        in_band = lo <= price < hi
        picks.append({
            "idx": row,
            "pick": MS_PICKS[col],
            "odds": round(price, 2),
            "model_prob": round(float(probs[row, col]), 4),
            "edge": round(value, 4),
            "staked": (value > margin) and in_band,
        })
    return picks
def settle():
    """Settle open paper trades against final scores and print a summary.

    Reads ``PAPER_LOG``, fills ``result`` ("WON"/"LOST") for every bet that has
    no result yet and whose match has a full-time score in ``TRAIN_CSV``,
    recomputes the flat-1u ``pnl`` column, rewrites the log and prints hit
    rate / ROI / net over all settled bets. Bets whose match is missing from
    the CSV, or whose score is incomplete, stay open.
    """
    if not os.path.exists(PAPER_LOG):
        print("No paper_trades.csv yet.")
        return
    pt = pd.read_csv(PAPER_LOG)
    if "result" not in pt.columns:
        pt["result"] = np.nan

    results = pt["result"].tolist()
    open_pos = [i for i, r in enumerate(results) if pd.isna(r)]
    if not open_pos:
        print("No open bets to settle.")
    else:
        # settle from training CSV scores if present, else needs DB (left as note)
        src = pd.read_csv(TRAIN_CSV, low_memory=False,
                          usecols=["match_id", "score_home", "score_away"])
        home = pd.to_numeric(src["score_home"], errors="coerce")
        away = pd.to_numeric(src["score_away"], errors="coerce")
        # One hash lookup per bet instead of scanning the whole index per bet.
        # setdefault keeps the first row of a duplicated match_id.
        scores = {}
        for mid, sh, sa in zip(src["match_id"], home, away):
            if not pd.isna(mid):
                scores.setdefault(mid, (sh, sa))

        match_ids = pt["match_id"].tolist()
        picks = pt["pick"].tolist()
        for i in open_pos:
            mid = match_ids[i]
            if pd.isna(mid) or mid not in scores:
                continue
            sh, sa = scores[mid]
            # Both scores must be known; a half-missing score is not a result.
            if pd.isna(sh) or pd.isna(sa):
                continue
            outcome = "1" if sh > sa else ("X" if sh == sa else "2")
            results[i] = "WON" if str(picks[i]) == outcome else "LOST"

    result_col = pd.Series(results, index=pt.index, dtype=object)
    pt["result"] = result_col
    odds = pd.to_numeric(pt["odds"], errors="coerce")
    # Flat 1u stake: a win pays odds-1, a loss costs 1, open bets stay NaN.
    pt["pnl"] = np.where(result_col == "WON", odds - 1.0,
                         np.where(result_col == "LOST", -1.0, np.nan))
    pt.to_csv(PAPER_LOG, index=False)

    s = pt.dropna(subset=["pnl"])
    if len(s):
        print(f"Settled {len(s)} bets: hit={100*(s['result']=='WON').mean():.1f}% "
              f"ROI={100*s['pnl'].sum()/len(s):+.2f}% net={s['pnl'].sum():+.1f}u")
    return
def main():
    """CLI entry point: score matches, print the staked MS picks, optionally log them.

    ``--settle`` delegates to ``settle()`` and exits. Otherwise the saved model
    is loaded and applied either to ``--features`` (real upcoming matches) or,
    when no features file is given, to the last ``--n`` rows of the training
    CSV as a demo (the real result is then shown next to each pick).

    With ``--log`` (ignored in demo mode) staked picks are appended to
    ``PAPER_LOG``. A match that is already in the log is not logged again, so
    re-running the script on the same fixtures keeps one bet per match.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--features", help="CSV of upcoming matches in training schema")
    ap.add_argument("--demo", action="store_true", help="use tail of training CSV as 'today'")
    ap.add_argument("--n", type=int, default=20)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.03)
    ap.add_argument("--settle", action="store_true")
    ap.add_argument("--log", action="store_true", help="append staked picks to paper_trades.csv")
    args = ap.parse_args()
    if args.settle:
        settle()
        return

    bst, feats, meta = load_model()
    print(f"Model {meta['version']} (trained {meta['trained_at']}, holdout "
          f"ROI {meta['holdout_eval']['roi_pct']}%) band[{args.lo},{args.hi}] margin {args.margin}\n")

    if args.features:
        df = pd.read_csv(args.features, low_memory=False)
        demo = False
    else:
        # Without --features the demo path is used, whether or not --demo was passed.
        df = pd.read_csv(TRAIN_CSV, low_memory=False).sort_values("mst_utc").tail(args.n).reset_index(drop=True)
        demo = True
        print("(DEMO: last matches of training CSV as stand-in for today)\n")

    picks = pick_for_rows(df, bst, feats, args.lo, args.hi, args.margin)
    staked = [p for p in picks if p["staked"]]
    print(f"{len(df)} matches scanned -> {len(staked)} STAKED MS picks\n")
    print(f" {'match_id':<28}{'pick':>5}{'odds':>7}{'model%':>8}{'edge%':>7}" + (" result" if demo else ""))
    print(" "+"-"*60)

    log_rows = []
    for p in staked:
        r = df.iloc[p["idx"]]
        mid = str(r["match_id"])
        res = ""
        if demo:
            sh, sa = r.get("score_home"), r.get("score_away")
            if pd.notna(sh):
                out = "1" if sh > sa else ("X" if sh == sa else "2")
                res = " WON" if p["pick"] == out else " lost"
        print(f" {mid:<28}{p['pick']:>5}{p['odds']:>7.2f}{100*p['model_prob']:>8.1f}{100*p['edge']:>+7.1f}{res}")
        log_rows.append({"logged_at": datetime.datetime.now().isoformat(timespec="seconds"),
                         "match_id": mid, "market": "MS", "pick": p["pick"], "odds": p["odds"],
                         "model_prob": p["model_prob"], "edge": p["edge"], "stake": 1.0,
                         "result": np.nan, "pnl": np.nan})

    if args.log and log_rows and not demo:
        new = pd.DataFrame(log_rows).drop_duplicates(subset="match_id", keep="first")
        if os.path.exists(PAPER_LOG):
            old = pd.read_csv(PAPER_LOG)
            if "match_id" in old.columns:
                # One bet per match: keep the first logged pick, skip re-logs so
                # repeated runs do not inflate the paper-trade sample.
                already = set(old["match_id"].astype(str))
                new = new[~new["match_id"].isin(already)]
            combined = pd.concat([old, new], ignore_index=True) if len(new) else old
        else:
            combined = new
        combined.to_csv(PAPER_LOG, index=False)
        skipped = len(log_rows) - len(new)
        print(f"\n logged {len(new)} picks -> {PAPER_LOG}"
              + (f" ({skipped} already logged, skipped)" if skipped else ""))
    elif args.log and demo:
        print("\n (--log ignored in --demo; only real upcoming picks are logged)")

    print("\nReminder: paper-trade only. Stake real money after weeks of forward")
    print("CLV>0 + ROI>0 (settle with --settle, check scoreboard/clv_report).")
if __name__ == "__main__":
main()
+253
View File
@@ -0,0 +1,253 @@
"""
Live Scoreboard — the single source of truth for real betting performance.
=========================================================================
Reads the *forward-tracked* results in `prediction_runs` (one row per analyzed
match, with the staked main pick + actual outcome + realized unit_profit) and
reports what ACTUALLY happened with real money logic — NOT a backtest.
Why this exists: backtests on this codebase are overfit (a paper "+32.7% ROI"
strategy that the live engine never even ran). The only trustworthy number is
the realized P/L recorded after matches settle. This tool surfaces it.
Read-only. SELECT only. Safe to run anytime.
Usage:
python scripts/live_scoreboard.py
python scripts/live_scoreboard.py --days 30
python scripts/live_scoreboard.py --version v28-pro-max
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
# utf-8 stdout so Turkish market/league names never crash on Windows cp1252
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, AI_ENGINE_DIR)
from data.db import get_clean_dsn # noqa: E402
import psycopg2 # noqa: E402
from psycopg2.extras import RealDictCursor # noqa: E402
# (lo, hi, label) odds buckets used for the "BY ODDS BAND" report section.
# _band() tests them in order as half-open intervals: lo <= odds < hi.
ODDS_BANDS = [(0, 1.5, "<1.5"), (1.5, 2.0, "1.5-2"), (2.0, 3.0, "2-3"),
(3.0, 5.0, "3-5"), (5.0, 6.0, "5-6"), (6.0, 7.5, "6-7.5"),
(7.5, 999, "7.5+")]
def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
try:
return float(x) if x is not None else d
except (TypeError, ValueError):
return d
def _parse(j: Any) -> Dict[str, Any]:
if isinstance(j, str):
try:
return json.loads(j)
except Exception:
return {}
return j or {}
def _band(odds: Optional[float]) -> str:
    """Return the label of the first ``ODDS_BANDS`` bucket with lo <= odds < hi.

    Returns "?" when ``odds`` is None or falls in no bucket.
    """
    if odds is not None:
        for lower, upper, label in ODDS_BANDS:
            if lower <= odds < upper:
                return label
    return "?"
def fetch_rows(args) -> List[Dict[str, Any]]:
    """Fetch settled prediction runs (read-only SELECT), oldest first.

    Args:
        args: parsed CLI namespace; ``args.version`` filters by
            ``engine_version`` and ``args.days`` keeps only runs generated in
            the last N days. Both are optional.

    Returns:
        Rows of ``prediction_runs`` with a non-null ``eventual_outcome``, as
        dict-like records ordered by ``generated_at`` ascending.
    """
    dsn = get_clean_dsn()
    # The WHERE fragments are fixed strings; user-supplied values only ever
    # travel through query parameters.
    where = ["eventual_outcome IS NOT NULL"]
    params: List[Any] = []
    if args.version:
        where.append("engine_version = %s")
        params.append(args.version)
    if args.days:
        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
        where.append("generated_at >= %s")
        params.append(cutoff)
    sql = f"""
        SELECT match_id, engine_version, generated_at, eventual_outcome,
               unit_profit, payload_summary
        FROM prediction_runs
        WHERE {' AND '.join(where)}
        ORDER BY generated_at ASC
    """
    # psycopg2's connection context manager only ends the transaction; it does
    # not close the connection, so close it explicitly.
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        conn.close()
def distill(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten each run into one record: the staked main pick + realized P/L.

    A run counts as ``staked`` when its main pick is playable with a positive
    stake, and as ``settled_stake`` when it is staked and its outcome does not
    start with NO_BET / PUSH / VOID / CANCEL. ``win`` is a settled stake with
    positive recorded profit.
    """
    not_a_bet = ("NO_BET", "PUSH", "VOID", "CANCEL")

    def to_record(row: Dict[str, Any]) -> Dict[str, Any]:
        summary = _parse(row["payload_summary"])
        main_pick = summary.get("main_pick") or {}
        is_playable = bool(main_pick.get("playable"))
        stake_units = _f(main_pick.get("stake_units"), 0.0) or 0.0
        unit_profit = _f(row["unit_profit"], 0.0) or 0.0
        outcome_text = str(row["eventual_outcome"] or "")
        is_staked = is_playable and stake_units > 0
        # settled stake = a real bet with a win/loss (exclude NO_BET / push)
        is_settled = is_staked and not outcome_text.startswith(not_a_bet)
        return {
            "match_id": row["match_id"],
            "version": row["engine_version"],
            "ts": row["generated_at"],
            "market": main_pick.get("market") or "?",
            "pick": main_pick.get("pick"),
            "odds": _f(main_pick.get("odds")),
            "stake": stake_units,
            "profit": unit_profit,
            "outcome": outcome_text,
            "staked": is_staked,
            "settled_stake": is_settled,
            "win": is_settled and unit_profit > 0,
        }

    return [to_record(row) for row in rows]
def _agg(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
# NOTE: recorded unit_profit is on a FLAT 1u basis (win=odds-1, loss=-1),
# independent of the brain's suggested stake_units. So ROI is profit per
# bet at 1u flat = profit / n. (Using stake_units as denominator is wrong:
# it double-counts and produces impossible >100% losses.)
s = [r for r in recs if r["settled_stake"]]
n = len(s)
wins = sum(1 for r in s if r["win"])
sug_stake = sum(r["stake"] for r in s)
profit = sum(r["profit"] for r in s)
return {
"n": n,
"wins": wins,
"hit_pct": round(100.0 * wins / n, 1) if n else None,
"sug_stake": round(sug_stake, 2),
"profit": round(profit, 2),
"roi_pct": round(100.0 * profit / n, 1) if n else None, # flat 1u
}
def _line(label: str, a: Dict[str, Any]) -> str:
return (f" {label:<14} n={a['n']:>4} hit={str(a['hit_pct'] if a['hit_pct'] is not None else '-'):>5}% "
f"profit={a['profit']:>8.2f}u ROI(flat1u)={str(a['roi_pct'] if a['roi_pct'] is not None else '-'):>7}%")
def risk_metrics(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Walk settled bets in ``ts`` order and report drawdown and losing streak.

    Returns a dict with ``max_drawdown_u`` (most negative distance of the
    cumulative profit below its running peak, 0.0 if never below),
    ``longest_losing_streak`` (longest run of bets with profit <= 0) and
    ``final_cum_u`` (total cumulative profit).
    """
    running = 0.0
    high_water = 0.0
    deepest = 0.0
    current_run = 0
    longest_run = 0
    for rec in sorted(recs, key=lambda item: item["ts"]):
        if not rec["settled_stake"]:
            continue
        running += rec["profit"]
        high_water = max(high_water, running)
        deepest = min(deepest, running - high_water)
        if rec["profit"] <= 0:
            current_run += 1
            longest_run = max(longest_run, current_run)
        else:
            current_run = 0
    return {
        "max_drawdown_u": round(deepest, 2),
        "longest_losing_streak": longest_run,
        "final_cum_u": round(running, 2),
    }
def main():
    """CLI entry point: print the realized-results scoreboard.

    Fetches settled runs (optionally filtered by ``--days`` / ``--version``),
    then prints the overall aggregate with risk metrics, followed by
    breakdowns by engine version, market, odds band and ISO week. The
    breakdowns by market, band and week only include settled staked bets.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--days", type=int, default=None, help="Only last N days")
    ap.add_argument("--version", help="Filter by engine_version")
    args = ap.parse_args()
    rows = fetch_rows(args)
    recs = distill(rows)

    print("=" * 74)
    print("LIVE SCOREBOARD — realized results from prediction_runs (NOT backtest)")
    print("=" * 74)
    if recs:
        lo = min(r["ts"] for r in recs).date()
        hi = max(r["ts"] for r in recs).date()
        print(f"window: {lo} .. {hi} settled runs: {len(recs)}"
              + (f" filter: {args.version}" if args.version else ""))
    print()

    overall = _agg(recs)
    print("OVERALL (staked = playable bets only)")
    print(_line("ALL", overall))
    no_bet = sum(1 for r in recs if not r["staked"])
    print(f" (analyzed {len(recs)} matches; {overall['n']} actually staked, "
          f"{no_bet} NO_BET)")
    if overall["n"]:
        rm = risk_metrics(recs)
        print(f" max drawdown: {rm['max_drawdown_u']}u "
              f"longest losing streak: {rm['longest_losing_streak']} "
              f"net: {rm['final_cum_u']}u")
    print()

    print("BY ENGINE VERSION")
    by_v = defaultdict(list)
    for r in recs:
        by_v[r["version"]].append(r)
    for v, rs in sorted(by_v.items(), key=lambda kv: -len(kv[1])):
        print(_line(v, _agg(rs)))
    print()

    settled = [r for r in recs if r["settled_stake"]]

    print("BY MARKET (staked)")
    by_m = defaultdict(list)
    for r in settled:
        by_m[r["market"]].append(r)
    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
        print(_line(m, _agg(rs)))
    if not by_m:
        print(" (no staked settled bets in window)")
    print()

    print("BY ODDS BAND (staked)")
    by_b = defaultdict(list)
    for r in settled:
        by_b[_band(r["odds"])].append(r)
    for _, _, name in ODDS_BANDS:
        if name in by_b:
            print(_line(name, _agg(by_b[name])))
    # Bets whose odds are missing or outside every band land in "?"; show them
    # too so the band rows add up to the overall staked count.
    if "?" in by_b:
        print(_line("?", _agg(by_b["?"])))
    print()

    print("WEEKLY TREND (staked)")
    by_w = defaultdict(list)
    for r in settled:
        iso = r["ts"].isocalendar()
        by_w[f"{iso[0]}-W{iso[1]:02d}"].append(r)
    for w in sorted(by_w):
        print(_line(w, _agg(by_w[w])))
    print()

    print("=" * 74)
    print("READ: ROI < 0 over a meaningful sample = the staked signals are not")
    print("profitable. 'NO_BET' rows are free (no stake). CLV is unmeasurable")
    print("until odds movement is captured (see scripts + odds_history fix).")
    print("=" * 74)
if __name__ == "__main__":
main()
+182
View File
@@ -0,0 +1,182 @@
"""
Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
========================================================================
Not "play the handed main_pick". For each match, score EVERY market the model
covers, compare model prob vs market implied, and select the single best VALUE
bet across all markets. Leak-free, walk-forward, honest.
Markets (truth derived from scores, not trusted labels):
MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.
Outputs:
(A) per-market value ROI -> which bet types actually carry edge
(B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
fold-consistency, and the model-free baseline.
⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD; forward
paper-trade with real CLV before staking.
Usage: python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
"""
from __future__ import annotations
import argparse, os, sys
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# Training data lives under <dir above this script>/data.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
# Identifier / timestamp / score columns: never used as model features
# (main() derives the truth labels from the score columns instead).
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
"score_home","score_away","ht_score_home","ht_score_away"}
# Columns excluded from the feature set so the models stay leak-free.
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
"squad_diff","home_squad_quality","away_squad_quality",
"referee_home_bias","referee_avg_goals"}
# market -> (kind, [odds_cols aligned to classes], [pick labels aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
def ou(line):
    """Build the full-time over/under truth function for ``line``.

    The returned function maps (sh, sa, hh, ha) to 0 (Over) when the
    full-time goal total exceeds ``line``, else 1 (Under).
    """
    def truth(sh, sa, hh, ha):
        total = sh + sa
        if total > line:
            return 0  # Over
        return 1      # Under
    return truth
def htou(line):
    """Build the half-time over/under truth function for ``line``.

    The returned function maps (sh, sa, hh, ha) to 0 (Over) when the half-time
    goal total exceeds ``line``, else 1 (Under). It returns None when either
    half-time score is missing, so a half-recorded score never becomes a
    fabricated "Under" label.
    """
    def truth(sh, sa, hh, ha):
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1
    return truth
def ms_truth(sh, sa, hh, ha):
    """Full-time 1X2 class from the final score: 0=home win, 1=draw, 2=away win."""
    if sh > sa:
        return 0
    if sh == sa:
        return 1
    return 2
def ht_truth(sh, sa, hh, ha):
    """Half-time 1X2 class: 0=home leads, 1=level, 2=away leads.

    Returns None when either half-time score is missing, so a half-recorded
    score never becomes a fabricated "away leads" label.
    """
    if np.isnan(hh) or np.isnan(ha):
        return None
    if hh > ha:
        return 0
    return 1 if hh == ha else 2
def btts_truth(sh, sa, hh, ha):
    """Both-teams-to-score class from the final score: 0=Yes, 1=No."""
    if sh > 0 and sa > 0:
        return 0  # Yes
    return 1      # No
# Market registry. Each value is (kind, odds columns, pick labels, truth_fn);
# odds columns and pick labels are listed in class-index order, i.e. the order
# of the class indices returned by truth_fn. kind selects the model type in
# main(): "multi" -> PARAMS_M (3 classes), "binary" -> PARAMS_B.
MARKETS = {
"MS": ("multi", ["odds_ms_h","odds_ms_d","odds_ms_a"], ["1","X","2"], ms_truth),
"HT": ("multi", ["odds_ht_ms_h","odds_ht_ms_d","odds_ht_ms_a"], ["1","X","2"], ht_truth),
"OU05": ("binary", ["odds_ou05_o","odds_ou05_u"], ["Üst","Alt"], ou(0.5)),
"OU15": ("binary", ["odds_ou15_o","odds_ou15_u"], ["Üst","Alt"], ou(1.5)),
"OU25": ("binary", ["odds_ou25_o","odds_ou25_u"], ["Üst","Alt"], ou(2.5)),
"OU35": ("binary", ["odds_ou35_o","odds_ou35_u"], ["Üst","Alt"], ou(3.5)),
"HT_OU05": ("binary", ["odds_ht_ou05_o","odds_ht_ou05_u"], ["Üst","Alt"], htou(0.5)),
"HT_OU15": ("binary", ["odds_ht_ou15_o","odds_ht_ou15_u"], ["Üst","Alt"], htou(1.5)),
"BTTS": ("binary", ["odds_btts_y","odds_btts_n"], ["Var","Yok"], btts_truth),
}
# XGBoost settings for the 3-class ("multi") market models.
PARAMS_M = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
# XGBoost settings for the two-outcome ("binary") market models; main() trains
# them to predict class 0 (the first odds column / pick label).
PARAMS_B = {"objective":"binary:logistic","max_depth":5,"eta":0.05,
"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
def main():
    """Walk-forward, leak-free value backtest across every market in ``MARKETS``.

    The last half of the time-ordered data is split into ``--folds`` test
    folds. For each fold one model per market is trained on everything before
    the fold. A candidate bet is an outcome whose odds lie in [--lo, --hi) and
    whose ``model_prob - 1/odds`` edge exceeds ``--margin``. Reported at flat
    1u stakes:

      (A) per market: the best candidate of that market for each match;
      (B) selector: the single best candidate across all markets per match,
          with per-fold consistency and the distribution over markets.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=150)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.6)
    ap.add_argument("--margin", type=float, default=0.03)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")

    # precompute truth + odds per market
    MK = {}
    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            print(f" skip {mname}: missing odds cols")
            continue
        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
        # dtype=object keeps None (truth unavailable) next to the int classes.
        truth = np.array([tfn(*scores) for scores in zip(SH, SA, HH, HA)], dtype=object)
        MK[mname] = (kind, O, picks, truth)

    start = int(N*0.5)
    bounds = np.linspace(start, N, args.folds+1, dtype=int)

    # accumulators
    per_market = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}  # (A) best value pick within market
    sel = {"n":0,"pnl":0.0,"win":0,"fold":{}}                 # (B) cross-market selector
    sel_by_mkt = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}

    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi+1]
        if te1-te0 < 50:
            continue
        # train each market model on [:te0], predict test
        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
        for m, (kind, O, picks, truth) in MK.items():
            ytr_full = truth[:te0]
            # mask invalid truth (e.g., HT markets with missing HT score)
            valid_tr = np.array([v is not None for v in ytr_full])
            ytr = ytr_full[valid_tr].astype(int)
            if kind == "multi":
                bst = xgb.train(PARAMS_M, xgb.DMatrix(X[:te0][valid_tr], label=ytr), num_boost_round=args.estimators)
                P = bst.predict(xgb.DMatrix(X[te0:te1]))  # [n,3]
            else:
                # class 0 is the "positive" outcome (first odds column)
                pos = (ytr == 0).astype(int)
                bst = xgb.train(PARAMS_B, xgb.DMatrix(X[:te0][valid_tr], label=pos), num_boost_round=args.estimators)
                ppos = bst.predict(xgb.DMatrix(X[te0:te1]))
                P = np.column_stack([ppos, 1.0-ppos])  # [n,2] -> [pos,neg]
            cand[m] = (P, O[te0:te1], truth[te0:te1])

        # iterate test matches
        for j in range(te1-te0):
            best = None  # (edge, market, pickidx, odds, won)
            for m, (P, Ot, Tt) in cand.items():
                t = Tt[j]
                if t is None:
                    continue
                probs = P[j]
                odds = Ot[j]
                # (A) this market's single best in-band value pick for the match
                bestk = None
                for k in range(len(probs)):
                    o = odds[k]
                    if o <= 1.0:
                        continue
                    e = probs[k] - 1.0/o
                    if e > args.margin and args.lo <= o < args.hi and (bestk is None or e > bestk[0]):
                        bestk = (e, k, o, int(t == k))
                if bestk is None:
                    continue
                e, k, o, won = bestk
                pnl = (o-1.0) if won else -1.0
                d = per_market[m]
                d["n"] += 1; d["pnl"] += pnl; d["win"] += won
                # (B) the selector candidate is the best of the per-market bests;
                # the strict '>' keeps the earliest market on ties.
                if best is None or e > best[0]:
                    best = (e, m, k, o, won)
            # selector: single best value bet across all markets for this match
            if best is not None:
                edge, m, k, o, won = best
                pnl = (o-1.0) if won else -1.0
                sel["n"] += 1; sel["pnl"] += pnl; sel["win"] += won
                sel["fold"][fi] = sel["fold"].get(fi, 0.0)+pnl
                d = sel_by_mkt[m]
                d["n"] += 1; d["pnl"] += pnl; d["win"] += won
        print(f" fold {fi}: tested {te1-te0:,}")

    def line(name, d):
        n = d["n"]
        roi = 100*d["pnl"]/n if n else float('nan')
        hit = 100*d["win"]/n if n else float('nan')
        return f" {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"

    print("\n"+"="*70)
    print(f"(A) PER-MARKET value ROI (best value pick in band [{args.lo},{args.hi}], margin {args.margin})")
    print("="*70)
    for m in sorted(per_market, key=lambda x: -(100*per_market[x]['pnl']/per_market[x]['n'] if per_market[x]['n'] else -99)):
        print(line(m, per_market[m]))

    print("\n"+"="*70)
    print("(B) CROSS-MARKET SELECTOR (best value bet per match, all markets)")
    print("="*70)
    print(line("SELECTOR", sel))
    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
    print(f" folds positive: {folds_pos}/{len(sel['fold'])}")
    print(" selector picks distributed across markets:")
    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]['n']):
        if sel_by_mkt[m]["n"] > 0:
            print(" "+line(m, sel_by_mkt[m]).strip())

    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")
if __name__ == "__main__":
main()
+112
View File
@@ -0,0 +1,112 @@
"""
Train Favorite-Policy Model (v1) — leak-free MS model for the validated strategy.
================================================================================
Trains a LEAK-FREE 1X2 model (drops the result-encoding columns) and saves it
plus the feature list and policy metadata. This is the brain of the new system;
the favourite-band value policy (odds ~1.5-2.2, model_prob>implied, flat stake)
is applied on top of its probabilities at serving time.
Honest holdout: trains on the first --holdout-frac of history, evaluates the
EXACT policy on the most recent slice (never seen in training), then retrains
on ALL history for the saved production artifact.
Saves to models/favorite_v1/: model.json, feature_cols.json, metadata.json
Usage: python scripts/train_favorite_model.py
"""
from __future__ import annotations
import argparse, json, os, sys, datetime
import numpy as np, pandas as pd, xgboost as xgb
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try: sys.stdout.reconfigure(encoding="utf-8")
except Exception: pass
# Inputs and outputs are resolved relative to the directory above this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
# Output directory for model.json, feature_cols.json and metadata.json.
OUT = os.path.join(AI_DIR, "models", "favorite_v1")
# Identifier / timestamp / score columns: excluded from the feature list
# (the target is derived from the score columns in main()).
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
"score_home","score_away","ht_score_home","ht_score_away"}
# Result-encoding leakage — never feed these to the model (train OR serve).
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
"squad_diff","home_squad_quality","away_squad_quality",
"referee_home_bias","referee_avg_goals"}
# XGBoost settings for the 3-class (home/draw/away) probability model.
PARAMS = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
def policy_eval(P, y, O, lo, hi, margin):
    """Evaluate the favourite-band value policy at flat 1u stakes.

    For each row the outcome with the largest ``P - 1/O`` edge is chosen
    (outcomes with odds <= 1.0 are never preferred); the bet is placed only
    when that edge exceeds ``margin`` and the chosen odds satisfy
    ``lo <= odds < hi``.

    Args:
        P: [n, 3] model probabilities.
        y: [n] true class indices.
        O: [n, 3] decimal odds aligned with the columns of ``P``.
        lo, hi, margin: policy band and minimum edge.

    Returns:
        dict with ``bets``, ``hit_pct``, ``roi_pct`` and ``net_u``.
    """
    rows = np.arange(len(y))
    implied = np.where(O > 1.0, 1.0/O, np.nan)
    edge = np.where(np.isnan(implied), -9.0, P - implied)
    chosen = edge.argmax(1)
    chosen_edge = edge[rows, chosen]
    chosen_odds = O[rows, chosen]
    placed = (chosen_edge > margin) & (chosen_odds >= lo) & (chosen_odds < hi)
    won = (chosen == y) & placed
    returns = np.where(won, chosen_odds-1.0, -1.0)[placed]
    n_bets = int(placed.sum())
    denom = max(n_bets, 1)  # avoids dividing by zero when nothing was bet
    return {
        "bets": n_bets,
        "hit_pct": round(100*won.sum()/denom, 1),
        "roi_pct": round(100*returns.sum()/denom, 2),
        "net_u": round(float(returns.sum()), 1),
    }
def main():
    """Train, holdout-evaluate and save the favourite-policy MS model.

    Steps: load the time-ordered training CSV, derive the 1X2 target from the
    final scores, evaluate the policy on the most recent ``--holdout-frac``
    slice with a model trained only on the earlier rows, then retrain on all
    rows and write ``model.json``, ``feature_cols.json`` and ``metadata.json``
    to ``OUT``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--holdout-frac", type=float, default=0.15)
    ap.add_argument("--estimators", type=int, default=300)
    args = ap.parse_args()

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    # Keep only matches with a complete final score.
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    sh = sh[ok.values].values
    sa = sa[ok.values].values
    # Target classes: 0=home, 1=draw, 2=away.
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    n_rows = len(df)
    n_feats = len(feats)
    print(f" {n_rows:,} rows, {n_feats} leak-free features")

    # ── Honest holdout (last slice, never trained on) ──
    cut = int(n_rows * (1 - args.holdout_frac))
    holdout_model = xgb.train(PARAMS, xgb.DMatrix(X[:cut], label=y[:cut]), num_boost_round=args.estimators)
    Ph = holdout_model.predict(xgb.DMatrix(X[cut:]))
    acc = float((Ph.argmax(1) == y[cut:]).mean())
    hold = policy_eval(Ph, y[cut:], O[cut:], args.lo, args.hi, args.margin)
    print(f"\nHOLDOUT (last {args.holdout_frac:.0%}, {n_rows-cut:,} matches, never seen):")
    print(f" MS accuracy: {acc*100:.1f}%")
    print(f" POLICY band[{args.lo},{args.hi}] margin {args.margin}: {hold}")

    # ── Production model: retrain on ALL history ──
    print("\nTraining production model on ALL history ...")
    final = xgb.train(PARAMS, xgb.DMatrix(X, label=y), num_boost_round=args.estimators)
    os.makedirs(OUT, exist_ok=True)
    final.save_model(os.path.join(OUT, "model.json"))

    def write_json(filename, payload):
        with open(os.path.join(OUT, filename), "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    write_json("feature_cols.json", feats)
    policy = {"odds_lo": args.lo, "odds_hi": args.hi, "margin": args.margin,
              "stake": "flat 1u", "rule": "bet model's max value edge if picked odds in band",
              "never": ["longshots odds>=hi", "parlays/combos"]}
    meta = {
        "version": "favorite_v1",
        "trained_at": datetime.datetime.now().isoformat(timespec="seconds"),
        "market": "MS",
        "classes": {"0": "home(1)", "1": "draw(X)", "2": "away(2)"},
        "policy": policy,
        "n_train": n_rows, "n_features": n_feats,
        "leaky_excluded": sorted(LEAKY),
        "holdout_eval": {"accuracy_pct": round(acc*100,1), **hold},
        "caveat": "CSV odds are a static capture, not verified closing. Forward paper-trade with real CLV before staking.",
    }
    write_json("metadata.json", meta)

    print(f"\n✅ Saved production model to {OUT}/")
    print(f" model.json, feature_cols.json ({n_feats} feats), metadata.json")
    print("\nNEXT: serving wrapper that loads this + applies the policy to upcoming")
    print("matches, logs paper-trade picks, and we measure real forward CLV/ROI.")
if __name__ == "__main__":
main()
+191
View File
@@ -0,0 +1,191 @@
"""
Walk-Forward Odds-Blind Experiment — THE pivotal test.
======================================================
Question this answers: can a model BEAT THE MARKET out-of-sample, betting only
on information the price doesn't already contain?
Method (no leakage, time-ordered):
* data sorted by kickoff (mst_utc); train on the past, test on the future,
rolled over several folds.
* TWO models on the MS (1X2) market:
ALL = every feature INCLUDING the bookmaker odds (what the live
engine does -> it mostly re-learns the price).
BLIND = identical but odds/implied/_present columns REMOVED, so the
model must disagree with the market using fundamentals only.
* For each, an honest value-bet simulation on the test fold using the REAL
odds payouts (margin included): bet the outcome with the biggest
model_prob - implied_prob edge above a margin; ROI = realized P/L per 1u.
Read: if BLIND's value ROI is consistently > 0 across folds, there is a real,
exploitable lead. If both are <= 0 (expected), these markets aren't beatable
with this data and the honest move is to stop staking.
Usage:
python scripts/walkforward_oddsblind.py
python scripts/walkforward_oddsblind.py --folds 6 --estimators 300
"""
from __future__ import annotations
import argparse
import os
import sys
import numpy as np
import pandas as pd
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
import xgboost as xgb # noqa: E402
META = {"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
"score_home", "score_away", "ht_score_home", "ht_score_away"}
# Confirmed target leakage: *_goals_form integer-valued and ~0.63 correlated
# with THIS match's goals; their diff equals the actual goal diff 73% of the
# time. Excluded so the experiment measures genuine pre-match predictive power.
LEAKY = {
# CONFIRMED (encode the actual match result):
"home_goals_form", "away_goals_form", # ~0.63 corr w/ this match's goals
"total_goals", # this match's full-time total
"ht_total_goals", # this match's half-time total
# STRONG SUSPECTS (dominate importance + high outcome corr; audit extractor):
"squad_diff", "home_squad_quality", "away_squad_quality",
"referee_home_bias", "referee_avg_goals",
}
def is_odds_col(c: str) -> bool:
    """Return True when the column name contains "odds" or "implied" (case-insensitive)."""
    lowered = c.lower()
    return any(token in lowered for token in ("odds", "implied"))
def logloss(y: np.ndarray, p: np.ndarray) -> float:
    """Mean negative log-probability assigned to the true class.

    ``p`` is [n, classes] and ``y`` holds the true class index per row.
    Probabilities are clipped to [1e-9, 1 - 1e-9] before taking the log.
    """
    clipped = np.clip(p, 1e-9, 1 - 1e-9)
    true_class_prob = clipped[np.arange(len(y)), y]
    return float(-np.mean(np.log(true_class_prob)))
def value_sim(proba: np.ndarray, y: np.ndarray, odds: np.ndarray,
              margin: float) -> dict:
    """Bet the class with the biggest (model_prob - 1/odds) edge above margin.

    Returns ``{"n", "roi", "hit"}`` at flat 1u stakes; ``roi`` and ``hit`` are
    None when no row clears the margin.
    """
    rows = np.arange(len(y))
    implied = np.where(odds > 1.0, 1.0 / odds, np.nan)
    # ignore classes without valid odds
    edge = np.where(np.isnan(implied), -9.0, proba - implied)
    pick = np.argmax(edge, axis=1)
    placed = edge[rows, pick] > margin
    n_bets = int(placed.sum())
    if n_bets == 0:
        return {"n": 0, "roi": None, "hit": None}
    won = (pick == y) & placed
    payout = np.where(won, odds[rows, pick] - 1.0, -1.0)[placed]
    roi = round(100.0 * payout.sum() / n_bets, 2)
    hit = round(100.0 * won[placed].sum() / n_bets, 1)
    return {"n": n_bets, "roi": roi, "hit": hit}
def train_eval(Xtr, ytr, Xte, yte, odds_te, est, margins):
    """Train a 3-class booster on the train split and score it on the test split.

    Returns a dict with ``logloss``, ``acc`` (percent) and one
    ``val@<margin>`` entry per value in ``margins`` holding the
    ``value_sim`` result for that margin.
    """
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5,
              "eta": 0.05, "subsample": 0.8, "colsample_bytree": 0.8,
              "tree_method": "hist", "verbosity": 0}
    train_matrix = xgb.DMatrix(Xtr, label=ytr)
    test_matrix = xgb.DMatrix(Xte)
    model = xgb.train(params, train_matrix, num_boost_round=est)
    proba = model.predict(test_matrix)

    accuracy = 100.0 * (proba.argmax(1) == yte).mean()
    metrics = {"logloss": round(logloss(yte, proba), 4),
               "acc": round(accuracy, 1)}
    metrics.update({f"val@{mg}": value_sim(proba, yte, odds_te, mg) for mg in margins})
    return metrics
def main() -> int:
    """Run the walk-forward ALL-vs-BLIND experiment and print the results.

    Loads ``CSV``, orders rows by ``mst_utc``, derives the 1X2 outcome from
    the final scores, and builds two feature sets: ALL (every non-meta,
    non-label, non-leaky column) and BLIND (ALL minus the odds columns).
    The last ``--test-frac`` of the rows is cut into ``--folds`` consecutive
    test slices; each fold trains on every row before its slice.

    Returns:
        0 once the report has been printed. Invalid command-line values end
        the process through ``ArgumentParser.error``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=250)
    ap.add_argument("--test-frac", type=float, default=0.5,
                    help="Fraction at the end used as rolling OOS (default 0.5)")
    args = ap.parse_args()

    # Reject values that would silently corrupt the split: a test fraction
    # outside (0, 1) gives a negative or out-of-range start index, which
    # would make slice(0, te0) train on rows from the evaluation period.
    if args.folds < 1:
        ap.error("--folds must be >= 1")
    if not 0.0 < args.test_frac < 1.0:
        ap.error("--test-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False)
    df = df.sort_values("mst_utc").reset_index(drop=True)
    print(f" {len(df)} rows, {df.shape[1]} cols")

    # Derive true MS outcome from scores: 0=home,1=draw,2=away (robust, no label trust)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    # Rows without both scores have no usable outcome; drop them.
    valid = sh.notna() & sa.notna()
    df, y = df[valid].reset_index(drop=True), y[valid.values]

    # Missing or unparseable odds become 0.0.
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(
        pd.to_numeric, errors="coerce").fillna(0.0).values

    feat_all = [c for c in df.columns
                if c not in META and not c.startswith("label_")
                and c not in LEAKY]
    feat_blind = [c for c in feat_all if not is_odds_col(c)]
    print(f" excluded leaky cols: {sorted(LEAKY)}")
    Xall = df[feat_all].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    Xblind = df[feat_blind].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    print(f" features: ALL={len(feat_all)} BLIND={len(feat_blind)} "
          f"(dropped {len(feat_all)-len(feat_blind)} odds cols)")
    print(f" base rates: home={100*(y==0).mean():.1f}% draw={100*(y==1).mean():.1f}% "
          f"away={100*(y==2).mean():.1f}%")

    n = len(df)
    start = int(n * (1 - args.test_frac))
    # Fold boundaries over the out-of-sample tail [start, n).
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    margins = [0.0, 0.05, 0.10]
    agg = {name: {"logloss": [], **{f"val@{m}": [] for m in margins}}
           for name in ("ALL", "BLIND")}

    print(f"\nWalk-forward: {args.folds} folds, train=expanding, est={args.estimators}\n")
    hdr = f"{'fold':<5}{'model':<7}{'logloss':>9}{'acc%':>7}" + "".join(
        f"{('val@'+str(m)):>22}" for m in margins)
    print(hdr)
    print("-" * len(hdr))

    for i in range(args.folds):
        te0, te1 = bounds[i], bounds[i + 1]
        # Skip folds that are too small to evaluate, or that would have no
        # training rows at all (possible on tiny datasets).
        if te1 - te0 < 50 or te0 < 1:
            continue
        tr = slice(0, te0)      # expanding window: everything before the fold
        te = slice(te0, te1)
        for name, X in (("ALL", Xall), ("BLIND", Xblind)):
            r = train_eval(X.iloc[tr].values, y[tr], X.iloc[te].values, y[te],
                           odds[te], args.estimators, margins)
            agg[name]["logloss"].append(r["logloss"])
            cells = ""
            for m in margins:
                v = r[f"val@{m}"]
                agg[name][f"val@{m}"].append(v)
                cells += f"{('n=' + str(v['n']) + ' roi=' + str(v['roi'])):>22}"
            print(f"{i:<5}{name:<7}{r['logloss']:>9}{r['acc']:>7}{cells}")
        print()

    print("=" * 70)
    print("AGGREGATE (sum bets, weighted ROI across folds)")
    print("=" * 70)
    for name in ("ALL", "BLIND"):
        ll = np.mean(agg[name]["logloss"]) if agg[name]["logloss"] else float("nan")
        print(f"\n{name} mean logloss={ll:.4f}")
        for m in margins:
            vs = agg[name][f"val@{m}"]
            tot_n = sum(v["n"] for v in vs)
            # Per-fold PnL is rebuilt from each fold's (already rounded) ROI,
            # so the aggregate carries that rounding error.
            tot_pnl = sum((v["roi"] / 100.0 * v["n"]) for v in vs if v["roi"] is not None)
            roi = round(100.0 * tot_pnl / tot_n, 2) if tot_n else None
            print(f" margin {m}: total_bets={tot_n:>6} ROI(flat1u)={roi}%")

    print("\nREAD: BLIND ROI>0 across margins/folds = real edge. Both <=0 = no")
    print("exploitable edge in MS with this data (stop staking; the -EV is the vig).")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    exit_code = main()
    raise SystemExit(exit_code)