@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Analyze Match v2 — the per-match multi-market value board + disciplined pick.
|
||||
===========================================================================
|
||||
Answers "for ONE match, show every bet type's probability + model signal +
|
||||
market-vs-model value, and pick the right bet." Leak-free models.
|
||||
|
||||
KEY HONEST RULE (proven by multi_market_edge.py): compute & SHOW value for all
|
||||
markets, but only MS (1X2) carries real, fold-consistent model edge. In OU/HT/
|
||||
BTTS the market is efficient — a big model-vs-market gap there is the MODEL'S
|
||||
ERROR, not value. So non-MS rows are INFO-ONLY; only an MS value bet in the
|
||||
favourite band is STAKED.
|
||||
|
||||
Demo: trains all market models on the first 85% of history, then prints the full
|
||||
board for sample matches in the unseen last 15% (with what actually happened).
|
||||
|
||||
Usage:
|
||||
python scripts/analyze_match_v2.py --n 6
|
||||
python scripts/analyze_match_v2.py --match <match_id>
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try: sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception: pass
|
||||
|
||||
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
|
||||
"score_home","score_away","ht_score_home","ht_score_away"}
|
||||
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
|
||||
"squad_diff","home_squad_quality","away_squad_quality",
|
||||
"referee_home_bias","referee_avg_goals"}
|
||||
STAKE_LO, STAKE_HI = 1.5, 2.4 # MS favourite band that staking is allowed in
|
||||
STAKE_MARGIN = 0.03
|
||||
|
||||
def ou(line):
    """Build a full-time over/under labeller for the goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` and yields 0 when the
    full-time goal total ``sh + sa`` exceeds ``line`` (the "over" pick) and 1
    otherwise (the "under" pick). The half-time arguments are accepted only so
    every market labeller shares one signature; they are ignored here.
    """
    def _label(sh, sa, hh, ha):
        total_goals = sh + sa
        if total_goals > line:
            return 0
        return 1

    return _label
|
||||
def htou(line):
    """Build a half-time over/under labeller for the goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` and yields:

    * ``None`` when either half-time score is NaN (outcome unknown),
    * 0 when the half-time goal total ``hh + ha`` exceeds ``line`` (over),
    * 1 otherwise (under).

    Both half-time scores are checked: a NaN on only the away side would
    otherwise make ``hh + ha`` NaN, fail the ``>`` comparison and be recorded
    as a real "under" result.
    """
    def _label(sh, sa, hh, ha):
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1

    return _label
|
||||
# market code -> (model kind, odds columns, pick labels, truth function).
# The truth function maps (sh, sa, hh, ha) to the index of the winning pick in
# the label list, or None when the half-time score needed to settle it is NaN.
MARKETS = {
    "MS": ("multi", ["odds_ms_h","odds_ms_d","odds_ms_a"], ["1","X","2"],
           lambda sh,sa,hh,ha: 0 if sh>sa else (1 if sh==sa else 2)),
    "OU25": ("binary",["odds_ou25_o","odds_ou25_u"], ["2.5Üst","2.5Alt"], ou(2.5)),
    "OU15": ("binary",["odds_ou15_o","odds_ou15_u"], ["1.5Üst","1.5Alt"], ou(1.5)),
    "OU35": ("binary",["odds_ou35_o","odds_ou35_u"], ["3.5Üst","3.5Alt"], ou(3.5)),
    "BTTS": ("binary",["odds_btts_y","odds_btts_n"], ["KG Var","KG Yok"],
             lambda sh,sa,hh,ha: 0 if (sh>0 and sa>0) else 1),
    # Both half-time scores must be present: with only `ha` missing, every
    # comparison against NaN is False and the row would be labelled "2".
    "HT": ("multi", ["odds_ht_ms_h","odds_ht_ms_d","odds_ht_ms_a"], ["İY1","İYX","İY2"],
           lambda sh,sa,hh,ha: None if (np.isnan(hh) or np.isnan(ha))
           else (0 if hh>ha else (1 if hh==ha else 2))),
    "HT_OU15": ("binary",["odds_ht_ou15_o","odds_ht_ou15_u"], ["İY1.5Üst","İY1.5Alt"], htou(1.5)),
}
|
||||
STAKED_MARKETS = {"MS"} # only these are bet; rest are info-only
|
||||
PM = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
|
||||
PB = {"objective":"binary:logistic","max_depth":5,"eta":0.05,"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
|
||||
|
||||
|
||||
def main():
    """Train one model per market and print a value board for selected matches.

    Steps, all driven by the module-level CSV / MARKETS / STAKE_* settings:

    1. Load CSV sorted by ``mst_utc`` and keep rows with a numeric final score.
    2. Train one XGBoost model per MARKETS entry on the first 85% of rows
       (markets whose odds columns are absent, or that have no labelled
       training rows, are skipped).
    3. Pick matches: the given ``--match`` id, or ``--n`` rows spread evenly
       over the last 15% of rows.
    4. For each match print every pick's model probability, implied
       probability (1/odds), edge and odds, and name at most one stake: the
       largest-edge pick from STAKED_MARKETS whose edge exceeds STAKE_MARGIN
       and whose odds lie in [STAKE_LO, STAKE_HI).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--n", type=int, default=6, help="how many sample matches")
    ap.add_argument("--match", help="specific match_id")
    ap.add_argument("--estimators", type=int, default=250)
    args = ap.parse_args()

    def _fmt_elo(value):
        # A missing column or non-numeric value renders as "?" instead of
        # raising from the ".0f" format spec.
        try:
            return f"{float(value):.0f}"
        except (TypeError, ValueError):
            return "?"

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    cut = int(N * 0.85)
    print(f"Training {len(MARKETS)} leak-free market models on {cut:,} matches ...")

    models = {}
    for m, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            continue
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(cut)], dtype=object)
        valid = np.array([v is not None for v in truth], dtype=bool)
        if not valid.any():
            # Nothing to learn from (e.g. no half-time scores at all).
            continue
        labels = truth[valid].astype(int)
        if kind == "multi":
            b = xgb.train(PM, xgb.DMatrix(X[:cut][valid], label=labels),
                          num_boost_round=args.estimators)
        else:
            # Binary target is "pick index 0 happened", so predict() yields P(pick 0).
            b = xgb.train(PB, xgb.DMatrix(X[:cut][valid], label=(labels == 0).astype(int)),
                          num_boost_round=args.estimators)
        models[m] = (kind, ocols, picks, tfn, b)

    # choose matches: explicit id, or an even spread over the holdout
    if args.match:
        rows = df.index[df["match_id"].astype(str) == str(args.match)].tolist()
        if not rows:
            print(f"match_id {args.match} not found among matches with a final score.")
            return
        if any(gi < cut for gi in rows):
            print("WARNING: requested match lies in the training window; "
                  "its board is in-sample, not a holdout prediction.")
    else:
        n_hold = N - cut
        if n_hold <= 0 or args.n <= 0:
            print("No holdout matches to show.")
            return
        # np.unique drops repeats when --n exceeds the holdout size.
        pick_pos = np.unique(np.linspace(0, n_hold - 1, args.n, dtype=int))
        rows = [cut + int(p) for p in pick_pos]

    for gi in rows:
        r = df.iloc[gi]
        dmat = xgb.DMatrix(X[gi:gi + 1])
        sh_, sa_, hh_, ha_ = SH[gi], SA[gi], HH[gi], HA[gi]
        ht = "?" if (np.isnan(hh_) or np.isnan(ha_)) else f"{int(hh_)}-{int(ha_)}"
        home_elo = _fmt_elo(r.get("home_overall_elo"))
        away_elo = _fmt_elo(r.get("away_overall_elo"))
        print("\n"+"="*72)
        print(f"MATCH {r['match_id']} | elo H{home_elo} vs A{away_elo}"
              f" | ACTUAL {int(sh_)}-{int(sa_)} (HT {ht})")
        print(f" {'market':<8}{'pick':<10}{'model%':>8}{'impl%':>7}{'edge':>7}{'odds':>7} flag result")
        print(" "+"-"*64)
        best_ms = None
        for m, (kind, ocols, picks, tfn, b) in models.items():
            if kind == "multi":
                P = b.predict(dmat)[0]
            else:
                p = float(b.predict(dmat)[0])
                P = np.array([p, 1 - p])
            O = pd.to_numeric(r[ocols], errors="coerce").fillna(0.0).values
            truth = tfn(sh_, sa_, hh_, ha_)
            for k in range(len(picks)):
                o = O[k]
                if o <= 1.0:
                    # missing (filled with 0.0) or unusable odds
                    continue
                imp = 1.0 / o
                edge = P[k] - imp
                res = "—" if truth is None else ("WON" if truth == k else "lost")
                staked = (m in STAKED_MARKETS) and edge > STAKE_MARGIN and STAKE_LO <= o < STAKE_HI
                flag = "★BET" if staked else ("val" if edge > STAKE_MARGIN else "")
                print(f" {m:<8}{picks[k]:<10}{100*P[k]:>7.1f}{100*imp:>7.1f}{100*edge:>+7.1f}{o:>7.2f} {flag:<5} {res}")
                if staked and (best_ms is None or edge > best_ms[0]):
                    best_ms = (edge, m, picks[k], o, res)
        print(" "+"-"*64)
        if best_ms:
            e, m, p, o, res = best_ms
            print(f" >>> STAKE: {m} {p} @ {o:.2f} (edge +{100*e:.1f}%, favourite band) -> {res}")
        else:
            print(f" >>> NO STAKE: no MS value in favourite band. (Other markets info-only —")
            print(f" their 'value' is model error in efficient markets; do NOT chase it.)")
    print("\nNOTE: only MS staked (proven edge). All markets shown for transparency.")
    print("Forward-validate with CLV before real money. Static CSV odds may overstate edge.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,113 @@
|
||||
"""
|
||||
Betting Policy — the honest, leak-free strategy the data actually supports.
|
||||
==========================================================================
|
||||
Everything else in this repo bet UNDERDOGS (odds 6-7.5) and lost (-43.7% live).
|
||||
The data says the opposite: the only positive, fold-consistent, model-driven
|
||||
signal is MILD FAVOURITES the model rates above the market price.
|
||||
|
||||
POLICY (MS / 1X2 only):
|
||||
* leak-free model (drops the result-encoding features, see LEAKY)
|
||||
* bet the model's single biggest value edge (model_prob - implied) ...
|
||||
* ONLY if the picked side's odds are in [--lo, --hi] (favourite band)
|
||||
* ONLY if that edge > --margin
|
||||
* flat 1u stake, one bet per match, never a longshot, never a parlay.
|
||||
|
||||
Walk-forward, no leakage. Reports the policy ROI, fold consistency, drawdown,
|
||||
and the model-free baseline (blind favourite) so you can see the model's lift.
|
||||
|
||||
⚠️ HONEST CAVEAT: CSV odds are a static capture, not the verified obtainable
|
||||
closing line. A small backtest edge here is a LEAD, not a guarantee. Forward
|
||||
paper-trade with real CLV (capture_closing_odds.py) before risking money.
|
||||
|
||||
Usage: python scripts/betting_policy.py --lo 1.5 --hi 2.2 --margin 0.0 --folds 8
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try: sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception: pass
|
||||
|
||||
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
|
||||
"score_home","score_away","ht_score_home","ht_score_away"}
|
||||
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
|
||||
"squad_diff","home_squad_quality","away_squad_quality",
|
||||
"referee_home_bias","referee_avg_goals"}
|
||||
|
||||
|
||||
def main():
    """Walk-forward backtest of the favourite-band MS value policy.

    Loads CSV sorted by ``mst_utc``, keeps rows with a numeric final score and
    splits the second half of the data into ``--folds`` consecutive test
    windows. For each window a 3-class XGBoost model is trained on every row
    before it. The policy bets the pick with the largest edge
    (model probability - 1/odds) when that edge exceeds ``--margin`` and the
    pick's odds lie in [``--lo``, ``--hi``); stake is a flat 1 unit.

    Printed per fold and in aggregate: bet count, hit rate, ROI, and the ROI
    of a model-free baseline that backs the lowest-odds side in the same band.
    The aggregate also reports max drawdown and the number of positive folds.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--folds", type=int, default=8)
    ap.add_argument("--estimators", type=int, default=250)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    # 0 = home win, 1 = draw, 2 = away win (same order as the odds columns)
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values

    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
              "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}

    print(f"POLICY: favourite band [{args.lo},{args.hi}] margin {args.margin} "
          f"leak-free feats={len(feats)} folds={args.folds}\n")
    all_pnl = []
    fold_rows = []
    base_pnl = []
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < 50:
            continue
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
        P = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, Ote = y[te0:te1], O[te0:te1]
        idx = np.arange(len(yte))
        # Missing odds were filled with 0.0; np.where evaluates 1/Ote for them
        # too, so silence the divide warning — those cells become NaN anyway.
        with np.errstate(divide="ignore"):
            implied = np.where(Ote > 1.0, 1.0 / Ote, np.nan)
        # -9.0 guarantees a side without usable odds is never the argmax pick.
        edge = np.where(np.isnan(implied), -9.0, P - implied)
        pick = edge.argmax(1)
        pe = edge[idx, pick]
        po = Ote[idx, pick]
        bet = (pe > args.margin) & (po >= args.lo) & (po < args.hi)
        win = (pick == yte) & bet
        pnl = np.where(win, po - 1.0, -1.0)[bet]
        # model-free baseline: blind favourite in same band
        fav = Ote.argmin(1)
        fo = Ote[idx, fav]
        bmask = (fo >= args.lo) & (fo < args.hi) & (Ote > 1.0).all(1)
        bpnl = np.where(fav[bmask] == yte[bmask], fo[bmask] - 1.0, -1.0)
        roi = 100 * pnl.sum() / len(pnl) if len(pnl) else float('nan')
        broi = 100 * bpnl.sum() / len(bpnl) if len(bpnl) else float('nan')
        hit = 100 * win.sum() / max(bet.sum(), 1)
        fold_rows.append((fi, len(pnl), hit, roi, broi))
        all_pnl.extend(pnl.tolist())
        base_pnl.extend(bpnl.tolist())
        print(f" fold {fi}: policy_bets={len(pnl):>4} hit={hit:>5.1f}% "
              f"ROI={roi:>7.2f}% | baseline(blind fav) ROI={broi:>7.2f}%")

    a = np.array(all_pnl)
    b = np.array(base_pnl)
    print("\n"+"="*70)
    print("AGGREGATE")
    print("="*70)
    if len(a):
        cum = np.cumsum(a)
        # Measure drawdown from the running peak INCLUDING the starting
        # bankroll of 0, so a losing streak from the very first bet counts.
        peak = np.maximum.accumulate(np.maximum(cum, 0.0))
        dd = (cum - peak).min()
        folds_pos = sum(1 for r in fold_rows if r[3] > 0)
        print(f" POLICY: bets={len(a):>5} hit={100*(a>0).mean():.1f}% "
              f"ROI={100*a.mean():+.2f}% net={a.sum():+.1f}u maxDD={dd:.1f}u "
              f"folds+={folds_pos}/{len(fold_rows)}")
    if len(b):
        print(f" BASELINE: bets={len(b):>5} hit={100*(b>0).mean():.1f}% "
              f"ROI={100*b.mean():+.2f}% (blind favourite, same band)")
    # Lift needs both sides; an empty baseline has no mean to compare against.
    if len(a) and len(b):
        print(f"\n MODEL LIFT over blind favourite: "
              f"{100*a.mean()-100*b.mean():+.1f} percentage points")
    print("\nREAD: a believable system has ROI>0, folds+ near full, tolerable maxDD,")
    print("and clearly beats the blind-favourite baseline. Even then it's a LEAD —")
    print("forward paper-trade with real CLV before staking real money.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
Capture Closing Odds — snapshot #2 of the minimal 2-snapshot CLV system.
|
||||
=======================================================================
|
||||
WHY: CLV (closing line value) is the only reliable proof of betting edge.
|
||||
This codebase never captured it: odds are stored as a single static snapshot
|
||||
and `odds_history` is empty. But the live sync (DataFetcherTask CRON 1) DOES
|
||||
refresh `live_matches.odds` every 15 min before kickoff, and prediction_runs
|
||||
already store the bet-time odds blob (odds_snapshot.odds, source=live_match).
|
||||
|
||||
This script supplies the missing half: just before kickoff it copies the
|
||||
*current* live odds blob onto the match's latest prediction_run as
|
||||
`odds_snapshot.closing_odds`. Later, CLV per bet = bet-time pick odds vs
|
||||
closing pick odds (computed in live_scoreboard.py once enough data exists).
|
||||
|
||||
Run it every ~15 min (e.g. alongside the existing sync, or its own cron):
|
||||
python scripts/capture_closing_odds.py # default 25-min window
|
||||
python scripts/capture_closing_odds.py --window-min 20 --dry-run
|
||||
|
||||
Structure-agnostic: stores the whole live odds blob; no pick parsing here.
|
||||
Idempotent: skips runs that already have closing_odds. Only ADDS a JSON key,
|
||||
never deletes. Safe to run repeatedly.
|
||||
|
||||
⚠️ Needs one supervised test run against a live DB with upcoming matches
|
||||
before scheduling (DB was down at authoring time).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
|
||||
sys.path.insert(0, AI_ENGINE_DIR)
|
||||
|
||||
from data.db import get_clean_dsn # noqa: E402
|
||||
import psycopg2 # noqa: E402
|
||||
from psycopg2.extras import RealDictCursor # noqa: E402
|
||||
|
||||
|
||||
def main() -> int:
    """Attach the current live odds to the latest prediction run of imminent matches.

    Selects ``live_matches`` rows with non-null odds whose ``mst_utc`` falls
    between now minus ``--grace-min`` and now plus ``--window-min`` (the code
    compares ``mst_utc`` against epoch-millisecond bounds). For each match the
    newest ``prediction_runs`` row is looked up; if its ``odds_snapshot`` has
    no ``closing_odds`` yet, the live odds blob plus capture metadata is merged
    into that JSON. With ``--dry-run`` nothing is written.

    Returns:
        0 always; counts of captured / skipped / run-less matches are printed.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--window-min", type=int, default=25,
                    help="Capture matches kicking off within the next N minutes (default 25)")
    ap.add_argument("--grace-min", type=int, default=10,
                    help="Also include matches that kicked off up to N min ago (default 10)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Report what would be captured without writing")
    args = ap.parse_args()

    now_ms = int(time.time() * 1000)
    lo_ms = now_ms - args.grace_min * 60 * 1000
    hi_ms = now_ms + args.window_min * 60 * 1000

    captured = skipped = no_run = 0
    # psycopg2's `with connection` only scopes the transaction; it does not
    # close the connection, so close it explicitly.
    conn = psycopg2.connect(get_clean_dsn())
    try:
        with conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Upcoming/just-started live matches that still hold pre-kickoff odds.
                cur.execute(
                    """
                    SELECT id, mst_utc, odds
                    FROM live_matches
                    WHERE odds IS NOT NULL
                    AND mst_utc BETWEEN %s AND %s
                    ORDER BY mst_utc ASC
                    """,
                    (lo_ms, hi_ms),
                )
                matches = cur.fetchall()
                print(f"[capture_closing_odds] window={args.window_min}m grace={args.grace_min}m "
                      f"upcoming_with_odds={len(matches)} dry_run={args.dry_run}")

                for m in matches:
                    mid = m["id"]
                    cur.execute(
                        """
                        SELECT id, odds_snapshot
                        FROM prediction_runs
                        WHERE match_id = %s
                        ORDER BY generated_at DESC
                        LIMIT 1
                        """,
                        (mid,),
                    )
                    run = cur.fetchone()
                    if not run:
                        no_run += 1
                        continue
                    snap = run["odds_snapshot"] or {}
                    if isinstance(snap, str):
                        try:
                            snap = json.loads(snap)
                        except Exception:
                            snap = {}
                    if not isinstance(snap, dict):
                        # e.g. a JSON array/scalar: treat as "no closing odds yet"
                        snap = {}
                    # NOTE(review): the first capture inside the window wins and
                    # is never refreshed, so the stored "closing" odds can be up
                    # to --window-min minutes before kickoff — confirm intended.
                    if snap.get("closing_odds") is not None:
                        skipped += 1
                        continue

                    patch = {
                        "closing_odds": m["odds"],
                        "closing_captured_at": datetime.now(timezone.utc).isoformat(),
                        "closing_mst_utc": m["mst_utc"],
                        "closing_source": "live_match",
                    }
                    if args.dry_run:
                        captured += 1
                        print(f" would capture match={mid} run_id={run['id']} mst_utc={m['mst_utc']}")
                        continue
                    # `||` merges the patch keys into the existing JSON object,
                    # leaving every other key untouched.
                    cur.execute(
                        """
                        UPDATE prediction_runs
                        SET odds_snapshot = COALESCE(odds_snapshot, '{}'::jsonb) || %s::jsonb
                        WHERE id = %s
                        """,
                        (json.dumps(patch, default=str), run["id"]),
                    )
                    captured += 1
            if not args.dry_run:
                conn.commit()
    finally:
        conn.close()

    print(f"[capture_closing_odds] captured={captured} already_had={skipped} "
          f"no_prediction_run={no_run}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
CLV Report — the single most important edge metric.
|
||||
===================================================
|
||||
Closing Line Value = did we bet at better odds than the market's closing line?
|
||||
Consistently positive CLV is the only reliable proof of a real betting edge;
|
||||
negative CLV means no edge, regardless of short-term wins/losses.
|
||||
|
||||
This codebase stores the BET-TIME odds for ~92% of runs (prediction_runs.
|
||||
odds_snapshot.source = 'live_match' with the live odds blob, and the pick's
|
||||
odds in payload main_pick.odds). For the closing line we use, in order:
|
||||
1. odds_snapshot.closing_odds (captured by capture_closing_odds.py, forward)
|
||||
2. odd_selections current value (the static near-final capture — a proxy)
|
||||
|
||||
CLV per bet = bet_odds / closing_odds - 1 (positive = beat the close = good).
|
||||
|
||||
Read-only. SELECT only.
|
||||
Usage:
|
||||
python scripts/clv_report.py
|
||||
python scripts/clv_report.py --staked-only
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
|
||||
sys.path.insert(0, AI_ENGINE_DIR)
|
||||
|
||||
from data.db import get_clean_dsn # noqa: E402
|
||||
import psycopg2 # noqa: E402
|
||||
from psycopg2.extras import RealDictCursor # noqa: E402
|
||||
|
||||
# over/under market code -> Turkish odds-category name (used by map_pick)
|
||||
OU_CATS = {"OU05": "0,5 Alt/Üst", "OU15": "1,5 Alt/Üst", "OU25": "2,5 Alt/Üst",
|
||||
"OU35": "3,5 Alt/Üst", "OU45": "4,5 Alt/Üst"}
|
||||
|
||||
|
||||
def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
|
||||
try:
|
||||
return float(x) if x is not None else d
|
||||
except (TypeError, ValueError):
|
||||
return d
|
||||
|
||||
|
||||
def _parse(j: Any) -> Dict[str, Any]:
|
||||
if isinstance(j, str):
|
||||
try:
|
||||
return json.loads(j)
|
||||
except Exception:
|
||||
return {}
|
||||
return j or {}
|
||||
|
||||
|
||||
def map_pick(market: str, pick: str) -> Optional[Tuple[str, str]]:
    """Return (category_name, selection_key) for the live-odds JSON / odd_selections.

    Args:
        market: Market code such as "MS", "HT", "DC", "BTTS", "OE" or one of
            the OU_CATS keys; matched case-insensitively.
        pick: The pick text in Turkish or English (e.g. "1", "X", "2.5 Üst",
            "KG Var", "1X", "Tek").

    Returns:
        The (odds category name, selection name) pair, or None when the market
        is not supported or the pick cannot be recognised.
    """
    m = str(market or "").upper().strip()
    p = str(pick or "").strip()
    # casefold() turns "İ" into "i" + U+0307 (combining dot above); dropping
    # the combining mark lets "ÇİFT" match "çift".
    pl = p.casefold().replace("\u0307", "")

    if m in ("MS", "ML", "1X2"):
        side = p.upper()
        return ("Maç Sonucu", side) if side in ("1", "X", "2") else None
    if m == "HT":
        side = p.upper()
        return ("1. Yarı Sonucu", side) if side in ("1", "X", "2") else None
    if m == "DC":
        key = p.upper().replace(" ", "").replace("/", "-")
        norm = {"1X": "1-X", "X1": "1-X", "X2": "X-2", "2X": "X-2",
                "12": "1-2", "21": "1-2", "1-X": "1-X", "X-2": "X-2", "1-2": "1-2"}.get(key)
        return ("Çifte Şans", norm) if norm else None
    if m == "BTTS":
        if "var" in pl or "yes" in pl:
            return ("Karşılıklı Gol", "Var")
        if "yok" in pl or "no" in pl:
            return ("Karşılıklı Gol", "Yok")
        return None
    if m == "OE":
        if "tek" in pl or "odd" in pl:
            return ("Tek/Çift", "Tek")
        if "çift" in pl or "cift" in pl or "even" in pl:
            return ("Tek/Çift", "Çift")
        return None
    # Over/under lines: the market codes are mutually exclusive with the ones
    # above, so checking them last does not change which branch is taken.
    if m in OU_CATS:
        if "üst" in pl or "ust" in pl or "over" in pl:
            return (OU_CATS[m], "Üst")
        if "alt" in pl or "under" in pl:
            return (OU_CATS[m], "Alt")
        return None
    return None
|
||||
|
||||
|
||||
def closing_from_blob(blob: Any, cat: str, sel: str) -> Optional[float]:
    """Look up one selection's odds inside a captured odds blob.

    Args:
        blob: The odds blob, as a dict or as JSON text (decoded via ``_parse``).
        cat: Odds category name used as the first-level key.
        sel: Selection name used as the key inside that category.

    Returns:
        The selection's value converted by ``_f``, or None when the blob or
        the category entry is not a dict.
    """
    parsed = _parse(blob)
    if not isinstance(parsed, dict):
        return None
    selections = parsed.get(cat)
    if not isinstance(selections, dict):
        return None
    return _f(selections.get(sel))
|
||||
|
||||
|
||||
def main() -> int:
    """Print a closing-line-value report over stored prediction runs.

    Reads every ``prediction_runs`` row whose ``odds_snapshot.source`` is
    ``live_match``, takes the bet-time odds from ``payload_summary.main_pick``
    and resolves the closing odds from ``odds_snapshot.closing_odds`` or, when
    that is absent, from the ``odd_categories`` / ``odd_selections`` tables.
    CLV per run is ``bet_odds / closing - 1``. Runs without a mappable market,
    or without usable (finite, > 1.0) odds on both sides, are skipped.

    Prints overall, staked-only and per-market CLV aggregates, plus realized
    profit split by CLV sign for settled staked runs.

    Returns:
        0 always.
    """
    import math  # local import: this module's top-level imports do not include math

    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--staked-only", action="store_true",
                    help="Only playable/staked bets (default: all picks with a mappable market)")
    args = ap.parse_args()

    def _usable(odds: Optional[float]) -> bool:
        # NaN fails every comparison, so "<= 1.0" alone would let it through.
        return odds is not None and math.isfinite(odds) and odds > 1.0

    rows_out = []
    # psycopg2's `with connection` only scopes the transaction; it does not
    # close the connection, so close it explicitly.
    conn = psycopg2.connect(get_clean_dsn())
    try:
        with conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT match_id, engine_version, odds_snapshot, payload_summary,
                    eventual_outcome, unit_profit
                    FROM prediction_runs
                    WHERE odds_snapshot->>'source' = 'live_match'
                    ORDER BY generated_at ASC
                """)
                runs = cur.fetchall()

                for r in runs:
                    snap = _parse(r["odds_snapshot"])
                    if not isinstance(snap, dict):
                        snap = {}
                    ps = _parse(r["payload_summary"])
                    mp = ps.get("main_pick") if isinstance(ps, dict) else None
                    if not isinstance(mp, dict):
                        mp = {}
                    market = mp.get("market")
                    pick = mp.get("pick")
                    bet_odds = _f(mp.get("odds"))
                    playable = bool(mp.get("playable"))
                    if args.staked_only and not playable:
                        continue
                    if not market or not pick or not _usable(bet_odds):
                        continue
                    # str() guards against numeric picks/markets stored in the payload
                    market = str(market)
                    mapped = map_pick(market, str(pick))
                    if not mapped or not mapped[1]:
                        continue
                    cat, sel = mapped

                    # closing line: prefer captured closing_odds, else static odd_selections
                    closing = closing_from_blob(snap.get("closing_odds"), cat, sel)
                    src = "captured"
                    if closing is None:
                        cur.execute("""
                            SELECT os.odd_value FROM odd_categories oc
                            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                            WHERE oc.match_id = %s AND oc.name = %s AND os.name = %s
                            LIMIT 1
                        """, (r["match_id"], cat, sel))
                        row = cur.fetchone()
                        closing = _f(row["odd_value"]) if row else None
                        src = "static_proxy"
                    if not _usable(closing):
                        continue

                    clv = bet_odds / closing - 1.0
                    rows_out.append({
                        "market": market, "playable": playable,
                        "bet_odds": bet_odds, "closing": closing, "clv": clv,
                        "src": src, "profit": _f(r["unit_profit"], 0.0) or 0.0,
                        "settled": r["eventual_outcome"] is not None
                        and not str(r["eventual_outcome"]).startswith("NO_BET"),
                    })
    finally:
        conn.close()

    if not rows_out:
        print("No mappable runs with both bet-time and closing odds found.")
        return 0

    def agg(rs):
        # Summary stats for a non-empty list of row dicts.
        n = len(rs)
        clvs = [x["clv"] for x in rs]
        pos = sum(1 for c in clvs if c > 0)
        return {
            "n": n,
            "mean_clv_pct": round(100.0 * sum(clvs) / n, 2),
            "pct_positive": round(100.0 * pos / n, 1),
            "captured": sum(1 for x in rs if x["src"] == "captured"),
        }

    print("=" * 70)
    print("CLV REPORT — did we beat the closing line? (the edge compass)")
    print("=" * 70)
    o = agg(rows_out)
    print(f"runs analyzed: {o['n']} (closing source: {o['captured']} captured, "
          f"{o['n'] - o['captured']} static-proxy)")
    print(f"\nOVERALL mean CLV: {o['mean_clv_pct']}% "
          f"bets beating close: {o['pct_positive']}%")
    print(" (positive mean CLV = real edge; ~0 or negative = no edge)\n")

    staked = [x for x in rows_out if x["playable"]]
    if staked:
        s = agg(staked)
        print(f"STAKED only: n={s['n']} mean CLV={s['mean_clv_pct']}% "
              f"beating close={s['pct_positive']}%\n")

    print("BY MARKET")
    by_m = defaultdict(list)
    for x in rows_out:
        by_m[x["market"]].append(x)
    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
        a = agg(rs)
        print(f" {m:<8} n={a['n']:>4} mean CLV={a['mean_clv_pct']:>7}% "
              f"beating close={a['pct_positive']:>5}%")

    # CLV vs outcome sanity: do positive-CLV bets actually win more / lose less?
    print("\nCLV vs realized P/L (settled staked)")
    ss = [x for x in rows_out if x["playable"] and x["settled"]]
    if ss:
        posc = [x for x in ss if x["clv"] > 0]
        negc = [x for x in ss if x["clv"] <= 0]
        for label, grp in (("CLV>0", posc), ("CLV<=0", negc)):
            if grp:
                pr = sum(x["profit"] for x in grp)
                print(f" {label:<7} n={len(grp):>3} profit={pr:>7.2f}u "
                      f"ROI(flat1u)={round(100*pr/len(grp),1)}%")
    print("=" * 70)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,181 @@
|
||||
"""
|
||||
Edge Search — is there a profitable POCKET (by league) the global model misses?
|
||||
==============================================================================
|
||||
Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
|
||||
tier leagues may be mispriced. This walks a leak-free model forward and slices
|
||||
the value-bet ROI BY LEAGUE, requiring a real sample AND multi-fold consistency
|
||||
so we don't chase one lucky window.
|
||||
|
||||
Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
|
||||
in features (realistic). Value bet = biggest model_prob - implied edge > margin.
|
||||
|
||||
⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
|
||||
capture, not the verified closing line. Anything flagged must be forward-
|
||||
validated with real CLV (capture_closing_odds.py) before staking.
|
||||
|
||||
Usage: python scripts/edge_search.py --folds 6 --min-bets 150
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys, time
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try: sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception: pass
|
||||
|
||||
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, AI_DIR)
|
||||
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
|
||||
META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
|
||||
"score_home","score_away","ht_score_home","ht_score_away"}
|
||||
LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
|
||||
"squad_diff","home_squad_quality","away_squad_quality",
|
||||
"referee_home_bias","referee_avg_goals"}
|
||||
|
||||
|
||||
def league_names(ids):
    """Resilient id->name lookup.

    Args:
        ids: Iterable of league ids; None entries are dropped and the rest are
            compared as strings.

    Returns:
        Dict mapping ``str(id)`` to the league name for every id found in the
        ``leagues`` table. Returns ``{}`` when no ids are given, or when all
        three connection attempts fail (the last error is then reported on
        stderr instead of being discarded).
    """
    out = {}
    # De-duplicate while keeping first-seen order.
    ids = list(dict.fromkeys(str(i) for i in ids if i is not None))
    if not ids:
        return out

    # Imported only when a lookup is actually needed, so callers with nothing
    # to resolve do not require the database driver.
    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor

    attempts = 3
    last_err = None
    for attempt in range(attempts):
        try:
            conn = psycopg2.connect(get_clean_dsn())
            try:
                with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (ids,))
                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
            finally:
                # `with conn` ends the transaction but leaves the connection open.
                conn.close()
        except Exception as exc:
            last_err = exc
            if attempt < attempts - 1:
                # back off before retrying; no point sleeping after the last try
                time.sleep(1.0)
    print(f"[league_names] lookup failed after {attempts} attempts: {last_err}",
          file=sys.stderr)
    return out
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description=__doc__)
|
||||
ap.add_argument("--folds", type=int, default=6)
|
||||
ap.add_argument("--estimators", type=int, default=200)
|
||||
ap.add_argument("--margin", type=float, default=0.0)
|
||||
ap.add_argument("--min-bets", type=int, default=150)
|
||||
args = ap.parse_args()
|
||||
|
||||
print(f"Loading {CSV} ...")
|
||||
df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
|
||||
sh = pd.to_numeric(df["score_home"], errors="coerce")
|
||||
sa = pd.to_numeric(df["score_away"], errors="coerce")
|
||||
ok = sh.notna() & sa.notna()
|
||||
df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
|
||||
y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
|
||||
league = df["league_id"].astype(str).values
|
||||
odds = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
|
||||
|
||||
feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
|
||||
X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
|
||||
rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan]*len(df))),
|
||||
errors="coerce").fillna(-1.0).values
|
||||
print(f" {len(df):,} rows features={len(feats)} (leak-free) folds={args.folds}")
|
||||
|
||||
n = len(df); start = int(n * 0.5)
|
||||
bounds = np.linspace(start, n, args.folds + 1, dtype=int)
|
||||
params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
|
||||
"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
|
||||
|
||||
# reliability quartile edges from the betting universe (rel>=0)
|
||||
rv = rel[rel >= 0]
|
||||
qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]
|
||||
def rel_band(x):
|
||||
if x < 0: return "rel:unknown"
|
||||
if x < qs[0]: return f"rel:Q1(<{qs[0]:.2f})"
|
||||
if x < qs[1]: return f"rel:Q2"
|
||||
if x < qs[2]: return f"rel:Q3"
|
||||
return f"rel:Q4(>={qs[2]:.2f})"
|
||||
def odds_band(o):
    """Return the label of the decimal-odds bucket that ``o`` falls into."""
    # (exclusive upper edge, label) pairs, checked in ascending order.
    edges = ((1.5, "<1.5"), (2, "1.5-2"), (3, "2-3"), (5, "3-5"), (8, "5-8"))
    for upper, label in edges:
        if o < upper:
            return label
    return "8+"
|
||||
|
||||
recs = [] # (group_key, fold, pnl, win)
|
||||
glob = {"n":0,"pnl":0.0,"win":0}
|
||||
for fi in range(args.folds):
|
||||
te0, te1 = bounds[fi], bounds[fi+1]
|
||||
if te1-te0 < 50: continue
|
||||
bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
|
||||
proba = bst.predict(xgb.DMatrix(X[te0:te1]))
|
||||
yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
|
||||
implied = np.where(ote > 1.0, 1.0/ote, np.nan)
|
||||
edge = np.where(np.isnan(implied), -9.0, proba - implied)
|
||||
pick = edge.argmax(1)
|
||||
bet = edge[np.arange(len(yte)), pick] > args.margin
|
||||
win = (pick == yte) & bet
|
||||
pick_odds = ote[np.arange(len(yte)), pick]
|
||||
pnl = np.where(win, pick_odds-1.0, -1.0)
|
||||
for i in range(len(yte)):
|
||||
if not bet[i]: continue
|
||||
glob["n"]+=1; glob["pnl"]+=pnl[i]; glob["win"]+=int(win[i])
|
||||
recs.append((rel_band(rte[i]), fi, pnl[i], int(win[i])))
|
||||
recs.append((odds_band(pick_odds[i]), fi, pnl[i], int(win[i])))
|
||||
recs.append((rel_band(rte[i])+" x "+odds_band(pick_odds[i]), fi, pnl[i], int(win[i])))
|
||||
print(f" fold {fi}: tested {len(yte):,} bets {int(bet.sum()):,}")
|
||||
|
||||
print("\n"+"="*78)
|
||||
print(f"GLOBAL leak-free: bets={glob['n']:,} hit={100*glob['win']/max(glob['n'],1):.1f}% "
|
||||
f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
|
||||
print("="*78)
|
||||
|
||||
rdf = pd.DataFrame(recs, columns=["grp","fold","pnl","win"])
|
||||
def report(prefix, title):
    """Print a one-dimensional ROI table for buckets whose key starts with ``prefix``.

    Reads the per-bet records in ``rdf`` and the ``args.min_bets`` threshold
    from the enclosing scope. Combined "reliability x odds" buckets (keys
    containing " x ") are excluded: they share the "rel:" prefix with the
    plain reliability buckets and would otherwise leak into that table.
    Buckets with fewer than ``args.min_bets`` bets are omitted and rows are
    printed best ROI first. Prints nothing when no bucket matches.

    Args:
        prefix: str (or tuple of str) passed to ``Series.str.startswith``.
        title: heading printed above the table.
    """
    keys = rdf["grp"]
    one_dim = keys.str.startswith(prefix) & ~keys.str.contains(" x ", regex=False)
    matched = rdf[one_dim]
    if matched.empty:
        return
    print(f"\n{title}")
    print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print(" "+"-"*54)
    rows = []
    for bucket, frame in matched.groupby("grp"):
        count = len(frame)
        if count < args.min_bets:
            continue
        per_fold = frame.groupby("fold")["pnl"].sum()
        rows.append((
            100*frame["pnl"].sum()/count,   # ROI %, flat 1u per bet
            bucket,
            count,
            100*frame["win"].sum()/count,   # hit %
            int((per_fold > 0).sum()),       # folds with positive P/L
            per_fold.shape[0],               # folds in which the bucket had bets
        ))
    for roi, bucket, count, hit, folds_up, folds_seen in sorted(rows, reverse=True):
        print(f" {bucket:<26}{count:>6}{hit:>7.1f}{roi:>8.1f}{str(folds_up)+'/'+str(folds_seen):>8}")
|
||||
report("rel:", "BY LEAGUE-RELIABILITY BAND (Q1=most obscure ... Q4=most reliable)")
# Odds-band buckets are the keys that neither start with "rel:" nor contain
# " x ". The table is printed right here, so there is deliberately no extra
# report() call with a tuple prefix and a None title: that call repeated this
# same table under the heading "None".
sub = rdf[~rdf["grp"].str.startswith("rel:")]
sub = sub[~sub["grp"].str.contains(" x ")]
if not sub.empty:
    print("\nBY ODDS BAND")
    print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print(" "+"-"*54)
    out = []
    for k, d in sub.groupby("grp"):
        nb = len(d)
        if nb < args.min_bets:
            continue
        roi = 100*d["pnl"].sum()/nb
        hit = 100*d["win"].sum()/nb
        fp = d.groupby("fold")["pnl"].sum()   # P/L per walk-forward fold
        out.append((roi, k, nb, hit, int((fp > 0).sum()), fp.shape[0]))
    for roi, k, nb, hit, fpv, ft in sorted(out, reverse=True):
        print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}")
|
||||
# 2D reliability x odds
|
||||
sub2 = rdf[rdf["grp"].str.contains(" x ")]
|
||||
if not sub2.empty:
|
||||
print("\nBY RELIABILITY x ODDS (candidate pockets, n>=min-bets)")
|
||||
print(f" {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
|
||||
print(" "+"-"*54)
|
||||
out=[]
|
||||
for k,d in sub2.groupby("grp"):
|
||||
nb=len(d)
|
||||
if nb<args.min_bets: continue
|
||||
roi=100*d["pnl"].sum()/nb; hit=100*d["win"].sum()/nb
|
||||
fp=d.groupby("fold")["pnl"].sum(); out.append((roi,k,nb,hit,int((fp>0).sum()),fp.shape[0]))
|
||||
for roi,k,nb,hit,fpv,ft in sorted(out,reverse=True)[:15]:
|
||||
print(f" {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}")
|
||||
print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
|
||||
print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
|
||||
print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Generate Daily Picks — the serving picker for the validated favourite policy.
|
||||
============================================================================
|
||||
Loads the saved leak-free MS model (models/favorite_v1) and applies the
|
||||
favourite-band value policy to a set of matches, emitting the day's STAKED
|
||||
picks and logging them for forward paper-trade settlement.
|
||||
|
||||
Train/serve consistency: features MUST come from the SAME extractor that built
|
||||
training_data_v27.csv. Production path = run the extractor nightly INCLUDING
|
||||
upcoming (status NS) matches, then point this script at that CSV. Demo path =
|
||||
use the tail of the training CSV as stand-in "today" matches (with the real
|
||||
result shown, since those are settled).
|
||||
|
||||
Policy: bet the MS side with the biggest model_prob - implied edge, ONLY if
|
||||
odds in [--lo,--hi] and edge>--margin. Flat 1u. No longshots, no parlays.
|
||||
Non-MS markets are NOT staked (efficient -> model error). One bet per match.
|
||||
|
||||
Usage:
|
||||
python scripts/generate_daily_picks.py --demo --n 20 # see it work now
|
||||
python scripts/generate_daily_picks.py --features today.csv # production
|
||||
python scripts/generate_daily_picks.py --settle # settle paper log
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, json, os, sys, datetime
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try: sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception: pass
|
||||
|
||||
# Project root = the directory two levels above this file.
_THIS_FILE = os.path.abspath(__file__)
AI_DIR = os.path.dirname(os.path.dirname(_THIS_FILE))

# Saved model artifacts, the historical feature CSV, and the paper-trade log.
MODEL_DIR = os.path.join(AI_DIR, "models", "favorite_v1")
TRAIN_CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
PAPER_LOG = os.path.join(AI_DIR, "data", "paper_trades.csv")

# 1X2 odds columns and the pick label for each, in the same order.
MS_ODDS = [
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
]
MS_PICKS = [
    "1",
    "X",
    "2",
]
|
||||
|
||||
|
||||
def load_model():
    """Load the saved booster and its companion JSON files from MODEL_DIR.

    Returns:
        A ``(booster, feats, meta)`` tuple: the XGBoost booster read from
        model.json, and the parsed contents of feature_cols.json and
        metadata.json.
    """
    def _read_json(name):
        with open(os.path.join(MODEL_DIR, name), encoding="utf-8") as fh:
            return json.load(fh)

    booster = xgb.Booster()
    booster.load_model(os.path.join(MODEL_DIR, "model.json"))
    feature_cols = _read_json("feature_cols.json")
    metadata = _read_json("metadata.json")
    return booster, feature_cols, metadata
|
||||
|
||||
|
||||
def pick_for_rows(df, bst, feats, lo, hi, margin):
    """Score every row with the MS model and apply the favourite-band policy.

    For each match the side with the largest ``model_prob - 1/odds`` edge is
    chosen; it is flagged as staked only when that edge exceeds ``margin``
    and its odds lie in ``[lo, hi)``.

    Args:
        df: matches to score; must contain the MS odds columns. Feature
            columns missing from ``df`` are filled with 0.0, as are
            unparseable values.
        bst: booster whose ``predict`` yields an [n, 3] home/draw/away matrix.
        feats: ordered feature column names the model expects.
        lo, hi: inclusive lower / exclusive upper odds bound for staking.
        margin: minimum edge required to stake.

    Returns:
        One dict per row with keys idx, pick, odds, model_prob, edge, staked.
    """
    if len(df) == 0:
        return []
    X = df.reindex(columns=feats).apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    P = bst.predict(xgb.DMatrix(X))  # [n,3] home/draw/away
    O = df[MS_ODDS].apply(pd.to_numeric, errors="coerce").fillna(0.0).values.astype(float)
    # Missing odds were filled with 0.0, so only divide where the price is a
    # valid decimal odd (>1); everything else stays NaN instead of triggering
    # a divide-by-zero RuntimeWarning.
    implied = np.full(O.shape, np.nan)
    np.divide(1.0, O, out=implied, where=O > 1.0)
    # -9.0 is a sentinel edge that can never beat a real one or the margin.
    edge = np.where(np.isnan(implied), -9.0, P - implied)
    out = []
    for i, k in enumerate(edge.argmax(axis=1)):
        k = int(k)
        o = float(O[i, k])
        e = float(edge[i, k])
        staked = (e > margin) and (lo <= o < hi)
        out.append({"idx": i, "pick": MS_PICKS[k], "odds": round(o, 2),
                    "model_prob": round(float(P[i, k]), 4), "edge": round(e, 4),
                    "staked": staked})
    return out
|
||||
|
||||
|
||||
def settle():
    """Settle open paper trades against the final scores in the training CSV.

    Rows that already carry a result are left untouched. An open row is
    settled only when its match is found and BOTH scores are present;
    otherwise it stays open. The log is rewritten in place and a one-line
    hit-rate / ROI summary over all settled rows is printed.
    """
    if not os.path.exists(PAPER_LOG):
        print("No paper_trades.csv yet.")
        return
    # Read ids and picks as text so they compare reliably across both files.
    pt = pd.read_csv(PAPER_LOG, dtype={"match_id": str, "pick": str})
    if "result" not in pt.columns:
        pt["result"] = np.nan
    if not pt["result"].isna().any():
        print("No open bets to settle.")
        return
    # settle from training CSV scores if present, else needs DB (left as note)
    src = pd.read_csv(TRAIN_CSV, low_memory=False, dtype={"match_id": str},
                      usecols=["match_id", "score_home", "score_away"])
    src = src.drop_duplicates("match_id", keep="first")
    scores = dict(zip(
        src["match_id"],
        zip(pd.to_numeric(src["score_home"], errors="coerce"),
            pd.to_numeric(src["score_away"], errors="coerce")),
    ))

    def res(row):
        if not pd.isna(row["result"]):
            return row["result"]
        found = scores.get(row["match_id"])
        if found is None:
            return np.nan
        sh, sa = found
        # A half-recorded score must not be settled (NaN compares False and
        # would otherwise fall through to an away win).
        if pd.isna(sh) or pd.isna(sa):
            return np.nan
        outcome = "1" if sh > sa else ("X" if sh == sa else "2")
        return "WON" if str(row["pick"]) == outcome else "LOST"

    pt["result"] = [res(row) for _, row in pt.iterrows()]
    odds = pd.to_numeric(pt["odds"], errors="coerce")
    pt["pnl"] = np.where(pt["result"] == "WON", odds - 1.0,
                         np.where(pt["result"] == "LOST", -1.0, np.nan))
    pt.to_csv(PAPER_LOG, index=False)
    s = pt.dropna(subset=["pnl"])
    if len(s):
        print(f"Settled {len(s)} bets: hit={100*(s['result']=='WON').mean():.1f}% "
              f"ROI={100*s['pnl'].sum()/len(s):+.2f}% net={s['pnl'].sum():+.1f}u")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print the day's staked MS picks and optionally log them.

    With --settle it only settles the paper log. Otherwise it loads the saved
    model, scores either the --features CSV or (when --features is absent)
    the last --n rows of the training CSV as a demo, prints the staked picks,
    and with --log appends them to the paper-trade log. Logging is skipped in
    demo mode, and a match already present in the log is not logged again.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--features", help="CSV of upcoming matches in training schema")
    ap.add_argument("--demo", action="store_true", help="use tail of training CSV as 'today'")
    ap.add_argument("--n", type=int, default=20)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.03)
    ap.add_argument("--settle", action="store_true")
    ap.add_argument("--log", action="store_true", help="append staked picks to paper_trades.csv")
    args = ap.parse_args()

    if args.settle:
        settle()
        return

    bst, feats, meta = load_model()
    print(f"Model {meta['version']} (trained {meta['trained_at']}, holdout "
          f"ROI {meta['holdout_eval']['roi_pct']}%) band[{args.lo},{args.hi}] margin {args.margin}\n")

    if args.features:
        df = pd.read_csv(args.features, low_memory=False)
        demo = False
    else:
        df = pd.read_csv(TRAIN_CSV, low_memory=False).sort_values("mst_utc").tail(args.n).reset_index(drop=True)
        demo = True
        print("(DEMO: last matches of training CSV as stand-in for today)\n")

    def _score(value):
        # Scores may be missing or non-numeric; treat both as unknown.
        try:
            return float(value)
        except (TypeError, ValueError):
            return float("nan")

    picks = pick_for_rows(df, bst, feats, args.lo, args.hi, args.margin)
    staked = [p for p in picks if p["staked"]]
    print(f"{len(df)} matches scanned -> {len(staked)} STAKED MS picks\n")
    print(f" {'match_id':<28}{'pick':>5}{'odds':>7}{'model%':>8}{'edge%':>7}" + (" result" if demo else ""))
    print(" "+"-"*60)
    log_rows = []
    for p in staked:
        r = df.iloc[p["idx"]]
        mid = str(r["match_id"])
        res = ""
        if demo:
            sh, sa = _score(r.get("score_home")), _score(r.get("score_away"))
            # Only show a result when both scores are known; a missing away
            # score would otherwise read as an away win.
            if pd.notna(sh) and pd.notna(sa):
                out = "1" if sh > sa else ("X" if sh == sa else "2")
                res = " WON" if p["pick"] == out else " lost"
        print(f" {mid:<28}{p['pick']:>5}{p['odds']:>7.2f}{100*p['model_prob']:>8.1f}{100*p['edge']:>+7.1f}{res}")
        log_rows.append({"logged_at": datetime.datetime.now().isoformat(timespec="seconds"),
                         "match_id": mid, "market": "MS", "pick": p["pick"], "odds": p["odds"],
                         "model_prob": p["model_prob"], "edge": p["edge"], "stake": 1.0,
                         "result": np.nan, "pnl": np.nan})
    if args.log and log_rows and not demo:
        new = pd.DataFrame(log_rows).drop_duplicates("match_id", keep="first")
        if os.path.exists(PAPER_LOG):
            old = pd.read_csv(PAPER_LOG, dtype={"match_id": str})
            # One bet per match: re-running the picker must not log the same
            # match a second time.
            seen = set(old["match_id"].astype(str)) if "match_id" in old.columns else set()
            new = new[~new["match_id"].isin(seen)]
            combined = pd.concat([old, new], ignore_index=True)
        else:
            combined = new
        combined.to_csv(PAPER_LOG, index=False)
        print(f"\n logged {len(new)} picks -> {PAPER_LOG}")
        skipped = len(log_rows) - len(new)
        if skipped:
            print(f" skipped {skipped} already-logged picks")
    elif args.log and demo:
        print("\n (--log ignored in --demo; only real upcoming picks are logged)")
    print("\nReminder: paper-trade only. Stake real money after weeks of forward")
    print("CLV>0 + ROI>0 (settle with --settle, check scoreboard/clv_report).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
Live Scoreboard — the single source of truth for real betting performance.
|
||||
=========================================================================
|
||||
Reads the *forward-tracked* results in `prediction_runs` (one row per analyzed
|
||||
match, with the staked main pick + actual outcome + realized unit_profit) and
|
||||
reports what ACTUALLY happened with real money logic — NOT a backtest.
|
||||
|
||||
Why this exists: backtests on this codebase are overfit (a paper "+32.7% ROI"
|
||||
strategy that the live engine never even ran). The only trustworthy number is
|
||||
the realized P/L recorded after matches settle. This tool surfaces it.
|
||||
|
||||
Read-only. SELECT only. Safe to run anytime.
|
||||
|
||||
Usage:
|
||||
python scripts/live_scoreboard.py
|
||||
python scripts/live_scoreboard.py --days 30
|
||||
python scripts/live_scoreboard.py --version v28-pro-max
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# utf-8 stdout so Turkish market/league names never crash on Windows cp1252
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
|
||||
sys.path.insert(0, AI_ENGINE_DIR)
|
||||
|
||||
from data.db import get_clean_dsn # noqa: E402
|
||||
import psycopg2 # noqa: E402
|
||||
from psycopg2.extras import RealDictCursor # noqa: E402
|
||||
|
||||
# (inclusive lower edge, exclusive upper edge, label), in ascending order.
ODDS_BANDS = [
    (0, 1.5, "<1.5"),
    (1.5, 2.0, "1.5-2"),
    (2.0, 3.0, "2-3"),
    (3.0, 5.0, "3-5"),
    (5.0, 6.0, "5-6"),
    (6.0, 7.5, "6-7.5"),
    (7.5, 999, "7.5+"),
]
|
||||
|
||||
|
||||
def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
|
||||
try:
|
||||
return float(x) if x is not None else d
|
||||
except (TypeError, ValueError):
|
||||
return d
|
||||
|
||||
|
||||
def _parse(j: Any) -> Dict[str, Any]:
|
||||
if isinstance(j, str):
|
||||
try:
|
||||
return json.loads(j)
|
||||
except Exception:
|
||||
return {}
|
||||
return j or {}
|
||||
|
||||
|
||||
def _band(odds: Optional[float]) -> str:
    """Return the ODDS_BANDS label containing ``odds``, or "?" when none does."""
    if odds is None:
        return "?"
    matches = (name for lower, upper, name in ODDS_BANDS if lower <= odds < upper)
    return next(matches, "?")
|
||||
|
||||
|
||||
def fetch_rows(args) -> List[Dict[str, Any]]:
    """Fetch settled prediction runs (read-only SELECT), oldest first.

    Args:
        args: parsed CLI namespace; ``args.version`` filters on
            engine_version and ``args.days`` limits to runs generated in the
            last N days. Falsy values disable the respective filter.

    Returns:
        The rows of prediction_runs with a non-NULL eventual_outcome, as
        returned by a RealDictCursor.
    """
    dsn = get_clean_dsn()
    # Only fixed SQL fragments go into `where`; every value is bound via params.
    where = ["eventual_outcome IS NOT NULL"]
    params: List[Any] = []
    if args.version:
        where.append("engine_version = %s")
        params.append(args.version)
    if args.days:
        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
        where.append("generated_at >= %s")
        params.append(cutoff)
    sql = f"""
        SELECT match_id, engine_version, generated_at, eventual_outcome,
               unit_profit, payload_summary
        FROM prediction_runs
        WHERE {' AND '.join(where)}
        ORDER BY generated_at ASC
    """
    # psycopg2's `with connection:` only ends the transaction; it does not
    # close the connection, so close it explicitly.
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def distill(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten each run into one record holding its main pick and realized P/L.

    A record is "staked" when the main pick is playable with a positive
    stake, and "settled_stake" when it is staked and the outcome does not
    start with NO_BET / PUSH / VOID / CANCEL. "win" means a settled stake
    with positive recorded profit.
    """
    unsettled_prefixes = ("NO_BET", "PUSH", "VOID", "CANCEL")
    records = []
    for row in rows:
        summary = _parse(row["payload_summary"])
        main_pick = summary.get("main_pick") or {}
        units = _f(main_pick.get("stake_units"), 0.0) or 0.0
        realized = _f(row["unit_profit"], 0.0) or 0.0
        result = str(row["eventual_outcome"] or "")
        is_staked = bool(main_pick.get("playable")) and units > 0
        # settled stake = a real bet with a win/loss (exclude NO_BET / push)
        is_settled = is_staked and not result.startswith(unsettled_prefixes)
        records.append({
            "match_id": row["match_id"],
            "version": row["engine_version"],
            "ts": row["generated_at"],
            "market": main_pick.get("market") or "?",
            "pick": main_pick.get("pick"),
            "odds": _f(main_pick.get("odds")),
            "stake": units,
            "profit": realized,
            "outcome": result,
            "staked": is_staked,
            "settled_stake": is_settled,
            "win": is_settled and realized > 0,
        })
    return records
|
||||
|
||||
|
||||
def _agg(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
# NOTE: recorded unit_profit is on a FLAT 1u basis (win=odds-1, loss=-1),
|
||||
# independent of the brain's suggested stake_units. So ROI is profit per
|
||||
# bet at 1u flat = profit / n. (Using stake_units as denominator is wrong:
|
||||
# it double-counts and produces impossible >100% losses.)
|
||||
s = [r for r in recs if r["settled_stake"]]
|
||||
n = len(s)
|
||||
wins = sum(1 for r in s if r["win"])
|
||||
sug_stake = sum(r["stake"] for r in s)
|
||||
profit = sum(r["profit"] for r in s)
|
||||
return {
|
||||
"n": n,
|
||||
"wins": wins,
|
||||
"hit_pct": round(100.0 * wins / n, 1) if n else None,
|
||||
"sug_stake": round(sug_stake, 2),
|
||||
"profit": round(profit, 2),
|
||||
"roi_pct": round(100.0 * profit / n, 1) if n else None, # flat 1u
|
||||
}
|
||||
|
||||
|
||||
def _line(label: str, a: Dict[str, Any]) -> str:
|
||||
return (f" {label:<14} n={a['n']:>4} hit={str(a['hit_pct'] if a['hit_pct'] is not None else '-'):>5}% "
|
||||
f"profit={a['profit']:>8.2f}u ROI(flat1u)={str(a['roi_pct'] if a['roi_pct'] is not None else '-'):>7}%")
|
||||
|
||||
|
||||
def risk_metrics(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Walk the settled stakes in timestamp order and summarize the equity curve.

    Returns the deepest drop below the running peak (zero or negative, in
    units), the longest run of consecutive non-winning bets (profit <= 0),
    and the final cumulative profit.
    """
    ordered = sorted(recs, key=lambda rec: rec["ts"])
    running = 0.0
    high_water = 0.0
    deepest = 0.0
    current_run = 0
    longest_run = 0
    for rec in ordered:
        if not rec["settled_stake"]:
            continue
        gain = rec["profit"]
        running += gain
        if running > high_water:
            high_water = running
        deepest = min(deepest, running - high_water)
        if gain > 0:
            current_run = 0
        else:
            current_run += 1
            longest_run = max(longest_run, current_run)
    return {
        "max_drawdown_u": round(deepest, 2),
        "longest_losing_streak": longest_run,
        "final_cum_u": round(running, 2),
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print the realized-results scoreboard.

    Fetches settled runs, then prints the overall figures plus breakdowns by
    engine version, market, odds band and ISO week. Every breakdown except
    the per-version one groups settled stakes only.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--days", type=int, default=None, help="Only last N days")
    ap.add_argument("--version", help="Filter by engine_version")
    args = ap.parse_args()

    rows = fetch_rows(args)
    recs = distill(rows)

    print("=" * 74)
    print("LIVE SCOREBOARD — realized results from prediction_runs (NOT backtest)")
    print("=" * 74)
    if recs:
        lo = min(r["ts"] for r in recs).date()
        hi = max(r["ts"] for r in recs).date()
        print(f"window: {lo} .. {hi} settled runs: {len(recs)}"
              + (f" filter: {args.version}" if args.version else ""))
    print()

    overall = _agg(recs)
    print("OVERALL (staked = playable bets only)")
    print(_line("ALL", overall))
    no_bet = sum(1 for r in recs if not r["staked"])
    print(f" (analyzed {len(recs)} matches; {overall['n']} actually staked, "
          f"{no_bet} NO_BET)")
    if overall["n"]:
        rm = risk_metrics(recs)
        print(f" max drawdown: {rm['max_drawdown_u']}u "
              f"longest losing streak: {rm['longest_losing_streak']} "
              f"net: {rm['final_cum_u']}u")
    print()

    print("BY ENGINE VERSION")
    by_v = defaultdict(list)
    for r in recs:
        by_v[r["version"]].append(r)
    for v, rs in sorted(by_v.items(), key=lambda kv: -len(kv[1])):
        # A NULL engine_version cannot be padded with a format spec; show it
        # as "?" instead of crashing the whole report.
        print(_line("?" if v is None else str(v), _agg(rs)))
    print()

    print("BY MARKET (staked)")
    by_m = defaultdict(list)
    for r in recs:
        if r["settled_stake"]:
            by_m[r["market"]].append(r)
    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
        print(_line(m, _agg(rs)))
    if not by_m:
        print(" (no staked settled bets in window)")
    print()

    print("BY ODDS BAND (staked)")
    by_b = defaultdict(list)
    for r in recs:
        if r["settled_stake"]:
            by_b[_band(r["odds"])].append(r)
    # Printed in ODDS_BANDS order; bets whose band is "?" are not listed.
    for _, _, name in ODDS_BANDS:
        if name in by_b:
            print(_line(name, _agg(by_b[name])))
    print()

    print("WEEKLY TREND (staked)")
    by_w = defaultdict(list)
    for r in recs:
        if r["settled_stake"]:
            iso = r["ts"].isocalendar()
            by_w[f"{iso[0]}-W{iso[1]:02d}"].append(r)
    for w in sorted(by_w):
        a = _agg(by_w[w])
        print(_line(w, a))
    print()
    print("=" * 74)
    print("READ: ROI < 0 over a meaningful sample = the staked signals are not")
    print("profitable. 'NO_BET' rows are free (no stake). CLV is unmeasurable")
    print("until odds movement is captured (see scripts + odds_history fix).")
    print("=" * 74)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
|
||||
========================================================================
|
||||
Not "play the handed main_pick". For each match, score EVERY market the model
|
||||
covers, compare model prob vs market implied, and select the single best VALUE
|
||||
bet across all markets. Leak-free, walk-forward, honest.
|
||||
|
||||
Markets (truth derived from scores, not trusted labels):
|
||||
MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.
|
||||
|
||||
Outputs:
|
||||
(A) per-market value ROI -> which bet types actually carry edge
|
||||
(B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
|
||||
fold-consistency, and the model-free baseline.
|
||||
|
||||
⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD; forward
|
||||
paper-trade with real CLV before staking.
|
||||
|
||||
Usage: python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
|
||||
try: sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception: pass
|
||||
|
||||
# Project root = the directory two levels above this file.
_HERE = os.path.abspath(__file__)
AI_DIR = os.path.dirname(os.path.dirname(_HERE))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier and score columns: never used as model features.
META = {
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    "score_home", "score_away",
    "ht_score_home", "ht_score_away",
}

# Columns excluded from the feature set as leaky.
LEAKY = {
    "home_goals_form", "away_goals_form",
    "total_goals", "ht_total_goals",
    "squad_diff", "home_squad_quality", "away_squad_quality",
    "referee_home_bias", "referee_avg_goals",
}
|
||||
|
||||
# market -> (kind, [odds_cols aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
|
||||
def ou(line):
    """Build the full-time over/under truth function for ``line`` (0=Over, 1=Under)."""
    def truth(sh, sa, hh, ha):
        total = sh + sa
        if total > line:
            return 0
        return 1
    return truth
|
||||
def htou(line):
    """Build the half-time over/under truth function for ``line``.

    The returned function yields 0 for Over, 1 for Under, and None when
    either half-time score is missing (NaN), so a half-recorded score is
    never labelled Under by accident.
    """
    def truth(sh, sa, hh, ha):
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1
    return truth
|
||||
def ms_truth(sh, sa, hh, ha):
    """Full-time 1X2 class from the final score: 0=home win, 1=draw, 2=away win."""
    if sh > sa:
        return 0
    if sh == sa:
        return 1
    return 2
|
||||
def ht_truth(sh, sa, hh, ha):
    """Half-time 1X2 class: 0=home leads, 1=level, 2=away leads.

    Returns None when either half-time score is missing (NaN); checking only
    the home score would label a missing away score as an away lead.
    """
    if np.isnan(hh) or np.isnan(ha):
        return None
    if hh > ha:
        return 0
    return 1 if hh == ha else 2
|
||||
def btts_truth(sh, sa, hh, ha):
    """Both-teams-to-score class from the final score: 0=Yes, 1=No."""
    if sh > 0 and sa > 0:
        return 0
    return 1
|
||||
|
||||
# market -> (kind, odds columns aligned to classes, pick labels, truth function).
# Insertion order is kept as MS, HT, OU05..OU35, HT_OU05, HT_OU15, BTTS.
MARKETS = {
    "MS": ("multi", ["odds_ms_h", "odds_ms_d", "odds_ms_a"], ["1", "X", "2"], ms_truth),
    "HT": ("multi", ["odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a"], ["1", "X", "2"], ht_truth),
    # Full-time goal lines.
    **{
        f"OU{tag}": ("binary", [f"odds_ou{tag}_o", f"odds_ou{tag}_u"], ["Üst", "Alt"], ou(goal_line))
        for tag, goal_line in (("05", 0.5), ("15", 1.5), ("25", 2.5), ("35", 3.5))
    },
    # Half-time goal lines.
    **{
        f"HT_OU{tag}": ("binary", [f"odds_ht_ou{tag}_o", f"odds_ht_ou{tag}_u"], ["Üst", "Alt"], htou(goal_line))
        for tag, goal_line in (("05", 0.5), ("15", 1.5))
    },
    "BTTS": ("binary", ["odds_btts_y", "odds_btts_n"], ["Var", "Yok"], btts_truth),
}
|
||||
# Tree settings shared by the 3-class and the binary market models.
_TREE_PARAMS = {
    "max_depth": 5,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "verbosity": 0,
}
PARAMS_M = {"objective": "multi:softprob", "num_class": 3, **_TREE_PARAMS}
PARAMS_B = {"objective": "binary:logistic", **_TREE_PARAMS}
|
||||
|
||||
|
||||
def main():
    """Walk-forward evaluation of per-market value bets and the cross-market selector.

    The first half of the time-sorted history is always training data; the
    second half is split into --folds consecutive test windows. For each
    window one model per market is trained on everything before it, then for
    every test match:
      (A) each market's single best value pick (edge > --margin, odds in
          [--lo, --hi)) is settled into that market's totals, and
      (B) the best of those picks across all markets is settled as the
          selector's bet for the match.
    All bets are flat 1u.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=150)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.6)
    ap.add_argument("--margin", type=float, default=0.03)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")

    # precompute truth + odds per market
    MK = {}
    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            print(f" skip {mname}: missing odds cols")
            continue
        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
        # object dtype so a missing truth can be stored as None
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(N)], dtype=object)
        MK[mname] = (kind, O, picks, truth)

    start = int(N*0.5)
    bounds = np.linspace(start, N, args.folds+1, dtype=int)

    # accumulators
    per_market = {m: {"n": 0, "pnl": 0.0, "win": 0} for m in MK}   # (A) best value pick within market
    sel = {"n": 0, "pnl": 0.0, "win": 0, "fold": {}}               # (B) cross-market selector
    sel_by_mkt = {m: {"n": 0, "pnl": 0.0, "win": 0} for m in MK}

    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi+1]
        if te1-te0 < 50:
            continue
        # train each market model on [:te0], predict test
        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
        for m, (kind, O, picks, truth) in MK.items():
            ytr_full = truth[:te0]
            # mask invalid truth (e.g., HT markets with missing HT score)
            valid_tr = np.array([v is not None for v in ytr_full])
            ytr = ytr_full[valid_tr].astype(int)
            if kind == "multi":
                bst = xgb.train(PARAMS_M, xgb.DMatrix(X[:te0][valid_tr], label=ytr),
                                num_boost_round=args.estimators)
                P = bst.predict(xgb.DMatrix(X[te0:te1]))  # [n,3]
            else:
                # class 0 is the "positive" side (Over / Yes), so train on ytr==0
                pos = (ytr == 0).astype(int)
                bst = xgb.train(PARAMS_B, xgb.DMatrix(X[:te0][valid_tr], label=pos),
                                num_boost_round=args.estimators)
                ppos = bst.predict(xgb.DMatrix(X[te0:te1]))
                P = np.column_stack([ppos, 1.0-ppos])  # [n,2] -> [pos,neg]
            cand[m] = (P, O[te0:te1], truth[te0:te1])

        # iterate test matches
        for j in range(te1-te0):
            best = None  # (edge, market, pickidx, odds, won)
            for m, (P, Ot, Tt) in cand.items():
                t = Tt[j]
                if t is None:
                    continue
                probs = P[j]
                odds = Ot[j]
                # This market's best in-band value pick; ties keep the first.
                bestk = None
                for k in range(len(probs)):
                    o = odds[k]
                    if o <= 1.0:
                        continue
                    e = probs[k] - 1.0/o
                    if e > args.margin and args.lo <= o < args.hi and (bestk is None or e > bestk[0]):
                        bestk = (e, k, o, int(t == k))
                if bestk is None:
                    continue
                e, k, o, won = bestk
                pnl = (o-1.0) if won else -1.0
                d = per_market[m]
                d["n"] += 1
                d["pnl"] += pnl
                d["win"] += won
                # The selector's candidate from a market is that market's best
                # pick; the strict ">" keeps the earlier market on ties.
                if best is None or e > best[0]:
                    best = (e, m, k, o, won)
            # selector: single best value bet across all markets for this match
            if best is not None:
                edge, m, k, o, won = best
                pnl = (o-1.0) if won else -1.0
                sel["n"] += 1
                sel["pnl"] += pnl
                sel["win"] += won
                sel["fold"][fi] = sel["fold"].get(fi, 0.0)+pnl
                d = sel_by_mkt[m]
                d["n"] += 1
                d["pnl"] += pnl
                d["win"] += won
        print(f" fold {fi}: tested {te1-te0:,}")

    def line(name, d):
        n = d["n"]
        roi = 100*d["pnl"]/n if n else float('nan')
        hit = 100*d["win"]/n if n else float('nan')
        return f" {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"

    def roi_rank(m):
        # Sort key: best ROI first; markets without bets rank as -99% ROI.
        d = per_market[m]
        return -(100*d['pnl']/d['n'] if d['n'] else -99)

    print("\n"+"="*70)
    print(f"(A) PER-MARKET value ROI (best value pick in band [{args.lo},{args.hi}], margin {args.margin})")
    print("="*70)
    for m in sorted(per_market, key=roi_rank):
        print(line(m, per_market[m]))

    print("\n"+"="*70)
    print("(B) CROSS-MARKET SELECTOR (best value bet per match, all markets)")
    print("="*70)
    print(line("SELECTOR", sel))
    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
    print(f" folds positive: {folds_pos}/{len(sel['fold'])}")
    print(" selector picks distributed across markets:")
    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]['n']):
        if sel_by_mkt[m]["n"] > 0:
            print(" "+line(m, sel_by_mkt[m]).strip())
    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Train Favorite-Policy Model (v1) — leak-free MS model for the validated strategy.
|
||||
================================================================================
|
||||
Trains a LEAK-FREE 1X2 model (drops the result-encoding columns) and saves it
|
||||
plus the feature list and policy metadata. This is the brain of the new system;
|
||||
the favourite-band value policy (odds ~1.5-2.2, model_prob>implied, flat stake)
|
||||
is applied on top of its probabilities at serving time.
|
||||
|
||||
Honest holdout: trains on the first --holdout-frac of history, evaluates the
|
||||
EXACT policy on the most recent slice (never seen in training), then retrains
|
||||
on ALL history for the saved production artifact.
|
||||
|
||||
Saves to models/favorite_v1/: model.json, feature_cols.json, metadata.json
|
||||
|
||||
Usage: python scripts/train_favorite_model.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, json, os, sys, datetime
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
# Best-effort switch of stdout to UTF-8: this script prints non-ASCII glyphs
# (e.g. the final status line), so ask the stream to encode them as UTF-8.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        # Deliberately ignored: if the stream refuses to be reconfigured we
        # simply keep writing with whatever encoding it already has.
        pass

# Paths are resolved relative to the directory two levels above this file.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")   # training table
OUT = os.path.join(AI_DIR, "models", "favorite_v1")           # artifact folder
|
||||
|
||||
# Identifier / score columns: never used as model features.
META = {
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    "score_home", "score_away", "ht_score_home", "ht_score_away",
}

# Result-encoding leakage — never feed these to the model (train OR serve).
LEAKY = {
    "home_goals_form", "away_goals_form", "total_goals", "ht_total_goals",
    "squad_diff", "home_squad_quality", "away_squad_quality",
    "referee_home_bias", "referee_avg_goals",
}

# XGBoost booster parameters for the 3-class (home / draw / away) model.
PARAMS = {
    "objective": "multi:softprob",
    "num_class": 3,
    "max_depth": 5,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "verbosity": 0,
}
|
||||
|
||||
|
||||
def policy_eval(P, y, O, lo, hi, margin):
    """Simulate the flat-stake value policy and summarise its results.

    For every row the outcome with the largest ``model_prob - 1/odds`` edge is
    selected; a 1-unit bet is placed when that edge is strictly above
    ``margin`` and the selected odds fall in the half-open band ``[lo, hi)``.

    Args:
        P: 2-D array-like of model probabilities, one row per match and one
            column per outcome.
        y: 1-D array-like with the realised outcome index of each match.
        O: 2-D array-like of decimal odds, same shape as ``P``. Entries that
            are not ``> 1.0`` (including NaN) are treated as "no price" and
            can never be picked on value.
        lo: Inclusive lower bound of the odds band.
        hi: Exclusive upper bound of the odds band.
        margin: Minimum edge (strict) required to place a bet.

    Returns:
        dict with native Python numbers: ``bets`` (count), ``hit_pct``
        (winning share of bets, 1 decimal), ``roi_pct`` (profit per unit
        staked, 2 decimals) and ``net_u`` (total profit in units, 1 decimal).
        With no bets every metric is 0.
    """
    P = np.asarray(P, dtype=float)
    O = np.asarray(O, dtype=float)
    y = np.asarray(y)

    # Only divide where a real price exists; the callers fill missing odds
    # with 0.0, and dividing those would emit divide-by-zero warnings.
    priced = O > 1.0
    edge = np.full(P.shape, -9.0)          # -9.0 = sentinel "never pick this"
    edge[priced] = P[priced] - 1.0 / O[priced]

    rows = np.arange(len(y))
    pick = edge.argmax(axis=1)
    pick_edge = edge[rows, pick]
    pick_odds = O[rows, pick]

    bet = (pick_edge > margin) & (pick_odds >= lo) & (pick_odds < hi)
    win = (pick == y) & bet
    pnl = np.where(win, pick_odds - 1.0, -1.0)[bet]

    n = int(bet.sum())
    denom = max(n, 1)                      # avoid 0/0 when nothing was bet
    net = float(pnl.sum())
    return {
        "bets": n,
        "hit_pct": round(100.0 * int(win.sum()) / denom, 1),
        "roi_pct": round(100.0 * net / denom, 2),
        "net_u": round(net, 1),
    }
|
||||
|
||||
|
||||
def main():
    """Train, honestly evaluate and save the leak-free favourite-policy model.

    Steps:
      1. Load the training CSV sorted by ``mst_utc`` and keep rows with
         numeric full-time scores.
      2. Train on the first ``1 - holdout_frac`` of rows and evaluate accuracy
         plus the betting policy on the remaining rows.
      3. Retrain on every row and write ``model.json``, ``feature_cols.json``
         and ``metadata.json`` into ``OUT``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.2)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--holdout-frac", type=float, default=0.15)
    ap.add_argument("--estimators", type=int, default=300)
    args = ap.parse_args()

    # A fraction outside (0, 1) leaves either the training or the holdout
    # slice empty; fail with a clear message instead of deep inside training.
    if not 0.0 < args.holdout_frac < 1.0:
        ap.error("--holdout-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)

    # Keep only matches whose full-time score parses as a number.
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    keep = ok.values
    df = df[ok].reset_index(drop=True)
    sh = sh[keep].values
    sa = sa[keep].values

    # Target derived from the score: 0 = home win, 1 = draw, 2 = away win.
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))

    # Missing / unparsable odds become 0.0, which policy_eval treats as unpriced.
    odds_cols = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
    O = df[odds_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values

    feats = [
        c for c in df.columns
        if c not in META and not c.startswith("label_") and c not in LEAKY
    ]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    print(f" {len(df):,} rows, {len(feats)} leak-free features")

    # ── Honest holdout (last slice, never trained on) ──
    cut = int(len(df) * (1 - args.holdout_frac))
    if cut <= 0 or cut >= len(df):
        raise SystemExit(
            f"Not enough rows ({len(df)}) to split with --holdout-frac {args.holdout_frac}"
        )
    bst = xgb.train(PARAMS, xgb.DMatrix(X[:cut], label=y[:cut]),
                    num_boost_round=args.estimators)
    Ph = bst.predict(xgb.DMatrix(X[cut:]))
    acc = float((Ph.argmax(1) == y[cut:]).mean())
    hold = policy_eval(Ph, y[cut:], O[cut:], args.lo, args.hi, args.margin)
    print(f"\nHOLDOUT (last {args.holdout_frac:.0%}, {len(df)-cut:,} matches, never seen):")
    print(f" MS accuracy: {acc*100:.1f}%")
    print(f" POLICY band[{args.lo},{args.hi}] margin {args.margin}: {hold}")

    # ── Production model: retrain on ALL history ──
    print("\nTraining production model on ALL history ...")
    final = xgb.train(PARAMS, xgb.DMatrix(X, label=y), num_boost_round=args.estimators)
    os.makedirs(OUT, exist_ok=True)
    final.save_model(os.path.join(OUT, "model.json"))
    with open(os.path.join(OUT, "feature_cols.json"), "w", encoding="utf-8") as f:
        json.dump(feats, f, ensure_ascii=False, indent=2)

    meta = {
        "version": "favorite_v1",
        "trained_at": datetime.datetime.now().isoformat(timespec="seconds"),
        "market": "MS",
        "classes": {"0": "home(1)", "1": "draw(X)", "2": "away(2)"},
        "policy": {
            "odds_lo": args.lo,
            "odds_hi": args.hi,
            "margin": args.margin,
            "stake": "flat 1u",
            "rule": "bet model's max value edge if picked odds in band",
            "never": ["longshots odds>=hi", "parlays/combos"],
        },
        "n_train": len(df),
        "n_features": len(feats),
        "leaky_excluded": sorted(LEAKY),
        "holdout_eval": {"accuracy_pct": round(acc*100, 1), **hold},
        "caveat": "CSV odds are a static capture, not verified closing. Forward paper-trade with real CLV before staking.",
    }
    with open(os.path.join(OUT, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Saved production model to {OUT}/")
    print(f" model.json, feature_cols.json ({len(feats)} feats), metadata.json")
    print("\nNEXT: serving wrapper that loads this + applies the policy to upcoming")
    print("matches, logs paper-trade picks, and we measure real forward CLV/ROI.")
|
||||
|
||||
|
||||
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
Walk-Forward Odds-Blind Experiment — THE pivotal test.
|
||||
======================================================
|
||||
Question this answers: can a model BEAT THE MARKET out-of-sample, betting only
|
||||
on information the price doesn't already contain?
|
||||
|
||||
Method (no leakage, time-ordered):
|
||||
* data sorted by kickoff (mst_utc); train on the past, test on the future,
|
||||
rolled over several folds.
|
||||
* TWO models on the MS (1X2) market:
|
||||
ALL = every feature INCLUDING the bookmaker odds (what the live
|
||||
engine does -> it mostly re-learns the price).
|
||||
BLIND = identical but odds/implied/_present columns REMOVED, so the
|
||||
model must disagree with the market using fundamentals only.
|
||||
* For each, an honest value-bet simulation on the test fold using the REAL
|
||||
odds payouts (margin included): bet the outcome with the biggest
|
||||
model_prob - implied_prob edge above a margin; ROI = realized P/L per 1u.
|
||||
|
||||
Read: if BLIND's value ROI is consistently > 0 across folds, there is a real,
|
||||
exploitable lead. If both are <= 0 (expected), these markets aren't beatable
|
||||
with this data and the honest move is to stop staking.
|
||||
|
||||
Usage:
|
||||
python scripts/walkforward_oddsblind.py
|
||||
python scripts/walkforward_oddsblind.py --folds 6 --estimators 300
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Best-effort switch of stdout to UTF-8 before anything is printed.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        # Deliberately ignored: if the stream refuses to be reconfigured we
        # simply keep writing with whatever encoding it already has.
        pass

# Paths are resolved relative to the directory two levels above this file.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")   # training table
|
||||
|
||||
import xgboost as xgb # noqa: E402
|
||||
|
||||
# Identifier / score columns: never used as model features.
META = {
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    "score_home", "score_away", "ht_score_home", "ht_score_away",
}

# Confirmed target leakage: *_goals_form is integer-valued and ~0.63 correlated
# with THIS match's goals; their diff equals the actual goal diff 73% of the
# time. Excluded so the experiment measures genuine pre-match predictive power.
LEAKY = {
    # CONFIRMED (encode the actual match result):
    "home_goals_form", "away_goals_form",   # ~0.63 corr w/ this match's goals
    "total_goals",                          # this match's full-time total
    "ht_total_goals",                       # this match's half-time total
    # STRONG SUSPECTS (dominate importance + high outcome corr; audit extractor):
    "squad_diff", "home_squad_quality", "away_squad_quality",
    "referee_home_bias", "referee_avg_goals",
}
|
||||
|
||||
|
||||
def is_odds_col(c: str) -> bool:
    """Return True when column name *c* carries bookmaker price information.

    A column counts as price-derived when its name contains ``"odds"`` or
    ``"implied"`` (case-insensitive); such columns are dropped to build the
    odds-blind feature set.
    """
    name = c.lower()
    return any(token in name for token in ("odds", "implied"))
|
||||
|
||||
|
||||
def logloss(y: np.ndarray, p: np.ndarray) -> float:
    """Return the mean multi-class log loss of probabilities *p* for labels *y*.

    Args:
        y: 1-D integer class indices, one per row of *p*.
        p: 2-D predicted probabilities (rows = samples, columns = classes).

    Probabilities are clipped to ``[1e-9, 1 - 1e-9]`` so a confident miss
    yields a large finite penalty instead of ``inf``.
    """
    y = np.asarray(y)
    clipped = np.clip(np.asarray(p, dtype=float), 1e-9, 1 - 1e-9)
    true_class_prob = clipped[np.arange(len(y)), y]
    return float(-np.mean(np.log(true_class_prob)))
|
||||
|
||||
|
||||
def value_sim(proba: np.ndarray, y: np.ndarray, odds: np.ndarray,
              margin: float) -> dict:
    """Bet the class with the biggest (model_prob - 1/odds) edge above margin.

    Args:
        proba: 2-D model probabilities (rows = matches, columns = outcomes).
        y: 1-D realised outcome index per match.
        odds: 2-D decimal odds, same shape as *proba*. Entries that are not
            ``> 1.0`` (including NaN) are treated as "no price" and are never
            picked on value.
        margin: A 1-unit bet is placed only when the best edge is strictly
            greater than this value.

    Returns:
        ``{"n": bets, "roi": profit per unit staked in % (2 decimals),
        "hit": winning share of bets in % (1 decimal)}``; ``roi`` and ``hit``
        are ``None`` when no bet qualifies.
    """
    proba = np.asarray(proba, dtype=float)
    odds = np.asarray(odds, dtype=float)
    y = np.asarray(y)

    # Divide only where a real price exists: callers fill missing odds with
    # 0.0 and dividing those would emit divide-by-zero warnings.
    priced = odds > 1.0
    edge = np.full(proba.shape, -9.0)      # -9.0 = sentinel "never pick this"
    edge[priced] = proba[priced] - 1.0 / odds[priced]

    rows = np.arange(len(y))
    pick = np.argmax(edge, axis=1)
    bet = edge[rows, pick] > margin
    n = int(bet.sum())
    if n == 0:
        return {"n": 0, "roi": None, "hit": None}

    win = (pick == y) & bet
    pick_odds = odds[rows, pick]
    # Realised profit per placed bet at the quoted odds (1u flat stake).
    pnl = np.where(win, pick_odds - 1.0, -1.0)[bet]
    return {
        "n": n,
        "roi": round(float(100.0 * pnl.sum() / n), 2),
        "hit": round(float(100.0 * win.sum() / n), 1),
    }
|
||||
|
||||
|
||||
def train_eval(Xtr, ytr, Xte, yte, odds_te, est, margins):
    """Fit one 3-class XGBoost model on a fold and score it on the test slice.

    Args:
        Xtr, ytr: Training feature matrix and outcome labels.
        Xte, yte: Test feature matrix and outcome labels.
        odds_te: Decimal odds for the test rows, passed to ``value_sim``.
        est: Number of boosting rounds.
        margins: Iterable of edge margins to simulate.

    Returns:
        dict with ``logloss`` (4 decimals), ``acc`` (accuracy in %, 1 decimal)
        and one ``val@<margin>`` entry per margin holding the ``value_sim``
        result.
    """
    params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "max_depth": 5,
        "eta": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "tree_method": "hist",
        "verbosity": 0,
    }
    booster = xgb.train(params, xgb.DMatrix(Xtr, label=ytr), num_boost_round=est)
    proba = booster.predict(xgb.DMatrix(Xte))

    out = {
        "logloss": round(logloss(yte, proba), 4),
        "acc": round(100.0 * (proba.argmax(1) == yte).mean(), 1),
    }
    # One value-bet simulation per requested margin, on the same predictions.
    out.update({f"val@{mg}": value_sim(proba, yte, odds_te, mg) for mg in margins})
    return out
|
||||
|
||||
|
||||
def main() -> int:
    """Run the walk-forward ALL-vs-BLIND experiment and print the results.

    The last ``--test-frac`` of the time-sorted data is cut into ``--folds``
    consecutive test slices; for each slice both feature sets are trained on
    every row before it (expanding window) and scored with ``train_eval``.
    Per-fold rows and a bet-weighted aggregate are printed.

    Returns:
        0 on completion (used as the process exit code).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=250)
    ap.add_argument("--test-frac", type=float, default=0.5,
                    help="Fraction at the end used as rolling OOS (default 0.5)")
    args = ap.parse_args()

    # Reject settings that cannot produce a valid train/test split: with
    # test_frac == 1 the first fold would train on an empty slice.
    if args.folds < 1:
        ap.error("--folds must be >= 1")
    if not 0.0 < args.test_frac < 1.0:
        ap.error("--test-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False)
    df = df.sort_values("mst_utc").reset_index(drop=True)
    print(f" {len(df)} rows, {df.shape[1]} cols")

    # Derive true MS outcome from scores: 0=home,1=draw,2=away (robust, no label trust)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    valid = sh.notna() & sa.notna()
    df, y = df[valid].reset_index(drop=True), y[valid.values]

    # Missing / unparsable odds become 0.0, which value_sim treats as unpriced.
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(
        pd.to_numeric, errors="coerce").fillna(0.0).values

    feat_all = [
        c for c in df.columns
        if c not in META and not c.startswith("label_") and c not in LEAKY
    ]
    feat_blind = [c for c in feat_all if not is_odds_col(c)]
    print(f" excluded leaky cols: {sorted(LEAKY)}")
    Xall = df[feat_all].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    Xblind = df[feat_blind].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    print(f" features: ALL={len(feat_all)} BLIND={len(feat_blind)} "
          f"(dropped {len(feat_all)-len(feat_blind)} odds cols)")
    print(f" base rates: home={100*(y==0).mean():.1f}% draw={100*(y==1).mean():.1f}% "
          f"away={100*(y==2).mean():.1f}%")

    n = len(df)
    start = int(n * (1 - args.test_frac))
    # Fold edges inside the out-of-sample tail [start, n).
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    margins = [0.0, 0.05, 0.10]
    feature_sets = (("ALL", Xall), ("BLIND", Xblind))

    # Per model: list of fold loglosses and, per margin, list of value_sim dicts.
    agg = {
        name: {"logloss": [], **{f"val@{m}": [] for m in margins}}
        for name, _ in feature_sets
    }

    print(f"\nWalk-forward: {args.folds} folds, train=expanding, est={args.estimators}\n")
    hdr = f"{'fold':<5}{'model':<7}{'logloss':>9}{'acc%':>7}" + "".join(
        f"{('val@'+str(m)):>22}" for m in margins)
    print(hdr)
    print("-" * len(hdr))

    for i in range(args.folds):
        te0, te1 = bounds[i], bounds[i + 1]
        if te1 - te0 < 50:
            # Too few test matches for a meaningful fold.
            continue
        tr = slice(0, te0)      # expanding window: everything before the fold
        te = slice(te0, te1)
        for name, X in feature_sets:
            r = train_eval(X.iloc[tr].values, y[tr], X.iloc[te].values, y[te],
                           odds[te], args.estimators, margins)
            agg[name]["logloss"].append(r["logloss"])
            cells = []
            for m in margins:
                v = r[f"val@{m}"]
                agg[name][f"val@{m}"].append(v)
                cells.append(f"{('n=' + str(v['n']) + ' roi=' + str(v['roi'])):>22}")
            print(f"{i:<5}{name:<7}{r['logloss']:>9}{r['acc']:>7}{''.join(cells)}")
        print()

    print("=" * 70)
    print("AGGREGATE (sum bets, weighted ROI across folds)")
    print("=" * 70)
    for name, _ in feature_sets:
        fold_ll = agg[name]["logloss"]
        ll = np.mean(fold_ll) if fold_ll else float("nan")
        print(f"\n{name} mean logloss={ll:.4f}")
        for m in margins:
            vs = agg[name][f"val@{m}"]
            tot_n = sum(v["n"] for v in vs)
            # Rebuild each fold's P/L from its ROI and bet count, then pool.
            tot_pnl = sum((v["roi"] / 100.0 * v["n"]) for v in vs if v["roi"] is not None)
            roi = round(100.0 * tot_pnl / tot_n, 2) if tot_n else None
            print(f" margin {m}: total_bets={tot_n:>6} ROI(flat1u)={roi}%")
    print("\nREAD: BLIND ROI>0 across margins/folds = real edge. Both <=0 = no")
    print("exploitable edge in MS with this data (stop staking; the -EV is the vig).")
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user