@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
|
||||
========================================================================
|
||||
Not "play the handed main_pick". For each match, score EVERY market the model
|
||||
covers, compare model prob vs market implied, and select the single best VALUE
|
||||
bet across all markets. Leak-free, walk-forward, honest.
|
||||
|
||||
Markets (truth derived from scores, not trusted labels):
|
||||
MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.
|
||||
|
||||
Outputs:
|
||||
(A) per-market value ROI -> which bet types actually carry edge
|
||||
(B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
|
||||
fold-consistency, and the model-free baseline.
|
||||
|
||||
⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD; forward
|
||||
paper-trade with real CLV before staking.
|
||||
|
||||
Usage: python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys
|
||||
import numpy as np, pandas as pd, xgboost as xgb
|
||||
|
||||
# Best-effort: switch stdout to UTF-8 when the stream supports reconfiguring.
# NOTE(review): presumably needed so the non-ASCII text in this file (e.g. the
# module docstring shown by --help) prints on consoles with a legacy default
# encoding -- confirm. Failures are deliberately ignored so the script still
# runs with the stream left as-is.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass
|
||||
|
||||
# Directory layout: AI_DIR is the parent of the directory holding this script;
# the training CSV lives under <AI_DIR>/data/.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_DIR = os.path.dirname(_SCRIPT_DIR)
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier / timestamp / score columns: never used as model features.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns excluded from the feature set as leaky (see the name LEAKY).
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}
|
||||
|
||||
# market -> (kind, [odds_cols aligned to classes], [pick labels aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
|
||||
def ou(line):
    """Build a full-time Over/Under truth function for goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` and returns 0 (Over) when
    the full-time goal total ``sh + sa`` exceeds ``line``, otherwise 1 (Under).
    The half-time arguments are accepted for signature parity and ignored.
    """
    def _truth(sh, sa, hh, ha):
        total_goals = sh + sa
        if total_goals > line:
            return 0  # Over
        return 1  # Under

    return _truth
|
||||
def htou(line):
    """Build a half-time Over/Under truth function for goal line ``line``.

    The returned callable takes ``(sh, sa, hh, ha)`` and returns:

    * ``None`` when either half-time score is NaN (market cannot be settled),
    * 0 (Over) when ``hh + ha`` exceeds ``line``,
    * 1 (Under) otherwise.

    The full-time arguments are accepted for signature parity and ignored.
    """
    def _truth(sh, sa, hh, ha):
        # Both sides must be known: a NaN on either side makes the sum NaN,
        # and NaN > line is False, which would silently label the row "Under".
        if np.isnan(hh) or np.isnan(ha):
            return None
        return 0 if (hh + ha) > line else 1

    return _truth
|
||||
def ms_truth(sh, sa, hh, ha):
    """Return the full-time 1X2 class: 0 = home win, 1 = draw, 2 = away win.

    Only the full-time scores ``sh`` / ``sa`` are used; the half-time
    arguments exist for signature parity with the other truth functions.
    """
    if sh > sa:
        return 0
    if sh == sa:
        return 1
    return 2
|
||||
def ht_truth(sh, sa, hh, ha):
    """Return the half-time 1X2 class, or ``None`` when it cannot be settled.

    Classes: 0 = home leads at half time, 1 = level, 2 = away leads.
    ``None`` is returned when either half-time score is NaN. The full-time
    arguments are accepted for signature parity and ignored.
    """
    # Check both sides: with only ``ha`` missing, every comparison against NaN
    # is False and the row would fall through to class 2 (away lead).
    if np.isnan(hh) or np.isnan(ha):
        return None
    if hh > ha:
        return 0
    if hh == ha:
        return 1
    return 2
|
||||
def btts_truth(sh, sa, hh, ha):
    """Return the both-teams-to-score class: 0 = Yes, 1 = No.

    Uses the full-time scores only; the half-time arguments are accepted for
    signature parity with the other truth functions.
    """
    both_scored = sh > 0 and sa > 0
    if both_scored:
        return 0  # Yes
    return 1  # No
|
||||
|
||||
# Market registry. Each value is a 4-tuple:
#   (kind, odds columns, pick labels, truth function)
# Odds columns and pick labels are listed in class-index order, i.e. position k
# corresponds to the class index k returned by the truth function.
# Insertion order is significant: markets are iterated in this order.
MARKETS = {
    "MS": (
        "multi",
        ["odds_ms_h", "odds_ms_d", "odds_ms_a"],
        ["1", "X", "2"],
        ms_truth,
    ),
    "HT": (
        "multi",
        ["odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a"],
        ["1", "X", "2"],
        ht_truth,
    ),
    "OU05": (
        "binary",
        ["odds_ou05_o", "odds_ou05_u"],
        ["Üst", "Alt"],
        ou(0.5),
    ),
    "OU15": (
        "binary",
        ["odds_ou15_o", "odds_ou15_u"],
        ["Üst", "Alt"],
        ou(1.5),
    ),
    "OU25": (
        "binary",
        ["odds_ou25_o", "odds_ou25_u"],
        ["Üst", "Alt"],
        ou(2.5),
    ),
    "OU35": (
        "binary",
        ["odds_ou35_o", "odds_ou35_u"],
        ["Üst", "Alt"],
        ou(3.5),
    ),
    "HT_OU05": (
        "binary",
        ["odds_ht_ou05_o", "odds_ht_ou05_u"],
        ["Üst", "Alt"],
        htou(0.5),
    ),
    "HT_OU15": (
        "binary",
        ["odds_ht_ou15_o", "odds_ht_ou15_u"],
        ["Üst", "Alt"],
        htou(1.5),
    ),
    "BTTS": (
        "binary",
        ["odds_btts_y", "odds_btts_n"],
        ["Var", "Yok"],
        btts_truth,
    ),
}
|
||||
# Booster settings shared by both model kinds.
_TREE_PARAMS = {
    "max_depth": 5,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "verbosity": 0,
}

# Three-way markets: softmax probabilities over 3 classes.
PARAMS_M = {"objective": "multi:softprob", "num_class": 3, **_TREE_PARAMS}

# Two-way markets: single logistic output.
PARAMS_B = {"objective": "binary:logistic", **_TREE_PARAMS}
|
||||
|
||||
|
||||
def _new_tally():
    """Return a fresh bet accumulator: bet count, net profit in units, wins."""
    return {"n": 0, "pnl": 0.0, "win": 0}


def _record_bet(tally, odds, won):
    """Add one unit-stake bet at ``odds`` to ``tally`` and return its profit/loss."""
    pnl = (odds - 1.0) if won else -1.0
    tally["n"] += 1
    tally["pnl"] += pnl
    tally["win"] += won
    return pnl


def _fit_predict(kind, X, truth, te0, te1, rounds):
    """Train one market model on rows ``[:te0]`` and predict rows ``[te0:te1]``.

    Training rows whose truth is ``None`` are dropped. Returns a probability
    matrix with one column per class index: the 3-column softprob output for
    ``kind == "multi"``, otherwise ``[p(class 0), 1 - p(class 0)]``.
    """
    labels = truth[:te0]
    valid = np.array([v is not None for v in labels])
    y = labels[valid].astype(int)
    x_train = X[:te0][valid]
    x_test = xgb.DMatrix(X[te0:te1])
    if kind == "multi":
        bst = xgb.train(PARAMS_M, xgb.DMatrix(x_train, label=y), num_boost_round=rounds)
        return bst.predict(x_test)
    # Binary markets: class index 0 is the positive label for the logistic model.
    pos = (y == 0).astype(int)
    bst = xgb.train(PARAMS_B, xgb.DMatrix(x_train, label=pos), num_boost_round=rounds)
    ppos = bst.predict(x_test)
    return np.column_stack([ppos, 1.0 - ppos])


def _best_value_pick(probs, odds, truth_idx, lo, hi, margin):
    """Return ``(edge, pick_idx, odds, won)`` for the best in-band value pick, or None.

    A pick qualifies when its odds are above 1.0 and inside ``[lo, hi)`` and
    its edge (model probability minus ``1/odds``) exceeds ``margin``. Ties keep
    the lowest pick index.
    """
    best = None
    for k, (p, o) in enumerate(zip(probs, odds)):
        if o <= 1.0:
            continue  # missing odds were filled with 0.0; nothing to price
        edge = p - 1.0 / o
        if edge > margin and lo <= o < hi and (best is None or edge > best[0]):
            best = (edge, k, o, int(truth_idx == k))
    return best


def _fmt_line(name, d):
    """Format one report row (bets, hit rate, ROI, net units) for tally ``d``."""
    n = d["n"]
    roi = 100 * d["pnl"] / n if n else float('nan')
    hit = 100 * d["win"] / n if n else float('nan')
    return f" {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"


def main():
    """Run the walk-forward multi-market value backtest and print both reports.

    Steps:
      1. Parse CLI options (--folds, --estimators, --lo, --hi, --margin).
      2. Load the CSV, sort by ``mst_utc``, drop rows without full-time scores.
      3. Precompute odds and score-derived truth for every market whose odds
         columns are present.
      4. Split the second half of the rows into ``--folds`` consecutive test
         windows; for each window train every market model on all earlier
         rows and score the window.
      5. Per match, take each market's best in-band value pick (report A) and
         the single best pick across markets (report B).

    All bets are settled at unit stake. Nothing is returned; results are
    printed to stdout.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's layout (heading underline, lists) in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=150)
    ap.add_argument("--lo", type=float, default=1.5)
    ap.add_argument("--hi", type=float, default=2.6)
    ap.add_argument("--margin", type=float, default=0.03)
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    SH = sh[ok.values].values.astype(float)
    SA = sa[ok.values].values.astype(float)
    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    N = len(df)
    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")

    # Precompute truth + odds per market; markets without odds columns are skipped.
    MK = {}
    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
        if not all(c in df.columns for c in ocols):
            print(f" skip {mname}: missing odds cols")
            continue
        # Missing odds become 0.0 and are rejected later by the ``o <= 1.0`` test.
        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
        # dtype=object so that None (unsettleable market) survives next to ints.
        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(N)], dtype=object)
        MK[mname] = (kind, O, picks, truth)

    # Walk-forward: the first half is training-only; the rest is cut into folds.
    start = int(N * 0.5)
    bounds = np.linspace(start, N, args.folds + 1, dtype=int)
    min_fold_rows = 50

    per_market = {m: _new_tally() for m in MK}   # (A) best value pick within each market
    sel = {"n": 0, "pnl": 0.0, "win": 0, "fold": {}}  # (B) cross-market selector
    sel_by_mkt = {m: _new_tally() for m in MK}

    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < min_fold_rows:
            print(f" fold {fi}: skipped ({te1-te0} test rows < {min_fold_rows})")
            continue
        # Register the fold up front so a fold with zero selector bets still
        # counts in the "folds positive" denominator.
        sel["fold"][fi] = 0.0

        # market -> (P[n_test, n_picks], odds[n_test, n_picks], truth[n_test])
        cand = {
            m: (_fit_predict(kind, X, truth, te0, te1, args.estimators), O[te0:te1], truth[te0:te1])
            for m, (kind, O, picks, truth) in MK.items()
        }

        for j in range(te1 - te0):
            best = None  # (edge, market, pick_idx, odds, won)
            for m, (P, Ot, Tt) in cand.items():
                t = Tt[j]
                if t is None:
                    continue  # market cannot be settled for this match
                pick = _best_value_pick(P[j], Ot[j], t, args.lo, args.hi, args.margin)
                if pick is None:
                    continue
                edge, k, o, won = pick
                _record_bet(per_market[m], o, won)
                # Strict '>' keeps the earliest market on equal edges.
                if best is None or edge > best[0]:
                    best = (edge, m, k, o, won)

            # Selector: single best value bet across all markets for this match.
            if best is not None:
                edge, m, k, o, won = best
                pnl = _record_bet(sel, o, won)
                sel["fold"][fi] += pnl
                _record_bet(sel_by_mkt[m], o, won)
        print(f" fold {fi}: tested {te1-te0:,}")

    def _neg_roi(m):
        # Sort key: highest ROI first; markets with no bets go last.
        d = per_market[m]
        return -(100 * d["pnl"] / d["n"] if d["n"] else -99)

    print("\n"+"="*70); print(f"(A) PER-MARKET value ROI (best value pick in band [{args.lo},{args.hi}], margin {args.margin})"); print("="*70)
    for m in sorted(per_market, key=_neg_roi):
        print(_fmt_line(m, per_market[m]))

    print("\n"+"="*70); print("(B) CROSS-MARKET SELECTOR (best value bet per match, all markets)"); print("="*70)
    print(_fmt_line("SELECTOR", sel))
    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
    print(f" folds positive: {folds_pos}/{len(sel['fold'])}")
    print(" selector picks distributed across markets:")
    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]["n"]):
        if sel_by_mkt[m]["n"] > 0:
            print(" " + _fmt_line(m, sel_by_mkt[m]).strip())
    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")
|
||||
|
||||
|
||||
# Script entry point: run the backtest only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user