gg3

2026-06-05 00:36:24 +03:00
parent b9700f9fda
commit 9e41407cb5
10 changed files with 1683 additions and 0 deletions
@@ -0,0 +1,182 @@
+"""
+Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
+========================================================================
+Not "play the handed main_pick". For each match, score EVERY market the model
+covers, compare model prob vs market implied, and select the single best VALUE
+bet across all markets. Leak-free, walk-forward, honest.
+
+Markets (truth derived from scores, not trusted labels):
+  MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.
+
+Outputs:
+  (A) per-market value ROI  -> which bet types actually carry edge
+  (B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
+      fold-consistency, and the model-free baseline.
+
+⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD; forward
+paper-trade with real CLV before staking.
+
+Usage: python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
+"""
+from __future__ import annotations
+import argparse, os, sys
+import numpy as np, pandas as pd, xgboost as xgb
+
+# Best-effort switch of stdout to UTF-8. Any error raised by reconfigure() is
+# ignored, leaving the stream as it was.
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+# AI_DIR is the directory two levels above this file; the training CSV is
+# expected at <AI_DIR>/data/training_data_v27.csv.
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_DIR = os.path.dirname(_SCRIPT_DIR)
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+
+# Identifier, timestamp and score columns; main() never uses them as features.
+META = {
+    "match_id",
+    "home_team_id",
+    "away_team_id",
+    "league_id",
+    "mst_utc",
+    "score_home",
+    "score_away",
+    "ht_score_home",
+    "ht_score_away",
+}
+
+# Columns main() also drops from the feature set (named LEAKY by the author).
+LEAKY = {
+    "home_goals_form",
+    "away_goals_form",
+    "total_goals",
+    "ht_total_goals",
+    "squad_diff",
+    "home_squad_quality",
+    "away_squad_quality",
+    "referee_home_bias",
+    "referee_avg_goals",
+}
+
+# market -> (kind, [odds_cols aligned to classes], [pick labels aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
+def ou(line):
+    """Build a full-time over/under truth function for goal line ``line``.
+
+    The returned callable takes (sh, sa, hh, ha) and returns 0 (Over) when
+    sh + sa is greater than ``line``, else 1 (Under). hh/ha are ignored.
+    """
+    def truth(sh, sa, hh, ha):
+        return 0 if (sh + sa) > line else 1
+    return truth
+
+
+def htou(line):
+    """Build a half-time over/under truth function for goal line ``line``.
+
+    The returned callable takes (sh, sa, hh, ha) and returns 0 (Over) when
+    hh + ha is greater than ``line``, else 1 (Under). It returns None when
+    either half-time score is NaN, so a half-missing score is never settled
+    as "Under".
+    """
+    def truth(sh, sa, hh, ha):
+        if np.isnan(hh) or np.isnan(ha):
+            return None
+        return 0 if (hh + ha) > line else 1
+    return truth
+
+
+def ms_truth(sh, sa, hh, ha):
+    """Full-time result class: 0 = home win, 1 = draw, 2 = away win."""
+    if sh > sa:
+        return 0
+    return 1 if sh == sa else 2
+
+
+def ht_truth(sh, sa, hh, ha):
+    """Half-time result class: 0 = home leads, 1 = level, 2 = away leads.
+
+    Returns None when either half-time score is NaN; otherwise a missing away
+    score would fail both comparisons and be settled as class 2.
+    """
+    if np.isnan(hh) or np.isnan(ha):
+        return None
+    if hh > ha:
+        return 0
+    return 1 if hh == ha else 2
+
+
+def btts_truth(sh, sa, hh, ha):
+    """Both-teams-to-score class: 0 = Yes (both sides scored), 1 = No."""
+    return 0 if (sh > 0 and sa > 0) else 1
+
+# Market registry. Each value is (kind, odds columns, pick labels, truth function).
+# Odds columns and pick labels are listed in class-index order: position k in
+# either list corresponds to class k returned by the truth function.
+MARKETS = dict(
+    MS=("multi", ["odds_ms_h", "odds_ms_d", "odds_ms_a"], ["1", "X", "2"], ms_truth),
+    HT=("multi", ["odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a"], ["1", "X", "2"], ht_truth),
+    OU05=("binary", ["odds_ou05_o", "odds_ou05_u"], ["Üst", "Alt"], ou(0.5)),
+    OU15=("binary", ["odds_ou15_o", "odds_ou15_u"], ["Üst", "Alt"], ou(1.5)),
+    OU25=("binary", ["odds_ou25_o", "odds_ou25_u"], ["Üst", "Alt"], ou(2.5)),
+    OU35=("binary", ["odds_ou35_o", "odds_ou35_u"], ["Üst", "Alt"], ou(3.5)),
+    HT_OU05=("binary", ["odds_ht_ou05_o", "odds_ht_ou05_u"], ["Üst", "Alt"], htou(0.5)),
+    HT_OU15=("binary", ["odds_ht_ou15_o", "odds_ht_ou15_u"], ["Üst", "Alt"], htou(1.5)),
+    BTTS=("binary", ["odds_btts_y", "odds_btts_n"], ["Var", "Yok"], btts_truth),
+)
+
+# Booster settings common to both objectives; only the objective (and the
+# class count for the 3-way markets) differs between PARAMS_M and PARAMS_B.
+_PARAMS_SHARED = {
+    "max_depth": 5,
+    "eta": 0.05,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "tree_method": "hist",
+    "verbosity": 0,
+}
+PARAMS_M = {"objective": "multi:softprob", "num_class": 3, **_PARAMS_SHARED}
+PARAMS_B = {"objective": "binary:logistic", **_PARAMS_SHARED}
+
+
+def _fit_predict(kind, X_train, y_train, X_test, rounds):
+    """Train one market model and return class probabilities for ``X_test``.
+
+    ``y_train`` holds class indices. A "multi" market is trained with PARAMS_M
+    and its prediction matrix is returned as-is. Any other kind is trained
+    with PARAMS_B using "class 0" as the positive label, and the result is
+    stacked as [p(class 0), 1 - p(class 0)].
+    """
+    dtest = xgb.DMatrix(X_test)
+    if kind == "multi":
+        bst = xgb.train(PARAMS_M, xgb.DMatrix(X_train, label=y_train), num_boost_round=rounds)
+        return bst.predict(dtest)                                  # [n,3]
+    pos = (y_train == 0).astype(int)                               # class 0 = positive
+    bst = xgb.train(PARAMS_B, xgb.DMatrix(X_train, label=pos), num_boost_round=rounds)
+    ppos = bst.predict(dtest)
+    return np.column_stack([ppos, 1.0 - ppos])                     # [n,2] -> [pos,neg]
+
+
+def _best_value_pick(probs, odds, truth, lo, hi, margin):
+    """Return the highest-edge pick of one market for one match, or None.
+
+    Pick k qualifies when its odds are above 1.0 and inside [lo, hi) and
+    probs[k] - 1/odds[k] is greater than ``margin``. On equal edge the lowest
+    index wins. The result is (edge, k, odds, won) with won = int(truth == k).
+    """
+    best = None
+    for k, (p, o) in enumerate(zip(probs, odds)):
+        # Odds <= 1.0 also covers missing odds, which main() fills with 0.0.
+        if o <= 1.0 or not (lo <= o < hi):
+            continue
+        edge = p - 1.0 / o
+        if edge > margin and (best is None or edge > best[0]):
+            best = (edge, k, o, int(truth == k))
+    return best
+
+
+def _settle(acc, odds, won):
+    """Book one 1-unit bet into accumulator ``acc`` and return its P&L."""
+    pnl = (odds - 1.0) if won else -1.0
+    acc["n"] += 1
+    acc["pnl"] += pnl
+    acc["win"] += won
+    return pnl
+
+
+def _report_line(name, d):
+    """Format bets / hit rate / ROI / net units for one accumulator dict."""
+    n = d["n"]
+    roi = 100 * d["pnl"] / n if n else float('nan')
+    hit = 100 * d["win"] / n if n else float('nan')
+    return f"  {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"
+
+
+def main():
+    """Run the walk-forward multi-market value backtest and print the report.
+
+    Loads CSV sorted by ``mst_utc`` and keeps rows with numeric full-time
+    scores. Every column outside META, LEAKY and ``label_*`` is a feature.
+    The second half of the rows is split into ``--folds`` consecutive test
+    windows; windows shorter than 50 rows are skipped. For each window one
+    model per market is trained on all rows before the window.
+
+    A pick is a value bet when model prob - 1/odds > ``--margin`` and its odds
+    lie in [``--lo``, ``--hi``). Report (A) stakes one unit on the best value
+    pick of every market per match. Report (B) stakes one unit on the single
+    best value pick across all markets per match.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--folds", type=int, default=5)
+    ap.add_argument("--estimators", type=int, default=150)
+    ap.add_argument("--lo", type=float, default=1.5)
+    ap.add_argument("--hi", type=float, default=2.6)
+    ap.add_argument("--margin", type=float, default=0.03)
+    args = ap.parse_args()
+
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df = df[ok].reset_index(drop=True)
+    SH = sh[ok.values].values.astype(float)
+    SA = sa[ok.values].values.astype(float)
+    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
+    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    N = len(df)
+    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")
+
+    # precompute truth + odds per market
+    MK = {}
+    for mname, (kind, ocols, picks, tfn) in MARKETS.items():
+        if not all(c in df.columns for c in ocols):
+            print(f"  skip {mname}: missing odds cols"); continue
+        # Missing/unparseable odds become 0.0 and are rejected later by the odds > 1.0 check.
+        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+        # dtype=object keeps None for rows whose truth cannot be derived.
+        truth = np.array([tfn(*scores) for scores in zip(SH, SA, HH, HA)], dtype=object)
+        MK[mname] = (kind, O, picks, truth)
+
+    # Test windows tile the second half of the rows: [bounds[i], bounds[i+1]).
+    start = int(N * 0.5)
+    bounds = np.linspace(start, N, args.folds + 1, dtype=int)
+
+    # accumulators
+    per_market = {m: {"n": 0, "pnl": 0.0, "win": 0} for m in MK}   # (A) best value pick within market
+    sel = {"n": 0, "pnl": 0.0, "win": 0, "fold": {}}               # (B) cross-market selector
+    sel_by_mkt = {m: {"n": 0, "pnl": 0.0, "win": 0} for m in MK}
+
+    for fi in range(args.folds):
+        te0, te1 = bounds[fi], bounds[fi + 1]
+        if te1 - te0 < 50:
+            continue
+        # train each market model on [:te0], predict test
+        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
+        for m, (kind, O, picks, truth) in MK.items():
+            ytr_full = truth[:te0]
+            # mask invalid truth (e.g., HT markets with missing HT score)
+            valid_tr = np.array([v is not None for v in ytr_full])
+            ytr = ytr_full[valid_tr].astype(int)
+            P = _fit_predict(kind, X[:te0][valid_tr], ytr, X[te0:te1], args.estimators)
+            cand[m] = (P, O[te0:te1], truth[te0:te1])
+
+        # iterate test matches
+        for j in range(te1 - te0):
+            best = None  # (edge, market, pickidx, odds, won)
+            for m, (P, Ot, Tt) in cand.items():
+                t = Tt[j]
+                if t is None:
+                    continue
+                pick = _best_value_pick(P[j], Ot[j], t, args.lo, args.hi, args.margin)
+                if pick is None:
+                    continue
+                edge, k, o, won = pick
+                # (A) the market's single best in-band value pick for this match
+                _settle(per_market[m], o, won)
+                # (B) strict '>' keeps the earlier market when edges tie
+                if best is None or edge > best[0]:
+                    best = (edge, m, k, o, won)
+            # selector: single best value bet across all markets for this match
+            if best is not None:
+                edge, m, k, o, won = best
+                pnl = _settle(sel, o, won)
+                sel["fold"][fi] = sel["fold"].get(fi, 0.0) + pnl
+                _settle(sel_by_mkt[m], o, won)
+        print(f"  fold {fi}: tested {te1-te0:,}")
+
+    def neg_roi(m):
+        # Sort key for descending ROI; a market without bets ranks as ROI -99%.
+        d = per_market[m]
+        return -(100 * d["pnl"] / d["n"] if d["n"] else -99)
+
+    print("\n"+"="*70); print(f"(A) PER-MARKET value ROI  (best value pick in band [{args.lo},{args.hi}], margin {args.margin})"); print("="*70)
+    for m in sorted(per_market, key=neg_roi):
+        print(_report_line(m, per_market[m]))
+
+    print("\n"+"="*70); print("(B) CROSS-MARKET SELECTOR  (best value bet per match, all markets)"); print("="*70)
+    print(_report_line("SELECTOR", sel))
+    # Only folds in which the selector placed at least one bet appear in sel["fold"].
+    folds_pos = sum(1 for v in sel["fold"].values() if v > 0)
+    print(f"  folds positive: {folds_pos}/{len(sel['fold'])}")
+    print("  selector picks distributed across markets:")
+    for m in sorted(sel_by_mkt, key=lambda x: -sel_by_mkt[x]['n']):
+        if sel_by_mkt[m]["n"] > 0:
+            print("   " + _report_line(m, sel_by_mkt[m]).strip())
+    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
+    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")
+
+
+# Run the backtest only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()