gg3

2026-06-05 00:36:24 +03:00
parent b9700f9fda
commit 9e41407cb5
10 changed files with 1683 additions and 0 deletions
@@ -0,0 +1,137 @@
+"""
+Analyze Match v2 — the per-match multi-market value board + disciplined pick.
+===========================================================================
+Answers "for ONE match, show every bet type's probability + model signal +
+market-vs-model value, and pick the right bet." Leak-free models.
+
+KEY HONEST RULE (proven by multi_market_edge.py): compute & SHOW value for all
+markets, but only MS (1X2) carries real, fold-consistent model edge. In OU/HT/
+BTTS the market is efficient — a big model-vs-market gap there is the MODEL'S
+ERROR, not value. So non-MS rows are INFO-ONLY; only an MS value bet in the
+favourite band is STAKED.
+
+Demo: trains all market models on the first 85% of history, then prints the full
+board for sample matches in the unseen last 15% (with what actually happened).
+
+Usage:
+  python scripts/analyze_match_v2.py --n 6
+  python scripts/analyze_match_v2.py --match <match_id>
+"""
+from __future__ import annotations
+import argparse, os, sys
+import numpy as np, pandas as pd, xgboost as xgb
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+# Parent of the directory that holds this script; the training CSV lives under <AI_DIR>/data.
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+# Identifier / timestamp / score columns: never used as model features.
+META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
+        "score_home","score_away","ht_score_home","ht_score_away"}
+# Columns excluded from the feature set so the models stay "leak-free"
+# (the module docstring treats them as leakage; they are dropped alongside META).
+LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
+         "squad_diff","home_squad_quality","away_squad_quality",
+         "referee_home_bias","referee_avg_goals"}
+# A pick is only staked when STAKE_LO <= odds < STAKE_HI ...
+STAKE_LO, STAKE_HI = 1.5, 2.4   # MS favourite band that staking is allowed in
+# ... and when model probability exceeds the implied probability by more than this.
+STAKE_MARGIN = 0.03
+
+def ou(line):
+    """Build the truth function for a full-time over/under market at ``line``.
+
+    The returned callable takes (sh, sa, hh, ha) and gives 0 when the full-time
+    goal total is above ``line`` (the "over" index) and 1 otherwise ("under").
+    The half-time arguments are accepted only to share the MARKETS signature.
+    """
+    def _truth(sh, sa, hh, ha):
+        total = sh + sa
+        if total > line:
+            return 0
+        return 1
+    return _truth
+def htou(line):
+    """Build the truth function for a first-half over/under market at ``line``.
+
+    The returned callable takes (sh, sa, hh, ha) and gives None when the home
+    half-time score is NaN (no half-time data), 0 when the half-time goal total
+    is above ``line`` and 1 otherwise.
+    """
+    def _truth(sh, sa, hh, ha):
+        if np.isnan(hh):
+            return None
+        half_total = hh + ha
+        return 0 if half_total > line else 1
+    return _truth
+# Market registry. Each value is (kind, odds columns, pick labels, truth function):
+#   kind   - "multi" (3-way, trained with PM) or "binary" (2-way, trained with PB)
+#   odds   - CSV columns holding the decimal odds, in the same order as the labels
+#   labels - display names of the picks (Turkish: Üst/Alt = over/under,
+#            "KG Var"/"KG Yok" = both teams score yes/no, "İY" prefix = first half)
+#   truth  - f(sh, sa, hh, ha) -> index of the winning pick, or None when the
+#            half-time score is missing (NaN) for a half-time market
+MARKETS = {
+  "MS":      ("multi", ["odds_ms_h","odds_ms_d","odds_ms_a"], ["1","X","2"],
+              lambda sh,sa,hh,ha: 0 if sh>sa else (1 if sh==sa else 2)),
+  "OU25":    ("binary",["odds_ou25_o","odds_ou25_u"], ["2.5Üst","2.5Alt"], ou(2.5)),
+  "OU15":    ("binary",["odds_ou15_o","odds_ou15_u"], ["1.5Üst","1.5Alt"], ou(1.5)),
+  "OU35":    ("binary",["odds_ou35_o","odds_ou35_u"], ["3.5Üst","3.5Alt"], ou(3.5)),
+  "BTTS":    ("binary",["odds_btts_y","odds_btts_n"], ["KG Var","KG Yok"],
+              lambda sh,sa,hh,ha: 0 if (sh>0 and sa>0) else 1),
+  "HT":      ("multi", ["odds_ht_ms_h","odds_ht_ms_d","odds_ht_ms_a"], ["İY1","İYX","İY2"],
+              lambda sh,sa,hh,ha: None if np.isnan(hh) else (0 if hh>ha else (1 if hh==ha else 2))),
+  "HT_OU15": ("binary",["odds_ht_ou15_o","odds_ht_ou15_u"], ["İY1.5Üst","İY1.5Alt"], htou(1.5)),
+}
+STAKED_MARKETS = {"MS"}   # only these are bet; rest are info-only
+# XGBoost parameters: PM for the 3-class markets, PB for the 2-way markets.
+# Binary models are trained on "truth index == 0", so their output is the
+# probability of the FIRST pick label.
+PM = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+PB = {"objective":"binary:logistic","max_depth":5,"eta":0.05,"subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+
+
+def _fmt_rating(value):
+    """Render a rating as a whole number, or '?' when it is missing or non-numeric."""
+    try:
+        num = float(value)
+    except (TypeError, ValueError):
+        return "?"
+    return "?" if np.isnan(num) else f"{num:.0f}"
+
+
+def main():
+    """Train one model per market on the oldest 85% of rows and print value boards.
+
+    Without ``--match`` a board is printed for ``--n`` matches spread evenly over
+    the chronological last 15% (rows the models never saw). With ``--match`` the
+    given match_id is shown wherever it sits in the data; if it falls inside the
+    training window the board is flagged as in-sample, because the model has
+    already seen that result.
+
+    For every market the board lists model probability, implied probability
+    (1/odds), their difference (edge) and the actual result. Only picks from
+    STAKED_MARKETS whose edge exceeds STAKE_MARGIN and whose odds lie in
+    [STAKE_LO, STAKE_HI) are marked as a bet; the largest such edge is the stake.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--n", type=int, default=6, help="how many sample matches")
+    ap.add_argument("--match", help="specific match_id")
+    ap.add_argument("--estimators", type=int, default=250)
+    args = ap.parse_args()
+
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()   # keep only matches with a usable final score
+    df = df[ok].reset_index(drop=True)
+    SH = sh[ok.values].values.astype(float)
+    SA = sa[ok.values].values.astype(float)
+    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
+    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    N = len(df); cut = int(N*0.85)
+    if cut <= 0 or cut >= N:
+        print(f"Not enough scored matches ({N}) for an 85/15 train/holdout split.")
+        return
+    print(f"Training {len(MARKETS)} leak-free market models on {cut:,} matches ...")
+
+    models = {}
+    for m, (kind, ocols, picks, tfn) in MARKETS.items():
+        if not all(c in df.columns for c in ocols):
+            continue
+        truth = np.array([tfn(SH[i], SA[i], HH[i], HA[i]) for i in range(cut)], dtype=object)
+        valid = np.array([v is not None for v in truth])
+        if not valid.any():
+            # e.g. a half-time market with no half-time scores: nothing to learn from
+            continue
+        labels = truth[valid].astype(int)
+        if kind == "multi":
+            params, y = PM, labels
+        else:
+            # binary target = "first pick label wins" (truth index 0)
+            params, y = PB, (labels == 0).astype(int)
+        b = xgb.train(params, xgb.DMatrix(X[:cut][valid], label=y), num_boost_round=args.estimators)
+        models[m] = (kind, ocols, picks, tfn, b)
+
+    # choose which rows (positions in df / X) to display
+    if args.match:
+        rows = df.index[df["match_id"].astype(str) == str(args.match)].tolist()
+        if not rows:
+            print(f"match_id {args.match} not found among scored matches in {CSV}")
+            return
+    else:
+        n_hold = N - cut
+        # cap at the holdout size so linspace cannot repeat a match
+        n_pick = max(0, min(args.n, n_hold))
+        rows = [cut + int(p) for p in np.linspace(0, n_hold - 1, n_pick, dtype=int)]
+
+    for gi in rows:
+        r = df.iloc[gi]
+        xrow = X[gi:gi+1]
+        sh_, sa_, hh_, ha_ = SH[gi], SA[gi], HH[gi], HA[gi]
+        ht = f"{int(hh_)}-{int(ha_)}" if not np.isnan(hh_) else "?"
+        print("\n"+"="*72)
+        print(f"MATCH {r['match_id']}  | elo H{_fmt_rating(r.get('home_overall_elo'))} vs A{_fmt_rating(r.get('away_overall_elo'))}"
+              f"  | ACTUAL {int(sh_)}-{int(sa_)} (HT {ht})")
+        if gi < cut:
+            print("  !! IN-SAMPLE: this match is inside the training window; the board below is optimistic.")
+        print(f"  {'market':<8}{'pick':<10}{'model%':>8}{'impl%':>7}{'edge':>7}{'odds':>7}  flag   result")
+        print("  "+"-"*64)
+        best_ms = None
+        for m, (kind, ocols, picks, tfn, b) in models.items():
+            if kind == "multi":
+                P = b.predict(xgb.DMatrix(xrow))[0]
+            else:
+                p = float(b.predict(xgb.DMatrix(xrow))[0]); P = np.array([p, 1-p])
+            O = pd.to_numeric(r[ocols], errors="coerce").fillna(0.0).values
+            truth = tfn(sh_, sa_, hh_, ha_)
+            for k in range(len(picks)):
+                o = O[k]
+                if o <= 1.0:
+                    continue   # missing / unusable price
+                imp = 1.0/o; edge = P[k]-imp
+                res = "—" if truth is None else ("WON" if truth == k else "lost")
+                staked = (m in STAKED_MARKETS) and edge > STAKE_MARGIN and STAKE_LO <= o < STAKE_HI
+                flag = "★BET" if staked else ("val" if edge > STAKE_MARGIN else "")
+                print(f"  {m:<8}{picks[k]:<10}{100*P[k]:>7.1f}{100*imp:>7.1f}{100*edge:>+7.1f}{o:>7.2f}  {flag:<5} {res}")
+                if staked and (best_ms is None or edge > best_ms[0]):
+                    best_ms = (edge, m, picks[k], o, res)
+        print("  "+"-"*64)
+        if best_ms:
+            e, m, p, o, res = best_ms
+            print(f"  >>> STAKE: {m} {p} @ {o:.2f}  (edge +{100*e:.1f}%, favourite band)  -> {res}")
+        else:
+            print(f"  >>> NO STAKE: no MS value in favourite band. (Other markets info-only —")
+            print(f"      their 'value' is model error in efficient markets; do NOT chase it.)")
+    print("\nNOTE: only MS staked (proven edge). All markets shown for transparency.")
+    print("Forward-validate with CLV before real money. Static CSV odds may overstate edge.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,113 @@
+"""
+Betting Policy — the honest, leak-free strategy the data actually supports.
+==========================================================================
+Everything else in this repo bet UNDERDOGS (odds 6-7.5) and lost (-43.7% live).
+The data says the opposite: the only positive, fold-consistent, model-driven
+signal is MILD FAVOURITES the model rates above the market price.
+
+POLICY (MS / 1X2 only):
+  * leak-free model (drops the result-encoding features, see LEAKY)
+  * bet the model's single biggest value edge (model_prob - implied) ...
+  * ONLY if the picked side's odds are in [--lo, --hi]  (favourite band)
+  * ONLY if that edge > --margin
+  * flat 1u stake, one bet per match, never a longshot, never a parlay.
+
+Walk-forward, no leakage. Reports the policy ROI, fold consistency, drawdown,
+and the model-free baseline (blind favourite) so you can see the model's lift.
+
+⚠️ HONEST CAVEAT: CSV odds are a static capture, not the verified obtainable
+closing line. A small backtest edge here is a LEAD, not a guarantee. Forward
+paper-trade with real CLV (capture_closing_odds.py) before risking money.
+
+Usage: python scripts/betting_policy.py --lo 1.5 --hi 2.2 --margin 0.0 --folds 8
+"""
+from __future__ import annotations
+import argparse, os, sys
+import numpy as np, pandas as pd, xgboost as xgb
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+# Parent of the directory that holds this script; the training CSV lives under <AI_DIR>/data.
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+# Identifier / timestamp / score columns: never used as model features.
+META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
+        "score_home","score_away","ht_score_home","ht_score_away"}
+# Result-encoding columns dropped from the feature set to keep the model leak-free.
+LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
+         "squad_diff","home_squad_quality","away_squad_quality",
+         "referee_home_bias","referee_avg_goals"}
+
+
+def main():
+    """Walk-forward backtest of the favourite-band MS (1X2) value policy.
+
+    The second half of the chronologically sorted data is split into ``--folds``
+    test windows. For each window a 3-class model is trained on everything
+    before it, the pick with the largest (model probability - implied
+    probability) is selected per match, and a flat 1u bet is placed when that
+    edge exceeds ``--margin`` and the pick's odds lie in [--lo, --hi).
+    A model-free baseline (always back the lowest-priced side in the same band)
+    is reported next to it. Aggregate output covers ROI, hit rate, positive
+    folds and maximum drawdown measured from the starting bankroll.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--lo", type=float, default=1.5)
+    ap.add_argument("--hi", type=float, default=2.2)
+    ap.add_argument("--margin", type=float, default=0.0)
+    ap.add_argument("--folds", type=int, default=8)
+    ap.add_argument("--estimators", type=int, default=250)
+    args = ap.parse_args()
+
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
+    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))   # 0=home, 1=draw, 2=away
+    # missing odds become 0.0 and are treated as "no price" further down
+    O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+
+    n = len(df); start = int(n*0.5)
+    bounds = np.linspace(start, n, args.folds+1, dtype=int)
+    params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
+              "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+
+    print(f"POLICY: favourite band [{args.lo},{args.hi}]  margin {args.margin}  "
+          f"leak-free feats={len(feats)}  folds={args.folds}\n")
+    all_pnl = []; fold_rows = []; base_pnl = []
+    for fi in range(args.folds):
+        te0, te1 = bounds[fi], bounds[fi+1]
+        if te1 - te0 < 50:
+            continue   # window too small to say anything
+        # expanding window: train on everything strictly before the test window
+        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
+        P = bst.predict(xgb.DMatrix(X[te0:te1]))
+        yte, Ote = y[te0:te1], O[te0:te1]
+        rows = np.arange(len(yte))
+        # np.where evaluates 1/odds for every cell, including the 0.0 placeholders
+        with np.errstate(divide="ignore"):
+            implied = np.where(Ote > 1.0, 1.0/Ote, np.nan)
+        edge = np.where(np.isnan(implied), -9.0, P-implied)   # -9 keeps unpriced sides from being picked
+        pick = edge.argmax(1); pe = edge[rows, pick]; po = Ote[rows, pick]
+        bet = (pe > args.margin) & (po >= args.lo) & (po < args.hi)
+        win = (pick == yte) & bet
+        pnl = np.where(win, po-1.0, -1.0)[bet]
+        # model-free baseline: blind favourite in same band
+        fav = Ote.argmin(1); fo = Ote[rows, fav]
+        bmask = (fo >= args.lo) & (fo < args.hi) & (Ote > 1.0).all(1)
+        bpnl = np.where(fav[bmask] == yte[bmask], fo[bmask]-1.0, -1.0)
+        roi = 100*pnl.sum()/len(pnl) if len(pnl) else float('nan')
+        broi = 100*bpnl.sum()/len(bpnl) if len(bpnl) else float('nan')
+        hit = 100*win.sum()/max(bet.sum(), 1)
+        fold_rows.append((fi, len(pnl), hit, roi, broi))
+        all_pnl.extend(pnl.tolist()); base_pnl.extend(bpnl.tolist())
+        print(f"  fold {fi}: policy_bets={len(pnl):>4}  hit={hit:>5.1f}%  "
+              f"ROI={roi:>7.2f}%   | baseline(blind fav) ROI={broi:>7.2f}%")
+
+    a = np.array(all_pnl); b = np.array(base_pnl)
+    print("\n"+"="*70)
+    print("AGGREGATE")
+    print("="*70)
+    if len(a):
+        # Equity curve starts at 0u so a losing streak right at the start counts as drawdown.
+        equity = np.concatenate(([0.0], np.cumsum(a)))
+        dd = (equity - np.maximum.accumulate(equity)).min()
+        folds_pos = sum(1 for r in fold_rows if r[3] > 0)
+        print(f"  POLICY:   bets={len(a):>5}  hit={100*(a>0).mean():.1f}%  "
+              f"ROI={100*a.mean():+.2f}%  net={a.sum():+.1f}u  maxDD={dd:.1f}u  "
+              f"folds+={folds_pos}/{len(fold_rows)}")
+    else:
+        print("  POLICY:   no qualifying bets in any fold")
+    if len(b):
+        print(f"  BASELINE: bets={len(b):>5}  hit={100*(b>0).mean():.1f}%  "
+              f"ROI={100*b.mean():+.2f}%  (blind favourite, same band)")
+    if len(a) and len(b):
+        # only meaningful when both sides actually placed bets
+        print(f"\n  MODEL LIFT over blind favourite: "
+              f"{100*a.mean()-100*b.mean():+.1f} percentage points")
+    print("\nREAD: a believable system has ROI>0, folds+ near full, tolerable maxDD,")
+    print("and clearly beats the blind-favourite baseline. Even then it's a LEAD —")
+    print("forward paper-trade with real CLV before staking real money.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,136 @@
+"""
+Capture Closing Odds — snapshot #2 of the minimal 2-snapshot CLV system.
+=======================================================================
+WHY: CLV (closing line value) is the only reliable proof of betting edge.
+This codebase never captured it: odds are stored as a single static snapshot
+and `odds_history` is empty. But the live sync (DataFetcherTask CRON 1) DOES
+refresh `live_matches.odds` every 15 min before kickoff, and prediction_runs
+already store the bet-time odds blob (odds_snapshot.odds, source=live_match).
+
+This script supplies the missing half: just before kickoff it copies the
+*current* live odds blob onto the match's latest prediction_run as
+`odds_snapshot.closing_odds`. Later, CLV per bet = bet-time pick odds vs
+closing pick odds (computed in live_scoreboard.py once enough data exists).
+
+Run it every ~15 min (e.g. alongside the existing sync, or its own cron):
+  python scripts/capture_closing_odds.py            # default 25-min window
+  python scripts/capture_closing_odds.py --window-min 20 --dry-run
+
+Structure-agnostic: stores the whole live odds blob; no pick parsing here.
+Idempotent: skips runs that already have closing_odds. Only ADDS a JSON key,
+never deletes. Safe to run repeatedly.
+
+⚠️ Needs one supervised test run against a live DB with upcoming matches
+   before scheduling (DB was down at authoring time).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
+sys.path.insert(0, AI_ENGINE_DIR)
+
+from data.db import get_clean_dsn  # noqa: E402
+import psycopg2  # noqa: E402
+from psycopg2.extras import RealDictCursor  # noqa: E402
+
+
+def main() -> int:
+    """Copy the current live odds onto each upcoming match's latest prediction run.
+
+    Selects ``live_matches`` rows that have odds and whose ``mst_utc`` lies
+    between now - ``--grace-min`` and now + ``--window-min`` (compared as epoch
+    milliseconds), finds the newest ``prediction_runs`` row for each, and merges
+    a ``closing_odds`` patch into its ``odds_snapshot`` unless one is already
+    there. With ``--dry-run`` nothing is written.
+
+    All updates share one transaction: they are committed together when the
+    loop finishes and rolled back together if anything raises.
+
+    NOTE(review): because runs that already hold ``closing_odds`` are skipped,
+    the stored value is the FIRST capture inside the window, which may be up to
+    ``--window-min`` minutes before kickoff — confirm that is close enough to
+    the real closing line.
+
+    Returns:
+        0 on completion (used as the process exit code).
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--window-min", type=int, default=25,
+                    help="Capture matches kicking off within the next N minutes (default 25)")
+    ap.add_argument("--grace-min", type=int, default=10,
+                    help="Also include matches that kicked off up to N min ago (default 10)")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="Report what would be captured without writing")
+    args = ap.parse_args()
+
+    now_ms = int(time.time() * 1000)
+    lo_ms = now_ms - args.grace_min * 60 * 1000
+    hi_ms = now_ms + args.window_min * 60 * 1000
+
+    captured = skipped = no_run = 0
+    conn = psycopg2.connect(get_clean_dsn())
+    try:
+        # `with conn` only scopes the transaction (commit on success, rollback on
+        # error); psycopg2 does NOT close the connection there, hence the finally.
+        with conn:
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                # Upcoming/just-started live matches that still hold pre-kickoff odds.
+                cur.execute(
+                    """
+                    SELECT id, mst_utc, odds
+                    FROM live_matches
+                    WHERE odds IS NOT NULL
+                      AND mst_utc BETWEEN %s AND %s
+                    ORDER BY mst_utc ASC
+                    """,
+                    (lo_ms, hi_ms),
+                )
+                matches = cur.fetchall()
+                print(f"[capture_closing_odds] window={args.window_min}m grace={args.grace_min}m "
+                      f"upcoming_with_odds={len(matches)} dry_run={args.dry_run}")
+
+                for m in matches:
+                    mid = m["id"]
+                    cur.execute(
+                        """
+                        SELECT id, odds_snapshot
+                        FROM prediction_runs
+                        WHERE match_id = %s
+                        ORDER BY generated_at DESC
+                        LIMIT 1
+                        """,
+                        (mid,),
+                    )
+                    run = cur.fetchone()
+                    if not run:
+                        no_run += 1
+                        continue
+                    snap = run["odds_snapshot"] or {}
+                    if isinstance(snap, str):
+                        try:
+                            snap = json.loads(snap)
+                        except Exception:
+                            snap = {}
+                    if not isinstance(snap, dict):
+                        # Same best-effort treatment as unparseable JSON: one odd
+                        # snapshot must not abort (and roll back) the whole batch.
+                        snap = {}
+                    if snap.get("closing_odds") is not None:
+                        skipped += 1
+                        continue
+
+                    patch = {
+                        "closing_odds": m["odds"],
+                        "closing_captured_at": datetime.now(timezone.utc).isoformat(),
+                        "closing_mst_utc": m["mst_utc"],
+                        "closing_source": "live_match",
+                    }
+                    if args.dry_run:
+                        captured += 1
+                        print(f"  would capture match={mid} run_id={run['id']} mst_utc={m['mst_utc']}")
+                        continue
+                    cur.execute(
+                        """
+                        UPDATE prediction_runs
+                        SET odds_snapshot = COALESCE(odds_snapshot, '{}'::jsonb) || %s::jsonb
+                        WHERE id = %s
+                        """,
+                        (json.dumps(patch, default=str), run["id"]),
+                    )
+                    captured += 1
+        # leaving `with conn` committed the updates (a dry run wrote nothing)
+    finally:
+        conn.close()
+
+    print(f"[capture_closing_odds] captured={captured} already_had={skipped} "
+          f"no_prediction_run={no_run}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,224 @@
+"""
+CLV Report — the single most important edge metric.
+===================================================
+Closing Line Value = did we bet at better odds than the market's closing line?
+Consistently positive CLV is the only reliable proof of a real betting edge;
+negative CLV means no edge, regardless of short-term wins/losses.
+
+This codebase stores the BET-TIME odds for ~92% of runs (prediction_runs.
+odds_snapshot.source = 'live_match' with the live odds blob, and the pick's
+odds in payload main_pick.odds). For the closing line we use, in order:
+  1. odds_snapshot.closing_odds  (captured by capture_closing_odds.py, forward)
+  2. odd_selections current value (the static near-final capture — a proxy)
+
+CLV per bet = bet_odds / closing_odds - 1   (positive = beat the close = good).
+
+Read-only. SELECT only.
+Usage:
+  python scripts/clv_report.py
+  python scripts/clv_report.py --staked-only
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from collections import defaultdict
+from typing import Any, Dict, Optional, Tuple
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
+sys.path.insert(0, AI_ENGINE_DIR)
+
+from data.db import get_clean_dsn  # noqa: E402
+import psycopg2  # noqa: E402
+from psycopg2.extras import RealDictCursor  # noqa: E402
+
+# Over/under market code -> Turkish odds-category name. map_pick() returns this
+# name as the category; it is looked up as a key of the live-odds blob and
+# matched against odd_categories.name. ("Alt/Üst" = under/over.)
+OU_CATS = {"OU05": "0,5 Alt/Üst", "OU15": "1,5 Alt/Üst", "OU25": "2,5 Alt/Üst",
+           "OU35": "3,5 Alt/Üst", "OU45": "4,5 Alt/Üst"}
+
+
+def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
+    """Coerce ``x`` to float, returning ``d`` for None or unconvertible input."""
+    if x is None:
+        return d
+    try:
+        value = float(x)
+    except (TypeError, ValueError):
+        return d
+    return value
+
+
+def _parse(j: Any) -> Dict[str, Any]:
+    """Return ``j`` as a dict, decoding it from JSON text when it is a string.
+
+    Anything that is not (or does not decode to) a JSON object — None, invalid
+    JSON, a list, a bare number — yields an empty dict, so callers can always
+    use ``.get`` on the result.
+    """
+    if isinstance(j, str):
+        try:
+            j = json.loads(j)
+        except (ValueError, RecursionError):
+            # json.JSONDecodeError is a ValueError; malformed text is treated as empty
+            return {}
+    return j if isinstance(j, dict) else {}
+
+
+def map_pick(market: str, pick: str) -> Optional[Tuple[str, str]]:
+    """Return (category_name, selection_key) for the live-odds JSON / odd_selections.
+
+    ``market`` is matched case-insensitively. Result picks (MS/ML/1X2 and HT)
+    must be "1", "X" or "2" (any case); the other markets are recognised by a
+    Turkish or English keyword anywhere in ``pick``. Returns None when the
+    market is unknown or the pick cannot be recognised.
+    """
+    m = (market or "").upper()
+    p = (pick or "").strip()
+    # casefold() turns the Turkish capital "İ" into "i" + U+0307 (combining dot
+    # above); drop that mark so upper-case picks such as "ÇİFT" still contain
+    # the lower-case keyword "çift".
+    pl = p.casefold().replace("\u0307", "")
+    if m in ("MS", "ML", "1X2", "HT"):
+        sel = p.upper()
+        if sel not in ("1", "X", "2"):
+            return None
+        return ("1. Yarı Sonucu" if m == "HT" else "Maç Sonucu", sel)
+    if m == "DC":
+        key = p.upper().replace(" ", "").replace("/", "-")
+        norm = {"1X": "1-X", "X1": "1-X", "X2": "X-2", "2X": "X-2",
+                "12": "1-2", "21": "1-2", "1-X": "1-X", "X-2": "X-2", "1-2": "1-2"}.get(key)
+        return ("Çifte Şans", norm) if norm else None
+    if m == "BTTS":
+        if "var" in pl or "yes" in pl:
+            return ("Karşılıklı Gol", "Var")
+        if "yok" in pl or "no" in pl:
+            return ("Karşılıklı Gol", "Yok")
+        return None
+    if m == "OE":
+        if "tek" in pl or "odd" in pl:
+            return ("Tek/Çift", "Tek")
+        if "çift" in pl or "cift" in pl or "even" in pl:
+            return ("Tek/Çift", "Çift")
+        return None
+    if m in OU_CATS:
+        if "üst" in pl or "ust" in pl or "over" in pl:
+            return (OU_CATS[m], "Üst")
+        if "alt" in pl or "under" in pl:
+            return (OU_CATS[m], "Alt")
+        return None
+    return None
+
+
+def closing_from_blob(blob: Any, cat: str, sel: str) -> Optional[float]:
+    """Look up the odds for selection ``sel`` of category ``cat`` in an odds blob.
+
+    ``blob`` may be a dict or its JSON text. Returns None when the blob or the
+    category is not a mapping, or when the selection is missing or not numeric.
+    """
+    parsed = _parse(blob)
+    if not isinstance(parsed, dict):
+        return None
+    selections = parsed.get(cat)
+    if not isinstance(selections, dict):
+        return None
+    return _f(selections.get(sel))
+
+
+def main() -> int:
+    """Print a closing-line-value report over stored prediction runs.
+
+    For every run whose odds snapshot came from a live match, the main pick's
+    bet-time odds are compared with the closing odds: the captured
+    ``odds_snapshot.closing_odds`` when present, otherwise the value currently
+    stored in ``odd_selections`` (reported as "static-proxy").
+    CLV per bet = bet_odds / closing_odds - 1. Runs without a mappable pick or a
+    usable price (> 1.0) on both sides are left out.
+
+    Read-only: issues SELECT statements only.
+
+    Returns:
+        0 on completion (used as the process exit code).
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--staked-only", action="store_true",
+                    help="Only playable/staked bets (default: all picks with a mappable market)")
+    args = ap.parse_args()
+
+    rows_out = []
+    conn = psycopg2.connect(get_clean_dsn())
+    try:
+        # `with conn` only scopes the transaction; psycopg2 leaves the connection
+        # open afterwards, so it is closed explicitly in the finally below.
+        with conn:
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                cur.execute("""
+                    SELECT match_id, engine_version, odds_snapshot, payload_summary,
+                           eventual_outcome, unit_profit
+                    FROM prediction_runs
+                    WHERE odds_snapshot->>'source' = 'live_match'
+                    ORDER BY generated_at ASC
+                """)
+                runs = cur.fetchall()
+
+                for r in runs:
+                    snap = _parse(r["odds_snapshot"])
+                    ps = _parse(r["payload_summary"])
+                    mp = ps.get("main_pick") or {}
+                    if not isinstance(mp, dict):
+                        continue   # malformed payload: no pick to evaluate
+                    market = mp.get("market")
+                    pick = mp.get("pick")
+                    bet_odds = _f(mp.get("odds"))
+                    playable = bool(mp.get("playable"))
+                    if args.staked_only and not playable:
+                        continue
+                    if not market or not pick or not bet_odds or bet_odds <= 1.0:
+                        continue
+                    mapped = map_pick(market, pick)
+                    if not mapped or not mapped[1]:
+                        continue
+                    cat, sel = mapped
+
+                    # closing line: prefer captured closing_odds, else static odd_selections
+                    closing = closing_from_blob(snap.get("closing_odds"), cat, sel)
+                    src = "captured"
+                    if closing is None:
+                        cur.execute("""
+                            SELECT os.odd_value FROM odd_categories oc
+                            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
+                            WHERE oc.match_id = %s AND oc.name = %s AND os.name = %s
+                            LIMIT 1
+                        """, (r["match_id"], cat, sel))
+                        row = cur.fetchone()
+                        closing = _f(row["odd_value"]) if row else None
+                        src = "static_proxy"
+                    if closing is None or closing <= 1.0:
+                        continue
+
+                    clv = bet_odds / closing - 1.0
+                    rows_out.append({
+                        "market": market, "playable": playable,
+                        "bet_odds": bet_odds, "closing": closing, "clv": clv,
+                        "src": src, "profit": _f(r["unit_profit"], 0.0) or 0.0,
+                        "settled": r["eventual_outcome"] is not None
+                        and not str(r["eventual_outcome"]).startswith("NO_BET"),
+                    })
+    finally:
+        conn.close()
+
+    if not rows_out:
+        print("No mappable runs with both bet-time and closing odds found.")
+        return 0
+
+    def agg(rs):
+        """Summarise a non-empty list of rows: count, mean CLV %, share beating the close."""
+        n = len(rs)
+        clvs = [x["clv"] for x in rs]
+        pos = sum(1 for c in clvs if c > 0)
+        return {
+            "n": n,
+            "mean_clv_pct": round(100.0 * sum(clvs) / n, 2),
+            "pct_positive": round(100.0 * pos / n, 1),
+            "captured": sum(1 for x in rs if x["src"] == "captured"),
+        }
+
+    print("=" * 70)
+    print("CLV REPORT  —  did we beat the closing line? (the edge compass)")
+    print("=" * 70)
+    o = agg(rows_out)
+    print(f"runs analyzed: {o['n']}   (closing source: {o['captured']} captured, "
+          f"{o['n'] - o['captured']} static-proxy)")
+    print(f"\nOVERALL mean CLV: {o['mean_clv_pct']}%   "
+          f"bets beating close: {o['pct_positive']}%")
+    print("  (positive mean CLV = real edge; ~0 or negative = no edge)\n")
+
+    staked = [x for x in rows_out if x["playable"]]
+    if staked:
+        s = agg(staked)
+        print(f"STAKED only: n={s['n']}  mean CLV={s['mean_clv_pct']}%  "
+              f"beating close={s['pct_positive']}%\n")
+
+    print("BY MARKET")
+    by_m = defaultdict(list)
+    for x in rows_out:
+        by_m[x["market"]].append(x)
+    for m, rs in sorted(by_m.items(), key=lambda kv: -len(kv[1])):
+        a = agg(rs)
+        print(f"  {m:<8} n={a['n']:>4}  mean CLV={a['mean_clv_pct']:>7}%  "
+              f"beating close={a['pct_positive']:>5}%")
+
+    # CLV vs outcome sanity: do positive-CLV bets actually win more / lose less?
+    print("\nCLV vs realized P/L (settled staked)")
+    ss = [x for x in rows_out if x["playable"] and x["settled"]]
+    if ss:
+        posc = [x for x in ss if x["clv"] > 0]
+        negc = [x for x in ss if x["clv"] <= 0]
+        for label, grp in (("CLV>0", posc), ("CLV<=0", negc)):
+            if grp:
+                pr = sum(x["profit"] for x in grp)
+                print(f"  {label:<7} n={len(grp):>3}  profit={pr:>7.2f}u  "
+                      f"ROI(flat1u)={round(100*pr/len(grp),1)}%")
+    else:
+        print("  (no settled staked bets yet)")
+    print("=" * 70)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,181 @@
+"""
+Edge Search — is there a profitable POCKET (by league) the global model misses?
+==============================================================================
+Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
+tier leagues may be mispriced. This walks a leak-free model forward and slices
+the value-bet ROI BY LEAGUE, requiring a real sample AND multi-fold consistency
+so we don't chase one lucky window.
+
+Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
+in features (realistic). Value bet = biggest model_prob - implied edge > margin.
+
+⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
+capture, not the verified closing line. Anything flagged must be forward-
+validated with real CLV (capture_closing_odds.py) before staking.
+
+Usage: python scripts/edge_search.py --folds 6 --min-bets 150
+"""
+from __future__ import annotations
+import argparse, os, sys, time
+import numpy as np, pandas as pd, xgboost as xgb
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, AI_DIR)
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+
+META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
+        "score_home","score_away","ht_score_home","ht_score_away"}
+LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
+         "squad_diff","home_squad_quality","away_squad_quality",
+         "referee_home_bias","referee_avg_goals"}
+
+
+def league_names(ids):
+    """Best-effort id -> name lookup against the ``leagues`` table.
+
+    Args:
+        ids: iterable of league ids; ``None`` entries are dropped and the rest
+            are stringified before being sent to the query.
+
+    Returns:
+        dict mapping ``str(id)`` to the league name for every returned row.
+        Empty when ``ids`` is empty or when all three attempts fail (errors
+        are swallowed on purpose so a flaky DB never kills the report).
+    """
+    from contextlib import closing
+    from data.db import get_clean_dsn
+    import psycopg2
+    from psycopg2.extras import RealDictCursor
+    wanted = [str(i) for i in ids if i is not None]
+    if not wanted:
+        return {}
+    attempts = 3
+    for attempt in range(attempts):
+        try:
+            # `with connection` in psycopg2 only scopes a transaction; closing()
+            # guarantees the connection itself is released on every attempt.
+            with closing(psycopg2.connect(get_clean_dsn())) as conn:
+                with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                    cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (wanted,))
+                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
+        except Exception:
+            # Back off before retrying, but do not sleep after the last attempt.
+            if attempt < attempts - 1:
+                time.sleep(1.0)
+    return {}
+
+
+def main():
+    """Walk a leak-free MS model forward and report value-bet ROI per bucket.
+
+    Loads the training CSV, keeps rows with a final score and, for each
+    walk-forward fold, trains on every row before the fold and bets the 1X2
+    outcome with the largest ``model_prob - implied`` edge when that edge
+    exceeds ``--margin``. Flat-1u results are then sliced by league-reliability
+    band, by odds band of the pick, and by their cross product. Buckets with
+    fewer than ``--min-bets`` bets are hidden from every table.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--folds", type=int, default=6)
+    ap.add_argument("--estimators", type=int, default=200)
+    ap.add_argument("--margin", type=float, default=0.0)
+    ap.add_argument("--min-bets", type=int, default=150)
+    args = ap.parse_args()
+
+    print(f"Loading {CSV} ...")
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
+    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))      # 0=home, 1=draw, 2=away
+    odds = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    # -1 marks "reliability unknown" (column missing or value not numeric)
+    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan]*len(df))),
+                        errors="coerce").fillna(-1.0).values
+    print(f"  {len(df):,} rows  features={len(feats)} (leak-free)  folds={args.folds}")
+
+    # test folds tile the second half of the (time-sorted) history
+    n = len(df); start = int(n * 0.5)
+    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
+    params = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
+              "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+
+    # reliability quartile edges from the betting universe (rel>=0)
+    rv = rel[rel >= 0]
+    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]
+    def rel_band(x):
+        if x < 0: return "rel:unknown"
+        if x < qs[0]: return f"rel:Q1(<{qs[0]:.2f})"
+        if x < qs[1]: return "rel:Q2"
+        if x < qs[2]: return "rel:Q3"
+        return f"rel:Q4(>={qs[2]:.2f})"
+    def odds_band(o):
+        return ("<1.5" if o<1.5 else "1.5-2" if o<2 else "2-3" if o<3 else
+                "3-5" if o<5 else "5-8" if o<8 else "8+")
+
+    recs = []  # (group_key, fold, pnl, win)
+    glob = {"n":0,"pnl":0.0,"win":0}
+    for fi in range(args.folds):
+        te0, te1 = bounds[fi], bounds[fi+1]
+        if te1-te0 < 50: continue
+        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]), num_boost_round=args.estimators)
+        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
+        yte, ote, rte = y[te0:te1], odds[te0:te1], rel[te0:te1]
+        implied = np.where(ote > 1.0, 1.0/ote, np.nan)
+        # outcomes without usable odds get a sentinel edge so they are never picked
+        edge = np.where(np.isnan(implied), -9.0, proba - implied)
+        rows = np.arange(len(yte))
+        pick = edge.argmax(1)
+        bet = edge[rows, pick] > args.margin
+        win = (pick == yte) & bet
+        pick_odds = ote[rows, pick]
+        pnl = np.where(win, pick_odds-1.0, -1.0)
+        for i in np.flatnonzero(bet):
+            rb, ob = rel_band(rte[i]), odds_band(pick_odds[i])
+            glob["n"]+=1; glob["pnl"]+=pnl[i]; glob["win"]+=int(win[i])
+            # each bet is recorded once per slicing: reliability, odds, and both
+            for key in (rb, ob, rb+" x "+ob):
+                recs.append((key, fi, pnl[i], int(win[i])))
+        print(f"  fold {fi}: tested {len(yte):,}  bets {int(bet.sum()):,}")
+
+    print("\n"+"="*78)
+    print(f"GLOBAL leak-free: bets={glob['n']:,}  hit={100*glob['win']/max(glob['n'],1):.1f}%  "
+          f"ROI(flat1u)={100*glob['pnl']/max(glob['n'],1):.2f}%")
+    print("="*78)
+
+    rdf = pd.DataFrame(recs, columns=["grp","fold","pnl","win"])
+    # 2D keys also start with "rel:", so the three slicings must be separated
+    # explicitly or the reliability table would list the 2D buckets as well.
+    is_rel = rdf["grp"].str.startswith("rel:")
+    is_2d = rdf["grp"].str.contains(" x ", regex=False)
+
+    def report(sub, title, top=None):
+        """Print one ROI table for the buckets in ``sub`` (best ROI first)."""
+        if sub.empty: return
+        print(f"\n{title}")
+        print(f"  {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
+        print("  "+"-"*54)
+        out=[]
+        for k,d in sub.groupby("grp"):
+            nb=len(d)
+            if nb < args.min_bets: continue
+            roi=100*d["pnl"].sum()/nb; hit=100*d["win"].sum()/nb
+            fp=d.groupby("fold")["pnl"].sum()       # net units per fold
+            out.append((roi,k,nb,hit,int((fp>0).sum()),fp.shape[0]))
+        for roi,k,nb,hit,fpv,ft in sorted(out,reverse=True)[:top]:
+            print(f"  {k:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}{str(fpv)+'/'+str(ft):>8}")
+
+    report(rdf[is_rel & ~is_2d], "BY LEAGUE-RELIABILITY BAND  (Q1=most obscure ... Q4=most reliable)")
+    report(rdf[~is_rel & ~is_2d], "BY ODDS BAND")
+    report(rdf[is_2d], "BY RELIABILITY x ODDS  (candidate pockets, n>=min-bets)", top=15)
+    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
+    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
+    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,154 @@
+"""
+Generate Daily Picks — the serving picker for the validated favourite policy.
+============================================================================
+Loads the saved leak-free MS model (models/favorite_v1) and applies the
+favourite-band value policy to a set of matches, emitting the day's STAKED
+picks and logging them for forward paper-trade settlement.
+
+Train/serve consistency: features MUST come from the SAME extractor that built
+training_data_v27.csv. Production path = run the extractor nightly INCLUDING
+upcoming (status NS) matches, then point this script at that CSV. Demo path =
+use the tail of the training CSV as stand-in "today" matches (with the real
+result shown, since those are settled).
+
+Policy: bet the MS side with the biggest model_prob - implied edge, ONLY if
+odds in [--lo,--hi] and edge>--margin. Flat 1u. No longshots, no parlays.
+Non-MS markets are NOT staked (efficient -> model error). One bet per match.
+
+Usage:
+  python scripts/generate_daily_picks.py --demo --n 20          # see it work now
+  python scripts/generate_daily_picks.py --features today.csv    # production
+  python scripts/generate_daily_picks.py --settle                # settle paper log
+"""
+from __future__ import annotations
+import argparse, json, os, sys, datetime
+import numpy as np, pandas as pd, xgboost as xgb
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+MODEL_DIR = os.path.join(AI_DIR, "models", "favorite_v1")
+TRAIN_CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+PAPER_LOG = os.path.join(AI_DIR, "data", "paper_trades.csv")
+MS_ODDS = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
+MS_PICKS = ["1", "X", "2"]
+
+
+def load_model():
+    """Load the saved favourite-policy artifact from ``MODEL_DIR``.
+
+    Returns:
+        ``(booster, feature_cols, metadata)``: the XGBoost booster read from
+        ``model.json`` and the parsed contents of ``feature_cols.json`` and
+        ``metadata.json``.
+    """
+    def _read_json(name):
+        with open(os.path.join(MODEL_DIR, name), encoding="utf-8") as fh:
+            return json.load(fh)
+
+    booster = xgb.Booster()
+    booster.load_model(os.path.join(MODEL_DIR, "model.json"))
+    feature_cols = _read_json("feature_cols.json")
+    metadata = _read_json("metadata.json")
+    return booster, feature_cols, metadata
+
+
+def pick_for_rows(df, bst, feats, lo, hi, margin):
+    """Score every row of ``df`` and flag the 1X2 value picks to stake.
+
+    Args:
+        df: matches in the training schema; must contain the ``MS_ODDS`` columns.
+        bst: booster whose ``predict`` yields one probability per MS outcome.
+        feats: feature column names, in the order the model expects. Columns
+            absent from ``df`` are created and filled with 0.
+        lo, hi: odds band; a pick is staked only when ``lo <= odds < hi``.
+        margin: minimum ``model_prob - implied`` edge required to stake.
+
+    Returns:
+        One dict per row (``idx``, ``pick``, ``odds``, ``model_prob``, ``edge``,
+        ``staked``) describing the outcome with the largest edge.
+    """
+    feature_matrix = (df.reindex(columns=feats)
+                        .apply(pd.to_numeric, errors="coerce")
+                        .fillna(0.0)
+                        .values)
+    probs = bst.predict(xgb.DMatrix(feature_matrix))     # [n,3] home/draw/away
+    prices = df[MS_ODDS].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    implied = np.where(prices > 1.0, 1.0/prices, np.nan)
+    # outcomes without usable odds get a sentinel edge so they never win argmax
+    edge = np.where(np.isnan(implied), -9.0, probs - implied)
+    picks = []
+    for row, row_edge in enumerate(edge):
+        best = int(np.argmax(row_edge))
+        price = float(prices[row, best])
+        value = float(row_edge[best])
+        in_band = lo <= price < hi
+        picks.append({
+            "idx": row,
+            "pick": MS_PICKS[best],
+            "odds": round(price, 2),
+            "model_prob": round(float(probs[row, best]), 4),
+            "edge": round(value, 4),
+            "staked": (value > margin) and in_band,
+        })
+    return picks
+
+
+def settle():
+    """Settle open paper trades from the final scores in the training CSV.
+
+    Reads ``PAPER_LOG``, fills ``result`` (``WON``/``LOST``) for every row that
+    has none yet and whose ``match_id`` has a complete final score in
+    ``TRAIN_CSV``, recomputes ``pnl`` on a flat-1u basis (win = odds-1,
+    loss = -1), rewrites the log in place and prints a summary of all settled
+    bets. Matches that are missing from the training CSV, or whose score is
+    incomplete, stay open.
+    """
+    if not os.path.exists(PAPER_LOG):
+        print("No paper_trades.csv yet."); return
+    # Keep ids and picks as text: CSV type inference could otherwise type the
+    # two files differently (or turn "1"/"2" picks into numbers) and silently
+    # prevent bets from ever matching their result.
+    pt = pd.read_csv(PAPER_LOG, dtype={"match_id": str, "pick": str})
+    if "result" not in pt.columns:
+        pt["result"] = np.nan
+    pt["result"] = pt["result"].astype(object)      # will hold WON/LOST strings
+    open_mask = pt["result"].isna()
+    if not open_mask.any():
+        # Nothing new to settle; fall through so the summary is still printed.
+        print("No open bets to settle.")
+
+    # settle from training CSV scores if present, else needs DB (left as note)
+    src = pd.read_csv(TRAIN_CSV, low_memory=False, dtype={"match_id": str},
+                      usecols=["match_id","score_home","score_away"])
+    sh = pd.to_numeric(src["score_home"], errors="coerce")
+    sa = pd.to_numeric(src["score_away"], errors="coerce")
+    # Require BOTH scores: a missing away score must not settle as an away win.
+    done = (sh.notna() & sa.notna()).values
+    outcome = pd.Series(np.where(sh > sa, "1", np.where(sh == sa, "X", "2")),
+                        index=src["match_id"])[done]
+    outcome = outcome[~outcome.index.duplicated(keep="first")]
+
+    # one hash lookup per open bet instead of scanning the whole index per row
+    actual = pt.loc[open_mask, "match_id"].map(outcome).dropna()
+    won = pt.loc[actual.index, "pick"] == actual
+    pt.loc[actual.index, "result"] = np.where(won, "WON", "LOST")
+
+    odds = pd.to_numeric(pt["odds"], errors="coerce")
+    pt["pnl"] = np.where(pt["result"] == "WON", odds - 1.0,
+                         np.where(pt["result"] == "LOST", -1.0, np.nan))
+    pt.to_csv(PAPER_LOG, index=False)
+    s = pt.dropna(subset=["pnl"])
+    if len(s):
+        print(f"Settled {len(s)} bets: hit={100*(s['result']=='WON').mean():.1f}%  "
+              f"ROI={100*s['pnl'].sum()/len(s):+.2f}%  net={s['pnl'].sum():+.1f}u")
+
+
+def main():
+    """CLI entry point: print (and optionally log) the staked MS picks.
+
+    With ``--settle`` it only settles the paper log. Otherwise it loads the
+    saved model, scores either the ``--features`` CSV or, when no features
+    file is given, the last ``--n`` rows of the training CSV as a demo, and
+    prints every pick that passes the band/margin policy. ``--log`` appends the
+    staked picks to ``PAPER_LOG``; it is ignored in demo mode.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--features", help="CSV of upcoming matches in training schema")
+    parser.add_argument("--demo", action="store_true", help="use tail of training CSV as 'today'")
+    parser.add_argument("--n", type=int, default=20)
+    parser.add_argument("--lo", type=float, default=1.5)
+    parser.add_argument("--hi", type=float, default=2.2)
+    parser.add_argument("--margin", type=float, default=0.03)
+    parser.add_argument("--settle", action="store_true")
+    parser.add_argument("--log", action="store_true", help="append staked picks to paper_trades.csv")
+    args = parser.parse_args()
+
+    if args.settle:
+        settle()
+        return
+
+    bst, feats, meta = load_model()
+    print(f"Model {meta['version']} (trained {meta['trained_at']}, holdout "
+          f"ROI {meta['holdout_eval']['roi_pct']}%)  band[{args.lo},{args.hi}] margin {args.margin}\n")
+
+    # no features file means demo mode, whether or not --demo was passed
+    demo = not args.features
+    if demo:
+        history = pd.read_csv(TRAIN_CSV, low_memory=False)
+        df = history.sort_values("mst_utc").tail(args.n).reset_index(drop=True)
+        print("(DEMO: last matches of training CSV as stand-in for today)\n")
+    else:
+        df = pd.read_csv(args.features, low_memory=False)
+
+    picks = pick_for_rows(df, bst, feats, args.lo, args.hi, args.margin)
+    staked = [p for p in picks if p["staked"]]
+    print(f"{len(df)} matches scanned -> {len(staked)} STAKED MS picks\n")
+    print(f"  {'match_id':<28}{'pick':>5}{'odds':>7}{'model%':>8}{'edge%':>7}" + ("   result" if demo else ""))
+    print("  "+"-"*60)
+    log_rows = []
+    for p in staked:
+        match = df.iloc[p["idx"]]
+        mid = str(match["match_id"])
+        res = ""
+        if demo:
+            # demo rows are already played, so show how the pick fared
+            sh, sa = match.get("score_home"), match.get("score_away")
+            if pd.notna(sh):
+                if sh > sa:
+                    out = "1"
+                elif sh == sa:
+                    out = "X"
+                else:
+                    out = "2"
+                res = "  WON" if p["pick"]==out else "  lost"
+        print(f"  {mid:<28}{p['pick']:>5}{p['odds']:>7.2f}{100*p['model_prob']:>8.1f}{100*p['edge']:>+7.1f}{res}")
+        log_rows.append({"logged_at": datetime.datetime.now().isoformat(timespec="seconds"),
+                         "match_id": mid, "market": "MS", "pick": p["pick"], "odds": p["odds"],
+                         "model_prob": p["model_prob"], "edge": p["edge"], "stake": 1.0,
+                         "result": np.nan, "pnl": np.nan})
+    if args.log:
+        if demo:
+            print("\n  (--log ignored in --demo; only real upcoming picks are logged)")
+        elif log_rows:
+            new = pd.DataFrame(log_rows)
+            if os.path.exists(PAPER_LOG):
+                new = pd.concat([pd.read_csv(PAPER_LOG), new], ignore_index=True)
+            new.to_csv(PAPER_LOG, index=False)
+            print(f"\n  logged {len(log_rows)} picks -> {PAPER_LOG}")
+    print("\nReminder: paper-trade only. Stake real money after weeks of forward")
+    print("CLV>0 + ROI>0 (settle with --settle, check scoreboard/clv_report).")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,253 @@
+"""
+Live Scoreboard — the single source of truth for real betting performance.
+=========================================================================
+Reads the *forward-tracked* results in `prediction_runs` (one row per analyzed
+match, with the staked main pick + actual outcome + realized unit_profit) and
+reports what ACTUALLY happened with real money logic — NOT a backtest.
+
+Why this exists: backtests on this codebase are overfit (a paper "+32.7% ROI"
+strategy that the live engine never even ran). The only trustworthy number is
+the realized P/L recorded after matches settle. This tool surfaces it.
+
+Read-only. SELECT only. Safe to run anytime.
+
+Usage:
+  python scripts/live_scoreboard.py
+  python scripts/live_scoreboard.py --days 30
+  python scripts/live_scoreboard.py --version v28-pro-max
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+from typing import Any, Dict, List, Optional
+
+# utf-8 stdout so Turkish market/league names never crash on Windows cp1252
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
+sys.path.insert(0, AI_ENGINE_DIR)
+
+from data.db import get_clean_dsn  # noqa: E402
+import psycopg2  # noqa: E402
+from psycopg2.extras import RealDictCursor  # noqa: E402
+
+ODDS_BANDS = [(0, 1.5, "<1.5"), (1.5, 2.0, "1.5-2"), (2.0, 3.0, "2-3"),
+              (3.0, 5.0, "3-5"), (5.0, 6.0, "5-6"), (6.0, 7.5, "6-7.5"),
+              (7.5, 999, "7.5+")]
+
+
+def _f(x: Any, d: Optional[float] = None) -> Optional[float]:
+    """Coerce ``x`` to float; return ``d`` for ``None`` or unconvertible input."""
+    if x is None:
+        return d
+    try:
+        return float(x)
+    except (TypeError, ValueError):
+        return d
+
+
+def _parse(j: Any) -> Dict[str, Any]:
+    """Return ``j`` as a dict, decoding it from JSON text when it is a string.
+
+    Anything that is not (or does not decode to) a dict yields ``{}``: invalid
+    JSON, ``None``, and JSON scalars/arrays such as ``"null"`` or ``"[1]"``.
+    Callers can therefore always use ``.get`` on the result.
+    """
+    if isinstance(j, str):
+        try:
+            j = json.loads(j)
+        except (ValueError, RecursionError):
+            # malformed (or absurdly nested) payload: treat as "no data"
+            return {}
+    return j if isinstance(j, dict) else {}
+
+
+def _band(odds: Optional[float]) -> str:
+    """Name the ``ODDS_BANDS`` entry with ``lo <= odds < hi``; ``"?"`` if none."""
+    if odds is None:
+        return "?"
+    matches = (name for lo, hi, name in ODDS_BANDS if lo <= odds < hi)
+    return next(matches, "?")
+
+
+def fetch_rows(args) -> List[Dict[str, Any]]:
+    """Fetch settled prediction runs (``eventual_outcome`` set), oldest first.
+
+    Args:
+        args: parsed CLI namespace; ``args.version`` optionally filters by
+            ``engine_version`` and ``args.days`` by ``generated_at`` within the
+            last N days (UTC). Falsy values disable the respective filter.
+
+    Returns:
+        The rows of ``prediction_runs`` as dict-like records.
+    """
+    from contextlib import closing
+
+    dsn = get_clean_dsn()
+    # Only fixed SQL fragments are interpolated; every value is a bound parameter.
+    where = ["eventual_outcome IS NOT NULL"]
+    params: List[Any] = []
+    if args.version:
+        where.append("engine_version = %s")
+        params.append(args.version)
+    if args.days:
+        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
+        where.append("generated_at >= %s")
+        params.append(cutoff)
+    sql = f"""
+        SELECT match_id, engine_version, generated_at, eventual_outcome,
+               unit_profit, payload_summary
+        FROM prediction_runs
+        WHERE {' AND '.join(where)}
+        ORDER BY generated_at ASC
+    """
+    # `with connection` in psycopg2 only scopes a transaction; closing() makes
+    # sure the connection itself is released once the rows are fetched.
+    with closing(psycopg2.connect(dsn)) as conn:
+        with conn.cursor(cursor_factory=RealDictCursor) as cur:
+            cur.execute(sql, params)
+            return cur.fetchall()
+
+
+def distill(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Flatten each run into one record: the main pick plus its realized P/L.
+
+    A record is ``staked`` when the main pick is playable with a positive
+    stake, ``settled_stake`` when it is staked and the outcome does not start
+    with NO_BET/PUSH/VOID/CANCEL, and a ``win`` when it is a settled stake with
+    positive profit.
+    """
+    non_results = ("NO_BET", "PUSH", "VOID", "CANCEL")
+    records = []
+    for run in rows:
+        main_pick = _parse(run["payload_summary"]).get("main_pick") or {}
+        units = _f(main_pick.get("stake_units"), 0.0) or 0.0
+        realized = _f(run["unit_profit"], 0.0) or 0.0
+        result = str(run["eventual_outcome"] or "")
+        is_staked = bool(main_pick.get("playable")) and units > 0
+        # settled stake = a real bet with a win/loss (exclude NO_BET / push)
+        is_settled = is_staked and not result.startswith(non_results)
+        record = {
+            "match_id": run["match_id"],
+            "version": run["engine_version"],
+            "ts": run["generated_at"],
+            "market": main_pick.get("market") or "?",
+            "pick": main_pick.get("pick"),
+            "odds": _f(main_pick.get("odds")),
+            "stake": units,
+            "profit": realized,
+            "outcome": result,
+            "staked": is_staked,
+            "settled_stake": is_settled,
+            "win": is_settled and realized > 0,
+        }
+        records.append(record)
+    return records
+
+
+def _agg(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Summarize the settled stakes in ``recs`` (count, wins, hit%, profit, ROI).
+
+    ROI is flat-1u: recorded ``profit`` is already per 1u bet (win=odds-1,
+    loss=-1), so ROI = profit / number of bets. The suggested ``stake`` is
+    summed for information only and is deliberately not the ROI denominator.
+    ``hit_pct`` and ``roi_pct`` are ``None`` when there are no settled stakes.
+    """
+    settled = [r for r in recs if r["settled_stake"]]
+    count = len(settled)
+    won = sum(1 for r in settled if r["win"])
+    total_profit = sum(r["profit"] for r in settled)
+    suggested = sum(r["stake"] for r in settled)
+    if count:
+        hit_pct = round(100.0 * won / count, 1)
+        roi_pct = round(100.0 * total_profit / count, 1)  # flat 1u
+    else:
+        hit_pct = roi_pct = None
+    return {
+        "n": count,
+        "wins": won,
+        "hit_pct": hit_pct,
+        "sug_stake": round(suggested, 2),
+        "profit": round(total_profit, 2),
+        "roi_pct": roi_pct,
+    }
+
+
+def _line(label: str, a: Dict[str, Any]) -> str:
+    """Format one scoreboard row from an ``_agg`` summary (``-`` for missing %)."""
+    def shown(value: Any) -> str:
+        return "-" if value is None else str(value)
+
+    head = f"  {label:<14} n={a['n']:>4}  hit={shown(a['hit_pct']):>5}%  "
+    tail = f"profit={a['profit']:>8.2f}u  ROI(flat1u)={shown(a['roi_pct']):>7}%"
+    return head + tail
+
+
+def risk_metrics(recs: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Compute drawdown and losing-streak figures over settled stakes by ``ts``.
+
+    Returns a dict with ``max_drawdown_u`` (most negative distance of the
+    cumulative profit below its running peak, starting from 0),
+    ``longest_losing_streak`` (consecutive bets with profit <= 0, so a
+    zero-profit bet extends a streak) and ``final_cum_u`` (net profit).
+    """
+    ordered = sorted(recs, key=lambda rec: rec["ts"])
+    running = 0.0
+    high_water = 0.0
+    deepest = 0.0
+    current_run = 0
+    longest_run = 0
+    for rec in ordered:
+        if not rec["settled_stake"]:
+            continue
+        running += rec["profit"]
+        high_water = max(high_water, running)
+        deepest = min(deepest, running - high_water)
+        if rec["profit"] > 0:
+            current_run = 0
+        else:
+            current_run += 1
+            longest_run = max(longest_run, current_run)
+    return {
+        "max_drawdown_u": round(deepest, 2),
+        "longest_losing_streak": longest_run,
+        "final_cum_u": round(running, 2),
+    }
+
+
+def main():
+    """CLI entry point: print the realized-results scoreboard.
+
+    Fetches settled runs (optionally filtered by ``--days`` / ``--version``),
+    distills them, and prints overall figures plus breakdowns by engine
+    version, market, odds band and ISO week. Every breakdown except the engine
+    version one is restricted to settled stakes before aggregation.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--days", type=int, default=None, help="Only last N days")
+    parser.add_argument("--version", help="Filter by engine_version")
+    args = parser.parse_args()
+
+    recs = distill(fetch_rows(args))
+    rule = "=" * 74
+
+    def grouped(key, settled_only=True):
+        """Bucket records by ``key(rec)``, preserving first-seen order."""
+        buckets = defaultdict(list)
+        for rec in recs:
+            if settled_only and not rec["settled_stake"]:
+                continue
+            buckets[key(rec)].append(rec)
+        return buckets
+
+    def biggest_first(buckets):
+        return sorted(buckets.items(), key=lambda kv: -len(kv[1]))
+
+    def iso_week(rec):
+        iso = rec["ts"].isocalendar()
+        return f"{iso[0]}-W{iso[1]:02d}"
+
+    print(rule)
+    print("LIVE SCOREBOARD  —  realized results from prediction_runs (NOT backtest)")
+    print(rule)
+    if recs:
+        stamps = [rec["ts"] for rec in recs]
+        lo = min(stamps).date()
+        hi = max(stamps).date()
+        suffix = f"   filter: {args.version}" if args.version else ""
+        print(f"window: {lo} .. {hi}   settled runs: {len(recs)}" + suffix)
+    print()
+
+    overall = _agg(recs)
+    print("OVERALL (staked = playable bets only)")
+    print(_line("ALL", overall))
+    no_bet = len([rec for rec in recs if not rec["staked"]])
+    print(f"  (analyzed {len(recs)} matches; {overall['n']} actually staked, "
+          f"{no_bet} NO_BET)")
+    if overall["n"]:
+        rm = risk_metrics(recs)
+        print(f"  max drawdown: {rm['max_drawdown_u']}u   "
+              f"longest losing streak: {rm['longest_losing_streak']}   "
+              f"net: {rm['final_cum_u']}u")
+    print()
+
+    print("BY ENGINE VERSION")
+    for version, members in biggest_first(grouped(lambda rec: rec["version"], settled_only=False)):
+        print(_line(version, _agg(members)))
+    print()
+
+    print("BY MARKET (staked)")
+    by_market = grouped(lambda rec: rec["market"])
+    for market, members in biggest_first(by_market):
+        print(_line(market, _agg(members)))
+    if not by_market:
+        print("  (no staked settled bets in window)")
+    print()
+
+    print("BY ODDS BAND (staked)")
+    by_band = grouped(lambda rec: _band(rec["odds"]))
+    # iterate the band table so rows come out in ascending odds order
+    for _, _, name in ODDS_BANDS:
+        if name in by_band:
+            print(_line(name, _agg(by_band[name])))
+    print()
+
+    print("WEEKLY TREND (staked)")
+    by_week = grouped(iso_week)
+    for week in sorted(by_week):
+        print(_line(week, _agg(by_week[week])))
+    print()
+    print(rule)
+    print("READ: ROI < 0 over a meaningful sample = the staked signals are not")
+    print("profitable. 'NO_BET' rows are free (no stake). CLV is unmeasurable")
+    print("until odds movement is captured (see scripts + odds_history fix).")
+    print(rule)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,182 @@
+"""
+Multi-Market Edge + Best-Bet Selector — pick the best value bet PER MATCH
+========================================================================
+Not "play the handed main_pick". For each match, score EVERY market the model
+covers, compare model prob vs market implied, and select the single best VALUE
+bet across all markets. Leak-free, walk-forward, honest.
+
+Markets (truth derived from scores, not trusted labels):
+  MS(1X2), HT-result, OU0.5/1.5/2.5/3.5, HT_OU0.5/1.5, BTTS.
+
+Outputs:
+  (A) per-market value ROI  -> which bet types actually carry edge
+  (B) cross-market SELECTOR -> best value bet per match, with odds-band filter,
+      fold-consistency, and the model-free baseline.
+
+⚠️ CSV odds are a static capture, not verified closing. Positive = LEAD; forward
+paper-trade with real CLV before staking.
+
+Usage: python scripts/multi_market_edge.py --folds 5 --lo 1.5 --hi 2.6 --margin 0.03
+"""
+from __future__ import annotations
+import argparse, os, sys
+import numpy as np, pandas as pd, xgboost as xgb
+
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try: sys.stdout.reconfigure(encoding="utf-8")
+    except Exception: pass
+
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+META = {"match_id","home_team_id","away_team_id","league_id","mst_utc",
+        "score_home","score_away","ht_score_home","ht_score_away"}
+LEAKY = {"home_goals_form","away_goals_form","total_goals","ht_total_goals",
+         "squad_diff","home_squad_quality","away_squad_quality",
+         "referee_home_bias","referee_avg_goals"}
+
+# market -> (kind, [odds_cols aligned to classes], [pick labels aligned to classes], truth_fn(sh,sa,hh,ha)->class idx or None)
+def ou(line):
+    """Build the full-time over/under truth function for ``line`` (0=Over, 1=Under)."""
+    def truth(sh, sa, hh, ha):
+        return 0 if (sh + sa) > line else 1
+    return truth
+
+
+def htou(line):
+    """Build the half-time over/under truth function; ``None`` when the HT score is NaN."""
+    def truth(sh, sa, hh, ha):
+        if np.isnan(hh):
+            return None
+        return 0 if (hh + ha) > line else 1
+    return truth
+
+
+def ms_truth(sh, sa, hh, ha):
+    """Full-time 1X2 class: 0=home win, 1=draw, 2=away win."""
+    if sh > sa:
+        return 0
+    return 1 if sh == sa else 2
+
+
+def ht_truth(sh, sa, hh, ha):
+    """Half-time 1X2 class (0/1/2), or ``None`` when the HT home score is NaN."""
+    if np.isnan(hh):
+        return None
+    if hh > ha:
+        return 0
+    return 1 if hh == ha else 2
+
+
+def btts_truth(sh, sa, hh, ha):
+    """Both-teams-to-score class: 0=Yes, 1=No."""
+    return 0 if (sh > 0 and sa > 0) else 1
+
+MARKETS = {
+  "MS":      ("multi",  ["odds_ms_h","odds_ms_d","odds_ms_a"], ["1","X","2"], ms_truth),
+  "HT":      ("multi",  ["odds_ht_ms_h","odds_ht_ms_d","odds_ht_ms_a"], ["1","X","2"], ht_truth),
+  "OU05":    ("binary", ["odds_ou05_o","odds_ou05_u"], ["Üst","Alt"], ou(0.5)),
+  "OU15":    ("binary", ["odds_ou15_o","odds_ou15_u"], ["Üst","Alt"], ou(1.5)),
+  "OU25":    ("binary", ["odds_ou25_o","odds_ou25_u"], ["Üst","Alt"], ou(2.5)),
+  "OU35":    ("binary", ["odds_ou35_o","odds_ou35_u"], ["Üst","Alt"], ou(3.5)),
+  "HT_OU05": ("binary", ["odds_ht_ou05_o","odds_ht_ou05_u"], ["Üst","Alt"], htou(0.5)),
+  "HT_OU15": ("binary", ["odds_ht_ou15_o","odds_ht_ou15_u"], ["Üst","Alt"], htou(1.5)),
+  "BTTS":    ("binary", ["odds_btts_y","odds_btts_n"], ["Var","Yok"], btts_truth),
+}
+PARAMS_M = {"objective":"multi:softprob","num_class":3,"max_depth":5,"eta":0.05,
+            "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+PARAMS_B = {"objective":"binary:logistic","max_depth":5,"eta":0.05,
+            "subsample":0.8,"colsample_bytree":0.8,"tree_method":"hist","verbosity":0}
+
+
+def main():
+    """Walk-forward value-bet evaluation per market and across markets.
+
+    For every fold, one model per market in ``MARKETS`` is trained on all rows
+    before the fold (rows whose truth is ``None`` are excluded from training)
+    and scored on the fold. For each test match and market, the outcome with
+    the largest ``model_prob - 1/odds`` edge is kept if the edge exceeds
+    ``--margin`` and the odds lie in ``[--lo, --hi)``:
+
+      (A) that per-market best pick is booked to the market's flat-1u P/L;
+      (B) the best of those picks across all markets is booked to the selector.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--folds", type=int, default=5)
+    ap.add_argument("--estimators", type=int, default=150)
+    ap.add_argument("--lo", type=float, default=1.5)
+    ap.add_argument("--hi", type=float, default=2.6)
+    ap.add_argument("--margin", type=float, default=0.03)
+    args = ap.parse_args()
+
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    ok = sh.notna() & sa.notna()
+    df = df[ok].reset_index(drop=True)
+    SH = sh[ok.values].values.astype(float); SA = sa[ok.values].values.astype(float)
+    HH = pd.to_numeric(df["ht_score_home"], errors="coerce").values.astype(float)
+    HA = pd.to_numeric(df["ht_score_away"], errors="coerce").values.astype(float)
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    N = len(df)
+    print(f"{N:,} matches, {len(feats)} leak-free feats, {len(MARKETS)} markets, folds={args.folds}")
+
+    # precompute truth + odds per market
+    MK = {}
+    for mname,(kind,ocols,picks,tfn) in MARKETS.items():
+        if not all(c in df.columns for c in ocols):
+            print(f"  skip {mname}: missing odds cols"); continue
+        O = df[ocols].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+        # object dtype so "no truth" can be stored as None next to int classes
+        truth = np.array([tfn(SH[i],SA[i],HH[i],HA[i]) for i in range(N)], dtype=object)
+        MK[mname] = (kind, O, picks, truth)
+
+    # test folds tile the second half of the (time-sorted) history
+    start = int(N*0.5); bounds = np.linspace(start, N, args.folds+1, dtype=int)
+
+    # accumulators
+    per_market = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}                 # (A) best value pick within market
+    sel = {"n":0,"pnl":0.0,"win":0,"fold":{}}                                # (B) cross-market selector
+    sel_by_mkt = {m: {"n":0,"pnl":0.0,"win":0} for m in MK}
+
+    for fi in range(args.folds):
+        te0,te1 = bounds[fi], bounds[fi+1]
+        if te1-te0 < 50: continue
+        # train each market model on [:te0], predict test
+        cand = {}  # market -> (P_matrix[n_test, n_picks], O_test, truth_test)
+        for m,(kind,O,picks,truth) in MK.items():
+            ytr_full = truth[:te0]
+            # mask invalid truth (e.g., HT markets with missing HT score)
+            valid_tr = np.array([v is not None for v in ytr_full])
+            ytr = ytr_full[valid_tr].astype(int)
+            if kind=="multi":
+                bst = xgb.train(PARAMS_M, xgb.DMatrix(X[:te0][valid_tr], label=ytr), num_boost_round=args.estimators)
+                P = bst.predict(xgb.DMatrix(X[te0:te1]))           # [n,3]
+            else:
+                # class 0 is the "positive" pick (Over / Yes), so it becomes label 1
+                pos = (ytr==0).astype(int)
+                bst = xgb.train(PARAMS_B, xgb.DMatrix(X[:te0][valid_tr], label=pos), num_boost_round=args.estimators)
+                ppos = bst.predict(xgb.DMatrix(X[te0:te1]))
+                P = np.column_stack([ppos, 1.0-ppos])              # [n,2] -> [pos,neg]
+            cand[m] = (P, O[te0:te1], truth[te0:te1])
+
+        # iterate test matches
+        for j in range(te1-te0):
+            best = None  # (edge, market, pickidx, odds, won) across all markets
+            for m,(P,Ot,Tt) in cand.items():
+                t = Tt[j]
+                if t is None: continue
+                # best in-band value pick of THIS market (first max wins ties)
+                bestk = None  # (edge, pickidx, odds, won)
+                for k,(p,o) in enumerate(zip(P[j], Ot[j])):
+                    if o <= 1.0 or not (args.lo <= o < args.hi): continue
+                    e = p - 1.0/o
+                    if e > args.margin and (bestk is None or e > bestk[0]):
+                        bestk = (e, k, o, int(t==k))
+                if bestk is None: continue
+                e,k,o,won = bestk
+                pnl = (o-1.0) if won else -1.0
+                d=per_market[m]; d["n"]+=1; d["pnl"]+=pnl; d["win"]+=won
+                # The cross-market winner is the max over per-market winners;
+                # strict '>' keeps the earlier market on ties.
+                if best is None or e > best[0]:
+                    best = (e, m, k, o, won)
+            # selector: single best value bet across all markets for this match
+            if best is not None:
+                edge,m,k,o,won = best
+                pnl = (o-1.0) if won else -1.0
+                sel["n"]+=1; sel["pnl"]+=pnl; sel["win"]+=won
+                sel["fold"][fi] = sel["fold"].get(fi,0.0)+pnl
+                d=sel_by_mkt[m]; d["n"]+=1; d["pnl"]+=pnl; d["win"]+=won
+        print(f"  fold {fi}: tested {te1-te0:,}")
+
+    def line(name,d):
+        n=d["n"]; roi=100*d["pnl"]/n if n else float('nan'); hit=100*d["win"]/n if n else float('nan')
+        return f"  {name:<10} bets={n:>6} hit={hit:>5.1f}% ROI={roi:>7.2f}% net={d['pnl']:>7.1f}u"
+
+    print("\n"+"="*70); print(f"(A) PER-MARKET value ROI  (best value pick in band [{args.lo},{args.hi}], margin {args.margin})"); print("="*70)
+    # best ROI first; markets without bets sink to the bottom
+    for m in sorted(per_market, key=lambda x:-(100*per_market[x]['pnl']/per_market[x]['n'] if per_market[x]['n'] else -99)):
+        print(line(m, per_market[m]))
+
+    print("\n"+"="*70); print("(B) CROSS-MARKET SELECTOR  (best value bet per match, all markets)"); print("="*70)
+    print(line("SELECTOR", sel))
+    folds_pos = sum(1 for v in sel["fold"].values() if v>0)
+    print(f"  folds positive: {folds_pos}/{len(sel['fold'])}")
+    print("  selector picks distributed across markets:")
+    for m in sorted(sel_by_mkt, key=lambda x:-sel_by_mkt[x]['n']):
+        if sel_by_mkt[m]["n"]>0: print("   "+line(m, sel_by_mkt[m]).strip())
+    print("\nREAD: a market/selector is a LEAD only if ROI>0, folds consistent, n large.")
+    print("Forward-validate with CLV before staking. Static CSV odds may overstate edge.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,112 @@
+"""
+Train Favorite-Policy Model (v1) — leak-free MS model for the validated strategy.
+================================================================================
+Trains a LEAK-FREE 1X2 model (drops the result-encoding columns) and saves it
+plus the feature list and policy metadata. This is the brain of the new system;
+the favourite-band value policy (odds ~1.5-2.2, model_prob>implied, flat stake)
+is applied on top of its probabilities at serving time.
+
+Honest holdout: trains on the first (1 - --holdout-frac) of history, evaluates
+the EXACT policy on the most recent --holdout-frac slice (never seen in
+training), then retrains on ALL history for the saved production artifact.
+
+Saves to models/favorite_v1/: model.json, feature_cols.json, metadata.json
+
+Usage: python scripts/train_favorite_model.py
+"""
+from __future__ import annotations
+import argparse, json, os, sys, datetime
+import numpy as np, pandas as pd, xgboost as xgb
+
+# Best-effort: switch stdout to UTF-8 when the stream supports it. Any failure
+# is deliberately ignored so the script still runs with the default encoding.
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+# AI_DIR is the parent of the directory that contains this script; the input
+# CSV and the output model directory are both resolved relative to it.
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_DIR = os.path.dirname(_SCRIPT_DIR)
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+OUT = os.path.join(AI_DIR, "models", "favorite_v1")
+
+# Identifier, kickoff-time and score columns: excluded from the feature list.
+META = {
+    "match_id",
+    "home_team_id",
+    "away_team_id",
+    "league_id",
+    "mst_utc",
+    "score_home",
+    "score_away",
+    "ht_score_home",
+    "ht_score_away",
+}
+
+# Result-encoding leakage — never feed these to the model (train OR serve).
+LEAKY = {
+    "home_goals_form",
+    "away_goals_form",
+    "total_goals",
+    "ht_total_goals",
+    "squad_diff",
+    "home_squad_quality",
+    "away_squad_quality",
+    "referee_home_bias",
+    "referee_avg_goals",
+}
+
+# XGBoost booster settings: 3-class soft-probability objective (home/draw/away).
+PARAMS = {
+    "objective": "multi:softprob",
+    "num_class": 3,
+    "max_depth": 5,
+    "eta": 0.05,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "tree_method": "hist",
+    "verbosity": 0,
+}
+
+
+def policy_eval(P, y, O, lo, hi, margin):
+    """Simulate the flat-stake value policy on one slice and summarise it.
+
+    For every row the outcome with the largest ``model_prob - 1/odds`` edge is
+    the candidate pick. One unit is staked only when that edge exceeds
+    ``margin`` and the picked odds lie in the half-open band ``[lo, hi)``.
+
+    Args:
+        P: (n, k) model probabilities, one column per outcome.
+        y: (n,) index of the outcome that actually happened.
+        O: (n, k) decimal odds aligned with ``P``. Entries <= 1.0 (including
+            a 0.0 fill for missing prices) are treated as unpriced.
+        lo: inclusive lower bound on the picked odds.
+        hi: exclusive upper bound on the picked odds.
+        margin: minimum edge required to place the bet.
+
+    Returns:
+        dict with ``bets`` (int) and ``hit_pct``, ``roi_pct``, ``net_u`` as
+        plain Python floats; all zero when no bet qualifies.
+    """
+    P = np.asarray(P, dtype=float)
+    O = np.asarray(O, dtype=float)
+    y = np.asarray(y)
+
+    # Divide only where a real price exists, so missing (0.0) odds do not
+    # trigger a divide-by-zero RuntimeWarning.
+    priced = O > 1.0
+    implied = np.full(O.shape, np.nan)
+    np.divide(1.0, O, out=implied, where=priced)
+    # Unpriced outcomes get -inf so they can never be the value pick.
+    edge = np.where(priced, P - implied, -np.inf)
+
+    rows = np.arange(len(y))
+    pick = edge.argmax(1)
+    pe = edge[rows, pick]
+    po = O[rows, pick]
+    bet = (pe > margin) & (po >= lo) & (po < hi)
+    win = (pick == y) & bet
+    pnl = np.where(win, po - 1.0, -1.0)[bet]
+
+    n = int(bet.sum())
+    denom = max(n, 1)  # keeps the ratios at 0.0 instead of dividing by zero
+    # Cast to built-in floats so the dict prints and serialises cleanly
+    # (NumPy >= 2 shows scalars as ``np.float64(...)`` inside containers).
+    return {"bets": n,
+            "hit_pct": round(float(100 * win.sum() / denom), 1),
+            "roi_pct": round(float(100 * pnl.sum() / denom), 2),
+            "net_u": round(float(pnl.sum()), 1)}
+
+
+def main():
+    """Evaluate the favourite-band policy on a holdout, then save the model.
+
+    Steps:
+      1. Load the CSV sorted by ``mst_utc`` and keep rows with numeric scores.
+      2. Derive the 1X2 label from the scores (0=home, 1=draw, 2=away).
+      3. Train on the first ``1 - holdout_frac`` of rows and report accuracy
+         and the policy result on the remaining most recent rows.
+      4. Retrain on all rows and write model.json, feature_cols.json and
+         metadata.json into ``OUT``.
+
+    Invalid command-line values are rejected through ``ArgumentParser.error``;
+    a dataset too small to split raises ``SystemExit`` with a message.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--lo", type=float, default=1.5)
+    ap.add_argument("--hi", type=float, default=2.2)
+    ap.add_argument("--margin", type=float, default=0.0)
+    ap.add_argument("--holdout-frac", type=float, default=0.15)
+    ap.add_argument("--estimators", type=int, default=300)
+    args = ap.parse_args()
+
+    # Fail fast on settings that would otherwise crash deep inside training or
+    # silently produce an empty/meaningless holdout evaluation.
+    if not 0.0 < args.holdout_frac < 1.0:
+        ap.error("--holdout-frac must be strictly between 0 and 1")
+    if args.estimators < 1:
+        ap.error("--estimators must be >= 1")
+    if args.lo >= args.hi:
+        ap.error("--lo must be smaller than --hi (the odds band is [lo, hi))")
+
+    print(f"Loading {CSV} ...")
+    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    # Only matches with both full-time scores can be labelled.
+    ok = sh.notna() & sa.notna()
+    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
+    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
+    # Missing/non-numeric odds become 0.0, which policy_eval treats as unpriced.
+    O = df[["odds_ms_h","odds_ms_d","odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
+    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
+    print(f"  {len(df):,} rows, {len(feats)} leak-free features")
+
+    # ── Honest holdout (last slice, never trained on) ──
+    cut = int(len(df) * (1 - args.holdout_frac))
+    if cut < 1 or cut >= len(df):
+        raise SystemExit(
+            f"Not enough rows ({len(df):,}) to split into train and holdout "
+            f"with --holdout-frac {args.holdout_frac}")
+    bst = xgb.train(PARAMS, xgb.DMatrix(X[:cut], label=y[:cut]), num_boost_round=args.estimators)
+    Ph = bst.predict(xgb.DMatrix(X[cut:]))
+    acc = float((Ph.argmax(1) == y[cut:]).mean())
+    hold = policy_eval(Ph, y[cut:], O[cut:], args.lo, args.hi, args.margin)
+    print(f"\nHOLDOUT (last {args.holdout_frac:.0%}, {len(df)-cut:,} matches, never seen):")
+    print(f"  MS accuracy: {acc*100:.1f}%")
+    print(f"  POLICY band[{args.lo},{args.hi}] margin {args.margin}: {hold}")
+
+    # ── Production model: retrain on ALL history ──
+    print("\nTraining production model on ALL history ...")
+    final = xgb.train(PARAMS, xgb.DMatrix(X, label=y), num_boost_round=args.estimators)
+    os.makedirs(OUT, exist_ok=True)
+    final.save_model(os.path.join(OUT, "model.json"))
+    # The feature order is saved next to the model so serving can rebuild the
+    # same column layout.
+    with open(os.path.join(OUT, "feature_cols.json"), "w", encoding="utf-8") as f:
+        json.dump(feats, f, ensure_ascii=False, indent=2)
+    meta = {
+        "version": "favorite_v1",
+        "trained_at": datetime.datetime.now().isoformat(timespec="seconds"),
+        "market": "MS",
+        "classes": {"0": "home(1)", "1": "draw(X)", "2": "away(2)"},
+        "policy": {"odds_lo": args.lo, "odds_hi": args.hi, "margin": args.margin,
+                   "stake": "flat 1u", "rule": "bet model's max value edge if picked odds in band",
+                   "never": ["longshots odds>=hi", "parlays/combos"]},
+        "n_train": len(df), "n_features": len(feats),
+        "leaky_excluded": sorted(LEAKY),
+        "holdout_eval": {"accuracy_pct": round(acc*100,1), **hold},
+        "caveat": "CSV odds are a static capture, not verified closing. Forward paper-trade with real CLV before staking.",
+    }
+    with open(os.path.join(OUT, "metadata.json"), "w", encoding="utf-8") as f:
+        json.dump(meta, f, ensure_ascii=False, indent=2)
+    print(f"\n✅ Saved production model to {OUT}/")
+    print(f"   model.json, feature_cols.json ({len(feats)} feats), metadata.json")
+    print("\nNEXT: serving wrapper that loads this + applies the policy to upcoming")
+    print("matches, logs paper-trade picks, and we measure real forward CLV/ROI.")
+
+
+# Run the training pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,191 @@
+"""
+Walk-Forward Odds-Blind Experiment — THE pivotal test.
+======================================================
+Question this answers: can a model BEAT THE MARKET out-of-sample, betting only
+on information the price doesn't already contain?
+
+Method (no leakage, time-ordered):
+  * data sorted by kickoff (mst_utc); train on the past, test on the future,
+    rolled over several folds.
+  * TWO models on the MS (1X2) market:
+       ALL    = every feature INCLUDING the bookmaker odds (what the live
+                engine does -> it mostly re-learns the price).
+       BLIND  = identical but odds/implied/_present columns REMOVED, so the
+                model must disagree with the market using fundamentals only.
+  * For each, an honest value-bet simulation on the test fold using the REAL
+    odds payouts (margin included): bet the outcome with the biggest
+    model_prob - implied_prob edge above a margin; ROI = realized P/L per 1u.
+
+Read: if BLIND's value ROI is consistently > 0 across folds, there is a real,
+exploitable lead. If both are <= 0 (expected), these markets aren't beatable
+with this data and the honest move is to stop staking.
+
+Usage:
+  python scripts/walkforward_oddsblind.py
+  python scripts/walkforward_oddsblind.py --folds 6 --estimators 300
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import numpy as np
+import pandas as pd
+
+# Best-effort: switch stdout to UTF-8 when the stream supports it. Any failure
+# is deliberately swallowed so the script still runs with the default encoding.
+if sys.stdout and hasattr(sys.stdout, "reconfigure"):
+    try:
+        sys.stdout.reconfigure(encoding="utf-8")
+    except Exception:
+        pass
+
+# AI_DIR is the parent of the directory that contains this script; the
+# training CSV is resolved relative to it.
+AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
+
+# Imported after the setup above on purpose (hence the E402 suppression).
+import xgboost as xgb  # noqa: E402
+
+# Identifier, kickoff-time and score columns: excluded from every feature set.
+META = {"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
+        "score_home", "score_away", "ht_score_home", "ht_score_away"}
+
+# Confirmed target leakage: *_goals_form integer-valued and ~0.63 correlated
+# with THIS match's goals; their diff equals the actual goal diff 73% of the
+# time. Excluded so the experiment measures genuine pre-match predictive power.
+# Columns listed here are dropped from both the ALL and the BLIND feature sets.
+LEAKY = {
+    # CONFIRMED (encode the actual match result):
+    "home_goals_form", "away_goals_form",  # ~0.63 corr w/ this match's goals
+    "total_goals",                          # this match's full-time total
+    "ht_total_goals",                       # this match's half-time total
+    # STRONG SUSPECTS (dominate importance + high outcome corr; audit extractor):
+    "squad_diff", "home_squad_quality", "away_squad_quality",
+    "referee_home_bias", "referee_avg_goals",
+}
+
+
+def is_odds_col(c: str) -> bool:
+    """Tell whether a column name marks bookmaker-price information.
+
+    A column counts as price-derived when its name contains "odds" or
+    "implied", compared case-insensitively.
+
+    NOTE(review): the module docstring also lists ``_present`` columns as
+    removed for BLIND; they are matched here only if their names contain
+    "odds" or "implied" as well — confirm against the CSV schema.
+    """
+    lowered = c.lower()
+    return any(marker in lowered for marker in ("odds", "implied"))
+
+
+def logloss(y: np.ndarray, p: np.ndarray) -> float:
+    """Mean negative log-probability assigned to the true class.
+
+    ``p`` holds one row of class probabilities per sample and ``y`` the integer
+    index of the true class. Probabilities are clipped to [1e-9, 1 - 1e-9]
+    before the logarithm so a zero probability cannot produce an infinity.
+    """
+    clipped = np.clip(p, 1e-9, 1 - 1e-9)
+    sample_idx = np.arange(len(y))
+    true_class_prob = clipped[sample_idx, y]
+    return float(-np.mean(np.log(true_class_prob)))
+
+
+def value_sim(proba: np.ndarray, y: np.ndarray, odds: np.ndarray,
+              margin: float) -> dict:
+    """Bet the class with the biggest (model_prob - 1/odds) edge above margin.
+
+    Args:
+        proba: (n, k) model probabilities, one column per outcome.
+        y: (n,) index of the outcome that actually happened.
+        odds: (n, k) decimal odds aligned with ``proba``. Entries <= 1.0
+            (including a 0.0 fill for missing prices) are treated as unpriced.
+        margin: minimum edge required to place a flat 1-unit bet.
+
+    Returns:
+        dict with ``n`` (number of bets), ``roi`` (percent profit per unit
+        staked, rounded to 2 dp) and ``hit`` (percent of bets won, rounded to
+        1 dp). ``roi`` and ``hit`` are None when no bet qualifies.
+    """
+    proba = np.asarray(proba, dtype=float)
+    odds = np.asarray(odds, dtype=float)
+    y = np.asarray(y)
+
+    # Divide only where a real price exists, so missing (0.0) odds do not
+    # trigger a divide-by-zero RuntimeWarning.
+    priced = odds > 1.0
+    implied = np.full(odds.shape, np.nan)
+    np.divide(1.0, odds, out=implied, where=priced)
+    # Classes without valid odds get -inf: they can never be picked as value,
+    # whatever margin is passed.
+    edge = np.where(priced, proba - implied, -np.inf)
+
+    rows = np.arange(len(y))
+    pick = np.argmax(edge, axis=1)
+    best_edge = edge[rows, pick]
+    bet = best_edge > margin
+    n = int(bet.sum())
+    if n == 0:
+        return {"n": 0, "roi": None, "hit": None}
+
+    win = (pick == y) & bet
+    pick_odds = odds[rows, pick]
+    # Real payout on a win (odds include the bookmaker margin), -1 unit on a loss.
+    pnl = np.where(win, pick_odds - 1.0, -1.0)[bet]
+    # Built-in floats keep the result printable/serialisable without NumPy reprs.
+    return {"n": n, "roi": round(float(100.0 * pnl.sum() / n), 2),
+            "hit": round(float(100.0 * win[bet].sum() / n), 1)}
+
+
+def train_eval(Xtr, ytr, Xte, yte, odds_te, est, margins):
+    """Fit a 3-class XGBoost model on one fold and score it on the test slice.
+
+    Trains ``est`` boosting rounds on (Xtr, ytr), predicts class probabilities
+    for Xte, and returns a dict with the test ``logloss`` (4 dp), ``acc`` in
+    percent (1 dp) and, for every margin ``mg`` in ``margins``, the
+    ``value_sim`` result under the key ``val@<mg>``.
+    """
+    booster_cfg = {
+        "objective": "multi:softprob",
+        "num_class": 3,
+        "max_depth": 5,
+        "eta": 0.05,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "tree_method": "hist",
+        "verbosity": 0,
+    }
+    train_matrix = xgb.DMatrix(Xtr, label=ytr)
+    test_matrix = xgb.DMatrix(Xte)
+    model = xgb.train(booster_cfg, train_matrix, num_boost_round=est)
+    proba = model.predict(test_matrix)
+
+    top_pick_correct = proba.argmax(1) == yte
+    report = {"logloss": round(logloss(yte, proba), 4),
+              "acc": round(100.0 * top_pick_correct.mean(), 1)}
+    # One value-bet simulation per margin, all on the same probabilities.
+    report.update({f"val@{mg}": value_sim(proba, yte, odds_te, mg) for mg in margins})
+    return report
+
+
+def main() -> int:
+    """Run the ALL-vs-BLIND walk-forward experiment and print the results.
+
+    Loads the CSV sorted by ``mst_utc``, derives the 1X2 outcome from the
+    scores, builds the ALL and BLIND (no odds/implied columns) feature
+    matrices, then evaluates both with an expanding training window over
+    ``--folds`` consecutive test slices taken from the last ``--test-frac`` of
+    the rows. Per-fold lines and a bet-weighted aggregate are printed.
+
+    Returns:
+        0 on success, 1 when no rows are left to train the first fold; the
+        value is used as the process exit status.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--folds", type=int, default=5)
+    ap.add_argument("--estimators", type=int, default=250)
+    ap.add_argument("--test-frac", type=float, default=0.5,
+                    help="Fraction at the end used as rolling OOS (default 0.5)")
+    args = ap.parse_args()
+
+    # Fail fast on settings that would otherwise crash inside training (e.g. a
+    # test fraction of 1.0 leaves fold 0 with no training rows at all).
+    if args.folds < 1:
+        ap.error("--folds must be >= 1")
+    if args.estimators < 1:
+        ap.error("--estimators must be >= 1")
+    if not 0.0 < args.test_frac < 1.0:
+        ap.error("--test-frac must be strictly between 0 and 1")
+
+    print(f"Loading {CSV} ...")
+    df = pd.read_csv(CSV, low_memory=False)
+    df = df.sort_values("mst_utc").reset_index(drop=True)
+    print(f"  {len(df)} rows, {df.shape[1]} cols")
+
+    # Derive true MS outcome from scores: 0=home,1=draw,2=away (robust, no label trust)
+    sh = pd.to_numeric(df["score_home"], errors="coerce")
+    sa = pd.to_numeric(df["score_away"], errors="coerce")
+    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
+    # Rows without both scores would all fall into class 2 above; drop them.
+    valid = sh.notna() & sa.notna()
+    df, y = df[valid].reset_index(drop=True), y[valid.values]
+
+    # Missing/non-numeric odds become 0.0, which value_sim treats as unpriced.
+    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(
+        pd.to_numeric, errors="coerce").fillna(0.0).values
+
+    feat_all = [c for c in df.columns if c not in META and not c.startswith("label_")
+                and c not in LEAKY]
+    feat_blind = [c for c in feat_all if not is_odds_col(c)]
+    print(f"  excluded leaky cols: {sorted(LEAKY)}")
+    Xall = df[feat_all].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+    Xblind = df[feat_blind].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+    print(f"  features: ALL={len(feat_all)}  BLIND={len(feat_blind)} "
+          f"(dropped {len(feat_all)-len(feat_blind)} odds cols)")
+    print(f"  base rates: home={100*(y==0).mean():.1f}% draw={100*(y==1).mean():.1f}% "
+          f"away={100*(y==2).mean():.1f}%")
+
+    n = len(df)
+    start = int(n * (1 - args.test_frac))
+    if start < 1:
+        print(f"Not enough rows ({n}) to leave training history before the first fold.")
+        return 1
+    # Fold i tests rows [bounds[i], bounds[i+1]) and trains on everything before.
+    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
+    margins = [0.0, 0.05, 0.10]
+
+    agg = {"ALL": {f"val@{m}": [] for m in margins}, "BLIND": {f"val@{m}": [] for m in margins}}
+    agg["ALL"]["logloss"] = []
+    agg["BLIND"]["logloss"] = []
+
+    print(f"\nWalk-forward: {args.folds} folds, train=expanding, est={args.estimators}\n")
+    hdr = f"{'fold':<5}{'model':<7}{'logloss':>9}{'acc%':>7}" + "".join(
+        f"{('val@'+str(m)):>22}" for m in margins)
+    print(hdr)
+    print("-" * len(hdr))
+    for i in range(args.folds):
+        te0, te1 = bounds[i], bounds[i + 1]
+        if te1 - te0 < 50:
+            # Too few test rows for a meaningful fold; say so instead of
+            # dropping it silently.
+            print(f"{i:<5}skipped: only {te1 - te0} test rows (< 50)\n")
+            continue
+        tr = slice(0, te0)
+        te = slice(te0, te1)
+        for name, X in (("ALL", Xall), ("BLIND", Xblind)):
+            r = train_eval(X.iloc[tr].values, y[tr], X.iloc[te].values, y[te],
+                           odds[te], args.estimators, margins)
+            agg[name]["logloss"].append(r["logloss"])
+            cells = ""
+            for m in margins:
+                v = r[f"val@{m}"]
+                agg[name][f"val@{m}"].append(v)
+                cells += f"{('n=' + str(v['n']) + ' roi=' + str(v['roi'])):>22}"
+            print(f"{i:<5}{name:<7}{r['logloss']:>9}{r['acc']:>7}{cells}")
+        print()
+
+    print("=" * 70)
+    print("AGGREGATE (sum bets, weighted ROI across folds)")
+    print("=" * 70)
+    for name in ("ALL", "BLIND"):
+        ll = np.mean(agg[name]["logloss"]) if agg[name]["logloss"] else float("nan")
+        print(f"\n{name}  mean logloss={ll:.4f}")
+        for m in margins:
+            vs = agg[name][f"val@{m}"]
+            tot_n = sum(v["n"] for v in vs)
+            # Net units are rebuilt from each fold's ROI, which value_sim has
+            # already rounded to 2 dp, so the total is exact only to that precision.
+            tot_pnl = sum((v["roi"] / 100.0 * v["n"]) for v in vs if v["roi"] is not None)
+            roi = round(100.0 * tot_pnl / tot_n, 2) if tot_n else None
+            print(f"   margin {m}: total_bets={tot_n:>6}  ROI(flat1u)={roi}%")
+    print("\nREAD: BLIND ROI>0 across margins/folds = real edge. Both <=0 = no")
+    print("exploitable edge in MS with this data (stop staking; the -EV is the vig).")
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the
+# process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())