# iddaai-be/ai-engine/scripts/walkforward_oddsblind.py

"""
Walk-Forward Odds-Blind Experiment — THE pivotal test.
======================================================
Question this answers: can a model BEAT THE MARKET out-of-sample, betting only
on information the price doesn't already contain?

Method (no leakage, time-ordered):
  * data sorted by kickoff (mst_utc); train on the past, test on the future,
    rolled over several folds.
  * TWO models on the MS (1X2) market:
       ALL    = every feature INCLUDING the bookmaker odds (what the live
                engine does -> it mostly re-learns the price).
       BLIND  = identical but odds/implied/_present columns REMOVED, so the
                model must disagree with the market using fundamentals only.
  * For each, an honest value-bet simulation on the test fold using the REAL
    odds payouts (margin included): bet the outcome with the biggest
    model_prob - implied_prob edge above a margin; ROI = realized P/L per 1u.

Read: if BLIND's value ROI is consistently > 0 across folds, there is a real,
exploitable lead. If both are <= 0 (expected), these markets aren't beatable
with this data and the honest move is to stop staking.

Usage:
  python scripts/walkforward_oddsblind.py
  python scripts/walkforward_oddsblind.py --folds 6 --estimators 300
"""
from __future__ import annotations

import argparse
import os
import sys
import numpy as np
import pandas as pd

# Best-effort: ask stdout to encode as UTF-8 so non-ASCII characters (e.g. the
# dash in the module docstring shown by --help) can be printed. Streams without
# a `reconfigure` method are left alone, and any failure is deliberately
# ignored because the encoding tweak is cosmetic, not required for the run.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# AI_DIR is the parent of the directory that contains this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Training data read by main(): <AI_DIR>/data/training_data_v27.csv.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

import xgboost as xgb  # noqa: E402

# Identifier, kickoff-time and score columns. main() never offers these to the
# models as features; the score columns are only used to derive the outcome.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns removed from every feature set because they leak, or are suspected
# of leaking, the result of the match being predicted. Dropping them keeps the
# experiment a measure of genuine pre-match predictive power.
LEAKY = {
    # Confirmed leakage (these encode the actual match result). The
    # *_goals_form columns are integer-valued, ~0.63 correlated with this
    # match's goals, and their difference equals the actual goal difference
    # 73% of the time.
    "home_goals_form",
    "away_goals_form",
    # This match's full-time and half-time goal totals.
    "total_goals",
    "ht_total_goals",
    # Strong suspects: they dominate feature importance and correlate highly
    # with the outcome; the feature extractor should be audited.
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}


def is_odds_col(c: str) -> bool:
    """Return True if the column name contains "odds" or "implied".

    The match is a case-insensitive substring test, so any column whose name
    mentions either token is treated as market-derived.
    """
    lowered = c.lower()
    return any(token in lowered for token in ("odds", "implied"))


def logloss(y: np.ndarray, p: np.ndarray) -> float:
    """Mean negative log-probability assigned to the true class.

    Args:
        y: Integer class index per row.
        p: Per-row class probabilities, indexed as ``p[row, class]``.

    Returns:
        The multiclass log loss as a plain Python float.
    """
    # Clip away exact 0/1 so the logarithm stays finite.
    clipped = np.clip(p, 1e-9, 1 - 1e-9)
    rows = np.arange(len(y))
    true_class_prob = clipped[rows, y]
    return float(-np.log(true_class_prob).mean())


def value_sim(proba: np.ndarray, y: np.ndarray, odds: np.ndarray,
              margin: float) -> dict:
    """Simulate flat 1-unit value bets and report the realized result.

    For every row the outcome with the largest ``model_prob - 1/odds`` edge is
    selected, and a bet is placed only when that edge is strictly greater than
    ``margin``. At most one bet is placed per row.

    Args:
        proba: Model probabilities, indexed as ``proba[row, class]``.
        y: True class index per row.
        odds: Decimal odds with the same layout as ``proba``. Entries that are
            not greater than 1.0 (including 0 used as a "missing" filler) are
            treated as unavailable and are never bet on.
        margin: Minimum edge required to place a bet.

    Returns:
        ``{"n": bets placed, "roi": profit in percent of total stake (2 dp),
        "hit": percent of bets won (1 dp)}``; ``roi`` and ``hit`` are ``None``
        when no bet qualifies.
    """
    proba = np.asarray(proba, dtype=float)
    odds = np.asarray(odds, dtype=float)
    y = np.asarray(y)

    # Divide only where a usable price exists; this avoids the divide-by-zero
    # warnings the unmasked 1/odds produced for missing (0.0) odds.
    valid = odds > 1.0
    implied = np.zeros_like(odds)
    np.divide(1.0, odds, out=implied, where=valid)

    # -inf (instead of a finite sentinel) guarantees an outcome without a
    # price can never clear the margin, however low the margin is set.
    edge = np.where(valid, proba - implied, -np.inf)

    rows = np.arange(len(y))
    pick = np.argmax(edge, axis=1)
    bet = edge[rows, pick] > margin
    n = int(bet.sum())
    if n == 0:
        return {"n": 0, "roi": None, "hit": None}

    win = (pick == y)[bet]
    pick_odds = odds[rows, pick][bet]
    # A winning unit stake returns odds - 1; a losing one costs the stake.
    pnl = np.where(win, pick_odds - 1.0, -1.0)
    return {"n": n,
            "roi": round(float(100.0 * pnl.sum() / n), 2),
            "hit": round(float(100.0 * win.sum() / n), 1)}


def train_eval(Xtr, ytr, Xte, yte, odds_te, est, margins):
    """Train one 3-class booster on the train split and score the test split.

    Args:
        Xtr, ytr: Training features and class labels.
        Xte, yte: Test features and class labels.
        odds_te: Odds for the test rows, forwarded to ``value_sim``.
        est: Number of boosting rounds.
        margins: Edge thresholds; one value-bet simulation is run per entry.

    Returns:
        Dict with ``"logloss"`` (4 dp), ``"acc"`` (percent, 1 dp) and one
        ``"val@<margin>"`` entry per margin holding the ``value_sim`` result.
    """
    booster_params = dict(
        objective="multi:softprob",
        num_class=3,
        max_depth=5,
        eta=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
        verbosity=0,
    )
    model = xgb.train(booster_params, xgb.DMatrix(Xtr, label=ytr),
                      num_boost_round=est)
    proba = model.predict(xgb.DMatrix(Xte))

    # Accuracy: share of rows whose most probable class is the true one.
    correct = proba.argmax(1) == yte
    result = {
        "logloss": round(logloss(yte, proba), 4),
        "acc": round(100.0 * correct.mean(), 1),
    }
    result.update(
        {f"val@{mg}": value_sim(proba, yte, odds_te, mg) for mg in margins}
    )
    return result


def main() -> int:
    """Run the walk-forward ALL-vs-BLIND comparison and print the results.

    Loads the training CSV, orders it by ``mst_utc``, derives the 1X2 outcome
    from the final score, then for each fold trains on every row before the
    fold (expanding window) and evaluates on the fold itself: once with all
    features and once with the odds-derived columns removed.

    Returns:
        0 on success; 1 if the CSV file or a required column is missing.
        Invalid command-line values terminate through ``argparse`` (exit 2).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's hand-made layout in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--estimators", type=int, default=250)
    ap.add_argument("--test-frac", type=float, default=0.5,
                    help="Fraction at the end used as rolling OOS (default 0.5)")
    args = ap.parse_args()
    if args.folds < 1:
        ap.error("--folds must be >= 1")
    # A fraction of 1 (or more) leaves no history to train the first fold on.
    if not 0.0 < args.test_frac < 1.0:
        ap.error("--test-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    if not os.path.isfile(CSV):
        print(f"ERROR: training data not found: {CSV}", file=sys.stderr)
        return 1
    df = pd.read_csv(CSV, low_memory=False)
    odds_cols = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
    required = ["mst_utc", "score_home", "score_away"] + odds_cols
    missing = [c for c in required if c not in df.columns]
    if missing:
        print(f"ERROR: required columns missing from CSV: {missing}",
              file=sys.stderr)
        return 1
    df = df.sort_values("mst_utc").reset_index(drop=True)
    print(f"  {len(df)} rows, {df.shape[1]} cols")

    # Derive true MS outcome from scores: 0=home,1=draw,2=away (robust, no label trust)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    # Rows without a parseable score cannot be labelled, so drop them.
    valid = sh.notna() & sa.notna()
    df, y = df[valid].reset_index(drop=True), y[valid.values]

    # Missing/unparseable odds become 0.0, which value_sim treats as "no price".
    odds = df[odds_cols].apply(
        pd.to_numeric, errors="coerce").fillna(0.0).values

    feat_all = [c for c in df.columns
                if c not in META and not c.startswith("label_")
                and c not in LEAKY]
    feat_blind = [c for c in feat_all if not is_odds_col(c)]
    print(f"  excluded leaky cols: {sorted(LEAKY)}")
    Xall = df[feat_all].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    Xblind = df[feat_blind].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    print(f"  features: ALL={len(feat_all)}  BLIND={len(feat_blind)} "
          f"(dropped {len(feat_all)-len(feat_blind)} odds cols)")
    print(f"  base rates: home={100*(y==0).mean():.1f}% draw={100*(y==1).mean():.1f}% "
          f"away={100*(y==2).mean():.1f}%")

    # The last `test_frac` of the time-ordered rows is cut into equal folds.
    n = len(df)
    start = int(n * (1 - args.test_frac))
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    margins = [0.0, 0.05, 0.10]

    agg = {
        name: {"logloss": [], **{f"val@{m}": [] for m in margins}}
        for name in ("ALL", "BLIND")
    }

    print(f"\nWalk-forward: {args.folds} folds, train=expanding, est={args.estimators}\n")
    hdr = f"{'fold':<5}{'model':<7}{'logloss':>9}{'acc%':>7}" + "".join(
        f"{('val@'+str(m)):>22}" for m in margins)
    print(hdr)
    print("-" * len(hdr))
    for i in range(args.folds):
        te0, te1 = bounds[i], bounds[i + 1]
        # Skip folds with no training history or too few test rows to be
        # meaningful, and say so instead of silently dropping them.
        if te0 == 0 or te1 - te0 < 50:
            print(f"{i:<5}skipped (train={te0} rows, test={te1 - te0} rows)\n")
            continue
        tr = slice(0, te0)
        te = slice(te0, te1)
        for name, X in (("ALL", Xall), ("BLIND", Xblind)):
            r = train_eval(X.iloc[tr].values, y[tr], X.iloc[te].values, y[te],
                           odds[te], args.estimators, margins)
            agg[name]["logloss"].append(r["logloss"])
            cells = ""
            for m in margins:
                v = r[f"val@{m}"]
                agg[name][f"val@{m}"].append(v)
                cells += f"{('n=' + str(v['n']) + ' roi=' + str(v['roi'])):>22}"
            print(f"{i:<5}{name:<7}{r['logloss']:>9}{r['acc']:>7}{cells}")
        print()

    print("=" * 70)
    print("AGGREGATE (sum bets, weighted ROI across folds)")
    print("=" * 70)
    for name in ("ALL", "BLIND"):
        ll = np.mean(agg[name]["logloss"]) if agg[name]["logloss"] else float("nan")
        print(f"\n{name}  mean logloss={ll:.4f}")
        for m in margins:
            vs = agg[name][f"val@{m}"]
            tot_n = sum(v["n"] for v in vs)
            # Rebuild each fold's P/L from its (rounded) ROI and bet count so
            # the aggregate is weighted by the number of bets per fold.
            tot_pnl = sum((v["roi"] / 100.0 * v["n"]) for v in vs if v["roi"] is not None)
            roi = round(100.0 * tot_pnl / tot_n, 2) if tot_n else None
            print(f"   margin {m}: total_bets={tot_n:>6}  ROI(flat1u)={roi}%")
    print("\nREAD: BLIND ROI>0 across margins/folds = real edge. Both <=0 = no")
    print("exploitable edge in MS with this data (stop staking; the -EV is the vig).")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())