iddaai-be/ai-engine/scripts/edge_search.py

"""
Edge Search — is there a profitable POCKET (by league) the global model misses?
==============================================================================
Global leak-free MS is ~-5.6% (the vig). But efficiency varies: obscure / low-
tier leagues may be mispriced. This walks a leak-free model forward and slices
the value-bet ROI by LEAGUE-RELIABILITY BAND, by ODDS BAND and by their cross,
requiring a real sample AND multi-fold consistency so we don't chase one lucky
window.

Leak-free: drops the confirmed/suspected leakage columns (see LEAKY). Uses odds
in features (realistic). Value bet = biggest model_prob - implied edge > margin.

⚠️ Even a positive pocket here is a LEAD, not proof: the CSV odds are a static
capture, not the verified closing line. Anything flagged must be forward-
validated with real CLV (capture_closing_odds.py) before staking.

Usage: python scripts/edge_search.py --folds 6 --min-bets 150
"""
from __future__ import annotations
import argparse, os, sys, time
import numpy as np, pandas as pd, xgboost as xgb

# Switch stdout to UTF-8 when the stream supports it, so the non-ASCII
# characters this script prints do not fail on consoles with a narrower
# default encoding.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        # Best effort only: keep whatever encoding the stream already has.
        pass

# Directory two levels above this file; put first on sys.path so that
# imports such as `data.db` resolve relative to it.
AI_DIR = os.path.dirname(
    os.path.dirname(
        os.path.abspath(__file__)
    )
)
sys.path.insert(0, AI_DIR)

# Training table read by main().
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier / outcome columns that are never used as model features.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns excluded from the feature set as confirmed/suspected leakage.
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}


def league_names(ids):
    """Resilient id->name lookup against the ``leagues`` table.

    Args:
        ids: iterable of league ids; ``None`` entries are skipped and every
            other entry is stringified before being sent to the database.

    Returns:
        dict mapping ``str(id)`` to the league name. Empty when there is
        nothing to look up or when all attempts fail (best effort: database
        errors are reported on stderr, never raised).
    """
    wanted = [str(i) for i in ids if i is not None]
    if not wanted:
        # Nothing to resolve: do not even require the DB stack to be importable.
        return {}

    from data.db import get_clean_dsn
    import psycopg2
    from psycopg2.extras import RealDictCursor

    attempts = 3
    for attempt in range(1, attempts + 1):
        conn = None
        try:
            conn = psycopg2.connect(get_clean_dsn())
            # `with conn` only commits/rolls back the transaction; the
            # connection itself is closed in the `finally` below.
            with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (wanted,))
                rows = cur.fetchall()
            return {str(r["id"]): r["name"] for r in rows}
        except Exception as exc:
            # Still best effort, but no longer silent.
            print(f"league_names: attempt {attempt}/{attempts} failed: {exc}",
                  file=sys.stderr)
            if attempt < attempts:
                time.sleep(1.0)  # brief back-off; pointless after the last try
        finally:
            if conn is not None:
                conn.close()
    return {}


def _rel_band_labels(rel, qs):
    """Label each reliability score with its band; negative scores are 'rel:unknown'."""
    q1, q2, q3 = qs
    # np.select takes the first matching condition, like an if/elif chain.
    return np.select(
        [rel < 0, rel < q1, rel < q2, rel < q3],
        ["rel:unknown", f"rel:Q1(<{q1:.2f})", "rel:Q2", "rel:Q3"],
        default=f"rel:Q4(>={q3:.2f})",
    )


def _odds_band_labels(picked_odds):
    """Label each picked decimal odd with its half-open odds band."""
    return np.select(
        [picked_odds < 1.5, picked_odds < 2, picked_odds < 3, picked_odds < 5, picked_odds < 8],
        ["<1.5", "1.5-2", "2-3", "3-5", "5-8"],
        default="8+",
    )


def _print_bucket_table(bets, key, title, min_bets, top=None):
    """Print bets/hit%/ROI%/positive-fold count per value of column `key`, best ROI first."""
    if bets.empty:
        return
    print(f"\n{title}")
    print(f"  {'bucket':<26}{'bets':>6}{'hit%':>7}{'ROI%':>8}{'folds+':>8}")
    print("  " + "-" * 54)
    rows = []
    for name, grp in bets.groupby(key):
        nb = len(grp)
        if nb < min_bets:
            continue
        fold_pnl = grp.groupby("fold")["pnl"].sum()
        rows.append((100 * grp["pnl"].sum() / nb, name, nb, 100 * grp["win"].sum() / nb,
                     int((fold_pnl > 0).sum()), fold_pnl.shape[0]))
    rows.sort(reverse=True)
    for roi, name, nb, hit, folds_pos, folds_total in rows[:top]:
        print(f"  {name:<26}{nb:>6}{hit:>7.1f}{roi:>8.1f}"
              f"{str(folds_pos) + '/' + str(folds_total):>8}")


def main():
    """Walk a leak-free 1X2 model forward and report value-bet ROI per bucket.

    Steps:
      1. Load CSV, sort by ``mst_utc`` and keep rows with numeric scores.
      2. Build features from every column not in META / LEAKY / ``label_*``.
      3. Split the second half of the rows into ``--folds`` consecutive test
         windows; for each window train on all earlier rows and predict.
      4. Per match, pick the outcome with the largest ``model_prob - 1/odds``
         and bet one unit on it when that edge exceeds ``--margin``.
      5. Print global ROI, then ROI by league-reliability band, by odds band
         and by their cross (buckets with fewer than ``--min-bets`` bets are
         hidden; the cross table shows the top 15).

    Each bet is recorded once with separate band columns, so the three tables
    are grouped by column and cannot bleed into each other.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--folds", type=int, default=6)
    ap.add_argument("--estimators", type=int, default=200)
    ap.add_argument("--margin", type=float, default=0.0)
    ap.add_argument("--min-bets", type=int, default=150)
    args = ap.parse_args()
    if args.folds < 1:
        ap.error("--folds must be >= 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    # Class encoding: 0 = home win, 1 = draw, 2 = away win (same order as the odds columns).
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))
    # Missing / unparsable odds become 0.0 and are treated as "no price" below.
    odds = (df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]]
            .apply(pd.to_numeric, errors="coerce").fillna(0.0).values)

    feats = [c for c in df.columns
             if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    # -1.0 marks "no reliability score" (column absent or value missing).
    rel = pd.to_numeric(df.get("league_reliability_score", pd.Series([np.nan] * len(df))),
                        errors="coerce").fillna(-1.0).values
    print(f"  {len(df):,} rows  features={len(feats)} (leak-free)  folds={args.folds}")

    n = len(df)
    start = int(n * 0.5)
    bounds = np.linspace(start, n, args.folds + 1, dtype=int)
    params = {"objective": "multi:softprob", "num_class": 3, "max_depth": 5, "eta": 0.05,
              "subsample": 0.8, "colsample_bytree": 0.8, "tree_method": "hist", "verbosity": 0}

    # Reliability quartile edges over rows that have a score (rel >= 0).
    rv = rel[rel >= 0]
    qs = np.quantile(rv, [0.25, 0.5, 0.75]) if len(rv) else [0.3, 0.5, 0.7]
    rel_labels = _rel_band_labels(rel, qs)

    frames = []  # one DataFrame of placed bets per fold
    for fi in range(args.folds):
        te0, te1 = bounds[fi], bounds[fi + 1]
        if te1 - te0 < 50:
            continue  # test window too small to be meaningful
        bst = xgb.train(params, xgb.DMatrix(X[:te0], label=y[:te0]),
                        num_boost_round=args.estimators)
        proba = bst.predict(xgb.DMatrix(X[te0:te1]))
        yte, ote = y[te0:te1], odds[te0:te1]
        row_idx = np.arange(len(yte))

        # Only odds > 1.0 are a usable price. Divide by a dummy 1.0 elsewhere
        # so the zero-filled missing odds never trigger a divide-by-zero.
        valid = ote > 1.0
        implied = np.where(valid, 1.0 / np.where(valid, ote, 1.0), np.nan)
        edge = np.where(valid, proba - implied, -9.0)
        pick = edge.argmax(1)
        # Require a valid price on the picked outcome, so a very negative
        # --margin cannot turn rows without odds into bets.
        bet = (edge[row_idx, pick] > args.margin) & valid[row_idx, pick]
        win = (pick == yte) & bet
        pick_odds = ote[row_idx, pick]
        pnl = np.where(win, pick_odds - 1.0, -1.0)  # flat 1-unit stake

        n_bets = int(bet.sum())
        if n_bets:
            frames.append(pd.DataFrame({
                "rel": rel_labels[te0:te1][bet],
                "odds": _odds_band_labels(pick_odds[bet]),
                "fold": np.full(n_bets, fi),
                "pnl": pnl[bet],
                "win": win[bet].astype(int),
            }))
        print(f"  fold {fi}: tested {len(yte):,}  bets {n_bets:,}")

    if frames:
        bets = pd.concat(frames, ignore_index=True)
        bets["rel_x_odds"] = bets["rel"] + " x " + bets["odds"]
    else:
        bets = pd.DataFrame(columns=["rel", "odds", "fold", "pnl", "win", "rel_x_odds"])

    total = len(bets)
    print("\n" + "=" * 78)
    print(f"GLOBAL leak-free: bets={total:,}  hit={100 * bets['win'].sum() / max(total, 1):.1f}%  "
          f"ROI(flat1u)={100 * bets['pnl'].sum() / max(total, 1):.2f}%")
    print("=" * 78)

    _print_bucket_table(
        bets, "rel",
        "BY LEAGUE-RELIABILITY BAND  (Q1=most obscure ... Q4=most reliable)",
        args.min_bets)
    _print_bucket_table(bets, "odds", "BY ODDS BAND", args.min_bets)
    _print_bucket_table(
        bets, "rel_x_odds",
        "BY RELIABILITY x ODDS  (candidate pockets, n>=min-bets)",
        args.min_bets, top=15)

    print("\nREAD: a pocket is a real LEAD only if ROI>0 AND positive in MOST folds")
    print("(folds+ near full) AND bets large. +ROI in 1-2 folds = noise / overfit.")
    print("Then forward-validate with CLV (capture_closing_odds.py) before staking.")


if __name__ == "__main__":
    main()