"""
Compare two diagnostic_backtest CSV outputs side-by-side.

Used to validate that a filter change actually improved ROI vs the baseline
run — and to detect overfitting (in-sample success but out-of-sample collapse).

Usage:
    python scripts/compare_backtests.py <baseline.csv> <validation.csv>
    python scripts/compare_backtests.py        (auto-picks 2 most recent CSVs)
"""
import glob
import os
import sys
from typing import Dict, Optional, Tuple

import pandas as pd

REPORTS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "reports")

# Width of the horizontal rules printed between report sections.
_RULE_WIDTH = 100

# Validation runs with fewer playable bets than this get no ROI verdict.
_MIN_PLAYABLE_FOR_VERDICT = 20

# ROI change (in percentage points) treated as material, in either direction.
_MATERIAL_ROI_DELTA_PP = 5

# Veto names whose occurrence counts in the "bb_vetoes" column are reported.
_NEW_VETOES = (
    "market_muted_by_backtest",
    "negative_ev_edge",
    "ev_edge_too_high_trap",
    "outside_envelope_edge_low",
    "outside_envelope_edge_high",
    "outside_envelope_odds_low",
    "outside_envelope_v27_must_agree",
)


def load(path: str) -> pd.DataFrame:
    """Read a backtest CSV and add a normalised ``won_bool`` column.

    Args:
        path: Path to a CSV file that has at least a ``won`` column.

    Returns:
        The DataFrame read from ``path`` with an extra ``won_bool`` column.
        Values of ``won`` that are not in the mapping below become NaN, and
        ``stats`` excludes those rows from the playable set.
    """
    df = pd.read_csv(path)
    # True/1 and False/0 compare and hash equal in Python, so the 1/0 entries
    # are aliases of the boolean keys; they are kept to make the intent explicit.
    df["won_bool"] = df["won"].map(
        {True: True, False: False, "True": True, "False": False, 1: True, 0: False}
    )
    return df


def stats(df: pd.DataFrame, mask: Optional[pd.Series] = None) -> Dict:
    """Summarise betting results for ``df``, optionally restricted by ``mask``.

    A row counts as playable when its ``playable`` value equals True and its
    ``won_bool`` value is not missing.

    Args:
        df: Frame with ``playable``, ``won_bool``, ``unit_profit`` and
            ``stake_units`` columns.
        mask: Optional boolean indexer applied to ``df`` before summarising.

    Returns:
        A dict with the keys ``n_total``, ``n_playable``, ``wins``, ``losses``,
        ``hit`` (win percentage), ``profit``, ``staked`` and ``roi`` (profit as
        a percentage of stake). The same keys are present whether or not any
        row is playable; with no playable rows every figure except ``n_total``
        is 0.
    """
    if mask is not None:
        df = df[mask]

    # Element-wise comparison: a plain truthiness test does not work on a Series.
    is_playable = df["playable"].eq(True) & df["won_bool"].notna()
    playable = df[is_playable]
    n_playable = len(playable)

    if n_playable == 0:
        return {
            "n_total": len(df),
            "n_playable": 0,
            "wins": 0,
            "losses": 0,
            "hit": 0,
            "profit": 0,
            "staked": 0,
            "roi": 0,
        }

    wins = playable["won_bool"].sum()
    profit = playable["unit_profit"].sum()
    staked = playable["stake_units"].sum()
    return {
        "n_total": int(len(df)),
        "n_playable": int(n_playable),
        "wins": int(wins),
        "losses": int(n_playable - wins),
        "hit": round(100.0 * wins / n_playable, 2),
        "profit": round(profit, 2),
        "staked": round(staked, 2),
        # Guard against a zero total stake to avoid dividing by zero.
        "roi": round(100.0 * profit / staked, 2) if staked else 0,
    }


def line(label: str, a: Dict, b: Dict, suffix: str = ""):
    """Print one report row showing each metric as ``baseline → validation``.

    Args:
        label: Row label, left-aligned in a 28-character column.
        a: Stats dict for the baseline run.
        b: Stats dict for the validation run.
        suffix: Text appended verbatim to the end of the row.
    """
    fields = ["n_total", "n_playable", "hit", "profit", "staked", "roi"]
    parts = [f"{label:<28}"]
    for field in fields:
        va = a.get(field, "-")
        vb = b.get(field, "-")
        parts.append(f"{field}: {str(va):>8} → {str(vb):>8}")
    print(" " + " | ".join(parts) + suffix)


def _section(title: str, rule_char: str) -> None:
    """Print a section header: a blank line, a rule, the title and a rule."""
    rule = rule_char * _RULE_WIDTH
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)


def _veto_hits(df: pd.DataFrame, veto: str) -> int:
    """Count rows of ``df`` whose ``bb_vetoes`` text contains ``veto`` literally."""
    vetoes = df["bb_vetoes"].fillna("").astype(str)
    # regex=False: veto names are plain identifiers, not patterns.
    return int(vetoes.str.contains(veto, regex=False).sum())


def _verdict(a_s: Dict, b_s: Dict) -> Tuple[str, float]:
    """Return the verdict text and the ROI delta (validation minus baseline)."""
    roi_delta = b_s["roi"] - a_s["roi"]
    if b_s["n_playable"] < _MIN_PLAYABLE_FOR_VERDICT:
        verdict = "TOO FEW BETS — sample insufficient"
    elif roi_delta > _MATERIAL_ROI_DELTA_PP and b_s["roi"] > 0:
        verdict = "✅ FILTERS WORK — ROI improved AND positive"
    elif roi_delta > _MATERIAL_ROI_DELTA_PP:
        verdict = "🟡 PARTIAL — ROI improved but still negative"
    elif roi_delta > 0:
        verdict = "🟡 SLIGHT IMPROVEMENT"
    elif roi_delta < -_MATERIAL_ROI_DELTA_PP:
        verdict = "❌ OVERFITTING — validation ROI collapsed"
    else:
        verdict = "❌ NO MATERIAL CHANGE"
    return verdict, roi_delta


def main():
    """Compare a baseline and a validation backtest CSV and print a report.

    With two command-line arguments they are used as the baseline and the
    validation path, in that order. With no arguments the two most recently
    modified ``diagnostic_backtest_*.csv`` files in ``REPORTS_DIR`` are used,
    the newest one as validation. Any other argument count prints a usage
    line and returns, rather than silently ignoring what was passed.
    """
    args = sys.argv[1:]
    if len(args) == 2:
        a_path, b_path = args
    elif not args:
        files = sorted(
            glob.glob(os.path.join(REPORTS_DIR, "diagnostic_backtest_*.csv")),
            key=os.path.getmtime,
            reverse=True,
        )
        if len(files) < 2:
            print("Need at least 2 backtest CSVs in reports/. Pass paths manually.")
            return
        b_path, a_path = files[0], files[1]  # newest first as "validation"
    else:
        print("Usage: python scripts/compare_backtests.py [<baseline.csv> <validation.csv>]")
        return

    print(f"Baseline A: {os.path.basename(a_path)}")
    print(f"Validation B: {os.path.basename(b_path)}")

    a = load(a_path)
    b = load(b_path)

    # Overall figures are reused for the verdict at the end.
    a_s = stats(a)
    b_s = stats(b)

    _section("OVERALL", "=")
    line("ALL", a_s, b_s)

    _section("PER MARKET", "─")
    markets = sorted(set(a["market"].dropna().unique()) | set(b["market"].dropna().unique()))
    for m in markets:
        line(f"market={m}", stats(a, a["market"] == m), stats(b, b["market"] == m))

    # New veto family check — how often does each veto name appear in each run?
    _section("NEW VETO IMPACT (look for new veto names in betting_brain.vetoes)", "─")
    for veto in _NEW_VETOES:
        a_hits = _veto_hits(a, veto)
        b_hits = _veto_hits(b, veto)
        print(f" {veto:<45} A={a_hits:>4} B={b_hits:>4}")

    # BTTS mute check — count BTTS rows still flagged playable in each run.
    _section("BTTS MUTE CHECK — should be ~0 playable in validation", "─")
    a_btts_play = ((a["market"] == "BTTS") & a["playable"].eq(True)).sum()
    b_btts_play = ((b["market"] == "BTTS") & b["playable"].eq(True)).sum()
    print(f" BTTS playable bets: A={a_btts_play} → B={b_btts_play} "
          f"(should be 0 in B if MUTE works)")

    # Verdict
    rule = "=" * _RULE_WIDTH
    verdict, roi_delta = _verdict(a_s, b_s)
    print(f"\n{rule}")
    print(f" VERDICT: {verdict}")
    print(f" ROI: {a_s['roi']}% → {b_s['roi']}% (Δ {roi_delta:+.2f}pp)")
    print(rule)


if __name__ == "__main__":
    main()