"""
Compare two diagnostic_backtest CSV outputs side-by-side.
Used to validate that a filter change actually improved ROI vs the
baseline run — and to detect overfitting (in-sample success but
out-of-sample collapse).

Usage:
  python scripts/compare_backtests.py <baseline.csv> <validation.csv>
  python scripts/compare_backtests.py  (auto-picks 2 most recent CSVs)
"""

import sys, os, glob
import pandas as pd
from typing import Dict

# Backtest CSVs live in "<parent of this script's directory>/reports".
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REPORTS_DIR = os.path.join(os.path.dirname(_SCRIPT_DIR), "reports")


def load(path: str) -> pd.DataFrame:
    """Read a backtest CSV and add a normalised ``won_bool`` column.

    The ``won`` column is mapped to ``True``/``False``; any value that is not
    a recognised true/false token (including blanks) becomes NaN in
    ``won_bool``.

    Args:
        path: Path of the CSV file to read. It must contain a ``won`` column.

    Returns:
        The loaded DataFrame with the extra ``won_bool`` column.
    """
    df = pd.read_csv(path)
    # Go through lowercase text so every spelling of the same outcome is
    # recognised regardless of the dtype pandas inferred for the column:
    # bools, "True"/"true"/"TRUE", ints 1/0, floats 1.0/0.0, or "1"/"0".
    outcome_tokens = {
        "true": True, "1": True, "1.0": True,
        "false": False, "0": False, "0.0": False,
    }
    normalised = df["won"].astype(str).str.strip().str.lower()
    # Unrecognised tokens (e.g. the "nan" text of a blank cell) are not in the
    # mapping and therefore come out as NaN.
    df["won_bool"] = normalised.map(outcome_tokens)
    return df


def stats(df: pd.DataFrame, mask=None) -> Dict:
    """Summarise betting results for a (optionally filtered) backtest frame.

    Only rows where ``playable`` is True and ``won_bool`` is not NaN count as
    playable bets.

    Args:
        df: Frame with ``playable``, ``won_bool``, ``unit_profit`` and
            ``stake_units`` columns.
        mask: Optional boolean mask applied to ``df`` before computing.

    Returns:
        Dict with the keys ``n_total``, ``n_playable``, ``wins``, ``losses``,
        ``hit`` (win percentage), ``profit``, ``staked`` and ``roi``
        (percentage). The same keys are present when there are no playable
        rows, in which case every value except ``n_total`` is 0. Values are
        plain Python ``int``/``float`` rather than numpy scalars.
    """
    if mask is not None:
        df = df[mask]
    # Element-wise comparison on a pandas column; `is True` would not work.
    playable = df[(df["playable"] == True) & (df["won_bool"].notna())]  # noqa: E712
    n_total = int(len(df))
    n_playable = int(len(playable))
    if n_playable == 0:
        # Same key set as the normal path so callers can index any field.
        return {"n_total": n_total, "n_playable": 0, "wins": 0, "losses": 0,
                "hit": 0, "profit": 0, "staked": 0, "roi": 0}
    wins = int(playable["won_bool"].sum())
    profit = float(playable["unit_profit"].sum())
    staked = float(playable["stake_units"].sum())
    return {
        "n_total": n_total,
        "n_playable": n_playable,
        "wins": wins,
        "losses": n_playable - wins,
        "hit": round(100.0 * wins / n_playable, 2),
        "profit": round(profit, 2),
        "staked": round(staked, 2),
        # Guard against division by zero when nothing was staked.
        "roi": round(100.0 * profit / staked, 2) if staked else 0,
    }


def line(label: str, a: Dict, b: Dict, suffix: str = ""):
    """Print one comparison row: each stat shown as ``A-value → B-value``.

    Args:
        label: Row title, left-aligned and padded to 28 characters.
        a: Stats dict for the baseline side.
        b: Stats dict for the validation side.
        suffix: Text appended verbatim at the end of the row.

    A stat missing from either dict is rendered as "-".
    """
    columns = ("n_total", "n_playable", "hit", "profit", "staked", "roi")
    cells = [
        "{}: {} → {}".format(
            key,
            str(a.get(key, "-")).rjust(8),
            str(b.get(key, "-")).rjust(8),
        )
        for key in columns
    ]
    row = " | ".join([label.ljust(28), *cells])
    print("  " + row + suffix)


def _verdict(a_s: Dict, b_s: Dict) -> str:
    """Classify validation stats ``b_s`` against baseline stats ``a_s``.

    Uses the ``roi`` of both dicts and the ``n_playable`` of ``b_s``. The
    checks run in order, so the sample-size guard wins over any ROI outcome.
    """
    roi_delta = b_s["roi"] - a_s["roi"]
    if b_s["n_playable"] < 20:
        return "TOO FEW BETS — sample insufficient"
    if roi_delta > 5 and b_s["roi"] > 0:
        return "✅ FILTERS WORK — ROI improved AND positive"
    if roi_delta > 5:
        return "🟡 PARTIAL — ROI improved but still negative"
    if roi_delta > 0:
        return "🟡 SLIGHT IMPROVEMENT"
    if roi_delta < -5:
        return "❌ OVERFITTING — validation ROI collapsed"
    return "❌ NO MATERIAL CHANGE"


def main():
    """Print a side-by-side comparison of a baseline and a validation backtest.

    With two command-line arguments they are used as the baseline and
    validation CSV paths, in that order. With no arguments the two most
    recently modified ``diagnostic_backtest_*.csv`` files in ``REPORTS_DIR``
    are used, the newest one as validation. Any other argument count prints a
    usage message and returns, instead of silently ignoring what was passed.

    The report covers overall stats, per-market stats, hit counts for the new
    veto names in ``bb_vetoes``, the number of playable BTTS bets, and a final
    verdict based on the ROI difference.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        a_path, b_path = cli_args
    elif not cli_args:
        files = sorted(glob.glob(os.path.join(REPORTS_DIR, "diagnostic_backtest_*.csv")),
                       key=os.path.getmtime, reverse=True)
        if len(files) < 2:
            print("Need at least 2 backtest CSVs in reports/. Pass paths manually.")
            return
        b_path, a_path = files[0], files[1]  # newest first as "validation"
    else:
        print("Usage: python scripts/compare_backtests.py [<baseline.csv> <validation.csv>]")
        return

    print(f"Baseline    A: {os.path.basename(a_path)}")
    print(f"Validation  B: {os.path.basename(b_path)}")

    a = load(a_path)
    b = load(b_path)

    # Overall stats feed both the OVERALL row and the verdict; compute once.
    a_s = stats(a)
    b_s = stats(b)

    heavy_rule = "=" * 100
    light_rule = "─" * 100

    print(f"\n{heavy_rule}")
    print("  OVERALL")
    print(heavy_rule)
    line("ALL", a_s, b_s)

    print(f"\n{light_rule}")
    print("  PER MARKET")
    print(light_rule)
    # Union of both files so a market present in only one side still shows up.
    markets = sorted(set(a["market"].dropna().unique()) | set(b["market"].dropna().unique()))
    for m in markets:
        line(f"market={m}",
             stats(a, a["market"] == m),
             stats(b, b["market"] == m))

    # New veto family check — did MUTED_MARKETS actually mute?
    print(f"\n{light_rule}")
    print("  NEW VETO IMPACT (look for new veto names in betting_brain.vetoes)")
    print(light_rule)
    new_vetoes = ["market_muted_by_backtest", "negative_ev_edge", "ev_edge_too_high_trap",
                  "outside_envelope_edge_low", "outside_envelope_edge_high",
                  "outside_envelope_odds_low", "outside_envelope_v27_must_agree"]
    # Coerce to text once, outside the loop; astype(str) keeps the .str
    # accessor usable even when the column was not inferred as strings.
    a_vetoes = a["bb_vetoes"].fillna("").astype(str)
    b_vetoes = b["bb_vetoes"].fillna("").astype(str)
    for veto in new_vetoes:
        # Veto names are literal substrings, not regular expressions.
        a_hits = int(a_vetoes.str.contains(veto, regex=False).sum())
        b_hits = int(b_vetoes.str.contains(veto, regex=False).sum())
        print(f"  {veto:<45}  A={a_hits:>4}  B={b_hits:>4}")

    # BTTS mute check — count BTTS rows still flagged playable in each file.
    print(f"\n{light_rule}")
    print("  BTTS MUTE CHECK — should be ~0 playable in validation")
    print(light_rule)
    a_btts_play = ((a["market"] == "BTTS") & (a["playable"] == True)).sum()  # noqa: E712
    b_btts_play = ((b["market"] == "BTTS") & (b["playable"] == True)).sum()  # noqa: E712
    print(f"  BTTS playable bets:  A={a_btts_play}  →  B={b_btts_play}  "
          f"(should be 0 in B if MUTE works)")

    # Verdict
    print(f"\n{heavy_rule}")
    roi_delta = b_s["roi"] - a_s["roi"]
    verdict = _verdict(a_s, b_s)
    print(f"  VERDICT: {verdict}")
    print(f"  ROI: {a_s['roi']}% → {b_s['roi']}% (Δ {roi_delta:+.2f}pp)")
    print(heavy_rule)


# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()