Add backtest pipeline, betting_brain filters, score coherence + social v3

betting_brain.py: - HARD_MIN_SAMPLES=50 floor for calibrator bypass - ev_edge < 0 + >= 0.20 hard vetoes - BTTS muted (grid search found no profitable config) - Per-market optimal envelopes (MS, OU25) - Score coherence filter: main_pick must agree with score prediction - HTFT reversal cross-check for MS picks feature_builder.py / data_loader.py: - Real home/away_position from data (was hardcoded 10) - Cup detection wired into UpsetEngine - _estimate_league_position with 300-day season filter New scripts: - diagnostic_backtest.py: per-bet diagnostic backtest with loss patterns - optimize_filters.py: grid search per-market optimal thresholds - analyze_backtest_csv.py: root-cause hypothesis testing on CSV - compare_backtests.py: side-by-side validation with verdict - test_score_coherence.py: smoke test for coherence filter (20/20 pass) Reports: - diagnostic_backtest_20260525_024437 (50-match smoke) - diagnostic_backtest_20260525_035649 (1000-match in-sample) - filter_optimization_patch.json (grid search winners per market) Social poster v3: - satori + resvg HTML/CSS rendering pipeline - Twemoji football/basketball + flag SVGs - caption SEO: 12 curated hashtags per post - image SEO: descriptive filenames + .json metadata sidecar - /health, /preview-png, /run-now endpoints Docs: - mds/SESSION_HANDOFF.md: full session state for cross-machine continuity - mds/SOCIAL_POSTER_SETUP.md: API keys + test commands Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 20:43:28 +03:00
parent b619c2454a
commit 988ee2f50d
36 changed files with 5268 additions and 46 deletions
@@ -0,0 +1,134 @@
+"""
+Compare two diagnostic_backtest CSV outputs side-by-side.
+Used to validate that a filter change actually improved ROI vs the
+baseline run — and to detect overfitting (in-sample success but
+out-of-sample collapse).
+
+Usage:
+  python scripts/compare_backtests.py <baseline.csv> <validation.csv>
+  python scripts/compare_backtests.py  (auto-picks 2 most recent CSVs)
+"""
+
+import sys, os, glob
+import pandas as pd
+from typing import Dict
+
+# Directory searched for diagnostic_backtest_*.csv files when main() auto-picks
+# its inputs: "reports" under the parent of the directory holding this script.
+REPORTS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "reports")
+
+
+def load(path: str) -> pd.DataFrame:
+    """Read a diagnostic backtest CSV and add a normalised ``won_bool`` column.
+
+    Args:
+        path: Location of the CSV file; it must contain a ``won`` column.
+
+    Returns:
+        The parsed frame with ``won_bool`` added. Values of ``won`` that are
+        not one of the recognised true/false spellings map to NaN.
+    """
+    # Outcomes may arrive as real booleans, as the strings "True"/"False",
+    # or as 1/0; anything else is left missing.
+    outcome_lookup = {
+        True: True, False: False,
+        "True": True, "False": False,
+        1: True, 0: False,
+    }
+    frame = pd.read_csv(path)
+    frame["won_bool"] = frame["won"].map(outcome_lookup)
+    return frame
+
+
+def stats(df: pd.DataFrame, mask=None) -> Dict:
+    """Summarise betting results for a (optionally filtered) backtest frame.
+
+    Only rows where ``playable`` is True and ``won_bool`` is not missing are
+    counted as bets.
+
+    Args:
+        df: Frame with ``playable``, ``won_bool``, ``unit_profit`` and
+            ``stake_units`` columns.
+        mask: Optional boolean indexer applied to ``df`` before summarising.
+
+    Returns:
+        A dict with the keys ``n_total``, ``n_playable``, ``wins``, ``losses``,
+        ``hit`` (win percentage), ``profit``, ``staked`` and ``roi`` (profit as
+        a percentage of stake). Every key is present even when there are no
+        bets, and all values are plain Python numbers.
+    """
+    if mask is not None:
+        df = df[mask]
+    # Element-wise comparison on a pandas column, so `== True` is intentional.
+    settled = df[(df["playable"] == True) & (df["won_bool"].notna())]  # noqa: E712
+    n_total = int(len(df))
+    n_playable = int(len(settled))
+    if n_playable == 0:
+        # Same key set as the normal path so callers can index any field.
+        return {"n_total": n_total, "n_playable": 0, "wins": 0, "losses": 0,
+                "hit": 0, "profit": 0, "staked": 0, "roi": 0}
+    # Coerce numpy scalars to built-in types for clean printing/serialising.
+    wins = int(settled["won_bool"].sum())
+    profit = float(settled["unit_profit"].sum())
+    staked = float(settled["stake_units"].sum())
+    return {
+        "n_total": n_total,
+        "n_playable": n_playable,
+        "wins": wins,
+        "losses": n_playable - wins,
+        "hit": round(100.0 * wins / n_playable, 2),
+        "profit": round(profit, 2),
+        "staked": round(staked, 2),
+        # Guard against a zero total stake to avoid dividing by zero.
+        "roi": round(100.0 * profit / staked, 2) if staked else 0,
+    }
+
+
+def line(label: str, a: Dict, b: Dict, suffix: str = ""):
+    """Print one comparison row showing each metric as ``A → B``.
+
+    Args:
+        label: Row title, left-aligned and padded to 28 characters.
+        a: Baseline stats dict.
+        b: Validation stats dict.
+        suffix: Text appended verbatim to the end of the row.
+
+    Metrics missing from either dict are shown as ``-``.
+    """
+    columns = ("n_total", "n_playable", "hit", "profit", "staked", "roi")
+    cells = [f"{label:<28}"]
+    cells.extend(
+        f"{key}: {str(a.get(key, '-')):>8} → {str(b.get(key, '-')):>8}"
+        for key in columns
+    )
+    print("  " + " | ".join(cells) + suffix)
+
+
+def main():
+    """Compare a baseline and a validation backtest CSV and print a verdict.
+
+    With two command-line paths, the first is the baseline (A) and the second
+    the validation run (B). With no arguments, the two most recently modified
+    ``diagnostic_backtest_*.csv`` files in ``REPORTS_DIR`` are used, the newest
+    as B. Any other argument count prints a usage hint instead of silently
+    ignoring what was passed.
+
+    The report covers overall stats, per-market stats, how often each new veto
+    name appears in ``bb_vetoes``, the number of playable BTTS bets, and a
+    verdict based on the ROI difference between B and A.
+    """
+    argc = len(sys.argv)
+    if argc == 3:
+        a_path, b_path = sys.argv[1], sys.argv[2]
+    elif argc == 1:
+        files = sorted(glob.glob(os.path.join(REPORTS_DIR, "diagnostic_backtest_*.csv")),
+                       key=os.path.getmtime, reverse=True)
+        if len(files) < 2:
+            print("Need at least 2 backtest CSVs in reports/. Pass paths manually.")
+            return
+        b_path, a_path = files[0], files[1]  # newest first as "validation"
+    else:
+        # A lone path (or extra arguments) used to fall through to auto-pick,
+        # comparing files the user did not ask for.
+        print("Usage: python scripts/compare_backtests.py [<baseline.csv> <validation.csv>]")
+        return
+
+    print(f"Baseline    A: {os.path.basename(a_path)}")
+    print(f"Validation  B: {os.path.basename(b_path)}")
+
+    a = load(a_path)
+    b = load(b_path)
+
+    heavy_rule = "=" * 100
+    light_rule = "─" * 100
+
+    # Overall stats are needed for both the first table and the verdict.
+    a_s = stats(a)
+    b_s = stats(b)
+
+    print(f"\n{heavy_rule}")
+    print("  OVERALL")
+    print(heavy_rule)
+    line("ALL", a_s, b_s)
+
+    print(f"\n{light_rule}")
+    print("  PER MARKET")
+    print(light_rule)
+    markets = sorted(set(a["market"].dropna().unique()) | set(b["market"].dropna().unique()))
+    for m in markets:
+        line(f"market={m}",
+             stats(a, a["market"] == m),
+             stats(b, b["market"] == m))
+
+    # New veto family check — did MUTED_MARKETS actually mute?
+    print(f"\n{light_rule}")
+    print("  NEW VETO IMPACT (look for new veto names in betting_brain.vetoes)")
+    print(light_rule)
+    new_vetoes = ["market_muted_by_backtest", "negative_ev_edge", "ev_edge_too_high_trap",
+                  "outside_envelope_edge_low", "outside_envelope_edge_high",
+                  "outside_envelope_odds_low", "outside_envelope_v27_must_agree"]
+    # Fill missing veto cells once instead of on every loop iteration.
+    a_vetoes = a["bb_vetoes"].fillna("")
+    b_vetoes = b["bb_vetoes"].fillna("")
+    for veto in new_vetoes:
+        # Veto names are literal substrings, not regular expressions.
+        a_hits = int(a_vetoes.str.contains(veto, regex=False).sum())
+        b_hits = int(b_vetoes.str.contains(veto, regex=False).sum())
+        print(f"  {veto:<45}  A={a_hits:>4}  B={b_hits:>4}")
+
+    # BTTS mute check — count BTTS rows still flagged playable in each run.
+    print(f"\n{light_rule}")
+    print("  BTTS MUTE CHECK — should be ~0 playable in validation")
+    print(light_rule)
+    a_btts_play = int(((a["market"] == "BTTS") & (a["playable"] == True)).sum())  # noqa: E712
+    b_btts_play = int(((b["market"] == "BTTS") & (b["playable"] == True)).sum())  # noqa: E712
+    print(f"  BTTS playable bets:  A={a_btts_play}  →  B={b_btts_play}  "
+          f"(should be 0 in B if MUTE works)")
+
+    # Verdict: thresholds are in ROI percentage points (B minus A).
+    print(f"\n{heavy_rule}")
+    roi_delta = b_s["roi"] - a_s["roi"]
+    if b_s["n_playable"] < 20:
+        verdict = "TOO FEW BETS — sample insufficient"
+    elif roi_delta > 5 and b_s["roi"] > 0:
+        verdict = "✅ FILTERS WORK — ROI improved AND positive"
+    elif roi_delta > 5:
+        verdict = "🟡 PARTIAL — ROI improved but still negative"
+    elif roi_delta > 0:
+        verdict = "🟡 SLIGHT IMPROVEMENT"
+    elif roi_delta < -5:
+        verdict = "❌ OVERFITTING — validation ROI collapsed"
+    else:
+        verdict = "❌ NO MATERIAL CHANGE"
+    print(f"  VERDICT: {verdict}")
+    print(f"  ROI: {a_s['roi']}% → {b_s['roi']}% (Δ {roi_delta:+.2f}pp)")
+    print(heavy_rule)
+
+
+if __name__ == "__main__":
+    main()