Add backtest pipeline, betting_brain filters, score coherence + social v3
betting_brain.py: - HARD_MIN_SAMPLES=50 floor for calibrator bypass - ev_edge < 0 + >= 0.20 hard vetoes - BTTS muted (grid search found no profitable config) - Per-market optimal envelopes (MS, OU25) - Score coherence filter: main_pick must agree with score prediction - HTFT reversal cross-check for MS picks feature_builder.py / data_loader.py: - Real home/away_position from data (was hardcoded 10) - Cup detection wired into UpsetEngine - _estimate_league_position with 300-day season filter New scripts: - diagnostic_backtest.py: per-bet diagnostic backtest with loss patterns - optimize_filters.py: grid search per-market optimal thresholds - analyze_backtest_csv.py: root-cause hypothesis testing on CSV - compare_backtests.py: side-by-side validation with verdict - test_score_coherence.py: smoke test for coherence filter (20/20 pass) Reports: - diagnostic_backtest_20260525_024437 (50-match smoke) - diagnostic_backtest_20260525_035649 (1000-match in-sample) - filter_optimization_patch.json (grid search winners per market) Social poster v3: - satori + resvg HTML/CSS rendering pipeline - Twemoji football/basketball + flag SVGs - caption SEO: 12 curated hashtags per post - image SEO: descriptive filenames + .json metadata sidecar - /health, /preview-png, /run-now endpoints Docs: - mds/SESSION_HANDOFF.md: full session state for cross-machine continuity - mds/SOCIAL_POSTER_SETUP.md: API keys + test commands Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Compare two diagnostic_backtest CSV outputs side-by-side.

Used to validate that a filter change actually improved ROI vs the
baseline run — and to detect overfitting (in-sample success but
out-of-sample collapse).

Usage:
    python scripts/compare_backtests.py <baseline.csv> <validation.csv>
    python scripts/compare_backtests.py   (auto-picks the 2 most recent CSVs)
|
||||
"""
|
||||
|
||||
import sys, os, glob
|
||||
import pandas as pd
|
||||
from typing import Dict
|
||||
|
||||
# Backtest reports are looked up in a "reports" folder that sits next to the
# directory containing this script (i.e. one level above the script's folder).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REPORTS_DIR = os.path.join(os.path.dirname(_SCRIPT_DIR), "reports")
|
||||
|
||||
|
||||
def load(path: str) -> pd.DataFrame:
    """Read a backtest CSV and add a normalised ``won_bool`` column.

    Each value of the ``won`` column is translated through a fixed lookup
    (booleans and the strings ``"True"`` / ``"False"``). Values that are not
    in the lookup end up as NaN in ``won_bool``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded frame with the extra ``won_bool`` column.
    """
    # NOTE: 1 and 0 hash equal to True and False, so the last two entries
    # collapse onto the boolean keys; they are kept to mirror the intent.
    outcome_lookup = {
        True: True,
        False: False,
        "True": True,
        "False": False,
        1: True,
        0: False,
    }
    frame = pd.read_csv(path)
    frame["won_bool"] = frame["won"].map(outcome_lookup)
    return frame
|
||||
|
||||
|
||||
def stats(df: pd.DataFrame, mask=None) -> Dict:
    """Summarise betting performance for a (optionally filtered) backtest frame.

    Only rows with ``playable == True`` and a non-null ``won_bool`` count as
    playable bets; every other row contributes to ``n_total`` only.

    Args:
        df: Backtest frame with ``playable``, ``won_bool``, ``unit_profit``
            and ``stake_units`` columns.
        mask: Optional boolean selector applied as ``df[mask]`` before any
            statistic is computed.

    Returns:
        Dict with the keys ``n_total``, ``n_playable``, ``wins``, ``losses``,
        ``hit`` (win percentage), ``profit``, ``staked`` and ``roi``
        (percentage). The same keys are present when there are no playable
        bets, in which case every figure except ``n_total`` is 0.
    """
    if mask is not None:
        df = df[mask]

    settled = df[df["playable"].eq(True) & df["won_bool"].notna()]
    n_total = int(len(df))
    n_playable = int(len(settled))

    if n_playable == 0:
        # Keep the same schema as the populated result so callers can index
        # "wins" / "losses" without a KeyError.
        return {
            "n_total": n_total,
            "n_playable": 0,
            "wins": 0,
            "losses": 0,
            "hit": 0,
            "profit": 0,
            "staked": 0,
            "roi": 0,
        }

    # Convert pandas/numpy scalars to native numbers so the result is a plain,
    # serialisable dict.
    wins = int(settled["won_bool"].sum())
    profit = float(settled["unit_profit"].sum())
    staked = float(settled["stake_units"].sum())

    return {
        "n_total": n_total,
        "n_playable": n_playable,
        "wins": wins,
        "losses": n_playable - wins,
        "hit": round(100.0 * wins / n_playable, 2),
        "profit": round(profit, 2),
        "staked": round(staked, 2),
        # Guard against a zero total stake to avoid dividing by zero.
        "roi": round(100.0 * profit / staked, 2) if staked else 0,
    }
|
||||
|
||||
|
||||
def line(label: str, a: Dict, b: Dict, suffix: str = ""):
    """Print one comparison row showing each statistic as ``A → B``.

    Args:
        label: Row caption, left-aligned in a 28-character column.
        a: Statistics dict for the baseline run.
        b: Statistics dict for the validation run.
        suffix: Text appended verbatim to the end of the row.

    A statistic missing from either dict is shown as ``-``.
    """
    shown_keys = ("n_total", "n_playable", "hit", "profit", "staked", "roi")
    cells = [f"{label:<28}"]
    cells.extend(
        f"{key}: {str(a.get(key, '-')):>8} → {str(b.get(key, '-')):>8}"
        for key in shown_keys
    )
    row = " | ".join(cells)
    print(" " + row + suffix)
|
||||
|
||||
|
||||
def _resolve_csv_paths(argv):
    """Return ``(baseline_path, validation_path)`` or ``None`` if unavailable."""
    if len(argv) == 3:
        return argv[1], argv[2]

    if len(argv) != 1:
        # Previously a wrong argument count was ignored without any notice.
        print(
            "Warning: expected either no arguments or <baseline.csv> "
            "<validation.csv>; ignoring arguments and auto-picking the 2 most "
            "recent CSVs.",
            file=sys.stderr,
        )

    candidates = sorted(
        glob.glob(os.path.join(REPORTS_DIR, "diagnostic_backtest_*.csv")),
        key=os.path.getmtime,
        reverse=True,
    )
    if len(candidates) < 2:
        print("Need at least 2 backtest CSVs in reports/. Pass paths manually.")
        return None
    # Newest file is treated as the validation run, the one before as baseline.
    return candidates[1], candidates[0]


def _count_veto(df, veto):
    """Count rows whose ``bb_vetoes`` text contains ``veto`` (0 if no column)."""
    if "bb_vetoes" not in df.columns:
        return 0
    vetoes = df["bb_vetoes"].fillna("").astype(str)
    # Veto names are literal identifiers, so match them as plain substrings.
    return int(vetoes.str.contains(veto, regex=False).sum())


def _verdict(roi_delta, b_stats):
    """Map the ROI change and the validation stats to a verdict message."""
    if b_stats["n_playable"] < 20:
        return "TOO FEW BETS — sample insufficient"
    if roi_delta > 5 and b_stats["roi"] > 0:
        return "✅ FILTERS WORK — ROI improved AND positive"
    if roi_delta > 5:
        return "🟡 PARTIAL — ROI improved but still negative"
    if roi_delta > 0:
        return "🟡 SLIGHT IMPROVEMENT"
    if roi_delta < -5:
        return "❌ OVERFITTING — validation ROI collapsed"
    return "❌ NO MATERIAL CHANGE"


def main():
    """Print a side-by-side comparison of a baseline and a validation backtest.

    The two CSV paths come from the command line (baseline first, validation
    second). Without arguments the two most recently modified
    ``diagnostic_backtest_*.csv`` files in ``REPORTS_DIR`` are used, the newest
    one as validation. Output sections: overall stats, per-market stats, hit
    counts for the new veto names, the BTTS mute check and a final verdict
    based on the ROI difference.
    """
    paths = _resolve_csv_paths(sys.argv)
    if paths is None:
        return
    a_path, b_path = paths

    print(f"Baseline A: {os.path.basename(a_path)}")
    print(f"Validation B: {os.path.basename(b_path)}")

    a = load(a_path)
    b = load(b_path)

    # Overall stats are needed for both the first table row and the verdict,
    # so compute them once.
    a_s = stats(a)
    b_s = stats(b)

    print(f"\n{'=' * 100}")
    print(" OVERALL")
    print(f"{'=' * 100}")
    line("ALL", a_s, b_s)

    print(f"\n{'─' * 100}")
    print(" PER MARKET")
    print(f"{'─' * 100}")
    markets = sorted(
        set(a["market"].dropna().unique()) | set(b["market"].dropna().unique())
    )
    for m in markets:
        line(f"market={m}", stats(a, a["market"] == m), stats(b, b["market"] == m))

    # New veto family check — did MUTED_MARKETS actually mute?
    print(f"\n{'─' * 100}")
    print(" NEW VETO IMPACT (look for new veto names in betting_brain.vetoes)")
    print(f"{'─' * 100}")
    new_vetoes = [
        "market_muted_by_backtest",
        "negative_ev_edge",
        "ev_edge_too_high_trap",
        "outside_envelope_edge_low",
        "outside_envelope_edge_high",
        "outside_envelope_odds_low",
        "outside_envelope_v27_must_agree",
    ]
    for veto in new_vetoes:
        a_hits = _count_veto(a, veto)
        b_hits = _count_veto(b, veto)
        print(f" {veto:<45} A={a_hits:>4} B={b_hits:>4}")

    # BTTS mute check — count playable BTTS bets in each run.
    print(f"\n{'─' * 100}")
    print(" BTTS MUTE CHECK — should be ~0 playable in validation")
    print(f"{'─' * 100}")
    a_btts_play = ((a["market"] == "BTTS") & (a["playable"] == True)).sum()
    b_btts_play = ((b["market"] == "BTTS") & (b["playable"] == True)).sum()
    print(f" BTTS playable bets: A={a_btts_play} → B={b_btts_play} "
          f"(should be 0 in B if MUTE works)")

    # Verdict
    print(f"\n{'=' * 100}")
    roi_delta = b_s["roi"] - a_s["roi"]
    verdict = _verdict(roi_delta, b_s)
    print(f" VERDICT: {verdict}")
    print(f" ROI: {a_s['roi']}% → {b_s['roi']}% (Δ {roi_delta:+.2f}pp)")
    print(f"{'=' * 100}")
|
||||
|
||||
|
||||
# Run the comparison only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user