Add backtest pipeline, betting_brain filters, score coherence + social v3

betting_brain.py:
- HARD_MIN_SAMPLES=50 floor for calibrator bypass
- Hard vetoes when ev_edge < 0 or ev_edge >= 0.20
- BTTS muted (grid search found no profitable config)
- Per-market optimal envelopes (MS, OU25)
- Score coherence filter: main_pick must agree with score prediction
- HTFT reversal cross-check for MS picks

feature_builder.py / data_loader.py:
- Real home/away_position from data (was hardcoded 10)
- Cup detection wired into UpsetEngine
- _estimate_league_position with 300-day season filter

New scripts:
- diagnostic_backtest.py: per-bet diagnostic backtest with loss patterns
- optimize_filters.py: grid search per-market optimal thresholds
- analyze_backtest_csv.py: root-cause hypothesis testing on CSV
- compare_backtests.py: side-by-side validation with verdict
- test_score_coherence.py: smoke test for coherence filter (20/20 pass)

Reports:
- diagnostic_backtest_20260525_024437 (50-match smoke)
- diagnostic_backtest_20260525_035649 (1000-match in-sample)
- filter_optimization_patch.json (grid search winners per market)

Social poster v3:
- satori + resvg HTML/CSS rendering pipeline
- Twemoji football/basketball + flag SVGs
- caption SEO: 12 curated hashtags per post
- image SEO: descriptive filenames + .json metadata sidecar
- /health, /preview-png, /run-now endpoints

Docs:
- mds/SESSION_HANDOFF.md: full session state for cross-machine continuity
- mds/SOCIAL_POSTER_SETUP.md: API keys + test commands

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 20:43:28 +03:00
parent b619c2454a
commit 988ee2f50d
36 changed files with 5268 additions and 46 deletions
+134
View File
@@ -0,0 +1,134 @@
"""
Compare two diagnostic_backtest CSV outputs side-by-side.
Used to validate that a filter change actually improved ROI vs the
baseline run — and to detect overfitting (in-sample success but
out-of-sample collapse).
Usage:
python scripts/compare_backtests.py <baseline.csv> <validation.csv>
python scripts/compare_backtests.py (auto-picks 2 most recent CSVs)
"""
import sys, os, glob
import pandas as pd
from typing import Dict
# The "reports" folder located one level above this script's own directory;
# main() searches it for diagnostic_backtest_*.csv files when no paths are given.
REPORTS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "reports")
def load(path: str) -> pd.DataFrame:
    """Read a backtest CSV and add a normalised ``won_bool`` column.

    The ``won`` column may hold real booleans, the strings ``"True"`` /
    ``"False"`` or the integers 1 / 0 depending on how pandas parsed the
    file. Each of those is translated to a boolean; any other value
    (including a missing outcome) becomes NaN in ``won_bool``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed frame with the extra ``won_bool`` column.
    """
    outcome_lookup = {
        True: True, False: False, "True": True, "False": False, 1: True, 0: False,
    }
    frame = pd.read_csv(path)
    frame["won_bool"] = frame["won"].map(outcome_lookup)
    return frame
def stats(df: pd.DataFrame, mask=None) -> Dict:
    """Summarise betting performance for a backtest frame.

    Only rows flagged ``playable`` whose outcome in ``won_bool`` is known
    (not NaN) are counted as bets.

    Args:
        df: Frame providing the ``playable``, ``won_bool``, ``unit_profit``
            and ``stake_units`` columns.
        mask: Optional boolean indexer applied to ``df`` before anything is
            counted (for example ``df["market"] == "MS"``).

    Returns:
        A dict with the keys ``n_total``, ``n_playable``, ``wins``,
        ``losses``, ``hit`` (win rate in percent), ``profit``, ``staked``
        and ``roi`` (percent of stake). Every key is always present; when
        there are no settled playable bets all figures except ``n_total``
        are zero.
    """
    if mask is not None:
        df = df[mask]

    # Element-wise comparison on a Series, so `== True` is intentional here.
    settled = df[(df["playable"] == True) & df["won_bool"].notna()]  # noqa: E712
    n_total = int(len(df))
    n_bets = int(len(settled))

    if n_bets == 0:
        # Same schema as the populated result so callers can index any key.
        return {"n_total": n_total, "n_playable": 0, "wins": 0, "losses": 0,
                "hit": 0, "profit": 0, "staked": 0, "roi": 0}

    # Convert numpy scalars to plain Python numbers for clean printing/serialising.
    wins = int(settled["won_bool"].sum())
    profit = float(settled["unit_profit"].sum())
    staked = float(settled["stake_units"].sum())

    return {
        "n_total": n_total,
        "n_playable": n_bets,
        "wins": wins,
        "losses": n_bets - wins,
        "hit": round(100.0 * wins / n_bets, 2),
        "profit": round(profit, 2),
        "staked": round(staked, 2),
        # Guard against a zero total stake to avoid dividing by zero.
        "roi": round(100.0 * profit / staked, 2) if staked else 0,
    }
def line(label: str, a: Dict, b: Dict, suffix: str = ""):
    """Print one comparison row for two stats dicts.

    The row starts with ``label`` (left-aligned, 28 wide) followed by one
    cell per metric showing the value from ``a`` and then the value from
    ``b``, each right-aligned in 8 characters. A metric missing from a
    dict is shown as ``-``.

    Args:
        label: Row caption.
        a: Stats for the baseline run.
        b: Stats for the validation run.
        suffix: Text appended verbatim to the end of the row.
    """
    columns = ("n_total", "n_playable", "hit", "profit", "staked", "roi")

    def _cell(key: str) -> str:
        left = str(a.get(key, "-"))
        right = str(b.get(key, "-"))
        return f"{key}: {left:>8}{right:>8}"

    row = " | ".join([f"{label:<28}", *map(_cell, columns)])
    print(" " + row + suffix)
def _print_banner(title: str, rule: str = "-") -> None:
    """Print ``title`` framed above and below by a 100-character rule."""
    bar = rule * 100
    print(f"\n{bar}")
    print(title)
    print(bar)


def main():
    """Compare a baseline and a validation backtest CSV and print a verdict.

    With exactly two command-line arguments they are used as the baseline
    and validation paths, in that order. With any other argument count the
    two most recently modified ``diagnostic_backtest_*.csv`` files in
    ``REPORTS_DIR`` are used, the newer one acting as validation.

    The report prints overall and per-market figures, how often each new
    veto name occurs in ``bb_vetoes``, the number of playable BTTS bets,
    and a verdict based on the ROI difference between the two runs.
    """
    if len(sys.argv) == 3:
        a_path, b_path = sys.argv[1], sys.argv[2]
    else:
        files = sorted(glob.glob(os.path.join(REPORTS_DIR, "diagnostic_backtest_*.csv")),
                       key=os.path.getmtime, reverse=True)
        if len(files) < 2:
            print("Need at least 2 backtest CSVs in reports/. Pass paths manually.")
            return
        b_path, a_path = files[0], files[1]  # newest first as "validation"

    print(f"Baseline A: {os.path.basename(a_path)}")
    print(f"Validation B: {os.path.basename(b_path)}")
    a = load(a_path)
    b = load(b_path)

    # Overall figures are needed for both the first table and the verdict.
    a_s = stats(a)
    b_s = stats(b)

    _print_banner(" OVERALL", rule="=")
    line("ALL", a_s, b_s)

    _print_banner(" PER MARKET")
    markets = sorted(set(a["market"].dropna().unique()) | set(b["market"].dropna().unique()))
    for m in markets:
        line(f"market={m}",
             stats(a, a["market"] == m),
             stats(b, b["market"] == m))

    # New veto family check — did MUTED_MARKETS actually mute?
    _print_banner(" NEW VETO IMPACT (look for new veto names in betting_brain.vetoes)")
    new_vetoes = ["market_muted_by_backtest", "negative_ev_edge", "ev_edge_too_high_trap",
                  "outside_envelope_edge_low", "outside_envelope_edge_high",
                  "outside_envelope_odds_low", "outside_envelope_v27_must_agree"]
    # Prepare the text columns once instead of on every loop iteration.
    a_vetoes = a["bb_vetoes"].fillna("").astype(str)
    b_vetoes = b["bb_vetoes"].fillna("").astype(str)
    for veto in new_vetoes:
        # Veto names are literal substrings, not regular expressions.
        a_hits = a_vetoes.str.contains(veto, regex=False).sum()
        b_hits = b_vetoes.str.contains(veto, regex=False).sum()
        print(f" {veto:<45} A={a_hits:>4} B={b_hits:>4}")

    # BTTS mute check — count BTTS rows still flagged playable in each run.
    _print_banner(" BTTS MUTE CHECK — should be ~0 playable in validation")
    a_btts_play = ((a["market"] == "BTTS") & (a["playable"] == True)).sum()  # noqa: E712
    b_btts_play = ((b["market"] == "BTTS") & (b["playable"] == True)).sum()  # noqa: E712
    print(f" BTTS playable bets: A={a_btts_play} → B={b_btts_play} "
          f"(should be 0 in B if MUTE works)")

    # Verdict
    print(f"\n{'=' * 100}")
    roi_delta = b_s["roi"] - a_s["roi"]
    if b_s["n_playable"] < 20:
        verdict = "TOO FEW BETS — sample insufficient"
    elif roi_delta > 5 and b_s["roi"] > 0:
        verdict = "✅ FILTERS WORK — ROI improved AND positive"
    elif roi_delta > 5:
        verdict = "🟡 PARTIAL — ROI improved but still negative"
    elif roi_delta > 0:
        verdict = "🟡 SLIGHT IMPROVEMENT"
    elif roi_delta < -5:
        verdict = "❌ OVERFITTING — validation ROI collapsed"
    else:
        verdict = "❌ NO MATERIAL CHANGE"
    print(f" VERDICT: {verdict}")
    print(f" ROI: {a_s['roi']}% → {b_s['roi']}% (Δ {roi_delta:+.2f}pp)")
    print(f"{'=' * 100}")
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()