Add backtest pipeline, betting_brain filters, score coherence + social v3

betting_brain.py:
- HARD_MIN_SAMPLES=50 floor for calibrator bypass
- Hard vetoes for ev_edge < 0 and for ev_edge >= 0.20
- BTTS muted (grid search found no profitable config)
- Per-market optimal envelopes (MS, OU25)
- Score coherence filter: main_pick must agree with score prediction
- HTFT reversal cross-check for MS picks

feature_builder.py / data_loader.py:
- Real home/away_position from data (was hardcoded 10)
- Cup detection wired into UpsetEngine
- _estimate_league_position with 300-day season filter

New scripts:
- diagnostic_backtest.py: per-bet diagnostic backtest with loss patterns
- optimize_filters.py: grid search per-market optimal thresholds
- analyze_backtest_csv.py: root-cause hypothesis testing on CSV
- compare_backtests.py: side-by-side validation with verdict
- test_score_coherence.py: smoke test for coherence filter (20/20 pass)

Reports:
- diagnostic_backtest_20260525_024437 (50-match smoke)
- diagnostic_backtest_20260525_035649 (1000-match in-sample)
- filter_optimization_patch.json (grid search winners per market)

Social poster v3:
- satori + resvg HTML/CSS rendering pipeline
- Twemoji football/basketball + flag SVGs
- caption SEO: 12 curated hashtags per post
- image SEO: descriptive filenames + .json metadata sidecar
- /health, /preview-png, /run-now endpoints

Docs:
- mds/SESSION_HANDOFF.md: full session state for cross-machine continuity
- mds/SOCIAL_POSTER_SETUP.md: API keys + test commands

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 20:43:28 +03:00
parent b619c2454a
commit 988ee2f50d
36 changed files with 5268 additions and 46 deletions
+13 -1
View File
@@ -449,6 +449,12 @@ class DataLoaderMixin:
return 1.5, 1.2
return weighted_for / total_weight, weighted_against / total_weight
# Approximate European season window: Eredivisie/PL/La Liga start between
# late July and mid-August and end in May. A 300-day lookback covers most
# competitions while excluding "career points" from previous seasons.
# When a proper seasons table lands, this should query season boundaries.
_SEASON_LOOKBACK_MS = 300 * 24 * 60 * 60 * 1000
def _estimate_league_position(
self,
cur: RealDictCursor,
@@ -458,6 +464,7 @@ class DataLoaderMixin:
) -> int:
if not team_id or not league_id:
return 10
season_start_ms = before_date_ms - self._SEASON_LOOKBACK_MS
try:
cur.execute(
"""
@@ -478,6 +485,7 @@ class DataLoaderMixin:
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
AND m.mst_utc < %s
AND m.mst_utc >= %s
UNION ALL
SELECT
m.away_team_id AS team_id,
@@ -492,11 +500,15 @@ class DataLoaderMixin:
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
AND m.mst_utc < %s
AND m.mst_utc >= %s
) tm
GROUP BY tm.team_id
ORDER BY points DESC
""",
(league_id, before_date_ms, league_id, before_date_ms),
(
league_id, before_date_ms, season_start_ms,
league_id, before_date_ms, season_start_ms,
),
)
rows = cur.fetchall()
if not rows:
@@ -225,20 +225,43 @@ class FeatureBuilderMixin:
if enrichment_failures:
print(f"⚠️ Enrichment partial failures for {data.match_id}: {', '.join(enrichment_failures)}")
# ── Cup game detection (used by upset engine + elo dampening below) ──
_league_name_lower = (getattr(data, 'league_name', '') or '').lower()
_cup_keywords = ("kupa", "cup", "coupe", "copa", "coppa", "pokal",
"trophy", "shield", "ziraat", "süper kupa", "super cup",
"beker", "taça", "taca")
_is_cup_match = any(kw in _league_name_lower for kw in _cup_keywords)
# ── League size hint: top European leagues have 18-20 teams, lower ones 16-24 ──
# We don't have a per-league team count, so fall back to 20 (standard).
# When standings infra lands, this should be pulled from the seasons table.
_league_total_teams = 20
# Upset engine features
upset_atmosphere, upset_motivation, upset_fatigue = 0.0, 0.0, 0.0
try:
upset_engine = get_upset_engine()
# Use the real position estimates from data_loader; fall back to mid-
# table (10) only when the loader couldn't compute one. Hardcoding 10
# for every team made motivation_score collapse to 0 for everyone.
_home_pos = getattr(data, 'home_position', None)
_away_pos = getattr(data, 'away_position', None)
if _home_pos is None or _home_pos <= 0:
_home_pos = 10
if _away_pos is None or _away_pos <= 0:
_away_pos = 10
upset_feats = upset_engine.get_features(
home_team_name=getattr(data, 'home_team_name', '') or '',
home_team_id=data.home_team_id,
away_team_name=getattr(data, 'away_team_name', '') or '',
league_name=getattr(data, 'league_name', '') or '',
home_position=10,
away_position=10,
home_position=_home_pos,
away_position=_away_pos,
match_date_ms=data.match_date_ms,
is_cup_match=_is_cup_match,
home_days_rest=int(home_rest),
away_days_rest=int(away_rest),
total_teams=_league_total_teams,
)
upset_atmosphere = upset_feats.get('upset_atmosphere', 0.0)
upset_motivation = upset_feats.get('upset_motivation', 0.0)
@@ -276,15 +299,10 @@ class FeatureBuilderMixin:
is_season_start = 1.0 if match_month in (7, 8, 9) else 0.0
is_season_end = 1.0 if match_month in (5, 6) else 0.0
# ── Cup game detection: dampen home advantage in feature space ──
_league_name = (getattr(data, 'league_name', '') or '').lower()
_cup_keywords = ("kupa", "cup", "coupe", "copa", "coppa", "pokal",
"trophy", "shield", "ziraat", "süper kupa", "super cup")
_is_cup = any(kw in _league_name for kw in _cup_keywords)
# ── Derived / Interaction features (V27) ──
# Cup games: home ELO advantage is ~30% weaker (rotation, lower motivation)
elo_diff = (home_elo - away_elo) * (0.70 if _is_cup else 1.0)
# Uses _is_cup_match computed earlier (before upset engine call).
elo_diff = (home_elo - away_elo) * (0.70 if _is_cup_match else 1.0)
form_elo_diff = home_form_elo_val - away_form_elo_val
attack_vs_defense_home = data.home_goals_avg - data.away_conceded_avg
attack_vs_defense_away = data.away_goals_avg - data.home_conceded_avg