Add backtest pipeline, betting_brain filters, score coherence + social v3

betting_brain.py:
- HARD_MIN_SAMPLES=50 floor for calibrator bypass
- ev_edge < 0 + >= 0.20 hard vetoes
- BTTS muted (grid search found no profitable config)
- Per-market optimal envelopes (MS, OU25)
- Score coherence filter: main_pick must agree with score prediction
- HTFT reversal cross-check for MS picks

feature_builder.py / data_loader.py:
- Real home/away_position from data (was hardcoded 10)
- Cup detection wired into UpsetEngine
- _estimate_league_position with 300-day season filter

New scripts:
- diagnostic_backtest.py: per-bet diagnostic backtest with loss patterns
- optimize_filters.py: grid search per-market optimal thresholds
- analyze_backtest_csv.py: root-cause hypothesis testing on CSV
- compare_backtests.py: side-by-side validation with verdict
- test_score_coherence.py: smoke test for coherence filter (20/20 pass)

Reports:
- diagnostic_backtest_20260525_024437 (50-match smoke)
- diagnostic_backtest_20260525_035649 (1000-match in-sample)
- filter_optimization_patch.json (grid search winners per market)

Social poster v3:
- satori + resvg HTML/CSS rendering pipeline
- Twemoji football/basketball + flag SVGs
- caption SEO: 12 curated hashtags per post
- image SEO: descriptive filenames + .json metadata sidecar
- /health, /preview-png, /run-now endpoints

Docs:
- mds/SESSION_HANDOFF.md: full session state for cross-machine continuity
- mds/SOCIAL_POSTER_SETUP.md: API keys + test commands

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 20:43:28 +03:00
parent b619c2454a
commit 988ee2f50d
36 changed files with 5268 additions and 46 deletions
@@ -449,6 +449,12 @@ class DataLoaderMixin:
            return 1.5, 1.2
        return weighted_for / total_weight, weighted_against / total_weight

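+    # Approximate European season window — Eredivisie/PL/La Liga start late
+    # July / mid-August and end in May. A 300-day lookback covers most
+    # competitions' current season and drops "career points" accumulated in
+    # older seasons. Note this is a rolling window anchored to the match date,
+    # not a true season boundary: for matches early in a season it still
+    # reaches back into the tail of the previous season.
+    # When a proper seasons table lands this should query season boundaries.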
+    _SEASON_LOOKBACK_MS = 300 * 24 * 60 * 60 * 1000
+
    def _estimate_league_position(
        self,
        cur: RealDictCursor,
@@ -458,6 +464,7 @@ class DataLoaderMixin:
    ) -> int:
        if not team_id or not league_id:
            return 10
+        season_start_ms = before_date_ms - self._SEASON_LOOKBACK_MS
        try:
            cur.execute(
                """
@@ -478,6 +485,7 @@ class DataLoaderMixin:
                      AND m.score_home IS NOT NULL
                      AND m.score_away IS NOT NULL
                      AND m.mst_utc < %s
+                      AND m.mst_utc >= %s
                    UNION ALL
                    SELECT
                        m.away_team_id AS team_id,
@@ -492,11 +500,15 @@ class DataLoaderMixin:
                      AND m.score_home IS NOT NULL
                      AND m.score_away IS NOT NULL
                      AND m.mst_utc < %s
+                      AND m.mst_utc >= %s
                ) tm
                GROUP BY tm.team_id
                ORDER BY points DESC
                """,
-                (league_id, before_date_ms, league_id, before_date_ms),
+                (
+                    league_id, before_date_ms, season_start_ms,
+                    league_id, before_date_ms, season_start_ms,
+                ),
            )
            rows = cur.fetchall()
            if not rows:
@@ -225,20 +225,43 @@ class FeatureBuilderMixin:
        if enrichment_failures:
            print(f"⚠️ Enrichment partial failures for {data.match_id}: {', '.join(enrichment_failures)}")

+        # ── Cup game detection (used by upset engine + elo dampening below) ──
+        _league_name_lower = (getattr(data, 'league_name', '') or '').lower()
+        _cup_keywords = ("kupa", "cup", "coupe", "copa", "coppa", "pokal",
+                         "trophy", "shield", "ziraat", "süper kupa", "super cup",
+                         "beker", "taça", "taca")
+        _is_cup_match = any(kw in _league_name_lower for kw in _cup_keywords)
+
+        # ── League size hint: top European leagues 18-20 teams, lower 16-24 ──
+        # We don't have a per-league team count, so fall back to 20 (standard).
+        # When standings infra lands this should pull from seasons table.
+        _league_total_teams = 20
+
        # Upset engine features
        upset_atmosphere, upset_motivation, upset_fatigue = 0.0, 0.0, 0.0
        try:
            upset_engine = get_upset_engine()
+            # Use the real position estimates from data_loader; fall back to mid-
+            # table (10) only when the loader couldn't compute one. Hardcoding 10
+            # for every team made motivation_score collapse to 0 for everyone.
+            _home_pos = getattr(data, 'home_position', None)
+            _away_pos = getattr(data, 'away_position', None)
+            if _home_pos is None or _home_pos <= 0:
+                _home_pos = 10
+            if _away_pos is None or _away_pos <= 0:
+                _away_pos = 10
            upset_feats = upset_engine.get_features(
                home_team_name=getattr(data, 'home_team_name', '') or '',
                home_team_id=data.home_team_id,
                away_team_name=getattr(data, 'away_team_name', '') or '',
                league_name=getattr(data, 'league_name', '') or '',
-                home_position=10,
-                away_position=10,
+                home_position=_home_pos,
+                away_position=_away_pos,
                match_date_ms=data.match_date_ms,
+                is_cup_match=_is_cup_match,
                home_days_rest=int(home_rest),
                away_days_rest=int(away_rest),
+                total_teams=_league_total_teams,
            )
            upset_atmosphere = upset_feats.get('upset_atmosphere', 0.0)
            upset_motivation = upset_feats.get('upset_motivation', 0.0)
@@ -276,15 +299,10 @@ class FeatureBuilderMixin:
        is_season_start = 1.0 if match_month in (7, 8, 9) else 0.0
        is_season_end = 1.0 if match_month in (5, 6) else 0.0

-        # ── Cup game detection: dampen home advantage in feature space ──
-        _league_name = (getattr(data, 'league_name', '') or '').lower()
-        _cup_keywords = ("kupa", "cup", "coupe", "copa", "coppa", "pokal",
-                         "trophy", "shield", "ziraat", "süper kupa", "super cup")
-        _is_cup = any(kw in _league_name for kw in _cup_keywords)
-
        # ── Derived / Interaction features (V27) ──
        # Cup games: the home-vs-away ELO gap is scaled by 0.70, i.e. ~30% weaker
        # in either direction (rotation, lower motivation)
-        elo_diff = (home_elo - away_elo) * (0.70 if _is_cup else 1.0)
+        # Uses _is_cup_match computed earlier (before upset engine call).
+        elo_diff = (home_elo - away_elo) * (0.70 if _is_cup_match else 1.0)
        form_elo_diff = home_form_elo_val - away_form_elo_val
        attack_vs_defense_home = data.home_goals_avg - data.away_conceded_avg
        attack_vs_defense_away = data.away_goals_avg - data.home_conceded_avg