feat(ai): expand training to 68K+ matches, add score model, backfill implied odds

- extract_training_data.py: switch from top_leagues.json (23) to qualified_leagues.json (265)
- update_implied_odds.py: new script to backfill implied odds from real market data
- train_score_model.py: rewrite with v25 102-feature set + temporal split
- single_match_orchestrator.py: integrate ML score model with heuristic fallback
2026-05-05 16:04:00 +03:00
parent 9bb8f39bca
commit 244d8f5366
4 changed files with 626 additions and 173 deletions
@@ -16,6 +16,7 @@ import re
 import time
 import math
 import os
+import pickle
 import pandas as pd
 import numpy as np
 from collections import defaultdict
@@ -258,6 +259,51 @@ class SingleMatchOrchestrator:
        self._v27 = None
        return None

+    def _get_score_model(self) -> Optional[Dict]:
+        """Load XGBoost score prediction model (non-fatal)."""
+        if hasattr(self, "_score_model_cache"):  # set on the first call, including to None after a failed load
+            return self._score_model_cache
+        score_model_path = os.path.join(  # <parent of this file's directory>/models/xgb_score.pkl
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "models", "xgb_score.pkl",
+        )
+        try:
+            if os.path.exists(score_model_path):  # a missing file is not an error; falls through to None
+                with open(score_model_path, "rb") as f:
+                    model_data = pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; the file must be trusted
+                if all(k in model_data for k in ("home_model", "away_model", "ht_home_model", "ht_away_model", "features")):  # accept only a bundle with every required key
+                    self._score_model_cache = model_data
+                    print(f"[SCORE] ✅ Score model loaded ({len(model_data['features'])} features)")
+                    return self._score_model_cache
+        except Exception as e:  # broad on purpose: any load problem ends in the None result below
+            print(f"[SCORE] ⚠ Load failed (non-fatal, using heuristic): {e}")
+        self._score_model_cache = None  # cache the miss so later calls skip the disk lookup; a bundle with missing keys lands here without a message
+        return None
+
+    def _predict_score_with_model(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
+        """Predict FT/HT scores using XGBoost score model."""
+        score_model = self._get_score_model()
+        if score_model is None:  # no usable model bundle: caller is expected to fall back
+            return None
+        try:
+            import pandas as _pd  # function-scope alias; the module also imports pandas as pd at the top
+            model_features = score_model["features"]
+            row = {f: float(features.get(f, 0)) for f in model_features}  # features absent from the input default to 0
+            df = _pd.DataFrame([row])  # single-row frame keyed by the model's feature names
+            ft_home = max(0.0, float(score_model["home_model"].predict(df)[0]))  # negative predictions are floored at 0
+            ft_away = max(0.0, float(score_model["away_model"].predict(df)[0]))
+            ht_home = max(0.0, float(score_model["ht_home_model"].predict(df)[0]))
+            ht_away = max(0.0, float(score_model["ht_away_model"].predict(df)[0]))
+            return {  # all four values rounded to two decimals
+                "ft_home": round(ft_home, 2),
+                "ft_away": round(ft_away, 2),
+                "ht_home": round(ht_home, 2),
+                "ht_away": round(ht_away, 2),
+            }
+        except Exception as e:  # broad on purpose: any prediction failure returns None instead of raising
+            print(f"[SCORE] ⚠ Prediction error (fallback to heuristic): {e}")
+            return None
+
    def _build_v25_features(self, data: MatchData) -> Dict[str, float]:
        """
        Build the single authoritative V25 pre-match feature vector.
@@ -869,27 +915,39 @@ class SingleMatchOrchestrator:
        prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs)
        prediction.handicap_confidence = hcap_top * 100.0

-        base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0)
-        base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0)
-        ms_edge = prediction.ms_home_prob - prediction.ms_away_prob
-        total_target = max(
-            1.4,
-            min(
-                4.8,
-                (float(features.get("league_avg_goals", 2.7)) * 0.55)
-                + ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45)
-                + ((prediction.over_25_prob - prediction.under_25_prob) * 1.15),
-            ),
-        )
-        home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
-        away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
-        scale = total_target / max(home_xg + away_xg, 0.1)
-        prediction.home_xg = round(home_xg * scale, 2)
-        prediction.away_xg = round(away_xg * scale, 2)
-        prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
-
-        prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
-        prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}"
+        # ── Score Prediction: Model-first, heuristic fallback ──────────
+        score_result = self._predict_score_with_model(features)
+        if score_result is not None:
+            # ML model predicted scores
+            prediction.home_xg = score_result["ft_home"]
+            prediction.away_xg = score_result["ft_away"]
+            prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
+            ht_home_xg = score_result["ht_home"]
+            ht_away_xg = score_result["ht_away"]
+            prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
+            prediction.predicted_ht_score = f"{int(round(ht_home_xg))}-{int(round(ht_away_xg))}"
+        else:
+            # Heuristic fallback (original formula)
+            base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0)
+            base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0)
+            ms_edge = prediction.ms_home_prob - prediction.ms_away_prob
+            total_target = max(
+                1.4,
+                min(
+                    4.8,
+                    (float(features.get("league_avg_goals", 2.7)) * 0.55)
+                    + ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45)
+                    + ((prediction.over_25_prob - prediction.under_25_prob) * 1.15),
+                ),
+            )
+            home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
+            away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
+            scale = total_target / max(home_xg + away_xg, 0.1)
+            prediction.home_xg = round(home_xg * scale, 2)
+            prediction.away_xg = round(away_xg * scale, 2)
+            prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
+            prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
+            prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}"
        prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg)

        max_market_conf = max(