gg

2026-05-10 22:52:05 +03:00
parent c525b12dfd
commit f3362f266c
3 changed files with 922 additions and 32 deletions
@@ -14,6 +14,7 @@ import json
 import csv
 import math
 import time
+import bisect
 from datetime import datetime
 from collections import defaultdict

@@ -119,6 +120,14 @@ FEATURE_COLS = [
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
+
+    # Player-Level Features (12)
+    "home_lineup_goals_per90", "away_lineup_goals_per90",
+    "home_lineup_assists_per90", "away_lineup_assists_per90",
+    "home_squad_continuity", "away_squad_continuity",
+    "home_top_scorer_form", "away_top_scorer_form",
+    "home_avg_player_exp", "away_avg_player_exp",
+    "home_goals_diversity", "away_goals_diversity",
    
    # Labels
    "score_home", "score_away", "total_goals",
@@ -336,7 +345,7 @@ class BatchDataLoader:
            self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))

    def _load_squad_data(self):
-        """Bulk load squad participation + player events for squad features."""
+        """Bulk load squad participation + player events + player career for squad features."""
        ph = ",".join(["%s"] * len(self.top_league_ids))

        # 1) Participation: starting XI count + position distribution per (match, team)
@@ -429,9 +438,90 @@ class BatchDataLoader:
        for m in self.matches:
            match_mst[m[0]] = m[7]  # m[0]=id, m[7]=mst_utc

-        # 6) Build combined cache — NO DATA LEAKAGE
-        # goals_form: avg goals from last 5 matches BEFORE this match (not this match!)
-        # squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists
+        # ─── NEW: Player Career Stats (prefix sums + bisect for O(log n) temporal lookup) ───
+        # 6a) Goals per player per match date
+        self.cur.execute(f"""
+            SELECT mpe.player_id, m.mst_utc,
+                   SUM(CASE WHEN mpe.event_type = 'goal'
+                            AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
+                       THEN 1 ELSE 0 END) AS goals
+            FROM match_player_events mpe
+            JOIN matches m ON mpe.match_id = m.id
+            WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
+            GROUP BY mpe.player_id, m.mst_utc
+        """, self.top_league_ids)
+
+        player_goals_raw = defaultdict(dict)
+        for pid, mst, goals in self.cur.fetchall():
+            player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0)
+
+        # 6b) Assists per player per match date
+        self.cur.execute(f"""
+            SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists
+            FROM match_player_events mpe
+            JOIN matches m ON mpe.match_id = m.id
+            WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
+              AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
+            GROUP BY mpe.assist_player_id, m.mst_utc
+        """, self.top_league_ids)
+
+        player_assists_raw = defaultdict(dict)
+        for pid, mst, assists in self.cur.fetchall():
+            player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0)
+
+        # 6c) Player participation dates (starts only)
+        self.cur.execute(f"""
+            SELECT mpp.player_id, m.mst_utc
+            FROM match_player_participation mpp
+            JOIN matches m ON mpp.match_id = m.id
+            WHERE mpp.is_starting = true
+              AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
+            ORDER BY mpp.player_id, m.mst_utc
+        """, self.top_league_ids)
+
+        player_starts_raw = defaultdict(list)
+        for pid, mst in self.cur.fetchall():
+            player_starts_raw[pid].append(mst)
+
+        # 6d) Build prefix sums per player (gp[i]/ap[i] = goals/assists through start i inclusive; only matches the player started are counted)
+        player_career = {}
+        all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys())
+        for pid in all_pids:
+            starts = sorted(set(player_starts_raw.get(pid, [])))
+            if not starts:
+                continue
+            g_map = player_goals_raw.get(pid, {})
+            a_map = player_assists_raw.get(pid, {})
+            cum_g, cum_a = 0, 0
+            goals_pf, assists_pf = [], []
+            for mst in starts:
+                cum_g += g_map.get(mst, 0)
+                cum_a += a_map.get(mst, 0)
+                goals_pf.append(cum_g)
+                assists_pf.append(cum_a)
+            player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf}
+
+        # Free raw dicts
+        del player_goals_raw, player_assists_raw, player_starts_raw
+        print(f"    📊 Player careers built: {len(player_career)} players", flush=True)
+
+        # ─── NEW: Team Lineup History (for squad continuity) ───
+        # 7) Per-team sorted lineups: [(mst, frozenset(player_ids))]
+        team_lineup_map = defaultdict(list)
+        for (mid, tid), pids in starting_players.items():
+            mst = match_mst.get(mid, 0)
+            if mst > 0 and pids:
+                team_lineup_map[tid].append((mst, frozenset(pids)))
+
+        team_lineup_history = {}
+        team_lineup_msts = {}
+        for tid, ll in team_lineup_map.items():
+            ll.sort(key=lambda x: x[0])
+            team_lineup_history[tid] = ll
+            team_lineup_msts[tid] = [x[0] for x in ll]
+        del team_lineup_map
+
+        # ─── 8) Build combined cache — NO DATA LEAKAGE ───
        all_keys = set(participation.keys()) | set(events.keys())
        for key in all_keys:
            mid, tid = key
@@ -443,30 +533,78 @@ class BatchDataLoader:
            kp_total = len(key_players_by_team.get(tid, set()))
            kp_missing = max(0, kp_total - kp_in_starting)

-            # Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!)
+            # Squad quality: composite score — ONLY pre-match info
            squad_quality = (
                part['starting_count'] * 0.3 +
                kp_in_starting * 3.0 +
                part['fwd_count'] * 1.5
            )
-            # Missing impact: how many key players are missing
            missing_impact = min(kp_missing / max(kp_total, 1), 1.0)

            # goals_form: avg goals from last 5 matches BEFORE this match
            current_mst = match_mst.get(mid, 0)
            team_history = self.team_matches.get(tid, [])
            recent_goals = [
-                tm[2]  # team_score
-                for tm in team_history
-                if tm[0] < current_mst  # only matches BEFORE this one
-            ][-5:]  # last 5
+                tm[2] for tm in team_history if tm[0] < current_mst
+            ][-5:]
            goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3

+            # ─── NEW: Player-level aggregation for starting XI ───
+            lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0
+            best_scorer_total, best_scorer_id = 0, None
+            scorers_in_lineup = 0
+
+            for pid in starters:
+                pc = player_career.get(pid)
+                if not pc:
+                    continue
+                idx = bisect.bisect_left(pc['msts'], current_mst)
+                if idx == 0:
+                    continue  # no prior matches for this player
+                prior_starts = idx
+                prior_goals = pc['gp'][idx - 1]
+                prior_assists = pc['ap'][idx - 1]
+                lineup_g90 += prior_goals / prior_starts
+                lineup_a90 += prior_assists / prior_starts
+                total_exp += prior_starts
+                if prior_goals > 0:
+                    scorers_in_lineup += 1
+                if prior_goals > best_scorer_total:
+                    best_scorer_total = prior_goals
+                    best_scorer_id = pid
+
+            n_st = len(starters) or 1
+
+            # Top scorer recent form (goals in last 5 starts)
+            top_scorer_form = 0
+            if best_scorer_id:
+                pc = player_career.get(best_scorer_id)
+                if pc:
+                    idx = bisect.bisect_left(pc['msts'], current_mst)
+                    if idx > 0:
+                        s5 = max(0, idx - 5)
+                        top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0)
+
+            # Squad continuity: share of current starters who also started the team's previous match (0.5 when no earlier lineup exists)
+            squad_continuity = 0.5
+            msts_list = team_lineup_msts.get(tid)
+            if msts_list:
+                li = bisect.bisect_left(msts_list, current_mst)
+                if li > 0:
+                    prev_lineup = team_lineup_history[tid][li - 1][1]
+                    squad_continuity = len(frozenset(starters) & prev_lineup) / n_st
+
            self.squad_cache[key] = {
                'squad_quality': squad_quality,
                'key_players': kp_in_starting,
                'missing_impact': missing_impact,
                'goals_form': round(goals_form, 2),
+                'lineup_goals_per90': round(lineup_g90, 3),
+                'lineup_assists_per90': round(lineup_a90, 3),
+                'squad_continuity': round(squad_continuity, 3),
+                'top_scorer_form': top_scorer_form,
+                'avg_player_exp': round(total_exp / n_st, 1),
+                'goals_diversity': round(scorers_in_lineup / n_st, 3),
            }

    def _load_cards_data(self):
@@ -855,6 +993,20 @@ class FeatureExtractor:
            "away_missing_impact": away_missing_impact,
            "home_goals_form": home_goals_form,
            "away_goals_form": away_goals_form,
+
+            # Player-Level Features
+            "home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0),
+            "away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0),
+            "home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0),
+            "away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0),
+            "home_squad_continuity": home_sq.get('squad_continuity', 0.5),
+            "away_squad_continuity": away_sq.get('squad_continuity', 0.5),
+            "home_top_scorer_form": home_sq.get('top_scorer_form', 0),
+            "away_top_scorer_form": away_sq.get('top_scorer_form', 0),
+            "home_avg_player_exp": home_sq.get('avg_player_exp', 0.0),
+            "away_avg_player_exp": away_sq.get('avg_player_exp', 0.0),
+            "home_goals_diversity": home_sq.get('goals_diversity', 0.0),
+            "away_goals_diversity": away_sq.get('goals_diversity', 0.0),
            
            # Labels
            "score_home": sh,
@@ -23,7 +23,7 @@ import optuna
 from optuna.samplers import TPESampler
 from datetime import datetime
 from sklearn.metrics import accuracy_score, log_loss, classification_report
-from sklearn.calibration import CalibratedClassifierCV
+from sklearn.isotonic import IsotonicRegression
 from sklearn.base import BaseEstimator, ClassifierMixin

 optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -38,7 +38,7 @@ REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
 os.makedirs(MODELS_DIR, exist_ok=True)
 os.makedirs(REPORTS_DIR, exist_ok=True)

-# ─── Feature Columns (83 features, NO target leakage) ───────────────
+# ─── Feature Columns (95 features, NO target leakage) ───────────────
 FEATURES = [
    # ELO (8)
    "home_overall_elo", "away_overall_elo", "elo_diff",
@@ -94,6 +94,13 @@ FEATURES = [
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
+    # Player-Level Features (12)
+    "home_lineup_goals_per90", "away_lineup_goals_per90",
+    "home_lineup_assists_per90", "away_lineup_assists_per90",
+    "home_squad_continuity", "away_squad_continuity",
+    "home_top_scorer_form", "away_top_scorer_form",
+    "home_avg_player_exp", "away_avg_player_exp",
+    "home_goals_diversity", "away_goals_diversity",
 ]

 MARKET_CONFIGS = [
@@ -349,18 +356,34 @@ def train_market(df, target_col, market_name, num_class, n_trials):
    print(f"[OK] LGB final: iter={lgb_model.best_iteration}")

    # ── Phase 4: Isotonic Calibration on cal set ─────────────────
-    print("[CAL] Fitting Isotonic Regression...")
+    print("[CAL] Fitting Isotonic Regression (per-class)...")

-    # XGB calibration
-    xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration)
-    xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
-    xgb_wrapper.fit(X_train, y_train)
-    xgb_calibrated.fit(X_cal, y_cal)
+    # XGB calibration — manual IsotonicRegression per class
+    dcal = xgb.DMatrix(X_cal)
+    xgb_cal_raw = xgb_model.predict(dcal)
+    if len(xgb_cal_raw.shape) == 1:
+        xgb_cal_raw = np.column_stack([1 - xgb_cal_raw, xgb_cal_raw])

-    # LGB calibration — use raw predictions approach
-    lgb_cal_preds = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
-    if len(lgb_cal_preds.shape) == 1:
-        lgb_cal_preds = np.column_stack([1 - lgb_cal_preds, lgb_cal_preds])
+    xgb_iso_calibrators = []
+    for cls_idx in range(num_class):
+        ir = IsotonicRegression(out_of_bounds="clip")
+        y_binary = (y_cal == cls_idx).astype(float)
+        ir.fit(xgb_cal_raw[:, cls_idx], y_binary)
+        xgb_iso_calibrators.append(ir)
+    print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes")
+
+    # LGB calibration — manual IsotonicRegression per class
+    lgb_cal_raw = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
+    if len(lgb_cal_raw.shape) == 1:
+        lgb_cal_raw = np.column_stack([1 - lgb_cal_raw, lgb_cal_raw])
+
+    lgb_iso_calibrators = []
+    for cls_idx in range(num_class):
+        ir = IsotonicRegression(out_of_bounds="clip")
+        y_binary = (y_cal == cls_idx).astype(float)
+        ir.fit(lgb_cal_raw[:, cls_idx], y_binary)
+        lgb_iso_calibrators.append(ir)
+    print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes")

    # ── Phase 5: Evaluate on test set ────────────────────────────
    print("\n[EVAL] Test set evaluation...")
@@ -371,16 +394,26 @@ def train_market(df, target_col, market_name, num_class, n_trials):
    if len(xgb_raw_probs.shape) == 1:
        xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs])

-    # Calibrated XGB
-    xgb_cal_probs = xgb_calibrated.predict_proba(X_test)
+    # Calibrated XGB — apply isotonic per class + renormalize
+    xgb_cal_probs = np.column_stack([
+        xgb_iso_calibrators[i].predict(xgb_raw_probs[:, i]) for i in range(num_class)
+    ])
+    xgb_cal_probs = xgb_cal_probs / xgb_cal_probs.sum(axis=1, keepdims=True)

    # Raw LGB
    lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    if len(lgb_raw_probs.shape) == 1:
        lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs])

-    # Ensemble (raw)
+    # Calibrated LGB — apply isotonic per class + renormalize
+    lgb_cal_probs = np.column_stack([
+        lgb_iso_calibrators[i].predict(lgb_raw_probs[:, i]) for i in range(num_class)
+    ])
+    lgb_cal_probs = lgb_cal_probs / lgb_cal_probs.sum(axis=1, keepdims=True)
+
+    # Ensembles
    raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
+    cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2

    def _eval(probs, label):
        preds = np.argmax(probs, axis=1)
@@ -392,7 +425,9 @@ def train_market(df, target_col, market_name, num_class, n_trials):
    m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
    m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
    m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
+    m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated")
    m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
+    m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated")

    # Classification report for ensemble
    ens_preds = np.argmax(raw_ensemble, axis=1)
@@ -409,11 +444,16 @@ def train_market(df, target_col, market_name, num_class, n_trials):
    lgb_model.save_model(lgb_path)
    print(f"[SAVE] {lgb_path}")

-    # Calibrated model
-    cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl")
-    with open(cal_path, "wb") as f:
-        pickle.dump(xgb_calibrated, f)
-    print(f"[SAVE] {cal_path}")
+    # Isotonic calibrators (XGB + LGB)
+    xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_name.lower()}.pkl")
+    with open(xgb_cal_path, "wb") as f:
+        pickle.dump(xgb_iso_calibrators, f)
+    print(f"[SAVE] {xgb_cal_path}")
+
+    lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_name.lower()}.pkl")
+    with open(lgb_cal_path, "wb") as f:
+        pickle.dump(lgb_iso_calibrators, f)
+    print(f"[SAVE] {lgb_cal_path}")

    return {
        "market": market_name,
@@ -432,7 +472,9 @@ def train_market(df, target_col, market_name, num_class, n_trials):
        "test_xgb_raw": m_xgb_raw,
        "test_xgb_calibrated": m_xgb_cal,
        "test_lgb_raw": m_lgb_raw,
+        "test_lgb_calibrated": m_lgb_cal,
        "test_ensemble_raw": m_ensemble,
+        "test_ensemble_calibrated": m_cal_ensemble,
    }


@@ -495,8 +537,12 @@ def main():
    print("[SUMMARY]")
    print("=" * 60)
    for name, m in all_metrics["markets"].items():
-        ens = m.get("test_ensemble_raw", {})
-        print(f"  {name:12s} | Acc={ens.get('accuracy','?'):>6s} | LL={ens.get('logloss','?'):>6s} | "
+        ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {}))
+        acc = ens.get('accuracy', '?')
+        ll = ens.get('logloss', '?')
+        acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc)
+        ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll)
+        print(f"  {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | "
              f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")

    print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")