diff --git a/ai-engine/reports/training_v25/v25_pro_metrics.json b/ai-engine/reports/training_v25/v25_pro_metrics.json new file mode 100644 index 0000000..9a2973f --- /dev/null +++ b/ai-engine/reports/training_v25/v25_pro_metrics.json @@ -0,0 +1,692 @@ +{ + "trained_at": "2026-05-10 19:48:06", + "trainer": "v25_pro", + "optuna_trials": 50, + "total_features": 114, + "markets": { + "MS": { + "market": "MS", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.022329400652878233, + "subsample": 0.6690795757813364, + "colsample_bytree": 0.5042256538541441, + "min_child_weight": 6, + "gamma": 9.960129417155444e-05, + "reg_lambda": 0.5132295377582388, + "reg_alpha": 6.804503659726287e-08 + }, + "lgb_best_params": { + "max_depth": 4, + "learning_rate": 0.023142410802706542, + "feature_fraction": 0.5728681432360808, + "bagging_fraction": 0.6781774410065095, + "bagging_freq": 2, + "min_child_samples": 26, + "lambda_l1": 3.25216937188593e-05, + "lambda_l2": 4.8081236902660474e-08 + }, + "xgb_best_iteration": 643, + "lgb_best_iteration": 441, + "xgb_optuna_best_logloss": 0.9155, + "lgb_optuna_best_logloss": 0.9146, + "test_xgb_raw": { + "accuracy": 0.5442, + "logloss": 0.943 + }, + "test_xgb_calibrated": { + "accuracy": 0.5404, + "logloss": 0.9438 + }, + "test_lgb_raw": { + "accuracy": 0.5427, + "logloss": 0.943 + }, + "test_lgb_calibrated": { + "accuracy": 0.5417, + "logloss": 0.9447 + }, + "test_ensemble_raw": { + "accuracy": 0.5437, + "logloss": 0.9426 + }, + "test_ensemble_calibrated": { + "accuracy": 0.5418, + "logloss": 0.9435 + } + }, + "OU15": { + "market": "OU15", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 5, + "eta": 0.020779487257177966, + "subsample": 0.8109935286948485, + "colsample_bytree": 0.9525413847213635, + "min_child_weight": 6, + "gamma": 0.35330347775044696, + "reg_lambda": 5.373541021746059e-07, + "reg_alpha": 0.2959430087754284 + }, + "lgb_best_params": { + "max_depth": 6, + "learning_rate": 0.013402310027682367, + "feature_fraction": 0.7404728146233901, + "bagging_fraction": 0.9712026511549247, + "bagging_freq": 6, + "min_child_samples": 39, + "lambda_l1": 0.39893027986899576, + "lambda_l2": 0.0626443611997599 + }, + "xgb_best_iteration": 353, + "lgb_best_iteration": 370, + "xgb_optuna_best_logloss": 0.499, + "lgb_optuna_best_logloss": 0.4989, + "test_xgb_raw": { + "accuracy": 0.7521, + "logloss": 0.5267 + }, + "test_xgb_calibrated": { + "accuracy": 0.7521, + "logloss": 0.5344 + }, + "test_lgb_raw": { + "accuracy": 0.7528, + "logloss": 0.5261 + }, + "test_lgb_calibrated": { + "accuracy": 0.7505, + "logloss": 0.5362 + }, + "test_ensemble_raw": { + "accuracy": 0.7518, + "logloss": 0.5261 + }, + "test_ensemble_calibrated": { + "accuracy": 0.7522, + "logloss": 0.5364 + } + }, + "OU25": { + "market": "OU25", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 5, + "eta": 0.01274409160014454, + "subsample": 0.8300258899365814, + "colsample_bytree": 0.7336425662264429, + "min_child_weight": 9, + "gamma": 2.5382243933649716e-06, + "reg_lambda": 5.096723080351853e-05, + "reg_alpha": 0.00040919711449493223 + }, + "lgb_best_params": { + "max_depth": 6, + "learning_rate": 0.02301514680733822, + "feature_fraction": 0.9569492061944688, + "bagging_fraction": 0.7249143523144639, + "bagging_freq": 1, + "min_child_samples": 40, + "lambda_l1": 9.954995248644963e-08, + "lambda_l2": 3.82413187126927e-06 + }, + "xgb_best_iteration": 475, + "lgb_best_iteration": 235, + "xgb_optuna_best_logloss": 0.6202, + "lgb_optuna_best_logloss": 0.62, + "test_xgb_raw": { + "accuracy": 0.6221, + "logloss": 0.6352 + }, + "test_xgb_calibrated": { + "accuracy": 0.6226, + "logloss": 0.6344 + }, + "test_lgb_raw": { + "accuracy": 0.6236, + "logloss": 0.6348 + }, + "test_lgb_calibrated": { + "accuracy": 0.6231, + "logloss": 0.6343 + }, + "test_ensemble_raw": { + "accuracy": 0.6239, + "logloss": 0.6349 + }, + "test_ensemble_calibrated": { + "accuracy": 0.6236, + "logloss": 0.6338 + } + }, + "OU35": { + "market": "OU35", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.012538827444713596, + "subsample": 0.7947923612828379, + "colsample_bytree": 0.9717654601553765, + "min_child_weight": 6, + "gamma": 0.011265216242399128, + "reg_lambda": 0.12152579364613436, + "reg_alpha": 0.013995120492957489 + }, + "lgb_best_params": { + "max_depth": 6, + "learning_rate": 0.013456307557939324, + "feature_fraction": 0.8208768633332759, + "bagging_fraction": 0.929472334516626, + "bagging_freq": 6, + "min_child_samples": 35, + "lambda_l1": 0.05522724221034949, + "lambda_l2": 0.21689047644122147 + }, + "xgb_best_iteration": 696, + "lgb_best_iteration": 412, + "xgb_optuna_best_logloss": 0.552, + "lgb_optuna_best_logloss": 0.5515, + "test_xgb_raw": { + "accuracy": 0.7314, + "logloss": 0.5466 + }, + "test_xgb_calibrated": { + "accuracy": 0.7293, + "logloss": 0.5482 + }, + "test_lgb_raw": { + "accuracy": 0.73, + "logloss": 0.5462 + }, + "test_lgb_calibrated": { + "accuracy": 0.7298, + "logloss": 0.5485 + }, + "test_ensemble_raw": { + "accuracy": 0.7312, + "logloss": 0.5462 + }, + "test_ensemble_calibrated": { + "accuracy": 0.7301, + "logloss": 0.5478 + } + }, + "BTTS": { + "market": "BTTS", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.023533647209064805, + "subsample": 0.7469060816054074, + "colsample_bytree": 0.8445418254808608, + "min_child_weight": 8, + "gamma": 1.0503733400514561e-08, + "reg_lambda": 2.0919595769527735e-06, + "reg_alpha": 0.027277017326535417 + }, + "lgb_best_params": { + "max_depth": 4, + "learning_rate": 0.03900730648793646, + "feature_fraction": 0.6968255358438369, + "bagging_fraction": 0.7078349435778689, + "bagging_freq": 1, + "min_child_samples": 46, + "lambda_l1": 1.1796591413903922e-05, + "lambda_l2": 1.574367227995052e-08 + }, + "xgb_best_iteration": 462, + "lgb_best_iteration": 339, + "xgb_optuna_best_logloss": 0.6557, + "lgb_optuna_best_logloss": 0.6554, + "test_xgb_raw": { + "accuracy": 0.5908, + "logloss": 0.6637 + }, + "test_xgb_calibrated": { + "accuracy": 0.5885, + "logloss": 0.6647 + }, + "test_lgb_raw": { + "accuracy": 0.5891, + "logloss": 0.6638 + }, + "test_lgb_calibrated": { + "accuracy": 0.5891, + "logloss": 0.6702 + }, + "test_ensemble_raw": { + "accuracy": 0.5892, + "logloss": 0.6635 + }, + "test_ensemble_calibrated": { + "accuracy": 0.5885, + "logloss": 0.6655 + } + }, + "HT_RESULT": { + "market": "HT_RESULT", + "samples": 103641, + "train": 62184, + "val": 15546, + "cal": 10364, + "test": 15547, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.01736265891311687, + "subsample": 0.8370935625192159, + "colsample_bytree": 0.8091927356001175, + "min_child_weight": 9, + "gamma": 0.0006570311316367184, + "reg_lambda": 0.5206211670360164, + "reg_alpha": 0.0004530536252850605 + }, + "lgb_best_params": { + "max_depth": 4, + "learning_rate": 0.04842652289664568, + "feature_fraction": 0.6277272818879166, + "bagging_fraction": 0.9526964840164693, + "bagging_freq": 3, + "min_child_samples": 23, + "lambda_l1": 0.09429192580834124, + "lambda_l2": 5.5433175427148124e-08 + }, + "xgb_best_iteration": 516, + "lgb_best_iteration": 136, + "xgb_optuna_best_logloss": 1.0128, + "lgb_optuna_best_logloss": 1.0126, + "test_xgb_raw": { + "accuracy": 0.4689, + "logloss": 1.0174 + }, + "test_xgb_calibrated": { + "accuracy": 0.4685, + "logloss": 1.0193 + }, + "test_lgb_raw": { + "accuracy": 0.4696, + "logloss": 1.018 + }, + "test_lgb_calibrated": { + "accuracy": 0.4685, + "logloss": 1.0248 + }, + "test_ensemble_raw": { + "accuracy": 0.4699, + "logloss": 1.0172 + }, + "test_ensemble_calibrated": { + "accuracy": 0.4693, + "logloss": 1.0195 + } + }, + "HT_OU05": { + "market": "HT_OU05", + "samples": 103641, + "train": 62184, + "val": 15546, + "cal": 10364, + "test": 15547, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.02440515089624656, + "subsample": 0.7173767988211683, + "colsample_bytree": 0.5705266148307722, + "min_child_weight": 10, + "gamma": 0.00010295747493868653, + "reg_lambda": 0.00048367003442154754, + "reg_alpha": 0.00018303274057896783 + }, + "lgb_best_params": { + "max_depth": 4, + "learning_rate": 0.043477055106943, + "feature_fraction": 0.5704621124873813, + "bagging_fraction": 0.9208787923016158, + "bagging_freq": 1, + "min_child_samples": 50, + "lambda_l1": 0.015064619068942013, + "lambda_l2": 6.143857495033091e-07 + }, + "xgb_best_iteration": 315, + "lgb_best_iteration": 133, + "xgb_optuna_best_logloss": 0.5756, + "lgb_optuna_best_logloss": 0.5757, + "test_xgb_raw": { + "accuracy": 0.7021, + "logloss": 0.5949 + }, + "test_xgb_calibrated": { + "accuracy": 0.7011, + "logloss": 0.5976 + }, + "test_lgb_raw": { + "accuracy": 0.7009, + "logloss": 0.5954 + }, + "test_lgb_calibrated": { + "accuracy": 0.7019, + "logloss": 0.6002 + }, + "test_ensemble_raw": { + "accuracy": 0.7012, + "logloss": 0.5947 + }, + "test_ensemble_calibrated": { + "accuracy": 0.7016, + "logloss": 0.5994 + } + }, + "HT_OU15": { + "market": "HT_OU15", + "samples": 103641, + "train": 62184, + "val": 15546, + "cal": 10364, + "test": 15547, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.032235943414662994, + "subsample": 0.9298749893021518, + "colsample_bytree": 0.8077813949235508, + "min_child_weight": 8, + "gamma": 0.00020929324388600622, + "reg_lambda": 3.2154973975232725e-05, + "reg_alpha": 1.5945155621686738e-08 + }, + "lgb_best_params": { + "max_depth": 5, + "learning_rate": 0.013909897616748226, + "feature_fraction": 0.5585477334219859, + "bagging_fraction": 0.9398770580467641, + "bagging_freq": 2, + "min_child_samples": 22, + "lambda_l1": 0.001865897980802303, + "lambda_l2": 2.6934572591055333e-06 + }, + "xgb_best_iteration": 188, + "lgb_best_iteration": 387, + "xgb_optuna_best_logloss": 0.616, + "lgb_optuna_best_logloss": 0.6159, + "test_xgb_raw": { + "accuracy": 0.6749, + "logloss": 0.6109 + }, + "test_xgb_calibrated": { + "accuracy": 0.6747, + "logloss": 0.6137 + }, + "test_lgb_raw": { + "accuracy": 0.6745, + "logloss": 0.6112 + }, + "test_lgb_calibrated": { + "accuracy": 0.6745, + "logloss": 0.6201 + }, + "test_ensemble_raw": { + "accuracy": 0.674, + "logloss": 0.6109 + }, + "test_ensemble_calibrated": { + "accuracy": 0.6744, + "logloss": 0.6174 + } + }, + "HTFT": { + "market": "HTFT", + "samples": 103641, + "train": 62184, + "val": 15546, + "cal": 10364, + "test": 15547, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.015239309183459821, + "subsample": 0.7923828997985648, + "colsample_bytree": 0.686316507387916, + "min_child_weight": 6, + "gamma": 0.005249577944740401, + "reg_lambda": 2.1813455810361064e-08, + "reg_alpha": 3.454483107951557e-06 + }, + "lgb_best_params": { + "max_depth": 4, + "learning_rate": 0.010347899501864056, + "feature_fraction": 0.9585697341293057, + "bagging_fraction": 0.9413628962257758, + "bagging_freq": 2, + "min_child_samples": 36, + "lambda_l1": 0.0015332771659626943, + "lambda_l2": 7.3640280079715765 + }, + "xgb_best_iteration": 714, + "lgb_best_iteration": 602, + "xgb_optuna_best_logloss": 1.7863, + "lgb_optuna_best_logloss": 1.7862, + "test_xgb_raw": { + "accuracy": 0.3349, + "logloss": 1.8179 + }, + "test_xgb_calibrated": { + "accuracy": 0.3332, + "logloss": 1.824 + }, + "test_lgb_raw": { + "accuracy": 0.3367, + "logloss": 1.8187 + }, + "test_lgb_calibrated": { + "accuracy": 0.335, + "logloss": 1.8338 + }, + "test_ensemble_raw": { + "accuracy": 0.3363, + "logloss": 1.8176 + }, + "test_ensemble_calibrated": { + "accuracy": 0.3338, + "logloss": 1.828 + } + }, + "ODD_EVEN": { + "market": "ODD_EVEN", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 8, + "eta": 0.01010929937405026, + "subsample": 0.9492996501687384, + "colsample_bytree": 0.9061960005014683, + "min_child_weight": 7, + "gamma": 2.664416507237002e-08, + "reg_lambda": 0.0003748192960525308, + "reg_alpha": 0.005287068300306146 + }, + "lgb_best_params": { + "max_depth": 8, + "learning_rate": 0.0634879805509945, + "feature_fraction": 0.9993568368122896, + "bagging_fraction": 0.9246236397710591, + "bagging_freq": 3, + "min_child_samples": 16, + "lambda_l1": 0.0016414429853061781, + "lambda_l2": 6.112007631403553e-05 + }, + "xgb_best_iteration": 322, + "lgb_best_iteration": 55, + "xgb_optuna_best_logloss": 0.6777, + "lgb_optuna_best_logloss": 0.6762, + "test_xgb_raw": { + "accuracy": 0.5216, + "logloss": 0.684 + }, + "test_xgb_calibrated": { + "accuracy": 0.5236, + "logloss": 0.6834 + }, + "test_lgb_raw": { + "accuracy": 0.5279, + "logloss": 0.6826 + }, + "test_lgb_calibrated": { + "accuracy": 0.5274, + "logloss": 0.6861 + }, + "test_ensemble_raw": { + "accuracy": 0.5239, + "logloss": 0.6828 + }, + "test_ensemble_calibrated": { + "accuracy": 0.5236, + "logloss": 0.6861 + } + }, + "CARDS_OU45": { + "market": "CARDS_OU45", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 8, + "eta": 0.010098671964329344, + "subsample": 0.9969616653360747, + "colsample_bytree": 0.5085930751344795, + "min_child_weight": 10, + "gamma": 0.8600893137103568, + "reg_lambda": 7.556243125116086, + "reg_alpha": 0.5596869360839299 + }, + "lgb_best_params": { + "max_depth": 8, + "learning_rate": 0.0183440412249233, + "feature_fraction": 0.5416111323291537, + "bagging_fraction": 0.9754210612419695, + "bagging_freq": 2, + "min_child_samples": 5, + "lambda_l1": 0.09157782079463243, + "lambda_l2": 2.559000594641019 + }, + "xgb_best_iteration": 973, + "lgb_best_iteration": 503, + "xgb_optuna_best_logloss": 0.6408, + "lgb_optuna_best_logloss": 0.6407, + "test_xgb_raw": { + "accuracy": 0.597, + "logloss": 0.6501 + }, + "test_xgb_calibrated": { + "accuracy": 0.6019, + "logloss": 0.6471 + }, + "test_lgb_raw": { + "accuracy": 0.5977, + "logloss": 0.6486 + }, + "test_lgb_calibrated": { + "accuracy": 0.6019, + "logloss": 0.6498 + }, + "test_ensemble_raw": { + "accuracy": 0.5964, + "logloss": 0.6487 + }, + "test_ensemble_calibrated": { + "accuracy": 0.6034, + "logloss": 0.6467 + } + }, + "HANDICAP_MS": { + "market": "HANDICAP_MS", + "samples": 106861, + "train": 64116, + "val": 16029, + "cal": 10686, + "test": 16030, + "features_used": 114, + "xgb_best_params": { + "max_depth": 4, + "eta": 0.01475719431584365, + "subsample": 0.867899230696633, + "colsample_bytree": 0.6518567347674479, + "min_child_weight": 9, + "gamma": 0.34932767754310273, + "reg_lambda": 3.3257801082201637e-07, + "reg_alpha": 4.6977721450875555e-06 + }, + "lgb_best_params": { + "max_depth": 7, + "learning_rate": 0.019649745228555244, + "feature_fraction": 0.7903699430858344, + "bagging_fraction": 0.7932436899357213, + "bagging_freq": 3, + "min_child_samples": 30, + "lambda_l1": 9.496143774926949e-08, + "lambda_l2": 0.0049885051588706136 + }, + "xgb_best_iteration": 1016, + "lgb_best_iteration": 364, + "xgb_optuna_best_logloss": 0.8328, + "lgb_optuna_best_logloss": 0.8322, + "test_xgb_raw": { + "accuracy": 0.6062, + "logloss": 0.871 + }, + "test_xgb_calibrated": { + "accuracy": 0.6039, + "logloss": 0.8729 + }, + "test_lgb_raw": { + "accuracy": 0.6079, + "logloss": 0.8713 + }, + "test_lgb_calibrated": { + "accuracy": 0.6067, + "logloss": 0.8736 + }, + "test_ensemble_raw": { + "accuracy": 0.6072, + "logloss": 0.8707 + }, + "test_ensemble_calibrated": { + "accuracy": 0.6066, + "logloss": 0.8728 + } + } + } +} \ No newline at end of file diff --git a/ai-engine/scripts/extract_training_data.py b/ai-engine/scripts/extract_training_data.py index 053004c..8226a21 100755 --- a/ai-engine/scripts/extract_training_data.py +++ b/ai-engine/scripts/extract_training_data.py @@ -14,6 +14,7 @@ import json import csv import math import time +import bisect from datetime import datetime from collections import defaultdict @@ -119,6 +120,14 @@ FEATURE_COLS = [ "home_key_players", "away_key_players", "home_missing_impact", "away_missing_impact", "home_goals_form", "away_goals_form", + + # Player-Level Features (12) + "home_lineup_goals_per90", "away_lineup_goals_per90", + "home_lineup_assists_per90", "away_lineup_assists_per90", + "home_squad_continuity", "away_squad_continuity", + "home_top_scorer_form", "away_top_scorer_form", + "home_avg_player_exp", "away_avg_player_exp", + "home_goals_diversity", "away_goals_diversity", # Labels "score_home", "score_away", "total_goals", @@ -336,7 +345,7 @@ class BatchDataLoader: self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals)) def _load_squad_data(self): - """Bulk load squad participation + player events for squad features.""" + """Bulk load squad participation + player events + player career for squad features.""" ph = ",".join(["%s"] * len(self.top_league_ids)) # 1) Participation: starting XI count + position distribution per (match, team) @@ -429,9 +438,90 @@ class BatchDataLoader: for m in self.matches: match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc - # 6) Build combined cache — NO DATA LEAKAGE - # goals_form: avg goals from last 5 matches BEFORE this match (not this match!) - # squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists + # ─── NEW: Player Career Stats (prefix-sum for O(1) temporal lookup) ─── + # 6a) Goals per player per match date + self.cur.execute(f""" + SELECT mpe.player_id, m.mst_utc, + SUM(CASE WHEN mpe.event_type = 'goal' + AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' + THEN 1 ELSE 0 END) AS goals + FROM match_player_events mpe + JOIN matches m ON mpe.match_id = m.id + WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) + GROUP BY mpe.player_id, m.mst_utc + """, self.top_league_ids) + + player_goals_raw = defaultdict(dict) + for pid, mst, goals in self.cur.fetchall(): + player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0) + + # 6b) Assists per player per match date + self.cur.execute(f""" + SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists + FROM match_player_events mpe + JOIN matches m ON mpe.match_id = m.id + WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) + AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL + GROUP BY mpe.assist_player_id, m.mst_utc + """, self.top_league_ids) + + player_assists_raw = defaultdict(dict) + for pid, mst, assists in self.cur.fetchall(): + player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0) + + # 6c) Player participation dates (starts only) + self.cur.execute(f""" + SELECT mpp.player_id, m.mst_utc + FROM match_player_participation mpp + JOIN matches m ON mpp.match_id = m.id + WHERE mpp.is_starting = true + AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) + ORDER BY mpp.player_id, m.mst_utc + """, self.top_league_ids) + + player_starts_raw = defaultdict(list) + for pid, mst in self.cur.fetchall(): + player_starts_raw[pid].append(mst) + + # 6d) Build prefix sums per player (goals_prefix[i] = total goals up to start i) + player_career = {} + all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys()) + for pid in all_pids: + starts = sorted(set(player_starts_raw.get(pid, []))) + if not starts: + continue + g_map = player_goals_raw.get(pid, {}) + a_map = player_assists_raw.get(pid, {}) + cum_g, cum_a = 0, 0 + goals_pf, assists_pf = [], [] + for mst in starts: + cum_g += g_map.get(mst, 0) + cum_a += a_map.get(mst, 0) + goals_pf.append(cum_g) + assists_pf.append(cum_a) + player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf} + + # Free raw dicts + del player_goals_raw, player_assists_raw, player_starts_raw + print(f" 📊 Player careers built: {len(player_career)} players", flush=True) + + # ─── NEW: Team Lineup History (for squad continuity) ─── + # 7) Per-team sorted lineups: [(mst, frozenset(player_ids))] + team_lineup_map = defaultdict(list) + for (mid, tid), pids in starting_players.items(): + mst = match_mst.get(mid, 0) + if mst > 0 and pids: + team_lineup_map[tid].append((mst, frozenset(pids))) + + team_lineup_history = {} + team_lineup_msts = {} + for tid, ll in team_lineup_map.items(): + ll.sort(key=lambda x: x[0]) + team_lineup_history[tid] = ll + team_lineup_msts[tid] = [x[0] for x in ll] + del team_lineup_map + + # ─── 8) Build combined cache — NO DATA LEAKAGE ─── all_keys = set(participation.keys()) | set(events.keys()) for key in all_keys: mid, tid = key @@ -443,30 +533,78 @@ class BatchDataLoader: kp_total = len(key_players_by_team.get(tid, set())) kp_missing = max(0, kp_total - kp_in_starting) - # Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!) + # Squad quality: composite score — ONLY pre-match info squad_quality = ( part['starting_count'] * 0.3 + kp_in_starting * 3.0 + part['fwd_count'] * 1.5 ) - # Missing impact: how many key players are missing missing_impact = min(kp_missing / max(kp_total, 1), 1.0) # goals_form: avg goals from last 5 matches BEFORE this match current_mst = match_mst.get(mid, 0) team_history = self.team_matches.get(tid, []) recent_goals = [ - tm[2] # team_score - for tm in team_history - if tm[0] < current_mst # only matches BEFORE this one - ][-5:] # last 5 + tm[2] for tm in team_history if tm[0] < current_mst + ][-5:] goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3 + # ─── NEW: Player-level aggregation for starting XI ─── + lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0 + best_scorer_total, best_scorer_id = 0, None + scorers_in_lineup = 0 + + for pid in starters: + pc = player_career.get(pid) + if not pc: + continue + idx = bisect.bisect_left(pc['msts'], current_mst) + if idx == 0: + continue # no prior matches for this player + prior_starts = idx + prior_goals = pc['gp'][idx - 1] + prior_assists = pc['ap'][idx - 1] + lineup_g90 += prior_goals / prior_starts + lineup_a90 += prior_assists / prior_starts + total_exp += prior_starts + if prior_goals > 0: + scorers_in_lineup += 1 + if prior_goals > best_scorer_total: + best_scorer_total = prior_goals + best_scorer_id = pid + + n_st = len(starters) or 1 + + # Top scorer recent form (goals in last 5 starts) + top_scorer_form = 0 + if best_scorer_id: + pc = player_career.get(best_scorer_id) + if pc: + idx = bisect.bisect_left(pc['msts'], current_mst) + if idx > 0: + s5 = max(0, idx - 5) + top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0) + + # Squad continuity (overlap with previous match lineup) + squad_continuity = 0.5 + msts_list = team_lineup_msts.get(tid) + if msts_list: + li = bisect.bisect_left(msts_list, current_mst) + if li > 0: + prev_lineup = team_lineup_history[tid][li - 1][1] + squad_continuity = len(frozenset(starters) & prev_lineup) / n_st + self.squad_cache[key] = { 'squad_quality': squad_quality, 'key_players': kp_in_starting, 'missing_impact': missing_impact, 'goals_form': round(goals_form, 2), + 'lineup_goals_per90': round(lineup_g90, 3), + 'lineup_assists_per90': round(lineup_a90, 3), + 'squad_continuity': round(squad_continuity, 3), + 'top_scorer_form': top_scorer_form, + 'avg_player_exp': round(total_exp / n_st, 1), + 'goals_diversity': round(scorers_in_lineup / n_st, 3), } def _load_cards_data(self): @@ -855,6 +993,20 @@ class FeatureExtractor: "away_missing_impact": away_missing_impact, "home_goals_form": home_goals_form, "away_goals_form": away_goals_form, + + # Player-Level Features + "home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0), + "away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0), + "home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0), + "away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0), + "home_squad_continuity": home_sq.get('squad_continuity', 0.5), + "away_squad_continuity": away_sq.get('squad_continuity', 0.5), + "home_top_scorer_form": home_sq.get('top_scorer_form', 0), + "away_top_scorer_form": away_sq.get('top_scorer_form', 0), + "home_avg_player_exp": home_sq.get('avg_player_exp', 0.0), + "away_avg_player_exp": away_sq.get('avg_player_exp', 0.0), + "home_goals_diversity": home_sq.get('goals_diversity', 0.0), + "away_goals_diversity": away_sq.get('goals_diversity', 0.0), # Labels "score_home": sh, diff --git a/ai-engine/scripts/train_v25_pro.py b/ai-engine/scripts/train_v25_pro.py index 991fb13..0e360d0 100644 --- a/ai-engine/scripts/train_v25_pro.py +++ b/ai-engine/scripts/train_v25_pro.py @@ -23,7 +23,7 @@ import optuna from optuna.samplers import TPESampler from datetime import datetime from sklearn.metrics import accuracy_score, log_loss, classification_report -from sklearn.calibration import CalibratedClassifierCV +from sklearn.isotonic import IsotonicRegression from sklearn.base import BaseEstimator, ClassifierMixin optuna.logging.set_verbosity(optuna.logging.WARNING) @@ -38,7 +38,7 @@ REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25") os.makedirs(MODELS_DIR, exist_ok=True) os.makedirs(REPORTS_DIR, exist_ok=True) -# ─── Feature Columns (83 features, NO target leakage) ─────────────── +# ─── Feature Columns (95 features, NO target leakage) ─────────────── FEATURES = [ # ELO (8) "home_overall_elo", "away_overall_elo", "elo_diff", @@ -94,6 +94,13 @@ FEATURES = [ "home_key_players", "away_key_players", "home_missing_impact", "away_missing_impact", "home_goals_form", "away_goals_form", + # Player-Level Features (12) + "home_lineup_goals_per90", "away_lineup_goals_per90", + "home_lineup_assists_per90", "away_lineup_assists_per90", + "home_squad_continuity", "away_squad_continuity", + "home_top_scorer_form", "away_top_scorer_form", + "home_avg_player_exp", "away_avg_player_exp", + "home_goals_diversity", "away_goals_diversity", ] MARKET_CONFIGS = [ @@ -349,18 +356,34 @@ def train_market(df, target_col, market_name, num_class, n_trials): print(f"[OK] LGB final: iter={lgb_model.best_iteration}") # ── Phase 4: Isotonic Calibration on cal set ───────────────── - print("[CAL] Fitting Isotonic Regression...") + print("[CAL] Fitting Isotonic Regression (per-class)...") - # XGB calibration - xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration) - xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit") - xgb_wrapper.fit(X_train, y_train) - xgb_calibrated.fit(X_cal, y_cal) + # XGB calibration — manual IsotonicRegression per class + dcal = xgb.DMatrix(X_cal) + xgb_cal_raw = xgb_model.predict(dcal) + if len(xgb_cal_raw.shape) == 1: + xgb_cal_raw = np.column_stack([1 - xgb_cal_raw, xgb_cal_raw]) - # LGB calibration — use raw predictions approach - lgb_cal_preds = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration) - if len(lgb_cal_preds.shape) == 1: - lgb_cal_preds = np.column_stack([1 - lgb_cal_preds, lgb_cal_preds]) + xgb_iso_calibrators = [] + for cls_idx in range(num_class): + ir = IsotonicRegression(out_of_bounds="clip") + y_binary = (y_cal == cls_idx).astype(float) + ir.fit(xgb_cal_raw[:, cls_idx], y_binary) + xgb_iso_calibrators.append(ir) + print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes") + + # LGB calibration — manual IsotonicRegression per class + lgb_cal_raw = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration) + if len(lgb_cal_raw.shape) == 1: + lgb_cal_raw = np.column_stack([1 - lgb_cal_raw, lgb_cal_raw]) + + lgb_iso_calibrators = [] + for cls_idx in range(num_class): + ir = IsotonicRegression(out_of_bounds="clip") + y_binary = (y_cal == cls_idx).astype(float) + ir.fit(lgb_cal_raw[:, cls_idx], y_binary) + lgb_iso_calibrators.append(ir) + print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes") # ── Phase 5: Evaluate on test set ──────────────────────────── print("\n[EVAL] Test set evaluation...") @@ -371,16 +394,26 @@ def train_market(df, target_col, market_name, num_class, n_trials): if len(xgb_raw_probs.shape) == 1: xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs]) - # Calibrated XGB - xgb_cal_probs = xgb_calibrated.predict_proba(X_test) + # Calibrated XGB — apply isotonic per class + renormalize + xgb_cal_probs = np.column_stack([ + xgb_iso_calibrators[i].predict(xgb_raw_probs[:, i]) for i in range(num_class) + ]) + xgb_cal_probs = xgb_cal_probs / xgb_cal_probs.sum(axis=1, keepdims=True) # Raw LGB lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration) if len(lgb_raw_probs.shape) == 1: lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs]) - # Ensemble (raw) + # Calibrated LGB — apply isotonic per class + renormalize + lgb_cal_probs = np.column_stack([ + lgb_iso_calibrators[i].predict(lgb_raw_probs[:, i]) for i in range(num_class) + ]) + lgb_cal_probs = lgb_cal_probs / lgb_cal_probs.sum(axis=1, keepdims=True) + + # Ensembles raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2 + cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2 def _eval(probs, label): preds = np.argmax(probs, axis=1) @@ -392,7 +425,9 @@ def train_market(df, target_col, market_name, num_class, n_trials): m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw") m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated") m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw") + m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated") m_ensemble = _eval(raw_ensemble, "Ensemble Raw") + m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated") # Classification report for ensemble ens_preds = np.argmax(raw_ensemble, axis=1) @@ -409,11 +444,16 @@ def train_market(df, target_col, market_name, num_class, n_trials): lgb_model.save_model(lgb_path) print(f"[SAVE] {lgb_path}") - # Calibrated model - cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl") - with open(cal_path, "wb") as f: - pickle.dump(xgb_calibrated, f) - print(f"[SAVE] {cal_path}") + # Isotonic calibrators (XGB + LGB) + xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_name.lower()}.pkl") + with open(xgb_cal_path, "wb") as f: + pickle.dump(xgb_iso_calibrators, f) + print(f"[SAVE] {xgb_cal_path}") + + lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_name.lower()}.pkl") + with open(lgb_cal_path, "wb") as f: + pickle.dump(lgb_iso_calibrators, f) + print(f"[SAVE] {lgb_cal_path}") return { "market": market_name, @@ -432,7 +472,9 @@ def train_market(df, target_col, market_name, num_class, n_trials): "test_xgb_raw": m_xgb_raw, "test_xgb_calibrated": m_xgb_cal, "test_lgb_raw": m_lgb_raw, + "test_lgb_calibrated": m_lgb_cal, "test_ensemble_raw": m_ensemble, + "test_ensemble_calibrated": m_cal_ensemble, } @@ -495,8 +537,12 @@ def main(): print("[SUMMARY]") print("=" * 60) for name, m in all_metrics["markets"].items(): - ens = m.get("test_ensemble_raw", {}) - print(f" {name:12s} | Acc={ens.get('accuracy','?'):>6s} | LL={ens.get('logloss','?'):>6s} | " + ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {})) + acc = ens.get('accuracy', '?') + ll = ens.get('logloss', '?') + acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc) + ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll) + print(f" {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | " f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}") print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")