2 Commits

Author SHA1 Message Date
fahricansecer 8ce8fa5b94 Merge pull request 'gg' (#6) from v28 into main
Deploy Iddaai Backend / build-and-deploy (push) Successful in 39s
Reviewed-on: #6
2026-05-10 10:39:32 +03:00
fahricansecer 497b5d8d3b Merge pull request 'feat(ai-engine): value sniper thresholds and logic relaxed' (#5) from v28 into main
Deploy Iddaai Backend / build-and-deploy (push) Successful in 30s
Reviewed-on: #5
2026-05-06 17:56:24 +03:00
3 changed files with 32 additions and 922 deletions
@@ -1,692 +0,0 @@
{
"trained_at": "2026-05-10 19:48:06",
"trainer": "v25_pro",
"optuna_trials": 50,
"total_features": 114,
"markets": {
"MS": {
"market": "MS",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.022329400652878233,
"subsample": 0.6690795757813364,
"colsample_bytree": 0.5042256538541441,
"min_child_weight": 6,
"gamma": 9.960129417155444e-05,
"reg_lambda": 0.5132295377582388,
"reg_alpha": 6.804503659726287e-08
},
"lgb_best_params": {
"max_depth": 4,
"learning_rate": 0.023142410802706542,
"feature_fraction": 0.5728681432360808,
"bagging_fraction": 0.6781774410065095,
"bagging_freq": 2,
"min_child_samples": 26,
"lambda_l1": 3.25216937188593e-05,
"lambda_l2": 4.8081236902660474e-08
},
"xgb_best_iteration": 643,
"lgb_best_iteration": 441,
"xgb_optuna_best_logloss": 0.9155,
"lgb_optuna_best_logloss": 0.9146,
"test_xgb_raw": {
"accuracy": 0.5442,
"logloss": 0.943
},
"test_xgb_calibrated": {
"accuracy": 0.5404,
"logloss": 0.9438
},
"test_lgb_raw": {
"accuracy": 0.5427,
"logloss": 0.943
},
"test_lgb_calibrated": {
"accuracy": 0.5417,
"logloss": 0.9447
},
"test_ensemble_raw": {
"accuracy": 0.5437,
"logloss": 0.9426
},
"test_ensemble_calibrated": {
"accuracy": 0.5418,
"logloss": 0.9435
}
},
"OU15": {
"market": "OU15",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 5,
"eta": 0.020779487257177966,
"subsample": 0.8109935286948485,
"colsample_bytree": 0.9525413847213635,
"min_child_weight": 6,
"gamma": 0.35330347775044696,
"reg_lambda": 5.373541021746059e-07,
"reg_alpha": 0.2959430087754284
},
"lgb_best_params": {
"max_depth": 6,
"learning_rate": 0.013402310027682367,
"feature_fraction": 0.7404728146233901,
"bagging_fraction": 0.9712026511549247,
"bagging_freq": 6,
"min_child_samples": 39,
"lambda_l1": 0.39893027986899576,
"lambda_l2": 0.0626443611997599
},
"xgb_best_iteration": 353,
"lgb_best_iteration": 370,
"xgb_optuna_best_logloss": 0.499,
"lgb_optuna_best_logloss": 0.4989,
"test_xgb_raw": {
"accuracy": 0.7521,
"logloss": 0.5267
},
"test_xgb_calibrated": {
"accuracy": 0.7521,
"logloss": 0.5344
},
"test_lgb_raw": {
"accuracy": 0.7528,
"logloss": 0.5261
},
"test_lgb_calibrated": {
"accuracy": 0.7505,
"logloss": 0.5362
},
"test_ensemble_raw": {
"accuracy": 0.7518,
"logloss": 0.5261
},
"test_ensemble_calibrated": {
"accuracy": 0.7522,
"logloss": 0.5364
}
},
"OU25": {
"market": "OU25",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 5,
"eta": 0.01274409160014454,
"subsample": 0.8300258899365814,
"colsample_bytree": 0.7336425662264429,
"min_child_weight": 9,
"gamma": 2.5382243933649716e-06,
"reg_lambda": 5.096723080351853e-05,
"reg_alpha": 0.00040919711449493223
},
"lgb_best_params": {
"max_depth": 6,
"learning_rate": 0.02301514680733822,
"feature_fraction": 0.9569492061944688,
"bagging_fraction": 0.7249143523144639,
"bagging_freq": 1,
"min_child_samples": 40,
"lambda_l1": 9.954995248644963e-08,
"lambda_l2": 3.82413187126927e-06
},
"xgb_best_iteration": 475,
"lgb_best_iteration": 235,
"xgb_optuna_best_logloss": 0.6202,
"lgb_optuna_best_logloss": 0.62,
"test_xgb_raw": {
"accuracy": 0.6221,
"logloss": 0.6352
},
"test_xgb_calibrated": {
"accuracy": 0.6226,
"logloss": 0.6344
},
"test_lgb_raw": {
"accuracy": 0.6236,
"logloss": 0.6348
},
"test_lgb_calibrated": {
"accuracy": 0.6231,
"logloss": 0.6343
},
"test_ensemble_raw": {
"accuracy": 0.6239,
"logloss": 0.6349
},
"test_ensemble_calibrated": {
"accuracy": 0.6236,
"logloss": 0.6338
}
},
"OU35": {
"market": "OU35",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.012538827444713596,
"subsample": 0.7947923612828379,
"colsample_bytree": 0.9717654601553765,
"min_child_weight": 6,
"gamma": 0.011265216242399128,
"reg_lambda": 0.12152579364613436,
"reg_alpha": 0.013995120492957489
},
"lgb_best_params": {
"max_depth": 6,
"learning_rate": 0.013456307557939324,
"feature_fraction": 0.8208768633332759,
"bagging_fraction": 0.929472334516626,
"bagging_freq": 6,
"min_child_samples": 35,
"lambda_l1": 0.05522724221034949,
"lambda_l2": 0.21689047644122147
},
"xgb_best_iteration": 696,
"lgb_best_iteration": 412,
"xgb_optuna_best_logloss": 0.552,
"lgb_optuna_best_logloss": 0.5515,
"test_xgb_raw": {
"accuracy": 0.7314,
"logloss": 0.5466
},
"test_xgb_calibrated": {
"accuracy": 0.7293,
"logloss": 0.5482
},
"test_lgb_raw": {
"accuracy": 0.73,
"logloss": 0.5462
},
"test_lgb_calibrated": {
"accuracy": 0.7298,
"logloss": 0.5485
},
"test_ensemble_raw": {
"accuracy": 0.7312,
"logloss": 0.5462
},
"test_ensemble_calibrated": {
"accuracy": 0.7301,
"logloss": 0.5478
}
},
"BTTS": {
"market": "BTTS",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.023533647209064805,
"subsample": 0.7469060816054074,
"colsample_bytree": 0.8445418254808608,
"min_child_weight": 8,
"gamma": 1.0503733400514561e-08,
"reg_lambda": 2.0919595769527735e-06,
"reg_alpha": 0.027277017326535417
},
"lgb_best_params": {
"max_depth": 4,
"learning_rate": 0.03900730648793646,
"feature_fraction": 0.6968255358438369,
"bagging_fraction": 0.7078349435778689,
"bagging_freq": 1,
"min_child_samples": 46,
"lambda_l1": 1.1796591413903922e-05,
"lambda_l2": 1.574367227995052e-08
},
"xgb_best_iteration": 462,
"lgb_best_iteration": 339,
"xgb_optuna_best_logloss": 0.6557,
"lgb_optuna_best_logloss": 0.6554,
"test_xgb_raw": {
"accuracy": 0.5908,
"logloss": 0.6637
},
"test_xgb_calibrated": {
"accuracy": 0.5885,
"logloss": 0.6647
},
"test_lgb_raw": {
"accuracy": 0.5891,
"logloss": 0.6638
},
"test_lgb_calibrated": {
"accuracy": 0.5891,
"logloss": 0.6702
},
"test_ensemble_raw": {
"accuracy": 0.5892,
"logloss": 0.6635
},
"test_ensemble_calibrated": {
"accuracy": 0.5885,
"logloss": 0.6655
}
},
"HT_RESULT": {
"market": "HT_RESULT",
"samples": 103641,
"train": 62184,
"val": 15546,
"cal": 10364,
"test": 15547,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.01736265891311687,
"subsample": 0.8370935625192159,
"colsample_bytree": 0.8091927356001175,
"min_child_weight": 9,
"gamma": 0.0006570311316367184,
"reg_lambda": 0.5206211670360164,
"reg_alpha": 0.0004530536252850605
},
"lgb_best_params": {
"max_depth": 4,
"learning_rate": 0.04842652289664568,
"feature_fraction": 0.6277272818879166,
"bagging_fraction": 0.9526964840164693,
"bagging_freq": 3,
"min_child_samples": 23,
"lambda_l1": 0.09429192580834124,
"lambda_l2": 5.5433175427148124e-08
},
"xgb_best_iteration": 516,
"lgb_best_iteration": 136,
"xgb_optuna_best_logloss": 1.0128,
"lgb_optuna_best_logloss": 1.0126,
"test_xgb_raw": {
"accuracy": 0.4689,
"logloss": 1.0174
},
"test_xgb_calibrated": {
"accuracy": 0.4685,
"logloss": 1.0193
},
"test_lgb_raw": {
"accuracy": 0.4696,
"logloss": 1.018
},
"test_lgb_calibrated": {
"accuracy": 0.4685,
"logloss": 1.0248
},
"test_ensemble_raw": {
"accuracy": 0.4699,
"logloss": 1.0172
},
"test_ensemble_calibrated": {
"accuracy": 0.4693,
"logloss": 1.0195
}
},
"HT_OU05": {
"market": "HT_OU05",
"samples": 103641,
"train": 62184,
"val": 15546,
"cal": 10364,
"test": 15547,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.02440515089624656,
"subsample": 0.7173767988211683,
"colsample_bytree": 0.5705266148307722,
"min_child_weight": 10,
"gamma": 0.00010295747493868653,
"reg_lambda": 0.00048367003442154754,
"reg_alpha": 0.00018303274057896783
},
"lgb_best_params": {
"max_depth": 4,
"learning_rate": 0.043477055106943,
"feature_fraction": 0.5704621124873813,
"bagging_fraction": 0.9208787923016158,
"bagging_freq": 1,
"min_child_samples": 50,
"lambda_l1": 0.015064619068942013,
"lambda_l2": 6.143857495033091e-07
},
"xgb_best_iteration": 315,
"lgb_best_iteration": 133,
"xgb_optuna_best_logloss": 0.5756,
"lgb_optuna_best_logloss": 0.5757,
"test_xgb_raw": {
"accuracy": 0.7021,
"logloss": 0.5949
},
"test_xgb_calibrated": {
"accuracy": 0.7011,
"logloss": 0.5976
},
"test_lgb_raw": {
"accuracy": 0.7009,
"logloss": 0.5954
},
"test_lgb_calibrated": {
"accuracy": 0.7019,
"logloss": 0.6002
},
"test_ensemble_raw": {
"accuracy": 0.7012,
"logloss": 0.5947
},
"test_ensemble_calibrated": {
"accuracy": 0.7016,
"logloss": 0.5994
}
},
"HT_OU15": {
"market": "HT_OU15",
"samples": 103641,
"train": 62184,
"val": 15546,
"cal": 10364,
"test": 15547,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.032235943414662994,
"subsample": 0.9298749893021518,
"colsample_bytree": 0.8077813949235508,
"min_child_weight": 8,
"gamma": 0.00020929324388600622,
"reg_lambda": 3.2154973975232725e-05,
"reg_alpha": 1.5945155621686738e-08
},
"lgb_best_params": {
"max_depth": 5,
"learning_rate": 0.013909897616748226,
"feature_fraction": 0.5585477334219859,
"bagging_fraction": 0.9398770580467641,
"bagging_freq": 2,
"min_child_samples": 22,
"lambda_l1": 0.001865897980802303,
"lambda_l2": 2.6934572591055333e-06
},
"xgb_best_iteration": 188,
"lgb_best_iteration": 387,
"xgb_optuna_best_logloss": 0.616,
"lgb_optuna_best_logloss": 0.6159,
"test_xgb_raw": {
"accuracy": 0.6749,
"logloss": 0.6109
},
"test_xgb_calibrated": {
"accuracy": 0.6747,
"logloss": 0.6137
},
"test_lgb_raw": {
"accuracy": 0.6745,
"logloss": 0.6112
},
"test_lgb_calibrated": {
"accuracy": 0.6745,
"logloss": 0.6201
},
"test_ensemble_raw": {
"accuracy": 0.674,
"logloss": 0.6109
},
"test_ensemble_calibrated": {
"accuracy": 0.6744,
"logloss": 0.6174
}
},
"HTFT": {
"market": "HTFT",
"samples": 103641,
"train": 62184,
"val": 15546,
"cal": 10364,
"test": 15547,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.015239309183459821,
"subsample": 0.7923828997985648,
"colsample_bytree": 0.686316507387916,
"min_child_weight": 6,
"gamma": 0.005249577944740401,
"reg_lambda": 2.1813455810361064e-08,
"reg_alpha": 3.454483107951557e-06
},
"lgb_best_params": {
"max_depth": 4,
"learning_rate": 0.010347899501864056,
"feature_fraction": 0.9585697341293057,
"bagging_fraction": 0.9413628962257758,
"bagging_freq": 2,
"min_child_samples": 36,
"lambda_l1": 0.0015332771659626943,
"lambda_l2": 7.3640280079715765
},
"xgb_best_iteration": 714,
"lgb_best_iteration": 602,
"xgb_optuna_best_logloss": 1.7863,
"lgb_optuna_best_logloss": 1.7862,
"test_xgb_raw": {
"accuracy": 0.3349,
"logloss": 1.8179
},
"test_xgb_calibrated": {
"accuracy": 0.3332,
"logloss": 1.824
},
"test_lgb_raw": {
"accuracy": 0.3367,
"logloss": 1.8187
},
"test_lgb_calibrated": {
"accuracy": 0.335,
"logloss": 1.8338
},
"test_ensemble_raw": {
"accuracy": 0.3363,
"logloss": 1.8176
},
"test_ensemble_calibrated": {
"accuracy": 0.3338,
"logloss": 1.828
}
},
"ODD_EVEN": {
"market": "ODD_EVEN",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 8,
"eta": 0.01010929937405026,
"subsample": 0.9492996501687384,
"colsample_bytree": 0.9061960005014683,
"min_child_weight": 7,
"gamma": 2.664416507237002e-08,
"reg_lambda": 0.0003748192960525308,
"reg_alpha": 0.005287068300306146
},
"lgb_best_params": {
"max_depth": 8,
"learning_rate": 0.0634879805509945,
"feature_fraction": 0.9993568368122896,
"bagging_fraction": 0.9246236397710591,
"bagging_freq": 3,
"min_child_samples": 16,
"lambda_l1": 0.0016414429853061781,
"lambda_l2": 6.112007631403553e-05
},
"xgb_best_iteration": 322,
"lgb_best_iteration": 55,
"xgb_optuna_best_logloss": 0.6777,
"lgb_optuna_best_logloss": 0.6762,
"test_xgb_raw": {
"accuracy": 0.5216,
"logloss": 0.684
},
"test_xgb_calibrated": {
"accuracy": 0.5236,
"logloss": 0.6834
},
"test_lgb_raw": {
"accuracy": 0.5279,
"logloss": 0.6826
},
"test_lgb_calibrated": {
"accuracy": 0.5274,
"logloss": 0.6861
},
"test_ensemble_raw": {
"accuracy": 0.5239,
"logloss": 0.6828
},
"test_ensemble_calibrated": {
"accuracy": 0.5236,
"logloss": 0.6861
}
},
"CARDS_OU45": {
"market": "CARDS_OU45",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 8,
"eta": 0.010098671964329344,
"subsample": 0.9969616653360747,
"colsample_bytree": 0.5085930751344795,
"min_child_weight": 10,
"gamma": 0.8600893137103568,
"reg_lambda": 7.556243125116086,
"reg_alpha": 0.5596869360839299
},
"lgb_best_params": {
"max_depth": 8,
"learning_rate": 0.0183440412249233,
"feature_fraction": 0.5416111323291537,
"bagging_fraction": 0.9754210612419695,
"bagging_freq": 2,
"min_child_samples": 5,
"lambda_l1": 0.09157782079463243,
"lambda_l2": 2.559000594641019
},
"xgb_best_iteration": 973,
"lgb_best_iteration": 503,
"xgb_optuna_best_logloss": 0.6408,
"lgb_optuna_best_logloss": 0.6407,
"test_xgb_raw": {
"accuracy": 0.597,
"logloss": 0.6501
},
"test_xgb_calibrated": {
"accuracy": 0.6019,
"logloss": 0.6471
},
"test_lgb_raw": {
"accuracy": 0.5977,
"logloss": 0.6486
},
"test_lgb_calibrated": {
"accuracy": 0.6019,
"logloss": 0.6498
},
"test_ensemble_raw": {
"accuracy": 0.5964,
"logloss": 0.6487
},
"test_ensemble_calibrated": {
"accuracy": 0.6034,
"logloss": 0.6467
}
},
"HANDICAP_MS": {
"market": "HANDICAP_MS",
"samples": 106861,
"train": 64116,
"val": 16029,
"cal": 10686,
"test": 16030,
"features_used": 114,
"xgb_best_params": {
"max_depth": 4,
"eta": 0.01475719431584365,
"subsample": 0.867899230696633,
"colsample_bytree": 0.6518567347674479,
"min_child_weight": 9,
"gamma": 0.34932767754310273,
"reg_lambda": 3.3257801082201637e-07,
"reg_alpha": 4.6977721450875555e-06
},
"lgb_best_params": {
"max_depth": 7,
"learning_rate": 0.019649745228555244,
"feature_fraction": 0.7903699430858344,
"bagging_fraction": 0.7932436899357213,
"bagging_freq": 3,
"min_child_samples": 30,
"lambda_l1": 9.496143774926949e-08,
"lambda_l2": 0.0049885051588706136
},
"xgb_best_iteration": 1016,
"lgb_best_iteration": 364,
"xgb_optuna_best_logloss": 0.8328,
"lgb_optuna_best_logloss": 0.8322,
"test_xgb_raw": {
"accuracy": 0.6062,
"logloss": 0.871
},
"test_xgb_calibrated": {
"accuracy": 0.6039,
"logloss": 0.8729
},
"test_lgb_raw": {
"accuracy": 0.6079,
"logloss": 0.8713
},
"test_lgb_calibrated": {
"accuracy": 0.6067,
"logloss": 0.8736
},
"test_ensemble_raw": {
"accuracy": 0.6072,
"logloss": 0.8707
},
"test_ensemble_calibrated": {
"accuracy": 0.6066,
"logloss": 0.8728
}
}
}
}
+10 -162
View File
@@ -14,7 +14,6 @@ import json
import csv import csv
import math import math
import time import time
import bisect
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
@@ -120,14 +119,6 @@ FEATURE_COLS = [
"home_key_players", "away_key_players", "home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact", "home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form", "home_goals_form", "away_goals_form",
# Player-Level Features (12)
"home_lineup_goals_per90", "away_lineup_goals_per90",
"home_lineup_assists_per90", "away_lineup_assists_per90",
"home_squad_continuity", "away_squad_continuity",
"home_top_scorer_form", "away_top_scorer_form",
"home_avg_player_exp", "away_avg_player_exp",
"home_goals_diversity", "away_goals_diversity",
# Labels # Labels
"score_home", "score_away", "total_goals", "score_home", "score_away", "total_goals",
@@ -345,7 +336,7 @@ class BatchDataLoader:
self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals)) self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))
def _load_squad_data(self): def _load_squad_data(self):
"""Bulk load squad participation + player events + player career for squad features.""" """Bulk load squad participation + player events for squad features."""
ph = ",".join(["%s"] * len(self.top_league_ids)) ph = ",".join(["%s"] * len(self.top_league_ids))
# 1) Participation: starting XI count + position distribution per (match, team) # 1) Participation: starting XI count + position distribution per (match, team)
@@ -438,90 +429,9 @@ class BatchDataLoader:
for m in self.matches: for m in self.matches:
match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc
# ─── NEW: Player Career Stats (prefix-sum for O(1) temporal lookup) ─── # 6) Build combined cache — NO DATA LEAKAGE
# 6a) Goals per player per match date # goals_form: avg goals from last 5 matches BEFORE this match (not this match!)
self.cur.execute(f""" # squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists
SELECT mpe.player_id, m.mst_utc,
SUM(CASE WHEN mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
THEN 1 ELSE 0 END) AS goals
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
GROUP BY mpe.player_id, m.mst_utc
""", self.top_league_ids)
player_goals_raw = defaultdict(dict)
for pid, mst, goals in self.cur.fetchall():
player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0)
# 6b) Assists per player per match date
self.cur.execute(f"""
SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
GROUP BY mpe.assist_player_id, m.mst_utc
""", self.top_league_ids)
player_assists_raw = defaultdict(dict)
for pid, mst, assists in self.cur.fetchall():
player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0)
# 6c) Player participation dates (starts only)
self.cur.execute(f"""
SELECT mpp.player_id, m.mst_utc
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE mpp.is_starting = true
AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
ORDER BY mpp.player_id, m.mst_utc
""", self.top_league_ids)
player_starts_raw = defaultdict(list)
for pid, mst in self.cur.fetchall():
player_starts_raw[pid].append(mst)
# 6d) Build prefix sums per player (goals_prefix[i] = total goals up to start i)
player_career = {}
all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys())
for pid in all_pids:
starts = sorted(set(player_starts_raw.get(pid, [])))
if not starts:
continue
g_map = player_goals_raw.get(pid, {})
a_map = player_assists_raw.get(pid, {})
cum_g, cum_a = 0, 0
goals_pf, assists_pf = [], []
for mst in starts:
cum_g += g_map.get(mst, 0)
cum_a += a_map.get(mst, 0)
goals_pf.append(cum_g)
assists_pf.append(cum_a)
player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf}
# Free raw dicts
del player_goals_raw, player_assists_raw, player_starts_raw
print(f" 📊 Player careers built: {len(player_career)} players", flush=True)
# ─── NEW: Team Lineup History (for squad continuity) ───
# 7) Per-team sorted lineups: [(mst, frozenset(player_ids))]
team_lineup_map = defaultdict(list)
for (mid, tid), pids in starting_players.items():
mst = match_mst.get(mid, 0)
if mst > 0 and pids:
team_lineup_map[tid].append((mst, frozenset(pids)))
team_lineup_history = {}
team_lineup_msts = {}
for tid, ll in team_lineup_map.items():
ll.sort(key=lambda x: x[0])
team_lineup_history[tid] = ll
team_lineup_msts[tid] = [x[0] for x in ll]
del team_lineup_map
# ─── 8) Build combined cache — NO DATA LEAKAGE ───
all_keys = set(participation.keys()) | set(events.keys()) all_keys = set(participation.keys()) | set(events.keys())
for key in all_keys: for key in all_keys:
mid, tid = key mid, tid = key
@@ -533,78 +443,30 @@ class BatchDataLoader:
kp_total = len(key_players_by_team.get(tid, set())) kp_total = len(key_players_by_team.get(tid, set()))
kp_missing = max(0, kp_total - kp_in_starting) kp_missing = max(0, kp_total - kp_in_starting)
# Squad quality: composite score — ONLY pre-match info # Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!)
squad_quality = ( squad_quality = (
part['starting_count'] * 0.3 + part['starting_count'] * 0.3 +
kp_in_starting * 3.0 + kp_in_starting * 3.0 +
part['fwd_count'] * 1.5 part['fwd_count'] * 1.5
) )
# Missing impact: how many key players are missing
missing_impact = min(kp_missing / max(kp_total, 1), 1.0) missing_impact = min(kp_missing / max(kp_total, 1), 1.0)
# goals_form: avg goals from last 5 matches BEFORE this match # goals_form: avg goals from last 5 matches BEFORE this match
current_mst = match_mst.get(mid, 0) current_mst = match_mst.get(mid, 0)
team_history = self.team_matches.get(tid, []) team_history = self.team_matches.get(tid, [])
recent_goals = [ recent_goals = [
tm[2] for tm in team_history if tm[0] < current_mst tm[2] # team_score
][-5:] for tm in team_history
if tm[0] < current_mst # only matches BEFORE this one
][-5:] # last 5
goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3 goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3
# ─── NEW: Player-level aggregation for starting XI ───
lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0
best_scorer_total, best_scorer_id = 0, None
scorers_in_lineup = 0
for pid in starters:
pc = player_career.get(pid)
if not pc:
continue
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx == 0:
continue # no prior matches for this player
prior_starts = idx
prior_goals = pc['gp'][idx - 1]
prior_assists = pc['ap'][idx - 1]
lineup_g90 += prior_goals / prior_starts
lineup_a90 += prior_assists / prior_starts
total_exp += prior_starts
if prior_goals > 0:
scorers_in_lineup += 1
if prior_goals > best_scorer_total:
best_scorer_total = prior_goals
best_scorer_id = pid
n_st = len(starters) or 1
# Top scorer recent form (goals in last 5 starts)
top_scorer_form = 0
if best_scorer_id:
pc = player_career.get(best_scorer_id)
if pc:
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx > 0:
s5 = max(0, idx - 5)
top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0)
# Squad continuity (overlap with previous match lineup)
squad_continuity = 0.5
msts_list = team_lineup_msts.get(tid)
if msts_list:
li = bisect.bisect_left(msts_list, current_mst)
if li > 0:
prev_lineup = team_lineup_history[tid][li - 1][1]
squad_continuity = len(frozenset(starters) & prev_lineup) / n_st
self.squad_cache[key] = { self.squad_cache[key] = {
'squad_quality': squad_quality, 'squad_quality': squad_quality,
'key_players': kp_in_starting, 'key_players': kp_in_starting,
'missing_impact': missing_impact, 'missing_impact': missing_impact,
'goals_form': round(goals_form, 2), 'goals_form': round(goals_form, 2),
'lineup_goals_per90': round(lineup_g90, 3),
'lineup_assists_per90': round(lineup_a90, 3),
'squad_continuity': round(squad_continuity, 3),
'top_scorer_form': top_scorer_form,
'avg_player_exp': round(total_exp / n_st, 1),
'goals_diversity': round(scorers_in_lineup / n_st, 3),
} }
def _load_cards_data(self): def _load_cards_data(self):
@@ -993,20 +855,6 @@ class FeatureExtractor:
"away_missing_impact": away_missing_impact, "away_missing_impact": away_missing_impact,
"home_goals_form": home_goals_form, "home_goals_form": home_goals_form,
"away_goals_form": away_goals_form, "away_goals_form": away_goals_form,
# Player-Level Features
"home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0),
"away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0),
"home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0),
"away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0),
"home_squad_continuity": home_sq.get('squad_continuity', 0.5),
"away_squad_continuity": away_sq.get('squad_continuity', 0.5),
"home_top_scorer_form": home_sq.get('top_scorer_form', 0),
"away_top_scorer_form": away_sq.get('top_scorer_form', 0),
"home_avg_player_exp": home_sq.get('avg_player_exp', 0.0),
"away_avg_player_exp": away_sq.get('avg_player_exp', 0.0),
"home_goals_diversity": home_sq.get('goals_diversity', 0.0),
"away_goals_diversity": away_sq.get('goals_diversity', 0.0),
# Labels # Labels
"score_home": sh, "score_home": sh,
+22 -68
View File
@@ -23,7 +23,7 @@ import optuna
from optuna.samplers import TPESampler from optuna.samplers import TPESampler
from datetime import datetime from datetime import datetime
from sklearn.metrics import accuracy_score, log_loss, classification_report from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.isotonic import IsotonicRegression from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
optuna.logging.set_verbosity(optuna.logging.WARNING) optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -38,7 +38,7 @@ REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
os.makedirs(MODELS_DIR, exist_ok=True) os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True) os.makedirs(REPORTS_DIR, exist_ok=True)
# ─── Feature Columns (95 features, NO target leakage) ─────────────── # ─── Feature Columns (83 features, NO target leakage) ───────────────
FEATURES = [ FEATURES = [
# ELO (8) # ELO (8)
"home_overall_elo", "away_overall_elo", "elo_diff", "home_overall_elo", "away_overall_elo", "elo_diff",
@@ -94,13 +94,6 @@ FEATURES = [
"home_key_players", "away_key_players", "home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact", "home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form", "home_goals_form", "away_goals_form",
# Player-Level Features (12)
"home_lineup_goals_per90", "away_lineup_goals_per90",
"home_lineup_assists_per90", "away_lineup_assists_per90",
"home_squad_continuity", "away_squad_continuity",
"home_top_scorer_form", "away_top_scorer_form",
"home_avg_player_exp", "away_avg_player_exp",
"home_goals_diversity", "away_goals_diversity",
] ]
MARKET_CONFIGS = [ MARKET_CONFIGS = [
@@ -356,34 +349,18 @@ def train_market(df, target_col, market_name, num_class, n_trials):
print(f"[OK] LGB final: iter={lgb_model.best_iteration}") print(f"[OK] LGB final: iter={lgb_model.best_iteration}")
# ── Phase 4: Isotonic Calibration on cal set ───────────────── # ── Phase 4: Isotonic Calibration on cal set ─────────────────
print("[CAL] Fitting Isotonic Regression (per-class)...") print("[CAL] Fitting Isotonic Regression...")
# XGB calibration — manual IsotonicRegression per class # XGB calibration
dcal = xgb.DMatrix(X_cal) xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration)
xgb_cal_raw = xgb_model.predict(dcal) xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
if len(xgb_cal_raw.shape) == 1: xgb_wrapper.fit(X_train, y_train)
xgb_cal_raw = np.column_stack([1 - xgb_cal_raw, xgb_cal_raw]) xgb_calibrated.fit(X_cal, y_cal)
xgb_iso_calibrators = [] # LGB calibration — use raw predictions approach
for cls_idx in range(num_class): lgb_cal_preds = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
ir = IsotonicRegression(out_of_bounds="clip") if len(lgb_cal_preds.shape) == 1:
y_binary = (y_cal == cls_idx).astype(float) lgb_cal_preds = np.column_stack([1 - lgb_cal_preds, lgb_cal_preds])
ir.fit(xgb_cal_raw[:, cls_idx], y_binary)
xgb_iso_calibrators.append(ir)
print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes")
# LGB calibration — manual IsotonicRegression per class
lgb_cal_raw = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
if len(lgb_cal_raw.shape) == 1:
lgb_cal_raw = np.column_stack([1 - lgb_cal_raw, lgb_cal_raw])
lgb_iso_calibrators = []
for cls_idx in range(num_class):
ir = IsotonicRegression(out_of_bounds="clip")
y_binary = (y_cal == cls_idx).astype(float)
ir.fit(lgb_cal_raw[:, cls_idx], y_binary)
lgb_iso_calibrators.append(ir)
print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes")
# ── Phase 5: Evaluate on test set ──────────────────────────── # ── Phase 5: Evaluate on test set ────────────────────────────
print("\n[EVAL] Test set evaluation...") print("\n[EVAL] Test set evaluation...")
@@ -394,26 +371,16 @@ def train_market(df, target_col, market_name, num_class, n_trials):
if len(xgb_raw_probs.shape) == 1: if len(xgb_raw_probs.shape) == 1:
xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs]) xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs])
# Calibrated XGB — apply isotonic per class + renormalize # Calibrated XGB
xgb_cal_probs = np.column_stack([ xgb_cal_probs = xgb_calibrated.predict_proba(X_test)
xgb_iso_calibrators[i].predict(xgb_raw_probs[:, i]) for i in range(num_class)
])
xgb_cal_probs = xgb_cal_probs / xgb_cal_probs.sum(axis=1, keepdims=True)
# Raw LGB # Raw LGB
lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration) lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
if len(lgb_raw_probs.shape) == 1: if len(lgb_raw_probs.shape) == 1:
lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs]) lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs])
# Calibrated LGB — apply isotonic per class + renormalize # Ensemble (raw)
lgb_cal_probs = np.column_stack([
lgb_iso_calibrators[i].predict(lgb_raw_probs[:, i]) for i in range(num_class)
])
lgb_cal_probs = lgb_cal_probs / lgb_cal_probs.sum(axis=1, keepdims=True)
# Ensembles
raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2 raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2
def _eval(probs, label): def _eval(probs, label):
preds = np.argmax(probs, axis=1) preds = np.argmax(probs, axis=1)
@@ -425,9 +392,7 @@ def train_market(df, target_col, market_name, num_class, n_trials):
m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw") m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated") m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw") m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated")
m_ensemble = _eval(raw_ensemble, "Ensemble Raw") m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated")
# Classification report for ensemble # Classification report for ensemble
ens_preds = np.argmax(raw_ensemble, axis=1) ens_preds = np.argmax(raw_ensemble, axis=1)
@@ -444,16 +409,11 @@ def train_market(df, target_col, market_name, num_class, n_trials):
lgb_model.save_model(lgb_path) lgb_model.save_model(lgb_path)
print(f"[SAVE] {lgb_path}") print(f"[SAVE] {lgb_path}")
# Isotonic calibrators (XGB + LGB) # Calibrated model
xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_name.lower()}.pkl") cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl")
with open(xgb_cal_path, "wb") as f: with open(cal_path, "wb") as f:
pickle.dump(xgb_iso_calibrators, f) pickle.dump(xgb_calibrated, f)
print(f"[SAVE] {xgb_cal_path}") print(f"[SAVE] {cal_path}")
lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_name.lower()}.pkl")
with open(lgb_cal_path, "wb") as f:
pickle.dump(lgb_iso_calibrators, f)
print(f"[SAVE] {lgb_cal_path}")
return { return {
"market": market_name, "market": market_name,
@@ -472,9 +432,7 @@ def train_market(df, target_col, market_name, num_class, n_trials):
"test_xgb_raw": m_xgb_raw, "test_xgb_raw": m_xgb_raw,
"test_xgb_calibrated": m_xgb_cal, "test_xgb_calibrated": m_xgb_cal,
"test_lgb_raw": m_lgb_raw, "test_lgb_raw": m_lgb_raw,
"test_lgb_calibrated": m_lgb_cal,
"test_ensemble_raw": m_ensemble, "test_ensemble_raw": m_ensemble,
"test_ensemble_calibrated": m_cal_ensemble,
} }
@@ -537,12 +495,8 @@ def main():
print("[SUMMARY]") print("[SUMMARY]")
print("=" * 60) print("=" * 60)
for name, m in all_metrics["markets"].items(): for name, m in all_metrics["markets"].items():
ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {})) ens = m.get("test_ensemble_raw", {})
acc = ens.get('accuracy', '?') print(f" {name:12s} | Acc={ens.get('accuracy','?'):>6s} | LL={ens.get('logloss','?'):>6s} | "
ll = ens.get('logloss', '?')
acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc)
ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll)
print(f" {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | "
f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}") f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")
print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")