This commit is contained in:
2026-05-10 22:52:05 +03:00
parent c525b12dfd
commit f3362f266c
3 changed files with 922 additions and 32 deletions
+68 -22
View File
@@ -23,7 +23,7 @@ import optuna
from optuna.samplers import TPESampler
from datetime import datetime
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression
from sklearn.base import BaseEstimator, ClassifierMixin
optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -38,7 +38,7 @@ REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
# ─── Feature Columns (83 features, NO target leakage) ───────────────
# ─── Feature Columns (95 features, NO target leakage) ───────────────
FEATURES = [
# ELO (8)
"home_overall_elo", "away_overall_elo", "elo_diff",
@@ -94,6 +94,13 @@ FEATURES = [
"home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form",
# Player-Level Features (12)
"home_lineup_goals_per90", "away_lineup_goals_per90",
"home_lineup_assists_per90", "away_lineup_assists_per90",
"home_squad_continuity", "away_squad_continuity",
"home_top_scorer_form", "away_top_scorer_form",
"home_avg_player_exp", "away_avg_player_exp",
"home_goals_diversity", "away_goals_diversity",
]
MARKET_CONFIGS = [
@@ -349,18 +356,34 @@ def train_market(df, target_col, market_name, num_class, n_trials):
print(f"[OK] LGB final: iter={lgb_model.best_iteration}")
# ── Phase 4: Isotonic Calibration on cal set ─────────────────
print("[CAL] Fitting Isotonic Regression...")
print("[CAL] Fitting Isotonic Regression (per-class)...")
# XGB calibration
xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration)
xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
xgb_wrapper.fit(X_train, y_train)
xgb_calibrated.fit(X_cal, y_cal)
# XGB calibration — manual IsotonicRegression per class
dcal = xgb.DMatrix(X_cal)
xgb_cal_raw = xgb_model.predict(dcal)
if len(xgb_cal_raw.shape) == 1:
xgb_cal_raw = np.column_stack([1 - xgb_cal_raw, xgb_cal_raw])
# LGB calibration — use raw predictions approach
lgb_cal_preds = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
if len(lgb_cal_preds.shape) == 1:
lgb_cal_preds = np.column_stack([1 - lgb_cal_preds, lgb_cal_preds])
xgb_iso_calibrators = []
for cls_idx in range(num_class):
ir = IsotonicRegression(out_of_bounds="clip")
y_binary = (y_cal == cls_idx).astype(float)
ir.fit(xgb_cal_raw[:, cls_idx], y_binary)
xgb_iso_calibrators.append(ir)
print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes")
# LGB calibration — manual IsotonicRegression per class
lgb_cal_raw = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
if len(lgb_cal_raw.shape) == 1:
lgb_cal_raw = np.column_stack([1 - lgb_cal_raw, lgb_cal_raw])
lgb_iso_calibrators = []
for cls_idx in range(num_class):
ir = IsotonicRegression(out_of_bounds="clip")
y_binary = (y_cal == cls_idx).astype(float)
ir.fit(lgb_cal_raw[:, cls_idx], y_binary)
lgb_iso_calibrators.append(ir)
print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes")
# ── Phase 5: Evaluate on test set ────────────────────────────
print("\n[EVAL] Test set evaluation...")
@@ -371,16 +394,26 @@ def train_market(df, target_col, market_name, num_class, n_trials):
if len(xgb_raw_probs.shape) == 1:
xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs])
# Calibrated XGB
xgb_cal_probs = xgb_calibrated.predict_proba(X_test)
# Calibrated XGB — apply isotonic per class + renormalize.
# Each class column of the raw XGB probabilities goes through its own fitted
# isotonic calibrator; the calibrated columns no longer sum to 1, so every
# row is rescaled by its sum.
xgb_cal_probs = np.column_stack([
    xgb_iso_calibrators[i].predict(xgb_raw_probs[:, i]) for i in range(num_class)
])
xgb_cal_sums = xgb_cal_probs.sum(axis=1, keepdims=True)
# Independent per-class calibrators can all return 0 for the same row; a plain
# division would then yield 0/0 = NaN for that row and poison every metric
# computed from these probabilities. Such rows fall back to the raw XGB
# probabilities; all other rows are renormalized exactly as before.
xgb_cal_probs = np.where(
    xgb_cal_sums > 0,
    xgb_cal_probs / np.where(xgb_cal_sums > 0, xgb_cal_sums, 1.0),
    xgb_raw_probs,
)
# Raw LGB
lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
if len(lgb_raw_probs.shape) == 1:
lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs])
# Ensemble (raw)
# Calibrated LGB — apply isotonic per class + renormalize.
# Each class column of the raw LGB probabilities goes through its own fitted
# isotonic calibrator; the calibrated columns no longer sum to 1, so every
# row is rescaled by its sum.
lgb_cal_probs = np.column_stack([
    lgb_iso_calibrators[i].predict(lgb_raw_probs[:, i]) for i in range(num_class)
])
lgb_cal_sums = lgb_cal_probs.sum(axis=1, keepdims=True)
# Independent per-class calibrators can all return 0 for the same row; a plain
# division would then yield 0/0 = NaN for that row and poison every metric
# computed from these probabilities. Such rows fall back to the raw LGB
# probabilities; all other rows are renormalized exactly as before.
lgb_cal_probs = np.where(
    lgb_cal_sums > 0,
    lgb_cal_probs / np.where(lgb_cal_sums > 0, lgb_cal_sums, 1.0),
    lgb_raw_probs,
)
# Ensembles
raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2
def _eval(probs, label):
preds = np.argmax(probs, axis=1)
@@ -392,7 +425,9 @@ def train_market(df, target_col, market_name, num_class, n_trials):
m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated")
m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated")
# Classification report for ensemble
ens_preds = np.argmax(raw_ensemble, axis=1)
@@ -409,11 +444,16 @@ def train_market(df, target_col, market_name, num_class, n_trials):
lgb_model.save_model(lgb_path)
print(f"[SAVE] {lgb_path}")
# Calibrated model
cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl")
with open(cal_path, "wb") as f:
pickle.dump(xgb_calibrated, f)
print(f"[SAVE] {cal_path}")
# Isotonic calibrators (XGB + LGB)
xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_name.lower()}.pkl")
with open(xgb_cal_path, "wb") as f:
pickle.dump(xgb_iso_calibrators, f)
print(f"[SAVE] {xgb_cal_path}")
lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_name.lower()}.pkl")
with open(lgb_cal_path, "wb") as f:
pickle.dump(lgb_iso_calibrators, f)
print(f"[SAVE] {lgb_cal_path}")
return {
"market": market_name,
@@ -432,7 +472,9 @@ def train_market(df, target_col, market_name, num_class, n_trials):
"test_xgb_raw": m_xgb_raw,
"test_xgb_calibrated": m_xgb_cal,
"test_lgb_raw": m_lgb_raw,
"test_lgb_calibrated": m_lgb_cal,
"test_ensemble_raw": m_ensemble,
"test_ensemble_calibrated": m_cal_ensemble,
}
@@ -495,8 +537,12 @@ def main():
print("[SUMMARY]")
print("=" * 60)
for name, m in all_metrics["markets"].items():
ens = m.get("test_ensemble_raw", {})
print(f" {name:12s} | Acc={ens.get('accuracy','?'):>6s} | LL={ens.get('logloss','?'):>6s} | "
ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {}))
acc = ens.get('accuracy', '?')
ll = ens.get('logloss', '?')
acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc)
ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll)
print(f" {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | "
f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")
print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")