"""
V25 Pro Model Trainer — Optuna + Isotonic Calibration
=====================================================
Combines the V25 feature set (see ``FEATURES``) + 12 markets + temporal split
with Optuna hyperparameter tuning and Isotonic Regression calibration.

Usage:
    python scripts/train_v25_pro.py
    python scripts/train_v25_pro.py --markets MS,OU25,BTTS  # specific markets
    python scripts/train_v25_pro.py --trials 30              # fewer trials
"""

import os
import sys
import json
import pickle
import argparse
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.isotonic import IsotonicRegression
from sklearn.base import BaseEstimator, ClassifierMixin

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Directory two levels above this file; also put on sys.path at import time.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)

DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Suffix that marks an "odds present" indicator column in FEATURES.
_PRESENCE_SUFFIX = "_present"

# ─── Feature Columns (NO target leakage) ────────────────────────────
# The per-group numbers below are informational; len(FEATURES) is the
# authoritative feature count.
FEATURES = [
    # ELO (8)
    "home_overall_elo", "away_overall_elo", "elo_diff",
    "home_home_elo", "away_away_elo",
    "home_form_elo", "away_form_elo", "form_elo_diff",
    # Form (12)
    "home_goals_avg", "home_conceded_avg",
    "away_goals_avg", "away_conceded_avg",
    "home_clean_sheet_rate", "away_clean_sheet_rate",
    "home_scoring_rate", "away_scoring_rate",
    "home_winning_streak", "away_winning_streak",
    "home_unbeaten_streak", "away_unbeaten_streak",
    # H2H (6)
    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
    # Team Stats (8)
    "home_avg_possession", "away_avg_possession",
    "home_avg_shots_on_target", "away_avg_shots_on_target",
    "home_shot_conversion", "away_shot_conversion",
    "home_avg_corners", "away_avg_corners",
    # Odds (23 + 20 presence flags)
    "odds_ms_h", "odds_ms_d", "odds_ms_a",
    "implied_home", "implied_draw", "implied_away",
    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
    "odds_ou05_o", "odds_ou05_u",
    "odds_ou15_o", "odds_ou15_u",
    "odds_ou25_o", "odds_ou25_u",
    "odds_ou35_o", "odds_ou35_u",
    "odds_ht_ou05_o", "odds_ht_ou05_u",
    "odds_ht_ou15_o", "odds_ht_ou15_u",
    "odds_btts_y", "odds_btts_n",
    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
    "odds_ou05_o_present", "odds_ou05_u_present",
    "odds_ou15_o_present", "odds_ou15_u_present",
    "odds_ou25_o_present", "odds_ou25_u_present",
    "odds_ou35_o_present", "odds_ou35_u_present",
    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
    "odds_btts_y_present", "odds_btts_n_present",
    # League (4)
    "home_xga", "away_xga",
    "league_avg_goals", "league_zero_goal_rate",
    # Upset Engine (4)
    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
    # Referee Engine (5)
    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
    "referee_avg_yellow", "referee_experience",
    # Momentum (3)
    "home_momentum_score", "away_momentum_score", "momentum_diff",
    # Squad (9)
    "home_squad_quality", "away_squad_quality", "squad_diff",
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
    # Player-Level Features (12)
    "home_lineup_goals_per90", "away_lineup_goals_per90",
    "home_lineup_assists_per90", "away_lineup_assists_per90",
    "home_squad_continuity", "away_squad_continuity",
    "home_top_scorer_form", "away_top_scorer_form",
    "home_avg_player_exp", "away_avg_player_exp",
    "home_goals_diversity", "away_goals_diversity",
]

# One entry per market: label column in the CSV, short market name (used in
# file names and --markets), and the number of classes the label takes.
MARKET_CONFIGS = [
    {"target": "label_ms", "name": "MS", "num_class": 3},
    {"target": "label_ou15", "name": "OU15", "num_class": 2},
    {"target": "label_ou25", "name": "OU25", "num_class": 2},
    {"target": "label_ou35", "name": "OU35", "num_class": 2},
    {"target": "label_btts", "name": "BTTS", "num_class": 2},
    {"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
    {"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
    {"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
    {"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
    {"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
    {"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
    {"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
]


def load_data():
    """Load the training CSV and prepare feature columns.

    Steps:
      * exit with status 1 if ``DATA_PATH`` does not exist;
      * fill NaNs with 0 in every ``FEATURES`` column present in the CSV;
      * for every ``*_present`` flag in ``FEATURES`` that the CSV lacks
        (older CSVs), derive it as ``1.0`` where the matching odds column is
        numerically greater than 1.01 and ``0.0`` otherwise. If the odds
        column itself is absent the flag is all zeros.

    Returns:
        pandas.DataFrame: the loaded frame with the derived flag columns.
    """
    if not os.path.exists(DATA_PATH):
        print(f"[ERROR] Data not found: {DATA_PATH}")
        sys.exit(1)

    print(f"[INFO] Loading {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    for col in FEATURES:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Derive odds presence flags for older CSVs
    for flag_col in FEATURES:
        if not flag_col.endswith(_PRESENCE_SUFFIX) or flag_col in df.columns:
            continue
        odds_col = flag_col[: -len(_PRESENCE_SUFFIX)]
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # No odds column at all: the odds are absent for every row.
            # (Previously this path called .fillna on a scalar and crashed.)
            df[flag_col] = 0.0

    missing = [col for col in FEATURES if col not in df.columns]
    if missing:
        # train_market() silently drops these, so make the gap visible.
        print(f"[WARN] {len(missing)} feature column(s) missing from data: {missing}")

    print(f"[INFO] Shape: {df.shape}, Features: {len(FEATURES)}")
    return df


def temporal_split_4way(valid_df: pd.DataFrame):
    """Chronological 60/15/10/15 split: train/val/cal/test.

    Rows are sorted by the ``mst_utc`` column (which must exist) and cut at
    60 %, 75 % and 85 % of the row count.

    Returns:
        tuple: ``(train, val, cal, test)`` DataFrame copies, oldest first.
    """
    ordered = valid_df.sort_values("mst_utc").reset_index(drop=True)
    n = len(ordered)
    i1 = int(n * 0.60)
    i2 = int(n * 0.75)
    i3 = int(n * 0.85)

    train = ordered.iloc[:i1].copy()
    val = ordered.iloc[i1:i2].copy()
    cal = ordered.iloc[i2:i3].copy()
    test = ordered.iloc[i3:].copy()
    return train, val, cal, test


# ─── XGBoost Wrapper for sklearn CalibratedClassifierCV ─────────────
class XGBWrapper(BaseEstimator, ClassifierMixin):
    """Thin sklearn-compatible wrapper around xgb.train for Isotonic calibration.

    Not referenced by the rest of this file; kept for callers that need an
    estimator object exposing ``fit`` / ``predict_proba`` / ``predict``.
    """

    def __init__(self, params, num_boost_round=500):
        self.params = params
        self.num_boost_round = num_boost_round
        self.model_ = None
        self.classes_ = None

    def fit(self, X, y, **kwargs):
        """Train a booster on ``(X, y)``; extra keyword arguments are ignored."""
        self.classes_ = np.unique(y)
        dtrain = xgb.DMatrix(X, label=y)
        self.model_ = xgb.train(self.params, dtrain, num_boost_round=self.num_boost_round)
        return self

    def predict_proba(self, X):
        """Return an ``(n_samples, n_classes)`` probability matrix."""
        dm = xgb.DMatrix(X)
        probs = self.model_.predict(dm)
        if len(probs.shape) == 1:
            # Binary objective yields P(class 1) only; add the P(class 0) column.
            probs = np.column_stack([1 - probs, probs])
        return probs

    def predict(self, X):
        """Return the column index of the most probable class per row."""
        return np.argmax(self.predict_proba(X), axis=1)


# ─── Shared helpers ─────────────────────────────────────────────────
def _class_labels(num_class):
    """Return the label list 0..k-1 used for scoring (k is at least 2)."""
    return list(range(num_class if num_class > 2 else 2))


def _as_proba_matrix(preds):
    """Turn a 1-D vector of P(class 1) into two columns; pass 2-D input through."""
    preds = np.asarray(preds)
    if preds.ndim == 1:
        return np.column_stack([1 - preds, preds])
    return preds


def _fit_isotonic_calibrators(raw_probs, y_true, num_class):
    """Fit one one-vs-rest IsotonicRegression per class column of ``raw_probs``."""
    calibrators = []
    for cls_idx in range(num_class):
        ir = IsotonicRegression(out_of_bounds="clip")
        y_binary = (y_true == cls_idx).astype(float)
        ir.fit(raw_probs[:, cls_idx], y_binary)
        calibrators.append(ir)
    return calibrators


def _apply_isotonic_calibrators(calibrators, raw_probs):
    """Calibrate each class column, then renormalise rows to sum to 1.

    Rows whose calibrated values are all zero cannot be renormalised (0/0
    would give NaN and make ``log_loss`` fail), so they fall back to the raw
    probabilities.
    """
    calibrated = np.column_stack(
        [cal.predict(raw_probs[:, idx]) for idx, cal in enumerate(calibrators)]
    )
    row_sums = calibrated.sum(axis=1, keepdims=True)
    degenerate = row_sums[:, 0] <= 0
    if degenerate.any():
        calibrated[degenerate] = raw_probs[degenerate]
        row_sums = calibrated.sum(axis=1, keepdims=True)
    return calibrated / row_sums


# ─── Optuna Objectives ──────────────────────────────────────────────
def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective: train XGBoost with sampled params, return val log-loss.

    Trains up to 1000 rounds with early stopping (patience 50) on the
    validation set. ``num_class > 2`` selects the multiclass objective,
    anything else the binary one.
    """
    params = {
        "objective": "multi:softprob" if num_class > 2 else "binary:logistic",
        "eval_metric": "mlogloss" if num_class > 2 else "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "eta": trial.suggest_float("eta", 0.01, 0.15, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "n_jobs": 4,
        "random_state": 42,
    }
    if num_class > 2:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # NOTE(review): the native Booster.predict appears to use every boosted
    # round rather than best_iteration unless iteration_range is passed —
    # confirm whether scoring the full model is intended here.
    preds = _as_proba_matrix(model.predict(dval))
    # Explicit labels keep log_loss valid when the validation slice happens
    # to miss a class (likely for rare outcomes in many-class markets).
    return log_loss(y_val, preds, labels=_class_labels(num_class))


def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective: train LightGBM with sampled params, return val log-loss.

    Trains up to 1000 rounds with early stopping (patience 50) and scores the
    best iteration on the validation set.
    """
    params = {
        "objective": "multiclass" if num_class > 2 else "binary",
        "metric": "multi_logloss" if num_class > 2 else "binary_logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
    }
    if num_class > 2:
        params["num_class"] = num_class

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=["val"],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    preds = _as_proba_matrix(model.predict(X_val, num_iteration=model.best_iteration))
    # See xgb_objective: explicit labels guard against a missing class.
    return log_loss(y_val, preds, labels=_class_labels(num_class))


# ─── Main Training Pipeline ─────────────────────────────────────────
def train_market(df, target_col, market_name, num_class, n_trials):
    """Full pipeline for one market: Optuna → Train → Calibrate → Evaluate.

    Args:
        df: frame returned by ``load_data``.
        target_col: label column for this market.
        market_name: short market name, used in logs and output file names.
        num_class: number of classes the label takes.
        n_trials: Optuna trials per model.

    Returns:
        dict | None: metrics/metadata for the report, or ``None`` when fewer
        than 500 labelled rows are available.

    Side effects:
        Writes the XGBoost model (.json), LightGBM model (.txt) and two
        pickled lists of isotonic calibrators into ``MODELS_DIR``.
    """
    print(f"\n{'='*60}")
    print(f"[MARKET] {market_name} (classes={num_class})")
    print(f"{'='*60}")

    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")

    if len(valid_df) < 500:
        print(f"[SKIP] Not enough data for {market_name}")
        return None

    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)

    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_cal = cal_df[available_features].values
    X_test = test_df[available_features].values

    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_cal = cal_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values

    print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")

    # ── Phase 1: Optuna XGBoost ──────────────────────────────────
    print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
    xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    xgb_study.optimize(
        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    xgb_best = xgb_study.best_params
    print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")

    # ── Phase 2: Optuna LightGBM ─────────────────────────────────
    print(f"[OPTUNA] LightGBM tuning ({n_trials} trials)...")
    lgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    lgb_study.optimize(
        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    lgb_best = lgb_study.best_params
    print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")

    # ── Phase 3: Train final models with best params ─────────────
    # XGBoost final
    xgb_params = {
        "objective": "multi:softprob" if num_class > 2 else "binary:logistic",
        "eval_metric": "mlogloss" if num_class > 2 else "logloss",
        "n_jobs": 4,
        "random_state": 42,
        **xgb_best,
    }
    if num_class > 2:
        xgb_params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1500,
        # Early stopping follows the last entry in evals, i.e. "val".
        evals=[(dtrain, "train"), (dval, "val")],
        early_stopping_rounds=80,
        verbose_eval=200,
    )
    print(f"[OK] XGB final: iter={xgb_model.best_iteration}, score={xgb_model.best_score:.4f}")

    # LightGBM final
    lgb_params = {
        "objective": "multiclass" if num_class > 2 else "binary",
        "metric": "multi_logloss" if num_class > 2 else "binary_logloss",
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
        **lgb_best,
    }
    if num_class > 2:
        lgb_params["num_class"] = num_class

    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)

    lgb_model = lgb.train(
        lgb_params,
        lgb_train_data,
        num_boost_round=1500,
        valid_sets=[lgb_train_data, lgb_val_data],
        valid_names=["train", "val"],
        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(200)],
    )
    print(f"[OK] LGB final: iter={lgb_model.best_iteration}")

    # ── Phase 4: Isotonic Calibration on cal set ─────────────────
    print("[CAL] Fitting Isotonic Regression (per-class)...")

    # XGB calibration — manual IsotonicRegression per class
    # NOTE(review): as in xgb_objective, predict() is called without
    # iteration_range, so calibration, evaluation and the saved model all use
    # the full booster. Left unchanged to stay consistent with whatever loads
    # the saved model — confirm that this is intended.
    xgb_cal_raw = _as_proba_matrix(xgb_model.predict(xgb.DMatrix(X_cal)))
    xgb_iso_calibrators = _fit_isotonic_calibrators(xgb_cal_raw, y_cal, num_class)
    print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes")

    # LGB calibration — manual IsotonicRegression per class
    lgb_cal_raw = _as_proba_matrix(
        lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
    )
    lgb_iso_calibrators = _fit_isotonic_calibrators(lgb_cal_raw, y_cal, num_class)
    print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes")

    # ── Phase 5: Evaluate on test set ────────────────────────────
    print("\n[EVAL] Test set evaluation...")
    dtest = xgb.DMatrix(X_test)

    # Raw XGB
    xgb_raw_probs = _as_proba_matrix(xgb_model.predict(dtest))
    # Calibrated XGB — apply isotonic per class + renormalize
    xgb_cal_probs = _apply_isotonic_calibrators(xgb_iso_calibrators, xgb_raw_probs)

    # Raw LGB
    lgb_raw_probs = _as_proba_matrix(
        lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    )
    # Calibrated LGB — apply isotonic per class + renormalize
    lgb_cal_probs = _apply_isotonic_calibrators(lgb_iso_calibrators, lgb_raw_probs)

    # Ensembles
    raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
    cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2

    score_labels = _class_labels(num_class)

    def _eval(probs, label):
        """Print and return accuracy / log-loss of ``probs`` on the test set."""
        preds = np.argmax(probs, axis=1)
        acc = accuracy_score(y_test, preds)
        # labels= keeps this valid when the test slice lacks some class.
        ll = log_loss(y_test, probs, labels=score_labels)
        print(f" {label}: Acc={acc:.4f} LogLoss={ll:.4f}")
        return {"accuracy": round(float(acc), 4), "logloss": round(float(ll), 4)}

    m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
    m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
    m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
    m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated")
    m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
    m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated")

    # Classification report for ensemble
    ens_preds = np.argmax(raw_ensemble, axis=1)
    print("\n[REPORT] Ensemble Classification Report:")
    print(classification_report(y_test, ens_preds))

    # ── Phase 6: Save models ─────────────────────────────────────
    market_slug = market_name.lower()

    # Raw models (orchestrator compatible)
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_slug}.json")
    xgb_model.save_model(xgb_path)
    print(f"[SAVE] {xgb_path}")

    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_slug}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[SAVE] {lgb_path}")

    # Isotonic calibrators (XGB + LGB)
    # NOTE: these are pickles — only ever load them from a trusted location.
    xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_slug}.pkl")
    with open(xgb_cal_path, "wb") as f:
        pickle.dump(xgb_iso_calibrators, f)
    print(f"[SAVE] {xgb_cal_path}")

    lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_slug}.pkl")
    with open(lgb_cal_path, "wb") as f:
        pickle.dump(lgb_iso_calibrators, f)
    print(f"[SAVE] {lgb_cal_path}")

    return {
        "market": market_name,
        "samples": int(len(valid_df)),
        "train": int(len(X_train)),
        "val": int(len(X_val)),
        "cal": int(len(X_cal)),
        "test": int(len(X_test)),
        "features_used": len(available_features),
        "xgb_best_params": xgb_best,
        "lgb_best_params": lgb_best,
        "xgb_best_iteration": int(xgb_model.best_iteration),
        "lgb_best_iteration": int(lgb_model.best_iteration),
        "xgb_optuna_best_logloss": round(float(xgb_study.best_value), 4),
        "lgb_optuna_best_logloss": round(float(lgb_study.best_value), 4),
        "test_xgb_raw": m_xgb_raw,
        "test_xgb_calibrated": m_xgb_cal,
        "test_lgb_raw": m_lgb_raw,
        "test_lgb_calibrated": m_lgb_cal,
        "test_ensemble_raw": m_ensemble,
        "test_ensemble_calibrated": m_cal_ensemble,
    }


def main():
    """CLI entry point: train the selected markets and write the reports.

    Writes ``feature_cols.json`` into ``MODELS_DIR`` and
    ``v25_pro_metrics.json`` into ``REPORTS_DIR``, then prints a summary.
    """
    parser = argparse.ArgumentParser(description="V25 Pro Trainer")
    parser.add_argument("--markets", type=str, default=None,
                        help="Comma-separated market names (e.g., MS,OU25,BTTS)")
    parser.add_argument("--trials", type=int, default=50,
                        help="Optuna trials per model per market")
    args = parser.parse_args()

    print("=" * 60)
    print("V25 PRO — Optuna + Isotonic Calibration")
    print("=" * 60)
    print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"[INFO] Trials per model: {args.trials}")
    print(f"[INFO] Total features: {len(FEATURES)}")

    df = load_data()

    configs = MARKET_CONFIGS
    if args.markets:
        selected = [m.strip().upper() for m in args.markets.split(",") if m.strip()]
        known = {c["name"] for c in MARKET_CONFIGS}
        unknown = [m for m in selected if m not in known]
        if unknown:
            # Previously a typo silently trained nothing for that market.
            print(f"[WARN] Unknown market(s) ignored: {unknown}")
        configs = [c for c in configs if c["name"] in selected]
        print(f"[INFO] Selected markets: {[c['name'] for c in configs]}")

    all_metrics = {
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "trainer": "v25_pro",
        "optuna_trials": args.trials,
        "total_features": len(FEATURES),
        "markets": {},
    }

    for config in configs:
        target = config["target"]
        if target not in df.columns:
            print(f"[SKIP] {config['name']}: missing target {target}")
            continue

        metrics = train_market(
            df, target, config["name"], config["num_class"], args.trials,
        )
        if metrics:
            all_metrics["markets"][config["name"]] = metrics

    # Save feature list.
    # Record the columns the models were actually trained on (the same filter
    # train_market applies) so the list always matches the model input width.
    # Identical to FEATURES whenever the CSV provides every column.
    used_features = [f for f in FEATURES if f in df.columns]
    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, "w", encoding="utf-8") as f:
        json.dump(used_features, f, indent=2)

    # Save full report
    report_path = os.path.join(REPORTS_DIR, "v25_pro_metrics.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(all_metrics, f, indent=2, default=str)
    print(f"\n[SAVE] Report: {report_path}")

    # Summary
    print("\n" + "=" * 60)
    print("[SUMMARY]")
    print("=" * 60)
    for name, m in all_metrics["markets"].items():
        ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {}))
        acc = ens.get('accuracy', '?')
        ll = ens.get('logloss', '?')
        acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc)
        ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll)
        print(f" {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | "
              f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")

    print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("[OK] V25 PRO Training Complete!")


if __name__ == "__main__":
    main()