"""
V20 Pro Model Trainer
=====================
Advanced training pipeline for Suggest-Bet V20 Ensemble.

Features:
1. Optuna Hyperparameter Optimization
2. Stratified K-Fold Cross-Validation
3. Probability Calibration (Isotonic Regression)
4. Market-specific weight handling for reversals (1/2, 2/1)

Usage:
    python3 scripts/train_xgboost_pro.py
"""

import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss, classification_report
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
import matplotlib.pyplot as plt

# Config
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v20")

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Columns whose name starts with this prefix are training targets; their
# missing values must survive loading so MarketTrainer can drop those rows.
LABEL_PREFIX = "label_"

# Feature Columns (Must match extraction + inference)
FEATURES = [
    # ELO
    "home_overall_elo", "away_overall_elo", "elo_diff",
    "home_home_elo", "away_away_elo", "form_elo_diff",
    # Form
    "home_goals_avg", "home_conceded_avg",
    "away_goals_avg", "away_conceded_avg",
    "home_clean_sheet_rate", "away_clean_sheet_rate",
    "home_scoring_rate", "away_scoring_rate",
    "home_winning_streak", "away_winning_streak",
    # H2H
    "h2h_home_win_rate", "h2h_draw_rate",
    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
    # Stats
    "home_avg_possession", "away_avg_possession",
    "home_avg_shots_on_target", "away_avg_shots_on_target",
    "home_shot_conversion", "away_shot_conversion",
    # Odds (Implicit market wisdom)
    "odds_ms_h", "odds_ms_d", "odds_ms_a",
    "implied_home", "implied_draw", "implied_away",
    # League/Context
    "league_avg_goals", "league_zero_goal_rate",
    "home_xga", "away_xga"
]


def load_data():
    """Load the training CSV and zero-fill missing non-label values.

    Exits the process with status 1 when ``DATA_PATH`` does not exist.

    Returns:
        The loaded DataFrame. Columns starting with ``LABEL_PREFIX`` keep
        their NaNs so that ``MarketTrainer`` can exclude unlabeled rows.
    """
    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        sys.exit(1)

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Filling label columns with 0 would silently turn "label unknown" into
    # class 0 and defeat the notna() filter in MarketTrainer.__init__.
    fill_values = {
        col: 0 for col in df.columns if not str(col).startswith(LABEL_PREFIX)
    }
    df = df.fillna(value=fill_values)

    print(f" Shape: {df.shape}")
    return df


class MarketTrainer:
    """Tune, calibrate, evaluate and persist one XGBoost model for one market."""

    def __init__(self, df, target_col, market_name, is_multi=False, num_class=None, weights=None):
        """Prepare features, target and a stratified 15% hold-out split.

        Args:
            df: Frame holding every column in ``FEATURES`` plus ``target_col``.
            target_col: Name of the label column; rows where it is NaN are dropped.
            market_name: Used in log output and in the saved model's file name.
            is_multi: True for a multi-class objective, False for binary.
            num_class: Number of classes (only used when ``is_multi``).
            weights: Optional ``{class_label: weight}`` mapping; classes not
                listed get weight 1.0.
        """
        self.df = df[df[target_col].notna()].copy()
        self.target_col = target_col
        self.market_name = market_name
        self.is_multi = is_multi
        self.num_class = num_class
        self.weights = weights

        self.X = self.df[FEATURES]
        self.y = self.df[target_col].astype(int)

        # Split for final evaluation hold-out
        self.X_train, self.X_holdout, self.y_train, self.y_holdout = train_test_split(
            self.X, self.y, test_size=0.15, random_state=42, stratify=self.y
        )

    def _core_params(self):
        """Return the objective/metric settings shared by tuning and final fit."""
        core = {
            "objective": "multi:softprob" if self.is_multi else "binary:logistic",
            "eval_metric": "mlogloss" if self.is_multi else "logloss",
        }
        if self.is_multi:
            core["num_class"] = self.num_class
        return core

    def _sample_weights(self, y):
        """Map class labels in *y* to per-row weights, or None when unweighted."""
        if not self.weights:
            return None
        return y.map(self.weights).fillna(1.0)

    def optimize(self, n_trials=50):
        """Run an Optuna study minimizing CV log-loss and return the best params."""
        print(f"\n🔍 Tuning {self.market_name} with Optuna ({n_trials} trials)...")
        study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
        study.optimize(self.objective, n_trials=n_trials)

        print(f" Best params: {study.best_params}")
        print(f" Best Cross-Validation LogLoss: {study.best_value:.4f}")
        return study.best_params

    def objective(self, trial):
        """Optuna objective: mean 5-fold stratified CV log-loss for one trial."""
        params = {
            "verbosity": 0,
            **self._core_params(),
            "booster": "gbtree",
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 9),
            "eta": trial.suggest_float("eta", 1e-3, 0.1, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "early_stopping_rounds": 20,
            "n_jobs": 4,
            "random_state": 42
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        losses = []

        for train_idx, val_idx in skf.split(self.X_train, self.y_train):
            X_t, X_v = self.X_train.iloc[train_idx], self.X_train.iloc[val_idx]
            y_t, y_v = self.y_train.iloc[train_idx], self.y_train.iloc[val_idx]

            # Apply weights if available
            w_t = self._sample_weights(y_t)

            model = xgb.XGBClassifier(**params)
            model.fit(X_t, y_t, sample_weight=w_t, eval_set=[(X_v, y_v)], verbose=False)

            preds = model.predict_proba(X_v)
            # Pass the label set explicitly: a rare class can be missing from
            # a validation fold, and log_loss would otherwise reject the
            # mismatch between y_v's classes and the probability columns.
            losses.append(log_loss(y_v, preds, labels=model.classes_))

        return np.mean(losses)

    def train_final(self, best_params):
        """Fit an isotonic-calibrated model, report hold-out metrics, pickle it.

        Args:
            best_params: Hyperparameters as returned by ``optimize``. The
                dict is not modified.

        Returns:
            The fitted ``CalibratedClassifierCV``.
        """
        print(f"🚀 Training final calibrated {self.market_name} model...")

        # Work on a copy so the caller's dict is left untouched.
        final_params = dict(best_params)
        # Add core params
        final_params.update(self._core_params())

        base_model = xgb.XGBClassifier(**final_params)

        # Sample weights for training
        w_train = self._sample_weights(self.y_train)

        # Calibration using Cross-Validation
        # NOTE(review): sample_weight is handed to CalibratedClassifierCV.fit,
        # so the class weights presumably influence the calibrator as well as
        # the base model — confirm this is the intended calibration target.
        calibrated_model = CalibratedClassifierCV(base_model, method='isotonic', cv=5)
        calibrated_model.fit(self.X_train, self.y_train, sample_weight=w_train)

        # Evaluate on Hold-out
        holdout_preds_raw = calibrated_model.predict_proba(self.X_holdout)
        holdout_preds_class = calibrated_model.predict(self.X_holdout)

        acc = accuracy_score(self.y_holdout, holdout_preds_class)
        # Explicit labels keep log_loss valid when the hold-out lacks a class.
        loss = log_loss(self.y_holdout, holdout_preds_raw, labels=calibrated_model.classes_)

        print(f"📊 Hold-out Results for {self.market_name}:")
        print(f" Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
        print(classification_report(self.y_holdout, holdout_preds_class))

        # Save model
        model_path = os.path.join(MODELS_DIR, f"xgb_{self.market_name.lower()}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(calibrated_model, f)

        print(f"💾 Calibrated model saved to {model_path}")
        return calibrated_model


def main():
    """Train and save calibrated models for the MS, OU25, BTTS and HT/FT markets."""
    df = load_data()

    # 1. MS (1X2)
    ms_trainer = MarketTrainer(df, "label_ms", "MS", is_multi=True, num_class=3)
    ms_params = ms_trainer.optimize(n_trials=50)
    ms_trainer.train_final(ms_params)

    # 2. OU 2.5
    ou25_trainer = MarketTrainer(df, "label_ou25", "OU25")
    ou25_params = ou25_trainer.optimize(n_trials=30)
    ou25_trainer.train_final(ou25_params)

    # 3. BTTS
    btts_trainer = MarketTrainer(df, "label_btts", "BTTS")
    btts_params = btts_trainer.optimize(n_trials=30)
    btts_trainer.train_final(btts_params)

    # 4. HT/FT SURPRISE HUNTER
    htft_weights = {
        0: 1.0, 1: 3.0, 2: 20.0,  # 1/1, 1/X, 1/2 (MAX WEIGHT)
        3: 2.0, 4: 2.0, 5: 2.0,
        6: 20.0, 7: 3.0, 8: 1.0  # 2/1 (MAX WEIGHT)
    }
    htft_trainer = MarketTrainer(df, "label_ht_ft", "HT_FT", is_multi=True, num_class=9, weights=htft_weights)
    htft_params = htft_trainer.optimize(n_trials=50)
    htft_trainer.train_final(htft_params)

    print("\n✅ Advanced V20 Model Training Complete!")


if __name__ == "__main__":
    main()