""" XGBoost Market Model Trainer ============================ Trains specialized XGBoost models for each betting market. Includes 'Surprise Hunter' logic for HT/FT reversals (1/2, 2/1). Models: 1. MS (1X2) - Multi-class 2. Over/Under 2.5 - Binary 3. BTTS - Binary 4. HT/FT - Multi-class (Imbalanced learning for 1/2, 2/1) 5. Other line variants (1.5, 3.5, etc.) Usage: python3 scripts/train_xgboost_markets.py """ import os import sys import json import pickle import numpy as np import pandas as pd import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, log_loss, classification_report, roc_auc_score from sklearn.preprocessing import LabelEncoder # Config AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost") os.makedirs(MODELS_DIR, exist_ok=True) # Feature Columns (Must match extraction + inference) FEATURES = [ # ELO "home_overall_elo", "away_overall_elo", "elo_diff", "home_home_elo", "away_away_elo", "form_elo_diff", # Form "home_goals_avg", "home_conceded_avg", "away_goals_avg", "away_conceded_avg", "home_clean_sheet_rate", "away_clean_sheet_rate", "home_scoring_rate", "away_scoring_rate", "home_winning_streak", "away_winning_streak", # H2H "h2h_home_win_rate", "h2h_draw_rate", "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate", # Stats "home_avg_possession", "away_avg_possession", "home_avg_shots_on_target", "away_avg_shots_on_target", "home_shot_conversion", "away_shot_conversion", # Odds (Implicit market wisdom) "odds_ms_h", "odds_ms_d", "odds_ms_a", "implied_home", "implied_draw", "implied_away", "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", "odds_ou05_o", "odds_ou05_u", "odds_ou15_o", "odds_ou15_u", "odds_ou25_o", "odds_ou25_u", "odds_ou35_o", "odds_ou35_u", "odds_ht_ou05_o", "odds_ht_ou05_u", "odds_ht_ou15_o", "odds_ht_ou15_u", "odds_btts_y", "odds_btts_n", # League/Context "league_avg_goals", "league_zero_goal_rate", "home_xga", "away_xga", # Upset Engine "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential", # Referee Engine "referee_home_bias", "referee_avg_goals", "referee_cards_total", "referee_avg_yellow", "referee_experience", # Momentum Engine "home_momentum_score", "away_momentum_score", "momentum_diff", ] def load_data(): if not os.path.exists(DATA_PATH): print(f"āŒ Data file not found: {DATA_PATH}") sys.exit(1) print(f"šŸ“¦ Loading data from {DATA_PATH}...") df = pd.read_csv(DATA_PATH) # Handle missing values - simple imputation for robustness df.fillna(0, inplace=True) print(f" Shape: {df.shape}") return df def train_model(df, target_col, model_name, objective, metric, num_class=None, class_weights=None): """ Generic trainer for XGBoost models. Supports binary and multi-class. Supports sample weighting for imbalanced classes (like 1/2 reversals). """ print(f"\nšŸš€ Training {model_name} (Target: {target_col})...") # Filter valid rows for this target valid_df = df[df[target_col].notna()].copy() if valid_df.empty: print(f" āš ļø No valid data for {target_col}, skipping.") return X = valid_df[FEATURES] y = valid_df[target_col].astype(int) # Split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) # Sample Weights (For HT/FT Surprise) sample_weights__train = None if class_weights: print(" āš–ļø Applying class weights for surprise detection...") sample_weights__train = y_train.map(class_weights).fillna(1.0) # Model Params params = { 'objective': objective, 'eval_metric': metric, 'eta': 0.05, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'nthread': 4, 'seed': 42 } if num_class: params['num_class'] = num_class # Train using Scikit-Learn Wrapper so we can pickle it cleanly for v20_ensemble if objective == "multi:softprob": model = xgb.XGBClassifier(**params, n_estimators=1000, early_stopping_rounds=50) else: model = xgb.XGBClassifier(**params, n_estimators=1000, early_stopping_rounds=50) # Fit with early stopping model.fit( X_train, y_train, sample_weight=sample_weights__train, eval_set=[(X_test, y_test)], verbose=False ) # Evaluation preds = model.predict_proba(X_test) if objective == "multi:softprob": y_pred_class = np.argmax(preds, axis=1) acc = accuracy_score(y_test, y_pred_class) loss = log_loss(y_test, preds) print(f" āœ… Accuracy: {acc:.4f} | LogLoss: {loss:.4f}") # Detailed report for important classes print(classification_report(y_test, y_pred_class)) else: # Binary # Extract the probability for class 1 class_1_preds = preds[:, 1] y_pred_class = (class_1_preds > 0.5).astype(int) acc = accuracy_score(y_test, y_pred_class) auc = roc_auc_score(y_test, class_1_preds) print(f" āœ… Accuracy: {acc:.4f} | AUC: {auc:.4f}") # Save raw json booster model_json_path = os.path.join(MODELS_DIR, f"{model_name}.json") model.get_booster().save_model(model_json_path) # Save sklearn wrapped PKL (What v20_ensemble actually loads for Uncalibrated models like ht_ft!) import pickle model_pkl_path = os.path.join(MODELS_DIR, f"{model_name}.pkl") with open(model_pkl_path, "wb") as f: pickle.dump(model, f) print(f" šŸ’¾ Model saved to {model_json_path} and {model_pkl_path}") def main(): df = load_data() # 1. Match Result (1X2) train_model( df, "label_ms", "xgb_ms", objective="multi:softprob", metric="mlogloss", num_class=3 ) # 2. Over/Under 2.5 train_model( df, "label_ou25", "xgb_ou25", objective="binary:logistic", metric="logloss" ) # 3. BTTS train_model( df, "label_btts", "xgb_btts", objective="binary:logistic", metric="logloss" ) # 4. HT/FT SURPRISE HUNTER # Classes: 0=1/1, 1=1/X, 2=1/2(HOME->AWAY), 3=X/1 ... 6=2/1(AWAY->HOME) ... # We give HUGE weight to 2 (1/2) and 6 (2/1) htft_weights = { 0: 1.0, 1: 3.0, 2: 15.0, # 1/1, 1/X, 1/2 (Reversal!) 3: 2.0, 4: 2.0, 5: 2.0, # X/1, X/X, X/2 6: 15.0, 7: 3.0, 8: 1.0 # 2/1 (Reversal!), 2/X, 2/2 } train_model( df, "label_ht_ft", "xgb_ht_ft", objective="multi:softprob", metric="mlogloss", num_class=9, class_weights=htft_weights ) # 5. Over/Under 1.5 & 3.5 (Optional utility models) train_model(df, "label_ou15", "xgb_ou15", objective="binary:logistic", metric="logloss") train_model(df, "label_ou35", "xgb_ou35", objective="binary:logistic", metric="logloss") # 6. Half-Time 1X2 train_model(df, "label_ht_result", "xgb_ht_result", objective="multi:softprob", metric="mlogloss", num_class=3) # 7. Half-Time Over/Under train_model(df, "label_ht_ou05", "xgb_ht_ou05", objective="binary:logistic", metric="logloss") train_model(df, "label_ht_ou15", "xgb_ht_ou15", objective="binary:logistic", metric="logloss") # 8. Handicap MS and Cards train_model(df, "label_handicap_ms", "xgb_handicap_ms", objective="multi:softprob", metric="mlogloss", num_class=3) train_model(df, "label_cards_ou45", "xgb_cards_ou45", objective="binary:logistic", metric="logloss") print("\nāœ… All models trained successfully!") if __name__ == "__main__": main()