# iddaai-be/ai-engine/scripts/train_xgboost_markets.py

"""
XGBoost Market Model Trainer
============================
Trains specialized XGBoost models for each betting market.
Includes 'Surprise Hunter' logic for HT/FT reversals (1/2, 2/1).

Models:
  1. MS (1X2) - Multi-class
  2. Over/Under 2.5 - Binary
  3. BTTS - Binary
  4. HT/FT - Multi-class (Imbalanced learning for 1/2, 2/1)
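  5. Over/Under 1.5 & 3.5 - Binary
  6. Half-Time 1X2 - Multi-class
  7. Half-Time Over/Under 0.5 & 1.5 - Binary
  8. Handicap MS - Multi-class; Cards (label_cards_ou45) - Binary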

Usage:
  python3 scripts/train_xgboost_markets.py
"""

import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Config
# All paths are anchored at the ai-engine root, i.e. the parent of the
# directory that contains this script.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
AI_ENGINE_DIR = os.path.dirname(_SCRIPTS_DIR)
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")

# Create the model output directory up front (no error if it already exists).
os.makedirs(MODELS_DIR, exist_ok=True)

# Feature Columns (Must match extraction + inference)
# The order of this list defines the column order of the training matrix,
# so entries must not be reordered.
FEATURES = [
    # ELO
    "home_overall_elo",
    "away_overall_elo",
    "elo_diff",
    "home_home_elo",
    "away_away_elo",
    "form_elo_diff",

    # Form
    "home_goals_avg",
    "home_conceded_avg",
    "away_goals_avg",
    "away_conceded_avg",
    "home_clean_sheet_rate",
    "away_clean_sheet_rate",
    "home_scoring_rate",
    "away_scoring_rate",
    "home_winning_streak",
    "away_winning_streak",

    # H2H
    "h2h_home_win_rate",
    "h2h_draw_rate",
    "h2h_avg_goals",
    "h2h_btts_rate",
    "h2h_over25_rate",

    # Stats
    "home_avg_possession",
    "away_avg_possession",
    "home_avg_shots_on_target",
    "away_avg_shots_on_target",
    "home_shot_conversion",
    "away_shot_conversion",

    # Odds (Implicit market wisdom): full-time 1X2
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
    "implied_home",
    "implied_draw",
    "implied_away",

    # Odds: half-time 1X2
    "odds_ht_ms_h",
    "odds_ht_ms_d",
    "odds_ht_ms_a",

    # Odds: full-time over/under lines
    "odds_ou05_o",
    "odds_ou05_u",
    "odds_ou15_o",
    "odds_ou15_u",
    "odds_ou25_o",
    "odds_ou25_u",
    "odds_ou35_o",
    "odds_ou35_u",

    # Odds: half-time over/under lines
    "odds_ht_ou05_o",
    "odds_ht_ou05_u",
    "odds_ht_ou15_o",
    "odds_ht_ou15_u",

    # Odds: both teams to score
    "odds_btts_y",
    "odds_btts_n",

    # League/Context
    "league_avg_goals",
    "league_zero_goal_rate",
    "home_xga",
    "away_xga",

    # Upset Engine
    "upset_atmosphere",
    "upset_motivation",
    "upset_fatigue",
    "upset_potential",

    # Referee Engine
    "referee_home_bias",
    "referee_avg_goals",
    "referee_cards_total",
    "referee_avg_yellow",
    "referee_experience",

    # Momentum Engine
    "home_momentum_score",
    "away_momentum_score",
    "momentum_diff",
]

def load_data():
    """Load the training CSV and zero-fill missing feature values.

    Reads ``DATA_PATH`` into a DataFrame. Only the columns listed in
    ``FEATURES`` are imputed with 0. Every other column, in particular the
    label columns, keeps its NaNs so that ``train_model`` can drop rows
    without a label via its ``notna()`` filter instead of training on them
    as class 0.

    Returns:
        pandas.DataFrame: The loaded data.

    Exits the process with status 1 if the data file does not exist.
    """
    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        sys.exit(1)

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Handle missing values - simple imputation for robustness.
    # Restricted to feature columns: a blanket fillna(0) would also turn
    # missing labels into a valid-looking class 0.
    feature_cols = [col for col in FEATURES if col in df.columns]
    if feature_cols:
        df[feature_cols] = df[feature_cols].fillna(0)

    print(f"   Shape: {df.shape}")
    return df

def train_model(df, target_col, model_name, objective, metric, num_class=None, class_weights=None):
    """
    Train, evaluate and persist one XGBoost classifier for a single market.

    Supports binary and multi-class targets, and sample weighting for
    imbalanced classes (like 1/2 reversals).

    Args:
        df: DataFrame holding the FEATURES columns and the label columns.
        target_col: Name of the label column to predict. Rows where it is
            NaN are dropped before training.
        model_name: Base file name (without extension) of the saved artifacts.
        objective: XGBoost objective. "multi:softprob" selects the
            multi-class evaluation path; anything else is evaluated as binary.
        metric: XGBoost eval_metric, monitored for early stopping.
        num_class: Number of classes, forwarded to XGBoost when given.
        class_weights: Optional {class label: weight} mapping converted into
            per-row sample weights; labels absent from the mapping get 1.0.

    Returns:
        None. Writes <model_name>.json (raw booster) and <model_name>.pkl
        (pickled sklearn wrapper) into MODELS_DIR. Returns early, without
        training, when the target column is absent or has no labelled rows.
    """
    print(f"\n🚀 Training {model_name} (Target: {target_col})...")

    # A label column that is absent from the CSV is handled like an empty one,
    # so one missing market does not abort the remaining trainings.
    if target_col not in df.columns:
        print(f"   ⚠️ Column {target_col} not found in data, skipping.")
        return

    # Filter valid rows for this target
    valid_df = df[df[target_col].notna()].copy()
    if valid_df.empty:
        print(f"   ⚠️ No valid data for {target_col}, skipping.")
        return

    X = valid_df[FEATURES]
    y = valid_df[target_col].astype(int)

    # Split (stratified so rare classes appear on both sides where possible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Sample Weights (For HT/FT Surprise)
    sample_weights_train = None
    if class_weights:
        print("   ⚖️ Applying class weights for surprise detection...")
        sample_weights_train = y_train.map(class_weights).fillna(1.0)

    # Model Params
    params = {
        'objective': objective,
        'eval_metric': metric,
        'eta': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'nthread': 4,
        'seed': 42
    }

    if num_class:
        params['num_class'] = num_class

    # Train using Scikit-Learn Wrapper so we can pickle it cleanly for v20_ensemble.
    # The same wrapper class serves both binary and multi-class objectives.
    model = xgb.XGBClassifier(**params, n_estimators=1000, early_stopping_rounds=50)

    # Fit with early stopping.
    # NOTE(review): early stopping is monitored on the same hold-out split
    # that the metrics below are computed on, so those metrics are likely
    # optimistic. Consider a separate validation split.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Evaluation
    preds = model.predict_proba(X_test)

    if objective == "multi:softprob":
        y_pred_class = np.argmax(preds, axis=1)
        acc = accuracy_score(y_test, y_pred_class)
        # Pass the full label set explicitly: a rare class may be absent from
        # y_test, in which case log_loss cannot infer it and raises.
        loss = log_loss(y_test, preds, labels=np.arange(preds.shape[1]))
        print(f"   ✅ Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")

        # Detailed report for important classes
        print(classification_report(y_test, y_pred_class))

    else:
        # Binary: column 1 of predict_proba is the probability of class 1
        class_1_preds = preds[:, 1]
        y_pred_class = (class_1_preds > 0.5).astype(int)
        acc = accuracy_score(y_test, y_pred_class)
        auc = roc_auc_score(y_test, class_1_preds)
        print(f"   ✅ Accuracy: {acc:.4f} | AUC: {auc:.4f}")

    # Save raw json booster
    model_json_path = os.path.join(MODELS_DIR, f"{model_name}.json")
    model.get_booster().save_model(model_json_path)

    # Save sklearn wrapped PKL (What v20_ensemble actually loads for Uncalibrated models like ht_ft!)
    model_pkl_path = os.path.join(MODELS_DIR, f"{model_name}.pkl")
    with open(model_pkl_path, "wb") as f:
        pickle.dump(model, f)

    print(f"   💾 Model saved to {model_json_path} and {model_pkl_path}")

def main():
    """Load the training data and train every market model, one after another."""
    df = load_data()

    # HT/FT SURPRISE HUNTER
    # Classes: 0=1/1, 1=1/X, 2=1/2(HOME->AWAY), 3=X/1 ... 6=2/1(AWAY->HOME) ...
    # We give HUGE weight to 2 (1/2) and 6 (2/1)
    htft_weights = {
        0: 1.0, 1: 3.0, 2: 15.0,  # 1/1, 1/X, 1/2 (Reversal!)
        3: 2.0, 4: 2.0, 5: 2.0,   # X/1, X/X, X/2
        6: 15.0, 7: 3.0, 8: 1.0   # 2/1 (Reversal!), 2/X, 2/2
    }

    # (objective, eval metric) pairs shared by the jobs below
    multi = ("multi:softprob", "mlogloss")
    binary = ("binary:logistic", "logloss")

    # One row per model, in training order:
    # (label column, model name, (objective, metric), num_class, class_weights)
    jobs = [
        ("label_ms", "xgb_ms", multi, 3, None),                    # 1. Match Result (1X2)
        ("label_ou25", "xgb_ou25", binary, None, None),            # 2. Over/Under 2.5
        ("label_btts", "xgb_btts", binary, None, None),            # 3. BTTS
        ("label_ht_ft", "xgb_ht_ft", multi, 9, htft_weights),      # 4. HT/FT
        ("label_ou15", "xgb_ou15", binary, None, None),            # 5. Over/Under 1.5
        ("label_ou35", "xgb_ou35", binary, None, None),            #    Over/Under 3.5
        ("label_ht_result", "xgb_ht_result", multi, 3, None),      # 6. Half-Time 1X2
        ("label_ht_ou05", "xgb_ht_ou05", binary, None, None),      # 7. Half-Time Over/Under
        ("label_ht_ou15", "xgb_ht_ou15", binary, None, None),
        ("label_handicap_ms", "xgb_handicap_ms", multi, 3, None),  # 8. Handicap MS and Cards
        ("label_cards_ou45", "xgb_cards_ou45", binary, None, None),
    ]

    for label_col, name, (obj, eval_metric), n_classes, weights in jobs:
        train_model(
            df, label_col, name,
            objective=obj, metric=eval_metric,
            num_class=n_classes, class_weights=weights,
        )

    print("\n✅ All models trained successfully!")

# Run the full training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()