# iddaai-be/ai-engine/scripts/train_score_model.py


import pandas as pd
import xgboost as xgb
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Paths (both resolved relative to the directory that contains this script)
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_PATH = os.path.join(_SCRIPT_DIR, "../data/training_data.csv")
MODEL_PATH = os.path.join(_SCRIPT_DIR, "../models/xgb_score.pkl")

# Import unified 56-feature array from markets trainer
from train_xgboost_markets import FEATURES

# Label columns: full-time and half-time goals for the home and away side.
TARGETS = [
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
]

def _new_regressor():
    """Build an XGBoost regressor with the hyperparameters shared by all four goal models."""
    return xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        n_jobs=-1,
        random_state=42,
    )


def _fit_goal_model(X_train, y_train, X_test, y_test):
    """Fit one goal regressor and return ``(model, test_predictions, test_mae)``."""
    model = _new_regressor()
    # No early stopping is configured, so the eval_set does not influence
    # training; it is passed only so evaluation metrics are tracked on the model.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    preds = model.predict(X_test)
    return model, preds, mean_absolute_error(y_test, preds)


def train():
    """Train the four goal regressors (FT/HT x home/away) and pickle them.

    Reads the CSV at ``DATA_PATH``, drops rows with a missing value in any
    ``TARGETS`` column, keeps rows whose ``odds_ms_h`` is greater than 1.0,
    and performs an 80/20 train/test split with a fixed seed. One
    ``XGBRegressor`` is fitted per target on the ``FEATURES`` columns. Test-set
    MAE (plus R2 for the full-time models) and rounded exact-score accuracy
    are printed, then the models, feature list and metrics are pickled to
    ``MODEL_PATH``.

    Returns:
        None. If the data file is missing, or fewer than two usable rows
        remain after filtering, a message is printed and nothing is trained
        or saved.
    """
    print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
    print("=" * 60)

    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        return

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Preprocessing: rows without all four targets cannot be used as labels.
    # Feature NaNs are NOT filled here; the feature columns are passed to
    # XGBoost as loaded.
    df = df.dropna(subset=TARGETS)
    # NOTE: this count is taken after rows with missing targets were dropped.
    print(f"   Original rows: {len(df)}")

    # Filter valid odds (at least ms_h > 1.0)
    df = df[df["odds_ms_h"] > 1.0].copy()
    print(f"   Rows with valid odds: {len(df)}")

    # train_test_split needs at least one row on each side of the split;
    # report that clearly instead of failing inside scikit-learn.
    if len(df) < 2:
        print("❌ Not enough rows with valid odds to train a model.")
        return

    # Train/Test Split (all targets are split together so rows stay aligned)
    (
        X_train, X_test,
        y_h_train, y_h_test,
        y_a_train, y_a_test,
        y_ht_h_train, y_ht_h_test,
        y_ht_a_train, y_ht_a_test,
    ) = train_test_split(
        df[FEATURES],
        df["score_home"],
        df["score_away"],
        df["ht_score_home"],
        df["ht_score_away"],
        test_size=0.2,
        random_state=42,
    )

    print(f"   Training set: {len(X_train)} matches")
    print(f"   Test set: {len(X_test)} matches")

    # --- HOME GOALS MODEL ---
    print("\n🏠 Training Home Goals Model...")
    xgb_home, home_preds, mae_home = _fit_goal_model(X_train, y_h_train, X_test, y_h_test)
    r2_home = r2_score(y_h_test, home_preds)
    print(f"   ✅ FT Home MAE: {mae_home:.4f} goals")
    print(f"   ✅ FT Home R2: {r2_home:.4f}")

    # --- AWAY GOALS MODEL ---
    print("\n✈️ Training FT Away Goals Model...")
    xgb_away, away_preds, mae_away = _fit_goal_model(X_train, y_a_train, X_test, y_a_test)
    r2_away = r2_score(y_a_test, away_preds)
    print(f"   ✅ FT Away MAE: {mae_away:.4f} goals")
    print(f"   ✅ FT Away R2: {r2_away:.4f}")

    # --- HT HOME GOALS MODEL ---
    print("\n🏠 Training HT Home Goals Model...")
    xgb_ht_home, _, mae_ht_home = _fit_goal_model(X_train, y_ht_h_train, X_test, y_ht_h_test)
    print(f"   ✅ HT Home MAE: {mae_ht_home:.4f} goals")

    # --- HT AWAY GOALS MODEL ---
    print("\n✈️ Training HT Away Goals Model...")
    xgb_ht_away, _, mae_ht_away = _fit_goal_model(X_train, y_ht_a_train, X_test, y_ht_a_test)
    print(f"   ✅ HT Away MAE: {mae_ht_away:.4f} goals")

    # --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) ---
    print("\n🎯 Exact FT Score Accuracy (Test Set):")
    correct = 0
    close = 0  # Both sides within 1 goal of the true score

    for h_true, a_true, h_pred, a_pred in zip(y_h_test, y_a_test, home_preds, away_preds):
        h_p = round(h_pred)
        a_p = round(a_pred)
        if h_p == h_true and a_p == a_true:
            correct += 1
        if abs(h_p - h_true) <= 1 and abs(a_p - a_true) <= 1:
            close += 1

    n_test = len(X_test)
    acc = correct / n_test * 100
    close_acc = close / n_test * 100
    print(f"   Exact Match: {acc:.2f}%")
    print(f"   Close Match (+/- 1 goal): {close_acc:.2f}%")

    # Save
    print(f"\n💾 Saving models to {MODEL_PATH}...")
    model_data = {
        "home_model": xgb_home,
        "away_model": xgb_away,
        "ht_home_model": xgb_ht_home,
        "ht_away_model": xgb_ht_away,
        "features": FEATURES,
        "meta": {
            "mae_home": mae_home,
            "mae_away": mae_away,
            "mae_ht_home": mae_ht_home,
            "mae_ht_away": mae_ht_away,
            "acc": acc,
        },
    }
    # Make sure the output directory exists so a fresh checkout does not lose
    # a finished training run to a FileNotFoundError at save time.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model_data, f)

    print("✅ Done.")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()