""" V25-Compatible Score Prediction Model Trainer =============================================== Trains 4 independent XGBoost regression models for: - FT Home Goals - FT Away Goals - HT Home Goals - HT Away Goals Uses the same 102-feature set as v25_ensemble for full compatibility. Temporal train/test split (80/20) to avoid future leakage. Usage: python3 scripts/train_score_model.py """ import os import sys import pickle import numpy as np import pandas as pd import xgboost as xgb from datetime import datetime from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Config AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") MODEL_PATH = os.path.join(AI_ENGINE_DIR, "models", "xgb_score.pkl") # Import the EXACT same feature set as v25 market models from train_v25_clean import FEATURES TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"] # Model hyperparameters (tuned for goal count regression) XGB_PARAMS = { "objective": "reg:squarederror", "n_estimators": 1200, "learning_rate": 0.02, "max_depth": 6, "subsample": 0.8, "colsample_bytree": 0.7, "min_child_weight": 5, "reg_alpha": 0.1, "reg_lambda": 1.0, "n_jobs": -1, "random_state": 42, } def load_data() -> pd.DataFrame: """Load and validate training data.""" if not os.path.exists(DATA_PATH): print(f"❌ Data file not found: {DATA_PATH}") print(" Run extract_training_data.py first") sys.exit(1) print(f"📦 Loading data from {DATA_PATH}...") df = pd.read_csv(DATA_PATH) # Fill feature NaNs with 0 (same as v25 training) for col in FEATURES: if col in df.columns: df[col] = df[col].fillna(0) # Backward-compatible: add odds presence flags if missing odds_base_columns = [ "odds_ms_h", "odds_ms_d", "odds_ms_a", "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", "odds_ou05_o", "odds_ou05_u", "odds_ou15_o", "odds_ou15_u", "odds_ou25_o", "odds_ou25_u", "odds_ou35_o", "odds_ou35_u", "odds_ht_ou05_o", "odds_ht_ou05_u", "odds_ht_ou15_o", "odds_ht_ou15_u", "odds_btts_y", "odds_btts_n", ] for base_col in odds_base_columns: pres_col = f"{base_col}_present" if pres_col not in df.columns and base_col in df.columns: df[pres_col] = (df[base_col] > 1.0).astype(int) # Drop rows where any target is missing df = df.dropna(subset=TARGETS) # Filter: at least MS odds must be present df = df[df["odds_ms_h"] > 1.0].copy() # Ensure all features exist missing = [f for f in FEATURES if f not in df.columns] if missing: print(f"⚠️ Missing {len(missing)} features, filling with 0: {missing[:5]}...") for f in missing: df[f] = 0 return df def temporal_split(df: pd.DataFrame, train_ratio: float = 0.80): """ Temporal train/test split by match date. Ensures no future information leaks into training. """ if "match_date" in df.columns: df = df.sort_values("match_date").reset_index(drop=True) elif "round" in df.columns: df = df.sort_values("round").reset_index(drop=True) split_idx = int(len(df) * train_ratio) return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy() def train_single_model(X_train, y_train, X_test, y_test, name: str): """Train a single XGBoost regression model with early stopping.""" print(f"\n🏗️ Training {name} model...") model = xgb.XGBRegressor(**XGB_PARAMS) model.fit( X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, ) preds = model.predict(X_test) mae = mean_absolute_error(y_test, preds) rmse = np.sqrt(mean_squared_error(y_test, preds)) r2 = r2_score(y_test, preds) print(f" MAE: {mae:.4f} goals") print(f" RMSE: {rmse:.4f}") print(f" R²: {r2:.4f}") return model, {"mae": mae, "rmse": rmse, "r2": r2} def evaluate_combined(models: dict, X_test, y_test_dict: dict): """Evaluate combined score accuracy (FT and HT).""" print("\n🎯 Combined Score Evaluation (Test Set):") # FT Score ft_h_preds = models["ft_home"].predict(X_test) ft_a_preds = models["ft_away"].predict(X_test) y_ft_h = y_test_dict["score_home"].values y_ft_a = y_test_dict["score_away"].values exact = 0 close = 0 result_correct = 0 total = len(X_test) for h_true, a_true, h_pred, a_pred in zip(y_ft_h, y_ft_a, ft_h_preds, ft_a_preds): hp = max(0, round(h_pred)) ap = max(0, round(a_pred)) # Exact score if hp == h_true and ap == a_true: exact += 1 # Close (±1 each) if abs(hp - h_true) <= 1 and abs(ap - a_true) <= 1: close += 1 # Result direction (1X2) true_result = 1 if h_true > a_true else (0 if h_true == a_true else -1) pred_result = 1 if hp > ap else (0 if hp == ap else -1) if true_result == pred_result: result_correct += 1 print(f" FT Exact Score: {exact / total * 100:.2f}% ({exact}/{total})") print(f" FT Close (±1): {close / total * 100:.2f}% ({close}/{total})") print(f" FT Result (1X2): {result_correct / total * 100:.2f}% ({result_correct}/{total})") # HT Score ht_h_preds = models["ht_home"].predict(X_test) ht_a_preds = models["ht_away"].predict(X_test) y_ht_h = y_test_dict["ht_score_home"].values y_ht_a = y_test_dict["ht_score_away"].values ht_exact = 0 ht_total = len(X_test) for h_true, a_true, h_pred, a_pred in zip(y_ht_h, y_ht_a, ht_h_preds, ht_a_preds): hp = max(0, round(h_pred)) ap = max(0, round(a_pred)) if hp == h_true and ap == a_true: ht_exact += 1 print(f" HT Exact Score: {ht_exact / ht_total * 100:.2f}% ({ht_exact}/{ht_total})") return { "ft_exact_pct": exact / total * 100, "ft_close_pct": close / total * 100, "ft_result_pct": result_correct / total * 100, "ht_exact_pct": ht_exact / ht_total * 100, } def train(): """Main training pipeline.""" print("🚀 Score Prediction Model Trainer (V25-Compatible)") print(f" Feature count: {len(FEATURES)}") print("=" * 60) # Load data df = load_data() print(f" Total valid rows: {len(df)}") # Temporal split train_df, test_df = temporal_split(df) print(f" Training set: {len(train_df)} matches") print(f" Test set: {len(test_df)} matches (temporally after training)") X_train = train_df[FEATURES] X_test = test_df[FEATURES] # Train 4 models models = {} metrics = {} for target_name, model_key in [ ("score_home", "ft_home"), ("score_away", "ft_away"), ("ht_score_home", "ht_home"), ("ht_score_away", "ht_away"), ]: model, metric = train_single_model( X_train, train_df[target_name], X_test, test_df[target_name], model_key, ) models[model_key] = model metrics[model_key] = metric # Combined evaluation y_test_dict = {t: test_df[t] for t in TARGETS} combined = evaluate_combined(models, X_test, y_test_dict) # Save print(f"\n💾 Saving to {MODEL_PATH}...") model_data = { "home_model": models["ft_home"], "away_model": models["ft_away"], "ht_home_model": models["ht_home"], "ht_away_model": models["ht_away"], "features": FEATURES, "meta": { **{f"{k}_{mk}": mv for k, m in metrics.items() for mk, mv in m.items()}, **combined, "trained_at": datetime.now().isoformat(), "feature_count": len(FEATURES), "train_size": len(train_df), "test_size": len(test_df), }, } with open(MODEL_PATH, "wb") as f: pickle.dump(model_data, f) print("\n✅ Score model training complete!") print(f" Saved: {MODEL_PATH}") if __name__ == "__main__": train()