"""Train XGBoost regressors for full-time and half-time goal counts.

Reads the training CSV at ``DATA_PATH``, fits one regressor per column in
``TARGETS`` on the shared ``FEATURES`` columns, prints hold-out metrics and
pickles the four fitted models plus metadata to ``MODEL_PATH``.
"""

import os
import pickle

import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Import unified 56-feature array from markets trainer
from train_xgboost_markets import FEATURES

# Paths
DATA_PATH = os.path.join(os.path.dirname(__file__), "../data/training_data.csv")
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../models/xgb_score.pkl")

TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]

# Hyper-parameters shared by all four goal regressors.
# Early stopping is intentionally not configured: where it must be passed
# (constructor vs. fit) depends on the installed XGBoost version.
_XGB_PARAMS = {
    "objective": "reg:squarederror",
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "n_jobs": -1,
    "random_state": 42,
}


def _fit_and_score(X_train, X_test, y_train, y_test):
    """Fit one goal regressor and evaluate it on the hold-out split.

    Returns:
        A ``(model, predictions, mae)`` tuple where ``predictions`` are the
        raw (unrounded) model outputs for ``X_test`` and ``mae`` is the mean
        absolute error of those predictions against ``y_test``.
    """
    model = xgb.XGBRegressor(**_XGB_PARAMS)
    # Without early stopping, eval_set only records per-round metrics.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    predictions = model.predict(X_test)
    return model, predictions, mean_absolute_error(y_test, predictions)


def _score_accuracy(home_true, away_true, home_pred, away_pred):
    """Return ``(exact_pct, close_pct)`` for rounded score predictions.

    ``exact_pct`` is the percentage of matches where both rounded predictions
    equal the true goals; ``close_pct`` is the percentage where both rounded
    predictions are within one goal of the truth. Both are 0.0 when there are
    no predictions.
    """
    total = len(home_pred)
    if total == 0:
        return 0.0, 0.0

    exact = 0
    close = 0
    for h_true, a_true, h_raw, a_raw in zip(home_true, away_true, home_pred, away_pred):
        h_goal = round(h_raw)
        a_goal = round(a_raw)
        if h_goal == h_true and a_goal == a_true:
            exact += 1
        if abs(h_goal - h_true) <= 1 and abs(a_goal - a_true) <= 1:
            close += 1
    return exact / total * 100, close / total * 100


def train() -> None:
    """Train, evaluate and persist the four goal-prediction models.

    Prints progress and metrics to stdout. Returns early (after printing a
    message) when the data file is missing or when fewer than two usable rows
    remain after filtering. On success, writes a pickle to ``MODEL_PATH``
    containing the fitted models, the feature list and evaluation metadata.
    """
    print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
    print("=" * 60)

    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        return

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Preprocessing
    # Rows missing any target cannot be used, so drop them up front.
    # Feature NaNs are NOT imputed in this script; they are passed to the
    # models as-is.
    df = df.dropna(subset=TARGETS)
    # NOTE: this count is taken after the target dropna above.
    print(f" Original rows: {len(df)}")

    # Filter valid odds (at least ms_h > 1.0)
    df = df[df["odds_ms_h"] > 1.0].copy()
    print(f" Rows with valid odds: {len(df)}")

    # A train/test split needs at least one row on each side.
    if len(df) < 2:
        print("❌ Not enough usable rows to train after filtering.")
        return

    # Train/Test Split: one split of the frame keeps features and all four
    # targets row-aligned.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    X_train = train_df[FEATURES]
    X_test = test_df[FEATURES]
    y_h_test = test_df["score_home"]
    y_a_test = test_df["score_away"]

    print(f" Training set: {len(X_train)} matches")
    print(f" Test set: {len(X_test)} matches")

    # --- HOME GOALS MODEL ---
    print("\n🏠 Training Home Goals Model...")
    xgb_home, home_preds, mae_home = _fit_and_score(
        X_train, X_test, train_df["score_home"], y_h_test
    )
    r2_home = r2_score(y_h_test, home_preds)
    print(f" ✅ FT Home MAE: {mae_home:.4f} goals")
    print(f" ✅ FT Home R2: {r2_home:.4f}")

    # --- AWAY GOALS MODEL ---
    print("\n✈️ Training FT Away Goals Model...")
    xgb_away, away_preds, mae_away = _fit_and_score(
        X_train, X_test, train_df["score_away"], y_a_test
    )
    r2_away = r2_score(y_a_test, away_preds)
    print(f" ✅ FT Away MAE: {mae_away:.4f} goals")
    print(f" ✅ FT Away R2: {r2_away:.4f}")

    # --- HT HOME GOALS MODEL ---
    print("\n🏠 Training HT Home Goals Model...")
    xgb_ht_home, _, mae_ht_home = _fit_and_score(
        X_train, X_test, train_df["ht_score_home"], test_df["ht_score_home"]
    )
    print(f" ✅ HT Home MAE: {mae_ht_home:.4f} goals")

    # --- HT AWAY GOALS MODEL ---
    print("\n✈️ Training HT Away Goals Model...")
    xgb_ht_away, _, mae_ht_away = _fit_and_score(
        X_train, X_test, train_df["ht_score_away"], test_df["ht_score_away"]
    )
    print(f" ✅ HT Away MAE: {mae_ht_away:.4f} goals")

    # --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) ---
    print("\n🎯 Exact FT Score Accuracy (Test Set):")
    acc, close_acc = _score_accuracy(y_h_test, y_a_test, home_preds, away_preds)
    print(f" Exact Match: {acc:.2f}%")
    print(f" Close Match (+/- 1 goal): {close_acc:.2f}%")

    # Save
    print(f"\n💾 Saving models to {MODEL_PATH}...")
    model_data = {
        "home_model": xgb_home,
        "away_model": xgb_away,
        "ht_home_model": xgb_ht_home,
        "ht_away_model": xgb_ht_away,
        "features": FEATURES,
        "meta": {
            "mae_home": mae_home,
            "mae_away": mae_away,
            "mae_ht_home": mae_ht_home,
            "mae_ht_away": mae_ht_away,
            "acc": acc,
        },
    }

    # Create the output directory so a missing folder does not discard the
    # finished training run.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model_data, f)

    print("✅ Done.")


if __name__ == "__main__":
    train()