"""Train XGBoost regressors that predict full-time and half-time goals."""
|
|
import pandas as pd
|
|
import xgboost as xgb
|
|
import pickle
|
|
import os
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import mean_absolute_error, r2_score
|
|
|
|
# Filesystem locations, both resolved relative to the directory of this script.
_SCRIPT_DIR = os.path.dirname(__file__)

# CSV with one row per match: feature columns plus the score targets.
DATA_PATH = os.path.join(_SCRIPT_DIR, "../data/training_data.csv")

# Destination of the pickled bundle of trained models.
MODEL_PATH = os.path.join(_SCRIPT_DIR, "../models/xgb_score.pkl")
|
|
|
|
# Import unified 56-feature array from markets trainer
|
|
from train_xgboost_markets import FEATURES
|
|
|
|
# Columns the models are trained to predict; rows missing any of them are dropped.
TARGETS = [
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
]
|
|
|
|
def _fit_goal_model(X_train, y_train, X_test, y_test):
    """Fit one goals regressor and return ``(model, test_predictions)``.

    Every target shares the same hyper-parameters, so they live here once
    instead of being repeated per model.
    """
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        n_jobs=-1,
        random_state=42,
    )
    # No early stopping is configured, so eval_set does not influence the
    # fit; it is passed only so the per-round metric history is recorded.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    return model, model.predict(X_test)


def train():
    """Train the four goal regressors and pickle them to ``MODEL_PATH``.

    Steps:
      1. Load ``DATA_PATH``; keep rows that have every column in ``TARGETS``
         and a home-win odd (``odds_ms_h``) greater than 1.0.
      2. Split 80/20 with a fixed seed.
      3. Fit one XGBoost regressor per target on the ``FEATURES`` columns and
         print its test MAE (plus R2 for the full-time models).
      4. Print the exact / within-one-goal full-time score accuracy obtained
         by rounding the home and away predictions.
      5. Pickle a dict with the models, the feature list and the metrics.

    Returns:
        None. If the data file is missing, required columns are absent, or
        too few rows survive filtering, a message is printed and nothing is
        written.
    """
    print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
    print("=" * 60)

    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        return

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Fail with a readable message instead of a bare KeyError further down.
    required = set(FEATURES) | set(TARGETS) | {"odds_ms_h"}
    missing = sorted(required - set(df.columns))
    if missing:
        print(f"❌ Missing required columns: {missing}")
        return

    # Rows without a complete set of targets cannot be used for training.
    df = df.dropna(subset=TARGETS)
    # Feature NaNs are deliberately left untouched: nothing here imputes
    # them, and XGBoost treats NaN as a missing value by default.
    print(f" Original rows: {len(df)}")

    # Filter valid odds (at least ms_h > 1.0)
    df = df[df["odds_ms_h"] > 1.0].copy()
    print(f" Rows with valid odds: {len(df)}")

    if len(df) < 2:
        print("❌ Not enough rows to train after filtering.")
        return

    # Splitting the frame once yields the same row partition as splitting
    # X and each target in a single call: the shuffle depends only on the
    # number of rows and the seed.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    X_train = train_df[FEATURES]
    X_test = test_df[FEATURES]

    print(f" Training set: {len(X_train)} matches")
    print(f" Test set: {len(X_test)} matches")

    # (target column, key stem, metric label, banner, also report R2?)
    specs = (
        ("score_home", "home", "FT Home", "\n🏠 Training Home Goals Model...", True),
        ("score_away", "away", "FT Away", "\n✈️ Training FT Away Goals Model...", True),
        ("ht_score_home", "ht_home", "HT Home", "\n🏠 Training HT Home Goals Model...", False),
        ("ht_score_away", "ht_away", "HT Away", "\n✈️ Training HT Away Goals Model...", False),
    )

    models = {}
    meta = {}
    predictions = {}
    for target, stem, label, banner, with_r2 in specs:
        print(banner)
        y_test = test_df[target]
        model, y_pred = _fit_goal_model(X_train, train_df[target], X_test, y_test)

        mae = mean_absolute_error(y_test, y_pred)
        print(f" ✅ {label} MAE: {mae:.4f} goals")
        if with_r2:
            print(f" ✅ {label} R2: {r2_score(y_test, y_pred):.4f}")

        models[f"{stem}_model"] = model
        meta[f"mae_{stem}"] = mae
        predictions[target] = y_pred

    # --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) ---
    print("\n🎯 Exact FT Score Accuracy (Test Set):")
    # ndarray.round() rounds half to even, matching the builtin round().
    home_err = predictions["score_home"].round() - test_df["score_home"].to_numpy()
    away_err = predictions["score_away"].round() - test_df["score_away"].to_numpy()
    correct = int(((home_err == 0) & (away_err == 0)).sum())
    # "Close" means each side is off by at most one goal.
    close = int(((abs(home_err) <= 1) & (abs(away_err) <= 1)).sum())

    acc = correct / len(X_test) * 100
    close_acc = close / len(X_test) * 100
    print(f" Exact Match: {acc:.2f}%")
    print(f" Close Match (+/- 1 goal): {close_acc:.2f}%")

    # Save
    print(f"\n💾 Saving models to {MODEL_PATH}...")
    model_data = {
        **models,
        "features": FEATURES,
        "meta": {**meta, "acc": acc},
    }
    # The models directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model_data, f)

    print("✅ Done.")
|
|
|
|
if __name__ == "__main__":
    # Run the whole training pipeline only when executed as a script,
    # not when this module is imported.
    train()
|