Files
iddaai-be/ai-engine/scripts/train_score_model.py
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

184 lines
6.0 KiB
Python
Executable File

import pandas as pd
import xgboost as xgb
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Filesystem locations, resolved relative to the directory holding this script.
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_PATH = os.path.join(_SCRIPT_DIR, "../data/training_data.csv")
MODEL_PATH = os.path.join(_SCRIPT_DIR, "../models/xgb_score.pkl")

# Feature column list shared with the markets trainer (noted by the original
# author as the unified 56-feature array), so both trainers read the same inputs.
from train_xgboost_markets import FEATURES

# Columns predicted by this script: full-time and half-time goals, home and away.
TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]
def _new_goal_regressor():
    """Return an unfitted XGBoost regressor with the shared hyper-parameters.

    All four goal models (full-time / half-time, home / away) use this exact
    configuration. Early stopping is intentionally not configured: the
    constructor keyword is not accepted by every XGBoost release, and without
    it every model simply grows the full ``n_estimators`` trees.
    """
    return xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        n_jobs=-1,
        random_state=42,
    )


def _fit_and_predict(X_train, y_train, X_test, y_test):
    """Fit one goal regressor and return ``(model, test_predictions)``.

    The test split is still passed as ``eval_set``; with no early stopping
    configured it is only used to track the evaluation metric per round.
    """
    model = _new_goal_regressor()
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    return model, model.predict(X_test)


def _score_hit_rates(home_true, away_true, home_pred, away_pred):
    """Return ``(exact_pct, close_pct)`` for rounded score predictions.

    ``exact_pct`` is the share of matches where both rounded predictions equal
    the true goals; ``close_pct`` is the share where both are within one goal.
    An empty input yields ``(0.0, 0.0)`` instead of dividing by zero.
    """
    exact = close = total = 0
    for h_true, a_true, h_pred, a_pred in zip(home_true, away_true, home_pred, away_pred):
        h_p = round(h_pred)
        a_p = round(a_pred)
        total += 1
        if h_p == h_true and a_p == a_true:
            exact += 1
        if abs(h_p - h_true) <= 1 and abs(a_p - a_true) <= 1:
            close += 1
    if not total:
        return 0.0, 0.0
    return exact / total * 100, close / total * 100


def train():
    """Train and persist the full-time and half-time goal regressors.

    Loads ``DATA_PATH``, drops rows with a missing target, keeps rows whose
    ``odds_ms_h`` is greater than 1.0, fits four XGBoost regressors (FT/HT
    home/away goals) on an 80/20 split, prints MAE (plus R2 for the full-time
    models) and the rounded exact-score accuracy, then pickles the models,
    the feature list and the metrics to ``MODEL_PATH``.

    Returns:
        None. Returns early, after printing a message, when the data file is
        missing or fewer than two usable rows remain after filtering.

    Raises:
        KeyError: If the CSV lacks a target, ``odds_ms_h`` or feature column.
    """
    print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
    print("=" * 60)

    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        return

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Rows without a recorded result cannot serve as regression examples.
    # Feature NaNs are NOT imputed here; they are passed through to XGBoost,
    # which treats NaN as a missing value by default.
    df = df.dropna(subset=TARGETS)
    # NOTE: this count is taken after the target-NaN rows were dropped.
    print(f" Original rows: {len(df)}")

    # Filter valid odds (at least ms_h > 1.0)
    df = df[df["odds_ms_h"] > 1.0].copy()
    print(f" Rows with valid odds: {len(df)}")

    # An 80/20 split needs at least one row on each side; bail out with a
    # readable message rather than an opaque ValueError from scikit-learn.
    if len(df) < 2:
        print("❌ Not enough usable rows to create a train/test split.")
        return

    # Split features and all four targets together so the rows stay aligned.
    (
        X_train, X_test,
        y_h_train, y_h_test,
        y_a_train, y_a_test,
        y_ht_h_train, y_ht_h_test,
        y_ht_a_train, y_ht_a_test,
    ) = train_test_split(
        df[FEATURES],
        df["score_home"],
        df["score_away"],
        df["ht_score_home"],
        df["ht_score_away"],
        test_size=0.2,
        random_state=42,
    )
    print(f" Training set: {len(X_train)} matches")
    print(f" Test set: {len(X_test)} matches")

    # --- HOME GOALS MODEL ---
    print("\n🏠 Training Home Goals Model...")
    xgb_home, home_preds = _fit_and_predict(X_train, y_h_train, X_test, y_h_test)
    mae_home = mean_absolute_error(y_h_test, home_preds)
    r2_home = r2_score(y_h_test, home_preds)
    print(f" ✅ FT Home MAE: {mae_home:.4f} goals")
    print(f" ✅ FT Home R2: {r2_home:.4f}")

    # --- AWAY GOALS MODEL ---
    print("\n✈️ Training FT Away Goals Model...")
    xgb_away, away_preds = _fit_and_predict(X_train, y_a_train, X_test, y_a_test)
    mae_away = mean_absolute_error(y_a_test, away_preds)
    r2_away = r2_score(y_a_test, away_preds)
    print(f" ✅ FT Away MAE: {mae_away:.4f} goals")
    print(f" ✅ FT Away R2: {r2_away:.4f}")

    # --- HT HOME GOALS MODEL ---
    print("\n🏠 Training HT Home Goals Model...")
    xgb_ht_home, ht_home_preds = _fit_and_predict(X_train, y_ht_h_train, X_test, y_ht_h_test)
    mae_ht_home = mean_absolute_error(y_ht_h_test, ht_home_preds)
    print(f" ✅ HT Home MAE: {mae_ht_home:.4f} goals")

    # --- HT AWAY GOALS MODEL ---
    print("\n✈️ Training HT Away Goals Model...")
    xgb_ht_away, ht_away_preds = _fit_and_predict(X_train, y_ht_a_train, X_test, y_ht_a_test)
    mae_ht_away = mean_absolute_error(y_ht_a_test, ht_away_preds)
    print(f" ✅ HT Away MAE: {mae_ht_away:.4f} goals")

    # --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) ---
    print("\n🎯 Exact FT Score Accuracy (Test Set):")
    acc, close_acc = _score_hit_rates(y_h_test, y_a_test, home_preds, away_preds)
    print(f" Exact Match: {acc:.2f}%")
    print(f" Close Match (+/- 1 goal): {close_acc:.2f}%")

    # Save
    print(f"\n💾 Saving models to {MODEL_PATH}...")
    model_data = {
        "home_model": xgb_home,
        "away_model": xgb_away,
        "ht_home_model": xgb_ht_home,
        "ht_away_model": xgb_ht_away,
        "features": FEATURES,
        "meta": {
            "mae_home": mae_home,
            "mae_away": mae_away,
            "mae_ht_home": mae_ht_home,
            "mae_ht_away": mae_ht_away,
            "acc": acc,
        },
    }
    # Create the output directory on a fresh checkout so open() cannot fail
    # with FileNotFoundError after all the training work has been done.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model_data, f)
    print("✅ Done.")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train()