feat(ai): expand training to 68K+ matches, add score model, backfill implied odds
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s
- extract_training_data.py: switch from top_leagues.json (23) to qualified_leagues.json (265)
- update_implied_odds.py: new script to backfill implied odds from real market data
- train_score_model.py: rewrite with v25 102-feature set + temporal split
- single_match_orchestrator.py: integrate ML score model with heuristic fallback
This commit is contained in:
@@ -1,183 +1,271 @@
|
||||
"""
|
||||
V25-Compatible Score Prediction Model Trainer
|
||||
===============================================
|
||||
Trains 4 independent XGBoost regression models for:
|
||||
- FT Home Goals
|
||||
- FT Away Goals
|
||||
- HT Home Goals
|
||||
- HT Away Goals
|
||||
|
||||
Uses the same 102-feature set as v25_ensemble for full compatibility.
|
||||
Temporal train/test split (80/20) to avoid future leakage.
|
||||
|
||||
Usage:
|
||||
python3 scripts/train_score_model.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
import pickle
|
||||
import os
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_absolute_error, r2_score
|
||||
from datetime import datetime
|
||||
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
|
||||
|
||||
# Paths
|
||||
DATA_PATH = os.path.join(os.path.dirname(__file__), "../data/training_data.csv")
|
||||
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../models/xgb_score.pkl")
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# Import unified 56-feature array from markets trainer
|
||||
from train_xgboost_markets import FEATURES
|
||||
# Config
|
||||
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
|
||||
MODEL_PATH = os.path.join(AI_ENGINE_DIR, "models", "xgb_score.pkl")
|
||||
|
||||
# Import the EXACT same feature set as v25 market models
|
||||
from train_v25_clean import FEATURES
|
||||
|
||||
TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]
|
||||
|
||||
def train():
|
||||
print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
|
||||
print("=" * 60)
|
||||
# Model hyperparameters (tuned for goal count regression)
# Unpacked as keyword arguments into xgb.XGBRegressor by train_single_model,
# so the same settings apply to every target model trained by this script.
# NOTE(review): there is no early_stopping_rounds key here, so presumably each
# fit builds all n_estimators trees — confirm against the installed XGBoost.
|
||||
XGB_PARAMS = {
|
||||
"objective": "reg:squarederror",
|
||||
"n_estimators": 1200,
|
||||
"learning_rate": 0.02,
|
||||
"max_depth": 6,
|
||||
"subsample": 0.8,
|
||||
"colsample_bytree": 0.7,
|
||||
"min_child_weight": 5,
|
||||
"reg_alpha": 0.1,
|
||||
"reg_lambda": 1.0,
|
||||
"n_jobs": -1,
|
||||
"random_state": 42,
|
||||
|
|
||||
}
|
||||
|
||||
|
||||
def load_data() -> pd.DataFrame:
|
||||
"""Load and validate training data."""
|
||||
if not os.path.exists(DATA_PATH):
|
||||
print(f"❌ Data file not found: {DATA_PATH}")
|
||||
return
|
||||
print(" Run extract_training_data.py first")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"📦 Loading data from {DATA_PATH}...")
|
||||
df = pd.read_csv(DATA_PATH)
|
||||
|
||||
# Preprocessing
|
||||
# Drop rows where target is missing (should verify)
|
||||
|
||||
# Fill feature NaNs with 0 (same as v25 training)
|
||||
for col in FEATURES:
|
||||
if col in df.columns:
|
||||
df[col] = df[col].fillna(0)
|
||||
|
||||
# Backward-compatible: add odds presence flags if missing
|
||||
odds_base_columns = [
|
||||
"odds_ms_h", "odds_ms_d", "odds_ms_a",
|
||||
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
|
||||
"odds_ou05_o", "odds_ou05_u",
|
||||
"odds_ou15_o", "odds_ou15_u",
|
||||
"odds_ou25_o", "odds_ou25_u",
|
||||
"odds_ou35_o", "odds_ou35_u",
|
||||
"odds_ht_ou05_o", "odds_ht_ou05_u",
|
||||
"odds_ht_ou15_o", "odds_ht_ou15_u",
|
||||
"odds_btts_y", "odds_btts_n",
|
||||
]
|
||||
for base_col in odds_base_columns:
|
||||
pres_col = f"{base_col}_present"
|
||||
if pres_col not in df.columns and base_col in df.columns:
|
||||
df[pres_col] = (df[base_col] > 1.0).astype(int)
|
||||
|
||||
# Drop rows where any target is missing
|
||||
df = df.dropna(subset=TARGETS)
|
||||
|
||||
# Fill feature NaNs with median/mean or 0
|
||||
print(f" Original rows: {len(df)}")
|
||||
|
||||
# Filter valid odds (at least ms_h > 1.0)
|
||||
|
||||
# Filter: at least MS odds must be present
|
||||
df = df[df["odds_ms_h"] > 1.0].copy()
|
||||
print(f" Rows with valid odds: {len(df)}")
|
||||
|
||||
X = df[FEATURES]
|
||||
y_home = df["score_home"]
|
||||
y_away = df["score_away"]
|
||||
y_ht_home = df["ht_score_home"]
|
||||
y_ht_away = df["ht_score_away"]
|
||||
|
||||
# Train/Test Split
|
||||
X_train, X_test, y_h_train, y_h_test, y_a_train, y_a_test, y_ht_h_train, y_ht_h_test, y_ht_a_train, y_ht_a_test = train_test_split(
|
||||
X, y_home, y_away, y_ht_home, y_ht_away, test_size=0.2, random_state=42
|
||||
)
|
||||
|
||||
print(f" Training set: {len(X_train)} matches")
|
||||
print(f" Test set: {len(X_test)} matches")
|
||||
|
||||
# --- HOME GOALS MODEL ---
|
||||
print("\n🏠 Training Home Goals Model...")
|
||||
xgb_home = xgb.XGBRegressor(
|
||||
objective='reg:squarederror',
|
||||
n_estimators=1000,
|
||||
learning_rate=0.01,
|
||||
max_depth=5,
|
||||
subsample=0.7,
|
||||
colsample_bytree=0.7,
|
||||
n_jobs=-1,
|
||||
random_state=42,
|
||||
early_stopping_rounds=50 # Configure here for newer XGBoost or remove if not supported in constructor (depends on version)
|
||||
)
|
||||
# Actually, to be safe across versions, let's remove early stopping for now or use validation set properly
|
||||
# Using 'eval_set' without early_stopping_rounds just prints metrics
|
||||
xgb_home = xgb.XGBRegressor(
|
||||
objective='reg:squarederror',
|
||||
n_estimators=1000,
|
||||
learning_rate=0.01,
|
||||
max_depth=5,
|
||||
subsample=0.7,
|
||||
colsample_bytree=0.7,
|
||||
n_jobs=-1,
|
||||
random_state=42
|
||||
)
|
||||
xgb_home.fit(X_train, y_h_train, eval_set=[(X_test, y_h_test)], verbose=False)
|
||||
|
||||
home_preds = xgb_home.predict(X_test)
|
||||
mae_home = mean_absolute_error(y_h_test, home_preds)
|
||||
r2_home = r2_score(y_h_test, home_preds)
|
||||
print(f" ✅ FT Home MAE: {mae_home:.4f} goals")
|
||||
print(f" ✅ FT Home R2: {r2_home:.4f}")
|
||||
# Ensure all features exist
|
||||
missing = [f for f in FEATURES if f not in df.columns]
|
||||
if missing:
|
||||
print(f"⚠️ Missing {len(missing)} features, filling with 0: {missing[:5]}...")
|
||||
for f in missing:
|
||||
df[f] = 0
|
||||
|
||||
# --- AWAY GOALS MODEL ---
|
||||
print("\n✈️ Training FT Away Goals Model...")
|
||||
xgb_away = xgb.XGBRegressor(
|
||||
objective='reg:squarederror',
|
||||
n_estimators=1000,
|
||||
learning_rate=0.01,
|
||||
max_depth=5,
|
||||
subsample=0.7,
|
||||
colsample_bytree=0.7,
|
||||
n_jobs=-1,
|
||||
random_state=42
|
||||
)
|
||||
xgb_away.fit(X_train, y_a_train, eval_set=[(X_test, y_a_test)], verbose=False)
|
||||
|
||||
away_preds = xgb_away.predict(X_test)
|
||||
mae_away = mean_absolute_error(y_a_test, away_preds)
|
||||
r2_away = r2_score(y_a_test, away_preds)
|
||||
print(f" ✅ FT Away MAE: {mae_away:.4f} goals")
|
||||
print(f" ✅ FT Away R2: {r2_away:.4f}")
|
||||
|
||||
# --- HT HOME GOALS MODEL ---
|
||||
print("\n🏠 Training HT Home Goals Model...")
|
||||
xgb_ht_home = xgb.XGBRegressor(
|
||||
objective='reg:squarederror',
|
||||
n_estimators=1000,
|
||||
learning_rate=0.01,
|
||||
max_depth=5,
|
||||
subsample=0.7,
|
||||
colsample_bytree=0.7,
|
||||
n_jobs=-1,
|
||||
random_state=42
|
||||
)
|
||||
xgb_ht_home.fit(X_train, y_ht_h_train, eval_set=[(X_test, y_ht_h_test)], verbose=False)
|
||||
|
||||
ht_home_preds = xgb_ht_home.predict(X_test)
|
||||
mae_ht_home = mean_absolute_error(y_ht_h_test, ht_home_preds)
|
||||
print(f" ✅ HT Home MAE: {mae_ht_home:.4f} goals")
|
||||
return df
|
||||
|
||||
# --- HT AWAY GOALS MODEL ---
|
||||
print("\n✈️ Training HT Away Goals Model...")
|
||||
xgb_ht_away = xgb.XGBRegressor(
|
||||
objective='reg:squarederror',
|
||||
n_estimators=1000,
|
||||
learning_rate=0.01,
|
||||
max_depth=5,
|
||||
subsample=0.7,
|
||||
colsample_bytree=0.7,
|
||||
n_jobs=-1,
|
||||
random_state=42
|
||||
|
||||
def temporal_split(df: pd.DataFrame, train_ratio: float = 0.80):
    """
    Temporal train/test split by match date.

    Rows are ordered by ``match_date`` when that column exists, otherwise by
    ``round``; the first ``train_ratio`` share of the ordered rows becomes the
    training set and the remainder the test set, so no later row ends up in
    training.

    Args:
        df: Frame to split. It is not modified.
        train_ratio: Fraction of rows (0..1 inclusive) assigned to training.

    Returns:
        Tuple ``(train_df, test_df)`` of independent copies.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    sort_col = next((c for c in ("match_date", "round") if c in df.columns), None)

    if sort_col is None:
        # Without an ordering column the split is only positional; say so
        # instead of silently pretending it is temporal.
        print("⚠️ No 'match_date' or 'round' column found, splitting in file order")
        ordered = df
    else:
        # mergesort is stable: rows sharing the same key keep their original
        # relative order, so the split boundary is reproducible across runs.
        ordered = df.sort_values(sort_col, kind="mergesort").reset_index(drop=True)

    split_idx = int(len(ordered) * train_ratio)
    return ordered.iloc[:split_idx].copy(), ordered.iloc[split_idx:].copy()
|
||||
|
||||
|
||||
def train_single_model(X_train, y_train, X_test, y_test, name: str):
    """Fit one XGBoost regressor for a single goal-count target and score it.

    The model is built from the module-level ``XGB_PARAMS``. Those parameters
    do not configure early stopping, so the ``eval_set`` passed to ``fit`` is
    only there for XGBoost's evaluation bookkeeping; it does not shorten
    training.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Hold-out feature matrix.
        y_test: Hold-out target values.
        name: Label used in the progress output.

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` is a dict with the keys
        ``"mae"``, ``"rmse"`` and ``"r2"`` computed on the hold-out set.
    """
    print(f"\n🏗️ Training {name} model...")

    model = xgb.XGBRegressor(**XGB_PARAMS)
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    preds = model.predict(X_test)

    # Plain Python floats keep the metrics dict independent of NumPy scalar
    # types when it is later stored alongside the models.
    mae = float(mean_absolute_error(y_test, preds))
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = float(r2_score(y_test, preds))

    print(f" MAE: {mae:.4f} goals")
    print(f" RMSE: {rmse:.4f}")
    print(f" R²: {r2:.4f}")

    return model, {"mae": mae, "rmse": rmse, "r2": r2}
|
||||
|
||||
|
||||
def evaluate_combined(models: dict, X_test, y_test_dict: dict):
    """Evaluate combined score accuracy (FT and HT).

    Each model's predictions are rounded to the nearest integer and floored at
    zero before being compared with the true goal counts.

    Args:
        models: Mapping with the keys ``"ft_home"``, ``"ft_away"``,
            ``"ht_home"`` and ``"ht_away"``; each value exposes ``predict``.
        X_test: Feature matrix passed to every model's ``predict``.
        y_test_dict: Mapping with the keys ``"score_home"``, ``"score_away"``,
            ``"ht_score_home"`` and ``"ht_score_away"`` holding the true values.

    Returns:
        Dict of percentages: ``ft_exact_pct`` (both FT scores exact),
        ``ft_close_pct`` (both FT scores within one goal), ``ft_result_pct``
        (home win / draw / away win direction matches) and ``ht_exact_pct``
        (both HT scores exact). All values are 0.0 for an empty test set.
    """
    print("\n🎯 Combined Score Evaluation (Test Set):")

    total = len(X_test)
    if total == 0:
        # Nothing to score; avoid dividing by zero further down.
        print(" ⚠️ Empty test set, skipping combined evaluation")
        return {
            "ft_exact_pct": 0.0,
            "ft_close_pct": 0.0,
            "ft_result_pct": 0.0,
            "ht_exact_pct": 0.0,
        }

    def _rounded_goals(model_key):
        # Nearest-integer goal counts, never negative.
        raw = np.asarray(models[model_key].predict(X_test), dtype=float)
        return np.clip(np.rint(raw), 0, None)

    def _truth(target):
        return np.asarray(y_test_dict[target], dtype=float)

    # FT Score
    ft_h, ft_a = _rounded_goals("ft_home"), _rounded_goals("ft_away")
    y_ft_h, y_ft_a = _truth("score_home"), _truth("score_away")

    exact = int(np.sum((ft_h == y_ft_h) & (ft_a == y_ft_a)))
    # Close (±1 each)
    close = int(np.sum((np.abs(ft_h - y_ft_h) <= 1) & (np.abs(ft_a - y_ft_a) <= 1)))
    # Result direction (1X2): the sign of the goal difference encodes
    # home win / draw / away win.
    result_correct = int(np.sum(np.sign(ft_h - ft_a) == np.sign(y_ft_h - y_ft_a)))

    print(f" FT Exact Score: {exact / total * 100:.2f}% ({exact}/{total})")
    print(f" FT Close (±1): {close / total * 100:.2f}% ({close}/{total})")
    print(f" FT Result (1X2): {result_correct / total * 100:.2f}% ({result_correct}/{total})")

    # HT Score
    ht_h, ht_a = _rounded_goals("ht_home"), _rounded_goals("ht_away")
    y_ht_h, y_ht_a = _truth("ht_score_home"), _truth("ht_score_away")

    ht_total = total
    ht_exact = int(np.sum((ht_h == y_ht_h) & (ht_a == y_ht_a)))

    print(f" HT Exact Score: {ht_exact / ht_total * 100:.2f}% ({ht_exact}/{ht_total})")

    return {
        "ft_exact_pct": exact / total * 100,
        "ft_close_pct": close / total * 100,
        "ft_result_pct": result_correct / total * 100,
        "ht_exact_pct": ht_exact / ht_total * 100,
    }
|
||||
|
||||
|
||||
def train():
    """Main training pipeline.

    Loads the training data, splits it temporally, trains one regressor per
    target (FT home/away and HT home/away goals), evaluates the combined score
    accuracy and pickles the models plus metadata to ``MODEL_PATH``.

    Exits the process with status 1 when the split leaves either the training
    or the test portion empty.
    """
    print("🚀 Score Prediction Model Trainer (V25-Compatible)")
    print(f" Feature count: {len(FEATURES)}")
    print("=" * 60)

    # Load data
    df = load_data()
    print(f" Total valid rows: {len(df)}")

    # Temporal split
    train_df, test_df = temporal_split(df)
    print(f" Training set: {len(train_df)} matches")
    print(f" Test set: {len(test_df)} matches (temporally after training)")

    if train_df.empty or test_df.empty:
        # Fitting or scoring on zero rows would only fail later with a less
        # helpful error, so stop here.
        print("❌ Not enough rows to build both a training and a test set")
        sys.exit(1)

    X_train = train_df[FEATURES]
    X_test = test_df[FEATURES]

    # Train 4 models
    models = {}
    metrics = {}

    for target_name, model_key in [
        ("score_home", "ft_home"),
        ("score_away", "ft_away"),
        ("ht_score_home", "ht_home"),
        ("ht_score_away", "ht_away"),
    ]:
        model, metric = train_single_model(
            X_train, train_df[target_name],
            X_test, test_df[target_name],
            model_key,
        )
        models[model_key] = model
        metrics[model_key] = metric

    # Combined evaluation
    y_test_dict = {t: test_df[t] for t in TARGETS}
    combined = evaluate_combined(models, X_test, y_test_dict)

    # Save
    print(f"\n💾 Saving to {MODEL_PATH}...")
    model_data = {
        "home_model": models["ft_home"],
        "away_model": models["ft_away"],
        "ht_home_model": models["ht_home"],
        "ht_away_model": models["ht_away"],
        "features": FEATURES,
        "meta": {
            # Flatten per-model metrics into keys such as "ft_home_mae".
            **{f"{k}_{mk}": mv for k, m in metrics.items() for mk, mv in m.items()},
            **combined,
            "trained_at": datetime.now().isoformat(),
            "feature_count": len(FEATURES),
            "train_size": len(train_df),
            "test_size": len(test_df),
        },
    }

    # The models directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model_data, f)

    print("\n✅ Score model training complete!")
    print(f" Saved: {MODEL_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train()
|
||||
|
||||
Reference in New Issue
Block a user