This commit is contained in:
Executable
+222
@@ -0,0 +1,222 @@
|
||||
"""
V20 Pro Model Trainer
=====================
Advanced training pipeline for the Suggest-Bet V20 ensemble.

Features:
1. Optuna hyperparameter optimization
2. Stratified K-fold cross-validation
3. Probability calibration (isotonic regression)
4. Market-specific weight handling for reversals (1/2, 2/1)

Usage:
    python3 scripts/train_xgboost_pro.py
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
import optuna
|
||||
from optuna.samplers import TPESampler
|
||||
from sklearn.model_selection import StratifiedKFold, train_test_split
|
||||
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss, classification_report
|
||||
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Config
# Every path is anchored at the directory two levels above this script.
_SCRIPT_PATH = os.path.abspath(__file__)
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(_SCRIPT_PATH))

DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v20")

# Output directories are created eagerly at import time.
for _output_dir in (MODELS_DIR, REPORTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
|
||||
|
||||
# Feature Columns (Must match extraction + inference)
# The groups below are concatenated in this exact order; the resulting column
# order is what the models are trained on, so do not reorder entries.
_ELO_FEATURES = [
    "home_overall_elo",
    "away_overall_elo",
    "elo_diff",
    "home_home_elo",
    "away_away_elo",
    "form_elo_diff",
]

_FORM_FEATURES = [
    "home_goals_avg",
    "home_conceded_avg",
    "away_goals_avg",
    "away_conceded_avg",
    "home_clean_sheet_rate",
    "away_clean_sheet_rate",
    "home_scoring_rate",
    "away_scoring_rate",
    "home_winning_streak",
    "away_winning_streak",
]

_H2H_FEATURES = [
    "h2h_home_win_rate",
    "h2h_draw_rate",
    "h2h_avg_goals",
    "h2h_btts_rate",
    "h2h_over25_rate",
]

_STATS_FEATURES = [
    "home_avg_possession",
    "away_avg_possession",
    "home_avg_shots_on_target",
    "away_avg_shots_on_target",
    "home_shot_conversion",
    "away_shot_conversion",
]

# Odds (Implicit market wisdom)
_ODDS_FEATURES = [
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
    "implied_home",
    "implied_draw",
    "implied_away",
]

_LEAGUE_CONTEXT_FEATURES = [
    "league_avg_goals",
    "league_zero_goal_rate",
    "home_xga",
    "away_xga",
]

FEATURES = [
    *_ELO_FEATURES,
    *_FORM_FEATURES,
    *_H2H_FEATURES,
    *_STATS_FEATURES,
    *_ODDS_FEATURES,
    *_LEAGUE_CONTEXT_FEATURES,
]
|
||||
|
||||
def load_data():
    """Load the training CSV at ``DATA_PATH`` and impute missing feature values.

    Only columns listed in ``FEATURES`` are zero-filled. Label columns keep
    their NaNs so that ``MarketTrainer`` can drop rows whose target is missing;
    filling the whole frame would silently turn those rows into class 0.

    Returns:
        pandas.DataFrame: the loaded frame with NaNs in feature columns
        replaced by 0.

    Exits the process with status 1 when the data file does not exist.
    """
    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        sys.exit(1)

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Restrict imputation to feature columns that are actually present; a
    # missing feature column still surfaces later when the trainer selects
    # df[FEATURES].
    feature_cols = [col for col in FEATURES if col in df.columns]
    df[feature_cols] = df[feature_cols].fillna(0)

    print(f" Shape: {df.shape}")
    return df
|
||||
|
||||
class MarketTrainer:
    """Tune, calibrate and persist an XGBoost classifier for a single market.

    On construction, rows with a missing target are dropped and the remaining
    rows are split into a training set and a 15% stratified hold-out set.
    ``optimize`` searches hyperparameters with Optuna using 5-fold stratified
    cross-validation on the training set; ``train_final`` fits an
    isotonic-calibrated model, reports hold-out metrics and pickles the model
    into ``MODELS_DIR``.
    """

    def __init__(self, df, target_col, market_name, is_multi=False, num_class=None, weights=None):
        """Prepare features, labels and the hold-out split.

        Args:
            df: DataFrame containing every column in ``FEATURES`` and ``target_col``.
            target_col: Name of the label column; rows where it is NaN are dropped.
            market_name: Label used in log output and in the saved model's file name.
            is_multi: True for a multi-class objective, False for binary.
            num_class: Number of classes; forwarded to XGBoost when ``is_multi``.
            weights: Optional mapping from class label to sample weight; labels
                absent from the mapping get weight 1.0.
        """
        self.df = df[df[target_col].notna()].copy()
        self.target_col = target_col
        self.market_name = market_name
        self.is_multi = is_multi
        self.num_class = num_class
        self.weights = weights

        self.X = self.df[FEATURES]
        self.y = self.df[target_col].astype(int)

        # Split for final evaluation hold-out
        self.X_train, self.X_holdout, self.y_train, self.y_holdout = train_test_split(
            self.X, self.y, test_size=0.15, random_state=42, stratify=self.y
        )

    def _core_params(self):
        """Return the objective/metric parameters shared by tuning and final training."""
        params = {
            "objective": "multi:softprob" if self.is_multi else "binary:logistic",
            "eval_metric": "mlogloss" if self.is_multi else "logloss",
        }
        if self.is_multi:
            params["num_class"] = self.num_class
        return params

    def _sample_weights(self, y):
        """Map labels in ``y`` to per-sample weights, or return None when no weights are set."""
        if not self.weights:
            return None
        return y.map(self.weights).fillna(1.0)

    def optimize(self, n_trials=50):
        """Run an Optuna study minimizing cross-validated log loss.

        Args:
            n_trials: Number of Optuna trials to run.

        Returns:
            dict: the best hyperparameters found by the study.
        """
        print(f"\n🔍 Tuning {self.market_name} with Optuna ({n_trials} trials)...")

        study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
        study.optimize(self.objective, n_trials=n_trials)

        print(f" Best params: {study.best_params}")
        print(f" Best Cross-Validation LogLoss: {study.best_value:.4f}")
        return study.best_params

    def objective(self, trial):
        """Optuna objective: mean log loss over 5 stratified folds of the training set.

        Args:
            trial: The Optuna trial used to sample hyperparameters.

        Returns:
            float: mean validation log loss across folds.
        """
        # NOTE(review): n_estimators is tuned while early stopping is active here,
        # but train_final fits without early stopping, so the final model may use
        # more trees than the CV score reflects. Confirm this is intended.
        params = {
            "verbosity": 0,
            **self._core_params(),
            "booster": "gbtree",
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 9),
            "eta": trial.suggest_float("eta", 1e-3, 0.1, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "early_stopping_rounds": 20,
            "n_jobs": 4,
            "random_state": 42,
        }

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        losses = []

        for train_idx, val_idx in skf.split(self.X_train, self.y_train):
            X_t, X_v = self.X_train.iloc[train_idx], self.X_train.iloc[val_idx]
            y_t, y_v = self.y_train.iloc[train_idx], self.y_train.iloc[val_idx]

            # Apply weights if available
            w_t = self._sample_weights(y_t)

            model = xgb.XGBClassifier(**params)
            model.fit(X_t, y_t, sample_weight=w_t, eval_set=[(X_v, y_v)], verbose=False)

            preds = model.predict_proba(X_v)
            # Pass the model's class list explicitly so scoring does not fail
            # when a rare class is absent from this validation fold.
            losses.append(log_loss(y_v, preds, labels=model.classes_))

        return np.mean(losses)

    def train_final(self, best_params):
        """Fit the calibrated final model, report hold-out metrics and save it.

        Args:
            best_params: Hyperparameters (typically from ``optimize``). The dict
                is not modified.

        Returns:
            The fitted ``CalibratedClassifierCV`` instance, which is also
            pickled to ``MODELS_DIR/xgb_<market_name lowercased>.pkl``.
        """
        print(f"🚀 Training final calibrated {self.market_name} model...")

        # Merge into a fresh dict so the caller's hyperparameter dict is left untouched.
        params = {**best_params, **self._core_params()}
        base_model = xgb.XGBClassifier(**params)

        # Sample weights for training
        w_train = self._sample_weights(self.y_train)

        # Calibration using Cross-Validation
        calibrated_model = CalibratedClassifierCV(base_model, method='isotonic', cv=5)
        calibrated_model.fit(self.X_train, self.y_train, sample_weight=w_train)

        # Evaluate on Hold-out
        holdout_preds_raw = calibrated_model.predict_proba(self.X_holdout)
        holdout_preds_class = calibrated_model.predict(self.X_holdout)

        acc = accuracy_score(self.y_holdout, holdout_preds_class)
        # Explicit labels keep the metric well-defined when the hold-out set
        # happens to miss one of the classes the model was trained on.
        loss = log_loss(self.y_holdout, holdout_preds_raw, labels=calibrated_model.classes_)

        print(f"📊 Hold-out Results for {self.market_name}:")
        print(f" Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
        print(classification_report(self.y_holdout, holdout_preds_class))

        # Save model
        model_path = os.path.join(MODELS_DIR, f"xgb_{self.market_name.lower()}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(calibrated_model, f)

        print(f"💾 Calibrated model saved to {model_path}")
        return calibrated_model
|
||||
|
||||
def main():
    """Train every market model in sequence: MS, OU25, BTTS, then HT/FT.

    For each market a ``MarketTrainer`` is built from the loaded data, tuned
    with its own trial budget, and then used to fit and save the final model.
    """
    df = load_data()

    # 4. HT/FT SURPRISE HUNTER class weights
    htft_weights = {
        0: 1.0, 1: 3.0, 2: 20.0,  # 1/1, 1/X, 1/2 (MAX WEIGHT)
        3: 2.0, 4: 2.0, 5: 2.0,
        6: 20.0, 7: 3.0, 8: 1.0,  # 2/1 (MAX WEIGHT)
    }

    # (target column, market name, is_multi, num_class, weights, n_trials)
    market_specs = (
        ("label_ms", "MS", True, 3, None, 50),               # 1. MS (1X2)
        ("label_ou25", "OU25", False, None, None, 30),       # 2. OU 2.5
        ("label_btts", "BTTS", False, None, None, 30),       # 3. BTTS
        ("label_ht_ft", "HT_FT", True, 9, htft_weights, 50),  # 4. HT/FT
    )

    for target_col, market_name, is_multi, num_class, weights, n_trials in market_specs:
        trainer = MarketTrainer(
            df,
            target_col,
            market_name,
            is_multi=is_multi,
            num_class=num_class,
            weights=weights,
        )
        tuned_params = trainer.optimize(n_trials=n_trials)
        trainer.train_final(tuned_params)

    print("\n✅ Advanced V20 Model Training Complete!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user