247 lines
7.9 KiB
Python
Executable File
247 lines
7.9 KiB
Python
Executable File
"""
XGBoost Market Model Trainer
============================

Trains specialized XGBoost models for each betting market.
Includes 'Surprise Hunter' logic for HT/FT reversals (1/2, 2/1).

Models:
    1. MS (1X2) - Multi-class
    2. Over/Under 2.5 - Binary
    3. BTTS - Binary
    4. HT/FT - Multi-class (Imbalanced learning for 1/2, 2/1)
    5. Other line variants (1.5, 3.5, etc.)
    6. Half-Time 1X2 - Multi-class
    7. Half-Time Over/Under (0.5, 1.5) - Binary
    8. Handicap MS - Multi-class, Cards Over/Under 4.5 - Binary

Usage:
    python3 scripts/train_xgboost_markets.py
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import pickle
|
|
import numpy as np
|
|
import pandas as pd
|
|
import xgboost as xgb
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import accuracy_score, log_loss, classification_report, roc_auc_score
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
# Config
# The project root is resolved as the directory two levels above this file.
_THIS_FILE = os.path.abspath(__file__)
_THIS_DIR = os.path.dirname(_THIS_FILE)
AI_ENGINE_DIR = os.path.dirname(_THIS_DIR)

# Input CSV and output directory, both anchored at the project root.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")

# Create the output directory up front so the model writers can assume it exists.
os.makedirs(MODELS_DIR, exist_ok=True)
|
|
|
|
# Feature Columns (Must match extraction + inference)
# Order matters: this list selects and orders the model input columns.
FEATURES = [
    # ELO
    "home_overall_elo",
    "away_overall_elo",
    "elo_diff",
    "home_home_elo",
    "away_away_elo",
    "form_elo_diff",
    # Form
    "home_goals_avg",
    "home_conceded_avg",
    "away_goals_avg",
    "away_conceded_avg",
    "home_clean_sheet_rate",
    "away_clean_sheet_rate",
    "home_scoring_rate",
    "away_scoring_rate",
    "home_winning_streak",
    "away_winning_streak",
    # H2H
    "h2h_home_win_rate",
    "h2h_draw_rate",
    "h2h_avg_goals",
    "h2h_btts_rate",
    "h2h_over25_rate",
    # Stats
    "home_avg_possession",
    "away_avg_possession",
    "home_avg_shots_on_target",
    "away_avg_shots_on_target",
    "home_shot_conversion",
    "away_shot_conversion",
    # Odds (Implicit market wisdom): full-time 1X2
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
    "implied_home",
    "implied_draw",
    "implied_away",
    # Odds: half-time 1X2
    "odds_ht_ms_h",
    "odds_ht_ms_d",
    "odds_ht_ms_a",
    # Odds: full-time over/under lines
    "odds_ou05_o",
    "odds_ou05_u",
    "odds_ou15_o",
    "odds_ou15_u",
    "odds_ou25_o",
    "odds_ou25_u",
    "odds_ou35_o",
    "odds_ou35_u",
    # Odds: half-time over/under lines
    "odds_ht_ou05_o",
    "odds_ht_ou05_u",
    "odds_ht_ou15_o",
    "odds_ht_ou15_u",
    # Odds: both teams to score
    "odds_btts_y",
    "odds_btts_n",
    # League/Context
    "league_avg_goals",
    "league_zero_goal_rate",
    "home_xga",
    "away_xga",
    # Upset Engine
    "upset_atmosphere",
    "upset_motivation",
    "upset_fatigue",
    "upset_potential",
    # Referee Engine
    "referee_home_bias",
    "referee_avg_goals",
    "referee_cards_total",
    "referee_avg_yellow",
    "referee_experience",
    # Momentum Engine
    "home_momentum_score",
    "away_momentum_score",
    "momentum_diff",
]
|
|
|
|
def load_data():
    """
    Load the training CSV and impute missing feature values.

    Reads DATA_PATH into a DataFrame and replaces missing values with 0 in
    the model input columns only (the FEATURES entries present in the file).
    Every other column, label columns included, keeps its NaNs: zero-filling
    a label would silently turn "outcome unknown" into class 0 and defeat the
    per-target ``notna()`` row filter applied by ``train_model``.

    Returns:
        pandas.DataFrame: the loaded data with imputed feature columns.

    Exits the process with status 1 when DATA_PATH does not exist.
    """
    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        sys.exit(1)

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Handle missing values - simple imputation for robustness.
    # Restricted to feature columns so missing labels stay detectable.
    feature_cols = [col for col in FEATURES if col in df.columns]
    if feature_cols:
        df[feature_cols] = df[feature_cols].fillna(0)

    print(f" Shape: {df.shape}")
    return df
|
|
|
|
def train_model(df, target_col, model_name, objective, metric, num_class=None, class_weights=None):
    """
    Train, evaluate and persist one XGBoost classifier for a single market.

    Supports binary and multi-class targets, and sample weighting for
    imbalanced classes (like 1/2 reversals).

    Args:
        df: DataFrame holding the FEATURES columns and the label column.
        target_col: Name of the label column; values are cast to int.
        model_name: Base file name (without extension) of the saved artifacts.
        objective: XGBoost objective. "multi:softprob" selects the
            multi-class evaluation path; anything else is evaluated as binary.
        metric: XGBoost ``eval_metric`` monitored for early stopping.
        num_class: Number of classes, forwarded to XGBoost when given.
        class_weights: Optional ``{class_id: weight}`` mapping converted into
            per-row training weights; classes missing from it get 1.0.

    Returns:
        None. Writes "<model_name>.json" (raw booster) and "<model_name>.pkl"
        (pickled sklearn wrapper) into MODELS_DIR. Returns early, without
        training, when the label column is absent or has no non-null rows.
    """
    print(f"\n🚀 Training {model_name} (Target: {target_col})...")

    # A market whose label column was never extracted is skipped instead of
    # raising KeyError, so the remaining markets still get trained.
    if target_col not in df.columns:
        print(f" ⚠️ Column {target_col} not found in data, skipping.")
        return

    # Filter valid rows for this target
    valid_df = df[df[target_col].notna()].copy()
    if valid_df.empty:
        print(f" ⚠️ No valid data for {target_col}, skipping.")
        return

    X = valid_df[FEATURES]
    y = valid_df[target_col].astype(int)

    # Split (stratified so rare classes appear on both sides where possible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Sample Weights (For HT/FT Surprise)
    sample_weights_train = None
    if class_weights:
        print(" ⚖️ Applying class weights for surprise detection...")
        sample_weights_train = y_train.map(class_weights).fillna(1.0)

    # Model Params
    params = {
        'objective': objective,
        'eval_metric': metric,
        'eta': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'nthread': 4,
        'seed': 42
    }

    if num_class:
        params['num_class'] = num_class

    # Train using Scikit-Learn Wrapper so we can pickle it cleanly for v20_ensemble.
    # The same construction serves binary and multi-class objectives.
    model = xgb.XGBClassifier(**params, n_estimators=1000, early_stopping_rounds=50)

    # Fit with early stopping.
    # NOTE(review): the held-out split doubles as the early-stopping eval set,
    # so the metrics printed below are optimistic; consider a separate
    # validation split if unbiased numbers are needed.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Evaluation
    preds = model.predict_proba(X_test)

    if objective == "multi:softprob":
        y_pred_class = np.argmax(preds, axis=1)
        acc = accuracy_score(y_test, y_pred_class)
        # Pass the full label set explicitly: a rare class can be absent from
        # y_test, and log_loss would otherwise infer fewer labels than preds
        # has columns and raise.
        loss = log_loss(y_test, preds, labels=np.arange(preds.shape[1]))
        print(f" ✅ Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")

        # Detailed report for important classes
        print(classification_report(y_test, y_pred_class))

    else:
        # Binary: column 1 holds the probability of the positive class
        class_1_preds = preds[:, 1]
        y_pred_class = (class_1_preds > 0.5).astype(int)
        acc = accuracy_score(y_test, y_pred_class)
        auc = roc_auc_score(y_test, class_1_preds)
        print(f" ✅ Accuracy: {acc:.4f} | AUC: {auc:.4f}")

    # Save raw json booster
    model_json_path = os.path.join(MODELS_DIR, f"{model_name}.json")
    model.get_booster().save_model(model_json_path)

    # Save sklearn wrapped PKL (What v20_ensemble actually loads for Uncalibrated models like ht_ft!)
    model_pkl_path = os.path.join(MODELS_DIR, f"{model_name}.pkl")
    with open(model_pkl_path, "wb") as f:
        pickle.dump(model, f)

    print(f" 💾 Model saved to {model_json_path} and {model_pkl_path}")
|
|
|
|
def main():
    """
    Load the training data and train every market model in sequence.

    Each entry of the job table below becomes one ``train_model`` call, in
    table order. Only the HT/FT model is trained with class weights.
    """
    df = load_data()

    binary = ("binary:logistic", "logloss")
    multiclass = ("multi:softprob", "mlogloss")

    # HT/FT SURPRISE HUNTER
    # Classes: 0=1/1, 1=1/X, 2=1/2(HOME->AWAY), 3=X/1 ... 6=2/1(AWAY->HOME) ...
    # We give HUGE weight to 2 (1/2) and 6 (2/1)
    htft_weights = {
        0: 1.0,   # 1/1
        1: 3.0,   # 1/X
        2: 15.0,  # 1/2 (Reversal!)
        3: 2.0,   # X/1
        4: 2.0,   # X/X
        5: 2.0,   # X/2
        6: 15.0,  # 2/1 (Reversal!)
        7: 3.0,   # 2/X
        8: 1.0,   # 2/2
    }

    # (label column, model name, (objective, metric), num_class, class_weights)
    jobs = [
        # 1. Match Result (1X2)
        ("label_ms", "xgb_ms", multiclass, 3, None),
        # 2. Over/Under 2.5
        ("label_ou25", "xgb_ou25", binary, None, None),
        # 3. BTTS
        ("label_btts", "xgb_btts", binary, None, None),
        # 4. HT/FT with reversal-heavy class weights
        ("label_ht_ft", "xgb_ht_ft", multiclass, 9, htft_weights),
        # 5. Over/Under 1.5 & 3.5 (Optional utility models)
        ("label_ou15", "xgb_ou15", binary, None, None),
        ("label_ou35", "xgb_ou35", binary, None, None),
        # 6. Half-Time 1X2
        ("label_ht_result", "xgb_ht_result", multiclass, 3, None),
        # 7. Half-Time Over/Under
        ("label_ht_ou05", "xgb_ht_ou05", binary, None, None),
        ("label_ht_ou15", "xgb_ht_ou15", binary, None, None),
        # 8. Handicap MS and Cards
        ("label_handicap_ms", "xgb_handicap_ms", multiclass, 3, None),
        ("label_cards_ou45", "xgb_cards_ou45", binary, None, None),
    ]

    for label_col, name, (obj, eval_metric), n_classes, weights in jobs:
        train_model(
            df, label_col, name,
            objective=obj, metric=eval_metric,
            num_class=n_classes, class_weights=weights,
        )

    print("\n✅ All models trained successfully!")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|