452 lines
15 KiB
Python
452 lines
15 KiB
Python
"""
|
|
V25 Model Trainer - NO TARGET LEAKAGE
|
|
=====================================
|
|
Training script for V25 ensemble model.
|
|
|
|
CRITICAL: This version removes total_goals and ht_total_goals features
|
|
to prevent target leakage. These features are only known AFTER the match ends.
|
|
|
|
Usage:
|
|
python scripts/train_v25_clean.py
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import pickle
|
|
import numpy as np
|
|
import pandas as pd
|
|
import xgboost as xgb
|
|
import lightgbm as lgb
|
|
from datetime import datetime
|
|
from sklearn.metrics import accuracy_score, log_loss, classification_report
|
|
|
|
# Engine root: the parent of the directory that contains this file.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the engine root first on the import path.
sys.path.insert(0, AI_ENGINE_DIR)

# Input data and output locations, all resolved relative to the engine root.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

# Output directories are created eagerly, at import time.
for _output_dir in (MODELS_DIR, REPORTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
|
|
|
|
# Feature columns - NO TARGET LEAKAGE.
# Every column listed here is available BEFORE the match starts.
# total_goals and ht_total_goals were REMOVED: they are only known AFTER the
# match ends, so using them would leak the target.

# ELO features (8)
_ELO_FEATURES = (
    "home_overall_elo", "away_overall_elo", "elo_diff",
    "home_home_elo", "away_away_elo",
    "home_form_elo", "away_form_elo", "form_elo_diff",
)

# Form features (12)
_FORM_FEATURES = (
    "home_goals_avg", "home_conceded_avg",
    "away_goals_avg", "away_conceded_avg",
    "home_clean_sheet_rate", "away_clean_sheet_rate",
    "home_scoring_rate", "away_scoring_rate",
    "home_winning_streak", "away_winning_streak",
    "home_unbeaten_streak", "away_unbeaten_streak",
)

# Head-to-head features (6)
_H2H_FEATURES = (
    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
)

# Team stats features (8)
_TEAM_STATS_FEATURES = (
    "home_avg_possession", "away_avg_possession",
    "home_avg_shots_on_target", "away_avg_shots_on_target",
    "home_shot_conversion", "away_shot_conversion",
    "home_avg_corners", "away_avg_corners",
)

# Odds and implied-probability features (23) - market wisdom
_ODDS_FEATURES = (
    "odds_ms_h", "odds_ms_d", "odds_ms_a",
    "implied_home", "implied_draw", "implied_away",
    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
    "odds_ou05_o", "odds_ou05_u",
    "odds_ou15_o", "odds_ou15_u",
    "odds_ou25_o", "odds_ou25_u",
    "odds_ou35_o", "odds_ou35_u",
    "odds_ht_ou05_o", "odds_ht_ou05_u",
    "odds_ht_ou15_o", "odds_ht_ou15_u",
    "odds_btts_y", "odds_btts_n",
)

# Odds availability flags (20) - one per raw odds column above
_ODDS_PRESENCE_FEATURES = (
    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
    "odds_ou05_o_present", "odds_ou05_u_present",
    "odds_ou15_o_present", "odds_ou15_u_present",
    "odds_ou25_o_present", "odds_ou25_u_present",
    "odds_ou35_o_present", "odds_ou35_u_present",
    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
    "odds_btts_y_present", "odds_btts_n_present",
)

# League features (4)
_LEAGUE_FEATURES = (
    "home_xga", "away_xga",
    "league_avg_goals", "league_zero_goal_rate",
)

# Upset engine (4)
_UPSET_FEATURES = (
    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
)

# Referee engine (5)
_REFEREE_FEATURES = (
    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
    "referee_avg_yellow", "referee_experience",
)

# Momentum engine (3)
_MOMENTUM_FEATURES = (
    "home_momentum_score", "away_momentum_score", "momentum_diff",
)

# Squad features (9)
_SQUAD_FEATURES = (
    "home_squad_quality", "away_squad_quality", "squad_diff",
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
)

# Column order matters: it defines the column order of the training matrices.
FEATURES = [
    *_ELO_FEATURES,
    *_FORM_FEATURES,
    *_H2H_FEATURES,
    *_TEAM_STATS_FEATURES,
    *_ODDS_FEATURES,
    *_ODDS_PRESENCE_FEATURES,
    *_LEAGUE_FEATURES,
    *_UPSET_FEATURES,
    *_REFEREE_FEATURES,
    *_MOMENTUM_FEATURES,
    *_SQUAD_FEATURES,
]

print(f"[INFO] Total features: {len(FEATURES)}")
|
|
|
|
# One entry per betting market to train: the label column in the training
# CSV, the short market name used in logs and model file names, and the
# number of classes of the label.
MARKET_CONFIGS = [
    {"target": target, "name": name, "num_class": num_class}
    for target, name, num_class in (
        ("label_ms", "MS", 3),
        ("label_ou15", "OU15", 2),
        ("label_ou25", "OU25", 2),
        ("label_ou35", "OU35", 2),
        ("label_btts", "BTTS", 2),
        ("label_ht_result", "HT_RESULT", 3),
        ("label_ht_ou05", "HT_OU05", 2),
        ("label_ht_ou15", "HT_OU15", 2),
        ("label_ht_ft", "HTFT", 9),
        ("label_odd_even", "ODD_EVEN", 2),
        ("label_cards_ou45", "CARDS_OU45", 2),
        ("label_handicap_ms", "HANDICAP_MS", 3),
    )
]
|
|
|
|
|
|
def load_data():
    """Load the training CSV and prepare its feature columns.

    Steps:
      1. Exit the process with status 1 when ``DATA_PATH`` does not exist.
      2. Read the CSV and replace missing values with 0 in every ``FEATURES``
         column that is present in the file.
      3. For CSVs that lack the ``*_present`` odds availability flags, derive
         each missing flag from its odds column: 1.0 where the odds value
         parses as a number greater than 1.01, else 0.0. When the odds column
         itself is absent the flag is 0.0 for every row.

    Returns:
        pandas.DataFrame: the loaded frame, including any derived flag columns.
    """
    if not os.path.exists(DATA_PATH):
        print(f"[ERROR] Data file not found: {DATA_PATH}")
        print("[INFO] Run extract_training_data.py first to generate training data")
        sys.exit(1)

    print(f"[INFO] Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Fill NaN values
    for col in FEATURES:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Backward-compatible derivation for older CSVs without odds availability flags.
    odds_flag_sources = {
        "odds_ms_h_present": "odds_ms_h",
        "odds_ms_d_present": "odds_ms_d",
        "odds_ms_a_present": "odds_ms_a",
        "odds_ht_ms_h_present": "odds_ht_ms_h",
        "odds_ht_ms_d_present": "odds_ht_ms_d",
        "odds_ht_ms_a_present": "odds_ht_ms_a",
        "odds_ou05_o_present": "odds_ou05_o",
        "odds_ou05_u_present": "odds_ou05_u",
        "odds_ou15_o_present": "odds_ou15_o",
        "odds_ou15_u_present": "odds_ou15_u",
        "odds_ou25_o_present": "odds_ou25_o",
        "odds_ou25_u_present": "odds_ou25_u",
        "odds_ou35_o_present": "odds_ou35_o",
        "odds_ou35_u_present": "odds_ou35_u",
        "odds_ht_ou05_o_present": "odds_ht_ou05_o",
        "odds_ht_ou05_u_present": "odds_ht_ou05_u",
        "odds_ht_ou15_o_present": "odds_ht_ou15_o",
        "odds_ht_ou15_u_present": "odds_ht_ou15_u",
        "odds_btts_y_present": "odds_btts_y",
        "odds_btts_n_present": "odds_btts_n",
    }
    for flag_col, odds_col in odds_flag_sources.items():
        if flag_col in df.columns:
            continue
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # The odds column is missing as well. A scalar fallback cannot be
            # passed through to_numeric(...).fillna(...) (scalars have no
            # .fillna), so mark the odds as unavailable for every row.
            df[flag_col] = 0.0

    print(f"[INFO] Shape: {df.shape}")
    print(f"[INFO] Columns: {list(df.columns)}")
    return df
|
|
|
|
|
|
def temporal_split(valid_df: pd.DataFrame):
    """Split rows chronologically into train / validation / test frames.

    Rows are ordered by the ``mst_utc`` column and cut at roughly 70% and 85%
    of the row count, so every validation row is no earlier than any training
    row and every test row is no earlier than any validation row.

    Args:
        valid_df: frame containing an ``mst_utc`` column to order by.

    Returns:
        tuple: ``(train_df, val_df, test_df)`` as independent copies with a
        fresh 0-based index taken from the sorted frame. The three slices
        never overlap; for very small inputs the validation and/or test
        frame may be empty.

    Raises:
        KeyError: if ``valid_df`` has no ``mst_utc`` column.
    """
    # A stable sort keeps rows with identical timestamps in their input
    # order, so the split boundaries are reproducible from run to run.
    ordered = valid_df.sort_values("mst_utc", kind="mergesort").reset_index(drop=True)
    n = len(ordered)

    train_end = max(int(n * 0.70), 1)
    val_end = max(int(n * 0.85), train_end + 1)
    # Leave at least one row for the test slice when possible ...
    val_end = min(val_end, n - 1)
    # ... but never let the cut fall before the end of the training slice,
    # otherwise the test slice would overlap the training rows (n <= 1).
    val_end = max(val_end, train_end)

    train_df = ordered.iloc[:train_end].copy()
    val_df = ordered.iloc[train_end:val_end].copy()
    test_df = ordered.iloc[val_end:].copy()

    return train_df, val_df, test_df
|
|
|
|
|
|
def train_xgboost_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit an XGBoost booster for one market, stopping early on validation loss.

    Args:
        X_train, y_train: training feature matrix and labels.
        X_val, y_val: validation feature matrix and labels; listed last in
            ``evals`` so early stopping monitors them.
        num_class: number of label classes; values above 2 select the
            multi-class objective, otherwise the binary one is used.
        market_name: market label, used only in log output.

    Returns:
        The trained ``xgb.Booster``.
    """
    print(f"\n[INFO] Training XGBoost for {market_name}...")

    multiclass = num_class > 2
    if multiclass:
        objective, eval_metric = "multi:softprob", "mlogloss"
    else:
        objective, eval_metric = "binary:logistic", "logloss"

    booster_params = {
        "objective": objective,
        "eval_metric": eval_metric,
        "max_depth": 6,
        "eta": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "gamma": 0.1,
        "n_jobs": 4,
        "random_state": 42,
    }
    # num_class is only passed for the multi-class objective.
    if multiclass:
        booster_params["num_class"] = num_class

    train_matrix = xgb.DMatrix(X_train, label=y_train)
    val_matrix = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'val')]

    history = {}
    booster = xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        evals_result=history,
        verbose_eval=100,
    )

    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {booster.best_score:.4f}")

    return booster
|
|
|
|
|
|
def train_lightgbm_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit a LightGBM booster for one market, stopping early on validation loss.

    Args:
        X_train, y_train: training feature matrix and labels.
        X_val, y_val: validation feature matrix and labels.
        num_class: number of label classes; values above 2 select the
            multi-class objective, otherwise the binary one is used.
        market_name: market label, used only in log output.

    Returns:
        The trained ``lgb.Booster``.
    """
    print(f"\n[INFO] Training LightGBM for {market_name}...")

    multiclass = num_class > 2
    if multiclass:
        objective, metric = "multiclass", "multi_logloss"
    else:
        objective, metric = "binary", "binary_logloss"

    booster_params = {
        "objective": objective,
        "metric": metric,
        "max_depth": 6,
        "learning_rate": 0.05,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "min_child_samples": 20,
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
    }
    # num_class is only passed for the multi-class objective.
    if multiclass:
        booster_params["num_class"] = num_class

    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    stop_and_log = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ]
    booster = lgb.train(
        booster_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'val'],
        callbacks=stop_and_log,
    )

    best_val_score = booster.best_score['val'][booster_params['metric']]
    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {best_val_score:.4f}")

    return booster
|
|
|
|
|
|
def evaluate_model(model, X_test, y_test, model_type='xgb', num_class=3):
    """Score a trained booster on the held-out test set and print a report.

    Args:
        model: trained XGBoost booster (``model_type='xgb'``) or LightGBM
            booster (any other ``model_type``).
        X_test: test feature matrix.
        y_test: integer class labels for the test rows.
        model_type: ``'xgb'`` selects the XGBoost prediction path; every
            other value is treated as LightGBM.
        num_class: kept for interface compatibility; the number of classes is
            taken from the width of the predicted probability matrix.

    Returns:
        tuple: ``(probs, acc, loss)`` where ``probs`` has one column per
        class (binary outputs are expanded to two columns), ``acc`` is the
        accuracy and ``loss`` the log loss on the test set.
    """
    if model_type == 'xgb':
        dtest = xgb.DMatrix(X_test)
        # Predict with the early-stopped best iteration when it is recorded,
        # mirroring the LightGBM branch, instead of also using the rounds
        # trained after the best validation score.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is not None:
            probs = model.predict(dtest, iteration_range=(0, best_iteration + 1))
        else:
            probs = model.predict(dtest)
    else:  # lgb
        probs = model.predict(X_test, num_iteration=model.best_iteration)

    if len(probs.shape) == 1:
        # Binary classification: expand P(class 1) into [P(0), P(1)] columns.
        probs = np.column_stack([1 - probs, probs])

    preds = np.argmax(probs, axis=1)

    acc = accuracy_score(y_test, preds)
    # Pass the full label set explicitly: a test window that happens to lack
    # one of the classes would otherwise make log_loss reject the
    # probability matrix for having more columns than observed labels.
    class_labels = list(range(probs.shape[1]))
    loss = log_loss(y_test, probs, labels=class_labels)

    print(f"\n[RESULTS] Test Results:")
    print(f" Accuracy: {acc:.4f}")
    print(f" Log Loss: {loss:.4f}")

    # Per-class metrics
    print("\n[REPORT] Classification Report:")
    print(classification_report(y_test, preds))

    return probs, acc, loss
|
|
|
|
|
|
def train_market(df, target_col, market_name, num_class=3):
    """Train, evaluate and save the XGBoost/LightGBM pair for one market.

    Rows whose target is missing or an empty string are dropped, the rest are
    split chronologically, both models are trained with early stopping,
    evaluated individually and as an equal-weight probability average, and
    written to ``MODELS_DIR``.

    Args:
        df: full training frame as returned by ``load_data``.
        target_col: name of the label column for this market.
        market_name: short market name, used in logs and model file names.
        num_class: number of label classes for this market.

    Returns:
        tuple: ``(xgb_model, lgb_model, metrics)`` on success, where
        ``metrics`` is a JSON-serialisable dict of sample counts and scores.
        ``(None, None, None)`` when fewer than 100 valid rows are available.
    """
    print(f"\n{'='*60}")
    print(f"[MARKET] Training {market_name}")
    print(f"{'='*60}")

    # Filter valid rows
    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")

    if len(valid_df) < 100:
        print(f"[ERROR] Not enough data for {market_name}")
        # Same arity as the success path, so callers that unpack three
        # values do not crash when a market is skipped.
        return None, None, None

    # Prepare features
    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Available features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, test_df = temporal_split(valid_df)
    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_test = test_df[available_features].values
    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values

    print(
        f"[INFO] Temporal split -> Train: {len(X_train)},"
        f" Val: {len(X_val)}, Test: {len(X_test)}"
    )
    print(
        f"[INFO] Time windows -> train_end={int(train_df['mst_utc'].max())},"
        f" val_end={int(val_df['mst_utc'].max())},"
        f" test_end={int(test_df['mst_utc'].max())}"
    )

    # Train XGBoost
    xgb_model = train_xgboost_model(X_train, y_train, X_val, y_val, num_class, market_name)

    # Train LightGBM
    lgb_model = train_lightgbm_model(X_train, y_train, X_val, y_val, num_class, market_name)

    # Evaluate
    print("\n[INFO] XGBoost Evaluation:")
    xgb_probs, xgb_acc, xgb_loss = evaluate_model(xgb_model, X_test, y_test, 'xgb', num_class)

    print("\n[INFO] LightGBM Evaluation:")
    lgb_probs, lgb_acc, lgb_loss = evaluate_model(lgb_model, X_test, y_test, 'lgb', num_class)

    # Ensemble evaluation: equal-weight average of the two probability matrices.
    ensemble_probs = (xgb_probs + lgb_probs) / 2
    ensemble_preds = np.argmax(ensemble_probs, axis=1)
    ensemble_acc = accuracy_score(y_test, ensemble_preds)
    # Explicit labels keep log_loss working when the test window lacks a class.
    class_labels = list(range(ensemble_probs.shape[1]))
    ensemble_loss = log_loss(y_test, ensemble_probs, labels=class_labels)

    print(f"\n[INFO] Ensemble Evaluation:")
    print(f" Accuracy: {ensemble_acc:.4f}")
    print(f" Log Loss: {ensemble_loss:.4f}")

    # Save models
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
    xgb_model.save_model(xgb_path)
    print(f"[OK] XGBoost saved: {xgb_path}")

    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[OK] LightGBM saved: {lgb_path}")

    # Plain Python types only, so the dict can be dumped to JSON as-is.
    metrics = {
        "samples": int(len(valid_df)),
        "features_used": available_features,
        "train_samples": int(len(X_train)),
        "val_samples": int(len(X_val)),
        "test_samples": int(len(X_test)),
        "xgb_accuracy": round(float(xgb_acc), 4),
        "xgb_logloss": round(float(xgb_loss), 4),
        "lgb_accuracy": round(float(lgb_acc), 4),
        "lgb_logloss": round(float(lgb_loss), 4),
        "ensemble_accuracy": round(float(ensemble_acc), 4),
        "ensemble_logloss": round(float(ensemble_loss), 4),
        "class_count": int(num_class),
    }

    return xgb_model, lgb_model, metrics
|
|
|
|
|
|
def main():
    """Run the V25 training pipeline end to end.

    Loads the training data, trains every market in ``MARKET_CONFIGS`` whose
    target column exists in the data, writes the feature list to
    ``MODELS_DIR`` and the per-market metrics to ``REPORTS_DIR``, then prints
    a summary of which models were produced.
    """
    rule = "="*60
    stamp_format = '%Y-%m-%d %H:%M:%S'

    print(rule)
    print("V25 Model Training - NO TARGET LEAKAGE")
    print(rule)
    print(f"[INFO] Started at: {datetime.now().strftime(stamp_format)}")

    # Load data
    df = load_data()

    target_cols = [name for name in df.columns if name.startswith('label_')]
    print(f"\n[INFO] Available targets: {target_cols}")

    # Per-market flags for the console summary, and the metrics report
    # that is written to disk.
    results = {}
    reports = {
        "trained_at": datetime.now().strftime(stamp_format),
        "market_results": {},
    }

    for market in MARKET_CONFIGS:
        label_column = market["target"]
        market_name = market["name"]
        class_count = market["num_class"]

        if label_column not in df.columns:
            print(f"[SKIP] {market_name}: missing target column {label_column}")
            continue

        xgb_model, lgb_model, metrics = train_market(
            df, label_column, market_name, num_class=class_count
        )
        results[market_name] = {
            'xgb': xgb_model is not None,
            'lgb': lgb_model is not None,
        }
        reports["market_results"][market_name] = metrics

    # Save feature list
    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, 'w') as feature_file:
        json.dump(FEATURES, feature_file, indent=2)
    print(f"\n[OK] Feature list saved: {feature_path}")

    report_path = os.path.join(REPORTS_DIR, "v25_market_metrics.json")
    with open(report_path, "w") as report_file:
        json.dump(reports, report_file, indent=2)
    print(f"[OK] Metrics report saved: {report_path}")

    # Summary
    print("\n" + rule)
    print("[SUMMARY] Training Results")
    print(rule)
    for market_name, trained in results.items():
        print(f" {market_name}: XGB={trained['xgb']}, LGB={trained['lgb']}")

    print(f"\n[INFO] Completed at: {datetime.now().strftime(stamp_format)}")
    print("[OK] V25 Training Complete!")


if __name__ == "__main__":
    main()
|