554 lines
22 KiB
Python
554 lines
22 KiB
Python
"""
|
|
V25 Pro Model Trainer — Optuna + Isotonic Calibration
|
|
=====================================================
|
|
Combines V25's feature set (114 columns in FEATURES) + 12 markets + temporal split
|
|
with Optuna hyperparameter tuning and Isotonic Regression calibration.
|
|
|
|
Usage:
|
|
python scripts/train_v25_pro.py
|
|
python scripts/train_v25_pro.py --markets MS,OU25,BTTS # specific markets
|
|
python scripts/train_v25_pro.py --trials 30 # fewer trials
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import pickle
|
|
import argparse
|
|
import numpy as np
|
|
import pandas as pd
|
|
import xgboost as xgb
|
|
import lightgbm as lgb
|
|
import optuna
|
|
from optuna.samplers import TPESampler
|
|
from datetime import datetime
|
|
from sklearn.metrics import accuracy_score, log_loss, classification_report
|
|
from sklearn.isotonic import IsotonicRegression
|
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
|
|
# Only surface Optuna warnings/errors; per-trial INFO logging is suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Project root is the parent of the directory that contains this script.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the project root first on sys.path.
sys.path.insert(0, AI_ENGINE_DIR)

# Input CSV and output locations, all resolved relative to the project root.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

# Output directories are created eagerly (at import time) if they are missing.
for _output_dir in (MODELS_DIR, REPORTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
|
|
|
|
# ─── Feature Columns (114 features, NO target leakage) ──────────────
# The order of this list is the column order of the matrices handed to the
# boosters (train_market selects columns in this order), so do not reorder.
FEATURES = [
    # ELO (8)
    "home_overall_elo", "away_overall_elo", "elo_diff",
    "home_home_elo", "away_away_elo",
    "home_form_elo", "away_form_elo", "form_elo_diff",
    # Form (12)
    "home_goals_avg", "home_conceded_avg",
    "away_goals_avg", "away_conceded_avg",
    "home_clean_sheet_rate", "away_clean_sheet_rate",
    "home_scoring_rate", "away_scoring_rate",
    "home_winning_streak", "away_winning_streak",
    "home_unbeaten_streak", "away_unbeaten_streak",
    # H2H (6)
    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
    # Team Stats (8)
    "home_avg_possession", "away_avg_possession",
    "home_avg_shots_on_target", "away_avg_shots_on_target",
    "home_shot_conversion", "away_shot_conversion",
    "home_avg_corners", "away_avg_corners",
    # Odds (23: 20 odds_* columns + 3 implied_* columns) + 20 presence flags
    "odds_ms_h", "odds_ms_d", "odds_ms_a",
    "implied_home", "implied_draw", "implied_away",
    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
    "odds_ou05_o", "odds_ou05_u",
    "odds_ou15_o", "odds_ou15_u",
    "odds_ou25_o", "odds_ou25_u",
    "odds_ou35_o", "odds_ou35_u",
    "odds_ht_ou05_o", "odds_ht_ou05_u",
    "odds_ht_ou15_o", "odds_ht_ou15_u",
    "odds_btts_y", "odds_btts_n",
    # One "<odds column>_present" flag per odds_* column above
    # (load_data derives these when the CSV does not already contain them).
    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
    "odds_ou05_o_present", "odds_ou05_u_present",
    "odds_ou15_o_present", "odds_ou15_u_present",
    "odds_ou25_o_present", "odds_ou25_u_present",
    "odds_ou35_o_present", "odds_ou35_u_present",
    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
    "odds_btts_y_present", "odds_btts_n_present",
    # League (4)
    "home_xga", "away_xga",
    "league_avg_goals", "league_zero_goal_rate",
    # Upset Engine (4)
    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
    # Referee Engine (5)
    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
    "referee_avg_yellow", "referee_experience",
    # Momentum (3)
    "home_momentum_score", "away_momentum_score", "momentum_diff",
    # Squad (9)
    "home_squad_quality", "away_squad_quality", "squad_diff",
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
    # Player-Level Features (12)
    "home_lineup_goals_per90", "away_lineup_goals_per90",
    "home_lineup_assists_per90", "away_lineup_assists_per90",
    "home_squad_continuity", "away_squad_continuity",
    "home_top_scorer_form", "away_top_scorer_form",
    "home_avg_player_exp", "away_avg_player_exp",
    "home_goals_diversity", "away_goals_diversity",
]
|
|
|
|
MARKET_CONFIGS = [
|
|
{"target": "label_ms", "name": "MS", "num_class": 3},
|
|
{"target": "label_ou15", "name": "OU15", "num_class": 2},
|
|
{"target": "label_ou25", "name": "OU25", "num_class": 2},
|
|
{"target": "label_ou35", "name": "OU35", "num_class": 2},
|
|
{"target": "label_btts", "name": "BTTS", "num_class": 2},
|
|
{"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
|
|
{"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
|
|
{"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
|
|
{"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
|
|
{"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
|
|
{"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
|
|
{"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
|
|
]
|
|
|
|
|
|
def load_data():
    """Load the training CSV and prepare its feature columns.

    Steps:
      1. Read ``DATA_PATH``; print an error and exit with status 1 if the
         file does not exist.
      2. Replace missing values with 0 in every ``FEATURES`` column that the
         CSV actually contains.
      3. For CSVs without the ``*_present`` odds flags, derive each flag as
         1.0 where the matching odds column is numeric and greater than 1.01,
         else 0.0.

    Returns:
        pandas.DataFrame: the loaded frame with the columns above filled in.
    """
    if not os.path.exists(DATA_PATH):
        print(f"[ERROR] Data not found: {DATA_PATH}")
        sys.exit(1)

    print(f"[INFO] Loading {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    for col in FEATURES:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Derive odds presence flags for older CSVs. Each flag column is named
    # "<odds column>_present".
    odds_columns = (
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    )
    for odds_col in odds_columns:
        flag_col = f"{odds_col}_present"
        if flag_col in df.columns:
            continue
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # The odds column itself is absent. The previous
            # `df.get(odds_col, 0)` handed a scalar to pd.to_numeric, which
            # returns a scalar, so `.fillna` raised AttributeError. With no
            # odds at all the flag is simply "not present".
            df[flag_col] = 0.0

    print(f"[INFO] Shape: {df.shape}, Features: {len(FEATURES)}")
    return df
|
|
|
|
|
|
def temporal_split_4way(valid_df: pd.DataFrame):
    """Split rows chronologically into train / val / cal / test.

    Rows are ordered by the ``mst_utc`` column and the index is reset; the
    first 60% become train, the next 15% val, the next 10% cal and the
    remainder test. Cut points are truncated with ``int``.

    Returns:
        tuple: four independent DataFrame copies ``(train, val, cal, test)``.
    """
    timeline = valid_df.sort_values("mst_utc").reset_index(drop=True)
    total = len(timeline)

    # Cumulative cut positions for the 60 / 75 / 85 percent marks.
    train_end, val_end, cal_end = (
        int(total * fraction) for fraction in (0.60, 0.75, 0.85)
    )

    bounds = (
        (None, train_end),
        (train_end, val_end),
        (val_end, cal_end),
        (cal_end, None),
    )
    train, val, cal, test = (
        timeline.iloc[start:stop].copy() for start, stop in bounds
    )
    return train, val, cal, test
|
|
|
|
|
|
# ─── XGBoost Wrapper for sklearn CalibratedClassifierCV ─────────────
|
|
class XGBWrapper(BaseEstimator, ClassifierMixin):
    """Minimal sklearn-style classifier backed by ``xgb.train``.

    Exposes ``fit`` / ``predict_proba`` / ``predict`` around a native booster.

    Args:
        params: parameter dict passed straight to ``xgb.train``.
        num_boost_round: number of boosting rounds used by ``fit``.
    """

    def __init__(self, params, num_boost_round=500):
        self.params = params
        self.num_boost_round = num_boost_round
        # Populated by fit(); None until then.
        self.model_ = None
        self.classes_ = None

    def fit(self, X, y, **kwargs):
        """Train the booster on ``X``/``y``; extra keyword args are ignored."""
        self.classes_ = np.unique(y)
        self.model_ = xgb.train(
            self.params,
            xgb.DMatrix(X, label=y),
            num_boost_round=self.num_boost_round,
        )
        return self

    def predict_proba(self, X):
        """Return per-class scores; a 1-D booster output p becomes [1 - p, p]."""
        scores = self.model_.predict(xgb.DMatrix(X))
        if scores.ndim != 1:
            return scores
        return np.column_stack([1 - scores, scores])

    def predict(self, X):
        """Return the column index of the highest score for each row."""
        return self.predict_proba(X).argmax(axis=1)
|
|
|
|
|
|
# ─── Optuna Objectives ──────────────────────────────────────────────
|
|
def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective for XGBoost: validation log-loss of one trial.

    Samples a hyperparameter set from ``trial``, trains with early stopping
    (up to 1000 rounds, patience 50) against the validation set, and scores
    the booster at its best round.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and integer labels.
        X_val, y_val: validation features and integer labels.
        num_class: number of classes; values above 2 select the multiclass
            objective, otherwise binary logistic is used.

    Returns:
        float: log-loss on the validation set (lower is better).
    """
    multiclass = num_class > 2
    # NOTE: the order of the suggest_* calls is kept as-is so seeded studies
    # keep sampling the same sequence of parameters.
    params = {
        "objective": "multi:softprob" if multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if multiclass else "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "eta": trial.suggest_float("eta", 0.01, 0.15, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "n_jobs": 4,
        "random_state": 42,
    }
    if multiclass:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params, dtrain, num_boost_round=1000,
        evals=[(dval, "val")], early_stopping_rounds=50, verbose_eval=False,
    )

    # xgb.train returns the booster from the LAST round, i.e. up to 50 rounds
    # past the best one. Score the best round so trials are compared on the
    # model early stopping actually selected.
    preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    if preds.ndim == 1:
        # Binary objective yields only P(class 1); expand to two columns.
        preds = np.column_stack([1 - preds, preds])

    # Explicit labels keep log_loss from raising when the validation slice
    # happens to miss one of the classes (likely for rare multiclass labels).
    return log_loss(y_val, preds, labels=list(range(num_class)))
|
|
|
|
|
|
def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective for LightGBM: validation log-loss of one trial.

    Samples a hyperparameter set from ``trial``, trains with early stopping
    (up to 1000 rounds, patience 50) against the validation set, and scores
    the model at its best iteration.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and integer labels.
        X_val, y_val: validation features and integer labels.
        num_class: number of classes; values above 2 select the multiclass
            objective, otherwise the binary objective is used.

    Returns:
        float: log-loss on the validation set (lower is better).
    """
    multiclass = num_class > 2
    # NOTE: the order of the suggest_* calls is kept as-is so seeded studies
    # keep sampling the same sequence of parameters.
    params = {
        "objective": "multiclass" if multiclass else "binary",
        "metric": "multi_logloss" if multiclass else "binary_logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "n_jobs": 4, "random_state": 42, "verbose": -1,
    }
    if multiclass:
        params["num_class"] = num_class

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params, train_data, num_boost_round=1000,
        valid_sets=[val_data], valid_names=["val"],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    preds = model.predict(X_val, num_iteration=model.best_iteration)
    if preds.ndim == 1:
        # Binary objective yields only P(class 1); expand to two columns.
        preds = np.column_stack([1 - preds, preds])

    # Explicit labels keep log_loss from raising when the validation slice
    # happens to miss one of the classes (likely for rare multiclass labels).
    return log_loss(y_val, preds, labels=list(range(num_class)))
|
|
|
|
|
|
# ─── Main Training Pipeline ─────────────────────────────────────────
|
|
def _as_class_matrix(probs):
    """Return booster output as a 2-D array; a 1-D output p becomes [1 - p, p]."""
    probs = np.asarray(probs)
    if probs.ndim == 1:
        return np.column_stack([1 - probs, probs])
    return probs


def _fit_isotonic_per_class(raw_probs, y_true, num_class):
    """Fit one one-vs-rest IsotonicRegression per class column of raw_probs."""
    calibrators = []
    for cls_idx in range(num_class):
        calibrator = IsotonicRegression(out_of_bounds="clip")
        calibrator.fit(raw_probs[:, cls_idx], (y_true == cls_idx).astype(float))
        calibrators.append(calibrator)
    return calibrators


def _apply_isotonic(calibrators, raw_probs):
    """Calibrate each class column, then renormalise rows to sum to 1."""
    calibrated = np.column_stack([
        calibrator.predict(raw_probs[:, cls_idx])
        for cls_idx, calibrator in enumerate(calibrators)
    ])
    row_sums = calibrated.sum(axis=1, keepdims=True)
    # Every calibrator may map a row to 0; dividing would then give NaN and
    # break log_loss. Fall back to the uncalibrated row in that case.
    dead_rows = row_sums[:, 0] <= 0
    if dead_rows.any():
        calibrated[dead_rows] = raw_probs[dead_rows]
        row_sums = calibrated.sum(axis=1, keepdims=True)
    return calibrated / row_sums


def train_market(df, target_col, market_name, num_class, n_trials):
    """Full pipeline for one market: Optuna → Train → Calibrate → Evaluate.

    Rows with a missing or empty label are dropped, the remainder is split
    chronologically (train/val/cal/test), XGBoost and LightGBM are each tuned
    with Optuna on train/val, retrained with the best parameters, calibrated
    with per-class isotonic regression on the cal slice and evaluated on the
    test slice. Both models and both calibrator lists are written to
    ``MODELS_DIR``.

    Args:
        df: frame returned by ``load_data``.
        target_col: name of the label column for this market.
        market_name: short market name; lower-cased for output file names.
        num_class: number of classes of the label.
        n_trials: Optuna trials per model.

    Returns:
        dict | None: metrics and metadata for the market, or ``None`` when
        fewer than 500 labelled rows are available.
    """
    print(f"\n{'='*60}")
    print(f"[MARKET] {market_name} (classes={num_class})")
    print(f"{'='*60}")

    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")

    if len(valid_df) < 500:
        print(f"[SKIP] Not enough data for {market_name}")
        return None

    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)
    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_cal = cal_df[available_features].values
    X_test = test_df[available_features].values
    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_cal = cal_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values

    print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")

    multiclass = num_class > 2

    # ── Phase 1: Optuna XGBoost ──────────────────────────────────
    print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
    xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    xgb_study.optimize(
        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    xgb_best = xgb_study.best_params
    print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")

    # ── Phase 2: Optuna LightGBM ─────────────────────────────────
    print(f"[OPTUNA] LightGBM tuning ({n_trials} trials)...")
    lgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    lgb_study.optimize(
        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    lgb_best = lgb_study.best_params
    print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")

    # ── Phase 3: Train final models with best params ─────────────
    # XGBoost final
    xgb_params = {
        "objective": "multi:softprob" if multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if multiclass else "logloss",
        "n_jobs": 4, "random_state": 42,
        **xgb_best,
    }
    if multiclass:
        xgb_params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    xgb_model = xgb.train(
        xgb_params, dtrain, num_boost_round=1500,
        evals=[(dtrain, "train"), (dval, "val")],
        early_stopping_rounds=80, verbose_eval=200,
    )
    xgb_best_iteration = int(xgb_model.best_iteration)
    print(f"[OK] XGB final: iter={xgb_model.best_iteration}, score={xgb_model.best_score:.4f}")
    # xgb.train returns the booster from the LAST round (up to 80 rounds past
    # the best one). Trim it to the best round so calibration, evaluation and
    # the saved model all use the same trees, mirroring how the LightGBM
    # model is used at its best_iteration below.
    xgb_model = xgb_model[: xgb_best_iteration + 1]

    # LightGBM final
    lgb_params = {
        "objective": "multiclass" if multiclass else "binary",
        "metric": "multi_logloss" if multiclass else "binary_logloss",
        "n_jobs": 4, "random_state": 42, "verbose": -1,
        **lgb_best,
    }
    if multiclass:
        lgb_params["num_class"] = num_class

    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)
    lgb_model = lgb.train(
        lgb_params, lgb_train_data, num_boost_round=1500,
        valid_sets=[lgb_train_data, lgb_val_data],
        valid_names=["train", "val"],
        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(200)],
    )
    print(f"[OK] LGB final: iter={lgb_model.best_iteration}")

    # ── Phase 4: Isotonic Calibration on cal set ─────────────────
    print("[CAL] Fitting Isotonic Regression (per-class)...")

    # XGB calibration — manual IsotonicRegression per class
    xgb_cal_raw = _as_class_matrix(xgb_model.predict(xgb.DMatrix(X_cal)))
    xgb_iso_calibrators = _fit_isotonic_per_class(xgb_cal_raw, y_cal, num_class)
    print(f"[OK] XGB Isotonic calibrators fitted: {num_class} classes")

    # LGB calibration — manual IsotonicRegression per class
    lgb_cal_raw = _as_class_matrix(
        lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
    )
    lgb_iso_calibrators = _fit_isotonic_per_class(lgb_cal_raw, y_cal, num_class)
    print(f"[OK] LGB Isotonic calibrators fitted: {num_class} classes")

    # ── Phase 5: Evaluate on test set ────────────────────────────
    print("\n[EVAL] Test set evaluation...")
    dtest = xgb.DMatrix(X_test)

    # Raw and calibrated XGB
    xgb_raw_probs = _as_class_matrix(xgb_model.predict(dtest))
    xgb_cal_probs = _apply_isotonic(xgb_iso_calibrators, xgb_raw_probs)

    # Raw and calibrated LGB
    lgb_raw_probs = _as_class_matrix(
        lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    )
    lgb_cal_probs = _apply_isotonic(lgb_iso_calibrators, lgb_raw_probs)

    # Ensembles: plain average of the two models
    raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
    cal_ensemble = (xgb_cal_probs + lgb_cal_probs) / 2

    # Passed to log_loss so it does not raise when the test slice lacks a class.
    class_labels = list(range(num_class))

    def _eval(probs, label):
        preds = np.argmax(probs, axis=1)
        acc = accuracy_score(y_test, preds)
        ll = log_loss(y_test, probs, labels=class_labels)
        print(f" {label}: Acc={acc:.4f} LogLoss={ll:.4f}")
        return {"accuracy": round(float(acc), 4), "logloss": round(float(ll), 4)}

    m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
    m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
    m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
    m_lgb_cal = _eval(lgb_cal_probs, "LGB Calibrated")
    m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
    m_cal_ensemble = _eval(cal_ensemble, "Ensemble Calibrated")

    # Classification report for the raw (uncalibrated) ensemble
    ens_preds = np.argmax(raw_ensemble, axis=1)
    print(f"\n[REPORT] Ensemble Classification Report:")
    print(classification_report(y_test, ens_preds))

    # ── Phase 6: Save models ─────────────────────────────────────
    market_slug = market_name.lower()

    # Raw models (orchestrator compatible)
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_slug}.json")
    xgb_model.save_model(xgb_path)
    print(f"[SAVE] {xgb_path}")

    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_slug}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[SAVE] {lgb_path}")

    # Isotonic calibrators (XGB + LGB), one pickled list per model
    xgb_cal_path = os.path.join(MODELS_DIR, f"iso_xgb_v25_{market_slug}.pkl")
    with open(xgb_cal_path, "wb") as f:
        pickle.dump(xgb_iso_calibrators, f)
    print(f"[SAVE] {xgb_cal_path}")

    lgb_cal_path = os.path.join(MODELS_DIR, f"iso_lgb_v25_{market_slug}.pkl")
    with open(lgb_cal_path, "wb") as f:
        pickle.dump(lgb_iso_calibrators, f)
    print(f"[SAVE] {lgb_cal_path}")

    return {
        "market": market_name,
        "samples": int(len(valid_df)),
        "train": int(len(X_train)),
        "val": int(len(X_val)),
        "cal": int(len(X_cal)),
        "test": int(len(X_test)),
        "features_used": len(available_features),
        "xgb_best_params": xgb_best,
        "lgb_best_params": lgb_best,
        "xgb_best_iteration": xgb_best_iteration,
        "lgb_best_iteration": int(lgb_model.best_iteration),
        "xgb_optuna_best_logloss": round(float(xgb_study.best_value), 4),
        "lgb_optuna_best_logloss": round(float(lgb_study.best_value), 4),
        "test_xgb_raw": m_xgb_raw,
        "test_xgb_calibrated": m_xgb_cal,
        "test_lgb_raw": m_lgb_raw,
        "test_lgb_calibrated": m_lgb_cal,
        "test_ensemble_raw": m_ensemble,
        "test_ensemble_calibrated": m_cal_ensemble,
    }
|
|
|
|
|
|
def main():
    """Command-line entry point: train every selected market and write reports.

    Options:
        --markets  comma-separated market names (case-insensitive); all
                   markets in ``MARKET_CONFIGS`` are trained when omitted.
        --trials   Optuna trials per model per market (default 50, must be >= 1).

    Writes ``feature_cols.json`` to ``MODELS_DIR`` and
    ``v25_pro_metrics.json`` to ``REPORTS_DIR``, then prints a summary line
    per trained market.
    """
    parser = argparse.ArgumentParser(description="V25 Pro Trainer")
    parser.add_argument("--markets", type=str, default=None,
                        help="Comma-separated market names (e.g., MS,OU25,BTTS)")
    parser.add_argument("--trials", type=int, default=50,
                        help="Optuna trials per model per market")
    args = parser.parse_args()

    # With zero trials the studies have no best trial and training would only
    # fail after the data has been loaded; reject it up front instead.
    if args.trials < 1:
        parser.error("--trials must be a positive integer")

    print("=" * 60)
    print("V25 PRO — Optuna + Isotonic Calibration")
    print("=" * 60)
    print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"[INFO] Trials per model: {args.trials}")
    print(f"[INFO] Total features: {len(FEATURES)}")

    df = load_data()

    configs = MARKET_CONFIGS
    if args.markets:
        selected = [m.strip().upper() for m in args.markets.split(",")]
        known_names = {c["name"] for c in MARKET_CONFIGS}
        unknown = [m for m in selected if m and m not in known_names]
        if unknown:
            # A typo previously just produced a silently smaller (or empty) run.
            print(f"[WARN] Unknown markets ignored: {unknown}")
        configs = [c for c in configs if c["name"] in selected]
        print(f"[INFO] Selected markets: {[c['name'] for c in configs]}")

    all_metrics = {
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "trainer": "v25_pro",
        "optuna_trials": args.trials,
        "total_features": len(FEATURES),
        "markets": {},
    }

    for config in configs:
        target = config["target"]
        if target not in df.columns:
            print(f"[SKIP] {config['name']}: missing target {target}")
            continue

        metrics = train_market(
            df, target, config["name"], config["num_class"], args.trials,
        )
        if metrics:
            all_metrics["markets"][config["name"]] = metrics

    # Save feature list
    # NOTE(review): this always writes the full FEATURES list, while each
    # market trains on the subset of those columns found in the CSV
    # ("features_used" in the report) — confirm consumers expect that.
    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, "w", encoding="utf-8") as f:
        json.dump(FEATURES, f, indent=2)

    # Save full report
    report_path = os.path.join(REPORTS_DIR, "v25_pro_metrics.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(all_metrics, f, indent=2, default=str)
    print(f"\n[SAVE] Report: {report_path}")

    # Summary
    print("\n" + "=" * 60)
    print("[SUMMARY]")
    print("=" * 60)
    for name, m in all_metrics["markets"].items():
        # Prefer the calibrated ensemble; fall back to the raw one.
        ens = m.get("test_ensemble_calibrated", m.get("test_ensemble_raw", {}))
        acc = ens.get('accuracy', '?')
        ll = ens.get('logloss', '?')
        acc_s = f"{acc:.4f}" if isinstance(acc, float) else str(acc)
        ll_s = f"{ll:.4f}" if isinstance(ll, float) else str(ll)
        print(f" {name:12s} | Acc={acc_s:>6s} | LL={ll_s:>6s} | "
              f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")

    print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("[OK] V25 PRO Training Complete!")
|
|
|
|
|
|
# Run the trainer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|