gg
This commit is contained in:
@@ -0,0 +1,507 @@
|
||||
"""
|
||||
V25 Pro Model Trainer — Optuna + Isotonic Calibration
|
||||
=====================================================
|
||||
Combines V25's 83 features + 12 markets + temporal split
|
||||
with Optuna hyperparameter tuning and Isotonic Regression calibration.
|
||||
|
||||
Usage:
|
||||
python scripts/train_v25_pro.py
|
||||
python scripts/train_v25_pro.py --markets MS,OU25,BTTS # specific markets
|
||||
python scripts/train_v25_pro.py --trials 30 # fewer trials
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import pickle
|
||||
import argparse
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
import lightgbm as lgb
|
||||
import optuna
|
||||
from optuna.samplers import TPESampler
|
||||
from datetime import datetime
|
||||
from sklearn.metrics import accuracy_score, log_loss, classification_report
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
|
||||
# Only warnings and errors from Optuna are printed; per-trial info logs are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Engine root: the parent of the directory that contains this script.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the engine root at the front of the import search path.
sys.path.insert(0, AI_ENGINE_DIR)

# Input CSV and output locations, all resolved relative to the engine root.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

# Create the output directories at import time; existing directories are left alone.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
|
||||
|
||||
# ─── Feature Columns (102 columns incl. 20 odds presence flags, NO target leakage) ───
# Order matters: the training matrices are built by selecting these columns in
# this exact sequence.
FEATURES = [
    # ELO (8)
    "home_overall_elo",
    "away_overall_elo",
    "elo_diff",
    "home_home_elo",
    "away_away_elo",
    "home_form_elo",
    "away_form_elo",
    "form_elo_diff",
    # Form (12)
    "home_goals_avg",
    "home_conceded_avg",
    "away_goals_avg",
    "away_conceded_avg",
    "home_clean_sheet_rate",
    "away_clean_sheet_rate",
    "home_scoring_rate",
    "away_scoring_rate",
    "home_winning_streak",
    "away_winning_streak",
    "home_unbeaten_streak",
    "away_unbeaten_streak",
    # H2H (6)
    "h2h_total_matches",
    "h2h_home_win_rate",
    "h2h_draw_rate",
    "h2h_avg_goals",
    "h2h_btts_rate",
    "h2h_over25_rate",
    # Team Stats (8)
    "home_avg_possession",
    "away_avg_possession",
    "home_avg_shots_on_target",
    "away_avg_shots_on_target",
    "home_shot_conversion",
    "away_shot_conversion",
    "home_avg_corners",
    "away_avg_corners",
    # Odds (23)
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
    "implied_home",
    "implied_draw",
    "implied_away",
    "odds_ht_ms_h",
    "odds_ht_ms_d",
    "odds_ht_ms_a",
    "odds_ou05_o",
    "odds_ou05_u",
    "odds_ou15_o",
    "odds_ou15_u",
    "odds_ou25_o",
    "odds_ou25_u",
    "odds_ou35_o",
    "odds_ou35_u",
    "odds_ht_ou05_o",
    "odds_ht_ou05_u",
    "odds_ht_ou15_o",
    "odds_ht_ou15_u",
    "odds_btts_y",
    "odds_btts_n",
    # Odds presence flags (20)
    "odds_ms_h_present",
    "odds_ms_d_present",
    "odds_ms_a_present",
    "odds_ht_ms_h_present",
    "odds_ht_ms_d_present",
    "odds_ht_ms_a_present",
    "odds_ou05_o_present",
    "odds_ou05_u_present",
    "odds_ou15_o_present",
    "odds_ou15_u_present",
    "odds_ou25_o_present",
    "odds_ou25_u_present",
    "odds_ou35_o_present",
    "odds_ou35_u_present",
    "odds_ht_ou05_o_present",
    "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present",
    "odds_ht_ou15_u_present",
    "odds_btts_y_present",
    "odds_btts_n_present",
    # League (4)
    "home_xga",
    "away_xga",
    "league_avg_goals",
    "league_zero_goal_rate",
    # Upset Engine (4)
    "upset_atmosphere",
    "upset_motivation",
    "upset_fatigue",
    "upset_potential",
    # Referee Engine (5)
    "referee_home_bias",
    "referee_avg_goals",
    "referee_cards_total",
    "referee_avg_yellow",
    "referee_experience",
    # Momentum (3)
    "home_momentum_score",
    "away_momentum_score",
    "momentum_diff",
    # Squad (9)
    "home_squad_quality",
    "away_squad_quality",
    "squad_diff",
    "home_key_players",
    "away_key_players",
    "home_missing_impact",
    "away_missing_impact",
    "home_goals_form",
    "away_goals_form",
]
|
||||
|
||||
# One entry per betting market: the label column in the training CSV, the short
# market name used on the command line and in output file names, and the number
# of classes the label takes.
MARKET_CONFIGS = [
    {"target": _target, "name": _name, "num_class": _num_class}
    for _target, _name, _num_class in (
        ("label_ms", "MS", 3),
        ("label_ou15", "OU15", 2),
        ("label_ou25", "OU25", 2),
        ("label_ou35", "OU35", 2),
        ("label_btts", "BTTS", 2),
        ("label_ht_result", "HT_RESULT", 3),
        ("label_ht_ou05", "HT_OU05", 2),
        ("label_ht_ou15", "HT_OU15", 2),
        ("label_ht_ft", "HTFT", 9),
        ("label_odd_even", "ODD_EVEN", 2),
        ("label_cards_ou45", "CARDS_OU45", 2),
        ("label_handicap_ms", "HANDICAP_MS", 3),
    )
]
|
||||
|
||||
|
||||
def load_data():
    """Load the training CSV and prepare its feature columns.

    Steps:
      1. Read ``DATA_PATH`` with pandas; print an error and exit with status 1
         when the file does not exist.
      2. Replace missing values with 0 in every ``FEATURES`` column that is
         present in the file.
      3. For CSVs that lack an ``<odds>_present`` flag column, derive it from
         the matching odds column: 1.0 where the odds value is numeric and
         greater than 1.01, otherwise 0.0. When the odds column itself is also
         absent the flag is set to 0.0 for every row.

    Returns:
        pd.DataFrame: the loaded frame with the filled/derived columns.
    """
    if not os.path.exists(DATA_PATH):
        print(f"[ERROR] Data not found: {DATA_PATH}")
        sys.exit(1)

    print(f"[INFO] Loading {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    for col in FEATURES:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Odds columns that have a companion "<name>_present" flag feature.
    odds_columns = (
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    )

    # Derive odds presence flags for older CSVs
    for odds_col in odds_columns:
        flag_col = f"{odds_col}_present"
        if flag_col in df.columns:
            continue
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # The previous `df.get(odds_col, 0)` returned a plain scalar here,
            # and the chained `.fillna` then raised AttributeError. With no
            # odds column at all, the odds are simply "not present".
            df[flag_col] = 0.0

    # Features absent from the CSV are silently skipped by the trainer; make
    # that visible so a truncated export does not go unnoticed.
    missing = [col for col in FEATURES if col not in df.columns]
    if missing:
        print(f"[WARN] {len(missing)} feature column(s) missing from CSV: {missing}")

    print(f"[INFO] Shape: {df.shape}, Features: {len(FEATURES)}")
    return df
|
||||
|
||||
|
||||
def temporal_split_4way(valid_df: pd.DataFrame):
    """Chronological 60/15/10/15 split: train/val/cal/test.

    Rows are ordered by the ``mst_utc`` column (ascending) and cut at 60%, 75%
    and 85% of the row count, with each cut index truncated to an integer.

    Args:
        valid_df: Frame containing an ``mst_utc`` column to order by.

    Returns:
        tuple: ``(train, val, cal, test)`` DataFrames, each an independent copy
        whose index continues the 0-based numbering of the sorted frame.

    Raises:
        KeyError: if ``mst_utc`` is not a column of ``valid_df``.
    """
    # A stable sort keeps rows that share the same `mst_utc` value in their
    # input order, so which of them fall on either side of a split boundary
    # does not depend on the sorting algorithm's internals.
    ordered = valid_df.sort_values("mst_utc", kind="mergesort").reset_index(drop=True)

    n = len(ordered)
    i1 = int(n * 0.60)
    i2 = int(n * 0.75)
    i3 = int(n * 0.85)

    train = ordered.iloc[:i1].copy()
    val = ordered.iloc[i1:i2].copy()
    cal = ordered.iloc[i2:i3].copy()
    test = ordered.iloc[i3:].copy()

    return train, val, cal, test
|
||||
|
||||
|
||||
# ─── XGBoost Wrapper for sklearn CalibratedClassifierCV ─────────────
|
||||
class XGBWrapper(ClassifierMixin, BaseEstimator):
    """Thin sklearn-compatible wrapper around xgb.train for Isotonic calibration.

    Mixins are listed before ``BaseEstimator``, the order scikit-learn
    recommends, so the classifier mixin takes precedence in the MRO.

    Parameters:
        params: Parameter dict passed unchanged to ``xgb.train``.
        num_boost_round: Number of boosting rounds trained by ``fit``.

    Attributes set by ``fit``:
        model_: The booster returned by ``xgb.train``.
        classes_: Sorted unique labels found in ``y``.
    """

    def __init__(self, params, num_boost_round=500):
        # Only constructor arguments are stored here. Attributes with a
        # trailing underscore are created by `fit`, so scikit-learn's fitted
        # check can tell a fitted wrapper from an unfitted one.
        self.params = params
        self.num_boost_round = num_boost_round

    def fit(self, X, y, **kwargs):
        """Train a booster on ``X``/``y`` and return ``self``.

        Extra keyword arguments are accepted for API compatibility and ignored.
        """
        self.classes_ = np.unique(y)
        dtrain = xgb.DMatrix(X, label=y)
        self.model_ = xgb.train(self.params, dtrain, num_boost_round=self.num_boost_round)
        return self

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_samples, n_classes)`` array.

        A 1-D booster output (one probability per row) is expanded to two
        columns ``[1 - p, p]``.

        Raises:
            NotFittedError: if ``fit`` has not been called yet.
        """
        booster = getattr(self, "model_", None)
        if booster is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError("XGBWrapper is not fitted yet; call fit() first.")

        probs = booster.predict(xgb.DMatrix(X))
        if probs.ndim == 1:
            probs = np.column_stack([1 - probs, probs])
        return probs

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        probs = self.predict_proba(X)
        best = np.argmax(probs, axis=1)
        # Translate column positions into the labels seen during fit. When the
        # column count does not match the label count (e.g. a class missing
        # from the training labels) fall back to the raw column index.
        if probs.shape[1] == len(self.classes_):
            return self.classes_[best]
        return best
|
||||
|
||||
|
||||
# ─── Optuna Objectives ──────────────────────────────────────────────
|
||||
def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective: validation log loss of one XGBoost trial.

    Samples a hyperparameter set from ``trial``, trains on the training slice
    with early stopping (patience 50, at most 1000 rounds) monitored on the
    validation slice, and scores the best iteration on that same slice.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and integer labels.
        num_class: Number of classes; values above 2 select the multiclass
            objective, otherwise binary logistic is used.

    Returns:
        float: log loss on the validation slice (lower is better).
    """
    is_multiclass = num_class > 2
    params = {
        "objective": "multi:softprob" if is_multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if is_multiclass else "logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "eta": trial.suggest_float("eta", 0.01, 0.15, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "n_jobs": 4,
        "random_state": 42,
    }
    if is_multiclass:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # The native Booster.predict uses every trained tree by default, including
    # the rounds added after the best one while early stopping was waiting.
    # Restrict prediction to the best iteration (0-based, hence the +1) so the
    # trial is scored the same way the LightGBM objective scores its model.
    preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    if preds.ndim == 1:
        preds = np.column_stack([1 - preds, preds])

    # Pass the label set explicitly: without it log_loss raises when a class
    # does not occur in the validation slice.
    return log_loss(y_val, preds, labels=list(range(max(num_class, 2))))
|
||||
|
||||
|
||||
def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
    """Optuna objective: validation log loss of one LightGBM trial.

    Samples a hyperparameter set from ``trial``, trains on the training slice
    with early stopping (patience 50, at most 1000 rounds) monitored on the
    validation slice, and scores the best iteration on that same slice.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and integer labels.
        num_class: Number of classes; values above 2 select the multiclass
            objective, otherwise the binary objective is used.

    Returns:
        float: log loss on the validation slice (lower is better).
    """
    is_multiclass = num_class > 2
    params = {
        "objective": "multiclass" if is_multiclass else "binary",
        "metric": "multi_logloss" if is_multiclass else "binary_logloss",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
    }
    if is_multiclass:
        params["num_class"] = num_class

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=["val"],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    preds = model.predict(X_val, num_iteration=model.best_iteration)
    if preds.ndim == 1:
        # Binary models return one probability per row; expand to two columns.
        preds = np.column_stack([1 - preds, preds])

    # Pass the label set explicitly: without it log_loss raises when a class
    # does not occur in the validation slice.
    return log_loss(y_val, preds, labels=list(range(max(num_class, 2))))
|
||||
|
||||
|
||||
# ─── Main Training Pipeline ─────────────────────────────────────────
|
||||
def _as_probability_matrix(preds):
    """Return booster output as an (n_samples, n_classes) probability matrix.

    A 1-D array (one probability per row) becomes two columns ``[1 - p, p]``;
    2-D output is returned unchanged.
    """
    if preds.ndim == 1:
        return np.column_stack([1 - preds, preds])
    return preds


def train_market(df, target_col, market_name, num_class, n_trials):
    """Full pipeline for one market: Optuna → Train → Calibrate → Evaluate.

    Phases:
      1-2. Tune XGBoost and LightGBM hyperparameters with Optuna on the
           train/validation slices.
      3.   Train final models with the best parameters and early stopping.
      4.   Fit an isotonic-calibrated XGBoost model on the calibration slice.
      5.   Report accuracy and log loss on the test slice.
      6.   Save the raw and calibrated models under ``MODELS_DIR``.

    Args:
        df: Frame returned by ``load_data``.
        target_col: Name of the label column for this market.
        market_name: Short market name, used in logs and output file names.
        num_class: Number of label classes.
        n_trials: Optuna trials per model.

    Returns:
        dict | None: Metrics and metadata for the market, or ``None`` when
        fewer than 500 rows have a usable label.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"[MARKET] {market_name} (classes={num_class})")
    print(banner)

    # Keep only rows whose label is neither missing nor an empty string.
    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")

    if len(valid_df) < 500:
        print(f"[SKIP] Not enough data for {market_name}")
        return None

    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)
    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_cal = cal_df[available_features].values
    X_test = test_df[available_features].values
    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_cal = cal_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values

    print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")

    is_multiclass = num_class > 2
    # Explicit label set for log_loss, so a class that is absent from a slice
    # does not make the metric raise.
    class_labels = list(range(max(num_class, 2)))

    # ── Phase 1: Optuna XGBoost ──────────────────────────────────
    print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
    xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    xgb_study.optimize(
        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    xgb_best = xgb_study.best_params
    print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")

    # ── Phase 2: Optuna LightGBM ─────────────────────────────────
    print(f"[OPTUNA] LightGBM tuning ({n_trials} trials)...")
    lgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    lgb_study.optimize(
        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
    )
    lgb_best = lgb_study.best_params
    print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")

    # ── Phase 3: Train final models with best params ─────────────
    # XGBoost final
    xgb_params = {
        "objective": "multi:softprob" if is_multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if is_multiclass else "logloss",
        "n_jobs": 4,
        "random_state": 42,
        **xgb_best,
    }
    if is_multiclass:
        xgb_params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1500,
        evals=[(dtrain, "train"), (dval, "val")],
        early_stopping_rounds=80,
        verbose_eval=200,
    )
    print(f"[OK] XGB final: iter={xgb_model.best_iteration}, score={xgb_model.best_score:.4f}")

    # best_iteration is a 0-based index, so the best model has one more tree
    # than its value.
    xgb_best_rounds = xgb_model.best_iteration + 1

    # LightGBM final
    lgb_params = {
        "objective": "multiclass" if is_multiclass else "binary",
        "metric": "multi_logloss" if is_multiclass else "binary_logloss",
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
        **lgb_best,
    }
    if is_multiclass:
        lgb_params["num_class"] = num_class

    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)
    lgb_model = lgb.train(
        lgb_params,
        lgb_train_data,
        num_boost_round=1500,
        valid_sets=[lgb_train_data, lgb_val_data],
        valid_names=["train", "val"],
        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(200)],
    )
    print(f"[OK] LGB final: iter={lgb_model.best_iteration}")

    # ── Phase 4: Isotonic Calibration on cal set ─────────────────
    print("[CAL] Fitting Isotonic Regression...")

    # XGB calibration: retrain a wrapper for exactly the best number of rounds
    # (previously one round short, and zero rounds when the best iteration was
    # 0), then fit the isotonic calibrator on the held-out calibration slice.
    # NOTE(review): cv="prefit" is deprecated in recent scikit-learn releases
    # in favour of FrozenEstimator — revisit when upgrading scikit-learn.
    xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_best_rounds)
    xgb_wrapper.fit(X_train, y_train)
    xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
    xgb_calibrated.fit(X_cal, y_cal)

    # NOTE: no calibrator is fitted for LightGBM; only its raw probabilities
    # are evaluated and saved below.

    # ── Phase 5: Evaluate on test set ────────────────────────────
    print("\n[EVAL] Test set evaluation...")
    dtest = xgb.DMatrix(X_test)

    # Raw XGB — restricted to the best iteration, because the native
    # Booster.predict would otherwise also use the rounds trained after it.
    xgb_raw_probs = _as_probability_matrix(
        xgb_model.predict(dtest, iteration_range=(0, xgb_best_rounds))
    )

    # Calibrated XGB
    xgb_cal_probs = xgb_calibrated.predict_proba(X_test)

    # Raw LGB
    lgb_raw_probs = _as_probability_matrix(
        lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    )

    # Ensemble (raw): unweighted mean of the two raw probability matrices.
    raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2

    def _eval(probs, label):
        """Print and return accuracy and log loss of ``probs`` on the test slice."""
        preds = np.argmax(probs, axis=1)
        acc = accuracy_score(y_test, preds)
        ll = log_loss(y_test, probs, labels=class_labels)
        print(f"  {label}: Acc={acc:.4f} LogLoss={ll:.4f}")
        return {"accuracy": round(float(acc), 4), "logloss": round(float(ll), 4)}

    m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
    m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
    m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
    m_ensemble = _eval(raw_ensemble, "Ensemble Raw")

    # Classification report for ensemble
    ens_preds = np.argmax(raw_ensemble, axis=1)
    print("\n[REPORT] Ensemble Classification Report:")
    print(classification_report(y_test, ens_preds))

    # ── Phase 6: Save models ─────────────────────────────────────
    market_slug = market_name.lower()

    # Raw models (orchestrator compatible)
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_slug}.json")
    xgb_model.save_model(xgb_path)
    print(f"[SAVE] {xgb_path}")

    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_slug}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[SAVE] {lgb_path}")

    # Calibrated model
    cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_slug}.pkl")
    with open(cal_path, "wb") as f:
        pickle.dump(xgb_calibrated, f)
    print(f"[SAVE] {cal_path}")

    return {
        "market": market_name,
        "samples": int(len(valid_df)),
        "train": int(len(X_train)),
        "val": int(len(X_val)),
        "cal": int(len(X_cal)),
        "test": int(len(X_test)),
        "features_used": len(available_features),
        "xgb_best_params": xgb_best,
        "lgb_best_params": lgb_best,
        "xgb_best_iteration": int(xgb_model.best_iteration),
        "lgb_best_iteration": int(lgb_model.best_iteration),
        "xgb_optuna_best_logloss": round(float(xgb_study.best_value), 4),
        "lgb_optuna_best_logloss": round(float(lgb_study.best_value), 4),
        "test_xgb_raw": m_xgb_raw,
        "test_xgb_calibrated": m_xgb_cal,
        "test_lgb_raw": m_lgb_raw,
        "test_ensemble_raw": m_ensemble,
    }
|
||||
|
||||
|
||||
def _format_metric(value):
    """Format a numeric metric with four decimals, or return "?" when it is not a number."""
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return "?"


def main():
    """Command-line entry point: train every selected market and write the reports.

    Options:
        --markets  Comma-separated market names (case-insensitive); all
                   markets in ``MARKET_CONFIGS`` are trained when omitted.
        --trials   Optuna trials per model per market (default 50).

    Outputs:
        ``feature_cols.json`` in ``MODELS_DIR`` and ``v25_pro_metrics.json`` in
        ``REPORTS_DIR``, in addition to the model files written per market.
    """
    parser = argparse.ArgumentParser(description="V25 Pro Trainer")
    parser.add_argument("--markets", type=str, default=None,
                        help="Comma-separated market names (e.g., MS,OU25,BTTS)")
    parser.add_argument("--trials", type=int, default=50,
                        help="Optuna trials per model per market")
    args = parser.parse_args()

    print("=" * 60)
    print("V25 PRO — Optuna + Isotonic Calibration")
    print("=" * 60)
    print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"[INFO] Trials per model: {args.trials}")
    print(f"[INFO] Total features: {len(FEATURES)}")

    df = load_data()

    configs = MARKET_CONFIGS
    if args.markets:
        selected = [m.strip().upper() for m in args.markets.split(",")]
        configs = [c for c in configs if c["name"] in selected]

        # A mistyped market name would otherwise be dropped without a trace.
        known_names = {c["name"] for c in MARKET_CONFIGS}
        unknown = [m for m in selected if m and m not in known_names]
        if unknown:
            print(f"[WARN] Unknown markets ignored: {unknown}")

        print(f"[INFO] Selected markets: {[c['name'] for c in configs]}")

    all_metrics = {
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "trainer": "v25_pro",
        "optuna_trials": args.trials,
        "total_features": len(FEATURES),
        "markets": {},
    }

    for config in configs:
        target = config["target"]
        if target not in df.columns:
            print(f"[SKIP] {config['name']}: missing target {target}")
            continue

        metrics = train_market(
            df, target, config["name"], config["num_class"], args.trials,
        )
        if metrics:
            all_metrics["markets"][config["name"]] = metrics

    # Save feature list. Only columns present in the data are written, because
    # those are the columns the models were trained on; the list is identical
    # to FEATURES whenever the CSV contains every feature.
    used_features = [name for name in FEATURES if name in df.columns]
    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, "w", encoding="utf-8") as f:
        json.dump(used_features, f, indent=2)

    # Save full report
    report_path = os.path.join(REPORTS_DIR, "v25_pro_metrics.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(all_metrics, f, indent=2, default=str)
    print(f"\n[SAVE] Report: {report_path}")

    # Summary
    print("\n" + "=" * 60)
    print("[SUMMARY]")
    print("=" * 60)
    for name, m in all_metrics["markets"].items():
        ens = m.get("test_ensemble_raw", {})
        # Metrics are floats; they are converted to text first because the
        # ">6s" alignment spec raises ValueError when applied to a float.
        acc_text = _format_metric(ens.get("accuracy"))
        ll_text = _format_metric(ens.get("logloss"))
        print(f"  {name:12s} | Acc={acc_text:>6s} | LL={ll_text:>6s} | "
              f"XGB_iter={m.get('xgb_best_iteration', '?')} LGB_iter={m.get('lgb_best_iteration', '?')}")

    print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("[OK] V25 PRO Training Complete!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user