gg

2026-05-10 10:37:45 +03:00
parent 4f7090e2d9
commit c525b12dfd
32 changed files with 2374 additions and 209 deletions
@@ -0,0 +1,507 @@
+"""
+V25 Pro Model Trainer — Optuna + Isotonic Calibration
+=====================================================
+Combines V25's 102 feature columns (20 of them odds-presence flags), 12 markets and a temporal split
+with Optuna hyperparameter tuning and Isotonic Regression calibration.
+
+Usage:
+  python scripts/train_v25_pro.py
+  python scripts/train_v25_pro.py --markets MS,OU25,BTTS  # specific markets
+  python scripts/train_v25_pro.py --trials 30              # fewer trials
+"""
+
+import os
+import sys
+import json
+import pickle
+import argparse
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+import lightgbm as lgb
+import optuna
+from optuna.samplers import TPESampler
+from datetime import datetime
+from sklearn.metrics import accuracy_score, log_loss, classification_report
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+# Only show Optuna messages at WARNING level or above.
+optuna.logging.set_verbosity(optuna.logging.WARNING)
+
+# Put the parent of the directory containing this file at the front of sys.path.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# All input/output locations are resolved relative to that same parent directory.
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
+MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
+REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
+
+# NOTE: the output directories are created as a side effect of loading this module.
+os.makedirs(MODELS_DIR, exist_ok=True)
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+# ─── Feature Columns (102 columns incl. 20 odds-presence flags, NO target leakage) ───
+# The per-group counts below sum to len(FEATURES); keep them in sync when editing.
+FEATURES = [
+    # ELO (8)
+    "home_overall_elo", "away_overall_elo", "elo_diff",
+    "home_home_elo", "away_away_elo",
+    "home_form_elo", "away_form_elo", "form_elo_diff",
+    # Form (12)
+    "home_goals_avg", "home_conceded_avg",
+    "away_goals_avg", "away_conceded_avg",
+    "home_clean_sheet_rate", "away_clean_sheet_rate",
+    "home_scoring_rate", "away_scoring_rate",
+    "home_winning_streak", "away_winning_streak",
+    "home_unbeaten_streak", "away_unbeaten_streak",
+    # H2H (6)
+    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
+    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
+    # Team Stats (8)
+    "home_avg_possession", "away_avg_possession",
+    "home_avg_shots_on_target", "away_avg_shots_on_target",
+    "home_shot_conversion", "away_shot_conversion",
+    "home_avg_corners", "away_avg_corners",
+    # Odds (23 + 20 presence flags)
+    "odds_ms_h", "odds_ms_d", "odds_ms_a",
+    "implied_home", "implied_draw", "implied_away",
+    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
+    "odds_ou05_o", "odds_ou05_u",
+    "odds_ou15_o", "odds_ou15_u",
+    "odds_ou25_o", "odds_ou25_u",
+    "odds_ou35_o", "odds_ou35_u",
+    "odds_ht_ou05_o", "odds_ht_ou05_u",
+    "odds_ht_ou15_o", "odds_ht_ou15_u",
+    "odds_btts_y", "odds_btts_n",
+    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
+    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
+    "odds_ou05_o_present", "odds_ou05_u_present",
+    "odds_ou15_o_present", "odds_ou15_u_present",
+    "odds_ou25_o_present", "odds_ou25_u_present",
+    "odds_ou35_o_present", "odds_ou35_u_present",
+    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
+    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
+    "odds_btts_y_present", "odds_btts_n_present",
+    # League (4)
+    "home_xga", "away_xga",
+    "league_avg_goals", "league_zero_goal_rate",
+    # Upset Engine (4)
+    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
+    # Referee Engine (5)
+    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
+    "referee_avg_yellow", "referee_experience",
+    # Momentum (3)
+    "home_momentum_score", "away_momentum_score", "momentum_diff",
+    # Squad (9)
+    "home_squad_quality", "away_squad_quality", "squad_diff",
+    "home_key_players", "away_key_players",
+    "home_missing_impact", "away_missing_impact",
+    "home_goals_form", "away_goals_form",
+]
+
+# One entry per market trained by this script:
+#   target    - label column looked up in the training CSV (market is skipped if absent)
+#   name      - identifier matched against --markets and lower-cased into saved file names
+#   num_class - number of classes; values > 2 select the multi-class objectives
+MARKET_CONFIGS = [
+    {"target": "label_ms", "name": "MS", "num_class": 3},
+    {"target": "label_ou15", "name": "OU15", "num_class": 2},
+    {"target": "label_ou25", "name": "OU25", "num_class": 2},
+    {"target": "label_ou35", "name": "OU35", "num_class": 2},
+    {"target": "label_btts", "name": "BTTS", "num_class": 2},
+    {"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
+    {"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
+    {"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
+    {"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
+    {"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
+    {"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
+    {"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
+]
+
+
+def load_data():
+    """Load the training CSV and prepare the feature columns.
+
+    Missing values in every FEATURES column that exists in the CSV are replaced
+    with 0. For CSVs that lack the ``*_present`` odds flags, each flag is
+    derived as 1.0 where the matching odds value is greater than 1.01 and 0.0
+    otherwise; when the odds column itself is absent the flag is all zeros.
+
+    Returns:
+        pd.DataFrame: the loaded frame with filled features and odds flags.
+
+    Exits the process with status 1 if DATA_PATH does not exist.
+    """
+    if not os.path.exists(DATA_PATH):
+        print(f"[ERROR] Data not found: {DATA_PATH}")
+        sys.exit(1)
+
+    print(f"[INFO] Loading {DATA_PATH}...")
+    df = pd.read_csv(DATA_PATH)
+
+    for col in FEATURES:
+        if col in df.columns:
+            df[col] = df[col].fillna(0)
+
+    # Odds columns that have a companion "<column>_present" flag.
+    odds_cols = (
+        "odds_ms_h", "odds_ms_d", "odds_ms_a",
+        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
+        "odds_ou05_o", "odds_ou05_u",
+        "odds_ou15_o", "odds_ou15_u",
+        "odds_ou25_o", "odds_ou25_u",
+        "odds_ou35_o", "odds_ou35_u",
+        "odds_ht_ou05_o", "odds_ht_ou05_u",
+        "odds_ht_ou15_o", "odds_ht_ou15_u",
+        "odds_btts_y", "odds_btts_n",
+    )
+
+    # Derive odds presence flags for older CSVs
+    for odds_col in odds_cols:
+        flag_col = f"{odds_col}_present"
+        if flag_col in df.columns:
+            continue
+        if odds_col in df.columns:
+            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
+            df[flag_col] = (odds > 1.01).astype(float)
+        else:
+            # Previously df.get(odds_col, 0) produced a scalar here and
+            # scalar.fillna(...) crashed; a missing odds column means "absent".
+            df[flag_col] = 0.0
+
+    print(f"[INFO] Shape: {df.shape}, Features: {len(FEATURES)}")
+    return df
+
+
+def temporal_split_4way(valid_df: pd.DataFrame):
+    """Split rows chronologically into train/val/cal/test (60/15/10/15).
+
+    Rows are ordered by the ``mst_utc`` column and re-indexed from 0 before
+    slicing, so each returned frame is an independent copy of one contiguous
+    time window.
+    """
+    chronological = valid_df.sort_values("mst_utc").reset_index(drop=True)
+    total = len(chronological)
+
+    # Cumulative boundaries: 0 | 60% | 75% | 85% | end.
+    cuts = [0, int(total * 0.60), int(total * 0.75), int(total * 0.85), total]
+
+    return tuple(
+        chronological.iloc[start:stop].copy()
+        for start, stop in zip(cuts, cuts[1:])
+    )
+
+
+# ─── XGBoost Wrapper for sklearn CalibratedClassifierCV ─────────────
+class XGBWrapper(BaseEstimator, ClassifierMixin):
+    """Thin sklearn-compatible wrapper around xgb.train for Isotonic calibration.
+
+    Args:
+        params: XGBoost training parameters passed straight to ``xgb.train``.
+        num_boost_round: Number of boosting rounds used by ``fit``.
+
+    Fitted attributes (set by ``fit`` only, following the sklearn convention
+    that trailing-underscore attributes mark a fitted estimator):
+        model_: the trained ``xgb.Booster``.
+        classes_: integer labels ``0 .. k-1``, one per ``predict_proba`` column.
+    """
+
+    def __init__(self, params, num_boost_round=500):
+        # Only hyper-parameters are stored here so get_params()/clone() work
+        # and an unfitted wrapper is not mistaken for a fitted one.
+        self.params = params
+        self.num_boost_round = num_boost_round
+
+    def fit(self, X, y, **kwargs):
+        """Train the booster on (X, y); extra keyword arguments are ignored."""
+        # XGBoost classification labels are 0 .. k-1 and predict_proba emits
+        # one column per label, so classes_ is derived from the configured
+        # class count rather than np.unique(y), which would be too short when
+        # a rare class is missing from this slice.
+        n_classes = int(dict(self.params).get("num_class", 2))
+        self.classes_ = np.arange(n_classes)
+        dtrain = xgb.DMatrix(X, label=y)
+        self.model_ = xgb.train(self.params, dtrain, num_boost_round=self.num_boost_round)
+        return self
+
+    def predict_proba(self, X):
+        """Return an (n_samples, n_classes) probability array."""
+        booster = getattr(self, "model_", None)
+        if booster is None:
+            raise RuntimeError("XGBWrapper.predict_proba called before fit()")
+        probs = booster.predict(xgb.DMatrix(X))
+        if probs.ndim == 1:
+            # Binary objective yields P(class 1) only; expand to two columns.
+            probs = np.column_stack([1 - probs, probs])
+        return probs
+
+    def predict(self, X):
+        """Return the index of the most probable class for each row."""
+        return np.argmax(self.predict_proba(X), axis=1)
+
+
+# ─── Optuna Objectives ──────────────────────────────────────────────
+def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
+    """Optuna objective: validation log-loss of an XGBoost model.
+
+    Args:
+        trial: Optuna trial used to sample the hyper-parameters.
+        X_train, y_train: Training features and integer labels.
+        X_val, y_val: Validation features and labels (also drive early stopping).
+        num_class: Number of classes; > 2 selects the multi-class objective.
+
+    Returns:
+        Log-loss on the validation set (lower is better).
+    """
+    params = {
+        "objective": "multi:softprob" if num_class > 2 else "binary:logistic",
+        "eval_metric": "mlogloss" if num_class > 2 else "logloss",
+        "max_depth": trial.suggest_int("max_depth", 3, 8),
+        "eta": trial.suggest_float("eta", 0.01, 0.15, log=True),
+        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
+        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
+        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
+        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
+        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
+        "n_jobs": 4,
+        "random_state": 42,
+    }
+    if num_class > 2:
+        params["num_class"] = num_class
+
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+
+    model = xgb.train(
+        params, dtrain, num_boost_round=1000,
+        evals=[(dval, "val")], early_stopping_rounds=50, verbose_eval=False,
+    )
+
+    preds = model.predict(dval)
+    if preds.ndim == 1:
+        # Binary objective yields P(class 1) only; expand to two columns.
+        preds = np.column_stack([1 - preds, preds])
+
+    # Pass the full label set explicitly: without it log_loss infers the
+    # classes from y_val and raises when a rare class is absent from the fold.
+    return log_loss(y_val, preds, labels=list(range(num_class)))
+
+
+def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
+    """Optuna objective: validation log-loss of a LightGBM model.
+
+    Args:
+        trial: Optuna trial used to sample the hyper-parameters.
+        X_train, y_train: Training features and integer labels.
+        X_val, y_val: Validation features and labels (also drive early stopping).
+        num_class: Number of classes; > 2 selects the multi-class objective.
+
+    Returns:
+        Log-loss on the validation set at the best iteration (lower is better).
+    """
+    params = {
+        "objective": "multiclass" if num_class > 2 else "binary",
+        "metric": "multi_logloss" if num_class > 2 else "binary_logloss",
+        "max_depth": trial.suggest_int("max_depth", 3, 8),
+        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
+        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
+        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
+        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
+        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
+        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
+        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
+        "n_jobs": 4, "random_state": 42, "verbose": -1,
+    }
+    if num_class > 2:
+        params["num_class"] = num_class
+
+    train_data = lgb.Dataset(X_train, label=y_train)
+    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
+
+    model = lgb.train(
+        params, train_data, num_boost_round=1000,
+        valid_sets=[val_data], valid_names=["val"],
+        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
+    )
+
+    preds = model.predict(X_val, num_iteration=model.best_iteration)
+    if preds.ndim == 1:
+        # Binary objective yields P(class 1) only; expand to two columns.
+        preds = np.column_stack([1 - preds, preds])
+
+    # Pass the full label set explicitly: without it log_loss infers the
+    # classes from y_val and raises when a rare class is absent from the fold.
+    return log_loss(y_val, preds, labels=list(range(num_class)))
+
+
+# ─── Main Training Pipeline ─────────────────────────────────────────
+def train_market(df, target_col, market_name, num_class, n_trials):
+    """Full pipeline for one market: Optuna → Train → Calibrate → Evaluate → Save.
+
+    Rows with a missing or empty target are dropped and the rest is split
+    chronologically into train/val/cal/test. XGBoost and LightGBM are tuned
+    with Optuna on train/val, refit with early stopping, an isotonic-calibrated
+    XGBoost is fitted on the cal slice, and all variants are scored on test.
+    The raw models and the calibrated model are written to MODELS_DIR.
+
+    Args:
+        df: Frame returned by load_data().
+        target_col: Name of the label column for this market.
+        market_name: Market identifier; lower-cased into the saved file names.
+        num_class: Number of classes; labels are expected to be the integers
+            0 .. num_class - 1.
+        n_trials: Optuna trials per model.
+
+    Returns:
+        A dict of split sizes, best hyper-parameters and test metrics, or
+        None when fewer than 500 labelled rows are available.
+    """
+    print(f"\n{'='*60}")
+    print(f"[MARKET] {market_name} (classes={num_class})")
+    print(f"{'='*60}")
+
+    valid_df = df[df[target_col].notna()].copy()
+    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
+    print(f"[INFO] Valid samples: {len(valid_df)}")
+
+    if len(valid_df) < 500:
+        print(f"[SKIP] Not enough data for {market_name}")
+        return None
+
+    available_features = [f for f in FEATURES if f in valid_df.columns]
+    print(f"[INFO] Features: {len(available_features)}/{len(FEATURES)}")
+
+    train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)
+    X_train = train_df[available_features].values
+    X_val = val_df[available_features].values
+    X_cal = cal_df[available_features].values
+    X_test = test_df[available_features].values
+    y_train = train_df[target_col].astype(int).values
+    y_val = val_df[target_col].astype(int).values
+    y_cal = cal_df[target_col].astype(int).values
+    y_test = test_df[target_col].astype(int).values
+
+    print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")
+
+    # ── Phase 1: Optuna XGBoost ──────────────────────────────────
+    print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
+    xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
+    xgb_study.optimize(
+        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
+        n_trials=n_trials,
+    )
+    xgb_best = xgb_study.best_params
+    print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")
+
+    # ── Phase 2: Optuna LightGBM ─────────────────────────────────
+    print(f"[OPTUNA] LightGBM tuning ({n_trials} trials)...")
+    lgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
+    lgb_study.optimize(
+        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
+        n_trials=n_trials,
+    )
+    lgb_best = lgb_study.best_params
+    print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")
+
+    # ── Phase 3: Train final models with best params ─────────────
+    # XGBoost final
+    xgb_params = {
+        "objective": "multi:softprob" if num_class > 2 else "binary:logistic",
+        "eval_metric": "mlogloss" if num_class > 2 else "logloss",
+        "n_jobs": 4, "random_state": 42,
+        **xgb_best,
+    }
+    if num_class > 2:
+        xgb_params["num_class"] = num_class
+
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+    # "val" is listed last so it is the set that drives early stopping.
+    xgb_model = xgb.train(
+        xgb_params, dtrain, num_boost_round=1500,
+        evals=[(dtrain, "train"), (dval, "val")],
+        early_stopping_rounds=80, verbose_eval=200,
+    )
+    print(f"[OK] XGB final: iter={xgb_model.best_iteration}, score={xgb_model.best_score:.4f}")
+
+    # LightGBM final
+    lgb_params = {
+        "objective": "multiclass" if num_class > 2 else "binary",
+        "metric": "multi_logloss" if num_class > 2 else "binary_logloss",
+        "n_jobs": 4, "random_state": 42, "verbose": -1,
+        **lgb_best,
+    }
+    if num_class > 2:
+        lgb_params["num_class"] = num_class
+
+    lgb_train_data = lgb.Dataset(X_train, label=y_train)
+    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)
+    lgb_model = lgb.train(
+        lgb_params, lgb_train_data, num_boost_round=1500,
+        valid_sets=[lgb_train_data, lgb_val_data],
+        valid_names=["train", "val"],
+        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(200)],
+    )
+    print(f"[OK] LGB final: iter={lgb_model.best_iteration}")
+
+    # ── Phase 4: Isotonic Calibration on cal set ─────────────────
+    print("[CAL] Fitting Isotonic Regression...")
+
+    # XGB calibration: refit a fixed-length model, then calibrate it on the
+    # held-out cal slice. best_iteration is 0-based, so the best model has
+    # best_iteration + 1 boosting rounds.
+    xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration + 1)
+    xgb_wrapper.fit(X_train, y_train)
+    xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
+    xgb_calibrated.fit(X_cal, y_cal)
+
+    # ── Phase 5: Evaluate on test set ────────────────────────────
+    print("\n[EVAL] Test set evaluation...")
+    dtest = xgb.DMatrix(X_test)
+
+    # Raw XGB
+    xgb_raw_probs = xgb_model.predict(dtest)
+    if xgb_raw_probs.ndim == 1:
+        xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs])
+
+    # Calibrated XGB
+    xgb_cal_probs = xgb_calibrated.predict_proba(X_test)
+
+    # Raw LGB
+    lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
+    if lgb_raw_probs.ndim == 1:
+        lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs])
+
+    # Ensemble (raw)
+    raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
+
+    class_labels = list(range(num_class))
+
+    def _eval(probs, label):
+        """Print and return accuracy / log-loss of *probs* on the test set."""
+        preds = np.argmax(probs, axis=1)
+        acc = accuracy_score(y_test, preds)
+        # Explicit labels keep log_loss working when a rare class is
+        # missing from the test slice.
+        ll = log_loss(y_test, probs, labels=class_labels)
+        print(f"  {label}: Acc={acc:.4f} LogLoss={ll:.4f}")
+        return {"accuracy": round(float(acc), 4), "logloss": round(float(ll), 4)}
+
+    m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
+    m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
+    m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
+    m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
+
+    # Classification report for ensemble
+    ens_preds = np.argmax(raw_ensemble, axis=1)
+    print("\n[REPORT] Ensemble Classification Report:")
+    print(classification_report(y_test, ens_preds))
+
+    # ── Phase 6: Save models ─────────────────────────────────────
+    # Raw models (orchestrator compatible)
+    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
+    xgb_model.save_model(xgb_path)
+    print(f"[SAVE] {xgb_path}")
+
+    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
+    lgb_model.save_model(lgb_path)
+    print(f"[SAVE] {lgb_path}")
+
+    # Calibrated model
+    # NOTE(review): pickle stores XGBWrapper by module path; when this file is
+    # run as a script that module is __main__, so whatever loads the pickle
+    # must be able to resolve the class — confirm against the consumer.
+    cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl")
+    with open(cal_path, "wb") as f:
+        pickle.dump(xgb_calibrated, f)
+    print(f"[SAVE] {cal_path}")
+
+    return {
+        "market": market_name,
+        "samples": int(len(valid_df)),
+        "train": int(len(X_train)),
+        "val": int(len(X_val)),
+        "cal": int(len(X_cal)),
+        "test": int(len(X_test)),
+        "features_used": len(available_features),
+        "xgb_best_params": xgb_best,
+        "lgb_best_params": lgb_best,
+        "xgb_best_iteration": int(xgb_model.best_iteration),
+        "lgb_best_iteration": int(lgb_model.best_iteration),
+        "xgb_optuna_best_logloss": round(float(xgb_study.best_value), 4),
+        "lgb_optuna_best_logloss": round(float(lgb_study.best_value), 4),
+        "test_xgb_raw": m_xgb_raw,
+        "test_xgb_calibrated": m_xgb_cal,
+        "test_lgb_raw": m_lgb_raw,
+        "test_ensemble_raw": m_ensemble,
+    }
+
+
+def main():
+    """CLI entry point: train every selected market and write the reports.
+
+    Options:
+        --markets  Comma-separated market names (default: all of MARKET_CONFIGS).
+        --trials   Optuna trials per model per market (default: 50).
+
+    Writes feature_cols.json to MODELS_DIR and v25_pro_metrics.json to
+    REPORTS_DIR, then prints a per-market summary.
+    """
+    parser = argparse.ArgumentParser(description="V25 Pro Trainer")
+    parser.add_argument("--markets", type=str, default=None,
+                        help="Comma-separated market names (e.g., MS,OU25,BTTS)")
+    parser.add_argument("--trials", type=int, default=50,
+                        help="Optuna trials per model per market")
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("V25 PRO — Optuna + Isotonic Calibration")
+    print("=" * 60)
+    print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"[INFO] Trials per model: {args.trials}")
+    print(f"[INFO] Total features: {len(FEATURES)}")
+
+    df = load_data()
+
+    configs = MARKET_CONFIGS
+    if args.markets:
+        selected = [m.strip().upper() for m in args.markets.split(",")]
+        # Surface typos instead of silently training fewer markets.
+        known = {c["name"] for c in MARKET_CONFIGS}
+        unknown = [m for m in selected if m and m not in known]
+        if unknown:
+            print(f"[WARN] Unknown markets ignored: {unknown}")
+        configs = [c for c in configs if c["name"] in selected]
+        print(f"[INFO] Selected markets: {[c['name'] for c in configs]}")
+
+    all_metrics = {
+        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "trainer": "v25_pro",
+        "optuna_trials": args.trials,
+        "total_features": len(FEATURES),
+        "markets": {},
+    }
+
+    for config in configs:
+        target = config["target"]
+        if target not in df.columns:
+            print(f"[SKIP] {config['name']}: missing target {target}")
+            continue
+
+        metrics = train_market(
+            df, target, config["name"], config["num_class"], args.trials,
+        )
+        if metrics:
+            all_metrics["markets"][config["name"]] = metrics
+
+    # Save feature list
+    # NOTE(review): this is the full FEATURES list; train_market only uses the
+    # columns actually present in the CSV (see "features_used" in the report).
+    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
+    with open(feature_path, "w", encoding="utf-8") as f:
+        json.dump(FEATURES, f, indent=2)
+
+    # Save full report
+    report_path = os.path.join(REPORTS_DIR, "v25_pro_metrics.json")
+    with open(report_path, "w", encoding="utf-8") as f:
+        json.dump(all_metrics, f, indent=2, default=str)
+    print(f"\n[SAVE] Report: {report_path}")
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("[SUMMARY]")
+    print("=" * 60)
+    for name, m in all_metrics["markets"].items():
+        ens = m.get("test_ensemble_raw", {})
+        # Metrics are floats; the "s" format code only accepts strings, so
+        # convert first (formatting the float directly raised ValueError).
+        acc = str(ens.get("accuracy", "?"))
+        ll = str(ens.get("logloss", "?"))
+        print(f"  {name:12s} | Acc={acc:>6s} | LL={ll:>6s} | "
+              f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")
+
+    print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("[OK] V25 PRO Training Complete!")
+
+
+if __name__ == "__main__":
+    main()