first (part 2: other directories)

2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,451 @@
+"""
+V25 Model Trainer - NO TARGET LEAKAGE
+=====================================
+Training script for V25 ensemble model.
+
+CRITICAL: This version removes total_goals and ht_total_goals features
+to prevent target leakage. These features are only known AFTER the match ends.
+
+Usage:
+  python scripts/train_v25_clean.py
+"""
+
+import os
+import sys
+import json
+import pickle
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+import lightgbm as lgb
+from datetime import datetime
+from sklearn.metrics import accuracy_score, log_loss, classification_report
+
+# Project root: the parent of the directory that contains this script.
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+# Put the project root first on the import path so project packages resolve
+# when the script is launched directly.
+sys.path.insert(0, AI_ENGINE_DIR)
+
+# Input data and output locations, all anchored at the project root.
+DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
+MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
+REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
+
+# Output directories are created at import time so later saves cannot fail
+# on a missing folder.
+for _out_dir in (MODELS_DIR, REPORTS_DIR):
+    os.makedirs(_out_dir, exist_ok=True)
+
+# Feature Columns - NO TARGET LEAKAGE
+# These features are available BEFORE the match starts.
+# The order of this list is the column order of the matrices handed to the
+# models (minus any column absent from the CSV), and the list itself is
+# written to feature_cols.json by main().
+FEATURES = [
+    # ELO Features (8)
+    "home_overall_elo", "away_overall_elo", "elo_diff",
+    "home_home_elo", "away_away_elo",
+    "home_form_elo", "away_form_elo", "form_elo_diff",
+
+    # Form Features (12)
+    "home_goals_avg", "home_conceded_avg",
+    "away_goals_avg", "away_conceded_avg",
+    "home_clean_sheet_rate", "away_clean_sheet_rate",
+    "home_scoring_rate", "away_scoring_rate",
+    "home_winning_streak", "away_winning_streak",
+    "home_unbeaten_streak", "away_unbeaten_streak",
+
+    # H2H Features (6)
+    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
+    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
+
+    # Team Stats Features (8)
+    "home_avg_possession", "away_avg_possession",
+    "home_avg_shots_on_target", "away_avg_shots_on_target",
+    "home_shot_conversion", "away_shot_conversion",
+    "home_avg_corners", "away_avg_corners",
+
+    # Odds Features (23) - Market wisdom
+    "odds_ms_h", "odds_ms_d", "odds_ms_a",
+    "implied_home", "implied_draw", "implied_away",
+    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
+    "odds_ou05_o", "odds_ou05_u",
+    "odds_ou15_o", "odds_ou15_u",
+    "odds_ou25_o", "odds_ou25_u",
+    "odds_ou35_o", "odds_ou35_u",
+    "odds_ht_ou05_o", "odds_ht_ou05_u",
+    "odds_ht_ou15_o", "odds_ht_ou15_u",
+    "odds_btts_y", "odds_btts_n",
+    # Odds availability flags (20) - one per odds_* column above (none for
+    # the implied_* columns). load_data() derives a missing flag as
+    # float(odds > 1.01).
+    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
+    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
+    "odds_ou05_o_present", "odds_ou05_u_present",
+    "odds_ou15_o_present", "odds_ou15_u_present",
+    "odds_ou25_o_present", "odds_ou25_u_present",
+    "odds_ou35_o_present", "odds_ou35_u_present",
+    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
+    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
+    "odds_btts_y_present", "odds_btts_n_present",
+
+    # League Features (4)
+    "home_xga", "away_xga",
+    "league_avg_goals", "league_zero_goal_rate",
+
+    # Upset Engine (4)
+    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
+
+    # Referee Engine (5)
+    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
+    "referee_avg_yellow", "referee_experience",
+
+    # Momentum Engine (3)
+    "home_momentum_score", "away_momentum_score", "momentum_diff",
+
+    # Squad Features (9)
+    "home_squad_quality", "away_squad_quality", "squad_diff",
+    "home_key_players", "away_key_players",
+    "home_missing_impact", "away_missing_impact",
+    "home_goals_form", "away_goals_form",
+]
+
+# REMOVED: total_goals, ht_total_goals (TARGET LEAKAGE!)
+# These are only known AFTER the match ends
+
+# Printed at import time as a quick sanity check of the list length.
+print(f"[INFO] Total features: {len(FEATURES)}")
+
+# One entry per market that main() trains, in training order.
+#   target    - label column looked up in the training frame; the market is
+#               skipped when the column is missing.
+#   name      - used in log output and, lower-cased, in the saved model
+#               file names (xgb_v25_<name>.json / lgb_v25_<name>.txt).
+#   num_class - number of classes; values above 2 select the multiclass
+#               objectives, otherwise the binary ones are used.
+MARKET_CONFIGS = [
+    {"target": "label_ms", "name": "MS", "num_class": 3},
+    {"target": "label_ou15", "name": "OU15", "num_class": 2},
+    {"target": "label_ou25", "name": "OU25", "num_class": 2},
+    {"target": "label_ou35", "name": "OU35", "num_class": 2},
+    {"target": "label_btts", "name": "BTTS", "num_class": 2},
+    {"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
+    {"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
+    {"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
+    {"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
+    {"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
+    {"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
+    {"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
+]
+
+
+def load_data():
+    """Load the training CSV and prepare its feature columns.
+
+    Reads ``DATA_PATH``, replaces NaN with 0 in every ``FEATURES`` column
+    that is present, and adds any ``odds_*_present`` availability flag the
+    CSV does not already contain.
+
+    Returns:
+        pandas.DataFrame: the loaded frame with the columns described above.
+
+    Exits the process with status 1 when ``DATA_PATH`` does not exist.
+    """
+    if not os.path.exists(DATA_PATH):
+        print(f"[ERROR] Data file not found: {DATA_PATH}")
+        print("[INFO] Run extract_training_data.py first to generate training data")
+        sys.exit(1)
+
+    print(f"[INFO] Loading data from {DATA_PATH}...")
+    df = pd.read_csv(DATA_PATH)
+
+    # Fill NaN values in the feature columns the CSV actually provides.
+    for col in FEATURES:
+        if col in df.columns:
+            df[col] = df[col].fillna(0)
+
+    # Backward-compatible derivation for older CSVs without odds availability
+    # flags: every odds column listed here has a matching "<col>_present" flag.
+    odds_columns = (
+        "odds_ms_h", "odds_ms_d", "odds_ms_a",
+        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
+        "odds_ou05_o", "odds_ou05_u",
+        "odds_ou15_o", "odds_ou15_u",
+        "odds_ou25_o", "odds_ou25_u",
+        "odds_ou35_o", "odds_ou35_u",
+        "odds_ht_ou05_o", "odds_ht_ou05_u",
+        "odds_ht_ou15_o", "odds_ht_ou15_u",
+        "odds_btts_y", "odds_btts_n",
+    )
+    for odds_col in odds_columns:
+        flag_col = f"{odds_col}_present"
+        if flag_col in df.columns:
+            continue
+        if odds_col in df.columns:
+            # Anything that is not a number, or not above 1.01, counts as
+            # "no odds available".
+            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
+            df[flag_col] = (odds > 1.01).astype(float)
+        else:
+            # The odds column itself is absent. The previous
+            # df.get(odds_col, 0) returned a bare scalar here and crashed on
+            # .fillna(); the flag is simply 0 for every row.
+            df[flag_col] = 0.0
+
+    print(f"[INFO] Shape: {df.shape}")
+    print(f"[INFO] Columns: {list(df.columns)}")
+    return df
+
+
+def temporal_split(valid_df: pd.DataFrame):
+    """Split rows chronologically into train / validation / test frames.
+
+    Rows are ordered by the ``mst_utc`` column and cut at roughly 70% and
+    85% of the row count, so validation and test always lie later in time
+    than training.
+
+    Args:
+        valid_df: frame containing an ``mst_utc`` column to order by.
+
+    Returns:
+        tuple: ``(train_df, val_df, test_df)``, independent copies with a
+        fresh 0-based index taken from the sorted frame. The three parts
+        never overlap; for very small inputs some of them are empty.
+
+    Raises:
+        KeyError: if ``valid_df`` has no ``mst_utc`` column.
+    """
+    # A stable sort keeps rows that share a timestamp in their input order,
+    # so split membership is reproducible from run to run.
+    ordered = valid_df.sort_values("mst_utc", kind="mergesort").reset_index(drop=True)
+    n = len(ordered)
+
+    # At least one training row, but never more rows than exist.
+    train_end = min(max(int(n * 0.70), 1), n)
+    # Aim for a non-empty validation slice while leaving a row for the test set.
+    val_end = min(max(int(n * 0.85), train_end + 1), n - 1)
+    # Never let the test slice start before the training slice ends. Without
+    # this a single-row input put its only row in both train and test.
+    val_end = max(val_end, train_end)
+
+    train_df = ordered.iloc[:train_end].copy()
+    val_df = ordered.iloc[train_end:val_end].copy()
+    test_df = ordered.iloc[val_end:].copy()
+
+    return train_df, val_df, test_df
+
+
+def train_xgboost_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
+    """Fit an XGBoost booster for one market, with early stopping.
+
+    Args:
+        X_train, y_train: training features and integer class labels.
+        X_val, y_val: validation features and labels, evaluated every round.
+        num_class: number of classes; more than 2 selects the softprob
+            multiclass objective, otherwise binary logistic.
+        market_name: label used in the log lines only.
+
+    Returns:
+        The trained booster returned by ``xgb.train``.
+    """
+    print(f"\n[INFO] Training XGBoost for {market_name}...")
+
+    multiclass = num_class > 2
+    if multiclass:
+        objective, eval_metric = "multi:softprob", "mlogloss"
+    else:
+        objective, eval_metric = "binary:logistic", "logloss"
+
+    booster_params = {
+        "objective": objective,
+        "eval_metric": eval_metric,
+        "max_depth": 6,
+        "eta": 0.05,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "min_child_weight": 3,
+        "gamma": 0.1,
+        "n_jobs": 4,
+        "random_state": 42,
+    }
+    if multiclass:
+        # Only the multiclass objective takes a class count.
+        booster_params["num_class"] = num_class
+
+    train_matrix = xgb.DMatrix(X_train, label=y_train)
+    val_matrix = xgb.DMatrix(X_val, label=y_val)
+
+    # The validation set is listed last in the watch list.
+    watch_list = [(train_matrix, 'train'), (val_matrix, 'val')]
+    history = {}
+    booster = xgb.train(
+        booster_params,
+        train_matrix,
+        num_boost_round=1000,
+        evals=watch_list,
+        early_stopping_rounds=50,
+        evals_result=history,
+        verbose_eval=100,
+    )
+
+    print(f"[OK] Best iteration: {booster.best_iteration}")
+    print(f"[OK] Best score: {booster.best_score:.4f}")
+
+    return booster
+
+
+def train_lightgbm_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
+    """Fit a LightGBM booster for one market, with early stopping.
+
+    Args:
+        X_train, y_train: training features and integer class labels.
+        X_val, y_val: validation features and labels.
+        num_class: number of classes; more than 2 selects the multiclass
+            objective, otherwise binary.
+        market_name: label used in the log lines only.
+
+    Returns:
+        The trained booster returned by ``lgb.train``.
+    """
+    print(f"\n[INFO] Training LightGBM for {market_name}...")
+
+    multiclass = num_class > 2
+    if multiclass:
+        objective, metric_name = "multiclass", "multi_logloss"
+    else:
+        objective, metric_name = "binary", "binary_logloss"
+
+    booster_params = {
+        "objective": objective,
+        "metric": metric_name,
+        "max_depth": 6,
+        "learning_rate": 0.05,
+        "feature_fraction": 0.8,
+        "bagging_fraction": 0.8,
+        "bagging_freq": 5,
+        "min_child_samples": 20,
+        "n_jobs": 4,
+        "random_state": 42,
+        "verbose": -1,
+    }
+    if multiclass:
+        # Only the multiclass objective takes a class count.
+        booster_params["num_class"] = num_class
+
+    train_set = lgb.Dataset(X_train, label=y_train)
+    # The validation set references the training set when it is built.
+    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)
+
+    training_callbacks = [
+        lgb.early_stopping(stopping_rounds=50),
+        lgb.log_evaluation(period=100),
+    ]
+    booster = lgb.train(
+        booster_params,
+        train_set,
+        num_boost_round=1000,
+        valid_sets=[train_set, val_set],
+        valid_names=['train', 'val'],
+        callbacks=training_callbacks,
+    )
+
+    best_val_score = booster.best_score['val'][metric_name]
+    print(f"[OK] Best iteration: {booster.best_iteration}")
+    print(f"[OK] Best score: {best_val_score:.4f}")
+
+    return booster
+
+
+def evaluate_model(model, X_test, y_test, model_type='xgb', num_class=3):
+    """Score a trained booster on the held-out test set and print a report.
+
+    Args:
+        model: trained XGBoost or LightGBM booster.
+        X_test: test feature matrix.
+        y_test: integer class labels for ``X_test``.
+        model_type: ``'xgb'`` wraps ``X_test`` in a ``DMatrix`` before
+            predicting; any other value uses the LightGBM call with the
+            booster's best iteration.
+        num_class: kept for interface compatibility; the class count is
+            taken from the probability matrix itself.
+
+    Returns:
+        tuple: ``(probs, acc, loss)`` where ``probs`` is an
+        ``(n_samples, n_classes)`` probability array, ``acc`` the accuracy
+        and ``loss`` the log loss on the test set.
+    """
+    if model_type == 'xgb':
+        probs = model.predict(xgb.DMatrix(X_test))
+    else:  # lgb
+        probs = model.predict(X_test, num_iteration=model.best_iteration)
+
+    if len(probs.shape) == 1:
+        # Binary models return only P(class 1); expand to two columns.
+        probs = np.column_stack([1 - probs, probs])
+
+    preds = np.argmax(probs, axis=1)
+
+    # Name the classes explicitly. Otherwise log_loss infers them from
+    # y_test and raises when the test split lacks a class that the
+    # probability matrix has a column for.
+    class_labels = np.arange(probs.shape[1])
+
+    acc = accuracy_score(y_test, preds)
+    loss = log_loss(y_test, probs, labels=class_labels)
+
+    print("\n[RESULTS] Test Results:")
+    print(f"   Accuracy: {acc:.4f}")
+    print(f"   Log Loss: {loss:.4f}")
+
+    # Per-class metrics
+    print("\n[REPORT] Classification Report:")
+    print(classification_report(y_test, preds))
+
+    return probs, acc, loss
+
+
+def train_market(df, target_col, market_name, num_class=3):
+    """Train, evaluate and save the XGBoost + LightGBM pair for one market.
+
+    Args:
+        df: training frame as returned by ``load_data``.
+        target_col: name of the label column for this market.
+        market_name: market identifier used in logs and model file names.
+        num_class: number of classes of the target.
+
+    Returns:
+        tuple: ``(xgb_model, lgb_model, metrics)``. When fewer than 100 rows
+        have a usable label, all three are ``None``.
+    """
+    print(f"\n{'='*60}")
+    print(f"[MARKET] Training {market_name}")
+    print(f"{'='*60}")
+
+    # Keep only rows whose label is neither missing nor an empty string.
+    valid_df = df[df[target_col].notna()].copy()
+    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
+    print(f"[INFO] Valid samples: {len(valid_df)}")
+
+    if len(valid_df) < 100:
+        print(f"[ERROR] Not enough data for {market_name}")
+        # Same arity as the success path. The previous two-value return made
+        # the caller's three-way unpacking raise ValueError.
+        return None, None, None
+
+    # Prepare features: use only the FEATURES columns present in the frame.
+    available_features = [f for f in FEATURES if f in valid_df.columns]
+    print(f"[INFO] Available features: {len(available_features)}/{len(FEATURES)}")
+
+    train_df, val_df, test_df = temporal_split(valid_df)
+    X_train = train_df[available_features].values
+    X_val = val_df[available_features].values
+    X_test = test_df[available_features].values
+    y_train = train_df[target_col].astype(int).values
+    y_val = val_df[target_col].astype(int).values
+    y_test = test_df[target_col].astype(int).values
+
+    print(
+        f"[INFO] Temporal split -> Train: {len(X_train)},"
+        f" Val: {len(X_val)}, Test: {len(X_test)}"
+    )
+    print(
+        f"[INFO] Time windows -> train_end={int(train_df['mst_utc'].max())},"
+        f" val_end={int(val_df['mst_utc'].max())},"
+        f" test_end={int(test_df['mst_utc'].max())}"
+    )
+
+    # Train XGBoost
+    xgb_model = train_xgboost_model(X_train, y_train, X_val, y_val, num_class, market_name)
+
+    # Train LightGBM
+    lgb_model = train_lightgbm_model(X_train, y_train, X_val, y_val, num_class, market_name)
+
+    # Evaluate
+    print("\n[INFO] XGBoost Evaluation:")
+    xgb_probs, xgb_acc, xgb_loss = evaluate_model(xgb_model, X_test, y_test, 'xgb', num_class)
+
+    print("\n[INFO] LightGBM Evaluation:")
+    lgb_probs, lgb_acc, lgb_loss = evaluate_model(lgb_model, X_test, y_test, 'lgb', num_class)
+
+    # Ensemble evaluation: unweighted mean of the two probability matrices.
+    ensemble_probs = (xgb_probs + lgb_probs) / 2
+    ensemble_preds = np.argmax(ensemble_probs, axis=1)
+    ensemble_acc = accuracy_score(y_test, ensemble_preds)
+    # Explicit labels keep log_loss from failing when the test split lacks
+    # one of the classes.
+    ensemble_loss = log_loss(
+        y_test, ensemble_probs, labels=np.arange(ensemble_probs.shape[1])
+    )
+
+    print("\n[INFO] Ensemble Evaluation:")
+    print(f"   Accuracy: {ensemble_acc:.4f}")
+    print(f"   Log Loss: {ensemble_loss:.4f}")
+
+    # Save models
+    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
+    xgb_model.save_model(xgb_path)
+    print(f"[OK] XGBoost saved: {xgb_path}")
+
+    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
+    lgb_model.save_model(lgb_path)
+    print(f"[OK] LightGBM saved: {lgb_path}")
+
+    # Plain Python numbers only, so the dict can go straight into json.dump.
+    metrics = {
+        "samples": int(len(valid_df)),
+        "features_used": available_features,
+        "train_samples": int(len(X_train)),
+        "val_samples": int(len(X_val)),
+        "test_samples": int(len(X_test)),
+        "xgb_accuracy": round(float(xgb_acc), 4),
+        "xgb_logloss": round(float(xgb_loss), 4),
+        "lgb_accuracy": round(float(lgb_acc), 4),
+        "lgb_logloss": round(float(lgb_loss), 4),
+        "ensemble_accuracy": round(float(ensemble_acc), 4),
+        "ensemble_logloss": round(float(ensemble_loss), 4),
+        "class_count": int(num_class),
+    }
+
+    return xgb_model, lgb_model, metrics
+
+
+def main():
+    """Run the full training pipeline for every configured market.
+
+    Loads the training data, trains each entry of ``MARKET_CONFIGS`` whose
+    target column exists, then writes the feature list and a per-market
+    metrics report as JSON and prints a summary.
+    """
+    print("="*60)
+    print("V25 Model Training - NO TARGET LEAKAGE")
+    print("="*60)
+    print(f"[INFO] Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Load data
+    df = load_data()
+
+    target_cols = [col for col in df.columns if col.startswith('label_')]
+    print(f"\n[INFO] Available targets: {target_cols}")
+
+    results = {}
+    reports = {
+        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "market_results": {},
+    }
+
+    for config in MARKET_CONFIGS:
+        target = config["target"]
+        market_name = config["name"]
+        num_class = config["num_class"]
+
+        if target not in df.columns:
+            print(f"[SKIP] {market_name}: missing target column {target}")
+            continue
+
+        outcome = train_market(df, target, market_name, num_class=num_class)
+        # A market without enough data comes back with None models and
+        # possibly no metrics element. Index the tuple so a short result
+        # cannot abort the whole run with an unpacking error.
+        xgb_model, lgb_model = outcome[0], outcome[1]
+        metrics = outcome[2] if len(outcome) > 2 else None
+
+        results[market_name] = {
+            'xgb': xgb_model is not None,
+            'lgb': lgb_model is not None,
+        }
+        # Untrained markets are recorded as null in the report.
+        reports["market_results"][market_name] = metrics
+
+    # Save feature list
+    # NOTE(review): this stores the full FEATURES list, while each model is
+    # fit on the subset of those columns present in the CSV (reported per
+    # market as "features_used"). Confirm that inference does not rely on
+    # this file when the two differ.
+    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
+    with open(feature_path, 'w') as f:
+        json.dump(FEATURES, f, indent=2)
+    print(f"\n[OK] Feature list saved: {feature_path}")
+
+    report_path = os.path.join(REPORTS_DIR, "v25_market_metrics.json")
+    with open(report_path, "w") as f:
+        json.dump(reports, f, indent=2)
+    print(f"[OK] Metrics report saved: {report_path}")
+
+    # Summary
+    print("\n" + "="*60)
+    print("[SUMMARY] Training Results")
+    print("="*60)
+    for market, status in results.items():
+        print(f"   {market}: XGB={status['xgb']}, LGB={status['lgb']}")
+
+    print(f"\n[INFO] Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("[OK] V25 Training Complete!")
+
+
+if __name__ == "__main__":
+    main()