""" V25 Model Trainer - NO TARGET LEAKAGE ===================================== Training script for V25 ensemble model. CRITICAL: This version removes total_goals and ht_total_goals features to prevent target leakage. These features are only known AFTER the match ends. Usage: python scripts/train_v25_clean.py """ import os import sys import json import pickle import numpy as np import pandas as pd import xgboost as xgb import lightgbm as lgb from datetime import datetime from sklearn.metrics import accuracy_score, log_loss, classification_report # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Config AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25") REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25") os.makedirs(MODELS_DIR, exist_ok=True) os.makedirs(REPORTS_DIR, exist_ok=True) # Feature Columns - NO TARGET LEAKAGE # These features are available BEFORE the match starts FEATURES = [ # ELO Features (8) "home_overall_elo", "away_overall_elo", "elo_diff", "home_home_elo", "away_away_elo", "home_form_elo", "away_form_elo", "form_elo_diff", # Form Features (12) "home_goals_avg", "home_conceded_avg", "away_goals_avg", "away_conceded_avg", "home_clean_sheet_rate", "away_clean_sheet_rate", "home_scoring_rate", "away_scoring_rate", "home_winning_streak", "away_winning_streak", "home_unbeaten_streak", "away_unbeaten_streak", # H2H Features (6) "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate", "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate", # Team Stats Features (8) "home_avg_possession", "away_avg_possession", "home_avg_shots_on_target", "away_avg_shots_on_target", "home_shot_conversion", "away_shot_conversion", "home_avg_corners", "away_avg_corners", # Odds Features (24) - Market wisdom "odds_ms_h", "odds_ms_d", "odds_ms_a", "implied_home", "implied_draw", "implied_away", "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", "odds_ou05_o", "odds_ou05_u", "odds_ou15_o", "odds_ou15_u", "odds_ou25_o", "odds_ou25_u", "odds_ou35_o", "odds_ou35_u", "odds_ht_ou05_o", "odds_ht_ou05_u", "odds_ht_ou15_o", "odds_ht_ou15_u", "odds_btts_y", "odds_btts_n", "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present", "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present", "odds_ou05_o_present", "odds_ou05_u_present", "odds_ou15_o_present", "odds_ou15_u_present", "odds_ou25_o_present", "odds_ou25_u_present", "odds_ou35_o_present", "odds_ou35_u_present", "odds_ht_ou05_o_present", "odds_ht_ou05_u_present", "odds_ht_ou15_o_present", "odds_ht_ou15_u_present", "odds_btts_y_present", "odds_btts_n_present", # League Features (4) "home_xga", "away_xga", "league_avg_goals", "league_zero_goal_rate", # Upset Engine (4) "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential", # Referee Engine (5) "referee_home_bias", "referee_avg_goals", "referee_cards_total", "referee_avg_yellow", "referee_experience", # Momentum Engine (3) "home_momentum_score", "away_momentum_score", "momentum_diff", # Squad Features (9) "home_squad_quality", "away_squad_quality", "squad_diff", "home_key_players", "away_key_players", "home_missing_impact", "away_missing_impact", "home_goals_form", "away_goals_form", ] # REMOVED: total_goals, ht_total_goals (TARGET LEAKAGE!) # These are only known AFTER the match ends print(f"[INFO] Total features: {len(FEATURES)}") MARKET_CONFIGS = [ {"target": "label_ms", "name": "MS", "num_class": 3}, {"target": "label_ou15", "name": "OU15", "num_class": 2}, {"target": "label_ou25", "name": "OU25", "num_class": 2}, {"target": "label_ou35", "name": "OU35", "num_class": 2}, {"target": "label_btts", "name": "BTTS", "num_class": 2}, {"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3}, {"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2}, {"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2}, {"target": "label_ht_ft", "name": "HTFT", "num_class": 9}, {"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2}, {"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2}, {"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3}, ] def load_data(): """Load training data from CSV.""" if not os.path.exists(DATA_PATH): print(f"[ERROR] Data file not found: {DATA_PATH}") print("[INFO] Run extract_training_data.py first to generate training data") sys.exit(1) print(f"[INFO] Loading data from {DATA_PATH}...") df = pd.read_csv(DATA_PATH) # Fill NaN values for col in FEATURES: if col in df.columns: df[col] = df[col].fillna(0) # Backward-compatible derivation for older CSVs without odds availability flags. odds_flag_sources = { "odds_ms_h_present": "odds_ms_h", "odds_ms_d_present": "odds_ms_d", "odds_ms_a_present": "odds_ms_a", "odds_ht_ms_h_present": "odds_ht_ms_h", "odds_ht_ms_d_present": "odds_ht_ms_d", "odds_ht_ms_a_present": "odds_ht_ms_a", "odds_ou05_o_present": "odds_ou05_o", "odds_ou05_u_present": "odds_ou05_u", "odds_ou15_o_present": "odds_ou15_o", "odds_ou15_u_present": "odds_ou15_u", "odds_ou25_o_present": "odds_ou25_o", "odds_ou25_u_present": "odds_ou25_u", "odds_ou35_o_present": "odds_ou35_o", "odds_ou35_u_present": "odds_ou35_u", "odds_ht_ou05_o_present": "odds_ht_ou05_o", "odds_ht_ou05_u_present": "odds_ht_ou05_u", "odds_ht_ou15_o_present": "odds_ht_ou15_o", "odds_ht_ou15_u_present": "odds_ht_ou15_u", "odds_btts_y_present": "odds_btts_y", "odds_btts_n_present": "odds_btts_n", } for flag_col, odds_col in odds_flag_sources.items(): if flag_col not in df.columns: df[flag_col] = ( pd.to_numeric(df.get(odds_col, 0), errors="coerce").fillna(0) > 1.01 ).astype(float) print(f"[INFO] Shape: {df.shape}") print(f"[INFO] Columns: {list(df.columns)}") return df def temporal_split(valid_df: pd.DataFrame): """Chronological train/val/test split.""" ordered = valid_df.sort_values("mst_utc").reset_index(drop=True) n = len(ordered) train_end = max(int(n * 0.70), 1) val_end = max(int(n * 0.85), train_end + 1) val_end = min(val_end, n - 1) train_df = ordered.iloc[:train_end].copy() val_df = ordered.iloc[train_end:val_end].copy() test_df = ordered.iloc[val_end:].copy() return train_df, val_df, test_df def train_xgboost_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"): """Train XGBoost model with early stopping.""" print(f"\n[INFO] Training XGBoost for {market_name}...") params = { "objective": "multi:softprob" if num_class > 2 else "binary:logistic", "eval_metric": "mlogloss" if num_class > 2 else "logloss", "max_depth": 6, "eta": 0.05, "subsample": 0.8, "colsample_bytree": 0.8, "min_child_weight": 3, "gamma": 0.1, "n_jobs": 4, "random_state": 42, } if num_class > 2: params["num_class"] = num_class dtrain = xgb.DMatrix(X_train, label=y_train) dval = xgb.DMatrix(X_val, label=y_val) evals_result = {} model = xgb.train( params, dtrain, num_boost_round=1000, evals=[(dtrain, 'train'), (dval, 'val')], early_stopping_rounds=50, evals_result=evals_result, verbose_eval=100, ) print(f"[OK] Best iteration: {model.best_iteration}") print(f"[OK] Best score: {model.best_score:.4f}") return model def train_lightgbm_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"): """Train LightGBM model with early stopping.""" print(f"\n[INFO] Training LightGBM for {market_name}...") params = { "objective": "multiclass" if num_class > 2 else "binary", "metric": "multi_logloss" if num_class > 2 else "binary_logloss", "max_depth": 6, "learning_rate": 0.05, "feature_fraction": 0.8, "bagging_fraction": 0.8, "bagging_freq": 5, "min_child_samples": 20, "n_jobs": 4, "random_state": 42, "verbose": -1, } if num_class > 2: params["num_class"] = num_class train_data = lgb.Dataset(X_train, label=y_train) val_data = lgb.Dataset(X_val, label=y_val, reference=train_data) model = lgb.train( params, train_data, num_boost_round=1000, valid_sets=[train_data, val_data], valid_names=['train', 'val'], callbacks=[ lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100), ], ) print(f"[OK] Best iteration: {model.best_iteration}") print(f"[OK] Best score: {model.best_score['val'][params['metric']]:.4f}") return model def evaluate_model(model, X_test, y_test, model_type='xgb', num_class=3): """Evaluate model on test set.""" if model_type == 'xgb': dtest = xgb.DMatrix(X_test) probs = model.predict(dtest) else: # lgb probs = model.predict(X_test, num_iteration=model.best_iteration) if len(probs.shape) == 1: # Binary classification probs = np.column_stack([1 - probs, probs]) preds = np.argmax(probs, axis=1) acc = accuracy_score(y_test, preds) loss = log_loss(y_test, probs) print(f"\n[RESULTS] Test Results:") print(f" Accuracy: {acc:.4f}") print(f" Log Loss: {loss:.4f}") # Per-class metrics print("\n[REPORT] Classification Report:") print(classification_report(y_test, preds)) return probs, acc, loss def train_market(df, target_col, market_name, num_class=3): """Train models for a specific market.""" print(f"\n{'='*60}") print(f"[MARKET] Training {market_name}") print(f"{'='*60}") # Filter valid rows valid_df = df[df[target_col].notna()].copy() valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy() print(f"[INFO] Valid samples: {len(valid_df)}") if len(valid_df) < 100: print(f"[ERROR] Not enough data for {market_name}") return None, None # Prepare features available_features = [f for f in FEATURES if f in valid_df.columns] print(f"[INFO] Available features: {len(available_features)}/{len(FEATURES)}") train_df, val_df, test_df = temporal_split(valid_df) X_train = train_df[available_features].values X_val = val_df[available_features].values X_test = test_df[available_features].values y_train = train_df[target_col].astype(int).values y_val = val_df[target_col].astype(int).values y_test = test_df[target_col].astype(int).values print( f"[INFO] Temporal split -> Train: {len(X_train)}," f" Val: {len(X_val)}, Test: {len(X_test)}" ) print( f"[INFO] Time windows -> train_end={int(train_df['mst_utc'].max())}," f" val_end={int(val_df['mst_utc'].max())}," f" test_end={int(test_df['mst_utc'].max())}" ) # Train XGBoost xgb_model = train_xgboost_model(X_train, y_train, X_val, y_val, num_class, market_name) # Train LightGBM lgb_model = train_lightgbm_model(X_train, y_train, X_val, y_val, num_class, market_name) # Evaluate print("\n[INFO] XGBoost Evaluation:") xgb_probs, xgb_acc, xgb_loss = evaluate_model(xgb_model, X_test, y_test, 'xgb', num_class) print("\n[INFO] LightGBM Evaluation:") lgb_probs, lgb_acc, lgb_loss = evaluate_model(lgb_model, X_test, y_test, 'lgb', num_class) # Ensemble evaluation ensemble_probs = (xgb_probs + lgb_probs) / 2 ensemble_preds = np.argmax(ensemble_probs, axis=1) ensemble_acc = accuracy_score(y_test, ensemble_preds) ensemble_loss = log_loss(y_test, ensemble_probs) print(f"\n[INFO] Ensemble Evaluation:") print(f" Accuracy: {ensemble_acc:.4f}") print(f" Log Loss: {ensemble_loss:.4f}") # Save models xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json") xgb_model.save_model(xgb_path) print(f"[OK] XGBoost saved: {xgb_path}") lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt") lgb_model.save_model(lgb_path) print(f"[OK] LightGBM saved: {lgb_path}") metrics = { "samples": int(len(valid_df)), "features_used": available_features, "train_samples": int(len(X_train)), "val_samples": int(len(X_val)), "test_samples": int(len(X_test)), "xgb_accuracy": round(float(xgb_acc), 4), "xgb_logloss": round(float(xgb_loss), 4), "lgb_accuracy": round(float(lgb_acc), 4), "lgb_logloss": round(float(lgb_loss), 4), "ensemble_accuracy": round(float(ensemble_acc), 4), "ensemble_logloss": round(float(ensemble_loss), 4), "class_count": int(num_class), } return xgb_model, lgb_model, metrics def main(): """Main training pipeline.""" print("="*60) print("V25 Model Training - NO TARGET LEAKAGE") print("="*60) print(f"[INFO] Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") # Load data df = load_data() target_cols = [col for col in df.columns if col.startswith('label_')] print(f"\n[INFO] Available targets: {target_cols}") results = {} reports = { "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "market_results": {}, } for config in MARKET_CONFIGS: target = config["target"] market_name = config["name"] num_class = config["num_class"] if target not in df.columns: print(f"[SKIP] {market_name}: missing target column {target}") continue xgb_model, lgb_model, metrics = train_market( df, target, market_name, num_class=num_class ) results[market_name] = { 'xgb': xgb_model is not None, 'lgb': lgb_model is not None, } reports["market_results"][market_name] = metrics # Save feature list feature_path = os.path.join(MODELS_DIR, "feature_cols.json") with open(feature_path, 'w') as f: json.dump(FEATURES, f, indent=2) print(f"\n[OK] Feature list saved: {feature_path}") report_path = os.path.join(REPORTS_DIR, "v25_market_metrics.json") with open(report_path, "w") as f: json.dump(reports, f, indent=2) print(f"[OK] Metrics report saved: {report_path}") # Summary print("\n" + "="*60) print("[SUMMARY] Training Results") print("="*60) for market, status in results.items(): print(f" {market}: XGB={status['xgb']}, LGB={status['lgb']}") print(f"\n[INFO] Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print("[OK] V25 Training Complete!") if __name__ == "__main__": main()