iddaai-be/ai-engine/scripts/train_v25_clean.py

"""
V25 Model Trainer - NO TARGET LEAKAGE
=====================================
Training script for V25 ensemble model.

CRITICAL: This version removes total_goals and ht_total_goals features
to prevent target leakage. These features are only known AFTER the match ends.

Usage:
  python scripts/train_v25_clean.py
"""

import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
from sklearn.metrics import accuracy_score, log_loss, classification_report

# Add parent directory to path
# (two levels up from this file, i.e. the directory that contains `scripts/`)
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Config
# AI_ENGINE_DIR is the same directory that was just put on sys.path; every
# input/output location below is derived from it.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# CSV read by load_data().
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# Destination for the saved XGBoost/LightGBM models and feature_cols.json.
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
# Destination for the per-market metrics report written by main().
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

# NOTE: the output directories are created as a side effect of importing or
# running this module, before any training starts.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Feature Columns - NO TARGET LEAKAGE
# These features are available BEFORE the match starts
# The order of this list is the column order of the training matrices built in
# train_market(); columns missing from the CSV are skipped there.
FEATURES = [
    # ELO Features (8)
    "home_overall_elo", "away_overall_elo", "elo_diff",
    "home_home_elo", "away_away_elo",
    "home_form_elo", "away_form_elo", "form_elo_diff",

    # Form Features (12)
    "home_goals_avg", "home_conceded_avg",
    "away_goals_avg", "away_conceded_avg",
    "home_clean_sheet_rate", "away_clean_sheet_rate",
    "home_scoring_rate", "away_scoring_rate",
    "home_winning_streak", "away_winning_streak",
    "home_unbeaten_streak", "away_unbeaten_streak",

    # H2H Features (6)
    "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
    "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",

    # Team Stats Features (8)
    "home_avg_possession", "away_avg_possession",
    "home_avg_shots_on_target", "away_avg_shots_on_target",
    "home_shot_conversion", "away_shot_conversion",
    "home_avg_corners", "away_avg_corners",

    # Odds Features (43) - Market wisdom
    # 23 odds / implied_* columns followed by 20 `*_present` availability
    # flags (one per raw odds column; the implied_* columns have no flag).
    # load_data() derives a missing flag as 1.0 when the odds value is > 1.01.
    "odds_ms_h", "odds_ms_d", "odds_ms_a",
    "implied_home", "implied_draw", "implied_away",
    "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
    "odds_ou05_o", "odds_ou05_u",
    "odds_ou15_o", "odds_ou15_u",
    "odds_ou25_o", "odds_ou25_u",
    "odds_ou35_o", "odds_ou35_u",
    "odds_ht_ou05_o", "odds_ht_ou05_u",
    "odds_ht_ou15_o", "odds_ht_ou15_u",
    "odds_btts_y", "odds_btts_n",
    "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
    "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
    "odds_ou05_o_present", "odds_ou05_u_present",
    "odds_ou15_o_present", "odds_ou15_u_present",
    "odds_ou25_o_present", "odds_ou25_u_present",
    "odds_ou35_o_present", "odds_ou35_u_present",
    "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
    "odds_btts_y_present", "odds_btts_n_present",

    # League Features (4)
    "home_xga", "away_xga",
    "league_avg_goals", "league_zero_goal_rate",

    # Upset Engine (4)
    "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",

    # Referee Engine (5)
    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
    "referee_avg_yellow", "referee_experience",

    # Momentum Engine (3)
    "home_momentum_score", "away_momentum_score", "momentum_diff",

    # Squad Features (9)
    "home_squad_quality", "away_squad_quality", "squad_diff",
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "home_goals_form", "away_goals_form",
]

# REMOVED: total_goals, ht_total_goals (TARGET LEAKAGE!)
# These are only known AFTER the match ends

# Printed at import time as a quick sanity check of the list length.
print(f"[INFO] Total features: {len(FEATURES)}")

# One entry per market that main() tries to train.
#   target    - label column in the training CSV; markets whose column is
#               absent are skipped by main(). Values are cast to int and used
#               as class indices by train_market().
#   name      - short market name; lower-cased into the saved model file names
#               and used as the key in the metrics report.
#   num_class - number of label classes; values above 2 select the multiclass
#               objectives in the trainers, otherwise the binary ones.
MARKET_CONFIGS = [
    {"target": "label_ms", "name": "MS", "num_class": 3},
    {"target": "label_ou15", "name": "OU15", "num_class": 2},
    {"target": "label_ou25", "name": "OU25", "num_class": 2},
    {"target": "label_ou35", "name": "OU35", "num_class": 2},
    {"target": "label_btts", "name": "BTTS", "num_class": 2},
    {"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
    {"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
    {"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
    {"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
    {"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
    {"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
    {"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
]


def load_data(data_path=None, features=None):
    """Load the training CSV and normalise its feature columns.

    Steps performed on the loaded frame:

    1. NaN values in every listed feature column that exists in the CSV are
       replaced with 0.
    2. For older CSVs that lack the ``odds_*_present`` availability flags,
       each missing flag is derived from its odds column: 1.0 when the odds
       value is numeric and greater than 1.01, otherwise 0.0. When the odds
       column itself is also missing the flag is set to 0.0 for every row.

    Args:
        data_path: CSV file to read. Defaults to the module-level
            ``DATA_PATH``.
        features: Iterable of feature column names whose NaNs are filled.
            Defaults to the module-level ``FEATURES``.

    Returns:
        pandas.DataFrame with all CSV columns plus any derived flag columns.

    Raises:
        SystemExit: (exit code 1) when the CSV file does not exist.
    """
    csv_path = DATA_PATH if data_path is None else data_path
    feature_cols = FEATURES if features is None else features

    if not os.path.exists(csv_path):
        print(f"[ERROR] Data file not found: {csv_path}")
        print("[INFO] Run extract_training_data.py first to generate training data")
        sys.exit(1)

    print(f"[INFO] Loading data from {csv_path}...")
    df = pd.read_csv(csv_path)

    # Fill NaN values
    for col in feature_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Backward-compatible derivation for older CSVs without odds availability flags.
    # Every flag column is named "<odds column>_present".
    odds_columns = (
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    )
    for odds_col in odds_columns:
        flag_col = f"{odds_col}_present"
        if flag_col in df.columns:
            continue
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # No odds column at all: the odds were never available. Handling
            # this explicitly avoids calling .fillna() on the scalar that
            # pd.to_numeric() returns for a scalar default.
            df[flag_col] = 0.0

    print(f"[INFO] Shape: {df.shape}")
    print(f"[INFO] Columns: {list(df.columns)}")
    return df


def temporal_split(valid_df: pd.DataFrame):
    """Chronological train/val/test split.

    Rows are ordered by ``mst_utc`` ascending and cut into roughly
    70% / 15% / 15% contiguous segments, so every validation row is no
    earlier than every training row and every test row is no earlier than
    every validation row.

    Args:
        valid_df: Frame containing an ``mst_utc`` column to sort by.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of independent copies with a
        fresh 0..n-1 based index taken from the sorted frame. Each part has
        at least one row.

    Raises:
        ValueError: if fewer than 3 rows are supplied; three non-empty,
            non-overlapping segments cannot be formed in that case.
    """
    ordered = valid_df.sort_values("mst_utc").reset_index(drop=True)
    n = len(ordered)
    if n < 3:
        raise ValueError(
            f"temporal_split needs at least 3 rows to build train/val/test, got {n}"
        )

    # Cap the boundaries so validation and test each keep at least one row.
    # For n >= 4 the caps never bind and the boundaries are the plain
    # 70% / 85% cut points.
    train_end = min(max(int(n * 0.70), 1), n - 2)
    val_end = min(max(int(n * 0.85), train_end + 1), n - 1)

    train_df = ordered.iloc[:train_end].copy()
    val_df = ordered.iloc[train_end:val_end].copy()
    test_df = ordered.iloc[val_end:].copy()

    return train_df, val_df, test_df


def train_xgboost_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit an XGBoost booster for one market, stopping early on validation loss.

    Args:
        X_train, y_train: Training feature matrix and integer labels.
        X_val, y_val: Validation feature matrix and labels; listed last in
            the watch list passed to ``xgb.train``.
        num_class: Number of label classes. More than 2 selects
            ``multi:softprob``/``mlogloss``; otherwise
            ``binary:logistic``/``logloss``.
        market_name: Name used only in the progress messages.

    Returns:
        The booster returned by ``xgb.train`` (up to 1000 rounds, early
        stopping patience of 50 rounds).
    """

    print(f"\n[INFO] Training XGBoost for {market_name}...")

    # Objective-specific settings first, shared hyper-parameters afterwards.
    if num_class > 2:
        booster_params = {
            "objective": "multi:softprob",
            "eval_metric": "mlogloss",
            "num_class": num_class,
        }
    else:
        booster_params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
        }
    booster_params.update(
        max_depth=6,
        eta=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        gamma=0.1,
        n_jobs=4,
        random_state=42,
    )

    train_matrix = xgb.DMatrix(X_train, label=y_train)
    val_matrix = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'val')]

    history = {}
    booster = xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        evals_result=history,
        verbose_eval=100,
    )

    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {booster.best_score:.4f}")

    return booster


def train_lightgbm_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit a LightGBM booster for one market, stopping early on validation loss.

    Args:
        X_train, y_train: Training feature matrix and integer labels.
        X_val, y_val: Validation feature matrix and labels, registered under
            the name ``'val'``.
        num_class: Number of label classes. More than 2 selects
            ``multiclass``/``multi_logloss``; otherwise
            ``binary``/``binary_logloss``.
        market_name: Name used only in the progress messages.

    Returns:
        The booster returned by ``lgb.train`` (up to 1000 rounds, early
        stopping patience of 50 rounds, evaluation logged every 100 rounds).
    """

    print(f"\n[INFO] Training LightGBM for {market_name}...")

    multiclass = num_class > 2
    metric_name = "multi_logloss" if multiclass else "binary_logloss"

    lgb_params = {
        "objective": "multiclass" if multiclass else "binary",
        "metric": metric_name,
        "max_depth": 6,
        "learning_rate": 0.05,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "min_child_samples": 20,
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
    }
    if multiclass:
        lgb_params["num_class"] = num_class

    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    stop_early = lgb.early_stopping(stopping_rounds=50)
    log_progress = lgb.log_evaluation(period=100)

    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'val'],
        callbacks=[stop_early, log_progress],
    )

    val_score = booster.best_score['val'][metric_name]
    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {val_score:.4f}")

    return booster


def evaluate_model(model, X_test, y_test, model_type='xgb', num_class=3):
    """Evaluate a trained booster on the held-out test set.

    Both model types are scored at their early-stopping optimum: the
    LightGBM branch passes ``num_iteration=model.best_iteration`` and the
    XGBoost branch restricts prediction to ``[0, best_iteration]`` via
    ``iteration_range`` (the native ``Booster.predict`` would otherwise use
    every trained round, including those after the best one).

    Args:
        model: XGBoost or LightGBM booster.
        X_test: Test feature matrix.
        y_test: Integer class labels for ``X_test``.
        model_type: ``'xgb'`` for XGBoost; any other value is treated as
            LightGBM.
        num_class: Kept for interface compatibility; the class count used
            for scoring is taken from the width of the probability matrix.

    Returns:
        Tuple ``(probs, acc, loss)``: the ``(n_samples, n_classes)``
        probability matrix, the accuracy and the log loss.
    """

    if model_type == 'xgb':
        dtest = xgb.DMatrix(X_test)
        # best_iteration only exists when early stopping was active.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is None:
            probs = model.predict(dtest)
        else:
            probs = model.predict(
                dtest, iteration_range=(0, int(best_iteration) + 1)
            )
    else:  # lgb
        probs = model.predict(X_test, num_iteration=model.best_iteration)

    if len(probs.shape) == 1:
        # Binary classification: expand P(class 1) into [P(0), P(1)] columns.
        probs = np.column_stack([1 - probs, probs])

    preds = np.argmax(probs, axis=1)

    acc = accuracy_score(y_test, preds)
    # Pass the full label set explicitly: a chronological test split may not
    # contain every class, and log_loss would otherwise reject the mismatch
    # between the labels seen in y_test and the number of probability columns.
    class_labels = list(range(probs.shape[1]))
    loss = log_loss(y_test, probs, labels=class_labels)

    print(f"\n[RESULTS] Test Results:")
    print(f"   Accuracy: {acc:.4f}")
    print(f"   Log Loss: {loss:.4f}")

    # Per-class metrics
    print("\n[REPORT] Classification Report:")
    print(classification_report(y_test, preds))

    return probs, acc, loss


def train_market(df, target_col, market_name, num_class=3):
    """Train, evaluate and save the XGBoost + LightGBM pair for one market.

    Rows without a label are dropped, the rest are split chronologically,
    both models are trained with early stopping on the validation part and
    scored (individually and as an equal-weight probability average) on the
    test part. The models are written to ``MODELS_DIR``.

    Args:
        df: Full training frame as returned by ``load_data``.
        target_col: Name of the label column for this market.
        market_name: Short market name; lower-cased into the model file names.
        num_class: Number of label classes for this market.

    Returns:
        Tuple ``(xgb_model, lgb_model, metrics)``. When fewer than 100
        labelled rows are available nothing is trained and
        ``(None, None, None)`` is returned, so callers can always unpack
        three values.
    """

    print(f"\n{'='*60}")
    print(f"[MARKET] Training {market_name}")
    print(f"{'='*60}")

    # Filter valid rows
    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")

    if len(valid_df) < 100:
        print(f"[ERROR] Not enough data for {market_name}")
        # Same arity as the success path below.
        return None, None, None

    # Prepare features
    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Available features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, test_df = temporal_split(valid_df)
    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_test = test_df[available_features].values
    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values

    print(
        f"[INFO] Temporal split -> Train: {len(X_train)},"
        f" Val: {len(X_val)}, Test: {len(X_test)}"
    )
    print(
        f"[INFO] Time windows -> train_end={int(train_df['mst_utc'].max())},"
        f" val_end={int(val_df['mst_utc'].max())},"
        f" test_end={int(test_df['mst_utc'].max())}"
    )

    # Train XGBoost
    xgb_model = train_xgboost_model(X_train, y_train, X_val, y_val, num_class, market_name)

    # Train LightGBM
    lgb_model = train_lightgbm_model(X_train, y_train, X_val, y_val, num_class, market_name)

    # Evaluate
    print("\n[INFO] XGBoost Evaluation:")
    xgb_probs, xgb_acc, xgb_loss = evaluate_model(xgb_model, X_test, y_test, 'xgb', num_class)

    print("\n[INFO] LightGBM Evaluation:")
    lgb_probs, lgb_acc, lgb_loss = evaluate_model(lgb_model, X_test, y_test, 'lgb', num_class)

    # Ensemble evaluation: unweighted mean of the two probability matrices.
    ensemble_probs = (xgb_probs + lgb_probs) / 2
    ensemble_preds = np.argmax(ensemble_probs, axis=1)
    ensemble_acc = accuracy_score(y_test, ensemble_preds)
    # Explicit labels keep log_loss working when the test split happens to
    # miss one of the classes.
    ensemble_loss = log_loss(
        y_test, ensemble_probs, labels=list(range(ensemble_probs.shape[1]))
    )

    print(f"\n[INFO] Ensemble Evaluation:")
    print(f"   Accuracy: {ensemble_acc:.4f}")
    print(f"   Log Loss: {ensemble_loss:.4f}")

    # Save models
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
    xgb_model.save_model(xgb_path)
    print(f"[OK] XGBoost saved: {xgb_path}")

    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[OK] LightGBM saved: {lgb_path}")

    metrics = {
        "samples": int(len(valid_df)),
        "features_used": available_features,
        "train_samples": int(len(X_train)),
        "val_samples": int(len(X_val)),
        "test_samples": int(len(X_test)),
        "xgb_accuracy": round(float(xgb_acc), 4),
        "xgb_logloss": round(float(xgb_loss), 4),
        "lgb_accuracy": round(float(lgb_acc), 4),
        "lgb_logloss": round(float(lgb_loss), 4),
        "ensemble_accuracy": round(float(ensemble_acc), 4),
        "ensemble_logloss": round(float(ensemble_loss), 4),
        "class_count": int(num_class),
    }

    return xgb_model, lgb_model, metrics


def main():
    """Main training pipeline.

    Loads the training data, trains every market in ``MARKET_CONFIGS`` whose
    target column exists, then writes:

    * ``feature_cols.json`` in ``MODELS_DIR`` - the ordered feature columns
      the models were actually trained on (the entries of ``FEATURES`` that
      are present in the data);
    * ``v25_market_metrics.json`` in ``REPORTS_DIR`` - per-market metrics
      (``null`` for markets skipped due to insufficient data).

    A per-market summary is printed at the end.
    """

    print("="*60)
    print("V25 Model Training - NO TARGET LEAKAGE")
    print("="*60)
    print(f"[INFO] Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Load data
    df = load_data()

    target_cols = [col for col in df.columns if col.startswith('label_')]
    print(f"\n[INFO] Available targets: {target_cols}")

    results = {}
    reports = {
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "market_results": {},
    }

    for config in MARKET_CONFIGS:
        target = config["target"]
        market_name = config["name"]
        num_class = config["num_class"]

        if target not in df.columns:
            print(f"[SKIP] {market_name}: missing target column {target}")
            continue

        outcome = train_market(df, target, market_name, num_class=num_class)
        # A market skipped for lack of data may come back as a tuple shorter
        # than three items; pad with None so one skipped market does not
        # abort the whole run with an unpacking error.
        xgb_model, lgb_model, metrics = (tuple(outcome) + (None, None, None))[:3]
        results[market_name] = {
            'xgb': xgb_model is not None,
            'lgb': lgb_model is not None,
        }
        reports["market_results"][market_name] = metrics

    # Save feature list
    # Training only uses the FEATURES entries that exist as columns, so the
    # persisted list must be that same subset (same order); otherwise it
    # would not match the width of the saved models' input.
    trained_features = [f for f in FEATURES if f in df.columns]
    missing_features = [f for f in FEATURES if f not in df.columns]
    if missing_features:
        print(f"[WARN] Features missing from data (not trained on): {missing_features}")

    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, 'w', encoding="utf-8") as f:
        json.dump(trained_features, f, indent=2)
    print(f"\n[OK] Feature list saved: {feature_path}")

    report_path = os.path.join(REPORTS_DIR, "v25_market_metrics.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(reports, f, indent=2)
    print(f"[OK] Metrics report saved: {report_path}")

    # Summary
    print("\n" + "="*60)
    print("[SUMMARY] Training Results")
    print("="*60)
    for market, status in results.items():
        print(f"   {market}: XGB={status['xgb']}, LGB={status['lgb']}")

    print(f"\n[INFO] Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("[OK] V25 Training Complete!")

if __name__ == "__main__":
    main()