# iddaai-be/ai-engine/scripts/train_league_models.py

"""
League-Specific Model Trainer
==============================
Trains dedicated XGBoost models + isotonic calibration for each qualified league.

Tiers:
  - >=500 FT matches  → full XGBoost (12 markets) + calibration
  - 100-499 matches   → isotonic calibration only (over general V25 predictions)
  - <100 matches      → skipped

Usage:
  python scripts/train_league_models.py
  python scripts/train_league_models.py --min-samples 300   # stricter threshold
  python scripts/train_league_models.py --colab             # Colab-friendly output
"""

import os
import sys
import json
import pickle
import argparse
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import accuracy_score, log_loss

# Silence every Python warning for the whole process (applies to all libraries).
warnings.filterwarnings("ignore")

# Optuna is optional: record whether it imported and quiet its logger if so.
# NOTE(review): `optuna_available` is not read anywhere else in the visible file.
optuna_available = False
try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    optuna_available = True
except ImportError:
    pass

# Put the parent of this script's directory first on sys.path; presumably this
# is what lets the function-local `models.v25_ensemble` / `data.db` imports
# further down resolve — TODO confirm.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# All input/output locations are derived from the same directory.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH     = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR    = os.path.join(AI_ENGINE_DIR, "models", "league_specific")
REPORTS_DIR   = os.path.join(AI_ENGINE_DIR, "reports", "league_models")
# The qualified-league list lives one directory above AI_ENGINE_DIR.
QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")

# Output directories are created as a side effect of importing/running this module.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# ─── Markets ────────────────────────────────────────────────────────
# One entry per trained market, keyed by the market code used in output file names.
#   label       – label column in the training CSV
#   num_class   – number of classes; values > 2 select the multi-class
#                 parameter set in train_xgb_market
#   min_samples – minimum number of usable training rows; process_league
#                 skips the market for a league that has fewer
MARKETS = {
    "MS":         {"label": "label_ms",         "num_class": 3, "min_samples": 200},
    "OU15":       {"label": "label_ou15",        "num_class": 2, "min_samples": 150},
    "OU25":       {"label": "label_ou25",        "num_class": 2, "min_samples": 150},
    "OU35":       {"label": "label_ou35",        "num_class": 2, "min_samples": 150},
    "BTTS":       {"label": "label_btts",        "num_class": 2, "min_samples": 150},
    "HT":         {"label": "label_ht_result",   "num_class": 3, "min_samples": 150},
    "HT_OU05":    {"label": "label_ht_ou05",     "num_class": 2, "min_samples": 150},
    "HT_OU15":    {"label": "label_ht_ou15",     "num_class": 2, "min_samples": 150},
    "HTFT":       {"label": "label_ht_ft",       "num_class": 9, "min_samples": 300},
    "OE":         {"label": "label_odd_even",    "num_class": 2, "min_samples": 150},
    "CARDS":      {"label": "label_cards_ou45",  "num_class": 2, "min_samples": 150},
    "HANDICAP":   {"label": "label_handicap_ms", "num_class": 3, "min_samples": 200},
}

# Columns of training_data.csv that are never used as features: metadata,
# score columns and every label column. get_feature_cols() treats every other
# column as a feature. Note that label_ou05 and label_yellow_cards are
# excluded here although no MARKETS entry trains on them.
SKIP_COLS = {
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away",
    "ht_total_goals",
    "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
    "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
    "label_ht_ft", "label_odd_even", "label_yellow_cards", "label_cards_ou45",
    "label_handicap_ms",
}

# XGBoost defaults — fast, no Optuna
# Fixed hyper-parameters shared by every binary market.
XGB_PARAMS_BINARY = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "gamma": 0.1,
    "reg_lambda": 1.0,
    "verbosity": 0,
    "seed": 42,
    "nthread": -1,
}

# Multi-class variant: same settings with the objective and eval metric
# overridden. "num_class" is added per market inside train_xgb_market.
XGB_PARAMS_MULTI = {
    **XGB_PARAMS_BINARY,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
}


def load_data() -> pd.DataFrame:
    """Read the training CSV at ``DATA_PATH`` and print its dimensions.

    Returns:
        The full training table as a DataFrame.
    """
    print(f"Loading training data from {DATA_PATH} ...")
    frame = pd.read_csv(DATA_PATH, low_memory=False)
    n_rows, n_cols = frame.shape
    print(f"  {n_rows:,} rows, {n_cols} columns")
    return frame


def get_feature_cols(df: pd.DataFrame) -> list:
    """Return the columns of ``df`` that are not listed in ``SKIP_COLS``.

    Column order follows ``df.columns``.
    """
    selected = []
    for column in df.columns:
        if column in SKIP_COLS:
            continue
        selected.append(column)
    return selected


def load_qualified_leagues() -> list:
    """Load the qualified-league list from ``QUALIFIED_LEAGUES_PATH``.

    Returns:
        The parsed JSON content, or an empty list when the file does not
        exist. ``main`` treats an empty result as "use every league found in
        the CSV".
    """
    if not os.path.exists(QUALIFIED_LEAGUES_PATH):
        # fallback: all leagues in CSV (resolved by the caller)
        return []
    with open(QUALIFIED_LEAGUES_PATH) as handle:
        qualified = json.load(handle)
    return qualified


def train_xgb_market(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    num_class: int,
    feature_cols: list,
) -> tuple:
    """Train XGBoost for one market. Returns (model, accuracy, logloss).

    Args:
        X_train, y_train: training features and integer class labels.
        X_test, y_test: held-out features and labels. They serve both as the
            early-stopping validation set and as the evaluation set.
        num_class: number of classes for the market; values above 2 switch to
            the multi-class parameter set, anything else trains a binary model.
        feature_cols: feature names attached to both DMatrix objects.

    Returns:
        Tuple of the trained Booster, accuracy on the test set and log-loss
        on the test set.
    """
    is_multiclass = num_class > 2
    params = dict(XGB_PARAMS_MULTI if is_multiclass else XGB_PARAMS_BINARY)
    if is_multiclass:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
    dtest  = xgb.DMatrix(X_test,  label=y_test,  feature_names=feature_cols)

    # NOTE(review): the test split doubles as the early-stopping validation
    # set, so the metrics computed below are measured on data that influenced
    # the stopping point.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dtest, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    raw = model.predict(dtest)
    if is_multiclass:
        probs = raw.reshape(-1, num_class)
        preds = np.argmax(probs, axis=1)
        labels = list(range(num_class))
    else:
        probs = raw
        preds = (raw >= 0.5).astype(int)
        labels = [0, 1]

    # Pass the full label set explicitly: a small test slice may not contain
    # every class (easy with the 9-class market), and without `labels`
    # log_loss raises because y_true and the probability columns disagree.
    ll = log_loss(y_test, probs, labels=labels)

    acc = accuracy_score(y_test, preds)
    return model, acc, ll


def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
    """Fit an isotonic calibrator that maps raw probabilities to outcomes.

    Inputs outside the fitted range are clipped at prediction time
    (``out_of_bounds="clip"``).
    """
    # fit() returns the estimator itself, so the fitted calibrator can be returned directly.
    return IsotonicRegression(out_of_bounds="clip").fit(raw_probs, y_true)


def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
    """Use general V25 model to get predictions on this league's matches (for cal-only leagues).

    Args:
        df_league: rows of one league; must contain ``feature_cols`` and the
            market's label column.
        feature_cols: feature column names fed to the model.
        market: key into ``MARKETS`` (e.g. "MS", "OU25").
        num_class: number of classes; above 2 the raw output is reshaped to
            ``(n_rows, num_class)``.

    Returns:
        ``(probs, y)`` — predicted probabilities and the matching labels — or
        ``(None, None)`` when fewer than 50 complete rows exist, the V25
        predictor has no XGBoost model for the market, or anything fails.
        Failures are reported on stdout; the function never raises.
    """
    try:
        from models.v25_ensemble import get_v25_predictor
        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()

        label_col = MARKETS[market]["label"]
        # Only rows with every feature and the label present are used.
        valid = df_league[feature_cols + [label_col]].dropna()
        if len(valid) < 50:
            return None, None

        market_key_map = {
            "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
            "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
            "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
            "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
        }
        mkey = market_key_map.get(market)
        if not mkey or not v25.has_market(mkey):
            return None, None

        # Look the model up once instead of once per batch.
        models = v25.models.get(mkey, {})
        if "xgb" not in models:
            return None, None
        booster = models["xgb"]

        X = valid[feature_cols].fillna(0).values
        y = valid[label_col].values

        # Predict in chunks of 500 rows.
        all_probs = []
        for start in range(0, len(X), 500):
            df_batch = pd.DataFrame(X[start:start + 500], columns=feature_cols)
            p = booster.predict(xgb.DMatrix(df_batch))
            if num_class > 2:
                p = p.reshape(-1, num_class)
            all_probs.append(p)

        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
        return probs, y
    except Exception as exc:
        # Best effort by design: report why no predictions are available
        # instead of hiding the cause, then signal "nothing" to the caller.
        print(f"  [warn] V25 predictions unavailable for market {market}: {exc}")
        return None, None


def _fit_and_save_calibrators(
    out_dir: str,
    market: str,
    raw: np.ndarray,
    y_true: np.ndarray,
    num_class: int,
) -> None:
    """Fit isotonic calibrator(s) for one market and pickle them into ``out_dir``.

    Binary markets produce a single ``cal_<market>.pkl``. Multi-class markets
    get one calibrator per class (class-vs-rest target), written as
    ``cal_<market>_<class index>.pkl``.
    """
    stem = f"cal_{market.lower()}"
    if num_class > 2:
        probs = raw.reshape(-1, num_class)
        for cls_idx in range(num_class):
            iso = train_isotonic(probs[:, cls_idx], (y_true == cls_idx).astype(int))
            with open(os.path.join(out_dir, f"{stem}_{cls_idx}.pkl"), "wb") as f:
                pickle.dump(iso, f)
    else:
        iso = train_isotonic(raw, y_true)
        with open(os.path.join(out_dir, f"{stem}.pkl"), "wb") as f:
            pickle.dump(iso, f)


def process_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    full_model: bool,
    league_name: str,
) -> dict:
    """Train models for one league. Returns metrics dict.

    For every market in ``MARKETS`` whose label column exists and that has
    enough complete rows (``min_samples`` in the training split and at least
    30 in the test split):

    * ``full_model=True``: train a league-specific XGBoost model, save it as
      ``xgb_<market>.json``, and fit isotonic calibrators on its test-split
      predictions.
    * ``full_model=False``: fit isotonic calibrators on the general V25
      XGBoost model's predictions over all complete rows of the league.

    Artifacts are written to ``MODELS_DIR/<league_id>/``; ``metrics.json`` is
    always written there. Per-market failures are recorded under the market's
    ``"error"`` key instead of being raised.

    Args:
        league_id: league identifier, also used as the output directory name.
        df_league: all rows of this league from the training table.
        feature_cols: feature column names.
        full_model: selects the tier described above.
        league_name: display name stored in the metrics.

    Returns:
        Dict with ``league_id``, ``league_name``, ``n_matches`` and a
        ``markets`` mapping of per-market metrics.
    """
    n = len(df_league)
    # str() keeps the path valid even if the ID column was parsed as integers.
    out_dir = os.path.join(MODELS_DIR, str(league_id))
    os.makedirs(out_dir, exist_ok=True)

    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}

    # Time-based split: last 20% as test
    split_idx = int(n * 0.80)
    df_sorted = df_league.sort_values("mst_utc")
    df_train = df_sorted.iloc[:split_idx]
    df_test  = df_sorted.iloc[split_idx:]

    # Market code -> key used by the V25 predictor (loop-invariant).
    market_key_map = {
        "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
        "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
        "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
        "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
    }

    saved_feature_cols = False

    for market, cfg in MARKETS.items():
        label_col  = cfg["label"]
        num_class  = cfg["num_class"]
        min_samp   = cfg["min_samples"]

        if label_col not in df_league.columns:
            continue

        # NOTE(review): dropna() runs over every feature column, so the
        # fillna(0) calls below are currently no-ops. If rows with missing
        # features were meant to be kept, this should probably be
        # dropna(subset=[label_col]) — confirm intent before changing.
        cols = feature_cols + [label_col]
        valid_train = df_train[cols].dropna()
        valid_test  = df_test[cols].dropna()

        if len(valid_train) < min_samp or len(valid_test) < 30:
            continue

        X_train = valid_train[feature_cols].fillna(0).values
        y_train = valid_train[label_col].values.astype(int)
        X_test  = valid_test[feature_cols].fillna(0).values
        y_test  = valid_test[label_col].values.astype(int)

        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}

        if full_model:
            try:
                model, acc, ll = train_xgb_market(X_train, y_train, X_test, y_test, num_class, feature_cols)
                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
                model.save_model(model_path)
                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})

                # The feature list is identical for every market; write it once.
                if not saved_feature_cols:
                    with open(os.path.join(out_dir, "feature_cols.json"), "w") as f:
                        json.dump(feature_cols, f)
                    saved_feature_cols = True

                # Isotonic calibration from own model predictions
                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
                raw = model.predict(dtest_xgb)
                _fit_and_save_calibrators(out_dir, market, raw, y_test, num_class)

            except Exception as e:
                mkt_metrics["error"] = str(e)
        else:
            # Calibration only: use general V25 model
            try:
                all_valid = df_league[cols].dropna()
                if len(all_valid) < min_samp:
                    continue

                X_all = all_valid[feature_cols].fillna(0).values
                y_all = all_valid[label_col].values.astype(int)

                # Use V25 general model
                from models.v25_ensemble import get_v25_predictor
                v25 = get_v25_predictor()
                if not v25._loaded:
                    v25.load_models()

                mkey = market_key_map.get(market)
                if not mkey or not v25.has_market(mkey):
                    continue

                df_feat = pd.DataFrame(X_all, columns=feature_cols)
                dmat = xgb.DMatrix(df_feat)
                models_v25 = v25.models.get(mkey, {})
                if "xgb" not in models_v25:
                    continue
                raw = models_v25["xgb"].predict(dmat)

                _fit_and_save_calibrators(out_dir, market, raw, y_all, num_class)

                # Calibration used every complete row, not just the train split.
                mkt_metrics.update({"n_train": len(X_all), "model": "cal_only"})
            except Exception as e:
                mkt_metrics["error"] = str(e)

        metrics["markets"][market] = mkt_metrics

    # Save metrics
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    return metrics


def main():
    """Command-line entry point: train or calibrate models for every eligible league.

    Steps: load the training table, resolve the qualified leagues (falling
    back to every league in the table), look up league names in the database
    on a best-effort basis, bucket leagues by match count into the full-model
    and calibration-only tiers, process each league, print a summary table
    and write ``league_models_report.json`` into ``REPORTS_DIR``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
    parser.add_argument("--cal-min",     type=int, default=100, help="Min matches for calibration")
    # NOTE(review): --colab is accepted but not read anywhere in this function.
    parser.add_argument("--colab",       action="store_true",   help="Colab-friendly verbose output")
    args = parser.parse_args()

    start_total = time.time()

    df = load_data()
    feature_cols = get_feature_cols(df)
    print(f"Feature columns: {len(feature_cols)}")

    qualified = load_qualified_leagues()
    if not qualified:
        qualified = df["league_id"].unique().tolist()
    print(f"Qualified leagues: {len(qualified)}")

    # Get league names (best effort: without a database the IDs are shown instead)
    league_names = {}
    try:
        import psycopg2
        from data.db import get_clean_dsn
        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
            league_names = {r[0]: r[1] for r in cur.fetchall()}
        finally:
            # Close the connection even when the query fails.
            conn.close()
    except Exception as exc:
        print(f"League names unavailable, falling back to IDs: {exc}")

    # Filter to qualified leagues with enough data
    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    full_model_ids = counts[counts >= args.min_samples].index.tolist()
    cal_only_ids   = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()

    processed_ids = set(full_model_ids) | set(cal_only_ids)
    skipped = sum(1 for lid in qualified if lid not in processed_ids)

    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
    print(f"Atlandı (<{args.cal_min} maç): {skipped} lig")
    print()

    all_results = []
    total = len(full_model_ids) + len(cal_only_ids)
    done = 0

    for league_id, full_model in (
        [(lid, True) for lid in full_model_ids] +
        [(lid, False) for lid in cal_only_ids]
    ):
        t0 = time.time()
        df_league = df[df["league_id"] == league_id].copy()
        n = len(df_league)
        # Fall back to a shortened ID when the name is missing or empty;
        # str() keeps this working for non-string IDs.
        name = league_names.get(league_id) or str(league_id)[:12]
        tier = "FULL" if full_model else "CAL"

        try:
            result = process_league(league_id, df_league, feature_cols, full_model, name)
            done += 1
            elapsed = time.time() - t0

            # Build accuracy string for key markets
            acc_parts = []
            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
                m = result["markets"].get(mkt, {})
                if "accuracy" in m:
                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"

            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
            all_results.append(result)

        except Exception as e:
            # One failing league must not abort the whole run.
            done += 1
            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")

        # Progress line with a linear time estimate every 10 leagues.
        if done % 10 == 0:
            elapsed_total = time.time() - start_total
            remaining = (elapsed_total / done) * (total - done)
            print(f"  ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")

    # Final report
    total_elapsed = time.time() - start_total
    print(f"\n{'='*70}")
    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
    print(f"{'='*70}")

    # Summary table: up to 30 leagues that have an MS accuracy, best first
    printable = [(r["league_name"], r["n_matches"], r["markets"]) for r in all_results
                 if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]]
    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)

    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
    print("-" * 70)
    for name, n, mkts in printable[:30]:
        ms   = mkts.get("MS",   {}).get("accuracy", 0) * 100
        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")

    # Save master report
    report = {
        "generated_at": datetime.now().isoformat(),
        "total_leagues": len(all_results),
        "elapsed_minutes": round(total_elapsed / 60, 1),
        "results": all_results,
    }
    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor kaydedildi: {report_path}")


# Script entry point: run the pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()