main

2026-05-17 02:17:22 +03:00
parent 17ace9bd12
commit 94c7a4481a
53 changed files with 29602 additions and 7832 deletions
@@ -0,0 +1,459 @@
+"""
+League-Specific Model Trainer
+==============================
+Trains dedicated XGBoost models + isotonic calibration for each qualified league.
+
+Tiers:
+  - >=500 FT matches  → full XGBoost (12 markets) + calibration
+  - 100-499 matches   → isotonic calibration only (over general V25 predictions)
+  - <100 matches      → skipped
+
+Usage:
+  python scripts/train_league_models.py
+  python scripts/train_league_models.py --min-samples 300   # stricter threshold
+  python scripts/train_league_models.py --colab             # Colab-friendly output
+"""
+
+import os
+import sys
+import json
+import pickle
+import argparse
+import time
+import warnings
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.isotonic import IsotonicRegression
+from sklearn.metrics import accuracy_score, log_loss
+
+warnings.filterwarnings("ignore")  # silences every warning category for the whole process
+optuna_available = False  # set to True below only when the optional optuna import succeeds
+try:
+    import optuna
+    optuna.logging.set_verbosity(optuna.logging.WARNING)  # keep optuna's own logging quiet
+    optuna_available = True
+except ImportError:
+    pass  # optuna is optional; the flag stays False and no visible code reads it
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # lets `models.*` / `data.*` be imported from the directory above this file's folder
+
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # same directory that was just put on sys.path
+DATA_PATH     = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")  # CSV read by load_data()
+MODELS_DIR    = os.path.join(AI_ENGINE_DIR, "models", "league_specific")  # one sub-directory per league is created here
+REPORTS_DIR   = os.path.join(AI_ENGINE_DIR, "reports", "league_models")  # main() writes the master report here
+QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")  # optional allow-list, one level above AI_ENGINE_DIR
+
+os.makedirs(MODELS_DIR, exist_ok=True)  # import-time side effect: output directories are created eagerly
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+# ─── Markets ────────────────────────────────────────────────────────
+MARKETS = {  # market key -> label column, number of classes, minimum row count required by process_league()
+    "MS":         {"label": "label_ms",         "num_class": 3, "min_samples": 200},
+    "OU15":       {"label": "label_ou15",        "num_class": 2, "min_samples": 150},
+    "OU25":       {"label": "label_ou25",        "num_class": 2, "min_samples": 150},
+    "OU35":       {"label": "label_ou35",        "num_class": 2, "min_samples": 150},
+    "BTTS":       {"label": "label_btts",        "num_class": 2, "min_samples": 150},
+    "HT":         {"label": "label_ht_result",   "num_class": 3, "min_samples": 150},
+    "HT_OU05":    {"label": "label_ht_ou05",     "num_class": 2, "min_samples": 150},
+    "HT_OU15":    {"label": "label_ht_ou15",     "num_class": 2, "min_samples": 150},
+    "HTFT":       {"label": "label_ht_ft",       "num_class": 9, "min_samples": 300},
+    "OE":         {"label": "label_odd_even",    "num_class": 2, "min_samples": 150},
+    "CARDS":      {"label": "label_cards_ou45",  "num_class": 2, "min_samples": 150},
+    "HANDICAP":   {"label": "label_handicap_ms", "num_class": 3, "min_samples": 200},
+}
+
+# Columns excluded from the feature set (metadata + labels); get_feature_cols() keeps every other CSV column
+SKIP_COLS = {
+    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",  # mst_utc is still used as the sort key for the time-based split
+    "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away",
+    "ht_total_goals",
+    "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
+    "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
+    "label_ht_ft", "label_odd_even", "label_yellow_cards", "label_cards_ou45",
+    "label_handicap_ms",
+}
+
+# XGBoost defaults — fixed hyper-parameters, no Optuna search; used for markets with num_class == 2
+XGB_PARAMS_BINARY = {
+    "objective": "binary:logistic",
+    "eval_metric": "logloss",
+    "max_depth": 4,
+    "eta": 0.05,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "min_child_weight": 5,
+    "gamma": 0.1,
+    "reg_lambda": 1.0,
+    "verbosity": 0,
+    "seed": 42,
+    "nthread": -1,
+}
+
+XGB_PARAMS_MULTI = {  # binary defaults with objective/metric swapped; train_xgb_market() adds "num_class"
+    **XGB_PARAMS_BINARY,
+    "objective": "multi:softprob",
+    "eval_metric": "mlogloss",
+}
+
+
+def load_data() -> pd.DataFrame:
+    print(f"Loading training data from {DATA_PATH} ...")
+    df = pd.read_csv(DATA_PATH, low_memory=False)
+    print(f"  {len(df):,} rows, {len(df.columns)} columns")
+    return df
+
+
+def get_feature_cols(df: pd.DataFrame) -> list:
+    return [c for c in df.columns if c not in SKIP_COLS]
+
+
+def load_qualified_leagues() -> list:
+    if os.path.exists(QUALIFIED_LEAGUES_PATH):
+        with open(QUALIFIED_LEAGUES_PATH) as f:
+            return json.load(f)
+    # fallback: all leagues in CSV
+    return []
+
+
+def train_xgb_market(
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_test: np.ndarray,
+    y_test: np.ndarray,
+    num_class: int,
+    feature_cols: list,
+) -> tuple:
+    """Train XGBoost for one market. Returns (model, accuracy, logloss)."""
+    params = dict(XGB_PARAMS_MULTI if num_class > 2 else XGB_PARAMS_BINARY)
+    if num_class > 2:
+        params["num_class"] = num_class
+
+    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
+    dtest  = xgb.DMatrix(X_test,  label=y_test,  feature_names=feature_cols)
+
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=300,
+        evals=[(dtest, "val")],
+        early_stopping_rounds=30,
+        verbose_eval=False,
+    )
+
+    raw = model.predict(dtest)
+    if num_class > 2:
+        probs = raw.reshape(-1, num_class)
+        preds = np.argmax(probs, axis=1)
+        ll = log_loss(y_test, probs)
+    else:
+        preds = (raw >= 0.5).astype(int)
+        ll = log_loss(y_test, raw)
+
+    acc = accuracy_score(y_test, preds)
+    return model, acc, ll
+
+
+def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
+    iso = IsotonicRegression(out_of_bounds="clip")
+    iso.fit(raw_probs, y_true)
+    return iso
+
+
+def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
+    """Use general V25 model to get predictions on this league's matches (for cal-only leagues)."""
+    try:
+        from models.v25_ensemble import get_v25_predictor
+        v25 = get_v25_predictor()
+        if not v25._loaded:
+            v25.load_models()
+
+        label_col = MARKETS[market]["label"]
+        valid = df_league[feature_cols + [label_col]].dropna()
+        if len(valid) < 50:
+            return None, None
+
+        market_key_map = {
+            "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
+            "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
+            "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
+            "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
+        }
+        mkey = market_key_map.get(market)
+        if not mkey or not v25.has_market(mkey):
+            return None, None
+
+        X = valid[feature_cols].fillna(0).values
+        y = valid[label_col].values
+
+        all_probs = []
+        for i in range(0, len(X), 500):
+            batch = X[i:i+500]
+            feat_dict = {col: float(batch[j, k]) for j, row in enumerate(batch) for k, col in enumerate(feature_cols)}
+            # batch predict
+            df_batch = pd.DataFrame(batch, columns=feature_cols)
+            dmat = xgb.DMatrix(df_batch)
+            models = v25.models.get(mkey, {})
+            batch_probs = []
+            if "xgb" in models:
+                p = models["xgb"].predict(dmat)
+                if num_class > 2:
+                    p = p.reshape(-1, num_class)
+                batch_probs.append(p)
+            if batch_probs:
+                all_probs.append(np.mean(batch_probs, axis=0))
+
+        if not all_probs:
+            return None, None
+
+        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
+        return probs, y
+    except Exception as e:
+        return None, None
+
+
+def process_league(
+    league_id: str,
+    df_league: pd.DataFrame,
+    feature_cols: list,
+    full_model: bool,
+    league_name: str,
+) -> dict:
+    """Train models for one league. Returns metrics dict."""
+    n = len(df_league)
+    out_dir = os.path.join(MODELS_DIR, league_id)
+    os.makedirs(out_dir, exist_ok=True)
+
+    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}
+
+    # Time-based split: last 20% as test
+    split_idx = int(n * 0.80)
+    df_sorted = df_league.sort_values("mst_utc")
+    df_train = df_sorted.iloc[:split_idx]
+    df_test  = df_sorted.iloc[split_idx:]
+
+    saved_feature_cols = False
+
+    for market, cfg in MARKETS.items():
+        label_col  = cfg["label"]
+        num_class  = cfg["num_class"]
+        min_samp   = cfg["min_samples"]
+
+        if label_col not in df_league.columns:
+            continue
+
+        valid_train = df_train[feature_cols + [label_col]].dropna()
+        valid_test  = df_test[feature_cols + [label_col]].dropna()
+
+        if len(valid_train) < min_samp or len(valid_test) < 30:
+            continue
+
+        X_train = valid_train[feature_cols].fillna(0).values
+        y_train = valid_train[label_col].values.astype(int)
+        X_test  = valid_test[feature_cols].fillna(0).values
+        y_test  = valid_test[label_col].values.astype(int)
+
+        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}
+
+        if full_model:
+            try:
+                model, acc, ll = train_xgb_market(X_train, y_train, X_test, y_test, num_class, feature_cols)
+                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
+                model.save_model(model_path)
+                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})
+
+                if not saved_feature_cols:
+                    with open(os.path.join(out_dir, "feature_cols.json"), "w") as f:
+                        json.dump(feature_cols, f)
+                    saved_feature_cols = True
+
+                # Isotonic calibration from own model predictions
+                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
+                raw = model.predict(dtest_xgb)
+                if num_class > 2:
+                    raw = raw.reshape(-1, num_class)
+                    for cls_idx in range(num_class):
+                        iso = train_isotonic(raw[:, cls_idx], (y_test == cls_idx).astype(int))
+                        with open(os.path.join(out_dir, f"cal_{market.lower()}_{cls_idx}.pkl"), "wb") as f:
+                            pickle.dump(iso, f)
+                else:
+                    iso = train_isotonic(raw, y_test)
+                    with open(os.path.join(out_dir, f"cal_{market.lower()}.pkl"), "wb") as f:
+                        pickle.dump(iso, f)
+
+            except Exception as e:
+                mkt_metrics["error"] = str(e)
+        else:
+            # Calibration only: use general V25 model
+            try:
+                all_valid = df_league[feature_cols + [label_col]].dropna()
+                if len(all_valid) < min_samp:
+                    continue
+
+                X_all = all_valid[feature_cols].fillna(0).values
+                y_all = all_valid[label_col].values.astype(int)
+
+                # Use V25 general model
+                from models.v25_ensemble import get_v25_predictor
+                v25 = get_v25_predictor()
+                if not v25._loaded:
+                    v25.load_models()
+
+                market_key_map = {
+                    "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
+                    "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
+                    "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
+                    "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
+                }
+                mkey = market_key_map.get(market)
+                if not mkey or not v25.has_market(mkey):
+                    continue
+
+                df_feat = pd.DataFrame(X_all, columns=feature_cols)
+                dmat = xgb.DMatrix(df_feat)
+                models_v25 = v25.models.get(mkey, {})
+                if "xgb" not in models_v25:
+                    continue
+                raw = models_v25["xgb"].predict(dmat)
+
+                if num_class > 2:
+                    raw = raw.reshape(-1, num_class)
+                    for cls_idx in range(num_class):
+                        iso = train_isotonic(raw[:, cls_idx], (y_all == cls_idx).astype(int))
+                        with open(os.path.join(out_dir, f"cal_{market.lower()}_{cls_idx}.pkl"), "wb") as f:
+                            pickle.dump(iso, f)
+                else:
+                    iso = train_isotonic(raw, y_all)
+                    with open(os.path.join(out_dir, f"cal_{market.lower()}.pkl"), "wb") as f:
+                        pickle.dump(iso, f)
+
+                mkt_metrics.update({"n_train": len(X_all), "model": "cal_only"})
+            except Exception as e:
+                mkt_metrics["error"] = str(e)
+
+        metrics["markets"][market] = mkt_metrics
+
+    # Save metrics
+    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
+        json.dump(metrics, f, indent=2)
+
+    return metrics
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
+    parser.add_argument("--cal-min",     type=int, default=100, help="Min matches for calibration")
+    parser.add_argument("--colab",       action="store_true",   help="Colab-friendly verbose output")
+    args = parser.parse_args()
+
+    start_total = time.time()
+
+    df = load_data()
+    feature_cols = get_feature_cols(df)
+    print(f"Feature columns: {len(feature_cols)}")
+
+    qualified = load_qualified_leagues()
+    if not qualified:
+        qualified = df["league_id"].unique().tolist()
+    print(f"Qualified leagues: {len(qualified)}")
+
+    # Get league names
+    league_names = {}
+    try:
+        import psycopg2
+        from data.db import get_clean_dsn
+        conn = psycopg2.connect(get_clean_dsn())
+        cur = conn.cursor()
+        cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
+        league_names = {r[0]: r[1] for r in cur.fetchall()}
+        conn.close()
+    except Exception:
+        pass
+
+    # Filter to qualified leagues with enough data
+    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
+    full_model_ids = counts[counts >= args.min_samples].index.tolist()
+    cal_only_ids   = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()
+
+    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
+    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
+    print(f"Atlandı (<{args.cal_min} maç): {len([l for l in qualified if l not in full_model_ids and l not in cal_only_ids])} lig")
+    print()
+
+    all_results = []
+    total = len(full_model_ids) + len(cal_only_ids)
+    done = 0
+
+    for league_id, full_model in (
+        [(lid, True) for lid in full_model_ids] +
+        [(lid, False) for lid in cal_only_ids]
+    ):
+        t0 = time.time()
+        df_league = df[df["league_id"] == league_id].copy()
+        n = len(df_league)
+        name = league_names.get(league_id, league_id[:12])
+        tier = "FULL" if full_model else "CAL"
+
+        try:
+            result = process_league(league_id, df_league, feature_cols, full_model, name)
+            done += 1
+            elapsed = time.time() - t0
+
+            # Build accuracy string for key markets
+            acc_parts = []
+            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
+                m = result["markets"].get(mkt, {})
+                if "accuracy" in m:
+                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
+            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"
+
+            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
+            all_results.append(result)
+
+        except Exception as e:
+            done += 1
+            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")
+
+        if done % 10 == 0:
+            elapsed_total = time.time() - start_total
+            remaining = (elapsed_total / done) * (total - done)
+            print(f"  ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")
+
+    # Final report
+    total_elapsed = time.time() - start_total
+    print(f"\n{'='*70}")
+    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
+    print(f"{'='*70}")
+
+    # Top 20 by accuracy
+    printable = [(r["league_name"], r["n_matches"], r["markets"]) for r in all_results
+                 if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]]
+    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)
+
+    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
+    print("-" * 70)
+    for name, n, mkts in printable[:30]:
+        ms   = mkts.get("MS",   {}).get("accuracy", 0) * 100
+        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
+        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
+        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
+        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")
+
+    # Save master report
+    report = {
+        "generated_at": datetime.now().isoformat(),
+        "total_leagues": len(all_results),
+        "elapsed_minutes": round(total_elapsed / 60, 1),
+        "results": all_results,
+    }
+    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
+    with open(report_path, "w") as f:
+        json.dump(report, f, indent=2)
+    print(f"\nRapor kaydedildi: {report_path}")
+
+
+if __name__ == "__main__":
+    main()