"""
League-Specific Model Trainer
==============================
Trains dedicated XGBoost models + isotonic calibration for each qualified league.

Tiers:
  - >=500 FT matches → full XGBoost (12 markets) + calibration
  - 100-499 matches  → isotonic calibration only (over general V25 predictions)
  - <100 matches     → skipped

Usage:
  python scripts/train_league_models.py
  python scripts/train_league_models.py --min-samples 300   # stricter threshold
  python scripts/train_league_models.py --colab              # Colab-friendly output
"""

import os
import sys
import json
import pickle
import argparse
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import accuracy_score, log_loss

warnings.filterwarnings("ignore")

optuna_available = False
try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    optuna_available = True
except ImportError:
    pass

# Make the parent of the scripts directory importable (the lazy imports below
# use `models.v25_ensemble` and `data.db`).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "league_specific")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "league_models")
QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# ─── Markets ────────────────────────────────────────────────────────
# Per market: label column in the CSV, number of classes, and the minimum
# number of usable training rows required before the market is processed.
MARKETS = {
    "MS": {"label": "label_ms", "num_class": 3, "min_samples": 200},
    "OU15": {"label": "label_ou15", "num_class": 2, "min_samples": 150},
    "OU25": {"label": "label_ou25", "num_class": 2, "min_samples": 150},
    "OU35": {"label": "label_ou35", "num_class": 2, "min_samples": 150},
    "BTTS": {"label": "label_btts", "num_class": 2, "min_samples": 150},
    "HT": {"label": "label_ht_result", "num_class": 3, "min_samples": 150},
    "HT_OU05": {"label": "label_ht_ou05", "num_class": 2, "min_samples": 150},
    "HT_OU15": {"label": "label_ht_ou15", "num_class": 2, "min_samples": 150},
    "HTFT": {"label": "label_ht_ft", "num_class": 9, "min_samples": 300},
    "OE": {"label": "label_odd_even", "num_class": 2, "min_samples": 150},
    "CARDS": {"label": "label_cards_ou45", "num_class": 2, "min_samples": 150},
    "HANDICAP": {"label": "label_handicap_ms", "num_class": 3, "min_samples": 200},
}

# Market name used in this script → market key used by the general V25 predictor.
V25_MARKET_KEYS = {
    "MS": "ms",
    "OU15": "ou15",
    "OU25": "ou25",
    "OU35": "ou35",
    "BTTS": "btts",
    "HT": "ht_result",
    "HT_OU05": "ht_ou05",
    "HT_OU15": "ht_ou15",
    "HTFT": "htft",
    "OE": "odd_even",
    "CARDS": "cards_ou45",
    "HANDICAP": "handicap_ms",
}

# Feature columns (from training_data.csv, excluding metadata + labels)
SKIP_COLS = {
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    "score_home", "score_away", "total_goals",
    "ht_score_home", "ht_score_away", "ht_total_goals",
    "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
    "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
    "label_ht_ft", "label_odd_even", "label_yellow_cards",
    "label_cards_ou45", "label_handicap_ms",
}

# XGBoost defaults — fast, no Optuna
XGB_PARAMS_BINARY = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "gamma": 0.1,
    "reg_lambda": 1.0,
    "verbosity": 0,
    "seed": 42,
    "nthread": -1,
}

XGB_PARAMS_MULTI = {
    **XGB_PARAMS_BINARY,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
}


def load_data() -> pd.DataFrame:
    """Read the training CSV at DATA_PATH and print its row/column counts."""
    print(f"Loading training data from {DATA_PATH} ...")
    df = pd.read_csv(DATA_PATH, low_memory=False)
    print(f" {len(df):,} rows, {len(df.columns)} columns")
    return df


def get_feature_cols(df: pd.DataFrame) -> list:
    """Return every column of *df* that is not listed in SKIP_COLS, in order."""
    return [c for c in df.columns if c not in SKIP_COLS]


def load_qualified_leagues() -> list:
    """Load the qualified-league list from QUALIFIED_LEAGUES_PATH.

    Returns an empty list when the file does not exist; main() then falls
    back to all leagues present in the CSV.
    """
    if os.path.exists(QUALIFIED_LEAGUES_PATH):
        with open(QUALIFIED_LEAGUES_PATH, encoding="utf-8") as f:
            return json.load(f)
    # fallback: all leagues in CSV
    return []


def train_xgb_market(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    num_class: int,
    feature_cols: list,
) -> tuple:
    """Train XGBoost for one market. Returns (model, accuracy, logloss).

    The test split doubles as the early-stopping validation set. Accuracy and
    log-loss are computed on that same split.
    """
    params = dict(XGB_PARAMS_MULTI if num_class > 2 else XGB_PARAMS_BINARY)
    if num_class > 2:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_cols)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dtest, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    raw = model.predict(dtest)
    # Pass the full label set explicitly: a small test split may not contain
    # every class, and log_loss raises ValueError when the classes it infers
    # from y_true do not match the width of the probability matrix.
    if num_class > 2:
        probs = raw.reshape(-1, num_class)
        preds = np.argmax(probs, axis=1)
        ll = log_loss(y_test, probs, labels=list(range(num_class)))
    else:
        preds = (raw >= 0.5).astype(int)
        ll = log_loss(y_test, raw, labels=[0, 1])

    acc = accuracy_score(y_test, preds)
    return model, acc, ll


def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
    """Fit an isotonic regression mapping raw probabilities to *y_true*.

    Inputs outside the fitted range are clipped (out_of_bounds="clip").
    """
    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(raw_probs, y_true)
    return iso


def _get_loaded_v25():
    """Return the general V25 predictor, loading its models if needed."""
    # Imported lazily so the full-model path does not require this module.
    from models.v25_ensemble import get_v25_predictor

    v25 = get_v25_predictor()
    if not v25._loaded:
        v25.load_models()
    return v25


def _save_calibrators(raw: np.ndarray, y_true: np.ndarray, num_class: int,
                      out_dir: str, market: str) -> None:
    """Fit isotonic calibrators on *raw* predictions and pickle them in *out_dir*.

    Multi-class markets get one one-vs-rest calibrator per class
    (cal_<market>_<class>.pkl); binary markets get a single cal_<market>.pkl.
    """
    tag = market.lower()
    if num_class > 2:
        probs = raw.reshape(-1, num_class)
        for cls_idx in range(num_class):
            iso = train_isotonic(probs[:, cls_idx], (y_true == cls_idx).astype(int))
            with open(os.path.join(out_dir, f"cal_{tag}_{cls_idx}.pkl"), "wb") as f:
                pickle.dump(iso, f)
    else:
        iso = train_isotonic(raw, y_true)
        with open(os.path.join(out_dir, f"cal_{tag}.pkl"), "wb") as f:
            pickle.dump(iso, f)


def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
    """Use general V25 model to get predictions on this league's matches (for cal-only leagues).

    Returns (probs, y), or (None, None) when fewer than 50 complete rows are
    available, the market is not served by V25, no "xgb" model exists for it,
    or any error occurs (the error is printed).
    """
    try:
        v25 = _get_loaded_v25()

        label_col = MARKETS[market]["label"]
        valid = df_league[feature_cols + [label_col]].dropna()
        if len(valid) < 50:
            return None, None

        mkey = V25_MARKET_KEYS.get(market)
        if not mkey or not v25.has_market(mkey):
            return None, None

        X = valid[feature_cols].fillna(0).values
        y = valid[label_col].values

        models = v25.models.get(mkey, {})
        all_probs = []
        # Predict in batches of 500 rows.
        for start in range(0, len(X), 500):
            df_batch = pd.DataFrame(X[start:start + 500], columns=feature_cols)
            dmat = xgb.DMatrix(df_batch)

            batch_probs = []
            if "xgb" in models:
                p = models["xgb"].predict(dmat)
                if num_class > 2:
                    p = p.reshape(-1, num_class)
                batch_probs.append(p)

            if batch_probs:
                all_probs.append(np.mean(batch_probs, axis=0))

        if not all_probs:
            return None, None

        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
        return probs, y
    except Exception as e:
        # Best-effort helper: report the failure instead of hiding it.
        print(f"  V25 predictions unavailable for market {market}: {e}")
        return None, None


def process_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    full_model: bool,
    league_name: str,
) -> dict:
    """Train models for one league. Returns metrics dict.

    With ``full_model`` a per-market XGBoost model is trained and calibrated
    on its own predictions; otherwise only isotonic calibrators over the
    general V25 "xgb" model are fitted. All artifacts and a metrics.json are
    written under MODELS_DIR/<league_id>. Per-market failures are recorded
    in the metrics under "error" rather than raised.
    """
    n = len(df_league)
    # str() so ids parsed as integers still form a valid directory name.
    out_dir = os.path.join(MODELS_DIR, str(league_id))
    os.makedirs(out_dir, exist_ok=True)

    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}

    # Time-based split: last 20% as test
    split_idx = int(n * 0.80)
    df_sorted = df_league.sort_values("mst_utc")
    df_train = df_sorted.iloc[:split_idx]
    df_test = df_sorted.iloc[split_idx:]

    saved_feature_cols = False

    for market, cfg in MARKETS.items():
        label_col = cfg["label"]
        num_class = cfg["num_class"]
        min_samp = cfg["min_samples"]

        if label_col not in df_league.columns:
            continue

        valid_train = df_train[feature_cols + [label_col]].dropna()
        valid_test = df_test[feature_cols + [label_col]].dropna()

        if len(valid_train) < min_samp or len(valid_test) < 30:
            continue

        X_train = valid_train[feature_cols].fillna(0).values
        y_train = valid_train[label_col].values.astype(int)
        X_test = valid_test[feature_cols].fillna(0).values
        y_test = valid_test[label_col].values.astype(int)

        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}

        if full_model:
            try:
                model, acc, ll = train_xgb_market(
                    X_train, y_train, X_test, y_test, num_class, feature_cols
                )
                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
                model.save_model(model_path)
                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})

                # The feature list is identical for every market; write it once.
                if not saved_feature_cols:
                    with open(os.path.join(out_dir, "feature_cols.json"), "w") as f:
                        json.dump(feature_cols, f)
                    saved_feature_cols = True

                # Isotonic calibration from own model predictions
                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
                raw = model.predict(dtest_xgb)
                _save_calibrators(raw, y_test, num_class, out_dir, market)
            except Exception as e:
                mkt_metrics["error"] = str(e)
        else:
            # Calibration only: use general V25 model
            try:
                all_valid = df_league[feature_cols + [label_col]].dropna()
                if len(all_valid) < min_samp:
                    continue
                X_all = all_valid[feature_cols].fillna(0).values
                y_all = all_valid[label_col].values.astype(int)

                # Use V25 general model
                v25 = _get_loaded_v25()

                mkey = V25_MARKET_KEYS.get(market)
                if not mkey or not v25.has_market(mkey):
                    continue

                df_feat = pd.DataFrame(X_all, columns=feature_cols)
                dmat = xgb.DMatrix(df_feat)
                models_v25 = v25.models.get(mkey, {})

                if "xgb" not in models_v25:
                    continue

                raw = models_v25["xgb"].predict(dmat)
                _save_calibrators(raw, y_all, num_class, out_dir, market)

                mkt_metrics.update({"n_train": len(X_all), "model": "cal_only"})
            except Exception as e:
                mkt_metrics["error"] = str(e)

        metrics["markets"][market] = mkt_metrics

    # Save metrics
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    return metrics


def main():
    """CLI entry point: train or calibrate every qualifying league and write a report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
    parser.add_argument("--cal-min", type=int, default=100, help="Min matches for calibration")
    # NOTE: --colab is parsed but not read anywhere in this function.
    parser.add_argument("--colab", action="store_true", help="Colab-friendly verbose output")
    args = parser.parse_args()

    start_total = time.time()
    df = load_data()
    feature_cols = get_feature_cols(df)
    print(f"Feature columns: {len(feature_cols)}")

    qualified = load_qualified_leagues()
    if not qualified:
        qualified = df["league_id"].unique().tolist()
    print(f"Qualified leagues: {len(qualified)}")

    # Get league names (best effort — ids are used when the DB is unavailable)
    league_names = {}
    try:
        import psycopg2
        from data.db import get_clean_dsn

        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
            league_names = {r[0]: r[1] for r in cur.fetchall()}
        finally:
            # Close the connection even if the query fails.
            conn.close()
    except Exception as e:
        print(f"League names unavailable, falling back to ids: {e}")

    # Filter to qualified leagues with enough data
    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    full_model_ids = counts[counts >= args.min_samples].index.tolist()
    cal_only_ids = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()

    processed_ids = set(full_model_ids) | set(cal_only_ids)
    skipped = sum(1 for lid in qualified if lid not in processed_ids)

    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
    print(f"Atlandı (<{args.cal_min} maç): {skipped} lig")
    print()

    all_results = []
    total = len(full_model_ids) + len(cal_only_ids)
    done = 0

    jobs = [(lid, True) for lid in full_model_ids] + [(lid, False) for lid in cal_only_ids]
    for league_id, full_model in jobs:
        t0 = time.time()
        df_league = df[df["league_id"] == league_id].copy()
        n = len(df_league)
        # Fall back to a shortened id when the name is missing or empty;
        # str() keeps this safe for non-string ids.
        name = league_names.get(league_id) or str(league_id)[:12]
        tier = "FULL" if full_model else "CAL"

        try:
            result = process_league(league_id, df_league, feature_cols, full_model, name)
            done += 1
            elapsed = time.time() - t0

            # Build accuracy string for key markets
            acc_parts = []
            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
                m = result["markets"].get(mkt, {})
                if "accuracy" in m:
                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"

            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
            all_results.append(result)
        except Exception as e:
            done += 1
            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")

        # Periodic progress line with a linear ETA estimate.
        if done % 10 == 0:
            elapsed_total = time.time() - start_total
            remaining = (elapsed_total / done) * (total - done)
            print(f" ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")

    # Final report
    total_elapsed = time.time() - start_total
    print(f"\n{'='*70}")
    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
    print(f"{'='*70}")

    # Top 30 by MS accuracy
    printable = [
        (r["league_name"], r["n_matches"], r["markets"])
        for r in all_results
        if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]
    ]
    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)

    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
    print("-" * 70)
    for name, n, mkts in printable[:30]:
        ms = mkts.get("MS", {}).get("accuracy", 0) * 100
        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")

    # Save master report
    report = {
        "generated_at": datetime.now().isoformat(),
        "total_leagues": len(all_results),
        "elapsed_minutes": round(total_elapsed / 60, 1),
        "results": all_results,
    }
    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor kaydedildi: {report_path}")


if __name__ == "__main__":
    main()