Files
iddaai-be/ai-engine/scripts/train_league_models.py
T
fahricansecer 94c7a4481a
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s
main
2026-05-17 02:17:22 +03:00

460 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
League-Specific Model Trainer
==============================
Trains dedicated XGBoost models + isotonic calibration for each qualified league.
Tiers:
- >=500 FT matches → full XGBoost (12 markets) + calibration
- 100-499 matches → isotonic calibration only (over general V25 predictions)
- <100 matches → skipped
Usage:
python scripts/train_league_models.py
python scripts/train_league_models.py --min-samples 300 # stricter threshold
python scripts/train_league_models.py --colab # Colab-friendly output
"""
import os
import sys
import json
import pickle
import argparse
import time
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import accuracy_score, log_loss
# Suppress every warning category for the whole run of this script.
warnings.filterwarnings("ignore")

# Optuna is an optional dependency: record whether it imported cleanly and,
# when it did, restrict its logging to WARNING and above.
try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
except ImportError:
    optuna_available = False
else:
    optuna_available = True
# Directory two levels above this file (the parent of the folder holding the script).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put that directory first on the import path.
# NOTE(review): presumably so the lazily imported `models.*` / `data.*`
# project modules used further down resolve — confirm.
sys.path.insert(0, AI_ENGINE_DIR)

# Input data, output locations, and the optional qualified-league list
# (the latter lives one directory above AI_ENGINE_DIR).
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "league_specific")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "league_models")
QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")

# Output directories are created at import time so later writes cannot fail on a missing folder.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
# ─── Markets ────────────────────────────────────────────────────────
# One entry per market code. Fields:
#   label       – name of the target column for the market
#   num_class   – number of classes (2 → binary objective, >2 → multiclass objective)
#   min_samples – minimum number of usable rows required before the market is processed
MARKETS = {
    "MS": dict(label="label_ms", num_class=3, min_samples=200),
    "OU15": dict(label="label_ou15", num_class=2, min_samples=150),
    "OU25": dict(label="label_ou25", num_class=2, min_samples=150),
    "OU35": dict(label="label_ou35", num_class=2, min_samples=150),
    "BTTS": dict(label="label_btts", num_class=2, min_samples=150),
    "HT": dict(label="label_ht_result", num_class=3, min_samples=150),
    "HT_OU05": dict(label="label_ht_ou05", num_class=2, min_samples=150),
    "HT_OU15": dict(label="label_ht_ou15", num_class=2, min_samples=150),
    "HTFT": dict(label="label_ht_ft", num_class=9, min_samples=300),
    "OE": dict(label="label_odd_even", num_class=2, min_samples=150),
    "CARDS": dict(label="label_cards_ou45", num_class=2, min_samples=150),
    "HANDICAP": dict(label="label_handicap_ms", num_class=3, min_samples=200),
}
# Columns of training_data.csv that are never used as features
# (every other column is treated as a feature).
SKIP_COLS = (
    # identifiers and the time column used for the chronological split
    {"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc"}
    # raw scores and goal totals
    | {
        "score_home", "score_away", "total_goals",
        "ht_score_home", "ht_score_away", "ht_total_goals",
    }
    # label columns
    | {
        "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
        "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
        "label_ht_ft", "label_odd_even", "label_yellow_cards",
        "label_cards_ou45", "label_handicap_ms",
    }
)
# XGBoost defaults — fast, no Optuna
XGB_PARAMS_BINARY = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=4,
    eta=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.1,
    reg_lambda=1.0,
    verbosity=0,
    seed=42,
    nthread=-1,
)

# Multiclass variant: identical settings, only the objective and metric differ.
# (`num_class` is added per market at training time.)
XGB_PARAMS_MULTI = dict(
    XGB_PARAMS_BINARY,
    objective="multi:softprob",
    eval_metric="mlogloss",
)
def load_data() -> pd.DataFrame:
    """Read the CSV at ``DATA_PATH`` into a DataFrame, printing its row and column counts."""
    print(f"Loading training data from {DATA_PATH} ...")
    frame = pd.read_csv(DATA_PATH, low_memory=False)
    n_rows = len(frame)
    n_cols = len(frame.columns)
    print(f" {n_rows:,} rows, {n_cols} columns")
    return frame
def get_feature_cols(df: pd.DataFrame) -> list:
    """Return the names of ``df``'s columns, in their original order, that are not in ``SKIP_COLS``."""
    is_feature = ~df.columns.isin(SKIP_COLS)
    return df.columns[is_feature].tolist()
def load_qualified_leagues() -> list:
    """Load the qualified-league list from ``QUALIFIED_LEAGUES_PATH``.

    Returns:
        The parsed JSON content of the file, or an empty list when the file
        does not exist.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
        with open(QUALIFIED_LEAGUES_PATH, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # No list on disk: signal "no restriction" with an empty list.
        return []
def train_xgb_market(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    num_class: int,
    feature_cols: list,
) -> tuple:
    """Train an XGBoost booster for one market and score it on the held-out set.

    Args:
        X_train, y_train: training features and integer class labels.
        X_test, y_test: held-out features and labels; used both as the
            early-stopping evaluation set and for the returned metrics.
        num_class: number of classes; values above 2 select the multiclass
            parameters, otherwise the binary ones.
        feature_cols: feature names attached to both DMatrix objects.

    Returns:
        ``(model, accuracy, logloss)``.
    """
    is_multiclass = num_class > 2
    # Copy the shared defaults so adding ``num_class`` never mutates the module constant.
    params = dict(XGB_PARAMS_MULTI if is_multiclass else XGB_PARAMS_BINARY)
    if is_multiclass:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_cols)

    # NOTE(review): the same held-out set drives early stopping and produces the
    # reported metrics, so the metrics are not from data unseen by model selection.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dtest, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    raw = model.predict(dtest)
    if is_multiclass:
        probs = raw.reshape(-1, num_class)
        preds = np.argmax(probs, axis=1)
        # Pass the full label set explicitly: a small held-out split may not
        # contain every class, and log_loss would otherwise raise on the
        # mismatch between observed labels and probability columns.
        ll = log_loss(y_test, probs, labels=list(range(num_class)))
    else:
        preds = (raw >= 0.5).astype(int)
        # Same reasoning for a held-out split that contains a single outcome.
        ll = log_loss(y_test, raw, labels=[0, 1])
    acc = accuracy_score(y_test, preds)
    return model, acc, ll
def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
    """Fit and return an isotonic regression of ``y_true`` on ``raw_probs``.

    The regressor is created with ``out_of_bounds="clip"``.
    """
    calibrator = IsotonicRegression(out_of_bounds="clip")
    return calibrator.fit(raw_probs, y_true)
def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
    """Use general V25 model to get predictions on this league's matches (for cal-only leagues).

    Args:
        df_league: rows of one league; must contain ``feature_cols`` and the
            market's label column.
        feature_cols: feature column names, in model input order.
        market: market code (a key of ``MARKETS``).
        num_class: number of classes; above 2 the raw output is reshaped to
            ``(-1, num_class)``.

    Returns:
        ``(probs, y)`` — predictions of the general model's ``"xgb"`` booster
        and the matching labels — or ``(None, None)`` when fewer than 50
        complete rows exist, the market is unknown to the general model, no
        ``"xgb"`` booster is registered for it, or any step raises.
    """
    try:
        from models.v25_ensemble import get_v25_predictor
        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()

        label_col = MARKETS[market]["label"]
        valid = df_league[feature_cols + [label_col]].dropna()
        if len(valid) < 50:
            return None, None

        market_key_map = {
            "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
            "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
            "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
            "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
        }
        mkey = market_key_map.get(market)
        if not mkey or not v25.has_market(mkey):
            return None, None

        models = v25.models.get(mkey, {})
        if "xgb" not in models:
            return None, None
        booster = models["xgb"]

        # NOTE(review): fillna(0) is a no-op here because dropna() above already
        # removed every row with a missing feature — confirm whether
        # dropna(subset=[label_col]) was intended instead.
        X = valid[feature_cols].fillna(0).values
        y = valid[label_col].values

        # Predict in chunks of 500 rows.
        all_probs = []
        for start in range(0, len(X), 500):
            df_batch = pd.DataFrame(X[start:start + 500], columns=feature_cols)
            p = booster.predict(xgb.DMatrix(df_batch))
            if num_class > 2:
                p = p.reshape(-1, num_class)
            all_probs.append(p)

        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
        return probs, y
    except Exception as exc:
        # Best-effort helper: report why predictions are unavailable instead of
        # hiding the failure, then fall back to "no predictions".
        print(f"[warn] general V25 predictions unavailable for {market}: {exc}")
        return None, None
# Market code (key of MARKETS) → market key used by the general V25 predictor.
_V25_MARKET_KEYS = {
    "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
    "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
    "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
    "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
}


def _save_calibrators(out_dir: str, market: str, raw: np.ndarray, y_true: np.ndarray, num_class: int) -> None:
    """Fit isotonic calibrators for one market and pickle them into ``out_dir``.

    Multiclass markets get one one-vs-rest calibrator per class
    (``cal_<market>_<class>.pkl``); binary markets get a single
    ``cal_<market>.pkl``.
    """
    stem = f"cal_{market.lower()}"
    if num_class > 2:
        raw = raw.reshape(-1, num_class)
        jobs = [
            (f"{stem}_{cls_idx}.pkl", raw[:, cls_idx], (y_true == cls_idx).astype(int))
            for cls_idx in range(num_class)
        ]
    else:
        jobs = [(f"{stem}.pkl", raw, y_true)]
    for filename, scores, targets in jobs:
        iso = train_isotonic(scores, targets)
        with open(os.path.join(out_dir, filename), "wb") as f:
            pickle.dump(iso, f)


def _calibrate_with_general_model(
    df_league: pd.DataFrame,
    feature_cols: list,
    market: str,
    label_col: str,
    num_class: int,
    min_samp: int,
    out_dir: str,
):
    """Calibrate the general V25 ``"xgb"`` booster on all rows of one league.

    Returns the number of rows used, or ``None`` when the market is skipped
    (too few complete rows, or the general model has no booster for it).
    Exceptions propagate to the caller.
    """
    all_valid = df_league[feature_cols + [label_col]].dropna()
    if len(all_valid) < min_samp:
        return None
    X_all = all_valid[feature_cols].fillna(0).values
    y_all = all_valid[label_col].values.astype(int)

    from models.v25_ensemble import get_v25_predictor
    v25 = get_v25_predictor()
    if not v25._loaded:
        v25.load_models()

    mkey = _V25_MARKET_KEYS.get(market)
    if not mkey or not v25.has_market(mkey):
        return None

    dmat = xgb.DMatrix(pd.DataFrame(X_all, columns=feature_cols))
    models_v25 = v25.models.get(mkey, {})
    if "xgb" not in models_v25:
        return None

    raw = models_v25["xgb"].predict(dmat)
    _save_calibrators(out_dir, market, raw, y_all, num_class)
    return len(X_all)


def process_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    full_model: bool,
    league_name: str,
) -> dict:
    """Train models for one league. Returns metrics dict.

    Args:
        league_id: league identifier; also the name of the output sub-folder
            under ``MODELS_DIR``.
        df_league: all rows of this league.
        feature_cols: feature column names.
        full_model: ``True`` trains a league-specific XGBoost model per market
            and calibrates it on the held-out split; ``False`` only fits
            calibrators on the general V25 model's predictions.
        league_name: display name stored in the metrics.

    Returns:
        ``{"league_id", "league_name", "n_matches", "markets"}`` where
        ``markets`` maps each processed market code to its metrics. The same
        dict is written to ``metrics.json`` in the league's output folder.
    """
    n = len(df_league)
    # str() keeps the path join working when ids were parsed as numbers.
    out_dir = os.path.join(MODELS_DIR, str(league_id))
    os.makedirs(out_dir, exist_ok=True)
    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}

    # Time-based split: last 20% as test
    split_idx = int(n * 0.80)
    df_sorted = df_league.sort_values("mst_utc")
    df_train = df_sorted.iloc[:split_idx]
    df_test = df_sorted.iloc[split_idx:]

    saved_feature_cols = False
    for market, cfg in MARKETS.items():
        label_col = cfg["label"]
        num_class = cfg["num_class"]
        min_samp = cfg["min_samples"]
        if label_col not in df_league.columns:
            continue

        # NOTE(review): dropna() removes rows with ANY missing feature, which
        # makes the fillna(0) calls below no-ops — confirm whether
        # dropna(subset=[label_col]) was intended.
        valid_train = df_train[feature_cols + [label_col]].dropna()
        valid_test = df_test[feature_cols + [label_col]].dropna()
        # NOTE(review): this gate also applies to calibration-only leagues,
        # although that path fits on all rows rather than on the split.
        if len(valid_train) < min_samp or len(valid_test) < 30:
            continue

        X_train = valid_train[feature_cols].fillna(0).values
        y_train = valid_train[label_col].values.astype(int)
        X_test = valid_test[feature_cols].fillna(0).values
        y_test = valid_test[label_col].values.astype(int)
        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}

        if full_model:
            try:
                model, acc, ll = train_xgb_market(X_train, y_train, X_test, y_test, num_class, feature_cols)
                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
                model.save_model(model_path)
                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})

                # The feature list is identical for every market; write it once.
                if not saved_feature_cols:
                    with open(os.path.join(out_dir, "feature_cols.json"), "w", encoding="utf-8") as f:
                        json.dump(feature_cols, f)
                    saved_feature_cols = True

                # Isotonic calibration from own model predictions
                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
                raw = model.predict(dtest_xgb)
                _save_calibrators(out_dir, market, raw, y_test, num_class)
            except Exception as e:
                # Record the failure for this market and carry on with the next one.
                mkt_metrics["error"] = str(e)
        else:
            # Calibration only: use general V25 model
            try:
                n_used = _calibrate_with_general_model(
                    df_league, feature_cols, market, label_col, num_class, min_samp, out_dir
                )
            except Exception as e:
                mkt_metrics["error"] = str(e)
            else:
                if n_used is None:
                    # Skipped markets are left out of the metrics entirely.
                    continue
                mkt_metrics.update({"n_train": n_used, "model": "cal_only"})

        metrics["markets"][market] = mkt_metrics

    # Save metrics
    with open(os.path.join(out_dir, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    return metrics
def _fetch_league_names(league_ids: list) -> dict:
    """Best-effort lookup of ``{league id: name}`` from the ``leagues`` table.

    Returns an empty dict (after printing the reason) when the database driver,
    the project DSN helper, or the query itself is unavailable.
    """
    try:
        import psycopg2
        from data.db import get_clean_dsn
        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (league_ids,))
            return {r[0]: r[1] for r in cur.fetchall()}
        finally:
            # Close the connection even when the query fails.
            conn.close()
    except Exception as exc:
        print(f"[warn] league names unavailable ({exc}); falling back to league ids")
        return {}


def main():
    """Command-line entry point: train or calibrate every qualifying league and write a report.

    Leagues with at least ``--min-samples`` rows get full models; leagues with
    at least ``--cal-min`` rows (but fewer than ``--min-samples``) get
    calibration only; the rest are skipped. A combined JSON report is written
    to ``REPORTS_DIR``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
    parser.add_argument("--cal-min", type=int, default=100, help="Min matches for calibration")
    # NOTE(review): --colab is parsed but never read anywhere in this function.
    parser.add_argument("--colab", action="store_true", help="Colab-friendly verbose output")
    args = parser.parse_args()

    start_total = time.time()
    df = load_data()
    feature_cols = get_feature_cols(df)
    print(f"Feature columns: {len(feature_cols)}")

    qualified = load_qualified_leagues()
    if not qualified:
        # No qualified-league list: consider every league present in the data.
        qualified = df["league_id"].unique().tolist()
    print(f"Qualified leagues: {len(qualified)}")

    # Get league names
    league_names = _fetch_league_names(qualified)

    # Filter to qualified leagues with enough data
    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    full_model_ids = counts[counts >= args.min_samples].index.tolist()
    cal_only_ids = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()
    handled_ids = set(full_model_ids) | set(cal_only_ids)
    n_skipped = sum(1 for lid in qualified if lid not in handled_ids)
    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
    print(f"Atlandı (<{args.cal_min} maç): {n_skipped} lig")
    print()

    all_results = []
    total = len(full_model_ids) + len(cal_only_ids)
    done = 0
    jobs = [(lid, True) for lid in full_model_ids] + [(lid, False) for lid in cal_only_ids]
    for league_id, full_model in jobs:
        t0 = time.time()
        df_league = df[df["league_id"] == league_id].copy()
        n = len(df_league)
        # str() keeps the fallback working for non-string ids; `or` also covers
        # a NULL/empty name coming back from the database.
        name = league_names.get(league_id) or str(league_id)[:12]
        tier = "FULL" if full_model else "CAL"
        try:
            result = process_league(str(league_id), df_league, feature_cols, full_model, name)
            done += 1
            elapsed = time.time() - t0
            # Build accuracy string for key markets
            acc_parts = []
            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
                m = result["markets"].get(mkt, {})
                if "accuracy" in m:
                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"
            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
            all_results.append(result)
        except Exception as e:
            # One failing league must not abort the whole run.
            done += 1
            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")

        # Progress summary with a linear estimate of the remaining time.
        if done % 10 == 0:
            elapsed_total = time.time() - start_total
            remaining = (elapsed_total / done) * (total - done)
            print(f" ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")

    # Final report
    total_elapsed = time.time() - start_total
    print(f"\n{'='*70}")
    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
    print(f"{'='*70}")

    # Top 30 leagues by MS accuracy (only leagues that have an MS accuracy)
    printable = [(r["league_name"], r["n_matches"], r["markets"]) for r in all_results
                 if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]]
    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)
    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
    print("-" * 70)
    for name, n, mkts in printable[:30]:
        ms = mkts.get("MS", {}).get("accuracy", 0) * 100
        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")

    # Save master report
    report = {
        "generated_at": datetime.now().isoformat(),
        "total_leagues": len(all_results),
        "elapsed_minutes": round(total_elapsed / 60, 1),
        "results": all_results,
    }
    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor kaydedildi: {report_path}")


if __name__ == "__main__":
    main()