@@ -0,0 +1,459 @@
|
||||
"""
|
||||
League-Specific Model Trainer
|
||||
==============================
|
||||
Trains dedicated XGBoost models + isotonic calibration for each qualified league.
|
||||
|
||||
Tiers:
|
||||
- >=500 FT matches → full XGBoost (12 markets) + calibration
|
||||
- 100-499 matches → isotonic calibration only (over general V25 predictions)
|
||||
- <100 matches → skipped
|
||||
|
||||
Usage:
|
||||
python scripts/train_league_models.py
|
||||
python scripts/train_league_models.py --min-samples 300 # looser full-model threshold (default: 500)
|
||||
python scripts/train_league_models.py --colab # Colab-friendly output
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import pickle
|
||||
import argparse
|
||||
import time
|
||||
import warnings
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
from sklearn.isotonic import IsotonicRegression
|
||||
from sklearn.metrics import accuracy_score, log_loss
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
optuna_available = False
|
||||
try:
|
||||
import optuna
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
optuna_available = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# Parent of the directory that contains this script (same path that is put on sys.path above).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# CSV read by load_data().
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# One sub-directory per league is created under this path by process_league().
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "league_specific")
# Destination of the master JSON report written at the end of main().
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "league_models")
# Optional JSON file located one level above AI_ENGINE_DIR; see load_qualified_leagues().
QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")

# NOTE: import-time side effect — both output directories are created as soon as
# this module is imported or run.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
|
||||
|
||||
# ─── Markets ────────────────────────────────────────────────────────
# One entry per market that process_league() iterates over:
#   "label"       – name of the target column in the training data
#   "num_class"   – number of classes; values > 2 select the multi-class
#                   XGBoost parameters in train_xgb_market()
#   "min_samples" – minimum number of complete (non-null) training rows
#                   required before the market is processed for a league
MARKETS = {
    "MS": {"label": "label_ms", "num_class": 3, "min_samples": 200},
    "OU15": {"label": "label_ou15", "num_class": 2, "min_samples": 150},
    "OU25": {"label": "label_ou25", "num_class": 2, "min_samples": 150},
    "OU35": {"label": "label_ou35", "num_class": 2, "min_samples": 150},
    "BTTS": {"label": "label_btts", "num_class": 2, "min_samples": 150},
    "HT": {"label": "label_ht_result", "num_class": 3, "min_samples": 150},
    "HT_OU05": {"label": "label_ht_ou05", "num_class": 2, "min_samples": 150},
    "HT_OU15": {"label": "label_ht_ou15", "num_class": 2, "min_samples": 150},
    "HTFT": {"label": "label_ht_ft", "num_class": 9, "min_samples": 300},
    "OE": {"label": "label_odd_even", "num_class": 2, "min_samples": 150},
    "CARDS": {"label": "label_cards_ou45", "num_class": 2, "min_samples": 150},
    "HANDICAP": {"label": "label_handicap_ms", "num_class": 3, "min_samples": 200},
}
|
||||
|
||||
# Feature columns (from training_data.csv, excluding metadata + labels).
# Every column NOT listed here is treated as a model feature by get_feature_cols().
# The set also lists label_ou05 and label_yellow_cards, which have no entry in
# MARKETS but must still be kept out of the feature matrix.
SKIP_COLS = {
    # identifiers / timestamp
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
    # final and half-time scores
    "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away",
    "ht_total_goals",
    # target columns
    "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
    "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
    "label_ht_ft", "label_odd_even", "label_yellow_cards", "label_cards_ou45",
    "label_handicap_ms",
}
|
||||
|
||||
# XGBoost defaults — fast, no Optuna.
# Fixed hyper-parameters used by train_xgb_market() for two-class markets.
XGB_PARAMS_BINARY = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "gamma": 0.1,
    "reg_lambda": 1.0,
    "verbosity": 0,
    "seed": 42,
    "nthread": -1,
}

# Same settings for markets with more than two classes; only the objective and
# the evaluation metric are overridden. "num_class" is not set here — it is
# added per market inside train_xgb_market().
XGB_PARAMS_MULTI = {
    **XGB_PARAMS_BINARY,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
}
|
||||
|
||||
|
||||
def load_data() -> pd.DataFrame:
    """Read the training CSV at ``DATA_PATH`` and print its dimensions.

    Returns:
        The full training table as a DataFrame.
    """
    print(f"Loading training data from {DATA_PATH} ...")
    frame = pd.read_csv(DATA_PATH, low_memory=False)
    row_count = len(frame)
    column_count = len(frame.columns)
    print(f" {row_count:,} rows, {column_count} columns")
    return frame
|
||||
|
||||
|
||||
def get_feature_cols(df: pd.DataFrame) -> list:
    """Return the column names of *df*, in order, that are not in ``SKIP_COLS``."""
    feature_names = []
    for column in df.columns:
        if column in SKIP_COLS:
            continue
        feature_names.append(column)
    return feature_names
|
||||
|
||||
|
||||
def load_qualified_leagues() -> list:
    """Load the parsed content of ``QUALIFIED_LEAGUES_PATH``.

    Returns:
        Whatever the JSON file contains, or an empty list when the file does
        not exist (main() then falls back to every league found in the CSV).
    """
    if not os.path.exists(QUALIFIED_LEAGUES_PATH):
        # fallback: all leagues in CSV
        return []

    with open(QUALIFIED_LEAGUES_PATH) as handle:
        leagues = json.load(handle)
    return leagues
|
||||
|
||||
|
||||
def train_xgb_market(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    num_class: int,
    feature_cols: list,
) -> tuple:
    """Train an XGBoost booster for one market and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Integer class labels for ``X_train``.
        X_test: Evaluation feature matrix.
        y_test: Integer class labels for ``X_test``.
        num_class: Number of classes; values above 2 switch to the
            multi-class parameter set and ``multi:softprob`` output.
        feature_cols: Feature names attached to both DMatrix objects.

    Returns:
        ``(model, accuracy, logloss)`` where ``model`` is the trained booster
        and both metrics are computed on the test split.
    """
    is_multiclass = num_class > 2
    params = dict(XGB_PARAMS_MULTI if is_multiclass else XGB_PARAMS_BINARY)
    if is_multiclass:
        params["num_class"] = num_class

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_cols)

    # NOTE(review): the test split doubles as the early-stopping validation
    # set, so the accuracy/logloss returned below are optimistic estimates.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dtest, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    raw = model.predict(dtest)

    # Pass the full label set explicitly: a small test split may not contain
    # every class (e.g. rare outcomes in a 9-class market), and without
    # `labels` log_loss raises because y_true and the probability columns
    # disagree on the number of classes.
    class_labels = list(range(num_class))
    if is_multiclass:
        probs = raw.reshape(-1, num_class)
        preds = np.argmax(probs, axis=1)
        ll = log_loss(y_test, probs, labels=class_labels)
    else:
        preds = (raw >= 0.5).astype(int)
        ll = log_loss(y_test, raw, labels=class_labels)

    acc = accuracy_score(y_test, preds)
    return model, acc, ll
|
||||
|
||||
|
||||
def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
    """Fit and return an isotonic calibrator mapping raw probabilities to outcomes.

    Inputs outside the fitted range are clipped at prediction time
    (``out_of_bounds="clip"``).
    """
    calibrator = IsotonicRegression(out_of_bounds="clip")
    # fit() returns the estimator itself, so the fitted object can be returned directly.
    return calibrator.fit(raw_probs, y_true)
|
||||
|
||||
|
||||
def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
    """Use general V25 model to get predictions on this league's matches (for cal-only leagues).

    Args:
        df_league: Rows of the training table belonging to one league.
        feature_cols: Feature column names, in model input order.
        market: Key into ``MARKETS`` (e.g. ``"MS"``).
        num_class: Number of classes of the market; values above 2 reshape the
            raw prediction into ``(rows, num_class)``.

    Returns:
        ``(probs, y)`` — the V25 XGBoost probabilities and the matching label
        values — or ``(None, None)`` when fewer than 50 complete rows exist,
        the V25 predictor has no XGBoost model for the market, or any error
        occurs (the error is reported on stderr).
    """
    try:
        from models.v25_ensemble import get_v25_predictor

        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()

        label_col = MARKETS[market]["label"]
        valid = df_league[feature_cols + [label_col]].dropna()
        if len(valid) < 50:
            return None, None

        market_key_map = {
            "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
            "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
            "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
            "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
        }
        mkey = market_key_map.get(market)
        if not mkey or not v25.has_market(mkey):
            return None, None

        # The model lookup does not depend on the batch, so do it once.
        xgb_model = v25.models.get(mkey, {}).get("xgb")
        if xgb_model is None:
            return None, None

        # dropna() above already removed every row with a missing value, so no
        # additional NaN filling is needed here.
        X = valid[feature_cols].values
        y = valid[label_col].values

        # Predict in batches of 500 rows.
        all_probs = []
        for start in range(0, len(X), 500):
            df_batch = pd.DataFrame(X[start:start + 500], columns=feature_cols)
            p = xgb_model.predict(xgb.DMatrix(df_batch))
            if num_class > 2:
                p = p.reshape(-1, num_class)
            all_probs.append(p)

        if not all_probs:
            return None, None

        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
        return probs, y
    except Exception as e:
        # Best-effort helper: keep returning the "unavailable" sentinel, but
        # make the reason visible instead of discarding it.
        print(f"[warn] V25 probabilities unavailable for market {market}: {e}", file=sys.stderr)
        return None, None
|
||||
|
||||
|
||||
# Maps MARKETS keys to the market keys used by the general V25 predictor.
_V25_MARKET_KEYS = {
    "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
    "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
    "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
    "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
}


def _fit_and_save_calibrators(raw, y_true, num_class, out_dir, market):
    """Fit isotonic calibrator(s) on raw probabilities and pickle them into out_dir.

    Binary markets produce ``cal_<market>.pkl``; multi-class markets produce one
    one-vs-rest calibrator per class as ``cal_<market>_<class>.pkl``.
    """
    stem = f"cal_{market.lower()}"
    if num_class > 2:
        raw = raw.reshape(-1, num_class)
        for cls_idx in range(num_class):
            iso = train_isotonic(raw[:, cls_idx], (y_true == cls_idx).astype(int))
            with open(os.path.join(out_dir, f"{stem}_{cls_idx}.pkl"), "wb") as f:
                pickle.dump(iso, f)
    else:
        iso = train_isotonic(raw, y_true)
        with open(os.path.join(out_dir, f"{stem}.pkl"), "wb") as f:
            pickle.dump(iso, f)


def process_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    full_model: bool,
    league_name: str,
) -> dict:
    """Train models for one league. Returns metrics dict.

    Files written to ``MODELS_DIR/<league_id>/``:
      * ``xgb_<market>.json`` and ``feature_cols.json`` (full-model tier only)
      * ``cal_<market>[_<class>].pkl`` isotonic calibrators
      * ``metrics.json``

    Args:
        league_id: League identifier; also used as the output directory name.
        df_league: Training rows of this league (must contain ``mst_utc``).
        feature_cols: Feature column names.
        full_model: ``True`` trains a dedicated XGBoost model per market and
            calibrates it; ``False`` only fits calibrators on top of the
            general V25 XGBoost predictions.
        league_name: Display name stored in the metrics.

    Returns:
        Dict with ``league_id``, ``league_name``, ``n_matches`` and a
        ``markets`` mapping holding per-market metrics (or an ``error`` string
        when a market failed).
    """
    n = len(df_league)
    # str() keeps the path join working when ids are read from the CSV as integers.
    out_dir = os.path.join(MODELS_DIR, str(league_id))
    os.makedirs(out_dir, exist_ok=True)

    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}

    # Time-based split: last 20% as test
    split_idx = int(n * 0.80)
    df_sorted = df_league.sort_values("mst_utc")
    df_train = df_sorted.iloc[:split_idx]
    df_test = df_sorted.iloc[split_idx:]

    saved_feature_cols = False

    for market, cfg in MARKETS.items():
        label_col = cfg["label"]
        num_class = cfg["num_class"]
        min_samp = cfg["min_samples"]

        if label_col not in df_league.columns:
            continue

        # Only rows where every feature and the label are present are used.
        cols = feature_cols + [label_col]
        valid_train = df_train[cols].dropna()
        valid_test = df_test[cols].dropna()

        if len(valid_train) < min_samp or len(valid_test) < 30:
            continue

        # No NaN filling needed: dropna() above already removed incomplete rows.
        X_train = valid_train[feature_cols].values
        y_train = valid_train[label_col].values.astype(int)
        X_test = valid_test[feature_cols].values
        y_test = valid_test[label_col].values.astype(int)

        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}

        if full_model:
            try:
                model, acc, ll = train_xgb_market(X_train, y_train, X_test, y_test, num_class, feature_cols)
                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
                model.save_model(model_path)
                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})

                # The feature list is identical for every market; write it once.
                if not saved_feature_cols:
                    with open(os.path.join(out_dir, "feature_cols.json"), "w") as f:
                        json.dump(feature_cols, f)
                    saved_feature_cols = True

                # Isotonic calibration from own model predictions
                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
                raw = model.predict(dtest_xgb)
                _fit_and_save_calibrators(raw, y_test, num_class, out_dir, market)

            except Exception as e:
                mkt_metrics["error"] = str(e)
        else:
            # Calibration only: use general V25 model
            try:
                # All complete rows of the league are used (train and test
                # together); the size gate above already guarantees at least
                # min_samp of them.
                all_valid = df_league[cols].dropna()
                X_all = all_valid[feature_cols].values
                y_all = all_valid[label_col].values.astype(int)

                # Use V25 general model
                from models.v25_ensemble import get_v25_predictor

                v25 = get_v25_predictor()
                if not v25._loaded:
                    v25.load_models()

                mkey = _V25_MARKET_KEYS.get(market)
                if not mkey or not v25.has_market(mkey):
                    continue

                models_v25 = v25.models.get(mkey, {})
                if "xgb" not in models_v25:
                    continue

                dmat = xgb.DMatrix(pd.DataFrame(X_all, columns=feature_cols))
                raw = models_v25["xgb"].predict(dmat)
                _fit_and_save_calibrators(raw, y_all, num_class, out_dir, market)

                # n_test keeps the split size computed above even though the
                # calibrators were fitted on all rows.
                mkt_metrics.update({"n_train": len(X_all), "model": "cal_only"})
            except Exception as e:
                mkt_metrics["error"] = str(e)

        metrics["markets"][market] = mkt_metrics

    # Save metrics
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    return metrics
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: train or calibrate models for every eligible league.

    Steps: load the training CSV, determine the qualified leagues, look up
    league names in the database (best effort), split leagues into the
    full-model and calibration-only tiers by match count, process each league,
    print a summary table and write the master JSON report to ``REPORTS_DIR``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
    parser.add_argument("--cal-min", type=int, default=100, help="Min matches for calibration")
    # NOTE: --colab is accepted for command-line compatibility but is not read
    # anywhere below; it currently has no effect on the output.
    parser.add_argument("--colab", action="store_true", help="Colab-friendly verbose output")
    args = parser.parse_args()

    start_total = time.time()

    df = load_data()
    feature_cols = get_feature_cols(df)
    print(f"Feature columns: {len(feature_cols)}")

    qualified = load_qualified_leagues()
    if not qualified:
        qualified = df["league_id"].unique().tolist()
    print(f"Qualified leagues: {len(qualified)}")

    # Get league names (best effort: training proceeds without the database)
    league_names = {}
    try:
        import psycopg2
        from data.db import get_clean_dsn

        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
            league_names = {r[0]: r[1] for r in cur.fetchall()}
        finally:
            # Release the connection even when the query fails.
            conn.close()
    except Exception as e:
        print(f"[warn] League names unavailable ({e}); using league ids instead", file=sys.stderr)

    # Filter to qualified leagues with enough data
    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    full_model_ids = counts[counts >= args.min_samples].index.tolist()
    cal_only_ids = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()

    # Set membership keeps the skipped-league count linear in the number of leagues.
    selected_ids = set(full_model_ids) | set(cal_only_ids)
    skipped = sum(1 for lid in qualified if lid not in selected_ids)

    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
    print(f"Atlandı (<{args.cal_min} maç): {skipped} lig")
    print()

    all_results = []
    total = len(full_model_ids) + len(cal_only_ids)
    done = 0

    for league_id, full_model in (
        [(lid, True) for lid in full_model_ids] +
        [(lid, False) for lid in cal_only_ids]
    ):
        t0 = time.time()
        df_league = df[df["league_id"] == league_id].copy()
        n = len(df_league)
        # Fall back to a shortened id when the name is missing or NULL; str()
        # keeps this working for non-string ids.
        name = league_names.get(league_id) or str(league_id)[:12]
        tier = "FULL" if full_model else "CAL"

        try:
            result = process_league(league_id, df_league, feature_cols, full_model, name)
            done += 1
            elapsed = time.time() - t0

            # Build accuracy string for key markets
            acc_parts = []
            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
                m = result["markets"].get(mkt, {})
                if "accuracy" in m:
                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"

            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
            all_results.append(result)

        except Exception as e:
            # One failing league must not abort the whole run.
            done += 1
            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")

        # Progress line with a linear estimate of the remaining time.
        if done % 10 == 0:
            elapsed_total = time.time() - start_total
            remaining = (elapsed_total / done) * (total - done)
            print(f" ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")

    # Final report
    total_elapsed = time.time() - start_total
    print(f"\n{'='*70}")
    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
    print(f"{'='*70}")

    # Top 30 by MS accuracy (only leagues that have an MS accuracy, i.e. full-model results)
    printable = [(r["league_name"], r["n_matches"], r["markets"]) for r in all_results
                 if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]]
    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)

    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
    print("-" * 70)
    for name, n, mkts in printable[:30]:
        ms = mkts.get("MS", {}).get("accuracy", 0) * 100
        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")

    # Save master report
    report = {
        "generated_at": datetime.now().isoformat(),
        "total_leagues": len(all_results),
        "elapsed_minutes": round(total_elapsed / 60, 1),
        "results": all_results,
    }
    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor kaydedildi: {report_path}")
|
||||
|
||||
|
||||
# Run the trainer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user