Files
iddaai-be/ai-engine/scripts/backtest_league_models.py
fahricansecer 94c7a4481a
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s
main
2026-05-17 02:17:22 +03:00

311 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
League Model Backtest — Last 100+ Matches
=========================================
For each league, the most recent matches (150 by default) are predicted with
the league-specific model, falling back to the general V25 model, and the
predictions are compared with the actual results.

NOTE(review): the original description calls these matches a test set that is
independent of the training data, but the rows are read from
data/training_data.csv — confirm they were really held out of training.

Usage:
    python scripts/backtest_league_models.py
    python scripts/backtest_league_models.py --min-matches 150
    python scripts/backtest_league_models.py --test-size 200
"""
import os, sys, json, warnings, argparse
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models.league_model import get_league_model_loader, MARKET_META, FILE_TO_SIGNAL
# Directory two levels above this file (the parent of the directory that
# contains this script).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# CSV that main() loads as the source of features and labels.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# Directory the JSON backtest report is written to.
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports")
# JSON file, one level above AI_ENGINE_DIR, holding the league ids to backtest.
QL_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")
# Ground-truth label columns (from the CSV), keyed by market signal key.
LABEL_COLS = {
    "MS": "label_ms",
    "OU15": "label_ou15",
    "OU25": "label_ou25",
    "OU35": "label_ou35",
    "BTTS": "label_btts",
    "HT": "label_ht_result",
    "HT_OU05": "label_ht_ou05",
    "HT_OU15": "label_ht_ou15",
    "HTFT": "label_ht_ft",
    "OE": "label_odd_even",
    "CARDS": "label_cards_ou45",
    "HCAP": "label_handicap_ms",
}
# Inverse of FILE_TO_SIGNAL: signal key -> model file key. If several file
# keys share one signal key, the last one iterated wins.
SIGNAL_TO_FILE = {v: k for k, v in FILE_TO_SIGNAL.items()}
# Columns that are never used as model features: identifiers, the "mst_utc"
# column used for chronological sorting, raw scores and every label column.
SKIP_COLS = {
    "match_id","home_team_id","away_team_id","league_id","mst_utc",
    "score_home","score_away","total_goals","ht_score_home","ht_score_away","ht_total_goals",
    "label_ms","label_ou05","label_ou15","label_ou25","label_ou35","label_btts",
    "label_ht_result","label_ht_ou05","label_ht_ou15","label_ht_ft",
    "label_odd_even","label_yellow_cards","label_cards_ou45","label_handicap_ms",
}
def backtest_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    league_model,
    n_test: int,
) -> dict:
    """Backtest a league-specific model on the last ``n_test`` matches.

    The rows of ``df_league`` are sorted by ``mst_utc`` and the final
    ``n_test`` rows form the test window. For every market that has both a
    label column and a model in ``league_model``, each labeled row is
    predicted individually through ``league_model.predict_market`` and the
    accuracy over the (label, prediction) pairs is computed.

    Args:
        league_id: Identifier of the league. Not used by the computation;
            kept so the signature stays compatible with existing callers.
        df_league: Matches of one league, containing ``mst_utc``, the feature
            columns and the label columns.
        feature_cols: Names of the feature columns passed to the model.
        league_model: Object exposing ``has_market(key)`` and
            ``predict_market(key, features)``; a falsy value yields ``{}``.
        n_test: Number of most recent matches to evaluate.

    Returns:
        Mapping ``signal key -> {"accuracy", "n", "source"}``. Markets with
        fewer than 30 labeled rows, no model, or no successful prediction are
        omitted.
    """
    df_test = df_league.sort_values("mst_utc").tail(n_test)
    results = {}
    for sig_key, mfile_key in SIGNAL_TO_FILE.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue
        # Only rows with a known outcome can be scored.
        labeled = df_test[df_test[label_col].notna()]
        if len(labeled) < 30:
            continue
        # Use the league-specific model only if it covers this market.
        if not (league_model and league_model.has_market(mfile_key)):
            continue
        # Labels and predictions are collected together, row by row, so a
        # missing label or a failed prediction can never shift the pairing
        # (the previous version truncated two independently filtered lists).
        y_valid = []
        preds = []
        for y, (_, row) in zip(labeled[label_col].values, labeled.iterrows()):
            feat = row[feature_cols].fillna(0).to_dict()
            probs = league_model.predict_market(mfile_key, feat)
            if not probs:
                continue
            best = max(probs, key=probs.__getitem__)
            # MARKET_META[key][1] lists the class labels; its index is the
            # predicted class id compared against the label column.
            preds.append(MARKET_META[mfile_key][1].index(best))
            y_valid.append(y)
        if not preds:
            continue
        acc = accuracy_score(y_valid, preds)
        results[sig_key] = {
            "accuracy": round(float(acc), 4),
            "n": len(preds),
            "source": "league_specific",
        }
    return results
def backtest_with_general_v25(
    df_test: pd.DataFrame,
    feature_cols: list,
) -> dict:
    """Backtest the general V25 XGBoost models on ``df_test``.

    Only the MS, OU15, OU25, OU35 and BTTS markets are evaluated. The function
    is best-effort: if the V25 predictor cannot be imported or loaded it
    returns ``{}``, and a market whose prediction fails is skipped. Failures
    are reported on stderr instead of being dropped silently.

    Args:
        df_test: Rows to evaluate, containing the feature and label columns.
        feature_cols: Names of the feature columns fed to the models.

    Returns:
        Mapping ``signal key -> {"accuracy", "n", "source"}`` where ``n`` is
        the number of labeled rows that were scored.
    """
    try:
        from models.v25_ensemble import get_v25_predictor
        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()
    except Exception as e:
        print(f"[backtest] general V25 model unavailable: {e}", file=sys.stderr)
        return {}
    X = df_test[feature_cols].fillna(0)
    results = {}
    mkey_map = {
        "MS": ("ms", {"1": 0, "X": 1, "2": 2}),
        "OU15": ("ou15", {"Over": 0, "Under": 1}),
        "OU25": ("ou25", {"Over": 0, "Under": 1}),
        "OU35": ("ou35", {"Over": 0, "Under": 1}),
        "BTTS": ("btts", {"Yes": 0, "No": 1}),
    }
    # The feature matrix is the same for every market: build it lazily, once.
    dmat = None
    for sig_key, (mkey, label_to_idx) in mkey_map.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue
        # Mask of rows with a known outcome; applied to the predictions too so
        # labels and predictions stay aligned row by row.
        has_label = df_test[label_col].notna().values
        y_true = df_test[label_col].values[has_label]
        if len(y_true) < 30 or not v25.has_market(mkey):
            continue
        try:
            models_v25 = v25.models.get(mkey, {})
            if "xgb" not in models_v25:
                continue
            if dmat is None:
                dmat = xgb.DMatrix(X.values, feature_names=feature_cols)
            raw = models_v25["xgb"].predict(dmat)
            # Fall back to the size of the label mapping (3 for MS) when
            # MARKET_META has no entry under this key.
            # NOTE(review): confirm MARKET_META is keyed by these lowercase names.
            num_class = MARKET_META.get(mkey, (len(label_to_idx),))[0]
            if num_class > 2:
                preds = np.argmax(raw.reshape(-1, num_class), axis=1)
            else:
                preds = (raw >= 0.5).astype(int)
            preds = preds[has_label]
            acc = accuracy_score(y_true, preds)
            results[sig_key] = {
                "accuracy": round(acc, 4),
                "n": len(preds),
                "source": "general_v25",
            }
        except Exception as e:
            print(f"[backtest] general V25 market {sig_key} skipped: {e}", file=sys.stderr)
            continue
    return results
def _load_qualified_leagues(path: str) -> list:
    """Return the league ids stored in the JSON file at *path* ([] if absent)."""
    if not os.path.exists(path):
        return []
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _load_league_names(qualified: list) -> dict:
    """Best-effort ``league id -> name`` lookup from the database.

    Names are only used for display, so any failure (missing driver, no
    connection, query error) yields ``{}`` and callers fall back to the id.
    """
    if not qualified:
        return {}
    try:
        import psycopg2
        from data.db import get_clean_dsn
        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
            return {r[0]: r[1] for r in cur.fetchall()}
        finally:
            # Close the connection even when the query raises.
            conn.close()
    except Exception:
        return {}


def _score_league_xgb(league_model, df_test: pd.DataFrame, feature_cols: list) -> dict:
    """Score every market model of one league in a single batch per market.

    Returns ``signal key -> {"accuracy", "n", "source"}``; a market whose
    prediction raises is recorded as ``{"error": message}``.
    """
    X = df_test[feature_cols].fillna(0)
    mkt_results = {}
    # Same features for every market: build the DMatrix lazily, once.
    dmat = None
    for mfile_key in list(league_model.models.keys()):
        sig_key = FILE_TO_SIGNAL.get(mfile_key)
        if not sig_key:
            continue
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue
        # Rows with a known outcome; the same mask is applied to the
        # predictions so both sides stay aligned row by row.
        has_label = df_test[label_col].notna().values
        y_true = df_test[label_col].values[has_label]
        if len(y_true) < 30:
            continue
        try:
            if dmat is None:
                dmat = xgb.DMatrix(X.values, feature_names=feature_cols)
            raw = league_model.models[mfile_key].predict(dmat)
            nc = MARKET_META[mfile_key][0]
            if nc > 2:
                preds = np.argmax(raw.reshape(-1, nc), axis=1)
            else:
                preds = (raw >= 0.5).astype(int)
            preds = preds[has_label]
            acc = accuracy_score(y_true, preds)
            mkt_results[sig_key] = {"accuracy": round(float(acc), 4), "n": len(preds), "source": "league_xgb"}
        except Exception as e:
            mkt_results[sig_key] = {"error": str(e)}
    return mkt_results


def _print_summary(all_results: list, markets_order: list, width: int) -> None:
    """Print per-market average accuracy and the best / worst MS leagues."""
    print("\n" + "=" * width)
    print("ORTALAMA DOĞRULUK (tüm ligler):")
    for m in markets_order:
        accs = [
            r["markets"][m]["accuracy"]
            for r in all_results
            if m in r["markets"] and "accuracy" in r["markets"][m]
        ]
        if accs:
            print(f" {m:<10}: {np.mean(accs)*100:.1f}% (min={min(accs)*100:.1f}% max={max(accs)*100:.1f}% n_leagues={len(accs)})")
    # Leagues ranked by MS accuracy, best first.
    ms_sorted = sorted(
        [
            (r["league_name"], r["markets"]["MS"]["accuracy"], r["n_tested"])
            for r in all_results
            if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]
        ],
        key=lambda x: x[1],
        reverse=True,
    )
    print("\nEN İYİ MS (Top 10):")
    for name, acc, n in ms_sorted[:10]:
        print(f" {name:<35} {acc*100:.1f}% ({n} maç)")
    print("\nEN KÖTÜ MS (Bottom 10):")
    for name, acc, n in ms_sorted[-10:]:
        print(f" {name:<35} {acc*100:.1f}% ({n} maç)")


def main():
    """Run the backtest for every qualified league and write a JSON report.

    Loads the CSV at DATA_PATH, keeps the qualified leagues with at least
    ``max(--min-matches, --test-size)`` rows, scores the most recent rows of
    each league with its league-specific XGBoost models (filling markets those
    models do not cover from the general V25 model), prints a table and a
    summary, and saves the results under REPORTS_DIR.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-matches", type=int, default=100)
    parser.add_argument("--test-size", type=int, default=150,
                        help="Son kaç maçı test için kullan (min 100)")
    args = parser.parse_args()
    n_test = max(args.min_matches, args.test_size)

    print("Loading training data ...")
    df = pd.read_csv(DATA_PATH, low_memory=False)
    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
    print(f" {len(df):,} maç | {len(feature_cols)} feature")

    qualified = _load_qualified_leagues(QL_PATH)
    loader = get_league_model_loader()
    league_names = _load_league_names(qualified)

    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    leagues_to_test = counts[counts >= n_test].index.tolist()
    print(f"\nBacktest: {len(leagues_to_test)} lig (>={n_test} maç) | son {n_test} maç kullanılacak\n")

    all_results = []
    markets_order = ["MS", "OU15", "OU25", "OU35", "BTTS", "HT", "HT_OU05", "HT_OU15", "HTFT", "OE", "CARDS", "HCAP"]
    header = f"{'Liga':<35} {'Maç':>5} | " + " | ".join(f"{m:>7}" for m in markets_order)
    print(header)
    print("-" * len(header))

    for league_id in leagues_to_test:
        df_league = df[df["league_id"] == league_id]
        # str() keeps the fallback valid for non-string ids, and `or` also
        # covers a name that is present but empty / NULL in the database.
        name = league_names.get(league_id) or str(league_id)[:20]
        # The test window is the same for both model sources: compute it once.
        df_test = df_league.sort_values("mst_utc").tail(n_test)

        league_model = loader.get(league_id)
        if league_model and league_model.models:
            # Batch predict from CSV features (fast)
            mkt_results = _score_league_xgb(league_model, df_test, feature_cols)
            # Fill missing markets with general V25
            gen_results = backtest_with_general_v25(df_test, feature_cols)
            for k, v in gen_results.items():
                if k not in mkt_results:
                    mkt_results[k] = {**v, "source": "general_v25_fallback"}
        else:
            # No league model — use general V25
            mkt_results = backtest_with_general_v25(df_test, feature_cols)
            for k in mkt_results:
                mkt_results[k]["source"] = "general_v25"

        n_used = len(df_test)
        # Print row
        accs = []
        for m in markets_order:
            r = mkt_results.get(m, {})
            if "accuracy" in r:
                accs.append(f"{r['accuracy']*100:>6.1f}%")
            else:
                accs.append(f"{'':>7}")
        print(f"{name:<35} {n_used:>5} | " + " | ".join(accs))
        all_results.append({
            # numpy scalars are not JSON serializable; unwrap them.
            "league_id": league_id.item() if isinstance(league_id, np.generic) else league_id,
            "league_name": name,
            "n_tested": n_used,
            "markets": mkt_results,
        })

    _print_summary(all_results, markets_order, len(header))

    # Save
    report = {"generated_at": pd.Timestamp.now().isoformat(), "n_test_per_league": n_test, "results": all_results}
    # Make sure the target directory exists so the run is not lost at the end.
    os.makedirs(REPORTS_DIR, exist_ok=True)
    out_path = os.path.join(REPORTS_DIR, "backtest_league_results.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor: {out_path}")
# Run the backtest only when this file is executed as a script.
if __name__ == "__main__":
    main()