311 lines
11 KiB
Python
311 lines
11 KiB
Python
"""
League Model Backtest — Last 100+ Matches
=========================================
For each league, predicts the most recent 100-200 matches (a test set
independent of the training data) with the league-specific model and
compares the predictions with the actual results.

Usage:
    python scripts/backtest_league_models.py
    python scripts/backtest_league_models.py --min-matches 150
"""
|
||
|
||
import os, sys, json, warnings, argparse
|
||
import numpy as np
|
||
import pandas as pd
|
||
import xgboost as xgb
|
||
from sklearn.metrics import accuracy_score
|
||
|
||
warnings.filterwarnings("ignore")
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
from models.league_model import get_league_model_loader, MARKET_META, FILE_TO_SIGNAL
|
||
|
||
# Directory one level above the folder that contains this script.
AI_ENGINE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)
# Qualified-league list lives next to (one level above) the engine directory.
QL_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")
# Output folder for the JSON backtest report.
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports")
# CSV holding both the feature columns and the label columns.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
|
||
|
||
# Ground-truth label column (from the CSV) for each market signal key.
LABEL_COLS = dict(
    [
        # Full-time markets
        ("MS", "label_ms"),
        ("OU15", "label_ou15"),
        ("OU25", "label_ou25"),
        ("OU35", "label_ou35"),
        ("BTTS", "label_btts"),
        # Half-time markets
        ("HT", "label_ht_result"),
        ("HT_OU05", "label_ht_ou05"),
        ("HT_OU15", "label_ht_ou15"),
        ("HTFT", "label_ht_ft"),
        # Remaining markets
        ("OE", "label_odd_even"),
        ("CARDS", "label_cards_ou45"),
        ("HCAP", "label_handicap_ms"),
    ]
)
|
||
|
||
# Inverse of FILE_TO_SIGNAL: signal key -> model file key.
SIGNAL_TO_FILE = dict(
    (signal_key, file_key) for file_key, signal_key in FILE_TO_SIGNAL.items()
)
|
||
|
||
# Columns of the CSV that are never used as model features.
SKIP_COLS = set(
    (
        # Identifiers and kickoff timestamp
        "match_id",
        "home_team_id",
        "away_team_id",
        "league_id",
        "mst_utc",
        # Raw scores
        "score_home",
        "score_away",
        "total_goals",
        "ht_score_home",
        "ht_score_away",
        "ht_total_goals",
        # Label columns
        "label_ms",
        "label_ou05",
        "label_ou15",
        "label_ou25",
        "label_ou35",
        "label_btts",
        "label_ht_result",
        "label_ht_ou05",
        "label_ht_ou15",
        "label_ht_ft",
        "label_odd_even",
        "label_yellow_cards",
        "label_cards_ou45",
        "label_handicap_ms",
    )
)
|
||
|
||
|
||
def backtest_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    league_model,
    n_test: int,
) -> dict:
    """Backtest the last ``n_test`` matches of one league, market by market.

    The league frame is ordered by ``mst_utc`` and its last ``n_test`` rows
    form the test slice. For every signal key in ``SIGNAL_TO_FILE`` that has a
    label column and is served by ``league_model``, each labelled row is
    predicted with ``league_model.predict_market`` and the highest-probability
    label is converted to its index in ``MARKET_META[file_key][1]``.

    Args:
        league_id: Identifier of the league (kept for interface
            compatibility; not used by the computation).
        df_league: Matches of a single league, including ``mst_utc``,
            the feature columns and the label columns.
        feature_cols: Names of the feature columns fed to the model.
        league_model: League-specific model exposing ``has_market`` and
            ``predict_market``; a falsy value yields an empty result.
        n_test: Number of most recent matches to evaluate.

    Returns:
        ``{signal_key: {"accuracy", "n", "source"}}`` for every market that
        could be scored. Markets with fewer than 30 labelled rows, without a
        league model, or without any prediction are omitted.
    """
    df_test = df_league.sort_values("mst_utc").tail(n_test)
    results = {}

    for sig_key, mfile_key in SIGNAL_TO_FILE.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue

        # Only matches with a known outcome can be scored.
        df_labeled = df_test[df_test[label_col].notna()]
        if len(df_labeled) < 30:
            continue

        # This function evaluates league-specific models only.
        if not (league_model and league_model.has_market(mfile_key)):
            continue

        # Collect truth and prediction together, row by row, so the two
        # sequences stay aligned even when a row has no label (filtered out
        # above) or the model returns no probabilities for it (skipped below).
        y_valid = []
        preds = []
        for (_, row), truth in zip(df_labeled.iterrows(), df_labeled[label_col].values):
            feat = row[feature_cols].fillna(0).to_dict()
            probs = league_model.predict_market(mfile_key, feat)
            if not probs:
                continue
            best = max(probs, key=probs.__getitem__)
            class_labels = MARKET_META[mfile_key][1]
            preds.append(class_labels.index(best))
            y_valid.append(truth)

        if not preds:
            continue

        acc = accuracy_score(y_valid, preds)
        results[sig_key] = {
            "accuracy": round(acc, 4),
            "n": len(preds),
            "source": "league_specific",
        }

    return results
|
||
|
||
|
||
def backtest_with_general_v25(
    df_test: pd.DataFrame,
    feature_cols: list,
) -> dict:
    """Backtest the general V25 XGBoost models on ``df_test``.

    Covers the MS, OU15, OU25, OU35 and BTTS markets. For each market only the
    rows whose label is present are predicted and scored, so features and
    labels always have the same length.

    Args:
        df_test: Test matches containing the feature and label columns.
        feature_cols: Names of the feature columns fed to the booster.

    Returns:
        ``{signal_key: {"accuracy", "n", "source"}}`` for every market that
        could be scored. An empty dict is returned when the V25 predictor
        cannot be imported or loaded; a market is omitted when it has fewer
        than 30 labelled rows, no ``"xgb"`` model, or prediction fails.
        Failures are reported on stderr instead of being dropped silently.
    """
    try:
        from models.v25_ensemble import get_v25_predictor

        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()
    except Exception as exc:
        # Best-effort: without the general model there is nothing to score.
        print(f"[backtest] general V25 model unavailable: {exc}", file=sys.stderr)
        return {}

    feature_matrix = df_test[feature_cols].fillna(0).values
    results = {}

    # Signal key -> V25 market key.
    # NOTE(review): the previous table also carried unused label->index maps
    # (e.g. Over=0/Under=1, Yes=0/No=1); confirm that the ``>= 0.5`` threshold
    # below matches the encoding of the label columns.
    mkey_map = {
        "MS": "ms",
        "OU15": "ou15",
        "OU25": "ou25",
        "OU35": "ou35",
        "BTTS": "btts",
    }

    for sig_key, mkey in mkey_map.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue

        # Boolean mask of rows with a known outcome; applied to both the
        # labels and the features so they stay aligned.
        labeled = df_test[label_col].notna().to_numpy()
        y_true = df_test[label_col].to_numpy()[labeled]
        if len(y_true) < 30 or not v25.has_market(mkey):
            continue

        try:
            models_v25 = v25.models.get(mkey, {})
            if "xgb" not in models_v25:
                continue

            dmat = xgb.DMatrix(feature_matrix[labeled], feature_names=feature_cols)
            raw = models_v25["xgb"].predict(dmat)
            # First MARKET_META entry is the class count; default to binary.
            num_class = MARKET_META.get(mkey, (2,))[0]

            if num_class > 2:
                preds = np.argmax(raw.reshape(-1, num_class), axis=1)
            else:
                preds = (raw >= 0.5).astype(int)

            acc = accuracy_score(y_true, preds)
            results[sig_key] = {
                "accuracy": round(float(acc), 4),
                "n": len(preds),
                "source": "general_v25",
            }
        except Exception as exc:
            # Keep the run going, but make the skipped market visible.
            print(f"[backtest] general V25 failed for {sig_key}: {exc}", file=sys.stderr)
            continue

    return results
|
||
|
||
|
||
def _load_qualified_leagues(path):
    """Return the JSON content of *path*, or ``[]`` when the file is absent."""
    if not os.path.exists(path):
        return []
    # Context manager so the handle is closed even if parsing fails.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _fetch_league_names(league_ids):
    """Best-effort lookup of ``{league_id: name}`` from the database."""
    try:
        import psycopg2
        from data.db import get_clean_dsn

        conn = psycopg2.connect(get_clean_dsn())
        try:
            cur = conn.cursor()
            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (league_ids,))
            return {row[0]: row[1] for row in cur.fetchall()}
        finally:
            # Release the connection even when the query raises.
            conn.close()
    except Exception as exc:
        # Names are cosmetic; fall back to ids when the DB is unreachable.
        print(f"[backtest] league names unavailable: {exc}", file=sys.stderr)
        return {}


def _score_league_models(league_model, df_test, feature_cols):
    """Score every league-specific booster of *league_model* on *df_test*."""
    feature_matrix = df_test[feature_cols].fillna(0).values
    dmat = None
    mkt_results = {}

    for mfile_key in list(league_model.models.keys()):
        sig_key = FILE_TO_SIGNAL.get(mfile_key)
        if not sig_key:
            continue
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue

        # Rows with a known outcome; used to keep labels and predictions aligned.
        labeled = df_test[label_col].notna().to_numpy()
        y_true = df_test[label_col].to_numpy()[labeled]
        if len(y_true) < 30:
            continue

        try:
            if dmat is None:
                # The feature matrix is identical for every market: build it once.
                dmat = xgb.DMatrix(feature_matrix, feature_names=feature_cols)
            raw = league_model.models[mfile_key].predict(dmat)
            nc = MARKET_META[mfile_key][0]
            if nc > 2:
                preds = np.argmax(raw.reshape(-1, nc), axis=1)
            else:
                preds = (raw >= 0.5).astype(int)

            # Drop predictions of unlabeled rows instead of truncating, which
            # would compare predictions with the labels of other matches.
            preds = preds[labeled]
            acc = accuracy_score(y_true, preds)
            mkt_results[sig_key] = {
                "accuracy": round(float(acc), 4),
                "n": len(preds),
                "source": "league_xgb",
            }
        except Exception as e:
            mkt_results[sig_key] = {"error": str(e)}

    return mkt_results


def _print_summary(all_results, markets_order, rule_width):
    """Print per-market mean accuracy and the best/worst leagues for MS."""
    print("\n" + "=" * rule_width)
    print("ORTALAMA DOĞRULUK (tüm ligler):")
    for m in markets_order:
        accs = [
            r["markets"][m]["accuracy"]
            for r in all_results
            if m in r["markets"] and "accuracy" in r["markets"][m]
        ]
        if accs:
            print(f" {m:<10}: {np.mean(accs)*100:.1f}% (min={min(accs)*100:.1f}% max={max(accs)*100:.1f}% n_leagues={len(accs)})")

    ms_sorted = sorted(
        [
            (r["league_name"], r["markets"]["MS"]["accuracy"], r["n_tested"])
            for r in all_results
            if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]
        ],
        key=lambda item: item[1],
        reverse=True,
    )
    print("\nEN İYİ MS (Top 10):")
    for name, acc, n in ms_sorted[:10]:
        print(f" {name:<35} {acc*100:.1f}% ({n} maç)")
    print("\nEN KÖTÜ MS (Bottom 10):")
    for name, acc, n in ms_sorted[-10:]:
        print(f" {name:<35} {acc*100:.1f}% ({n} maç)")


def main():
    """Run the per-league backtest and write the JSON report.

    Reads ``DATA_PATH``, keeps the leagues listed in ``QL_PATH`` that have at
    least ``n_test`` matches (``n_test`` is the larger of ``--min-matches``
    and ``--test-size``), and evaluates the last ``n_test`` matches of each
    league. League-specific XGBoost models are used where available and the
    general V25 models fill the markets they do not cover. A table and a
    summary are printed, and the results are written to
    ``REPORTS_DIR/backtest_league_results.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-matches", type=int, default=100)
    parser.add_argument("--test-size", type=int, default=150,
                        help="Son kaç maçı test için kullan (min 100)")
    args = parser.parse_args()
    n_test = max(args.min_matches, args.test_size)

    print("Loading training data ...")
    df = pd.read_csv(DATA_PATH, low_memory=False)
    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
    print(f" {len(df):,} maç | {len(feature_cols)} feature")

    qualified = _load_qualified_leagues(QL_PATH)
    loader = get_league_model_loader()
    league_names = _fetch_league_names(qualified)

    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    leagues_to_test = counts[counts >= n_test].index.tolist()
    print(f"\nBacktest: {len(leagues_to_test)} lig (>={n_test} maç) | son {n_test} maç kullanılacak\n")

    all_results = []
    markets_order = ["MS", "OU15", "OU25", "OU35", "BTTS", "HT", "HT_OU05", "HT_OU15", "HTFT", "OE", "CARDS", "HCAP"]

    header = f"{'Liga':<35} {'Maç':>5} | " + " | ".join(f"{m:>7}" for m in markets_order)
    print(header)
    print("-" * len(header))

    for league_id in leagues_to_test:
        df_league = df[df["league_id"] == league_id]
        # Sort once; both the league models and the V25 fallback use this slice.
        df_test = df_league.sort_values("mst_utc").tail(n_test)
        # str() keeps the fallback label working for non-string ids.
        name = league_names.get(league_id, str(league_id)[:20])

        league_model = loader.get(league_id)

        if league_model and league_model.models:
            mkt_results = _score_league_models(league_model, df_test, feature_cols)

            # Fill missing markets with general V25
            gen_results = backtest_with_general_v25(df_test, feature_cols)
            for k, v in gen_results.items():
                if k not in mkt_results:
                    mkt_results[k] = {**v, "source": "general_v25_fallback"}
        else:
            # No league model — use general V25
            mkt_results = backtest_with_general_v25(df_test, feature_cols)
            for k in mkt_results:
                mkt_results[k]["source"] = "general_v25"

        n_used = len(df_test)

        # Print row
        cells = []
        for m in markets_order:
            r = mkt_results.get(m, {})
            if "accuracy" in r:
                cells.append(f"{r['accuracy']*100:>6.1f}%")
            else:
                cells.append(f"{'—':>7}")
        print(f"{name:<35} {n_used:>5} | " + " | ".join(cells))

        all_results.append({
            "league_id": league_id,
            "league_name": name,
            "n_tested": n_used,
            "markets": mkt_results,
        })

    _print_summary(all_results, markets_order, len(header))

    # Save
    report = {"generated_at": pd.Timestamp.now().isoformat(), "n_test_per_league": n_test, "results": all_results}
    # The reports folder may not exist on a fresh checkout.
    os.makedirs(REPORTS_DIR, exist_ok=True)
    out_path = os.path.join(REPORTS_DIR, "backtest_league_results.json")
    with open(out_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor: {out_path}")
|
||
|
||
|
||
# Run the backtest only when this file is executed as a script.
if __name__ == "__main__":
    main()
|