main
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s

This commit is contained in:
2026-05-17 02:17:22 +03:00
parent 17ace9bd12
commit 94c7a4481a
53 changed files with 29602 additions and 7832 deletions
+310
View File
@@ -0,0 +1,310 @@
"""
League Model Backtest — Son 100+ Maç
======================================
Her lig için en son 100-200 maçı (eğitim datasından bağımsız, test seti)
lig bazlı modelle tahmin eder ve gerçek sonuçla karşılaştırır.
Usage:
python scripts/backtest_league_models.py
python scripts/backtest_league_models.py --min-matches 150
"""
import os, sys, json, warnings, argparse
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from models.league_model import get_league_model_loader, MARKET_META, FILE_TO_SIGNAL
# Directory two levels above this script; data/ and reports/ live under it.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports")
# The qualified-league list sits one directory above AI_ENGINE_DIR.
QL_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")

# Signal key -> ground-truth label column in the CSV.
LABEL_COLS = {
    "MS": "label_ms",
    "OU15": "label_ou15",
    "OU25": "label_ou25",
    "OU35": "label_ou35",
    "BTTS": "label_btts",
    "HT": "label_ht_result",
    "HT_OU05": "label_ht_ou05",
    "HT_OU15": "label_ht_ou15",
    "HTFT": "label_ht_ft",
    "OE": "label_odd_even",
    "CARDS": "label_cards_ou45",
    "HCAP": "label_handicap_ms",
}

# Inverse of FILE_TO_SIGNAL: signal key -> model file key.
SIGNAL_TO_FILE = {
    signal_key: file_key for file_key, signal_key in FILE_TO_SIGNAL.items()
}

# Columns that are never fed to a model: identifiers, timestamps,
# raw scores and every label column.
SKIP_COLS = {
    # identifiers / timestamp
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    # raw scores
    "score_home",
    "score_away",
    "total_goals",
    "ht_score_home",
    "ht_score_away",
    "ht_total_goals",
    # labels
    "label_ms",
    "label_ou05",
    "label_ou15",
    "label_ou25",
    "label_ou35",
    "label_btts",
    "label_ht_result",
    "label_ht_ou05",
    "label_ht_ou15",
    "label_ht_ft",
    "label_odd_even",
    "label_yellow_cards",
    "label_cards_ou45",
    "label_handicap_ms",
}
def backtest_league(
    league_id: str,
    df_league: pd.DataFrame,
    feature_cols: list,
    league_model,
    n_test: int,
) -> dict:
    """Backtest the league-specific model on the last ``n_test`` matches.

    The league's rows are ordered by ``mst_utc`` and the final ``n_test``
    rows form the test set. For every signal key in ``SIGNAL_TO_FILE`` that
    has a label column and a league-specific model, each labelled row is
    scored with ``league_model.predict_market`` and the index of the most
    probable outcome (position within ``MARKET_META[key][1]``) is compared
    with the label value.

    Args:
        league_id: Identifier of the league (not used by the computation).
        df_league: All rows of one league.
        feature_cols: Names of the feature columns passed to the model.
        league_model: Object exposing ``has_market`` and ``predict_market``;
            a falsy value yields an empty result.
        n_test: Number of most recent matches to evaluate.

    Returns:
        Mapping ``signal key -> {"accuracy", "n", "source"}``. Markets with
        fewer than 30 labelled rows, or with no usable prediction, are
        omitted.
    """
    df_test = df_league.sort_values("mst_utc").tail(n_test)
    X = df_test[feature_cols].fillna(0)
    results = {}
    if not league_model:
        return results

    for sig_key, mfile_key in SIGNAL_TO_FILE.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue

        # Only rows with a known outcome can be scored.
        has_label = df_test[label_col].notna().to_numpy()
        if has_label.sum() < 30:
            continue
        if not league_model.has_market(mfile_key):
            continue

        outcome_names = MARKET_META[mfile_key][1]
        labelled_targets = df_test[label_col].to_numpy()[has_label]
        labelled_features = X[has_label].to_dict("records")

        # Collect label and prediction together so the two lists stay
        # aligned even when the model returns nothing for some rows.
        y_true = []
        preds = []
        for actual, feat in zip(labelled_targets, labelled_features):
            probs = league_model.predict_market(mfile_key, feat)
            if not probs:
                continue
            best = max(probs, key=probs.__getitem__)
            preds.append(outcome_names.index(best))
            y_true.append(actual)

        if not preds:
            continue

        acc = accuracy_score(y_true, preds)
        results[sig_key] = {
            "accuracy": round(float(acc), 4),
            "n": len(preds),
            "source": "league_specific",
        }
    return results
def backtest_with_general_v25(
    df_test: pd.DataFrame,
    feature_cols: list,
) -> dict:
    """Backtest the general V25 XGBoost models on ``df_test``.

    Evaluates the MS, OU15, OU25, OU35 and BTTS markets. For each market
    only rows with a non-null label are scored, so predictions and labels
    always have the same length. Predicted class indices are compared
    directly with the label column values.

    Args:
        df_test: Rows to evaluate; must contain ``feature_cols`` and may
            contain the label columns listed in ``LABEL_COLS``.
        feature_cols: Names of the feature columns passed to the model.

    Returns:
        Mapping ``signal key -> {"accuracy", "n", "source"}``. Empty when
        the V25 predictor cannot be imported or loaded. Markets with fewer
        than 30 labelled rows, without an ``"xgb"`` model, or whose
        prediction fails are omitted.
    """
    try:
        from models.v25_ensemble import get_v25_predictor
        v25 = get_v25_predictor()
        if not v25._loaded:
            v25.load_models()
    except Exception as exc:
        # Best-effort fallback: report the reason instead of hiding it.
        print(f"[backtest] general V25 model unavailable: {exc}", file=sys.stderr)
        return {}

    X = df_test[feature_cols].fillna(0)
    feature_matrix = X.values
    results = {}
    # Signal key -> market key used by the V25 predictor.
    market_keys = {
        "MS": "ms",
        "OU15": "ou15",
        "OU25": "ou25",
        "OU35": "ou35",
        "BTTS": "btts",
    }

    for sig_key, mkey in market_keys.items():
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue

        has_label = df_test[label_col].notna().to_numpy()
        if has_label.sum() < 30 or not v25.has_market(mkey):
            continue
        y_true = df_test[label_col].to_numpy()[has_label]

        try:
            models_v25 = v25.models.get(mkey, {})
            if "xgb" not in models_v25:
                continue
            dmat = xgb.DMatrix(feature_matrix[has_label], feature_names=feature_cols)
            raw = np.asarray(models_v25["xgb"].predict(dmat))
            num_class = MARKET_META.get(mkey, (2,))[0]
            if raw.ndim == 2 and raw.shape[1] > 1:
                # Per-class probabilities already arranged one row per match.
                preds = np.argmax(raw, axis=1)
            elif num_class > 2:
                preds = np.argmax(raw.reshape(-1, num_class), axis=1)
            else:
                preds = (raw.reshape(-1) >= 0.5).astype(int)
            acc = accuracy_score(y_true, preds)
        except Exception as exc:
            print(f"[backtest] general V25 {sig_key} failed: {exc}", file=sys.stderr)
            continue

        results[sig_key] = {
            "accuracy": round(float(acc), 4),
            "n": len(preds),
            "source": "general_v25",
        }
    return results
def _load_league_names(league_ids: list) -> dict:
    """Return ``{league id: name}`` from the database, or ``{}`` on any failure."""
    try:
        import psycopg2
        from data.db import get_clean_dsn
        conn = psycopg2.connect(get_clean_dsn())
    except Exception:
        return {}
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (league_ids,))
        return {r[0]: r[1] for r in cur.fetchall()}
    except Exception:
        return {}
    finally:
        # Close the connection even when the query itself fails.
        conn.close()


def _backtest_league_boosters(league_model, df_test: pd.DataFrame, feature_cols: list) -> dict:
    """Batch-score every booster of ``league_model`` on ``df_test``.

    Only rows with a non-null label are scored for a market, so predictions
    and labels stay aligned. A market whose prediction raises is recorded
    as ``{"error": message}``.
    """
    feature_matrix = df_test[feature_cols].fillna(0).values
    mkt_results = {}
    for mfile_key in list(league_model.models.keys()):
        sig_key = FILE_TO_SIGNAL.get(mfile_key)
        if not sig_key:
            continue
        label_col = LABEL_COLS.get(sig_key)
        if not label_col or label_col not in df_test.columns:
            continue
        has_label = df_test[label_col].notna().to_numpy()
        if has_label.sum() < 30:
            continue
        y_true = df_test[label_col].to_numpy()[has_label]
        try:
            dmat = xgb.DMatrix(feature_matrix[has_label], feature_names=feature_cols)
            raw = np.asarray(league_model.models[mfile_key].predict(dmat))
            nc = MARKET_META[mfile_key][0]
            if nc > 2:
                preds = np.argmax(raw.reshape(-1, nc), axis=1)
            else:
                preds = (raw.reshape(-1) >= 0.5).astype(int)
            acc = accuracy_score(y_true, preds)
            mkt_results[sig_key] = {
                "accuracy": round(float(acc), 4),
                "n": int(len(preds)),
                "source": "league_xgb",
            }
        except Exception as e:
            mkt_results[sig_key] = {"error": str(e)}
    return mkt_results


def main():
    """Run the per-league backtest, print an accuracy table and save a JSON report.

    Reads the CSV at ``DATA_PATH``, keeps the qualified leagues that have at
    least ``max(--min-matches, --test-size)`` rows, and evaluates the most
    recent rows of each league with its league-specific boosters. Markets a
    league model does not cover are filled from the general V25 model.
    The report is written to ``REPORTS_DIR/backtest_league_results.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--min-matches", type=int, default=100)
    parser.add_argument("--test-size", type=int, default=150,
                        help="Son kaç maçı test için kullan (min 100)")
    args = parser.parse_args()
    n_test = max(args.min_matches, args.test_size)

    print("Loading training data ...")
    df = pd.read_csv(DATA_PATH, low_memory=False)
    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
    print(f"  {len(df):,} maç | {len(feature_cols)} feature")

    if os.path.exists(QL_PATH):
        with open(QL_PATH, encoding="utf-8") as fh:
            qualified = json.load(fh)
    else:
        qualified = []

    loader = get_league_model_loader()
    league_names = _load_league_names(qualified)

    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
    leagues_to_test = counts[counts >= n_test].index.tolist()
    print(f"\nBacktest: {len(leagues_to_test)} lig (>={n_test} maç) | son {n_test} maç kullanılacak\n")

    all_results = []
    markets_order = ["MS", "OU15", "OU25", "OU35", "BTTS", "HT", "HT_OU05", "HT_OU15", "HTFT", "OE", "CARDS", "HCAP"]
    header = f"{'Liga':<35} {'Maç':>5} | " + " | ".join(f"{m:>7}" for m in markets_order)
    print(header)
    print("-" * len(header))

    for league_id in leagues_to_test:
        df_league = df[df["league_id"] == league_id]
        # league_id may be non-string (e.g. an integer id), so stringify before slicing.
        name = league_names.get(league_id, str(league_id)[:20])
        df_test = df_league.sort_values("mst_utc").tail(n_test)

        league_model = loader.get(league_id)
        if league_model and league_model.models:
            mkt_results = _backtest_league_boosters(league_model, df_test, feature_cols)
            # Fill markets the league model does not cover with general V25.
            gen_results = backtest_with_general_v25(df_test, feature_cols)
            for k, v in gen_results.items():
                if k not in mkt_results:
                    mkt_results[k] = {**v, "source": "general_v25_fallback"}
        else:
            # No league model: use general V25 for everything.
            mkt_results = backtest_with_general_v25(df_test, feature_cols)
            for k in mkt_results:
                mkt_results[k]["source"] = "general_v25"

        n_used = len(df_test)

        # One table row per league; blank cell when a market has no accuracy.
        accs = []
        for m in markets_order:
            r = mkt_results.get(m, {})
            if "accuracy" in r:
                accs.append(f"{r['accuracy']*100:>6.1f}%")
            else:
                accs.append(f"{'':>7}")
        print(f"{name:<35} {n_used:>5} | " + " | ".join(accs))

        all_results.append({
            "league_id": league_id,
            "league_name": name,
            "n_tested": n_used,
            "markets": mkt_results,
        })

    # Summary across all leagues
    print("\n" + "=" * len(header))
    print("ORTALAMA DOĞRULUK (tüm ligler):")
    for m in markets_order:
        accs = [
            r["markets"][m]["accuracy"]
            for r in all_results
            if m in r["markets"] and "accuracy" in r["markets"][m]
        ]
        if accs:
            print(f"  {m:<10}: {np.mean(accs)*100:.1f}%  (min={min(accs)*100:.1f}%  max={max(accs)*100:.1f}%  n_leagues={len(accs)})")

    # Best / worst leagues for the MS market
    ms_sorted = sorted(
        [
            (r["league_name"], r["markets"]["MS"]["accuracy"], r["n_tested"])
            for r in all_results
            if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]
        ],
        key=lambda x: x[1],
        reverse=True,
    )
    print("\nEN İYİ MS (Top 10):")
    for name, acc, n in ms_sorted[:10]:
        print(f"  {name:<35} {acc*100:.1f}%  ({n} maç)")
    print("\nEN KÖTÜ MS (Bottom 10):")
    for name, acc, n in ms_sorted[-10:]:
        print(f"  {name:<35} {acc*100:.1f}%  ({n} maç)")

    # Save the report, creating the reports directory if needed.
    report = {"generated_at": pd.Timestamp.now().isoformat(), "n_test_per_league": n_test, "results": all_results}
    os.makedirs(REPORTS_DIR, exist_ok=True)
    out_path = os.path.join(REPORTS_DIR, "backtest_league_results.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\nRapor: {out_path}")


if __name__ == "__main__":
    main()