Files
iddaai-be/ai-engine/scripts/calibration_report.py
T
fahricansecer 7b17aa1fee
Deploy Iddaai Backend / build-and-deploy (push) Successful in 33s
gg2
2026-06-07 15:59:41 +03:00

113 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Calibration Report — are the model's probabilities "kusursuz"?
=============================================================
"Flawless probability" has a precise technical meaning: CALIBRATION. When the
model says 60%, the event must happen ~60% of the time. This measures exactly
that for the leak-free MS (1X2) model, and shows how much isotonic calibration
improves it.
Metrics:
* Reliability table: bin predicted prob -> avg predicted vs ACTUAL frequency.
Calibrated = avg_pred ≈ actual in every bin (gap ≈ 0).
* ECE (Expected Calibration Error): weighted mean |pred - actual|. Lower=better.
* Brier score, Log-loss: overall probability accuracy. Lower=better.
Time-split (no leakage): train 70% -> fit isotonic on next 15% -> test last 15%.
Usage: python scripts/calibration_report.py
"""
from __future__ import annotations
import os, sys
import numpy as np, pandas as pd, xgboost as xgb
from sklearn.isotonic import IsotonicRegression
# Best-effort: ask stdout to encode as UTF-8 (the report prints non-ASCII
# Turkish text). Streams without ``reconfigure`` are skipped and any failure
# is deliberately ignored so the report still runs.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# Parent of the directory that contains this script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Training table read by main().
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")

# Identifier, timestamp and score columns; main() never uses them as features.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Columns main() drops from the feature set (named as leaking the outcome).
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}

# XGBoost settings: 3-class soft-probability output (home / draw / away).
PARAMS = {
    "objective": "multi:softprob",
    "num_class": 3,
    "max_depth": 5,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "verbosity": 0,
}
def reliability(probs, y, nbins=10):
    """Build a pooled one-vs-rest reliability table and its ECE.

    Every (sample, class) probability is treated as one binary prediction
    whose outcome is 1 when that class is the true label and 0 otherwise.
    The pooled predictions are grouped into ``nbins`` equal-width bands over
    [0, 1]; bands that receive no prediction are omitted from the table.

    Args:
        probs: array-like of shape (n_samples, n_classes) holding the
            predicted class probabilities.
        y: integer class labels, one per row of ``probs``.
        nbins: number of equal-width probability bands.

    Returns:
        A ``(rows, ece)`` pair. Each row is
        ``(band_label, count, mean_predicted, actual_frequency, gap)`` with
        ``gap = actual_frequency - mean_predicted``. ``ece`` is the sum of
        ``|gap|`` weighted by ``count / total``, where ``total`` is the number
        of pooled predictions.
    """
    probs = np.asarray(probs)
    P = probs.reshape(-1)
    # One-hot outcomes, flattened in the same order as P.
    hit = np.zeros((len(y), probs.shape[1]))
    hit[np.arange(len(y)), y] = 1.0
    H = hit.reshape(-1)
    edges = np.linspace(0, 1, nbins + 1)
    rows, ece, N = [], 0.0, len(P)
    for i in range(nbins):
        lo, hi = edges[i], edges[i + 1]
        # Bands are half-open; only the last one also includes p == hi (1.0).
        m = (P >= lo) & (P < hi) if i < nbins - 1 else (P >= lo) & (P <= hi)
        n = int(m.sum())
        if n == 0:
            continue
        ap, af = P[m].mean(), H[m].mean()
        # Format the edges instead of truncating with int(): float noise such
        # as 0.29 * 100 == 28.999999999999996 would otherwise yield the label
        # "28-30%" for the 29-30% band.
        label = f"{lo * 100:.6g}-{hi * 100:.6g}%"
        rows.append((label, n, ap, af, af - ap))
        ece += (n / N) * abs(ap - af)
    return rows, ece
def brier(probs, y):
    """Return the multi-class Brier score.

    For each sample the squared differences between the predicted probability
    row and the one-hot encoding of the true label are summed over classes;
    the result is the mean of those per-sample sums (lower is better).
    """
    sample_idx = np.arange(len(y))
    target = np.zeros_like(probs)
    target[sample_idx, y] = 1.0
    per_sample = np.square(probs - target).sum(axis=1)
    return float(per_sample.mean())
def logloss(probs, y):
    """Return the mean negative log-probability assigned to the true class.

    The true-class probability is clipped to [1e-9, 1] before taking the log,
    so a zero probability gives a large finite penalty instead of infinity.
    """
    true_class_p = probs[np.arange(len(y)), y]
    bounded = np.clip(true_class_p, 1e-9, 1)
    return float(-np.log(bounded).mean())
def main():
    """Train the 1X2 model on a time-ordered split and print calibration tables.

    Steps:
      1. Load the training CSV, sort by ``mst_utc`` and keep rows whose final
         score parses as numeric.
      2. Label each match 0 (home win), 1 (draw) or 2 (away win).
      3. Use every column that is not in META, not in LEAKY and not prefixed
         ``label_`` as a feature; non-numeric / missing values become 0.0.
      4. Split in row order: first 70% train, next 15% calibration, rest test.
      5. Fit XGBoost on the train part, fit one isotonic regressor per class on
         the calibration part, and apply them to the test part.
      6. Print the reliability table, ECE, Brier score and log-loss on the
         test part for both the raw and the calibrated probabilities.
    """
    frame = pd.read_csv(CSV, low_memory=False)
    frame = frame.sort_values("mst_utc").reset_index(drop=True)

    # Drop matches without a usable final score.
    home_goals = pd.to_numeric(frame["score_home"], errors="coerce")
    away_goals = pd.to_numeric(frame["score_away"], errors="coerce")
    has_score = home_goals.notna() & away_goals.notna()
    keep = has_score.values
    home_goals = home_goals[keep].values
    away_goals = away_goals[keep].values
    frame = frame[has_score].reset_index(drop=True)

    # 0 = home win, 1 = draw, 2 = away win.
    draw_or_away = np.where(home_goals == away_goals, 1, 2)
    labels = np.where(home_goals > away_goals, 0, draw_or_away)

    feature_cols = []
    for col in frame.columns:
        if col in META:
            continue
        if col.startswith("label_") or col in LEAKY:
            continue
        feature_cols.append(col)
    numeric = frame[feature_cols].apply(pd.to_numeric, errors="coerce")
    features = numeric.fillna(0.0).values

    # Split by position in the time-sorted frame: 70% / 15% / 15%.
    total = len(frame)
    train_end = int(total * 0.70)
    calib_end = int(total * 0.85)
    X_train, y_train = features[:train_end], labels[:train_end]
    X_calib, y_calib = features[train_end:calib_end], labels[train_end:calib_end]
    X_test, y_test = features[calib_end:], labels[calib_end:]
    print(f"{total:,} matches | train {len(y_train):,} / calib {len(y_calib):,} / test {len(y_test):,} (time-split)")

    booster = xgb.train(PARAMS, xgb.DMatrix(X_train, label=y_train), num_boost_round=300)
    raw_calib = booster.predict(xgb.DMatrix(X_calib))
    raw_test = booster.predict(xgb.DMatrix(X_test))

    # One isotonic regressor per class: fitted on the calibration slice,
    # applied to the test slice; rows are then renormalised to sum to 1.
    calibrated_cols = []
    for k in range(3):
        iso = IsotonicRegression(out_of_bounds="clip", y_min=0, y_max=1)
        iso.fit(raw_calib[:, k], (y_calib == k).astype(float))
        calibrated_cols.append(iso.predict(raw_test[:, k]))
    cal_test = np.column_stack(calibrated_cols)
    cal_test = np.clip(cal_test, 1e-6, 1)
    cal_test = cal_test / cal_test.sum(axis=1, keepdims=True)

    rule = "=" * 64
    reports = (("RAW (kalibrasyonsuz)", raw_test), ("ISOTONIC KALİBRELİ", cal_test))
    for title, predicted in reports:
        table, ece = reliability(predicted, y_test)
        print(f"\n{rule}\n{title}\n{rule}")
        print(f" {'tahmin bandı':<12}{'n':>7}{'ort.tahmin':>12}{'gerçek':>9}{'fark':>8}")
        for band, count, avg_pred, actual, gap in table:
            print(f" {band:<12}{count:>7}{100*avg_pred:>11.1f}%{100*actual:>8.1f}%{100*gap:>+7.1f}")
        print(f" ECE={100*ece:.2f}% Brier={brier(predicted, y_test):.4f} LogLoss={logloss(predicted, y_test):.4f}")

    print("\nOKUMA: 'fark' ≈ 0 ise olasılıklar KUSURSUZ (söylediği %X gerçekten %X).")
    print("ECE/Brier/LogLoss düştüyse kalibrasyon işe yaradı. Bu kalibre olasılıklar,")
    print("maçın olası sonuçlarını dürüstçe gösterir — kayıp-minimizasyonun temeli budur.")


if __name__ == "__main__":
    main()