"""Calibration scoreboard — "dediğimiz vs olan" karnesi. Measures, on settled real-odds matches, how honest the DISPLAYED numbers are: 1. ANCHORED PIPELINE (what V35 shows): per market (MS 1/X/2, OU2.5, BTTS) reliability buckets — mean stated probability vs actual frequency, plus ECE / Brier per market. 2. SCORE CARD (V36): modal-score hit vs stated modal probability, top-5 coverage, HT modal hit. 3. STORED RUNS: prediction_runs settled per engine_version (the `.sim-finished` buckets — the user's manual finished-match tests — are reported separately and never mixed into the live karne). It recomputes the anchored numbers with the SAME modules the engine ships (models/market_anchor.py + models/score_matrix.py), so the scoreboard always grades current pipeline math, not a copy of it. DB: uses DATABASE_URL (data/db.py). Reads are gentle: a server-side cursor over an indexed, date-bounded join — never aggregate-scans the giant odds tables (prod runs on a Raspberry Pi). Usage: python scripts/calibration_scoreboard.py [--days 365] [--buckets 10] """ from __future__ import annotations import argparse import os import sys import time from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import psycopg2 # noqa: E402 from psycopg2.extras import RealDictCursor # noqa: E402 from data.db import get_clean_dsn # noqa: E402 from models.market_anchor import apply_corrections # noqa: E402 from models.score_matrix import build_calibrated_score_package # noqa: E402 REAL_ODDS_MIN_OVERROUND = 0.05 # the user's hard rule: no real odds -> excluded def _fetch_settled_matches(days: int) -> List[Dict[str, Any]]: """Finished, real-odds matches with stored de-vigged implied probs.""" since_ms = int((time.time() - days * 86400) * 1000) sql = """ SELECT f.implied_home, f.implied_draw, f.implied_away, f.implied_over25, f.implied_btts_yes, f.odds_overround, m.score_home, m.score_away, m.ht_score_home, m.ht_score_away FROM football_ai_features f JOIN matches m ON m.id = f.match_id WHERE m.sport = 'football' AND m.winner IN ('home', 'away', 'draw') AND m.score_home IS NOT NULL AND f.odds_overround > %s AND m.mst_utc >= %s """ rows: List[Dict[str, Any]] = [] with psycopg2.connect(get_clean_dsn()) as conn: with conn.cursor() as cur: cur.execute("SET statement_timeout = '120s'") # server-side (named) cursor: streams gently instead of one big fetch with conn.cursor("scoreboard_stream", cursor_factory=RealDictCursor) as cur: cur.itersize = 5000 cur.execute(sql, (REAL_ODDS_MIN_OVERROUND, since_ms)) for r in cur: rows.append(dict(r)) return rows def _anchored_probs(row: Dict[str, Any]) -> Optional[Tuple[float, float, float]]: """The MS vector the V35 pipeline would display (devig is already done in the stored features; apply the active home-favourite correction).""" try: p1 = float(row["implied_home"]); px = float(row["implied_draw"]); p2 = float(row["implied_away"]) except (TypeError, ValueError): return None if not (0.0 < p1 < 1.0 and 0.0 < px < 1.0 and 0.0 < p2 < 1.0): return None if abs(p1 + px + p2 - 1.0) > 0.02: # not a clean de-vigged vector return None return apply_corrections(p1, px, p2) class Reliability: """Accumulates (stated probability, outcome) pairs into buckets.""" def __init__(self, n_buckets: int) -> None: self.n_buckets = n_buckets self.n = defaultdict(int) self.sum_p = defaultdict(float) self.sum_y = defaultdict(int) def add(self, p: float, hit: bool) -> None: b = min(self.n_buckets - 1, int(p * self.n_buckets)) self.n[b] += 1 self.sum_p[b] += p self.sum_y[b] += 1 if hit else 0 def report(self, title: str) -> Tuple[float, float]: total = sum(self.n.values()) if not total: print(f"\n== {title}: no data ==") return 0.0, 0.0 ece = 0.0 brier_num = 0.0 print(f"\n== {title} (n={total}) ==") print(f"{'band':>10} {'n':>8} {'said%':>8} {'actual%':>8} {'gap_pt':>7}") for b in sorted(self.n): n = self.n[b] said = self.sum_p[b] / n act = self.sum_y[b] / n ece += n * abs(said - act) print(f"{b / self.n_buckets:>5.2f}-{(b + 1) / self.n_buckets:<4.2f} " f"{n:>8} {100 * said:>8.1f} {100 * act:>8.1f} {100 * (act - said):>7.1f}") ece /= total # Brier from bucket stats is approximate; recompute exactly elsewhere # if needed. ECE is the headline honesty metric here. print(f"{'ECE':>10}: {100 * ece:.2f}%") return ece, brier_num def grade_pipeline(rows: List[Dict[str, Any]], n_buckets: int) -> None: ms1 = Reliability(n_buckets); msx = Reliability(n_buckets); ms2 = Reliability(n_buckets) ou = Reliability(n_buckets); btts = Reliability(n_buckets) top1 = top5 = ht1 = 0 stated_modal = 0.0 n_score = 0 for r in rows: anch = _anchored_probs(r) sh, sa = int(r["score_home"]), int(r["score_away"]) winner = "home" if sh > sa else "away" if sa > sh else "draw" if anch is not None: p1, px, p2 = anch ms1.add(p1, winner == "home") msx.add(px, winner == "draw") ms2.add(p2, winner == "away") # exactly-0.5 values are DEFAULT FILL for matches without a real OU/BTTS # market (measured: 15,993 of 78k OU rows) — never grade or use them. try: po = float(r["implied_over25"]) if po == 0.5 or not (0.05 < po < 0.95): po = None else: ou.add(po, sh + sa >= 3) except (TypeError, ValueError): po = None try: pb = float(r["implied_btts_yes"]) if pb != 0.5 and 0.05 < pb < 0.95: btts.add(pb, sh > 0 and sa > 0) except (TypeError, ValueError): pass # V36 score card (sampled fully — pure math, no I/O) if anch is not None and po is not None and 0.05 < po < 0.95: pkg = build_calibrated_score_package(*anch, po) actual = f"{min(sh, 10)}-{min(sa, 10)}" n_score += 1 stated_modal += float(pkg["scenario_top5"][0]["prob"]) if pkg["ft"] == actual: top1 += 1 if actual in [d["score"] for d in pkg["scenario_top5"]]: top5 += 1 hh, ha = r.get("ht_score_home"), r.get("ht_score_away") if hh is not None and ha is not None and pkg["ht"] == f"{min(int(hh),10)}-{min(int(ha),10)}": ht1 += 1 ms1.report("MS ev (1) — anchored pipeline") msx.report("MS beraberlik (X) — anchored pipeline") ms2.report("MS deplasman (2) — anchored pipeline") ou.report("Ust/Alt 2.5 (over) — devig") btts.report("KG Var — devig") if n_score: print(f"\n== V36 skor karti (n={n_score}) ==") print(f" modal skor isabeti : {100 * top1 / n_score:.1f}% (soylenen: {100 * stated_modal / n_score:.1f}%)") print(f" top-5 kapsama : {100 * top5 / n_score:.1f}%") print(f" IY modal isabeti : {100 * ht1 / n_score:.1f}%") def grade_stored_runs() -> None: """Settle prediction_runs main_pick stated probabilities per engine_version. `.sim-finished` buckets (manual finished-match tests) report separately.""" sql = """ SELECT pr.engine_version, pr.payload_summary->'main_pick'->>'market' AS market, pr.payload_summary->'main_pick'->>'pick' AS pick, COALESCE((pr.payload_summary->'main_pick'->>'calibrated_probability')::float, (pr.payload_summary->'main_pick'->>'probability')::float) AS p, m.score_home AS sh, m.score_away AS sa, m.winner AS w FROM prediction_runs pr JOIN matches m ON m.id = pr.match_id WHERE m.score_home IS NOT NULL AND jsonb_typeof(pr.payload_summary->'main_pick') = 'object' """ with psycopg2.connect(get_clean_dsn()) as conn: with conn.cursor() as cur: cur.execute("SET statement_timeout = '60s'") with conn.cursor(cursor_factory=RealDictCursor) as cur: cur.execute(sql) rows = cur.fetchall() def settle(market: str, pick: str, sh: int, sa: int, w: str) -> Optional[bool]: total = sh + sa pick_u = (pick or "").upper() over = "UST" in pick_u.replace("Ü", "U") or "OVER" in pick_u if market == "MS": return {"1": w == "home", "X": w == "draw", "2": w == "away"}.get(pick) if market in ("OU15", "OU25", "OU35"): line = {"OU15": 1.5, "OU25": 2.5, "OU35": 3.5}[market] return total > line if over else total < line if market == "BTTS": yes = "VAR" in pick_u or "YES" in pick_u return (sh > 0 and sa > 0) if yes else not (sh > 0 and sa > 0) return None stats: Dict[str, List[Tuple[float, bool]]] = defaultdict(list) for r in rows: if r["p"] is None: continue hit = settle(str(r["market"]), str(r["pick"]), int(r["sh"]), int(r["sa"]), str(r["w"])) if hit is None: continue stats[str(r["engine_version"])].append((float(r["p"]), bool(hit))) print("\n== prediction_runs karnesi (main_pick, soylenen vs olan) ==") print(f"{'engine_version':<34} {'n':>5} {'said%':>8} {'actual%':>8}") for ver in sorted(stats): pairs = stats[ver] n = len(pairs) said = sum(p for p, _ in pairs) / n act = sum(1 for _, h in pairs if h) / n tag = " <- test kovasi" if ver.endswith(".sim-finished") else "" print(f"{ver:<34} {n:>5} {100 * said:>8.1f} {100 * act:>8.1f}{tag}") if not stats: print(" (settle edilebilir kayit yok)") def main() -> None: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--days", type=int, default=365, help="lookback window (days)") ap.add_argument("--buckets", type=int, default=10) args = ap.parse_args() t0 = time.time() rows = _fetch_settled_matches(args.days) print(f"settled real-odds matches loaded: {len(rows)} (last {args.days} days, " f"{time.time() - t0:.1f}s)") if rows: grade_pipeline(rows, args.buckets) grade_stored_runs() if __name__ == "__main__": main()