wow

2026-06-11 00:25:45 +03:00
parent bb911176df
commit 4c137fbab6
9 changed files with 1246 additions and 6 deletions
@@ -0,0 +1,261 @@
+"""Calibration scoreboard — "dediğimiz vs olan" karnesi.
+
+Measures, on settled real-odds matches, how honest the DISPLAYED numbers are:
+
+  1. ANCHORED PIPELINE (what V35 shows): per market (MS 1/X/2, OU2.5, BTTS)
+     reliability buckets — mean stated probability vs actual frequency,
+     plus ECE / Brier per market.
+  2. SCORE CARD (V36): modal-score hit vs stated modal probability, top-5
+     coverage, HT modal hit.
+  3. STORED RUNS: prediction_runs settled per engine_version (the
+     `.sim-finished` buckets — the user's manual finished-match tests — are
+     reported separately and never mixed into the live karne).
+
+It recomputes the anchored numbers with the SAME modules the engine ships
+(models/market_anchor.py + models/score_matrix.py), so the scoreboard always
+grades current pipeline math, not a copy of it.
+
+DB: uses DATABASE_URL (data/db.py). Reads are gentle: a server-side cursor
+over an indexed, date-bounded join — never aggregate-scans the giant odds
+tables (prod runs on a Raspberry Pi).
+
+Usage:
+    python scripts/calibration_scoreboard.py [--days 365] [--buckets 10]
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import psycopg2  # noqa: E402
+from psycopg2.extras import RealDictCursor  # noqa: E402
+
+from data.db import get_clean_dsn  # noqa: E402
+from models.market_anchor import apply_corrections  # noqa: E402
+from models.score_matrix import build_calibrated_score_package  # noqa: E402
+
+# Strict lower bound on football_ai_features.odds_overround: the settled-match
+# query keeps a row only when odds_overround is greater than this value.
+REAL_ODDS_MIN_OVERROUND = 0.05  # the user's hard rule: no real odds -> excluded
+
+
+def _fetch_settled_matches(days: int) -> List[Dict[str, Any]]:
+    """Finished, real-odds matches with stored de-vigged implied probs."""
+    since_ms = int((time.time() - days * 86400) * 1000)
+    sql = """
+        SELECT f.implied_home, f.implied_draw, f.implied_away,
+               f.implied_over25, f.implied_btts_yes, f.odds_overround,
+               m.score_home, m.score_away, m.ht_score_home, m.ht_score_away
+        FROM football_ai_features f
+        JOIN matches m ON m.id = f.match_id
+        WHERE m.sport = 'football'
+          AND m.winner IN ('home', 'away', 'draw')
+          AND m.score_home IS NOT NULL
+          AND f.odds_overround > %s
+          AND m.mst_utc >= %s
+    """
+    rows: List[Dict[str, Any]] = []
+    with psycopg2.connect(get_clean_dsn()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SET statement_timeout = '120s'")
+        # server-side (named) cursor: streams gently instead of one big fetch
+        with conn.cursor("scoreboard_stream", cursor_factory=RealDictCursor) as cur:
+            cur.itersize = 5000
+            cur.execute(sql, (REAL_ODDS_MIN_OVERROUND, since_ms))
+            for r in cur:
+                rows.append(dict(r))
+    return rows
+
+
+def _anchored_probs(row: Dict[str, Any]) -> Optional[Tuple[float, float, float]]:
+    """The MS vector the V35 pipeline would display (devig is already done in
+    the stored features; apply the active home-favourite correction)."""
+    try:
+        p1 = float(row["implied_home"]); px = float(row["implied_draw"]); p2 = float(row["implied_away"])
+    except (TypeError, ValueError):
+        return None
+    if not (0.0 < p1 < 1.0 and 0.0 < px < 1.0 and 0.0 < p2 < 1.0):
+        return None
+    if abs(p1 + px + p2 - 1.0) > 0.02:  # not a clean de-vigged vector
+        return None
+    return apply_corrections(p1, px, p2)
+
+
+class Reliability:
+    """Accumulates (stated probability, outcome) pairs into buckets."""
+
+    def __init__(self, n_buckets: int) -> None:
+        self.n_buckets = n_buckets
+        self.n = defaultdict(int)
+        self.sum_p = defaultdict(float)
+        self.sum_y = defaultdict(int)
+
+    def add(self, p: float, hit: bool) -> None:
+        b = min(self.n_buckets - 1, int(p * self.n_buckets))
+        self.n[b] += 1
+        self.sum_p[b] += p
+        self.sum_y[b] += 1 if hit else 0
+
+    def report(self, title: str) -> Tuple[float, float]:
+        total = sum(self.n.values())
+        if not total:
+            print(f"\n== {title}: no data ==")
+            return 0.0, 0.0
+        ece = 0.0
+        brier_num = 0.0
+        print(f"\n== {title}  (n={total}) ==")
+        print(f"{'band':>10} {'n':>8} {'said%':>8} {'actual%':>8} {'gap_pt':>7}")
+        for b in sorted(self.n):
+            n = self.n[b]
+            said = self.sum_p[b] / n
+            act = self.sum_y[b] / n
+            ece += n * abs(said - act)
+            print(f"{b / self.n_buckets:>5.2f}-{(b + 1) / self.n_buckets:<4.2f} "
+                  f"{n:>8} {100 * said:>8.1f} {100 * act:>8.1f} {100 * (act - said):>7.1f}")
+        ece /= total
+        # Brier from bucket stats is approximate; recompute exactly elsewhere
+        # if needed. ECE is the headline honesty metric here.
+        print(f"{'ECE':>10}: {100 * ece:.2f}%")
+        return ece, brier_num
+
+
+def grade_pipeline(rows: List[Dict[str, Any]], n_buckets: int) -> None:
+    ms1 = Reliability(n_buckets); msx = Reliability(n_buckets); ms2 = Reliability(n_buckets)
+    ou = Reliability(n_buckets); btts = Reliability(n_buckets)
+    top1 = top5 = ht1 = 0
+    stated_modal = 0.0
+    n_score = 0
+
+    for r in rows:
+        anch = _anchored_probs(r)
+        sh, sa = int(r["score_home"]), int(r["score_away"])
+        winner = "home" if sh > sa else "away" if sa > sh else "draw"
+        if anch is not None:
+            p1, px, p2 = anch
+            ms1.add(p1, winner == "home")
+            msx.add(px, winner == "draw")
+            ms2.add(p2, winner == "away")
+        # exactly-0.5 values are DEFAULT FILL for matches without a real OU/BTTS
+        # market (measured: 15,993 of 78k OU rows) — never grade or use them.
+        try:
+            po = float(r["implied_over25"])
+            if po == 0.5 or not (0.05 < po < 0.95):
+                po = None
+            else:
+                ou.add(po, sh + sa >= 3)
+        except (TypeError, ValueError):
+            po = None
+        try:
+            pb = float(r["implied_btts_yes"])
+            if pb != 0.5 and 0.05 < pb < 0.95:
+                btts.add(pb, sh > 0 and sa > 0)
+        except (TypeError, ValueError):
+            pass
+
+        # V36 score card (sampled fully — pure math, no I/O)
+        if anch is not None and po is not None and 0.05 < po < 0.95:
+            pkg = build_calibrated_score_package(*anch, po)
+            actual = f"{min(sh, 10)}-{min(sa, 10)}"
+            n_score += 1
+            stated_modal += float(pkg["scenario_top5"][0]["prob"])
+            if pkg["ft"] == actual:
+                top1 += 1
+            if actual in [d["score"] for d in pkg["scenario_top5"]]:
+                top5 += 1
+            hh, ha = r.get("ht_score_home"), r.get("ht_score_away")
+            if hh is not None and ha is not None and pkg["ht"] == f"{min(int(hh),10)}-{min(int(ha),10)}":
+                ht1 += 1
+
+    ms1.report("MS ev (1) — anchored pipeline")
+    msx.report("MS beraberlik (X) — anchored pipeline")
+    ms2.report("MS deplasman (2) — anchored pipeline")
+    ou.report("Ust/Alt 2.5 (over) — devig")
+    btts.report("KG Var — devig")
+
+    if n_score:
+        print(f"\n== V36 skor karti (n={n_score}) ==")
+        print(f"  modal skor isabeti : {100 * top1 / n_score:.1f}%  (soylenen: {100 * stated_modal / n_score:.1f}%)")
+        print(f"  top-5 kapsama      : {100 * top5 / n_score:.1f}%")
+        print(f"  IY modal isabeti   : {100 * ht1 / n_score:.1f}%")
+
+
+def grade_stored_runs() -> None:
+    """Settle prediction_runs main_pick stated probabilities per engine_version.
+    `.sim-finished` buckets (manual finished-match tests) report separately."""
+    sql = """
+        SELECT pr.engine_version,
+               pr.payload_summary->'main_pick'->>'market' AS market,
+               pr.payload_summary->'main_pick'->>'pick' AS pick,
+               COALESCE((pr.payload_summary->'main_pick'->>'calibrated_probability')::float,
+                        (pr.payload_summary->'main_pick'->>'probability')::float) AS p,
+               m.score_home AS sh, m.score_away AS sa, m.winner AS w
+        FROM prediction_runs pr
+        JOIN matches m ON m.id = pr.match_id
+        WHERE m.score_home IS NOT NULL
+          AND jsonb_typeof(pr.payload_summary->'main_pick') = 'object'
+    """
+    with psycopg2.connect(get_clean_dsn()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SET statement_timeout = '60s'")
+        with conn.cursor(cursor_factory=RealDictCursor) as cur:
+            cur.execute(sql)
+            rows = cur.fetchall()
+
+    def settle(market: str, pick: str, sh: int, sa: int, w: str) -> Optional[bool]:
+        total = sh + sa
+        pick_u = (pick or "").upper()
+        over = "UST" in pick_u.replace("Ü", "U") or "OVER" in pick_u
+        if market == "MS":
+            return {"1": w == "home", "X": w == "draw", "2": w == "away"}.get(pick)
+        if market in ("OU15", "OU25", "OU35"):
+            line = {"OU15": 1.5, "OU25": 2.5, "OU35": 3.5}[market]
+            return total > line if over else total < line
+        if market == "BTTS":
+            yes = "VAR" in pick_u or "YES" in pick_u
+            return (sh > 0 and sa > 0) if yes else not (sh > 0 and sa > 0)
+        return None
+
+    stats: Dict[str, List[Tuple[float, bool]]] = defaultdict(list)
+    for r in rows:
+        if r["p"] is None:
+            continue
+        hit = settle(str(r["market"]), str(r["pick"]), int(r["sh"]), int(r["sa"]), str(r["w"]))
+        if hit is None:
+            continue
+        stats[str(r["engine_version"])].append((float(r["p"]), bool(hit)))
+
+    print("\n== prediction_runs karnesi (main_pick, soylenen vs olan) ==")
+    print(f"{'engine_version':<34} {'n':>5} {'said%':>8} {'actual%':>8}")
+    for ver in sorted(stats):
+        pairs = stats[ver]
+        n = len(pairs)
+        said = sum(p for p, _ in pairs) / n
+        act = sum(1 for _, h in pairs if h) / n
+        tag = "  <- test kovasi" if ver.endswith(".sim-finished") else ""
+        print(f"{ver:<34} {n:>5} {100 * said:>8.1f} {100 * act:>8.1f}{tag}")
+    if not stats:
+        print("  (settle edilebilir kayit yok)")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--days", type=int, default=365, help="lookback window (days)")
+    ap.add_argument("--buckets", type=int, default=10)
+    args = ap.parse_args()
+
+    t0 = time.time()
+    rows = _fetch_settled_matches(args.days)
+    print(f"settled real-odds matches loaded: {len(rows)} (last {args.days} days, "
+          f"{time.time() - t0:.1f}s)")
+    if rows:
+        grade_pipeline(rows, args.buckets)
+    grade_stored_runs()
+
+
+# Run the scoreboard only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,255 @@
+"""Guarded self-correction loop — fits the market-anchor correction table.
+
+What it does (the "tablo üreteci" of the feedback loop):
+
+  1. MEASURE: on settled real-odds matches, per implied-probability band, the
+     gap between the RAW de-vigged probability and the actual rate — for BOTH
+     the home side (ms_home) and the away side (ms_away).
+  2. BRAKE: a band only earns a correction if it passes the safety gates —
+        * min sample  (>= MIN_N matches in the band, fitted on TRAIN window)
+        * shrinkage   (delta = SHRINK x measured gap — never the full gap)
+        * clipping    (|delta| <= CLIP)
+        * materiality (|delta| >= MIN_DELTA, else 0 — don't chase noise)
+  3. PROVE: the candidate table must beat the CURRENTLY ACTIVE corrections
+     out-of-sample (most recent TEST_DAYS, never seen during fitting) on
+     combined home+away ECE. If it doesn't, nothing is written.
+  4. WRITE: versioned artifact `config/market_anchor_corrections.json`
+     (+ timestamped copy under `config/history/`). The engine reads the table
+     at runtime (models/market_anchor.py) — the loop never modifies code.
+
+Run weekly (cron) or manually after big data ingests:
+    python scripts/fit_anchor_corrections.py [--days 540] [--test-days 90]
+    python scripts/fit_anchor_corrections.py --dry-run   # measure only
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import sys
+import time
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import psycopg2  # noqa: E402
+from psycopg2.extras import RealDictCursor  # noqa: E402
+
+from data.db import get_clean_dsn  # noqa: E402
+from models.market_anchor import (  # noqa: E402
+    away_favorite_delta,
+    home_favorite_delta,
+)
+
+# ── safety gates ─────────────────────────────────────────────────────
+# MIN_N, SHRINK, CLIP and MIN_DELTA are applied per band by fit_candidate();
+# ACCEPT_MARGIN is applied by main() when comparing candidate vs active ECE.
+MIN_N = 1500        # band needs this many TRAIN matches to earn a correction
+SHRINK = 0.5        # apply only half of the measured gap
+CLIP = 0.05         # never correct more than 5 points
+MIN_DELTA = 0.004   # below this the correction is noise — emit 0
+ACCEPT_MARGIN = 0.0002  # candidate must beat active combined ECE by this
+
+# Implied-probability bands, matched as half-open intervals lo <= p < hi.
+# Probabilities below 0.05 fall in no band; the last upper bound is 1.01 so
+# that p == 1.0 still lands in the top band.
+BANDS: Tuple[Tuple[float, float], ...] = (
+    (0.05, 0.15), (0.15, 0.25), (0.25, 0.35), (0.35, 0.45),
+    (0.45, 0.55), (0.55, 0.65), (0.65, 0.75), (0.75, 0.85), (0.85, 1.01),
+)
+
+# Strict lower bound on odds_overround used by fetch()'s query.
+REAL_ODDS_MIN_OVERROUND = 0.05
+
+
+def fetch(days: int) -> List[Dict[str, Any]]:
+    since_ms = int((time.time() - days * 86400) * 1000)
+    sql = """
+        SELECT f.implied_home AS p1, f.implied_draw AS px, f.implied_away AS p2,
+               m.mst_utc,
+               (m.winner = 'home')::int AS home_won,
+               (m.winner = 'away')::int AS away_won
+        FROM football_ai_features f
+        JOIN matches m ON m.id = f.match_id
+        WHERE m.sport = 'football'
+          AND m.winner IN ('home', 'away', 'draw')
+          AND f.odds_overround > %s
+          AND m.mst_utc >= %s
+    """
+    out: List[Dict[str, Any]] = []
+    with psycopg2.connect(get_clean_dsn()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SET statement_timeout = '120s'")
+        with conn.cursor("fit_stream", cursor_factory=RealDictCursor) as cur:
+            cur.itersize = 5000
+            cur.execute(sql, (REAL_ODDS_MIN_OVERROUND, since_ms))
+            for r in cur:
+                p1, px, p2 = r["p1"], r["px"], r["p2"]
+                if p1 is None or px is None or p2 is None:
+                    continue
+                if abs(float(p1) + float(px) + float(p2) - 1.0) > 0.02:
+                    continue
+                out.append({
+                    "p1": float(p1), "p2": float(p2),
+                    "y1": int(r["home_won"]), "y2": int(r["away_won"]),
+                    "mst_utc": int(r["mst_utc"]),
+                })
+    return out
+
+
+def band_of(p: float) -> Optional[int]:
+    for i, (lo, hi) in enumerate(BANDS):
+        if lo <= p < hi:
+            return i
+    return None
+
+
+def fit_candidate(
+    train: List[Dict[str, Any]], pkey: str, ykey: str
+) -> List[Dict[str, Any]]:
+    n = defaultdict(int); sp = defaultdict(float); sy = defaultdict(int)
+    for r in train:
+        b = band_of(r[pkey])
+        if b is None:
+            continue
+        n[b] += 1; sp[b] += r[pkey]; sy[b] += r[ykey]
+    bands: List[Dict[str, Any]] = []
+    for i, (lo, hi) in enumerate(BANDS):
+        if n[i] < MIN_N:
+            continue  # gate: not enough evidence — no correction for this band
+        raw_gap = (sy[i] / n[i]) - (sp[i] / n[i])
+        delta = max(-CLIP, min(CLIP, SHRINK * raw_gap))
+        if abs(delta) < MIN_DELTA:
+            delta = 0.0
+        bands.append({"lo": lo, "hi": hi, "delta": round(delta, 4),
+                      "n": n[i], "raw_gap": round(raw_gap, 4)})
+    return bands
+
+
+def table_delta_fn(table: List[Dict[str, Any]]) -> Callable[[float], float]:
+    def fn(p: float) -> float:
+        for b in table:
+            if b["lo"] <= p < b["hi"]:
+                return b["delta"]
+        return 0.0
+    return fn
+
+
+def ece(rows: List[Dict[str, Any]], pkey: str, ykey: str,
+        delta_fn: Callable[[float], float]) -> float:
+    n = defaultdict(int); sp = defaultdict(float); sy = defaultdict(int)
+    for r in rows:
+        pc = min(0.98, r[pkey] + delta_fn(r[pkey]))
+        b = min(19, int(pc * 20))
+        n[b] += 1; sp[b] += pc; sy[b] += r[ykey]
+    total = sum(n.values())
+    if not total:
+        return 0.0
+    return sum(n[b] * abs(sp[b] / n[b] - sy[b] / n[b]) for b in n) / total
+
+
+def print_bands(title: str, bands: List[Dict[str, Any]]) -> None:
+    print(f"\ncandidate bands — {title} (after gates):")
+    print(f"{'band':>12} {'n':>8} {'raw_gap_pt':>11} {'delta_pt':>9}")
+    for b in bands:
+        print(f"{b['lo']:>5.2f}-{b['hi']:<5.2f} {b['n']:>8} "
+              f"{100 * b['raw_gap']:>11.2f} {100 * b['delta']:>9.2f}")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--days", type=int, default=540, help="total lookback")
+    ap.add_argument("--test-days", type=int, default=90,
+                    help="most recent window held out for acceptance")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="measure and report only — never write")
+    args = ap.parse_args()
+
+    rows = fetch(args.days)
+    cutoff_ms = int((time.time() - args.test_days * 86400) * 1000)
+    train = [r for r in rows if r["mst_utc"] < cutoff_ms]
+    test = [r for r in rows if r["mst_utc"] >= cutoff_ms]
+    print(f"matches: total={len(rows)}  train={len(train)}  test(OOS)={len(test)}")
+    if len(train) < 10 * MIN_N or len(test) < 2000:
+        print("ABORT: not enough data for a safe fit — keeping active table.")
+        return
+
+    cand_home = fit_candidate(train, "p1", "y1")
+    cand_away = fit_candidate(train, "p2", "y2")
+    print_bands("ms_home", cand_home)
+    print_bands("ms_away", cand_away)
+
+    # active = whatever the engine currently loads (artifact or fallback)
+    ece_h_act = ece(test, "p1", "y1", home_favorite_delta)
+    ece_a_act = ece(test, "p2", "y2", away_favorite_delta)
+    ece_h_cand = ece(test, "p1", "y1", table_delta_fn(cand_home))
+    ece_a_cand = ece(test, "p2", "y2", table_delta_fn(cand_away))
+    ece_h_raw = ece(test, "p1", "y1", lambda _p: 0.0)
+    ece_a_raw = ece(test, "p2", "y2", lambda _p: 0.0)
+
+    print(f"\nOOS ECE (home/away/combined):")
+    print(f"  raw (devig only) : {100 * ece_h_raw:.3f}% / {100 * ece_a_raw:.3f}% "
+          f"/ {100 * (ece_h_raw + ece_a_raw):.3f}%")
+    print(f"  ACTIVE table     : {100 * ece_h_act:.3f}% / {100 * ece_a_act:.3f}% "
+          f"/ {100 * (ece_h_act + ece_a_act):.3f}%")
+    print(f"  CANDIDATE table  : {100 * ece_h_cand:.3f}% / {100 * ece_a_cand:.3f}% "
+          f"/ {100 * (ece_h_cand + ece_a_cand):.3f}%")
+
+    if args.dry_run:
+        print("\n(dry-run: nothing written)")
+        return
+
+    combined_act = ece_h_act + ece_a_act
+    combined_cand = ece_h_cand + ece_a_cand
+    if combined_cand > combined_act - ACCEPT_MARGIN:
+        print("\nREJECTED: candidate does not beat the active table "
+              "out-of-sample. Active corrections stay. (Bu fren tasarim geregi:"
+              " kanitlayamayan tablo yazilmaz.)")
+        return
+
+    cfg_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "config"))
+    path = os.path.join(cfg_dir, "market_anchor_corrections.json")
+    artifact = {
+        "version": time.strftime("%Y-%m-%dT%H:%M:%S"),
+        "fitted_on": {"days": args.days, "test_days": args.test_days,
+                      "n_train": len(train), "n_test": len(test)},
+        "validated": {
+            "ece_home": {"raw": round(ece_h_raw, 5), "active_before": round(ece_h_act, 5),
+                         "candidate_oos": round(ece_h_cand, 5)},
+            "ece_away": {"raw": round(ece_a_raw, 5), "active_before": round(ece_a_act, 5),
+                         "candidate_oos": round(ece_a_cand, 5)},
+        },
+        "gates": {"min_n": MIN_N, "shrink": SHRINK, "clip": CLIP,
+                  "min_delta": MIN_DELTA},
+        "corrections": {"ms_home": cand_home, "ms_away": cand_away},
+    }
+    hist_dir = os.path.join(cfg_dir, "history")
+    os.makedirs(hist_dir, exist_ok=True)
+    if os.path.exists(path):
+        shutil.copy2(path, os.path.join(
+            hist_dir, f"market_anchor_corrections-{int(time.time())}.json"))
+    with open(path, "w", encoding="utf-8") as fh:
+        json.dump(artifact, fh, ensure_ascii=False, indent=2)
+    print(f"\nACCEPTED: wrote {path}")
+
+    # The deployed ai-engine container has NO volume mounts, so the file above
+    # is invisible to it — app_settings is the shared medium. Running engines
+    # re-read it within ~10 minutes (TTL in models/market_anchor.py).
+    try:
+        with psycopg2.connect(get_clean_dsn()) as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    """
+                    INSERT INTO app_settings (key, value, updated_at)
+                    VALUES ('market_anchor_corrections', %s, now())
+                    ON CONFLICT (key)
+                    DO UPDATE SET value = EXCLUDED.value, updated_at = now()
+                    """,
+                    (json.dumps(artifact, ensure_ascii=False),),
+                )
+            conn.commit()
+        print("ACCEPTED: upserted app_settings['market_anchor_corrections'] "
+              "(live engines refresh within ~10 min)")
+    except Exception as exc:  # file artifact still written — warn only
+        print(f"WARN: app_settings upsert failed: {exc}")
+
+
+# Run the fit only when this file is executed as a script.
+if __name__ == "__main__":
+    main()