# iddaai-be/ai-engine/models/market_anchor.py

"""Market-anchored calibration (V35) — pure functions, no I/O.

WHY THIS EXISTS
---------------
The model's invented per-market probabilities were *measured* to be badly
overconfident. Grading the engine's own stored predictions against actual
results: it says ~50% where reality is ~25%, ~67% where reality is ~37%
(calibration error / ECE on the order of 25-30%). That mis-calibration is the
direct cause of the false "value" picks and the negative realised ROI.

The de-vigged market price, by contrast, is empirically near-perfectly
calibrated. Out-of-sample (correction fit on 2023-24, tested on 2025-26;
78k real-odds football matches) the de-vigged market's ECE was:
    home 1.56% | draw 1.85% | away 1.49% | over2.5 1.79% | btts 1.38%
Adding one small, large-sample home-favourite correction cut MS-home ECE
from 1.56% -> 0.64%.

So for the DISPLAYED probabilities we anchor to the de-vigged market and apply
only that one proven correction. ~20-40x more calibrated than the model's
numbers, and fully transparent.

These functions are pure (stdlib only) so they can be unit-tested in isolation
without the DB or the heavy model stack.
"""

from __future__ import annotations

import json
import os
import threading
import time
from typing import Any, Dict, List, Optional, Tuple


def devig(odds: List[Optional[float]]) -> Optional[List[float]]:
    """Vig-removed (fair) probabilities from a group of decimal odds.

    ``p_i = (1/odds_i) / Σ(1/odds_j)`` — normalising the raw implied
    probabilities to sum to 1 removes the bookmaker margin.

    Returns ``None`` when the group is empty or ANY leg is missing or non-real:
    ``None``, not convertible to a number, ``<= 1.01``, NaN, or infinite. That
    is deliberate: a market with a missing/placeholder leg has no real price,
    and the product rule is to never fabricate numbers for a match without
    odds.
    """
    if not odds:
        return None
    inv: List[float] = []
    for leg in odds:
        if leg is None:
            return None
        try:
            price = float(leg)
        except (TypeError, ValueError):
            # An unparseable leg is treated like a missing one.
            return None
        # Phrased as "not (valid)" so NaN — for which every comparison is
        # False — is rejected; the upper bound rejects +inf, which would
        # otherwise yield a fabricated zero-probability leg.
        if not (1.01 < price < float("inf")):
            return None
        inv.append(1.0 / price)
    total = sum(inv)
    if total <= 0.0:
        return None
    return [x / total for x in inv]


# Home-favourite correction: measured (actual home-win rate − de-vigged implied)
# by implied-home band, out-of-sample on real-odds matches. Big home favourites
# win a few points MORE than the de-vigged price implies; underdogs are roughly
# unbiased. Values are deliberately conservative — universal and shrunk toward 0
# vs the raw tier-0 (soft-league) edge, because the bias is weaker in efficient
# top leagues. Applying these took MS-home OOS ECE 1.56% -> 0.64%.
#
# These static bands are the BUILT-IN FALLBACK. The live values come from the
# versioned correction artifact resolved by `_load_corrections()` (the
# app_settings row 'market_anchor_corrections' first, then the bundled
# `config/market_anchor_corrections.json`), refreshed by
# `scripts/fit_anchor_corrections.py` (the guarded self-correction loop:
# measure on settled matches -> shrink/clip/min-sample gates -> out-of-sample
# acceptance -> write table). The engine only ever consumes the TABLE — the
# loop never modifies code.
#
# Each entry is (lo, hi, delta): `delta` is added to the home probability when
# lo <= p_home < hi. The last band's upper bound is 1.01 so that p_home == 1.0
# still falls inside the half-open range.
_HOME_FAV_BANDS: Tuple[Tuple[float, float, float], ...] = (
    (0.45, 0.55, 0.010),
    (0.55, 0.65, 0.018),
    (0.65, 0.75, 0.028),
    (0.75, 1.01, 0.034),
)

# Location of the bundled correction artifact, resolved relative to this
# module's own directory: <this dir>/../config/market_anchor_corrections.json.
_DEFAULT_CORRECTIONS_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "..", "config",
    "market_anchor_corrections.json",
)


def _corrections_path() -> str:
    return os.environ.get(
        "MARKET_ANCHOR_CORRECTIONS_PATH", _DEFAULT_CORRECTIONS_PATH
    )
# Module-level cache state for the resolved correction table.
# `_corrections_lock` serialises refreshes and explicit invalidation.
_corrections_lock = threading.Lock()
# Last resolved table; None means "no artifact — use the static fallback bands".
_corrections_cache: Optional[Dict[str, Any]] = None
# time.time() of the last refresh; 0.0 forces a refresh on the next lookup.
_corrections_ts: float = 0.0
# Re-check sources at most every 10 minutes: the self-correction cron writes a
# new table to app_settings; running engines pick it up WITHOUT a restart.
_CORRECTIONS_TTL_S = 600.0


def _parse_corrections(raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    parsed_table: Dict[str, Any] = {}
    for key in ("ms_home", "ms_away"):
        bands = raw.get("corrections", {}).get(key)
        if not (isinstance(bands, list) and bands):
            continue
        parsed = []
        for b in bands:
            lo = float(b["lo"]); hi = float(b["hi"]); delta = float(b["delta"])
            if not (0.0 <= lo < hi <= 1.01) or abs(delta) > 0.10:
                raise ValueError(f"correction band out of range: {b}")
            parsed.append((lo, hi, delta))
        parsed_table[key] = tuple(parsed)
    if not parsed_table:
        return None
    parsed_table["version"] = str(raw.get("version", "?"))
    return parsed_table


def _db_corrections_raw() -> Optional[Dict[str, Any]]:
    """Fetch the correction artifact from app_settings (the deployment's shared
    medium — the ai-engine container has no volume mounts, so a host-side cron
    can only reach the running engine through the database). Guarded: any
    failure → None, never breaks a prediction. Disable with MARKET_ANCHOR_DB=0."""
    if os.environ.get("MARKET_ANCHOR_DB", "1") == "0":
        return None
    try:
        import psycopg2  # local import: keeps module usable without DB deps
        from data.db import get_clean_dsn

        with psycopg2.connect(get_clean_dsn(), connect_timeout=3) as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT value FROM app_settings"
                    " WHERE key = 'market_anchor_corrections'"
                )
                row = cur.fetchone()
        if row and row[0]:
            return json.loads(row[0])
    except Exception:
        return None
    return None


def _load_corrections() -> Optional[Dict[str, Any]]:
    """Resolve the active correction table (lock-guarded, TTL-cached).

    Source order:
      1. MARKET_ANCHOR_CORRECTIONS_PATH env file (tests/dev — file-only mode,
         malformed → static fallback, DB and default file are NOT consulted)
      2. app_settings DB row 'market_anchor_corrections' (production path —
         refreshed by scripts/fit_anchor_corrections.py)
      3. bundled config/market_anchor_corrections.json
      4. None → built-in static fallback bands

    Returns the parsed table, or ``None`` when no source yields a valid one.
    A malformed artifact never raises: it resolves to ``None`` and the refresh
    timestamp is still recorded, so a bad artifact is re-checked once per TTL
    rather than on every call.
    """
    global _corrections_cache, _corrections_ts
    now = time.time()
    # Cheap check without the lock: within the TTL the cached value is served.
    if now - _corrections_ts < _CORRECTIONS_TTL_S:
        return _corrections_cache
    with _corrections_lock:
        # Re-check under the lock: another thread may have refreshed while
        # this one was waiting.
        if now - _corrections_ts < _CORRECTIONS_TTL_S:
            return _corrections_cache
        env_path = os.environ.get("MARKET_ANCHOR_CORRECTIONS_PATH")
        if env_path:
            table = _read_corrections_file(env_path)
        else:
            table = None
            raw = _db_corrections_raw()
            if raw is not None:
                try:
                    table = _parse_corrections(raw)
                except (ValueError, KeyError, TypeError, AttributeError):
                    # AttributeError covers a payload that is not a JSON
                    # object (e.g. a list), which has no `.get`.
                    table = None
            if table is None:
                # An empty env override reaches this branch; fall back to the
                # bundled default instead of trying to open "".
                table = _read_corrections_file(
                    _corrections_path() or _DEFAULT_CORRECTIONS_PATH
                )
        _corrections_cache = table
        _corrections_ts = time.time()
        return _corrections_cache


def _read_corrections_file(path: str) -> Optional[Dict[str, Any]]:
    """Parse the correction artifact at *path*; any read/parse failure → None."""
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return _parse_corrections(json.load(fh))
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        # json.JSONDecodeError is a ValueError subclass, so it is covered.
        return None


def reload_corrections() -> None:
    """Invalidate the cached correction table so the next lookup re-reads the
    sources (call after refreshing the artifact, or between tests)."""
    global _corrections_cache, _corrections_ts
    with _corrections_lock:
        # Timestamp 0.0 marks the cache as expired; the table is dropped too.
        _corrections_ts, _corrections_cache = 0.0, None


def home_favorite_delta(p_home: float) -> float:
    """Return the additive correction for a de-vigged home-win probability.

    Lookup order: bands from the fitted artifact first, then the built-in
    static bands. A fitted band that contains ``p_home`` therefore OVERRIDES
    the static prior — even when its delta is an explicit 0 ("no bias"
    evidence). Where the artifact has no band covering ``p_home`` (e.g. a
    range that never passed the min-sample gate), the static prior still
    applies, so missing evidence does not erase proven knowledge.

    Returns 0.0 when no band in either source contains ``p_home``.
    """
    table = _load_corrections()
    fitted = table["ms_home"] if table and "ms_home" in table else ()
    # Bands are half-open [lo, hi); the first match in source order wins.
    for bands in (fitted, _HOME_FAV_BANDS):
        for lower, upper, correction in bands:
            if lower <= p_home < upper:
                return correction
    return 0.0


def away_favorite_delta(p_away: float) -> float:
    """Return the additive correction for a de-vigged away-win probability.

    Scoreboard measurement (2026-06): away favourites also win a few points
    MORE than the de-vigged price implies (+2.6..+4.2pt). Unlike the home
    side there is NO built-in fallback here: away corrections come only from
    the fitted artifact's ``ms_away`` bands (earned via
    scripts/fit_anchor_corrections.py passing its out-of-sample acceptance
    gate). No artifact, no ``ms_away`` entry, or no band containing
    ``p_away`` → 0.0, i.e. the prior behaviour.
    """
    table = _load_corrections()
    if not table:
        return 0.0
    # Bands are half-open [lo, hi); take the first one containing p_away.
    return next(
        (
            correction
            for lower, upper, correction in table.get("ms_away", ())
            if lower <= p_away < upper
        ),
        0.0,
    )


def apply_corrections(
    p1: float, px: float, p2: float
) -> Tuple[float, float, float]:
    """Apply favourite corrections to a 3-way (1, X, 2) vector.

    In practice only one side can be a favourite (both ≥0.45 would leave no
    room for the draw); if both bands somehow fire, the larger delta wins
    (home on a tie). The other two outcomes are rescaled proportionally so
    the vector still sums to 1.

    Only positive deltas are applied; when neither side has one the input is
    returned unchanged. The boosted probability is capped at 0.98, but the cap
    never LOWERS an input that is already above it — a positive correction
    must not reduce the favourite's probability.

    Returns the corrected ``(p1, px, p2)`` tuple.
    """
    d1 = home_favorite_delta(p1)
    d2 = away_favorite_delta(p2)
    if d1 <= 0.0 and d2 <= 0.0:
        return p1, px, p2
    if d1 >= d2:
        return apply_home_correction(p1, px, p2)
    # Away side wins; d2 > 0 is guaranteed here by the two checks above.
    p2n = max(p2, min(0.98, p2 + d2))
    remaining = 1.0 - p2n
    rest = p1 + px
    if rest <= 0.0:
        # Nothing to redistribute from; avoid dividing by zero.
        return p1, px, p2n
    return p1 / rest * remaining, px / rest * remaining, p2n


def apply_home_correction(
    p1: float, px: float, p2: float
) -> Tuple[float, float, float]:
    """Apply the home-favourite delta to a 3-way (1, X, 2) probability vector,
    rescaling draw/away proportionally so the three still sum to 1.0.

    A non-positive delta leaves the vector unchanged. The boosted home
    probability is capped at 0.98, but the cap never LOWERS an input that is
    already above it — a positive correction must not reduce the favourite's
    probability.

    Returns the corrected ``(p1, px, p2)`` tuple.
    """
    delta = home_favorite_delta(p1)
    if delta <= 0.0:
        return p1, px, p2
    p1n = max(p1, min(0.98, p1 + delta))
    remaining = 1.0 - p1n
    rest = px + p2
    if rest <= 0.0:
        # Nothing to redistribute from; avoid dividing by zero.
        return p1n, px, p2
    return p1n, px / rest * remaining, p2 / rest * remaining