# iddaai-be/ai-engine/scripts/run_backtest_and_calibrate.py

"""
V25 Backtest + Calibration Training Script
==========================================
Runs a full backtest on historical football matches, measures model accuracy
by market / confidence band / league, and trains isotonic calibration models
for MS, OU15, OU25, and BTTS markets.

Usage:
    venv/bin/python scripts/run_backtest_and_calibrate.py
"""

from __future__ import annotations

import json
import os
import pickle
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple, Any

import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extras import RealDictCursor

# ---------------------------------------------------------------------------
# Path setup — works whether executed from ai-engine/ or project root
# ---------------------------------------------------------------------------
# Directory containing this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Parent of the script directory; the project-local `data` and `models`
# packages imported below are resolved relative to it.
AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
# Put it first on sys.path so those imports work regardless of the CWD.
sys.path.insert(0, AI_ENGINE_DIR)

from data.db import get_clean_dsn
from models.calibration import Calibrator

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# JSON file listing the league IDs to backtest; lives one level above AI_ENGINE_DIR.
QUALIFIED_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")
# Output directories (created below if missing).
CALIBRATION_DIR = os.path.join(AI_ENGINE_DIR, "models", "calibration")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports")
MAX_MATCHES = 3000          # SQL LIMIT applied when fetching matches
PROGRESS_INTERVAL = 100     # print a progress line every N processed matches

# NOTE: these run at import time, so importing this module creates the directories.
os.makedirs(CALIBRATION_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Nested mapping: odds category name (as stored in odd_categories.name)
#   -> selection name (as stored in odd_selections.name)
#   -> internal odds feature key used in the feature vector.
# Categories/selections not listed here are ignored by fetch_odds_bulk.
ODDS_CATEGORY_MAP = {
    "Maç Sonucu": {
        "1": "odds_ms_h",
        "X": "odds_ms_d",
        "2": "odds_ms_a",
    },
    "1,5 Alt/Üst": {
        "Üst": "odds_ou15_o",
        "Alt": "odds_ou15_u",
    },
    "2,5 Alt/Üst": {
        "Üst": "odds_ou25_o",
        "Alt": "odds_ou25_u",
    },
    "3,5 Alt/Üst": {
        "Üst": "odds_ou35_o",
        "Alt": "odds_ou35_u",
    },
    "0,5 Alt/Üst": {
        "Üst": "odds_ou05_o",
        "Alt": "odds_ou05_u",
    },
    "Karşılıklı Gol": {
        "Var": "odds_btts_y",
        "Yok": "odds_btts_n",
    },
    "1. Yarı Sonucu": {
        "1": "odds_ht_ms_h",
        "X": "odds_ht_ms_d",
        "2": "odds_ht_ms_a",
    },
    "1. Yarı 0,5 Alt/Üst": {
        "Üst": "odds_ht_ou05_o",
        "Alt": "odds_ht_ou05_u",
    },
    "1. Yarı 1,5 Alt/Üst": {
        "Üst": "odds_ht_ou15_o",
        "Alt": "odds_ht_ou15_u",
    },
}

# League names that get their own row in the per-league accuracy breakdown.
# Matching is done on the exact league *name* string (see main()); every other
# league is reported under "Other".
# NOTE(review): name matching is not country-aware — confirm that names such as
# "Premier League" are unique among the qualified leagues.
TOP5_LEAGUE_NAMES = {
    "Premier League",
    "La Liga",
    "Bundesliga",
    "Serie A",
    "Ligue 1",
}

# ============================================================================
# STEP 1 — Load qualified league IDs
# ============================================================================

def load_qualified_leagues() -> List[str]:
    """
    Read the qualified-league JSON file and return its entries as strings.

    The file location comes from QUALIFIED_LEAGUES_PATH. A one-line summary
    with the number of entries is printed.
    """
    json_path = os.path.abspath(QUALIFIED_LEAGUES_PATH)
    with open(json_path, "r") as handle:
        raw_ids = json.load(handle)

    league_count = len(raw_ids)
    print(f"[Step 1] Loaded {league_count} qualified league IDs.")

    # Normalise every entry to str, whatever type the JSON stored it as.
    return list(map(str, raw_ids))


# ============================================================================
# STEP 1b — Fetch matches + pre-computed features in batch
# ============================================================================

def fetch_matches(conn, league_ids: List[str]) -> pd.DataFrame:
    """
    Fetch matches joined with their pre-computed features in one query.

    The query joins ``matches`` with ``football_ai_features`` (inner join),
    restricts to matches that have at least one football row in
    ``odd_categories``, and left-joins ``leagues`` for the league name.
    Only rows with status 'FT', non-NULL scores and a league in
    ``league_ids`` are returned, newest ``mst_utc`` first, capped at
    MAX_MATCHES.

    Args:
        conn: Open psycopg2 connection. It is not closed here.
        league_ids: League IDs passed to ``m.league_id = ANY(...)``.

    Returns:
        DataFrame with one row per match; an empty DataFrame (no columns)
        when the query returns nothing.
    """
    print("[Step 1b] Fetching matches with pre-computed features and odds ...")

    query = """
        SELECT
            m.id                AS match_id,
            m.league_id,
            l.name              AS league_name,
            m.score_home,
            m.score_away,
            m.mst_utc,
            -- From football_ai_features
            f.home_elo          AS home_overall_elo,
            f.away_elo          AS away_overall_elo,
            f.elo_diff,
            f.home_home_elo,
            f.away_away_elo,
            f.home_form_elo,
            f.away_form_elo,
            f.home_goals_avg_5  AS home_goals_avg,
            f.away_goals_avg_5  AS away_goals_avg,
            f.home_conceded_avg_5 AS home_conceded_avg,
            f.away_conceded_avg_5 AS away_conceded_avg,
            f.home_clean_sheet_rate,
            f.away_clean_sheet_rate,
            f.home_scoring_rate,
            f.away_scoring_rate,
            f.home_win_streak   AS home_winning_streak,
            f.away_win_streak   AS away_winning_streak,
            f.home_avg_possession,
            f.away_avg_possession,
            f.home_avg_shots_on_target,
            f.away_avg_shots_on_target,
            f.home_shot_conversion,
            f.away_shot_conversion,
            f.home_avg_corners,
            f.away_avg_corners,
            f.h2h_total         AS h2h_total_matches,
            f.h2h_home_win_rate,
            f.h2h_avg_goals,
            f.h2h_over25_rate,
            f.h2h_btts_rate,
            f.league_avg_goals,
            f.league_home_win_pct AS league_home_win_rate,
            f.league_over25_pct   AS league_ou25_rate,
            f.referee_avg_cards   AS referee_cards_total,
            f.referee_home_bias,
            f.referee_avg_goals,
            f.missing_players_impact AS home_missing_impact,
            f.implied_home,
            f.implied_draw,
            f.implied_away
        FROM matches m
        JOIN football_ai_features f ON f.match_id = m.id
        -- Only matches that have odds data
        JOIN (SELECT DISTINCT match_id FROM odd_categories WHERE sport = 'football') oc
            ON oc.match_id = m.id
        LEFT JOIN leagues l ON l.id = m.league_id
        WHERE m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
          AND m.league_id = ANY(%s)
        ORDER BY m.mst_utc DESC
        LIMIT %s
        """

    # The context manager closes the cursor even if execute()/fetchall() raises.
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (league_ids, MAX_MATCHES))
        rows = cur.fetchall()

    # RealDictCursor rows are dict-like; convert to plain dicts for pandas.
    df = pd.DataFrame([dict(r) for r in rows])
    print(f"[Step 1b] Fetched {len(df)} matches with features + odds coverage.")
    return df


# ============================================================================
# STEP 1c — Fetch all odds for the matched match IDs in one query
# ============================================================================

def fetch_odds_bulk(conn, match_ids: List[str]) -> Dict[str, Dict[str, float]]:
    """
    Load the odds for many matches with a single query.

    Only category/selection pairs listed in ODDS_CATEGORY_MAP are kept, and
    only odds values that are plain non-negative decimals strictly greater
    than 1.0. When several rows map to the same feature key for a match, the
    first row returned by the database wins.

    Args:
        conn: Open psycopg2 connection. It is not closed here.
        match_ids: Match IDs to look up.

    Returns:
        ``{match_id: {feature_key: odd_value, ...}}``. Matches without any
        usable odds are absent from the result.
    """
    print(f"[Step 1c] Fetching odds for {len(match_ids)} matches ...")

    # Raw string: the regex contains `\.`, which is an invalid escape sequence
    # in a normal string literal (SyntaxWarning on Python 3.12+).
    query = r"""
        SELECT oc.match_id, oc.name AS cat_name, os.name AS sel_name, os.odd_value
        FROM odd_categories oc
        JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
        WHERE oc.match_id = ANY(%s)
          AND oc.name = ANY(%s)
          AND oc.sport = 'football'
          AND os.odd_value IS NOT NULL
          AND os.odd_value ~ '^[0-9]+(\.[0-9]+)?$'
        """

    # The context manager closes the cursor even if execute()/fetchall() raises.
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, (match_ids, list(ODDS_CATEGORY_MAP)))
        rows = cur.fetchall()

    # Build nested dict: match_id -> {feature_key -> value}
    odds_map: Dict[str, Dict[str, float]] = defaultdict(dict)
    for r in rows:
        feat_key = ODDS_CATEGORY_MAP.get(r["cat_name"], {}).get(r["sel_name"])
        if feat_key is None:
            continue  # selection we do not model
        try:
            val = float(r["odd_value"])
        except (TypeError, ValueError):
            continue  # unparsable value despite the SQL regex filter
        if val > 1.0:
            # Keep the first value encountered.
            # NOTE(review): the query has no ORDER BY, so which duplicate row
            # comes first is up to the database — confirm whether a specific
            # ordering (e.g. most recent) is required.
            odds_map[r["match_id"]].setdefault(feat_key, val)

    print(f"[Step 1c] Odds loaded for {len(odds_map)} matches.")
    return dict(odds_map)


# ============================================================================
# STEP 2 — Build 114-feature vector per match
# ============================================================================

def load_feature_cols() -> List[str]:
    """Return the parsed contents of models/v25/feature_cols.json under AI_ENGINE_DIR."""
    cols_file = os.path.join(AI_ENGINE_DIR, "models", "v25", "feature_cols.json")
    with open(cols_file, "r") as handle:
        feature_cols = json.load(handle)
    return feature_cols


def _is_missing_value(val: Any) -> bool:
    """Return True for None and for any scalar NaN / NA / NaT marker."""
    if val is None:
        return True
    try:
        return bool(pd.isna(val))
    except (TypeError, ValueError):
        # Non-scalar input: pd.isna returns an array with no single truth value.
        return False


def build_feature_vector(
    match_row: pd.Series,
    odds: Dict[str, float],
    feature_cols: List[str],
) -> Dict[str, float]:
    """
    Construct the feature dict for one match.

    Every name in ``feature_cols`` starts at 0.0 and is then overwritten from,
    in order: the match row's pre-computed columns, the odds dict, implied
    probabilities derived from the 1X2 odds, calendar features derived from
    ``mst_utc``, and a few interaction terms.

    Args:
        match_row: One row of the DataFrame produced by fetch_matches.
        odds: ``{feature_key: odd_value}`` for this match (may be empty).
        feature_cols: Feature names the model expects.

    Returns:
        Dict of feature name -> float. Missing inputs leave the 0.0 default.
        NOTE: ``form_elo_diff``, the odds keys and ``implied_*`` are written
        even when they are not listed in ``feature_cols``, so the result can
        contain extra keys.
    """
    feat: Dict[str, float] = {col: 0.0 for col in feature_cols}

    # ---- Direct columns from match row (same name in row and feature set) ----
    direct_cols = (
        "home_overall_elo",
        "away_overall_elo",
        "elo_diff",
        "home_home_elo",
        "away_away_elo",
        "home_form_elo",
        "away_form_elo",
        "home_goals_avg",
        "away_goals_avg",
        "home_conceded_avg",
        "away_conceded_avg",
        "home_clean_sheet_rate",
        "away_clean_sheet_rate",
        "home_scoring_rate",
        "away_scoring_rate",
        "home_winning_streak",
        "away_winning_streak",
        "home_avg_possession",
        "away_avg_possession",
        "home_avg_shots_on_target",
        "away_avg_shots_on_target",
        "home_shot_conversion",
        "away_shot_conversion",
        "home_avg_corners",
        "away_avg_corners",
        "h2h_total_matches",
        "h2h_home_win_rate",
        "h2h_avg_goals",
        "h2h_over25_rate",
        "h2h_btts_rate",
        "league_avg_goals",
        "league_home_win_rate",
        "league_ou25_rate",
        "referee_cards_total",
        "referee_home_bias",
        "referee_avg_goals",
        "home_missing_impact",
        "implied_home",
        "implied_draw",
        "implied_away",
    )
    for col in direct_cols:
        if col in feat and col in match_row.index:
            val = match_row.get(col)
            # pd.isna-based check also catches NaN stored as numpy scalars,
            # Decimal NaN and pd.NA, not just Python-float NaN.
            if not _is_missing_value(val):
                feat[col] = float(val)

    # ---- Derived elo features ----
    # Only computed when both form ratings are non-zero (0.0 means "missing").
    if feat.get("home_form_elo", 0) and feat.get("away_form_elo", 0):
        feat["form_elo_diff"] = feat["home_form_elo"] - feat["away_form_elo"]

    # ---- Odds features from relational tables ----
    odds_features = [
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    ]
    for ok in odds_features:
        if ok in odds:
            feat[ok] = odds[ok]
            # Flip the matching "<key>_present" flag if the model has one.
            presence_key = f"{ok}_present"
            if presence_key in feat:
                feat[presence_key] = 1.0

    # When all three 1X2 odds are available, the implied probabilities are
    # recomputed from them (margin removed by normalising) and OVERWRITE any
    # implied_* values taken from the match row above.
    if feat.get("odds_ms_h", 0) > 1 and feat.get("odds_ms_d", 0) > 1 and feat.get("odds_ms_a", 0) > 1:
        raw_h = 1.0 / feat["odds_ms_h"]
        raw_d = 1.0 / feat["odds_ms_d"]
        raw_a = 1.0 / feat["odds_ms_a"]
        total = raw_h + raw_d + raw_a
        if total > 0:
            feat["implied_home"] = raw_h / total
            feat["implied_draw"] = raw_d / total
            feat["implied_away"] = raw_a / total

    # ---- Derived match metadata ----
    mst = match_row.get("mst_utc")
    if mst is not None:
        try:
            # mst_utc is treated as epoch milliseconds.
            dt = datetime.fromtimestamp(int(mst) / 1000, tz=timezone.utc)
        except (TypeError, ValueError, OverflowError, OSError):
            # Unparsable or out-of-range timestamp: leave calendar features at 0.0.
            dt = None
        if dt is not None:
            if "match_month" in feat:
                feat["match_month"] = float(dt.month)
            # Season markers: Aug-Oct = start, Apr-May = end.
            # NOTE(review): confirm these month sets match the ones used at
            # training time.
            if "is_season_start" in feat:
                feat["is_season_start"] = 1.0 if dt.month in (8, 9, 10) else 0.0
            if "is_season_end" in feat:
                feat["is_season_end"] = 1.0 if dt.month in (4, 5) else 0.0

    # ---- Interaction features ----
    if "attack_vs_defense_home" in feat:
        feat["attack_vs_defense_home"] = feat.get("home_goals_avg", 0) - feat.get("away_conceded_avg", 0)
    if "attack_vs_defense_away" in feat:
        feat["attack_vs_defense_away"] = feat.get("away_goals_avg", 0) - feat.get("home_conceded_avg", 0)
    if "form_momentum_interaction" in feat:
        # NOTE(review): *_momentum_score is never populated in this function,
        # so this term stays 0 here — confirm whether that is intended.
        feat["form_momentum_interaction"] = (
            feat.get("home_momentum_score", 0) * feat.get("home_goals_avg", 0)
            - feat.get("away_momentum_score", 0) * feat.get("away_goals_avg", 0)
        )
    if "elo_form_consistency" in feat:
        feat["elo_form_consistency"] = feat.get("elo_diff", 0) * feat.get("home_goals_avg", 0)

    return feat


# ============================================================================
# STEP 3 — Run V25 predictions
# ============================================================================

def load_predictor():
    """Obtain the V25 predictor via get_v25_predictor(), printing status lines around the call."""
    # Imported inside the function, so the ensemble module is only loaded when
    # this function is actually called.
    from models.v25_ensemble import get_v25_predictor

    print("[Step 3] Loading V25 predictor ...")
    predictor = get_v25_predictor()
    print("[Step 3] V25 predictor ready.")
    return predictor


# ============================================================================
# STEP 4 — Compute actual outcomes from scores
# ============================================================================

def compute_actuals(score_home: int, score_away: int) -> Dict[str, Any]:
    """
    Derive the realised outcome labels from a final score.

    Returns a dict with:
        ms_actual   -- "1" (home scored more), "X" (equal), "2" (away scored more)
        ou15_actual -- "Over" when total goals >= 2, else "Under"
        ou25_actual -- "Over" when total goals >= 3, else "Under"
        btts_actual -- "Yes" when both sides scored at least once, else "No"
    """
    if score_home > score_away:
        result = "1"
    elif score_home == score_away:
        result = "X"
    else:
        result = "2"

    goals = score_home + score_away
    both_scored = score_home > 0 and score_away > 0

    outcome: Dict[str, Any] = {}
    outcome["ms_actual"] = result
    outcome["ou15_actual"] = "Over" if goals >= 2 else "Under"
    outcome["ou25_actual"] = "Over" if goals >= 3 else "Under"
    outcome["btts_actual"] = "Yes" if both_scored else "No"
    return outcome


# ============================================================================
# STEP 5 — Accuracy helpers
# ============================================================================

def confidence_band(prob: float) -> str:
    """
    Map a probability to its reporting band label.

    Bands are half-open on the right: [0, 0.50) -> "<50%", [0.50, 0.65) ->
    "50-65%", [0.65, 0.75) -> "65-75%", everything else -> "75%+".
    """
    upper_bounds = (
        (0.50, "<50%"),
        (0.65, "50-65%"),
        (0.75, "65-75%"),
    )
    for upper, label in upper_bounds:
        if prob < upper:
            return label
    return "75%+"


def pick_from_ms(home_prob: float, draw_prob: float, away_prob: float) -> Tuple[str, float]:
    """
    Return the most likely 1X2 outcome and its probability.

    Labels are "1" (home), "X" (draw) and "2" (away). On ties the earlier
    label in that order wins.
    """
    candidates = (("1", home_prob), ("X", draw_prob), ("2", away_prob))
    # max() keeps the first maximal element, which gives the tie-break order above.
    label, prob = max(candidates, key=lambda pair: pair[1])
    return label, prob


def pick_from_binary(yes_prob: float, no_prob: float, yes_label: str, no_label: str) -> Tuple[str, float]:
    """
    Return the likelier side of a two-way market as (label, probability).

    The "yes" side wins ties.
    """
    yes_wins = yes_prob >= no_prob
    return (yes_label, yes_prob) if yes_wins else (no_label, no_prob)


# ============================================================================
# MAIN
# ============================================================================

def main():
    """
    Run the end-to-end backtest and calibration pipeline.

    Steps:
        1. Load league IDs, fetch matches and odds from the database.
        2. Build a feature dict per match and run the V25 predictor on it.
        3. Print accuracy by market / confidence band / league group / pick.
        4. Train one calibration model per market via Calibrator.
        5. Write the JSON report to REPORTS_DIR/backtest_results.json and
           print a summary table.

    Matches whose scores cannot be parsed or whose prediction raises are
    skipped and counted in ``matches_skipped``.
    """
    t_start = time.time()
    print("=" * 70)
    print("  V25 Backtest + Calibration Training")
    # Naive-UTC ISO string: same format utcnow().isoformat() produced, without
    # the deprecated utcnow() call.
    run_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    print(f"  Run at: {run_at} UTC")
    print("=" * 70)

    # ------------------------------------------------------------------
    # Step 1 — Load qualified leagues
    # ------------------------------------------------------------------
    league_ids = load_qualified_leagues()

    # ------------------------------------------------------------------
    # Step 1b — Fetch matches with features
    # ------------------------------------------------------------------
    conn = psycopg2.connect(get_clean_dsn())
    try:
        matches_df = fetch_matches(conn, league_ids)

        if matches_df.empty:
            print("[ERROR] No matches found. Check DB connection and league IDs.")
            return

        match_ids = matches_df["match_id"].tolist()

        # ------------------------------------------------------------------
        # Step 1c — Fetch odds in bulk
        # ------------------------------------------------------------------
        odds_map = fetch_odds_bulk(conn, match_ids)
    finally:
        # The DB is only needed for the two fetches above.
        conn.close()

    # ------------------------------------------------------------------
    # Step 2 — Build feature vectors
    # ------------------------------------------------------------------
    print(f"\n[Step 2] Building feature vectors for {len(matches_df)} matches ...")
    feature_cols = load_feature_cols()

    # ------------------------------------------------------------------
    # Step 3 — Load V25 predictor
    # ------------------------------------------------------------------
    predictor = load_predictor()

    # ------------------------------------------------------------------
    # Main loop — predict each match, collect results
    # ------------------------------------------------------------------
    print("\n[Loop] Running predictions ...")

    # Storage for calibration training: market -> [(predicted prob, 0/1 outcome)]
    calib_data: Dict[str, List[Tuple[float, int]]] = {
        "ms_home": [],   # (prob, 1 if home win)
        "ms_draw": [],
        "ms_away": [],
        "ou15": [],
        "ou25": [],
        "btts": [],
    }

    # Storage for accuracy reporting
    records = []

    skipped = 0
    processed = 0
    prediction_errors = 0
    max_reported_errors = 5  # cap on per-match error lines printed

    for _, row in matches_df.iterrows():
        match_id = row["match_id"]
        score_home = row.get("score_home")
        score_away = row.get("score_away")

        # Validate scores
        try:
            score_home = int(score_home)
            score_away = int(score_away)
        except (TypeError, ValueError):
            skipped += 1
            continue

        # Build features
        match_odds = odds_map.get(match_id, {})
        feat = build_feature_vector(row, match_odds, feature_cols)

        # Run predictions
        try:
            home_prob, draw_prob, away_prob = predictor.predict_ms(feat)
            over25_prob, under25_prob = predictor.predict_ou25(feat)
            btts_yes_prob, btts_no_prob = predictor.predict_btts(feat)

            # ou15 is loaded via predict_market (returns np.ndarray for binary)
            ou15_arr = predictor.predict_market("ou15", feat)
            if ou15_arr is not None and len(ou15_arr) > 0:
                over15_prob = float(ou15_arr[0])
                under15_prob = 1.0 - over15_prob
            else:
                # No ou15 output: fall back to an uninformative 50/50.
                over15_prob = 0.5
                under15_prob = 0.5

        except Exception as exc:
            # Best-effort: a failing match is skipped, but the first few
            # failures are reported so a broken model is not silently
            # reduced to "No records to analyze".
            skipped += 1
            prediction_errors += 1
            if prediction_errors <= max_reported_errors:
                print(f"  [WARN] Prediction failed for match {match_id}: {exc!r}")
            continue

        # Compute actuals
        actuals = compute_actuals(score_home, score_away)

        # MS picks
        ms_pick, ms_conf = pick_from_ms(home_prob, draw_prob, away_prob)
        ms_correct = int(ms_pick == actuals["ms_actual"])

        # OU15
        ou15_pick, ou15_conf = pick_from_binary(over15_prob, under15_prob, "Over", "Under")
        ou15_correct = int(ou15_pick == actuals["ou15_actual"])

        # OU25
        ou25_pick, ou25_conf = pick_from_binary(over25_prob, under25_prob, "Over", "Under")
        ou25_correct = int(ou25_pick == actuals["ou25_actual"])

        # BTTS
        btts_pick, btts_conf = pick_from_binary(btts_yes_prob, btts_no_prob, "Yes", "No")
        btts_correct = int(btts_pick == actuals["btts_actual"])

        # Collect calibration data
        calib_data["ms_home"].append((home_prob, int(actuals["ms_actual"] == "1")))
        calib_data["ms_draw"].append((draw_prob, int(actuals["ms_actual"] == "X")))
        calib_data["ms_away"].append((away_prob, int(actuals["ms_actual"] == "2")))
        calib_data["ou15"].append((over15_prob, int(actuals["ou15_actual"] == "Over")))
        calib_data["ou25"].append((over25_prob, int(actuals["ou25_actual"] == "Over")))
        calib_data["btts"].append((btts_yes_prob, int(actuals["btts_actual"] == "Yes")))

        # Determine league group (top-5 names keep their own group)
        league_name = str(row.get("league_name", "Other") or "Other")
        league_group = league_name if league_name in TOP5_LEAGUE_NAMES else "Other"

        records.append({
            "match_id": match_id,
            "league_name": league_name,
            "league_group": league_group,
            "score_home": score_home,
            "score_away": score_away,
            # MS
            "ms_pick": ms_pick,
            "ms_actual": actuals["ms_actual"],
            "ms_conf": ms_conf,
            "ms_conf_band": confidence_band(ms_conf),
            "ms_correct": ms_correct,
            "ms_home_prob": home_prob,
            "ms_draw_prob": draw_prob,
            "ms_away_prob": away_prob,
            # OU15
            "ou15_pick": ou15_pick,
            "ou15_actual": actuals["ou15_actual"],
            "ou15_conf": ou15_conf,
            "ou15_conf_band": confidence_band(ou15_conf),
            "ou15_correct": ou15_correct,
            "ou15_over_prob": over15_prob,
            # OU25
            "ou25_pick": ou25_pick,
            "ou25_actual": actuals["ou25_actual"],
            "ou25_conf": ou25_conf,
            "ou25_conf_band": confidence_band(ou25_conf),
            "ou25_correct": ou25_correct,
            "ou25_over_prob": over25_prob,
            # BTTS
            "btts_pick": btts_pick,
            "btts_actual": actuals["btts_actual"],
            "btts_conf": btts_conf,
            "btts_conf_band": confidence_band(btts_conf),
            "btts_correct": btts_correct,
            "btts_yes_prob": btts_yes_prob,
        })

        processed += 1
        if processed % PROGRESS_INTERVAL == 0:
            elapsed = time.time() - t_start
            print(f"  [Progress] {processed}/{len(matches_df)} matches | "
                  f"skipped={skipped} | elapsed={elapsed:.1f}s")

    if prediction_errors > max_reported_errors:
        print(f"  [WARN] {prediction_errors - max_reported_errors} further "
              f"prediction failures not shown.")

    print(f"\n[Loop] Done. Processed={processed}, Skipped={skipped}")

    if not records:
        print("[ERROR] No records to analyze. Exiting.")
        return

    results_df = pd.DataFrame(records)

    # ------------------------------------------------------------------
    # Step 5 — Accuracy report
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("  ACCURACY REPORT")
    print("=" * 70)

    # (label, correct-flag column, confidence column, band column, pick column)
    markets = [
        ("MS",   "ms_correct",   "ms_conf",   "ms_conf_band",   "ms_pick"),
        ("OU15", "ou15_correct", "ou15_conf", "ou15_conf_band", "ou15_pick"),
        ("OU25", "ou25_correct", "ou25_conf", "ou25_conf_band", "ou25_pick"),
        ("BTTS", "btts_correct", "btts_conf", "btts_conf_band", "btts_pick"),
    ]

    summary: Dict[str, Any] = {
        "generated_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "matches_processed": processed,
        "matches_skipped": skipped,
        "markets": {},
    }

    for market_label, correct_col, conf_col, band_col, pick_col in markets:
        print(f"\n--- {market_label} ---")
        sub = results_df[[correct_col, conf_col, band_col, pick_col, "league_group"]].copy()
        total = len(sub)
        overall_acc = sub[correct_col].mean() * 100
        print(f"  Overall accuracy: {overall_acc:.1f}% ({sub[correct_col].sum()}/{total})")

        market_summary = {
            "overall_accuracy": round(overall_acc, 2),
            "total_matches": total,
            "by_confidence_band": {},
            "by_league": {},
            "by_pick_direction": {},
        }

        # By confidence band
        print("  By confidence band:")
        bands = ["<50%", "50-65%", "65-75%", "75%+"]
        for band in bands:
            mask = sub[band_col] == band
            n = mask.sum()
            if n > 0:
                acc = sub.loc[mask, correct_col].mean() * 100
                mean_conf = sub.loc[mask, conf_col].mean() * 100
                print(f"    {band:8s}: {acc:5.1f}% acc | {n:4d} matches | "
                      f"mean_conf={mean_conf:.1f}%")
                market_summary["by_confidence_band"][band] = {
                    "accuracy": round(acc, 2),
                    "count": int(n),
                    "mean_confidence": round(mean_conf, 2),
                }

        # By league group
        print("  By league:")
        league_groups = list(results_df["league_group"].unique())
        # Sort: named leagues first, then Other
        named = sorted([g for g in league_groups if g != "Other"])
        ordered = named + (["Other"] if "Other" in league_groups else [])
        for lg in ordered:
            mask = sub["league_group"] == lg
            n = mask.sum()
            if n > 0:
                acc = sub.loc[mask, correct_col].mean() * 100
                print(f"    {lg[:20]:20s}: {acc:5.1f}% ({n} matches)")
                market_summary["by_league"][lg] = {
                    "accuracy": round(acc, 2),
                    "count": int(n),
                }

        # By pick direction
        print("  By pick direction:")
        for pick_val in sorted(sub[pick_col].unique()):
            mask = sub[pick_col] == pick_val
            n = mask.sum()
            if n > 0:
                acc = sub.loc[mask, correct_col].mean() * 100
                mean_conf = sub.loc[mask, conf_col].mean() * 100
                print(f"    {pick_val:8s}: {acc:5.1f}% acc | {n:4d} matches | "
                      f"mean_conf={mean_conf:.1f}%")
                market_summary["by_pick_direction"][pick_val] = {
                    "accuracy": round(acc, 2),
                    "count": int(n),
                    "mean_confidence": round(mean_conf, 2),
                }

        summary["markets"][market_label] = market_summary

    # ------------------------------------------------------------------
    # Step 6 — Train calibration models
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("  CALIBRATION TRAINING")
    print("=" * 70)

    calibrator = Calibrator()

    calibration_results: Dict[str, Dict] = {}

    # One calibration model per key of calib_data (insertion order).
    for market_key, pairs in calib_data.items():
        if len(pairs) < 100:
            print(f"[Calib] {market_key}: only {len(pairs)} samples — skipping.")
            continue

        probs = np.array([p for p, _ in pairs])
        actuals_bin = np.array([a for _, a in pairs])

        # Build a tiny DataFrame to use Calibrator.train_calibration
        calib_df = pd.DataFrame({
            "prob": probs,
            "actual": actuals_bin,
        })

        metrics = calibrator.train_calibration(
            df=calib_df,
            market=market_key,
            prob_col="prob",
            actual_col="actual",
            min_samples=100,
            save=True,
        )
        calibration_results[market_key] = metrics.to_dict()
        print(f"  [Calib] {market_key}: Brier={metrics.brier_score:.4f} | "
              f"ECE={metrics.calibration_error:.4f} | n={metrics.sample_count}")

    # ------------------------------------------------------------------
    # Step 7 — Save results
    # ------------------------------------------------------------------
    output_path = os.path.join(REPORTS_DIR, "backtest_results.json")
    full_report = {
        **summary,
        "calibration": calibration_results,
        "runtime_seconds": round(time.time() - t_start, 1),
    }

    # Explicit encoding so the report does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(full_report, f, indent=2)
    print(f"\n[Step 7] Report saved to {output_path}")

    # ------------------------------------------------------------------
    # Final summary table
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("  FINAL SUMMARY TABLE")
    print("=" * 70)
    print(f"{'Market':<8} {'Overall Acc':>12} {'Matches':>8} "
          f"{'Best Band (acc)':>18}")
    print("-" * 70)
    for market_label, _, _, _, _ in markets:
        ms = summary["markets"].get(market_label, {})
        overall = ms.get("overall_accuracy", 0)
        total_m = ms.get("total_matches", 0)
        bands_d = ms.get("by_confidence_band", {})
        # Find best accuracy band with >= 50 matches
        best_band = "-"
        best_acc = 0.0
        for band, bdata in bands_d.items():
            if bdata["count"] >= 50 and bdata["accuracy"] > best_acc:
                best_acc = bdata["accuracy"]
                best_band = f"{band} ({best_acc:.1f}%)"
        print(f"{market_label:<8} {overall:>11.1f}% {total_m:>8d} {best_band:>18s}")

    elapsed_total = time.time() - t_start
    print(f"\nTotal runtime: {elapsed_total:.1f}s")
    print("=" * 70)

if __name__ == "__main__":
    main()