""" Calibration Training Script =========================== Trains Isotonic Regression calibration models for all betting markets. This script: 1. Fetches historical match data with predictions and actual results 2. Trains Isotonic Regression models for each market 3. Calculates calibration metrics (Brier Score, ECE) 4. Saves models to ai-engine/models/calibration/ Usage: # Train on last 90 days of data python3 ai-engine/scripts/train_calibration.py # Train on specific date range python3 ai-engine/scripts/train_calibration.py --start 2026-01-01 --end 2026-02-15 # Train only specific markets python3 ai-engine/scripts/train_calibration.py --markets ou25 btts ms_home """ import os import sys import json import argparse import psycopg2 import pandas as pd import numpy as np from datetime import datetime, timedelta from dotenv import load_dotenv from typing import Dict, List, Tuple, Any, Optional # Setup path for ai-engine imports AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) from models.calibration import get_calibrator, SUPPORTED_MARKETS load_dotenv() # ============================================================================= # CONFIG # ============================================================================= TOP_LEAGUES_PATH = os.path.join( os.path.dirname(os.path.dirname(AI_ENGINE_DIR)), "top_leagues.json" ) # Default: last 90 days DEFAULT_START_DATE = (datetime.utcnow() - timedelta(days=90)).strftime("%Y-%m-%d") DEFAULT_END_DATE = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d") # ============================================================================= # DB CONNECTION # ============================================================================= def get_conn(): """Get PostgreSQL connection.""" db_url = os.getenv("DATABASE_URL") if not db_url: raise ValueError("DATABASE_URL not set") if "?schema=" in db_url: db_url = db_url.split("?schema=")[0] return psycopg2.connect(db_url) 
def load_top_league_ids() -> List[str]:
    """Load top league IDs from the JSON file at ``TOP_LEAGUES_PATH``.

    Returns:
        The parsed JSON value when the file holds a list, or the value under
        its ``"football"`` key (default ``[]``) when it holds a dict. An empty
        list, after printing a warning, when the file does not exist.
    """
    if not os.path.exists(TOP_LEAGUES_PATH):
        print(f"[Warning] top_leagues.json not found at {TOP_LEAGUES_PATH}")
        return []

    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Handle both list and dict formats
    if isinstance(data, dict):
        return data.get("football", [])
    return data


# =============================================================================
# DATA EXTRACTION
# =============================================================================

def fetch_training_data(
    cur,
    start_date: str,
    end_date: str,
    league_ids: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Fetch finished matches with their odds for calibration training.

    Args:
        cur: DB-API cursor used to run the query.
        start_date: First day of the window, ``YYYY-MM-DD`` (inclusive).
        end_date: Last day of the window, ``YYYY-MM-DD`` (inclusive).
        league_ids: Optional league IDs to restrict the query to. ``None`` or
            an empty list means no league filter is applied.

    Returns:
        DataFrame with one row per match and the columns selected by the
        query:
        - match_id, home_team_id, away_team_id
        - score_home, score_away, ht_score_home, ht_score_away
        - mst_utc
        - ms_h, ms_d, ms_a (match-result odds)
        - ou25_over, ou25_under, ou15_over, ou35_over
        - btts_yes, btts_no

    Raises:
        ValueError: If a date string does not match ``YYYY-MM-DD``.
    """
    # Function-scope import so this function does not depend on the
    # module-level import list.
    from datetime import timezone

    def _utc_day_start_ms(day: str) -> int:
        # Interpret the date as a UTC calendar day. A naive datetime would be
        # converted using the machine's local timezone, shifting the window
        # compared against mst_utc on any non-UTC host.
        parsed = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        return int(parsed.timestamp() * 1000)

    start_ms = _utc_day_start_ms(start_date)
    end_ms = _utc_day_start_ms(end_date) + 86400000  # +1 day (exclusive bound)

    # Build league filter. Only "%s" placeholders are interpolated into the
    # SQL text; the IDs themselves travel as bound parameters.
    league_filter = ""
    params = [start_ms, end_ms]
    if league_ids:
        placeholders = ",".join(["%s"] * len(league_ids))
        league_filter = f"AND m.league_id IN ({placeholders})"
        params.extend(league_ids)

    query = f"""
        SELECT
            m.id as match_id,
            m.home_team_id,
            m.away_team_id,
            m.score_home,
            m.score_away,
            m.ht_score_home,
            m.ht_score_away,
            m.mst_utc,
            -- Odds from odd_categories/selections
            MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '1' THEN os.odd_value END) as ms_h,
            MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = 'X' THEN os.odd_value END) as ms_d,
            MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '2' THEN os.odd_value END) as ms_a,
            MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou25_over,
            MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Alt' THEN os.odd_value END) as ou25_under,
            MAX(CASE WHEN oc.name = '1,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou15_over,
            MAX(CASE WHEN oc.name = '3,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou35_over,
            MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Var' THEN os.odd_value END) as btts_yes,
            MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Yok' THEN os.odd_value END) as btts_no
        FROM matches m
        LEFT JOIN odd_categories oc ON oc.match_id = m.id
        LEFT JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
        WHERE m.mst_utc >= %s
          AND m.mst_utc < %s
          AND m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
          {league_filter}
        GROUP BY m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away,
                 m.ht_score_home, m.ht_score_away, m.mst_utc
        ORDER BY m.mst_utc DESC
    """

    cur.execute(query, params)
    rows = cur.fetchall()

    columns = [desc[0] for desc in cur.description]
    df = pd.DataFrame(rows, columns=columns)

    print(f"[Data] Fetched {len(df)} matches from {start_date} to {end_date}")
    return df


def calculate_actual_outcomes(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate actual binary outcomes for each market.

    The columns are added to ``df`` in place and the same frame is returned.

    Adds columns:
    - total_goals, ht_total_goals (missing half-time scores count as 0 goals)
    - ms_home_actual: 1 if home won, 0 otherwise
    - ms_draw_actual: 1 if draw, 0 otherwise
    - ms_away_actual: 1 if away won, 0 otherwise
    - ou25_over_actual: 1 if total goals > 2.5, 0 otherwise
    - ou15_over_actual: 1 if total goals > 1.5, 0 otherwise
    - ou35_over_actual: 1 if total goals > 3.5, 0 otherwise
    - btts_yes_actual: 1 if both teams scored, 0 otherwise
    - ht_home_actual, ht_draw_actual, ht_away_actual: half-time result,
      NaN when either half-time score is missing
    """
    # Total goals
    df["total_goals"] = df["score_home"] + df["score_away"]
    df["ht_total_goals"] = df["ht_score_home"].fillna(0) + df["ht_score_away"].fillna(0)

    # Match result outcomes
    df["ms_home_actual"] = (df["score_home"] > df["score_away"]).astype(int)
    df["ms_draw_actual"] = (df["score_home"] == df["score_away"]).astype(int)
    df["ms_away_actual"] = (df["score_home"] < df["score_away"]).astype(int)

    # Over/Under outcomes
    df["ou25_over_actual"] = (df["total_goals"] > 2.5).astype(int)
    df["ou15_over_actual"] = (df["total_goals"] > 1.5).astype(int)
    df["ou35_over_actual"] = (df["total_goals"] > 3.5).astype(int)

    # BTTS outcome
    df["btts_yes_actual"] = ((df["score_home"] > 0) & (df["score_away"] > 0)).astype(int)

    # Half-Time result. Comparisons against NaN are all False, which would
    # label a match with unknown half-time score as "not home, not draw, not
    # away". Mark those rows NaN instead so they are not used as real labels.
    ht_known = df["ht_score_home"].notna() & df["ht_score_away"].notna()
    df["ht_home_actual"] = (df["ht_score_home"] > df["ht_score_away"]).astype(int).where(ht_known)
    df["ht_draw_actual"] = (df["ht_score_home"] == df["ht_score_away"]).astype(int).where(ht_known)
    df["ht_away_actual"] = (df["ht_score_home"] < df["ht_score_away"]).astype(int).where(ht_known)

    return df


def calculate_implied_probabilities(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate implied probabilities (``1 / odds``) from the odds columns.

    The columns are added to ``df`` in place and the same frame is returned.
    ``df`` must already contain ``ms_home_actual`` (see
    ``calculate_actual_outcomes``), which the bucket columns copy from.

    Adds columns:
    - ms_home_prob, ms_draw_prob, ms_away_prob
    - ou25_over_prob, ou15_over_prob, ou35_over_prob
    - btts_yes_prob
    - ms_h_num: home odds cast to a number (NaN when not parseable)
    - ms_home_<bucket>_prob / ms_home_<bucket>_actual for the buckets
      heavy_fav, fav, balanced and underdog; NaN outside the bucket
    """
    def safe_implied_prob(odd_str: str) -> float:
        """Convert odds string to implied probability (NaN if unusable)."""
        if odd_str is None or pd.isna(odd_str):
            return np.nan
        try:
            odd = float(odd_str)
        except (ValueError, TypeError):
            return np.nan
        # Odds of 1.0 or below are treated as invalid.
        if odd <= 1.0:
            return np.nan
        return 1.0 / odd

    # Probability column -> odds column it is derived from (order preserved).
    prob_sources = {
        # Match result implied probabilities
        "ms_home_prob": "ms_h",
        "ms_draw_prob": "ms_d",
        "ms_away_prob": "ms_a",
        # Over/Under implied probabilities
        "ou25_over_prob": "ou25_over",
        "ou15_over_prob": "ou15_over",
        "ou35_over_prob": "ou35_over",
        # BTTS implied probabilities
        "btts_yes_prob": "btts_yes",
    }
    for prob_col, odds_col in prob_sources.items():
        df[prob_col] = df[odds_col].apply(safe_implied_prob)

    # -----------------------------------------------------
    # CONTEXT-AWARE BUCKETS
    # Create separate probability and actual columns for odds buckets
    # ms_home odds: ms_h (note ms_h is the bookmaker odds for home win)
    # -----------------------------------------------------

    # Helper to safe-cast to float
    df['ms_h_num'] = pd.to_numeric(df['ms_h'], errors='coerce')
    home_odds = df['ms_h_num']

    # Rows whose odds are NaN match no bucket.
    buckets = [
        # Bucket 1: Heavy Fav (odds <= 1.40)
        ("heavy_fav", home_odds <= 1.40),
        # Bucket 2: Fav (1.40 < odds <= 1.80)
        ("fav", (home_odds > 1.40) & (home_odds <= 1.80)),
        # Bucket 3: Balanced (1.80 < odds <= 2.50)
        ("balanced", (home_odds > 1.80) & (home_odds <= 2.50)),
        # Bucket 4: Underdog (odds > 2.50)
        ("underdog", home_odds > 2.50),
    ]
    for bucket_name, mask in buckets:
        df.loc[mask, f'ms_home_{bucket_name}_prob'] = df.loc[mask, 'ms_home_prob']
        df.loc[mask, f'ms_home_{bucket_name}_actual'] = df.loc[mask, 'ms_home_actual']

    return df


# =============================================================================
# MODEL PREDICTIONS (Optional - if you want to calibrate model outputs)
# =============================================================================

def get_model_predictions(
    df: pd.DataFrame,
    cur,
) -> pd.DataFrame:
    """Get model predictions for each match.

    This is optional - if you want to calibrate model outputs rather than
    raw odds-implied probabilities.

    TODO: Implement if needed. For now this is a placeholder that returns
    ``df`` unchanged; ``cur`` is unused. Odds-implied probabilities serve as
    a proxy for model predictions.
    """
    # In a full implementation, you would:
    # 1. Load the V20 predictor
    # 2. Run predictions for each match
    # 3. Store raw model probabilities
    return df


# =============================================================================
# MAIN TRAINING
# =============================================================================

def train_calibration_models(
    df: pd.DataFrame,
    markets: Optional[List[str]] = None,
    min_samples: int = 100,
) -> Dict[str, Any]:
    """Train calibration models for specified markets.

    Args:
        df: DataFrame with probabilities and actual outcomes
        markets: List of markets to train (default: all supported)
        min_samples: Minimum samples required per market

    Returns:
        Whatever ``calibrator.train_all_markets`` returns for the selected
        markets (annotated as a dict of training results).
    """
    if markets is None:
        markets = SUPPORTED_MARKETS

    calibrator = get_calibrator()

    # Define market config: market -> (prob_col, actual_col)
    market_config = {
        "ms_home": ("ms_home_prob", "ms_home_actual"),
        "ms_home_heavy_fav": ("ms_home_heavy_fav_prob", "ms_home_heavy_fav_actual"),
        "ms_home_fav": ("ms_home_fav_prob", "ms_home_fav_actual"),
        "ms_home_balanced": ("ms_home_balanced_prob", "ms_home_balanced_actual"),
        "ms_home_underdog": ("ms_home_underdog_prob", "ms_home_underdog_actual"),
        "ms_draw": ("ms_draw_prob", "ms_draw_actual"),
        "ms_away": ("ms_away_prob", "ms_away_actual"),
        "ou15": ("ou15_over_prob", "ou15_over_actual"),
        "ou25": ("ou25_over_prob", "ou25_over_actual"),
        "ou35": ("ou35_over_prob", "ou35_over_actual"),
        "btts": ("btts_yes_prob", "btts_yes_actual"),
        # Note: need to add ht probs.
        # NOTE(review): no ht_*_prob column is created anywhere in this file;
        # how the calibrator treats a missing column is not visible here.
        "ht_home": ("ht_home_prob", "ht_home_actual"),
        "ht_draw": ("ht_draw_prob", "ht_draw_actual"),
        "ht_away": ("ht_away_prob", "ht_away_actual"),
    }

    # Filter to requested markets
    market_config = {k: v for k, v in market_config.items() if k in markets}

    # Train all markets
    results = calibrator.train_all_markets(
        df=df,
        market_config=market_config,
        min_samples=min_samples,
    )

    return results


def print_calibration_report(results: Dict[str, Any], min_samples: int = 100) -> None:
    """Print a formatted calibration report.

    Args:
        results: Mapping of market name to a metrics object exposing
            ``brier_score``, ``calibration_error`` and ``sample_count``.
        min_samples: Sample count at or above which a market is reported as
            trained. Defaults to 100; pass the value used for training so the
            status column reflects the threshold actually applied.
    """
    print("\n" + "=" * 70)
    print("CALIBRATION TRAINING REPORT")
    print("=" * 70)

    print(f"\n{'Market':<15} {'Brier':<10} {'ECE':<10} {'Samples':<10} {'Status'}")
    print("-" * 60)

    for market, metrics in results.items():
        status = "✓ Trained" if metrics.sample_count >= min_samples else "⚠ Insufficient"
        print(f"{market:<15} {metrics.brier_score:<10.4f} {metrics.calibration_error:<10.4f} "
              f"{metrics.sample_count:<10} {status}")

    print("\n" + "=" * 70)
    print("Interpretation:")
    print(" - Brier Score: Lower is better (0 = perfect, 0.25 = random)")
    print(" - ECE (Expected Calibration Error): Lower is better (0 = perfect)")
    print(" - Models saved to: ai-engine/models/calibration/")
    print("=" * 70)


# =============================================================================
# CLI
# =============================================================================

def main():
    """Parse CLI arguments, fetch data, train the models and print the report."""
    parser = argparse.ArgumentParser(description="Train calibration models")
    parser.add_argument("--start", type=str, default=DEFAULT_START_DATE,
                        help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end", type=str, default=DEFAULT_END_DATE,
                        help="End date (YYYY-MM-DD)")
    parser.add_argument("--markets", nargs="+", default=None,
                        help="Markets to train (default: all)")
    parser.add_argument("--min-samples", type=int, default=100,
                        help="Minimum samples per market")
    parser.add_argument("--top-leagues-only", action="store_true",
                        help="Only use top leagues data")

    args = parser.parse_args()

    print(f"\n[Calibration Training] {args.start} to {args.end}")

    # Load top leagues if requested
    league_ids = None
    if args.top_leagues_only:
        league_ids = load_top_league_ids()
        print(f"[Data] Filtering to {len(league_ids)} top leagues")
        if not league_ids:
            # fetch_training_data applies no league filter for an empty list,
            # so continuing would silently train on every league.
            print("[Error] --top-leagues-only was set but no league IDs were loaded")
            return

    # Fetch data. The connection is closed even if creating the cursor fails.
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            df = fetch_training_data(cur, args.start, args.end, league_ids)

            if df.empty:
                print("[Error] No data found for the specified date range")
                return

            # Calculate outcomes and probabilities (outcomes first: the
            # probability step copies from ms_home_actual).
            df = calculate_actual_outcomes(df)
            df = calculate_implied_probabilities(df)

            # Train models
            results = train_calibration_models(
                df=df,
                markets=args.markets,
                min_samples=args.min_samples,
            )

            # Print report using the same threshold that training used
            print_calibration_report(results, min_samples=args.min_samples)
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == "__main__":
    main()