""" Calibration Module for XGBoost Models ===================================== Calibrates raw probabilities from XGBoost models using Isotonic Regression. Ensures that a predicted probability of 70% actually corresponds to a 70% win rate. Usage: from ai_engine.models.calibration import Calibrator calibrator = Calibrator() calibrated_prob = calibrator.calibrate("ms", raw_prob) # Training new calibration models: calibrator.train_calibration(valid_df, market="ms") """ import os import pickle import json import numpy as np import pandas as pd from datetime import datetime from typing import Dict, List, Optional, Tuple, Any from sklearn.isotonic import IsotonicRegression from sklearn.calibration import calibration_curve from sklearn.metrics import brier_score_loss AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) CALIBRATION_DIR = os.path.join(AI_ENGINE_DIR, "models", "calibration") os.makedirs(CALIBRATION_DIR, exist_ok=True) # Supported markets for calibration SUPPORTED_MARKETS = [ "ms", # Match Result (1X2) - multi-class, calibrated per class "ms_home", # Standard Home win probability "ms_home_heavy_fav", # Context: home odds <= 1.40 "ms_home_fav", # Context: 1.40 < home odds <= 1.80 "ms_home_balanced", # Context: 1.80 < home odds <= 2.50 "ms_home_underdog", # Context: home odds > 2.50 "ms_draw", # Draw probability "ms_away", # Away win probability "ou15", # Over/Under 1.5 "ou25", # Over/Under 2.5 "ou35", # Over/Under 3.5 "btts", # Both Teams to Score "ht_ft", # Half-Time/Full-Time "dc", # Double Chance "ht", # Half-Time Result "ht_home", # Half-Time Home win "ht_draw", # Half-Time Draw "ht_away", # Half-Time Away win ] class CalibrationMetrics: """Stores calibration quality metrics for a market.""" def __init__(self): self.brier_score: float = 0.0 self.calibration_error: float = 0.0 self.sample_count: int = 0 self.last_trained: str = "" self.mean_predicted: float = 0.0 self.mean_actual: float = 0.0 def to_dict(self) -> Dict: return { "brier_score": round(self.brier_score, 4), "calibration_error": round(self.calibration_error, 4), "sample_count": self.sample_count, "last_trained": self.last_trained, "mean_predicted": round(self.mean_predicted, 4), "mean_actual": round(self.mean_actual, 4), } class Calibrator: """ Probability calibration using Isotonic Regression. Isotonic Regression is a non-parametric method that fits a piecewise constant function that is monotonically increasing. It's ideal for calibrating probabilities because: 1. It preserves ranking (if P(A) > P(B) before, P(A) > P(B) after) 2. It doesn't assume a specific distribution shape 3. It can correct systematic over/under-confidence Example: # Before calibration: model predicts 70% but actual win rate is 60% # After calibration: model predicts 70% → calibrated to 60% """ def __init__(self): self.calibrators: Dict[str, IsotonicRegression] = {} self.metrics: Dict[str, CalibrationMetrics] = {} # Less aggressive shrinkage — only meaningful overconfident bands are pulled. # Default raised from ~0.85-0.90 to 0.95+ since the orchestrator and config # already apply market-level multipliers; double-shrinkage was the root cause # of 24-35pt avg calibrated-vs-raw drops in production traces. self.heuristic_fallback: Dict[str, float] = { "ms": 0.96, "ms_home": 0.96, "ms_home_heavy_fav": 0.98, "ms_home_fav": 0.96, "ms_home_balanced": 0.94, "ms_home_underdog": 0.92, "ms_draw": 0.94, "ms_away": 0.96, "ou15": 0.96, "ou25": 0.96, "ou35": 0.94, "btts": 0.96, "ht_ft": 0.92, "dc": 0.97, "ht": 0.92, "ht_home": 0.92, "ht_draw": 0.92, "ht_away": 0.92, } self._load_calibrators() def _load_calibrators(self): """Load trained calibrators for each market from disk.""" for market in SUPPORTED_MARKETS: model_path = os.path.join(CALIBRATION_DIR, f"{market}_calibrator.pkl") metrics_path = os.path.join(CALIBRATION_DIR, f"{market}_metrics.json") if os.path.exists(model_path): try: with open(model_path, "rb") as f: self.calibrators[market] = pickle.load(f) print(f"[Calibrator] Loaded calibration model for {market}") except Exception as e: print(f"[Calibrator] Warning: Failed to load {market}: {e}") if os.path.exists(metrics_path): try: with open(metrics_path, "r") as f: data = json.load(f) metrics = CalibrationMetrics() metrics.brier_score = data.get("brier_score", 0.0) metrics.calibration_error = data.get("calibration_error", 0.0) metrics.sample_count = data.get("sample_count", 0) metrics.last_trained = data.get("last_trained", "") metrics.mean_predicted = data.get("mean_predicted", 0.0) metrics.mean_actual = data.get("mean_actual", 0.0) self.metrics[market] = metrics except Exception as e: print(f"[Calibrator] Warning: Failed to load metrics for {market}: {e}") # Below this sample count, the isotonic model is treated as untrained # (raw_prob is returned). Between MIN and FLOOR we ramp from 0 to ~15% # trust. Between FLOOR and CEILING we ramp to full trust. # Rationale: 12-sample calibrators are statistical noise; even 30% # blending on them propagates the noise into the confidence value the # betting_brain reads downstream. HARD_MIN_SAMPLES = 50 TRUSTED_SAMPLE_FLOOR = 100 TRUSTED_SAMPLE_CEILING = 400 # Hard cap on how far calibration can move probability in either direction. MAX_DELTA = 0.20 def calibrate(self, market_type: str, raw_prob: float, odds_val: Optional[float] = None) -> float: """ Calibrate a raw probability using Isotonic Regression with safeguards. Args: market_type (str): 'ms_home', 'ou25', 'btts', 'ht_ft', etc. raw_prob (float): The raw probability from XGBoost (0.0 - 1.0) odds_val (float, optional): The pre-match odds, used for context-aware bucket mapping Returns: float: Calibrated probability (0.0 - 1.0) Safeguards: * Low-sample trained models are blended with raw_prob to dampen overfit. * MAX_DELTA caps the per-call adjustment (prevents 40pp swings). """ # Normalize market type market_key = market_type.lower().replace("-", "_") # Route to bucket if ms_home and odds provided if market_key == "ms_home" and odds_val is not None and odds_val > 1.0: if odds_val <= 1.40: bucket_key = "ms_home_heavy_fav" elif odds_val <= 1.80: bucket_key = "ms_home_fav" elif odds_val <= 2.50: bucket_key = "ms_home_balanced" else: bucket_key = "ms_home_underdog" if bucket_key in self.calibrators: market_key = bucket_key # If we have a trained Isotonic Regression model, use it (with safeguards) if market_key in self.calibrators: try: iso_pred = float(self.calibrators[market_key].predict([raw_prob])[0]) # Sample-count weighted blend with raw probability. # Sparse models barely move probability; mature models dominate. metrics = self.metrics.get(market_key) n_samples = metrics.sample_count if metrics else 0 if n_samples < self.HARD_MIN_SAMPLES: # Below 50 samples isotonic fit is unreliable — bypass it # entirely and return raw_prob. The heuristic shrinkage # below would still apply a model-version multiplier elsewhere. return float(np.clip(raw_prob, 0.01, 0.99)) if n_samples >= self.TRUSTED_SAMPLE_CEILING: iso_weight = 1.0 elif n_samples <= self.TRUSTED_SAMPLE_FLOOR: # Linear ramp from 0% at HARD_MIN_SAMPLES to ~25% at FLOOR span = self.TRUSTED_SAMPLE_FLOOR - self.HARD_MIN_SAMPLES iso_weight = 0.25 * (n_samples - self.HARD_MIN_SAMPLES) / span else: # Linearly ramp 25% → 100% between floor and ceiling span = self.TRUSTED_SAMPLE_CEILING - self.TRUSTED_SAMPLE_FLOOR iso_weight = 0.25 + 0.75 * (n_samples - self.TRUSTED_SAMPLE_FLOOR) / span blended = iso_weight * iso_pred + (1.0 - iso_weight) * raw_prob # Cap delta to avoid huge swings on noisy calibrators delta = blended - raw_prob if delta > self.MAX_DELTA: blended = raw_prob + self.MAX_DELTA elif delta < -self.MAX_DELTA: blended = raw_prob - self.MAX_DELTA return float(np.clip(blended, 0.01, 0.99)) except Exception as e: print(f"[Calibrator] Warning: Isotonic failed for {market_key}: {e}") # Fall through to heuristic # Fallback to heuristic calibration return self._heuristic_calibrate(market_key, raw_prob) def _heuristic_calibrate(self, market_type: str, raw_prob: float) -> float: """ Heuristic calibration fallback when no trained model exists. This applies a conservative shrinkage towards the mean: - Binary markets (OU, BTTS): shrink towards 0.5 - Multi-class (MS): shrink towards 0.33 - HT/FT: stronger shrinkage due to higher variance """ # Get shrinkage factor for this market shrinkage = self.heuristic_fallback.get(market_type, 0.90) if market_type in ["ms", "ms_home", "ms_home_heavy_fav", "ms_home_fav", "ms_home_balanced", "ms_home_underdog", "ms_draw", "ms_away"]: # Pull towards 0.33 (uniform for 3-class) return (raw_prob * shrinkage) + (0.33 * (1.0 - shrinkage)) elif market_type in ["ou15", "ou25", "ou35", "btts"]: # Pull towards 0.5 (uniform for binary) return (raw_prob * shrinkage) + (0.5 * (1.0 - shrinkage)) elif market_type in ["ht_ft", "ht"]: # Stronger shrinkage for high-variance markets return raw_prob * shrinkage elif market_type == "dc": # Double chance is more reliable return (raw_prob * shrinkage) + (0.66 * (1.0 - shrinkage)) return raw_prob def train_calibration( self, df: pd.DataFrame, market: str, prob_col: str, actual_col: str, min_samples: int = 100, save: bool = True, ) -> CalibrationMetrics: """ Train an Isotonic Regression calibration model for a specific market. Args: df: DataFrame with predictions and actual outcomes market: Market identifier (e.g., 'ms_home', 'ou25', 'btts') prob_col: Column name for raw probabilities actual_col: Column name for actual outcomes (0 or 1) min_samples: Minimum samples required to train save: Whether to save the model to disk Returns: CalibrationMetrics with quality metrics """ # Filter valid data valid_df = df[[prob_col, actual_col]].dropna() n_samples = len(valid_df) if n_samples < min_samples: print(f"[Calibrator] Warning: Only {n_samples} samples for {market}, " f"need at least {min_samples}") metrics = CalibrationMetrics() metrics.sample_count = n_samples return metrics # Extract arrays raw_probs = valid_df[prob_col].values actuals = valid_df[actual_col].values # Train Isotonic Regression iso = IsotonicRegression(out_of_bounds="clip", increasing=True) iso.fit(raw_probs, actuals) # Calculate calibrated probabilities calibrated_probs = iso.predict(raw_probs) # Calculate metrics metrics = CalibrationMetrics() metrics.sample_count = n_samples metrics.last_trained = datetime.utcnow().isoformat() metrics.brier_score = brier_score_loss(actuals, calibrated_probs) metrics.mean_predicted = np.mean(raw_probs) metrics.mean_actual = np.mean(actuals) # Calculate Expected Calibration Error (ECE) metrics.calibration_error = self._calculate_ece( calibrated_probs, actuals, n_bins=10 ) # Store in memory self.calibrators[market] = iso self.metrics[market] = metrics # Save to disk if save: self._save_calibration(market, iso, metrics) print(f"[Calibrator] Trained {market}: " f"Brier={metrics.brier_score:.4f}, " f"ECE={metrics.calibration_error:.4f}, " f"n={n_samples}") return metrics def train_all_markets( self, df: pd.DataFrame, market_config: Dict[str, Tuple[str, str]], min_samples: int = 100, ) -> Dict[str, CalibrationMetrics]: """ Train calibration models for multiple markets at once. Args: df: DataFrame with all predictions and outcomes market_config: Dict mapping market -> (prob_col, actual_col) e.g., {'ou25': ('ou25_over_prob', 'ou25_over_actual')} min_samples: Minimum samples per market Returns: Dict of market -> CalibrationMetrics """ results = {} for market, (prob_col, actual_col) in market_config.items(): print(f"\n[Calibrator] Training {market}...") try: metrics = self.train_calibration( df=df, market=market, prob_col=prob_col, actual_col=actual_col, min_samples=min_samples, save=True, ) results[market] = metrics except Exception as e: print(f"[Calibrator] Failed to train {market}: {e}") return results def _calculate_ece( self, probs: np.ndarray, actuals: np.ndarray, n_bins: int = 10 ) -> float: """ Calculate Expected Calibration Error (ECE). ECE = sum(|bin_accuracy - bin_confidence| * bin_weight) Lower is better. Perfect calibration = 0. """ bin_boundaries = np.linspace(0, 1, n_bins + 1) ece = 0.0 for i in range(n_bins): in_bin = (probs >= bin_boundaries[i]) & (probs < bin_boundaries[i + 1]) prop_in_bin = np.mean(in_bin) if prop_in_bin > 0: accuracy_in_bin = np.mean(actuals[in_bin]) avg_confidence_in_bin = np.mean(probs[in_bin]) ece += np.abs(accuracy_in_bin - avg_confidence_in_bin) * prop_in_bin return ece def _save_calibration( self, market: str, calibrator: IsotonicRegression, metrics: CalibrationMetrics ): """Save calibration model and metrics to disk.""" # Save model model_path = os.path.join(CALIBRATION_DIR, f"{market}_calibrator.pkl") with open(model_path, "wb") as f: pickle.dump(calibrator, f) # Save metrics metrics_path = os.path.join(CALIBRATION_DIR, f"{market}_metrics.json") with open(metrics_path, "w") as f: json.dump(metrics.to_dict(), f, indent=2) print(f"[Calibrator] Saved {market} to {CALIBRATION_DIR}") def get_calibration_report(self) -> Dict[str, Any]: """Generate a summary report of all calibration models.""" report = { "trained_markets": list(self.calibrators.keys()), "metrics": {}, "heuristic_only": [], } for market in SUPPORTED_MARKETS: if market in self.metrics: report["metrics"][market] = self.metrics[market].to_dict() elif market not in self.calibrators: report["heuristic_only"].append(market) return report def get_calibrated_probabilities( self, market: str, raw_probs: np.ndarray ) -> np.ndarray: """ Batch calibration for array of probabilities. Args: market: Market type raw_probs: Array of raw probabilities Returns: Array of calibrated probabilities """ return np.array([self.calibrate(market, p) for p in raw_probs]) # Singleton instance _calibrator_instance: Optional[Calibrator] = None def get_calibrator() -> Calibrator: """Get or create the global Calibrator instance.""" global _calibrator_instance if _calibrator_instance is None: _calibrator_instance = Calibrator() return _calibrator_instance