""" V25 Ensemble Predictor - NO TARGET LEAKAGE =========================================== Multi-model ensemble for match prediction using XGBoost and LightGBM. Features: - 73 engineered features (NO target leakage) - Market-specific models (MS, OU25, BTTS) - Weighted ensemble predictions - Value bet detection """ import os import json import numpy as np import pandas as pd from typing import Dict, List, Optional, Any from dataclasses import dataclass, field import xgboost as xgb import lightgbm as lgb import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) try: from config.config_loader import get_config as _get_cfg except ImportError: _get_cfg = None # type: ignore[assignment] # CatBoost is optional try: from catboost import CatBoostClassifier CATBOOST_AVAILABLE = True except ImportError: CatBoostClassifier = None CATBOOST_AVAILABLE = False # Paths MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'v25') @dataclass class MarketPrediction: """Prediction for a single betting market.""" market_type: str pick: str probability: float confidence: float odds: float = 0.0 is_value_bet: bool = False edge: float = 0.0 def to_dict(self) -> dict: return { 'market_type': self.market_type, 'pick': self.pick, 'probability': round(self.probability * 100, 1), 'confidence': round(self.confidence, 1), 'odds': self.odds, 'is_value_bet': self.is_value_bet, 'edge': round(self.edge * 100, 1), } @dataclass class ValueBet: """Detected value bet opportunity.""" market_type: str pick: str probability: float odds: float edge: float confidence: float def to_dict(self) -> dict: return { 'market_type': self.market_type, 'pick': self.pick, 'probability': round(self.probability * 100, 1), 'odds': self.odds, 'edge': round(self.edge * 100, 1), 'confidence': round(self.confidence, 1), } @dataclass class MatchPrediction: """Complete match prediction with all markets.""" match_id: str home_team: str away_team: str # MS predictions home_prob: float = 0.0 draw_prob: float = 0.0 away_prob: float = 0.0 ms_pick: str = '' ms_confidence: float = 0.0 # OU25 predictions over_prob: float = 0.0 under_prob: float = 0.0 ou25_pick: str = '' ou25_confidence: float = 0.0 # BTTS predictions btts_yes_prob: float = 0.0 btts_no_prob: float = 0.0 btts_pick: str = '' btts_confidence: float = 0.0 # Value bets value_bets: List[ValueBet] = field(default_factory=list) def to_dict(self) -> dict: return { 'match_id': self.match_id, 'home_team': self.home_team, 'away_team': self.away_team, 'ms': { 'home_prob': round(self.home_prob * 100, 1), 'draw_prob': round(self.draw_prob * 100, 1), 'away_prob': round(self.away_prob * 100, 1), 'pick': self.ms_pick, 'confidence': round(self.ms_confidence, 1), }, 'ou25': { 'over_prob': round(self.over_prob * 100, 1), 'under_prob': round(self.under_prob * 100, 1), 'pick': self.ou25_pick, 'confidence': round(self.ou25_confidence, 1), }, 'btts': { 'yes_prob': round(self.btts_yes_prob * 100, 1), 'no_prob': round(self.btts_no_prob * 100, 1), 'pick': self.btts_pick, 'confidence': round(self.btts_confidence, 1), }, 'value_bets': [vb.to_dict() for vb in self.value_bets], } class V25Predictor: """ V25 Ensemble Predictor - NO TARGET LEAKAGE Uses market-specific XGBoost and LightGBM models. Each market (MS, OU25, BTTS) has its own trained models. """ # Feature columns — loaded dynamically from feature_cols.json to stay # in sync with the trained models. The hardcoded list below is only a # fallback in case the JSON file is missing. _FALLBACK_FEATURE_COLS = [ # ELO Features (8) 'home_overall_elo', 'away_overall_elo', 'elo_diff', 'home_home_elo', 'away_away_elo', 'home_form_elo', 'away_form_elo', 'form_elo_diff', # Form Features (12) 'home_goals_avg', 'home_conceded_avg', 'away_goals_avg', 'away_conceded_avg', 'home_clean_sheet_rate', 'away_clean_sheet_rate', 'home_scoring_rate', 'away_scoring_rate', 'home_winning_streak', 'away_winning_streak', 'home_unbeaten_streak', 'away_unbeaten_streak', # H2H Features (6) 'h2h_total_matches', 'h2h_home_win_rate', 'h2h_draw_rate', 'h2h_avg_goals', 'h2h_btts_rate', 'h2h_over25_rate', # Team Stats Features (8) 'home_avg_possession', 'away_avg_possession', 'home_avg_shots_on_target', 'away_avg_shots_on_target', 'home_shot_conversion', 'away_shot_conversion', 'home_avg_corners', 'away_avg_corners', # Odds Features (24) 'odds_ms_h', 'odds_ms_d', 'odds_ms_a', 'implied_home', 'implied_draw', 'implied_away', 'odds_ht_ms_h', 'odds_ht_ms_d', 'odds_ht_ms_a', 'odds_ou05_o', 'odds_ou05_u', 'odds_ou15_o', 'odds_ou15_u', 'odds_ou25_o', 'odds_ou25_u', 'odds_ou35_o', 'odds_ou35_u', 'odds_ht_ou05_o', 'odds_ht_ou05_u', 'odds_ht_ou15_o', 'odds_ht_ou15_u', 'odds_btts_y', 'odds_btts_n', # Odds Presence Flags (20) 'odds_ms_h_present', 'odds_ms_d_present', 'odds_ms_a_present', 'odds_ht_ms_h_present', 'odds_ht_ms_d_present', 'odds_ht_ms_a_present', 'odds_ou05_o_present', 'odds_ou05_u_present', 'odds_ou15_o_present', 'odds_ou15_u_present', 'odds_ou25_o_present', 'odds_ou25_u_present', 'odds_ou35_o_present', 'odds_ou35_u_present', 'odds_ht_ou05_o_present', 'odds_ht_ou05_u_present', 'odds_ht_ou15_o_present', 'odds_ht_ou15_u_present', 'odds_btts_y_present', 'odds_btts_n_present', # League Features (4) 'home_xga', 'away_xga', 'league_avg_goals', 'league_zero_goal_rate', # Upset Engine (4) 'upset_atmosphere', 'upset_motivation', 'upset_fatigue', 'upset_potential', # Referee Engine (5) 'referee_home_bias', 'referee_avg_goals', 'referee_cards_total', 'referee_avg_yellow', 'referee_experience', # Momentum Engine (3) 'home_momentum_score', 'away_momentum_score', 'momentum_diff', # Squad Features (9) 'home_squad_quality', 'away_squad_quality', 'squad_diff', 'home_key_players', 'away_key_players', 'home_missing_impact', 'away_missing_impact', 'home_goals_form', 'away_goals_form', ] @staticmethod def _load_feature_cols() -> list: """Load feature columns from feature_cols.json, falling back to hardcoded list.""" feature_json = os.path.join(MODELS_DIR, 'feature_cols.json') try: if os.path.exists(feature_json): with open(feature_json, 'r', encoding='utf-8') as f: cols = json.load(f) if isinstance(cols, list) and len(cols) > 0: print(f"[V25] Loaded {len(cols)} feature columns from feature_cols.json") return cols except Exception as e: print(f"[V25] Warning: could not load feature_cols.json: {e}") print(f"[V25] Using fallback feature columns ({len(V25Predictor._FALLBACK_FEATURE_COLS)} features)") return V25Predictor._FALLBACK_FEATURE_COLS # Model weights for ensemble (overridden from config in __init__) DEFAULT_WEIGHTS = { 'xgb': 0.50, 'lgb': 0.50, } def __init__(self, models_dir: Optional[str] = None): """ Initialize V25 Predictor. Args: models_dir: Directory containing model files. Defaults to v25/ directory. """ self.models_dir = models_dir or MODELS_DIR self.models = {} # market -> {'xgb': model, 'lgb': model} self._loaded = False self.FEATURE_COLS = self._load_feature_cols() # Load weights from config (falls back to class default 0.50/0.50) if _get_cfg is not None: try: cfg = _get_cfg() self.DEFAULT_WEIGHTS = { 'xgb': float(cfg.get('model_ensemble.xgb_weight', 0.50)), 'lgb': float(cfg.get('model_ensemble.lgb_weight', 0.50)), } except Exception: pass # keep class-level defaults # All trained market models available in V25 ALL_MARKETS = [ 'ms', 'ou25', 'btts', # Core markets 'ou15', 'ou35', # Additional OU lines 'ht_result', 'ht_ou05', 'ht_ou15', # HT markets 'htft', # HT/FT combo 'cards_ou45', # Cards market 'handicap_ms', # Handicap 'odd_even', # Odd/Even goals ] # Multi-class markets (output > 2 classes) MULTICLASS_MARKETS = {'ms', 'ht_result', 'htft', 'handicap_ms'} def load_models(self) -> bool: """Load all market-specific models from disk.""" try: loaded_count = 0 for market in self.ALL_MARKETS: self.models[market] = {} # Load XGBoost (read content in Python to avoid non-ASCII path issues) xgb_path = os.path.join(self.models_dir, f'xgb_v25_{market}.json') if os.path.exists(xgb_path) and os.path.getsize(xgb_path) > 0: with open(xgb_path, 'r', encoding='utf-8') as f: xgb_content = f.read() booster = xgb.Booster() booster.load_model(bytearray(xgb_content, 'utf-8')) # Corruption detection: verify model can run a dummy prediction try: _dummy = pd.DataFrame([{col: 0.0 for col in self.FEATURE_COLS}]) booster.predict(xgb.DMatrix(_dummy)) self.models[market]['xgb'] = booster loaded_count += 1 except Exception as _ce: print(f"[V25] ⚠️ XGB model for {market} failed integrity check: {_ce} — skipping") # Load LightGBM (read content in Python to avoid non-ASCII path issues) lgb_path = os.path.join(self.models_dir, f'lgb_v25_{market}.txt') if os.path.exists(lgb_path) and os.path.getsize(lgb_path) > 0: with open(lgb_path, 'r', encoding='utf-8') as f: model_str = f.read() lgb_model = lgb.Booster(model_str=model_str) # Corruption detection: verify model can run a dummy prediction try: _dummy = pd.DataFrame([{col: 0.0 for col in self.FEATURE_COLS}]) lgb_model.predict(_dummy) self.models[market]['lgb'] = lgb_model loaded_count += 1 except Exception as _ce: print(f"[V25] ⚠️ LGB model for {market} failed integrity check: {_ce} — skipping") # Remove empty entries if not self.models[market]: del self.models[market] print(f"[V25] Loaded {loaded_count} model files across {len(self.models)} markets: {list(self.models.keys())}") self._loaded = loaded_count > 0 return self._loaded except Exception as e: print(f"[ERROR] Error loading models: {e}") import traceback traceback.print_exc() return False def _ensure_loaded(self): """Ensure models are loaded before prediction.""" if not self._loaded: if not self.load_models(): raise RuntimeError("Failed to load V25 models") def readiness_summary(self) -> Dict[str, Any]: """Return per-market model status for health check endpoint.""" if not self._loaded: self.load_models() market_status = {} for market in self.ALL_MARKETS: m = self.models.get(market, {}) market_status[market] = { "xgb": "xgb" in m, "lgb": "lgb" in m, "ready": bool(m), } loaded_markets = [k for k, v in market_status.items() if v["ready"]] return { "fully_loaded": len(loaded_markets) == len(self.ALL_MARKETS), "loaded_markets": loaded_markets, "missing_markets": [m for m in self.ALL_MARKETS if m not in loaded_markets], "weights": self.DEFAULT_WEIGHTS, } def _prepare_features(self, features: Dict[str, float]) -> pd.DataFrame: """Prepare feature vector for prediction.""" X = pd.DataFrame([{col: features.get(col, 0.0) for col in self.FEATURE_COLS}]) return X def predict_ms(self, features: Dict[str, float]) -> tuple: """ Predict match result (1X2). Returns: (home_prob, draw_prob, away_prob) """ self._ensure_loaded() X = self._prepare_features(features) probs = [] # XGBoost if 'xgb' in self.models.get('ms', {}): dmat = xgb.DMatrix(X) xgb_proba = self.models['ms']['xgb'].predict(dmat) if len(xgb_proba.shape) == 1: xgb_proba = np.array([xgb_proba]) probs.append(xgb_proba[0] * self.DEFAULT_WEIGHTS['xgb']) # LightGBM if 'lgb' in self.models.get('ms', {}): lgb_proba = self.models['ms']['lgb'].predict(X) if len(lgb_proba.shape) == 2: probs.append(lgb_proba[0] * self.DEFAULT_WEIGHTS['lgb']) if not probs: return 0.33, 0.33, 0.33 ensemble_proba = np.sum(probs, axis=0) ensemble_proba = ensemble_proba / ensemble_proba.sum() return float(ensemble_proba[0]), float(ensemble_proba[1]), float(ensemble_proba[2]) def predict_ou25(self, features: Dict[str, float]) -> tuple: """ Predict Over/Under 2.5 goals. Returns: (over_prob, under_prob) """ self._ensure_loaded() X = self._prepare_features(features) probs = [] # XGBoost if 'xgb' in self.models.get('ou25', {}): dmat = xgb.DMatrix(X) xgb_proba = self.models['ou25']['xgb'].predict(dmat) if isinstance(xgb_proba, np.ndarray) and len(xgb_proba.shape) == 1: probs.append(xgb_proba[0]) # LightGBM if 'lgb' in self.models.get('ou25', {}): lgb_proba = self.models['ou25']['lgb'].predict(X) if isinstance(lgb_proba, np.ndarray): probs.append(lgb_proba[0]) if not probs: return 0.5, 0.5 # Average probability avg_prob = np.mean(probs) return float(avg_prob), float(1 - avg_prob) def predict_btts(self, features: Dict[str, float]) -> tuple: """ Predict Both Teams To Score. Returns: (yes_prob, no_prob) """ self._ensure_loaded() X = self._prepare_features(features) probs = [] # XGBoost if 'xgb' in self.models.get('btts', {}): dmat = xgb.DMatrix(X) xgb_proba = self.models['btts']['xgb'].predict(dmat) if isinstance(xgb_proba, np.ndarray) and len(xgb_proba.shape) == 1: probs.append(xgb_proba[0]) # LightGBM if 'lgb' in self.models.get('btts', {}): lgb_proba = self.models['btts']['lgb'].predict(X) if isinstance(lgb_proba, np.ndarray): probs.append(lgb_proba[0]) if not probs: return 0.5, 0.5 # Average probability avg_prob = np.mean(probs) return float(avg_prob), float(1 - avg_prob) def predict_market(self, market: str, features: Dict[str, float]) -> Optional[np.ndarray]: """ Generic prediction for any loaded market. Args: market: Market key (e.g. 'ht_result', 'htft', 'cards_ou45') features: Feature dictionary. Returns: numpy array of probabilities. For binary markets: [positive_prob] For multi-class markets: [class0_prob, class1_prob, ...] """ self._ensure_loaded() if market not in self.models: return None X = self._prepare_features(features) probs = [] weights = [] is_multiclass = market in self.MULTICLASS_MARKETS # XGBoost if 'xgb' in self.models[market]: dmat = xgb.DMatrix(X) xgb_proba = self.models[market]['xgb'].predict(dmat) if isinstance(xgb_proba, np.ndarray): if is_multiclass and len(xgb_proba.shape) == 2: probs.append(xgb_proba[0]) elif is_multiclass and len(xgb_proba.shape) == 1: probs.append(xgb_proba) else: probs.append(np.array([xgb_proba[0]])) weights.append(self.DEFAULT_WEIGHTS['xgb']) # LightGBM if 'lgb' in self.models[market]: lgb_proba = self.models[market]['lgb'].predict(X) if isinstance(lgb_proba, np.ndarray): if is_multiclass and len(lgb_proba.shape) == 2: probs.append(lgb_proba[0]) elif is_multiclass and len(lgb_proba.shape) == 1: probs.append(lgb_proba) else: probs.append(np.array([lgb_proba[0]])) weights.append(self.DEFAULT_WEIGHTS['lgb']) if not probs: return None # Weighted average if len(probs) == 1: return probs[0] total_w = sum(weights[:len(probs)]) result = np.zeros_like(probs[0]) for p, w in zip(probs, weights): result += p * (w / total_w) # Normalize multi-class if is_multiclass and result.sum() > 0: result = result / result.sum() return result def has_market(self, market: str) -> bool: """Check if a specific market model is loaded.""" return market in self.models def predict_match( self, match_id: str, home_team: str, away_team: str, features: Dict[str, float], odds: Optional[Dict[str, float]] = None, ) -> MatchPrediction: """ Predict all markets for a match. Args: match_id: Match identifier. home_team: Home team name. away_team: Away team name. features: Feature dictionary. odds: Optional odds dictionary for value bet detection. Returns: MatchPrediction object. """ # Get predictions for each market home_prob, draw_prob, away_prob = self.predict_ms(features) over_prob, under_prob = self.predict_ou25(features) btts_yes_prob, btts_no_prob = self.predict_btts(features) # Determine picks ms_probs = {'1': home_prob, 'X': draw_prob, '2': away_prob} ms_pick = max(ms_probs, key=ms_probs.__getitem__) ms_confidence = ms_probs[ms_pick] * 100 ou25_probs = {'Over': over_prob, 'Under': under_prob} ou25_pick = max(ou25_probs, key=ou25_probs.__getitem__) ou25_confidence = ou25_probs[ou25_pick] * 100 btts_probs = {'Yes': btts_yes_prob, 'No': btts_no_prob} btts_pick = max(btts_probs, key=btts_probs.__getitem__) btts_confidence = btts_probs[btts_pick] * 100 # Create prediction prediction = MatchPrediction( match_id=match_id, home_team=home_team, away_team=away_team, home_prob=home_prob, draw_prob=draw_prob, away_prob=away_prob, ms_pick=ms_pick, ms_confidence=ms_confidence, over_prob=over_prob, under_prob=under_prob, ou25_pick=ou25_pick, ou25_confidence=ou25_confidence, btts_yes_prob=btts_yes_prob, btts_no_prob=btts_no_prob, btts_pick=btts_pick, btts_confidence=btts_confidence, ) # Detect value bets if odds: prediction.value_bets = self._detect_value_bets( prediction, odds, home_prob, draw_prob, away_prob, over_prob, under_prob, btts_yes_prob, btts_no_prob ) return prediction def _detect_value_bets( self, prediction: MatchPrediction, odds: Dict[str, float], home_prob: float, draw_prob: float, away_prob: float, over_prob: float, under_prob: float, btts_yes_prob: float, btts_no_prob: float, ) -> List[ValueBet]: """Detect value bets based on model vs market odds.""" value_bets = [] # Market-specific minimum edge thresholds # MS: higher variance → require more edge # OU/BTTS: binary markets → tighter edge acceptable EDGE_THRESHOLDS = { 'MS': 0.06, 'OU25': 0.04, 'BTTS': 0.04, } ms_edge = EDGE_THRESHOLDS['MS'] ou_edge = EDGE_THRESHOLDS['OU25'] btts_edge = EDGE_THRESHOLDS['BTTS'] # MS value bets if 'ms_h' in odds and odds['ms_h'] > 0: implied = 1 / odds['ms_h'] edge = home_prob - implied if edge > ms_edge: value_bets.append(ValueBet( market_type='MS', pick='1', probability=home_prob, odds=odds['ms_h'], edge=edge, confidence=home_prob * 100, )) if 'ms_d' in odds and odds['ms_d'] > 0: implied = 1 / odds['ms_d'] edge = draw_prob - implied if edge > ms_edge: value_bets.append(ValueBet( market_type='MS', pick='X', probability=draw_prob, odds=odds['ms_d'], edge=edge, confidence=draw_prob * 100, )) if 'ms_a' in odds and odds['ms_a'] > 0: implied = 1 / odds['ms_a'] edge = away_prob - implied if edge > ms_edge: value_bets.append(ValueBet( market_type='MS', pick='2', probability=away_prob, odds=odds['ms_a'], edge=edge, confidence=away_prob * 100, )) # OU25 value bets if 'ou25_o' in odds and odds['ou25_o'] > 0: implied = 1 / odds['ou25_o'] edge = over_prob - implied if edge > ou_edge: value_bets.append(ValueBet( market_type='OU25', pick='Over', probability=over_prob, odds=odds['ou25_o'], edge=edge, confidence=over_prob * 100, )) if 'ou25_u' in odds and odds['ou25_u'] > 0: implied = 1 / odds['ou25_u'] edge = under_prob - implied if edge > ou_edge: value_bets.append(ValueBet( market_type='OU25', pick='Under', probability=under_prob, odds=odds['ou25_u'], edge=edge, confidence=under_prob * 100, )) # BTTS value bets if 'btts_y' in odds and odds['btts_y'] > 0: implied = 1 / odds['btts_y'] edge = btts_yes_prob - implied if edge > btts_edge: value_bets.append(ValueBet( market_type='BTTS', pick='Yes', probability=btts_yes_prob, odds=odds['btts_y'], edge=edge, confidence=btts_yes_prob * 100, )) if 'btts_n' in odds and odds['btts_n'] > 0: implied = 1 / odds['btts_n'] edge = btts_no_prob - implied if edge > btts_edge: value_bets.append(ValueBet( market_type='BTTS', pick='No', probability=btts_no_prob, odds=odds['btts_n'], edge=edge, confidence=btts_no_prob * 100, )) return value_bets # Singleton instance _v25_predictor: Optional[V25Predictor] = None def get_v25_predictor() -> V25Predictor: """Get or create V25 predictor instance.""" global _v25_predictor if _v25_predictor is None: _v25_predictor = V25Predictor() _v25_predictor.load_models() return _v25_predictor