""" V27 Pro Predictor — Odds-Free Fundamentals + Value Edge Detection This module loads V27 ensemble models (XGBoost, LightGBM, CatBoost) and produces market-independent probability estimates. The key insight: V27 is trained WITHOUT odds features, so it produces "true" probabilities unbiased by market pricing. The divergence between V25 (odds-aware) and V27 (odds-free) predictions signals market mispricing. """ import json import logging import os import pickle from pathlib import Path from typing import Dict, List, Optional, Tuple import numpy as np logger = logging.getLogger(__name__) V27_DIR = Path(__file__).parent / "v27" class V27Predictor: """ Loads V27 ensemble models and provides predictions using the 82-feature odds-free vector. """ MARKETS = ['ms', 'ou25', 'btts'] def __init__(self): self.models: Dict[str, Dict[str, object]] = {} self.feature_cols: List[str] = [] self._loaded = False def load_models(self) -> bool: """Load all V27 ensemble models and feature column spec.""" if self._loaded: return True # Feature columns cols_path = V27_DIR / "v27_feature_cols.json" if not cols_path.exists(): logger.error("[V27] Feature columns file not found: %s", cols_path) return False try: with open(cols_path, "r", encoding="utf-8") as f: self.feature_cols = json.load(f) logger.info("[V27] Loaded %d feature columns", len(self.feature_cols)) except Exception as e: logger.error("[V27] Failed to load feature columns: %s", e) return False # Load models per market model_types = {"xgb": "xgb", "lgb": "lgb"} for market in self.MARKETS: self.models[market] = {} for short, label in model_types.items(): # Try market-specific file first: v27_ms_xgb.pkl path = V27_DIR / f"v27_{market}_{short}.pkl" if not path.exists(): # Fallback to generic: v27_xgboost.pkl (for MS only) generic_names = {"xgb": "v27_xgboost.pkl", "lgb": "v27_lightgbm.pkl", "cb": "v27_catboost.pkl"} path = V27_DIR / generic_names.get(short, "") if not path.exists(): logger.warning("[V27] Model file not found for %s/%s", market, short) continue try: with open(path, "rb") as f: model = pickle.load(f) self.models[market][label] = model logger.info("[V27] ✓ Loaded %s/%s from %s", market, label, path.name) except Exception as e: logger.error("[V27] ✗ Failed to load %s/%s: %s", market, label, e) loaded_count = sum(len(v) for v in self.models.values()) if loaded_count == 0: logger.error("[V27] No models loaded!") return False self._loaded = True logger.info("[V27] Total models loaded: %d across %d markets", loaded_count, len(self.models)) return True def _build_feature_array(self, features: Dict[str, float]) -> np.ndarray: """ Build ordered feature array from the full feature dict. V27 uses only its 82 features (odds-free subset). """ row = [] for col in self.feature_cols: row.append(float(features.get(col, 0.0))) return np.array([row]) def _predict_with_model(self, model, X: np.ndarray, label: str, expected_classes: int) -> Optional[np.ndarray]: """ Predict probabilities from a model, handling both sklearn wrappers (predict_proba) and raw Booster objects (predict). For raw XGBoost Boosters, DMatrix is created WITH feature_names to match the training schema. """ import xgboost as xgb import lightgbm as lgbm import pandas as pd # 1. Try sklearn-style predict_proba first if hasattr(model, 'predict_proba'): try: proba = model.predict_proba(X)[0] if len(proba) == expected_classes: return proba logger.warning("[V27] %s predict_proba returned %d classes, expected %d", label, len(proba), expected_classes) except Exception: pass # Fall through to raw predict # 2. Raw xgboost.Booster — MUST pass feature_names if isinstance(model, xgb.Booster): try: feature_names = self.feature_cols if self.feature_cols else None dmat = xgb.DMatrix(X, feature_names=feature_names) raw = model.predict(dmat) if isinstance(raw, np.ndarray): if raw.ndim == 2 and raw.shape[1] == expected_classes: return raw[0] elif raw.ndim == 1 and expected_classes == 2: p = float(raw[0]) return np.array([1.0 - p, p]) elif raw.ndim == 1 and len(raw) == expected_classes: return raw except Exception as e: logger.warning("[V27] %s xgb.Booster predict failed: %s", label, e) return None # 3. Raw lightgbm.Booster — pass as DataFrame with column names if isinstance(model, lgbm.Booster): try: if self.feature_cols: X_named = pd.DataFrame(X, columns=self.feature_cols) raw = model.predict(X_named) else: raw = model.predict(X) if isinstance(raw, np.ndarray): if raw.ndim == 2 and raw.shape[1] == expected_classes: return raw[0] elif raw.ndim == 1 and expected_classes == 2: p = float(raw[0]) return np.array([1.0 - p, p]) elif raw.ndim == 1 and len(raw) == expected_classes: return raw except Exception as e: logger.warning("[V27] %s lgb.Booster predict failed: %s", label, e) return None # 4. Generic fallback (CatBoost, etc.) try: if hasattr(model, 'predict'): raw = model.predict(X) if isinstance(raw, np.ndarray): if raw.ndim == 2 and raw.shape[1] == expected_classes: return raw[0] elif raw.ndim == 1 and expected_classes == 2: p = float(raw[0]) return np.array([1.0 - p, p]) elif raw.ndim == 1 and len(raw) == expected_classes: return raw except Exception as e: logger.warning("[V27] %s generic predict failed: %s", label, e) return None def predict_ms(self, features: Dict[str, float]) -> Optional[Dict[str, float]]: """ Predict Match Score probabilities (Home/Draw/Away). Returns dict with keys: home, draw, away. """ if not self._loaded or "ms" not in self.models or not self.models["ms"]: return None X = self._build_feature_array(features) probs_list = [] for label, model in self.models["ms"].items(): proba = self._predict_with_model(model, X, f"MS/{label}", expected_classes=3) if proba is not None and len(proba) == 3: probs_list.append(proba) if not probs_list: return None # Ensemble average avg = np.mean(probs_list, axis=0) return { "home": float(avg[0]), "draw": float(avg[1]), "away": float(avg[2]), } def predict_ou25(self, features: Dict[str, float]) -> Optional[Dict[str, float]]: """ Predict Over/Under 2.5 probabilities. Returns dict with keys: under, over. """ if not self._loaded or "ou25" not in self.models or not self.models["ou25"]: return None X = self._build_feature_array(features) probs_list = [] for label, model in self.models["ou25"].items(): proba = self._predict_with_model(model, X, f"OU25/{label}", expected_classes=2) if proba is not None and len(proba) == 2: probs_list.append(proba) if not probs_list: return None avg = np.mean(probs_list, axis=0) return { "under": float(avg[0]), "over": float(avg[1]), } def predict_btts(self, features: Dict[str, float]) -> Optional[Dict[str, float]]: """ Predict Both Teams To Score probabilities. Returns dict with keys: no, yes. """ if not self._loaded or 'btts' not in self.models or not self.models['btts']: return None X = self._build_feature_array(features) probs_list = [] for label, model in self.models['btts'].items(): proba = self._predict_with_model(model, X, f'BTTS/{label}', expected_classes=2) if proba is not None and len(proba) == 2: probs_list.append(proba) if not probs_list: return None avg = np.mean(probs_list, axis=0) return { 'no': float(avg[0]), 'yes': float(avg[1]), } def predict_dc(self, features: Dict[str, float]) -> Optional[Dict[str, float]]: """ Predict Double Chance probabilities. DC is algebraically derived from MS predictions: 1X = home + draw X2 = draw + away 12 = home + away This gives an odds-free DC estimate for divergence detection. """ ms_probs = self.predict_ms(features) if not ms_probs: return None home = ms_probs['home'] draw = ms_probs['draw'] away = ms_probs['away'] return { '1x': round(home + draw, 4), 'x2': round(draw + away, 4), '12': round(home + away, 4), } def predict_all(self, features: Dict[str, float]) -> Dict[str, Optional[Dict[str, float]]]: """Run predictions for all supported markets.""" return { 'ms': self.predict_ms(features), 'ou25': self.predict_ou25(features), 'btts': self.predict_btts(features), 'dc': self.predict_dc(features), } def compute_divergence( v25_probs: Dict[str, float], v27_probs: Dict[str, float], ) -> Dict[str, float]: """ Compute the divergence signal between V25 (odds-aware) and V27 (odds-free). Positive divergence = V27 thinks it's MORE likely than the market → VALUE BET Negative divergence = V27 thinks it's LESS likely than the market → PASS Returns per-outcome divergence values. """ divergence = {} for key in v27_probs: v25_val = v25_probs.get(key, 0.33) v27_val = v27_probs.get(key, 0.33) divergence[key] = round(v27_val - v25_val, 4) return divergence def compute_value_edge( v25_probs: Dict[str, float], v27_probs: Dict[str, float], odds: Dict[str, float], ) -> Dict[str, Dict]: """ Detect value bets by combining V25/V27 divergence with odds. A value bet exists when: 1. V27 (odds-free) probability > implied odds probability (model says it's underpriced) 2. V27 and V25 divergence is positive (V27 sees more signal than the market) Returns per-outcome: { probability, implied_prob, edge, is_value } """ results = {} for key in v27_probs: v27_p = v27_probs[key] v25_p = v25_probs.get(key, 0.33) odds_val = odds.get(key, 0.0) implied_p = (1.0 / odds_val) if odds_val > 1.01 else 0.0 divergence = v27_p - v25_p edge = v27_p - implied_p if implied_p > 0 else 0.0 results[key] = { "v27_prob": round(v27_p, 4), "v25_prob": round(v25_p, 4), "implied_prob": round(implied_p, 4), "divergence": round(divergence, 4), "edge": round(edge, 4), "is_value": edge > 0.05 and divergence > 0.02, # 5% edge + 2% divergence } return results