# iddaai-be/ai-engine/models/v27_predictor.py

"""
V27 Pro Predictor — Odds-Free Fundamentals + Value Edge Detection

This module loads V27 ensemble models (XGBoost, LightGBM, CatBoost)
and produces market-independent probability estimates.

The key insight: V27 is trained WITHOUT odds features, so it produces
"true" probabilities unbiased by market pricing. The divergence between
V25 (odds-aware) and V27 (odds-free) predictions signals market mispricing.
"""

import json
import logging
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

V27_DIR = Path(__file__).parent / "v27"


class V27Predictor:
    """
    Loads the V27 ensemble models and produces probability estimates from
    the odds-free feature vector.

    The feature order is defined by ``v27_feature_cols.json`` (82 features
    according to the original notes). For each market in ``MARKETS`` up to
    three pickled models (labels ``xgb``, ``lgb``, ``cb``) are loaded and
    their per-class probabilities are averaged.
    """

    MARKETS = ["ms", "ou25"]

    # Model labels; each is both the file-name suffix and the key in
    # ``self.models[market]``.
    _MODEL_LABELS = ("xgb", "lgb", "cb")

    # Legacy, market-agnostic file names. Per the original fallback note
    # these are meant for the MS market only.
    _GENERIC_MS_FILES = {
        "xgb": "v27_xgboost.pkl",
        "lgb": "v27_lightgbm.pkl",
        "cb": "v27_catboost.pkl",
    }

    def __init__(self):
        self.models: Dict[str, Dict[str, object]] = {}
        self.feature_cols: List[str] = []
        self._loaded = False

    def load_models(self) -> bool:
        """
        Load the feature column spec and every available V27 model.

        Returns:
            True when the feature columns and at least one model were
            loaded (or when a previous call already succeeded); False when
            the column file is missing/unreadable or no model could be
            loaded. Individual model failures are logged and skipped.
        """
        if self._loaded:
            return True

        # Feature columns
        cols_path = V27_DIR / "v27_feature_cols.json"
        if not cols_path.exists():
            logger.error("[V27] Feature columns file not found: %s", cols_path)
            return False

        try:
            with open(cols_path, "r", encoding="utf-8") as f:
                self.feature_cols = json.load(f)
            logger.info("[V27] Loaded %d feature columns", len(self.feature_cols))
        except Exception as e:
            logger.error("[V27] Failed to load feature columns: %s", e)
            return False

        # Load models per market
        for market in self.MARKETS:
            self.models[market] = {}
            for label in self._MODEL_LABELS:
                path = self._resolve_model_path(market, label)
                if path is None:
                    logger.warning("[V27] Model file not found for %s/%s", market, label)
                    continue

                try:
                    with open(path, "rb") as f:
                        # NOTE(security): pickle.load executes arbitrary code
                        # from the file. This is only acceptable because the
                        # files come from the local model directory; never
                        # point V27_DIR at untrusted content.
                        model = pickle.load(f)
                    self.models[market][label] = model
                    logger.info("[V27] ✓ Loaded %s/%s from %s", market, label, path.name)
                except Exception as e:
                    logger.error("[V27] ✗ Failed to load %s/%s: %s", market, label, e)

        loaded_count = sum(len(v) for v in self.models.values())
        if loaded_count == 0:
            logger.error("[V27] No models loaded!")
            return False

        self._loaded = True
        logger.info("[V27] Total models loaded: %d across %d markets", loaded_count, len(self.models))
        return True

    def _resolve_model_path(self, market: str, label: str) -> Optional[Path]:
        """
        Return the pickle path for ``market``/``label``, or None if absent.

        The market-specific file (e.g. ``v27_ms_xgb.pkl``) is preferred.
        Only the MS market may fall back to the legacy generic file names:
        loading them for another market would place a model trained for a
        different outcome set into that market's ensemble.
        """
        specific = V27_DIR / f"v27_{market}_{label}.pkl"
        if specific.exists():
            return specific

        if market == "ms":
            generic_name = self._GENERIC_MS_FILES.get(label)
            if generic_name:
                generic = V27_DIR / generic_name
                if generic.exists():
                    return generic

        return None

    def _build_feature_array(self, features: Dict[str, float]) -> np.ndarray:
        """
        Build the ordered single-row feature array from the full feature dict.

        Only the columns listed in ``self.feature_cols`` are used, in that
        order; extra keys in ``features`` are ignored. A missing key or a
        ``None`` value is encoded as 0.0.

        Returns:
            Array of shape ``(1, len(self.feature_cols))``.
        """
        row = []
        for col in self.feature_cols:
            value = features.get(col)
            # Treat an explicit None like a missing key instead of crashing.
            row.append(0.0 if value is None else float(value))
        return np.array([row], dtype=float)

    @staticmethod
    def _coerce_proba(raw, expected_classes: int) -> Optional[np.ndarray]:
        """
        Turn a raw ``predict`` output for one row into a probability vector.

        Accepted shapes:
          * 2-D with ``expected_classes`` columns -> first row;
          * 1-D for a binary market -> ``raw[0]`` is taken as the
            positive-class probability and expanded to ``[1 - p, p]``;
          * 1-D with exactly ``expected_classes`` entries -> returned as is.

        Anything else (including non-ndarray outputs) yields None.
        """
        if not isinstance(raw, np.ndarray):
            return None
        if raw.ndim == 2 and raw.shape[1] == expected_classes:
            return raw[0]
        if raw.ndim == 1 and expected_classes == 2:
            p = float(raw[0])
            return np.array([1.0 - p, p])
        if raw.ndim == 1 and len(raw) == expected_classes:
            return raw
        return None

    def _predict_with_model(self, model, X: np.ndarray, label: str, expected_classes: int) -> Optional[np.ndarray]:
        """
        Predict class probabilities for a single row with one model.

        Handles sklearn-style wrappers (``predict_proba``) and raw Booster
        objects (``predict``). Raw XGBoost Boosters get a DMatrix created
        WITH feature names, raw LightGBM Boosters a named DataFrame, so
        the input matches the training schema.

        Args:
            model: Unpickled model object.
            X: Feature array as produced by ``_build_feature_array``.
            label: Human-readable model tag used in log messages.
            expected_classes: Number of outcomes of the market.

        Returns:
            Probability vector of length ``expected_classes``, or None when
            the model cannot produce one.
        """
        # 1. Try sklearn-style predict_proba first
        has_proba = hasattr(model, "predict_proba")
        if has_proba:
            try:
                proba = model.predict_proba(X)[0]
                if len(proba) == expected_classes:
                    return proba
                logger.warning("[V27] %s predict_proba returned %d classes, expected %d", label, len(proba), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s predict_proba failed: %s", label, e)

        # The booster libraries are optional here: a model of that type can
        # only have been unpickled if its library is importable, so a failed
        # import simply means "this model is not that kind of Booster".
        try:
            import xgboost as xgb
        except Exception:  # ImportError or a native-library load failure
            xgb = None
        try:
            import lightgbm as lgbm
        except Exception:  # ImportError or a native-library load failure
            lgbm = None

        # 2. Raw xgboost.Booster — MUST pass feature_names
        if xgb is not None and isinstance(model, xgb.Booster):
            try:
                dmat = xgb.DMatrix(X, feature_names=self.feature_cols or None)
                return self._coerce_proba(model.predict(dmat), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s xgb.Booster predict failed: %s", label, e)
                return None

        # 3. Raw lightgbm.Booster — pass as DataFrame with column names
        if lgbm is not None and isinstance(model, lgbm.Booster):
            try:
                data = X
                if self.feature_cols:
                    import pandas as pd

                    data = pd.DataFrame(X, columns=self.feature_cols)
                return self._coerce_proba(model.predict(data), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s lgb.Booster predict failed: %s", label, e)
                return None

        # 4. Generic fallback for objects that only expose predict().
        # Classifier wrappers that have predict_proba return class LABELS
        # from predict(); interpreting a label as a probability would yield
        # nonsense such as [-1, 2], so those models are not retried here.
        if has_proba:
            return None
        if hasattr(model, "predict"):
            try:
                return self._coerce_proba(model.predict(X), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s generic predict failed: %s", label, e)

        return None

    def _predict_market(
        self,
        market: str,
        features: Dict[str, float],
        outcome_keys: Tuple[str, ...],
    ) -> Optional[Dict[str, float]]:
        """
        Average the probability vectors of all usable models of ``market``.

        ``outcome_keys`` names the classes in model output order. Models
        that fail, return the wrong number of classes, or return
        non-finite values are skipped so they cannot corrupt the mean.

        Returns:
            ``{outcome_key: probability}`` or None when the predictor is
            not loaded or no model produced a usable prediction.
        """
        if not self._loaded or not self.models.get(market):
            return None

        n_classes = len(outcome_keys)
        X = self._build_feature_array(features)
        probs_list = []

        for label, model in self.models[market].items():
            tag = f"{market.upper()}/{label}"
            proba = self._predict_with_model(model, X, tag, expected_classes=n_classes)
            if proba is None or len(proba) != n_classes:
                continue
            proba = np.asarray(proba, dtype=float)
            if not np.all(np.isfinite(proba)):
                logger.warning("[V27] %s returned non-finite probabilities, skipping", tag)
                continue
            probs_list.append(proba)

        if not probs_list:
            return None

        # Ensemble average
        avg = np.mean(probs_list, axis=0)
        return {key: float(p) for key, p in zip(outcome_keys, avg)}

    def predict_ms(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict the three-way "ms" market probabilities (Home/Draw/Away).

        Returns dict with keys: home, draw, away — or None when no MS model
        is available or none produced a usable prediction.
        """
        return self._predict_market("ms", features, ("home", "draw", "away"))

    def predict_ou25(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict Over/Under 2.5 probabilities.

        Returns dict with keys: under, over — or None when no OU25 model is
        available or none produced a usable prediction.
        """
        return self._predict_market("ou25", features, ("under", "over"))

    def predict_all(self, features: Dict[str, float]) -> Dict[str, Optional[Dict[str, float]]]:
        """Run predictions for all supported markets, keyed by market name."""
        return {
            "ms": self.predict_ms(features),
            "ou25": self.predict_ou25(features),
        }


def compute_divergence(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
) -> Dict[str, float]:
    """
    Per-outcome difference between the V27 (odds-free) and V25 (odds-aware)
    probabilities.

    For every outcome present in ``v27_probs`` the result holds
    ``v27 - v25`` rounded to 4 decimals. An outcome missing from
    ``v25_probs`` is compared against a fallback of 0.33.

    Positive value = V27 rates the outcome MORE likely than V25 → VALUE BET
    Negative value = V27 rates the outcome LESS likely than V25 → PASS
    """
    missing_v25 = 0.33
    return {
        outcome: round(odds_free_p - v25_probs.get(outcome, missing_v25), 4)
        for outcome, odds_free_p in v27_probs.items()
    }


def compute_value_edge(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
    odds: Dict[str, float],
    min_edge: float = 0.05,
    min_divergence: float = 0.02,
) -> Dict[str, Dict]:
    """
    Detect value bets by combining V25/V27 divergence with odds.

    An outcome is flagged as value when both hold:
    1. edge = V27 probability - implied odds probability  > ``min_edge``
    2. divergence = V27 probability - V25 probability     > ``min_divergence``

    Args:
        v25_probs: Odds-aware probabilities per outcome; a missing outcome
            falls back to 0.33.
        v27_probs: Odds-free probabilities per outcome; defines which
            outcomes are evaluated.
        odds: Decimal odds per outcome. Missing, ``None`` or odds not
            greater than 1.01 are treated as "no price": the implied
            probability and the edge are reported as 0.0 and the outcome
            is never flagged as value.
        min_edge: Minimum edge required (default 5%).
        min_divergence: Minimum divergence required (default 2%).

    Returns:
        Per outcome: ``{v27_prob, v25_prob, implied_prob, divergence, edge,
        is_value}``. Numeric fields are rounded to 4 decimals; the
        thresholds are applied to the unrounded values.
    """
    results = {}
    for key, v27_p in v27_probs.items():
        v25_p = v25_probs.get(key, 0.33)
        # `or 0.0` maps an explicit None to "no price" instead of raising
        # TypeError in the comparison below.
        odds_val = odds.get(key) or 0.0

        implied_p = (1.0 / odds_val) if odds_val > 1.01 else 0.0
        divergence = v27_p - v25_p
        edge = v27_p - implied_p if implied_p > 0 else 0.0

        results[key] = {
            "v27_prob": round(v27_p, 4),
            "v25_prob": round(v25_p, 4),
            "implied_prob": round(implied_p, 4),
            "divergence": round(divergence, 4),
            "edge": round(edge, 4),
            # bool() keeps the flag a plain Python bool even when the inputs
            # are numpy scalars, so the result stays JSON-serializable.
            "is_value": bool(edge > min_edge and divergence > min_divergence),
        }

    return results