v28

2026-04-24 23:46:28 +03:00
parent 3875f2a512
commit 9027cc9900
17 changed files with 4315 additions and 122 deletions
@@ -0,0 +1,291 @@
+"""
+V27 Pro Predictor — Odds-Free Fundamentals + Value Edge Detection
+
+This module loads V27 ensemble models (XGBoost, LightGBM, CatBoost)
+and produces market-independent probability estimates.
+
+The key insight: V27 is trained WITHOUT odds features, so it produces
+"true" probabilities unbiased by market pricing. The divergence between
+V25 (odds-aware) and V27 (odds-free) predictions signals market mispricing.
+"""
+
+import json
+import logging
+import os
+import pickle
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+# Module-level logger; every message emitted by this module is prefixed "[V27]".
+logger = logging.getLogger(__name__)
+
+# Directory next to this file that holds v27_feature_cols.json and the
+# pickled model files read by V27Predictor.load_models().
+V27_DIR = Path(__file__).parent / "v27"
+
+
+class V27Predictor:
+    """
+    Loads the V27 ensemble models and predicts from the odds-free feature vector.
+
+    Call :meth:`load_models` once before predicting; until it succeeds every
+    ``predict_*`` method returns ``None``. The feature order is taken from
+    ``v27_feature_cols.json`` (82 features according to the module notes).
+    """
+
+    MARKETS = ["ms", "ou25"]
+
+    # Short model labels, in load order. They are also the keys of
+    # ``self.models[market]``.
+    _MODEL_LABELS = ("xgb", "lgb", "cb")
+
+    # Legacy, market-agnostic file names. They are only a valid substitute
+    # for the market named in _GENERIC_FALLBACK_MARKET: loading them for any
+    # other market would put a model with the wrong number of classes into
+    # that market's ensemble.
+    _GENERIC_MODEL_FILES = {
+        "xgb": "v27_xgboost.pkl",
+        "lgb": "v27_lightgbm.pkl",
+        "cb": "v27_catboost.pkl",
+    }
+    _GENERIC_FALLBACK_MARKET = "ms"
+
+    def __init__(self):
+        # market -> {model label -> unpickled model object}
+        self.models: Dict[str, Dict[str, object]] = {}
+        # Ordered feature names; defines the column order of the input array.
+        self.feature_cols: List[str] = []
+        self._loaded = False
+
+    def load_models(self) -> bool:
+        """
+        Load the feature column spec and every available V27 model.
+
+        Returns True when at least one model was loaded (or models were
+        already loaded by an earlier call), False otherwise. Missing or
+        unreadable individual model files are logged and skipped.
+        """
+        if self._loaded:
+            return True
+
+        # Feature columns
+        cols_path = V27_DIR / "v27_feature_cols.json"
+        if not cols_path.exists():
+            logger.error("[V27] Feature columns file not found: %s", cols_path)
+            return False
+
+        try:
+            with open(cols_path, "r", encoding="utf-8") as f:
+                feature_cols = json.load(f)
+        except Exception as e:
+            logger.error("[V27] Failed to load feature columns: %s", e)
+            return False
+
+        # The rest of the class iterates this value as an ordered list of
+        # column names; anything else would silently build a wrong vector.
+        if not isinstance(feature_cols, list) or not feature_cols:
+            logger.error("[V27] Feature columns file does not contain a non-empty list: %s", cols_path)
+            return False
+        self.feature_cols = feature_cols
+        logger.info("[V27] Loaded %d feature columns", len(self.feature_cols))
+
+        # Load models per market
+        for market in self.MARKETS:
+            self.models[market] = {}
+            for label in self._MODEL_LABELS:
+                path = self._resolve_model_path(market, label)
+                if path is None:
+                    logger.warning("[V27] Model file not found for %s/%s", market, label)
+                    continue
+
+                try:
+                    # NOTE(security): pickle.load executes arbitrary code from
+                    # the file. Only load model files from a trusted location.
+                    with open(path, "rb") as f:
+                        model = pickle.load(f)
+                except Exception as e:
+                    logger.error("[V27] ✗ Failed to load %s/%s: %s", market, label, e)
+                    continue
+
+                self.models[market][label] = model
+                logger.info("[V27] ✓ Loaded %s/%s from %s", market, label, path.name)
+
+        loaded_count = sum(len(v) for v in self.models.values())
+        if loaded_count == 0:
+            logger.error("[V27] No models loaded!")
+            return False
+
+        self._loaded = True
+        logger.info("[V27] Total models loaded: %d across %d markets", loaded_count, len(self.models))
+        return True
+
+    def _resolve_model_path(self, market: str, label: str) -> Optional[Path]:
+        """Return the model file to load for market/label, or None if there is none."""
+        # Market-specific file first, e.g. v27_ms_xgb.pkl
+        specific = V27_DIR / f"v27_{market}_{label}.pkl"
+        if specific.exists():
+            return specific
+
+        # Generic file (e.g. v27_xgboost.pkl) is a fallback for MS only.
+        if market != self._GENERIC_FALLBACK_MARKET:
+            return None
+        generic_name = self._GENERIC_MODEL_FILES.get(label)
+        if not generic_name:
+            return None
+        generic = V27_DIR / generic_name
+        return generic if generic.exists() else None
+
+    def _build_feature_array(self, features: Dict[str, float]) -> np.ndarray:
+        """
+        Build the ordered (1, n_features) array from the full feature dict.
+
+        Only the columns listed in ``self.feature_cols`` are used. A column
+        that is absent from ``features`` or whose value is None becomes 0.0.
+        """
+        row = []
+        for col in self.feature_cols:
+            value = features.get(col)
+            row.append(0.0 if value is None else float(value))
+        return np.array([row], dtype=float)
+
+    @staticmethod
+    def _coerce_proba(raw, expected_classes: int) -> Optional[np.ndarray]:
+        """
+        Turn a single-row raw model output into a probability vector.
+
+        Accepted shapes: (rows, expected_classes) -> first row;
+        (expected_classes,) -> used as is; any other 1-D output when
+        expected_classes == 2 -> first element is P(class 1).
+
+        Returns None when the shape does not fit or the values are not a
+        probability distribution (non-finite, outside [0, 1], or not summing
+        to 1). The value check rejects class labels and raw margins that a
+        plain ``predict()`` may return.
+        """
+        if not isinstance(raw, np.ndarray) or raw.size == 0:
+            return None
+
+        if raw.ndim == 2 and raw.shape[1] == expected_classes:
+            proba = raw[0]
+        elif raw.ndim == 1 and raw.size == expected_classes:
+            # Checked before the binary shortcut so a per-class vector of
+            # length 2 is not misread as a single P(class 1) value.
+            proba = raw
+        elif raw.ndim == 1 and expected_classes == 2:
+            p = float(raw[0])
+            proba = np.array([1.0 - p, p])
+        else:
+            return None
+
+        try:
+            proba = np.asarray(proba, dtype=float)
+        except (TypeError, ValueError):
+            return None
+        if not np.all(np.isfinite(proba)):
+            return None
+        if np.any(proba < 0.0) or np.any(proba > 1.0):
+            return None
+        if abs(float(proba.sum()) - 1.0) > 1e-3:
+            return None
+        return proba
+
+    def _raw_predict(self, model, X: np.ndarray, label: str):
+        """
+        Call the raw ``predict`` of a model that has no ``predict_proba``.
+
+        Returns whatever the model returned, or None when the call failed
+        (the failure is logged). Backend libraries are imported lazily so a
+        missing optional backend only affects models of that backend.
+        """
+        # Raw xgboost.Booster — MUST pass feature_names to match training schema
+        try:
+            import xgboost as xgb
+        except Exception:  # optional backend; import may also fail on native libs
+            xgb = None
+        if xgb is not None and isinstance(model, xgb.Booster):
+            try:
+                dmat = xgb.DMatrix(X, feature_names=self.feature_cols or None)
+                return model.predict(dmat)
+            except Exception as e:
+                logger.warning("[V27] %s xgb.Booster predict failed: %s", label, e)
+                return None
+
+        # Raw lightgbm.Booster — pass as DataFrame with column names
+        try:
+            import lightgbm as lgbm
+        except Exception:  # optional backend
+            lgbm = None
+        if lgbm is not None and isinstance(model, lgbm.Booster):
+            try:
+                data = X
+                if self.feature_cols:
+                    try:
+                        import pandas as pd
+                    except ImportError:
+                        pd = None  # fall back to the unnamed array
+                    if pd is not None:
+                        data = pd.DataFrame(X, columns=self.feature_cols)
+                return model.predict(data)
+            except Exception as e:
+                logger.warning("[V27] %s lgb.Booster predict failed: %s", label, e)
+                return None
+
+        # Generic fallback (CatBoost, etc.)
+        # NOTE(review): a raw catboost.CatBoost object may return raw scores
+        # rather than probabilities from predict(); such output is rejected
+        # by the validation in _coerce_proba — confirm how cb models are saved.
+        if hasattr(model, "predict"):
+            try:
+                return model.predict(X)
+            except Exception as e:
+                logger.warning("[V27] %s generic predict failed: %s", label, e)
+        return None
+
+    def _predict_with_model(self, model, X: np.ndarray, label: str, expected_classes: int) -> Optional[np.ndarray]:
+        """
+        Predict class probabilities for the single row in X.
+
+        Handles sklearn-style wrappers (``predict_proba``) and raw Booster
+        objects (``predict``). Returns a 1-D array of length
+        ``expected_classes``, or None when the model fails or returns
+        something that is not such a probability vector.
+        """
+        # 1. sklearn-style predict_proba. If it exists it is authoritative:
+        #    on such models plain predict() returns class labels, which must
+        #    never be interpreted as probabilities.
+        if hasattr(model, "predict_proba"):
+            try:
+                proba = np.asarray(model.predict_proba(X)[0], dtype=float)
+            except Exception as e:
+                logger.warning("[V27] %s predict_proba failed: %s", label, e)
+                return None
+            if proba.ndim == 1 and len(proba) == expected_classes:
+                return proba
+            logger.warning("[V27] %s predict_proba returned %d classes, expected %d", label, proba.size, expected_classes)
+            return None
+
+        # 2-4. Raw boosters and generic predict()
+        raw = self._raw_predict(model, X, label)
+        if raw is None:
+            return None
+        proba = self._coerce_proba(raw, expected_classes)
+        if proba is None:
+            logger.warning("[V27] %s output is not a valid %d-class probability vector", label, expected_classes)
+        return proba
+
+    def _predict_market(self, market: str, features: Dict[str, float], outcome_keys: List[str]) -> Optional[Dict[str, float]]:
+        """
+        Average the per-model probabilities of one market.
+
+        ``outcome_keys`` names the classes in model output order (index 0
+        first). Returns None when models are not loaded, the market has no
+        models, or no model produced a usable prediction.
+        """
+        market_models = self.models.get(market)
+        if not self._loaded or not market_models:
+            return None
+
+        n_classes = len(outcome_keys)
+        X = self._build_feature_array(features)
+        tag = market.upper()
+        probs_list = []
+
+        for label, model in market_models.items():
+            proba = self._predict_with_model(model, X, f"{tag}/{label}", expected_classes=n_classes)
+            if proba is not None and len(proba) == n_classes:
+                probs_list.append(proba)
+
+        if not probs_list:
+            return None
+
+        # Ensemble average (unweighted mean over the models that answered)
+        avg = np.mean(probs_list, axis=0)
+        return {key: float(p) for key, p in zip(outcome_keys, avg)}
+
+    def predict_ms(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
+        """
+        Predict the three-way MS market probabilities (Home/Draw/Away).
+        Returns dict with keys: home, draw, away — or None if unavailable.
+        """
+        return self._predict_market("ms", features, ["home", "draw", "away"])
+
+    def predict_ou25(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
+        """
+        Predict Over/Under 2.5 probabilities.
+        Returns dict with keys: under, over — or None if unavailable.
+        """
+        return self._predict_market("ou25", features, ["under", "over"])
+
+    def predict_all(self, features: Dict[str, float]) -> Dict[str, Optional[Dict[str, float]]]:
+        """Run predictions for all supported markets; a market's value may be None."""
+        return {
+            "ms": self.predict_ms(features),
+            "ou25": self.predict_ou25(features),
+        }
+
+
+def compute_divergence(
+    v25_probs: Dict[str, float],
+    v27_probs: Dict[str, float],
+    *,
+    default_prob: float = 0.33,
+) -> Dict[str, float]:
+    """
+    Compute the per-outcome divergence between V27 (odds-free) and V25 (odds-aware).
+
+    The divergence of an outcome is ``v27 - v25``, rounded to 4 decimals.
+    The comparison is against the V25 model's probability, not against raw
+    bookmaker odds.
+
+    Positive divergence = V27 rates the outcome MORE likely than V25 → VALUE signal
+    Negative divergence = V27 rates the outcome LESS likely than V25 → PASS
+
+    Args:
+        v25_probs: outcome -> V25 probability.
+        v27_probs: outcome -> V27 probability; its keys define the result keys.
+        default_prob: V25 probability assumed for an outcome that is missing
+            from ``v25_probs``. The default 0.33 suits a three-way market;
+            pass 0.5 for a two-way market such as over/under.
+
+    Returns:
+        outcome -> divergence, one entry per key of ``v27_probs``.
+    """
+    return {
+        key: round(v27_val - v25_probs.get(key, default_prob), 4)
+        for key, v27_val in v27_probs.items()
+    }
+
+
+def compute_value_edge(
+    v25_probs: Dict[str, float],
+    v27_probs: Dict[str, float],
+    odds: Dict[str, float],
+    *,
+    min_edge: float = 0.05,
+    min_divergence: float = 0.02,
+    default_prob: float = 0.33,
+    min_odds: float = 1.01,
+) -> Dict[str, Dict]:
+    """
+    Detect value bets by combining V25/V27 divergence with odds.
+
+    An outcome is flagged as value when both hold:
+    1. edge = V27 probability - implied odds probability exceeds ``min_edge``
+       (the odds-free model says the outcome is underpriced)
+    2. divergence = V27 probability - V25 probability exceeds ``min_divergence``
+
+    Args:
+        v25_probs: outcome -> V25 (odds-aware) probability.
+        v27_probs: outcome -> V27 (odds-free) probability; defines the result keys.
+        odds: outcome -> decimal odds. A missing, None or non-numeric price,
+            or one not above ``min_odds``, means "no usable price": the
+            implied probability and the edge are reported as 0.0 and the
+            outcome is never flagged.
+        min_edge: minimum edge required (default 5%).
+        min_divergence: minimum V27-V25 divergence required (default 2%).
+        default_prob: V25 probability assumed when an outcome is missing from
+            ``v25_probs`` (0.33 suits a three-way market; use 0.5 for two-way).
+        min_odds: odds must be strictly greater than this to be used.
+
+    Returns:
+        outcome -> {v27_prob, v25_prob, implied_prob, divergence, edge, is_value}.
+        Numeric fields are rounded to 4 decimals; ``is_value`` is decided on
+        the unrounded values.
+    """
+    results = {}
+    for key, v27_p in v27_probs.items():
+        v25_p = v25_probs.get(key, default_prob)
+
+        # Odds feeds can carry null or string prices; treat anything that is
+        # not a usable number as "no price" instead of raising.
+        raw_odds = odds.get(key)
+        try:
+            odds_val = float(raw_odds) if raw_odds is not None else 0.0
+        except (TypeError, ValueError):
+            odds_val = 0.0
+
+        implied_p = (1.0 / odds_val) if odds_val > min_odds else 0.0
+        divergence = v27_p - v25_p
+        edge = v27_p - implied_p if implied_p > 0 else 0.0
+
+        results[key] = {
+            "v27_prob": round(v27_p, 4),
+            "v25_prob": round(v25_p, 4),
+            "implied_prob": round(implied_p, 4),
+            "divergence": round(divergence, 4),
+            "edge": round(edge, 4),
+            "is_value": edge > min_edge and divergence > min_divergence,
+        }
+
+    return results