292 lines
11 KiB
Python
292 lines
11 KiB
Python
"""
|
|
V27 Pro Predictor — Odds-Free Fundamentals + Value Edge Detection

This module loads V27 ensemble models (XGBoost, LightGBM, CatBoost)
and produces market-independent probability estimates.

The key insight: V27 is trained WITHOUT odds features, so it produces
"true" probabilities unbiased by market pricing. The divergence between
V25 (odds-aware) and V27 (odds-free) predictions signals market mispricing.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import pickle
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
import numpy as np
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
# Directory next to this file from which the V27 artifacts are read
# (the feature-column JSON and the pickled models).
V27_DIR = Path(__file__).parent / "v27"
|
|
|
|
|
|
class V27Predictor:
    """
    Load the V27 ensemble models and serve predictions built from the
    82-feature odds-free vector.

    Typical use: call ``load_models()`` once, then ``predict_ms()``,
    ``predict_ou25()`` or ``predict_all()`` with a feature dict. The
    predict methods return ``None`` until models have been loaded.
    """

    MARKETS = ["ms", "ou25"]

    # Model labels, used both as dict keys and as file-name suffixes.
    _MODEL_LABELS = ("xgb", "lgb", "cb")

    # Legacy, market-agnostic file names. Per the original loader these are
    # MS models, so they are only used as a fallback for the "ms" market.
    _GENERIC_MODEL_FILES = {
        "xgb": "v27_xgboost.pkl",
        "lgb": "v27_lightgbm.pkl",
        "cb": "v27_catboost.pkl",
    }

    # Output keys per market, in model class-index order.
    # NOTE(review): assumes class 0/1/2 = home/draw/away and 0/1 = under/over,
    # as the original code did — confirm against the training pipeline.
    _OUTCOMES = {
        "ms": ("home", "draw", "away"),
        "ou25": ("under", "over"),
    }

    # Slack allowed around [0, 1] when validating model output.
    _PROBA_TOL = 1e-6

    def __init__(self):
        self.models: Dict[str, Dict[str, object]] = {}
        self.feature_cols: List[str] = []
        self._loaded = False

    def load_models(self) -> bool:
        """
        Load the feature column spec and every available V27 model.

        Returns:
            True if at least one model was loaded (or models were already
            loaded by a previous call), False otherwise. Failures are
            logged, never raised.
        """
        if self._loaded:
            return True

        # Feature columns
        cols_path = V27_DIR / "v27_feature_cols.json"
        if not cols_path.exists():
            logger.error("[V27] Feature columns file not found: %s", cols_path)
            return False

        try:
            with open(cols_path, "r", encoding="utf-8") as f:
                cols = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and text-decoding errors.
            logger.error("[V27] Failed to load feature columns: %s", e)
            return False

        if not isinstance(cols, list):
            # Column order defines the feature vector layout, so anything
            # other than an ordered list is unusable.
            logger.error(
                "[V27] Failed to load feature columns: expected a JSON list, got %s",
                type(cols).__name__,
            )
            return False

        self.feature_cols = cols
        logger.info("[V27] Loaded %d feature columns", len(self.feature_cols))

        # Load models per market
        for market in self.MARKETS:
            self.models[market] = {}
            for label in self._MODEL_LABELS:
                path = self._resolve_model_path(market, label)
                if path is None:
                    logger.warning("[V27] Model file not found for %s/%s", market, label)
                    continue

                try:
                    # SECURITY: pickle.load executes arbitrary code from the
                    # file. Only artifacts shipped in V27_DIR are read here;
                    # never point this at untrusted files.
                    with open(path, "rb") as f:
                        model = pickle.load(f)
                except Exception as e:
                    # Unpickling can fail in many ways (missing library,
                    # version skew, truncated file); skip this model only.
                    logger.error("[V27] ✗ Failed to load %s/%s: %s", market, label, e)
                    continue

                self.models[market][label] = model
                logger.info("[V27] ✓ Loaded %s/%s from %s", market, label, path.name)

        loaded_count = sum(len(v) for v in self.models.values())
        if loaded_count == 0:
            logger.error("[V27] No models loaded!")
            return False

        self._loaded = True
        logger.info("[V27] Total models loaded: %d across %d markets", loaded_count, len(self.models))
        return True

    def _resolve_model_path(self, market: str, label: str) -> Optional[Path]:
        """Return the model file for ``market``/``label``, or None if absent."""
        # Market-specific file first: v27_ms_xgb.pkl
        path = V27_DIR / f"v27_{market}_{label}.pkl"
        if path.exists():
            return path

        # Generic files are MS models; using them for another market would
        # put a model with the wrong number of classes into that slot.
        if market == "ms":
            generic = self._GENERIC_MODEL_FILES.get(label)
            if generic:
                path = V27_DIR / generic
                if path.exists():
                    return path
        return None

    def _build_feature_array(self, features: Dict[str, float]) -> np.ndarray:
        """
        Build the ordered (1, n_features) array from the full feature dict.

        Only the columns listed in ``self.feature_cols`` are used, in that
        order. Features that are missing or ``None`` are filled with 0.0.
        """
        row = []
        for col in self.feature_cols:
            value = features.get(col)
            row.append(0.0 if value is None else float(value))
        return np.array([row])

    @classmethod
    def _coerce_proba(cls, raw, expected_classes: int) -> Optional[np.ndarray]:
        """
        Turn raw ``predict`` output into a 1-D probability vector.

        Accepted shapes: ``(n_rows, expected_classes)`` (first row is used),
        a 1-D positive-class probability when ``expected_classes == 2``, or
        a 1-D vector of length ``expected_classes``. Returns None for any
        other shape, or when values are not finite probabilities in [0, 1]
        (e.g. class labels or raw margins).
        """
        try:
            arr = np.asarray(raw, dtype=float)
        except (TypeError, ValueError):
            return None

        if arr.ndim == 2 and arr.shape[0] >= 1 and arr.shape[1] == expected_classes:
            proba = arr[0]
        elif arr.ndim == 1 and expected_classes == 2 and arr.size >= 1:
            p = float(arr[0])
            proba = np.array([1.0 - p, p])
        elif arr.ndim == 1 and arr.size == expected_classes:
            proba = arr
        else:
            return None

        tol = cls._PROBA_TOL
        if not np.all(np.isfinite(proba)):
            return None
        if np.any(proba < -tol) or np.any(proba > 1.0 + tol):
            return None
        return proba

    def _predict_with_model(self, model, X: np.ndarray, label: str, expected_classes: int) -> Optional[np.ndarray]:
        """
        Predict probabilities from a model, handling both sklearn wrappers
        (predict_proba) and raw Booster objects (predict).

        For raw XGBoost Boosters, DMatrix is created WITH feature_names
        to match the training schema.

        xgboost, lightgbm and pandas are imported lazily and optionally, so
        a missing package only affects models that actually need it.

        Returns:
            A 1-D array of ``expected_classes`` probabilities, or None if
            the model failed or produced unusable output.
        """
        # 1. Try sklearn-style predict_proba first
        if hasattr(model, "predict_proba"):
            try:
                proba = np.asarray(model.predict_proba(X)[0], dtype=float)
            except Exception as e:
                # Fall through to the raw predict paths below.
                logger.warning("[V27] %s predict_proba failed, trying raw predict: %s", label, e)
            else:
                if proba.ndim == 1 and proba.size == expected_classes:
                    return proba
                # The model has the wrong number of classes for this market;
                # predict() would return class labels, not probabilities.
                logger.warning(
                    "[V27] %s predict_proba returned %d classes, expected %d",
                    label, proba.size, expected_classes,
                )
                return None

        # 2. Raw xgboost.Booster — MUST pass feature_names
        try:
            import xgboost as xgb
        except Exception:  # not installed, or native library failed to load
            xgb = None
        if xgb is not None and isinstance(model, xgb.Booster):
            try:
                dmat = xgb.DMatrix(X, feature_names=self.feature_cols or None)
                return self._coerce_proba(model.predict(dmat), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s xgb.Booster predict failed: %s", label, e)
                return None

        # 3. Raw lightgbm.Booster — pass as DataFrame with column names
        try:
            import lightgbm as lgbm
        except Exception:  # not installed, or native library failed to load
            lgbm = None
        if lgbm is not None and isinstance(model, lgbm.Booster):
            try:
                X_in = X
                if self.feature_cols:
                    try:
                        import pandas as pd
                    except ImportError:
                        pd = None
                    if pd is not None:
                        X_in = pd.DataFrame(X, columns=self.feature_cols)
                return self._coerce_proba(model.predict(X_in), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s lgb.Booster predict failed: %s", label, e)
                return None

        # 4. Generic fallback (CatBoost, etc.)
        if hasattr(model, "predict"):
            try:
                proba = self._coerce_proba(model.predict(X), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s generic predict failed: %s", label, e)
                return None
            if proba is None:
                logger.warning("[V27] %s generic predict returned unusable output", label)
            return proba

        return None

    def _predict_market(
        self,
        market: str,
        features: Dict[str, float],
        outcomes: Tuple[str, ...],
    ) -> Optional[Dict[str, float]]:
        """Average the per-model probabilities for one market, keyed by ``outcomes``."""
        models = self.models.get(market)
        if not self._loaded or not models:
            return None

        X = self._build_feature_array(features)
        n_classes = len(outcomes)
        tag = market.upper()

        probs_list = []
        for label, model in models.items():
            proba = self._predict_with_model(model, X, f"{tag}/{label}", expected_classes=n_classes)
            if proba is not None and len(proba) == n_classes:
                probs_list.append(proba)

        if not probs_list:
            return None

        # Ensemble average
        avg = np.mean(probs_list, axis=0)
        return {name: float(p) for name, p in zip(outcomes, avg)}

    def predict_ms(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict Match Score probabilities (Home/Draw/Away).

        Returns dict with keys: home, draw, away — or None when models are
        not loaded or no model produced a usable prediction.
        """
        return self._predict_market("ms", features, self._OUTCOMES["ms"])

    def predict_ou25(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict Over/Under 2.5 probabilities.

        Returns dict with keys: under, over — or None when models are not
        loaded or no model produced a usable prediction.
        """
        return self._predict_market("ou25", features, self._OUTCOMES["ou25"])

    def predict_all(self, features: Dict[str, float]) -> Dict[str, Optional[Dict[str, float]]]:
        """Run predictions for all supported markets."""
        return {
            "ms": self.predict_ms(features),
            "ou25": self.predict_ou25(features),
        }
|
|
|
|
|
|
def compute_divergence(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
    default_prob: Optional[float] = None,
) -> Dict[str, float]:
    """
    Compute the divergence signal between V25 (odds-aware) and V27 (odds-free).

    Positive divergence = V27 thinks it's MORE likely than the market → VALUE BET
    Negative divergence = V27 thinks it's LESS likely than the market → PASS

    Args:
        v25_probs: V25 probability per outcome key.
        v27_probs: V27 probability per outcome key; its keys define the
            outcomes that are reported.
        default_prob: Probability assumed for an outcome missing from (or
            None in) ``v25_probs``. Defaults to a uniform prior over the
            outcomes in ``v27_probs`` (1/3 for a three-way market, 1/2 for
            a two-way one) rather than a fixed value, so a missing V25
            entry does not fabricate a signal in two-outcome markets.

    Returns:
        Per-outcome divergence ``v27 - v25``, rounded to 4 decimals.
    """
    if default_prob is None:
        default_prob = 1.0 / len(v27_probs) if v27_probs else 0.0

    divergence = {}
    for key, v27_val in v27_probs.items():
        v25_val = v25_probs.get(key)
        if v25_val is None:
            v25_val = default_prob
        divergence[key] = round(v27_val - v25_val, 4)
    return divergence
|
|
|
|
|
|
def compute_value_edge(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
    odds: Dict[str, float],
    default_prob: Optional[float] = None,
    min_edge: float = 0.05,
    min_divergence: float = 0.02,
) -> Dict[str, Dict]:
    """
    Detect value bets by combining V25/V27 divergence with odds.

    A value bet exists when:
    1. V27 (odds-free) probability > implied odds probability (model says it's underpriced)
    2. V27 and V25 divergence is positive (V27 sees more signal than the market)

    Args:
        v25_probs: V25 probability per outcome key.
        v27_probs: V27 probability per outcome key; its keys define the
            outcomes that are reported.
        odds: Decimal odds per outcome key. Missing, None, or odds not
            above 1.01 are treated as "no odds": implied probability and
            edge are reported as 0.0 and the outcome is never a value bet.
        default_prob: Probability assumed for an outcome missing from (or
            None in) ``v25_probs``. Defaults to a uniform prior over the
            outcomes in ``v27_probs``, so a missing V25 entry does not
            create a spurious positive divergence in two-outcome markets.
        min_edge: Edge that must be exceeded for a value bet (default 5%).
        min_divergence: Divergence that must be exceeded (default 2%).

    Returns:
        Per outcome: { v27_prob, v25_prob, implied_prob, divergence, edge,
        is_value }, with numeric fields rounded to 4 decimals.
    """
    if default_prob is None:
        default_prob = 1.0 / len(v27_probs) if v27_probs else 0.0

    results = {}
    for key, v27_p in v27_probs.items():
        v25_p = v25_probs.get(key)
        if v25_p is None:
            v25_p = default_prob

        # `or 0.0` maps a None odds entry to "no odds" instead of raising
        # on the comparison below.
        odds_val = odds.get(key) or 0.0

        implied_p = (1.0 / odds_val) if odds_val > 1.01 else 0.0
        divergence = v27_p - v25_p
        edge = v27_p - implied_p if implied_p > 0 else 0.0

        results[key] = {
            "v27_prob": round(v27_p, 4),
            "v25_prob": round(v25_p, 4),
            "implied_prob": round(implied_p, 4),
            "divergence": round(divergence, 4),
            "edge": round(edge, 4),
            "is_value": bool(edge > min_edge and divergence > min_divergence),
        }

    return results
|