v28
Deploy Iddaai Backend / build-and-deploy (push) Successful in 3m21s

This commit is contained in:
2026-04-24 23:46:28 +03:00
parent 3875f2a512
commit 9027cc9900
17 changed files with 4315 additions and 122 deletions
+413
View File
@@ -0,0 +1,413 @@
"""
Calibration Module for XGBoost Models
=====================================
Calibrates raw probabilities from XGBoost models using Isotonic Regression.
Ensures that a predicted probability of 70% actually corresponds to a 70% win rate.
Usage:
from ai_engine.models.calibration import Calibrator
calibrator = Calibrator()
calibrated_prob = calibrator.calibrate("ms", raw_prob)
# Training new calibration models:
calibrator.train_calibration(valid_df, market="ou25", prob_col="ou25_over_prob", actual_col="ou25_over_actual")
"""
import os
import pickle
import json
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
# Two directory levels above this file (the module docstring imports it as
# ai_engine.models.calibration, so this resolves to the ai_engine package root).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory where calibrator pickles and their metrics JSON files live.
CALIBRATION_DIR = os.path.join(AI_ENGINE_DIR, "models", "calibration")
# NOTE: import-time side effect — the directory is created as soon as this
# module is imported.
os.makedirs(CALIBRATION_DIR, exist_ok=True)
# Supported markets for calibration. Only markets listed here are looked up on
# disk when a Calibrator is constructed.
SUPPORTED_MARKETS = [
    "ms", # Match Result (1X2) - multi-class, calibrated per class
    "ms_home", # Standard Home win probability
    "ms_home_heavy_fav", # Context: home odds <= 1.40
    "ms_home_fav", # Context: 1.40 < home odds <= 1.80
    "ms_home_balanced", # Context: 1.80 < home odds <= 2.50
    "ms_home_underdog", # Context: home odds > 2.50
    "ms_draw", # Draw probability
    "ms_away", # Away win probability
    "ou15", # Over/Under 1.5
    "ou25", # Over/Under 2.5
    "ou35", # Over/Under 3.5
    "btts", # Both Teams to Score
    "ht_ft", # Half-Time/Full-Time
    "dc", # Double Chance
    "ht", # Half-Time Result
]
class CalibrationMetrics:
    """Quality metrics recorded for one market's calibration model."""

    def __init__(self):
        # All fields start at neutral defaults and are filled in by the trainer/loader.
        self.brier_score: float = 0.0
        self.calibration_error: float = 0.0
        self.sample_count: int = 0
        self.last_trained: str = ""
        self.mean_predicted: float = 0.0
        self.mean_actual: float = 0.0

    def to_dict(self) -> Dict:
        """Return a plain-dict snapshot; float metrics are rounded to 4 decimals."""
        snapshot = {}
        snapshot["brier_score"] = round(self.brier_score, 4)
        snapshot["calibration_error"] = round(self.calibration_error, 4)
        snapshot["sample_count"] = self.sample_count
        snapshot["last_trained"] = self.last_trained
        snapshot["mean_predicted"] = round(self.mean_predicted, 4)
        snapshot["mean_actual"] = round(self.mean_actual, 4)
        return snapshot
class Calibrator:
    """
    Probability calibration using Isotonic Regression.

    Isotonic Regression is a non-parametric method that fits a piecewise
    constant, monotonically non-decreasing function. It suits probability
    calibration because:
    1. It preserves ranking (ties may be introduced, order is never inverted)
    2. It doesn't assume a specific distribution shape
    3. It can correct systematic over/under-confidence

    Example:
        # Before calibration: model predicts 70% but actual win rate is 60%
        # After calibration: model predicts 70% -> calibrated to 60%

    Markets without a trained model fall back to a fixed shrinkage heuristic
    (see ``_heuristic_calibrate``).
    """

    # Market groups used by the heuristic fallback.
    _MS_MARKETS = (
        "ms", "ms_home", "ms_home_heavy_fav", "ms_home_fav",
        "ms_home_balanced", "ms_home_underdog", "ms_draw", "ms_away",
    )
    _BINARY_MARKETS = ("ou15", "ou25", "ou35", "btts")
    _HIGH_VARIANCE_MARKETS = ("ht_ft", "ht")

    def __init__(self):
        """Create an empty calibrator and load any trained models found on disk."""
        self.calibrators: Dict[str, IsotonicRegression] = {}
        self.metrics: Dict[str, CalibrationMetrics] = {}
        # Per-market shrinkage factor used when no trained model is available.
        self.heuristic_fallback: Dict[str, float] = {
            "ms": 0.90,
            "ms_home": 0.90,
            "ms_home_heavy_fav": 0.95,
            "ms_home_fav": 0.90,
            "ms_home_balanced": 0.85,
            "ms_home_underdog": 0.80,
            "ms_draw": 0.90,
            "ms_away": 0.90,
            "ou15": 0.90,
            "ou25": 0.90,
            "ou35": 0.90,
            "btts": 0.90,
            "ht_ft": 0.85,
            "dc": 0.93,
            "ht": 0.85,
        }
        self._load_calibrators()

    def _load_calibrators(self):
        """Load trained calibrators and their metrics for each supported market.

        Missing or unreadable files are tolerated: a warning is printed and the
        market simply stays on the heuristic fallback.
        """
        for market in SUPPORTED_MARKETS:
            model_path = os.path.join(CALIBRATION_DIR, f"{market}_calibrator.pkl")
            metrics_path = os.path.join(CALIBRATION_DIR, f"{market}_metrics.json")

            if os.path.exists(model_path):
                try:
                    with open(model_path, "rb") as f:
                        # SECURITY: pickle executes arbitrary code on load. These
                        # files must only ever be produced by train_calibration().
                        self.calibrators[market] = pickle.load(f)
                    print(f"[Calibrator] Loaded calibration model for {market}")
                except Exception as e:
                    print(f"[Calibrator] Warning: Failed to load {market}: {e}")

            if os.path.exists(metrics_path):
                try:
                    with open(metrics_path, "r") as f:
                        data = json.load(f)
                    metrics = CalibrationMetrics()
                    metrics.brier_score = data.get("brier_score", 0.0)
                    metrics.calibration_error = data.get("calibration_error", 0.0)
                    metrics.sample_count = data.get("sample_count", 0)
                    metrics.last_trained = data.get("last_trained", "")
                    metrics.mean_predicted = data.get("mean_predicted", 0.0)
                    metrics.mean_actual = data.get("mean_actual", 0.0)
                    self.metrics[market] = metrics
                except Exception as e:
                    print(f"[Calibrator] Warning: Failed to load metrics for {market}: {e}")

    def calibrate(self, market_type: str, raw_prob: float, odds_val: Optional[float] = None) -> float:
        """
        Calibrate a raw probability.

        Args:
            market_type (str): 'ms_home', 'ou25', 'btts', 'ht_ft', etc.
                Case-insensitive; '-' is treated as '_'.
            raw_prob (float): The raw model probability (0.0 - 1.0)
            odds_val (float, optional): Pre-match odds. Only used for 'ms_home',
                where it selects an odds-bucket calibrator if one is trained.

        Returns:
            float: Calibrated probability. Isotonic output is clipped to
            [0.01, 0.99]; the heuristic fallback is returned unclipped.
        """
        # Normalize market type
        market_key = market_type.lower().replace("-", "_")

        # Route ms_home to its odds bucket, but only if that bucket has a model;
        # otherwise keep using the plain ms_home calibrator/heuristic.
        if market_key == "ms_home" and odds_val is not None and odds_val > 1.0:
            if odds_val <= 1.40:
                bucket_key = "ms_home_heavy_fav"
            elif odds_val <= 1.80:
                bucket_key = "ms_home_fav"
            elif odds_val <= 2.50:
                bucket_key = "ms_home_balanced"
            else:
                bucket_key = "ms_home_underdog"
            if bucket_key in self.calibrators:
                market_key = bucket_key

        # If we have a trained Isotonic Regression model, use it
        if market_key in self.calibrators:
            try:
                calibrated = self.calibrators[market_key].predict([raw_prob])[0]
                # Keep the output strictly inside (0, 1)
                return float(np.clip(calibrated, 0.01, 0.99))
            except Exception as e:
                print(f"[Calibrator] Warning: Isotonic failed for {market_key}: {e}")
                # Fall through to heuristic

        # Fallback to heuristic calibration
        return self._heuristic_calibrate(market_key, raw_prob)

    def _heuristic_calibrate(self, market_type: str, raw_prob: float) -> float:
        """
        Heuristic calibration fallback when no trained model exists.

        Applies a conservative shrinkage:
        - Match-result markets: shrink towards 0.33
        - Binary markets (OU, BTTS): shrink towards 0.5
        - HT/FT and HT: scale down (no anchor) due to higher variance
        - Double chance: shrink towards 0.66
        Unknown markets are returned unchanged.
        """
        shrinkage = self.heuristic_fallback.get(market_type, 0.90)

        if market_type in self._MS_MARKETS:
            # Pull towards 0.33 (uniform for 3-class)
            return (raw_prob * shrinkage) + (0.33 * (1.0 - shrinkage))
        if market_type in self._BINARY_MARKETS:
            # Pull towards 0.5 (uniform for binary)
            return (raw_prob * shrinkage) + (0.5 * (1.0 - shrinkage))
        if market_type in self._HIGH_VARIANCE_MARKETS:
            # Stronger shrinkage for high-variance markets
            return raw_prob * shrinkage
        if market_type == "dc":
            # Double chance is more reliable
            return (raw_prob * shrinkage) + (0.66 * (1.0 - shrinkage))
        return raw_prob

    def train_calibration(
        self,
        df: pd.DataFrame,
        market: str,
        prob_col: str,
        actual_col: str,
        min_samples: int = 100,
        save: bool = True,
    ) -> "CalibrationMetrics":
        """
        Train an Isotonic Regression calibration model for a specific market.

        Args:
            df: DataFrame with predictions and actual outcomes
            market: Market identifier (e.g., 'ms_home', 'ou25', 'btts')
            prob_col: Column name for raw probabilities
            actual_col: Column name for actual outcomes (0 or 1)
            min_samples: Minimum samples required to train
            save: Whether to save the model to disk

        Returns:
            CalibrationMetrics with quality metrics. If there are fewer than
            ``min_samples`` usable rows, nothing is trained and only
            ``sample_count`` is populated.

        Note:
            Brier score and ECE are computed on the same rows the model was
            fitted on, so they are in-sample figures.
        """
        # Rows with a missing probability or outcome are dropped
        valid_df = df[[prob_col, actual_col]].dropna()
        n_samples = len(valid_df)

        if n_samples < min_samples:
            print(f"[Calibrator] Warning: Only {n_samples} samples for {market}, "
                  f"need at least {min_samples}")
            metrics = CalibrationMetrics()
            metrics.sample_count = n_samples
            return metrics

        # Extract arrays
        raw_probs = valid_df[prob_col].values
        actuals = valid_df[actual_col].values

        # Train Isotonic Regression; inputs outside the fitted range are clipped
        iso = IsotonicRegression(out_of_bounds="clip", increasing=True)
        iso.fit(raw_probs, actuals)

        # Calculate calibrated probabilities
        calibrated_probs = iso.predict(raw_probs)

        # Calculate metrics
        metrics = CalibrationMetrics()
        metrics.sample_count = n_samples
        metrics.last_trained = datetime.utcnow().isoformat()
        metrics.brier_score = brier_score_loss(actuals, calibrated_probs)
        metrics.mean_predicted = np.mean(raw_probs)
        metrics.mean_actual = np.mean(actuals)
        metrics.calibration_error = self._calculate_ece(
            calibrated_probs, actuals, n_bins=10
        )

        # Store in memory
        self.calibrators[market] = iso
        self.metrics[market] = metrics

        # Save to disk
        if save:
            self._save_calibration(market, iso, metrics)

        print(f"[Calibrator] Trained {market}: "
              f"Brier={metrics.brier_score:.4f}, "
              f"ECE={metrics.calibration_error:.4f}, "
              f"n={n_samples}")
        return metrics

    def train_all_markets(
        self,
        df: pd.DataFrame,
        market_config: Dict[str, Tuple[str, str]],
        min_samples: int = 100,
    ) -> "Dict[str, CalibrationMetrics]":
        """
        Train calibration models for multiple markets at once.

        Args:
            df: DataFrame with all predictions and outcomes
            market_config: Dict mapping market -> (prob_col, actual_col)
                e.g., {'ou25': ('ou25_over_prob', 'ou25_over_actual')}
            min_samples: Minimum samples per market

        Returns:
            Dict of market -> CalibrationMetrics. Markets whose training raised
            are reported on stdout and omitted from the result.
        """
        results = {}
        for market, (prob_col, actual_col) in market_config.items():
            print(f"\n[Calibrator] Training {market}...")
            try:
                results[market] = self.train_calibration(
                    df=df,
                    market=market,
                    prob_col=prob_col,
                    actual_col=actual_col,
                    min_samples=min_samples,
                    save=True,
                )
            except Exception as e:
                print(f"[Calibrator] Failed to train {market}: {e}")
        return results

    def _calculate_ece(
        self,
        probs: np.ndarray,
        actuals: np.ndarray,
        n_bins: int = 10
    ) -> float:
        """
        Calculate Expected Calibration Error (ECE).

        ECE = sum(|bin_accuracy - bin_confidence| * bin_weight)
        Lower is better. Perfect calibration = 0.

        Bins are equal-width over [0, 1]. Every bin is half-open
        ``[lo, hi)`` except the last, which is closed so that a probability
        of exactly 1.0 is counted. Empty input yields 0.0.
        """
        probs = np.asarray(probs, dtype=float)
        actuals = np.asarray(actuals, dtype=float)
        if probs.size == 0:
            return 0.0

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        ece = 0.0
        for i in range(n_bins):
            lower, upper = bin_boundaries[i], bin_boundaries[i + 1]
            if i == n_bins - 1:
                # Closed upper edge: isotonic output frequently hits exactly 1.0
                in_bin = (probs >= lower) & (probs <= upper)
            else:
                in_bin = (probs >= lower) & (probs < upper)
            prop_in_bin = np.mean(in_bin)
            if prop_in_bin > 0:
                accuracy_in_bin = np.mean(actuals[in_bin])
                avg_confidence_in_bin = np.mean(probs[in_bin])
                ece += np.abs(accuracy_in_bin - avg_confidence_in_bin) * prop_in_bin
        return float(ece)

    def _save_calibration(
        self,
        market: str,
        calibrator: "IsotonicRegression",
        metrics: "CalibrationMetrics"
    ):
        """Save calibration model (pickle) and metrics (JSON) to CALIBRATION_DIR."""
        # Save model
        model_path = os.path.join(CALIBRATION_DIR, f"{market}_calibrator.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(calibrator, f)

        # Save metrics
        metrics_path = os.path.join(CALIBRATION_DIR, f"{market}_metrics.json")
        with open(metrics_path, "w") as f:
            json.dump(metrics.to_dict(), f, indent=2)

        print(f"[Calibrator] Saved {market} to {CALIBRATION_DIR}")

    def get_calibration_report(self) -> Dict[str, Any]:
        """Generate a summary report of all calibration models.

        Returns a dict with the trained market names, the stored metrics per
        supported market, and the supported markets that have neither metrics
        nor a trained model (i.e. run on the heuristic only).
        """
        report = {
            "trained_markets": list(self.calibrators.keys()),
            "metrics": {},
            "heuristic_only": [],
        }
        for market in SUPPORTED_MARKETS:
            if market in self.metrics:
                report["metrics"][market] = self.metrics[market].to_dict()
            elif market not in self.calibrators:
                report["heuristic_only"].append(market)
        return report

    def get_calibrated_probabilities(
        self,
        market: str,
        raw_probs: np.ndarray
    ) -> np.ndarray:
        """
        Batch calibration for array of probabilities.

        Args:
            market: Market type
            raw_probs: Array of raw probabilities

        Returns:
            Array of calibrated probabilities (one ``calibrate`` call per
            element, without odds-based bucket routing).
        """
        return np.array([self.calibrate(market, p) for p in raw_probs])
# Module-wide Calibrator slot, filled lazily by get_calibrator()
_calibrator_instance: Optional[Calibrator] = None


def get_calibrator() -> Calibrator:
    """Return the shared Calibrator, building it on the first call."""
    global _calibrator_instance
    if _calibrator_instance is not None:
        return _calibrator_instance
    _calibrator_instance = Calibrator()
    return _calibrator_instance
File diff suppressed because it is too large Load Diff
+645
View File
@@ -0,0 +1,645 @@
"""
V25 Ensemble Predictor - NO TARGET LEAKAGE
===========================================
Multi-model ensemble for match prediction using XGBoost and LightGBM.
Features:
- 82 engineered features (NO target leakage)
- Market-specific models (MS, OU25, BTTS)
- Weighted ensemble predictions
- Value bet detection
"""
import os
import json
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import xgboost as xgb
import lightgbm as lgb
# CatBoost is optional
try:
from catboost import CatBoostClassifier
CATBOOST_AVAILABLE = True
except ImportError:
CatBoostClassifier = None
CATBOOST_AVAILABLE = False
# Paths
# Default location of the V25 model files: the 'v25' folder next to this module.
MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'v25')
@dataclass
class MarketPrediction:
    """One pick for one betting market, with optional value-bet details."""
    market_type: str
    pick: str
    probability: float
    confidence: float
    odds: float = 0.0
    is_value_bet: bool = False
    edge: float = 0.0

    def to_dict(self) -> dict:
        """Serialise to a dict; probability and edge become percentages (1 decimal)."""
        probability_pct = round(self.probability * 100, 1)
        edge_pct = round(self.edge * 100, 1)
        return dict(
            market_type=self.market_type,
            pick=self.pick,
            probability=probability_pct,
            confidence=round(self.confidence, 1),
            odds=self.odds,
            is_value_bet=self.is_value_bet,
            edge=edge_pct,
        )
@dataclass
class ValueBet:
    """A pick whose model probability exceeds the odds-implied probability."""
    market_type: str
    pick: str
    probability: float
    odds: float
    edge: float
    confidence: float

    def to_dict(self) -> dict:
        """Serialise to a dict; probability and edge become percentages (1 decimal)."""
        as_percent = lambda fraction: round(fraction * 100, 1)
        return dict(
            market_type=self.market_type,
            pick=self.pick,
            probability=as_percent(self.probability),
            odds=self.odds,
            edge=as_percent(self.edge),
            confidence=round(self.confidence, 1),
        )
@dataclass
class MatchPrediction:
    """All market predictions for one match, plus any detected value bets."""
    match_id: str
    home_team: str
    away_team: str
    # MS (1X2) predictions
    home_prob: float = 0.0
    draw_prob: float = 0.0
    away_prob: float = 0.0
    ms_pick: str = ''
    ms_confidence: float = 0.0
    # OU25 predictions
    over_prob: float = 0.0
    under_prob: float = 0.0
    ou25_pick: str = ''
    ou25_confidence: float = 0.0
    # BTTS predictions
    btts_yes_prob: float = 0.0
    btts_no_prob: float = 0.0
    btts_pick: str = ''
    btts_confidence: float = 0.0
    # Value bets
    value_bets: List[ValueBet] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialise to a nested dict; probabilities become percentages (1 decimal)."""
        def as_percent(fraction):
            return round(fraction * 100, 1)

        ms_section = {
            'home_prob': as_percent(self.home_prob),
            'draw_prob': as_percent(self.draw_prob),
            'away_prob': as_percent(self.away_prob),
            'pick': self.ms_pick,
            'confidence': round(self.ms_confidence, 1),
        }
        ou25_section = {
            'over_prob': as_percent(self.over_prob),
            'under_prob': as_percent(self.under_prob),
            'pick': self.ou25_pick,
            'confidence': round(self.ou25_confidence, 1),
        }
        btts_section = {
            'yes_prob': as_percent(self.btts_yes_prob),
            'no_prob': as_percent(self.btts_no_prob),
            'pick': self.btts_pick,
            'confidence': round(self.btts_confidence, 1),
        }
        return {
            'match_id': self.match_id,
            'home_team': self.home_team,
            'away_team': self.away_team,
            'ms': ms_section,
            'ou25': ou25_section,
            'btts': btts_section,
            'value_bets': [bet.to_dict() for bet in self.value_bets],
        }
class V25Predictor:
    """
    V25 Ensemble Predictor - NO TARGET LEAKAGE

    Uses market-specific XGBoost and LightGBM models.
    Each market (MS, OU25, BTTS, ...) has its own trained models, loaded
    lazily from ``models_dir`` on first prediction.
    """

    # Feature columns (82 features, NO target leakage)
    FEATURE_COLS = [
        # ELO Features (8)
        'home_overall_elo', 'away_overall_elo', 'elo_diff',
        'home_home_elo', 'away_away_elo',
        'home_form_elo', 'away_form_elo', 'form_elo_diff',
        # Form Features (12)
        'home_goals_avg', 'home_conceded_avg',
        'away_goals_avg', 'away_conceded_avg',
        'home_clean_sheet_rate', 'away_clean_sheet_rate',
        'home_scoring_rate', 'away_scoring_rate',
        'home_winning_streak', 'away_winning_streak',
        'home_unbeaten_streak', 'away_unbeaten_streak',
        # H2H Features (6)
        'h2h_total_matches', 'h2h_home_win_rate', 'h2h_draw_rate',
        'h2h_avg_goals', 'h2h_btts_rate', 'h2h_over25_rate',
        # Team Stats Features (8)
        'home_avg_possession', 'away_avg_possession',
        'home_avg_shots_on_target', 'away_avg_shots_on_target',
        'home_shot_conversion', 'away_shot_conversion',
        'home_avg_corners', 'away_avg_corners',
        # Odds Features (23)
        'odds_ms_h', 'odds_ms_d', 'odds_ms_a',
        'implied_home', 'implied_draw', 'implied_away',
        'odds_ht_ms_h', 'odds_ht_ms_d', 'odds_ht_ms_a',
        'odds_ou05_o', 'odds_ou05_u',
        'odds_ou15_o', 'odds_ou15_u',
        'odds_ou25_o', 'odds_ou25_u',
        'odds_ou35_o', 'odds_ou35_u',
        'odds_ht_ou05_o', 'odds_ht_ou05_u',
        'odds_ht_ou15_o', 'odds_ht_ou15_u',
        'odds_btts_y', 'odds_btts_n',
        # League Features (4)
        'home_xga', 'away_xga',
        'league_avg_goals', 'league_zero_goal_rate',
        # Upset Engine (4)
        'upset_atmosphere', 'upset_motivation', 'upset_fatigue', 'upset_potential',
        # Referee Engine (5)
        'referee_home_bias', 'referee_avg_goals', 'referee_cards_total',
        'referee_avg_yellow', 'referee_experience',
        # Momentum Engine (3)
        'home_momentum_score', 'away_momentum_score', 'momentum_diff',
        # Squad Features (9)
        'home_squad_quality', 'away_squad_quality', 'squad_diff',
        'home_key_players', 'away_key_players',
        'home_missing_impact', 'away_missing_impact',
        'home_goals_form', 'away_goals_form',
    ]

    # Model weights for ensemble
    DEFAULT_WEIGHTS = {
        'xgb': 0.50,
        'lgb': 0.50,
    }

    # Minimum (model probability - implied probability) to flag a value bet.
    _MIN_VALUE_EDGE = 0.05

    def __init__(self, models_dir: Optional[str] = None):
        """
        Initialize V25 Predictor.

        Args:
            models_dir: Directory containing model files. Defaults to v25/ directory.
        """
        self.models_dir = models_dir or MODELS_DIR
        self.models = {}  # market -> {'xgb': model, 'lgb': model}
        self._loaded = False

    # All trained market models available in V25
    ALL_MARKETS = [
        'ms', 'ou25', 'btts',  # Core markets
        'ou15', 'ou35',  # Additional OU lines
        'ht_result', 'ht_ou05', 'ht_ou15',  # HT markets
        'htft',  # HT/FT combo
        'cards_ou45',  # Cards market
        'handicap_ms',  # Handicap
        'odd_even',  # Odd/Even goals
    ]

    # Multi-class markets (output > 2 classes)
    MULTICLASS_MARKETS = {'ms', 'ht_result', 'htft', 'handicap_ms'}

    def load_models(self) -> bool:
        """Load all market-specific models from disk.

        Returns:
            True if at least one model file was loaded, False otherwise
            (including when loading raised; the error is printed).
        """
        try:
            loaded_count = 0
            for market in self.ALL_MARKETS:
                entry = {}

                # Load XGBoost (read content in Python to avoid non-ASCII path issues)
                xgb_path = os.path.join(self.models_dir, f'xgb_v25_{market}.json')
                if os.path.exists(xgb_path) and os.path.getsize(xgb_path) > 0:
                    with open(xgb_path, 'r', encoding='utf-8') as f:
                        xgb_content = f.read()
                    booster = xgb.Booster()
                    booster.load_model(bytearray(xgb_content, 'utf-8'))
                    entry['xgb'] = booster
                    loaded_count += 1

                # Load LightGBM (read content in Python to avoid non-ASCII path issues)
                lgb_path = os.path.join(self.models_dir, f'lgb_v25_{market}.txt')
                if os.path.exists(lgb_path) and os.path.getsize(lgb_path) > 0:
                    with open(lgb_path, 'r', encoding='utf-8') as f:
                        model_str = f.read()
                    entry['lgb'] = lgb.Booster(model_str=model_str)
                    loaded_count += 1

                # Only markets with at least one model are registered
                if entry:
                    self.models[market] = entry
                else:
                    self.models.pop(market, None)

            print(f"[V25] Loaded {loaded_count} model files across {len(self.models)} markets: {list(self.models.keys())}")
            self._loaded = loaded_count > 0
            return self._loaded
        except Exception as e:
            print(f"[ERROR] Error loading models: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _ensure_loaded(self):
        """Ensure models are loaded before prediction.

        Raises:
            RuntimeError: if no model could be loaded.
        """
        if not self._loaded and not self.load_models():
            raise RuntimeError("Failed to load V25 models")

    def _prepare_features(self, features: Dict[str, float]) -> pd.DataFrame:
        """Build a one-row DataFrame in FEATURE_COLS order; missing features become 0.0."""
        return pd.DataFrame([{col: features.get(col, 0.0) for col in self.FEATURE_COLS}])

    def predict_ms(self, features: Dict[str, float]) -> tuple:
        """
        Predict match result (1X2).

        Returns:
            (home_prob, draw_prob, away_prob), renormalised to sum to 1.
            Falls back to (0.33, 0.33, 0.33) when no MS model produced output.
        """
        self._ensure_loaded()
        X = self._prepare_features(features)
        ms_models = self.models.get('ms', {})
        weighted = []

        # XGBoost
        if 'xgb' in ms_models:
            xgb_proba = ms_models['xgb'].predict(xgb.DMatrix(X))
            if len(xgb_proba.shape) == 1:
                # A flat vector is treated as the class probabilities of the single row
                xgb_proba = np.array([xgb_proba])
            weighted.append(xgb_proba[0] * self.DEFAULT_WEIGHTS['xgb'])

        # LightGBM (only a 2-D rows x classes output is accepted)
        if 'lgb' in ms_models:
            lgb_proba = ms_models['lgb'].predict(X)
            if len(lgb_proba.shape) == 2:
                weighted.append(lgb_proba[0] * self.DEFAULT_WEIGHTS['lgb'])

        if not weighted:
            return 0.33, 0.33, 0.33

        ensemble_proba = np.sum(weighted, axis=0)
        ensemble_proba = ensemble_proba / ensemble_proba.sum()
        return float(ensemble_proba[0]), float(ensemble_proba[1]), float(ensemble_proba[2])

    def _predict_binary(self, market: str, features: Dict[str, float]) -> tuple:
        """Average the positive-class probability of a binary market's models.

        Returns (positive_prob, 1 - positive_prob), or (0.5, 0.5) when no
        model for ``market`` produced output.
        """
        self._ensure_loaded()
        X = self._prepare_features(features)
        market_models = self.models.get(market, {})
        probs = []

        # XGBoost: only a flat per-row output is accepted
        if 'xgb' in market_models:
            xgb_proba = market_models['xgb'].predict(xgb.DMatrix(X))
            if isinstance(xgb_proba, np.ndarray) and len(xgb_proba.shape) == 1:
                probs.append(xgb_proba[0])

        # LightGBM
        if 'lgb' in market_models:
            lgb_proba = market_models['lgb'].predict(X)
            if isinstance(lgb_proba, np.ndarray):
                probs.append(lgb_proba[0])

        if not probs:
            return 0.5, 0.5

        avg_prob = np.mean(probs)
        return float(avg_prob), float(1 - avg_prob)

    def predict_ou25(self, features: Dict[str, float]) -> tuple:
        """
        Predict Over/Under 2.5 goals.

        Returns:
            (over_prob, under_prob)
        """
        return self._predict_binary('ou25', features)

    def predict_btts(self, features: Dict[str, float]) -> tuple:
        """
        Predict Both Teams To Score.

        Returns:
            (yes_prob, no_prob)
        """
        return self._predict_binary('btts', features)

    def predict_market(self, market: str, features: Dict[str, float]) -> Optional[np.ndarray]:
        """
        Generic prediction for any loaded market.

        Args:
            market: Market key (e.g. 'ht_result', 'htft', 'cards_ou45')
            features: Feature dictionary.

        Returns:
            numpy array of probabilities, or None if the market is not loaded
            or no model produced output.
            For binary markets: [positive_prob]
            For multi-class markets: [class0_prob, class1_prob, ...]
        """
        self._ensure_loaded()
        if market not in self.models:
            return None

        X = self._prepare_features(features)
        is_multiclass = market in self.MULTICLASS_MARKETS
        # (probabilities, weight) pairs, so a weight can never be applied to
        # another model's output.
        weighted_probs = []

        def _collect(raw, weight):
            if not isinstance(raw, np.ndarray):
                return
            if is_multiclass and len(raw.shape) == 2:
                weighted_probs.append((raw[0], weight))
            elif is_multiclass and len(raw.shape) == 1:
                weighted_probs.append((raw, weight))
            else:
                weighted_probs.append((np.array([raw[0]]), weight))

        # XGBoost
        if 'xgb' in self.models[market]:
            _collect(self.models[market]['xgb'].predict(xgb.DMatrix(X)),
                     self.DEFAULT_WEIGHTS['xgb'])

        # LightGBM
        if 'lgb' in self.models[market]:
            _collect(self.models[market]['lgb'].predict(X),
                     self.DEFAULT_WEIGHTS['lgb'])

        if not weighted_probs:
            return None

        # A single model's output is returned as-is
        if len(weighted_probs) == 1:
            return weighted_probs[0][0]

        # Weighted average, accumulated in float64 regardless of model dtype
        total_w = sum(w for _, w in weighted_probs)
        result = np.zeros_like(weighted_probs[0][0], dtype=float)
        for p, w in weighted_probs:
            result += p * (w / total_w)

        # Normalize multi-class
        if is_multiclass and result.sum() > 0:
            result = result / result.sum()
        return result

    def has_market(self, market: str) -> bool:
        """Check if a specific market model is loaded."""
        return market in self.models

    def predict_match(
        self,
        match_id: str,
        home_team: str,
        away_team: str,
        features: Dict[str, float],
        odds: Optional[Dict[str, float]] = None,
    ) -> "MatchPrediction":
        """
        Predict all core markets (MS, OU25, BTTS) for a match.

        Args:
            match_id: Match identifier.
            home_team: Home team name.
            away_team: Away team name.
            features: Feature dictionary.
            odds: Optional odds dictionary for value bet detection
                (keys 'ms_h', 'ms_d', 'ms_a', 'ou25_o', 'ou25_u', 'btts_y', 'btts_n').

        Returns:
            MatchPrediction object. Confidence values are the picked
            outcome's probability expressed as a percentage.
        """
        # Get predictions for each market
        home_prob, draw_prob, away_prob = self.predict_ms(features)
        over_prob, under_prob = self.predict_ou25(features)
        btts_yes_prob, btts_no_prob = self.predict_btts(features)

        # Determine picks (highest probability wins; first key wins ties)
        ms_probs = {'1': home_prob, 'X': draw_prob, '2': away_prob}
        ms_pick = max(ms_probs, key=ms_probs.get)
        ms_confidence = ms_probs[ms_pick] * 100

        ou25_probs = {'Over': over_prob, 'Under': under_prob}
        ou25_pick = max(ou25_probs, key=ou25_probs.get)
        ou25_confidence = ou25_probs[ou25_pick] * 100

        btts_probs = {'Yes': btts_yes_prob, 'No': btts_no_prob}
        btts_pick = max(btts_probs, key=btts_probs.get)
        btts_confidence = btts_probs[btts_pick] * 100

        # Create prediction
        prediction = MatchPrediction(
            match_id=match_id,
            home_team=home_team,
            away_team=away_team,
            home_prob=home_prob,
            draw_prob=draw_prob,
            away_prob=away_prob,
            ms_pick=ms_pick,
            ms_confidence=ms_confidence,
            over_prob=over_prob,
            under_prob=under_prob,
            ou25_pick=ou25_pick,
            ou25_confidence=ou25_confidence,
            btts_yes_prob=btts_yes_prob,
            btts_no_prob=btts_no_prob,
            btts_pick=btts_pick,
            btts_confidence=btts_confidence,
        )

        # Detect value bets
        if odds:
            prediction.value_bets = self._detect_value_bets(
                prediction, odds, home_prob, draw_prob, away_prob,
                over_prob, under_prob, btts_yes_prob, btts_no_prob
            )
        return prediction

    def _detect_value_bets(
        self,
        prediction: "MatchPrediction",
        odds: Dict[str, float],
        home_prob: float,
        draw_prob: float,
        away_prob: float,
        over_prob: float,
        under_prob: float,
        btts_yes_prob: float,
        btts_no_prob: float,
    ) -> "List[ValueBet]":
        """Detect value bets based on model vs market odds.

        An outcome is a value bet when its model probability exceeds the
        odds-implied probability (1 / odds) by more than 5 percentage points.
        Outcomes whose odds are missing, None or non-positive are skipped.
        ``prediction`` is accepted for interface compatibility and not read.
        """
        # (odds key, market label, pick label, model probability)
        candidates = (
            ('ms_h', 'MS', '1', home_prob),
            ('ms_d', 'MS', 'X', draw_prob),
            ('ms_a', 'MS', '2', away_prob),
            ('ou25_o', 'OU25', 'Over', over_prob),
            ('ou25_u', 'OU25', 'Under', under_prob),
            ('btts_y', 'BTTS', 'Yes', btts_yes_prob),
            ('btts_n', 'BTTS', 'No', btts_no_prob),
        )
        min_edge = self._MIN_VALUE_EDGE  # 5% minimum edge

        value_bets = []
        for odds_key, market_type, pick, prob in candidates:
            price = odds.get(odds_key)
            if price is None or price <= 0:
                continue
            edge = prob - 1 / price
            if edge > min_edge:
                value_bets.append(ValueBet(
                    market_type=market_type,
                    pick=pick,
                    probability=prob,
                    odds=price,
                    edge=edge,
                    confidence=prob * 100,
                ))
        return value_bets
# Module-wide V25Predictor slot, filled lazily by get_v25_predictor()
_v25_predictor: Optional[V25Predictor] = None


def get_v25_predictor() -> V25Predictor:
    """Return the shared V25Predictor, creating and loading it on the first call."""
    global _v25_predictor
    if _v25_predictor is not None:
        return _v25_predictor
    predictor = V25Predictor()
    _v25_predictor = predictor
    # The load result is not checked here
    predictor.load_models()
    return _v25_predictor
+291
View File
@@ -0,0 +1,291 @@
"""
V27 Pro Predictor — Odds-Free Fundamentals + Value Edge Detection
This module loads V27 ensemble models (XGBoost, LightGBM, CatBoost)
and produces market-independent probability estimates.
The key insight: V27 is trained WITHOUT odds features, so it produces
"true" probabilities unbiased by market pricing. The divergence between
V25 (odds-aware) and V27 (odds-free) predictions signals market mispricing.
"""
import json
import logging
import os
import pickle
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Directory holding the V27 artefacts: the "v27" folder next to this module.
V27_DIR = Path(__file__).parent / "v27"
class V27Predictor:
    """
    Loads the V27 ensemble models and produces predictions from the
    odds-free feature vector described by ``v27_feature_cols.json``.
    """

    MARKETS = ["ms", "ou25"]

    # Model kinds looked up per market (file name: v27_<market>_<kind>.pkl).
    _MODEL_KINDS = ("xgb", "lgb", "cb")

    # Legacy, market-agnostic file names. These are match-result models, so
    # they are only ever used as a fallback for the "ms" market.
    _GENERIC_MODEL_FILES = {
        "xgb": "v27_xgboost.pkl",
        "lgb": "v27_lightgbm.pkl",
        "cb": "v27_catboost.pkl",
    }

    def __init__(self):
        self.models: Dict[str, Dict[str, object]] = {}
        self.feature_cols: List[str] = []
        self._loaded = False

    def load_models(self) -> bool:
        """Load all V27 ensemble models and feature column spec.

        Returns:
            True if the feature column file and at least one model were
            loaded (or models were already loaded), False otherwise.
        """
        if self._loaded:
            return True

        # Feature columns
        cols_path = V27_DIR / "v27_feature_cols.json"
        if not cols_path.exists():
            logger.error("[V27] Feature columns file not found: %s", cols_path)
            return False
        try:
            with open(cols_path, "r", encoding="utf-8") as f:
                self.feature_cols = json.load(f)
            logger.info("[V27] Loaded %d feature columns", len(self.feature_cols))
        except Exception as e:
            logger.error("[V27] Failed to load feature columns: %s", e)
            return False

        # Load models per market
        for market in self.MARKETS:
            self.models[market] = {}
            for kind in self._MODEL_KINDS:
                # Try market-specific file first: v27_ms_xgb.pkl
                path = V27_DIR / f"v27_{market}_{kind}.pkl"
                if not path.exists() and market == "ms":
                    # Fallback to generic: v27_xgboost.pkl (for MS only).
                    # Using it for another market would load a model with the
                    # wrong number of classes.
                    path = V27_DIR / self._GENERIC_MODEL_FILES[kind]
                if not path.exists():
                    logger.warning("[V27] Model file not found for %s/%s", market, kind)
                    continue
                try:
                    with open(path, "rb") as f:
                        # SECURITY: pickle executes arbitrary code on load; the
                        # v27 directory must only contain trusted artefacts.
                        model = pickle.load(f)
                    self.models[market][kind] = model
                    logger.info("[V27] ✓ Loaded %s/%s from %s", market, kind, path.name)
                except Exception as e:
                    logger.error("[V27] ✗ Failed to load %s/%s: %s", market, kind, e)

        loaded_count = sum(len(v) for v in self.models.values())
        if loaded_count == 0:
            logger.error("[V27] No models loaded!")
            return False

        self._loaded = True
        logger.info("[V27] Total models loaded: %d across %d markets", loaded_count, len(self.models))
        return True

    def _build_feature_array(self, features: Dict[str, float]) -> np.ndarray:
        """
        Build a (1, n_features) array ordered like ``self.feature_cols``.

        Features that are absent from the dict, or present with a None value,
        are filled with 0.0. Keys not listed in ``feature_cols`` are ignored.
        """
        row = []
        for col in self.feature_cols:
            value = features.get(col)
            row.append(0.0 if value is None else float(value))
        return np.array([row])

    @staticmethod
    def _coerce_proba(raw, expected_classes: int) -> Optional[np.ndarray]:
        """Normalise a raw booster output for one row into a class-probability vector.

        Accepts a (rows, classes) matrix, a flat positive-class probability for
        binary markets (expanded to [1 - p, p]), or a flat vector that already
        has ``expected_classes`` entries. Anything else yields None.
        """
        if not isinstance(raw, np.ndarray):
            return None
        if raw.ndim == 2 and raw.shape[1] == expected_classes:
            return raw[0]
        if raw.ndim == 1 and expected_classes == 2:
            p = float(raw[0])
            return np.array([1.0 - p, p])
        if raw.ndim == 1 and len(raw) == expected_classes:
            return raw
        return None

    def _predict_with_model(self, model, X: np.ndarray, label: str, expected_classes: int) -> Optional[np.ndarray]:
        """
        Predict class probabilities for the single row in ``X``.

        Handles sklearn-style wrappers (``predict_proba``), raw
        xgboost/lightgbm Boosters and, as a last resort, any object with a
        ``predict`` method. Returns None when the model fails or its output
        does not match ``expected_classes``.
        """
        # 1. sklearn-style wrappers. If predict_proba exists it is authoritative:
        #    on these objects predict() returns class labels, which must never be
        #    interpreted as probabilities, so there is no fall-through.
        if hasattr(model, 'predict_proba'):
            try:
                proba = model.predict_proba(X)[0]
            except Exception as e:
                logger.warning("[V27] %s predict_proba failed: %s", label, e)
                return None
            if len(proba) == expected_classes:
                return proba
            logger.warning("[V27] %s predict_proba returned %d classes, expected %d", label, len(proba), expected_classes)
            return None

        # Booster libraries are imported lazily and are optional: a model can
        # only be a Booster if its library is installed.
        try:
            import xgboost as xgb
        except ImportError:
            xgb = None
        try:
            import lightgbm as lgbm
        except ImportError:
            lgbm = None

        # 2. Raw xgboost.Booster — MUST pass feature_names to match training
        if xgb is not None and isinstance(model, xgb.Booster):
            try:
                feature_names = self.feature_cols if self.feature_cols else None
                dmat = xgb.DMatrix(X, feature_names=feature_names)
                return self._coerce_proba(model.predict(dmat), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s xgb.Booster predict failed: %s", label, e)
                return None

        # 3. Raw lightgbm.Booster — pass as DataFrame with column names
        if lgbm is not None and isinstance(model, lgbm.Booster):
            try:
                if self.feature_cols:
                    import pandas as pd
                    raw = model.predict(pd.DataFrame(X, columns=self.feature_cols))
                else:
                    raw = model.predict(X)
                return self._coerce_proba(raw, expected_classes)
            except Exception as e:
                logger.warning("[V27] %s lgb.Booster predict failed: %s", label, e)
                return None

        # 4. Generic fallback for objects that only expose predict()
        if hasattr(model, 'predict'):
            try:
                return self._coerce_proba(model.predict(X), expected_classes)
            except Exception as e:
                logger.warning("[V27] %s generic predict failed: %s", label, e)
        return None

    def _ensemble_average(self, market: str, tag: str, features: Dict[str, float],
                          expected_classes: int) -> Optional[np.ndarray]:
        """Average the probability vectors of every usable model for ``market``.

        Returns None when models are not loaded, the market has no models, or
        no model produced a valid vector.
        """
        if not self._loaded or market not in self.models or not self.models[market]:
            return None
        X = self._build_feature_array(features)
        probs_list = []
        for label, model in self.models[market].items():
            proba = self._predict_with_model(model, X, f"{tag}/{label}", expected_classes=expected_classes)
            if proba is not None and len(proba) == expected_classes:
                probs_list.append(proba)
        if not probs_list:
            return None
        return np.mean(probs_list, axis=0)

    def predict_ms(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict Match Result (1X2) probabilities.

        Returns dict with keys: home, draw, away — or None if unavailable.
        """
        avg = self._ensemble_average("ms", "MS", features, 3)
        if avg is None:
            return None
        return {
            "home": float(avg[0]),
            "draw": float(avg[1]),
            "away": float(avg[2]),
        }

    def predict_ou25(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
        """
        Predict Over/Under 2.5 probabilities.

        Returns dict with keys: under, over — or None if unavailable.
        """
        avg = self._ensemble_average("ou25", "OU25", features, 2)
        if avg is None:
            return None
        return {
            "under": float(avg[0]),
            "over": float(avg[1]),
        }

    def predict_all(self, features: Dict[str, float]) -> Dict[str, Optional[Dict[str, float]]]:
        """Run predictions for all supported markets."""
        return {
            "ms": self.predict_ms(features),
            "ou25": self.predict_ou25(features),
        }
def compute_divergence(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
) -> Dict[str, float]:
    """
    Per-outcome difference between V27 (odds-free) and V25 (odds-aware)
    probabilities, rounded to 4 decimals.

    Positive value = V27 rates the outcome higher than V25.
    Negative value = V27 rates the outcome lower than V25.
    Only outcomes present in ``v27_probs`` are reported; an outcome missing
    from ``v25_probs`` is compared against 0.33.
    """
    return {
        outcome: round(v27_probs.get(outcome, 0.33) - v25_probs.get(outcome, 0.33), 4)
        for outcome in v27_probs
    }
def compute_value_edge(
    v25_probs: Dict[str, float],
    v27_probs: Dict[str, float],
    odds: Dict[str, float],
) -> Dict[str, Dict]:
    """
    Flag value bets by combining the V27/V25 divergence with bookmaker odds.

    For every outcome in ``v27_probs`` the result holds:
    v27_prob, v25_prob, implied_prob, divergence, edge, is_value.
    - implied_prob is 1 / odds, or 0.0 when odds are missing or <= 1.01
    - edge is v27_prob - implied_prob (0.0 when there is no implied_prob)
    - is_value requires edge > 5% and divergence > 2%
    An outcome missing from ``v25_probs`` is compared against 0.33.
    """
    report = {}
    for outcome, model_prob in v27_probs.items():
        reference_prob = v25_probs.get(outcome, 0.33)
        price = odds.get(outcome, 0.0)

        if price > 1.01:
            market_prob = 1.0 / price
        else:
            market_prob = 0.0

        gap = model_prob - reference_prob
        if market_prob > 0:
            advantage = model_prob - market_prob
        else:
            advantage = 0.0

        report[outcome] = {
            "v27_prob": round(model_prob, 4),
            "v25_prob": round(reference_prob, 4),
            "implied_prob": round(market_prob, 4),
            "divergence": round(gap, 4),
            "edge": round(advantage, 4),
            "is_value": advantage > 0.05 and gap > 0.02,  # 5% edge + 2% divergence
        }
    return report