Files
iddaai-be/ai-engine/models/v25_ensemble.py
T
fahricansecer b6d64b59bf
Deploy Iddaai Backend / build-and-deploy (push) Failing after 2m6s
main
2026-05-12 02:43:02 +03:00

675 lines
23 KiB
Python

"""
V25 Ensemble Predictor - NO TARGET LEAKAGE
===========================================
Multi-model ensemble for match prediction using XGBoost and LightGBM.
Features:
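- Engineered features; the column list is read from feature_cols.json (NO target leakage)
- Market-specific models (MS, OU25, BTTS, plus the extra OU, half-time, HT/FT, cards, handicap and odd/even markets)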
- Weighted ensemble predictions
- Value bet detection
"""
import os
import json
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
import xgboost as xgb
import lightgbm as lgb
# CatBoost is optional
try:
from catboost import CatBoostClassifier
CATBOOST_AVAILABLE = True
except ImportError:
CatBoostClassifier = None
CATBOOST_AVAILABLE = False
# Paths
MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'v25')
@dataclass
class MarketPrediction:
    """Model output for one betting market of a match.

    ``probability`` and ``edge`` are multiplied by 100 when exported by
    :meth:`to_dict`; ``confidence`` is exported as stored, only rounded.
    """

    market_type: str
    pick: str
    probability: float
    confidence: float
    odds: float = 0.0
    is_value_bet: bool = False
    edge: float = 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict view with numbers rounded to one decimal."""
        probability_pct = round(self.probability * 100, 1)
        edge_pct = round(self.edge * 100, 1)
        payload = dict(
            market_type=self.market_type,
            pick=self.pick,
            probability=probability_pct,
            confidence=round(self.confidence, 1),
            odds=self.odds,
            is_value_bet=self.is_value_bet,
            edge=edge_pct,
        )
        return payload
@dataclass
class ValueBet:
    """One pick whose model probability beats the bookmaker's implied one.

    ``probability`` and ``edge`` are multiplied by 100 when exported by
    :meth:`to_dict`; ``confidence`` is exported as stored, only rounded.
    """

    market_type: str
    pick: str
    probability: float
    odds: float
    edge: float
    confidence: float

    def to_dict(self) -> dict:
        """Return a plain-dict view with numbers rounded to one decimal."""
        exported = {}
        exported['market_type'] = self.market_type
        exported['pick'] = self.pick
        exported['probability'] = round(self.probability * 100, 1)
        exported['odds'] = self.odds
        exported['edge'] = round(self.edge * 100, 1)
        exported['confidence'] = round(self.confidence, 1)
        return exported
@dataclass
class MatchPrediction:
    """All market predictions produced for one match, plus its value bets.

    Probabilities are multiplied by 100 when exported by :meth:`to_dict`;
    the ``*_confidence`` fields are exported as stored, only rounded.
    """

    match_id: str
    home_team: str
    away_team: str
    # Match result (1X2) market
    home_prob: float = 0.0
    draw_prob: float = 0.0
    away_prob: float = 0.0
    ms_pick: str = ''
    ms_confidence: float = 0.0
    # Over/Under 2.5 goals market
    over_prob: float = 0.0
    under_prob: float = 0.0
    ou25_pick: str = ''
    ou25_confidence: float = 0.0
    # Both-teams-to-score market
    btts_yes_prob: float = 0.0
    btts_no_prob: float = 0.0
    btts_pick: str = ''
    btts_confidence: float = 0.0
    # Value bets detected for this match (empty unless filled by the caller)
    value_bets: List[ValueBet] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a nested plain-dict view, one sub-dict per market."""

        def as_percent(fraction: float) -> float:
            return round(fraction * 100, 1)

        match_result = {
            'home_prob': as_percent(self.home_prob),
            'draw_prob': as_percent(self.draw_prob),
            'away_prob': as_percent(self.away_prob),
            'pick': self.ms_pick,
            'confidence': round(self.ms_confidence, 1),
        }
        over_under = {
            'over_prob': as_percent(self.over_prob),
            'under_prob': as_percent(self.under_prob),
            'pick': self.ou25_pick,
            'confidence': round(self.ou25_confidence, 1),
        }
        both_score = {
            'yes_prob': as_percent(self.btts_yes_prob),
            'no_prob': as_percent(self.btts_no_prob),
            'pick': self.btts_pick,
            'confidence': round(self.btts_confidence, 1),
        }
        serialised_bets = []
        for bet in self.value_bets:
            serialised_bets.append(bet.to_dict())
        return {
            'match_id': self.match_id,
            'home_team': self.home_team,
            'away_team': self.away_team,
            'ms': match_result,
            'ou25': over_under,
            'btts': both_score,
            'value_bets': serialised_bets,
        }
class V25Predictor:
    """
    V25 Ensemble Predictor - NO TARGET LEAKAGE

    Uses market-specific XGBoost and LightGBM models.
    Each market (MS, OU25, BTTS, ...) has its own trained models, and the
    per-model outputs are combined into one estimate per market.
    """
    # Feature columns — loaded dynamically from feature_cols.json to stay
    # in sync with the trained models. The hardcoded list below is only a
    # fallback in case the JSON file is missing.
    # The list order is the column order of the frame handed to the models.
    # The group sizes noted below add up to 102 names.
    _FALLBACK_FEATURE_COLS = [
        # ELO Features (8)
        'home_overall_elo', 'away_overall_elo', 'elo_diff',
        'home_home_elo', 'away_away_elo',
        'home_form_elo', 'away_form_elo', 'form_elo_diff',
        # Form Features (12)
        'home_goals_avg', 'home_conceded_avg',
        'away_goals_avg', 'away_conceded_avg',
        'home_clean_sheet_rate', 'away_clean_sheet_rate',
        'home_scoring_rate', 'away_scoring_rate',
        'home_winning_streak', 'away_winning_streak',
        'home_unbeaten_streak', 'away_unbeaten_streak',
        # H2H Features (6)
        'h2h_total_matches', 'h2h_home_win_rate', 'h2h_draw_rate',
        'h2h_avg_goals', 'h2h_btts_rate', 'h2h_over25_rate',
        # Team Stats Features (8)
        'home_avg_possession', 'away_avg_possession',
        'home_avg_shots_on_target', 'away_avg_shots_on_target',
        'home_shot_conversion', 'away_shot_conversion',
        'home_avg_corners', 'away_avg_corners',
        # Odds Features (23)
        'odds_ms_h', 'odds_ms_d', 'odds_ms_a',
        'implied_home', 'implied_draw', 'implied_away',
        'odds_ht_ms_h', 'odds_ht_ms_d', 'odds_ht_ms_a',
        'odds_ou05_o', 'odds_ou05_u',
        'odds_ou15_o', 'odds_ou15_u',
        'odds_ou25_o', 'odds_ou25_u',
        'odds_ou35_o', 'odds_ou35_u',
        'odds_ht_ou05_o', 'odds_ht_ou05_u',
        'odds_ht_ou15_o', 'odds_ht_ou15_u',
        'odds_btts_y', 'odds_btts_n',
        # Odds Presence Flags (20)
        'odds_ms_h_present', 'odds_ms_d_present', 'odds_ms_a_present',
        'odds_ht_ms_h_present', 'odds_ht_ms_d_present', 'odds_ht_ms_a_present',
        'odds_ou05_o_present', 'odds_ou05_u_present',
        'odds_ou15_o_present', 'odds_ou15_u_present',
        'odds_ou25_o_present', 'odds_ou25_u_present',
        'odds_ou35_o_present', 'odds_ou35_u_present',
        'odds_ht_ou05_o_present', 'odds_ht_ou05_u_present',
        'odds_ht_ou15_o_present', 'odds_ht_ou15_u_present',
        'odds_btts_y_present', 'odds_btts_n_present',
        # League Features (4)
        'home_xga', 'away_xga',
        'league_avg_goals', 'league_zero_goal_rate',
        # Upset Engine (4)
        'upset_atmosphere', 'upset_motivation', 'upset_fatigue', 'upset_potential',
        # Referee Engine (5)
        'referee_home_bias', 'referee_avg_goals', 'referee_cards_total',
        'referee_avg_yellow', 'referee_experience',
        # Momentum Engine (3)
        'home_momentum_score', 'away_momentum_score', 'momentum_diff',
        # Squad Features (9)
        'home_squad_quality', 'away_squad_quality', 'squad_diff',
        'home_key_players', 'away_key_players',
        'home_missing_impact', 'away_missing_impact',
        'home_goals_form', 'away_goals_form',
    ]
@staticmethod
def _load_feature_cols(models_dir: Optional[str] = None) -> list:
    """Load feature columns from feature_cols.json, falling back to hardcoded list.

    Args:
        models_dir: Directory expected to hold ``feature_cols.json``.
            Defaults to the module-level ``MODELS_DIR`` so existing
            zero-argument calls keep working.

    Returns:
        The list stored in the JSON file when it exists and holds a
        non-empty list; otherwise a copy of ``_FALLBACK_FEATURE_COLS``.
    """
    feature_json = os.path.join(models_dir or MODELS_DIR, 'feature_cols.json')
    try:
        if os.path.exists(feature_json):
            with open(feature_json, 'r', encoding='utf-8') as f:
                cols = json.load(f)
            if isinstance(cols, list) and len(cols) > 0:
                print(f"[V25] Loaded {len(cols)} feature columns from feature_cols.json")
                return cols
    except Exception as e:
        # Best effort: an unreadable or malformed file must not stop the
        # predictor from starting, so any failure falls through to the
        # built-in list.
        print(f"[V25] Warning: could not load feature_cols.json: {e}")
    print(f"[V25] Using fallback feature columns ({len(V25Predictor._FALLBACK_FEATURE_COLS)} features)")
    # Hand out a copy so a caller that edits its column list cannot
    # change the class-level fallback shared by every instance.
    return list(V25Predictor._FALLBACK_FEATURE_COLS)

# Model weights for ensemble
DEFAULT_WEIGHTS = {
    'xgb': 0.50,
    'lgb': 0.50,
}

def __init__(self, models_dir: Optional[str] = None):
    """
    Initialize V25 Predictor.

    No model file is read here; models are loaded by ``load_models()``.

    Args:
        models_dir: Directory containing model files. Defaults to v25/ directory.
    """
    self.models_dir = models_dir or MODELS_DIR
    self.models = {}  # market -> {'xgb': model, 'lgb': model}
    self._loaded = False
    # Read the column list from the same directory the models come from,
    # so a custom models_dir is paired with its own feature_cols.json.
    self.FEATURE_COLS = self._load_feature_cols(self.models_dir)
# All trained market models available in V25.
# load_models() looks for xgb_v25_<market>.json and lgb_v25_<market>.txt
# for every key listed here; a key with no non-empty model file on disk
# ends up absent from self.models.
ALL_MARKETS = [
    'ms', 'ou25', 'btts',  # Core markets
    'ou15', 'ou35',  # Additional OU lines
    'ht_result', 'ht_ou05', 'ht_ou15',  # HT markets
    'htft',  # HT/FT combo
    'cards_ou45',  # Cards market
    'handicap_ms',  # Handicap
    'odd_even',  # Odd/Even goals
]
# Multi-class markets (output > 2 classes).
# predict_market() treats every market not listed here as binary and
# returns a single positive-class probability for it.
MULTICLASS_MARKETS = {'ms', 'ht_result', 'htft', 'handicap_ms'}
def load_models(self) -> bool:
    """Load all market-specific models from disk.

    For every key in ``ALL_MARKETS`` this looks in ``self.models_dir``
    for ``xgb_v25_<market>.json`` and ``lgb_v25_<market>.txt``; missing
    or empty files are skipped.

    The load is all-or-nothing: models are staged in a local dict and
    only committed to ``self.models`` once every file has loaded. If any
    file fails, ``self.models`` and ``self._loaded`` are left exactly as
    they were, so a failed reload cannot destroy a working model set or
    leave half-populated market entries behind.

    Returns:
        True when at least one model file was loaded, False when none
        was found or when loading raised (the error is printed).
    """
    staged = {}
    loaded_count = 0
    try:
        for market in self.ALL_MARKETS:
            market_models = {}

            # Load XGBoost (read content in Python to avoid non-ASCII path issues)
            xgb_path = os.path.join(self.models_dir, f'xgb_v25_{market}.json')
            if os.path.exists(xgb_path) and os.path.getsize(xgb_path) > 0:
                with open(xgb_path, 'r', encoding='utf-8') as f:
                    xgb_content = f.read()
                booster = xgb.Booster()
                booster.load_model(bytearray(xgb_content, 'utf-8'))
                market_models['xgb'] = booster
                loaded_count += 1

            # Load LightGBM (read content in Python to avoid non-ASCII path issues)
            lgb_path = os.path.join(self.models_dir, f'lgb_v25_{market}.txt')
            if os.path.exists(lgb_path) and os.path.getsize(lgb_path) > 0:
                with open(lgb_path, 'r', encoding='utf-8') as f:
                    model_str = f.read()
                market_models['lgb'] = lgb.Booster(model_str=model_str)
                loaded_count += 1

            # Markets with no model file at all are not registered
            if market_models:
                staged[market] = market_models
    except Exception as e:
        print(f"[ERROR] Error loading models: {e}")
        import traceback
        traceback.print_exc()
        return False

    # Commit in place (the dict object itself is kept) now that nothing
    # can fail any more: refresh loaded markets, drop those without files.
    for market in self.ALL_MARKETS:
        if market in staged:
            self.models[market] = staged[market]
        else:
            self.models.pop(market, None)

    print(f"[V25] Loaded {loaded_count} model files across {len(self.models)} markets: {list(self.models.keys())}")
    self._loaded = loaded_count > 0
    return self._loaded
def _ensure_loaded(self):
    """Load the models on first use.

    Raises:
        RuntimeError: if the models are not loaded yet and
            ``load_models()`` reports failure.
    """
    if self._loaded:
        return
    if self.load_models():
        return
    raise RuntimeError("Failed to load V25 models")
def _prepare_features(self, features: Dict[str, float]) -> pd.DataFrame:
    """Build the single-row frame handed to the models.

    Columns follow ``self.FEATURE_COLS``; a feature missing from
    ``features`` is filled with 0.0 and keys not listed in
    ``self.FEATURE_COLS`` are ignored.
    """
    row = {}
    for column in self.FEATURE_COLS:
        row[column] = features.get(column, 0.0)
    return pd.DataFrame([row])
def predict_ms(self, features: Dict[str, float]) -> tuple:
    """
    Predict match result (1X2).

    Each loaded 'ms' model contributes its three class probabilities
    scaled by its ``DEFAULT_WEIGHTS`` entry; the weighted sum is then
    normalised so the three values add up to 1.

    A model output is accepted either as a 2-D array (first row used) or
    as a 1-D array of the three classes, for XGBoost and LightGBM alike.
    Outputs that do not hold exactly three values are ignored.

    Returns:
        (home_prob, draw_prob, away_prob). A uniform (1/3, 1/3, 1/3) is
        returned when no model produced a usable output or when the
        combined output cannot be normalised (zero or non-finite sum).
    """
    self._ensure_loaded()
    X = self._prepare_features(features)
    ms_models = self.models.get('ms', {})
    uniform = (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)

    def _class_row(raw):
        """Return the three class probabilities of the single input row, or None."""
        arr = np.asarray(raw, dtype=np.float64)
        if arr.ndim == 2 and arr.shape[0] > 0:
            arr = arr[0]
        if arr.ndim != 1 or arr.size != 3:
            return None
        return arr

    weighted = []

    # XGBoost
    if 'xgb' in ms_models:
        row = _class_row(ms_models['xgb'].predict(xgb.DMatrix(X)))
        if row is not None:
            weighted.append(row * self.DEFAULT_WEIGHTS['xgb'])

    # LightGBM
    if 'lgb' in ms_models:
        row = _class_row(ms_models['lgb'].predict(X))
        if row is not None:
            weighted.append(row * self.DEFAULT_WEIGHTS['lgb'])

    if not weighted:
        return uniform

    ensemble_proba = np.sum(weighted, axis=0)
    total = float(ensemble_proba.sum())
    # Dividing by a zero or non-finite sum would hand NaNs to the caller
    if not np.isfinite(total) or total <= 0:
        return uniform
    ensemble_proba = ensemble_proba / total
    return float(ensemble_proba[0]), float(ensemble_proba[1]), float(ensemble_proba[2])
def predict_ou25(self, features: Dict[str, float]) -> tuple:
    """
    Predict Over/Under 2.5 goals.

    The first value of each loaded 'ou25' model's output is used as that
    model's estimate, and the estimates are averaged using
    ``DEFAULT_WEIGHTS`` (the same weights the other predictors apply).
    With the default equal weights this equals a plain mean.

    Returns:
        (over_prob, under_prob), where under_prob is 1 - over_prob.
        (0.5, 0.5) when no model produced a usable output.
    """
    self._ensure_loaded()
    X = self._prepare_features(features)
    market_models = self.models.get('ou25', {})
    estimates = []  # (probability, weight) pairs

    # XGBoost: only a flat, non-empty output array is accepted
    if 'xgb' in market_models:
        raw = market_models['xgb'].predict(xgb.DMatrix(X))
        if isinstance(raw, np.ndarray) and raw.ndim == 1 and raw.size > 0:
            estimates.append((float(raw[0]), self.DEFAULT_WEIGHTS['xgb']))

    # LightGBM
    if 'lgb' in market_models:
        raw = market_models['lgb'].predict(X)
        if isinstance(raw, np.ndarray) and raw.size > 0:
            estimates.append((float(np.ravel(raw)[0]), self.DEFAULT_WEIGHTS['lgb']))

    if not estimates:
        return 0.5, 0.5

    total_weight = sum(weight for _, weight in estimates)
    if total_weight > 0:
        avg_prob = sum(prob * weight for prob, weight in estimates) / total_weight
    else:
        # Unusable weights: fall back to a plain mean instead of dividing by zero
        avg_prob = sum(prob for prob, _ in estimates) / len(estimates)
    return float(avg_prob), float(1 - avg_prob)
def predict_btts(self, features: Dict[str, float]) -> tuple:
    """
    Predict Both Teams To Score.

    The first value of each loaded 'btts' model's output is used as that
    model's estimate, and the estimates are averaged using
    ``DEFAULT_WEIGHTS`` (the same weights the other predictors apply).
    With the default equal weights this equals a plain mean.

    Returns:
        (yes_prob, no_prob), where no_prob is 1 - yes_prob.
        (0.5, 0.5) when no model produced a usable output.
    """
    self._ensure_loaded()
    X = self._prepare_features(features)
    market_models = self.models.get('btts', {})
    estimates = []  # (probability, weight) pairs

    # XGBoost: only a flat, non-empty output array is accepted
    if 'xgb' in market_models:
        raw = market_models['xgb'].predict(xgb.DMatrix(X))
        if isinstance(raw, np.ndarray) and raw.ndim == 1 and raw.size > 0:
            estimates.append((float(raw[0]), self.DEFAULT_WEIGHTS['xgb']))

    # LightGBM
    if 'lgb' in market_models:
        raw = market_models['lgb'].predict(X)
        if isinstance(raw, np.ndarray) and raw.size > 0:
            estimates.append((float(np.ravel(raw)[0]), self.DEFAULT_WEIGHTS['lgb']))

    if not estimates:
        return 0.5, 0.5

    total_weight = sum(weight for _, weight in estimates)
    if total_weight > 0:
        avg_prob = sum(prob * weight for prob, weight in estimates) / total_weight
    else:
        # Unusable weights: fall back to a plain mean instead of dividing by zero
        avg_prob = sum(prob for prob, _ in estimates) / len(estimates)
    return float(avg_prob), float(1 - avg_prob)
def predict_market(self, market: str, features: Dict[str, float]) -> Optional[np.ndarray]:
    """
    Generic prediction for any loaded market.

    Each loaded model's output is reduced to one float64 vector and kept
    together with its ``DEFAULT_WEIGHTS`` entry, so a model whose output
    is unusable can never shift the weights of the others. Accumulation
    is done in float64 regardless of the dtypes the models return.

    Args:
        market: Market key (e.g. 'ht_result', 'htft', 'cards_ou45')
        features: Feature dictionary.

    Returns:
        numpy array of probabilities, or None when the market is not
        loaded or no model produced a usable output.
        For binary markets: [positive_prob]
        For multi-class markets: [class0_prob, class1_prob, ...]
    """
    self._ensure_loaded()
    if market not in self.models:
        return None

    X = self._prepare_features(features)
    market_models = self.models[market]
    is_multiclass = market in self.MULTICLASS_MARKETS

    def _to_vector(raw):
        """Reduce a raw model output to the vector for the single input row."""
        if not isinstance(raw, np.ndarray) or raw.size == 0:
            return None
        arr = raw.astype(np.float64)
        if is_multiclass:
            # 2-D output: one row per sample; 1-D output: already the class vector
            return arr[0].ravel() if arr.ndim >= 2 else arr.ravel()
        # Binary market: keep only the first value
        return arr.ravel()[:1]

    contributions = []  # (vector, weight) pairs

    # XGBoost
    if 'xgb' in market_models:
        vector = _to_vector(market_models['xgb'].predict(xgb.DMatrix(X)))
        if vector is not None:
            contributions.append((vector, self.DEFAULT_WEIGHTS['xgb']))

    # LightGBM
    if 'lgb' in market_models:
        vector = _to_vector(market_models['lgb'].predict(X))
        if vector is not None:
            contributions.append((vector, self.DEFAULT_WEIGHTS['lgb']))

    if not contributions:
        return None
    if len(contributions) == 1:
        return contributions[0][0]

    # Weighted average
    total_w = sum(weight for _, weight in contributions)
    if total_w <= 0:
        # Unusable weights: average evenly instead of dividing by zero
        contributions = [(vector, 1.0) for vector, _ in contributions]
        total_w = float(len(contributions))
    result = np.zeros_like(contributions[0][0])
    for vector, weight in contributions:
        result += vector * (weight / total_w)

    # Normalize multi-class
    if is_multiclass and result.sum() > 0:
        result = result / result.sum()
    return result
def has_market(self, market: str) -> bool:
    """Tell whether ``market`` currently has an entry in ``self.models``."""
    loaded_markets = self.models
    return market in loaded_markets
def predict_match(
    self,
    match_id: str,
    home_team: str,
    away_team: str,
    features: Dict[str, float],
    odds: Optional[Dict[str, float]] = None,
) -> MatchPrediction:
    """
    Predict all markets for a match.

    Runs the MS, OU25 and BTTS predictors, takes the most probable
    outcome of each market as its pick (confidence = that probability
    times 100) and, when ``odds`` is non-empty, attaches value bets.

    Args:
        match_id: Match identifier.
        home_team: Home team name.
        away_team: Away team name.
        features: Feature dictionary.
        odds: Optional odds dictionary for value bet detection.

    Returns:
        MatchPrediction object.
    """

    def _top_pick(options):
        """Return (label, probability * 100) of the most probable option."""
        label = max(options, key=options.get)
        return label, options[label] * 100

    # Per-market probabilities
    p_home, p_draw, p_away = self.predict_ms(features)
    p_over, p_under = self.predict_ou25(features)
    p_yes, p_no = self.predict_btts(features)

    # Picks and their confidence
    ms_pick, ms_confidence = _top_pick({'1': p_home, 'X': p_draw, '2': p_away})
    ou25_pick, ou25_confidence = _top_pick({'Over': p_over, 'Under': p_under})
    btts_pick, btts_confidence = _top_pick({'Yes': p_yes, 'No': p_no})

    result = MatchPrediction(
        match_id=match_id,
        home_team=home_team,
        away_team=away_team,
        home_prob=p_home,
        draw_prob=p_draw,
        away_prob=p_away,
        ms_pick=ms_pick,
        ms_confidence=ms_confidence,
        over_prob=p_over,
        under_prob=p_under,
        ou25_pick=ou25_pick,
        ou25_confidence=ou25_confidence,
        btts_yes_prob=p_yes,
        btts_no_prob=p_no,
        btts_pick=btts_pick,
        btts_confidence=btts_confidence,
    )

    # Value bets need bookmaker odds; without them the list stays empty
    if not odds:
        return result
    result.value_bets = self._detect_value_bets(
        result, odds, p_home, p_draw, p_away,
        p_over, p_under, p_yes, p_no
    )
    return result
def _detect_value_bets(
    self,
    prediction: MatchPrediction,
    odds: Dict[str, float],
    home_prob: float,
    draw_prob: float,
    away_prob: float,
    over_prob: float,
    under_prob: float,
    btts_yes_prob: float,
    btts_no_prob: float,
    min_edge: float = 0.05,
) -> List[ValueBet]:
    """Detect value bets based on model vs market odds.

    For every outcome whose odds are present and positive, the implied
    probability is 1 / odds and the edge is the model probability minus
    that. An outcome is reported when its edge is strictly greater than
    ``min_edge``.

    Args:
        prediction: The match prediction being analysed (not read here;
            kept for interface compatibility).
        odds: Odds keyed by 'ms_h', 'ms_d', 'ms_a', 'ou25_o', 'ou25_u',
            'btts_y', 'btts_n'. Missing keys and None values are skipped.
        home_prob, draw_prob, away_prob: Model probabilities for MS.
        over_prob, under_prob: Model probabilities for OU25.
        btts_yes_prob, btts_no_prob: Model probabilities for BTTS.
        min_edge: Minimum edge required to report a bet (default 5%).

    Returns:
        ValueBet objects in the order MS 1/X/2, OU25 Over/Under,
        BTTS Yes/No.
    """
    # (market_type, pick, odds key, model probability), in output order
    candidates = (
        ('MS', '1', 'ms_h', home_prob),
        ('MS', 'X', 'ms_d', draw_prob),
        ('MS', '2', 'ms_a', away_prob),
        ('OU25', 'Over', 'ou25_o', over_prob),
        ('OU25', 'Under', 'ou25_u', under_prob),
        ('BTTS', 'Yes', 'btts_y', btts_yes_prob),
        ('BTTS', 'No', 'btts_n', btts_no_prob),
    )

    value_bets = []
    for market_type, pick, odds_key, probability in candidates:
        price = odds.get(odds_key)
        # A None price means the outcome is not offered; comparing it
        # with 0 would raise TypeError and abort the whole prediction.
        if price is None or not price > 0:
            continue
        edge = probability - 1 / price
        if edge > min_edge:
            value_bets.append(ValueBet(
                market_type=market_type,
                pick=pick,
                probability=probability,
                odds=price,
                edge=edge,
                confidence=probability * 100,
            ))
    return value_bets
# Process-wide predictor, created on the first get_v25_predictor() call
_v25_predictor: Optional[V25Predictor] = None


def get_v25_predictor() -> V25Predictor:
    """Return the shared V25 predictor, creating it on first use.

    On creation ``load_models()`` is called once and its result is not
    checked here.
    """
    global _v25_predictor
    if _v25_predictor is not None:
        return _v25_predictor
    _v25_predictor = V25Predictor()
    _v25_predictor.load_models()
    return _v25_predictor