This commit is contained in:
2026-05-10 10:37:45 +03:00
parent 4f7090e2d9
commit c525b12dfd
32 changed files with 2374 additions and 209 deletions
+73 -4
View File
@@ -1,17 +1,19 @@
import os
import json
import yaml
from typing import Dict, Any, Optional
class EnsembleConfig:
_instance: Optional['EnsembleConfig'] = None
_config: Dict[str, Any] = {}
def __new__(cls):
if cls._instance is None:
cls._instance = super(EnsembleConfig, cls).__new__(cls)
cls._instance._load_config()
return cls._instance
def _load_config(self):
"""Load configuration from YAML file."""
config_path = os.path.join(os.path.dirname(__file__), 'ensemble_config.yaml')
@@ -22,12 +24,12 @@ class EnsembleConfig:
except Exception as e:
print(f"❌ Failed to load ensemble config: {e}")
self._config = {}
def get(self, key: str, default: Any = None) -> Any:
"""Get configuration value by key (supports dot notation for nested keys)."""
keys = key.split('.')
value = self._config
try:
for k in keys:
value = value[k]
@@ -35,12 +37,79 @@ class EnsembleConfig:
except (KeyError, TypeError):
return default
# Singleton accessor
def get_config() -> EnsembleConfig:
return EnsembleConfig()
# ── Market Thresholds Loader ────────────────────────────────────────────
_market_thresholds_cache: Optional[Dict[str, Any]] = None
def load_market_thresholds() -> Dict[str, Any]:
"""
Load market thresholds from JSON config file.
Returns the full config dict with 'markets' and 'defaults' keys.
Caches after first load for performance.
"""
global _market_thresholds_cache
if _market_thresholds_cache is not None:
return _market_thresholds_cache
config_path = os.path.join(os.path.dirname(__file__), 'market_thresholds.json')
try:
with open(config_path, 'r', encoding='utf-8') as f:
data = json.load(f)
_market_thresholds_cache = data
print(f"✅ Market thresholds loaded: {len(data.get('markets', {}))} markets (v={data.get('_meta', {}).get('version', '?')})")
return data
except Exception as e:
print(f"❌ Failed to load market thresholds: {e} — using built-in defaults")
_market_thresholds_cache = {"markets": {}, "defaults": {
"calibration": 0.55,
"min_conf": 55.0,
"min_play_score": 68.0,
"min_edge": 0.02,
"odds_band_min_sample": 0.0,
"odds_band_min_edge": 0.0,
}}
return _market_thresholds_cache
def build_threshold_dict(field: str) -> Dict[str, float]:
"""
Build a flat {market: value} dict for a specific threshold field.
Usage:
calibration_map = build_threshold_dict("calibration")
# → {"MS": 0.62, "DC": 0.82, ...}
"""
data = load_market_thresholds()
markets = data.get("markets", {})
result: Dict[str, float] = {}
for market, cfg in markets.items():
if field in cfg:
result[market] = float(cfg[field])
return result
def get_threshold_default(field: str) -> float:
"""Get the default fallback value for a threshold field."""
data = load_market_thresholds()
defaults = data.get("defaults", {})
return float(defaults.get(field, 0.0))
if __name__ == "__main__":
# Test
cfg = get_config()
print(f"Weights: {cfg.get('engine_weights')}")
print(f"Team Weight: {cfg.get('engine_weights.team')}")
print()
print("--- Market Thresholds ---")
for field in ["calibration", "min_conf", "min_play_score", "min_edge"]:
d = build_threshold_dict(field)
print(f"{field}: {d}")
print(f"Default calibration: {get_threshold_default('calibration')}")
+115
View File
@@ -0,0 +1,115 @@
{
"_meta": {
"version": "v34",
"description": "Market-specific thresholds for the betting engine pipeline — V34 odds-aware gate fix",
"rule": "max_reachable (100 × calibration) MUST be > min_conf + 8",
"updated_at": "2026-05-10",
"changelog": "V34: Reduced min_edge to realistic levels for odds-aware V25 model. Model output ≈ market-implied, so large EV edges are mathematically impossible."
},
"markets": {
"MS": {
"calibration": 0.62,
"min_conf": 20.0,
"min_play_score": 28.0,
"min_edge": 0.005,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"DC": {
"calibration": 0.82,
"min_conf": 40.0,
"min_play_score": 50.0,
"min_edge": 0.003,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"OU15": {
"calibration": 0.84,
"min_conf": 45.0,
"min_play_score": 50.0,
"min_edge": 0.003,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"OU25": {
"calibration": 0.68,
"min_conf": 30.0,
"min_play_score": 40.0,
"min_edge": 0.005,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"OU35": {
"calibration": 0.60,
"min_conf": 20.0,
"min_play_score": 30.0,
"min_edge": 0.008,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.008
},
"BTTS": {
"calibration": 0.65,
"min_conf": 30.0,
"min_play_score": 40.0,
"min_edge": 0.005,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"HT": {
"calibration": 0.58,
"min_conf": 20.0,
"min_play_score": 28.0,
"min_edge": 0.01,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.008
},
"HT_OU05": {
"calibration": 0.68,
"min_conf": 35.0,
"min_play_score": 42.0,
"min_edge": 0.005,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.005
},
"HT_OU15": {
"calibration": 0.60,
"min_conf": 25.0,
"min_play_score": 32.0,
"min_edge": 0.008,
"odds_band_min_sample": 8.0,
"odds_band_min_edge": 0.008
},
"OE": {
"calibration": 0.62,
"min_conf": 35.0,
"min_play_score": 32.0,
"min_edge": 0.005
},
"CARDS": {
"calibration": 0.58,
"min_conf": 30.0,
"min_play_score": 35.0,
"min_edge": 0.008
},
"HCAP": {
"calibration": 0.56,
"min_conf": 25.0,
"min_play_score": 30.0,
"min_edge": 0.015
},
"HTFT": {
"calibration": 0.45,
"min_conf": 10.0,
"min_play_score": 18.0,
"min_edge": 0.02
}
},
"defaults": {
"calibration": 0.55,
"min_conf": 55.0,
"min_play_score": 60.0,
"min_edge": 0.008,
"odds_band_min_sample": 0.0,
"odds_band_min_edge": 0.0
}
}
+23 -7
View File
@@ -59,7 +59,7 @@ def fetch_matches(conn, sport: str):
def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
"""Bulk upsert a batch of (match_id, home_elo, away_elo) into sport-partitioned ai_features table."""
"""Bulk upsert ELO features into sport-partitioned ai_features table."""
if not rows or dry_run:
return
@@ -70,19 +70,27 @@ def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
f"""
INSERT INTO {table_name}
(match_id, home_elo, away_elo,
home_home_elo, away_away_elo,
home_form_elo, away_form_elo,
elo_diff,
home_form_score, away_form_score,
missing_players_impact, calculator_ver, updated_at)
VALUES %s
ON CONFLICT (match_id) DO UPDATE SET
home_elo = EXCLUDED.home_elo,
away_elo = EXCLUDED.away_elo,
home_home_elo = EXCLUDED.home_home_elo,
away_away_elo = EXCLUDED.away_away_elo,
home_form_elo = EXCLUDED.home_form_elo,
away_form_elo = EXCLUDED.away_form_elo,
elo_diff = EXCLUDED.elo_diff,
home_form_score = EXCLUDED.home_form_score,
away_form_score = EXCLUDED.away_form_score,
calculator_ver = EXCLUDED.calculator_ver,
updated_at = EXCLUDED.updated_at
""",
rows,
template="(%s, %s, %s, %s, %s, 0.0, %s, NOW())",
template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 0.0, %s, NOW())",
page_size=500,
)
conn.commit()
@@ -136,16 +144,24 @@ def backfill(sport: str, batch_size: int, dry_run: bool):
if not home_id or not away_id:
continue
# Snapshot PRE-match ELO
# Snapshot PRE-match ELO (all dimensions)
home_rating = elo.get_or_create_rating(home_id, h_name or "")
away_rating = elo.get_or_create_rating(away_id, a_name or "")
h_overall = round(home_rating.overall_elo, 2)
a_overall = round(away_rating.overall_elo, 2)
feature_buf.append((
match_id,
round(home_rating.overall_elo, 2),
round(away_rating.overall_elo, 2),
round(form_to_score(home_rating.recent_form), 2),
round(form_to_score(away_rating.recent_form), 2),
h_overall, # home_elo
a_overall, # away_elo
round(home_rating.home_elo, 2), # home_home_elo
round(away_rating.away_elo, 2), # away_away_elo
round(home_rating.form_elo, 2), # home_form_elo
round(away_rating.form_elo, 2), # away_form_elo
round(h_overall - a_overall, 2), # elo_diff
round(form_to_score(home_rating.recent_form), 2), # home_form_score
round(form_to_score(away_rating.recent_form), 2), # away_form_score
CALCULATOR_VER,
))
+507
View File
@@ -0,0 +1,507 @@
"""
V25 Pro Model Trainer — Optuna + Isotonic Calibration
=====================================================
Combines V25's 83 features + 12 markets + temporal split
with Optuna hyperparameter tuning and Isotonic Regression calibration.
Usage:
python scripts/train_v25_pro.py
python scripts/train_v25_pro.py --markets MS,OU25,BTTS # specific markets
python scripts/train_v25_pro.py --trials 30 # fewer trials
"""
import os
import sys
import json
import pickle
import argparse
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from datetime import datetime
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator, ClassifierMixin
optuna.logging.set_verbosity(optuna.logging.WARNING)
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
# ─── Feature Columns (83 features, NO target leakage) ───────────────
FEATURES = [
# ELO (8)
"home_overall_elo", "away_overall_elo", "elo_diff",
"home_home_elo", "away_away_elo",
"home_form_elo", "away_form_elo", "form_elo_diff",
# Form (12)
"home_goals_avg", "home_conceded_avg",
"away_goals_avg", "away_conceded_avg",
"home_clean_sheet_rate", "away_clean_sheet_rate",
"home_scoring_rate", "away_scoring_rate",
"home_winning_streak", "away_winning_streak",
"home_unbeaten_streak", "away_unbeaten_streak",
# H2H (6)
"h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
"h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
# Team Stats (8)
"home_avg_possession", "away_avg_possession",
"home_avg_shots_on_target", "away_avg_shots_on_target",
"home_shot_conversion", "away_shot_conversion",
"home_avg_corners", "away_avg_corners",
# Odds (24 + 20 presence flags)
"odds_ms_h", "odds_ms_d", "odds_ms_a",
"implied_home", "implied_draw", "implied_away",
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
"odds_ou05_o", "odds_ou05_u",
"odds_ou15_o", "odds_ou15_u",
"odds_ou25_o", "odds_ou25_u",
"odds_ou35_o", "odds_ou35_u",
"odds_ht_ou05_o", "odds_ht_ou05_u",
"odds_ht_ou15_o", "odds_ht_ou15_u",
"odds_btts_y", "odds_btts_n",
"odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
"odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
"odds_ou05_o_present", "odds_ou05_u_present",
"odds_ou15_o_present", "odds_ou15_u_present",
"odds_ou25_o_present", "odds_ou25_u_present",
"odds_ou35_o_present", "odds_ou35_u_present",
"odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
"odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
"odds_btts_y_present", "odds_btts_n_present",
# League (4)
"home_xga", "away_xga",
"league_avg_goals", "league_zero_goal_rate",
# Upset Engine (4)
"upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
# Referee Engine (5)
"referee_home_bias", "referee_avg_goals", "referee_cards_total",
"referee_avg_yellow", "referee_experience",
# Momentum (3)
"home_momentum_score", "away_momentum_score", "momentum_diff",
# Squad (9)
"home_squad_quality", "away_squad_quality", "squad_diff",
"home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form",
]
MARKET_CONFIGS = [
{"target": "label_ms", "name": "MS", "num_class": 3},
{"target": "label_ou15", "name": "OU15", "num_class": 2},
{"target": "label_ou25", "name": "OU25", "num_class": 2},
{"target": "label_ou35", "name": "OU35", "num_class": 2},
{"target": "label_btts", "name": "BTTS", "num_class": 2},
{"target": "label_ht_result", "name": "HT_RESULT", "num_class": 3},
{"target": "label_ht_ou05", "name": "HT_OU05", "num_class": 2},
{"target": "label_ht_ou15", "name": "HT_OU15", "num_class": 2},
{"target": "label_ht_ft", "name": "HTFT", "num_class": 9},
{"target": "label_odd_even", "name": "ODD_EVEN", "num_class": 2},
{"target": "label_cards_ou45", "name": "CARDS_OU45", "num_class": 2},
{"target": "label_handicap_ms", "name": "HANDICAP_MS", "num_class": 3},
]
def load_data():
"""Load and prepare training data."""
if not os.path.exists(DATA_PATH):
print(f"[ERROR] Data not found: {DATA_PATH}")
sys.exit(1)
print(f"[INFO] Loading {DATA_PATH}...")
df = pd.read_csv(DATA_PATH)
for col in FEATURES:
if col in df.columns:
df[col] = df[col].fillna(0)
# Derive odds presence flags for older CSVs
odds_flag_sources = {
"odds_ms_h_present": "odds_ms_h", "odds_ms_d_present": "odds_ms_d",
"odds_ms_a_present": "odds_ms_a", "odds_ht_ms_h_present": "odds_ht_ms_h",
"odds_ht_ms_d_present": "odds_ht_ms_d", "odds_ht_ms_a_present": "odds_ht_ms_a",
"odds_ou05_o_present": "odds_ou05_o", "odds_ou05_u_present": "odds_ou05_u",
"odds_ou15_o_present": "odds_ou15_o", "odds_ou15_u_present": "odds_ou15_u",
"odds_ou25_o_present": "odds_ou25_o", "odds_ou25_u_present": "odds_ou25_u",
"odds_ou35_o_present": "odds_ou35_o", "odds_ou35_u_present": "odds_ou35_u",
"odds_ht_ou05_o_present": "odds_ht_ou05_o", "odds_ht_ou05_u_present": "odds_ht_ou05_u",
"odds_ht_ou15_o_present": "odds_ht_ou15_o", "odds_ht_ou15_u_present": "odds_ht_ou15_u",
"odds_btts_y_present": "odds_btts_y", "odds_btts_n_present": "odds_btts_n",
}
for flag_col, odds_col in odds_flag_sources.items():
if flag_col not in df.columns:
df[flag_col] = (
pd.to_numeric(df.get(odds_col, 0), errors="coerce").fillna(0) > 1.01
).astype(float)
print(f"[INFO] Shape: {df.shape}, Features: {len(FEATURES)}")
return df
def temporal_split_4way(valid_df: pd.DataFrame):
"""Chronological 60/15/10/15 split: train/val/cal/test."""
ordered = valid_df.sort_values("mst_utc").reset_index(drop=True)
n = len(ordered)
i1 = int(n * 0.60)
i2 = int(n * 0.75)
i3 = int(n * 0.85)
train = ordered.iloc[:i1].copy()
val = ordered.iloc[i1:i2].copy()
cal = ordered.iloc[i2:i3].copy()
test = ordered.iloc[i3:].copy()
return train, val, cal, test
# ─── XGBoost Wrapper for sklearn CalibratedClassifierCV ─────────────
class XGBWrapper(BaseEstimator, ClassifierMixin):
"""Thin sklearn-compatible wrapper around xgb.train for Isotonic calibration."""
def __init__(self, params, num_boost_round=500):
self.params = params
self.num_boost_round = num_boost_round
self.model_ = None
self.classes_ = None
def fit(self, X, y, **kwargs):
self.classes_ = np.unique(y)
dtrain = xgb.DMatrix(X, label=y)
self.model_ = xgb.train(self.params, dtrain, num_boost_round=self.num_boost_round)
return self
def predict_proba(self, X):
dm = xgb.DMatrix(X)
probs = self.model_.predict(dm)
if len(probs.shape) == 1:
probs = np.column_stack([1 - probs, probs])
return probs
def predict(self, X):
return np.argmax(self.predict_proba(X), axis=1)
# ─── Optuna Objectives ──────────────────────────────────────────────
def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
params = {
"objective": "multi:softprob" if num_class > 2 else "binary:logistic",
"eval_metric": "mlogloss" if num_class > 2 else "logloss",
"max_depth": trial.suggest_int("max_depth", 3, 8),
"eta": trial.suggest_float("eta", 0.01, 0.15, log=True),
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
"min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
"gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
"n_jobs": 4,
"random_state": 42,
}
if num_class > 2:
params["num_class"] = num_class
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
model = xgb.train(
params, dtrain, num_boost_round=1000,
evals=[(dval, "val")], early_stopping_rounds=50, verbose_eval=False,
)
preds = model.predict(dval)
if len(preds.shape) == 1:
preds = np.column_stack([1 - preds, preds])
return log_loss(y_val, preds)
def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):
params = {
"objective": "multiclass" if num_class > 2 else "binary",
"metric": "multi_logloss" if num_class > 2 else "binary_logloss",
"max_depth": trial.suggest_int("max_depth", 3, 8),
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
"feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
"min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
"lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
"lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
"n_jobs": 4, "random_state": 42, "verbose": -1,
}
if num_class > 2:
params["num_class"] = num_class
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
model = lgb.train(
params, train_data, num_boost_round=1000,
valid_sets=[val_data], valid_names=["val"],
callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
)
preds = model.predict(X_val, num_iteration=model.best_iteration)
if len(preds.shape) == 1:
preds = np.column_stack([1 - preds, preds])
return log_loss(y_val, preds)
# ─── Main Training Pipeline ─────────────────────────────────────────
def train_market(df, target_col, market_name, num_class, n_trials):
"""Full pipeline for one market: Optuna → Train → Calibrate → Evaluate."""
print(f"\n{'='*60}")
print(f"[MARKET] {market_name} (classes={num_class})")
print(f"{'='*60}")
valid_df = df[df[target_col].notna()].copy()
valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
print(f"[INFO] Valid samples: {len(valid_df)}")
if len(valid_df) < 500:
print(f"[SKIP] Not enough data for {market_name}")
return None
available_features = [f for f in FEATURES if f in valid_df.columns]
print(f"[INFO] Features: {len(available_features)}/{len(FEATURES)}")
train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)
X_train = train_df[available_features].values
X_val = val_df[available_features].values
X_cal = cal_df[available_features].values
X_test = test_df[available_features].values
y_train = train_df[target_col].astype(int).values
y_val = val_df[target_col].astype(int).values
y_cal = cal_df[target_col].astype(int).values
y_test = test_df[target_col].astype(int).values
print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")
# ── Phase 1: Optuna XGBoost ──────────────────────────────────
print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
xgb_study.optimize(
lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
n_trials=n_trials,
)
xgb_best = xgb_study.best_params
print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")
# ── Phase 2: Optuna LightGBM ─────────────────────────────────
print(f"[OPTUNA] LightGBM tuning ({n_trials} trials)...")
lgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
lgb_study.optimize(
lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
n_trials=n_trials,
)
lgb_best = lgb_study.best_params
print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")
# ── Phase 3: Train final models with best params ─────────────
# XGBoost final
xgb_params = {
"objective": "multi:softprob" if num_class > 2 else "binary:logistic",
"eval_metric": "mlogloss" if num_class > 2 else "logloss",
"n_jobs": 4, "random_state": 42,
**{k: v for k, v in xgb_best.items()},
}
if num_class > 2:
xgb_params["num_class"] = num_class
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
xgb_model = xgb.train(
xgb_params, dtrain, num_boost_round=1500,
evals=[(dtrain, "train"), (dval, "val")],
early_stopping_rounds=80, verbose_eval=200,
)
print(f"[OK] XGB final: iter={xgb_model.best_iteration}, score={xgb_model.best_score:.4f}")
# LightGBM final
lgb_params = {
"objective": "multiclass" if num_class > 2 else "binary",
"metric": "multi_logloss" if num_class > 2 else "binary_logloss",
"n_jobs": 4, "random_state": 42, "verbose": -1,
**{k: v for k, v in lgb_best.items()},
}
if num_class > 2:
lgb_params["num_class"] = num_class
lgb_train_data = lgb.Dataset(X_train, label=y_train)
lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)
lgb_model = lgb.train(
lgb_params, lgb_train_data, num_boost_round=1500,
valid_sets=[lgb_train_data, lgb_val_data],
valid_names=["train", "val"],
callbacks=[lgb.early_stopping(80), lgb.log_evaluation(200)],
)
print(f"[OK] LGB final: iter={lgb_model.best_iteration}")
# ── Phase 4: Isotonic Calibration on cal set ─────────────────
print("[CAL] Fitting Isotonic Regression...")
# XGB calibration
xgb_wrapper = XGBWrapper(xgb_params, num_boost_round=xgb_model.best_iteration)
xgb_calibrated = CalibratedClassifierCV(xgb_wrapper, method="isotonic", cv="prefit")
xgb_wrapper.fit(X_train, y_train)
xgb_calibrated.fit(X_cal, y_cal)
# LGB calibration — use raw predictions approach
lgb_cal_preds = lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration)
if len(lgb_cal_preds.shape) == 1:
lgb_cal_preds = np.column_stack([1 - lgb_cal_preds, lgb_cal_preds])
# ── Phase 5: Evaluate on test set ────────────────────────────
print("\n[EVAL] Test set evaluation...")
dtest = xgb.DMatrix(X_test)
# Raw XGB
xgb_raw_probs = xgb_model.predict(dtest)
if len(xgb_raw_probs.shape) == 1:
xgb_raw_probs = np.column_stack([1 - xgb_raw_probs, xgb_raw_probs])
# Calibrated XGB
xgb_cal_probs = xgb_calibrated.predict_proba(X_test)
# Raw LGB
lgb_raw_probs = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
if len(lgb_raw_probs.shape) == 1:
lgb_raw_probs = np.column_stack([1 - lgb_raw_probs, lgb_raw_probs])
# Ensemble (raw)
raw_ensemble = (xgb_raw_probs + lgb_raw_probs) / 2
def _eval(probs, label):
preds = np.argmax(probs, axis=1)
acc = accuracy_score(y_test, preds)
ll = log_loss(y_test, probs)
print(f" {label}: Acc={acc:.4f} LogLoss={ll:.4f}")
return {"accuracy": round(float(acc), 4), "logloss": round(float(ll), 4)}
m_xgb_raw = _eval(xgb_raw_probs, "XGB Raw")
m_xgb_cal = _eval(xgb_cal_probs, "XGB Calibrated")
m_lgb_raw = _eval(lgb_raw_probs, "LGB Raw")
m_ensemble = _eval(raw_ensemble, "Ensemble Raw")
# Classification report for ensemble
ens_preds = np.argmax(raw_ensemble, axis=1)
print(f"\n[REPORT] Ensemble Classification Report:")
print(classification_report(y_test, ens_preds))
# ── Phase 6: Save models ─────────────────────────────────────
# Raw models (orchestrator compatible)
xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
xgb_model.save_model(xgb_path)
print(f"[SAVE] {xgb_path}")
lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
lgb_model.save_model(lgb_path)
print(f"[SAVE] {lgb_path}")
# Calibrated model
cal_path = os.path.join(MODELS_DIR, f"cal_xgb_v25_{market_name.lower()}.pkl")
with open(cal_path, "wb") as f:
pickle.dump(xgb_calibrated, f)
print(f"[SAVE] {cal_path}")
return {
"market": market_name,
"samples": int(len(valid_df)),
"train": int(len(X_train)),
"val": int(len(X_val)),
"cal": int(len(X_cal)),
"test": int(len(X_test)),
"features_used": len(available_features),
"xgb_best_params": xgb_best,
"lgb_best_params": lgb_best,
"xgb_best_iteration": int(xgb_model.best_iteration),
"lgb_best_iteration": int(lgb_model.best_iteration),
"xgb_optuna_best_logloss": round(float(xgb_study.best_value), 4),
"lgb_optuna_best_logloss": round(float(lgb_study.best_value), 4),
"test_xgb_raw": m_xgb_raw,
"test_xgb_calibrated": m_xgb_cal,
"test_lgb_raw": m_lgb_raw,
"test_ensemble_raw": m_ensemble,
}
def main():
parser = argparse.ArgumentParser(description="V25 Pro Trainer")
parser.add_argument("--markets", type=str, default=None,
help="Comma-separated market names (e.g., MS,OU25,BTTS)")
parser.add_argument("--trials", type=int, default=50,
help="Optuna trials per model per market")
args = parser.parse_args()
print("=" * 60)
print("V25 PRO — Optuna + Isotonic Calibration")
print("=" * 60)
print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"[INFO] Trials per model: {args.trials}")
print(f"[INFO] Total features: {len(FEATURES)}")
df = load_data()
configs = MARKET_CONFIGS
if args.markets:
selected = [m.strip().upper() for m in args.markets.split(",")]
configs = [c for c in configs if c["name"] in selected]
print(f"[INFO] Selected markets: {[c['name'] for c in configs]}")
all_metrics = {
"trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"trainer": "v25_pro",
"optuna_trials": args.trials,
"total_features": len(FEATURES),
"markets": {},
}
for config in configs:
target = config["target"]
if target not in df.columns:
print(f"[SKIP] {config['name']}: missing target {target}")
continue
metrics = train_market(
df, target, config["name"], config["num_class"], args.trials,
)
if metrics:
all_metrics["markets"][config["name"]] = metrics
# Save feature list
feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
with open(feature_path, "w") as f:
json.dump(FEATURES, f, indent=2)
# Save full report
report_path = os.path.join(REPORTS_DIR, "v25_pro_metrics.json")
with open(report_path, "w") as f:
json.dump(all_metrics, f, indent=2, default=str)
print(f"\n[SAVE] Report: {report_path}")
# Summary
print("\n" + "=" * 60)
print("[SUMMARY]")
print("=" * 60)
for name, m in all_metrics["markets"].items():
ens = m.get("test_ensemble_raw", {})
print(f" {name:12s} | Acc={ens.get('accuracy','?'):>6s} | LL={ens.get('logloss','?'):>6s} | "
f"XGB_iter={m.get('xgb_best_iteration','?')} LGB_iter={m.get('lgb_best_iteration','?')}")
print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("[OK] V25 PRO Training Complete!")
if __name__ == "__main__":
main()
+367
View File
@@ -0,0 +1,367 @@
"""
Match Commentary Generator
===========================
Generates human-readable Turkish commentary from the analysis package.
Reads all engine signals (model, odds band, betting brain, triple value)
and produces a clear, actionable summary for end users.
No LLM required — fully template-based.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
def generate_match_commentary(package: Dict[str, Any]) -> Dict[str, Any]:
"""
Main entry point. Takes a full analysis package and returns a commentary dict.
Returns:
{
"action": "BET" | "WATCH" | "SKIP",
"headline": "...",
"summary": "...",
"notes": ["...", "..."],
"contradictions": ["...", "..."],
"confidence_label": "YÜKSEK" | "ORTA" | "DÜŞÜK" | "ÇOK DÜŞÜK"
}
"""
match_info = package.get("match_info") or {}
home = match_info.get("home_team", "Ev Sahibi")
away = match_info.get("away_team", "Deplasman")
main_pick = package.get("main_pick") or {}
betting_brain = package.get("betting_brain") or {}
v27_engine = package.get("v27_engine") or {}
market_board = package.get("market_board") or {}
score_pred = package.get("score_prediction") or {}
risk = package.get("risk") or {}
data_quality = package.get("data_quality") or {}
# ── Determine action ──────────────────────────────────────────
brain_decision = str(betting_brain.get("decision") or "NO_BET").upper()
main_playable = bool(main_pick.get("playable"))
main_vetoed = bool((main_pick.get("upper_brain") or {}).get("veto"))
approved_count = int(betting_brain.get("approved_count", 0) or 0)
if main_playable and not main_vetoed and approved_count > 0:
action = "BET"
elif approved_count == 0 and brain_decision == "NO_BET":
action = "SKIP"
else:
action = "WATCH"
# ── Headline ──────────────────────────────────────────────────
headline = _build_headline(action, main_pick, home, away)
# ── Summary paragraph ─────────────────────────────────────────
summary = _build_summary(
action, main_pick, market_board, v27_engine,
score_pred, risk, data_quality, home, away,
)
# ── Quick notes ───────────────────────────────────────────────
notes = _build_notes(market_board, v27_engine, score_pred, risk, home, away)
# ── Contradiction detection ───────────────────────────────────
contradictions = _detect_contradictions(market_board, v27_engine, package)
# ── Overall confidence label ──────────────────────────────────
confidence_label = _overall_confidence_label(main_pick, data_quality)
return {
"action": action,
"headline": headline,
"summary": summary,
"notes": notes[:6],
"contradictions": contradictions[:4],
"confidence_label": confidence_label,
}
# ═══════════════════════════════════════════════════════════════════════
# Headline
# ═══════════════════════════════════════════════════════════════════════
def _build_headline(
action: str,
main_pick: Dict[str, Any],
home: str,
away: str,
) -> str:
if action == "BET":
market = main_pick.get("market", "")
pick = main_pick.get("pick", "")
odds = main_pick.get("odds", 0.0)
conf = main_pick.get("calibrated_confidence", main_pick.get("confidence", 0))
market_tr = _market_to_turkish(market, pick)
return f"🎯 {market_tr} önerisi — Oran: {odds}, Güven: %{conf:.0f}"
if action == "WATCH":
return f"👀 {home} vs {away} — İzlemeye değer sinyaller var"
return f"⚠️ {home} vs {away} — Şu an net bir fırsat görülmüyor"
# ═══════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════
def _build_summary(
action: str,
main_pick: Dict[str, Any],
market_board: Dict[str, Any],
v27_engine: Dict[str, Any],
score_pred: Dict[str, Any],
risk: Dict[str, Any],
data_quality: Dict[str, Any],
home: str,
away: str,
) -> str:
parts: List[str] = []
# Who is the favourite?
ms_board = market_board.get("MS") or {}
ms_pick = ms_board.get("pick", "")
ms_conf = float(ms_board.get("confidence", 50) or 50)
if ms_pick == "1" and ms_conf > 45:
parts.append(f"{home} hafif favori görünüyor")
elif ms_pick == "1" and ms_conf > 55:
parts.append(f"{home} net favori")
elif ms_pick == "2" and ms_conf > 45:
parts.append(f"{away} hafif favori görünüyor")
elif ms_pick == "2" and ms_conf > 55:
parts.append(f"{away} net favori")
else:
parts.append("İki takım da birbirine yakın güçte")
# xG expectation
xg_home = float(score_pred.get("xg_home", 0) or 0)
xg_away = float(score_pred.get("xg_away", 0) or 0)
xg_total = xg_home + xg_away
if xg_total > 3.0:
parts.append(f"Gol beklentisi yüksek (toplam xG: {xg_total:.1f})")
elif xg_total < 2.0:
parts.append(f"Düşük gol beklentisi (toplam xG: {xg_total:.1f})")
# Consensus check
consensus = str(v27_engine.get("consensus") or "").upper()
if consensus == "AGREE":
parts.append("Model motorları aynı fikirde")
elif consensus == "DISAGREE":
parts.append("Model motorları farklı sonuçlara ulaşıyor — belirsizlik var")
# Action-specific
if action == "BET":
market_tr = _market_to_turkish(
main_pick.get("market", ""), main_pick.get("pick", "")
)
edge = float(main_pick.get("ev_edge", 0) or 0)
parts.append(
f"{market_tr} yönünde değer tespit edildi (EV edge: {edge:+.1%})"
)
elif action == "SKIP":
parts.append(
"Hiçbir markette piyasanın fiyatlamadığı bir avantaj görülmüyor"
)
# Risk
risk_level = str(risk.get("level") or "MEDIUM").upper()
if risk_level == "HIGH":
parts.append("⚠️ Risk seviyesi yüksek")
elif risk_level == "EXTREME":
parts.append("🔴 Çok yüksek risk — dikkatli olun")
# Data quality
quality_label = str(data_quality.get("label") or "MEDIUM").upper()
if quality_label == "LOW":
parts.append("Veri kalitesi düşük — tahminler daha az güvenilir")
return ". ".join(parts) + "."
# ═══════════════════════════════════════════════════════════════════════
# Quick Notes
# ═══════════════════════════════════════════════════════════════════════
def _build_notes(
market_board: Dict[str, Any],
v27_engine: Dict[str, Any],
score_pred: Dict[str, Any],
risk: Dict[str, Any],
home: str,
away: str,
) -> List[str]:
notes: List[str] = []
triple_value = v27_engine.get("triple_value") or {}
odds_band = v27_engine.get("odds_band") or {}
# MS note
ms = market_board.get("MS") or {}
ms_conf = float(ms.get("confidence", 0) or 0)
if ms_conf < 45:
notes.append("Maç sonucu belirsiz, net favori yok")
elif ms.get("pick") == "1":
notes.append(f"{home} favori ama oran değerli mi kontrol et")
elif ms.get("pick") == "2":
notes.append(f"{away} favori ama oran değerli mi kontrol et")
# OU25 note
ou25 = market_board.get("OU25") or {}
ou25_probs = ou25.get("probs") or {}
over_prob = float(ou25_probs.get("over", 0.5) or 0.5)
if over_prob > 0.58:
notes.append("2.5 Üst yönünde eğilim var")
elif over_prob < 0.42:
notes.append("2.5 Alt yönünde eğilim var")
else:
notes.append("2.5 Üst/Alt dengeli — kesin sinyal yok")
# BTTS note
btts = market_board.get("BTTS") or {}
btts_probs = btts.get("probs") or {}
btts_yes = float(btts_probs.get("yes", 0.5) or 0.5)
if btts_yes > 0.58:
notes.append("Her iki takımın da gol atması bekleniyor")
elif btts_yes < 0.42:
notes.append("KG olasılığı düşük")
# HT note
ht = market_board.get("HT") or {}
ht_pick = ht.get("pick", "")
ht_conf = float(ht.get("confidence", 0) or 0)
if ht_conf > 40 and ht_pick:
ht_label = {"1": f"İY {home}", "2": f"İY {away}", "X": "İY beraberlik"}.get(
ht_pick, f"İY {ht_pick}"
)
notes.append(f"{ht_label} yönünde hafif sinyal (%{ht_conf:.0f})")
# Risk warnings
warnings = risk.get("warnings") or []
for w in warnings[:2]:
notes.append(f"⚠️ {w}")
return notes
# ═══════════════════════════════════════════════════════════════════════
# Contradiction Detection
# ═══════════════════════════════════════════════════════════════════════
def _detect_contradictions(
market_board: Dict[str, Any],
v27_engine: Dict[str, Any],
package: Dict[str, Any],
) -> List[str]:
"""
Detect cases where model prediction and odds band/triple value
point in opposite directions — the user's main complaint.
"""
contradictions: List[str] = []
triple_value = v27_engine.get("triple_value") or {}
predictions = v27_engine.get("predictions") or {}
# MS contradiction: model says home but triple_value says away has value
ms_preds = predictions.get("ms") or {}
ms_home = float(ms_preds.get("home", 0) or 0)
ms_away = float(ms_preds.get("away", 0) or 0)
home_triple = triple_value.get("home") or {}
away_triple = triple_value.get("away") or {}
model_favours_home = ms_home > ms_away
away_is_value = bool(away_triple.get("is_value"))
home_is_value = bool(home_triple.get("is_value"))
if model_favours_home and away_is_value:
contradictions.append(
"Model ev sahibini favori görüyor ama oran bandı deplasmanda değer buluyor — "
"bu çelişki nedeniyle MS tahminine dikkatli yaklaş"
)
elif not model_favours_home and home_is_value:
contradictions.append(
"Model deplasmanı favori görüyor ama oran bandı ev sahibinde değer buluyor — "
"bu çelişki nedeniyle MS tahminine dikkatli yaklaş"
)
# HT contradiction
ht_board = market_board.get("HT") or {}
ht_pick = ht_board.get("pick", "")
ht_home_triple = triple_value.get("ht_home") or {}
ht_away_triple = triple_value.get("ht_away") or {}
if ht_pick == "1" and bool(ht_away_triple.get("is_value")):
contradictions.append(
"Model İY ev sahibi diyor ama oran bandı İY deplasmanda değer buluyor — "
"İY tahmini güvenilir değil"
)
elif ht_pick == "2" and bool(ht_home_triple.get("is_value")):
contradictions.append(
"Model İY deplasman diyor ama oran bandı İY ev sahibinde değer buluyor — "
"İY tahmini güvenilir değil"
)
# OU25 contradiction
ou25_board = market_board.get("OU25") or {}
ou25_pick = ou25_board.get("pick", "")
ou25_over_triple = triple_value.get("ou25_over") or {}
ou25_under_triple = triple_value.get("ou25_under") or {}
if ou25_pick == "Üst" and bool(ou25_under_triple.get("is_value")):
contradictions.append(
"Model 2.5 Üst diyor ama oran bandı 2.5 Alt'ta değer buluyor — çelişki var"
)
elif ou25_pick == "Alt" and bool(ou25_over_triple.get("is_value")):
contradictions.append(
"Model 2.5 Alt diyor ama oran bandı 2.5 Üst'te değer buluyor — çelişki var"
)
return contradictions
# ═══════════════════════════════════════════════════════════════════════
# Helpers
# ═══════════════════════════════════════════════════════════════════════
def _overall_confidence_label(
main_pick: Dict[str, Any],
data_quality: Dict[str, Any],
) -> str:
"""Overall confidence label for the entire analysis."""
quality_score = float(data_quality.get("score", 0.5) or 0.5)
main_conf = float(
main_pick.get("calibrated_confidence", main_pick.get("confidence", 0)) or 0
)
main_playable = bool(main_pick.get("playable"))
if main_playable and main_conf >= 60 and quality_score >= 0.8:
return "YÜKSEK"
if main_playable and main_conf >= 45:
return "ORTA"
if main_conf >= 30:
return "DÜŞÜK"
return "ÇOK DÜŞÜK"
_MARKET_TR_MAP = {
"MS": {"1": "Maç Sonucu Ev Sahibi", "2": "Maç Sonucu Deplasman", "X": "Beraberlik"},
"DC": {"1X": "Çifte Şans 1X", "X2": "Çifte Şans X2", "12": "Çifte Şans 12"},
"OU25": {"Üst": "2.5 Üst", "Alt": "2.5 Alt", "Over": "2.5 Üst", "Under": "2.5 Alt"},
"OU15": {"Üst": "1.5 Üst", "Alt": "1.5 Alt", "Over": "1.5 Üst", "Under": "1.5 Alt"},
"OU35": {"Üst": "3.5 Üst", "Alt": "3.5 Alt", "Over": "3.5 Üst", "Under": "3.5 Alt"},
"BTTS": {"KG Var": "Karşılıklı Gol Var", "KG Yok": "Karşılıklı Gol Yok",
"Yes": "Karşılıklı Gol Var", "No": "Karşılıklı Gol Yok"},
"HT": {"1": "İlk Yarı Ev Sahibi", "2": "İlk Yarı Deplasman", "X": "İlk Yarı Beraberlik"},
"HT_OU05": {"Üst": "İY 0.5 Üst", "Alt": "İY 0.5 Alt"},
"HT_OU15": {"Üst": "İY 1.5 Üst", "Alt": "İY 1.5 Alt"},
"OE": {"Tek": "Tek", "Çift": "Çift", "Odd": "Tek", "Even": "Çift"},
"CARDS": {"Üst": "Kart Üst", "Alt": "Kart Alt"},
}
def _market_to_turkish(market: str, pick: str) -> str:
market_map = _MARKET_TR_MAP.get(market, {})
result = market_map.get(pick)
if result:
return result
return f"{market} {pick}"
+62 -112
View File
@@ -51,8 +51,10 @@ from core.engines.player_predictor import PlayerPrediction, get_player_predictor
from services.feature_enrichment import FeatureEnrichmentService
from services.betting_brain import BettingBrain
from services.v26_shadow_engine import V26ShadowEngine, get_v26_shadow_engine
from services.match_commentary import generate_match_commentary
from utils.top_leagues import load_top_league_ids
from utils.league_reliability import load_league_reliability
from config.config_loader import build_threshold_dict, get_threshold_default
@dataclass
@@ -165,99 +167,15 @@ class SingleMatchOrchestrator:
self.league_reliability = load_league_reliability()
self.enrichment = FeatureEnrichmentService()
self.odds_band_analyzer = OddsBandAnalyzer()
# ── V32 Calibration Rebalance ──────────────────────────────────
# RULE: max_reachable = 100 × calibration MUST be > min_conf + 8
# Previous values had 5 markets where this was IMPOSSIBLE:
# HT(0.42×100=42 < 45), HCAP(0.40×100=40 < 46), HTFT(0.28×100=28 < 32)
# HT_OU15(0.46×100=46 < 48), CARDS(0.45×100=45 < 48)
# These markets could NEVER become playable → all predictions were PASS.
#
# New calibration: conservative but mathematically achievable.
# Each market's calibration ensures high-confidence model outputs CAN pass.
self.market_calibration: Dict[str, float] = {
"MS": 0.62, # max=62 vs min=42 ✓ (was 0.48→max=48 vs 44 ⚠️)
"DC": 0.82, # max=82 vs min=52 ✓ (unchanged, already good)
"OU15": 0.84, # max=84 vs min=55 ✓ (unchanged, already good)
"OU25": 0.68, # max=68 vs min=48 ✓ (was 0.54→max=54 vs 52 ⚠️)
"OU35": 0.60, # max=60 vs min=48 ✓ (was 0.44→max=44 vs 54 ❌)
"BTTS": 0.65, # max=65 vs min=46 ✓ (was 0.50→max=50 vs 50 ⚠️)
"HT": 0.58, # max=58 vs min=40 ✓ (was 0.42→max=42 vs 45 ❌)
"HT_OU05": 0.68, # max=68 vs min=50 ✓ (unchanged)
"HT_OU15": 0.60, # max=60 vs min=42 ✓ (was 0.46→max=46 vs 48 ❌)
"OE": 0.62, # max=62 vs min=46 ✓ (was 0.58→max=58 vs 50 ok)
"CARDS": 0.58, # max=58 vs min=42 ✓ (was 0.45→max=45 vs 48 ❌)
"HCAP": 0.56, # max=56 vs min=40 ✓ (was 0.40→max=40 vs 46 ❌)
"HTFT": 0.45, # max=45 vs min=28 ✓ (was 0.28→max=28 vs 32 ❌)
}
# Min confidence: lowered to be achievable (max_reachable - 16 to -20)
self.market_min_conf: Dict[str, float] = {
"MS": 20.0, # was 42 — drastically lowered to allow underdog/draw value bets
"DC": 40.0, # was 52
"OU15": 45.0, # was 55
"OU25": 30.0, # was 48
"OU35": 20.0, # was 48
"BTTS": 30.0, # was 46
"HT": 20.0, # was 40
"HT_OU05": 35.0, # was 50
"HT_OU15": 25.0, # was 42
"OE": 35.0, # was 46
"CARDS": 30.0, # was 42
"HCAP": 25.0, # was 40
"HTFT": 10.0, # was 28
}
# Min play score: Significantly reduced to stop blocking value bets on underdogs
self.market_min_play_score: Dict[str, float] = {
"MS": 30.0, # was 65
"DC": 55.0, # was 58
"OU15": 55.0, # was 60
"OU25": 45.0, # was 64
"OU35": 35.0, # was 68
"BTTS": 45.0, # was 64
"HT": 30.0, # was 66
"HT_OU05": 45.0, # was 60
"HT_OU15": 35.0, # was 64
"OE": 35.0, # was 60
"CARDS": 40.0, # was 66
"HCAP": 35.0, # was 68
"HTFT": 20.0, # was 72
}
self.market_min_edge: Dict[str, float] = {
"MS": 0.02, # was 0.03 — slight relaxation
"DC": 0.01, # unchanged
"OU15": 0.01, # unchanged
"OU25": 0.02, # unchanged
"OU35": 0.03, # was 0.04
"BTTS": 0.02, # was 0.03
"HT": 0.03, # was 0.04
"HT_OU05": 0.01, # unchanged
"HT_OU15": 0.02, # was 0.03
"OE": 0.02, # unchanged
"CARDS": 0.02, # was 0.03
"HCAP": 0.03, # was 0.04
"HTFT": 0.05, # was 0.06
}
self.odds_band_min_sample: Dict[str, float] = {
"MS": 8.0,
"DC": 8.0,
"OU15": 8.0,
"OU25": 8.0,
"OU35": 8.0,
"BTTS": 8.0,
"HT": 8.0,
"HT_OU05": 8.0,
"HT_OU15": 8.0,
}
self.odds_band_min_edge: Dict[str, float] = {
"MS": 0.015,
"DC": 0.012,
"OU15": 0.012,
"OU25": 0.015,
"OU35": 0.018,
"BTTS": 0.015,
"HT": 0.018,
"HT_OU05": 0.012,
"HT_OU15": 0.015,
}
# ── Market Thresholds (loaded from config/market_thresholds.json) ──
# All values are centralized in a single JSON file for easy tuning
# without code changes. See config/market_thresholds.json for details.
self.market_calibration: Dict[str, float] = build_threshold_dict("calibration")
self.market_min_conf: Dict[str, float] = build_threshold_dict("min_conf")
self.market_min_play_score: Dict[str, float] = build_threshold_dict("min_play_score")
self.market_min_edge: Dict[str, float] = build_threshold_dict("min_edge")
self.odds_band_min_sample: Dict[str, float] = build_threshold_dict("odds_band_min_sample")
self.odds_band_min_edge: Dict[str, float] = build_threshold_dict("odds_band_min_edge")
def _get_v25_predictor(self) -> V25Predictor:
if self.v25_predictor is None:
@@ -720,7 +638,7 @@ class SingleMatchOrchestrator:
signal: Dict[str, Any] = {}
def _temperature_scale(probs_dict: Dict[str, float], temperature: float = 2.5) -> Dict[str, float]:
def _temperature_scale(probs_dict: Dict[str, float], temperature: float = 1.5) -> Dict[str, float]:
"""
Apply temperature scaling to soften overconfident model outputs.
@@ -729,19 +647,22 @@ class SingleMatchOrchestrator:
T=1.0 → no change, T>1 → softer probabilities.
Standard approach for post-hoc model calibration (Guo et al., 2017).
V34: Reduced from 2.5 to 1.5 — V25 model is already calibrated via
odds-aware training. Excessive flattening was destroying signal.
"""
import math
eps = 1e-7 # numerical stability
n = len(probs_dict)
# Determine appropriate temperature based on market type
# V34: Reduced temperature — odds-aware model is already calibrated
# Binary markets (2-class) tend to be more overconfident in LGB
if n <= 2:
T = max(temperature, 2.0)
T = max(temperature, 1.5) # was 2.0
elif n == 3:
T = max(temperature * 0.8, 1.5) # 3-way slightly less aggressive
T = max(temperature * 0.8, 1.2) # was 1.5 — 3-way slightly less aggressive
else:
T = max(temperature * 0.6, 1.3) # 9-way (HTFT) already spread
T = max(temperature * 0.6, 1.0) # was 1.3 — 9-way (HTFT) already spread
# Convert to log-odds and apply temperature
labels = list(probs_dict.keys())
@@ -767,8 +688,8 @@ class SingleMatchOrchestrator:
Applies temperature scaling to convert overconfident LightGBM outputs
into realistic, calibrated probabilities.
"""
# Apply temperature scaling to soften extreme probabilities
scaled_probs = _temperature_scale(probs_dict, temperature=2.5)
# V34: Apply temperature scaling — reduced from 2.5 to 1.5
scaled_probs = _temperature_scale(probs_dict, temperature=1.5)
best_label = max(scaled_probs, key=scaled_probs.get)
best_prob = float(scaled_probs[best_label])
@@ -1532,6 +1453,13 @@ class SingleMatchOrchestrator:
base_package = self._apply_upper_brain_guards(base_package)
# ── Match Commentary: human-readable summary ──────────────
try:
base_package["match_commentary"] = generate_match_commentary(base_package)
except Exception as e:
print(f"[Commentary] ⚠ Generation failed (non-fatal): {e}")
base_package["match_commentary"] = None
mode = str(getattr(self, "engine_mode", "v28-pro-max") or "v28-pro-max").lower()
if mode not in {"v25", "v26", "dual", "v28", "v28-pro-max"}:
mode = "v25"
@@ -1545,6 +1473,7 @@ class SingleMatchOrchestrator:
)
if mode == "v26":
shadow_package["match_commentary"] = base_package.get("match_commentary")
return shadow_package
if mode == "dual":
merged = dict(base_package)
@@ -5239,7 +5168,9 @@ class SingleMatchOrchestrator:
reasons: List[str] = []
playable = True
is_value_sniper = ev_edge >= 0.03
# V34: Broadened value_sniper bypass — odds-aware model rarely shows 3% EV edge
# Allow high-confidence predictions OR modest positive EV to bypass secondary gates
is_value_sniper = ev_edge >= 0.008 or calibrated_conf >= 55.0
if calibrated_conf < min_conf:
if not is_value_sniper:
@@ -5261,29 +5192,48 @@ class SingleMatchOrchestrator:
# Most pre-match predictions use probable_xi — blocking kills all output
lineup_penalty += 6.0
reasons.append("lineup_probable_xi_penalty")
base_score = calibrated_conf + (simple_edge * 100.0 * edge_multiplier)
# V34: Added confidence bonus — high raw model probability gets a boost
# This prevents over-penalization when edge is near-zero but model is confident
raw_top_prob = float(row.get("probability", 0.0))
confidence_bonus = 0.0
if raw_top_prob >= 0.65:
confidence_bonus = 15.0
elif raw_top_prob >= 0.55:
confidence_bonus = 10.0
elif raw_top_prob >= 0.45:
confidence_bonus = 5.0
base_score = calibrated_conf + (simple_edge * 100.0 * edge_multiplier) + confidence_bonus
play_score = max(
0.0,
min(100.0, base_score - risk_penalty - quality_penalty - lineup_penalty),
)
if bool(band_verdict.get("required")) and not bool(band_verdict.get("aligned")):
# V34: odds_band gate — only hard-block when band data is AVAILABLE and aligned=False
# When band data is sparse (available=False), skip alignment check entirely
band_available = bool(band_verdict.get("available", False))
if band_available and bool(band_verdict.get("required")) and not bool(band_verdict.get("aligned")):
if not is_value_sniper:
playable = False
reasons.append(str(band_verdict.get("reason") or "odds_band_not_aligned"))
if bool(band_verdict.get("required")) and implied_prob > 0.0 and model_edge <= 0.0:
if not is_value_sniper:
playable = False
reasons.append(f"model_not_above_market_{model_edge:+.3f}")
# V31: negative edge threshold adapts to league reliability
# Reliable league: stricter (-0.03), unreliable: looser (-0.08)
neg_edge_threshold = -0.03 - (1.0 - odds_rel) * 0.05
elif not band_available and bool(band_verdict.get("required")):
# Sparse data — log but don't block
reasons.append("odds_band_data_sparse_skipped")
# V34: REMOVED model_not_above_market gate entirely
# V25 model is odds-informed BY DESIGN → model output ≈ market-implied probability
# Requiring model > market is mathematically impossible with this architecture
# The negative_model_edge gate below still catches truly anti-value picks
# V34: negative edge threshold relaxed — odds-aware model's edge is naturally near zero
# Reliable league: -0.08, unreliable: up to -0.15
# Only blocks truly anti-value picks (model significantly below market)
neg_edge_threshold = -0.08 - (1.0 - odds_rel) * 0.07
if odd > 1.0 and simple_edge < neg_edge_threshold:
if not is_value_sniper:
playable = False
reasons.append(f"negative_model_edge_{simple_edge:+.3f}")
# V34: Added value_sniper bypass — was missing before, causing hard blocks
if odd > 1.0 and ev_edge < min_edge:
playable = False
reasons.append(f"below_market_edge_threshold_{ev_edge:+.3f}")
if not is_value_sniper:
playable = False
reasons.append(f"below_market_edge_threshold_{ev_edge:+.3f}")
if play_score < min_play_score:
if not is_value_sniper:
playable = False