168 lines
5.3 KiB
Python
168 lines
5.3 KiB
Python
"""
Shared VQWEN feature contract
=============================

One place defines how VQWEN features are produced.
Both training and runtime inference must use this module so the model sees
the same feature semantics in historical data and live analysis.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
import numpy as np
|
|
|
|
# Ordered names of the model's input features.
#
# The order is significant: row_to_array() emits values in exactly this order,
# so reordering, inserting, or removing a name changes the layout of the array
# handed to the model.
#
# "h_xi" / "a_xi" keep their historical names for artifact compatibility; they
# now carry pre-match lineup availability rather than starting-XI counts.
FEATURE_COLUMNS: list[str] = [
    "elo_diff",
    "h_xg",
    "a_xg",
    "total_xg",
    "pow_diff",
    "rest_diff",
    "h_fat",
    "a_fat",
    "imp_h",
    "imp_d",
    "imp_a",
    "h_xi",
    "a_xi",
    "h2h_h_wr",
    "form_diff",
]
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class VqwenFeatureInput:
|
|
home_elo: float
|
|
away_elo: float
|
|
home_avg_goals_scored: float
|
|
away_avg_goals_scored: float
|
|
home_avg_goals_conceded: float
|
|
away_avg_goals_conceded: float
|
|
home_avg_shots_on_target: float
|
|
away_avg_shots_on_target: float
|
|
home_avg_possession: float
|
|
away_avg_possession: float
|
|
home_rest_days: float
|
|
away_rest_days: float
|
|
implied_prob_home: float
|
|
implied_prob_draw: float
|
|
implied_prob_away: float
|
|
home_lineup_availability: float = 1.0
|
|
away_lineup_availability: float = 1.0
|
|
h2h_home_win_rate: float = 0.5
|
|
home_form_score: float = 0.0
|
|
away_form_score: float = 0.0
|
|
league_avg_goals: float = 2.6
|
|
referee_avg_goals: float = 2.6
|
|
referee_home_bias: float = 0.0
|
|
home_squad_strength: float = 0.5
|
|
away_squad_strength: float = 0.5
|
|
home_key_players: float = 0.0
|
|
away_key_players: float = 0.0
|
|
missing_players_impact: float = 0.0
|
|
|
|
|
|
def fatigue_multiplier(rest_days: float) -> float:
    """Return the fatigue scaling factor for a team given its rest days.

    Args:
        rest_days: Number of rest days before the match.

    Returns:
        0.85 when ``rest_days`` is below 3, 0.95 when it is at least 3 but
        below 5, and 1.0 otherwise.
    """
    if rest_days < 3.0:
        return 0.85
    if rest_days < 5.0:
        return 0.95
    return 1.0
|
|
|
|
|
|
def clamp(value: float, lower: float, upper: float) -> float:
    """Restrict ``value`` to the closed interval ``[lower, upper]``.

    Args:
        value: Number (or anything ``float()`` accepts) to restrict.
        lower: Smallest value that may be returned.
        upper: Largest value that may be returned.

    Returns:
        ``value`` as a float, raised to ``lower`` or lowered to ``upper`` when
        it falls outside the interval. A NaN ``value`` is returned as NaN.

    Raises:
        TypeError, ValueError: If ``value`` cannot be converted by ``float()``.
    """
    # The outer float() guarantees the annotated return type even when a bound
    # is passed as an int and ends up being the selected result.
    return float(min(max(float(value), lower), upper))
|
|
|
|
|
|
def build_vqwen_feature_row(values: VqwenFeatureInput) -> dict[str, float]:
    """Derive the VQWEN feature row from raw pre-match inputs.

    Args:
        values: Raw inputs for a single match. Every attribute that takes part
            in arithmetic is passed through ``float()`` first.

    Returns:
        A dict with one entry per name in ``FEATURE_COLUMNS``.
    """
    home_fatigue = fatigue_multiplier(values.home_rest_days)
    away_fatigue = fatigue_multiplier(values.away_rest_days)

    # Mean of the league and referee goal averages, normalised against 2.6 and
    # bounded so the environment can scale expected goals by at most
    # -15% / +20%.
    goal_environment = (
        float(values.league_avg_goals) + float(values.referee_avg_goals)
    ) / 2.0
    goal_environment_multiplier = clamp(goal_environment / 2.6, 0.85, 1.2)

    squad_diff = float(values.home_squad_strength) - float(values.away_squad_strength)
    key_player_diff = float(values.home_key_players) - float(values.away_key_players)
    missing_penalty = clamp(float(values.missing_players_impact), 0.0, 1.0)
    referee_bias = clamp(float(values.referee_home_bias), -0.25, 0.25)

    # The squad, key-player and referee terms are mirrored between the two
    # sides; the missing-player penalty is subtracted from both.
    home_squad_multiplier = clamp(
        1.0 + squad_diff * 0.08 + key_player_diff * 0.025 - missing_penalty * 0.08 + referee_bias * 0.03,
        0.82,
        1.18,
    )
    away_squad_multiplier = clamp(
        1.0 - squad_diff * 0.08 - key_player_diff * 0.025 - missing_penalty * 0.08 - referee_bias * 0.03,
        0.82,
        1.18,
    )

    # Base expectation is the mean of own scoring and the opponent's conceding.
    # The 0.05 floor applies to that base only, before the multipliers, so the
    # final value can still end up below 0.05.
    home_xg = max(
        0.05,
        (
            float(values.home_avg_goals_scored)
            + float(values.away_avg_goals_conceded)
        )
        / 2.0,
    ) * home_fatigue * goal_environment_multiplier * home_squad_multiplier
    away_xg = max(
        0.05,
        (
            float(values.away_avg_goals_scored)
            + float(values.home_avg_goals_conceded)
        )
        / 2.0,
    ) * away_fatigue * goal_environment_multiplier * away_squad_multiplier

    # Weighted power scores; only their difference is emitted. The referee
    # term is added for the home side and subtracted for the away side.
    home_power = (
        float(values.home_avg_goals_scored) * 5.0
        - float(values.home_avg_goals_conceded) * 5.0
        + float(values.home_avg_shots_on_target) * 2.0
        + float(values.home_avg_possession) * 0.1
        + float(values.home_squad_strength) * 3.0
        + float(values.home_key_players) * 0.8
        + referee_bias * 6.0
    )
    away_power = (
        float(values.away_avg_goals_scored) * 5.0
        - float(values.away_avg_goals_conceded) * 5.0
        + float(values.away_avg_shots_on_target) * 2.0
        + float(values.away_avg_possession) * 0.1
        + float(values.away_squad_strength) * 3.0
        + float(values.away_key_players) * 0.8
        - referee_bias * 6.0
    )

    return {
        "elo_diff": float(values.home_elo) - float(values.away_elo),
        "h_xg": home_xg,
        "a_xg": away_xg,
        "total_xg": home_xg + away_xg,
        "pow_diff": home_power - away_power,
        "rest_diff": float(values.home_rest_days) - float(values.away_rest_days),
        "h_fat": home_fatigue,
        "a_fat": away_fatigue,
        "imp_h": clamp(values.implied_prob_home, 0.01, 0.98),
        "imp_d": clamp(values.implied_prob_draw, 0.01, 0.98),
        "imp_a": clamp(values.implied_prob_away, 0.01, 0.98),
        # Column names are preserved for artifact compatibility.
        # Semantics are now "pre-match lineup availability" instead of leaked
        # post-match starting-XI counts.
        "h_xi": clamp(values.home_lineup_availability, 0.0, 1.0),
        "a_xi": clamp(values.away_lineup_availability, 0.0, 1.0),
        "h2h_h_wr": clamp(values.h2h_home_win_rate, 0.0, 1.0),
        "form_diff": (
            float(values.home_form_score)
            - float(values.away_form_score)
            + squad_diff * 1.5
            + key_player_diff * 0.35
            + referee_bias * 2.0
            - missing_penalty * 1.75
        ),
    }
|
|
|
|
|
|
def row_to_array(row: dict[str, float]) -> np.ndarray:
    """Convert a feature row into a single-row 2-D float64 array.

    Args:
        row: Mapping that contains at least every name in ``FEATURE_COLUMNS``.
            Extra keys are ignored.

    Returns:
        Array of shape ``(1, len(FEATURE_COLUMNS))`` with dtype ``float64``,
        with values laid out in ``FEATURE_COLUMNS`` order.

    Raises:
        KeyError: If ``row`` lacks one of the ``FEATURE_COLUMNS`` names.
    """
    return np.array([[float(row[column]) for column in FEATURE_COLUMNS]], dtype=np.float64)
|