""" Shared VQWEN feature contract ============================= One place defines how VQWEN features are produced. Both training and runtime inference must use this module so the model sees the same feature semantics in historical data and live analysis. """ from __future__ import annotations from dataclasses import dataclass import numpy as np FEATURE_COLUMNS = [ "elo_diff", "h_xg", "a_xg", "total_xg", "pow_diff", "rest_diff", "h_fat", "a_fat", "imp_h", "imp_d", "imp_a", "h_xi", "a_xi", "h2h_h_wr", "form_diff", ] @dataclass(slots=True) class VqwenFeatureInput: home_elo: float away_elo: float home_avg_goals_scored: float away_avg_goals_scored: float home_avg_goals_conceded: float away_avg_goals_conceded: float home_avg_shots_on_target: float away_avg_shots_on_target: float home_avg_possession: float away_avg_possession: float home_rest_days: float away_rest_days: float implied_prob_home: float implied_prob_draw: float implied_prob_away: float home_lineup_availability: float = 1.0 away_lineup_availability: float = 1.0 h2h_home_win_rate: float = 0.5 home_form_score: float = 0.0 away_form_score: float = 0.0 league_avg_goals: float = 2.6 referee_avg_goals: float = 2.6 referee_home_bias: float = 0.0 home_squad_strength: float = 0.5 away_squad_strength: float = 0.5 home_key_players: float = 0.0 away_key_players: float = 0.0 missing_players_impact: float = 0.0 def fatigue_multiplier(rest_days: float) -> float: if rest_days < 3.0: return 0.85 if rest_days < 5.0: return 0.95 return 1.0 def clamp(value: float, lower: float, upper: float) -> float: return min(max(float(value), lower), upper) def build_vqwen_feature_row(values: VqwenFeatureInput) -> dict[str, float]: home_fatigue = fatigue_multiplier(values.home_rest_days) away_fatigue = fatigue_multiplier(values.away_rest_days) goal_environment = ( float(values.league_avg_goals) + float(values.referee_avg_goals) ) / 2.0 goal_environment_multiplier = clamp(goal_environment / 2.6, 0.85, 1.2) squad_diff = float(values.home_squad_strength) - float(values.away_squad_strength) key_player_diff = float(values.home_key_players) - float(values.away_key_players) missing_penalty = clamp(float(values.missing_players_impact), 0.0, 1.0) referee_bias = clamp(float(values.referee_home_bias), -0.25, 0.25) home_squad_multiplier = clamp( 1.0 + squad_diff * 0.08 + key_player_diff * 0.025 - missing_penalty * 0.08 + referee_bias * 0.03, 0.82, 1.18, ) away_squad_multiplier = clamp( 1.0 - squad_diff * 0.08 - key_player_diff * 0.025 - missing_penalty * 0.08 - referee_bias * 0.03, 0.82, 1.18, ) home_xg = max( 0.05, ( float(values.home_avg_goals_scored) + float(values.away_avg_goals_conceded) ) / 2.0, ) * home_fatigue * goal_environment_multiplier * home_squad_multiplier away_xg = max( 0.05, ( float(values.away_avg_goals_scored) + float(values.home_avg_goals_conceded) ) / 2.0, ) * away_fatigue * goal_environment_multiplier * away_squad_multiplier home_power = ( float(values.home_avg_goals_scored) * 5.0 - float(values.home_avg_goals_conceded) * 5.0 + float(values.home_avg_shots_on_target) * 2.0 + float(values.home_avg_possession) * 0.1 + float(values.home_squad_strength) * 3.0 + float(values.home_key_players) * 0.8 + referee_bias * 6.0 ) away_power = ( float(values.away_avg_goals_scored) * 5.0 - float(values.away_avg_goals_conceded) * 5.0 + float(values.away_avg_shots_on_target) * 2.0 + float(values.away_avg_possession) * 0.1 + float(values.away_squad_strength) * 3.0 + float(values.away_key_players) * 0.8 - referee_bias * 6.0 ) return { "elo_diff": float(values.home_elo) - float(values.away_elo), "h_xg": home_xg, "a_xg": away_xg, "total_xg": home_xg + away_xg, "pow_diff": home_power - away_power, "rest_diff": float(values.home_rest_days) - float(values.away_rest_days), "h_fat": home_fatigue, "a_fat": away_fatigue, "imp_h": clamp(values.implied_prob_home, 0.01, 0.98), "imp_d": clamp(values.implied_prob_draw, 0.01, 0.98), "imp_a": clamp(values.implied_prob_away, 0.01, 0.98), # Column names are preserved for artifact compatibility. # Semantics are now "pre-match lineup availability" instead of leaked # post-match starting-XI counts. "h_xi": clamp(values.home_lineup_availability, 0.0, 1.0), "a_xi": clamp(values.away_lineup_availability, 0.0, 1.0), "h2h_h_wr": clamp(values.h2h_home_win_rate, 0.0, 1.0), "form_diff": ( float(values.home_form_score) - float(values.away_form_score) + squad_diff * 1.5 + key_player_diff * 0.35 + referee_bias * 2.0 - missing_penalty * 1.75 ), } def row_to_array(row: dict[str, float]) -> np.ndarray: return np.array([[float(row[column]) for column in FEATURE_COLUMNS]], dtype=np.float64)