"""
HT/FT Tendency Feature Engine
================================
Produces team-level HT/FT tendency features for match prediction.

Computes 17 features per match based on historical data:
- 1st half scoring/conceding rates
- Comeback rates
- Half-specific goal distribution
- League-level HT/FT profiles
- Sample-size indicators for the two team profiles

All features are computed from the `matches` table using only data
BEFORE the match date (no future leakage).
"""

import os
import sys

# Make the project root importable so `data.db` resolves when this file is
# executed directly as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from typing import Dict, List, Optional, Sequence, Tuple
from dataclasses import dataclass, field

from data.db import get_clean_dsn
import psycopg2


# Default number of most recent matches used for a team profile. It is also
# the denominator of the `htft_*_sample_size` features, so a full window
# maps to 1.0.
DEFAULT_TEAM_MATCH_LIMIT = 30

# Default number of most recent matches used for a league profile.
DEFAULT_LEAGUE_MATCH_LIMIT = 500

# Shared WHERE-clause tail: finished football matches with complete scores.
# Full-time scores are filtered as well as half-time scores because the
# aggregation compares and sums both; a NULL would raise TypeError there.
_FINISHED_MATCH_FILTER = """
      AND sport = 'football'
      AND status = 'FT'
      AND ht_score_home IS NOT NULL
      AND ht_score_away IS NOT NULL
      AND score_home IS NOT NULL
      AND score_away IS NOT NULL
"""


@dataclass
class TeamHtftProfile:
    """HT/FT tendency profile for a single team.

    All counters are raw totals over `matches` games; the properties turn
    them into rates, falling back to a fixed prior when there is no data.
    """

    matches: int = 0
    ht_scored: int = 0      # Matches where team scored in 1st half
    ht_conceded: int = 0    # Matches where team conceded in 1st half
    ht_leading: int = 0     # Matches where team led at HT
    ht_trailing: int = 0    # Matches where team trailed at HT
    comeback_wins: int = 0  # Trailing at HT -> Won
    goals_1h: int = 0
    goals_2h: int = 0
    conceded_1h: int = 0
    conceded_2h: int = 0

    @property
    def ht_scoring_rate(self) -> float:
        """Share of matches with a 1st-half goal scored (0.5 without data)."""
        return self.ht_scored / self.matches if self.matches > 0 else 0.5

    @property
    def ht_concede_rate(self) -> float:
        """Share of matches with a 1st-half goal conceded (0.5 without data)."""
        return self.ht_conceded / self.matches if self.matches > 0 else 0.5

    @property
    def ht_win_rate(self) -> float:
        """Share of matches led at half time (0.33 without data)."""
        return self.ht_leading / self.matches if self.matches > 0 else 0.33

    @property
    def comeback_rate(self) -> float:
        """Share of HT-trailing matches that were won (0.0 if never trailing)."""
        return self.comeback_wins / self.ht_trailing if self.ht_trailing > 0 else 0.0

    @property
    def first_half_goal_pct(self) -> float:
        """Fraction of the team's goals scored in the 1st half (0.5 without goals)."""
        total = self.goals_1h + self.goals_2h
        return self.goals_1h / total if total > 0 else 0.5

    @property
    def second_half_surge(self) -> float:
        """Ratio of 2H goals vs 1H goals. >1 means more dangerous in 2nd half.

        Returns the neutral value 1.0 whenever there are no 1st-half goals,
        regardless of how many 2nd-half goals were scored.
        """
        return self.goals_2h / self.goals_1h if self.goals_1h > 0 else 1.0


@dataclass
class LeagueHtftProfile:
    """League-level HT/FT statistics."""

    matches: int = 0
    ht_goals_total: int = 0
    ft_goals_total: int = 0
    reversals: int = 0  # Matches classified "1/2" or "2/1"
    htft_counts: Dict[str, int] = field(default_factory=dict)

    @property
    def avg_ht_goals(self) -> float:
        """Average goals per match scored by half time (1.0 without data)."""
        return self.ht_goals_total / self.matches if self.matches > 0 else 1.0

    @property
    def avg_2h_goals(self) -> float:
        """Average 2nd-half goals per match: FT average (2.5 without data) minus HT average."""
        ft = self.ft_goals_total / self.matches if self.matches > 0 else 2.5
        return ft - self.avg_ht_goals

    @property
    def reversal_rate(self) -> float:
        """Share of matches where the HT leader lost (0.05 without data)."""
        return self.reversals / self.matches if self.matches > 0 else 0.05

    @property
    def first_half_pct(self) -> float:
        """Fraction of all goals scored in the 1st half (0.44 without goals)."""
        return self.ht_goals_total / self.ft_goals_total if self.ft_goals_total > 0 else 0.44


class HtftTendencyEngine:
    """
    Computes HT/FT tendency features for a given match.

    Uses historical data from `matches` table, filtering by date to avoid
    future leakage. Features are based on team-level and league-level
    tendencies, which are DIFFERENT from the existing model features
    (ELO, form, H2H score).
    """

    def __init__(self):
        self.conn = None
        # Keys carry every argument that changes the query result, so two
        # calls with different windows never share a cached profile.
        self._team_cache: Dict[
            Tuple[str, bool, Optional[int], int], TeamHtftProfile
        ] = {}
        self._league_cache: Dict[
            Tuple[str, Optional[int], int], LeagueHtftProfile
        ] = {}

    def get_conn(self):
        """Return the engine's psycopg2 connection, (re)opening it if needed."""
        if self.conn is None or self.conn.closed:
            dsn = get_clean_dsn()
            self.conn = psycopg2.connect(dsn)
        return self.conn

    def _fetch_rows(self, query: str, params: Sequence) -> List[tuple]:
        """Run a parameterized SELECT and return all rows.

        The cursor is used as a context manager so it is closed even when
        `execute` or `fetchall` raises.
        """
        conn = self.get_conn()
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()

    @staticmethod
    def _apply_window(
        query: str,
        params: list,
        before_date: Optional[int],
        limit: int,
    ) -> str:
        """Append the date cut-off, ordering and LIMIT to `query`.

        Extends `params` in place with the matching bind values and returns
        the completed query text.
        """
        # `is not None` rather than truthiness: a timestamp of 0 is a valid
        # cut-off and must not silently disable the leakage filter.
        if before_date is not None:
            query += " AND mst_utc < %s"
            params.append(before_date)
        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(limit)
        return query

    @staticmethod
    def _build_team_profile(rows: Sequence[Tuple[int, int, int, int]]) -> TeamHtftProfile:
        """Aggregate `(ht_mine, ht_opp, ft_mine, ft_opp)` rows into a team profile."""
        profile = TeamHtftProfile(matches=len(rows))
        for ht_mine, ht_opp, ft_mine, ft_opp in rows:
            # 1st half scoring
            if ht_mine > 0:
                profile.ht_scored += 1
            if ht_opp > 0:
                profile.ht_conceded += 1

            # HT situation
            if ht_mine > ht_opp:
                profile.ht_leading += 1
            elif ht_mine < ht_opp:
                profile.ht_trailing += 1
                # Comeback: only counted for matches trailed at HT, which is
                # what keeps comeback_wins <= ht_trailing.
                if ft_mine > ft_opp:
                    profile.comeback_wins += 1

            # Goal distribution
            profile.goals_1h += ht_mine
            profile.goals_2h += ft_mine - ht_mine
            profile.conceded_1h += ht_opp
            profile.conceded_2h += ft_opp - ht_opp
        return profile

    @staticmethod
    def _build_league_profile(rows: Sequence[Tuple[int, int, int, int]]) -> LeagueHtftProfile:
        """Aggregate `(ht_home, ht_away, ft_home, ft_away)` rows into a league profile."""
        profile = LeagueHtftProfile(matches=len(rows))
        for ht_home, ht_away, ft_home, ft_away in rows:
            profile.ht_goals_total += ht_home + ht_away
            profile.ft_goals_total += ft_home + ft_away

            # Classify HT/FT: "1" home ahead, "2" away ahead, "X" level.
            ht = "1" if ht_home > ht_away else ("2" if ht_home < ht_away else "X")
            ft = "1" if ft_home > ft_away else ("2" if ft_home < ft_away else "X")
            htft = f"{ht}/{ft}"
            profile.htft_counts[htft] = profile.htft_counts.get(htft, 0) + 1

            if htft in ("1/2", "2/1"):
                profile.reversals += 1
        return profile

    def _get_team_htft_profile(
        self,
        team_id: str,
        is_home: bool,
        before_date: Optional[int] = None,
        limit: int = DEFAULT_TEAM_MATCH_LIMIT,
    ) -> TeamHtftProfile:
        """
        Compute HT/FT profile for a team from their recent matches.

        Args:
            team_id: Team ID
            is_home: True = only home matches, False = only away matches
            before_date: Only use matches before this timestamp (ms UTC)
            limit: Number of recent matches to consider

        Returns:
            The aggregated profile. Results are cached per
            (team_id, is_home, before_date, limit), and the cached object
            itself is returned on later calls.
        """
        cache_key = (team_id, is_home, before_date, limit)
        cached = self._team_cache.get(cache_key)
        if cached is not None:
            return cached

        # Columns are selected from the team's own perspective so that the
        # aggregation can treat every row as (mine, opponent).
        if is_home:
            query = """
                SELECT ht_score_home, ht_score_away, score_home, score_away
                FROM matches
                WHERE home_team_id = %s
            """
        else:
            query = """
                SELECT ht_score_away, ht_score_home, score_away, score_home
                FROM matches
                WHERE away_team_id = %s
            """
        query += _FINISHED_MATCH_FILTER

        params = [team_id]
        query = self._apply_window(query, params, before_date, limit)

        profile = self._build_team_profile(self._fetch_rows(query, params))
        self._team_cache[cache_key] = profile
        return profile

    def _get_league_htft_profile(
        self,
        league_id: str,
        before_date: Optional[int] = None,
        limit: int = DEFAULT_LEAGUE_MATCH_LIMIT,
    ) -> LeagueHtftProfile:
        """
        Compute HT/FT profile for a league.

        Args:
            league_id: League ID
            before_date: Only use matches before this timestamp (ms UTC)
            limit: Number of recent league matches to consider

        Returns:
            The aggregated profile. Results are cached per
            (league_id, before_date, limit).
        """
        cache_key = (league_id, before_date, limit)
        cached = self._league_cache.get(cache_key)
        if cached is not None:
            return cached

        query = """
            SELECT ht_score_home, ht_score_away, score_home, score_away
            FROM matches
            WHERE league_id = %s
        """
        query += _FINISHED_MATCH_FILTER

        params = [league_id]
        query = self._apply_window(query, params, before_date, limit)

        profile = self._build_league_profile(self._fetch_rows(query, params))
        self._league_cache[cache_key] = profile
        return profile

    def get_features(
        self,
        home_team_id: str,
        away_team_id: str,
        league_id: Optional[str] = None,
        before_date: Optional[int] = None,
    ) -> Dict[str, float]:
        """
        Get HT/FT tendency features for a match.

        Args:
            home_team_id: ID of the home team (profiled on home matches only)
            away_team_id: ID of the away team (profiled on away matches only)
            league_id: Optional league ID; when falsy, the league features
                take the defaults of an empty LeagueHtftProfile
            before_date: Only use matches before this timestamp (ms UTC)

        Returns:
            Dict with 17 float features keyed by `htft_*` names.
        """
        # Team profiles (home side for home team, away side for away team)
        home_prof = self._get_team_htft_profile(
            home_team_id, is_home=True, before_date=before_date
        )
        away_prof = self._get_team_htft_profile(
            away_team_id, is_home=False, before_date=before_date
        )

        # League profile
        league_prof = LeagueHtftProfile()
        if league_id:
            league_prof = self._get_league_htft_profile(
                league_id, before_date=before_date
            )

        full_window = float(DEFAULT_TEAM_MATCH_LIMIT)

        features = {
            # Home team HT/FT tendencies
            "htft_home_ht_scoring_rate": home_prof.ht_scoring_rate,
            "htft_home_ht_concede_rate": home_prof.ht_concede_rate,
            "htft_home_ht_win_rate": home_prof.ht_win_rate,
            "htft_home_comeback_rate": home_prof.comeback_rate,
            "htft_home_first_half_goal_pct": home_prof.first_half_goal_pct,
            # Surge is a ratio and can be large on small samples; cap at 3.0.
            "htft_home_second_half_surge": min(home_prof.second_half_surge, 3.0),

            # Away team HT/FT tendencies
            "htft_away_ht_scoring_rate": away_prof.ht_scoring_rate,
            "htft_away_ht_concede_rate": away_prof.ht_concede_rate,
            "htft_away_ht_win_rate": away_prof.ht_win_rate,
            "htft_away_comeback_rate": away_prof.comeback_rate,
            "htft_away_first_half_goal_pct": away_prof.first_half_goal_pct,
            "htft_away_second_half_surge": min(away_prof.second_half_surge, 3.0),

            # League-level
            "htft_league_avg_ht_goals": league_prof.avg_ht_goals,
            "htft_league_reversal_rate": league_prof.reversal_rate,
            "htft_league_first_half_pct": league_prof.first_half_pct,

            # Data quality (how many matches we have for these features)
            "htft_home_sample_size": min(home_prof.matches / full_window, 1.0),
            "htft_away_sample_size": min(away_prof.matches / full_window, 1.0),
        }

        return features

    def clear_cache(self):
        """Clear internal caches (useful between batches)."""
        self._team_cache.clear()
        self._league_cache.clear()


# Singleton
_engine = None


def get_htft_tendency_engine() -> HtftTendencyEngine:
    """Return the process-wide engine instance, creating it on first use."""
    global _engine
    if _engine is None:
        _engine = HtftTendencyEngine()
    return _engine


# ── Test ─────────────────────────────────────────────────────────────────────
def _demo() -> None:
    """Print the feature dict for the three most recent finished matches."""
    engine = get_htft_tendency_engine()

    conn = engine.get_conn()
    with conn.cursor() as cur:
        cur.execute("""
            SELECT home_team_id, away_team_id, league_id, mst_utc, match_name
            FROM matches
            WHERE sport = 'football' AND status = 'FT'
              AND home_team_id IS NOT NULL AND away_team_id IS NOT NULL
            ORDER BY mst_utc DESC
            LIMIT 3
        """)
        matches = cur.fetchall()

    for hid, aid, lid, mst, name in matches:
        print(f"\n🏟️ {name}")
        features = engine.get_features(hid, aid, lid, mst)
        for k, v in sorted(features.items()):
            print(f" {k}: {v:.4f}")


if __name__ == "__main__":
    _demo()