This commit is contained in:
@@ -0,0 +1,343 @@
|
||||
"""
|
||||
HT/FT Tendency Feature Engine
|
||||
================================
|
||||
Produces team-level HT/FT tendency features for match prediction.
|
||||
|
||||
Computes ~15 features per match based on historical data:
|
||||
- 1st half scoring/conceding rates
|
||||
- Comeback rates
|
||||
- Half-specific goal distribution
|
||||
- League-level HT/FT profiles
|
||||
|
||||
All features are computed from the `matches` table using only data
|
||||
BEFORE the match date (no future leakage).
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from typing import Dict, Optional, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
from data.db import get_clean_dsn
|
||||
import psycopg2
|
||||
|
||||
|
||||
@dataclass
class TeamHtftProfile:
    """Aggregated half-time/full-time counters for one team.

    All fields are plain counters accumulated over a sample of matches.
    The properties turn them into rates and fall back to a fixed neutral
    value whenever the relevant denominator is zero.
    """

    matches: int = 0        # Number of matches in the sample
    ht_scored: int = 0      # Matches where team scored in 1st half
    ht_conceded: int = 0    # Matches where team conceded in 1st half
    ht_leading: int = 0     # Matches where team led at HT
    ht_trailing: int = 0    # Matches where team trailed at HT
    comeback_wins: int = 0  # Trailing at HT -> Won
    goals_1h: int = 0       # Goals scored in 1st halves
    goals_2h: int = 0       # Goals scored in 2nd halves
    conceded_1h: int = 0    # Goals conceded in 1st halves
    conceded_2h: int = 0    # Goals conceded in 2nd halves

    @property
    def ht_scoring_rate(self) -> float:
        """Share of matches with a 1st-half goal scored (0.5 with no matches)."""
        if self.matches > 0:
            return self.ht_scored / self.matches
        return 0.5

    @property
    def ht_concede_rate(self) -> float:
        """Share of matches with a 1st-half goal conceded (0.5 with no matches)."""
        if self.matches > 0:
            return self.ht_conceded / self.matches
        return 0.5

    @property
    def ht_win_rate(self) -> float:
        """Share of matches led at half time (0.33 with no matches)."""
        if self.matches > 0:
            return self.ht_leading / self.matches
        return 0.33

    @property
    def comeback_rate(self) -> float:
        """Share of HT deficits turned into wins (0.0 if never trailing)."""
        if self.ht_trailing > 0:
            return self.comeback_wins / self.ht_trailing
        return 0.0

    @property
    def first_half_goal_pct(self) -> float:
        """Fraction of the team's goals scored before the break (0.5 if none)."""
        scored = self.goals_1h + self.goals_2h
        if scored > 0:
            return self.goals_1h / scored
        return 0.5

    @property
    def second_half_surge(self) -> float:
        """Ratio of 2H goals vs 1H goals. >1 means more dangerous in 2nd half.

        NOTE(review): with zero 1st-half goals this returns the neutral 1.0
        even when 2nd-half goals exist; confirm that this is intended.
        """
        if self.goals_1h > 0:
            return self.goals_2h / self.goals_1h
        return 1.0
|
||||
|
||||
|
||||
@dataclass
class LeagueHtftProfile:
    """Aggregated half-time/full-time counters for one league.

    The properties derive per-match averages and fall back to fixed
    league-agnostic defaults when there is no data to divide by.
    """

    matches: int = 0         # Number of matches in the sample
    ht_goals_total: int = 0  # Goals (both sides) scored by half time
    ft_goals_total: int = 0  # Goals (both sides) scored by full time
    reversals: int = 0       # Count of HT/FT reversal outcomes
    # Occurrences per HT/FT outcome label
    htft_counts: Dict[str, int] = field(default_factory=dict)

    @property
    def avg_ht_goals(self) -> float:
        """Mean 1st-half goals per match (1.0 with no matches)."""
        if self.matches > 0:
            return self.ht_goals_total / self.matches
        return 1.0

    @property
    def avg_2h_goals(self) -> float:
        """Mean 2nd-half goals per match: average FT goals minus average HT goals.

        The FT average defaults to 2.5 when there are no matches.
        """
        if self.matches > 0:
            avg_ft = self.ft_goals_total / self.matches
        else:
            avg_ft = 2.5
        return avg_ft - self.avg_ht_goals

    @property
    def reversal_rate(self) -> float:
        """Share of matches counted as reversals (0.05 with no matches)."""
        if self.matches > 0:
            return self.reversals / self.matches
        return 0.05

    @property
    def first_half_pct(self) -> float:
        """Fraction of all goals scored before the break (0.44 with no goals)."""
        if self.ft_goals_total > 0:
            return self.ht_goals_total / self.ft_goals_total
        return 0.44
|
||||
|
||||
|
||||
class HtftTendencyEngine:
    """
    Computes HT/FT tendency features for a given match.

    Uses historical data from the `matches` table, filtering by date to
    avoid future leakage.

    Features are based on team-level and league-level tendencies, which
    are DIFFERENT from the existing model features (ELO, form, H2H score).

    Profiles are memoised per engine instance; call `clear_cache()` between
    batches to keep the caches from growing without bound.
    """

    # Default number of recent matches in a team profile; also the
    # denominator used to normalise the `*_sample_size` features.
    DEFAULT_TEAM_LIMIT = 30
    # Number of most recent league matches used for a league profile.
    LEAGUE_MATCH_LIMIT = 500
    # Upper bound applied to the `second_half_surge` features.
    MAX_SECOND_HALF_SURGE = 3.0

    # Filter shared by every profile query. Full-time scores are required to
    # be non-NULL as well, because the aggregation compares and subtracts them.
    _FINISHED_FILTER = """
          AND sport = 'football'
          AND status = 'FT'
          AND ht_score_home IS NOT NULL
          AND ht_score_away IS NOT NULL
          AND score_home IS NOT NULL
          AND score_away IS NOT NULL
    """

    def __init__(self):
        self.conn = None
        # Keyed by (team_id, is_home, before_date, limit).
        self._team_cache: Dict[
            Tuple[str, bool, Optional[int], int], TeamHtftProfile
        ] = {}
        # Keyed by (league_id, before_date).
        self._league_cache: Dict[
            Tuple[str, Optional[int]], LeagueHtftProfile
        ] = {}

    def get_conn(self):
        """Return the shared DB connection, (re)connecting if missing or closed."""
        if self.conn is None or self.conn.closed:
            dsn = get_clean_dsn()
            self.conn = psycopg2.connect(dsn)
        return self.conn

    def close(self):
        """Close the DB connection if one is open; `get_conn()` reconnects lazily."""
        if self.conn is not None and not self.conn.closed:
            self.conn.close()
        self.conn = None

    def _fetch_rows(self, query: str, params: list) -> list:
        """Run a SELECT on the shared connection and return all rows."""
        conn = self.get_conn()
        try:
            # The context manager closes the cursor even if execute() raises.
            with conn.cursor() as cur:
                cur.execute(query, params)
                return cur.fetchall()
        except Exception:
            # A failed statement leaves the transaction aborted; roll back so
            # the shared connection stays usable for later calls.
            if not conn.closed:
                conn.rollback()
            raise

    @staticmethod
    def _outcome(home_goals: int, away_goals: int) -> str:
        """Classify a score as "1" (home ahead), "2" (away ahead) or "X" (level)."""
        if home_goals > away_goals:
            return "1"
        if home_goals < away_goals:
            return "2"
        return "X"

    def _get_team_htft_profile(
        self,
        team_id: str,
        is_home: bool,
        before_date: Optional[int] = None,
        limit: int = DEFAULT_TEAM_LIMIT,
    ) -> TeamHtftProfile:
        """
        Compute HT/FT profile for a team from their recent matches.

        Args:
            team_id: Team ID
            is_home: True = only home matches, False = only away matches
            before_date: Only use matches before this timestamp (ms UTC);
                None disables the cutoff
            limit: Number of recent matches to consider

        Returns:
            The (cached) profile; a profile with zero matches when no rows match.
        """
        # `limit` is part of the key so that different window sizes never
        # share a cached profile.
        cache_key = (team_id, is_home, before_date, limit)
        cached = self._team_cache.get(cache_key)
        if cached is not None:
            return cached

        # Columns are selected as (mine, opponent) so the aggregation below
        # is identical for home and away perspectives.
        if is_home:
            query = (
                "SELECT ht_score_home, ht_score_away, score_home, score_away"
                " FROM matches WHERE home_team_id = %s"
            )
        else:
            query = (
                "SELECT ht_score_away, ht_score_home, score_away, score_home"
                " FROM matches WHERE away_team_id = %s"
            )
        query += self._FINISHED_FILTER

        params = [team_id]
        # Explicit None check: a timestamp of 0 is still a valid cutoff.
        if before_date is not None:
            query += " AND mst_utc < %s"
            params.append(before_date)
        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(limit)

        rows = self._fetch_rows(query, params)

        profile = TeamHtftProfile(matches=len(rows))
        for ht_mine, ht_opp, ft_mine, ft_opp in rows:
            # 1st half scoring
            if ht_mine > 0:
                profile.ht_scored += 1
            if ht_opp > 0:
                profile.ht_conceded += 1

            # HT situation
            if ht_mine > ht_opp:
                profile.ht_leading += 1
            elif ht_mine < ht_opp:
                profile.ht_trailing += 1
                # Comeback: trailing at the break but ahead at full time
                if ft_mine > ft_opp:
                    profile.comeback_wins += 1

            # Goal distribution (2nd half = full time minus half time)
            profile.goals_1h += ht_mine
            profile.goals_2h += ft_mine - ht_mine
            profile.conceded_1h += ht_opp
            profile.conceded_2h += ft_opp - ht_opp

        self._team_cache[cache_key] = profile
        return profile

    def _get_league_htft_profile(
        self,
        league_id: str,
        before_date: Optional[int] = None,
    ) -> LeagueHtftProfile:
        """
        Compute HT/FT profile for a league from its most recent matches.

        Args:
            league_id: League ID
            before_date: Only use matches before this timestamp; None
                disables the cutoff

        Returns:
            The (cached) profile built from at most `LEAGUE_MATCH_LIMIT` matches.
        """
        cache_key = (league_id, before_date)
        cached = self._league_cache.get(cache_key)
        if cached is not None:
            return cached

        query = (
            "SELECT ht_score_home, ht_score_away, score_home, score_away"
            " FROM matches WHERE league_id = %s"
        ) + self._FINISHED_FILTER
        params = [league_id]
        if before_date is not None:
            query += " AND mst_utc < %s"
            params.append(before_date)
        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(self.LEAGUE_MATCH_LIMIT)

        rows = self._fetch_rows(query, params)

        profile = LeagueHtftProfile(matches=len(rows))
        for hth, hta, sh, sa in rows:
            profile.ht_goals_total += hth + hta
            profile.ft_goals_total += sh + sa

            # Classify HT/FT, e.g. "1/X" = home led at HT, draw at FT
            htft = f"{self._outcome(hth, hta)}/{self._outcome(sh, sa)}"
            profile.htft_counts[htft] = profile.htft_counts.get(htft, 0) + 1
            # A reversal is a side leading at HT and losing at FT
            if htft in ("1/2", "2/1"):
                profile.reversals += 1

        self._league_cache[cache_key] = profile
        return profile

    def get_features(
        self,
        home_team_id: str,
        away_team_id: str,
        league_id: Optional[str] = None,
        before_date: Optional[int] = None,
    ) -> Dict[str, float]:
        """
        Get HT/FT tendency features for a match.

        Args:
            home_team_id: Home team ID (profiled on its home matches only)
            away_team_id: Away team ID (profiled on its away matches only)
            league_id: League ID; when falsy, league features use the
                defaults of an empty `LeagueHtftProfile`
            before_date: Only use matches before this timestamp

        Returns:
            Dict with 17 features: 6 per team, 3 league-level and 2
            sample-size indicators.
        """
        # Team profiles (home side for home team, away side for away team)
        home_prof = self._get_team_htft_profile(
            home_team_id, is_home=True, before_date=before_date
        )
        away_prof = self._get_team_htft_profile(
            away_team_id, is_home=False, before_date=before_date
        )

        # League profile
        if league_id:
            league_prof = self._get_league_htft_profile(
                league_id, before_date=before_date
            )
        else:
            league_prof = LeagueHtftProfile()

        surge_cap = self.MAX_SECOND_HALF_SURGE
        full_sample = float(self.DEFAULT_TEAM_LIMIT)

        return {
            # Home team HT/FT tendencies
            "htft_home_ht_scoring_rate": home_prof.ht_scoring_rate,
            "htft_home_ht_concede_rate": home_prof.ht_concede_rate,
            "htft_home_ht_win_rate": home_prof.ht_win_rate,
            "htft_home_comeback_rate": home_prof.comeback_rate,
            "htft_home_first_half_goal_pct": home_prof.first_half_goal_pct,
            "htft_home_second_half_surge": min(home_prof.second_half_surge, surge_cap),

            # Away team HT/FT tendencies
            "htft_away_ht_scoring_rate": away_prof.ht_scoring_rate,
            "htft_away_ht_concede_rate": away_prof.ht_concede_rate,
            "htft_away_ht_win_rate": away_prof.ht_win_rate,
            "htft_away_comeback_rate": away_prof.comeback_rate,
            "htft_away_first_half_goal_pct": away_prof.first_half_goal_pct,
            "htft_away_second_half_surge": min(away_prof.second_half_surge, surge_cap),

            # League-level
            "htft_league_avg_ht_goals": league_prof.avg_ht_goals,
            "htft_league_reversal_rate": league_prof.reversal_rate,
            "htft_league_first_half_pct": league_prof.first_half_pct,

            # Data quality (how many matches we have for these features)
            "htft_home_sample_size": min(home_prof.matches / full_sample, 1.0),
            "htft_away_sample_size": min(away_prof.matches / full_sample, 1.0),
        }

    def clear_cache(self):
        """Clear internal caches (useful between batches)."""
        self._team_cache.clear()
        self._league_cache.clear()
|
||||
|
||||
|
||||
# Module-wide engine instance, created lazily on first request.
_engine = None


def get_htft_tendency_engine() -> HtftTendencyEngine:
    """Return the shared `HtftTendencyEngine`, building it on the first call."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = HtftTendencyEngine()
    return _engine
|
||||
|
||||
|
||||
# ── Test ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: print the feature dict for the three most recent
    # finished football matches.
    engine = get_htft_tendency_engine()

    conn = engine.get_conn()
    try:
        # The context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT home_team_id, away_team_id, league_id, mst_utc, match_name
                FROM matches
                WHERE sport = 'football' AND status = 'FT'
                  AND home_team_id IS NOT NULL AND away_team_id IS NOT NULL
                ORDER BY mst_utc DESC LIMIT 3
            """)
            matches = cur.fetchall()

        for hid, aid, lid, mst, name in matches:
            print(f"\n🏟️ {name}")
            # The match's own start time is the cutoff, so the match itself
            # is excluded from its features.
            features = engine.get_features(hid, aid, lid, mst)
            for k, v in sorted(features.items()):
                print(f" {k}: {v:.4f}")
    finally:
        # Release the DB connection when the script ends, even on error.
        conn.close()
|
||||
Reference in New Issue
Block a user