344 lines
11 KiB
Python
344 lines
11 KiB
Python
"""
|
|
HT/FT Tendency Feature Engine
|
|
================================
|
|
Produces team-level HT/FT tendency features for match prediction.
|
|
|
|
Computes ~15 features per match based on historical data:
|
|
- 1st half scoring/conceding rates
|
|
- Comeback rates
|
|
- Half-specific goal distribution
|
|
- League-level HT/FT profiles
|
|
|
|
All features are computed from the `matches` table using only data
|
|
BEFORE the match date (no future leakage).
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from typing import Dict, Optional, Tuple
|
|
from dataclasses import dataclass, field
|
|
from data.db import get_clean_dsn
|
|
import psycopg2
|
|
|
|
|
|
@dataclass
class TeamHtftProfile:
    """Half-time/full-time counters for one team over a sample of matches.

    The integer fields are raw tallies; the properties turn them into rates.
    Each property falls back to a fixed neutral value when its denominator
    is not positive, so an empty profile still yields usable numbers.
    """

    matches: int = 0
    ht_scored: int = 0      # matches in which the team scored before half-time
    ht_conceded: int = 0    # matches in which the team conceded before half-time
    ht_leading: int = 0     # matches in which the team was ahead at half-time
    ht_trailing: int = 0    # matches in which the team was behind at half-time
    comeback_wins: int = 0  # behind at half-time, yet won at full-time
    goals_1h: int = 0
    goals_2h: int = 0
    conceded_1h: int = 0
    conceded_2h: int = 0

    @staticmethod
    def _ratio(numerator, denominator, fallback):
        """Return numerator / denominator, or `fallback` if denominator <= 0."""
        if denominator > 0:
            return numerator / denominator
        return fallback

    @property
    def ht_scoring_rate(self):
        """Fraction of matches with a 1st-half goal scored (0.5 if no matches)."""
        return self._ratio(self.ht_scored, self.matches, 0.5)

    @property
    def ht_concede_rate(self):
        """Fraction of matches with a 1st-half goal conceded (0.5 if no matches)."""
        return self._ratio(self.ht_conceded, self.matches, 0.5)

    @property
    def ht_win_rate(self):
        """Fraction of matches led at half-time (0.33 if no matches)."""
        return self._ratio(self.ht_leading, self.matches, 0.33)

    @property
    def comeback_rate(self):
        """Comeback wins per match trailed at half-time (0.0 if never trailing)."""
        return self._ratio(self.comeback_wins, self.ht_trailing, 0.0)

    @property
    def first_half_goal_pct(self):
        """Share of the team's goals scored in the 1st half (0.5 if no goals)."""
        scored_total = self.goals_1h + self.goals_2h
        return self._ratio(self.goals_1h, scored_total, 0.5)

    @property
    def second_half_surge(self):
        """Ratio of 2H goals vs 1H goals. >1 means more dangerous in 2nd half.

        Evaluates to 1.0 when no 1st-half goals were recorded.
        """
        return self._ratio(self.goals_2h, self.goals_1h, 1.0)
|
|
|
|
|
|
@dataclass
class LeagueHtftProfile:
    """Half-time/full-time totals for one league over a sample of matches.

    With no matches recorded, the properties return fixed default values
    instead of dividing by zero.
    """

    matches: int = 0
    ht_goals_total: int = 0   # goals by both sides up to half-time
    ft_goals_total: int = 0   # goals by both sides at full-time
    reversals: int = 0        # matches whose HT/FT code is "1/2" or "2/1"
    htft_counts: Dict[str, int] = field(default_factory=dict)  # "HT/FT" code -> count

    def _per_match(self, total, fallback):
        """Return `total` averaged over `matches`, or `fallback` without matches."""
        if self.matches > 0:
            return total / self.matches
        return fallback

    @property
    def avg_ht_goals(self):
        """Mean 1st-half goals per match (1.0 if no matches)."""
        return self._per_match(self.ht_goals_total, 1.0)

    @property
    def avg_2h_goals(self):
        """Mean 2nd-half goals per match: full-time mean minus half-time mean.

        The full-time mean defaults to 2.5 when there are no matches.
        """
        full_time_mean = self._per_match(self.ft_goals_total, 2.5)
        return full_time_mean - self.avg_ht_goals

    @property
    def reversal_rate(self):
        """Fraction of matches counted as reversals (0.05 if no matches)."""
        return self._per_match(self.reversals, 0.05)

    @property
    def first_half_pct(self):
        """Share of all goals scored before half-time (0.44 if no goals)."""
        if self.ft_goals_total > 0:
            return self.ht_goals_total / self.ft_goals_total
        return 0.44
|
|
|
|
|
|
class HtftTendencyEngine:
    """
    Computes HT/FT tendency features for a given match.

    Uses historical data from the `matches` table, filtering by date to
    avoid future leakage.

    Features are based on team-level and league-level tendencies, which
    are DIFFERENT from the existing model features (ELO, form, H2H score).

    Profiles are memoised on the instance, keyed by every argument that
    influences the query. Use `clear_cache()` to drop them.
    """

    # Default number of recent matches per team profile. It is also the
    # denominator used to normalise the `*_sample_size` features.
    TEAM_MATCH_WINDOW = 30

    # Number of most recent league matches aggregated into a league profile.
    LEAGUE_MATCH_WINDOW = 500

    # Conditions shared by every query: finished football matches with both
    # half-time and full-time scores present. The full-time NULL checks keep
    # rows out that would otherwise raise TypeError during aggregation.
    _FINISHED_FILTER = (
        " AND sport = 'football'"
        " AND status = 'FT'"
        " AND ht_score_home IS NOT NULL"
        " AND ht_score_away IS NOT NULL"
        " AND score_home IS NOT NULL"
        " AND score_away IS NOT NULL"
    )

    def __init__(self):
        # The connection is opened lazily by get_conn().
        self.conn = None
        # Key: (team_id, is_home, before_date, limit)
        self._team_cache: Dict[Tuple[str, bool, Optional[int], int], TeamHtftProfile] = {}
        # Key: (league_id, before_date)
        self._league_cache: Dict[Tuple[str, Optional[int]], LeagueHtftProfile] = {}

    def get_conn(self):
        """Return the engine's connection, (re)opening it if missing or closed."""
        if self.conn is None or self.conn.closed:
            dsn = get_clean_dsn()
            self.conn = psycopg2.connect(dsn)
        return self.conn

    def _fetch_rows(self, query: str, params: list) -> list:
        """Run a SELECT on the engine connection and return all rows.

        The cursor is closed even when the query fails. After a database
        error the transaction is rolled back, so the cached connection stays
        usable for later calls; the error is then re-raised.
        """
        conn = self.get_conn()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                return cur.fetchall()
        except psycopg2.Error:
            # Without a rollback every later statement on this connection
            # would fail with "current transaction is aborted".
            if not conn.closed:
                conn.rollback()
            raise

    @staticmethod
    def _build_team_profile(rows) -> TeamHtftProfile:
        """Fold (ht_mine, ht_opp, ft_mine, ft_opp) rows into a TeamHtftProfile."""
        profile = TeamHtftProfile(matches=len(rows))

        for ht_mine, ht_opp, ft_mine, ft_opp in rows:
            # 1st half scoring
            if ht_mine > 0:
                profile.ht_scored += 1
            if ht_opp > 0:
                profile.ht_conceded += 1

            # HT situation
            if ht_mine > ht_opp:
                profile.ht_leading += 1
            elif ht_mine < ht_opp:
                profile.ht_trailing += 1
                # A comeback only counts for matches trailed at half-time,
                # because comeback_rate divides by ht_trailing.
                if ft_mine > ft_opp:
                    profile.comeback_wins += 1

            # Goal distribution (2nd half = full-time minus half-time)
            profile.goals_1h += ht_mine
            profile.goals_2h += ft_mine - ht_mine
            profile.conceded_1h += ht_opp
            profile.conceded_2h += ft_opp - ht_opp

        return profile

    @staticmethod
    def _build_league_profile(rows) -> LeagueHtftProfile:
        """Fold (ht_home, ht_away, ft_home, ft_away) rows into a LeagueHtftProfile."""
        profile = LeagueHtftProfile(matches=len(rows))

        for ht_home, ht_away, ft_home, ft_away in rows:
            profile.ht_goals_total += ht_home + ht_away
            profile.ft_goals_total += ft_home + ft_away

            # Classify HT/FT: "1" home ahead, "2" away ahead, "X" level.
            ht = "1" if ht_home > ht_away else ("2" if ht_home < ht_away else "X")
            ft = "1" if ft_home > ft_away else ("2" if ft_home < ft_away else "X")
            htft = f"{ht}/{ft}"

            profile.htft_counts[htft] = profile.htft_counts.get(htft, 0) + 1
            if htft in ("1/2", "2/1"):
                profile.reversals += 1

        return profile

    def _get_team_htft_profile(
        self,
        team_id: str,
        is_home: bool,
        before_date: Optional[int] = None,
        limit: int = TEAM_MATCH_WINDOW,
    ) -> TeamHtftProfile:
        """
        Compute HT/FT profile for a team from their recent matches.

        Args:
            team_id: Team ID
            is_home: True = only home matches, False = only away matches
            before_date: Only use matches whose `mst_utc` is strictly below
                this timestamp (ms UTC). None disables the filter; 0 is a
                real bound and is NOT treated as "no filter".
            limit: Number of recent matches to consider

        Returns:
            The profile, seen from the team's own point of view. It is
            cached per (team_id, is_home, before_date, limit).
        """
        # `limit` is part of the key: different windows give different profiles.
        cache_key = (team_id, is_home, before_date, limit)
        cached = self._team_cache.get(cache_key)
        if cached is not None:
            return cached

        # Order the columns so rows always read (mine, opponent, mine, opponent).
        # Both fragments are fixed literals, never caller input.
        if is_home:
            columns = "ht_score_home, ht_score_away, score_home, score_away"
            team_column = "home_team_id"
        else:
            columns = "ht_score_away, ht_score_home, score_away, score_home"
            team_column = "away_team_id"

        query = (
            f"SELECT {columns} FROM matches WHERE {team_column} = %s"
            + self._FINISHED_FILTER
        )
        params = [team_id]

        if before_date is not None:
            query += " AND mst_utc < %s"
            params.append(before_date)

        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(limit)

        profile = self._build_team_profile(self._fetch_rows(query, params))
        self._team_cache[cache_key] = profile
        return profile

    def _get_league_htft_profile(
        self,
        league_id: str,
        before_date: Optional[int] = None,
    ) -> LeagueHtftProfile:
        """Compute HT/FT profile for a league.

        Aggregates the LEAGUE_MATCH_WINDOW most recent finished matches of
        the league.

        Args:
            league_id: League ID
            before_date: Only use matches whose `mst_utc` is strictly below
                this timestamp. None disables the filter.

        Returns:
            The profile, cached per (league_id, before_date).
        """
        cache_key = (league_id, before_date)
        cached = self._league_cache.get(cache_key)
        if cached is not None:
            return cached

        query = (
            "SELECT ht_score_home, ht_score_away, score_home, score_away"
            " FROM matches WHERE league_id = %s"
            + self._FINISHED_FILTER
        )
        params = [league_id]

        if before_date is not None:
            query += " AND mst_utc < %s"
            params.append(before_date)

        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(self.LEAGUE_MATCH_WINDOW)

        profile = self._build_league_profile(self._fetch_rows(query, params))
        self._league_cache[cache_key] = profile
        return profile

    def get_features(
        self,
        home_team_id: str,
        away_team_id: str,
        league_id: Optional[str] = None,
        before_date: Optional[int] = None,
    ) -> Dict[str, float]:
        """
        Get HT/FT tendency features for a match.

        Args:
            home_team_id: Home team ID; profiled on its home matches only.
            away_team_id: Away team ID; profiled on its away matches only.
            league_id: Optional league ID. When falsy, the league features
                take the defaults of an empty LeagueHtftProfile.
            before_date: Upper bound (exclusive) on `mst_utc` for all
                historical lookups.

        Returns dict with ~15 features.
        """
        # Team profiles (home side for home team, away side for away team)
        home_prof = self._get_team_htft_profile(home_team_id, is_home=True, before_date=before_date)
        away_prof = self._get_team_htft_profile(away_team_id, is_home=False, before_date=before_date)

        # League profile
        league_prof = LeagueHtftProfile()
        if league_id:
            league_prof = self._get_league_htft_profile(league_id, before_date=before_date)

        window = float(self.TEAM_MATCH_WINDOW)

        features = {
            # Home team HT/FT tendencies
            "htft_home_ht_scoring_rate": home_prof.ht_scoring_rate,
            "htft_home_ht_concede_rate": home_prof.ht_concede_rate,
            "htft_home_ht_win_rate": home_prof.ht_win_rate,
            "htft_home_comeback_rate": home_prof.comeback_rate,
            "htft_home_first_half_goal_pct": home_prof.first_half_goal_pct,
            # Surge is capped at 3.0 so a few 1st-half goals cannot blow it up.
            "htft_home_second_half_surge": min(home_prof.second_half_surge, 3.0),

            # Away team HT/FT tendencies
            "htft_away_ht_scoring_rate": away_prof.ht_scoring_rate,
            "htft_away_ht_concede_rate": away_prof.ht_concede_rate,
            "htft_away_ht_win_rate": away_prof.ht_win_rate,
            "htft_away_comeback_rate": away_prof.comeback_rate,
            "htft_away_first_half_goal_pct": away_prof.first_half_goal_pct,
            "htft_away_second_half_surge": min(away_prof.second_half_surge, 3.0),

            # League-level
            "htft_league_avg_ht_goals": league_prof.avg_ht_goals,
            "htft_league_reversal_rate": league_prof.reversal_rate,
            "htft_league_first_half_pct": league_prof.first_half_pct,

            # Data quality (how many matches we have for these features)
            "htft_home_sample_size": min(home_prof.matches / window, 1.0),
            "htft_away_sample_size": min(away_prof.matches / window, 1.0),
        }

        return features

    def clear_cache(self):
        """Clear internal caches (useful between batches)."""
        self._team_cache.clear()
        self._league_cache.clear()
|
|
|
|
|
|
# Singleton
_engine = None


def get_htft_tendency_engine() -> HtftTendencyEngine:
    """Return the module-level HtftTendencyEngine, creating it on first use."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = HtftTendencyEngine()
    return _engine
|
|
|
|
|
|
# ── Test ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: print the features for the 3 most recent finished
    # football matches that have both team ids.
    engine = get_htft_tendency_engine()
    conn = engine.get_conn()

    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT home_team_id, away_team_id, league_id, mst_utc, match_name
                FROM matches
                WHERE sport = 'football' AND status = 'FT'
                  AND home_team_id IS NOT NULL AND away_team_id IS NOT NULL
                ORDER BY mst_utc DESC LIMIT 3
            """)
            matches = cur.fetchall()

        for hid, aid, lid, mst, name in matches:
            print(f"\n🏟️ {name}")
            # The match's own kickoff time is the cut-off, so the match itself
            # is excluded by the strict `mst_utc < before_date` filter.
            features = engine.get_features(hid, aid, lid, mst)
            for k, v in sorted(features.items()):
                print(f" {k}: {v:.4f}")
    finally:
        # Release the database connection once the script is done.
        if engine.conn is not None:
            engine.conn.close()
|