Files
iddaai-be/ai-engine/features/htft_tendency_engine.py
T
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

344 lines
11 KiB
Python

"""
HT/FT Tendency Feature Engine
================================
Produces team-level HT/FT tendency features for match prediction.
Computes 17 features per match (including two sample-size indicators) based on historical data:
- 1st half scoring/conceding rates
- Comeback rates
- Half-specific goal distribution
- League-level HT/FT profiles
All features are computed from the `matches` table using only data
BEFORE the match date (no future leakage).
"""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from typing import Dict, Optional, Tuple
from dataclasses import dataclass, field
from data.db import get_clean_dsn
import psycopg2
@dataclass
class TeamHtftProfile:
    """Half-time / full-time counters for one team plus derived rates.

    The integer fields are raw tallies over a set of matches; the
    properties turn them into ratios and fall back to a fixed default
    whenever the relevant denominator is not positive.
    """

    matches: int = 0
    ht_scored: int = 0      # Matches where the team scored in the 1st half
    ht_conceded: int = 0    # Matches where the team conceded in the 1st half
    ht_leading: int = 0     # Matches where the team led at half-time
    ht_trailing: int = 0    # Matches where the team trailed at half-time
    comeback_wins: int = 0  # Trailing at half-time -> won the match
    goals_1h: int = 0
    goals_2h: int = 0
    conceded_1h: int = 0
    conceded_2h: int = 0

    @staticmethod
    def _ratio(numerator, denominator, fallback):
        """Return numerator / denominator, or `fallback` if denominator <= 0."""
        if denominator > 0:
            return numerator / denominator
        return fallback

    @property
    def ht_scoring_rate(self) -> float:
        """Share of matches with a first-half goal scored (0.5 without data)."""
        return self._ratio(self.ht_scored, self.matches, 0.5)

    @property
    def ht_concede_rate(self) -> float:
        """Share of matches with a first-half goal conceded (0.5 without data)."""
        return self._ratio(self.ht_conceded, self.matches, 0.5)

    @property
    def ht_win_rate(self) -> float:
        """Share of matches led at half-time (0.33 without data)."""
        return self._ratio(self.ht_leading, self.matches, 0.33)

    @property
    def comeback_rate(self) -> float:
        """Comeback wins per match trailed at half-time (0.0 if never trailing)."""
        return self._ratio(self.comeback_wins, self.ht_trailing, 0.0)

    @property
    def first_half_goal_pct(self) -> float:
        """Fraction of the team's goals scored in the 1st half (0.5 if no goals)."""
        goals_scored = self.goals_1h + self.goals_2h
        return self._ratio(self.goals_1h, goals_scored, 0.5)

    @property
    def second_half_surge(self) -> float:
        """Ratio of 2H goals vs 1H goals. >1 means more dangerous in 2nd half.

        Falls back to 1.0 when no first-half goals were recorded.
        """
        return self._ratio(self.goals_2h, self.goals_1h, 1.0)
@dataclass
class LeagueHtftProfile:
    """Half-time / full-time tallies for one league plus derived averages.

    Every property falls back to a fixed default when its denominator
    is not positive.
    """

    matches: int = 0
    ht_goals_total: int = 0   # Goals by both sides up to half-time
    ft_goals_total: int = 0   # Goals by both sides over the full match
    reversals: int = 0        # Matches counted as HT/FT reversals
    htft_counts: Dict[str, int] = field(default_factory=dict)  # per-outcome tallies

    @property
    def avg_ht_goals(self) -> float:
        """Mean first-half goals per match (1.0 without data)."""
        if self.matches > 0:
            return self.ht_goals_total / self.matches
        return 1.0

    @property
    def avg_2h_goals(self) -> float:
        """Mean full-time goals (2.5 without data) minus `avg_ht_goals`."""
        if self.matches > 0:
            avg_full_time = self.ft_goals_total / self.matches
        else:
            avg_full_time = 2.5
        return avg_full_time - self.avg_ht_goals

    @property
    def reversal_rate(self) -> float:
        """Reversals per match (0.05 without data)."""
        if self.matches > 0:
            return self.reversals / self.matches
        return 0.05

    @property
    def first_half_pct(self) -> float:
        """Fraction of all goals scored before half-time (0.44 if no goals)."""
        if self.ft_goals_total > 0:
            return self.ht_goals_total / self.ft_goals_total
        return 0.44
class HtftTendencyEngine:
    """
    Computes HT/FT tendency features for a given match.

    Uses historical data from the `matches` table, filtering by date to
    avoid future leakage.

    Features are based on team-level and league-level tendencies, which
    are DIFFERENT from the existing model features (ELO, form, H2H score).

    Computed profiles are memoised on the instance; call `clear_cache`
    between batches to drop them.
    """

    # Default number of recent matches used for a team profile. Also the
    # denominator that normalises the sample-size features.
    DEFAULT_TEAM_LIMIT = 30
    # Default number of recent matches used for a league profile.
    DEFAULT_LEAGUE_LIMIT = 500

    def __init__(self):
        self.conn = None
        # Keyed by (team_id, is_home, before_date, limit).
        self._team_cache: Dict[
            Tuple[str, bool, Optional[int], int], TeamHtftProfile
        ] = {}
        # Keyed by (league_id, before_date, limit).
        self._league_cache: Dict[
            Tuple[str, Optional[int], int], LeagueHtftProfile
        ] = {}

    def get_conn(self):
        """Return the database connection, (re)connecting if missing or closed."""
        if self.conn is None or self.conn.closed:
            dsn = get_clean_dsn()
            self.conn = psycopg2.connect(dsn)
        return self.conn

    def _fetch_rows(self, query: str, params: list) -> list:
        """Run a parameterised SELECT and return all rows.

        The cursor is always closed. If the statement fails, the
        transaction is rolled back before re-raising so that the shared
        connection is not left in an aborted state for later calls.
        """
        conn = self.get_conn()
        try:
            with conn.cursor() as cur:
                cur.execute(query, params)
                return cur.fetchall()
        except psycopg2.Error:
            if not conn.closed:
                conn.rollback()
            raise

    def _get_team_htft_profile(
        self,
        team_id: str,
        is_home: bool,
        before_date: Optional[int] = None,
        limit: int = DEFAULT_TEAM_LIMIT,
    ) -> TeamHtftProfile:
        """
        Compute HT/FT profile for a team from their recent matches.

        Args:
            team_id: Team ID
            is_home: True = only home matches, False = only away matches
            before_date: Only use matches before this timestamp (ms UTC).
                A falsy value (None or 0) disables the date filter.
            limit: Number of recent matches to consider

        Returns:
            The (possibly cached) profile. With no matching rows every
            counter stays at zero.
        """
        # `limit` is part of the key: a different window is a different profile.
        cache_key = (team_id, is_home, before_date, limit)
        cached = self._team_cache.get(cache_key)
        if cached is not None:
            return cached

        # Columns are selected as (mine, opponent) so the aggregation loop
        # below is identical for home and away sides.
        if is_home:
            query = """
                SELECT ht_score_home, ht_score_away, score_home, score_away
                FROM matches
                WHERE home_team_id = %s
            """
        else:
            query = """
                SELECT ht_score_away, ht_score_home, score_away, score_home
                FROM matches
                WHERE away_team_id = %s
            """
        # Full-time scores are filtered as well: the arithmetic below
        # cannot handle NULL values.
        query += """
                AND sport = 'football'
                AND status = 'FT'
                AND ht_score_home IS NOT NULL
                AND ht_score_away IS NOT NULL
                AND score_home IS NOT NULL
                AND score_away IS NOT NULL
        """
        params = [team_id]
        if before_date:
            query += " AND mst_utc < %s"
            params.append(before_date)
        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(limit)

        rows = self._fetch_rows(query, params)

        profile = TeamHtftProfile()
        profile.matches = len(rows)
        for ht_mine, ht_opp, ft_mine, ft_opp in rows:
            # 1st half scoring
            if ht_mine > 0:
                profile.ht_scored += 1
            if ht_opp > 0:
                profile.ht_conceded += 1

            # HT situation
            if ht_mine > ht_opp:
                profile.ht_leading += 1
            elif ht_mine < ht_opp:
                profile.ht_trailing += 1
                # Comeback: trailing at HT but won at full time
                if ft_mine > ft_opp:
                    profile.comeback_wins += 1

            # Goal distribution (2nd half = full time minus half time)
            profile.goals_1h += ht_mine
            profile.goals_2h += ft_mine - ht_mine
            profile.conceded_1h += ht_opp
            profile.conceded_2h += ft_opp - ht_opp

        self._team_cache[cache_key] = profile
        return profile

    def _get_league_htft_profile(
        self,
        league_id: str,
        before_date: Optional[int] = None,
        limit: int = DEFAULT_LEAGUE_LIMIT,
    ) -> LeagueHtftProfile:
        """
        Compute HT/FT profile for a league from its recent matches.

        Args:
            league_id: League ID
            before_date: Only use matches before this timestamp (ms UTC).
                A falsy value (None or 0) disables the date filter.
            limit: Number of recent matches to consider

        Returns:
            The (possibly cached) profile. With no matching rows every
            counter stays at zero.
        """
        cache_key = (league_id, before_date, limit)
        cached = self._league_cache.get(cache_key)
        if cached is not None:
            return cached

        query = """
            SELECT ht_score_home, ht_score_away, score_home, score_away
            FROM matches
            WHERE league_id = %s
                AND sport = 'football'
                AND status = 'FT'
                AND ht_score_home IS NOT NULL
                AND ht_score_away IS NOT NULL
                AND score_home IS NOT NULL
                AND score_away IS NOT NULL
        """
        params = [league_id]
        if before_date:
            query += " AND mst_utc < %s"
            params.append(before_date)
        query += " ORDER BY mst_utc DESC LIMIT %s"
        params.append(limit)

        rows = self._fetch_rows(query, params)

        profile = LeagueHtftProfile()
        profile.matches = len(rows)
        for hth, hta, sh, sa in rows:
            profile.ht_goals_total += hth + hta
            profile.ft_goals_total += sh + sa

            # Classify HT/FT: "1" = home ahead, "2" = away ahead, "X" = level
            ht = "1" if hth > hta else ("2" if hth < hta else "X")
            ft = "1" if sh > sa else ("2" if sh < sa else "X")
            htft = f"{ht}/{ft}"
            profile.htft_counts[htft] = profile.htft_counts.get(htft, 0) + 1

            # A reversal is a half-time leader losing the match.
            if htft in ("1/2", "2/1"):
                profile.reversals += 1

        self._league_cache[cache_key] = profile
        return profile

    def get_features(
        self,
        home_team_id: str,
        away_team_id: str,
        league_id: Optional[str] = None,
        before_date: Optional[int] = None,
    ) -> Dict[str, float]:
        """
        Get HT/FT tendency features for a match.

        Args:
            home_team_id: Home team ID (profiled on its home matches)
            away_team_id: Away team ID (profiled on its away matches)
            league_id: League ID; when falsy the league features use the
                defaults of an empty league profile
            before_date: Only use matches before this timestamp (ms UTC)

        Returns:
            Dict of feature name -> value: six tendencies per team,
            three league-level values and two sample-size indicators.
        """
        # Team profiles (home side for home team, away side for away team)
        home_prof = self._get_team_htft_profile(
            home_team_id, is_home=True, before_date=before_date
        )
        away_prof = self._get_team_htft_profile(
            away_team_id, is_home=False, before_date=before_date
        )

        # League profile
        league_prof = LeagueHtftProfile()
        if league_id:
            league_prof = self._get_league_htft_profile(
                league_id, before_date=before_date
            )

        # The profiles above use the default team window, so normalise
        # the sample sizes by that same window.
        full_sample = float(self.DEFAULT_TEAM_LIMIT)

        features = {
            # Home team HT/FT tendencies
            "htft_home_ht_scoring_rate": home_prof.ht_scoring_rate,
            "htft_home_ht_concede_rate": home_prof.ht_concede_rate,
            "htft_home_ht_win_rate": home_prof.ht_win_rate,
            "htft_home_comeback_rate": home_prof.comeback_rate,
            "htft_home_first_half_goal_pct": home_prof.first_half_goal_pct,
            # Surge ratio is capped at 3.0
            "htft_home_second_half_surge": min(home_prof.second_half_surge, 3.0),
            # Away team HT/FT tendencies
            "htft_away_ht_scoring_rate": away_prof.ht_scoring_rate,
            "htft_away_ht_concede_rate": away_prof.ht_concede_rate,
            "htft_away_ht_win_rate": away_prof.ht_win_rate,
            "htft_away_comeback_rate": away_prof.comeback_rate,
            "htft_away_first_half_goal_pct": away_prof.first_half_goal_pct,
            "htft_away_second_half_surge": min(away_prof.second_half_surge, 3.0),
            # League-level
            "htft_league_avg_ht_goals": league_prof.avg_ht_goals,
            "htft_league_reversal_rate": league_prof.reversal_rate,
            "htft_league_first_half_pct": league_prof.first_half_pct,
            # Data quality (how many matches we have for these features)
            "htft_home_sample_size": min(home_prof.matches / full_sample, 1.0),
            "htft_away_sample_size": min(away_prof.matches / full_sample, 1.0),
        }
        return features

    def clear_cache(self):
        """Clear internal caches (useful between batches)."""
        self._team_cache.clear()
        self._league_cache.clear()
# Singleton: one engine instance per process, created on first use.
_engine = None


def get_htft_tendency_engine() -> HtftTendencyEngine:
    """Return the module-level engine, constructing it on the first call."""
    global _engine
    if _engine is not None:
        return _engine
    _engine = HtftTendencyEngine()
    return _engine
# ── Test ─────────────────────────────────────────────────────────────────────
# Manual smoke test: prints the features for the three most recent
# finished football matches.
if __name__ == "__main__":
    engine = get_htft_tendency_engine()
    conn = engine.get_conn()
    try:
        # The context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT home_team_id, away_team_id, league_id, mst_utc, match_name
                FROM matches
                WHERE sport = 'football' AND status = 'FT'
                AND home_team_id IS NOT NULL AND away_team_id IS NOT NULL
                ORDER BY mst_utc DESC LIMIT 3
            """)
            matches = cur.fetchall()

        for hid, aid, lid, mst, name in matches:
            print(f"\n🏟️ {name}")
            # Passing the match's own timestamp as before_date restricts
            # the features to earlier matches.
            features = engine.get_features(hid, aid, lid, mst)
            for k, v in sorted(features.items()):
                print(f" {k}: {v:.4f}")
    finally:
        # Close whichever connection the engine currently holds so the
        # script does not exit with an open session.
        if engine.conn is not None and not engine.conn.closed:
            engine.conn.close()