# iddaai-be/ai-engine/services/feature_enrichment.py

"""
Feature Enrichment Service
===========================
Computes real statistical features from DB for V25 model input.

Replaces hardcoded defaults in `_build_v25_features()` with rolling
averages from football_team_stats, matches, match_officials, and
match_player_events tables.

Each method receives a psycopg2 cursor + params and returns a dict.
All methods are fail-safe: they return sensible defaults when data
is missing or queries fail.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, Optional, Tuple

from psycopg2.extras import RealDictCursor


class FeatureEnrichmentService:
    """Stateless service — all state comes from DB via cursor.

    Each ``compute_*`` method runs its query through the cursor it is
    given. The ``_DEFAULT_*`` tables below are the fallback values those
    methods hand back (as fresh ``dict`` copies) when the identifier
    argument is empty, the query raises, or no rows match.
    """

    # ─── Default fallback values ─────────────────────────────────────
    # Fallback for compute_team_stats (per-match averages for one team).
    _DEFAULT_TEAM_STATS = {
        'avg_possession': 50.0,
        'avg_shots_on_target': 4.0,
        'shot_conversion': 0.1,
        'avg_corners': 5.0,
    }
    # Fallback for compute_h2h; total_matches == 0 marks "no shared history".
    _DEFAULT_H2H = {
        'total_matches': 0,
        'home_win_rate': 0.33,
        'draw_rate': 0.33,
        'avg_goals': 2.5,
        'btts_rate': 0.5,
        'over25_rate': 0.5,
    }
    # Fallback for compute_form_streaks.
    _DEFAULT_FORM = {
        'clean_sheet_rate': 0.2,
        'scoring_rate': 0.8,
        'winning_streak': 0,
        'unbeaten_streak': 0,
    }
    # Fallback for compute_referee_stats; experience == 0 means no matches found.
    _DEFAULT_REFEREE = {
        'home_bias': 0.0,
        'avg_goals': 2.5,
        'cards_total': 4.0,
        'avg_yellow': 3.0,
        'experience': 0,
    }
    # Fallback for compute_league_averages.
    _DEFAULT_LEAGUE = {
        'avg_goals': 2.7,
        'zero_goal_rate': 0.07,
    }

    # ─── 1. Team Stats ──────────────────────────────────────────────

    def compute_team_stats(
        self,
        cur: RealDictCursor,
        team_id: str,
        before_date_ms: int,
        limit: int = 10,
    ) -> Dict[str, float]:
        """
        Rolling averages from football_team_stats for a team's last N matches.

        Returns avg_possession, avg_shots_on_target, shot_conversion, avg_corners.
        """
        if not team_id:
            return dict(self._DEFAULT_TEAM_STATS)
        try:
            cur.execute(
                """
                SELECT
                    mts.possession_percentage,
                    mts.shots_on_target,
                    mts.total_shots,
                    mts.corners
                FROM football_team_stats mts
                JOIN matches m ON m.id = mts.match_id
                WHERE mts.team_id = %s
                  AND m.status = 'FT'
                  AND m.mst_utc < %s
                  AND m.sport = 'football'
                  AND mts.possession_percentage IS NOT NULL
                  AND mts.possession_percentage > 0
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (team_id, before_date_ms, limit),
            )
            rows = cur.fetchall()
        except Exception:
            return dict(self._DEFAULT_TEAM_STATS)

        if not rows:
            return dict(self._DEFAULT_TEAM_STATS)

        possession_vals = []
        sot_vals = []
        conversion_vals = []
        corner_vals = []

        for row in rows:
            poss = row.get('possession_percentage')
            if poss is not None:
                possession_vals.append(float(poss))

            sot = row.get('shots_on_target')
            if sot is not None:
                sot_vals.append(float(sot))

            total_shots = row.get('total_shots')
            if total_shots and sot and float(total_shots) > 0:
                conversion_vals.append(float(sot) / float(total_shots))

            corners = row.get('corners')
            if corners is not None:
                corner_vals.append(float(corners))

        return {
            'avg_possession': _safe_avg(possession_vals, 50.0),
            'avg_shots_on_target': _safe_avg(sot_vals, 4.0),
            'shot_conversion': _safe_avg(conversion_vals, 0.1),
            'avg_corners': _safe_avg(corner_vals, 5.0),
        }

    # ─── 2. Head-to-Head ────────────────────────────────────────────

    def compute_h2h(
        self,
        cur: RealDictCursor,
        home_team_id: str,
        away_team_id: str,
        before_date_ms: int,
        limit: int = 20,
    ) -> Dict[str, float]:
        """
        Historical head-to-head between two teams (both directions).

        Returns total_matches, home_win_rate, draw_rate, avg_goals,
        btts_rate, over25_rate.
        """
        if not home_team_id or not away_team_id:
            return dict(self._DEFAULT_H2H)
        try:
            cur.execute(
                """
                SELECT
                    m.home_team_id,
                    m.away_team_id,
                    m.score_home,
                    m.score_away
                FROM matches m
                WHERE m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc < %s
                  AND (
                      (m.home_team_id = %s AND m.away_team_id = %s) OR
                      (m.home_team_id = %s AND m.away_team_id = %s)
                  )
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (
                    before_date_ms,
                    home_team_id, away_team_id,
                    away_team_id, home_team_id,
                    limit,
                ),
            )
            rows = cur.fetchall()
        except Exception:
            return dict(self._DEFAULT_H2H)

        if not rows:
            return dict(self._DEFAULT_H2H)

        total = len(rows)
        home_wins = 0
        draws = 0
        total_goals = 0
        btts_count = 0
        over25_count = 0

        for row in rows:
            sh = int(row['score_home'])
            sa = int(row['score_away'])
            match_goals = sh + sa
            total_goals += match_goals

            # Normalise: who is "home team" in THIS prediction context
            if str(row['home_team_id']) == home_team_id:
                if sh > sa:
                    home_wins += 1
                elif sh == sa:
                    draws += 1
            else:
                # Reversed fixture: away_team was at home
                if sa > sh:
                    home_wins += 1
                elif sh == sa:
                    draws += 1

            if sh > 0 and sa > 0:
                btts_count += 1
            if match_goals > 2:
                over25_count += 1

        return {
            'total_matches': total,
            'home_win_rate': home_wins / total,
            'draw_rate': draws / total,
            'avg_goals': total_goals / total,
            'btts_rate': btts_count / total,
            'over25_rate': over25_count / total,
        }

    # ─── 3. Form & Streaks ──────────────────────────────────────────

    def compute_form_streaks(
        self,
        cur: RealDictCursor,
        team_id: str,
        before_date_ms: int,
        limit: int = 10,
    ) -> Dict[str, float]:
        """
        Clean sheet rate, scoring rate, and current streaks.
        """
        if not team_id:
            return dict(self._DEFAULT_FORM)
        try:
            cur.execute(
                """
                SELECT
                    m.home_team_id,
                    m.away_team_id,
                    m.score_home,
                    m.score_away
                FROM matches m
                WHERE (m.home_team_id = %s OR m.away_team_id = %s)
                  AND m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc < %s
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (team_id, team_id, before_date_ms, limit),
            )
            rows = cur.fetchall()
        except Exception:
            return dict(self._DEFAULT_FORM)

        if not rows:
            return dict(self._DEFAULT_FORM)

        total = len(rows)
        clean_sheets = 0
        scored_count = 0
        winning_streak = 0
        unbeaten_streak = 0
        streak_broken_w = False
        streak_broken_u = False

        for row in rows:
            is_home = str(row['home_team_id']) == team_id
            goals_for = int(row['score_home'] if is_home else row['score_away'])
            goals_against = int(row['score_away'] if is_home else row['score_home'])

            if goals_against == 0:
                clean_sheets += 1
            if goals_for > 0:
                scored_count += 1

            # Streak counting (most recent first)
            won = goals_for > goals_against
            not_lost = goals_for >= goals_against

            if not streak_broken_w:
                if won:
                    winning_streak += 1
                else:
                    streak_broken_w = True

            if not streak_broken_u:
                if not_lost:
                    unbeaten_streak += 1
                else:
                    streak_broken_u = True

        return {
            'clean_sheet_rate': clean_sheets / total,
            'scoring_rate': scored_count / total,
            'winning_streak': winning_streak,
            'unbeaten_streak': unbeaten_streak,
        }

    # ─── 4. Referee Stats ───────────────────────────────────────────

    def compute_referee_stats(
        self,
        cur: RealDictCursor,
        referee_name: Optional[str],
        before_date_ms: int,
        limit: int = 30,
    ) -> Dict[str, float]:
        """
        Referee tendencies: home win bias, avg goals, card rates.
        Matches referee by name in match_officials (role_id=1 = Orta Hakem).
        """
        if not referee_name:
            return dict(self._DEFAULT_REFEREE)
        try:
            # Get match IDs officiated by this referee
            cur.execute(
                """
                SELECT
                    m.home_team_id,
                    m.score_home,
                    m.score_away,
                    m.id AS match_id
                FROM match_officials mo
                JOIN matches m ON m.id = mo.match_id
                WHERE mo.name = %s
                  AND mo.role_id = 1
                  AND m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc < %s
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (referee_name, before_date_ms, limit),
            )
            rows = cur.fetchall()
        except Exception:
            return dict(self._DEFAULT_REFEREE)

        if not rows:
            return dict(self._DEFAULT_REFEREE)

        total = len(rows)
        home_wins = 0
        total_goals = 0
        match_ids = []

        for row in rows:
            sh = int(row['score_home'])
            sa = int(row['score_away'])
            total_goals += sh + sa
            if sh > sa:
                home_wins += 1
            match_ids.append(row['match_id'])

        # Card stats from match_player_events
        total_yellows = 0.0
        total_cards = 0.0
        if match_ids:
            try:
                cur.execute(
                    """
                    SELECT
                        COUNT(*) FILTER (WHERE event_subtype = 'yc') AS yellows,
                        COUNT(*) AS total_cards
                    FROM match_player_events
                    WHERE match_id = ANY(%s)
                      AND event_type = 'card'
                    """,
                    (match_ids,),
                )
                card_row = cur.fetchone()
                if card_row:
                    total_yellows = float(card_row.get('yellows') or 0)
                    total_cards = float(card_row.get('total_cards') or 0)
            except Exception:
                pass

        # home_bias: (actual home win rate) - 0.46 (league average ~46%)
        home_bias = (home_wins / total) - 0.46

        return {
            'home_bias': round(home_bias, 4),
            'avg_goals': total_goals / total,
            'cards_total': total_cards / total if total > 0 else 4.0,
            'avg_yellow': total_yellows / total if total > 0 else 3.0,
            'experience': total,
        }

    # ─── 5. League Averages ─────────────────────────────────────────

    def compute_league_averages(
        self,
        cur: RealDictCursor,
        league_id: Optional[str],
        before_date_ms: int,
        limit: int = 100,
    ) -> Dict[str, float]:
        """
        League-wide scoring tendencies.
        """
        if not league_id:
            return dict(self._DEFAULT_LEAGUE)
        try:
            cur.execute(
                """
                SELECT
                    m.score_home,
                    m.score_away
                FROM matches m
                WHERE m.league_id = %s
                  AND m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc < %s
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (league_id, before_date_ms, limit),
            )
            rows = cur.fetchall()
        except Exception:
            return dict(self._DEFAULT_LEAGUE)

        if not rows:
            return dict(self._DEFAULT_LEAGUE)

        total = len(rows)
        total_goals = 0
        zero_goal_matches = 0

        for row in rows:
            sh = int(row['score_home'])
            sa = int(row['score_away'])
            match_goals = sh + sa
            total_goals += match_goals
            if match_goals == 0:
                zero_goal_matches += 1

        return {
            'avg_goals': total_goals / total,
            'zero_goal_rate': zero_goal_matches / total,
        }

    # ─── 6. Momentum ───────────────────────────────────────────────

    def compute_momentum(
        self,
        cur: RealDictCursor,
        team_id: str,
        before_date_ms: int,
        limit: int = 5,
    ) -> float:
        """
        Recency-weighted momentum score: W=3, D=1, L=-1.
        Returns normalised score in [-1.0, 1.0].
        """
        if not team_id:
            return 0.0
        try:
            cur.execute(
                """
                SELECT
                    m.home_team_id,
                    m.score_home,
                    m.score_away
                FROM matches m
                WHERE (m.home_team_id = %s OR m.away_team_id = %s)
                  AND m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc < %s
                ORDER BY m.mst_utc DESC
                LIMIT %s
                """,
                (team_id, team_id, before_date_ms, limit),
            )
            rows = cur.fetchall()
        except Exception:
            return 0.0

        if not rows:
            return 0.0

        total_count = len(rows)
        weighted_score = 0.0
        max_possible = 0.0

        for idx, row in enumerate(rows):
            weight = float(total_count - idx)  # most recent = highest weight
            is_home = str(row['home_team_id']) == team_id
            gf = int(row['score_home'] if is_home else row['score_away'])
            ga = int(row['score_away'] if is_home else row['score_home'])

            if gf > ga:
                result_score = 3.0
            elif gf == ga:
                result_score = 1.0
            else:
                result_score = -1.0

            weighted_score += result_score * weight
            max_possible += 3.0 * weight  # max = all wins

        if max_possible <= 0:
            return 0.0

        # Normalise to [-1.0, 1.0]
        return round(weighted_score / max_possible, 4)


# ─── Utility ────────────────────────────────────────────────────────

def _safe_avg(values: list, default: float) -> float:
    """Return the arithmetic mean of ``values``, or ``default`` when it is empty."""
    return sum(values) / len(values) if values else default