iddaai-be/ai-engine/scripts/enrich_ai_features.py

#!/usr/bin/env python3
"""
AI Features Full Enrichment Script
====================================
Fills empty/default columns in football_ai_features that were not populated
by the original elo_backfill_v1 script.

Enriches: H2H, referee, team_stats, league_averages, form_streaks,
          rolling_goals, implied_odds, and clean_sheet/scoring rates.

Usage:
    python scripts/enrich_ai_features.py                          # enrich all
    python scripts/enrich_ai_features.py --batch-size 500         # smaller batches
    python scripts/enrich_ai_features.py --dry-run                # preview only
    python scripts/enrich_ai_features.py --force                  # re-enrich all rows
    python scripts/enrich_ai_features.py --limit 1000             # process N rows max

Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
"""

from __future__ import annotations

import os
import sys
import time
import argparse
from typing import Any, Dict, List, Optional, Tuple

# Add ai-engine root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import psycopg2
from psycopg2.extras import RealDictCursor, execute_values

from data.db import get_clean_dsn
from services.feature_enrichment import FeatureEnrichmentService

# ────────────────────────── constants ──────────────────────────

# Version tag written to the calculator_ver column of every row this script
# upserts; also shown in the start-up banner.
CALCULATOR_VER = 'enrichment_v2.0'
# Default for --batch-size: number of enriched rows buffered before an upsert.
DEFAULT_BATCH_SIZE = 200


# ────────────────────────── helpers ────────────────────────────

def fetch_unenriched_matches(
    conn: psycopg2.extensions.connection,
    force: bool = False,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch finished football matches whose feature rows still need enrichment.

    Only matches with status 'FT', a non-NULL home score and sport 'football'
    are ever returned. Unless ``force`` is set, the result is further
    restricted to rows whose enrichment columns still hold their defaults
    (h2h_total = 0 AND referee_avg_cards = 0).

    Args:
        conn: Open database connection used to run the query.
        force: When True, skip the "still default" filter and return every
            finished football match that has a football_ai_features row.
        limit: Maximum number of rows to return. ``None`` (or 0) means
            no limit.

    Returns:
        Dict-style rows with match_id, home_team_id, away_team_id, mst_utc,
        league_id, score_home and score_away, ordered by mst_utc ascending.
    """
    conditions = [
        "m.status = 'FT'",
        "m.score_home IS NOT NULL",
        "m.sport = 'football'",
    ]
    if not force:
        conditions.append(
            "(faf.h2h_total = 0 AND faf.referee_avg_cards = 0)"
        )
    where_sql = "\n              AND ".join(conditions)

    # Only static, code-defined fragments are formatted into the statement;
    # the caller-supplied LIMIT is bound as a query parameter below.
    query = f"""
            SELECT
                faf.match_id,
                m.home_team_id,
                m.away_team_id,
                m.mst_utc,
                m.league_id,
                m.score_home,
                m.score_away
            FROM football_ai_features faf
            JOIN matches m ON m.id = faf.match_id
            WHERE {where_sql}
            ORDER BY m.mst_utc ASC
    """

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        if limit:
            cur.execute(query + "\n            LIMIT %s", (int(limit),))
        else:
            cur.execute(query)
        return cur.fetchall()


def fetch_referee_for_match(
    cur: RealDictCursor,
    match_id: str,
) -> Optional[str]:
    """Get the head referee name for a match from match_officials.

    The lookup is best-effort: any failure yields ``None`` instead of
    raising, so a missing referee never blocks enrichment of a match.

    Args:
        cur: Dict-style cursor (rows are indexed by column name).
        match_id: Identifier of the match to look up.

    Returns:
        The ``name`` of the first official with ``role_id = 1`` for the
        match, or ``None`` when there is no such row or the lookup failed.
    """
    try:
        cur.execute("""
            SELECT mo.name
            FROM match_officials mo
            WHERE mo.match_id = %s
              AND mo.role_id = 1
            LIMIT 1
        """, (match_id,))
        row = cur.fetchone()
        return row['name'] if row else None
    except Exception as exc:
        # A failed statement leaves the PostgreSQL transaction aborted, and
        # every later query on this connection would then fail as well.
        # Roll back so that swallowing the error really is harmless.
        if isinstance(exc, psycopg2.Error):
            try:
                cur.connection.rollback()
            except psycopg2.Error:
                # Connection is unusable; the caller's next query surfaces it.
                pass
        return None


def _fold_tr(text: Optional[str]) -> str:
    """Return *text* lower-cased and stripped, using Turkish I/ı rules.

    Plain ``str.lower()`` maps 'I' to 'i', so an upper-case label such as
    "KARŞILIKLI GOL" would never equal "karşılıklı gol". Mapping the
    dotted/dotless pair first makes the comparison case-insensitive.
    """
    return (text or '').replace('İ', 'i').replace('I', 'ı').lower().strip()


def fetch_implied_odds(
    cur: RealDictCursor,
    match_id: str,
) -> Dict[str, float]:
    """Get implied probabilities from odd_categories + odd_selections.

    Decimal odds are read for three markets (match result, over/under 2.5,
    both teams to score) and converted to probabilities normalised by the
    market's summed inverse odds. A market is only used when every one of
    its selections has odds greater than 1.0; otherwise its default stays.

    Args:
        cur: Dict-style cursor (rows must support ``.get``).
        match_id: Identifier of the match whose odds are read.

    Returns:
        Dict with implied_home, implied_draw, implied_away, implied_over25,
        implied_btts_yes (each rounded to 4 decimals) and odds_overround
        (summed inverse 1X2 odds minus 1). Defaults of 0.33 / 0.50 / 0.0 are
        returned for markets that are missing, or for everything when the
        query fails.
    """
    implied = {
        'implied_home': 0.33,
        'implied_draw': 0.33,
        'implied_away': 0.33,
        'implied_over25': 0.50,
        'implied_btts_yes': 0.50,
        'odds_overround': 0.0,
    }
    try:
        cur.execute("""
            SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            WHERE oc.match_id = %s
        """, (match_id,))
        rows = cur.fetchall()
    except Exception as exc:
        # A failed statement leaves the PostgreSQL transaction aborted, and
        # every later query on this connection would then fail as well.
        # Roll back so that falling back to the defaults is really harmless.
        if isinstance(exc, psycopg2.Error):
            try:
                cur.connection.rollback()
            except psycopg2.Error:
                # Connection is unusable; the caller's next query surfaces it.
                pass
        return implied

    prices: Dict[str, float] = {}
    for row in rows:
        try:
            category = _fold_tr(row.get('cat_name'))
            selection = _fold_tr(row.get('sel_name'))
            price = float(row.get('odd_value', 0))
        except (ValueError, TypeError):
            # Non-numeric or NULL odds: ignore this selection.
            continue
        if price <= 0:
            continue

        if category == 'maç sonucu':
            if selection == '1':
                prices['ms_h'] = price
            elif selection in ('0', 'x'):  # draw is labelled "0" or "X"
                prices['ms_d'] = price
            elif selection == '2':
                prices['ms_a'] = price
        elif category == '2,5 alt/üst':
            if 'üst' in selection:
                prices['ou25_o'] = price
            elif 'alt' in selection:
                prices['ou25_u'] = price
        elif category == 'karşılıklı gol':
            if 'var' in selection:
                prices['btts_y'] = price
            elif 'yok' in selection:
                prices['btts_n'] = price

    # Match result (1X2): normalise the inverse odds and record the margin.
    ms_h = prices.get('ms_h', 0)
    ms_d = prices.get('ms_d', 0)
    ms_a = prices.get('ms_a', 0)
    if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0:
        raw_sum = 1 / ms_h + 1 / ms_d + 1 / ms_a
        implied['implied_home'] = round((1 / ms_h) / raw_sum, 4)
        implied['implied_draw'] = round((1 / ms_d) / raw_sum, 4)
        implied['implied_away'] = round((1 / ms_a) / raw_sum, 4)
        implied['odds_overround'] = round(raw_sum - 1.0, 4)

    # Over/under 2.5 goals: probability of the "over" side.
    ou25_o = prices.get('ou25_o', 0)
    ou25_u = prices.get('ou25_u', 0)
    if ou25_o > 1.0 and ou25_u > 1.0:
        raw_sum = 1 / ou25_o + 1 / ou25_u
        implied['implied_over25'] = round((1 / ou25_o) / raw_sum, 4)

    # Both teams to score: probability of the "yes" side.
    btts_y = prices.get('btts_y', 0)
    btts_n = prices.get('btts_n', 0)
    if btts_y > 1.0 and btts_n > 1.0:
        raw_sum = 1 / btts_y + 1 / btts_n
        implied['implied_btts_yes'] = round((1 / btts_y) / raw_sum, 4)

    return implied


def enrich_single_match(
    enrichment: FeatureEnrichmentService,
    cur: RealDictCursor,
    match: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build the full enrichment payload for one match.

    Runs every feature computation (team stats, H2H, form, referee, league
    averages, rolling goals, implied odds) for the match's kickoff time and
    collects the rounded results into one dict keyed by column name.

    Args:
        enrichment: Service providing the ``compute_*`` feature methods.
        cur: Cursor handed to every lookup performed for this match.
        match: Row with match_id, home_team_id, away_team_id, mst_utc and
            league_id.

    Returns:
        A dict ready for the DB upsert, including ``match_id`` and
        ``calculator_ver``.
    """
    def _dp2(value):
        # Two decimals: averages and counts-per-match.
        return round(value, 2)

    def _dp4(value):
        # Four decimals: rates and probabilities.
        return round(value, 4)

    mid = match['match_id']
    home_team = str(match['home_team_id'])
    away_team = str(match['away_team_id'])
    # A missing/zero kickoff timestamp is passed on as 0.
    kickoff = int(match['mst_utc']) if match['mst_utc'] else 0
    league_key = str(match['league_id']) if match['league_id'] else None

    # Gather all inputs first, in a fixed order, before assembling the row.
    stats_h = enrichment.compute_team_stats(cur, home_team, kickoff)
    stats_a = enrichment.compute_team_stats(cur, away_team, kickoff)
    head_to_head = enrichment.compute_h2h(cur, home_team, away_team, kickoff)
    form_h = enrichment.compute_form_streaks(cur, home_team, kickoff)
    form_a = enrichment.compute_form_streaks(cur, away_team, kickoff)
    ref_name = fetch_referee_for_match(cur, mid)
    ref_stats = enrichment.compute_referee_stats(cur, ref_name, kickoff)
    league_stats = enrichment.compute_league_averages(cur, league_key, kickoff)
    rolling_h = enrichment.compute_rolling_stats(cur, home_team, kickoff)
    rolling_a = enrichment.compute_rolling_stats(cur, away_team, kickoff)
    odds = fetch_implied_odds(cur, mid)

    features: Dict[str, Any] = {'match_id': mid}

    # Team stats
    features.update({
        'home_avg_possession': _dp2(stats_h['avg_possession']),
        'away_avg_possession': _dp2(stats_a['avg_possession']),
        'home_avg_shots_on_target': _dp2(stats_h['avg_shots_on_target']),
        'away_avg_shots_on_target': _dp2(stats_a['avg_shots_on_target']),
        'home_shot_conversion': _dp4(stats_h['shot_conversion']),
        'away_shot_conversion': _dp4(stats_a['shot_conversion']),
        'home_avg_corners': _dp2(stats_h['avg_corners']),
        'away_avg_corners': _dp2(stats_a['avg_corners']),
    })

    # Head-to-head
    features.update({
        'h2h_total': head_to_head['total_matches'],
        'h2h_home_win_rate': _dp4(head_to_head['home_win_rate']),
        'h2h_avg_goals': _dp2(head_to_head['avg_goals']),
        'h2h_over25_rate': _dp4(head_to_head['over25_rate']),
        'h2h_btts_rate': _dp4(head_to_head['btts_rate']),
    })

    # Form and streaks
    features.update({
        'home_clean_sheet_rate': _dp4(form_h['clean_sheet_rate']),
        'away_clean_sheet_rate': _dp4(form_a['clean_sheet_rate']),
        'home_scoring_rate': _dp4(form_h['scoring_rate']),
        'away_scoring_rate': _dp4(form_a['scoring_rate']),
        'home_win_streak': form_h['winning_streak'],
        'away_win_streak': form_a['winning_streak'],
    })

    # Rolling goal averages
    features.update({
        'home_goals_avg_5': _dp2(rolling_h['rolling5_goals']),
        'away_goals_avg_5': _dp2(rolling_a['rolling5_goals']),
        'home_conceded_avg_5': _dp2(rolling_h['rolling5_conceded']),
        'away_conceded_avg_5': _dp2(rolling_a['rolling5_conceded']),
    })

    # Referee
    features.update({
        'referee_avg_cards': _dp2(ref_stats['cards_total']),
        'referee_home_bias': _dp4(ref_stats['home_bias']),
        'referee_avg_goals': _dp2(ref_stats['avg_goals']),
    })

    # League averages
    features.update({
        'league_avg_goals': _dp2(league_stats['avg_goals']),
        'league_home_win_pct': _dp4(league_stats['home_win_rate']),
        'league_over25_pct': _dp4(league_stats['ou25_rate']),
    })

    # Implied odds are already rounded by fetch_implied_odds; copy as-is.
    for odds_key in (
        'implied_home',
        'implied_draw',
        'implied_away',
        'implied_over25',
        'implied_btts_yes',
        'odds_overround',
    ):
        features[odds_key] = odds[odds_key]

    # No lineup data for historical matches, so the impact stays at default.
    features['missing_players_impact'] = 0.0
    features['calculator_ver'] = CALCULATOR_VER
    return features


def flush_enrichment_batch(
    conn: psycopg2.extensions.connection,
    rows: List[Dict[str, Any]],
    dry_run: bool,
) -> int:
    """Upsert a batch of enriched feature rows and commit.

    Args:
        conn: Connection used for the upsert; committed on success.
        rows: Feature dicts, each containing every column written here.
        dry_run: When True nothing is written.

    Returns:
        Number of rows sent to the database; 0 for an empty batch or a
        dry run.
    """
    if not rows:
        return 0
    if dry_run:
        return 0

    # Every column except the conflict key is overwritten on conflict.
    feature_columns = (
        'home_avg_possession', 'away_avg_possession',
        'home_avg_shots_on_target', 'away_avg_shots_on_target',
        'home_shot_conversion', 'away_shot_conversion',
        'home_avg_corners', 'away_avg_corners',
        'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
        'h2h_over25_rate', 'h2h_btts_rate',
        'home_clean_sheet_rate', 'away_clean_sheet_rate',
        'home_scoring_rate', 'away_scoring_rate',
        'home_win_streak', 'away_win_streak',
        'home_goals_avg_5', 'away_goals_avg_5',
        'home_conceded_avg_5', 'away_conceded_avg_5',
        'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
        'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
        'implied_home', 'implied_draw', 'implied_away',
        'implied_over25', 'implied_btts_yes', 'odds_overround',
        'missing_players_impact', 'calculator_ver',
    )
    all_columns = ('match_id',) + feature_columns

    assignments = ', '.join(
        f'{name} = EXCLUDED.{name}' for name in feature_columns
    )
    row_template = '(' + ', '.join('%s' for _ in all_columns) + ')'
    payload = [
        tuple(record[name] for name in all_columns)
        for record in rows
    ]
    upsert_sql = f"""
            INSERT INTO football_ai_features ({', '.join(all_columns)})
            VALUES %s
            ON CONFLICT (match_id) DO UPDATE SET
                {assignments},
                updated_at = NOW()
            """

    with conn.cursor() as cur:
        execute_values(
            cur,
            upsert_sql,
            payload,
            template=row_template,
            page_size=200,
        )
    conn.commit()
    return len(rows)


# ────────────────────────── main ───────────────────────────────

def run_enrichment(
    batch_size: int,
    dry_run: bool,
    force: bool,
    limit: Optional[int],
) -> None:
    """Core enrichment loop.

    Connects to the database, selects the matches to enrich, computes the
    features match by match, upserts them in batches and prints progress
    plus a final summary.

    Args:
        batch_size: Number of enriched rows buffered before each upsert.
        dry_run: Compute features but skip all writes.
        force: Re-enrich every eligible row instead of only default rows.
        limit: Maximum number of matches to process (``None`` = no limit).

    A failure while enriching a single match is counted and (for the first
    ten) printed; processing then continues with the next match. Errors
    while fetching the work list or writing a batch are not caught here.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)

    # try/finally: release the connection on every exit path, including
    # unexpected exceptions and Ctrl-C.
    try:
        print(f"\n{'=' * 60}")
        print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
        print(f"   batch_size={batch_size}  dry_run={dry_run}  force={force}")
        print(f"{'=' * 60}")

        # 1. Fetch unenriched matches
        t0 = time.time()
        matches = fetch_unenriched_matches(conn, force=force, limit=limit)
        print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")

        if not matches:
            print("✅ Nothing to enrich — all rows already populated.")
            return

        # 2. Initialize enrichment service
        enrichment = FeatureEnrichmentService()

        # 3. Process in batches
        total = len(matches)
        processed = 0
        written = 0
        errors = 0
        batch_buf: List[Dict[str, Any]] = []
        t_start = time.time()

        # One dict-style cursor is shared by all enrichment queries; the
        # context manager closes it even if a batch write fails.
        with conn.cursor(cursor_factory=RealDictCursor) as enrich_cur:
            for match in matches:
                try:
                    enriched = enrich_single_match(enrichment, enrich_cur, match)
                    batch_buf.append(enriched)
                except Exception as e:
                    errors += 1
                    # A failed statement aborts the open transaction; without
                    # a rollback every following match would fail too. Batch
                    # writes commit immediately, so nothing pending is lost.
                    conn.rollback()
                    if errors <= 10:
                        print(f"   ⚠️ Error enriching {match.get('match_id', '?')}: {e}")

                processed += 1

                # Flush batch
                if len(batch_buf) >= batch_size:
                    written += flush_enrichment_batch(conn, batch_buf, dry_run)
                    batch_buf.clear()

                # Progress reporting
                if processed % 500 == 0:
                    elapsed = time.time() - t_start
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (total - processed) / rate if rate > 0 else 0
                    pct = processed / total * 100
                    print(
                        f"   [{processed:>8,} / {total:,}] "
                        f"({pct:.1f}%) | {rate:.0f} matches/s | "
                        f"ETA: {remaining / 60:.1f} min | "
                        f"errors: {errors}"
                    )

            # Flush remaining
            if batch_buf:
                written += flush_enrichment_batch(conn, batch_buf, dry_run)

        elapsed = time.time() - t_start
        # Guard against a zero interval on coarse clocks / tiny runs.
        final_rate = processed / elapsed if elapsed > 0 else 0
        print(f"\n{'=' * 60}")
        print(f"✅ Enrichment complete:")
        print(f"   Processed: {processed:,} matches in {elapsed:.1f}s")
        print(f"   Written:   {written:,} rows")
        print(f"   Errors:    {errors:,}")
        print(f"   Rate:      {final_rate:.0f} matches/s")
        print(f"{'=' * 60}")
    finally:
        conn.close()


def main() -> None:
    """Parse the command-line flags and start the enrichment run."""
    cli = argparse.ArgumentParser(
        description="Enrich football_ai_features with H2H, referee, stats, and odds data"
    )

    # (flag, add_argument keyword options), registered in display order.
    option_table = (
        ('--batch-size', {
            'type': int,
            'default': DEFAULT_BATCH_SIZE,
            'help': f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
        }),
        ('--dry-run', {
            'action': 'store_true',
            'help': 'Compute features but do not write to DB',
        }),
        ('--force', {
            'action': 'store_true',
            'help': 'Re-enrich ALL rows, not just empty ones',
        }),
        ('--limit', {
            'type': int,
            'default': None,
            'help': 'Max number of matches to process',
        }),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    opts = cli.parse_args()
    run_enrichment(
        batch_size=opts.batch_size,
        dry_run=opts.dry_run,
        force=opts.force,
        limit=opts.limit,
    )


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()