# iddaai-be/ai-engine/scripts/extract_basketball_data.py

"""
XGBoost Training Data Extraction (Basketball)
==============================================
Batch feature extraction for top-league basketball matches.
Extracts features + labels per match for XGBoost model training.

Usage:
    python3 scripts/extract_basketball_data.py
"""

import os
import sys
import json
import csv
import math
import time
from datetime import datetime
from collections import defaultdict

import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv

load_dotenv()

# =============================================================================
# CONFIG
# =============================================================================
# Directory two levels above this file (the parent of the directory holding the script).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that directory first on the import path.
# NOTE(review): presumably so sibling project modules resolve when the script is run directly — confirm.
sys.path.insert(0, AI_ENGINE_DIR)

# JSON file read at startup; its parsed content is handed to BatchDataLoader as the league filter.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json")
# Destination of the extracted training rows (overwritten on every run by process_matches).
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "basketball_training_data.csv")

# Import-time side effect: create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)


def get_conn():
    """Open a psycopg2 connection using the DATABASE_URL environment variable.

    Everything from the first ``?schema=`` onwards is cut off before the URL
    is handed to psycopg2. An unset variable yields an empty DSN string.
    NOTE(review): the ``?schema=`` suffix looks like an ORM-style URL
    parameter that libpq would not understand — confirm.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    dsn, _sep, _discarded = raw_url.partition("?schema=")
    return psycopg2.connect(dsn)


# =============================================================================
# FEATURE COLUMNS (ORDER MATTERS — matches CSV header)
# =============================================================================
# CSV header. process_matches builds every row positionally in this exact
# order and aborts if a row's length differs from this list.
FEATURE_COLS = [
    # Match identifiers (5)
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",

    # Form Features (8) — points/conceded averaged over each team's last 5
    # prior matches; streak and win rate computed over all prior matches
    "home_points_avg", "home_conceded_avg",
    "away_points_avg", "away_conceded_avg",
    "home_winning_streak", "away_winning_streak",
    "home_win_rate", "away_win_rate",

    # H2H Features (4) — prior meetings with the same home/away orientation
    "h2h_total_matches", "h2h_home_win_rate",
    "h2h_avg_points", "h2h_over140_rate",

    # Odds Features (8) — six prices plus the totals line and the spread line
    "odds_ml_h", "odds_ml_a",
    "odds_tot_o", "odds_tot_u", "odds_tot_line",
    "odds_spread_h", "odds_spread_a", "odds_spread_line",

    # Labels
    "score_home", "score_away", "total_points",
    "label_ml",          # 0=Home, 1=Away (anything other than a home win is 1)
    "label_tot",         # 0=Under, 1=Over (dynamic line; total equal to the line counts as 0)
    "label_spread",      # 0=Away Cover, 1=Home Cover (dynamic line)
]


# =============================================================================
# BATCH LOADERS — Pre-load data to avoid N+1 queries
# =============================================================================

class BatchDataLoader:
    """Pre-loads all necessary data in bulk, then serves features per match.

    After ``load_all()`` the following attributes are populated:

    * ``matches``     – rows returned by the match query, oldest first.
    * ``odds_cache``  – ``str(match_id)`` → dict with any of ``ml_h``,
      ``ml_a``, ``tot_line``, ``tot_o``, ``tot_u``, ``spread_line``,
      ``spread_h``, ``spread_a``.
    * ``form_cache``  – ``(str(team_id), int(mst_utc))`` → form features.
    * ``h2h_cache``   – ``(str(home_id), str(away_id), int(mst_utc))`` →
      head-to-head features.

    All cache keys are normalised to ``str`` ids and ``int`` timestamps so
    that lookups do not depend on the Python types the database driver
    happens to return for the id columns.
    """

    # Category names (lower-cased) that carry the money-line market.
    _ML_CATEGORIES = (
        "maç sonucu (uzt. dahil)", "mac sonucu (uzt. dahil)",
        "maç sonucu", "mac sonucu",
    )

    def __init__(self, conn, top_league_ids: list):
        """Store the connection and open a dict-returning cursor on it.

        Args:
            conn: open psycopg2 connection.
            top_league_ids: league ids to restrict the match query to; an
                empty/falsy value disables the league filter.
        """
        self.conn = conn
        self.cur = conn.cursor(cursor_factory=RealDictCursor)
        self.top_league_ids = top_league_ids

        # Pre-loaded data caches
        self.matches = []
        self.odds_cache = {}           # str(match_id) → {ml_h, ml_a, ...}
        self.form_cache = {}           # (str(team_id), int(mst_utc)) → form features
        self.h2h_cache = {}            # (str(home_id), str(away_id), int(mst_utc)) → h2h features

    def load_all(self):
        """Load all data in batch."""
        t0 = time.time()

        self._load_matches()
        print(f"  ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True)

        t_odds = time.time()
        self._load_odds()
        print(f"  ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t_odds:.1f}s)", flush=True)

        t_hist = time.time()
        self._load_team_history()
        print(f"  ✅ Team History & Stats cache built ({time.time()-t_hist:.1f}s)", flush=True)

        print(f"  📊 Total load time: {time.time()-t0:.1f}s", flush=True)

    def _load_matches(self):
        """Fetch finished basketball matches with scores, ordered by start time."""
        query = """
            SELECT
                id,
                mst_utc,
                league_id,
                home_team_id,
                away_team_id,
                score_home,
                score_away,
                status
            FROM matches
            WHERE sport = 'basketball'
              AND status = 'FT'
              AND score_home IS NOT NULL
              AND score_away IS NOT NULL
              AND mst_utc > 1640995200000 -- Since Jan 1, 2022
        """
        params = ()
        if self.top_league_ids:
            # Only placeholders are interpolated; the ids travel as parameters.
            format_strings = ",".join(["%s"] * len(self.top_league_ids))
            query += f" AND league_id IN ({format_strings})"
            params = tuple(self.top_league_ids)

        if params:
            self.cur.execute(query + " ORDER BY mst_utc ASC", params)
        else:
            self.cur.execute(query + " ORDER BY mst_utc ASC")

        self.matches = self.cur.fetchall()

    def _load_odds(self):
        """Load odd categories and their selections into ``odds_cache``.

        NOTE(review): the category query covers every finished basketball
        match, not only the league/date-filtered ones in ``self.matches``;
        narrowing it would cut load time but is left for a schema-aware change.
        """
        query = """
            SELECT match_id, name as category_name, db_id as category_id
            FROM odd_categories
            WHERE match_id IN (
                SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT'
            )
        """
        self.cur.execute(query)
        cats = self.cur.fetchall()
        if not cats:
            return

        # category id → owning match (stringified so lookups by str(id) hit)
        cat_to_match = {c['category_id']: str(c['match_id']) for c in cats}
        # A NULL category name is treated as an empty one instead of crashing.
        cat_id_to_name = {c['category_id']: c['category_name'] or "" for c in cats}

        chunk_size = 50000
        cats_list = list(cat_to_match)
        # Ceiling division: an exact multiple must not report an extra chunk.
        total_chunks = -(-len(cats_list) // chunk_size)
        print(f"    Fetching {len(cats_list)} categories in {total_chunks} chunks...", flush=True)

        for idx, start in enumerate(range(0, len(cats_list), chunk_size)):
            chunk = tuple(cats_list[start:start + chunk_size])
            self.cur.execute("SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %s", (chunk,))
            rows = self.cur.fetchall()

            for row in rows:
                c_id = row['odd_category_db_id']
                m_id = cat_to_match[c_id]
                self.odds_cache.setdefault(m_id, {})

                try:
                    odd_value = float(row['odd_value'])
                except (TypeError, ValueError):
                    # One malformed price must not abort the whole extraction.
                    continue

                self._parse_single_odd(m_id, cat_id_to_name.get(c_id, ""), str(row['name']), odd_value)
            print(f"      Processed chunk {idx+1}/{total_chunks} ({len(rows)} selections).", flush=True)

    def _parse_single_odd(self, match_id, category_name, sel_name, odd_value):
        """Fold one odd selection into ``odds_cache[match_id]``.

        The first value seen for each key wins. For totals and spreads the
        first parsed line is pinned, and selections that belong to a category
        with a different parsed line are ignored so that the stored prices
        always refer to the stored line.

        Args:
            match_id: key into ``odds_cache`` (entry is created if missing).
            category_name: market name, e.g. one containing "alt/üst".
            sel_name: selection name ("1", "2", "Üst", "Alt", ...).
            odd_value: decimal price; values not greater than 1.0 are ignored.
        """
        # `not >` instead of `<=` so that NaN is rejected as well.
        if not odd_value > 1.0:
            return
        cat_lower = (category_name or "").lower()
        sel_lower = sel_name.lower()

        target = self.odds_cache.setdefault(match_id, {})

        # ML
        if cat_lower in self._ML_CATEGORIES:
            if sel_lower == "1":
                target["ml_h"] = odd_value
            elif sel_lower == "2":
                target["ml_a"] = odd_value

        # Totals
        if "alt/üst" in cat_lower or "alt/ust" in cat_lower:
            self._apply_total(target, cat_lower, sel_lower, odd_value)

        # Spread
        if "hnd. ms" in cat_lower or "hand. ms" in cat_lower or "hnd ms" in cat_lower:
            self._apply_spread(target, cat_lower, sel_lower, odd_value)

    @staticmethod
    def _bracket_payload(cat_lower):
        """Return the text inside the first ``(...)`` with ',' → '.', or None."""
        left = cat_lower.find("(")
        right = cat_lower.find(")", left + 1)
        if left == -1 or right == -1:
            return None
        return cat_lower[left + 1:right].replace(",", ".")

    @classmethod
    def _parse_total_line(cls, cat_lower):
        """Parse the totals line from a category name; None when absent/invalid."""
        payload = cls._bracket_payload(cat_lower)
        if payload is None:
            return None
        try:
            return float(payload)
        except ValueError:
            return None

    @classmethod
    def _parse_spread_line(cls, cat_lower):
        """Parse a ``home:away`` handicap into a single home-relative line.

        ``0:x`` (x > 0) gives ``-x``, ``x:0`` (x > 0) gives ``x`` and equal
        positive handicaps give ``0.0``; anything else gives None.
        """
        payload = cls._bracket_payload(cat_lower)
        if payload is None or ":" not in payload:
            return None
        parts = payload.split(":")
        try:
            home_hcp = float(parts[0])
            away_hcp = float(parts[1])
        except ValueError:
            return None

        if abs(home_hcp) < 1e-6 and away_hcp > 0:
            return -away_hcp
        if home_hcp > 0 and abs(away_hcp) < 1e-6:
            return home_hcp
        if abs(home_hcp - away_hcp) < 1e-6 and home_hcp > 0:
            return 0.0
        return None

    def _apply_total(self, target, cat_lower, sel_lower, odd_value):
        """Record an over/under price, keeping it tied to the pinned line."""
        line = self._parse_total_line(cat_lower)
        # Truthiness (not `is not None`) is kept on purpose: a 0.0 line is
        # treated the same as "no line", as before.
        if line and target.setdefault("tot_line", line) != line:
            return  # price belongs to a different totals line

        if "üst" in sel_lower or "ust" in sel_lower or "over" in sel_lower:
            target.setdefault("tot_o", odd_value)
        elif "alt" in sel_lower or "under" in sel_lower:
            target.setdefault("tot_u", odd_value)

    def _apply_spread(self, target, cat_lower, sel_lower, odd_value):
        """Record a handicap price, keeping it tied to the pinned line."""
        line = self._parse_spread_line(cat_lower)
        if line is not None and target.setdefault("spread_line", line) != line:
            return  # price belongs to a different handicap line

        if sel_lower == "1":
            target.setdefault("spread_h", odd_value)
        elif sel_lower == "2":
            target.setdefault("spread_a", odd_value)

    @staticmethod
    def _form_features(past):
        """Summarise a team's prior matches given as (time, scored, conceded).

        Averages use the last five entries; win rate and the current winning
        streak use all of them. An empty history yields neutral defaults.
        """
        if not past:
            return {
                "points_avg": 80.0,
                "conceded_avg": 80.0,
                "winning_streak": 0,
                "win_rate": 0.5
            }

        last_5 = past[-5:]
        streak = 0
        for game in reversed(past):
            if game[1] > game[2]:
                streak += 1
            else:
                break

        return {
            "points_avg": sum(g[1] for g in last_5) / len(last_5),
            "conceded_avg": sum(g[2] for g in last_5) / len(last_5),
            "winning_streak": streak,
            "win_rate": sum(1 for g in past if g[1] > g[2]) / len(past)
        }

    @staticmethod
    def _h2h_features(past):
        """Summarise prior meetings given as (time, score_home, score_away)."""
        if not past:
            return {
                "total": 0, "home_win_rate": 0.5,
                "avg_points": 160.0, "over140_rate": 0.5
            }

        played = len(past)
        return {
            "total": played,
            "home_win_rate": sum(1 for g in past if g[1] > g[2]) / played,
            "avg_points": sum(g[1] + g[2] for g in past) / played,
            "over140_rate": sum(1 for g in past if g[1] + g[2] > 140) / played
        }

    def _load_team_history(self):
        """Build ``form_cache`` and ``h2h_cache`` from ``self.matches``.

        For every match only strictly earlier matches (smaller ``mst_utc``)
        contribute, so a match never sees its own result. Head-to-head history
        is directional: only earlier meetings with the same home and away
        team count.
        """
        team_matches = defaultdict(list)   # team → [(time, scored, conceded)]
        h2h_map = defaultdict(list)        # (home, away) → [(time, score_home, score_away)]
        for m in self.matches:
            home_id = str(m['home_team_id'])
            away_id = str(m['away_team_id'])
            kickoff = int(m['mst_utc'])
            team_matches[home_id].append((kickoff, m['score_home'], m['score_away']))
            team_matches[away_id].append((kickoff, m['score_away'], m['score_home']))
            h2h_map[(home_id, away_id)].append((kickoff, m['score_home'], m['score_away']))

        for team_id, hist in team_matches.items():
            hist.sort(key=lambda x: x[0])  # Sort by time
            for i, game in enumerate(hist):
                # The extra time filter drops same-timestamp entries that
                # happen to sort before this one.
                past = [x for x in hist[:i] if x[0] < game[0]]
                self.form_cache[(team_id, game[0])] = self._form_features(past)

        for (h_id, a_id), hist in h2h_map.items():
            hist.sort(key=lambda x: x[0])
            for i, game in enumerate(hist):
                past = [x for x in hist[:i] if x[0] < game[0]]
                self.h2h_cache[(h_id, a_id, game[0])] = self._h2h_features(past)

# =============================================================================
# FEATURE EXTRACTION PIPELINE
# =============================================================================

def process_matches(loader: BatchDataLoader):
    """Turn the loader's matches into feature/label rows and write OUTPUT_CSV.

    Matches without both money-line prices are skipped and counted. Every
    other feature falls back to a neutral default when it is missing from the
    loader caches; a missing totals line falls back to 160.0 and a missing
    spread line to 0.0, and the corresponding labels are computed against
    those fallback lines.

    Args:
        loader: a BatchDataLoader on which ``load_all()`` has already run.

    Side effects:
        Overwrites OUTPUT_CSV, prints a summary, and exits the process with
        status 1 if a row does not match FEATURE_COLS in length.
    """
    extracted_count = 0
    missing_odds_count = 0

    # Context manager so the file is flushed and closed even when the loop
    # raises or bails out through sys.exit().
    with open(OUTPUT_CSV, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(FEATURE_COLS)

        for match in loader.matches:
            mid = str(match['id'])
            mst = int(match['mst_utc'])
            hid = str(match['home_team_id'])
            aid = str(match['away_team_id'])

            # True Results
            s_home = int(match['score_home'])
            s_away = int(match['score_away'])
            total_pts = s_home + s_away

            c_odds = loader.odds_cache.get(mid, {})

            # Basic validation: ensure we have at least ML odds
            if "ml_h" not in c_odds or "ml_a" not in c_odds:
                missing_odds_count += 1
                continue

            c_form_h = loader.form_cache.get((hid, mst), {})
            c_form_a = loader.form_cache.get((aid, mst), {})
            c_h2h = loader.h2h_cache.get((hid, aid, mst), {})

            # Target Variables (Labels)
            label_ml = 0 if s_home > s_away else 1  # Home Win vs Away Win

            # Totals label (evaluate against dynamic line)
            line_tot = c_odds.get("tot_line", 160.0)
            label_tot = 1 if total_pts > line_tot else 0  # Over = 1, Under = 0

            # Spread label (evaluate against dynamic line)
            # Home Spread Coverage. Example: line= -5.5. s_home + line = s_home - 5.5.
            line_spread = c_odds.get("spread_line", 0.0)
            hc_score = float(s_home) + float(line_spread)
            label_spread = 1 if hc_score > float(s_away) else 0  # Spread Coverage: 1=Home, 0=Away

            # Compile Row
            row = [
                # Identifiers
                mid, hid, aid, match.get('league_id', ''), mst,

                # Form cache
                c_form_h.get("points_avg", 80), c_form_h.get("conceded_avg", 80),
                c_form_a.get("points_avg", 80), c_form_a.get("conceded_avg", 80),
                c_form_h.get("winning_streak", 0), c_form_a.get("winning_streak", 0),
                # 0.5 matches the neutral win rate the loader stores for teams
                # without history (the previous fallback of 0 disagreed with it).
                c_form_h.get("win_rate", 0.5), c_form_a.get("win_rate", 0.5),

                # H2H cache
                c_h2h.get("total", 0), c_h2h.get("home_win_rate", 0.5),
                c_h2h.get("avg_points", 160.0), c_h2h.get("over140_rate", 0.5),

                # Odds
                c_odds.get("ml_h", 1.9), c_odds.get("ml_a", 1.9),
                c_odds.get("tot_o", 1.9), c_odds.get("tot_u", 1.9), line_tot,
                c_odds.get("spread_h", 1.9), c_odds.get("spread_a", 1.9), line_spread,

                # Labels
                s_home, s_away, total_pts,
                label_ml,
                label_tot,
                label_spread,
            ]

            # Safeguard length
            if len(row) != len(FEATURE_COLS):
                print(f"Error: Row length mismatch {len(row)} != {len(FEATURE_COLS)}")
                sys.exit(1)

            writer.writerow(row)
            extracted_count += 1

    print("\nExtraction Summary")
    print("=========================")
    print(f"Total Matches in Scope: {len(loader.matches)}")
    print(f"Filtered (Missing ML Odds): {missing_odds_count}")
    print(f"✅ Successfully Extracted: {extracted_count}")
    print(f"📂 Saved to: {OUTPUT_CSV}")


if __name__ == "__main__":
    # Script entry point: read the league filter, bulk-load data from the
    # database, write the training CSV, and report the total run time.
    t_start = time.time()

    # Load leagues
    if not os.path.exists(TOP_LEAGUES_PATH):
        print(f"Error: file not found {TOP_LEAGUES_PATH}")
        sys.exit(1)

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
        top_leagues = json.load(f)

    print("🏀 Extracting Basketball Training Data (XGBoost)")
    print("==================================================")
    print(f"Loaded {len(top_leagues)} top leagues.")

    conn = get_conn()
    try:
        loader = BatchDataLoader(conn, top_leagues)

        # 1. Pre-load everything into memory
        loader.load_all()

        # 2. Extract and match features, then write CSV
        process_matches(loader)
    finally:
        # Release the connection even when loading or extraction fails.
        conn.close()

    print(f"Total Script Run Time: {time.time()-t_start:.1f}s")