""" XGBoost Training Data Extraction (Basketball) ============================================== Batch feature extraction for top-league basketball matches. Extracts features + labels per match for XGBoost model training. Usage: python3 scripts/extract_basketball_data.py """ import os import sys import json import csv import math import time from datetime import datetime from collections import defaultdict import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() # ============================================================================= # CONFIG # ============================================================================= AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "basketball_training_data.csv") os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True) def get_conn(): db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] return psycopg2.connect(db_url) # ============================================================================= # FEATURE COLUMNS (ORDER MATTERS — matches CSV header) # ============================================================================= FEATURE_COLS = [ # Match identifiers "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc", # Form Features (8) "home_points_avg", "home_conceded_avg", "away_points_avg", "away_conceded_avg", "home_winning_streak", "away_winning_streak", "home_win_rate", "away_win_rate", # H2H Features (4) "h2h_total_matches", "h2h_home_win_rate", "h2h_avg_points", "h2h_over140_rate", # Odds Features (6) "odds_ml_h", "odds_ml_a", "odds_tot_o", "odds_tot_u", "odds_tot_line", "odds_spread_h", "odds_spread_a", "odds_spread_line", # Labels "score_home", "score_away", "total_points", "label_ml", # 0=Home, 1=Away "label_tot", # 0=Under, 1=Over (dynamic line) "label_spread", # 0=Away Cover, 1=Home Cover (dynamic line) ] # ============================================================================= # BATCH LOADERS — Pre-load data to avoid N+1 queries # ============================================================================= class BatchDataLoader: """Pre-loads all necessary data in bulk, then serves features per match.""" def __init__(self, conn, top_league_ids: list): self.conn = conn self.cur = conn.cursor(cursor_factory=RealDictCursor) self.top_league_ids = top_league_ids # Pre-loaded data caches self.matches = [] self.odds_cache = {} # match_id → {ml_h, ml_a, ...} self.form_cache = {} # (team_id, match_id) → form features self.h2h_cache = {} # (home_id, away_id, match_id) → h2h features def load_all(self): """Load all data in batch.""" t0 = time.time() self._load_matches() print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True) t1 = time.time() self._load_odds() print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True) t3 = time.time() self._load_team_history() print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True) print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True) def _load_matches(self): query = """ SELECT id, mst_utc, league_id, home_team_id, away_team_id, score_home, score_away, status FROM matches WHERE sport = 'basketball' AND status = 'FT' AND score_home IS NOT NULL AND score_away IS NOT NULL AND mst_utc > 1640995200000 -- Since Jan 1, 2022 """ if self.top_league_ids: format_strings = ",".join(["%s"] * len(self.top_league_ids)) query += f" AND league_id IN ({format_strings})" self.cur.execute(query + " ORDER BY mst_utc ASC", tuple(self.top_league_ids)) else: self.cur.execute(query + " ORDER BY mst_utc ASC") self.matches = self.cur.fetchall() def _load_odds(self): query = """ SELECT match_id, name as category_name, db_id as category_id FROM odd_categories WHERE match_id IN ( SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT' ) """ self.cur.execute(query) cats = self.cur.fetchall() # map cat -> match cat_to_match = {c['category_id']: c['match_id'] for c in cats} query2 = """ SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %(cat_ids)s """ cat_ids = tuple(cat_to_match.keys()) if not cat_ids: return cat_id_to_name = {c['category_id']: c['category_name'] for c in cats} chunk_size = 50000 cats_list = list(cat_ids) total_chunks = len(cats_list) // chunk_size + 1 print(f" Fetching {len(cats_list)} categories in {total_chunks} chunks...", flush=True) for idx, i in enumerate(range(0, len(cats_list), chunk_size)): chunk = tuple(cats_list[i:i+chunk_size]) self.cur.execute("SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %s", (chunk,)) rows = self.cur.fetchall() for row in rows: c_id = row['odd_category_db_id'] m_id = cat_to_match[c_id] c_name = cat_id_to_name.get(c_id, "") if m_id not in self.odds_cache: self.odds_cache[m_id] = {} self._parse_single_odd(m_id, c_name, str(row['name']), float(row['odd_value'])) print(f" Processed chunk {idx+1}/{total_chunks} ({len(rows)} selections).", flush=True) def _parse_single_odd(self, match_id, category_name, sel_name, odd_value): if odd_value <= 1.0: return cat_lower = category_name.lower() sel_lower = sel_name.lower() target = self.odds_cache[match_id] # ML if cat_lower in ("maç sonucu (uzt. dahil)", "mac sonucu (uzt. dahil)", "maç sonucu", "mac sonucu"): if sel_lower == "1": target["ml_h"] = odd_value elif sel_lower == "2": target["ml_a"] = odd_value # Totals if "alt/üst" in cat_lower or "alt/ust" in cat_lower: # Extract line line = None try: left = cat_lower.find("(") right = cat_lower.find(")", left + 1) if left > -1 and right > -1: line = float(cat_lower[left+1:right].replace(",", ".")) except: pass if line and "tot_line" not in target: target["tot_line"] = line if "üst" in sel_lower or "ust" in sel_lower or "over" in sel_lower: target.setdefault("tot_o", odd_value) elif "alt" in sel_lower or "under" in sel_lower: target.setdefault("tot_u", odd_value) # Spread if "hnd. ms" in cat_lower or "hand. ms" in cat_lower or "hnd ms" in cat_lower: line = None try: left = cat_lower.find("(") right = cat_lower.find(")", left + 1) if left > -1 and right > -1: payload = cat_lower[left+1:right].replace(",", ".") if ":" in payload: home_hcp = float(payload.split(":")[0]) away_hcp = float(payload.split(":")[1]) if abs(home_hcp) < 1e-6 and away_hcp > 0: line = -away_hcp elif home_hcp > 0 and abs(away_hcp) < 1e-6: line = home_hcp elif abs(home_hcp - away_hcp) < 1e-6 and home_hcp > 0: line = 0.0 except: pass if line is not None and "spread_line" not in target: target["spread_line"] = line if sel_lower == "1": target.setdefault("spread_h", odd_value) elif sel_lower == "2": target.setdefault("spread_a", odd_value) def _load_team_history(self): # We need historical form (avg points scored/conceded, win rate). team_matches = defaultdict(list) for m in self.matches: # m has id, mst_utc, home_team_id, away_team_id, score_home, score_away team_matches[m['home_team_id']].append((m['mst_utc'], m['score_home'], m['score_away'], 'H')) team_matches[m['away_team_id']].append((m['mst_utc'], m['score_away'], m['score_home'], 'A')) for team_id, hist in team_matches.items(): hist.sort(key=lambda x: x[0]) # Sort by time for i, (mst_utc, scored, conceded, location) in enumerate(hist): # Filter past matches past = [x for x in hist[:i] if x[0] < mst_utc] if not past: self.form_cache[(team_id, mst_utc)] = { "points_avg": 80.0, "conceded_avg": 80.0, "winning_streak": 0, "win_rate": 0.5 } continue last_5 = past[-5:] pts = sum(x[1] for x in last_5) / len(last_5) conc = sum(x[2] for x in last_5) / len(last_5) wins = sum(1 for x in past if x[1] > x[2]) win_rate = wins / len(past) if len(past) > 0 else 0.5 streak = 0 for x in reversed(past): if x[1] > x[2]: streak += 1 else: break self.form_cache[(team_id, mst_utc)] = { "points_avg": pts, "conceded_avg": conc, "winning_streak": streak, "win_rate": win_rate } # Build H2H h2h_map = defaultdict(list) for m in self.matches: pair = tuple(sorted([str(m['home_team_id']), str(m['away_team_id'])])) tgt = m['home_team_id'] h_win = 1 if m['score_home'] > m['score_away'] else 0 if tgt != pair[0]: # Ensure orientation is relative to pair[0] usually, but let's just do directional pass directional_pair = (str(m['home_team_id']), str(m['away_team_id'])) h2h_map[directional_pair].append((m['mst_utc'], m['score_home'], m['score_away'])) for (h_id, a_id), hist in h2h_map.items(): hist.sort(key=lambda x: x[0]) for i, (mst_utc, sh, sa) in enumerate(hist): past = [x for x in hist[:i] if x[0] < mst_utc] if not past: self.h2h_cache[(h_id, a_id, mst_utc)] = { "total": 0, "home_win_rate": 0.5, "avg_points": 160.0, "over140_rate": 0.5 } else: home_wins = sum(1 for x in past if x[1] > x[2]) total_pts = sum(x[1] + x[2] for x in past) over140 = sum(1 for x in past if x[1] + x[2] > 140) self.h2h_cache[(h_id, a_id, mst_utc)] = { "total": len(past), "home_win_rate": home_wins / len(past), "avg_points": total_pts / len(past), "over140_rate": over140 / len(past) } # ============================================================================= # FEATURE EXTRACTION PIPELINE # ============================================================================= def process_matches(loader: BatchDataLoader): """Processes loaded matches, maps to features, handles implicit fallbacks, saves to CSV.""" f = open(OUTPUT_CSV, "w", newline='') writer = csv.writer(f) writer.writerow(FEATURE_COLS) extracted_count = 0 missing_odds_count = 0 for match in loader.matches: mid = str(match['id']) mst = int(match['mst_utc']) hid = str(match['home_team_id']) aid = str(match['away_team_id']) # True Results s_home = int(match['score_home']) s_away = int(match['score_away']) total_pts = s_home + s_away c_odds = loader.odds_cache.get(mid, {}) c_form_h = loader.form_cache.get((hid, mst), {}) c_form_a = loader.form_cache.get((aid, mst), {}) c_h2h = loader.h2h_cache.get((hid, aid, mst), {}) # Basic validation: ensure we have at least ML odds if "ml_h" not in c_odds or "ml_a" not in c_odds: missing_odds_count += 1 continue # Target Variables (Labels) label_ml = 0 if s_home > s_away else 1 # Home Win vs Away Win # Totals label (evaluate against dynamic line) line_tot = c_odds.get("tot_line", 160.0) label_tot = 1 if total_pts > line_tot else 0 # Over = 1, Under = 0 # Spread label (evaluate against dynamic line) # Home Spread Coverage. Example: line= -5.5. s_home + line = s_home - 5.5. line_spread = c_odds.get("spread_line", 0.0) hc_score = float(s_home) + float(line_spread) label_spread = 1 if hc_score > float(s_away) else 0 # Spread Coverage: 1=Home, 0=Away # Compile Row row = [ # Identifiers mid, hid, aid, match.get('league_id', ''), mst, # Form cache c_form_h.get("points_avg", 80), c_form_h.get("conceded_avg", 80), c_form_a.get("points_avg", 80), c_form_a.get("conceded_avg", 80), c_form_h.get("winning_streak", 0), c_form_a.get("winning_streak", 0), c_form_h.get("win_rate", 0), c_form_a.get("win_rate", 0), # H2H cache c_h2h.get("total", 0), c_h2h.get("home_win_rate", 0.5), c_h2h.get("avg_points", 160.0), c_h2h.get("over140_rate", 0.5), # Odds c_odds.get("ml_h", 1.9), c_odds.get("ml_a", 1.9), c_odds.get("tot_o", 1.9), c_odds.get("tot_u", 1.9), line_tot, c_odds.get("spread_h", 1.9), c_odds.get("spread_a", 1.9), line_spread, # Labels s_home, s_away, total_pts, label_ml, label_tot, label_spread, ] # Safeguard length if len(row) != len(FEATURE_COLS): print(f"Error: Row length mismatch {len(row)} != {len(FEATURE_COLS)}") sys.exit(1) writer.writerow(row) extracted_count += 1 f.close() print("\nExtraction Summary") print("=========================") print(f"Total Matches in Scope: {len(loader.matches)}") print(f"Filtered (Missing ML Odds): {missing_odds_count}") print(f"✅ Successfully Extracted: {extracted_count}") print(f"📂 Saved to: {OUTPUT_CSV}") if __name__ == "__main__": t_start = time.time() # Load leagues if not os.path.exists(TOP_LEAGUES_PATH): print(f"Error: file not found {TOP_LEAGUES_PATH}") sys.exit(1) with open(TOP_LEAGUES_PATH, "r") as f: top_leagues = json.load(f) print(f"🏀 Extracting Basketball Training Data (XGBoost)") print(f"==================================================") print(f"Loaded {len(top_leagues)} top leagues.") conn = get_conn() loader = BatchDataLoader(conn, top_leagues) # 1. Pre-load everything into memory loader.load_all() # 2. Extract and match features, then write CSV process_matches(loader) conn.close() print(f"Total Script Run Time: {time.time()-t_start:.1f}s")