""" XGBoost Training Data Extraction (Advanced Basketball V21) ============================================================ Batch feature extraction for top-league basketball matches. Extracts 60+ features per match including deep team stats (FG%, Rebounds, Qrt pacing). Usage: python3 scripts/extract_advanced_basketball_data.py """ import os import sys import json import csv import math import time from datetime import datetime from collections import defaultdict import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() # ============================================================================= # CONFIG # ============================================================================= AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "advanced_basketball_training_data.csv") os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True) def get_conn(): db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] return psycopg2.connect(db_url) # ============================================================================= # FEATURE COLUMNS (ORDER MATTERS) # ============================================================================= FEATURE_COLS = [ "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc", # Form & Winning "home_winning_streak", "away_winning_streak", "home_win_rate", "away_win_rate", # Home Team Offense (Averages of last 5) "home_pts_avg", "home_reb_avg", "home_ast_avg", "home_stl_avg", "home_blk_avg", "home_tov_avg", "home_fg_pct", "home_3pt_pct", "home_ft_pct", "home_q1_avg", "home_q2_avg", "home_q3_avg", "home_q4_avg", # Home Team Defense (Averages of opponent stats in last 5) "home_conc_pts", "home_conc_reb", "home_conc_ast", "home_conc_tov", "home_conc_fg_pct", "home_conc_3pt_pct", # Away Team Offense (Averages of last 5) "away_pts_avg", "away_reb_avg", "away_ast_avg", "away_stl_avg", "away_blk_avg", "away_tov_avg", "away_fg_pct", "away_3pt_pct", "away_ft_pct", "away_q1_avg", "away_q2_avg", "away_q3_avg", "away_q4_avg", # Away Team Defense (Averages of opponent stats in last 5) "away_conc_pts", "away_conc_reb", "away_conc_ast", "away_conc_tov", "away_conc_fg_pct", "away_conc_3pt_pct", # H2H Features "h2h_total_matches", "h2h_home_win_rate", "h2h_avg_points", "h2h_over140_rate", # Odds Features "odds_ml_h", "odds_ml_a", "odds_tot_o", "odds_tot_u", "odds_tot_line", "odds_spread_h", "odds_spread_a", "odds_spread_line", # Labels "score_home", "score_away", "total_points", "label_ml", # 0=Home, 1=Away "label_tot", # 0=Under, 1=Over (dynamic line) "label_spread", # 0=Away Cover, 1=Home Cover (dynamic line) ] # ============================================================================= # BATCH LOADERS # ============================================================================= class AdvancedDataLoader: def __init__(self, conn, top_league_ids: list): self.conn = conn self.cur = conn.cursor(cursor_factory=RealDictCursor) self.top_league_ids = top_league_ids self.matches = [] self.odds_cache = {} self.team_stats_cache = {} # (match_id, team_id) -> stats dict self.form_cache = {} self.h2h_cache = {} def load_all(self): t0 = time.time() self._load_matches() print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True) t1 = time.time() self._load_team_stats() print(f" ✅ Team Stats: {len(self.team_stats_cache)} records ({time.time()-t1:.1f}s)", flush=True) t2 = time.time() self._load_odds() print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t2:.1f}s)", flush=True) t3 = time.time() self._build_advanced_history() print(f" ✅ Advanced History & Stats cache built ({time.time()-t3:.1f}s)", flush=True) print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True) def _load_matches(self): query = """ SELECT id, mst_utc, league_id, home_team_id, away_team_id, score_home, score_away FROM matches WHERE sport = 'basketball' AND status = 'FT' AND score_home IS NOT NULL AND score_away IS NOT NULL AND mst_utc > 1640995200000 """ if self.top_league_ids: format_strings = ",".join(["%s"] * len(self.top_league_ids)) query += f" AND league_id IN ({format_strings})" self.cur.execute(query + " ORDER BY mst_utc ASC", tuple(self.top_league_ids)) else: self.cur.execute(query + " ORDER BY mst_utc ASC") self.matches = self.cur.fetchall() def _load_team_stats(self): query = """ SELECT match_id, team_id, points, rebounds, assists, steals, blocks, turnovers, fg_made, fg_attempted, three_pt_made, three_pt_attempted, ft_made, ft_attempted, q1_score, q2_score, q3_score, q4_score FROM basketball_team_stats WHERE match_id IN ( SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT' ) """ self.cur.execute(query) rows = self.cur.fetchall() for r in rows: self.team_stats_cache[(str(r['match_id']), str(r['team_id']))] = r def _load_odds(self): # Using exact same odds parser as original script query = """ SELECT match_id, name as category_name, db_id as category_id FROM odd_categories WHERE match_id IN ( SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT' ) """ self.cur.execute(query) cats = self.cur.fetchall() cat_to_match = {c['category_id']: c['match_id'] for c in cats} cat_ids = tuple(cat_to_match.keys()) if not cat_ids: return cat_id_to_name = {c['category_id']: c['category_name'] for c in cats} chunk_size = 50000 cats_list = list(cat_ids) total_chunks = len(cats_list) // chunk_size + 1 for idx, i in enumerate(range(0, len(cats_list), chunk_size)): chunk = tuple(cats_list[i:i+chunk_size]) self.cur.execute("SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %s", (chunk,)) rows = self.cur.fetchall() for row in rows: c_id = row['odd_category_db_id'] m_id = str(cat_to_match[c_id]) c_name = cat_id_to_name.get(c_id, "") if m_id not in self.odds_cache: self.odds_cache[m_id] = {} self._parse_single_odd(m_id, c_name, str(row['name']), float(row['odd_value'])) def _parse_single_odd(self, match_id, category_name, sel_name, odd_value): if odd_value <= 1.0: return cat_lower = category_name.lower() sel_lower = sel_name.lower() target = self.odds_cache[match_id] # ML if cat_lower in ("maç sonucu (uzt. dahil)", "mac sonucu (uzt. dahil)", "maç sonucu", "mac sonucu"): if sel_lower == "1": target["ml_h"] = odd_value elif sel_lower == "2": target["ml_a"] = odd_value # Totals if "alt/üst" in cat_lower or "alt/ust" in cat_lower: line = None try: left = cat_lower.find("(") right = cat_lower.find(")", left + 1) if left > -1 and right > -1: line = float(cat_lower[left+1:right].replace(",", ".")) except: pass if line and "tot_line" not in target: target["tot_line"] = line if "üst" in sel_lower or "ust" in sel_lower or "over" in sel_lower: target.setdefault("tot_o", odd_value) elif "alt" in sel_lower or "under" in sel_lower: target.setdefault("tot_u", odd_value) # Spread if "hnd. ms" in cat_lower or "hand. ms" in cat_lower or "hnd ms" in cat_lower: line = None try: left = cat_lower.find("(") right = cat_lower.find(")", left + 1) if left > -1 and right > -1: payload = cat_lower[left+1:right].replace(",", ".") if ":" in payload: home_hcp = float(payload.split(":")[0]) away_hcp = float(payload.split(":")[1]) if abs(home_hcp) < 1e-6 and away_hcp > 0: line = -away_hcp elif home_hcp > 0 and abs(away_hcp) < 1e-6: line = home_hcp elif abs(home_hcp - away_hcp) < 1e-6 and home_hcp > 0: line = 0.0 except: pass if line is not None and "spread_line" not in target: target["spread_line"] = line if sel_lower == "1": target.setdefault("spread_h", odd_value) elif sel_lower == "2": target.setdefault("spread_a", odd_value) def _build_advanced_history(self): team_matches = defaultdict(list) for m in self.matches: mid = str(m['id']) hid = str(m['home_team_id']) aid = str(m['away_team_id']) # Fetch stats from cache h_stat = self.team_stats_cache.get((mid, hid)) a_stat = self.team_stats_cache.get((mid, aid)) if h_stat and a_stat: m_data = { "utc": int(m['mst_utc']), "mid": mid, } # For Home Team History (it stores what THEY did, and what Opp did) team_matches[hid].append({ "utc": int(m['mst_utc']), "scored": m['score_home'], "conceded": m['score_away'], "offense": h_stat, "defense": a_stat }) # For Away Team History team_matches[aid].append({ "utc": int(m['mst_utc']), "scored": m['score_away'], "conceded": m['score_home'], "offense": a_stat, "defense": h_stat }) else: # If advanced stats are missing, we still push the scores to maintain streak tracking team_matches[hid].append({ "utc": int(m['mst_utc']), "scored": m['score_home'], "conceded": m['score_away'], "offense": None, "defense": None }) team_matches[aid].append({ "utc": int(m['mst_utc']), "scored": m['score_away'], "conceded": m['score_home'], "offense": None, "defense": None }) for team_id, hist in team_matches.items(): hist.sort(key=lambda x: x["utc"]) for i, match_info in enumerate(hist): mst_utc = match_info["utc"] past = [x for x in hist[:i] if x["utc"] < mst_utc] if not past: self.form_cache[(team_id, mst_utc)] = self._empty_form() continue last_5 = past[-5:] wins = sum(1 for x in past if x["scored"] > x["conceded"]) win_rate = wins / len(past) if len(past) > 0 else 0.5 streak = 0 for x in reversed(past): if x["scored"] > x["conceded"]: streak += 1 else: break # Averages off_pts, off_reb, off_ast, off_stl, off_blk, off_tov = 0,0,0,0,0,0 off_fg_m, off_fg_a, off_3pt_m, off_3pt_a, off_ft_m, off_ft_a = 0,0,0,0,0,0 off_q1, off_q2, off_q3, off_q4 = 0,0,0,0 def_pts, def_reb, def_ast, def_tov = 0,0,0,0 def_fg_m, def_fg_a, def_3pt_m, def_3pt_a = 0,0,0,0 valid_stats_count = sum(1 for x in last_5 if x["offense"] is not None) if valid_stats_count > 0: for x in last_5: o = x["offense"] d = x["defense"] if o and d: off_pts += (o["points"] or 0) off_reb += (o["rebounds"] or 0) off_ast += (o["assists"] or 0) off_stl += (o["steals"] or 0) off_blk += (o["blocks"] or 0) off_tov += (o["turnovers"] or 0) off_fg_m += (o["fg_made"] or 0) off_fg_a += (o["fg_attempted"] or 0) off_3pt_m += (o["three_pt_made"] or 0) off_3pt_a += (o["three_pt_attempted"] or 0) off_ft_m += (o["ft_made"] or 0) off_ft_a += (o["ft_attempted"] or 0) off_q1 += (o["q1_score"] or 0) off_q2 += (o["q2_score"] or 0) off_q3 += (o["q3_score"] or 0) off_q4 += (o["q4_score"] or 0) def_pts += (d["points"] or 0) # Conceded points based on opponents "offense" data def_reb += (d["rebounds"] or 0) def_ast += (d["assists"] or 0) def_tov += (d["turnovers"] or 0) def_fg_m += (d["fg_made"] or 0) def_fg_a += (d["fg_attempted"] or 0) def_3pt_m += (d["three_pt_made"] or 0) def_3pt_a += (d["three_pt_attempted"] or 0) avg_c = float(valid_stats_count) self.form_cache[(team_id, mst_utc)] = { "winning_streak": streak, "win_rate": win_rate, "pts_avg": off_pts/avg_c, "reb_avg": off_reb/avg_c, "ast_avg": off_ast/avg_c, "stl_avg": off_stl/avg_c, "blk_avg": off_blk/avg_c, "tov_avg": off_tov/avg_c, "fg_pct": (off_fg_m / off_fg_a) if off_fg_a > 0 else 0.45, "3pt_pct": (off_3pt_m / off_3pt_a) if off_3pt_a > 0 else 0.35, "ft_pct": (off_ft_m / off_ft_a) if off_ft_a > 0 else 0.75, "q1_avg": off_q1/avg_c, "q2_avg": off_q2/avg_c, "q3_avg": off_q3/avg_c, "q4_avg": off_q4/avg_c, "conc_pts": def_pts/avg_c, "conc_reb": def_reb/avg_c, "conc_ast": def_ast/avg_c, "conc_tov": def_tov/avg_c, "conc_fg_pct": (def_fg_m / def_fg_a) if def_fg_a > 0 else 0.45, "conc_3pt_pct": (def_3pt_m / def_3pt_a) if def_3pt_a > 0 else 0.35, } else: self.form_cache[(team_id, mst_utc)] = self._empty_form() self.form_cache[(team_id, mst_utc)]["winning_streak"] = streak self.form_cache[(team_id, mst_utc)]["win_rate"] = win_rate # Build H2H similarly h2h_map = defaultdict(list) for m in self.matches: directional_pair = (str(m['home_team_id']), str(m['away_team_id'])) h2h_map[directional_pair].append((m['mst_utc'], m['score_home'], m['score_away'])) for (h_id, a_id), hist in h2h_map.items(): hist.sort(key=lambda x: x[0]) for i, (mst_utc, sh, sa) in enumerate(hist): past = [x for x in hist[:i] if x[0] < mst_utc] if not past: self.h2h_cache[(h_id, a_id, mst_utc)] = { "total": 0, "home_win_rate": 0.5, "avg_points": 160.0, "over140_rate": 0.5 } else: home_wins = sum(1 for x in past if x[1] > x[2]) total_pts = sum(x[1] + x[2] for x in past) over140 = sum(1 for x in past if x[1] + x[2] > 140) self.h2h_cache[(h_id, a_id, mst_utc)] = { "total": len(past), "home_win_rate": home_wins / len(past), "avg_points": total_pts / len(past), "over140_rate": over140 / len(past) } def _empty_form(self): return { "winning_streak": 0, "win_rate": 0.5, "pts_avg": 80.0, "reb_avg": 35.0, "ast_avg": 20.0, "stl_avg": 7.0, "blk_avg": 3.0, "tov_avg": 13.0, "fg_pct": 0.45, "3pt_pct": 0.35, "ft_pct": 0.75, "q1_avg": 20.0, "q2_avg": 20.0, "q3_avg": 20.0, "q4_avg": 20.0, "conc_pts": 80.0, "conc_reb": 35.0, "conc_ast": 20.0, "conc_tov": 13.0, "conc_fg_pct": 0.45, "conc_3pt_pct": 0.35, } # ============================================================================= # FEATURE EXTRACTION PIPELINE # ============================================================================= def process_matches(loader: AdvancedDataLoader): f = open(OUTPUT_CSV, "w", newline='') writer = csv.writer(f) writer.writerow(FEATURE_COLS) extracted_count = 0 missing_odds_count = 0 for match in loader.matches: mid = str(match['id']) mst = int(match['mst_utc']) hid = str(match['home_team_id']) aid = str(match['away_team_id']) s_home = int(match['score_home']) s_away = int(match['score_away']) total_pts = s_home + s_away c_odds = loader.odds_cache.get(mid, {}) c_form_h = loader.form_cache.get((hid, mst), {}) c_form_a = loader.form_cache.get((aid, mst), {}) c_h2h = loader.h2h_cache.get((hid, aid, mst), {}) if "ml_h" not in c_odds or "ml_a" not in c_odds: missing_odds_count += 1 continue label_ml = 0 if s_home > s_away else 1 line_tot = c_odds.get("tot_line", 160.0) label_tot = 1 if total_pts > line_tot else 0 line_spread = c_odds.get("spread_line", 0.0) hc_score = float(s_home) + float(line_spread) label_spread = 1 if hc_score > float(s_away) else 0 row = [ mid, hid, aid, match.get('league_id', ''), mst, c_form_h.get("winning_streak", 0), c_form_a.get("winning_streak", 0), c_form_h.get("win_rate", 0), c_form_a.get("win_rate", 0), # Home Offense c_form_h.get("pts_avg", 80), c_form_h.get("reb_avg", 35), c_form_h.get("ast_avg", 20), c_form_h.get("stl_avg", 7), c_form_h.get("blk_avg", 3), c_form_h.get("tov_avg", 13), c_form_h.get("fg_pct", 0.45), c_form_h.get("3pt_pct", 0.35), c_form_h.get("ft_pct", 0.75), c_form_h.get("q1_avg", 20), c_form_h.get("q2_avg", 20), c_form_h.get("q3_avg", 20), c_form_h.get("q4_avg", 20), # Home Defense c_form_h.get("conc_pts", 80), c_form_h.get("conc_reb", 35), c_form_h.get("conc_ast", 20), c_form_h.get("conc_tov", 13), c_form_h.get("conc_fg_pct", 0.45), c_form_h.get("conc_3pt_pct", 0.35), # Away Offense c_form_a.get("pts_avg", 80), c_form_a.get("reb_avg", 35), c_form_a.get("ast_avg", 20), c_form_a.get("stl_avg", 7), c_form_a.get("blk_avg", 3), c_form_a.get("tov_avg", 13), c_form_a.get("fg_pct", 0.45), c_form_a.get("3pt_pct", 0.35), c_form_a.get("ft_pct", 0.75), c_form_a.get("q1_avg", 20), c_form_a.get("q2_avg", 20), c_form_a.get("q3_avg", 20), c_form_a.get("q4_avg", 20), # Away Defense c_form_a.get("conc_pts", 80), c_form_a.get("conc_reb", 35), c_form_a.get("conc_ast", 20), c_form_a.get("conc_tov", 13), c_form_a.get("conc_fg_pct", 0.45), c_form_a.get("conc_3pt_pct", 0.35), c_h2h.get("total", 0), c_h2h.get("home_win_rate", 0.5), c_h2h.get("avg_points", 160.0), c_h2h.get("over140_rate", 0.5), c_odds.get("ml_h", 1.9), c_odds.get("ml_a", 1.9), c_odds.get("tot_o", 1.9), c_odds.get("tot_u", 1.9), line_tot, c_odds.get("spread_h", 1.9), c_odds.get("spread_a", 1.9), line_spread, s_home, s_away, total_pts, label_ml, label_tot, label_spread, ] if len(row) != len(FEATURE_COLS): print(f"Error: Row length mismatch {len(row)} != {len(FEATURE_COLS)}") sys.exit(1) writer.writerow(row) extracted_count += 1 f.close() print("\nExtraction Summary") print("=========================") print(f"Total Matches in Scope: {len(loader.matches)}") print(f"Filtered (Missing ML Odds): {missing_odds_count}") print(f"✅ Successfully Extracted: {extracted_count}") print(f"📂 Saved to: {OUTPUT_CSV}") if __name__ == "__main__": t_start = time.time() if not os.path.exists(TOP_LEAGUES_PATH): print(f"Error: file not found {TOP_LEAGUES_PATH}") sys.exit(1) with open(TOP_LEAGUES_PATH, "r") as f: top_leagues = json.load(f) print(f"🏀 Extracting Advanced Basketball Training Data (V21)") print(f"=====================================================") print(f"Loaded {len(top_leagues)} top leagues.") conn = get_conn() loader = AdvancedDataLoader(conn, top_leagues) loader.load_all() process_matches(loader) conn.close() print(f"Total Script Run Time: {time.time()-t_start:.1f}s")