""" Extract basketball V25-style training data. Scope: - top leagues from basketball_top_leagues.json - finished basketball matches - pre-match features only - labels for moneyline / total / spread markets """ from __future__ import annotations import csv import json import os import sys import time from collections import defaultdict from typing import Any, Dict, List, Tuple import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) from models.basketball_v25_features import DEFAULT_FEATURE_COLS TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "basketball_training_data_v25.csv") IDENTIFIER_COLS = ["match_id", "home_team_id", "away_team_id", "league_id", "mst_utc"] LABEL_COLS = [ "score_home", "score_away", "total_points", "label_ml", "label_total", "label_spread", ] CSV_COLS = IDENTIFIER_COLS + DEFAULT_FEATURE_COLS + LABEL_COLS def get_conn(): db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] if not db_url: raise RuntimeError("DATABASE_URL is required") return psycopg2.connect(db_url) def safe_float(value: Any, default: float = 0.0) -> float: try: if value is None: return default return float(value) except (TypeError, ValueError): return default def pct(num: float, den: float, default: float = 0.0) -> float: if den <= 0: return default return float(num) / float(den) def default_recent_stats() -> Dict[str, float]: return { "points_avg": 82.0, "conceded_avg": 80.0, "net_rating": 2.0, "win_rate": 0.5, "winning_streak": 0.0, "rest_days": 3.0, "rebounds_avg": 35.0, "assists_avg": 18.0, "steals_avg": 6.5, "blocks_avg": 3.0, "turnovers_avg": 13.0, "fg_pct": 0.45, "three_pt_pct": 0.34, "ft_pct": 0.75, "q1_avg": 20.0, "q4_avg": 21.0, "conc_rebounds_avg": 35.0, "conc_assists_avg": 18.0, "conc_turnovers_avg": 13.0, "conc_fg_pct": 0.45, "conc_three_pt_pct": 0.34, } def summarize_team_history(history: List[Dict[str, Any]], match_date_ms: int) -> Dict[str, float]: if not history: return default_recent_stats() recent = history[-8:] form_window = history[-12:] scored = [safe_float(item["scored"]) for item in recent] conceded = [safe_float(item["conceded"]) for item in recent] wins = sum(1 for item in form_window if safe_float(item["scored"]) > safe_float(item["conceded"])) streak = 0 for item in reversed(form_window): if safe_float(item["scored"]) > safe_float(item["conceded"]): streak += 1 else: break last_match_ms = safe_float(history[-1].get("mst_utc"), 0.0) rest_days = max(0.0, (float(match_date_ms) - last_match_ms) / 86_400_000.0) if last_match_ms else 3.0 def avg_key(key: str, fallback: float) -> float: values = [safe_float(item.get(key), fallback) for item in recent] return sum(values) / max(len(values), 1) points_avg = sum(scored) / max(len(scored), 1) conceded_avg = sum(conceded) / max(len(conceded), 1) return { "points_avg": points_avg, "conceded_avg": conceded_avg, "net_rating": points_avg - conceded_avg, "win_rate": wins / max(len(form_window), 1), "winning_streak": float(streak), "rest_days": rest_days, "rebounds_avg": avg_key("rebounds", 35.0), "assists_avg": avg_key("assists", 18.0), "steals_avg": avg_key("steals", 6.5), "blocks_avg": avg_key("blocks", 3.0), "turnovers_avg": avg_key("turnovers", 13.0), "fg_pct": avg_key("fg_pct", 0.45), "three_pt_pct": avg_key("three_pt_pct", 0.34), "ft_pct": avg_key("ft_pct", 0.75), "q1_avg": avg_key("q1_score", 20.0), "q4_avg": avg_key("q4_score", 21.0), "conc_rebounds_avg": avg_key("opp_rebounds", 35.0), "conc_assists_avg": avg_key("opp_assists", 18.0), "conc_turnovers_avg": avg_key("opp_turnovers", 13.0), "conc_fg_pct": avg_key("opp_fg_pct", 0.45), "conc_three_pt_pct": avg_key("opp_three_pt_pct", 0.34), } def summarize_h2h( history: List[Dict[str, Any]], current_home_id: str, total_line: float, spread_home_line: float, ) -> Dict[str, float]: if not history: return { "h2h_total_matches": 0.0, "h2h_home_win_rate": 0.5, "h2h_avg_points": 160.0, "h2h_avg_margin": 0.0, "h2h_over_total_rate": 0.5, "h2h_home_cover_rate": 0.5, } recent = history[-10:] home_wins = 0 total_points = 0.0 total_margin = 0.0 over_hits = 0 cover_hits = 0 for item in recent: if item["home_team_id"] == current_home_id: home_score = safe_float(item["score_home"]) away_score = safe_float(item["score_away"]) else: home_score = safe_float(item["score_away"]) away_score = safe_float(item["score_home"]) if home_score > away_score: home_wins += 1 margin = home_score - away_score total_margin += margin total_points += home_score + away_score if total_line > 0 and (home_score + away_score) > total_line: over_hits += 1 if (home_score + spread_home_line) > away_score: cover_hits += 1 size = float(len(recent)) return { "h2h_total_matches": size, "h2h_home_win_rate": home_wins / size, "h2h_avg_points": total_points / size, "h2h_avg_margin": total_margin / size, "h2h_over_total_rate": over_hits / size if total_line > 0 else 0.5, "h2h_home_cover_rate": cover_hits / size, } def summarize_league( history: List[Dict[str, Any]], total_line: float, spread_home_line: float, ) -> Dict[str, float]: if not history: return { "league_avg_points": 160.0, "league_home_win_rate": 0.56, "league_over_total_rate": 0.5, "league_home_cover_rate": 0.5, } recent = history[-200:] total_points = 0.0 home_wins = 0 over_hits = 0 cover_hits = 0 for item in recent: score_home = safe_float(item["score_home"]) score_away = safe_float(item["score_away"]) total_points += score_home + score_away if score_home > score_away: home_wins += 1 if total_line > 0 and (score_home + score_away) > total_line: over_hits += 1 if (score_home + spread_home_line) > score_away: cover_hits += 1 size = float(len(recent)) return { "league_avg_points": total_points / size, "league_home_win_rate": home_wins / size, "league_over_total_rate": over_hits / size if total_line > 0 else 0.5, "league_home_cover_rate": cover_hits / size, } def normalize_text(value: Any) -> str: return ( str(value or "") .strip() .lower() .replace("ı", "i") .replace("ç", "c") .replace("ş", "s") .replace("ğ", "g") .replace("ö", "o") .replace("ü", "u") ) def extract_parenthesized_number(category_name: str) -> float | None: left = category_name.find("(") right = category_name.find(")", left + 1) if left < 0 or right < 0: return None payload = category_name[left + 1 : right].replace(",", ".") if ":" in payload: return None try: return float(payload) except ValueError: return None def parse_handicap_home_line(category_name: str) -> float | None: left = category_name.find("(") right = category_name.find(")", left + 1) if left < 0 or right < 0: return None payload = category_name[left + 1 : right].replace(",", ".") if ":" not in payload: return None home_raw, away_raw = payload.split(":", 1) try: home_line = float(home_raw) away_line = float(away_raw) except ValueError: return None if abs(home_line) < 1e-9 and away_line > 0: return -away_line if home_line > 0 and abs(away_line) < 1e-9: return home_line if abs(home_line - away_line) < 1e-9 and home_line > 0: return 0.0 return home_line def parse_odds(categories: List[Dict[str, Any]], selections: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]: match_odds: Dict[str, Dict[str, float]] = defaultdict(dict) category_map = { row["category_id"]: (str(row["match_id"]), str(row["category_name"])) for row in categories } for row in selections: category_id = row["odd_category_db_id"] if category_id not in category_map: continue match_id, category_name = category_map[category_id] category_norm = normalize_text(category_name) selection_norm = normalize_text(row["name"]) odd_value = safe_float(row["odd_value"], 0.0) if odd_value <= 1.0: continue target = match_odds[match_id] if category_norm in ("mac sonucu", "mac sonucu (uzt. dahil)"): if selection_norm == "1": target["ml_h"] = odd_value elif selection_norm == "2": target["ml_a"] = odd_value if ("alt/ust" in category_norm or "alt/üst" in str(category_name).lower()) and not any( token in category_norm for token in ("1. yari", "1. yarı", "periyot", "ev sahibi", "deplasman") ): total_line = extract_parenthesized_number(category_name) if total_line is not None: target.setdefault("tot_line", total_line) if any(token in selection_norm for token in ("ust", "over")): target.setdefault("tot_o", odd_value) elif any(token in selection_norm for token in ("alt", "under")): target.setdefault("tot_u", odd_value) if "hnd. ms" in category_norm or "hand. ms" in category_norm or "hnd ms" in category_norm: home_line = parse_handicap_home_line(category_name) if home_line is not None: target.setdefault("spread_home_line", home_line) if selection_norm == "1": target.setdefault("spread_h", odd_value) elif selection_norm == "2": target.setdefault("spread_a", odd_value) return match_odds class ExtractionContext: def __init__(self, conn, league_ids: List[str]): self.conn = conn self.cur = conn.cursor(cursor_factory=RealDictCursor) self.league_ids = league_ids self.matches: List[Dict[str, Any]] = [] self.team_stats: Dict[Tuple[str, str], Dict[str, Any]] = {} self.ai_features: Dict[str, Dict[str, Any]] = {} self.odds_cache: Dict[str, Dict[str, float]] = {} def load(self) -> None: self._load_matches() self._load_team_stats() self._load_ai_features() self._load_odds() def _load_matches(self) -> None: query = """ SELECT id, league_id, home_team_id, away_team_id, mst_utc, score_home, score_away FROM matches WHERE sport = 'basketball' AND status = 'FT' AND score_home IS NOT NULL AND score_away IS NOT NULL AND mst_utc >= 1640995200000 """ params: Tuple[Any, ...] = () if self.league_ids: placeholders = ",".join(["%s"] * len(self.league_ids)) query += f" AND league_id IN ({placeholders})" params = tuple(self.league_ids) query += " ORDER BY mst_utc ASC" self.cur.execute(query, params) self.matches = self.cur.fetchall() def _load_team_stats(self) -> None: self.cur.execute( """ SELECT match_id, team_id, points, rebounds, assists, steals, blocks, turnovers, fg_made, fg_attempted, three_pt_made, three_pt_attempted, ft_made, ft_attempted, q1_score, q4_score FROM basketball_team_stats """ ) for row in self.cur.fetchall(): key = (str(row["match_id"]), str(row["team_id"])) self.team_stats[key] = row def _load_ai_features(self) -> None: self.cur.execute("SELECT * FROM basketball_ai_features") for row in self.cur.fetchall(): self.ai_features[str(row["match_id"])] = row def _load_odds(self) -> None: self.cur.execute( """ SELECT db_id AS category_id, match_id, name AS category_name FROM odd_categories WHERE match_id IN ( SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT' ) """ ) categories = self.cur.fetchall() category_ids = [row["category_id"] for row in categories] if not category_ids: return selections: List[Dict[str, Any]] = [] chunk_size = 50000 for idx in range(0, len(category_ids), chunk_size): chunk = tuple(category_ids[idx : idx + chunk_size]) self.cur.execute( """ SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %s """, (chunk,), ) selections.extend(self.cur.fetchall()) self.odds_cache = parse_odds(categories, selections) def build_match_feature_row( match: Dict[str, Any], ctx: ExtractionContext, team_history: Dict[str, List[Dict[str, Any]]], pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]], league_history: Dict[str, List[Dict[str, Any]]], ) -> Dict[str, Any] | None: match_id = str(match["id"]) home_id = str(match["home_team_id"]) away_id = str(match["away_team_id"]) league_id = str(match["league_id"] or "") mst_utc = int(match["mst_utc"]) odds = ctx.odds_cache.get(match_id, {}) if safe_float(odds.get("ml_h"), 0.0) <= 1.0 or safe_float(odds.get("ml_a"), 0.0) <= 1.0: return None ai_row = ctx.ai_features.get(match_id, {}) home_recent = summarize_team_history(team_history[home_id], mst_utc) away_recent = summarize_team_history(team_history[away_id], mst_utc) total_line = safe_float(odds.get("tot_line"), 160.0) spread_home_line = safe_float(odds.get("spread_home_line"), 0.0) pair_key = tuple(sorted((home_id, away_id))) h2h = summarize_h2h(pair_history[pair_key], home_id, total_line, spread_home_line) league = summarize_league(league_history[league_id], total_line, spread_home_line) ml_h = safe_float(odds.get("ml_h"), 1.90) ml_a = safe_float(odds.get("ml_a"), 1.90) tot_o = safe_float(odds.get("tot_o"), 1.90) tot_u = safe_float(odds.get("tot_u"), 1.90) spr_h = safe_float(odds.get("spread_h"), 1.90) spr_a = safe_float(odds.get("spread_a"), 1.90) raw_home = 1.0 / ml_h raw_away = 1.0 / ml_a raw_total = raw_home + raw_away implied_home = (raw_home / raw_total) if raw_total > 0 else 0.5 implied_away = (raw_away / raw_total) if raw_total > 0 else 0.5 raw_over = 1.0 / tot_o if tot_o > 1.0 else 0.0 raw_under = 1.0 / tot_u if tot_u > 1.0 else 0.0 raw_total_ou = raw_over + raw_under implied_total_over = (raw_over / raw_total_ou) if raw_total_ou > 0 else 0.5 implied_total_under = (raw_under / raw_total_ou) if raw_total_ou > 0 else 0.5 raw_home_cover = 1.0 / spr_h if spr_h > 1.0 else 0.0 raw_away_cover = 1.0 / spr_a if spr_a > 1.0 else 0.0 raw_total_spread = raw_home_cover + raw_away_cover implied_spread_home = (raw_home_cover / raw_total_spread) if raw_total_spread > 0 else 0.5 implied_spread_away = (raw_away_cover / raw_total_spread) if raw_total_spread > 0 else 0.5 projected_total_form = ( home_recent["points_avg"] + away_recent["points_avg"] + home_recent["conceded_avg"] + away_recent["conceded_avg"] ) / 2.0 projected_margin_form = home_recent["net_rating"] - away_recent["net_rating"] features = { "home_overall_elo": safe_float(ai_row.get("home_elo"), 1500.0), "away_overall_elo": safe_float(ai_row.get("away_elo"), 1500.0), "elo_diff": safe_float(ai_row.get("elo_diff"), 0.0), "home_home_elo": safe_float(ai_row.get("home_home_elo"), safe_float(ai_row.get("home_elo"), 1500.0)), "away_away_elo": safe_float(ai_row.get("away_away_elo"), safe_float(ai_row.get("away_elo"), 1500.0)), "home_form_elo": safe_float(ai_row.get("home_form_elo"), safe_float(ai_row.get("home_elo"), 1500.0)), "away_form_elo": safe_float(ai_row.get("away_form_elo"), safe_float(ai_row.get("away_elo"), 1500.0)), "home_form_score": safe_float(ai_row.get("home_form_score"), home_recent["win_rate"] * 100.0), "away_form_score": safe_float(ai_row.get("away_form_score"), away_recent["win_rate"] * 100.0), "form_score_diff": safe_float(ai_row.get("home_form_score"), home_recent["win_rate"] * 100.0) - safe_float(ai_row.get("away_form_score"), away_recent["win_rate"] * 100.0), "home_points_avg": safe_float(ai_row.get("home_pts_avg_5"), home_recent["points_avg"]), "away_points_avg": safe_float(ai_row.get("away_pts_avg_5"), away_recent["points_avg"]), "points_avg_diff": safe_float(ai_row.get("home_pts_avg_5"), home_recent["points_avg"]) - safe_float(ai_row.get("away_pts_avg_5"), away_recent["points_avg"]), "home_conceded_avg": safe_float(ai_row.get("home_conceded_avg_5"), home_recent["conceded_avg"]), "away_conceded_avg": safe_float(ai_row.get("away_conceded_avg_5"), away_recent["conceded_avg"]), "conceded_avg_diff": safe_float(ai_row.get("home_conceded_avg_5"), home_recent["conceded_avg"]) - safe_float(ai_row.get("away_conceded_avg_5"), away_recent["conceded_avg"]), "home_net_rating": home_recent["net_rating"], "away_net_rating": away_recent["net_rating"], "net_rating_diff": home_recent["net_rating"] - away_recent["net_rating"], "home_win_rate": home_recent["win_rate"], "away_win_rate": away_recent["win_rate"], "win_rate_diff": home_recent["win_rate"] - away_recent["win_rate"], "home_winning_streak": safe_float(ai_row.get("home_win_streak"), home_recent["winning_streak"]), "away_winning_streak": safe_float(ai_row.get("away_win_streak"), away_recent["winning_streak"]), "streak_diff": safe_float(ai_row.get("home_win_streak"), home_recent["winning_streak"]) - safe_float(ai_row.get("away_win_streak"), away_recent["winning_streak"]), "home_rest_days": home_recent["rest_days"], "away_rest_days": away_recent["rest_days"], "rest_diff": home_recent["rest_days"] - away_recent["rest_days"], "home_rebounds_avg": safe_float(ai_row.get("home_avg_rebounds"), home_recent["rebounds_avg"]), "away_rebounds_avg": safe_float(ai_row.get("away_avg_rebounds"), away_recent["rebounds_avg"]), "rebounds_diff": safe_float(ai_row.get("home_avg_rebounds"), home_recent["rebounds_avg"]) - safe_float(ai_row.get("away_avg_rebounds"), away_recent["rebounds_avg"]), "home_assists_avg": home_recent["assists_avg"], "away_assists_avg": away_recent["assists_avg"], "assists_diff": home_recent["assists_avg"] - away_recent["assists_avg"], "home_steals_avg": home_recent["steals_avg"], "away_steals_avg": away_recent["steals_avg"], "steals_diff": home_recent["steals_avg"] - away_recent["steals_avg"], "home_blocks_avg": home_recent["blocks_avg"], "away_blocks_avg": away_recent["blocks_avg"], "blocks_diff": home_recent["blocks_avg"] - away_recent["blocks_avg"], "home_turnovers_avg": safe_float(ai_row.get("home_avg_turnovers"), home_recent["turnovers_avg"]), "away_turnovers_avg": safe_float(ai_row.get("away_avg_turnovers"), away_recent["turnovers_avg"]), "turnovers_diff": safe_float(ai_row.get("home_avg_turnovers"), home_recent["turnovers_avg"]) - safe_float(ai_row.get("away_avg_turnovers"), away_recent["turnovers_avg"]), "home_fg_pct": safe_float(ai_row.get("home_fg_pct"), home_recent["fg_pct"]), "away_fg_pct": safe_float(ai_row.get("away_fg_pct"), away_recent["fg_pct"]), "fg_pct_diff": safe_float(ai_row.get("home_fg_pct"), home_recent["fg_pct"]) - safe_float(ai_row.get("away_fg_pct"), away_recent["fg_pct"]), "home_three_pt_pct": pct( safe_float(ai_row.get("home_avg_three_pt_made"), home_recent["three_pt_pct"] * 25.0), 25.0, home_recent["three_pt_pct"], ), "away_three_pt_pct": pct( safe_float(ai_row.get("away_avg_three_pt_made"), away_recent["three_pt_pct"] * 25.0), 25.0, away_recent["three_pt_pct"], ), "three_pt_pct_diff": pct( safe_float(ai_row.get("home_avg_three_pt_made"), home_recent["three_pt_pct"] * 25.0), 25.0, home_recent["three_pt_pct"], ) - pct( safe_float(ai_row.get("away_avg_three_pt_made"), away_recent["three_pt_pct"] * 25.0), 25.0, away_recent["three_pt_pct"], ), "home_ft_pct": home_recent["ft_pct"], "away_ft_pct": away_recent["ft_pct"], "ft_pct_diff": home_recent["ft_pct"] - away_recent["ft_pct"], "home_q1_avg": home_recent["q1_avg"], "away_q1_avg": away_recent["q1_avg"], "home_q4_avg": home_recent["q4_avg"], "away_q4_avg": away_recent["q4_avg"], "home_conc_rebounds_avg": home_recent["conc_rebounds_avg"], "away_conc_rebounds_avg": away_recent["conc_rebounds_avg"], "home_conc_assists_avg": home_recent["conc_assists_avg"], "away_conc_assists_avg": away_recent["conc_assists_avg"], "home_conc_turnovers_avg": home_recent["conc_turnovers_avg"], "away_conc_turnovers_avg": away_recent["conc_turnovers_avg"], "home_conc_fg_pct": home_recent["conc_fg_pct"], "away_conc_fg_pct": away_recent["conc_fg_pct"], "home_conc_three_pt_pct": home_recent["conc_three_pt_pct"], "away_conc_three_pt_pct": away_recent["conc_three_pt_pct"], **h2h, **league, "ml_home_odds": ml_h, "ml_away_odds": ml_a, "implied_home": safe_float(ai_row.get("implied_home"), implied_home), "implied_away": safe_float(ai_row.get("implied_away"), implied_away), "total_line": total_line, "total_over_odds": tot_o, "total_under_odds": tot_u, "implied_total_over": safe_float(ai_row.get("implied_over_total"), implied_total_over), "implied_total_under": implied_total_under, "spread_home_line": spread_home_line, "spread_home_odds": spr_h, "spread_away_odds": spr_a, "implied_spread_home": safe_float(ai_row.get("implied_spread_home"), implied_spread_home), "implied_spread_away": implied_spread_away, "odds_overround": safe_float(ai_row.get("odds_overround"), raw_total - 1.0), "home_sidelined_count": 0.0, "away_sidelined_count": 0.0, "sidelined_diff": 0.0, "missing_players_impact": safe_float(ai_row.get("missing_players_impact"), 0.0), "total_points_form": projected_total_form, "total_points_allowed_form": home_recent["conceded_avg"] + away_recent["conceded_avg"], "projected_total_delta_vs_line": projected_total_form - total_line, "projected_margin_vs_spread": projected_margin_form + spread_home_line, } score_home = int(match["score_home"]) score_away = int(match["score_away"]) total_points = score_home + score_away return { "match_id": match_id, "home_team_id": home_id, "away_team_id": away_id, "league_id": league_id, "mst_utc": mst_utc, **{feature: safe_float(features.get(feature), 0.0) for feature in DEFAULT_FEATURE_COLS}, "score_home": score_home, "score_away": score_away, "total_points": total_points, "label_ml": 0 if score_home > score_away else 1, "label_total": 1 if total_points > total_line else 0, "label_spread": 1 if (score_home + spread_home_line) > score_away else 0, } def update_histories( match: Dict[str, Any], ctx: ExtractionContext, team_history: Dict[str, List[Dict[str, Any]]], pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]], league_history: Dict[str, List[Dict[str, Any]]], ) -> None: match_id = str(match["id"]) home_id = str(match["home_team_id"]) away_id = str(match["away_team_id"]) league_id = str(match["league_id"] or "") score_home = int(match["score_home"]) score_away = int(match["score_away"]) home_stats = ctx.team_stats.get((match_id, home_id), {}) away_stats = ctx.team_stats.get((match_id, away_id), {}) home_record = { "mst_utc": int(match["mst_utc"]), "scored": score_home, "conceded": score_away, "rebounds": safe_float(home_stats.get("rebounds"), 35.0), "assists": safe_float(home_stats.get("assists"), 18.0), "steals": safe_float(home_stats.get("steals"), 6.5), "blocks": safe_float(home_stats.get("blocks"), 3.0), "turnovers": safe_float(home_stats.get("turnovers"), 13.0), "fg_pct": pct(safe_float(home_stats.get("fg_made")), safe_float(home_stats.get("fg_attempted")), 0.45), "three_pt_pct": pct( safe_float(home_stats.get("three_pt_made")), safe_float(home_stats.get("three_pt_attempted")), 0.34, ), "ft_pct": pct(safe_float(home_stats.get("ft_made")), safe_float(home_stats.get("ft_attempted")), 0.75), "q1_score": safe_float(home_stats.get("q1_score"), 20.0), "q4_score": safe_float(home_stats.get("q4_score"), 21.0), "opp_rebounds": safe_float(away_stats.get("rebounds"), 35.0), "opp_assists": safe_float(away_stats.get("assists"), 18.0), "opp_turnovers": safe_float(away_stats.get("turnovers"), 13.0), "opp_fg_pct": pct(safe_float(away_stats.get("fg_made")), safe_float(away_stats.get("fg_attempted")), 0.45), "opp_three_pt_pct": pct( safe_float(away_stats.get("three_pt_made")), safe_float(away_stats.get("three_pt_attempted")), 0.34, ), } away_record = { "mst_utc": int(match["mst_utc"]), "scored": score_away, "conceded": score_home, "rebounds": safe_float(away_stats.get("rebounds"), 35.0), "assists": safe_float(away_stats.get("assists"), 18.0), "steals": safe_float(away_stats.get("steals"), 6.5), "blocks": safe_float(away_stats.get("blocks"), 3.0), "turnovers": safe_float(away_stats.get("turnovers"), 13.0), "fg_pct": pct(safe_float(away_stats.get("fg_made")), safe_float(away_stats.get("fg_attempted")), 0.45), "three_pt_pct": pct( safe_float(away_stats.get("three_pt_made")), safe_float(away_stats.get("three_pt_attempted")), 0.34, ), "ft_pct": pct(safe_float(away_stats.get("ft_made")), safe_float(away_stats.get("ft_attempted")), 0.75), "q1_score": safe_float(away_stats.get("q1_score"), 20.0), "q4_score": safe_float(away_stats.get("q4_score"), 21.0), "opp_rebounds": safe_float(home_stats.get("rebounds"), 35.0), "opp_assists": safe_float(home_stats.get("assists"), 18.0), "opp_turnovers": safe_float(home_stats.get("turnovers"), 13.0), "opp_fg_pct": pct(safe_float(home_stats.get("fg_made")), safe_float(home_stats.get("fg_attempted")), 0.45), "opp_three_pt_pct": pct( safe_float(home_stats.get("three_pt_made")), safe_float(home_stats.get("three_pt_attempted")), 0.34, ), } team_history[home_id].append(home_record) team_history[away_id].append(away_record) pair_history[tuple(sorted((home_id, away_id)))].append( { "home_team_id": home_id, "away_team_id": away_id, "score_home": score_home, "score_away": score_away, } ) league_history[league_id].append( { "score_home": score_home, "score_away": score_away, } ) def main() -> None: started_at = time.time() if not os.path.exists(TOP_LEAGUES_PATH): raise FileNotFoundError(TOP_LEAGUES_PATH) with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as handle: league_ids = json.load(handle) os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True) conn = get_conn() ctx = ExtractionContext(conn, league_ids) ctx.load() team_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list) pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]] = defaultdict(list) league_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list) extracted = 0 skipped = 0 with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as handle: writer = csv.DictWriter(handle, fieldnames=CSV_COLS) writer.writeheader() for idx, match in enumerate(ctx.matches, start=1): row = build_match_feature_row(match, ctx, team_history, pair_history, league_history) if row is None: skipped += 1 else: writer.writerow(row) extracted += 1 update_histories(match, ctx, team_history, pair_history, league_history) if idx % 2000 == 0: print( f"[INFO] processed={idx} extracted={extracted} skipped={skipped}", flush=True, ) conn.close() print("[OK] Basketball V25 extraction complete", flush=True) print(f"[INFO] matches={len(ctx.matches)} extracted={extracted} skipped={skipped}", flush=True) print(f"[INFO] output={OUTPUT_CSV}", flush=True) print(f"[INFO] duration_sec={time.time() - started_at:.1f}", flush=True) if __name__ == "__main__": main()