Files
iddaai-be/ai-engine/scripts/extract_basketball_v25_data.py
T
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

766 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Extract basketball V25-style training data.
Scope:
- top leagues from basketball_top_leagues.json
- finished basketball matches
- pre-match features only
- labels for moneyline / total / spread markets
"""
from __future__ import annotations
import csv
import json
import os
import sys
import time
from collections import defaultdict
from typing import Any, Dict, List, Tuple
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv
load_dotenv()
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)
from models.basketball_v25_features import DEFAULT_FEATURE_COLS
# JSON file whose content main() loads as the league ids to extract; resolved
# one directory above AI_ENGINE_DIR.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json")
# Destination of the generated training CSV.
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "basketball_training_data_v25.csv")
# Columns that identify a row; written first in every CSV line.
IDENTIFIER_COLS = ["match_id", "home_team_id", "away_team_id", "league_id", "mst_utc"]
# Outcome columns written last: final scores, their sum, and one label each
# for the moneyline, total and spread markets.
LABEL_COLS = [
"score_home",
"score_away",
"total_points",
"label_ml",
"label_total",
"label_spread",
]
# Full CSV header in write order: identifiers, model features, labels.
CSV_COLS = IDENTIFIER_COLS + DEFAULT_FEATURE_COLS + LABEL_COLS
def get_conn():
    """Open a psycopg2 connection from the ``DATABASE_URL`` environment variable.

    Everything from the first ``?schema=`` onward is cut off before the URL is
    passed to ``psycopg2.connect``.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is unset or empty after trimming.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    dsn, _, _ = raw_url.partition("?schema=")
    if dsn == "":
        raise RuntimeError("DATABASE_URL is required")
    return psycopg2.connect(dsn)
def safe_float(value: Any, default: float = 0.0) -> float:
    """Convert ``value`` with ``float()``, returning ``default`` on failure.

    ``None`` and anything ``float()`` rejects with ``TypeError`` or
    ``ValueError`` produce ``default`` (returned as given, not coerced).
    """
    if value is None:
        return default
    try:
        converted = float(value)
    except (TypeError, ValueError):
        return default
    return converted
def pct(num: float, den: float, default: float = 0.0) -> float:
    """Return ``num / den`` as a float, or ``default`` when ``den`` is zero or negative."""
    return default if den <= 0 else float(num) / float(den)
def default_recent_stats() -> Dict[str, float]:
    """Return the fallback form statistics used for a team with no recorded history.

    A new dict is built on every call, so callers may mutate the result.
    """
    return dict(
        points_avg=82.0,
        conceded_avg=80.0,
        net_rating=2.0,
        win_rate=0.5,
        winning_streak=0.0,
        rest_days=3.0,
        rebounds_avg=35.0,
        assists_avg=18.0,
        steals_avg=6.5,
        blocks_avg=3.0,
        turnovers_avg=13.0,
        fg_pct=0.45,
        three_pt_pct=0.34,
        ft_pct=0.75,
        q1_avg=20.0,
        q4_avg=21.0,
        conc_rebounds_avg=35.0,
        conc_assists_avg=18.0,
        conc_turnovers_avg=13.0,
        conc_fg_pct=0.45,
        conc_three_pt_pct=0.34,
    )
def summarize_team_history(history: List[Dict[str, Any]], match_date_ms: int) -> Dict[str, float]:
    """Condense a team's earlier game records into pre-match form statistics.

    Args:
        history: The team's previous game records; the last entry is treated
            as the most recent game. Each record must carry ``scored`` and
            ``conceded``; the box-score fields are optional.
        match_date_ms: Timestamp of the upcoming match in milliseconds, used
            only to derive rest days.

    Returns:
        A dict with the same keys as ``default_recent_stats()``. Averages use
        the last 8 games, win rate and winning streak use the last 12. With no
        history the defaults are returned unchanged.
    """
    if not history:
        return default_recent_stats()

    stat_window = history[-8:]
    form_window = history[-12:]

    def won(game: Dict[str, Any]) -> bool:
        return safe_float(game["scored"]) > safe_float(game["conceded"])

    points_for = [safe_float(game["scored"]) for game in stat_window]
    points_against = [safe_float(game["conceded"]) for game in stat_window]
    win_count = sum(1 for game in form_window if won(game))

    # Count consecutive wins walking back from the most recent game.
    streak = 0
    for game in reversed(form_window):
        if not won(game):
            break
        streak += 1

    previous_ms = safe_float(history[-1].get("mst_utc"), 0.0)
    if previous_ms:
        # 86_400_000 ms per day; never report negative rest.
        rest_days = max(0.0, (float(match_date_ms) - previous_ms) / 86_400_000.0)
    else:
        rest_days = 3.0

    def window_mean(field: str, fallback: float) -> float:
        samples = [safe_float(game.get(field), fallback) for game in stat_window]
        return sum(samples) / max(len(samples), 1)

    points_avg = sum(points_for) / max(len(points_for), 1)
    conceded_avg = sum(points_against) / max(len(points_against), 1)

    summary: Dict[str, float] = {
        "points_avg": points_avg,
        "conceded_avg": conceded_avg,
        "net_rating": points_avg - conceded_avg,
        "win_rate": win_count / max(len(form_window), 1),
        "winning_streak": float(streak),
        "rest_days": rest_days,
    }
    # (output key, record field, fallback for a missing field)
    box_score_fields = (
        ("rebounds_avg", "rebounds", 35.0),
        ("assists_avg", "assists", 18.0),
        ("steals_avg", "steals", 6.5),
        ("blocks_avg", "blocks", 3.0),
        ("turnovers_avg", "turnovers", 13.0),
        ("fg_pct", "fg_pct", 0.45),
        ("three_pt_pct", "three_pt_pct", 0.34),
        ("ft_pct", "ft_pct", 0.75),
        ("q1_avg", "q1_score", 20.0),
        ("q4_avg", "q4_score", 21.0),
        ("conc_rebounds_avg", "opp_rebounds", 35.0),
        ("conc_assists_avg", "opp_assists", 18.0),
        ("conc_turnovers_avg", "opp_turnovers", 13.0),
        ("conc_fg_pct", "opp_fg_pct", 0.45),
        ("conc_three_pt_pct", "opp_three_pt_pct", 0.34),
    )
    for out_key, field, fallback in box_score_fields:
        summary[out_key] = window_mean(field, fallback)
    return summary
def summarize_h2h(
    history: List[Dict[str, Any]],
    current_home_id: str,
    total_line: float,
    spread_home_line: float,
) -> Dict[str, float]:
    """Summarise earlier meetings of two teams from the current home team's side.

    Args:
        history: Earlier meetings between the pair; each record has
            ``home_team_id``, ``score_home`` and ``score_away``. Only the last
            ten entries are used.
        current_home_id: Id of the team that is at home in the upcoming match.
            Scores of meetings where it played away are swapped so that
            "home" always refers to this team.
        total_line: Total-points line of the upcoming match; a value ``<= 0``
            disables the over-rate (reported as 0.5).
        spread_home_line: Handicap added to the home score when checking cover.

    Returns:
        A dict with the six ``h2h_*`` features; neutral defaults when there is
        no history.
    """
    if not history:
        return {
            "h2h_total_matches": 0.0,
            "h2h_home_win_rate": 0.5,
            "h2h_avg_points": 160.0,
            "h2h_avg_margin": 0.0,
            "h2h_over_total_rate": 0.5,
            "h2h_home_cover_rate": 0.5,
        }

    window = history[-10:]
    line_known = total_line > 0
    wins = overs = covers = 0
    points_sum = 0.0
    margin_sum = 0.0

    for game in window:
        # Orient the scores towards the team that is home in the upcoming match.
        if game["home_team_id"] == current_home_id:
            own_key, other_key = "score_home", "score_away"
        else:
            own_key, other_key = "score_away", "score_home"
        home_pts = safe_float(game[own_key])
        away_pts = safe_float(game[other_key])

        if home_pts > away_pts:
            wins += 1
        margin_sum += home_pts - away_pts
        points_sum += home_pts + away_pts
        if line_known and (home_pts + away_pts) > total_line:
            overs += 1
        if (home_pts + spread_home_line) > away_pts:
            covers += 1

    games = float(len(window))
    over_rate = overs / games if line_known else 0.5
    return {
        "h2h_total_matches": games,
        "h2h_home_win_rate": wins / games,
        "h2h_avg_points": points_sum / games,
        "h2h_avg_margin": margin_sum / games,
        "h2h_over_total_rate": over_rate,
        "h2h_home_cover_rate": covers / games,
    }
def summarize_league(
    history: List[Dict[str, Any]],
    total_line: float,
    spread_home_line: float,
) -> Dict[str, float]:
    """Summarise a league's earlier results relative to the upcoming match's lines.

    Args:
        history: Earlier results in the league, each with ``score_home`` and
            ``score_away``. Only the last 200 entries are used.
        total_line: Total-points line of the upcoming match; a value ``<= 0``
            disables the over-rate (reported as 0.5).
        spread_home_line: Handicap added to the home score when checking cover.

    Returns:
        A dict with the four ``league_*`` features; fixed defaults when there
        is no history.
    """
    if not history:
        return {
            "league_avg_points": 160.0,
            "league_home_win_rate": 0.56,
            "league_over_total_rate": 0.5,
            "league_home_cover_rate": 0.5,
        }

    window = history[-200:]
    line_known = total_line > 0
    points_sum = 0.0
    home_wins = overs = covers = 0

    for game in window:
        home_pts = safe_float(game["score_home"])
        away_pts = safe_float(game["score_away"])
        points_sum += home_pts + away_pts
        if home_pts > away_pts:
            home_wins += 1
        if line_known and (home_pts + away_pts) > total_line:
            overs += 1
        if (home_pts + spread_home_line) > away_pts:
            covers += 1

    games = float(len(window))
    over_rate = overs / games if line_known else 0.5
    return {
        "league_avg_points": points_sum / games,
        "league_home_win_rate": home_wins / games,
        "league_over_total_rate": over_rate,
        "league_home_cover_rate": covers / games,
    }
# Turkish letters folded to their ASCII look-alikes. U+0307 (combining dot
# above) is deleted because "İ".lower() yields "i" followed by U+0307, which
# would otherwise keep upper-case input from matching plain ASCII tokens.
_TURKISH_ASCII_FOLD = str.maketrans(
    {
        "ı": "i",
        "ç": "c",
        "ş": "s",
        "ğ": "g",
        "ö": "o",
        "ü": "u",
        "\u0307": None,
    }
)


def normalize_text(value: Any) -> str:
    """Return ``value`` as trimmed, lower-cased text with Turkish letters folded to ASCII.

    Falsy values (``None``, ``""``, ``0``) become the empty string. The result
    is meant for comparison against ASCII tokens such as ``"mac sonucu"``.
    """
    return str(value or "").strip().lower().translate(_TURKISH_ASCII_FOLD)
def extract_parenthesized_number(category_name: str) -> float | None:
    """Parse the number inside the first ``(...)`` of a category name.

    A decimal comma is accepted (``"(160,5)"`` -> ``160.5``). Returns ``None``
    when there is no closed parenthesis pair, when the content contains ``:``
    (a handicap pair rather than a single line), or when it is not a number.
    """
    _, opened, after_open = category_name.partition("(")
    if not opened:
        return None
    inner, closed, _ = after_open.partition(")")
    if not closed:
        return None

    candidate = inner.replace(",", ".")
    if ":" in candidate:
        return None
    try:
        return float(candidate)
    except ValueError:
        return None
def parse_handicap_home_line(category_name: str) -> float | None:
    """Turn a ``(home:away)`` handicap in a category name into a home-side line.

    The first parenthesised group must look like ``"(0:5,5)"``; decimal commas
    are accepted. Mapping:

    * ``0:x`` with ``x > 0``  -> ``-x``
    * ``x:0`` with ``x > 0``  -> ``x``
    * ``x:x`` with ``x > 0``  -> ``0.0``
    * anything else           -> the home number as written

    Returns ``None`` when there is no closed parenthesis pair, no ``:`` inside
    it, or either side is not a number.
    """
    _, opened, after_open = category_name.partition("(")
    if not opened:
        return None
    inner, closed, _ = after_open.partition(")")
    if not closed:
        return None

    home_text, colon, away_text = inner.replace(",", ".").partition(":")
    if not colon:
        return None
    try:
        home_value = float(home_text)
        away_value = float(away_text)
    except ValueError:
        return None

    tolerance = 1e-9
    home_is_zero = abs(home_value) < tolerance
    away_is_zero = abs(away_value) < tolerance

    if home_is_zero and away_value > 0:
        return -away_value
    if home_value > 0 and not away_is_zero and abs(home_value - away_value) < tolerance:
        return 0.0
    return home_value
def parse_odds(categories: List[Dict[str, Any]], selections: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
    """Collect moneyline, total and handicap odds per match from raw odds rows.

    Args:
        categories: Rows with ``category_id``, ``match_id`` and ``category_name``.
        selections: Rows with ``odd_category_db_id``, ``name`` and ``odd_value``.
            Selections whose category is unknown or whose odd is not greater
            than 1.0 are ignored.

    Returns:
        A mapping ``match_id -> odds`` where ``odds`` may contain ``ml_h``,
        ``ml_a``, ``tot_line``, ``tot_o``, ``tot_u``, ``spread_home_line``,
        ``spread_h`` and ``spread_a``.

    Behaviour notes:
        * Moneyline: the last matching selection wins.
        * Total / handicap: the first line seen for a match is kept, and odds
          are accepted only from categories quoting that same line. Without
          this check a match with several lines could end up with a line from
          one category and odds from another.
        * NOTE(review): a total/handicap category whose name carries no
          parsable line contributes nothing - confirm this matches the
          intended behaviour of the original.
    """
    match_odds: Dict[str, Dict[str, float]] = defaultdict(dict)
    category_map = {
        row["category_id"]: (str(row["match_id"]), str(row["category_name"]))
        for row in categories
    }

    moneyline_names = ("mac sonucu", "mac sonucu (uzt. dahil)")
    # category_norm is already ASCII-folded, so only folded tokens are needed.
    partial_total_tokens = ("1. yari", "periyot", "ev sahibi", "deplasman")
    handicap_tokens = ("hnd. ms", "hand. ms", "hnd ms")

    def same_line(stored: float, candidate: float) -> bool:
        return abs(stored - candidate) < 1e-9

    for row in selections:
        category = category_map.get(row["odd_category_db_id"])
        if category is None:
            continue
        match_id, category_name = category
        category_norm = normalize_text(category_name)
        selection_norm = normalize_text(row["name"])

        odd_value = safe_float(row["odd_value"], 0.0)
        # Written as "not >" so that NaN is rejected along with odds <= 1.0.
        if not odd_value > 1.0:
            continue

        target = match_odds[match_id]

        if category_norm in moneyline_names:
            if selection_norm == "1":
                target["ml_h"] = odd_value
            elif selection_norm == "2":
                target["ml_a"] = odd_value

        is_full_game_total = "alt/ust" in category_norm and not any(
            token in category_norm for token in partial_total_tokens
        )
        if is_full_game_total:
            total_line = extract_parenthesized_number(category_name)
            if total_line is not None and same_line(target.setdefault("tot_line", total_line), total_line):
                if any(token in selection_norm for token in ("ust", "over")):
                    target.setdefault("tot_o", odd_value)
                elif any(token in selection_norm for token in ("alt", "under")):
                    target.setdefault("tot_u", odd_value)

        if any(token in category_norm for token in handicap_tokens):
            home_line = parse_handicap_home_line(category_name)
            if home_line is not None and same_line(target.setdefault("spread_home_line", home_line), home_line):
                if selection_norm == "1":
                    target.setdefault("spread_h", odd_value)
                elif selection_norm == "2":
                    target.setdefault("spread_a", odd_value)

    return match_odds
class ExtractionContext:
    """In-memory snapshot of the database rows the extractor works from.

    ``load()`` fills four caches: the ordered list of finished basketball
    matches, per-(match, team) box-score rows, per-match AI feature rows and
    the parsed odds per match.
    """

    def __init__(self, conn, league_ids: List[str]):
        """Keep the connection, open a dict-returning cursor and start with empty caches."""
        self.conn = conn
        self.cur = conn.cursor(cursor_factory=RealDictCursor)
        self.league_ids = league_ids
        self.matches: List[Dict[str, Any]] = []
        # Keyed by (match_id, team_id), both as strings.
        self.team_stats: Dict[Tuple[str, str], Dict[str, Any]] = {}
        # Keyed by match_id as string.
        self.ai_features: Dict[str, Dict[str, Any]] = {}
        self.odds_cache: Dict[str, Dict[str, float]] = {}

    def load(self) -> None:
        """Populate every cache: matches, team stats, AI features, then odds."""
        for loader in (
            self._load_matches,
            self._load_team_stats,
            self._load_ai_features,
            self._load_odds,
        ):
            loader()

    def _load_matches(self) -> None:
        """Fetch finished, scored basketball matches ordered by ``mst_utc`` ascending.

        When ``league_ids`` is non-empty the result is restricted to those leagues.
        """
        # 1640995200000 is 2022-01-01 00:00 UTC if mst_utc is epoch milliseconds.
        sql = """
            SELECT id, league_id, home_team_id, away_team_id, mst_utc, score_home, score_away
            FROM matches
            WHERE sport = 'basketball'
              AND status = 'FT'
              AND score_home IS NOT NULL
              AND score_away IS NOT NULL
              AND mst_utc >= 1640995200000
        """
        bound: Tuple[Any, ...] = ()
        if self.league_ids:
            bound = tuple(self.league_ids)
            marks = ",".join("%s" for _ in range(len(self.league_ids)))
            sql += f" AND league_id IN ({marks})"
        sql += " ORDER BY mst_utc ASC"
        self.cur.execute(sql, bound)
        self.matches = self.cur.fetchall()

    def _load_team_stats(self) -> None:
        """Cache every ``basketball_team_stats`` row under ``(match_id, team_id)``."""
        self.cur.execute(
            """
            SELECT
                match_id,
                team_id,
                points,
                rebounds,
                assists,
                steals,
                blocks,
                turnovers,
                fg_made,
                fg_attempted,
                three_pt_made,
                three_pt_attempted,
                ft_made,
                ft_attempted,
                q1_score,
                q4_score
            FROM basketball_team_stats
            """
        )
        self.team_stats.update(
            ((str(record["match_id"]), str(record["team_id"])), record)
            for record in self.cur.fetchall()
        )

    def _load_ai_features(self) -> None:
        """Cache every ``basketball_ai_features`` row under its ``match_id``."""
        self.cur.execute("SELECT * FROM basketball_ai_features")
        self.ai_features.update(
            (str(record["match_id"]), record) for record in self.cur.fetchall()
        )

    def _load_odds(self) -> None:
        """Load odds categories and their selections, then parse them into ``odds_cache``.

        Categories are taken for all finished basketball matches. Selections
        are fetched in batches of 50000 category ids. With no categories the
        cache is left untouched.
        """
        self.cur.execute(
            """
            SELECT db_id AS category_id, match_id, name AS category_name
            FROM odd_categories
            WHERE match_id IN (
                SELECT id
                FROM matches
                WHERE sport = 'basketball'
                  AND status = 'FT'
            )
            """
        )
        categories = self.cur.fetchall()
        wanted_ids = [record["category_id"] for record in categories]
        if not wanted_ids:
            return

        batch_size = 50000
        selections: List[Dict[str, Any]] = []
        for start in range(0, len(wanted_ids), batch_size):
            batch = tuple(wanted_ids[start : start + batch_size])
            self.cur.execute(
                """
                SELECT odd_category_db_id, name, odd_value
                FROM odd_selections
                WHERE odd_category_db_id IN %s
                """,
                (batch,),
            )
            selections.extend(self.cur.fetchall())

        self.odds_cache = parse_odds(categories, selections)
def build_match_feature_row(
    match: Dict[str, Any],
    ctx: ExtractionContext,
    team_history: Dict[str, List[Dict[str, Any]]],
    pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]],
    league_history: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, Any] | None:
    """Build one CSV row (identifiers, features, labels) for a finished match.

    Features come from three sources: the histories accumulated so far, the
    cached odds for the match, and the cached AI feature row. Where an AI
    feature column is present and convertible it takes precedence over the
    history-derived value.

    Args:
        match: Match row with ``id``, team/league ids, ``mst_utc`` and scores.
        ctx: Loaded extraction context (``odds_cache`` and ``ai_features`` are read).
        team_history: Earlier game records per team id.
        pair_history: Earlier meetings per sorted ``(team, team)`` pair.
        league_history: Earlier results per league id.

    Returns:
        The row dict, or ``None`` when the match has no moneyline odds above
        1.0 for both sides.
    """
    match_id = str(match["id"])
    home_id = str(match["home_team_id"])
    away_id = str(match["away_team_id"])
    league_id = str(match["league_id"] or "")
    mst_utc = int(match["mst_utc"])

    odds = ctx.odds_cache.get(match_id, {})
    # Both moneyline prices are mandatory; otherwise the match is skipped.
    if any(safe_float(odds.get(side), 0.0) <= 1.0 for side in ("ml_h", "ml_a")):
        return None

    ai_row = ctx.ai_features.get(match_id, {})

    def ai(column: str, fallback: float) -> float:
        """AI feature value, or ``fallback`` when absent/unusable."""
        return safe_float(ai_row.get(column), fallback)

    def price(key: str) -> float:
        """Cached odd for ``key``, defaulting to 1.90."""
        return safe_float(odds.get(key), 1.90)

    def inverse(odd: float) -> float:
        """Raw implied probability of an odd; 0.0 for odds not above 1.0."""
        return 1.0 / odd if odd > 1.0 else 0.0

    def normalise(first: float, second: float) -> Tuple[float, float]:
        """Scale two raw probabilities to sum to 1; 0.5/0.5 when both are zero."""
        book = first + second
        if book > 0:
            return first / book, second / book
        return 0.5, 0.5

    home_recent = summarize_team_history(team_history[home_id], mst_utc)
    away_recent = summarize_team_history(team_history[away_id], mst_utc)

    total_line = safe_float(odds.get("tot_line"), 160.0)
    spread_home_line = safe_float(odds.get("spread_home_line"), 0.0)

    pair_key = tuple(sorted((home_id, away_id)))
    h2h = summarize_h2h(pair_history[pair_key], home_id, total_line, spread_home_line)
    league = summarize_league(league_history[league_id], total_line, spread_home_line)

    ml_h, ml_a = price("ml_h"), price("ml_a")
    tot_o, tot_u = price("tot_o"), price("tot_u")
    spr_h, spr_a = price("spread_h"), price("spread_a")

    raw_home = 1.0 / ml_h
    raw_away = 1.0 / ml_a
    moneyline_book = raw_home + raw_away
    implied_home, implied_away = normalise(raw_home, raw_away)
    implied_total_over, implied_total_under = normalise(inverse(tot_o), inverse(tot_u))
    implied_spread_home, implied_spread_away = normalise(inverse(spr_h), inverse(spr_a))

    projected_total_form = (
        home_recent["points_avg"]
        + away_recent["points_avg"]
        + home_recent["conceded_avg"]
        + away_recent["conceded_avg"]
    ) / 2.0
    projected_margin_form = home_recent["net_rating"] - away_recent["net_rating"]

    def recent_pair(key: str) -> Tuple[float, float]:
        return home_recent[key], away_recent[key]

    def ai_pair(home_column: str, away_column: str, fallbacks: Tuple[float, float]) -> Tuple[float, float]:
        return ai(home_column, fallbacks[0]), ai(away_column, fallbacks[1])

    home_elo = ai("home_elo", 1500.0)
    away_elo = ai("away_elo", 1500.0)

    # Three-point "percentage" is derived from average makes over a fixed 25 attempts.
    home_three = pct(
        ai("home_avg_three_pt_made", home_recent["three_pt_pct"] * 25.0),
        25.0,
        home_recent["three_pt_pct"],
    )
    away_three = pct(
        ai("away_avg_three_pt_made", away_recent["three_pt_pct"] * 25.0),
        25.0,
        away_recent["three_pt_pct"],
    )

    features: Dict[str, Any] = {
        "home_overall_elo": home_elo,
        "away_overall_elo": away_elo,
        "elo_diff": ai("elo_diff", 0.0),
        "home_home_elo": ai("home_home_elo", home_elo),
        "away_away_elo": ai("away_away_elo", away_elo),
        "home_form_elo": ai("home_form_elo", home_elo),
        "away_form_elo": ai("away_form_elo", away_elo),
    }

    # (home feature, away feature, difference feature, (home value, away value))
    comparisons = (
        (
            "home_form_score",
            "away_form_score",
            "form_score_diff",
            ai_pair(
                "home_form_score",
                "away_form_score",
                (home_recent["win_rate"] * 100.0, away_recent["win_rate"] * 100.0),
            ),
        ),
        (
            "home_points_avg",
            "away_points_avg",
            "points_avg_diff",
            ai_pair("home_pts_avg_5", "away_pts_avg_5", recent_pair("points_avg")),
        ),
        (
            "home_conceded_avg",
            "away_conceded_avg",
            "conceded_avg_diff",
            ai_pair("home_conceded_avg_5", "away_conceded_avg_5", recent_pair("conceded_avg")),
        ),
        ("home_net_rating", "away_net_rating", "net_rating_diff", recent_pair("net_rating")),
        ("home_win_rate", "away_win_rate", "win_rate_diff", recent_pair("win_rate")),
        (
            "home_winning_streak",
            "away_winning_streak",
            "streak_diff",
            ai_pair("home_win_streak", "away_win_streak", recent_pair("winning_streak")),
        ),
        ("home_rest_days", "away_rest_days", "rest_diff", recent_pair("rest_days")),
        (
            "home_rebounds_avg",
            "away_rebounds_avg",
            "rebounds_diff",
            ai_pair("home_avg_rebounds", "away_avg_rebounds", recent_pair("rebounds_avg")),
        ),
        ("home_assists_avg", "away_assists_avg", "assists_diff", recent_pair("assists_avg")),
        ("home_steals_avg", "away_steals_avg", "steals_diff", recent_pair("steals_avg")),
        ("home_blocks_avg", "away_blocks_avg", "blocks_diff", recent_pair("blocks_avg")),
        (
            "home_turnovers_avg",
            "away_turnovers_avg",
            "turnovers_diff",
            ai_pair("home_avg_turnovers", "away_avg_turnovers", recent_pair("turnovers_avg")),
        ),
        (
            "home_fg_pct",
            "away_fg_pct",
            "fg_pct_diff",
            ai_pair("home_fg_pct", "away_fg_pct", recent_pair("fg_pct")),
        ),
        ("home_three_pt_pct", "away_three_pt_pct", "three_pt_pct_diff", (home_three, away_three)),
        ("home_ft_pct", "away_ft_pct", "ft_pct_diff", recent_pair("ft_pct")),
    )
    for home_name, away_name, diff_name, (home_value, away_value) in comparisons:
        features[home_name] = home_value
        features[away_name] = away_value
        features[diff_name] = home_value - away_value

    # Side-by-side values that have no difference column.
    for key in (
        "q1_avg",
        "q4_avg",
        "conc_rebounds_avg",
        "conc_assists_avg",
        "conc_turnovers_avg",
        "conc_fg_pct",
        "conc_three_pt_pct",
    ):
        features["home_" + key] = home_recent[key]
        features["away_" + key] = away_recent[key]

    features.update(h2h)
    features.update(league)
    features.update(
        {
            "ml_home_odds": ml_h,
            "ml_away_odds": ml_a,
            "implied_home": ai("implied_home", implied_home),
            "implied_away": ai("implied_away", implied_away),
            "total_line": total_line,
            "total_over_odds": tot_o,
            "total_under_odds": tot_u,
            "implied_total_over": ai("implied_over_total", implied_total_over),
            "implied_total_under": implied_total_under,
            "spread_home_line": spread_home_line,
            "spread_home_odds": spr_h,
            "spread_away_odds": spr_a,
            "implied_spread_home": ai("implied_spread_home", implied_spread_home),
            "implied_spread_away": implied_spread_away,
            "odds_overround": ai("odds_overround", moneyline_book - 1.0),
            "home_sidelined_count": 0.0,
            "away_sidelined_count": 0.0,
            "sidelined_diff": 0.0,
            "missing_players_impact": ai("missing_players_impact", 0.0),
            "total_points_form": projected_total_form,
            "total_points_allowed_form": home_recent["conceded_avg"] + away_recent["conceded_avg"],
            "projected_total_delta_vs_line": projected_total_form - total_line,
            "projected_margin_vs_spread": projected_margin_form + spread_home_line,
        }
    )

    score_home = int(match["score_home"])
    score_away = int(match["score_away"])
    total_points = score_home + score_away

    row: Dict[str, Any] = {
        "match_id": match_id,
        "home_team_id": home_id,
        "away_team_id": away_id,
        "league_id": league_id,
        "mst_utc": mst_utc,
    }
    # Only the columns the model expects are emitted; unknown ones become 0.0.
    for feature in DEFAULT_FEATURE_COLS:
        row[feature] = safe_float(features.get(feature), 0.0)
    row.update(
        {
            "score_home": score_home,
            "score_away": score_away,
            "total_points": total_points,
            # 0 = home win, 1 = anything else.
            "label_ml": 0 if score_home > score_away else 1,
            "label_total": 1 if total_points > total_line else 0,
            "label_spread": 1 if (score_home + spread_home_line) > score_away else 0,
        }
    )
    return row
def update_histories(
    match: Dict[str, Any],
    ctx: ExtractionContext,
    team_history: Dict[str, List[Dict[str, Any]]],
    pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]],
    league_history: Dict[str, List[Dict[str, Any]]],
) -> None:
    """Append a finished match to the team, head-to-head and league histories.

    Box-score values are read from ``ctx.team_stats``; any missing value is
    replaced by a fixed fallback. Each team's record stores its own numbers
    plus the opponent's as ``opp_*`` fields.

    Args:
        match: Match row with ``id``, team/league ids, ``mst_utc`` and scores.
        ctx: Loaded extraction context (only ``team_stats`` is read).
        team_history: Per-team record lists, appended in place.
        pair_history: Per-pair meeting lists keyed by the sorted team ids,
            appended in place.
        league_history: Per-league result lists, appended in place.
    """
    match_id = str(match["id"])
    home_id = str(match["home_team_id"])
    away_id = str(match["away_team_id"])
    league_id = str(match["league_id"] or "")
    score_home = int(match["score_home"])
    score_away = int(match["score_away"])

    home_stats = ctx.team_stats.get((match_id, home_id), {})
    away_stats = ctx.team_stats.get((match_id, away_id), {})

    def shooting(stats: Dict[str, Any], made: str, attempted: str, fallback: float) -> float:
        """Made/attempted ratio, or ``fallback`` when there are no attempts."""
        return pct(safe_float(stats.get(made)), safe_float(stats.get(attempted)), fallback)

    def team_record(
        own: Dict[str, Any],
        opponent: Dict[str, Any],
        scored: int,
        conceded: int,
    ) -> Dict[str, Any]:
        """One team's view of the match: own box score plus the opponent's."""
        return {
            "mst_utc": int(match["mst_utc"]),
            "scored": scored,
            "conceded": conceded,
            "rebounds": safe_float(own.get("rebounds"), 35.0),
            "assists": safe_float(own.get("assists"), 18.0),
            "steals": safe_float(own.get("steals"), 6.5),
            "blocks": safe_float(own.get("blocks"), 3.0),
            "turnovers": safe_float(own.get("turnovers"), 13.0),
            "fg_pct": shooting(own, "fg_made", "fg_attempted", 0.45),
            "three_pt_pct": shooting(own, "three_pt_made", "three_pt_attempted", 0.34),
            "ft_pct": shooting(own, "ft_made", "ft_attempted", 0.75),
            "q1_score": safe_float(own.get("q1_score"), 20.0),
            "q4_score": safe_float(own.get("q4_score"), 21.0),
            "opp_rebounds": safe_float(opponent.get("rebounds"), 35.0),
            "opp_assists": safe_float(opponent.get("assists"), 18.0),
            "opp_turnovers": safe_float(opponent.get("turnovers"), 13.0),
            "opp_fg_pct": shooting(opponent, "fg_made", "fg_attempted", 0.45),
            "opp_three_pt_pct": shooting(opponent, "three_pt_made", "three_pt_attempted", 0.34),
        }

    team_history[home_id].append(team_record(home_stats, away_stats, score_home, score_away))
    team_history[away_id].append(team_record(away_stats, home_stats, score_away, score_home))

    meeting = {
        "home_team_id": home_id,
        "away_team_id": away_id,
        "score_home": score_home,
        "score_away": score_away,
    }
    pair_history[tuple(sorted((home_id, away_id)))].append(meeting)

    league_history[league_id].append({"score_home": score_home, "score_away": score_away})
def main() -> None:
    """Run the extraction end to end and write the training CSV.

    Steps: read the league ids from ``TOP_LEAGUES_PATH``, load all source rows
    through ``ExtractionContext``, then walk the matches in the order they were
    loaded. Each row is built from the histories *before* the match itself is
    added to them, so a match never contributes to its own features.

    Raises:
        FileNotFoundError: if ``TOP_LEAGUES_PATH`` does not exist.
        RuntimeError: from ``get_conn`` when ``DATABASE_URL`` is missing.
    """
    started_at = time.time()
    if not os.path.exists(TOP_LEAGUES_PATH):
        raise FileNotFoundError(TOP_LEAGUES_PATH)
    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as handle:
        league_ids = json.load(handle)

    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

    extracted = 0
    skipped = 0
    conn = get_conn()
    # try/finally so the connection is released even if loading or extraction fails.
    try:
        ctx = ExtractionContext(conn, league_ids)
        ctx.load()

        team_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
        pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]] = defaultdict(list)
        league_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=CSV_COLS)
            writer.writeheader()
            for idx, match in enumerate(ctx.matches, start=1):
                row = build_match_feature_row(match, ctx, team_history, pair_history, league_history)
                if row is None:
                    skipped += 1
                else:
                    writer.writerow(row)
                    extracted += 1
                # NOTE(review): histories are updated for every match, including
                # skipped ones, so later rows still see those results - confirm
                # this matches the original nesting.
                update_histories(match, ctx, team_history, pair_history, league_history)
                if idx % 2000 == 0:
                    print(
                        f"[INFO] processed={idx} extracted={extracted} skipped={skipped}",
                        flush=True,
                    )
    finally:
        conn.close()

    print("[OK] Basketball V25 extraction complete", flush=True)
    print(f"[INFO] matches={len(ctx.matches)} extracted={extracted} skipped={skipped}", flush=True)
    print(f"[INFO] output={OUTPUT_CSV}", flush=True)
    print(f"[INFO] duration_sec={time.time() - started_at:.1f}", flush=True)


if __name__ == "__main__":
    main()