766 lines
30 KiB
Python
766 lines
30 KiB
Python
"""
Extract basketball V25-style training data.

Scope:
- top leagues from basketball_top_leagues.json
- finished basketball matches
- pre-match features only
- labels for moneyline / total / spread markets
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from collections import defaultdict
|
||
from typing import Any, Dict, List, Tuple
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
from dotenv import load_dotenv
|
||
|
||
load_dotenv()
|
||
|
||
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
sys.path.insert(0, AI_ENGINE_DIR)
|
||
|
||
from models.basketball_v25_features import DEFAULT_FEATURE_COLS
|
||
|
||
# JSON file that main() loads the league ids from (one level above the engine dir).
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json")
# Destination of the generated training CSV.
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "basketball_training_data_v25.csv")

# Columns that identify a row; written first in the CSV.
IDENTIFIER_COLS = [
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
]
# Final scores and the three market labels; written last in the CSV.
LABEL_COLS = ["score_home", "score_away", "total_points", "label_ml", "label_total", "label_spread"]
# Full CSV header: identifiers, then model features, then labels.
CSV_COLS = [*IDENTIFIER_COLS, *DEFAULT_FEATURE_COLS, *LABEL_COLS]
|
||
|
||
|
||
def get_conn():
    """Open a PostgreSQL connection described by the ``DATABASE_URL`` env var.

    Everything from the first ``?schema=`` marker onward is cut off the URL
    before it is passed to ``psycopg2.connect``.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is unset or empty after trimming.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    dsn, _, _ = raw_url.partition("?schema=")
    if dsn:
        return psycopg2.connect(dsn)
    raise RuntimeError("DATABASE_URL is required")
|
||
|
||
|
||
def safe_float(value: Any, default: float = 0.0) -> float:
    """Convert ``value`` to ``float``, falling back to ``default``.

    ``default`` is returned when ``value`` is ``None`` or when ``float()``
    rejects it with ``TypeError`` or ``ValueError``.
    """
    if value is None:
        return default
    try:
        converted = float(value)
    except (TypeError, ValueError):
        return default
    return converted
|
||
|
||
|
||
def pct(num: float, den: float, default: float = 0.0) -> float:
    """Return ``num / den`` as a float, or ``default`` when ``den`` is zero or negative."""
    return default if den <= 0 else float(num) / float(den)
|
||
|
||
|
||
def default_recent_stats() -> Dict[str, float]:
    """Return the placeholder recent-form stats used for a team with no history.

    A new dict is built on every call, so callers may mutate the result.
    """
    baseline: Dict[str, float] = dict(
        # Scoring and results.
        points_avg=82.0,
        conceded_avg=80.0,
        net_rating=2.0,
        win_rate=0.5,
        winning_streak=0.0,
        rest_days=3.0,
        # Own box-score averages.
        rebounds_avg=35.0,
        assists_avg=18.0,
        steals_avg=6.5,
        blocks_avg=3.0,
        turnovers_avg=13.0,
        fg_pct=0.45,
        three_pt_pct=0.34,
        ft_pct=0.75,
        q1_avg=20.0,
        q4_avg=21.0,
        # Averages conceded to opponents.
        conc_rebounds_avg=35.0,
        conc_assists_avg=18.0,
        conc_turnovers_avg=13.0,
        conc_fg_pct=0.45,
        conc_three_pt_pct=0.34,
    )
    return baseline
|
||
|
||
|
||
def summarize_team_history(history: List[Dict[str, Any]], match_date_ms: int) -> Dict[str, float]:
    """Summarise a team's previous games into recent-form statistics.

    Args:
        history: the team's earlier game records, oldest first. Each record
            must carry ``scored`` and ``conceded``; the other stat keys are
            optional and fall back to fixed per-stat values.
        match_date_ms: kickoff of the upcoming match in epoch milliseconds,
            used only to compute rest days.

    Returns:
        A dict with the same keys as ``default_recent_stats()``. Averages use
        the last 8 games; win rate and winning streak use the last 12.
    """
    if not history:
        return default_recent_stats()

    stat_window = history[-8:]
    form_window = history[-12:]

    def is_win(game: Dict[str, Any]) -> bool:
        return safe_float(game["scored"]) > safe_float(game["conceded"])

    def mean_of(key: str, fallback: float) -> float:
        # A missing/invalid value counts as `fallback` rather than being dropped.
        total = sum([safe_float(game.get(key), fallback) for game in stat_window])
        return total / max(len(stat_window), 1)

    points_for = [safe_float(game["scored"]) for game in stat_window]
    points_against = [safe_float(game["conceded"]) for game in stat_window]
    win_count = sum(1 for game in form_window if is_win(game))

    # Count consecutive wins walking back from the most recent game.
    current_streak = 0
    for game in form_window[::-1]:
        if not is_win(game):
            break
        current_streak += 1

    previous_kickoff_ms = safe_float(history[-1].get("mst_utc"), 0.0)
    if previous_kickoff_ms:
        rest_days = max(0.0, (float(match_date_ms) - previous_kickoff_ms) / 86_400_000.0)
    else:
        # No usable timestamp on the last game: assume the default rest.
        rest_days = 3.0

    points_avg = sum(points_for) / max(len(points_for), 1)
    conceded_avg = sum(points_against) / max(len(points_against), 1)

    summary: Dict[str, float] = {
        "points_avg": points_avg,
        "conceded_avg": conceded_avg,
        "net_rating": points_avg - conceded_avg,
        "win_rate": win_count / max(len(form_window), 1),
        "winning_streak": float(current_streak),
        "rest_days": rest_days,
    }
    # (output key, history record key, fallback for a missing value)
    rolling_fields = (
        ("rebounds_avg", "rebounds", 35.0),
        ("assists_avg", "assists", 18.0),
        ("steals_avg", "steals", 6.5),
        ("blocks_avg", "blocks", 3.0),
        ("turnovers_avg", "turnovers", 13.0),
        ("fg_pct", "fg_pct", 0.45),
        ("three_pt_pct", "three_pt_pct", 0.34),
        ("ft_pct", "ft_pct", 0.75),
        ("q1_avg", "q1_score", 20.0),
        ("q4_avg", "q4_score", 21.0),
        ("conc_rebounds_avg", "opp_rebounds", 35.0),
        ("conc_assists_avg", "opp_assists", 18.0),
        ("conc_turnovers_avg", "opp_turnovers", 13.0),
        ("conc_fg_pct", "opp_fg_pct", 0.45),
        ("conc_three_pt_pct", "opp_three_pt_pct", 0.34),
    )
    for output_key, record_key, fallback in rolling_fields:
        summary[output_key] = mean_of(record_key, fallback)
    return summary
|
||
|
||
|
||
def summarize_h2h(
    history: List[Dict[str, Any]],
    current_home_id: str,
    total_line: float,
    spread_home_line: float,
) -> Dict[str, float]:
    """Summarise the last 10 head-to-head games from the current home team's side.

    Args:
        history: earlier meetings of the two teams, oldest first; each record
            has ``home_team_id``, ``score_home`` and ``score_away``.
        current_home_id: id of the team hosting the upcoming match. Games in
            which that team was not the listed home side have their scores
            swapped so "home" always means this team.
        total_line: totals line of the upcoming match; a non-positive value
            makes the over rate fall back to 0.5.
        spread_home_line: handicap added to the home score for the cover rate.

    Returns:
        Dict with the six ``h2h_*`` features; neutral values when there is
        no history.
    """
    if not history:
        return {
            "h2h_total_matches": 0.0,
            "h2h_home_win_rate": 0.5,
            "h2h_avg_points": 160.0,
            "h2h_avg_margin": 0.0,
            "h2h_over_total_rate": 0.5,
            "h2h_home_cover_rate": 0.5,
        }

    window = history[-10:]
    has_total_line = total_line > 0
    win_count = over_count = cover_count = 0
    points_sum = 0.0
    margin_sum = 0.0

    for game in window:
        if game["home_team_id"] == current_home_id:
            ours = safe_float(game["score_home"])
            theirs = safe_float(game["score_away"])
        else:
            # The current home team played away in this meeting.
            ours = safe_float(game["score_away"])
            theirs = safe_float(game["score_home"])

        combined = ours + theirs
        if ours > theirs:
            win_count += 1
        margin_sum += ours - theirs
        points_sum += combined
        if has_total_line and combined > total_line:
            over_count += 1
        if (ours + spread_home_line) > theirs:
            cover_count += 1

    games = float(len(window))
    over_rate = over_count / games if has_total_line else 0.5
    return {
        "h2h_total_matches": games,
        "h2h_home_win_rate": win_count / games,
        "h2h_avg_points": points_sum / games,
        "h2h_avg_margin": margin_sum / games,
        "h2h_over_total_rate": over_rate,
        "h2h_home_cover_rate": cover_count / games,
    }
|
||
|
||
|
||
def summarize_league(
    history: List[Dict[str, Any]],
    total_line: float,
    spread_home_line: float,
) -> Dict[str, float]:
    """Summarise the last 200 games of a league against the upcoming match's lines.

    Args:
        history: earlier games of the league, oldest first; each record has
            ``score_home`` and ``score_away``.
        total_line: totals line of the upcoming match; a non-positive value
            makes the over rate fall back to 0.5.
        spread_home_line: handicap added to the home score for the cover rate.

    Returns:
        Dict with the four ``league_*`` features; fixed baseline values when
        there is no history.
    """
    if not history:
        return {
            "league_avg_points": 160.0,
            "league_home_win_rate": 0.56,
            "league_over_total_rate": 0.5,
            "league_home_cover_rate": 0.5,
        }

    window = history[-200:]
    has_total_line = total_line > 0
    points_sum = 0.0
    home_win_count = over_count = cover_count = 0

    for game in window:
        home_pts = safe_float(game["score_home"])
        away_pts = safe_float(game["score_away"])
        combined = home_pts + away_pts
        points_sum += combined
        if home_pts > away_pts:
            home_win_count += 1
        if has_total_line and combined > total_line:
            over_count += 1
        if (home_pts + spread_home_line) > away_pts:
            cover_count += 1

    games = float(len(window))
    over_rate = over_count / games if has_total_line else 0.5
    return {
        "league_avg_points": points_sum / games,
        "league_home_win_rate": home_win_count / games,
        "league_over_total_rate": over_rate,
        "league_home_cover_rate": cover_count / games,
    }
|
||
|
||
|
||
# Lower-case Turkish letters mapped to their closest ASCII letter.
_TURKISH_TO_ASCII = str.maketrans("ıçşğöü", "icsgou")


def normalize_text(value: Any) -> str:
    """Return ``value`` as trimmed, lower-case text with Turkish letters ASCII-folded.

    Falsy values (``None``, ``""``, ``0``) normalise to the empty string.

    Python lower-cases the capital dotted I (U+0130) to ``"i"`` followed by
    the combining dot U+0307; that pair is collapsed to a plain ``"i"`` so
    upper-case Turkish input normalises the same way as lower-case input.
    """
    lowered = str(value or "").strip().lower()
    # "İ".lower() == "i\u0307": drop the stray combining dot.
    lowered = lowered.replace("i\u0307", "i")
    return lowered.translate(_TURKISH_TO_ASCII)
|
||
|
||
|
||
def extract_parenthesized_number(category_name: str) -> float | None:
    """Return the first plain number written in parentheses in ``category_name``.

    Parenthesised groups are scanned left to right. A group is skipped when
    it contains ``:`` (handicap notation) or is not a number, so a name such
    as ``"... (Uzt. Dahil) (165,5)"`` still yields ``165.5``. A decimal comma
    is accepted.

    Returns:
        The parsed number, or ``None`` when no group holds a plain number or
        a ``(`` is never closed.
    """
    search_from = 0
    while True:
        left = category_name.find("(", search_from)
        if left < 0:
            return None
        right = category_name.find(")", left + 1)
        if right < 0:
            return None
        search_from = right + 1

        payload = category_name[left + 1 : right].replace(",", ".")
        if ":" in payload:
            continue
        try:
            return float(payload)
        except ValueError:
            # Textual group such as "(Uzt. Dahil)": keep looking.
            continue
|
||
|
||
|
||
def parse_handicap_home_line(category_name: str) -> float | None:
    """Return the home-side handicap encoded as ``(home:away)`` in ``category_name``.

    Parenthesised groups are scanned left to right and the first one that
    parses as ``number:number`` is used, so a textual group such as
    ``"(Uzt. Dahil)"`` ahead of the handicap does not hide it. A decimal
    comma is accepted.

    Mapping of the parsed pair to the returned line:
        * ``(0:x)`` with ``x > 0``  -> ``-x``
        * ``(x:x)`` with ``x > 0``  -> ``0.0``
        * anything else             -> the home number as written

    Returns:
        The home line, or ``None`` when no ``home:away`` group is found.
    """
    search_from = 0
    while True:
        left = category_name.find("(", search_from)
        if left < 0:
            return None
        right = category_name.find(")", left + 1)
        if right < 0:
            return None
        search_from = right + 1

        payload = category_name[left + 1 : right].replace(",", ".")
        if ":" not in payload:
            continue
        home_raw, away_raw = payload.split(":", 1)
        try:
            home_line = float(home_raw)
            away_line = float(away_raw)
        except ValueError:
            continue
        break

    if abs(home_line) < 1e-9 and away_line > 0:
        return -away_line
    if abs(home_line - away_line) < 1e-9 and home_line > 0:
        return 0.0
    return home_line
|
||
|
||
|
||
def parse_odds(categories: List[Dict[str, Any]], selections: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
    """Collapse raw odds rows into one dict of pre-match prices per match.

    Args:
        categories: rows with ``category_id``, ``match_id`` and
            ``category_name``.
        selections: rows with ``odd_category_db_id``, ``name`` and
            ``odd_value``. Rows whose category is unknown or whose price is
            not above 1.0 are ignored.

    Returns:
        Mapping ``match_id -> odds`` where ``odds`` may contain ``ml_h``,
        ``ml_a``, ``tot_line``, ``tot_o``, ``tot_u``, ``spread_home_line``,
        ``spread_h`` and ``spread_a``.

    A match can carry several total and handicap categories (one per line).
    For each market the first category that yields a parseable line is
    pinned, and the line and both prices are read from that category only,
    so a line is never paired with prices quoted for a different line.
    """
    match_odds: Dict[str, Dict[str, float]] = defaultdict(dict)
    category_map = {
        row["category_id"]: (str(row["match_id"]), str(row["category_name"]))
        for row in categories
    }
    # match_id -> category id pinned for the totals / handicap market.
    total_category: Dict[str, Any] = {}
    spread_category: Dict[str, Any] = {}

    moneyline_names = ("mac sonucu", "mac sonucu (uzt. dahil)")
    # Half, period and team totals are not the full-game total.
    excluded_total_tokens = ("1. yari", "periyot", "ev sahibi", "deplasman")
    handicap_tokens = ("hnd. ms", "hand. ms", "hnd ms")

    for row in selections:
        category_id = row["odd_category_db_id"]
        if category_id not in category_map:
            continue
        match_id, category_name = category_map[category_id]
        category_norm = normalize_text(category_name)
        selection_norm = normalize_text(row["name"])
        odd_value = safe_float(row["odd_value"], 0.0)
        if odd_value <= 1.0:
            continue

        target = match_odds[match_id]

        if category_norm in moneyline_names:
            if selection_norm == "1":
                target["ml_h"] = odd_value
            elif selection_norm == "2":
                target["ml_a"] = odd_value

        is_full_game_total = "alt/ust" in category_norm and not any(
            token in category_norm for token in excluded_total_tokens
        )
        if is_full_game_total:
            total_line = extract_parenthesized_number(category_name)
            if total_line is not None:
                pinned = total_category.setdefault(match_id, category_id)
                if pinned == category_id:
                    target.setdefault("tot_line", total_line)
                    if any(token in selection_norm for token in ("ust", "over")):
                        target.setdefault("tot_o", odd_value)
                    elif any(token in selection_norm for token in ("alt", "under")):
                        target.setdefault("tot_u", odd_value)

        if any(token in category_norm for token in handicap_tokens):
            home_line = parse_handicap_home_line(category_name)
            if home_line is not None:
                pinned = spread_category.setdefault(match_id, category_id)
                if pinned == category_id:
                    target.setdefault("spread_home_line", home_line)
                    if selection_norm == "1":
                        target.setdefault("spread_h", odd_value)
                    elif selection_norm == "2":
                        target.setdefault("spread_a", odd_value)

    return match_odds
|
||
|
||
|
||
class ExtractionContext:
    """Loads and holds everything the extraction needs from the database.

    After ``load()`` the instance exposes:
        * ``matches``: finished basketball matches, ordered by kickoff;
        * ``team_stats``: box-score rows keyed by ``(match_id, team_id)``;
        * ``ai_features``: ``basketball_ai_features`` rows keyed by match id;
        * ``odds_cache``: parsed pre-match odds keyed by match id.

    Matches and odds share one filter (sport, status, scores present, date
    cutoff, optional league list), so odds are only fetched for matches
    that are actually extracted.
    """

    # 2022-01-01T00:00:00Z in epoch milliseconds; earlier matches are skipped.
    MIN_MST_UTC = 1640995200000
    # Number of category ids sent per odd_selections lookup.
    ODDS_CHUNK_SIZE = 50000

    def __init__(self, conn, league_ids: List[str]):
        """Store the connection, open a dict-returning cursor and reset caches.

        Args:
            conn: open psycopg2 connection.
            league_ids: league ids to restrict to; an empty list means no
                league restriction.
        """
        self.conn = conn
        self.cur = conn.cursor(cursor_factory=RealDictCursor)
        self.league_ids = league_ids
        self.matches: List[Dict[str, Any]] = []
        self.team_stats: Dict[Tuple[str, str], Dict[str, Any]] = {}
        self.ai_features: Dict[str, Dict[str, Any]] = {}
        self.odds_cache: Dict[str, Dict[str, float]] = {}

    def load(self) -> None:
        """Run every loader, in the order matches, team stats, AI features, odds."""
        self._load_matches()
        self._load_team_stats()
        self._load_ai_features()
        self._load_odds()

    def _match_filter(self) -> Tuple[str, Tuple[Any, ...]]:
        """Return the ``WHERE`` clause and parameters selecting the target matches."""
        clause = (
            " WHERE sport = 'basketball'"
            " AND status = 'FT'"
            " AND score_home IS NOT NULL"
            " AND score_away IS NOT NULL"
            " AND mst_utc >= %s"
        )
        params: Tuple[Any, ...] = (self.MIN_MST_UTC,)
        if self.league_ids:
            placeholders = ",".join(["%s"] * len(self.league_ids))
            clause += f" AND league_id IN ({placeholders})"
            params += tuple(self.league_ids)
        return clause, params

    def _load_matches(self) -> None:
        """Fetch the target matches ordered by ascending kickoff time."""
        where_clause, params = self._match_filter()
        query = (
            "SELECT id, league_id, home_team_id, away_team_id, mst_utc, score_home, score_away"
            " FROM matches"
            + where_clause
            + " ORDER BY mst_utc ASC"
        )
        self.cur.execute(query, params)
        self.matches = self.cur.fetchall()

    def _load_team_stats(self) -> None:
        """Load every ``basketball_team_stats`` row keyed by ``(match_id, team_id)``."""
        self.cur.execute(
            """
            SELECT
                match_id,
                team_id,
                points,
                rebounds,
                assists,
                steals,
                blocks,
                turnovers,
                fg_made,
                fg_attempted,
                three_pt_made,
                three_pt_attempted,
                ft_made,
                ft_attempted,
                q1_score,
                q4_score
            FROM basketball_team_stats
            """
        )
        for row in self.cur.fetchall():
            key = (str(row["match_id"]), str(row["team_id"]))
            self.team_stats[key] = row

    def _load_ai_features(self) -> None:
        """Load every ``basketball_ai_features`` row keyed by match id."""
        self.cur.execute("SELECT * FROM basketball_ai_features")
        for row in self.cur.fetchall():
            self.ai_features[str(row["match_id"])] = row

    def _load_odds(self) -> None:
        """Load odds categories and selections for the target matches and parse them.

        Leaves ``odds_cache`` untouched when no category is found.
        """
        where_clause, params = self._match_filter()
        self.cur.execute(
            "SELECT db_id AS category_id, match_id, name AS category_name"
            " FROM odd_categories"
            " WHERE match_id IN (SELECT id FROM matches" + where_clause + ")",
            params,
        )
        categories = self.cur.fetchall()
        category_ids = [row["category_id"] for row in categories]
        if not category_ids:
            return

        selections: List[Dict[str, Any]] = []
        # Chunked so a single IN (...) list stays bounded.
        for start in range(0, len(category_ids), self.ODDS_CHUNK_SIZE):
            chunk = tuple(category_ids[start : start + self.ODDS_CHUNK_SIZE])
            self.cur.execute(
                """
                SELECT odd_category_db_id, name, odd_value
                FROM odd_selections
                WHERE odd_category_db_id IN %s
                """,
                (chunk,),
            )
            selections.extend(self.cur.fetchall())
        self.odds_cache = parse_odds(categories, selections)
|
||
|
||
|
||
def build_match_feature_row(
    match: Dict[str, Any],
    ctx: ExtractionContext,
    team_history: Dict[str, List[Dict[str, Any]]],
    pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]],
    league_history: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, Any] | None:
    """Build one CSV row (identifiers, features, labels) for a finished match.

    Features come from three places: the stored ``basketball_ai_features``
    row when it has a value, otherwise the rolling histories passed in, plus
    the parsed odds. The histories are only read here.

    Args:
        match: row from ``ExtractionContext.matches``.
        ctx: loaded extraction context (odds and AI feature caches).
        team_history: per-team game records of earlier matches.
        pair_history: earlier meetings keyed by the sorted team-id pair.
        league_history: earlier results per league id.

    Returns:
        The row dict, or ``None`` when the match has no usable moneyline
        price (above 1.0) for both sides.
    """
    match_id = str(match["id"])
    home_id = str(match["home_team_id"])
    away_id = str(match["away_team_id"])
    league_id = str(match["league_id"] or "")
    mst_utc = int(match["mst_utc"])

    odds = ctx.odds_cache.get(match_id, {})
    ml_h = safe_float(odds.get("ml_h"), 0.0)
    ml_a = safe_float(odds.get("ml_a"), 0.0)
    if ml_h <= 1.0 or ml_a <= 1.0:
        return None

    ai_row = ctx.ai_features.get(match_id, {})

    def ai(key: str, fallback: float) -> float:
        # Stored AI feature when present and numeric, else the fallback.
        return safe_float(ai_row.get(key), fallback)

    def two_way_probabilities(first_odds: float, second_odds: float) -> Tuple[float, float]:
        # Margin-free probabilities of a two-outcome market; 0.5/0.5 without prices.
        inv_first = 1.0 / first_odds if first_odds > 1.0 else 0.0
        inv_second = 1.0 / second_odds if second_odds > 1.0 else 0.0
        book = inv_first + inv_second
        if book > 0:
            return inv_first / book, inv_second / book
        return 0.5, 0.5

    home_recent = summarize_team_history(team_history[home_id], mst_utc)
    away_recent = summarize_team_history(team_history[away_id], mst_utc)

    total_line = safe_float(odds.get("tot_line"), 160.0)
    spread_home_line = safe_float(odds.get("spread_home_line"), 0.0)
    pair_key = tuple(sorted((home_id, away_id)))
    h2h = summarize_h2h(pair_history[pair_key], home_id, total_line, spread_home_line)
    league = summarize_league(league_history[league_id], total_line, spread_home_line)

    # Market prices; totals and spread fall back to 1.90 when missing.
    tot_o = safe_float(odds.get("tot_o"), 1.90)
    tot_u = safe_float(odds.get("tot_u"), 1.90)
    spr_h = safe_float(odds.get("spread_h"), 1.90)
    spr_a = safe_float(odds.get("spread_a"), 1.90)

    inv_home = 1.0 / ml_h
    inv_away = 1.0 / ml_a
    ml_book = inv_home + inv_away
    if ml_book > 0:
        implied_home = inv_home / ml_book
        implied_away = inv_away / ml_book
    else:
        implied_home = implied_away = 0.5
    implied_total_over, implied_total_under = two_way_probabilities(tot_o, tot_u)
    implied_spread_home, implied_spread_away = two_way_probabilities(spr_h, spr_a)

    projected_total_form = (
        home_recent["points_avg"]
        + away_recent["points_avg"]
        + home_recent["conceded_avg"]
        + away_recent["conceded_avg"]
    ) / 2.0
    projected_margin_form = home_recent["net_rating"] - away_recent["net_rating"]

    # Values that feed both a per-side feature and its home-minus-away diff.
    home_elo = ai("home_elo", 1500.0)
    away_elo = ai("away_elo", 1500.0)
    home_form = ai("home_form_score", home_recent["win_rate"] * 100.0)
    away_form = ai("away_form_score", away_recent["win_rate"] * 100.0)
    home_points = ai("home_pts_avg_5", home_recent["points_avg"])
    away_points = ai("away_pts_avg_5", away_recent["points_avg"])
    home_conceded = ai("home_conceded_avg_5", home_recent["conceded_avg"])
    away_conceded = ai("away_conceded_avg_5", away_recent["conceded_avg"])
    home_streak = ai("home_win_streak", home_recent["winning_streak"])
    away_streak = ai("away_win_streak", away_recent["winning_streak"])
    home_rebounds = ai("home_avg_rebounds", home_recent["rebounds_avg"])
    away_rebounds = ai("away_avg_rebounds", away_recent["rebounds_avg"])
    home_turnovers = ai("home_avg_turnovers", home_recent["turnovers_avg"])
    away_turnovers = ai("away_avg_turnovers", away_recent["turnovers_avg"])
    home_fg = ai("home_fg_pct", home_recent["fg_pct"])
    away_fg = ai("away_fg_pct", away_recent["fg_pct"])
    # Made threes are divided by a fixed 25 attempts to get a percentage.
    home_three = pct(
        ai("home_avg_three_pt_made", home_recent["three_pt_pct"] * 25.0),
        25.0,
        home_recent["three_pt_pct"],
    )
    away_three = pct(
        ai("away_avg_three_pt_made", away_recent["three_pt_pct"] * 25.0),
        25.0,
        away_recent["three_pt_pct"],
    )

    features = {
        "home_overall_elo": home_elo,
        "away_overall_elo": away_elo,
        "elo_diff": ai("elo_diff", 0.0),
        "home_home_elo": ai("home_home_elo", home_elo),
        "away_away_elo": ai("away_away_elo", away_elo),
        "home_form_elo": ai("home_form_elo", home_elo),
        "away_form_elo": ai("away_form_elo", away_elo),
        "home_form_score": home_form,
        "away_form_score": away_form,
        "form_score_diff": home_form - away_form,
        "home_points_avg": home_points,
        "away_points_avg": away_points,
        "points_avg_diff": home_points - away_points,
        "home_conceded_avg": home_conceded,
        "away_conceded_avg": away_conceded,
        "conceded_avg_diff": home_conceded - away_conceded,
        "home_net_rating": home_recent["net_rating"],
        "away_net_rating": away_recent["net_rating"],
        "net_rating_diff": home_recent["net_rating"] - away_recent["net_rating"],
        "home_win_rate": home_recent["win_rate"],
        "away_win_rate": away_recent["win_rate"],
        "win_rate_diff": home_recent["win_rate"] - away_recent["win_rate"],
        "home_winning_streak": home_streak,
        "away_winning_streak": away_streak,
        "streak_diff": home_streak - away_streak,
        "home_rest_days": home_recent["rest_days"],
        "away_rest_days": away_recent["rest_days"],
        "rest_diff": home_recent["rest_days"] - away_recent["rest_days"],
        "home_rebounds_avg": home_rebounds,
        "away_rebounds_avg": away_rebounds,
        "rebounds_diff": home_rebounds - away_rebounds,
        "home_assists_avg": home_recent["assists_avg"],
        "away_assists_avg": away_recent["assists_avg"],
        "assists_diff": home_recent["assists_avg"] - away_recent["assists_avg"],
        "home_steals_avg": home_recent["steals_avg"],
        "away_steals_avg": away_recent["steals_avg"],
        "steals_diff": home_recent["steals_avg"] - away_recent["steals_avg"],
        "home_blocks_avg": home_recent["blocks_avg"],
        "away_blocks_avg": away_recent["blocks_avg"],
        "blocks_diff": home_recent["blocks_avg"] - away_recent["blocks_avg"],
        "home_turnovers_avg": home_turnovers,
        "away_turnovers_avg": away_turnovers,
        "turnovers_diff": home_turnovers - away_turnovers,
        "home_fg_pct": home_fg,
        "away_fg_pct": away_fg,
        "fg_pct_diff": home_fg - away_fg,
        "home_three_pt_pct": home_three,
        "away_three_pt_pct": away_three,
        "three_pt_pct_diff": home_three - away_three,
        "home_ft_pct": home_recent["ft_pct"],
        "away_ft_pct": away_recent["ft_pct"],
        "ft_pct_diff": home_recent["ft_pct"] - away_recent["ft_pct"],
        "home_q1_avg": home_recent["q1_avg"],
        "away_q1_avg": away_recent["q1_avg"],
        "home_q4_avg": home_recent["q4_avg"],
        "away_q4_avg": away_recent["q4_avg"],
        "home_conc_rebounds_avg": home_recent["conc_rebounds_avg"],
        "away_conc_rebounds_avg": away_recent["conc_rebounds_avg"],
        "home_conc_assists_avg": home_recent["conc_assists_avg"],
        "away_conc_assists_avg": away_recent["conc_assists_avg"],
        "home_conc_turnovers_avg": home_recent["conc_turnovers_avg"],
        "away_conc_turnovers_avg": away_recent["conc_turnovers_avg"],
        "home_conc_fg_pct": home_recent["conc_fg_pct"],
        "away_conc_fg_pct": away_recent["conc_fg_pct"],
        "home_conc_three_pt_pct": home_recent["conc_three_pt_pct"],
        "away_conc_three_pt_pct": away_recent["conc_three_pt_pct"],
        **h2h,
        **league,
        "ml_home_odds": ml_h,
        "ml_away_odds": ml_a,
        "implied_home": ai("implied_home", implied_home),
        "implied_away": ai("implied_away", implied_away),
        "total_line": total_line,
        "total_over_odds": tot_o,
        "total_under_odds": tot_u,
        "implied_total_over": ai("implied_over_total", implied_total_over),
        "implied_total_under": implied_total_under,
        "spread_home_line": spread_home_line,
        "spread_home_odds": spr_h,
        "spread_away_odds": spr_a,
        "implied_spread_home": ai("implied_spread_home", implied_spread_home),
        "implied_spread_away": implied_spread_away,
        "odds_overround": ai("odds_overround", ml_book - 1.0),
        "home_sidelined_count": 0.0,
        "away_sidelined_count": 0.0,
        "sidelined_diff": 0.0,
        "missing_players_impact": ai("missing_players_impact", 0.0),
        "total_points_form": projected_total_form,
        "total_points_allowed_form": home_recent["conceded_avg"] + away_recent["conceded_avg"],
        "projected_total_delta_vs_line": projected_total_form - total_line,
        "projected_margin_vs_spread": projected_margin_form + spread_home_line,
    }

    score_home = int(match["score_home"])
    score_away = int(match["score_away"])
    total_points = score_home + score_away

    row: Dict[str, Any] = {
        "match_id": match_id,
        "home_team_id": home_id,
        "away_team_id": away_id,
        "league_id": league_id,
        "mst_utc": mst_utc,
    }
    # Only the model's feature columns are emitted; anything missing becomes 0.0.
    for feature in DEFAULT_FEATURE_COLS:
        row[feature] = safe_float(features.get(feature), 0.0)
    row["score_home"] = score_home
    row["score_away"] = score_away
    row["total_points"] = total_points
    # label_ml: 0 = home win, 1 = anything else.
    row["label_ml"] = 0 if score_home > score_away else 1
    row["label_total"] = 1 if total_points > total_line else 0
    row["label_spread"] = 1 if (score_home + spread_home_line) > score_away else 0
    return row
|
||
|
||
|
||
def update_histories(
    match: Dict[str, Any],
    ctx: ExtractionContext,
    team_history: Dict[str, List[Dict[str, Any]]],
    pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]],
    league_history: Dict[str, List[Dict[str, Any]]],
) -> None:
    """Append a finished match to the team, head-to-head and league histories.

    Box-score values come from ``ctx.team_stats``; a missing row or value is
    replaced by a fixed per-stat fallback. All three history mappings are
    mutated in place.
    """
    match_id = str(match["id"])
    home_id = str(match["home_team_id"])
    away_id = str(match["away_team_id"])
    league_id = str(match["league_id"] or "")
    score_home = int(match["score_home"])
    score_away = int(match["score_away"])
    home_stats = ctx.team_stats.get((match_id, home_id), {})
    away_stats = ctx.team_stats.get((match_id, away_id), {})

    def shooting(stats: Dict[str, Any], prefix: str, fallback: float) -> float:
        # made / attempted for one shot type, or the fallback without attempts.
        made = safe_float(stats.get(prefix + "_made"))
        attempted = safe_float(stats.get(prefix + "_attempted"))
        return pct(made, attempted, fallback)

    def team_record(
        own: Dict[str, Any],
        opponent: Dict[str, Any],
        scored: int,
        conceded: int,
    ) -> Dict[str, Any]:
        # One team's view of the game: its own numbers plus what it allowed.
        return {
            "mst_utc": int(match["mst_utc"]),
            "scored": scored,
            "conceded": conceded,
            "rebounds": safe_float(own.get("rebounds"), 35.0),
            "assists": safe_float(own.get("assists"), 18.0),
            "steals": safe_float(own.get("steals"), 6.5),
            "blocks": safe_float(own.get("blocks"), 3.0),
            "turnovers": safe_float(own.get("turnovers"), 13.0),
            "fg_pct": shooting(own, "fg", 0.45),
            "three_pt_pct": shooting(own, "three_pt", 0.34),
            "ft_pct": shooting(own, "ft", 0.75),
            "q1_score": safe_float(own.get("q1_score"), 20.0),
            "q4_score": safe_float(own.get("q4_score"), 21.0),
            "opp_rebounds": safe_float(opponent.get("rebounds"), 35.0),
            "opp_assists": safe_float(opponent.get("assists"), 18.0),
            "opp_turnovers": safe_float(opponent.get("turnovers"), 13.0),
            "opp_fg_pct": shooting(opponent, "fg", 0.45),
            "opp_three_pt_pct": shooting(opponent, "three_pt", 0.34),
        }

    team_history[home_id].append(team_record(home_stats, away_stats, score_home, score_away))
    team_history[away_id].append(team_record(away_stats, home_stats, score_away, score_home))

    # Head-to-head is keyed by the sorted pair so venue order does not matter.
    pair_key = tuple(sorted((home_id, away_id)))
    pair_history[pair_key].append(
        {
            "home_team_id": home_id,
            "away_team_id": away_id,
            "score_home": score_home,
            "score_away": score_away,
        }
    )
    league_history[league_id].append({"score_home": score_home, "score_away": score_away})
|
||
|
||
|
||
def main() -> None:
    """Extract the training CSV for the leagues listed in ``TOP_LEAGUES_PATH``.

    Matches are processed in the order ``ExtractionContext`` returns them
    (ascending kickoff). Each row is built from the histories *before* the
    match is appended to them, so a row only sees earlier matches. Matches
    that yield no row are counted as skipped but still update the histories.

    Raises:
        FileNotFoundError: if the top-leagues JSON file does not exist.
        RuntimeError: propagated from ``get_conn`` when ``DATABASE_URL`` is
            missing.
    """
    started_at = time.time()
    if not os.path.exists(TOP_LEAGUES_PATH):
        raise FileNotFoundError(TOP_LEAGUES_PATH)

    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as handle:
        league_ids = json.load(handle)

    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
    conn = get_conn()
    # The connection is released even when loading or writing fails.
    try:
        ctx = ExtractionContext(conn, league_ids)
        ctx.load()

        team_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
        pair_history: Dict[Tuple[str, str], List[Dict[str, Any]]] = defaultdict(list)
        league_history: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        extracted = 0
        skipped = 0
        with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=CSV_COLS)
            writer.writeheader()

            for idx, match in enumerate(ctx.matches, start=1):
                row = build_match_feature_row(match, ctx, team_history, pair_history, league_history)
                if row is None:
                    skipped += 1
                else:
                    writer.writerow(row)
                    extracted += 1
                # Update only after the row is built to keep features pre-match.
                update_histories(match, ctx, team_history, pair_history, league_history)

                if idx % 2000 == 0:
                    print(
                        f"[INFO] processed={idx} extracted={extracted} skipped={skipped}",
                        flush=True,
                    )
    finally:
        conn.close()

    print("[OK] Basketball V25 extraction complete", flush=True)
    print(f"[INFO] matches={len(ctx.matches)} extracted={extracted} skipped={skipped}", flush=True)
    print(f"[INFO] output={OUTPUT_CSV}", flush=True)
    print(f"[INFO] duration_sec={time.time() - started_at:.1f}", flush=True)


if __name__ == "__main__":
    main()
|
||
|