This commit is contained in:
@@ -0,0 +1,519 @@
|
||||
"""
|
||||
XGBoost Training Data Extraction (Advanced Basketball V21)
============================================================
Batch feature extraction for top-league basketball matches.
Extracts 60+ features per match, including deep team stats
(FG%, rebounds, per-quarter scoring pace).

Usage:
    python3 scripts/extract_advanced_basketball_data.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import csv
|
||||
import math
|
||||
import time
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# =============================================================================
# CONFIG
# =============================================================================
|
||||
# Directory two levels above this file (the parent of the directory that
# contains the script).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that directory first on the import path so modules located there win
# over same-named modules elsewhere.
sys.path.insert(0, AI_ENGINE_DIR)

# JSON file read by the script entry point; it sits one level above AI_ENGINE_DIR.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "basketball_top_leagues.json")
# Destination of the generated training CSV.
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "advanced_basketball_training_data.csv")

# Create the output directory at import time so the CSV can be opened later
# without further checks.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection from the ``DATABASE_URL`` environment variable.

    Everything from the first ``?schema=`` onwards is cut off before the URL
    is handed to psycopg2 (presumably to drop an ORM-style schema suffix that
    libpq does not understand -- confirm against the deployment config).
    Note that any parameters following ``?schema=`` are discarded as well.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    # partition() keeps only the text before the first "?schema=" marker; a URL
    # without the marker is passed through unchanged.
    db_url, _, _ = raw_url.partition("?schema=")
    return psycopg2.connect(db_url)
|
||||
|
||||
# =============================================================================
# FEATURE COLUMNS (ORDER MATTERS)
# =============================================================================
|
||||
# CSV header. The row assembled in process_matches() must list its values in
# exactly this order; that function aborts if the lengths differ.
FEATURE_COLS = [
    # Identifiers
    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",

    # Form & Winning
    "home_winning_streak", "away_winning_streak",
    "home_win_rate", "away_win_rate",

    # Home Team Offense (Averages of last 5)
    "home_pts_avg", "home_reb_avg", "home_ast_avg", "home_stl_avg", "home_blk_avg", "home_tov_avg",
    "home_fg_pct", "home_3pt_pct", "home_ft_pct",
    "home_q1_avg", "home_q2_avg", "home_q3_avg", "home_q4_avg",

    # Home Team Defense (Averages of opponent stats in last 5)
    "home_conc_pts", "home_conc_reb", "home_conc_ast", "home_conc_tov",
    "home_conc_fg_pct", "home_conc_3pt_pct",

    # Away Team Offense (Averages of last 5)
    "away_pts_avg", "away_reb_avg", "away_ast_avg", "away_stl_avg", "away_blk_avg", "away_tov_avg",
    "away_fg_pct", "away_3pt_pct", "away_ft_pct",
    "away_q1_avg", "away_q2_avg", "away_q3_avg", "away_q4_avg",

    # Away Team Defense (Averages of opponent stats in last 5)
    "away_conc_pts", "away_conc_reb", "away_conc_ast", "away_conc_tov",
    "away_conc_fg_pct", "away_conc_3pt_pct",

    # H2H Features
    "h2h_total_matches", "h2h_home_win_rate",
    "h2h_avg_points", "h2h_over140_rate",

    # Odds Features
    "odds_ml_h", "odds_ml_a",
    "odds_tot_o", "odds_tot_u", "odds_tot_line",
    "odds_spread_h", "odds_spread_a", "odds_spread_line",

    # Labels
    "score_home", "score_away", "total_points",
    "label_ml",      # 0=Home, 1=Away
    "label_tot",     # 0=Under, 1=Over (dynamic line)
    "label_spread",  # 0=Away Cover, 1=Home Cover (dynamic line)
]
|
||||
|
||||
# =============================================================================
# BATCH LOADERS
# =============================================================================
|
||||
|
||||
class AdvancedDataLoader:
    """Bulk-load matches, team box scores and odds, then derive pre-match features.

    After ``load_all()`` the instance exposes:

    * ``matches``          -- finished basketball matches, oldest first.
    * ``team_stats_cache`` -- ``(match_id, team_id)`` (both ``str``) -> stats row.
    * ``odds_cache``       -- ``match_id`` (``str``) -> dict with any of
      ``ml_h, ml_a, tot_o, tot_u, tot_line, spread_h, spread_a, spread_line``.
    * ``form_cache``       -- ``(team_id, mst_utc)`` -> form dict built only from
      matches strictly earlier than ``mst_utc``.
    * ``h2h_cache``        -- ``(home_id, away_id, mst_utc)`` -> head-to-head dict
      built only from earlier matches with the same home/away assignment.
    """

    # Columns summed from a team's own stats row (its "offense").
    _OFFENSE_KEYS = (
        "points", "rebounds", "assists", "steals", "blocks", "turnovers",
        "fg_made", "fg_attempted", "three_pt_made", "three_pt_attempted",
        "ft_made", "ft_attempted",
        "q1_score", "q2_score", "q3_score", "q4_score",
    )
    # Columns summed from the opponent's stats row (what the team conceded).
    _DEFENSE_KEYS = (
        "points", "rebounds", "assists", "turnovers",
        "fg_made", "fg_attempted", "three_pt_made", "three_pt_attempted",
    )

    def __init__(self, conn, top_league_ids: list):
        """Store the connection and create empty caches.

        Args:
            conn: Open psycopg2 connection; a ``RealDictCursor`` is created on it.
            top_league_ids: League ids used to restrict the match query. An
                empty/falsy value disables the league filter.
        """
        self.conn = conn
        self.cur = conn.cursor(cursor_factory=RealDictCursor)
        self.top_league_ids = top_league_ids

        self.matches = []
        self.odds_cache = {}
        self.team_stats_cache = {}  # (match_id, team_id) -> stats dict
        self.form_cache = {}
        self.h2h_cache = {}

    def load_all(self):
        """Run every loader in dependency order and print timing for each step."""
        t0 = time.time()
        self._load_matches()
        print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True)

        t1 = time.time()
        self._load_team_stats()
        print(f" ✅ Team Stats: {len(self.team_stats_cache)} records ({time.time()-t1:.1f}s)", flush=True)

        t2 = time.time()
        self._load_odds()
        print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t2:.1f}s)", flush=True)

        # Needs both self.matches and self.team_stats_cache to be populated.
        t3 = time.time()
        self._build_advanced_history()
        print(f" ✅ Advanced History & Stats cache built ({time.time()-t3:.1f}s)", flush=True)

        print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True)

    def _load_matches(self):
        """Fetch finished basketball matches with both scores present, oldest first."""
        # 1640995200000 ms since the epoch is 2022-01-01T00:00:00Z.
        query = """
            SELECT
                id, mst_utc, league_id, home_team_id, away_team_id,
                score_home, score_away
            FROM matches
            WHERE sport = 'basketball'
              AND status = 'FT'
              AND score_home IS NOT NULL
              AND score_away IS NOT NULL
              AND mst_utc > 1640995200000
        """
        if self.top_league_ids:
            # One placeholder per league id; the values themselves are bound
            # by the driver, never interpolated into the SQL text here.
            placeholders = ",".join(["%s"] * len(self.top_league_ids))
            query += f" AND league_id IN ({placeholders})"
            self.cur.execute(query + " ORDER BY mst_utc ASC", tuple(self.top_league_ids))
        else:
            self.cur.execute(query + " ORDER BY mst_utc ASC")

        self.matches = self.cur.fetchall()

    def _load_team_stats(self):
        """Cache every per-team stats row of finished basketball matches."""
        query = """
            SELECT
                match_id, team_id,
                points, rebounds, assists, steals, blocks, turnovers,
                fg_made, fg_attempted,
                three_pt_made, three_pt_attempted,
                ft_made, ft_attempted,
                q1_score, q2_score, q3_score, q4_score
            FROM basketball_team_stats
            WHERE match_id IN (
                SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT'
            )
        """
        self.cur.execute(query)
        for stats_row in self.cur.fetchall():
            # Keys are stringified so lookups do not depend on the DB id type.
            key = (str(stats_row["match_id"]), str(stats_row["team_id"]))
            self.team_stats_cache[key] = stats_row

    def _load_odds(self):
        """Load odd categories and their selections into ``odds_cache``."""
        # Using exact same odds parser as original script
        query = """
            SELECT match_id, name as category_name, db_id as category_id
            FROM odd_categories
            WHERE match_id IN (
                SELECT id FROM matches WHERE sport = 'basketball' AND status = 'FT'
            )
        """
        self.cur.execute(query)
        cats = self.cur.fetchall()

        cat_to_match = {c["category_id"]: c["match_id"] for c in cats}
        if not cat_to_match:
            return
        cat_id_to_name = {c["category_id"]: c["category_name"] for c in cats}

        # Selections are fetched in slices so a single IN (...) list stays bounded.
        chunk_size = 50000
        cat_ids = list(cat_to_match)

        for start in range(0, len(cat_ids), chunk_size):
            chunk = tuple(cat_ids[start:start + chunk_size])
            self.cur.execute(
                "SELECT odd_category_db_id, name, odd_value FROM odd_selections WHERE odd_category_db_id IN %s",
                (chunk,),
            )

            for row in self.cur.fetchall():
                c_id = row["odd_category_db_id"]
                m_id = str(cat_to_match[c_id])
                # The match gets an entry even when none of its odds turn out usable.
                self.odds_cache.setdefault(m_id, {})

                try:
                    odd_value = float(row["odd_value"])
                except (TypeError, ValueError):
                    # NULL or non-numeric price: skip this selection instead of
                    # aborting the whole extraction.
                    continue

                # A NULL category name is treated as an empty name (matches nothing).
                c_name = cat_id_to_name.get(c_id) or ""
                self._parse_single_odd(m_id, c_name, str(row["name"]), odd_value)

    @staticmethod
    def _parenthesized_payload(text):
        """Return the text inside the first ``(...)`` with ``,`` replaced by ``.``.

        Returns ``None`` when there is no opening parenthesis or no closing one
        after it.
        """
        left = text.find("(")
        right = text.find(")", left + 1)
        if left > -1 and right > -1:
            return text[left + 1:right].replace(",", ".")
        return None

    def _parse_single_odd(self, match_id, category_name, sel_name, odd_value):
        """Fold one odd selection into ``self.odds_cache[match_id]``.

        The entry for ``match_id`` must already exist. Prices ``<= 1.0`` are
        ignored. Category and selection names are matched case-insensitively.
        For totals and spread, the first line and the first price seen for
        each side are kept; later ones do not overwrite them. Moneyline prices
        are overwritten by later rows.

        Args:
            match_id: Key into ``self.odds_cache``.
            category_name: Market name, e.g. one containing ``alt/üst`` plus a
                line in parentheses.
            sel_name: Selection label (``"1"``, ``"2"``, over/under wording).
            odd_value: Decimal price of the selection.
        """
        if odd_value <= 1.0:
            return
        cat_lower = category_name.lower()
        sel_lower = sel_name.lower()
        target = self.odds_cache[match_id]

        # ML
        if cat_lower in ("maç sonucu (uzt. dahil)", "mac sonucu (uzt. dahil)", "maç sonucu", "mac sonucu"):
            if sel_lower == "1":
                target["ml_h"] = odd_value
            elif sel_lower == "2":
                target["ml_a"] = odd_value

        # Totals
        if "alt/üst" in cat_lower or "alt/ust" in cat_lower:
            line = None
            payload = self._parenthesized_payload(cat_lower)
            if payload is not None:
                try:
                    line = float(payload)
                except ValueError:
                    # Parenthesised text is not a number; leave the line unset.
                    line = None
            # A parsed line of 0 is treated like "no line" (truthiness check).
            if line and "tot_line" not in target:
                target["tot_line"] = line

            if "üst" in sel_lower or "ust" in sel_lower or "over" in sel_lower:
                target.setdefault("tot_o", odd_value)
            elif "alt" in sel_lower or "under" in sel_lower:
                target.setdefault("tot_u", odd_value)

        # Spread
        if "hnd. ms" in cat_lower or "hand. ms" in cat_lower or "hnd ms" in cat_lower:
            line = None
            payload = self._parenthesized_payload(cat_lower)
            if payload is not None and ":" in payload:
                parts = payload.split(":")
                try:
                    home_hcp = float(parts[0])
                    away_hcp = float(parts[1])
                except ValueError:
                    # Malformed "home:away" handicap; leave the line unset.
                    pass
                else:
                    # "0:x" -> -x, "x:0" -> +x, "x:x" (x > 0) -> 0; anything
                    # else leaves the line unset.
                    if abs(home_hcp) < 1e-6 and away_hcp > 0:
                        line = -away_hcp
                    elif home_hcp > 0 and abs(away_hcp) < 1e-6:
                        line = home_hcp
                    elif abs(home_hcp - away_hcp) < 1e-6 and home_hcp > 0:
                        line = 0.0
            if line is not None and "spread_line" not in target:
                target["spread_line"] = line

            if sel_lower == "1":
                target.setdefault("spread_h", odd_value)
            elif sel_lower == "2":
                target.setdefault("spread_a", odd_value)

    def _build_advanced_history(self):
        """Populate ``form_cache`` and ``h2h_cache`` from the loaded matches."""
        self._build_form_cache()
        self._build_h2h_cache()

    def _build_form_cache(self):
        """Compute each team's pre-match form for every match it played."""
        team_matches = defaultdict(list)
        for m in self.matches:
            mid = str(m["id"])
            hid = str(m["home_team_id"])
            aid = str(m["away_team_id"])
            utc = int(m["mst_utc"])

            h_stat = self.team_stats_cache.get((mid, hid))
            a_stat = self.team_stats_cache.get((mid, aid))
            if not (h_stat and a_stat):
                # If advanced stats are missing for either side, still record
                # the scores so streak / win-rate tracking stays complete.
                h_stat = a_stat = None

            # "offense" is what the team itself produced, "defense" what its
            # opponent produced in the same match.
            team_matches[hid].append({
                "utc": utc,
                "scored": m["score_home"], "conceded": m["score_away"],
                "offense": h_stat, "defense": a_stat,
            })
            team_matches[aid].append({
                "utc": utc,
                "scored": m["score_away"], "conceded": m["score_home"],
                "offense": a_stat, "defense": h_stat,
            })

        for team_id, hist in team_matches.items():
            hist.sort(key=lambda x: x["utc"])

            for i, match_info in enumerate(hist):
                mst_utc = match_info["utc"]
                # Strictly earlier games only, so same-timestamp games never
                # leak into each other's features.
                past = [x for x in hist[:i] if x["utc"] < mst_utc]
                if past:
                    self.form_cache[(team_id, mst_utc)] = self._summarize_form(past)
                else:
                    self.form_cache[(team_id, mst_utc)] = self._empty_form()

    def _summarize_form(self, past):
        """Build the form dict for a non-empty, chronologically sorted history.

        Win rate and winning streak use the whole history; stat averages use
        the last five games and divide by the number of those games that have
        stats. When none of the last five has stats, the neutral defaults from
        ``_empty_form`` are kept for everything except streak and win rate.
        """
        wins = sum(1 for x in past if x["scored"] > x["conceded"])
        win_rate = wins / len(past)

        streak = 0
        for x in reversed(past):
            if not x["scored"] > x["conceded"]:
                break
            streak += 1

        form = self._empty_form()
        form["winning_streak"] = streak
        form["win_rate"] = win_rate

        last_5 = past[-5:]
        valid_stats_count = sum(1 for x in last_5 if x["offense"] is not None)
        if valid_stats_count == 0:
            return form

        # Accumulate as floats so DB numeric types (e.g. Decimal) cannot break
        # the float division below; NULL columns count as 0.
        off = dict.fromkeys(self._OFFENSE_KEYS, 0.0)
        conc = dict.fromkeys(self._DEFENSE_KEYS, 0.0)
        for x in last_5:
            o = x["offense"]
            d = x["defense"]
            if not (o and d):
                continue
            for key in self._OFFENSE_KEYS:
                off[key] += float(o[key] or 0)
            for key in self._DEFENSE_KEYS:
                conc[key] += float(d[key] or 0)

        avg_c = float(valid_stats_count)
        form.update({
            "pts_avg": off["points"] / avg_c, "reb_avg": off["rebounds"] / avg_c,
            "ast_avg": off["assists"] / avg_c, "stl_avg": off["steals"] / avg_c,
            "blk_avg": off["blocks"] / avg_c, "tov_avg": off["turnovers"] / avg_c,
            # Shooting percentages are made/attempted over the whole window,
            # falling back to a neutral default when nothing was attempted.
            "fg_pct": self._ratio(off["fg_made"], off["fg_attempted"], 0.45),
            "3pt_pct": self._ratio(off["three_pt_made"], off["three_pt_attempted"], 0.35),
            "ft_pct": self._ratio(off["ft_made"], off["ft_attempted"], 0.75),
            "q1_avg": off["q1_score"] / avg_c, "q2_avg": off["q2_score"] / avg_c,
            "q3_avg": off["q3_score"] / avg_c, "q4_avg": off["q4_score"] / avg_c,

            "conc_pts": conc["points"] / avg_c, "conc_reb": conc["rebounds"] / avg_c,
            "conc_ast": conc["assists"] / avg_c, "conc_tov": conc["turnovers"] / avg_c,
            "conc_fg_pct": self._ratio(conc["fg_made"], conc["fg_attempted"], 0.45),
            "conc_3pt_pct": self._ratio(conc["three_pt_made"], conc["three_pt_attempted"], 0.35),
        })
        return form

    @staticmethod
    def _ratio(made, attempted, default):
        """Return ``made / attempted``, or ``default`` when nothing was attempted."""
        return made / attempted if attempted > 0 else default

    def _build_h2h_cache(self):
        """Compute directional head-to-head stats (same home and away team)."""
        h2h_map = defaultdict(list)
        for m in self.matches:
            directional_pair = (str(m["home_team_id"]), str(m["away_team_id"]))
            # int() keeps the cache key identical to the one process_matches()
            # looks up and to the form-cache key.
            h2h_map[directional_pair].append((int(m["mst_utc"]), m["score_home"], m["score_away"]))

        for (h_id, a_id), hist in h2h_map.items():
            hist.sort(key=lambda x: x[0])
            for i, (mst_utc, _score_home, _score_away) in enumerate(hist):
                past = [x for x in hist[:i] if x[0] < mst_utc]
                if not past:
                    # No earlier meeting: neutral defaults.
                    self.h2h_cache[(h_id, a_id, mst_utc)] = {
                        "total": 0, "home_win_rate": 0.5,
                        "avg_points": 160.0, "over140_rate": 0.5,
                    }
                    continue

                n_past = len(past)
                home_wins = sum(1 for x in past if x[1] > x[2])
                total_pts = sum(x[1] + x[2] for x in past)
                over140 = sum(1 for x in past if x[1] + x[2] > 140)
                self.h2h_cache[(h_id, a_id, mst_utc)] = {
                    "total": n_past, "home_win_rate": home_wins / n_past,
                    "avg_points": total_pts / n_past, "over140_rate": over140 / n_past,
                }

    def _empty_form(self):
        """Return a fresh dict of neutral default form values."""
        return {
            "winning_streak": 0, "win_rate": 0.5,
            "pts_avg": 80.0, "reb_avg": 35.0, "ast_avg": 20.0,
            "stl_avg": 7.0, "blk_avg": 3.0, "tov_avg": 13.0,
            "fg_pct": 0.45, "3pt_pct": 0.35, "ft_pct": 0.75,
            "q1_avg": 20.0, "q2_avg": 20.0, "q3_avg": 20.0, "q4_avg": 20.0,

            "conc_pts": 80.0, "conc_reb": 35.0, "conc_ast": 20.0, "conc_tov": 13.0,
            "conc_fg_pct": 0.45, "conc_3pt_pct": 0.35,
        }
|
||||
|
||||
# =============================================================================
# FEATURE EXTRACTION PIPELINE
# =============================================================================
|
||||
|
||||
# (form-dict key, fallback) pairs in the column order used for each team's
# offense and defense features in FEATURE_COLS.
_TEAM_FORM_FEATURES = (
    # Offense
    ("pts_avg", 80), ("reb_avg", 35), ("ast_avg", 20),
    ("stl_avg", 7), ("blk_avg", 3), ("tov_avg", 13),
    ("fg_pct", 0.45), ("3pt_pct", 0.35), ("ft_pct", 0.75),
    ("q1_avg", 20), ("q2_avg", 20), ("q3_avg", 20), ("q4_avg", 20),
    # Defense
    ("conc_pts", 80), ("conc_reb", 35), ("conc_ast", 20), ("conc_tov", 13),
    ("conc_fg_pct", 0.45), ("conc_3pt_pct", 0.35),
)


def _team_form_values(form):
    """Return one team's offense + defense feature values in column order."""
    return [form.get(key, fallback) for key, fallback in _TEAM_FORM_FEATURES]


def process_matches(loader: AdvancedDataLoader):
    """Write one CSV row of features and labels per match that has moneyline odds.

    Matches lacking either moneyline price are skipped and counted. The file
    at ``OUTPUT_CSV`` is overwritten; its header is ``FEATURE_COLS``. A summary
    is printed at the end.

    Args:
        loader: Loader whose ``load_all()`` has already been called.

    Raises:
        SystemExit: If an assembled row does not have as many values as
            ``FEATURE_COLS`` has columns.
    """
    extracted_count = 0
    missing_odds_count = 0

    # The context manager closes the file on every exit path, including the
    # sys.exit() below.
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(FEATURE_COLS)

        for match in loader.matches:
            mid = str(match["id"])
            mst = int(match["mst_utc"])
            hid = str(match["home_team_id"])
            aid = str(match["away_team_id"])

            s_home = int(match["score_home"])
            s_away = int(match["score_away"])
            total_pts = s_home + s_away

            c_odds = loader.odds_cache.get(mid, {})
            if "ml_h" not in c_odds or "ml_a" not in c_odds:
                missing_odds_count += 1
                continue

            c_form_h = loader.form_cache.get((hid, mst), {})
            c_form_a = loader.form_cache.get((aid, mst), {})
            c_h2h = loader.h2h_cache.get((hid, aid, mst), {})

            # Anything that is not a home win (including a tie) is labelled 1.
            label_ml = 0 if s_home > s_away else 1

            # Without a parsed totals line, a fixed 160.0 is used for both the
            # feature and the label.
            line_tot = c_odds.get("tot_line", 160.0)
            label_tot = 1 if total_pts > line_tot else 0

            # The spread line is added to the home score; an exact tie after
            # the handicap is labelled 0.
            line_spread = c_odds.get("spread_line", 0.0)
            hc_score = float(s_home) + float(line_spread)
            label_spread = 1 if hc_score > float(s_away) else 0

            row = [
                mid, hid, aid, match.get("league_id", ""), mst,
                c_form_h.get("winning_streak", 0), c_form_a.get("winning_streak", 0),
                # 0.5 matches the neutral win rate returned by _empty_form().
                c_form_h.get("win_rate", 0.5), c_form_a.get("win_rate", 0.5),
                *_team_form_values(c_form_h),
                *_team_form_values(c_form_a),
                c_h2h.get("total", 0), c_h2h.get("home_win_rate", 0.5),
                c_h2h.get("avg_points", 160.0), c_h2h.get("over140_rate", 0.5),
                c_odds.get("ml_h", 1.9), c_odds.get("ml_a", 1.9),
                c_odds.get("tot_o", 1.9), c_odds.get("tot_u", 1.9), line_tot,
                c_odds.get("spread_h", 1.9), c_odds.get("spread_a", 1.9), line_spread,
                s_home, s_away, total_pts,
                label_ml, label_tot, label_spread,
            ]

            # Guard against the row layout drifting away from FEATURE_COLS.
            if len(row) != len(FEATURE_COLS):
                print(f"Error: Row length mismatch {len(row)} != {len(FEATURE_COLS)}")
                sys.exit(1)

            writer.writerow(row)
            extracted_count += 1

    print("\nExtraction Summary")
    print("=========================")
    print(f"Total Matches in Scope: {len(loader.matches)}")
    print(f"Filtered (Missing ML Odds): {missing_odds_count}")
    print(f"✅ Successfully Extracted: {extracted_count}")
    print(f"📂 Saved to: {OUTPUT_CSV}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the league list, load everything from the
    # database, write the CSV and report the total run time.
    t_start = time.time()

    if not os.path.exists(TOP_LEAGUES_PATH):
        print(f"Error: file not found {TOP_LEAGUES_PATH}")
        sys.exit(1)

    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
        # NOTE(review): the parsed JSON is passed straight to the loader as
        # the league-id list -- confirm the file is a flat list of ids.
        top_leagues = json.load(f)

    print("🏀 Extracting Advanced Basketball Training Data (V21)")
    print("=====================================================")
    print(f"Loaded {len(top_leagues)} top leagues.")

    conn = get_conn()
    try:
        loader = AdvancedDataLoader(conn, top_leagues)
        loader.load_all()
        process_matches(loader)
    finally:
        # Release the connection even when loading or extraction fails.
        conn.close()

    print(f"Total Script Run Time: {time.time()-t_start:.1f}s")
|
||||
Reference in New Issue
Block a user