""" XGBoost Training Data Extraction ================================= Batch feature extraction for top-league matches. Extracts ~82 features + labels per match for XGBoost model training. Usage: python3 scripts/extract_training_data.py """ import os import sys import json import csv import math import time import bisect from datetime import datetime from collections import defaultdict import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() # ============================================================================= # CONFIG # ============================================================================= AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) from features.upset_engine import get_upset_engine from features.referee_engine import get_referee_engine from features.momentum_engine import get_momentum_engine TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") # Ensure output dir exists os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True) def get_conn(): db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] return psycopg2.connect(db_url) # ============================================================================= # FEATURE COLUMNS (ORDER MATTERS — matches CSV header) # ============================================================================= FEATURE_COLS = [ # Match identifiers "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc", # ELO Features (8) "home_overall_elo", "away_overall_elo", "elo_diff", "home_home_elo", "away_away_elo", "home_form_elo", "away_form_elo", "form_elo_diff", # Form Features (12) "home_goals_avg", "home_conceded_avg", "away_goals_avg", "away_conceded_avg", "home_clean_sheet_rate", "away_clean_sheet_rate", "home_scoring_rate", "away_scoring_rate", "home_winning_streak", "away_winning_streak", "home_unbeaten_streak", "away_unbeaten_streak", # H2H Features (6) "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate", "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate", # Team Stats Features (8) "home_avg_possession", "away_avg_possession", "home_avg_shots_on_target", "away_avg_shots_on_target", "home_shot_conversion", "away_shot_conversion", "home_avg_corners", "away_avg_corners", # Odds Features (24) "odds_ms_h", "odds_ms_d", "odds_ms_a", "implied_home", "implied_draw", "implied_away", "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", "odds_ou05_o", "odds_ou05_u", "odds_ou15_o", "odds_ou15_u", "odds_ou25_o", "odds_ou25_u", "odds_ou35_o", "odds_ou35_u", "odds_ht_ou05_o", "odds_ht_ou05_u", "odds_ht_ou15_o", "odds_ht_ou15_u", "odds_btts_y", "odds_btts_n", "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present", "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present", "odds_ou05_o_present", "odds_ou05_u_present", "odds_ou15_o_present", "odds_ou15_u_present", "odds_ou25_o_present", "odds_ou25_u_present", "odds_ou35_o_present", "odds_ou35_u_present", "odds_ht_ou05_o_present", "odds_ht_ou05_u_present", "odds_ht_ou15_o_present", "odds_ht_ou15_u_present", "odds_btts_y_present", "odds_btts_n_present", # Defensive/League Features (4) "home_xga", "away_xga", "league_avg_goals", "league_zero_goal_rate", # Upset Engine (4) "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential", # Referee Engine (5) "referee_home_bias", "referee_avg_goals", "referee_cards_total", "referee_avg_yellow", "referee_experience", # Momentum Engine (3) "home_momentum_score", "away_momentum_score", "momentum_diff", # Squad Features (9) "home_squad_quality", "away_squad_quality", "squad_diff", "home_key_players", "away_key_players", "home_missing_impact", "away_missing_impact", "home_goals_form", "away_goals_form", # Player-Level Features (12) "home_lineup_goals_per90", "away_lineup_goals_per90", "home_lineup_assists_per90", "away_lineup_assists_per90", "home_squad_continuity", "away_squad_continuity", "home_top_scorer_form", "away_top_scorer_form", "home_avg_player_exp", "away_avg_player_exp", "home_goals_diversity", "away_goals_diversity", # Labels "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away", "ht_total_goals", "label_ms", # 0=Home, 1=Draw, 2=Away "label_ou05", # 0=Under, 1=Over "label_ou15", "label_ou25", "label_ou35", "label_btts", # 0=No, 1=Yes "label_ht_result", # 0=Home, 1=Draw, 2=Away "label_ht_ou05", # 0=Under, 1=Over "label_ht_ou15", "label_ht_ft", # 0=1/1, 1=1/X, 2=1/2 ... 8=2/2 "label_odd_even", # 1=Odd, 0=Even "label_yellow_cards",# Count of cards (yellow=1, red=2) "label_cards_ou45", # 0=Under 4.5, 1=Over 4.5 "label_handicap_ms", # Handikap (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins ] # ============================================================================= # BATCH LOADERS — Pre-load data to avoid N+1 queries # ============================================================================= class BatchDataLoader: """Pre-loads all necessary data in bulk, then serves features per match.""" def __init__(self, conn, top_league_ids: list): self.conn = conn self.cur = conn.cursor() self.top_league_ids = top_league_ids # Pre-loaded data caches self.matches = [] self.odds_cache = {} # match_id → {ms_h, ms_d, ms_a, ...} self.team_stats_cache = {} # (team_id, before_date_bucket) → stats self.form_cache = {} # (team_id, match_id) → form features self.h2h_cache = {} # (home_id, away_id, match_id) → h2h features self.league_stats_cache = {} # league_id → {avg_goals, zero_rate} self.squad_cache = {} # (match_id, team_id) → {starting, goals, assists, key_players, ...} self.cards_cache = {} # match_id → total cards scored def load_all(self): """Load all data in batch.""" t0 = time.time() self._load_matches() print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True) t1 = time.time() self._load_odds() print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True) t2 = time.time() self._load_league_stats() print(f" ✅ League stats: {len(self.league_stats_cache)} leagues ({time.time()-t2:.1f}s)", flush=True) t3 = time.time() self._load_team_history() print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True) t4 = time.time() self._load_squad_data() print(f" ✅ Squad data: {len(self.squad_cache)} team-matches ({time.time()-t4:.1f}s)", flush=True) t5 = time.time() self._load_cards_data() print(f" ✅ Cards data: {len(self.cards_cache)} matches ({time.time()-t5:.1f}s)", flush=True) print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True) def _load_matches(self): ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, m.ht_score_home, m.ht_score_away, m.mst_utc, m.league_id, ht.name as home_name, at.name as away_name, l.name as league_name FROM matches m JOIN teams ht ON m.home_team_id = ht.id JOIN teams at ON m.away_team_id = at.id JOIN leagues l ON m.league_id = l.id WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.sport = 'football' AND m.league_id IN ({ph}) ORDER BY m.mst_utc ASC """, self.top_league_ids) self.matches = self.cur.fetchall() def _load_odds(self): """Bulk load all odds for top league matches.""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT oc.match_id, oc.name, os.name, os.odd_value FROM odd_selections os JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id JOIN matches m ON oc.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) """, self.top_league_ids) for match_id, cat_name, sel_name, odd_val in self.cur.fetchall(): try: v = float(odd_val) if odd_val else 0 if v <= 0 or not cat_name or not sel_name: continue if match_id not in self.odds_cache: self.odds_cache[match_id] = {} cat_lower = cat_name.lower().strip() sel_lower = sel_name.lower().strip() # Match Result (1X2) if cat_lower == 'maç sonucu': if sel_name == '1': self.odds_cache[match_id]['ms_h'] = v elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ms_d'] = v elif sel_name == '2': self.odds_cache[match_id]['ms_a'] = v # HT Result elif cat_lower == '1. yarı sonucu': if sel_name == '1': self.odds_cache[match_id]['ht_ms_h'] = v elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ht_ms_d'] = v elif sel_name == '2': self.odds_cache[match_id]['ht_ms_a'] = v # BTTS elif cat_lower == 'karşılıklı gol': if 'var' in sel_lower: self.odds_cache[match_id]['btts_y'] = v elif 'yok' in sel_lower: self.odds_cache[match_id]['btts_n'] = v # Over/Under FT elif cat_lower == '0,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou05_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou05_o'] = v elif cat_lower == '1,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou15_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou15_o'] = v elif cat_lower == '2,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou25_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou25_o'] = v elif cat_lower == '3,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou35_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou35_o'] = v # Over/Under HT elif cat_lower == '1. yarı 0,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou05_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou05_o'] = v elif cat_lower == '1. yarı 1,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou15_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou15_o'] = v except (ValueError, TypeError): pass def _load_league_stats(self): """Calculate league-level aggregated stats.""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT league_id, AVG(score_home + score_away) as avg_goals, AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate, COUNT(*) as match_count FROM matches WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football' AND league_id IN ({ph}) GROUP BY league_id """, self.top_league_ids) for league_id, avg_goals, zero_rate, cnt in self.cur.fetchall(): self.league_stats_cache[league_id] = { "avg_goals": float(avg_goals) if avg_goals else 2.5, "zero_rate": float(zero_rate) if zero_rate else 0.07, "match_count": cnt } def _load_team_history(self): """Bulk load all matches and team stats for lightning fast memory lookups""" from collections import defaultdict self.team_matches = defaultdict(list) self.team_stats = defaultdict(list) # Load all matches for form/h2h self.cur.execute(""" SELECT home_team_id, away_team_id, score_home, score_away, mst_utc FROM matches WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football' ORDER BY mst_utc ASC """) for hid, aid, sh, sa, mst in self.cur.fetchall(): # (mst, is_home, team_score, opp_score, opp_id) self.team_matches[hid].append((mst, True, sh, sa, aid)) self.team_matches[aid].append((mst, False, sa, sh, hid)) # Load all football_team_stats (sport-partitioned schema) self.cur.execute(""" SELECT mts.team_id, m.mst_utc, mts.possession_percentage, mts.shots_on_target, mts.total_shots, mts.corners, m.score_home, m.score_away, m.home_team_id FROM football_team_stats mts JOIN matches m ON mts.match_id = m.id WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL ORDER BY m.mst_utc ASC """) for tid, mst, poss, sot, tshots, corn, sh, sa, hid in self.cur.fetchall(): team_goals = sh if hid == tid else sa self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals)) def _load_squad_data(self): """Bulk load squad participation + player events + player career for squad features.""" ph = ",".join(["%s"] * len(self.top_league_ids)) # 1) Participation: starting XI count + position distribution per (match, team) self.cur.execute(f""" SELECT mpp.match_id, mpp.team_id, COUNT(*) FILTER (WHERE mpp.is_starting = true) AS starting_count, COUNT(*) AS total_squad, COUNT(*) FILTER (WHERE mpp.is_starting = true AND LOWER(COALESCE(mpp.position::TEXT,'')) ~ '(forward|fwd|forvet|striker)') AS fwd_count FROM match_player_participation mpp JOIN matches m ON mpp.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) GROUP BY mpp.match_id, mpp.team_id """, self.top_league_ids) participation = {} for mid, tid, st_count, tot, fwd in self.cur.fetchall(): participation[(mid, tid)] = { 'starting_count': st_count or 0, 'total_squad': tot or 0, 'fwd_count': fwd or 0, } # 2) Player events: goals + assists per (match, team) self.cur.execute(f""" SELECT mpe.match_id, mpe.team_id, COUNT(*) FILTER ( WHERE mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' ) AS goal_count, COUNT(DISTINCT mpe.assist_player_id) FILTER ( WHERE mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL ) AS assist_count, COUNT(DISTINCT mpe.player_id) FILTER ( WHERE mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' ) AS unique_scorers FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) GROUP BY mpe.match_id, mpe.team_id """, self.top_league_ids) events = {} for mid, tid, goals, assists, scorers in self.cur.fetchall(): events[(mid, tid)] = { 'goals': goals or 0, 'assists': assists or 0, 'unique_scorers': scorers or 0, } # 3) Key players: players with 3+ goals across all their matches, per team self.cur.execute(f""" SELECT mpe.team_id, mpe.player_id, COUNT(*) AS total_goals FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) AND mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*) >= 3 """, self.top_league_ids) key_players_by_team = defaultdict(set) for tid, pid, _ in self.cur.fetchall(): key_players_by_team[tid].add(pid) # 4) Starting key players per (match, team) self.cur.execute(f""" SELECT mpp.match_id, mpp.team_id, mpp.player_id FROM match_player_participation mpp JOIN matches m ON mpp.match_id = m.id WHERE mpp.is_starting = true AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) """, self.top_league_ids) starting_players = defaultdict(list) for mid, tid, pid in self.cur.fetchall(): starting_players[(mid, tid)].append(pid) # 5) Build match_id → mst_utc mapping for temporal filtering match_mst = {} for m in self.matches: match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc # ─── NEW: Player Career Stats (prefix-sum for O(1) temporal lookup) ─── # 6a) Goals per player per match date self.cur.execute(f""" SELECT mpe.player_id, m.mst_utc, SUM(CASE WHEN mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' THEN 1 ELSE 0 END) AS goals FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) GROUP BY mpe.player_id, m.mst_utc """, self.top_league_ids) player_goals_raw = defaultdict(dict) for pid, mst, goals in self.cur.fetchall(): player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0) # 6b) Assists per player per match date self.cur.execute(f""" SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL GROUP BY mpe.assist_player_id, m.mst_utc """, self.top_league_ids) player_assists_raw = defaultdict(dict) for pid, mst, assists in self.cur.fetchall(): player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0) # 6c) Player participation dates (starts only) self.cur.execute(f""" SELECT mpp.player_id, m.mst_utc FROM match_player_participation mpp JOIN matches m ON mpp.match_id = m.id WHERE mpp.is_starting = true AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) ORDER BY mpp.player_id, m.mst_utc """, self.top_league_ids) player_starts_raw = defaultdict(list) for pid, mst in self.cur.fetchall(): player_starts_raw[pid].append(mst) # 6d) Build prefix sums per player (goals_prefix[i] = total goals up to start i) player_career = {} all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys()) for pid in all_pids: starts = sorted(set(player_starts_raw.get(pid, []))) if not starts: continue g_map = player_goals_raw.get(pid, {}) a_map = player_assists_raw.get(pid, {}) cum_g, cum_a = 0, 0 goals_pf, assists_pf = [], [] for mst in starts: cum_g += g_map.get(mst, 0) cum_a += a_map.get(mst, 0) goals_pf.append(cum_g) assists_pf.append(cum_a) player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf} # Free raw dicts del player_goals_raw, player_assists_raw, player_starts_raw print(f" 📊 Player careers built: {len(player_career)} players", flush=True) # ─── NEW: Team Lineup History (for squad continuity) ─── # 7) Per-team sorted lineups: [(mst, frozenset(player_ids))] team_lineup_map = defaultdict(list) for (mid, tid), pids in starting_players.items(): mst = match_mst.get(mid, 0) if mst > 0 and pids: team_lineup_map[tid].append((mst, frozenset(pids))) team_lineup_history = {} team_lineup_msts = {} for tid, ll in team_lineup_map.items(): ll.sort(key=lambda x: x[0]) team_lineup_history[tid] = ll team_lineup_msts[tid] = [x[0] for x in ll] del team_lineup_map # ─── 8) Build combined cache — NO DATA LEAKAGE ─── all_keys = set(participation.keys()) | set(events.keys()) for key in all_keys: mid, tid = key part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}) # Count key players in starting XI starters = starting_players.get(key, []) kp_in_starting = sum(1 for p in starters if p in key_players_by_team.get(tid, set())) kp_total = len(key_players_by_team.get(tid, set())) kp_missing = max(0, kp_total - kp_in_starting) # Squad quality: composite score — ONLY pre-match info squad_quality = ( part['starting_count'] * 0.3 + kp_in_starting * 3.0 + part['fwd_count'] * 1.5 ) missing_impact = min(kp_missing / max(kp_total, 1), 1.0) # goals_form: avg goals from last 5 matches BEFORE this match current_mst = match_mst.get(mid, 0) team_history = self.team_matches.get(tid, []) recent_goals = [ tm[2] for tm in team_history if tm[0] < current_mst ][-5:] goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3 # ─── NEW: Player-level aggregation for starting XI ─── lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0 best_scorer_total, best_scorer_id = 0, None scorers_in_lineup = 0 for pid in starters: pc = player_career.get(pid) if not pc: continue idx = bisect.bisect_left(pc['msts'], current_mst) if idx == 0: continue # no prior matches for this player prior_starts = idx prior_goals = pc['gp'][idx - 1] prior_assists = pc['ap'][idx - 1] lineup_g90 += prior_goals / prior_starts lineup_a90 += prior_assists / prior_starts total_exp += prior_starts if prior_goals > 0: scorers_in_lineup += 1 if prior_goals > best_scorer_total: best_scorer_total = prior_goals best_scorer_id = pid n_st = len(starters) or 1 # Top scorer recent form (goals in last 5 starts) top_scorer_form = 0 if best_scorer_id: pc = player_career.get(best_scorer_id) if pc: idx = bisect.bisect_left(pc['msts'], current_mst) if idx > 0: s5 = max(0, idx - 5) top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0) # Squad continuity (overlap with previous match lineup) squad_continuity = 0.5 msts_list = team_lineup_msts.get(tid) if msts_list: li = bisect.bisect_left(msts_list, current_mst) if li > 0: prev_lineup = team_lineup_history[tid][li - 1][1] squad_continuity = len(frozenset(starters) & prev_lineup) / n_st self.squad_cache[key] = { 'squad_quality': squad_quality, 'key_players': kp_in_starting, 'missing_impact': missing_impact, 'goals_form': round(goals_form, 2), 'lineup_goals_per90': round(lineup_g90, 3), 'lineup_assists_per90': round(lineup_a90, 3), 'squad_continuity': round(squad_continuity, 3), 'top_scorer_form': top_scorer_form, 'avg_player_exp': round(total_exp / n_st, 1), 'goals_diversity': round(scorers_in_lineup / n_st, 3), } def _load_cards_data(self): """Bulk load all distinct cards per match (yellow=1, red=2).""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT mpe.match_id, SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1 WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END) as cards_weight FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) AND mpe.event_type::text LIKE '%%card%%' GROUP BY mpe.match_id """, self.top_league_ids) for mid, cards_weight in self.cur.fetchall(): self.cards_cache[mid] = float(cards_weight) if cards_weight else 0.0 class FeatureExtractor: """Extract features for a single match using pre-loaded data + on-demand queries.""" def __init__(self, conn, loader: BatchDataLoader): self.conn = conn self.cur = conn.cursor() self.loader = loader # ELO cache: team_id → {overall, home, away, form} self.elo_ratings = defaultdict(lambda: { "overall": 1500.0, "home": 1500.0, "away": 1500.0, "form": 1500.0, "matches": 0 }) self._elo_initialized = False self.upset_engine = get_upset_engine() self.referee_engine = get_referee_engine() self.momentum_engine = get_momentum_engine() # ── Data Quality Thresholds ── # Matches below these thresholds produce default-only features that # teach the model noise rather than signal. DQ_MIN_FORM_MATCHES = 3 # team must have ≥3 prior matches DQ_MIN_FEATURE_COVERAGE = 0.30 # ≥30% of key features must be non-default def extract_all(self) -> list: """Extract features for all matches with data quality validation.""" matches = self.loader.matches total = len(matches) rows = [] skipped = 0 dq_rejected = 0 dq_reasons: dict = defaultdict(int) t_start = time.time() print(f"\n🔄 Extracting features for {total} matches...", flush=True) # Process chronologically — ELO grows as we go for i, m in enumerate(matches): ( mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ) = m if i % 100 == 0 and i > 0: elapsed = time.time() - t_start rate = i / elapsed # matches per second remaining = (total - i) / rate if rate > 0 else 0 pct = i / total * 100 print( f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | " f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | " f"dq_rejected: {dq_rejected}", flush=True, ) row = self._extract_one( mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ) if row: # ── Data Quality Gate ── dq_pass, reason = self._validate_row_quality(row, hid, aid, mst) if dq_pass: rows.append(row) else: dq_rejected += 1 dq_reasons[reason] += 1 else: skipped += 1 # Update ELO after processing (so ELO is calculated BEFORE the match) self._update_elo(hid, aid, sh, sa) print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True) if dq_reasons: print(f" 📊 DQ Rejection reasons:") for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]): print(f" {reason}: {count}") return rows def _extract_one( self, mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ): """Extract features for a single match.""" # === LABELS === total_goals = sh + sa ht_total = (hth + hta) if hth is not None and hta is not None else None if sh > sa: label_ms = 0 elif sh < sa: label_ms = 2 else: label_ms = 1 label_ou05 = 1 if total_goals > 0.5 else 0 label_ou15 = 1 if total_goals > 1.5 else 0 label_ou25 = 1 if total_goals > 2.5 else 0 label_ou35 = 1 if total_goals > 3.5 else 0 label_btts = 1 if (sh > 0 and sa > 0) else 0 label_odd_even = 1 if total_goals % 2 != 0 else 0 # Handicap MS Label (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins score_diff = sh - sa if score_diff >= 2: label_handicap_ms = 0 elif score_diff == 1: label_handicap_ms = 1 else: label_handicap_ms = 2 # Cards Label cards_count = self.loader.cards_cache.get(mid, 0.0) label_yellow_cards = cards_count label_cards_ou45 = 1 if cards_count > 4.5 else 0 # HT labels label_ht_result = None label_ht_ou05 = None label_ht_ou15 = None if hth is not None and hta is not None: if hth > hta: label_ht_result = 0 elif hth < hta: label_ht_result = 2 else: label_ht_result = 1 label_ht_ou05 = 1 if ht_total > 0.5 else 0 label_ht_ou15 = 1 if ht_total > 1.5 else 0 # HT/FT Label (0-8) # 0: 1/1, 1: 1/X, 2: 1/2 # 3: X/1, 4: X/X, 5: X/2 # 6: 2/1, 7: 2/X, 8: 2/2 label_ht_ft = None if label_ht_result is not None: label_ht_ft = label_ht_result * 3 + label_ms # === ELO FEATURES === h_elo = self.elo_ratings[hid] a_elo = self.elo_ratings[aid] elo_features = { "home_overall_elo": h_elo["overall"], "away_overall_elo": a_elo["overall"], "elo_diff": h_elo["overall"] - a_elo["overall"], "home_home_elo": h_elo["home"], "away_away_elo": a_elo["away"], "home_form_elo": h_elo["form"], "away_form_elo": a_elo["form"], "form_elo_diff": h_elo["form"] - a_elo["form"], } # === FORM FEATURES === form_features = self._get_form_features(hid, aid, mst) # === H2H FEATURES === h2h_features = self._get_h2h_features(hid, aid, mst) # === TEAM STATS FEATURES === stats_features = self._get_team_stats_features(hid, aid, mst) # === ODDS FEATURES === odds = self.loader.odds_cache.get(mid, {}) ms_h = odds.get("ms_h", 0.0) ms_d = odds.get("ms_d", 0.0) ms_a = odds.get("ms_a", 0.0) # Implied probabilities (normalized vig-free) if ms_h > 0 and ms_d > 0 and ms_a > 0: raw_sum = 1/ms_h + 1/ms_d + 1/ms_a implied_home = (1/ms_h) / raw_sum implied_draw = (1/ms_d) / raw_sum implied_away = (1/ms_a) / raw_sum else: implied_home = implied_draw = implied_away = 0.33 odds_features = { "odds_ms_h": ms_h, "odds_ms_d": ms_d, "odds_ms_a": ms_a, "implied_home": implied_home, "implied_draw": implied_draw, "implied_away": implied_away, "odds_ht_ms_h": odds.get("ht_ms_h", 0.0), "odds_ht_ms_d": odds.get("ht_ms_d", 0.0), "odds_ht_ms_a": odds.get("ht_ms_a", 0.0), "odds_ou05_o": odds.get("ou05_o", 0.0), "odds_ou05_u": odds.get("ou05_u", 0.0), "odds_ou15_o": odds.get("ou15_o", 0.0), "odds_ou15_u": odds.get("ou15_u", 0.0), "odds_ou25_o": odds.get("ou25_o", 0.0), "odds_ou25_u": odds.get("ou25_u", 0.0), "odds_ou35_o": odds.get("ou35_o", 0.0), "odds_ou35_u": odds.get("ou35_u", 0.0), "odds_ht_ou05_o": odds.get("ht_ou05_o", 0.0), "odds_ht_ou05_u": odds.get("ht_ou05_u", 0.0), "odds_ht_ou15_o": odds.get("ht_ou15_o", 0.0), "odds_ht_ou15_u": odds.get("ht_ou15_u", 0.0), "odds_btts_y": odds.get("btts_y", 0.0), "odds_btts_n": odds.get("btts_n", 0.0), "odds_ms_h_present": 1.0 if ms_h > 1.01 else 0.0, "odds_ms_d_present": 1.0 if ms_d > 1.01 else 0.0, "odds_ms_a_present": 1.0 if ms_a > 1.01 else 0.0, "odds_ht_ms_h_present": 1.0 if odds.get("ht_ms_h", 0.0) > 1.01 else 0.0, "odds_ht_ms_d_present": 1.0 if odds.get("ht_ms_d", 0.0) > 1.01 else 0.0, "odds_ht_ms_a_present": 1.0 if odds.get("ht_ms_a", 0.0) > 1.01 else 0.0, "odds_ou05_o_present": 1.0 if odds.get("ou05_o", 0.0) > 1.01 else 0.0, "odds_ou05_u_present": 1.0 if odds.get("ou05_u", 0.0) > 1.01 else 0.0, "odds_ou15_o_present": 1.0 if odds.get("ou15_o", 0.0) > 1.01 else 0.0, "odds_ou15_u_present": 1.0 if odds.get("ou15_u", 0.0) > 1.01 else 0.0, "odds_ou25_o_present": 1.0 if odds.get("ou25_o", 0.0) > 1.01 else 0.0, "odds_ou25_u_present": 1.0 if odds.get("ou25_u", 0.0) > 1.01 else 0.0, "odds_ou35_o_present": 1.0 if odds.get("ou35_o", 0.0) > 1.01 else 0.0, "odds_ou35_u_present": 1.0 if odds.get("ou35_u", 0.0) > 1.01 else 0.0, "odds_ht_ou05_o_present": 1.0 if odds.get("ht_ou05_o", 0.0) > 1.01 else 0.0, "odds_ht_ou05_u_present": 1.0 if odds.get("ht_ou05_u", 0.0) > 1.01 else 0.0, "odds_ht_ou15_o_present": 1.0 if odds.get("ht_ou15_o", 0.0) > 1.01 else 0.0, "odds_ht_ou15_u_present": 1.0 if odds.get("ht_ou15_u", 0.0) > 1.01 else 0.0, "odds_btts_y_present": 1.0 if odds.get("btts_y", 0.0) > 1.01 else 0.0, "odds_btts_n_present": 1.0 if odds.get("btts_n", 0.0) > 1.01 else 0.0, } # === LEAGUE FEATURES === league = self.loader.league_stats_cache.get(lid, {"avg_goals": 2.5, "zero_rate": 0.07}) league_features = { "league_avg_goals": league["avg_goals"], "league_zero_goal_rate": league["zero_rate"], } # === UPSET FEATURES === try: upset_feats = self.upset_engine.get_features( home_team_name=home_name or "", home_team_id=hid, away_team_name=away_name or "", league_name=league_name or "", home_position=10, away_position=10, match_date_ms=mst, ) except Exception: upset_feats = { "upset_atmosphere": 0.0, "upset_motivation": 0.0, "upset_fatigue": 0.0, "upset_potential": 0.0, } # === REFEREE FEATURES === try: referee_feats = self.referee_engine.get_features( match_id=mid, league_id=str(lid), ) except Exception: referee_feats = { "referee_home_bias": 0.0, "referee_avg_goals": 2.7, "referee_cards_total": 4.0, "referee_avg_yellow": 3.5, "referee_avg_red": 0.1, "referee_experience": 0.5, } # === MOMENTUM FEATURES === try: momentum_feats = self.momentum_engine.get_features( home_team_id=hid, away_team_id=aid, match_date_ms=mst, ) home_momentum_score = momentum_feats.get("home_momentum_score", 0.5) away_momentum_score = momentum_feats.get("away_momentum_score", 0.5) momentum_diff = momentum_feats.get( "momentum_diff", home_momentum_score - away_momentum_score ) except Exception: home_momentum_score = 0.5 away_momentum_score = 0.5 momentum_diff = 0.0 # === SQUAD FEATURES === home_sq = self.loader.squad_cache.get((mid, hid), {}) away_sq = self.loader.squad_cache.get((mid, aid), {}) home_squad_quality = home_sq.get('squad_quality', 0.0) away_squad_quality = away_sq.get('squad_quality', 0.0) squad_diff = home_squad_quality - away_squad_quality home_key_players = home_sq.get('key_players', 0) away_key_players = away_sq.get('key_players', 0) home_missing_impact = home_sq.get('missing_impact', 0.0) away_missing_impact = away_sq.get('missing_impact', 0.0) home_goals_form = home_sq.get('goals_form', 0) away_goals_form = away_sq.get('goals_form', 0) # === ASSEMBLE ROW === row = { "match_id": mid, "home_team_id": hid, "away_team_id": aid, "league_id": lid, "mst_utc": mst, **elo_features, **form_features, **h2h_features, **stats_features, **odds_features, "home_xga": form_features["home_conceded_avg"], "away_xga": form_features["away_conceded_avg"], **league_features, "upset_atmosphere": upset_feats.get("upset_atmosphere", 0.0), "upset_motivation": upset_feats.get("upset_motivation", 0.0), "upset_fatigue": upset_feats.get("upset_fatigue", 0.0), "upset_potential": upset_feats.get("upset_potential", 0.0), "referee_home_bias": referee_feats.get("referee_home_bias", 0.0), "referee_avg_goals": referee_feats.get("referee_avg_goals", 2.7), "referee_cards_total": referee_feats.get("referee_cards_total", 4.0), "referee_avg_yellow": referee_feats.get("referee_avg_yellow", 3.5), "referee_experience": referee_feats.get("referee_experience", 0.5), "home_momentum_score": home_momentum_score, "away_momentum_score": away_momentum_score, "momentum_diff": momentum_diff, # Squad Features "home_squad_quality": home_squad_quality, "away_squad_quality": away_squad_quality, "squad_diff": squad_diff, "home_key_players": home_key_players, "away_key_players": away_key_players, "home_missing_impact": home_missing_impact, "away_missing_impact": away_missing_impact, "home_goals_form": home_goals_form, "away_goals_form": away_goals_form, # Player-Level Features "home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0), "away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0), "home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0), "away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0), "home_squad_continuity": home_sq.get('squad_continuity', 0.5), "away_squad_continuity": away_sq.get('squad_continuity', 0.5), "home_top_scorer_form": home_sq.get('top_scorer_form', 0), "away_top_scorer_form": away_sq.get('top_scorer_form', 0), "home_avg_player_exp": home_sq.get('avg_player_exp', 0.0), "away_avg_player_exp": away_sq.get('avg_player_exp', 0.0), "home_goals_diversity": home_sq.get('goals_diversity', 0.0), "away_goals_diversity": away_sq.get('goals_diversity', 0.0), # Labels "score_home": sh, "score_away": sa, "total_goals": total_goals, "ht_score_home": hth if hth is not None else "", "ht_score_away": hta if hta is not None else "", "ht_total_goals": ht_total if ht_total is not None else "", "label_ms": label_ms, "label_ou05": label_ou05, "label_ou15": label_ou15, "label_ou25": label_ou25, "label_ou35": label_ou35, "label_btts": label_btts, "label_ht_result": label_ht_result if label_ht_result is not None else "", "label_ht_ou05": label_ht_ou05 if label_ht_ou05 is not None else "", "label_ht_ou15": label_ht_ou15 if label_ht_ou15 is not None else "", "label_ht_ft": label_ht_ft if label_ht_ft is not None else "", "label_odd_even": label_odd_even, "label_yellow_cards": label_yellow_cards, "label_cards_ou45": label_cards_ou45, "label_handicap_ms": label_handicap_ms, } return row def _validate_row_quality( self, row: dict, home_id: str, away_id: str, before_date: int, ) -> tuple: """ Data quality gate for training rows. Ensures the feature vector has enough real signal to be useful for training. Rejects rows where critical features are all at their default/fallback values — these teach the model noise, not patterns. Returns (pass: bool, reason: str | None). """ # 1. Minimum form history: both teams must have enough prior matches home_history = self.loader.team_matches.get(home_id, []) away_history = self.loader.team_matches.get(away_id, []) home_prior = sum(1 for m in home_history if m[0] < before_date) away_prior = sum(1 for m in away_history if m[0] < before_date) if home_prior < self.DQ_MIN_FORM_MATCHES: return False, 'home_insufficient_history' if away_prior < self.DQ_MIN_FORM_MATCHES: return False, 'away_insufficient_history' # 2. Feature coverage check: count how many key features are non-default key_features = [ ('home_goals_avg', 1.3), ('away_goals_avg', 1.3), ('home_clean_sheet_rate', 0.25), ('away_clean_sheet_rate', 0.25), ('home_avg_possession', 0.50), ('away_avg_possession', 0.50), ('home_avg_shots_on_target', 3.5), ('away_avg_shots_on_target', 3.5), ('h2h_total_matches', 0), ('odds_ms_h', 0.0), ] non_default = sum( 1 for feat_name, default_val in key_features if abs(float(row.get(feat_name, default_val)) - default_val) > 0.01 ) coverage = non_default / len(key_features) if coverage < self.DQ_MIN_FEATURE_COVERAGE: return False, f'low_feature_coverage_{coverage:.0%}' return True, None # ------------------------------------------------------------------------- # ELO (simplified inline version — doesn't need DB, grows incrementally) # ------------------------------------------------------------------------- def _update_elo(self, home_id, away_id, score_home, score_away): """Update ELO ratings after a match.""" h = self.elo_ratings[home_id] a = self.elo_ratings[away_id] HOME_ADVANTAGE = 65 K_BASE = 32 # Expected scores exp_h = 1.0 / (1 + 10 ** ((a["overall"] - h["overall"] - HOME_ADVANTAGE) / 400)) exp_a = 1.0 - exp_h # Actual scores if score_home > score_away: actual_h, actual_a = 1.0, 0.0 elif score_home < score_away: actual_h, actual_a = 0.0, 1.0 else: actual_h, actual_a = 0.5, 0.5 # Goal difference multiplier gd = abs(score_home - score_away) gd_mult = math.log(max(gd, 1) + 1) * 0.7 + 1.0 # Dynamic K k_h = K_BASE * gd_mult * (1.3 if h["matches"] < 10 else 1.0) k_a = K_BASE * gd_mult * (1.3 if a["matches"] < 10 else 1.0) delta_h = k_h * (actual_h - exp_h) delta_a = k_a * (actual_a - exp_a) # Update all ELO variants h["overall"] += delta_h a["overall"] += delta_a h["home"] += delta_h * 1.1 a["away"] += delta_a * 1.1 # Form ELO (heavier weight on recent) form_k = K_BASE * 1.5 * gd_mult h["form"] = h["form"] * 0.85 + (h["form"] + form_k * (actual_h - exp_h)) * 0.15 a["form"] = a["form"] * 0.85 + (a["form"] + form_k * (actual_a - exp_a)) * 0.15 h["matches"] += 1 a["matches"] += 1 # ------------------------------------------------------------------------- # FORM (last 5 matches) # ------------------------------------------------------------------------- def _get_form_features(self, home_id, away_id, before_date) -> dict: """Get form features for both teams.""" h_form = self._calc_team_form(home_id, before_date) a_form = self._calc_team_form(away_id, before_date) return { "home_goals_avg": h_form["goals_avg"], "home_conceded_avg": h_form["conceded_avg"], "away_goals_avg": a_form["goals_avg"], "away_conceded_avg": a_form["conceded_avg"], "home_clean_sheet_rate": h_form["clean_sheet_rate"], "away_clean_sheet_rate": a_form["clean_sheet_rate"], "home_scoring_rate": h_form["scoring_rate"], "away_scoring_rate": a_form["scoring_rate"], "home_winning_streak": h_form["winning_streak"], "away_winning_streak": a_form["winning_streak"], "home_unbeaten_streak": h_form["unbeaten_streak"], "away_unbeaten_streak": a_form["unbeaten_streak"], } def _calc_team_form(self, team_id, before_date, limit=5) -> dict: """Calculate form from last N matches (weighted moving average).""" history = self.loader.team_matches.get(team_id, []) # Filter and get last `limit` matches before date valid_matches = [m for m in history if m[0] < before_date] rows = valid_matches[-limit:] if valid_matches else [] if not rows: return { "goals_avg": 1.3, "conceded_avg": 1.2, "clean_sheet_rate": 0.25, "scoring_rate": 0.75, "winning_streak": 0, "unbeaten_streak": 0, } w_goals, w_conceded, w_total = 0.0, 0.0, 0.0 clean_sheets, scored_matches = 0, 0 winning_streak, unbeaten_streak = 0, 0 streak_broken, unbeaten_broken = False, False for i, (mst, is_home, team_goals, opp_goals, opp_id) in enumerate(reversed(rows)): weight = float(limit - i) w_goals += team_goals * weight w_conceded += opp_goals * weight w_total += weight if opp_goals == 0: clean_sheets += 1 if team_goals > 0: scored_matches += 1 if not streak_broken: if team_goals > opp_goals: winning_streak += 1 else: streak_broken = True if not unbeaten_broken: if team_goals >= opp_goals: unbeaten_streak += 1 else: unbeaten_broken = True n = len(rows) return { "goals_avg": w_goals / w_total if w_total > 0 else 1.3, "conceded_avg": w_conceded / w_total if w_total > 0 else 1.2, "clean_sheet_rate": clean_sheets / n, "scoring_rate": scored_matches / n, "winning_streak": winning_streak, "unbeaten_streak": unbeaten_streak, } # ------------------------------------------------------------------------- # H2H # ------------------------------------------------------------------------- def _get_h2h_features(self, home_id, away_id, before_date) -> dict: """Get head-to-head features.""" h_history = self.loader.team_matches.get(home_id, []) # Matches against away_id before date h2h_matches = [m for m in h_history if m[4] == away_id and m[0] < before_date] rows = h2h_matches[-20:] if h2h_matches else [] if not rows: return { "h2h_total_matches": 0, "h2h_home_win_rate": 0.33, "h2h_draw_rate": 0.33, "h2h_avg_goals": 2.5, "h2h_btts_rate": 0.5, "h2h_over25_rate": 0.5, } home_wins, draws = 0, 0 total_goals_sum = 0 btts_count, over25_count = 0, 0 n = len(rows) for mst, is_our_home, team_goals, opp_goals, opp_id in rows: sh = team_goals if is_our_home else opp_goals sa = opp_goals if is_our_home else team_goals total = sh + sa total_goals_sum += total if sh > 0 and sa > 0: btts_count += 1 if total > 2.5: over25_count += 1 # Determine result relative to our home team if is_our_home: if sh > sa: home_wins += 1 elif sh == sa: draws += 1 else: if sa > sh: home_wins += 1 elif sh == sa: draws += 1 return { "h2h_total_matches": n, "h2h_home_win_rate": home_wins / n, "h2h_draw_rate": draws / n, "h2h_avg_goals": total_goals_sum / n, "h2h_btts_rate": btts_count / n, "h2h_over25_rate": over25_count / n, } # ------------------------------------------------------------------------- # TEAM STATS (possession, shots, corners) # ------------------------------------------------------------------------- def _get_team_stats_features(self, home_id, away_id, before_date) -> dict: """Get team-level match stats features.""" h_stats = self._calc_team_stats(home_id, before_date) a_stats = self._calc_team_stats(away_id, before_date) return { "home_avg_possession": h_stats["possession"], "away_avg_possession": a_stats["possession"], "home_avg_shots_on_target": h_stats["shots_on_target"], "away_avg_shots_on_target": a_stats["shots_on_target"], "home_shot_conversion": h_stats["shot_conversion"], "away_shot_conversion": a_stats["shot_conversion"], "home_avg_corners": h_stats["corners"], "away_avg_corners": a_stats["corners"], } def _calc_team_stats(self, team_id, before_date, limit=10) -> dict: """Calculate team stats from match_team_stats table.""" defaults = { "possession": 0.50, "shots_on_target": 3.5, "shot_conversion": 0.10, "corners": 4.5 } stats_history = self.loader.team_stats.get(team_id, []) valid_stats = [s for s in stats_history if s[0] < before_date] rows = valid_stats[-limit:] if valid_stats else [] if not rows: return defaults poss_sum, sot_sum, shots_sum, corners_sum = 0.0, 0.0, 0.0, 0.0 goals_scored = 0 poss_count = 0 n = len(rows) for mst, poss, sot, total_shots, corners, team_goals in rows: if poss and poss > 0: poss_sum += float(poss) poss_count += 1 sot_sum += float(sot or 0) shots_sum += float(total_shots or 0) corners_sum += float(corners or 0) goals_scored += float(team_goals or 0) return { "possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50, "shots_on_target": sot_sum / n, "shot_conversion": goals_scored / shots_sum if shots_sum > 0 else 0.10, "corners": corners_sum / n, } # ============================================================================= # MAIN # ============================================================================= def main(): print("🚀 XGBoost Training Data Extraction") print("=" * 60) # Load top leagues with open(TOP_LEAGUES_PATH) as f: top_leagues = json.load(f) print(f"📋 {len(top_leagues)} top leagues") # Connect conn = get_conn() # Batch load print("\n📦 Loading batch data...") loader = BatchDataLoader(conn, top_leagues) loader.load_all() # Extract features extractor = FeatureExtractor(conn, loader) rows = extractor.extract_all() if not rows: print("❌ No data extracted!") return # Write CSV print(f"\n💾 Writing {len(rows)} rows to {OUTPUT_CSV}...") with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=FEATURE_COLS) writer.writeheader() writer.writerows(rows) # Summary stats print(f"\n📊 Data Summary:") print(f" Total rows: {len(rows)}") # Label distributions ms_dist = defaultdict(int) ou25_dist = defaultdict(int) btts_dist = defaultdict(int) with_odds = sum(1 for r in rows if r.get("odds_ms_h", 0) > 0) for r in rows: ms_dist[r["label_ms"]] += 1 ou25_dist[r["label_ou25"]] += 1 btts_dist[r["label_btts"]] += 1 n = len(rows) print(f" With odds: {with_odds} ({with_odds/n*100:.1f}%)") print(f" MS dist: Home={ms_dist[0]/n*100:.1f}% Draw={ms_dist[1]/n*100:.1f}% Away={ms_dist[2]/n*100:.1f}%") print(f" O/U 2.5: Over={ou25_dist[1]/n*100:.1f}% Under={ou25_dist[0]/n*100:.1f}%") print(f" BTTS: Yes={btts_dist[1]/n*100:.1f}% No={btts_dist[0]/n*100:.1f}%") # HT/FT Distribution htft_dist = defaultdict(int) for r in rows: if r.get("label_ht_ft") is not None and r.get("label_ht_ft") != "": htft_dist[r["label_ht_ft"]] += 1 print(f"\n HT/FT Distribution:") # Interesting ones: 1/2 (2) and 2/1 (6) rev_1_2 = htft_dist.get(2, 0) rev_2_1 = htft_dist.get(6, 0) print(f" 1/2 (Home->Away): {rev_1_2} ({rev_1_2/n*100:.2f}%)") print(f" 2/1 (Away->Home): {rev_2_1} ({rev_2_1/n*100:.2f}%)") print(f" 1/1: {htft_dist.get(0,0)} | X/X: {htft_dist.get(4,0)} | 2/2: {htft_dist.get(8,0)}") # Feature NaN check nan_cols = [] for col in FEATURE_COLS: nans = sum(1 for r in rows if r.get(col, "") == "" or r.get(col) is None) if nans > 0 and col not in ("ht_score_home", "ht_score_away", "ht_total_goals", "label_ht_result", "label_ht_ou05", "label_ht_ou15"): nan_cols.append((col, nans)) if nan_cols: print(f"\n ⚠️ Columns with missing values:") for col, cnt in nan_cols: print(f" {col}: {cnt} ({cnt/n*100:.1f}%)") else: print(f"\n ✅ No missing values in feature columns!") print(f"\n✅ Done! Output: {OUTPUT_CSV}") conn.close() if __name__ == "__main__": main()