""" XGBoost Training Data Extraction ================================= Batch feature extraction for top-league matches. Extracts ~82 features + labels per match for XGBoost model training. Usage: python3 scripts/extract_training_data.py """ import os import sys import json import csv import math import time from datetime import datetime from collections import defaultdict import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() # ============================================================================= # CONFIG # ============================================================================= AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_ENGINE_DIR) from features.upset_engine import get_upset_engine from features.referee_engine import get_referee_engine from features.momentum_engine import get_momentum_engine TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") # Ensure output dir exists os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True) def get_conn(): db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] return psycopg2.connect(db_url) # ============================================================================= # FEATURE COLUMNS (ORDER MATTERS — matches CSV header) # ============================================================================= FEATURE_COLS = [ # Match identifiers "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc", # ELO Features (8) "home_overall_elo", "away_overall_elo", "elo_diff", "home_home_elo", "away_away_elo", "home_form_elo", "away_form_elo", "form_elo_diff", # Form Features (12) "home_goals_avg", "home_conceded_avg", "away_goals_avg", "away_conceded_avg", "home_clean_sheet_rate", "away_clean_sheet_rate", "home_scoring_rate", "away_scoring_rate", "home_winning_streak", "away_winning_streak", "home_unbeaten_streak", "away_unbeaten_streak", # H2H Features (6) "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate", "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate", # Team Stats Features (8) "home_avg_possession", "away_avg_possession", "home_avg_shots_on_target", "away_avg_shots_on_target", "home_shot_conversion", "away_shot_conversion", "home_avg_corners", "away_avg_corners", # Odds Features (24) "odds_ms_h", "odds_ms_d", "odds_ms_a", "implied_home", "implied_draw", "implied_away", "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", "odds_ou05_o", "odds_ou05_u", "odds_ou15_o", "odds_ou15_u", "odds_ou25_o", "odds_ou25_u", "odds_ou35_o", "odds_ou35_u", "odds_ht_ou05_o", "odds_ht_ou05_u", "odds_ht_ou15_o", "odds_ht_ou15_u", "odds_btts_y", "odds_btts_n", "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present", "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present", "odds_ou05_o_present", "odds_ou05_u_present", "odds_ou15_o_present", "odds_ou15_u_present", "odds_ou25_o_present", "odds_ou25_u_present", "odds_ou35_o_present", "odds_ou35_u_present", "odds_ht_ou05_o_present", "odds_ht_ou05_u_present", "odds_ht_ou15_o_present", "odds_ht_ou15_u_present", "odds_btts_y_present", "odds_btts_n_present", # Defensive/League Features (4) "home_xga", "away_xga", "league_avg_goals", "league_zero_goal_rate", # Upset Engine (4) "upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential", # Referee Engine (5) "referee_home_bias", "referee_avg_goals", "referee_cards_total", "referee_avg_yellow", "referee_experience", # Momentum Engine (3) "home_momentum_score", "away_momentum_score", "momentum_diff", # Squad Features (9) "home_squad_quality", "away_squad_quality", "squad_diff", "home_key_players", "away_key_players", "home_missing_impact", "away_missing_impact", "home_goals_form", "away_goals_form", # Labels "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away", "ht_total_goals", "label_ms", # 0=Home, 1=Draw, 2=Away "label_ou05", # 0=Under, 1=Over "label_ou15", "label_ou25", "label_ou35", "label_btts", # 0=No, 1=Yes "label_ht_result", # 0=Home, 1=Draw, 2=Away "label_ht_ou05", # 0=Under, 1=Over "label_ht_ou15", "label_ht_ft", # 0=1/1, 1=1/X, 2=1/2 ... 8=2/2 "label_odd_even", # 1=Odd, 0=Even "label_yellow_cards",# Count of cards (yellow=1, red=2) "label_cards_ou45", # 0=Under 4.5, 1=Over 4.5 "label_handicap_ms", # Handikap (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins ] # ============================================================================= # BATCH LOADERS — Pre-load data to avoid N+1 queries # ============================================================================= class BatchDataLoader: """Pre-loads all necessary data in bulk, then serves features per match.""" def __init__(self, conn, top_league_ids: list): self.conn = conn self.cur = conn.cursor() self.top_league_ids = top_league_ids # Pre-loaded data caches self.matches = [] self.odds_cache = {} # match_id → {ms_h, ms_d, ms_a, ...} self.team_stats_cache = {} # (team_id, before_date_bucket) → stats self.form_cache = {} # (team_id, match_id) → form features self.h2h_cache = {} # (home_id, away_id, match_id) → h2h features self.league_stats_cache = {} # league_id → {avg_goals, zero_rate} self.squad_cache = {} # (match_id, team_id) → {starting, goals, assists, key_players, ...} self.cards_cache = {} # match_id → total cards scored def load_all(self): """Load all data in batch.""" t0 = time.time() self._load_matches() print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True) t1 = time.time() self._load_odds() print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True) t2 = time.time() self._load_league_stats() print(f" ✅ League stats: {len(self.league_stats_cache)} leagues ({time.time()-t2:.1f}s)", flush=True) t3 = time.time() self._load_team_history() print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True) t4 = time.time() self._load_squad_data() print(f" ✅ Squad data: {len(self.squad_cache)} team-matches ({time.time()-t4:.1f}s)", flush=True) t5 = time.time() self._load_cards_data() print(f" ✅ Cards data: {len(self.cards_cache)} matches ({time.time()-t5:.1f}s)", flush=True) print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True) def _load_matches(self): ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, m.ht_score_home, m.ht_score_away, m.mst_utc, m.league_id, ht.name as home_name, at.name as away_name, l.name as league_name FROM matches m JOIN teams ht ON m.home_team_id = ht.id JOIN teams at ON m.away_team_id = at.id JOIN leagues l ON m.league_id = l.id WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.sport = 'football' AND m.league_id IN ({ph}) ORDER BY m.mst_utc ASC """, self.top_league_ids) self.matches = self.cur.fetchall() def _load_odds(self): """Bulk load all odds for top league matches.""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT oc.match_id, oc.name, os.name, os.odd_value FROM odd_selections os JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id JOIN matches m ON oc.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) """, self.top_league_ids) for match_id, cat_name, sel_name, odd_val in self.cur.fetchall(): try: v = float(odd_val) if odd_val else 0 if v <= 0 or not cat_name or not sel_name: continue if match_id not in self.odds_cache: self.odds_cache[match_id] = {} cat_lower = cat_name.lower().strip() sel_lower = sel_name.lower().strip() # Match Result (1X2) if cat_lower == 'maç sonucu': if sel_name == '1': self.odds_cache[match_id]['ms_h'] = v elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ms_d'] = v elif sel_name == '2': self.odds_cache[match_id]['ms_a'] = v # HT Result elif cat_lower == '1. yarı sonucu': if sel_name == '1': self.odds_cache[match_id]['ht_ms_h'] = v elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ht_ms_d'] = v elif sel_name == '2': self.odds_cache[match_id]['ht_ms_a'] = v # BTTS elif cat_lower == 'karşılıklı gol': if 'var' in sel_lower: self.odds_cache[match_id]['btts_y'] = v elif 'yok' in sel_lower: self.odds_cache[match_id]['btts_n'] = v # Over/Under FT elif cat_lower == '0,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou05_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou05_o'] = v elif cat_lower == '1,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou15_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou15_o'] = v elif cat_lower == '2,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou25_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou25_o'] = v elif cat_lower == '3,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ou35_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ou35_o'] = v # Over/Under HT elif cat_lower == '1. yarı 0,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou05_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou05_o'] = v elif cat_lower == '1. yarı 1,5 alt/üst': if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou15_u'] = v elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou15_o'] = v except (ValueError, TypeError): pass def _load_league_stats(self): """Calculate league-level aggregated stats.""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT league_id, AVG(score_home + score_away) as avg_goals, AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate, COUNT(*) as match_count FROM matches WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football' AND league_id IN ({ph}) GROUP BY league_id """, self.top_league_ids) for league_id, avg_goals, zero_rate, cnt in self.cur.fetchall(): self.league_stats_cache[league_id] = { "avg_goals": float(avg_goals) if avg_goals else 2.5, "zero_rate": float(zero_rate) if zero_rate else 0.07, "match_count": cnt } def _load_team_history(self): """Bulk load all matches and team stats for lightning fast memory lookups""" from collections import defaultdict self.team_matches = defaultdict(list) self.team_stats = defaultdict(list) # Load all matches for form/h2h self.cur.execute(""" SELECT home_team_id, away_team_id, score_home, score_away, mst_utc FROM matches WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football' ORDER BY mst_utc ASC """) for hid, aid, sh, sa, mst in self.cur.fetchall(): # (mst, is_home, team_score, opp_score, opp_id) self.team_matches[hid].append((mst, True, sh, sa, aid)) self.team_matches[aid].append((mst, False, sa, sh, hid)) # Load all football_team_stats (sport-partitioned schema) self.cur.execute(""" SELECT mts.team_id, m.mst_utc, mts.possession_percentage, mts.shots_on_target, mts.total_shots, mts.corners, m.score_home, m.score_away, m.home_team_id FROM football_team_stats mts JOIN matches m ON mts.match_id = m.id WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL ORDER BY m.mst_utc ASC """) for tid, mst, poss, sot, tshots, corn, sh, sa, hid in self.cur.fetchall(): team_goals = sh if hid == tid else sa self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals)) def _load_squad_data(self): """Bulk load squad participation + player events for squad features.""" ph = ",".join(["%s"] * len(self.top_league_ids)) # 1) Participation: starting XI count + position distribution per (match, team) self.cur.execute(f""" SELECT mpp.match_id, mpp.team_id, COUNT(*) FILTER (WHERE mpp.is_starting = true) AS starting_count, COUNT(*) AS total_squad, COUNT(*) FILTER (WHERE mpp.is_starting = true AND LOWER(COALESCE(mpp.position::TEXT,'')) ~ '(forward|fwd|forvet|striker)') AS fwd_count FROM match_player_participation mpp JOIN matches m ON mpp.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) GROUP BY mpp.match_id, mpp.team_id """, self.top_league_ids) participation = {} for mid, tid, st_count, tot, fwd in self.cur.fetchall(): participation[(mid, tid)] = { 'starting_count': st_count or 0, 'total_squad': tot or 0, 'fwd_count': fwd or 0, } # 2) Player events: goals + assists per (match, team) self.cur.execute(f""" SELECT mpe.match_id, mpe.team_id, COUNT(*) FILTER ( WHERE mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' ) AS goal_count, COUNT(DISTINCT mpe.assist_player_id) FILTER ( WHERE mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL ) AS assist_count, COUNT(DISTINCT mpe.player_id) FILTER ( WHERE mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' ) AS unique_scorers FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) GROUP BY mpe.match_id, mpe.team_id """, self.top_league_ids) events = {} for mid, tid, goals, assists, scorers in self.cur.fetchall(): events[(mid, tid)] = { 'goals': goals or 0, 'assists': assists or 0, 'unique_scorers': scorers or 0, } # 3) Key players: players with 3+ goals across all their matches, per team self.cur.execute(f""" SELECT mpe.team_id, mpe.player_id, COUNT(*) AS total_goals FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) AND mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*) >= 3 """, self.top_league_ids) key_players_by_team = defaultdict(set) for tid, pid, _ in self.cur.fetchall(): key_players_by_team[tid].add(pid) # 4) Starting key players per (match, team) self.cur.execute(f""" SELECT mpp.match_id, mpp.team_id, mpp.player_id FROM match_player_participation mpp JOIN matches m ON mpp.match_id = m.id WHERE mpp.is_starting = true AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) """, self.top_league_ids) starting_players = defaultdict(list) for mid, tid, pid in self.cur.fetchall(): starting_players[(mid, tid)].append(pid) # 5) Build combined cache all_keys = set(participation.keys()) | set(events.keys()) for key in all_keys: mid, tid = key part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}) evt = events.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0}) # Count key players in starting XI starters = starting_players.get(key, []) kp_in_starting = sum(1 for p in starters if p in key_players_by_team.get(tid, set())) kp_total = len(key_players_by_team.get(tid, set())) kp_missing = max(0, kp_total - kp_in_starting) # Squad quality: composite score squad_quality = ( part['starting_count'] * 0.3 + evt['goals'] * 2.0 + evt['assists'] * 1.0 + kp_in_starting * 3.0 + part['fwd_count'] * 1.5 ) # Missing impact: how many key players are missing missing_impact = min(kp_missing / max(kp_total, 1), 1.0) self.squad_cache[key] = { 'squad_quality': squad_quality, 'key_players': kp_in_starting, 'missing_impact': missing_impact, 'goals_form': evt['goals'], } def _load_cards_data(self): """Bulk load all distinct cards per match (yellow=1, red=2).""" ph = ",".join(["%s"] * len(self.top_league_ids)) self.cur.execute(f""" SELECT mpe.match_id, SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1 WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END) as cards_weight FROM match_player_events mpe JOIN matches m ON mpe.match_id = m.id WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph}) AND mpe.event_type::text LIKE '%%card%%' GROUP BY mpe.match_id """, self.top_league_ids) for mid, cards_weight in self.cur.fetchall(): self.cards_cache[mid] = float(cards_weight) if cards_weight else 0.0 class FeatureExtractor: """Extract features for a single match using pre-loaded data + on-demand queries.""" def __init__(self, conn, loader: BatchDataLoader): self.conn = conn self.cur = conn.cursor() self.loader = loader # ELO cache: team_id → {overall, home, away, form} self.elo_ratings = defaultdict(lambda: { "overall": 1500.0, "home": 1500.0, "away": 1500.0, "form": 1500.0, "matches": 0 }) self._elo_initialized = False self.upset_engine = get_upset_engine() self.referee_engine = get_referee_engine() self.momentum_engine = get_momentum_engine() def extract_all(self) -> list: """Extract features for all matches, yield row dicts.""" matches = self.loader.matches total = len(matches) rows = [] skipped = 0 t_start = time.time() print(f"\n🔄 Extracting features for {total} matches...", flush=True) # Process chronologically — ELO grows as we go for i, m in enumerate(matches): ( mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ) = m if i % 100 == 0 and i > 0: elapsed = time.time() - t_start rate = i / elapsed # matches per second remaining = (total - i) / rate if rate > 0 else 0 pct = i / total * 100 print(f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True) row = self._extract_one( mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ) if row: rows.append(row) else: skipped += 1 # Update ELO after processing (so ELO is calculated BEFORE the match) self._update_elo(hid, aid, sh, sa) print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True) return rows def _extract_one( self, mid, hid, aid, sh, sa, hth, hta, mst, lid, home_name, away_name, league_name, ): """Extract features for a single match.""" # === LABELS === total_goals = sh + sa ht_total = (hth + hta) if hth is not None and hta is not None else None if sh > sa: label_ms = 0 elif sh < sa: label_ms = 2 else: label_ms = 1 label_ou05 = 1 if total_goals > 0.5 else 0 label_ou15 = 1 if total_goals > 1.5 else 0 label_ou25 = 1 if total_goals > 2.5 else 0 label_ou35 = 1 if total_goals > 3.5 else 0 label_btts = 1 if (sh > 0 and sa > 0) else 0 label_odd_even = 1 if total_goals % 2 != 0 else 0 # Handicap MS Label (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins score_diff = sh - sa if score_diff >= 2: label_handicap_ms = 0 elif score_diff == 1: label_handicap_ms = 1 else: label_handicap_ms = 2 # Cards Label cards_count = self.loader.cards_cache.get(mid, 0.0) label_yellow_cards = cards_count label_cards_ou45 = 1 if cards_count > 4.5 else 0 # HT labels label_ht_result = None label_ht_ou05 = None label_ht_ou15 = None if hth is not None and hta is not None: if hth > hta: label_ht_result = 0 elif hth < hta: label_ht_result = 2 else: label_ht_result = 1 label_ht_ou05 = 1 if ht_total > 0.5 else 0 label_ht_ou15 = 1 if ht_total > 1.5 else 0 # HT/FT Label (0-8) # 0: 1/1, 1: 1/X, 2: 1/2 # 3: X/1, 4: X/X, 5: X/2 # 6: 2/1, 7: 2/X, 8: 2/2 label_ht_ft = None if label_ht_result is not None: label_ht_ft = label_ht_result * 3 + label_ms # === ELO FEATURES === h_elo = self.elo_ratings[hid] a_elo = self.elo_ratings[aid] elo_features = { "home_overall_elo": h_elo["overall"], "away_overall_elo": a_elo["overall"], "elo_diff": h_elo["overall"] - a_elo["overall"], "home_home_elo": h_elo["home"], "away_away_elo": a_elo["away"], "home_form_elo": h_elo["form"], "away_form_elo": a_elo["form"], "form_elo_diff": h_elo["form"] - a_elo["form"], } # === FORM FEATURES === form_features = self._get_form_features(hid, aid, mst) # === H2H FEATURES === h2h_features = self._get_h2h_features(hid, aid, mst) # === TEAM STATS FEATURES === stats_features = self._get_team_stats_features(hid, aid, mst) # === ODDS FEATURES === odds = self.loader.odds_cache.get(mid, {}) ms_h = odds.get("ms_h", 0.0) ms_d = odds.get("ms_d", 0.0) ms_a = odds.get("ms_a", 0.0) # Implied probabilities (normalized vig-free) if ms_h > 0 and ms_d > 0 and ms_a > 0: raw_sum = 1/ms_h + 1/ms_d + 1/ms_a implied_home = (1/ms_h) / raw_sum implied_draw = (1/ms_d) / raw_sum implied_away = (1/ms_a) / raw_sum else: implied_home = implied_draw = implied_away = 0.33 odds_features = { "odds_ms_h": ms_h, "odds_ms_d": ms_d, "odds_ms_a": ms_a, "implied_home": implied_home, "implied_draw": implied_draw, "implied_away": implied_away, "odds_ht_ms_h": odds.get("ht_ms_h", 0.0), "odds_ht_ms_d": odds.get("ht_ms_d", 0.0), "odds_ht_ms_a": odds.get("ht_ms_a", 0.0), "odds_ou05_o": odds.get("ou05_o", 0.0), "odds_ou05_u": odds.get("ou05_u", 0.0), "odds_ou15_o": odds.get("ou15_o", 0.0), "odds_ou15_u": odds.get("ou15_u", 0.0), "odds_ou25_o": odds.get("ou25_o", 0.0), "odds_ou25_u": odds.get("ou25_u", 0.0), "odds_ou35_o": odds.get("ou35_o", 0.0), "odds_ou35_u": odds.get("ou35_u", 0.0), "odds_ht_ou05_o": odds.get("ht_ou05_o", 0.0), "odds_ht_ou05_u": odds.get("ht_ou05_u", 0.0), "odds_ht_ou15_o": odds.get("ht_ou15_o", 0.0), "odds_ht_ou15_u": odds.get("ht_ou15_u", 0.0), "odds_btts_y": odds.get("btts_y", 0.0), "odds_btts_n": odds.get("btts_n", 0.0), "odds_ms_h_present": 1.0 if ms_h > 1.01 else 0.0, "odds_ms_d_present": 1.0 if ms_d > 1.01 else 0.0, "odds_ms_a_present": 1.0 if ms_a > 1.01 else 0.0, "odds_ht_ms_h_present": 1.0 if odds.get("ht_ms_h", 0.0) > 1.01 else 0.0, "odds_ht_ms_d_present": 1.0 if odds.get("ht_ms_d", 0.0) > 1.01 else 0.0, "odds_ht_ms_a_present": 1.0 if odds.get("ht_ms_a", 0.0) > 1.01 else 0.0, "odds_ou05_o_present": 1.0 if odds.get("ou05_o", 0.0) > 1.01 else 0.0, "odds_ou05_u_present": 1.0 if odds.get("ou05_u", 0.0) > 1.01 else 0.0, "odds_ou15_o_present": 1.0 if odds.get("ou15_o", 0.0) > 1.01 else 0.0, "odds_ou15_u_present": 1.0 if odds.get("ou15_u", 0.0) > 1.01 else 0.0, "odds_ou25_o_present": 1.0 if odds.get("ou25_o", 0.0) > 1.01 else 0.0, "odds_ou25_u_present": 1.0 if odds.get("ou25_u", 0.0) > 1.01 else 0.0, "odds_ou35_o_present": 1.0 if odds.get("ou35_o", 0.0) > 1.01 else 0.0, "odds_ou35_u_present": 1.0 if odds.get("ou35_u", 0.0) > 1.01 else 0.0, "odds_ht_ou05_o_present": 1.0 if odds.get("ht_ou05_o", 0.0) > 1.01 else 0.0, "odds_ht_ou05_u_present": 1.0 if odds.get("ht_ou05_u", 0.0) > 1.01 else 0.0, "odds_ht_ou15_o_present": 1.0 if odds.get("ht_ou15_o", 0.0) > 1.01 else 0.0, "odds_ht_ou15_u_present": 1.0 if odds.get("ht_ou15_u", 0.0) > 1.01 else 0.0, "odds_btts_y_present": 1.0 if odds.get("btts_y", 0.0) > 1.01 else 0.0, "odds_btts_n_present": 1.0 if odds.get("btts_n", 0.0) > 1.01 else 0.0, } # === LEAGUE FEATURES === league = self.loader.league_stats_cache.get(lid, {"avg_goals": 2.5, "zero_rate": 0.07}) league_features = { "league_avg_goals": league["avg_goals"], "league_zero_goal_rate": league["zero_rate"], } # === UPSET FEATURES === try: upset_feats = self.upset_engine.get_features( home_team_name=home_name or "", home_team_id=hid, away_team_name=away_name or "", league_name=league_name or "", home_position=10, away_position=10, match_date_ms=mst, ) except Exception: upset_feats = { "upset_atmosphere": 0.0, "upset_motivation": 0.0, "upset_fatigue": 0.0, "upset_potential": 0.0, } # === REFEREE FEATURES === try: referee_feats = self.referee_engine.get_features( match_id=mid, league_id=str(lid), ) except Exception: referee_feats = { "referee_home_bias": 0.0, "referee_avg_goals": 2.7, "referee_cards_total": 4.0, "referee_avg_yellow": 3.5, "referee_avg_red": 0.1, "referee_experience": 0.5, } # === MOMENTUM FEATURES === try: momentum_feats = self.momentum_engine.get_features( home_team_id=hid, away_team_id=aid, match_date_ms=mst, ) home_momentum_score = momentum_feats.get("home_momentum_score", 0.5) away_momentum_score = momentum_feats.get("away_momentum_score", 0.5) momentum_diff = momentum_feats.get( "momentum_diff", home_momentum_score - away_momentum_score ) except Exception: home_momentum_score = 0.5 away_momentum_score = 0.5 momentum_diff = 0.0 # === SQUAD FEATURES === home_sq = self.loader.squad_cache.get((mid, hid), {}) away_sq = self.loader.squad_cache.get((mid, aid), {}) home_squad_quality = home_sq.get('squad_quality', 0.0) away_squad_quality = away_sq.get('squad_quality', 0.0) squad_diff = home_squad_quality - away_squad_quality home_key_players = home_sq.get('key_players', 0) away_key_players = away_sq.get('key_players', 0) home_missing_impact = home_sq.get('missing_impact', 0.0) away_missing_impact = away_sq.get('missing_impact', 0.0) home_goals_form = home_sq.get('goals_form', 0) away_goals_form = away_sq.get('goals_form', 0) # === ASSEMBLE ROW === row = { "match_id": mid, "home_team_id": hid, "away_team_id": aid, "league_id": lid, "mst_utc": mst, **elo_features, **form_features, **h2h_features, **stats_features, **odds_features, "home_xga": form_features["home_conceded_avg"], "away_xga": form_features["away_conceded_avg"], **league_features, "upset_atmosphere": upset_feats.get("upset_atmosphere", 0.0), "upset_motivation": upset_feats.get("upset_motivation", 0.0), "upset_fatigue": upset_feats.get("upset_fatigue", 0.0), "upset_potential": upset_feats.get("upset_potential", 0.0), "referee_home_bias": referee_feats.get("referee_home_bias", 0.0), "referee_avg_goals": referee_feats.get("referee_avg_goals", 2.7), "referee_cards_total": referee_feats.get("referee_cards_total", 4.0), "referee_avg_yellow": referee_feats.get("referee_avg_yellow", 3.5), "referee_experience": referee_feats.get("referee_experience", 0.5), "home_momentum_score": home_momentum_score, "away_momentum_score": away_momentum_score, "momentum_diff": momentum_diff, # Squad Features "home_squad_quality": home_squad_quality, "away_squad_quality": away_squad_quality, "squad_diff": squad_diff, "home_key_players": home_key_players, "away_key_players": away_key_players, "home_missing_impact": home_missing_impact, "away_missing_impact": away_missing_impact, "home_goals_form": home_goals_form, "away_goals_form": away_goals_form, # Labels "score_home": sh, "score_away": sa, "total_goals": total_goals, "ht_score_home": hth if hth is not None else "", "ht_score_away": hta if hta is not None else "", "ht_total_goals": ht_total if ht_total is not None else "", "label_ms": label_ms, "label_ou05": label_ou05, "label_ou15": label_ou15, "label_ou25": label_ou25, "label_ou35": label_ou35, "label_btts": label_btts, "label_ht_result": label_ht_result if label_ht_result is not None else "", "label_ht_ou05": label_ht_ou05 if label_ht_ou05 is not None else "", "label_ht_ou15": label_ht_ou15 if label_ht_ou15 is not None else "", "label_ht_ft": label_ht_ft if label_ht_ft is not None else "", "label_odd_even": label_odd_even, "label_yellow_cards": label_yellow_cards, "label_cards_ou45": label_cards_ou45, "label_handicap_ms": label_handicap_ms, } return row # ------------------------------------------------------------------------- # ELO (simplified inline version — doesn't need DB, grows incrementally) # ------------------------------------------------------------------------- def _update_elo(self, home_id, away_id, score_home, score_away): """Update ELO ratings after a match.""" h = self.elo_ratings[home_id] a = self.elo_ratings[away_id] HOME_ADVANTAGE = 65 K_BASE = 32 # Expected scores exp_h = 1.0 / (1 + 10 ** ((a["overall"] - h["overall"] - HOME_ADVANTAGE) / 400)) exp_a = 1.0 - exp_h # Actual scores if score_home > score_away: actual_h, actual_a = 1.0, 0.0 elif score_home < score_away: actual_h, actual_a = 0.0, 1.0 else: actual_h, actual_a = 0.5, 0.5 # Goal difference multiplier gd = abs(score_home - score_away) gd_mult = math.log(max(gd, 1) + 1) * 0.7 + 1.0 # Dynamic K k_h = K_BASE * gd_mult * (1.3 if h["matches"] < 10 else 1.0) k_a = K_BASE * gd_mult * (1.3 if a["matches"] < 10 else 1.0) delta_h = k_h * (actual_h - exp_h) delta_a = k_a * (actual_a - exp_a) # Update all ELO variants h["overall"] += delta_h a["overall"] += delta_a h["home"] += delta_h * 1.1 a["away"] += delta_a * 1.1 # Form ELO (heavier weight on recent) form_k = K_BASE * 1.5 * gd_mult h["form"] = h["form"] * 0.85 + (h["form"] + form_k * (actual_h - exp_h)) * 0.15 a["form"] = a["form"] * 0.85 + (a["form"] + form_k * (actual_a - exp_a)) * 0.15 h["matches"] += 1 a["matches"] += 1 # ------------------------------------------------------------------------- # FORM (last 5 matches) # ------------------------------------------------------------------------- def _get_form_features(self, home_id, away_id, before_date) -> dict: """Get form features for both teams.""" h_form = self._calc_team_form(home_id, before_date) a_form = self._calc_team_form(away_id, before_date) return { "home_goals_avg": h_form["goals_avg"], "home_conceded_avg": h_form["conceded_avg"], "away_goals_avg": a_form["goals_avg"], "away_conceded_avg": a_form["conceded_avg"], "home_clean_sheet_rate": h_form["clean_sheet_rate"], "away_clean_sheet_rate": a_form["clean_sheet_rate"], "home_scoring_rate": h_form["scoring_rate"], "away_scoring_rate": a_form["scoring_rate"], "home_winning_streak": h_form["winning_streak"], "away_winning_streak": a_form["winning_streak"], "home_unbeaten_streak": h_form["unbeaten_streak"], "away_unbeaten_streak": a_form["unbeaten_streak"], } def _calc_team_form(self, team_id, before_date, limit=5) -> dict: """Calculate form from last N matches (weighted moving average).""" history = self.loader.team_matches.get(team_id, []) # Filter and get last `limit` matches before date valid_matches = [m for m in history if m[0] < before_date] rows = valid_matches[-limit:] if valid_matches else [] if not rows: return { "goals_avg": 1.3, "conceded_avg": 1.2, "clean_sheet_rate": 0.25, "scoring_rate": 0.75, "winning_streak": 0, "unbeaten_streak": 0, } w_goals, w_conceded, w_total = 0.0, 0.0, 0.0 clean_sheets, scored_matches = 0, 0 winning_streak, unbeaten_streak = 0, 0 streak_broken, unbeaten_broken = False, False for i, (mst, is_home, team_goals, opp_goals, opp_id) in enumerate(reversed(rows)): weight = float(limit - i) w_goals += team_goals * weight w_conceded += opp_goals * weight w_total += weight if opp_goals == 0: clean_sheets += 1 if team_goals > 0: scored_matches += 1 if not streak_broken: if team_goals > opp_goals: winning_streak += 1 else: streak_broken = True if not unbeaten_broken: if team_goals >= opp_goals: unbeaten_streak += 1 else: unbeaten_broken = True n = len(rows) return { "goals_avg": w_goals / w_total if w_total > 0 else 1.3, "conceded_avg": w_conceded / w_total if w_total > 0 else 1.2, "clean_sheet_rate": clean_sheets / n, "scoring_rate": scored_matches / n, "winning_streak": winning_streak, "unbeaten_streak": unbeaten_streak, } # ------------------------------------------------------------------------- # H2H # ------------------------------------------------------------------------- def _get_h2h_features(self, home_id, away_id, before_date) -> dict: """Get head-to-head features.""" h_history = self.loader.team_matches.get(home_id, []) # Matches against away_id before date h2h_matches = [m for m in h_history if m[4] == away_id and m[0] < before_date] rows = h2h_matches[-20:] if h2h_matches else [] if not rows: return { "h2h_total_matches": 0, "h2h_home_win_rate": 0.33, "h2h_draw_rate": 0.33, "h2h_avg_goals": 2.5, "h2h_btts_rate": 0.5, "h2h_over25_rate": 0.5, } home_wins, draws = 0, 0 total_goals_sum = 0 btts_count, over25_count = 0, 0 n = len(rows) for mst, is_our_home, team_goals, opp_goals, opp_id in rows: sh = team_goals if is_our_home else opp_goals sa = opp_goals if is_our_home else team_goals total = sh + sa total_goals_sum += total if sh > 0 and sa > 0: btts_count += 1 if total > 2.5: over25_count += 1 # Determine result relative to our home team if is_our_home: if sh > sa: home_wins += 1 elif sh == sa: draws += 1 else: if sa > sh: home_wins += 1 elif sh == sa: draws += 1 return { "h2h_total_matches": n, "h2h_home_win_rate": home_wins / n, "h2h_draw_rate": draws / n, "h2h_avg_goals": total_goals_sum / n, "h2h_btts_rate": btts_count / n, "h2h_over25_rate": over25_count / n, } # ------------------------------------------------------------------------- # TEAM STATS (possession, shots, corners) # ------------------------------------------------------------------------- def _get_team_stats_features(self, home_id, away_id, before_date) -> dict: """Get team-level match stats features.""" h_stats = self._calc_team_stats(home_id, before_date) a_stats = self._calc_team_stats(away_id, before_date) return { "home_avg_possession": h_stats["possession"], "away_avg_possession": a_stats["possession"], "home_avg_shots_on_target": h_stats["shots_on_target"], "away_avg_shots_on_target": a_stats["shots_on_target"], "home_shot_conversion": h_stats["shot_conversion"], "away_shot_conversion": a_stats["shot_conversion"], "home_avg_corners": h_stats["corners"], "away_avg_corners": a_stats["corners"], } def _calc_team_stats(self, team_id, before_date, limit=10) -> dict: """Calculate team stats from match_team_stats table.""" defaults = { "possession": 0.50, "shots_on_target": 3.5, "shot_conversion": 0.10, "corners": 4.5 } stats_history = self.loader.team_stats.get(team_id, []) valid_stats = [s for s in stats_history if s[0] < before_date] rows = valid_stats[-limit:] if valid_stats else [] if not rows: return defaults poss_sum, sot_sum, shots_sum, corners_sum = 0.0, 0.0, 0.0, 0.0 goals_scored = 0 poss_count = 0 n = len(rows) for mst, poss, sot, total_shots, corners, team_goals in rows: if poss and poss > 0: poss_sum += float(poss) poss_count += 1 sot_sum += float(sot or 0) shots_sum += float(total_shots or 0) corners_sum += float(corners or 0) goals_scored += float(team_goals or 0) return { "possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50, "shots_on_target": sot_sum / n, "shot_conversion": goals_scored / shots_sum if shots_sum > 0 else 0.10, "corners": corners_sum / n, } # ============================================================================= # MAIN # ============================================================================= def main(): print("🚀 XGBoost Training Data Extraction") print("=" * 60) # Load top leagues with open(TOP_LEAGUES_PATH) as f: top_leagues = json.load(f) print(f"📋 {len(top_leagues)} top leagues") # Connect conn = get_conn() # Batch load print("\n📦 Loading batch data...") loader = BatchDataLoader(conn, top_leagues) loader.load_all() # Extract features extractor = FeatureExtractor(conn, loader) rows = extractor.extract_all() if not rows: print("❌ No data extracted!") return # Write CSV print(f"\n💾 Writing {len(rows)} rows to {OUTPUT_CSV}...") with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=FEATURE_COLS) writer.writeheader() writer.writerows(rows) # Summary stats print(f"\n📊 Data Summary:") print(f" Total rows: {len(rows)}") # Label distributions ms_dist = defaultdict(int) ou25_dist = defaultdict(int) btts_dist = defaultdict(int) with_odds = sum(1 for r in rows if r.get("odds_ms_h", 0) > 0) for r in rows: ms_dist[r["label_ms"]] += 1 ou25_dist[r["label_ou25"]] += 1 btts_dist[r["label_btts"]] += 1 n = len(rows) print(f" With odds: {with_odds} ({with_odds/n*100:.1f}%)") print(f" MS dist: Home={ms_dist[0]/n*100:.1f}% Draw={ms_dist[1]/n*100:.1f}% Away={ms_dist[2]/n*100:.1f}%") print(f" O/U 2.5: Over={ou25_dist[1]/n*100:.1f}% Under={ou25_dist[0]/n*100:.1f}%") print(f" BTTS: Yes={btts_dist[1]/n*100:.1f}% No={btts_dist[0]/n*100:.1f}%") # HT/FT Distribution htft_dist = defaultdict(int) for r in rows: if r.get("label_ht_ft") is not None and r.get("label_ht_ft") != "": htft_dist[r["label_ht_ft"]] += 1 print(f"\n HT/FT Distribution:") # Interesting ones: 1/2 (2) and 2/1 (6) rev_1_2 = htft_dist.get(2, 0) rev_2_1 = htft_dist.get(6, 0) print(f" 1/2 (Home->Away): {rev_1_2} ({rev_1_2/n*100:.2f}%)") print(f" 2/1 (Away->Home): {rev_2_1} ({rev_2_1/n*100:.2f}%)") print(f" 1/1: {htft_dist.get(0,0)} | X/X: {htft_dist.get(4,0)} | 2/2: {htft_dist.get(8,0)}") # Feature NaN check nan_cols = [] for col in FEATURE_COLS: nans = sum(1 for r in rows if r.get(col, "") == "" or r.get(col) is None) if nans > 0 and col not in ("ht_score_home", "ht_score_away", "ht_total_goals", "label_ht_result", "label_ht_ou05", "label_ht_ou15"): nan_cols.append((col, nans)) if nan_cols: print(f"\n ⚠️ Columns with missing values:") for col, cnt in nan_cols: print(f" {col}: {cnt} ({cnt/n*100:.1f}%)") else: print(f"\n ✅ No missing values in feature columns!") print(f"\n✅ Done! Output: {OUTPUT_CSV}") conn.close() if __name__ == "__main__": main()