Files
iddaai-be/ai-engine/scripts/extract_training_data.py
2026-04-22 02:17:02 +03:00

1181 lines
46 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
XGBoost Training Data Extraction
=================================
Batch feature extraction for top-league matches.
Extracts ~82 features + labels per match for XGBoost model training.
Usage:
python3 scripts/extract_training_data.py
"""
import os
import sys
import json
import csv
import math
import time
from datetime import datetime
from collections import defaultdict
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv
load_dotenv()
# =============================================================================
# CONFIG
# =============================================================================
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)
from features.upset_engine import get_upset_engine
from features.referee_engine import get_referee_engine
from features.momentum_engine import get_momentum_engine
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# Ensure output dir exists
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
def get_conn():
db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0]
return psycopg2.connect(db_url)
# =============================================================================
# FEATURE COLUMNS (ORDER MATTERS — matches CSV header)
# =============================================================================
FEATURE_COLS = [
# Match identifiers
"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
# ELO Features (8)
"home_overall_elo", "away_overall_elo", "elo_diff",
"home_home_elo", "away_away_elo",
"home_form_elo", "away_form_elo", "form_elo_diff",
# Form Features (12)
"home_goals_avg", "home_conceded_avg",
"away_goals_avg", "away_conceded_avg",
"home_clean_sheet_rate", "away_clean_sheet_rate",
"home_scoring_rate", "away_scoring_rate",
"home_winning_streak", "away_winning_streak",
"home_unbeaten_streak", "away_unbeaten_streak",
# H2H Features (6)
"h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
"h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
# Team Stats Features (8)
"home_avg_possession", "away_avg_possession",
"home_avg_shots_on_target", "away_avg_shots_on_target",
"home_shot_conversion", "away_shot_conversion",
"home_avg_corners", "away_avg_corners",
# Odds Features (24)
"odds_ms_h", "odds_ms_d", "odds_ms_a",
"implied_home", "implied_draw", "implied_away",
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
"odds_ou05_o", "odds_ou05_u",
"odds_ou15_o", "odds_ou15_u",
"odds_ou25_o", "odds_ou25_u",
"odds_ou35_o", "odds_ou35_u",
"odds_ht_ou05_o", "odds_ht_ou05_u",
"odds_ht_ou15_o", "odds_ht_ou15_u",
"odds_btts_y", "odds_btts_n",
"odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
"odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
"odds_ou05_o_present", "odds_ou05_u_present",
"odds_ou15_o_present", "odds_ou15_u_present",
"odds_ou25_o_present", "odds_ou25_u_present",
"odds_ou35_o_present", "odds_ou35_u_present",
"odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
"odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
"odds_btts_y_present", "odds_btts_n_present",
# Defensive/League Features (4)
"home_xga", "away_xga",
"league_avg_goals", "league_zero_goal_rate",
# Upset Engine (4)
"upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
# Referee Engine (5)
"referee_home_bias", "referee_avg_goals", "referee_cards_total",
"referee_avg_yellow", "referee_experience",
# Momentum Engine (3)
"home_momentum_score", "away_momentum_score", "momentum_diff",
# Squad Features (9)
"home_squad_quality", "away_squad_quality", "squad_diff",
"home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form",
# Labels
"score_home", "score_away", "total_goals",
"ht_score_home", "ht_score_away", "ht_total_goals",
"label_ms", # 0=Home, 1=Draw, 2=Away
"label_ou05", # 0=Under, 1=Over
"label_ou15",
"label_ou25",
"label_ou35",
"label_btts", # 0=No, 1=Yes
"label_ht_result", # 0=Home, 1=Draw, 2=Away
"label_ht_ou05", # 0=Under, 1=Over
"label_ht_ou15",
"label_ht_ft", # 0=1/1, 1=1/X, 2=1/2 ... 8=2/2
"label_odd_even", # 1=Odd, 0=Even
"label_yellow_cards",# Count of cards (yellow=1, red=2)
"label_cards_ou45", # 0=Under 4.5, 1=Over 4.5
"label_handicap_ms", # Handikap (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
]
# =============================================================================
# BATCH LOADERS — Pre-load data to avoid N+1 queries
# =============================================================================
class BatchDataLoader:
"""Pre-loads all necessary data in bulk, then serves features per match."""
def __init__(self, conn, top_league_ids: list):
self.conn = conn
self.cur = conn.cursor()
self.top_league_ids = top_league_ids
# Pre-loaded data caches
self.matches = []
self.odds_cache = {} # match_id → {ms_h, ms_d, ms_a, ...}
self.team_stats_cache = {} # (team_id, before_date_bucket) → stats
self.form_cache = {} # (team_id, match_id) → form features
self.h2h_cache = {} # (home_id, away_id, match_id) → h2h features
self.league_stats_cache = {} # league_id → {avg_goals, zero_rate}
self.squad_cache = {} # (match_id, team_id) → {starting, goals, assists, key_players, ...}
self.cards_cache = {} # match_id → total cards scored
def load_all(self):
"""Load all data in batch."""
t0 = time.time()
self._load_matches()
print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True)
t1 = time.time()
self._load_odds()
print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True)
t2 = time.time()
self._load_league_stats()
print(f" ✅ League stats: {len(self.league_stats_cache)} leagues ({time.time()-t2:.1f}s)", flush=True)
t3 = time.time()
self._load_team_history()
print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True)
t4 = time.time()
self._load_squad_data()
print(f" ✅ Squad data: {len(self.squad_cache)} team-matches ({time.time()-t4:.1f}s)", flush=True)
t5 = time.time()
self._load_cards_data()
print(f" ✅ Cards data: {len(self.cards_cache)} matches ({time.time()-t5:.1f}s)", flush=True)
print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True)
def _load_matches(self):
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT m.id, m.home_team_id, m.away_team_id,
m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away,
m.mst_utc, m.league_id,
ht.name as home_name,
at.name as away_name,
l.name as league_name
FROM matches m
JOIN teams ht ON m.home_team_id = ht.id
JOIN teams at ON m.away_team_id = at.id
JOIN leagues l ON m.league_id = l.id
WHERE m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.sport = 'football'
AND m.league_id IN ({ph})
ORDER BY m.mst_utc ASC
""", self.top_league_ids)
self.matches = self.cur.fetchall()
def _load_odds(self):
"""Bulk load all odds for top league matches."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT oc.match_id, oc.name, os.name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
JOIN matches m ON oc.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
""", self.top_league_ids)
for match_id, cat_name, sel_name, odd_val in self.cur.fetchall():
try:
v = float(odd_val) if odd_val else 0
if v <= 0 or not cat_name or not sel_name:
continue
if match_id not in self.odds_cache:
self.odds_cache[match_id] = {}
cat_lower = cat_name.lower().strip()
sel_lower = sel_name.lower().strip()
# Match Result (1X2)
if cat_lower == 'maç sonucu':
if sel_name == '1': self.odds_cache[match_id]['ms_h'] = v
elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ms_d'] = v
elif sel_name == '2': self.odds_cache[match_id]['ms_a'] = v
# HT Result
elif cat_lower == '1. yarı sonucu':
if sel_name == '1': self.odds_cache[match_id]['ht_ms_h'] = v
elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ht_ms_d'] = v
elif sel_name == '2': self.odds_cache[match_id]['ht_ms_a'] = v
# BTTS
elif cat_lower == 'karşılıklı gol':
if 'var' in sel_lower: self.odds_cache[match_id]['btts_y'] = v
elif 'yok' in sel_lower: self.odds_cache[match_id]['btts_n'] = v
# Over/Under FT
elif cat_lower == '0,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou05_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou05_o'] = v
elif cat_lower == '1,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou15_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou15_o'] = v
elif cat_lower == '2,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou25_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou25_o'] = v
elif cat_lower == '3,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou35_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou35_o'] = v
# Over/Under HT
elif cat_lower == '1. yarı 0,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou05_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou05_o'] = v
elif cat_lower == '1. yarı 1,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou15_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou15_o'] = v
except (ValueError, TypeError):
pass
def _load_league_stats(self):
"""Calculate league-level aggregated stats."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT league_id,
AVG(score_home + score_away) as avg_goals,
AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate,
COUNT(*) as match_count
FROM matches
WHERE status = 'FT'
AND score_home IS NOT NULL
AND sport = 'football'
AND league_id IN ({ph})
GROUP BY league_id
""", self.top_league_ids)
for league_id, avg_goals, zero_rate, cnt in self.cur.fetchall():
self.league_stats_cache[league_id] = {
"avg_goals": float(avg_goals) if avg_goals else 2.5,
"zero_rate": float(zero_rate) if zero_rate else 0.07,
"match_count": cnt
}
def _load_team_history(self):
"""Bulk load all matches and team stats for lightning fast memory lookups"""
from collections import defaultdict
self.team_matches = defaultdict(list)
self.team_stats = defaultdict(list)
# Load all matches for form/h2h
self.cur.execute("""
SELECT home_team_id, away_team_id, score_home, score_away, mst_utc
FROM matches
WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football'
ORDER BY mst_utc ASC
""")
for hid, aid, sh, sa, mst in self.cur.fetchall():
# (mst, is_home, team_score, opp_score, opp_id)
self.team_matches[hid].append((mst, True, sh, sa, aid))
self.team_matches[aid].append((mst, False, sa, sh, hid))
# Load all football_team_stats (sport-partitioned schema)
self.cur.execute("""
SELECT mts.team_id, m.mst_utc, mts.possession_percentage,
mts.shots_on_target, mts.total_shots, mts.corners,
m.score_home, m.score_away, m.home_team_id
FROM football_team_stats mts
JOIN matches m ON mts.match_id = m.id
WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL
ORDER BY m.mst_utc ASC
""")
for tid, mst, poss, sot, tshots, corn, sh, sa, hid in self.cur.fetchall():
team_goals = sh if hid == tid else sa
self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))
def _load_squad_data(self):
"""Bulk load squad participation + player events for squad features."""
ph = ",".join(["%s"] * len(self.top_league_ids))
# 1) Participation: starting XI count + position distribution per (match, team)
self.cur.execute(f"""
SELECT mpp.match_id, mpp.team_id,
COUNT(*) FILTER (WHERE mpp.is_starting = true) AS starting_count,
COUNT(*) AS total_squad,
COUNT(*) FILTER (WHERE mpp.is_starting = true AND LOWER(COALESCE(mpp.position::TEXT,'')) ~ '(forward|fwd|forvet|striker)') AS fwd_count
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
GROUP BY mpp.match_id, mpp.team_id
""", self.top_league_ids)
participation = {}
for mid, tid, st_count, tot, fwd in self.cur.fetchall():
participation[(mid, tid)] = {
'starting_count': st_count or 0,
'total_squad': tot or 0,
'fwd_count': fwd or 0,
}
# 2) Player events: goals + assists per (match, team)
self.cur.execute(f"""
SELECT mpe.match_id, mpe.team_id,
COUNT(*) FILTER (
WHERE mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
) AS goal_count,
COUNT(DISTINCT mpe.assist_player_id) FILTER (
WHERE mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
) AS assist_count,
COUNT(DISTINCT mpe.player_id) FILTER (
WHERE mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
) AS unique_scorers
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
GROUP BY mpe.match_id, mpe.team_id
""", self.top_league_ids)
events = {}
for mid, tid, goals, assists, scorers in self.cur.fetchall():
events[(mid, tid)] = {
'goals': goals or 0,
'assists': assists or 0,
'unique_scorers': scorers or 0,
}
# 3) Key players: players with 3+ goals across all their matches, per team
self.cur.execute(f"""
SELECT mpe.team_id, mpe.player_id, COUNT(*) AS total_goals
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
AND mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
GROUP BY mpe.team_id, mpe.player_id
HAVING COUNT(*) >= 3
""", self.top_league_ids)
key_players_by_team = defaultdict(set)
for tid, pid, _ in self.cur.fetchall():
key_players_by_team[tid].add(pid)
# 4) Starting key players per (match, team)
self.cur.execute(f"""
SELECT mpp.match_id, mpp.team_id, mpp.player_id
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE mpp.is_starting = true
AND m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
""", self.top_league_ids)
starting_players = defaultdict(list)
for mid, tid, pid in self.cur.fetchall():
starting_players[(mid, tid)].append(pid)
# 5) Build combined cache
all_keys = set(participation.keys()) | set(events.keys())
for key in all_keys:
mid, tid = key
part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0})
evt = events.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0})
# Count key players in starting XI
starters = starting_players.get(key, [])
kp_in_starting = sum(1 for p in starters if p in key_players_by_team.get(tid, set()))
kp_total = len(key_players_by_team.get(tid, set()))
kp_missing = max(0, kp_total - kp_in_starting)
# Squad quality: composite score
squad_quality = (
part['starting_count'] * 0.3 +
evt['goals'] * 2.0 +
evt['assists'] * 1.0 +
kp_in_starting * 3.0 +
part['fwd_count'] * 1.5
)
# Missing impact: how many key players are missing
missing_impact = min(kp_missing / max(kp_total, 1), 1.0)
self.squad_cache[key] = {
'squad_quality': squad_quality,
'key_players': kp_in_starting,
'missing_impact': missing_impact,
'goals_form': evt['goals'],
}
def _load_cards_data(self):
"""Bulk load all distinct cards per match (yellow=1, red=2)."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT mpe.match_id,
SUM(CASE
WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2
ELSE 1 END) as cards_weight
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
AND mpe.event_type::text LIKE '%%card%%'
GROUP BY mpe.match_id
""", self.top_league_ids)
for mid, cards_weight in self.cur.fetchall():
self.cards_cache[mid] = float(cards_weight) if cards_weight else 0.0
class FeatureExtractor:
"""Extract features for a single match using pre-loaded data + on-demand queries."""
def __init__(self, conn, loader: BatchDataLoader):
self.conn = conn
self.cur = conn.cursor()
self.loader = loader
# ELO cache: team_id → {overall, home, away, form}
self.elo_ratings = defaultdict(lambda: {
"overall": 1500.0, "home": 1500.0,
"away": 1500.0, "form": 1500.0,
"matches": 0
})
self._elo_initialized = False
self.upset_engine = get_upset_engine()
self.referee_engine = get_referee_engine()
self.momentum_engine = get_momentum_engine()
def extract_all(self) -> list:
"""Extract features for all matches, yield row dicts."""
matches = self.loader.matches
total = len(matches)
rows = []
skipped = 0
t_start = time.time()
print(f"\n🔄 Extracting features for {total} matches...", flush=True)
# Process chronologically — ELO grows as we go
for i, m in enumerate(matches):
(
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
) = m
if i % 100 == 0 and i > 0:
elapsed = time.time() - t_start
rate = i / elapsed # matches per second
remaining = (total - i) / rate if rate > 0 else 0
pct = i / total * 100
print(f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
row = self._extract_one(
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
)
if row:
rows.append(row)
else:
skipped += 1
# Update ELO after processing (so ELO is calculated BEFORE the match)
self._update_elo(hid, aid, sh, sa)
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
return rows
def _extract_one(
self,
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
):
"""Extract features for a single match."""
# === LABELS ===
total_goals = sh + sa
ht_total = (hth + hta) if hth is not None and hta is not None else None
if sh > sa:
label_ms = 0
elif sh < sa:
label_ms = 2
else:
label_ms = 1
label_ou05 = 1 if total_goals > 0.5 else 0
label_ou15 = 1 if total_goals > 1.5 else 0
label_ou25 = 1 if total_goals > 2.5 else 0
label_ou35 = 1 if total_goals > 3.5 else 0
label_btts = 1 if (sh > 0 and sa > 0) else 0
label_odd_even = 1 if total_goals % 2 != 0 else 0
# Handicap MS Label (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
score_diff = sh - sa
if score_diff >= 2:
label_handicap_ms = 0
elif score_diff == 1:
label_handicap_ms = 1
else:
label_handicap_ms = 2
# Cards Label
cards_count = self.loader.cards_cache.get(mid, 0.0)
label_yellow_cards = cards_count
label_cards_ou45 = 1 if cards_count > 4.5 else 0
# HT labels
label_ht_result = None
label_ht_ou05 = None
label_ht_ou15 = None
if hth is not None and hta is not None:
if hth > hta:
label_ht_result = 0
elif hth < hta:
label_ht_result = 2
else:
label_ht_result = 1
label_ht_ou05 = 1 if ht_total > 0.5 else 0
label_ht_ou15 = 1 if ht_total > 1.5 else 0
# HT/FT Label (0-8)
# 0: 1/1, 1: 1/X, 2: 1/2
# 3: X/1, 4: X/X, 5: X/2
# 6: 2/1, 7: 2/X, 8: 2/2
label_ht_ft = None
if label_ht_result is not None:
label_ht_ft = label_ht_result * 3 + label_ms
# === ELO FEATURES ===
h_elo = self.elo_ratings[hid]
a_elo = self.elo_ratings[aid]
elo_features = {
"home_overall_elo": h_elo["overall"],
"away_overall_elo": a_elo["overall"],
"elo_diff": h_elo["overall"] - a_elo["overall"],
"home_home_elo": h_elo["home"],
"away_away_elo": a_elo["away"],
"home_form_elo": h_elo["form"],
"away_form_elo": a_elo["form"],
"form_elo_diff": h_elo["form"] - a_elo["form"],
}
# === FORM FEATURES ===
form_features = self._get_form_features(hid, aid, mst)
# === H2H FEATURES ===
h2h_features = self._get_h2h_features(hid, aid, mst)
# === TEAM STATS FEATURES ===
stats_features = self._get_team_stats_features(hid, aid, mst)
# === ODDS FEATURES ===
odds = self.loader.odds_cache.get(mid, {})
ms_h = odds.get("ms_h", 0.0)
ms_d = odds.get("ms_d", 0.0)
ms_a = odds.get("ms_a", 0.0)
# Implied probabilities (normalized vig-free)
if ms_h > 0 and ms_d > 0 and ms_a > 0:
raw_sum = 1/ms_h + 1/ms_d + 1/ms_a
implied_home = (1/ms_h) / raw_sum
implied_draw = (1/ms_d) / raw_sum
implied_away = (1/ms_a) / raw_sum
else:
implied_home = implied_draw = implied_away = 0.33
odds_features = {
"odds_ms_h": ms_h,
"odds_ms_d": ms_d,
"odds_ms_a": ms_a,
"implied_home": implied_home,
"implied_draw": implied_draw,
"implied_away": implied_away,
"odds_ht_ms_h": odds.get("ht_ms_h", 0.0),
"odds_ht_ms_d": odds.get("ht_ms_d", 0.0),
"odds_ht_ms_a": odds.get("ht_ms_a", 0.0),
"odds_ou05_o": odds.get("ou05_o", 0.0),
"odds_ou05_u": odds.get("ou05_u", 0.0),
"odds_ou15_o": odds.get("ou15_o", 0.0),
"odds_ou15_u": odds.get("ou15_u", 0.0),
"odds_ou25_o": odds.get("ou25_o", 0.0),
"odds_ou25_u": odds.get("ou25_u", 0.0),
"odds_ou35_o": odds.get("ou35_o", 0.0),
"odds_ou35_u": odds.get("ou35_u", 0.0),
"odds_ht_ou05_o": odds.get("ht_ou05_o", 0.0),
"odds_ht_ou05_u": odds.get("ht_ou05_u", 0.0),
"odds_ht_ou15_o": odds.get("ht_ou15_o", 0.0),
"odds_ht_ou15_u": odds.get("ht_ou15_u", 0.0),
"odds_btts_y": odds.get("btts_y", 0.0),
"odds_btts_n": odds.get("btts_n", 0.0),
"odds_ms_h_present": 1.0 if ms_h > 1.01 else 0.0,
"odds_ms_d_present": 1.0 if ms_d > 1.01 else 0.0,
"odds_ms_a_present": 1.0 if ms_a > 1.01 else 0.0,
"odds_ht_ms_h_present": 1.0 if odds.get("ht_ms_h", 0.0) > 1.01 else 0.0,
"odds_ht_ms_d_present": 1.0 if odds.get("ht_ms_d", 0.0) > 1.01 else 0.0,
"odds_ht_ms_a_present": 1.0 if odds.get("ht_ms_a", 0.0) > 1.01 else 0.0,
"odds_ou05_o_present": 1.0 if odds.get("ou05_o", 0.0) > 1.01 else 0.0,
"odds_ou05_u_present": 1.0 if odds.get("ou05_u", 0.0) > 1.01 else 0.0,
"odds_ou15_o_present": 1.0 if odds.get("ou15_o", 0.0) > 1.01 else 0.0,
"odds_ou15_u_present": 1.0 if odds.get("ou15_u", 0.0) > 1.01 else 0.0,
"odds_ou25_o_present": 1.0 if odds.get("ou25_o", 0.0) > 1.01 else 0.0,
"odds_ou25_u_present": 1.0 if odds.get("ou25_u", 0.0) > 1.01 else 0.0,
"odds_ou35_o_present": 1.0 if odds.get("ou35_o", 0.0) > 1.01 else 0.0,
"odds_ou35_u_present": 1.0 if odds.get("ou35_u", 0.0) > 1.01 else 0.0,
"odds_ht_ou05_o_present": 1.0 if odds.get("ht_ou05_o", 0.0) > 1.01 else 0.0,
"odds_ht_ou05_u_present": 1.0 if odds.get("ht_ou05_u", 0.0) > 1.01 else 0.0,
"odds_ht_ou15_o_present": 1.0 if odds.get("ht_ou15_o", 0.0) > 1.01 else 0.0,
"odds_ht_ou15_u_present": 1.0 if odds.get("ht_ou15_u", 0.0) > 1.01 else 0.0,
"odds_btts_y_present": 1.0 if odds.get("btts_y", 0.0) > 1.01 else 0.0,
"odds_btts_n_present": 1.0 if odds.get("btts_n", 0.0) > 1.01 else 0.0,
}
# === LEAGUE FEATURES ===
league = self.loader.league_stats_cache.get(lid, {"avg_goals": 2.5, "zero_rate": 0.07})
league_features = {
"league_avg_goals": league["avg_goals"],
"league_zero_goal_rate": league["zero_rate"],
}
# === UPSET FEATURES ===
try:
upset_feats = self.upset_engine.get_features(
home_team_name=home_name or "",
home_team_id=hid,
away_team_name=away_name or "",
league_name=league_name or "",
home_position=10,
away_position=10,
match_date_ms=mst,
)
except Exception:
upset_feats = {
"upset_atmosphere": 0.0,
"upset_motivation": 0.0,
"upset_fatigue": 0.0,
"upset_potential": 0.0,
}
# === REFEREE FEATURES ===
try:
referee_feats = self.referee_engine.get_features(
match_id=mid,
league_id=str(lid),
)
except Exception:
referee_feats = {
"referee_home_bias": 0.0,
"referee_avg_goals": 2.7,
"referee_cards_total": 4.0,
"referee_avg_yellow": 3.5,
"referee_avg_red": 0.1,
"referee_experience": 0.5,
}
# === MOMENTUM FEATURES ===
try:
momentum_feats = self.momentum_engine.get_features(
home_team_id=hid,
away_team_id=aid,
match_date_ms=mst,
)
home_momentum_score = momentum_feats.get("home_momentum_score", 0.5)
away_momentum_score = momentum_feats.get("away_momentum_score", 0.5)
momentum_diff = momentum_feats.get(
"momentum_diff", home_momentum_score - away_momentum_score
)
except Exception:
home_momentum_score = 0.5
away_momentum_score = 0.5
momentum_diff = 0.0
# === SQUAD FEATURES ===
home_sq = self.loader.squad_cache.get((mid, hid), {})
away_sq = self.loader.squad_cache.get((mid, aid), {})
home_squad_quality = home_sq.get('squad_quality', 0.0)
away_squad_quality = away_sq.get('squad_quality', 0.0)
squad_diff = home_squad_quality - away_squad_quality
home_key_players = home_sq.get('key_players', 0)
away_key_players = away_sq.get('key_players', 0)
home_missing_impact = home_sq.get('missing_impact', 0.0)
away_missing_impact = away_sq.get('missing_impact', 0.0)
home_goals_form = home_sq.get('goals_form', 0)
away_goals_form = away_sq.get('goals_form', 0)
# === ASSEMBLE ROW ===
row = {
"match_id": mid,
"home_team_id": hid,
"away_team_id": aid,
"league_id": lid,
"mst_utc": mst,
**elo_features,
**form_features,
**h2h_features,
**stats_features,
**odds_features,
"home_xga": form_features["home_conceded_avg"],
"away_xga": form_features["away_conceded_avg"],
**league_features,
"upset_atmosphere": upset_feats.get("upset_atmosphere", 0.0),
"upset_motivation": upset_feats.get("upset_motivation", 0.0),
"upset_fatigue": upset_feats.get("upset_fatigue", 0.0),
"upset_potential": upset_feats.get("upset_potential", 0.0),
"referee_home_bias": referee_feats.get("referee_home_bias", 0.0),
"referee_avg_goals": referee_feats.get("referee_avg_goals", 2.7),
"referee_cards_total": referee_feats.get("referee_cards_total", 4.0),
"referee_avg_yellow": referee_feats.get("referee_avg_yellow", 3.5),
"referee_experience": referee_feats.get("referee_experience", 0.5),
"home_momentum_score": home_momentum_score,
"away_momentum_score": away_momentum_score,
"momentum_diff": momentum_diff,
# Squad Features
"home_squad_quality": home_squad_quality,
"away_squad_quality": away_squad_quality,
"squad_diff": squad_diff,
"home_key_players": home_key_players,
"away_key_players": away_key_players,
"home_missing_impact": home_missing_impact,
"away_missing_impact": away_missing_impact,
"home_goals_form": home_goals_form,
"away_goals_form": away_goals_form,
# Labels
"score_home": sh,
"score_away": sa,
"total_goals": total_goals,
"ht_score_home": hth if hth is not None else "",
"ht_score_away": hta if hta is not None else "",
"ht_total_goals": ht_total if ht_total is not None else "",
"label_ms": label_ms,
"label_ou05": label_ou05,
"label_ou15": label_ou15,
"label_ou25": label_ou25,
"label_ou35": label_ou35,
"label_btts": label_btts,
"label_ht_result": label_ht_result if label_ht_result is not None else "",
"label_ht_ou05": label_ht_ou05 if label_ht_ou05 is not None else "",
"label_ht_ou15": label_ht_ou15 if label_ht_ou15 is not None else "",
"label_ht_ft": label_ht_ft if label_ht_ft is not None else "",
"label_odd_even": label_odd_even,
"label_yellow_cards": label_yellow_cards,
"label_cards_ou45": label_cards_ou45,
"label_handicap_ms": label_handicap_ms,
}
return row
# -------------------------------------------------------------------------
# ELO (simplified inline version — doesn't need DB, grows incrementally)
# -------------------------------------------------------------------------
def _update_elo(self, home_id, away_id, score_home, score_away):
"""Update ELO ratings after a match."""
h = self.elo_ratings[home_id]
a = self.elo_ratings[away_id]
HOME_ADVANTAGE = 65
K_BASE = 32
# Expected scores
exp_h = 1.0 / (1 + 10 ** ((a["overall"] - h["overall"] - HOME_ADVANTAGE) / 400))
exp_a = 1.0 - exp_h
# Actual scores
if score_home > score_away:
actual_h, actual_a = 1.0, 0.0
elif score_home < score_away:
actual_h, actual_a = 0.0, 1.0
else:
actual_h, actual_a = 0.5, 0.5
# Goal difference multiplier
gd = abs(score_home - score_away)
gd_mult = math.log(max(gd, 1) + 1) * 0.7 + 1.0
# Dynamic K
k_h = K_BASE * gd_mult * (1.3 if h["matches"] < 10 else 1.0)
k_a = K_BASE * gd_mult * (1.3 if a["matches"] < 10 else 1.0)
delta_h = k_h * (actual_h - exp_h)
delta_a = k_a * (actual_a - exp_a)
# Update all ELO variants
h["overall"] += delta_h
a["overall"] += delta_a
h["home"] += delta_h * 1.1
a["away"] += delta_a * 1.1
# Form ELO (heavier weight on recent)
form_k = K_BASE * 1.5 * gd_mult
h["form"] = h["form"] * 0.85 + (h["form"] + form_k * (actual_h - exp_h)) * 0.15
a["form"] = a["form"] * 0.85 + (a["form"] + form_k * (actual_a - exp_a)) * 0.15
h["matches"] += 1
a["matches"] += 1
# -------------------------------------------------------------------------
# FORM (last 5 matches)
# -------------------------------------------------------------------------
def _get_form_features(self, home_id, away_id, before_date) -> dict:
"""Get form features for both teams."""
h_form = self._calc_team_form(home_id, before_date)
a_form = self._calc_team_form(away_id, before_date)
return {
"home_goals_avg": h_form["goals_avg"],
"home_conceded_avg": h_form["conceded_avg"],
"away_goals_avg": a_form["goals_avg"],
"away_conceded_avg": a_form["conceded_avg"],
"home_clean_sheet_rate": h_form["clean_sheet_rate"],
"away_clean_sheet_rate": a_form["clean_sheet_rate"],
"home_scoring_rate": h_form["scoring_rate"],
"away_scoring_rate": a_form["scoring_rate"],
"home_winning_streak": h_form["winning_streak"],
"away_winning_streak": a_form["winning_streak"],
"home_unbeaten_streak": h_form["unbeaten_streak"],
"away_unbeaten_streak": a_form["unbeaten_streak"],
}
def _calc_team_form(self, team_id, before_date, limit=5) -> dict:
"""Calculate form from last N matches (weighted moving average)."""
history = self.loader.team_matches.get(team_id, [])
# Filter and get last `limit` matches before date
valid_matches = [m for m in history if m[0] < before_date]
rows = valid_matches[-limit:] if valid_matches else []
if not rows:
return {
"goals_avg": 1.3, "conceded_avg": 1.2,
"clean_sheet_rate": 0.25, "scoring_rate": 0.75,
"winning_streak": 0, "unbeaten_streak": 0,
}
w_goals, w_conceded, w_total = 0.0, 0.0, 0.0
clean_sheets, scored_matches = 0, 0
winning_streak, unbeaten_streak = 0, 0
streak_broken, unbeaten_broken = False, False
for i, (mst, is_home, team_goals, opp_goals, opp_id) in enumerate(reversed(rows)):
weight = float(limit - i)
w_goals += team_goals * weight
w_conceded += opp_goals * weight
w_total += weight
if opp_goals == 0:
clean_sheets += 1
if team_goals > 0:
scored_matches += 1
if not streak_broken:
if team_goals > opp_goals:
winning_streak += 1
else:
streak_broken = True
if not unbeaten_broken:
if team_goals >= opp_goals:
unbeaten_streak += 1
else:
unbeaten_broken = True
n = len(rows)
return {
"goals_avg": w_goals / w_total if w_total > 0 else 1.3,
"conceded_avg": w_conceded / w_total if w_total > 0 else 1.2,
"clean_sheet_rate": clean_sheets / n,
"scoring_rate": scored_matches / n,
"winning_streak": winning_streak,
"unbeaten_streak": unbeaten_streak,
}
# -------------------------------------------------------------------------
# H2H
# -------------------------------------------------------------------------
def _get_h2h_features(self, home_id, away_id, before_date) -> dict:
"""Get head-to-head features."""
h_history = self.loader.team_matches.get(home_id, [])
# Matches against away_id before date
h2h_matches = [m for m in h_history if m[4] == away_id and m[0] < before_date]
rows = h2h_matches[-20:] if h2h_matches else []
if not rows:
return {
"h2h_total_matches": 0,
"h2h_home_win_rate": 0.33,
"h2h_draw_rate": 0.33,
"h2h_avg_goals": 2.5,
"h2h_btts_rate": 0.5,
"h2h_over25_rate": 0.5,
}
home_wins, draws = 0, 0
total_goals_sum = 0
btts_count, over25_count = 0, 0
n = len(rows)
for mst, is_our_home, team_goals, opp_goals, opp_id in rows:
sh = team_goals if is_our_home else opp_goals
sa = opp_goals if is_our_home else team_goals
total = sh + sa
total_goals_sum += total
if sh > 0 and sa > 0:
btts_count += 1
if total > 2.5:
over25_count += 1
# Determine result relative to our home team
if is_our_home:
if sh > sa: home_wins += 1
elif sh == sa: draws += 1
else:
if sa > sh: home_wins += 1
elif sh == sa: draws += 1
return {
"h2h_total_matches": n,
"h2h_home_win_rate": home_wins / n,
"h2h_draw_rate": draws / n,
"h2h_avg_goals": total_goals_sum / n,
"h2h_btts_rate": btts_count / n,
"h2h_over25_rate": over25_count / n,
}
# -------------------------------------------------------------------------
# TEAM STATS (possession, shots, corners)
# -------------------------------------------------------------------------
def _get_team_stats_features(self, home_id, away_id, before_date) -> dict:
"""Get team-level match stats features."""
h_stats = self._calc_team_stats(home_id, before_date)
a_stats = self._calc_team_stats(away_id, before_date)
return {
"home_avg_possession": h_stats["possession"],
"away_avg_possession": a_stats["possession"],
"home_avg_shots_on_target": h_stats["shots_on_target"],
"away_avg_shots_on_target": a_stats["shots_on_target"],
"home_shot_conversion": h_stats["shot_conversion"],
"away_shot_conversion": a_stats["shot_conversion"],
"home_avg_corners": h_stats["corners"],
"away_avg_corners": a_stats["corners"],
}
def _calc_team_stats(self, team_id, before_date, limit=10) -> dict:
"""Calculate team stats from match_team_stats table."""
defaults = {
"possession": 0.50, "shots_on_target": 3.5,
"shot_conversion": 0.10, "corners": 4.5
}
stats_history = self.loader.team_stats.get(team_id, [])
valid_stats = [s for s in stats_history if s[0] < before_date]
rows = valid_stats[-limit:] if valid_stats else []
if not rows:
return defaults
poss_sum, sot_sum, shots_sum, corners_sum = 0.0, 0.0, 0.0, 0.0
goals_scored = 0
poss_count = 0
n = len(rows)
for mst, poss, sot, total_shots, corners, team_goals in rows:
if poss and poss > 0:
poss_sum += float(poss)
poss_count += 1
sot_sum += float(sot or 0)
shots_sum += float(total_shots or 0)
corners_sum += float(corners or 0)
goals_scored += float(team_goals or 0)
return {
"possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50,
"shots_on_target": sot_sum / n,
"shot_conversion": goals_scored / shots_sum if shots_sum > 0 else 0.10,
"corners": corners_sum / n,
}
# =============================================================================
# MAIN
# =============================================================================
def main():
print("🚀 XGBoost Training Data Extraction")
print("=" * 60)
# Load top leagues
with open(TOP_LEAGUES_PATH) as f:
top_leagues = json.load(f)
print(f"📋 {len(top_leagues)} top leagues")
# Connect
conn = get_conn()
# Batch load
print("\n📦 Loading batch data...")
loader = BatchDataLoader(conn, top_leagues)
loader.load_all()
# Extract features
extractor = FeatureExtractor(conn, loader)
rows = extractor.extract_all()
if not rows:
print("❌ No data extracted!")
return
# Write CSV
print(f"\n💾 Writing {len(rows)} rows to {OUTPUT_CSV}...")
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=FEATURE_COLS)
writer.writeheader()
writer.writerows(rows)
# Summary stats
print(f"\n📊 Data Summary:")
print(f" Total rows: {len(rows)}")
# Label distributions
ms_dist = defaultdict(int)
ou25_dist = defaultdict(int)
btts_dist = defaultdict(int)
with_odds = sum(1 for r in rows if r.get("odds_ms_h", 0) > 0)
for r in rows:
ms_dist[r["label_ms"]] += 1
ou25_dist[r["label_ou25"]] += 1
btts_dist[r["label_btts"]] += 1
n = len(rows)
print(f" With odds: {with_odds} ({with_odds/n*100:.1f}%)")
print(f" MS dist: Home={ms_dist[0]/n*100:.1f}% Draw={ms_dist[1]/n*100:.1f}% Away={ms_dist[2]/n*100:.1f}%")
print(f" O/U 2.5: Over={ou25_dist[1]/n*100:.1f}% Under={ou25_dist[0]/n*100:.1f}%")
print(f" BTTS: Yes={btts_dist[1]/n*100:.1f}% No={btts_dist[0]/n*100:.1f}%")
# HT/FT Distribution
htft_dist = defaultdict(int)
for r in rows:
if r.get("label_ht_ft") is not None and r.get("label_ht_ft") != "":
htft_dist[r["label_ht_ft"]] += 1
print(f"\n HT/FT Distribution:")
# Interesting ones: 1/2 (2) and 2/1 (6)
rev_1_2 = htft_dist.get(2, 0)
rev_2_1 = htft_dist.get(6, 0)
print(f" 1/2 (Home->Away): {rev_1_2} ({rev_1_2/n*100:.2f}%)")
print(f" 2/1 (Away->Home): {rev_2_1} ({rev_2_1/n*100:.2f}%)")
print(f" 1/1: {htft_dist.get(0,0)} | X/X: {htft_dist.get(4,0)} | 2/2: {htft_dist.get(8,0)}")
# Feature NaN check
nan_cols = []
for col in FEATURE_COLS:
nans = sum(1 for r in rows if r.get(col, "") == "" or r.get(col) is None)
if nans > 0 and col not in ("ht_score_home", "ht_score_away", "ht_total_goals",
"label_ht_result", "label_ht_ou05", "label_ht_ou15"):
nan_cols.append((col, nans))
if nan_cols:
print(f"\n ⚠️ Columns with missing values:")
for col, cnt in nan_cols:
print(f" {col}: {cnt} ({cnt/n*100:.1f}%)")
else:
print(f"\n ✅ No missing values in feature columns!")
print(f"\n✅ Done! Output: {OUTPUT_CSV}")
conn.close()
if __name__ == "__main__":
main()