Files
iddaai-be/ai-engine/scripts/extract_training_data.py
fahricansecer 94c7a4481a
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s
main
2026-05-17 02:17:22 +03:00

1572 lines
64 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
XGBoost Training Data Extraction
=================================
Batch feature extraction for top-league matches.
Extracts ~82 features + labels per match for XGBoost model training.
Usage:
python3 scripts/extract_training_data.py
"""
import os
import sys
import json
import csv
import math
import time
import bisect
from datetime import datetime
from collections import defaultdict
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv
load_dotenv()
# =============================================================================
# CONFIG
# =============================================================================
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)
from features.upset_engine import get_upset_engine
from features.referee_engine import get_referee_engine
from features.momentum_engine import get_momentum_engine
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# Ensure output dir exists
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
def get_conn():
db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0]
return psycopg2.connect(db_url)
# =============================================================================
# FEATURE COLUMNS (ORDER MATTERS — matches CSV header)
# =============================================================================
FEATURE_COLS = [
# Match identifiers
"match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
# ELO Features (8)
"home_overall_elo", "away_overall_elo", "elo_diff",
"home_home_elo", "away_away_elo",
"home_form_elo", "away_form_elo", "form_elo_diff",
# Form Features (12)
"home_goals_avg", "home_conceded_avg",
"away_goals_avg", "away_conceded_avg",
"home_clean_sheet_rate", "away_clean_sheet_rate",
"home_scoring_rate", "away_scoring_rate",
"home_winning_streak", "away_winning_streak",
"home_unbeaten_streak", "away_unbeaten_streak",
# H2H Features (6)
"h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
"h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
# Team Stats Features (8)
"home_avg_possession", "away_avg_possession",
"home_avg_shots_on_target", "away_avg_shots_on_target",
"home_shot_conversion", "away_shot_conversion",
"home_avg_corners", "away_avg_corners",
# Odds Features (24)
"odds_ms_h", "odds_ms_d", "odds_ms_a",
"implied_home", "implied_draw", "implied_away",
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
"odds_ou05_o", "odds_ou05_u",
"odds_ou15_o", "odds_ou15_u",
"odds_ou25_o", "odds_ou25_u",
"odds_ou35_o", "odds_ou35_u",
"odds_ht_ou05_o", "odds_ht_ou05_u",
"odds_ht_ou15_o", "odds_ht_ou15_u",
"odds_btts_y", "odds_btts_n",
"odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
"odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
"odds_ou05_o_present", "odds_ou05_u_present",
"odds_ou15_o_present", "odds_ou15_u_present",
"odds_ou25_o_present", "odds_ou25_u_present",
"odds_ou35_o_present", "odds_ou35_u_present",
"odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
"odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
"odds_btts_y_present", "odds_btts_n_present",
# Defensive/League Features (4)
"home_xga", "away_xga",
"league_avg_goals", "league_zero_goal_rate",
# Upset Engine (4)
"upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential",
# Referee Engine (5)
"referee_home_bias", "referee_avg_goals", "referee_cards_total",
"referee_avg_yellow", "referee_experience",
# Momentum Engine (3)
"home_momentum_score", "away_momentum_score", "momentum_diff",
# Squad Features (9)
"home_squad_quality", "away_squad_quality", "squad_diff",
"home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form",
# Player-Level Features (12)
"home_lineup_goals_per90", "away_lineup_goals_per90",
"home_lineup_assists_per90", "away_lineup_assists_per90",
"home_squad_continuity", "away_squad_continuity",
"home_top_scorer_form", "away_top_scorer_form",
"home_avg_player_exp", "away_avg_player_exp",
"home_goals_diversity", "away_goals_diversity",
# V27 H2H Expanded (4)
"h2h_home_goals_avg", "h2h_away_goals_avg",
"h2h_recent_trend", "h2h_venue_advantage",
# V27 Rolling Stats (13)
"home_rolling5_goals", "home_rolling5_conceded",
"home_rolling10_goals", "home_rolling10_conceded",
"home_rolling20_goals", "home_rolling20_conceded",
"away_rolling5_goals", "away_rolling5_conceded",
"away_rolling10_goals", "away_rolling10_conceded",
"home_rolling5_cs", "away_rolling5_cs",
# V27 Venue Stats (4)
"home_venue_goals", "home_venue_conceded",
"away_venue_goals", "away_venue_conceded",
# V27 Goal Trend (2)
"home_goal_trend", "away_goal_trend",
# V27 Calendar (5)
"home_days_rest", "away_days_rest",
"match_month", "is_season_start", "is_season_end",
# V27 Interaction (6)
"attack_vs_defense_home", "attack_vs_defense_away",
"xg_diff", "form_momentum_interaction",
"elo_form_consistency", "upset_x_elo_gap",
# V27 League Expanded (5)
"league_home_win_rate", "league_draw_rate",
"league_btts_rate", "league_ou25_rate",
"league_reliability_score",
# Labels
"score_home", "score_away", "total_goals",
"ht_score_home", "ht_score_away", "ht_total_goals",
"label_ms", # 0=Home, 1=Draw, 2=Away
"label_ou05", # 0=Under, 1=Over
"label_ou15",
"label_ou25",
"label_ou35",
"label_btts", # 0=No, 1=Yes
"label_ht_result", # 0=Home, 1=Draw, 2=Away
"label_ht_ou05", # 0=Under, 1=Over
"label_ht_ou15",
"label_ht_ft", # 0=1/1, 1=1/X, 2=1/2 ... 8=2/2
"label_odd_even", # 1=Odd, 0=Even
"label_yellow_cards",# Count of cards (yellow=1, red=2)
"label_cards_ou45", # 0=Under 4.5, 1=Over 4.5
"label_handicap_ms", # Handikap (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
]
# =============================================================================
# BATCH LOADERS — Pre-load data to avoid N+1 queries
# =============================================================================
class BatchDataLoader:
"""Pre-loads all necessary data in bulk, then serves features per match."""
def __init__(self, conn, top_league_ids: list):
self.conn = conn
self.cur = conn.cursor()
self.top_league_ids = top_league_ids
# Pre-loaded data caches
self.matches = []
self.odds_cache = {} # match_id → {ms_h, ms_d, ms_a, ...}
self.team_stats_cache = {} # (team_id, before_date_bucket) → stats
self.form_cache = {} # (team_id, match_id) → form features
self.h2h_cache = {} # (home_id, away_id, match_id) → h2h features
self.league_stats_cache = {} # league_id → {avg_goals, zero_rate}
self.squad_cache = {} # (match_id, team_id) → {starting, goals, assists, key_players, ...}
self.cards_cache = {} # match_id → total cards scored
def load_all(self):
"""Load all data in batch."""
t0 = time.time()
self._load_matches()
print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True)
t1 = time.time()
self._load_odds()
print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True)
t2 = time.time()
self._load_league_stats()
print(f" ✅ League stats: {len(self.league_stats_cache)} leagues ({time.time()-t2:.1f}s)", flush=True)
t3 = time.time()
self._load_team_history()
print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True)
t4 = time.time()
self._load_squad_data()
print(f" ✅ Squad data: {len(self.squad_cache)} team-matches ({time.time()-t4:.1f}s)", flush=True)
t5 = time.time()
self._load_cards_data()
print(f" ✅ Cards data: {len(self.cards_cache)} matches ({time.time()-t5:.1f}s)", flush=True)
print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True)
def _load_matches(self):
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT m.id, m.home_team_id, m.away_team_id,
m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away,
m.mst_utc, m.league_id,
ht.name as home_name,
at.name as away_name,
l.name as league_name
FROM matches m
JOIN teams ht ON m.home_team_id = ht.id
JOIN teams at ON m.away_team_id = at.id
JOIN leagues l ON m.league_id = l.id
WHERE m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.sport = 'football'
AND m.league_id IN ({ph})
ORDER BY m.mst_utc ASC
""", self.top_league_ids)
self.matches = self.cur.fetchall()
def _load_odds(self):
"""Bulk load all odds for top league matches."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT oc.match_id, oc.name, os.name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
JOIN matches m ON oc.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
""", self.top_league_ids)
for match_id, cat_name, sel_name, odd_val in self.cur.fetchall():
try:
v = float(odd_val) if odd_val else 0
if v <= 0 or not cat_name or not sel_name:
continue
if match_id not in self.odds_cache:
self.odds_cache[match_id] = {}
cat_lower = cat_name.lower().strip()
sel_lower = sel_name.lower().strip()
# Match Result (1X2)
if cat_lower == 'maç sonucu':
if sel_name == '1': self.odds_cache[match_id]['ms_h'] = v
elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ms_d'] = v
elif sel_name == '2': self.odds_cache[match_id]['ms_a'] = v
# HT Result
elif cat_lower == '1. yarı sonucu':
if sel_name == '1': self.odds_cache[match_id]['ht_ms_h'] = v
elif sel_name in ('0', 'X'): self.odds_cache[match_id]['ht_ms_d'] = v
elif sel_name == '2': self.odds_cache[match_id]['ht_ms_a'] = v
# BTTS
elif cat_lower == 'karşılıklı gol':
if 'var' in sel_lower: self.odds_cache[match_id]['btts_y'] = v
elif 'yok' in sel_lower: self.odds_cache[match_id]['btts_n'] = v
# Over/Under FT
elif cat_lower == '0,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou05_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou05_o'] = v
elif cat_lower == '1,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou15_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou15_o'] = v
elif cat_lower == '2,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou25_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou25_o'] = v
elif cat_lower == '3,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ou35_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ou35_o'] = v
# Over/Under HT
elif cat_lower == '1. yarı 0,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou05_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou05_o'] = v
elif cat_lower == '1. yarı 1,5 alt/üst':
if 'alt' in sel_lower: self.odds_cache[match_id]['ht_ou15_u'] = v
elif 'üst' in sel_lower: self.odds_cache[match_id]['ht_ou15_o'] = v
except (ValueError, TypeError):
pass
def _load_league_stats(self):
"""Calculate league-level aggregated stats."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT league_id,
AVG(score_home + score_away) as avg_goals,
AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate,
AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END) as home_win_rate,
AVG(CASE WHEN score_home = score_away THEN 1.0 ELSE 0.0 END) as draw_rate,
AVG(CASE WHEN score_home > 0 AND score_away > 0 THEN 1.0 ELSE 0.0 END) as btts_rate,
AVG(CASE WHEN score_home + score_away > 2.5 THEN 1.0 ELSE 0.0 END) as ou25_rate,
COUNT(*) as match_count
FROM matches
WHERE status = 'FT'
AND score_home IS NOT NULL
AND sport = 'football'
AND league_id IN ({ph})
GROUP BY league_id
""", self.top_league_ids)
for row in self.cur.fetchall():
league_id, avg_goals, zero_rate, home_win_rate, draw_rate, btts_rate, ou25_rate, cnt = row
self.league_stats_cache[league_id] = {
"avg_goals": float(avg_goals) if avg_goals else 2.5,
"zero_rate": float(zero_rate) if zero_rate else 0.07,
"home_win_rate": float(home_win_rate) if home_win_rate else 0.45,
"draw_rate": float(draw_rate) if draw_rate else 0.25,
"btts_rate": float(btts_rate) if btts_rate else 0.50,
"ou25_rate": float(ou25_rate) if ou25_rate else 0.50,
"match_count": cnt,
}
def _load_team_history(self):
"""Bulk load all matches and team stats for lightning fast memory lookups"""
from collections import defaultdict
self.team_matches = defaultdict(list)
self.team_stats = defaultdict(list)
# Load all matches for form/h2h
self.cur.execute("""
SELECT home_team_id, away_team_id, score_home, score_away, mst_utc
FROM matches
WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football'
ORDER BY mst_utc ASC
""")
for hid, aid, sh, sa, mst in self.cur.fetchall():
# (mst, is_home, team_score, opp_score, opp_id)
self.team_matches[hid].append((mst, True, sh, sa, aid))
self.team_matches[aid].append((mst, False, sa, sh, hid))
# Load all football_team_stats (sport-partitioned schema)
self.cur.execute("""
SELECT mts.team_id, m.mst_utc, mts.possession_percentage,
mts.shots_on_target, mts.total_shots, mts.corners,
m.score_home, m.score_away, m.home_team_id
FROM football_team_stats mts
JOIN matches m ON mts.match_id = m.id
WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL
ORDER BY m.mst_utc ASC
""")
for tid, mst, poss, sot, tshots, corn, sh, sa, hid in self.cur.fetchall():
team_goals = sh if hid == tid else sa
self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))
def _load_squad_data(self):
"""Bulk load squad participation + player events + player career for squad features."""
ph = ",".join(["%s"] * len(self.top_league_ids))
# 1) Participation: starting XI count + position distribution per (match, team)
self.cur.execute(f"""
SELECT mpp.match_id, mpp.team_id,
COUNT(*) FILTER (WHERE mpp.is_starting = true) AS starting_count,
COUNT(*) AS total_squad,
COUNT(*) FILTER (WHERE mpp.is_starting = true AND LOWER(COALESCE(mpp.position::TEXT,'')) ~ '(forward|fwd|forvet|striker)') AS fwd_count
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
GROUP BY mpp.match_id, mpp.team_id
""", self.top_league_ids)
participation = {}
for mid, tid, st_count, tot, fwd in self.cur.fetchall():
participation[(mid, tid)] = {
'starting_count': st_count or 0,
'total_squad': tot or 0,
'fwd_count': fwd or 0,
}
# 2) Player events: goals + assists per (match, team)
self.cur.execute(f"""
SELECT mpe.match_id, mpe.team_id,
COUNT(*) FILTER (
WHERE mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
) AS goal_count,
COUNT(DISTINCT mpe.assist_player_id) FILTER (
WHERE mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
) AS assist_count,
COUNT(DISTINCT mpe.player_id) FILTER (
WHERE mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
) AS unique_scorers
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
GROUP BY mpe.match_id, mpe.team_id
""", self.top_league_ids)
events = {}
for mid, tid, goals, assists, scorers in self.cur.fetchall():
events[(mid, tid)] = {
'goals': goals or 0,
'assists': assists or 0,
'unique_scorers': scorers or 0,
}
# 3) Key players: players with 3+ goals across all their matches, per team
self.cur.execute(f"""
SELECT mpe.team_id, mpe.player_id, COUNT(*) AS total_goals
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
AND mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
GROUP BY mpe.team_id, mpe.player_id
HAVING COUNT(*) >= 3
""", self.top_league_ids)
key_players_by_team = defaultdict(set)
for tid, pid, _ in self.cur.fetchall():
key_players_by_team[tid].add(pid)
# 4) Starting key players per (match, team)
self.cur.execute(f"""
SELECT mpp.match_id, mpp.team_id, mpp.player_id
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE mpp.is_starting = true
AND m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
""", self.top_league_ids)
starting_players = defaultdict(list)
for mid, tid, pid in self.cur.fetchall():
starting_players[(mid, tid)].append(pid)
# 5) Build match_id → mst_utc mapping for temporal filtering
match_mst = {}
for m in self.matches:
match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc
# ─── NEW: Player Career Stats (prefix-sum for O(1) temporal lookup) ───
# 6a) Goals per player per match date
self.cur.execute(f"""
SELECT mpe.player_id, m.mst_utc,
SUM(CASE WHEN mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
THEN 1 ELSE 0 END) AS goals
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
GROUP BY mpe.player_id, m.mst_utc
""", self.top_league_ids)
player_goals_raw = defaultdict(dict)
for pid, mst, goals in self.cur.fetchall():
player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0)
# 6b) Assists per player per match date
self.cur.execute(f"""
SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
GROUP BY mpe.assist_player_id, m.mst_utc
""", self.top_league_ids)
player_assists_raw = defaultdict(dict)
for pid, mst, assists in self.cur.fetchall():
player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0)
# 6c) Player participation dates (starts only)
self.cur.execute(f"""
SELECT mpp.player_id, m.mst_utc
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE mpp.is_starting = true
AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
ORDER BY mpp.player_id, m.mst_utc
""", self.top_league_ids)
player_starts_raw = defaultdict(list)
for pid, mst in self.cur.fetchall():
player_starts_raw[pid].append(mst)
# 6d) Build prefix sums per player (goals_prefix[i] = total goals up to start i)
player_career = {}
all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys())
for pid in all_pids:
starts = sorted(set(player_starts_raw.get(pid, [])))
if not starts:
continue
g_map = player_goals_raw.get(pid, {})
a_map = player_assists_raw.get(pid, {})
cum_g, cum_a = 0, 0
goals_pf, assists_pf = [], []
for mst in starts:
cum_g += g_map.get(mst, 0)
cum_a += a_map.get(mst, 0)
goals_pf.append(cum_g)
assists_pf.append(cum_a)
player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf}
# Free raw dicts
del player_goals_raw, player_assists_raw, player_starts_raw
print(f" 📊 Player careers built: {len(player_career)} players", flush=True)
# ─── NEW: Team Lineup History (for squad continuity) ───
# 7) Per-team sorted lineups: [(mst, frozenset(player_ids))]
team_lineup_map = defaultdict(list)
for (mid, tid), pids in starting_players.items():
mst = match_mst.get(mid, 0)
if mst > 0 and pids:
team_lineup_map[tid].append((mst, frozenset(pids)))
team_lineup_history = {}
team_lineup_msts = {}
for tid, ll in team_lineup_map.items():
ll.sort(key=lambda x: x[0])
team_lineup_history[tid] = ll
team_lineup_msts[tid] = [x[0] for x in ll]
del team_lineup_map
# ─── 8) Build combined cache — NO DATA LEAKAGE ───
all_keys = set(participation.keys()) | set(events.keys())
for key in all_keys:
mid, tid = key
part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0})
# Count key players in starting XI
starters = starting_players.get(key, [])
kp_in_starting = sum(1 for p in starters if p in key_players_by_team.get(tid, set()))
kp_total = len(key_players_by_team.get(tid, set()))
kp_missing = max(0, kp_total - kp_in_starting)
# Squad quality: composite score — ONLY pre-match info
squad_quality = (
part['starting_count'] * 0.3 +
kp_in_starting * 3.0 +
part['fwd_count'] * 1.5
)
missing_impact = min(kp_missing / max(kp_total, 1), 1.0)
# goals_form: avg goals from last 5 matches BEFORE this match
current_mst = match_mst.get(mid, 0)
team_history = self.team_matches.get(tid, [])
recent_goals = [
tm[2] for tm in team_history if tm[0] < current_mst
][-5:]
goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3
# ─── NEW: Player-level aggregation for starting XI ───
lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0
best_scorer_total, best_scorer_id = 0, None
scorers_in_lineup = 0
for pid in starters:
pc = player_career.get(pid)
if not pc:
continue
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx == 0:
continue # no prior matches for this player
prior_starts = idx
prior_goals = pc['gp'][idx - 1]
prior_assists = pc['ap'][idx - 1]
lineup_g90 += prior_goals / prior_starts
lineup_a90 += prior_assists / prior_starts
total_exp += prior_starts
if prior_goals > 0:
scorers_in_lineup += 1
if prior_goals > best_scorer_total:
best_scorer_total = prior_goals
best_scorer_id = pid
n_st = len(starters) or 1
# Top scorer recent form (goals in last 5 starts)
top_scorer_form = 0
if best_scorer_id:
pc = player_career.get(best_scorer_id)
if pc:
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx > 0:
s5 = max(0, idx - 5)
top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0)
# Squad continuity (overlap with previous match lineup)
squad_continuity = 0.5
msts_list = team_lineup_msts.get(tid)
if msts_list:
li = bisect.bisect_left(msts_list, current_mst)
if li > 0:
prev_lineup = team_lineup_history[tid][li - 1][1]
squad_continuity = len(frozenset(starters) & prev_lineup) / n_st
self.squad_cache[key] = {
'squad_quality': squad_quality,
'key_players': kp_in_starting,
'missing_impact': missing_impact,
'goals_form': round(goals_form, 2),
'lineup_goals_per90': round(lineup_g90, 3),
'lineup_assists_per90': round(lineup_a90, 3),
'squad_continuity': round(squad_continuity, 3),
'top_scorer_form': top_scorer_form,
'avg_player_exp': round(total_exp / n_st, 1),
'goals_diversity': round(scorers_in_lineup / n_st, 3),
}
def _load_cards_data(self):
"""Bulk load all distinct cards per match (yellow=1, red=2)."""
ph = ",".join(["%s"] * len(self.top_league_ids))
self.cur.execute(f"""
SELECT mpe.match_id,
SUM(CASE
WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2
ELSE 1 END) as cards_weight
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.league_id IN ({ph})
AND mpe.event_type::text LIKE '%%card%%'
GROUP BY mpe.match_id
""", self.top_league_ids)
for mid, cards_weight in self.cur.fetchall():
self.cards_cache[mid] = float(cards_weight) if cards_weight else 0.0
class FeatureExtractor:
"""Extract features for a single match using pre-loaded data + on-demand queries."""
def __init__(self, conn, loader: BatchDataLoader):
self.conn = conn
self.cur = conn.cursor()
self.loader = loader
# ELO cache: team_id → {overall, home, away, form}
self.elo_ratings = defaultdict(lambda: {
"overall": 1500.0, "home": 1500.0,
"away": 1500.0, "form": 1500.0,
"matches": 0
})
self._elo_initialized = False
self.upset_engine = get_upset_engine()
self.referee_engine = get_referee_engine()
self.momentum_engine = get_momentum_engine()
# ── Data Quality Thresholds ──
# Matches below these thresholds produce default-only features that
# teach the model noise rather than signal.
DQ_MIN_FORM_MATCHES = 3 # team must have ≥3 prior matches
DQ_MIN_FEATURE_COVERAGE = 0.30 # ≥30% of key features must be non-default
def extract_all(self) -> list:
"""Extract features for all matches with data quality validation."""
matches = self.loader.matches
total = len(matches)
rows = []
skipped = 0
dq_rejected = 0
dq_reasons: dict = defaultdict(int)
t_start = time.time()
print(f"\n🔄 Extracting features for {total} matches...", flush=True)
_last_print = t_start
_PRINT_INTERVAL = 60 # her dakika bir ilerleme
# Process chronologically — ELO grows as we go
for i, m in enumerate(matches):
(
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
) = m
now = time.time()
if now - _last_print >= _PRINT_INTERVAL and i > 0:
elapsed = now - t_start
rate = i / elapsed
remaining = (total - i) / rate if rate > 0 else 0
pct = i / total * 100
eta_h = int(remaining // 3600)
eta_m = int((remaining % 3600) // 60)
eta_s = int(remaining % 60)
eta_str = (f"{eta_h}s {eta_m}dk" if eta_h else f"{eta_m}dk {eta_s}s")
print(
f" ⏱ [{i:>6}/{total}] %{pct:>4.1f} | "
f"{rate:.1f} maç/s | "
f"bitti: {len(rows):,} | "
f"atlanan: {skipped+dq_rejected} | "
f"ETA: {eta_str}",
flush=True,
)
_last_print = now
row = self._extract_one(
mid, hid, aid, sh, sa, hth, hta, mst, lid,
home_name, away_name, league_name,
)
if row:
# ── Data Quality Gate ──
dq_pass, reason = self._validate_row_quality(row, hid, aid, mst)
if dq_pass:
rows.append(row)
else:
dq_rejected += 1
dq_reasons[reason] += 1
else:
skipped += 1
# Update ELO after processing (so ELO is calculated BEFORE the match)
self._update_elo(hid, aid, sh, sa)
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True)
if dq_reasons:
print(f" 📊 DQ Rejection reasons:")
for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]):
print(f" {reason}: {count}")
return rows
def _extract_one(
self,
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
):
"""Extract features for a single match."""
# === LABELS ===
total_goals = sh + sa
ht_total = (hth + hta) if hth is not None and hta is not None else None
if sh > sa:
label_ms = 0
elif sh < sa:
label_ms = 2
else:
label_ms = 1
label_ou05 = 1 if total_goals > 0.5 else 0
label_ou15 = 1 if total_goals > 1.5 else 0
label_ou25 = 1 if total_goals > 2.5 else 0
label_ou35 = 1 if total_goals > 3.5 else 0
label_btts = 1 if (sh > 0 and sa > 0) else 0
label_odd_even = 1 if total_goals % 2 != 0 else 0
# Handicap MS Label (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
score_diff = sh - sa
if score_diff >= 2:
label_handicap_ms = 0
elif score_diff == 1:
label_handicap_ms = 1
else:
label_handicap_ms = 2
# Cards Label
cards_count = self.loader.cards_cache.get(mid, 0.0)
label_yellow_cards = cards_count
label_cards_ou45 = 1 if cards_count > 4.5 else 0
# HT labels
label_ht_result = None
label_ht_ou05 = None
label_ht_ou15 = None
if hth is not None and hta is not None:
if hth > hta:
label_ht_result = 0
elif hth < hta:
label_ht_result = 2
else:
label_ht_result = 1
label_ht_ou05 = 1 if ht_total > 0.5 else 0
label_ht_ou15 = 1 if ht_total > 1.5 else 0
# HT/FT Label (0-8)
# 0: 1/1, 1: 1/X, 2: 1/2
# 3: X/1, 4: X/X, 5: X/2
# 6: 2/1, 7: 2/X, 8: 2/2
label_ht_ft = None
if label_ht_result is not None:
label_ht_ft = label_ht_result * 3 + label_ms
# === ELO FEATURES ===
h_elo = self.elo_ratings[hid]
a_elo = self.elo_ratings[aid]
elo_features = {
"home_overall_elo": h_elo["overall"],
"away_overall_elo": a_elo["overall"],
"elo_diff": h_elo["overall"] - a_elo["overall"],
"home_home_elo": h_elo["home"],
"away_away_elo": a_elo["away"],
"home_form_elo": h_elo["form"],
"away_form_elo": a_elo["form"],
"form_elo_diff": h_elo["form"] - a_elo["form"],
}
# === FORM FEATURES ===
form_features = self._get_form_features(hid, aid, mst)
# === H2H FEATURES ===
h2h_features = self._get_h2h_features(hid, aid, mst)
# === TEAM STATS FEATURES ===
stats_features = self._get_team_stats_features(hid, aid, mst)
# === ODDS FEATURES ===
odds = self.loader.odds_cache.get(mid, {})
ms_h = odds.get("ms_h", 0.0)
ms_d = odds.get("ms_d", 0.0)
ms_a = odds.get("ms_a", 0.0)
# Implied probabilities (normalized vig-free)
if ms_h > 0 and ms_d > 0 and ms_a > 0:
raw_sum = 1/ms_h + 1/ms_d + 1/ms_a
implied_home = (1/ms_h) / raw_sum
implied_draw = (1/ms_d) / raw_sum
implied_away = (1/ms_a) / raw_sum
else:
implied_home = implied_draw = implied_away = 0.33
odds_features = {
"odds_ms_h": ms_h,
"odds_ms_d": ms_d,
"odds_ms_a": ms_a,
"implied_home": implied_home,
"implied_draw": implied_draw,
"implied_away": implied_away,
"odds_ht_ms_h": odds.get("ht_ms_h", 0.0),
"odds_ht_ms_d": odds.get("ht_ms_d", 0.0),
"odds_ht_ms_a": odds.get("ht_ms_a", 0.0),
"odds_ou05_o": odds.get("ou05_o", 0.0),
"odds_ou05_u": odds.get("ou05_u", 0.0),
"odds_ou15_o": odds.get("ou15_o", 0.0),
"odds_ou15_u": odds.get("ou15_u", 0.0),
"odds_ou25_o": odds.get("ou25_o", 0.0),
"odds_ou25_u": odds.get("ou25_u", 0.0),
"odds_ou35_o": odds.get("ou35_o", 0.0),
"odds_ou35_u": odds.get("ou35_u", 0.0),
"odds_ht_ou05_o": odds.get("ht_ou05_o", 0.0),
"odds_ht_ou05_u": odds.get("ht_ou05_u", 0.0),
"odds_ht_ou15_o": odds.get("ht_ou15_o", 0.0),
"odds_ht_ou15_u": odds.get("ht_ou15_u", 0.0),
"odds_btts_y": odds.get("btts_y", 0.0),
"odds_btts_n": odds.get("btts_n", 0.0),
"odds_ms_h_present": 1.0 if ms_h > 1.01 else 0.0,
"odds_ms_d_present": 1.0 if ms_d > 1.01 else 0.0,
"odds_ms_a_present": 1.0 if ms_a > 1.01 else 0.0,
"odds_ht_ms_h_present": 1.0 if odds.get("ht_ms_h", 0.0) > 1.01 else 0.0,
"odds_ht_ms_d_present": 1.0 if odds.get("ht_ms_d", 0.0) > 1.01 else 0.0,
"odds_ht_ms_a_present": 1.0 if odds.get("ht_ms_a", 0.0) > 1.01 else 0.0,
"odds_ou05_o_present": 1.0 if odds.get("ou05_o", 0.0) > 1.01 else 0.0,
"odds_ou05_u_present": 1.0 if odds.get("ou05_u", 0.0) > 1.01 else 0.0,
"odds_ou15_o_present": 1.0 if odds.get("ou15_o", 0.0) > 1.01 else 0.0,
"odds_ou15_u_present": 1.0 if odds.get("ou15_u", 0.0) > 1.01 else 0.0,
"odds_ou25_o_present": 1.0 if odds.get("ou25_o", 0.0) > 1.01 else 0.0,
"odds_ou25_u_present": 1.0 if odds.get("ou25_u", 0.0) > 1.01 else 0.0,
"odds_ou35_o_present": 1.0 if odds.get("ou35_o", 0.0) > 1.01 else 0.0,
"odds_ou35_u_present": 1.0 if odds.get("ou35_u", 0.0) > 1.01 else 0.0,
"odds_ht_ou05_o_present": 1.0 if odds.get("ht_ou05_o", 0.0) > 1.01 else 0.0,
"odds_ht_ou05_u_present": 1.0 if odds.get("ht_ou05_u", 0.0) > 1.01 else 0.0,
"odds_ht_ou15_o_present": 1.0 if odds.get("ht_ou15_o", 0.0) > 1.01 else 0.0,
"odds_ht_ou15_u_present": 1.0 if odds.get("ht_ou15_u", 0.0) > 1.01 else 0.0,
"odds_btts_y_present": 1.0 if odds.get("btts_y", 0.0) > 1.01 else 0.0,
"odds_btts_n_present": 1.0 if odds.get("btts_n", 0.0) > 1.01 else 0.0,
}
# === LEAGUE FEATURES ===
league = self.loader.league_stats_cache.get(lid, {
"avg_goals": 2.5, "zero_rate": 0.07, "home_win_rate": 0.45,
"draw_rate": 0.25, "btts_rate": 0.50, "ou25_rate": 0.50, "match_count": 0,
})
league_features = {
"league_avg_goals": league["avg_goals"],
"league_zero_goal_rate": league["zero_rate"],
}
# === UPSET FEATURES ===
try:
upset_feats = self.upset_engine.get_features(
home_team_name=home_name or "",
home_team_id=hid,
away_team_name=away_name or "",
league_name=league_name or "",
home_position=10,
away_position=10,
match_date_ms=mst,
)
except Exception:
upset_feats = {
"upset_atmosphere": 0.0,
"upset_motivation": 0.0,
"upset_fatigue": 0.0,
"upset_potential": 0.0,
}
# === REFEREE FEATURES ===
try:
referee_feats = self.referee_engine.get_features(
match_id=mid,
league_id=str(lid),
)
except Exception:
referee_feats = {
"referee_home_bias": 0.0,
"referee_avg_goals": 2.7,
"referee_cards_total": 4.0,
"referee_avg_yellow": 3.5,
"referee_avg_red": 0.1,
"referee_experience": 0.5,
}
# === MOMENTUM FEATURES ===
try:
momentum_feats = self.momentum_engine.get_features(
home_team_id=hid,
away_team_id=aid,
match_date_ms=mst,
)
home_momentum_score = momentum_feats.get("home_momentum_score", 0.5)
away_momentum_score = momentum_feats.get("away_momentum_score", 0.5)
momentum_diff = momentum_feats.get(
"momentum_diff", home_momentum_score - away_momentum_score
)
except Exception:
home_momentum_score = 0.5
away_momentum_score = 0.5
momentum_diff = 0.0
# === SQUAD FEATURES ===
home_sq = self.loader.squad_cache.get((mid, hid), {})
away_sq = self.loader.squad_cache.get((mid, aid), {})
home_squad_quality = home_sq.get('squad_quality', 0.0)
away_squad_quality = away_sq.get('squad_quality', 0.0)
squad_diff = home_squad_quality - away_squad_quality
home_key_players = home_sq.get('key_players', 0)
away_key_players = away_sq.get('key_players', 0)
home_missing_impact = home_sq.get('missing_impact', 0.0)
away_missing_impact = away_sq.get('missing_impact', 0.0)
home_goals_form = home_sq.get('goals_form', 0)
away_goals_form = away_sq.get('goals_form', 0)
# === V27 ROLLING / VENUE / CALENDAR FEATURES ===
v27 = self._compute_v27_features(hid, aid, mst, elo_features, form_features,
home_momentum_score, away_momentum_score,
upset_feats, h2h_features, league)
# === ASSEMBLE ROW ===
row = {
"match_id": mid,
"home_team_id": hid,
"away_team_id": aid,
"league_id": lid,
"mst_utc": mst,
**elo_features,
**form_features,
**h2h_features,
**stats_features,
**odds_features,
"home_xga": form_features["home_conceded_avg"],
"away_xga": form_features["away_conceded_avg"],
**league_features,
"upset_atmosphere": upset_feats.get("upset_atmosphere", 0.0),
"upset_motivation": upset_feats.get("upset_motivation", 0.0),
"upset_fatigue": upset_feats.get("upset_fatigue", 0.0),
"upset_potential": upset_feats.get("upset_potential", 0.0),
"referee_home_bias": referee_feats.get("referee_home_bias", 0.0),
"referee_avg_goals": referee_feats.get("referee_avg_goals", 2.7),
"referee_cards_total": referee_feats.get("referee_cards_total", 4.0),
"referee_avg_yellow": referee_feats.get("referee_avg_yellow", 3.5),
"referee_experience": referee_feats.get("referee_experience", 0.5),
"home_momentum_score": home_momentum_score,
"away_momentum_score": away_momentum_score,
"momentum_diff": momentum_diff,
# Squad Features
"home_squad_quality": home_squad_quality,
"away_squad_quality": away_squad_quality,
"squad_diff": squad_diff,
"home_key_players": home_key_players,
"away_key_players": away_key_players,
"home_missing_impact": home_missing_impact,
"away_missing_impact": away_missing_impact,
"home_goals_form": home_goals_form,
"away_goals_form": away_goals_form,
# Player-Level Features
"home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0),
"away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0),
"home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0),
"away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0),
"home_squad_continuity": home_sq.get('squad_continuity', 0.5),
"away_squad_continuity": away_sq.get('squad_continuity', 0.5),
"home_top_scorer_form": home_sq.get('top_scorer_form', 0),
"away_top_scorer_form": away_sq.get('top_scorer_form', 0),
"home_avg_player_exp": home_sq.get('avg_player_exp', 0.0),
"away_avg_player_exp": away_sq.get('avg_player_exp', 0.0),
"home_goals_diversity": home_sq.get('goals_diversity', 0.0),
"away_goals_diversity": away_sq.get('goals_diversity', 0.0),
# V27 Features
**v27,
# Labels
"score_home": sh,
"score_away": sa,
"total_goals": total_goals,
"ht_score_home": hth if hth is not None else "",
"ht_score_away": hta if hta is not None else "",
"ht_total_goals": ht_total if ht_total is not None else "",
"label_ms": label_ms,
"label_ou05": label_ou05,
"label_ou15": label_ou15,
"label_ou25": label_ou25,
"label_ou35": label_ou35,
"label_btts": label_btts,
"label_ht_result": label_ht_result if label_ht_result is not None else "",
"label_ht_ou05": label_ht_ou05 if label_ht_ou05 is not None else "",
"label_ht_ou15": label_ht_ou15 if label_ht_ou15 is not None else "",
"label_ht_ft": label_ht_ft if label_ht_ft is not None else "",
"label_odd_even": label_odd_even,
"label_yellow_cards": label_yellow_cards,
"label_cards_ou45": label_cards_ou45,
"label_handicap_ms": label_handicap_ms,
}
return row
def _compute_v27_features(self, hid, aid, mst, elo_features, form_features,
home_momentum, away_momentum, upset_feats, h2h_features, league):
"""Compute V27 rolling, venue, calendar, interaction features from pre-loaded data."""
home_history = self.loader.team_matches.get(hid, [])
away_history = self.loader.team_matches.get(aid, [])
def _rolling(history, n):
recent = [m for m in history if m[0] < mst][-n:]
if not recent:
return 1.3, 1.1, 0.0
goals = sum(m[2] for m in recent) / len(recent)
conceded = sum(m[3] for m in recent) / len(recent)
cs = sum(1 for m in recent if m[3] == 0) / len(recent)
return round(goals, 3), round(conceded, 3), round(cs, 3)
def _venue(history, is_home):
recent = [m for m in history if m[0] < mst and m[1] == is_home][-10:]
if not recent:
return 1.3, 1.1
goals = sum(m[2] for m in recent) / len(recent)
conceded = sum(m[3] for m in recent) / len(recent)
return round(goals, 3), round(conceded, 3)
def _days_rest(history):
prior = [m[0] for m in history if m[0] < mst]
if not prior:
return 7.0
last = prior[-1]
return round(min((mst - last) / 86400000.0, 30.0), 1)
h5g, h5c, h5cs = _rolling(home_history, 5)
h10g, h10c, _ = _rolling(home_history, 10)
h20g, h20c, _ = _rolling(home_history, 20)
a5g, a5c, a5cs = _rolling(away_history, 5)
a10g, a10c, _ = _rolling(away_history, 10)
hvg, hvc = _venue(home_history, True)
avg, avc = _venue(away_history, False)
home_rest = _days_rest(home_history)
away_rest = _days_rest(away_history)
import datetime
match_dt = datetime.datetime.utcfromtimestamp(mst / 1000)
match_month = match_dt.month
elo_diff = elo_features["elo_diff"]
form_elo_diff = elo_features["form_elo_diff"]
mom_diff = home_momentum - away_momentum
home_conceded = form_features["home_conceded_avg"]
away_conceded = form_features["away_conceded_avg"]
home_goals = form_features["home_goals_avg"]
away_goals = form_features["away_goals_avg"]
upset_potential = upset_feats.get("upset_potential", 0.0)
h2h_prior = [m for m in home_history if m[0] < mst and m[4] == aid]
h2h_home_goals_avg = sum(m[2] for m in h2h_prior) / len(h2h_prior) if h2h_prior else 1.3
h2h_away_goals_avg = sum(m[3] for m in h2h_prior) / len(h2h_prior) if h2h_prior else 1.1
recent_h2h = h2h_prior[-3:]
h2h_recent_trend = sum(1 if m[2] > m[3] else -1 if m[2] < m[3] else 0 for m in recent_h2h) / max(len(recent_h2h), 1)
venue_h2h = [m for m in h2h_prior if m[1]]
h2h_venue_advantage = sum(1 if m[2] > m[3] else 0 for m in venue_h2h) / max(len(venue_h2h), 1) if venue_h2h else 0.5
league_count = league.get("match_count", 0)
return {
"h2h_home_goals_avg": round(h2h_home_goals_avg, 3),
"h2h_away_goals_avg": round(h2h_away_goals_avg, 3),
"h2h_recent_trend": round(h2h_recent_trend, 3),
"h2h_venue_advantage": round(h2h_venue_advantage, 3),
"home_rolling5_goals": h5g, "home_rolling5_conceded": h5c,
"home_rolling10_goals": h10g, "home_rolling10_conceded": h10c,
"home_rolling20_goals": h20g, "home_rolling20_conceded": h20c,
"away_rolling5_goals": a5g, "away_rolling5_conceded": a5c,
"away_rolling10_goals": a10g, "away_rolling10_conceded": a10c,
"home_rolling5_cs": h5cs, "away_rolling5_cs": a5cs,
"home_venue_goals": hvg, "home_venue_conceded": hvc,
"away_venue_goals": avg, "away_venue_conceded": avc,
"home_goal_trend": round(h5g - h10g, 3),
"away_goal_trend": round(a5g - a10g, 3),
"home_days_rest": home_rest, "away_days_rest": away_rest,
"match_month": float(match_month),
"is_season_start": 1.0 if match_month in (7, 8, 9) else 0.0,
"is_season_end": 1.0 if match_month in (5, 6) else 0.0,
"attack_vs_defense_home": round(home_goals - away_conceded, 3),
"attack_vs_defense_away": round(away_goals - home_conceded, 3),
"xg_diff": round(home_conceded - away_conceded, 3),
"form_momentum_interaction": round(mom_diff * form_elo_diff / 1000.0, 4),
"elo_form_consistency": round(1.0 - abs(elo_diff - form_elo_diff) / max(abs(elo_diff), 100.0), 4),
"upset_x_elo_gap": round(upset_potential * abs(elo_diff) / 500.0, 4),
"league_home_win_rate": league.get("home_win_rate", 0.45),
"league_draw_rate": league.get("draw_rate", 0.25),
"league_btts_rate": league.get("btts_rate", 0.50),
"league_ou25_rate": league.get("ou25_rate", 0.50),
"league_reliability_score": min(1.0, league_count / 500.0) if league_count else 0.3,
}
def _validate_row_quality(
self,
row: dict,
home_id: str,
away_id: str,
before_date: int,
) -> tuple:
"""
Data quality gate for training rows.
Ensures the feature vector has enough real signal to be useful for
training. Rejects rows where critical features are all at their
default/fallback values — these teach the model noise, not patterns.
Returns (pass: bool, reason: str | None).
"""
# 1. Minimum form history: both teams must have enough prior matches
home_history = self.loader.team_matches.get(home_id, [])
away_history = self.loader.team_matches.get(away_id, [])
home_prior = sum(1 for m in home_history if m[0] < before_date)
away_prior = sum(1 for m in away_history if m[0] < before_date)
if home_prior < self.DQ_MIN_FORM_MATCHES:
return False, 'home_insufficient_history'
if away_prior < self.DQ_MIN_FORM_MATCHES:
return False, 'away_insufficient_history'
# 2. Feature coverage check: count how many key features are non-default
key_features = [
('home_goals_avg', 1.3),
('away_goals_avg', 1.3),
('home_clean_sheet_rate', 0.25),
('away_clean_sheet_rate', 0.25),
('home_avg_possession', 0.50),
('away_avg_possession', 0.50),
('home_avg_shots_on_target', 3.5),
('away_avg_shots_on_target', 3.5),
('h2h_total_matches', 0),
('odds_ms_h', 0.0),
]
non_default = sum(
1 for feat_name, default_val in key_features
if abs(float(row.get(feat_name, default_val)) - default_val) > 0.01
)
coverage = non_default / len(key_features)
if coverage < self.DQ_MIN_FEATURE_COVERAGE:
return False, f'low_feature_coverage_{coverage:.0%}'
return True, None
# -------------------------------------------------------------------------
# ELO (simplified inline version — doesn't need DB, grows incrementally)
# -------------------------------------------------------------------------
def _update_elo(self, home_id, away_id, score_home, score_away):
"""Update ELO ratings after a match."""
h = self.elo_ratings[home_id]
a = self.elo_ratings[away_id]
HOME_ADVANTAGE = 65
K_BASE = 32
# Expected scores
exp_h = 1.0 / (1 + 10 ** ((a["overall"] - h["overall"] - HOME_ADVANTAGE) / 400))
exp_a = 1.0 - exp_h
# Actual scores
if score_home > score_away:
actual_h, actual_a = 1.0, 0.0
elif score_home < score_away:
actual_h, actual_a = 0.0, 1.0
else:
actual_h, actual_a = 0.5, 0.5
# Goal difference multiplier
gd = abs(score_home - score_away)
gd_mult = math.log(max(gd, 1) + 1) * 0.7 + 1.0
# Dynamic K
k_h = K_BASE * gd_mult * (1.3 if h["matches"] < 10 else 1.0)
k_a = K_BASE * gd_mult * (1.3 if a["matches"] < 10 else 1.0)
delta_h = k_h * (actual_h - exp_h)
delta_a = k_a * (actual_a - exp_a)
# Update all ELO variants
h["overall"] += delta_h
a["overall"] += delta_a
h["home"] += delta_h * 1.1
a["away"] += delta_a * 1.1
# Form ELO (heavier weight on recent)
form_k = K_BASE * 1.5 * gd_mult
h["form"] = h["form"] * 0.85 + (h["form"] + form_k * (actual_h - exp_h)) * 0.15
a["form"] = a["form"] * 0.85 + (a["form"] + form_k * (actual_a - exp_a)) * 0.15
h["matches"] += 1
a["matches"] += 1
# -------------------------------------------------------------------------
# FORM (last 5 matches)
# -------------------------------------------------------------------------
def _get_form_features(self, home_id, away_id, before_date) -> dict:
"""Get form features for both teams."""
h_form = self._calc_team_form(home_id, before_date)
a_form = self._calc_team_form(away_id, before_date)
return {
"home_goals_avg": h_form["goals_avg"],
"home_conceded_avg": h_form["conceded_avg"],
"away_goals_avg": a_form["goals_avg"],
"away_conceded_avg": a_form["conceded_avg"],
"home_clean_sheet_rate": h_form["clean_sheet_rate"],
"away_clean_sheet_rate": a_form["clean_sheet_rate"],
"home_scoring_rate": h_form["scoring_rate"],
"away_scoring_rate": a_form["scoring_rate"],
"home_winning_streak": h_form["winning_streak"],
"away_winning_streak": a_form["winning_streak"],
"home_unbeaten_streak": h_form["unbeaten_streak"],
"away_unbeaten_streak": a_form["unbeaten_streak"],
}
def _calc_team_form(self, team_id, before_date, limit=5) -> dict:
"""Calculate form from last N matches (weighted moving average)."""
history = self.loader.team_matches.get(team_id, [])
# Filter and get last `limit` matches before date
valid_matches = [m for m in history if m[0] < before_date]
rows = valid_matches[-limit:] if valid_matches else []
if not rows:
return {
"goals_avg": 1.3, "conceded_avg": 1.2,
"clean_sheet_rate": 0.25, "scoring_rate": 0.75,
"winning_streak": 0, "unbeaten_streak": 0,
}
w_goals, w_conceded, w_total = 0.0, 0.0, 0.0
clean_sheets, scored_matches = 0, 0
winning_streak, unbeaten_streak = 0, 0
streak_broken, unbeaten_broken = False, False
for i, (mst, is_home, team_goals, opp_goals, opp_id) in enumerate(reversed(rows)):
weight = float(limit - i)
w_goals += team_goals * weight
w_conceded += opp_goals * weight
w_total += weight
if opp_goals == 0:
clean_sheets += 1
if team_goals > 0:
scored_matches += 1
if not streak_broken:
if team_goals > opp_goals:
winning_streak += 1
else:
streak_broken = True
if not unbeaten_broken:
if team_goals >= opp_goals:
unbeaten_streak += 1
else:
unbeaten_broken = True
n = len(rows)
return {
"goals_avg": w_goals / w_total if w_total > 0 else 1.3,
"conceded_avg": w_conceded / w_total if w_total > 0 else 1.2,
"clean_sheet_rate": clean_sheets / n,
"scoring_rate": scored_matches / n,
"winning_streak": winning_streak,
"unbeaten_streak": unbeaten_streak,
}
# -------------------------------------------------------------------------
# H2H
# -------------------------------------------------------------------------
def _get_h2h_features(self, home_id, away_id, before_date) -> dict:
"""Get head-to-head features."""
h_history = self.loader.team_matches.get(home_id, [])
# Matches against away_id before date
h2h_matches = [m for m in h_history if m[4] == away_id and m[0] < before_date]
rows = h2h_matches[-20:] if h2h_matches else []
if not rows:
return {
"h2h_total_matches": 0,
"h2h_home_win_rate": 0.33,
"h2h_draw_rate": 0.33,
"h2h_avg_goals": 2.5,
"h2h_btts_rate": 0.5,
"h2h_over25_rate": 0.5,
}
home_wins, draws = 0, 0
total_goals_sum = 0
btts_count, over25_count = 0, 0
n = len(rows)
for mst, is_our_home, team_goals, opp_goals, opp_id in rows:
sh = team_goals if is_our_home else opp_goals
sa = opp_goals if is_our_home else team_goals
total = sh + sa
total_goals_sum += total
if sh > 0 and sa > 0:
btts_count += 1
if total > 2.5:
over25_count += 1
# Determine result relative to our home team
if is_our_home:
if sh > sa: home_wins += 1
elif sh == sa: draws += 1
else:
if sa > sh: home_wins += 1
elif sh == sa: draws += 1
return {
"h2h_total_matches": n,
"h2h_home_win_rate": home_wins / n,
"h2h_draw_rate": draws / n,
"h2h_avg_goals": total_goals_sum / n,
"h2h_btts_rate": btts_count / n,
"h2h_over25_rate": over25_count / n,
}
# -------------------------------------------------------------------------
# TEAM STATS (possession, shots, corners)
# -------------------------------------------------------------------------
def _get_team_stats_features(self, home_id, away_id, before_date) -> dict:
"""Get team-level match stats features."""
h_stats = self._calc_team_stats(home_id, before_date)
a_stats = self._calc_team_stats(away_id, before_date)
return {
"home_avg_possession": h_stats["possession"],
"away_avg_possession": a_stats["possession"],
"home_avg_shots_on_target": h_stats["shots_on_target"],
"away_avg_shots_on_target": a_stats["shots_on_target"],
"home_shot_conversion": h_stats["shot_conversion"],
"away_shot_conversion": a_stats["shot_conversion"],
"home_avg_corners": h_stats["corners"],
"away_avg_corners": a_stats["corners"],
}
def _calc_team_stats(self, team_id, before_date, limit=10) -> dict:
"""Calculate team stats from match_team_stats table."""
defaults = {
"possession": 0.50, "shots_on_target": 3.5,
"shot_conversion": 0.10, "corners": 4.5
}
stats_history = self.loader.team_stats.get(team_id, [])
valid_stats = [s for s in stats_history if s[0] < before_date]
rows = valid_stats[-limit:] if valid_stats else []
if not rows:
return defaults
poss_sum, sot_sum, shots_sum, corners_sum = 0.0, 0.0, 0.0, 0.0
goals_scored = 0
poss_count = 0
n = len(rows)
for mst, poss, sot, total_shots, corners, team_goals in rows:
if poss and poss > 0:
poss_sum += float(poss)
poss_count += 1
sot_sum += float(sot or 0)
shots_sum += float(total_shots or 0)
corners_sum += float(corners or 0)
goals_scored += float(team_goals or 0)
return {
"possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50,
"shots_on_target": sot_sum / n,
"shot_conversion": goals_scored / shots_sum if shots_sum > 0 else 0.10,
"corners": corners_sum / n,
}
# =============================================================================
# MAIN
# =============================================================================
def main():
print("🚀 XGBoost Training Data Extraction")
print("=" * 60)
# Load top leagues
with open(TOP_LEAGUES_PATH) as f:
top_leagues = json.load(f)
print(f"📋 {len(top_leagues)} top leagues")
# Connect
conn = get_conn()
# Batch load
print("\n📦 Loading batch data...")
loader = BatchDataLoader(conn, top_leagues)
loader.load_all()
# Extract features
extractor = FeatureExtractor(conn, loader)
rows = extractor.extract_all()
if not rows:
print("❌ No data extracted!")
return
# Write CSV
print(f"\n💾 Writing {len(rows)} rows to {OUTPUT_CSV}...")
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=FEATURE_COLS)
writer.writeheader()
writer.writerows(rows)
# Summary stats
print(f"\n📊 Data Summary:")
print(f" Total rows: {len(rows)}")
# Label distributions
ms_dist = defaultdict(int)
ou25_dist = defaultdict(int)
btts_dist = defaultdict(int)
with_odds = sum(1 for r in rows if r.get("odds_ms_h", 0) > 0)
for r in rows:
ms_dist[r["label_ms"]] += 1
ou25_dist[r["label_ou25"]] += 1
btts_dist[r["label_btts"]] += 1
n = len(rows)
print(f" With odds: {with_odds} ({with_odds/n*100:.1f}%)")
print(f" MS dist: Home={ms_dist[0]/n*100:.1f}% Draw={ms_dist[1]/n*100:.1f}% Away={ms_dist[2]/n*100:.1f}%")
print(f" O/U 2.5: Over={ou25_dist[1]/n*100:.1f}% Under={ou25_dist[0]/n*100:.1f}%")
print(f" BTTS: Yes={btts_dist[1]/n*100:.1f}% No={btts_dist[0]/n*100:.1f}%")
# HT/FT Distribution
htft_dist = defaultdict(int)
for r in rows:
if r.get("label_ht_ft") is not None and r.get("label_ht_ft") != "":
htft_dist[r["label_ht_ft"]] += 1
print(f"\n HT/FT Distribution:")
# Interesting ones: 1/2 (2) and 2/1 (6)
rev_1_2 = htft_dist.get(2, 0)
rev_2_1 = htft_dist.get(6, 0)
print(f" 1/2 (Home->Away): {rev_1_2} ({rev_1_2/n*100:.2f}%)")
print(f" 2/1 (Away->Home): {rev_2_1} ({rev_2_1/n*100:.2f}%)")
print(f" 1/1: {htft_dist.get(0,0)} | X/X: {htft_dist.get(4,0)} | 2/2: {htft_dist.get(8,0)}")
# Feature NaN check
nan_cols = []
for col in FEATURE_COLS:
nans = sum(1 for r in rows if r.get(col, "") == "" or r.get(col) is None)
if nans > 0 and col not in ("ht_score_home", "ht_score_away", "ht_total_goals",
"label_ht_result", "label_ht_ou05", "label_ht_ou15"):
nan_cols.append((col, nans))
if nan_cols:
print(f"\n ⚠️ Columns with missing values:")
for col, cnt in nan_cols:
print(f" {col}: {cnt} ({cnt/n*100:.1f}%)")
else:
print(f"\n ✅ No missing values in feature columns!")
print(f"\n✅ Done! Output: {OUTPUT_CSV}")
conn.close()
if __name__ == "__main__":
main()