5b5f83c8cf
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s
- goals_form now uses avg of last 5 historical matches instead of current match goals - squad_quality removes current match goals/assists, uses only pre-match known data - adds temporal filtering via match_id -> mst_utc mapping
1195 lines
47 KiB
Python
Executable File
1195 lines
47 KiB
Python
Executable File
"""
|
||
XGBoost Training Data Extraction
|
||
=================================
|
||
Batch feature extraction for top-league matches.
|
||
Extracts ~102 features + labels per match for XGBoost model training.
|
||
|
||
Usage:
|
||
python3 scripts/extract_training_data.py
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import csv
|
||
import math
|
||
import time
|
||
from datetime import datetime
|
||
from collections import defaultdict
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
from dotenv import load_dotenv
|
||
|
||
load_dotenv()
|
||
|
||
# =============================================================================
|
||
# CONFIG
|
||
# =============================================================================
|
||
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
sys.path.insert(0, AI_ENGINE_DIR)
|
||
|
||
from features.upset_engine import get_upset_engine
|
||
from features.referee_engine import get_referee_engine
|
||
from features.momentum_engine import get_momentum_engine
|
||
|
||
# Location of qualified_leagues.json, one directory above the AI engine root.
# NOTE(review): presumably the list of league ids eligible for training — confirm against the caller.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")

# Destination of the extracted training set: <AI engine root>/data/training_data.csv
_OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
OUTPUT_CSV = os.path.join(_OUTPUT_DIR, "training_data.csv")

# Create the output directory at import time; a pre-existing directory is not an error.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
|
||
|
||
|
||
def get_conn():
    """Open a new psycopg2 connection using the ``DATABASE_URL`` environment variable.

    Everything from the first ``?schema=`` onwards is cut off the URL before it is
    handed to ``psycopg2.connect``.
    NOTE(review): presumably this strips an ORM-style schema suffix that libpq does
    not understand — confirm.

    When ``DATABASE_URL`` is unset the empty string is passed to ``psycopg2.connect``.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    # partition() keeps the whole string when the marker is absent, like split()[0].
    dsn, _marker, _schema = raw_url.partition("?schema=")
    return psycopg2.connect(dsn)
|
||
|
||
|
||
# =============================================================================
|
||
# FEATURE COLUMNS (ORDER MATTERS — matches CSV header)
|
||
# =============================================================================
|
||
FEATURE_COLS = [
    # --- Match identifiers (5) ---
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",

    # --- ELO (8) ---
    "home_overall_elo",
    "away_overall_elo",
    "elo_diff",
    "home_home_elo",
    "away_away_elo",
    "home_form_elo",
    "away_form_elo",
    "form_elo_diff",

    # --- Form (12) ---
    "home_goals_avg",
    "home_conceded_avg",
    "away_goals_avg",
    "away_conceded_avg",
    "home_clean_sheet_rate",
    "away_clean_sheet_rate",
    "home_scoring_rate",
    "away_scoring_rate",
    "home_winning_streak",
    "away_winning_streak",
    "home_unbeaten_streak",
    "away_unbeaten_streak",

    # --- Head-to-head (6) ---
    "h2h_total_matches",
    "h2h_home_win_rate",
    "h2h_draw_rate",
    "h2h_avg_goals",
    "h2h_btts_rate",
    "h2h_over25_rate",

    # --- Team stats (8) ---
    "home_avg_possession",
    "away_avg_possession",
    "home_avg_shots_on_target",
    "away_avg_shots_on_target",
    "home_shot_conversion",
    "away_shot_conversion",
    "home_avg_corners",
    "away_avg_corners",

    # --- Odds values (23): full-time 1X2 + implied probabilities ---
    "odds_ms_h",
    "odds_ms_d",
    "odds_ms_a",
    "implied_home",
    "implied_draw",
    "implied_away",
    # half-time 1X2
    "odds_ht_ms_h",
    "odds_ht_ms_d",
    "odds_ht_ms_a",
    # full-time over/under lines
    "odds_ou05_o",
    "odds_ou05_u",
    "odds_ou15_o",
    "odds_ou15_u",
    "odds_ou25_o",
    "odds_ou25_u",
    "odds_ou35_o",
    "odds_ou35_u",
    # half-time over/under lines
    "odds_ht_ou05_o",
    "odds_ht_ou05_u",
    "odds_ht_ou15_o",
    "odds_ht_ou15_u",
    # both teams to score
    "odds_btts_y",
    "odds_btts_n",

    # --- Odds presence flags (20): one per raw odds value above ---
    "odds_ms_h_present",
    "odds_ms_d_present",
    "odds_ms_a_present",
    "odds_ht_ms_h_present",
    "odds_ht_ms_d_present",
    "odds_ht_ms_a_present",
    "odds_ou05_o_present",
    "odds_ou05_u_present",
    "odds_ou15_o_present",
    "odds_ou15_u_present",
    "odds_ou25_o_present",
    "odds_ou25_u_present",
    "odds_ou35_o_present",
    "odds_ou35_u_present",
    "odds_ht_ou05_o_present",
    "odds_ht_ou05_u_present",
    "odds_ht_ou15_o_present",
    "odds_ht_ou15_u_present",
    "odds_btts_y_present",
    "odds_btts_n_present",

    # --- Defensive / league (4) ---
    "home_xga",
    "away_xga",
    "league_avg_goals",
    "league_zero_goal_rate",

    # --- Upset engine (4) ---
    "upset_atmosphere",
    "upset_motivation",
    "upset_fatigue",
    "upset_potential",

    # --- Referee engine (5) ---
    "referee_home_bias",
    "referee_avg_goals",
    "referee_cards_total",
    "referee_avg_yellow",
    "referee_experience",

    # --- Momentum engine (3) ---
    "home_momentum_score",
    "away_momentum_score",
    "momentum_diff",

    # --- Squad (9) ---
    "home_squad_quality",
    "away_squad_quality",
    "squad_diff",
    "home_key_players",
    "away_key_players",
    "home_missing_impact",
    "away_missing_impact",
    "home_goals_form",
    "away_goals_form",

    # --- Raw results (6) ---
    "score_home",
    "score_away",
    "total_goals",
    "ht_score_home",
    "ht_score_away",
    "ht_total_goals",

    # --- Labels (14) ---
    "label_ms",            # 0=Home, 1=Draw, 2=Away
    "label_ou05",          # 0=Under, 1=Over
    "label_ou15",
    "label_ou25",
    "label_ou35",
    "label_btts",          # 0=No, 1=Yes
    "label_ht_result",     # 0=Home, 1=Draw, 2=Away
    "label_ht_ou05",       # 0=Under, 1=Over
    "label_ht_ou15",
    "label_ht_ft",         # 0=1/1, 1=1/X, 2=1/2 ... 8=2/2
    "label_odd_even",      # 1=Odd, 0=Even
    "label_yellow_cards",  # Weighted card count (yellow=1, red=2)
    "label_cards_ou45",    # 0=Under 4.5, 1=Over 4.5
    "label_handicap_ms",   # Handicap (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
]
|
||
|
||
|
||
# =============================================================================
|
||
# BATCH LOADERS — Pre-load data to avoid N+1 queries
|
||
# =============================================================================
|
||
|
||
class BatchDataLoader:
    """Pre-loads all necessary data in bulk, then serves features per match.

    Each ``_load_*`` method runs a few set-based queries through ``self.cur`` and
    stores the result in an in-memory cache keyed by match / team / league id.

    Rows are unpacked positionally everywhere, so the cursor returned by
    ``conn.cursor()`` must yield tuple-style rows.
    """

    # A player counts as a "key player" once this many of their goals are on record.
    KEY_PLAYER_MIN_GOALS = 3
    # Number of earlier matches averaged into ``goals_form``.
    GOALS_FORM_WINDOW = 5
    # ``goals_form`` used when a team has no earlier match on record.
    GOALS_FORM_DEFAULT = 1.3

    # Category name (lower-cased, stripped) -> cache-key prefix, 1X2-style markets.
    _RESULT_MARKETS = {
        'maç sonucu': 'ms',
        '1. yarı sonucu': 'ht_ms',
    }
    # Selection name (stripped, upper-cased) -> cache-key suffix for 1X2 markets.
    _RESULT_SELECTIONS = {'1': 'h', '0': 'd', 'X': 'd', '2': 'a'}
    # Category name (lower-cased, stripped) -> cache-key prefix, over/under markets.
    _OVER_UNDER_MARKETS = {
        '0,5 alt/üst': 'ou05',
        '1,5 alt/üst': 'ou15',
        '2,5 alt/üst': 'ou25',
        '3,5 alt/üst': 'ou35',
        '1. yarı 0,5 alt/üst': 'ht_ou05',
        '1. yarı 1,5 alt/üst': 'ht_ou15',
    }
    _BTTS_MARKET = 'karşılıklı gol'

    def __init__(self, conn, top_league_ids: list):
        """Store the connection, open a cursor and create the empty caches.

        Args:
            conn: DB-API connection; one cursor is opened from it and reused.
            top_league_ids: league ids used to filter the league-scoped queries.
        """
        self.conn = conn
        self.cur = conn.cursor()
        self.top_league_ids = top_league_ids

        # Pre-loaded data caches
        self.matches = []
        self.odds_cache = {}          # match_id → {ms_h, ms_d, ms_a, ...}
        self.team_stats_cache = {}    # (team_id, before_date_bucket) → stats
        self.form_cache = {}          # (team_id, match_id) → form features
        self.h2h_cache = {}           # (home_id, away_id, match_id) → h2h features
        self.league_stats_cache = {}  # league_id → {avg_goals, zero_rate}
        self.squad_cache = {}         # (match_id, team_id) → {squad_quality, key_players, ...}
        self.cards_cache = {}         # match_id → weighted card total

        # Filled by _load_team_history(); created here so the attributes always exist.
        self.team_matches = defaultdict(list)  # team_id → [(mst, is_home, team_score, opp_score, opp_id)]
        self.team_stats = defaultdict(list)    # team_id → [(mst, poss, sot, total_shots, corners, goals)]

    def _league_placeholders(self) -> str:
        """Return one ``%s`` placeholder per league id, comma separated."""
        return ",".join(["%s"] * len(self.top_league_ids))

    def load_all(self):
        """Load all data in batch, printing the elapsed time of each stage."""
        t0 = time.time()

        self._load_matches()
        print(f" ✅ Matches: {len(self.matches)} ({time.time()-t0:.1f}s)", flush=True)

        t1 = time.time()
        self._load_odds()
        print(f" ✅ Odds: {len(self.odds_cache)} matches ({time.time()-t1:.1f}s)", flush=True)

        t2 = time.time()
        self._load_league_stats()
        print(f" ✅ League stats: {len(self.league_stats_cache)} leagues ({time.time()-t2:.1f}s)", flush=True)

        # Must run before _load_squad_data(), which reads self.team_matches.
        t3 = time.time()
        self._load_team_history()
        print(f" ✅ Team History & Stats cache built ({time.time()-t3:.1f}s)", flush=True)

        t4 = time.time()
        self._load_squad_data()
        print(f" ✅ Squad data: {len(self.squad_cache)} team-matches ({time.time()-t4:.1f}s)", flush=True)

        t5 = time.time()
        self._load_cards_data()
        print(f" ✅ Cards data: {len(self.cards_cache)} matches ({time.time()-t5:.1f}s)", flush=True)

        print(f" 📊 Total load time: {time.time()-t0:.1f}s", flush=True)

    def _load_matches(self):
        """Load finished football matches of the configured leagues, oldest first.

        Row layout (relied on positionally by other code in this file):
        ``(id, home_team_id, away_team_id, score_home, score_away, ht_score_home,
        ht_score_away, mst_utc, league_id, home_name, away_name, league_name)``.
        """
        if not self.top_league_ids:
            # "IN ()" is a SQL syntax error; no configured league means no match.
            self.matches = []
            return

        ph = self._league_placeholders()
        self.cur.execute(f"""
            SELECT m.id, m.home_team_id, m.away_team_id,
                   m.score_home, m.score_away,
                   m.ht_score_home, m.ht_score_away,
                   m.mst_utc, m.league_id,
                   ht.name as home_name,
                   at.name as away_name,
                   l.name as league_name
            FROM matches m
            JOIN teams ht ON m.home_team_id = ht.id
            JOIN teams at ON m.away_team_id = at.id
            JOIN leagues l ON m.league_id = l.id
            WHERE m.status = 'FT'
              AND m.score_home IS NOT NULL
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
            ORDER BY m.mst_utc ASC
        """, self.top_league_ids)

        self.matches = self.cur.fetchall()

    @classmethod
    def _odds_key(cls, category, selection):
        """Map a market/selection pair to its ``odds_cache`` key.

        Args:
            category: market name, already lower-cased and stripped.
            selection: raw selection name as stored in the database.

        Returns:
            The cache key (e.g. ``"ms_h"``, ``"ou25_o"``) or ``None`` when the
            market or the selection is not one this loader tracks.
        """
        sel_lower = selection.lower().strip()

        prefix = cls._RESULT_MARKETS.get(category)
        if prefix is not None:
            # Tolerate surrounding whitespace and a lower-case "x" for the draw.
            side = cls._RESULT_SELECTIONS.get(selection.strip().upper())
            return f"{prefix}_{side}" if side else None

        if category == cls._BTTS_MARKET:
            if 'var' in sel_lower:
                return 'btts_y'
            if 'yok' in sel_lower:
                return 'btts_n'
            return None

        prefix = cls._OVER_UNDER_MARKETS.get(category)
        if prefix is not None:
            # "alt" = under, "üst" = over; "alt" is tested first, as before.
            if 'alt' in sel_lower:
                return f"{prefix}_u"
            if 'üst' in sel_lower:
                return f"{prefix}_o"
        return None

    def _load_odds(self):
        """Bulk load all odds for top league matches into ``odds_cache``.

        Rows with a missing, non-numeric or non-positive odd value, or with an
        empty category / selection name, are skipped. A match gets an (initially
        empty) cache entry as soon as one valid row is seen for it, even when the
        row's market is not tracked.
        """
        if not self.top_league_ids:
            return

        ph = self._league_placeholders()
        self.cur.execute(f"""
            SELECT oc.match_id, oc.name, os.name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            JOIN matches m ON oc.match_id = m.id
            WHERE m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
        """, self.top_league_ids)

        for match_id, cat_name, sel_name, odd_val in self.cur.fetchall():
            try:
                value = float(odd_val) if odd_val else 0
            except (ValueError, TypeError):
                continue  # unparseable odd value: best-effort skip

            if value <= 0 or not cat_name or not sel_name:
                continue

            match_odds = self.odds_cache.setdefault(match_id, {})
            key = self._odds_key(cat_name.lower().strip(), sel_name)
            if key is not None:
                match_odds[key] = value

    def _load_league_stats(self):
        """Calculate league-level aggregated stats.

        Stores ``avg_goals``, ``zero_rate`` (share of 0-0 matches) and
        ``match_count`` per league. Defaults (2.5 / 0.07) are applied only when the
        aggregate is NULL; a genuine 0.0 is kept.

        NOTE(review): the aggregates cover every finished match of the league,
        including matches played after the one a row describes — confirm this is
        acceptable for training.
        """
        if not self.top_league_ids:
            return

        ph = self._league_placeholders()
        self.cur.execute(f"""
            SELECT league_id,
                   AVG(score_home + score_away) as avg_goals,
                   AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate,
                   COUNT(*) as match_count
            FROM matches
            WHERE status = 'FT'
              AND score_home IS NOT NULL
              AND sport = 'football'
              AND league_id IN ({ph})
            GROUP BY league_id
        """, self.top_league_ids)

        for league_id, avg_goals, zero_rate, cnt in self.cur.fetchall():
            self.league_stats_cache[league_id] = {
                "avg_goals": float(avg_goals) if avg_goals is not None else 2.5,
                "zero_rate": float(zero_rate) if zero_rate is not None else 0.07,
                "match_count": cnt,
            }

    def _load_team_history(self):
        """Bulk load every finished football match and per-team match stats.

        Populates ``team_matches`` and ``team_stats`` (both ordered by kickoff
        time, oldest first). Unlike the other loaders this is not restricted to
        the configured leagues.
        """
        self.team_matches = defaultdict(list)
        self.team_stats = defaultdict(list)

        # Load all matches for form/h2h
        self.cur.execute("""
            SELECT home_team_id, away_team_id, score_home, score_away, mst_utc
            FROM matches
            WHERE status = 'FT' AND score_home IS NOT NULL AND sport = 'football'
            ORDER BY mst_utc ASC
        """)
        for hid, aid, sh, sa, mst in self.cur.fetchall():
            # (mst, is_home, team_score, opp_score, opp_id)
            self.team_matches[hid].append((mst, True, sh, sa, aid))
            self.team_matches[aid].append((mst, False, sa, sh, hid))

        # Load all football_team_stats (sport-partitioned schema)
        self.cur.execute("""
            SELECT mts.team_id, m.mst_utc, mts.possession_percentage,
                   mts.shots_on_target, mts.total_shots, mts.corners,
                   m.score_home, m.score_away, m.home_team_id
            FROM football_team_stats mts
            JOIN matches m ON mts.match_id = m.id
            WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL
            ORDER BY m.mst_utc ASC
        """)
        for tid, mst, poss, sot, tshots, corn, sh, sa, hid in self.cur.fetchall():
            team_goals = sh if hid == tid else sa
            self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))

    def _load_squad_data(self):
        """Bulk load squad participation + player events and build ``squad_cache``.

        Every value stored for a ``(match_id, team_id)`` pair uses only information
        dated strictly before that match's kickoff:

        * ``key_players`` / ``missing_impact``: a player is a key player for a
          match only if their ``KEY_PLAYER_MIN_GOALS``-th recorded goal was scored
          in a match that kicked off earlier.
        * ``goals_form``: average goals over the team's last ``GOALS_FORM_WINDOW``
          earlier matches (``GOALS_FORM_DEFAULT`` when there are none).
        * ``squad_quality``: starting-XI size, starting key players and starting
          forwards, weighted 0.3 / 3.0 / 1.5.

        Requires ``self.matches`` and ``self.team_matches`` to be loaded first.
        """
        from bisect import bisect_left

        if not self.top_league_ids:
            return

        ph = self._league_placeholders()

        # 1) Participation: starting XI count + forwards per (match, team)
        self.cur.execute(f"""
            SELECT mpp.match_id, mpp.team_id,
                   COUNT(*) FILTER (WHERE mpp.is_starting = true) AS starting_count,
                   COUNT(*) AS total_squad,
                   COUNT(*) FILTER (WHERE mpp.is_starting = true AND LOWER(COALESCE(mpp.position::TEXT,'')) ~ '(forward|fwd|forvet|striker)') AS fwd_count
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id = m.id
            WHERE m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
            GROUP BY mpp.match_id, mpp.team_id
        """, self.top_league_ids)

        participation = {
            (mid, tid): {
                'starting_count': st_count or 0,
                'total_squad': tot or 0,
                'fwd_count': fwd or 0,
            }
            for mid, tid, st_count, tot, fwd in self.cur.fetchall()
        }

        # 2) (match, team) pairs that have any player event. Only the key set is
        #    needed: current-match goals/assists must not feed the features.
        self.cur.execute(f"""
            SELECT DISTINCT mpe.match_id, mpe.team_id
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id = m.id
            WHERE m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
        """, self.top_league_ids)
        event_keys = {(mid, tid) for mid, tid in self.cur.fetchall()}

        # 3) Kickoff time of every goal, per (team, player), so that key-player
        #    status can be decided as of a given match instead of over all time.
        self.cur.execute(f"""
            SELECT mpe.team_id, mpe.player_id, m.mst_utc
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id = m.id
            WHERE m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
              AND mpe.event_type = 'goal'
              AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
              AND m.mst_utc IS NOT NULL
            ORDER BY m.mst_utc ASC
        """, self.top_league_ids)

        goal_times = defaultdict(lambda: defaultdict(list))
        for tid, pid, goal_mst in self.cur.fetchall():
            goal_times[tid][pid].append(goal_mst)

        min_goals = self.KEY_PLAYER_MIN_GOALS
        # key_since[team][player] = kickoff of the match holding their N-th goal
        key_since = {
            tid: {
                pid: sorted(times)[min_goals - 1]
                for pid, times in players.items()
                if len(times) >= min_goals
            }
            for tid, players in goal_times.items()
        }

        # 4) Starting players per (match, team)
        self.cur.execute(f"""
            SELECT mpp.match_id, mpp.team_id, mpp.player_id
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id = m.id
            WHERE mpp.is_starting = true
              AND m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
        """, self.top_league_ids)

        starting_players = defaultdict(list)
        for mid, tid, pid in self.cur.fetchall():
            starting_players[(mid, tid)].append(pid)

        # 5) match_id → mst_utc mapping for temporal filtering (m[0]=id, m[7]=mst_utc)
        match_mst = {m[0]: m[7] for m in self.matches}

        # 6) Build combined cache — pre-match information only
        empty_part = {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}
        window = self.GOALS_FORM_WINDOW
        history_index = {}  # team_id → (sorted kickoff times, goals scored, aligned)

        for key in set(participation) | event_keys:
            mid, tid = key
            part = participation.get(key, empty_part)
            # Matches unknown to self.matches (or without a kickoff time) get 0,
            # i.e. "nothing happened before".
            current_mst = match_mst.get(mid) or 0

            # Key players known before kickoff, and how many of them start
            known_key = {
                pid for pid, since in key_since.get(tid, {}).items()
                if since < current_mst
            }
            kp_in_starting = sum(1 for p in starting_players.get(key, []) if p in known_key)
            kp_total = len(known_key)
            kp_missing = max(0, kp_total - kp_in_starting)

            squad_quality = (
                part['starting_count'] * 0.3 +
                kp_in_starting * 3.0 +
                part['fwd_count'] * 1.5
            )
            # Share of known key players that are not in the starting XI
            missing_impact = min(kp_missing / max(kp_total, 1), 1.0)

            # goals_form: binary-search the team's history instead of rescanning it
            if tid not in history_index:
                dated = sorted(
                    ((tm[0], tm[2]) for tm in self.team_matches.get(tid, []) if tm[0] is not None),
                    key=lambda item: item[0],
                )
                history_index[tid] = ([t for t, _ in dated], [g for _, g in dated])
            kickoffs, goals_scored = history_index[tid]
            cut = bisect_left(kickoffs, current_mst)  # first match NOT before this one
            recent_goals = goals_scored[max(0, cut - window):cut]
            goals_form = (
                sum(recent_goals) / len(recent_goals)
                if recent_goals else self.GOALS_FORM_DEFAULT
            )

            self.squad_cache[key] = {
                'squad_quality': squad_quality,
                'key_players': kp_in_starting,
                'missing_impact': missing_impact,
                'goals_form': round(goals_form, 2),
            }

    def _load_cards_data(self):
        """Bulk load the weighted card total per match (yellow=1, red=2).

        Event types containing ``yellow_card`` weigh 1, those containing
        ``red_card`` weigh 2, any other card-like type weighs 1. Matches without a
        card event get no cache entry.
        """
        if not self.top_league_ids:
            return

        ph = self._league_placeholders()
        self.cur.execute(f"""
            SELECT mpe.match_id,
                   SUM(CASE
                       WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
                       WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2
                       ELSE 1 END) as cards_weight
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id = m.id
            WHERE m.status = 'FT'
              AND m.sport = 'football'
              AND m.league_id IN ({ph})
              AND mpe.event_type::text LIKE '%%card%%'
            GROUP BY mpe.match_id
        """, self.top_league_ids)

        for mid, cards_weight in self.cur.fetchall():
            self.cards_cache[mid] = float(cards_weight) if cards_weight else 0.0
|
||
|
||
|
||
class FeatureExtractor:
|
||
"""Extract features for a single match using pre-loaded data + on-demand queries."""
|
||
|
||
def __init__(self, conn, loader: BatchDataLoader):
    """Bind the connection and the pre-loaded data, and set up per-run state.

    Args:
        conn: database connection; a cursor is opened from it and kept on ``self.cur``.
        loader: ``BatchDataLoader`` whose caches are read during extraction.
    """

    def _fresh_rating():
        # Every team starts from the same baseline in all four ELO variants.
        return {
            "overall": 1500.0, "home": 1500.0,
            "away": 1500.0, "form": 1500.0,
            "matches": 0
        }

    self.conn = conn
    self.cur = conn.cursor()
    self.loader = loader

    # ELO cache: team_id → {overall, home, away, form, matches}; created lazily
    # on first access and advanced by _update_elo().
    self.elo_ratings = defaultdict(_fresh_rating)
    self._elo_initialized = False

    # Feature engines, obtained from their module-level factories.
    self.upset_engine = get_upset_engine()
    self.referee_engine = get_referee_engine()
    self.momentum_engine = get_momentum_engine()
|
||
|
||
def extract_all(self) -> list:
|
||
"""Extract features for all matches, yield row dicts."""
|
||
matches = self.loader.matches
|
||
total = len(matches)
|
||
rows = []
|
||
skipped = 0
|
||
t_start = time.time()
|
||
|
||
print(f"\n🔄 Extracting features for {total} matches...", flush=True)
|
||
|
||
# Process chronologically — ELO grows as we go
|
||
for i, m in enumerate(matches):
|
||
(
|
||
mid,
|
||
hid,
|
||
aid,
|
||
sh,
|
||
sa,
|
||
hth,
|
||
hta,
|
||
mst,
|
||
lid,
|
||
home_name,
|
||
away_name,
|
||
league_name,
|
||
) = m
|
||
|
||
if i % 100 == 0 and i > 0:
|
||
elapsed = time.time() - t_start
|
||
rate = i / elapsed # matches per second
|
||
remaining = (total - i) / rate if rate > 0 else 0
|
||
pct = i / total * 100
|
||
print(f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
|
||
|
||
row = self._extract_one(
|
||
mid,
|
||
hid,
|
||
aid,
|
||
sh,
|
||
sa,
|
||
hth,
|
||
hta,
|
||
mst,
|
||
lid,
|
||
home_name,
|
||
away_name,
|
||
league_name,
|
||
)
|
||
|
||
if row:
|
||
rows.append(row)
|
||
else:
|
||
skipped += 1
|
||
|
||
# Update ELO after processing (so ELO is calculated BEFORE the match)
|
||
self._update_elo(hid, aid, sh, sa)
|
||
|
||
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
|
||
return rows
|
||
|
||
def _extract_one(
    self,
    mid,
    hid,
    aid,
    sh,
    sa,
    hth,
    hta,
    mst,
    lid,
    home_name,
    away_name,
    league_name,
):
    """Build the feature + label row for a single match.

    Args:
        mid, hid, aid, lid: match, home team, away team and league ids.
        sh, sa: full-time home / away score.
        hth, hta: half-time home / away score; either may be ``None``.
        mst: kickoff value from ``matches.mst_utc``; passed to the engines as
            ``match_date_ms``.
        home_name, away_name, league_name: display names passed to the upset engine.

    Returns:
        dict keyed by column name. Half-time values and labels are ``""`` when
        the half-time score is unknown.
    """

    def _blank_if_none(value):
        # Missing values are written as an empty cell.
        return "" if value is None else value

    # === LABELS ===
    total_goals = sh + sa
    has_ht = hth is not None and hta is not None
    ht_total = (hth + hta) if has_ht else None

    # 0=Home, 1=Draw, 2=Away
    if sh > sa:
        label_ms = 0
    elif sh < sa:
        label_ms = 2
    else:
        label_ms = 1

    # Over/under lines: 1 when the total is above the line
    label_ou05, label_ou15, label_ou25, label_ou35 = (
        1 if total_goals > line else 0 for line in (0.5, 1.5, 2.5, 3.5)
    )
    label_btts = 1 if (sh > 0 and sa > 0) else 0
    label_odd_even = 1 if total_goals % 2 != 0 else 0

    # Handicap MS Label (Home starts -1): 0=Home wins by 2+, 1=Home wins by exactly 1, 2=Draw or Away wins
    score_diff = sh - sa
    if score_diff >= 2:
        label_handicap_ms = 0
    elif score_diff == 1:
        label_handicap_ms = 1
    else:
        label_handicap_ms = 2

    # Cards label: weighted total from the loader; 0.0 when the match has no entry.
    # NOTE(review): a match with no card data is indistinguishable from a card-free match here.
    cards_count = self.loader.cards_cache.get(mid, 0.0)
    label_yellow_cards = cards_count
    label_cards_ou45 = 1 if cards_count > 4.5 else 0

    # HT labels (left as None when the half-time score is unknown)
    label_ht_result = None
    label_ht_ou05 = None
    label_ht_ou15 = None
    if has_ht:
        if hth > hta:
            label_ht_result = 0
        elif hth < hta:
            label_ht_result = 2
        else:
            label_ht_result = 1
        label_ht_ou05 = 1 if ht_total > 0.5 else 0
        label_ht_ou15 = 1 if ht_total > 1.5 else 0

    # HT/FT Label (0-8) = HT result * 3 + FT result
    # 0: 1/1, 1: 1/X, 2: 1/2
    # 3: X/1, 4: X/X, 5: X/2
    # 6: 2/1, 7: 2/X, 8: 2/2
    label_ht_ft = None
    if label_ht_result is not None:
        label_ht_ft = label_ht_result * 3 + label_ms

    # === ELO FEATURES === (ratings as they stand before this match is applied)
    h_elo = self.elo_ratings[hid]
    a_elo = self.elo_ratings[aid]
    elo_features = {
        "home_overall_elo": h_elo["overall"],
        "away_overall_elo": a_elo["overall"],
        "elo_diff": h_elo["overall"] - a_elo["overall"],
        "home_home_elo": h_elo["home"],
        "away_away_elo": a_elo["away"],
        "home_form_elo": h_elo["form"],
        "away_form_elo": a_elo["form"],
        "form_elo_diff": h_elo["form"] - a_elo["form"],
    }

    # === FORM / H2H / TEAM STATS FEATURES ===
    form_features = self._get_form_features(hid, aid, mst)
    h2h_features = self._get_h2h_features(hid, aid, mst)
    stats_features = self._get_team_stats_features(hid, aid, mst)

    # === ODDS FEATURES ===
    odds = self.loader.odds_cache.get(mid, {})
    ms_h = odds.get("ms_h", 0.0)
    ms_d = odds.get("ms_d", 0.0)
    ms_a = odds.get("ms_a", 0.0)

    # Implied probabilities (normalized vig-free); flat 0.33 when any 1X2 odd is missing
    if ms_h > 0 and ms_d > 0 and ms_a > 0:
        raw_sum = 1/ms_h + 1/ms_d + 1/ms_a
        implied_home = (1/ms_h) / raw_sum
        implied_draw = (1/ms_d) / raw_sum
        implied_away = (1/ms_a) / raw_sum
    else:
        implied_home = implied_draw = implied_away = 0.33

    # Every non-1X2 market key held in the odds cache, in output column order.
    secondary_keys = (
        "ht_ms_h", "ht_ms_d", "ht_ms_a",
        "ou05_o", "ou05_u",
        "ou15_o", "ou15_u",
        "ou25_o", "ou25_u",
        "ou35_o", "ou35_u",
        "ht_ou05_o", "ht_ou05_u",
        "ht_ou15_o", "ht_ou15_u",
        "btts_y", "btts_n",
    )

    odds_features = {
        "odds_ms_h": ms_h,
        "odds_ms_d": ms_d,
        "odds_ms_a": ms_a,
        "implied_home": implied_home,
        "implied_draw": implied_draw,
        "implied_away": implied_away,
    }
    # Raw odds value per market, 0.0 when absent
    for market in secondary_keys:
        odds_features[f"odds_{market}"] = odds.get(market, 0.0)
    # Presence flag per market: 1.0 only for an odd above 1.01
    for market in ("ms_h", "ms_d", "ms_a") + secondary_keys:
        odds_features[f"odds_{market}_present"] = 1.0 if odds.get(market, 0.0) > 1.01 else 0.0

    # === LEAGUE FEATURES ===
    league = self.loader.league_stats_cache.get(lid, {"avg_goals": 2.5, "zero_rate": 0.07})
    league_features = {
        "league_avg_goals": league["avg_goals"],
        "league_zero_goal_rate": league["zero_rate"],
    }

    # === UPSET FEATURES === (best-effort: any engine failure falls back to zeros)
    try:
        upset_feats = self.upset_engine.get_features(
            home_team_name=home_name or "",
            home_team_id=hid,
            away_team_name=away_name or "",
            league_name=league_name or "",
            # League positions are not available here; a fixed 10 is passed for both sides.
            home_position=10,
            away_position=10,
            match_date_ms=mst,
        )
    except Exception:
        upset_feats = {
            "upset_atmosphere": 0.0,
            "upset_motivation": 0.0,
            "upset_fatigue": 0.0,
            "upset_potential": 0.0,
        }

    # === REFEREE FEATURES === (best-effort, neutral defaults on failure)
    try:
        referee_feats = self.referee_engine.get_features(
            match_id=mid,
            league_id=str(lid),
        )
    except Exception:
        referee_feats = {
            "referee_home_bias": 0.0,
            "referee_avg_goals": 2.7,
            "referee_cards_total": 4.0,
            "referee_avg_yellow": 3.5,
            "referee_avg_red": 0.1,
            "referee_experience": 0.5,
        }

    # === MOMENTUM FEATURES === (best-effort, 0.5 / 0.5 / 0.0 on failure)
    try:
        momentum_feats = self.momentum_engine.get_features(
            home_team_id=hid,
            away_team_id=aid,
            match_date_ms=mst,
        )
        home_momentum_score = momentum_feats.get("home_momentum_score", 0.5)
        away_momentum_score = momentum_feats.get("away_momentum_score", 0.5)
        momentum_diff = momentum_feats.get(
            "momentum_diff", home_momentum_score - away_momentum_score
        )
    except Exception:
        home_momentum_score = 0.5
        away_momentum_score = 0.5
        momentum_diff = 0.0

    # === SQUAD FEATURES === (zeros when the loader has no entry for the pair)
    home_sq = self.loader.squad_cache.get((mid, hid), {})
    away_sq = self.loader.squad_cache.get((mid, aid), {})
    home_squad_quality = home_sq.get('squad_quality', 0.0)
    away_squad_quality = away_sq.get('squad_quality', 0.0)

    # === ASSEMBLE ROW ===
    return {
        "match_id": mid,
        "home_team_id": hid,
        "away_team_id": aid,
        "league_id": lid,
        "mst_utc": mst,

        **elo_features,
        **form_features,
        **h2h_features,
        **stats_features,
        **odds_features,

        # The "xGA" columns reuse the conceded-goals form averages.
        "home_xga": form_features["home_conceded_avg"],
        "away_xga": form_features["away_conceded_avg"],
        **league_features,
        "upset_atmosphere": upset_feats.get("upset_atmosphere", 0.0),
        "upset_motivation": upset_feats.get("upset_motivation", 0.0),
        "upset_fatigue": upset_feats.get("upset_fatigue", 0.0),
        "upset_potential": upset_feats.get("upset_potential", 0.0),
        "referee_home_bias": referee_feats.get("referee_home_bias", 0.0),
        "referee_avg_goals": referee_feats.get("referee_avg_goals", 2.7),
        "referee_cards_total": referee_feats.get("referee_cards_total", 4.0),
        "referee_avg_yellow": referee_feats.get("referee_avg_yellow", 3.5),
        "referee_experience": referee_feats.get("referee_experience", 0.5),
        "home_momentum_score": home_momentum_score,
        "away_momentum_score": away_momentum_score,
        "momentum_diff": momentum_diff,

        # Squad Features
        "home_squad_quality": home_squad_quality,
        "away_squad_quality": away_squad_quality,
        "squad_diff": home_squad_quality - away_squad_quality,
        "home_key_players": home_sq.get('key_players', 0),
        "away_key_players": away_sq.get('key_players', 0),
        "home_missing_impact": home_sq.get('missing_impact', 0.0),
        "away_missing_impact": away_sq.get('missing_impact', 0.0),
        "home_goals_form": home_sq.get('goals_form', 0),
        "away_goals_form": away_sq.get('goals_form', 0),

        # Labels
        "score_home": sh,
        "score_away": sa,
        "total_goals": total_goals,
        "ht_score_home": _blank_if_none(hth),
        "ht_score_away": _blank_if_none(hta),
        "ht_total_goals": _blank_if_none(ht_total),
        "label_ms": label_ms,
        "label_ou05": label_ou05,
        "label_ou15": label_ou15,
        "label_ou25": label_ou25,
        "label_ou35": label_ou35,
        "label_btts": label_btts,
        "label_ht_result": _blank_if_none(label_ht_result),
        "label_ht_ou05": _blank_if_none(label_ht_ou05),
        "label_ht_ou15": _blank_if_none(label_ht_ou15),
        "label_ht_ft": _blank_if_none(label_ht_ft),
        "label_odd_even": label_odd_even,
        "label_yellow_cards": label_yellow_cards,
        "label_cards_ou45": label_cards_ou45,
        "label_handicap_ms": label_handicap_ms,
    }
|
||
|
||
# -------------------------------------------------------------------------
|
||
# ELO (simplified inline version — doesn't need DB, grows incrementally)
|
||
# -------------------------------------------------------------------------
|
||
def _update_elo(self, home_id, away_id, score_home, score_away):
    """Fold one finished match into the in-memory ELO table.

    Both teams are looked up in ``self.elo_ratings`` and their entries are
    mutated in place: ``overall`` for both sides, ``home`` for the home
    team, ``away`` for the away team, plus ``form`` and ``matches`` for
    both. Nothing is returned.

    Args:
        home_id: Key of the home team in ``self.elo_ratings``.
        away_id: Key of the away team in ``self.elo_ratings``.
        score_home: Goals scored by the home team.
        score_away: Goals scored by the away team.
    """
    home_advantage = 65
    k_base = 32

    home_elo = self.elo_ratings[home_id]
    away_elo = self.elo_ratings[away_id]

    # Win expectancy from the rating gap; the home side gets a fixed bonus.
    rating_gap = away_elo["overall"] - home_elo["overall"] - home_advantage
    expected_home = 1.0 / (1 + 10 ** (rating_gap / 400))
    expected_away = 1.0 - expected_home

    # Outcome in points: win = 1, draw = 0.5, loss = 0.
    if score_home > score_away:
        outcome_home = 1.0
    elif score_home < score_away:
        outcome_home = 0.0
    else:
        outcome_home = 0.5
    outcome_away = 1.0 - outcome_home

    surprise_home = outcome_home - expected_home
    surprise_away = outcome_away - expected_away

    # Larger winning margins move the ratings further (log-damped).
    margin = abs(score_home - score_away)
    margin_factor = math.log(max(margin, 1) + 1) * 0.7 + 1.0

    # Teams with fewer than 10 recorded matches get a 30% larger K.
    scaled_k = k_base * margin_factor
    shift_home = scaled_k * (1.3 if home_elo["matches"] < 10 else 1.0) * surprise_home
    shift_away = scaled_k * (1.3 if away_elo["matches"] < 10 else 1.0) * surprise_away

    home_elo["overall"] += shift_home
    away_elo["overall"] += shift_away
    # Venue-specific ratings react 10% more strongly than the overall one.
    home_elo["home"] += shift_home * 1.1
    away_elo["away"] += shift_away * 1.1

    # Form rating: 85/15 blend of the old value and a boosted-K update.
    form_k = k_base * 1.5 * margin_factor
    home_elo["form"] = (
        home_elo["form"] * 0.85
        + (home_elo["form"] + form_k * surprise_home) * 0.15
    )
    away_elo["form"] = (
        away_elo["form"] * 0.85
        + (away_elo["form"] + form_k * surprise_away) * 0.15
    )

    home_elo["matches"] += 1
    away_elo["matches"] += 1
|
||
|
||
# -------------------------------------------------------------------------
|
||
# FORM (last 5 matches)
|
||
# -------------------------------------------------------------------------
|
||
def _get_form_features(self, home_id, away_id, before_date) -> dict:
    """Build the recent-form feature dict for a fixture.

    Calls ``self._calc_team_form`` once per team (home first) and copies
    the results into ``home_*`` / ``away_*`` prefixed keys.

    Args:
        home_id: Home team identifier.
        away_id: Away team identifier.
        before_date: Cut-off passed through to ``_calc_team_form``.

    Returns:
        Dict with twelve entries: six form metrics for each side.
    """
    form_by_side = {
        "home": self._calc_team_form(home_id, before_date),
        "away": self._calc_team_form(away_id, before_date),
    }

    features = {}
    # Goal averages are grouped per team (home pair, then away pair) ...
    for side in ("home", "away"):
        for metric in ("goals_avg", "conceded_avg"):
            features[f"{side}_{metric}"] = form_by_side[side][metric]
    # ... while the remaining metrics alternate home/away per metric.
    for metric in ("clean_sheet_rate", "scoring_rate",
                   "winning_streak", "unbeaten_streak"):
        for side in ("home", "away"):
            features[f"{side}_{metric}"] = form_by_side[side][metric]
    return features
|
||
|
||
def _calc_team_form(self, team_id, before_date, limit=5) -> dict:
    """Summarise a team's most recent results before a cut-off.

    Reads ``self.loader.team_matches[team_id]``, whose entries are unpacked
    as ``(date, is_home, team_goals, opp_goals, opp_id)``. Only entries
    with ``date < before_date`` are used, and of those the last ``limit``.
    NOTE(review): taking the tail of the list assumes the history is stored
    in chronological order - confirm against the loader.

    Args:
        team_id: Team whose history is read.
        before_date: Exclusive upper bound on the match date.
        limit: Size of the window; also the weight of the newest match.

    Returns:
        Dict with linearly weighted ``goals_avg`` / ``conceded_avg``
        (newest match weighs ``limit``, then ``limit - 1``, ...),
        unweighted ``clean_sheet_rate`` / ``scoring_rate``, and the current
        ``winning_streak`` / ``unbeaten_streak`` counted back from the
        newest match. Fixed fallback values are returned when the team has
        no earlier match.
    """
    recent = [
        match
        for match in self.loader.team_matches.get(team_id, [])
        if match[0] < before_date
    ][-limit:]

    if not recent:
        return {
            "goals_avg": 1.3, "conceded_avg": 1.2,
            "clean_sheet_rate": 0.25, "scoring_rate": 0.75,
            "winning_streak": 0, "unbeaten_streak": 0,
        }

    newest_first = recent[::-1]

    # Weighted goal totals: the newest match counts the most.
    weighted_for = weighted_against = weight_sum = 0.0
    shutouts = games_scored_in = 0
    for age, (_, _, scored, conceded, _) in enumerate(newest_first):
        weight = float(limit - age)
        weighted_for += scored * weight
        weighted_against += conceded * weight
        weight_sum += weight
        if conceded == 0:
            shutouts += 1
        if scored > 0:
            games_scored_in += 1

    # Streaks run from the newest match back until the first failure.
    win_run = 0
    for _, _, scored, conceded, _ in newest_first:
        if not scored > conceded:
            break
        win_run += 1

    unbeaten_run = 0
    for _, _, scored, conceded, _ in newest_first:
        if not scored >= conceded:
            break
        unbeaten_run += 1

    played = len(recent)
    return {
        "goals_avg": weighted_for / weight_sum if weight_sum > 0 else 1.3,
        "conceded_avg": weighted_against / weight_sum if weight_sum > 0 else 1.2,
        "clean_sheet_rate": shutouts / played,
        "scoring_rate": games_scored_in / played,
        "winning_streak": win_run,
        "unbeaten_streak": unbeaten_run,
    }
|
||
|
||
# -------------------------------------------------------------------------
|
||
# H2H
|
||
# -------------------------------------------------------------------------
|
||
def _get_h2h_features(self, home_id, away_id, before_date) -> dict:
    """Compute head-to-head features for a fixture.

    Scans ``self.loader.team_matches[home_id]`` (entries unpacked as
    ``(date, is_home, team_goals, opp_goals, opp_id)``) for meetings with
    ``away_id`` dated strictly before ``before_date`` and keeps at most the
    last 20. Wins are counted for the ``home_id`` team regardless of which
    side hosted the earlier meeting.

    Args:
        home_id: Team treated as "home" for this fixture.
        away_id: Opponent to match against ``opp_id``.
        before_date: Exclusive upper bound on the match date.

    Returns:
        Dict with the meeting count plus win, draw, BTTS and over-2.5 rates
        and the average total goals; neutral fallback values when the two
        teams have no earlier meeting.
    """
    meetings = [
        match
        for match in self.loader.team_matches.get(home_id, [])
        if match[4] == away_id and match[0] < before_date
    ][-20:]

    if not meetings:
        return {
            "h2h_total_matches": 0,
            "h2h_home_win_rate": 0.33,
            "h2h_draw_rate": 0.33,
            "h2h_avg_goals": 2.5,
            "h2h_btts_rate": 0.5,
            "h2h_over25_rate": 0.5,
        }

    played = len(meetings)
    wins = level = goals = both_scored = over_25 = 0

    # team_goals / opp_goals are already from the home_id team's point of
    # view, so the venue flag is not needed to decide the result.
    for _, _, goals_for, goals_against, _ in meetings:
        match_goals = goals_for + goals_against
        goals += match_goals

        if goals_for > 0 and goals_against > 0:
            both_scored += 1
        if match_goals > 2.5:
            over_25 += 1

        if goals_for > goals_against:
            wins += 1
        elif goals_for == goals_against:
            level += 1

    return {
        "h2h_total_matches": played,
        "h2h_home_win_rate": wins / played,
        "h2h_draw_rate": level / played,
        "h2h_avg_goals": goals / played,
        "h2h_btts_rate": both_scored / played,
        "h2h_over25_rate": over_25 / played,
    }
|
||
|
||
# -------------------------------------------------------------------------
|
||
# TEAM STATS (possession, shots, corners)
|
||
# -------------------------------------------------------------------------
|
||
def _get_team_stats_features(self, home_id, away_id, before_date) -> dict:
    """Build the team-level match-stat features for a fixture.

    Calls ``self._calc_team_stats`` once per team (home first) and exposes
    possession, shots on target, shot conversion and corners under
    ``home_*`` / ``away_*`` keys.

    Args:
        home_id: Home team identifier.
        away_id: Away team identifier.
        before_date: Cut-off passed through to ``_calc_team_stats``.

    Returns:
        Dict with eight entries: four metrics for each side.
    """
    stats_by_side = (
        ("home", self._calc_team_stats(home_id, before_date)),
        ("away", self._calc_team_stats(away_id, before_date)),
    )
    # (output suffix, key in the per-team stats dict)
    metric_names = (
        ("avg_possession", "possession"),
        ("avg_shots_on_target", "shots_on_target"),
        ("shot_conversion", "shot_conversion"),
        ("avg_corners", "corners"),
    )

    features = {}
    for suffix, source_key in metric_names:
        for side, stats in stats_by_side:
            features[f"{side}_{suffix}"] = stats[source_key]
    return features
|
||
|
||
def _calc_team_stats(self, team_id, before_date, limit=10) -> dict:
    """Average a team's recent match statistics before a cut-off.

    Reads ``self.loader.team_stats[team_id]``, whose entries are unpacked
    as ``(date, possession, shots_on_target, total_shots, corners,
    team_goals)``. Only entries with ``date < before_date`` are used, and
    of those the last ``limit``.

    Args:
        team_id: Team whose stats history is read.
        before_date: Exclusive upper bound on the entry date.
        limit: Maximum number of most recent entries to average.

    Returns:
        Dict with ``possession`` (mean of the positive possession values,
        divided by 100), ``shots_on_target`` and ``corners`` (means over
        all selected entries, missing values counted as 0) and
        ``shot_conversion`` (goals per total shot). Fixed fallback values
        are used when there is no usable data.
    """
    recent = [
        entry
        for entry in self.loader.team_stats.get(team_id, [])
        if entry[0] < before_date
    ][-limit:]

    if not recent:
        return {
            "possession": 0.50, "shots_on_target": 3.5,
            "shot_conversion": 0.10, "corners": 4.5
        }

    possession_total = 0.0
    possession_samples = 0
    on_target_total = 0.0
    all_shots_total = 0.0
    corners_total = 0.0
    goals_total = 0.0

    for _, possession, on_target, all_shots, corners, goals in recent:
        # Missing or zero possession is skipped instead of dragging the
        # average down; the other stats treat missing values as 0.
        if possession and possession > 0:
            possession_total += float(possession)
            possession_samples += 1
        on_target_total += float(on_target or 0)
        all_shots_total += float(all_shots or 0)
        corners_total += float(corners or 0)
        goals_total += float(goals or 0)

    entries = len(recent)

    if possession_samples > 0:
        possession_share = possession_total / possession_samples / 100
    else:
        possession_share = 0.50

    if all_shots_total > 0:
        conversion = goals_total / all_shots_total
    else:
        conversion = 0.10

    return {
        "possession": possession_share,
        "shots_on_target": on_target_total / entries,
        "shot_conversion": conversion,
        "corners": corners_total / entries,
    }
|
||
|
||
|
||
# =============================================================================
|
||
# MAIN
|
||
# =============================================================================
|
||
def main():
    """Extract training rows for the qualified leagues and write them to CSV.

    Steps:
      1. Read the league list from ``TOP_LEAGUES_PATH``.
      2. Batch-load data through ``BatchDataLoader`` and build one row per
         match with ``FeatureExtractor.extract_all``.
      3. Write the rows to ``OUTPUT_CSV`` with ``FEATURE_COLS`` as header.
      4. Print label distributions and a missing-value report.

    The database connection is always closed, including when no rows are
    produced or when loading/extraction raises.
    """
    print("🚀 XGBoost Training Data Extraction")
    print("=" * 60)

    # Load top leagues (explicit encoding: do not depend on the locale)
    with open(TOP_LEAGUES_PATH, encoding="utf-8") as f:
        top_leagues = json.load(f)
    print(f"📋 {len(top_leagues)} top leagues")

    # Connect; nothing after extraction touches the connection, so it is
    # released in ``finally`` as soon as the rows are in memory.
    conn = get_conn()
    try:
        # Batch load
        print("\n📦 Loading batch data...")
        loader = BatchDataLoader(conn, top_leagues)
        loader.load_all()

        # Extract features
        extractor = FeatureExtractor(conn, loader)
        rows = extractor.extract_all()
    finally:
        conn.close()

    if not rows:
        print("❌ No data extracted!")
        return

    # Write CSV
    print(f"\n💾 Writing {len(rows)} rows to {OUTPUT_CSV}...")
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FEATURE_COLS)
        writer.writeheader()
        writer.writerows(rows)

    # Summary stats
    n = len(rows)
    print("\n📊 Data Summary:")
    print(f" Total rows: {n}")

    # Label distributions
    ms_dist = defaultdict(int)
    ou25_dist = defaultdict(int)
    btts_dist = defaultdict(int)
    # ``or 0`` keeps the comparison safe if the odds value is None or "".
    with_odds = sum(1 for r in rows if (r.get("odds_ms_h") or 0) > 0)

    for r in rows:
        ms_dist[r["label_ms"]] += 1
        ou25_dist[r["label_ou25"]] += 1
        btts_dist[r["label_btts"]] += 1

    print(f" With odds: {with_odds} ({with_odds/n*100:.1f}%)")
    print(f" MS dist: Home={ms_dist[0]/n*100:.1f}% Draw={ms_dist[1]/n*100:.1f}% Away={ms_dist[2]/n*100:.1f}%")
    print(f" O/U 2.5: Over={ou25_dist[1]/n*100:.1f}% Under={ou25_dist[0]/n*100:.1f}%")
    print(f" BTTS: Yes={btts_dist[1]/n*100:.1f}% No={btts_dist[0]/n*100:.1f}%")

    # HT/FT Distribution (rows without half-time data carry "" and are skipped)
    htft_dist = defaultdict(int)
    for r in rows:
        if r.get("label_ht_ft") is not None and r.get("label_ht_ft") != "":
            htft_dist[r["label_ht_ft"]] += 1

    print("\n HT/FT Distribution:")
    # Interesting ones: 1/2 (2) and 2/1 (6)
    rev_1_2 = htft_dist.get(2, 0)
    rev_2_1 = htft_dist.get(6, 0)
    print(f" 1/2 (Home->Away): {rev_1_2} ({rev_1_2/n*100:.2f}%)")
    print(f" 2/1 (Away->Home): {rev_2_1} ({rev_2_1/n*100:.2f}%)")
    print(f" 1/1: {htft_dist.get(0,0)} | X/X: {htft_dist.get(4,0)} | 2/2: {htft_dist.get(8,0)}")

    # Feature NaN check. Half-time columns are legitimately empty when the
    # half-time score is unknown, so they are not reported; label_ht_ft is
    # written as "" in exactly the same situation and belongs in this list.
    optional_cols = (
        "ht_score_home", "ht_score_away", "ht_total_goals",
        "label_ht_result", "label_ht_ou05", "label_ht_ou15",
        "label_ht_ft",
    )
    nan_cols = []
    for col in FEATURE_COLS:
        if col in optional_cols:
            continue
        nans = sum(1 for r in rows if r.get(col, "") == "" or r.get(col) is None)
        if nans > 0:
            nan_cols.append((col, nans))

    if nan_cols:
        print("\n ⚠️ Columns with missing values:")
        for col, cnt in nan_cols:
            print(f" {col}: {cnt} ({cnt/n*100:.1f}%)")
    else:
        print("\n ✅ No missing values in feature columns!")

    print(f"\n✅ Done! Output: {OUTPUT_CSV}")


# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|