This commit is contained in:
2026-05-10 22:52:05 +03:00
parent c525b12dfd
commit f3362f266c
3 changed files with 922 additions and 32 deletions
+162 -10
View File
@@ -14,6 +14,7 @@ import json
import csv
import math
import time
import bisect
from datetime import datetime
from collections import defaultdict
@@ -119,6 +120,14 @@ FEATURE_COLS = [
"home_key_players", "away_key_players",
"home_missing_impact", "away_missing_impact",
"home_goals_form", "away_goals_form",
# Player-Level Features (12)
"home_lineup_goals_per90", "away_lineup_goals_per90",
"home_lineup_assists_per90", "away_lineup_assists_per90",
"home_squad_continuity", "away_squad_continuity",
"home_top_scorer_form", "away_top_scorer_form",
"home_avg_player_exp", "away_avg_player_exp",
"home_goals_diversity", "away_goals_diversity",
# Labels
"score_home", "score_away", "total_goals",
@@ -336,7 +345,7 @@ class BatchDataLoader:
self.team_stats[tid].append((mst, poss, sot, tshots, corn, team_goals))
def _load_squad_data(self):
"""Bulk load squad participation + player events for squad features."""
"""Bulk load squad participation + player events + player career for squad features."""
ph = ",".join(["%s"] * len(self.top_league_ids))
# 1) Participation: starting XI count + position distribution per (match, team)
@@ -429,9 +438,90 @@ class BatchDataLoader:
for m in self.matches:
match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc
# 6) Build combined cache — NO DATA LEAKAGE
# goals_form: avg goals from last 5 matches BEFORE this match (not this match!)
# squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists
# ─── NEW: Player Career Stats (prefix-sum for O(1) temporal lookup) ───
# 6a) Goals per player per match date
self.cur.execute(f"""
SELECT mpe.player_id, m.mst_utc,
SUM(CASE WHEN mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
THEN 1 ELSE 0 END) AS goals
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
GROUP BY mpe.player_id, m.mst_utc
""", self.top_league_ids)
player_goals_raw = defaultdict(dict)
for pid, mst, goals in self.cur.fetchall():
player_goals_raw[pid][mst] = (player_goals_raw[pid].get(mst, 0)) + (goals or 0)
# 6b) Assists per player per match date
self.cur.execute(f"""
SELECT mpe.assist_player_id, m.mst_utc, COUNT(*) AS assists
FROM match_player_events mpe
JOIN matches m ON mpe.match_id = m.id
WHERE m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
AND mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL
GROUP BY mpe.assist_player_id, m.mst_utc
""", self.top_league_ids)
player_assists_raw = defaultdict(dict)
for pid, mst, assists in self.cur.fetchall():
player_assists_raw[pid][mst] = (player_assists_raw[pid].get(mst, 0)) + (assists or 0)
# 6c) Player participation dates (starts only)
self.cur.execute(f"""
SELECT mpp.player_id, m.mst_utc
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id = m.id
WHERE mpp.is_starting = true
AND m.status = 'FT' AND m.sport = 'football' AND m.league_id IN ({ph})
ORDER BY mpp.player_id, m.mst_utc
""", self.top_league_ids)
player_starts_raw = defaultdict(list)
for pid, mst in self.cur.fetchall():
player_starts_raw[pid].append(mst)
# 6d) Build prefix sums per player (goals_prefix[i] = total goals up to start i)
player_career = {}
all_pids = set(player_starts_raw.keys()) | set(player_goals_raw.keys()) | set(player_assists_raw.keys())
for pid in all_pids:
starts = sorted(set(player_starts_raw.get(pid, [])))
if not starts:
continue
g_map = player_goals_raw.get(pid, {})
a_map = player_assists_raw.get(pid, {})
cum_g, cum_a = 0, 0
goals_pf, assists_pf = [], []
for mst in starts:
cum_g += g_map.get(mst, 0)
cum_a += a_map.get(mst, 0)
goals_pf.append(cum_g)
assists_pf.append(cum_a)
player_career[pid] = {'msts': starts, 'gp': goals_pf, 'ap': assists_pf}
# Free raw dicts
del player_goals_raw, player_assists_raw, player_starts_raw
print(f" 📊 Player careers built: {len(player_career)} players", flush=True)
# ─── NEW: Team Lineup History (for squad continuity) ───
# 7) Per-team sorted lineups: [(mst, frozenset(player_ids))]
team_lineup_map = defaultdict(list)
for (mid, tid), pids in starting_players.items():
mst = match_mst.get(mid, 0)
if mst > 0 and pids:
team_lineup_map[tid].append((mst, frozenset(pids)))
team_lineup_history = {}
team_lineup_msts = {}
for tid, ll in team_lineup_map.items():
ll.sort(key=lambda x: x[0])
team_lineup_history[tid] = ll
team_lineup_msts[tid] = [x[0] for x in ll]
del team_lineup_map
# ─── 8) Build combined cache — NO DATA LEAKAGE ───
all_keys = set(participation.keys()) | set(events.keys())
for key in all_keys:
mid, tid = key
@@ -443,30 +533,78 @@ class BatchDataLoader:
kp_total = len(key_players_by_team.get(tid, set()))
kp_missing = max(0, kp_total - kp_in_starting)
# Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!)
# Squad quality: composite score — ONLY pre-match info
squad_quality = (
part['starting_count'] * 0.3 +
kp_in_starting * 3.0 +
part['fwd_count'] * 1.5
)
# Missing impact: how many key players are missing
missing_impact = min(kp_missing / max(kp_total, 1), 1.0)
# goals_form: avg goals from last 5 matches BEFORE this match
current_mst = match_mst.get(mid, 0)
team_history = self.team_matches.get(tid, [])
recent_goals = [
tm[2] # team_score
for tm in team_history
if tm[0] < current_mst # only matches BEFORE this one
][-5:] # last 5
tm[2] for tm in team_history if tm[0] < current_mst
][-5:]
goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3
# ─── NEW: Player-level aggregation for starting XI ───
lineup_g90, lineup_a90, total_exp = 0.0, 0.0, 0
best_scorer_total, best_scorer_id = 0, None
scorers_in_lineup = 0
for pid in starters:
pc = player_career.get(pid)
if not pc:
continue
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx == 0:
continue # no prior matches for this player
prior_starts = idx
prior_goals = pc['gp'][idx - 1]
prior_assists = pc['ap'][idx - 1]
lineup_g90 += prior_goals / prior_starts
lineup_a90 += prior_assists / prior_starts
total_exp += prior_starts
if prior_goals > 0:
scorers_in_lineup += 1
if prior_goals > best_scorer_total:
best_scorer_total = prior_goals
best_scorer_id = pid
n_st = len(starters) or 1
# Top scorer recent form (goals in last 5 starts)
top_scorer_form = 0
if best_scorer_id:
pc = player_career.get(best_scorer_id)
if pc:
idx = bisect.bisect_left(pc['msts'], current_mst)
if idx > 0:
s5 = max(0, idx - 5)
top_scorer_form = pc['gp'][idx - 1] - (pc['gp'][s5 - 1] if s5 > 0 else 0)
# Squad continuity (overlap with previous match lineup)
squad_continuity = 0.5
msts_list = team_lineup_msts.get(tid)
if msts_list:
li = bisect.bisect_left(msts_list, current_mst)
if li > 0:
prev_lineup = team_lineup_history[tid][li - 1][1]
squad_continuity = len(frozenset(starters) & prev_lineup) / n_st
self.squad_cache[key] = {
'squad_quality': squad_quality,
'key_players': kp_in_starting,
'missing_impact': missing_impact,
'goals_form': round(goals_form, 2),
'lineup_goals_per90': round(lineup_g90, 3),
'lineup_assists_per90': round(lineup_a90, 3),
'squad_continuity': round(squad_continuity, 3),
'top_scorer_form': top_scorer_form,
'avg_player_exp': round(total_exp / n_st, 1),
'goals_diversity': round(scorers_in_lineup / n_st, 3),
}
def _load_cards_data(self):
@@ -855,6 +993,20 @@ class FeatureExtractor:
"away_missing_impact": away_missing_impact,
"home_goals_form": home_goals_form,
"away_goals_form": away_goals_form,
# Player-Level Features
"home_lineup_goals_per90": home_sq.get('lineup_goals_per90', 0.0),
"away_lineup_goals_per90": away_sq.get('lineup_goals_per90', 0.0),
"home_lineup_assists_per90": home_sq.get('lineup_assists_per90', 0.0),
"away_lineup_assists_per90": away_sq.get('lineup_assists_per90', 0.0),
"home_squad_continuity": home_sq.get('squad_continuity', 0.5),
"away_squad_continuity": away_sq.get('squad_continuity', 0.5),
"home_top_scorer_form": home_sq.get('top_scorer_form', 0),
"away_top_scorer_form": away_sq.get('top_scorer_form', 0),
"home_avg_player_exp": home_sq.get('avg_player_exp', 0.0),
"away_avg_player_exp": away_sq.get('avg_player_exp', 0.0),
"home_goals_diversity": home_sq.get('goals_diversity', 0.0),
"away_goals_diversity": away_sq.get('goals_diversity', 0.0),
# Labels
"score_home": sh,