diff --git a/ai-engine/scripts/extract_training_data.py b/ai-engine/scripts/extract_training_data.py index 4c92ca2..99819f5 100755 --- a/ai-engine/scripts/extract_training_data.py +++ b/ai-engine/scripts/extract_training_data.py @@ -424,12 +424,18 @@ class BatchDataLoader: for mid, tid, pid in self.cur.fetchall(): starting_players[(mid, tid)].append(pid) - # 5) Build combined cache + # 5) Build match_id → mst_utc mapping for temporal filtering + match_mst = {} + for m in self.matches: + match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc + + # 6) Build combined cache — NO DATA LEAKAGE + # goals_form: avg goals from last 5 matches BEFORE this match (not this match!) + # squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists all_keys = set(participation.keys()) | set(events.keys()) for key in all_keys: mid, tid = key part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}) - evt = events.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0}) # Count key players in starting XI starters = starting_players.get(key, []) @@ -437,22 +443,30 @@ class BatchDataLoader: kp_total = len(key_players_by_team.get(tid, set())) kp_missing = max(0, kp_total - kp_in_starting) - # Squad quality: composite score + # Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!) squad_quality = ( part['starting_count'] * 0.3 + - evt['goals'] * 2.0 + - evt['assists'] * 1.0 + kp_in_starting * 3.0 + part['fwd_count'] * 1.5 ) # Missing impact: how many key players are missing missing_impact = min(kp_missing / max(kp_total, 1), 1.0) + # goals_form: avg goals from last 5 matches BEFORE this match + current_mst = match_mst.get(mid, 0) + team_history = self.team_matches.get(tid, []) + recent_goals = [ + tm[2] # team_score + for tm in team_history + if tm[0] < current_mst # only matches BEFORE this one + ][-5:] # last 5 + goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3 + self.squad_cache[key] = { 'squad_quality': squad_quality, 'key_players': kp_in_starting, 'missing_impact': missing_impact, - 'goals_form': evt['goals'], + 'goals_form': round(goals_form, 2), } def _load_cards_data(self):