fix(ai-engine): remove target leakage from training data extraction
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s

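- goals_form now uses the team's average goals over its last 5 matches played before the current match (falling back to 1.3 when no earlier matches exist) instead of the current match's goals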
- squad_quality no longer includes the current match's goals/assists; it uses only pre-match data (starting count, key players in the starting XI, forward count)
- adds temporal filtering via a match_id -> mst_utc mapping, so only matches before the current one count toward goals_form
This commit is contained in:
2026-05-05 22:35:04 +03:00
parent bfddcaca7d
commit 5b5f83c8cf
+20 -6
View File
@@ -424,12 +424,18 @@ class BatchDataLoader:
for mid, tid, pid in self.cur.fetchall(): for mid, tid, pid in self.cur.fetchall():
starting_players[(mid, tid)].append(pid) starting_players[(mid, tid)].append(pid)
# 5) Build combined cache # 5) Build match_id → mst_utc mapping for temporal filtering
match_mst = {}
for m in self.matches:
match_mst[m[0]] = m[7] # m[0]=id, m[7]=mst_utc
# 6) Build combined cache — NO DATA LEAKAGE
# goals_form: avg goals from last 5 matches BEFORE this match (not this match!)
# squad_quality: only uses pre-match info (lineup, key players) — no current-match goals/assists
all_keys = set(participation.keys()) | set(events.keys()) all_keys = set(participation.keys()) | set(events.keys())
for key in all_keys: for key in all_keys:
mid, tid = key mid, tid = key
part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}) part = participation.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0})
evt = events.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0})
# Count key players in starting XI # Count key players in starting XI
starters = starting_players.get(key, []) starters = starting_players.get(key, [])
@@ -437,22 +443,30 @@ class BatchDataLoader:
kp_total = len(key_players_by_team.get(tid, set())) kp_total = len(key_players_by_team.get(tid, set()))
kp_missing = max(0, kp_total - kp_in_starting) kp_missing = max(0, kp_total - kp_in_starting)
# Squad quality: composite score # Squad quality: composite score — ONLY pre-match info (no current-match goals/assists!)
squad_quality = ( squad_quality = (
part['starting_count'] * 0.3 + part['starting_count'] * 0.3 +
evt['goals'] * 2.0 +
evt['assists'] * 1.0 +
kp_in_starting * 3.0 + kp_in_starting * 3.0 +
part['fwd_count'] * 1.5 part['fwd_count'] * 1.5
) )
# Missing impact: how many key players are missing # Missing impact: how many key players are missing
missing_impact = min(kp_missing / max(kp_total, 1), 1.0) missing_impact = min(kp_missing / max(kp_total, 1), 1.0)
# goals_form: avg goals from last 5 matches BEFORE this match
current_mst = match_mst.get(mid, 0)
team_history = self.team_matches.get(tid, [])
recent_goals = [
tm[2] # team_score
for tm in team_history
if tm[0] < current_mst # only matches BEFORE this one
][-5:] # last 5
goals_form = sum(recent_goals) / len(recent_goals) if recent_goals else 1.3
self.squad_cache[key] = { self.squad_cache[key] = {
'squad_quality': squad_quality, 'squad_quality': squad_quality,
'key_players': kp_in_starting, 'key_players': kp_in_starting,
'missing_impact': missing_impact, 'missing_impact': missing_impact,
'goals_form': evt['goals'], 'goals_form': round(goals_form, 2),
} }
def _load_cards_data(self): def _load_cards_data(self):