"""
V27 Training Data Extraction - Value Sniper

Extends V25 to ALL matches with odds (~104K).
Adds rolling window, league quality, time, H2H, strength features.

Usage:
    python3 scripts/extract_training_data_v27.py
"""
import os
import sys
import csv
import time
from collections import defaultdict

# The project root must be on sys.path before the project-local imports below.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)

from scripts.extract_training_data import (
    BatchDataLoader as V25Loader,
    FeatureExtractor as V25Extractor,
    FEATURE_COLS as V25_COLS,
    get_conn,
)
from features.rolling_features import (
    calc_rolling_features,
    calc_league_quality,
    calc_time_features,
    calc_advanced_h2h,
    calc_strength_diff,
)

OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

# Feature columns added by V27 on top of the V25 feature set.
# NOTE(review): "home_rolling20_*" is listed but "away_rolling20_*" is not,
# although the extractor fills the rolling20 keys for both prefixes.
# Confirm whether the omission is intentional before changing the schema.
V27_NEW = [
    "home_rolling5_goals", "home_rolling5_conceded",
    "home_rolling10_goals", "home_rolling10_conceded",
    "home_rolling20_goals", "home_rolling20_conceded",
    "away_rolling5_goals", "away_rolling5_conceded",
    "away_rolling10_goals", "away_rolling10_conceded",
    "home_rolling5_cs", "away_rolling5_cs",
    "home_venue_goals", "home_venue_conceded",
    "away_venue_goals", "away_venue_conceded",
    "home_goal_trend", "away_goal_trend",
    "league_home_win_rate", "league_draw_rate",
    "league_btts_rate", "league_ou25_rate",
    "league_reliability_score",
    "home_days_rest", "away_days_rest",
    "match_month", "is_season_start", "is_season_end",
    "h2h_home_goals_avg", "h2h_away_goals_avg",
    "h2h_recent_trend", "h2h_venue_advantage",
    "attack_vs_defense_home", "attack_vs_defense_away",
    "xg_diff", "form_momentum_interaction",
    "elo_form_consistency", "upset_x_elo_gap",
]
ALL_COLS = V25_COLS + V27_NEW


class V27Loader(V25Loader):
    """Load ALL matches with odds, not just top leagues.

    Every query is restricted to finished (``status='FT'``) football matches.
    The ``_load_*`` methods presumably override same-named hooks of the V25
    loader (the base class is not visible from this file -- verify there).
    They fill ``self.matches``, ``self.odds_cache``, ``self.league_stats_cache``,
    ``self.squad_cache`` and ``self.cards_cache`` through ``self.cur``.
    """

    # 1X2-style markets: normalised category name -> odds-cache key prefix.
    _RESULT_MARKETS = {
        'maç sonucu': 'ms',
        '1. yarı sonucu': 'ht_ms',
    }
    # Normalised (lower-cased, stripped) 1X2 selection -> key suffix.
    # The draw appears as either '0' or 'X' in the data.
    _RESULT_SUFFIX = {'1': '_h', '0': '_d', 'x': '_d', '2': '_a'}
    # Two-way markets: normalised category -> ordered (substring, key) pairs;
    # the first substring found in the normalised selection wins.
    _TWO_WAY_MARKETS = {
        'karşılıklı gol': (('var', 'btts_y'), ('yok', 'btts_n')),
        '2,5 alt/üst': (('alt', 'ou25_u'), ('üst', 'ou25_o')),
        '1,5 alt/üst': (('alt', 'ou15_u'), ('üst', 'ou15_o')),
        '3,5 alt/üst': (('alt', 'ou35_u'), ('üst', 'ou35_o')),
        '0,5 alt/üst': (('alt', 'ou05_u'), ('üst', 'ou05_o')),
        '1. yarı 0,5 alt/üst': (('alt', 'ht_ou05_u'), ('üst', 'ht_ou05_o')),
        '1. yarı 1,5 alt/üst': (('alt', 'ht_ou15_u'), ('üst', 'ht_ou15_o')),
    }

    def __init__(self, conn):
        """Initialise the loader on ``conn``.

        The base class is given an empty list as its second argument
        (presumably the league filter -- confirm against the V25 loader).
        """
        super().__init__(conn, [])
        # league_id -> list of match tuples, filled by load_league_matches().
        self.league_matches_cache = {}

    def _load_matches(self):
        """Fetch every finished football match that has at least one odds category.

        Rows are ordered by ``mst_utc`` ascending and stored in ``self.matches``.
        Column order: id, home_team_id, away_team_id, score_home, score_away,
        ht_score_home, ht_score_away, mst_utc, league_id, home team name,
        away team name, league name.
        """
        self.cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
                   m.score_home, m.score_away,
                   m.ht_score_home, m.ht_score_away,
                   m.mst_utc, m.league_id,
                   ht.name, at.name, l.name
            FROM matches m
            JOIN teams ht ON m.home_team_id = ht.id
            JOIN teams at ON m.away_team_id = at.id
            JOIN leagues l ON m.league_id = l.id
            WHERE m.status='FT' AND m.score_home IS NOT NULL
              AND m.sport='football'
              AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
            ORDER BY m.mst_utc ASC
        """)
        self.matches = self.cur.fetchall()

    @classmethod
    def _odds_key(cls, category, selection):
        """Return the odds-cache key for a normalised (category, selection) pair.

        Both arguments must already be lower-cased and stripped. Returns
        ``None`` when the market or the selection is not one that is tracked.
        """
        prefix = cls._RESULT_MARKETS.get(category)
        if prefix is not None:
            suffix = cls._RESULT_SUFFIX.get(selection)
            return prefix + suffix if suffix is not None else None
        for needle, key in cls._TWO_WAY_MARKETS.get(category, ()):
            if needle in selection:
                return key
        return None

    def _load_odds(self):
        """Populate ``self.odds_cache`` as ``{match_id: {odds_key: price}}``.

        Rows with an empty category/selection or a missing, unparsable or
        non-positive price are skipped. A match receives an entry (possibly
        an empty dict) as soon as it has one valid row, even when that row's
        market is not tracked.
        """
        self.cur.execute("""
            SELECT oc.match_id, oc.name, os.name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
            JOIN matches m ON oc.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
        """)
        for mid, cat, sel, val in self.cur.fetchall():
            if not cat or not sel:
                continue
            # Only the numeric conversion is best-effort; bad prices are skipped.
            try:
                price = float(val) if val else 0.0
            except (ValueError, TypeError):
                continue
            if price <= 0:
                continue
            match_odds = self.odds_cache.setdefault(mid, {})
            # The selection is normalised like the category, so " 1" or "x"
            # are recognised in the 1X2 markets as well.
            key = self._odds_key(cat.lower().strip(), sel.lower().strip())
            if key is not None:
                match_odds[key] = price

    def _load_league_stats(self):
        """Populate ``self.league_stats_cache`` with per-league aggregates.

        Each entry holds ``avg_goals`` (mean total goals per match),
        ``zero_rate`` (share of 0-0 results) and ``match_count``. The
        defaults 2.5 / 0.07 are used only when the database returns NULL;
        a genuine aggregate of 0 (for example a league with no 0-0 result)
        is kept as 0.0.
        """
        self.cur.execute("""
            SELECT league_id,
                   AVG(score_home+score_away),
                   AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
                   COUNT(*)
            FROM matches
            WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
            GROUP BY league_id
        """)
        for lid, avg_goals, zero_rate, cnt in self.cur.fetchall():
            self.league_stats_cache[lid] = {
                "avg_goals": float(avg_goals) if avg_goals is not None else 2.5,
                "zero_rate": float(zero_rate) if zero_rate is not None else 0.07,
                "match_count": cnt,
            }

    def _load_squad_data(self):
        """Populate ``self.squad_cache`` keyed by ``(match_id, team_id)``.

        Combines four queries: line-up counts, goal/assist event counts,
        "key players" (players with at least 3 goal events for a team over
        all loaded matches) and the starting line-ups. Each cache entry
        holds ``squad_quality``, ``key_players``, ``missing_impact`` and
        ``goals_form``.
        """
        # Per (match, team): starters, squad size, starting forwards.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id,
                   COUNT(*) FILTER(WHERE mpp.is_starting=true),
                   COUNT(*),
                   COUNT(*) FILTER(WHERE mpp.is_starting=true AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpp.match_id, mpp.team_id
        """)
        part = {}
        for mid, tid, st, tot, fwd in self.cur.fetchall():
            part[(mid, tid)] = {
                'starting_count': st or 0,
                'total_squad': tot or 0,
                'fwd_count': fwd or 0,
            }

        # Per (match, team): goals, distinct assisters, distinct scorers
        # (events whose subtype matches "penaltı kaçırma" are excluded).
        self.cur.execute("""
            SELECT mpe.match_id, mpe.team_id,
                   COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
                   COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
                   COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpe.match_id, mpe.team_id
        """)
        evts = {}
        for mid, tid, goals, assists, scorers in self.cur.fetchall():
            evts[(mid, tid)] = {
                'goals': goals or 0,
                'assists': assists or 0,
                'unique_scorers': scorers or 0,
            }

        # Key players: at least 3 goal events for the team.
        self.cur.execute("""
            SELECT mpe.team_id, mpe.player_id, COUNT(*)
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
              AND mpe.event_type='goal'
              AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
            GROUP BY mpe.team_id, mpe.player_id
            HAVING COUNT(*)>=3
        """)
        kp_by_team = defaultdict(set)
        for tid, pid, _ in self.cur.fetchall():
            kp_by_team[tid].add(pid)

        # Starting line-up per (match, team).
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id, mpp.player_id
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id=m.id
            WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
        """)
        starters = defaultdict(list)
        for mid, tid, pid in self.cur.fetchall():
            starters[(mid, tid)].append(pid)

        no_part = {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}
        no_evts = {'goals': 0, 'assists': 0, 'unique_scorers': 0}
        no_players = frozenset()
        for key in set(part) | set(evts):
            _, tid = key
            p = part.get(key, no_part)
            e = evts.get(key, no_evts)
            # .get() keeps the defaultdicts from growing entries for misses.
            team_kp = kp_by_team.get(tid, no_players)
            kp_in = sum(1 for pid in starters.get(key, ()) if pid in team_kp)
            kp_tot = len(team_kp)
            kp_miss = max(0, kp_tot - kp_in)
            sq = (p['starting_count'] * 0.3 + e['goals'] * 2.0
                  + e['assists'] * 1.0 + kp_in * 3.0 + p['fwd_count'] * 1.5)
            # Share of the team's key players not in the starting XI, capped at 1.
            mi = min(kp_miss / max(kp_tot, 1), 1.0)
            self.squad_cache[key] = {
                'squad_quality': sq,
                'key_players': kp_in,
                'missing_impact': mi,
                'goals_form': e['goals'],
            }

    def _load_cards_data(self):
        """Populate ``self.cards_cache`` with a weighted card total per match.

        Weights come from the SQL CASE: yellow cards count 1, red cards 2,
        any other card-type event 1.
        """
        self.cur.execute("""
            SELECT mpe.match_id,
                   SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
                            WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2
                            ELSE 1 END)
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
              AND mpe.event_type::text LIKE '%%card%%'
            GROUP BY mpe.match_id
        """)
        for mid, cw in self.cur.fetchall():
            self.cards_cache[mid] = float(cw) if cw else 0.0

    def load_league_matches(self):
        """Group ``self.matches`` by league into ``self.league_matches_cache``.

        Each entry is ``(mst_utc, None, score_home, score_away, None)``; the
        ``None`` slots presumably pad the tuple to the layout expected by the
        league-quality helper (verify against its implementation). The cache
        is rebuilt from scratch, so calling this more than once does not
        duplicate matches.
        """
        cache = self.league_matches_cache
        cache.clear()
        for m in self.matches:
            cache.setdefault(m[8], []).append((m[7], None, m[3], m[4], None))
super()._extract_one(mid,hid,aid,sh,sa,hth,hta,mst,lid,hn,an,ln) if not row: return None hm = self.loader.team_matches.get(hid,[]) am = self.loader.team_matches.get(aid,[]) hr = calc_rolling_features(hm, mst, True) ar = calc_rolling_features(am, mst, False) for pfx,r in [("home",hr),("away",ar)]: row[f"{pfx}_rolling5_goals"]=r["rolling5_goals_avg"] row[f"{pfx}_rolling5_conceded"]=r["rolling5_conceded_avg"] row[f"{pfx}_rolling10_goals"]=r["rolling10_goals_avg"] row[f"{pfx}_rolling10_conceded"]=r["rolling10_conceded_avg"] row[f"{pfx}_rolling20_goals"]=r["rolling20_goals_avg"] row[f"{pfx}_rolling20_conceded"]=r["rolling20_conceded_avg"] row[f"{pfx}_rolling5_cs"]=r["rolling5_clean_sheets"] row[f"{pfx}_venue_goals"]=r["venue_goals_avg"] row[f"{pfx}_venue_conceded"]=r["venue_conceded_avg"] row[f"{pfx}_goal_trend"]=r["goal_trend"] lb = [x for x in self.loader.league_matches_cache.get(lid,[]) if x[0]0) md = defaultdict(int) for r in rows: md[r["label_ms"]]+=1 print(f"\n📊 Summary:") print(f" Rows: {n}") print(f" With odds: {wo} ({wo/n*100:.1f}%)") print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)") print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%") print(f" Time: {(time.time()-t0)/60:.1f}min") print(f"\n✅ Done! → {OUTPUT}") conn.close() if __name__=="__main__": main()