Files
iddaai-be/ai-engine/scripts/extract_training_data_v27.py
2026-04-22 02:17:02 +03:00

306 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
V27 Training Data Extraction - Value Sniper
Extends V25 to ALL matches with odds (~104K).
Adds rolling window, league quality, time, H2H, strength features.
Usage: python3 scripts/extract_training_data_v27.py
"""
import os, sys, csv, time
from collections import defaultdict
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)
from scripts.extract_training_data import (
BatchDataLoader as V25Loader,
FeatureExtractor as V25Extractor,
FEATURE_COLS as V25_COLS,
get_conn,
)
from features.rolling_features import (
calc_rolling_features, calc_league_quality,
calc_time_features, calc_advanced_h2h, calc_strength_diff,
)
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
V27_NEW = [
"home_rolling5_goals","home_rolling5_conceded",
"home_rolling10_goals","home_rolling10_conceded",
"home_rolling20_goals","home_rolling20_conceded",
"away_rolling5_goals","away_rolling5_conceded",
"away_rolling10_goals","away_rolling10_conceded",
"home_rolling5_cs","away_rolling5_cs",
"home_venue_goals","home_venue_conceded",
"away_venue_goals","away_venue_conceded",
"home_goal_trend","away_goal_trend",
"league_home_win_rate","league_draw_rate",
"league_btts_rate","league_ou25_rate",
"league_reliability_score",
"home_days_rest","away_days_rest",
"match_month","is_season_start","is_season_end",
"h2h_home_goals_avg","h2h_away_goals_avg",
"h2h_recent_trend","h2h_venue_advantage",
"attack_vs_defense_home","attack_vs_defense_away",
"xg_diff","form_momentum_interaction",
"elo_form_consistency","upset_x_elo_gap",
]
ALL_COLS = V25_COLS + V27_NEW
class V27Loader(V25Loader):
"""Load ALL matches with odds, not just top leagues."""
def __init__(self, conn):
super().__init__(conn, [])
self.league_matches_cache = {}
def _load_matches(self):
self.cur.execute("""
SELECT m.id, m.home_team_id, m.away_team_id,
m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away,
m.mst_utc, m.league_id,
ht.name, at.name, l.name
FROM matches m
JOIN teams ht ON m.home_team_id = ht.id
JOIN teams at ON m.away_team_id = at.id
JOIN leagues l ON m.league_id = l.id
WHERE m.status='FT' AND m.score_home IS NOT NULL
AND m.sport='football'
AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
ORDER BY m.mst_utc ASC
""")
self.matches = self.cur.fetchall()
def _load_odds(self):
self.cur.execute("""
SELECT oc.match_id, oc.name, os.name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
JOIN matches m ON oc.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
""")
for mid, cat, sel, val in self.cur.fetchall():
try:
v = float(val) if val else 0
if v <= 0 or not cat or not sel: continue
if mid not in self.odds_cache: self.odds_cache[mid] = {}
c = cat.lower().strip()
s = sel.lower().strip()
o = self.odds_cache[mid]
if c == 'maç sonucu':
if sel=='1': o['ms_h']=v
elif sel in('0','X'): o['ms_d']=v
elif sel=='2': o['ms_a']=v
elif c == '1. yarı sonucu':
if sel=='1': o['ht_ms_h']=v
elif sel in('0','X'): o['ht_ms_d']=v
elif sel=='2': o['ht_ms_a']=v
elif c == 'karşılıklı gol':
if 'var' in s: o['btts_y']=v
elif 'yok' in s: o['btts_n']=v
elif c == '2,5 alt/üst':
if 'alt' in s: o['ou25_u']=v
elif 'üst' in s: o['ou25_o']=v
elif c == '1,5 alt/üst':
if 'alt' in s: o['ou15_u']=v
elif 'üst' in s: o['ou15_o']=v
elif c == '3,5 alt/üst':
if 'alt' in s: o['ou35_u']=v
elif 'üst' in s: o['ou35_o']=v
elif c == '0,5 alt/üst':
if 'alt' in s: o['ou05_u']=v
elif 'üst' in s: o['ou05_o']=v
elif c == '1. yarı 0,5 alt/üst':
if 'alt' in s: o['ht_ou05_u']=v
elif 'üst' in s: o['ht_ou05_o']=v
elif c == '1. yarı 1,5 alt/üst':
if 'alt' in s: o['ht_ou15_u']=v
elif 'üst' in s: o['ht_ou15_o']=v
except (ValueError, TypeError): pass
def _load_league_stats(self):
self.cur.execute("""
SELECT league_id,
AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
COUNT(*)
FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
GROUP BY league_id
""")
for lid, ag, zr, cnt in self.cur.fetchall():
self.league_stats_cache[lid] = {
"avg_goals": float(ag) if ag else 2.5,
"zero_rate": float(zr) if zr else 0.07,
"match_count": cnt
}
def _load_squad_data(self):
self.cur.execute("""
SELECT mpp.match_id, mpp.team_id,
COUNT(*) FILTER(WHERE mpp.is_starting=true),
COUNT(*),
COUNT(*) FILTER(WHERE mpp.is_starting=true
AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
GROUP BY mpp.match_id, mpp.team_id
""")
part = {}
for mid,tid,st,tot,fwd in self.cur.fetchall():
part[(mid,tid)]={'starting_count':st or 0,'total_squad':tot or 0,'fwd_count':fwd or 0}
self.cur.execute("""
SELECT mpe.match_id, mpe.team_id,
COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
FROM match_player_events mpe
JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
GROUP BY mpe.match_id, mpe.team_id
""")
evts = {}
for mid,tid,g,a,sc in self.cur.fetchall():
evts[(mid,tid)]={'goals':g or 0,'assists':a or 0,'unique_scorers':sc or 0}
self.cur.execute("""
SELECT mpe.team_id, mpe.player_id, COUNT(*)
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
""")
kp_by_team = defaultdict(set)
for tid,pid,_ in self.cur.fetchall(): kp_by_team[tid].add(pid)
self.cur.execute("""
SELECT mpp.match_id, mpp.team_id, mpp.player_id
FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
""")
starters = defaultdict(list)
for mid,tid,pid in self.cur.fetchall(): starters[(mid,tid)].append(pid)
for key in set(part)|set(evts):
mid,tid = key
p = part.get(key,{'starting_count':0,'total_squad':0,'fwd_count':0})
e = evts.get(key,{'goals':0,'assists':0,'unique_scorers':0})
s = starters.get(key,[])
kp_in = sum(1 for x in s if x in kp_by_team.get(tid,set()))
kp_tot = len(kp_by_team.get(tid,set()))
kp_miss = max(0, kp_tot - kp_in)
sq = p['starting_count']*0.3 + e['goals']*2.0 + e['assists']*1.0 + kp_in*3.0 + p['fwd_count']*1.5
mi = min(kp_miss/max(kp_tot,1), 1.0)
self.squad_cache[key] = {'squad_quality':sq,'key_players':kp_in,'missing_impact':mi,'goals_form':e['goals']}
def _load_cards_data(self):
self.cur.execute("""
SELECT mpe.match_id,
SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
GROUP BY mpe.match_id
""")
for mid, cw in self.cur.fetchall():
self.cards_cache[mid] = float(cw) if cw else 0.0
def load_league_matches(self):
for m in self.matches:
lid = m[8]
if lid not in self.league_matches_cache:
self.league_matches_cache[lid] = []
self.league_matches_cache[lid].append((m[7],None,m[3],m[4],None))
class V27Extractor(V25Extractor):
"""Adds V27 features on top of V25."""
def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
hn, an, ln):
row = super()._extract_one(mid,hid,aid,sh,sa,hth,hta,mst,lid,hn,an,ln)
if not row: return None
hm = self.loader.team_matches.get(hid,[])
am = self.loader.team_matches.get(aid,[])
hr = calc_rolling_features(hm, mst, True)
ar = calc_rolling_features(am, mst, False)
for pfx,r in [("home",hr),("away",ar)]:
row[f"{pfx}_rolling5_goals"]=r["rolling5_goals_avg"]
row[f"{pfx}_rolling5_conceded"]=r["rolling5_conceded_avg"]
row[f"{pfx}_rolling10_goals"]=r["rolling10_goals_avg"]
row[f"{pfx}_rolling10_conceded"]=r["rolling10_conceded_avg"]
row[f"{pfx}_rolling20_goals"]=r["rolling20_goals_avg"]
row[f"{pfx}_rolling20_conceded"]=r["rolling20_conceded_avg"]
row[f"{pfx}_rolling5_cs"]=r["rolling5_clean_sheets"]
row[f"{pfx}_venue_goals"]=r["venue_goals_avg"]
row[f"{pfx}_venue_conceded"]=r["venue_conceded_avg"]
row[f"{pfx}_goal_trend"]=r["goal_trend"]
lb = [x for x in self.loader.league_matches_cache.get(lid,[]) if x[0]<mst]
lq = calc_league_quality(lb)
for k,v in lq.items(): row[k]=v
ht = calc_time_features(hm, mst)
at = calc_time_features(am, mst)
row["home_days_rest"]=ht["days_rest"]
row["away_days_rest"]=at["days_rest"]
row["match_month"]=ht["match_month"]
row["is_season_start"]=ht["is_season_start"]
row["is_season_end"]=ht["is_season_end"]
h2h = calc_advanced_h2h(hm, hid, aid, mst)
for k,v in h2h.items(): row[k]=v
sd = calc_strength_diff(
{"goals_avg":row.get("home_goals_avg",1.3),"conceded_avg":row.get("home_conceded_avg",1.2),"scoring_rate":row.get("home_scoring_rate",0.75)},
{"goals_avg":row.get("away_goals_avg",1.3),"conceded_avg":row.get("away_conceded_avg",1.2),"scoring_rate":row.get("away_scoring_rate",0.75)},
self.elo_ratings[hid], self.elo_ratings[aid],
row.get("home_momentum_score",0.5), row.get("away_momentum_score",0.5),
row.get("upset_potential",0.0),
)
row.update(sd)
return row
def main():
print("🚀 V27 Value Sniper — Training Data Extraction")
print("="*60)
t0 = time.time()
conn = get_conn()
print("\n📦 Loading ALL odds-bearing matches...")
loader = V27Loader(conn)
loader.load_all()
loader.load_league_matches()
print(f" Matches: {len(loader.matches)}")
print(f" Leagues: {len(loader.league_stats_cache)}")
print(f" Odds: {len(loader.odds_cache)}")
ext = V27Extractor(conn, loader)
rows = ext.extract_all()
if not rows:
print("❌ No data!"); return
print(f"\n💾 Writing {len(rows)} rows...")
with open(OUTPUT,"w",newline="",encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
w.writeheader(); w.writerows(rows)
n = len(rows)
wo = sum(1 for r in rows if r.get("odds_ms_h",0)>0)
md = defaultdict(int)
for r in rows: md[r["label_ms"]]+=1
print(f"\n📊 Summary:")
print(f" Rows: {n}")
print(f" With odds: {wo} ({wo/n*100:.1f}%)")
print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
print(f" Time: {(time.time()-t0)/60:.1f}min")
print(f"\n✅ Done! → {OUTPUT}")
conn.close()
if __name__=="__main__":
main()