"""
V27 Training Data Extraction - Value Sniper
Extends V25 to ALL matches with odds (~104K).
Adds rolling window, league quality, time, H2H, strength features.
Usage: python3 scripts/extract_training_data_v27.py
"""
|
||
import csv
import os
import sys
import time
from collections import defaultdict

# Directory two levels above this file; it is put first on sys.path before
# the project imports below are executed.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)

from scripts.extract_training_data import (  # noqa: E402
    BatchDataLoader as V25Loader,
    FeatureExtractor as V25Extractor,
    FEATURE_COLS as V25_COLS,
    get_conn,
)
from features.rolling_features import (  # noqa: E402
    calc_rolling_features,
    calc_league_quality,
    calc_time_features,
    calc_advanced_h2h,
    calc_strength_diff,
)

# Destination CSV; its parent directory is created at import time.
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
|
||
|
||
# Columns added by V27, in the order they are appended after the V25 columns
# in the output CSV header.
V27_NEW = [
    # Rolling-window scoring form.
    # NOTE(review): there are no "away_rolling20_*" columns here although
    # V27Extractor computes them for both sides; they are dropped when the
    # CSV is written with extrasaction='ignore'. Confirm this is intentional.
    "home_rolling5_goals",
    "home_rolling5_conceded",
    "home_rolling10_goals",
    "home_rolling10_conceded",
    "home_rolling20_goals",
    "home_rolling20_conceded",
    "away_rolling5_goals",
    "away_rolling5_conceded",
    "away_rolling10_goals",
    "away_rolling10_conceded",
    "home_rolling5_cs",
    "away_rolling5_cs",
    "home_venue_goals",
    "home_venue_conceded",
    "away_venue_goals",
    "away_venue_conceded",
    "home_goal_trend",
    "away_goal_trend",
    # League quality.
    "league_home_win_rate",
    "league_draw_rate",
    "league_btts_rate",
    "league_ou25_rate",
    "league_reliability_score",
    # Time / calendar.
    "home_days_rest",
    "away_days_rest",
    "match_month",
    "is_season_start",
    "is_season_end",
    # Head-to-head.
    "h2h_home_goals_avg",
    "h2h_away_goals_avg",
    "h2h_recent_trend",
    "h2h_venue_advantage",
    # Strength differentials and interactions.
    "attack_vs_defense_home",
    "attack_vs_defense_away",
    "xg_diff",
    "form_momentum_interaction",
    "elo_form_consistency",
    "upset_x_elo_gap",
]

# Full CSV header: V25 columns first, then the V27 additions.
ALL_COLS = V25_COLS + V27_NEW
|
||
|
||
|
||
class V27Loader(V25Loader):
    """Load ALL matches with odds, not just top leagues.

    Overrides the V25 loading hooks with queries restricted only to finished
    (``status='FT'``) football matches, and adds a per-league match cache
    used for the league-quality features.

    NOTE(review): the ``_load_*`` hooks are presumably invoked by the
    inherited ``load_all()``; confirm against ``BatchDataLoader``.
    """

    # Normalised (lower-cased, stripped) category name -> odds key prefix for
    # three-way result markets.
    _RESULT_MARKETS = {
        'maç sonucu': 'ms',
        '1. yarı sonucu': 'ht_ms',
    }
    # Normalised selection label -> key suffix for three-way result markets.
    _RESULT_SELECTIONS = {'1': 'h', '0': 'd', 'x': 'd', '2': 'a'}
    # Normalised category name of the both-teams-to-score market.
    _BTTS_MARKET = 'karşılıklı gol'
    # Normalised category name -> odds key prefix for over/under markets.
    _OVER_UNDER_MARKETS = {
        '2,5 alt/üst': 'ou25',
        '1,5 alt/üst': 'ou15',
        '3,5 alt/üst': 'ou35',
        '0,5 alt/üst': 'ou05',
        '1. yarı 0,5 alt/üst': 'ht_ou05',
        '1. yarı 1,5 alt/üst': 'ht_ou15',
    }

    def __init__(self, conn):
        """Initialise the V25 loader with an empty list as second argument.

        NOTE(review): the empty list is presumably the league filter of the
        V25 loader; confirm against ``BatchDataLoader.__init__``.
        """
        super().__init__(conn, [])
        # league_id -> list of match tuples, filled by load_league_matches().
        self.league_matches_cache = {}

    def _load_matches(self):
        """Fetch every finished football match that has at least one odds category.

        Rows are ordered by ``mst_utc`` ascending and stored in
        ``self.matches`` with the column order: id, home_team_id,
        away_team_id, score_home, score_away, ht_score_home, ht_score_away,
        mst_utc, league_id, home team name, away team name, league name.
        """
        self.cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
            m.score_home, m.score_away,
            m.ht_score_home, m.ht_score_away,
            m.mst_utc, m.league_id,
            ht.name, at.name, l.name
            FROM matches m
            JOIN teams ht ON m.home_team_id = ht.id
            JOIN teams at ON m.away_team_id = at.id
            JOIN leagues l ON m.league_id = l.id
            WHERE m.status='FT' AND m.score_home IS NOT NULL
            AND m.sport='football'
            AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
            ORDER BY m.mst_utc ASC
        """)
        self.matches = self.cur.fetchall()

    @classmethod
    def _odds_key(cls, category, selection):
        """Return the odds_cache key for a normalised category/selection pair.

        Both arguments must already be lower-cased and stripped. Returns
        ``None`` when the pair is not one of the markets this loader keeps.
        """
        prefix = cls._RESULT_MARKETS.get(category)
        if prefix is not None:
            suffix = cls._RESULT_SELECTIONS.get(selection)
            return f"{prefix}_{suffix}" if suffix is not None else None
        if category == cls._BTTS_MARKET:
            if 'var' in selection:
                return 'btts_y'
            if 'yok' in selection:
                return 'btts_n'
            return None
        prefix = cls._OVER_UNDER_MARKETS.get(category)
        if prefix is not None:
            if 'alt' in selection:
                return f"{prefix}_u"
            if 'üst' in selection:
                return f"{prefix}_o"
        return None

    def _load_odds(self):
        """Fill ``self.odds_cache`` with the odds of finished football matches.

        ``odds_cache[match_id]`` is a dict keyed by short market codes
        (``ms_h``, ``ou25_o``, ``btts_y``, ...). Rows with a missing,
        non-numeric or non-positive price, or without a category/selection
        name, are skipped.
        """
        self.cur.execute("""
            SELECT oc.match_id, oc.name, os.name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
            JOIN matches m ON oc.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
        """)
        for mid, cat, sel, val in self.cur.fetchall():
            # Only the numeric conversion is expected to fail on bad data.
            try:
                price = float(val) if val else 0
            except (ValueError, TypeError):
                continue
            if price <= 0 or not cat or not sel:
                continue
            # The match gets an entry even if the market is not recognised.
            match_odds = self.odds_cache.setdefault(mid, {})
            # Match on the normalised selection for every market, so that
            # labels such as " 1" or "x" are no longer silently dropped.
            key = self._odds_key(cat.lower().strip(), sel.lower().strip())
            if key is not None:
                match_odds[key] = price

    def _load_league_stats(self):
        """Fill ``self.league_stats_cache`` with per-league aggregates.

        For each league: average total goals, share of 0-0 results and the
        number of finished football matches.
        """
        self.cur.execute("""
            SELECT league_id,
            AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
            COUNT(*)
            FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
            GROUP BY league_id
        """)
        for lid, avg_goals, zero_rate, match_count in self.cur.fetchall():
            # The truthiness checks also replace a genuine 0 with the
            # fallback (2.5 goals / 0.07 zero-rate), not only NULL.
            self.league_stats_cache[lid] = {
                "avg_goals": float(avg_goals) if avg_goals else 2.5,
                "zero_rate": float(zero_rate) if zero_rate else 0.07,
                "match_count": match_count,
            }

    def _load_squad_data(self):
        """Fill ``self.squad_cache`` keyed by ``(match_id, team_id)``.

        Combines line-up counts, goal/assist events and "key player"
        presence into ``squad_quality``, ``key_players``, ``missing_impact``
        and ``goals_form``.
        """
        # Line-up counts: starters, total listed players, starting forwards.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id,
            COUNT(*) FILTER(WHERE mpp.is_starting=true),
            COUNT(*),
            COUNT(*) FILTER(WHERE mpp.is_starting=true
            AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpp.match_id, mpp.team_id
        """)
        part = {}
        for mid, tid, st, tot, fwd in self.cur.fetchall():
            part[(mid, tid)] = {
                'starting_count': st or 0,
                'total_squad': tot or 0,
                'fwd_count': fwd or 0,
            }

        # Goal events per match/team: goals, distinct assisters, distinct scorers.
        self.cur.execute("""
            SELECT mpe.match_id, mpe.team_id,
            COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
            COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
            COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpe.match_id, mpe.team_id
        """)
        evts = {}
        for mid, tid, goals, assists, scorers in self.cur.fetchall():
            evts[(mid, tid)] = {
                'goals': goals or 0,
                'assists': assists or 0,
                'unique_scorers': scorers or 0,
            }

        # Key players: players with at least 3 goal events for a team.
        # NOTE(review): the count spans all finished matches in the table,
        # including ones played after the match being described.
        self.cur.execute("""
            SELECT mpe.team_id, mpe.player_id, COUNT(*)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
            AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
            GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
        """)
        kp_by_team = defaultdict(set)
        for tid, pid, _ in self.cur.fetchall():
            kp_by_team[tid].add(pid)

        # Starting line-up player ids per match/team.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id, mpp.player_id
            FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
            WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
        """)
        starters = defaultdict(list)
        for mid, tid, pid in self.cur.fetchall():
            starters[(mid, tid)].append(pid)

        no_part = {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0}
        no_evts = {'goals': 0, 'assists': 0, 'unique_scorers': 0}
        for key in set(part) | set(evts):
            _, tid = key
            p = part.get(key, no_part)
            e = evts.get(key, no_evts)
            # .get() avoids creating empty defaultdict entries as a side effect.
            team_key_players = kp_by_team.get(tid, frozenset())
            kp_in = sum(1 for pid in starters.get(key, []) if pid in team_key_players)
            kp_tot = len(team_key_players)
            kp_miss = max(0, kp_tot - kp_in)
            squad_quality = (
                p['starting_count'] * 0.3
                + e['goals'] * 2.0
                + e['assists'] * 1.0
                + kp_in * 3.0
                + p['fwd_count'] * 1.5
            )
            # Share of the team's key players not in the starting line-up.
            missing_impact = min(kp_miss / max(kp_tot, 1), 1.0)
            self.squad_cache[key] = {
                'squad_quality': squad_quality,
                'key_players': kp_in,
                'missing_impact': missing_impact,
                'goals_form': e['goals'],
            }

    def _load_cards_data(self):
        """Fill ``self.cards_cache`` with a weighted card total per match.

        Events whose type contains ``yellow_card`` weigh 1, ``red_card``
        weigh 2, and any other card-type event weighs 1.
        """
        self.cur.execute("""
            SELECT mpe.match_id,
            SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
            WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
            GROUP BY mpe.match_id
        """)
        for mid, card_weight in self.cur.fetchall():
            self.cards_cache[mid] = float(card_weight) if card_weight else 0.0

    def load_league_matches(self):
        """Group ``self.matches`` by league into ``self.league_matches_cache``.

        Each entry is ``(mst_utc, None, score_home, score_away, None)``.
        Entries are appended, so calling this twice duplicates them.
        """
        for m in self.matches:
            # Indices follow the SELECT in _load_matches:
            # 3=score_home, 4=score_away, 7=mst_utc, 8=league_id.
            self.league_matches_cache.setdefault(m[8], []).append(
                (m[7], None, m[3], m[4], None)
            )
|
||
|
||
|
||
class V27Extractor(V25Extractor):
    """Adds V27 features on top of V25."""

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
                     hn, an, ln):
        """Build the V25 row for one match and extend it with V27 features.

        Returns ``None`` when the V25 extractor yields no row; otherwise the
        same row object enriched with rolling, league-quality, time,
        head-to-head and strength-difference features.
        """
        row = super()._extract_one(mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln)
        if not row:
            return None

        home_history = self.loader.team_matches.get(hid, [])
        away_history = self.loader.team_matches.get(aid, [])

        # (output column suffix, key in the calc_rolling_features result)
        rolling_columns = (
            ("rolling5_goals", "rolling5_goals_avg"),
            ("rolling5_conceded", "rolling5_conceded_avg"),
            ("rolling10_goals", "rolling10_goals_avg"),
            ("rolling10_conceded", "rolling10_conceded_avg"),
            ("rolling20_goals", "rolling20_goals_avg"),
            ("rolling20_conceded", "rolling20_conceded_avg"),
            ("rolling5_cs", "rolling5_clean_sheets"),
            ("venue_goals", "venue_goals_avg"),
            ("venue_conceded", "venue_conceded_avg"),
            ("goal_trend", "goal_trend"),
        )
        home_rolling = calc_rolling_features(home_history, mst, True)
        away_rolling = calc_rolling_features(away_history, mst, False)
        for side, stats in (("home", home_rolling), ("away", away_rolling)):
            for suffix, source_key in rolling_columns:
                row[f"{side}_{suffix}"] = stats[source_key]

        # League quality uses only league matches dated strictly before this one.
        league_history = self.loader.league_matches_cache.get(lid, [])
        earlier_league_matches = [
            entry for entry in league_history if entry[0] < mst
        ]
        row.update(calc_league_quality(earlier_league_matches))

        home_time = calc_time_features(home_history, mst)
        away_time = calc_time_features(away_history, mst)
        row["home_days_rest"] = home_time["days_rest"]
        row["away_days_rest"] = away_time["days_rest"]
        # Calendar features are taken from the home-side result.
        for column in ("match_month", "is_season_start", "is_season_end"):
            row[column] = home_time[column]

        row.update(calc_advanced_h2h(home_history, hid, aid, mst))

        home_profile = {
            "goals_avg": row.get("home_goals_avg", 1.3),
            "conceded_avg": row.get("home_conceded_avg", 1.2),
            "scoring_rate": row.get("home_scoring_rate", 0.75),
        }
        away_profile = {
            "goals_avg": row.get("away_goals_avg", 1.3),
            "conceded_avg": row.get("away_conceded_avg", 1.2),
            "scoring_rate": row.get("away_scoring_rate", 0.75),
        }
        strength = calc_strength_diff(
            home_profile,
            away_profile,
            self.elo_ratings[hid],
            self.elo_ratings[aid],
            row.get("home_momentum_score", 0.5),
            row.get("away_momentum_score", 0.5),
            row.get("upset_potential", 0.0),
        )
        row.update(strength)
        return row
|
||
|
||
|
||
def main():
    """Run the V27 extraction end to end and write the result to OUTPUT.

    Loads matches through V27Loader, extracts feature rows with
    V27Extractor, writes them as CSV using the ALL_COLS header (row keys not
    in ALL_COLS are ignored) and prints a short summary. The database
    connection is closed on every exit path, including the early "no data"
    return and failures.
    """
    print("🚀 V27 Value Sniper — Training Data Extraction")
    print("="*60)
    t0 = time.time()
    conn = get_conn()
    try:
        print("\n📦 Loading ALL odds-bearing matches...")
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" Matches: {len(loader.matches)}")
        print(f" Leagues: {len(loader.league_stats_cache)}")
        print(f" Odds: {len(loader.odds_cache)}")

        ext = V27Extractor(conn, loader)
        rows = ext.extract_all()
        if not rows:
            print("❌ No data!")
            return

        print(f"\n💾 Writing {len(rows)} rows...")
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
            w.writeheader()
            w.writerows(rows)

        # rows is non-empty here, so the percentage divisions are safe.
        n = len(rows)
        # `or 0` keeps a None odds value from raising on the comparison.
        wo = sum(1 for r in rows if (r.get("odds_ms_h") or 0) > 0)
        md = defaultdict(int)
        for r in rows:
            md[r["label_ms"]] += 1
        print(f"\n📊 Summary:")
        print(f" Rows: {n}")
        print(f" With odds: {wo} ({wo/n*100:.1f}%)")
        print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
        print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
        print(f" Time: {(time.time()-t0)/60:.1f}min")
        print(f"\n✅ Done! → {OUTPUT}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|