This commit is contained in:
2026-04-22 02:17:02 +03:00
parent 2ccd6831eb
commit df428ed1e8
19 changed files with 6436 additions and 9 deletions
@@ -0,0 +1,305 @@
"""
V27 Training Data Extraction - Value Sniper
Extends V25 to ALL matches with odds (~104K).
Adds rolling window, league quality, time, H2H, strength features.
Usage: python3 scripts/extract_training_data_v27.py
"""
import os, sys, csv, time
from collections import defaultdict
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)
from scripts.extract_training_data import (
BatchDataLoader as V25Loader,
FeatureExtractor as V25Extractor,
FEATURE_COLS as V25_COLS,
get_conn,
)
from features.rolling_features import (
calc_rolling_features, calc_league_quality,
calc_time_features, calc_advanced_h2h, calc_strength_diff,
)
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
V27_NEW = [
"home_rolling5_goals","home_rolling5_conceded",
"home_rolling10_goals","home_rolling10_conceded",
"home_rolling20_goals","home_rolling20_conceded",
"away_rolling5_goals","away_rolling5_conceded",
"away_rolling10_goals","away_rolling10_conceded",
"home_rolling5_cs","away_rolling5_cs",
"home_venue_goals","home_venue_conceded",
"away_venue_goals","away_venue_conceded",
"home_goal_trend","away_goal_trend",
"league_home_win_rate","league_draw_rate",
"league_btts_rate","league_ou25_rate",
"league_reliability_score",
"home_days_rest","away_days_rest",
"match_month","is_season_start","is_season_end",
"h2h_home_goals_avg","h2h_away_goals_avg",
"h2h_recent_trend","h2h_venue_advantage",
"attack_vs_defense_home","attack_vs_defense_away",
"xg_diff","form_momentum_interaction",
"elo_form_consistency","upset_x_elo_gap",
]
ALL_COLS = V25_COLS + V27_NEW
class V27Loader(V25Loader):
"""Load ALL matches with odds, not just top leagues."""
def __init__(self, conn):
super().__init__(conn, [])
self.league_matches_cache = {}
def _load_matches(self):
self.cur.execute("""
SELECT m.id, m.home_team_id, m.away_team_id,
m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away,
m.mst_utc, m.league_id,
ht.name, at.name, l.name
FROM matches m
JOIN teams ht ON m.home_team_id = ht.id
JOIN teams at ON m.away_team_id = at.id
JOIN leagues l ON m.league_id = l.id
WHERE m.status='FT' AND m.score_home IS NOT NULL
AND m.sport='football'
AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
ORDER BY m.mst_utc ASC
""")
self.matches = self.cur.fetchall()
def _load_odds(self):
self.cur.execute("""
SELECT oc.match_id, oc.name, os.name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
JOIN matches m ON oc.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
""")
for mid, cat, sel, val in self.cur.fetchall():
try:
v = float(val) if val else 0
if v <= 0 or not cat or not sel: continue
if mid not in self.odds_cache: self.odds_cache[mid] = {}
c = cat.lower().strip()
s = sel.lower().strip()
o = self.odds_cache[mid]
if c == 'maç sonucu':
if sel=='1': o['ms_h']=v
elif sel in('0','X'): o['ms_d']=v
elif sel=='2': o['ms_a']=v
elif c == '1. yarı sonucu':
if sel=='1': o['ht_ms_h']=v
elif sel in('0','X'): o['ht_ms_d']=v
elif sel=='2': o['ht_ms_a']=v
elif c == 'karşılıklı gol':
if 'var' in s: o['btts_y']=v
elif 'yok' in s: o['btts_n']=v
elif c == '2,5 alt/üst':
if 'alt' in s: o['ou25_u']=v
elif 'üst' in s: o['ou25_o']=v
elif c == '1,5 alt/üst':
if 'alt' in s: o['ou15_u']=v
elif 'üst' in s: o['ou15_o']=v
elif c == '3,5 alt/üst':
if 'alt' in s: o['ou35_u']=v
elif 'üst' in s: o['ou35_o']=v
elif c == '0,5 alt/üst':
if 'alt' in s: o['ou05_u']=v
elif 'üst' in s: o['ou05_o']=v
elif c == '1. yarı 0,5 alt/üst':
if 'alt' in s: o['ht_ou05_u']=v
elif 'üst' in s: o['ht_ou05_o']=v
elif c == '1. yarı 1,5 alt/üst':
if 'alt' in s: o['ht_ou15_u']=v
elif 'üst' in s: o['ht_ou15_o']=v
except (ValueError, TypeError): pass
def _load_league_stats(self):
self.cur.execute("""
SELECT league_id,
AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
COUNT(*)
FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
GROUP BY league_id
""")
for lid, ag, zr, cnt in self.cur.fetchall():
self.league_stats_cache[lid] = {
"avg_goals": float(ag) if ag else 2.5,
"zero_rate": float(zr) if zr else 0.07,
"match_count": cnt
}
def _load_squad_data(self):
self.cur.execute("""
SELECT mpp.match_id, mpp.team_id,
COUNT(*) FILTER(WHERE mpp.is_starting=true),
COUNT(*),
COUNT(*) FILTER(WHERE mpp.is_starting=true
AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
FROM match_player_participation mpp
JOIN matches m ON mpp.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
GROUP BY mpp.match_id, mpp.team_id
""")
part = {}
for mid,tid,st,tot,fwd in self.cur.fetchall():
part[(mid,tid)]={'starting_count':st or 0,'total_squad':tot or 0,'fwd_count':fwd or 0}
self.cur.execute("""
SELECT mpe.match_id, mpe.team_id,
COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
FROM match_player_events mpe
JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football'
GROUP BY mpe.match_id, mpe.team_id
""")
evts = {}
for mid,tid,g,a,sc in self.cur.fetchall():
evts[(mid,tid)]={'goals':g or 0,'assists':a or 0,'unique_scorers':sc or 0}
self.cur.execute("""
SELECT mpe.team_id, mpe.player_id, COUNT(*)
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
""")
kp_by_team = defaultdict(set)
for tid,pid,_ in self.cur.fetchall(): kp_by_team[tid].add(pid)
self.cur.execute("""
SELECT mpp.match_id, mpp.team_id, mpp.player_id
FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
""")
starters = defaultdict(list)
for mid,tid,pid in self.cur.fetchall(): starters[(mid,tid)].append(pid)
for key in set(part)|set(evts):
mid,tid = key
p = part.get(key,{'starting_count':0,'total_squad':0,'fwd_count':0})
e = evts.get(key,{'goals':0,'assists':0,'unique_scorers':0})
s = starters.get(key,[])
kp_in = sum(1 for x in s if x in kp_by_team.get(tid,set()))
kp_tot = len(kp_by_team.get(tid,set()))
kp_miss = max(0, kp_tot - kp_in)
sq = p['starting_count']*0.3 + e['goals']*2.0 + e['assists']*1.0 + kp_in*3.0 + p['fwd_count']*1.5
mi = min(kp_miss/max(kp_tot,1), 1.0)
self.squad_cache[key] = {'squad_quality':sq,'key_players':kp_in,'missing_impact':mi,'goals_form':e['goals']}
def _load_cards_data(self):
self.cur.execute("""
SELECT mpe.match_id,
SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
GROUP BY mpe.match_id
""")
for mid, cw in self.cur.fetchall():
self.cards_cache[mid] = float(cw) if cw else 0.0
def load_league_matches(self):
for m in self.matches:
lid = m[8]
if lid not in self.league_matches_cache:
self.league_matches_cache[lid] = []
self.league_matches_cache[lid].append((m[7],None,m[3],m[4],None))
class V27Extractor(V25Extractor):
"""Adds V27 features on top of V25."""
def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
hn, an, ln):
row = super()._extract_one(mid,hid,aid,sh,sa,hth,hta,mst,lid,hn,an,ln)
if not row: return None
hm = self.loader.team_matches.get(hid,[])
am = self.loader.team_matches.get(aid,[])
hr = calc_rolling_features(hm, mst, True)
ar = calc_rolling_features(am, mst, False)
for pfx,r in [("home",hr),("away",ar)]:
row[f"{pfx}_rolling5_goals"]=r["rolling5_goals_avg"]
row[f"{pfx}_rolling5_conceded"]=r["rolling5_conceded_avg"]
row[f"{pfx}_rolling10_goals"]=r["rolling10_goals_avg"]
row[f"{pfx}_rolling10_conceded"]=r["rolling10_conceded_avg"]
row[f"{pfx}_rolling20_goals"]=r["rolling20_goals_avg"]
row[f"{pfx}_rolling20_conceded"]=r["rolling20_conceded_avg"]
row[f"{pfx}_rolling5_cs"]=r["rolling5_clean_sheets"]
row[f"{pfx}_venue_goals"]=r["venue_goals_avg"]
row[f"{pfx}_venue_conceded"]=r["venue_conceded_avg"]
row[f"{pfx}_goal_trend"]=r["goal_trend"]
lb = [x for x in self.loader.league_matches_cache.get(lid,[]) if x[0]<mst]
lq = calc_league_quality(lb)
for k,v in lq.items(): row[k]=v
ht = calc_time_features(hm, mst)
at = calc_time_features(am, mst)
row["home_days_rest"]=ht["days_rest"]
row["away_days_rest"]=at["days_rest"]
row["match_month"]=ht["match_month"]
row["is_season_start"]=ht["is_season_start"]
row["is_season_end"]=ht["is_season_end"]
h2h = calc_advanced_h2h(hm, hid, aid, mst)
for k,v in h2h.items(): row[k]=v
sd = calc_strength_diff(
{"goals_avg":row.get("home_goals_avg",1.3),"conceded_avg":row.get("home_conceded_avg",1.2),"scoring_rate":row.get("home_scoring_rate",0.75)},
{"goals_avg":row.get("away_goals_avg",1.3),"conceded_avg":row.get("away_conceded_avg",1.2),"scoring_rate":row.get("away_scoring_rate",0.75)},
self.elo_ratings[hid], self.elo_ratings[aid],
row.get("home_momentum_score",0.5), row.get("away_momentum_score",0.5),
row.get("upset_potential",0.0),
)
row.update(sd)
return row
def main():
print("🚀 V27 Value Sniper — Training Data Extraction")
print("="*60)
t0 = time.time()
conn = get_conn()
print("\n📦 Loading ALL odds-bearing matches...")
loader = V27Loader(conn)
loader.load_all()
loader.load_league_matches()
print(f" Matches: {len(loader.matches)}")
print(f" Leagues: {len(loader.league_stats_cache)}")
print(f" Odds: {len(loader.odds_cache)}")
ext = V27Extractor(conn, loader)
rows = ext.extract_all()
if not rows:
print("❌ No data!"); return
print(f"\n💾 Writing {len(rows)} rows...")
with open(OUTPUT,"w",newline="",encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
w.writeheader(); w.writerows(rows)
n = len(rows)
wo = sum(1 for r in rows if r.get("odds_ms_h",0)>0)
md = defaultdict(int)
for r in rows: md[r["label_ms"]]+=1
print(f"\n📊 Summary:")
print(f" Rows: {n}")
print(f" With odds: {wo} ({wo/n*100:.1f}%)")
print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
print(f" Time: {(time.time()-t0)/60:.1f}min")
print(f"\n✅ Done! → {OUTPUT}")
conn.close()
if __name__=="__main__":
main()