gg
This commit is contained in:
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
V27 Training Data Extraction - Value Sniper
|
||||
Extends V25 to ALL matches with odds (~104K).
|
||||
Adds rolling window, league quality, time, H2H, strength features.
|
||||
Usage: python3 scripts/extract_training_data_v27.py
|
||||
"""
|
||||
import os, sys, csv, time
|
||||
from collections import defaultdict
|
||||
|
||||
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, AI_DIR)
|
||||
|
||||
from scripts.extract_training_data import (
|
||||
BatchDataLoader as V25Loader,
|
||||
FeatureExtractor as V25Extractor,
|
||||
FEATURE_COLS as V25_COLS,
|
||||
get_conn,
|
||||
)
|
||||
from features.rolling_features import (
|
||||
calc_rolling_features, calc_league_quality,
|
||||
calc_time_features, calc_advanced_h2h, calc_strength_diff,
|
||||
)
|
||||
|
||||
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
|
||||
|
||||
V27_NEW = [
|
||||
"home_rolling5_goals","home_rolling5_conceded",
|
||||
"home_rolling10_goals","home_rolling10_conceded",
|
||||
"home_rolling20_goals","home_rolling20_conceded",
|
||||
"away_rolling5_goals","away_rolling5_conceded",
|
||||
"away_rolling10_goals","away_rolling10_conceded",
|
||||
"home_rolling5_cs","away_rolling5_cs",
|
||||
"home_venue_goals","home_venue_conceded",
|
||||
"away_venue_goals","away_venue_conceded",
|
||||
"home_goal_trend","away_goal_trend",
|
||||
"league_home_win_rate","league_draw_rate",
|
||||
"league_btts_rate","league_ou25_rate",
|
||||
"league_reliability_score",
|
||||
"home_days_rest","away_days_rest",
|
||||
"match_month","is_season_start","is_season_end",
|
||||
"h2h_home_goals_avg","h2h_away_goals_avg",
|
||||
"h2h_recent_trend","h2h_venue_advantage",
|
||||
"attack_vs_defense_home","attack_vs_defense_away",
|
||||
"xg_diff","form_momentum_interaction",
|
||||
"elo_form_consistency","upset_x_elo_gap",
|
||||
]
|
||||
ALL_COLS = V25_COLS + V27_NEW
|
||||
|
||||
|
||||
class V27Loader(V25Loader):
|
||||
"""Load ALL matches with odds, not just top leagues."""
|
||||
def __init__(self, conn):
|
||||
super().__init__(conn, [])
|
||||
self.league_matches_cache = {}
|
||||
|
||||
def _load_matches(self):
|
||||
self.cur.execute("""
|
||||
SELECT m.id, m.home_team_id, m.away_team_id,
|
||||
m.score_home, m.score_away,
|
||||
m.ht_score_home, m.ht_score_away,
|
||||
m.mst_utc, m.league_id,
|
||||
ht.name, at.name, l.name
|
||||
FROM matches m
|
||||
JOIN teams ht ON m.home_team_id = ht.id
|
||||
JOIN teams at ON m.away_team_id = at.id
|
||||
JOIN leagues l ON m.league_id = l.id
|
||||
WHERE m.status='FT' AND m.score_home IS NOT NULL
|
||||
AND m.sport='football'
|
||||
AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
|
||||
ORDER BY m.mst_utc ASC
|
||||
""")
|
||||
self.matches = self.cur.fetchall()
|
||||
|
||||
def _load_odds(self):
|
||||
self.cur.execute("""
|
||||
SELECT oc.match_id, oc.name, os.name, os.odd_value
|
||||
FROM odd_selections os
|
||||
JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
|
||||
JOIN matches m ON oc.match_id=m.id
|
||||
WHERE m.status='FT' AND m.sport='football'
|
||||
""")
|
||||
for mid, cat, sel, val in self.cur.fetchall():
|
||||
try:
|
||||
v = float(val) if val else 0
|
||||
if v <= 0 or not cat or not sel: continue
|
||||
if mid not in self.odds_cache: self.odds_cache[mid] = {}
|
||||
c = cat.lower().strip()
|
||||
s = sel.lower().strip()
|
||||
o = self.odds_cache[mid]
|
||||
if c == 'maç sonucu':
|
||||
if sel=='1': o['ms_h']=v
|
||||
elif sel in('0','X'): o['ms_d']=v
|
||||
elif sel=='2': o['ms_a']=v
|
||||
elif c == '1. yarı sonucu':
|
||||
if sel=='1': o['ht_ms_h']=v
|
||||
elif sel in('0','X'): o['ht_ms_d']=v
|
||||
elif sel=='2': o['ht_ms_a']=v
|
||||
elif c == 'karşılıklı gol':
|
||||
if 'var' in s: o['btts_y']=v
|
||||
elif 'yok' in s: o['btts_n']=v
|
||||
elif c == '2,5 alt/üst':
|
||||
if 'alt' in s: o['ou25_u']=v
|
||||
elif 'üst' in s: o['ou25_o']=v
|
||||
elif c == '1,5 alt/üst':
|
||||
if 'alt' in s: o['ou15_u']=v
|
||||
elif 'üst' in s: o['ou15_o']=v
|
||||
elif c == '3,5 alt/üst':
|
||||
if 'alt' in s: o['ou35_u']=v
|
||||
elif 'üst' in s: o['ou35_o']=v
|
||||
elif c == '0,5 alt/üst':
|
||||
if 'alt' in s: o['ou05_u']=v
|
||||
elif 'üst' in s: o['ou05_o']=v
|
||||
elif c == '1. yarı 0,5 alt/üst':
|
||||
if 'alt' in s: o['ht_ou05_u']=v
|
||||
elif 'üst' in s: o['ht_ou05_o']=v
|
||||
elif c == '1. yarı 1,5 alt/üst':
|
||||
if 'alt' in s: o['ht_ou15_u']=v
|
||||
elif 'üst' in s: o['ht_ou15_o']=v
|
||||
except (ValueError, TypeError): pass
|
||||
|
||||
def _load_league_stats(self):
|
||||
self.cur.execute("""
|
||||
SELECT league_id,
|
||||
AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
|
||||
COUNT(*)
|
||||
FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
|
||||
GROUP BY league_id
|
||||
""")
|
||||
for lid, ag, zr, cnt in self.cur.fetchall():
|
||||
self.league_stats_cache[lid] = {
|
||||
"avg_goals": float(ag) if ag else 2.5,
|
||||
"zero_rate": float(zr) if zr else 0.07,
|
||||
"match_count": cnt
|
||||
}
|
||||
|
||||
def _load_squad_data(self):
|
||||
self.cur.execute("""
|
||||
SELECT mpp.match_id, mpp.team_id,
|
||||
COUNT(*) FILTER(WHERE mpp.is_starting=true),
|
||||
COUNT(*),
|
||||
COUNT(*) FILTER(WHERE mpp.is_starting=true
|
||||
AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
|
||||
FROM match_player_participation mpp
|
||||
JOIN matches m ON mpp.match_id=m.id
|
||||
WHERE m.status='FT' AND m.sport='football'
|
||||
GROUP BY mpp.match_id, mpp.team_id
|
||||
""")
|
||||
part = {}
|
||||
for mid,tid,st,tot,fwd in self.cur.fetchall():
|
||||
part[(mid,tid)]={'starting_count':st or 0,'total_squad':tot or 0,'fwd_count':fwd or 0}
|
||||
|
||||
self.cur.execute("""
|
||||
SELECT mpe.match_id, mpe.team_id,
|
||||
COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
|
||||
COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
|
||||
COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
|
||||
FROM match_player_events mpe
|
||||
JOIN matches m ON mpe.match_id=m.id
|
||||
WHERE m.status='FT' AND m.sport='football'
|
||||
GROUP BY mpe.match_id, mpe.team_id
|
||||
""")
|
||||
evts = {}
|
||||
for mid,tid,g,a,sc in self.cur.fetchall():
|
||||
evts[(mid,tid)]={'goals':g or 0,'assists':a or 0,'unique_scorers':sc or 0}
|
||||
|
||||
self.cur.execute("""
|
||||
SELECT mpe.team_id, mpe.player_id, COUNT(*)
|
||||
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
|
||||
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
|
||||
AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
|
||||
GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
|
||||
""")
|
||||
kp_by_team = defaultdict(set)
|
||||
for tid,pid,_ in self.cur.fetchall(): kp_by_team[tid].add(pid)
|
||||
|
||||
self.cur.execute("""
|
||||
SELECT mpp.match_id, mpp.team_id, mpp.player_id
|
||||
FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
|
||||
WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
|
||||
""")
|
||||
starters = defaultdict(list)
|
||||
for mid,tid,pid in self.cur.fetchall(): starters[(mid,tid)].append(pid)
|
||||
|
||||
for key in set(part)|set(evts):
|
||||
mid,tid = key
|
||||
p = part.get(key,{'starting_count':0,'total_squad':0,'fwd_count':0})
|
||||
e = evts.get(key,{'goals':0,'assists':0,'unique_scorers':0})
|
||||
s = starters.get(key,[])
|
||||
kp_in = sum(1 for x in s if x in kp_by_team.get(tid,set()))
|
||||
kp_tot = len(kp_by_team.get(tid,set()))
|
||||
kp_miss = max(0, kp_tot - kp_in)
|
||||
sq = p['starting_count']*0.3 + e['goals']*2.0 + e['assists']*1.0 + kp_in*3.0 + p['fwd_count']*1.5
|
||||
mi = min(kp_miss/max(kp_tot,1), 1.0)
|
||||
self.squad_cache[key] = {'squad_quality':sq,'key_players':kp_in,'missing_impact':mi,'goals_form':e['goals']}
|
||||
|
||||
def _load_cards_data(self):
|
||||
self.cur.execute("""
|
||||
SELECT mpe.match_id,
|
||||
SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
|
||||
WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
|
||||
FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
|
||||
WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
|
||||
GROUP BY mpe.match_id
|
||||
""")
|
||||
for mid, cw in self.cur.fetchall():
|
||||
self.cards_cache[mid] = float(cw) if cw else 0.0
|
||||
|
||||
def load_league_matches(self):
|
||||
for m in self.matches:
|
||||
lid = m[8]
|
||||
if lid not in self.league_matches_cache:
|
||||
self.league_matches_cache[lid] = []
|
||||
self.league_matches_cache[lid].append((m[7],None,m[3],m[4],None))
|
||||
|
||||
|
||||
class V27Extractor(V25Extractor):
|
||||
"""Adds V27 features on top of V25."""
|
||||
def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
|
||||
hn, an, ln):
|
||||
row = super()._extract_one(mid,hid,aid,sh,sa,hth,hta,mst,lid,hn,an,ln)
|
||||
if not row: return None
|
||||
|
||||
hm = self.loader.team_matches.get(hid,[])
|
||||
am = self.loader.team_matches.get(aid,[])
|
||||
|
||||
hr = calc_rolling_features(hm, mst, True)
|
||||
ar = calc_rolling_features(am, mst, False)
|
||||
for pfx,r in [("home",hr),("away",ar)]:
|
||||
row[f"{pfx}_rolling5_goals"]=r["rolling5_goals_avg"]
|
||||
row[f"{pfx}_rolling5_conceded"]=r["rolling5_conceded_avg"]
|
||||
row[f"{pfx}_rolling10_goals"]=r["rolling10_goals_avg"]
|
||||
row[f"{pfx}_rolling10_conceded"]=r["rolling10_conceded_avg"]
|
||||
row[f"{pfx}_rolling20_goals"]=r["rolling20_goals_avg"]
|
||||
row[f"{pfx}_rolling20_conceded"]=r["rolling20_conceded_avg"]
|
||||
row[f"{pfx}_rolling5_cs"]=r["rolling5_clean_sheets"]
|
||||
row[f"{pfx}_venue_goals"]=r["venue_goals_avg"]
|
||||
row[f"{pfx}_venue_conceded"]=r["venue_conceded_avg"]
|
||||
row[f"{pfx}_goal_trend"]=r["goal_trend"]
|
||||
|
||||
lb = [x for x in self.loader.league_matches_cache.get(lid,[]) if x[0]<mst]
|
||||
lq = calc_league_quality(lb)
|
||||
for k,v in lq.items(): row[k]=v
|
||||
|
||||
ht = calc_time_features(hm, mst)
|
||||
at = calc_time_features(am, mst)
|
||||
row["home_days_rest"]=ht["days_rest"]
|
||||
row["away_days_rest"]=at["days_rest"]
|
||||
row["match_month"]=ht["match_month"]
|
||||
row["is_season_start"]=ht["is_season_start"]
|
||||
row["is_season_end"]=ht["is_season_end"]
|
||||
|
||||
h2h = calc_advanced_h2h(hm, hid, aid, mst)
|
||||
for k,v in h2h.items(): row[k]=v
|
||||
|
||||
sd = calc_strength_diff(
|
||||
{"goals_avg":row.get("home_goals_avg",1.3),"conceded_avg":row.get("home_conceded_avg",1.2),"scoring_rate":row.get("home_scoring_rate",0.75)},
|
||||
{"goals_avg":row.get("away_goals_avg",1.3),"conceded_avg":row.get("away_conceded_avg",1.2),"scoring_rate":row.get("away_scoring_rate",0.75)},
|
||||
self.elo_ratings[hid], self.elo_ratings[aid],
|
||||
row.get("home_momentum_score",0.5), row.get("away_momentum_score",0.5),
|
||||
row.get("upset_potential",0.0),
|
||||
)
|
||||
row.update(sd)
|
||||
return row
|
||||
|
||||
|
||||
def main():
|
||||
print("🚀 V27 Value Sniper — Training Data Extraction")
|
||||
print("="*60)
|
||||
t0 = time.time()
|
||||
conn = get_conn()
|
||||
|
||||
print("\n📦 Loading ALL odds-bearing matches...")
|
||||
loader = V27Loader(conn)
|
||||
loader.load_all()
|
||||
loader.load_league_matches()
|
||||
print(f" Matches: {len(loader.matches)}")
|
||||
print(f" Leagues: {len(loader.league_stats_cache)}")
|
||||
print(f" Odds: {len(loader.odds_cache)}")
|
||||
|
||||
ext = V27Extractor(conn, loader)
|
||||
rows = ext.extract_all()
|
||||
if not rows:
|
||||
print("❌ No data!"); return
|
||||
|
||||
print(f"\n💾 Writing {len(rows)} rows...")
|
||||
with open(OUTPUT,"w",newline="",encoding="utf-8") as f:
|
||||
w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
|
||||
w.writeheader(); w.writerows(rows)
|
||||
|
||||
n = len(rows)
|
||||
wo = sum(1 for r in rows if r.get("odds_ms_h",0)>0)
|
||||
md = defaultdict(int)
|
||||
for r in rows: md[r["label_ms"]]+=1
|
||||
print(f"\n📊 Summary:")
|
||||
print(f" Rows: {n}")
|
||||
print(f" With odds: {wo} ({wo/n*100:.1f}%)")
|
||||
print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
|
||||
print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
|
||||
print(f" Time: {(time.time()-t0)/60:.1f}min")
|
||||
print(f"\n✅ Done! → {OUTPUT}")
|
||||
conn.close()
|
||||
|
||||
if __name__=="__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user