199 lines
8.0 KiB
Python
199 lines
8.0 KiB
Python
"""
|
||
Extract Upcoming Features — leak-free feature rows for UPCOMING (NS) matches,
|
||
produced by the EXACT same pipeline that built training_data_v27.csv.
|
||
=============================================================================
|
||
Why this exists: the picker (generate_daily_picks.py) needs the 133 leak-free
|
||
features for tomorrow's matches, computed IDENTICALLY to training (any drift =
|
||
train/serve skew = broken model). So we reuse V27Loader + V27Extractor verbatim:
|
||
|
||
1. load_all() builds ELO / team history / league / squad caches from FT
|
||
matches ONLY (untouched — guarantees identical feature computation).
|
||
2. We then APPEND upcoming NS matches as targets and inject their odds from
|
||
live_matches.odds (all markets, same mapping as the trainer's _load_odds).
|
||
3. extract_all() replays FT chronologically (ELO fully built), then computes
|
||
features for the NS targets at the end. ELO update + labels are guarded
|
||
for null scores (NS has no result yet); the 133 model features never use
|
||
the current score, so they come out identical to training.
|
||
4. Write ONLY the upcoming rows -> data/upcoming_features.csv
|
||
|
||
Then: generate_daily_picks.py --features data/upcoming_features.csv --log
|
||
|
||
Run nightly (heavy: full ELO replay, like training). Read-only on the DB.
|
||
"""
|
||
from __future__ import annotations
|
||
import csv
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
|
||
# Best-effort switch of stdout to UTF-8 so the non-ASCII status messages printed
# by this script do not fail on consoles with a narrower default encoding.
# Any failure to reconfigure is deliberately ignored.
if sys.stdout and getattr(sys.stdout, "reconfigure", None) is not None:
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# AI_DIR is the parent of the directory that contains this file. It is put at
# the front of sys.path so the `scripts.` package import below resolves.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)
|
||
|
||
from scripts.extract_training_data_v27 import ( # noqa: E402
|
||
V27Loader, V27Extractor, ALL_COLS, get_conn,
|
||
)
|
||
|
||
# Size of the look-ahead window, in days, used when selecting upcoming matches.
DAYS_AHEAD = 4

# CSV file that receives the feature rows of the upcoming matches.
OUTPUT = os.path.join(AI_DIR, "data", "upcoming_features.csv")
|
||
|
||
|
||
# Result (1X2-style) markets: lower-cased category name -> odds key prefix.
# Selections in these markets are matched exactly (case-sensitive, unstripped).
_RESULT_MARKETS = {
    "maç sonucu": "ms",
    "1. yarı sonucu": "ht_ms",
}

# Exact selection label -> key suffix for the result markets.
_RESULT_SUFFIXES = {"1": "_h", "0": "_d", "X": "_d", "2": "_a"}

# Over/under markets: lower-cased category name -> odds key prefix.
# Selections are matched by substring: "alt" -> "_u", otherwise "üst" -> "_o".
_TOTALS_MARKETS = {
    "0,5 alt/üst": "ou05",
    "1,5 alt/üst": "ou15",
    "2,5 alt/üst": "ou25",
    "3,5 alt/üst": "ou35",
    "1. yarı 0,5 alt/üst": "ht_ou05",
    "1. yarı 1,5 alt/üst": "ht_ou15",
}

# Both-teams-to-score market category (lower-cased).
_BTTS_MARKET = "karşılıklı gol"


def map_live_odds(odds_json) -> dict:
    """Map a live_matches.odds payload to flat odds-cache keys.

    The category/selection mapping is meant to mirror the trainer's
    ``_load_odds`` so odds features match training (per the original author;
    the trainer is not visible from this file).

    Args:
        odds_json: Either a dict of ``{category: {selection: price}}`` or a
            JSON string encoding such a dict. Anything else yields ``{}``.

    Returns:
        A dict with any of the keys ``ms_*``, ``ht_ms_*`` (``h``/``d``/``a``),
        ``btts_y``/``btts_n`` and ``ou05..ou35_*``, ``ht_ou05_*``,
        ``ht_ou15_*`` (``u``/``o``), mapped to float prices. Unknown
        categories and selections are ignored; an unparsable payload gives
        an empty dict.
    """
    out: dict = {}
    if isinstance(odds_json, str):
        try:
            odds_json = json.loads(odds_json)
        except (ValueError, RecursionError):
            # Malformed (or absurdly nested) JSON: treat as "no odds".
            return out
    if not isinstance(odds_json, dict):
        return out

    inf = float("inf")
    for cat, sels in odds_json.items():
        if not isinstance(sels, dict):
            continue
        category = str(cat).lower().strip()
        for sel, val in sels.items():
            try:
                price = float(val)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: integer too large to convert to float.
                continue
            # Accept only finite, strictly positive prices. The chained
            # comparison is False for NaN, so NaN/±inf (which json.loads and
            # float("nan") happily produce) are rejected along with <= 0.
            # NOTE(review): confirm the trainer cannot emit non-finite odds,
            # otherwise this filter should be mirrored there.
            if not 0 < price < inf:
                continue

            raw_sel = str(sel)
            folded_sel = raw_sel.lower().strip()

            if category in _RESULT_MARKETS:
                suffix = _RESULT_SUFFIXES.get(raw_sel)
                if suffix is not None:
                    out[_RESULT_MARKETS[category] + suffix] = price
            elif category == _BTTS_MARKET:
                if "var" in folded_sel:
                    out["btts_y"] = price
                elif "yok" in folded_sel:
                    out["btts_n"] = price
            elif category in _TOTALS_MARKETS:
                prefix = _TOTALS_MARKETS[category]
                if "alt" in folded_sel:
                    out[prefix + "_u"] = price
                elif "üst" in folded_sel:
                    out[prefix + "_o"] = price
    return out
|
||
|
||
|
||
class UpcomingExtractor(V27Extractor):
    """V27Extractor variant that tolerates matches without a score.

    Feature computation is delegated unchanged to the parent class; the
    overrides only decide *when* the parent is invoked.
    """

    def _update_elo(self, home_id, away_id, score_home, score_away):
        """Forward to the parent ELO update only when both scores are known."""
        has_result = score_home is not None and score_away is not None
        if has_result:
            return super()._update_elo(home_id, away_id, score_home, score_away)
        # Upcoming match: there is no result yet, so ELO must not move.
        return None

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln):
        """Build a feature row for score-less matches; skip finished ones.

        Returns the parent's row (tagged with ``_upcoming = 1`` when truthy)
        for a match with a missing full-time score, and ``None`` otherwise.
        """
        if sh is not None and sa is not None:
            # Finished match: it is only needed to advance ELO, which the
            # original author notes extract_all does afterwards regardless.
            # Skipping the per-match feature computation is what keeps this
            # run short (original note: ~6h full extraction -> seconds)
            # while the final ELO read by the upcoming targets is unchanged.
            return None

        # Upcoming target. Zero placeholders stand in for the four score
        # arguments so the parent's label/total-goals code has numbers to
        # work with; per the original author those columns are labels and
        # are not among the 133 model features.
        row = super()._extract_one(mid, hid, aid, 0, 0, 0, 0, mst, lid, hn, an, ln)
        if row:
            row["_upcoming"] = 1
        return row
|
||
|
||
|
||
def _fetch_upcoming_targets(conn, lo_ms, hi_ms):
    """Return upcoming football matches in (lo_ms, hi_ms] that have MS odds.

    Each item is ``(mid, hid, aid, mst, lid, hn, an, ln, odds_cache_dict)``.
    Matches whose mapped odds lack ``ms_h`` or ``ms_a`` are dropped.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """
            SELECT lm.id, lm.home_team_id, lm.away_team_id, lm.mst_utc, lm.league_id,
                   ht.name, at.name, l.name, lm.odds
            FROM live_matches lm
            JOIN teams ht ON ht.id = lm.home_team_id
            JOIN teams at ON at.id = lm.away_team_id
            JOIN leagues l ON l.id = lm.league_id
            WHERE lm.sport = 'football'
              AND lm.odds IS NOT NULL
              AND lm.mst_utc > %s AND lm.mst_utc <= %s
            ORDER BY lm.mst_utc ASC
            """,
            (lo_ms, hi_ms),
        )
        fetched = cur.fetchall()
    finally:
        # The cursor is not needed past this point; release it promptly.
        cur.close()

    targets = []
    for mid, hid, aid, mst, lid, hn, an, ln, odds_json in fetched:
        oc = map_live_odds(odds_json)
        if "ms_h" not in oc or "ms_a" not in oc:
            continue  # need MS odds for the policy
        targets.append((mid, hid, aid, mst, lid, hn, an, ln, oc))
    return targets


def main():
    """Extract feature rows for upcoming matches and write them to OUTPUT.

    Steps: select upcoming matches with MS odds (cheap), load the finished
    match history through V27Loader (heavy), append the upcoming matches as
    score-less targets with their odds, run UpcomingExtractor, and write only
    the rows tagged ``_upcoming`` to the CSV. The DB connection is closed on
    every exit path, including errors.
    """
    t0 = time.time()
    conn = get_conn()
    try:
        # ── Cheap check FIRST: are there upcoming matches with odds? ──
        now_ms = int(time.time() * 1000)
        hi_ms = now_ms + DAYS_AHEAD * 24 * 3600 * 1000
        targets = _fetch_upcoming_targets(conn, now_ms, hi_ms)
        print(f"Upcoming NS matches with MS odds (next {DAYS_AHEAD}d): {len(targets)}", flush=True)
        if not targets:
            # NOTE(review): OUTPUT is not touched on this path, so a file
            # written by a previous run stays in place — confirm the picker
            # copes with that.
            print("⚠️ Nothing to extract. Deploy the 4-day window + let the odds cron\n"
                  " populate live_matches, then re-run.")
            return

        print("📦 Loading FT history (ELO/form/league/squad caches; heavy) ...", flush=True)
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" FT matches: {len(loader.matches)}", flush=True)

        for mid, hid, aid, mst, lid, hn, an, ln, oc in targets:
            loader.odds_cache[mid] = oc
            # Score fields are None: that is how UpcomingExtractor tells an
            # upcoming target from a finished match.
            loader.matches.append(
                (mid, hid, aid, None, None, None, None, mst, lid, hn, an, ln)
            )
        # NS targets must be processed AFTER all FT (ELO fully built):
        # order by kickoff time, treating a missing time as 0.
        loader.matches.sort(key=lambda m: m[7] if m[7] is not None else 0)

        print("🔄 Extracting features (FT replay + upcoming targets) ...", flush=True)
        ext = UpcomingExtractor(conn, loader)
        rows = ext.extract_all()
        up_rows = [r for r in rows if r.get("_upcoming")]

        os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            # extrasaction="ignore" drops helper keys such as "_upcoming"
            # that are not part of ALL_COLS.
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction="ignore")
            w.writeheader()
            w.writerows(up_rows)

        with_odds = sum(1 for r in up_rows if r.get("odds_ms_h", 0) and r["odds_ms_h"] > 0)
        print(f"\n✅ Wrote {len(up_rows)} upcoming feature rows ({with_odds} with MS odds) → {OUTPUT}")
        print(f" Time: {(time.time()-t0)/60:.1f} min")
        print(" Next: python scripts/generate_daily_picks.py --features data/upcoming_features.csv --log")
    finally:
        conn.close()
|
||
|
||
|
||
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()
|