@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Extract Upcoming Features — leak-free feature rows for UPCOMING (NS) matches,
|
||||
produced by the EXACT same pipeline that built training_data_v27.csv.
|
||||
=============================================================================
|
||||
Why this exists: the picker (generate_daily_picks.py) needs the 133 leak-free
|
||||
features for tomorrow's matches, computed IDENTICALLY to training (any drift =
|
||||
train/serve skew = broken model). So we reuse V27Loader + V27Extractor verbatim:
|
||||
|
||||
1. load_all() builds ELO / team history / league / squad caches from FT
|
||||
matches ONLY (untouched — guarantees identical feature computation).
|
||||
2. We then APPEND upcoming NS matches as targets and inject their odds from
|
||||
live_matches.odds (all markets, same mapping as the trainer's _load_odds).
|
||||
3. extract_all() replays FT chronologically (ELO fully built), then computes
|
||||
features for the NS targets at the end. ELO update + labels are guarded
|
||||
for null scores (NS has no result yet); the 133 model features never use
|
||||
the current score, so they come out identical to training.
|
||||
4. Write ONLY the upcoming rows -> data/upcoming_features.csv
|
||||
|
||||
Then: generate_daily_picks.py --features data/upcoming_features.csv --log
|
||||
|
||||
Run nightly (heavy: full ELO replay, like training). Read-only on the DB.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Best-effort: switch stdout to UTF-8 before anything is printed. The status
# messages in this script contain emoji and arrows; presumably this avoids
# encoding errors on consoles with a narrower default encoding (confirm).
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        # Deliberately ignored: a stream that cannot be reconfigured must not
        # stop the extraction from running.
        pass
|
||||
|
||||
# Directory two levels above this file. It is put first on sys.path so the
# `scripts.` package import below resolves when this file is run as a script.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)

from scripts.extract_training_data_v27 import (  # noqa: E402
    V27Loader, V27Extractor, ALL_COLS, get_conn,
)

# Destination CSV for the upcoming-match feature rows.
OUTPUT = os.path.join(AI_DIR, "data", "upcoming_features.csv")
# Size of the look-ahead window, in days, used when selecting upcoming matches.
DAYS_AHEAD = 4
|
||||
|
||||
|
||||
# 1X2-style markets: the selection label must match exactly (no case folding,
# no stripping). Both "0" and "X" denote the draw.
_RESULT_MARKETS = {
    "maç sonucu": {"1": "ms_h", "0": "ms_d", "X": "ms_d", "2": "ms_a"},
    "1. yarı sonucu": {"1": "ht_ms_h", "0": "ht_ms_d", "X": "ht_ms_d", "2": "ht_ms_a"},
}

# Keyword markets: the lower-cased, stripped selection label is searched for
# each needle in order and the FIRST hit wins.
_KEYWORD_MARKETS = {
    "karşılıklı gol": (("var", "btts_y"), ("yok", "btts_n")),
    "0,5 alt/üst": (("alt", "ou05_u"), ("üst", "ou05_o")),
    "1,5 alt/üst": (("alt", "ou15_u"), ("üst", "ou15_o")),
    "2,5 alt/üst": (("alt", "ou25_u"), ("üst", "ou25_o")),
    "3,5 alt/üst": (("alt", "ou35_u"), ("üst", "ou35_o")),
    "1. yarı 0,5 alt/üst": (("alt", "ht_ou05_u"), ("üst", "ht_ou05_o")),
    "1. yarı 1,5 alt/üst": (("alt", "ht_ou15_u"), ("üst", "ht_ou15_o")),
}


def map_live_odds(odds_json) -> dict:
    """Flatten a ``live_matches.odds`` payload into odds-cache keys.

    The category/selection rules are meant to mirror the trainer's
    ``_load_odds`` so that odds features match training; that function is not
    part of this file, so keep the two in sync by hand.

    Args:
        odds_json: Either a JSON string or an already-decoded dict of the form
            ``{category: {selection: price}}``.

    Returns:
        A dict mapping keys such as ``"ms_h"`` or ``"ou25_o"`` to float prices.
        Empty when the payload is undecodable or not a dict. Categories whose
        value is not a dict, unknown categories, unparsable prices and
        non-positive prices are skipped.
    """
    payload = odds_json
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except Exception:
            return {}
    if not isinstance(payload, dict):
        return {}

    mapped: dict = {}
    for market, selections in payload.items():
        if not isinstance(selections, dict):
            continue
        market_key = str(market).lower().strip()
        exact_labels = _RESULT_MARKETS.get(market_key)
        keyword_rules = _KEYWORD_MARKETS.get(market_key)

        for selection, raw_price in selections.items():
            try:
                price = float(raw_price)
            except (TypeError, ValueError):
                continue
            if price <= 0:
                continue

            label = str(selection)
            if exact_labels is not None:
                target = exact_labels.get(label)
                if target is not None:
                    mapped[target] = price
            elif keyword_rules is not None:
                folded = label.lower().strip()
                for needle, target in keyword_rules:
                    if needle in folded:
                        mapped[target] = price
                        break
    return mapped
|
||||
|
||||
|
||||
class UpcomingExtractor(V27Extractor):
    """V27Extractor variant that tolerates matches without a result.

    Feature computation itself is delegated to the parent class unchanged;
    the overrides only decide WHEN the parent is invoked:

    * matches with a null score (upcoming targets) get a feature row but do
      not move ELO;
    * matches with a score (finished) move ELO but get no feature row.
    """

    def _update_elo(self, home_id, away_id, score_home, score_away):
        """Delegate to the parent ELO update only when both scores are known."""
        has_result = score_home is not None and score_away is not None
        if has_result:
            return super()._update_elo(home_id, away_id, score_home, score_away)
        # Upcoming match: there is no result yet, so ratings stay put.
        return None

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln):
        """Build a feature row for an upcoming match; return None for finished ones.

        Returns:
            The parent's row with ``"_upcoming": 1`` added when ``sh`` or
            ``sa`` is None (whatever falsy value the parent returned is passed
            through unchanged), otherwise ``None``.
        """
        is_finished = sh is not None and sa is not None
        if is_finished:
            # Finished matches are replayed only so ELO advances; the costly
            # per-match feature work is skipped on purpose.
            # NOTE(review): relies on extract_all() still calling _update_elo
            # when this returns None. The parent is not visible here, so
            # confirm, and confirm no other state update is skipped with it.
            return None

        # Upcoming target: zeros stand in for the unknown full-time and
        # half-time scores so label-style columns can be computed without
        # failing. Per the original author's note those columns are not among
        # the model features. NOTE(review): verify against ALL_COLS and the
        # model's feature list.
        features = super()._extract_one(mid, hid, aid, 0, 0, 0, 0, mst, lid, hn, an, ln)
        if features:
            features["_upcoming"] = 1
        return features
|
||||
|
||||
|
||||
def _fetch_upcoming_targets(conn):
    """Return upcoming football matches in the look-ahead window with MS odds.

    Selects ``live_matches`` rows whose ``mst_utc`` (compared against
    epoch-millisecond bounds) lies in ``(now, now + DAYS_AHEAD days]`` and
    whose ``odds`` column is not NULL, then keeps only those for which
    ``map_live_odds`` yields both ``ms_h`` and ``ms_a``.

    Returns:
        A list of ``(mid, hid, aid, mst, lid, hn, an, ln, odds_dict)`` tuples
        ordered by kickoff time.
    """
    now_ms = int(time.time() * 1000)
    hi_ms = now_ms + DAYS_AHEAD * 24 * 3600 * 1000

    cur = conn.cursor()
    try:
        cur.execute(
            """
            SELECT lm.id, lm.home_team_id, lm.away_team_id, lm.mst_utc, lm.league_id,
                   ht.name, at.name, l.name, lm.odds
            FROM live_matches lm
            JOIN teams ht ON ht.id = lm.home_team_id
            JOIN teams at ON at.id = lm.away_team_id
            JOIN leagues l ON l.id = lm.league_id
            WHERE lm.sport = 'football'
              AND lm.odds IS NOT NULL
              AND lm.mst_utc > %s AND lm.mst_utc <= %s
            ORDER BY lm.mst_utc ASC
            """,
            (now_ms, hi_ms),
        )
        upcoming = cur.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cur.close()

    targets = []
    for mid, hid, aid, mst, lid, hn, an, ln, odds_json in upcoming:
        oc = map_live_odds(odds_json)
        if "ms_h" not in oc or "ms_a" not in oc:
            continue  # need MS odds for the policy
        targets.append((mid, hid, aid, mst, lid, hn, an, ln, oc))
    return targets


def main():
    """Extract feature rows for upcoming matches and write them to OUTPUT.

    Steps:
      1. Cheap query first: find upcoming matches that carry MS odds; stop
         early (leaving any existing OUTPUT file untouched) when none exist.
      2. Load the finished-match history through V27Loader.
      3. Append the upcoming matches as score-less targets, inject their odds
         into the loader's odds cache, and run UpcomingExtractor.
      4. Write only the rows flagged ``_upcoming`` to OUTPUT as CSV, with
         ALL_COLS as the header.

    The database connection is closed on every exit path, including errors.
    """
    t0 = time.time()
    conn = get_conn()
    try:
        # ── Cheap check FIRST: are there upcoming matches with odds? ──
        targets = _fetch_upcoming_targets(conn)
        print(f"Upcoming NS matches with MS odds (next {DAYS_AHEAD}d): {len(targets)}", flush=True)
        if not targets:
            print("⚠️ Nothing to extract. Deploy the 4-day window + let the odds cron\n"
                  " populate live_matches, then re-run.")
            return

        print("📦 Loading FT history (ELO/form/league/squad caches; heavy) ...", flush=True)
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" FT matches: {len(loader.matches)}", flush=True)

        for mid, hid, aid, mst, lid, hn, an, ln, oc in targets:
            loader.odds_cache[mid] = oc
            # Scores and half-time scores are None: no result exists yet.
            loader.matches.append(
                (mid, hid, aid, None, None, None, None, mst, lid, hn, an, ln)
            )
        # NS targets must be processed AFTER all FT (ELO fully built). Their
        # kickoff is in the future, so a sort on kickoff time (index 7) puts
        # them last; the sort is stable, so ties keep targets after FT rows.
        loader.matches.sort(key=lambda m: m[7] if m[7] is not None else 0)

        print("🔄 Extracting features (FT replay + upcoming targets) ...", flush=True)
        ext = UpcomingExtractor(conn, loader)
        rows = ext.extract_all()
        up_rows = [r for r in rows if r.get("_upcoming")]

        # Surface targets that silently produced no feature row.
        if len(up_rows) != len(targets):
            print(f"⚠️ Only {len(up_rows)} of {len(targets)} targets produced a feature row.",
                  flush=True)

        os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            # extrasaction="ignore" drops helper keys such as "_upcoming".
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction="ignore")
            w.writeheader()
            w.writerows(up_rows)

        with_odds = sum(1 for r in up_rows if r.get("odds_ms_h", 0) and r["odds_ms_h"] > 0)
        print(f"\n✅ Wrote {len(up_rows)} upcoming feature rows ({with_odds} with MS odds) → {OUTPUT}")
        print(f" Time: {(time.time()-t0)/60:.1f} min")
        print(" Next: python scripts/generate_daily_picks.py --features data/upcoming_features.csv --log")
    finally:
        # Previously the connection leaked if anything above raised.
        conn.close()
|
||||
|
||||
|
||||
# Script entry point: run the extraction only when executed directly, not
# when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user