""" Extract Upcoming Features — leak-free feature rows for UPCOMING (NS) matches, produced by the EXACT same pipeline that built training_data_v27.csv. ============================================================================= Why this exists: the picker (generate_daily_picks.py) needs the 133 leak-free features for tomorrow's matches, computed IDENTICALLY to training (any drift = train/serve skew = broken model). So we reuse V27Loader + V27Extractor verbatim: 1. load_all() builds ELO / team history / league / squad caches from FT matches ONLY (untouched — guarantees identical feature computation). 2. We then APPEND upcoming NS matches as targets and inject their odds from live_matches.odds (all markets, same mapping as the trainer's _load_odds). 3. extract_all() replays FT chronologically (ELO fully built), then computes features for the NS targets at the end. ELO update + labels are guarded for null scores (NS has no result yet); the 133 model features never use the current score, so they come out identical to training. 4. Write ONLY the upcoming rows -> data/upcoming_features.csv Then: generate_daily_picks.py --features data/upcoming_features.csv --log Run nightly (heavy: full ELO replay, like training). Read-only on the DB. """ from __future__ import annotations import csv import json import os import sys import time if sys.stdout and hasattr(sys.stdout, "reconfigure"): try: sys.stdout.reconfigure(encoding="utf-8") except Exception: pass AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, AI_DIR) from scripts.extract_training_data_v27 import ( # noqa: E402 V27Loader, V27Extractor, ALL_COLS, get_conn, ) OUTPUT = os.path.join(AI_DIR, "data", "upcoming_features.csv") DAYS_AHEAD = 4 def map_live_odds(odds_json) -> dict: """Map live_matches.odds JSON → odds_cache keys, IDENTICAL to the trainer's _load_odds category/selection logic (so odds features match training).""" out: dict = {} if isinstance(odds_json, str): try: odds_json = json.loads(odds_json) except Exception: return out if not isinstance(odds_json, dict): return out for cat, sels in odds_json.items(): if not isinstance(sels, dict): continue c = str(cat).lower().strip() for sel, val in sels.items(): try: v = float(val) except (TypeError, ValueError): continue if v <= 0: continue sn = str(sel) s = sn.lower().strip() if c == "maç sonucu": if sn == "1": out["ms_h"] = v elif sn in ("0", "X"): out["ms_d"] = v elif sn == "2": out["ms_a"] = v elif c == "1. yarı sonucu": if sn == "1": out["ht_ms_h"] = v elif sn in ("0", "X"): out["ht_ms_d"] = v elif sn == "2": out["ht_ms_a"] = v elif c == "karşılıklı gol": if "var" in s: out["btts_y"] = v elif "yok" in s: out["btts_n"] = v elif c == "0,5 alt/üst": if "alt" in s: out["ou05_u"] = v elif "üst" in s: out["ou05_o"] = v elif c == "1,5 alt/üst": if "alt" in s: out["ou15_u"] = v elif "üst" in s: out["ou15_o"] = v elif c == "2,5 alt/üst": if "alt" in s: out["ou25_u"] = v elif "üst" in s: out["ou25_o"] = v elif c == "3,5 alt/üst": if "alt" in s: out["ou35_u"] = v elif "üst" in s: out["ou35_o"] = v elif c == "1. yarı 0,5 alt/üst": if "alt" in s: out["ht_ou05_u"] = v elif "üst" in s: out["ht_ou05_o"] = v elif c == "1. yarı 1,5 alt/üst": if "alt" in s: out["ht_ou15_u"] = v elif "üst" in s: out["ht_ou15_o"] = v return out class UpcomingExtractor(V27Extractor): """Same feature computation as training; only guards null (NS) scores.""" def _update_elo(self, home_id, away_id, score_home, score_away): if score_home is None or score_away is None: return # upcoming match — no result, don't move ELO return super()._update_elo(home_id, away_id, score_home, score_away) def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln): if sh is None or sa is None: # Upcoming TARGET. Dummy scores so label/total_goals don't crash; # those columns are labels/LEAKY and are NOT among the 133 model # features, so the served feature vector is identical to training. row = super()._extract_one(mid, hid, aid, 0, 0, 0, 0, mst, lid, hn, an, ln) if row: row["_upcoming"] = 1 return row # FT match: needed ONLY to advance ELO (extract_all calls _update_elo # afterwards regardless). Skip the expensive per-match feature # computation — that turns a ~6h full extraction into seconds while # producing the IDENTICAL final ELO the upcoming targets read. return None def main(): t0 = time.time() conn = get_conn() # ── Cheap check FIRST: are there upcoming matches with odds? ── now_ms = int(time.time() * 1000) hi_ms = now_ms + DAYS_AHEAD * 24 * 3600 * 1000 cur = conn.cursor() cur.execute( """ SELECT lm.id, lm.home_team_id, lm.away_team_id, lm.mst_utc, lm.league_id, ht.name, at.name, l.name, lm.odds FROM live_matches lm JOIN teams ht ON ht.id = lm.home_team_id JOIN teams at ON at.id = lm.away_team_id JOIN leagues l ON l.id = lm.league_id WHERE lm.sport = 'football' AND lm.odds IS NOT NULL AND lm.mst_utc > %s AND lm.mst_utc <= %s ORDER BY lm.mst_utc ASC """, (now_ms, hi_ms), ) upcoming = cur.fetchall() targets = [] for mid, hid, aid, mst, lid, hn, an, ln, odds_json in upcoming: oc = map_live_odds(odds_json) if "ms_h" not in oc or "ms_a" not in oc: continue # need MS odds for the policy targets.append((mid, hid, aid, mst, lid, hn, an, ln, oc)) print(f"Upcoming NS matches with MS odds (next {DAYS_AHEAD}d): {len(targets)}", flush=True) if not targets: print("⚠️ Nothing to extract. Deploy the 4-day window + let the odds cron\n" " populate live_matches, then re-run.") conn.close() return print("📦 Loading FT history (ELO/form/league/squad caches; heavy) ...", flush=True) loader = V27Loader(conn) loader.load_all() loader.load_league_matches() print(f" FT matches: {len(loader.matches)}", flush=True) for mid, hid, aid, mst, lid, hn, an, ln, oc in targets: loader.odds_cache[mid] = oc loader.matches.append( (mid, hid, aid, None, None, None, None, mst, lid, hn, an, ln) ) # NS targets must be processed AFTER all FT (ELO fully built) loader.matches.sort(key=lambda m: m[7] if m[7] is not None else 0) added = len(targets) print("🔄 Extracting features (FT replay + upcoming targets) ...", flush=True) ext = UpcomingExtractor(conn, loader) rows = ext.extract_all() up_rows = [r for r in rows if r.get("_upcoming")] os.makedirs(os.path.dirname(OUTPUT), exist_ok=True) with open(OUTPUT, "w", newline="", encoding="utf-8") as f: w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction="ignore") w.writeheader() w.writerows(up_rows) with_odds = sum(1 for r in up_rows if r.get("odds_ms_h", 0) and r["odds_ms_h"] > 0) print(f"\n✅ Wrote {len(up_rows)} upcoming feature rows ({with_odds} with MS odds) → {OUTPUT}") print(f" Time: {(time.time()-t0)/60:.1f} min") print(" Next: python scripts/generate_daily_picks.py --features data/upcoming_features.csv --log") conn.close() if __name__ == "__main__": main()