# iddaai-be/ai-engine/scripts/extract_upcoming_features.py

"""
Extract Upcoming Features — leak-free feature rows for UPCOMING (NS) matches,
produced by the EXACT same pipeline that built training_data_v27.csv.
=============================================================================
Why this exists: the picker (generate_daily_picks.py) needs the 133 leak-free
features for tomorrow's matches, computed IDENTICALLY to training (any drift =
train/serve skew = broken model). So we reuse V27Loader + V27Extractor verbatim:

  1. load_all() builds ELO / team history / league / squad caches from FT
     matches ONLY (untouched — guarantees identical feature computation).
  2. We then APPEND upcoming NS matches as targets and inject their odds from
     live_matches.odds (all markets, same mapping as the trainer's _load_odds).
  3. extract_all() replays FT chronologically (ELO fully built), then computes
     features for the NS targets at the end. ELO update + labels are guarded
     for null scores (NS has no result yet); the 133 model features never use
     the current score, so they come out identical to training.
  4. Write ONLY the upcoming rows -> data/upcoming_features.csv

Then: generate_daily_picks.py --features data/upcoming_features.csv --log

Run nightly (heavy: full ELO replay, like training). Read-only on the DB.
"""
from __future__ import annotations
import csv
import json
import os
import sys
import time

# Switch stdout to UTF-8 when the stream offers `reconfigure`. The progress
# messages printed by this script contain non-ASCII characters (emoji,
# arrows) — presumably this avoids encoding errors on consoles whose default
# encoding cannot represent them.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass  # best effort: keep whatever encoding stdout already has

# AI_DIR is the directory two levels above this file. It is placed first on
# sys.path so that the `scripts.` import that follows can resolve.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)

from scripts.extract_training_data_v27 import (  # noqa: E402
    V27Loader, V27Extractor, ALL_COLS, get_conn,
)

# Destination CSV for the upcoming-match feature rows written by main().
OUTPUT = os.path.join(AI_DIR, "data", "upcoming_features.csv")
# Look-ahead window, in days, used by main() to bound the kickoff-time filter.
DAYS_AHEAD = 4


def map_live_odds(odds_json) -> dict:
    """Translate a live_matches.odds payload into odds_cache-style keys.

    The category/selection rules are intended to mirror the trainer's
    _load_odds so that odds features match training (per the original
    author's note; the trainer itself is not visible from this file).

    Args:
        odds_json: Either a dict of ``{category: {selection: price}}`` or a
            JSON string encoding such a dict. Anything else yields ``{}``.

    Returns:
        A dict mapping keys such as ``ms_h``, ``ht_ms_d``, ``btts_y`` or
        ``ou25_o`` to float prices. Prices that cannot be converted to float
        or that are ``<= 0`` are skipped, as are unrecognised categories and
        selections. When the same key is produced more than once, the last
        value seen wins.
    """
    # Full-time / first-half result markets: selection text is matched
    # exactly (no case folding, no stripping).
    result_prefix = {"maç sonucu": "ms", "1. yarı sonucu": "ht_ms"}
    result_suffix = {"1": "_h", "0": "_d", "X": "_d", "2": "_a"}
    # Over/under markets: selection is matched by substring ("alt" / "üst").
    totals_prefix = {
        "0,5 alt/üst": "ou05",
        "1,5 alt/üst": "ou15",
        "2,5 alt/üst": "ou25",
        "3,5 alt/üst": "ou35",
        "1. yarı 0,5 alt/üst": "ht_ou05",
        "1. yarı 1,5 alt/üst": "ht_ou15",
    }
    btts_category = "karşılıklı gol"

    payload = odds_json
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except Exception:
            return {}
    if not isinstance(payload, dict):
        return {}

    mapped: dict = {}
    for category, selections in payload.items():
        if not isinstance(selections, dict):
            continue
        cat_key = str(category).lower().strip()
        for selection, raw_price in selections.items():
            try:
                price = float(raw_price)
            except (TypeError, ValueError):
                continue
            if price <= 0:
                continue

            label = str(selection)
            folded = label.lower().strip()
            key = None
            if cat_key in result_prefix:
                suffix = result_suffix.get(label)
                if suffix is not None:
                    key = result_prefix[cat_key] + suffix
            elif cat_key == btts_category:
                # "var" is checked first, so it wins if both words appear.
                if "var" in folded:
                    key = "btts_y"
                elif "yok" in folded:
                    key = "btts_n"
            elif cat_key in totals_prefix:
                # "alt" is checked first, so it wins if both words appear.
                if "alt" in folded:
                    key = totals_prefix[cat_key] + "_u"
                elif "üst" in folded:
                    key = totals_prefix[cat_key] + "_o"

            if key is not None:
                mapped[key] = price
    return mapped


class UpcomingExtractor(V27Extractor):
    """V27Extractor variant that tolerates matches without a result.

    Feature computation is delegated unchanged to the parent class; the two
    overrides only decide *whether* to delegate, based on the scores being
    null (upcoming match) or present (finished match).
    """

    def _update_elo(self, home_id, away_id, score_home, score_away):
        """Forward to the parent ELO update only when both scores are known.

        An upcoming match has no result yet, so it must not move ELO; in that
        case nothing is done and ``None`` is returned.
        """
        has_result = score_home is not None and score_away is not None
        if has_result:
            return super()._update_elo(home_id, away_id, score_home, score_away)
        return None

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln):
        """Build a feature row for upcoming matches only.

        Returns:
            ``None`` for a match that already has both scores, otherwise
            whatever the parent ``_extract_one`` returns for the match with
            all four score arguments replaced by 0; a truthy row is tagged
            with ``_upcoming = 1`` before being returned.
        """
        if sh is not None and sa is not None:
            # Finished match: it only matters for advancing ELO. Per the
            # original author's note, extract_all still calls _update_elo for
            # it, so skipping the per-match feature computation leaves the
            # final ELO unchanged while avoiding most of the runtime
            # (not verifiable from this file — TODO confirm in V27Extractor).
            return None

        # Upcoming target. Placeholder zero scores keep the parent's
        # label/total-goals code from failing on None; per the original
        # author's note those columns are labels and not model features.
        features = super()._extract_one(
            mid, hid, aid, 0, 0, 0, 0, mst, lid, hn, an, ln
        )
        if features:
            features["_upcoming"] = 1
        return features


def _extract_upcoming(conn, t0):
    """Run the extraction on an already-open connection.

    Args:
        conn: Open DB connection as returned by ``get_conn()``. It is NOT
            closed here; the caller owns it.
        t0: ``time.time()`` value taken at script start, used only for the
            elapsed-time message.
    """
    # ── Cheap check FIRST: are there upcoming matches with odds? ──
    # NOTE(review): the window arithmetic assumes live_matches.mst_utc holds
    # epoch milliseconds — confirm against the schema.
    now_ms = int(time.time() * 1000)
    hi_ms = now_ms + DAYS_AHEAD * 24 * 3600 * 1000
    cur = conn.cursor()
    try:
        cur.execute(
            """
        SELECT lm.id, lm.home_team_id, lm.away_team_id, lm.mst_utc, lm.league_id,
               ht.name, at.name, l.name, lm.odds
        FROM live_matches lm
        JOIN teams ht ON ht.id = lm.home_team_id
        JOIN teams at ON at.id = lm.away_team_id
        JOIN leagues l ON l.id = lm.league_id
        WHERE lm.sport = 'football'
          AND lm.odds IS NOT NULL
          AND lm.mst_utc > %s AND lm.mst_utc <= %s
        ORDER BY lm.mst_utc ASC
        """,
            (now_ms, hi_ms),
        )
        upcoming = cur.fetchall()
    finally:
        # The rows are already materialised; release the cursor right away.
        cur.close()

    targets = []
    for mid, hid, aid, mst, lid, hn, an, ln, odds_json in upcoming:
        oc = map_live_odds(odds_json)
        if "ms_h" not in oc or "ms_a" not in oc:
            continue  # need MS odds for the policy
        targets.append((mid, hid, aid, mst, lid, hn, an, ln, oc))
    print(f"Upcoming NS matches with MS odds (next {DAYS_AHEAD}d): {len(targets)}", flush=True)
    if not targets:
        # NOTE(review): on this path OUTPUT is left untouched, so a CSV from
        # a previous run may remain on disk — confirm the picker tolerates it.
        print("⚠️ Nothing to extract. Deploy the 4-day window + let the odds cron\n"
              "   populate live_matches, then re-run.")
        return

    print("📦 Loading FT history (ELO/form/league/squad caches; heavy) ...", flush=True)
    loader = V27Loader(conn)
    loader.load_all()
    loader.load_league_matches()
    print(f"  FT matches: {len(loader.matches)}", flush=True)

    # Register each target's odds and append it as a match tuple with the
    # four score slots set to None (no result yet).
    for mid, hid, aid, mst, lid, hn, an, ln, oc in targets:
        loader.odds_cache[mid] = oc
        loader.matches.append(
            (mid, hid, aid, None, None, None, None, mst, lid, hn, an, ln)
        )
    # NS targets must be processed AFTER all FT (ELO fully built). Index 7 is
    # the kickoff time; a missing kickoff time sorts first.
    loader.matches.sort(key=lambda m: m[7] if m[7] is not None else 0)

    print("🔄 Extracting features (FT replay + upcoming targets) ...", flush=True)
    ext = UpcomingExtractor(conn, loader)
    rows = ext.extract_all()
    up_rows = [r for r in rows if r.get("_upcoming")]

    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore" drops helper keys such as "_upcoming" that
        # are not listed in ALL_COLS.
        w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction="ignore")
        w.writeheader()
        w.writerows(up_rows)

    with_odds = sum(1 for r in up_rows if (r.get("odds_ms_h") or 0) > 0)
    print(f"\n✅ Wrote {len(up_rows)} upcoming feature rows ({with_odds} with MS odds) → {OUTPUT}")
    print(f"   Time: {(time.time()-t0)/60:.1f} min")
    print("   Next: python scripts/generate_daily_picks.py --features data/upcoming_features.csv --log")


def main():
    """Extract feature rows for upcoming matches and write them to OUTPUT.

    Steps: query live_matches for football matches kicking off within the
    next DAYS_AHEAD days that carry odds; keep those with both home and away
    match-result odds; load history through V27Loader; append the upcoming
    matches as score-less targets; run UpcomingExtractor; write the rows
    flagged ``_upcoming`` to OUTPUT as CSV with the ALL_COLS header.

    If no upcoming match qualifies, a notice is printed and nothing is
    written. The DB connection is closed on every exit path, including when
    loading, extraction or the file write raises.
    """
    t0 = time.time()
    conn = get_conn()
    try:
        _extract_upcoming(conn, t0)
    finally:
        conn.close()


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()