Files
iddaai-be/ai-engine/scripts/extract_upcoming_features.py
T
fahricansecer c338aba1c0
Deploy Iddaai Backend / build-and-deploy (push) Successful in 1m5s
gg4
2026-06-07 15:17:08 +03:00

199 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Extract Upcoming Features — leak-free feature rows for UPCOMING (NS) matches,
produced by the EXACT same pipeline that built training_data_v27.csv.
=============================================================================
Why this exists: the picker (generate_daily_picks.py) needs the 133 leak-free
features for tomorrow's matches, computed IDENTICALLY to training (any drift =
train/serve skew = broken model). So we reuse V27Loader + V27Extractor verbatim:
1. load_all() builds ELO / team history / league / squad caches from FT
matches ONLY (untouched — guarantees identical feature computation).
2. We then APPEND upcoming NS matches as targets and inject their odds from
live_matches.odds (all markets, same mapping as the trainer's _load_odds).
3. extract_all() replays FT chronologically (ELO fully built), then computes
features for the NS targets at the end. ELO update + labels are guarded
for null scores (NS has no result yet); the 133 model features never use
the current score, so they come out identical to training.
4. Write ONLY the upcoming rows -> data/upcoming_features.csv
Then: generate_daily_picks.py --features data/upcoming_features.csv --log
Run nightly (heavy: full ELO replay, like training). Read-only on the DB.
"""
from __future__ import annotations
import csv
import json
import os
import sys
import time
# Best-effort switch of stdout to UTF-8 (presumably so the emoji and Turkish
# text printed by this script survive consoles with a legacy encoding).
# Any failure is ignored on purpose: the script must still run.
if sys.stdout:
    if hasattr(sys.stdout, "reconfigure"):
        try:
            sys.stdout.reconfigure(encoding="utf-8")
        except Exception:
            pass
# Parent of the directory that contains this file (two dirname() hops up from
# the absolute path of this script).
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put it first on sys.path so the `scripts.` package import below resolves
# regardless of the current working directory.
sys.path.insert(0, AI_DIR)
from scripts.extract_training_data_v27 import ( # noqa: E402
V27Loader, V27Extractor, ALL_COLS, get_conn,
)
# Destination CSV for the upcoming-match feature rows: <AI_DIR>/data/upcoming_features.csv
OUTPUT = os.path.join(AI_DIR, "data", "upcoming_features.csv")
# Look-ahead window in days; main() turns it into the upper kickoff bound
# (now + DAYS_AHEAD * 24h, in milliseconds) of the live_matches query.
DAYS_AHEAD = 4
# 1X2-style markets: normalised market name -> odds_cache key prefix.
_RESULT_MARKETS = {
    "maç sonucu": "ms",
    "1. yarı sonucu": "ht_ms",
}

# 1X2 selections are matched on the RAW label (no lower/strip); the draw may be
# labelled either "0" or "X".
_RESULT_SELECTIONS = {"1": "h", "0": "d", "X": "d", "2": "a"}

# Ordered (substring, key suffix) pairs; the first substring found in the
# lower-cased, stripped selection label wins.
_YES_NO = (("var", "y"), ("yok", "n"))
_UNDER_OVER = (("alt", "u"), ("üst", "o"))

# Two-way markets: normalised market name -> (key prefix, selection rules).
_TWO_WAY_MARKETS = {
    "karşılıklı gol": ("btts", _YES_NO),
    "0,5 alt/üst": ("ou05", _UNDER_OVER),
    "1,5 alt/üst": ("ou15", _UNDER_OVER),
    "2,5 alt/üst": ("ou25", _UNDER_OVER),
    "3,5 alt/üst": ("ou35", _UNDER_OVER),
    "1. yarı 0,5 alt/üst": ("ht_ou05", _UNDER_OVER),
    "1. yarı 1,5 alt/üst": ("ht_ou15", _UNDER_OVER),
}


def map_live_odds(odds_json) -> dict:
    """Map a live_matches.odds payload to flat odds_cache keys.

    The category/selection rules are meant to mirror the trainer's
    ``_load_odds`` (not visible in this file) so odds features match training.

    Args:
        odds_json: Either a ``{market: {selection: price}}`` dict or its JSON
            text (``str``/``bytes``/``bytearray``). Anything else yields ``{}``.

    Returns:
        Dict such as ``{"ms_h": 1.85, "ou25_o": 1.9, ...}`` holding only
        recognised market/selection pairs whose price is a finite number > 0.
        Unparseable JSON, unknown markets, and bad prices are skipped silently.
    """
    out: dict = {}
    if isinstance(odds_json, (str, bytes, bytearray)):
        try:
            odds_json = json.loads(odds_json)
        except (ValueError, RecursionError):
            # Malformed / undecodable / absurdly nested payload: treat as "no odds".
            return out
    if not isinstance(odds_json, dict):
        return out

    inf = float("inf")
    for cat, sels in odds_json.items():
        if not isinstance(sels, dict):
            continue
        market = str(cat).lower().strip()
        result_prefix = _RESULT_MARKETS.get(market)
        two_way = _TWO_WAY_MARKETS.get(market)
        if result_prefix is None and two_way is None:
            continue  # unmapped market: nothing to emit

        for sel, val in sels.items():
            try:
                price = float(val)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: an int too large for float must not abort the
                # whole mapping; skip just that selection.
                continue
            # Chained comparison is False for NaN, so this rejects NaN and
            # +inf (both accepted by json.loads) as well as prices <= 0.
            if not 0 < price < inf:
                continue

            label = str(sel)
            if result_prefix is not None:
                suffix = _RESULT_SELECTIONS.get(label)
                if suffix is not None:
                    out[f"{result_prefix}_{suffix}"] = price
                continue

            prefix, rules = two_way
            folded = label.lower().strip()
            for needle, suffix in rules:
                if needle in folded:
                    out[f"{prefix}_{suffix}"] = price
                    break
    return out
class UpcomingExtractor(V27Extractor):
    """V27Extractor variant that tolerates matches without a final score.

    Feature computation is delegated to the parent class unchanged; the two
    overrides only decide, based on whether scores are present, when the
    parent is invoked.
    """

    def _update_elo(self, home_id, away_id, score_home, score_away):
        """Forward to the parent ELO update unless a score is missing."""
        has_result = score_home is not None and score_away is not None
        if not has_result:
            # Upcoming match: there is no result, so ratings must not move.
            return None
        return super()._update_elo(home_id, away_id, score_home, score_away)

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln):
        """Return a feature row for an unscored match, ``None`` for a scored one.

        Scored (FT) matches are skipped here on purpose: per the original
        author's note they are needed only to advance ELO, which extract_all
        is said to do through _update_elo regardless of this return value, so
        skipping the per-match feature work saves most of the runtime.
        """
        if sh is not None and sa is not None:
            return None

        # Upcoming target. Zeros stand in for the four score arguments so the
        # parent's label computation has numbers to work with; the original
        # note states those label columns are not among the model features.
        features = super()._extract_one(mid, hid, aid, 0, 0, 0, 0, mst, lid, hn, an, ln)
        if features:
            # Marker used by main() to pick out the upcoming rows.
            features["_upcoming"] = 1
        return features
def _fetch_upcoming_targets(conn, lo_ms, hi_ms):
    """Return upcoming football matches that carry both MS home and away odds.

    Args:
        conn: Open DB connection (as returned by get_conn()).
        lo_ms: Exclusive lower kickoff bound, compared against lm.mst_utc.
        hi_ms: Inclusive upper kickoff bound, compared against lm.mst_utc.

    Returns:
        List of ``(mid, hid, aid, mst, lid, hn, an, ln, odds_dict)`` tuples in
        ascending kickoff order.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """
            SELECT lm.id, lm.home_team_id, lm.away_team_id, lm.mst_utc, lm.league_id,
                   ht.name, at.name, l.name, lm.odds
            FROM live_matches lm
            JOIN teams ht ON ht.id = lm.home_team_id
            JOIN teams at ON at.id = lm.away_team_id
            JOIN leagues l ON l.id = lm.league_id
            WHERE lm.sport = 'football'
              AND lm.odds IS NOT NULL
              AND lm.mst_utc > %s AND lm.mst_utc <= %s
            ORDER BY lm.mst_utc ASC
            """,
            (lo_ms, hi_ms),
        )
        upcoming = cur.fetchall()
    finally:
        # The original never closed the cursor; release it once rows are read.
        cur.close()

    targets = []
    for mid, hid, aid, mst, lid, hn, an, ln, odds_json in upcoming:
        oc = map_live_odds(odds_json)
        if "ms_h" not in oc or "ms_a" not in oc:
            continue  # need MS odds for the policy
        targets.append((mid, hid, aid, mst, lid, hn, an, ln, oc))
    return targets


def main():
    """Build feature rows for upcoming matches and write them to OUTPUT.

    Steps: query upcoming matches with odds (cheap, done first so the heavy
    load is skipped when there is nothing to do), load the FT history through
    V27Loader, append the upcoming matches with null scores, run
    UpcomingExtractor, and write only the rows flagged ``_upcoming`` to the
    CSV. The connection is closed on every exit path, including errors.
    """
    t0 = time.time()
    conn = get_conn()
    try:
        # ── Cheap check FIRST: are there upcoming matches with odds? ──
        now_ms = int(time.time() * 1000)
        hi_ms = now_ms + DAYS_AHEAD * 24 * 3600 * 1000
        targets = _fetch_upcoming_targets(conn, now_ms, hi_ms)
        print(f"Upcoming NS matches with MS odds (next {DAYS_AHEAD}d): {len(targets)}", flush=True)
        if not targets:
            print("⚠️ Nothing to extract. Deploy the 4-day window + let the odds cron\n"
                  " populate live_matches, then re-run.")
            return

        print("📦 Loading FT history (ELO/form/league/squad caches; heavy) ...", flush=True)
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" FT matches: {len(loader.matches)}", flush=True)

        for mid, hid, aid, mst, lid, hn, an, ln, oc in targets:
            loader.odds_cache[mid] = oc
            # Same 12-field layout UpcomingExtractor._extract_one receives;
            # the four score fields are None to mark the match as upcoming.
            loader.matches.append(
                (mid, hid, aid, None, None, None, None, mst, lid, hn, an, ln)
            )
        # NS targets must be processed AFTER all FT (ELO fully built).
        # Sorting by kickoff (index 7) achieves that as long as every upcoming
        # kickoff is later than every FT kickoff; missing kickoffs sort first.
        loader.matches.sort(key=lambda m: m[7] if m[7] is not None else 0)

        print("🔄 Extracting features (FT replay + upcoming targets) ...", flush=True)
        ext = UpcomingExtractor(conn, loader)
        rows = ext.extract_all()
        up_rows = [r for r in rows if r.get("_upcoming")]

        os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            # extrasaction="ignore" drops helper keys such as "_upcoming"
            # that are not listed in ALL_COLS.
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction="ignore")
            w.writeheader()
            w.writerows(up_rows)

        with_odds = sum(1 for r in up_rows if r.get("odds_ms_h", 0) and r["odds_ms_h"] > 0)
        print(f"\n✅ Wrote {len(up_rows)} upcoming feature rows ({with_odds} with MS odds) → {OUTPUT}")
        print(f" Time: {(time.time()-t0)/60:.1f} min")
        print(" Next: python scripts/generate_daily_picks.py --features data/upcoming_features.csv --log")
    finally:
        # Previously the connection leaked whenever loading, extraction or the
        # CSV write raised; close it unconditionally.
        conn.close()


# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()