# iddaai-be/ai-engine/scripts/extract_training_data_v26.py

from __future__ import annotations

import json
from pathlib import Path

import pandas as pd


# Directory two levels above this file (the parent of the folder holding this script).
AI_ENGINE_DIR = Path(__file__).resolve().parents[1]
# Input CSV that main() reads.
SOURCE_CSV = AI_ENGINE_DIR / "data" / "training_data.csv"
# Output directory that receives the split CSVs and dataset_meta.json.
TARGET_DIR = AI_ENGINE_DIR / "data" / "v26_shadow"
# NOTE: executed at import time, so importing this module creates the directory.
TARGET_DIR.mkdir(parents=True, exist_ok=True)


def _rolling_windows(frame: pd.DataFrame) -> list[dict[str, int]]:
    """Describe up to three expanding train/test windows ordered by ``mst_utc``.

    Rows are ranked by ascending ``mst_utc``. For each cut fraction
    (0.55, 0.70, 0.85) the training part ends just before the cut position and
    the test part starts at the cut and extends by about 10% of the rows
    (at least one), clamped to the last row.

    Args:
        frame: Data containing an ``mst_utc`` column.

    Returns:
        One dict per window with the keys ``window`` (1-based cut number),
        ``train_end_ix``, ``test_start_ix``, ``test_end_ix`` (positions in the
        sorted order, all inclusive) and ``train_end_mst_utc`` /
        ``test_end_mst_utc`` (the ``mst_utc`` values at those positions).
        A window is omitted when no row is left for its test part, so an empty
        or single-row frame yields an empty list.

    Raises:
        KeyError: If ``frame`` has no ``mst_utc`` column.
    """
    # Work on the column itself. Fetching a whole row via ``iloc[i]`` upcasts
    # mixed int/float rows to float64, which silently rounds integer
    # timestamps larger than 2**53.
    stamps = frame["mst_utc"].sort_values().reset_index(drop=True)
    size = len(stamps)
    test_span = max(int(size * 0.10), 1)

    windows: list[dict[str, int]] = []
    for idx, cut in enumerate((0.55, 0.7, 0.85), start=1):
        end_ix = max(int(size * cut), 1)
        if end_ix >= size:
            # The test part would start past the last row; skip the window
            # instead of reporting an out-of-range test_start_ix.
            continue
        test_end = min(size - 1, end_ix + test_span)
        windows.append(
            {
                "window": idx,
                "train_end_ix": end_ix - 1,
                "test_start_ix": end_ix,
                "test_end_ix": test_end,
                "train_end_mst_utc": int(stamps.iloc[end_ix - 1]),
                "test_end_mst_utc": int(stamps.iloc[test_end]),
            }
        )
    return windows


def _column_or_zeros(frame: pd.DataFrame, name: str) -> pd.Series:
    """Return ``frame[name]`` with NaN replaced by 0, or all zeros if the column is absent."""
    if name in frame.columns:
        return frame[name].fillna(0)
    return pd.Series(0.0, index=frame.index)


def _split_bounds(size: int) -> tuple[int, int]:
    """Return ``(train_end, validation_end)`` slice bounds for ``size`` ordered rows."""
    train_end = max(int(size * 0.70), 1)
    validation_end = max(int(size * 0.85), train_end + 1)
    # Keep at least one row for the holdout slice when there are enough rows.
    validation_end = min(validation_end, size - 1)
    # Never let the holdout slice start before the training slice ends;
    # otherwise a single-row input lands in both train and holdout.
    validation_end = max(validation_end, train_end)
    return train_end, validation_end


def main() -> None:
    """Split the source CSV chronologically and write the V26 dataset files.

    Reads ``SOURCE_CSV``, sorts it by ``mst_utc``, adds the derived columns
    listed under ``derived_columns`` in the metadata, and writes ``train.csv``,
    ``validation.csv``, ``holdout.csv`` and ``dataset_meta.json`` into
    ``TARGET_DIR``.

    Raises:
        SystemExit: If the source CSV is missing or has no ``mst_utc`` column.
    """
    if not SOURCE_CSV.exists():
        raise SystemExit(f"Missing source CSV: {SOURCE_CSV}")

    frame = pd.read_csv(SOURCE_CSV)
    if "mst_utc" not in frame.columns:
        raise SystemExit("training_data.csv must include mst_utc")

    ordered = frame.sort_values("mst_utc").reset_index(drop=True)

    # Derived columns. Two of them are set to the constant 1.0 for every row.
    ordered["lineup_completeness"] = 1.0
    # 1.0 where referee_experience is positive, else 0.0 (missing counts as 0).
    ordered["referee_available"] = (
        _column_or_zeros(ordered, "referee_experience") > 0
    ).astype(float)
    # 0.85 minus the zero-goal rate, clamped to [0.25, 0.95]. A missing column
    # is treated as a rate of 0 rather than crashing on ``int.fillna``.
    ordered["league_reliability"] = _column_or_zeros(
        ordered, "league_zero_goal_rate"
    ).apply(lambda value: round(max(0.25, min(0.95, 0.85 - float(value))), 4))
    ordered["odds_snapshot_freshness"] = 1.0

    train_end, validation_end = _split_bounds(len(ordered))
    train_df = ordered.iloc[:train_end].copy()
    validation_df = ordered.iloc[train_end:validation_end].copy()
    holdout_df = ordered.iloc[validation_end:].copy()

    train_df.to_csv(TARGET_DIR / "train.csv", index=False)
    validation_df.to_csv(TARGET_DIR / "validation.csv", index=False)
    holdout_df.to_csv(TARGET_DIR / "holdout.csv", index=False)

    meta = {
        "source": str(SOURCE_CSV),
        "rows": int(len(ordered)),
        "train_rows": int(len(train_df)),
        "validation_rows": int(len(validation_df)),
        "holdout_rows": int(len(holdout_df)),
        "rolling_windows": _rolling_windows(ordered),
        "derived_columns": [
            "lineup_completeness",
            "referee_available",
            "league_reliability",
            "odds_snapshot_freshness",
        ],
        "feature_policy": "prediction_time_only",
    }
    (TARGET_DIR / "dataset_meta.json").write_text(
        json.dumps(meta, indent=2),
        encoding="utf-8",
    )

    print(f"[OK] V26 dataset written to {TARGET_DIR}")


# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()