"""Build the V26 shadow dataset from ``data/training_data.csv``.

The script orders the source rows by ``mst_utc``, adds four derived columns,
writes chronological train / validation / holdout CSV files into
``data/v26_shadow`` and records the split sizes plus rolling-window
boundaries in ``dataset_meta.json``.
"""

from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

AI_ENGINE_DIR = Path(__file__).resolve().parents[1]
SOURCE_CSV = AI_ENGINE_DIR / "data" / "training_data.csv"
TARGET_DIR = AI_ENGINE_DIR / "data" / "v26_shadow"
# Kept at import time on purpose: existing callers may rely on the directory
# being present as soon as this module is loaded.
TARGET_DIR.mkdir(parents=True, exist_ok=True)


def _rolling_windows(frame: pd.DataFrame) -> list[dict[str, int]]:
    """Describe three expanding train/test windows over ``frame``.

    Rows are ordered by ``mst_utc``. For each cut (55 %, 70 %, 85 % of the
    rows) the training range ends just before the cut and the test range
    starts at the cut and extends by 10 % of the rows (at least one row),
    capped at the last row.

    Args:
        frame: Data containing an ``mst_utc`` column whose values can be
            converted with ``int``.

    Returns:
        One dict per window with the keys ``window``, ``train_end_ix``,
        ``test_start_ix``, ``test_end_ix``, ``train_end_mst_utc`` and
        ``test_end_mst_utc``. An empty frame yields an empty list. For a
        single-row frame ``test_start_ix`` exceeds ``test_end_ix`` (there is
        no row left to test on).
    """
    ordered = frame.sort_values("mst_utc").reset_index(drop=True)
    if ordered.empty:
        return []

    size = len(ordered)
    # The test span length does not depend on the cut, so compute it once.
    test_span = max(int(size * 0.10), 1)
    windows: list[dict[str, int]] = []
    for number, fraction in enumerate((0.55, 0.7, 0.85), start=1):
        split_ix = max(int(size * fraction), 1)
        last_test_ix = min(size - 1, split_ix + test_span)
        windows.append(
            {
                "window": number,
                "train_end_ix": split_ix - 1,
                "test_start_ix": split_ix,
                "test_end_ix": last_test_ix,
                "train_end_mst_utc": int(ordered.iloc[split_ix - 1]["mst_utc"]),
                "test_end_mst_utc": int(ordered.iloc[last_test_ix]["mst_utc"]),
            }
        )
    return windows


def _column_or_zeros(frame: pd.DataFrame, name: str) -> pd.Series:
    """Return column ``name`` with NaN replaced by 0, or zeros if it is absent."""
    if name in frame.columns:
        return frame[name].fillna(0)
    # A Series (not a scalar) so callers can keep using Series methods.
    return pd.Series(0.0, index=frame.index)


def _reliability_from_zero_goal_rate(value: object) -> float:
    """Map a zero-goal rate to ``0.85 - rate`` clamped to [0.25, 0.95]."""
    return round(max(0.25, min(0.95, 0.85 - float(value))), 4)


def _split_bounds(size: int) -> tuple[int, int]:
    """Return ``(train_end, validation_end)`` slice bounds for ``size`` rows.

    Train covers the first 70 % of rows (at least one), validation runs up to
    85 %, and the cap at ``size - 1`` reserves the last row for holdout.
    """
    train_end = max(int(size * 0.70), 1)
    validation_end = max(int(size * 0.85), train_end + 1)
    validation_end = min(validation_end, size - 1)
    # With fewer than two rows the cap above drops below ``train_end``, which
    # would put the training row into the holdout slice as well. Never let
    # the holdout start before the training slice ends.
    validation_end = max(validation_end, train_end)
    return train_end, validation_end


def main() -> None:
    """Create the split CSV files and ``dataset_meta.json`` in ``TARGET_DIR``.

    Raises:
        SystemExit: If ``SOURCE_CSV`` does not exist or has no ``mst_utc``
            column.
    """
    if not SOURCE_CSV.exists():
        raise SystemExit(f"Missing source CSV: {SOURCE_CSV}")

    frame = pd.read_csv(SOURCE_CSV)
    if "mst_utc" not in frame.columns:
        raise SystemExit("training_data.csv must include mst_utc")

    ordered = frame.sort_values("mst_utc").reset_index(drop=True)

    # Derived columns. Both optional source columns fall back to zeros when
    # missing, so a CSV without them still produces a complete dataset.
    ordered["lineup_completeness"] = 1.0
    ordered["referee_available"] = (
        _column_or_zeros(ordered, "referee_experience") > 0
    ).astype(float)
    ordered["league_reliability"] = _column_or_zeros(
        ordered, "league_zero_goal_rate"
    ).apply(_reliability_from_zero_goal_rate)
    ordered["odds_snapshot_freshness"] = 1.0

    train_end, validation_end = _split_bounds(len(ordered))
    train_df = ordered.iloc[:train_end].copy()
    validation_df = ordered.iloc[train_end:validation_end].copy()
    holdout_df = ordered.iloc[validation_end:].copy()

    train_df.to_csv(TARGET_DIR / "train.csv", index=False)
    validation_df.to_csv(TARGET_DIR / "validation.csv", index=False)
    holdout_df.to_csv(TARGET_DIR / "holdout.csv", index=False)

    meta = {
        "source": str(SOURCE_CSV),
        "rows": int(len(ordered)),
        "train_rows": int(len(train_df)),
        "validation_rows": int(len(validation_df)),
        "holdout_rows": int(len(holdout_df)),
        "rolling_windows": _rolling_windows(ordered),
        "derived_columns": [
            "lineup_completeness",
            "referee_available",
            "league_reliability",
            "odds_snapshot_freshness",
        ],
        "feature_policy": "prediction_time_only",
    }
    (TARGET_DIR / "dataset_meta.json").write_text(
        json.dumps(meta, indent=2),
        encoding="utf-8",
    )
    print(f"[OK] V26 dataset written to {TARGET_DIR}")


if __name__ == "__main__":
    main()