94 lines
3.0 KiB
Python
94 lines
3.0 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
|
|
# Root of the AI engine tree: the directory one level above this script's folder.
AI_ENGINE_DIR = Path(__file__).resolve().parents[1]

# Input table that main() reads and splits.
SOURCE_CSV = AI_ENGINE_DIR / "data" / "training_data.csv"

# Output folder for the split CSVs and dataset_meta.json.
TARGET_DIR = AI_ENGINE_DIR / "data" / "v26_shadow"

# Created at import time (idempotent) so later writes can assume it exists.
TARGET_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _rolling_windows(frame: pd.DataFrame) -> list[dict[str, int]]:
    """Describe up to three expanding train/test windows over ``frame``.

    Rows are sorted by ``mst_utc`` and re-indexed from zero.  For each cut
    fraction (0.55, 0.70, 0.85 of the row count) the rows before the cut are
    the training span; the test span starts at the cut and ends roughly 10 %
    of the rows later (at least one row later, clipped to the last row).

    Args:
        frame: Table with an ``mst_utc`` column whose values are accepted by
            ``int()``.

    Returns:
        One dict per window holding the window number, the positional
        indices ``train_end_ix`` / ``test_start_ix`` / ``test_end_ix`` and
        the ``mst_utc`` values found at ``train_end_ix`` and ``test_end_ix``.
        The list is empty when ``frame`` has no rows, or has so few rows that
        nothing is left after the cut to test on.
    """
    ordered = frame.sort_values("mst_utc").reset_index(drop=True)
    size = len(ordered)
    if size == 0:
        return []

    # Read timestamps from the column, not from a row.  ``ordered.iloc[i]``
    # builds a row Series at the common dtype of all columns, so an int64
    # timestamp sitting next to float columns would be cast to float64 and
    # silently lose precision above 2**53.
    stamps = ordered["mst_utc"]
    test_span = max(int(size * 0.10), 1)

    windows: list[dict[str, int]] = []
    for idx, cut in enumerate((0.55, 0.7, 0.85), start=1):
        end_ix = max(int(size * cut), 1)
        test_end = min(size - 1, end_ix + test_span)
        if test_end < end_ix:
            # Single-row frame: the cut already sits past the last row, so
            # the window would have an inverted (empty) test range.
            continue
        windows.append(
            {
                "window": idx,
                "train_end_ix": end_ix - 1,
                "test_start_ix": end_ix,
                "test_end_ix": test_end,
                "train_end_mst_utc": int(stamps.iloc[end_ix - 1]),
                "test_end_mst_utc": int(stamps.iloc[test_end]),
            }
        )
    return windows
|
|
|
|
|
|
def _column_or_zeros(frame: pd.DataFrame, column: str) -> pd.Series:
    """Return ``frame[column]`` with NaN replaced by 0, or zeros if absent."""
    if column in frame.columns:
        return frame[column].fillna(0)
    # A real Series (not a bare scalar) so callers can keep chaining Series
    # methods when the optional column is missing from the CSV.
    return pd.Series(0.0, index=frame.index)


def _league_reliability(zero_goal_rate: object) -> float:
    """Return ``0.85 - rate`` clamped to [0.25, 0.95], rounded to 4 places."""
    return round(max(0.25, min(0.95, 0.85 - float(zero_goal_rate))), 4)


def _split_bounds(row_count: int) -> tuple[int, int]:
    """Return ``(train_end, validation_end)`` slice bounds for ``row_count`` rows.

    Train takes about the first 70 % of the rows (at least one),
    validation runs up to about 85 %, and the holdout is whatever follows.
    """
    train_end = max(int(row_count * 0.70), 1)
    validation_end = max(int(row_count * 0.85), train_end + 1)
    validation_end = min(validation_end, row_count - 1)
    # Never let the holdout start before the end of train: with a single row
    # the clamp above would otherwise drop to 0 and put that row in both sets.
    validation_end = max(validation_end, train_end)
    return train_end, validation_end


def main() -> None:
    """Split ``SOURCE_CSV`` chronologically and write the result to ``TARGET_DIR``.

    The table is sorted by ``mst_utc``, four derived columns are added, and
    the rows are cut in order into ``train.csv``, ``validation.csv`` and
    ``holdout.csv``.  Row counts, rolling-window bounds and the names of the
    derived columns are written to ``dataset_meta.json``.

    Raises:
        SystemExit: If ``SOURCE_CSV`` does not exist or has no ``mst_utc``
            column.
    """
    if not SOURCE_CSV.exists():
        raise SystemExit(f"Missing source CSV: {SOURCE_CSV}")

    frame = pd.read_csv(SOURCE_CSV)
    if "mst_utc" not in frame.columns:
        raise SystemExit("training_data.csv must include mst_utc")

    ordered = frame.sort_values("mst_utc").reset_index(drop=True)

    # Derived columns; the two optional inputs fall back to zeros when the
    # CSV does not carry them.
    ordered["lineup_completeness"] = 1.0
    ordered["referee_available"] = (
        _column_or_zeros(ordered, "referee_experience") > 0
    ).astype(float)
    ordered["league_reliability"] = _column_or_zeros(
        ordered, "league_zero_goal_rate"
    ).apply(_league_reliability)
    ordered["odds_snapshot_freshness"] = 1.0

    train_end, validation_end = _split_bounds(len(ordered))
    train_df = ordered.iloc[:train_end].copy()
    validation_df = ordered.iloc[train_end:validation_end].copy()
    holdout_df = ordered.iloc[validation_end:].copy()

    train_df.to_csv(TARGET_DIR / "train.csv", index=False)
    validation_df.to_csv(TARGET_DIR / "validation.csv", index=False)
    holdout_df.to_csv(TARGET_DIR / "holdout.csv", index=False)

    meta = {
        "source": str(SOURCE_CSV),
        "rows": int(len(ordered)),
        "train_rows": int(len(train_df)),
        "validation_rows": int(len(validation_df)),
        "holdout_rows": int(len(holdout_df)),
        "rolling_windows": _rolling_windows(ordered),
        "derived_columns": [
            "lineup_completeness",
            "referee_available",
            "league_reliability",
            "odds_snapshot_freshness",
        ],
        "feature_policy": "prediction_time_only",
    }
    (TARGET_DIR / "dataset_meta.json").write_text(
        json.dumps(meta, indent=2),
        encoding="utf-8",
    )

    print(f"[OK] V26 dataset written to {TARGET_DIR}")
|
|
|
|
|
|
# Build the dataset only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|