This commit is contained in:
2026-04-21 16:53:56 +03:00
parent 1346924387
commit 2ccd6831eb
26 changed files with 430403 additions and 3 deletions
@@ -0,0 +1,93 @@
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
# Directory one level above the folder that contains this file.
AI_ENGINE_DIR = Path(__file__).resolve().parents[1]
# Input CSV that main() reads.
SOURCE_CSV = AI_ENGINE_DIR / "data" / "training_data.csv"
# Output directory for the split CSVs and dataset_meta.json written by main().
TARGET_DIR = AI_ENGINE_DIR / "data" / "v26_shadow"
# NOTE: runs at import time, so merely importing this module creates the
# directory (and any missing parents); an existing directory is not an error.
TARGET_DIR.mkdir(parents=True, exist_ok=True)
def _rolling_windows(frame: pd.DataFrame) -> list[dict[str, int]]:
ordered = frame.sort_values("mst_utc").reset_index(drop=True)
windows: list[dict[str, int]] = []
if ordered.empty:
return windows
size = len(ordered)
cuts = [0.55, 0.7, 0.85]
for idx, cut in enumerate(cuts, start=1):
end_ix = max(int(size * cut), 1)
test_end = min(size - 1, end_ix + max(int(size * 0.10), 1))
windows.append(
{
"window": idx,
"train_end_ix": end_ix - 1,
"test_start_ix": end_ix,
"test_end_ix": test_end,
"train_end_mst_utc": int(ordered.iloc[end_ix - 1]["mst_utc"]),
"test_end_mst_utc": int(ordered.iloc[test_end]["mst_utc"]),
}
)
return windows
def main() -> None:
    """Build the V26 shadow dataset splits from the training CSV.

    Reads ``SOURCE_CSV``, orders the rows by ``mst_utc``, adds four derived
    columns, then writes ``train.csv``, ``validation.csv`` and ``holdout.csv``
    (cut at 70 % and 85 % of the ordered rows) plus a ``dataset_meta.json``
    summary into ``TARGET_DIR``.

    Raises:
        SystemExit: If the source CSV is missing or has no ``mst_utc`` column.
    """
    if not SOURCE_CSV.exists():
        raise SystemExit(f"Missing source CSV: {SOURCE_CSV}")
    frame = pd.read_csv(SOURCE_CSV)
    if "mst_utc" not in frame.columns:
        raise SystemExit("training_data.csv must include mst_utc")

    ordered = frame.sort_values("mst_utc").reset_index(drop=True)

    # Fallback for optional input columns. It must be a Series (not a scalar)
    # because the result of DataFrame.get() is chained with .fillna().
    zeros = pd.Series(0.0, index=ordered.index)

    ordered["lineup_completeness"] = 1.0
    ordered["referee_available"] = (
        ordered.get("referee_experience", zeros).fillna(0) > 0
    ).astype(float)
    # 0.85 minus the zero-goal rate, clamped to [0.25, 0.95], 4 decimals.
    ordered["league_reliability"] = (
        ordered.get("league_zero_goal_rate", zeros)
        .fillna(0)
        .apply(lambda value: round(max(0.25, min(0.95, 0.85 - float(value))), 4))
    )
    ordered["odds_snapshot_freshness"] = 1.0

    total = len(ordered)
    train_end = max(int(total * 0.70), 1)
    validation_end = min(max(int(total * 0.85), train_end + 1), total - 1)
    # With a single row the clamp above drops below train_end, which would
    # place that row in both the train and the holdout split. Keep the
    # boundaries ordered so the three slices never overlap.
    validation_end = max(validation_end, train_end)

    train_df = ordered.iloc[:train_end].copy()
    validation_df = ordered.iloc[train_end:validation_end].copy()
    holdout_df = ordered.iloc[validation_end:].copy()

    train_df.to_csv(TARGET_DIR / "train.csv", index=False)
    validation_df.to_csv(TARGET_DIR / "validation.csv", index=False)
    holdout_df.to_csv(TARGET_DIR / "holdout.csv", index=False)

    meta = {
        "source": str(SOURCE_CSV),
        "rows": int(len(ordered)),
        "train_rows": int(len(train_df)),
        "validation_rows": int(len(validation_df)),
        "holdout_rows": int(len(holdout_df)),
        "rolling_windows": _rolling_windows(ordered),
        "derived_columns": [
            "lineup_completeness",
            "referee_available",
            "league_reliability",
            "odds_snapshot_freshness",
        ],
        "feature_policy": "prediction_time_only",
    }
    (TARGET_DIR / "dataset_meta.json").write_text(
        json.dumps(meta, indent=2),
        encoding="utf-8",
    )
    print(f"[OK] V26 dataset written to {TARGET_DIR}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()