This commit is contained in:
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ELO Backfill Script — Chronological Replay
|
||||
|
||||
Replays all finished matches in chronological order, computes ELO ratings,
|
||||
and persists:
|
||||
1. Per-match pre-match ELO snapshots → football_ai_features / basketball_ai_features (chosen by sport)
|
||||
2. Final team ELO state → team_elo_ratings
|
||||
|
||||
Usage:
|
||||
python scripts/elo_backfill.py # football (default)
|
||||
python scripts/elo_backfill.py --sport basketball
|
||||
python scripts/elo_backfill.py --sport all
|
||||
python scripts/elo_backfill.py --dry-run # no DB writes
|
||||
python scripts/elo_backfill.py --batch-size 2000
|
||||
|
||||
Designed to be idempotent: uses ON CONFLICT upserts everywhere.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
|
||||
# Add ai-engine root to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from data.db import get_clean_dsn
|
||||
from features.elo_system import ELORatingSystem
|
||||
|
||||
# ────────────────────────── constants ──────────────────────────
|
||||
|
||||
# Version tag stored in the calculator_ver column of every feature row
# buffered by backfill().
CALCULATOR_VER = "elo_backfill_v1"
# Default for --batch-size: number of feature rows buffered before each
# bulk upsert.
DEFAULT_BATCH_SIZE = 1000
|
||||
|
||||
|
||||
# ────────────────────────── helpers ────────────────────────────
|
||||
|
||||
def fetch_matches(conn, sport: str):
    """Fetch every finished match for ``sport`` in chronological order.

    A match counts as finished when both ``score_home`` and ``score_away``
    are non-NULL.  Team and league names come from LEFT JOINs, so they may
    be ``None`` when the referenced row does not exist.

    Rows are ordered by ``mst_utc`` ascending with ``m.id`` as a tie-breaker,
    so matches that share a timestamp are always returned in the same order.
    Without the tie-breaker the ELO replay could differ between runs.

    Args:
        conn: Open DB-API connection whose ``cursor()`` is a context manager.
        sport: Value compared against ``matches.sport``.

    Returns:
        List of tuples ``(id, home_team_id, away_team_id, score_home,
        score_away, home_name, away_name, league_name)``.
    """
    query = """
        SELECT m.id, m.home_team_id, m.away_team_id,
               m.score_home, m.score_away,
               t1.name AS home_name, t2.name AS away_name,
               l.name AS league_name
        FROM matches m
        LEFT JOIN teams t1 ON m.home_team_id = t1.id
        LEFT JOIN teams t2 ON m.away_team_id = t2.id
        LEFT JOIN leagues l ON m.league_id = l.id
        WHERE m.sport = %s
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
        ORDER BY m.mst_utc ASC, m.id ASC
    """
    with conn.cursor() as cur:
        cur.execute(query, (sport,))
        return cur.fetchall()
|
||||
|
||||
|
||||
def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
    """Bulk-upsert one batch of pre-match ELO snapshots.

    Each row is a 6-tuple ``(match_id, home_elo, away_elo, home_form_score,
    away_form_score, calculator_ver)``.  The VALUES template fills in
    ``missing_players_impact`` as ``0.0`` and ``updated_at`` as ``NOW()``.
    On a ``match_id`` conflict every inserted column except
    ``missing_players_impact`` is overwritten with the new value.

    Rows are written to ``football_ai_features`` when ``sport`` is
    ``'football'`` and to ``basketball_ai_features`` for any other value.

    The batch is committed on success.  If the insert or the commit fails,
    the transaction is rolled back before the exception is re-raised so the
    connection is not left in an aborted-transaction state.

    Args:
        conn: Open psycopg2 connection.
        rows: Sequence of 6-tuples as described above.
        dry_run: When true, nothing is written.
        sport: Selects the target table.

    Returns:
        None.  Returns immediately when ``rows`` is empty or ``dry_run`` is set.
    """
    if not rows or dry_run:
        return

    table_name = 'football_ai_features' if sport == 'football' else 'basketball_ai_features'
    # table_name is one of two literals chosen above, never caller-supplied
    # text, so interpolating it into the statement is safe.
    upsert_sql = f"""
        INSERT INTO {table_name}
            (match_id, home_elo, away_elo,
             home_form_score, away_form_score,
             missing_players_impact, calculator_ver, updated_at)
        VALUES %s
        ON CONFLICT (match_id) DO UPDATE SET
            home_elo = EXCLUDED.home_elo,
            away_elo = EXCLUDED.away_elo,
            home_form_score = EXCLUDED.home_form_score,
            away_form_score = EXCLUDED.away_form_score,
            calculator_ver = EXCLUDED.calculator_ver,
            updated_at = EXCLUDED.updated_at
    """
    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                upsert_sql,
                rows,
                template="(%s, %s, %s, %s, %s, 0.0, %s, NOW())",
                page_size=500,
            )
        conn.commit()
    except Exception:
        # Discard the failed transaction so the connection stays usable.
        conn.rollback()
        raise
|
||||
|
||||
|
||||
# ────────────────────────── main ───────────────────────────────
|
||||
|
||||
def _form_to_score(form: str) -> float:
    """Convert a form string such as ``"WDLWW"`` into a 0-100 score.

    ``W`` counts 1.0, ``D`` counts 0.5 and any other character counts 0.0;
    the average is scaled to 0-100.  An empty or missing form yields the
    neutral value 50.0.
    """
    if not form:
        return 50.0
    points = sum(1.0 if c == 'W' else 0.5 if c == 'D' else 0.0 for c in form)
    return (points / len(form)) * 100.0


def backfill(sport: str, batch_size: int, dry_run: bool):
    """Replay all finished matches of ``sport`` and persist ELO data.

    Steps:
      1. Fetch finished matches via ``fetch_matches``.
      2. Start from an empty ``ELORatingSystem``; ``__init__`` is bypassed
         and ``ratings`` / ``league_cache`` are set to empty dicts so the
         replay does not start from preloaded ratings.
      3. For each match, snapshot both teams' pre-match ELO and form score,
         buffer the snapshot, then apply the result via
         ``update_after_match``.  The buffer is flushed through
         ``flush_features_batch`` whenever it holds ``batch_size`` rows.
      4. Unless ``dry_run`` is set, persist the final ratings with
         ``save_ratings_to_db`` and ``save_ratings``.
      5. Print the top 10 teams.

    The database connection is closed on every exit path, including when
    an exception aborts the replay.

    Args:
        sport: Sport to replay; passed to the match query and used to pick
            the feature table.
        batch_size: Number of buffered feature rows that triggers a flush.
        dry_run: When true, the replay runs but nothing is written.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)

    try:
        print(f"\n{'='*60}")
        print(f"🏆 ELO Backfill — {sport.upper()}")
        print(f" batch_size={batch_size} dry_run={dry_run}")
        print(f"{'='*60}")

        # ── 1. Fetch matches ──
        t0 = time.time()
        matches = fetch_matches(conn, sport)
        print(f"📊 {len(matches):,} matches fetched in {time.time()-t0:.1f}s")

        if not matches:
            print("⚠️ No matches found — nothing to do.")
            return

        # ── 2. Fresh ELO system (no preloaded ratings) ──
        # __new__ skips __init__; the attributes the replay relies on are
        # assigned by hand instead.
        elo = ELORatingSystem.__new__(ELORatingSystem)
        elo.ratings = {}
        elo.league_cache = {}
        elo.conn = conn

        # ── 3. Chronological replay ──
        feature_buf = []
        processed = 0
        # NOTE: incremented for every buffered batch, including in dry-run
        # mode where flush_features_batch writes nothing.
        features_written = 0
        t_start = time.time()

        for row in matches:
            match_id, home_id, away_id, score_h, score_a, h_name, a_name, league = row

            # A match without both team ids cannot be rated.
            if not home_id or not away_id:
                continue

            # Snapshot PRE-match ELO, before this result is applied.
            home_rating = elo.get_or_create_rating(home_id, h_name or "")
            away_rating = elo.get_or_create_rating(away_id, a_name or "")

            feature_buf.append((
                match_id,
                round(home_rating.overall_elo, 2),
                round(away_rating.overall_elo, 2),
                round(_form_to_score(home_rating.recent_form), 2),
                round(_form_to_score(away_rating.recent_form), 2),
                CALCULATOR_VER,
            ))

            # Update ELO after the match
            elo.update_after_match(
                home_id, away_id, score_h, score_a,
                h_name or "", a_name or "", league or "",
            )

            processed += 1

            # Flush batch
            if len(feature_buf) >= batch_size:
                flush_features_batch(conn, feature_buf, dry_run, sport)
                features_written += len(feature_buf)
                feature_buf.clear()

            if processed % 10_000 == 0:
                elapsed = time.time() - t_start
                rate = processed / elapsed if elapsed > 0 else 0
                print(f" {processed:>8,} / {len(matches):,} processed "
                      f"({rate:,.0f} matches/s) "
                      f"teams={len(elo.ratings)}")

        # Flush remaining
        if feature_buf:
            flush_features_batch(conn, feature_buf, dry_run, sport)
            features_written += len(feature_buf)

        elapsed = time.time() - t_start
        print(f"\n✅ Replay complete: {processed:,} matches in {elapsed:.1f}s")
        table_name = 'football_ai_features' if sport == 'football' else 'basketball_ai_features'
        print(f" {features_written:,} {table_name} rows written")
        print(f" {len(elo.ratings):,} teams rated")

        # ── 4. Persist final team ELO state ──
        if not dry_run:
            elo.save_ratings_to_db()
            elo.save_ratings()
            print("💾 team_elo_ratings + JSON saved")
        else:
            print("🔸 DRY-RUN: no DB writes performed")

        # ── 5. Show top teams ──
        elo._show_top_teams(10)
    finally:
        # Release the connection on every exit path.
        conn.close()
|
||||
|
||||
|
||||
def main():
    """Parse command-line options and run the backfill once per selected sport.

    ``--sport all`` expands to football followed by basketball; any other
    choice runs that single sport.
    """
    cli = argparse.ArgumentParser(
        description="ELO Backfill — chronological replay → match_ai_features & team_elo_ratings"
    )

    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        ("--sport", {
            "choices": ["football", "basketball", "all"],
            "default": "football",
            "help": "Sport to compute ELO for (default: football)",
        }),
        ("--batch-size", {
            "type": int,
            "default": DEFAULT_BATCH_SIZE,
            "help": f"DB insert batch size (default: {DEFAULT_BATCH_SIZE})",
        }),
        ("--dry-run", {
            "action": "store_true",
            "help": "Run replay without writing to DB",
        }),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    opts = cli.parse_args()

    if opts.sport == "all":
        targets = ["football", "basketball"]
    else:
        targets = [opts.sport]

    for target in targets:
        backfill(target, opts.batch_size, opts.dry_run)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user