#!/usr/bin/env python3
"""
ELO Backfill Script — Chronological Replay

Replays all finished matches in chronological order, computes ELO ratings,
and persists:
  1. Per-match pre-match ELO snapshots → <sport>_ai_features
     (football_ai_features / basketball_ai_features)
  2. Final team ELO state              → team_elo_ratings

Usage:
    python scripts/elo_backfill.py                     # football (default)
    python scripts/elo_backfill.py --sport basketball
    python scripts/elo_backfill.py --sport all
    python scripts/elo_backfill.py --dry-run           # no DB writes
    python scripts/elo_backfill.py --batch-size 2000

Designed to be idempotent: uses ON CONFLICT upserts everywhere.
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import argparse
|
|
|
|
# Add ai-engine root to path
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
import psycopg2
|
|
from psycopg2.extras import execute_values
|
|
from data.db import get_clean_dsn
|
|
from features.elo_system import ELORatingSystem
|
|
|
|
# ────────────────────────── constants ──────────────────────────
|
|
|
|
# Tag stored in the ``calculator_ver`` column of every feature row this script writes.
CALCULATOR_VER = "elo_backfill_v1"

# Default for ``--batch-size``: number of buffered feature rows per bulk upsert.
DEFAULT_BATCH_SIZE = 1000
|
|
|
|
|
|
# ────────────────────────── helpers ────────────────────────────
|
|
|
|
def fetch_matches(conn, sport: str):
    """Fetch every scored match for *sport*, oldest first.

    A match is included when both ``score_home`` and ``score_away`` are
    non-NULL. Team and league names come from LEFT JOINs, so they can be
    ``None`` when the referenced row is missing.

    Args:
        conn: Open DB-API connection; only ``conn.cursor()`` is used.
        sport: Value compared against ``matches.sport``.

    Returns:
        List of row tuples in the order ``(id, home_team_id, away_team_id,
        score_home, score_away, home_name, away_name, league_name)``,
        sorted by ``mst_utc`` and then by ``id``.
    """
    with conn.cursor() as cur:
        # ``m.id`` breaks ties between matches that share the same
        # ``mst_utc``. The ELO replay is order-dependent, so without a
        # tiebreaker two runs over the same data could produce different
        # ratings.
        cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
                   m.score_home, m.score_away,
                   t1.name AS home_name, t2.name AS away_name,
                   l.name AS league_name
            FROM matches m
            LEFT JOIN teams t1 ON m.home_team_id = t1.id
            LEFT JOIN teams t2 ON m.away_team_id = t2.id
            LEFT JOIN leagues l ON m.league_id = l.id
            WHERE m.sport = %s
              AND m.score_home IS NOT NULL
              AND m.score_away IS NOT NULL
            ORDER BY m.mst_utc ASC, m.id ASC
        """, (sport,))
        return cur.fetchall()
|
|
|
|
|
|
def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
    """Bulk upsert ELO feature rows into the sport-specific ai_features table.

    Each row must be an 11-tuple in this order::

        (match_id, home_elo, away_elo, home_home_elo, away_away_elo,
         home_form_elo, away_form_elo, elo_diff,
         home_form_score, away_form_score, calculator_ver)

    The SQL template fills in ``missing_players_impact`` (0.0) and
    ``updated_at`` (``NOW()``). On a ``match_id`` conflict every column
    except ``missing_players_impact`` is overwritten.

    Args:
        conn: Open psycopg2 connection; committed on success.
        rows: Sequence of feature tuples as described above.
        dry_run: When true, nothing is written.
        sport: ``'football'`` or ``'basketball'``; selects the target table.

    Raises:
        ValueError: If *sport* has no known feature table.
    """
    if not rows or dry_run:
        return

    # Resolve the table from a closed set. The name is interpolated into the
    # SQL text, and an unknown sport must not be written to the wrong table.
    feature_tables = {
        'football': 'football_ai_features',
        'basketball': 'basketball_ai_features',
    }
    if sport not in feature_tables:
        raise ValueError(f"unsupported sport for ELO features: {sport!r}")
    table_name = feature_tables[sport]

    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                f"""
                INSERT INTO {table_name}
                    (match_id, home_elo, away_elo,
                     home_home_elo, away_away_elo,
                     home_form_elo, away_form_elo,
                     elo_diff,
                     home_form_score, away_form_score,
                     missing_players_impact, calculator_ver, updated_at)
                VALUES %s
                ON CONFLICT (match_id) DO UPDATE SET
                    home_elo = EXCLUDED.home_elo,
                    away_elo = EXCLUDED.away_elo,
                    home_home_elo = EXCLUDED.home_home_elo,
                    away_away_elo = EXCLUDED.away_away_elo,
                    home_form_elo = EXCLUDED.home_form_elo,
                    away_form_elo = EXCLUDED.away_form_elo,
                    elo_diff = EXCLUDED.elo_diff,
                    home_form_score = EXCLUDED.home_form_score,
                    away_form_score = EXCLUDED.away_form_score,
                    calculator_ver = EXCLUDED.calculator_ver,
                    updated_at = EXCLUDED.updated_at
                """,
                rows,
                template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 0.0, %s, NOW())",
                page_size=500,
            )
        conn.commit()
    except Exception:
        # Do not leave the shared connection in an aborted transaction.
        conn.rollback()
        raise
|
|
|
|
|
|
# ────────────────────────── main ───────────────────────────────
|
|
|
|
def backfill(sport: str, batch_size: int, dry_run: bool):
    """Replay all scored matches of *sport* in order and persist ELO data.

    For each match the pre-match ratings of both teams are captured as a
    feature row, then the ELO system is updated with the result. Feature
    rows are buffered and written through ``flush_features_batch``; the
    final ratings are saved with ``save_ratings_to_db()`` and
    ``save_ratings()`` unless *dry_run* is set.

    Args:
        sport: Sport key passed to the match query and the feature writer.
        batch_size: Number of buffered feature rows that triggers a flush.
        dry_run: When true, the replay runs but nothing is written.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)

    # Close the connection on every exit path, including exceptions raised
    # mid-replay.
    try:
        print(f"\n{'='*60}")
        print(f"🏆 ELO Backfill — {sport.upper()}")
        print(f" batch_size={batch_size} dry_run={dry_run}")
        print(f"{'='*60}")

        # ── 1. Fetch matches ──
        t0 = time.time()
        matches = fetch_matches(conn, sport)
        print(f"📊 {len(matches):,} matches fetched in {time.time()-t0:.1f}s")

        if not matches:
            print("⚠️ No matches found — nothing to do.")
            return

        # ── 2. Fresh ELO system (no preloaded ratings) ──
        # __new__ skips ELORatingSystem.__init__, so the replay starts from an
        # empty ratings dict. NOTE(review): presumably __init__ loads persisted
        # ratings — confirm in features/elo_system.py.
        elo = ELORatingSystem.__new__(ELORatingSystem)
        elo.ratings = {}
        elo.league_cache = {}
        elo.conn = conn

        # ── 3. Chronological replay ──
        feature_buf = []
        processed = 0
        features_written = 0
        t_start = time.time()

        def form_to_score(form: str) -> float:
            """Convert WDLWW form string to 0-100 float (matches existing DB convention)."""
            if not form:
                return 50.0
            s = sum(1.0 if c == 'W' else 0.5 if c == 'D' else 0.0 for c in form)
            return (s / max(len(form), 1)) * 100.0

        for row in matches:
            match_id, home_id, away_id, score_h, score_a, h_name, a_name, league = row

            # Matches without both team ids cannot be rated.
            if not home_id or not away_id:
                continue

            # Snapshot PRE-match ELO (all dimensions) before the update below.
            home_rating = elo.get_or_create_rating(home_id, h_name or "")
            away_rating = elo.get_or_create_rating(away_id, a_name or "")

            h_overall = round(home_rating.overall_elo, 2)
            a_overall = round(away_rating.overall_elo, 2)

            # Tuple order must match the column order in flush_features_batch.
            feature_buf.append((
                match_id,
                h_overall,                                         # home_elo
                a_overall,                                         # away_elo
                round(home_rating.home_elo, 2),                    # home_home_elo
                round(away_rating.away_elo, 2),                    # away_away_elo
                round(home_rating.form_elo, 2),                    # home_form_elo
                round(away_rating.form_elo, 2),                    # away_form_elo
                round(h_overall - a_overall, 2),                   # elo_diff
                round(form_to_score(home_rating.recent_form), 2),  # home_form_score
                round(form_to_score(away_rating.recent_form), 2),  # away_form_score
                CALCULATOR_VER,
            ))

            # Update ELO after the match
            elo.update_after_match(
                home_id, away_id, score_h, score_a,
                h_name or "", a_name or "", league or "",
            )

            processed += 1

            # Flush batch
            if len(feature_buf) >= batch_size:
                flush_features_batch(conn, feature_buf, dry_run, sport)
                features_written += len(feature_buf)
                feature_buf.clear()

            if processed % 10_000 == 0:
                elapsed = time.time() - t_start
                rate = processed / elapsed if elapsed > 0 else 0
                print(f" {processed:>8,} / {len(matches):,} processed "
                      f"({rate:,.0f} matches/s) "
                      f"teams={len(elo.ratings)}")

        # Flush remaining
        if feature_buf:
            flush_features_batch(conn, feature_buf, dry_run, sport)
            features_written += len(feature_buf)

        elapsed = time.time() - t_start
        print(f"\n✅ Replay complete: {processed:,} matches in {elapsed:.1f}s")
        table_name = 'football_ai_features' if sport == 'football' else 'basketball_ai_features'
        print(f" {features_written:,} {table_name} rows written")
        print(f" {len(elo.ratings):,} teams rated")

        # ── 4. Persist final team ELO state ──
        if not dry_run:
            elo.save_ratings_to_db()
            elo.save_ratings()
            print("💾 team_elo_ratings + JSON saved")
        else:
            print("🔸 DRY-RUN: no DB writes performed")

        # ── 5. Show top teams ──
        elo._show_top_teams(10)
    finally:
        conn.close()
|
|
|
|
|
|
def main():
    """Parse command-line options and run the backfill for each requested sport.

    ``--sport all`` expands to football followed by basketball.
    ``--batch-size`` must be a positive integer; anything else is rejected
    as an argparse usage error.
    """

    def positive_int(raw: str) -> int:
        """argparse ``type=`` callback that accepts only integers >= 1."""
        # A ValueError from int() is reported by argparse as a usage error.
        value = int(raw)
        if value < 1:
            raise argparse.ArgumentTypeError(
                f"must be a positive integer, got {value}"
            )
        return value

    parser = argparse.ArgumentParser(
        description="ELO Backfill — chronological replay → match_ai_features & team_elo_ratings"
    )
    parser.add_argument(
        "--sport",
        choices=["football", "basketball", "all"],
        default="football",
        help="Sport to compute ELO for (default: football)",
    )
    parser.add_argument(
        "--batch-size",
        type=positive_int,
        default=DEFAULT_BATCH_SIZE,
        help=f"DB insert batch size (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run replay without writing to DB",
    )
    args = parser.parse_args()

    sports = ["football", "basketball"] if args.sport == "all" else [args.sport]

    for sport in sports:
        backfill(sport, args.batch_size, args.dry_run)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|