#!/usr/bin/env python3
"""
ELO Backfill Script — Chronological Replay

Replays all finished matches in chronological order, computes ELO ratings,
and persists:
  1. Per-match pre-match ELO snapshots -> football_ai_features /
     basketball_ai_features (chosen by sport)
  2. Final team ELO state -> ELORatingSystem.save_ratings_to_db() and
     ELORatingSystem.save_ratings()

Usage:
    python scripts/elo_backfill.py                    # football (default)
    python scripts/elo_backfill.py --sport basketball
    python scripts/elo_backfill.py --sport all
    python scripts/elo_backfill.py --dry-run          # no DB writes
    python scripts/elo_backfill.py --batch-size 2000

Designed to be idempotent: feature rows are written with ON CONFLICT upserts
and matches are replayed in a fully deterministic order.
"""

import os
import sys
import time
import argparse

# Add ai-engine root to path (must happen before the project imports below).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import psycopg2
from psycopg2.extras import execute_values

from data.db import get_clean_dsn
from features.elo_system import ELORatingSystem

# ────────────────────────── constants ──────────────────────────
CALCULATOR_VER = "elo_backfill_v1"
DEFAULT_BATCH_SIZE = 1000


# ────────────────────────── helpers ────────────────────────────
def _features_table(sport: str) -> str:
    """Return the per-sport features table name.

    'football' maps to football_ai_features; every other value maps to
    basketball_ai_features (same fallback the script has always used).
    The result is interpolated into SQL, so it must only ever come from
    this fixed pair of literals.
    """
    return 'football_ai_features' if sport == 'football' else 'basketball_ai_features'


def _form_to_score(form: str) -> float:
    """Convert a W/D/L form string (e.g. 'WDLWW') to a 0-100 float.

    W counts 1.0, D counts 0.5, anything else counts 0.0; the mean is
    scaled to 0-100. An empty or missing form yields the neutral 50.0.
    """
    if not form:
        return 50.0
    points = sum(1.0 if c == 'W' else 0.5 if c == 'D' else 0.0 for c in form)
    return (points / max(len(form), 1)) * 100.0


def _build_feature_row(match_id, home_rating, away_rating):
    """Build one pre-match feature tuple in the column order of the upsert.

    Order: match_id, home_elo, away_elo, home_home_elo, away_away_elo,
    home_form_elo, away_form_elo, elo_diff, home_form_score,
    away_form_score, calculator_ver.
    """
    h_overall = round(home_rating.overall_elo, 2)
    a_overall = round(away_rating.overall_elo, 2)
    return (
        match_id,
        h_overall,                                          # home_elo
        a_overall,                                          # away_elo
        round(home_rating.home_elo, 2),                     # home_home_elo
        round(away_rating.away_elo, 2),                     # away_away_elo
        round(home_rating.form_elo, 2),                     # home_form_elo
        round(away_rating.form_elo, 2),                     # away_form_elo
        round(h_overall - a_overall, 2),                    # elo_diff (from rounded values)
        round(_form_to_score(home_rating.recent_form), 2),  # home_form_score
        round(_form_to_score(away_rating.recent_form), 2),  # away_form_score
        CALCULATOR_VER,
    )


def fetch_matches(conn, sport: str):
    """Fetch all finished matches for *sport* in chronological order.

    A match counts as finished when both scores are non-NULL. Rows are
    (id, home_team_id, away_team_id, score_home, score_away, home_name,
    away_name, league_name). Names come from LEFT JOINs and may be None.

    Ordering is by mst_utc with m.id as a tie-breaker so that matches
    sharing a timestamp are always replayed in the same order; without it
    the replay (and therefore every later snapshot) could differ per run.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
                   m.score_home, m.score_away,
                   t1.name AS home_name, t2.name AS away_name,
                   l.name AS league_name
            FROM matches m
            LEFT JOIN teams t1 ON m.home_team_id = t1.id
            LEFT JOIN teams t2 ON m.away_team_id = t2.id
            LEFT JOIN leagues l ON m.league_id = l.id
            WHERE m.sport = %s
              AND m.score_home IS NOT NULL
              AND m.score_away IS NOT NULL
            ORDER BY m.mst_utc ASC, m.id ASC
        """, (sport,))
        return cur.fetchall()


def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
    """Bulk upsert ELO feature rows into the per-sport features table.

    Does nothing when *rows* is empty or *dry_run* is true. Otherwise the
    rows are upserted with execute_values and the transaction is committed.

    Each row must be an 11-tuple in the order produced by
    _build_feature_row. missing_players_impact is inserted as 0.0 for new
    rows and is absent from the DO UPDATE list, so an existing value is
    left untouched on conflict.
    """
    if not rows or dry_run:
        return
    table_name = _features_table(sport)
    with conn.cursor() as cur:
        execute_values(
            cur,
            f"""
            INSERT INTO {table_name}
                (match_id, home_elo, away_elo,
                 home_home_elo, away_away_elo,
                 home_form_elo, away_form_elo,
                 elo_diff, home_form_score, away_form_score,
                 missing_players_impact,
                 calculator_ver, updated_at)
            VALUES %s
            ON CONFLICT (match_id) DO UPDATE SET
                home_elo = EXCLUDED.home_elo,
                away_elo = EXCLUDED.away_elo,
                home_home_elo = EXCLUDED.home_home_elo,
                away_away_elo = EXCLUDED.away_away_elo,
                home_form_elo = EXCLUDED.home_form_elo,
                away_form_elo = EXCLUDED.away_form_elo,
                elo_diff = EXCLUDED.elo_diff,
                home_form_score = EXCLUDED.home_form_score,
                away_form_score = EXCLUDED.away_form_score,
                calculator_ver = EXCLUDED.calculator_ver,
                updated_at = EXCLUDED.updated_at
            """,
            rows,
            template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 0.0, %s, NOW())",
            page_size=500,
        )
    conn.commit()


# ────────────────────────── main ───────────────────────────────
def _run_backfill(conn, sport: str, batch_size: int, dry_run: bool):
    """Run the replay for one sport on an already-open connection.

    The caller owns *conn* and is responsible for closing it.
    """
    print(f"\n{'='*60}")
    print(f"🏆 ELO Backfill — {sport.upper()}")
    print(f" batch_size={batch_size} dry_run={dry_run}")
    print(f"{'='*60}")

    # ── 1. Fetch matches ──
    t0 = time.time()
    matches = fetch_matches(conn, sport)
    print(f"📊 {len(matches):,} matches fetched in {time.time()-t0:.1f}s")

    if not matches:
        print("⚠️ No matches found — nothing to do.")
        return

    # ── 2. Fresh ELO system (no preloaded ratings) ──
    # __init__ is bypassed on purpose and the state is set by hand so the
    # replay starts from an empty rating table.
    # NOTE(review): presumably __init__ loads persisted ratings — confirm
    # that no other attribute it sets is needed by the methods used below.
    elo = ELORatingSystem.__new__(ELORatingSystem)
    elo.ratings = {}
    elo.league_cache = {}
    elo.conn = conn

    # ── 3. Chronological replay ──
    feature_buf = []
    processed = 0
    features_written = 0
    t_start = time.time()

    for row in matches:
        match_id, home_id, away_id, score_h, score_a, h_name, a_name, league = row

        # Matches without both team ids cannot be rated.
        if not home_id or not away_id:
            continue

        # Snapshot PRE-match ELO (all dimensions) before applying the result.
        home_rating = elo.get_or_create_rating(home_id, h_name or "")
        away_rating = elo.get_or_create_rating(away_id, a_name or "")
        feature_buf.append(_build_feature_row(match_id, home_rating, away_rating))

        # Update ELO after the match
        elo.update_after_match(
            home_id, away_id, score_h, score_a,
            h_name or "", a_name or "", league or "",
        )
        processed += 1

        # Flush batch
        if len(feature_buf) >= batch_size:
            flush_features_batch(conn, feature_buf, dry_run, sport)
            features_written += len(feature_buf)
            feature_buf.clear()

        if processed % 10_000 == 0:
            elapsed = time.time() - t_start
            rate = processed / elapsed if elapsed > 0 else 0
            print(f" {processed:>8,} / {len(matches):,} processed "
                  f"({rate:,.0f} matches/s) "
                  f"teams={len(elo.ratings)}")

    # Flush remaining
    if feature_buf:
        flush_features_batch(conn, feature_buf, dry_run, sport)
        features_written += len(feature_buf)

    elapsed = time.time() - t_start
    print(f"\n✅ Replay complete: {processed:,} matches in {elapsed:.1f}s")
    table_name = _features_table(sport)
    if dry_run:
        # Nothing was persisted, so do not claim the rows were written.
        print(f" {features_written:,} {table_name} rows computed (dry-run, not written)")
    else:
        print(f" {features_written:,} {table_name} rows written")
    print(f" {len(elo.ratings):,} teams rated")

    # ── 4. Persist final team ELO state ──
    if not dry_run:
        elo.save_ratings_to_db()
        elo.save_ratings()
        print("💾 team_elo_ratings + JSON saved")
    else:
        print("🔸 DRY-RUN: no DB writes performed")

    # ── 5. Show top teams ──
    elo._show_top_teams(10)


def backfill(sport: str, batch_size: int, dry_run: bool):
    """Core backfill: chronological replay -> per-sport features + team ELO.

    Opens a database connection, replays every finished match of *sport*
    in order, upserts pre-match snapshots in batches of *batch_size*, and
    (unless *dry_run*) saves the final ratings. The connection is always
    closed, including when the replay raises.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)
    try:
        _run_backfill(conn, sport, batch_size, dry_run)
    finally:
        conn.close()


def main():
    """Parse CLI arguments and run the backfill for each requested sport."""
    parser = argparse.ArgumentParser(
        description="ELO Backfill — chronological replay → match_ai_features & team_elo_ratings"
    )
    parser.add_argument(
        "--sport",
        choices=["football", "basketball", "all"],
        default="football",
        help="Sport to compute ELO for (default: football)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f"DB insert batch size (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run replay without writing to DB",
    )
    args = parser.parse_args()

    # "all" expands to every supported sport, processed one after another.
    sports = ["football", "basketball"] if args.sport == "all" else [args.sport]
    for sport in sports:
        backfill(sport, args.batch_size, args.dry_run)


if __name__ == "__main__":
    main()