Files
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

229 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""
ELO Backfill Script — Chronological Replay
Replays all finished matches in chronological order, computes ELO ratings,
and persists:
1. Per-match pre-match ELO snapshots → football_ai_features / basketball_ai_features (selected by sport)
2. Final team ELO state → team_elo_ratings
Usage:
python scripts/elo_backfill.py # football (default)
python scripts/elo_backfill.py --sport basketball
python scripts/elo_backfill.py --sport all
python scripts/elo_backfill.py --dry-run # no DB writes
python scripts/elo_backfill.py --batch-size 2000
Designed to be idempotent: uses ON CONFLICT upserts everywhere.
"""
import os
import sys
import time
import argparse
# Add ai-engine root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import psycopg2
from psycopg2.extras import execute_values
from data.db import get_clean_dsn
from features.elo_system import ELORatingSystem
# ────────────────────────── constants ──────────────────────────
# Version tag written to the calculator_ver column of every feature row this script upserts.
CALCULATOR_VER = "elo_backfill_v1"
# Default for the --batch-size CLI option: buffered feature rows per flush.
DEFAULT_BATCH_SIZE = 1000
# ────────────────────────── helpers ────────────────────────────
def fetch_matches(conn, sport: str):
    """Fetch all finished matches of ``sport`` in chronological order.

    A match counts as finished when both ``score_home`` and ``score_away``
    are non-NULL.

    Args:
        conn: Open DB-API connection whose ``cursor()`` is a context manager.
        sport: Value matched against ``matches.sport``.

    Returns:
        A list of 8-tuples
        ``(id, home_team_id, away_team_id, score_home, score_away,
        home_name, away_name, league_name)``. The three name columns come
        from LEFT JOINs and may therefore be ``None``.
    """
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT m.id, m.home_team_id, m.away_team_id,
                   m.score_home, m.score_away,
                   t1.name AS home_name, t2.name AS away_name,
                   l.name AS league_name
            FROM matches m
            LEFT JOIN teams t1 ON m.home_team_id = t1.id
            LEFT JOIN teams t2 ON m.away_team_id = t2.id
            LEFT JOIN leagues l ON m.league_id = l.id
            WHERE m.sport = %s
              AND m.score_home IS NOT NULL
              AND m.score_away IS NOT NULL
            ORDER BY m.mst_utc ASC, m.id ASC
            """,
            (sport,),
        )
        # m.id breaks ties between matches with the same kickoff time so the
        # replay order (and thus every computed rating) is reproducible.
        return cur.fetchall()
def flush_features_batch(conn, rows, dry_run: bool, sport: str = 'football'):
    """Bulk upsert a batch of feature rows into the sport's ai_features table.

    Args:
        conn: Open psycopg2 connection; committed on success, rolled back on
            failure.
        rows: Sequence of 6-tuples ``(match_id, home_elo, away_elo,
            home_form_score, away_form_score, calculator_ver)``.
        dry_run: When true, nothing is written.
        sport: ``'football'`` or ``'basketball'``; selects the target table.

    Raises:
        ValueError: If ``sport`` has no known feature table and there is
            something to write.
    """
    if not rows or dry_run:
        return

    # Whitelist of target tables: the name is interpolated into the SQL text,
    # and an unknown sport must not silently land in another sport's table.
    tables = {
        'football': 'football_ai_features',
        'basketball': 'basketball_ai_features',
    }
    try:
        table_name = tables[sport]
    except KeyError:
        raise ValueError(f"unsupported sport: {sport!r}") from None

    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                f"""
                INSERT INTO {table_name}
                    (match_id, home_elo, away_elo,
                     home_form_score, away_form_score,
                     missing_players_impact, calculator_ver, updated_at)
                VALUES %s
                ON CONFLICT (match_id) DO UPDATE SET
                    home_elo = EXCLUDED.home_elo,
                    away_elo = EXCLUDED.away_elo,
                    home_form_score = EXCLUDED.home_form_score,
                    away_form_score = EXCLUDED.away_form_score,
                    calculator_ver = EXCLUDED.calculator_ver,
                    updated_at = EXCLUDED.updated_at
                """,
                rows,
                # missing_players_impact is inserted as 0.0 for new rows and is
                # not part of the conflict update, so existing values are kept.
                template="(%s, %s, %s, %s, %s, 0.0, %s, NOW())",
                page_size=500,
            )
        conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in an aborted transaction.
        conn.rollback()
        raise
# ────────────────────────── main ───────────────────────────────
def backfill(sport: str, batch_size: int, dry_run: bool):
    """Replay one sport's finished matches chronologically and persist ELO data.

    For every match returned by ``fetch_matches`` the two teams' ratings
    *before* the match are buffered as a feature row (overall ELO plus a
    0-100 form score), then the ELO system is updated with the result.
    Buffered rows are handed to ``flush_features_batch`` whenever the buffer
    reaches ``batch_size``. Unless ``dry_run`` is set, the final ratings are
    saved through ``save_ratings_to_db()`` and ``save_ratings()``.

    Args:
        sport: Sport key used to select matches and the target feature table.
        batch_size: Number of buffered feature rows that triggers a flush.
        dry_run: When true, feature flushes and the final rating save are
            skipped.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)
    # try/finally guarantees the connection is released on every exit path,
    # including exceptions raised mid-replay.
    try:
        print(f"\n{'='*60}")
        print(f"🏆 ELO Backfill — {sport.upper()}")
        print(f" batch_size={batch_size} dry_run={dry_run}")
        print(f"{'='*60}")

        # ── 1. Fetch matches ──
        t0 = time.perf_counter()
        matches = fetch_matches(conn, sport)
        print(f"📊 {len(matches):,} matches fetched in {time.perf_counter()-t0:.1f}s")
        if not matches:
            print("⚠️ No matches found — nothing to do.")
            return

        # ── 2. Fresh ELO system (no preloaded ratings) ──
        # __new__ bypasses __init__ so the replay starts from empty state;
        # the attributes the replay relies on are set by hand below.
        elo = ELORatingSystem.__new__(ELORatingSystem)
        elo.ratings = {}
        elo.league_cache = {}
        # NOTE(review): the ELO system holds the live connection; confirm that
        # update_after_match performs no DB writes when dry_run is set.
        elo.conn = conn

        # ── 3. Chronological replay ──
        feature_buf = []
        processed = 0
        features_written = 0
        t_start = time.perf_counter()

        def form_to_score(form: str) -> float:
            """Convert WDLWW form string to 0-100 float (matches existing DB convention)."""
            if not form:
                # No history yet: neutral midpoint.
                return 50.0
            points = sum(1.0 if c == 'W' else 0.5 if c == 'D' else 0.0 for c in form)
            return (points / len(form)) * 100.0

        for row in matches:
            match_id, home_id, away_id, score_h, score_a, h_name, a_name, league = row
            if not home_id or not away_id:
                # Cannot rate a match with a missing team reference.
                continue

            # Snapshot PRE-match ELO
            home_rating = elo.get_or_create_rating(home_id, h_name or "")
            away_rating = elo.get_or_create_rating(away_id, a_name or "")
            feature_buf.append((
                match_id,
                round(home_rating.overall_elo, 2),
                round(away_rating.overall_elo, 2),
                round(form_to_score(home_rating.recent_form), 2),
                round(form_to_score(away_rating.recent_form), 2),
                CALCULATOR_VER,
            ))

            # Update ELO after the match
            elo.update_after_match(
                home_id, away_id, score_h, score_a,
                h_name or "", a_name or "", league or "",
            )
            processed += 1

            # Flush batch
            if len(feature_buf) >= batch_size:
                flush_features_batch(conn, feature_buf, dry_run, sport)
                features_written += len(feature_buf)
                feature_buf.clear()

            if processed % 10_000 == 0:
                elapsed = time.perf_counter() - t_start
                rate = processed / elapsed if elapsed > 0 else 0
                print(f" {processed:>8,} / {len(matches):,} processed "
                      f"({rate:,.0f} matches/s) "
                      f"teams={len(elo.ratings)}")

        # Flush remaining
        if feature_buf:
            flush_features_batch(conn, feature_buf, dry_run, sport)
            features_written += len(feature_buf)

        elapsed = time.perf_counter() - t_start
        print(f"\n✅ Replay complete: {processed:,} matches in {elapsed:.1f}s")
        table_name = 'football_ai_features' if sport == 'football' else 'basketball_ai_features'
        # In dry-run the flushes are no-ops, so do not claim rows were written.
        verb = "would be written (dry-run)" if dry_run else "written"
        print(f" {features_written:,} {table_name} rows {verb}")
        print(f" {len(elo.ratings):,} teams rated")

        # ── 4. Persist final team ELO state ──
        if not dry_run:
            elo.save_ratings_to_db()
            elo.save_ratings()
            print("💾 team_elo_ratings + JSON saved")
        else:
            print("🔸 DRY-RUN: no DB writes performed")

        # ── 5. Show top teams ──
        elo._show_top_teams(10)
    finally:
        conn.close()
def _positive_int(value: str) -> int:
    """argparse ``type`` callable: parse ``value`` as an int and require it to be >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main():
    """Parse CLI options and run the backfill for each requested sport.

    ``--sport all`` expands to football followed by basketball. A
    ``--batch-size`` below 1 is rejected by the parser, because the replay
    flushes whenever the buffer length reaches the batch size and a
    non-positive value would flush and commit after every single match.
    """
    parser = argparse.ArgumentParser(
        description="ELO Backfill — chronological replay → match_ai_features & team_elo_ratings"
    )
    parser.add_argument(
        "--sport",
        choices=["football", "basketball", "all"],
        default="football",
        help="Sport to compute ELO for (default: football)",
    )
    parser.add_argument(
        "--batch-size",
        type=_positive_int,
        default=DEFAULT_BATCH_SIZE,
        help=f"DB insert batch size (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Run replay without writing to DB",
    )
    args = parser.parse_args()

    sports = ["football", "basketball"] if args.sport == "all" else [args.sport]
    for sport in sports:
        backfill(sport, args.batch_size, args.dry_run)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()