460 lines
16 KiB
Python
460 lines
16 KiB
Python
#!/usr/bin/env python3
"""
AI Features Full Enrichment Script
====================================
Fills empty/default columns in football_ai_features that were not populated
by the original elo_backfill_v1 script.

Enriches: H2H, referee, team_stats, league_averages, form_streaks,
rolling_goals, implied_odds, and clean_sheet/scoring rates.

Usage:
    python scripts/enrich_ai_features.py                    # enrich all
    python scripts/enrich_ai_features.py --batch-size 500   # smaller batches
    python scripts/enrich_ai_features.py --dry-run          # preview only
    python scripts/enrich_ai_features.py --force            # re-enrich all rows
    python scripts/enrich_ai_features.py --limit 1000       # process N rows max

Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import sys
|
||
import time
|
||
import argparse
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
# Add ai-engine root to path
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor, execute_values
|
||
|
||
from data.db import get_clean_dsn
|
||
from services.feature_enrichment import FeatureEnrichmentService
|
||
|
||
# ────────────────────────── constants ──────────────────────────
|
||
|
||
# Version tag stored in the calculator_ver column of every row this script enriches.
CALCULATOR_VER = 'enrichment_v2.0'
# Default number of enriched rows buffered before each bulk upsert
# (used as the default of the --batch-size CLI option).
DEFAULT_BATCH_SIZE = 200
|
||
|
||
|
||
# ────────────────────────── helpers ────────────────────────────
|
||
|
||
def fetch_unenriched_matches(
    conn: psycopg2.extensions.connection,
    force: bool = False,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch finished football matches whose feature row still needs enrichment.

    A row counts as "not enriched" when both h2h_total and referee_avg_cards
    are still 0. Only matches with status 'FT', a non-NULL home score and
    sport 'football' are considered; results are ordered by kickoff
    (mst_utc) ascending.

    Args:
        conn: Open psycopg2 connection.
        force: When True, return every eligible row regardless of whether
            it has already been enriched.
        limit: Maximum number of rows to return. ``None`` and ``0`` both
            mean "no limit" (same as the previous behaviour).

    Returns:
        List of dict-like rows with the keys match_id, home_team_id,
        away_team_id, mst_utc, league_id, score_home and score_away.
    """
    predicates = [
        "m.status = 'FT'",
        "m.score_home IS NOT NULL",
        "m.sport = 'football'",
    ]
    if not force:
        # Both columns still at their default => row was never enriched.
        predicates.append("(faf.h2h_total = 0 AND faf.referee_avg_cards = 0)")
    where_sql = "\n          AND ".join(predicates)

    query = f"""
        SELECT
            faf.match_id,
            m.home_team_id,
            m.away_team_id,
            m.mst_utc,
            m.league_id,
            m.score_home,
            m.score_away
        FROM football_ai_features faf
        JOIN matches m ON m.id = faf.match_id
        WHERE {where_sql}
        ORDER BY m.mst_utc ASC
    """

    # Bind LIMIT as a query parameter instead of formatting it into the SQL.
    # params stays None when there is no limit so the driver performs no
    # placeholder substitution at all.
    params: Optional[Tuple[int]] = None
    if limit:
        query += "        LIMIT %s\n"
        params = (limit,)

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, params)
        return cur.fetchall()
|
||
|
||
|
||
def fetch_referee_for_match(
    cur: RealDictCursor,
    match_id: str,
) -> Optional[str]:
    """
    Look up the name of the official with role_id = 1 for a match.

    Best-effort: any exception raised while querying or reading the row is
    swallowed and reported as "no referee".

    Args:
        cur: Cursor returning dict-like rows.
        match_id: Identifier of the match in match_officials.

    Returns:
        The official's name, or None when no row exists or the lookup failed.
    """
    referee_sql = """
        SELECT mo.name
        FROM match_officials mo
        WHERE mo.match_id = %s
          AND mo.role_id = 1
        LIMIT 1
    """
    # NOTE(review): with psycopg2 a failed statement presumably leaves the
    # surrounding transaction aborted until a rollback — confirm the caller
    # handles that.
    try:
        cur.execute(referee_sql, (match_id,))
        official = cur.fetchone()
        if not official:
            return None
        return official['name']
    except Exception:
        return None
|
||
|
||
|
||
def _collect_market_prices(rows: List[Dict[str, Any]]) -> Dict[str, float]:
    """Extract the decimal prices of the 1X2, over/under 2.5 and BTTS markets from odds rows."""
    prices: Dict[str, float] = {}
    for row in rows:
        try:
            category = (row.get('cat_name') or '').lower().strip()
            selection = (row.get('sel_name') or '').strip()
            price = float(row.get('odd_value', 0))
            if price <= 0:
                continue

            if category == 'maç sonucu':
                # Match result: '1' home, '0' or 'X' draw, '2' away.
                if selection == '1':
                    prices['ms_h'] = price
                elif selection in ('0', 'X'):
                    prices['ms_d'] = price
                elif selection == '2':
                    prices['ms_a'] = price
            elif category == '2,5 alt/üst':
                # Over/under 2.5 goals: 'üst' = over, 'alt' = under.
                if 'üst' in selection.lower():
                    prices['ou25_o'] = price
                elif 'alt' in selection.lower():
                    prices['ou25_u'] = price
            elif category == 'karşılıklı gol':
                # Both teams to score: 'var' = yes, 'yok' = no.
                if 'var' in selection.lower():
                    prices['btts_y'] = price
                elif 'yok' in selection.lower():
                    prices['btts_n'] = price
        except (ValueError, TypeError):
            # Unparseable odd value (e.g. None or non-numeric text): skip the row.
            continue
    return prices


def _fair_probabilities(*prices: float) -> Optional[Tuple[List[float], float]]:
    """Normalise inverse prices to sum to 1; return (probabilities, raw inverse sum) or None."""
    # A market is only usable when every outcome has a price above 1.0.
    if not all(price > 1.0 for price in prices):
        return None
    inverse = [1 / price for price in prices]
    # Plain left-to-right accumulation keeps the float result identical to
    # writing the sum out as 1/a + 1/b + ...
    raw_sum = 0.0
    for share in inverse:
        raw_sum += share
    return [round(share / raw_sum, 4) for share in inverse], raw_sum


def fetch_implied_odds(
    cur: RealDictCursor,
    match_id: str,
) -> Dict[str, float]:
    """
    Derive bookmaker-implied probabilities for a match from its stored odds.

    Reads every selection joined through odd_categories for the match and
    evaluates three markets: match result ('maç sonucu'), over/under 2.5
    ('2,5 alt/üst') and both-teams-to-score ('karşılıklı gol'). For each
    market whose outcomes all have a price above 1.0, the inverse prices
    are normalised to sum to 1 and rounded to 4 decimals. When the same
    selection appears more than once, the last row wins.

    Best-effort: if the query raises, the neutral defaults are returned.

    Args:
        cur: Cursor returning dict-like rows.
        match_id: Identifier of the match in odd_categories.

    Returns:
        Dict with implied_home, implied_draw, implied_away, implied_over25,
        implied_btts_yes and odds_overround. Markets that are missing or
        incomplete keep their defaults (0.33 / 0.33 / 0.33, 0.50, 0.50 and
        0.0 respectively). odds_overround is the raw inverse-price sum of
        the match-result market minus 1.
    """
    implied: Dict[str, float] = {
        'implied_home': 0.33,
        'implied_draw': 0.33,
        'implied_away': 0.33,
        'implied_over25': 0.50,
        'implied_btts_yes': 0.50,
        'odds_overround': 0.0,
    }

    # NOTE(review): with psycopg2 a failed statement presumably leaves the
    # surrounding transaction aborted until a rollback — confirm the caller
    # handles that.
    try:
        cur.execute("""
            SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            WHERE oc.match_id = %s
        """, (match_id,))
        rows = cur.fetchall()
    except Exception:
        return implied

    prices = _collect_market_prices(rows)

    match_result = _fair_probabilities(
        prices.get('ms_h', 0),
        prices.get('ms_d', 0),
        prices.get('ms_a', 0),
    )
    if match_result is not None:
        (home, draw, away), raw_sum = match_result
        implied['implied_home'] = home
        implied['implied_draw'] = draw
        implied['implied_away'] = away
        implied['odds_overround'] = round(raw_sum - 1.0, 4)

    over_under = _fair_probabilities(
        prices.get('ou25_o', 0),
        prices.get('ou25_u', 0),
    )
    if over_under is not None:
        implied['implied_over25'] = over_under[0][0]

    btts = _fair_probabilities(
        prices.get('btts_y', 0),
        prices.get('btts_n', 0),
    )
    if btts is not None:
        implied['implied_btts_yes'] = btts[0][0]

    return implied
|
||
|
||
|
||
def enrich_single_match(
    enrichment: FeatureEnrichmentService,
    cur: RealDictCursor,
    match: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build the full set of enrichment features for one match.

    Args:
        enrichment: Service providing the compute_* feature calculators.
        cur: Cursor handed to every calculator and lookup.
        match: Row with match_id, home_team_id, away_team_id, mst_utc and
            league_id.

    Returns:
        Dict keyed by football_ai_features column name, ready for upsert.
    """
    mid = match['match_id']
    home_team = str(match['home_team_id'])
    away_team = str(match['away_team_id'])
    # A missing kickoff timestamp falls back to 0; a missing league to None.
    kickoff = int(match['mst_utc']) if match['mst_utc'] else 0
    competition = str(match['league_id']) if match['league_id'] else None

    # Gather raw inputs: team stats, H2H, form, referee, league, rolling, odds.
    stats_home = enrichment.compute_team_stats(cur, home_team, kickoff)
    stats_away = enrichment.compute_team_stats(cur, away_team, kickoff)
    head_to_head = enrichment.compute_h2h(cur, home_team, away_team, kickoff)
    form_home = enrichment.compute_form_streaks(cur, home_team, kickoff)
    form_away = enrichment.compute_form_streaks(cur, away_team, kickoff)
    official = fetch_referee_for_match(cur, mid)
    referee_stats = enrichment.compute_referee_stats(cur, official, kickoff)
    league_stats = enrichment.compute_league_averages(cur, competition, kickoff)
    rolling_home = enrichment.compute_rolling_stats(cur, home_team, kickoff)
    rolling_away = enrichment.compute_rolling_stats(cur, away_team, kickoff)
    market = fetch_implied_odds(cur, mid)

    features: Dict[str, Any] = {'match_id': mid}

    # Team stats: (column suffix == source key, rounding digits).
    for metric, digits in (
        ('avg_possession', 2),
        ('avg_shots_on_target', 2),
        ('shot_conversion', 4),
        ('avg_corners', 2),
    ):
        features[f'home_{metric}'] = round(stats_home[metric], digits)
        features[f'away_{metric}'] = round(stats_away[metric], digits)

    # Head-to-head: the match count is stored as-is, the rest is rounded.
    features['h2h_total'] = head_to_head['total_matches']
    for column, source, digits in (
        ('h2h_home_win_rate', 'home_win_rate', 4),
        ('h2h_avg_goals', 'avg_goals', 2),
        ('h2h_over25_rate', 'over25_rate', 4),
        ('h2h_btts_rate', 'btts_rate', 4),
    ):
        features[column] = round(head_to_head[source], digits)

    # Form rates, then the (unrounded) winning streaks.
    for metric in ('clean_sheet_rate', 'scoring_rate'):
        features[f'home_{metric}'] = round(form_home[metric], 4)
        features[f'away_{metric}'] = round(form_away[metric], 4)
    features['home_win_streak'] = form_home['winning_streak']
    features['away_win_streak'] = form_away['winning_streak']

    # Rolling five-match goal averages.
    for column_stem, source in (
        ('goals_avg_5', 'rolling5_goals'),
        ('conceded_avg_5', 'rolling5_conceded'),
    ):
        features[f'home_{column_stem}'] = round(rolling_home[source], 2)
        features[f'away_{column_stem}'] = round(rolling_away[source], 2)

    # Referee tendencies.
    for column, source, digits in (
        ('referee_avg_cards', 'cards_total', 2),
        ('referee_home_bias', 'home_bias', 4),
        ('referee_avg_goals', 'avg_goals', 2),
    ):
        features[column] = round(referee_stats[source], digits)

    # League-wide baselines.
    for column, source, digits in (
        ('league_avg_goals', 'avg_goals', 2),
        ('league_home_win_pct', 'home_win_rate', 4),
        ('league_over25_pct', 'ou25_rate', 4),
    ):
        features[column] = round(league_stats[source], digits)

    # Implied odds are copied through unchanged.
    for column in (
        'implied_home',
        'implied_draw',
        'implied_away',
        'implied_over25',
        'implied_btts_yes',
        'odds_overround',
    ):
        features[column] = market[column]

    # Missing players impact — default (no lineup data for historical)
    features['missing_players_impact'] = 0.0
    # Version
    features['calculator_ver'] = CALCULATOR_VER

    return features
|
||
|
||
|
||
# Columns written by flush_enrichment_batch, in INSERT order.
# match_id is the conflict key; every other column is overwritten on conflict.
_ENRICHMENT_COLUMNS: Tuple[str, ...] = (
    'match_id',
    'home_avg_possession', 'away_avg_possession',
    'home_avg_shots_on_target', 'away_avg_shots_on_target',
    'home_shot_conversion', 'away_shot_conversion',
    'home_avg_corners', 'away_avg_corners',
    'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
    'h2h_over25_rate', 'h2h_btts_rate',
    'home_clean_sheet_rate', 'away_clean_sheet_rate',
    'home_scoring_rate', 'away_scoring_rate',
    'home_win_streak', 'away_win_streak',
    'home_goals_avg_5', 'away_goals_avg_5',
    'home_conceded_avg_5', 'away_conceded_avg_5',
    'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
    'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
    'implied_home', 'implied_draw', 'implied_away',
    'implied_over25', 'implied_btts_yes', 'odds_overround',
    'missing_players_impact', 'calculator_ver',
)


def flush_enrichment_batch(
    conn: psycopg2.extensions.connection,
    rows: List[Dict[str, Any]],
    dry_run: bool,
) -> int:
    """
    Bulk upsert enriched features into football_ai_features.

    Rows are inserted with ON CONFLICT (match_id) DO UPDATE, overwriting
    every enrichment column and setting updated_at to NOW(). The write runs
    inside a connection-level transaction block: it is committed on success
    and rolled back if the statement fails, after which the exception
    propagates to the caller.

    Args:
        conn: Open psycopg2 connection.
        rows: Feature dicts; each must contain every enrichment column.
        dry_run: When True nothing is written.

    Returns:
        Number of rows written; 0 when ``rows`` is empty or ``dry_run`` is set.
    """
    if not rows or dry_run:
        return 0

    columns = _ENRICHMENT_COLUMNS
    # Every column except the conflict key is refreshed from the new row.
    set_clause = ', '.join(
        f'{col} = EXCLUDED.{col}' for col in columns if col != 'match_id'
    )
    placeholders = ', '.join(['%s'] * len(columns))
    values = [tuple(row[col] for col in columns) for row in rows]

    # `with conn` commits on success and rolls back on error; it does not
    # close the connection.
    with conn:
        with conn.cursor() as cur:
            execute_values(
                cur,
                f"""
                INSERT INTO football_ai_features ({', '.join(columns)})
                VALUES %s
                ON CONFLICT (match_id) DO UPDATE SET
                    {set_clause},
                    updated_at = NOW()
                """,
                values,
                template=f"({placeholders})",
                page_size=200,
            )
    return len(rows)
|
||
|
||
|
||
# ────────────────────────── main ───────────────────────────────
|
||
|
||
def run_enrichment(
    batch_size: int,
    dry_run: bool,
    force: bool,
    limit: Optional[int],
) -> None:
    """
    Core enrichment loop.

    Connects to the database, fetches the matches that need enrichment,
    computes features for each one and upserts them in batches, printing
    progress every 500 matches and a summary at the end.

    A failure while enriching a single match is counted and the first ten
    are printed; the match is skipped and the transaction is rolled back so
    that later matches can still query the database. If that rollback
    itself fails (e.g. the connection is gone) the error propagates. The
    connection is always closed, including when an unexpected error
    escapes.

    Args:
        batch_size: Number of enriched rows buffered before each upsert.
        dry_run: Compute features but write nothing (written count stays 0).
        force: Re-enrich every eligible row, not only unenriched ones.
        limit: Maximum number of matches to process; None for no limit.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)

    try:
        print(f"\n{'=' * 60}")
        print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
        print(f" batch_size={batch_size} dry_run={dry_run} force={force}")
        print(f"{'=' * 60}")

        # 1. Fetch unenriched matches
        t0 = time.time()
        matches = fetch_unenriched_matches(conn, force=force, limit=limit)
        print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")

        if not matches:
            print("✅ Nothing to enrich — all rows already populated.")
            return

        # 2. Initialize enrichment service
        enrichment = FeatureEnrichmentService()

        # 3. Process in batches
        total = len(matches)
        processed = 0
        written = 0
        errors = 0
        batch_buf: List[Dict[str, Any]] = []
        t_start = time.time()

        # One dedicated dict cursor serves every enrichment query.
        with conn.cursor(cursor_factory=RealDictCursor) as enrich_cur:
            for match in matches:
                try:
                    batch_buf.append(
                        enrich_single_match(enrichment, enrich_cur, match)
                    )
                except Exception as e:
                    errors += 1
                    # A failed statement leaves the psycopg2 transaction
                    # aborted; without a rollback every later query on this
                    # connection would fail too. Nothing uncommitted is lost:
                    # batches are committed as soon as they are flushed.
                    conn.rollback()
                    if errors <= 10:
                        print(f" ⚠️ Error enriching {match.get('match_id', '?')}: {e}")

                processed += 1

                # Flush batch
                if len(batch_buf) >= batch_size:
                    written += flush_enrichment_batch(conn, batch_buf, dry_run)
                    batch_buf.clear()

                # Progress reporting
                if processed % 500 == 0:
                    elapsed = time.time() - t_start
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (total - processed) / rate if rate > 0 else 0
                    pct = processed / total * 100
                    print(
                        f" [{processed:>8,} / {total:,}] "
                        f"({pct:.1f}%) | {rate:.0f} matches/s | "
                        f"ETA: {remaining / 60:.1f} min | "
                        f"errors: {errors}"
                    )

            # Flush remaining
            if batch_buf:
                written += flush_enrichment_batch(conn, batch_buf, dry_run)

        elapsed = time.time() - t_start
        # Guard against a zero interval on coarse clocks.
        final_rate = processed / elapsed if elapsed > 0 else 0
        print(f"\n{'=' * 60}")
        print("✅ Enrichment complete:")
        print(f" Processed: {processed:,} matches in {elapsed:.1f}s")
        print(f" Written: {written:,} rows")
        print(f" Errors: {errors:,}")
        print(f" Rate: {final_rate:.0f} matches/s")
        print(f"{'=' * 60}")
    finally:
        conn.close()
|
||
|
||
|
||
def main() -> None:
    """Parse the command-line options and start the enrichment run."""
    cli = argparse.ArgumentParser(
        description="Enrich football_ai_features with H2H, referee, stats, and odds data"
    )

    # (flag, keyword arguments for add_argument), registered in this order.
    option_specs = (
        ('--batch-size', {
            'type': int,
            'default': DEFAULT_BATCH_SIZE,
            'help': f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
        }),
        ('--dry-run', {
            'action': 'store_true',
            'help': 'Compute features but do not write to DB',
        }),
        ('--force', {
            'action': 'store_true',
            'help': 'Re-enrich ALL rows, not just empty ones',
        }),
        ('--limit', {
            'type': int,
            'default': None,
            'help': 'Max number of matches to process',
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()
    run_enrichment(
        batch_size=opts.batch_size,
        dry_run=opts.dry_run,
        force=opts.force,
        limit=opts.limit,
    )
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|