feat(ai-engine): value sniper thresholds and logic relaxed
This commit is contained in:
@@ -0,0 +1,459 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AI Features Full Enrichment Script
|
||||
====================================
|
||||
Fills empty/default columns in football_ai_features that were not populated
|
||||
by the original elo_backfill_v1 script.
|
||||
|
||||
Enriches: H2H, referee, team_stats, league_averages, form_streaks,
|
||||
rolling_goals, implied_odds, and clean_sheet/scoring rates.
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_ai_features.py # enrich all
|
||||
python scripts/enrich_ai_features.py --batch-size 500 # smaller batches
|
||||
python scripts/enrich_ai_features.py --dry-run # preview only
|
||||
python scripts/enrich_ai_features.py --force # re-enrich all rows
|
||||
python scripts/enrich_ai_features.py --limit 1000 # process N rows max
|
||||
|
||||
Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# Add ai-engine root to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
from data.db import get_clean_dsn
|
||||
from services.feature_enrichment import FeatureEnrichmentService
|
||||
|
||||
# ────────────────────────── constants ──────────────────────────
|
||||
|
||||
# Version tag stored in the calculator_ver column of every row this script writes.
CALCULATOR_VER = 'enrichment_v2.0'

# Number of enriched rows buffered before a bulk upsert when --batch-size is not given.
DEFAULT_BATCH_SIZE = 200
|
||||
|
||||
|
||||
# ────────────────────────── helpers ────────────────────────────
|
||||
|
||||
def fetch_unenriched_matches(
    conn: psycopg2.extensions.connection,
    force: bool = False,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch finished football matches whose feature rows still need enrichment.

    Only matches with status 'FT', a non-NULL home score and sport 'football'
    are considered. Unless ``force`` is set, a row is selected only when both
    ``h2h_total`` and ``referee_avg_cards`` are still 0.

    Args:
        conn: Open psycopg2 connection.
        force: If True, return every eligible row regardless of its current
            enrichment state.
        limit: Maximum number of rows to return. ``None`` (or 0) means no
            limit, matching the previous behaviour.

    Returns:
        Rows from a RealDictCursor with the keys match_id, home_team_id,
        away_team_id, mst_utc, league_id, score_home and score_away,
        ordered by mst_utc ascending.
    """
    fragments = [
        """
        SELECT
            faf.match_id,
            m.home_team_id,
            m.away_team_id,
            m.mst_utc,
            m.league_id,
            m.score_home,
            m.score_away
        FROM football_ai_features faf
        JOIN matches m ON m.id = faf.match_id
        WHERE m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.sport = 'football'
        """
    ]
    if not force:
        fragments.append(
            "AND (faf.h2h_total = 0 AND faf.referee_avg_cards = 0)"
        )
    fragments.append("ORDER BY m.mst_utc ASC")

    # LIMIT is sent as a bound parameter instead of being formatted into the
    # SQL text, so the statement itself only ever consists of fixed fragments.
    params: Optional[Tuple[Any, ...]] = None
    if limit:
        fragments.append("LIMIT %s")
        params = (limit,)

    query = "\n".join(fragments)
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        if params is None:
            cur.execute(query)
        else:
            cur.execute(query, params)
        return cur.fetchall()
|
||||
|
||||
|
||||
def fetch_referee_for_match(
    cur: RealDictCursor,
    match_id: str,
) -> Optional[str]:
    """
    Look up the name of the official with role_id = 1 for a match.

    The lookup is best-effort: any failure yields ``None`` instead of an
    exception.

    Args:
        cur: Cursor whose rows support key access (e.g. RealDictCursor).
        match_id: Identifier of the match in match_officials.match_id.

    Returns:
        The official's name, or ``None`` when no row exists or the lookup
        failed.
    """
    try:
        cur.execute("""
            SELECT mo.name
            FROM match_officials mo
            WHERE mo.match_id = %s
              AND mo.role_id = 1
            LIMIT 1
        """, (match_id,))
        row = cur.fetchone()
        return row['name'] if row else None
    except psycopg2.Error:
        # A failed statement leaves the psycopg2 transaction in an aborted
        # state; without a rollback every later query on this connection
        # would fail too, defeating the best-effort intent of this helper.
        try:
            cur.connection.rollback()
        except psycopg2.Error:
            # Connection is unusable; the next real query will surface that.
            pass
        return None
    except Exception:
        return None
|
||||
|
||||
|
||||
def _collect_market_odds(rows: List[Dict[str, Any]]) -> Dict[str, float]:
    """Pick the 1X2, over/under 2.5 and BTTS decimal odds out of raw selection rows."""
    odds: Dict[str, float] = {}
    for row in rows:
        try:
            cat = (row.get('cat_name') or '').lower().strip()
            sel = (row.get('sel_name') or '').strip()
            val = float(row.get('odd_value', 0))
            if val <= 0:
                continue

            sel_lower = sel.lower()
            if cat == 'maç sonucu':
                if sel == '1':
                    odds['ms_h'] = val
                elif sel in ('0', 'X'):
                    odds['ms_d'] = val
                elif sel == '2':
                    odds['ms_a'] = val
            elif cat == '2,5 alt/üst':
                if 'üst' in sel_lower:
                    odds['ou25_o'] = val
                elif 'alt' in sel_lower:
                    odds['ou25_u'] = val
            elif cat == 'karşılıklı gol':
                if 'var' in sel_lower:
                    odds['btts_y'] = val
                elif 'yok' in sel_lower:
                    odds['btts_n'] = val
        except (ValueError, TypeError):
            # Unparseable odd value (e.g. None or junk text): skip the row.
            continue
    return odds


def _two_way_share(primary: float, other: float) -> Optional[float]:
    """Return the margin-free probability of ``primary`` in a two-outcome market, or None."""
    if primary > 1.0 and other > 1.0:
        raw_sum = 1 / primary + 1 / other
        return round((1 / primary) / raw_sum, 4)
    return None


def fetch_implied_odds(
    cur: RealDictCursor,
    match_id: str,
) -> Dict[str, float]:
    """
    Derive implied probabilities for a match from its stored decimal odds.

    Reads odd_selections joined to odd_categories for the match and
    normalises the inverse odds of each market so they sum to 1. A market is
    only used when every one of its odds is greater than 1.0; otherwise the
    neutral default for that market is kept.

    Args:
        cur: Cursor whose rows support ``.get`` (e.g. RealDictCursor).
        match_id: Identifier matched against odd_categories.match_id.

    Returns:
        Dict with implied_home, implied_draw, implied_away, implied_over25,
        implied_btts_yes and odds_overround. Defaults are 0.33 for the three
        match-result outcomes, 0.50 for over 2.5 and BTTS, and 0.0 for the
        overround. All computed values are rounded to 4 decimals.
    """
    implied = {
        'implied_home': 0.33,
        'implied_draw': 0.33,
        'implied_away': 0.33,
        'implied_over25': 0.50,
        'implied_btts_yes': 0.50,
        'odds_overround': 0.0,
    }
    try:
        cur.execute("""
            SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            WHERE oc.match_id = %s
        """, (match_id,))
        rows = cur.fetchall()
    except psycopg2.Error:
        # A failed statement aborts the psycopg2 transaction; roll back so
        # that falling back to defaults does not poison every later query
        # issued on the same connection.
        try:
            cur.connection.rollback()
        except psycopg2.Error:
            # Connection is unusable; the next real query will surface that.
            pass
        return implied
    except Exception:
        return implied

    odds = _collect_market_odds(rows)

    # Match result (1X2): normalise and record the bookmaker margin.
    ms_h = odds.get('ms_h', 0)
    ms_d = odds.get('ms_d', 0)
    ms_a = odds.get('ms_a', 0)
    if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0:
        raw_sum = 1 / ms_h + 1 / ms_d + 1 / ms_a
        implied['implied_home'] = round((1 / ms_h) / raw_sum, 4)
        implied['implied_draw'] = round((1 / ms_d) / raw_sum, 4)
        implied['implied_away'] = round((1 / ms_a) / raw_sum, 4)
        implied['odds_overround'] = round(raw_sum - 1.0, 4)

    over25 = _two_way_share(odds.get('ou25_o', 0), odds.get('ou25_u', 0))
    if over25 is not None:
        implied['implied_over25'] = over25

    btts_yes = _two_way_share(odds.get('btts_y', 0), odds.get('btts_n', 0))
    if btts_yes is not None:
        implied['implied_btts_yes'] = btts_yes

    return implied
|
||||
|
||||
|
||||
def enrich_single_match(
    enrichment: FeatureEnrichmentService,
    cur: RealDictCursor,
    match: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build the full enrichment feature row for one match.

    Queries the enrichment service for team stats, H2H, form, referee,
    league and rolling figures, adds implied odds, and returns a dict keyed
    by football_ai_features column names, ready for the bulk upsert.
    """
    match_id = match['match_id']
    home_team = str(match['home_team_id'])
    away_team = str(match['away_team_id'])
    kickoff = int(match['mst_utc']) if match['mst_utc'] else 0
    league_key = str(match['league_id']) if match['league_id'] else None

    # Gather every input first (same query order as before), then assemble.
    stats_home = enrichment.compute_team_stats(cur, home_team, kickoff)
    stats_away = enrichment.compute_team_stats(cur, away_team, kickoff)
    head_to_head = enrichment.compute_h2h(cur, home_team, away_team, kickoff)
    form_home = enrichment.compute_form_streaks(cur, home_team, kickoff)
    form_away = enrichment.compute_form_streaks(cur, away_team, kickoff)
    referee_name = fetch_referee_for_match(cur, match_id)
    referee_stats = enrichment.compute_referee_stats(cur, referee_name, kickoff)
    league_avgs = enrichment.compute_league_averages(cur, league_key, kickoff)
    rolling_home = enrichment.compute_rolling_stats(cur, home_team, kickoff)
    rolling_away = enrichment.compute_rolling_stats(cur, away_team, kickoff)
    market = fetch_implied_odds(cur, match_id)

    features: Dict[str, Any] = {'match_id': match_id}

    # Team stats: (home column, away column, source key, decimals)
    for home_col, away_col, source, digits in (
        ('home_avg_possession', 'away_avg_possession', 'avg_possession', 2),
        ('home_avg_shots_on_target', 'away_avg_shots_on_target', 'avg_shots_on_target', 2),
        ('home_shot_conversion', 'away_shot_conversion', 'shot_conversion', 4),
        ('home_avg_corners', 'away_avg_corners', 'avg_corners', 2),
    ):
        features[home_col] = round(stats_home[source], digits)
        features[away_col] = round(stats_away[source], digits)

    # H2H
    features['h2h_total'] = head_to_head['total_matches']
    for column, source, digits in (
        ('h2h_home_win_rate', 'home_win_rate', 4),
        ('h2h_avg_goals', 'avg_goals', 2),
        ('h2h_over25_rate', 'over25_rate', 4),
        ('h2h_btts_rate', 'btts_rate', 4),
    ):
        features[column] = round(head_to_head[source], digits)

    # Form
    for home_col, away_col, source in (
        ('home_clean_sheet_rate', 'away_clean_sheet_rate', 'clean_sheet_rate'),
        ('home_scoring_rate', 'away_scoring_rate', 'scoring_rate'),
    ):
        features[home_col] = round(form_home[source], 4)
        features[away_col] = round(form_away[source], 4)
    features['home_win_streak'] = form_home['winning_streak']
    features['away_win_streak'] = form_away['winning_streak']

    # Rolling goals
    for home_col, away_col, source in (
        ('home_goals_avg_5', 'away_goals_avg_5', 'rolling5_goals'),
        ('home_conceded_avg_5', 'away_conceded_avg_5', 'rolling5_conceded'),
    ):
        features[home_col] = round(rolling_home[source], 2)
        features[away_col] = round(rolling_away[source], 2)

    # Referee
    features['referee_avg_cards'] = round(referee_stats['cards_total'], 2)
    features['referee_home_bias'] = round(referee_stats['home_bias'], 4)
    features['referee_avg_goals'] = round(referee_stats['avg_goals'], 2)

    # League
    features['league_avg_goals'] = round(league_avgs['avg_goals'], 2)
    features['league_home_win_pct'] = round(league_avgs['home_win_rate'], 4)
    features['league_over25_pct'] = round(league_avgs['ou25_rate'], 4)

    # Implied odds are copied through unchanged.
    for column in (
        'implied_home',
        'implied_draw',
        'implied_away',
        'implied_over25',
        'implied_btts_yes',
        'odds_overround',
    ):
        features[column] = market[column]

    # Missing players impact — default (no lineup data for historical)
    features['missing_players_impact'] = 0.0
    # Version
    features['calculator_ver'] = CALCULATOR_VER
    return features
|
||||
|
||||
|
||||
def flush_enrichment_batch(
    conn: psycopg2.extensions.connection,
    rows: List[Dict[str, Any]],
    dry_run: bool,
) -> int:
    """
    Bulk upsert enriched features into football_ai_features.

    Rows are inserted with ON CONFLICT (match_id) DO UPDATE, overwriting
    every enrichment column and setting updated_at to NOW(). The write runs
    in its own transaction: it is committed on success and rolled back if
    the statement fails, so a failed flush does not leave the connection in
    an aborted transaction.

    Args:
        conn: Open psycopg2 connection.
        rows: Feature dicts; each must contain every column listed below.
        dry_run: If True nothing is written.

    Returns:
        Number of rows sent to the database; 0 when ``rows`` is empty or
        ``dry_run`` is set.

    Raises:
        KeyError: If a row lacks one of the expected columns.
        psycopg2.Error: If the upsert fails (after the rollback).
    """
    if not rows or dry_run:
        return 0

    columns = [
        'match_id',
        'home_avg_possession', 'away_avg_possession',
        'home_avg_shots_on_target', 'away_avg_shots_on_target',
        'home_shot_conversion', 'away_shot_conversion',
        'home_avg_corners', 'away_avg_corners',
        'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
        'h2h_over25_rate', 'h2h_btts_rate',
        'home_clean_sheet_rate', 'away_clean_sheet_rate',
        'home_scoring_rate', 'away_scoring_rate',
        'home_win_streak', 'away_win_streak',
        'home_goals_avg_5', 'away_goals_avg_5',
        'home_conceded_avg_5', 'away_conceded_avg_5',
        'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
        'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
        'implied_home', 'implied_draw', 'implied_away',
        'implied_over25', 'implied_btts_yes', 'odds_overround',
        'missing_players_impact', 'calculator_ver',
    ]

    # Build update SET clause (skip match_id, the conflict key)
    set_clause = ', '.join(
        f'{c} = EXCLUDED.{c}' for c in columns if c != 'match_id'
    )
    placeholders = ', '.join(['%s'] * len(columns))
    values = [tuple(row[c] for c in columns) for row in rows]

    # `with conn` is psycopg2's transaction block: commit on normal exit,
    # rollback if the body raises. It does not close the connection.
    with conn:
        with conn.cursor() as cur:
            execute_values(
                cur,
                f"""
                INSERT INTO football_ai_features ({', '.join(columns)})
                VALUES %s
                ON CONFLICT (match_id) DO UPDATE SET
                    {set_clause},
                    updated_at = NOW()
                """,
                values,
                template=f"({placeholders})",
                page_size=200,
            )
    return len(rows)
|
||||
|
||||
|
||||
# ────────────────────────── main ───────────────────────────────
|
||||
|
||||
def run_enrichment(
    batch_size: int,
    dry_run: bool,
    force: bool,
    limit: Optional[int],
) -> None:
    """
    Core enrichment loop.

    Connects to the database, fetches the matches that need enrichment,
    computes features match by match and upserts them in batches, printing
    progress every 500 matches and a summary at the end.

    Args:
        batch_size: Number of enriched rows buffered before each upsert.
        dry_run: Compute features but write nothing.
        force: Re-enrich every eligible row, not only unenriched ones.
        limit: Maximum number of matches to process (None for all).
    """
    conn = psycopg2.connect(get_clean_dsn())
    # try/finally guarantees the connection is closed even when a flush or
    # the initial fetch raises.
    try:
        print(f"\n{'=' * 60}")
        print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
        print(f" batch_size={batch_size} dry_run={dry_run} force={force}")
        print(f"{'=' * 60}")

        # 1. Fetch unenriched matches
        t0 = time.time()
        matches = fetch_unenriched_matches(conn, force=force, limit=limit)
        print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")

        if not matches:
            print("✅ Nothing to enrich — all rows already populated.")
            return

        # 2. Initialize enrichment service
        enrichment = FeatureEnrichmentService()

        # 3. Process in batches
        total = len(matches)
        processed = 0
        written = 0
        errors = 0
        batch_buf: List[Dict[str, Any]] = []
        t_start = time.time()

        # One RealDictCursor is shared by all enrichment queries.
        with conn.cursor(cursor_factory=RealDictCursor) as enrich_cur:
            for match in matches:
                try:
                    batch_buf.append(
                        enrich_single_match(enrichment, enrich_cur, match)
                    )
                except Exception as e:
                    errors += 1
                    if errors <= 10:
                        print(f" ⚠️ Error enriching {match.get('match_id', '?')}: {e}")
                    # A failed query leaves the transaction aborted, which
                    # would make every following match fail as well. Nothing
                    # is pending here (each flush commits), so a rollback is
                    # safe and restores the connection.
                    conn.rollback()

                processed += 1

                # Flush batch
                if len(batch_buf) >= batch_size:
                    written += flush_enrichment_batch(conn, batch_buf, dry_run)
                    batch_buf.clear()

                # Progress reporting
                if processed % 500 == 0:
                    elapsed = time.time() - t_start
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (total - processed) / rate if rate > 0 else 0
                    pct = processed / total * 100
                    print(
                        f" [{processed:>8,} / {total:,}] "
                        f"({pct:.1f}%) | {rate:.0f} matches/s | "
                        f"ETA: {remaining / 60:.1f} min | "
                        f"errors: {errors}"
                    )

            # Flush remaining
            if batch_buf:
                written += flush_enrichment_batch(conn, batch_buf, dry_run)

        elapsed = time.time() - t_start
        # Guard against a zero interval on coarse clocks / tiny runs.
        final_rate = processed / elapsed if elapsed > 0 else 0
        print(f"\n{'=' * 60}")
        print(f"✅ Enrichment complete:")
        print(f" Processed: {processed:,} matches in {elapsed:.1f}s")
        print(f" Written: {written:,} rows")
        print(f" Errors: {errors:,}")
        print(f" Rate: {final_rate:.0f} matches/s")
        print(f"{'=' * 60}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command-line flags and start the enrichment run."""
    cli = argparse.ArgumentParser(
        description="Enrich football_ai_features with H2H, referee, stats, and odds data"
    )

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        ('--batch-size', {
            'type': int,
            'default': DEFAULT_BATCH_SIZE,
            'help': f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
        }),
        ('--dry-run', {
            'action': 'store_true',
            'help': 'Compute features but do not write to DB',
        }),
        ('--force', {
            'action': 'store_true',
            'help': 'Re-enrich ALL rows, not just empty ones',
        }),
        ('--limit', {
            'type': int,
            'default': None,
            'help': 'Max number of matches to process',
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # The namespace holds exactly batch_size, dry_run, force and limit,
    # which are run_enrichment's keyword parameters.
    run_enrichment(**vars(cli.parse_args()))


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user