feat(ai-engine): value sniper thresholds and logic relaxed

This commit is contained in:
2026-05-06 17:44:45 +03:00
parent 5b5f83c8cf
commit 4f7090e2d9
13 changed files with 2040 additions and 382 deletions
+459
View File
@@ -0,0 +1,459 @@
#!/usr/bin/env python3
"""
AI Features Full Enrichment Script
====================================
Fills empty/default columns in football_ai_features that were not populated
by the original elo_backfill_v1 script.
Enriches: H2H, referee, team_stats, league_averages, form_streaks,
rolling_goals, implied_odds, and clean_sheet/scoring rates.
Usage:
python scripts/enrich_ai_features.py # enrich all
python scripts/enrich_ai_features.py --batch-size 500 # smaller batches
python scripts/enrich_ai_features.py --dry-run # preview only
python scripts/enrich_ai_features.py --force # re-enrich all rows
python scripts/enrich_ai_features.py --limit 1000 # process N rows max
Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
"""
from __future__ import annotations
import os
import sys
import time
import argparse
from typing import Any, Dict, List, Optional, Tuple
# Add ai-engine root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
from data.db import get_clean_dsn
from services.feature_enrichment import FeatureEnrichmentService
# ────────────────────────── constants ──────────────────────────
CALCULATOR_VER = 'enrichment_v2.0'
DEFAULT_BATCH_SIZE = 200
# ────────────────────────── helpers ────────────────────────────
def fetch_unenriched_matches(
conn: psycopg2.extensions.connection,
force: bool = False,
limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
"""
Fetch matches from football_ai_features that still have default values
in the enrichment columns (h2h_total=0 AND referee_avg_cards=0).
If force=True, fetches ALL rows regardless of current state.
"""
with conn.cursor(cursor_factory=RealDictCursor) as cur:
where_clause = "WHERE 1=1" if force else (
"WHERE (faf.h2h_total = 0 AND faf.referee_avg_cards = 0)"
)
limit_clause = f"LIMIT {limit}" if limit else ""
cur.execute(f"""
SELECT
faf.match_id,
m.home_team_id,
m.away_team_id,
m.mst_utc,
m.league_id,
m.score_home,
m.score_away
FROM football_ai_features faf
JOIN matches m ON m.id = faf.match_id
WHERE m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.sport = 'football'
AND ({where_clause.replace('WHERE ', '')})
ORDER BY m.mst_utc ASC
{limit_clause}
""")
return cur.fetchall()
def fetch_referee_for_match(
cur: RealDictCursor,
match_id: str,
) -> Optional[str]:
"""Get the head referee name for a match from match_officials."""
try:
cur.execute("""
SELECT mo.name
FROM match_officials mo
WHERE mo.match_id = %s
AND mo.role_id = 1
LIMIT 1
""", (match_id,))
row = cur.fetchone()
return row['name'] if row else None
except Exception:
return None
def fetch_implied_odds(
cur: RealDictCursor,
match_id: str,
) -> Dict[str, float]:
"""Get implied probabilities from odd_categories + odd_selections."""
defaults = {
'implied_home': 0.33,
'implied_draw': 0.33,
'implied_away': 0.33,
'implied_over25': 0.50,
'implied_btts_yes': 0.50,
'odds_overround': 0.0,
}
try:
cur.execute("""
SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
WHERE oc.match_id = %s
""", (match_id,))
rows = cur.fetchall()
except Exception:
return defaults
odds: Dict[str, float] = {}
for row in rows:
try:
cat = (row.get('cat_name') or '').lower().strip()
sel = (row.get('sel_name') or '').strip()
val = float(row.get('odd_value', 0))
if val <= 0:
continue
if cat == 'maç sonucu':
if sel == '1':
odds['ms_h'] = val
elif sel in ('0', 'X'):
odds['ms_d'] = val
elif sel == '2':
odds['ms_a'] = val
elif cat == '2,5 alt/üst':
if 'üst' in sel.lower():
odds['ou25_o'] = val
elif 'alt' in sel.lower():
odds['ou25_u'] = val
elif cat == 'karşılıklı gol':
if 'var' in sel.lower():
odds['btts_y'] = val
elif 'yok' in sel.lower():
odds['btts_n'] = val
except (ValueError, TypeError):
continue
# Compute implied probabilities
ms_h = odds.get('ms_h', 0)
ms_d = odds.get('ms_d', 0)
ms_a = odds.get('ms_a', 0)
if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0:
raw_sum = 1 / ms_h + 1 / ms_d + 1 / ms_a
overround = raw_sum - 1.0
defaults['implied_home'] = round((1 / ms_h) / raw_sum, 4)
defaults['implied_draw'] = round((1 / ms_d) / raw_sum, 4)
defaults['implied_away'] = round((1 / ms_a) / raw_sum, 4)
defaults['odds_overround'] = round(overround, 4)
ou25_o = odds.get('ou25_o', 0)
ou25_u = odds.get('ou25_u', 0)
if ou25_o > 1.0 and ou25_u > 1.0:
raw_sum = 1 / ou25_o + 1 / ou25_u
defaults['implied_over25'] = round((1 / ou25_o) / raw_sum, 4)
btts_y = odds.get('btts_y', 0)
btts_n = odds.get('btts_n', 0)
if btts_y > 1.0 and btts_n > 1.0:
raw_sum = 1 / btts_y + 1 / btts_n
defaults['implied_btts_yes'] = round((1 / btts_y) / raw_sum, 4)
return defaults
def enrich_single_match(
enrichment: FeatureEnrichmentService,
cur: RealDictCursor,
match: Dict[str, Any],
) -> Dict[str, Any]:
"""
Compute all enrichment features for a single match and return
a dict ready for DB upsert.
"""
match_id = match['match_id']
home_id = str(match['home_team_id'])
away_id = str(match['away_team_id'])
mst_utc = int(match['mst_utc']) if match['mst_utc'] else 0
league_id = str(match['league_id']) if match['league_id'] else None
# 1. Team stats
home_stats = enrichment.compute_team_stats(cur, home_id, mst_utc)
away_stats = enrichment.compute_team_stats(cur, away_id, mst_utc)
# 2. H2H
h2h = enrichment.compute_h2h(cur, home_id, away_id, mst_utc)
# 3. Form & streaks
home_form = enrichment.compute_form_streaks(cur, home_id, mst_utc)
away_form = enrichment.compute_form_streaks(cur, away_id, mst_utc)
# 4. Referee
referee_name = fetch_referee_for_match(cur, match_id)
referee = enrichment.compute_referee_stats(cur, referee_name, mst_utc)
# 5. League averages
league = enrichment.compute_league_averages(cur, league_id, mst_utc)
# 6. Rolling stats (for goals avg)
home_rolling = enrichment.compute_rolling_stats(cur, home_id, mst_utc)
away_rolling = enrichment.compute_rolling_stats(cur, away_id, mst_utc)
# 7. Implied odds
implied = fetch_implied_odds(cur, match_id)
return {
'match_id': match_id,
# Team stats
'home_avg_possession': round(home_stats['avg_possession'], 2),
'away_avg_possession': round(away_stats['avg_possession'], 2),
'home_avg_shots_on_target': round(home_stats['avg_shots_on_target'], 2),
'away_avg_shots_on_target': round(away_stats['avg_shots_on_target'], 2),
'home_shot_conversion': round(home_stats['shot_conversion'], 4),
'away_shot_conversion': round(away_stats['shot_conversion'], 4),
'home_avg_corners': round(home_stats['avg_corners'], 2),
'away_avg_corners': round(away_stats['avg_corners'], 2),
# H2H
'h2h_total': h2h['total_matches'],
'h2h_home_win_rate': round(h2h['home_win_rate'], 4),
'h2h_avg_goals': round(h2h['avg_goals'], 2),
'h2h_over25_rate': round(h2h['over25_rate'], 4),
'h2h_btts_rate': round(h2h['btts_rate'], 4),
# Form
'home_clean_sheet_rate': round(home_form['clean_sheet_rate'], 4),
'away_clean_sheet_rate': round(away_form['clean_sheet_rate'], 4),
'home_scoring_rate': round(home_form['scoring_rate'], 4),
'away_scoring_rate': round(away_form['scoring_rate'], 4),
'home_win_streak': home_form['winning_streak'],
'away_win_streak': away_form['winning_streak'],
# Rolling goals
'home_goals_avg_5': round(home_rolling['rolling5_goals'], 2),
'away_goals_avg_5': round(away_rolling['rolling5_goals'], 2),
'home_conceded_avg_5': round(home_rolling['rolling5_conceded'], 2),
'away_conceded_avg_5': round(away_rolling['rolling5_conceded'], 2),
# Referee
'referee_avg_cards': round(referee['cards_total'], 2),
'referee_home_bias': round(referee['home_bias'], 4),
'referee_avg_goals': round(referee['avg_goals'], 2),
# League
'league_avg_goals': round(league['avg_goals'], 2),
'league_home_win_pct': round(league['home_win_rate'], 4),
'league_over25_pct': round(league['ou25_rate'], 4),
# Implied odds
'implied_home': implied['implied_home'],
'implied_draw': implied['implied_draw'],
'implied_away': implied['implied_away'],
'implied_over25': implied['implied_over25'],
'implied_btts_yes': implied['implied_btts_yes'],
'odds_overround': implied['odds_overround'],
# Missing players impact — default (no lineup data for historical)
'missing_players_impact': 0.0,
# Version
'calculator_ver': CALCULATOR_VER,
}
def flush_enrichment_batch(
conn: psycopg2.extensions.connection,
rows: List[Dict[str, Any]],
dry_run: bool,
) -> int:
"""Bulk upsert enriched features into football_ai_features."""
if not rows or dry_run:
return 0
columns = [
'match_id',
'home_avg_possession', 'away_avg_possession',
'home_avg_shots_on_target', 'away_avg_shots_on_target',
'home_shot_conversion', 'away_shot_conversion',
'home_avg_corners', 'away_avg_corners',
'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
'h2h_over25_rate', 'h2h_btts_rate',
'home_clean_sheet_rate', 'away_clean_sheet_rate',
'home_scoring_rate', 'away_scoring_rate',
'home_win_streak', 'away_win_streak',
'home_goals_avg_5', 'away_goals_avg_5',
'home_conceded_avg_5', 'away_conceded_avg_5',
'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
'implied_home', 'implied_draw', 'implied_away',
'implied_over25', 'implied_btts_yes', 'odds_overround',
'missing_players_impact', 'calculator_ver',
]
# Build update SET clause (skip match_id)
update_cols = [c for c in columns if c != 'match_id']
set_clause = ', '.join(f'{c} = EXCLUDED.{c}' for c in update_cols)
placeholders = ', '.join(['%s'] * len(columns))
values = [
tuple(row[c] for c in columns)
for row in rows
]
with conn.cursor() as cur:
execute_values(
cur,
f"""
INSERT INTO football_ai_features ({', '.join(columns)})
VALUES %s
ON CONFLICT (match_id) DO UPDATE SET
{set_clause},
updated_at = NOW()
""",
values,
template=f"({placeholders})",
page_size=200,
)
conn.commit()
return len(rows)
# ────────────────────────── main ───────────────────────────────
def run_enrichment(
batch_size: int,
dry_run: bool,
force: bool,
limit: Optional[int],
) -> None:
"""Core enrichment loop."""
dsn = get_clean_dsn()
conn = psycopg2.connect(dsn)
print(f"\n{'=' * 60}")
print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
print(f" batch_size={batch_size} dry_run={dry_run} force={force}")
print(f"{'=' * 60}")
# 1. Fetch unenriched matches
t0 = time.time()
matches = fetch_unenriched_matches(conn, force=force, limit=limit)
print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")
if not matches:
print("✅ Nothing to enrich — all rows already populated.")
conn.close()
return
# 2. Initialize enrichment service
enrichment = FeatureEnrichmentService()
# 3. Process in batches
total = len(matches)
processed = 0
written = 0
errors = 0
batch_buf: List[Dict[str, Any]] = []
t_start = time.time()
# Use a dedicated cursor with RealDictCursor for all enrichment queries
enrich_cur = conn.cursor(cursor_factory=RealDictCursor)
for idx, match in enumerate(matches):
try:
enriched = enrich_single_match(enrichment, enrich_cur, match)
batch_buf.append(enriched)
except Exception as e:
errors += 1
if errors <= 10:
print(f" ⚠️ Error enriching {match.get('match_id', '?')}: {e}")
processed += 1
# Flush batch
if len(batch_buf) >= batch_size:
flushed = flush_enrichment_batch(conn, batch_buf, dry_run)
written += flushed
batch_buf.clear()
# Progress reporting
if processed % 500 == 0:
elapsed = time.time() - t_start
rate = processed / elapsed if elapsed > 0 else 0
remaining = (total - processed) / rate if rate > 0 else 0
pct = processed / total * 100
print(
f" [{processed:>8,} / {total:,}] "
f"({pct:.1f}%) | {rate:.0f} matches/s | "
f"ETA: {remaining / 60:.1f} min | "
f"errors: {errors}"
)
# Flush remaining
if batch_buf:
flushed = flush_enrichment_batch(conn, batch_buf, dry_run)
written += flushed
enrich_cur.close()
elapsed = time.time() - t_start
print(f"\n{'=' * 60}")
print(f"✅ Enrichment complete:")
print(f" Processed: {processed:,} matches in {elapsed:.1f}s")
print(f" Written: {written:,} rows")
print(f" Errors: {errors:,}")
print(f" Rate: {processed / elapsed:.0f} matches/s")
print(f"{'=' * 60}")
conn.close()
def main() -> None:
parser = argparse.ArgumentParser(
description="Enrich football_ai_features with H2H, referee, stats, and odds data"
)
parser.add_argument(
'--batch-size',
type=int,
default=DEFAULT_BATCH_SIZE,
help=f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Compute features but do not write to DB',
)
parser.add_argument(
'--force',
action='store_true',
help='Re-enrich ALL rows, not just empty ones',
)
parser.add_argument(
'--limit',
type=int,
default=None,
help='Max number of matches to process',
)
args = parser.parse_args()
run_enrichment(
batch_size=args.batch_size,
dry_run=args.dry_run,
force=args.force,
limit=args.limit,
)
if __name__ == '__main__':
main()