Files
iddaai-be/ai-engine/scripts/compute_league_reliability.py
T
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

249 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
League Odds Reliability Calculator
===================================
Computes per-league Brier Score from historical match results + odds,
then derives an odds_reliability factor (0.0-1.0) for each league.
Output: ai-engine/data/league_reliability.json
Used by: SingleMatchOrchestrator to weight odds-based edge calculations.
Usage:
python3 scripts/compute_league_reliability.py
"""
from __future__ import annotations
import json
import os
import sys
from typing import Any, Dict, List
import psycopg2
import psycopg2.extras
# ─── Config ──────────────────────────────────────────────────────────────
# Paths are resolved relative to this script: the engine root is the parent
# of the directory containing this file, and the JSON lands in its data/ dir.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_ENGINE_DIR = os.path.join(SCRIPT_DIR, os.pardir)
OUTPUT_PATH = os.path.join(AI_ENGINE_DIR, "data", "league_reliability.json")

# Leagues with fewer matching rows than this are dropped by the query's HAVING.
MIN_MATCHES = 50
# Brier score at (or above) which the Brier reliability component is 0.0.
BRIER_BASELINE = 0.50
# Brier score at (or below) which the Brier reliability component is 1.0.
BRIER_PERFECT = 0.33
def get_dsn() -> str:
    """
    Build the PostgreSQL connection URI from the environment.

    When python-dotenv is installed, variables from the `.env` file one
    level above AI_ENGINE_DIR are loaded first. DATABASE_URL is preferred
    (with its query string removed); otherwise the URI is assembled from
    DB_HOST / DB_PORT / DB_USER / DB_PASS / DB_NAME.

    Returns:
        A `postgresql://` or `postgres://` connection URI.
    """
    from urllib.parse import quote

    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is optional: without it only the process
        # environment is consulted.
        pass
    else:
        load_dotenv(os.path.join(AI_ENGINE_DIR, "..", ".env"))

    raw = os.getenv("DATABASE_URL", "")
    # libpq accepts both URI schemes, so honour either one instead of
    # silently falling back to the localhost defaults for "postgres://".
    if raw.startswith(("postgresql://", "postgres://")):
        # Everything after "?" is dropped — presumably to remove ORM-only
        # parameters such as "?schema=public"; TODO confirm that no libpq
        # options (e.g. sslmode) are expected to survive.
        return raw.split("?")[0]

    host = os.getenv("DB_HOST", "localhost")
    port = os.getenv("DB_PORT", "15432")
    user = os.getenv("DB_USER", "suggestbet")
    # NOTE(review): a real-looking password is hard-coded as the default.
    # Kept for backward compatibility, but it should be rotated and
    # supplied through DB_PASS only.
    pw = os.getenv("DB_PASS", "SuGGesT2026SecuRe")
    db = os.getenv("DB_NAME", "boilerplate_db")

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot corrupt the URI structure.
    safe_user = quote(user, safe="")
    safe_pw = quote(pw, safe="")
    return f"postgresql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
def compute_league_reliability(conn: Any) -> List[Dict[str, Any]]:
    """
    Score how far the bookmaker odds can be trusted for each league.

    Runs one aggregate query over finished football matches that have
    'Maç Sonucu' odds and, for every league with at least MIN_MATCHES such
    matches, derives:
    - brier_score: mean squared error of the normalized implied home/away
      probabilities against the actual result (lower = better)
    - heavy_fav_win_pct: how often <1.50 favorites actually win
    - fav_win_pct: how often the side with the lower odds wins
    - odds_reliability: composite 0.0-1.0 score

    Args:
        conn: Open psycopg2 connection. It is only queried, never closed.

    Returns:
        One dict per league, sorted by odds_reliability descending.
    """
    # Function-scope import (the module does not import math at the top);
    # done once here rather than once per league inside the loop.
    import math

    # The context manager closes the cursor even when the query raises.
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        print("📊 Computing per-league Brier Scores from match results + odds...")
        cur.execute("""
            WITH ms_odds AS (
                SELECT
                    oc.match_id,
                    MAX(CASE WHEN os.name = '1' THEN os.odd_value::float END) AS odds_h,
                    MAX(CASE WHEN os.name = 'X' THEN os.odd_value::float END) AS odds_d,
                    MAX(CASE WHEN os.name = '2' THEN os.odd_value::float END) AS odds_a
                FROM odd_categories oc
                JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                WHERE oc.name = 'Maç Sonucu'
                GROUP BY oc.match_id
                HAVING MAX(CASE WHEN os.name = '1' THEN os.odd_value::float END) > 1.0
                   AND MAX(CASE WHEN os.name = '2' THEN os.odd_value::float END) > 1.0
            ),
            match_results AS (
                SELECT
                    m.league_id,
                    l.name AS league_name,
                    CASE
                        WHEN m.score_home > m.score_away THEN '1'
                        WHEN m.score_home = m.score_away THEN 'X'
                        ELSE '2'
                    END AS result,
                    o.odds_h, o.odds_d, o.odds_a,
                    -- Normalized implied probabilities
                    (1.0 / o.odds_h) / (
                        (1.0 / o.odds_h) +
                        (1.0 / COALESCE(o.odds_d, 3.3)) +
                        (1.0 / o.odds_a)
                    ) AS ip_home,
                    (1.0 / o.odds_a) / (
                        (1.0 / o.odds_h) +
                        (1.0 / COALESCE(o.odds_d, 3.3)) +
                        (1.0 / o.odds_a)
                    ) AS ip_away,
                    CASE WHEN o.odds_h < o.odds_a THEN 'H' ELSE 'A' END AS fav_side,
                    LEAST(o.odds_h, o.odds_a) AS fav_odds
                FROM matches m
                JOIN ms_odds o ON o.match_id = m.id
                JOIN leagues l ON m.league_id = l.id
                WHERE m.status = 'FT'
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.sport = 'football'
            )
            SELECT
                league_id,
                league_name,
                COUNT(*) AS match_count,
                -- Brier Score (lower = better odds calibration)
                AVG(
                    POWER(ip_home - CASE WHEN result = '1' THEN 1.0 ELSE 0.0 END, 2) +
                    POWER(ip_away - CASE WHEN result = '2' THEN 1.0 ELSE 0.0 END, 2)
                ) AS brier_score,
                -- Heavy favorite metrics
                COUNT(CASE WHEN fav_odds < 1.50 THEN 1 END) AS heavy_fav_count,
                AVG(CASE
                    WHEN fav_odds < 1.50
                     AND ((fav_side = 'H' AND result = '1') OR (fav_side = 'A' AND result = '2'))
                    THEN 1.0
                    WHEN fav_odds < 1.50 THEN 0.0
                END) AS heavy_fav_win_rate,
                -- Overall favorite win rate
                AVG(CASE
                    WHEN (fav_side = 'H' AND result = '1') OR (fav_side = 'A' AND result = '2')
                    THEN 1.0 ELSE 0.0
                END) AS fav_win_rate,
                -- Chaos metric
                STDDEV(
                    CASE WHEN result = '1' THEN 1 WHEN result = '2' THEN -1 ELSE 0 END
                ) AS result_volatility
            FROM match_results
            GROUP BY league_id, league_name
            HAVING COUNT(*) >= %s
            ORDER BY COUNT(*) DESC
        """, (MIN_MATCHES,))
        rows = cur.fetchall()
    print(f" ✅ Found {len(rows)} leagues with >= {MIN_MATCHES} matches")

    # ── Compute composite odds_reliability ──────────────────────────────
    # heavy_fav_count and result_volatility are returned by the query but
    # are not used by the scoring below.
    results: List[Dict[str, Any]] = []
    for row in rows:
        brier = float(row["brier_score"])
        match_count = int(row["match_count"])
        # The rate is NULL only when the league has no <1.50 favorites; use
        # 0.65 as a neutral stand-in in that case. An explicit None check is
        # required because a genuine 0.0 rate is falsy and must be kept.
        heavy_fav_rate = row["heavy_fav_win_rate"]
        heavy_fav_win = 0.65 if heavy_fav_rate is None else float(heavy_fav_rate)
        fav_win = float(row["fav_win_rate"])

        # Component 1: Brier-based reliability (0-1, higher = better)
        # Maps [BRIER_BASELINE .. BRIER_PERFECT] → [0.0 .. 1.0]
        brier_reliability = max(0.0, min(1.0,
            (BRIER_BASELINE - brier) / (BRIER_BASELINE - BRIER_PERFECT)
        ))

        # Component 2: Sample size confidence (log scale, caps at 500 matches)
        sample_confidence = min(1.0, math.log(max(1, match_count)) / math.log(500))

        # Component 3: Heavy favorite predictability
        # Maps a heavy-favorite win rate of [0.55 .. 0.80] → [0.0 .. 1.0]
        fav_reliability = max(0.0, min(1.0, (heavy_fav_win - 0.55) / (0.80 - 0.55)))

        # Composite: weighted blend
        # Brier is the primary signal (60%), sample size (20%), fav reliability (20%)
        odds_reliability = (
            brier_reliability * 0.60 +
            sample_confidence * 0.20 +
            fav_reliability * 0.20
        )

        results.append({
            "league_id": row["league_id"],
            "league_name": row["league_name"],
            "match_count": match_count,
            "brier_score": round(brier, 4),
            "heavy_fav_win_pct": round(heavy_fav_win * 100, 1),
            "fav_win_pct": round(fav_win * 100, 1),
            "odds_reliability": round(odds_reliability, 4),
        })

    # Sort by reliability descending
    results.sort(key=lambda x: x["odds_reliability"], reverse=True)
    return results
def build_lookup(results: List[Dict[str, Any]]) -> Dict[str, float]:
    """Map each entry's league_id to its odds_reliability for the orchestrator."""
    lookup: Dict[str, float] = {}
    for entry in results:
        # A repeated league_id keeps the value of its last occurrence.
        lookup[entry["league_id"]] = entry["odds_reliability"]
    return lookup
def main() -> None:
    """Compute the league scores, write them to OUTPUT_PATH and print a summary."""

    def show_ranking(entries: List[Dict[str, Any]]) -> None:
        # Numbering restarts at 1 for whichever slice is passed in.
        for rank, league in enumerate(entries, 1):
            print(f" {rank:2d}. {league['league_name']:25s} | Brier: {league['brier_score']:.4f} | "
                  f"Reliability: {league['odds_reliability']:.4f} | "
                  f"Heavy Fav: {league['heavy_fav_win_pct']:.1f}% | "
                  f"N={league['match_count']}")

    connection_uri = get_dsn()
    print("🔗 Connecting to database...")
    db = psycopg2.connect(connection_uri)
    try:
        leagues = compute_league_reliability(db)

        # JSON document read by the orchestrator: the full lookup plus a
        # truncated, human-readable detail list.
        payload = {
            "version": "v1",
            "description": "Per-league odds reliability scores computed from Brier Score analysis",
            "min_matches_threshold": MIN_MATCHES,
            "total_leagues": len(leagues),
            "default_reliability": 0.35,  # fallback for unknown leagues
            "lookup": build_lookup(leagues),
            "details": leagues[:50],  # top 50 for human reference
        }

        # Create the target directory on first run.
        target_dir = os.path.dirname(OUTPUT_PATH)
        os.makedirs(target_dir, exist_ok=True)
        with open(OUTPUT_PATH, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False)

        print(f"\n✅ Saved {len(leagues)} league reliability scores to {OUTPUT_PATH}")
        print("\n📈 Top 10 most reliable leagues:")
        show_ranking(leagues[:10])
        print("\n📉 Bottom 10 (least reliable):")
        show_ranking(leagues[-10:])
    finally:
        # Always release the connection, even if computing or writing failed.
        db.close()
# Run the computation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()