first (part 2: other directories)

2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,248 @@
+"""
+League Odds Reliability Calculator
+===================================
+Computes per-league Brier Score from historical match results + odds,
+then derives an odds_reliability factor (0.0 – 1.0) for each league.
+
+Output: ai-engine/data/league_reliability.json
+Used by: SingleMatchOrchestrator to weight odds-based edge calculations.
+
+Usage:
+    python3 scripts/compute_league_reliability.py
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from typing import Any, Dict, List
+
+import psycopg2
+import psycopg2.extras
+
+# ─── Config ──────────────────────────────────────────────────────────────
+# Paths are resolved relative to this script: AI_ENGINE_DIR is the script's
+# parent directory and the JSON report is written into its data/ folder.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+AI_ENGINE_DIR = os.path.join(SCRIPT_DIR, "..")
+OUTPUT_PATH = os.path.join(AI_ENGINE_DIR, "data", "league_reliability.json")
+
+# Leagues with fewer qualifying matches than this are dropped by the query's
+# HAVING clause and therefore never appear in the output.
+MIN_MATCHES = 50  # Minimum completed matches to compute reliability
+# Anchors for the linear Brier -> reliability mapping used when scoring:
+# the result is clamped to [0.0, 1.0] between these two values.
+# NOTE(review): the score computed in the SQL sums only the home and away
+# squared errors (no draw term), so these anchors are not standard 3-way
+# Brier values — confirm they were tuned for that two-term score.
+BRIER_BASELINE = 0.50  # Score at or above this maps to 0.0 reliability
+BRIER_PERFECT = 0.33  # Score at or below this maps to 1.0 reliability
+
+
+def get_dsn() -> str:
+    """Build the PostgreSQL connection URI from the environment.
+
+    Loads the ``.env`` file located one directory above ``AI_ENGINE_DIR``
+    and then resolves the DSN in this order:
+
+    1. ``DATABASE_URL``, if it starts with ``postgresql://``. Everything
+       from the first ``?`` onward is dropped.
+    2. Otherwise a URI assembled from ``DB_HOST``, ``DB_PORT``, ``DB_USER``,
+       ``DB_PASS`` and ``DB_NAME``, each with a built-in fallback.
+
+    Returns:
+        A ``postgresql://`` URI to pass to ``psycopg2.connect``.
+    """
+    from urllib.parse import quote
+
+    from dotenv import load_dotenv
+
+    env_path = os.path.join(AI_ENGINE_DIR, "..", ".env")
+    load_dotenv(env_path)
+
+    raw = os.getenv("DATABASE_URL", "")
+    if raw.startswith("postgresql://"):
+        # The query string is discarded — presumably because it carries
+        # ORM-only parameters (e.g. "?schema=...") that libpq would reject;
+        # confirm before relying on options such as sslmode here.
+        return raw.split("?")[0]
+
+    host = os.getenv("DB_HOST", "localhost")
+    port = os.getenv("DB_PORT", "15432")
+    user = os.getenv("DB_USER", "suggestbet")
+    # SECURITY(review): a real-looking password is committed as the fallback.
+    # It is kept so existing setups keep working, but it should move to .env
+    # or a secret store and be rotated.
+    pw = os.getenv("DB_PASS", "SuGGesT2026SecuRe")
+    db = os.getenv("DB_NAME", "boilerplate_db")
+
+    # Percent-encode the credentials so characters such as "@", ":", "/" or
+    # "?" in a user name or password cannot corrupt the URI structure.
+    safe_user = quote(user, safe="")
+    safe_pw = quote(pw, safe="")
+    return f"postgresql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
+
+
+def compute_league_reliability(conn: Any) -> List[Dict[str, Any]]:
+    """Score how trustworthy the match-result odds are for each league.
+
+    Runs one aggregate query over finished football matches (status ``FT``)
+    that have ``Maç Sonucu`` odds with home and away prices above 1.0, and
+    keeps leagues with at least ``MIN_MATCHES`` such matches.
+
+    Args:
+        conn: An open psycopg2 connection. It is not closed here.
+
+    Returns:
+        One dict per league, sorted by ``odds_reliability`` descending, with
+        the keys:
+
+        - ``league_id``, ``league_name``, ``match_count``
+        - ``brier_score``: mean of the squared errors of the normalized
+          home and away implied probabilities (the draw term is not
+          included), rounded to 4 decimals
+        - ``heavy_fav_win_pct``: win percentage of favorites priced below
+          1.50 (reported as 65.0 when the league has no such favorites)
+        - ``fav_win_pct``: win percentage of the lower-priced side
+        - ``odds_reliability``: composite score in [0.0, 1.0]
+    """
+    import math
+
+    # Neutral win rate assumed when a league has no favorites priced < 1.50.
+    no_heavy_fav_default = 0.65
+    # Sample-size confidence saturates at this many matches.
+    sample_cap_log = math.log(500)
+
+    print("📊 Computing per-league Brier Scores from match results + odds...")
+
+    # NOTE: result_volatility is selected by the query but not used below.
+    query = """
+        WITH ms_odds AS (
+            SELECT
+                oc.match_id,
+                MAX(CASE WHEN os.name = '1' THEN os.odd_value::float END) AS odds_h,
+                MAX(CASE WHEN os.name = 'X' THEN os.odd_value::float END) AS odds_d,
+                MAX(CASE WHEN os.name = '2' THEN os.odd_value::float END) AS odds_a
+            FROM odd_categories oc
+            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
+            WHERE oc.name = 'Maç Sonucu'
+            GROUP BY oc.match_id
+            HAVING MAX(CASE WHEN os.name = '1' THEN os.odd_value::float END) > 1.0
+               AND MAX(CASE WHEN os.name = '2' THEN os.odd_value::float END) > 1.0
+        ),
+        match_results AS (
+            SELECT
+                m.league_id,
+                l.name AS league_name,
+                CASE
+                    WHEN m.score_home > m.score_away THEN '1'
+                    WHEN m.score_home = m.score_away THEN 'X'
+                    ELSE '2'
+                END AS result,
+                o.odds_h, o.odds_d, o.odds_a,
+                -- Normalized implied probabilities
+                (1.0 / o.odds_h) / (
+                    (1.0 / o.odds_h) +
+                    (1.0 / COALESCE(o.odds_d, 3.3)) +
+                    (1.0 / o.odds_a)
+                ) AS ip_home,
+                (1.0 / o.odds_a) / (
+                    (1.0 / o.odds_h) +
+                    (1.0 / COALESCE(o.odds_d, 3.3)) +
+                    (1.0 / o.odds_a)
+                ) AS ip_away,
+                CASE WHEN o.odds_h < o.odds_a THEN 'H' ELSE 'A' END AS fav_side,
+                LEAST(o.odds_h, o.odds_a) AS fav_odds
+            FROM matches m
+            JOIN ms_odds o ON o.match_id = m.id
+            JOIN leagues l ON m.league_id = l.id
+            WHERE m.status = 'FT'
+              AND m.score_home IS NOT NULL
+              AND m.sport = 'football'
+        )
+        SELECT
+            league_id,
+            league_name,
+            COUNT(*) AS match_count,
+
+            -- Brier Score (lower = better odds calibration)
+            AVG(
+                POWER(ip_home - CASE WHEN result = '1' THEN 1.0 ELSE 0.0 END, 2) +
+                POWER(ip_away - CASE WHEN result = '2' THEN 1.0 ELSE 0.0 END, 2)
+            ) AS brier_score,
+
+            -- Heavy favorite metrics
+            COUNT(CASE WHEN fav_odds < 1.50 THEN 1 END) AS heavy_fav_count,
+            AVG(CASE
+                WHEN fav_odds < 1.50
+                    AND ((fav_side = 'H' AND result = '1') OR (fav_side = 'A' AND result = '2'))
+                THEN 1.0
+                WHEN fav_odds < 1.50 THEN 0.0
+            END) AS heavy_fav_win_rate,
+
+            -- Overall favorite win rate
+            AVG(CASE
+                WHEN (fav_side = 'H' AND result = '1') OR (fav_side = 'A' AND result = '2')
+                THEN 1.0 ELSE 0.0
+            END) AS fav_win_rate,
+
+            -- Chaos metric
+            STDDEV(
+                CASE WHEN result = '1' THEN 1 WHEN result = '2' THEN -1 ELSE 0 END
+            ) AS result_volatility
+
+        FROM match_results
+        GROUP BY league_id, league_name
+        HAVING COUNT(*) >= %s
+        ORDER BY COUNT(*) DESC
+    """
+
+    # The context manager closes the cursor even if the query fails.
+    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+        cur.execute(query, (MIN_MATCHES,))
+        rows = cur.fetchall()
+
+    print(f"  ✅ Found {len(rows)} leagues with >= {MIN_MATCHES} matches")
+
+    # ── Compute composite odds_reliability ──────────────────────────────
+    results: List[Dict[str, Any]] = []
+
+    for row in rows:
+        brier = float(row["brier_score"])
+        match_count = int(row["match_count"])
+        fav_win = float(row["fav_win_rate"])
+
+        # The rate is NULL only when the league has no heavy favorites.
+        # Test for None explicitly: a genuine 0.0 (heavy favorites never
+        # won) is falsy and must not be replaced by the neutral default.
+        raw_heavy_fav_win = row["heavy_fav_win_rate"]
+        if raw_heavy_fav_win is None:
+            heavy_fav_win = no_heavy_fav_default
+        else:
+            heavy_fav_win = float(raw_heavy_fav_win)
+
+        # Component 1: Brier-based reliability (0-1, higher = better)
+        # Maps [BRIER_BASELINE .. BRIER_PERFECT] → [0.0 .. 1.0]
+        brier_reliability = max(0.0, min(1.0,
+            (BRIER_BASELINE - brier) / (BRIER_BASELINE - BRIER_PERFECT)
+        ))
+
+        # Component 2: Sample size confidence (log scale, caps at 500 matches)
+        sample_confidence = min(1.0, math.log(max(1, match_count)) / sample_cap_log)
+
+        # Component 3: Heavy favorite predictability
+        # If heavy fav wins 80%+ → odds are very reliable; if 55% → chaotic
+        fav_reliability = max(0.0, min(1.0, (heavy_fav_win - 0.55) / (0.80 - 0.55)))
+
+        # Composite: weighted blend
+        # Brier is the primary signal (60%), sample size (20%), fav reliability (20%)
+        odds_reliability = (
+            brier_reliability * 0.60 +
+            sample_confidence * 0.20 +
+            fav_reliability * 0.20
+        )
+
+        results.append({
+            "league_id": row["league_id"],
+            "league_name": row["league_name"],
+            "match_count": match_count,
+            "brier_score": round(brier, 4),
+            "heavy_fav_win_pct": round(heavy_fav_win * 100, 1),
+            "fav_win_pct": round(fav_win * 100, 1),
+            "odds_reliability": round(odds_reliability, 4),
+        })
+
+    # Sort by reliability descending
+    results.sort(key=lambda x: x["odds_reliability"], reverse=True)
+
+    return results
+
+
+def build_lookup(results: List[Dict[str, Any]]) -> Dict[str, float]:
+    """Map each league's id to its reliability score for the orchestrator.
+
+    Entries are processed in order, so if a ``league_id`` appears more than
+    once the last entry's ``odds_reliability`` wins.
+    """
+    lookup: Dict[str, float] = {}
+    for entry in results:
+        lookup[entry["league_id"]] = entry["odds_reliability"]
+    return lookup
+
+
+def main() -> None:
+    """Compute the league reliability report, save it and print a summary.
+
+    Connects using the DSN from ``get_dsn()``, runs
+    ``compute_league_reliability``, writes the result as JSON to
+    ``OUTPUT_PATH`` (creating its directory if needed), then prints the
+    first ten and last ten entries of the reliability-sorted list. The
+    database connection is closed whether or not any step fails.
+    """
+
+    def describe(position: int, league: Dict[str, Any]) -> str:
+        # One summary line per league; columns are joined with " | ".
+        columns = [
+            f"  {position:2d}. {league['league_name']:25s}",
+            f"Brier: {league['brier_score']:.4f}",
+            f"Reliability: {league['odds_reliability']:.4f}",
+            f"Heavy Fav: {league['heavy_fav_win_pct']:.1f}%",
+            f"N={league['match_count']}",
+        ]
+        return " | ".join(columns)
+
+    dsn = get_dsn()
+    print("🔗 Connecting to database...")
+    conn = psycopg2.connect(dsn)
+
+    try:
+        results = compute_league_reliability(conn)
+        league_total = len(results)
+
+        # Payload written to disk; "lookup" covers every league while
+        # "details" keeps only the first 50 entries for human reference.
+        output = {
+            "version": "v1",
+            "description": "Per-league odds reliability scores computed from Brier Score analysis",
+            "min_matches_threshold": MIN_MATCHES,
+            "total_leagues": league_total,
+            "default_reliability": 0.35,  # fallback for unknown leagues
+            "lookup": build_lookup(results),
+            "details": results[:50],
+        }
+
+        # Make sure the target directory is there before writing.
+        target_dir = os.path.dirname(OUTPUT_PATH)
+        os.makedirs(target_dir, exist_ok=True)
+
+        with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+            json.dump(output, f, indent=2, ensure_ascii=False)
+
+        print(f"\n✅ Saved {league_total} league reliability scores to {OUTPUT_PATH}")
+
+        # Both listings share one layout; positions restart at 1 in each.
+        sections = (
+            ("\n📈 Top 10 most reliable leagues:", results[:10]),
+            ("\n📉 Bottom 10 (least reliable):", results[-10:]),
+        )
+        for heading, leagues in sections:
+            print(heading)
+            for position, league in enumerate(leagues, 1):
+                print(describe(position, league))
+
+    finally:
+        conn.close()
+
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()