first (part 2: other directories)

2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,423 @@
+"""
+Calibration Training Script
+===========================
+Trains Isotonic Regression calibration models for all betting markets.
+
+This script:
+1. Fetches historical match data with bookmaker odds and actual results
+2. Trains Isotonic Regression models for each market
+3. Calculates calibration metrics (Brier Score, ECE)
+4. Saves models to ai-engine/models/calibration/
+
+Usage:
+    # Train on last 90 days of data
+    python3 ai-engine/scripts/train_calibration.py
+    
+    # Train on specific date range
+    python3 ai-engine/scripts/train_calibration.py --start 2026-01-01 --end 2026-02-15
+    
+    # Train only specific markets
+    python3 ai-engine/scripts/train_calibration.py --markets ou25 btts ms_home
+"""
+
+import os
+import sys
+import json
+import argparse
+import psycopg2
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+from dotenv import load_dotenv
+from typing import Dict, List, Tuple, Any, Optional
+
+# Setup path for ai-engine imports
+# AI_ENGINE_DIR is the parent of the directory that contains this script.
+# It is pushed to the front of sys.path so the `models.calibration` import
+# below resolves when the script is launched directly.
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, AI_ENGINE_DIR)
+
+# Must stay below the sys.path tweak above.
+from models.calibration import get_calibrator, SUPPORTED_MARKETS
+
+# NOTE(review): presumably loads DATABASE_URL (read by get_conn) from a .env
+# file when one is present - confirm against deployment setup.
+load_dotenv()
+
+
+# =============================================================================
+# CONFIG
+# =============================================================================
+# JSON file read by load_top_league_ids(). It is resolved two directory
+# levels above AI_ENGINE_DIR.
+# NOTE(review): if ai-engine/ sits directly under the repository root, this
+# path points outside the repository - confirm the intended location.
+TOP_LEAGUES_PATH = os.path.join(
+    os.path.dirname(os.path.dirname(AI_ENGINE_DIR)),
+    "top_leagues.json"
+)
+
+# Default: last 90 days
+# Both defaults are YYYY-MM-DD strings computed once, at import time, from the
+# UTC clock: the window runs from 90 days ago through yesterday.
+# NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
+DEFAULT_START_DATE = (datetime.utcnow() - timedelta(days=90)).strftime("%Y-%m-%d")
+DEFAULT_END_DATE = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
+
+
+# =============================================================================
+# DB CONNECTION
+# =============================================================================
+def get_conn():
+    """Open a PostgreSQL connection described by the DATABASE_URL env var.
+
+    Everything from a ``?schema=`` marker onwards is cut off before the URL
+    is handed to ``psycopg2.connect``.
+
+    Raises:
+        ValueError: If DATABASE_URL is unset or empty.
+    """
+    dsn = os.getenv("DATABASE_URL")
+    if not dsn:
+        raise ValueError("DATABASE_URL not set")
+    # partition() yields the text before the first "?schema=", or the whole
+    # string when the marker is absent.
+    dsn, _, _ = dsn.partition("?schema=")
+    return psycopg2.connect(dsn)
+
+
+def load_top_league_ids() -> List[str]:
+    """Load top league IDs from the JSON file at TOP_LEAGUES_PATH.
+
+    Two layouts are accepted: a bare JSON array of IDs, or an object whose
+    "football" key holds that array.
+
+    Returns:
+        The league IDs, or an empty list when the file does not exist or the
+        object has no "football" key.
+    """
+    if not os.path.exists(TOP_LEAGUES_PATH):
+        print(f"[Warning] top_leagues.json not found at {TOP_LEAGUES_PATH}")
+        return []
+
+    # JSON is UTF-8 by specification; do not rely on the locale default.
+    with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Handle both list and dict formats
+    if isinstance(data, dict):
+        return data.get("football", [])
+    return data
+
+
+# =============================================================================
+# DATA EXTRACTION
+# =============================================================================
+def _utc_day_start_ms(date_str: str) -> int:
+    """Return epoch milliseconds for 00:00:00 UTC on a YYYY-MM-DD date."""
+    # Local import: the module header only pulls in datetime and timedelta.
+    from datetime import timezone
+
+    day = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+    return int(day.timestamp() * 1000)
+
+
+def fetch_training_data(
+    cur,
+    start_date: str,
+    end_date: str,
+    league_ids: Optional[List[str]] = None,
+) -> pd.DataFrame:
+    """
+    Fetch finished matches with their odds and final scores.
+
+    Args:
+        cur: DB-API cursor used to run the query.
+        start_date: First day of the window, YYYY-MM-DD (inclusive).
+        end_date: Last day of the window, YYYY-MM-DD (inclusive).
+        league_ids: Optional league IDs; when empty or None no league
+            filter is applied.
+
+    Returns DataFrame with one row per match and columns:
+    - match_id, home_team_id, away_team_id
+    - score_home, score_away (actual result)
+    - ht_score_home, ht_score_away
+    - mst_utc
+    - ms_h, ms_d, ms_a (match result odds)
+    - ou25_over, ou25_under, ou15_over, ou35_over
+    - btts_yes, btts_no
+
+    Odds come from LEFT JOINs, so a match without odds rows is still
+    returned with NULL odds.
+
+    Raises:
+        ValueError: If a date string is not in YYYY-MM-DD format.
+    """
+    # Interpret the dates as UTC days. A naive datetime's timestamp() would
+    # use the machine's local timezone and shift the window by its offset.
+    start_ms = _utc_day_start_ms(start_date)
+    end_ms = _utc_day_start_ms(end_date) + 86_400_000  # exclusive: start of next day
+
+    # Build league filter (only the placeholders are interpolated; the IDs
+    # themselves are passed as bound parameters)
+    league_filter = ""
+    params = [start_ms, end_ms]
+    if league_ids:
+        placeholders = ",".join(["%s"] * len(league_ids))
+        league_filter = f"AND m.league_id IN ({placeholders})"
+        params.extend(league_ids)
+
+    # NOTE(review): MAX() picks one value per selection; if odd_value is a
+    # text column the comparison is lexicographic - confirm the column type.
+    query = f"""
+    SELECT 
+        m.id as match_id,
+        m.home_team_id,
+        m.away_team_id,
+        m.score_home,
+        m.score_away,
+        m.ht_score_home,
+        m.ht_score_away,
+        m.mst_utc,
+        -- Odds from odd_categories/selections
+        MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '1' THEN os.odd_value END) as ms_h,
+        MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = 'X' THEN os.odd_value END) as ms_d,
+        MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '2' THEN os.odd_value END) as ms_a,
+        MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou25_over,
+        MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Alt' THEN os.odd_value END) as ou25_under,
+        MAX(CASE WHEN oc.name = '1,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou15_over,
+        MAX(CASE WHEN oc.name = '3,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou35_over,
+        MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Var' THEN os.odd_value END) as btts_yes,
+        MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Yok' THEN os.odd_value END) as btts_no
+    FROM matches m
+    LEFT JOIN odd_categories oc ON oc.match_id = m.id
+    LEFT JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
+    WHERE m.mst_utc >= %s
+      AND m.mst_utc < %s
+      AND m.status = 'FT'
+      AND m.score_home IS NOT NULL
+      AND m.score_away IS NOT NULL
+      {league_filter}
+    GROUP BY m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, 
+             m.ht_score_home, m.ht_score_away, m.mst_utc
+    ORDER BY m.mst_utc DESC
+    """
+
+    cur.execute(query, params)
+    rows = cur.fetchall()
+    columns = [desc[0] for desc in cur.description]
+
+    df = pd.DataFrame(rows, columns=columns)
+    print(f"[Data] Fetched {len(df)} matches from {start_date} to {end_date}")
+
+    return df
+
+
+def calculate_actual_outcomes(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Add 0/1 outcome labels for every market, derived from the scores.
+
+    The frame is modified in place and also returned. New columns, in the
+    order they are added:
+    - total_goals, ht_total_goals (missing half-time scores count as 0)
+    - ms_home_actual / ms_draw_actual / ms_away_actual: full-time result
+    - ou25_over_actual / ou15_over_actual / ou35_over_actual: total goals
+      above 2.5 / 1.5 / 3.5
+    - btts_yes_actual: both teams scored
+    - ht_home_actual / ht_draw_actual / ht_away_actual: half-time result
+
+    Note: a NaN half-time score compares False in every test, so such rows
+    get 0 in all three ht_* columns.
+    """
+    home = df["score_home"]
+    away = df["score_away"]
+    ht_home = df["ht_score_home"]
+    ht_away = df["ht_score_away"]
+
+    # Goal totals come first; the over/under flags below read total_goals.
+    df["total_goals"] = home + away
+    df["ht_total_goals"] = ht_home.fillna(0) + ht_away.fillna(0)
+    goals = df["total_goals"]
+
+    # (column, boolean flag) pairs, kept in the column order callers see.
+    flags = (
+        ("ms_home_actual", home > away),
+        ("ms_draw_actual", home == away),
+        ("ms_away_actual", home < away),
+        ("ou25_over_actual", goals > 2.5),
+        ("ou15_over_actual", goals > 1.5),
+        ("ou35_over_actual", goals > 3.5),
+        ("btts_yes_actual", (home > 0) & (away > 0)),
+        ("ht_home_actual", ht_home > ht_away),
+        ("ht_draw_actual", ht_home == ht_away),
+        ("ht_away_actual", ht_home < ht_away),
+    )
+    for column, flag in flags:
+        df[column] = flag.astype(int)
+
+    return df
+
+
+def calculate_implied_probabilities(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Turn bookmaker odds into implied probabilities and odds-bucket columns.
+
+    The frame is modified in place and also returned. It must already hold
+    the odds columns and `ms_home_actual`.
+
+    Adds, in this order:
+    - ms_home_prob, ms_draw_prob, ms_away_prob
+    - ou25_over_prob, ou15_over_prob, ou35_over_prob
+    - btts_yes_prob
+    - ms_h_num: home odds as a number (NaN when unparseable)
+    - ms_home_<bucket>_prob / ms_home_<bucket>_actual for the buckets
+      heavy_fav, fav, balanced and underdog; filled only for rows whose home
+      odds fall in that bucket
+
+    Probabilities are the raw 1/odds value; no bookmaker margin is removed.
+    """
+    def _to_prob(raw) -> float:
+        """Return 1/odds, or NaN for missing, unparseable or <= 1.0 odds."""
+        if raw is None or pd.isna(raw):
+            return np.nan
+        try:
+            price = float(raw)
+        except (ValueError, TypeError):
+            return np.nan
+        return np.nan if price <= 1.0 else 1.0 / price
+
+    # (odds column, probability column), in the order columns are created.
+    odds_to_prob = (
+        ("ms_h", "ms_home_prob"),
+        ("ms_d", "ms_draw_prob"),
+        ("ms_a", "ms_away_prob"),
+        ("ou25_over", "ou25_over_prob"),
+        ("ou15_over", "ou15_over_prob"),
+        ("ou35_over", "ou35_over_prob"),
+        ("btts_yes", "btts_yes_prob"),
+    )
+    for odds_col, prob_col in odds_to_prob:
+        df[prob_col] = df[odds_col].apply(_to_prob)
+
+    # -----------------------------------------------------
+    # CONTEXT-AWARE BUCKETS
+    # Per-bucket copies of the home-win probability and outcome, keyed on
+    # the bookmaker's home odds (ms_h).
+    # -----------------------------------------------------
+    df['ms_h_num'] = pd.to_numeric(df['ms_h'], errors='coerce')
+    home_odds = df['ms_h_num']
+
+    # Rows with NaN odds fail every comparison and land in no bucket.
+    buckets = (
+        ("heavy_fav", home_odds <= 1.40),
+        ("fav", (home_odds > 1.40) & (home_odds <= 1.80)),
+        ("balanced", (home_odds > 1.80) & (home_odds <= 2.50)),
+        ("underdog", home_odds > 2.50),
+    )
+    for bucket, in_bucket in buckets:
+        for kind in ("prob", "actual"):
+            df.loc[in_bucket, f"ms_home_{bucket}_{kind}"] = df.loc[in_bucket, f"ms_home_{kind}"]
+
+    return df
+
+
+# =============================================================================
+# MODEL PREDICTIONS (Optional - if you want to calibrate model outputs)
+# =============================================================================
+def get_model_predictions(
+    df: pd.DataFrame,
+    cur,
+) -> pd.DataFrame:
+    """
+    Placeholder for attaching real model predictions to each match.
+
+    Currently a pass-through: the frame is returned untouched and `cur` is
+    not used, so the odds-implied probabilities already in `df` stand in for
+    model output.
+
+    TODO: Implement if calibrating model outputs is needed. A full version
+    would load the V20 predictor, run it for every match and store the raw
+    model probabilities.
+    """
+    # Nothing to add yet - hand the same frame back to the caller.
+    return df
+
+
+# =============================================================================
+# MAIN TRAINING
+# =============================================================================
+def train_calibration_models(
+    df: pd.DataFrame,
+    markets: Optional[List[str]] = None,
+    min_samples: int = 100,
+) -> Dict[str, Any]:
+    """
+    Train calibration models for specified markets.
+
+    Markets are skipped, with a warning, when this function has no column
+    mapping for them or when the mapped probability/outcome columns are not
+    present in `df` (for example the ht_* markets, whose probability
+    columns are not produced by this script).
+
+    Args:
+        df: DataFrame with probabilities and actual outcomes
+        markets: List of markets to train (default: all supported)
+        min_samples: Minimum samples required per market
+
+    Returns:
+        Whatever the calibrator's train_all_markets() returns for the
+        markets that were actually trainable.
+    """
+    if markets is None:
+        markets = SUPPORTED_MARKETS
+
+    calibrator = get_calibrator()
+
+    # Define market config: market -> (prob_col, actual_col)
+    market_config = {
+        "ms_home": ("ms_home_prob", "ms_home_actual"),
+        "ms_home_heavy_fav": ("ms_home_heavy_fav_prob", "ms_home_heavy_fav_actual"),
+        "ms_home_fav": ("ms_home_fav_prob", "ms_home_fav_actual"),
+        "ms_home_balanced": ("ms_home_balanced_prob", "ms_home_balanced_actual"),
+        "ms_home_underdog": ("ms_home_underdog_prob", "ms_home_underdog_actual"),
+        "ms_draw": ("ms_draw_prob", "ms_draw_actual"),
+        "ms_away": ("ms_away_prob", "ms_away_actual"),
+        "ou15": ("ou15_over_prob", "ou15_over_actual"),
+        "ou25": ("ou25_over_prob", "ou25_over_actual"),
+        "ou35": ("ou35_over_prob", "ou35_over_actual"),
+        "btts": ("btts_yes_prob", "btts_yes_actual"),
+        "ht_home": ("ht_home_prob", "ht_home_actual"),  # Note: need to add ht probs
+        "ht_draw": ("ht_draw_prob", "ht_draw_actual"),
+        "ht_away": ("ht_away_prob", "ht_away_actual"),
+    }
+
+    # Surface requested markets that would otherwise be dropped silently.
+    unmapped = [market for market in markets if market not in market_config]
+    if unmapped:
+        print(f"[Warning] No column mapping for markets: {unmapped}")
+
+    # Keep requested markets whose input columns really exist in df.
+    trainable = {}
+    for market, (prob_col, actual_col) in market_config.items():
+        if market not in markets:
+            continue
+        missing = [col for col in (prob_col, actual_col) if col not in df.columns]
+        if missing:
+            print(f"[Warning] Skipping market '{market}': missing columns {missing}")
+            continue
+        trainable[market] = (prob_col, actual_col)
+
+    # Train all markets
+    results = calibrator.train_all_markets(
+        df=df,
+        market_config=trainable,
+        min_samples=min_samples,
+    )
+
+    return results
+
+
+def print_calibration_report(results: Dict[str, Any], min_samples: int = 100):
+    """Print a formatted calibration report.
+
+    Args:
+        results: Mapping of market name to a metrics object exposing
+            `brier_score`, `calibration_error` and `sample_count`.
+        min_samples: Sample count at or above which a market is reported as
+            trained; should match the threshold used for training.
+    """
+    # Widen the first column for long market names (e.g. the odds-bucket
+    # markets) so the table stays aligned; 15 is the minimum width.
+    longest = max((len(str(name)) for name in results), default=0)
+    name_width = max(15, longest + 1)
+
+    print("\n" + "=" * 70)
+    print("CALIBRATION TRAINING REPORT")
+    print("=" * 70)
+
+    print(f"\n{'Market':<{name_width}} {'Brier':<10} {'ECE':<10} {'Samples':<10} {'Status'}")
+    print("-" * (name_width + 45))
+
+    for market, metrics in results.items():
+        status = "✓ Trained" if metrics.sample_count >= min_samples else "⚠ Insufficient"
+        print(f"{market:<{name_width}} {metrics.brier_score:<10.4f} {metrics.calibration_error:<10.4f} "
+              f"{metrics.sample_count:<10} {status}")
+
+    print("\n" + "=" * 70)
+    print("Interpretation:")
+    print("  - Brier Score: Lower is better (0 = perfect, 0.25 = random)")
+    print("  - ECE (Expected Calibration Error): Lower is better (0 = perfect)")
+    print("  - Models saved to: ai-engine/models/calibration/")
+    print("=" * 70)
+
+
+# =============================================================================
+# CLI
+# =============================================================================
+def main():
+    """Parse CLI arguments, fetch data, train the models and print a report."""
+    # Local import keeps this block independent of the module header.
+    from contextlib import closing
+
+    parser = argparse.ArgumentParser(description="Train calibration models")
+    parser.add_argument("--start", type=str, default=DEFAULT_START_DATE,
+                        help="Start date (YYYY-MM-DD)")
+    parser.add_argument("--end", type=str, default=DEFAULT_END_DATE,
+                        help="End date (YYYY-MM-DD)")
+    parser.add_argument("--markets", nargs="+", default=None,
+                        help="Markets to train (default: all)")
+    parser.add_argument("--min-samples", type=int, default=100,
+                        help="Minimum samples per market")
+    parser.add_argument("--top-leagues-only", action="store_true",
+                        help="Only use top leagues data")
+
+    args = parser.parse_args()
+
+    print(f"\n[Calibration Training] {args.start} to {args.end}")
+
+    # Load top leagues if requested
+    league_ids = None
+    if args.top_leagues_only:
+        league_ids = load_top_league_ids()
+        if league_ids:
+            print(f"[Data] Filtering to {len(league_ids)} top leagues")
+        else:
+            # An empty list disables the SQL league filter, so say so instead
+            # of silently training on every league.
+            print("[Warning] --top-leagues-only was requested but no league IDs "
+                  "were loaded; training on ALL leagues")
+
+    # Fetch data. closing() releases the cursor first and then the
+    # connection, even when creating the cursor or running the query fails.
+    # The database is not needed after the fetch, so it is released before
+    # training starts.
+    with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
+        df = fetch_training_data(cur, args.start, args.end, league_ids)
+
+    if len(df) == 0:
+        print("[Error] No data found for the specified date range")
+        return
+
+    # Calculate outcomes and probabilities
+    df = calculate_actual_outcomes(df)
+    df = calculate_implied_probabilities(df)
+
+    # Train models
+    results = train_calibration_models(
+        df=df,
+        markets=args.markets,
+        min_samples=args.min_samples,
+    )
+
+    # Print report using the same threshold the training run used
+    print_calibration_report(results, min_samples=args.min_samples)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()