Files
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

424 lines
15 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Calibration Training Script
===========================
Trains Isotonic Regression calibration models for all betting markets.
This script:
1. Fetches historical match data with predictions and actual results
2. Trains Isotonic Regression models for each market
3. Calculates calibration metrics (Brier Score, ECE)
4. Saves models to ai-engine/models/calibration/
Usage:
# Train on last 90 days of data
python3 ai-engine/scripts/train_calibration.py
# Train on specific date range
python3 ai-engine/scripts/train_calibration.py --start 2026-01-01 --end 2026-02-15
# Train only specific markets
python3 ai-engine/scripts/train_calibration.py --markets ou25 btts ms_home
"""
import os
import sys
import json
import argparse
import psycopg2
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dotenv import load_dotenv
from typing import Dict, List, Tuple, Any, Optional
# Setup path for ai-engine imports
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)
from models.calibration import get_calibrator, SUPPORTED_MARKETS
load_dotenv()
# =============================================================================
# CONFIG
# =============================================================================
TOP_LEAGUES_PATH = os.path.join(
os.path.dirname(os.path.dirname(AI_ENGINE_DIR)),
"top_leagues.json"
)
# Default: last 90 days
DEFAULT_START_DATE = (datetime.utcnow() - timedelta(days=90)).strftime("%Y-%m-%d")
DEFAULT_END_DATE = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
# =============================================================================
# DB CONNECTION
# =============================================================================
def get_conn():
"""Get PostgreSQL connection."""
db_url = os.getenv("DATABASE_URL")
if not db_url:
raise ValueError("DATABASE_URL not set")
if "?schema=" in db_url:
db_url = db_url.split("?schema=")[0]
return psycopg2.connect(db_url)
def load_top_league_ids() -> List[str]:
"""Load top league IDs from JSON file."""
if not os.path.exists(TOP_LEAGUES_PATH):
print(f"[Warning] top_leagues.json not found at {TOP_LEAGUES_PATH}")
return []
with open(TOP_LEAGUES_PATH, "r") as f:
data = json.load(f)
# Handle both list and dict formats
if isinstance(data, dict):
return data.get("football", [])
return data
# =============================================================================
# DATA EXTRACTION
# =============================================================================
def fetch_training_data(
cur,
start_date: str,
end_date: str,
league_ids: List[str] = None,
) -> pd.DataFrame:
"""
Fetch match data with odds and results for calibration training.
Returns DataFrame with columns:
- match_id
- home_team, away_team
- ms_h, ms_d, ms_a (odds)
- score_home, score_away (actual result)
- ht_score_home, ht_score_away
- ou25_actual, btts_actual, etc.
"""
start_ms = int(datetime.strptime(start_date, "%Y-%m-%d").timestamp() * 1000)
end_ms = int(datetime.strptime(end_date, "%Y-%m-%d").timestamp() * 1000) + 86400000 # +1 day
# Build league filter
league_filter = ""
params = [start_ms, end_ms]
if league_ids:
placeholders = ",".join(["%s"] * len(league_ids))
league_filter = f"AND m.league_id IN ({placeholders})"
params.extend(league_ids)
query = f"""
SELECT
m.id as match_id,
m.home_team_id,
m.away_team_id,
m.score_home,
m.score_away,
m.ht_score_home,
m.ht_score_away,
m.mst_utc,
-- Odds from odd_categories/selections
MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '1' THEN os.odd_value END) as ms_h,
MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = 'X' THEN os.odd_value END) as ms_d,
MAX(CASE WHEN oc.name = 'Maç Sonucu' AND os.name = '2' THEN os.odd_value END) as ms_a,
MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou25_over,
MAX(CASE WHEN oc.name = '2,5 Alt/Üst' AND os.name = 'Alt' THEN os.odd_value END) as ou25_under,
MAX(CASE WHEN oc.name = '1,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou15_over,
MAX(CASE WHEN oc.name = '3,5 Alt/Üst' AND os.name = 'Üst' THEN os.odd_value END) as ou35_over,
MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Var' THEN os.odd_value END) as btts_yes,
MAX(CASE WHEN oc.name = 'Karşılıklı Gol' AND os.name = 'Yok' THEN os.odd_value END) as btts_no
FROM matches m
LEFT JOIN odd_categories oc ON oc.match_id = m.id
LEFT JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
WHERE m.mst_utc >= %s
AND m.mst_utc < %s
AND m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
{league_filter}
GROUP BY m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away, m.mst_utc
ORDER BY m.mst_utc DESC
"""
cur.execute(query, params)
rows = cur.fetchall()
columns = [desc[0] for desc in cur.description]
df = pd.DataFrame(rows, columns=columns)
print(f"[Data] Fetched {len(df)} matches from {start_date} to {end_date}")
return df
def calculate_actual_outcomes(df: pd.DataFrame) -> pd.DataFrame:
"""
Calculate actual binary outcomes for each market.
Adds columns:
- ms_home_actual: 1 if home won, 0 otherwise
- ms_draw_actual: 1 if draw, 0 otherwise
- ms_away_actual: 1 if away won, 0 otherwise
- ou25_over_actual: 1 if total goals > 2.5, 0 otherwise
- ou15_over_actual: 1 if total goals > 1.5, 0 otherwise
- ou35_over_actual: 1 if total goals > 3.5, 0 otherwise
- btts_yes_actual: 1 if both teams scored, 0 otherwise
"""
# Total goals
df["total_goals"] = df["score_home"] + df["score_away"]
df["ht_total_goals"] = df["ht_score_home"].fillna(0) + df["ht_score_away"].fillna(0)
# Match result outcomes
df["ms_home_actual"] = (df["score_home"] > df["score_away"]).astype(int)
df["ms_draw_actual"] = (df["score_home"] == df["score_away"]).astype(int)
df["ms_away_actual"] = (df["score_home"] < df["score_away"]).astype(int)
# Over/Under outcomes
df["ou25_over_actual"] = (df["total_goals"] > 2.5).astype(int)
df["ou15_over_actual"] = (df["total_goals"] > 1.5).astype(int)
df["ou35_over_actual"] = (df["total_goals"] > 3.5).astype(int)
# BTTS outcome
df["btts_yes_actual"] = ((df["score_home"] > 0) & (df["score_away"] > 0)).astype(int)
# Half-Time result
df["ht_home_actual"] = (df["ht_score_home"] > df["ht_score_away"]).astype(int)
df["ht_draw_actual"] = (df["ht_score_home"] == df["ht_score_away"]).astype(int)
df["ht_away_actual"] = (df["ht_score_home"] < df["ht_score_away"]).astype(int)
return df
def calculate_implied_probabilities(df: pd.DataFrame) -> pd.DataFrame:
"""
Calculate implied probabilities from odds.
Adds columns:
- ms_home_prob: implied probability from odds
- ms_draw_prob
- ms_away_prob
- ou25_over_prob
- etc.
"""
def safe_implied_prob(odd_str: str) -> float:
"""Convert odds string to implied probability."""
if pd.isna(odd_str) or odd_str is None:
return np.nan
try:
odd = float(odd_str)
if odd <= 1.0:
return np.nan
return 1.0 / odd
except (ValueError, TypeError):
return np.nan
# Match result implied probabilities
df["ms_home_prob"] = df["ms_h"].apply(safe_implied_prob)
df["ms_draw_prob"] = df["ms_d"].apply(safe_implied_prob)
df["ms_away_prob"] = df["ms_a"].apply(safe_implied_prob)
# Over/Under implied probabilities
df["ou25_over_prob"] = df["ou25_over"].apply(safe_implied_prob)
df["ou15_over_prob"] = df["ou15_over"].apply(safe_implied_prob)
df["ou35_over_prob"] = df["ou35_over"].apply(safe_implied_prob)
# BTTS implied probabilities
df["btts_yes_prob"] = df["btts_yes"].apply(safe_implied_prob)
# -----------------------------------------------------
# CONTEXT-AWARE BUCKETS
# Create separate probability and actual columns for odds buckets
# ms_home odds: ms_h (note ms_h is the bookmaker odds for home win)
# -----------------------------------------------------
# Helper to safe-cast to float
df['ms_h_num'] = pd.to_numeric(df['ms_h'], errors='coerce')
# Bucket 1: Heavy Fav (odds <= 1.40)
b1_mask = df['ms_h_num'] <= 1.40
df.loc[b1_mask, 'ms_home_heavy_fav_prob'] = df.loc[b1_mask, 'ms_home_prob']
df.loc[b1_mask, 'ms_home_heavy_fav_actual'] = df.loc[b1_mask, 'ms_home_actual']
# Bucket 2: Fav (1.40 < odds <= 1.80)
b2_mask = (df['ms_h_num'] > 1.40) & (df['ms_h_num'] <= 1.80)
df.loc[b2_mask, 'ms_home_fav_prob'] = df.loc[b2_mask, 'ms_home_prob']
df.loc[b2_mask, 'ms_home_fav_actual'] = df.loc[b2_mask, 'ms_home_actual']
# Bucket 3: Balanced (1.80 < odds <= 2.50)
b3_mask = (df['ms_h_num'] > 1.80) & (df['ms_h_num'] <= 2.50)
df.loc[b3_mask, 'ms_home_balanced_prob'] = df.loc[b3_mask, 'ms_home_prob']
df.loc[b3_mask, 'ms_home_balanced_actual'] = df.loc[b3_mask, 'ms_home_actual']
# Bucket 4: Underdog (odds > 2.50)
b4_mask = df['ms_h_num'] > 2.50
df.loc[b4_mask, 'ms_home_underdog_prob'] = df.loc[b4_mask, 'ms_home_prob']
df.loc[b4_mask, 'ms_home_underdog_actual'] = df.loc[b4_mask, 'ms_home_actual']
return df
# =============================================================================
# MODEL PREDICTIONS (Optional - if you want to calibrate model outputs)
# =============================================================================
def get_model_predictions(
df: pd.DataFrame,
cur,
) -> pd.DataFrame:
"""
Get model predictions for each match.
This is optional - if you want to calibrate model outputs rather than
raw odds-implied probabilities.
TODO: Implement if needed. For now, we use odds-implied probabilities
as a proxy for model predictions.
"""
# For now, return odds-implied probabilities as "model predictions"
# In a full implementation, you would:
# 1. Load the V20 predictor
# 2. Run predictions for each match
# 3. Store raw model probabilities
return df
# =============================================================================
# MAIN TRAINING
# =============================================================================
def train_calibration_models(
df: pd.DataFrame,
markets: List[str] = None,
min_samples: int = 100,
) -> Dict[str, Any]:
"""
Train calibration models for specified markets.
Args:
df: DataFrame with probabilities and actual outcomes
markets: List of markets to train (default: all supported)
min_samples: Minimum samples required per market
Returns:
Dict with training results
"""
if markets is None:
markets = SUPPORTED_MARKETS
calibrator = get_calibrator()
# Define market config: market -> (prob_col, actual_col)
market_config = {
"ms_home": ("ms_home_prob", "ms_home_actual"),
"ms_home_heavy_fav": ("ms_home_heavy_fav_prob", "ms_home_heavy_fav_actual"),
"ms_home_fav": ("ms_home_fav_prob", "ms_home_fav_actual"),
"ms_home_balanced": ("ms_home_balanced_prob", "ms_home_balanced_actual"),
"ms_home_underdog": ("ms_home_underdog_prob", "ms_home_underdog_actual"),
"ms_draw": ("ms_draw_prob", "ms_draw_actual"),
"ms_away": ("ms_away_prob", "ms_away_actual"),
"ou15": ("ou15_over_prob", "ou15_over_actual"),
"ou25": ("ou25_over_prob", "ou25_over_actual"),
"ou35": ("ou35_over_prob", "ou35_over_actual"),
"btts": ("btts_yes_prob", "btts_yes_actual"),
"ht_home": ("ht_home_prob", "ht_home_actual"), # Note: need to add ht probs
"ht_draw": ("ht_draw_prob", "ht_draw_actual"),
"ht_away": ("ht_away_prob", "ht_away_actual"),
}
# Filter to requested markets
market_config = {k: v for k, v in market_config.items() if k in markets}
# Train all markets
results = calibrator.train_all_markets(
df=df,
market_config=market_config,
min_samples=min_samples,
)
return results
def print_calibration_report(results: Dict[str, Any]):
"""Print a formatted calibration report."""
print("\n" + "=" * 70)
print("CALIBRATION TRAINING REPORT")
print("=" * 70)
print(f"\n{'Market':<15} {'Brier':<10} {'ECE':<10} {'Samples':<10} {'Status'}")
print("-" * 60)
for market, metrics in results.items():
status = "✓ Trained" if metrics.sample_count >= 100 else "⚠ Insufficient"
print(f"{market:<15} {metrics.brier_score:<10.4f} {metrics.calibration_error:<10.4f} "
f"{metrics.sample_count:<10} {status}")
print("\n" + "=" * 70)
print("Interpretation:")
print(" - Brier Score: Lower is better (0 = perfect, 0.25 = random)")
print(" - ECE (Expected Calibration Error): Lower is better (0 = perfect)")
print(" - Models saved to: ai-engine/models/calibration/")
print("=" * 70)
# =============================================================================
# CLI
# =============================================================================
def main():
parser = argparse.ArgumentParser(description="Train calibration models")
parser.add_argument("--start", type=str, default=DEFAULT_START_DATE,
help="Start date (YYYY-MM-DD)")
parser.add_argument("--end", type=str, default=DEFAULT_END_DATE,
help="End date (YYYY-MM-DD)")
parser.add_argument("--markets", nargs="+", default=None,
help="Markets to train (default: all)")
parser.add_argument("--min-samples", type=int, default=100,
help="Minimum samples per market")
parser.add_argument("--top-leagues-only", action="store_true",
help="Only use top leagues data")
args = parser.parse_args()
print(f"\n[Calibration Training] {args.start} to {args.end}")
# Load top leagues if requested
league_ids = None
if args.top_leagues_only:
league_ids = load_top_league_ids()
print(f"[Data] Filtering to {len(league_ids)} top leagues")
# Fetch data
conn = get_conn()
cur = conn.cursor()
try:
df = fetch_training_data(cur, args.start, args.end, league_ids)
if len(df) == 0:
print("[Error] No data found for the specified date range")
return
# Calculate outcomes and probabilities
df = calculate_actual_outcomes(df)
df = calculate_implied_probabilities(df)
# Train models
results = train_calibration_models(
df=df,
markets=args.markets,
min_samples=args.min_samples,
)
# Print report
print_calibration_report(results)
finally:
cur.close()
conn.close()
if __name__ == "__main__":
main()