main
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s

This commit is contained in:
2026-05-17 02:17:22 +03:00
parent 17ace9bd12
commit 94c7a4481a
53 changed files with 29602 additions and 7832 deletions
+510
View File
@@ -0,0 +1,510 @@
"""
Calibration Backfill Script
============================
Runs V25 model against historical matches (using pre-computed ai_features + odds)
to generate calibration training data, then trains isotonic calibration models.
Usage:
python ai-engine/scripts/backfill_calibration.py
python ai-engine/scripts/backfill_calibration.py --limit 5000
python ai-engine/scripts/backfill_calibration.py --min-samples 50
"""
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_ENGINE_DIR)
from models.v25_ensemble import V25Predictor
from models.calibration import get_calibrator
load_dotenv()
def _normalize_pick(pick) -> str:
return str(pick or "").strip().casefold()
def resolve_actual(market, pick, score_home, score_away, ht_home, ht_away):
    """Decide whether *pick* on *market* won, given final and half-time scores.

    Args:
        market: Market code, case-insensitive. Handled codes are ``MS``,
            ``OU15``/``OU25``/``OU35``, ``BTTS``, ``HT``, ``HTFT`` and ``DC``.
        pick: Selection label; normalised with ``_normalize_pick`` before
            matching (e.g. ``"1"``, ``"X"``, ``"Over 2.5"``, ``"Var"``,
            ``"1/X"``, ``"1-X"``).
        score_home: Full-time home goals, or ``None`` if unknown.
        score_away: Full-time away goals, or ``None`` if unknown.
        ht_home: Half-time home goals, or ``None`` if unknown.
        ht_away: Half-time away goals, or ``None`` if unknown.

    Returns:
        ``1`` if the pick won, ``0`` if it lost, or ``None`` when the outcome
        cannot be resolved (missing scores, unknown market, unrecognised or
        malformed pick).
    """
    if score_home is None or score_away is None:
        return None

    market = (market or "").upper()
    p = _normalize_pick(pick)

    if market == "MS":
        if p == "1":
            return int(score_home > score_away)
        if p in {"x", "0"}:
            return int(score_home == score_away)
        if p == "2":
            return int(score_away > score_home)
        return None

    if market in {"OU15", "OU25", "OU35"}:
        line = {"OU15": 1.5, "OU25": 2.5, "OU35": 3.5}[market]
        total = score_home + score_away
        # Over labels are tested first, so a label containing both an over
        # and an under token resolves as "over".
        if "over" in p or "üst" in p or "ust" in p:
            return int(total > line)
        if "under" in p or "alt" in p:
            return int(total < line)
        return None

    if market == "BTTS":
        both = score_home > 0 and score_away > 0
        if "yes" in p or "var" in p:
            return int(both)
        if "no" in p or "yok" in p:
            return int(not both)
        return None

    if market == "HT":
        if ht_home is None or ht_away is None:
            return None
        if p == "1":
            return int(ht_home > ht_away)
        if p in {"x", "0"}:
            return int(ht_home == ht_away)
        if p == "2":
            return int(ht_away > ht_home)
        return None

    if market == "HTFT":
        if ht_home is None or ht_away is None:
            return None
        parts = [part.strip() for part in p.split("/")]
        # Anything other than exactly "<ht>/<ft>" is unresolvable; a plain
        # two-name unpack would raise on picks with extra slashes.
        if len(parts) != 2:
            return None
        # Accept "0" as the draw code here as well, matching MS and HT.
        ht_p, ft_p = ("x" if part == "0" else part for part in parts)
        ht_actual = "1" if ht_home > ht_away else "2" if ht_away > ht_home else "x"
        ft_actual = "1" if score_home > score_away else "2" if score_away > score_home else "x"
        return int(ht_p == ht_actual and ft_p == ft_actual)

    if market == "DC":
        norm = p.replace("-", "").upper()
        if norm == "1X":
            return int(score_home >= score_away)
        if norm == "X2":
            return int(score_away >= score_home)
        if norm == "12":
            return int(score_home != score_away)
        return None

    return None
def calibrator_key(market, pick):
    """Map a (market, pick) pair to the name of its calibration model.

    Args:
        market: Market code, case-insensitive (``MS``, ``HT``, ``DC``,
            ``HTFT``, ``OU15``/``OU25``/``OU35``, ``BTTS``).
        pick: Selection label; normalised with ``_normalize_pick``.

    Returns:
        The calibrator key string, or ``None`` when no key exists for the
        pair. ``DC`` and ``HTFT`` map to one key regardless of pick. The
        over/under and BTTS markets only have a key for the "over"/"yes"
        side, so "under"/"no" picks return ``None``.
    """
    m = (market or "").upper()
    p = _normalize_pick(pick)

    if m in {"MS", "HT"}:
        side = {"1": "home", "x": "draw", "0": "draw", "2": "away"}.get(p)
        return f"{m.lower()}_{side}" if side else None

    if m == "DC":
        return "dc"
    if m == "HTFT":
        return "ht_ft"

    if m in {"OU15", "OU25", "OU35"}:
        # "ust" (ASCII spelling of "üst") is accepted by resolve_actual too;
        # recognising it here keeps resolvable picks from losing their key.
        is_over = "over" in p or "üst" in p or "ust" in p
        return m.lower() if is_over else None

    if m == "BTTS":
        return "btts" if ("yes" in p or "var" in p) else None

    return None
def get_conn():
    """Open a psycopg2 connection to the database named by ``DATABASE_URL``.

    Everything from the first ``?schema=`` onwards is removed from the URL
    before connecting (presumably because the driver does not accept that
    parameter — confirm). Rows are returned as dicts via ``RealDictCursor``.

    Raises:
        ValueError: If ``DATABASE_URL`` is unset or empty after trimming.
    """
    raw_url = os.getenv("DATABASE_URL", "")
    dsn, _, _ = raw_url.partition("?schema=")
    if dsn:
        return psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    raise ValueError("DATABASE_URL not set")
# Odds categories whose selection names are matched exactly (lower-cased).
# Each category maps "1" / "0" or "x" / "2" to "<prefix>_h" / "_d" / "_a".
ODD_CAT_MAP = {
    category: {
        "1": f"{prefix}_h",
        "0": f"{prefix}_d",
        "x": f"{prefix}_d",
        "2": f"{prefix}_a",
    }
    for category, prefix in (
        ("maç sonucu", "ms"),
        ("1. yarı sonucu", "ht_ms"),
    )
}

# Odds categories whose selection names are matched by keyword containment.
# Over/under categories map "alt" -> "<prefix>_u" and "üst" -> "<prefix>_o";
# keyword order inside each map is the order in which keywords are tried.
ODD_CAT_KEYWORD_MAP = {
    "karşılıklı gol": {"var": "btts_y", "yok": "btts_n"},
    **{
        category: {"alt": f"{prefix}_u", "üst": f"{prefix}_o"}
        for category, prefix in (
            ("0,5 alt/üst", "ou05"),
            ("1,5 alt/üst", "ou15"),
            ("2,5 alt/üst", "ou25"),
            ("3,5 alt/üst", "ou35"),
            ("ilk yarı 0,5 alt/üst", "ht_ou05"),
            ("ilk yarı 1,5 alt/üst", "ht_ou15"),
        )
    },
}
def load_matches(cur, limit: int) -> List[Dict]:
    """Fetch the most recent finished football matches that have AI features.

    Args:
        cur: Open database cursor; only ``execute`` and ``fetchall`` are used.
        limit: Maximum number of rows to return.

    Returns:
        Whatever ``cur.fetchall()`` yields for the query: one row per match
        with ``id``, ``score_home``, ``score_away``, ``ht_score_home`` and
        ``ht_score_away``, newest kick-off (``mst_utc``) first.
    """
    query = """
SELECT m.id, m.score_home, m.score_away,
m.ht_score_home, m.ht_score_away
FROM matches m
JOIN football_ai_features f ON f.match_id = m.id
WHERE m.status = 'FT'
AND m.sport = 'football'
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
ORDER BY m.mst_utc DESC
LIMIT %s
"""
    params = (limit,)
    cur.execute(query, params)
    rows = cur.fetchall()
    return rows
def load_ai_features_batch(cur, match_ids: List[str]) -> Dict[str, Dict]:
    """Load pre-computed AI features for *match_ids*, keyed by match id.

    The query renames stored columns to other feature names (e.g.
    ``home_elo`` -> ``home_overall_elo``) and fills a number of features
    with constant ``0`` / ``0.0`` placeholders.

    NOTE(review): ``h2h_draw_rate`` is computed in SQL as
    ``1.0 - h2h_home_win_rate - 0.33``, which goes negative whenever the
    home win rate exceeds 0.67 — confirm this approximation is intended.

    Args:
        cur: Open database cursor; only ``execute`` and ``fetchall`` are used.
        match_ids: Ids to look up; one ``%s`` placeholder is emitted per id.

    Returns:
        ``{str(match_id): row_as_dict}``; an empty dict when *match_ids* is
        empty (no query is issued in that case).
    """
    if match_ids:
        ph = ",".join("%s" for _ in match_ids)
        query = f"""
SELECT match_id,
home_elo AS home_overall_elo,
away_elo AS away_overall_elo,
elo_diff,
home_home_elo, away_away_elo,
home_form_elo, away_form_elo,
(home_form_elo - away_form_elo) AS form_elo_diff,
home_goals_avg_5 AS home_goals_avg,
home_conceded_avg_5 AS home_conceded_avg,
away_goals_avg_5 AS away_goals_avg,
away_conceded_avg_5 AS away_conceded_avg,
home_clean_sheet_rate, away_clean_sheet_rate,
home_scoring_rate, away_scoring_rate,
home_win_streak AS home_winning_streak,
away_win_streak AS away_winning_streak,
0 AS home_unbeaten_streak,
0 AS away_unbeaten_streak,
h2h_total AS h2h_total_matches,
h2h_home_win_rate,
(1.0 - h2h_home_win_rate - 0.33) AS h2h_draw_rate,
h2h_avg_goals,
h2h_btts_rate, h2h_over25_rate,
home_avg_possession, away_avg_possession,
home_avg_shots_on_target, away_avg_shots_on_target,
home_shot_conversion, away_shot_conversion,
0.0 AS home_avg_corners, 0.0 AS away_avg_corners,
implied_home, implied_draw, implied_away,
league_avg_goals,
0.0 AS league_zero_goal_rate,
0.0 AS home_xga, 0.0 AS away_xga,
0.0 AS upset_atmosphere, 0.0 AS upset_motivation,
0.0 AS upset_fatigue, 0.0 AS upset_potential,
referee_home_bias, referee_avg_goals,
referee_avg_cards AS referee_cards_total,
0.0 AS referee_avg_yellow,
0.0 AS referee_experience,
0.0 AS home_momentum_score, 0.0 AS away_momentum_score,
0.0 AS momentum_diff,
0.0 AS home_squad_quality, 0.0 AS away_squad_quality,
0.0 AS squad_diff,
0 AS home_key_players, 0 AS away_key_players,
missing_players_impact AS home_missing_impact,
0.0 AS away_missing_impact,
home_goals_avg_5 AS home_goals_form,
away_goals_avg_5 AS away_goals_form
FROM football_ai_features
WHERE match_id IN ({ph})
"""
        cur.execute(query, match_ids)
        features_by_match = {}
        for record in cur.fetchall():
            features_by_match[str(record["match_id"])] = dict(record)
        return features_by_match
    return {}
def load_odds_batch(cur, match_ids: List[str]) -> Dict[str, Dict[str, float]]:
    """Load bookmaker odds for *match_ids* and map them to odds feature keys.

    Category names are lower-cased and trimmed, then looked up in
    ``ODD_CAT_MAP`` (exact, lower-cased selection name) or, failing that, in
    ``ODD_CAT_KEYWORD_MAP`` (first keyword contained in the lower-cased
    selection name wins). Rows with a missing or non-positive odd value are
    ignored.

    Args:
        cur: Open database cursor; only ``execute`` and ``fetchall`` are used.
        match_ids: Ids to look up; one ``%s`` placeholder is emitted per id.

    Returns:
        ``{str(match_id): {odds_key: odd_value}}``. A match that has at least
        one positive odd gets an entry even if none of its rows mapped to a
        known key. Empty dict when *match_ids* is empty.
    """
    if not match_ids:
        return {}

    ph = ",".join("%s" for _ in match_ids)
    query = f"""
SELECT oc.match_id, oc.name AS cat_name,
os.name AS sel_name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
WHERE oc.match_id IN ({ph})
"""
    cur.execute(query, match_ids)

    odds: Dict[str, Dict[str, float]] = {}
    for record in cur.fetchall():
        match_key = str(record["match_id"])
        category = (record["cat_name"] or "").lower().strip()
        selection = (record["sel_name"] or "").strip().lower()
        raw_value = record["odd_value"]
        price = float(raw_value) if raw_value else 0
        if price <= 0:
            continue

        per_match = odds.setdefault(match_key, {})

        exact_map = ODD_CAT_MAP.get(category)
        if exact_map is not None:
            feature = exact_map.get(selection)
            if feature:
                per_match[feature] = price
            continue

        # Categories are matched by equality, so a direct lookup is the
        # same as scanning every pattern for an equal one.
        keyword_map = ODD_CAT_KEYWORD_MAP.get(category, {})
        for keyword, feature in keyword_map.items():
            if keyword in selection:
                per_match[feature] = price
                break
    return odds
def _prob_at(index):
    """Return a function that picks element *index* from a probability sequence."""
    return lambda p: p[index]


# (market code, pick label, extractor) triples; each extractor selects the
# pick's entry from a probability sequence by position.
MARKETS_TO_PREDICT = [
    ("MS", "1", _prob_at(0)),
    ("MS", "X", _prob_at(1)),
    ("MS", "2", _prob_at(2)),
    ("OU25", "Over 2.5", _prob_at(0)),
    ("BTTS", "Yes", _prob_at(0)),
    ("OU15", "Over 1.5", _prob_at(0)),
    ("OU35", "Over 3.5", _prob_at(0)),
    ("HT", "1", _prob_at(0)),
    ("HT", "X", _prob_at(1)),
    ("HT", "2", _prob_at(2)),
]
def _build_feature_dict(feature_cols, feat_row, odds_row):
    """Assemble one match's model input from stored features and loaded odds."""
    feat_dict = {}
    for col in feature_cols:
        stored = feat_row.get(col)
        if stored is not None:
            feat_dict[col] = float(stored)
        elif col.endswith("_present"):
            # Presence flag: 1.0 when the corresponding odds value was loaded.
            odds_key = col.replace("_present", "").replace("odds_", "")
            feat_dict[col] = 1.0 if odds_row.get(odds_key, 0) > 0 else 0.0
        elif col.startswith("odds_"):
            feat_dict[col] = float(odds_row.get(col.replace("odds_", ""), 0))
        else:
            feat_dict[col] = 0.0

    # Loaded 1X2 odds always override whatever the feature row supplied.
    for odds_key in ("ms_h", "ms_d", "ms_a"):
        if odds_row.get(odds_key, 0) > 0:
            feat_dict[f"odds_{odds_key}"] = odds_row[odds_key]

    ms_h = feat_dict.get("odds_ms_h", 0)
    ms_d = feat_dict.get("odds_ms_d", 0)
    ms_a = feat_dict.get("odds_ms_a", 0)
    if ms_h > 0 and ms_d > 0 and ms_a > 0:
        # Normalise the inverse odds so the three implied probabilities sum to 1.
        raw_sum = 1 / ms_h + 1 / ms_d + 1 / ms_a
        feat_dict["implied_home"] = (1 / ms_h) / raw_sum
        feat_dict["implied_draw"] = (1 / ms_d) / raw_sum
        feat_dict["implied_away"] = (1 / ms_a) / raw_sum
    return feat_dict


def _iter_match_samples(predictor, feat_dict, mid, sh, sa, ht_h, ht_a):
    """Yield one calibration sample per resolvable (market, pick) of a match.

    NOTE(review): assumes ``predict_market`` returns ``[home, draw, away]``
    for the 3-way models and the over/yes probability at index 0 for the
    binary models — confirm against ``V25Predictor``.
    """
    for model_key in ("ms", "ou25", "btts", "ou15", "ou35", "ht_result"):
        if model_key not in predictor.models:
            continue
        probs = predictor.predict_market(model_key, feat_dict)
        if probs is None:
            continue

        if model_key in ("ms", "ht_result"):
            if model_key == "ht_result" and (ht_h is None or ht_a is None):
                continue
            market = "MS" if model_key == "ms" else "HT"
            candidates = [("1", probs[0]), ("X", probs[1]), ("2", probs[2])]
        else:
            market = model_key.upper()
            label = "Yes" if model_key == "btts" else "Over"
            candidates = [(label, probs[0] if len(probs) > 0 else 0.5)]

        for pick, prob in candidates:
            actual = resolve_actual(market, pick, sh, sa, ht_h, ht_a)
            key = calibrator_key(market, pick)
            if actual is not None and key:
                yield {
                    "match_id": mid,
                    "market": market,
                    "pick": pick,
                    "key": key,
                    "raw_prob": float(prob),
                    "actual": int(actual),
                }


def _train_calibrators(df, min_samples):
    """Train one calibration model per key in *df*; return per-key summaries."""
    calibrator = get_calibrator()
    results: Dict[str, Any] = {}
    for key in sorted(df["key"].unique()):
        sub = df[df["key"] == key].copy()
        sub = sub.drop_duplicates(subset=["match_id", "key"], keep="first")
        sub = sub.dropna(subset=["raw_prob", "actual"])
        # Probabilities of exactly 0 or 1 are excluded from training.
        sub = sub[(sub["raw_prob"] > 0.0) & (sub["raw_prob"] < 1.0)]
        n = len(sub)
        if n < min_samples:
            results[key] = {"status": "skipped", "samples": n}
            continue
        metrics = calibrator.train_calibration(
            df=sub,
            market=key,
            prob_col="raw_prob",
            actual_col="actual",
            min_samples=min_samples,
            save=True,
        )
        results[key] = {
            "status": "trained",
            "samples": metrics.sample_count,
            "brier": round(metrics.brier_score, 4),
            "ece": round(metrics.calibration_error, 4),
            "mean_predicted": round(metrics.mean_predicted, 4),
            "mean_actual": round(metrics.mean_actual, 4),
        }
    return results


def _print_calibration_report(results):
    """Print the per-key calibration summary table."""
    print("\n" + "=" * 70)
    print("CALIBRATION RESULTS")
    print("=" * 70)
    print(f"{'market':<14} {'status':<10} {'n':<8} {'brier':<9} {'ece':<8} {'pred_avg':<9} {'actual_avg'}")
    print("-" * 70)
    for key, info in sorted(results.items()):
        if info["status"] == "trained":
            print(
                f"{key:<14} {'OK':<10} {info['samples']:<8} "
                f"{info['brier']:<9.4f} {info['ece']:<8.4f} "
                f"{info['mean_predicted']:<9.4f} {info['mean_actual']}"
            )
        else:
            print(f"{key:<14} {'SKIP':<10} {info['samples']:<8}")
    print("=" * 70)


def run_backfill(args):
    """Generate calibration samples from historical matches and train calibrators.

    Loads finished matches with their stored features and odds, runs the V25
    models on each match, compares the predicted probabilities with the
    actual outcomes, and trains one calibration model per calibrator key.
    Progress and results are printed to stdout.

    Args:
        args: Parsed CLI arguments; ``args.limit`` caps the number of matches
            and ``args.min_samples`` is the minimum sample count per key.
    """
    print("=" * 70)
    print("CALIBRATION BACKFILL")
    print("=" * 70)

    # The database is only needed for the three load queries, so the
    # connection is released (even on failure) before the long-running
    # prediction and training phases.
    conn = get_conn()
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        try:
            t0 = time.time()
            print(f"Loading matches (limit={args.limit})...")
            matches = load_matches(cur, args.limit)
            print(f" Found {len(matches)} finished matches with ai_features")
            match_ids = [str(m["id"]) for m in matches]
            match_map = {str(m["id"]): m for m in matches}
            print("Loading ai_features...")
            features_map = load_ai_features_batch(cur, match_ids)
            print(f" Loaded features for {len(features_map)} matches")
            print("Loading odds...")
            odds_map = load_odds_batch(cur, match_ids)
            print(f" Loaded odds for {len(odds_map)} matches")
            print(f"Data loading: {time.time() - t0:.1f}s")
        finally:
            cur.close()
    finally:
        conn.close()

    print("\nLoading V25 model...")
    predictor = V25Predictor()
    predictor.load_models()
    feature_cols = predictor.FEATURE_COLS

    samples: List[Dict[str, Any]] = []
    skipped = 0
    errors = 0
    processed = 0
    print(f"\nRunning predictions on {len(match_ids)} matches...")
    t1 = time.time()
    for i, mid in enumerate(match_ids):
        feat_row = features_map.get(mid)
        if feat_row is None:
            skipped += 1
            continue
        match_row = match_map[mid]
        feat_dict = _build_feature_dict(feature_cols, feat_row, odds_map.get(mid, {}))
        sh = match_row["score_home"]
        sa = match_row["score_away"]
        ht_h = match_row.get("ht_score_home")
        ht_a = match_row.get("ht_score_away")
        try:
            # Appending one by one keeps samples produced before a mid-match
            # failure, as the per-market loop always did.
            for sample in _iter_match_samples(predictor, feat_dict, mid, sh, sa, ht_h, ht_a):
                samples.append(sample)
            processed += 1
        except Exception as e:
            # Deliberate best-effort: one bad match must not abort the run.
            skipped += 1
            errors += 1
            # Throttle on real errors only, so missing-feature skips cannot
            # use up the quota of printed messages.
            if errors <= 5:
                print(f" Error on {mid}: {e}")
        if (i + 1) % 5000 == 0:
            elapsed = time.time() - t1
            rate = (i + 1) / elapsed if elapsed > 0 else 0.0
            print(f" Processed {i+1}/{len(match_ids)} ({rate:.0f} matches/s)")
    elapsed = time.time() - t1
    print(f"\nPrediction complete: {processed} matches, {skipped} skipped, {elapsed:.1f}s")

    if not samples:
        print("No calibration samples generated!")
        return

    df = pd.DataFrame(samples)
    print(f"\nTotal calibration samples: {len(df)}")
    print(f"Unique matches: {df['match_id'].nunique()}")
    print(f"\nPer-key counts:")
    for key, count in df["key"].value_counts().items():
        print(f" {key:<14} {count}")

    print(f"\nTraining isotonic calibration models (min_samples={args.min_samples})...")
    results = _train_calibrators(df, args.min_samples)
    _print_calibration_report(results)

    total_time = time.time() - t0
    print(f"\nTotal time: {total_time:.1f}s")
    print(f"Calibration models saved to: {os.path.join(AI_ENGINE_DIR, 'models', 'calibration')}/")
def main():
    """Parse the command-line flags and run the calibration backfill.

    Flags:
        --limit: maximum number of matches to process (default 50000).
        --min-samples: minimum samples per market for calibration (default 100).
    """
    cli = argparse.ArgumentParser(description="Backfill calibration from historical matches")
    cli.add_argument(
        "--limit",
        type=int,
        default=50000,
        help="Max matches to process (default: 50000)",
    )
    cli.add_argument(
        "--min-samples",
        type=int,
        default=100,
        help="Min samples per market for calibration (default: 100)",
    )
    run_backfill(cli.parse_args())


if __name__ == "__main__":
    main()