""" VQWEN v3 Shared-Contract Backtest ================================= Evaluates the retrained VQWEN models on the temporal validation slice using the exact same pre-match feature contract as training/runtime. """ from __future__ import annotations import json import os import pickle import sys from pathlib import Path import numpy as np import pandas as pd import psycopg2 from dotenv import load_dotenv AI_DIR = Path(__file__).resolve().parent ENGINE_DIR = AI_DIR.parent REPO_DIR = ENGINE_DIR.parent MODELS_DIR = ENGINE_DIR / "models" / "vqwen" if str(ENGINE_DIR) not in sys.path: sys.path.insert(0, str(ENGINE_DIR)) from features.vqwen_contract import FEATURE_COLUMNS # noqa: E402 from train_vqwen_v3 import ( # noqa: E402 _enrich_pre_match_context, _fetch_dataframe, _prepare_features, _temporal_split, load_top_league_ids, ) def _load_env() -> None: load_dotenv(REPO_DIR / ".env", override=False) load_dotenv(ENGINE_DIR / ".env", override=False) def get_clean_dsn() -> str: _load_env() raw = os.getenv("DATABASE_URL", "").strip().strip('"').strip("'") if not raw: raise RuntimeError("DATABASE_URL is missing.") return raw.split("?", 1)[0] def _accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float: if len(y_true) == 0: return 0.0 return float((y_true == y_pred).mean()) def _binary_metrics(prob: np.ndarray, y_true: np.ndarray) -> tuple[float, float]: pred = (prob >= 0.5).astype(int) acc = _accuracy(y_true, pred) brier = float(np.mean((prob - y_true) ** 2)) if len(y_true) else 1.0 return acc, brier def _multiclass_brier(prob: np.ndarray, y_true: np.ndarray, n_classes: int = 3) -> float: if len(y_true) == 0: return 1.0 target = np.zeros((len(y_true), n_classes), dtype=np.float64) target[np.arange(len(y_true)), y_true.astype(int)] = 1.0 return float(np.mean(np.sum((prob - target) ** 2, axis=1))) def _band_label(probability: float) -> str: if probability >= 0.70: return "HIGH" if probability >= 0.60: return "MEDIUM" if probability >= 0.50: return "LOW" return "NO_BET" def _summarize_bands( name: str, confidence: np.ndarray, is_correct: np.ndarray, ) -> list[str]: lines: list[str] = [] for band in ("HIGH", "MEDIUM", "LOW"): mask = np.array([_band_label(float(p)) == band for p in confidence], dtype=bool) count = int(mask.sum()) accuracy = float(is_correct[mask].mean()) if count else 0.0 avg_conf = float(confidence[mask].mean()) if count else 0.0 lines.append( f"{name} {band:<6} count={count:<4} accuracy={accuracy*100:5.1f}% avg_conf={avg_conf*100:5.1f}%" ) return lines def run_v3_backtest() -> None: print("VQWEN v3 SHARED-CONTRACT BACKTEST") print("=" * 60) league_ids = load_top_league_ids() dsn = get_clean_dsn() with psycopg2.connect(dsn) as conn: with conn.cursor() as cur: df = _fetch_dataframe(cur, league_ids) df = _enrich_pre_match_context(cur, df) df = _prepare_features(df) train_df, valid_df = _temporal_split(df) print(f"Toplam ornek: {len(df)} | Train: {len(train_df)} | Valid: {len(valid_df)}") with (MODELS_DIR / "vqwen_ms.pkl").open("rb") as handle: model_ms = pickle.load(handle) with (MODELS_DIR / "vqwen_ou25.pkl").open("rb") as handle: model_ou25 = pickle.load(handle) with (MODELS_DIR / "vqwen_btts.pkl").open("rb") as handle: model_btts = pickle.load(handle) X_valid = valid_df[FEATURE_COLUMNS] y_ms = valid_df["t_ms"].to_numpy(dtype=np.int64) y_ou25 = valid_df["t_ou"].to_numpy(dtype=np.int64) y_btts = valid_df["t_btts"].to_numpy(dtype=np.int64) ms_prob = np.asarray(model_ms.predict(X_valid), dtype=np.float64) ou25_prob = np.asarray(model_ou25.predict(X_valid), dtype=np.float64).reshape(-1) btts_prob = np.asarray(model_btts.predict(X_valid), dtype=np.float64).reshape(-1) ms_pred = np.argmax(ms_prob, axis=1) ms_conf = np.max(ms_prob, axis=1) ms_correct = (ms_pred == y_ms).astype(np.int64) ou25_pred = (ou25_prob >= 0.5).astype(np.int64) ou25_conf = np.where(ou25_prob >= 0.5, ou25_prob, 1.0 - ou25_prob) ou25_correct = (ou25_pred == y_ou25).astype(np.int64) btts_pred = (btts_prob >= 0.5).astype(np.int64) btts_conf = np.where(btts_prob >= 0.5, btts_prob, 1.0 - btts_prob) btts_correct = (btts_pred == y_btts).astype(np.int64) ms_acc = _accuracy(y_ms, ms_pred) ou25_acc, ou25_brier = _binary_metrics(ou25_prob, y_ou25) btts_acc, btts_brier = _binary_metrics(btts_prob, y_btts) ms_brier = _multiclass_brier(ms_prob, y_ms) print("\nGenel metrikler") print(f"MS accuracy : {ms_acc*100:.2f}% | multiclass_brier={ms_brier:.4f}") print(f"OU25 accuracy : {ou25_acc*100:.2f}% | brier={ou25_brier:.4f}") print(f"BTTS accuracy : {btts_acc*100:.2f}% | brier={btts_brier:.4f}") print("\nConfidence band") for line in _summarize_bands("MS", ms_conf, ms_correct): print(line) for line in _summarize_bands("OU25", ou25_conf, ou25_correct): print(line) for line in _summarize_bands("BTTS", btts_conf, btts_correct): print(line) summary = { "validation_samples": int(len(valid_df)), "metrics": { "ms_accuracy": round(ms_acc, 4), "ms_brier": round(ms_brier, 4), "ou25_accuracy": round(ou25_acc, 4), "ou25_brier": round(ou25_brier, 4), "btts_accuracy": round(btts_acc, 4), "btts_brier": round(btts_brier, 4), }, } (MODELS_DIR / "vqwen_backtest_v3_summary.json").write_text( json.dumps(summary, indent=2), encoding="utf-8", ) print("\nKaydedildi: vqwen_backtest_v3_summary.json") if __name__ == "__main__": run_v3_backtest()