Files
fahricansecer f8599bdb9a
Deploy Iddaai Backend / build-and-deploy (push) Failing after 2m1s
gg
2026-05-11 23:11:41 +03:00

183 lines
5.8 KiB
Python

"""
VQWEN v3 Shared-Contract Backtest
=================================
Evaluates the retrained VQWEN models on the temporal validation slice using
the exact same pre-match feature contract as training/runtime.
"""
from __future__ import annotations
import json
import os
import pickle
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import psycopg2
from dotenv import load_dotenv
# Filesystem anchors, all derived from the location of this file.
AI_DIR = Path(__file__).resolve().parent
ENGINE_DIR = AI_DIR.parent
REPO_DIR = ENGINE_DIR.parent
MODELS_DIR = ENGINE_DIR.joinpath("models", "vqwen")

# Put the engine directory at the front of the import path (only once) so the
# project-local imports that follow this block can be resolved.
if sys.path.count(str(ENGINE_DIR)) == 0:
    sys.path.insert(0, str(ENGINE_DIR))
from features.vqwen_contract import FEATURE_COLUMNS # noqa: E402
from train_vqwen_v3 import ( # noqa: E402
_enrich_pre_match_context,
_fetch_dataframe,
_prepare_features,
_temporal_split,
load_top_league_ids,
)
def _load_env() -> None:
    """Load ``.env`` files from the repository root, then the engine directory.

    Both calls pass ``override=False``, which in python-dotenv means values
    already present in the process environment are left untouched.
    """
    for base_dir in (REPO_DIR, ENGINE_DIR):
        load_dotenv(base_dir / ".env", override=False)
def get_clean_dsn() -> str:
    """Return ``DATABASE_URL`` with wrapping quotes and any query string removed.

    The ``.env`` files are loaded first. Everything from the first ``?``
    onwards is dropped from the value.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is unset or empty after cleaning.
    """
    _load_env()
    dsn = os.environ.get("DATABASE_URL", "")
    # Values copied from .env files sometimes keep their surrounding quotes.
    dsn = dsn.strip().strip('"').strip("'")
    if dsn == "":
        raise RuntimeError("DATABASE_URL is missing.")
    base, _sep, _query = dsn.partition("?")
    return base
def _accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
if len(y_true) == 0:
return 0.0
return float((y_true == y_pred).mean())
def _binary_metrics(prob: np.ndarray, y_true: np.ndarray) -> tuple[float, float]:
    """Return ``(accuracy, brier)`` for positive-class probabilities.

    Probabilities are thresholded at 0.5 to obtain hard labels for the
    accuracy. The Brier score is the mean squared difference between ``prob``
    and ``y_true``; it is reported as ``1.0`` when ``y_true`` is empty.
    """
    hard_labels = (prob >= 0.5).astype(int)
    accuracy = _accuracy(y_true, hard_labels)
    if len(y_true):
        brier_score = float(np.mean((prob - y_true) ** 2))
    else:
        brier_score = 1.0
    return accuracy, brier_score
def _multiclass_brier(prob: np.ndarray, y_true: np.ndarray, n_classes: int = 3) -> float:
if len(y_true) == 0:
return 1.0
target = np.zeros((len(y_true), n_classes), dtype=np.float64)
target[np.arange(len(y_true)), y_true.astype(int)] = 1.0
return float(np.mean(np.sum((prob - target) ** 2, axis=1)))
def _band_label(probability: float) -> str:
if probability >= 0.70:
return "HIGH"
if probability >= 0.60:
return "MEDIUM"
if probability >= 0.50:
return "LOW"
return "NO_BET"
def _summarize_bands(
    name: str,
    confidence: np.ndarray,
    is_correct: np.ndarray,
) -> list[str]:
    """Build one report line per confidence band (HIGH, MEDIUM, LOW).

    Args:
        name: Market label placed at the start of every line.
        confidence: Per-sample confidence of the predicted outcome.
        is_correct: Per-sample correctness indicator, aligned with
            ``confidence``.

    Returns:
        Three formatted lines giving the sample count, accuracy and mean
        confidence of each band. A band with no samples reports 0.0 for both
        accuracy and mean confidence.
    """
    # A sample's band does not depend on which band is being summarised, so
    # label every sample once instead of once per band.
    labels = [_band_label(float(p)) for p in confidence]

    lines: list[str] = []
    for band in ("HIGH", "MEDIUM", "LOW"):
        mask = np.array([label == band for label in labels], dtype=bool)
        count = int(mask.sum())
        accuracy = float(is_correct[mask].mean()) if count else 0.0
        avg_conf = float(confidence[mask].mean()) if count else 0.0
        lines.append(
            f"{name} {band:<6} count={count:<4} accuracy={accuracy*100:5.1f}% avg_conf={avg_conf*100:5.1f}%"
        )
    return lines
def _load_pickled_model(filename: str):
    """Unpickle and return one model artifact stored under ``MODELS_DIR``."""
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only load artifacts written by this project's own training run.
    with (MODELS_DIR / filename).open("rb") as handle:
        return pickle.load(handle)


def _binary_confidence(prob: np.ndarray) -> np.ndarray:
    """Return the probability of whichever side a 0.5 threshold would pick."""
    return np.where(prob >= 0.5, prob, 1.0 - prob)


def run_v3_backtest() -> None:
    """Score the saved VQWEN models on the validation slice and report results.

    Steps: fetch and enrich the dataset from the database, build features,
    take the validation part of the temporal split, load the three pickled
    models (MS, OU25, BTTS), compute accuracy / Brier metrics and per-band
    confidence summaries, print them, and write the headline metrics to
    ``vqwen_backtest_v3_summary.json`` inside ``MODELS_DIR``.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is not configured (raised by
            ``get_clean_dsn``).
    """
    print("VQWEN v3 SHARED-CONTRACT BACKTEST")
    print("=" * 60)

    league_ids = load_top_league_ids()
    dsn = get_clean_dsn()

    # psycopg2's connection context manager only ends the transaction; it does
    # not close the connection, so close it explicitly once the data is read.
    conn = psycopg2.connect(dsn)
    try:
        with conn:
            with conn.cursor() as cur:
                df = _fetch_dataframe(cur, league_ids)
                df = _enrich_pre_match_context(cur, df)
    finally:
        conn.close()

    df = _prepare_features(df)
    train_df, valid_df = _temporal_split(df)
    print(f"Toplam ornek: {len(df)} | Train: {len(train_df)} | Valid: {len(valid_df)}")

    model_ms = _load_pickled_model("vqwen_ms.pkl")
    model_ou25 = _load_pickled_model("vqwen_ou25.pkl")
    model_btts = _load_pickled_model("vqwen_btts.pkl")

    X_valid = valid_df[FEATURE_COLUMNS]
    y_ms = valid_df["t_ms"].to_numpy(dtype=np.int64)
    y_ou25 = valid_df["t_ou"].to_numpy(dtype=np.int64)
    y_btts = valid_df["t_btts"].to_numpy(dtype=np.int64)

    # NOTE(review): the code below treats ``predict`` output as probabilities
    # (one column per class for MS, a single positive-class value for the
    # binary markets) -- confirm against the model type saved by training.
    ms_prob = np.asarray(model_ms.predict(X_valid), dtype=np.float64)
    ou25_prob = np.asarray(model_ou25.predict(X_valid), dtype=np.float64).reshape(-1)
    btts_prob = np.asarray(model_btts.predict(X_valid), dtype=np.float64).reshape(-1)

    # Match result: pick the most probable class; its probability is the confidence.
    ms_pred = np.argmax(ms_prob, axis=1)
    ms_conf = np.max(ms_prob, axis=1)
    ms_correct = (ms_pred == y_ms).astype(np.int64)

    # Binary markets: threshold at 0.5; confidence is the chosen side's probability.
    ou25_pred = (ou25_prob >= 0.5).astype(np.int64)
    ou25_conf = _binary_confidence(ou25_prob)
    ou25_correct = (ou25_pred == y_ou25).astype(np.int64)

    btts_pred = (btts_prob >= 0.5).astype(np.int64)
    btts_conf = _binary_confidence(btts_prob)
    btts_correct = (btts_pred == y_btts).astype(np.int64)

    ms_acc = _accuracy(y_ms, ms_pred)
    ou25_acc, ou25_brier = _binary_metrics(ou25_prob, y_ou25)
    btts_acc, btts_brier = _binary_metrics(btts_prob, y_btts)
    ms_brier = _multiclass_brier(ms_prob, y_ms)

    print("\nGenel metrikler")
    print(f"MS accuracy : {ms_acc*100:.2f}% | multiclass_brier={ms_brier:.4f}")
    print(f"OU25 accuracy : {ou25_acc*100:.2f}% | brier={ou25_brier:.4f}")
    print(f"BTTS accuracy : {btts_acc*100:.2f}% | brier={btts_brier:.4f}")

    print("\nConfidence band")
    for line in _summarize_bands("MS", ms_conf, ms_correct):
        print(line)
    for line in _summarize_bands("OU25", ou25_conf, ou25_correct):
        print(line)
    for line in _summarize_bands("BTTS", btts_conf, btts_correct):
        print(line)

    summary = {
        "validation_samples": int(len(valid_df)),
        "metrics": {
            "ms_accuracy": round(ms_acc, 4),
            "ms_brier": round(ms_brier, 4),
            "ou25_accuracy": round(ou25_acc, 4),
            "ou25_brier": round(ou25_brier, 4),
            "btts_accuracy": round(btts_acc, 4),
            "btts_brier": round(btts_brier, 4),
        },
    }
    (MODELS_DIR / "vqwen_backtest_v3_summary.json").write_text(
        json.dumps(summary, indent=2),
        encoding="utf-8",
    )
    print("\nKaydedildi: vqwen_backtest_v3_summary.json")
# Run the backtest only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_v3_backtest()