# iddaai-be/ai-engine/scripts/backtest_vqwen_v3.py

"""
VQWEN v3 Shared-Contract Backtest
=================================

Evaluates the retrained VQWEN models on the temporal validation slice using
the exact same pre-match feature contract as training/runtime.
"""

from __future__ import annotations

import json
import os
import pickle
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from dotenv import load_dotenv

# Filesystem anchors, each derived from the location of this script.
AI_DIR = Path(__file__).resolve().parent  # directory that contains this file
ENGINE_DIR = AI_DIR.parent
REPO_DIR = ENGINE_DIR.parent
MODELS_DIR = ENGINE_DIR.joinpath("models", "vqwen")

# Put ENGINE_DIR at the front of the import path exactly once; presumably this
# is what lets packages under it (e.g. ``features``) be imported further down.
if sys.path.count(str(ENGINE_DIR)) == 0:
    sys.path[:0] = [str(ENGINE_DIR)]

from features.vqwen_contract import FEATURE_COLUMNS  # noqa: E402
from train_vqwen_v3 import (  # noqa: E402
    _enrich_pre_match_context,
    _fetch_dataframe,
    _prepare_features,
    _temporal_split,
    load_top_league_ids,
)


def _load_env() -> None:
    """Load ``.env`` files from the repository root, then the engine directory.

    Both calls pass ``override=False`` to ``load_dotenv``; per python-dotenv
    semantics this should leave already-set environment variables untouched,
    so the repo-level file (loaded first) wins over the engine-level one.
    """
    for base_dir in (REPO_DIR, ENGINE_DIR):
        load_dotenv(base_dir / ".env", override=False)


def get_clean_dsn() -> str:
    """Return ``DATABASE_URL`` stripped of quotes and of any query string.

    The ``.env`` files are loaded first. The value is trimmed of surrounding
    whitespace, then of double quotes, then of single quotes, and everything
    from the first ``?`` onward is dropped.

    Raises:
        RuntimeError: If ``DATABASE_URL`` is unset or empty after trimming.
    """
    _load_env()
    dsn = os.environ.get("DATABASE_URL", "")
    dsn = dsn.strip().strip('"').strip("'")
    if dsn:
        # Keep only the part before the query string.
        base, _sep, _query = dsn.partition("?")
        return base
    raise RuntimeError("DATABASE_URL is missing.")


def _accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    if len(y_true) == 0:
        return 0.0
    return float((y_true == y_pred).mean())


def _binary_metrics(prob: np.ndarray, y_true: np.ndarray) -> tuple[float, float]:
    """Return ``(accuracy, brier)`` for binary probabilities against labels.

    Predictions are thresholded at 0.5 (inclusive) for the accuracy. The Brier
    score is the mean squared difference between ``prob`` and ``y_true``, and
    falls back to ``1.0`` when ``y_true`` is empty.
    """
    hard_calls = (prob >= 0.5).astype(int)
    accuracy = _accuracy(y_true, hard_calls)

    if len(y_true):
        squared_error = (prob - y_true) ** 2
        brier_score = float(np.mean(squared_error))
    else:
        brier_score = 1.0

    return accuracy, brier_score


def _multiclass_brier(prob: np.ndarray, y_true: np.ndarray, n_classes: int = 3) -> float:
    if len(y_true) == 0:
        return 1.0
    target = np.zeros((len(y_true), n_classes), dtype=np.float64)
    target[np.arange(len(y_true)), y_true.astype(int)] = 1.0
    return float(np.mean(np.sum((prob - target) ** 2, axis=1)))


def _band_label(probability: float) -> str:
    if probability >= 0.70:
        return "HIGH"
    if probability >= 0.60:
        return "MEDIUM"
    if probability >= 0.50:
        return "LOW"
    return "NO_BET"


def _summarize_bands(
    name: str,
    confidence: np.ndarray,
    is_correct: np.ndarray,
) -> list[str]:
    """Build one report line per confidence band (HIGH, MEDIUM, LOW).

    Args:
        name: Market label that prefixes every line (e.g. ``"MS"``).
        confidence: Per-sample confidence of the predicted outcome.
        is_correct: Per-sample correctness flags, aligned with ``confidence``.

    Returns:
        Three formatted lines giving the sample count, accuracy and average
        confidence of each band. A band with no samples reports 0 for both
        percentages. Samples labelled ``NO_BET`` are not reported.
    """
    # Classify each confidence once up front rather than once per band.
    labels = [_band_label(float(p)) for p in confidence]

    lines: list[str] = []
    for band in ("HIGH", "MEDIUM", "LOW"):
        mask = np.array([label == band for label in labels], dtype=bool)
        count = int(mask.sum())
        accuracy = float(is_correct[mask].mean()) if count else 0.0
        avg_conf = float(confidence[mask].mean()) if count else 0.0
        lines.append(
            f"{name} {band:<6} count={count:<4} accuracy={accuracy*100:5.1f}% avg_conf={avg_conf*100:5.1f}%"
        )
    return lines


def _load_pickled_model(filename: str):
    """Unpickle one model artifact stored under ``MODELS_DIR``.

    NOTE(security): ``pickle.load`` can execute arbitrary code contained in
    the file, so this must only ever be pointed at trusted, locally produced
    model artifacts.
    """
    with (MODELS_DIR / filename).open("rb") as handle:
        return pickle.load(handle)


def run_v3_backtest() -> None:
    """Score the saved VQWEN models on the validation slice and report results.

    Steps:
      1. Fetch, enrich and prepare the dataset through the helpers imported
         from ``train_vqwen_v3``, then split it with ``_temporal_split``.
      2. Load the three pickled models (MS, OU25, BTTS) from ``MODELS_DIR``.
      3. Predict on the validation rows restricted to ``FEATURE_COLUMNS``.
      4. Print accuracy / Brier metrics and per-confidence-band summaries.
      5. Write the headline metrics to ``vqwen_backtest_v3_summary.json``
         inside ``MODELS_DIR``.

    Raises:
        RuntimeError: If ``DATABASE_URL`` is missing (from ``get_clean_dsn``).
    """
    print("VQWEN v3 SHARED-CONTRACT BACKTEST")
    print("=" * 60)

    league_ids = load_top_league_ids()
    dsn = get_clean_dsn()

    # psycopg2's connection context manager only commits / rolls back the
    # transaction; it does NOT close the connection. Close it explicitly so
    # the connection is released even when a query fails.
    conn = psycopg2.connect(dsn)
    try:
        with conn, conn.cursor() as cur:
            df = _fetch_dataframe(cur, league_ids)
            df = _enrich_pre_match_context(cur, df)
            df = _prepare_features(df)
    finally:
        conn.close()

    train_df, valid_df = _temporal_split(df)
    print(f"Toplam ornek: {len(df)} | Train: {len(train_df)} | Valid: {len(valid_df)}")

    model_ms = _load_pickled_model("vqwen_ms.pkl")
    model_ou25 = _load_pickled_model("vqwen_ou25.pkl")
    model_btts = _load_pickled_model("vqwen_btts.pkl")

    X_valid = valid_df[FEATURE_COLUMNS]
    y_ms = valid_df["t_ms"].to_numpy(dtype=np.int64)
    y_ou25 = valid_df["t_ou"].to_numpy(dtype=np.int64)
    y_btts = valid_df["t_btts"].to_numpy(dtype=np.int64)

    # NOTE(review): assumes each model's predict() returns probabilities --
    # a 2-D (samples, classes) array for MS and one positive-class
    # probability per sample for OU25 / BTTS. Confirm against training code.
    ms_prob = np.asarray(model_ms.predict(X_valid), dtype=np.float64)
    ou25_prob = np.asarray(model_ou25.predict(X_valid), dtype=np.float64).reshape(-1)
    btts_prob = np.asarray(model_btts.predict(X_valid), dtype=np.float64).reshape(-1)

    # Multiclass market: the pick is the most probable class and its
    # probability is the confidence.
    ms_pred = np.argmax(ms_prob, axis=1)
    ms_conf = np.max(ms_prob, axis=1)
    ms_correct = (ms_pred == y_ms).astype(np.int64)

    # Binary markets: confidence is the probability of whichever side was
    # picked, so it is always >= 0.5.
    ou25_pred = (ou25_prob >= 0.5).astype(np.int64)
    ou25_conf = np.where(ou25_prob >= 0.5, ou25_prob, 1.0 - ou25_prob)
    ou25_correct = (ou25_pred == y_ou25).astype(np.int64)

    btts_pred = (btts_prob >= 0.5).astype(np.int64)
    btts_conf = np.where(btts_prob >= 0.5, btts_prob, 1.0 - btts_prob)
    btts_correct = (btts_pred == y_btts).astype(np.int64)

    ms_acc = _accuracy(y_ms, ms_pred)
    ou25_acc, ou25_brier = _binary_metrics(ou25_prob, y_ou25)
    btts_acc, btts_brier = _binary_metrics(btts_prob, y_btts)
    ms_brier = _multiclass_brier(ms_prob, y_ms)

    print("\nGenel metrikler")
    print(f"MS accuracy   : {ms_acc*100:.2f}% | multiclass_brier={ms_brier:.4f}")
    print(f"OU25 accuracy : {ou25_acc*100:.2f}% | brier={ou25_brier:.4f}")
    print(f"BTTS accuracy : {btts_acc*100:.2f}% | brier={btts_brier:.4f}")

    print("\nConfidence band")
    for line in _summarize_bands("MS", ms_conf, ms_correct):
        print(line)
    for line in _summarize_bands("OU25", ou25_conf, ou25_correct):
        print(line)
    for line in _summarize_bands("BTTS", btts_conf, btts_correct):
        print(line)

    summary = {
        "validation_samples": int(len(valid_df)),
        "metrics": {
            "ms_accuracy": round(ms_acc, 4),
            "ms_brier": round(ms_brier, 4),
            "ou25_accuracy": round(ou25_acc, 4),
            "ou25_brier": round(ou25_brier, 4),
            "btts_accuracy": round(btts_acc, 4),
            "btts_brier": round(btts_brier, 4),
        },
    }
    (MODELS_DIR / "vqwen_backtest_v3_summary.json").write_text(
        json.dumps(summary, indent=2),
        encoding="utf-8",
    )
    print("\nKaydedildi: vqwen_backtest_v3_summary.json")


# Script entry point: run the backtest only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_v3_backtest()