# iddaai-be/ai-engine/scripts/train_htft_with_tendencies.py

"""
HT/FT Model Training with New Features + Backtest
=====================================================
Extracts training data with the new HT/FT tendency features,
trains a new XGBoost model, and compares it against a baseline model
trained on the same data without those features.

Usage:
    python ai-engine/scripts/train_htft_with_tendencies.py
"""

import os
import sys
import time
import json
import pickle

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import numpy as np
import pandas as pd
from collections import defaultdict
from tabulate import tabulate

import psycopg2
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from data.db import get_clean_dsn
from features.htft_tendency_engine import HtftTendencyEngine

# Absolute path of the ai-engine directory (the parent of this script's directory).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# top_leagues.json is looked up one level above the ai-engine directory.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
# Data directory under ai-engine; it is created at import time if missing.
OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# HT/FT outcome names (half-time result / full-time result) indexed by class id;
# the index equals ht * 3 + ft as produced by compute_labels().
HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]


def get_conn():
    """Open a new psycopg2 connection using the DSN from get_clean_dsn()."""
    return psycopg2.connect(get_clean_dsn())


def load_top_leagues():
    """Load the set of top-league IDs from ``TOP_LEAGUES_PATH``.

    The parsed JSON document is iterated entry by entry. Dict entries
    contribute their ``id`` (or, failing that, ``league_id``) value converted
    to ``str``; string entries are used as-is; entries of any other type, and
    dict entries without a truthy ID, are ignored.

    Returns:
        set[str] | None: The league IDs, or ``None`` when the file cannot be
        read or processed. Callers treat ``None`` as "use all leagues".
    """
    try:
        # JSON text is UTF-8 by specification; decoding with the platform's
        # locale default can fail on non-ASCII content and would silently
        # push us into the "use all leagues" fallback below.
        with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        ids = set()
        for entry in data:
            if isinstance(entry, dict):
                lid = entry.get("id") or entry.get("league_id")
                if lid:
                    ids.add(str(lid))
            elif isinstance(entry, str):
                ids.add(entry)
        print(f"✅ Loaded {len(ids)} top leagues")
        return ids
    except Exception as e:
        # Deliberately best-effort: a missing or malformed file must not abort
        # the run, it only disables the league filter.
        print(f"⚠️  Could not load top_leagues.json: {e}. Using all leagues.")
        return None


def load_matches_with_odds(conn, top_league_ids=None):
    """Load finished football matches that have both HT and FT scores.

    Despite its name, this function does not read any odds; it only selects
    rows from ``matches``. Odds are loaded by ``load_odds_for_matches``.

    Args:
        conn: Open database connection (created by ``get_conn``).
        top_league_ids: Optional collection of league IDs. When non-empty,
            only matches from these leagues are returned; when ``None`` or
            empty, no league filter is applied.

    Returns:
        pandas.DataFrame: One row per match with the columns ``id``,
        ``home_team_id``, ``away_team_id``, ``league_id``, ``score_home``,
        ``score_away``, ``ht_score_home``, ``ht_score_away`` and ``mst_utc``,
        ordered by ``mst_utc`` ascending.
    """
    query = """
    SELECT
        m.id,
        m.home_team_id,
        m.away_team_id,
        m.league_id,
        m.score_home,
        m.score_away,
        m.ht_score_home,
        m.ht_score_away,
        m.mst_utc
    FROM matches m
    WHERE m.sport = 'football'
      AND m.status = 'FT'
      AND m.score_home IS NOT NULL
      AND m.score_away IS NOT NULL
      AND m.ht_score_home IS NOT NULL
      AND m.ht_score_away IS NOT NULL
      AND m.home_team_id IS NOT NULL
      AND m.away_team_id IS NOT NULL
    """

    params = []
    if top_league_ids:
        # Materialise once so the placeholder count and the bound values
        # are guaranteed to come from the same sequence.
        params = list(top_league_ids)
        placeholders = ",".join(["%s"] * len(params))
        query += f" AND m.league_id IN ({placeholders})"

    query += " ORDER BY m.mst_utc ASC"

    # The context manager closes the cursor even when execute/fetchall raises.
    with conn.cursor() as cur:
        cur.execute(query, params)
        rows = cur.fetchall()

    cols = ["id", "home_team_id", "away_team_id", "league_id",
            "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"]
    return pd.DataFrame(rows, columns=cols)


# Odds categories that are turned into features -> key prefix in the odds dict.
_ODDS_CATEGORY_PREFIX = {
    "Maç Sonucu": "ms",
    "1. Yarı Sonucu": "ht_ms",
}

# Selection names (numeric and Turkish spellings) -> key suffix in the odds dict.
_ODDS_SELECTION_SUFFIX = {
    "1": "h",
    "Ev Sahibi": "h",
    "X": "d",
    "Berabere": "d",
    "2": "a",
    "Deplasman": "a",
}


def _parse_odd_value(raw):
    """Convert a raw odd value to float; empty or unparseable values become 0.0."""
    try:
        return float(raw) if raw else 0.0
    except (ValueError, TypeError):
        return 0.0


def load_odds_for_matches(conn, match_ids):
    """Load full-time (MS) and half-time (HT) 1X2 odds for the given matches.

    Matches are queried in batches of 5000 IDs to keep the ``IN`` list bounded.

    Args:
        conn: Open database connection (created by ``get_conn``).
        match_ids: Collection of match IDs; an empty collection returns ``{}``
            without touching the database.

    Returns:
        dict: ``{match_id: odds}`` where ``odds`` may contain the keys
        ``ms_h``, ``ms_d``, ``ms_a`` (full-time home/draw/away) and
        ``ht_ms_h``, ``ht_ms_d``, ``ht_ms_a`` (half-time). A match gets an
        entry as soon as any row is returned for it, so the inner dict can be
        empty when none of its rows carried a usable 1X2 odd.
    """
    if not match_ids:
        return {}

    odds_map = {}
    batch_size = 5000
    match_list = list(match_ids)

    for start in range(0, len(match_list), batch_size):
        batch = match_list[start:start + batch_size]
        placeholders = ",".join(["%s"] * len(batch))

        # The context manager closes the cursor even when the query fails.
        with conn.cursor() as cur:
            # NOTE: the query also fetches three categories that are not
            # converted into keys below; their rows still register the match
            # in odds_map, so the filter is kept as-is.
            cur.execute(f"""
                SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value
                FROM odd_categories oc
                JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                WHERE oc.match_id IN ({placeholders})
                  AND oc.name IN (
                    'Maç Sonucu',
                    '1. Yarı Sonucu',
                    '2,5 Alt/Üst',
                    'Karşılıklı Gol',
                    'Çifte Şans'
                  )
            """, batch)
            rows = cur.fetchall()

        for mid, cat_name, sel_name, odd_value in rows:
            match_odds = odds_map.setdefault(mid, {})

            val = _parse_odd_value(odd_value)
            # "not val > 0" (rather than "val <= 0") also rejects NaN, which
            # would otherwise propagate into the implied probabilities.
            if not val > 0:
                continue

            prefix = _ODDS_CATEGORY_PREFIX.get(cat_name)
            suffix = _ODDS_SELECTION_SUFFIX.get(sel_name)
            if prefix is not None and suffix is not None:
                match_odds[f"{prefix}_{suffix}"] = val

    return odds_map


def compute_labels(df):
    """Compute the HT/FT class label (0-8) for every row of ``df``.

    Each half-time and full-time result is encoded as 0 (home ahead),
    1 (level) or 2 (away ahead); the label is ``ht * 3 + ft``.

    Args:
        df: DataFrame with the columns ``ht_score_home``, ``ht_score_away``,
            ``score_home`` and ``score_away``.

    Returns:
        list[int]: One label per row, in row order.
    """
    def outcome(home, away):
        # Column-wise comparison instead of a per-row Python loop; anything
        # that is neither "home ahead" nor "away ahead" counts as level.
        return np.where(home > away, 0, np.where(home < away, 2, 1))

    ht = outcome(df["ht_score_home"], df["ht_score_away"])
    ft = outcome(df["score_home"], df["score_away"])
    return (ht * 3 + ft).tolist()


# Neutral HT/FT tendency values used when the tendency engine fails for a match.
_NEUTRAL_HTFT_FEATURES = {
    "htft_home_ht_scoring_rate": 0.5,
    "htft_home_ht_concede_rate": 0.5,
    "htft_home_ht_win_rate": 0.33,
    "htft_home_comeback_rate": 0.0,
    "htft_home_first_half_goal_pct": 0.5,
    "htft_home_second_half_surge": 1.0,
    "htft_away_ht_scoring_rate": 0.5,
    "htft_away_ht_concede_rate": 0.5,
    "htft_away_ht_win_rate": 0.33,
    "htft_away_comeback_rate": 0.0,
    "htft_away_first_half_goal_pct": 0.5,
    "htft_away_second_half_surge": 1.0,
    "htft_league_avg_ht_goals": 1.0,
    "htft_league_reversal_rate": 0.05,
    "htft_league_first_half_pct": 0.44,
    "htft_home_sample_size": 0.0,
    "htft_away_sample_size": 0.0,
}


def _vig_free_probs(odd_h, odd_d, odd_a):
    """Normalise the inverse odds of a three-way market so they sum to 1."""
    inv_h, inv_d, inv_a = 1 / odd_h, 1 / odd_d, 1 / odd_a
    total = inv_h + inv_d + inv_a
    return inv_h / total, inv_d / total, inv_a / total


def extract_features(df, conn, odds_map, htft_engine):
    """Build the feature dict for every match in ``df``.

    Args:
        df: Matches with the columns ``id``, ``home_team_id``,
            ``away_team_id``, ``league_id`` and ``mst_utc``.
        conn: Unused; kept so existing callers keep working.
        odds_map: ``{match_id: odds}`` as returned by ``load_odds_for_matches``.
        htft_engine: Object whose ``get_features(home_id, away_id, league_id,
            mst_utc)`` returns a dict of tendency features.

    Returns:
        list: One entry per row of ``df``, in row order. An entry is ``None``
        when the match lacks any of the three full-time odds; otherwise it is
        a dict of odds features plus the tendency features. When the engine
        raises for a match, neutral tendency values are used instead and the
        failure is counted and reported.
    """
    print(f"\n⏳ Extracting features for {len(df):,} matches...")
    start_time = time.time()

    all_features = []
    processed = 0
    skipped = 0
    fallbacks = 0
    first_error = None

    for _, row in df.iterrows():
        mid = row["id"]
        hid = row["home_team_id"]
        aid = row["away_team_id"]
        lid = row["league_id"]
        mst = row["mst_utc"]

        odds = odds_map.get(mid, {})
        ms_h = odds.get("ms_h", 0.0)
        ms_d = odds.get("ms_d", 0.0)
        ms_a = odds.get("ms_a", 0.0)

        # Skip matches without a complete full-time market (too noisy).
        # A None placeholder keeps the result aligned with df's rows.
        if ms_h <= 0 or ms_d <= 0 or ms_a <= 0:
            skipped += 1
            all_features.append(None)
            continue

        implied_home, implied_draw, implied_away = _vig_free_probs(ms_h, ms_d, ms_a)

        ht_ms_h = odds.get("ht_ms_h", 0.0)
        ht_ms_d = odds.get("ht_ms_d", 0.0)
        ht_ms_a = odds.get("ht_ms_a", 0.0)

        if ht_ms_h > 0 and ht_ms_d > 0 and ht_ms_a > 0:
            ht_implied_home, ht_implied_draw, ht_implied_away = _vig_free_probs(
                ht_ms_h, ht_ms_d, ht_ms_a
            )
        else:
            # No complete half-time market: fall back to a flat prior.
            ht_implied_home = ht_implied_draw = ht_implied_away = 0.33

        feat = {
            # Odds features (core)
            "odds_ms_h": ms_h,
            "odds_ms_d": ms_d,
            "odds_ms_a": ms_a,
            "implied_home": implied_home,
            "implied_draw": implied_draw,
            "implied_away": implied_away,
            "fav_gap": abs(implied_home - implied_away),

            # HT odds
            "ht_implied_home": ht_implied_home,
            "ht_implied_draw": ht_implied_draw,
            "ht_implied_away": ht_implied_away,
        }

        # HT/FT tendency features
        try:
            htft_feats = htft_engine.get_features(hid, aid, lid, mst)
            feat.update(htft_feats)
        except Exception as e:
            # Best-effort: one failing lookup must not abort the extraction,
            # but the failure is recorded so it does not go unnoticed.
            fallbacks += 1
            if first_error is None:
                first_error = e
            feat.update(_NEUTRAL_HTFT_FEATURES)

        all_features.append(feat)
        processed += 1

        if processed % 2000 == 0:
            elapsed = time.time() - start_time
            # A coarse clock can report zero elapsed time; avoid dividing by it.
            rate = processed / elapsed if elapsed > 0 else 0
            remaining = (len(df) - processed - skipped) / rate if rate > 0 else 0
            print(f"   Processed: {processed:,} / {len(df):,} "
                  f"(skipped: {skipped:,}) "
                  f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]")

    elapsed = time.time() - start_time
    print(f"   ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s")
    if fallbacks:
        print(f"   ⚠️  HT/FT tendency lookup failed for {fallbacks:,} matches "
              f"(neutral values used); first error: {first_error!r}")

    return all_features


def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""):
    """Fit a 9-class XGBoost classifier and print an evaluation report.

    The report contains the overall test accuracy, a per-class table (only
    for classes present in ``y_test``) and the 15 most important features.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels; also passed to ``fit``
            as the evaluation set.
        feature_names: Column names of ``X_train`` / ``X_test``, in order.
        label: Human-readable model name used in the printed output.

    Returns:
        tuple: ``(model, accuracy)`` with the fitted classifier and its
        overall accuracy on the test split.
    """
    hyperparams = dict(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_class=9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        random_state=42,
        verbosity=0,
        n_jobs=-1,
    )
    model = xgb.XGBClassifier(**hyperparams)

    print(f"\n🏋️  Training {label} model...")
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # Overall score on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n📊 {label} Results:")
    print(f"   Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # One table row per class that actually occurs in the test labels
    print("\n   Per-class breakdown:")
    table = []
    for class_id, class_name in enumerate(HTFT_LABELS):
        in_class = y_test == class_id
        n_samples = in_class.sum()
        if not n_samples > 0:
            continue
        hit_rate = accuracy_score(y_test[in_class], y_pred[in_class])
        table.append([class_name, n_samples, f"{hit_rate*100:.1f}%"])

    print(tabulate(table, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty"))

    # Features ranked by importance, highest first
    ranked = sorted(
        zip(feature_names, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\n   Top 15 Features:")
    for fname, imp in ranked[:15]:
        bar = "█" * int(imp * 100)
        print(f"   {fname:40s} {imp:.4f} {bar}")

    return model, accuracy


def main():
    """Run the full pipeline: load data, build features, train, compare, save.

    Steps:
      1. Load finished matches (optionally restricted to the top leagues)
         and their odds.
      2. Compute the HT/FT class label for every match.
      3. Extract features and drop matches for which none could be built.
      4. Split by position (first 80% train, last 20% test; the matches are
         loaded ordered by ``mst_utc``) and train two models: one on all
         features and a baseline without the ``htft_*`` columns.
      5. Print the accuracy comparison and pickle the full-feature model.

    The database connection is closed on every exit path, including errors
    and the early returns taken when there is not enough data to train on.
    """
    print("🚀 HT/FT Model Training with New Tendency Features")
    print("=" * 70)

    conn = get_conn()
    try:
        top_league_ids = load_top_leagues()

        # Load matches
        print("\n📊 Loading matches...")
        df = load_matches_with_odds(conn, top_league_ids)
        print(f"   ✅ {len(df):,} matches loaded")
        if df.empty:
            # Nothing to train on; also avoids a division by zero in the
            # label distribution summary below.
            print("\n⚠️  No matches found. Nothing to train.")
            return

        # Load odds
        print("\n📊 Loading odds...")
        match_ids = set(df["id"].tolist())
        odds_map = load_odds_for_matches(conn, match_ids)
        print(f"   ✅ Odds loaded for {len(odds_map):,} matches")

        # Compute labels
        print("\n📊 Computing HT/FT labels...")
        df["label"] = compute_labels(df)
        label_dist = df["label"].value_counts().sort_index()
        for class_id, class_name in enumerate(HTFT_LABELS):
            count = label_dist.get(class_id, 0)
            print(f"   {class_name}: {count:,} ({count/len(df)*100:.1f}%)")

        # Initialize HT/FT tendency engine
        htft_engine = HtftTendencyEngine()

        # Extract features
        all_features = extract_features(df, conn, odds_map, htft_engine)

        # Filter: keep only matches with features
        valid_mask = [f is not None for f in all_features]
        df_valid = df[valid_mask].reset_index(drop=True)
        features_valid = [f for f in all_features if f is not None]

        print(f"\n📊 Valid matches with features: {len(df_valid):,}")
        if not features_valid:
            print("\n⚠️  No matches with usable odds. Nothing to train.")
            return

        # Convert to arrays; the column order is taken from the first feature dict
        feature_names = list(features_valid[0].keys())
        X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32)
        y = np.array(df_valid["label"].tolist(), dtype=np.int32)

        # Split: time-based (last 20% as test)
        split_idx = int(len(X) * 0.8)
        if split_idx == 0:
            # Too few rows to leave anything in the training split.
            print("\n⚠️  Not enough samples for a train/test split. Nothing to train.")
            return
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        print(f"   Train: {len(X_train):,}, Test: {len(X_test):,}")

        # ─── Train WITH new features ─────────────────────────────────────
        model_new, acc_new = train_and_evaluate(
            X_train, y_train, X_test, y_test, feature_names,
            label="NEW (with HT/FT tendencies)"
        )

        # ─── Train WITHOUT new features (baseline) ───────────────────────
        # Remove htft_ features for comparison
        baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")]
        baseline_names = [feature_names[i] for i in baseline_cols]
        X_train_base = X_train[:, baseline_cols]
        X_test_base = X_test[:, baseline_cols]

        _, acc_base = train_and_evaluate(
            X_train_base, y_train, X_test_base, y_test, baseline_names,
            label="BASELINE (without HT/FT tendencies)"
        )

        # ─── Comparison ──────────────────────────────────────────────────
        print("\n" + "=" * 70)
        print("📈 COMPARISON")
        print("=" * 70)
        print(f"   Baseline accuracy:  {acc_base*100:.2f}%")
        print(f"   New accuracy:       {acc_new*100:.2f}%")
        delta = (acc_new - acc_base) * 100
        if delta > 0:
            direction = "📈 IMPROVEMENT"
        elif delta < 0:
            direction = "📉 REGRESSION"
        else:
            # Identical accuracy is neither an improvement nor a regression.
            direction = "➖ NO CHANGE"
        print(f"   Delta:              {delta:+.2f}% {direction}")

        # Save new model; create the target directory so a fresh checkout
        # does not fail only after training has finished.
        model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl")
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            pickle.dump(model_new, f)
        print(f"\n💾 New model saved: {model_path}")
    finally:
        conn.close()

    print("\n✅ Done!")


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()