# iddaai-be/ai-engine/scripts/train_htft_vqwen.py

"""
HT/FT (İY/MS) Model Training Script - VQWEN v3

This script trains an XGBoost model for HT/FT (Half Time / Full Time, Turkish: İY/MS) prediction.
9 classes: 1/1, 1/X, 1/2, X/1, X/X, X/2, 2/1, 2/X, 2/2

Features:
- Odds (MS + HT)
- HT/FT Tendency Engine (teams' first-half / second-half performance)
- League-level stats
- Data quality metrics

Output:
- ai-engine/models/xgboost/xgb_ht_ft.json (V20 + V25 compatible)
"""

import os
import sys
import json
import pickle
import psycopg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.calibration import CalibratedClassifierCV

# Add the parent of this script's directory to sys.path so `features` is importable
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from features.htft_tendency_engine import HtftTendencyEngine

# Database connection string: DATABASE_URL wins, otherwise the local default applies.
# NOTE(review): the fallback DSN embeds credentials in source control — consider
# requiring DATABASE_URL instead.
DB_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db',
)
# Drop everything from the first '?' onwards (e.g. ?schema=public), which
# psycopg2 doesn't accept. A URL without '?' is left untouched.
DB_URL = DB_URL.partition('?')[0]

# HT/FT class names. The position of each name is its integer class id:
# half-time result first, then full-time result, each in the order 1, X, 2.
HTFT_LABELS = [f"{ht}/{ft}" for ht in "1X2" for ft in "1X2"]

# Output locations: <parent of this script's directory>/models/xgboost/
MODEL_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'models',
    'xgboost',
)
MODEL_PATH_JSON = os.path.join(MODEL_DIR, 'xgb_ht_ft.json')
MODEL_PATH_PKL = os.path.join(MODEL_DIR, 'xgb_ht_ft.pkl')


def fetch_matches():
    """Fetch completed football matches that have both HT and FT scores.

    Only rows with sport 'football', status 'FT', and non-null half-time
    scores, full-time scores and kick-off timestamp are selected.

    Returns:
        list: One dict-like row (RealDictCursor) per match with the columns
        selected below, ordered by ``mst_utc`` ascending.

    Raises:
        psycopg2.Error: If connecting or querying fails. The cursor and
            connection are still closed in that case.
    """
    from contextlib import closing

    print("📊 Fetching completed football matches...")

    query = """
        SELECT
            m.id,
            m.home_team_id,
            m.away_team_id,
            m.league_id,
            m.sport,
            m.mst_utc,
            m.ht_score_home,
            m.ht_score_away,
            m.score_home,
            m.score_away
        FROM matches m
        WHERE m.sport = 'football'
          AND m.status = 'FT'
          AND m.ht_score_home IS NOT NULL
          AND m.ht_score_away IS NOT NULL
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
          AND m.mst_utc IS NOT NULL
        ORDER BY m.mst_utc ASC
    """

    # closing() is used because a psycopg2 connection's own `with` block only
    # ends the transaction; it does not close the connection.
    with closing(psycopg2.connect(DB_URL)) as conn, \
            closing(conn.cursor(cursor_factory=RealDictCursor)) as cur:
        cur.execute(query)
        matches = cur.fetchall()

    print(f"✅ Fetched {len(matches)} matches")

    return matches


def compute_htft_label(ht_home, ht_away, ft_home, ft_away):
    """
    Encode a match's half-time and full-time results as one class id (0-8).

    Each result is coded 0 = home ahead, 1 = level, 2 = away ahead, and the
    two codes are combined as ``ht_code * 3 + ft_code``.

    Args:
        ht_home: Home goals at half time.
        ht_away: Away goals at half time.
        ft_home: Home goals at full time.
        ft_away: Away goals at full time.

    Returns:
        int: Class id in the range 0-8.
    """
    def result_code(home_goals, away_goals):
        # Same comparison order as the label definition: home, draw, away.
        if home_goals > away_goals:
            return 0
        return 1 if home_goals == away_goals else 2

    ht_code = result_code(ht_home, ht_away)
    ft_code = result_code(ft_home, ft_away)
    return ht_code * 3 + ft_code


def _parse_match_odds(odds_rows):
    """Split one match's odds rows into full-time and half-time 1X2 odds.

    Args:
        odds_rows: Iterable of mappings with ``category_name``,
            ``selection_name`` and ``odd_value`` keys.

    Returns:
        tuple: ``(odds, ht_odds)``. ``odds`` may hold ``ms_h``/``ms_d``/``ms_a``
        and ``ht_odds`` may hold ``ht_ms_h``/``ht_ms_d``/``ht_ms_a``; a key is
        absent when no usable row was found for it.
    """
    # Selection name -> key suffix; the draw is spelled either 'x' or '0'.
    selection_suffix = {'1': 'h', 'x': 'd', '0': 'd', '2': 'a'}
    odds = {}
    ht_odds = {}

    for row in odds_rows:
        cat = row['category_name'].lower()
        # Half-time is checked first so a category matching both substrings
        # is treated as half-time.
        # NOTE(review): substring matching also accepts any other category
        # whose name contains 'maç sonucu' — confirm against the category list.
        if '1.yarı sonucu' in cat:
            target, prefix = ht_odds, 'ht_ms_'
        elif 'maç sonucu' in cat:
            target, prefix = odds, 'ms_'
        else:
            continue

        suffix = selection_suffix.get(row['selection_name'].lower())
        if suffix is None:
            continue

        raw_value = row['odd_value']
        if raw_value is None:
            continue
        val = float(raw_value)
        # Non-positive (or NaN) odds cannot be inverted into an implied
        # probability, so they are treated as missing.
        if not val > 0:
            continue

        target[prefix + suffix] = val

    return odds, ht_odds


def _build_feature_row(odds, ht_odds, htft_feats):
    """Assemble the model feature dict for one match.

    Args:
        odds: Full-time odds with ``ms_h``, ``ms_d`` and ``ms_a`` all present.
        ht_odds: Half-time odds; missing keys fall back to fixed defaults.
        htft_feats: Mapping of tendency features; missing keys fall back to
            fixed defaults.

    Returns:
        dict: Feature name -> value. Key order defines the training column order.
    """
    ms_h, ms_d, ms_a = odds['ms_h'], odds['ms_d'], odds['ms_a']

    return {
        # MS Odds
        'odds_ms_h': ms_h,
        'odds_ms_d': ms_d,
        'odds_ms_a': ms_a,
        'implied_home': 1.0 / ms_h,
        'implied_draw': 1.0 / ms_d,
        'implied_away': 1.0 / ms_a,
        'fav_gap': abs(ms_h - ms_a),

        # HT Odds
        'ht_implied_home': 1.0 / ht_odds.get('ht_ms_h', 3.0),
        'ht_implied_draw': 1.0 / ht_odds.get('ht_ms_d', 2.1),
        'ht_implied_away': 1.0 / ht_odds.get('ht_ms_a', 3.5),

        # HT/FT Tendencies (from engine)
        'htft_home_ht_scoring_rate': htft_feats.get('home_ht_scoring_rate', 0.5),
        'htft_home_ht_concede_rate': htft_feats.get('home_ht_concede_rate', 0.5),
        'htft_home_ht_win_rate': htft_feats.get('home_ht_win_rate', 0.33),
        'htft_home_comeback_rate': htft_feats.get('home_comeback_rate', 0.0),
        'htft_home_first_half_goal_pct': htft_feats.get('home_first_half_goal_pct', 0.5),
        'htft_home_second_half_surge': htft_feats.get('home_second_half_surge', 1.0),

        'htft_away_ht_scoring_rate': htft_feats.get('away_ht_scoring_rate', 0.5),
        'htft_away_ht_concede_rate': htft_feats.get('away_ht_concede_rate', 0.5),
        'htft_away_ht_win_rate': htft_feats.get('away_ht_win_rate', 0.33),
        'htft_away_comeback_rate': htft_feats.get('away_comeback_rate', 0.0),
        'htft_away_first_half_goal_pct': htft_feats.get('away_first_half_goal_pct', 0.5),
        'htft_away_second_half_surge': htft_feats.get('away_second_half_surge', 1.0),

        # League-level
        'htft_league_avg_ht_goals': htft_feats.get('league_avg_ht_goals', 1.0),
        'htft_league_reversal_rate': htft_feats.get('league_reversal_rate', 0.05),
        'htft_league_first_half_pct': htft_feats.get('league_first_half_pct', 0.44),

        # Data quality
        'htft_home_sample_size': htft_feats.get('home_sample_size', 0.0),
        'htft_away_sample_size': htft_feats.get('away_sample_size', 0.0),
    }


def extract_features_and_labels(matches):
    """Build feature rows and HT/FT labels for the given matches.

    For each match the odds are loaded from the database, the tendency
    features are requested from ``HtftTendencyEngine`` and the label is
    derived from the half-time and full-time scores. Matches without a
    complete, usable set of full-time 1X2 odds are skipped.

    Args:
        matches: Sequence of match rows exposing ``id``, ``home_team_id``,
            ``away_team_id``, ``league_id``, ``mst_utc``, ``ht_score_home``,
            ``ht_score_away``, ``score_home`` and ``score_away``.

    Returns:
        tuple: ``(features_list, labels, match_ids)`` — three parallel lists
        in the same order as the retained matches.

    Raises:
        psycopg2.Error: If connecting or querying fails. The cursor and
            connection are still closed in that case.
    """
    from contextlib import closing

    print("\n🔧 Extracting features...")

    odds_query = """
        SELECT oc.name as category_name, os.name as selection_name, os.odd_value
        FROM odd_categories oc
        JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
        WHERE oc.match_id = %s
    """
    required_odds = ('ms_h', 'ms_d', 'ms_a')

    features_list = []
    labels = []
    match_ids = []
    engine_failures = 0
    first_engine_error = None

    # closing() releases the cursor and connection even if a query or the
    # feature engine raises part-way through the loop.
    with closing(psycopg2.connect(DB_URL)) as conn, \
            closing(conn.cursor(cursor_factory=RealDictCursor)) as cur:
        htft_engine = HtftTendencyEngine()
        total = len(matches)

        for idx, match in enumerate(matches):
            if idx % 1000 == 0:
                print(f"   Processing {idx}/{total}...")

            mid = match['id']
            hid = str(match['home_team_id'])
            aid = str(match['away_team_id'])
            lid = str(match['league_id']) if match['league_id'] else None
            mst = int(match['mst_utc'])

            # Fetch odds (MS and HT)
            cur.execute(odds_query, (mid,))
            odds, ht_odds = _parse_match_odds(cur.fetchall())

            # Skip if the full-time 1X2 odds are incomplete
            if any(key not in odds for key in required_odds):
                continue

            label = compute_htft_label(
                match['ht_score_home'],
                match['ht_score_away'],
                match['score_home'],
                match['score_away'],
            )

            # Best effort: an engine failure falls back to the engine's
            # defaults, but is counted so it does not pass unnoticed.
            try:
                htft_feats = htft_engine.get_features(hid, aid, lid, mst)
            except Exception as exc:
                engine_failures += 1
                if first_engine_error is None:
                    first_engine_error = exc
                htft_feats = htft_engine._empty_features()

            features_list.append(_build_feature_row(odds, ht_odds, htft_feats))
            labels.append(label)
            match_ids.append(mid)

    if engine_failures:
        print(
            f"⚠️  Tendency engine failed for {engine_failures} matches; "
            f"default features used (first error: {first_engine_error!r})"
        )

    print(f"✅ Extracted {len(features_list)} samples with features")

    return features_list, labels, match_ids


def train_model(features_list, labels):
    """Train the HT/FT XGBoost classifier with balanced class weights.

    The first 80% of the samples (in the order given) form the training set
    and the remaining 20% the test set. Class imbalance is handled with
    per-sample weights derived from 'balanced' class weights. No probability
    calibration is applied.

    Args:
        features_list: List of feature dicts, one per sample.
        labels: Integer class ids aligned with ``features_list``; each id is
            an index into ``HTFT_LABELS``.

    Returns:
        tuple: ``(model, feature_names)`` — the fitted ``xgb.XGBClassifier``
        and the list of training column names.
    """
    from sklearn.utils.class_weight import compute_class_weight

    print("\n🎯 Training HT/FT XGBoost model...")

    # Convert to DataFrame
    X = pd.DataFrame(features_list)
    y = np.array(labels)

    n_classes = len(HTFT_LABELS)
    class_ids = np.arange(n_classes)

    # Print class distribution
    print("\n📊 Class distribution:")
    for i, label_name in enumerate(HTFT_LABELS):
        count = np.sum(y == i)
        print(f"   {label_name}: {count} ({count/len(y)*100:.1f}%)")

    # Positional split (80/20); time-based when the samples arrive in
    # chronological order.
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    print(f"\n📈 Train size: {len(X_train)}, Test size: {len(X_test)}")

    # Compute class weights (handle imbalance) and expand them per sample
    class_weights = compute_class_weight('balanced', classes=class_ids, y=y_train)
    sample_weights = class_weights[y_train]

    print(f"\n⚖️  Class weights: {dict(zip(HTFT_LABELS, [round(w, 2) for w in class_weights]))}")

    # Train XGBoost
    model = xgb.XGBClassifier(
        n_estimators=400,
        max_depth=7,
        learning_rate=0.05,
        objective='multi:softprob',
        num_class=n_classes,
        eval_metric='mlogloss',
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=20,  # Passed at init for newer XGBoost versions
    )

    # NOTE(review): early stopping monitors the same split that is evaluated
    # below, so the reported metrics are optimistic. Consider a separate
    # validation slice taken from the training portion.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    # Evaluate
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n✅ Test Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # Explicit labels keep the report and matrix aligned with HTFT_LABELS even
    # when a rare class appears in neither the test labels nor the predictions.
    print("\n📊 Classification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=HTFT_LABELS,
        zero_division=0,
    ))

    # Confusion matrix (always n_classes x n_classes)
    print("\n🔲 Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print(cm)

    # Feature importance
    print("\n🔝 Top 15 Features:")
    importance = model.feature_importances_
    feat_importance = sorted(zip(X.columns, importance), key=lambda x: x[1], reverse=True)[:15]
    for feat, imp in feat_importance:
        print(f"   {feat}: {imp:.4f}")

    return model, X.columns.tolist()


def save_model(model, feature_names):
    """Persist the trained model and its feature list under MODEL_DIR.

    Three files are written: the booster as JSON (MODEL_PATH_JSON), the
    pickled model object (MODEL_PATH_PKL) and the feature names as
    ``htft_features.json``.

    Args:
        model: Fitted model exposing ``get_booster()``.
        feature_names: JSON-serialisable list of feature names.
    """
    print("\n💾 Saving model...")

    # Make sure the output directory exists before writing anything
    os.makedirs(MODEL_DIR, exist_ok=True)
    features_path = os.path.join(MODEL_DIR, 'htft_features.json')

    # Booster as JSON (for V25 + V20)
    booster = model.get_booster()
    booster.save_model(MODEL_PATH_JSON)
    print(f"✅ Saved JSON model: {MODEL_PATH_JSON}")

    # Whole model object as pickle (for the V20 sklearn wrapper)
    with open(MODEL_PATH_PKL, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)
    print(f"✅ Saved PKL model: {MODEL_PATH_PKL}")

    # Feature names as JSON
    with open(features_path, 'w') as features_file:
        json.dump(feature_names, features_file, indent=2)
    print(f"✅ Saved features: {features_path}")


def test_model_loading():
    """Reload both saved model files and print each one's feature count.

    Loads MODEL_PATH_JSON into a raw ``xgb.Booster`` (the V25 path) and
    unpickles MODEL_PATH_PKL (the V20 path). Any loading error propagates.
    """
    import xgboost as xgb

    print("\n🧪 Testing model loading...")

    # V25 path: raw booster from the JSON file
    booster = xgb.Booster()
    booster.load_model(MODEL_PATH_JSON)
    booster_feature_count = len(booster.feature_names)
    print(f"✅ V25 booster loaded from JSON, features: {booster_feature_count}")

    # V20 path: model object from the pickle file
    with open(MODEL_PATH_PKL, 'rb') as pkl_file:
        model_pkl = pickle.load(pkl_file)
    wrapper_feature_count = len(model_pkl.feature_names_in_)
    print(f"✅ V20 model loaded from PKL, features: {wrapper_feature_count}")

    print("\n✅ All model loading tests passed!")


def main():
    """Run the pipeline: fetch, extract features, train, save, verify loading.

    Stops early with a message when no matches are fetched or no feature
    rows can be extracted.
    """
    banner = "=" * 80

    print(banner)
    print("🚀 HT/FT (İY/MS) MODEL TRAINING - VQWEN v3")
    print(banner)

    # 1. Fetch matches
    matches = fetch_matches()
    if not matches:
        print("❌ No matches found")
        return

    # 2. Extract features and labels (the match ids are not needed here)
    features_list, labels, _match_ids = extract_features_and_labels(matches)
    if not features_list:
        print("❌ No features extracted")
        return

    # 3. Train model
    model, feature_names = train_model(features_list, labels)

    # 4. Save model
    save_model(model, feature_names)

    # 5. Test loading
    test_model_loading()

    print("\n" + banner)
    print("🎉 TRAINING COMPLETE")
    print(banner)
    print("\n📊 Model files:")
    print(f"   JSON (V25+V20): {MODEL_PATH_JSON}")
    print(f"   PKL (V20): {MODEL_PATH_PKL}")
    print(f"   Features: {MODEL_DIR}/htft_features.json")
    print(f"\n📈 Total samples: {len(features_list)}")
    print(f"🎯 Classes: {len(HTFT_LABELS)}")


if __name__ == '__main__':
    main()