""" HT/FT Model Training with New Features + Backtest ===================================================== Extracts training data with the new HT/FT tendency features, trains a new XGBoost model, and compares it against the old model. Usage: python ai-engine/scripts/train_htft_with_tendencies.py """ import os import sys import time import json import pickle sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import numpy as np import pandas as pd from collections import defaultdict from tabulate import tabulate import psycopg2 import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report, confusion_matrix from data.db import get_clean_dsn from features.htft_tendency_engine import HtftTendencyEngine AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json") OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data") os.makedirs(OUTPUT_DIR, exist_ok=True) HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"] def get_conn(): dsn = get_clean_dsn() return psycopg2.connect(dsn) def load_top_leagues(): """Load top league IDs from top_leagues.json.""" try: with open(TOP_LEAGUES_PATH, "r") as f: data = json.load(f) ids = set() for entry in data: if isinstance(entry, dict): lid = entry.get("id") or entry.get("league_id") if lid: ids.add(str(lid)) elif isinstance(entry, str): ids.add(entry) print(f"✅ Loaded {len(ids)} top leagues") return ids except Exception as e: print(f"⚠️ Could not load top_leagues.json: {e}. Using all leagues.") return None def load_matches_with_odds(conn, top_league_ids=None): """Load FT football matches with HT scores and odds.""" query = """ SELECT m.id, m.home_team_id, m.away_team_id, m.league_id, m.score_home, m.score_away, m.ht_score_home, m.ht_score_away, m.mst_utc FROM matches m WHERE m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL AND m.score_away IS NOT NULL AND m.ht_score_home IS NOT NULL AND m.ht_score_away IS NOT NULL AND m.home_team_id IS NOT NULL AND m.away_team_id IS NOT NULL """ if top_league_ids: placeholders = ",".join(["%s"] * len(top_league_ids)) query += f" AND m.league_id IN ({placeholders})" query += " ORDER BY m.mst_utc ASC" cur = conn.cursor() params = list(top_league_ids) if top_league_ids else [] cur.execute(query, params) rows = cur.fetchall() cur.close() cols = ["id", "home_team_id", "away_team_id", "league_id", "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"] return pd.DataFrame(rows, columns=cols) def load_odds_for_matches(conn, match_ids): """Load MS + HT odds for given match IDs.""" if not match_ids: return {} # Load in batches odds_map = {} batch_size = 5000 match_list = list(match_ids) for i in range(0, len(match_list), batch_size): batch = match_list[i:i + batch_size] placeholders = ",".join(["%s"] * len(batch)) cur = conn.cursor() cur.execute(f""" SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id IN ({placeholders}) AND oc.name IN ( 'Maç Sonucu', '1. Yarı Sonucu', '2,5 Alt/Üst', 'Karşılıklı Gol', 'Çifte Şans' ) """, batch) rows = cur.fetchall() cur.close() for mid, cat_name, sel_name, odd_value in rows: if mid not in odds_map: odds_map[mid] = {} om = odds_map[mid] try: val = float(odd_value) if odd_value else 0.0 except (ValueError, TypeError): val = 0.0 if val <= 0: continue # Exact match for MS if cat_name == "Maç Sonucu": if sel_name in ("1", "Ev Sahibi"): om["ms_h"] = val elif sel_name in ("X", "Berabere"): om["ms_d"] = val elif sel_name in ("2", "Deplasman"): om["ms_a"] = val elif cat_name == "1. Yarı Sonucu": if sel_name in ("1", "Ev Sahibi"): om["ht_ms_h"] = val elif sel_name in ("X", "Berabere"): om["ht_ms_d"] = val elif sel_name in ("2", "Deplasman"): om["ht_ms_a"] = val return odds_map def compute_labels(df): """Compute HT/FT label (0-8).""" labels = [] for _, row in df.iterrows(): ht = 0 if row["ht_score_home"] > row["ht_score_away"] else (2 if row["ht_score_home"] < row["ht_score_away"] else 1) ft = 0 if row["score_home"] > row["score_away"] else (2 if row["score_home"] < row["score_away"] else 1) labels.append(ht * 3 + ft) return labels def extract_features(df, conn, odds_map, htft_engine): """Extract all features for each match.""" print(f"\n⏳ Extracting features for {len(df):,} matches...") start_time = time.time() all_features = [] processed = 0 skipped = 0 for idx, row in df.iterrows(): mid = row["id"] hid = row["home_team_id"] aid = row["away_team_id"] lid = row["league_id"] mst = row["mst_utc"] # Odds features odds = odds_map.get(mid, {}) ms_h = odds.get("ms_h", 0.0) ms_d = odds.get("ms_d", 0.0) ms_a = odds.get("ms_a", 0.0) # Skip matches without any odds (too noisy) if ms_h <= 0 or ms_d <= 0 or ms_a <= 0: skipped += 1 all_features.append(None) continue # Implied probs (vig-free) raw_sum = 1/ms_h + 1/ms_d + 1/ms_a implied_home = (1/ms_h) / raw_sum implied_draw = (1/ms_d) / raw_sum implied_away = (1/ms_a) / raw_sum ht_ms_h = odds.get("ht_ms_h", 0.0) ht_ms_d = odds.get("ht_ms_d", 0.0) ht_ms_a = odds.get("ht_ms_a", 0.0) # HT implied probs if ht_ms_h > 0 and ht_ms_d > 0 and ht_ms_a > 0: ht_raw = 1/ht_ms_h + 1/ht_ms_d + 1/ht_ms_a ht_implied_home = (1/ht_ms_h) / ht_raw ht_implied_draw = (1/ht_ms_d) / ht_raw ht_implied_away = (1/ht_ms_a) / ht_raw else: ht_implied_home = ht_implied_draw = ht_implied_away = 0.33 feat = { # Odds features (core) "odds_ms_h": ms_h, "odds_ms_d": ms_d, "odds_ms_a": ms_a, "implied_home": implied_home, "implied_draw": implied_draw, "implied_away": implied_away, "fav_gap": abs(implied_home - implied_away), # HT odds "ht_implied_home": ht_implied_home, "ht_implied_draw": ht_implied_draw, "ht_implied_away": ht_implied_away, } # HT/FT tendency features (NEW!) try: htft_feats = htft_engine.get_features(hid, aid, lid, mst) feat.update(htft_feats) except Exception as e: # Fallback to neutral values feat.update({ "htft_home_ht_scoring_rate": 0.5, "htft_home_ht_concede_rate": 0.5, "htft_home_ht_win_rate": 0.33, "htft_home_comeback_rate": 0.0, "htft_home_first_half_goal_pct": 0.5, "htft_home_second_half_surge": 1.0, "htft_away_ht_scoring_rate": 0.5, "htft_away_ht_concede_rate": 0.5, "htft_away_ht_win_rate": 0.33, "htft_away_comeback_rate": 0.0, "htft_away_first_half_goal_pct": 0.5, "htft_away_second_half_surge": 1.0, "htft_league_avg_ht_goals": 1.0, "htft_league_reversal_rate": 0.05, "htft_league_first_half_pct": 0.44, "htft_home_sample_size": 0.0, "htft_away_sample_size": 0.0, }) all_features.append(feat) processed += 1 if processed % 2000 == 0: elapsed = time.time() - start_time rate = processed / elapsed remaining = (len(df) - processed - skipped) / rate if rate > 0 else 0 print(f" Processed: {processed:,} / {len(df):,} " f"(skipped: {skipped:,}) " f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]") elapsed = time.time() - start_time print(f" ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s") return all_features def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""): """Train XGBoost model and evaluate.""" model = xgb.XGBClassifier( n_estimators=300, max_depth=6, learning_rate=0.05, num_class=9, objective="multi:softprob", eval_metric="mlogloss", subsample=0.8, colsample_bytree=0.8, min_child_weight=5, random_state=42, verbosity=0, n_jobs=-1, ) print(f"\n🏋️ Training {label} model...") model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) # Predictions y_pred = model.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print(f"\n📊 {label} Results:") print(f" Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)") # Per-class accuracy print(f"\n Per-class breakdown:") rows = [] for i, label_name in enumerate(HTFT_LABELS): mask = y_test == i if mask.sum() > 0: class_acc = accuracy_score(y_test[mask], y_pred[mask]) rows.append([label_name, mask.sum(), f"{class_acc*100:.1f}%"]) print(tabulate(rows, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty")) # Feature importance importances = model.feature_importances_ feat_imp = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True) print(f"\n Top 15 Features:") for fname, imp in feat_imp[:15]: bar = "█" * int(imp * 100) print(f" {fname:40s} {imp:.4f} {bar}") return model, accuracy def main(): print("🚀 HT/FT Model Training with New Tendency Features") print("=" * 70) conn = get_conn() top_league_ids = load_top_leagues() # Load matches print("\n📊 Loading matches...") df = load_matches_with_odds(conn, top_league_ids) print(f" ✅ {len(df):,} matches loaded") # Load odds print("\n📊 Loading odds...") match_ids = set(df["id"].tolist()) odds_map = load_odds_for_matches(conn, match_ids) print(f" ✅ Odds loaded for {len(odds_map):,} matches") # Compute labels print("\n📊 Computing HT/FT labels...") df["label"] = compute_labels(df) label_dist = df["label"].value_counts().sort_index() for i, label in enumerate(HTFT_LABELS): c = label_dist.get(i, 0) print(f" {label}: {c:,} ({c/len(df)*100:.1f}%)") # Initialize HT/FT tendency engine htft_engine = HtftTendencyEngine() # Extract features all_features = extract_features(df, conn, odds_map, htft_engine) # Filter: keep only matches with features valid_mask = [f is not None for f in all_features] df_valid = df[valid_mask].reset_index(drop=True) features_valid = [f for f in all_features if f is not None] print(f"\n📊 Valid matches with features: {len(df_valid):,}") # Convert to arrays feature_names = list(features_valid[0].keys()) X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32) y = np.array(df_valid["label"].tolist(), dtype=np.int32) # Split: time-based (last 20% as test) split_idx = int(len(X) * 0.8) X_train, X_test = X[:split_idx], X[split_idx:] y_train, y_test = y[:split_idx], y[split_idx:] print(f" Train: {len(X_train):,}, Test: {len(X_test):,}") # ─── Train WITH new features ───────────────────────────────────────── model_new, acc_new = train_and_evaluate( X_train, y_train, X_test, y_test, feature_names, label="NEW (with HT/FT tendencies)" ) # ─── Train WITHOUT new features (baseline) ────────────────────────── # Remove htft_ features for comparison baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")] baseline_names = [feature_names[i] for i in baseline_cols] X_train_base = X_train[:, baseline_cols] X_test_base = X_test[:, baseline_cols] model_base, acc_base = train_and_evaluate( X_train_base, y_train, X_test_base, y_test, baseline_names, label="BASELINE (without HT/FT tendencies)" ) # ─── Comparison ────────────────────────────────────────────────────── print("\n" + "=" * 70) print("📈 COMPARISON") print("=" * 70) print(f" Baseline accuracy: {acc_base*100:.2f}%") print(f" New accuracy: {acc_new*100:.2f}%") delta = (acc_new - acc_base) * 100 direction = "📈 IMPROVEMENT" if delta > 0 else "📉 REGRESSION" print(f" Delta: {delta:+.2f}% {direction}") # Save new model model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl") with open(model_path, "wb") as f: pickle.dump(model_new, f) print(f"\n💾 New model saved: {model_path}") conn.close() print("\n✅ Done!") if __name__ == "__main__": main()