first (part 2: other directories)

2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,423 @@
+"""
+HT/FT Model Training with New Features + Backtest
+=====================================================
+Extracts training data with the new HT/FT tendency features,
+trains a new XGBoost model, and compares it against the old model.
+
+Usage:
+    python ai-engine/scripts/train_htft_with_tendencies.py
+"""
+
+import os
+import sys
+import time
+import json
+import pickle
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import numpy as np
+import pandas as pd
+from collections import defaultdict
+from tabulate import tabulate
+
+import psycopg2
+import xgboost as xgb
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+
+from data.db import get_clean_dsn
+from features.htft_tendency_engine import HtftTendencyEngine
+
+# Parent of the directory that contains this script.
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# top_leagues.json is looked up one level above AI_ENGINE_DIR.
+TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
+# "data" directory under AI_ENGINE_DIR; created at import time if missing.
+OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# Class names indexed by label id: index == ht * 3 + ft, where 0 = home ("1"),
+# 1 = draw ("X") and 2 = away ("2") -- the encoding built in compute_labels().
+HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]
+
+
+def get_conn():
+    dsn = get_clean_dsn()
+    return psycopg2.connect(dsn)
+
+
+def load_top_leagues():
+    """Load top league IDs from top_leagues.json."""
+    try:
+        with open(TOP_LEAGUES_PATH, "r") as f:
+            data = json.load(f)
+        ids = set()
+        for entry in data:
+            if isinstance(entry, dict):
+                lid = entry.get("id") or entry.get("league_id")
+                if lid:
+                    ids.add(str(lid))
+            elif isinstance(entry, str):
+                ids.add(entry)
+        print(f"✅ Loaded {len(ids)} top leagues")
+        return ids
+    except Exception as e:
+        print(f"⚠️  Could not load top_leagues.json: {e}. Using all leagues.")
+        return None
+
+
+def load_matches_with_odds(conn, top_league_ids=None):
+    """Load FT football matches with HT scores and odds."""
+    query = """
+    SELECT
+        m.id,
+        m.home_team_id,
+        m.away_team_id,
+        m.league_id,
+        m.score_home,
+        m.score_away,
+        m.ht_score_home,
+        m.ht_score_away,
+        m.mst_utc
+    FROM matches m
+    WHERE m.sport = 'football'
+      AND m.status = 'FT'
+      AND m.score_home IS NOT NULL
+      AND m.score_away IS NOT NULL
+      AND m.ht_score_home IS NOT NULL
+      AND m.ht_score_away IS NOT NULL
+      AND m.home_team_id IS NOT NULL
+      AND m.away_team_id IS NOT NULL
+    """
+
+    if top_league_ids:
+        placeholders = ",".join(["%s"] * len(top_league_ids))
+        query += f" AND m.league_id IN ({placeholders})"
+
+    query += " ORDER BY m.mst_utc ASC"
+
+    cur = conn.cursor()
+    params = list(top_league_ids) if top_league_ids else []
+    cur.execute(query, params)
+    rows = cur.fetchall()
+    cur.close()
+
+    cols = ["id", "home_team_id", "away_team_id", "league_id",
+            "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"]
+    return pd.DataFrame(rows, columns=cols)
+
+
+def load_odds_for_matches(conn, match_ids):
+    """Load MS + HT odds for given match IDs."""
+    if not match_ids:
+        return {}
+
+    # Load in batches
+    odds_map = {}
+    batch_size = 5000
+    match_list = list(match_ids)
+
+    for i in range(0, len(match_list), batch_size):
+        batch = match_list[i:i + batch_size]
+        placeholders = ",".join(["%s"] * len(batch))
+
+        cur = conn.cursor()
+        cur.execute(f"""
+            SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value
+            FROM odd_categories oc
+            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
+            WHERE oc.match_id IN ({placeholders})
+              AND oc.name IN (
+                'Maç Sonucu',
+                '1. Yarı Sonucu',
+                '2,5 Alt/Üst',
+                'Karşılıklı Gol',
+                'Çifte Şans'
+              )
+        """, batch)
+        rows = cur.fetchall()
+        cur.close()
+
+        for mid, cat_name, sel_name, odd_value in rows:
+            if mid not in odds_map:
+                odds_map[mid] = {}
+            om = odds_map[mid]
+
+            try:
+                val = float(odd_value) if odd_value else 0.0
+            except (ValueError, TypeError):
+                val = 0.0
+
+            if val <= 0:
+                continue
+
+            # Exact match for MS
+            if cat_name == "Maç Sonucu":
+                if sel_name in ("1", "Ev Sahibi"):
+                    om["ms_h"] = val
+                elif sel_name in ("X", "Berabere"):
+                    om["ms_d"] = val
+                elif sel_name in ("2", "Deplasman"):
+                    om["ms_a"] = val
+            elif cat_name == "1. Yarı Sonucu":
+                if sel_name in ("1", "Ev Sahibi"):
+                    om["ht_ms_h"] = val
+                elif sel_name in ("X", "Berabere"):
+                    om["ht_ms_d"] = val
+                elif sel_name in ("2", "Deplasman"):
+                    om["ht_ms_a"] = val
+
+    return odds_map
+
+
+def compute_labels(df):
+    """Compute HT/FT label (0-8)."""
+    labels = []
+    for _, row in df.iterrows():
+        ht = 0 if row["ht_score_home"] > row["ht_score_away"] else (2 if row["ht_score_home"] < row["ht_score_away"] else 1)
+        ft = 0 if row["score_home"] > row["score_away"] else (2 if row["score_home"] < row["score_away"] else 1)
+        labels.append(ht * 3 + ft)
+    return labels
+
+
+def extract_features(df, conn, odds_map, htft_engine):
+    """Extract all features for each match."""
+    print(f"\n⏳ Extracting features for {len(df):,} matches...")
+    start_time = time.time()
+
+    all_features = []
+    processed = 0
+    skipped = 0
+
+    for idx, row in df.iterrows():
+        mid = row["id"]
+        hid = row["home_team_id"]
+        aid = row["away_team_id"]
+        lid = row["league_id"]
+        mst = row["mst_utc"]
+
+        # Odds features
+        odds = odds_map.get(mid, {})
+        ms_h = odds.get("ms_h", 0.0)
+        ms_d = odds.get("ms_d", 0.0)
+        ms_a = odds.get("ms_a", 0.0)
+
+        # Skip matches without any odds (too noisy)
+        if ms_h <= 0 or ms_d <= 0 or ms_a <= 0:
+            skipped += 1
+            all_features.append(None)
+            continue
+
+        # Implied probs (vig-free)
+        raw_sum = 1/ms_h + 1/ms_d + 1/ms_a
+        implied_home = (1/ms_h) / raw_sum
+        implied_draw = (1/ms_d) / raw_sum
+        implied_away = (1/ms_a) / raw_sum
+
+        ht_ms_h = odds.get("ht_ms_h", 0.0)
+        ht_ms_d = odds.get("ht_ms_d", 0.0)
+        ht_ms_a = odds.get("ht_ms_a", 0.0)
+
+        # HT implied probs
+        if ht_ms_h > 0 and ht_ms_d > 0 and ht_ms_a > 0:
+            ht_raw = 1/ht_ms_h + 1/ht_ms_d + 1/ht_ms_a
+            ht_implied_home = (1/ht_ms_h) / ht_raw
+            ht_implied_draw = (1/ht_ms_d) / ht_raw
+            ht_implied_away = (1/ht_ms_a) / ht_raw
+        else:
+            ht_implied_home = ht_implied_draw = ht_implied_away = 0.33
+
+        feat = {
+            # Odds features (core)
+            "odds_ms_h": ms_h,
+            "odds_ms_d": ms_d,
+            "odds_ms_a": ms_a,
+            "implied_home": implied_home,
+            "implied_draw": implied_draw,
+            "implied_away": implied_away,
+            "fav_gap": abs(implied_home - implied_away),
+
+            # HT odds
+            "ht_implied_home": ht_implied_home,
+            "ht_implied_draw": ht_implied_draw,
+            "ht_implied_away": ht_implied_away,
+        }
+
+        # HT/FT tendency features (NEW!)
+        try:
+            htft_feats = htft_engine.get_features(hid, aid, lid, mst)
+            feat.update(htft_feats)
+        except Exception as e:
+            # Fallback to neutral values
+            feat.update({
+                "htft_home_ht_scoring_rate": 0.5,
+                "htft_home_ht_concede_rate": 0.5,
+                "htft_home_ht_win_rate": 0.33,
+                "htft_home_comeback_rate": 0.0,
+                "htft_home_first_half_goal_pct": 0.5,
+                "htft_home_second_half_surge": 1.0,
+                "htft_away_ht_scoring_rate": 0.5,
+                "htft_away_ht_concede_rate": 0.5,
+                "htft_away_ht_win_rate": 0.33,
+                "htft_away_comeback_rate": 0.0,
+                "htft_away_first_half_goal_pct": 0.5,
+                "htft_away_second_half_surge": 1.0,
+                "htft_league_avg_ht_goals": 1.0,
+                "htft_league_reversal_rate": 0.05,
+                "htft_league_first_half_pct": 0.44,
+                "htft_home_sample_size": 0.0,
+                "htft_away_sample_size": 0.0,
+            })
+
+        all_features.append(feat)
+        processed += 1
+
+        if processed % 2000 == 0:
+            elapsed = time.time() - start_time
+            rate = processed / elapsed
+            remaining = (len(df) - processed - skipped) / rate if rate > 0 else 0
+            print(f"   Processed: {processed:,} / {len(df):,} "
+                  f"(skipped: {skipped:,}) "
+                  f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]")
+
+    elapsed = time.time() - start_time
+    print(f"   ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s")
+
+    return all_features
+
+
+def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""):
+    """Train XGBoost model and evaluate."""
+    model = xgb.XGBClassifier(
+        n_estimators=300,
+        max_depth=6,
+        learning_rate=0.05,
+        num_class=9,
+        objective="multi:softprob",
+        eval_metric="mlogloss",
+        subsample=0.8,
+        colsample_bytree=0.8,
+        min_child_weight=5,
+        random_state=42,
+        verbosity=0,
+        n_jobs=-1,
+    )
+
+    print(f"\n🏋️  Training {label} model...")
+    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
+
+    # Predictions
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+
+    print(f"\n📊 {label} Results:")
+    print(f"   Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")
+
+    # Per-class accuracy
+    print(f"\n   Per-class breakdown:")
+    rows = []
+    for i, label_name in enumerate(HTFT_LABELS):
+        mask = y_test == i
+        if mask.sum() > 0:
+            class_acc = accuracy_score(y_test[mask], y_pred[mask])
+            rows.append([label_name, mask.sum(), f"{class_acc*100:.1f}%"])
+
+    print(tabulate(rows, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty"))
+
+    # Feature importance
+    importances = model.feature_importances_
+    feat_imp = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)
+    print(f"\n   Top 15 Features:")
+    for fname, imp in feat_imp[:15]:
+        bar = "█" * int(imp * 100)
+        print(f"   {fname:40s} {imp:.4f} {bar}")
+
+    return model, accuracy
+
+
+def main():
+    print("🚀 HT/FT Model Training with New Tendency Features")
+    print("=" * 70)
+
+    conn = get_conn()
+    top_league_ids = load_top_leagues()
+
+    # Load matches
+    print("\n📊 Loading matches...")
+    df = load_matches_with_odds(conn, top_league_ids)
+    print(f"   ✅ {len(df):,} matches loaded")
+
+    # Load odds
+    print("\n📊 Loading odds...")
+    match_ids = set(df["id"].tolist())
+    odds_map = load_odds_for_matches(conn, match_ids)
+    print(f"   ✅ Odds loaded for {len(odds_map):,} matches")
+
+    # Compute labels
+    print("\n📊 Computing HT/FT labels...")
+    df["label"] = compute_labels(df)
+    label_dist = df["label"].value_counts().sort_index()
+    for i, label in enumerate(HTFT_LABELS):
+        c = label_dist.get(i, 0)
+        print(f"   {label}: {c:,} ({c/len(df)*100:.1f}%)")
+
+    # Initialize HT/FT tendency engine
+    htft_engine = HtftTendencyEngine()
+
+    # Extract features
+    all_features = extract_features(df, conn, odds_map, htft_engine)
+
+    # Filter: keep only matches with features
+    valid_mask = [f is not None for f in all_features]
+    df_valid = df[valid_mask].reset_index(drop=True)
+    features_valid = [f for f in all_features if f is not None]
+
+    print(f"\n📊 Valid matches with features: {len(df_valid):,}")
+
+    # Convert to arrays
+    feature_names = list(features_valid[0].keys())
+    X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32)
+    y = np.array(df_valid["label"].tolist(), dtype=np.int32)
+
+    # Split: time-based (last 20% as test)
+    split_idx = int(len(X) * 0.8)
+    X_train, X_test = X[:split_idx], X[split_idx:]
+    y_train, y_test = y[:split_idx], y[split_idx:]
+    print(f"   Train: {len(X_train):,}, Test: {len(X_test):,}")
+
+    # ─── Train WITH new features ─────────────────────────────────────────
+    model_new, acc_new = train_and_evaluate(
+        X_train, y_train, X_test, y_test, feature_names,
+        label="NEW (with HT/FT tendencies)"
+    )
+
+    # ─── Train WITHOUT new features (baseline) ──────────────────────────
+    # Remove htft_ features for comparison
+    baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")]
+    baseline_names = [feature_names[i] for i in baseline_cols]
+    X_train_base = X_train[:, baseline_cols]
+    X_test_base = X_test[:, baseline_cols]
+
+    model_base, acc_base = train_and_evaluate(
+        X_train_base, y_train, X_test_base, y_test, baseline_names,
+        label="BASELINE (without HT/FT tendencies)"
+    )
+
+    # ─── Comparison ──────────────────────────────────────────────────────
+    print("\n" + "=" * 70)
+    print("📈 COMPARISON")
+    print("=" * 70)
+    print(f"   Baseline accuracy:  {acc_base*100:.2f}%")
+    print(f"   New accuracy:       {acc_new*100:.2f}%")
+    delta = (acc_new - acc_base) * 100
+    direction = "📈 IMPROVEMENT" if delta > 0 else "📉 REGRESSION"
+    print(f"   Delta:              {delta:+.2f}% {direction}")
+
+    # Save new model
+    model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl")
+    with open(model_path, "wb") as f:
+        pickle.dump(model_new, f)
+    print(f"\n💾 New model saved: {model_path}")
+
+    conn.close()
+    print("\n✅ Done!")
+
+
+if __name__ == "__main__":
+    main()