This commit is contained in:
@@ -0,0 +1,423 @@
|
||||
"""
|
||||
HT/FT Model Training with New Features + Backtest
|
||||
=====================================================
|
||||
Extracts training data with the new HT/FT tendency features,
|
||||
trains a new XGBoost model, and compares it against a baseline model trained without them.
|
||||
|
||||
Usage:
|
||||
python ai-engine/scripts/train_htft_with_tendencies.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import pickle
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
from tabulate import tabulate
|
||||
|
||||
import psycopg2
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
|
||||
from data.db import get_clean_dsn
|
||||
from features.htft_tendency_engine import HtftTendencyEngine
|
||||
|
||||
# Directory one level above the folder that contains this script.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# top_leagues.json is looked up next to (one level above) AI_ENGINE_DIR.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, os.pardir, "top_leagues.json")

# Data directory under AI_ENGINE_DIR; created at import time if missing.
OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Human-readable HT/FT outcome names in half-time-major order:
# "1/1", "1/X", "1/2", "X/1", ..., "2/2".
HTFT_LABELS = [f"{ht}/{ft}" for ht in "1X2" for ft in "1X2"]
|
||||
|
||||
|
||||
def get_conn():
    """Open a new database connection.

    The DSN is obtained from ``get_clean_dsn()`` and handed to
    ``psycopg2.connect``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for closing it.
    """
    return psycopg2.connect(get_clean_dsn())
|
||||
|
||||
|
||||
def load_top_leagues():
    """Load the set of top-league IDs from ``top_leagues.json``.

    The file at ``TOP_LEAGUES_PATH`` is iterated entry by entry. Each entry
    may be:

    * a dict carrying an ``"id"`` or ``"league_id"`` key (falsy values are
      ignored),
    * a bare string ID, or
    * a bare integer ID.

    Every ID is normalised to ``str``.

    Returns:
        set[str] | None: The collected IDs, or ``None`` when the file cannot
        be read or parsed. The message printed in that case states that all
        leagues will be used.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding (league names may contain non-ASCII characters).
        with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)

        ids = set()
        for entry in data:
            if isinstance(entry, dict):
                lid = entry.get("id") or entry.get("league_id")
                if lid:
                    ids.add(str(lid))
            elif isinstance(entry, str):
                ids.add(entry)
            elif isinstance(entry, int) and not isinstance(entry, bool):
                # A flat list of numeric IDs used to be dropped silently,
                # which produced an empty filter.
                ids.add(str(entry))

        print(f"✅ Loaded {len(ids)} top leagues")
        return ids
    except Exception as e:
        # Deliberately best-effort: any failure degrades to "no filter"
        # instead of aborting the training run.
        print(f"⚠️ Could not load top_leagues.json: {e}. Using all leagues.")
        return None
|
||||
|
||||
|
||||
def load_matches_with_odds(conn, top_league_ids=None):
    """Load finished football matches that have both HT and FT scores.

    Despite its name, this function does not load odds; it only selects the
    match rows (status ``'FT'``, sport ``'football'``, non-null scores and
    team IDs), ordered by ``mst_utc`` ascending.

    Args:
        conn: Open DB-API connection (created with psycopg2 in this script).
        top_league_ids: Optional collection of league IDs. When non-empty,
            only matches from these leagues are returned; ``None`` or an
            empty collection disables the filter.

    Returns:
        pandas.DataFrame with columns ``id, home_team_id, away_team_id,
        league_id, score_home, score_away, ht_score_home, ht_score_away,
        mst_utc``.
    """
    query = """
        SELECT
            m.id,
            m.home_team_id,
            m.away_team_id,
            m.league_id,
            m.score_home,
            m.score_away,
            m.ht_score_home,
            m.ht_score_away,
            m.mst_utc
        FROM matches m
        WHERE m.sport = 'football'
          AND m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
          AND m.ht_score_home IS NOT NULL
          AND m.ht_score_away IS NOT NULL
          AND m.home_team_id IS NOT NULL
          AND m.away_team_id IS NOT NULL
    """

    # Materialise the IDs once so the placeholder count and the bound
    # parameters are guaranteed to agree.
    params = list(top_league_ids) if top_league_ids else []
    if params:
        placeholders = ",".join(["%s"] * len(params))
        query += f" AND m.league_id IN ({placeholders})"

    query += " ORDER BY m.mst_utc ASC"

    # The context manager closes the cursor even when execute/fetch raises.
    with conn.cursor() as cur:
        cur.execute(query, params)
        rows = cur.fetchall()

    cols = ["id", "home_team_id", "away_team_id", "league_id",
            "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"]
    return pd.DataFrame(rows, columns=cols)
|
||||
|
||||
|
||||
def load_odds_for_matches(conn, match_ids):
    """Load full-time and half-time 1/X/2 odds for the given matches.

    Match IDs are queried in batches of 5000. For every returned row the
    odd value is parsed as a float; unparsable, empty, zero or negative
    values are ignored.

    Args:
        conn: Open DB-API connection (created with psycopg2 in this script).
        match_ids: Collection of match IDs to look up.

    Returns:
        dict: ``{match_id: odds}`` where ``odds`` may contain the keys
        ``ms_h``, ``ms_d``, ``ms_a`` (category ``'Maç Sonucu'``) and
        ``ht_ms_h``, ``ht_ms_d``, ``ht_ms_a`` (category ``'1. Yarı Sonucu'``).
        A match that only has rows in other categories, or only unusable
        values, still gets an (empty) entry.
    """
    if not match_ids:
        return {}

    # Odds category -> key prefix, and selection label -> key suffix.
    category_prefix = {"Maç Sonucu": "ms", "1. Yarı Sonucu": "ht_ms"}
    selection_suffix = {
        "1": "h", "Ev Sahibi": "h",
        "X": "d", "Berabere": "d",
        "2": "a", "Deplasman": "a",
    }

    odds_map = {}
    batch_size = 5000
    match_list = list(match_ids)

    for start in range(0, len(match_list), batch_size):
        batch = match_list[start:start + batch_size]
        placeholders = ",".join(["%s"] * len(batch))

        # NOTE(review): three of the five categories fetched below are never
        # mapped to a key here; the filter is kept as-is so that the set of
        # match IDs present in the result does not change.
        with conn.cursor() as cur:  # closed even if the query fails
            cur.execute(f"""
                SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value
                FROM odd_categories oc
                JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                WHERE oc.match_id IN ({placeholders})
                  AND oc.name IN (
                      'Maç Sonucu',
                      '1. Yarı Sonucu',
                      '2,5 Alt/Üst',
                      'Karşılıklı Gol',
                      'Çifte Şans'
                  )
            """, batch)
            rows = cur.fetchall()

        for mid, cat_name, sel_name, odd_value in rows:
            om = odds_map.setdefault(mid, {})

            try:
                val = float(odd_value) if odd_value else 0.0
            except (ValueError, TypeError):
                val = 0.0

            if val <= 0:
                continue

            prefix = category_prefix.get(cat_name)
            suffix = selection_suffix.get(sel_name)
            if prefix is not None and suffix is not None:
                om[f"{prefix}_{suffix}"] = val

    return odds_map
|
||||
|
||||
|
||||
def compute_labels(df):
    """Compute the HT/FT class index (0-8) for every match in ``df``.

    Each half-time and full-time result is encoded as 0 (home ahead),
    1 (level) or 2 (away ahead); the label is ``ht * 3 + ft``.

    Args:
        df: DataFrame with columns ``ht_score_home``, ``ht_score_away``,
            ``score_home`` and ``score_away``.

    Returns:
        list[int]: One label per row, in row order.
    """
    def _outcome(home, away):
        # Column-wise comparison instead of a Python-level row loop.
        # A pair that is neither ">" nor "<" (including NaN) maps to 1.
        return np.where(home > away, 0, np.where(home < away, 2, 1))

    ht = _outcome(df["ht_score_home"], df["ht_score_away"])
    ft = _outcome(df["score_home"], df["score_away"])
    return (ht * 3 + ft).tolist()
|
||||
|
||||
|
||||
# Neutral stand-in values applied when the tendency engine raises for a match.
_HTFT_NEUTRAL_FEATURES = {
    "htft_home_ht_scoring_rate": 0.5,
    "htft_home_ht_concede_rate": 0.5,
    "htft_home_ht_win_rate": 0.33,
    "htft_home_comeback_rate": 0.0,
    "htft_home_first_half_goal_pct": 0.5,
    "htft_home_second_half_surge": 1.0,
    "htft_away_ht_scoring_rate": 0.5,
    "htft_away_ht_concede_rate": 0.5,
    "htft_away_ht_win_rate": 0.33,
    "htft_away_comeback_rate": 0.0,
    "htft_away_first_half_goal_pct": 0.5,
    "htft_away_second_half_surge": 1.0,
    "htft_league_avg_ht_goals": 1.0,
    "htft_league_reversal_rate": 0.05,
    "htft_league_first_half_pct": 0.44,
    "htft_home_sample_size": 0.0,
    "htft_away_sample_size": 0.0,
}


def _vig_free_probs(odd_home, odd_draw, odd_away):
    """Turn a 1/X/2 decimal-odds triple into probabilities that sum to 1.

    Returns ``None`` unless all three odds are strictly positive (NaN
    counts as missing).
    """
    if not (odd_home > 0 and odd_draw > 0 and odd_away > 0):
        return None
    inv_home, inv_draw, inv_away = 1 / odd_home, 1 / odd_draw, 1 / odd_away
    overround = inv_home + inv_draw + inv_away
    return inv_home / overround, inv_draw / overround, inv_away / overround


def extract_features(df, conn, odds_map, htft_engine):
    """Build one feature dict per match, or ``None`` for unusable matches.

    A match is skipped (``None`` in the result) when it lacks a complete
    full-time 1/X/2 odds triple. Missing half-time odds fall back to 0.33
    for each outcome. If ``htft_engine.get_features`` raises, neutral
    tendency values are used instead; such failures are counted and
    reported at the end rather than being swallowed silently.

    Args:
        df: Matches with columns ``id``, ``home_team_id``, ``away_team_id``,
            ``league_id`` and ``mst_utc``.
        conn: Unused by this function; kept for interface compatibility.
        odds_map: ``{match_id: {"ms_h": ..., "ms_d": ..., "ms_a": ...,
            "ht_ms_h": ..., ...}}`` as produced by ``load_odds_for_matches``.
        htft_engine: Object exposing ``get_features(home_id, away_id,
            league_id, mst_utc)`` that returns a dict of features.

    Returns:
        list: Same length and order as ``df``; each element is a feature
        dict or ``None``.
    """
    total = len(df)
    print(f"\n⏳ Extracting features for {total:,} matches...")
    start_time = time.time()

    all_features = []
    processed = 0
    skipped = 0
    fallbacks = 0
    first_error = None

    # iterrows is kept on purpose: it preserves the exact value types that
    # are handed to the engine.
    for _, row in df.iterrows():
        odds = odds_map.get(row["id"], {})
        ms_h = odds.get("ms_h", 0.0)
        ms_d = odds.get("ms_d", 0.0)
        ms_a = odds.get("ms_a", 0.0)

        # Skip matches without a complete full-time odds triple (too noisy).
        ft_probs = _vig_free_probs(ms_h, ms_d, ms_a)
        if ft_probs is None:
            skipped += 1
            all_features.append(None)
            continue
        implied_home, implied_draw, implied_away = ft_probs

        ht_probs = _vig_free_probs(
            odds.get("ht_ms_h", 0.0),
            odds.get("ht_ms_d", 0.0),
            odds.get("ht_ms_a", 0.0),
        )
        if ht_probs is None:
            ht_probs = (0.33, 0.33, 0.33)
        ht_implied_home, ht_implied_draw, ht_implied_away = ht_probs

        feat = {
            # Odds features (core)
            "odds_ms_h": ms_h,
            "odds_ms_d": ms_d,
            "odds_ms_a": ms_a,
            "implied_home": implied_home,
            "implied_draw": implied_draw,
            "implied_away": implied_away,
            "fav_gap": abs(implied_home - implied_away),
            # HT odds
            "ht_implied_home": ht_implied_home,
            "ht_implied_draw": ht_implied_draw,
            "ht_implied_away": ht_implied_away,
        }

        # HT/FT tendency features
        try:
            feat.update(
                htft_engine.get_features(
                    row["home_team_id"],
                    row["away_team_id"],
                    row["league_id"],
                    row["mst_utc"],
                )
            )
        except Exception as e:
            # Best-effort: keep the row with neutral values, but remember
            # that it happened so a systematically failing engine is visible.
            fallbacks += 1
            if first_error is None:
                first_error = e
            feat.update(_HTFT_NEUTRAL_FEATURES)

        all_features.append(feat)
        processed += 1

        if processed % 2000 == 0:
            elapsed = time.time() - start_time
            rate = processed / elapsed if elapsed > 0 else 0
            remaining = (total - processed - skipped) / rate if rate > 0 else 0
            print(f" Processed: {processed:,} / {total:,} "
                  f"(skipped: {skipped:,}) "
                  f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]")

    elapsed = time.time() - start_time
    print(f" ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s")
    if fallbacks:
        print(f" ⚠️ HT/FT engine failed for {fallbacks:,} matches; "
              f"neutral values were used (first error: {first_error!r})")

    return all_features
|
||||
|
||||
|
||||
def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""):
    """Fit a 9-class XGBoost classifier and print an evaluation report.

    The report contains the overall accuracy on the test split, a per-class
    table (for each HT/FT class present in ``y_test``: its sample count and
    the share of those samples predicted correctly) and the 15 features
    with the highest importance.

    Args:
        X_train, y_train: Training matrix and integer class labels.
        X_test, y_test: Held-out matrix and labels. They are also passed to
            ``fit`` as ``eval_set``; no early stopping is configured in this
            call.
        feature_names: Column names of the feature matrices, in order.
        label: Free-text tag used in the printed headings.

    Returns:
        tuple: ``(fitted_model, overall_accuracy)``.
    """
    hyperparams = dict(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_class=9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        random_state=42,
        verbosity=0,
        n_jobs=-1,
    )
    model = xgb.XGBClassifier(**hyperparams)

    print(f"\n🏋️ Training {label} model...")
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # Overall score on the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n📊 {label} Results:")
    print(f" Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # One table row per class that actually occurs in the test labels.
    print(f"\n Per-class breakdown:")
    table = []
    for class_idx, class_name in enumerate(HTFT_LABELS):
        in_class = y_test == class_idx
        if not in_class.sum() > 0:
            continue
        class_acc = accuracy_score(y_test[in_class], y_pred[in_class])
        table.append([class_name, in_class.sum(), f"{class_acc*100:.1f}%"])

    print(tabulate(table, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty"))

    # Rank features by importance, highest first.
    ranked = list(zip(feature_names, model.feature_importances_))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(f"\n Top 15 Features:")
    for fname, imp in ranked[:15]:
        bar = "█" * int(imp * 100)
        print(f" {fname:40s} {imp:.4f} {bar}")

    return model, accuracy
|
||||
|
||||
|
||||
def main():
    """Run the full pipeline: load data, build features, train, compare, save.

    Steps:
        1. Load matches (optionally restricted to the top leagues) and odds.
        2. Compute HT/FT labels and per-match features.
        3. Split chronologically (first 80% train, last 20% test; rows are
           ordered by ``mst_utc``).
        4. Train one model on all features and one baseline without the
           ``htft_`` features, then print the accuracy delta.
        5. Pickle the model trained on all features to
           ``models/xgboost/xgb_ht_ft_v2.pkl`` under ``AI_ENGINE_DIR``.

    The database connection is always closed, including on errors. The run
    stops early with a message when there is nothing to train on.
    """
    print("🚀 HT/FT Model Training with New Tendency Features")
    print("=" * 70)

    conn = get_conn()
    try:
        top_league_ids = load_top_leagues()

        # Load matches
        print("\n📊 Loading matches...")
        df = load_matches_with_odds(conn, top_league_ids)
        print(f" ✅ {len(df):,} matches loaded")
        if df.empty:
            # Avoids a division by zero in the label distribution below.
            print("\n❌ No matches found; nothing to train on.")
            return

        # Load odds
        print("\n📊 Loading odds...")
        match_ids = set(df["id"].tolist())
        odds_map = load_odds_for_matches(conn, match_ids)
        print(f" ✅ Odds loaded for {len(odds_map):,} matches")

        # Compute labels
        print("\n📊 Computing HT/FT labels...")
        df["label"] = compute_labels(df)
        label_dist = df["label"].value_counts().sort_index()
        for i, label in enumerate(HTFT_LABELS):
            c = label_dist.get(i, 0)
            print(f" {label}: {c:,} ({c/len(df)*100:.1f}%)")

        # Initialize HT/FT tendency engine
        htft_engine = HtftTendencyEngine()

        # Extract features
        all_features = extract_features(df, conn, odds_map, htft_engine)

        # Filter: keep only matches with features
        valid_mask = [f is not None for f in all_features]
        df_valid = df[valid_mask].reset_index(drop=True)
        features_valid = [f for f in all_features if f is not None]

        print(f"\n📊 Valid matches with features: {len(df_valid):,}")
        if not features_valid:
            # features_valid[0] below would raise IndexError.
            print("\n❌ No match has usable odds; nothing to train on.")
            return

        # Convert to arrays
        feature_names = list(features_valid[0].keys())
        X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32)
        y = np.array(df_valid["label"].tolist(), dtype=np.int32)

        # Split: time-based (last 20% as test)
        split_idx = int(len(X) * 0.8)
        if split_idx == 0 or split_idx == len(X):
            print("\n❌ Too few valid matches for a train/test split.")
            return
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        print(f" Train: {len(X_train):,}, Test: {len(X_test):,}")

        # ─── Train WITH new features ─────────────────────────────────────
        model_new, acc_new = train_and_evaluate(
            X_train, y_train, X_test, y_test, feature_names,
            label="NEW (with HT/FT tendencies)"
        )

        # ─── Train WITHOUT new features (baseline) ───────────────────────
        # Remove htft_ features for comparison
        baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")]
        baseline_names = [feature_names[i] for i in baseline_cols]
        X_train_base = X_train[:, baseline_cols]
        X_test_base = X_test[:, baseline_cols]

        model_base, acc_base = train_and_evaluate(
            X_train_base, y_train, X_test_base, y_test, baseline_names,
            label="BASELINE (without HT/FT tendencies)"
        )

        # ─── Comparison ──────────────────────────────────────────────────
        print("\n" + "=" * 70)
        print("📈 COMPARISON")
        print("=" * 70)
        print(f" Baseline accuracy: {acc_base*100:.2f}%")
        print(f" New accuracy: {acc_new*100:.2f}%")
        delta = (acc_new - acc_base) * 100
        if delta > 0:
            direction = "📈 IMPROVEMENT"
        elif delta < 0:
            direction = "📉 REGRESSION"
        else:
            # An exact tie is not a regression.
            direction = "➖ NO CHANGE"
        print(f" Delta: {delta:+.2f}% {direction}")

        # Save new model (the target directory may not exist yet).
        model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl")
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            pickle.dump(model_new, f)
        print(f"\n💾 New model saved: {model_path}")
    finally:
        conn.close()

    print("\n✅ Done!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user