Files
iddaai-be/ai-engine/scripts/train_htft_with_tendencies.py
T
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

424 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
HT/FT Model Training with New Features + Backtest
=====================================================
Extracts training data with the new HT/FT tendency features,
trains a new XGBoost model, and compares it against the old model.
Usage:
python ai-engine/scripts/train_htft_with_tendencies.py
"""
import os
import sys
import time
import json
import pickle
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import numpy as np
import pandas as pd
from collections import defaultdict
from tabulate import tabulate
import psycopg2
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from data.db import get_clean_dsn
from features.htft_tendency_engine import HtftTendencyEngine
# Absolute path of the directory one level above this script's own folder.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# top_leagues.json is looked up one directory above AI_ENGINE_DIR.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
# "data" directory under AI_ENGINE_DIR; created at import time if missing.
OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Half-time/full-time outcome names. The list index equals ht * 3 + ft as
# computed by compute_labels(), where 0 = home ("1"), 1 = draw ("X"),
# 2 = away ("2").
HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]
def get_conn():
    """Open and return a new psycopg2 connection for the DSN from ``get_clean_dsn()``."""
    return psycopg2.connect(get_clean_dsn())
def load_top_leagues():
    """Load top league IDs from top_leagues.json.

    The parsed JSON value is iterated entry by entry. Supported entries:

    * a dict carrying a truthy ``id`` or ``league_id`` value,
    * a bare string ID,
    * a bare integer ID (booleans are ignored).

    Every ID is normalised to ``str`` so that the set has a single element
    type regardless of how the file spells the IDs.

    Returns:
        set[str] | None: the collected IDs, or ``None`` when the file cannot
        be read or parsed. ``None`` signals the caller to use all leagues.
    """
    try:
        with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        ids = set()
        for entry in data:
            if isinstance(entry, dict):
                lid = entry.get("id") or entry.get("league_id")
                if lid:
                    ids.add(str(lid))
            elif isinstance(entry, str):
                ids.add(entry)
            elif isinstance(entry, int) and not isinstance(entry, bool):
                # Bare numeric IDs used to be dropped silently, which could
                # leave the set empty and disable league filtering entirely.
                ids.add(str(entry))
        print(f"✅ Loaded {len(ids)} top leagues")
        return ids
    except Exception as e:
        # Deliberate best-effort: any failure falls back to "all leagues".
        print(f"⚠️ Could not load top_leagues.json: {e}. Using all leagues.")
        return None
def load_matches_with_odds(conn, top_league_ids=None):
    """Load finished football matches that have both HT and FT scores.

    Despite the name, no odds are fetched here; only rows from ``matches``
    are read. Rows are ordered by ``mst_utc`` ascending.

    Args:
        conn: open DB connection exposing ``cursor()`` (psycopg2 in this
            script).
        top_league_ids: optional collection of league IDs. When it is
            non-empty the query is restricted to those leagues; ``None`` or
            an empty collection means no league filter.

    Returns:
        pandas.DataFrame with columns ``id``, ``home_team_id``,
        ``away_team_id``, ``league_id``, ``score_home``, ``score_away``,
        ``ht_score_home``, ``ht_score_away``, ``mst_utc``.
    """
    cols = ["id", "home_team_id", "away_team_id", "league_id",
            "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"]
    query = """
        SELECT
            m.id,
            m.home_team_id,
            m.away_team_id,
            m.league_id,
            m.score_home,
            m.score_away,
            m.ht_score_home,
            m.ht_score_away,
            m.mst_utc
        FROM matches m
        WHERE m.sport = 'football'
        AND m.status = 'FT'
        AND m.score_home IS NOT NULL
        AND m.score_away IS NOT NULL
        AND m.ht_score_home IS NOT NULL
        AND m.ht_score_away IS NOT NULL
        AND m.home_team_id IS NOT NULL
        AND m.away_team_id IS NOT NULL
    """
    params = list(top_league_ids) if top_league_ids else []
    if params:
        # One placeholder per ID keeps the values parameterised, not inlined.
        placeholders = ",".join(["%s"] * len(params))
        query += f" AND m.league_id IN ({placeholders})"
    query += " ORDER BY m.mst_utc ASC"

    # The context manager closes the cursor even if execute/fetchall raises.
    with conn.cursor() as cur:
        cur.execute(query, params)
        rows = cur.fetchall()
    return pd.DataFrame(rows, columns=cols)
def load_odds_for_matches(conn, match_ids):
    """Load full-time and half-time 1X2 odds for the given match IDs.

    Matches are queried in batches of 5000 IDs. The query selects five odds
    categories, but only 'Maç Sonucu' (full-time result) and
    '1. Yarı Sonucu' (first-half result) are mapped into the output; rows of
    the other categories only cause an (possibly empty) entry to exist for
    their match.

    Args:
        conn: open DB connection exposing ``cursor()``.
        match_ids: collection of match IDs; empty/``None`` returns ``{}``.

    Returns:
        dict mapping match ID -> dict with any of the keys ``ms_h``,
        ``ms_d``, ``ms_a``, ``ht_ms_h``, ``ht_ms_d``, ``ht_ms_a`` holding
        positive float odds. Odds that are missing, unparsable, NaN, or not
        greater than zero are left out.
    """
    if not match_ids:
        return {}

    # Selection label -> outcome suffix; both spellings seen in the data map
    # to the same outcome.
    outcome_by_selection = {
        "1": "h", "Ev Sahibi": "h",
        "X": "d", "Berabere": "d",
        "2": "a", "Deplasman": "a",
    }
    # Category name -> key prefix in the output dict.
    prefix_by_category = {
        "Maç Sonucu": "ms_",
        "1. Yarı Sonucu": "ht_ms_",
    }

    odds_map = {}
    batch_size = 5000
    match_list = list(match_ids)
    for start in range(0, len(match_list), batch_size):
        batch = match_list[start:start + batch_size]
        placeholders = ",".join(["%s"] * len(batch))
        query = f"""
            SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value
            FROM odd_categories oc
            JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
            WHERE oc.match_id IN ({placeholders})
            AND oc.name IN (
                'Maç Sonucu',
                '1. Yarı Sonucu',
                '2,5 Alt/Üst',
                'Karşılıklı Gol',
                'Çifte Şans'
            )
        """
        # The context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            cur.execute(query, batch)
            rows = cur.fetchall()

        for mid, cat_name, sel_name, odd_value in rows:
            om = odds_map.setdefault(mid, {})
            try:
                val = float(odd_value) if odd_value else 0.0
            except (ValueError, TypeError):
                val = 0.0
            # "not val > 0" also rejects NaN, which "val <= 0" would let through.
            if not val > 0:
                continue
            prefix = prefix_by_category.get(cat_name)
            outcome = outcome_by_selection.get(sel_name)
            if prefix is not None and outcome is not None:
                om[prefix + outcome] = val
    return odds_map
def _result_codes(home, away):
    """Encode each home/away score pair as 0 (home ahead), 1 (level), 2 (away ahead)."""
    home_ahead = (home > away).to_numpy()
    away_ahead = (home < away).to_numpy()
    return np.where(home_ahead, 0, np.where(away_ahead, 2, 1))


def compute_labels(df):
    """Compute the HT/FT class label (0-8) for every row of ``df``.

    The label is ``ht * 3 + ft`` where ``ht`` and ``ft`` are the half-time
    and full-time results encoded as 0 = home ahead, 1 = level, 2 = away
    ahead. The comparison is done column-wise instead of row by row.

    Args:
        df: DataFrame with columns ``ht_score_home``, ``ht_score_away``,
            ``score_home`` and ``score_away``.

    Returns:
        list[int]: one label per row, in row order (empty list for an empty
        frame).
    """
    if df.empty:
        return []
    ht = _result_codes(df["ht_score_home"], df["ht_score_away"])
    ft = _result_codes(df["score_home"], df["score_away"])
    # tolist() converts NumPy integers back to plain Python ints.
    return (ht * 3 + ft).tolist()
# Neutral HT/FT tendency values used when the tendency engine fails for a match.
_NEUTRAL_HTFT_FEATURES = {
    "htft_home_ht_scoring_rate": 0.5,
    "htft_home_ht_concede_rate": 0.5,
    "htft_home_ht_win_rate": 0.33,
    "htft_home_comeback_rate": 0.0,
    "htft_home_first_half_goal_pct": 0.5,
    "htft_home_second_half_surge": 1.0,
    "htft_away_ht_scoring_rate": 0.5,
    "htft_away_ht_concede_rate": 0.5,
    "htft_away_ht_win_rate": 0.33,
    "htft_away_comeback_rate": 0.0,
    "htft_away_first_half_goal_pct": 0.5,
    "htft_away_second_half_surge": 1.0,
    "htft_league_avg_ht_goals": 1.0,
    "htft_league_reversal_rate": 0.05,
    "htft_league_first_half_pct": 0.44,
    "htft_home_sample_size": 0.0,
    "htft_away_sample_size": 0.0,
}


def _implied_probs(odd_h, odd_d, odd_a):
    """Return margin-free (home, draw, away) probabilities, or None if any odd is <= 0."""
    if odd_h <= 0 or odd_d <= 0 or odd_a <= 0:
        return None
    inv_h, inv_d, inv_a = 1 / odd_h, 1 / odd_d, 1 / odd_a
    total = inv_h + inv_d + inv_a
    return inv_h / total, inv_d / total, inv_a / total


def extract_features(df, conn, odds_map, htft_engine):
    """Build one feature dict per match in ``df``.

    Args:
        df: DataFrame with columns ``id``, ``home_team_id``,
            ``away_team_id``, ``league_id`` and ``mst_utc``.
        conn: unused here; kept so existing callers keep working.
        odds_map: dict of match ID -> odds dict with keys ``ms_h``/``ms_d``/
            ``ms_a`` and optionally ``ht_ms_h``/``ht_ms_d``/``ht_ms_a``.
        htft_engine: object exposing
            ``get_features(home_id, away_id, league_id, mst)`` returning a
            dict of tendency features.

    Returns:
        list with one entry per row of ``df``, in row order. An entry is
        ``None`` when the match lacks a complete set of positive full-time
        odds; otherwise it is a dict of odds features plus the tendency
        features (neutral defaults if the engine raised).
    """
    total = len(df)
    print(f"\n⏳ Extracting features for {total:,} matches...")
    start_time = time.time()
    all_features = []
    processed = 0
    skipped = 0
    engine_failures = 0
    first_failure = None

    for _, row in df.iterrows():
        mid = row["id"]
        odds = odds_map.get(mid, {})
        ms_h = odds.get("ms_h", 0.0)
        ms_d = odds.get("ms_d", 0.0)
        ms_a = odds.get("ms_a", 0.0)

        full_time = _implied_probs(ms_h, ms_d, ms_a)
        if full_time is None:
            # Without full-time odds the sample is too noisy to train on.
            skipped += 1
            all_features.append(None)
            continue
        implied_home, implied_draw, implied_away = full_time

        half_time = _implied_probs(
            odds.get("ht_ms_h", 0.0),
            odds.get("ht_ms_d", 0.0),
            odds.get("ht_ms_a", 0.0),
        )
        if half_time is None:
            # No usable half-time odds: fall back to a flat prior.
            half_time = (0.33, 0.33, 0.33)
        ht_implied_home, ht_implied_draw, ht_implied_away = half_time

        feat = {
            # Odds features (core)
            "odds_ms_h": ms_h,
            "odds_ms_d": ms_d,
            "odds_ms_a": ms_a,
            "implied_home": implied_home,
            "implied_draw": implied_draw,
            "implied_away": implied_away,
            "fav_gap": abs(implied_home - implied_away),
            # HT odds
            "ht_implied_home": ht_implied_home,
            "ht_implied_draw": ht_implied_draw,
            "ht_implied_away": ht_implied_away,
        }

        # HT/FT tendency features; a failing lookup must not abort the run,
        # but it is counted so the fallback no longer happens silently.
        try:
            htft_feats = htft_engine.get_features(
                row["home_team_id"], row["away_team_id"],
                row["league_id"], row["mst_utc"],
            )
            feat.update(htft_feats)
        except Exception as e:
            engine_failures += 1
            if first_failure is None:
                first_failure = e
            feat.update(_NEUTRAL_HTFT_FEATURES)

        all_features.append(feat)
        processed += 1
        if processed % 2000 == 0:
            elapsed = time.time() - start_time
            # Guard against a zero elapsed time on coarse clocks.
            rate = processed / elapsed if elapsed > 0 else 0
            remaining = (total - processed - skipped) / rate if rate > 0 else 0
            print(f" Processed: {processed:,} / {total:,} "
                  f"(skipped: {skipped:,}) "
                  f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]")

    elapsed = time.time() - start_time
    print(f" ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s")
    if engine_failures:
        print(f" ⚠️ HT/FT tendency lookup failed for {engine_failures:,} matches; "
              f"neutral defaults used (first error: {first_failure!r})")
    return all_features
def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""):
    """Train a 9-class XGBoost classifier and print an evaluation report.

    The report contains overall accuracy on the test split, a per-class
    table (share of each true class predicted correctly) and the 15 most
    important features with a text bar.

    Args:
        X_train, y_train: training features and integer labels.
        X_test, y_test: evaluation features and labels. ``y_test`` is
            compared element-wise against class indices, so it is expected
            to be a NumPy array.
        feature_names: names matching the feature columns, in order.
        label: free-text tag used in the printed headings.

    Returns:
        tuple ``(model, accuracy)``: the fitted ``XGBClassifier`` and its
        overall accuracy on the test split.
    """
    model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_class=9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        random_state=42,
        verbosity=0,
        n_jobs=-1,
    )
    print(f"\n🏋️ Training {label} model...")
    # NOTE(review): no early stopping is configured here, so eval_set should
    # only record the metric on the test split — confirm against the XGBoost
    # version in use.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # Predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n📊 {label} Results:")
    print(f" Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # Per-class accuracy: restricted to the rows whose true class is i.
    print(f"\n Per-class breakdown:")
    rows = []
    for class_idx, label_name in enumerate(HTFT_LABELS):
        mask = y_test == class_idx
        if mask.sum() > 0:
            class_acc = accuracy_score(y_test[mask], y_pred[mask])
            rows.append([label_name, mask.sum(), f"{class_acc*100:.1f}%"])
    print(tabulate(rows, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty"))

    # Feature importance
    importances = model.feature_importances_
    feat_imp = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)
    print(f"\n Top 15 Features:")
    for fname, imp in feat_imp[:15]:
        # One block per percentage point of importance. The bar glyph had
        # been lost (empty string), which made every bar invisible.
        bar = "█" * int(imp * 100)
        print(f" {fname:40s} {imp:.4f} {bar}")
    return model, accuracy
def main():
    """Run the full pipeline: load data, build features, train, compare, save.

    Steps:
        1. Load matches (optionally limited to the top leagues) and their odds.
        2. Compute HT/FT labels and print their distribution.
        3. Extract features and drop matches without usable odds.
        4. Split chronologically (first 80% train, last 20% test); rows keep
           the ``mst_utc`` ordering produced by the match query.
        5. Train one model with all features and one without the ``htft_*``
           features, then print the accuracy difference.
        6. Pickle the model trained with all features.

    The DB connection is closed even when a step fails. The run stops early,
    without training, when no matches or no usable features are available.
    """
    print("🚀 HT/FT Model Training with New Tendency Features")
    print("=" * 70)
    conn = get_conn()
    try:
        top_league_ids = load_top_leagues()

        # Load matches
        print("\n📊 Loading matches...")
        df = load_matches_with_odds(conn, top_league_ids)
        print(f"{len(df):,} matches loaded")
        if df.empty:
            # Avoids a division by zero in the label distribution below.
            print("❌ No matches found; nothing to train on.")
            return

        # Load odds
        print("\n📊 Loading odds...")
        match_ids = set(df["id"].tolist())
        odds_map = load_odds_for_matches(conn, match_ids)
        print(f" ✅ Odds loaded for {len(odds_map):,} matches")

        # Compute labels
        print("\n📊 Computing HT/FT labels...")
        df["label"] = compute_labels(df)
        label_dist = df["label"].value_counts().sort_index()
        for i, label in enumerate(HTFT_LABELS):
            c = label_dist.get(i, 0)
            print(f" {label}: {c:,} ({c/len(df)*100:.1f}%)")

        # Initialize HT/FT tendency engine
        htft_engine = HtftTendencyEngine()

        # Extract features
        all_features = extract_features(df, conn, odds_map, htft_engine)

        # Filter: keep only matches with features
        valid_mask = [f is not None for f in all_features]
        df_valid = df[valid_mask].reset_index(drop=True)
        features_valid = [f for f in all_features if f is not None]
        print(f"\n📊 Valid matches with features: {len(df_valid):,}")
        if not features_valid:
            # features_valid[0] below would raise IndexError.
            print("❌ No matches with usable odds; nothing to train on.")
            return

        # Convert to arrays; column order is taken from the first feature dict.
        feature_names = list(features_valid[0].keys())
        X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32)
        y = np.array(df_valid["label"].tolist(), dtype=np.int32)

        # Split: time-based (last 20% as test)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        print(f" Train: {len(X_train):,}, Test: {len(X_test):,}")

        # ─── Train WITH new features ─────────────────────────────────────
        model_new, acc_new = train_and_evaluate(
            X_train, y_train, X_test, y_test, feature_names,
            label="NEW (with HT/FT tendencies)"
        )

        # ─── Train WITHOUT new features (baseline) ───────────────────────
        # Remove htft_ features for comparison
        baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")]
        baseline_names = [feature_names[i] for i in baseline_cols]
        X_train_base = X_train[:, baseline_cols]
        X_test_base = X_test[:, baseline_cols]
        model_base, acc_base = train_and_evaluate(
            X_train_base, y_train, X_test_base, y_test, baseline_names,
            label="BASELINE (without HT/FT tendencies)"
        )

        # ─── Comparison ──────────────────────────────────────────────────
        print("\n" + "=" * 70)
        print("📈 COMPARISON")
        print("=" * 70)
        print(f" Baseline accuracy: {acc_base*100:.2f}%")
        print(f" New accuracy: {acc_new*100:.2f}%")
        delta = (acc_new - acc_base) * 100
        if delta > 0:
            direction = "📈 IMPROVEMENT"
        elif delta < 0:
            direction = "📉 REGRESSION"
        else:
            # Identical accuracy is not a regression.
            direction = "➖ NO CHANGE"
        print(f" Delta: {delta:+.2f}% {direction}")

        # Save new model; make sure the target directory exists first.
        model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl")
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            pickle.dump(model_new, f)
        print(f"\n💾 New model saved: {model_path}")
    finally:
        conn.close()
    print("\n✅ Done!")
# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()