first (part 2: other directories)
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s

This commit is contained in:
2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,423 @@
"""
HT/FT Model Training with New Features + Backtest
=====================================================
Extracts training data with the new HT/FT tendency features,
trains a new XGBoost model, and compares it against the old model.
Usage:
python ai-engine/scripts/train_htft_with_tendencies.py
"""
import os
import sys
import time
import json
import pickle
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import numpy as np
import pandas as pd
from collections import defaultdict
from tabulate import tabulate
import psycopg2
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from data.db import get_clean_dsn
from features.htft_tendency_engine import HtftTendencyEngine
# Directory two levels above this file (the parent of the directory holding this script).
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# top_leagues.json is looked up one directory above AI_ENGINE_DIR.
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
# "data" directory under AI_ENGINE_DIR; created at import time if it is missing.
OUTPUT_DIR = os.path.join(AI_ENGINE_DIR, "data")
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Half-time/full-time outcome names. The list index is the class id produced by
# compute_labels (ht * 3 + ft, with 0 = home, 1 = draw, 2 = away).
HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]
def get_conn():
    """Open and return a new psycopg2 connection using the DSN from ``get_clean_dsn()``."""
    return psycopg2.connect(get_clean_dsn())
def load_top_leagues():
    """Load top league IDs from the JSON file at ``TOP_LEAGUES_PATH``.

    Each entry of the JSON document may be:

    * an object carrying the ID under ``"id"`` or ``"league_id"``,
    * a bare string ID, or
    * a bare integer ID (converted to its string form, the same way IDs
      found inside objects are).

    Entries of any other shape are ignored.

    Returns:
        A set of league-ID strings, or ``None`` if the file could not be
        read or parsed (a warning is printed in that case).
    """
    try:
        with open(TOP_LEAGUES_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        ids = set()
        for entry in data:
            if isinstance(entry, dict):
                lid = entry.get("id") or entry.get("league_id")
                if lid:
                    ids.add(str(lid))
            elif isinstance(entry, str):
                ids.add(entry)
            elif isinstance(entry, int) and not isinstance(entry, bool):
                # Bare numeric IDs used to be dropped silently, which could
                # leave the set empty; bool is excluded because it is an
                # int subclass but never a league ID.
                ids.add(str(entry))
        print(f"✅ Loaded {len(ids)} top leagues")
        return ids
    except Exception as e:
        # Deliberate best-effort: any problem with the file means "no filter".
        print(f"⚠️ Could not load top_leagues.json: {e}. Using all leagues.")
        return None
def load_matches_with_odds(conn, top_league_ids=None):
    """Load finished football matches that have both HT and FT scores.

    Despite the name, this function does not read odds; odds are fetched
    separately by ``load_odds_for_matches``.

    Args:
        conn: Open DB-API connection (``cursor()`` is called on it).
        top_league_ids: Optional collection of league IDs. When non-empty,
            only matches from those leagues are returned; when ``None`` or
            empty, no league filter is applied.

    Returns:
        A ``pandas.DataFrame`` ordered by ``mst_utc`` ascending with the
        columns ``id, home_team_id, away_team_id, league_id, score_home,
        score_away, ht_score_home, ht_score_away, mst_utc``.
    """
    cols = ["id", "home_team_id", "away_team_id", "league_id",
            "score_home", "score_away", "ht_score_home", "ht_score_away", "mst_utc"]
    query = """
        SELECT
            m.id,
            m.home_team_id,
            m.away_team_id,
            m.league_id,
            m.score_home,
            m.score_away,
            m.ht_score_home,
            m.ht_score_away,
            m.mst_utc
        FROM matches m
        WHERE m.sport = 'football'
          AND m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.score_away IS NOT NULL
          AND m.ht_score_home IS NOT NULL
          AND m.ht_score_away IS NOT NULL
          AND m.home_team_id IS NOT NULL
          AND m.away_team_id IS NOT NULL
    """
    params = []
    if top_league_ids:
        params = list(top_league_ids)
        # One placeholder per ID keeps the values parameterised.
        placeholders = ",".join(["%s"] * len(params))
        query += f" AND m.league_id IN ({placeholders})"
    query += " ORDER BY m.mst_utc ASC"

    cur = conn.cursor()
    try:
        cur.execute(query, params)
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    return pd.DataFrame(rows, columns=cols)
def load_odds_for_matches(conn, match_ids):
    """Load full-time and half-time 1X2 odds for the given match IDs.

    Matches are queried in batches of 5000 IDs. Only the categories
    ``'Maç Sonucu'`` (stored as ``ms_h`` / ``ms_d`` / ``ms_a``) and
    ``'1. Yarı Sonucu'`` (stored as ``ht_ms_h`` / ``ht_ms_d`` / ``ht_ms_a``)
    are mapped. The query also fetches three other categories that are not
    mapped; a match that only has such rows still gets an (empty) entry.

    Args:
        conn: Open DB-API connection (``cursor()`` is called on it).
        match_ids: Collection of match IDs.

    Returns:
        ``{match_id: {odds_key: float}}``. Odds that are missing,
        non-numeric or not strictly positive are left out.
    """
    if not match_ids:
        return {}

    # Category name -> key prefix, selection name -> key suffix.
    category_prefix = {
        "Maç Sonucu": "ms",
        "1. Yarı Sonucu": "ht_ms",
    }
    selection_suffix = {
        "1": "h", "Ev Sahibi": "h",
        "X": "d", "Berabere": "d",
        "2": "a", "Deplasman": "a",
    }

    odds_map = {}
    batch_size = 5000
    match_list = list(match_ids)
    for start in range(0, len(match_list), batch_size):
        batch = match_list[start:start + batch_size]
        placeholders = ",".join(["%s"] * len(batch))
        cur = conn.cursor()
        try:
            cur.execute(f"""
                SELECT oc.match_id, oc.name, os.name as sel_name, os.odd_value
                FROM odd_categories oc
                JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                WHERE oc.match_id IN ({placeholders})
                AND oc.name IN (
                'Maç Sonucu',
                '1. Yarı Sonucu',
                '2,5 Alt/Üst',
                'Karşılıklı Gol',
                'Çifte Şans'
                )
            """, batch)
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()

        for mid, cat_name, sel_name, odd_value in rows:
            # The entry is created before validation, so every match that
            # returned at least one row appears in the result.
            om = odds_map.setdefault(mid, {})
            try:
                val = float(odd_value) if odd_value else 0.0
            except (ValueError, TypeError):
                val = 0.0
            if val <= 0:
                continue
            prefix = category_prefix.get(cat_name)
            suffix = selection_suffix.get(sel_name)
            if prefix is not None and suffix is not None:
                om[f"{prefix}_{suffix}"] = val
    return odds_map
def compute_labels(df):
    """Compute the HT/FT class id (0-8) for every row of ``df``.

    The half-time and full-time results are each encoded as 0 (home ahead),
    1 (level) or 2 (away ahead); the label is ``ht * 3 + ft``.

    Args:
        df: DataFrame with the columns ``ht_score_home``, ``ht_score_away``,
            ``score_home`` and ``score_away``.

    Returns:
        A list of Python ints, one per row, in row order.
    """
    def outcome(home, away):
        # Anything that is neither "home > away" nor "home < away" is a draw.
        return np.where(home > away, 0, np.where(home < away, 2, 1))

    # Whole-column comparisons replace the per-row iterrows() loop.
    ht = outcome(df["ht_score_home"].to_numpy(), df["ht_score_away"].to_numpy())
    ft = outcome(df["score_home"].to_numpy(), df["score_away"].to_numpy())
    return (ht * 3 + ft).tolist()
# Neutral HT/FT tendency values used when the tendency engine fails for a match.
_NEUTRAL_HTFT_FEATURES = {
    "htft_home_ht_scoring_rate": 0.5,
    "htft_home_ht_concede_rate": 0.5,
    "htft_home_ht_win_rate": 0.33,
    "htft_home_comeback_rate": 0.0,
    "htft_home_first_half_goal_pct": 0.5,
    "htft_home_second_half_surge": 1.0,
    "htft_away_ht_scoring_rate": 0.5,
    "htft_away_ht_concede_rate": 0.5,
    "htft_away_ht_win_rate": 0.33,
    "htft_away_comeback_rate": 0.0,
    "htft_away_first_half_goal_pct": 0.5,
    "htft_away_second_half_surge": 1.0,
    "htft_league_avg_ht_goals": 1.0,
    "htft_league_reversal_rate": 0.05,
    "htft_league_first_half_pct": 0.44,
    "htft_home_sample_size": 0.0,
    "htft_away_sample_size": 0.0,
}


def _vig_free_probs(odd_home, odd_draw, odd_away):
    """Return (home, draw, away) implied probabilities normalised to sum to 1."""
    inv_home, inv_draw, inv_away = 1 / odd_home, 1 / odd_draw, 1 / odd_away
    total = inv_home + inv_draw + inv_away
    return inv_home / total, inv_draw / total, inv_away / total


def extract_features(df, conn, odds_map, htft_engine):
    """Build one feature dict per match, aligned with the rows of ``df``.

    Args:
        df: Matches; the columns ``id``, ``home_team_id``, ``away_team_id``,
            ``league_id`` and ``mst_utc`` are read.
        conn: Not used by this function; kept so existing callers keep working.
        odds_map: ``{match_id: {"ms_h": ..., "ms_d": ..., "ms_a": ...,
            "ht_ms_h": ..., ...}}`` as produced by ``load_odds_for_matches``.
        htft_engine: Object exposing
            ``get_features(home_id, away_id, league_id, mst_utc)`` and
            returning a dict of extra features.

    Returns:
        A list with ``len(df)`` entries. An entry is ``None`` when the match
        lacks any of the three full-time odds, otherwise a dict holding the
        odds features plus the tendency features (or neutral defaults when
        the engine raised).
    """
    total = len(df)
    print(f"\n⏳ Extracting features for {total:,} matches...")
    start_time = time.time()
    all_features = []
    processed = 0
    skipped = 0
    fallbacks = 0

    for _, row in df.iterrows():
        mid = row["id"]

        # Odds features
        odds = odds_map.get(mid, {})
        ms_h = odds.get("ms_h", 0.0)
        ms_d = odds.get("ms_d", 0.0)
        ms_a = odds.get("ms_a", 0.0)

        # Skip matches without a complete set of full-time odds (too noisy).
        if ms_h <= 0 or ms_d <= 0 or ms_a <= 0:
            skipped += 1
            all_features.append(None)
            continue

        implied_home, implied_draw, implied_away = _vig_free_probs(ms_h, ms_d, ms_a)

        ht_ms_h = odds.get("ht_ms_h", 0.0)
        ht_ms_d = odds.get("ht_ms_d", 0.0)
        ht_ms_a = odds.get("ht_ms_a", 0.0)
        if ht_ms_h > 0 and ht_ms_d > 0 and ht_ms_a > 0:
            ht_implied_home, ht_implied_draw, ht_implied_away = _vig_free_probs(
                ht_ms_h, ht_ms_d, ht_ms_a
            )
        else:
            # No usable half-time odds: fall back to a flat prior.
            ht_implied_home = ht_implied_draw = ht_implied_away = 0.33

        feat = {
            # Odds features (core)
            "odds_ms_h": ms_h,
            "odds_ms_d": ms_d,
            "odds_ms_a": ms_a,
            "implied_home": implied_home,
            "implied_draw": implied_draw,
            "implied_away": implied_away,
            "fav_gap": abs(implied_home - implied_away),
            # HT odds
            "ht_implied_home": ht_implied_home,
            "ht_implied_draw": ht_implied_draw,
            "ht_implied_away": ht_implied_away,
        }

        # HT/FT tendency features
        try:
            htft_feats = htft_engine.get_features(
                row["home_team_id"], row["away_team_id"], row["league_id"], row["mst_utc"]
            )
            feat.update(htft_feats)
        except Exception as e:
            # Best-effort: keep the match with neutral values, but make the
            # failure visible instead of swallowing it silently.
            fallbacks += 1
            if fallbacks == 1:
                print(f" ⚠️ HT/FT tendency lookup failed for match {mid}: {e!r}. "
                      f"Using neutral values (further failures are only counted).")
            feat.update(_NEUTRAL_HTFT_FEATURES)

        all_features.append(feat)
        processed += 1

        if processed % 2000 == 0:
            elapsed = time.time() - start_time
            # Guard against a zero interval on coarse clocks.
            rate = processed / elapsed if elapsed > 0 else 0
            remaining = (total - processed - skipped) / rate if rate > 0 else 0
            print(f" Processed: {processed:,} / {total:,} "
                  f"(skipped: {skipped:,}) "
                  f"[{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining]")

    elapsed = time.time() - start_time
    print(f" ✅ Features extracted: {processed:,} (skipped {skipped:,}) in {elapsed:.1f}s")
    if fallbacks:
        print(f" ⚠️ Neutral HT/FT tendency values used for {fallbacks:,} matches")
    return all_features
def train_and_evaluate(X_train, y_train, X_test, y_test, feature_names, label=""):
    """Train a 9-class XGBoost classifier and print an evaluation report.

    The report contains the overall accuracy on the test set, a per-class
    accuracy table and the 15 most important features.

    Args:
        X_train: Training feature matrix.
        y_train: Training class ids.
        X_test: Test feature matrix.
        y_test: Test class ids; compared element-wise with each class id to
            build masks, so an array type supporting that is expected.
        feature_names: Names matching the feature columns, used for the
            importance listing.
        label: Free-text model name shown in the printed headers.

    Returns:
        ``(model, accuracy)``: the fitted classifier and its overall
        accuracy on the test set.
    """
    model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_class=9,
        objective="multi:softprob",
        eval_metric="mlogloss",
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        random_state=42,
        verbosity=0,
        n_jobs=-1,
    )
    print(f"\n🏋️ Training {label} model...")
    # The test set is passed as eval_set; no early stopping is configured here.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    # Predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n📊 {label} Results:")
    print(f" Overall Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # Per-class accuracy (classes absent from the test set are omitted)
    print(f"\n Per-class breakdown:")
    rows = []
    for i, label_name in enumerate(HTFT_LABELS):
        mask = y_test == i
        if mask.sum() > 0:
            class_acc = accuracy_score(y_test[mask], y_pred[mask])
            rows.append([label_name, mask.sum(), f"{class_acc*100:.1f}%"])
    print(tabulate(rows, headers=["HT/FT", "Count", "Accuracy"], tablefmt="pretty"))

    # Feature importance
    importances = model.feature_importances_
    feat_imp = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)
    print(f"\n Top 15 Features:")
    for fname, imp in feat_imp[:15]:
        # One block per percentage point of importance. The original
        # repeated an empty string, so the bar never rendered.
        bar = "█" * int(imp * 100)
        print(f" {fname:40s} {imp:.4f} {bar}")
    return model, accuracy
def main():
    """Run the full HT/FT training and comparison pipeline.

    Steps: load matches and odds, compute labels, extract features, split
    the data positionally 80/20, train one model with and one without the
    ``htft_`` features, print the accuracy comparison and pickle the model
    trained with the new features to ``models/xgboost/xgb_ht_ft_v2.pkl``
    under ``AI_ENGINE_DIR``.

    The database connection is closed even if a step fails. The function
    returns early (after printing a warning) when there are no matches or
    too few matches with odds to form a train/test split.
    """
    print("🚀 HT/FT Model Training with New Tendency Features")
    print("=" * 70)
    conn = get_conn()
    try:
        top_league_ids = load_top_leagues()

        # Load matches
        print("\n📊 Loading matches...")
        df = load_matches_with_odds(conn, top_league_ids)
        print(f"{len(df):,} matches loaded")
        if df.empty:
            # Avoids a division by zero in the label distribution below.
            print("⚠️ No matches found; nothing to train on.")
            return

        # Load odds
        print("\n📊 Loading odds...")
        match_ids = set(df["id"].tolist())
        odds_map = load_odds_for_matches(conn, match_ids)
        print(f" ✅ Odds loaded for {len(odds_map):,} matches")

        # Compute labels
        print("\n📊 Computing HT/FT labels...")
        df["label"] = compute_labels(df)
        label_dist = df["label"].value_counts().sort_index()
        for i, label in enumerate(HTFT_LABELS):
            c = label_dist.get(i, 0)
            print(f" {label}: {c:,} ({c/len(df)*100:.1f}%)")

        # Initialize HT/FT tendency engine
        htft_engine = HtftTendencyEngine()

        # Extract features
        all_features = extract_features(df, conn, odds_map, htft_engine)

        # Filter: keep only matches with features
        valid_mask = [f is not None for f in all_features]
        df_valid = df[valid_mask].reset_index(drop=True)
        features_valid = [f for f in all_features if f is not None]
        print(f"\n📊 Valid matches with features: {len(df_valid):,}")

        # Split: positional; rows come back ordered by mst_utc, so the last
        # 20% of rows form the test set.
        split_idx = int(len(features_valid) * 0.8)
        if split_idx == 0:
            print("⚠️ Not enough matches with odds to build a train/test split; aborting.")
            return

        # Convert to arrays
        # NOTE(review): column order and names come from the first feature
        # dict; this assumes every dict has the same keys — confirm that the
        # tendency engine's keys match the neutral fallback keys.
        feature_names = list(features_valid[0].keys())
        X = np.array([[f[k] for k in feature_names] for f in features_valid], dtype=np.float32)
        y = np.array(df_valid["label"].tolist(), dtype=np.int32)

        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        print(f" Train: {len(X_train):,}, Test: {len(X_test):,}")

        # ─── Train WITH new features ─────────────────────────────────────
        model_new, acc_new = train_and_evaluate(
            X_train, y_train, X_test, y_test, feature_names,
            label="NEW (with HT/FT tendencies)"
        )

        # ─── Train WITHOUT new features (baseline) ───────────────────────
        # Remove htft_ features for comparison
        baseline_cols = [i for i, n in enumerate(feature_names) if not n.startswith("htft_")]
        baseline_names = [feature_names[i] for i in baseline_cols]
        X_train_base = X_train[:, baseline_cols]
        X_test_base = X_test[:, baseline_cols]
        model_base, acc_base = train_and_evaluate(
            X_train_base, y_train, X_test_base, y_test, baseline_names,
            label="BASELINE (without HT/FT tendencies)"
        )

        # ─── Comparison ──────────────────────────────────────────────────
        print("\n" + "=" * 70)
        print("📈 COMPARISON")
        print("=" * 70)
        print(f" Baseline accuracy: {acc_base*100:.2f}%")
        print(f" New accuracy: {acc_new*100:.2f}%")
        delta = (acc_new - acc_base) * 100
        if delta > 0:
            direction = "📈 IMPROVEMENT"
        elif delta < 0:
            direction = "📉 REGRESSION"
        else:
            # Equal accuracy is not a regression.
            direction = "➖ NO CHANGE"
        print(f" Delta: {delta:+.2f}% {direction}")

        # Save new model
        model_path = os.path.join(AI_ENGINE_DIR, "models", "xgboost", "xgb_ht_ft_v2.pkl")
        # The target directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            pickle.dump(model_new, f)
        print(f"\n💾 New model saved: {model_path}")
        print("\n✅ Done!")
    finally:
        conn.close()
if __name__ == "__main__":
main()