Files
iddaai-be/ai-engine/scripts/train_htft_vqwen.py
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

397 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
HT/FT (İY/MS) Model Training Script - VQWEN v3
This script trains an XGBoost model for İY/MS (Half Time / Full Time) prediction.
9 classes: 1/1, 1/X, 1/2, X/1, X/X, X/2, 2/1, 2/X, 2/2
Features:
- Odds (MS + HT)
- HT/FT Tendency Engine (takımların ilk yarı/ikinci yarı performansları)
- League-level stats
- Data quality metrics
Output:
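- ai-engine/models/xgboost/xgb_ht_ft.json (V20 + V25 compatible)
- ai-engine/models/xgboost/xgb_ht_ft.pkl (pickled sklearn wrapper, V20)
- ai-engine/models/xgboost/htft_features.json (ordered feature names)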
"""
import os
import sys
import json
import pickle
import psycopg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
# Add the parent of this script's directory to sys.path so `features` can be imported
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from features.htft_tendency_engine import HtftTendencyEngine
# Database connection string; the DATABASE_URL environment variable overrides
# the built-in fallback.
# NOTE(review): the fallback URL embeds credentials in source control —
# consider requiring DATABASE_URL to be set instead.
DB_URL = os.getenv('DATABASE_URL', 'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db')
# Drop any query string (e.g. ?schema=public); psycopg2 doesn't accept it.
DB_URL = DB_URL.partition('?')[0]

# HT/FT class names; the list position is the integer class id (0-8).
HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]

# Output locations: <parent of this script's directory>/models/xgboost/
MODEL_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'models',
    'xgboost',
)
MODEL_PATH_JSON = os.path.join(MODEL_DIR, 'xgb_ht_ft.json')
MODEL_PATH_PKL = os.path.join(MODEL_DIR, 'xgb_ht_ft.pkl')
def fetch_matches():
    """Fetch completed football matches that have both HT and FT scores.

    Selects rows from ``matches`` where ``sport = 'football'``,
    ``status = 'FT'`` and the half-time score, full-time score and
    ``mst_utc`` are all non-NULL, ordered by ``mst_utc`` ascending.

    Returns:
        The list of rows returned by a ``RealDictCursor`` (one mapping per
        match, keyed by column name).

    Raises:
        Any exception raised by psycopg2 while connecting or querying; the
        connection is closed before the exception propagates.
    """
    print("📊 Fetching completed football matches...")
    conn = psycopg2.connect(DB_URL)
    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    m.id,
                    m.home_team_id,
                    m.away_team_id,
                    m.league_id,
                    m.sport,
                    m.mst_utc,
                    m.ht_score_home,
                    m.ht_score_away,
                    m.score_home,
                    m.score_away
                FROM matches m
                WHERE m.sport = 'football'
                  AND m.status = 'FT'
                  AND m.ht_score_home IS NOT NULL
                  AND m.ht_score_away IS NOT NULL
                  AND m.score_home IS NOT NULL
                  AND m.score_away IS NOT NULL
                  AND m.mst_utc IS NOT NULL
                ORDER BY m.mst_utc ASC
            """)
            matches = cur.fetchall()
    finally:
        # `with conn` would only end the transaction, not close the
        # connection, so close it explicitly on every path.
        conn.close()
    print(f"✅ Fetched {len(matches)} matches")
    return matches
def compute_htft_label(ht_home, ht_away, ft_home, ft_away):
    """Encode a half-time / full-time outcome pair as a class id in 0-8.

    Each score pair is reduced to an outcome code (0 = home ahead,
    1 = level, 2 = away ahead) and the two codes are combined as
    ``ht_outcome * 3 + ft_outcome``.

    Args:
        ht_home: Home goals at half time.
        ht_away: Away goals at half time.
        ft_home: Home goals at full time.
        ft_away: Away goals at full time.

    Returns:
        Integer label in the range 0-8.
    """
    def outcome(home_goals, away_goals):
        # 0 = home ahead, 1 = level, 2 = away ahead
        if home_goals > away_goals:
            return 0
        if home_goals == away_goals:
            return 1
        return 2

    return outcome(ht_home, ht_away) * 3 + outcome(ft_home, ft_away)
# Maps a lower-cased 1X2 selection name to the suffix used in the odds dict
# keys ('x' and '0' both denote the draw).
_SELECTION_SUFFIX = {'1': 'h', 'x': 'd', '0': 'd', '2': 'a'}

# (engine feature key, fallback value) pairs in model column order. Each one
# becomes the feature column 'htft_<key>'.
_HTFT_ENGINE_DEFAULTS = (
    # Home team tendencies
    ('home_ht_scoring_rate', 0.5),
    ('home_ht_concede_rate', 0.5),
    ('home_ht_win_rate', 0.33),
    ('home_comeback_rate', 0.0),
    ('home_first_half_goal_pct', 0.5),
    ('home_second_half_surge', 1.0),
    # Away team tendencies
    ('away_ht_scoring_rate', 0.5),
    ('away_ht_concede_rate', 0.5),
    ('away_ht_win_rate', 0.33),
    ('away_comeback_rate', 0.0),
    ('away_first_half_goal_pct', 0.5),
    ('away_second_half_surge', 1.0),
    # League-level
    ('league_avg_ht_goals', 1.0),
    ('league_reversal_rate', 0.05),
    ('league_first_half_pct', 0.44),
    # Data quality
    ('home_sample_size', 0.0),
    ('away_sample_size', 0.0),
)


def _parse_match_odds(odds_rows):
    """Split raw odds rows into full-time (MS) and half-time (HT) 1X2 odds.

    Rows outside the two relevant categories, rows with an unknown
    selection, and rows whose odd value is missing, non-numeric, non-finite
    or non-positive are ignored, so one bad row cannot abort the run or
    cause a division by zero later.

    Returns:
        ``(odds, ht_odds)`` where ``odds`` may hold 'ms_h'/'ms_d'/'ms_a' and
        ``ht_odds`` may hold 'ht_ms_h'/'ht_ms_d'/'ht_ms_a'.
    """
    odds = {}
    ht_odds = {}
    for row in odds_rows:
        cat = (row['category_name'] or '').lower()
        is_ht = '1.yarı sonucu' in cat
        if not is_ht and 'maç sonucu' not in cat:
            continue
        suffix = _SELECTION_SUFFIX.get((row['selection_name'] or '').lower())
        if suffix is None:
            continue
        try:
            val = float(row['odd_value'])
        except (TypeError, ValueError):
            continue
        # Rejects NaN, infinities, zero and negative values in one test.
        if not 0 < val < float('inf'):
            continue
        if is_ht:
            ht_odds[f'ht_ms_{suffix}'] = val
        else:
            odds[f'ms_{suffix}'] = val
    return odds, ht_odds


def _build_feature_row(odds, ht_odds, htft_feats):
    """Assemble one feature dict; ``odds`` must contain all three MS keys."""
    ms_h, ms_d, ms_a = odds['ms_h'], odds['ms_d'], odds['ms_a']
    feat = {
        # MS odds
        'odds_ms_h': ms_h,
        'odds_ms_d': ms_d,
        'odds_ms_a': ms_a,
        'implied_home': 1.0 / ms_h,
        'implied_draw': 1.0 / ms_d,
        'implied_away': 1.0 / ms_a,
        'fav_gap': abs(ms_h - ms_a),
        # HT odds (fixed fallbacks when the HT market is missing)
        'ht_implied_home': 1.0 / ht_odds.get('ht_ms_h', 3.0),
        'ht_implied_draw': 1.0 / ht_odds.get('ht_ms_d', 2.1),
        'ht_implied_away': 1.0 / ht_odds.get('ht_ms_a', 3.5),
    }
    # HT/FT tendency, league-level and data-quality features from the engine.
    for key, default in _HTFT_ENGINE_DEFAULTS:
        feat[f'htft_{key}'] = htft_feats.get(key, default)
    return feat


def extract_features_and_labels(matches):
    """Build feature dicts and HT/FT labels for the given matches.

    For every match the odds rows are loaded, the MS and HT 1X2 odds are
    parsed, and the HT/FT tendency features are requested from
    ``HtftTendencyEngine``. Matches without a complete set of MS odds are
    skipped. If the engine raises, its default (empty) features are used
    and the failure is counted and reported at the end.

    Args:
        matches: Iterable of match rows as returned by ``fetch_matches``.

    Returns:
        ``(features_list, labels, match_ids)`` — three parallel lists with
        one entry per retained match.

    Raises:
        Any exception raised by psycopg2; the connection is closed before
        the exception propagates.
    """
    print("\n🔧 Extracting features...")
    features_list = []
    labels = []
    match_ids = []
    engine_failures = 0

    conn = psycopg2.connect(DB_URL)
    try:
        htft_engine = HtftTendencyEngine()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            for idx, match in enumerate(matches):
                if idx % 1000 == 0:
                    print(f" Processing {idx}/{len(matches)}...")

                mid = match['id']

                # Fetch odds (MS and HT)
                cur.execute("""
                    SELECT oc.name as category_name, os.name as selection_name, os.odd_value
                    FROM odd_categories oc
                    JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                    WHERE oc.match_id = %s
                """, (mid,))
                odds, ht_odds = _parse_match_odds(cur.fetchall())

                # Skip matches without a complete MS 1X2 market.
                if not {'ms_h', 'ms_d', 'ms_a'} <= odds.keys():
                    continue

                label = compute_htft_label(
                    match['ht_score_home'],
                    match['ht_score_away'],
                    match['score_home'],
                    match['score_away'],
                )

                hid = str(match['home_team_id'])
                aid = str(match['away_team_id'])
                lid = str(match['league_id']) if match['league_id'] else None
                mst = int(match['mst_utc'])

                try:
                    htft_feats = htft_engine.get_features(hid, aid, lid, mst)
                except Exception:
                    # Best effort: keep the sample with default engine
                    # features, but count the failure so it is not silent.
                    engine_failures += 1
                    htft_feats = htft_engine._empty_features()

                features_list.append(_build_feature_row(odds, ht_odds, htft_feats))
                labels.append(label)
                match_ids.append(mid)
    finally:
        conn.close()

    if engine_failures:
        print(f"⚠️ HT/FT engine failed for {engine_failures} matches; default features used")
    print(f"✅ Extracted {len(features_list)} samples with features")
    return features_list, labels, match_ids
def train_model(features_list, labels):
    """Train and evaluate the 9-class HT/FT XGBoost classifier.

    The samples are split by position: the first 80% are used for training
    and the last 20% for testing, so the input is expected to be in
    chronological order for this to be a time-based split. Training rows
    are weighted with 'balanced' class weights. No probability calibration
    is performed here.

    Args:
        features_list: List of feature dicts, one per sample.
        labels: List of integer class ids (0-8), parallel to
            ``features_list``.

    Returns:
        ``(model, feature_names)`` — the fitted ``XGBClassifier`` and the
        list of feature column names in training order.
    """
    from sklearn.utils.class_weight import compute_class_weight

    print("\n🎯 Training HT/FT XGBoost model...")

    X = pd.DataFrame(features_list)
    y = np.array(labels)
    class_ids = np.arange(len(HTFT_LABELS))

    print("\n📊 Class distribution:")
    for i, label_name in enumerate(HTFT_LABELS):
        count = np.sum(y == i)
        print(f" {label_name}: {count} ({count/len(y)*100:.1f}%)")

    # Positional 80/20 split (time-based when the input is chronological).
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    print(f"\n📈 Train size: {len(X_train)}, Test size: {len(X_test)}")

    # Class weights to counter class imbalance, expanded to one per row.
    class_weights = compute_class_weight('balanced', classes=class_ids, y=y_train)
    sample_weights = class_weights[y_train]
    print(f"\n⚖️ Class weights: {dict(zip(HTFT_LABELS, [round(w, 2) for w in class_weights]))}")

    model = xgb.XGBClassifier(
        n_estimators=400,
        max_depth=7,
        learning_rate=0.05,
        objective='multi:softprob',
        num_class=len(HTFT_LABELS),
        eval_metric='mlogloss',
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=20,  # constructor argument in newer XGBoost versions
    )
    # NOTE(review): early stopping is driven by the same split that is used
    # for the metrics below, so the reported test scores are optimistic.
    # Consider a separate validation split.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_test, y_test)],
        verbose=False,
    )

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n✅ Test Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # Pin `labels` so the report and matrix always cover all 9 classes.
    # Without it, a test split that lacks a rare class makes
    # classification_report raise (target_names length mismatch) and
    # changes the confusion-matrix shape.
    print("\n📊 Classification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=HTFT_LABELS,
        zero_division=0,
    ))

    print("\n🔲 Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_ids))

    print("\n🔝 Top 15 Features:")
    ranked = sorted(
        zip(X.columns, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feat, imp in ranked[:15]:
        print(f" {feat}: {imp:.4f}")

    return model, X.columns.tolist()
def save_model(model, feature_names):
    """Write the trained model and its feature list to MODEL_DIR.

    Three files are produced: the booster as JSON (MODEL_PATH_JSON), the
    full model object as a pickle (MODEL_PATH_PKL), and the feature names
    as ``htft_features.json``.

    Args:
        model: Fitted model exposing ``get_booster()``.
        feature_names: JSON-serialisable list of feature names.
    """
    print("\n💾 Saving model...")
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Booster JSON (for V25 + V20)
    booster = model.get_booster()
    booster.save_model(MODEL_PATH_JSON)
    print(f"✅ Saved JSON model: {MODEL_PATH_JSON}")

    # Pickle of the whole model object (for the V20 sklearn wrapper)
    with open(MODEL_PATH_PKL, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)
    print(f"✅ Saved PKL model: {MODEL_PATH_PKL}")

    # Feature names, so consumers can rebuild the column order
    features_path = os.path.join(MODEL_DIR, 'htft_features.json')
    with open(features_path, 'w') as names_file:
        names_file.write(json.dumps(feature_names, indent=2))
    print(f"✅ Saved features: {features_path}")
def test_model_loading():
    """Smoke-test that both saved model artefacts can be loaded back.

    Loads MODEL_PATH_JSON into a raw ``xgb.Booster`` (the V25 path) and
    unpickles MODEL_PATH_PKL (the V20 path), printing the feature count of
    each. Any load failure propagates as an exception.
    """
    print("\n🧪 Testing model loading...")
    import xgboost as xgb

    # V25 path: raw booster from JSON
    json_booster = xgb.Booster()
    json_booster.load_model(MODEL_PATH_JSON)
    json_feature_count = len(json_booster.feature_names)
    print(f"✅ V25 booster loaded from JSON, features: {json_feature_count}")

    # V20 path: sklearn wrapper from the pickle this script just wrote
    with open(MODEL_PATH_PKL, 'rb') as pkl_file:
        sklearn_model = pickle.load(pkl_file)
    pkl_feature_count = len(sklearn_model.feature_names_in_)
    print(f"✅ V20 model loaded from PKL, features: {pkl_feature_count}")

    print("\n✅ All model loading tests passed!")
def main():
    """Run the pipeline: fetch, featurise, train, save, verify loading.

    Returns early (after printing a message) when no matches are fetched or
    no feature rows could be extracted.
    """
    rule = "="*80
    print(rule)
    print("🚀 HT/FT (İY/MS) MODEL TRAINING - VQWEN v3")
    print(rule)

    # 1. Fetch matches
    matches = fetch_matches()
    if not matches:
        print("❌ No matches found")
        return

    # 2. Extract features and labels (match ids are not needed here)
    features_list, labels, _match_ids = extract_features_and_labels(matches)
    if not features_list:
        print("❌ No features extracted")
        return

    # 3-5. Train, persist, then check the artefacts load back
    model, feature_names = train_model(features_list, labels)
    save_model(model, feature_names)
    test_model_loading()

    print("\n" + rule)
    print("🎉 TRAINING COMPLETE")
    print(rule)
    print(f"\n📊 Model files:")
    print(f" JSON (V25+V20): {MODEL_PATH_JSON}")
    print(f" PKL (V20): {MODEL_PATH_PKL}")
    print(f" Features: {MODEL_DIR}/htft_features.json")
    print(f"\n📈 Total samples: {len(features_list)}")
    print(f"🎯 Classes: {len(HTFT_LABELS)}")


if __name__ == '__main__':
    main()