# Listing metadata from the code viewer: 397 lines, 13 KiB, Python
"""
HT/FT (İY/MS) Model Training Script - VQWEN v3

This script trains an XGBoost model for HT/FT (İY/MS, half time / full time)
prediction.
9 classes: 1/1, 1/X, 1/2, X/1, X/X, X/2, 2/1, 2/X, 2/2

Features:
- Odds (MS + HT)
- HT/FT Tendency Engine (teams' first-half / second-half performance)
- League-level stats
- Data quality metrics

Output:
- ai-engine/models/xgboost/xgb_ht_ft.json (V20 + V25 compatible)
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import pickle
|
||
import psycopg2
|
||
from psycopg2.extras import RealDictCursor
|
||
import pandas as pd
|
||
import numpy as np
|
||
import xgboost as xgb
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
|
||
# Add this script's parent directory to sys.path so the `features` package is importable
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
from features.htft_tendency_engine import HtftTendencyEngine
|
||
|
||
# Database connection string, taken from DATABASE_URL when it is set.
# Anything from the first "?" onwards (e.g. "?schema=public") is dropped
# because psycopg2 does not accept it.
# NOTE(review): the fallback URL embeds a plaintext password. Prefer supplying
# DATABASE_URL through the environment and rotating this credential.
DB_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db',
).partition('?')[0]

# HT/FT class names; the list position is the integer class label.
HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"]

# Output locations: <parent of this script's directory>/models/xgboost/
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODEL_DIR = os.path.join(_PROJECT_ROOT, 'models', 'xgboost')
MODEL_PATH_JSON = os.path.join(MODEL_DIR, 'xgb_ht_ft.json')
MODEL_PATH_PKL = os.path.join(MODEL_DIR, 'xgb_ht_ft.pkl')
|
||
|
||
|
||
def fetch_matches():
    """Fetch completed football matches that have both HT and FT scores.

    Only rows with sport ``'football'``, status ``'FT'`` and non-null
    half-time scores, full-time scores and ``mst_utc`` are selected. Rows are
    ordered by ``mst_utc`` ascending.

    Returns:
        list: The rows from ``RealDictCursor.fetchall()`` (dict-like, keyed by
        column name).

    Raises:
        psycopg2.Error: If connecting or querying fails. The cursor and the
        connection are closed in either case.
    """
    print("📊 Fetching completed football matches...")

    query = """
        SELECT
            m.id,
            m.home_team_id,
            m.away_team_id,
            m.league_id,
            m.sport,
            m.mst_utc,
            m.ht_score_home,
            m.ht_score_away,
            m.score_home,
            m.score_away
        FROM matches m
        WHERE m.sport = 'football'
            AND m.status = 'FT'
            AND m.ht_score_home IS NOT NULL
            AND m.ht_score_away IS NOT NULL
            AND m.score_home IS NOT NULL
            AND m.score_away IS NOT NULL
            AND m.mst_utc IS NOT NULL
        ORDER BY m.mst_utc ASC
    """

    conn = psycopg2.connect(DB_URL)
    # try/finally rather than `with conn:` because psycopg2's connection
    # context manager only ends the transaction; it does not close the
    # connection.
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        try:
            cur.execute(query)
            matches = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    print(f"✅ Fetched {len(matches)} matches")
    return matches
|
||
|
||
|
||
def compute_htft_label(ht_home, ht_away, ft_home, ft_away):
    """Encode a half-time / full-time outcome pair as one integer in 0..8.

    Each period is reduced to an outcome code (0 = home ahead, 1 = level,
    2 = away ahead) and the two codes are combined as
    ``ht_code * 3 + ft_code``.

    Args:
        ht_home: Home goals at half time.
        ht_away: Away goals at half time.
        ft_home: Home goals at full time.
        ft_away: Away goals at full time.

    Returns:
        int: The combined label, ``ht_code * 3 + ft_code``.
    """
    def outcome_code(home_goals, away_goals):
        # 0 = home ahead, 1 = level, 2 = away ahead
        if home_goals > away_goals:
            return 0
        return 1 if home_goals == away_goals else 2

    ht_code = outcome_code(ht_home, ht_away)
    ft_code = outcome_code(ft_home, ft_away)
    return ht_code * 3 + ft_code
|
||
|
||
|
||
def extract_features_and_labels(matches):
    """Build one feature dict and one HT/FT label per match with full 1/X/2 odds.

    For each match the odds rows are read from ``odd_categories`` joined with
    ``odd_selections``. Matches lacking any of the three full-time odds
    (home / draw / away) are skipped. Tendency features come from
    ``HtftTendencyEngine.get_features``; if that call raises, the engine's
    empty feature set is used instead and the failure is counted and reported.

    Args:
        matches: Sequence of dict-like rows with the keys ``id``,
            ``home_team_id``, ``away_team_id``, ``league_id``, ``mst_utc``,
            ``ht_score_home``, ``ht_score_away``, ``score_home`` and
            ``score_away``.

    Returns:
        tuple: ``(features_list, labels, match_ids)`` as three parallel lists
        in the order the kept matches were visited.

    Raises:
        psycopg2.Error: If connecting or querying fails. The cursor and the
        connection are closed in either case.
    """
    print("\n🔧 Extracting features...")

    # Loop-invariant query text, hoisted out of the per-match loop.
    odds_query = """
        SELECT oc.name as category_name, os.name as selection_name, os.odd_value
        FROM odd_categories oc
        JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
        WHERE oc.match_id = %s
    """
    required_odds = ('ms_h', 'ms_d', 'ms_a')

    features_list = []
    labels = []
    match_ids = []
    engine_failures = 0
    total = len(matches)

    conn = psycopg2.connect(DB_URL)
    # try/finally so the connection and cursor are released even when a query
    # or a row conversion fails part-way through the loop.
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        try:
            htft_engine = HtftTendencyEngine()

            for idx, match in enumerate(matches):
                if idx % 1000 == 0:
                    print(f" Processing {idx}/{total}...")

                mid = match['id']
                hid = str(match['home_team_id'])
                aid = str(match['away_team_id'])
                lid = str(match['league_id']) if match['league_id'] else None
                mst = int(match['mst_utc'])

                cur.execute(odds_query, (mid,))
                odds, ht_odds = _parse_match_odds(cur.fetchall())

                # A sample is only usable with all three full-time odds.
                if any(key not in odds for key in required_odds):
                    continue

                label = compute_htft_label(
                    match['ht_score_home'],
                    match['ht_score_away'],
                    match['score_home'],
                    match['score_away'],
                )

                try:
                    htft_feats = htft_engine.get_features(hid, aid, lid, mst)
                except Exception as exc:
                    # Deliberate best-effort: keep the sample with default
                    # tendency features, but do not hide the failure.
                    engine_failures += 1
                    if engine_failures == 1:
                        print(
                            f"⚠️ Tendency engine failed for match {mid}: "
                            f"{exc!r} (falling back to default features)"
                        )
                    htft_feats = htft_engine._empty_features()

                features_list.append(
                    _build_htft_feature_row(odds, ht_odds, htft_feats)
                )
                labels.append(label)
                match_ids.append(mid)
        finally:
            cur.close()
    finally:
        conn.close()

    if engine_failures:
        print(
            f"⚠️ Tendency engine fell back to defaults for "
            f"{engine_failures} matches"
        )
    print(f"✅ Extracted {len(features_list)} samples with features")

    return features_list, labels, match_ids


# Maps a lower-cased selection name to the outcome suffix used in odds keys.
_ODDS_OUTCOME_SUFFIX = {'1': 'h', 'x': 'd', '0': 'd', '2': 'a'}

# (engine feature key, default used when the engine omits the key), in the
# exact order the features appear in the training matrix.
_HTFT_TENDENCY_DEFAULTS = (
    # Home team tendencies
    ('home_ht_scoring_rate', 0.5),
    ('home_ht_concede_rate', 0.5),
    ('home_ht_win_rate', 0.33),
    ('home_comeback_rate', 0.0),
    ('home_first_half_goal_pct', 0.5),
    ('home_second_half_surge', 1.0),
    # Away team tendencies
    ('away_ht_scoring_rate', 0.5),
    ('away_ht_concede_rate', 0.5),
    ('away_ht_win_rate', 0.33),
    ('away_comeback_rate', 0.0),
    ('away_first_half_goal_pct', 0.5),
    ('away_second_half_surge', 1.0),
    # League-level
    ('league_avg_ht_goals', 1.0),
    ('league_reversal_rate', 0.05),
    ('league_first_half_pct', 0.44),
    # Data quality
    ('home_sample_size', 0.0),
    ('away_sample_size', 0.0),
)


def _parse_match_odds(odds_rows):
    """Split raw odds rows into full-time and half-time 1/X/2 odds dicts.

    Rows whose selection is not 1 / X (or 0) / 2, or whose odd value is
    missing, non-numeric or not strictly positive, are ignored.

    Returns:
        tuple: ``(odds, ht_odds)`` keyed ``ms_h/ms_d/ms_a`` and
        ``ht_ms_h/ht_ms_d/ht_ms_a`` respectively.
    """
    odds = {}
    ht_odds = {}

    for row in odds_rows:
        cat = (row['category_name'] or '').lower()
        sel = (row['selection_name'] or '').lower()

        suffix = _ODDS_OUTCOME_SUFFIX.get(sel)
        if suffix is None:
            continue

        try:
            val = float(row['odd_value'])
        except (TypeError, ValueError):
            continue
        # `not val > 0` also rejects NaN; a zero odd would otherwise cause a
        # ZeroDivisionError when the implied probability is computed.
        if not val > 0:
            continue

        # NOTE(review): these are substring matches. Any other category whose
        # name contains 'maç sonucu' and has 1/X/2 selections would overwrite
        # the full-time odds; confirm against the odds catalogue.
        if '1.yarı sonucu' in cat:
            ht_odds[f'ht_ms_{suffix}'] = val
        elif 'maç sonucu' in cat:
            odds[f'ms_{suffix}'] = val

    return odds, ht_odds


def _build_htft_feature_row(odds, ht_odds, htft_feats):
    """Assemble the ordered feature dict for one match.

    ``odds`` must contain ``ms_h``, ``ms_d`` and ``ms_a``. Missing half-time
    odds fall back to 3.0 / 2.1 / 3.5.
    """
    ms_h = odds['ms_h']
    ms_d = odds['ms_d']
    ms_a = odds['ms_a']

    feat = {
        # MS odds
        'odds_ms_h': ms_h,
        'odds_ms_d': ms_d,
        'odds_ms_a': ms_a,
        'implied_home': 1.0 / ms_h,
        'implied_draw': 1.0 / ms_d,
        'implied_away': 1.0 / ms_a,
        'fav_gap': abs(ms_h - ms_a),

        # HT odds
        'ht_implied_home': 1.0 / ht_odds.get('ht_ms_h', 3.0),
        'ht_implied_draw': 1.0 / ht_odds.get('ht_ms_d', 2.1),
        'ht_implied_away': 1.0 / ht_odds.get('ht_ms_a', 3.5),
    }

    # HT/FT tendencies, league-level stats and data-quality metrics.
    for key, default in _HTFT_TENDENCY_DEFAULTS:
        feat[f'htft_{key}'] = htft_feats.get(key, default)

    return feat
|
||
|
||
|
||
def train_model(features_list, labels):
    """Train the 9-class HT/FT XGBoost classifier with balanced sample weights.

    The samples are assumed to be in chronological order and are split by
    position: the first 80% form the training window and the last 20% the test
    set. The final 10% of the training window is held out as a validation
    slice used only for early stopping, so the test set never influences how
    many boosting rounds are kept. No probability calibration is performed.

    Args:
        features_list: List of feature dicts, one per sample.
        labels: List of integer class labels (0..8), parallel to
            ``features_list``.

    Returns:
        tuple: ``(model, feature_names)``, the fitted ``xgb.XGBClassifier``
        and the list of feature column names in training order.

    Raises:
        ValueError: If there are no samples, or if a class has no samples in
            the fitting slice.
    """
    from sklearn.utils.class_weight import compute_class_weight

    print("\n🎯 Training HT/FT XGBoost model...")

    if len(labels) == 0:
        raise ValueError("Cannot train HT/FT model: no samples provided")

    n_classes = len(HTFT_LABELS)
    class_ids = np.arange(n_classes)

    # Convert to DataFrame / integer label array
    X = pd.DataFrame(features_list)
    y = np.asarray(labels, dtype=int)

    # Print class distribution
    print("\n📊 Class distribution:")
    for class_id, label_name in enumerate(HTFT_LABELS):
        count = np.sum(y == class_id)
        print(f" {label_name}: {count} ({count/len(y)*100:.1f}%)")

    # Time-based split: 80% training window / 20% test, with the last 10% of
    # the training window reserved for early stopping.
    split_idx = int(len(X) * 0.8)
    val_idx = int(split_idx * 0.9)

    X_fit, y_fit = X.iloc[:val_idx], y[:val_idx]
    X_val, y_val = X.iloc[val_idx:split_idx], y[val_idx:split_idx]
    X_test, y_test = X.iloc[split_idx:], y[split_idx:]

    print(
        f"\n📈 Train size: {len(X_fit)}, "
        f"Validation size: {len(X_val)}, Test size: {len(X_test)}"
    )

    # Fail early with a readable message when a class is absent from the
    # fitting slice; balanced class weights cannot be computed in that case.
    missing = [
        name for class_id, name in enumerate(HTFT_LABELS)
        if not np.any(y_fit == class_id)
    ]
    if missing:
        raise ValueError(
            f"Training split has no samples for classes: {missing}"
        )

    # Compute class weights (handle imbalance)
    class_weights = compute_class_weight('balanced', classes=class_ids, y=y_fit)
    sample_weights = class_weights[y_fit]

    rounded_weights = [round(float(w), 2) for w in class_weights]
    print(f"\n⚖️ Class weights: {dict(zip(HTFT_LABELS, rounded_weights))}")

    # Train XGBoost
    model = xgb.XGBClassifier(
        n_estimators=400,
        max_depth=7,
        learning_rate=0.05,
        objective='multi:softprob',
        num_class=n_classes,
        eval_metric='mlogloss',
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=20,  # set on the estimator for newer XGBoost versions
    )

    model.fit(
        X_fit, y_fit,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],  # validation slice, never the test set
        verbose=False,
    )

    # Evaluate on the untouched test set
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n✅ Test Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

    # Explicit `labels` keep the report and the matrix at 9 classes even when
    # a rare class does not occur in the test window; without it,
    # `target_names` raises on a length mismatch.
    print("\n📊 Classification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=HTFT_LABELS,
        zero_division=0,
    ))

    print("\n🔲 Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print(cm)

    # Feature importance
    print("\n🔝 Top 15 Features:")
    importance = model.feature_importances_
    feat_importance = sorted(
        zip(X.columns, importance), key=lambda pair: pair[1], reverse=True
    )[:15]
    for feat, imp in feat_importance:
        print(f" {feat}: {imp:.4f}")

    return model, X.columns.tolist()
|
||
|
||
|
||
def save_model(model, feature_names):
    """Persist the trained model and its feature list under ``MODEL_DIR``.

    Three files are written: the booster as JSON (``MODEL_PATH_JSON``), the
    full model object as a pickle (``MODEL_PATH_PKL``) and the feature names
    as ``htft_features.json``.

    Args:
        model: Fitted model exposing ``get_booster()``.
        feature_names: JSON-serialisable list of feature names.
    """
    print("\n💾 Saving model...")

    # Make sure the output directory exists
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Booster as JSON (for V25 + V20)
    booster = model.get_booster()
    booster.save_model(MODEL_PATH_JSON)
    print(f"✅ Saved JSON model: {MODEL_PATH_JSON}")

    # Whole model object as pickle (for the V20 sklearn wrapper)
    with open(MODEL_PATH_PKL, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)
    print(f"✅ Saved PKL model: {MODEL_PATH_PKL}")

    # Feature names as JSON
    features_path = os.path.join(MODEL_DIR, 'htft_features.json')
    serialized_names = json.dumps(feature_names, indent=2)
    with open(features_path, 'w') as names_file:
        names_file.write(serialized_names)
    print(f"✅ Saved features: {features_path}")
|
||
|
||
|
||
def test_model_loading():
    """Smoke-test that both saved model files can be read back.

    Loads ``MODEL_PATH_JSON`` into a raw ``xgb.Booster`` and unpickles
    ``MODEL_PATH_PKL``, printing the number of features each one reports.
    Any loading error propagates to the caller.
    """
    print("\n🧪 Testing model loading...")

    # V25 path: raw xgb.Booster from JSON
    import xgboost as xgb
    json_booster = xgb.Booster()
    json_booster.load_model(MODEL_PATH_JSON)
    json_feature_count = len(json_booster.feature_names)
    print(f"✅ V25 booster loaded from JSON, features: {json_feature_count}")

    # V20 path: sklearn wrapper from PKL
    # NOTE: pickle.load executes arbitrary code; only use it on files this
    # script wrote itself.
    with open(MODEL_PATH_PKL, 'rb') as pkl_file:
        pickled_model = pickle.load(pkl_file)
    pkl_feature_count = len(pickled_model.feature_names_in_)
    print(f"✅ V20 model loaded from PKL, features: {pkl_feature_count}")

    print("\n✅ All model loading tests passed!")
|
||
|
||
|
||
def main():
    """Run the pipeline: fetch, featurise, train, save, then verify loading.

    Returns early, after printing a message, when no matches are fetched or
    no features could be extracted.
    """
    banner = "=" * 80

    print(banner)
    print("🚀 HT/FT (İY/MS) MODEL TRAINING - VQWEN v3")
    print(banner)

    # 1. Fetch matches
    matches = fetch_matches()
    if not matches:
        print("❌ No matches found")
        return

    # 2. Extract features and labels (the match ids are not needed here)
    features_list, labels, _match_ids = extract_features_and_labels(matches)
    if not features_list:
        print("❌ No features extracted")
        return

    # 3. Train model
    model, feature_names = train_model(features_list, labels)

    # 4. Save model
    save_model(model, feature_names)

    # 5. Test loading
    test_model_loading()

    # Final summary
    summary_lines = (
        "\n" + banner,
        "🎉 TRAINING COMPLETE",
        banner,
        "\n📊 Model files:",
        f" JSON (V25+V20): {MODEL_PATH_JSON}",
        f" PKL (V20): {MODEL_PATH_PKL}",
        f" Features: {MODEL_DIR}/htft_features.json",
        f"\n📈 Total samples: {len(features_list)}",
        f"🎯 Classes: {len(HTFT_LABELS)}",
    )
    for line in summary_lines:
        print(line)
|
||
|
||
|
||
# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|