""" HT/FT (İY/MS) Model Training Script - VQWEN v3 Bu script İY/MS (Half Time / Full Time) tahmini için XGBoost modeli eğitir. 9 sınıf: 1/1, 1/X, 1/2, X/1, X/X, X/2, 2/1, 2/X, 2/2 Features: - Odds (MS + HT) - HT/FT Tendency Engine (takımların ilk yarı/ikinci yarı performansları) - League-level stats - Data quality metrics Output: - ai-engine/models/xgboost/xgb_ht_ft.json (V20 + V25 compatible) """ import os import sys import json import pickle import psycopg2 from psycopg2.extras import RealDictCursor import pandas as pd import numpy as np import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix, accuracy_score from sklearn.calibration import CalibratedClassifierCV # Add parent directorys to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from features.htft_tendency_engine import HtftTendencyEngine # Database connection DB_URL = os.getenv('DATABASE_URL', 'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db') # Remove ?schema=public if present (psycopg2 doesn't accept it) if '?' in DB_URL: DB_URL = DB_URL.split('?')[0] # HT/FT Labels HTFT_LABELS = ["1/1", "1/X", "1/2", "X/1", "X/X", "X/2", "2/1", "2/X", "2/2"] # Save path MODEL_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models', 'xgboost') MODEL_PATH_JSON = os.path.join(MODEL_DIR, 'xgb_ht_ft.json') MODEL_PATH_PKL = os.path.join(MODEL_DIR, 'xgb_ht_ft.pkl') def fetch_matches(): """Fetch completed football matches with HT and FT scores""" print("📊 Fetching completed football matches...") conn = psycopg2.connect(DB_URL) cur = conn.cursor(cursor_factory=RealDictCursor) cur.execute(""" SELECT m.id, m.home_team_id, m.away_team_id, m.league_id, m.sport, m.mst_utc, m.ht_score_home, m.ht_score_away, m.score_home, m.score_away FROM matches m WHERE m.sport = 'football' AND m.status = 'FT' AND m.ht_score_home IS NOT NULL AND m.ht_score_away IS NOT NULL AND m.score_home IS NOT NULL AND m.score_away IS NOT NULL AND m.mst_utc IS NOT NULL ORDER BY m.mst_utc ASC """) matches = cur.fetchall() print(f"✅ Fetched {len(matches)} matches") cur.close() conn.close() return matches def compute_htft_label(ht_home, ht_away, ft_home, ft_away): """ Compute HT/FT label as integer 0-8 HT result: 0=home, 1=draw, 2=away FT result: 0=home, 1=draw, 2=away Label = ht_result * 3 + ft_result """ if ht_home > ht_away: ht_result = 0 elif ht_home == ht_away: ht_result = 1 else: ht_result = 2 if ft_home > ft_away: ft_result = 0 elif ft_home == ft_away: ft_result = 1 else: ft_result = 2 return ht_result * 3 + ft_result def extract_features_and_labels(matches): """Extract features using HT/FT Tendency Engine + Odds""" print("\n🔧 Extracting features...") conn = psycopg2.connect(DB_URL) cur = conn.cursor(cursor_factory=RealDictCursor) htft_engine = HtftTendencyEngine() features_list = [] labels = [] match_ids = [] for idx, match in enumerate(matches): if idx % 1000 == 0: print(f" Processing {idx}/{len(matches)}...") mid = match['id'] hid = str(match['home_team_id']) aid = str(match['away_team_id']) lid = str(match['league_id']) if match['league_id'] else None mst = int(match['mst_utc']) # Fetch odds (MS and HT) cur.execute(""" SELECT oc.name as category_name, os.name as selection_name, os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = %s """, (mid,)) odds_rows = cur.fetchall() odds = {} ht_odds = {} for row in odds_rows: cat = row['category_name'].lower() sel = row['selection_name'].lower() val = float(row['odd_value']) if 'maç sonucu' in cat or '1.yarı sonucu' in cat: if '1.yarı sonucu' in cat: if sel == '1': ht_odds['ht_ms_h'] = val elif sel in ('x', '0'): ht_odds['ht_ms_d'] = val elif sel == '2': ht_odds['ht_ms_a'] = val else: if sel == '1': odds['ms_h'] = val elif sel in ('x', '0'): odds['ms_d'] = val elif sel == '2': odds['ms_a'] = val # Skip if no odds if 'ms_h' not in odds or 'ms_d' not in odds or 'ms_a' not in odds: continue # Compute HT/FT label label = compute_htft_label( match['ht_score_home'], match['ht_score_away'], match['score_home'], match['score_away'] ) # Extract HT/FT tendency features try: htft_feats = htft_engine.get_features(hid, aid, lid, mst) except Exception as e: # Fallback to defaults htft_feats = htft_engine._empty_features() # Build feature dict feat = { # MS Odds 'odds_ms_h': odds.get('ms_h', 2.0), 'odds_ms_d': odds.get('ms_d', 3.2), 'odds_ms_a': odds.get('ms_a', 3.5), 'implied_home': 1.0 / odds.get('ms_h', 2.0), 'implied_draw': 1.0 / odds.get('ms_d', 3.2), 'implied_away': 1.0 / odds.get('ms_a', 3.5), 'fav_gap': abs(odds.get('ms_h', 2.0) - odds.get('ms_a', 3.5)), # HT Odds 'ht_implied_home': 1.0 / ht_odds.get('ht_ms_h', 3.0), 'ht_implied_draw': 1.0 / ht_odds.get('ht_ms_d', 2.1), 'ht_implied_away': 1.0 / ht_odds.get('ht_ms_a', 3.5), # HT/FT Tendencies (from engine) 'htft_home_ht_scoring_rate': htft_feats.get('home_ht_scoring_rate', 0.5), 'htft_home_ht_concede_rate': htft_feats.get('home_ht_concede_rate', 0.5), 'htft_home_ht_win_rate': htft_feats.get('home_ht_win_rate', 0.33), 'htft_home_comeback_rate': htft_feats.get('home_comeback_rate', 0.0), 'htft_home_first_half_goal_pct': htft_feats.get('home_first_half_goal_pct', 0.5), 'htft_home_second_half_surge': htft_feats.get('home_second_half_surge', 1.0), 'htft_away_ht_scoring_rate': htft_feats.get('away_ht_scoring_rate', 0.5), 'htft_away_ht_concede_rate': htft_feats.get('away_ht_concede_rate', 0.5), 'htft_away_ht_win_rate': htft_feats.get('away_ht_win_rate', 0.33), 'htft_away_comeback_rate': htft_feats.get('away_comeback_rate', 0.0), 'htft_away_first_half_goal_pct': htft_feats.get('away_first_half_goal_pct', 0.5), 'htft_away_second_half_surge': htft_feats.get('away_second_half_surge', 1.0), # League-level 'htft_league_avg_ht_goals': htft_feats.get('league_avg_ht_goals', 1.0), 'htft_league_reversal_rate': htft_feats.get('league_reversal_rate', 0.05), 'htft_league_first_half_pct': htft_feats.get('league_first_half_pct', 0.44), # Data quality 'htft_home_sample_size': htft_feats.get('home_sample_size', 0.0), 'htft_away_sample_size': htft_feats.get('away_sample_size', 0.0), } features_list.append(feat) labels.append(label) match_ids.append(mid) cur.close() conn.close() print(f"✅ Extracted {len(features_list)} samples with features") return features_list, labels, match_ids def train_model(features_list, labels): """Train XGBoost classifier with class weights and calibration""" print("\n🎯 Training HT/FT XGBoost model...") # Convert to DataFrame X = pd.DataFrame(features_list) y = np.array(labels) # Print class distribution print("\n📊 Class distribution:") for i, label_name in enumerate(HTFT_LABELS): count = np.sum(y == i) print(f" {label_name}: {count} ({count/len(y)*100:.1f}%)") # Time-based split (80/20) split_idx = int(len(X) * 0.8) X_train = X.iloc[:split_idx] X_test = X.iloc[split_idx:] y_train = y[:split_idx] y_test = y[split_idx:] print(f"\n📈 Train size: {len(X_train)}, Test size: {len(X_test)}") # Compute class weights (handle imbalance) from sklearn.utils.class_weight import compute_class_weight class_weights = compute_class_weight('balanced', classes=np.arange(9), y=y_train) sample_weights = np.array([class_weights[label] for label in y_train]) print(f"\n⚖️ Class weights: {dict(zip(HTFT_LABELS, [round(w, 2) for w in class_weights]))}") # Train XGBoost model = xgb.XGBClassifier( n_estimators=400, max_depth=7, learning_rate=0.05, objective='multi:softprob', num_class=9, eval_metric='mlogloss', subsample=0.8, colsample_bytree=0.8, min_child_weight=5, gamma=0.1, reg_alpha=0.1, reg_lambda=1.0, random_state=42, n_jobs=-1, early_stopping_rounds=20, # Move to init for newer XGBoost versions ) model.fit( X_train, y_train, sample_weight=sample_weights, eval_set=[(X_test, y_test)], verbose=False, ) # Evaluate y_pred = model.predict(X_test) y_pred_proba = model.predict_proba(X_test) accuracy = accuracy_score(y_test, y_pred) print(f"\n✅ Test Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)") # Classification report print("\n📊 Classification Report:") print(classification_report(y_test, y_pred, target_names=HTFT_LABELS, zero_division=0)) # Confusion matrix print("\n🔲 Confusion Matrix:") cm = confusion_matrix(y_test, y_pred) print(cm) # Feature importance print("\n🔝 Top 15 Features:") importance = model.feature_importances_ feat_importance = sorted(zip(X.columns, importance), key=lambda x: x[1], reverse=True)[:15] for feat, imp in feat_importance: print(f" {feat}: {imp:.4f}") return model, X.columns.tolist() def save_model(model, feature_names): """Save model in both JSON and PKL formats""" print("\n💾 Saving model...") # Create directory os.makedirs(MODEL_DIR, exist_ok=True) # Save as JSON (for V25 + V20) model.get_booster().save_model(MODEL_PATH_JSON) print(f"✅ Saved JSON model: {MODEL_PATH_JSON}") # Save as PKL (for V20 sklearn wrapper) with open(MODEL_PATH_PKL, 'wb') as f: pickle.dump(model, f) print(f"✅ Saved PKL model: {MODEL_PATH_PKL}") # Save feature names as JSON features_path = os.path.join(MODEL_DIR, 'htft_features.json') with open(features_path, 'w') as f: json.dump(feature_names, f, indent=2) print(f"✅ Saved features: {features_path}") def test_model_loading(): """Test that models can be loaded by V20 and V25""" print("\n🧪 Testing model loading...") # Test V25 loading (raw xgb.Booster from JSON) import xgboost as xgb booster = xgb.Booster() booster.load_model(MODEL_PATH_JSON) print(f"✅ V25 booster loaded from JSON, features: {len(booster.feature_names)}") # Test V20 loading (sklearn wrapper from PKL) with open(MODEL_PATH_PKL, 'rb') as f: model_pkl = pickle.load(f) print(f"✅ V20 model loaded from PKL, features: {len(model_pkl.feature_names_in_)}") print("\n✅ All model loading tests passed!") def main(): print("="*80) print("🚀 HT/FT (İY/MS) MODEL TRAINING - VQWEN v3") print("="*80) # 1. Fetch matches matches = fetch_matches() if not matches: print("❌ No matches found") return # 2. Extract features and labels features_list, labels, match_ids = extract_features_and_labels(matches) if not features_list: print("❌ No features extracted") return # 3. Train model model, feature_names = train_model(features_list, labels) # 4. Save model save_model(model, feature_names) # 5. Test loading test_model_loading() print("\n" + "="*80) print("🎉 TRAINING COMPLETE") print("="*80) print(f"\n📊 Model files:") print(f" JSON (V25+V20): {MODEL_PATH_JSON}") print(f" PKL (V20): {MODEL_PATH_PKL}") print(f" Features: {MODEL_DIR}/htft_features.json") print(f"\n📈 Total samples: {len(features_list)}") print(f"🎯 Classes: {len(HTFT_LABELS)}") if __name__ == '__main__': main()