""" Card Market XGBoost Model Trainer ================================== Kart (4.5 Alt/Üst, 5.5 Alt/Üst) için XGBoost modeli eğitir. Usage: python3 scripts/train_cards_model.py """ import os import sys import pickle import numpy as np import pandas as pd import xgboost as xgb from sklearn.model_selection import train_test_split, StratifiedKFold from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report # Config AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data_cards.csv") MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost") os.makedirs(MODELS_DIR, exist_ok=True) # Feature columns FEATURES = [ # Referee features "ref_matches", "ref_avg_yellow", "ref_avg_red", "ref_avg_total", # Team features "home_team_matches", "home_team_avg_cards", "away_team_matches", "away_team_avg_cards", # League features "league_avg_cards", "league_match_count", # Derived "combined_team_avg", "ref_team_combined", ] def load_data(): if not os.path.exists(DATA_PATH): print(f"❌ Data file not found: {DATA_PATH}") print(" Run extract_card_training_data.py first!") sys.exit(1) print(f"📦 Loading data from {DATA_PATH}...") df = pd.read_csv(DATA_PATH) df.fillna(0, inplace=True) print(f" Shape: {df.shape}") return df def train_card_model(df, target_col, model_name): """Kart modeli eğit""" print(f"\n🚀 Training {model_name} (Target: {target_col})...") # Filter valid rows valid_df = df[df[target_col].notna()].copy() if valid_df.empty: print(f" ⚠️ No valid data for {target_col}, skipping.") return None X = valid_df[FEATURES] y = valid_df[target_col].astype(int) print(f" Target distribution: {dict(y.value_counts())}") # Split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) # Model params params = { 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'eta': 0.05, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'nthread': 4, 'seed': 42 } # Train with cross-validation skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) cv_scores = [] for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)): X_t, X_v = X_train.iloc[train_idx], X_train.iloc[val_idx] y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx] dtrain = xgb.DMatrix(X_t, label=y_t, feature_names=FEATURES) dval = xgb.DMatrix(X_v, label=y_v, feature_names=FEATURES) model = xgb.train( params, dtrain, num_boost_round=500, evals=[(dval, 'eval')], early_stopping_rounds=30, verbose_eval=False ) preds = model.predict(dval) auc = roc_auc_score(y_v, preds) cv_scores.append(auc) print(f" Fold {fold+1} AUC: {auc:.4f}") print(f" Mean CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})") # Train final model on all training data dtrain_full = xgb.DMatrix(X_train, label=y_train, feature_names=FEATURES) dtest = xgb.DMatrix(X_test, label=y_test, feature_names=FEATURES) final_model = xgb.train( params, dtrain_full, num_boost_round=300, verbose_eval=False ) # Evaluate test_preds = final_model.predict(dtest) test_pred_class = (test_preds > 0.5).astype(int) acc = accuracy_score(y_test, test_pred_class) auc = roc_auc_score(y_test, test_preds) print(f"\n📊 Test Results:") print(f" Accuracy: {acc:.4f}") print(f" AUC: {auc:.4f}") print(classification_report(y_test, test_pred_class)) # Feature importance importance = final_model.get_score(importance_type='gain') print(f"\n🔍 Top Features:") sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5] for feat, score in sorted_importance: print(f" {feat}: {score:.2f}") # Save model model_path = os.path.join(MODELS_DIR, f"xgb_{model_name.lower()}.json") final_model.save_model(model_path) print(f"\n💾 Model saved to: {model_path}") return final_model def main(): df = load_data() # Train multiple card models models = [] # 1. Cards Over 4.5 model_45 = train_card_model(df, "label_cards_over45", "cards45") models.append(("cards_over_45", model_45)) # 2. Cards Over 3.5 model_35 = train_card_model(df, "label_cards_over35", "cards35") models.append(("cards_over_35", model_35)) # 3. Cards Over 5.5 model_55 = train_card_model(df, "label_cards_over55", "cards55") models.append(("cards_over_55", model_55)) print("\n" + "="*60) print("✅ All card models trained successfully!") print(f"📁 Models saved to: {MODELS_DIR}") # List saved files import glob card_files = glob.glob(os.path.join(MODELS_DIR, "xgb_cards*.json")) for f in card_files: print(f" - {os.path.basename(f)}") if __name__ == "__main__": main()