first (part 2: other directories)

2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
@@ -0,0 +1,192 @@
+"""
+Card Market XGBoost Model Trainer
+==================================
+Trains XGBoost models for the card markets (over/under 3.5, 4.5 and 5.5).
+
+Usage:
+    python3 scripts/train_cards_model.py
+"""
+
+import os
+import sys
+import pickle
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report
+
+# Config
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data_cards.csv")
+MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")
+
+os.makedirs(MODELS_DIR, exist_ok=True)
+
+# Feature columns
+FEATURES = [
+    # Referee features
+    "ref_matches",
+    "ref_avg_yellow",
+    "ref_avg_red",
+    "ref_avg_total",
+    
+    # Team features
+    "home_team_matches",
+    "home_team_avg_cards",
+    "away_team_matches",
+    "away_team_avg_cards",
+    
+    # League features
+    "league_avg_cards",
+    "league_match_count",
+    
+    # Derived
+    "combined_team_avg",
+    "ref_team_combined",
+]
+
+
+def load_data():
+    if not os.path.exists(DATA_PATH):
+        print(f"❌ Data file not found: {DATA_PATH}")
+        print("   Run extract_card_training_data.py first!")
+        sys.exit(1)
+    
+    print(f"📦 Loading data from {DATA_PATH}...")
+    df = pd.read_csv(DATA_PATH)
+    df.fillna(0, inplace=True)
+    print(f"   Shape: {df.shape}")
+    return df
+
+
+def train_card_model(df, target_col, model_name):
+    """Kart modeli eğit"""
+    
+    print(f"\n🚀 Training {model_name} (Target: {target_col})...")
+    
+    # Filter valid rows
+    valid_df = df[df[target_col].notna()].copy()
+    if valid_df.empty:
+        print(f"   ⚠️ No valid data for {target_col}, skipping.")
+        return None
+    
+    X = valid_df[FEATURES]
+    y = valid_df[target_col].astype(int)
+    
+    print(f"   Target distribution: {dict(y.value_counts())}")
+    
+    # Split
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+    
+    # Model params
+    params = {
+        'objective': 'binary:logistic',
+        'eval_metric': 'logloss',
+        'eta': 0.05,
+        'max_depth': 5,
+        'subsample': 0.8,
+        'colsample_bytree': 0.8,
+        'min_child_weight': 3,
+        'nthread': 4,
+        'seed': 42
+    }
+    
+    # Train with cross-validation
+    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+    cv_scores = []
+    
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
+        X_t, X_v = X_train.iloc[train_idx], X_train.iloc[val_idx]
+        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]
+        
+        dtrain = xgb.DMatrix(X_t, label=y_t, feature_names=FEATURES)
+        dval = xgb.DMatrix(X_v, label=y_v, feature_names=FEATURES)
+        
+        model = xgb.train(
+            params,
+            dtrain,
+            num_boost_round=500,
+            evals=[(dval, 'eval')],
+            early_stopping_rounds=30,
+            verbose_eval=False
+        )
+        
+        preds = model.predict(dval)
+        auc = roc_auc_score(y_v, preds)
+        cv_scores.append(auc)
+        print(f"   Fold {fold+1} AUC: {auc:.4f}")
+    
+    print(f"   Mean CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
+    
+    # Train final model on all training data
+    dtrain_full = xgb.DMatrix(X_train, label=y_train, feature_names=FEATURES)
+    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=FEATURES)
+    
+    final_model = xgb.train(
+        params,
+        dtrain_full,
+        num_boost_round=300,
+        verbose_eval=False
+    )
+    
+    # Evaluate
+    test_preds = final_model.predict(dtest)
+    test_pred_class = (test_preds > 0.5).astype(int)
+    
+    acc = accuracy_score(y_test, test_pred_class)
+    auc = roc_auc_score(y_test, test_preds)
+    
+    print(f"\n📊 Test Results:")
+    print(f"   Accuracy: {acc:.4f}")
+    print(f"   AUC: {auc:.4f}")
+    print(classification_report(y_test, test_pred_class))
+    
+    # Feature importance
+    importance = final_model.get_score(importance_type='gain')
+    print(f"\n🔍 Top Features:")
+    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5]
+    for feat, score in sorted_importance:
+        print(f"   {feat}: {score:.2f}")
+    
+    # Save model
+    model_path = os.path.join(MODELS_DIR, f"xgb_{model_name.lower()}.json")
+    final_model.save_model(model_path)
+    print(f"\n💾 Model saved to: {model_path}")
+    
+    return final_model
+
+
+def main():
+    df = load_data()
+    
+    # Train multiple card models
+    models = []
+    
+    # 1. Cards Over 4.5
+    model_45 = train_card_model(df, "label_cards_over45", "cards45")
+    models.append(("cards_over_45", model_45))
+    
+    # 2. Cards Over 3.5
+    model_35 = train_card_model(df, "label_cards_over35", "cards35")
+    models.append(("cards_over_35", model_35))
+    
+    # 3. Cards Over 5.5
+    model_55 = train_card_model(df, "label_cards_over55", "cards55")
+    models.append(("cards_over_55", model_55))
+    
+    print("\n" + "="*60)
+    print("✅ All card models trained successfully!")
+    print(f"📁 Models saved to: {MODELS_DIR}")
+    
+    # List saved files
+    import glob
+    card_files = glob.glob(os.path.join(MODELS_DIR, "xgb_cards*.json"))
+    for f in card_files:
+        print(f"   - {os.path.basename(f)}")
+
+
+if __name__ == "__main__":
+    main()