This commit is contained in:
Executable
+192
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
Card Market XGBoost Model Trainer
|
||||
==================================
|
||||
Trains XGBoost models for the cards markets (3.5, 4.5 and 5.5 Over/Under).
|
||||
|
||||
Usage:
|
||||
python3 scripts/train_cards_model.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
||||
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report
|
||||
|
||||
# Paths
# The engine root is the parent of the directory that contains this script.
AI_ENGINE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)

# Input CSV read by load_data().
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data_cards.csv")

# Output directory for the saved boosters; created at import time so that
# saving a model never fails on a missing folder.
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "xgboost")
os.makedirs(MODELS_DIR, exist_ok=True)
|
||||
|
||||
# Model input columns, in the order they are handed to XGBoost.
# Each unpacked tuple below is one group of related columns.
FEATURES = [
    # Referee features
    *("ref_matches", "ref_avg_yellow", "ref_avg_red", "ref_avg_total"),
    # Team features (home, then away)
    *("home_team_matches", "home_team_avg_cards"),
    *("away_team_matches", "away_team_avg_cards"),
    # League features
    *("league_avg_cards", "league_match_count"),
    # Derived
    *("combined_team_avg", "ref_team_combined"),
]
|
||||
|
||||
|
||||
def load_data():
    """Load the card training CSV into a DataFrame.

    Prints a hint and exits the process with status 1 when the file at
    ``DATA_PATH`` does not exist.

    Missing values are replaced with 0 only in the ``FEATURES`` columns that
    are present in the file. Every other column -- in particular the label
    columns -- keeps its NaN values, so that ``train_card_model`` can drop
    rows whose label is missing instead of silently training on them as
    class 0.

    Returns:
        pandas.DataFrame: The loaded data.
    """
    if not os.path.exists(DATA_PATH):
        print(f"❌ Data file not found: {DATA_PATH}")
        print(" Run extract_card_training_data.py first!")
        sys.exit(1)

    print(f"📦 Loading data from {DATA_PATH}...")
    df = pd.read_csv(DATA_PATH)

    # Blanket df.fillna(0) would also turn missing labels into 0 and defeat
    # the notna() label filter applied at training time.
    feature_cols = [col for col in FEATURES if col in df.columns]
    if feature_cols:
        df[feature_cols] = df[feature_cols].fillna(0)

    print(f" Shape: {df.shape}")
    return df
|
||||
|
||||
|
||||
def train_card_model(df, target_col, model_name):
    """Train, evaluate and save one binary XGBoost model for a card market.

    Steps:
      1. Keep only the rows whose label is present.
      2. Hold out a stratified 20% test split.
      3. Report the AUC of a 5-fold stratified cross-validation (with early
         stopping) on the remaining 80%.
      4. Fit the final booster on the whole training split.
      5. Print test accuracy, AUC, a classification report and the top five
         features by gain.
      6. Save the booster as JSON under ``MODELS_DIR``.

    Args:
        df: DataFrame holding the ``FEATURES`` columns and the label column.
        target_col: Name of the binary label column.
        model_name: Suffix of the saved file; the file is named
            ``xgb_<model_name lower-cased>.json``.

    Returns:
        The trained ``xgboost.Booster``, or ``None`` when the model is
        skipped (label column absent, no labelled rows, or fewer than two
        classes among the labelled rows).
    """
    print(f"\n🚀 Training {model_name} (Target: {target_col})...")

    # An absent label column would otherwise surface as a bare KeyError;
    # treat it like the other "nothing to train on" cases.
    if target_col not in df.columns:
        print(f" ⚠️ Column {target_col} not found in data, skipping.")
        return None

    # Filter valid rows
    valid_df = df[df[target_col].notna()].copy()
    if valid_df.empty:
        print(f" ⚠️ No valid data for {target_col}, skipping.")
        return None

    y = valid_df[target_col].astype(int)
    class_counts = y.value_counts()
    print(f" Target distribution: {dict(class_counts)}")

    # roc_auc_score raises ValueError when only one class is present, so a
    # single-class label cannot be evaluated below.
    if len(class_counts) < 2:
        print(f" ⚠️ Only one class present in {target_col}, skipping.")
        return None

    X = valid_df[FEATURES]

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Model params
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'nthread': 4,
        'seed': 42,
    }

    # Cross-validation is used for reporting only; the fold models are
    # discarded.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_t, X_v = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_t, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dtrain = xgb.DMatrix(X_t, label=y_t, feature_names=FEATURES)
        dval = xgb.DMatrix(X_v, label=y_v, feature_names=FEATURES)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=500,
            evals=[(dval, 'eval')],
            early_stopping_rounds=30,
            verbose_eval=False,
        )

        preds = model.predict(dval)
        auc = roc_auc_score(y_v, preds)
        cv_scores.append(auc)
        print(f" Fold {fold+1} AUC: {auc:.4f}")

    print(f" Mean CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

    # Train final model on all training data
    dtrain_full = xgb.DMatrix(X_train, label=y_train, feature_names=FEATURES)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=FEATURES)

    # Fixed round count: no evaluation set is passed here, so there is
    # nothing to early-stop against.
    final_model = xgb.train(
        params,
        dtrain_full,
        num_boost_round=300,
        verbose_eval=False,
    )

    # Evaluate on the held-out split, thresholding probabilities at 0.5.
    test_preds = final_model.predict(dtest)
    test_pred_class = (test_preds > 0.5).astype(int)

    acc = accuracy_score(y_test, test_pred_class)
    auc = roc_auc_score(y_test, test_preds)

    print("\n📊 Test Results:")
    print(f" Accuracy: {acc:.4f}")
    print(f" AUC: {auc:.4f}")
    print(classification_report(y_test, test_pred_class))

    # Feature importance
    importance = final_model.get_score(importance_type='gain')
    print("\n🔍 Top Features:")
    sorted_importance = sorted(
        importance.items(), key=lambda item: item[1], reverse=True
    )[:5]
    for feat, score in sorted_importance:
        print(f" {feat}: {score:.2f}")

    # Save model
    model_path = os.path.join(MODELS_DIR, f"xgb_{model_name.lower()}.json")
    final_model.save_model(model_path)
    print(f"\n💾 Model saved to: {model_path}")

    return final_model
|
||||
|
||||
|
||||
def main():
    """Train every card-market model and summarise the outcome.

    Loads the training data once, trains one model per market and then
    prints either a success line or the list of markets that were skipped
    (``train_card_model`` returns ``None`` for those), followed by the card
    model files found in ``MODELS_DIR``.
    """
    # Local import: only this summary step needs it.
    import glob

    df = load_data()

    # (result key, label column, file-name suffix)
    markets = [
        ("cards_over_45", "label_cards_over45", "cards45"),
        ("cards_over_35", "label_cards_over35", "cards35"),
        ("cards_over_55", "label_cards_over55", "cards55"),
    ]

    # Train multiple card models
    models = [
        (key, train_card_model(df, target_col, model_name))
        for key, target_col, model_name in markets
    ]
    skipped = [key for key, model in models if model is None]

    print("\n" + "="*60)
    # Only claim success when no market was skipped.
    if skipped:
        print(f"⚠️ Skipped models: {', '.join(skipped)}")
    else:
        print("✅ All card models trained successfully!")
    print(f"📁 Models saved to: {MODELS_DIR}")

    # List saved files. This shows whatever matches on disk, which may
    # include files written by earlier runs.
    card_files = sorted(glob.glob(os.path.join(MODELS_DIR, "xgb_cards*.json")))
    for path in card_files:
        print(f" - {os.path.basename(path)}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user