343 lines
26 KiB
Plaintext
343 lines
26 KiB
Plaintext
{
|
||
"nbformat": 4,
|
||
"nbformat_minor": 0,
|
||
"metadata": {
|
||
"colab": {
|
||
"provenance": [],
|
||
"gpuType": "T4"
|
||
},
|
||
"kernelspec": {
|
||
"name": "python3",
|
||
"display_name": "Python 3"
|
||
},
|
||
"accelerator": "GPU"
|
||
},
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# V25 Pro Model Trainer — Colab Edition\n",
|
||
"**152 feature + Optuna + Isotonic Calibration + GPU**\n",
|
||
"\n",
|
||
"### Kullanım:\n",
|
||
"1. Runtime → Change runtime type → **T4 GPU** seç\n",
|
||
"2. `training_data.csv` dosyasını Google Drive'a yükle\n",
|
||
"3. Hücreleri sırayla çalıştır\n",
|
||
"4. Eğitim bitince `v25_models.zip` indir ve sunucuya yükle"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
"# 1) Install the libraries into the kernel's own environment (%pip rather than !pip).\n",
"# XGBoost >= 2.0 is required: the training cells pass the `device` parameter.\n",
"%pip install -q \"xgboost>=2.0\" lightgbm optuna"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
"# 2) Mount Google Drive and locate the training CSV\n",
"import os\n",
"\n",
"from google.colab import drive, files\n",
"\n",
"drive.mount('/content/drive')\n",
"\n",
"# Path of the CSV inside Drive; adjust it if the file lives elsewhere.\n",
"DRIVE_CSV = '/content/drive/MyDrive/iddaai/training_data.csv'\n",
"\n",
"if os.path.exists(DRIVE_CSV):\n",
"    print(f'CSV bulundu: {DRIVE_CSV}')\n",
"else:\n",
"    print(f'HATA: {DRIVE_CSV} bulunamadı!')\n",
"    print('Drive\\'a training_data.csv yükle veya yolu düzelt.')\n",
"    print()\n",
"    print('Alternatif: Dosyayı doğrudan upload et →')\n",
"    # Fallback: let the user upload the file straight into the Colab session.\n",
"    uploaded = files.upload()\n",
"    if not uploaded:\n",
"        # Nothing was uploaded: stop here with a clear message instead of the\n",
"        # IndexError that indexing an empty key list would raise.\n",
"        raise FileNotFoundError(f'{DRIVE_CSV} not found and no file was uploaded.')\n",
"    # Use the first uploaded file name as the CSV path.\n",
"    DRIVE_CSV = next(iter(uploaded))\n",
"    print(f'Uploaded: {DRIVE_CSV}')"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
"# 3) GPU check: sets USE_GPU, which the training cells read to pick the device.\n",
"import subprocess\n",
"\n",
"try:\n",
"    gpu_info = subprocess.check_output(['nvidia-smi'], text=True)\n",
"except (OSError, subprocess.CalledProcessError):\n",
"    # OSError: nvidia-smi is not installed / not executable.\n",
"    # CalledProcessError: nvidia-smi ran but exited with a non-zero status.\n",
"    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.\n",
"    print('GPU bulunamadı, CPU ile devam edilecek.')\n",
"    USE_GPU = False\n",
"else:\n",
"    print(gpu_info)\n",
"    USE_GPU = True"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 4) Imports ve Config\n",
|
||
"import json\n",
|
||
"import pickle\n",
|
||
"import time\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import xgboost as xgb\n",
|
||
"import lightgbm as lgb\n",
|
||
"import optuna\n",
|
||
"from optuna.samplers import TPESampler\n",
|
||
"from datetime import datetime\n",
|
||
"from sklearn.metrics import accuracy_score, log_loss, classification_report\n",
|
||
"from sklearn.isotonic import IsotonicRegression\n",
|
||
"from IPython.display import clear_output\n",
|
||
"\n",
|
||
"# Only show Optuna warnings and errors (hides the per-trial INFO log lines).\n",
"optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
|
||
"\n",
|
||
"# Output directories for the trained models and the metrics report.\n",
"MODELS_DIR = '/content/v25_models'\n",
|
||
"REPORTS_DIR = '/content/v25_reports'\n",
|
||
"# NOTE: `os` is not imported in this cell; it comes from the Drive cell (step 2).\n",
"os.makedirs(MODELS_DIR, exist_ok=True)\n",
|
||
"os.makedirs(REPORTS_DIR, exist_ok=True)\n",
|
||
"\n",
|
||
"N_TRIALS = 50 # Number of Optuna trials (per market, for XGB and for LGB)\n",
|
||
"\n",
|
||
"# Echo the effective configuration; USE_GPU is set by the GPU check cell (step 3).\n",
"print(f'Optuna trials: {N_TRIALS}')\n",
|
||
"print(f'GPU: {USE_GPU}')\n",
|
||
"print(f'Models dir: {MODELS_DIR}')"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 5) Feature ve Market tanımları\n",
|
||
"\n",
|
||
"FEATURES = [\n",
|
||
" # ELO (8)\n",
|
||
" \"home_overall_elo\", \"away_overall_elo\", \"elo_diff\",\n",
|
||
" \"home_home_elo\", \"away_away_elo\",\n",
|
||
" \"home_form_elo\", \"away_form_elo\", \"form_elo_diff\",\n",
|
||
" # Form (12)\n",
|
||
" \"home_goals_avg\", \"home_conceded_avg\",\n",
|
||
" \"away_goals_avg\", \"away_conceded_avg\",\n",
|
||
" \"home_clean_sheet_rate\", \"away_clean_sheet_rate\",\n",
|
||
" \"home_scoring_rate\", \"away_scoring_rate\",\n",
|
||
" \"home_winning_streak\", \"away_winning_streak\",\n",
|
||
" \"home_unbeaten_streak\", \"away_unbeaten_streak\",\n",
|
||
" # H2H (6)\n",
|
||
" \"h2h_total_matches\", \"h2h_home_win_rate\", \"h2h_draw_rate\",\n",
|
||
" \"h2h_avg_goals\", \"h2h_btts_rate\", \"h2h_over25_rate\",\n",
|
||
" # Team Stats (8)\n",
|
||
" \"home_avg_possession\", \"away_avg_possession\",\n",
|
||
" \"home_avg_shots_on_target\", \"away_avg_shots_on_target\",\n",
|
||
" \"home_shot_conversion\", \"away_shot_conversion\",\n",
|
||
" \"home_avg_corners\", \"away_avg_corners\",\n",
|
||
" # Odds (23 + 20 presence flags)\n",
|
||
" \"odds_ms_h\", \"odds_ms_d\", \"odds_ms_a\",\n",
|
||
" \"implied_home\", \"implied_draw\", \"implied_away\",\n",
|
||
" \"odds_ht_ms_h\", \"odds_ht_ms_d\", \"odds_ht_ms_a\",\n",
|
||
" \"odds_ou05_o\", \"odds_ou05_u\",\n",
|
||
" \"odds_ou15_o\", \"odds_ou15_u\",\n",
|
||
" \"odds_ou25_o\", \"odds_ou25_u\",\n",
|
||
" \"odds_ou35_o\", \"odds_ou35_u\",\n",
|
||
" \"odds_ht_ou05_o\", \"odds_ht_ou05_u\",\n",
|
||
" \"odds_ht_ou15_o\", \"odds_ht_ou15_u\",\n",
|
||
" \"odds_btts_y\", \"odds_btts_n\",\n",
|
||
" \"odds_ms_h_present\", \"odds_ms_d_present\", \"odds_ms_a_present\",\n",
|
||
" \"odds_ht_ms_h_present\", \"odds_ht_ms_d_present\", \"odds_ht_ms_a_present\",\n",
|
||
" \"odds_ou05_o_present\", \"odds_ou05_u_present\",\n",
|
||
" \"odds_ou15_o_present\", \"odds_ou15_u_present\",\n",
|
||
" \"odds_ou25_o_present\", \"odds_ou25_u_present\",\n",
|
||
" \"odds_ou35_o_present\", \"odds_ou35_u_present\",\n",
|
||
" \"odds_ht_ou05_o_present\", \"odds_ht_ou05_u_present\",\n",
|
||
" \"odds_ht_ou15_o_present\", \"odds_ht_ou15_u_present\",\n",
|
||
" \"odds_btts_y_present\", \"odds_btts_n_present\",\n",
|
||
" # League (4)\n",
|
||
" \"home_xga\", \"away_xga\",\n",
|
||
" \"league_avg_goals\", \"league_zero_goal_rate\",\n",
|
||
" # Upset Engine (4)\n",
|
||
" \"upset_atmosphere\", \"upset_motivation\", \"upset_fatigue\", \"upset_potential\",\n",
|
||
" # Referee Engine (5)\n",
|
||
" \"referee_home_bias\", \"referee_avg_goals\", \"referee_cards_total\",\n",
|
||
" \"referee_avg_yellow\", \"referee_experience\",\n",
|
||
" # Momentum (3)\n",
|
||
" \"home_momentum_score\", \"away_momentum_score\", \"momentum_diff\",\n",
|
||
" # Squad (9)\n",
|
||
" \"home_squad_quality\", \"away_squad_quality\", \"squad_diff\",\n",
|
||
" \"home_key_players\", \"away_key_players\",\n",
|
||
" \"home_missing_impact\", \"away_missing_impact\",\n",
|
||
" \"home_goals_form\", \"away_goals_form\",\n",
|
||
" # Player-Level Features (12)\n",
|
||
" \"home_lineup_goals_per90\", \"away_lineup_goals_per90\",\n",
|
||
" \"home_lineup_assists_per90\", \"away_lineup_assists_per90\",\n",
|
||
" \"home_squad_continuity\", \"away_squad_continuity\",\n",
|
||
" \"home_top_scorer_form\", \"away_top_scorer_form\",\n",
|
||
" \"home_avg_player_exp\", \"away_avg_player_exp\",\n",
|
||
" \"home_goals_diversity\", \"away_goals_diversity\",\n",
|
||
" # V27 H2H Expanded (4)\n",
|
||
" \"h2h_home_goals_avg\", \"h2h_away_goals_avg\",\n",
|
||
" \"h2h_recent_trend\", \"h2h_venue_advantage\",\n",
|
||
" # V27 Rolling Stats (12)\n",
|
||
" \"home_rolling5_goals\", \"home_rolling5_conceded\",\n",
|
||
" \"home_rolling10_goals\", \"home_rolling10_conceded\",\n",
|
||
" \"home_rolling20_goals\", \"home_rolling20_conceded\",\n",
|
||
" \"away_rolling5_goals\", \"away_rolling5_conceded\",\n",
|
||
" \"away_rolling10_goals\", \"away_rolling10_conceded\",\n",
|
||
" \"home_rolling5_cs\", \"away_rolling5_cs\",\n",
|
||
" # V27 Venue Stats (4)\n",
|
||
" \"home_venue_goals\", \"home_venue_conceded\",\n",
|
||
" \"away_venue_goals\", \"away_venue_conceded\",\n",
|
||
" # V27 Goal Trend (2)\n",
|
||
" \"home_goal_trend\", \"away_goal_trend\",\n",
|
||
" # V27 Calendar (5)\n",
|
||
" \"home_days_rest\", \"away_days_rest\",\n",
|
||
" \"match_month\", \"is_season_start\", \"is_season_end\",\n",
|
||
" # V27 Interaction (6)\n",
|
||
" \"attack_vs_defense_home\", \"attack_vs_defense_away\",\n",
|
||
" \"xg_diff\", \"form_momentum_interaction\",\n",
|
||
" \"elo_form_consistency\", \"upset_x_elo_gap\",\n",
|
||
" # V27 League Expanded (5)\n",
|
||
" \"league_home_win_rate\", \"league_draw_rate\",\n",
|
||
" \"league_btts_rate\", \"league_ou25_rate\",\n",
|
||
" \"league_reliability_score\",\n",
|
||
"]\n",
|
||
"\n",
|
||
"MARKET_CONFIGS = [\n",
|
||
" {\"target\": \"label_ms\", \"name\": \"MS\", \"num_class\": 3},\n",
|
||
" {\"target\": \"label_ou15\", \"name\": \"OU15\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_ou25\", \"name\": \"OU25\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_ou35\", \"name\": \"OU35\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_btts\", \"name\": \"BTTS\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_ht_result\", \"name\": \"HT_RESULT\", \"num_class\": 3},\n",
|
||
" {\"target\": \"label_ht_ou05\", \"name\": \"HT_OU05\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_ht_ou15\", \"name\": \"HT_OU15\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_ht_ft\", \"name\": \"HTFT\", \"num_class\": 9},\n",
|
||
" {\"target\": \"label_odd_even\", \"name\": \"ODD_EVEN\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_cards_ou45\", \"name\": \"CARDS_OU45\", \"num_class\": 2},\n",
|
||
" {\"target\": \"label_handicap_ms\", \"name\": \"HANDICAP_MS\", \"num_class\": 3},\n",
|
||
"]\n",
|
||
"\n",
|
||
"print(f'Features: {len(FEATURES)}')\n",
|
||
"print(f'Markets: {len(MARKET_CONFIGS)}')"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
"# 6) Load the data\n",
"print(f'Loading {DRIVE_CSV}...')\n",
"df = pd.read_csv(DRIVE_CSV)\n",
"\n",
"# Missing values in known feature columns are encoded as 0.\n",
"for col in FEATURES:\n",
"    if col in df.columns:\n",
"        df[col] = df[col].fillna(0)\n",
"\n",
"# Odds presence flags: every `<odds>_present` feature is derived from its `<odds>` column.\n",
"# The mapping is built from FEATURES so the two lists cannot drift apart.\n",
"PRESENT_SUFFIX = '_present'\n",
"odds_flag_sources = {\n",
"    flag_col: flag_col[:-len(PRESENT_SUFFIX)]\n",
"    for flag_col in FEATURES\n",
"    if flag_col.endswith(PRESENT_SUFFIX)\n",
"}\n",
"for flag_col, odds_col in odds_flag_sources.items():\n",
"    if flag_col in df.columns:\n",
"        continue  # the CSV already ships this flag; keep it as-is\n",
"    if odds_col in df.columns:\n",
"        odds = pd.to_numeric(df[odds_col], errors='coerce').fillna(0)\n",
"        # Values that are missing, non-numeric or <= 1.01 count as 'no odds offered'.\n",
"        df[flag_col] = (odds > 1.01).astype(float)\n",
"    else:\n",
"        # Source column absent: flag every row as not present. The previous\n",
"        # df.get(odds_col, 0) path handed a scalar to pd.to_numeric, which returns\n",
"        # a scalar, so the chained .fillna() raised AttributeError.\n",
"        df[flag_col] = 0.0\n",
"\n",
"available = [f for f in FEATURES if f in df.columns]\n",
"missing = [f for f in FEATURES if f not in df.columns]\n",
"\n",
"print(f'Shape: {df.shape}')\n",
"print(f'Features: {len(available)}/{len(FEATURES)}')\n",
"if missing:\n",
"    print(f'Missing features: {missing}')"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
"source": [
"# 7) Helper functions\n",
"\n",
"def temporal_split_4way(valid_df):\n",
"    \"\"\"Split rows chronologically into train / val / calibration / test frames.\n",
"\n",
"    Rows are sorted by the `mst_utc` column and cut at 60% / 75% / 85% of the\n",
"    row count, giving roughly 60% / 15% / 10% / 15% of the rows, oldest first.\n",
"\n",
"    Args:\n",
"        valid_df: DataFrame that contains an `mst_utc` column to sort by.\n",
"\n",
"    Returns:\n",
"        Tuple (train, val, cal, test) of independent DataFrame copies.\n",
"    \"\"\"\n",
"    ordered = valid_df.sort_values('mst_utc').reset_index(drop=True)\n",
"    n = len(ordered)\n",
"    i1 = int(n * 0.60)\n",
"    i2 = int(n * 0.75)\n",
"    i3 = int(n * 0.85)\n",
"    return (\n",
"        ordered.iloc[:i1].copy(),\n",
"        ordered.iloc[i1:i2].copy(),\n",
"        ordered.iloc[i2:i3].copy(),\n",
"        ordered.iloc[i3:].copy(),\n",
"    )\n",
"\n",
"\n",
"def _validation_logloss(y_true, preds, num_class):\n",
"    \"\"\"Return the log loss of `preds` over the full label set 0..num_class-1.\n",
"\n",
"    A 1-D `preds` (binary positive-class probability) is expanded to the two\n",
"    columns [1 - p, p]. `labels` is passed explicitly so log_loss does not fail\n",
"    when a class is absent from `y_true` (possible for rare classes in a split).\n",
"    Assumes targets are integer-encoded as 0..num_class-1.\n",
"    \"\"\"\n",
"    if preds.ndim == 1:\n",
"        preds = np.column_stack([1 - preds, preds])\n",
"    return log_loss(y_true, preds, labels=list(range(num_class)))\n",
"\n",
"\n",
"def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):\n",
"    \"\"\"Optuna objective for XGBoost.\n",
"\n",
"    Trains a booster with hyperparameters sampled from `trial`, early-stopped on\n",
"    the validation set, and returns the validation log loss (lower is better).\n",
"    The device is chosen from the notebook-level USE_GPU flag.\n",
"    \"\"\"\n",
"    params = {\n",
"        'objective': 'multi:softprob' if num_class > 2 else 'binary:logistic',\n",
"        'eval_metric': 'mlogloss' if num_class > 2 else 'logloss',\n",
"        'tree_method': 'hist',\n",
"        'device': 'cuda' if USE_GPU else 'cpu',\n",
"        'max_depth': trial.suggest_int('max_depth', 3, 8),\n",
"        'eta': trial.suggest_float('eta', 0.01, 0.15, log=True),\n",
"        'subsample': trial.suggest_float('subsample', 0.6, 1.0),\n",
"        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),\n",
"        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),\n",
"        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),\n",
"        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),\n",
"        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),\n",
"        'random_state': 42,\n",
"    }\n",
"    if num_class > 2:\n",
"        params['num_class'] = num_class\n",
"\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
"    dval = xgb.DMatrix(X_val, label=y_val)\n",
"    model = xgb.train(params, dtrain, num_boost_round=1000,\n",
"                      evals=[(dval, 'val')], early_stopping_rounds=50, verbose_eval=False)\n",
"    # Score the best early-stopped iteration explicitly, mirroring the\n",
"    # num_iteration=best_iteration call in the LightGBM objective.\n",
"    preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))\n",
"    return _validation_logloss(y_val, preds, num_class)\n",
"\n",
"\n",
"def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):\n",
"    \"\"\"Optuna objective for LightGBM.\n",
"\n",
"    Trains a booster with hyperparameters sampled from `trial`, early-stopped on\n",
"    the validation set, and returns the validation log loss (lower is better).\n",
"    The device is chosen from the notebook-level USE_GPU flag.\n",
"    \"\"\"\n",
"    params = {\n",
"        'objective': 'multiclass' if num_class > 2 else 'binary',\n",
"        'metric': 'multi_logloss' if num_class > 2 else 'binary_logloss',\n",
"        'device': 'gpu' if USE_GPU else 'cpu',\n",
"        'max_depth': trial.suggest_int('max_depth', 3, 8),\n",
"        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),\n",
"        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),\n",
"        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),\n",
"        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),\n",
"        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),\n",
"        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True),\n",
"        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),\n",
"        'random_state': 42, 'verbose': -1,\n",
"    }\n",
"    if num_class > 2:\n",
"        params['num_class'] = num_class\n",
"\n",
"    train_data = lgb.Dataset(X_train, label=y_train)\n",
"    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)\n",
"    model = lgb.train(params, train_data, num_boost_round=1000,\n",
"                      valid_sets=[val_data], valid_names=['val'],\n",
"                      callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])\n",
"    preds = model.predict(X_val, num_iteration=model.best_iteration)\n",
"    return _validation_logloss(y_val, preds, num_class)\n",
"\n",
"\n",
"print('Fonksiyonlar hazır.')"
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
"source": [
"# 8) Main training loop\n",
"\n",
"def _class_matrix(preds):\n",
"    \"\"\"Return predictions as an (n_samples, n_classes) matrix.\n",
"\n",
"    Binary models emit a 1-D vector of positive-class probabilities, which is\n",
"    expanded to the two columns [1 - p, p]; 2-D input is returned unchanged.\n",
"    \"\"\"\n",
"    if preds.ndim == 1:\n",
"        return np.column_stack([1 - preds, preds])\n",
"    return preds\n",
"\n",
"\n",
"def _fit_isotonic(raw_probs, y_true, num_class):\n",
"    \"\"\"Fit one one-vs-rest IsotonicRegression per class; returns them as a list.\"\"\"\n",
"    calibrators = []\n",
"    for cls_idx in range(num_class):\n",
"        ir = IsotonicRegression(out_of_bounds='clip')\n",
"        ir.fit(raw_probs[:, cls_idx], (y_true == cls_idx).astype(float))\n",
"        calibrators.append(ir)\n",
"    return calibrators\n",
"\n",
"\n",
"def _apply_isotonic(calibrators, raw_probs):\n",
"    \"\"\"Calibrate each class column, then renormalise every row to sum to 1.\n",
"\n",
"    A row whose calibrated scores are all zero cannot be normalised (0 / 0 gives\n",
"    NaN, which log_loss rejects); such rows fall back to their raw probabilities.\n",
"    \"\"\"\n",
"    cal = np.column_stack([ir.predict(raw_probs[:, i]) for i, ir in enumerate(calibrators)])\n",
"    row_sum = cal.sum(axis=1, keepdims=True)\n",
"    dead = row_sum.ravel() <= 0\n",
"    if dead.any():\n",
"        cal[dead] = raw_probs[dead]\n",
"        row_sum = cal.sum(axis=1, keepdims=True)\n",
"    return cal / row_sum\n",
"\n",
"\n",
"def _eval_probs(y_true, probs, num_class):\n",
"    \"\"\"Return rounded accuracy and log loss of a probability matrix.\n",
"\n",
"    `labels` is passed explicitly so log_loss still works when a class is absent\n",
"    from `y_true`. Assumes targets are integer-encoded as 0..num_class-1.\n",
"    \"\"\"\n",
"    preds = np.argmax(probs, axis=1)\n",
"    acc = accuracy_score(y_true, preds)\n",
"    ll = log_loss(y_true, probs, labels=list(range(num_class)))\n",
"    return {'accuracy': round(float(acc), 4), 'logloss': round(float(ll), 4)}\n",
"\n",
"\n",
"all_metrics = {\n",
"    'trained_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),\n",
"    'trainer': 'v25_pro_colab',\n",
"    'optuna_trials': N_TRIALS,\n",
"    'total_features': len(FEATURES),\n",
"    'gpu': USE_GPU,\n",
"    'markets': {},\n",
"}\n",
"\n",
"available_features = [f for f in FEATURES if f in df.columns]\n",
"total_markets = len(MARKET_CONFIGS)\n",
"start_time = time.time()\n",
"\n",
"for mi, config in enumerate(MARKET_CONFIGS):\n",
"    target = config['target']\n",
"    market_name = config['name']\n",
"    num_class = config['num_class']\n",
"    market_start = time.time()\n",
"\n",
"    print(f\"\\n{'='*60}\")\n",
"    print(f\"[{mi+1}/{total_markets}] {market_name} (classes={num_class})\")\n",
"    print(f\"{'='*60}\")\n",
"\n",
"    if target not in df.columns:\n",
"        print(f' SKIP: {target} not in data')\n",
"        continue\n",
"\n",
"    # Keep only rows that actually carry a label for this market.\n",
"    valid_df = df[df[target].notna()].copy()\n",
"    valid_df = valid_df[valid_df[target].astype(str) != ''].copy()\n",
"\n",
"    if len(valid_df) < 500:\n",
"        print(f' SKIP: only {len(valid_df)} samples')\n",
"        continue\n",
"\n",
"    train_df, val_df, cal_df, test_df = temporal_split_4way(valid_df)\n",
"    X_train = train_df[available_features].values\n",
"    X_val = val_df[available_features].values\n",
"    X_cal = cal_df[available_features].values\n",
"    X_test = test_df[available_features].values\n",
"    y_train = train_df[target].astype(int).values\n",
"    y_val = val_df[target].astype(int).values\n",
"    y_cal = cal_df[target].astype(int).values\n",
"    y_test = test_df[target].astype(int).values\n",
"\n",
"    print(f' Samples: {len(valid_df)} | Split: {len(X_train)}/{len(X_val)}/{len(X_cal)}/{len(X_test)}')\n",
"    print(f' Features: {len(available_features)}')\n",
"\n",
"    # ── Optuna XGBoost ──\n",
"    print(f' XGBoost Optuna ({N_TRIALS} trials)...', end=' ', flush=True)\n",
"    t0 = time.time()\n",
"    xgb_study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))\n",
"    xgb_study.optimize(\n",
"        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),\n",
"        n_trials=N_TRIALS)\n",
"    xgb_best = xgb_study.best_params\n",
"    print(f'done ({time.time()-t0:.0f}s) best={xgb_study.best_value:.4f}')\n",
"\n",
"    # ── Optuna LightGBM ──\n",
"    print(f' LightGBM Optuna ({N_TRIALS} trials)...', end=' ', flush=True)\n",
"    t0 = time.time()\n",
"    lgb_study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))\n",
"    lgb_study.optimize(\n",
"        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),\n",
"        n_trials=N_TRIALS)\n",
"    lgb_best = lgb_study.best_params\n",
"    print(f'done ({time.time()-t0:.0f}s) best={lgb_study.best_value:.4f}')\n",
"\n",
"    # ── Final XGBoost ──\n",
"    print(f' Training final XGBoost...', end=' ', flush=True)\n",
"    xgb_params = {\n",
"        'objective': 'multi:softprob' if num_class > 2 else 'binary:logistic',\n",
"        'eval_metric': 'mlogloss' if num_class > 2 else 'logloss',\n",
"        'tree_method': 'hist',\n",
"        'device': 'cuda' if USE_GPU else 'cpu',\n",
"        'random_state': 42,\n",
"        **xgb_best,\n",
"    }\n",
"    if num_class > 2:\n",
"        xgb_params['num_class'] = num_class\n",
"\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
"    dval = xgb.DMatrix(X_val, label=y_val)\n",
"    xgb_model = xgb.train(\n",
"        xgb_params, dtrain, num_boost_round=1500,\n",
"        evals=[(dtrain, 'train'), (dval, 'val')],\n",
"        early_stopping_rounds=80, verbose_eval=False)\n",
"    print(f'iter={xgb_model.best_iteration} score={xgb_model.best_score:.4f}')\n",
"    # Predict with the best early-stopped iteration explicitly, matching the\n",
"    # num_iteration=best_iteration used for LightGBM below.\n",
"    xgb_best_range = (0, xgb_model.best_iteration + 1)\n",
"\n",
"    # ── Final LightGBM ──\n",
"    print(f' Training final LightGBM...', end=' ', flush=True)\n",
"    lgb_params = {\n",
"        'objective': 'multiclass' if num_class > 2 else 'binary',\n",
"        'metric': 'multi_logloss' if num_class > 2 else 'binary_logloss',\n",
"        'device': 'gpu' if USE_GPU else 'cpu',\n",
"        'random_state': 42, 'verbose': -1,\n",
"        **lgb_best,\n",
"    }\n",
"    if num_class > 2:\n",
"        lgb_params['num_class'] = num_class\n",
"\n",
"    lgb_train_data = lgb.Dataset(X_train, label=y_train)\n",
"    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)\n",
"    lgb_model = lgb.train(\n",
"        lgb_params, lgb_train_data, num_boost_round=1500,\n",
"        valid_sets=[lgb_train_data, lgb_val_data],\n",
"        valid_names=['train', 'val'],\n",
"        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(0)])\n",
"    print(f'iter={lgb_model.best_iteration}')\n",
"\n",
"    # ── Isotonic Calibration (fitted on the held-out calibration split) ──\n",
"    print(f' Isotonic calibration...', end=' ', flush=True)\n",
"    xgb_cal_raw = _class_matrix(\n",
"        xgb_model.predict(xgb.DMatrix(X_cal), iteration_range=xgb_best_range))\n",
"    xgb_iso = _fit_isotonic(xgb_cal_raw, y_cal, num_class)\n",
"\n",
"    lgb_cal_raw = _class_matrix(\n",
"        lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration))\n",
"    lgb_iso = _fit_isotonic(lgb_cal_raw, y_cal, num_class)\n",
"    print(f'{num_class} classes done')\n",
"\n",
"    # ── Test Evaluation ──\n",
"    xgb_raw = _class_matrix(\n",
"        xgb_model.predict(xgb.DMatrix(X_test), iteration_range=xgb_best_range))\n",
"    xgb_cal_p = _apply_isotonic(xgb_iso, xgb_raw)\n",
"\n",
"    lgb_raw = _class_matrix(\n",
"        lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration))\n",
"    lgb_cal_p = _apply_isotonic(lgb_iso, lgb_raw)\n",
"\n",
"    # Ensembles are plain averages of the two models' probabilities.\n",
"    raw_ens = (xgb_raw + lgb_raw) / 2\n",
"    cal_ens = (xgb_cal_p + lgb_cal_p) / 2\n",
"\n",
"    m_xgb_raw = _eval_probs(y_test, xgb_raw, num_class)\n",
"    m_xgb_cal = _eval_probs(y_test, xgb_cal_p, num_class)\n",
"    m_lgb_raw = _eval_probs(y_test, lgb_raw, num_class)\n",
"    m_lgb_cal = _eval_probs(y_test, lgb_cal_p, num_class)\n",
"    m_ens_raw = _eval_probs(y_test, raw_ens, num_class)\n",
"    m_ens_cal = _eval_probs(y_test, cal_ens, num_class)\n",
"\n",
"    print(f' ── Test Results ──')\n",
"    for label, m in [('XGB Raw', m_xgb_raw), ('XGB Cal', m_xgb_cal),\n",
"                     ('LGB Raw', m_lgb_raw), ('LGB Cal', m_lgb_cal),\n",
"                     ('Ens Raw', m_ens_raw), ('Ens Cal', m_ens_cal)]:\n",
"        print(f' {label}: Acc={m[\"accuracy\"]:.4f} LL={m[\"logloss\"]:.4f}')\n",
"\n",
"    ens_preds = np.argmax(raw_ens, axis=1)\n",
"    print(f'\\n Classification Report:')\n",
"    print(classification_report(y_test, ens_preds))\n",
"\n",
"    # ── Save Models ──\n",
"    mn = market_name.lower()\n",
"    xgb_model.save_model(os.path.join(MODELS_DIR, f'xgb_v25_{mn}.json'))\n",
"    lgb_model.save_model(os.path.join(MODELS_DIR, f'lgb_v25_{mn}.txt'))\n",
"    with open(os.path.join(MODELS_DIR, f'iso_xgb_v25_{mn}.pkl'), 'wb') as f:\n",
"        pickle.dump(xgb_iso, f)\n",
"    with open(os.path.join(MODELS_DIR, f'iso_lgb_v25_{mn}.pkl'), 'wb') as f:\n",
"        pickle.dump(lgb_iso, f)\n",
"\n",
"    elapsed = time.time() - market_start\n",
"    total_elapsed = time.time() - start_time\n",
"    avg_per_market = total_elapsed / (mi + 1)\n",
"    remaining = avg_per_market * (total_markets - mi - 1)\n",
"    print(f' Saved! ({elapsed:.0f}s) | Toplam: {total_elapsed/60:.1f}dk | Kalan: ~{remaining/60:.0f}dk')\n",
"\n",
"    all_metrics['markets'][market_name] = {\n",
"        'samples': int(len(valid_df)),\n",
"        'train': int(len(X_train)),\n",
"        'features_used': len(available_features),\n",
"        'xgb_best_iteration': int(xgb_model.best_iteration),\n",
"        'lgb_best_iteration': int(lgb_model.best_iteration),\n",
"        'xgb_optuna_best': round(float(xgb_study.best_value), 4),\n",
"        'lgb_optuna_best': round(float(lgb_study.best_value), 4),\n",
"        'test_xgb_raw': m_xgb_raw, 'test_xgb_cal': m_xgb_cal,\n",
"        'test_lgb_raw': m_lgb_raw, 'test_lgb_cal': m_lgb_cal,\n",
"        'test_ensemble_raw': m_ens_raw, 'test_ensemble_calibrated': m_ens_cal,\n",
"        'elapsed_seconds': round(elapsed, 1),\n",
"    }\n",
"\n",
"# Save the feature column order used for training\n",
"with open(os.path.join(MODELS_DIR, 'feature_cols.json'), 'w') as f:\n",
"    json.dump(available_features, f, indent=2)\n",
"\n",
"# Save the metrics report\n",
"with open(os.path.join(REPORTS_DIR, 'v25_pro_metrics.json'), 'w') as f:\n",
"    json.dump(all_metrics, f, indent=2, default=str)\n",
"\n",
"total_time = time.time() - start_time\n",
"print(f\"\\n{'='*60}\")\n",
"print(f'TAMAMLANDI! Toplam süre: {total_time/60:.1f} dakika')\n",
"print(f\"{'='*60}\")\n",
"for name, m in all_metrics['markets'].items():\n",
"    ens = m.get('test_ensemble_calibrated') or m.get('test_ensemble_raw') or {}\n",
"    # NaN (not the string '?') is the fallback so the :.4f format cannot raise.\n",
"    acc = ens.get('accuracy', float('nan'))\n",
"    ll = ens.get('logloss', float('nan'))\n",
"    print(f\" {name:12s} | Acc={acc:.4f} | LL={ll:.4f}\")"
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {},
|
||
"source": [
"# 9) Zip the models and download them\n",
"import shutil\n",
"\n",
"from google.colab import files\n",
"\n",
"zip_path = shutil.make_archive('/content/v25_models', 'zip', MODELS_DIR)\n",
"print(f'ZIP: {zip_path} ({os.path.getsize(zip_path)/1024/1024:.1f} MB)')\n",
"\n",
"# Also copy to Drive. The folder may not exist when the CSV was uploaded\n",
"# directly instead of being read from Drive, so create it before copying.\n",
"DRIVE_OUT_DIR = '/content/drive/MyDrive/iddaai'\n",
"os.makedirs(DRIVE_OUT_DIR, exist_ok=True)\n",
"\n",
"drive_out = os.path.join(DRIVE_OUT_DIR, 'v25_models.zip')\n",
"shutil.copy2(zip_path, drive_out)\n",
"print(f'Drive kopyası: {drive_out}')\n",
"\n",
"# Copy the metrics report as well\n",
"shutil.copy2(os.path.join(REPORTS_DIR, 'v25_pro_metrics.json'),\n",
"             os.path.join(DRIVE_OUT_DIR, 'v25_pro_metrics.json'))\n",
"\n",
"# Trigger the browser download\n",
"files.download(zip_path)"
|
||
],
|
||
"execution_count": null,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Sunucuya Yükleme\n",
|
||
"\n",
|
||
"ZIP indirdikten sonra sunucuya yüklemek için:\n",
|
||
"\n",
|
||
"```bash\n",
|
||
"# 1) ZIP'i sunucuya kopyala\n",
|
||
"scp -P 2222 v25_models.zip haruncan@95.70.252.214:~/\n",
|
||
"\n",
|
||
"# 2) Docker container'a kopyala\n",
|
||
"docker cp ~/v25_models.zip 85b57a7291df:/app/models/\n",
|
||
"\n",
|
||
"# 3) Container içinde aç\n",
|
||
"docker exec 85b57a7291df bash -c 'cd /app/models/v25 && unzip -o /app/models/v25_models.zip'\n",
|
||
"\n",
|
||
"# 4) Container'ı restart et\n",
|
||
"docker restart 85b57a7291df\n",
|
||
"```"
|
||
]
|
||
}
|
||
]
|
||
} |