Files
iddaai-be/ai-engine/scripts/train_v25_pro_colab.ipynb
T
fahricansecer 94c7a4481a
Deploy Iddaai Backend / build-and-deploy (push) Successful in 37s
main
2026-05-17 02:17:22 +03:00

343 lines
26 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# V25 Pro Model Trainer — Colab Edition\n",
"**152 feature + Optuna + Isotonic Calibration + GPU**\n",
"\n",
"### Kullanım:\n",
"1. Runtime → Change runtime type → **T4 GPU** seç\n",
"2. `training_data.csv` dosyasını Google Drive'a yükle\n",
"3. Hücreleri sırayla çalıştır\n",
"4. Eğitim bitince `v25_models.zip` indir ve sunucuya yükle"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 1) Install libraries. %pip installs into the running kernel's environment.\n",
"# xgboost>=2.0 is required because the training parameters use `device`.\n",
"%pip install -q \"xgboost>=2.0\" lightgbm optuna"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 2) Mount Google Drive and locate the training CSV\n",
"import os\n",
"\n",
"from google.colab import drive, files\n",
"\n",
"drive.mount('/content/drive')\n",
"\n",
"# Path of the CSV on Drive; adjust it if the file lives elsewhere\n",
"DRIVE_CSV = '/content/drive/MyDrive/iddaai/training_data.csv'\n",
"\n",
"if os.path.exists(DRIVE_CSV):\n",
"    print(f'CSV bulundu: {DRIVE_CSV}')\n",
"else:\n",
"    print(f'HATA: {DRIVE_CSV} bulunamadı!')\n",
"    print('Drive\\'a training_data.csv yükle veya yolu düzelt.')\n",
"    print()\n",
"    print('Alternatif: Dosyayı doğrudan upload et →')\n",
"    uploaded = files.upload()\n",
"    if not uploaded:\n",
"        # Nothing was uploaded: stop here with a clear message instead of\n",
"        # failing with an IndexError when picking the first file name.\n",
"        raise FileNotFoundError(\n",
"            f'{DRIVE_CSV} bulunamadı ve dosya upload edilmedi.')\n",
"    # The mapping is keyed by file name; use the first uploaded file\n",
"    DRIVE_CSV = next(iter(uploaded))\n",
"    print(f'Uploaded: {DRIVE_CSV}')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 3) GPU check: USE_GPU is True only when `nvidia-smi` runs successfully\n",
"import subprocess\n",
"\n",
"try:\n",
"    gpu_info = subprocess.check_output(['nvidia-smi'], text=True)\n",
"except (OSError, subprocess.CalledProcessError):\n",
"    # OSError: the nvidia-smi binary is missing or cannot be executed.\n",
"    # CalledProcessError: it ran but exited with a non-zero status.\n",
"    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.\n",
"    print('GPU bulunamadı, CPU ile devam edilecek.')\n",
"    USE_GPU = False\n",
"else:\n",
"    print(gpu_info)\n",
"    USE_GPU = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 4) Imports ve Config\n",
"import json\n",
"import pickle\n",
"import time\n",
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"import lightgbm as lgb\n",
"import optuna\n",
"from optuna.samplers import TPESampler\n",
"from datetime import datetime\n",
"from sklearn.metrics import accuracy_score, log_loss, classification_report\n",
"from sklearn.isotonic import IsotonicRegression\n",
"from IPython.display import clear_output\n",
"\n",
"# Only warnings and errors from Optuna are logged\n",
"optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
"\n",
"# Output folders for the trained models and the metrics report\n",
"MODELS_DIR = '/content/v25_models'\n",
"REPORTS_DIR = '/content/v25_reports'\n",
"for out_dir in (MODELS_DIR, REPORTS_DIR):\n",
"    os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# Number of Optuna trials, run for XGBoost and again for LightGBM per market\n",
"N_TRIALS = 50\n",
"\n",
"print(f'Optuna trials: {N_TRIALS}')\n",
"print(f'GPU: {USE_GPU}')\n",
"print(f'Models dir: {MODELS_DIR}')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 5) Feature ve Market tanımları\n",
"\n",
"FEATURES = [\n",
" # ELO (8)\n",
" \"home_overall_elo\", \"away_overall_elo\", \"elo_diff\",\n",
" \"home_home_elo\", \"away_away_elo\",\n",
" \"home_form_elo\", \"away_form_elo\", \"form_elo_diff\",\n",
" # Form (12)\n",
" \"home_goals_avg\", \"home_conceded_avg\",\n",
" \"away_goals_avg\", \"away_conceded_avg\",\n",
" \"home_clean_sheet_rate\", \"away_clean_sheet_rate\",\n",
" \"home_scoring_rate\", \"away_scoring_rate\",\n",
" \"home_winning_streak\", \"away_winning_streak\",\n",
" \"home_unbeaten_streak\", \"away_unbeaten_streak\",\n",
" # H2H (6)\n",
" \"h2h_total_matches\", \"h2h_home_win_rate\", \"h2h_draw_rate\",\n",
" \"h2h_avg_goals\", \"h2h_btts_rate\", \"h2h_over25_rate\",\n",
" # Team Stats (8)\n",
" \"home_avg_possession\", \"away_avg_possession\",\n",
" \"home_avg_shots_on_target\", \"away_avg_shots_on_target\",\n",
" \"home_shot_conversion\", \"away_shot_conversion\",\n",
" \"home_avg_corners\", \"away_avg_corners\",\n",
" # Odds (23 + 20 presence flags)\n",
" \"odds_ms_h\", \"odds_ms_d\", \"odds_ms_a\",\n",
" \"implied_home\", \"implied_draw\", \"implied_away\",\n",
" \"odds_ht_ms_h\", \"odds_ht_ms_d\", \"odds_ht_ms_a\",\n",
" \"odds_ou05_o\", \"odds_ou05_u\",\n",
" \"odds_ou15_o\", \"odds_ou15_u\",\n",
" \"odds_ou25_o\", \"odds_ou25_u\",\n",
" \"odds_ou35_o\", \"odds_ou35_u\",\n",
" \"odds_ht_ou05_o\", \"odds_ht_ou05_u\",\n",
" \"odds_ht_ou15_o\", \"odds_ht_ou15_u\",\n",
" \"odds_btts_y\", \"odds_btts_n\",\n",
" \"odds_ms_h_present\", \"odds_ms_d_present\", \"odds_ms_a_present\",\n",
" \"odds_ht_ms_h_present\", \"odds_ht_ms_d_present\", \"odds_ht_ms_a_present\",\n",
" \"odds_ou05_o_present\", \"odds_ou05_u_present\",\n",
" \"odds_ou15_o_present\", \"odds_ou15_u_present\",\n",
" \"odds_ou25_o_present\", \"odds_ou25_u_present\",\n",
" \"odds_ou35_o_present\", \"odds_ou35_u_present\",\n",
" \"odds_ht_ou05_o_present\", \"odds_ht_ou05_u_present\",\n",
" \"odds_ht_ou15_o_present\", \"odds_ht_ou15_u_present\",\n",
" \"odds_btts_y_present\", \"odds_btts_n_present\",\n",
" # League (4)\n",
" \"home_xga\", \"away_xga\",\n",
" \"league_avg_goals\", \"league_zero_goal_rate\",\n",
" # Upset Engine (4)\n",
" \"upset_atmosphere\", \"upset_motivation\", \"upset_fatigue\", \"upset_potential\",\n",
" # Referee Engine (5)\n",
" \"referee_home_bias\", \"referee_avg_goals\", \"referee_cards_total\",\n",
" \"referee_avg_yellow\", \"referee_experience\",\n",
" # Momentum (3)\n",
" \"home_momentum_score\", \"away_momentum_score\", \"momentum_diff\",\n",
" # Squad (9)\n",
" \"home_squad_quality\", \"away_squad_quality\", \"squad_diff\",\n",
" \"home_key_players\", \"away_key_players\",\n",
" \"home_missing_impact\", \"away_missing_impact\",\n",
" \"home_goals_form\", \"away_goals_form\",\n",
" # Player-Level Features (12)\n",
" \"home_lineup_goals_per90\", \"away_lineup_goals_per90\",\n",
" \"home_lineup_assists_per90\", \"away_lineup_assists_per90\",\n",
" \"home_squad_continuity\", \"away_squad_continuity\",\n",
" \"home_top_scorer_form\", \"away_top_scorer_form\",\n",
" \"home_avg_player_exp\", \"away_avg_player_exp\",\n",
" \"home_goals_diversity\", \"away_goals_diversity\",\n",
" # V27 H2H Expanded (4)\n",
" \"h2h_home_goals_avg\", \"h2h_away_goals_avg\",\n",
" \"h2h_recent_trend\", \"h2h_venue_advantage\",\n",
" # V27 Rolling Stats (12)\n",
" \"home_rolling5_goals\", \"home_rolling5_conceded\",\n",
" \"home_rolling10_goals\", \"home_rolling10_conceded\",\n",
" \"home_rolling20_goals\", \"home_rolling20_conceded\",\n",
" \"away_rolling5_goals\", \"away_rolling5_conceded\",\n",
" \"away_rolling10_goals\", \"away_rolling10_conceded\",\n",
" \"home_rolling5_cs\", \"away_rolling5_cs\",\n",
" # V27 Venue Stats (4)\n",
" \"home_venue_goals\", \"home_venue_conceded\",\n",
" \"away_venue_goals\", \"away_venue_conceded\",\n",
" # V27 Goal Trend (2)\n",
" \"home_goal_trend\", \"away_goal_trend\",\n",
" # V27 Calendar (5)\n",
" \"home_days_rest\", \"away_days_rest\",\n",
" \"match_month\", \"is_season_start\", \"is_season_end\",\n",
" # V27 Interaction (6)\n",
" \"attack_vs_defense_home\", \"attack_vs_defense_away\",\n",
" \"xg_diff\", \"form_momentum_interaction\",\n",
" \"elo_form_consistency\", \"upset_x_elo_gap\",\n",
" # V27 League Expanded (5)\n",
" \"league_home_win_rate\", \"league_draw_rate\",\n",
" \"league_btts_rate\", \"league_ou25_rate\",\n",
" \"league_reliability_score\",\n",
"]\n",
"\n",
"MARKET_CONFIGS = [\n",
" {\"target\": \"label_ms\", \"name\": \"MS\", \"num_class\": 3},\n",
" {\"target\": \"label_ou15\", \"name\": \"OU15\", \"num_class\": 2},\n",
" {\"target\": \"label_ou25\", \"name\": \"OU25\", \"num_class\": 2},\n",
" {\"target\": \"label_ou35\", \"name\": \"OU35\", \"num_class\": 2},\n",
" {\"target\": \"label_btts\", \"name\": \"BTTS\", \"num_class\": 2},\n",
" {\"target\": \"label_ht_result\", \"name\": \"HT_RESULT\", \"num_class\": 3},\n",
" {\"target\": \"label_ht_ou05\", \"name\": \"HT_OU05\", \"num_class\": 2},\n",
" {\"target\": \"label_ht_ou15\", \"name\": \"HT_OU15\", \"num_class\": 2},\n",
" {\"target\": \"label_ht_ft\", \"name\": \"HTFT\", \"num_class\": 9},\n",
" {\"target\": \"label_odd_even\", \"name\": \"ODD_EVEN\", \"num_class\": 2},\n",
" {\"target\": \"label_cards_ou45\", \"name\": \"CARDS_OU45\", \"num_class\": 2},\n",
" {\"target\": \"label_handicap_ms\", \"name\": \"HANDICAP_MS\", \"num_class\": 3},\n",
"]\n",
"\n",
"print(f'Features: {len(FEATURES)}')\n",
"print(f'Markets: {len(MARKET_CONFIGS)}')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 6) Load the data\n",
"print(f'Loading {DRIVE_CSV}...')\n",
"df = pd.read_csv(DRIVE_CSV)\n",
"\n",
"# Missing values in the known feature columns are encoded as 0\n",
"for col in FEATURES:\n",
"    if col in df.columns:\n",
"        df[col] = df[col].fillna(0)\n",
"\n",
"# Odds presence flags: every `<odds>_present` feature is derived from the\n",
"# `<odds>` column of the same name, so the mapping is built from FEATURES\n",
"# instead of maintaining a second hand-written list.\n",
"PRESENT_SUFFIX = '_present'\n",
"odds_flag_sources = {\n",
"    flag_col: flag_col[:-len(PRESENT_SUFFIX)]\n",
"    for flag_col in FEATURES\n",
"    if flag_col.endswith(PRESENT_SUFFIX)\n",
"}\n",
"for flag_col, odds_col in odds_flag_sources.items():\n",
"    if flag_col in df.columns:\n",
"        continue  # the CSV already provides this flag\n",
"    if odds_col in df.columns:\n",
"        odds = pd.to_numeric(df[odds_col], errors='coerce').fillna(0)\n",
"        # The flag is 1.0 only for odds strictly above 1.01\n",
"        df[flag_col] = (odds > 1.01).astype(float)\n",
"    else:\n",
"        # No source column: the odds are never present. The previous\n",
"        # df.get(odds_col, 0) yielded a scalar here and failed on .fillna().\n",
"        df[flag_col] = 0.0\n",
"\n",
"available = [f for f in FEATURES if f in df.columns]\n",
"missing = [f for f in FEATURES if f not in df.columns]\n",
"\n",
"print(f'Shape: {df.shape}')\n",
"print(f'Features: {len(available)}/{len(FEATURES)}')\n",
"if missing:\n",
"    print(f'Missing features: {missing}')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 7) Helper functions\n",
"\n",
"\n",
"def temporal_split_4way(valid_df):\n",
"    \"\"\"Split rows chronologically into train / val / calibration / test.\n",
"\n",
"    Rows are ordered by the ``mst_utc`` column (presumably the match start\n",
"    time; confirm against the export script) and cut at 60%, 75% and 85% of\n",
"    the row count.\n",
"\n",
"    Args:\n",
"        valid_df: DataFrame that contains an ``mst_utc`` column.\n",
"\n",
"    Returns:\n",
"        Tuple ``(train, val, cal, test)`` of independent DataFrame copies.\n",
"    \"\"\"\n",
"    ordered = valid_df.sort_values('mst_utc').reset_index(drop=True)\n",
"    n = len(ordered)\n",
"    i1, i2, i3 = int(n * 0.60), int(n * 0.75), int(n * 0.85)\n",
"    return (\n",
"        ordered.iloc[:i1].copy(),\n",
"        ordered.iloc[i1:i2].copy(),\n",
"        ordered.iloc[i2:i3].copy(),\n",
"        ordered.iloc[i3:].copy(),\n",
"    )\n",
"\n",
"\n",
"def _as_proba_matrix(preds):\n",
"    \"\"\"Return predictions as an (n_samples, n_classes) probability matrix.\n",
"\n",
"    A 1-D array (binary objective) is expanded to the columns ``[1 - p, p]``;\n",
"    2-D input is returned unchanged.\n",
"    \"\"\"\n",
"    if preds.ndim == 1:\n",
"        return np.column_stack([1 - preds, preds])\n",
"    return preds\n",
"\n",
"\n",
"def _validation_logloss(y_val, preds):\n",
"    \"\"\"Log loss of ``preds`` against ``y_val`` over the full label set.\n",
"\n",
"    The labels are passed explicitly because a chronological validation slice\n",
"    can miss a rare class, and ``log_loss`` raises when ``y_true`` holds fewer\n",
"    classes than the probability matrix has columns.\n",
"    \"\"\"\n",
"    proba = _as_proba_matrix(preds)\n",
"    return log_loss(y_val, proba, labels=list(range(proba.shape[1])))\n",
"\n",
"\n",
"def xgb_objective(trial, X_train, y_train, X_val, y_val, num_class):\n",
"    \"\"\"Optuna objective: validation log loss of one XGBoost configuration.\n",
"\n",
"    Args:\n",
"        trial: Optuna trial used to sample the hyper-parameters.\n",
"        X_train, y_train: training features and integer class labels.\n",
"        X_val, y_val: validation data used for early stopping and scoring.\n",
"        num_class: number of classes; values above 2 select the multiclass\n",
"            objective, otherwise the binary logistic objective is used.\n",
"\n",
"    Returns:\n",
"        Validation log loss at the best early-stopping iteration.\n",
"    \"\"\"\n",
"    is_multiclass = num_class > 2\n",
"    params = {\n",
"        'objective': 'multi:softprob' if is_multiclass else 'binary:logistic',\n",
"        'eval_metric': 'mlogloss' if is_multiclass else 'logloss',\n",
"        'tree_method': 'hist',\n",
"        'device': 'cuda' if USE_GPU else 'cpu',\n",
"        'max_depth': trial.suggest_int('max_depth', 3, 8),\n",
"        'eta': trial.suggest_float('eta', 0.01, 0.15, log=True),\n",
"        'subsample': trial.suggest_float('subsample', 0.6, 1.0),\n",
"        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),\n",
"        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),\n",
"        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),\n",
"        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),\n",
"        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),\n",
"        'random_state': 42,\n",
"    }\n",
"    if is_multiclass:\n",
"        params['num_class'] = num_class\n",
"\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
"    dval = xgb.DMatrix(X_val, label=y_val)\n",
"    booster = xgb.train(params, dtrain, num_boost_round=1000,\n",
"                        evals=[(dval, 'val')], early_stopping_rounds=50,\n",
"                        verbose_eval=False)\n",
"    # The native Booster.predict uses every trained tree, including the rounds\n",
"    # added after the best iteration. Restrict it so the score reflects the\n",
"    # early-stopped model, as lgb_objective does through num_iteration.\n",
"    preds = booster.predict(\n",
"        dval, iteration_range=(0, booster.best_iteration + 1))\n",
"    return _validation_logloss(y_val, preds)\n",
"\n",
"\n",
"def lgb_objective(trial, X_train, y_train, X_val, y_val, num_class):\n",
"    \"\"\"Optuna objective: validation log loss of one LightGBM configuration.\n",
"\n",
"    Args:\n",
"        trial: Optuna trial used to sample the hyper-parameters.\n",
"        X_train, y_train: training features and integer class labels.\n",
"        X_val, y_val: validation data used for early stopping and scoring.\n",
"        num_class: number of classes; values above 2 select the multiclass\n",
"            objective, otherwise the binary objective is used.\n",
"\n",
"    Returns:\n",
"        Validation log loss at the best early-stopping iteration.\n",
"    \"\"\"\n",
"    is_multiclass = num_class > 2\n",
"    params = {\n",
"        'objective': 'multiclass' if is_multiclass else 'binary',\n",
"        'metric': 'multi_logloss' if is_multiclass else 'binary_logloss',\n",
"        'device': 'gpu' if USE_GPU else 'cpu',\n",
"        'max_depth': trial.suggest_int('max_depth', 3, 8),\n",
"        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),\n",
"        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),\n",
"        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),\n",
"        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),\n",
"        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),\n",
"        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True),\n",
"        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),\n",
"        'random_state': 42, 'verbose': -1,\n",
"    }\n",
"    if is_multiclass:\n",
"        params['num_class'] = num_class\n",
"\n",
"    train_data = lgb.Dataset(X_train, label=y_train)\n",
"    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)\n",
"    booster = lgb.train(params, train_data, num_boost_round=1000,\n",
"                        valid_sets=[val_data], valid_names=['val'],\n",
"                        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])\n",
"    preds = booster.predict(X_val, num_iteration=booster.best_iteration)\n",
"    return _validation_logloss(y_val, preds)\n",
"\n",
"\n",
"print('Fonksiyonlar hazır.')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 8) Main training loop\n",
"\n",
"\n",
"def to_proba_matrix(preds):\n",
"    \"\"\"Return predictions as an (n_samples, n_classes) probability matrix.\n",
"\n",
"    A 1-D array (binary objective) is expanded to the columns ``[1 - p, p]``;\n",
"    2-D input is returned unchanged.\n",
"    \"\"\"\n",
"    if preds.ndim == 1:\n",
"        return np.column_stack([1 - preds, preds])\n",
"    return preds\n",
"\n",
"\n",
"def fit_isotonic(raw_probs, y_true, num_class):\n",
"    \"\"\"Fit one isotonic calibrator per class column (one-vs-rest).\n",
"\n",
"    Args:\n",
"        raw_probs: (n_samples, num_class) uncalibrated probabilities.\n",
"        y_true: integer class labels of the same rows.\n",
"        num_class: number of class columns to calibrate.\n",
"\n",
"    Returns:\n",
"        List of fitted IsotonicRegression objects, indexed by class.\n",
"    \"\"\"\n",
"    calibrators = []\n",
"    for cls_idx in range(num_class):\n",
"        iso = IsotonicRegression(out_of_bounds='clip')\n",
"        iso.fit(raw_probs[:, cls_idx], (y_true == cls_idx).astype(float))\n",
"        calibrators.append(iso)\n",
"    return calibrators\n",
"\n",
"\n",
"def apply_isotonic(calibrators, raw_probs):\n",
"    \"\"\"Calibrate each class column and renormalise rows to sum to 1.\n",
"\n",
"    Rows for which every calibrator returns 0 cannot be renormalised (0/0\n",
"    would give NaN and make log_loss fail); they fall back to the raw\n",
"    probabilities.\n",
"    NOTE(review): the serving code should apply the same fallback; confirm.\n",
"    \"\"\"\n",
"    calibrated = np.column_stack(\n",
"        [iso.predict(raw_probs[:, idx]) for idx, iso in enumerate(calibrators)])\n",
"    row_sums = calibrated.sum(axis=1, keepdims=True)\n",
"    degenerate = row_sums[:, 0] <= 0\n",
"    if degenerate.any():\n",
"        calibrated[degenerate] = raw_probs[degenerate]\n",
"        row_sums = calibrated.sum(axis=1, keepdims=True)\n",
"    return calibrated / row_sums\n",
"\n",
"\n",
"def evaluate_probs(y_true, probs, class_labels):\n",
"    \"\"\"Return accuracy and log loss (rounded to 4 decimals) for ``probs``.\n",
"\n",
"    ``class_labels`` is passed to log_loss explicitly: a chronological test\n",
"    slice can miss a rare class, and log_loss raises when y_true holds fewer\n",
"    classes than ``probs`` has columns.\n",
"    \"\"\"\n",
"    preds = np.argmax(probs, axis=1)\n",
"    acc = accuracy_score(y_true, preds)\n",
"    ll = log_loss(y_true, probs, labels=class_labels)\n",
"    return {'accuracy': round(float(acc), 4), 'logloss': round(float(ll), 4)}\n",
"\n",
"\n",
"all_metrics = {\n",
"    'trained_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),\n",
"    'trainer': 'v25_pro_colab',\n",
"    'optuna_trials': N_TRIALS,\n",
"    'total_features': len(FEATURES),\n",
"    'gpu': USE_GPU,\n",
"    'markets': {},\n",
"}\n",
"\n",
"available_features = [f for f in FEATURES if f in df.columns]\n",
"total_markets = len(MARKET_CONFIGS)\n",
"start_time = time.time()\n",
"\n",
"for mi, config in enumerate(MARKET_CONFIGS):\n",
"    target = config['target']\n",
"    market_name = config['name']\n",
"    num_class = config['num_class']\n",
"    class_labels = list(range(num_class))\n",
"    is_multiclass = num_class > 2\n",
"    market_start = time.time()\n",
"\n",
"    print(f\"\\n{'='*60}\")\n",
"    print(f\"[{mi+1}/{total_markets}] {market_name} (classes={num_class})\")\n",
"    print(f\"{'='*60}\")\n",
"\n",
"    if target not in df.columns:\n",
"        print(f' SKIP: {target} not in data')\n",
"        continue\n",
"\n",
"    # Keep only rows that carry a label for this market\n",
"    valid_df = df[df[target].notna()].copy()\n",
"    valid_df = valid_df[valid_df[target].astype(str) != ''].copy()\n",
"\n",
"    if len(valid_df) < 500:\n",
"        print(f' SKIP: only {len(valid_df)} samples')\n",
"        continue\n",
"\n",
"    splits = temporal_split_4way(valid_df)\n",
"    X_train, X_val, X_cal, X_test = (\n",
"        part[available_features].values for part in splits)\n",
"    y_train, y_val, y_cal, y_test = (\n",
"        part[target].astype(int).values for part in splits)\n",
"\n",
"    print(f' Samples: {len(valid_df)} | Split: {len(X_train)}/{len(X_val)}/{len(X_cal)}/{len(X_test)}')\n",
"    print(f' Features: {len(available_features)}')\n",
"\n",
"    # --- Optuna search: XGBoost ---\n",
"    print(f' XGBoost Optuna ({N_TRIALS} trials)...', end=' ', flush=True)\n",
"    t0 = time.time()\n",
"    xgb_study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))\n",
"    xgb_study.optimize(\n",
"        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),\n",
"        n_trials=N_TRIALS)\n",
"    xgb_best = xgb_study.best_params\n",
"    print(f'done ({time.time()-t0:.0f}s) best={xgb_study.best_value:.4f}')\n",
"\n",
"    # --- Optuna search: LightGBM ---\n",
"    print(f' LightGBM Optuna ({N_TRIALS} trials)...', end=' ', flush=True)\n",
"    t0 = time.time()\n",
"    lgb_study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))\n",
"    lgb_study.optimize(\n",
"        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),\n",
"        n_trials=N_TRIALS)\n",
"    lgb_best = lgb_study.best_params\n",
"    print(f'done ({time.time()-t0:.0f}s) best={lgb_study.best_value:.4f}')\n",
"\n",
"    # --- Final XGBoost with the best hyper-parameters ---\n",
"    print(f' Training final XGBoost...', end=' ', flush=True)\n",
"    xgb_params = {\n",
"        'objective': 'multi:softprob' if is_multiclass else 'binary:logistic',\n",
"        'eval_metric': 'mlogloss' if is_multiclass else 'logloss',\n",
"        'tree_method': 'hist',\n",
"        'device': 'cuda' if USE_GPU else 'cpu',\n",
"        'random_state': 42,\n",
"        **xgb_best,\n",
"    }\n",
"    if is_multiclass:\n",
"        xgb_params['num_class'] = num_class\n",
"\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train)\n",
"    dval = xgb.DMatrix(X_val, label=y_val)\n",
"    # Early stopping follows the last entry of `evals`, i.e. the validation set\n",
"    xgb_model = xgb.train(\n",
"        xgb_params, dtrain, num_boost_round=1500,\n",
"        evals=[(dtrain, 'train'), (dval, 'val')],\n",
"        early_stopping_rounds=80, verbose_eval=False)\n",
"    print(f'iter={xgb_model.best_iteration} score={xgb_model.best_score:.4f}')\n",
"\n",
"    # --- Final LightGBM with the best hyper-parameters ---\n",
"    print(f' Training final LightGBM...', end=' ', flush=True)\n",
"    lgb_params = {\n",
"        'objective': 'multiclass' if is_multiclass else 'binary',\n",
"        'metric': 'multi_logloss' if is_multiclass else 'binary_logloss',\n",
"        'device': 'gpu' if USE_GPU else 'cpu',\n",
"        'random_state': 42, 'verbose': -1,\n",
"        **lgb_best,\n",
"    }\n",
"    if is_multiclass:\n",
"        lgb_params['num_class'] = num_class\n",
"\n",
"    lgb_train_data = lgb.Dataset(X_train, label=y_train)\n",
"    lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)\n",
"    lgb_model = lgb.train(\n",
"        lgb_params, lgb_train_data, num_boost_round=1500,\n",
"        valid_sets=[lgb_train_data, lgb_val_data],\n",
"        valid_names=['train', 'val'],\n",
"        callbacks=[lgb.early_stopping(80), lgb.log_evaluation(0)])\n",
"    print(f'iter={lgb_model.best_iteration}')\n",
"\n",
"    # --- Isotonic calibration on the held-out calibration slice ---\n",
"    # NOTE(review): the native Booster.predict uses every trained tree, not\n",
"    # only those up to best_iteration. The saved XGBoost model contains the\n",
"    # same trees, so the calibrators match it as long as serving also\n",
"    # predicts with the full model; confirm against the serving code.\n",
"    print(f' Isotonic calibration...', end=' ', flush=True)\n",
"    xgb_cal_raw = to_proba_matrix(xgb_model.predict(xgb.DMatrix(X_cal)))\n",
"    xgb_iso = fit_isotonic(xgb_cal_raw, y_cal, num_class)\n",
"\n",
"    lgb_cal_raw = to_proba_matrix(\n",
"        lgb_model.predict(X_cal, num_iteration=lgb_model.best_iteration))\n",
"    lgb_iso = fit_isotonic(lgb_cal_raw, y_cal, num_class)\n",
"    print(f'{num_class} classes done')\n",
"\n",
"    # --- Test evaluation ---\n",
"    xgb_raw = to_proba_matrix(xgb_model.predict(xgb.DMatrix(X_test)))\n",
"    xgb_cal_p = apply_isotonic(xgb_iso, xgb_raw)\n",
"\n",
"    lgb_raw = to_proba_matrix(\n",
"        lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration))\n",
"    lgb_cal_p = apply_isotonic(lgb_iso, lgb_raw)\n",
"\n",
"    # Ensembles are plain averages of the two models\n",
"    raw_ens = (xgb_raw + lgb_raw) / 2\n",
"    cal_ens = (xgb_cal_p + lgb_cal_p) / 2\n",
"\n",
"    # (console title, metrics key, probabilities); the order fixes both the\n",
"    # printed table and the key order in the JSON report\n",
"    variants = [\n",
"        ('XGB Raw', 'test_xgb_raw', xgb_raw),\n",
"        ('XGB Cal', 'test_xgb_cal', xgb_cal_p),\n",
"        ('LGB Raw', 'test_lgb_raw', lgb_raw),\n",
"        ('LGB Cal', 'test_lgb_cal', lgb_cal_p),\n",
"        ('Ens Raw', 'test_ensemble_raw', raw_ens),\n",
"        ('Ens Cal', 'test_ensemble_calibrated', cal_ens),\n",
"    ]\n",
"    test_metrics = {\n",
"        key: evaluate_probs(y_test, probs, class_labels)\n",
"        for _, key, probs in variants\n",
"    }\n",
"\n",
"    print(f' ── Test Results ──')\n",
"    for title, key, _ in variants:\n",
"        acc, ll = test_metrics[key]['accuracy'], test_metrics[key]['logloss']\n",
"        print(f' {title}: Acc={acc:.4f} LL={ll:.4f}')\n",
"\n",
"    ens_preds = np.argmax(raw_ens, axis=1)\n",
"    print(f'\\n Classification Report:')\n",
"    print(classification_report(y_test, ens_preds))\n",
"\n",
"    # --- Save models and calibrators ---\n",
"    mn = market_name.lower()\n",
"    xgb_model.save_model(os.path.join(MODELS_DIR, f'xgb_v25_{mn}.json'))\n",
"    lgb_model.save_model(os.path.join(MODELS_DIR, f'lgb_v25_{mn}.txt'))\n",
"    with open(os.path.join(MODELS_DIR, f'iso_xgb_v25_{mn}.pkl'), 'wb') as f:\n",
"        pickle.dump(xgb_iso, f)\n",
"    with open(os.path.join(MODELS_DIR, f'iso_lgb_v25_{mn}.pkl'), 'wb') as f:\n",
"        pickle.dump(lgb_iso, f)\n",
"\n",
"    # Progress: the remaining time is extrapolated from the average so far\n",
"    elapsed = time.time() - market_start\n",
"    total_elapsed = time.time() - start_time\n",
"    avg_per_market = total_elapsed / (mi + 1)\n",
"    remaining = avg_per_market * (total_markets - mi - 1)\n",
"    print(f' Saved! ({elapsed:.0f}s) | Toplam: {total_elapsed/60:.1f}dk | Kalan: ~{remaining/60:.0f}dk')\n",
"\n",
"    all_metrics['markets'][market_name] = {\n",
"        'samples': int(len(valid_df)),\n",
"        'train': int(len(X_train)),\n",
"        'features_used': len(available_features),\n",
"        'xgb_best_iteration': int(xgb_model.best_iteration),\n",
"        'lgb_best_iteration': int(lgb_model.best_iteration),\n",
"        'xgb_optuna_best': round(float(xgb_study.best_value), 4),\n",
"        'lgb_optuna_best': round(float(lgb_study.best_value), 4),\n",
"        **test_metrics,\n",
"        'elapsed_seconds': round(elapsed, 1),\n",
"    }\n",
"\n",
"# Save the feature column order used for training\n",
"with open(os.path.join(MODELS_DIR, 'feature_cols.json'), 'w') as f:\n",
"    json.dump(available_features, f, indent=2)\n",
"\n",
"# Save the metrics report\n",
"with open(os.path.join(REPORTS_DIR, 'v25_pro_metrics.json'), 'w') as f:\n",
"    json.dump(all_metrics, f, indent=2, default=str)\n",
"\n",
"total_time = time.time() - start_time\n",
"print(f\"\\n{'='*60}\")\n",
"print(f'TAMAMLANDI! Toplam süre: {total_time/60:.1f} dakika')\n",
"print(f\"{'='*60}\")\n",
"for name, m in all_metrics['markets'].items():\n",
"    # Every stored market entry has calibrated ensemble metrics\n",
"    ens = m['test_ensemble_calibrated']\n",
"    print(f\" {name:12s} | Acc={ens['accuracy']:.4f} | LL={ens['logloss']:.4f}\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# 9) Zip the models, copy them to Drive and download the archive\n",
"import shutil\n",
"\n",
"from google.colab import files\n",
"\n",
"zip_path = shutil.make_archive('/content/v25_models', 'zip', MODELS_DIR)\n",
"print(f'ZIP: {zip_path} ({os.path.getsize(zip_path)/1024/1024:.1f} MB)')\n",
"\n",
"# Also copy to Drive. The folder may not exist when the CSV was uploaded\n",
"# directly instead of being read from Drive, so create it first.\n",
"drive_dir = '/content/drive/MyDrive/iddaai'\n",
"os.makedirs(drive_dir, exist_ok=True)\n",
"\n",
"drive_out = os.path.join(drive_dir, 'v25_models.zip')\n",
"shutil.copy2(zip_path, drive_out)\n",
"print(f'Drive kopyası: {drive_out}')\n",
"\n",
"# Copy the metrics report as well\n",
"shutil.copy2(os.path.join(REPORTS_DIR, 'v25_pro_metrics.json'),\n",
"             os.path.join(drive_dir, 'v25_pro_metrics.json'))\n",
"\n",
"# Download\n",
"files.download(zip_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sunucuya Yükleme\n",
"\n",
"ZIP indirdikten sonra sunucuya yüklemek için:\n",
"\n",
"```bash\n",
"# 1) ZIP'i sunucuya kopyala\n",
"scp -P 2222 v25_models.zip haruncan@95.70.252.214:~/\n",
"\n",
"# 2) Docker container'a kopyala\n",
"docker cp ~/v25_models.zip 85b57a7291df:/app/models/\n",
"\n",
"# 3) Container içinde aç\n",
"docker exec 85b57a7291df bash -c 'cd /app/models/v25 && unzip -o /app/models/v25_models.zip'\n",
"\n",
"# 4) Container'ı restart et\n",
"docker restart 85b57a7291df\n",
"```"
]
}
]
}