{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# League-Specific Model Trainer \u2014 Google Colab\n",
    "\n",
    "Trains XGBoost models with isotonic calibration for 164 leagues across 12 markets.\n",
    "Models are saved to Drive and then copied into the `models/league_specific/` folder.\n",
    "\n",
    "- Leagues with at least 500 matches get up to one model (plus calibrators) per market.\n",
    "- Leagues with 100\u2013499 matches only get a `cal_only` placeholder in `metrics.json`.\n",
    "- Run the cells top to bottom; inputs that already exist on Drive are not uploaded again."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live here so a fresh kernel only needs one cell to resolve dependencies.\n",
    "import json\n",
    "import os\n",
    "import pickle\n",
    "import shutil\n",
    "import time\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "from google.colab import drive, files\n",
    "from sklearn.isotonic import IsotonicRegression\n",
    "from sklearn.metrics import accuracy_score, log_loss\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mount Drive and define every path the notebook reads from or writes to.\n",
    "drive.mount('/content/drive')\n",
    "\n",
    "DRIVE_DIR = '/content/drive/MyDrive/iddaai'\n",
    "DATA_PATH = f'{DRIVE_DIR}/training_data.csv'\n",
    "QL_PATH = f'{DRIVE_DIR}/qualified_leagues.json'\n",
    "MODELS_DIR = f'{DRIVE_DIR}/league_specific'\n",
    "# Set to True to replace input files that already exist on Drive.\n",
    "FORCE_UPLOAD = False\n",
    "\n",
    "os.makedirs(DRIVE_DIR, exist_ok=True)\n",
    "os.makedirs(MODELS_DIR, exist_ok=True)\n",
    "print('Drive mounted:', DRIVE_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Input files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ensure_on_drive(path, prompt, saved_label):\n",
    "    \"\"\"Ask for a browser upload and copy it into DRIVE_DIR, unless `path` already exists.\n",
    "\n",
    "    Args:\n",
    "        path: Drive location the rest of the notebook reads the file from.\n",
    "        prompt: Message printed before the upload widget appears.\n",
    "        saved_label: Prefix of the confirmation line printed for each copied file.\n",
    "    \"\"\"\n",
    "    if os.path.exists(path) and not FORCE_UPLOAD:\n",
    "        print(f'Already on Drive, upload skipped: {path}')\n",
    "        return\n",
    "    print(prompt)\n",
    "    # Iterating the upload result yields the uploaded file names, which are then\n",
    "    # copied from the working directory to Drive under the same name.\n",
    "    for fname in files.upload():\n",
    "        shutil.copy(fname, f'{DRIVE_DIR}/{fname}')\n",
    "        print(f'{saved_label}: {DRIVE_DIR}/{fname}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# qualified_leagues.json comes from the iddaai-be/ folder. When it is absent the\n",
    "# data-loading cell falls back to every league found in the training data.\n",
    "ensure_on_drive(QL_PATH, 'qualified_leagues.json dosyasini upload edin', 'Kaydedildi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# training_data.csv is normally on Drive already; it is only requested when missing.\n",
    "# The local copy lives at ai-engine/data/training_data.csv.\n",
    "ensure_on_drive(DATA_PATH, 'training_data.csv upload edin (ai-engine/data/training_data.csv)', 'Saved')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per market: label column, number of classes, and the minimum number of\n",
    "# labelled training rows required before a league-specific model is trained.\n",
    "MARKETS = {\n",
    "    'MS':       {'label': 'label_ms',          'num_class': 3, 'min_samples': 200},\n",
    "    'OU15':     {'label': 'label_ou15',        'num_class': 2, 'min_samples': 150},\n",
    "    'OU25':     {'label': 'label_ou25',        'num_class': 2, 'min_samples': 150},\n",
    "    'OU35':     {'label': 'label_ou35',        'num_class': 2, 'min_samples': 150},\n",
    "    'BTTS':     {'label': 'label_btts',        'num_class': 2, 'min_samples': 150},\n",
    "    'HT':       {'label': 'label_ht_result',   'num_class': 3, 'min_samples': 150},\n",
    "    'HT_OU05':  {'label': 'label_ht_ou05',     'num_class': 2, 'min_samples': 150},\n",
    "    'HT_OU15':  {'label': 'label_ht_ou15',     'num_class': 2, 'min_samples': 150},\n",
    "    'HTFT':     {'label': 'label_ht_ft',       'num_class': 9, 'min_samples': 300},\n",
    "    'OE':       {'label': 'label_odd_even',    'num_class': 2, 'min_samples': 150},\n",
    "    'CARDS':    {'label': 'label_cards_ou45',  'num_class': 2, 'min_samples': 150},\n",
    "    'HANDICAP': {'label': 'label_handicap_ms', 'num_class': 3, 'min_samples': 200},\n",
    "}\n",
    "\n",
    "# Identifier, score and label columns that must never be used as model features.\n",
    "SKIP_COLS = {\n",
    "    'match_id', 'home_team_id', 'away_team_id', 'league_id', 'mst_utc',\n",
    "    'score_home', 'score_away', 'total_goals', 'ht_score_home', 'ht_score_away', 'ht_total_goals',\n",
    "    'label_ms', 'label_ou05', 'label_ou15', 'label_ou25', 'label_ou35', 'label_btts',\n",
    "    'label_ht_result', 'label_ht_ou05', 'label_ht_ou15', 'label_ht_ft',\n",
    "    'label_odd_even', 'label_yellow_cards', 'label_cards_ou45', 'label_handicap_ms',\n",
    "}\n",
    "\n",
    "# Booster parameters shared by every market; objective and metric are added per market.\n",
    "XGB_BASE = {\n",
    "    'max_depth': 4, 'eta': 0.05, 'subsample': 0.8,\n",
    "    'colsample_bytree': 0.8, 'min_child_weight': 5,\n",
    "    'gamma': 0.1, 'reg_lambda': 1.0, 'verbosity': 0, 'seed': 42,\n",
    "    'nthread': -1,\n",
    "}\n",
    "\n",
    "FULL_MODEL_MIN_MATCHES = 500  # leagues with at least this many rows get their own models\n",
    "CAL_ONLY_MIN_MATCHES = 100    # leagues below FULL_MODEL_MIN_MATCHES but at least this get a placeholder\n",
    "TRAIN_FRACTION = 0.80         # chronological share used for training; the rest is the hold-out\n",
    "MIN_HOLDOUT_ROWS = 30         # a market is skipped when its hold-out has fewer labelled rows\n",
    "NUM_BOOST_ROUND = 300         # upper bound on boosting rounds\n",
    "EARLY_STOPPING_ROUNDS = 30    # stop when the hold-out metric has not improved for this many rounds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(DATA_PATH, low_memory=False)\n",
    "feature_cols = [c for c in df.columns if c not in SKIP_COLS]\n",
    "print(f'Y\u00fcklendi: {len(df):,} sat\u0131r | {len(feature_cols)} feature')\n",
    "\n",
    "# Without a whitelist file every league present in the data is considered.\n",
    "if os.path.exists(QL_PATH):\n",
    "    with open(QL_PATH) as fh:\n",
    "        qualified = json.load(fh)\n",
    "else:\n",
    "    qualified = df['league_id'].unique().tolist()\n",
    "\n",
    "counts = df[df['league_id'].isin(qualified)].groupby('league_id').size()\n",
    "full_ids = counts[counts >= FULL_MODEL_MIN_MATCHES].index.tolist()\n",
    "cal_ids = counts[(counts >= CAL_ONLY_MIN_MATCHES) & (counts < FULL_MODEL_MIN_MATCHES)].index.tolist()\n",
    "print(f'Tam model: {len(full_ids)} | Kalibrasyon: {len(cal_ids)} | Toplam: {len(full_ids)+len(cal_ids)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _fit_calibrators(raw, y_true, num_class, out_dir, market):\n",
    "    \"\"\"Fit isotonic calibrators on raw scores, pickle them, and return the raw accuracy.\"\"\"\n",
    "    stem = f'{out_dir}/cal_{market.lower()}'\n",
    "    if num_class > 2:\n",
    "        raw = raw.reshape(-1, num_class)\n",
    "        # One calibrator per class, each fitted against that class's 0/1 indicator.\n",
    "        for ci in range(num_class):\n",
    "            iso = IsotonicRegression(out_of_bounds='clip').fit(raw[:, ci], (y_true == ci).astype(int))\n",
    "            with open(f'{stem}_{ci}.pkl', 'wb') as fh:\n",
    "                pickle.dump(iso, fh)\n",
    "        return accuracy_score(y_true, np.argmax(raw, axis=1))\n",
    "\n",
    "    iso = IsotonicRegression(out_of_bounds='clip').fit(raw, y_true)\n",
    "    with open(f'{stem}.pkl', 'wb') as fh:\n",
    "        pickle.dump(iso, fh)\n",
    "    return accuracy_score(y_true, (raw >= 0.5).astype(int))\n",
    "\n",
    "\n",
    "def _train_market(market, cfg, df_tr, df_te, feature_cols, out_dir):\n",
    "    \"\"\"Train, save and calibrate the model for one market; return metrics, or None if skipped.\"\"\"\n",
    "    lbl, nc = cfg['label'], cfg['num_class']\n",
    "\n",
    "    # Only a missing label disqualifies a row. Missing features are zero-filled below;\n",
    "    # a blanket dropna() would discard every row with any gap and make fillna(0) dead code.\n",
    "    vtr = df_tr.dropna(subset=[lbl])\n",
    "    vte = df_te.dropna(subset=[lbl])\n",
    "    if len(vtr) < cfg['min_samples'] or len(vte) < MIN_HOLDOUT_ROWS:\n",
    "        return None\n",
    "    Xtr, ytr = vtr[feature_cols].fillna(0).values, vtr[lbl].values.astype(int)\n",
    "    Xte, yte = vte[feature_cols].fillna(0).values, vte[lbl].values.astype(int)\n",
    "\n",
    "    multiclass = nc > 2\n",
    "    params = {\n",
    "        **XGB_BASE,\n",
    "        'objective': 'multi:softprob' if multiclass else 'binary:logistic',\n",
    "        'eval_metric': 'mlogloss' if multiclass else 'logloss',\n",
    "    }\n",
    "    if multiclass:\n",
    "        params['num_class'] = nc\n",
    "\n",
    "    dtr = xgb.DMatrix(Xtr, label=ytr, feature_names=feature_cols)\n",
    "    dte = xgb.DMatrix(Xte, label=yte, feature_names=feature_cols)\n",
    "    model = xgb.train(params, dtr, NUM_BOOST_ROUND, [(dte, 'v')],\n",
    "                      early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose_eval=False)\n",
    "    model.save_model(f'{out_dir}/xgb_{market.lower()}.json')\n",
    "\n",
    "    # NOTE(review): the same hold-out drives early stopping, fits the calibrators and\n",
    "    # scores accuracy, so the reported accuracy is optimistic. A separate calibration\n",
    "    # split would remove that bias.\n",
    "    acc = _fit_calibrators(model.predict(dte), yte, nc, out_dir, market)\n",
    "    return {'accuracy': round(float(acc), 4), 'n_train': len(Xtr)}\n",
    "\n",
    "\n",
    "def train_one_league(league_id, df_league, feature_cols, full_model):\n",
    "    \"\"\"Train and persist the per-market models of one league.\n",
    "\n",
    "    Artifacts are written to MODELS_DIR/<league_id>/: xgb_<market>.json,\n",
    "    cal_<market>[_<class>].pkl, feature_cols.json (once a model exists) and metrics.json.\n",
    "\n",
    "    Args:\n",
    "        league_id: Identifier used as the output folder name.\n",
    "        df_league: Rows of this league; sorted by 'mst_utc' for the chronological split.\n",
    "        feature_cols: Feature column names, in the order the models are trained on.\n",
    "        full_model: True to train models; False to only record 'cal_only' placeholders.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping market name to its metrics. Markets whose label column is absent,\n",
    "        or that have too few labelled rows, are left out.\n",
    "    \"\"\"\n",
    "    n = len(df_league)\n",
    "    out_dir = f'{MODELS_DIR}/{league_id}'\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "    metrics = {}\n",
    "\n",
    "    df_tr = df_te = None\n",
    "    if full_model:\n",
    "        # Chronological split: the most recent matches form the hold-out.\n",
    "        df_sorted = df_league.sort_values('mst_utc')\n",
    "        split = int(n * TRAIN_FRACTION)\n",
    "        df_tr, df_te = df_sorted.iloc[:split], df_sorted.iloc[split:]\n",
    "\n",
    "    saved_fc = False\n",
    "    for market, cfg in MARKETS.items():\n",
    "        if cfg['label'] not in df_league.columns:\n",
    "            continue\n",
    "\n",
    "        if not full_model:\n",
    "            # Cal only: placeholder so prediction knows to use the general V25 model.\n",
    "            metrics[market] = {'model': 'cal_only', 'n': n}\n",
    "            continue\n",
    "\n",
    "        market_metrics = _train_market(market, cfg, df_tr, df_te, feature_cols, out_dir)\n",
    "        if market_metrics is None:\n",
    "            continue\n",
    "        metrics[market] = market_metrics\n",
    "\n",
    "        if not saved_fc:\n",
    "            # Written once per league, next to the first model that was produced.\n",
    "            with open(f'{out_dir}/feature_cols.json', 'w') as fh:\n",
    "                json.dump(feature_cols, fh)\n",
    "            saved_fc = True\n",
    "\n",
    "    with open(f'{out_dir}/metrics.json', 'w') as fh:\n",
    "        json.dump({'league_id': league_id, 'n': n, 'markets': metrics}, fh, indent=2)\n",
    "    return metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "all_ids = [(lid, True) for lid in full_ids] + [(lid, False) for lid in cal_ids]\n",
    "results = []\n",
    "failed = []\n",
    "\n",
    "for i, (lid, full) in enumerate(all_ids, 1):\n",
    "    dfl = df[df['league_id'] == lid].copy()\n",
    "    # League ids read from CSV may be integers, which cannot be sliced directly.\n",
    "    tag = str(lid)[:20]\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        mkt_res = train_one_league(lid, dfl, feature_cols, full)\n",
    "    except Exception as e:\n",
    "        # Best effort: one failing league must not abort the remaining ones.\n",
    "        failed.append((lid, repr(e)))\n",
    "        print(f'[{i:>3}/{len(all_ids)}] {tag:<20} ERROR: {e}')\n",
    "    else:\n",
    "        ms_acc = mkt_res.get('MS', {}).get('accuracy', '-')\n",
    "        results.append((lid, len(dfl), mkt_res))\n",
    "        print(f'[{i:>3}/{len(all_ids)}] {tag:<20} n={len(dfl):>5,} MS={ms_acc} {time.time()-t0:.1f}s')\n",
    "\n",
    "    if i % 20 == 0:\n",
    "        el = time.time()-start\n",
    "        print(f' \u2500\u2500 {i}/{len(all_ids)} done | {el/60:.1f}min elapsed | ~{el/i*(len(all_ids)-i)/60:.1f}min left \u2500\u2500')\n",
    "\n",
    "print(f'\\nBitti! {len(results)} lig | {(time.time()-start)/60:.1f} dakika')\n",
    "print(f'Modeller: {MODELS_DIR}')\n",
    "if failed:\n",
    "    print(f'Failed leagues: {len(failed)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the 30 leagues with the highest MS accuracy.\n",
    "def fmt_acc(market_metrics, market):\n",
    "    \"\"\"Return a 7-character accuracy column: a percentage, or '-' if the market was not trained.\"\"\"\n",
    "    acc = market_metrics.get(market, {}).get('accuracy')\n",
    "    if acc is None:\n",
    "        return '-'.rjust(7)\n",
    "    return f'{acc*100:>6.1f}%'\n",
    "\n",
    "\n",
    "printable = [(lid, n, m) for lid, n, m in results if 'MS' in m and 'accuracy' in m['MS']]\n",
    "printable.sort(key=lambda x: x[2]['MS']['accuracy'], reverse=True)\n",
    "print(f'{\"Liga ID\":<30} {\"Ma\u00e7\":>6} {\"MS\":>7} {\"OU15\":>7} {\"OU25\":>7} {\"BTTS\":>7}')\n",
    "print('-'*70)\n",
    "for lid, n, m in printable[:30]:\n",
    "    cols = ' '.join(fmt_acc(m, market) for market in ('MS', 'OU15', 'OU25', 'BTTS'))\n",
    "    print(f'{str(lid):<30} {n:>6,} {cols}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Zip the models folder next to it on Drive; make_archive appends '.zip' itself and\n",
    "# returns the path of the archive it wrote.\n",
    "zip_path = shutil.make_archive(f'{DRIVE_DIR}/league_specific_models', 'zip', MODELS_DIR)\n",
    "print(f'Zip: {zip_path}')\n",
    "# To download the archive to the local machine:\n",
    "# files.download(zip_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}