259 lines
11 KiB
Plaintext
259 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# League-Specific Model Trainer \u2014 Google Colab\n",
|
|
"164 lig i\u00e7in XGBoost + isotonic kalibrasyon. 12 market.\n",
|
|
"Modeller Drive'a kaydedilir, `models/league_specific/` klas\u00f6r\u00fcne kopyalan\u0131r.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Mount Google Drive and make sure the project folder exists.\n",
"import os\n",
"\n",
"from google.colab import drive\n",
"\n",
"drive.mount('/content/drive')\n",
"\n",
"# Inputs and model artifacts of this notebook are kept under this Drive folder.\n",
"DRIVE_DIR = '/content/drive/MyDrive/iddaai'\n",
"os.makedirs(DRIVE_DIR, exist_ok=True)\n",
"print('Drive mounted:', DRIVE_DIR)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# training_data.csv is already on Drive: /content/drive/MyDrive/iddaai/training_data.csv\n",
"# Upload only qualified_leagues.json (found in the iddaai-be/ folder).\n",
"import shutil\n",
"\n",
"from google.colab import files\n",
"\n",
"print(\"qualified_leagues.json dosyasini upload edin\")\n",
"uploaded = files.upload()\n",
"# Each key of `uploaded` is a file name; copy that file into the Drive folder.\n",
"for uploaded_name in uploaded:\n",
"    drive_copy = f\"{DRIVE_DIR}/{uploaded_name}\"\n",
"    shutil.copy(uploaded_name, drive_copy)\n",
"    print(f\"Kaydedildi: {drive_copy}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Upload training_data.csv from the local machine, unless it is already on Drive.\n",
"import os\n",
"import shutil\n",
"\n",
"from google.colab import files\n",
"\n",
"csv_on_drive = f'{DRIVE_DIR}/training_data.csv'\n",
"if os.path.exists(csv_on_drive):\n",
"    # Skipping keeps Restart & Run All from blocking on a redundant upload prompt.\n",
"    # Delete the file on Drive first if a fresh copy has to be uploaded.\n",
"    print(f'Already on Drive, skipping upload: {csv_on_drive}')\n",
"else:\n",
"    print('training_data.csv upload edin (ai-engine/data/training_data.csv)')\n",
"    uploaded = files.upload()\n",
"    # Each key of `uploaded` is a file name; copy that file into the Drive folder.\n",
"    for fname in uploaded:\n",
"        shutil.copy(fname, f'{DRIVE_DIR}/{fname}')\n",
"        print(f'Saved: {DRIVE_DIR}/{fname}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import os, json, pickle, time, warnings\n",
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"from sklearn.isotonic import IsotonicRegression\n",
"from sklearn.metrics import accuracy_score, log_loss\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Paths: inputs are read from, and models are written to, the Drive project folder.\n",
"DRIVE_DIR = '/content/drive/MyDrive/iddaai'\n",
"DATA_PATH = f'{DRIVE_DIR}/training_data.csv'\n",
"QL_PATH = f'{DRIVE_DIR}/qualified_leagues.json'\n",
"MODELS_DIR = f'{DRIVE_DIR}/league_specific'\n",
"os.makedirs(MODELS_DIR, exist_ok=True)\n",
"\n",
"# Per-market settings: label column, number of classes, and the minimum number\n",
"# of training rows a league needs before a model is trained for that market.\n",
"MARKETS = {\n",
"    'MS': {'label': 'label_ms', 'num_class': 3, 'min_samples': 200},\n",
"    'OU15': {'label': 'label_ou15', 'num_class': 2, 'min_samples': 150},\n",
"    'OU25': {'label': 'label_ou25', 'num_class': 2, 'min_samples': 150},\n",
"    'OU35': {'label': 'label_ou35', 'num_class': 2, 'min_samples': 150},\n",
"    'BTTS': {'label': 'label_btts', 'num_class': 2, 'min_samples': 150},\n",
"    'HT': {'label': 'label_ht_result', 'num_class': 3, 'min_samples': 150},\n",
"    'HT_OU05': {'label': 'label_ht_ou05', 'num_class': 2, 'min_samples': 150},\n",
"    'HT_OU15': {'label': 'label_ht_ou15', 'num_class': 2, 'min_samples': 150},\n",
"    'HTFT': {'label': 'label_ht_ft', 'num_class': 9, 'min_samples': 300},\n",
"    'OE': {'label': 'label_odd_even', 'num_class': 2, 'min_samples': 150},\n",
"    'CARDS': {'label': 'label_cards_ou45', 'num_class': 2, 'min_samples': 150},\n",
"    'HANDICAP': {'label': 'label_handicap_ms', 'num_class': 3, 'min_samples': 200},\n",
"}\n",
"\n",
"# Columns that are never fed to a model: id columns, `mst_utc` (the sort key of\n",
"# the train/test split), score columns and every label column.\n",
"SKIP_COLS = {\n",
"    'match_id','home_team_id','away_team_id','league_id','mst_utc',\n",
"    'score_home','score_away','total_goals','ht_score_home','ht_score_away','ht_total_goals',\n",
"    'label_ms','label_ou05','label_ou15','label_ou25','label_ou35','label_btts',\n",
"    'label_ht_result','label_ht_ou05','label_ht_ou15','label_ht_ft',\n",
"    'label_odd_even','label_yellow_cards','label_cards_ou45','label_handicap_ms',\n",
"}\n",
"\n",
"# XGBoost parameters shared by all markets; objective and eval_metric are added\n",
"# per market at training time.\n",
"XGB_BASE = {\n",
"    'max_depth': 4, 'eta': 0.05, 'subsample': 0.8,\n",
"    'colsample_bytree': 0.8, 'min_child_weight': 5,\n",
"    'gamma': 0.1, 'reg_lambda': 1.0, 'verbosity': 0, 'seed': 42,\n",
"    'nthread': -1,\n",
"}\n",
"\n",
"# League-size thresholds, in rows of training data per league.\n",
"FULL_MODEL_MIN_MATCHES = 500  # at or above: train full models\n",
"CAL_ONLY_MIN_MATCHES = 100    # from here up to the full threshold: calibration-only\n",
"\n",
"df = pd.read_csv(DATA_PATH, low_memory=False)\n",
"feature_cols = [c for c in df.columns if c not in SKIP_COLS]\n",
"print(f'Y\u00fcklendi: {len(df):,} sat\u0131r | {len(feature_cols)} feature')\n",
"\n",
"# Restrict to the qualified leagues when the list is on Drive; otherwise fall\n",
"# back to every league present in the data.\n",
"if os.path.exists(QL_PATH):\n",
"    # Context manager so the handle is closed instead of left to the GC.\n",
"    with open(QL_PATH) as ql_file:\n",
"        qualified = json.load(ql_file)\n",
"else:\n",
"    qualified = df['league_id'].unique().tolist()\n",
"\n",
"counts = df[df['league_id'].isin(qualified)].groupby('league_id').size()\n",
"full_ids = counts[counts >= FULL_MODEL_MIN_MATCHES].index.tolist()\n",
"cal_ids = counts[(counts >= CAL_ONLY_MIN_MATCHES) & (counts < FULL_MODEL_MIN_MATCHES)].index.tolist()\n",
"print(f'Tam model: {len(full_ids)} | Kalibrasyon: {len(cal_ids)} | Toplam: {len(full_ids)+len(cal_ids)}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def train_one_league(league_id, df_league, feature_cols, full_model):\n",
"    \"\"\"Train and persist the per-market models of a single league.\n",
"\n",
"    Rows are ordered by `mst_utc` and split 80/20 into a training part and a\n",
"    hold-out part. For every market in MARKETS whose label column is present:\n",
"\n",
"    * full-model leagues get an XGBoost booster (early-stopped on the hold-out),\n",
"      isotonic calibrators fitted on the hold-out predictions, and a hold-out\n",
"      accuracy figure;\n",
"    * calibration-only leagues get a placeholder metrics entry and no model.\n",
"\n",
"    Files written under `{MODELS_DIR}/{league_id}`: `xgb_<market>.json`,\n",
"    `cal_<market>[_<class>].pkl`, `feature_cols.json` (once, with the first\n",
"    trained market) and `metrics.json`.\n",
"\n",
"    Args:\n",
"        league_id: League identifier; also used as the output folder name.\n",
"        df_league: Training rows that belong to this league.\n",
"        feature_cols: Names of the feature columns fed to the models.\n",
"        full_model: True to train boosters, False to write placeholders only.\n",
"\n",
"    Returns:\n",
"        dict mapping market name to its metrics (the `markets` part of\n",
"        metrics.json).\n",
"    \"\"\"\n",
"    n = len(df_league)\n",
"    out_dir = f'{MODELS_DIR}/{league_id}'\n",
"    os.makedirs(out_dir, exist_ok=True)\n",
"    metrics = {}\n",
"\n",
"    # Split by sort order on mst_utc: the first 80% of rows train, the rest is held out.\n",
"    df_sorted = df_league.sort_values('mst_utc')\n",
"    split = int(n * 0.80)\n",
"    df_tr, df_te = df_sorted.iloc[:split], df_sorted.iloc[split:]\n",
"\n",
"    saved_fc = False\n",
"\n",
"    for market, cfg in MARKETS.items():\n",
"        lbl, nc, ms = cfg['label'], cfg['num_class'], cfg['min_samples']\n",
"        if lbl not in df_league.columns:\n",
"            continue\n",
"\n",
"        if not full_model:\n",
"            # Cal only - store placeholder so prediction knows to use general V25\n",
"            metrics[market] = {'model': 'cal_only', 'n': n}\n",
"            continue\n",
"\n",
"        # Only a missing label disqualifies a row. Dropping on every column would\n",
"        # discard each row with any missing feature and turn the fillna(0) below\n",
"        # into dead code.\n",
"        cols = feature_cols + [lbl]\n",
"        vtr = df_tr[cols].dropna(subset=[lbl])\n",
"        vte = df_te[cols].dropna(subset=[lbl])\n",
"        if len(vtr) < ms or len(vte) < 30:\n",
"            continue\n",
"        Xtr, ytr = vtr[feature_cols].fillna(0).values, vtr[lbl].values.astype(int)\n",
"        Xte, yte = vte[feature_cols].fillna(0).values, vte[lbl].values.astype(int)\n",
"\n",
"        multiclass = nc > 2\n",
"        params = {\n",
"            **XGB_BASE,\n",
"            'objective': 'multi:softprob' if multiclass else 'binary:logistic',\n",
"            'eval_metric': 'mlogloss' if multiclass else 'logloss',\n",
"        }\n",
"        if multiclass:\n",
"            params['num_class'] = nc\n",
"\n",
"        dtr = xgb.DMatrix(Xtr, label=ytr, feature_names=feature_cols)\n",
"        dte = xgb.DMatrix(Xte, label=yte, feature_names=feature_cols)\n",
"        model = xgb.train(params, dtr, 300, [(dte,'v')], early_stopping_rounds=30, verbose_eval=False)\n",
"        model.save_model(f'{out_dir}/xgb_{market.lower()}.json')\n",
"\n",
"        if not saved_fc:\n",
"            # Context managers make sure every artifact is flushed and closed.\n",
"            with open(f'{out_dir}/feature_cols.json', 'w') as fc_file:\n",
"                json.dump(feature_cols, fc_file)\n",
"            saved_fc = True\n",
"\n",
"        # NOTE(review): the hold-out split drives early stopping, calibration and\n",
"        # the accuracy below, so the reported accuracy is likely optimistic.\n",
"        raw = model.predict(dte)\n",
"        if multiclass:\n",
"            raw = raw.reshape(-1, nc)\n",
"            acc = accuracy_score(yte, np.argmax(raw, axis=1))\n",
"            # One-vs-rest calibrator per class.\n",
"            for ci in range(nc):\n",
"                iso = IsotonicRegression(out_of_bounds='clip').fit(raw[:, ci], (yte == ci).astype(int))\n",
"                with open(f'{out_dir}/cal_{market.lower()}_{ci}.pkl', 'wb') as cal_file:\n",
"                    pickle.dump(iso, cal_file)\n",
"        else:\n",
"            acc = accuracy_score(yte, (raw >= 0.5).astype(int))\n",
"            iso = IsotonicRegression(out_of_bounds='clip').fit(raw, yte)\n",
"            with open(f'{out_dir}/cal_{market.lower()}.pkl', 'wb') as cal_file:\n",
"                pickle.dump(iso, cal_file)\n",
"\n",
"        metrics[market] = {'accuracy': round(float(acc), 4), 'n_train': len(Xtr)}\n",
"\n",
"    with open(f'{out_dir}/metrics.json', 'w') as metrics_file:\n",
"        json.dump({'league_id': league_id, 'n': n, 'markets': metrics}, metrics_file, indent=2)\n",
"    return metrics\n",
|
|
"\n",
|
"# Train every selected league: full-model leagues first, then calibration-only ones.\n",
"start = time.time()\n",
"all_ids = [(lid, True) for lid in full_ids] + [(lid, False) for lid in cal_ids]\n",
"total = len(all_ids)\n",
"results = []\n",
"\n",
"for i, (lid, full) in enumerate(all_ids, 1):\n",
"    dfl = df[df['league_id'] == lid].copy()\n",
"    # str() before slicing: league ids may be integers, and slicing an int raises\n",
"    # TypeError - which would also crash the except handler below.\n",
"    tag = str(lid)[:20]\n",
"    t0 = time.time()\n",
"    try:\n",
"        mkt_res = train_one_league(lid, dfl, feature_cols, full)\n",
"        ms_acc = mkt_res.get('MS', {}).get('accuracy', '-')\n",
"        results.append((lid, len(dfl), mkt_res))\n",
"        print(f'[{i:>3}/{total}] {tag:<20} n={len(dfl):>5,} MS={ms_acc} {time.time()-t0:.1f}s')\n",
"    except Exception as e:\n",
"        # Best effort: report the failure and continue with the next league.\n",
"        print(f'[{i:>3}/{total}] {tag:<20} ERROR: {e}')\n",
"\n",
"    # Progress line with a linear time-left estimate every 20 leagues.\n",
"    if i % 20 == 0:\n",
"        el = time.time()-start\n",
"        print(f' \u2500\u2500 {i}/{total} done | {el/60:.1f}min elapsed | ~{el/i*(total-i)/60:.1f}min left \u2500\u2500')\n",
"\n",
"print(f'\\nBitti! {len(results)} lig | {(time.time()-start)/60:.1f} dakika')\n",
"print(f'Modeller: {MODELS_DIR}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Show results: the 30 leagues with the best MS accuracy.\n",
"printable = sorted(\n",
"    [entry for entry in results if 'MS' in entry[2] and 'accuracy' in entry[2]['MS']],\n",
"    key=lambda entry: entry[2]['MS']['accuracy'],\n",
"    reverse=True,\n",
")\n",
"print(f'{\"Liga ID\":<30} {\"Ma\u00e7\":>6} {\"MS\":>7} {\"OU15\":>7} {\"OU25\":>7} {\"BTTS\":>7}')\n",
"print('-'*70)\n",
"for lid, n, m in printable[:30]:\n",
"    # A market without an accuracy entry is shown as 0.0%.\n",
"    ms, ou15, ou25, btts = (\n",
"        m.get(market, {}).get('accuracy', 0)*100\n",
"        for market in ('MS', 'OU15', 'OU25', 'BTTS')\n",
"    )\n",
"    print(f'{lid:<30} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Bundle the whole models folder into a single zip on Drive.\n",
"import shutil\n",
"\n",
"zip_path = f'{DRIVE_DIR}/league_specific_models.zip'\n",
"# make_archive appends the extension itself, so it gets the path without '.zip'.\n",
"archive_base = zip_path.replace('.zip','')\n",
"shutil.make_archive(archive_base, 'zip', MODELS_DIR)\n",
"print(f'Zip: {zip_path}')\n",
"# To download the archive to the local machine:\n",
"# from google.colab import files\n",
"# files.download(zip_path)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.10.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
} |