""" VQWEN Model Training Script (Optimized) ======================================== Fast, efficient, uses all 180k+ matches with rich features. """ import os import sys import json import time import pickle import psycopg2 import pandas as pd import numpy as np from sklearn.model_selection import train_test_split import lightgbm as lgb AI_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT_DIR = os.path.dirname(AI_DIR) sys.path.insert(0, ROOT_DIR) def get_clean_dsn() -> str: return "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db" def train_vqwen(): print("🧠 VQWEN MODEL EĞİTİMİ (OPTIMIZED)") print("="*60) dsn = get_clean_dsn() conn = psycopg2.connect(dsn) cur = conn.cursor() # ─── 1. HIZLI VERİ ÇEKME (Optimized Query) ─── query = """ SELECT m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, -- Odds (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '1' LIMIT 1) as odds_h, (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = 'X' LIMIT 1) as odds_d, (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '2' LIMIT 1) as odds_a, -- Form (Last 5) COALESCE((SELECT AVG(CASE WHEN m2.home_team_id = m.home_team_id AND m2.score_home > m2.score_away THEN 3 WHEN m2.home_team_id = m.home_team_id AND m2.score_home = m2.score_away THEN 1 ELSE 0 END) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc LIMIT 5), 0) as home_form, COALESCE((SELECT AVG(CASE WHEN m2.away_team_id = m.away_team_id AND m2.score_away > m2.score_home THEN 3 WHEN m2.away_team_id = m.away_team_id AND m2.score_away = m2.score_home THEN 1 ELSE 0 END) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc LIMIT 5), 0) as away_form, -- Goal Averages COALESCE((SELECT AVG(m2.score_home) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' LIMIT 10), 1.2) as h_avg_scored, COALESCE((SELECT AVG(m2.score_away) FROM matches m2 WHERE m2.away_team_id = m.home_team_id AND m2.status = 'FT' LIMIT 10), 1.2) as h_avg_conceded, COALESCE((SELECT AVG(m2.score_away) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' LIMIT 10), 1.2) as a_avg_scored, COALESCE((SELECT AVG(m2.score_home) FROM matches m2 WHERE m2.home_team_id = m.away_team_id AND m2.status = 'FT' LIMIT 10), 1.2) as a_avg_conceded, -- Team Stats COALESCE(ts_home.possession_percentage, 50) as h_poss, COALESCE(ts_home.shots_on_target, 4) as h_sot, COALESCE(ts_home.corners, 5) as h_corners, COALESCE(ts_away.possession_percentage, 50) as a_poss, COALESCE(ts_away.shots_on_target, 3) as a_sot, COALESCE(ts_away.corners, 4) as a_corners FROM matches m LEFT JOIN football_team_stats ts_home ON ts_home.match_id = m.id AND ts_home.team_id = m.home_team_id LEFT JOIN football_team_stats ts_away ON ts_away.match_id = m.id AND ts_away.team_id = m.away_team_id WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.sport = 'football' AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id) ORDER BY m.mst_utc DESC LIMIT 200000 """ print("📊 Veritabanından özellikler çekiliyor (Limit 200k)...") start = time.time() cur.execute(query) rows = cur.fetchall() print(f"✅ {len(rows)} maç çekildi ({time.time()-start:.1f}s)") df = pd.DataFrame(rows, columns=[ 'id', 'h_id', 'a_id', 'sh', 'sa', 'oh', 'od', 'oa', 'h_form', 'a_form', 'h_sc', 'h_co', 'a_sc', 'a_co', 'h_poss', 'h_sot', 'h_corn', 'a_poss', 'a_sot', 'a_corn' ]) for col in df.columns[5:]: df[col] = pd.to_numeric(df[col], errors='coerce') df = df.fillna(df.median(numeric_only=True)) # ─── 2. ÖZELLİK MÜHENDİSLİĞİ ─── df['h_xg'] = (df['h_sc'] + df['a_co']) / 2 df['a_xg'] = (df['a_sc'] + df['h_co']) / 2 df['total_xg'] = df['h_xg'] + df['a_xg'] df['h_pow'] = (df['h_form']*10) + (df['h_sc']*5) - (df['h_co']*5) + (df['h_sot']*2) df['a_pow'] = (df['a_form']*10) + (df['a_sc']*5) - (df['a_co']*5) + (df['a_sot']*2) df['pow_diff'] = df['h_pow'] - df['a_pow'] margin = (1/df['oh']) + (1/df['od']) + (1/df['oa']) df['imp_h'] = (1/df['oh']) / margin df['imp_d'] = (1/df['od']) / margin df['imp_a'] = (1/df['oa']) / margin # Targets df['t_ms'] = df.apply(lambda r: 0 if r['sh']>r['sa'] else (2 if r['sh'] 2.5).astype(int) df['t_btts'] = ((df['sh'] > 0) & (df['sa'] > 0)).astype(int) # ─── 3. MODELLER ─── feats_ms = ['h_form', 'a_form', 'h_xg', 'a_xg', 'pow_diff', 'imp_h', 'imp_d', 'imp_a', 'h_sot', 'a_sot'] X_ms, y_ms = df[feats_ms], df['t_ms'] X_tr, X_te, y_tr, y_te = train_test_split(X_ms, y_ms, test_size=0.15, random_state=42) print("🤖 MS Modeli eğitiliyor...") model_ms = lgb.train({'objective': 'multiclass', 'num_class': 3, 'metric': 'multi_logloss', 'verbose': -1, 'num_leaves': 63}, lgb.Dataset(X_tr, y_tr), num_boost_round=1000, valid_sets=[lgb.Dataset(X_te, y_te)], callbacks=[lgb.early_stopping(50)]) feats_ou = ['h_xg', 'a_xg', 'total_xg', 'h_sot', 'a_sot'] print("🤖 OU2.5 Modeli...") model_ou = lgb.train({'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}, lgb.Dataset(df[feats_ou], df['t_ou']), num_boost_round=500) feats_btts = ['h_xg', 'a_xg', 'h_sc', 'a_sc'] print("🤖 BTTS Modeli...") model_btts = lgb.train({'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}, lgb.Dataset(df[feats_btts], df['t_btts']), num_boost_round=500) # ─── 4. KAYDET ─── mdir = os.path.join(ROOT_DIR, 'models', 'vqwen') os.makedirs(mdir, exist_ok=True) for nm, md in [('ms', model_ms), ('ou25', model_ou), ('btts', model_btts)]: p = os.path.join(mdir, f'vqwen_{nm}.pkl') with open(p, 'wb') as f: pickle.dump(md, f) print(f"✅ {p} kaydedildi.") cur.close() conn.close() print("\n🎉 VQWEN EĞİTİMİ BİTTİ!") if __name__ == "__main__": train_vqwen()