This commit is contained in:
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
VQWEN Model Training Script (Optimized)
|
||||
========================================
|
||||
Fast, efficient, uses all 180k+ matches with rich features.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import pickle
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
import lightgbm as lgb
|
||||
|
||||
# Absolute path of the directory that contains this script.
AI_DIR = os.path.abspath(os.path.dirname(__file__))

# Parent directory of AI_DIR.
ROOT_DIR = os.path.split(AI_DIR)[0]

# Prepend ROOT_DIR to the module search path (position 0).
sys.path[:0] = [ROOT_DIR]
|
||||
|
||||
def get_clean_dsn() -> str:
    """Return the PostgreSQL connection URI used for training.

    The ``VQWEN_DSN`` environment variable takes precedence when it is set
    to a non-empty value; otherwise the built-in default URI is returned,
    so existing callers keep working without any configuration.

    Returns:
        A ``postgresql://`` connection URI string.
    """
    # NOTE(review): the fallback embeds a username and password in source
    # control. Prefer supplying VQWEN_DSN and rotating this credential.
    default_dsn = "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db"
    return os.environ.get("VQWEN_DSN") or default_dsn
|
||||
|
||||
# Column names assigned to the rows returned by _VQWEN_FEATURE_QUERY,
# in SELECT-list order.
_VQWEN_COLUMNS = [
    'id', 'h_id', 'a_id', 'sh', 'sa', 'oh', 'od', 'oa',
    'h_form', 'a_form', 'h_sc', 'h_co', 'a_sc', 'a_co',
    'h_poss', 'h_sot', 'h_corn', 'a_poss', 'a_sot', 'a_corn',
]

# One row per finished football match that has at least one odds category.
#
# Rolling features aggregate over an ordered, limited derived table: a LIMIT
# placed directly on an aggregate subquery is applied after AVG() has already
# collapsed the rows, so it does not restrict the window. Every rolling
# feature only looks at matches that kicked off before the current one.
#
# NOTE(review): the football_team_stats joins use ts.match_id = m.id, so the
# possession / shots-on-target / corner columns appear to be that match's own
# statistics. If so they are not known before kick-off and leak the outcome
# into training - confirm and replace with historical averages if needed.
_VQWEN_FEATURE_QUERY = """
    SELECT
        m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away,
        -- Odds
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
         WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '1' LIMIT 1) as odds_h,
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
         WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = 'X' LIMIT 1) as odds_d,
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
         WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '2' LIMIT 1) as odds_a,
        -- Form (average points over the last 5 earlier matches at the same venue)
        COALESCE((SELECT AVG(r.pts) FROM (
            SELECT CASE WHEN m2.score_home > m2.score_away THEN 3
                        WHEN m2.score_home = m2.score_away THEN 1
                        ELSE 0 END AS pts
            FROM matches m2
            WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 5) r), 0) as home_form,
        COALESCE((SELECT AVG(r.pts) FROM (
            SELECT CASE WHEN m2.score_away > m2.score_home THEN 3
                        WHEN m2.score_away = m2.score_home THEN 1
                        ELSE 0 END AS pts
            FROM matches m2
            WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 5) r), 0) as away_form,
        -- Goal averages (last 10 earlier matches at the same venue)
        COALESCE((SELECT AVG(r.g) FROM (
            SELECT m2.score_home AS g FROM matches m2
            WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 10) r), 1.2) as h_avg_scored,
        COALESCE((SELECT AVG(r.g) FROM (
            SELECT m2.score_away AS g FROM matches m2
            WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 10) r), 1.2) as h_avg_conceded,
        COALESCE((SELECT AVG(r.g) FROM (
            SELECT m2.score_away AS g FROM matches m2
            WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 10) r), 1.2) as a_avg_scored,
        COALESCE((SELECT AVG(r.g) FROM (
            SELECT m2.score_home AS g FROM matches m2
            WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc
            ORDER BY m2.mst_utc DESC LIMIT 10) r), 1.2) as a_avg_conceded,
        -- Team Stats
        COALESCE(ts_home.possession_percentage, 50) as h_poss,
        COALESCE(ts_home.shots_on_target, 4) as h_sot,
        COALESCE(ts_home.corners, 5) as h_corners,
        COALESCE(ts_away.possession_percentage, 50) as a_poss,
        COALESCE(ts_away.shots_on_target, 3) as a_sot,
        COALESCE(ts_away.corners, 4) as a_corners
    FROM matches m
    LEFT JOIN football_team_stats ts_home ON ts_home.match_id = m.id AND ts_home.team_id = m.home_team_id
    LEFT JOIN football_team_stats ts_away ON ts_away.match_id = m.id AND ts_away.team_id = m.away_team_id
    WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.score_away IS NOT NULL
      AND m.sport = 'football'
      AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
    ORDER BY m.mst_utc DESC
    LIMIT 200000
"""


def _fetch_vqwen_rows(dsn):
    """Run the feature query and return all rows, always closing the connection."""
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.execute(_VQWEN_FEATURE_QUERY)
            return cur.fetchall()
    finally:
        # psycopg2's ``with conn`` only ends the transaction; close explicitly.
        conn.close()


def _build_vqwen_frame(rows):
    """Turn raw query rows into a DataFrame with engineered features and targets."""
    df = pd.DataFrame(rows, columns=_VQWEN_COLUMNS)

    # Odds and aggregates may arrive as Decimal/str; coerce everything after
    # the score columns to float, then fill gaps with the column median.
    for col in _VQWEN_COLUMNS[5:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.fillna(df.median(numeric_only=True))

    # Expected goals: blend a side's scoring rate with the opponent's
    # conceding rate.
    df['h_xg'] = (df['h_sc'] + df['a_co']) / 2
    df['a_xg'] = (df['a_sc'] + df['h_co']) / 2
    df['total_xg'] = df['h_xg'] + df['a_xg']

    # Hand-weighted strength score per side and the home-minus-away gap.
    df['h_pow'] = (df['h_form']*10) + (df['h_sc']*5) - (df['h_co']*5) + (df['h_sot']*2)
    df['a_pow'] = (df['a_form']*10) + (df['a_sc']*5) - (df['a_co']*5) + (df['a_sot']*2)
    df['pow_diff'] = df['h_pow'] - df['a_pow']

    # Implied probabilities: inverse odds normalised by their sum, which
    # removes the bookmaker margin.
    inv_h, inv_d, inv_a = 1/df['oh'], 1/df['od'], 1/df['oa']
    margin = inv_h + inv_d + inv_a
    df['imp_h'] = inv_h / margin
    df['imp_d'] = inv_d / margin
    df['imp_a'] = inv_a / margin

    # Targets. Match result: 0 = home win, 1 = draw, 2 = away win.
    df['t_ms'] = np.select([df['sh'] > df['sa'], df['sh'] < df['sa']], [0, 2], default=1)
    df['t_ou'] = ((df['sh'] + df['sa']) > 2.5).astype(int)
    df['t_btts'] = ((df['sh'] > 0) & (df['sa'] > 0)).astype(int)
    return df


def _train_vqwen_models(df):
    """Train the three LightGBM boosters and return them keyed by file tag."""
    feats_ms = ['h_form', 'a_form', 'h_xg', 'a_xg', 'pow_diff', 'imp_h', 'imp_d', 'imp_a', 'h_sot', 'a_sot']
    X_tr, X_te, y_tr, y_te = train_test_split(
        df[feats_ms], df['t_ms'], test_size=0.15, random_state=42)
    print("🤖 MS Modeli eğitiliyor...")
    # The held-out 15% drives early stopping (patience of 50 rounds).
    model_ms = lgb.train(
        {'objective': 'multiclass', 'num_class': 3, 'metric': 'multi_logloss',
         'verbose': -1, 'num_leaves': 63},
        lgb.Dataset(X_tr, y_tr), num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_te, y_te)],
        callbacks=[lgb.early_stopping(50)])

    binary_params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}

    # The two binary models are fitted on the full frame with a fixed number
    # of rounds and no validation set.
    feats_ou = ['h_xg', 'a_xg', 'total_xg', 'h_sot', 'a_sot']
    print("🤖 OU2.5 Modeli...")
    model_ou = lgb.train(dict(binary_params),
                         lgb.Dataset(df[feats_ou], df['t_ou']), num_boost_round=500)

    feats_btts = ['h_xg', 'a_xg', 'h_sc', 'a_sc']
    print("🤖 BTTS Modeli...")
    model_btts = lgb.train(dict(binary_params),
                           lgb.Dataset(df[feats_btts], df['t_btts']), num_boost_round=500)

    return {'ms': model_ms, 'ou25': model_ou, 'btts': model_btts}


def _save_vqwen_models(models, mdir):
    """Pickle each model to ``<mdir>/vqwen_<tag>.pkl``, creating ``mdir`` if needed."""
    os.makedirs(mdir, exist_ok=True)
    for nm, md in models.items():
        p = os.path.join(mdir, f'vqwen_{nm}.pkl')
        with open(p, 'wb') as f:
            pickle.dump(md, f)
        print(f"✅ {p} kaydedildi.")


def train_vqwen():
    """Train and persist the VQWEN match-result, over/under 2.5 and BTTS models.

    Steps: fetch up to 200k finished football matches with odds from
    PostgreSQL, derive features and targets, train three LightGBM boosters,
    and pickle them under ``ROOT_DIR/models/vqwen``.

    Raises:
        ValueError: if the feature query returns no rows.
    """
    print("🧠 VQWEN MODEL EĞİTİMİ (OPTIMIZED)")
    print("="*60)

    # --- 1. Fetch data (connection is released before training starts) ---
    print("📊 Veritabanından özellikler çekiliyor (Limit 200k)...")
    start = time.time()
    rows = _fetch_vqwen_rows(get_clean_dsn())
    print(f"✅ {len(rows)} maç çekildi ({time.time()-start:.1f}s)")
    if not rows:
        raise ValueError("VQWEN training query returned no matches; nothing to train on.")

    # --- 2. Feature engineering ---
    df = _build_vqwen_frame(rows)

    # --- 3. Models ---
    models = _train_vqwen_models(df)

    # --- 4. Save ---
    _save_vqwen_models(models, os.path.join(ROOT_DIR, 'models', 'vqwen'))

    print("\n🎉 VQWEN EĞİTİMİ BİTTİ!")
|
||||
|
||||
# Start training only when this file is executed directly, not when imported.
if __name__ == "__main__":
    train_vqwen()
|
||||
Reference in New Issue
Block a user