"""
VQWEN Deep Model Training Script (Final Version)
================================================
Includes: ELO, Contextual Goals, Rest Days, Player Participation.
"""
import json
import os
import pickle
import sys
import time
from contextlib import closing

import lightgbm as lgb
import numpy as np
import pandas as pd
import psycopg2
from sklearn.model_selection import train_test_split
# Directory containing this script, and its parent (the project root).
AI_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(AI_DIR)

# Put the project root first on the import path so project-level packages
# resolve when this file is executed directly as a script.
sys.path.insert(0, ROOT_DIR)
def get_clean_dsn() -> str:
    """Return the PostgreSQL DSN used to load the training data.

    A non-empty ``VQWEN_DSN`` environment variable takes precedence, so the
    credentials can be supplied at deploy time. When it is unset or empty the
    historical built-in DSN is returned, which keeps existing invocations
    working unchanged.

    Returns:
        A ``postgresql://`` connection URI accepted by ``psycopg2.connect``.
    """
    # NOTE(review): the fallback embeds a password in source control; rotate
    # it and drop the fallback once every environment sets VQWEN_DSN.
    default_dsn = "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db"
    return os.environ.get("VQWEN_DSN") or default_dsn
def train_vqwen_deep():
    """Train the VQWEN 1X2, over/under 2.5 and BTTS models and pickle them.

    Pipeline: fetch finished football matches that have odds from PostgreSQL,
    clean and impute the raw columns, derive ELO / fatigue / xG /
    implied-probability features, fit three LightGBM boosters and write them
    to ``<ROOT_DIR>/models/vqwen/vqwen_{ms,ou25,btts}.pkl``.

    Raises:
        ValueError: if no usable match rows remain after cleaning.
    """
    print("🧠 VQWEN DEEP MODEL EĞİTİMİ (ELO + REST + CONTEXT)")
    print("=" * 60)

    df = _add_features(_clean_matches(_fetch_matches()))
    _save_models(_train_models(df))

    print("\n🎉 VQWEN DEEP EĞİTİMİ BİTTİ!")


# Column names for the rows returned by _MATCH_QUERY, in SELECT order.
_RAW_COLUMNS = [
    'id', 'h_id', 'a_id', 'sh', 'sa', 'utc',
    'h_elo', 'a_elo',
    'h_home_goals', 'a_away_goals',
    'h_rest', 'a_rest',
    'h_xi', 'a_xi', 'cards',
    'oh', 'od', 'oa',
]

# Identifier columns; they are never used as numbers, so they are not coerced.
_ID_COLUMNS = ('id', 'h_id', 'a_id')

# Model inputs. The order matters: anything loading the pickles must feed the
# same columns in the same order.
# NOTE(review): `cards` counts card events of the very match being predicted
# (see the query), i.e. information only known after kick-off — confirm that
# this is intended and that inference supplies a comparable value.
_FEATURES = [
    'elo_diff', 'h_xg', 'a_xg', 'total_xg', 'pow_diff', 'rest_diff',
    'h_fatigue', 'a_fatigue',
    'imp_h', 'imp_d', 'imp_a', 'h_xi', 'a_xi', 'cards',
]

# Advanced data query: ELO, rest days, home/away contextual scoring, line-up
# size, cards and 1X2 odds for the most recent finished football matches.
# NOTE(review): the rest-day sub-queries only look at the home side's previous
# *home* matches and the away side's previous *away* matches — confirm this is
# the intended definition of "rest" before changing it (inference must match).
_MATCH_QUERY = """
    SELECT
        m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, m.mst_utc,

        -- ELO Ratings
        COALESCE(maf.home_elo, 1500) as home_elo,
        COALESCE(maf.away_elo, 1500) as away_elo,

        -- Contextual Goals (Home Team at Home, Away Team Away)
        COALESCE((SELECT AVG(m2.score_home) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc), 1.2) as h_home_goals,
        COALESCE((SELECT AVG(m2.score_away) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc), 1.2) as a_away_goals,

        -- Rest Days (Yorgunluk)
        COALESCE(EXTRACT(EPOCH FROM (to_timestamp(m.mst_utc/1000) - (SELECT MAX(to_timestamp(m2.mst_utc/1000)) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc)) / 86400), 7) as h_rest,
        COALESCE(EXTRACT(EPOCH FROM (to_timestamp(m.mst_utc/1000) - (SELECT MAX(to_timestamp(m2.mst_utc/1000)) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc)) / 86400), 7) as a_rest,

        -- Squad Participation
        COALESCE((SELECT COUNT(*) FROM match_player_participation mp WHERE mp.match_id = m.id AND mp.team_id = m.home_team_id AND mp.is_starting = true), 11) as h_xi,
        COALESCE((SELECT COUNT(*) FROM match_player_participation mp WHERE mp.match_id = m.id AND mp.team_id = m.away_team_id AND mp.is_starting = true), 11) as a_xi,

        -- Cards
        COALESCE((SELECT COUNT(*) FROM match_player_events mpe WHERE mpe.match_id = m.id AND mpe.event_type = 'card'), 4) as cards,

        -- Odds
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '1' LIMIT 1) as oh,
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = 'X' LIMIT 1) as od,
        (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '2' LIMIT 1) as oa

    FROM matches m
    LEFT JOIN football_ai_features maf ON maf.match_id = m.id
    WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.sport = 'football'
    AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
    ORDER BY m.mst_utc DESC
    LIMIT 150000
"""


def _fetch_matches():
    """Run the match query and return the raw rows as a DataFrame."""
    print("📊 Veri çekiliyor...")
    started = time.time()
    # psycopg2's own `with conn:` only ends the transaction; closing() makes
    # sure the cursor and the connection are really released, even when the
    # query fails, and before the long-running training starts.
    with closing(psycopg2.connect(get_clean_dsn())) as conn:
        with closing(conn.cursor()) as cur:
            cur.execute(_MATCH_QUERY)
            rows = cur.fetchall()
    print(f"✅ {len(rows)} maç çekildi ({time.time()-started:.1f}s)")
    return pd.DataFrame(rows, columns=_RAW_COLUMNS)


def _clean_matches(df):
    """Coerce value columns to numbers, impute gaps and drop unusable rows."""
    df = df.copy()
    for col in df.columns:
        if col not in _ID_COLUMNS:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # A row without a final score has no label; imputing the score with the
    # median would fabricate a training target, so such rows are dropped.
    df = df.dropna(subset=['sh', 'sa'])
    df = df.fillna(df.median(numeric_only=True))

    # All three 1X2 prices are inverted below, so each must be a valid
    # decimal odd (> 1.0); this also rules out division by zero.
    df = df[(df['oh'] > 1.0) & (df['od'] > 1.0) & (df['oa'] > 1.0)]
    if df.empty:
        raise ValueError("no usable matches left after cleaning; nothing to train on")
    return df


def _fatigue_factor(rest):
    """Map rest days to a multiplier: <3 days -> 0.85, <5 days -> 0.95, else 1.0."""
    return np.select([rest < 3, rest < 5], [0.85, 0.95], default=1.0)


def _add_features(df):
    """Return a copy of *df* with the engineered features and target columns."""
    # Work on a copy: *df* is a filtered slice of the raw frame.
    df = df.copy()

    # 1. ELO difference
    df['elo_diff'] = df['h_elo'] - df['a_elo']

    # 2. Fatigue factor (short rest lowers expected performance)
    df['h_fatigue'] = _fatigue_factor(df['h_rest'])
    df['a_fatigue'] = _fatigue_factor(df['a_rest'])

    # 3. xG proxy (contextual goal average scaled by fatigue)
    df['h_xg'] = df['h_home_goals'] * df['h_fatigue']
    df['a_xg'] = df['a_away_goals'] * df['a_fatigue']
    df['total_xg'] = df['h_xg'] + df['a_xg']
    df['rest_diff'] = df['h_rest'] - df['a_rest']

    # 4. Form (ELO-based power rating)
    df['h_pow'] = (df['h_elo'] / 100) * df['h_fatigue']
    df['a_pow'] = (df['a_elo'] / 100) * df['a_fatigue']
    df['pow_diff'] = df['h_pow'] - df['a_pow']

    # Odds -> implied probabilities, normalised so the three sum to 1.
    overround = (1 / df['oh']) + (1 / df['od']) + (1 / df['oa'])
    df['imp_h'] = (1 / df['oh']) / overround
    df['imp_d'] = (1 / df['od']) / overround
    df['imp_a'] = (1 / df['oa']) / overround

    # Targets: 1X2 result (0 = home win, 1 = draw, 2 = away win),
    # over 2.5 total goals, and both teams to score.
    df['t_ms'] = np.select([df['sh'] > df['sa'], df['sh'] < df['sa']], [0, 2], default=1)
    df['t_ou'] = ((df['sh'] + df['sa']) > 2.5).astype(int)
    df['t_btts'] = ((df['sh'] > 0) & (df['sa'] > 0)).astype(int)
    return df


def _train_models(df):
    """Fit the three LightGBM boosters and return them keyed by file suffix."""
    X = df[_FEATURES]

    # MS (match result): multiclass with a 15% hold-out for early stopping.
    print("🤖 MS...")
    X_tr, X_te, y_tr, y_te = train_test_split(X, df['t_ms'], test_size=0.15, random_state=42)
    model_ms = lgb.train(
        {'objective': 'multiclass', 'num_class': 3, 'verbose': -1, 'num_leaves': 63},
        lgb.Dataset(X_tr, y_tr),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_te, y_te)],
        callbacks=[lgb.early_stopping(50)],
    )

    # OU2.5: binary, trained on every row for a fixed number of rounds.
    print("🤖 OU2.5...")
    model_ou = lgb.train({'objective': 'binary', 'verbose': -1},
                         lgb.Dataset(X, df['t_ou']), num_boost_round=500)

    # BTTS: binary, trained on every row for a fixed number of rounds.
    print("🤖 BTTS...")
    model_btts = lgb.train({'objective': 'binary', 'verbose': -1},
                           lgb.Dataset(X, df['t_btts']), num_boost_round=500)

    return {'ms': model_ms, 'ou25': model_ou, 'btts': model_btts}


def _save_models(models):
    """Pickle each model to ``<ROOT_DIR>/models/vqwen/vqwen_<name>.pkl``."""
    mdir = os.path.join(ROOT_DIR, 'models', 'vqwen')
    os.makedirs(mdir, exist_ok=True)
    for nm, md in models.items():
        p = os.path.join(mdir, f'vqwen_{nm}.pkl')
        with open(p, 'wb') as f:
            pickle.dump(md, f)
        print(f"✅ vqwen_{nm}.pkl")
if __name__ == "__main__":
    # Run the full training pipeline only when executed as a script.
    train_vqwen_deep()