Files
iddaai-be/ai-engine/scripts/train_vqwen_deep.py
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

166 lines
7.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
VQWEN Deep Model Training Script (Final Version)
================================================
Includes: ELO, Contextual Goals, Rest Days, Player Participation.
"""
import os
import sys
import json
import time
import pickle
import psycopg2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# Directory that contains this script, and that directory's parent.
AI_DIR = os.path.split(os.path.abspath(__file__))[0]
ROOT_DIR = os.path.split(AI_DIR)[0]
# Put the parent directory first on the import path so that modules living
# there take precedence when imported from this script.
sys.path[:0] = [ROOT_DIR]
def get_clean_dsn() -> str:
    """Return the PostgreSQL DSN used to load the training data.

    The ``VQWEN_DB_DSN`` environment variable takes precedence when it is set
    to a non-empty value, so credentials can be supplied without editing this
    file. Otherwise the original built-in DSN is returned, which keeps
    existing invocations working unchanged.

    Returns:
        A ``postgresql://`` connection URI.
    """
    # NOTE(review): the fallback embeds a password in source control; rotate
    # it and drop the fallback once every environment sets VQWEN_DB_DSN.
    default_dsn = "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db"
    # `or` (rather than a .get() default) also covers an empty-string value.
    return os.environ.get("VQWEN_DB_DSN") or default_dsn
def _fetch_vqwen_rows(dsn):
    """Run the training query against *dsn* and return all rows as tuples.

    The connection and cursor are always released, even if the query fails,
    and they are closed before any model training starts.
    """
    # ─── 1. ADVANCED DATA QUERY ───
    # ELO, rest days, home/away contextual scoring, line-ups, cards and odds.
    # NOTE(review): COUNT(*) never returns NULL, so the COALESCE defaults on
    # h_xi / a_xi / cards (11, 11, 4) are never applied; matches without
    # participation/event rows yield 0. Left untouched because changing it
    # alters the feature values the models are trained on.
    query = """
        SELECT
            m.id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, m.mst_utc,
            -- ELO Ratings
            COALESCE(maf.home_elo, 1500) as home_elo,
            COALESCE(maf.away_elo, 1500) as away_elo,
            -- Contextual Goals (Home Team at Home, Away Team Away)
            COALESCE((SELECT AVG(m2.score_home) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc), 1.2) as h_home_goals,
            COALESCE((SELECT AVG(m2.score_away) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc), 1.2) as a_away_goals,
            -- Rest Days (Yorgunluk)
            COALESCE(EXTRACT(EPOCH FROM (to_timestamp(m.mst_utc/1000) - (SELECT MAX(to_timestamp(m2.mst_utc/1000)) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc)) / 86400), 7) as h_rest,
            COALESCE(EXTRACT(EPOCH FROM (to_timestamp(m.mst_utc/1000) - (SELECT MAX(to_timestamp(m2.mst_utc/1000)) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc)) / 86400), 7) as a_rest,
            -- Squad Participation
            COALESCE((SELECT COUNT(*) FROM match_player_participation mp WHERE mp.match_id = m.id AND mp.team_id = m.home_team_id AND mp.is_starting = true), 11) as h_xi,
            COALESCE((SELECT COUNT(*) FROM match_player_participation mp WHERE mp.match_id = m.id AND mp.team_id = m.away_team_id AND mp.is_starting = true), 11) as a_xi,
            -- Cards
            COALESCE((SELECT COUNT(*) FROM match_player_events mpe WHERE mpe.match_id = m.id AND mpe.event_type = 'card'), 4) as cards,
            -- Odds
            (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '1' LIMIT 1) as oh,
            (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = 'X' LIMIT 1) as od,
            (SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '2' LIMIT 1) as oa
        FROM matches m
        LEFT JOIN football_ai_features maf ON maf.match_id = m.id
        WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.sport = 'football'
        AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
        ORDER BY m.mst_utc DESC
        LIMIT 150000
    """
    conn = psycopg2.connect(dsn)
    try:
        # The cursor context manager closes the cursor on exit; the
        # connection itself must still be closed explicitly.
        with conn.cursor() as cur:
            print("📊 Veri çekiliyor...")
            start = time.time()
            cur.execute(query)
            rows = cur.fetchall()
            print(f"{len(rows)} maç çekildi ({time.time()-start:.1f}s)")
            return rows
    finally:
        conn.close()


def _build_vqwen_frame(rows):
    """Turn raw query rows into a DataFrame with features and targets.

    Adds the engineered feature columns (ELO difference, fatigue-scaled
    goal averages, power ratings, normalised implied probabilities) and the
    three label columns ``t_ms``, ``t_ou`` and ``t_btts``.
    """
    df = pd.DataFrame(rows, columns=[
        'id', 'h_id', 'a_id', 'sh', 'sa', 'utc',
        'h_elo', 'a_elo',
        'h_home_goals', 'a_away_goals',
        'h_rest', 'a_rest',
        'h_xi', 'a_xi', 'cards',
        'oh', 'od', 'oa',
    ])

    # Cleaning: coerce everything after the first two id columns to numeric,
    # fill gaps with the column median, and keep only rows with valid
    # home/away odds.
    for col in df.columns[2:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.fillna(df.median(numeric_only=True))
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one.
    df = df[(df['oh'] > 1.0) & (df['oa'] > 1.0)].copy()

    # ─── 2. FEATURE ENGINEERING ───
    # 1. ELO difference
    df['elo_diff'] = df['h_elo'] - df['a_elo']

    # 2. Fatigue factor: under 3 rest days -> 0.85, under 5 -> 0.95, else 1.0.
    #    np.select picks the first matching condition, mirroring an if-chain.
    for side in ('h', 'a'):
        rest = df[f'{side}_rest']
        df[f'{side}_fatigue'] = np.select([rest < 3, rest < 5], [0.85, 0.95], default=1.0)

    # 3. xG proxy (contextual goal average scaled by fatigue)
    df['h_xg'] = df['h_home_goals'] * df['h_fatigue']
    df['a_xg'] = df['a_away_goals'] * df['a_fatigue']
    df['total_xg'] = df['h_xg'] + df['a_xg']
    df['rest_diff'] = df['h_rest'] - df['a_rest']

    # 4. Form (ELO-based power rating)
    df['h_pow'] = (df['h_elo'] / 100) * df['h_fatigue']
    df['a_pow'] = (df['a_elo'] / 100) * df['a_fatigue']
    df['pow_diff'] = df['h_pow'] - df['a_pow']

    # Odds -> implied probabilities, normalised by the sum of inverse odds.
    margin = (1/df['oh']) + (1/df['od']) + (1/df['oa'])
    df['imp_h'] = (1/df['oh']) / margin
    df['imp_d'] = (1/df['od']) / margin
    df['imp_a'] = (1/df['oa']) / margin

    # Targets: match result (0 = home win, 1 = draw, 2 = away win),
    # over 2.5 total goals, and both teams scoring.
    df['t_ms'] = np.select([df['sh'] > df['sa'], df['sh'] < df['sa']], [0, 2], default=1)
    df['t_ou'] = ((df['sh'] + df['sa']) > 2.5).astype(int)
    df['t_btts'] = ((df['sh'] > 0) & (df['sa'] > 0)).astype(int)
    return df


def train_vqwen_deep():
    """Train the three VQWEN LightGBM models and pickle them to disk.

    Loads finished football matches with odds from the database, derives the
    feature set, then trains a 3-class match-result model (with a 15%
    hold-out used for early stopping) plus binary over/under-2.5 and
    both-teams-to-score models. Each model is written to
    ``<ROOT_DIR>/models/vqwen/vqwen_<name>.pkl``.

    Raises:
        RuntimeError: if no rows with valid odds remain after cleaning.
    """
    print("🧠 VQWEN DEEP MODEL EĞİTİMİ (ELO + REST + CONTEXT)")
    print("="*60)

    rows = _fetch_vqwen_rows(get_clean_dsn())
    df = _build_vqwen_frame(rows)
    if df.empty:
        # Fail with a clear message instead of an obscure error from the
        # split/training code further down.
        raise RuntimeError("No training rows with valid odds were returned; aborting.")

    # ─── 3. MODEL TRAINING ───
    # Feature set
    feats = ['elo_diff', 'h_xg', 'a_xg', 'total_xg', 'pow_diff', 'rest_diff', 'h_fatigue', 'a_fatigue',
             'imp_h', 'imp_d', 'imp_a', 'h_xi', 'a_xi', 'cards']

    # Match result (MS)
    print("🤖 MS...")
    X_ms, y_ms = df[feats], df['t_ms']
    X_tr, X_te, y_tr, y_te = train_test_split(X_ms, y_ms, test_size=0.15, random_state=42)
    model_ms = lgb.train({'objective': 'multiclass', 'num_class': 3, 'verbose': -1, 'num_leaves': 63},
                         lgb.Dataset(X_tr, y_tr), num_boost_round=1000,
                         valid_sets=[lgb.Dataset(X_te, y_te)], callbacks=[lgb.early_stopping(50)])

    # Over/under 2.5 goals (trained on all rows, no hold-out)
    print("🤖 OU2.5...")
    model_ou = lgb.train({'objective': 'binary', 'verbose': -1},
                         lgb.Dataset(df[feats], df['t_ou']), num_boost_round=500)

    # Both teams to score (trained on all rows, no hold-out)
    print("🤖 BTTS...")
    model_btts = lgb.train({'objective': 'binary', 'verbose': -1},
                           lgb.Dataset(df[feats], df['t_btts']), num_boost_round=500)

    # ─── 4. SAVE ───
    mdir = os.path.join(ROOT_DIR, 'models', 'vqwen')
    os.makedirs(mdir, exist_ok=True)
    for nm, md in [('ms', model_ms), ('ou25', model_ou), ('btts', model_btts)]:
        p = os.path.join(mdir, f'vqwen_{nm}.pkl')
        with open(p, 'wb') as f:
            pickle.dump(md, f)
        print(f"✅ vqwen_{nm}.pkl")

    print("\n🎉 VQWEN DEEP EĞİTİMİ BİTTİ!")
# Script entry point: run the full training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train_vqwen_deep()