""" VQWEN v3 Training Script ======================== Retrains the VQWEN market models using only the configured top leagues. """ from __future__ import annotations import json import os import pickle import sys import time from pathlib import Path from typing import Any import lightgbm as lgb import pandas as pd import psycopg2 from dotenv import load_dotenv AI_DIR = Path(__file__).resolve().parent ENGINE_DIR = AI_DIR.parent REPO_DIR = ENGINE_DIR.parent MODELS_DIR = ENGINE_DIR / "models" / "vqwen" TOP_LEAGUES_PATH = REPO_DIR / "top_leagues.json" if str(ENGINE_DIR) not in sys.path: sys.path.insert(0, str(ENGINE_DIR)) from features.vqwen_contract import ( FEATURE_COLUMNS, VqwenFeatureInput, build_vqwen_feature_row, ) def _load_env() -> None: load_dotenv(REPO_DIR / ".env", override=False) load_dotenv(ENGINE_DIR / ".env", override=False) def get_clean_dsn() -> str: _load_env() raw = os.getenv("DATABASE_URL", "").strip().strip('"').strip("'") if not raw: raise RuntimeError("DATABASE_URL is missing.") return raw.split("?", 1)[0] def load_top_league_ids() -> list[str]: if not TOP_LEAGUES_PATH.exists(): raise FileNotFoundError(f"top_leagues.json not found at {TOP_LEAGUES_PATH}") raw = json.loads(TOP_LEAGUES_PATH.read_text(encoding="utf-8")) if not isinstance(raw, list): raise ValueError("top_leagues.json must contain a JSON array.") league_ids = [str(item).strip() for item in raw if str(item).strip()] deduped = list(dict.fromkeys(league_ids)) if not deduped: raise ValueError("top_leagues.json is empty.") return deduped def _fetch_dataframe(cur: psycopg2.extensions.cursor, league_ids: list[str]) -> pd.DataFrame: query = """ WITH match_data AS ( SELECT m.id, m.league_id, m.home_team_id, m.away_team_id, m.score_home, m.score_away, m.mst_utc, ref.name AS referee_name, COALESCE(maf.home_elo, 1500) AS home_elo, COALESCE(maf.away_elo, 1500) AS away_elo, COALESCE( ( SELECT AVG(m2.score_home) FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc ), 1.2 ) AS h_home_goals, COALESCE( ( SELECT AVG(m2.score_away) FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc ), 1.2 ) AS a_away_goals, COALESCE( ( SELECT EXTRACT( EPOCH FROM ( to_timestamp(m.mst_utc / 1000.0) - MAX(to_timestamp(m2.mst_utc / 1000.0)) ) ) / 86400.0 FROM matches m2 WHERE m2.home_team_id = m.home_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc ), 7 ) AS h_rest, COALESCE( ( SELECT EXTRACT( EPOCH FROM ( to_timestamp(m.mst_utc / 1000.0) - MAX(to_timestamp(m2.mst_utc / 1000.0)) ) ) / 86400.0 FROM matches m2 WHERE m2.away_team_id = m.away_team_id AND m2.status = 'FT' AND m2.mst_utc < m.mst_utc ), 7 ) AS a_rest, ( SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '1' LIMIT 1 ) AS oh, ( SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = 'X' LIMIT 1 ) AS od, ( SELECT os.odd_value FROM odd_categories oc JOIN odd_selections os ON os.odd_category_db_id = oc.db_id WHERE oc.match_id = m.id AND oc.name ILIKE 'Maç Sonucu' AND os.name = '2' LIMIT 1 ) AS oa FROM matches m LEFT JOIN football_ai_features maf ON maf.match_id = m.id LEFT JOIN match_officials ref ON ref.match_id = m.id AND ref.role_id = 1 WHERE m.status = 'FT' AND m.score_home IS NOT NULL AND m.score_away IS NOT NULL AND m.sport = 'football' AND m.league_id = ANY(%s) AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id) ) SELECT md.*, COALESCE( ( SELECT ( COUNT(*) FILTER ( WHERE ( (m2.home_team_id = md.home_team_id AND m2.score_home > m2.score_away) OR (m2.away_team_id = md.home_team_id AND m2.score_away > m2.score_home) ) )::float + COUNT(*) FILTER (WHERE m2.score_home = m2.score_away)::float * 0.5 ) / NULLIF(COUNT(*), 0) FROM matches m2 WHERE m2.status = 'FT' AND m2.mst_utc < md.mst_utc AND ( (m2.home_team_id = md.home_team_id AND m2.away_team_id = md.away_team_id) OR (m2.home_team_id = md.away_team_id AND m2.away_team_id = md.home_team_id) ) ), 0.5 ) AS h2h_h_wr, COALESCE( ( SELECT SUM(points) FROM ( SELECT CASE WHEN m2.score_home > m2.score_away THEN 3 WHEN m2.score_home = m2.score_away THEN 1 ELSE 0 END AS points FROM matches m2 WHERE m2.home_team_id = md.home_team_id AND m2.status = 'FT' AND m2.mst_utc < md.mst_utc ORDER BY m2.mst_utc DESC LIMIT 5 ) home_form ), 0 ) AS h_form_pts, COALESCE( ( SELECT SUM(points) FROM ( SELECT CASE WHEN m2.score_away > m2.score_home THEN 3 WHEN m2.score_away = m2.score_home THEN 1 ELSE 0 END AS points FROM matches m2 WHERE m2.away_team_id = md.away_team_id AND m2.status = 'FT' AND m2.mst_utc < md.mst_utc ORDER BY m2.mst_utc DESC LIMIT 5 ) away_form ), 0 ) AS a_form_pts FROM match_data md ORDER BY md.mst_utc DESC """ print("Top league verisi cekiliyor...") started_at = time.time() cur.execute(query, (league_ids,)) rows = cur.fetchall() elapsed = time.time() - started_at print(f"{len(rows)} mac cekildi ({elapsed:.1f}s)") dataframe = pd.DataFrame( rows, columns=[ "id", "league_id", "h_id", "a_id", "sh", "sa", "utc", "referee_name", "h_elo", "a_elo", "h_home_goals", "a_away_goals", "h_rest", "a_rest", "oh", "od", "oa", "h2h_h_wr", "h_form_pts", "a_form_pts", ], ) return dataframe def _compute_league_avg_goals( cur: psycopg2.extensions.cursor, league_id: str, before_ts: int, ) -> float: if not league_id: return 2.6 cur.execute( """ SELECT COALESCE(AVG(src.score_home + src.score_away), 2.6) FROM ( SELECT score_home, score_away FROM matches WHERE league_id = %s AND sport = 'football' AND status = 'FT' AND score_home IS NOT NULL AND score_away IS NOT NULL AND mst_utc < %s ORDER BY mst_utc DESC LIMIT 100 ) src """, (league_id, before_ts), ) row = cur.fetchone() return float(row[0] or 2.6) def _compute_referee_profile( cur: psycopg2.extensions.cursor, referee_name: str | None, before_ts: int, ) -> tuple[float, float]: if not referee_name: return 2.6, 0.0 cur.execute( """ SELECT COALESCE(AVG(score_home + score_away), 2.6) AS avg_goals, COALESCE(AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END), 0.46) - 0.46 AS home_bias FROM ( SELECT m.score_home, m.score_away FROM match_officials mo JOIN matches m ON m.id = mo.match_id WHERE mo.name = %s AND mo.role_id = 1 AND m.sport = 'football' AND m.status = 'FT' AND m.score_home IS NOT NULL AND m.score_away IS NOT NULL AND m.mst_utc < %s ORDER BY m.mst_utc DESC LIMIT 30 ) src """, (referee_name, before_ts), ) row = cur.fetchone() if not row: return 2.6, 0.0 return float(row[0] or 2.6), float(row[1] or 0.0) def _compute_team_squad_profile( cur: psycopg2.extensions.cursor, team_id: str, before_ts: int, ) -> tuple[float, float]: if not team_id: return 0.5, 0.0 cur.execute( """ WITH recent_matches AS ( SELECT m.id FROM matches m WHERE (m.home_team_id = %s OR m.away_team_id = %s) AND m.sport = 'football' AND m.status = 'FT' AND m.mst_utc < %s ORDER BY m.mst_utc DESC LIMIT 8 ), player_base AS ( SELECT mpp.player_id, COUNT(*)::float AS appearances, COUNT(*) FILTER (WHERE mpp.is_starting = true)::float AS starts FROM match_player_participation mpp JOIN recent_matches rm ON rm.id = mpp.match_id WHERE mpp.team_id = %s GROUP BY mpp.player_id ), player_goals AS ( SELECT mpe.player_id, COUNT(*) FILTER ( WHERE mpe.event_type = 'goal' AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%' )::float AS goals, 0.0::float AS assists FROM match_player_events mpe JOIN recent_matches rm ON rm.id = mpe.match_id WHERE mpe.team_id = %s GROUP BY mpe.player_id UNION ALL SELECT mpe.assist_player_id AS player_id, 0.0::float AS goals, COUNT(*) FILTER ( WHERE mpe.event_type = 'goal' AND mpe.assist_player_id IS NOT NULL )::float AS assists FROM match_player_events mpe JOIN recent_matches rm ON rm.id = mpe.match_id WHERE mpe.team_id = %s AND mpe.assist_player_id IS NOT NULL GROUP BY mpe.assist_player_id ), player_events AS ( SELECT player_id, SUM(goals) AS goals, SUM(assists) AS assists FROM player_goals GROUP BY player_id ), player_scores AS ( SELECT pb.player_id, (pb.starts * 1.5) + ((pb.appearances - pb.starts) * 0.5) + (COALESCE(pe.goals, 0.0) * 2.5) + (COALESCE(pe.assists, 0.0) * 1.5) AS score FROM player_base pb LEFT JOIN player_events pe ON pe.player_id = pb.player_id ) SELECT COALESCE(AVG(top_players.score), 0.0) AS avg_top_score, COALESCE(COUNT(*) FILTER (WHERE top_players.score >= 6.0), 0) AS key_players FROM ( SELECT score FROM player_scores ORDER BY score DESC LIMIT 11 ) top_players """, (team_id, team_id, before_ts, team_id, team_id, team_id), ) row = cur.fetchone() if not row: return 0.5, 0.0 avg_top_score = float(row[0] or 0.0) return min(max(avg_top_score / 10.0, 0.0), 1.0), float(row[1] or 0.0) def _enrich_pre_match_context( cur: psycopg2.extensions.cursor, df: pd.DataFrame, ) -> pd.DataFrame: league_avg_goals: list[float] = [] referee_avg_goals: list[float] = [] referee_home_bias: list[float] = [] home_squad_strength: list[float] = [] away_squad_strength: list[float] = [] home_key_players: list[float] = [] away_key_players: list[float] = [] print("Pre-match context enrich ediliyor...") started_at = time.time() for row in df.itertuples(index=False): before_ts = int(getattr(row, "utc") or 0) league_id = str(getattr(row, "league_id") or "") ref_name_raw: Any = getattr(row, "referee_name", None) referee_name = str(ref_name_raw).strip() if ref_name_raw else None lg_avg = _compute_league_avg_goals(cur, league_id, before_ts) ref_avg, ref_bias = _compute_referee_profile(cur, referee_name, before_ts) h_sq, h_key = _compute_team_squad_profile(cur, str(getattr(row, "h_id")), before_ts) a_sq, a_key = _compute_team_squad_profile(cur, str(getattr(row, "a_id")), before_ts) league_avg_goals.append(lg_avg) referee_avg_goals.append(ref_avg) referee_home_bias.append(ref_bias) home_squad_strength.append(h_sq) away_squad_strength.append(a_sq) home_key_players.append(h_key) away_key_players.append(a_key) enriched = df.copy() enriched["league_avg_goals"] = league_avg_goals enriched["referee_avg_goals"] = referee_avg_goals enriched["referee_home_bias"] = referee_home_bias enriched["home_squad_strength"] = home_squad_strength enriched["away_squad_strength"] = away_squad_strength enriched["home_key_players"] = home_key_players enriched["away_key_players"] = away_key_players print(f"Pre-match context tamam ({time.time() - started_at:.1f}s)") return enriched def _prepare_features(df: pd.DataFrame) -> pd.DataFrame: numeric_columns = [ "sh", "sa", "utc", "league_avg_goals", "referee_avg_goals", "referee_home_bias", "home_squad_strength", "away_squad_strength", "home_key_players", "away_key_players", "h_elo", "a_elo", "h_home_goals", "a_away_goals", "h_rest", "a_rest", "oh", "od", "oa", "h2h_h_wr", "h_form_pts", "a_form_pts", ] for column in numeric_columns: df[column] = pd.to_numeric(df[column], errors="coerce") df = df.fillna(df.median(numeric_only=True)) df = df[(df["oh"] > 1.0) & (df["od"] > 1.0) & (df["oa"] > 1.0)].copy() if df.empty: raise RuntimeError("No valid rows remained after odds filtering.") margin = (1.0 / df["oh"]) + (1.0 / df["od"]) + (1.0 / df["oa"]) df["imp_h"] = (1.0 / df["oh"]) / margin df["imp_d"] = (1.0 / df["od"]) / margin df["imp_a"] = (1.0 / df["oa"]) / margin feature_rows = df.apply( lambda row: build_vqwen_feature_row( VqwenFeatureInput( home_elo=float(row["h_elo"]), away_elo=float(row["a_elo"]), home_avg_goals_scored=float(row["h_home_goals"]), away_avg_goals_scored=float(row["a_away_goals"]), home_avg_goals_conceded=float(row["a_away_goals"]), away_avg_goals_conceded=float(row["h_home_goals"]), home_avg_shots_on_target=4.0, away_avg_shots_on_target=4.0, home_avg_possession=50.0, away_avg_possession=50.0, home_rest_days=float(row["h_rest"]), away_rest_days=float(row["a_rest"]), implied_prob_home=float(row["imp_h"]), implied_prob_draw=float(row["imp_d"]), implied_prob_away=float(row["imp_a"]), # Historical training must not leak actual match lineups. # Runtime also often defaults to 1.0 when pre-match lineup data # is unavailable, so training should mirror that behavior. home_lineup_availability=1.0, away_lineup_availability=1.0, h2h_home_win_rate=float(row["h2h_h_wr"]), home_form_score=float(row["h_form_pts"]), away_form_score=float(row["a_form_pts"]), league_avg_goals=float(row["league_avg_goals"]), referee_avg_goals=float(row["referee_avg_goals"]), referee_home_bias=float(row["referee_home_bias"]), home_squad_strength=float(row["home_squad_strength"]), away_squad_strength=float(row["away_squad_strength"]), home_key_players=float(row["home_key_players"]), away_key_players=float(row["away_key_players"]), ), ), axis=1, result_type="expand", ) for column in FEATURE_COLUMNS: df[column] = feature_rows[column] df["t_ms"] = df.apply( lambda row: 0 if row["sh"] > row["sa"] else (2 if row["sh"] < row["sa"] else 1), axis=1, ) df["t_ou"] = ((df["sh"] + df["sa"]) > 2.5).astype(int) df["t_btts"] = ((df["sh"] > 0) & (df["sa"] > 0)).astype(int) return df def _temporal_split(df: pd.DataFrame, validation_ratio: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame]: if df.empty: raise RuntimeError("Cannot split an empty dataframe.") ordered = df.sort_values("utc").reset_index(drop=True) split_index = max(int(len(ordered) * (1.0 - validation_ratio)), 1) split_index = min(split_index, len(ordered) - 1) return ordered.iloc[:split_index].copy(), ordered.iloc[split_index:].copy() def _save_metadata(df: pd.DataFrame, league_ids: list[str]) -> None: metadata = { "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"), "contract_version": "vqwen.shared.v1", "league_count": len(league_ids), "league_ids": league_ids, "sample_count": int(len(df)), "feature_columns": FEATURE_COLUMNS, "target_distribution": { "ms_home": int((df["t_ms"] == 0).sum()), "ms_draw": int((df["t_ms"] == 1).sum()), "ms_away": int((df["t_ms"] == 2).sum()), "ou25_over": int(df["t_ou"].sum()), "ou25_under": int(len(df) - df["t_ou"].sum()), "btts_yes": int(df["t_btts"].sum()), "btts_no": int(len(df) - df["t_btts"].sum()), }, } MODELS_DIR.mkdir(parents=True, exist_ok=True) (MODELS_DIR / "vqwen_training_meta.json").write_text( json.dumps(metadata, indent=2), encoding="utf-8", ) def train_vqwen_v3() -> None: print("VQWEN v3 MODEL EGITIMI (TOP LEAGUES)") print("=" * 60) league_ids = load_top_league_ids() print(f"League filter aktif: {len(league_ids)} lig") dsn = get_clean_dsn() conn = psycopg2.connect(dsn) cur = conn.cursor() try: df = _fetch_dataframe(cur, league_ids) df = _enrich_pre_match_context(cur, df) df = _prepare_features(df) print(f"Temiz egitim orneklemi: {len(df)} mac") train_df, valid_df = _temporal_split(df) X_train = train_df[FEATURE_COLUMNS] X_valid = valid_df[FEATURE_COLUMNS] y_train = train_df["t_ms"] y_valid = valid_df["t_ms"] print( "Temporal split:" f" train={len(train_df)}" f" valid={len(valid_df)}" f" train_end_utc={int(train_df['utc'].max())}" f" valid_start_utc={int(valid_df['utc'].min())}" ) print("MS modeli egitiliyor...") model_ms = lgb.train( { "objective": "multiclass", "num_class": 3, "metric": "multi_logloss", "verbose": -1, "num_leaves": 63, "learning_rate": 0.03, "feature_fraction": 0.85, "bagging_fraction": 0.85, "bagging_freq": 1, }, lgb.Dataset(X_train, y_train), num_boost_round=1000, valid_sets=[lgb.Dataset(X_valid, y_valid)], callbacks=[lgb.early_stopping(50)], ) print("OU2.5 modeli egitiliyor...") model_ou25 = lgb.train( { "objective": "binary", "metric": "binary_logloss", "verbose": -1, "learning_rate": 0.03, "num_leaves": 31, }, lgb.Dataset(train_df[FEATURE_COLUMNS], train_df["t_ou"]), num_boost_round=1000, valid_sets=[lgb.Dataset(valid_df[FEATURE_COLUMNS], valid_df["t_ou"])], callbacks=[lgb.early_stopping(50)], ) print("BTTS modeli egitiliyor...") model_btts = lgb.train( { "objective": "binary", "metric": "binary_logloss", "verbose": -1, "learning_rate": 0.03, "num_leaves": 31, }, lgb.Dataset(train_df[FEATURE_COLUMNS], train_df["t_btts"]), num_boost_round=1000, valid_sets=[lgb.Dataset(valid_df[FEATURE_COLUMNS], valid_df["t_btts"])], callbacks=[lgb.early_stopping(50)], ) MODELS_DIR.mkdir(parents=True, exist_ok=True) artifacts = { "vqwen_ms.pkl": model_ms, "vqwen_ou25.pkl": model_ou25, "vqwen_btts.pkl": model_btts, } for filename, model in artifacts.items(): with (MODELS_DIR / filename).open("wb") as handle: pickle.dump(model, handle) print(f"Kaydedildi: {filename}") _save_metadata(df, league_ids) print("Kaydedildi: vqwen_training_meta.json") print("VQWEN v3 top league egitimi tamamlandi.") finally: cur.close() conn.close() if __name__ == "__main__": train_vqwen_v3()