"""
VQWEN v3 Training Script
========================
Retrains the VQWEN market models using only the configured top leagues.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
import pickle
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import lightgbm as lgb
|
||
import pandas as pd
|
||
import psycopg2
|
||
from dotenv import load_dotenv
|
||
|
||
# Directory layout, resolved from this file's own location.
AI_DIR = Path(__file__).resolve().parent
ENGINE_DIR = AI_DIR.parent
REPO_DIR = ENGINE_DIR.parent
# Directory that receives the pickled models and the training metadata file.
MODELS_DIR = ENGINE_DIR / "models" / "vqwen"
# JSON array of league ids that the training query is restricted to.
TOP_LEAGUES_PATH = REPO_DIR / "top_leagues.json"

# Put the engine directory on sys.path so the `features` package imported
# further down resolves when this file is run directly as a script.
if str(ENGINE_DIR) not in sys.path:
    sys.path.insert(0, str(ENGINE_DIR))
|
||
|
||
from features.vqwen_contract import (
|
||
FEATURE_COLUMNS,
|
||
VqwenFeatureInput,
|
||
build_vqwen_feature_row,
|
||
)
|
||
|
||
def _load_env() -> None:
    """Load variables from the repo-level and then the engine-level ``.env`` file.

    Both files are read with ``override=False``, so a variable that is already
    set (in the process environment or by the repo-level file) is not replaced.
    """
    for base_dir in (REPO_DIR, ENGINE_DIR):
        load_dotenv(base_dir / ".env", override=False)
|
||
|
||
|
||
def get_clean_dsn() -> str:
    """Return ``DATABASE_URL`` stripped of wrapping quotes and of its query string.

    The ``.env`` files are loaded first so the variable can come from either
    the process environment or a dotenv file.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is unset or empty after cleaning.
    """
    _load_env()
    dsn = os.getenv("DATABASE_URL", "").strip().strip('"').strip("'")
    if not dsn:
        raise RuntimeError("DATABASE_URL is missing.")
    # Everything from the first "?" onwards (URL query parameters) is dropped.
    # NOTE(review): this also discards options such as sslmode -- confirm that
    # is intended for every deployment target.
    base, _, _ = dsn.partition("?")
    return base
|
||
|
||
|
||
def load_top_league_ids() -> list[str]:
    """Read ``top_leagues.json`` and return its league ids as unique strings.

    Entries are stringified and stripped; blank entries and JSON ``null``
    entries are skipped. First-seen order is preserved when de-duplicating.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: if the file is not a JSON array or yields no usable id.
    """
    if not TOP_LEAGUES_PATH.exists():
        raise FileNotFoundError(f"top_leagues.json not found at {TOP_LEAGUES_PATH}")

    payload = json.loads(TOP_LEAGUES_PATH.read_text(encoding="utf-8"))
    if not isinstance(payload, list):
        raise ValueError("top_leagues.json must contain a JSON array.")

    # A JSON null would otherwise be stringified into the bogus id "None".
    cleaned = (str(item).strip() for item in payload if item is not None)
    # dict.fromkeys de-duplicates while keeping insertion order.
    league_ids = list(dict.fromkeys(text for text in cleaned if text))
    if not league_ids:
        raise ValueError("top_leagues.json is empty.")
    return league_ids
|
||
|
||
|
||
def _fetch_dataframe(cur: psycopg2.extensions.cursor, league_ids: list[str]) -> pd.DataFrame:
|
||
query = """
|
||
WITH match_data AS (
|
||
SELECT
|
||
m.id,
|
||
m.league_id,
|
||
m.home_team_id,
|
||
m.away_team_id,
|
||
m.score_home,
|
||
m.score_away,
|
||
m.mst_utc,
|
||
ref.name AS referee_name,
|
||
COALESCE(maf.home_elo, 1500) AS home_elo,
|
||
COALESCE(maf.away_elo, 1500) AS away_elo,
|
||
COALESCE(
|
||
(
|
||
SELECT AVG(m2.score_home)
|
||
FROM matches m2
|
||
WHERE m2.home_team_id = m.home_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < m.mst_utc
|
||
),
|
||
1.2
|
||
) AS h_home_goals,
|
||
COALESCE(
|
||
(
|
||
SELECT AVG(m2.score_away)
|
||
FROM matches m2
|
||
WHERE m2.away_team_id = m.away_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < m.mst_utc
|
||
),
|
||
1.2
|
||
) AS a_away_goals,
|
||
COALESCE(
|
||
(
|
||
SELECT EXTRACT(
|
||
EPOCH FROM (
|
||
to_timestamp(m.mst_utc / 1000.0)
|
||
- MAX(to_timestamp(m2.mst_utc / 1000.0))
|
||
)
|
||
) / 86400.0
|
||
FROM matches m2
|
||
WHERE m2.home_team_id = m.home_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < m.mst_utc
|
||
),
|
||
7
|
||
) AS h_rest,
|
||
COALESCE(
|
||
(
|
||
SELECT EXTRACT(
|
||
EPOCH FROM (
|
||
to_timestamp(m.mst_utc / 1000.0)
|
||
- MAX(to_timestamp(m2.mst_utc / 1000.0))
|
||
)
|
||
) / 86400.0
|
||
FROM matches m2
|
||
WHERE m2.away_team_id = m.away_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < m.mst_utc
|
||
),
|
||
7
|
||
) AS a_rest,
|
||
(
|
||
SELECT os.odd_value
|
||
FROM odd_categories oc
|
||
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
|
||
WHERE oc.match_id = m.id
|
||
AND oc.name ILIKE 'Maç Sonucu'
|
||
AND os.name = '1'
|
||
LIMIT 1
|
||
) AS oh,
|
||
(
|
||
SELECT os.odd_value
|
||
FROM odd_categories oc
|
||
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
|
||
WHERE oc.match_id = m.id
|
||
AND oc.name ILIKE 'Maç Sonucu'
|
||
AND os.name = 'X'
|
||
LIMIT 1
|
||
) AS od,
|
||
(
|
||
SELECT os.odd_value
|
||
FROM odd_categories oc
|
||
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
|
||
WHERE oc.match_id = m.id
|
||
AND oc.name ILIKE 'Maç Sonucu'
|
||
AND os.name = '2'
|
||
LIMIT 1
|
||
) AS oa
|
||
FROM matches m
|
||
LEFT JOIN football_ai_features maf ON maf.match_id = m.id
|
||
LEFT JOIN match_officials ref ON ref.match_id = m.id AND ref.role_id = 1
|
||
WHERE m.status = 'FT'
|
||
AND m.score_home IS NOT NULL
|
||
AND m.score_away IS NOT NULL
|
||
AND m.sport = 'football'
|
||
AND m.league_id = ANY(%s)
|
||
AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
|
||
)
|
||
SELECT
|
||
md.*,
|
||
COALESCE(
|
||
(
|
||
SELECT
|
||
(
|
||
COUNT(*) FILTER (
|
||
WHERE (
|
||
(m2.home_team_id = md.home_team_id AND m2.score_home > m2.score_away)
|
||
OR
|
||
(m2.away_team_id = md.home_team_id AND m2.score_away > m2.score_home)
|
||
)
|
||
)::float
|
||
+ COUNT(*) FILTER (WHERE m2.score_home = m2.score_away)::float * 0.5
|
||
) / NULLIF(COUNT(*), 0)
|
||
FROM matches m2
|
||
WHERE m2.status = 'FT'
|
||
AND m2.mst_utc < md.mst_utc
|
||
AND (
|
||
(m2.home_team_id = md.home_team_id AND m2.away_team_id = md.away_team_id)
|
||
OR
|
||
(m2.home_team_id = md.away_team_id AND m2.away_team_id = md.home_team_id)
|
||
)
|
||
),
|
||
0.5
|
||
) AS h2h_h_wr,
|
||
COALESCE(
|
||
(
|
||
SELECT SUM(points)
|
||
FROM (
|
||
SELECT
|
||
CASE
|
||
WHEN m2.score_home > m2.score_away THEN 3
|
||
WHEN m2.score_home = m2.score_away THEN 1
|
||
ELSE 0
|
||
END AS points
|
||
FROM matches m2
|
||
WHERE m2.home_team_id = md.home_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < md.mst_utc
|
||
ORDER BY m2.mst_utc DESC
|
||
LIMIT 5
|
||
) home_form
|
||
),
|
||
0
|
||
) AS h_form_pts,
|
||
COALESCE(
|
||
(
|
||
SELECT SUM(points)
|
||
FROM (
|
||
SELECT
|
||
CASE
|
||
WHEN m2.score_away > m2.score_home THEN 3
|
||
WHEN m2.score_away = m2.score_home THEN 1
|
||
ELSE 0
|
||
END AS points
|
||
FROM matches m2
|
||
WHERE m2.away_team_id = md.away_team_id
|
||
AND m2.status = 'FT'
|
||
AND m2.mst_utc < md.mst_utc
|
||
ORDER BY m2.mst_utc DESC
|
||
LIMIT 5
|
||
) away_form
|
||
),
|
||
0
|
||
) AS a_form_pts
|
||
FROM match_data md
|
||
ORDER BY md.mst_utc DESC
|
||
"""
|
||
|
||
print("Top league verisi cekiliyor...")
|
||
started_at = time.time()
|
||
cur.execute(query, (league_ids,))
|
||
rows = cur.fetchall()
|
||
elapsed = time.time() - started_at
|
||
print(f"{len(rows)} mac cekildi ({elapsed:.1f}s)")
|
||
|
||
dataframe = pd.DataFrame(
|
||
rows,
|
||
columns=[
|
||
"id",
|
||
"league_id",
|
||
"h_id",
|
||
"a_id",
|
||
"sh",
|
||
"sa",
|
||
"utc",
|
||
"referee_name",
|
||
"h_elo",
|
||
"a_elo",
|
||
"h_home_goals",
|
||
"a_away_goals",
|
||
"h_rest",
|
||
"a_rest",
|
||
"oh",
|
||
"od",
|
||
"oa",
|
||
"h2h_h_wr",
|
||
"h_form_pts",
|
||
"a_form_pts",
|
||
],
|
||
)
|
||
return dataframe
|
||
|
||
|
||
def _compute_league_avg_goals(
|
||
cur: psycopg2.extensions.cursor,
|
||
league_id: str,
|
||
before_ts: int,
|
||
) -> float:
|
||
if not league_id:
|
||
return 2.6
|
||
|
||
cur.execute(
|
||
"""
|
||
SELECT COALESCE(AVG(src.score_home + src.score_away), 2.6)
|
||
FROM (
|
||
SELECT score_home, score_away
|
||
FROM matches
|
||
WHERE league_id = %s
|
||
AND sport = 'football'
|
||
AND status = 'FT'
|
||
AND score_home IS NOT NULL
|
||
AND score_away IS NOT NULL
|
||
AND mst_utc < %s
|
||
ORDER BY mst_utc DESC
|
||
LIMIT 100
|
||
) src
|
||
""",
|
||
(league_id, before_ts),
|
||
)
|
||
row = cur.fetchone()
|
||
return float(row[0] or 2.6)
|
||
|
||
|
||
def _compute_referee_profile(
|
||
cur: psycopg2.extensions.cursor,
|
||
referee_name: str | None,
|
||
before_ts: int,
|
||
) -> tuple[float, float]:
|
||
if not referee_name:
|
||
return 2.6, 0.0
|
||
|
||
cur.execute(
|
||
"""
|
||
SELECT
|
||
COALESCE(AVG(score_home + score_away), 2.6) AS avg_goals,
|
||
COALESCE(AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END), 0.46) - 0.46 AS home_bias
|
||
FROM (
|
||
SELECT m.score_home, m.score_away
|
||
FROM match_officials mo
|
||
JOIN matches m ON m.id = mo.match_id
|
||
WHERE mo.name = %s
|
||
AND mo.role_id = 1
|
||
AND m.sport = 'football'
|
||
AND m.status = 'FT'
|
||
AND m.score_home IS NOT NULL
|
||
AND m.score_away IS NOT NULL
|
||
AND m.mst_utc < %s
|
||
ORDER BY m.mst_utc DESC
|
||
LIMIT 30
|
||
) src
|
||
""",
|
||
(referee_name, before_ts),
|
||
)
|
||
row = cur.fetchone()
|
||
if not row:
|
||
return 2.6, 0.0
|
||
return float(row[0] or 2.6), float(row[1] or 0.0)
|
||
|
||
|
||
def _compute_team_squad_profile(
|
||
cur: psycopg2.extensions.cursor,
|
||
team_id: str,
|
||
before_ts: int,
|
||
) -> tuple[float, float]:
|
||
if not team_id:
|
||
return 0.5, 0.0
|
||
|
||
cur.execute(
|
||
"""
|
||
WITH recent_matches AS (
|
||
SELECT m.id
|
||
FROM matches m
|
||
WHERE (m.home_team_id = %s OR m.away_team_id = %s)
|
||
AND m.sport = 'football'
|
||
AND m.status = 'FT'
|
||
AND m.mst_utc < %s
|
||
ORDER BY m.mst_utc DESC
|
||
LIMIT 8
|
||
),
|
||
player_base AS (
|
||
SELECT
|
||
mpp.player_id,
|
||
COUNT(*)::float AS appearances,
|
||
COUNT(*) FILTER (WHERE mpp.is_starting = true)::float AS starts
|
||
FROM match_player_participation mpp
|
||
JOIN recent_matches rm ON rm.id = mpp.match_id
|
||
WHERE mpp.team_id = %s
|
||
GROUP BY mpp.player_id
|
||
),
|
||
player_goals AS (
|
||
SELECT
|
||
mpe.player_id,
|
||
COUNT(*) FILTER (
|
||
WHERE mpe.event_type = 'goal'
|
||
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
|
||
)::float AS goals,
|
||
0.0::float AS assists
|
||
FROM match_player_events mpe
|
||
JOIN recent_matches rm ON rm.id = mpe.match_id
|
||
WHERE mpe.team_id = %s
|
||
GROUP BY mpe.player_id
|
||
UNION ALL
|
||
SELECT
|
||
mpe.assist_player_id AS player_id,
|
||
0.0::float AS goals,
|
||
COUNT(*) FILTER (
|
||
WHERE mpe.event_type = 'goal'
|
||
AND mpe.assist_player_id IS NOT NULL
|
||
)::float AS assists
|
||
FROM match_player_events mpe
|
||
JOIN recent_matches rm ON rm.id = mpe.match_id
|
||
WHERE mpe.team_id = %s
|
||
AND mpe.assist_player_id IS NOT NULL
|
||
GROUP BY mpe.assist_player_id
|
||
),
|
||
player_events AS (
|
||
SELECT
|
||
player_id,
|
||
SUM(goals) AS goals,
|
||
SUM(assists) AS assists
|
||
FROM player_goals
|
||
GROUP BY player_id
|
||
),
|
||
player_scores AS (
|
||
SELECT
|
||
pb.player_id,
|
||
(pb.starts * 1.5)
|
||
+ ((pb.appearances - pb.starts) * 0.5)
|
||
+ (COALESCE(pe.goals, 0.0) * 2.5)
|
||
+ (COALESCE(pe.assists, 0.0) * 1.5) AS score
|
||
FROM player_base pb
|
||
LEFT JOIN player_events pe ON pe.player_id = pb.player_id
|
||
)
|
||
SELECT
|
||
COALESCE(AVG(top_players.score), 0.0) AS avg_top_score,
|
||
COALESCE(COUNT(*) FILTER (WHERE top_players.score >= 6.0), 0) AS key_players
|
||
FROM (
|
||
SELECT score
|
||
FROM player_scores
|
||
ORDER BY score DESC
|
||
LIMIT 11
|
||
) top_players
|
||
""",
|
||
(team_id, team_id, before_ts, team_id, team_id, team_id),
|
||
)
|
||
row = cur.fetchone()
|
||
if not row:
|
||
return 0.5, 0.0
|
||
|
||
avg_top_score = float(row[0] or 0.0)
|
||
return min(max(avg_top_score / 10.0, 0.0), 1.0), float(row[1] or 0.0)
|
||
|
||
|
||
def _enrich_pre_match_context(
    cur: psycopg2.extensions.cursor,
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Return a copy of ``df`` with league, referee and squad context columns.

    For every row the league average goals, the referee profile and both
    teams' squad profiles are looked up with the row's ``utc`` value as the
    exclusive time bound, so only earlier matches feed the context.

    Args:
        cur: open database cursor used for the per-row lookups.
        df: frame with at least ``utc``, ``league_id``, ``referee_name``,
            ``h_id`` and ``a_id`` columns. It is not modified.

    Returns:
        A copy of ``df`` with seven added columns: ``league_avg_goals``,
        ``referee_avg_goals``, ``referee_home_bias``, ``home_squad_strength``,
        ``away_squad_strength``, ``home_key_players``, ``away_key_players``.
    """

    def _is_missing(value: Any) -> bool:
        """Tell whether a cell holds None or NaN."""
        return value is None or bool(pd.isna(value))

    def _team_key(value: Any) -> str:
        """Stringify a team id; missing ids become "" so the lookup is skipped."""
        return "" if _is_missing(value) else str(value)

    context_columns = (
        "league_avg_goals",
        "referee_avg_goals",
        "referee_home_bias",
        "home_squad_strength",
        "away_squad_strength",
        "home_key_players",
        "away_key_players",
    )
    # One tuple per input row, positionally aligned with context_columns.
    records: list[tuple[float, ...]] = []

    print("Pre-match context enrich ediliyor...")
    started_at = time.perf_counter()

    # NOTE(review): this issues four queries per match; expect the run time
    # to grow linearly with the number of rows.
    for row in df.itertuples(index=False):
        before_ts = 0 if _is_missing(row.utc) else int(row.utc or 0)
        league_id = str(row.league_id or "")
        # A missing referee must stay None; stringifying it would produce
        # "None"/"nan" and trigger a pointless lookup.
        raw_referee = getattr(row, "referee_name", None)
        referee_name = None
        if not _is_missing(raw_referee) and raw_referee:
            referee_name = str(raw_referee).strip()

        lg_avg = _compute_league_avg_goals(cur, league_id, before_ts)
        ref_avg, ref_bias = _compute_referee_profile(cur, referee_name, before_ts)
        h_sq, h_key = _compute_team_squad_profile(cur, _team_key(row.h_id), before_ts)
        a_sq, a_key = _compute_team_squad_profile(cur, _team_key(row.a_id), before_ts)
        records.append((lg_avg, ref_avg, ref_bias, h_sq, a_sq, h_key, a_key))

    enriched = df.copy()
    for position, column in enumerate(context_columns):
        enriched[column] = [record[position] for record in records]

    print(f"Pre-match context tamam ({time.perf_counter() - started_at:.1f}s)")
    return enriched
|
||
|
||
|
||
def _prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature columns and the three training targets.

    Steps: coerce the numeric columns, fill missing numeric values with the
    column median, keep only rows whose three match-result odds exceed 1.0,
    derive margin-free implied probabilities, build the contract feature row
    for every match and add the targets ``t_ms`` (0 home win, 1 draw, 2 away
    win), ``t_ou`` (more than 2.5 total goals) and ``t_btts`` (both teams
    scored).

    Args:
        df: enriched match frame. It is not modified.

    Returns:
        A new frame holding the input columns, ``imp_h``/``imp_d``/``imp_a``,
        every column of ``FEATURE_COLUMNS`` and the three targets.

    Raises:
        RuntimeError: if no row survives the odds filter.
    """
    numeric_columns = [
        "sh",
        "sa",
        "utc",
        "league_avg_goals",
        "referee_avg_goals",
        "referee_home_bias",
        "home_squad_strength",
        "away_squad_strength",
        "home_key_players",
        "away_key_players",
        "h_elo",
        "a_elo",
        "h_home_goals",
        "a_away_goals",
        "h_rest",
        "a_rest",
        "oh",
        "od",
        "oa",
        "h2h_h_wr",
        "h_form_pts",
        "a_form_pts",
    ]
    # Work on a copy so the caller's frame is not mutated by the coercion.
    df = df.copy()
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # NOTE(review): this also fills MISSING odds with the median odds, so
    # matches without a match-result market pass the filter below with
    # synthetic prices. The median is also taken over the whole set, i.e.
    # before the temporal split. Confirm both are intended.
    df = df.fillna(df.median(numeric_only=True))
    valid_odds = (df["oh"] > 1.0) & (df["od"] > 1.0) & (df["oa"] > 1.0)
    df = df[valid_odds].copy()
    if df.empty:
        raise RuntimeError("No valid rows remained after odds filtering.")

    # Normalise the inverse odds so the three probabilities sum to 1.
    margin = (1.0 / df["oh"]) + (1.0 / df["od"]) + (1.0 / df["oa"])
    df["imp_h"] = (1.0 / df["oh"]) / margin
    df["imp_d"] = (1.0 / df["od"]) / margin
    df["imp_a"] = (1.0 / df["oa"]) / margin

    def _to_feature_row(row: pd.Series) -> Any:
        """Map one match row onto the shared feature contract."""
        return build_vqwen_feature_row(
            VqwenFeatureInput(
                home_elo=float(row["h_elo"]),
                away_elo=float(row["a_elo"]),
                home_avg_goals_scored=float(row["h_home_goals"]),
                away_avg_goals_scored=float(row["a_away_goals"]),
                # NOTE(review): conceded goals are proxied by the opponent's
                # scored average; no real conceded average is queried.
                home_avg_goals_conceded=float(row["a_away_goals"]),
                away_avg_goals_conceded=float(row["h_home_goals"]),
                # Shots and possession are constants here, not data.
                home_avg_shots_on_target=4.0,
                away_avg_shots_on_target=4.0,
                home_avg_possession=50.0,
                away_avg_possession=50.0,
                home_rest_days=float(row["h_rest"]),
                away_rest_days=float(row["a_rest"]),
                implied_prob_home=float(row["imp_h"]),
                implied_prob_draw=float(row["imp_d"]),
                implied_prob_away=float(row["imp_a"]),
                # Historical training must not leak actual match lineups.
                # Runtime also often defaults to 1.0 when pre-match lineup data
                # is unavailable, so training should mirror that behavior.
                home_lineup_availability=1.0,
                away_lineup_availability=1.0,
                h2h_home_win_rate=float(row["h2h_h_wr"]),
                home_form_score=float(row["h_form_pts"]),
                away_form_score=float(row["a_form_pts"]),
                league_avg_goals=float(row["league_avg_goals"]),
                referee_avg_goals=float(row["referee_avg_goals"]),
                referee_home_bias=float(row["referee_home_bias"]),
                home_squad_strength=float(row["home_squad_strength"]),
                away_squad_strength=float(row["away_squad_strength"]),
                home_key_players=float(row["home_key_players"]),
                away_key_players=float(row["away_key_players"]),
            ),
        )

    feature_rows = df.apply(_to_feature_row, axis=1, result_type="expand")
    for column in FEATURE_COLUMNS:
        df[column] = feature_rows[column]

    # Vectorised 1X2 target: start at 1 (draw), subtract 1 for a home win,
    # add 1 for an away win -> 0 / 1 / 2.
    home_win = df["sh"] > df["sa"]
    away_win = df["sh"] < df["sa"]
    df["t_ms"] = 1 + away_win.astype(int) - home_win.astype(int)
    df["t_ou"] = ((df["sh"] + df["sa"]) > 2.5).astype(int)
    df["t_btts"] = ((df["sh"] > 0) & (df["sa"] > 0)).astype(int)

    return df
|
||
|
||
|
||
def _temporal_split(df: pd.DataFrame, validation_ratio: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame]:
|
||
if df.empty:
|
||
raise RuntimeError("Cannot split an empty dataframe.")
|
||
|
||
ordered = df.sort_values("utc").reset_index(drop=True)
|
||
split_index = max(int(len(ordered) * (1.0 - validation_ratio)), 1)
|
||
split_index = min(split_index, len(ordered) - 1)
|
||
return ordered.iloc[:split_index].copy(), ordered.iloc[split_index:].copy()
|
||
|
||
|
||
def _save_metadata(df: pd.DataFrame, league_ids: list[str]) -> None:
    """Write ``vqwen_training_meta.json`` describing the finished training run.

    The file records the training time, the contract version, the league
    filter, the sample count, the feature column list and the class counts of
    the three targets. ``MODELS_DIR`` is created if it does not exist.

    Args:
        df: the prepared training frame; must hold ``t_ms``, ``t_ou`` and
            ``t_btts``.
        league_ids: league ids the run was restricted to.
    """
    sample_count = int(len(df))
    outcome = df["t_ms"]
    over_count = int(df["t_ou"].sum())
    btts_count = int(df["t_btts"].sum())

    metadata = {
        # NOTE(review): local time without a timezone marker.
        "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "contract_version": "vqwen.shared.v1",
        "league_count": len(league_ids),
        "league_ids": league_ids,
        "sample_count": sample_count,
        # list(...) keeps this JSON-serialisable whatever sequence type the
        # contract module uses for FEATURE_COLUMNS.
        "feature_columns": list(FEATURE_COLUMNS),
        "target_distribution": {
            "ms_home": int((outcome == 0).sum()),
            "ms_draw": int((outcome == 1).sum()),
            "ms_away": int((outcome == 2).sum()),
            "ou25_over": over_count,
            "ou25_under": sample_count - over_count,
            "btts_yes": btts_count,
            "btts_no": sample_count - btts_count,
        },
    }

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    target_path = MODELS_DIR / "vqwen_training_meta.json"
    target_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
||
|
||
|
||
def _train_booster(
    params: dict[str, Any],
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    target: str,
) -> lgb.Booster:
    """Train one LightGBM model on ``FEATURE_COLUMNS`` for the ``target`` column."""
    return lgb.train(
        params,
        lgb.Dataset(train_df[FEATURE_COLUMNS], train_df[target]),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(valid_df[FEATURE_COLUMNS], valid_df[target])],
        # Stop once the validation metric has not improved for 50 rounds.
        callbacks=[lgb.early_stopping(50)],
    )


def train_vqwen_v3() -> None:
    """Run the full VQWEN v3 training pipeline for the configured top leagues.

    Loads the league filter, pulls and enriches the match data from the
    database, prepares the features, splits chronologically, trains the match
    result (multiclass), over/under 2.5 and both-teams-to-score (binary)
    models and writes the three pickles plus the metadata file to
    ``MODELS_DIR``.

    Errors raised by the helper steps (missing configuration, database
    failures, no usable rows) propagate to the caller.
    """
    print("VQWEN v3 MODEL EGITIMI (TOP LEAGUES)")
    print("=" * 60)

    league_ids = load_top_league_ids()
    print(f"League filter aktif: {len(league_ids)} lig")

    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)
    # Nested try/finally: the connection is closed even if creating the
    # cursor fails, and it is released before the (long) training starts
    # because nothing after the enrichment step touches the database.
    try:
        cur = conn.cursor()
        try:
            df = _fetch_dataframe(cur, league_ids)
            df = _enrich_pre_match_context(cur, df)
        finally:
            cur.close()
    finally:
        conn.close()

    df = _prepare_features(df)
    print(f"Temiz egitim orneklemi: {len(df)} mac")

    train_df, valid_df = _temporal_split(df)
    print(
        "Temporal split:"
        f" train={len(train_df)}"
        f" valid={len(valid_df)}"
        f" train_end_utc={int(train_df['utc'].max())}"
        f" valid_start_utc={int(valid_df['utc'].min())}"
    )

    ms_params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",
        "verbose": -1,
        "num_leaves": 63,
        "learning_rate": 0.03,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.85,
        "bagging_freq": 1,
    }
    # The two binary markets share one parameter set.
    binary_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbose": -1,
        "learning_rate": 0.03,
        "num_leaves": 31,
    }

    print("MS modeli egitiliyor...")
    model_ms = _train_booster(ms_params, train_df, valid_df, "t_ms")

    print("OU2.5 modeli egitiliyor...")
    model_ou25 = _train_booster(dict(binary_params), train_df, valid_df, "t_ou")

    print("BTTS modeli egitiliyor...")
    model_btts = _train_booster(dict(binary_params), train_df, valid_df, "t_btts")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    artifacts = {
        "vqwen_ms.pkl": model_ms,
        "vqwen_ou25.pkl": model_ou25,
        "vqwen_btts.pkl": model_btts,
    }
    for filename, model in artifacts.items():
        with (MODELS_DIR / filename).open("wb") as handle:
            pickle.dump(model, handle)
        print(f"Kaydedildi: {filename}")

    _save_metadata(df, league_ids)
    print("Kaydedildi: vqwen_training_meta.json")
    print("VQWEN v3 top league egitimi tamamlandi.")
|
||
|
||
|
||
# Run the training pipeline only when the file is executed as a script.
if __name__ == "__main__":
    train_vqwen_v3()
|