This commit is contained in:
@@ -0,0 +1,702 @@
|
||||
"""
|
||||
VQWEN v3 Training Script
|
||||
========================
|
||||
Retrains the VQWEN market models using only the configured top leagues.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import lightgbm as lgb
|
||||
import pandas as pd
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Directory layout, derived from this file's own location so the script does
# not depend on the current working directory.
AI_DIR = Path(__file__).resolve().parent
ENGINE_DIR = AI_DIR.parent
REPO_DIR = ENGINE_DIR.parent
MODELS_DIR = ENGINE_DIR.joinpath("models", "vqwen")
TOP_LEAGUES_PATH = REPO_DIR.joinpath("top_leagues.json")

# Put the engine directory at the front of sys.path (once) before the
# project-local `features` import that follows.
if str(ENGINE_DIR) not in sys.path:
    sys.path.insert(0, str(ENGINE_DIR))
|
||||
|
||||
from features.vqwen_contract import (
|
||||
FEATURE_COLUMNS,
|
||||
VqwenFeatureInput,
|
||||
build_vqwen_feature_row,
|
||||
)
|
||||
|
||||
def _load_env() -> None:
    """Load ``.env`` files from the repository root and then the engine directory.

    Both loads use ``override=False``, so values already present in the
    process environment are kept, and the repository-level file is read first.
    """
    for base_dir in (REPO_DIR, ENGINE_DIR):
        load_dotenv(base_dir / ".env", override=False)
|
||||
|
||||
|
||||
def get_clean_dsn() -> str:
    """Return ``DATABASE_URL`` without surrounding quotes or a query string.

    The ``.env`` files are loaded first. Everything from the first ``?``
    onwards is dropped before the value is returned.

    Raises:
        RuntimeError: if ``DATABASE_URL`` is unset or empty after cleaning.
    """
    _load_env()

    dsn = os.getenv("DATABASE_URL", "")
    # Tolerate values that were written with surrounding quotes.
    dsn = dsn.strip().strip('"').strip("'")
    if dsn == "":
        raise RuntimeError("DATABASE_URL is missing.")

    base, _, _query = dsn.partition("?")
    return base
|
||||
|
||||
|
||||
def load_top_league_ids() -> list[str]:
    """Read ``top_leagues.json`` and return its league ids as unique strings.

    Each entry is converted with ``str`` and stripped; blank entries are
    skipped and duplicates are removed while keeping first-seen order.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: if the JSON root is not an array, or no ids remain.
    """
    if not TOP_LEAGUES_PATH.exists():
        raise FileNotFoundError(f"top_leagues.json not found at {TOP_LEAGUES_PATH}")

    payload = json.loads(TOP_LEAGUES_PATH.read_text(encoding="utf-8"))
    if not isinstance(payload, list):
        raise ValueError("top_leagues.json must contain a JSON array.")

    unique_ids: list[str] = []
    seen: set[str] = set()
    for entry in payload:
        league_id = str(entry).strip()
        if not league_id or league_id in seen:
            continue
        seen.add(league_id)
        unique_ids.append(league_id)

    if not unique_ids:
        raise ValueError("top_leagues.json is empty.")
    return unique_ids
|
||||
|
||||
|
||||
def _fetch_dataframe(cur: psycopg2.extensions.cursor, league_ids: list[str]) -> pd.DataFrame:
    """Fetch finished football matches for the given leagues as a DataFrame.

    One row is returned per match that is ``FT``, has both scores, belongs to
    one of ``league_ids`` and has at least one odds category. Alongside the raw
    match columns the query computes, using only matches with an earlier
    ``mst_utc``:

    * ``h_home_goals`` / ``a_away_goals``: average goals the home team scored
      at home / the away team scored away (default 1.2).
    * ``h_rest`` / ``a_rest``: days since the home team's last home match /
      the away team's last away match (default 7). ``mst_utc`` is treated as
      epoch milliseconds by the query.
    * ``oh`` / ``od`` / ``oa``: the '1', 'X' and '2' selections of the
      'Maç Sonucu' odds category (NULL when absent).
    * ``h2h_h_wr``: head-to-head win rate of the current home team, draws
      counted as half a win (default 0.5).
    * ``h_form_pts`` / ``a_form_pts``: points from the last five home matches
      of the home team / away matches of the away team (default 0).

    Args:
        cur: open database cursor used to run the query.
        league_ids: league ids passed to ``m.league_id = ANY(%s)``.

    Returns:
        DataFrame with 20 columns, renamed to the short names used by the
        rest of the training pipeline, ordered by ``mst_utc`` descending.
    """
    query = """
        WITH match_data AS (
            SELECT
                m.id,
                m.league_id,
                m.home_team_id,
                m.away_team_id,
                m.score_home,
                m.score_away,
                m.mst_utc,
                ref.name AS referee_name,
                COALESCE(maf.home_elo, 1500) AS home_elo,
                COALESCE(maf.away_elo, 1500) AS away_elo,
                COALESCE(
                    (
                        SELECT AVG(m2.score_home)
                        FROM matches m2
                        WHERE m2.home_team_id = m.home_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < m.mst_utc
                    ),
                    1.2
                ) AS h_home_goals,
                COALESCE(
                    (
                        SELECT AVG(m2.score_away)
                        FROM matches m2
                        WHERE m2.away_team_id = m.away_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < m.mst_utc
                    ),
                    1.2
                ) AS a_away_goals,
                COALESCE(
                    (
                        SELECT EXTRACT(
                            EPOCH FROM (
                                to_timestamp(m.mst_utc / 1000.0)
                                - MAX(to_timestamp(m2.mst_utc / 1000.0))
                            )
                        ) / 86400.0
                        FROM matches m2
                        WHERE m2.home_team_id = m.home_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < m.mst_utc
                    ),
                    7
                ) AS h_rest,
                COALESCE(
                    (
                        SELECT EXTRACT(
                            EPOCH FROM (
                                to_timestamp(m.mst_utc / 1000.0)
                                - MAX(to_timestamp(m2.mst_utc / 1000.0))
                            )
                        ) / 86400.0
                        FROM matches m2
                        WHERE m2.away_team_id = m.away_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < m.mst_utc
                    ),
                    7
                ) AS a_rest,
                (
                    SELECT os.odd_value
                    FROM odd_categories oc
                    JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                    WHERE oc.match_id = m.id
                      AND oc.name ILIKE 'Maç Sonucu'
                      AND os.name = '1'
                    LIMIT 1
                ) AS oh,
                (
                    SELECT os.odd_value
                    FROM odd_categories oc
                    JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                    WHERE oc.match_id = m.id
                      AND oc.name ILIKE 'Maç Sonucu'
                      AND os.name = 'X'
                    LIMIT 1
                ) AS od,
                (
                    SELECT os.odd_value
                    FROM odd_categories oc
                    JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
                    WHERE oc.match_id = m.id
                      AND oc.name ILIKE 'Maç Sonucu'
                      AND os.name = '2'
                    LIMIT 1
                ) AS oa
            FROM matches m
            LEFT JOIN football_ai_features maf ON maf.match_id = m.id
            LEFT JOIN match_officials ref ON ref.match_id = m.id AND ref.role_id = 1
            WHERE m.status = 'FT'
              AND m.score_home IS NOT NULL
              AND m.score_away IS NOT NULL
              AND m.sport = 'football'
              AND m.league_id = ANY(%s)
              AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
        )
        SELECT
            md.*,
            COALESCE(
                (
                    SELECT
                        (
                            COUNT(*) FILTER (
                                WHERE (
                                    (m2.home_team_id = md.home_team_id AND m2.score_home > m2.score_away)
                                    OR
                                    (m2.away_team_id = md.home_team_id AND m2.score_away > m2.score_home)
                                )
                            )::float
                            + COUNT(*) FILTER (WHERE m2.score_home = m2.score_away)::float * 0.5
                        ) / NULLIF(COUNT(*), 0)
                    FROM matches m2
                    WHERE m2.status = 'FT'
                      AND m2.mst_utc < md.mst_utc
                      AND (
                          (m2.home_team_id = md.home_team_id AND m2.away_team_id = md.away_team_id)
                          OR
                          (m2.home_team_id = md.away_team_id AND m2.away_team_id = md.home_team_id)
                      )
                ),
                0.5
            ) AS h2h_h_wr,
            COALESCE(
                (
                    SELECT SUM(points)
                    FROM (
                        SELECT
                            CASE
                                WHEN m2.score_home > m2.score_away THEN 3
                                WHEN m2.score_home = m2.score_away THEN 1
                                ELSE 0
                            END AS points
                        FROM matches m2
                        WHERE m2.home_team_id = md.home_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < md.mst_utc
                        ORDER BY m2.mst_utc DESC
                        LIMIT 5
                    ) home_form
                ),
                0
            ) AS h_form_pts,
            COALESCE(
                (
                    SELECT SUM(points)
                    FROM (
                        SELECT
                            CASE
                                WHEN m2.score_away > m2.score_home THEN 3
                                WHEN m2.score_away = m2.score_home THEN 1
                                ELSE 0
                            END AS points
                        FROM matches m2
                        WHERE m2.away_team_id = md.away_team_id
                          AND m2.status = 'FT'
                          AND m2.mst_utc < md.mst_utc
                        ORDER BY m2.mst_utc DESC
                        LIMIT 5
                    ) away_form
                ),
                0
            ) AS a_form_pts
        FROM match_data md
        ORDER BY md.mst_utc DESC
    """

    # Short column names, in exactly the order the SELECT list produces them
    # (the 17 match_data columns followed by the three outer aggregates).
    column_names = [
        "id",
        "league_id",
        "h_id",
        "a_id",
        "sh",
        "sa",
        "utc",
        "referee_name",
        "h_elo",
        "a_elo",
        "h_home_goals",
        "a_away_goals",
        "h_rest",
        "a_rest",
        "oh",
        "od",
        "oa",
        "h2h_h_wr",
        "h_form_pts",
        "a_form_pts",
    ]

    print("Top league verisi cekiliyor...")
    t0 = time.time()
    cur.execute(query, (league_ids,))
    records = cur.fetchall()
    duration = time.time() - t0
    print(f"{len(records)} mac cekildi ({duration:.1f}s)")

    return pd.DataFrame(records, columns=column_names)
|
||||
|
||||
|
||||
def _compute_league_avg_goals(
    cur: psycopg2.extensions.cursor,
    league_id: str,
    before_ts: int,
) -> float:
    """Return the average total goals of a league's most recent finished matches.

    The average is taken over up to 100 finished football matches of
    ``league_id`` whose ``mst_utc`` is strictly earlier than ``before_ts``,
    so only information available before the match is used.

    Args:
        cur: open database cursor used to run the query.
        league_id: league to aggregate; a falsy value skips the query.
        before_ts: exclusive upper bound on ``mst_utc``.

    Returns:
        The average goals per match, or 2.6 when ``league_id`` is empty or no
        earlier match exists.
    """
    if not league_id:
        return 2.6

    cur.execute(
        """
        SELECT COALESCE(AVG(src.score_home + src.score_away), 2.6)
        FROM (
            SELECT score_home, score_away
            FROM matches
            WHERE league_id = %s
              AND sport = 'football'
              AND status = 'FT'
              AND score_home IS NOT NULL
              AND score_away IS NOT NULL
              AND mst_utc < %s
            ORDER BY mst_utc DESC
            LIMIT 100
        ) src
        """,
        (league_id, before_ts),
    )
    row = cur.fetchone()
    # Compare against None instead of relying on truthiness: an average of
    # exactly 0 (only goalless matches so far) is a real value and must not be
    # replaced by the 2.6 fallback.
    if not row or row[0] is None:
        return 2.6
    return float(row[0])
|
||||
|
||||
|
||||
def _compute_referee_profile(
    cur: psycopg2.extensions.cursor,
    referee_name: str | None,
    before_ts: int,
) -> tuple[float, float]:
    """Return ``(avg_goals, home_bias)`` for a referee's recent matches.

    Both values are computed over up to 30 finished football matches that the
    referee (``role_id = 1``) officiated with ``mst_utc`` strictly earlier
    than ``before_ts``. ``home_bias`` is the share of home wins minus 0.46.

    Args:
        cur: open database cursor used to run the query.
        referee_name: referee to look up; a falsy value skips the query.
        before_ts: exclusive upper bound on ``mst_utc``.

    Returns:
        ``(avg_goals, home_bias)``; ``(2.6, 0.0)`` when the referee is unknown
        or has no earlier matches.
    """
    if not referee_name:
        return 2.6, 0.0

    cur.execute(
        """
        SELECT
            COALESCE(AVG(score_home + score_away), 2.6) AS avg_goals,
            COALESCE(AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END), 0.46) - 0.46 AS home_bias
        FROM (
            SELECT m.score_home, m.score_away
            FROM match_officials mo
            JOIN matches m ON m.id = mo.match_id
            WHERE mo.name = %s
              AND mo.role_id = 1
              AND m.sport = 'football'
              AND m.status = 'FT'
              AND m.score_home IS NOT NULL
              AND m.score_away IS NOT NULL
              AND m.mst_utc < %s
            ORDER BY m.mst_utc DESC
            LIMIT 30
        ) src
        """,
        (referee_name, before_ts),
    )
    row = cur.fetchone()
    if not row:
        return 2.6, 0.0

    # Fall back only on NULL. A goal average of exactly 0 is a real value and
    # must not be swapped for the 2.6 prior by a truthiness check.
    avg_goals = 2.6 if row[0] is None else float(row[0])
    home_bias = 0.0 if row[1] is None else float(row[1])
    return avg_goals, home_bias
|
||||
|
||||
|
||||
def _compute_team_squad_profile(
    cur: psycopg2.extensions.cursor,
    team_id: str,
    before_ts: int,
) -> tuple[float, float]:
    """Return ``(squad_strength, key_player_count)`` from a team's recent matches.

    The query looks at the team's last 8 finished football matches before
    ``before_ts`` and scores every player who took part:
    1.5 per start, 0.5 per non-starting appearance, 2.5 per goal and 1.5 per
    assist. Goal events whose subtype matches 'penaltı kaçırma' are excluded.
    The 11 highest scores are averaged, and scores of at least 6.0 are counted
    as key players.

    Args:
        cur: open database cursor used to run the query.
        team_id: team to profile; a falsy value skips the query.
        before_ts: exclusive upper bound on ``mst_utc``.

    Returns:
        ``(strength, key_players)`` where ``strength`` is the average top score
        divided by 10 and clamped to ``[0.0, 1.0]``. ``(0.5, 0.0)`` is returned
        when ``team_id`` is empty or the query yields no row.
    """
    if not team_id:
        return 0.5, 0.0

    squad_sql = """
        WITH recent_matches AS (
            SELECT m.id
            FROM matches m
            WHERE (m.home_team_id = %s OR m.away_team_id = %s)
              AND m.sport = 'football'
              AND m.status = 'FT'
              AND m.mst_utc < %s
            ORDER BY m.mst_utc DESC
            LIMIT 8
        ),
        player_base AS (
            SELECT
                mpp.player_id,
                COUNT(*)::float AS appearances,
                COUNT(*) FILTER (WHERE mpp.is_starting = true)::float AS starts
            FROM match_player_participation mpp
            JOIN recent_matches rm ON rm.id = mpp.match_id
            WHERE mpp.team_id = %s
            GROUP BY mpp.player_id
        ),
        player_goals AS (
            SELECT
                mpe.player_id,
                COUNT(*) FILTER (
                    WHERE mpe.event_type = 'goal'
                      AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
                )::float AS goals,
                0.0::float AS assists
            FROM match_player_events mpe
            JOIN recent_matches rm ON rm.id = mpe.match_id
            WHERE mpe.team_id = %s
            GROUP BY mpe.player_id
            UNION ALL
            SELECT
                mpe.assist_player_id AS player_id,
                0.0::float AS goals,
                COUNT(*) FILTER (
                    WHERE mpe.event_type = 'goal'
                      AND mpe.assist_player_id IS NOT NULL
                )::float AS assists
            FROM match_player_events mpe
            JOIN recent_matches rm ON rm.id = mpe.match_id
            WHERE mpe.team_id = %s
              AND mpe.assist_player_id IS NOT NULL
            GROUP BY mpe.assist_player_id
        ),
        player_events AS (
            SELECT
                player_id,
                SUM(goals) AS goals,
                SUM(assists) AS assists
            FROM player_goals
            GROUP BY player_id
        ),
        player_scores AS (
            SELECT
                pb.player_id,
                (pb.starts * 1.5)
                + ((pb.appearances - pb.starts) * 0.5)
                + (COALESCE(pe.goals, 0.0) * 2.5)
                + (COALESCE(pe.assists, 0.0) * 1.5) AS score
            FROM player_base pb
            LEFT JOIN player_events pe ON pe.player_id = pb.player_id
        )
        SELECT
            COALESCE(AVG(top_players.score), 0.0) AS avg_top_score,
            COALESCE(COUNT(*) FILTER (WHERE top_players.score >= 6.0), 0) AS key_players
        FROM (
            SELECT score
            FROM player_scores
            ORDER BY score DESC
            LIMIT 11
        ) top_players
    """
    # Placeholder order: home/away team filter, timestamp bound, then the
    # team filter of each of the three player CTE branches.
    sql_params = (team_id, team_id, before_ts, team_id, team_id, team_id)
    cur.execute(squad_sql, sql_params)

    result = cur.fetchone()
    if not result:
        return 0.5, 0.0

    # Scale the average top-player score to a 0..1 strength value.
    strength = float(result[0] or 0.0) / 10.0
    if strength < 0.0:
        strength = 0.0
    elif strength > 1.0:
        strength = 1.0

    key_players = float(result[1] or 0.0)
    return strength, key_players
|
||||
|
||||
|
||||
def _enrich_pre_match_context(
    cur: psycopg2.extensions.cursor,
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Return a copy of ``df`` with league, referee and squad context columns added.

    For every match row the league goal average, the referee profile and both
    teams' squad profiles are looked up with the match's own ``utc`` as the
    cut-off, so each value is computed from earlier matches only. This issues
    four queries per row.

    Args:
        cur: open database cursor passed through to the lookup helpers.
        df: match frame with at least ``utc``, ``league_id``, ``h_id`` and
            ``a_id`` columns; ``referee_name`` is optional.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # One value list per new column; insertion order is the column order.
    context: dict[str, list[float]] = {
        "league_avg_goals": [],
        "referee_avg_goals": [],
        "referee_home_bias": [],
        "home_squad_strength": [],
        "away_squad_strength": [],
        "home_key_players": [],
        "away_key_players": [],
    }

    print("Pre-match context enrich ediliyor...")
    t0 = time.time()

    for match in df.itertuples(index=False):
        cutoff_ts = int(match.utc or 0)
        league_id = str(match.league_id or "")
        raw_referee: Any = getattr(match, "referee_name", None)
        referee_name = str(raw_referee).strip() if raw_referee else None

        league_goals = _compute_league_avg_goals(cur, league_id, cutoff_ts)
        referee_goals, referee_bias = _compute_referee_profile(cur, referee_name, cutoff_ts)
        home_strength, home_key = _compute_team_squad_profile(cur, str(match.h_id), cutoff_ts)
        away_strength, away_key = _compute_team_squad_profile(cur, str(match.a_id), cutoff_ts)

        context["league_avg_goals"].append(league_goals)
        context["referee_avg_goals"].append(referee_goals)
        context["referee_home_bias"].append(referee_bias)
        context["home_squad_strength"].append(home_strength)
        context["away_squad_strength"].append(away_strength)
        context["home_key_players"].append(home_key)
        context["away_key_players"].append(away_key)

    enriched = df.copy()
    for column, values in context.items():
        enriched[column] = values

    print(f"Pre-match context tamam ({time.time() - t0:.1f}s)")
    return enriched
|
||||
|
||||
|
||||
def _prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the enriched match frame into model features and training targets.

    Steps:
      1. Coerce the numeric columns and fill missing values with column medians.
      2. Keep only rows whose three 1X2 odds are all greater than 1.0.
      3. Derive margin-free implied probabilities ``imp_h`` / ``imp_d`` / ``imp_a``.
      4. Build the shared VQWEN feature row for every match and copy each
         ``FEATURE_COLUMNS`` entry onto the frame.
      5. Add the targets ``t_ms`` (0 home win, 1 draw, 2 away win), ``t_ou``
         (more than 2.5 total goals) and ``t_btts`` (both teams scored).

    Args:
        df: frame produced by the fetch and enrichment steps. It is not
            modified; the function works on a copy.

    Returns:
        A new DataFrame with the feature and target columns added.

    Raises:
        RuntimeError: if no row survives the odds filter.
    """
    numeric_columns = [
        "sh",
        "sa",
        "utc",
        "league_avg_goals",
        "referee_avg_goals",
        "referee_home_bias",
        "home_squad_strength",
        "away_squad_strength",
        "home_key_players",
        "away_key_players",
        "h_elo",
        "a_elo",
        "h_home_goals",
        "a_away_goals",
        "h_rest",
        "a_rest",
        "oh",
        "od",
        "oa",
        "h2h_h_wr",
        "h_form_pts",
        "a_form_pts",
    ]
    # Copy before coercing so the caller's frame is left untouched; assigning
    # the coerced columns directly would rewrite the input in place.
    df = df.copy()
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # NOTE(review): imputation runs before the odds filter, so a match with
    # missing 1X2 odds receives the median odds and passes the filter below.
    # Confirm this is intended rather than dropping such matches.
    df = df.fillna(df.median(numeric_only=True))
    df = df[(df["oh"] > 1.0) & (df["od"] > 1.0) & (df["oa"] > 1.0)].copy()
    if df.empty:
        raise RuntimeError("No valid rows remained after odds filtering.")

    # Normalise the inverse odds so the three probabilities sum to 1.
    margin = (1.0 / df["oh"]) + (1.0 / df["od"]) + (1.0 / df["oa"])
    df["imp_h"] = (1.0 / df["oh"]) / margin
    df["imp_d"] = (1.0 / df["od"]) / margin
    df["imp_a"] = (1.0 / df["oa"]) / margin

    def _feature_row(row: pd.Series) -> Any:
        """Build the shared-contract feature row for one match."""
        return build_vqwen_feature_row(
            VqwenFeatureInput(
                home_elo=float(row["h_elo"]),
                away_elo=float(row["a_elo"]),
                home_avg_goals_scored=float(row["h_home_goals"]),
                away_avg_goals_scored=float(row["a_away_goals"]),
                # NOTE(review): conceded averages are fed from the opponent's
                # scoring averages; presumably this mirrors the runtime
                # feature builder. Confirm.
                home_avg_goals_conceded=float(row["a_away_goals"]),
                away_avg_goals_conceded=float(row["h_home_goals"]),
                # Shots and possession are fixed constants for every row.
                home_avg_shots_on_target=4.0,
                away_avg_shots_on_target=4.0,
                home_avg_possession=50.0,
                away_avg_possession=50.0,
                home_rest_days=float(row["h_rest"]),
                away_rest_days=float(row["a_rest"]),
                implied_prob_home=float(row["imp_h"]),
                implied_prob_draw=float(row["imp_d"]),
                implied_prob_away=float(row["imp_a"]),
                # Historical training must not leak actual match lineups.
                # Runtime also often defaults to 1.0 when pre-match lineup data
                # is unavailable, so training should mirror that behavior.
                home_lineup_availability=1.0,
                away_lineup_availability=1.0,
                h2h_home_win_rate=float(row["h2h_h_wr"]),
                home_form_score=float(row["h_form_pts"]),
                away_form_score=float(row["a_form_pts"]),
                league_avg_goals=float(row["league_avg_goals"]),
                referee_avg_goals=float(row["referee_avg_goals"]),
                referee_home_bias=float(row["referee_home_bias"]),
                home_squad_strength=float(row["home_squad_strength"]),
                away_squad_strength=float(row["away_squad_strength"]),
                home_key_players=float(row["home_key_players"]),
                away_key_players=float(row["away_key_players"]),
            ),
        )

    feature_rows = df.apply(_feature_row, axis=1, result_type="expand")
    for column in FEATURE_COLUMNS:
        df[column] = feature_rows[column]

    # Match result target without a row-wise apply: start from 1 (draw),
    # subtract 1 for a home win, add 1 for an away win. Rows where neither
    # comparison holds stay at 1, exactly like the row-wise version did.
    home_win = (df["sh"] > df["sa"]).astype(int)
    away_win = (df["sh"] < df["sa"]).astype(int)
    df["t_ms"] = 1 - home_win + away_win
    df["t_ou"] = ((df["sh"] + df["sa"]) > 2.5).astype(int)
    df["t_btts"] = ((df["sh"] > 0) & (df["sa"] > 0)).astype(int)

    return df
|
||||
|
||||
|
||||
def _temporal_split(df: pd.DataFrame, validation_ratio: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` chronologically into training and validation frames.

    Rows are sorted by ``utc`` ascending; the earliest rows form the training
    set and the latest ``validation_ratio`` share forms the validation set.
    The split point is clamped so that both parts contain at least one row.

    Args:
        df: frame with a ``utc`` column.
        validation_ratio: share of the most recent rows used for validation.

    Returns:
        ``(train_df, valid_df)`` as independent copies with a fresh index.

    Raises:
        RuntimeError: if ``df`` is empty or has fewer than two rows.
    """
    if df.empty:
        raise RuntimeError("Cannot split an empty dataframe.")
    # A single row cannot fill both sides; without this guard the training
    # part would come back empty and training would fail later, obscurely.
    if len(df) < 2:
        raise RuntimeError("Need at least two rows for a temporal train/validation split.")

    # Stable sort keeps the incoming order of matches that share a timestamp,
    # so the split is reproducible from run to run.
    ordered = df.sort_values("utc", kind="stable").reset_index(drop=True)
    split_index = max(int(len(ordered) * (1.0 - validation_ratio)), 1)
    split_index = min(split_index, len(ordered) - 1)
    return ordered.iloc[:split_index].copy(), ordered.iloc[split_index:].copy()
|
||||
|
||||
|
||||
def _save_metadata(df: pd.DataFrame, league_ids: list[str]) -> None:
    """Write a JSON summary of the training run next to the model files.

    The summary records the training time, contract version, league filter,
    sample count, feature column names and the class counts of the three
    targets. It is written to ``vqwen_training_meta.json`` inside
    ``MODELS_DIR``, which is created if needed.

    Args:
        df: prepared training frame containing ``t_ms``, ``t_ou``, ``t_btts``.
        league_ids: league ids the training data was restricted to.
    """
    total = len(df)
    result_target = df["t_ms"]
    over_sum = df["t_ou"].sum()
    btts_sum = df["t_btts"].sum()

    target_distribution = {
        "ms_home": int((result_target == 0).sum()),
        "ms_draw": int((result_target == 1).sum()),
        "ms_away": int((result_target == 2).sum()),
        "ou25_over": int(over_sum),
        "ou25_under": int(total - over_sum),
        "btts_yes": int(btts_sum),
        "btts_no": int(total - btts_sum),
    }
    metadata = {
        "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "contract_version": "vqwen.shared.v1",
        "league_count": len(league_ids),
        "league_ids": league_ids,
        "sample_count": int(total),
        "feature_columns": FEATURE_COLUMNS,
        "target_distribution": target_distribution,
    }

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    meta_path = MODELS_DIR / "vqwen_training_meta.json"
    meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
def _train_booster(
    params: dict[str, Any],
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    target: str,
) -> lgb.Booster:
    """Fit one LightGBM booster on ``FEATURE_COLUMNS`` for the given target column.

    Training runs for at most 1000 rounds and stops early after 50 rounds
    without improvement on the validation set.
    """
    return lgb.train(
        params,
        lgb.Dataset(train_df[FEATURE_COLUMNS], train_df[target]),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(valid_df[FEATURE_COLUMNS], valid_df[target])],
        callbacks=[lgb.early_stopping(50)],
    )


def train_vqwen_v3() -> None:
    """Train and save the three VQWEN v3 models for the configured top leagues.

    Pipeline: load the league filter, pull and enrich match data from the
    database, build features and targets, split chronologically, train the
    match-result (multiclass), over/under 2.5 (binary) and both-teams-to-score
    (binary) boosters, then pickle them and write the training metadata into
    ``MODELS_DIR``.
    """
    print("VQWEN v3 MODEL EGITIMI (TOP LEAGUES)")
    print("=" * 60)

    league_ids = load_top_league_ids()
    print(f"League filter aktif: {len(league_ids)} lig")

    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)
    # Only the fetch and enrichment steps need the database. The cursor is
    # created inside the try so the connection is closed even if that fails,
    # and everything is released before the long-running training starts.
    try:
        cur = conn.cursor()
        try:
            df = _fetch_dataframe(cur, league_ids)
            df = _enrich_pre_match_context(cur, df)
        finally:
            cur.close()
    finally:
        conn.close()

    df = _prepare_features(df)
    print(f"Temiz egitim orneklemi: {len(df)} mac")

    train_df, valid_df = _temporal_split(df)
    print(
        "Temporal split:"
        f" train={len(train_df)}"
        f" valid={len(valid_df)}"
        f" train_end_utc={int(train_df['utc'].max())}"
        f" valid_start_utc={int(valid_df['utc'].min())}"
    )

    print("MS modeli egitiliyor...")
    model_ms = _train_booster(
        {
            "objective": "multiclass",
            "num_class": 3,
            "metric": "multi_logloss",
            "verbose": -1,
            "num_leaves": 63,
            "learning_rate": 0.03,
            "feature_fraction": 0.85,
            "bagging_fraction": 0.85,
            "bagging_freq": 1,
        },
        train_df,
        valid_df,
        "t_ms",
    )

    # Both binary markets share the same hyper-parameters; each call gets its
    # own dict so the two runs cannot influence one another.
    binary_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbose": -1,
        "learning_rate": 0.03,
        "num_leaves": 31,
    }

    print("OU2.5 modeli egitiliyor...")
    model_ou25 = _train_booster(dict(binary_params), train_df, valid_df, "t_ou")

    print("BTTS modeli egitiliyor...")
    model_btts = _train_booster(dict(binary_params), train_df, valid_df, "t_btts")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    artifacts = {
        "vqwen_ms.pkl": model_ms,
        "vqwen_ou25.pkl": model_ou25,
        "vqwen_btts.pkl": model_btts,
    }
    for filename, model in artifacts.items():
        with (MODELS_DIR / filename).open("wb") as handle:
            pickle.dump(model, handle)
        print(f"Kaydedildi: {filename}")

    _save_metadata(df, league_ids)
    print("Kaydedildi: vqwen_training_meta.json")
    print("VQWEN v3 top league egitimi tamamlandi.")
|
||||
|
||||
|
||||
# Run the full training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    train_vqwen_v3()
|
||||
Reference in New Issue
Block a user