Files
fahricansecer 2f0b85a0c7
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s
first (part 2: other directories)
2026-04-16 15:11:25 +03:00

703 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
VQWEN v3 Training Script
========================
Retrains the VQWEN market models using only the configured top leagues.
"""
from __future__ import annotations
import json
import os
import pickle
import sys
import time
from pathlib import Path
from typing import Any
import lightgbm as lgb
import pandas as pd
import psycopg2
from dotenv import load_dotenv
# Directory layout, derived from this file's resolved location:
# <repo>/<engine>/<ai>/<this file>.
AI_DIR = Path(__file__).resolve().parent
ENGINE_DIR = AI_DIR.parent
REPO_DIR = ENGINE_DIR.parent
MODELS_DIR = ENGINE_DIR.joinpath("models", "vqwen")
TOP_LEAGUES_PATH = REPO_DIR.joinpath("top_leagues.json")

# The engine directory must be importable before the ``features`` package
# import that follows this block.
if sys.path.count(str(ENGINE_DIR)) == 0:
    sys.path.insert(0, str(ENGINE_DIR))
from features.vqwen_contract import (
FEATURE_COLUMNS,
VqwenFeatureInput,
build_vqwen_feature_row,
)
def _load_env() -> None:
    """Load ``.env`` files from the repository root, then the engine directory.

    Both files are loaded with ``override=False``; the repository-level file
    is read first.
    """
    for base_dir in (REPO_DIR, ENGINE_DIR):
        load_dotenv(base_dir / ".env", override=False)
def get_clean_dsn() -> str:
    """Return ``DATABASE_URL`` with surrounding quotes and any query string removed.

    The ``.env`` files are loaded first. Everything from the first ``?``
    onward is dropped before the value is returned.

    Raises:
        RuntimeError: If ``DATABASE_URL`` is unset or blank after stripping.
    """
    _load_env()
    dsn = os.getenv("DATABASE_URL", "")
    dsn = dsn.strip().strip('"').strip("'")
    if dsn == "":
        raise RuntimeError("DATABASE_URL is missing.")
    base, _, _ = dsn.partition("?")
    return base
def load_top_league_ids() -> list[str]:
    """Read ``top_leagues.json`` and return its unique, non-blank league ids.

    Every array item is converted with ``str`` and stripped; blank results are
    skipped and duplicates are removed while keeping first-seen order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the JSON root is not an array, or no id survives cleaning.
    """
    if not TOP_LEAGUES_PATH.exists():
        raise FileNotFoundError(f"top_leagues.json not found at {TOP_LEAGUES_PATH}")

    payload = json.loads(TOP_LEAGUES_PATH.read_text(encoding="utf-8"))
    if not isinstance(payload, list):
        raise ValueError("top_leagues.json must contain a JSON array.")

    # A dict keeps insertion order, which gives an order-preserving dedupe.
    unique_ids: dict[str, None] = {}
    for entry in payload:
        text = str(entry).strip()
        if text:
            unique_ids.setdefault(text, None)

    if not unique_ids:
        raise ValueError("top_leagues.json is empty.")
    return list(unique_ids)
def _fetch_dataframe(cur: psycopg2.extensions.cursor, league_ids: list[str]) -> pd.DataFrame:
query = """
WITH match_data AS (
SELECT
m.id,
m.league_id,
m.home_team_id,
m.away_team_id,
m.score_home,
m.score_away,
m.mst_utc,
ref.name AS referee_name,
COALESCE(maf.home_elo, 1500) AS home_elo,
COALESCE(maf.away_elo, 1500) AS away_elo,
COALESCE(
(
SELECT AVG(m2.score_home)
FROM matches m2
WHERE m2.home_team_id = m.home_team_id
AND m2.status = 'FT'
AND m2.mst_utc < m.mst_utc
),
1.2
) AS h_home_goals,
COALESCE(
(
SELECT AVG(m2.score_away)
FROM matches m2
WHERE m2.away_team_id = m.away_team_id
AND m2.status = 'FT'
AND m2.mst_utc < m.mst_utc
),
1.2
) AS a_away_goals,
COALESCE(
(
SELECT EXTRACT(
EPOCH FROM (
to_timestamp(m.mst_utc / 1000.0)
- MAX(to_timestamp(m2.mst_utc / 1000.0))
)
) / 86400.0
FROM matches m2
WHERE m2.home_team_id = m.home_team_id
AND m2.status = 'FT'
AND m2.mst_utc < m.mst_utc
),
7
) AS h_rest,
COALESCE(
(
SELECT EXTRACT(
EPOCH FROM (
to_timestamp(m.mst_utc / 1000.0)
- MAX(to_timestamp(m2.mst_utc / 1000.0))
)
) / 86400.0
FROM matches m2
WHERE m2.away_team_id = m.away_team_id
AND m2.status = 'FT'
AND m2.mst_utc < m.mst_utc
),
7
) AS a_rest,
(
SELECT os.odd_value
FROM odd_categories oc
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
WHERE oc.match_id = m.id
AND oc.name ILIKE 'Maç Sonucu'
AND os.name = '1'
LIMIT 1
) AS oh,
(
SELECT os.odd_value
FROM odd_categories oc
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
WHERE oc.match_id = m.id
AND oc.name ILIKE 'Maç Sonucu'
AND os.name = 'X'
LIMIT 1
) AS od,
(
SELECT os.odd_value
FROM odd_categories oc
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
WHERE oc.match_id = m.id
AND oc.name ILIKE 'Maç Sonucu'
AND os.name = '2'
LIMIT 1
) AS oa
FROM matches m
LEFT JOIN football_ai_features maf ON maf.match_id = m.id
LEFT JOIN match_officials ref ON ref.match_id = m.id AND ref.role_id = 1
WHERE m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
AND m.sport = 'football'
AND m.league_id = ANY(%s)
AND EXISTS (SELECT 1 FROM odd_categories oc WHERE oc.match_id = m.id)
)
SELECT
md.*,
COALESCE(
(
SELECT
(
COUNT(*) FILTER (
WHERE (
(m2.home_team_id = md.home_team_id AND m2.score_home > m2.score_away)
OR
(m2.away_team_id = md.home_team_id AND m2.score_away > m2.score_home)
)
)::float
+ COUNT(*) FILTER (WHERE m2.score_home = m2.score_away)::float * 0.5
) / NULLIF(COUNT(*), 0)
FROM matches m2
WHERE m2.status = 'FT'
AND m2.mst_utc < md.mst_utc
AND (
(m2.home_team_id = md.home_team_id AND m2.away_team_id = md.away_team_id)
OR
(m2.home_team_id = md.away_team_id AND m2.away_team_id = md.home_team_id)
)
),
0.5
) AS h2h_h_wr,
COALESCE(
(
SELECT SUM(points)
FROM (
SELECT
CASE
WHEN m2.score_home > m2.score_away THEN 3
WHEN m2.score_home = m2.score_away THEN 1
ELSE 0
END AS points
FROM matches m2
WHERE m2.home_team_id = md.home_team_id
AND m2.status = 'FT'
AND m2.mst_utc < md.mst_utc
ORDER BY m2.mst_utc DESC
LIMIT 5
) home_form
),
0
) AS h_form_pts,
COALESCE(
(
SELECT SUM(points)
FROM (
SELECT
CASE
WHEN m2.score_away > m2.score_home THEN 3
WHEN m2.score_away = m2.score_home THEN 1
ELSE 0
END AS points
FROM matches m2
WHERE m2.away_team_id = md.away_team_id
AND m2.status = 'FT'
AND m2.mst_utc < md.mst_utc
ORDER BY m2.mst_utc DESC
LIMIT 5
) away_form
),
0
) AS a_form_pts
FROM match_data md
ORDER BY md.mst_utc DESC
"""
print("Top league verisi cekiliyor...")
started_at = time.time()
cur.execute(query, (league_ids,))
rows = cur.fetchall()
elapsed = time.time() - started_at
print(f"{len(rows)} mac cekildi ({elapsed:.1f}s)")
dataframe = pd.DataFrame(
rows,
columns=[
"id",
"league_id",
"h_id",
"a_id",
"sh",
"sa",
"utc",
"referee_name",
"h_elo",
"a_elo",
"h_home_goals",
"a_away_goals",
"h_rest",
"a_rest",
"oh",
"od",
"oa",
"h2h_h_wr",
"h_form_pts",
"a_form_pts",
],
)
return dataframe
def _compute_league_avg_goals(
cur: psycopg2.extensions.cursor,
league_id: str,
before_ts: int,
) -> float:
if not league_id:
return 2.6
cur.execute(
"""
SELECT COALESCE(AVG(src.score_home + src.score_away), 2.6)
FROM (
SELECT score_home, score_away
FROM matches
WHERE league_id = %s
AND sport = 'football'
AND status = 'FT'
AND score_home IS NOT NULL
AND score_away IS NOT NULL
AND mst_utc < %s
ORDER BY mst_utc DESC
LIMIT 100
) src
""",
(league_id, before_ts),
)
row = cur.fetchone()
return float(row[0] or 2.6)
def _compute_referee_profile(
cur: psycopg2.extensions.cursor,
referee_name: str | None,
before_ts: int,
) -> tuple[float, float]:
if not referee_name:
return 2.6, 0.0
cur.execute(
"""
SELECT
COALESCE(AVG(score_home + score_away), 2.6) AS avg_goals,
COALESCE(AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END), 0.46) - 0.46 AS home_bias
FROM (
SELECT m.score_home, m.score_away
FROM match_officials mo
JOIN matches m ON m.id = mo.match_id
WHERE mo.name = %s
AND mo.role_id = 1
AND m.sport = 'football'
AND m.status = 'FT'
AND m.score_home IS NOT NULL
AND m.score_away IS NOT NULL
AND m.mst_utc < %s
ORDER BY m.mst_utc DESC
LIMIT 30
) src
""",
(referee_name, before_ts),
)
row = cur.fetchone()
if not row:
return 2.6, 0.0
return float(row[0] or 2.6), float(row[1] or 0.0)
def _compute_team_squad_profile(
cur: psycopg2.extensions.cursor,
team_id: str,
before_ts: int,
) -> tuple[float, float]:
if not team_id:
return 0.5, 0.0
cur.execute(
"""
WITH recent_matches AS (
SELECT m.id
FROM matches m
WHERE (m.home_team_id = %s OR m.away_team_id = %s)
AND m.sport = 'football'
AND m.status = 'FT'
AND m.mst_utc < %s
ORDER BY m.mst_utc DESC
LIMIT 8
),
player_base AS (
SELECT
mpp.player_id,
COUNT(*)::float AS appearances,
COUNT(*) FILTER (WHERE mpp.is_starting = true)::float AS starts
FROM match_player_participation mpp
JOIN recent_matches rm ON rm.id = mpp.match_id
WHERE mpp.team_id = %s
GROUP BY mpp.player_id
),
player_goals AS (
SELECT
mpe.player_id,
COUNT(*) FILTER (
WHERE mpe.event_type = 'goal'
AND COALESCE(mpe.event_subtype, '') NOT ILIKE '%%penaltı kaçırma%%'
)::float AS goals,
0.0::float AS assists
FROM match_player_events mpe
JOIN recent_matches rm ON rm.id = mpe.match_id
WHERE mpe.team_id = %s
GROUP BY mpe.player_id
UNION ALL
SELECT
mpe.assist_player_id AS player_id,
0.0::float AS goals,
COUNT(*) FILTER (
WHERE mpe.event_type = 'goal'
AND mpe.assist_player_id IS NOT NULL
)::float AS assists
FROM match_player_events mpe
JOIN recent_matches rm ON rm.id = mpe.match_id
WHERE mpe.team_id = %s
AND mpe.assist_player_id IS NOT NULL
GROUP BY mpe.assist_player_id
),
player_events AS (
SELECT
player_id,
SUM(goals) AS goals,
SUM(assists) AS assists
FROM player_goals
GROUP BY player_id
),
player_scores AS (
SELECT
pb.player_id,
(pb.starts * 1.5)
+ ((pb.appearances - pb.starts) * 0.5)
+ (COALESCE(pe.goals, 0.0) * 2.5)
+ (COALESCE(pe.assists, 0.0) * 1.5) AS score
FROM player_base pb
LEFT JOIN player_events pe ON pe.player_id = pb.player_id
)
SELECT
COALESCE(AVG(top_players.score), 0.0) AS avg_top_score,
COALESCE(COUNT(*) FILTER (WHERE top_players.score >= 6.0), 0) AS key_players
FROM (
SELECT score
FROM player_scores
ORDER BY score DESC
LIMIT 11
) top_players
""",
(team_id, team_id, before_ts, team_id, team_id, team_id),
)
row = cur.fetchone()
if not row:
return 0.5, 0.0
avg_top_score = float(row[0] or 0.0)
return min(max(avg_top_score / 10.0, 0.0), 1.0), float(row[1] or 0.0)
def _enrich_pre_match_context(
    cur: psycopg2.extensions.cursor,
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Add league, referee and squad context columns to a copy of ``df``.

    For every row, four lookups are made with the row's ``utc`` value as the
    exclusive time bound: league goal average, referee profile, and the squad
    profile of the home and away team. The input frame is not modified.

    Args:
        cur: Open database cursor passed through to the lookup helpers.
        df: Frame with at least ``utc``, ``league_id``, ``h_id`` and ``a_id``;
            ``referee_name`` is optional.

    Returns:
        A copy of ``df`` with seven additional context columns.
    """
    context_columns = (
        "league_avg_goals",
        "referee_avg_goals",
        "referee_home_bias",
        "home_squad_strength",
        "away_squad_strength",
        "home_key_players",
        "away_key_players",
    )
    collected: dict[str, list[float]] = {name: [] for name in context_columns}

    print("Pre-match context enrich ediliyor...")
    t0 = time.time()
    for match in df.itertuples(index=False):
        kickoff = int(match.utc or 0)
        league = str(match.league_id or "")
        raw_referee: Any = getattr(match, "referee_name", None)
        referee = str(raw_referee).strip() if raw_referee else None

        league_goals = _compute_league_avg_goals(cur, league, kickoff)
        ref_goals, ref_bias = _compute_referee_profile(cur, referee, kickoff)
        home_strength, home_keys = _compute_team_squad_profile(cur, str(match.h_id), kickoff)
        away_strength, away_keys = _compute_team_squad_profile(cur, str(match.a_id), kickoff)

        row_values = (
            league_goals,
            ref_goals,
            ref_bias,
            home_strength,
            away_strength,
            home_keys,
            away_keys,
        )
        for name, value in zip(context_columns, row_values):
            collected[name].append(value)

    result = df.copy()
    for name in context_columns:
        result[name] = collected[name]
    print(f"Pre-match context tamam ({time.time() - t0:.1f}s)")
    return result
def _prepare_features(df: pd.DataFrame) -> pd.DataFrame:
numeric_columns = [
"sh",
"sa",
"utc",
"league_avg_goals",
"referee_avg_goals",
"referee_home_bias",
"home_squad_strength",
"away_squad_strength",
"home_key_players",
"away_key_players",
"h_elo",
"a_elo",
"h_home_goals",
"a_away_goals",
"h_rest",
"a_rest",
"oh",
"od",
"oa",
"h2h_h_wr",
"h_form_pts",
"a_form_pts",
]
for column in numeric_columns:
df[column] = pd.to_numeric(df[column], errors="coerce")
df = df.fillna(df.median(numeric_only=True))
df = df[(df["oh"] > 1.0) & (df["od"] > 1.0) & (df["oa"] > 1.0)].copy()
if df.empty:
raise RuntimeError("No valid rows remained after odds filtering.")
margin = (1.0 / df["oh"]) + (1.0 / df["od"]) + (1.0 / df["oa"])
df["imp_h"] = (1.0 / df["oh"]) / margin
df["imp_d"] = (1.0 / df["od"]) / margin
df["imp_a"] = (1.0 / df["oa"]) / margin
feature_rows = df.apply(
lambda row: build_vqwen_feature_row(
VqwenFeatureInput(
home_elo=float(row["h_elo"]),
away_elo=float(row["a_elo"]),
home_avg_goals_scored=float(row["h_home_goals"]),
away_avg_goals_scored=float(row["a_away_goals"]),
home_avg_goals_conceded=float(row["a_away_goals"]),
away_avg_goals_conceded=float(row["h_home_goals"]),
home_avg_shots_on_target=4.0,
away_avg_shots_on_target=4.0,
home_avg_possession=50.0,
away_avg_possession=50.0,
home_rest_days=float(row["h_rest"]),
away_rest_days=float(row["a_rest"]),
implied_prob_home=float(row["imp_h"]),
implied_prob_draw=float(row["imp_d"]),
implied_prob_away=float(row["imp_a"]),
# Historical training must not leak actual match lineups.
# Runtime also often defaults to 1.0 when pre-match lineup data
# is unavailable, so training should mirror that behavior.
home_lineup_availability=1.0,
away_lineup_availability=1.0,
h2h_home_win_rate=float(row["h2h_h_wr"]),
home_form_score=float(row["h_form_pts"]),
away_form_score=float(row["a_form_pts"]),
league_avg_goals=float(row["league_avg_goals"]),
referee_avg_goals=float(row["referee_avg_goals"]),
referee_home_bias=float(row["referee_home_bias"]),
home_squad_strength=float(row["home_squad_strength"]),
away_squad_strength=float(row["away_squad_strength"]),
home_key_players=float(row["home_key_players"]),
away_key_players=float(row["away_key_players"]),
),
),
axis=1,
result_type="expand",
)
for column in FEATURE_COLUMNS:
df[column] = feature_rows[column]
df["t_ms"] = df.apply(
lambda row: 0 if row["sh"] > row["sa"] else (2 if row["sh"] < row["sa"] else 1),
axis=1,
)
df["t_ou"] = ((df["sh"] + df["sa"]) > 2.5).astype(int)
df["t_btts"] = ((df["sh"] > 0) & (df["sa"] > 0)).astype(int)
return df
def _temporal_split(df: pd.DataFrame, validation_ratio: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame]:
if df.empty:
raise RuntimeError("Cannot split an empty dataframe.")
ordered = df.sort_values("utc").reset_index(drop=True)
split_index = max(int(len(ordered) * (1.0 - validation_ratio)), 1)
split_index = min(split_index, len(ordered) - 1)
return ordered.iloc[:split_index].copy(), ordered.iloc[split_index:].copy()
def _save_metadata(df: pd.DataFrame, league_ids: list[str]) -> None:
    """Write ``vqwen_training_meta.json`` describing the training run.

    The file records the local training time, contract version, league
    filter, sample count, feature columns and the class counts of the three
    targets (``t_ms``, ``t_ou``, ``t_btts``). ``MODELS_DIR`` is created if
    needed.

    Args:
        df: Prepared training frame containing the target columns.
        league_ids: League ids the data was restricted to.
    """
    row_count = len(df)
    outcome = df["t_ms"]
    over_total = df["t_ou"].sum()
    btts_total = df["t_btts"].sum()

    target_distribution = {
        "ms_home": int((outcome == 0).sum()),
        "ms_draw": int((outcome == 1).sum()),
        "ms_away": int((outcome == 2).sum()),
        "ou25_over": int(over_total),
        "ou25_under": int(row_count - over_total),
        "btts_yes": int(btts_total),
        "btts_no": int(row_count - btts_total),
    }
    metadata = {
        "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "contract_version": "vqwen.shared.v1",
        "league_count": len(league_ids),
        "league_ids": league_ids,
        "sample_count": int(row_count),
        "feature_columns": FEATURE_COLUMNS,
        "target_distribution": target_distribution,
    }

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    target_path = MODELS_DIR / "vqwen_training_meta.json"
    target_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
def _load_training_frame(league_ids: list[str]) -> pd.DataFrame:
    """Fetch and enrich the raw match frame, always releasing DB resources."""
    conn = psycopg2.connect(get_clean_dsn())
    try:
        # The cursor is created inside the try so that a failure here still
        # closes the connection.
        cur = conn.cursor()
        try:
            df = _fetch_dataframe(cur, league_ids)
            return _enrich_pre_match_context(cur, df)
        finally:
            cur.close()
    finally:
        conn.close()


def _train_booster(
    params: dict[str, Any],
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    target: str,
) -> lgb.Booster:
    """Fit one LightGBM booster on ``FEATURE_COLUMNS`` for ``target``."""
    return lgb.train(
        dict(params),
        lgb.Dataset(train_df[FEATURE_COLUMNS], train_df[target]),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(valid_df[FEATURE_COLUMNS], valid_df[target])],
        callbacks=[lgb.early_stopping(50)],
    )


def train_vqwen_v3() -> None:
    """Train and persist the three VQWEN v3 market models.

    Pipeline: load the league filter, pull and enrich match data from the
    database, build features and targets, split chronologically, then train

    * a 3-class match-result model (``t_ms``),
    * a binary over/under 2.5 goals model (``t_ou``),
    * a binary both-teams-to-score model (``t_btts``),

    each with up to 1000 boosting rounds and early stopping after 50 rounds
    without improvement on the validation split. Models are pickled into
    ``MODELS_DIR`` alongside a metadata JSON file.

    The database connection is only held while data is being read; it is
    closed before feature preparation and training start.
    """
    print("VQWEN v3 MODEL EGITIMI (TOP LEAGUES)")
    print("=" * 60)
    league_ids = load_top_league_ids()
    print(f"League filter aktif: {len(league_ids)} lig")

    df = _prepare_features(_load_training_frame(league_ids))
    print(f"Temiz egitim orneklemi: {len(df)} mac")

    train_df, valid_df = _temporal_split(df)
    print(
        "Temporal split:"
        f" train={len(train_df)}"
        f" valid={len(valid_df)}"
        f" train_end_utc={int(train_df['utc'].max())}"
        f" valid_start_utc={int(valid_df['utc'].min())}"
    )

    ms_params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",
        "verbose": -1,
        "num_leaves": 63,
        "learning_rate": 0.03,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.85,
        "bagging_freq": 1,
    }
    # Both binary markets share the same hyper-parameters.
    binary_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbose": -1,
        "learning_rate": 0.03,
        "num_leaves": 31,
    }

    print("MS modeli egitiliyor...")
    model_ms = _train_booster(ms_params, train_df, valid_df, "t_ms")
    print("OU2.5 modeli egitiliyor...")
    model_ou25 = _train_booster(binary_params, train_df, valid_df, "t_ou")
    print("BTTS modeli egitiliyor...")
    model_btts = _train_booster(binary_params, train_df, valid_df, "t_btts")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    artifacts = {
        "vqwen_ms.pkl": model_ms,
        "vqwen_ou25.pkl": model_ou25,
        "vqwen_btts.pkl": model_btts,
    }
    for filename, model in artifacts.items():
        with (MODELS_DIR / filename).open("wb") as handle:
            pickle.dump(model, handle)
        print(f"Kaydedildi: {filename}")

    _save_metadata(df, league_ids)
    print("Kaydedildi: vqwen_training_meta.json")
    print("VQWEN v3 top league egitimi tamamlandi.")
# Script entry point: run the full training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train_vqwen_v3()