first (part 2: other directories)
Deploy Iddaai Backend / build-and-deploy (push) Failing after 18s

This commit is contained in:
2026-04-16 15:11:25 +03:00
parent 7814e0bc6b
commit 2f0b85a0c7
203 changed files with 59989 additions and 0 deletions
+451
View File
@@ -0,0 +1,451 @@
"""
V25 Model Trainer - NO TARGET LEAKAGE
=====================================
Training script for V25 ensemble model.
CRITICAL: This version removes total_goals and ht_total_goals features
to prevent target leakage. These features are only known AFTER the match ends.
Usage:
python scripts/train_v25_clean.py
"""
import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
from sklearn.metrics import accuracy_score, log_loss, classification_report
# Engine root: the directory one level above the folder holding this script.
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Put the engine root first on the import path so its packages can be imported
# when this file is executed directly as a script.
sys.path.insert(0, AI_ENGINE_DIR)

# Input CSV and output locations, all relative to the engine root.
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODELS_DIR = os.path.join(AI_ENGINE_DIR, "models", "v25")
REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports", "training_v25")

# Output directories are created eagerly at import time; existing ones are kept.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
# Model input columns - NO TARGET LEAKAGE.
# Only columns that are available BEFORE the match starts belong here.
# total_goals and ht_total_goals were removed on purpose: they are only known
# AFTER the match ends, so using them would leak the target.
# The order of this list is the column order fed to the models.
FEATURES = (
    # ELO features (8)
    [
        "home_overall_elo", "away_overall_elo", "elo_diff",
        "home_home_elo", "away_away_elo",
        "home_form_elo", "away_form_elo", "form_elo_diff",
    ]
    # Form features (12)
    + [
        "home_goals_avg", "home_conceded_avg",
        "away_goals_avg", "away_conceded_avg",
        "home_clean_sheet_rate", "away_clean_sheet_rate",
        "home_scoring_rate", "away_scoring_rate",
        "home_winning_streak", "away_winning_streak",
        "home_unbeaten_streak", "away_unbeaten_streak",
    ]
    # Head-to-head features (6)
    + [
        "h2h_total_matches", "h2h_home_win_rate", "h2h_draw_rate",
        "h2h_avg_goals", "h2h_btts_rate", "h2h_over25_rate",
    ]
    # Team stats features (8)
    + [
        "home_avg_possession", "away_avg_possession",
        "home_avg_shots_on_target", "away_avg_shots_on_target",
        "home_shot_conversion", "away_shot_conversion",
        "home_avg_corners", "away_avg_corners",
    ]
    # Odds values plus implied_* columns (23) - market wisdom
    + [
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "implied_home", "implied_draw", "implied_away",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    ]
    # Odds availability flags (20) - one "<odds column>_present" per odds value
    + [
        "odds_ms_h_present", "odds_ms_d_present", "odds_ms_a_present",
        "odds_ht_ms_h_present", "odds_ht_ms_d_present", "odds_ht_ms_a_present",
        "odds_ou05_o_present", "odds_ou05_u_present",
        "odds_ou15_o_present", "odds_ou15_u_present",
        "odds_ou25_o_present", "odds_ou25_u_present",
        "odds_ou35_o_present", "odds_ou35_u_present",
        "odds_ht_ou05_o_present", "odds_ht_ou05_u_present",
        "odds_ht_ou15_o_present", "odds_ht_ou15_u_present",
        "odds_btts_y_present", "odds_btts_n_present",
    ]
    # League features (4)
    + [
        "home_xga", "away_xga",
        "league_avg_goals", "league_zero_goal_rate",
    ]
    # Upset engine (4)
    + ["upset_atmosphere", "upset_motivation", "upset_fatigue", "upset_potential"]
    # Referee engine (5)
    + [
        "referee_home_bias", "referee_avg_goals", "referee_cards_total",
        "referee_avg_yellow", "referee_experience",
    ]
    # Momentum engine (3)
    + ["home_momentum_score", "away_momentum_score", "momentum_diff"]
    # Squad features (9)
    + [
        "home_squad_quality", "away_squad_quality", "squad_diff",
        "home_key_players", "away_key_players",
        "home_missing_impact", "away_missing_impact",
        "home_goals_form", "away_goals_form",
    ]
)

# Report the feature count once at import time.
print(f"[INFO] Total features: {len(FEATURES)}")
# One entry per market to train: the label column to read, the short market
# name used in log lines and model file names, and the number of classes.
MARKET_CONFIGS = [
    {"target": label_column, "name": market, "num_class": class_count}
    for label_column, market, class_count in (
        ("label_ms", "MS", 3),
        ("label_ou15", "OU15", 2),
        ("label_ou25", "OU25", 2),
        ("label_ou35", "OU35", 2),
        ("label_btts", "BTTS", 2),
        ("label_ht_result", "HT_RESULT", 3),
        ("label_ht_ou05", "HT_OU05", 2),
        ("label_ht_ou15", "HT_OU15", 2),
        ("label_ht_ft", "HTFT", 9),
        ("label_odd_even", "ODD_EVEN", 2),
        ("label_cards_ou45", "CARDS_OU45", 2),
        ("label_handicap_ms", "HANDICAP_MS", 3),
    )
]
def load_data(data_path=None, features=None):
    """Load the training CSV and prepare its feature columns.

    NaN values in every listed feature column that exists in the CSV are
    replaced with 0. For older CSVs that lack the ``*_present`` odds
    availability flags, each missing flag is derived from its odds column:
    1.0 when the odds value is greater than 1.01, otherwise 0.0. When the odds
    column itself is absent, the flag is set to 0.0 for every row.

    Args:
        data_path: Path of the CSV to read. Defaults to the module-level
            ``DATA_PATH``.
        features: Names of the feature columns to NaN-fill. Defaults to the
            module-level ``FEATURES``.

    Returns:
        pandas.DataFrame holding the CSV rows plus any derived flag columns.

    Exits the process with status 1 when the CSV file does not exist.
    """
    csv_path = DATA_PATH if data_path is None else data_path
    feature_cols = FEATURES if features is None else features

    if not os.path.exists(csv_path):
        print(f"[ERROR] Data file not found: {csv_path}")
        print("[INFO] Run extract_training_data.py first to generate training data")
        sys.exit(1)

    print(f"[INFO] Loading data from {csv_path}...")
    df = pd.read_csv(csv_path)

    # Fill NaN values; features missing from the CSV are simply skipped here.
    for col in feature_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Backward-compatible derivation for older CSVs without odds availability
    # flags. Each flag column is named "<odds column>_present".
    odds_columns = (
        "odds_ms_h", "odds_ms_d", "odds_ms_a",
        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
        "odds_ou05_o", "odds_ou05_u",
        "odds_ou15_o", "odds_ou15_u",
        "odds_ou25_o", "odds_ou25_u",
        "odds_ou35_o", "odds_ou35_u",
        "odds_ht_ou05_o", "odds_ht_ou05_u",
        "odds_ht_ou15_o", "odds_ht_ou15_u",
        "odds_btts_y", "odds_btts_n",
    )
    for odds_col in odds_columns:
        flag_col = f"{odds_col}_present"
        if flag_col in df.columns:
            # The CSV already carries the flag; trust it.
            continue
        if odds_col in df.columns:
            odds = pd.to_numeric(df[odds_col], errors="coerce").fillna(0)
            df[flag_col] = (odds > 1.01).astype(float)
        else:
            # No odds column at all. The previous df.get(odds_col, 0) returned
            # a scalar here and crashed on .fillna(); "absent" is the only
            # sensible flag value.
            df[flag_col] = 0.0

    print(f"[INFO] Shape: {df.shape}")
    print(f"[INFO] Columns: {list(df.columns)}")
    return df
def temporal_split(valid_df: pd.DataFrame):
    """Partition rows into train/validation/test sets in time order.

    Rows are sorted ascending by ``mst_utc``. The first ~70% become the
    training set, the rows up to the ~85% mark the validation set, and the
    remainder the test set. The cut points are clamped so that the training
    cut is at least 1 and the validation cut never exceeds ``n - 1``.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of independent copies, each
        indexed by position in the sorted frame.
    """
    timeline = valid_df.sort_values("mst_utc").reset_index(drop=True)
    row_count = len(timeline)

    first_cut = max(int(row_count * 0.70), 1)
    second_cut = min(max(int(row_count * 0.85), first_cut + 1), row_count - 1)

    windows = (
        slice(None, first_cut),
        slice(first_cut, second_cut),
        slice(second_cut, None),
    )
    train_df, val_df, test_df = (timeline.iloc[window].copy() for window in windows)
    return train_df, val_df, test_df
def train_xgboost_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit an XGBoost booster through the native ``xgb.train`` API.

    Uses a multi-class soft-probability objective when ``num_class`` is above
    2 and a binary logistic objective otherwise. Training runs for at most
    1000 rounds with early stopping after 50 rounds; the training and
    validation sets are both monitored, with validation listed last.

    Args:
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and integer labels.
        num_class: Number of target classes for the market.
        market_name: Market label, used only in log output.

    Returns:
        The trained ``xgb.Booster``.
    """
    print(f"\n[INFO] Training XGBoost for {market_name}...")

    multiclass = num_class > 2
    if multiclass:
        objective, eval_metric = "multi:softprob", "mlogloss"
    else:
        objective, eval_metric = "binary:logistic", "logloss"

    booster_params = {
        "objective": objective,
        "eval_metric": eval_metric,
        "max_depth": 6,
        "eta": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "gamma": 0.1,
        "n_jobs": 4,
        "random_state": 42,
    }
    if multiclass:
        # num_class is only supplied for the multi-class objective.
        booster_params["num_class"] = num_class

    train_matrix = xgb.DMatrix(X_train, label=y_train)
    val_matrix = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'val')]

    history = {}
    booster = xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        evals_result=history,
        verbose_eval=100,
    )

    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {booster.best_score:.4f}")
    return booster
def train_lightgbm_model(X_train, y_train, X_val, y_val, num_class=3, market_name="MS"):
    """Fit a LightGBM booster through the native ``lgb.train`` API.

    Uses the multiclass objective when ``num_class`` is above 2 and the binary
    objective otherwise. Training runs for at most 1000 rounds with an
    early-stopping callback (50 rounds) and progress logged every 100 rounds.

    Args:
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and integer labels.
        num_class: Number of target classes for the market.
        market_name: Market label, used only in log output.

    Returns:
        The trained ``lgb.Booster``.
    """
    print(f"\n[INFO] Training LightGBM for {market_name}...")

    multiclass = num_class > 2
    if multiclass:
        objective, metric = "multiclass", "multi_logloss"
    else:
        objective, metric = "binary", "binary_logloss"

    lgb_params = {
        "objective": objective,
        "metric": metric,
        "max_depth": 6,
        "learning_rate": 0.05,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "min_child_samples": 20,
        "n_jobs": 4,
        "random_state": 42,
        "verbose": -1,
    }
    if multiclass:
        # num_class is only supplied for the multiclass objective.
        lgb_params["num_class"] = num_class

    train_set = lgb.Dataset(X_train, label=y_train)
    # The validation set references the training set, as in the original call.
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    stop_early = lgb.early_stopping(stopping_rounds=50)
    log_progress = lgb.log_evaluation(period=100)
    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        valid_names=['train', 'val'],
        callbacks=[stop_early, log_progress],
    )

    best_val_score = booster.best_score['val'][lgb_params['metric']]
    print(f"[OK] Best iteration: {booster.best_iteration}")
    print(f"[OK] Best score: {best_val_score:.4f}")
    return booster
def evaluate_model(model, X_test, y_test, model_type='xgb', num_class=3):
    """Score a trained model on the held-out test set and print a report.

    Args:
        model: Trained booster. Treated as an XGBoost booster when
            ``model_type`` is ``'xgb'``, otherwise as a LightGBM booster.
        X_test: Test feature matrix.
        y_test: Integer class labels for the test rows.
        model_type: ``'xgb'`` or anything else for LightGBM.
        num_class: Accepted for call-site compatibility; the class count is
            taken from the width of the predicted probability matrix.

    Returns:
        Tuple ``(probs, acc, loss)``: the ``(n_samples, n_classes)``
        probability matrix, the accuracy and the log loss.
    """
    if model_type == 'xgb':
        dtest = xgb.DMatrix(X_test)
        # NOTE(review): no iteration_range is passed, whereas the LightGBM
        # branch pins best_iteration - confirm the asymmetry is intended.
        probs = model.predict(dtest)
    else:  # lgb
        probs = model.predict(X_test, num_iteration=model.best_iteration)

    if probs.ndim == 1:
        # Binary classification: expand P(class 1) into two columns.
        probs = np.column_stack([1 - probs, probs])

    preds = np.argmax(probs, axis=1)
    acc = accuracy_score(y_test, preds)

    # Pass the full label set explicitly. Without it log_loss infers labels
    # from y_test and raises ValueError when the test window is missing a
    # class (e.g. a rare outcome of a 9-class market).
    class_labels = np.arange(probs.shape[1])
    loss = log_loss(y_test, probs, labels=class_labels)

    print(f"\n[RESULTS] Test Results:")
    print(f" Accuracy: {acc:.4f}")
    print(f" Log Loss: {loss:.4f}")

    # Per-class metrics
    print("\n[REPORT] Classification Report:")
    print(classification_report(y_test, preds))
    return probs, acc, loss
def train_market(df, target_col, market_name, num_class=3):
    """Train, evaluate and save the XGBoost/LightGBM pair for one market.

    Rows with a missing or empty label are dropped, the remainder is split
    chronologically into train/validation/test, both models are trained and
    scored, an equal-weight probability average is scored as the ensemble,
    and both models are written to ``MODELS_DIR``.

    Args:
        df: Full training frame containing the feature columns, ``mst_utc``
            and the label column.
        target_col: Name of the label column for this market.
        market_name: Short market name used in log lines and file names.
        num_class: Number of classes of the target.

    Returns:
        Tuple ``(xgb_model, lgb_model, metrics)``. All three are ``None`` when
        fewer than 100 labelled rows are available.
    """
    print(f"\n{'='*60}")
    print(f"[MARKET] Training {market_name}")
    print(f"{'='*60}")

    # Filter valid rows
    valid_df = df[df[target_col].notna()].copy()
    valid_df = valid_df[valid_df[target_col].astype(str) != ""].copy()
    print(f"[INFO] Valid samples: {len(valid_df)}")
    if len(valid_df) < 100:
        print(f"[ERROR] Not enough data for {market_name}")
        # Keep the same arity as the success path: callers unpack three
        # values, and the old two-value return made that unpack fail.
        return None, None, None

    # Prepare features; columns missing from the frame are skipped.
    available_features = [f for f in FEATURES if f in valid_df.columns]
    print(f"[INFO] Available features: {len(available_features)}/{len(FEATURES)}")

    train_df, val_df, test_df = temporal_split(valid_df)
    X_train = train_df[available_features].values
    X_val = val_df[available_features].values
    X_test = test_df[available_features].values
    y_train = train_df[target_col].astype(int).values
    y_val = val_df[target_col].astype(int).values
    y_test = test_df[target_col].astype(int).values
    print(
        f"[INFO] Temporal split -> Train: {len(X_train)},"
        f" Val: {len(X_val)}, Test: {len(X_test)}"
    )
    print(
        f"[INFO] Time windows -> train_end={int(train_df['mst_utc'].max())},"
        f" val_end={int(val_df['mst_utc'].max())},"
        f" test_end={int(test_df['mst_utc'].max())}"
    )

    # Train XGBoost
    xgb_model = train_xgboost_model(X_train, y_train, X_val, y_val, num_class, market_name)
    # Train LightGBM
    lgb_model = train_lightgbm_model(X_train, y_train, X_val, y_val, num_class, market_name)

    # Evaluate
    print("\n[INFO] XGBoost Evaluation:")
    xgb_probs, xgb_acc, xgb_loss = evaluate_model(xgb_model, X_test, y_test, 'xgb', num_class)
    print("\n[INFO] LightGBM Evaluation:")
    lgb_probs, lgb_acc, lgb_loss = evaluate_model(lgb_model, X_test, y_test, 'lgb', num_class)

    # Ensemble evaluation: plain average of the two probability matrices.
    ensemble_probs = (xgb_probs + lgb_probs) / 2
    ensemble_preds = np.argmax(ensemble_probs, axis=1)
    ensemble_acc = accuracy_score(y_test, ensemble_preds)
    # Explicit labels keep log_loss from failing when the test window does
    # not contain every class.
    ensemble_loss = log_loss(
        y_test, ensemble_probs, labels=np.arange(ensemble_probs.shape[1])
    )
    print(f"\n[INFO] Ensemble Evaluation:")
    print(f" Accuracy: {ensemble_acc:.4f}")
    print(f" Log Loss: {ensemble_loss:.4f}")

    # Save models
    xgb_path = os.path.join(MODELS_DIR, f"xgb_v25_{market_name.lower()}.json")
    xgb_model.save_model(xgb_path)
    print(f"[OK] XGBoost saved: {xgb_path}")
    lgb_path = os.path.join(MODELS_DIR, f"lgb_v25_{market_name.lower()}.txt")
    lgb_model.save_model(lgb_path)
    print(f"[OK] LightGBM saved: {lgb_path}")

    metrics = {
        "samples": int(len(valid_df)),
        "features_used": available_features,
        "train_samples": int(len(X_train)),
        "val_samples": int(len(X_val)),
        "test_samples": int(len(X_test)),
        "xgb_accuracy": round(float(xgb_acc), 4),
        "xgb_logloss": round(float(xgb_loss), 4),
        "lgb_accuracy": round(float(lgb_acc), 4),
        "lgb_logloss": round(float(lgb_loss), 4),
        "ensemble_accuracy": round(float(ensemble_acc), 4),
        "ensemble_logloss": round(float(ensemble_loss), 4),
        "class_count": int(num_class),
    }
    return xgb_model, lgb_model, metrics
def main():
    """Run the full V25 training pipeline.

    Loads the training data, trains every market in ``MARKET_CONFIGS`` whose
    label column is present, then writes the feature list to
    ``MODELS_DIR/feature_cols.json`` and the per-market metrics to
    ``REPORTS_DIR/v25_market_metrics.json`` and prints a summary.
    A market skipped for lack of data is reported with ``null`` metrics.
    """
    print("="*60)
    print("V25 Model Training - NO TARGET LEAKAGE")
    print("="*60)
    print(f"[INFO] Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Load data
    df = load_data()
    target_cols = [col for col in df.columns if col.startswith('label_')]
    print(f"\n[INFO] Available targets: {target_cols}")

    results = {}
    reports = {
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "market_results": {},
    }
    for config in MARKET_CONFIGS:
        target = config["target"]
        market_name = config["name"]
        num_class = config["num_class"]
        if target not in df.columns:
            print(f"[SKIP] {market_name}: missing target column {target}")
            continue

        outcome = train_market(df, target, market_name, num_class=num_class)
        # train_market may bail out early with fewer than three values when a
        # market lacks data; pad with None so one thin market cannot abort
        # the whole run with an unpacking error.
        xgb_model, lgb_model, metrics = (tuple(outcome) + (None, None, None))[:3]

        results[market_name] = {
            'xgb': xgb_model is not None,
            'lgb': lgb_model is not None,
        }
        reports["market_results"][market_name] = metrics

    # Save feature list
    feature_path = os.path.join(MODELS_DIR, "feature_cols.json")
    with open(feature_path, 'w', encoding="utf-8") as f:
        json.dump(FEATURES, f, indent=2)
    print(f"\n[OK] Feature list saved: {feature_path}")

    report_path = os.path.join(REPORTS_DIR, "v25_market_metrics.json")
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(reports, f, indent=2)
    print(f"[OK] Metrics report saved: {report_path}")

    # Summary
    print("\n" + "="*60)
    print("[SUMMARY] Training Results")
    print("="*60)
    for market, status in results.items():
        print(f" {market}: XGB={status['xgb']}, LGB={status['lgb']}")
    print(f"\n[INFO] Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("[OK] V25 Training Complete!")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()