feat(ai): expand training to 68K+ matches, add score model, backfill implied odds
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s
- extract_training_data.py: switch from top_leagues.json (23) to qualified_leagues.json (265)
- update_implied_odds.py: new script to backfill implied odds from real market data
- train_score_model.py: rewrite with v25 102-feature set + temporal split
- single_match_orchestrator.py: integrate ML score model with heuristic fallback
This commit is contained in:
@@ -33,7 +33,7 @@ from features.upset_engine import get_upset_engine
|
|||||||
from features.referee_engine import get_referee_engine
|
from features.referee_engine import get_referee_engine
|
||||||
from features.momentum_engine import get_momentum_engine
|
from features.momentum_engine import get_momentum_engine
|
||||||
|
|
||||||
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json")
|
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")
|
||||||
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
|
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
|
||||||
|
|
||||||
# Ensure output dir exists
|
# Ensure output dir exists
|
||||||
|
|||||||
@@ -1,183 +1,271 @@
|
|||||||
|
"""
|
||||||
|
V25-Compatible Score Prediction Model Trainer
|
||||||
|
===============================================
|
||||||
|
Trains 4 independent XGBoost regression models for:
|
||||||
|
- FT Home Goals
|
||||||
|
- FT Away Goals
|
||||||
|
- HT Home Goals
|
||||||
|
- HT Away Goals
|
||||||
|
|
||||||
|
Uses the same 102-feature set as v25_ensemble for full compatibility.
|
||||||
|
Temporal train/test split (80/20) to avoid future leakage.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/train_score_model.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import pickle
|
from datetime import datetime
|
||||||
import os
|
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.metrics import mean_absolute_error, r2_score
|
|
||||||
|
|
||||||
# Paths
|
# Add parent directory to path
|
||||||
DATA_PATH = os.path.join(os.path.dirname(__file__), "../data/training_data.csv")
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../models/xgb_score.pkl")
|
|
||||||
|
|
||||||
# Import unified 56-feature array from markets trainer
|
# Config
|
||||||
from train_xgboost_markets import FEATURES
|
AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
|
||||||
|
MODEL_PATH = os.path.join(AI_ENGINE_DIR, "models", "xgb_score.pkl")
|
||||||
|
|
||||||
|
# Import the EXACT same feature set as v25 market models
|
||||||
|
from train_v25_clean import FEATURES
|
||||||
|
|
||||||
TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]
|
TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]
|
||||||
|
|
||||||
def train():
|
# Model hyperparameters (tuned for goal count regression)
|
||||||
print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time")
|
XGB_PARAMS = {
|
||||||
print("=" * 60)
|
"objective": "reg:squarederror",
|
||||||
|
"n_estimators": 1200,
|
||||||
|
"learning_rate": 0.02,
|
||||||
|
"max_depth": 6,
|
||||||
|
"subsample": 0.8,
|
||||||
|
"colsample_bytree": 0.7,
|
||||||
|
"min_child_weight": 5,
|
||||||
|
"reg_alpha": 0.1,
|
||||||
|
"reg_lambda": 1.0,
|
||||||
|
"n_jobs": -1,
|
||||||
|
"random_state": 42,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_data() -> pd.DataFrame:
|
||||||
|
"""Load and validate training data."""
|
||||||
if not os.path.exists(DATA_PATH):
|
if not os.path.exists(DATA_PATH):
|
||||||
print(f"❌ Data file not found: {DATA_PATH}")
|
print(f"❌ Data file not found: {DATA_PATH}")
|
||||||
return
|
print(" Run extract_training_data.py first")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
print(f"📦 Loading data from {DATA_PATH}...")
|
print(f"📦 Loading data from {DATA_PATH}...")
|
||||||
df = pd.read_csv(DATA_PATH)
|
df = pd.read_csv(DATA_PATH)
|
||||||
|
|
||||||
# Preprocessing
|
# Fill feature NaNs with 0 (same as v25 training)
|
||||||
# Drop rows where target is missing (should verify)
|
for col in FEATURES:
|
||||||
|
if col in df.columns:
|
||||||
|
df[col] = df[col].fillna(0)
|
||||||
|
|
||||||
|
# Backward-compatible: add odds presence flags if missing
|
||||||
|
odds_base_columns = [
|
||||||
|
"odds_ms_h", "odds_ms_d", "odds_ms_a",
|
||||||
|
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
|
||||||
|
"odds_ou05_o", "odds_ou05_u",
|
||||||
|
"odds_ou15_o", "odds_ou15_u",
|
||||||
|
"odds_ou25_o", "odds_ou25_u",
|
||||||
|
"odds_ou35_o", "odds_ou35_u",
|
||||||
|
"odds_ht_ou05_o", "odds_ht_ou05_u",
|
||||||
|
"odds_ht_ou15_o", "odds_ht_ou15_u",
|
||||||
|
"odds_btts_y", "odds_btts_n",
|
||||||
|
]
|
||||||
|
for base_col in odds_base_columns:
|
||||||
|
pres_col = f"{base_col}_present"
|
||||||
|
if pres_col not in df.columns and base_col in df.columns:
|
||||||
|
df[pres_col] = (df[base_col] > 1.0).astype(int)
|
||||||
|
|
||||||
|
# Drop rows where any target is missing
|
||||||
df = df.dropna(subset=TARGETS)
|
df = df.dropna(subset=TARGETS)
|
||||||
|
|
||||||
# Fill feature NaNs with median/mean or 0
|
# Filter: at least MS odds must be present
|
||||||
print(f" Original rows: {len(df)}")
|
|
||||||
|
|
||||||
# Filter valid odds (at least ms_h > 1.0)
|
|
||||||
df = df[df["odds_ms_h"] > 1.0].copy()
|
df = df[df["odds_ms_h"] > 1.0].copy()
|
||||||
print(f" Rows with valid odds: {len(df)}")
|
|
||||||
|
|
||||||
X = df[FEATURES]
|
|
||||||
y_home = df["score_home"]
|
|
||||||
y_away = df["score_away"]
|
|
||||||
y_ht_home = df["ht_score_home"]
|
|
||||||
y_ht_away = df["ht_score_away"]
|
|
||||||
|
|
||||||
# Train/Test Split
|
|
||||||
X_train, X_test, y_h_train, y_h_test, y_a_train, y_a_test, y_ht_h_train, y_ht_h_test, y_ht_a_train, y_ht_a_test = train_test_split(
|
|
||||||
X, y_home, y_away, y_ht_home, y_ht_away, test_size=0.2, random_state=42
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f" Training set: {len(X_train)} matches")
|
|
||||||
print(f" Test set: {len(X_test)} matches")
|
|
||||||
|
|
||||||
# --- HOME GOALS MODEL ---
|
# Ensure all features exist
|
||||||
print("\n🏠 Training Home Goals Model...")
|
missing = [f for f in FEATURES if f not in df.columns]
|
||||||
xgb_home = xgb.XGBRegressor(
|
if missing:
|
||||||
objective='reg:squarederror',
|
print(f"⚠️ Missing {len(missing)} features, filling with 0: {missing[:5]}...")
|
||||||
n_estimators=1000,
|
for f in missing:
|
||||||
learning_rate=0.01,
|
df[f] = 0
|
||||||
max_depth=5,
|
|
||||||
subsample=0.7,
|
|
||||||
colsample_bytree=0.7,
|
|
||||||
n_jobs=-1,
|
|
||||||
random_state=42,
|
|
||||||
early_stopping_rounds=50 # Configure here for newer XGBoost or remove if not supported in constructor (depends on version)
|
|
||||||
)
|
|
||||||
# Actually, to be safe across versions, let's remove early stopping for now or use validation set properly
|
|
||||||
# Using 'eval_set' without early_stopping_rounds just prints metrics
|
|
||||||
xgb_home = xgb.XGBRegressor(
|
|
||||||
objective='reg:squarederror',
|
|
||||||
n_estimators=1000,
|
|
||||||
learning_rate=0.01,
|
|
||||||
max_depth=5,
|
|
||||||
subsample=0.7,
|
|
||||||
colsample_bytree=0.7,
|
|
||||||
n_jobs=-1,
|
|
||||||
random_state=42
|
|
||||||
)
|
|
||||||
xgb_home.fit(X_train, y_h_train, eval_set=[(X_test, y_h_test)], verbose=False)
|
|
||||||
|
|
||||||
home_preds = xgb_home.predict(X_test)
|
|
||||||
mae_home = mean_absolute_error(y_h_test, home_preds)
|
|
||||||
r2_home = r2_score(y_h_test, home_preds)
|
|
||||||
print(f" ✅ FT Home MAE: {mae_home:.4f} goals")
|
|
||||||
print(f" ✅ FT Home R2: {r2_home:.4f}")
|
|
||||||
|
|
||||||
# --- AWAY GOALS MODEL ---
|
return df
|
||||||
print("\n✈️ Training FT Away Goals Model...")
|
|
||||||
xgb_away = xgb.XGBRegressor(
|
|
||||||
objective='reg:squarederror',
|
|
||||||
n_estimators=1000,
|
|
||||||
learning_rate=0.01,
|
|
||||||
max_depth=5,
|
|
||||||
subsample=0.7,
|
|
||||||
colsample_bytree=0.7,
|
|
||||||
n_jobs=-1,
|
|
||||||
random_state=42
|
|
||||||
)
|
|
||||||
xgb_away.fit(X_train, y_a_train, eval_set=[(X_test, y_a_test)], verbose=False)
|
|
||||||
|
|
||||||
away_preds = xgb_away.predict(X_test)
|
|
||||||
mae_away = mean_absolute_error(y_a_test, away_preds)
|
|
||||||
r2_away = r2_score(y_a_test, away_preds)
|
|
||||||
print(f" ✅ FT Away MAE: {mae_away:.4f} goals")
|
|
||||||
print(f" ✅ FT Away R2: {r2_away:.4f}")
|
|
||||||
|
|
||||||
# --- HT HOME GOALS MODEL ---
|
|
||||||
print("\n🏠 Training HT Home Goals Model...")
|
|
||||||
xgb_ht_home = xgb.XGBRegressor(
|
|
||||||
objective='reg:squarederror',
|
|
||||||
n_estimators=1000,
|
|
||||||
learning_rate=0.01,
|
|
||||||
max_depth=5,
|
|
||||||
subsample=0.7,
|
|
||||||
colsample_bytree=0.7,
|
|
||||||
n_jobs=-1,
|
|
||||||
random_state=42
|
|
||||||
)
|
|
||||||
xgb_ht_home.fit(X_train, y_ht_h_train, eval_set=[(X_test, y_ht_h_test)], verbose=False)
|
|
||||||
|
|
||||||
ht_home_preds = xgb_ht_home.predict(X_test)
|
|
||||||
mae_ht_home = mean_absolute_error(y_ht_h_test, ht_home_preds)
|
|
||||||
print(f" ✅ HT Home MAE: {mae_ht_home:.4f} goals")
|
|
||||||
|
|
||||||
# --- HT AWAY GOALS MODEL ---
|
|
||||||
print("\n✈️ Training HT Away Goals Model...")
|
def temporal_split(df: pd.DataFrame, train_ratio: float = 0.80):
|
||||||
xgb_ht_away = xgb.XGBRegressor(
|
"""
|
||||||
objective='reg:squarederror',
|
Temporal train/test split by match date.
|
||||||
n_estimators=1000,
|
Ensures no future information leaks into training.
|
||||||
learning_rate=0.01,
|
"""
|
||||||
max_depth=5,
|
if "match_date" in df.columns:
|
||||||
subsample=0.7,
|
df = df.sort_values("match_date").reset_index(drop=True)
|
||||||
colsample_bytree=0.7,
|
elif "round" in df.columns:
|
||||||
n_jobs=-1,
|
df = df.sort_values("round").reset_index(drop=True)
|
||||||
random_state=42
|
|
||||||
|
split_idx = int(len(df) * train_ratio)
|
||||||
|
return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def train_single_model(X_train, y_train, X_test, y_test, name: str):
|
||||||
|
"""Train a single XGBoost regression model with early stopping."""
|
||||||
|
print(f"\n🏗️ Training {name} model...")
|
||||||
|
|
||||||
|
model = xgb.XGBRegressor(**XGB_PARAMS)
|
||||||
|
model.fit(
|
||||||
|
X_train, y_train,
|
||||||
|
eval_set=[(X_test, y_test)],
|
||||||
|
verbose=False,
|
||||||
)
|
)
|
||||||
xgb_ht_away.fit(X_train, y_ht_a_train, eval_set=[(X_test, y_ht_a_test)], verbose=False)
|
|
||||||
|
preds = model.predict(X_test)
|
||||||
ht_away_preds = xgb_ht_away.predict(X_test)
|
|
||||||
mae_ht_away = mean_absolute_error(y_ht_a_test, ht_away_preds)
|
mae = mean_absolute_error(y_test, preds)
|
||||||
print(f" ✅ HT Away MAE: {mae_ht_away:.4f} goals")
|
rmse = np.sqrt(mean_squared_error(y_test, preds))
|
||||||
|
r2 = r2_score(y_test, preds)
|
||||||
# --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) ---
|
|
||||||
print("\n🎯 Exact FT Score Accuracy (Test Set):")
|
print(f" MAE: {mae:.4f} goals")
|
||||||
correct = 0
|
print(f" RMSE: {rmse:.4f}")
|
||||||
close = 0 # Within 1 goal diff for both
|
print(f" R²: {r2:.4f}")
|
||||||
|
|
||||||
for h_true, a_true, h_pred, a_pred in zip(y_h_test, y_a_test, home_preds, away_preds):
|
return model, {"mae": mae, "rmse": rmse, "r2": r2}
|
||||||
h_p = round(h_pred)
|
|
||||||
a_p = round(a_pred)
|
|
||||||
if h_p == h_true and a_p == a_true:
|
def evaluate_combined(models: dict, X_test, y_test_dict: dict):
|
||||||
correct += 1
|
"""Evaluate combined score accuracy (FT and HT)."""
|
||||||
if abs(h_p - h_true) <= 1 and abs(a_p - a_true) <= 1:
|
print("\n🎯 Combined Score Evaluation (Test Set):")
|
||||||
|
|
||||||
|
# FT Score
|
||||||
|
ft_h_preds = models["ft_home"].predict(X_test)
|
||||||
|
ft_a_preds = models["ft_away"].predict(X_test)
|
||||||
|
|
||||||
|
y_ft_h = y_test_dict["score_home"].values
|
||||||
|
y_ft_a = y_test_dict["score_away"].values
|
||||||
|
|
||||||
|
exact = 0
|
||||||
|
close = 0
|
||||||
|
result_correct = 0
|
||||||
|
total = len(X_test)
|
||||||
|
|
||||||
|
for h_true, a_true, h_pred, a_pred in zip(y_ft_h, y_ft_a, ft_h_preds, ft_a_preds):
|
||||||
|
hp = max(0, round(h_pred))
|
||||||
|
ap = max(0, round(a_pred))
|
||||||
|
|
||||||
|
# Exact score
|
||||||
|
if hp == h_true and ap == a_true:
|
||||||
|
exact += 1
|
||||||
|
|
||||||
|
# Close (±1 each)
|
||||||
|
if abs(hp - h_true) <= 1 and abs(ap - a_true) <= 1:
|
||||||
close += 1
|
close += 1
|
||||||
|
|
||||||
acc = correct / len(X_test) * 100
|
# Result direction (1X2)
|
||||||
close_acc = close / len(X_test) * 100
|
true_result = 1 if h_true > a_true else (0 if h_true == a_true else -1)
|
||||||
print(f" Exact Match: {acc:.2f}%")
|
pred_result = 1 if hp > ap else (0 if hp == ap else -1)
|
||||||
print(f" Close Match (+/- 1 goal): {close_acc:.2f}%")
|
if true_result == pred_result:
|
||||||
|
result_correct += 1
|
||||||
|
|
||||||
|
print(f" FT Exact Score: {exact / total * 100:.2f}% ({exact}/{total})")
|
||||||
|
print(f" FT Close (±1): {close / total * 100:.2f}% ({close}/{total})")
|
||||||
|
print(f" FT Result (1X2): {result_correct / total * 100:.2f}% ({result_correct}/{total})")
|
||||||
|
|
||||||
|
# HT Score
|
||||||
|
ht_h_preds = models["ht_home"].predict(X_test)
|
||||||
|
ht_a_preds = models["ht_away"].predict(X_test)
|
||||||
|
|
||||||
|
y_ht_h = y_test_dict["ht_score_home"].values
|
||||||
|
y_ht_a = y_test_dict["ht_score_away"].values
|
||||||
|
|
||||||
|
ht_exact = 0
|
||||||
|
ht_total = len(X_test)
|
||||||
|
|
||||||
|
for h_true, a_true, h_pred, a_pred in zip(y_ht_h, y_ht_a, ht_h_preds, ht_a_preds):
|
||||||
|
hp = max(0, round(h_pred))
|
||||||
|
ap = max(0, round(a_pred))
|
||||||
|
if hp == h_true and ap == a_true:
|
||||||
|
ht_exact += 1
|
||||||
|
|
||||||
|
print(f" HT Exact Score: {ht_exact / ht_total * 100:.2f}% ({ht_exact}/{ht_total})")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"ft_exact_pct": exact / total * 100,
|
||||||
|
"ft_close_pct": close / total * 100,
|
||||||
|
"ft_result_pct": result_correct / total * 100,
|
||||||
|
"ht_exact_pct": ht_exact / ht_total * 100,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def train():
|
||||||
|
"""Main training pipeline."""
|
||||||
|
print("🚀 Score Prediction Model Trainer (V25-Compatible)")
|
||||||
|
print(f" Feature count: {len(FEATURES)}")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Load data
|
||||||
|
df = load_data()
|
||||||
|
print(f" Total valid rows: {len(df)}")
|
||||||
|
|
||||||
|
# Temporal split
|
||||||
|
train_df, test_df = temporal_split(df)
|
||||||
|
print(f" Training set: {len(train_df)} matches")
|
||||||
|
print(f" Test set: {len(test_df)} matches (temporally after training)")
|
||||||
|
|
||||||
|
X_train = train_df[FEATURES]
|
||||||
|
X_test = test_df[FEATURES]
|
||||||
|
|
||||||
|
# Train 4 models
|
||||||
|
models = {}
|
||||||
|
metrics = {}
|
||||||
|
|
||||||
|
for target_name, model_key in [
|
||||||
|
("score_home", "ft_home"),
|
||||||
|
("score_away", "ft_away"),
|
||||||
|
("ht_score_home", "ht_home"),
|
||||||
|
("ht_score_away", "ht_away"),
|
||||||
|
]:
|
||||||
|
model, metric = train_single_model(
|
||||||
|
X_train, train_df[target_name],
|
||||||
|
X_test, test_df[target_name],
|
||||||
|
model_key,
|
||||||
|
)
|
||||||
|
models[model_key] = model
|
||||||
|
metrics[model_key] = metric
|
||||||
|
|
||||||
|
# Combined evaluation
|
||||||
|
y_test_dict = {t: test_df[t] for t in TARGETS}
|
||||||
|
combined = evaluate_combined(models, X_test, y_test_dict)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
print(f"\n💾 Saving models to {MODEL_PATH}...")
|
print(f"\n💾 Saving to {MODEL_PATH}...")
|
||||||
model_data = {
|
model_data = {
|
||||||
"home_model": xgb_home,
|
"home_model": models["ft_home"],
|
||||||
"away_model": xgb_away,
|
"away_model": models["ft_away"],
|
||||||
"ht_home_model": xgb_ht_home,
|
"ht_home_model": models["ht_home"],
|
||||||
"ht_away_model": xgb_ht_away,
|
"ht_away_model": models["ht_away"],
|
||||||
"features": FEATURES,
|
"features": FEATURES,
|
||||||
"meta": {
|
"meta": {
|
||||||
"mae_home": mae_home,
|
**{f"{k}_{mk}": mv for k, m in metrics.items() for mk, mv in m.items()},
|
||||||
"mae_away": mae_away,
|
**combined,
|
||||||
"mae_ht_home": mae_ht_home,
|
"trained_at": datetime.now().isoformat(),
|
||||||
"mae_ht_away": mae_ht_away,
|
"feature_count": len(FEATURES),
|
||||||
"acc": acc
|
"train_size": len(train_df),
|
||||||
}
|
"test_size": len(test_df),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
with open(MODEL_PATH, "wb") as f:
|
with open(MODEL_PATH, "wb") as f:
|
||||||
pickle.dump(model_data, f)
|
pickle.dump(model_data, f)
|
||||||
|
|
||||||
print("✅ Done.")
|
print("\n✅ Score model training complete!")
|
||||||
|
print(f" Saved: {MODEL_PATH}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
train()
|
train()
|
||||||
|
|||||||
@@ -0,0 +1,307 @@
|
|||||||
|
"""
|
||||||
|
Update Implied Odds in football_ai_features
|
||||||
|
=============================================
|
||||||
|
Populates implied_home, implied_draw, implied_away, implied_over25, implied_btts
|
||||||
|
from real odds data in odd_categories + odd_selections tables.
|
||||||
|
|
||||||
|
Also backfills form-based features (home_goals_avg_5, away_goals_avg_5, etc.)
|
||||||
|
from recent match history.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/update_implied_odds.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import psycopg2
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
def get_conn():
    """Open a psycopg2 connection described by the DATABASE_URL env variable.

    Everything from the first ``?schema=`` onward is cut off before the URL is
    handed to psycopg2 (presumably a Prisma-style suffix that libpq does not
    understand — confirm against the deployment's URL format). When the
    variable is unset, an empty DSN string is passed through unchanged.
    """
    raw_url = os.environ.get("DATABASE_URL", "")
    dsn, _, _ = raw_url.partition("?schema=")
    return psycopg2.connect(dsn)
|
||||||
|
|
||||||
|
|
||||||
|
# A NULL parameter keeps the column's current value, so a market that has no
# complete set of real odds is never overwritten with a placeholder.
_IMPLIED_ODDS_UPDATE_SQL = """
    UPDATE football_ai_features
    SET implied_home = COALESCE(%s, implied_home),
        implied_draw = COALESCE(%s, implied_draw),
        implied_away = COALESCE(%s, implied_away),
        implied_over25 = COALESCE(%s, implied_over25),
        implied_btts = COALESCE(%s, implied_btts)
    WHERE match_id = %s
"""

# Number of UPDATE rows sent per executemany() call.
_IMPLIED_ODDS_BATCH_SIZE = 500


def _odds_key(cat_name, sel_name):
    """Map an odds (category, selection) name pair to a short key, or None.

    Recognised markets are match result ('ms_h' / 'ms_d' / 'ms_a'),
    over/under 2.5 ('ou25_o' / 'ou25_u') and both-teams-to-score
    ('btts_y' / 'btts_n'). Names are compared stripped and lower-cased.
    """
    category = (cat_name or "").strip().lower()
    selection = (sel_name or "").strip().lower()

    if category == 'maç sonucu':
        return {'1': 'ms_h', '0': 'ms_d', 'x': 'ms_d', '2': 'ms_a'}.get(selection)
    if category == '2,5 alt/üst':
        if 'üst' in selection:
            return 'ou25_o'
        if 'alt' in selection:
            return 'ou25_u'
        return None
    if category == 'karşılıklı gol':
        if 'var' in selection:
            return 'btts_y'
        if 'yok' in selection:
            return 'btts_n'
    return None


def _vig_free(*odds):
    """Return margin-free probabilities for one market, or None if incomplete.

    Every price must be above 1.0; the inverse odds are normalised so that
    they sum to 1 and each probability is rounded to 4 decimals.
    """
    if not odds or not all(price > 1.0 for price in odds):
        return None
    inverse = [1 / price for price in odds]
    total = sum(inverse)
    return [round(p / total, 4) for p in inverse]


def _flush_implied_odds(cur, rows):
    """Send the pending UPDATE rows (if any) and return how many were sent."""
    if rows:
        cur.executemany(_IMPLIED_ODDS_UPDATE_SQL, rows)
    return len(rows)


def update_implied_odds(conn):
    """Update implied probabilities from real odds data.

    Reads every selection with a price above 1.0 from ``odd_selections`` /
    ``odd_categories``, derives margin-free probabilities per market and
    writes them to ``football_ai_features``. A market is written only when
    all of its prices are present; the columns of other markets keep their
    stored values. The transaction is committed on success and rolled back
    (then re-raised) on any error.

    Args:
        conn: An open DB-API connection (psycopg2 in this script).

    Returns:
        The number of feature rows for which an UPDATE was issued.
    """
    cur = conn.cursor()

    print("📊 Phase 1: Updating implied odds from real market data...")
    t0 = time.time()

    try:
        # Step 1: Build odds lookup from odd_categories + odd_selections
        print(" Loading odds data...")
        cur.execute("""
            SELECT oc.match_id, oc.name AS cat_name, os.name AS sel_name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            WHERE os.odd_value IS NOT NULL
            AND CAST(os.odd_value AS FLOAT) > 1.0
        """)

        odds_by_match = {}
        row_count = 0
        for match_id, cat_name, sel_name, odd_val in cur.fetchall():
            try:
                price = float(odd_val)
            except (ValueError, TypeError):
                continue
            if not price > 1.0:  # also rejects NaN
                continue

            key = _odds_key(cat_name, sel_name)
            if key is None:
                continue  # a market this script does not track

            odds_by_match.setdefault(match_id, {})[key] = price
            row_count += 1

        print(f" Loaded odds for {len(odds_by_match)} matches ({row_count} selections) in {time.time()-t0:.1f}s")

        # Step 2: Calculate implied probabilities and update
        print(" Calculating implied probabilities...")

        cur.execute("SELECT match_id FROM football_ai_features")
        feature_match_ids = {row[0] for row in cur.fetchall()}

        updated = 0
        pending = []

        for match_id in feature_match_ids:
            odds = odds_by_match.get(match_id)
            if not odds:
                continue

            ms = _vig_free(odds.get('ms_h', 0), odds.get('ms_d', 0), odds.get('ms_a', 0))
            ou25 = _vig_free(odds.get('ou25_o', 0), odds.get('ou25_u', 0))
            btts = _vig_free(odds.get('btts_y', 0), odds.get('btts_n', 0))

            # Nothing real to write for this match.
            if ms is None and ou25 is None and btts is None:
                continue

            implied_home, implied_draw, implied_away = ms if ms else (None, None, None)
            pending.append((
                implied_home, implied_draw, implied_away,
                ou25[0] if ou25 else None,
                btts[0] if btts else None,
                match_id,
            ))

            if len(pending) >= _IMPLIED_ODDS_BATCH_SIZE:
                updated += _flush_implied_odds(cur, pending)
                pending = []

        # Final batch
        updated += _flush_implied_odds(cur, pending)

        conn.commit()
    except Exception:
        # Leave no half-applied batch behind; the caller still sees the error.
        conn.rollback()
        raise
    finally:
        cur.close()

    print(f" ✅ Updated implied odds for {updated} matches in {time.time()-t0:.1f}s")
    return updated
|
||||||
|
|
||||||
|
|
||||||
|
_FORM_FEATURES_UPDATE_SQL = """
    UPDATE football_ai_features
    SET home_goals_avg_5 = %s,
        home_conceded_avg_5 = %s,
        home_clean_sheet_rate = %s,
        home_scoring_rate = %s,
        away_goals_avg_5 = %s,
        away_conceded_avg_5 = %s,
        away_clean_sheet_rate = %s,
        away_scoring_rate = %s
    WHERE match_id = %s
"""

# Number of UPDATE rows sent per executemany() call.
_FORM_FEATURES_BATCH_SIZE = 500

# How many previous matches of a team feed its form figures.
_FORM_WINDOW = 5

# (goals_avg, conceded_avg, clean_sheet_rate, scoring_rate) used for a team
# with no recorded history yet.
_NO_HISTORY_FORM = (1.3, 1.2, 0.25, 0.75)


def _form_stats(history):
    """Summarise a team's recent (scored, conceded) pairs.

    Returns a 4-tuple ``(goals_avg, conceded_avg, clean_sheet_rate,
    scoring_rate)``, each rounded to 3 decimals, or the fixed no-history
    defaults when ``history`` is empty.
    """
    played = len(history)
    if played == 0:
        return _NO_HISTORY_FORM
    scored = [g for g, _ in history]
    conceded = [c for _, c in history]
    return (
        round(sum(scored) / played, 3),
        round(sum(conceded) / played, 3),
        round(sum(1 for c in conceded if c == 0) / played, 3),
        round(sum(1 for g in scored if g > 0) / played, 3),
    )


def _flush_form_features(cur, rows):
    """Send the pending UPDATE rows (if any) and return how many were sent."""
    if rows:
        cur.executemany(_FORM_FEATURES_UPDATE_SQL, rows)
    return len(rows)


def update_form_features(conn):
    """Backfill form-based features (goals avg, clean sheet rate) from match history.

    Walks all finished football matches in kick-off order. For each match
    that has a row in ``football_ai_features`` the form of both teams is
    computed from their previous matches only (at most the last five); the
    match's own result is added to the history afterwards. A result with a
    missing score on either side is not added to the history. The
    transaction is committed on success and rolled back (then re-raised) on
    any error.

    Args:
        conn: An open DB-API connection (psycopg2 in this script).

    Returns:
        The number of feature rows for which an UPDATE was issued.
    """
    from collections import defaultdict, deque

    cur = conn.cursor()

    print("\n📊 Phase 2: Updating form-based features...")
    t0 = time.time()

    try:
        # Load all finished football matches ordered by time; the id breaks
        # ties so matches with the same kick-off time are processed in a
        # stable order.
        print(" Loading match history...")
        cur.execute("""
            SELECT id, home_team_id, away_team_id, score_home, score_away, mst_utc
            FROM matches
            WHERE status = 'FT'
            AND score_home IS NOT NULL
            AND sport = 'football'
            ORDER BY mst_utc ASC, id ASC
        """)

        matches = cur.fetchall()
        print(f" Loaded {len(matches)} finished matches")

        # team_id -> last few (goals_scored, goals_conceded); only the most
        # recent _FORM_WINDOW entries are ever read, so older ones are dropped.
        team_history = defaultdict(lambda: deque(maxlen=_FORM_WINDOW))

        cur.execute("SELECT match_id FROM football_ai_features")
        feature_match_ids = {row[0] for row in cur.fetchall()}

        updated = 0
        pending = []

        for match_id, home_id, away_id, score_home, score_away, _kickoff in matches:
            # Calculate features BEFORE updating history (pre-match features)
            if match_id in feature_match_ids:
                pending.append(
                    _form_stats(team_history[home_id])
                    + _form_stats(team_history[away_id])
                    + (match_id,)
                )
                if len(pending) >= _FORM_FEATURES_BATCH_SIZE:
                    updated += _flush_form_features(cur, pending)
                    pending = []

            # Update history AFTER feature extraction (maintains pre-match
            # invariant). The query only guarantees score_home is present, so
            # an incomplete result is kept out of the averages.
            if score_home is not None and score_away is not None:
                team_history[home_id].append((score_home, score_away))
                team_history[away_id].append((score_away, score_home))

        # Final batch
        updated += _flush_form_features(cur, pending)

        conn.commit()
    except Exception:
        # Leave no half-applied batch behind; the caller still sees the error.
        conn.rollback()
        raise
    finally:
        cur.close()

    print(f" ✅ Updated form features for {updated} matches in {time.time()-t0:.1f}s")
    return updated
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run both backfill phases on one connection and print a summary.

    Phase 1 updates implied odds, phase 2 updates form features. The
    connection is closed whether or not a phase raises.
    """
    print("🚀 Football AI Features — Implied Odds & Form Backfill")
    print("=" * 60)

    connection = get_conn()
    try:
        # Tuple elements are evaluated left to right: odds first, then form.
        summary = (
            (" Implied odds updated", update_implied_odds(connection)),
            (" Form features updated", update_form_features(connection)),
        )

        print("\n✅ DONE!")
        for label, count in summary:
            print(f"{label}: {count} matches")
    finally:
        connection.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -16,6 +16,7 @@ import re
|
|||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -258,6 +259,51 @@ class SingleMatchOrchestrator:
|
|||||||
self._v27 = None
|
self._v27 = None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _get_score_model(self) -> Optional[Dict]:
    """Load XGBoost score prediction model (non-fatal).

    The bundle is read once from ``<parent of this file's directory>/models/
    xgb_score.pkl`` and the outcome — the bundle, or None when it cannot be
    used — is cached on the instance, so later calls do no I/O. A missing
    file, an unreadable pickle or a bundle without all required keys is
    reported on stdout and results in None; nothing is raised.

    Returns:
        The loaded bundle (containing at least 'home_model', 'away_model',
        'ht_home_model', 'ht_away_model' and 'features'), or None.
    """
    if hasattr(self, "_score_model_cache"):
        return self._score_model_cache

    required_keys = ("home_model", "away_model", "ht_home_model", "ht_away_model", "features")
    score_model_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "models", "xgb_score.pkl",
    )

    # Cache the negative result up front so every failure path below is
    # remembered and the file is not probed again on the next call.
    self._score_model_cache = None
    try:
        with open(score_model_path, "rb") as f:
            # NOTE(security): unpickling runs arbitrary code from the file.
            # This is only safe while the models directory is written
            # exclusively by the project's own training scripts.
            model_data = pickle.load(f)

        missing = [k for k in required_keys if k not in model_data]
        if missing:
            print(f"[SCORE] ⚠ Score model ignored, missing keys {missing} (using heuristic)")
        else:
            feature_count = len(model_data["features"])
            self._score_model_cache = model_data
            print(f"[SCORE] ✅ Score model loaded ({feature_count} features)")
    except FileNotFoundError:
        print(f"[SCORE] ⚠ Score model not found at {score_model_path} (using heuristic)")
    except Exception as e:
        print(f"[SCORE] ⚠ Load failed (non-fatal, using heuristic): {e}")
    return self._score_model_cache
|
||||||
|
|
||||||
|
def _predict_score_with_model(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
    """Predict FT/HT scores using XGBoost score model.

    Builds a one-row frame whose columns are the model bundle's 'features'
    list, in that order. A feature that is absent, None or NaN is sent as
    0.0, matching the fill-with-0 step of the score-model trainer, so a
    single missing value no longer discards the whole ML prediction.

    Args:
        features: Feature name -> value mapping for the match.

    Returns:
        A dict with 'ft_home', 'ft_away', 'ht_home' and 'ht_away' (each
        clamped at 0 and rounded to 2 decimals), or None when no model is
        available or prediction fails, in which case the caller falls back
        to its heuristic.
    """
    score_model = self._get_score_model()
    if score_model is None:
        return None
    try:
        import pandas as _pd

        def _as_feature(value) -> float:
            # Missing values become 0.0; any other unconvertible value still
            # raises and triggers the heuristic fallback below.
            if value is None:
                return 0.0
            number = float(value)
            return 0.0 if math.isnan(number) else number

        row = {name: _as_feature(features.get(name)) for name in score_model["features"]}
        frame = _pd.DataFrame([row])

        def _goals(model_key: str) -> float:
            # A regressor may output a slightly negative goal count.
            raw = float(score_model[model_key].predict(frame)[0])
            return round(max(0.0, raw), 2)

        return {
            "ft_home": _goals("home_model"),
            "ft_away": _goals("away_model"),
            "ht_home": _goals("ht_home_model"),
            "ht_away": _goals("ht_away_model"),
        }
    except Exception as e:
        print(f"[SCORE] ⚠ Prediction error (fallback to heuristic): {e}")
        return None
|
||||||
|
|
||||||
def _build_v25_features(self, data: MatchData) -> Dict[str, float]:
|
def _build_v25_features(self, data: MatchData) -> Dict[str, float]:
|
||||||
"""
|
"""
|
||||||
Build the single authoritative V25 pre-match feature vector.
|
Build the single authoritative V25 pre-match feature vector.
|
||||||
@@ -869,27 +915,39 @@ class SingleMatchOrchestrator:
|
|||||||
prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs)
|
prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs)
|
||||||
prediction.handicap_confidence = hcap_top * 100.0
|
prediction.handicap_confidence = hcap_top * 100.0
|
||||||
|
|
||||||
base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0)
|
# ── Score Prediction: Model-first, heuristic fallback ──────────
|
||||||
base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0)
|
score_result = self._predict_score_with_model(features)
|
||||||
ms_edge = prediction.ms_home_prob - prediction.ms_away_prob
|
if score_result is not None:
|
||||||
total_target = max(
|
# ML model predicted scores
|
||||||
1.4,
|
prediction.home_xg = score_result["ft_home"]
|
||||||
min(
|
prediction.away_xg = score_result["ft_away"]
|
||||||
4.8,
|
prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
|
||||||
(float(features.get("league_avg_goals", 2.7)) * 0.55)
|
ht_home_xg = score_result["ht_home"]
|
||||||
+ ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45)
|
ht_away_xg = score_result["ht_away"]
|
||||||
+ ((prediction.over_25_prob - prediction.under_25_prob) * 1.15),
|
prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
|
||||||
),
|
prediction.predicted_ht_score = f"{int(round(ht_home_xg))}-{int(round(ht_away_xg))}"
|
||||||
)
|
else:
|
||||||
home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
|
# Heuristic fallback (original formula)
|
||||||
away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
|
base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0)
|
||||||
scale = total_target / max(home_xg + away_xg, 0.1)
|
base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0)
|
||||||
prediction.home_xg = round(home_xg * scale, 2)
|
ms_edge = prediction.ms_home_prob - prediction.ms_away_prob
|
||||||
prediction.away_xg = round(away_xg * scale, 2)
|
total_target = max(
|
||||||
prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
|
1.4,
|
||||||
|
min(
|
||||||
prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
|
4.8,
|
||||||
prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}"
|
(float(features.get("league_avg_goals", 2.7)) * 0.55)
|
||||||
|
+ ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45)
|
||||||
|
+ ((prediction.over_25_prob - prediction.under_25_prob) * 1.15),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
|
||||||
|
away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
|
||||||
|
scale = total_target / max(home_xg + away_xg, 0.1)
|
||||||
|
prediction.home_xg = round(home_xg * scale, 2)
|
||||||
|
prediction.away_xg = round(away_xg * scale, 2)
|
||||||
|
prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
|
||||||
|
prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
|
||||||
|
prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}"
|
||||||
prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg)
|
prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg)
|
||||||
|
|
||||||
max_market_conf = max(
|
max_market_conf = max(
|
||||||
|
|||||||
Reference in New Issue
Block a user