From 244d8f5366b556e5b5d43594bdbcf0adee1753cc Mon Sep 17 00:00:00 2001 From: Fahri Can Date: Tue, 5 May 2026 16:04:00 +0300 Subject: [PATCH] feat(ai): expand training to 68K+ matches, add score model, backfill implied odds - extract_training_data.py: switch from top_leagues.json (23) to qualified_leagues.json (265) - update_implied_odds.py: new script to backfill implied odds from real market data - train_score_model.py: rewrite with v25 102-feature set + temporal split - single_match_orchestrator.py: integrate ML score model with heuristic fallback --- ai-engine/scripts/extract_training_data.py | 2 +- ai-engine/scripts/train_score_model.py | 390 +++++++++++------- ai-engine/scripts/update_implied_odds.py | 307 ++++++++++++++ .../services/single_match_orchestrator.py | 100 ++++- 4 files changed, 626 insertions(+), 173 deletions(-) create mode 100644 ai-engine/scripts/update_implied_odds.py diff --git a/ai-engine/scripts/extract_training_data.py b/ai-engine/scripts/extract_training_data.py index 87dc015..4c92ca2 100755 --- a/ai-engine/scripts/extract_training_data.py +++ b/ai-engine/scripts/extract_training_data.py @@ -33,7 +33,7 @@ from features.upset_engine import get_upset_engine from features.referee_engine import get_referee_engine from features.momentum_engine import get_momentum_engine -TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json") +TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") # Ensure output dir exists diff --git a/ai-engine/scripts/train_score_model.py b/ai-engine/scripts/train_score_model.py index a0ee74a..c3790c8 100755 --- a/ai-engine/scripts/train_score_model.py +++ b/ai-engine/scripts/train_score_model.py @@ -1,183 +1,271 @@ +""" +V25-Compatible Score Prediction Model Trainer +=============================================== +Trains 4 independent XGBoost regression models for: + - FT Home Goals + - FT Away Goals + - HT Home Goals + - HT Away Goals +Uses the same 102-feature set as v25_ensemble for full compatibility. +Temporal train/test split (80/20) to avoid future leakage. + +Usage: + python3 scripts/train_score_model.py +""" + +import os +import sys +import pickle +import numpy as np import pandas as pd import xgboost as xgb -import pickle -import os -from sklearn.model_selection import train_test_split -from sklearn.metrics import mean_absolute_error, r2_score +from datetime import datetime +from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error -# Paths -DATA_PATH = os.path.join(os.path.dirname(__file__), "../data/training_data.csv") -MODEL_PATH = os.path.join(os.path.dirname(__file__), "../models/xgb_score.pkl") +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Import unified 56-feature array from markets trainer -from train_xgboost_markets import FEATURES +# Config +AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") +MODEL_PATH = os.path.join(AI_ENGINE_DIR, "models", "xgb_score.pkl") + +# Import the EXACT same feature set as v25 market models +from train_v25_clean import FEATURES TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"] -def train(): - print("πŸš€ Training Score Prediction Model (XGBoost) - Full Time & Half Time") - print("=" * 60) +# Model hyperparameters (tuned for goal count regression) +XGB_PARAMS = { + "objective": "reg:squarederror", + "n_estimators": 1200, + "learning_rate": 0.02, + "max_depth": 6, + "subsample": 0.8, + "colsample_bytree": 0.7, + "min_child_weight": 5, + "reg_alpha": 0.1, + "reg_lambda": 1.0, + "n_jobs": -1, + "random_state": 42, +} + +def load_data() -> pd.DataFrame: + """Load and validate training data.""" if not os.path.exists(DATA_PATH): print(f"❌ Data file not found: {DATA_PATH}") - return + print(" Run extract_training_data.py first") + sys.exit(1) print(f"πŸ“¦ Loading data from {DATA_PATH}...") df = pd.read_csv(DATA_PATH) - - # Preprocessing - # Drop rows where target is missing (should verify) + + # Fill feature NaNs with 0 (same as v25 training) + for col in FEATURES: + if col in df.columns: + df[col] = df[col].fillna(0) + + # Backward-compatible: add odds presence flags if missing + odds_base_columns = [ + "odds_ms_h", "odds_ms_d", "odds_ms_a", + "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a", + "odds_ou05_o", "odds_ou05_u", + "odds_ou15_o", "odds_ou15_u", + "odds_ou25_o", "odds_ou25_u", + "odds_ou35_o", "odds_ou35_u", + "odds_ht_ou05_o", "odds_ht_ou05_u", + "odds_ht_ou15_o", "odds_ht_ou15_u", + "odds_btts_y", "odds_btts_n", + ] + for base_col in odds_base_columns: + pres_col = f"{base_col}_present" + if pres_col not in df.columns and base_col in df.columns: + df[pres_col] = (df[base_col] > 1.0).astype(int) + + # Drop rows where any target is missing df = df.dropna(subset=TARGETS) - - # Fill feature NaNs with median/mean or 0 - print(f" Original rows: {len(df)}") - - # Filter valid odds (at least ms_h > 1.0) + + # Filter: at least MS odds must be present df = df[df["odds_ms_h"] > 1.0].copy() - print(f" Rows with valid odds: {len(df)}") - - X = df[FEATURES] - y_home = df["score_home"] - y_away = df["score_away"] - y_ht_home = df["ht_score_home"] - y_ht_away = df["ht_score_away"] - - # Train/Test Split - X_train, X_test, y_h_train, y_h_test, y_a_train, y_a_test, y_ht_h_train, y_ht_h_test, y_ht_a_train, y_ht_a_test = train_test_split( - X, y_home, y_away, y_ht_home, y_ht_away, test_size=0.2, random_state=42 - ) - - print(f" Training set: {len(X_train)} matches") - print(f" Test set: {len(X_test)} matches") - # --- HOME GOALS MODEL --- - print("\n🏠 Training Home Goals Model...") - xgb_home = xgb.XGBRegressor( - objective='reg:squarederror', - n_estimators=1000, - learning_rate=0.01, - max_depth=5, - subsample=0.7, - colsample_bytree=0.7, - n_jobs=-1, - random_state=42, - early_stopping_rounds=50 # Configure here for newer XGBoost or remove if not supported in constructor (depends on version) - ) - # Actually, to be safe across versions, let's remove early stopping for now or use validation set properly - # Using 'eval_set' without early_stopping_rounds just prints metrics - xgb_home = xgb.XGBRegressor( - objective='reg:squarederror', - n_estimators=1000, - learning_rate=0.01, - max_depth=5, - subsample=0.7, - colsample_bytree=0.7, - n_jobs=-1, - random_state=42 - ) - xgb_home.fit(X_train, y_h_train, eval_set=[(X_test, y_h_test)], verbose=False) - - home_preds = xgb_home.predict(X_test) - mae_home = mean_absolute_error(y_h_test, home_preds) - r2_home = r2_score(y_h_test, home_preds) - print(f" βœ… FT Home MAE: {mae_home:.4f} goals") - print(f" βœ… FT Home R2: {r2_home:.4f}") + # Ensure all features exist + missing = [f for f in FEATURES if f not in df.columns] + if missing: + print(f"⚠️ Missing {len(missing)} features, filling with 0: {missing[:5]}...") + for f in missing: + df[f] = 0 - # --- AWAY GOALS MODEL --- - print("\n✈️ Training FT Away Goals Model...") - xgb_away = xgb.XGBRegressor( - objective='reg:squarederror', - n_estimators=1000, - learning_rate=0.01, - max_depth=5, - subsample=0.7, - colsample_bytree=0.7, - n_jobs=-1, - random_state=42 - ) - xgb_away.fit(X_train, y_a_train, eval_set=[(X_test, y_a_test)], verbose=False) - - away_preds = xgb_away.predict(X_test) - mae_away = mean_absolute_error(y_a_test, away_preds) - r2_away = r2_score(y_a_test, away_preds) - print(f" βœ… FT Away MAE: {mae_away:.4f} goals") - print(f" βœ… FT Away R2: {r2_away:.4f}") - - # --- HT HOME GOALS MODEL --- - print("\n🏠 Training HT Home Goals Model...") - xgb_ht_home = xgb.XGBRegressor( - objective='reg:squarederror', - n_estimators=1000, - learning_rate=0.01, - max_depth=5, - subsample=0.7, - colsample_bytree=0.7, - n_jobs=-1, - random_state=42 - ) - xgb_ht_home.fit(X_train, y_ht_h_train, eval_set=[(X_test, y_ht_h_test)], verbose=False) - - ht_home_preds = xgb_ht_home.predict(X_test) - mae_ht_home = mean_absolute_error(y_ht_h_test, ht_home_preds) - print(f" βœ… HT Home MAE: {mae_ht_home:.4f} goals") + return df - # --- HT AWAY GOALS MODEL --- - print("\n✈️ Training HT Away Goals Model...") - xgb_ht_away = xgb.XGBRegressor( - objective='reg:squarederror', - n_estimators=1000, - learning_rate=0.01, - max_depth=5, - subsample=0.7, - colsample_bytree=0.7, - n_jobs=-1, - random_state=42 + +def temporal_split(df: pd.DataFrame, train_ratio: float = 0.80): + """ + Temporal train/test split by match date. + Ensures no future information leaks into training. + """ + if "match_date" in df.columns: + df = df.sort_values("match_date").reset_index(drop=True) + elif "round" in df.columns: + df = df.sort_values("round").reset_index(drop=True) + + split_idx = int(len(df) * train_ratio) + return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy() + + +def train_single_model(X_train, y_train, X_test, y_test, name: str): + """Train a single XGBoost regression model with early stopping.""" + print(f"\nπŸ—οΈ Training {name} model...") + + model = xgb.XGBRegressor(**XGB_PARAMS) + model.fit( + X_train, y_train, + eval_set=[(X_test, y_test)], + verbose=False, ) - xgb_ht_away.fit(X_train, y_ht_a_train, eval_set=[(X_test, y_ht_a_test)], verbose=False) - - ht_away_preds = xgb_ht_away.predict(X_test) - mae_ht_away = mean_absolute_error(y_ht_a_test, ht_away_preds) - print(f" βœ… HT Away MAE: {mae_ht_away:.4f} goals") - - # --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) --- - print("\n🎯 Exact FT Score Accuracy (Test Set):") - correct = 0 - close = 0 # Within 1 goal diff for both - - for h_true, a_true, h_pred, a_pred in zip(y_h_test, y_a_test, home_preds, away_preds): - h_p = round(h_pred) - a_p = round(a_pred) - if h_p == h_true and a_p == a_true: - correct += 1 - if abs(h_p - h_true) <= 1 and abs(a_p - a_true) <= 1: + + preds = model.predict(X_test) + + mae = mean_absolute_error(y_test, preds) + rmse = np.sqrt(mean_squared_error(y_test, preds)) + r2 = r2_score(y_test, preds) + + print(f" MAE: {mae:.4f} goals") + print(f" RMSE: {rmse:.4f}") + print(f" RΒ²: {r2:.4f}") + + return model, {"mae": mae, "rmse": rmse, "r2": r2} + + +def evaluate_combined(models: dict, X_test, y_test_dict: dict): + """Evaluate combined score accuracy (FT and HT).""" + print("\n🎯 Combined Score Evaluation (Test Set):") + + # FT Score + ft_h_preds = models["ft_home"].predict(X_test) + ft_a_preds = models["ft_away"].predict(X_test) + + y_ft_h = y_test_dict["score_home"].values + y_ft_a = y_test_dict["score_away"].values + + exact = 0 + close = 0 + result_correct = 0 + total = len(X_test) + + for h_true, a_true, h_pred, a_pred in zip(y_ft_h, y_ft_a, ft_h_preds, ft_a_preds): + hp = max(0, round(h_pred)) + ap = max(0, round(a_pred)) + + # Exact score + if hp == h_true and ap == a_true: + exact += 1 + + # Close (Β±1 each) + if abs(hp - h_true) <= 1 and abs(ap - a_true) <= 1: close += 1 - - acc = correct / len(X_test) * 100 - close_acc = close / len(X_test) * 100 - print(f" Exact Match: {acc:.2f}%") - print(f" Close Match (+/- 1 goal): {close_acc:.2f}%") + + # Result direction (1X2) + true_result = 1 if h_true > a_true else (0 if h_true == a_true else -1) + pred_result = 1 if hp > ap else (0 if hp == ap else -1) + if true_result == pred_result: + result_correct += 1 + + print(f" FT Exact Score: {exact / total * 100:.2f}% ({exact}/{total})") + print(f" FT Close (Β±1): {close / total * 100:.2f}% ({close}/{total})") + print(f" FT Result (1X2): {result_correct / total * 100:.2f}% ({result_correct}/{total})") + + # HT Score + ht_h_preds = models["ht_home"].predict(X_test) + ht_a_preds = models["ht_away"].predict(X_test) + + y_ht_h = y_test_dict["ht_score_home"].values + y_ht_a = y_test_dict["ht_score_away"].values + + ht_exact = 0 + ht_total = len(X_test) + + for h_true, a_true, h_pred, a_pred in zip(y_ht_h, y_ht_a, ht_h_preds, ht_a_preds): + hp = max(0, round(h_pred)) + ap = max(0, round(a_pred)) + if hp == h_true and ap == a_true: + ht_exact += 1 + + print(f" HT Exact Score: {ht_exact / ht_total * 100:.2f}% ({ht_exact}/{ht_total})") + + return { + "ft_exact_pct": exact / total * 100, + "ft_close_pct": close / total * 100, + "ft_result_pct": result_correct / total * 100, + "ht_exact_pct": ht_exact / ht_total * 100, + } + + +def train(): + """Main training pipeline.""" + print("πŸš€ Score Prediction Model Trainer (V25-Compatible)") + print(f" Feature count: {len(FEATURES)}") + print("=" * 60) + + # Load data + df = load_data() + print(f" Total valid rows: {len(df)}") + + # Temporal split + train_df, test_df = temporal_split(df) + print(f" Training set: {len(train_df)} matches") + print(f" Test set: {len(test_df)} matches (temporally after training)") + + X_train = train_df[FEATURES] + X_test = test_df[FEATURES] + + # Train 4 models + models = {} + metrics = {} + + for target_name, model_key in [ + ("score_home", "ft_home"), + ("score_away", "ft_away"), + ("ht_score_home", "ht_home"), + ("ht_score_away", "ht_away"), + ]: + model, metric = train_single_model( + X_train, train_df[target_name], + X_test, test_df[target_name], + model_key, + ) + models[model_key] = model + metrics[model_key] = metric + + # Combined evaluation + y_test_dict = {t: test_df[t] for t in TARGETS} + combined = evaluate_combined(models, X_test, y_test_dict) # Save - print(f"\nπŸ’Ύ Saving models to {MODEL_PATH}...") + print(f"\nπŸ’Ύ Saving to {MODEL_PATH}...") model_data = { - "home_model": xgb_home, - "away_model": xgb_away, - "ht_home_model": xgb_ht_home, - "ht_away_model": xgb_ht_away, + "home_model": models["ft_home"], + "away_model": models["ft_away"], + "ht_home_model": models["ht_home"], + "ht_away_model": models["ht_away"], "features": FEATURES, "meta": { - "mae_home": mae_home, - "mae_away": mae_away, - "mae_ht_home": mae_ht_home, - "mae_ht_away": mae_ht_away, - "acc": acc - } + **{f"{k}_{mk}": mv for k, m in metrics.items() for mk, mv in m.items()}, + **combined, + "trained_at": datetime.now().isoformat(), + "feature_count": len(FEATURES), + "train_size": len(train_df), + "test_size": len(test_df), + }, } + with open(MODEL_PATH, "wb") as f: pickle.dump(model_data, f) - - print("βœ… Done.") + + print("\nβœ… Score model training complete!") + print(f" Saved: {MODEL_PATH}") + if __name__ == "__main__": train() diff --git a/ai-engine/scripts/update_implied_odds.py b/ai-engine/scripts/update_implied_odds.py new file mode 100644 index 0000000..51428c6 --- /dev/null +++ b/ai-engine/scripts/update_implied_odds.py @@ -0,0 +1,307 @@ +""" +Update Implied Odds in football_ai_features +============================================= +Populates implied_home, implied_draw, implied_away, implied_over25, implied_btts +from real odds data in odd_categories + odd_selections tables. + +Also backfills form-based features (home_goals_avg_5, away_goals_avg_5, etc.) +from recent match history. + +Usage: + python3 scripts/update_implied_odds.py +""" + +import os +import sys +import time +import psycopg2 +from dotenv import load_dotenv + +load_dotenv() + + +def get_conn(): + db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0] + return psycopg2.connect(db_url) + + +def update_implied_odds(conn): + """Update implied probabilities from real odds data.""" + cur = conn.cursor() + + print("πŸ“Š Phase 1: Updating implied odds from real market data...") + t0 = time.time() + + # Step 1: Build odds lookup from odd_categories + odd_selections + print(" Loading odds data...") + cur.execute(""" + SELECT oc.match_id, oc.name AS cat_name, os.name AS sel_name, os.odd_value + FROM odd_selections os + JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id + WHERE os.odd_value IS NOT NULL + AND CAST(os.odd_value AS FLOAT) > 1.0 + """) + + odds_by_match = {} + row_count = 0 + for match_id, cat_name, sel_name, odd_val in cur.fetchall(): + try: + v = float(odd_val) + if v <= 1.0: + continue + except (ValueError, TypeError): + continue + + if match_id not in odds_by_match: + odds_by_match[match_id] = {} + + cat_lower = (cat_name or "").lower().strip() + sel_lower = (sel_name or "").lower().strip() + + # Match Result (1X2) + if cat_lower == 'maΓ§ sonucu': + if sel_name == '1': + odds_by_match[match_id]['ms_h'] = v + elif sel_name in ('0', 'X'): + odds_by_match[match_id]['ms_d'] = v + elif sel_name == '2': + odds_by_match[match_id]['ms_a'] = v + + # Over/Under 2.5 + elif cat_lower == '2,5 alt/ΓΌst': + if 'ΓΌst' in sel_lower: + odds_by_match[match_id]['ou25_o'] = v + elif 'alt' in sel_lower: + odds_by_match[match_id]['ou25_u'] = v + + # BTTS + elif cat_lower == 'karşılΔ±klΔ± gol': + if 'var' in sel_lower: + odds_by_match[match_id]['btts_y'] = v + elif 'yok' in sel_lower: + odds_by_match[match_id]['btts_n'] = v + + row_count += 1 + + print(f" Loaded odds for {len(odds_by_match)} matches ({row_count} selections) in {time.time()-t0:.1f}s") + + # Step 2: Calculate implied probabilities and update + print(" Calculating implied probabilities...") + + # Get all match_ids in football_ai_features + cur.execute("SELECT match_id FROM football_ai_features") + feature_match_ids = {row[0] for row in cur.fetchall()} + + updated = 0 + batch_size = 500 + updates = [] + + for match_id in feature_match_ids: + odds = odds_by_match.get(match_id, {}) + if not odds: + continue + + # Implied MS probabilities (vig-free normalization) + ms_h = odds.get('ms_h', 0) + ms_d = odds.get('ms_d', 0) + ms_a = odds.get('ms_a', 0) + + implied_home = 0.33 + implied_draw = 0.33 + implied_away = 0.33 + + if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0: + raw_sum = (1 / ms_h) + (1 / ms_d) + (1 / ms_a) + if raw_sum > 0: + implied_home = round((1 / ms_h) / raw_sum, 4) + implied_draw = round((1 / ms_d) / raw_sum, 4) + implied_away = round((1 / ms_a) / raw_sum, 4) + + # Implied OU25 + ou25_o = odds.get('ou25_o', 0) + ou25_u = odds.get('ou25_u', 0) + implied_over25 = 0.50 + + if ou25_o > 1.0 and ou25_u > 1.0: + raw_sum = (1 / ou25_o) + (1 / ou25_u) + if raw_sum > 0: + implied_over25 = round((1 / ou25_o) / raw_sum, 4) + + # Implied BTTS + btts_y = odds.get('btts_y', 0) + btts_n = odds.get('btts_n', 0) + implied_btts = 0.50 + + if btts_y > 1.0 and btts_n > 1.0: + raw_sum = (1 / btts_y) + (1 / btts_n) + if raw_sum > 0: + implied_btts = round((1 / btts_y) / raw_sum, 4) + + # Only update if we have real data (not all defaults) + has_real_data = (ms_h > 1.0 or ou25_o > 1.0 or btts_y > 1.0) + if not has_real_data: + continue + + updates.append(( + implied_home, implied_draw, implied_away, + implied_over25, implied_btts, match_id + )) + + if len(updates) >= batch_size: + cur.executemany(""" + UPDATE football_ai_features + SET implied_home = %s, + implied_draw = %s, + implied_away = %s, + implied_over25 = %s, + implied_btts = %s + WHERE match_id = %s + """, updates) + updated += len(updates) + updates = [] + + # Final batch + if updates: + cur.executemany(""" + UPDATE football_ai_features + SET implied_home = %s, + implied_draw = %s, + implied_away = %s, + implied_over25 = %s, + implied_btts = %s + WHERE match_id = %s + """, updates) + updated += len(updates) + + conn.commit() + print(f" βœ… Updated implied odds for {updated} matches in {time.time()-t0:.1f}s") + return updated + + +def update_form_features(conn): + """Backfill form-based features (goals avg, clean sheet rate) from match history.""" + cur = conn.cursor() + + print("\nπŸ“Š Phase 2: Updating form-based features...") + t0 = time.time() + + # Load all finished football matches ordered by time + print(" Loading match history...") + cur.execute(""" + SELECT id, home_team_id, away_team_id, score_home, score_away, mst_utc + FROM matches + WHERE status = 'FT' + AND score_home IS NOT NULL + AND sport = 'football' + ORDER BY mst_utc ASC + """) + + matches = cur.fetchall() + print(f" Loaded {len(matches)} finished matches") + + # Build team history incrementally + from collections import defaultdict + team_history = defaultdict(list) # team_id -> [(goals_scored, goals_conceded)] + + # Get all feature match IDs + cur.execute("SELECT match_id FROM football_ai_features") + feature_match_ids = {row[0] for row in cur.fetchall()} + + updated = 0 + batch_size = 500 + updates = [] + + for match_id, home_id, away_id, score_home, score_away, mst_utc in matches: + # Calculate features BEFORE updating history (pre-match features) + if match_id in feature_match_ids: + h_hist = team_history[home_id][-5:] # last 5 + a_hist = team_history[away_id][-5:] + + # Home team form + if h_hist: + h_goals_avg = sum(g for g, _ in h_hist) / len(h_hist) + h_conceded_avg = sum(c for _, c in h_hist) / len(h_hist) + h_cs_rate = sum(1 for _, c in h_hist if c == 0) / len(h_hist) + h_scoring_rate = sum(1 for g, _ in h_hist if g > 0) / len(h_hist) + else: + h_goals_avg, h_conceded_avg = 1.3, 1.2 + h_cs_rate, h_scoring_rate = 0.25, 0.75 + + # Away team form + if a_hist: + a_goals_avg = sum(g for g, _ in a_hist) / len(a_hist) + a_conceded_avg = sum(c for _, c in a_hist) / len(a_hist) + a_cs_rate = sum(1 for _, c in a_hist if c == 0) / len(a_hist) + a_scoring_rate = sum(1 for g, _ in a_hist if g > 0) / len(a_hist) + else: + a_goals_avg, a_conceded_avg = 1.3, 1.2 + a_cs_rate, a_scoring_rate = 0.25, 0.75 + + updates.append(( + round(h_goals_avg, 3), round(h_conceded_avg, 3), + round(h_cs_rate, 3), round(h_scoring_rate, 3), + round(a_goals_avg, 3), round(a_conceded_avg, 3), + round(a_cs_rate, 3), round(a_scoring_rate, 3), + match_id + )) + + if len(updates) >= batch_size: + cur.executemany(""" + UPDATE football_ai_features + SET home_goals_avg_5 = %s, + home_conceded_avg_5 = %s, + home_clean_sheet_rate = %s, + home_scoring_rate = %s, + away_goals_avg_5 = %s, + away_conceded_avg_5 = %s, + away_clean_sheet_rate = %s, + away_scoring_rate = %s + WHERE match_id = %s + """, updates) + updated += len(updates) + updates = [] + + # Update history AFTER feature extraction (maintains pre-match invariant) + team_history[home_id].append((score_home, score_away)) + team_history[away_id].append((score_away, score_home)) + + # Final batch + if updates: + cur.executemany(""" + UPDATE football_ai_features + SET home_goals_avg_5 = %s, + home_conceded_avg_5 = %s, + home_clean_sheet_rate = %s, + home_scoring_rate = %s, + away_goals_avg_5 = %s, + away_conceded_avg_5 = %s, + away_clean_sheet_rate = %s, + away_scoring_rate = %s + WHERE match_id = %s + """, updates) + updated += len(updates) + + conn.commit() + print(f" βœ… Updated form features for {updated} matches in {time.time()-t0:.1f}s") + return updated + + +def main(): + print("πŸš€ Football AI Features β€” Implied Odds & Form Backfill") + print("=" * 60) + + conn = get_conn() + + try: + odds_updated = update_implied_odds(conn) + form_updated = update_form_features(conn) + + print(f"\nβœ… DONE!") + print(f" Implied odds updated: {odds_updated} matches") + print(f" Form features updated: {form_updated} matches") + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/ai-engine/services/single_match_orchestrator.py b/ai-engine/services/single_match_orchestrator.py index e800b93..23dafc8 100755 --- a/ai-engine/services/single_match_orchestrator.py +++ b/ai-engine/services/single_match_orchestrator.py @@ -16,6 +16,7 @@ import re import time import math import os +import pickle import pandas as pd import numpy as np from collections import defaultdict @@ -258,6 +259,51 @@ class SingleMatchOrchestrator: self._v27 = None return None + def _get_score_model(self) -> Optional[Dict]: + """Load XGBoost score prediction model (non-fatal).""" + if hasattr(self, "_score_model_cache"): + return self._score_model_cache + score_model_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "models", "xgb_score.pkl", + ) + try: + if os.path.exists(score_model_path): + with open(score_model_path, "rb") as f: + model_data = pickle.load(f) + if all(k in model_data for k in ("home_model", "away_model", "ht_home_model", "ht_away_model", "features")): + self._score_model_cache = model_data + print(f"[SCORE] βœ… Score model loaded ({len(model_data['features'])} features)") + return self._score_model_cache + except Exception as e: + print(f"[SCORE] ⚠ Load failed (non-fatal, using heuristic): {e}") + self._score_model_cache = None + return None + + def _predict_score_with_model(self, features: Dict[str, float]) -> Optional[Dict[str, float]]: + """Predict FT/HT scores using XGBoost score model.""" + score_model = self._get_score_model() + if score_model is None: + return None + try: + import pandas as _pd + model_features = score_model["features"] + row = {f: float(features.get(f, 0)) for f in model_features} + df = _pd.DataFrame([row]) + ft_home = max(0.0, float(score_model["home_model"].predict(df)[0])) + ft_away = max(0.0, float(score_model["away_model"].predict(df)[0])) + ht_home = max(0.0, float(score_model["ht_home_model"].predict(df)[0])) + ht_away = max(0.0, float(score_model["ht_away_model"].predict(df)[0])) + return { + "ft_home": round(ft_home, 2), + "ft_away": round(ft_away, 2), + "ht_home": round(ht_home, 2), + "ht_away": round(ht_away, 2), + } + except Exception as e: + print(f"[SCORE] ⚠ Prediction error (fallback to heuristic): {e}") + return None + def _build_v25_features(self, data: MatchData) -> Dict[str, float]: """ Build the single authoritative V25 pre-match feature vector. @@ -869,27 +915,39 @@ class SingleMatchOrchestrator: prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs) prediction.handicap_confidence = hcap_top * 100.0 - base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0) - base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0) - ms_edge = prediction.ms_home_prob - prediction.ms_away_prob - total_target = max( - 1.4, - min( - 4.8, - (float(features.get("league_avg_goals", 2.7)) * 0.55) - + ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45) - + ((prediction.over_25_prob - prediction.under_25_prob) * 1.15), - ), - ) - home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) - away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) - scale = total_target / max(home_xg + away_xg, 0.1) - prediction.home_xg = round(home_xg * scale, 2) - prediction.away_xg = round(away_xg * scale, 2) - prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2) - - prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}" - prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}" + # ── Score Prediction: Model-first, heuristic fallback ────────── + score_result = self._predict_score_with_model(features) + if score_result is not None: + # ML model predicted scores + prediction.home_xg = score_result["ft_home"] + prediction.away_xg = score_result["ft_away"] + prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2) + ht_home_xg = score_result["ht_home"] + ht_away_xg = score_result["ht_away"] + prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}" + prediction.predicted_ht_score = f"{int(round(ht_home_xg))}-{int(round(ht_away_xg))}" + else: + # Heuristic fallback (original formula) + base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0) + base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0) + ms_edge = prediction.ms_home_prob - prediction.ms_away_prob + total_target = max( + 1.4, + min( + 4.8, + (float(features.get("league_avg_goals", 2.7)) * 0.55) + + ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45) + + ((prediction.over_25_prob - prediction.under_25_prob) * 1.15), + ), + ) + home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) + away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) + scale = total_target / max(home_xg + away_xg, 0.1) + prediction.home_xg = round(home_xg * scale, 2) + prediction.away_xg = round(away_xg * scale, 2) + prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2) + prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}" + prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}" prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg) max_market_conf = max(