feat(ai): expand training to 68K+ matches, add score model, backfill implied odds
Deploy Iddaai Backend / build-and-deploy (push) Successful in 6s

- extract_training_data.py: switch from top_leagues.json (23) to qualified_leagues.json (265)
- update_implied_odds.py: new script to backfill implied odds from real market data
- train_score_model.py: rewrite with v25 102-feature set + temporal split
- single_match_orchestrator.py: integrate ML score model with heuristic fallback
This commit is contained in:
2026-05-05 16:04:00 +03:00
parent 9bb8f39bca
commit 244d8f5366
4 changed files with 626 additions and 173 deletions
+1 -1
View File
@@ -33,7 +33,7 @@ from features.upset_engine import get_upset_engine
from features.referee_engine import get_referee_engine from features.referee_engine import get_referee_engine
from features.momentum_engine import get_momentum_engine from features.momentum_engine import get_momentum_engine
TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "top_leagues.json") TOP_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")
OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv") OUTPUT_CSV = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
# Ensure output dir exists # Ensure output dir exists
+225 -137
View File
@@ -1,183 +1,271 @@
"""
V25-Compatible Score Prediction Model Trainer
===============================================
Trains 4 independent XGBoost regression models for:
- FT Home Goals
- FT Away Goals
- HT Home Goals
- HT Away Goals
Uses the same 102-feature set as v25_ensemble for full compatibility.
Temporal train/test split (80/20) to avoid future leakage.
Usage:
python3 scripts/train_score_model.py
"""
import os
import sys
import pickle
import numpy as np
import pandas as pd import pandas as pd
import xgboost as xgb import xgboost as xgb
import pickle from datetime import datetime
import os from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Paths # Add parent directory to path
DATA_PATH = os.path.join(os.path.dirname(__file__), "../data/training_data.csv") sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../models/xgb_score.pkl")
# Import unified 56-feature array from markets trainer # Config
from train_xgboost_markets import FEATURES AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
MODEL_PATH = os.path.join(AI_ENGINE_DIR, "models", "xgb_score.pkl")
# Import the EXACT same feature set as v25 market models
from train_v25_clean import FEATURES
TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"] TARGETS = ["score_home", "score_away", "ht_score_home", "ht_score_away"]
def train(): # Model hyperparameters (tuned for goal count regression)
print("🚀 Training Score Prediction Model (XGBoost) - Full Time & Half Time") XGB_PARAMS = {
print("=" * 60) "objective": "reg:squarederror",
"n_estimators": 1200,
"learning_rate": 0.02,
"max_depth": 6,
"subsample": 0.8,
"colsample_bytree": 0.7,
"min_child_weight": 5,
"reg_alpha": 0.1,
"reg_lambda": 1.0,
"n_jobs": -1,
"random_state": 42,
}
def load_data() -> pd.DataFrame:
"""Load and validate training data."""
if not os.path.exists(DATA_PATH): if not os.path.exists(DATA_PATH):
print(f"❌ Data file not found: {DATA_PATH}") print(f"❌ Data file not found: {DATA_PATH}")
return print(" Run extract_training_data.py first")
sys.exit(1)
print(f"📦 Loading data from {DATA_PATH}...") print(f"📦 Loading data from {DATA_PATH}...")
df = pd.read_csv(DATA_PATH) df = pd.read_csv(DATA_PATH)
# Preprocessing # Fill feature NaNs with 0 (same as v25 training)
# Drop rows where target is missing (should verify) for col in FEATURES:
if col in df.columns:
df[col] = df[col].fillna(0)
# Backward-compatible: add odds presence flags if missing
odds_base_columns = [
"odds_ms_h", "odds_ms_d", "odds_ms_a",
"odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
"odds_ou05_o", "odds_ou05_u",
"odds_ou15_o", "odds_ou15_u",
"odds_ou25_o", "odds_ou25_u",
"odds_ou35_o", "odds_ou35_u",
"odds_ht_ou05_o", "odds_ht_ou05_u",
"odds_ht_ou15_o", "odds_ht_ou15_u",
"odds_btts_y", "odds_btts_n",
]
for base_col in odds_base_columns:
pres_col = f"{base_col}_present"
if pres_col not in df.columns and base_col in df.columns:
df[pres_col] = (df[base_col] > 1.0).astype(int)
# Drop rows where any target is missing
df = df.dropna(subset=TARGETS) df = df.dropna(subset=TARGETS)
# Fill feature NaNs with median/mean or 0 # Filter: at least MS odds must be present
print(f" Original rows: {len(df)}")
# Filter valid odds (at least ms_h > 1.0)
df = df[df["odds_ms_h"] > 1.0].copy() df = df[df["odds_ms_h"] > 1.0].copy()
print(f" Rows with valid odds: {len(df)}")
X = df[FEATURES] # Ensure all features exist
y_home = df["score_home"] missing = [f for f in FEATURES if f not in df.columns]
y_away = df["score_away"] if missing:
y_ht_home = df["ht_score_home"] print(f"⚠️ Missing {len(missing)} features, filling with 0: {missing[:5]}...")
y_ht_away = df["ht_score_away"] for f in missing:
df[f] = 0
# Train/Test Split return df
X_train, X_test, y_h_train, y_h_test, y_a_train, y_a_test, y_ht_h_train, y_ht_h_test, y_ht_a_train, y_ht_a_test = train_test_split(
X, y_home, y_away, y_ht_home, y_ht_away, test_size=0.2, random_state=42
def temporal_split(df: pd.DataFrame, train_ratio: float = 0.80):
"""
Temporal train/test split by match date.
Ensures no future information leaks into training.
"""
if "match_date" in df.columns:
df = df.sort_values("match_date").reset_index(drop=True)
elif "round" in df.columns:
df = df.sort_values("round").reset_index(drop=True)
split_idx = int(len(df) * train_ratio)
return df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
def train_single_model(X_train, y_train, X_test, y_test, name: str):
"""Train a single XGBoost regression model with early stopping."""
print(f"\n🏗️ Training {name} model...")
model = xgb.XGBRegressor(**XGB_PARAMS)
model.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
verbose=False,
) )
print(f" Training set: {len(X_train)} matches") preds = model.predict(X_test)
print(f" Test set: {len(X_test)} matches")
# --- HOME GOALS MODEL --- mae = mean_absolute_error(y_test, preds)
print("\n🏠 Training Home Goals Model...") rmse = np.sqrt(mean_squared_error(y_test, preds))
xgb_home = xgb.XGBRegressor( r2 = r2_score(y_test, preds)
objective='reg:squarederror',
n_estimators=1000,
learning_rate=0.01,
max_depth=5,
subsample=0.7,
colsample_bytree=0.7,
n_jobs=-1,
random_state=42,
early_stopping_rounds=50 # Configure here for newer XGBoost or remove if not supported in constructor (depends on version)
)
# Actually, to be safe across versions, let's remove early stopping for now or use validation set properly
# Using 'eval_set' without early_stopping_rounds just prints metrics
xgb_home = xgb.XGBRegressor(
objective='reg:squarederror',
n_estimators=1000,
learning_rate=0.01,
max_depth=5,
subsample=0.7,
colsample_bytree=0.7,
n_jobs=-1,
random_state=42
)
xgb_home.fit(X_train, y_h_train, eval_set=[(X_test, y_h_test)], verbose=False)
home_preds = xgb_home.predict(X_test) print(f" MAE: {mae:.4f} goals")
mae_home = mean_absolute_error(y_h_test, home_preds) print(f" RMSE: {rmse:.4f}")
r2_home = r2_score(y_h_test, home_preds) print(f" R²: {r2:.4f}")
print(f" ✅ FT Home MAE: {mae_home:.4f} goals")
print(f" ✅ FT Home R2: {r2_home:.4f}")
# --- AWAY GOALS MODEL --- return model, {"mae": mae, "rmse": rmse, "r2": r2}
print("\n✈️ Training FT Away Goals Model...")
xgb_away = xgb.XGBRegressor(
objective='reg:squarederror',
n_estimators=1000,
learning_rate=0.01,
max_depth=5,
subsample=0.7,
colsample_bytree=0.7,
n_jobs=-1,
random_state=42
)
xgb_away.fit(X_train, y_a_train, eval_set=[(X_test, y_a_test)], verbose=False)
away_preds = xgb_away.predict(X_test)
mae_away = mean_absolute_error(y_a_test, away_preds)
r2_away = r2_score(y_a_test, away_preds)
print(f" ✅ FT Away MAE: {mae_away:.4f} goals")
print(f" ✅ FT Away R2: {r2_away:.4f}")
# --- HT HOME GOALS MODEL --- def evaluate_combined(models: dict, X_test, y_test_dict: dict):
print("\n🏠 Training HT Home Goals Model...") """Evaluate combined score accuracy (FT and HT)."""
xgb_ht_home = xgb.XGBRegressor( print("\n🎯 Combined Score Evaluation (Test Set):")
objective='reg:squarederror',
n_estimators=1000,
learning_rate=0.01,
max_depth=5,
subsample=0.7,
colsample_bytree=0.7,
n_jobs=-1,
random_state=42
)
xgb_ht_home.fit(X_train, y_ht_h_train, eval_set=[(X_test, y_ht_h_test)], verbose=False)
ht_home_preds = xgb_ht_home.predict(X_test) # FT Score
mae_ht_home = mean_absolute_error(y_ht_h_test, ht_home_preds) ft_h_preds = models["ft_home"].predict(X_test)
print(f" ✅ HT Home MAE: {mae_ht_home:.4f} goals") ft_a_preds = models["ft_away"].predict(X_test)
# --- HT AWAY GOALS MODEL --- y_ft_h = y_test_dict["score_home"].values
print("\n✈️ Training HT Away Goals Model...") y_ft_a = y_test_dict["score_away"].values
xgb_ht_away = xgb.XGBRegressor(
objective='reg:squarederror',
n_estimators=1000,
learning_rate=0.01,
max_depth=5,
subsample=0.7,
colsample_bytree=0.7,
n_jobs=-1,
random_state=42
)
xgb_ht_away.fit(X_train, y_ht_a_train, eval_set=[(X_test, y_ht_a_test)], verbose=False)
ht_away_preds = xgb_ht_away.predict(X_test) exact = 0
mae_ht_away = mean_absolute_error(y_ht_a_test, ht_away_preds) close = 0
print(f" ✅ HT Away MAE: {mae_ht_away:.4f} goals") result_correct = 0
total = len(X_test)
# --- EVALUATE EXACT SCORE ACCURACY (ROUNDED) --- for h_true, a_true, h_pred, a_pred in zip(y_ft_h, y_ft_a, ft_h_preds, ft_a_preds):
print("\n🎯 Exact FT Score Accuracy (Test Set):") hp = max(0, round(h_pred))
correct = 0 ap = max(0, round(a_pred))
close = 0 # Within 1 goal diff for both
for h_true, a_true, h_pred, a_pred in zip(y_h_test, y_a_test, home_preds, away_preds): # Exact score
h_p = round(h_pred) if hp == h_true and ap == a_true:
a_p = round(a_pred) exact += 1
if h_p == h_true and a_p == a_true:
correct += 1 # Close (±1 each)
if abs(h_p - h_true) <= 1 and abs(a_p - a_true) <= 1: if abs(hp - h_true) <= 1 and abs(ap - a_true) <= 1:
close += 1 close += 1
acc = correct / len(X_test) * 100 # Result direction (1X2)
close_acc = close / len(X_test) * 100 true_result = 1 if h_true > a_true else (0 if h_true == a_true else -1)
print(f" Exact Match: {acc:.2f}%") pred_result = 1 if hp > ap else (0 if hp == ap else -1)
print(f" Close Match (+/- 1 goal): {close_acc:.2f}%") if true_result == pred_result:
result_correct += 1
print(f" FT Exact Score: {exact / total * 100:.2f}% ({exact}/{total})")
print(f" FT Close (±1): {close / total * 100:.2f}% ({close}/{total})")
print(f" FT Result (1X2): {result_correct / total * 100:.2f}% ({result_correct}/{total})")
# HT Score
ht_h_preds = models["ht_home"].predict(X_test)
ht_a_preds = models["ht_away"].predict(X_test)
y_ht_h = y_test_dict["ht_score_home"].values
y_ht_a = y_test_dict["ht_score_away"].values
ht_exact = 0
ht_total = len(X_test)
for h_true, a_true, h_pred, a_pred in zip(y_ht_h, y_ht_a, ht_h_preds, ht_a_preds):
hp = max(0, round(h_pred))
ap = max(0, round(a_pred))
if hp == h_true and ap == a_true:
ht_exact += 1
print(f" HT Exact Score: {ht_exact / ht_total * 100:.2f}% ({ht_exact}/{ht_total})")
return {
"ft_exact_pct": exact / total * 100,
"ft_close_pct": close / total * 100,
"ft_result_pct": result_correct / total * 100,
"ht_exact_pct": ht_exact / ht_total * 100,
}
def train():
"""Main training pipeline."""
print("🚀 Score Prediction Model Trainer (V25-Compatible)")
print(f" Feature count: {len(FEATURES)}")
print("=" * 60)
# Load data
df = load_data()
print(f" Total valid rows: {len(df)}")
# Temporal split
train_df, test_df = temporal_split(df)
print(f" Training set: {len(train_df)} matches")
print(f" Test set: {len(test_df)} matches (temporally after training)")
X_train = train_df[FEATURES]
X_test = test_df[FEATURES]
# Train 4 models
models = {}
metrics = {}
for target_name, model_key in [
("score_home", "ft_home"),
("score_away", "ft_away"),
("ht_score_home", "ht_home"),
("ht_score_away", "ht_away"),
]:
model, metric = train_single_model(
X_train, train_df[target_name],
X_test, test_df[target_name],
model_key,
)
models[model_key] = model
metrics[model_key] = metric
# Combined evaluation
y_test_dict = {t: test_df[t] for t in TARGETS}
combined = evaluate_combined(models, X_test, y_test_dict)
# Save # Save
print(f"\n💾 Saving models to {MODEL_PATH}...") print(f"\n💾 Saving to {MODEL_PATH}...")
model_data = { model_data = {
"home_model": xgb_home, "home_model": models["ft_home"],
"away_model": xgb_away, "away_model": models["ft_away"],
"ht_home_model": xgb_ht_home, "ht_home_model": models["ht_home"],
"ht_away_model": xgb_ht_away, "ht_away_model": models["ht_away"],
"features": FEATURES, "features": FEATURES,
"meta": { "meta": {
"mae_home": mae_home, **{f"{k}_{mk}": mv for k, m in metrics.items() for mk, mv in m.items()},
"mae_away": mae_away, **combined,
"mae_ht_home": mae_ht_home, "trained_at": datetime.now().isoformat(),
"mae_ht_away": mae_ht_away, "feature_count": len(FEATURES),
"acc": acc "train_size": len(train_df),
} "test_size": len(test_df),
},
} }
with open(MODEL_PATH, "wb") as f: with open(MODEL_PATH, "wb") as f:
pickle.dump(model_data, f) pickle.dump(model_data, f)
print("✅ Done.") print("\n✅ Score model training complete!")
print(f" Saved: {MODEL_PATH}")
if __name__ == "__main__": if __name__ == "__main__":
train() train()
+307
View File
@@ -0,0 +1,307 @@
"""
Update Implied Odds in football_ai_features
=============================================
Populates implied_home, implied_draw, implied_away, implied_over25, implied_btts
from real odds data in odd_categories + odd_selections tables.
Also backfills form-based features (home_goals_avg_5, away_goals_avg_5, etc.)
from recent match history.
Usage:
python3 scripts/update_implied_odds.py
"""
import os
import sys
import time
import psycopg2
from dotenv import load_dotenv
load_dotenv()
def get_conn():
db_url = os.getenv("DATABASE_URL", "").split("?schema=")[0]
return psycopg2.connect(db_url)
def update_implied_odds(conn):
"""Update implied probabilities from real odds data."""
cur = conn.cursor()
print("📊 Phase 1: Updating implied odds from real market data...")
t0 = time.time()
# Step 1: Build odds lookup from odd_categories + odd_selections
print(" Loading odds data...")
cur.execute("""
SELECT oc.match_id, oc.name AS cat_name, os.name AS sel_name, os.odd_value
FROM odd_selections os
JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
WHERE os.odd_value IS NOT NULL
AND CAST(os.odd_value AS FLOAT) > 1.0
""")
odds_by_match = {}
row_count = 0
for match_id, cat_name, sel_name, odd_val in cur.fetchall():
try:
v = float(odd_val)
if v <= 1.0:
continue
except (ValueError, TypeError):
continue
if match_id not in odds_by_match:
odds_by_match[match_id] = {}
cat_lower = (cat_name or "").lower().strip()
sel_lower = (sel_name or "").lower().strip()
# Match Result (1X2)
if cat_lower == 'maç sonucu':
if sel_name == '1':
odds_by_match[match_id]['ms_h'] = v
elif sel_name in ('0', 'X'):
odds_by_match[match_id]['ms_d'] = v
elif sel_name == '2':
odds_by_match[match_id]['ms_a'] = v
# Over/Under 2.5
elif cat_lower == '2,5 alt/üst':
if 'üst' in sel_lower:
odds_by_match[match_id]['ou25_o'] = v
elif 'alt' in sel_lower:
odds_by_match[match_id]['ou25_u'] = v
# BTTS
elif cat_lower == 'karşılıklı gol':
if 'var' in sel_lower:
odds_by_match[match_id]['btts_y'] = v
elif 'yok' in sel_lower:
odds_by_match[match_id]['btts_n'] = v
row_count += 1
print(f" Loaded odds for {len(odds_by_match)} matches ({row_count} selections) in {time.time()-t0:.1f}s")
# Step 2: Calculate implied probabilities and update
print(" Calculating implied probabilities...")
# Get all match_ids in football_ai_features
cur.execute("SELECT match_id FROM football_ai_features")
feature_match_ids = {row[0] for row in cur.fetchall()}
updated = 0
batch_size = 500
updates = []
for match_id in feature_match_ids:
odds = odds_by_match.get(match_id, {})
if not odds:
continue
# Implied MS probabilities (vig-free normalization)
ms_h = odds.get('ms_h', 0)
ms_d = odds.get('ms_d', 0)
ms_a = odds.get('ms_a', 0)
implied_home = 0.33
implied_draw = 0.33
implied_away = 0.33
if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0:
raw_sum = (1 / ms_h) + (1 / ms_d) + (1 / ms_a)
if raw_sum > 0:
implied_home = round((1 / ms_h) / raw_sum, 4)
implied_draw = round((1 / ms_d) / raw_sum, 4)
implied_away = round((1 / ms_a) / raw_sum, 4)
# Implied OU25
ou25_o = odds.get('ou25_o', 0)
ou25_u = odds.get('ou25_u', 0)
implied_over25 = 0.50
if ou25_o > 1.0 and ou25_u > 1.0:
raw_sum = (1 / ou25_o) + (1 / ou25_u)
if raw_sum > 0:
implied_over25 = round((1 / ou25_o) / raw_sum, 4)
# Implied BTTS
btts_y = odds.get('btts_y', 0)
btts_n = odds.get('btts_n', 0)
implied_btts = 0.50
if btts_y > 1.0 and btts_n > 1.0:
raw_sum = (1 / btts_y) + (1 / btts_n)
if raw_sum > 0:
implied_btts = round((1 / btts_y) / raw_sum, 4)
# Only update if we have real data (not all defaults)
has_real_data = (ms_h > 1.0 or ou25_o > 1.0 or btts_y > 1.0)
if not has_real_data:
continue
updates.append((
implied_home, implied_draw, implied_away,
implied_over25, implied_btts, match_id
))
if len(updates) >= batch_size:
cur.executemany("""
UPDATE football_ai_features
SET implied_home = %s,
implied_draw = %s,
implied_away = %s,
implied_over25 = %s,
implied_btts = %s
WHERE match_id = %s
""", updates)
updated += len(updates)
updates = []
# Final batch
if updates:
cur.executemany("""
UPDATE football_ai_features
SET implied_home = %s,
implied_draw = %s,
implied_away = %s,
implied_over25 = %s,
implied_btts = %s
WHERE match_id = %s
""", updates)
updated += len(updates)
conn.commit()
print(f" ✅ Updated implied odds for {updated} matches in {time.time()-t0:.1f}s")
return updated
def update_form_features(conn):
"""Backfill form-based features (goals avg, clean sheet rate) from match history."""
cur = conn.cursor()
print("\n📊 Phase 2: Updating form-based features...")
t0 = time.time()
# Load all finished football matches ordered by time
print(" Loading match history...")
cur.execute("""
SELECT id, home_team_id, away_team_id, score_home, score_away, mst_utc
FROM matches
WHERE status = 'FT'
AND score_home IS NOT NULL
AND sport = 'football'
ORDER BY mst_utc ASC
""")
matches = cur.fetchall()
print(f" Loaded {len(matches)} finished matches")
# Build team history incrementally
from collections import defaultdict
team_history = defaultdict(list) # team_id -> [(goals_scored, goals_conceded)]
# Get all feature match IDs
cur.execute("SELECT match_id FROM football_ai_features")
feature_match_ids = {row[0] for row in cur.fetchall()}
updated = 0
batch_size = 500
updates = []
for match_id, home_id, away_id, score_home, score_away, mst_utc in matches:
# Calculate features BEFORE updating history (pre-match features)
if match_id in feature_match_ids:
h_hist = team_history[home_id][-5:] # last 5
a_hist = team_history[away_id][-5:]
# Home team form
if h_hist:
h_goals_avg = sum(g for g, _ in h_hist) / len(h_hist)
h_conceded_avg = sum(c for _, c in h_hist) / len(h_hist)
h_cs_rate = sum(1 for _, c in h_hist if c == 0) / len(h_hist)
h_scoring_rate = sum(1 for g, _ in h_hist if g > 0) / len(h_hist)
else:
h_goals_avg, h_conceded_avg = 1.3, 1.2
h_cs_rate, h_scoring_rate = 0.25, 0.75
# Away team form
if a_hist:
a_goals_avg = sum(g for g, _ in a_hist) / len(a_hist)
a_conceded_avg = sum(c for _, c in a_hist) / len(a_hist)
a_cs_rate = sum(1 for _, c in a_hist if c == 0) / len(a_hist)
a_scoring_rate = sum(1 for g, _ in a_hist if g > 0) / len(a_hist)
else:
a_goals_avg, a_conceded_avg = 1.3, 1.2
a_cs_rate, a_scoring_rate = 0.25, 0.75
updates.append((
round(h_goals_avg, 3), round(h_conceded_avg, 3),
round(h_cs_rate, 3), round(h_scoring_rate, 3),
round(a_goals_avg, 3), round(a_conceded_avg, 3),
round(a_cs_rate, 3), round(a_scoring_rate, 3),
match_id
))
if len(updates) >= batch_size:
cur.executemany("""
UPDATE football_ai_features
SET home_goals_avg_5 = %s,
home_conceded_avg_5 = %s,
home_clean_sheet_rate = %s,
home_scoring_rate = %s,
away_goals_avg_5 = %s,
away_conceded_avg_5 = %s,
away_clean_sheet_rate = %s,
away_scoring_rate = %s
WHERE match_id = %s
""", updates)
updated += len(updates)
updates = []
# Update history AFTER feature extraction (maintains pre-match invariant)
team_history[home_id].append((score_home, score_away))
team_history[away_id].append((score_away, score_home))
# Final batch
if updates:
cur.executemany("""
UPDATE football_ai_features
SET home_goals_avg_5 = %s,
home_conceded_avg_5 = %s,
home_clean_sheet_rate = %s,
home_scoring_rate = %s,
away_goals_avg_5 = %s,
away_conceded_avg_5 = %s,
away_clean_sheet_rate = %s,
away_scoring_rate = %s
WHERE match_id = %s
""", updates)
updated += len(updates)
conn.commit()
print(f" ✅ Updated form features for {updated} matches in {time.time()-t0:.1f}s")
return updated
def main():
print("🚀 Football AI Features — Implied Odds & Form Backfill")
print("=" * 60)
conn = get_conn()
try:
odds_updated = update_implied_odds(conn)
form_updated = update_form_features(conn)
print(f"\n✅ DONE!")
print(f" Implied odds updated: {odds_updated} matches")
print(f" Form features updated: {form_updated} matches")
finally:
conn.close()
if __name__ == "__main__":
main()
+79 -21
View File
@@ -16,6 +16,7 @@ import re
import time import time
import math import math
import os import os
import pickle
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from collections import defaultdict from collections import defaultdict
@@ -258,6 +259,51 @@ class SingleMatchOrchestrator:
self._v27 = None self._v27 = None
return None return None
def _get_score_model(self) -> Optional[Dict]:
"""Load XGBoost score prediction model (non-fatal)."""
if hasattr(self, "_score_model_cache"):
return self._score_model_cache
score_model_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"models", "xgb_score.pkl",
)
try:
if os.path.exists(score_model_path):
with open(score_model_path, "rb") as f:
model_data = pickle.load(f)
if all(k in model_data for k in ("home_model", "away_model", "ht_home_model", "ht_away_model", "features")):
self._score_model_cache = model_data
print(f"[SCORE] ✅ Score model loaded ({len(model_data['features'])} features)")
return self._score_model_cache
except Exception as e:
print(f"[SCORE] ⚠ Load failed (non-fatal, using heuristic): {e}")
self._score_model_cache = None
return None
def _predict_score_with_model(self, features: Dict[str, float]) -> Optional[Dict[str, float]]:
"""Predict FT/HT scores using XGBoost score model."""
score_model = self._get_score_model()
if score_model is None:
return None
try:
import pandas as _pd
model_features = score_model["features"]
row = {f: float(features.get(f, 0)) for f in model_features}
df = _pd.DataFrame([row])
ft_home = max(0.0, float(score_model["home_model"].predict(df)[0]))
ft_away = max(0.0, float(score_model["away_model"].predict(df)[0]))
ht_home = max(0.0, float(score_model["ht_home_model"].predict(df)[0]))
ht_away = max(0.0, float(score_model["ht_away_model"].predict(df)[0]))
return {
"ft_home": round(ft_home, 2),
"ft_away": round(ft_away, 2),
"ht_home": round(ht_home, 2),
"ht_away": round(ht_away, 2),
}
except Exception as e:
print(f"[SCORE] ⚠ Prediction error (fallback to heuristic): {e}")
return None
def _build_v25_features(self, data: MatchData) -> Dict[str, float]: def _build_v25_features(self, data: MatchData) -> Dict[str, float]:
""" """
Build the single authoritative V25 pre-match feature vector. Build the single authoritative V25 pre-match feature vector.
@@ -869,27 +915,39 @@ class SingleMatchOrchestrator:
prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs) prediction.handicap_pick, hcap_top = self._best_prob_pick(hcap_probs)
prediction.handicap_confidence = hcap_top * 100.0 prediction.handicap_confidence = hcap_top * 100.0
base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0) # ── Score Prediction: Model-first, heuristic fallback ──────────
base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0) score_result = self._predict_score_with_model(features)
ms_edge = prediction.ms_home_prob - prediction.ms_away_prob if score_result is not None:
total_target = max( # ML model predicted scores
1.4, prediction.home_xg = score_result["ft_home"]
min( prediction.away_xg = score_result["ft_away"]
4.8, prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
(float(features.get("league_avg_goals", 2.7)) * 0.55) ht_home_xg = score_result["ht_home"]
+ ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45) ht_away_xg = score_result["ht_away"]
+ ((prediction.over_25_prob - prediction.under_25_prob) * 1.15), prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
), prediction.predicted_ht_score = f"{int(round(ht_home_xg))}-{int(round(ht_away_xg))}"
) else:
home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) # Heuristic fallback (original formula)
away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18)) base_home_xg = max(0.25, (float(data.home_goals_avg) + float(features.get("away_xga", data.away_conceded_avg))) / 2.0)
scale = total_target / max(home_xg + away_xg, 0.1) base_away_xg = max(0.25, (float(data.away_goals_avg) + float(features.get("home_xga", data.home_conceded_avg))) / 2.0)
prediction.home_xg = round(home_xg * scale, 2) ms_edge = prediction.ms_home_prob - prediction.ms_away_prob
prediction.away_xg = round(away_xg * scale, 2) total_target = max(
prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2) 1.4,
min(
prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}" 4.8,
prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}" (float(features.get("league_avg_goals", 2.7)) * 0.55)
+ ((float(data.home_goals_avg) + float(data.away_goals_avg)) * 0.45)
+ ((prediction.over_25_prob - prediction.under_25_prob) * 1.15),
),
)
home_xg = max(0.2, base_home_xg + (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
away_xg = max(0.2, base_away_xg - (ms_edge * 0.55) + ((prediction.btts_yes_prob - 0.5) * 0.18))
scale = total_target / max(home_xg + away_xg, 0.1)
prediction.home_xg = round(home_xg * scale, 2)
prediction.away_xg = round(away_xg * scale, 2)
prediction.total_xg = round(prediction.home_xg + prediction.away_xg, 2)
prediction.predicted_ft_score = f"{int(round(prediction.home_xg))}-{int(round(prediction.away_xg))}"
prediction.predicted_ht_score = f"{int(round(prediction.home_xg * 0.45))}-{int(round(prediction.away_xg * 0.45))}"
prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg) prediction.ft_scores_top5 = self._poisson_score_top5(prediction.home_xg, prediction.away_xg)
max_market_conf = max( max_market_conf = max(