gg3
Deploy Iddaai Backend / build-and-deploy (push) Successful in 35s

This commit is contained in:
2026-06-05 00:36:24 +03:00
parent b9700f9fda
commit 9e41407cb5
10 changed files with 1683 additions and 0 deletions
+112
View File
@@ -0,0 +1,112 @@
"""
Train Favorite-Policy Model (v1) — leak-free MS model for the validated strategy.
================================================================================
Trains a LEAK-FREE 1X2 model (drops the result-encoding columns) and saves it
plus the feature list and policy metadata. This is the brain of the new system;
the favourite-band value policy (odds ~1.5-2.2, model_prob>implied, flat stake)
is applied on top of its probabilities at serving time.
Honest holdout: trains on the first (1 - --holdout-frac) of history, evaluates the
EXACT policy on the most recent slice (never seen in training), then retrains
on ALL history for the saved production artifact.
Saves to models/favorite_v1/: model.json, feature_cols.json, metadata.json
Usage: python scripts/train_favorite_model.py
"""
from __future__ import annotations
import argparse, json, os, sys, datetime
import numpy as np, pandas as pd, xgboost as xgb
# Best-effort switch of stdout to UTF-8. Presumably this is so the non-ASCII
# characters this script prints (e.g. the check-mark in the final status line)
# do not fail on consoles whose default encoding is not UTF-8. Any failure to
# reconfigure is deliberately ignored: the script should still run.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass
# Project root: the directory two levels above this script file.
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Input training table and output directory for the saved artifacts.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
OUT = os.path.join(AI_DIR, "models", "favorite_v1")

# Identifier, timestamp and score columns; never used as model features.
META = {
    "match_id",
    "home_team_id",
    "away_team_id",
    "league_id",
    "mst_utc",
    "score_home",
    "score_away",
    "ht_score_home",
    "ht_score_away",
}

# Result-encoding leakage — never feed these to the model (train OR serve).
LEAKY = {
    "home_goals_form",
    "away_goals_form",
    "total_goals",
    "ht_total_goals",
    "squad_diff",
    "home_squad_quality",
    "away_squad_quality",
    "referee_home_bias",
    "referee_avg_goals",
}

# XGBoost booster parameters: 3-class soft-probability output (home/draw/away).
PARAMS = dict(
    objective="multi:softprob",
    num_class=3,
    max_depth=5,
    eta=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    verbosity=0,
)
def policy_eval(P, y, O, lo, hi, margin):
    """Back-test the flat-stake value policy on a batch of matches.

    For each match the outcome with the largest edge is selected, where edge is
    the model probability minus the raw implied probability ``1 / odds``. A
    1-unit bet is placed when that edge is strictly greater than ``margin`` and
    the selected odds lie in the half-open band ``[lo, hi)``.

    Args:
        P: Array of shape (n, k) with the model probability of each outcome.
        y: Array of length n with the index of the outcome that occurred.
        O: Array of shape (n, k) with decimal odds. Entries that are not
            greater than 1.0 (including a 0.0 placeholder for missing odds)
            are treated as unavailable and get a fixed edge of -9.0.
        lo: Inclusive lower bound on the selected odds.
        hi: Exclusive upper bound on the selected odds.
        margin: Edge threshold a pick must exceed to be bet.

    Returns:
        Dict of plain Python numbers: ``bets`` (number of bets), ``hit_pct``
        (win rate in percent), ``roi_pct`` (profit per unit staked in percent)
        and ``net_u`` (net profit in units). With no bets all rates are 0.0.
    """
    P = np.asarray(P, dtype=float)
    O = np.asarray(O, dtype=float)
    y = np.asarray(y)

    # Divide only where the odds are usable; a blanket 1.0 / O would emit a
    # divide-by-zero RuntimeWarning for every missing-odds placeholder.
    usable = O > 1.0
    implied = np.full(O.shape, np.nan)
    np.divide(1.0, O, out=implied, where=usable)
    edge = np.where(usable, P - implied, -9.0)

    rows = np.arange(len(y))
    pick = edge.argmax(axis=1)
    pick_edge = edge[rows, pick]
    pick_odds = O[rows, pick]

    bet = (pick_edge > margin) & (pick_odds >= lo) & (pick_odds < hi)
    win = bet & (pick == y)
    # A winning bet returns odds - 1 units, a losing bet costs the 1-unit stake.
    pnl = np.where(win, pick_odds - 1.0, -1.0)[bet]

    # Convert to built-in int/float so the result prints and serialises
    # cleanly (no ``np.float64(...)`` wrappers in reports).
    n_bets = int(bet.sum())
    n_wins = int(win.sum())
    net = float(pnl.sum())
    denom = max(n_bets, 1)  # avoid division by zero when nothing was bet
    return {
        "bets": n_bets,
        "hit_pct": round(100 * n_wins / denom, 1),
        "roi_pct": round(100 * net / denom, 2),
        "net_u": round(net, 1),
    }
def main():
    """Train the model, report a chronological holdout back-test, save artifacts.

    Steps:
      1. Load CSV, sort by ``mst_utc`` and keep only rows whose home and away
         scores parse as numbers.
      2. Build labels (0 = home win, 1 = draw, 2 = away win), the odds matrix
         and the feature matrix: every column that is not in META, not in
         LEAKY and not prefixed ``label_``. Non-numeric or missing values
         become 0.0.
      3. Train on the earliest ``1 - holdout_frac`` share of rows, then score
         accuracy and the betting policy on the remaining, most recent rows.
      4. Retrain on all rows and write model.json, feature_cols.json and
         metadata.json into OUT.

    Exits with an argparse usage error when ``--holdout-frac`` is not strictly
    between 0 and 1, and raises SystemExit when the labelled data cannot be
    split into non-empty train and holdout parts.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--lo", type=float, default=1.5,
                    help="inclusive lower bound of the odds band (default: %(default)s)")
    ap.add_argument("--hi", type=float, default=2.2,
                    help="exclusive upper bound of the odds band (default: %(default)s)")
    ap.add_argument("--margin", type=float, default=0.0,
                    help="edge a pick must exceed to be bet (default: %(default)s)")
    ap.add_argument("--holdout-frac", type=float, default=0.15,
                    help="share of the most recent rows held out for evaluation, "
                         "strictly between 0 and 1 (default: %(default)s)")
    ap.add_argument("--estimators", type=int, default=300,
                    help="number of boosting rounds (default: %(default)s)")
    args = ap.parse_args()
    # A fraction of 0 or 1 (or outside that range) would leave the training or
    # the holdout slice empty and fail later with an obscure error.
    if not 0.0 < args.holdout_frac < 1.0:
        ap.error("--holdout-frac must be strictly between 0 and 1")

    print(f"Loading {CSV} ...")
    df = pd.read_csv(CSV, low_memory=False).sort_values("mst_utc").reset_index(drop=True)

    # Keep only matches with a usable final score; they define the label.
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df, sh, sa = df[ok].reset_index(drop=True), sh[ok.values].values, sa[ok.values].values
    y = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))

    # Missing or unparsable odds become 0.0.
    O = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    feats = [c for c in df.columns if c not in META and not c.startswith("label_") and c not in LEAKY]
    X = df[feats].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    print(f"  {len(df):,} rows, {len(feats)} leak-free features")

    # ── Honest holdout (last slice, never trained on) ──
    cut = int(len(df) * (1 - args.holdout_frac))
    if cut <= 0 or cut >= len(df):
        raise SystemExit(
            f"Not enough labelled rows ({len(df)}) to split with "
            f"--holdout-frac {args.holdout_frac}"
        )
    bst = xgb.train(PARAMS, xgb.DMatrix(X[:cut], label=y[:cut]), num_boost_round=args.estimators)
    Ph = bst.predict(xgb.DMatrix(X[cut:]))
    acc = float((Ph.argmax(1) == y[cut:]).mean())
    hold = policy_eval(Ph, y[cut:], O[cut:], args.lo, args.hi, args.margin)
    print(f"\nHOLDOUT (last {args.holdout_frac:.0%}, {len(df)-cut:,} matches, never seen):")
    print(f"  MS accuracy: {acc*100:.1f}%")
    print(f"  POLICY band[{args.lo},{args.hi}] margin {args.margin}: {hold}")

    # ── Production model: retrain on ALL history ──
    print("\nTraining production model on ALL history ...")
    final = xgb.train(PARAMS, xgb.DMatrix(X, label=y), num_boost_round=args.estimators)
    os.makedirs(OUT, exist_ok=True)
    final.save_model(os.path.join(OUT, "model.json"))
    with open(os.path.join(OUT, "feature_cols.json"), "w", encoding="utf-8") as f:
        json.dump(feats, f, ensure_ascii=False, indent=2)
    meta = {
        "version": "favorite_v1",
        # Timezone-aware local time, so the timestamp carries its UTC offset.
        "trained_at": datetime.datetime.now().astimezone().isoformat(timespec="seconds"),
        "market": "MS",
        "classes": {"0": "home(1)", "1": "draw(X)", "2": "away(2)"},
        "policy": {"odds_lo": args.lo, "odds_hi": args.hi, "margin": args.margin,
                   "stake": "flat 1u", "rule": "bet model's max value edge if picked odds in band",
                   "never": ["longshots odds>=hi", "parlays/combos"]},
        "n_train": len(df), "n_features": len(feats),
        "leaky_excluded": sorted(LEAKY),
        "holdout_eval": {"accuracy_pct": round(acc*100, 1), **hold},
        "caveat": "CSV odds are a static capture, not verified closing. Forward paper-trade with real CLV before staking.",
    }
    with open(os.path.join(OUT, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Saved production model to {OUT}/")
    print(f"   model.json, feature_cols.json ({len(feats)} feats), metadata.json")
    print("\nNEXT: serving wrapper that loads this + applies the policy to upcoming")
    print("matches, logs paper-trade picks, and we measure real forward CLV/ROI.")


if __name__ == "__main__":
    main()