@@ -0,0 +1,162 @@
|
||||
"""
|
||||
Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong.
|
||||
=================================================================================
|
||||
The legit, measurable version of "odds şike" (Turkish for match-fixing): pockets (leagues / teams / bands)
|
||||
where the market's implied probability does NOT match realized frequency, so a
|
||||
SIMPLE rule (no model) is +EV. This is pure market inefficiency — soft pricing
|
||||
in obscure leagues, persistent team bias, etc.
|
||||
|
||||
Discipline against false 'rigged' pockets (the multiple-comparison trap):
|
||||
* split history by time into HALF-1 (discover) and HALF-2 (validate)
|
||||
* a pocket counts ONLY if it is +EV in BOTH halves with enough bets each
|
||||
* report realized-vs-implied gap (the miscalibration) + ROI
|
||||
|
||||
No model. Just odds vs outcomes. Read-only on the training CSV (104k matches
|
||||
with odds). Forward 'suspicious line movement' detection needs odds_history
|
||||
(currently empty) — separate, forward-only.
|
||||
|
||||
Usage: python scripts/market_calibration.py --min-bets 120 --fav-max 2.5
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, os, sys
|
||||
import numpy as np, pandas as pd
|
||||
|
||||
# Best-effort: switch stdout to UTF-8 so the non-ASCII glyphs used in the
# report can be printed; any failure to reconfigure is deliberately ignored.
if sys.stdout and hasattr(sys.stdout, "reconfigure"):
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# AI_DIR is two directory levels above this file; the input CSV lives in
# its "data" sub-directory.
_THIS_FILE = os.path.abspath(__file__)
_SCRIPT_DIR = os.path.dirname(_THIS_FILE)
AI_DIR = os.path.dirname(_SCRIPT_DIR)
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
|
||||
|
||||
# Name-lookup queries. The table name is fixed in the literal; only the id
# list is passed as a bound parameter.
_LEAGUE_NAME_SQL = "SELECT id,name FROM leagues WHERE id = ANY(%s)"
_TEAM_NAME_SQL = "SELECT id,name FROM teams WHERE id = ANY(%s)"


def _lookup_names(query, ids, attempts=3, delay=1.0):
    """Run *query* for *ids* and return ``{str(id): name}``; ``{}`` on failure.

    The lookup only decorates the printed report, so it is best-effort by
    design: a missing driver, an unreachable database or a failing query
    never raises, it just yields an empty mapping (callers then fall back
    to printing the raw id).

    Args:
        query: SQL text with a single ``%s`` placeholder for the id list.
        ids: iterable of ids; ``None`` entries are dropped and the rest are
            converted with ``str``.
        attempts: how many times to try connecting and querying.
        delay: seconds to wait between two attempts.

    Returns:
        A dict mapping ``str(row["id"])`` to ``row["name"]`` for every row
        returned by the query, or ``{}``.
    """
    try:
        wanted = [str(i) for i in ids if i is not None]
    except TypeError:
        # ``ids`` was not iterable; keep the "never raise" contract.
        return {}
    if not wanted:
        # Nothing to resolve: do not open a database connection at all.
        return {}

    try:
        # ``data.db`` is imported relative to AI_DIR; add it once instead of
        # growing sys.path on every call.
        if AI_DIR not in sys.path:
            sys.path.insert(0, AI_DIR)
        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor
    except Exception:
        # Optional dependency / project module unavailable: no names.
        return {}

    import time

    for attempt in range(attempts):
        try:
            conn = psycopg2.connect(get_clean_dsn())
            try:
                # ``with conn`` only ends the transaction; the connection
                # itself has to be closed explicitly (see ``finally``).
                with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute(query, (wanted,))
                    return {str(r["id"]): r["name"] for r in cur.fetchall()}
            finally:
                conn.close()
        except Exception:
            # Deliberate best-effort retry; do not sleep after the last try.
            if attempt + 1 < attempts:
                time.sleep(delay)
    return {}


def league_names(ids):
    """Return ``{str(league_id): name}`` for *ids*, or ``{}`` if unavailable."""
    return _lookup_names(_LEAGUE_NAME_SQL, ids)


def team_names(ids):
    """Return ``{str(team_id): name}`` for *ids*, or ``{}`` if unavailable."""
    return _lookup_names(_TEAM_NAME_SQL, ids)
|
||||
|
||||
|
||||
def _pocket_rows(bets, groupcol, min_bets):
    """Compute per-group ROI rows for every group with enough bets per half.

    Args:
        bets: DataFrame with the columns ``half`` (0/1), ``pnl``, ``won`` and
            ``implied`` plus the grouping column.
        groupcol: name of the column to group by.
        min_bets: minimum number of bets required in EACH half.

    Returns:
        A list of tuples
        ``(min(roi_h1, roi_h2), key, n, roi, roi_h1, roi_h2, gap, both_pos)``
        sorted in descending order. ROI and gap values are percentages;
        ``gap`` is realized win rate minus mean implied probability and
        ``both_pos`` is true when the ROI is positive in both halves.
    """
    rows = []
    for key, grp in bets.groupby(groupcol):
        first = grp[grp["half"] == 0]
        second = grp[grp["half"] == 1]
        if len(first) < min_bets or len(second) < min_bets:
            continue
        r0 = 100 * first["pnl"].mean()
        r1 = 100 * second["pnl"].mean()
        # miscalibration gap: realized - implied
        # (positive = market underprices the favourite)
        gap = 100 * (grp["won"].mean() - grp["implied"].mean())
        both_pos = r0 > 0 and r1 > 0
        rows.append((min(r0, r1), key, len(grp), 100 * grp["pnl"].mean(), r0, r1, gap, both_pos))
    rows.sort(reverse=True)
    return rows


def main():
    """Scan the training CSV for pockets where flat-betting the favourite is +EV.

    Reads the match CSV, keeps matches with both scores and three valid
    (> 1.0) 1X2 odds whose lowest odds do not exceed ``--fav-max``, splits
    the time-sorted rows into two halves and prints, per league and per
    home-favourite team, the ROI of a unit stake on the favourite in each
    half. A pocket is marked only if the ROI is positive in both halves.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF")
    ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds")
    args = ap.parse_args()

    df = pd.read_csv(
        CSV,
        low_memory=False,
        usecols=["match_id", "league_id", "home_team_id", "away_team_id", "mst_utc",
                 "odds_ms_h", "odds_ms_d", "odds_ms_a", "score_home", "score_away"],
    )
    df = df.sort_values("mst_utc").reset_index(drop=True)

    # Keep only matches whose two scores parse as numbers.
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    sh = sh[ok.values].values
    sa = sa[ok.values].values

    # Unparseable odds become 0.0 and are excluded through ``valid`` below.
    O = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    valid = (O > 1.0).all(1)
    outcome = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))  # 0 home,1 draw,2 away
    fav = O.argmin(1)
    fav_odds = O[np.arange(len(O)), fav]
    fav_won = (fav == outcome).astype(float)
    # Rows with a 0.0 placeholder would trigger a divide-by-zero warning; they
    # are dropped by ``use`` anyway, so silence the warning only.
    with np.errstate(divide="ignore"):
        fav_implied = 1.0 / fav_odds
    pnl = np.where(fav_won, fav_odds - 1.0, -1.0)
    # The time split is positional over all scored matches (already sorted).
    half = (np.arange(len(df)) >= len(df) // 2).astype(int)  # 0=first half,1=second
    use = valid & (fav_odds <= args.fav_max)

    # NOTE(review): if an id column is parsed as float, astype(str) yields
    # values like "12.0", which would not match the name lookup keys - confirm.
    base = pd.DataFrame({
        "league": df["league_id"].astype(str).values,
        "home": df["home_team_id"].astype(str).values,
        "fav_is_home": (fav == 0),
        "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use,
        "fav_odds": fav_odds,
    })
    b = base[base["use"]].copy()
    print(f"{len(b):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n")
    print(f"GLOBAL favourite: realized={100*b['won'].mean():.1f}% implied={100*b['implied'].mean():.1f}% "
          f"ROI={100*b['pnl'].mean():+.2f}% (negative = vig; market roughly right)")

    def scan(groupcol, label, namefn, min_bets):
        """Print the table for ``b`` grouped by *groupcol*; return the pockets that are +EV in both halves."""
        rows = _pocket_rows(b, groupcol, min_bets)
        # Names are only resolved for the top 40 rows; others print their id.
        names = namefn([r[1] for r in rows[:40]])
        print(f"\n{'='*82}\n{label} (✓ = +EV in BOTH halves, the only trustworthy ones)\n{'='*82}")
        print(f" {'name':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7} ✓")
        print(" " + "-" * 72)
        shown = 0
        for _, key, n, roi, r0, r1, gap, both in rows:
            # After 20 lines only pockets that pass both halves are listed,
            # and the table is capped at 25 lines.
            if shown >= 20 and not both:
                continue
            nm = (names.get(key, key) or key)[:28]
            mark = "✓" if both else ""
            print(f" {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f} {mark}")
            shown += 1
            if shown >= 25:
                break
        good = [r for r in rows if r[7]]
        # Use the grouping column as the noun; the first word of the label is
        # "BY", which produced "by pockets" in the summary line.
        print(f"\n -> {len(good)} {groupcol} pockets are +EV in BOTH halves "
              f"(out of {len(rows)} with enough data)")
        return good

    scan("league", "BY LEAGUE (favourite flat bet)", league_names, args.min_bets)

    # team: only when the team is the home favourite (cleanest, most samples)
    bt = b[b["fav_is_home"]]
    # Teams have fewer bets than leagues, so the per-half minimum is relaxed.
    team_min = max(25, args.min_bets // 3)
    rows = _pocket_rows(bt, "home", team_min)
    tn = team_names([r[1] for r in rows[:40]])
    print(f"\n{'='*82}\nBY TEAM as HOME FAVOURITE (✓ = +EV both halves)\n{'='*82}")
    print(f" {'team':<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7} ✓")
    print(" " + "-" * 72)
    for _, key, n, roi, r0, r1, gap, both in rows[:22]:
        nm = (tn.get(key, key) or key)[:28]
        mark = "✓" if both else ""
        print(f" {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f} {mark}")
    good = [r for r in rows if r[7]]
    print(f"\n -> {len(good)} teams +EV in BOTH halves (out of {len(rows)})")
    print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).")
    print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.")
|
||||
|
||||
|
||||
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user