163 lines
7.5 KiB
Python
163 lines
7.5 KiB
Python
"""
|
|
Market Calibration Scan — find where the ODDS THEMSELVES are systematically wrong.
=================================================================================
The legit, measurable version of "odds şike" (odds-based match-fixing): pockets
(leagues / teams / bands) where the market's implied probability does NOT match
realized frequency, so a SIMPLE rule (no model) is +EV. This is pure market
inefficiency — soft pricing in obscure leagues, persistent team bias, etc.

Discipline against false 'rigged' pockets (the multiple-comparison trap):
* split history by time into HALF-1 (discover) and HALF-2 (validate)
* a pocket counts ONLY if it is +EV in BOTH halves with enough bets each
* report realized-vs-implied gap (the miscalibration) + ROI

No model. Just odds vs outcomes. Read-only on the training CSV (104k matches
with odds). Forward 'suspicious line movement' detection needs odds_history
(currently empty) — separate, forward-only.

Usage: python scripts/market_calibration.py --min-bets 120 --fav-max 2.5
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse, os, sys
|
|
import numpy as np, pandas as pd
|
|
|
|
# Force UTF-8 on stdout where the stream supports it, so the "✓" marks and
# box characters printed by the report do not crash on legacy consoles.
# Failure to reconfigure is deliberately ignored (best effort).
_stdout = sys.stdout
if _stdout and hasattr(_stdout, "reconfigure"):
    try:
        _stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

# Directory one level above the folder that holds this script.
_SCRIPT_PATH = os.path.abspath(__file__)
AI_DIR = os.path.dirname(os.path.dirname(_SCRIPT_PATH))
# Training data CSV that the scan reads.
CSV = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
|
|
|
|
|
def league_names(ids):
    """Best-effort lookup of league display names from the project database.

    Args:
        ids: Iterable of league ids. ``None`` entries are dropped and the
            remaining ids are stringified before being passed to the query.

    Returns:
        A dict mapping ``str(id)`` to the ``name`` column for every matching
        row of ``leagues``. An empty dict is returned when there is nothing
        to look up, when the DB helper / ``psycopg2`` cannot be imported, or
        when every connection attempt fails; a short note is then written to
        stderr so the fallback to raw ids in the report is not silent.
    """
    attempts = 3
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            # Nothing to resolve: skip the imports and the DB round-trip.
            return {}

        # Keep the project root first on sys.path (so ``data.db`` resolves)
        # without stacking a duplicate entry on every call.
        if not sys.path or sys.path[0] != AI_DIR:
            sys.path.insert(0, AI_DIR)
        import time

        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor

        last_error = None
        for attempt in range(attempts):
            try:
                conn = psycopg2.connect(get_clean_dsn())
                try:
                    # ``with conn`` only ends the transaction; it does not
                    # close the connection, hence the explicit close below.
                    with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
                        cur.execute("SELECT id,name FROM leagues WHERE id = ANY(%s)", (wanted,))
                        return {str(r["id"]): r["name"] for r in cur.fetchall()}
                finally:
                    conn.close()
            except Exception as exc:
                last_error = exc
                # Back off before retrying, but not after the final attempt.
                if attempt < attempts - 1:
                    time.sleep(1)
        print(f"league_names: lookup failed after {attempts} attempts: {last_error!r}",
              file=sys.stderr)
    except Exception as exc:
        # Import/setup problems: stay best-effort, but say why names are missing.
        print(f"league_names: lookup unavailable: {exc!r}", file=sys.stderr)
    return {}
|
|
|
|
|
|
def team_names(ids):
    """Best-effort lookup of team display names from the project database.

    Args:
        ids: Iterable of team ids. ``None`` entries are dropped and the
            remaining ids are stringified before being passed to the query.

    Returns:
        A dict mapping ``str(id)`` to the ``name`` column for every matching
        row of ``teams``. An empty dict is returned when there is nothing to
        look up, when the DB helper / ``psycopg2`` cannot be imported, or
        when every connection attempt fails; a short note is then written to
        stderr so the fallback to raw ids in the report is not silent.
    """
    attempts = 3
    try:
        wanted = [str(i) for i in ids if i is not None]
        if not wanted:
            # Nothing to resolve: skip the imports and the DB round-trip.
            return {}

        # Keep the project root first on sys.path (so ``data.db`` resolves)
        # without stacking a duplicate entry on every call.
        if not sys.path or sys.path[0] != AI_DIR:
            sys.path.insert(0, AI_DIR)
        import time

        from data.db import get_clean_dsn
        import psycopg2
        from psycopg2.extras import RealDictCursor

        last_error = None
        for attempt in range(attempts):
            try:
                conn = psycopg2.connect(get_clean_dsn())
                try:
                    # ``with conn`` only ends the transaction; it does not
                    # close the connection, hence the explicit close below.
                    with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
                        cur.execute("SELECT id,name FROM teams WHERE id = ANY(%s)", (wanted,))
                        return {str(r["id"]): r["name"] for r in cur.fetchall()}
                finally:
                    conn.close()
            except Exception as exc:
                last_error = exc
                # Back off before retrying, but not after the final attempt.
                if attempt < attempts - 1:
                    time.sleep(1)
        print(f"team_names: lookup failed after {attempts} attempts: {last_error!r}",
              file=sys.stderr)
    except Exception as exc:
        # Import/setup problems: stay best-effort, but say why names are missing.
        print(f"team_names: lookup unavailable: {exc!r}", file=sys.stderr)
    return {}
|
|
|
|
|
|
def _pocket_rows(bets, groupcol, min_bets):
    """Summarise flat favourite bets per group, best worst-half ROI first.

    Args:
        bets: DataFrame with columns ``half`` (0/1), ``pnl``, ``won`` and
            ``implied`` plus the grouping column.
        groupcol: Name of the column to group by.
        min_bets: Minimum number of bets a group needs in EACH half.

    Returns:
        List of tuples ``(min_half_roi, key, n, roi, roi_half0, roi_half1,
        gap, both_positive)`` sorted descending, ROI and gap in percent.
    """
    rows = []
    for key, grp in bets.groupby(groupcol):
        first = grp[grp["half"] == 0]
        second = grp[grp["half"] == 1]
        if len(first) < min_bets or len(second) < min_bets:
            continue
        r0 = 100 * first["pnl"].mean()
        r1 = 100 * second["pnl"].mean()
        # Miscalibration gap: realized win rate minus mean 1/odds
        # (positive = the favourite wins more often than its price implies).
        gap = 100 * (grp["won"].mean() - grp["implied"].mean())
        rows.append((min(r0, r1), key, len(grp), 100 * grp["pnl"].mean(),
                     r0, r1, gap, r0 > 0 and r1 > 0))
    rows.sort(reverse=True)
    return rows


def _print_table_header(title, first_col):
    """Print the banner, column header and rule for one report table."""
    banner = "=" * 82
    print(f"\n{banner}\n{title}\n{banner}")
    print(f" {first_col:<30}{'n':>6}{'ROI%':>7}{'H1%':>7}{'H2%':>7}{'gap%':>7} ✓")
    print(" " + "-" * 72)


def _print_pocket(names, row):
    """Print one table line; the name falls back to the raw key when unknown."""
    _, key, n, roi, r0, r1, gap, both = row
    nm = (names.get(key, key) or key)[:28]
    mark = "✓" if both else ""
    print(f" {nm:<30}{n:>6}{roi:>+7.1f}{r0:>+7.1f}{r1:>+7.1f}{gap:>+7.1f} {mark}")


def main():
    """Run the calibration scan and print the league and team reports.

    Reads the training CSV, picks the lowest-priced 1X2 outcome of each match
    as the favourite, simulates a one-unit flat bet on it, splits the
    time-sorted matches into two halves by row position and reports, per
    league and per home-favourite team, the ROI in each half and the
    realized-vs-implied gap. Command-line options: ``--min-bets`` (minimum
    bets per half for a league; teams use ``max(25, min_bets // 3)``) and
    ``--fav-max`` (upper bound on the favourite's odds).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--min-bets", type=int, default=120, help="min bets PER HALF")
    ap.add_argument("--fav-max", type=float, default=2.5, help="only count favourites below this odds")
    args = ap.parse_args()

    df = pd.read_csv(CSV, low_memory=False,
                     usecols=["match_id", "league_id", "home_team_id", "away_team_id", "mst_utc",
                              "odds_ms_h", "odds_ms_d", "odds_ms_a", "score_home", "score_away"])
    df = df.sort_values("mst_utc").reset_index(drop=True)

    # Keep only matches whose two scores parse as numbers.
    sh = pd.to_numeric(df["score_home"], errors="coerce")
    sa = pd.to_numeric(df["score_away"], errors="coerce")
    ok = sh.notna() & sa.notna()
    df = df[ok].reset_index(drop=True)
    sh = sh[ok.values].values
    sa = sa[ok.values].values

    # Unparseable/missing odds become 0.0 and are rejected by ``valid`` below.
    odds = df[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].apply(pd.to_numeric, errors="coerce").fillna(0.0).values
    valid = (odds > 1.0).all(1)
    outcome = np.where(sh > sa, 0, np.where(sh == sa, 1, 2))  # 0 home,1 draw,2 away
    fav = odds.argmin(1)
    fav_odds = odds[np.arange(len(odds)), fav]
    fav_hit = fav == outcome
    fav_won = fav_hit.astype(float)
    # Rows with a 0.0 price yield inf here; they are dropped via ``use``, so
    # only the divide-by-zero warning is suppressed, not the value.
    with np.errstate(divide="ignore"):
        fav_implied = 1.0 / fav_odds
    pnl = np.where(fav_hit, fav_odds - 1.0, -1.0)
    half = (np.arange(len(df)) >= len(df) // 2).astype(int)  # 0=first half,1=second
    use = valid & (fav_odds <= args.fav_max)

    base = pd.DataFrame({
        "league": df["league_id"].astype(str).values,
        "home": df["home_team_id"].astype(str).values,
        "fav_is_home": (fav == 0),
        "won": fav_won, "implied": fav_implied, "pnl": pnl, "half": half, "use": use,
        "fav_odds": fav_odds,
    })
    b = base[base["use"]].copy()
    print(f"{len(b):,} favourite bets (odds<= {args.fav_max}); split into 2 time halves\n")
    print(f"GLOBAL favourite: realized={100*b['won'].mean():.1f}% implied={100*b['implied'].mean():.1f}% "
          f"ROI={100*b['pnl'].mean():+.2f}% (negative = vig; market roughly right)")

    # ---- by league -------------------------------------------------------
    league_rows = _pocket_rows(b, "league", args.min_bets)
    names = league_names([r[1] for r in league_rows[:40]])
    _print_table_header("BY LEAGUE (favourite flat bet) (✓ = +EV in BOTH halves, the only trustworthy ones)",
                        "name")
    shown = 0
    for row in league_rows:
        # After 20 lines only pockets that are +EV in both halves are listed.
        if shown >= 20 and not row[7]:
            continue
        _print_pocket(names, row)
        shown += 1
        if shown >= 25:
            break
    good_leagues = [r for r in league_rows if r[7]]
    print(f"\n -> {len(good_leagues)} league pockets are +EV in BOTH halves "
          f"(out of {len(league_rows)} with enough data)")

    # ---- by team ---------------------------------------------------------
    # team: only when the team is the home favourite (cleanest, most samples)
    bt = b[b["fav_is_home"]]
    team_min = max(25, args.min_bets // 3)
    team_rows = _pocket_rows(bt, "home", team_min)
    tn = team_names([r[1] for r in team_rows[:40]])
    _print_table_header("BY TEAM as HOME FAVOURITE (✓ = +EV both halves)", "team")
    for row in team_rows[:22]:
        _print_pocket(tn, row)
    good_teams = [r for r in team_rows if r[7]]
    print(f"\n -> {len(good_teams)} teams +EV in BOTH halves (out of {len(team_rows)})")
    print("\nREAD: ✓ pockets survived a time-split = candidate real inefficiencies (not noise).")
    print("Still forward-validate with CLV. No ✓ = market is efficient there; don't bet.")
|
|
|
|
|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|