gg
This commit is contained in:
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
V28 — CONDITIONAL FREQUENCY ENGINE
|
||||
====================================
|
||||
User's strategy automated at scale:
|
||||
|
||||
For every match (e.g. Beşiktaş vs Konya):
|
||||
1. Look at Beşiktaş's HOME history when their MS1 odds were in the same band (e.g. 1.30-1.40)
|
||||
→ What % of those matches ended OU 1.5 over? OU 2.5 over? MS1?
|
||||
2. Look at Konya's AWAY history when their MS2 odds were in the same band (e.g. 2.00-2.20)
|
||||
→ Same questions
|
||||
3. COMBINE both signals:
|
||||
→ If BOTH teams historically produce >80% OU1.5 over at these odds → BET OU1.5 over
|
||||
→ This is the user's exact Excel strategy, now running on 104K matches
|
||||
|
||||
CRITICAL: Only uses PAST matches for each prediction (no future leakage)
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# ─── Load Data ───
print("Loading data...")
df = pd.read_csv('data/training_data_v27.csv', low_memory=False)

# Identifier / text columns that keep their raw values. Every other column is
# coerced to numeric, with unparseable cells becoming NaN.
KEEP_STR = ['match_id', 'league_name', 'home_team', 'away_team',
            'home_team_id', 'away_team_id', 'league_id', 'mst_utc']
for c in df.columns:
    if c not in KEEP_STR:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Ensure chronological order by kick-off time (mst_utc).
# The expanding-window features below rely on row order being time order.
if 'mst_utc' in df.columns:
    df['mst_utc'] = pd.to_datetime(df['mst_utc'], errors='coerce')
    # sort_values puts NaT last, which would place undated matches at the very
    # end of the timeline (inside the test window). A match that cannot be
    # placed in time cannot be used without risking leakage, so drop it.
    undated = int(df['mst_utc'].isna().sum())
    if undated:
        print(f"Dropping {undated} matches with missing/unparseable mst_utc")
        df = df[df['mst_utc'].notna()]
    # mergesort is stable: matches with identical kick-off keep file order,
    # so the run is deterministic.
    df = df.sort_values('mst_utc', kind='mergesort').reset_index(drop=True)

# Filter: need valid odds + scores
df = df.dropna(subset=['odds_ms_h', 'odds_ms_a', 'score_home', 'score_away',
                       'home_team_id', 'away_team_id', 'label_ms'])
# Keep labels aligned with positions after rows were removed.
df = df.reset_index(drop=True)

# Compute actual goal labels
df['total_goals'] = df['score_home'] + df['score_away']
df['ou15_actual'] = (df['total_goals'] > 1.5).astype(int)
df['ou25_actual'] = (df['total_goals'] > 2.5).astype(int)
df['ou35_actual'] = (df['total_goals'] > 3.5).astype(int)
df['btts_actual'] = ((df['score_home'] > 0) & (df['score_away'] > 0)).astype(int)
df['ms_result'] = df['label_ms'].astype(int)  # 0=H, 1=D, 2=A

N = len(df)
print(f"Total matches: {N}")
print(f"Unique home teams: {df.home_team_id.nunique()}")
print(f"Unique away teams: {df.away_team_id.nunique()}")
|
||||
|
||||
# ─── Odds Band Helper ───
|
||||
def get_odds_band(odds, band_width=0.10):
    """Return the half-open band ``(lower, upper)`` of width *band_width* that contains *odds*.

    ``lower`` is the largest multiple of *band_width* not exceeding *odds*
    (the value is floored, not rounded to the nearest band), e.g.
    1.35 → (1.30, 1.40) and 1.40 → (1.40, 1.50).

    Args:
        odds: decimal odds.
        band_width: width of each band (default 0.10).

    Returns:
        Tuple ``(lower, upper)``, both rounded to 2 decimals.
    """
    # Binary floats put exact band edges just below an integer quotient
    # (1.4 / 0.1 == 13.999999999999998), which a bare floor would push into the
    # band below. Rounding the quotient first removes that representation
    # noise; 9 decimals is far finer than any quoted odds precision.
    steps = np.floor(round(odds / band_width, 9))
    lower = round(steps * band_width, 2)
    upper = round(lower + band_width, 2)
    return (lower, upper)
|
||||
|
||||
# Half-open (lower, upper) bands used for everything below the top band.
_WIDE_BANDS = (
    (1.01, 1.50),
    (1.50, 2.00),
    (2.00, 2.50),
    (2.50, 3.00),
    (3.00, 4.00),
    (4.00, 6.00),
)


def get_odds_band_wide(odds):
    """Map *odds* to a coarse band, for teams with too little history per narrow band.

    E.g. 1.35 → (1.01, 1.50), 2.20 → (2.00, 2.50).

    Bands are half-open: a value equal to an upper edge belongs to the next
    band. Anything below 1.50 (including values under 1.01) maps to the first
    band. The top band is open-ended, (6.00, inf), so that odds of 20.00 and
    above still satisfy ``lower <= odds < upper`` for the band they are
    assigned to. A NaN input fails every comparison and therefore also falls
    through to the top band; callers are expected to filter NaN beforehand.

    Returns:
        Tuple ``(lower, upper)``.
    """
    for band in _WIDE_BANDS:
        if odds < band[1]:
            return band
    return (6.00, float('inf'))
|
||||
|
||||
# ─── Build Conditional Frequency Lookup (Expanding Window) ───
print("\nBuilding conditional frequency features (expanding window)...")

# Features for each match are computed from strictly earlier rows only: the
# lookup for row i happens before row i is added to the running totals.
MIN_MATCHES = 5  # minimum historical matches to generate a signal

# Pre-allocate feature arrays
feat_names = [
    'home_ou15_rate_at_band', 'home_ou25_rate_at_band', 'home_ou35_rate_at_band',
    'home_btts_rate_at_band', 'home_win_rate_at_band', 'home_n_at_band',
    'away_ou15_rate_at_band', 'away_ou25_rate_at_band', 'away_ou35_rate_at_band',
    'away_btts_rate_at_band', 'away_win_rate_at_band', 'away_n_at_band',
    'combined_ou15', 'combined_ou25', 'combined_ou35', 'combined_btts',
    'home_goals_at_band', 'away_goals_at_band', 'combined_goals_at_band',
    'home_conceded_at_band', 'away_conceded_at_band',
]
features = np.full((N, len(feat_names)), np.nan)

# Column positions are derived from feat_names so the writes below cannot
# drift out of sync with the declared feature order.
COL = {name: j for j, name in enumerate(feat_names)}

# Destination columns per side, in the order of slots 1..7 of a totals vector:
# ou15, ou25, ou35, btts, win, goals scored, goals conceded.
HOME_STAT_COLS = [COL['home_ou15_rate_at_band'], COL['home_ou25_rate_at_band'],
                  COL['home_ou35_rate_at_band'], COL['home_btts_rate_at_band'],
                  COL['home_win_rate_at_band'], COL['home_goals_at_band'],
                  COL['home_conceded_at_band']]
AWAY_STAT_COLS = [COL['away_ou15_rate_at_band'], COL['away_ou25_rate_at_band'],
                  COL['away_ou35_rate_at_band'], COL['away_btts_rate_at_band'],
                  COL['away_win_rate_at_band'], COL['away_goals_at_band'],
                  COL['away_conceded_at_band']]

# Running totals: (team_id, odds_band) → float vector
#   [n, ou15, ou25, ou35, btts, wins, goals_scored, goals_conceded]
# Keying on the band returned by get_odds_band_wide guarantees a match is
# stored under exactly the band a later lookup will ask for, and makes each
# lookup O(1) instead of rescanning the team's whole history.
home_history = {}  # team performances when playing HOME
away_history = {}  # team performances when playing AWAY

# Pull the needed columns out once; per-row df.iloc access is very slow.
home_ids = df['home_team_id'].to_numpy()
away_ids = df['away_team_id'].to_numpy()
home_odds = df['odds_ms_h'].to_numpy(dtype=float)
away_odds = df['odds_ms_a'].to_numpy(dtype=float)
goals_home = df['score_home'].to_numpy(dtype=float)
goals_away = df['score_away'].to_numpy(dtype=float)
ms_label = df['label_ms'].to_numpy(dtype=float)

for i in range(N):
    h_odds = home_odds[i]
    a_odds = away_odds[i]
    if np.isnan(h_odds) or np.isnan(a_odds):
        continue

    h_key = (home_ids[i], get_odds_band_wide(h_odds))
    a_key = (away_ids[i], get_odds_band_wide(a_odds))

    # ── Look up HOME team's historical performance at this odds band ──
    h_tot = home_history.get(h_key)
    h_ready = h_tot is not None and h_tot[0] >= MIN_MATCHES
    if h_ready:
        features[i, HOME_STAT_COLS] = h_tot[1:] / h_tot[0]  # rates / averages
        features[i, COL['home_n_at_band']] = h_tot[0]

    # ── Look up AWAY team's historical performance at this odds band ──
    a_tot = away_history.get(a_key)
    a_ready = a_tot is not None and a_tot[0] >= MIN_MATCHES
    if a_ready:
        features[i, AWAY_STAT_COLS] = a_tot[1:] / a_tot[0]
        features[i, COL['away_n_at_band']] = a_tot[0]

    # ── Combined signals (only when both sides have enough history) ──
    if h_ready and a_ready:
        for mkt in ('ou15', 'ou25', 'ou35', 'btts'):
            features[i, COL[f'combined_{mkt}']] = (
                features[i, COL[f'home_{mkt}_rate_at_band']]
                + features[i, COL[f'away_{mkt}_rate_at_band']]
            ) / 2
        features[i, COL['combined_goals_at_band']] = (
            features[i, COL['home_goals_at_band']]
            + features[i, COL['away_goals_at_band']]
        )

    # ── Add THIS match to history (for future lookups) ──
    total = goals_home[i] + goals_away[i]
    shared = [total > 1.5, total > 2.5, total > 3.5,
              goals_home[i] > 0 and goals_away[i] > 0]
    home_row = np.array([1, *shared, ms_label[i] == 0,
                         goals_home[i], goals_away[i]], dtype=float)
    away_row = np.array([1, *shared, ms_label[i] == 2,
                         goals_away[i], goals_home[i]], dtype=float)

    if h_tot is None:
        home_history[h_key] = home_row
    else:
        h_tot += home_row  # in-place update of the stored totals
    if a_tot is None:
        away_history[a_key] = away_row
    else:
        a_tot += away_row

    if (i + 1) % 20000 == 0:
        valid = np.sum(~np.isnan(features[:i+1, COL['combined_ou15']]))
        print(f" Processed {i+1}/{N} matches, {valid} with combined signals")

# Count valid features
valid_mask = ~np.isnan(features[:, COL['combined_ou15']])
print(f"\nMatches with combined conditional signals: {valid_mask.sum()} / {N}")
|
||||
|
||||
# ─── BACKTEST: Walk-Forward ───
print("\n" + "="*70)
print(" CONDITIONAL FREQUENCY BACKTEST")
print("="*70)

# Only test on the last 30% of the data (the early part of the expanding
# window has too little history per team to produce meaningful signals).
test_start = int(N * 0.7)
test_idx = range(test_start, N)
test_valid = [i for i in test_idx if valid_mask[i]]
print(f"Test window: matches {test_start}-{N} ({len(test_valid)} with signals)")

# Positional index array so each market can be evaluated with array
# operations instead of per-row df.iloc lookups.
test_pos = np.asarray(test_valid, dtype=int)

# Strategy: flat 1-unit stake on the "over"/"yes" side whenever the combined
# historical rate reaches the threshold.
# Tuple layout: (display name, feature name, feature column, label column, odds column)
markets = [
    ('OU 1.5 Over', 'combined_ou15', 12, 'ou15_actual', 'odds_ou15_o'),
    ('OU 2.5 Over', 'combined_ou25', 13, 'ou25_actual', 'odds_ou25_o'),
    ('OU 3.5 Over', 'combined_ou35', 14, 'ou35_actual', 'odds_ou35_o'),
    ('BTTS Yes', 'combined_btts', 15, 'btts_actual', 'odds_btts_y'),
]

for market_name, feat_key, feat_idx, label_col, odds_col in markets:
    print(f"\n ── {market_name} ──")

    if odds_col not in df.columns:
        print(f" No odds column '{odds_col}', skipping")
        continue

    signal = features[test_pos, feat_idx]
    odds = df[odds_col].to_numpy(dtype=float)[test_pos]
    outcome = df[label_col].to_numpy(dtype=float)[test_pos]
    # NaN compares False, so rows with missing odds drop out of `playable`.
    playable = (odds >= 1.05) & ~np.isnan(outcome)
    won = outcome == 1
    # Profit per 1-unit stake: odds-1 on a win, -1 on a loss.
    profit = np.where(won, odds - 1, -1.0)

    for threshold in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90]:
        picked = playable & (signal >= threshold)
        bets = int(picked.sum())
        if bets >= 20:  # too few bets is noise; do not report
            wins = int((picked & won).sum())
            pnl = float(profit[picked].sum())
            roi = pnl / bets * 100
            hit = wins / bets * 100
            marker = " *** PROFITABLE ***" if roi > 0 else ""
            print(f" Threshold>{threshold:.2f}: {bets:5d} bets, "
                  f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")

# Also test MS (1X2) market
print(f"\n ── Maç Sonucu (1X2) ──")
# Back the home win when the average of the home team's win rate at its band
# and the away team's non-win rate (1 - away win rate) at its band reaches the
# threshold.
home_win_rate = features[test_pos, 4]
away_loss_rate = 1 - features[test_pos, 10]
combined = (home_win_rate + away_loss_rate) / 2  # NaN if either side is missing
ms_odds = df['odds_ms_h'].to_numpy(dtype=float)[test_pos]
ms_playable = (ms_odds >= 1.10) & (ms_odds <= 5.0)
ms_won = df['label_ms'].to_numpy(dtype=float)[test_pos] == 0
ms_profit = np.where(ms_won, ms_odds - 1, -1.0)

for threshold in [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]:
    picked = ms_playable & (combined >= threshold)
    bets = int(picked.sum())
    if bets >= 20:
        wins = int((picked & ms_won).sum())
        pnl = float(ms_profit[picked].sum())
        roi = pnl / bets * 100
        hit = wins / bets * 100
        marker = " *** PROFITABLE ***" if roi > 0 else ""
        print(f" Home win comb>{threshold:.2f}: {bets:5d} bets, "
              f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
|
||||
|
||||
# ─── DEEP DIVE: Best performing niches ───
print("\n" + "="*70)
print(" DEEP DIVE: Combined OU15 + Odds Value Filter")
print("="*70)

# The user's strategy: high confidence + the odds must pay enough.
# The column check is loop-invariant, so it is done once up front.
if 'odds_ou15_o' not in df.columns:
    print(" No odds column 'odds_ou15_o', skipping")
else:
    dd_pos = np.asarray(test_valid, dtype=int)
    dd_signal = features[dd_pos, 12]  # combined ou15
    dd_odds = df['odds_ou15_o'].to_numpy(dtype=float)[dd_pos]
    dd_won = df['ou15_actual'].to_numpy()[dd_pos] == 1
    # Profit per 1-unit stake: odds-1 on a win, -1 on a loss.
    dd_profit = np.where(dd_won, dd_odds - 1, -1.0)

    for threshold in [0.75, 0.80, 0.85, 0.90]:
        for min_odds in [1.10, 1.20, 1.30, 1.40]:
            # NaN signal or NaN odds compare False and are never picked.
            picked = (dd_signal >= threshold) & (dd_odds >= min_odds)
            bets = int(picked.sum())
            if bets < 30:
                continue
            wins = int((picked & dd_won).sum())
            pnl = float(dd_profit[picked].sum())
            roi = pnl / bets * 100
            hit = wins / bets * 100
            if roi > -5:  # show near-profitable too
                marker = " *** PROFITABLE ***" if roi > 0 else ""
                print(f" OU15 sig>{threshold:.2f} odds>{min_odds}: "
                      f"{bets:5d} bets, hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
|
||||
|
||||
# ─── Additional: Goal expectation accuracy ───
print("\n" + "="*70)
print(" GOAL PREDICTION ACCURACY")
print("="*70)

# Column 18 is combined_goals_at_band: home team's average goals scored at its
# band plus away team's average goals scored at its band.
valid_goals = [i for i in test_valid if not np.isnan(features[i, 18])]
if valid_goals:
    goal_pos = np.asarray(valid_goals, dtype=int)
    pred_goals = features[goal_pos, 18]
    actual_goals = df['total_goals'].to_numpy(dtype=float)[goal_pos]

    # Mean absolute error computed directly with numpy; no extra library is
    # needed for a one-line metric.
    mae = np.mean(np.abs(actual_goals - pred_goals))
    corr = np.corrcoef(pred_goals, actual_goals)[0, 1]
    print(f" Combined goal prediction MAE: {mae:.3f}")
    print(f" Correlation: {corr:.4f}")
    print(f" Avg predicted: {np.mean(pred_goals):.2f}, Avg actual: {np.mean(actual_goals):.2f}")

    # Bucket analysis: calibration of the prediction per predicted-goals range.
    # Predictions of 5.0 or more fall outside every bucket and are not shown.
    print("\n Goal prediction buckets:")
    over25 = df['ou25_actual'].to_numpy(dtype=float)[goal_pos]
    for low, high in [(0, 1.5), (1.5, 2.0), (2.0, 2.5), (2.5, 3.0), (3.0, 3.5), (3.5, 5.0)]:
        in_bucket = (pred_goals >= low) & (pred_goals < high)
        n_bucket = int(in_bucket.sum())
        if n_bucket >= 20:
            avg_actual = actual_goals[in_bucket].mean()
            ou25_rate = over25[in_bucket].mean()
            print(f" Predicted {low:.1f}-{high:.1f}: n={n_bucket}, "
                  f"actual_avg={avg_actual:.2f}, OU25%={ou25_rate*100:.1f}%")

print("\nDone!")
|
||||
Reference in New Issue
Block a user