gg
This commit is contained in:
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
V27 FINAL BACKTEST — Conservative Flat Bet
|
||||
Only the strongest validated edges. No Kelly compounding.
|
||||
"""
|
||||
import pandas as pd, numpy as np
|
||||
|
||||
# ── Load and clean the V27 training table ──
# Identifier / display columns are left as read; every other column is coerced
# to numeric, with unparseable cells becoming NaN.
TEXT_COLS = {'match_id', 'league_name', 'home_team', 'away_team'}

df = pd.read_csv('data/training_data_v27.csv', low_memory=False)
for c in df.columns:
    if c not in TEXT_COLS:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Keep only matches with a complete 1X2 price line where every price is > 1.01.
df = df.dropna(subset=['odds_ms_h', 'odds_ms_d', 'odds_ms_a'])
df = df[(df.odds_ms_h > 1.01) & (df.odds_ms_d > 1.01) & (df.odds_ms_a > 1.01)]

# NOTE(review): the rows are never sorted in this script, so any positional
# train/test split presumably relies on the CSV already being in
# chronological order — confirm against the extractor.
n = len(df)
|
||||
# Walk-forward evaluation over 5 equal positional slices of the table.
# Slices 0-1 are only ever used for training; slices 2, 3 and 4 are each
# tested with rules discovered on everything before them (i.e. on the first
# 40%, 60% and 80% of the rows respectively).
folds = 5
fold_size = n // folds
all_results = []  # one dict per placed bet, in placement order

print("="*65)
print(" V27 WALK-FORWARD FLAT-BET BACKTEST")
print("="*65)

for fold in range(2, folds):  # start from fold 2 so we have enough training data
    train_end = fold * fold_size
    test_start = train_end
    # The last fold absorbs the remainder rows left by the integer division.
    test_end = (fold + 1) * fold_size if fold < folds - 1 else n

    train_df = df.iloc[:train_end]
    test_df = df.iloc[test_start:test_end]

    print(f"\n --- Fold {fold}: train={len(train_df)}, test={len(test_df)} ---")

    # Discover REST edges from training data: home side rested more than `hr`
    # days AND away side rested fewer than `ar` days, backing home (0) or away (2).
    strategies = []
    for hr in [5, 7, 10, 14]:
        for ar in [3, 4, 5]:
            # The rest filter does not depend on the outcome being backed,
            # so the subset is built once per (hr, ar) pair.
            sub = train_df[(train_df.home_days_rest > hr) & (train_df.away_days_rest < ar)]
            if len(sub) < 50:
                continue
            for cls, col in [(0, 'odds_ms_h'), (2, 'odds_ms_a')]:
                rate = (sub.label_ms == cls).mean()
                avg_odds = sub[col].mean()
                ev = rate * avg_odds  # hit rate x average price; > 1 means profit on average
                if ev > 1.02:  # only strong edges (>2% edge)
                    strategies.append((hr, ar, cls, rate, avg_odds, ev, len(sub)))

    if not strategies:
        print(" No strong edges found in training data")
        continue

    # Apply best strategies to test
    strategies.sort(key=lambda x: x[5], reverse=True)
    best = strategies[:3]  # top 3 only

    fold_bets = 0
    fold_wins = 0
    fold_pnl = 0
    stake = 10  # flat 10 units

    for _, row in test_df.iterrows():
        # Rest data is a property of the match, not of the rule: check it once.
        if pd.isna(row.home_days_rest) or pd.isna(row.away_days_rest):
            continue

        # The rest rules are nested (e.g. ">10 days" implies ">7 days"), so one
        # match can satisfy several of the selected rules at once. A flat-stake
        # backtest must stake each selection once, so remember which outcomes
        # have already been backed on this match.
        backed = set()
        for hr, ar, cls, est_p, _, _, _ in best:
            if cls in backed:
                continue
            if row.home_days_rest <= hr or row.away_days_rest >= ar:
                continue
            odds_col = ['odds_ms_h', 'odds_ms_d', 'odds_ms_a'][cls]
            odds_val = row[odds_col]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue
            # Additional filter: only bet when odds give reasonable EV
            if est_p * odds_val < 1.0:
                continue

            backed.add(cls)
            won = (row.label_ms == cls)
            pnl = stake * (odds_val - 1) if won else -stake
            fold_bets += 1
            if won:
                fold_wins += 1
            fold_pnl += pnl
            all_results.append({'fold': fold, 'won': won, 'pnl': pnl,
                                'odds': odds_val, 'stake': stake,
                                'cls': ['H', 'D', 'A'][cls]})

    if fold_bets > 0:
        roi = fold_pnl / (fold_bets * stake) * 100
        print(f" Best strategies: {[(h,a,['H','D','A'][c],f'EV={e:.3f}') for h,a,c,_,_,e,_ in best]}")
        print(f" Bets: {fold_bets}, Wins: {fold_wins} ({fold_wins/fold_bets*100:.1f}%), "
              f"ROI: {roi:+.1f}%, PnL: {fold_pnl:+.0f}")
|
||||
|
||||
# Overall: aggregate every bet recorded in `all_results` across the folds.
print("\n" + "="*65)
print(" OVERALL RESULTS")
print("="*65)
if all_results:
    total = len(all_results)
    wins = sum(1 for r in all_results if r['won'])
    total_pnl = sum(r['pnl'] for r in all_results)
    total_staked = sum(r['stake'] for r in all_results)
    roi = total_pnl / total_staked * 100

    print(f" Total bets: {total}")
    print(f" Wins: {wins} ({wins/total*100:.1f}%)")
    print(f" Total staked: {total_staked:.0f}")
    print(f" PnL: {total_pnl:+.0f}")
    print(f" ROI: {roi:+.1f}%")
    print(f" Avg odds: {np.mean([r['odds'] for r in all_results]):.2f}")

    # By class (draws are never backed by the rules above, so only H and A)
    print("\n --- By Bet Type ---")
    for cls in ['H', 'A']:
        cb = [r for r in all_results if r['cls'] == cls]
        if cb:
            cw = sum(1 for r in cb if r['won'])
            cp = sum(r['pnl'] for r in cb)
            cs = sum(r['stake'] for r in cb)
            print(f" {cls}: {len(cb)} bets, hit={cw/len(cb)*100:.1f}%, ROI={cp/cs*100:+.1f}%")

    # Cumulative PnL curve, sampled at roughly 15 points. The running total is
    # computed once instead of re-summing a growing prefix for every sample.
    print("\n --- Cumulative PnL ---")
    running = np.cumsum([r['pnl'] for r in all_results])
    step = max(1, total // 15)
    for j in range(0, total, step):
        cum = running[j]
        print(f" After bet {j+1:4d}: PnL={cum:+.0f}")
    cum = running[-1]
    print(f" After bet {total:4d}: PnL={cum:+.0f} (FINAL)")
else:
    print(" No bets placed!")
|
||||
|
||||
# ── Now combine with MODEL for smarter filtering ──
# A bet is placed only when a rest rule fires AND the averaged model
# probability beats the margin-free implied probability by at least 3 points.
print("\n" + "="*65)
print(" COMBINED: Rest Rules + Fundamentals Model")
print("="*65)

import pickle
import json
from pathlib import Path

MODELS_DIR = Path("models/v27")

# Context manager so the handle is closed deterministically.
with open(MODELS_DIR / "v27_feature_cols.json", encoding="utf-8") as fh:
    feat_cols = json.load(fh)

ms_models = {}
for name in ['xgb', 'lgb', 'cb']:
    p = MODELS_DIR / f"v27_ms_{name}.pkl"
    if p.exists():
        # NOTE: unpickling executes arbitrary code — only load model files
        # produced by this project's own training run.
        with open(p, 'rb') as f:
            ms_models[name] = pickle.load(f)

if ms_models:
    # Evaluate on the last 20% of rows.
    test_df = df.iloc[int(n*0.8):].copy()
    X_test = test_df[feat_cols].values

    # Get model predictions (each entry is expected to be an (n, 3) array)
    preds = []
    for name, m in ms_models.items():
        if name == 'xgb':
            import xgboost as xgb
            dm = xgb.DMatrix(X_test, feature_names=feat_cols)
            preds.append(m.predict(dm))
        elif name == 'lgb':
            preds.append(m.predict(X_test))
        elif name == 'cb':
            preds.append(m.predict_proba(X_test))
    model_probs = np.mean(preds, axis=0)  # (n, 3)

    # Implied probabilities: inverse odds normalised by the book's overround.
    margin = 1/test_df.odds_ms_h.values + 1/test_df.odds_ms_d.values + 1/test_df.odds_ms_a.values
    impl = np.column_stack([
        (1/test_df.odds_ms_h.values)/margin,
        (1/test_df.odds_ms_d.values)/margin,
        (1/test_df.odds_ms_a.values)/margin,
    ])

    rest_rules = [(14, 5), (10, 5), (7, 5), (5, 5)]  # (home rest >, away rest <)
    combo_stake = 10
    combo_bets = 0
    combo_wins = 0
    combo_pnl = 0

    for j in range(len(test_df)):
        row = test_df.iloc[j]
        if pd.isna(row.home_days_rest) or pd.isna(row.away_days_rest):
            continue
        # The rules are nested, so a well-rested home side satisfies several of
        # them at once. Nothing below depends on WHICH rule fired, so each
        # match is considered once instead of once per matching rule.
        if not any(row.home_days_rest > hr and row.away_days_rest < ar
                   for hr, ar in rest_rules):
            continue

        for cls in [0, 2]:
            odds_val = [row.odds_ms_h, row.odds_ms_d, row.odds_ms_a][cls]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue

            model_p = model_probs[j, cls]
            impl_p = impl[j, cls]

            # DOUBLE FILTER: rest rule + model agrees (model_prob > implied)
            if model_p <= impl_p:
                continue  # model disagrees, skip
            edge = model_p - impl_p
            if edge < 0.03:
                continue  # too small

            won = (row.label_ms == cls)
            pnl = combo_stake * (odds_val - 1) if won else -combo_stake
            combo_bets += 1
            if won:
                combo_wins += 1
            combo_pnl += pnl

    if combo_bets > 0:
        roi = combo_pnl / (combo_bets * combo_stake) * 100
        print(f" Bets: {combo_bets}")
        print(f" Wins: {combo_wins} ({combo_wins/combo_bets*100:.1f}%)")
        print(f" PnL: {combo_pnl:+.0f}")
        print(f" ROI: {roi:+.1f}%")
    else:
        print(" No combined bets triggered")
|
||||
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
V28 — CONDITIONAL FREQUENCY ENGINE
|
||||
====================================
|
||||
User's strategy automated at scale:
|
||||
|
||||
For every match (e.g. Beşiktaş vs Konya):
|
||||
1. Look at Beşiktaş's HOME history when their MS1 odds were in the same band (e.g. 1.30-1.40)
|
||||
→ What % of those matches ended OU 1.5 over? OU 2.5 over? MS1?
|
||||
2. Look at Konya's AWAY history when their MS2 odds were in the same band (e.g. 2.00-2.20)
|
||||
→ Same questions
|
||||
3. COMBINE both signals:
|
||||
→ If BOTH teams historically produce >80% OU1.5 over at these odds → BET OU1.5 over
|
||||
→ This is the user's exact Excel strategy, now running on 104K matches
|
||||
|
||||
CRITICAL: Only uses PAST matches for each prediction (no future leakage)
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# ─── Load Data ───
print("Loading data...")
df = pd.read_csv('data/training_data_v27.csv', low_memory=False)

# Columns kept as read; everything else is coerced to numeric (bad cells -> NaN).
KEEP_STR = ['match_id', 'league_name', 'home_team', 'away_team',
            'home_team_id', 'away_team_id', 'league_id', 'mst_utc']
for c in df.columns:
    if c not in KEEP_STR:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Ensure chronological order by kickoff time when the column is available.
# A stable sort keeps the file order for matches sharing the same timestamp,
# so repeated runs process them in the same sequence.
if 'mst_utc' in df.columns:
    df['mst_utc'] = pd.to_datetime(df['mst_utc'], errors='coerce')
    df = df.sort_values('mst_utc', kind='stable').reset_index(drop=True)

# Filter: need valid odds + scores
df = df.dropna(subset=['odds_ms_h', 'odds_ms_a', 'score_home', 'score_away',
                       'home_team_id', 'away_team_id', 'label_ms'])

# Compute actual goal labels
df['total_goals'] = df['score_home'] + df['score_away']
df['ou15_actual'] = (df['total_goals'] > 1.5).astype(int)
df['ou25_actual'] = (df['total_goals'] > 2.5).astype(int)
df['ou35_actual'] = (df['total_goals'] > 3.5).astype(int)
df['btts_actual'] = ((df['score_home'] > 0) & (df['score_away'] > 0)).astype(int)
df['ms_result'] = df['label_ms'].astype(int)  # 0=H, 1=D, 2=A

N = len(df)
print(f"Total matches: {N}")
print(f"Unique home teams: {df.home_team_id.nunique()}")
print(f"Unique away teams: {df.away_team_id.nunique()}")
|
||||
|
||||
# ─── Odds Band Helper ───
|
||||
def get_odds_band(odds, band_width=0.10):
    """Return the half-open band ``(lower, upper)`` of width *band_width* containing *odds*.

    The band is found by flooring, not rounding: 1.35 -> (1.30, 1.40) and
    1.40 -> (1.40, 1.50).

    Args:
        odds: Decimal odds.
        band_width: Width of each band.

    Returns:
        ``(lower, upper)``, both rounded to two decimals.
    """
    # Binary floats make exact boundaries land just below the integer
    # (1.40 / 0.10 == 13.999999999999998), which would floor into the band
    # below. Rounding the quotient first removes that representation noise.
    steps = np.floor(round(odds / band_width, 9))
    lower = round(steps * band_width, 2)
    upper = round(lower + band_width, 2)
    return (lower, upper)
|
||||
|
||||
def get_odds_band_wide(odds):
    """Return the coarse odds band ``(lower, upper)`` for *odds*.

    Bands widen as the price grows. E.g. 1.35 -> (1.01, 1.50),
    2.10 -> (2.00, 2.50), 7.00 -> (6.00, 20.00).

    Anything below 1.50 (including prices under 1.01) maps to the first band,
    and anything from 6.00 upwards (including prices of 20.00 or more) maps to
    the last one.
    """
    # (exclusive ceiling, band) pairs in ascending order; first match wins.
    ladder = (
        (1.50, (1.01, 1.50)),
        (2.00, (1.50, 2.00)),
        (2.50, (2.00, 2.50)),
        (3.00, (2.50, 3.00)),
        (4.00, (3.00, 4.00)),
        (6.00, (4.00, 6.00)),
    )
    for ceiling, band in ladder:
        if odds < ceiling:
            return band
    return (6.00, 20.00)
|
||||
|
||||
# ─── Build Conditional Frequency Lookup (Expanding Window) ───
print("\nBuilding conditional frequency features (expanding window)...")

# Each match's features are computed from the ledgers BEFORE the match itself
# is appended to them, so only earlier rows contribute.
MIN_MATCHES = 5  # minimum historical matches to generate a signal

# Column order of `features`; later sections address columns by position.
feat_names = [
    'home_ou15_rate_at_band', 'home_ou25_rate_at_band', 'home_ou35_rate_at_band',
    'home_btts_rate_at_band', 'home_win_rate_at_band', 'home_n_at_band',
    'away_ou15_rate_at_band', 'away_ou25_rate_at_band', 'away_ou35_rate_at_band',
    'away_btts_rate_at_band', 'away_win_rate_at_band', 'away_n_at_band',
    'combined_ou15', 'combined_ou25', 'combined_ou35', 'combined_btts',
    'home_goals_at_band', 'away_goals_at_band', 'combined_goals_at_band',
    'home_conceded_at_band', 'away_conceded_at_band',
]
features = np.full((N, len(feat_names)), np.nan)

# Destination columns for one side's summary, in the order returned by
# _band_summary: ou15, ou25, ou35, btts, win rate, n, goals scored, goals conceded.
HOME_SLOTS = [0, 1, 2, 3, 4, 5, 16, 19]
AWAY_SLOTS = [6, 7, 8, 9, 10, 11, 17, 20]

# Historical ledger: team_id → list of
# (odds, ou15, ou25, ou35, btts, won, goals_scored, goals_conceded)
home_history = defaultdict(list)  # team performances when playing HOME
away_history = defaultdict(list)  # team performances when playing AWAY


def _band_summary(ledger, band):
    """Summarise the ledger entries whose odds fall in ``[band[0], band[1])``.

    Returns None when fewer than MIN_MATCHES entries qualify; otherwise a list
    [ou15, ou25, ou35, btts, win rate, n, avg goals scored, avg goals conceded].
    """
    lo, hi = band
    hits = [x for x in ledger if lo <= x[0] < hi]
    if len(hits) < MIN_MATCHES:
        return None
    means = [np.mean([x[k] for x in hits]) for k in range(1, 8)]
    return means[:5] + [len(hits)] + means[5:]


# Pull the needed columns out once; indexing a DataFrame row by row is far
# slower than indexing plain arrays.
_home_ids = df['home_team_id'].to_numpy()
_away_ids = df['away_team_id'].to_numpy()
_home_odds = df['odds_ms_h'].to_numpy()
_away_odds = df['odds_ms_a'].to_numpy()
_goals_home = df['score_home'].to_numpy()
_goals_away = df['score_away'].to_numpy()
_labels = df['label_ms'].to_numpy()

for i in range(N):
    ht_id = _home_ids[i]
    at_id = _away_ids[i]
    h_odds = _home_odds[i]
    a_odds = _away_odds[i]

    if pd.isna(h_odds) or pd.isna(a_odds):
        continue

    h_band = get_odds_band_wide(h_odds)
    a_band = get_odds_band_wide(a_odds)

    # ── Look up HOME team's historical home performance at this odds band ──
    home_stats = _band_summary(home_history[ht_id], h_band)
    if home_stats is not None:
        features[i, HOME_SLOTS] = home_stats

    # ── Look up AWAY team's historical away performance at this odds band ──
    away_stats = _band_summary(away_history[at_id], a_band)
    if away_stats is not None:
        features[i, AWAY_SLOTS] = away_stats

    # ── Combined signals (only when both sides have enough history) ──
    if home_stats is not None and away_stats is not None:
        features[i, 12] = (features[i, 0] + features[i, 6]) / 2  # combined ou15
        features[i, 13] = (features[i, 1] + features[i, 7]) / 2  # combined ou25
        features[i, 14] = (features[i, 2] + features[i, 8]) / 2  # combined ou35
        features[i, 15] = (features[i, 3] + features[i, 9]) / 2  # combined btts
        # home side's avg goals scored + away side's avg goals scored
        features[i, 18] = features[i, 16] + features[i, 17]

    # ── Add THIS match to history (for future lookups) ──
    goals_h = _goals_home[i]
    goals_a = _goals_away[i]
    total_goals = goals_h + goals_a
    ou15 = int(total_goals > 1.5)
    ou25 = int(total_goals > 2.5)
    ou35 = int(total_goals > 3.5)
    btts = int(goals_h > 0 and goals_a > 0)
    h_won = int(_labels[i] == 0)
    a_won = int(_labels[i] == 2)

    home_history[ht_id].append((h_odds, ou15, ou25, ou35, btts, h_won,
                                goals_h, goals_a))
    away_history[at_id].append((a_odds, ou15, ou25, ou35, btts, a_won,
                                goals_a, goals_h))

    if (i+1) % 20000 == 0:
        valid = np.sum(~np.isnan(features[:i+1, 12]))
        print(f" Processed {i+1}/{N} matches, {valid} with combined signals")

# Count valid features
valid_mask = ~np.isnan(features[:, 12])
print(f"\nMatches with combined conditional signals: {valid_mask.sum()} / {N}")
|
||||
|
||||
# ─── BACKTEST: Walk-Forward ───
print("\n" + "="*70)
print(" CONDITIONAL FREQUENCY BACKTEST")
print("="*70)

# Only test on the last 30% of data (to avoid early sparse data)
test_start = int(N * 0.7)
test_idx = range(test_start, N)
test_valid = [i for i in test_idx if valid_mask[i]]
print(f"Test window: matches {test_start}-{N} ({len(test_valid)} with signals)")

# Strategy: bet "over"/"yes" when the combined historical rate >= threshold.
# (display name, feature name, feature column, outcome column, price column)
markets = [
    ('OU 1.5 Over', 'combined_ou15', 12, 'ou15_actual', 'odds_ou15_o'),
    ('OU 2.5 Over', 'combined_ou25', 13, 'ou25_actual', 'odds_ou25_o'),
    ('OU 3.5 Over', 'combined_ou35', 14, 'ou35_actual', 'odds_ou35_o'),
    ('BTTS Yes', 'combined_btts', 15, 'btts_actual', 'odds_btts_y'),
]


def _settle_unit_stakes(prices, won):
    """Return (bets, wins, pnl) for one-unit stakes at *prices* given the boolean array *won*."""
    n_bets = len(prices)
    n_wins = int(won.sum())
    profit = float((prices[won] - 1).sum()) - (n_bets - n_wins)
    return n_bets, n_wins, profit


for market_name, feat_key, feat_idx, label_col, odds_col in markets:
    print(f"\n ── {market_name} ──")

    if odds_col not in df.columns:
        print(f" No odds column '{odds_col}', skipping")
        continue

    # Slice the test window once per market instead of per bet.
    signal = features[test_valid, feat_idx]
    prices = df[odds_col].to_numpy()[test_valid]
    outcome = df[label_col].to_numpy()[test_valid]

    for threshold in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90]:
        # Comparisons against NaN are False, so rows with a missing signal or
        # price drop out of the selection automatically.
        take = (signal >= threshold) & (prices >= 1.05) & ~pd.isna(outcome)
        bets, wins, pnl = _settle_unit_stakes(prices[take], outcome[take] == 1)

        if bets >= 20:
            roi = pnl / bets * 100
            hit = wins / bets * 100
            marker = " *** PROFITABLE ***" if roi > 0 else ""
            print(f" Threshold>{threshold:.2f}: {bets:5d} bets, "
                  f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")

# Also test MS (1X2) market
print(f"\n ── Maç Sonucu (1X2) ──")
# Back the home win when the average of the home side's win rate at its band
# and the away side's NON-win rate at its band (1 - away win rate, so draws
# count too) reaches the threshold.
home_price = df['odds_ms_h'].to_numpy()[test_valid]
home_won = df['label_ms'].to_numpy()[test_valid] == 0
combined = (features[test_valid, 4] + (1 - features[test_valid, 10])) / 2

for threshold in [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]:
    take = (combined >= threshold) & (home_price >= 1.10) & (home_price <= 5.0)
    bets, wins, pnl = _settle_unit_stakes(home_price[take], home_won[take])
    if bets >= 20:
        roi = pnl / bets * 100
        hit = wins / bets * 100
        marker = " *** PROFITABLE ***" if roi > 0 else ""
        print(f" Home win comb>{threshold:.2f}: {bets:5d} bets, "
              f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
|
||||
|
||||
# ─── DEEP DIVE: Best performing niches ───
print("\n" + "="*70)
print(" DEEP DIVE: Combined OU15 + Odds Value Filter")
print("="*70)

# The user's strategy: high confidence + the odds must pay enough.
# Whether the price column exists is decided once, not per bet; when it is
# missing every price is NaN and no bet can qualify.
ou15_signal = features[test_valid, 12]  # combined ou15
if 'odds_ou15_o' in df.columns:
    ou15_price = df['odds_ou15_o'].to_numpy()[test_valid]
else:
    ou15_price = np.full(len(test_valid), np.nan)
ou15_hit = df['ou15_actual'].to_numpy()[test_valid] == 1

for threshold in [0.75, 0.80, 0.85, 0.90]:
    for min_odds in [1.10, 1.20, 1.30, 1.40]:
        # NaN signals / prices compare False and are therefore skipped.
        take = (ou15_signal >= threshold) & (ou15_price >= min_odds)
        staked_prices = ou15_price[take]
        staked_hits = ou15_hit[take]

        bets = len(staked_prices)
        wins = int(staked_hits.sum())
        pnl = float((staked_prices[staked_hits] - 1).sum()) - (bets - wins)

        if bets >= 30:
            roi = pnl / bets * 100
            hit = wins / bets * 100
            if roi > -5:  # show near-profitable too
                marker = " *** PROFITABLE ***" if roi > 0 else ""
                print(f" OU15 sig>{threshold:.2f} odds>{min_odds}: "
                      f"{bets:5d} bets, hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
|
||||
|
||||
# ─── Additional: Goal expectation accuracy ───
# Compares the combined goal expectation (feature column 18) with the actual
# total goals over the test window.
print("\n" + "="*70)
print(" GOAL PREDICTION ACCURACY")
print("="*70)
valid_goals = [i for i in test_valid if not np.isnan(features[i, 18])]
if valid_goals:
    pred_goals = features[valid_goals, 18]
    actual_goals = df['total_goals'].to_numpy()[valid_goals]
    # Mean absolute error, computed directly with numpy.
    mae = np.mean(np.abs(actual_goals - pred_goals))
    corr = np.corrcoef(pred_goals, actual_goals)[0, 1]
    print(f" Combined goal prediction MAE: {mae:.3f}")
    print(f" Correlation: {corr:.4f}")
    print(f" Avg predicted: {np.mean(pred_goals):.2f}, Avg actual: {np.mean(actual_goals):.2f}")

    # Bucket analysis: actual outcomes grouped by predicted-goal range [low, high)
    print("\n Goal prediction buckets:")
    ou25_flags = df['ou25_actual'].to_numpy()[valid_goals]
    for low, high in [(0, 1.5), (1.5, 2.0), (2.0, 2.5), (2.5, 3.0), (3.0, 3.5), (3.5, 5.0)]:
        in_bucket = (pred_goals >= low) & (pred_goals < high)
        bucket_size = int(in_bucket.sum())
        if bucket_size >= 20:
            avg_actual = np.mean(actual_goals[in_bucket])
            ou25_rate = np.mean(ou25_flags[in_bucket])
            print(f" Predicted {low:.1f}-{high:.1f}: n={bucket_size}, "
                  f"actual_avg={avg_actual:.2f}, OU25%={ou25_rate*100:.1f}%")

print("\nDone!")
|
||||
@@ -1071,13 +1071,13 @@ class FeatureExtractor:
|
||||
|
||||
for mst, poss, sot, total_shots, corners, team_goals in rows:
|
||||
if poss and poss > 0:
|
||||
poss_sum += poss
|
||||
poss_sum += float(poss)
|
||||
poss_count += 1
|
||||
sot_sum += sot or 0
|
||||
shots_sum += total_shots or 0
|
||||
corners_sum += corners or 0
|
||||
sot_sum += float(sot or 0)
|
||||
shots_sum += float(total_shots or 0)
|
||||
corners_sum += float(corners or 0)
|
||||
|
||||
goals_scored += team_goals or 0
|
||||
goals_scored += float(team_goals or 0)
|
||||
|
||||
return {
|
||||
"possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50,
|
||||
|
||||
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
V27 Training Data Extraction - Value Sniper
|
||||
Extends V25 to ALL matches with odds (~104K).
|
||||
Adds rolling window, league quality, time, H2H, strength features.
|
||||
Usage: python3 scripts/extract_training_data_v27.py
|
||||
"""
|
||||
import os, sys, csv, time
|
||||
from collections import defaultdict
|
||||
|
||||
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, AI_DIR)
|
||||
|
||||
from scripts.extract_training_data import (
|
||||
BatchDataLoader as V25Loader,
|
||||
FeatureExtractor as V25Extractor,
|
||||
FEATURE_COLS as V25_COLS,
|
||||
get_conn,
|
||||
)
|
||||
from features.rolling_features import (
|
||||
calc_rolling_features, calc_league_quality,
|
||||
calc_time_features, calc_advanced_h2h, calc_strength_diff,
|
||||
)
|
||||
|
||||
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
|
||||
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
|
||||
|
||||
# Feature columns added by V27 on top of the V25 column set.
# NOTE(review): the extractor also produces away_rolling20_goals /
# away_rolling20_conceded, which are not listed here and are therefore dropped
# when the CSV is written with extrasaction='ignore' — confirm this is intended.
V27_NEW = [
    # rolling scoring form
    "home_rolling5_goals", "home_rolling5_conceded",
    "home_rolling10_goals", "home_rolling10_conceded",
    "home_rolling20_goals", "home_rolling20_conceded",
    "away_rolling5_goals", "away_rolling5_conceded",
    "away_rolling10_goals", "away_rolling10_conceded",
    "home_rolling5_cs", "away_rolling5_cs",
    "home_venue_goals", "home_venue_conceded",
    "away_venue_goals", "away_venue_conceded",
    "home_goal_trend", "away_goal_trend",
    # league quality
    "league_home_win_rate", "league_draw_rate",
    "league_btts_rate", "league_ou25_rate",
    "league_reliability_score",
    # time / scheduling
    "home_days_rest", "away_days_rest",
    "match_month", "is_season_start", "is_season_end",
    # head-to-head
    "h2h_home_goals_avg", "h2h_away_goals_avg",
    "h2h_recent_trend", "h2h_venue_advantage",
    # strength differentials
    "attack_vs_defense_home", "attack_vs_defense_away",
    "xg_diff", "form_momentum_interaction",
    "elo_form_consistency", "upset_x_elo_gap",
]
|
||||
ALL_COLS = V25_COLS + V27_NEW
|
||||
|
||||
|
||||
class V27Loader(V25Loader):
    """Load ALL matches with odds, not just top leagues.

    Replaces the V25 loader's queries with versions that carry no league
    filter, and adds a per-league match cache for the league-quality features.
    The ``_load_*`` methods are presumably invoked by the inherited
    ``load_all()`` — confirm against the V25 loader.
    """

    def __init__(self, conn):
        # The empty list is passed where the V25 loader takes its second
        # argument (presumably the league filter — confirm).
        super().__init__(conn, [])
        self.league_matches_cache = {}  # league_id -> list of match tuples

    def _load_matches(self):
        """Fetch every finished football match that has at least one odds category, oldest first."""
        self.cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
            m.score_home, m.score_away,
            m.ht_score_home, m.ht_score_away,
            m.mst_utc, m.league_id,
            ht.name, at.name, l.name
            FROM matches m
            JOIN teams ht ON m.home_team_id = ht.id
            JOIN teams at ON m.away_team_id = at.id
            JOIN leagues l ON m.league_id = l.id
            WHERE m.status='FT' AND m.score_home IS NOT NULL
            AND m.sport='football'
            AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
            ORDER BY m.mst_utc ASC
        """)
        self.matches = self.cur.fetchall()

    def _load_odds(self):
        """Fill ``self.odds_cache[match_id]`` with the supported market prices.

        Category and selection names are matched case-insensitively after
        stripping whitespace. Rows with a missing, non-numeric or non-positive
        price are skipped.
        """
        # Three-way result markets: category -> key prefix.
        three_way = {'maç sonucu': 'ms', '1. yarı sonucu': 'ht_ms'}
        # Selection label -> key suffix ('0' and 'X' both denote the draw).
        three_way_side = {'1': 'h', '0': 'd', 'x': 'd', '2': 'a'}
        # Under/over markets: category -> key prefix ('alt' = under, 'üst' = over).
        over_under = {
            '2,5 alt/üst': 'ou25',
            '1,5 alt/üst': 'ou15',
            '3,5 alt/üst': 'ou35',
            '0,5 alt/üst': 'ou05',
            '1. yarı 0,5 alt/üst': 'ht_ou05',
            '1. yarı 1,5 alt/üst': 'ht_ou15',
        }

        self.cur.execute("""
            SELECT oc.match_id, oc.name, os.name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
            JOIN matches m ON oc.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
        """)
        for mid, cat, sel, val in self.cur.fetchall():
            # Only the numeric conversion can legitimately fail, so only it is guarded.
            try:
                v = float(val) if val else 0
            except (ValueError, TypeError):
                continue
            if v <= 0 or not cat or not sel:
                continue

            # A match gets a cache entry as soon as it has any valid price,
            # even if none of its categories is one of the supported markets.
            o = self.odds_cache.setdefault(mid, {})
            c = cat.lower().strip()
            s = sel.lower().strip()

            if c in three_way:
                # Use the normalised selection so padded or lower-case labels
                # are recognised just like the other markets' selections.
                side = three_way_side.get(s)
                if side:
                    o[f"{three_way[c]}_{side}"] = v
            elif c == 'karşılıklı gol':
                if 'var' in s:
                    o['btts_y'] = v
                elif 'yok' in s:
                    o['btts_n'] = v
            elif c in over_under:
                if 'alt' in s:
                    o[f"{over_under[c]}_u"] = v
                elif 'üst' in s:
                    o[f"{over_under[c]}_o"] = v

    def _load_league_stats(self):
        """Cache each league's average goals, 0-0 rate and match count over all finished matches."""
        self.cur.execute("""
            SELECT league_id,
            AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
            COUNT(*)
            FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
            GROUP BY league_id
        """)
        for lid, ag, zr, cnt in self.cur.fetchall():
            # NOTE(review): the truthiness tests also replace a genuine 0
            # (e.g. a league with no 0-0 results) with the default — confirm
            # whether `is not None` was intended.
            self.league_stats_cache[lid] = {
                "avg_goals": float(ag) if ag else 2.5,
                "zero_rate": float(zr) if zr else 0.07,
                "match_count": cnt,
            }

    def _load_squad_data(self):
        """Build ``self.squad_cache[(match_id, team_id)]`` from line-ups and goal events.

        NOTE(review): the goal/assist counts come from the keyed match itself
        and "key players" are derived from goals across all finished matches;
        if these values are used as pre-match features for that same match
        they would leak the result — confirm how the extractor consumes them.
        """
        # Line-up size per (match, team): starters, total squad, starting forwards.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id,
            COUNT(*) FILTER(WHERE mpp.is_starting=true),
            COUNT(*),
            COUNT(*) FILTER(WHERE mpp.is_starting=true
            AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpp.match_id, mpp.team_id
        """)
        part = {}
        for mid, tid, st, tot, fwd in self.cur.fetchall():
            part[(mid, tid)] = {'starting_count': st or 0, 'total_squad': tot or 0, 'fwd_count': fwd or 0}

        # Goals, distinct assisters and distinct scorers per (match, team);
        # events whose subtype mentions a missed penalty are not counted as goals.
        self.cur.execute("""
            SELECT mpe.match_id, mpe.team_id,
            COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
            COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
            COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpe.match_id, mpe.team_id
        """)
        evts = {}
        for mid, tid, g, a, sc in self.cur.fetchall():
            evts[(mid, tid)] = {'goals': g or 0, 'assists': a or 0, 'unique_scorers': sc or 0}

        # "Key players": anyone with at least 3 goals for the team.
        self.cur.execute("""
            SELECT mpe.team_id, mpe.player_id, COUNT(*)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
            AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
            GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
        """)
        kp_by_team = defaultdict(set)
        for tid, pid, _ in self.cur.fetchall():
            kp_by_team[tid].add(pid)

        # Starting players per (match, team).
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id, mpp.player_id
            FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
            WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
        """)
        starters = defaultdict(list)
        for mid, tid, pid in self.cur.fetchall():
            starters[(mid, tid)].append(pid)

        for key in set(part) | set(evts):
            mid, tid = key
            p = part.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0})
            e = evts.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0})
            s = starters.get(key, [])
            team_key_players = kp_by_team.get(tid, set())
            kp_in = sum(1 for x in s if x in team_key_players)
            kp_tot = len(team_key_players)
            kp_miss = max(0, kp_tot - kp_in)
            # Weighted blend of line-up size, output and key-player presence.
            sq = p['starting_count']*0.3 + e['goals']*2.0 + e['assists']*1.0 + kp_in*3.0 + p['fwd_count']*1.5
            # Share of the team's key players not in the starting line-up, capped at 1.
            mi = min(kp_miss/max(kp_tot, 1), 1.0)
            self.squad_cache[key] = {'squad_quality': sq, 'key_players': kp_in, 'missing_impact': mi, 'goals_form': e['goals']}

    def _load_cards_data(self):
        """Cache a weighted card count per match: red cards weigh 2, every other card event 1."""
        self.cur.execute("""
            SELECT mpe.match_id,
            SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
            WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
            GROUP BY mpe.match_id
        """)
        for mid, cw in self.cur.fetchall():
            self.cards_cache[mid] = float(cw) if cw else 0.0

    def load_league_matches(self):
        """Group the loaded matches by league as ``(mst_utc, None, score_home, score_away, None)`` tuples.

        ``self.matches`` is ordered by kickoff time, so each league's list is
        chronological as well.
        """
        for m in self.matches:
            # m[8] = league_id, m[7] = mst_utc, m[3]/m[4] = full-time scores
            self.league_matches_cache.setdefault(m[8], []).append((m[7], None, m[3], m[4], None))
|
||||
|
||||
|
||||
class V27Extractor(V25Extractor):
    """Adds V27 features on top of V25."""

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
                     hn, an, ln):
        """Extend the V25 feature row for one match with the V27 feature groups.

        Takes the same arguments as the parent method and forwards them
        unchanged. Returns None when the parent produces no row; otherwise the
        parent's row dict with rolling, league-quality, time, head-to-head and
        strength-difference features added.
        """
        row = super()._extract_one(mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln)
        if not row:
            return None

        hm = self.loader.team_matches.get(hid, [])
        am = self.loader.team_matches.get(aid, [])

        # Rolling form for each side; the boolean presumably marks the side as
        # playing at home — confirm against calc_rolling_features.
        hr = calc_rolling_features(hm, mst, True)
        ar = calc_rolling_features(am, mst, False)
        # row-key suffix -> key in the helper's result
        rolling_fields = [
            ("rolling5_goals", "rolling5_goals_avg"),
            ("rolling5_conceded", "rolling5_conceded_avg"),
            ("rolling10_goals", "rolling10_goals_avg"),
            ("rolling10_conceded", "rolling10_conceded_avg"),
            ("rolling20_goals", "rolling20_goals_avg"),
            ("rolling20_conceded", "rolling20_conceded_avg"),
            ("rolling5_cs", "rolling5_clean_sheets"),
            ("venue_goals", "venue_goals_avg"),
            ("venue_conceded", "venue_conceded_avg"),
            ("goal_trend", "goal_trend"),
        ]
        for pfx, r in [("home", hr), ("away", ar)]:
            for suffix, source_key in rolling_fields:
                row[f"{pfx}_{suffix}"] = r[source_key]

        # League quality from this league's matches that kicked off before this one.
        lb = [x for x in self.loader.league_matches_cache.get(lid, []) if x[0] < mst]
        lq = calc_league_quality(lb)
        for k, v in lq.items():
            row[k] = v

        # Rest days per side; the calendar flags are taken from the home-side result.
        ht = calc_time_features(hm, mst)
        at = calc_time_features(am, mst)
        row["home_days_rest"] = ht["days_rest"]
        row["away_days_rest"] = at["days_rest"]
        row["match_month"] = ht["match_month"]
        row["is_season_start"] = ht["is_season_start"]
        row["is_season_end"] = ht["is_season_end"]

        h2h = calc_advanced_h2h(hm, hid, aid, mst)
        for k, v in h2h.items():
            row[k] = v

        # Strength differentials built from V25 values already in the row,
        # with neutral defaults when a V25 feature is absent.
        sd = calc_strength_diff(
            {"goals_avg": row.get("home_goals_avg", 1.3), "conceded_avg": row.get("home_conceded_avg", 1.2), "scoring_rate": row.get("home_scoring_rate", 0.75)},
            {"goals_avg": row.get("away_goals_avg", 1.3), "conceded_avg": row.get("away_conceded_avg", 1.2), "scoring_rate": row.get("away_scoring_rate", 0.75)},
            self.elo_ratings[hid], self.elo_ratings[aid],
            row.get("home_momentum_score", 0.5), row.get("away_momentum_score", 0.5),
            row.get("upset_potential", 0.0),
        )
        row.update(sd)
        return row
|
||||
|
||||
|
||||
def main():
    """Extract the V27 training set and write it to ``OUTPUT`` as CSV.

    Loads matches through ``V27Loader``, builds the feature rows with
    ``V27Extractor.extract_all()``, writes them restricted to ``ALL_COLS``
    and prints a short summary.  The database connection is closed on every
    exit path, including the early "no data" return and any exception.
    """
    print("🚀 V27 Value Sniper — Training Data Extraction")
    print("="*60)
    t0 = time.time()
    conn = get_conn()
    try:
        print("\n📦 Loading ALL odds-bearing matches...")
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" Matches: {len(loader.matches)}")
        print(f" Leagues: {len(loader.league_stats_cache)}")
        print(f" Odds: {len(loader.odds_cache)}")

        ext = V27Extractor(conn, loader)
        rows = ext.extract_all()
        if not rows:
            print("❌ No data!")
            return

        print(f"\n💾 Writing {len(rows)} rows...")
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            # extrasaction='ignore' drops any row keys that are not in ALL_COLS.
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
            w.writeheader()
            w.writerows(rows)

        n = len(rows)
        # `or 0` keeps the comparison valid when the key is present but None.
        wo = sum(1 for r in rows if (r.get("odds_ms_h") or 0) > 0)
        md = defaultdict(int)
        for r in rows:
            md[r["label_ms"]] += 1
        print(f"\n📊 Summary:")
        print(f" Rows: {n}")
        print(f" With odds: {wo} ({wo/n*100:.1f}%)")
        print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
        print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
        print(f" Time: {(time.time()-t0)/60:.1f}min")
        print(f"\n✅ Done! → {OUTPUT}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Strategy Generator — Senin Excel mantığını DB üzerinde otomatize eder.
|
||||
|
||||
Mantık:
|
||||
1. Ev sahibi takım X, evinde oran bandı Y'de oynadığında → OU1.5/OU2.5/BTTS oranları
|
||||
2. Deplasman takım Z, deplasmanda oran bandı W'de oynadığında → OU1.5/OU2.5/BTTS oranları
|
||||
3. İkisi de yüksekse → STRATEJİ ÜRET
|
||||
|
||||
Çıktı: Her maç için hangi bahis oynanabilir, neden, ve geçmiş başarı oranı
|
||||
"""
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
# DB connection.  Every setting can be overridden through the standard libpq
# environment variables; the literals are the previously hard-coded values,
# kept as fallbacks so the script still runs unchanged when nothing is set.
# NOTE(review): the password fallback is a credential committed to source
# control — rotate it and drop the fallback once PGPASSWORD is provisioned.
import os

conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "15432")),
    dbname=os.environ.get("PGDATABASE", "boilerplate_db"),
    user=os.environ.get("PGUSER", "suggestbet"),
    password=os.environ.get("PGPASSWORD", "SuGGesT2026SecuRe"),
)
|
||||
|
||||
print("=" * 70)
|
||||
print(" STRATEGY GENERATOR — Veritabanından Strateji Üretimi")
|
||||
print("=" * 70)
|
||||
|
||||
# 1. Tüm biten maçları, takım adları ve MS oranlarıyla çek
|
||||
query = """
|
||||
SELECT
|
||||
m.id as match_id,
|
||||
m.home_team_id,
|
||||
m.away_team_id,
|
||||
m.league_id,
|
||||
m.score_home,
|
||||
m.score_away,
|
||||
m.mst_utc,
|
||||
ht.name as home_team,
|
||||
at.name as away_team,
|
||||
l.name as league_name
|
||||
FROM matches m
|
||||
JOIN teams ht ON m.home_team_id = ht.id
|
||||
JOIN teams at ON m.away_team_id = at.id
|
||||
JOIN leagues l ON m.league_id = l.id
|
||||
WHERE m.status = 'FT'
|
||||
AND m.score_home IS NOT NULL
|
||||
ORDER BY m.mst_utc ASC
|
||||
"""
|
||||
df = pd.read_sql(query, conn)
|
||||
print(f"\nToplam biten maç: {len(df):,}")
|
||||
|
||||
# 2. Tüm oranları çek (MS, OU25, BTTS, OU15)
|
||||
odds_query = """
|
||||
SELECT
|
||||
oc.match_id,
|
||||
oc.name as market,
|
||||
os.name as selection,
|
||||
CAST(os.odd_value AS DECIMAL) as odds
|
||||
FROM odd_categories oc
|
||||
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
|
||||
WHERE oc.name IN (
|
||||
'Maç Sonucu',
|
||||
'2,5 Alt/Üst',
|
||||
'1,5 Alt/Üst',
|
||||
'3,5 Alt/Üst',
|
||||
'Karşılıklı Gol'
|
||||
)
|
||||
"""
|
||||
odds_df = pd.read_sql(odds_query, conn)
|
||||
print(f"Toplam oran kaydı: {len(odds_df):,}")
|
||||
|
||||
# Pivot: turn the per-match odds rows into columns
def get_odds(match_id, market, selection):
    """Return the first stored odd for (match_id, market, selection), or None.

    Scans the whole ``odds_df`` frame with boolean masks on every call; the
    dict-based ``get_o`` defined further down is the lookup the rest of this
    script actually uses.
    """
    same_match = odds_df.match_id == match_id
    same_market = odds_df.market == market
    same_selection = odds_df.selection == selection
    matched = odds_df.loc[same_match & same_market & same_selection, 'odds']
    if len(matched) == 0:
        return None
    return float(matched.iloc[0])
|
||||
|
||||
# More efficient: build an in-memory lookup keyed by (match_id, market, selection)
print("Oran lookup oluşturuluyor...")
# Zipping the columns avoids materialising one Series per row, which is what
# DataFrame.iterrows() does.  As before, a later duplicate key overwrites an
# earlier one.
odds_lookup = {
    (_mid, _market, _sel): float(_odd)
    for _mid, _market, _sel, _odd in zip(
        odds_df.match_id, odds_df.market, odds_df.selection, odds_df.odds
    )
}


def get_o(mid, market, sel):
    """Return the odd stored for (mid, market, sel), or None when absent."""
    return odds_lookup.get((mid, market, sel))
|
||||
|
||||
# 3. Attach the odds to every match
print("Maçlara oranlar ekleniyor...")
# (target column, market name, selection name) — order fixes the column order.
_ODDS_COLUMNS = (
    ('odds_ms_h', 'Maç Sonucu', '1'),
    ('odds_ms_a', 'Maç Sonucu', '2'),
    ('odds_ms_d', 'Maç Sonucu', '0'),
    ('odds_ou25_o', '2,5 Alt/Üst', 'Üst'),
    ('odds_ou25_u', '2,5 Alt/Üst', 'Alt'),
    ('odds_ou15_o', '1,5 Alt/Üst', 'Üst'),
    ('odds_ou15_u', '1,5 Alt/Üst', 'Alt'),
    ('odds_ou35_o', '3,5 Alt/Üst', 'Üst'),
    ('odds_ou35_u', '3,5 Alt/Üst', 'Alt'),
    ('odds_btts_y', 'Karşılıklı Gol', 'Var'),
    ('odds_btts_n', 'Karşılıklı Gol', 'Yok'),
)
for _col, _market, _sel in _ODDS_COLUMNS:
    # Bind market/selection as defaults so each lambda keeps its own pair.
    df[_col] = df.match_id.map(
        lambda mid, _m=_market, _s=_sel: get_o(mid, _m, _s)
    )

# Derive the match outcomes
df['total_goals'] = df.score_home + df.score_away
for _line, _flag in ((1, 'ou15'), (2, 'ou25'), (3, 'ou35')):
    # 1 when strictly more than `_line` goals were scored, else 0.
    df[_flag] = df.total_goals.gt(_line).astype(int)
df['btts'] = (df.score_home.gt(0) & df.score_away.gt(0)).astype(int)

print(f"Oranı olan maç sayısı: {df.odds_ms_h.notna().sum():,}")
|
||||
|
||||
# 4. ODDS BAND function
def odds_band(odds):
    """Map a decimal odd to its band label, or None for a missing value.

    Bands are half-open: the lower edge is inclusive and the upper edge is
    exclusive, so 1.30 falls into '1.30-1.50'.  Anything at or above 6.00 is
    labelled '6.00+'.
    """
    if pd.isna(odds):
        return None
    upper_edges = (
        (1.30, '1.00-1.30'),
        (1.50, '1.30-1.50'),
        (1.80, '1.50-1.80'),
        (2.20, '1.80-2.20'),
        (2.80, '2.20-2.80'),
        (4.00, '2.80-4.00'),
        (6.00, '4.00-6.00'),
    )
    for upper, label in upper_edges:
        if odds < upper:
            return label
    return '6.00+'
|
||||
|
||||
# 5. STRATEGY: expanding window — every bet is decided from past matches only
print("\n" + "=" * 70)
print(" STRATEJİ BACKTEST — Expanding Window")
print("=" * 70)

# Per-team history keyed by odds band:
#   {team_id: {odds_band: [(ou15, ou25, ou35, btts, home_win), ...]}}
home_history = defaultdict(lambda: defaultdict(list))
away_history = defaultdict(lambda: defaultdict(list))

MIN_MATCHES = 8   # minimum number of past matches required on both sides
TEST_PCT = 0.30   # bets are only placed on the last 30% of the rows
N = len(df)
test_start = int(N * (1 - TEST_PCT))

results = {
    'ou15_over': [], 'ou25_over': [], 'ou35_over': [],
    'btts_yes': [], 'btts_no': [],
    'ou25_under': [], 'ou15_under': [],
    'ms_home': []
}


def _band_rates(history):
    """Return the mean of each of the five outcome flags stored in *history*."""
    return [np.mean([rec[k] for rec in history]) for k in range(5)]


def _log_bet(market, base, row, signal, odds, won, h_sig, a_sig):
    """Append one evaluated bet to ``results[market]``."""
    results[market].append({
        **base, 'signal': signal, 'odds': odds,
        'won': won, 'actual_goals': row.total_goals,
        'h_sig': h_sig, 'a_sig': a_sig
    })


for i in range(N):
    row = df.iloc[i]
    h_odds, a_odds = row.odds_ms_h, row.odds_ms_a
    if pd.isna(h_odds) or pd.isna(a_odds):
        continue

    h_band, a_band = odds_band(h_odds), odds_band(a_odds)
    h_hist = home_history[row.home_team_id][h_band]
    a_hist = away_history[row.away_team_id][a_band]

    # TEST: bets are only placed inside the test section, and only when both
    # teams have enough past matches in their current odds band.
    enough_history = len(h_hist) >= MIN_MATCHES and len(a_hist) >= MIN_MATCHES
    if i >= test_start and enough_history:
        # What did the home side do in this odds band?
        h_ou15, h_ou25, h_ou35, h_btts, h_win = _band_rates(h_hist)
        # What did the away side do in this odds band?  The last flag is the
        # rate at which the away side lost.
        a_ou15, a_ou25, a_ou35, a_btts, a_loss = _band_rates(a_hist)

        # COMBINED SIGNAL: plain average of the two sides
        sig_ou15 = (h_ou15 + a_ou15) / 2
        sig_ou25 = (h_ou25 + a_ou25) / 2
        sig_ou35 = (h_ou35 + a_ou35) / 2
        sig_btts = (h_btts + a_btts) / 2
        sig_hw = (h_win + a_loss) / 2  # home win rate + away loss rate

        base = {
            'match': f"{row.home_team} vs {row.away_team}",
            'league': row.league_name,
            'home_team': row.home_team,
            'away_team': row.away_team,
            'h_band': h_band,
            'a_band': a_band,
            'h_n': len(h_hist),
            'a_n': len(a_hist),
        }

        # OU 1.5 OVER
        if sig_ou15 >= 0.85 and row.odds_ou15_o and row.odds_ou15_o > 1.01:
            _log_bet('ou15_over', base, row, sig_ou15, row.odds_ou15_o,
                     row.ou15 == 1, h_ou15, a_ou15)

        # OU 2.5 OVER
        if sig_ou25 >= 0.70 and row.odds_ou25_o and row.odds_ou25_o > 1.10:
            _log_bet('ou25_over', base, row, sig_ou25, row.odds_ou25_o,
                     row.ou25 == 1, h_ou25, a_ou25)

        # OU 3.5 OVER
        if sig_ou35 >= 0.60 and row.odds_ou35_o and row.odds_ou35_o > 1.20:
            _log_bet('ou35_over', base, row, sig_ou35, row.odds_ou35_o,
                     row.ou35 == 1, h_ou35, a_ou35)

        # BTTS YES
        if sig_btts >= 0.70 and row.odds_btts_y and row.odds_btts_y > 1.10:
            _log_bet('btts_yes', base, row, sig_btts, row.odds_btts_y,
                     row.btts == 1, h_btts, a_btts)

        # OU 2.5 UNDER (low goal expectation); signals are stored inverted
        if sig_ou25 <= 0.30 and row.odds_ou25_u and row.odds_ou25_u > 1.10:
            _log_bet('ou25_under', base, row, 1-sig_ou25, row.odds_ou25_u,
                     row.ou25 == 0, 1-h_ou25, 1-a_ou25)

        # MS HOME WIN
        if sig_hw >= 0.75 and row.odds_ms_h and 1.10 < row.odds_ms_h < 3.50:
            _log_bet('ms_home', base, row, sig_hw, row.odds_ms_h,
                     row.score_home > row.score_away, h_win, a_loss)

    # Update the history (always), after any bet on this match was decided
    h_hist.append((
        row.ou15, row.ou25, row.ou35, row.btts,
        int(row.score_home > row.score_away)
    ))
    a_hist.append((
        row.ou15, row.ou25, row.ou35, row.btts,
        int(row.score_away < row.score_home)  # away side lost
    ))
|
||||
|
||||
# 6. PRINT THE RESULTS
def _flat_pnl(frame):
    """Unit-stake profit of *frame*: +(odds - 1) per win, -1 per loss."""
    return (frame.won * (frame.odds - 1) - (~frame.won) * 1).sum()


def _market_metrics(bets):
    """Return (frame, total, wins, hit %, pnl, roi %, mean odds) for *bets*."""
    frame = pd.DataFrame(bets)
    n_bets = len(frame)
    n_wins = frame.won.sum()
    profit = _flat_pnl(frame)
    return (frame, n_bets, n_wins, n_wins / n_bets * 100,
            profit, profit / n_bets * 100, frame.odds.mean())


print(f"\nTest bölümü: son {TEST_PCT*100:.0f}% ({N - test_start:,} maç)")
print(f"Minimum geçmiş: {MIN_MATCHES} maç\n")

for market_name, bets in results.items():
    if not bets:
        print(f"\n {market_name}: sinyal yok")
        continue

    bdf, total, wins, hit, pnl, roi, avg_odds = _market_metrics(bets)

    print(f"\n{'='*60}")
    print(f" {market_name.upper()}")
    print(f"{'='*60}")
    print(f" Toplam bahis: {total}")
    print(f" Kazanan: {wins} ({hit:.1f}%)")
    print(f" Ortalama odds: {avg_odds:.2f}")
    print(f" PnL: {pnl:+.1f} birim")
    print(f" ROI: {roi:+.1f}%")

    # Performance at different signal thresholds
    print(f"\n Sinyal eşik analizi:")
    for threshold in [0.70, 0.75, 0.80, 0.85, 0.90, 0.95]:
        sub = bdf[bdf.signal >= threshold]
        if len(sub) < 5:
            continue
        w = sub.won.sum()
        r = _flat_pnl(sub) / len(sub) * 100
        if r > 0:
            star = ' ✅ PROFIT'
        elif r > -3:
            star = ' ⚖️ BE'
        else:
            star = ''
        print(f" ≥{threshold:.2f}: {len(sub):5d} bahis, hit={w/len(sub)*100:.1f}%, ROI={r:+.1f}%{star}")

    # Up to five winning examples with the strongest signal
    if wins > 0:
        best = bdf[bdf.won].nlargest(min(5, wins), 'signal')
        print(f"\n Örnek kazanan bahisler:")
        for _, b in best.iterrows():
            print(f" {b.home_team} vs {b.away_team} ({b.league})")
            print(f" Ev {b.h_band} ({b.h_sig:.0%}) + Dep {b.a_band} ({b.a_sig:.0%}) → sinyal={b.signal:.0%}, odds={b.odds:.2f}, gol={b.actual_goals:.0f}")

# 7. SUMMARY TABLE
print("\n\n" + "=" * 70)
print(" ÖZET TABLO")
print("=" * 70)
print(f"{'Market':<15} {'Bahis':>6} {'Hit':>7} {'ROI':>8} {'Avg Odds':>9}")
print("-" * 50)
for market_name, bets in results.items():
    if not bets:
        continue
    bdf, total, wins, hit, pnl, roi, avg_odds = _market_metrics(bets)
    print(f"{market_name:<15} {total:>6} {hit:>6.1f}% {roi:>+7.1f}% {avg_odds:>8.2f}")

conn.close()
print("\n✅ Tamamlandı!")
|
||||
@@ -0,0 +1,480 @@
|
||||
"""
|
||||
V27 Value Sniper — PRO Training Script
|
||||
========================================
|
||||
KEY INSIGHT: Train model WITHOUT odds to get independent probability.
|
||||
Then compare with market odds to find genuine value edges.
|
||||
|
||||
Strategy:
|
||||
Stage A: "Fundamentals Model" — odds-free, learns from ELO/form/rolling/H2H
|
||||
Stage B: "Value Model" — uses fundamentals + odds disagreement as features
|
||||
Stage C: Multi-market — 1X2, O/U 2.5, BTTS
|
||||
Stage D: Walk-forward backtest with Kelly sizing
|
||||
"""
|
||||
import os, sys, json, pickle, time, warnings
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from sklearn.metrics import accuracy_score, log_loss
|
||||
from sklearn.isotonic import IsotonicRegression
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
AI_DIR = Path(__file__).resolve().parent.parent
|
||||
DATA_CSV = AI_DIR / "data" / "training_data_v27.csv"
|
||||
MODELS_DIR = AI_DIR / "models" / "v27"
|
||||
MODELS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ── Leakage & category definitions ──
|
||||
LEAKAGE_COLS = [
|
||||
"total_goals", "goal_diff", "ht_total_goals", "ht_goal_diff",
|
||||
"score_home", "score_away", "ht_score_home", "ht_score_away",
|
||||
"home_goals_form", "away_goals_form",
|
||||
"home_squad_quality", "away_squad_quality", "squad_diff",
|
||||
"home_key_players", "away_key_players",
|
||||
"home_missing_impact", "away_missing_impact",
|
||||
"referee_home_bias", "referee_avg_goals", "referee_cards_total",
|
||||
"referee_avg_yellow", "referee_avg_red", "referee_penalty_rate",
|
||||
"referee_over25_rate", "referee_experience", "referee_matches",
|
||||
]
|
||||
# Placeholder kept for importers; nothing in this script fills or reads it —
# label columns are discovered per-DataFrame by get_label_cols().
LABEL_COLS = []
|
||||
META_COLS = ["match_id", "league_name", "home_team", "away_team"]
|
||||
ODDS_COLS_PATTERNS = ["odds_", "implied_"]
|
||||
|
||||
|
||||
def get_odds_cols(df):
    """Return the columns of *df* whose name starts with an odds prefix.

    The prefixes come from ``ODDS_COLS_PATTERNS``; column order is preserved.
    """
    prefixes = tuple(ODDS_COLS_PATTERNS)
    return [name for name in df.columns if name.startswith(prefixes)]
|
||||
|
||||
|
||||
def get_label_cols(df):
    """Return the columns of *df* whose name starts with ``label_``, in order."""
    label_names = []
    for name in df.columns:
        if name.startswith("label_"):
            label_names.append(name)
    return label_names
|
||||
|
||||
|
||||
def get_clean_features(df):
    """Return the feature columns with no odds and no leakage.

    A column is dropped when it is an odds/implied column, a label column,
    listed in ``LEAKAGE_COLS`` or ``META_COLS``, or an ``*_id`` column.  Of
    the rest, only columns where more than 30% of the rows parse as numbers
    are kept.  Column order follows ``df.columns``.
    """
    banned = set(LEAKAGE_COLS) | set(META_COLS)
    banned.update(get_odds_cols(df))
    banned.update(get_label_cols(df))
    # Identifier columns carry no signal ("match_id" is already in META_COLS).
    banned.update(c for c in df.columns if c.endswith("_id") and c != "match_id")

    min_numeric_rows = len(df)*0.3
    kept = []
    for col in df.columns:
        if col in banned:
            continue
        numeric_rows = pd.to_numeric(df[col], errors="coerce").notna().sum()
        if numeric_rows > min_numeric_rows:
            kept.append(col)
    return kept
|
||||
|
||||
|
||||
def load_data():
    """Load ``DATA_CSV`` and keep only rows with usable 1X2 odds.

    Steps:
      * coerce the three match-result odds to numbers, drop rows where any is
        missing or not above 1.01;
      * coerce the O/U 2.5 odds columns that are present;
      * add ``implied_h`` / ``implied_d`` / ``implied_a``: the inverse odds
        normalised by their sum, so the three add up to 1.

    Returns:
        The filtered DataFrame, in the CSV's row order.
    """
    print(f"Loading {DATA_CSV}...")
    df = pd.read_csv(DATA_CSV, low_memory=False)
    print(f" Raw: {len(df)} rows")

    # Ensure odds exist for value comparison
    ms_cols = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
    for c in ms_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(subset=ms_cols)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one.
    df = df[(df.odds_ms_h > 1.01) & (df.odds_ms_d > 1.01) & (df.odds_ms_a > 1.01)].copy()

    # OU25 odds.  backtest_value() reads the *_o / *_u spelling, so both
    # spellings are coerced when present.
    for c in ["odds_ou25_over", "odds_ou25_under", "odds_ou25_o", "odds_ou25_u"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Implied probabilities with the bookmaker margin removed
    margin = 1/df.odds_ms_h + 1/df.odds_ms_d + 1/df.odds_ms_a
    df["implied_h"] = (1/df.odds_ms_h)/margin
    df["implied_d"] = (1/df.odds_ms_d)/margin
    df["implied_a"] = (1/df.odds_ms_a)/margin

    print(f" After filter: {len(df)} rows")
    return df
|
||||
|
||||
|
||||
def temporal_split(df, val_ratio=0.15, test_ratio=0.10):
    """Cut *df* by row position into (train, validation, test) copies.

    The last ``test_ratio`` of the rows form the test part, the
    ``val_ratio`` before them the validation part, and everything earlier is
    training data.  Rows are not reordered, so the split is chronological
    only if *df* already is.
    """
    total = len(df)
    train_end = int(total*(1-val_ratio-test_ratio))
    val_end = int(total*(1-test_ratio))
    bounds = ((None, train_end), (train_end, val_end), (val_end, None))
    return tuple(df.iloc[lo:hi].copy() for lo, hi in bounds)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STAGE A: Fundamentals-Only Model (NO ODDS)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
def _print_val_accuracy(y_va, proba):
    """Print the validation accuracy of a (rows, classes) probability array."""
    acc = accuracy_score(y_va, proba.argmax(1))
    print(f" acc={acc:.4f}")


def _two_column(proba, n_class):
    """Expand binary positive-class scores to [1-p, p]; pass multiclass through."""
    if n_class == 2:
        return np.column_stack([1-proba, proba])
    return proba


def _train_xgb(X_tr, y_tr, X_va, y_va, feat_cols, market, n_class):
    """Fit an XGBoost booster with early stopping on the validation set."""
    import xgboost as xgb
    print(f" [XGB] Training {market.upper()}...")
    multiclass = n_class == 3
    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=feat_cols)
    dval = xgb.DMatrix(X_va, label=y_va, feature_names=feat_cols)
    params = {
        "objective": "multi:softprob" if multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if multiclass else "logloss",
        "max_depth": 6, "learning_rate": 0.02, "subsample": 0.75,
        "colsample_bytree": 0.75, "min_child_weight": 10,
        "reg_alpha": 0.5, "reg_lambda": 2.0,
        "verbosity": 0, "tree_method": "hist",
    }
    if multiclass:
        params["num_class"] = 3
    booster = xgb.train(params, dtrain, num_boost_round=2000,
                        evals=[(dval, "val")], early_stopping_rounds=80,
                        verbose_eval=False)
    _print_val_accuracy(y_va, _two_column(booster.predict(dval), n_class))
    return booster


def _train_lgb(X_tr, y_tr, X_va, y_va, feat_cols, market, n_class):
    """Fit a LightGBM booster with early stopping on the validation set."""
    import lightgbm as lgb
    print(f" [LGB] Training {market.upper()}...")
    multiclass = n_class == 3
    ds_tr = lgb.Dataset(X_tr, label=y_tr)
    ds_va = lgb.Dataset(X_va, label=y_va, reference=ds_tr)
    par = {
        "objective": "multiclass" if multiclass else "binary",
        "metric": "multi_logloss" if multiclass else "binary_logloss",
        "num_leaves": 48, "learning_rate": 0.02,
        "feature_fraction": 0.7, "bagging_fraction": 0.7,
        "bagging_freq": 1, "min_child_samples": 30,
        "lambda_l1": 0.5, "lambda_l2": 2.0, "verbose": -1,
    }
    if multiclass:
        par["num_class"] = 3
    booster = lgb.train(par, ds_tr, 2000, valid_sets=[ds_va],
                        callbacks=[lgb.early_stopping(80, verbose=False)])
    _print_val_accuracy(y_va, _two_column(booster.predict(X_va), n_class))
    return booster


def _train_cb(X_tr, y_tr, X_va, y_va, feat_cols, market, n_class):
    """Fit a CatBoost classifier with early stopping on the validation set."""
    from catboost import CatBoostClassifier
    print(f" [CB] Training {market.upper()}...")
    multiclass = n_class == 3
    extra = {"classes_count": 3} if multiclass else {}
    clf = CatBoostClassifier(
        iterations=2000, learning_rate=0.02, depth=6,
        l2_leaf_reg=5, loss_function="MultiClass" if multiclass else "Logloss",
        early_stopping_rounds=80, verbose=0, task_type="CPU",
        **extra,
    )
    clf.fit(X_tr, y_tr, eval_set=(X_va, y_va))
    _print_val_accuracy(y_va, clf.predict_proba(X_va))
    return clf


def train_fundamentals_model(X_tr, y_tr, X_va, y_va, feat_cols, market="ms"):
    """Train the odds-free ensemble members that can be imported.

    Args:
        X_tr, y_tr: training features and labels.
        X_va, y_va: validation features and labels, used for early stopping
            and for the printed accuracy.
        feat_cols: feature names, passed to XGBoost's DMatrix.
        market: ``"ms"`` trains 3-class models; any other value trains binary
            models.

    Returns:
        Dict with a subset of the keys ``"xgb"``, ``"lgb"``, ``"cb"`` mapped
        to the fitted models.  A member whose training raises ImportError is
        left out; the dict is empty when that happens for all three.
    """
    n_class = 3 if market == "ms" else 2
    trainers = (("xgb", _train_xgb), ("lgb", _train_lgb), ("cb", _train_cb))
    models = {}
    for key, trainer in trainers:
        try:
            models[key] = trainer(X_tr, y_tr, X_va, y_va, feat_cols, market, n_class)
        except ImportError:
            # Optional dependency missing: skip this ensemble member.
            continue
    return models
|
||||
|
||||
|
||||
def ensemble_predict(models, X, feat_cols, n_class=3):
    """Average the class probabilities of every model in *models*.

    Args:
        models: dict keyed by ``"xgb"``, ``"lgb"`` and/or ``"cb"``, as
            returned by ``train_fundamentals_model``.
        X: feature matrix to score.
        feat_cols: feature names, passed to XGBoost's DMatrix.
        n_class: number of classes; with 2, one-dimensional positive-class
            scores are expanded to ``[1-p, p]`` columns.

    Returns:
        Array of the element-wise mean of the per-model probability arrays.

    Raises:
        RuntimeError: if *models* is empty.
        ValueError: if *models* contains a key other than the three above.
            Previously such a key silently reused the preceding model's
            predictions (or failed with UnboundLocalError when it came first).
    """
    preds = []
    for name, m in models.items():
        if name == "xgb":
            import xgboost as xgb
            p = m.predict(xgb.DMatrix(X, feature_names=feat_cols))
            if n_class == 2 and p.ndim == 1:
                p = np.column_stack([1-p, p])
        elif name == "lgb":
            p = m.predict(X)
            if n_class == 2 and p.ndim == 1:
                p = np.column_stack([1-p, p])
        elif name == "cb":
            p = m.predict_proba(X)
        else:
            raise ValueError(f"Unknown model key: {name!r}")
        preds.append(np.array(p))
    if not preds:
        raise RuntimeError("No models!")
    return np.mean(preds, axis=0)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STAGE B: Walk-Forward Backtest with Kelly
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
def kelly_fraction(model_prob, odds, fraction=0.25, max_fraction=0.10):
    """Return the capped fractional-Kelly stake as a share of the bankroll.

    Full Kelly is ``(p*odds - 1) / (odds - 1)``; it is scaled by *fraction*
    and clipped to ``[0, max_fraction]``.

    Args:
        model_prob: model probability that the selection wins.
        odds: decimal odds offered for the selection.
        fraction: Kelly multiplier (0.25 = quarter Kelly).
        max_fraction: upper bound on the returned share; defaults to the
            previously hard-coded 10% of bankroll.

    Returns:
        A float in ``[0, max_fraction]``; ``0.0`` when there is no positive
        edge or the odds are not above 1.
    """
    edge = model_prob * odds - 1
    if edge <= 0 or odds <= 1:
        return 0.0
    full_kelly = edge / (odds - 1)
    return float(max(0.0, min(fraction * full_kelly, max_fraction)))
|
||||
|
||||
|
||||
def backtest_value(models, df_test, feat_cols, market="ms",
                   min_edge=0.05, min_odds=1.40, max_odds=4.50,
                   use_kelly=True):
    """Simulate value betting on *df_test* and return the bet log.

    For every row and class, a bet is placed when the model probability
    exceeds the margin-free implied probability by at least *min_edge*, the
    odds lie in ``[min_odds, max_odds]``, and the selection is not a heavy
    favourite (implied > 0.65) with an edge below 0.08.

    Args:
        models: ensemble dict accepted by ``ensemble_predict``.
        df_test: rows to bet on; needs *feat_cols*, the label column and the
            odds columns of the chosen market.
        feat_cols: feature column names, in training order.
        market: ``"ms"`` (1X2) or ``"ou25"`` (over/under 2.5).
        min_edge, min_odds, max_odds: bet filters described above.
        use_kelly: stake ``bankroll * kelly_fraction(..., fraction=0.15)``
            when True, a flat 10 units otherwise.  Stakes below 1 are skipped.

    Returns:
        Dict with ``bets`` (one dict per bet), ``total``, ``wins``, ``pnl``
        (final bankroll minus the 1000 starting bankroll) and
        ``bankroll_curve``.  An empty dict for an unsupported market or when
        ``label_ou25`` is missing.
    """
    X = df_test[feat_cols].values
    n_class = 3 if market == "ms" else 2
    probs = ensemble_predict(models, X, feat_cols, n_class)

    if market == "ms":
        y = df_test["label_ms"].values
        odds_arr = df_test[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].values
        implied = df_test[["implied_h", "implied_d", "implied_a"]].values
        class_names = ["Home", "Draw", "Away"]
    elif market == "ou25":
        if "label_ou25" not in df_test.columns:
            return {}
        y = df_test["label_ou25"].values
        # NOTE(review): missing O/U odds are replaced by a fixed 1.85, so bets
        # can be settled at a price that was never offered — confirm intended.
        default_odds = np.full(len(df_test), 1.85)
        if "odds_ou25_o" in df_test.columns:
            o_over = pd.to_numeric(df_test["odds_ou25_o"], errors="coerce").fillna(1.85).values
        else:
            o_over = default_odds
        if "odds_ou25_u" in df_test.columns:
            o_under = pd.to_numeric(df_test["odds_ou25_u"], errors="coerce").fillna(1.85).values
        else:
            o_under = default_odds
        # Column order matches the labels: 0 = Under, 1 = Over.
        odds_arr = np.column_stack([o_under, o_over])
        inv = 1/odds_arr
        implied = inv / inv.sum(axis=1, keepdims=True)
        class_names = ["Under", "Over"]
    else:
        return {}

    results = {"bets": [], "total": 0, "wins": 0, "pnl": 0.0, "bankroll_curve": [1000.0]}
    bankroll = 1000.0

    for i in range(len(y)):
        # A missing label means the outcome is unknown; such a row cannot be
        # settled, so it must not be booked as a losing bet.
        if pd.isna(y[i]):
            continue
        for cls in range(n_class):
            edge = probs[i, cls] - implied[i, cls]
            odds_val = odds_arr[i, cls]

            # FILTERS
            if edge < min_edge:
                continue
            if odds_val < min_odds or odds_val > max_odds:
                continue
            # Don't bet on heavy favorites with tiny edge
            if implied[i, cls] > 0.65 and edge < 0.08:
                continue

            # Sizing
            if use_kelly:
                frac = kelly_fraction(probs[i, cls], odds_val, fraction=0.15)
                stake = bankroll * frac
            else:
                stake = 10.0  # flat

            if stake < 1:
                continue

            won = (y[i] == cls)
            pnl = stake * (odds_val - 1) if won else -stake
            bankroll += pnl

            results["bets"].append({
                "edge": float(edge), "odds": float(odds_val),
                "model_p": float(probs[i, cls]), "implied_p": float(implied[i, cls]),
                "won": bool(won), "pnl": float(pnl), "stake": float(stake),
                "class": class_names[cls],
            })
            results["bankroll_curve"].append(bankroll)
            results["total"] += 1
            if won:
                results["wins"] += 1

    results["pnl"] = bankroll - 1000.0
    return results
|
||||
|
||||
|
||||
def _max_drawdown_pct(curve):
    """Return the worst decline from a running peak of *curve*, in percent (<= 0)."""
    worst = 0.0
    peak = None
    for value in curve:
        if peak is None or value > peak:
            peak = value
        if peak > 0:
            worst = min(worst, (value - peak) / peak * 100)
    return worst


def print_backtest(results, label=""):
    """Print hit rate, ROI, PnL, max drawdown and a per-class breakdown.

    Args:
        results: dict produced by ``backtest_value`` (keys ``total``,
            ``wins``, ``pnl``, ``bets``, ``bankroll_curve``).
        label: heading printed above the figures.

    ROI is PnL divided by the sum of all stakes.  Max drawdown is measured
    against the running peak of the bankroll curve, so only declines that
    happen after a high are counted.
    """
    total = results.get("total", 0)
    if total == 0:
        print(f" {label}: No bets placed")
        return
    wins = results["wins"]
    pnl = results["pnl"]
    hit = wins/total*100
    invested = sum(b["stake"] for b in results["bets"])
    roi = pnl / invested * 100 if invested > 0 else 0.0
    curve = results["bankroll_curve"]
    dd = _max_drawdown_pct(curve)

    # Per-class breakdown
    by_class = {}
    for b in results["bets"]:
        stats = by_class.setdefault(b["class"], {"n": 0, "w": 0, "pnl": 0})
        stats["n"] += 1
        if b["won"]:
            stats["w"] += 1
        stats["pnl"] += b["pnl"]

    print(f"\n {label}")
    print(f" Bets: {total} | Hit: {hit:.1f}% | ROI: {roi:+.1f}%")
    print(f" PnL: {pnl:+.0f} | Final: {curve[-1]:.0f} | MaxDD: {dd:.1f}%")
    for cls, d in sorted(by_class.items()):
        # Mean PnL per bet, scaled by 100 (not normalised by stake).
        r = d["pnl"]/d["n"]*100 if d["n"] > 0 else 0
        print(f" {cls:6s}: {d['n']:4d} bets, "
              f"hit={d['w']/d['n']*100:.1f}%, avg_pnl={r:+.1f}%")
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# MAIN
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
def main():
    """Train the odds-free models, backtest them and save the artifacts.

    Pipeline: load and filter the data, pick the clean feature set, split by
    row position into train/validation/test, train the 1X2 ensemble (and the
    O/U 2.5 ensemble when ``label_ou25`` exists), backtest, then write the
    pickled models, feature importance, metadata and feature list to
    ``MODELS_DIR``.

    The betting configuration (edge and minimum-odds thresholds) is selected
    on the validation split and only then evaluated once on the test split,
    so the reported test figures are not the best of many tries on the same
    data.
    """
    print("=" * 65)
    print(" V27 VALUE SNIPER — PRO TRAINING (Odds-Free Fundamentals)")
    print("=" * 65)
    t0 = time.time()

    df = load_data()
    clean_feats = get_clean_features(df)
    print(f" Clean features (no odds): {len(clean_feats)}")

    # Numerify
    for c in clean_feats:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    # NOTE(review): medians are taken over all rows, including the validation
    # and test rows — consider fitting the imputation on the training split.
    df[clean_feats] = df[clean_feats].fillna(df[clean_feats].median())

    # Remove constant columns
    clean_feats = [c for c in clean_feats if df[c].nunique() > 1]
    print(f" After removing constants: {len(clean_feats)}")

    # Split
    tr, va, te = temporal_split(df)
    print(f" Train: {len(tr)}, Val: {len(va)}, Test: {len(te)}")
    print(f" Target: H={tr.label_ms.eq(0).mean():.1%}, "
          f"D={tr.label_ms.eq(1).mean():.1%}, A={tr.label_ms.eq(2).mean():.1%}")

    X_tr = tr[clean_feats].values
    y_tr = tr["label_ms"].values
    X_va = va[clean_feats].values
    y_va = va["label_ms"].values

    # ── STAGE A: Train fundamentals model (1X2) ──
    print("\n" + "─"*65)
    print(" STAGE A: Fundamentals-Only 1X2 Model")
    print("─"*65)
    ms_models = train_fundamentals_model(X_tr, y_tr, X_va, y_va, clean_feats, "ms")

    val_probs = ensemble_predict(ms_models, X_va, clean_feats, 3)
    val_acc = accuracy_score(y_va, val_probs.argmax(1))
    val_ll = log_loss(y_va, val_probs)
    print(f"\n Ensemble Val: acc={val_acc:.4f}, logloss={val_ll:.4f}")

    # Compare with odds baseline: always pick the market favourite
    odds_pred = va[["implied_h", "implied_d", "implied_a"]].values.argmax(1)
    odds_acc = accuracy_score(y_va, odds_pred)
    print(f" Odds baseline: acc={odds_acc:.4f}")
    print(f" Model vs Odds: {val_acc - odds_acc:+.4f}")

    # ── STAGE A.2: O/U 2.5 Model ──
    ou_models = None
    if "label_ou25" in tr.columns:
        print("\n" + "─"*65)
        print(" STAGE A.2: Fundamentals-Only O/U 2.5 Model")
        print("─"*65)
        y_tr_ou = tr["label_ou25"].values
        y_va_ou = va["label_ou25"].values
        # Rows without an O/U label are left out of training and validation.
        mask_tr = ~np.isnan(y_tr_ou)
        mask_va = ~np.isnan(y_va_ou)
        if mask_tr.sum() > 1000:
            ou_models = train_fundamentals_model(
                X_tr[mask_tr], y_tr_ou[mask_tr].astype(int),
                X_va[mask_va], y_va_ou[mask_va].astype(int),
                clean_feats, "ou25")

    # ── STAGE B: Backtest ──
    print("\n" + "─"*65)
    print(" STAGE B: Walk-Forward Backtest (Test Set)")
    print("─"*65)

    # Pick the thresholds on the VALIDATION split; choosing them on the test
    # split would make the reported test ROI a best-of-18 figure.
    best_roi = -999
    best_cfg = {}
    for min_edge in [0.03, 0.05, 0.07, 0.10, 0.12, 0.15]:
        for min_odds in [1.35, 1.50, 1.70]:
            r = backtest_value(ms_models, va, clean_feats, "ms",
                               min_edge=min_edge, min_odds=min_odds,
                               max_odds=5.0, use_kelly=True)
            if r.get("total", 0) >= 20:
                invested = sum(b["stake"] for b in r["bets"])
                roi = r["pnl"] / invested * 100 if invested > 0 else -100
                if roi > best_roi:
                    best_roi = roi
                    best_cfg = {"edge": min_edge, "min_odds": min_odds,
                                "val_roi": round(float(roi), 2)}

    if best_cfg:
        cfg = best_cfg
        print(f"\n Best 1X2 Config: edge>{cfg['edge']}, odds>{cfg['min_odds']}")
        print(f" Selected on validation (ROI {cfg['val_roi']:+.1f}%); test result:")
        # Single evaluation of the chosen configuration on unseen test rows.
        cfg["result"] = backtest_value(ms_models, te, clean_feats, "ms",
                                       min_edge=cfg["edge"], min_odds=cfg["min_odds"],
                                       max_odds=5.0, use_kelly=True)
        print_backtest(cfg["result"], "1X2 VALUE")

    # Flat bet comparison
    print("\n --- Flat Bet Comparison ---")
    for edge in [0.05, 0.07, 0.10]:
        r = backtest_value(ms_models, te, clean_feats, "ms",
                           min_edge=edge, min_odds=1.50, max_odds=4.5,
                           use_kelly=False)
        if r.get("total", 0) > 0:
            inv = r["total"] * 10  # flat stake is 10 units per bet
            roi = r["pnl"]/inv*100
            print(f" Edge>{edge:.2f}: {r['total']} bets, "
                  f"hit={r['wins']/r['total']*100:.1f}%, ROI={roi:+.1f}%")

    # OU25 backtest
    if ou_models:
        print("\n --- O/U 2.5 Backtest ---")
        for edge in [0.05, 0.07, 0.10]:
            r = backtest_value(ou_models, te, clean_feats, "ou25",
                               min_edge=edge, min_odds=1.50, max_odds=3.0,
                               use_kelly=True)
            if r.get("total", 0) > 0:
                print_backtest(r, f"OU25 edge>{edge}")

    # ── Feature importance ──
    if "lgb" in ms_models:
        imp = ms_models["lgb"].feature_importance(importance_type="gain")
        imp_df = pd.DataFrame({"feature": clean_feats, "importance": imp}
                              ).sort_values("importance", ascending=False)
        print("\n TOP 15 FEATURES (no odds!):")
        for _, r in imp_df.head(15).iterrows():
            print(f" {r['feature']:40s} {r['importance']:.0f}")
        imp_df.to_csv(MODELS_DIR / "v27_feature_importance.csv", index=False)

    # ── Save ──
    print("\n" + "─"*65)
    print(" SAVING MODELS")
    print("─"*65)
    for name, m in ms_models.items():
        p = MODELS_DIR / f"v27_ms_{name}.pkl"
        with open(p, "wb") as f:
            pickle.dump(m, f)
        print(f" ✓ {p.name}")

    if ou_models:
        for name, m in ou_models.items():
            p = MODELS_DIR / f"v27_ou25_{name}.pkl"
            with open(p, "wb") as f:
                pickle.dump(m, f)
            print(f" ✓ {p.name}")

    meta = {
        "version": "v27-pro", "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "approach": "odds-free fundamentals + value edge detection",
        "feature_count": len(clean_feats),
        "total_samples": len(df),
        "val_acc": round(val_acc, 4), "val_ll": round(val_ll, 4),
        # The raw backtest result is large and not JSON-friendly; keep only
        # the selected thresholds and their validation ROI.
        "best_config": {k: v for k, v in best_cfg.items() if k != "result"} if best_cfg else {},
        "markets": ["ms"] + (["ou25"] if ou_models else []),
    }
    with open(MODELS_DIR / "v27_metadata.json", "w") as f:
        json.dump(meta, f, indent=2, default=str)
    with open(MODELS_DIR / "v27_feature_cols.json", "w") as f:
        json.dump(clean_feats, f, indent=2)
    print(f" ✓ metadata + feature_cols")

    print(f"\n Total time: {(time.time()-t0)/60:.1f} min")
    print(" DONE!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user