This commit is contained in:
2026-04-22 02:17:02 +03:00
parent 2ccd6831eb
commit df428ed1e8
19 changed files with 6436 additions and 9 deletions
+215
View File
@@ -0,0 +1,215 @@
"""
V27 FINAL BACKTEST — Conservative Flat Bet
Only the strongest validated edges. No Kelly compounding.
"""
import pandas as pd, numpy as np
# Load the V27 training table. Every column except the identifier/name
# columns is coerced to numeric; cells that cannot be parsed become NaN.
_TEXT_COLUMNS = ['match_id', 'league_name', 'home_team', 'away_team']
df = pd.read_csv('data/training_data_v27.csv', low_memory=False)
numeric_columns = [name for name in df.columns if name not in _TEXT_COLUMNS]
for name in numeric_columns:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Keep only rows where all three 1X2 prices exist and exceed 1.01.
df = df.dropna(subset=['odds_ms_h', 'odds_ms_d', 'odds_ms_a'])
home_ok = df.odds_ms_h > 1.01
draw_ok = df.odds_ms_d > 1.01
away_ok = df.odds_ms_a > 1.01
df = df[home_ok & draw_ok & away_ok]
n = len(df)
# Expanding-window walk-forward over 5 equal slices of the (row-ordered) data:
# fold k trains on slices 0..k-1 and tests on slice k. Folds 0 and 1 are
# skipped, so the training share is 40% (fold 2), 60% (fold 3), 80% (fold 4).
folds = 5
fold_size = n // folds
all_results = []
print("="*65)
print(" V27 WALK-FORWARD FLAT-BET BACKTEST")
print("="*65)
for fold in range(2, folds):  # start from fold 2 so we have enough training data
    train_end = fold * fold_size
    test_start = train_end
    # The last fold also absorbs the remainder rows left by the integer division.
    test_end = (fold+1)*fold_size if fold < folds-1 else n
    train_df = df.iloc[:train_end]
    test_df = df.iloc[test_start:test_end]
    print(f"\n --- Fold {fold}: train={len(train_df)}, test={len(test_df)} ---")

    # Discover REST edges from training data: home side rested more than `hr`
    # days while the away side rested fewer than `ar` days.
    strategies = []
    for hr in [5, 7, 10, 14]:
        for ar in [3, 4, 5]:
            # The subset does not depend on the outcome class, so build it once.
            idx = (train_df.home_days_rest > hr) & (train_df.away_days_rest < ar)
            sub = train_df[idx]
            if len(sub) < 50:
                continue
            for cls, col in [(0, 'odds_ms_h'), (2, 'odds_ms_a')]:
                rate = (sub.label_ms == cls).mean()
                avg_odds = sub[col].mean()
                ev = rate * avg_odds
                if ev > 1.02:  # only strong edges (>2% edge)
                    strategies.append((hr, ar, cls, rate, avg_odds, ev, len(sub)))
    if not strategies:
        print(" No strong edges found in training data")
        continue

    # Apply best strategies to test
    strategies.sort(key=lambda x: x[5], reverse=True)
    best = strategies[:3]  # top 3 only
    fold_bets = 0
    fold_wins = 0
    fold_pnl = 0
    stake = 10  # flat 10 units
    for _, row in test_df.iterrows():
        if pd.isna(row.home_days_rest) or pd.isna(row.away_days_rest):
            continue
        # The top rules overlap (e.g. rest>7 implies rest>5). Without this
        # guard one match would be staked once per matching rule on the same
        # outcome, which is no longer a flat bet and double-counts the PnL.
        outcomes_bet = set()
        for hr, ar, cls, est_p, _, _, _ in best:
            if cls in outcomes_bet:
                continue
            if row.home_days_rest <= hr or row.away_days_rest >= ar:
                continue
            odds_col = ['odds_ms_h', 'odds_ms_d', 'odds_ms_a'][cls]
            odds_val = row[odds_col]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue
            # Additional filter: only bet when odds give reasonable EV
            if est_p * odds_val < 1.0:
                continue
            outcomes_bet.add(cls)
            won = (row.label_ms == cls)
            pnl = stake * (odds_val - 1) if won else -stake
            fold_bets += 1
            if won:
                fold_wins += 1
            fold_pnl += pnl
            all_results.append({'fold': fold, 'won': won, 'pnl': pnl,
                                'odds': odds_val, 'stake': stake,
                                'cls': ['H', 'D', 'A'][cls]})
    if fold_bets > 0:
        roi = fold_pnl / (fold_bets * stake) * 100
        print(f" Best strategies: {[(h,a,['H','D','A'][c],f'EV={e:.3f}') for h,a,c,_,_,e,_ in best]}")
        print(f" Bets: {fold_bets}, Wins: {fold_wins} ({fold_wins/fold_bets*100:.1f}%), "
              f"ROI: {roi:+.1f}%, PnL: {fold_pnl:+.0f}")
# Overall summary across every fold
print("\n" + "="*65)
print(" OVERALL RESULTS")
print("="*65)
if not all_results:
    print(" No bets placed!")
else:
    total = len(all_results)
    pnls = [r['pnl'] for r in all_results]
    wins = len([r for r in all_results if r['won']])
    total_pnl = sum(pnls)
    total_staked = sum(r['stake'] for r in all_results)
    roi = total_pnl / total_staked * 100
    print(f" Total bets: {total}")
    print(f" Wins: {wins} ({wins/total*100:.1f}%)")
    print(f" Total staked: {total_staked:.0f}")
    print(f" PnL: {total_pnl:+.0f}")
    print(f" ROI: {roi:+.1f}%")
    print(f" Avg odds: {np.mean([r['odds'] for r in all_results]):.2f}")

    # Breakdown per bet type (home / away)
    print("\n --- By Bet Type ---")
    for cls in ['H', 'A']:
        cb = [r for r in all_results if r['cls'] == cls]
        if not cb:
            continue
        cw = len([r for r in cb if r['won']])
        cp = sum(r['pnl'] for r in cb)
        cs = sum(r['stake'] for r in cb)
        print(f" {cls}: {len(cb)} bets, hit={cw/len(cb)*100:.1f}%, ROI={cp/cs*100:+.1f}%")

    # Cumulative PnL curve, sampled at roughly 15 evenly spaced bets
    print("\n --- Cumulative PnL ---")
    step = max(1, total // 15)
    for j in range(0, total, step):
        cum = sum(pnls[:j+1])
        print(f" After bet {j+1:4d}: PnL={cum:+.0f}")
    cum = sum(pnls)
    print(f" After bet {total:4d}: PnL={cum:+.0f} (FINAL)")
# ── Now combine with MODEL for smarter filtering ──
print("\n" + "="*65)
print(" COMBINED: Rest Rules + Fundamentals Model")
print("="*65)
import pickle, json
from pathlib import Path
MODELS_DIR = Path("models/v27")
# Use a context manager so the feature-list file handle is closed promptly.
with open(MODELS_DIR / "v27_feature_cols.json", encoding="utf-8") as fh:
    feat_cols = json.load(fh)
ms_models = {}
for name in ['xgb', 'lgb', 'cb']:
    p = MODELS_DIR / f"v27_ms_{name}.pkl"
    if p.exists():
        # SECURITY: pickle executes arbitrary code on load; only load model
        # files produced by this project's own training run.
        with open(p, 'rb') as f:
            ms_models[name] = pickle.load(f)
if ms_models:
    # Evaluate on the final 20% of rows.
    test_df = df.iloc[int(n*0.8):].copy()
    X_test = test_df[feat_cols].values
    # Get model predictions (one probability matrix per available model)
    preds = []
    for name, m in ms_models.items():
        if name == 'xgb':
            import xgboost as xgb
            dm = xgb.DMatrix(X_test, feature_names=feat_cols)
            preds.append(m.predict(dm))
        elif name == 'lgb':
            preds.append(m.predict(X_test))
        elif name == 'cb':
            preds.append(m.predict_proba(X_test))
    model_probs = np.mean(preds, axis=0)  # (n, 3)
    # Bookmaker-implied probabilities with the overround normalised away
    margin = 1/test_df.odds_ms_h.values + 1/test_df.odds_ms_d.values + 1/test_df.odds_ms_a.values
    impl = np.column_stack([
        (1/test_df.odds_ms_h.values)/margin,
        (1/test_df.odds_ms_d.values)/margin,
        (1/test_df.odds_ms_a.values)/margin,
    ])
    # Pull the columns the loop needs out of the frame once.
    home_rest = test_df.home_days_rest.to_numpy()
    away_rest = test_df.away_days_rest.to_numpy()
    labels = test_df.label_ms.to_numpy()
    odds_1x2 = test_df[['odds_ms_h', 'odds_ms_d', 'odds_ms_a']].to_numpy()
    rest_rules = [(14, 5), (10, 5), (7, 5), (5, 5)]
    combo_bets = 0
    combo_wins = 0
    combo_pnl = 0
    for j in range(len(test_df)):
        if pd.isna(home_rest[j]) or pd.isna(away_rest[j]):
            continue
        # The rules are nested (rest>14 implies rest>10, >7, >5). Looping over
        # them and betting inside the loop counted one match up to four
        # times, so a match now qualifies once if ANY rule matches.
        if not any(home_rest[j] > hr and away_rest[j] < ar for hr, ar in rest_rules):
            continue
        for cls in [0, 2]:
            odds_val = odds_1x2[j, cls]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue
            # DOUBLE FILTER: rest rule + model agrees (model_prob > implied)
            # by at least 3 points. Written as `not (edge >= ...)` so a NaN
            # model probability is rejected instead of slipping through.
            edge = model_probs[j, cls] - impl[j, cls]
            if not (edge >= 0.03):
                continue
            won = (labels[j] == cls)
            pnl = 10 * (odds_val - 1) if won else -10
            combo_bets += 1
            if won:
                combo_wins += 1
            combo_pnl += pnl
    if combo_bets > 0:
        roi = combo_pnl / (combo_bets * 10) * 100
        print(f" Bets: {combo_bets}")
        print(f" Wins: {combo_wins} ({combo_wins/combo_bets*100:.1f}%)")
        print(f" PnL: {combo_pnl:+.0f}")
        print(f" ROI: {roi:+.1f}%")
    else:
        print(" No combined bets triggered")
@@ -0,0 +1,312 @@
"""
V28 — CONDITIONAL FREQUENCY ENGINE
====================================
User's strategy automated at scale:
For every match (e.g. Beşiktaş vs Konya):
1. Look at Beşiktaş's HOME history when their MS1 odds were in the same band (e.g. 1.30-1.40)
→ What % of those matches ended OU 1.5 over? OU 2.5 over? MS1?
2. Look at Konya's AWAY history when their MS2 odds were in the same band (e.g. 2.00-2.20)
→ Same questions
3. COMBINE both signals:
→ If BOTH teams historically produce >80% OU1.5 over at these odds → BET OU1.5 over
→ This is the user's exact Excel strategy, now running on 104K matches
CRITICAL: Only uses PAST matches for each prediction (no future leakage)
"""
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# ─── Load Data ───
print("Loading data...")
df = pd.read_csv('data/training_data_v27.csv', low_memory=False)
# Identifier / name / timestamp columns are left as read; the rest is
# coerced to numeric (unparseable cells become NaN).
KEEP_STR = ['match_id', 'league_name', 'home_team', 'away_team',
            'home_team_id', 'away_team_id', 'league_id', 'mst_utc']
for col_name in [name for name in df.columns if name not in KEEP_STR]:
    df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

# Put matches in chronological order when a kickoff timestamp is available.
if 'mst_utc' in df.columns:
    df['mst_utc'] = pd.to_datetime(df['mst_utc'], errors='coerce')
    df = df.sort_values('mst_utc').reset_index(drop=True)

# Filter: need valid odds + scores
required = ['odds_ms_h', 'odds_ms_a', 'score_home', 'score_away',
            'home_team_id', 'away_team_id', 'label_ms']
df = df.dropna(subset=required)

# Compute actual goal labels
df['total_goals'] = df['score_home'] + df['score_away']
for goal_line, flag_col in ((1.5, 'ou15_actual'), (2.5, 'ou25_actual'), (3.5, 'ou35_actual')):
    df[flag_col] = (df['total_goals'] > goal_line).astype(int)
both_scored = (df['score_home'] > 0) & (df['score_away'] > 0)
df['btts_actual'] = both_scored.astype(int)
df['ms_result'] = df['label_ms'].astype(int)  # 0=H, 1=D, 2=A

N = len(df)
print(f"Total matches: {N}")
print(f"Unique home teams: {df.home_team_id.nunique()}")
print(f"Unique away teams: {df.away_team_id.nunique()}")
# ─── Odds Band Helper ───
def get_odds_band(odds, band_width=0.10):
    """Return the ``(lower, upper)`` band of width *band_width* containing *odds*.

    The band is found by flooring, not by rounding to the nearest band:
    1.35 → (1.30, 1.40) and 1.90 → (1.90, 2.00). Both bounds are rounded to
    two decimals.
    """
    # Binary floating point makes 1.9 / 0.1 evaluate to 18.999999999999996;
    # flooring that directly would drop 1.90 into the 1.80-1.90 band. Rounding
    # the quotient to 9 decimals first removes the representation error.
    steps = np.floor(round(odds / band_width, 9))
    lower = round(steps * band_width, 2)
    upper = round(lower + band_width, 2)
    return (lower, upper)
def get_odds_band_wide(odds):
    """Map *odds* onto one of seven coarse, fixed bands.

    E.g. 1.35 → (1.01, 1.50) and 2.10 → (2.00, 2.50). Anything at or above
    6.00, and any value that fails every comparison (such as NaN), lands in
    the (6.00, 20.00) band.
    """
    # (exclusive upper limit, band returned below that limit), ascending.
    ladder = (
        (1.50, (1.01, 1.50)),
        (2.00, (1.50, 2.00)),
        (2.50, (2.00, 2.50)),
        (3.00, (2.50, 3.00)),
        (4.00, (3.00, 4.00)),
        (6.00, (4.00, 6.00)),
    )
    for limit, band in ladder:
        if odds < limit:
            return band
    return (6.00, 20.00)
# ─── Build Conditional Frequency Lookup (Expanding Window) ───
print("\nBuilding conditional frequency features (expanding window)...")
# Each match's features are computed from the ledgers BEFORE the match itself
# is appended, so only earlier rows contribute.
MIN_MATCHES = 5  # minimum historical matches to generate a signal
# Feature matrix layout (column index = position in this list)
feat_names = [
    'home_ou15_rate_at_band', 'home_ou25_rate_at_band', 'home_ou35_rate_at_band',
    'home_btts_rate_at_band', 'home_win_rate_at_band', 'home_n_at_band',
    'away_ou15_rate_at_band', 'away_ou25_rate_at_band', 'away_ou35_rate_at_band',
    'away_btts_rate_at_band', 'away_win_rate_at_band', 'away_n_at_band',
    'combined_ou15', 'combined_ou25', 'combined_ou35', 'combined_btts',
    'home_goals_at_band', 'away_goals_at_band', 'combined_goals_at_band',
    'home_conceded_at_band', 'away_conceded_at_band',
]
features = np.full((N, len(feat_names)), np.nan)
# Ledger entry: (odds, ou15, ou25, ou35, btts, won, goals_scored, goals_conceded)
home_history = defaultdict(list)  # team performances when playing HOME
away_history = defaultdict(list)  # team performances when playing AWAY

# Target columns for ledger fields 1..7 followed by the sample count.
_HOME_SLOTS = [0, 1, 2, 3, 4, 16, 19, 5]
_AWAY_SLOTS = [6, 7, 8, 9, 10, 17, 20, 11]


def _band_summary(ledger, band):
    """Means of ledger fields 1..7 over entries inside *band*, plus the count.

    Returns None when fewer than MIN_MATCHES entries fall inside the band.
    """
    low, high = band
    inside = [entry for entry in ledger if low <= entry[0] < high]
    if len(inside) < MIN_MATCHES:
        return None
    columns = list(zip(*inside))
    return [np.mean(columns[k]) for k in range(1, 8)] + [len(inside)]


for i in range(N):
    row = df.iloc[i]
    ht_id, at_id = row.home_team_id, row.away_team_id
    h_odds, a_odds = row.odds_ms_h, row.odds_ms_a
    if pd.isna(h_odds) or pd.isna(a_odds):
        continue

    # ── Historical performance of each side inside its current odds band ──
    home_stats = _band_summary(home_history[ht_id], get_odds_band_wide(h_odds))
    if home_stats is not None:
        features[i, _HOME_SLOTS] = home_stats
    away_stats = _band_summary(away_history[at_id], get_odds_band_wide(a_odds))
    if away_stats is not None:
        features[i, _AWAY_SLOTS] = away_stats

    # ── Combined signals (only when both sides produced a signal) ──
    if not (np.isnan(features[i, 0]) or np.isnan(features[i, 6])):
        features[i, 12:16] = (features[i, 0:4] + features[i, 6:10]) / 2
        features[i, 18] = features[i, 16] + features[i, 17]  # combined goals

    # ── Add THIS match to history (for future lookups) ──
    goal_flags = (
        int(row.total_goals > 1.5),
        int(row.total_goals > 2.5),
        int(row.total_goals > 3.5),
        int(row.score_home > 0 and row.score_away > 0),
    )
    home_history[ht_id].append(
        (h_odds,) + goal_flags + (int(row.label_ms == 0), row.score_home, row.score_away))
    away_history[at_id].append(
        (a_odds,) + goal_flags + (int(row.label_ms == 2), row.score_away, row.score_home))

    if (i+1) % 20000 == 0:
        valid = np.sum(~np.isnan(features[:i+1, 12]))
        print(f" Processed {i+1}/{N} matches, {valid} with combined signals")

# Count valid features
valid_mask = ~np.isnan(features[:, 12])
print(f"\nMatches with combined conditional signals: {valid_mask.sum()} / {N}")
# ─── BACKTEST: Walk-Forward ───
print("\n" + "="*70)
print(" CONDITIONAL FREQUENCY BACKTEST")
print("="*70)
# Test on the last 30% of matches; the first 70% only warms up the per-team
# histories (early matches have sparse data).
test_start = int(N * 0.7)
test_idx = range(test_start, N)
test_valid = [i for i in test_idx if valid_mask[i]]
print(f"Test window: matches {test_start}-{N} ({len(test_valid)} with signals)")
# Strategy: bet 1 unit on a market when its combined signal reaches a threshold.
# Tuple layout: (display name, feature name, feature column, label column, odds column)
markets = [
    ('OU 1.5 Over', 'combined_ou15', 12, 'ou15_actual', 'odds_ou15_o'),
    ('OU 2.5 Over', 'combined_ou25', 13, 'ou25_actual', 'odds_ou25_o'),
    ('OU 3.5 Over', 'combined_ou35', 14, 'ou35_actual', 'odds_ou35_o'),
    ('BTTS Yes', 'combined_btts', 15, 'btts_actual', 'odds_btts_y'),
]
for market_name, feat_key, feat_idx, label_col, odds_col in markets:
    print(f"\n ── {market_name} ──")
    if odds_col not in df.columns:
        print(f" No odds column '{odds_col}', skipping")
        continue
    # Positional arrays avoid building a full row Series per lookup.
    market_odds = df[odds_col].to_numpy()
    market_labels = df[label_col].to_numpy()
    for threshold in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90]:
        bets = 0
        wins = 0
        pnl = 0.0
        for i in test_valid:
            signal = features[i, feat_idx]
            if np.isnan(signal) or signal < threshold:
                continue
            odds_val = market_odds[i]
            if pd.isna(odds_val) or odds_val < 1.05:
                continue
            actual = market_labels[i]
            if pd.isna(actual):
                continue
            bets += 1
            if actual == 1:
                wins += 1
                pnl += odds_val - 1
            else:
                pnl -= 1
        if bets >= 20:  # ignore thresholds with too few bets to be meaningful
            roi = pnl / bets * 100
            hit = wins / bets * 100
            marker = " *** PROFITABLE ***" if roi > 0 else ""
            print(f" Threshold>{threshold:.2f}: {bets:5d} bets, "
                  f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")

# Also test MS (1X2) market
print(f"\n ── Maç Sonucu (1X2) ──")
# Home win when home_win_rate_at_band > X AND away team loses often at that band
home_odds = df['odds_ms_h'].to_numpy()
ms_labels = df['label_ms'].to_numpy()
for threshold in [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]:
    bets = wins = 0
    pnl = 0.0
    for i in test_valid:
        h_wr = features[i, 4]  # home win rate at band
        # 1 - away win rate; counts draws as a non-win for the away side.
        a_lr = 1 - features[i, 10]  # NaN propagates when the rate is missing
        if np.isnan(h_wr) or np.isnan(a_lr):
            continue
        combined = (h_wr + a_lr) / 2
        if combined < threshold:
            continue
        odds_val = home_odds[i]
        if pd.isna(odds_val) or odds_val < 1.10 or odds_val > 5.0:
            continue
        bets += 1
        if ms_labels[i] == 0:
            wins += 1
            pnl += odds_val - 1
        else:
            pnl -= 1
    if bets >= 20:
        roi = pnl / bets * 100
        hit = wins / bets * 100
        marker = " *** PROFITABLE ***" if roi > 0 else ""
        print(f" Home win comb>{threshold:.2f}: {bets:5d} bets, "
              f"hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
# ─── DEEP DIVE: Best performing niches ───
print("\n" + "="*70)
print(" DEEP DIVE: Combined OU15 + Odds Value Filter")
print("="*70)
# The user's strategy: high confidence + the odds must pay enough
has_ou15_odds = 'odds_ou15_o' in df.columns
for threshold in [0.75, 0.80, 0.85, 0.90]:
    for min_odds in [1.10, 1.20, 1.30, 1.40]:
        bets = 0
        wins = 0
        pnl = 0.0
        for i in test_valid:
            signal = features[i, 12]  # combined ou15
            # `not (x >= t)` rejects both NaN and below-threshold signals.
            if not (signal >= threshold):
                continue
            match = df.iloc[i]
            odds_val = match.get('odds_ou15_o', np.nan) if has_ou15_odds else np.nan
            if pd.isna(odds_val) or odds_val < min_odds:
                continue
            bets += 1
            if match.ou15_actual == 1:
                wins += 1
                pnl += odds_val - 1
            else:
                pnl -= 1
        if bets < 30:
            continue
        roi = pnl / bets * 100
        hit = wins / bets * 100
        if roi > -5:  # show near-profitable too
            marker = " *** PROFITABLE ***" if roi > 0 else ""
            print(f" OU15 sig>{threshold:.2f} odds>{min_odds}: "
                  f"{bets:5d} bets, hit={hit:.1f}%, ROI={roi:+.1f}%{marker}")
# ─── Additional: Goal expectation accuracy ───
print("\n" + "="*70)
print(" GOAL PREDICTION ACCURACY")
print("="*70)
# Test matches that have a combined expected-goals value (feature column 18)
valid_goals = [i for i in test_valid if not np.isnan(features[i, 18])]
if valid_goals:
    from sklearn.metrics import mean_absolute_error
    pred_goals = [features[i, 18] for i in valid_goals]
    actual_goals = [df.iloc[i].total_goals for i in valid_goals]
    mae = mean_absolute_error(actual_goals, pred_goals)
    corr = np.corrcoef(pred_goals, actual_goals)[0, 1]
    print(f" Combined goal prediction MAE: {mae:.3f}")
    print(f" Correlation: {corr:.4f}")
    print(f" Avg predicted: {np.mean(pred_goals):.2f}, Avg actual: {np.mean(actual_goals):.2f}")
    # Bucket analysis: realised goals per predicted-goals bucket
    print("\n Goal prediction buckets:")
    bucket_edges = [(0, 1.5), (1.5, 2.0), (2.0, 2.5), (2.5, 3.0), (3.0, 3.5), (3.5, 5.0)]
    for low, high in bucket_edges:
        bucket = [i for i, pg in zip(valid_goals, pred_goals) if low <= pg < high]
        if len(bucket) < 20:
            continue
        avg_actual = np.mean([df.iloc[i].total_goals for i in bucket])
        ou25_rate = np.mean([df.iloc[i].ou25_actual for i in bucket])
        print(f" Predicted {low:.1f}-{high:.1f}: n={len(bucket)}, "
              f"actual_avg={avg_actual:.2f}, OU25%={ou25_rate*100:.1f}%")
print("\nDone!")
+5 -5
View File
@@ -1071,13 +1071,13 @@ class FeatureExtractor:
for mst, poss, sot, total_shots, corners, team_goals in rows:
if poss and poss > 0:
poss_sum += poss
poss_sum += float(poss)
poss_count += 1
sot_sum += sot or 0
shots_sum += total_shots or 0
corners_sum += corners or 0
sot_sum += float(sot or 0)
shots_sum += float(total_shots or 0)
corners_sum += float(corners or 0)
goals_scored += team_goals or 0
goals_scored += float(team_goals or 0)
return {
"possession": (poss_sum / poss_count / 100) if poss_count > 0 else 0.50,
@@ -0,0 +1,305 @@
"""
V27 Training Data Extraction - Value Sniper
Extends V25 to ALL matches with odds (~104K).
Adds rolling window, league quality, time, H2H, strength features.
Usage: python3 scripts/extract_training_data_v27.py
"""
import os, sys, csv, time
from collections import defaultdict
AI_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, AI_DIR)
from scripts.extract_training_data import (
BatchDataLoader as V25Loader,
FeatureExtractor as V25Extractor,
FEATURE_COLS as V25_COLS,
get_conn,
)
from features.rolling_features import (
calc_rolling_features, calc_league_quality,
calc_time_features, calc_advanced_h2h, calc_strength_diff,
)
# Destination CSV for the extracted training rows; its directory is created
# at import time if it does not exist yet.
OUTPUT = os.path.join(AI_DIR, "data", "training_data_v27.csv")
os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
# Names of the columns V27 adds on top of the V25 feature set.
# NOTE(review): "away_rolling20_goals" / "away_rolling20_conceded" are not
# listed, although V27Extractor._extract_one writes rolling20 values for both
# the home and the away prefix. main() writes the CSV with
# extrasaction='ignore', so those two away values are dropped from the output
# — confirm whether that asymmetry is intended.
V27_NEW = [
"home_rolling5_goals","home_rolling5_conceded",
"home_rolling10_goals","home_rolling10_conceded",
"home_rolling20_goals","home_rolling20_conceded",
"away_rolling5_goals","away_rolling5_conceded",
"away_rolling10_goals","away_rolling10_conceded",
"home_rolling5_cs","away_rolling5_cs",
"home_venue_goals","home_venue_conceded",
"away_venue_goals","away_venue_conceded",
"home_goal_trend","away_goal_trend",
"league_home_win_rate","league_draw_rate",
"league_btts_rate","league_ou25_rate",
"league_reliability_score",
"home_days_rest","away_days_rest",
"match_month","is_season_start","is_season_end",
"h2h_home_goals_avg","h2h_away_goals_avg",
"h2h_recent_trend","h2h_venue_advantage",
"attack_vs_defense_home","attack_vs_defense_away",
"xg_diff","form_momentum_interaction",
"elo_form_consistency","upset_x_elo_gap",
]
# Full CSV header: V25 columns first, then the V27 additions.
ALL_COLS = V25_COLS + V27_NEW
class V27Loader(V25Loader):
    """Load ALL finished football matches that have odds, not just top leagues.

    Overrides the bulk-loading queries of the V25 loader and fills
    ``matches``, ``odds_cache``, ``league_stats_cache``, ``squad_cache`` and
    ``cards_cache`` (presumably initialised by the base class — verify), plus
    the V27-only ``league_matches_cache``.
    """

    # Lower-cased category name → odds-key prefix for the 1X2-style markets.
    _RESULT_MARKETS = {
        'maç sonucu': 'ms',
        '1. yarı sonucu': 'ht_ms',
    }
    # Normalised selection → outcome suffix; the draw appears as '0' or 'X'.
    _RESULT_SELECTIONS = {'1': 'h', '0': 'd', 'x': 'd', '2': 'a'}
    # Lower-cased category name → odds-key prefix for the under/over markets.
    _TOTALS_MARKETS = {
        '2,5 alt/üst': 'ou25',
        '1,5 alt/üst': 'ou15',
        '3,5 alt/üst': 'ou35',
        '0,5 alt/üst': 'ou05',
        '1. yarı 0,5 alt/üst': 'ht_ou05',
        '1. yarı 1,5 alt/üst': 'ht_ou15',
    }

    def __init__(self, conn):
        # The empty list is the base loader's second positional argument
        # (presumably its league filter — verify against V25Loader).
        super().__init__(conn, [])
        self.league_matches_cache = {}

    def _load_matches(self):
        """Fetch every finished football match with at least one odds category, oldest first."""
        self.cur.execute("""
            SELECT m.id, m.home_team_id, m.away_team_id,
            m.score_home, m.score_away,
            m.ht_score_home, m.ht_score_away,
            m.mst_utc, m.league_id,
            ht.name, at.name, l.name
            FROM matches m
            JOIN teams ht ON m.home_team_id = ht.id
            JOIN teams at ON m.away_team_id = at.id
            JOIN leagues l ON m.league_id = l.id
            WHERE m.status='FT' AND m.score_home IS NOT NULL
            AND m.sport='football'
            AND EXISTS(SELECT 1 FROM odd_categories oc WHERE oc.match_id=m.id)
            ORDER BY m.mst_utc ASC
        """)
        self.matches = self.cur.fetchall()

    def _load_odds(self):
        """Fill ``odds_cache[match_id]`` with the recognised markets' prices.

        Rows with a missing, non-numeric or non-positive price, or with an
        empty category or selection name, are skipped.
        """
        self.cur.execute("""
            SELECT oc.match_id, oc.name, os.name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id=oc.db_id
            JOIN matches m ON oc.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
        """)
        for mid, cat, sel, val in self.cur.fetchall():
            # Only the numeric conversion can raise the errors being ignored,
            # so keep the try body down to that single expression.
            try:
                v = float(val) if val else 0
            except (ValueError, TypeError):
                continue
            if v <= 0 or not cat or not sel:
                continue
            o = self.odds_cache.setdefault(mid, {})
            c = cat.lower().strip()
            s = sel.lower().strip()
            if c in self._RESULT_MARKETS:
                # Match on the normalised selection, as the other markets do,
                # so padded values or a lower-case 'x' are still recognised.
                outcome = self._RESULT_SELECTIONS.get(s)
                if outcome is not None:
                    o[f"{self._RESULT_MARKETS[c]}_{outcome}"] = v
            elif c == 'karşılıklı gol':
                if 'var' in s:
                    o['btts_y'] = v
                elif 'yok' in s:
                    o['btts_n'] = v
            elif c in self._TOTALS_MARKETS:
                prefix = self._TOTALS_MARKETS[c]
                if 'alt' in s:
                    o[f"{prefix}_u"] = v
                elif 'üst' in s:
                    o[f"{prefix}_o"] = v

    def _load_league_stats(self):
        """Cache per-league average goals, 0-0 rate and match count."""
        self.cur.execute("""
            SELECT league_id,
            AVG(score_home+score_away), AVG(CASE WHEN score_home=0 AND score_away=0 THEN 1.0 ELSE 0.0 END),
            COUNT(*)
            FROM matches WHERE status='FT' AND score_home IS NOT NULL AND sport='football'
            GROUP BY league_id
        """)
        for lid, ag, zr, cnt in self.cur.fetchall():
            # NOTE(review): the truthiness tests also replace a genuine 0
            # (e.g. a league without any 0-0 match) by the fallback value —
            # confirm whether that is intended before switching to
            # `is not None`.
            self.league_stats_cache[lid] = {
                "avg_goals": float(ag) if ag else 2.5,
                "zero_rate": float(zr) if zr else 0.07,
                "match_count": cnt,
            }

    def _load_squad_data(self):
        """Build ``squad_cache[(match_id, team_id)]`` from line-ups and goal events.

        NOTE(review): goals and assists are counted from the events of the
        very match the row describes, and "key players" are derived from all
        finished matches with no date bound, so these values may contain
        information that is unavailable before kickoff — confirm how
        squad_cache is consumed.
        """
        # Line-up size per match/team: starters, squad total, starting forwards.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id,
            COUNT(*) FILTER(WHERE mpp.is_starting=true),
            COUNT(*),
            COUNT(*) FILTER(WHERE mpp.is_starting=true
            AND LOWER(COALESCE(mpp.position::TEXT,''))~'(forward|fwd|forvet|striker)')
            FROM match_player_participation mpp
            JOIN matches m ON mpp.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpp.match_id, mpp.team_id
        """)
        part = {}
        for mid, tid, st, tot, fwd in self.cur.fetchall():
            part[(mid, tid)] = {'starting_count': st or 0, 'total_squad': tot or 0, 'fwd_count': fwd or 0}

        # Goal events per match/team (missed penalties excluded).
        self.cur.execute("""
            SELECT mpe.match_id, mpe.team_id,
            COUNT(*) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'),
            COUNT(DISTINCT mpe.assist_player_id) FILTER(WHERE mpe.event_type='goal' AND mpe.assist_player_id IS NOT NULL),
            COUNT(DISTINCT mpe.player_id) FILTER(WHERE mpe.event_type='goal' AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%')
            FROM match_player_events mpe
            JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football'
            GROUP BY mpe.match_id, mpe.team_id
        """)
        evts = {}
        for mid, tid, g, a, sc in self.cur.fetchall():
            evts[(mid, tid)] = {'goals': g or 0, 'assists': a or 0, 'unique_scorers': sc or 0}

        # Key players: anyone with at least 3 goals for the team.
        self.cur.execute("""
            SELECT mpe.team_id, mpe.player_id, COUNT(*)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type='goal'
            AND COALESCE(mpe.event_subtype,'') NOT ILIKE '%%penaltı kaçırma%%'
            GROUP BY mpe.team_id, mpe.player_id HAVING COUNT(*)>=3
        """)
        kp_by_team = defaultdict(set)
        for tid, pid, _ in self.cur.fetchall():
            kp_by_team[tid].add(pid)

        # Starting players per match/team.
        self.cur.execute("""
            SELECT mpp.match_id, mpp.team_id, mpp.player_id
            FROM match_player_participation mpp JOIN matches m ON mpp.match_id=m.id
            WHERE mpp.is_starting=true AND m.status='FT' AND m.sport='football'
        """)
        starters = defaultdict(list)
        for mid, tid, pid in self.cur.fetchall():
            starters[(mid, tid)].append(pid)

        for key in set(part) | set(evts):
            mid, tid = key
            p = part.get(key, {'starting_count': 0, 'total_squad': 0, 'fwd_count': 0})
            e = evts.get(key, {'goals': 0, 'assists': 0, 'unique_scorers': 0})
            s = starters.get(key, [])
            team_key_players = kp_by_team.get(tid, set())
            kp_in = sum(1 for x in s if x in team_key_players)
            kp_tot = len(team_key_players)
            kp_miss = max(0, kp_tot - kp_in)
            # Weighted sum of line-up size, goal events, key players and forwards.
            sq = p['starting_count']*0.3 + e['goals']*2.0 + e['assists']*1.0 + kp_in*3.0 + p['fwd_count']*1.5
            # Share of the team's key players missing from the line-up, capped at 1.
            mi = min(kp_miss/max(kp_tot, 1), 1.0)
            self.squad_cache[key] = {'squad_quality': sq, 'key_players': kp_in,
                                     'missing_impact': mi, 'goals_form': e['goals']}

    def _load_cards_data(self):
        """Cache a card weight per match: red cards count 2, every other card event 1."""
        self.cur.execute("""
            SELECT mpe.match_id,
            SUM(CASE WHEN mpe.event_type::text LIKE '%%yellow_card%%' THEN 1
            WHEN mpe.event_type::text LIKE '%%red_card%%' THEN 2 ELSE 1 END)
            FROM match_player_events mpe JOIN matches m ON mpe.match_id=m.id
            WHERE m.status='FT' AND m.sport='football' AND mpe.event_type::text LIKE '%%card%%'
            GROUP BY mpe.match_id
        """)
        for mid, cw in self.cur.fetchall():
            self.cards_cache[mid] = float(cw) if cw else 0.0

    def load_league_matches(self):
        """Group the loaded matches by league as ``(mst_utc, None, score_home, score_away, None)``."""
        for m in self.matches:
            # Row layout from _load_matches: m[3]/m[4] scores, m[7] kickoff, m[8] league id.
            self.league_matches_cache.setdefault(m[8], []).append((m[7], None, m[3], m[4], None))
class V27Extractor(V25Extractor):
    """Adds V27 features on top of V25."""

    def _extract_one(self, mid, hid, aid, sh, sa, hth, hta, mst, lid,
                     hn, an, ln):
        """Extend the V25 row of one match with the V27 feature groups.

        Returns None when the V25 extractor yields no row for the match.
        """
        row = super()._extract_one(mid, hid, aid, sh, sa, hth, hta, mst, lid, hn, an, ln)
        if not row:
            return None

        home_matches = self.loader.team_matches.get(hid, [])
        away_matches = self.loader.team_matches.get(aid, [])

        # Rolling form: (row-key suffix, key in calc_rolling_features' result)
        rolling_keys = (
            ("rolling5_goals", "rolling5_goals_avg"),
            ("rolling5_conceded", "rolling5_conceded_avg"),
            ("rolling10_goals", "rolling10_goals_avg"),
            ("rolling10_conceded", "rolling10_conceded_avg"),
            ("rolling20_goals", "rolling20_goals_avg"),
            ("rolling20_conceded", "rolling20_conceded_avg"),
            ("rolling5_cs", "rolling5_clean_sheets"),
            ("venue_goals", "venue_goals_avg"),
            ("venue_conceded", "venue_conceded_avg"),
            ("goal_trend", "goal_trend"),
        )
        home_rolling = calc_rolling_features(home_matches, mst, True)
        away_rolling = calc_rolling_features(away_matches, mst, False)
        for side, rolling in (("home", home_rolling), ("away", away_rolling)):
            for suffix, source_key in rolling_keys:
                row[f"{side}_{suffix}"] = rolling[source_key]

        # League quality from league matches strictly before this kickoff
        earlier = [x for x in self.loader.league_matches_cache.get(lid, []) if x[0] < mst]
        row.update(calc_league_quality(earlier))

        # Rest days per side; calendar flags are taken from the home side's result
        home_time = calc_time_features(home_matches, mst)
        away_time = calc_time_features(away_matches, mst)
        row["home_days_rest"] = home_time["days_rest"]
        row["away_days_rest"] = away_time["days_rest"]
        for calendar_key in ("match_month", "is_season_start", "is_season_end"):
            row[calendar_key] = home_time[calendar_key]

        # Head-to-head features
        row.update(calc_advanced_h2h(home_matches, hid, aid, mst))

        def side_profile(side):
            # Scoring profile built from the V25 columns, with neutral defaults.
            return {
                "goals_avg": row.get(f"{side}_goals_avg", 1.3),
                "conceded_avg": row.get(f"{side}_conceded_avg", 1.2),
                "scoring_rate": row.get(f"{side}_scoring_rate", 0.75),
            }

        strength = calc_strength_diff(
            side_profile("home"),
            side_profile("away"),
            self.elo_ratings[hid], self.elo_ratings[aid],
            row.get("home_momentum_score", 0.5), row.get("away_momentum_score", 0.5),
            row.get("upset_potential", 0.0),
        )
        row.update(strength)
        return row
def main():
    """Extract the V27 training rows, write them to OUTPUT and print a summary.

    The database connection is closed on every exit path, including the
    early return when no rows are produced and any exception raised midway.
    """
    print("🚀 V27 Value Sniper — Training Data Extraction")
    print("="*60)
    t0 = time.time()
    conn = get_conn()
    try:
        print("\n📦 Loading ALL odds-bearing matches...")
        loader = V27Loader(conn)
        loader.load_all()
        loader.load_league_matches()
        print(f" Matches: {len(loader.matches)}")
        print(f" Leagues: {len(loader.league_stats_cache)}")
        print(f" Odds: {len(loader.odds_cache)}")
        ext = V27Extractor(conn, loader)
        rows = ext.extract_all()
        if not rows:
            print("❌ No data!")
            return
        print(f"\n💾 Writing {len(rows)} rows...")
        with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
            # Row keys that are not in ALL_COLS are silently dropped.
            w = csv.DictWriter(f, fieldnames=ALL_COLS, extrasaction='ignore')
            w.writeheader()
            w.writerows(rows)
        n = len(rows)
        # `or 0` keeps the comparison valid when the key holds None.
        wo = sum(1 for r in rows if (r.get("odds_ms_h") or 0) > 0)
        md = defaultdict(int)
        for r in rows:
            md[r["label_ms"]] += 1
        print(f"\n📊 Summary:")
        print(f" Rows: {n}")
        print(f" With odds: {wo} ({wo/n*100:.1f}%)")
        print(f" Features: {len(ALL_COLS)} ({len(V25_COLS)} V25 + {len(V27_NEW)} new)")
        print(f" MS: H={md[0]/n*100:.1f}% D={md[1]/n*100:.1f}% A={md[2]/n*100:.1f}%")
        print(f" Time: {(time.time()-t0)/60:.1f}min")
        print(f"\n✅ Done! → {OUTPUT}")
    finally:
        conn.close()
if __name__=="__main__":
main()
+317
View File
@@ -0,0 +1,317 @@
"""
Strategy Generator — Senin Excel mantığını DB üzerinde otomatize eder.
Mantık:
1. Ev sahibi takım X, evinde oran bandı Y'de oynadığında → OU1.5/OU2.5/BTTS oranları
2. Deplasman takım Z, deplasmanda oran bandı W'de oynadığında → OU1.5/OU2.5/BTTS oranları
3. İkisi de yüksekse → STRATEJİ ÜRET
Çıktı: Her maç için hangi bahis oynanabilir, neden, ve geçmiş başarı oranı
"""
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime
# DB connection.
# Settings are read from the environment (DB_HOST, DB_PORT, DB_NAME, DB_USER,
# DB_PASSWORD) so credentials need not live in the source tree. The fallbacks
# keep the script runnable exactly as before.
# SECURITY: the fallback password is committed to version control — rotate it
# and drop the default once DB_PASSWORD is set wherever this script runs.
import os

conn = psycopg2.connect(
    host=os.environ.get("DB_HOST", "localhost"),
    port=int(os.environ.get("DB_PORT", "15432")),
    dbname=os.environ.get("DB_NAME", "boilerplate_db"),
    user=os.environ.get("DB_USER", "suggestbet"),
    password=os.environ.get("DB_PASSWORD", "SuGGesT2026SecuRe"),
)
print("=" * 70)
print(" STRATEGY GENERATOR — Veritabanından Strateji Üretimi")
print("=" * 70)
# 1. Tüm biten maçları, takım adları ve MS oranlarıyla çek
query = """
SELECT
m.id as match_id,
m.home_team_id,
m.away_team_id,
m.league_id,
m.score_home,
m.score_away,
m.mst_utc,
ht.name as home_team,
at.name as away_team,
l.name as league_name
FROM matches m
JOIN teams ht ON m.home_team_id = ht.id
JOIN teams at ON m.away_team_id = at.id
JOIN leagues l ON m.league_id = l.id
WHERE m.status = 'FT'
AND m.score_home IS NOT NULL
ORDER BY m.mst_utc ASC
"""
df = pd.read_sql(query, conn)
print(f"\nToplam biten maç: {len(df):,}")
# 2. Tüm oranları çek (MS, OU25, BTTS, OU15)
odds_query = """
SELECT
oc.match_id,
oc.name as market,
os.name as selection,
CAST(os.odd_value AS DECIMAL) as odds
FROM odd_categories oc
JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
WHERE oc.name IN (
'Maç Sonucu',
'2,5 Alt/Üst',
'1,5 Alt/Üst',
'3,5 Alt/Üst',
'Karşılıklı Gol'
)
"""
odds_df = pd.read_sql(odds_query, conn)
print(f"Toplam oran kaydı: {len(odds_df):,}")
# Pivot: turn each match's odds into columns
def get_odds(match_id, market, selection):
    """Return the first price in ``odds_df`` for the given triple as a float, or None.

    Scans the whole frame on every call.
    """
    same_match = odds_df.match_id == match_id
    same_market = odds_df.market == market
    same_selection = odds_df.selection == selection
    hits = odds_df.loc[same_match & same_market & same_selection, 'odds']
    if len(hits) == 0:
        return None
    return float(hits.iloc[0])
# More efficient: build the odds lookup dictionary once.
print("Oran lookup oluşturuluyor...")
# Zipping the columns avoids DataFrame.iterrows(), which builds a Series per
# row. When a (match, market, selection) triple occurs more than once, the
# last row wins, as before.
odds_lookup = {
    (match_id, market, selection): float(price)
    for match_id, market, selection, price in zip(
        odds_df.match_id, odds_df.market, odds_df.selection, odds_df.odds)
}


def get_o(mid, market, sel):
    """Return the cached price for (match id, market, selection), or None if absent."""
    return odds_lookup.get((mid, market, sel))
# 3. Attach the odds to every match
print("Maçlara oranlar ekleniyor...")


def _first_price(mid, market, selections):
    """Return the first available price among *selections* for a match, else None."""
    for sel in selections:
        price = get_o(mid, market, sel)
        if price is not None:
            return price
    return None


# (target column, market name, accepted selection names)
# The draw is looked up as '0' and then 'X': the training-data loader accepts
# both spellings for the 1X2 draw selection.
_ODDS_COLUMNS = [
    ('odds_ms_h', 'Maç Sonucu', ('1',)),
    ('odds_ms_a', 'Maç Sonucu', ('2',)),
    ('odds_ms_d', 'Maç Sonucu', ('0', 'X')),
    ('odds_ou25_o', '2,5 Alt/Üst', ('Üst',)),
    ('odds_ou25_u', '2,5 Alt/Üst', ('Alt',)),
    ('odds_ou15_o', '1,5 Alt/Üst', ('Üst',)),
    ('odds_ou15_u', '1,5 Alt/Üst', ('Alt',)),
    ('odds_ou35_o', '3,5 Alt/Üst', ('Üst',)),
    ('odds_ou35_u', '3,5 Alt/Üst', ('Alt',)),
    ('odds_btts_y', 'Karşılıklı Gol', ('Var',)),
    ('odds_btts_n', 'Karşılıklı Gol', ('Yok',)),
]
for _column, _market, _selections in _ODDS_COLUMNS:
    df[_column] = df.match_id.map(
        lambda mid, market=_market, selections=_selections: _first_price(mid, market, selections))

# Compute the realised outcomes
df['total_goals'] = df.score_home + df.score_away
df['ou15'] = (df.total_goals > 1).astype(int)
df['ou25'] = (df.total_goals > 2).astype(int)
df['ou35'] = (df.total_goals > 3).astype(int)
df['btts'] = ((df.score_home > 0) & (df.score_away > 0)).astype(int)
print(f"Oranı olan maç sayısı: {df.odds_ms_h.notna().sum():,}")
# 4. ODDS BAND function
def odds_band(odds):
    """Return the label of the fixed odds band containing *odds*.

    Missing values (None / NaN) give None; anything at or above 6.00 is
    labelled '6.00+'.
    """
    if pd.isna(odds):
        return None
    # (exclusive upper limit, label), ascending; the first match wins.
    ladder = (
        (1.30, '1.00-1.30'),
        (1.50, '1.30-1.50'),
        (1.80, '1.50-1.80'),
        (2.20, '1.80-2.20'),
        (2.80, '2.20-2.80'),
        (4.00, '2.80-4.00'),
        (6.00, '4.00-6.00'),
    )
    for limit, label in ladder:
        if odds < limit:
            return label
    return '6.00+'
# 5. STRATEGY: expanding window — predictions only look at earlier matches
_rule = "=" * 70
print("\n" + _rule)
print(" STRATEJİ BACKTEST — Expanding Window")
print(_rule)
# Per-team history: {team_id: {odds_band: [(ou15, ou25, ou35, btts, home_win), ...]}}
home_history = defaultdict(lambda: defaultdict(list))
away_history = defaultdict(lambda: defaultdict(list))
# Minimum number of past matches required in a band before a signal is used.
MIN_MATCHES = 8
# The last 30% of the rows form the test section.
TEST_PCT = 0.30
N = len(df)
test_start = int(N * (1 - TEST_PCT))
# One list of bet records per market, in reporting order.
results = {
    market: []
    for market in (
        'ou15_over', 'ou25_over', 'ou35_over',
        'btts_yes', 'btts_no',
        'ou25_under', 'ou15_under',
        'ms_home',
    )
}
# Walk the matches in row order. Bets are only recorded for rows at or after
# test_start; the histories are extended after the betting logic, so a match
# is never part of the history used for its own signal.
# History tuples are laid out as (ou15, ou25, ou35, btts, home_win).
for i in range(N):
row = df.iloc[i]
h_odds = row.odds_ms_h
a_odds = row.odds_ms_a
# Rows missing the home or away 1X2 price are neither bet on nor added to history.
if pd.isna(h_odds) or pd.isna(a_odds):
continue
h_band = odds_band(h_odds)
a_band = odds_band(a_odds)
# TEST: only place bets inside the test section
if i >= test_start:
h_hist = home_history[row.home_team_id][h_band]
a_hist = away_history[row.away_team_id][a_band]
if len(h_hist) >= MIN_MATCHES and len(a_hist) >= MIN_MATCHES:
# How has the home team done in this odds band?
h_ou15 = np.mean([x[0] for x in h_hist])
h_ou25 = np.mean([x[1] for x in h_hist])
h_ou35 = np.mean([x[2] for x in h_hist])
h_btts = np.mean([x[3] for x in h_hist])
h_win = np.mean([x[4] for x in h_hist])
# How has the away team done in this odds band?
a_ou15 = np.mean([x[0] for x in a_hist])
a_ou25 = np.mean([x[1] for x in a_hist])
a_ou35 = np.mean([x[2] for x in a_hist])
a_btts = np.mean([x[3] for x in a_hist])
a_loss = np.mean([x[4] for x in a_hist]) # away-team loss rate
# COMBINED SIGNAL: plain average of the home-side and away-side rates
sig_ou15 = (h_ou15 + a_ou15) / 2
sig_ou25 = (h_ou25 + a_ou25) / 2
sig_ou35 = (h_ou35 + a_ou35) / 2
sig_btts = (h_btts + a_btts) / 2
sig_hw = (h_win + a_loss) / 2 # home win rate + away loss rate
# Fields shared by every bet record created for this match.
base = {
'match': f"{row.home_team} vs {row.away_team}",
'league': row.league_name,
'home_team': row.home_team,
'away_team': row.away_team,
'h_band': h_band,
'a_band': a_band,
'h_n': len(h_hist),
'a_n': len(a_hist),
}
# OU 1.5 OVER
# A NaN price is truthy but fails the ">" comparison, so it places no bet.
if sig_ou15 >= 0.85 and row.odds_ou15_o and row.odds_ou15_o > 1.01:
results['ou15_over'].append({
**base, 'signal': sig_ou15, 'odds': row.odds_ou15_o,
'won': row.ou15 == 1, 'actual_goals': row.total_goals,
'h_sig': h_ou15, 'a_sig': a_ou15
})
# OU 2.5 OVER
if sig_ou25 >= 0.70 and row.odds_ou25_o and row.odds_ou25_o > 1.10:
results['ou25_over'].append({
**base, 'signal': sig_ou25, 'odds': row.odds_ou25_o,
'won': row.ou25 == 1, 'actual_goals': row.total_goals,
'h_sig': h_ou25, 'a_sig': a_ou25
})
# OU 3.5 OVER
if sig_ou35 >= 0.60 and row.odds_ou35_o and row.odds_ou35_o > 1.20:
results['ou35_over'].append({
**base, 'signal': sig_ou35, 'odds': row.odds_ou35_o,
'won': row.ou35 == 1, 'actual_goals': row.total_goals,
'h_sig': h_ou35, 'a_sig': a_ou35
})
# BTTS YES
if sig_btts >= 0.70 and row.odds_btts_y and row.odds_btts_y > 1.10:
results['btts_yes'].append({
**base, 'signal': sig_btts, 'odds': row.odds_btts_y,
'won': row.btts == 1, 'actual_goals': row.total_goals,
'h_sig': h_btts, 'a_sig': a_btts
})
# OU 2.5 UNDER (low goal expectation)
# The stored signal is inverted (1 - rate) so higher still means stronger.
if sig_ou25 <= 0.30 and row.odds_ou25_u and row.odds_ou25_u > 1.10:
results['ou25_under'].append({
**base, 'signal': 1-sig_ou25, 'odds': row.odds_ou25_u,
'won': row.ou25 == 0, 'actual_goals': row.total_goals,
'h_sig': 1-h_ou25, 'a_sig': 1-a_ou25
})
# MS HOME WIN (home team wins)
if sig_hw >= 0.75 and row.odds_ms_h and 1.10 < row.odds_ms_h < 3.50:
results['ms_home'].append({
**base, 'signal': sig_hw, 'odds': row.odds_ms_h,
'won': row.score_home > row.score_away,
'actual_goals': row.total_goals,
'h_sig': h_win, 'a_sig': a_loss
})
# Update history (always)
home_history[row.home_team_id][h_band].append((
row.ou15, row.ou25, row.ou35, row.btts,
int(row.score_home > row.score_away)
))
away_history[row.away_team_id][a_band].append((
row.ou15, row.ou25, row.ou35, row.btts,
int(row.score_away < row.score_home) # away team lost
))
# 6. PRINT THE RESULTS
print(f"\nTest bölümü: son {TEST_PCT*100:.0f}% ({N - test_start:,} maç)")
print(f"Minimum geçmiş: {MIN_MATCHES} maç\n")
# One report per market; markets without any recorded bet get a one-line notice.
for market_name, bets in results.items():
if not bets:
print(f"\n {market_name}: sinyal yok")
continue
bdf = pd.DataFrame(bets)
total = len(bdf)
wins = bdf.won.sum()
hit = wins / total * 100
# Flat 1-unit stakes: a win pays odds - 1, a loss costs 1.
pnl = (bdf.won * (bdf.odds - 1) - (~bdf.won) * 1).sum()
roi = pnl / total * 100
avg_odds = bdf.odds.mean()
print(f"\n{'='*60}")
print(f" {market_name.upper()}")
print(f"{'='*60}")
print(f" Toplam bahis: {total}")
print(f" Kazanan: {wins} ({hit:.1f}%)")
print(f" Ortalama odds: {avg_odds:.2f}")
print(f" PnL: {pnl:+.1f} birim")
print(f" ROI: {roi:+.1f}%")
# Performance at different signal thresholds
print(f"\n Sinyal eşik analizi:")
for threshold in [0.70, 0.75, 0.80, 0.85, 0.90, 0.95]:
sub = bdf[bdf.signal >= threshold]
# Thresholds that leave fewer than 5 bets are not reported.
if len(sub) < 5: continue
w = sub.won.sum()
p = (sub.won * (sub.odds - 1) - (~sub.won) * 1).sum()
r = p / len(sub) * 100
# Tag profitable thresholds; ROI between -3% and 0% is tagged as break-even.
star = ' ✅ PROFIT' if r > 0 else (' ⚖️ BE' if r > -3 else '')
print(f"{threshold:.2f}: {len(sub):5d} bahis, hit={w/len(sub)*100:.1f}%, ROI={r:+.1f}%{star}")
# Sample winners: up to 5 winning bets with the highest signal
if wins > 0:
best = bdf[bdf.won].nlargest(min(5, wins), 'signal')
print(f"\n Örnek kazanan bahisler:")
for _, b in best.iterrows():
print(f" {b.home_team} vs {b.away_team} ({b.league})")
print(f" Ev {b.h_band} ({b.h_sig:.0%}) + Dep {b.a_band} ({b.a_sig:.0%}) → sinyal={b.signal:.0%}, odds={b.odds:.2f}, gol={b.actual_goals:.0f}")
# 7. SUMMARY TABLE
print("\n\n" + "=" * 70)
print(" ÖZET TABLO")
print("=" * 70)
print(f"{'Market':<15} {'Bahis':>6} {'Hit':>7} {'ROI':>8} {'Avg Odds':>9}")
print("-" * 50)
# One row per market that recorded at least one bet; the figures are
# recomputed with the same flat 1-unit formulas as the detailed report.
for market_name, bets in results.items():
if not bets: continue
bdf = pd.DataFrame(bets)
total = len(bdf)
wins = bdf.won.sum()
hit = wins / total * 100
pnl = (bdf.won * (bdf.odds - 1) - (~bdf.won) * 1).sum()
roi = pnl / total * 100
avg_odds = bdf.odds.mean()
print(f"{market_name:<15} {total:>6} {hit:>6.1f}% {roi:>+7.1f}% {avg_odds:>8.2f}")
# Release the database connection that was used for the queries.
conn.close()
print("\n✅ Tamamlandı!")
+480
View File
@@ -0,0 +1,480 @@
"""
V27 Value Sniper — PRO Training Script
========================================
KEY INSIGHT: Train model WITHOUT odds to get independent probability.
Then compare with market odds to find genuine value edges.
Strategy:
Stage A: "Fundamentals Model" — odds-free, learns from ELO/form/rolling/H2H
Stage B: "Value Model" — uses fundamentals + odds disagreement as features
Stage C: Multi-market — 1X2, O/U 2.5, BTTS
Stage D: Walk-forward backtest with Kelly sizing
"""
import os, sys, json, pickle, time, warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, log_loss
from sklearn.isotonic import IsotonicRegression
warnings.filterwarnings("ignore")
# Parent of the directory that contains this script.
AI_DIR = Path(__file__).resolve().parent.parent
# Training table read by load_data().
DATA_CSV = AI_DIR / "data" / "training_data_v27.csv"
# Output directory for models and metadata; created as a side effect of
# importing/running this module.
MODELS_DIR = AI_DIR / "models" / "v27"
MODELS_DIR.mkdir(parents=True, exist_ok=True)
# ── Leakage & category definitions ──
# Columns that get_clean_features() always removes from the feature set.
LEAKAGE_COLS = [
    "total_goals", "goal_diff", "ht_total_goals", "ht_goal_diff",
    "score_home", "score_away", "ht_score_home", "ht_score_away",
    "home_goals_form", "away_goals_form",
    "home_squad_quality", "away_squad_quality", "squad_diff",
    "home_key_players", "away_key_players",
    "home_missing_impact", "away_missing_impact",
    "referee_home_bias", "referee_avg_goals", "referee_cards_total",
    "referee_avg_yellow", "referee_avg_red", "referee_penalty_rate",
    "referee_over25_rate", "referee_experience", "referee_matches",
]
# Kept as an empty placeholder; label columns are discovered per frame by
# get_label_cols() instead of being listed here.
LABEL_COLS = []
# Identifier / descriptive columns that are never used as features.
META_COLS = ["match_id", "league_name", "home_team", "away_team"]
# Name prefixes that mark a column as odds-derived.
ODDS_COLS_PATTERNS = ["odds_", "implied_"]
def get_odds_cols(df):
    """Return, in column order, the columns of ``df`` that are odds-derived.

    A column qualifies when its name starts with any prefix listed in
    ``ODDS_COLS_PATTERNS``.
    """
    prefixes = tuple(ODDS_COLS_PATTERNS)
    return [name for name in df.columns if name.startswith(prefixes)]
def get_label_cols(df):
    """Return, in column order, every column of ``df`` whose name starts with ``label_``."""
    return list(filter(lambda name: name.startswith("label_"), df.columns))
def get_clean_features(df):
    """Select the fundamentals-only feature columns of ``df``.

    Dropped: odds-derived columns, label columns, everything in
    ``LEAKAGE_COLS`` and ``META_COLS``, and every ``*_id`` column. Of the
    remaining columns only those where more than 30% of the rows parse as a
    number are kept. The result preserves the column order of ``df``.
    """
    banned = set(LEAKAGE_COLS)
    banned.update(META_COLS, get_odds_cols(df), get_label_cols(df))
    # ID columns carry no signal of their own.
    banned.update(
        name for name in df.columns
        if name != "match_id" and name.endswith("_id")
    )

    min_valid = len(df)*0.3
    selected = []
    for name in df.columns:
        if name in banned:
            continue
        n_valid = pd.to_numeric(df[name], errors="coerce").notna().sum()
        if n_valid > min_valid:
            selected.append(name)
    return selected
def load_data():
    """Load the training CSV and prepare the 1X2 odds columns.

    Reads ``DATA_CSV``, coerces the three 1X2 prices to numbers, drops rows
    where any of them is missing or not above 1.01, coerces the O/U 2.5 prices
    when present, and adds the margin-free implied probabilities
    ``implied_h`` / ``implied_d`` / ``implied_a`` (they sum to 1 per row).

    Returns:
        The filtered DataFrame, in the row order of the CSV.
    """
    print(f"Loading {DATA_CSV}...")
    df = pd.read_csv(DATA_CSV, low_memory=False)
    print(f" Raw: {len(df)} rows")

    # Ensure odds exist for value comparison
    ms_odds = ["odds_ms_h", "odds_ms_d", "odds_ms_a"]
    for c in ms_odds:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(subset=ms_odds)
    # .copy() so the column assignments below act on an independent frame.
    df = df[(df.odds_ms_h > 1.01) & (df.odds_ms_d > 1.01) & (df.odds_ms_a > 1.01)].copy()

    # OU25 odds. backtest_value() reads odds_ou25_o / odds_ou25_u, so those
    # names are coerced as well; the *_over / *_under spellings are kept for
    # data sets that still use them.
    for c in ("odds_ou25_o", "odds_ou25_u", "odds_ou25_over", "odds_ou25_under"):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Implied probabilities: inverse prices normalised by their sum, which
    # removes the bookmaker margin.
    margin = 1/df.odds_ms_h + 1/df.odds_ms_d + 1/df.odds_ms_a
    df["implied_h"] = (1/df.odds_ms_h)/margin
    df["implied_d"] = (1/df.odds_ms_d)/margin
    df["implied_a"] = (1/df.odds_ms_a)/margin
    print(f" After filter: {len(df)} rows")
    return df
def temporal_split(df, val_ratio=0.15, test_ratio=0.10):
    """Cut ``df`` into train / validation / test by row position.

    The last ``test_ratio`` of the rows is the test part, the ``val_ratio``
    before it is validation, everything earlier is training. No shuffling is
    done, so the split is chronological only if ``df`` is already in time
    order. Returns three independent copies as a tuple.
    """
    n_rows = len(df)
    train_end = int(n_rows*(1-val_ratio-test_ratio))
    val_end = int(n_rows*(1-test_ratio))
    parts = (
        df.iloc[:train_end],
        df.iloc[train_end:val_end],
        df.iloc[val_end:],
    )
    return tuple(part.copy() for part in parts)
# ═══════════════════════════════════════════════════════════════════
# STAGE A: Fundamentals-Only Model (NO ODDS)
# ═══════════════════════════════════════════════════════════════════
def _report_val_accuracy(p, y_va, n_class):
    """Print the validation accuracy of predicted probabilities ``p``.

    A 1-D binary output (probability of class 1) is expanded to two columns
    before taking the arg-max.
    """
    p = np.asarray(p)
    if n_class == 2 and p.ndim == 1:
        p = np.column_stack([1-p, p])
    acc = accuracy_score(y_va, p.argmax(1))
    print(f" acc={acc:.4f}")


def train_fundamentals_model(X_tr, y_tr, X_va, y_va, feat_cols, market="ms"):
    """Train ensemble WITHOUT odds features.

    Trains one XGBoost, one LightGBM and one CatBoost model on the training
    split, each with early stopping (80 rounds) monitored on the validation
    split. A learner whose package cannot be imported is skipped with a
    printed notice; errors raised during training are not suppressed.

    Args:
        X_tr, y_tr: training features and labels.
        X_va, y_va: validation features and labels.
        feat_cols: feature names, in the column order of ``X_tr`` / ``X_va``.
        market: "ms" trains a 3-class model; any other value a binary one.

    Returns:
        Dict mapping "xgb" / "lgb" / "cb" to the fitted model. Empty when
        none of the three packages is installed.
    """
    models = {}
    n_class = 3 if market == "ms" else 2

    # XGBoost
    try:
        import xgboost as xgb
    except ImportError:
        print(" [XGB] xgboost not installed, skipping")
    else:
        print(f" [XGB] Training {market.upper()}...")
        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=feat_cols)
        dval = xgb.DMatrix(X_va, label=y_va, feature_names=feat_cols)
        params = {
            "objective": "multi:softprob" if n_class == 3 else "binary:logistic",
            "eval_metric": "mlogloss" if n_class == 3 else "logloss",
            "max_depth": 6, "learning_rate": 0.02, "subsample": 0.75,
            "colsample_bytree": 0.75, "min_child_weight": 10,
            "reg_alpha": 0.5, "reg_lambda": 2.0,
            "verbosity": 0, "tree_method": "hist",
        }
        if n_class == 3:
            params["num_class"] = 3
        m = xgb.train(params, dtrain, num_boost_round=2000,
                      evals=[(dval, "val")], early_stopping_rounds=80,
                      verbose_eval=False)
        _report_val_accuracy(m.predict(dval), y_va, n_class)
        models["xgb"] = m

    # LightGBM
    try:
        import lightgbm as lgb
    except ImportError:
        print(" [LGB] lightgbm not installed, skipping")
    else:
        print(f" [LGB] Training {market.upper()}...")
        ds_tr = lgb.Dataset(X_tr, label=y_tr)
        ds_va = lgb.Dataset(X_va, label=y_va, reference=ds_tr)
        par = {
            "objective": "multiclass" if n_class == 3 else "binary",
            "metric": "multi_logloss" if n_class == 3 else "binary_logloss",
            "num_leaves": 48, "learning_rate": 0.02,
            "feature_fraction": 0.7, "bagging_fraction": 0.7,
            "bagging_freq": 1, "min_child_samples": 30,
            "lambda_l1": 0.5, "lambda_l2": 2.0, "verbose": -1,
        }
        if n_class == 3:
            par["num_class"] = 3
        m = lgb.train(par, ds_tr, 2000, valid_sets=[ds_va],
                      callbacks=[lgb.early_stopping(80, verbose=False)])
        _report_val_accuracy(m.predict(X_va), y_va, n_class)
        models["lgb"] = m

    # CatBoost
    try:
        from catboost import CatBoostClassifier
    except ImportError:
        print(" [CB] catboost not installed, skipping")
    else:
        print(f" [CB] Training {market.upper()}...")
        m = CatBoostClassifier(
            iterations=2000, learning_rate=0.02, depth=6,
            l2_leaf_reg=5, loss_function="MultiClass" if n_class == 3 else "Logloss",
            early_stopping_rounds=80, verbose=0, task_type="CPU",
            **({"classes_count": 3} if n_class == 3 else {}),
        )
        m.fit(X_tr, y_tr, eval_set=(X_va, y_va))
        _report_val_accuracy(m.predict_proba(X_va), y_va, n_class)
        models["cb"] = m

    return models
def ensemble_predict(models, X, feat_cols, n_class=3):
    """Average the class probabilities of every model in ``models``.

    Args:
        models: dict as returned by ``train_fundamentals_model``; the keys
            select how each model is called ("xgb", "lgb" or "cb").
        X: feature matrix, columns in ``feat_cols`` order.
        feat_cols: feature names passed to the XGBoost ``DMatrix``.
        n_class: number of classes. For 2, a 1-D XGBoost/LightGBM output
            (probability of class 1) is expanded to two columns.

    Returns:
        Array with the element-wise mean of the per-model probabilities.

    Raises:
        ValueError: a key other than "xgb" / "lgb" / "cb" is present.
            Previously such an entry silently re-used the prediction of the
            preceding model (or failed with NameError when it came first).
        RuntimeError: ``models`` is empty.
    """
    preds = []
    for name, m in models.items():
        if name == "xgb":
            import xgboost as xgb
            p = m.predict(xgb.DMatrix(X, feature_names=feat_cols))
        elif name == "lgb":
            p = m.predict(X)
        elif name == "cb":
            p = m.predict_proba(X)
        else:
            raise ValueError(f"Unknown model key: {name!r}")
        p = np.asarray(p)
        # CatBoost already returns one column per class.
        if name != "cb" and n_class == 2 and p.ndim == 1:
            p = np.column_stack([1-p, p])
        preds.append(p)
    if not preds:
        raise RuntimeError("No models!")
    return np.mean(preds, axis=0)
# ═══════════════════════════════════════════════════════════════════
# STAGE B: Walk-Forward Backtest with Kelly
# ═══════════════════════════════════════════════════════════════════
def kelly_fraction(model_prob, odds, fraction=0.25):
    """Fractional Kelly stake as a share of the bankroll.

    Computes ``fraction * (model_prob*odds - 1) / (odds - 1)``, never below 0
    and never above 0.10 (10% of the bankroll). Returns 0.0 when the bet has
    no positive expected value or the price is not above 1.
    """
    value = model_prob * odds - 1
    if odds <= 1 or value <= 0:
        return 0.0
    full_kelly = value / (odds - 1)
    sized = fraction * full_kelly
    capped = min(sized, 0.10)  # hard cap at 10% of the bankroll
    return max(0, capped)
def backtest_value(models, df_test, feat_cols, market="ms",
                   min_edge=0.05, min_odds=1.40, max_odds=4.50,
                   use_kelly=True):
    """Realistic backtest: flat or Kelly sizing, edge filtering.

    For every row and every outcome class, a bet is placed when the model
    probability exceeds the margin-free implied probability by at least
    ``min_edge`` and the price lies within ``[min_odds, max_odds]``. Rows are
    processed in order and the bankroll (starting at 1000) is updated after
    each bet, so Kelly stakes compound.

    Args:
        models: dict accepted by ``ensemble_predict``.
        df_test: rows to bet on; needs the label, odds and (for "ms") the
            ``implied_*`` columns.
        feat_cols: feature column names.
        market: "ms" (1X2) or "ou25" (over/under 2.5).
        min_edge, min_odds, max_odds: bet filters.
        use_kelly: size by ``kelly_fraction`` (fraction 0.15) when True,
            otherwise stake a flat 10 units.

    Returns:
        Dict with "bets" (one record per bet), "total", "wins", "pnl" and
        "bankroll_curve"; an empty dict for an unsupported market or when the
        O/U label column is missing.
    """
    X = df_test[feat_cols].values
    n_class = 3 if market == "ms" else 2
    probs = ensemble_predict(models, X, feat_cols, n_class)

    if market == "ms":
        y = df_test["label_ms"].values
        odds_arr = df_test[["odds_ms_h", "odds_ms_d", "odds_ms_a"]].values
        implied = df_test[["implied_h", "implied_d", "implied_a"]].values
        class_names = ["Home", "Draw", "Away"]
    elif market == "ou25":
        if "label_ou25" not in df_test.columns:
            return {}
        y = df_test["label_ou25"].values

        def _price_column(col):
            # NOTE(review): a missing price (or a missing column) is replaced
            # by 1.85, so bets on those rows are settled at an assumed price.
            if col in df_test.columns:
                return pd.to_numeric(df_test[col], errors="coerce").fillna(1.85).values
            return np.full(len(df_test), 1.85)

        o_over = _price_column("odds_ou25_o")
        o_under = _price_column("odds_ou25_u")
        odds_arr = np.column_stack([o_under, o_over])
        m = 1/odds_arr
        implied = m / m.sum(axis=1, keepdims=True)
        class_names = ["Under", "Over"]
    else:
        return {}

    results = {"bets": [], "total": 0, "wins": 0, "pnl": 0.0, "bankroll_curve": [1000.0]}
    bankroll = 1000.0
    for i in range(len(y)):
        # A row without a known outcome cannot be settled; betting on it
        # would book every such bet as a loss.
        if pd.isna(y[i]):
            continue
        for cls in range(n_class):
            edge = probs[i, cls] - implied[i, cls]
            odds_val = odds_arr[i, cls]

            # FILTERS
            if edge < min_edge:
                continue
            if odds_val < min_odds or odds_val > max_odds:
                continue
            # Don't bet on heavy favorites with tiny edge
            if implied[i, cls] > 0.65 and edge < 0.08:
                continue

            # Sizing
            if use_kelly:
                frac = kelly_fraction(probs[i, cls], odds_val, fraction=0.15)
                stake = bankroll * frac
            else:
                stake = 10.0  # flat
            if stake < 1:
                continue

            won = (y[i] == cls)
            pnl = stake * (odds_val - 1) if won else -stake
            bankroll += pnl
            results["bets"].append({
                "edge": float(edge), "odds": float(odds_val),
                "model_p": float(probs[i, cls]), "implied_p": float(implied[i, cls]),
                "won": bool(won), "pnl": float(pnl), "stake": float(stake),
                "class": class_names[cls],
            })
            results["bankroll_curve"].append(bankroll)
            results["total"] += 1
            if won:
                results["wins"] += 1

    results["pnl"] = bankroll - 1000.0
    return results
def _max_drawdown_pct(curve):
    """Return the largest peak-to-trough decline of ``curve`` in percent.

    Each point is compared with the highest value seen *before or at* that
    point, so a curve that only rises has a drawdown of 0.0. The result is
    zero or negative.
    """
    worst = 0.0
    peak = None
    for value in curve:
        if peak is None or value > peak:
            peak = value
        elif peak > 0:
            worst = min(worst, (value - peak) / peak * 100)
    return worst


def print_backtest(results, label=""):
    """Print a summary of a ``backtest_value`` result.

    Shows bet count, hit rate, ROI on the total amount staked, P&L, final
    bankroll, maximum drawdown and a per-class breakdown. Prints a single
    "No bets placed" line when the result holds no bets (including ``{}``).
    """
    total = results.get("total", 0)
    if total == 0:
        print(f" {label}: No bets placed")
        return

    wins = results["wins"]
    pnl = results["pnl"]
    hit = wins/total*100
    staked = sum(b["stake"] for b in results["bets"])
    roi = pnl / staked * 100 if staked > 0 else 0.0
    curve = results["bankroll_curve"]
    # Drawdown against the running peak, not the global maximum: measuring
    # against the global maximum counts the climb towards it as a loss.
    dd = _max_drawdown_pct(curve)

    # Per-class breakdown
    by_class = {}
    for b in results["bets"]:
        stats = by_class.setdefault(b["class"], {"n": 0, "w": 0, "pnl": 0})
        stats["n"] += 1
        if b["won"]:
            stats["w"] += 1
        stats["pnl"] += b["pnl"]

    print(f"\n {label}")
    print(f" Bets: {total} | Hit: {hit:.1f}% | ROI: {roi:+.1f}%")
    print(f" PnL: {pnl:+.0f} | Final: {curve[-1]:.0f} | MaxDD: {dd:.1f}%")
    for cls, d in sorted(by_class.items()):
        # NOTE(review): this is the average P&L per bet times 100, printed
        # with a percent sign; it is not a return on stake.
        r = d["pnl"]/d["n"]*100 if d["n"] > 0 else 0
        print(f" {cls:6s}: {d['n']:4d} bets, "
              f"hit={d['w']/d['n']*100:.1f}%, avg_pnl={r:+.1f}%")
# ═══════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════
def main():
    """Train the odds-free models, backtest them and save the artefacts.

    Steps: load the data, pick the fundamentals feature set, split by row
    order into train / validation / test, train the 1X2 ensemble (and the
    O/U 2.5 ensemble when ``label_ou25`` exists), choose the value-bet filter
    on the validation split, report it on the test split, then write the
    models, feature importances, metadata and feature list to ``MODELS_DIR``.
    """
    rule = "─" * 65
    print("=" * 65)
    print(" V27 VALUE SNIPER — PRO TRAINING (Odds-Free Fundamentals)")
    print("=" * 65)
    t0 = time.time()
    df = load_data()
    clean_feats = get_clean_features(df)
    print(f" Clean features (no odds): {len(clean_feats)}")

    # Numerify
    for c in clean_feats:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    # NOTE(review): the medians are taken over all rows, so validation/test
    # values take part in imputing the training rows.
    df[clean_feats] = df[clean_feats].fillna(df[clean_feats].median())

    # Remove constant columns
    clean_feats = [c for c in clean_feats if df[c].nunique() > 1]
    print(f" After removing constants: {len(clean_feats)}")

    # Split
    tr, va, te = temporal_split(df)
    print(f" Train: {len(tr)}, Val: {len(va)}, Test: {len(te)}")
    print(f" Target: H={tr.label_ms.eq(0).mean():.1%}, "
          f"D={tr.label_ms.eq(1).mean():.1%}, A={tr.label_ms.eq(2).mean():.1%}")
    X_tr = tr[clean_feats].values
    y_tr = tr["label_ms"].values
    X_va = va[clean_feats].values
    y_va = va["label_ms"].values

    # ── STAGE A: Train fundamentals model (1X2) ──
    print("\n" + rule)
    print(" STAGE A: Fundamentals-Only 1X2 Model")
    print(rule)
    ms_models = train_fundamentals_model(X_tr, y_tr, X_va, y_va, clean_feats, "ms")
    val_probs = ensemble_predict(ms_models, X_va, clean_feats, 3)
    val_acc = accuracy_score(y_va, val_probs.argmax(1))
    val_ll = log_loss(y_va, val_probs)
    print(f"\n Ensemble Val: acc={val_acc:.4f}, logloss={val_ll:.4f}")

    # Compare with odds baseline
    odds_pred = va[["implied_h", "implied_d", "implied_a"]].values.argmax(1)
    odds_acc = accuracy_score(y_va, odds_pred)
    print(f" Odds baseline: acc={odds_acc:.4f}")
    print(f" Model vs Odds: {val_acc - odds_acc:+.4f}")

    # ── STAGE A.2: O/U 2.5 model ──
    ou_models = None
    if "label_ou25" in tr.columns:
        print("\n" + rule)
        print(" STAGE A.2: Fundamentals-Only O/U 2.5 Model")
        print(rule)
        y_tr_ou = tr["label_ou25"].values
        y_va_ou = va["label_ou25"].values
        # Rows without an O/U label are left out of training and validation.
        mask_tr = ~np.isnan(y_tr_ou)
        mask_va = ~np.isnan(y_va_ou)
        if mask_tr.sum() > 1000:
            ou_models = train_fundamentals_model(
                X_tr[mask_tr], y_tr_ou[mask_tr].astype(int),
                X_va[mask_va], y_va_ou[mask_va].astype(int),
                clean_feats, "ou25")

    # ── STAGE B: Backtest ──
    print("\n" + rule)
    print(" STAGE B: Walk-Forward Backtest (Test Set)")
    print(rule)

    # Choose the edge / min-odds filter on the VALIDATION split. Picking the
    # best configuration on the test split and then reporting that same ROI
    # would make the reported figure a best-of-18 result.
    best_roi = -999
    best_cfg = {}
    for min_edge in [0.03, 0.05, 0.07, 0.10, 0.12, 0.15]:
        for min_odds in [1.35, 1.50, 1.70]:
            r = backtest_value(ms_models, va, clean_feats, "ms",
                               min_edge=min_edge, min_odds=min_odds,
                               max_odds=5.0, use_kelly=True)
            if r.get("total", 0) >= 20:
                invested = sum(b["stake"] for b in r["bets"])
                roi = r["pnl"] / invested * 100 if invested > 0 else -100
                if roi > best_roi:
                    best_roi = roi
                    best_cfg = {"edge": min_edge, "min_odds": min_odds}

    if best_cfg:
        cfg = best_cfg
        print(f"\n Best 1X2 Config: edge>{cfg['edge']}, odds>{cfg['min_odds']}")
        print(f" Selected on validation split (ROI={best_roi:+.1f}%)")
        # Single, untouched evaluation of the chosen filter on the test split.
        test_result = backtest_value(ms_models, te, clean_feats, "ms",
                                     min_edge=cfg["edge"], min_odds=cfg["min_odds"],
                                     max_odds=5.0, use_kelly=True)
        print_backtest(test_result, "1X2 VALUE")

    # Flat bet comparison
    print("\n --- Flat Bet Comparison ---")
    for edge in [0.05, 0.07, 0.10]:
        r = backtest_value(ms_models, te, clean_feats, "ms",
                           min_edge=edge, min_odds=1.50, max_odds=4.5,
                           use_kelly=False)
        if r.get("total", 0) > 0:
            inv = r["total"] * 10  # every flat bet stakes 10 units
            roi = r["pnl"]/inv*100
            print(f" Edge>{edge:.2f}: {r['total']} bets, "
                  f"hit={r['wins']/r['total']*100:.1f}%, ROI={roi:+.1f}%")

    # OU25 backtest
    if ou_models:
        print("\n --- O/U 2.5 Backtest ---")
        for edge in [0.05, 0.07, 0.10]:
            r = backtest_value(ou_models, te, clean_feats, "ou25",
                               min_edge=edge, min_odds=1.50, max_odds=3.0,
                               use_kelly=True)
            if r.get("total", 0) > 0:
                print_backtest(r, f"OU25 edge>{edge}")

    # ── Feature importance ──
    if "lgb" in ms_models:
        imp = ms_models["lgb"].feature_importance(importance_type="gain")
        imp_df = pd.DataFrame({"feature": clean_feats, "importance": imp}
                              ).sort_values("importance", ascending=False)
        print("\n TOP 15 FEATURES (no odds!):")
        for _, feat_row in imp_df.head(15).iterrows():
            print(f" {feat_row['feature']:40s} {feat_row['importance']:.0f}")
        imp_df.to_csv(MODELS_DIR / "v27_feature_importance.csv", index=False)

    # ── Save ──
    print("\n" + rule)
    print(" SAVING MODELS")
    print(rule)
    for name, m in ms_models.items():
        p = MODELS_DIR / f"v27_ms_{name}.pkl"
        with open(p, "wb") as f:
            pickle.dump(m, f)
        print(f"{p.name}")
    if ou_models:
        for name, m in ou_models.items():
            p = MODELS_DIR / f"v27_ou25_{name}.pkl"
            with open(p, "wb") as f:
                pickle.dump(m, f)
            print(f"{p.name}")

    meta = {
        "version": "v27-pro", "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "approach": "odds-free fundamentals + value edge detection",
        "feature_count": len(clean_feats),
        "total_samples": len(df),
        "val_acc": round(val_acc, 4), "val_ll": round(val_ll, 4),
        "best_config": {k: v for k, v in best_cfg.items() if k != "result"} if best_cfg else {},
        "markets": ["ms"] + (["ou25"] if ou_models else []),
    }
    with open(MODELS_DIR / "v27_metadata.json", "w") as f:
        json.dump(meta, f, indent=2, default=str)
    with open(MODELS_DIR / "v27_feature_cols.json", "w") as f:
        json.dump(clean_feats, f, indent=2)
    print(f" ✓ metadata + feature_cols")
    print(f"\n Total time: {(time.time()-t0)/60:.1f} min")
    print(" DONE!")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
main()