# iddaai-be/ai-engine/scripts/backtest_niche.py

"""
V27 FINAL BACKTEST — Conservative Flat Bet
Only the strongest validated edges. No Kelly compounding.
"""
import pandas as pd, numpy as np

# Load the V27 training table. Every column except the identifier/text ones is
# coerced to a numeric dtype; values that cannot be parsed become NaN.
df = pd.read_csv('data/training_data_v27.csv', low_memory=False)
text_columns = ('match_id', 'league_name', 'home_team', 'away_team')
for column in [name for name in df.columns if name not in text_columns]:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Keep only rows where all three odds columns (home / draw / away) are present
# and each one is strictly greater than 1.01.
odds_columns = ['odds_ms_h', 'odds_ms_d', 'odds_ms_a']
df = df.dropna(subset=odds_columns)
df = df[(df[odds_columns] > 1.01).all(axis=1)]

# Walk-forward setup: rows are cut by position into `folds` consecutive slices
# of `fold_size` rows. The fold loop below trains on all rows before a slice
# and tests on that slice (an expanding training window).
# NOTE(review): this presumes the CSV rows are in chronological order — confirm.
folds = 5
n = df.shape[0]
fold_size = n // folds
all_results = []  # one dict per placed bet, accumulated across all folds

banner_rule = "=" * 65
print(banner_rule)
print("  V27 WALK-FORWARD FLAT-BET BACKTEST")
print(banner_rule)

# Expanding-window walk-forward backtest.
# For each fold: (1) mine "rest advantage" rules from every row before the
# test slice, (2) keep the three rules with the highest estimated EV, and
# (3) flat-stake them on the test slice, recording each bet in `all_results`.
for fold in range(2, folds):  # start from fold 2 so we have enough training data
    train_end = fold * fold_size
    test_start = train_end
    # The last fold absorbs the remainder rows left over by the integer division.
    test_end = (fold+1)*fold_size if fold < folds-1 else n

    train_df = df.iloc[:train_end]
    test_df = df.iloc[test_start:test_end]

    print(f"\n  --- Fold {fold}: train={len(train_df)}, test={len(test_df)} ---")

    # Discover REST edges from training data.
    # A rule is (home rest > hr days, away rest < ar days, outcome class);
    # class 0 is priced by odds_ms_h and class 2 by odds_ms_a.
    strategies = []

    for hr in [5, 7, 10, 14]:
        for ar in [3, 4, 5]:
            # The row filter depends only on the thresholds, so it is built
            # once per (hr, ar) pair rather than once per outcome class.
            idx = (train_df.home_days_rest > hr) & (train_df.away_days_rest < ar)
            sub = train_df[idx]
            if len(sub) < 50:
                continue  # too few matches to trust the hit rate
            for cls, col in [(0,'odds_ms_h'), (2,'odds_ms_a')]:
                rate = (sub.label_ms == cls).mean()
                avg_odds = sub[col].mean()
                # Approximate EV per unit staked: hit rate times mean odds.
                ev = rate * avg_odds
                if ev > 1.02:  # only strong edges (>2% edge)
                    strategies.append((hr, ar, cls, rate, avg_odds, ev, len(sub)))

    if not strategies:
        print("    No strong edges found in training data")
        continue

    # Apply best strategies to test
    strategies.sort(key=lambda x: x[5], reverse=True)
    best = strategies[:3]  # top 3 only

    fold_bets = 0
    fold_wins = 0
    fold_pnl = 0
    stake = 10  # flat 10 units

    for _, row in test_df.iterrows():
        # Rest-day information is required by every rule.
        if pd.isna(row.home_days_rest) or pd.isna(row.away_days_rest):
            continue

        # The selected rules can overlap (e.g. home rest > 5 and home rest > 7
        # on the same outcome). Track which outcomes were already backed on
        # this match so one match/outcome is staked at most once; otherwise
        # the same bet would be counted several times and the stake would no
        # longer be flat. Rules are tried in descending-EV order.
        backed = set()
        for hr, ar, cls, est_p, _, _, _ in best:
            if cls in backed:
                continue
            if row.home_days_rest <= hr or row.away_days_rest >= ar:
                continue
            odds_col = ['odds_ms_h','odds_ms_d','odds_ms_a'][cls]
            odds_val = row[odds_col]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue
            # Additional filter: only bet when odds give reasonable EV
            if est_p * odds_val < 1.0:
                continue

            backed.add(cls)
            won = (row.label_ms == cls)
            pnl = stake * (odds_val - 1) if won else -stake
            fold_bets += 1
            if won:
                fold_wins += 1
            fold_pnl += pnl
            all_results.append({'fold': fold, 'won': won, 'pnl': pnl,
                                'odds': odds_val, 'stake': stake,
                                'cls': ['H','D','A'][cls]})

    if fold_bets > 0:
        roi = fold_pnl / (fold_bets * stake) * 100
        print(f"    Best strategies: {[(h,a,['H','D','A'][c],f'EV={e:.3f}') for h,a,c,_,_,e,_ in best]}")
        print(f"    Bets: {fold_bets}, Wins: {fold_wins} ({fold_wins/fold_bets*100:.1f}%), "
              f"ROI: {roi:+.1f}%, PnL: {fold_pnl:+.0f}")
    else:
        # Make an empty fold visible instead of printing only its header.
        print("    No qualifying bets in test window")

# Overall: aggregate every bet recorded in `all_results` across the folds and
# print totals, a per-outcome breakdown and a sampled cumulative PnL curve.
print("\n" + "="*65)
print("  OVERALL RESULTS")
print("="*65)
if all_results:
    total = len(all_results)
    wins = sum(1 for r in all_results if r['won'])
    total_pnl = sum(r['pnl'] for r in all_results)
    total_staked = sum(r['stake'] for r in all_results)
    roi = total_pnl / total_staked * 100

    print(f"  Total bets:   {total}")
    print(f"  Wins:         {wins} ({wins/total*100:.1f}%)")
    print(f"  Total staked: {total_staked:.0f}")
    print(f"  PnL:          {total_pnl:+.0f}")
    print(f"  ROI:          {roi:+.1f}%")
    print(f"  Avg odds:     {np.mean([r['odds'] for r in all_results]):.2f}")

    # By class ('H' and 'A' are the only labels the rest rules ever back)
    print("\n  --- By Bet Type ---")
    for cls in ['H','A']:
        cb = [r for r in all_results if r['cls'] == cls]
        if cb:
            cw = sum(1 for r in cb if r['won'])
            cp = sum(r['pnl'] for r in cb)
            cs = sum(r['stake'] for r in cb)
            print(f"    {cls}: {len(cb)} bets, hit={cw/len(cb)*100:.1f}%, ROI={cp/cs*100:+.1f}%")

    # Cumulative PnL curve, sampled at roughly 15 checkpoints.
    # A single running total replaces re-summing the whole prefix at every
    # checkpoint, and a checkpoint that lands on the very last bet is left to
    # the FINAL line so that bet is not printed twice.
    print("\n  --- Cumulative PnL ---")
    step = max(1, total // 15)
    cum = 0
    for j, r in enumerate(all_results):
        cum += r['pnl']
        if j % step == 0 and j + 1 < total:
            print(f"    After bet {j+1:4d}: PnL={cum:+.0f}")
    # Use the summary total so the FINAL line always matches "PnL:" above.
    cum = total_pnl
    print(f"    After bet {total:4d}: PnL={cum:+.0f} (FINAL)")
else:
    print("  No bets placed!")

# ── Now combine with MODEL for smarter filtering ──
# On the last 20% of rows, a bet is placed only when a rest rule fires AND the
# averaged model probability beats the margin-free implied probability by at
# least 0.03.
print("\n" + "="*65)
print("  COMBINED: Rest Rules + Fundamentals Model")
print("="*65)

import pickle, json
from pathlib import Path
MODELS_DIR = Path("models/v27")

# Close the feature-list file deterministically instead of leaking the handle.
with open(MODELS_DIR / "v27_feature_cols.json") as fh:
    feat_cols = json.load(fh)

ms_models = {}
for name in ['xgb','lgb','cb']:
    p = MODELS_DIR / f"v27_ms_{name}.pkl"
    if p.exists():
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load model files produced by a trusted training run.
        with open(p,'rb') as f:
            ms_models[name] = pickle.load(f)

if ms_models:
    # NOTE(review): for an out-of-sample result the pickled models must not
    # have been trained on this last-20% slice — confirm against training.
    test_df = df.iloc[int(n*0.8):].copy()
    X_test = test_df[feat_cols].values

    # Get model predictions
    preds = []
    for name, m in ms_models.items():
        if name == 'xgb':
            import xgboost as xgb
            dm = xgb.DMatrix(X_test, feature_names=feat_cols)
            preds.append(m.predict(dm))
        elif name == 'lgb':
            preds.append(m.predict(X_test))
        elif name == 'cb':
            preds.append(m.predict_proba(X_test))
    # Simple unweighted ensemble.
    # Assumes every model returns per-class probabilities of the same shape,
    # columns ordered like the class ids — TODO confirm.
    model_probs = np.mean(preds, axis=0)  # (n, 3)

    # Now apply rest rules + model agreement.
    # Implied probabilities: inverse odds normalised by their sum (the
    # bookmaker overround), so each row sums to 1.
    margin = 1/test_df.odds_ms_h.values + 1/test_df.odds_ms_d.values + 1/test_df.odds_ms_a.values
    impl = np.column_stack([
        (1/test_df.odds_ms_h.values)/margin,
        (1/test_df.odds_ms_d.values)/margin,
        (1/test_df.odds_ms_a.values)/margin,
    ])

    combo_bets = 0
    combo_wins = 0
    combo_pnl = 0

    # (home rest > hr, away rest < ar) thresholds. They are nested: any match
    # passing (14,5) also passes (10,5), (7,5) and (5,5).
    rest_rules = [(14,5),(10,5),(7,5),(5,5)]

    for j in range(len(test_df)):
        row = test_df.iloc[j]
        if pd.isna(row.home_days_rest) or pd.isna(row.away_days_rest):
            continue
        # Evaluate the rule set once per match. Looping over the rules and
        # betting inside the loop would count the same bet up to four times
        # for well-rested home sides, inflating bets, stake and PnL.
        if not any(row.home_days_rest > hr and row.away_days_rest < ar
                   for hr, ar in rest_rules):
            continue
        for cls in [0, 2]:
            odds_val = [row.odds_ms_h, row.odds_ms_d, row.odds_ms_a][cls]
            if pd.isna(odds_val) or odds_val < 1.50 or odds_val > 5.0:
                continue

            model_p = model_probs[j, cls]
            impl_p = impl[j, cls]

            # DOUBLE FILTER: rest rule + model agrees (model_prob > implied)
            if model_p <= impl_p:
                continue  # model disagrees, skip
            edge = model_p - impl_p
            if edge < 0.03:
                continue  # too small

            won = (row.label_ms == cls)
            pnl = 10 * (odds_val - 1) if won else -10  # flat 10-unit stake
            combo_bets += 1
            if won:
                combo_wins += 1
            combo_pnl += pnl

    if combo_bets > 0:
        roi = combo_pnl / (combo_bets * 10) * 100
        print(f"  Bets:   {combo_bets}")
        print(f"  Wins:   {combo_wins} ({combo_wins/combo_bets*100:.1f}%)")
        print(f"  PnL:    {combo_pnl:+.0f}")
        print(f"  ROI:    {roi:+.1f}%")
    else:
        print("  No combined bets triggered")
else:
    # Say why the section is empty instead of ending silently after the banner.
    print(f"  No model files found in {MODELS_DIR}; combined backtest skipped")