main

2026-05-17 02:17:22 +03:00
parent 17ace9bd12
commit 94c7a4481a
53 changed files with 29602 additions and 7832 deletions
@@ -0,0 +1,510 @@
+"""
+Calibration Backfill Script
+============================
+Runs V25 model against historical matches (using pre-computed ai_features + odds)
+to generate calibration training data, then trains isotonic calibration models.
+
+Usage:
+    python ai-engine/scripts/backfill_calibration.py
+    python ai-engine/scripts/backfill_calibration.py --limit 5000
+    python ai-engine/scripts/backfill_calibration.py --min-samples 50
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import psycopg2
+from psycopg2.extras import RealDictCursor
+from dotenv import load_dotenv
+
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, AI_ENGINE_DIR)
+
+from models.v25_ensemble import V25Predictor
+from models.calibration import get_calibrator
+
+load_dotenv()
+
+
+def _normalize_pick(pick) -> str:
+    return str(pick or "").strip().casefold()
+
+
+def resolve_actual(market, pick, score_home, score_away, ht_home, ht_away):
+    if score_home is None or score_away is None:
+        return None
+    market = (market or "").upper()
+    p = _normalize_pick(pick)
+    total = score_home + score_away
+    ht_total = (ht_home or 0) + (ht_away or 0) if ht_home is not None else None
+
+    if market == "MS":
+        if p == "1": return int(score_home > score_away)
+        if p in {"x", "0"}: return int(score_home == score_away)
+        if p == "2": return int(score_away > score_home)
+        return None
+    if market in {"OU15", "OU25", "OU35"}:
+        line = {"OU15": 1.5, "OU25": 2.5, "OU35": 3.5}[market]
+        if "over" in p or "üst" in p or "ust" in p: return int(total > line)
+        if "under" in p or "alt" in p: return int(total < line)
+        return None
+    if market == "BTTS":
+        both = score_home > 0 and score_away > 0
+        if "yes" in p or "var" in p: return int(both)
+        if "no" in p or "yok" in p: return int(not both)
+        return None
+    if market == "HT":
+        if ht_home is None or ht_away is None: return None
+        if p == "1": return int(ht_home > ht_away)
+        if p in {"x", "0"}: return int(ht_home == ht_away)
+        if p == "2": return int(ht_away > ht_home)
+        return None
+    if market == "HTFT":
+        if ht_home is None or ht_away is None or "/" not in p: return None
+        ht_p, ft_p = p.split("/")
+        ht_actual = "1" if ht_home > ht_away else "2" if ht_away > ht_home else "x"
+        ft_actual = "1" if score_home > score_away else "2" if score_away > score_home else "x"
+        return int(ht_p.strip() == ht_actual and ft_p.strip() == ft_actual)
+    if market == "DC":
+        norm = p.replace("-", "").upper()
+        if norm == "1X": return int(score_home >= score_away)
+        if norm == "X2": return int(score_away >= score_home)
+        if norm == "12": return int(score_home != score_away)
+        return None
+    return None
+
+
+def calibrator_key(market, pick):
+    m = (market or "").upper()
+    p = _normalize_pick(pick)
+    if m == "MS":
+        if p == "1": return "ms_home"
+        if p in {"x", "0"}: return "ms_draw"
+        if p == "2": return "ms_away"
+        return None
+    if m == "DC": return "dc"
+    if m == "OU15" and ("over" in p or "üst" in p): return "ou15"
+    if m == "OU25" and ("over" in p or "üst" in p): return "ou25"
+    if m == "OU35" and ("over" in p or "üst" in p): return "ou35"
+    if m == "BTTS" and ("yes" in p or "var" in p): return "btts"
+    if m == "HT":
+        if p == "1": return "ht_home"
+        if p in {"x", "0"}: return "ht_draw"
+        if p == "2": return "ht_away"
+        return None
+    if m == "HTFT": return "ht_ft"
+    return None
+
+
+def get_conn():
+    db_url = os.getenv("DATABASE_URL", "")
+    if "?schema=" in db_url:
+        db_url = db_url.split("?schema=")[0]
+    if not db_url:
+        raise ValueError("DATABASE_URL not set")
+    return psycopg2.connect(db_url, cursor_factory=RealDictCursor)
+
+
+# Odds category name -> {selection label -> odds key}.
+# load_odds_batch looks the lower-cased category and selection up here by
+# exact match. The category names are Turkish: "maç sonucu" = match result,
+# "1. yarı sonucu" = first-half result. Both "0" and "x" denote the draw.
+ODD_CAT_MAP = {
+    "maç sonucu": {"1": "ms_h", "0": "ms_d", "x": "ms_d", "2": "ms_a"},
+    "1. yarı sonucu": {"1": "ht_ms_h", "0": "ht_ms_d", "x": "ht_ms_d", "2": "ht_ms_a"},
+}
+
+# Odds category name -> {keyword -> odds key}.
+# The category must match exactly; the keyword only has to occur somewhere in
+# the lower-cased selection label. Turkish terms: "karşılıklı gol" = both teams
+# to score, "var"/"yok" = yes/no, "alt"/"üst" = under/over, "ilk yarı" = first half.
+ODD_CAT_KEYWORD_MAP = {
+    "karşılıklı gol": {"var": "btts_y", "yok": "btts_n"},
+    "0,5 alt/üst": {"alt": "ou05_u", "üst": "ou05_o"},
+    "1,5 alt/üst": {"alt": "ou15_u", "üst": "ou15_o"},
+    "2,5 alt/üst": {"alt": "ou25_u", "üst": "ou25_o"},
+    "3,5 alt/üst": {"alt": "ou35_u", "üst": "ou35_o"},
+    "ilk yarı 0,5 alt/üst": {"alt": "ht_ou05_u", "üst": "ht_ou05_o"},
+    "ilk yarı 1,5 alt/üst": {"alt": "ht_ou15_u", "üst": "ht_ou15_o"},
+}
+
+
+def load_matches(cur, limit: int) -> List[Dict]:
+    cur.execute("""
+        SELECT m.id, m.score_home, m.score_away,
+               m.ht_score_home, m.ht_score_away
+        FROM matches m
+        JOIN football_ai_features f ON f.match_id = m.id
+        WHERE m.status = 'FT'
+          AND m.sport = 'football'
+          AND m.score_home IS NOT NULL
+          AND m.score_away IS NOT NULL
+        ORDER BY m.mst_utc DESC
+        LIMIT %s
+    """, (limit,))
+    return cur.fetchall()
+
+
+def load_ai_features_batch(cur, match_ids: List[str]) -> Dict[str, Dict]:
+    if not match_ids:
+        return {}
+    ph = ",".join(["%s"] * len(match_ids))
+    cur.execute(f"""
+        SELECT match_id,
+               home_elo AS home_overall_elo,
+               away_elo AS away_overall_elo,
+               elo_diff,
+               home_home_elo, away_away_elo,
+               home_form_elo, away_form_elo,
+               (home_form_elo - away_form_elo) AS form_elo_diff,
+               home_goals_avg_5 AS home_goals_avg,
+               home_conceded_avg_5 AS home_conceded_avg,
+               away_goals_avg_5 AS away_goals_avg,
+               away_conceded_avg_5 AS away_conceded_avg,
+               home_clean_sheet_rate, away_clean_sheet_rate,
+               home_scoring_rate, away_scoring_rate,
+               home_win_streak AS home_winning_streak,
+               away_win_streak AS away_winning_streak,
+               0 AS home_unbeaten_streak,
+               0 AS away_unbeaten_streak,
+               h2h_total AS h2h_total_matches,
+               h2h_home_win_rate,
+               (1.0 - h2h_home_win_rate - 0.33) AS h2h_draw_rate,
+               h2h_avg_goals,
+               h2h_btts_rate, h2h_over25_rate,
+               home_avg_possession, away_avg_possession,
+               home_avg_shots_on_target, away_avg_shots_on_target,
+               home_shot_conversion, away_shot_conversion,
+               0.0 AS home_avg_corners, 0.0 AS away_avg_corners,
+               implied_home, implied_draw, implied_away,
+               league_avg_goals,
+               0.0 AS league_zero_goal_rate,
+               0.0 AS home_xga, 0.0 AS away_xga,
+               0.0 AS upset_atmosphere, 0.0 AS upset_motivation,
+               0.0 AS upset_fatigue, 0.0 AS upset_potential,
+               referee_home_bias, referee_avg_goals,
+               referee_avg_cards AS referee_cards_total,
+               0.0 AS referee_avg_yellow,
+               0.0 AS referee_experience,
+               0.0 AS home_momentum_score, 0.0 AS away_momentum_score,
+               0.0 AS momentum_diff,
+               0.0 AS home_squad_quality, 0.0 AS away_squad_quality,
+               0.0 AS squad_diff,
+               0 AS home_key_players, 0 AS away_key_players,
+               missing_players_impact AS home_missing_impact,
+               0.0 AS away_missing_impact,
+               home_goals_avg_5 AS home_goals_form,
+               away_goals_avg_5 AS away_goals_form
+        FROM football_ai_features
+        WHERE match_id IN ({ph})
+    """, match_ids)
+    return {str(row["match_id"]): dict(row) for row in cur.fetchall()}
+
+
+def load_odds_batch(cur, match_ids: List[str]) -> Dict[str, Dict[str, float]]:
+    if not match_ids:
+        return {}
+    ph = ",".join(["%s"] * len(match_ids))
+    cur.execute(f"""
+        SELECT oc.match_id, oc.name AS cat_name,
+               os.name AS sel_name, os.odd_value
+        FROM odd_selections os
+        JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
+        WHERE oc.match_id IN ({ph})
+    """, match_ids)
+
+    odds: Dict[str, Dict[str, float]] = {}
+    for row in cur.fetchall():
+        mid = str(row["match_id"])
+        cat = (row["cat_name"] or "").lower().strip()
+        sel = (row["sel_name"] or "").strip()
+        val = float(row["odd_value"]) if row["odd_value"] else 0
+        if val <= 0:
+            continue
+        if mid not in odds:
+            odds[mid] = {}
+
+        if cat in ODD_CAT_MAP:
+            key = ODD_CAT_MAP[cat].get(sel.lower())
+            if key:
+                odds[mid][key] = val
+        else:
+            for cat_pattern, kw_map in ODD_CAT_KEYWORD_MAP.items():
+                if cat == cat_pattern:
+                    for keyword, key in kw_map.items():
+                        if keyword in sel.lower():
+                            odds[mid][key] = val
+                    break
+    return odds
+
+
+# (market code, pick label, extractor) triples; each extractor pulls the
+# pick's probability out of a probability vector ``p`` by position.
+# NOTE(review): nothing in the visible part of this file references this
+# table -- run_backfill lists its markets itself; confirm whether it is dead.
+# NOTE(review): the binary markets read p[0] as the "over"/"yes" probability;
+# verify that index against what the predictor actually returns.
+MARKETS_TO_PREDICT = [
+    ("MS", "1", lambda p: p[0]),
+    ("MS", "X", lambda p: p[1]),
+    ("MS", "2", lambda p: p[2]),
+    ("OU25", "Over 2.5", lambda p: p[0]),
+    ("BTTS", "Yes", lambda p: p[0]),
+    ("OU15", "Over 1.5", lambda p: p[0]),
+    ("OU35", "Over 3.5", lambda p: p[0]),
+    ("HT", "1", lambda p: p[0]),
+    ("HT", "X", lambda p: p[1]),
+    ("HT", "2", lambda p: p[2]),
+]
+
+
+def run_backfill(args):
+    print("=" * 70)
+    print("CALIBRATION BACKFILL")
+    print("=" * 70)
+
+    conn = get_conn()
+    cur = conn.cursor(cursor_factory=RealDictCursor)
+
+    t0 = time.time()
+    print(f"Loading matches (limit={args.limit})...")
+    matches = load_matches(cur, args.limit)
+    print(f"  Found {len(matches)} finished matches with ai_features")
+
+    match_ids = [str(m["id"]) for m in matches]
+    match_map = {str(m["id"]): m for m in matches}
+
+    print("Loading ai_features...")
+    features_map = load_ai_features_batch(cur, match_ids)
+    print(f"  Loaded features for {len(features_map)} matches")
+
+    print("Loading odds...")
+    odds_map = load_odds_batch(cur, match_ids)
+    print(f"  Loaded odds for {len(odds_map)} matches")
+
+    print(f"Data loading: {time.time() - t0:.1f}s")
+
+    print("\nLoading V25 model...")
+    predictor = V25Predictor()
+    predictor.load_models()
+
+    feature_cols = predictor.FEATURE_COLS
+
+    samples: List[Dict[str, Any]] = []
+    skipped = 0
+    processed = 0
+
+    print(f"\nRunning predictions on {len(match_ids)} matches...")
+    t1 = time.time()
+
+    for i, mid in enumerate(match_ids):
+        if mid not in features_map:
+            skipped += 1
+            continue
+
+        feat_row = features_map[mid]
+        odds_row = odds_map.get(mid, {})
+        match_row = match_map[mid]
+
+        feat_dict = {}
+        for col in feature_cols:
+            if col in feat_row and feat_row[col] is not None:
+                feat_dict[col] = float(feat_row[col])
+            elif col.startswith("odds_") and not col.endswith("_present"):
+                odds_key = col.replace("odds_", "")
+                feat_dict[col] = float(odds_row.get(odds_key, 0))
+            elif col.endswith("_present"):
+                base = col.replace("_present", "")
+                odds_key = base.replace("odds_", "")
+                feat_dict[col] = 1.0 if odds_row.get(odds_key, 0) > 0 else 0.0
+            else:
+                feat_dict[col] = 0.0
+
+        if odds_row.get("ms_h", 0) > 0:
+            feat_dict["odds_ms_h"] = odds_row["ms_h"]
+        if odds_row.get("ms_d", 0) > 0:
+            feat_dict["odds_ms_d"] = odds_row["ms_d"]
+        if odds_row.get("ms_a", 0) > 0:
+            feat_dict["odds_ms_a"] = odds_row["ms_a"]
+
+        ms_h = feat_dict.get("odds_ms_h", 0)
+        ms_d = feat_dict.get("odds_ms_d", 0)
+        ms_a = feat_dict.get("odds_ms_a", 0)
+        if ms_h > 0 and ms_d > 0 and ms_a > 0:
+            raw_sum = 1/ms_h + 1/ms_d + 1/ms_a
+            feat_dict["implied_home"] = (1/ms_h) / raw_sum
+            feat_dict["implied_draw"] = (1/ms_d) / raw_sum
+            feat_dict["implied_away"] = (1/ms_a) / raw_sum
+
+        sh = match_row["score_home"]
+        sa = match_row["score_away"]
+        ht_h = match_row.get("ht_score_home")
+        ht_a = match_row.get("ht_score_away")
+
+        try:
+            X = pd.DataFrame([{c: feat_dict.get(c, 0.0) for c in feature_cols}])
+
+            for market_name, model_key, market_list in [
+                ("ms", "ms", ["MS"]),
+                ("ou25", "ou25", ["OU25"]),
+                ("btts", "btts", ["BTTS"]),
+                ("ou15", "ou15", ["OU15"]),
+                ("ou35", "ou35", ["OU35"]),
+                ("ht_result", "ht_result", ["HT"]),
+            ]:
+                if model_key not in predictor.models:
+                    continue
+
+                probs = predictor.predict_market(model_key, feat_dict)
+                if probs is None:
+                    continue
+
+                if model_key == "ms":
+                    for pick, prob in [("1", probs[0]), ("X", probs[1]), ("2", probs[2])]:
+                        actual = resolve_actual("MS", pick, sh, sa, ht_h, ht_a)
+                        key = calibrator_key("MS", pick)
+                        if actual is not None and key:
+                            samples.append({
+                                "match_id": mid,
+                                "market": "MS",
+                                "pick": pick,
+                                "key": key,
+                                "raw_prob": float(prob),
+                                "actual": int(actual),
+                            })
+
+                elif model_key == "ht_result":
+                    if ht_h is None or ht_a is None:
+                        continue
+                    for pick, prob in [("1", probs[0]), ("X", probs[1]), ("2", probs[2])]:
+                        actual = resolve_actual("HT", pick, sh, sa, ht_h, ht_a)
+                        key = calibrator_key("HT", pick)
+                        if actual is not None and key:
+                            samples.append({
+                                "match_id": mid,
+                                "market": "HT",
+                                "pick": pick,
+                                "key": key,
+                                "raw_prob": float(prob),
+                                "actual": int(actual),
+                            })
+
+                elif model_key in ("ou25", "ou15", "ou35"):
+                    market_upper = model_key.upper()
+                    over_prob = float(probs[0]) if len(probs) > 0 else 0.5
+                    pick = f"Over"
+                    actual = resolve_actual(market_upper, "Over", sh, sa, ht_h, ht_a)
+                    key = calibrator_key(market_upper, "Over")
+                    if actual is not None and key:
+                        samples.append({
+                            "match_id": mid,
+                            "market": market_upper,
+                            "pick": pick,
+                            "key": key,
+                            "raw_prob": over_prob,
+                            "actual": int(actual),
+                        })
+
+                elif model_key == "btts":
+                    yes_prob = float(probs[0]) if len(probs) > 0 else 0.5
+                    actual = resolve_actual("BTTS", "Yes", sh, sa, ht_h, ht_a)
+                    key = calibrator_key("BTTS", "Yes")
+                    if actual is not None and key:
+                        samples.append({
+                            "match_id": mid,
+                            "market": "BTTS",
+                            "pick": "Yes",
+                            "key": key,
+                            "raw_prob": yes_prob,
+                            "actual": int(actual),
+                        })
+
+            processed += 1
+
+        except Exception as e:
+            skipped += 1
+            if skipped <= 5:
+                print(f"  Error on {mid}: {e}")
+
+        if (i + 1) % 5000 == 0:
+            elapsed = time.time() - t1
+            rate = (i + 1) / elapsed
+            print(f"  Processed {i+1}/{len(match_ids)} ({rate:.0f} matches/s)")
+
+    elapsed = time.time() - t1
+    print(f"\nPrediction complete: {processed} matches, {skipped} skipped, {elapsed:.1f}s")
+
+    if not samples:
+        print("No calibration samples generated!")
+        cur.close()
+        conn.close()
+        return
+
+    df = pd.DataFrame(samples)
+    print(f"\nTotal calibration samples: {len(df)}")
+    print(f"Unique matches: {df['match_id'].nunique()}")
+    print(f"\nPer-key counts:")
+    for key, count in df["key"].value_counts().items():
+        print(f"  {key:<14} {count}")
+
+    print(f"\nTraining isotonic calibration models (min_samples={args.min_samples})...")
+    calibrator = get_calibrator()
+    results: Dict[str, Any] = {}
+    keys = sorted(df["key"].unique())
+
+    for key in keys:
+        sub = df[df["key"] == key].copy()
+        sub = sub.drop_duplicates(subset=["match_id", "key"], keep="first")
+        sub = sub.dropna(subset=["raw_prob", "actual"])
+        sub = sub[(sub["raw_prob"] > 0.0) & (sub["raw_prob"] < 1.0)]
+
+        n = len(sub)
+        if n < args.min_samples:
+            results[key] = {"status": "skipped", "samples": n}
+            continue
+
+        metrics = calibrator.train_calibration(
+            df=sub,
+            market=key,
+            prob_col="raw_prob",
+            actual_col="actual",
+            min_samples=args.min_samples,
+            save=True,
+        )
+        results[key] = {
+            "status": "trained",
+            "samples": metrics.sample_count,
+            "brier": round(metrics.brier_score, 4),
+            "ece": round(metrics.calibration_error, 4),
+            "mean_predicted": round(metrics.mean_predicted, 4),
+            "mean_actual": round(metrics.mean_actual, 4),
+        }
+
+    print("\n" + "=" * 70)
+    print("CALIBRATION RESULTS")
+    print("=" * 70)
+    print(f"{'market':<14} {'status':<10} {'n':<8} {'brier':<9} {'ece':<8} {'pred_avg':<9} {'actual_avg'}")
+    print("-" * 70)
+    for key, info in sorted(results.items()):
+        if info["status"] == "trained":
+            print(
+                f"{key:<14} {'OK':<10} {info['samples']:<8} "
+                f"{info['brier']:<9.4f} {info['ece']:<8.4f} "
+                f"{info['mean_predicted']:<9.4f} {info['mean_actual']}"
+            )
+        else:
+            print(f"{key:<14} {'SKIP':<10} {info['samples']:<8}")
+    print("=" * 70)
+
+    total_time = time.time() - t0
+    print(f"\nTotal time: {total_time:.1f}s")
+    print(f"Calibration models saved to: {os.path.join(AI_ENGINE_DIR, 'models', 'calibration')}/")
+
+    cur.close()
+    conn.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Backfill calibration from historical matches")
+    parser.add_argument("--limit", type=int, default=50000,
+                        help="Max matches to process (default: 50000)")
+    parser.add_argument("--min-samples", type=int, default=100,
+                        help="Min samples per market for calibration (default: 100)")
+    args = parser.parse_args()
+    run_backfill(args)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,352 @@
+"""
+Tutarsızlık Bazlı Backtest
+============================
+Modeller arası tutarsızlığı ölçer, tutarlı maçlarda bahis açılsaydı
+ROI ne olurdu hesaplar.
+
+Mantık:
+- Her maç için market'ler arası çelişkileri tespit et
+- Tutarsız maçları filtrele
+- Tutarlı maçlarda hit rate ve ROI hesapla
+
+Usage:
+  python scripts/backtest_consistency.py
+"""
+
+import os, sys, json
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import accuracy_score
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                         'data', 'training_data.csv')
+MODELS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                          'models', 'v25')
+
+SKIP_COLS = {
+    'match_id','home_team_id','away_team_id','league_id','mst_utc',
+    'score_home','score_away','total_goals','ht_score_home','ht_score_away','ht_total_goals',
+    'label_ms','label_ou05','label_ou15','label_ou25','label_ou35','label_btts',
+    'label_ht_result','label_ht_ou05','label_ht_ou15','label_ht_ft',
+    'label_odd_even','label_yellow_cards','label_cards_ou45','label_handicap_ms',
+}
+
+
+def load_model(market: str):
+    path = os.path.join(MODELS_DIR, f'xgb_v25_{market}.json')
+    if not os.path.exists(path):
+        return None
+    b = xgb.Booster()
+    b.load_model(path)
+    return b
+
+
+def predict_proba(model, X: np.ndarray, feature_cols: list, n_class: int):
+    dmat = xgb.DMatrix(pd.DataFrame(X, columns=feature_cols))
+    raw = model.predict(dmat)
+    if n_class > 2:
+        return raw.reshape(-1, n_class)
+    return np.column_stack([1 - raw, raw])
+
+
+def consistency_score(probs: dict) -> tuple[float, list]:
+    """
+    Market'ler arası tutarsızlığı hesapla.
+    0 = tamamen tutarlı, 1 = tamamen çelişkili.
+
+    Kontrol edilen çelişkiler:
+    1. OU15 üst yüksek ama OU25 üst de yüksek → ok
+       OU15 üst yüksek ama OU25 alt yüksek → ÇELISKI (1 gol bekleniyor ama 2.5+ da bekleniyor?)
+
+    2. HT_OU05 üst yüksek ama HT sonucu draw yüksek → ÇELISKI
+
+    3. OU35 üst yüksek ama BTTS düşük → şüpheli
+
+    4. MS home yüksek ama HT away yüksek → çelişkili
+    """
+    conflicts = []
+    total_weight = 0
+    total_conflict = 0
+
+    # OU tutarlılığı: P(OU25>0.5) <= P(OU15>0.5) matematiksel zorunluluk
+    ou15_over = probs.get('ou15_over', 0.5)
+    ou25_over = probs.get('ou25_over', 0.5)
+    ou35_over = probs.get('ou35_over', 0.5)
+
+    # OU hiyerarşisi: ou35 <= ou25 <= ou15 olmalı
+    if ou25_over > ou15_over + 0.05:
+        gap = ou25_over - ou15_over
+        conflicts.append(f'OU25>{ou25_over:.0%} > OU15>{ou15_over:.0%} (imkansız)')
+        total_conflict += gap * 2
+    total_weight += 1
+
+    if ou35_over > ou25_over + 0.05:
+        gap = ou35_over - ou25_over
+        conflicts.append(f'OU35>{ou35_over:.0%} > OU25>{ou25_over:.0%} (imkansız)')
+        total_conflict += gap * 2
+    total_weight += 1
+
+    # HT_OU05 ve HT sonuç tutarlılığı
+    ht_ou05_over = probs.get('ht_ou05_over', 0.5)
+    ht_draw_prob = probs.get('ht_draw', 0.34)
+
+    # İlk yarıda gol bekleniyor ama beraberlik de bekleniyor (0-0 draw?)
+    # HT_OU05 >%70 ama HT draw >%50 → çelişkili (0-0 berabere çok?)
+    if ht_ou05_over > 0.70 and ht_draw_prob > 0.50:
+        conflict = min(ht_ou05_over - 0.5, ht_draw_prob - 0.4)
+        conflicts.append(f'HT_OU05>{ht_ou05_over:.0%} ama HT_Draw>{ht_draw_prob:.0%}')
+        total_conflict += conflict
+    total_weight += 1
+
+    # HT_OU05 ve HT_OU15 tutarlılığı
+    ht_ou15_over = probs.get('ht_ou15_over', 0.3)
+    if ht_ou15_over > ht_ou05_over + 0.05:
+        gap = ht_ou15_over - ht_ou05_over
+        conflicts.append(f'HT_OU15>{ht_ou15_over:.0%} > HT_OU05>{ht_ou05_over:.0%} (imkansız)')
+        total_conflict += gap * 2
+    total_weight += 1
+
+    # MS ve OU tutarlılığı
+    ms_home = probs.get('ms_home', 0.33)
+    ms_away = probs.get('ms_away', 0.33)
+    btts_yes = probs.get('btts_yes', 0.5)
+
+    # Tek takım galibiyeti kuvvetli ama BTTS yüksek → şüpheli
+    dominant = max(ms_home, ms_away)
+    if dominant > 0.65 and btts_yes > 0.65:
+        conflict = (dominant - 0.5) * (btts_yes - 0.5)
+        conflicts.append(f'MS dominant>{dominant:.0%} ama BTTS_Yes>{btts_yes:.0%}')
+        total_conflict += conflict * 0.5
+    total_weight += 1
+
+    # OU25 ve BTTS tutarlılığı
+    # BTTS yüksekse en az 2 gol → OU25 üst de yüksek olmalı
+    if btts_yes > 0.65 and ou25_over < 0.45:
+        conflict = btts_yes - ou25_over
+        conflicts.append(f'BTTS_Yes>{btts_yes:.0%} ama OU25>{ou25_over:.0%} düşük')
+        total_conflict += conflict
+    total_weight += 1
+
+    # OU35 üst yüksek ama BTTS düşük → şüpheli (3+ gol ama tek takım mı?)
+    if ou35_over > 0.45 and btts_yes < 0.40:
+        conflict = (ou35_over - 0.35) * (0.5 - btts_yes)
+        conflicts.append(f'OU35>{ou35_over:.0%} ama BTTS_Yes<{btts_yes:.0%}')
+        total_conflict += conflict
+    total_weight += 1
+
+    score = min(1.0, total_conflict / max(total_weight * 0.3, 0.1))
+    return score, conflicts
+
+
+def main():
+    print('Loading data...')
+    df = pd.read_csv(DATA_PATH, low_memory=False)
+
+    # Son %20 = test seti (kronolojik)
+    df = df.sort_values('mst_utc')
+    n_test = int(len(df) * 0.20)
+    df_test = df.tail(n_test).copy()
+    print(f'Test seti: {len(df_test):,} maç')
+
+    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
+
+    # Modelleri yükle
+    print('Modeller yükleniyor...')
+    models = {
+        'ms':       (load_model('ms'),        3),
+        'ou15':     (load_model('ou15'),       2),
+        'ou25':     (load_model('ou25'),       2),
+        'ou35':     (load_model('ou35'),       2),
+        'btts':     (load_model('btts'),       2),
+        'ht_result':(load_model('ht_result'),  3),
+        'ht_ou05':  (load_model('ht_ou05'),    2),
+        'ht_ou15':  (load_model('ht_ou15'),    2),
+    }
+    models = {k: v for k, v in models.items() if v[0] is not None}
+    print(f'Yüklenen model: {list(models.keys())}')
+
+    X = df_test[feature_cols].fillna(0).values
+
+    # Tüm tahminleri al
+    print('Tahminler yapılıyor...')
+    preds = {}
+    for mkey, (model, n_class) in models.items():
+        p = predict_proba(model, X, feature_cols, n_class)
+        preds[mkey] = p
+
+    # Her maç için tutarsızlık skoru ve tahmin kararı
+    results = []
+    for i in range(len(df_test)):
+        row = df_test.iloc[i]
+
+        # Olasılıkları topla
+        probs = {}
+        if 'ms' in preds:
+            probs['ms_home']    = preds['ms'][i][0]
+            probs['ms_draw']    = preds['ms'][i][1]
+            probs['ms_away']    = preds['ms'][i][2]
+        if 'ou15' in preds:
+            probs['ou15_over']  = preds['ou15'][i][1]
+        if 'ou25' in preds:
+            probs['ou25_over']  = preds['ou25'][i][1]
+        if 'ou35' in preds:
+            probs['ou35_over']  = preds['ou35'][i][1]
+        if 'btts' in preds:
+            probs['btts_yes']   = preds['btts'][i][1]
+        if 'ht_result' in preds:
+            probs['ht_home']    = preds['ht_result'][i][0]
+            probs['ht_draw']    = preds['ht_result'][i][1]
+            probs['ht_away']    = preds['ht_result'][i][2]
+        if 'ht_ou05' in preds:
+            probs['ht_ou05_over'] = preds['ht_ou05'][i][1]
+        if 'ht_ou15' in preds:
+            probs['ht_ou15_over'] = preds['ht_ou15'][i][1]
+
+        c_score, conflicts = consistency_score(probs)
+
+        # Gerçek sonuçlar
+        actual = {
+            'ms':    int(row.get('label_ms', -1)),
+            'ou15':  int(row.get('label_ou15', -1)),
+            'ou25':  int(row.get('label_ou25', -1)),
+            'ou35':  int(row.get('label_ou35', -1)),
+            'btts':  int(row.get('label_btts', -1)),
+        }
+
+        # Her market için tahmin ve doğruluk
+        market_results = {}
+        for mkt, label_key in [('ms','ms'),('ou15','ou15'),('ou25','ou25'),
+                                ('ou35','ou35'),('btts','btts')]:
+            if mkt not in preds or actual[label_key] < 0:
+                continue
+            pred_class = int(np.argmax(preds[mkt][i]))
+            correct = int(pred_class == actual[label_key])
+
+            # Odds (implied prob → odds = 1/prob)
+            pred_prob = float(preds[mkt][i][pred_class])
+            implied_odds = 1 / pred_prob if pred_prob > 0.01 else 10.0
+            # ROI hesabı: 1 birim bahis, kazanırsa (odds-1) kazanç, kaybederse -1
+            roi = (implied_odds - 1) * correct - (1 - correct)
+
+            market_results[mkt] = {
+                'pred': pred_class,
+                'actual': actual[label_key],
+                'correct': correct,
+                'prob': pred_prob,
+                'roi': roi,
+            }
+
+        results.append({
+            'idx': i,
+            'consistency_score': c_score,
+            'conflicts': conflicts,
+            'probs': probs,
+            'market_results': market_results,
+        })
+
+    df_results = pd.DataFrame([{
+        'consistency_score': r['consistency_score'],
+        'n_conflicts': len(r['conflicts']),
+        **{f'{m}_correct': r['market_results'].get(m, {}).get('correct', None)
+           for m in ['ms','ou15','ou25','ou35','btts']},
+        **{f'{m}_roi': r['market_results'].get(m, {}).get('roi', None)
+           for m in ['ms','ou15','ou25','ou35','btts']},
+    } for r in results])
+
+    # ── Analiz ──────────────────────────────────────────────────────────
+    print(f'\n{"="*70}')
+    print('TUTARSIZLIK ANALİZİ')
+    print(f'{"="*70}')
+
+    thresholds = [0.0, 0.1, 0.2, 0.3, 0.5]
+    markets = ['ms', 'ou15', 'ou25', 'ou35', 'btts']
+
+    for t in thresholds:
+        mask = df_results['consistency_score'] <= t
+        n = mask.sum()
+        if n < 50:
+            continue
+
+        print(f'\n[Tutarsızlık <= {t:.1f}] → {n:,} maç ({n/len(df_results)*100:.0f}%)')
+        print(f'  {"Market":<8} {"HitRate":>8} {"ROI/bahis":>10} {"Toplam ROI":>12}')
+        print(f'  {"-"*42}')
+        for m in markets:
+            col_c = f'{m}_correct'
+            col_r = f'{m}_roi'
+            if col_c not in df_results.columns:
+                continue
+            sub = df_results[mask][col_c].dropna()
+            roi_sub = df_results[mask][col_r].dropna()
+            if len(sub) < 20:
+                continue
+            hit = sub.mean()
+            avg_roi = roi_sub.mean()
+            total_roi = roi_sub.sum()
+            print(f'  {m:<8} {hit:>7.1%}  {avg_roi:>+9.3f}  {total_roi:>+11.1f}')
+
+    # Çelişki türlerine göre breakdown
+    print(f'\n{"="*70}')
+    print('EN SIK ÇELIŞKILER')
+    print(f'{"="*70}')
+    all_conflicts = [c for r in results for c in r['conflicts']]
+    from collections import Counter
+    for conflict, cnt in Counter(all_conflicts).most_common(10):
+        print(f'  {cnt:>5}x  {conflict}')
+
+    # Tutarsızlık dağılımı
+    print(f'\n{"="*70}')
+    print('TUTARSIZLIK DAĞILIMI')
+    print(f'{"="*70}')
+    for label, lo, hi in [
+        ('Tamamen tutarlı', 0.0, 0.05),
+        ('Çok tutarlı',     0.05, 0.15),
+        ('Orta',            0.15, 0.30),
+        ('Tutarsız',        0.30, 0.50),
+        ('Çok tutarsız',    0.50, 1.01),
+    ]:
+        mask = (df_results['consistency_score'] >= lo) & (df_results['consistency_score'] < hi)
+        n = mask.sum()
+        ou25_hit = df_results[mask]['ou25_correct'].mean()
+        ms_hit = df_results[mask]['ms_correct'].mean()
+        print(f'  {label:<20} {n:>6,} maç ({n/len(df_results)*100:>4.0f}%) | '
+              f'MS={ms_hit:.0%} OU25={ou25_hit:.0%}')
+
+    # Raporu kaydet
+    report = {
+        'total_test': len(df_results),
+        'thresholds': {},
+    }
+    for t in thresholds:
+        mask = df_results['consistency_score'] <= t
+        n = mask.sum()
+        report['thresholds'][str(t)] = {
+            'n_matches': int(n),
+            'pct': round(n/len(df_results)*100, 1),
+            'markets': {},
+        }
+        for m in markets:
+            col_c = f'{m}_correct'
+            col_r = f'{m}_roi'
+            if col_c not in df_results.columns:
+                continue
+            sub_c = df_results[mask][col_c].dropna()
+            sub_r = df_results[mask][col_r].dropna()
+            if len(sub_c) > 0:
+                report['thresholds'][str(t)]['markets'][m] = {
+                    'hit_rate': round(float(sub_c.mean()), 4),
+                    'avg_roi': round(float(sub_r.mean()), 4),
+                    'total_roi': round(float(sub_r.sum()), 2),
+                }
+
+    out_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                            'reports', 'backtest_consistency.json')
+    with open(out_path, 'w') as f:
+        json.dump(report, f, indent=2)
+    print(f'\nRapor: {out_path}')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,310 @@
+"""
+League Model Backtest — Son 100+ Maç
+======================================
+Her lig için en son 100-200 maçı (eğitim datasından bağımsız, test seti)
+lig bazlı modelle tahmin eder ve gerçek sonuçla karşılaştırır.
+
+Usage:
+  python scripts/backtest_league_models.py
+  python scripts/backtest_league_models.py --min-matches 150
+"""
+
+import os, sys, json, warnings, argparse
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import accuracy_score
+
+warnings.filterwarnings("ignore")
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from models.league_model import get_league_model_loader, MARKET_META, FILE_TO_SIGNAL
+
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DATA_PATH     = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
+REPORTS_DIR   = os.path.join(AI_ENGINE_DIR, "reports")
+QL_PATH       = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")
+
+# Ground-truth label columns (taken from the CSV), keyed by signal key
+LABEL_COLS = {
+    "MS":      "label_ms",
+    "OU15":    "label_ou15",
+    "OU25":    "label_ou25",
+    "OU35":    "label_ou35",
+    "BTTS":    "label_btts",
+    "HT":      "label_ht_result",
+    "HT_OU05": "label_ht_ou05",
+    "HT_OU15": "label_ht_ou15",
+    "HTFT":    "label_ht_ft",
+    "OE":      "label_odd_even",
+    "CARDS":   "label_cards_ou45",
+    "HCAP":    "label_handicap_ms",
+}
+
+# Inverse of FILE_TO_SIGNAL: signal key → model file key
+SIGNAL_TO_FILE = {v: k for k, v in FILE_TO_SIGNAL.items()}
+
+# Columns that main() leaves out when it builds the model feature list
+SKIP_COLS = {
+    "match_id","home_team_id","away_team_id","league_id","mst_utc",
+    "score_home","score_away","total_goals","ht_score_home","ht_score_away","ht_total_goals",
+    "label_ms","label_ou05","label_ou15","label_ou25","label_ou35","label_btts",
+    "label_ht_result","label_ht_ou05","label_ht_ou15","label_ht_ft",
+    "label_odd_even","label_yellow_cards","label_cards_ou45","label_handicap_ms",
+}
+
+
+def backtest_league(
+    league_id: str,
+    df_league: pd.DataFrame,
+    feature_cols: list,
+    league_model,
+    n_test: int,
+) -> dict:
+    """Backtest the league-specific model on the last ``n_test`` matches.
+
+    Rows are ordered by ``mst_utc`` and the most recent ``n_test`` form the
+    test set. For every signal in ``SIGNAL_TO_FILE`` that has a label column
+    and a model in ``league_model``, each labelled row is predicted with
+    ``league_model.predict_market`` and compared with its own label.
+
+    Args:
+        league_id: League identifier (not used by the computation; kept so the
+            signature stays stable for callers).
+        df_league: Rows of a single league; must contain ``mst_utc``, the
+            feature columns and the label columns.
+        feature_cols: Names of the feature columns handed to the model.
+        league_model: Object exposing ``has_market(key)`` and
+            ``predict_market(key, features)``; a falsy value gives ``{}``.
+        n_test: Number of most recent matches to evaluate.
+
+    Returns:
+        ``{signal_key: {"accuracy", "n", "source"}}`` for every market that
+        had at least 30 labelled rows and at least one prediction.
+    """
+    results = {}
+    if not league_model:
+        return results
+
+    df_test = df_league.sort_values("mst_utc").tail(n_test)
+
+    for sig_key, mfile_key in SIGNAL_TO_FILE.items():
+        label_col = LABEL_COLS.get(sig_key)
+        if not label_col or label_col not in df_test.columns:
+            continue
+        if not league_model.has_market(mfile_key):
+            continue
+
+        # Score labelled rows only. Features and labels come from the same
+        # filtered frame, so every prediction is compared with its own match.
+        df_valid = df_test[df_test[label_col].notna()]
+        if len(df_valid) < 30:
+            continue
+
+        labels = MARKET_META[mfile_key][1]
+        feature_rows = df_valid[feature_cols].fillna(0).to_dict("records")
+
+        y_eval = []
+        preds = []
+        for feat, actual in zip(feature_rows, df_valid[label_col].values):
+            probs = league_model.predict_market(mfile_key, feat)
+            if not probs:
+                # No prediction for this match: drop its label as well.
+                continue
+            best = max(probs, key=probs.__getitem__)
+            preds.append(labels.index(best))
+            y_eval.append(actual)
+
+        if not preds:
+            continue
+
+        acc = accuracy_score(y_eval, preds)
+        results[sig_key] = {
+            "accuracy": round(acc, 4),
+            "n": len(preds),
+            "source": "league_specific",
+        }
+
+    return results
+
+
+def backtest_with_general_v25(
+    df_test: pd.DataFrame,
+    feature_cols: list,
+) -> dict:
+    """Backtest the general V25 XGBoost models on ``df_test``.
+
+    For MS, OU15, OU25, OU35 and BTTS the rows that carry a label are predicted
+    with the ``"xgb"`` booster stored in ``v25.models[<market>]`` and compared
+    with the label column from ``LABEL_COLS``.
+
+    Args:
+        df_test: Test rows containing the feature and label columns.
+        feature_cols: Names of the feature columns fed to the boosters.
+
+    Returns:
+        ``{signal_key: {"accuracy", "n", "source"}}`` for every market that
+        could be evaluated. Returns ``{}`` when the V25 predictor cannot be
+        loaded. A market with fewer than 30 labelled rows, no model, or a
+        failing prediction is left out.
+    """
+    try:
+        from models.v25_ensemble import get_v25_predictor
+        v25 = get_v25_predictor()
+        if not v25._loaded:
+            v25.load_models()
+    except Exception as e:
+        # Best effort: the caller treats {} as "no general model available".
+        print(f"[general_v25] predictor unavailable: {e}", file=sys.stderr)
+        return {}
+
+    results = {}
+
+    # signal key → V25 model key
+    mkey_map = {
+        "MS":   "ms",
+        "OU15": "ou15",
+        "OU25": "ou25",
+        "OU35": "ou35",
+        "BTTS": "btts",
+    }
+
+    for sig_key, mkey in mkey_map.items():
+        label_col = LABEL_COLS.get(sig_key)
+        if not label_col or label_col not in df_test.columns:
+            continue
+
+        # Predict the labelled rows only, so y_true and preds always have the
+        # same length and refer to the same matches.
+        labelled = df_test[label_col].notna()
+        y_true = df_test.loc[labelled, label_col].values
+        if len(y_true) < 30 or not v25.has_market(mkey):
+            continue
+
+        try:
+            models_v25 = v25.models.get(mkey, {})
+            if "xgb" not in models_v25:
+                continue
+            X = df_test.loc[labelled, feature_cols].fillna(0)
+            dmat = xgb.DMatrix(X.values, feature_names=feature_cols)
+            raw = models_v25["xgb"].predict(dmat)
+            # NOTE(review): assumes MARKET_META is keyed by the V25 model key
+            # as well; unknown keys fall back to the binary case.
+            num_class = MARKET_META.get(mkey, (2,))[0]
+
+            if num_class > 2:
+                preds = np.argmax(raw.reshape(-1, num_class), axis=1)
+            else:
+                preds = (raw >= 0.5).astype(int)
+
+            acc = accuracy_score(y_true, preds)
+            results[sig_key] = {
+                "accuracy": round(acc, 4),
+                "n": len(preds),
+                "source": "general_v25",
+            }
+        except Exception as e:
+            # Skip this market but say why instead of dropping it silently.
+            print(f"[general_v25] {sig_key} skipped: {e}", file=sys.stderr)
+            continue
+
+    return results
+
+
+def main():
+    """Backtest every qualified league and save the per-market accuracies.
+
+    Reads ``DATA_PATH``, keeps the leagues listed in ``QL_PATH`` that have at
+    least ``n_test`` rows and scores the most recent ``n_test`` matches of
+    each. Leagues with a league-specific model are scored with its boosters and
+    any market still missing is filled from the general V25 model; leagues
+    without one use the general V25 model only. A table and a summary are
+    printed and the results are written to
+    ``REPORTS_DIR/backtest_league_results.json``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--min-matches", type=int, default=100)
+    parser.add_argument("--test-size",   type=int, default=150,
+                        help="Son kaç maçı test için kullan (min 100)")
+    args = parser.parse_args()
+    n_test = max(args.min_matches, args.test_size)
+
+    print(f"Loading training data ...")
+    df = pd.read_csv(DATA_PATH, low_memory=False)
+    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
+    print(f"  {len(df):,} maç | {len(feature_cols)} feature")
+
+    qualified = []
+    if os.path.exists(QL_PATH):
+        with open(QL_PATH) as f:
+            qualified = json.load(f)
+    loader = get_league_model_loader()
+
+    # League names are cosmetic: if the database is unreachable the table
+    # falls back to showing the (truncated) league id.
+    try:
+        import psycopg2
+        from data.db import get_clean_dsn
+        conn = psycopg2.connect(get_clean_dsn())
+        try:
+            cur = conn.cursor()
+            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
+            league_names = {r[0]: r[1] for r in cur.fetchall()}
+        finally:
+            conn.close()
+    except Exception:
+        league_names = {}
+
+    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
+    leagues_to_test = counts[counts >= n_test].index.tolist()
+    print(f"\nBacktest: {len(leagues_to_test)} lig (>={n_test} maç) | son {n_test} maç kullanılacak\n")
+
+    all_results = []
+    markets_order = ["MS", "OU15", "OU25", "OU35", "BTTS", "HT", "HT_OU05", "HT_OU15", "HTFT", "OE", "CARDS", "HCAP"]
+
+    header = f"{'Liga':<35} {'Maç':>5} | " + " | ".join(f"{m:>7}" for m in markets_order)
+    print(header)
+    print("-" * len(header))
+
+    for league_id in leagues_to_test:
+        df_league = df[df["league_id"] == league_id]
+        # str() keeps the fallback working when league ids are not strings.
+        name = league_names.get(league_id, str(league_id)[:20])
+        df_test = df_league.sort_values("mst_utc").tail(n_test)
+
+        league_model = loader.get(league_id)
+
+        if league_model and league_model.models:
+            # Batch predict from CSV features (fast)
+            mkt_results = {}
+
+            for mfile_key in list(league_model.models.keys()):
+                sig_key = FILE_TO_SIGNAL.get(mfile_key)
+                if not sig_key:
+                    continue
+                label_col = LABEL_COLS.get(sig_key)
+                if not label_col or label_col not in df_test.columns:
+                    continue
+
+                # Predict the labelled rows only so that predictions and labels
+                # stay aligned row by row.
+                labelled = df_test[label_col].notna()
+                y_true = df_test.loc[labelled, label_col].values
+                if len(y_true) < 30:
+                    continue
+
+                try:
+                    X = df_test.loc[labelled, feature_cols].fillna(0)
+                    dmat = xgb.DMatrix(X.values, feature_names=feature_cols)
+                    raw  = league_model.models[mfile_key].predict(dmat)
+                    nc   = MARKET_META[mfile_key][0]
+                    if nc > 2:
+                        preds = np.argmax(raw.reshape(-1, nc), axis=1)
+                    else:
+                        preds = (raw >= 0.5).astype(int)
+
+                    acc = accuracy_score(y_true, preds)
+                    mkt_results[sig_key] = {"accuracy": round(float(acc), 4), "n": len(preds), "source": "league_xgb"}
+                except Exception as e:
+                    # Keep the failure in the report instead of aborting the league.
+                    mkt_results[sig_key] = {"error": str(e)}
+
+            # Fill missing markets with general V25
+            gen_results = backtest_with_general_v25(df_test, feature_cols)
+            for k, v in gen_results.items():
+                if k not in mkt_results:
+                    mkt_results[k] = {**v, "source": "general_v25_fallback"}
+        else:
+            # No league model — use general V25
+            mkt_results = backtest_with_general_v25(df_test, feature_cols)
+            for k in mkt_results:
+                mkt_results[k]["source"] = "general_v25"
+
+        n_used = min(n_test, len(df_league))
+
+        # Print row
+        accs = []
+        for m in markets_order:
+            r = mkt_results.get(m, {})
+            if "accuracy" in r:
+                accs.append(f"{r['accuracy']*100:>6.1f}%")
+            else:
+                accs.append(f"{'—':>7}")
+        print(f"{name:<35} {n_used:>5} | " + " | ".join(accs))
+
+        all_results.append({
+            "league_id": league_id,
+            "league_name": name,
+            "n_tested": n_used,
+            "markets": mkt_results,
+        })
+
+    # ── Summary ───────────────────────────────────────────────────
+    print("\n" + "=" * len(header))
+    print("ORTALAMA DOĞRULUK (tüm ligler):")
+    for m in markets_order:
+        accs = [r["markets"][m]["accuracy"] for r in all_results if m in r["markets"] and "accuracy" in r["markets"][m]]
+        if accs:
+            print(f"  {m:<10}: {np.mean(accs)*100:.1f}%  (min={min(accs)*100:.1f}%  max={max(accs)*100:.1f}%  n_leagues={len(accs)})")
+
+    # Best / worst leagues by MS accuracy
+    ms_sorted = sorted(
+        [(r["league_name"], r["markets"].get("MS",{}).get("accuracy",0), r["n_tested"])
+         for r in all_results if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]],
+        key=lambda x: x[1], reverse=True
+    )
+    print("\nEN İYİ MS (Top 10):")
+    for name, acc, n in ms_sorted[:10]:
+        print(f"  {name:<35} {acc*100:.1f}%  ({n} maç)")
+    print("\nEN KÖTÜ MS (Bottom 10):")
+    for name, acc, n in ms_sorted[-10:]:
+        print(f"  {name:<35} {acc*100:.1f}%  ({n} maç)")
+
+    # Save (create the reports directory first; it may not exist yet)
+    report = {"generated_at": pd.Timestamp.now().isoformat(), "n_test_per_league": n_test, "results": all_results}
+    os.makedirs(REPORTS_DIR, exist_ok=True)
+    out_path = os.path.join(REPORTS_DIR, "backtest_league_results.json")
+    with open(out_path, "w") as f:
+        json.dump(report, f, indent=2)
+    print(f"\nRapor: {out_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,223 +1,136 @@
 """
-Real AI Engine Backtest Script
-==============================
-Uses the ACTUAL models (V20/V25 Ensemble) to predict historical matches.
-
-Usage:
-    python ai-engine/scripts/backtest_real.py
+Gerçek Odds Bazlı Backtest
+============================
+Model olasılığı vs gerçek bookmaker odds karşılaştırır.
+Edge varsa bahis açıldığı varsayılır, gerçek ROI hesaplanır.
 """

-import os
-import sys
-import json
-import time
-import psycopg2
-from psycopg2.extras import RealDictCursor
-from datetime import datetime
+import os, sys, json
+import numpy as np
+import pandas as pd
+import xgboost as xgb

-# Add paths
-AI_DIR = os.path.dirname(os.path.abspath(__file__))
-ROOT_DIR = os.path.dirname(AI_DIR)
-sys.path.insert(0, ROOT_DIR)
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-# Fix for Windows path issues in scripts
-if "scripts" in os.path.basename(AI_DIR):
-    ROOT_DIR = os.path.dirname(ROOT_DIR) # One level up if inside scripts folder
+DATA_PATH  = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'training_data.csv')
+MODELS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models', 'v25')
+REPORT_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'reports')

-from services.single_match_orchestrator import get_single_match_orchestrator, MatchData
+SKIP_COLS = {
+    'match_id','home_team_id','away_team_id','league_id','mst_utc',
+    'score_home','score_away','total_goals','ht_score_home','ht_score_away','ht_total_goals',
+    'label_ms','label_ou05','label_ou15','label_ou25','label_ou35','label_btts',
+    'label_ht_result','label_ht_ou05','label_ht_ou15','label_ht_ft',
+    'label_odd_even','label_yellow_cards','label_cards_ou45','label_handicap_ms',
+}

-def get_clean_dsn() -> str:
-    return "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db"
+# (model_key, n_class, pred_class, label_col, odds_col, isim)
+MARKETS = [
+    ('ms',   3, 0, 'label_ms',   'odds_ms_h',   'MS-Ev'),
+    ('ms',   3, 1, 'label_ms',   'odds_ms_d',   'MS-Ber'),
+    ('ms',   3, 2, 'label_ms',   'odds_ms_a',   'MS-Dep'),
+    ('ou15', 2, 1, 'label_ou15', 'odds_ou15_o', 'OU15-Ust'),
+    ('ou15', 2, 0, 'label_ou15', 'odds_ou15_u', 'OU15-Alt'),
+    ('ou25', 2, 1, 'label_ou25', 'odds_ou25_o', 'OU25-Ust'),
+    ('ou25', 2, 0, 'label_ou25', 'odds_ou25_u', 'OU25-Alt'),
+    ('ou35', 2, 1, 'label_ou35', 'odds_ou35_o', 'OU35-Ust'),
+    ('ou35', 2, 0, 'label_ou35', 'odds_ou35_u', 'OU35-Alt'),
+    ('btts', 2, 1, 'label_btts', 'odds_btts_y', 'BTTS-Var'),
+    ('btts', 2, 0, 'label_btts', 'odds_btts_n', 'BTTS-Yok'),
+]

-def run_backtest():
-    print("🚀 REAL AI BACKTEST: Sept 13, 2024 - Top Leagues")
-    print("🧠 Engine: V30 Ensemble (V20+V25)")
-    print("="*60)
+MIN_ODDS = 1.10
+MAX_ODDS = 10.0

-    # Load Top Leagues
-    leagues_path = os.path.join(ROOT_DIR, "top_leagues.json")
-    try:
-        with open(leagues_path, 'r') as f:
-            top_leagues = json.load(f)
-        league_ids = tuple(str(lid) for lid in top_leagues)
-        print(f"📋 Loaded {len(top_leagues)} top leagues.")
-    except Exception as e:
-        print(f"❌ Error loading top_leagues.json: {e}")
-        return

-    # Date Range (Sept 13, 2024)
-    start_dt = datetime(2024, 9, 13, 0, 0, 0)
-    end_dt = datetime(2024, 9, 13, 23, 59, 59)
-    start_ts = int(start_dt.timestamp() * 1000)
-    end_ts = int(end_dt.timestamp() * 1000)
+def load_model(market):
+    """Load the V25 XGBoost booster for ``market``.
+
+    Returns the loaded ``xgb.Booster``, or ``None`` when
+    ``xgb_v25_<market>.json`` does not exist under ``MODELS_DIR``.
+    """
+    model_file = os.path.join(MODELS_DIR, f'xgb_v25_{market}.json')
+    if os.path.exists(model_file):
+        booster = xgb.Booster()
+        booster.load_model(model_file)
+        return booster
+    return None

-    dsn = get_clean_dsn()
-    conn = psycopg2.connect(dsn)
-    cur = conn.cursor(cursor_factory=RealDictCursor)

-    # Fetch Matches
-    cur.execute("""
-        SELECT m.id, m.match_name, m.home_team_id, m.away_team_id, 
-               m.mst_utc, m.league_id, m.status, m.score_home, m.score_away,
-               t1.name as home_team, t2.name as away_team,
-               l.name as league_name
-        FROM matches m
-        LEFT JOIN teams t1 ON m.home_team_id = t1.id
-        LEFT JOIN teams t2 ON m.away_team_id = t2.id
-        LEFT JOIN leagues l ON m.league_id = l.id
-        WHERE m.mst_utc BETWEEN %s AND %s
-          AND m.league_id IN %s
-          AND m.status = 'FT'
-        ORDER BY m.mst_utc ASC
-        LIMIT 20  -- Limit to 20 matches to avoid running for hours on a single backtest
-    """, (start_ts, end_ts, league_ids))
-    
-    rows = cur.fetchall()
-    print(f"📊 Found {len(rows)} finished matches. Starting AI Analysis...")
+def main():
+    """Backtest model probabilities against bookmaker odds and report ROI.
+
+    The chronologically last 20% of ``DATA_PATH`` is the test set. For every
+    entry in ``MARKETS`` the model probability of the predicted class is
+    compared with the implied probability ``1 / odds``; a one-unit bet is
+    assumed whenever the edge reaches a threshold. Hit rate and ROI are
+    printed per threshold and all rows are written to
+    ``REPORT_DIR/backtest_real_odds.json``.
+    """
+    print('Veri yukleniyor...')
+    df = pd.read_csv(DATA_PATH, low_memory=False)
+    df = df.sort_values('mst_utc')
+    n_test = int(len(df) * 0.20)
+    df_test = df.tail(n_test).copy().reset_index(drop=True)
+    print(f'Test seti: {len(df_test):,} mac')
 
-    if not rows:
-        print("⚠️ No matches found for this date.")
-        cur.close()
-        conn.close()
-        return
+    feature_cols = [c for c in df.columns if c not in SKIP_COLS]
+    X = df_test[feature_cols].fillna(0).values
 
-    # Initialize AI Engine
-    try:
-        orchestrator = get_single_match_orchestrator()
-        print("✅ AI Engine (SingleMatchOrchestrator) Loaded.")
-    except Exception as e:
-        print(f"❌ Failed to load AI Engine: {e}")
-        print("💡 Make sure models are trained/present in ai-engine/models/")
-        cur.close()
-        conn.close()
-        return
+    # Load each distinct model once (MARKETS lists a model key several times)
+    loaded = {}
+    for mkey, n_class, *_ in MARKETS:
+        if mkey not in loaded:
+            m = load_model(mkey)
+            if m is not None:
+                loaded[mkey] = (m, n_class)
+    print(f'Modeller: {list(loaded.keys())}')
 
-    # ─── Backtest Loop ───
-    total_matches_analyzed = 0
-    bets_skipped = 0
-    bets_played = 0
-    bets_won = 0
-    total_profit = 0.0
-    
-    # Thresholds matching the NEW Skip Logic
-    MIN_CONF = 45.0 
+    # Batch prediction: one (n_rows, n_class) probability matrix per model;
+    # binary outputs are expanded to [1 - p, p] so column index == class index
+    raw_preds = {}
+    for mkey, (model, n_class) in loaded.items():
+        dmat = xgb.DMatrix(pd.DataFrame(X, columns=feature_cols))
+        raw = model.predict(dmat)
+        raw_preds[mkey] = raw.reshape(-1, n_class) if n_class > 2 else np.column_stack([1-raw, raw])
 
-    start_time = time.time()
+    # Backtest
+    all_results = []
+    print(f'\n{"Market":<12} {"Edge>=":>7} {"Bahis":>7} {"Hit%":>7} {"AvgOdds":>9} {"ROI/b":>8} {"Toplam":>10}')
+    print('-' * 65)
 
-    for i, row in enumerate(rows):
-        match_id = str(row['id'])
-        home_team = row['home_team']
-        away_team = row['away_team']
-        home_score = row['score_home']
-        away_score = row['score_away']
-        
-        print(f"\n[{i+1}/{len(rows)}] Analyzing: {home_team} vs {away_team} ...")
+    for mkey, n_class, pred_cls, label_col, odds_col, isim in MARKETS:
+        if mkey not in raw_preds or label_col not in df_test.columns or odds_col not in df_test.columns:
+            continue
 
-        try:
-            # 1. AI PREDICTION (Actual Model Call)
-            prediction = orchestrator.analyze_match(match_id)
-            
-            if not prediction:
-                print(f"   ⚠️ AI returned no prediction.")
+        mp  = raw_preds[mkey][:, pred_cls]
+        act = pd.to_numeric(df_test[label_col], errors='coerce').values
+        bko = pd.to_numeric(df_test[odds_col],  errors='coerce').values
+
+        # Keep rows with a label and with odds inside [MIN_ODDS, MAX_ODDS]
+        valid = (~np.isnan(act) & ~np.isnan(bko) &
+                 (bko >= MIN_ODDS) & (bko <= MAX_ODDS))
+        mp, act, bko = mp[valid], act[valid].astype(int), bko[valid]
+        implied = 1.0 / bko
+        edge = mp - implied
+
+        print(f'\n{isim}:')
+        for min_e in [0.02, 0.03, 0.05, 0.07, 0.10]:
+            mask = edge >= min_e
+            # Plain int: numpy integers are rejected by json.dump below
+            n = int(mask.sum())
+            if n < 20:
                 continue
+            won = (act[mask] == pred_cls).astype(int)
+            # One-unit stake: win pays (odds - 1), loss costs 1
+            roi = (bko[mask] - 1) * won - (1 - won)
+            hit = float(won.mean())
+            avg_roi = float(roi.mean())
+            total = float(roi.sum())
+            avg_odds = float(bko[mask].mean())
+            sign = '+' if total > 0 else ''
+            print(f'  edge>={min_e:+.0%}  n={n:>5,}  hit={hit:.1%}  odds={avg_odds:.2f}  roi/b={avg_roi:+.3f}  toplam={sign}{total:.1f}')
+            all_results.append({'market': isim, 'min_edge': min_e, 'n': n,
+                                 'hit': round(hit, 4), 'avg_odds': round(avg_odds, 3),
+                                 'avg_roi': round(avg_roi, 4), 'total_roi': round(total, 2)})
 
-            total_matches_analyzed += 1
-            
-            # 2. Extract Main Pick
-            main_pick = prediction.get("main_pick") or {}
-            pick_name = main_pick.get("pick")
-            confidence = main_pick.get("confidence", 0)
-            odds = main_pick.get("odds", 0)
+    # Best combinations: positive total ROI, ordered by ROI per bet
+    winners = sorted([r for r in all_results if r['total_roi'] > 0],
+                     key=lambda x: x['avg_roi'], reverse=True)
+    print(f'\n{"="*65}')
+    print('KAZANCLI KOMBINASYONLAR (total_roi > 0):')
+    print(f'{"="*65}')
+    for r in winners[:20]:
+        print(f'  {r["market"]:<12} edge>={r["min_edge"]:+.0%} | n={r["n"]:>5,} | '
+              f'hit={r["hit"]:.0%} | roi/b={r["avg_roi"]:+.3f} | toplam={r["total_roi"]:+.1f}')
 
-            if not pick_name or not confidence:
-                print(f"   ⚠️ No main pick found in prediction.")
-                continue
+    os.makedirs(REPORT_DIR, exist_ok=True)
+    with open(os.path.join(REPORT_DIR, 'backtest_real_odds.json'), 'w') as f:
+        json.dump(all_results, f, indent=2)
+    print(f'\nRapor kaydedildi.')

-            print(f"   🤖 Pick: {pick_name} | Conf: {confidence}% | Odds: {odds}")

-            # 3. Apply Skip Logic (New Backtest Logic)
-            if confidence < MIN_CONF:
-                print(f"   🚫 SKIPPED (Confidence {confidence}% < {MIN_CONF}%)")
-                bets_skipped += 1
-                continue
-
-            if odds > 0:
-                implied_prob = 1.0 / odds
-                my_prob = confidence / 100.0
-                if my_prob - implied_prob < -0.03: # Negative edge
-                    print(f"   🚫 SKIPPED (Negative Edge)")
-                    bets_skipped += 1
-                    continue
-
-            # 4. Bet Played
-            bets_played += 1
-            print(f"   🎲 BET PLAYED: {pick_name} @ {odds}")
-
-            # 5. Resolve Bet
-            won = False
-            # Basic resolution logic (Need to parse pick_name like "1", "X", "2", "2.5 Üst", etc.)
-            pick_clean = str(pick_name).upper()
-            
-            # MS
-            if pick_clean in ["1", "MS 1"] and home_score > away_score: won = True
-            elif pick_clean in ["X", "MS X"] and home_score == away_score: won = True
-            elif pick_clean in ["2", "MS 2"] and away_score > home_score: won = True
-            
-            # OU25
-            elif "ÜST" in pick_clean or "OVER" in pick_clean:
-                if (home_score + away_score) > 2.5: won = True
-            elif "ALT" in pick_clean or "UNDER" in pick_clean:
-                if (home_score + away_score) < 2.5: won = True
-            
-            # BTTS
-            elif "VAR" in pick_clean and home_score > 0 and away_score > 0: won = True
-            elif "YOK" in pick_clean and (home_score == 0 or away_score == 0): won = True
-
-            if won:
-                bets_won += 1
-                profit = odds - 1.0
-                print(f"   ✅ WON! (+{profit:.2f} units)")
-            else:
-                profit = -1.0
-                print(f"   ❌ LOST! (-1.00 units)")
-            
-            total_profit += profit
-
-        except Exception as e:
-            print(f"   💥 Error during analysis: {e}")
-
-    elapsed = time.time() - start_time
-
-    # ─── FINAL REPORT ───
-    print("\n" + "="*60)
-    print("📈 REAL AI BACKTEST RESULTS")
-    print(f"🕒 Time taken: {elapsed:.1f} seconds")
-    print("="*60)
-    print(f"📊 Matches Analyzed: {total_matches_analyzed}")
-    print(f"🚫 Bets SKIPPED: {bets_skipped}")
-    print(f"✅ Bets PLAYED: {bets_played}")
-    
-    if bets_played > 0:
-        win_rate = (bets_won / bets_played) * 100
-        roi = (total_profit / bets_played) * 100
-        yield_val = total_profit  # Net Units
-        
-        print(f"🏆 Bets Won: {bets_won}")
-        print(f"💀 Bets Lost: {bets_played - bets_won}")
-        print("-" * 40)
-        print(f" Win Rate: {win_rate:.2f}%")
-        print(f"💰 Total Profit (Units): {total_profit:.2f}")
-        print(f"📊 ROI: {roi:.2f}%")
-        
-        if roi > 0:
-            print("🟢 STRATEGY IS PROFITABLE!")
-        else:
-            print("🔴 STRATEGY IS LOSING")
-    else:
-        print("⚠️ No bets were played. All were skipped or failed.")
-
-    cur.close()
-    conn.close()
-
-if __name__ == "__main__":
-    run_backtest()
+if __name__ == '__main__':
+    main()
@@ -128,7 +128,40 @@ FEATURE_COLS = [
    "home_top_scorer_form", "away_top_scorer_form",
    "home_avg_player_exp", "away_avg_player_exp",
    "home_goals_diversity", "away_goals_diversity",
-    
+
+    # V27 H2H Expanded (4)
+    "h2h_home_goals_avg", "h2h_away_goals_avg",
+    "h2h_recent_trend", "h2h_venue_advantage",
+
+    # V27 Rolling Stats (12)
+    "home_rolling5_goals", "home_rolling5_conceded",
+    "home_rolling10_goals", "home_rolling10_conceded",
+    "home_rolling20_goals", "home_rolling20_conceded",
+    "away_rolling5_goals", "away_rolling5_conceded",
+    "away_rolling10_goals", "away_rolling10_conceded",
+    "home_rolling5_cs", "away_rolling5_cs",
+
+    # V27 Venue Stats (4)
+    "home_venue_goals", "home_venue_conceded",
+    "away_venue_goals", "away_venue_conceded",
+
+    # V27 Goal Trend (2)
+    "home_goal_trend", "away_goal_trend",
+
+    # V27 Calendar (5)
+    "home_days_rest", "away_days_rest",
+    "match_month", "is_season_start", "is_season_end",
+
+    # V27 Interaction (6)
+    "attack_vs_defense_home", "attack_vs_defense_away",
+    "xg_diff", "form_momentum_interaction",
+    "elo_form_consistency", "upset_x_elo_gap",
+
+    # V27 League Expanded (5)
+    "league_home_win_rate", "league_draw_rate",
+    "league_btts_rate", "league_ou25_rate",
+    "league_reliability_score",
+
    # Labels
    "score_home", "score_away", "total_goals",
    "ht_score_home", "ht_score_away", "ht_total_goals",
@@ -296,6 +329,10 @@ class BatchDataLoader:
            SELECT league_id,
                   AVG(score_home + score_away) as avg_goals,
                   AVG(CASE WHEN score_home = 0 AND score_away = 0 THEN 1.0 ELSE 0.0 END) as zero_rate,
+                   AVG(CASE WHEN score_home > score_away THEN 1.0 ELSE 0.0 END) as home_win_rate,
+                   AVG(CASE WHEN score_home = score_away THEN 1.0 ELSE 0.0 END) as draw_rate,
+                   AVG(CASE WHEN score_home > 0 AND score_away > 0 THEN 1.0 ELSE 0.0 END) as btts_rate,
+                   AVG(CASE WHEN score_home + score_away > 2.5 THEN 1.0 ELSE 0.0 END) as ou25_rate,
                   COUNT(*) as match_count
            FROM matches
            WHERE status = 'FT'
@@ -304,12 +341,17 @@ class BatchDataLoader:
              AND league_id IN ({ph})
            GROUP BY league_id
        """, self.top_league_ids)
-        
-        for league_id, avg_goals, zero_rate, cnt in self.cur.fetchall():
+
+        for row in self.cur.fetchall():
+            league_id, avg_goals, zero_rate, home_win_rate, draw_rate, btts_rate, ou25_rate, cnt = row
            self.league_stats_cache[league_id] = {
                "avg_goals": float(avg_goals) if avg_goals else 2.5,
                "zero_rate": float(zero_rate) if zero_rate else 0.07,
-                "match_count": cnt
+                "home_win_rate": float(home_win_rate) if home_win_rate else 0.45,
+                "draw_rate": float(draw_rate) if draw_rate else 0.25,
+                "btts_rate": float(btts_rate) if btts_rate else 0.50,
+                "ou25_rate": float(ou25_rate) if ou25_rate else 0.50,
+                "match_count": cnt,
            }

    def _load_team_history(self):
@@ -666,6 +708,9 @@ class FeatureExtractor:

        print(f"\n🔄 Extracting features for {total} matches...", flush=True)

+        _last_print = t_start
+        _PRINT_INTERVAL = 60  # her dakika bir ilerleme
+
        # Process chronologically — ELO grows as we go
        for i, m in enumerate(matches):
            (
@@ -683,17 +728,25 @@ class FeatureExtractor:
                league_name,
            ) = m

-            if i % 100 == 0 and i > 0:
-                elapsed = time.time() - t_start
-                rate = i / elapsed  # matches per second
+            now = time.time()
+            if now - _last_print >= _PRINT_INTERVAL and i > 0:
+                elapsed  = now - t_start
+                rate     = i / elapsed
                remaining = (total - i) / rate if rate > 0 else 0
-                pct = i / total * 100
+                pct      = i / total * 100
+                eta_h    = int(remaining // 3600)
+                eta_m    = int((remaining % 3600) // 60)
+                eta_s    = int(remaining % 60)
+                eta_str  = (f"{eta_h}s {eta_m}dk" if eta_h else f"{eta_m}dk {eta_s}s")
                print(
-                    f"  [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | "
-                    f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | "
-                    f"dq_rejected: {dq_rejected}",
+                    f"  ⏱  [{i:>6}/{total}] %{pct:>4.1f} | "
+                    f"{rate:.1f} maç/s | "
+                    f"bitti: {len(rows):,} | "
+                    f"atlanan: {skipped+dq_rejected} | "
+                    f"ETA: {eta_str}",
                    flush=True,
                )
+                _last_print = now

            row = self._extract_one(
                mid, hid, aid, sh, sa, hth, hta, mst, lid,
@@ -882,7 +935,10 @@ class FeatureExtractor:
        }
        
        # === LEAGUE FEATURES ===
-        league = self.loader.league_stats_cache.get(lid, {"avg_goals": 2.5, "zero_rate": 0.07})
+        league = self.loader.league_stats_cache.get(lid, {
+            "avg_goals": 2.5, "zero_rate": 0.07, "home_win_rate": 0.45,
+            "draw_rate": 0.25, "btts_rate": 0.50, "ou25_rate": 0.50, "match_count": 0,
+        })
        league_features = {
            "league_avg_goals": league["avg_goals"],
            "league_zero_goal_rate": league["zero_rate"],
@@ -953,6 +1009,11 @@ class FeatureExtractor:
        home_goals_form = home_sq.get('goals_form', 0)
        away_goals_form = away_sq.get('goals_form', 0)

+        # === V27 ROLLING / VENUE / CALENDAR FEATURES ===
+        v27 = self._compute_v27_features(hid, aid, mst, elo_features, form_features,
+                                         home_momentum_score, away_momentum_score,
+                                         upset_feats, h2h_features, league)
+
        # === ASSEMBLE ROW ===
        row = {
            "match_id": mid,
@@ -960,13 +1021,13 @@ class FeatureExtractor:
            "away_team_id": aid,
            "league_id": lid,
            "mst_utc": mst,
-            
+
            **elo_features,
            **form_features,
            **h2h_features,
            **stats_features,
            **odds_features,
-            
+
            "home_xga": form_features["home_conceded_avg"],
            "away_xga": form_features["away_conceded_avg"],
            **league_features,
@@ -1007,7 +1068,10 @@ class FeatureExtractor:
            "away_avg_player_exp": away_sq.get('avg_player_exp', 0.0),
            "home_goals_diversity": home_sq.get('goals_diversity', 0.0),
            "away_goals_diversity": away_sq.get('goals_diversity', 0.0),
-            
+
+            # V27 Features
+            **v27,
+
            # Labels
            "score_home": sh,
            "score_away": sa,
@@ -1033,6 +1097,103 @@ class FeatureExtractor:
        
        return row

+    def _compute_v27_features(self, hid, aid, mst, elo_features, form_features,
+                              home_momentum, away_momentum, upset_feats, h2h_features, league):
+        """Compute V27 rolling, venue, calendar, interaction features from pre-loaded data.
+
+        Args:
+            hid: Home team id; key into ``self.loader.team_matches``.
+            aid: Away team id; key into ``self.loader.team_matches``.
+            mst: Kickoff time in epoch milliseconds. Only history records whose
+                first element is strictly smaller than ``mst`` are used.
+            elo_features: Dict providing ``elo_diff`` and ``form_elo_diff``.
+            form_features: Dict providing ``home_goals_avg``, ``away_goals_avg``,
+                ``home_conceded_avg`` and ``away_conceded_avg``.
+            home_momentum: Home momentum score (numeric).
+            away_momentum: Away momentum score (numeric).
+            upset_feats: Dict; ``upset_potential`` is read with a 0.0 default.
+            h2h_features: Not read by this method; kept in the signature for callers.
+            league: Dict of league statistics; every key is read with a default.
+
+        Returns:
+            Dict mapping V27 feature names to floats.
+
+        History records are indexed positionally as used below: ``m[0]`` timestamp
+        (ms), ``m[1]`` flag compared against ``is_home``, ``m[2]`` averaged as goals
+        scored, ``m[3]`` averaged as goals conceded, ``m[4]`` compared against the
+        opponent id.
+        NOTE(review): the ``[-n:]`` slices assume each history list is in ascending
+        time order — confirm against the loader.
+        """
+        import datetime
+
+        # Filter once: every statistic below only looks at matches before kickoff.
+        home_prior = [m for m in self.loader.team_matches.get(hid, []) if m[0] < mst]
+        away_prior = [m for m in self.loader.team_matches.get(aid, []) if m[0] < mst]
+
+        def _rolling(prior, n):
+            """Average goals, conceded and clean-sheet rate over the last ``n`` prior matches."""
+            recent = prior[-n:]
+            if not recent:
+                # Fallback values when the team has no prior matches.
+                return 1.3, 1.1, 0.0
+            count = len(recent)
+            goals = sum(m[2] for m in recent) / count
+            conceded = sum(m[3] for m in recent) / count
+            cs = sum(1 for m in recent if m[3] == 0) / count
+            return round(goals, 3), round(conceded, 3), round(cs, 3)
+
+        def _venue(prior, is_home):
+            """Average goals and conceded over the last 10 prior matches at the given venue flag."""
+            recent = [m for m in prior if m[1] == is_home][-10:]
+            if not recent:
+                return 1.3, 1.1
+            count = len(recent)
+            goals = sum(m[2] for m in recent) / count
+            conceded = sum(m[3] for m in recent) / count
+            return round(goals, 3), round(conceded, 3)
+
+        def _days_rest(prior):
+            """Days since the last prior match, capped at 30; 7.0 when there is none."""
+            if not prior:
+                return 7.0
+            last = prior[-1][0]
+            return round(min((mst - last) / 86400000.0, 30.0), 1)
+
+        h5g, h5c, h5cs = _rolling(home_prior, 5)
+        h10g, h10c, _ = _rolling(home_prior, 10)
+        h20g, h20c, _ = _rolling(home_prior, 20)
+        a5g, a5c, a5cs = _rolling(away_prior, 5)
+        a10g, a10c, _ = _rolling(away_prior, 10)
+
+        hvg, hvc = _venue(home_prior, True)
+        avg, avc = _venue(away_prior, False)
+
+        home_rest = _days_rest(home_prior)
+        away_rest = _days_rest(away_prior)
+
+        # Timezone-aware replacement for the deprecated utcfromtimestamp();
+        # the resulting UTC month is the same.
+        match_dt = datetime.datetime.fromtimestamp(mst / 1000, tz=datetime.timezone.utc)
+        match_month = match_dt.month
+
+        elo_diff = elo_features["elo_diff"]
+        form_elo_diff = elo_features["form_elo_diff"]
+        mom_diff = home_momentum - away_momentum
+        home_conceded = form_features["home_conceded_avg"]
+        away_conceded = form_features["away_conceded_avg"]
+        home_goals = form_features["home_goals_avg"]
+        away_goals = form_features["away_goals_avg"]
+        upset_potential = upset_feats.get("upset_potential", 0.0)
+
+        # Head-to-head: the home team's prior matches against this opponent.
+        h2h_prior = [m for m in home_prior if m[4] == aid]
+        h2h_home_goals_avg = sum(m[2] for m in h2h_prior) / len(h2h_prior) if h2h_prior else 1.3
+        h2h_away_goals_avg = sum(m[3] for m in h2h_prior) / len(h2h_prior) if h2h_prior else 1.1
+        recent_h2h = h2h_prior[-3:]
+        # +1 per win, -1 per loss, 0 per draw over the last three meetings.
+        h2h_recent_trend = sum(
+            1 if m[2] > m[3] else -1 if m[2] < m[3] else 0 for m in recent_h2h
+        ) / max(len(recent_h2h), 1)
+        venue_h2h = [m for m in h2h_prior if m[1]]
+        h2h_venue_advantage = (
+            sum(1 if m[2] > m[3] else 0 for m in venue_h2h) / len(venue_h2h)
+            if venue_h2h else 0.5
+        )
+
+        league_count = league.get("match_count", 0)
+
+        return {
+            "h2h_home_goals_avg": round(h2h_home_goals_avg, 3),
+            "h2h_away_goals_avg": round(h2h_away_goals_avg, 3),
+            "h2h_recent_trend": round(h2h_recent_trend, 3),
+            "h2h_venue_advantage": round(h2h_venue_advantage, 3),
+            "home_rolling5_goals": h5g, "home_rolling5_conceded": h5c,
+            "home_rolling10_goals": h10g, "home_rolling10_conceded": h10c,
+            "home_rolling20_goals": h20g, "home_rolling20_conceded": h20c,
+            "away_rolling5_goals": a5g, "away_rolling5_conceded": a5c,
+            "away_rolling10_goals": a10g, "away_rolling10_conceded": a10c,
+            "home_rolling5_cs": h5cs, "away_rolling5_cs": a5cs,
+            "home_venue_goals": hvg, "home_venue_conceded": hvc,
+            "away_venue_goals": avg, "away_venue_conceded": avc,
+            "home_goal_trend": round(h5g - h10g, 3),
+            "away_goal_trend": round(a5g - a10g, 3),
+            "home_days_rest": home_rest, "away_days_rest": away_rest,
+            "match_month": float(match_month),
+            "is_season_start": 1.0 if match_month in (7, 8, 9) else 0.0,
+            "is_season_end": 1.0 if match_month in (5, 6) else 0.0,
+            "attack_vs_defense_home": round(home_goals - away_conceded, 3),
+            "attack_vs_defense_away": round(away_goals - home_conceded, 3),
+            # NOTE(review): derived from conceded averages, not goals scored —
+            # confirm this is the intended definition before changing it, since
+            # trained models depend on the current values.
+            "xg_diff": round(home_conceded - away_conceded, 3),
+            "form_momentum_interaction": round(mom_diff * form_elo_diff / 1000.0, 4),
+            "elo_form_consistency": round(1.0 - abs(elo_diff - form_elo_diff) / max(abs(elo_diff), 100.0), 4),
+            "upset_x_elo_gap": round(upset_potential * abs(elo_diff) / 500.0, 4),
+            "league_home_win_rate": league.get("home_win_rate", 0.45),
+            "league_draw_rate": league.get("draw_rate", 0.25),
+            "league_btts_rate": league.get("btts_rate", 0.50),
+            "league_ou25_rate": league.get("ou25_rate", 0.50),
+            # Scales linearly up to 500 league matches; 0.3 when the count is 0/missing.
+            "league_reliability_score": min(1.0, league_count / 500.0) if league_count else 0.3,
+        }
+
    def _validate_row_quality(
        self,
        row: dict,
@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": ["# Training Data Extraction — Google Colab\n", "SSH tunnel ile sunucuya bağlanır, DB'den 270K+ maç çeker, Drive'a kaydeder.\n"]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. Gerekli paketler\n",
+    "!pip install sshtunnel psycopg2-binary pandas numpy -q\n",
+    "print('Paketler hazır')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. Drive bağla\n",
+    "from google.colab import drive\n",
+    "drive.mount('/content/drive')\n",
+    "import os\n",
+    "DRIVE_DIR = '/content/drive/MyDrive/iddaai'\n",
+    "os.makedirs(DRIVE_DIR, exist_ok=True)\n",
+    "print('Drive hazır:', DRIVE_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. SSH private key upload\n",
+    "# Mac'te terminalde şunu çalıştır, çıktıyı kopyala:\n",
+    "#   cat ~/.ssh/id_ed25519\n",
+    "# Aşağıya yapıştır (BEGIN ve END satırları dahil)\n",
+    "\n",
+    "SSH_PRIVATE_KEY = \"\"\"-----BEGIN OPENSSH PRIVATE KEY-----\n",
+    "BURAYA_KEY_ICERIGINI_YAPISTIR\n",
+    "-----END OPENSSH PRIVATE KEY-----\"\"\"\n",
+    "\n",
+    "# Key dosyasına yaz\n",
+    "key_path = '/root/.ssh/id_ed25519'\n",
+    "os.makedirs('/root/.ssh', exist_ok=True)\n",
+    "with open(key_path, 'w') as f:\n",
+    "    f.write(SSH_PRIVATE_KEY.strip() + '\\n')\n",
+    "os.chmod(key_path, 0o600)\n",
+    "print('SSH key hazır')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. SSH Tunnel aç + DB bağlantısını test et\n",
+    "from sshtunnel import SSHTunnelForwarder\n",
+    "import psycopg2\n",
+    "\n",
+    "tunnel = SSHTunnelForwarder(\n",
+    "    ('95.70.252.214', 2222),\n",
+    "    ssh_username='haruncan',\n",
+    "    ssh_pkey=key_path,\n",
+    "    remote_bind_address=('localhost', 5432),\n",
+    "    local_bind_address=('localhost', 15432),\n",
+    ")\n",
+    "tunnel.start()\n",
+    "print(f'Tunnel açık: localhost:{tunnel.local_bind_port}')\n",
+    "\n",
+    "conn = psycopg2.connect(\n",
+    "    host='localhost',\n",
+    "    port=15432,\n",
+    "    dbname='iddaai_db',\n",
+    "    user='iddaai_user',\n",
+    "    password='IddaA1_S4crET!',\n",
+    ")\n",
+    "cur = conn.cursor()\n",
+    "cur.execute(\"SELECT COUNT(*) FROM matches WHERE status='FT' AND score_home IS NOT NULL\")\n",
+    "print(f'DB bağlantısı OK — FT maç sayısı: {cur.fetchone()[0]:,}')\n",
+    "conn.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 5. extract_training_data.py kodunu Drive'dan veya doğrudan çalıştır\n",
+    "# Önce repo'yu Drive'a kopyala (yoksa)\n",
+    "import subprocess\n",
+    "\n",
+    "REPO_DIR = f'{DRIVE_DIR}/ai-engine'\n",
+    "SCRIPT   = f'{REPO_DIR}/scripts/extract_training_data.py'\n",
+    "\n",
+    "if not os.path.exists(SCRIPT):\n",
+    "    print('Script bulunamadı — ai-engine klasörünü Drive a yükle:')\n",
+    "    print('  Yerel makinede: cp -r /Users/piton/Documents/GitHub/iddaai/iddaai-be/ai-engine ~/Google\\ Drive/MyDrive/iddaai/')\n",
+    "else:\n",
+    "    print('Script hazır:', SCRIPT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 6. Extraction'ı çalıştır\n",
+    "import sys, os\n",
+    "sys.path.insert(0, REPO_DIR)\n",
+    "\n",
+    "# DB URL'i tunnel üzerinden ayarla\n",
+    "os.environ['DATABASE_URL'] = 'postgresql://iddaai_user:IddaA1_S4crET!@localhost:15432/iddaai_db'\n",
+    "\n",
+    "# Output CSV'yi Drive'a kaydet\n",
+    "OUTPUT_CSV = f'{DRIVE_DIR}/training_data_full.csv'\n",
+    "\n",
+    "# Script'i import et ve main'i çalıştır\n",
+    "import importlib.util\n",
+    "spec = importlib.util.spec_from_file_location('extract', SCRIPT)\n",
+    "mod  = importlib.util.module_from_spec(spec)\n",
+    "spec.loader.exec_module(mod)\n",
+    "\n",
+    "# OUTPUT_CSV'yi override et\n",
+    "mod.OUTPUT_CSV = OUTPUT_CSV\n",
+    "mod.TOP_LEAGUES_PATH = f'{DRIVE_DIR}/qualified_leagues.json'\n",
+    "\n",
+    "mod.main()\n",
+    "print(f'\\nKaydedildi: {OUTPUT_CSV}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 7. Tunnel kapat\n",
+    "tunnel.stop()\n",
+    "print('Tunnel kapatıldı')\n",
+    "\n",
+    "# Dosya boyutunu kontrol et\n",
+    "size_mb = os.path.getsize(OUTPUT_CSV) / 1024 / 1024\n",
+    "import pandas as pd\n",
+    "df = pd.read_csv(OUTPUT_CSV, nrows=5)\n",
+    "print(f'CSV: {size_mb:.1f} MB')\n",
+    "print(f'Kolonlar: {len(df.columns)}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
+  "language_info": {"name": "python", "version": "3.10.0"}
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,806 @@
+"""
+V25 Backtest + Calibration Training Script
+==========================================
+Runs a full backtest on historical football matches, measures model accuracy
+by market / confidence band / league, and trains isotonic calibration models
+for MS, OU15, OU25, and BTTS markets.
+
+Usage:
+    venv/bin/python scripts/run_backtest_and_calibrate.py
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import json
+import pickle
+import time
+from collections import defaultdict
+from datetime import datetime
+from typing import Dict, List, Optional, Tuple, Any
+
+import numpy as np
+import pandas as pd
+import psycopg2
+from psycopg2.extras import RealDictCursor
+
+# ---------------------------------------------------------------------------
+# Path setup — works whether executed from ai-engine/ or project root
+# ---------------------------------------------------------------------------
+# Directory containing this script.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Parent of the script directory; used as the base for all paths below.
+AI_ENGINE_DIR = os.path.dirname(SCRIPT_DIR)
+# Put AI_ENGINE_DIR first on sys.path so the `data.*` / `models.*` imports resolve.
+sys.path.insert(0, AI_ENGINE_DIR)
+
+from data.db import get_clean_dsn
+from models.calibration import Calibrator
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# JSON file with the qualified league IDs, one directory above AI_ENGINE_DIR.
+QUALIFIED_LEAGUES_PATH = os.path.join(AI_ENGINE_DIR, "..", "qualified_leagues.json")
+CALIBRATION_DIR = os.path.join(AI_ENGINE_DIR, "models", "calibration")
+REPORTS_DIR = os.path.join(AI_ENGINE_DIR, "reports")
+MAX_MATCHES = 3000          # LIMIT of the match query (ordered by mst_utc DESC)
+PROGRESS_INTERVAL = 100     # print every N matches
+
+# NOTE: both directories are created as a side effect of importing this module.
+os.makedirs(CALIBRATION_DIR, exist_ok=True)
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+# Two-level mapping: odds category name (Turkish, matched against
+# odd_categories.name) -> selection name (matched against odd_selections.name)
+# -> internal feature key used in the model feature vector.
+ODDS_CATEGORY_MAP = {
+    "Maç Sonucu": {
+        "1": "odds_ms_h",
+        "X": "odds_ms_d",
+        "2": "odds_ms_a",
+    },
+    "1,5 Alt/Üst": {
+        "Üst": "odds_ou15_o",
+        "Alt": "odds_ou15_u",
+    },
+    "2,5 Alt/Üst": {
+        "Üst": "odds_ou25_o",
+        "Alt": "odds_ou25_u",
+    },
+    "3,5 Alt/Üst": {
+        "Üst": "odds_ou35_o",
+        "Alt": "odds_ou35_u",
+    },
+    "0,5 Alt/Üst": {
+        "Üst": "odds_ou05_o",
+        "Alt": "odds_ou05_u",
+    },
+    "Karşılıklı Gol": {
+        "Var": "odds_btts_y",
+        "Yok": "odds_btts_n",
+    },
+    "1. Yarı Sonucu": {
+        "1": "odds_ht_ms_h",
+        "X": "odds_ht_ms_d",
+        "2": "odds_ht_ms_a",
+    },
+    "1. Yarı 0,5 Alt/Üst": {
+        "Üst": "odds_ht_ou05_o",
+        "Alt": "odds_ht_ou05_u",
+    },
+    "1. Yarı 1,5 Alt/Üst": {
+        "Üst": "odds_ht_ou15_o",
+        "Alt": "odds_ht_ou15_u",
+    },
+}
+
+# Top 5 leagues reported individually; matched by exact league *name*
+# (the `league_name` column), every other league is grouped as "Other".
+TOP5_LEAGUE_NAMES = {
+    "Premier League",
+    "La Liga",
+    "Bundesliga",
+    "Serie A",
+    "Ligue 1",
+}
+
+# ============================================================================
+# STEP 1 — Load qualified league IDs
+# ============================================================================
+
+def load_qualified_leagues() -> List[str]:
+    """Read QUALIFIED_LEAGUES_PATH (JSON) and return its entries converted to strings."""
+    with open(os.path.abspath(QUALIFIED_LEAGUES_PATH), "r") as handle:
+        raw_ids = json.load(handle)
+    print(f"[Step 1] Loaded {len(raw_ids)} qualified league IDs.")
+    return list(map(str, raw_ids))
+
+
+# ============================================================================
+# STEP 1b — Fetch matches + pre-computed features in batch
+# ============================================================================
+
+def fetch_matches(conn, league_ids: List[str]) -> pd.DataFrame:
+    """
+    Single batch query: matches + football_ai_features + league name.
+    Only returns matches that also have odds data (inner join on odd_categories).
+
+    Args:
+        conn: Open psycopg2 connection.
+        league_ids: League IDs used for the ``m.league_id = ANY(%s)`` filter.
+
+    Returns:
+        A DataFrame with one row per match, at most MAX_MATCHES rows, ordered
+        by ``mst_utc`` descending. Empty when nothing matches.
+    """
+    print("[Step 1b] Fetching matches with pre-computed features and odds ...")
+
+    query = """
+        SELECT
+            m.id                AS match_id,
+            m.league_id,
+            l.name              AS league_name,
+            m.score_home,
+            m.score_away,
+            m.mst_utc,
+            -- From football_ai_features
+            f.home_elo          AS home_overall_elo,
+            f.away_elo          AS away_overall_elo,
+            f.elo_diff,
+            f.home_home_elo,
+            f.away_away_elo,
+            f.home_form_elo,
+            f.away_form_elo,
+            f.home_goals_avg_5  AS home_goals_avg,
+            f.away_goals_avg_5  AS away_goals_avg,
+            f.home_conceded_avg_5 AS home_conceded_avg,
+            f.away_conceded_avg_5 AS away_conceded_avg,
+            f.home_clean_sheet_rate,
+            f.away_clean_sheet_rate,
+            f.home_scoring_rate,
+            f.away_scoring_rate,
+            f.home_win_streak   AS home_winning_streak,
+            f.away_win_streak   AS away_winning_streak,
+            f.home_avg_possession,
+            f.away_avg_possession,
+            f.home_avg_shots_on_target,
+            f.away_avg_shots_on_target,
+            f.home_shot_conversion,
+            f.away_shot_conversion,
+            f.home_avg_corners,
+            f.away_avg_corners,
+            f.h2h_total         AS h2h_total_matches,
+            f.h2h_home_win_rate,
+            f.h2h_avg_goals,
+            f.h2h_over25_rate,
+            f.h2h_btts_rate,
+            f.league_avg_goals,
+            f.league_home_win_pct AS league_home_win_rate,
+            f.league_over25_pct   AS league_ou25_rate,
+            f.referee_avg_cards   AS referee_cards_total,
+            f.referee_home_bias,
+            f.referee_avg_goals,
+            f.missing_players_impact AS home_missing_impact,
+            f.implied_home,
+            f.implied_draw,
+            f.implied_away
+        FROM matches m
+        JOIN football_ai_features f ON f.match_id = m.id
+        -- Only matches that have odds data
+        JOIN (SELECT DISTINCT match_id FROM odd_categories WHERE sport = 'football') oc
+            ON oc.match_id = m.id
+        LEFT JOIN leagues l ON l.id = m.league_id
+        WHERE m.status = 'FT'
+          AND m.score_home IS NOT NULL
+          AND m.score_away IS NOT NULL
+          AND m.league_id = ANY(%s)
+        ORDER BY m.mst_utc DESC
+        LIMIT %s
+        """
+
+    # Context manager closes the cursor even when execute()/fetchall() raises.
+    with conn.cursor(cursor_factory=RealDictCursor) as cur:
+        cur.execute(query, (league_ids, MAX_MATCHES))
+        rows = cur.fetchall()
+
+    df = pd.DataFrame([dict(r) for r in rows])
+    print(f"[Step 1b] Fetched {len(df)} matches with features + odds coverage.")
+    return df
+
+
+# ============================================================================
+# STEP 1c — Fetch all odds for the matched match IDs in one query
+# ============================================================================
+
+def fetch_odds_bulk(conn, match_ids: List[str]) -> Dict[str, Dict[str, float]]:
+    """
+    Returns {match_id: {feature_key: odd_value, ...}} for all known categories.
+
+    Args:
+        conn: Open psycopg2 connection.
+        match_ids: Match IDs whose odds should be loaded.
+
+    Only categories/selections listed in ODDS_CATEGORY_MAP are kept, and only
+    values that parse as a float strictly greater than 1.0. Matches without any
+    usable odd are absent from the result.
+    """
+    print(f"[Step 1c] Fetching odds for {len(match_ids)} matches ...")
+
+    # Raw string: the regex contains a backslash, which is an invalid escape
+    # sequence (SyntaxWarning on Python 3.12+) in a regular string literal.
+    query = r"""
+        SELECT oc.match_id, oc.name AS cat_name, os.name AS sel_name, os.odd_value
+        FROM odd_categories oc
+        JOIN odd_selections os ON os.odd_category_db_id = oc.db_id
+        WHERE oc.match_id = ANY(%s)
+          AND oc.name = ANY(%s)
+          AND oc.sport = 'football'
+          AND os.odd_value IS NOT NULL
+          AND os.odd_value ~ '^[0-9]+(\.[0-9]+)?$'
+        """
+
+    # Context manager closes the cursor even when execute()/fetchall() raises.
+    with conn.cursor(cursor_factory=RealDictCursor) as cur:
+        cur.execute(query, (match_ids, list(ODDS_CATEGORY_MAP)))
+        rows = cur.fetchall()
+
+    # Build nested dict: match_id -> {feature_key -> value}
+    odds_map: Dict[str, Dict[str, float]] = {}
+    for r in rows:
+        feat_key = ODDS_CATEGORY_MAP.get(r["cat_name"], {}).get(r["sel_name"])
+        if feat_key is None:
+            continue
+        try:
+            val = float(r["odd_value"])
+        except (TypeError, ValueError):
+            continue
+        if val > 1.0:
+            # Keep the first row encountered per (match, feature).
+            # NOTE(review): the query has no ORDER BY, so which duplicate comes
+            # first is decided by the database — confirm whether that matters.
+            odds_map.setdefault(r["match_id"], {}).setdefault(feat_key, val)
+
+    print(f"[Step 1c] Odds loaded for {len(odds_map)} matches.")
+    return odds_map
+
+
+# ============================================================================
+# STEP 2 — Build 114-feature vector per match
+# ============================================================================
+
+def load_feature_cols() -> List[str]:
+    """Return the parsed contents of models/v25/feature_cols.json under AI_ENGINE_DIR."""
+    cols_file = os.path.join(AI_ENGINE_DIR, "models", "v25", "feature_cols.json")
+    with open(cols_file, "r") as handle:
+        columns = json.load(handle)
+    return columns
+
+
+def build_feature_vector(
+    match_row: pd.Series,
+    odds: Dict[str, float],
+    feature_cols: List[str],
+) -> Dict[str, float]:
+    """
+    Construct the full feature dict for one match.
+    Falls back to 0.0 for any missing feature.
+
+    Args:
+        match_row: One row of the match query result (feature columns,
+            plus ``mst_utc`` as epoch milliseconds).
+        odds: ``{feature_key: odd_value}`` for this match; may be empty.
+        feature_cols: Names of the model features; every name gets an entry.
+
+    Returns:
+        Dict with one float per name in ``feature_cols``. ``form_elo_diff`` and
+        the ``implied_*`` keys are also written when their inputs are available,
+        whether or not they are listed in ``feature_cols``.
+
+    The calendar and interaction features use the same definitions as
+    ``FeatureExtractor._compute_v27_features`` so that backtest inputs match
+    what the extractor produces.
+    """
+    # Local import: the file-level import only brings the `datetime` class in.
+    from datetime import timezone
+
+    feat: Dict[str, float] = {col: 0.0 for col in feature_cols}
+
+    # ---- Direct columns from match row (same name in row and feature set) ----
+    direct_cols = (
+        "home_overall_elo", "away_overall_elo", "elo_diff",
+        "home_home_elo", "away_away_elo",
+        "home_form_elo", "away_form_elo",
+        "home_goals_avg", "away_goals_avg",
+        "home_conceded_avg", "away_conceded_avg",
+        "home_clean_sheet_rate", "away_clean_sheet_rate",
+        "home_scoring_rate", "away_scoring_rate",
+        "home_winning_streak", "away_winning_streak",
+        "home_avg_possession", "away_avg_possession",
+        "home_avg_shots_on_target", "away_avg_shots_on_target",
+        "home_shot_conversion", "away_shot_conversion",
+        "home_avg_corners", "away_avg_corners",
+        "h2h_total_matches", "h2h_home_win_rate", "h2h_avg_goals",
+        "h2h_over25_rate", "h2h_btts_rate",
+        "league_avg_goals", "league_home_win_rate", "league_ou25_rate",
+        "referee_cards_total", "referee_home_bias", "referee_avg_goals",
+        "home_missing_impact",
+        "implied_home", "implied_draw", "implied_away",
+    )
+    for col in direct_cols:
+        if col in feat and col in match_row.index:
+            val = match_row.get(col)
+            # Skip NULL / NaN so the 0.0 default stays in place.
+            if val is not None and not (isinstance(val, float) and np.isnan(val)):
+                feat[col] = float(val)
+
+    # ---- Derived elo features ----
+    if feat.get("home_form_elo", 0) and feat.get("away_form_elo", 0):
+        feat["form_elo_diff"] = feat["home_form_elo"] - feat["away_form_elo"]
+
+    # ---- Odds features from relational tables ----
+    odds_features = (
+        "odds_ms_h", "odds_ms_d", "odds_ms_a",
+        "odds_ht_ms_h", "odds_ht_ms_d", "odds_ht_ms_a",
+        "odds_ou05_o", "odds_ou05_u",
+        "odds_ou15_o", "odds_ou15_u",
+        "odds_ou25_o", "odds_ou25_u",
+        "odds_ou35_o", "odds_ou35_u",
+        "odds_ht_ou05_o", "odds_ht_ou05_u",
+        "odds_ht_ou15_o", "odds_ht_ou15_u",
+        "odds_btts_y", "odds_btts_n",
+    )
+    for ok in odds_features:
+        if ok in odds:
+            feat[ok] = odds[ok]
+            presence_key = f"{ok}_present"
+            if presence_key in feat:
+                feat[presence_key] = 1.0
+
+    # Whenever all three match-result odds are usable, the normalised implied
+    # probabilities derived from them replace any values taken from the row.
+    if feat.get("odds_ms_h", 0) > 1 and feat.get("odds_ms_d", 0) > 1 and feat.get("odds_ms_a", 0) > 1:
+        raw_h = 1.0 / feat["odds_ms_h"]
+        raw_d = 1.0 / feat["odds_ms_d"]
+        raw_a = 1.0 / feat["odds_ms_a"]
+        total = raw_h + raw_d + raw_a
+        if total > 0:
+            feat["implied_home"] = raw_h / total
+            feat["implied_draw"] = raw_d / total
+            feat["implied_away"] = raw_a / total
+
+    # ---- Derived match metadata ----
+    mst = match_row.get("mst_utc")
+    if mst is not None:
+        try:
+            # mst_utc is stored as epoch milliseconds.
+            dt = datetime.fromtimestamp(int(mst) / 1000, tz=timezone.utc)
+        except (TypeError, ValueError, OverflowError, OSError):
+            # Unparseable / out-of-range timestamp: keep the 0.0 defaults.
+            dt = None
+        if dt is not None:
+            if "match_month" in feat:
+                feat["match_month"] = float(dt.month)
+            # Season markers, same month sets as the feature extractor:
+            # Jul-Sep = start, May-Jun = end.
+            if "is_season_start" in feat:
+                feat["is_season_start"] = 1.0 if dt.month in (7, 8, 9) else 0.0
+            if "is_season_end" in feat:
+                feat["is_season_end"] = 1.0 if dt.month in (5, 6) else 0.0
+
+    # ---- Interaction features (same formulas and rounding as the extractor) ----
+    elo_diff = feat.get("elo_diff", 0)
+    form_elo_diff = feat.get("form_elo_diff", 0)
+    if "attack_vs_defense_home" in feat:
+        feat["attack_vs_defense_home"] = round(
+            feat.get("home_goals_avg", 0) - feat.get("away_conceded_avg", 0), 3
+        )
+    if "attack_vs_defense_away" in feat:
+        feat["attack_vs_defense_away"] = round(
+            feat.get("away_goals_avg", 0) - feat.get("home_conceded_avg", 0), 3
+        )
+    if "form_momentum_interaction" in feat:
+        mom_diff = feat.get("home_momentum_score", 0) - feat.get("away_momentum_score", 0)
+        feat["form_momentum_interaction"] = round(mom_diff * form_elo_diff / 1000.0, 4)
+    if "elo_form_consistency" in feat:
+        feat["elo_form_consistency"] = round(
+            1.0 - abs(elo_diff - form_elo_diff) / max(abs(elo_diff), 100.0), 4
+        )
+
+    return feat
+
+
+# ============================================================================
+# STEP 3 — Run V25 predictions
+# ============================================================================
+
+def load_predictor():
+    """Obtain the V25 predictor via get_v25_predictor(), printing before and after."""
+    # The import happens inside the function, before the first progress message.
+    from models.v25_ensemble import get_v25_predictor as _get_predictor
+    print("[Step 3] Loading V25 predictor ...")
+    predictor = _get_predictor()
+    print("[Step 3] V25 predictor ready.")
+    return predictor
+
+
+# ============================================================================
+# STEP 4 — Compute actual outcomes from scores
+# ============================================================================
+
+def compute_actuals(score_home: int, score_away: int) -> Dict[str, Any]:
+    """Derive the realised MS, OU1.5, OU2.5 and BTTS labels from a final score."""
+    goals = score_home + score_away
+
+    if score_home > score_away:
+        result = "1"
+    elif score_home == score_away:
+        result = "X"
+    else:
+        result = "2"
+
+    both_scored = score_home > 0 and score_away > 0
+
+    outcome: Dict[str, Any] = {}
+    outcome["ms_actual"] = result
+    outcome["ou15_actual"] = "Over" if goals >= 2 else "Under"
+    outcome["ou25_actual"] = "Over" if goals >= 3 else "Under"
+    outcome["btts_actual"] = "Yes" if both_scored else "No"
+    return outcome
+
+
+# ============================================================================
+# STEP 5 — Accuracy helpers
+# ============================================================================
+
+def confidence_band(prob: float) -> str:
+    """Map a probability to its reporting band label; upper bounds are exclusive."""
+    if prob < 0.50:
+        return "<50%"
+    if prob < 0.65:
+        return "50-65%"
+    if prob < 0.75:
+        return "65-75%"
+    # Everything not below 0.75 lands in the top band.
+    return "75%+"
+
+
+def pick_from_ms(home_prob: float, draw_prob: float, away_prob: float) -> Tuple[str, float]:
+    """Return the (label, probability) of the most likely match result.
+
+    Ties resolve in the order "1", "X", "2" because max() keeps the first maximum.
+    """
+    candidates = [("1", home_prob), ("X", draw_prob), ("2", away_prob)]
+    label, prob = max(candidates, key=lambda pair: pair[1])
+    return label, prob
+
+
+def pick_from_binary(yes_prob: float, no_prob: float, yes_label: str, no_label: str) -> Tuple[str, float]:
+    """Return the (label, probability) of the likelier side; a tie goes to the yes side."""
+    prefer_yes = yes_prob >= no_prob
+    return (yes_label, yes_prob) if prefer_yes else (no_label, no_prob)
+
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+def main():
+    t_start = time.time()
+    print("=" * 70)
+    print("  V25 Backtest + Calibration Training")
+    print(f"  Run at: {datetime.utcnow().isoformat()} UTC")
+    print("=" * 70)
+
+    # ------------------------------------------------------------------
+    # Step 1 — Load qualified leagues
+    # ------------------------------------------------------------------
+    league_ids = load_qualified_leagues()
+
+    # ------------------------------------------------------------------
+    # Step 1b — Fetch matches with features
+    # ------------------------------------------------------------------
+    conn = psycopg2.connect(get_clean_dsn())
+    try:
+        matches_df = fetch_matches(conn, league_ids)
+
+        if matches_df.empty:
+            print("[ERROR] No matches found. Check DB connection and league IDs.")
+            return
+
+        match_ids = matches_df["match_id"].tolist()
+
+        # ------------------------------------------------------------------
+        # Step 1c — Fetch odds in bulk
+        # ------------------------------------------------------------------
+        odds_map = fetch_odds_bulk(conn, match_ids)
+    finally:
+        conn.close()
+
+    # ------------------------------------------------------------------
+    # Step 2 — Build feature vectors
+    # ------------------------------------------------------------------
+    print(f"\n[Step 2] Building feature vectors for {len(matches_df)} matches ...")
+    feature_cols = load_feature_cols()
+
+    # ------------------------------------------------------------------
+    # Step 3 — Load V25 predictor
+    # ------------------------------------------------------------------
+    predictor = load_predictor()
+
+    # ------------------------------------------------------------------
+    # Main loop — predict each match, collect results
+    # ------------------------------------------------------------------
+    print(f"\n[Loop] Running predictions ...")
+
+    # Storage for calibration training
+    calib_data: Dict[str, List[Tuple[float, int]]] = {
+        "ms_home": [],   # (prob, 1 if home win)
+        "ms_draw": [],
+        "ms_away": [],
+        "ou15": [],
+        "ou25": [],
+        "btts": [],
+    }
+
+    # Storage for accuracy reporting
+    records = []
+
+    skipped = 0
+    processed = 0
+
+    for idx, row in matches_df.iterrows():
+        match_id = row["match_id"]
+        score_home = row.get("score_home")
+        score_away = row.get("score_away")
+
+        # Validate scores
+        try:
+            score_home = int(score_home)
+            score_away = int(score_away)
+        except (TypeError, ValueError):
+            skipped += 1
+            continue
+
+        # Build features
+        match_odds = odds_map.get(match_id, {})
+        feat = build_feature_vector(row, match_odds, feature_cols)
+
+        # Run predictions
+        try:
+            home_prob, draw_prob, away_prob = predictor.predict_ms(feat)
+            over25_prob, under25_prob = predictor.predict_ou25(feat)
+            btts_yes_prob, btts_no_prob = predictor.predict_btts(feat)
+
+            # ou15 is loaded via predict_market (returns np.ndarray for binary)
+            ou15_arr = predictor.predict_market("ou15", feat)
+            if ou15_arr is not None and len(ou15_arr) > 0:
+                over15_prob = float(ou15_arr[0])
+                under15_prob = 1.0 - over15_prob
+            else:
+                over15_prob = 0.5
+                under15_prob = 0.5
+
+        except Exception as e:
+            skipped += 1
+            continue
+
+        # Compute actuals
+        actuals = compute_actuals(score_home, score_away)
+
+        # MS picks
+        ms_pick, ms_conf = pick_from_ms(home_prob, draw_prob, away_prob)
+        ms_correct = int(ms_pick == actuals["ms_actual"])
+
+        # OU15
+        ou15_pick, ou15_conf = pick_from_binary(over15_prob, under15_prob, "Over", "Under")
+        ou15_correct = int(ou15_pick == actuals["ou15_actual"])
+
+        # OU25
+        ou25_pick, ou25_conf = pick_from_binary(over25_prob, under25_prob, "Over", "Under")
+        ou25_correct = int(ou25_pick == actuals["ou25_actual"])
+
+        # BTTS
+        btts_pick, btts_conf = pick_from_binary(btts_yes_prob, btts_no_prob, "Yes", "No")
+        btts_correct = int(btts_pick == actuals["btts_actual"])
+
+        # Collect calibration data
+        calib_data["ms_home"].append((home_prob, int(actuals["ms_actual"] == "1")))
+        calib_data["ms_draw"].append((draw_prob, int(actuals["ms_actual"] == "X")))
+        calib_data["ms_away"].append((away_prob, int(actuals["ms_actual"] == "2")))
+        calib_data["ou15"].append((over15_prob, int(actuals["ou15_actual"] == "Over")))
+        calib_data["ou25"].append((over25_prob, int(actuals["ou25_actual"] == "Over")))
+        calib_data["btts"].append((btts_yes_prob, int(actuals["btts_actual"] == "Yes")))
+
+        # Determine league group
+        league_name = str(row.get("league_name", "Other") or "Other")
+        league_group = league_name if league_name in TOP5_LEAGUE_NAMES else "Other"
+
+        records.append({
+            "match_id": match_id,
+            "league_name": league_name,
+            "league_group": league_group,
+            "score_home": score_home,
+            "score_away": score_away,
+            # MS
+            "ms_pick": ms_pick,
+            "ms_actual": actuals["ms_actual"],
+            "ms_conf": ms_conf,
+            "ms_conf_band": confidence_band(ms_conf),
+            "ms_correct": ms_correct,
+            "ms_home_prob": home_prob,
+            "ms_draw_prob": draw_prob,
+            "ms_away_prob": away_prob,
+            # OU15
+            "ou15_pick": ou15_pick,
+            "ou15_actual": actuals["ou15_actual"],
+            "ou15_conf": ou15_conf,
+            "ou15_conf_band": confidence_band(ou15_conf),
+            "ou15_correct": ou15_correct,
+            "ou15_over_prob": over15_prob,
+            # OU25
+            "ou25_pick": ou25_pick,
+            "ou25_actual": actuals["ou25_actual"],
+            "ou25_conf": ou25_conf,
+            "ou25_conf_band": confidence_band(ou25_conf),
+            "ou25_correct": ou25_correct,
+            "ou25_over_prob": over25_prob,
+            # BTTS
+            "btts_pick": btts_pick,
+            "btts_actual": actuals["btts_actual"],
+            "btts_conf": btts_conf,
+            "btts_conf_band": confidence_band(btts_conf),
+            "btts_correct": btts_correct,
+            "btts_yes_prob": btts_yes_prob,
+        })
+
+        processed += 1
+        if processed % PROGRESS_INTERVAL == 0:
+            elapsed = time.time() - t_start
+            print(f"  [Progress] {processed}/{len(matches_df)} matches | "
+                  f"skipped={skipped} | elapsed={elapsed:.1f}s")
+
+    print(f"\n[Loop] Done. Processed={processed}, Skipped={skipped}")
+
+    if not records:
+        print("[ERROR] No records to analyze. Exiting.")
+        return
+
+    results_df = pd.DataFrame(records)
+
+    # ------------------------------------------------------------------
+    # Step 5 — Accuracy report
+    # ------------------------------------------------------------------
+    print("\n" + "=" * 70)
+    print("  ACCURACY REPORT")
+    print("=" * 70)
+
+    markets = [
+        ("MS",   "ms_correct",   "ms_conf",   "ms_conf_band",   "ms_pick"),
+        ("OU15", "ou15_correct", "ou15_conf", "ou15_conf_band", "ou15_pick"),
+        ("OU25", "ou25_correct", "ou25_conf", "ou25_conf_band", "ou25_pick"),
+        ("BTTS", "btts_correct", "btts_conf", "btts_conf_band", "btts_pick"),
+    ]
+
+    summary: Dict[str, Any] = {
+        "generated_at": datetime.utcnow().isoformat(),
+        "matches_processed": processed,
+        "matches_skipped": skipped,
+        "markets": {},
+    }
+
+    for market_label, correct_col, conf_col, band_col, pick_col in markets:
+        print(f"\n--- {market_label} ---")
+        sub = results_df[[correct_col, conf_col, band_col, pick_col, "league_group"]].copy()
+        total = len(sub)
+        overall_acc = sub[correct_col].mean() * 100
+        print(f"  Overall accuracy: {overall_acc:.1f}% ({sub[correct_col].sum()}/{total})")
+
+        market_summary = {
+            "overall_accuracy": round(overall_acc, 2),
+            "total_matches": total,
+            "by_confidence_band": {},
+            "by_league": {},
+            "by_pick_direction": {},
+        }
+
+        # By confidence band
+        print(f"  By confidence band:")
+        bands = ["<50%", "50-65%", "65-75%", "75%+"]
+        for band in bands:
+            mask = sub[band_col] == band
+            n = mask.sum()
+            if n > 0:
+                acc = sub.loc[mask, correct_col].mean() * 100
+                mean_conf = sub.loc[mask, conf_col].mean() * 100
+                print(f"    {band:8s}: {acc:5.1f}% acc | {n:4d} matches | "
+                      f"mean_conf={mean_conf:.1f}%")
+                market_summary["by_confidence_band"][band] = {
+                    "accuracy": round(acc, 2),
+                    "count": int(n),
+                    "mean_confidence": round(mean_conf, 2),
+                }
+
+        # By league group
+        print(f"  By league:")
+        league_groups = list(results_df["league_group"].unique())
+        # Sort: named leagues first, then Other
+        named = sorted([g for g in league_groups if g != "Other"])
+        ordered = named + (["Other"] if "Other" in league_groups else [])
+        for lg in ordered:
+            mask = sub["league_group"] == lg
+            n = mask.sum()
+            if n > 0:
+                acc = sub.loc[mask, correct_col].mean() * 100
+                print(f"    {lg[:20]:20s}: {acc:5.1f}% ({n} matches)")
+                market_summary["by_league"][lg] = {
+                    "accuracy": round(acc, 2),
+                    "count": int(n),
+                }
+
+        # By pick direction
+        print(f"  By pick direction:")
+        for pick_val in sorted(sub[pick_col].unique()):
+            mask = sub[pick_col] == pick_val
+            n = mask.sum()
+            if n > 0:
+                acc = sub.loc[mask, correct_col].mean() * 100
+                mean_conf = sub.loc[mask, conf_col].mean() * 100
+                print(f"    {pick_val:8s}: {acc:5.1f}% acc | {n:4d} matches | "
+                      f"mean_conf={mean_conf:.1f}%")
+                market_summary["by_pick_direction"][pick_val] = {
+                    "accuracy": round(acc, 2),
+                    "count": int(n),
+                    "mean_confidence": round(mean_conf, 2),
+                }
+
+        summary["markets"][market_label] = market_summary
+
+    # ------------------------------------------------------------------
+    # Step 6 — Train calibration models
+    # ------------------------------------------------------------------
+    print("\n" + "=" * 70)
+    print("  CALIBRATION TRAINING")
+    print("=" * 70)
+
+    calibrator = Calibrator()
+
+    # Market config: market_key -> (label for prob, label for actual binary)
+    calib_market_map = {
+        "ms_home": "ms_home",
+        "ms_draw": "ms_draw",
+        "ms_away": "ms_away",
+        "ou15": "ou15",
+        "ou25": "ou25",
+        "btts": "btts",
+    }
+
+    calibration_results: Dict[str, Dict] = {}
+
+    for market_key in calib_market_map:
+        pairs = calib_data[market_key]
+        if len(pairs) < 100:
+            print(f"[Calib] {market_key}: only {len(pairs)} samples — skipping.")
+            continue
+
+        probs = np.array([p for p, _ in pairs])
+        actuals_bin = np.array([a for _, a in pairs])
+
+        # Build a tiny DataFrame to use Calibrator.train_calibration
+        calib_df = pd.DataFrame({
+            "prob": probs,
+            "actual": actuals_bin,
+        })
+
+        metrics = calibrator.train_calibration(
+            df=calib_df,
+            market=market_key,
+            prob_col="prob",
+            actual_col="actual",
+            min_samples=100,
+            save=True,
+        )
+        calibration_results[market_key] = metrics.to_dict()
+        print(f"  [Calib] {market_key}: Brier={metrics.brier_score:.4f} | "
+              f"ECE={metrics.calibration_error:.4f} | n={metrics.sample_count}")
+
+    # ------------------------------------------------------------------
+    # Step 7 — Save results
+    # ------------------------------------------------------------------
+    output_path = os.path.join(REPORTS_DIR, "backtest_results.json")
+    full_report = {
+        **summary,
+        "calibration": calibration_results,
+        "runtime_seconds": round(time.time() - t_start, 1),
+    }
+
+    with open(output_path, "w") as f:
+        json.dump(full_report, f, indent=2)
+    print(f"\n[Step 7] Report saved to {output_path}")
+
+    # ------------------------------------------------------------------
+    # Final summary table
+    # ------------------------------------------------------------------
+    print("\n" + "=" * 70)
+    print("  FINAL SUMMARY TABLE")
+    print("=" * 70)
+    print(f"{'Market':<8} {'Overall Acc':>12} {'Matches':>8} "
+          f"{'Best Band (acc)':>18}")
+    print("-" * 70)
+    for market_label, _, _, _, _ in markets:
+        ms = summary["markets"].get(market_label, {})
+        overall = ms.get("overall_accuracy", 0)
+        total_m = ms.get("total_matches", 0)
+        bands_d = ms.get("by_confidence_band", {})
+        # Find best accuracy band with >= 50 matches
+        best_band = "-"
+        best_acc = 0.0
+        for band, bdata in bands_d.items():
+            if bdata["count"] >= 50 and bdata["accuracy"] > best_acc:
+                best_acc = bdata["accuracy"]
+                best_band = f"{band} ({best_acc:.1f}%)"
+        print(f"{market_label:<8} {overall:>11.1f}% {total_m:>8d} {best_band:>18s}")
+
+    elapsed_total = time.time() - t_start
+    print(f"\nTotal runtime: {elapsed_total:.1f}s")
+    print("=" * 70)
+
+
+# Run the script's main() only when this file is executed directly, not on import.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,459 @@
+"""
+League-Specific Model Trainer
+==============================
+Trains dedicated XGBoost models + isotonic calibration for each qualified league.
+
+Tiers:
+  - >=500 FT matches  → full XGBoost (12 markets) + calibration
+  - 100-499 matches   → isotonic calibration only (over general V25 predictions)
+  - <100 matches      → skipped
+
+Usage:
+  python scripts/train_league_models.py
+  python scripts/train_league_models.py --min-samples 300   # lower the full-model threshold (default 500)
+  python scripts/train_league_models.py --colab             # Colab-friendly output
+"""
+
+import os
+import sys
+import json
+import pickle
+import argparse
+import time
+import warnings
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.isotonic import IsotonicRegression
+from sklearn.metrics import accuracy_score, log_loss
+
+# Suppress every Python warning for the remainder of the process.
+warnings.filterwarnings("ignore")
+# Optuna is an optional dependency: record whether it imported and quiet its logger.
+# NOTE(review): optuna_available is not read anywhere else in this script.
+optuna_available = False
+try:
+    import optuna
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+    optuna_available = True
+except ImportError:
+    pass
+
+# Put the directory two levels above this file on sys.path so the lazy
+# `models.*` / `data.*` imports further down can be resolved.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# All paths are derived from the same "two levels up" directory.
+AI_ENGINE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DATA_PATH     = os.path.join(AI_ENGINE_DIR, "data", "training_data.csv")
+MODELS_DIR    = os.path.join(AI_ENGINE_DIR, "models", "league_specific")
+REPORTS_DIR   = os.path.join(AI_ENGINE_DIR, "reports", "league_models")
+# The qualified-league list lives one directory above AI_ENGINE_DIR.
+QUALIFIED_LEAGUES_PATH = os.path.join(os.path.dirname(AI_ENGINE_DIR), "qualified_leagues.json")
+
+# Import-time side effect: the output directories are created if missing.
+os.makedirs(MODELS_DIR, exist_ok=True)
+os.makedirs(REPORTS_DIR, exist_ok=True)
+
+# ─── Markets ────────────────────────────────────────────────────────
+# One entry per market, keyed by the market name used in output file names.
+#   label       – name of the target column in the training DataFrame
+#   num_class   – number of classes; >2 selects the multi-class XGBoost params
+#   min_samples – minimum number of complete (non-null) training rows that
+#                 process_league requires before it handles the market
+MARKETS = {
+    "MS":         {"label": "label_ms",         "num_class": 3, "min_samples": 200},
+    "OU15":       {"label": "label_ou15",        "num_class": 2, "min_samples": 150},
+    "OU25":       {"label": "label_ou25",        "num_class": 2, "min_samples": 150},
+    "OU35":       {"label": "label_ou35",        "num_class": 2, "min_samples": 150},
+    "BTTS":       {"label": "label_btts",        "num_class": 2, "min_samples": 150},
+    "HT":         {"label": "label_ht_result",   "num_class": 3, "min_samples": 150},
+    "HT_OU05":    {"label": "label_ht_ou05",     "num_class": 2, "min_samples": 150},
+    "HT_OU15":    {"label": "label_ht_ou15",     "num_class": 2, "min_samples": 150},
+    "HTFT":       {"label": "label_ht_ft",       "num_class": 9, "min_samples": 300},
+    "OE":         {"label": "label_odd_even",    "num_class": 2, "min_samples": 150},
+    "CARDS":      {"label": "label_cards_ou45",  "num_class": 2, "min_samples": 150},
+    "HANDICAP":   {"label": "label_handicap_ms", "num_class": 3, "min_samples": 200},
+}
+
+# Feature columns (from training_data.csv, excluding metadata + labels).
+# get_feature_cols() treats every CSV column NOT listed here as a model feature.
+# The set also excludes label columns that have no MARKETS entry
+# (label_ou05, label_yellow_cards).
+SKIP_COLS = {
+    "match_id", "home_team_id", "away_team_id", "league_id", "mst_utc",
+    "score_home", "score_away", "total_goals", "ht_score_home", "ht_score_away",
+    "ht_total_goals",
+    "label_ms", "label_ou05", "label_ou15", "label_ou25", "label_ou35",
+    "label_btts", "label_ht_result", "label_ht_ou05", "label_ht_ou15",
+    "label_ht_ft", "label_odd_even", "label_yellow_cards", "label_cards_ou45",
+    "label_handicap_ms",
+}
+
+# XGBoost defaults — fast, no Optuna.
+# Fixed hyper-parameters used for every binary market.
+XGB_PARAMS_BINARY = {
+    "objective": "binary:logistic",
+    "eval_metric": "logloss",
+    "max_depth": 4,
+    "eta": 0.05,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "min_child_weight": 5,
+    "gamma": 0.1,
+    "reg_lambda": 1.0,
+    "verbosity": 0,
+    "seed": 42,
+    "nthread": -1,
+}
+
+# Multi-class variant: same settings with the objective and metric swapped.
+# "num_class" is not set here; train_xgb_market adds it per market.
+XGB_PARAMS_MULTI = {
+    **XGB_PARAMS_BINARY,
+    "objective": "multi:softprob",
+    "eval_metric": "mlogloss",
+}
+
+
+def load_data() -> pd.DataFrame:
+    """Read the training CSV at DATA_PATH into a DataFrame and print its size."""
+    print(f"Loading training data from {DATA_PATH} ...")
+    frame = pd.read_csv(DATA_PATH, low_memory=False)
+    n_rows, n_cols = frame.shape
+    print(f"  {n_rows:,} rows, {n_cols} columns")
+    return frame
+
+
+def get_feature_cols(df: pd.DataFrame) -> list:
+    """Return df's column names, in order, without the names listed in SKIP_COLS."""
+    feature_names = []
+    for column in df.columns:
+        if column in SKIP_COLS:
+            continue
+        feature_names.append(column)
+    return feature_names
+
+
+def load_qualified_leagues() -> list:
+    """Return the parsed contents of QUALIFIED_LEAGUES_PATH, or [] if the file is missing."""
+    if not os.path.exists(QUALIFIED_LEAGUES_PATH):
+        # An empty list tells main() to fall back to every league in the CSV.
+        return []
+    with open(QUALIFIED_LEAGUES_PATH) as fh:
+        leagues = json.load(fh)
+    return leagues
+
+
+def train_xgb_market(
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_test: np.ndarray,
+    y_test: np.ndarray,
+    num_class: int,
+    feature_cols: list,
+) -> tuple:
+    """Train XGBoost for one market. Returns (model, accuracy, logloss).
+
+    Args:
+        X_train, y_train: training features and integer class labels.
+        X_test, y_test: hold-out features and labels; used both as the
+            early-stopping validation set and for the reported metrics.
+        num_class: number of classes; values above 2 select the multi-class
+            parameter set and add "num_class" to it.
+        feature_cols: feature names attached to both DMatrix objects.
+
+    Returns:
+        (booster, accuracy on the hold-out set, log-loss on the hold-out set).
+    """
+    is_multiclass = num_class > 2
+    params = dict(XGB_PARAMS_MULTI if is_multiclass else XGB_PARAMS_BINARY)
+    if is_multiclass:
+        params["num_class"] = num_class
+
+    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
+    dtest  = xgb.DMatrix(X_test,  label=y_test,  feature_names=feature_cols)
+
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=300,
+        evals=[(dtest, "val")],
+        early_stopping_rounds=30,
+        verbose_eval=False,
+    )
+
+    raw = model.predict(dtest)
+    # Give log_loss the full label set explicitly. A small hold-out split can
+    # be missing some classes (or contain a single class), in which case
+    # label inference from y_test makes log_loss raise ValueError.
+    class_labels = list(range(num_class))
+    if is_multiclass:
+        probs = raw.reshape(-1, num_class)
+        preds = np.argmax(probs, axis=1)
+        ll = log_loss(y_test, probs, labels=class_labels)
+    else:
+        preds = (raw >= 0.5).astype(int)
+        ll = log_loss(y_test, raw, labels=class_labels)
+
+    acc = accuracy_score(y_test, preds)
+    return model, acc, ll
+
+
+def train_isotonic(raw_probs: np.ndarray, y_true: np.ndarray) -> IsotonicRegression:
+    """Fit and return an isotonic regression from raw_probs to y_true.
+
+    The regressor is created with out_of_bounds="clip".
+    """
+    calibrator = IsotonicRegression(out_of_bounds="clip")
+    return calibrator.fit(raw_probs, y_true)
+
+
+def get_general_v25_probs(df_league: pd.DataFrame, feature_cols: list, market: str, num_class: int):
+    """Use general V25 model to get predictions on this league's matches (for cal-only leagues).
+
+    Only rows with every feature and the market's label present are scored,
+    in batches of 500, with the V25 "xgb" model for that market.
+
+    Args:
+        df_league: matches of one league; must hold feature_cols and the label column.
+        feature_cols: feature column names, in model order.
+        market: a key of MARKETS (e.g. "MS", "OU25").
+        num_class: number of classes; >2 reshapes predictions to (rows, num_class).
+
+    Returns:
+        (probs, y) on success, or (None, None) when fewer than 50 complete
+        rows exist, the market/model is unavailable, or any step fails.
+    """
+    try:
+        from models.v25_ensemble import get_v25_predictor
+        v25 = get_v25_predictor()
+        if not v25._loaded:
+            v25.load_models()
+
+        label_col = MARKETS[market]["label"]
+        valid = df_league[feature_cols + [label_col]].dropna()
+        if len(valid) < 50:
+            return None, None
+
+        market_key_map = {
+            "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
+            "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
+            "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
+            "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
+        }
+        mkey = market_key_map.get(market)
+        if not mkey or not v25.has_market(mkey):
+            return None, None
+
+        X = valid[feature_cols].fillna(0).values
+        y = valid[label_col].values
+
+        # Look the model up once instead of once per batch.
+        xgb_model = v25.models.get(mkey, {}).get("xgb")
+        if xgb_model is None:
+            return None, None
+
+        all_probs = []
+        for start in range(0, len(X), 500):
+            df_batch = pd.DataFrame(X[start:start + 500], columns=feature_cols)
+            p = xgb_model.predict(xgb.DMatrix(df_batch))
+            if num_class > 2:
+                p = p.reshape(-1, num_class)
+            all_probs.append(p)
+
+        if not all_probs:
+            return None, None
+
+        probs = np.vstack(all_probs) if num_class > 2 else np.concatenate(all_probs)
+        return probs, y
+    except Exception as e:
+        # Best-effort helper: report the failure instead of hiding it, then
+        # signal "no predictions" to the caller as before.
+        print(f"  [WARN] get_general_v25_probs({market}) failed: {e}")
+        return None, None
+
+
+def _save_isotonic_calibrators(out_dir, market, raw, y_true, num_class):
+    """Fit isotonic calibrators for one market and pickle them into out_dir.
+
+    Multi-class markets get one one-vs-rest calibrator per class, written to
+    cal_<market>_<class>.pkl; binary markets get a single cal_<market>.pkl.
+    """
+    stem = os.path.join(out_dir, f"cal_{market.lower()}")
+    if num_class > 2:
+        raw = raw.reshape(-1, num_class)
+        for cls_idx in range(num_class):
+            iso = train_isotonic(raw[:, cls_idx], (y_true == cls_idx).astype(int))
+            with open(f"{stem}_{cls_idx}.pkl", "wb") as f:
+                pickle.dump(iso, f)
+    else:
+        iso = train_isotonic(raw, y_true)
+        with open(f"{stem}.pkl", "wb") as f:
+            pickle.dump(iso, f)
+
+
+def process_league(
+    league_id: str,
+    df_league: pd.DataFrame,
+    feature_cols: list,
+    full_model: bool,
+    league_name: str,
+) -> dict:
+    """Train models for one league. Returns metrics dict.
+
+    Args:
+        league_id: league identifier; also the name of the output sub-directory
+            under MODELS_DIR.
+        df_league: all rows of this league (needs "mst_utc", the feature
+            columns and the label columns).
+        feature_cols: feature column names, in model order.
+        full_model: True trains a dedicated XGBoost model per market and
+            calibrates it on the hold-out split; False only fits isotonic
+            calibrators on top of the general V25 model's predictions.
+        league_name: display name stored in the metrics.
+
+    Returns:
+        Dict with league_id, league_name, n_matches and a per-market "markets"
+        dict; the same dict is written to <out_dir>/metrics.json. A market
+        that fails records its message under "error" instead of aborting.
+    """
+    n = len(df_league)
+    # str() keeps os.path.join working when the id was parsed as a number.
+    out_dir = os.path.join(MODELS_DIR, str(league_id))
+    os.makedirs(out_dir, exist_ok=True)
+
+    metrics = {"league_id": league_id, "league_name": league_name, "n_matches": n, "markets": {}}
+
+    # Time-based split: last 20% as test
+    split_idx = int(n * 0.80)
+    df_sorted = df_league.sort_values("mst_utc")
+    df_train = df_sorted.iloc[:split_idx]
+    df_test  = df_sorted.iloc[split_idx:]
+
+    # MARKETS name -> key used by the general V25 predictor (loop-invariant).
+    market_key_map = {
+        "MS": "ms", "OU15": "ou15", "OU25": "ou25", "OU35": "ou35",
+        "BTTS": "btts", "HT": "ht_result", "HT_OU05": "ht_ou05",
+        "HT_OU15": "ht_ou15", "HTFT": "htft", "OE": "odd_even",
+        "CARDS": "cards_ou45", "HANDICAP": "handicap_ms",
+    }
+
+    saved_feature_cols = False
+
+    for market, cfg in MARKETS.items():
+        label_col  = cfg["label"]
+        num_class  = cfg["num_class"]
+        min_samp   = cfg["min_samples"]
+
+        if label_col not in df_league.columns:
+            continue
+
+        valid_train = df_train[feature_cols + [label_col]].dropna()
+        valid_test  = df_test[feature_cols + [label_col]].dropna()
+
+        # Both tiers are gated on the size of the complete-row splits.
+        if len(valid_train) < min_samp or len(valid_test) < 30:
+            continue
+
+        X_train = valid_train[feature_cols].fillna(0).values
+        y_train = valid_train[label_col].values.astype(int)
+        X_test  = valid_test[feature_cols].fillna(0).values
+        y_test  = valid_test[label_col].values.astype(int)
+
+        mkt_metrics = {"n_train": len(X_train), "n_test": len(X_test)}
+
+        if full_model:
+            try:
+                model, acc, ll = train_xgb_market(X_train, y_train, X_test, y_test, num_class, feature_cols)
+                model_path = os.path.join(out_dir, f"xgb_{market.lower()}.json")
+                model.save_model(model_path)
+                mkt_metrics.update({"accuracy": round(acc, 4), "logloss": round(ll, 4), "model": "xgb"})
+
+                # The feature list is identical for every market; write it once.
+                if not saved_feature_cols:
+                    with open(os.path.join(out_dir, "feature_cols.json"), "w") as f:
+                        json.dump(feature_cols, f)
+                    saved_feature_cols = True
+
+                # Isotonic calibration from own model predictions
+                dtest_xgb = xgb.DMatrix(X_test, feature_names=feature_cols)
+                raw = model.predict(dtest_xgb)
+                _save_isotonic_calibrators(out_dir, market, raw, y_test, num_class)
+
+            except Exception as e:
+                mkt_metrics["error"] = str(e)
+        else:
+            # Calibration only: use general V25 model
+            try:
+                all_valid = df_league[feature_cols + [label_col]].dropna()
+                if len(all_valid) < min_samp:
+                    continue
+
+                X_all = all_valid[feature_cols].fillna(0).values
+                y_all = all_valid[label_col].values.astype(int)
+
+                # Use V25 general model
+                from models.v25_ensemble import get_v25_predictor
+                v25 = get_v25_predictor()
+                if not v25._loaded:
+                    v25.load_models()
+
+                mkey = market_key_map.get(market)
+                if not mkey or not v25.has_market(mkey):
+                    continue
+
+                df_feat = pd.DataFrame(X_all, columns=feature_cols)
+                dmat = xgb.DMatrix(df_feat)
+                models_v25 = v25.models.get(mkey, {})
+                if "xgb" not in models_v25:
+                    continue
+                raw = models_v25["xgb"].predict(dmat)
+
+                _save_isotonic_calibrators(out_dir, market, raw, y_all, num_class)
+
+                # Calibrators were fitted on every complete row, not the train split.
+                mkt_metrics.update({"n_train": len(X_all), "model": "cal_only"})
+            except Exception as e:
+                mkt_metrics["error"] = str(e)
+
+        metrics["markets"][market] = mkt_metrics
+
+    # Save metrics
+    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
+        json.dump(metrics, f, indent=2)
+
+    return metrics
+
+
+def main():
+    """CLI entry point: train per-league models and write a master report.
+
+    Leagues with at least --min-samples rows get full XGBoost models, leagues
+    with at least --cal-min rows get calibration only, and the rest are
+    skipped. A failure in one league is printed and does not stop the run.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--min-samples", type=int, default=500, help="Min matches for full model")
+    parser.add_argument("--cal-min",     type=int, default=100, help="Min matches for calibration")
+    # NOTE(review): --colab is parsed but not read anywhere in this function.
+    parser.add_argument("--colab",       action="store_true",   help="Colab-friendly verbose output")
+    args = parser.parse_args()
+
+    start_total = time.time()
+
+    df = load_data()
+    feature_cols = get_feature_cols(df)
+    print(f"Feature columns: {len(feature_cols)}")
+
+    qualified = load_qualified_leagues()
+    if not qualified:
+        qualified = df["league_id"].unique().tolist()
+    print(f"Qualified leagues: {len(qualified)}")
+
+    # Get league names (best effort: the run continues without the database)
+    league_names = {}
+    try:
+        import psycopg2
+        from data.db import get_clean_dsn
+        conn = psycopg2.connect(get_clean_dsn())
+        try:
+            cur = conn.cursor()
+            cur.execute("SELECT id, name FROM leagues WHERE id = ANY(%s)", (qualified,))
+            league_names = {r[0]: r[1] for r in cur.fetchall()}
+        finally:
+            # Close the connection even when the query fails.
+            conn.close()
+    except Exception as e:
+        print(f"League names unavailable ({e}); falling back to league ids.")
+
+    # Filter to qualified leagues with enough data
+    counts = df[df["league_id"].isin(qualified)].groupby("league_id").size()
+    full_model_ids = counts[counts >= args.min_samples].index.tolist()
+    cal_only_ids   = counts[(counts >= args.cal_min) & (counts < args.min_samples)].index.tolist()
+
+    # Set membership keeps the skipped-league count linear in the number of leagues.
+    selected_ids = set(full_model_ids) | set(cal_only_ids)
+    n_skipped = sum(1 for lid in qualified if lid not in selected_ids)
+
+    print(f"\nTam model ({args.min_samples}+ maç): {len(full_model_ids)} lig")
+    print(f"Kalibrasyon ({args.cal_min}-{args.min_samples-1} maç): {len(cal_only_ids)} lig")
+    print(f"Atlandı (<{args.cal_min} maç): {n_skipped} lig")
+    print()
+
+    all_results = []
+    total = len(full_model_ids) + len(cal_only_ids)
+    done = 0
+
+    for league_id, full_model in (
+        [(lid, True) for lid in full_model_ids] +
+        [(lid, False) for lid in cal_only_ids]
+    ):
+        t0 = time.time()
+        df_league = df[df["league_id"] == league_id].copy()
+        n = len(df_league)
+        # Fall back to a shortened id; str() guards against non-string ids,
+        # which cannot be sliced.
+        name = league_names.get(league_id) or str(league_id)[:12]
+        tier = "FULL" if full_model else "CAL"
+
+        try:
+            result = process_league(league_id, df_league, feature_cols, full_model, name)
+            done += 1
+            elapsed = time.time() - t0
+
+            # Build accuracy string for key markets
+            acc_parts = []
+            for mkt in ["MS", "OU15", "OU25", "BTTS"]:
+                m = result["markets"].get(mkt, {})
+                if "accuracy" in m:
+                    acc_parts.append(f"{mkt}={m['accuracy']*100:.1f}%")
+            acc_str = " | ".join(acc_parts) if acc_parts else "(cal only)"
+
+            print(f"[{done:>3}/{total}] [{tier}] {name:<35} {n:>6,} maç | {acc_str} | {elapsed:.1f}s")
+            all_results.append(result)
+
+        except Exception as e:
+            done += 1
+            print(f"[{done:>3}/{total}] [{tier}] {name:<35} ERROR: {e}")
+
+        # Every 10 leagues, print elapsed time and a linear estimate of the rest.
+        if done % 10 == 0:
+            elapsed_total = time.time() - start_total
+            remaining = (elapsed_total / done) * (total - done)
+            print(f"  ── {done}/{total} tamamlandı | geçen: {elapsed_total/60:.1f}dk | kalan tahmini: {remaining/60:.1f}dk ──")
+
+    # Final report
+    total_elapsed = time.time() - start_total
+    print(f"\n{'='*70}")
+    print(f"TAMAMLANDI: {len(all_results)}/{total} lig | Süre: {total_elapsed/60:.1f} dakika")
+    print(f"{'='*70}")
+
+    # Top 30 by MS accuracy (only leagues that have an MS accuracy, i.e. full models)
+    printable = [(r["league_name"], r["n_matches"], r["markets"]) for r in all_results
+                 if "MS" in r["markets"] and "accuracy" in r["markets"]["MS"]]
+    printable.sort(key=lambda x: x[2]["MS"].get("accuracy", 0), reverse=True)
+
+    print(f"\n{'Liga':<35} {'Maç':>6} {'MS':>7} {'OU15':>7} {'OU25':>7} {'BTTS':>7}")
+    print("-" * 70)
+    for name, n, mkts in printable[:30]:
+        ms   = mkts.get("MS",   {}).get("accuracy", 0) * 100
+        ou15 = mkts.get("OU15", {}).get("accuracy", 0) * 100
+        ou25 = mkts.get("OU25", {}).get("accuracy", 0) * 100
+        btts = mkts.get("BTTS", {}).get("accuracy", 0) * 100
+        print(f"{name:<35} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%")
+
+    # Save master report
+    report = {
+        "generated_at": datetime.now().isoformat(),
+        "total_leagues": len(all_results),
+        "elapsed_minutes": round(total_elapsed / 60, 1),
+        "results": all_results,
+    }
+    report_path = os.path.join(REPORTS_DIR, "league_models_report.json")
+    with open(report_path, "w") as f:
+        json.dump(report, f, indent=2)
+    print(f"\nRapor kaydedildi: {report_path}")
+
+
+# Run the trainer only when this file is executed directly, not on import.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# League-Specific Model Trainer \u2014 Google Colab\n",
+    "164 lig i\u00e7in XGBoost + isotonic kalibrasyon. 12 market.\n",
+    "Modeller Drive'a kaydedilir, `models/league_specific/` klas\u00f6r\u00fcne kopyalan\u0131r.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mount Drive\n",
+    "from google.colab import drive\n",
+    "drive.mount('/content/drive')\n",
+    "\n",
+    "DRIVE_DIR = '/content/drive/MyDrive/iddaai'\n",
+    "import os\n",
+    "os.makedirs(DRIVE_DIR, exist_ok=True)\n",
+    "print('Drive mounted:', DRIVE_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# training_data.csv zaten Drive da: /content/drive/MyDrive/iddaai/training_data.csv\n",
+    "# Sadece qualified_leagues.json upload et (iddaai-be/ klas\u00f6r\u00fcnden)\n",
+    "from google.colab import files\n",
+    "import shutil\n",
+    "print(\"qualified_leagues.json dosyasini upload edin\")\n",
+    "uploaded = files.upload()\n",
+    "for fname in uploaded:\n",
+    "    shutil.copy(fname, f\"{DRIVE_DIR}/{fname}\")\n",
+    "    print(f\"Kaydedildi: {DRIVE_DIR}/{fname}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Upload training_data.csv from the local machine (skip this cell if the file is already on Drive)\n",
+    "from google.colab import files\n",
+    "print('training_data.csv upload edin (ai-engine/data/training_data.csv)')\n",
+    "uploaded = files.upload()\n",
+    "import shutil\n",
+    "for fname in uploaded:\n",
+    "    shutil.copy(fname, f'{DRIVE_DIR}/{fname}')\n",
+    "    print(f'Saved: {DRIVE_DIR}/{fname}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, json, pickle, time, warnings\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import xgboost as xgb\n",
+    "from sklearn.isotonic import IsotonicRegression\n",
+    "from sklearn.metrics import accuracy_score, log_loss\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "DRIVE_DIR   = '/content/drive/MyDrive/iddaai'\n",
+    "DATA_PATH   = f'{DRIVE_DIR}/training_data.csv'\n",
+    "QL_PATH     = f'{DRIVE_DIR}/qualified_leagues.json'\n",
+    "MODELS_DIR  = f'{DRIVE_DIR}/league_specific'\n",
+    "os.makedirs(MODELS_DIR, exist_ok=True)\n",
+    "\n",
+    "MARKETS = {\n",
+    "    'MS':       {'label': 'label_ms',          'num_class': 3, 'min_samples': 200},\n",
+    "    'OU15':     {'label': 'label_ou15',         'num_class': 2, 'min_samples': 150},\n",
+    "    'OU25':     {'label': 'label_ou25',         'num_class': 2, 'min_samples': 150},\n",
+    "    'OU35':     {'label': 'label_ou35',         'num_class': 2, 'min_samples': 150},\n",
+    "    'BTTS':     {'label': 'label_btts',         'num_class': 2, 'min_samples': 150},\n",
+    "    'HT':       {'label': 'label_ht_result',    'num_class': 3, 'min_samples': 150},\n",
+    "    'HT_OU05':  {'label': 'label_ht_ou05',      'num_class': 2, 'min_samples': 150},\n",
+    "    'HT_OU15':  {'label': 'label_ht_ou15',      'num_class': 2, 'min_samples': 150},\n",
+    "    'HTFT':     {'label': 'label_ht_ft',        'num_class': 9, 'min_samples': 300},\n",
+    "    'OE':       {'label': 'label_odd_even',     'num_class': 2, 'min_samples': 150},\n",
+    "    'CARDS':    {'label': 'label_cards_ou45',   'num_class': 2, 'min_samples': 150},\n",
+    "    'HANDICAP': {'label': 'label_handicap_ms',  'num_class': 3, 'min_samples': 200},\n",
+    "}\n",
+    "\n",
+    "SKIP_COLS = {\n",
+    "    'match_id','home_team_id','away_team_id','league_id','mst_utc',\n",
+    "    'score_home','score_away','total_goals','ht_score_home','ht_score_away','ht_total_goals',\n",
+    "    'label_ms','label_ou05','label_ou15','label_ou25','label_ou35','label_btts',\n",
+    "    'label_ht_result','label_ht_ou05','label_ht_ou15','label_ht_ft',\n",
+    "    'label_odd_even','label_yellow_cards','label_cards_ou45','label_handicap_ms',\n",
+    "}\n",
+    "\n",
+    "XGB_BASE = {\n",
+    "    'max_depth': 4, 'eta': 0.05, 'subsample': 0.8,\n",
+    "    'colsample_bytree': 0.8, 'min_child_weight': 5,\n",
+    "    'gamma': 0.1, 'reg_lambda': 1.0, 'verbosity': 0, 'seed': 42,\n",
+    "    'nthread': -1,\n",
+    "}\n",
+    "\n",
+    "df = pd.read_csv(DATA_PATH, low_memory=False)\n",
+    "feature_cols = [c for c in df.columns if c not in SKIP_COLS]\n",
+    "print(f'Y\u00fcklendi: {len(df):,} sat\u0131r | {len(feature_cols)} feature')\n",
+    "\n",
+    "qualified = json.load(open(QL_PATH)) if os.path.exists(QL_PATH) else df['league_id'].unique().tolist()\n",
+    "counts = df[df['league_id'].isin(qualified)].groupby('league_id').size()\n",
+    "full_ids = counts[counts >= 500].index.tolist()\n",
+    "cal_ids  = counts[(counts >= 100) & (counts < 500)].index.tolist()\n",
+    "print(f'Tam model: {len(full_ids)} | Kalibrasyon: {len(cal_ids)} | Toplam: {len(full_ids)+len(cal_ids)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_one_league(league_id, df_league, feature_cols, full_model):\n",
+    "    n = len(df_league)\n",
+    "    out_dir = f'{MODELS_DIR}/{league_id}'\n",
+    "    os.makedirs(out_dir, exist_ok=True)\n",
+    "    metrics = {}\n",
+    "\n",
+    "    df_sorted = df_league.sort_values('mst_utc')\n",
+    "    split = int(n * 0.80)\n",
+    "    df_tr, df_te = df_sorted.iloc[:split], df_sorted.iloc[split:]\n",
+    "\n",
+    "    saved_fc = False\n",
+    "\n",
+    "    for market, cfg in MARKETS.items():\n",
+    "        lbl, nc, ms = cfg['label'], cfg['num_class'], cfg['min_samples']\n",
+    "        if lbl not in df_league.columns:\n",
+    "            continue\n",
+    "\n",
+    "        if full_model:\n",
+    "            vtr = df_tr[feature_cols + [lbl]].dropna()\n",
+    "            vte = df_te[feature_cols + [lbl]].dropna()\n",
+    "            if len(vtr) < ms or len(vte) < 30:\n",
+    "                continue\n",
+    "            Xtr, ytr = vtr[feature_cols].fillna(0).values, vtr[lbl].values.astype(int)\n",
+    "            Xte, yte = vte[feature_cols].fillna(0).values, vte[lbl].values.astype(int)\n",
+    "\n",
+    "            params = {**XGB_BASE, 'objective': 'multi:softprob' if nc > 2 else 'binary:logistic',\n",
+    "                      'eval_metric': 'mlogloss' if nc > 2 else 'logloss'}\n",
+    "            if nc > 2: params['num_class'] = nc\n",
+    "\n",
+    "            dtr = xgb.DMatrix(Xtr, label=ytr, feature_names=feature_cols)\n",
+    "            dte = xgb.DMatrix(Xte, label=yte, feature_names=feature_cols)\n",
+    "            model = xgb.train(params, dtr, 300, [(dte,'v')], early_stopping_rounds=30, verbose_eval=False)\n",
+    "            model.save_model(f'{out_dir}/xgb_{market.lower()}.json')\n",
+    "\n",
+    "            if not saved_fc:\n",
+    "                json.dump(feature_cols, open(f'{out_dir}/feature_cols.json','w'))\n",
+    "                saved_fc = True\n",
+    "\n",
+    "            raw = model.predict(dte)\n",
+    "            if nc > 2:\n",
+    "                raw = raw.reshape(-1, nc)\n",
+    "                acc = accuracy_score(yte, np.argmax(raw, axis=1))\n",
+    "                for ci in range(nc):\n",
+    "                    iso = IsotonicRegression(out_of_bounds='clip').fit(raw[:,ci], (yte==ci).astype(int))\n",
+    "                    pickle.dump(iso, open(f'{out_dir}/cal_{market.lower()}_{ci}.pkl','wb'))\n",
+    "            else:\n",
+    "                acc = accuracy_score(yte, (raw>=0.5).astype(int))\n",
+    "                iso = IsotonicRegression(out_of_bounds='clip').fit(raw, yte)\n",
+    "                pickle.dump(iso, open(f'{out_dir}/cal_{market.lower()}.pkl','wb'))\n",
+    "\n",
+    "            metrics[market] = {'accuracy': round(float(acc),4), 'n_train': len(Xtr)}\n",
+    "        else:\n",
+    "            # Calibration-only league: no model is trained here; store a marker entry so prediction knows to use general V25\n",
+    "            metrics[market] = {'model': 'cal_only', 'n': n}\n",
+    "\n",
+    "    json.dump({'league_id': league_id, 'n': n, 'markets': metrics},\n",
+    "               open(f'{out_dir}/metrics.json','w'), indent=2)\n",
+    "    return metrics\n",
+    "\n",
+    "start = time.time()\n",
+    "all_ids = [(lid, True) for lid in full_ids] + [(lid, False) for lid in cal_ids]\n",
+    "results = []\n",
+    "\n",
+    "for i, (lid, full) in enumerate(all_ids, 1):\n",
+    "    dfl = df[df['league_id'] == lid].copy()\n",
+    "    t0 = time.time()\n",
+    "    try:\n",
+    "        mkt_res = train_one_league(lid, dfl, feature_cols, full)\n",
+    "        ms_acc = mkt_res.get('MS', {}).get('accuracy', '-')\n",
+    "        results.append((lid, len(dfl), mkt_res))\n",
+    "        print(f'[{i:>3}/{len(all_ids)}] {lid[:20]:<20} n={len(dfl):>5,} MS={ms_acc} {time.time()-t0:.1f}s')\n",
+    "    except Exception as e:\n",
+    "        print(f'[{i:>3}/{len(all_ids)}] {lid[:20]:<20} ERROR: {e}')\n",
+    "\n",
+    "    if i % 20 == 0:\n",
+    "        el = time.time()-start\n",
+    "        print(f'  \u2500\u2500 {i}/{len(all_ids)} done | {el/60:.1f}min elapsed | ~{el/i*(len(all_ids)-i)/60:.1f}min left \u2500\u2500')\n",
+    "\n",
+    "print(f'\\nBitti! {len(results)} lig | {(time.time()-start)/60:.1f} dakika')\n",
+    "print(f'Modeller: {MODELS_DIR}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sonu\u00e7lar\u0131 g\u00f6ster \u2014 MS accuracy s\u0131ralamas\u0131\n",
+    "printable = [(lid, n, m) for lid, n, m in results if 'MS' in m and 'accuracy' in m['MS']]\n",
+    "printable.sort(key=lambda x: x[2]['MS']['accuracy'], reverse=True)\n",
+    "print(f'{\"Liga ID\":<30} {\"Ma\u00e7\":>6} {\"MS\":>7} {\"OU15\":>7} {\"OU25\":>7} {\"BTTS\":>7}')\n",
+    "print('-'*70)\n",
+    "for lid, n, m in printable[:30]:\n",
+    "    ms   = m.get('MS',  {}).get('accuracy', 0)*100\n",
+    "    ou15 = m.get('OU15',{}).get('accuracy', 0)*100\n",
+    "    ou25 = m.get('OU25',{}).get('accuracy', 0)*100\n",
+    "    btts = m.get('BTTS',{}).get('accuracy', 0)*100\n",
+    "    print(f'{lid:<30} {n:>6,} {ms:>6.1f}% {ou15:>6.1f}% {ou25:>6.1f}% {btts:>6.1f}%')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Zip ve indir\n",
+    "import shutil\n",
+    "zip_path = f'{DRIVE_DIR}/league_specific_models.zip'\n",
+    "shutil.make_archive(zip_path.replace('.zip',''), 'zip', MODELS_DIR)\n",
+    "print(f'Zip: {zip_path}')\n",
+    "# \u0130ndirmek i\u00e7in:\n",
+    "# from google.colab import files\n",
+    "# files.download(zip_path)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 1 — Paketler\n",
+    "!pip install xgboost lightgbm optuna scikit-learn pandas numpy -q\n",
+    "print('Hazır')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 2 — Drive bağla + CSV çek\n",
+    "from google.colab import drive\n",
+    "import os, shutil\n",
+    "drive.mount('/content/drive')\n",
+    "\n",
+    "# training_data.csv'yi Drive'ın iddaai klasöründen kopyala\n",
+    "shutil.copy('/content/drive/MyDrive/iddaai/training_data.csv', '/content/training_data.csv')\n",
+    "print('CSV hazır:', os.path.getsize('/content/training_data.csv') // 1024 // 1024, 'MB')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 3 — iddaai_colab3.zip upload et (ai-engine kodları)\n",
+    "from google.colab import files\n",
+    "import zipfile\n",
+    "print('iddaai_colab3.zip dosyasını seç:')\n",
+    "uploaded = files.upload()\n",
+    "with zipfile.ZipFile('iddaai_colab3.zip') as z:\n",
+    "    z.extractall('/content')\n",
+    "print('Kod hazır')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 4 — training_data.csv'yi script'in beklediği yere koy\n",
+    "import os, shutil\n",
+    "os.makedirs('/content/ai-engine/data', exist_ok=True)\n",
+    "shutil.copy('/content/training_data.csv', '/content/ai-engine/data/training_data.csv')\n",
+    "print('Yerleştirildi')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 5 — Eğitimi başlat (her 5 trial'da bir ilerleme gösterir)\n",
+    "import subprocess, os\n",
+    "\n",
+    "proc = subprocess.Popen(\n",
+    "    ['python', 'scripts/train_v25_pro.py'],\n",
+    "    stdout=subprocess.PIPE,\n",
+    "    stderr=subprocess.STDOUT,\n",
+    "    text=True,\n",
+    "    cwd='/content/ai-engine',\n",
+    "    env={**os.environ, 'PYTHONPATH': '/content/ai-engine'}\n",
+    ")\n",
+    "\n",
+    "for line in proc.stdout:\n",
+    "    print(line, end='', flush=True)\n",
+    "\n",
+    "proc.wait()\n",
+    "print('\\nEĞİTİM BİTTİ!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HÜCRE 6 — Modelleri Drive'a kaydet\n",
+    "import shutil, os\n",
+    "os.makedirs('/content/drive/MyDrive/iddaai/models_v25', exist_ok=True)\n",
+    "shutil.copytree(\n",
+    "    '/content/ai-engine/models/v25',\n",
+    "    '/content/drive/MyDrive/iddaai/models_v25',\n",
+    "    dirs_exist_ok=True\n",
+    ")\n",
+    "print('Modeller Drive a kaydedildi: MyDrive/iddaai/models_v25/')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
+  "language_info": {"name": "python", "version": "3.10.0"}
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -101,6 +101,32 @@ FEATURES = [
    "home_top_scorer_form", "away_top_scorer_form",
    "home_avg_player_exp", "away_avg_player_exp",
    "home_goals_diversity", "away_goals_diversity",
+    # V27 H2H Expanded (4)
+    "h2h_home_goals_avg", "h2h_away_goals_avg",
+    "h2h_recent_trend", "h2h_venue_advantage",
+    # V27 Rolling Stats (12 listed; no away_rolling20_* entries; TODO confirm intended)
+    "home_rolling5_goals", "home_rolling5_conceded",
+    "home_rolling10_goals", "home_rolling10_conceded",
+    "home_rolling20_goals", "home_rolling20_conceded",
+    "away_rolling5_goals", "away_rolling5_conceded",
+    "away_rolling10_goals", "away_rolling10_conceded",
+    "home_rolling5_cs", "away_rolling5_cs",
+    # V27 Venue Stats (4)
+    "home_venue_goals", "home_venue_conceded",
+    "away_venue_goals", "away_venue_conceded",
+    # V27 Goal Trend (2)
+    "home_goal_trend", "away_goal_trend",
+    # V27 Calendar (5)
+    "home_days_rest", "away_days_rest",
+    "match_month", "is_season_start", "is_season_end",
+    # V27 Interaction (6)
+    "attack_vs_defense_home", "attack_vs_defense_away",
+    "xg_diff", "form_momentum_interaction",
+    "elo_form_consistency", "upset_x_elo_gap",
+    # V27 League Expanded (5)
+    "league_home_win_rate", "league_draw_rate",
+    "league_btts_rate", "league_ou25_rate",
+    "league_reliability_score",
 ]

 MARKET_CONFIGS = [
@@ -295,12 +321,18 @@ def train_market(df, target_col, market_name, num_class, n_trials):

    print(f"[INFO] Split: train={len(X_train)} val={len(X_val)} cal={len(X_cal)} test={len(X_test)}")

+    def _cb(study, trial):
+        if trial.number % 5 == 0 or trial.number == n_trials - 1:
+            best = study.best_value if study.best_trial else float('inf')
+            print(f"  [{trial.number+1:>3}/{n_trials}] loss={trial.value:.4f} | best={best:.4f}", flush=True)
+
    # ── Phase 1: Optuna XGBoost ──────────────────────────────────
    print(f"\n[OPTUNA] XGBoost tuning ({n_trials} trials)...")
    xgb_study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
    xgb_study.optimize(
        lambda trial: xgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
+        callbacks=[_cb],
    )
    xgb_best = xgb_study.best_params
    print(f"[OK] XGB best logloss: {xgb_study.best_value:.4f}")
@@ -311,6 +343,7 @@ def train_market(df, target_col, market_name, num_class, n_trials):
    lgb_study.optimize(
        lambda trial: lgb_objective(trial, X_train, y_train, X_val, y_val, num_class),
        n_trials=n_trials,
+        callbacks=[_cb],
    )
    lgb_best = lgb_study.best_params
    print(f"[OK] LGB best logloss: {lgb_study.best_value:.4f}")