feat(ai-engine): value sniper thresholds and logic relaxed

2026-05-06 17:44:45 +03:00
parent 5b5f83c8cf
commit 4f7090e2d9
13 changed files with 2040 additions and 382 deletions
@@ -0,0 +1,146 @@
+import os
+import sys
+import psycopg2
+from psycopg2.extras import RealDictCursor
+
+# Path setup: make the ai-engine root importable when run as a script
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from services.single_match_orchestrator import SingleMatchOrchestrator
+from services.feature_enrichment import FeatureEnrichmentService
+
+# Connection string for the backtest database.  The BACKTEST_DSN environment
+# variable overrides the built-in default so credentials do not have to live
+# in source control.
+# NOTE(review): the fallback below embeds a password in the repository —
+# rotate it and drop the fallback once every environment sets BACKTEST_DSN.
+DSN = os.environ.get(
+    "BACKTEST_DSN",
+    "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db",
+)
+
+def run_backtest(target_date="2026-05-03"):
+    """Backtest the orchestrator against one day of finished matches.
+
+    Loads every finished match (status FT/AET/PEN with both scores recorded)
+    whose kick-off date equals ``target_date``, asks
+    ``SingleMatchOrchestrator.analyze_match`` for a prediction package, grades
+    each playable main pick against the final score and prints a per-match
+    log followed by a summary report.
+
+    Args:
+        target_date: Day to replay, as a ``YYYY-MM-DD`` string.
+
+    Returns:
+        None. All results are printed to stdout.
+    """
+    # 1. Fetch the finished matches (with team names) for the target date.
+    #    The connection is only needed for this query, so it is closed as
+    #    soon as the rows are in memory.
+    conn = psycopg2.connect(DSN)
+    try:
+        with conn.cursor(cursor_factory=RealDictCursor) as cur:
+            cur.execute("""
+                SELECT m.id, m.score_home, m.score_away, m.mst_utc,
+                       t1.name as home_name, t2.name as away_name
+                FROM matches m
+                LEFT JOIN teams t1 ON m.home_team_id = t1.id
+                LEFT JOIN teams t2 ON m.away_team_id = t2.id
+                WHERE m.status IN ('FT', 'AET', 'PEN')
+                  AND to_timestamp(m.mst_utc / 1000.0)::date = %s::date
+                  AND m.score_home IS NOT NULL
+                  AND m.score_away IS NOT NULL
+                ORDER BY m.mst_utc ASC
+            """, (target_date,))
+            matches = cur.fetchall()
+    finally:
+        conn.close()
+
+    if not matches:
+        print(f"❌ {target_date} tarihinde bitmiş maç bulunamadı.")
+        return
+
+    print(f"🚀 {target_date} için Orkestratör Backtesti Başlatılıyor... ({len(matches)} maç bulundu)")
+    print("-" * 60)
+
+    orchestrator = SingleMatchOrchestrator()
+
+    bets_placed = 0
+    won = 0
+    lost = 0
+    total_odds_won = 0.0
+
+    for match in matches:
+        # 2. Let the orchestrator analyse the match; one failing match must
+        #    not abort the whole backtest.
+        try:
+            package = orchestrator.analyze_match(match['id'])
+        except Exception as e:
+            print(f"Hata ({match['id']}): {e}")
+            continue
+
+        if not package:
+            continue
+
+        h = match['score_home']
+        a = match['score_away']
+        # A key that is present but None is treated like a missing key.
+        bet_advice = package.get("bet_advice") or {}
+        main_pick = package.get("main_pick") or {}
+
+        # 3. Did the orchestrator decide to bet on this match?
+        #    (explicit comparison kept so only True/1 counts as playable)
+        if bet_advice.get("playable") == True:
+            bets_placed += 1
+            market = main_pick.get("market")
+            pick = main_pick.get("pick")
+            odds = float(main_pick.get("odds", 0.0) or 0.0)
+
+            # Grade the pick against the final score.
+            if _is_winning_pick(market, pick, h, a):
+                won += 1
+                total_odds_won += odds
+                res = "✅ KAZANDI"
+            else:
+                lost += 1
+                res = "❌ KAYBETTİ"
+
+            print(f"[{res}] {match['home_name']} {h}-{a} {match['away_name']} | Tahmin: {market} {pick} (Oran: {odds})")
+        else:
+            reasons = main_pick.get("reasons", ["Bilinmeyen Neden"]) if main_pick else ["No main pick"]
+            # str() each entry so a non-string reason cannot break the join.
+            reason = " | ".join(map(str, reasons)) if isinstance(reasons, list) else str(reasons)
+
+            rejected_market = main_pick.get("market", "N/A") if main_pick else "N/A"
+            rejected_pick = main_pick.get("pick", "N/A") if main_pick else "N/A"
+
+            # Printed once per skipped match (the original emitted this line twice).
+            print(f"[PAS] {match['home_name']} {h}-{a} {match['away_name']} | Reddedilen: {rejected_market} {rejected_pick} -> Neden: {reason}")
+            if "market_passed_all_gates" in reason:
+                print(f"      DEBUG: bet_advice = {bet_advice}")
+
+            # Compare the model's most likely 1X2 outcome with what happened.
+            market_board = package.get("market_board") or {}
+            v25_ms = (market_board.get("MS") or {}).get("probs") or {}
+            v25_top = max(v25_ms, key=v25_ms.get) if v25_ms else "N/A"
+            actual_ms = "1" if h > a else ("X" if h == a else "2")
+            print(f"      [V25 MS Raw: {v25_top}] [Gerçek MS: {actual_ms}]")
+
+    # Summary report
+    print("\n" + "=" * 60)
+    print(f"📊 BACKTEST SONUÇLARI ({target_date})")
+    print("=" * 60)
+    print(f"Toplam Maç Sayısı   : {len(matches)}")
+    print(f"Oynanan Bahis Sayısı: {bets_placed} (Oynama Oranı: %{bets_placed/len(matches)*100:.1f})")
+    print(f"Riskli Bulunup Pas Geçilen: {len(matches) - bets_placed}")
+
+    if bets_placed > 0:
+        win_rate = won / bets_placed * 100
+        # Flat one-unit stakes: return is the sum of winning odds minus the
+        # number of units staked.
+        roi = ((total_odds_won - bets_placed) / bets_placed) * 100
+        print(f"Kazanılan           : {won}")
+        print(f"Kaybedilen          : {lost}")
+        print(f"İsabet Oranı        : %{win_rate:.1f}")
+        print(f"Net Kar (ROI)       : %{roi:.1f} {'📈' if roi > 0 else '📉'}")
+
+
+def _is_winning_pick(market, pick, h, a):
+    """Return True when ``pick`` on ``market`` wins for a final score of h-a.
+
+    Unknown markets or picks are graded as not won.
+    """
+    total = h + a
+    if market == "MS":
+        return (
+            (pick == "1" and h > a)
+            or (pick in ("X", "0") and h == a)
+            or (pick == "2" and a > h)
+        )
+    if market == "OU25":
+        return (pick == "Üst" and total > 2.5) or (pick == "Alt" and total < 2.5)
+    if market == "OU15":
+        return (pick == "Üst" and total > 1.5) or (pick == "Alt" and total < 1.5)
+    if market == "BTTS":
+        return (
+            (pick == "KG Var" and h > 0 and a > 0)
+            or (pick == "KG Yok" and (h == 0 or a == 0))
+        )
+    if market == "DC":
+        return (
+            (pick == "1X" and h >= a)
+            or (pick == "12" and h != a)
+            or (pick == "X2" and h <= a)
+        )
+    return False
+
+if __name__ == "__main__":
+    # Script entry point: replay the fixed reference day.
+    run_backtest(target_date="2026-05-03")
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+"""
+AI Features Full Enrichment Script
+====================================
+Fills empty/default columns in football_ai_features that were not populated
+by the original elo_backfill_v1 script.
+
+Enriches: H2H, referee, team_stats, league_averages, form_streaks,
+          rolling_goals, implied_odds, and clean_sheet/scoring rates.
+
+Usage:
+    python scripts/enrich_ai_features.py                          # enrich all
+    python scripts/enrich_ai_features.py --batch-size 500         # smaller batches
+    python scripts/enrich_ai_features.py --dry-run                # preview only
+    python scripts/enrich_ai_features.py --force                  # re-enrich all rows
+    python scripts/enrich_ai_features.py --limit 1000             # process N rows max
+
+Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import time
+import argparse
+from typing import Any, Dict, List, Optional, Tuple
+
+# Add ai-engine root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import psycopg2
+from psycopg2.extras import RealDictCursor, execute_values
+
+from data.db import get_clean_dsn
+from services.feature_enrichment import FeatureEnrichmentService
+
+# ────────────────────────── constants ──────────────────────────
+
+# Version tag written to the `calculator_ver` column of every enriched row.
+CALCULATOR_VER = 'enrichment_v2.0'
+# Default for the --batch-size CLI option: enriched rows buffered per upsert.
+DEFAULT_BATCH_SIZE = 200
+
+
+# ────────────────────────── helpers ────────────────────────────
+
+def fetch_unenriched_matches(
+    conn: psycopg2.extensions.connection,
+    force: bool = False,
+    limit: Optional[int] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Fetch the finished football matches whose feature row still needs enrichment.
+
+    A row counts as unenriched when both ``h2h_total`` and
+    ``referee_avg_cards`` are still 0.  Note that a match with genuinely no
+    H2H history and no referee data keeps matching this filter after it has
+    been processed, so it is selected again on the next run.
+
+    Args:
+        conn: Open psycopg2 connection.
+        force: When True, return ALL rows regardless of current state.
+        limit: Maximum number of rows to return; ``None`` (or 0) means no limit.
+
+    Returns:
+        Rows (dict-like, via RealDictCursor) ordered by kick-off time ascending.
+    """
+    # Fixed SQL fragment chosen by a boolean — never built from user input.
+    enrichment_filter = "1=1" if force else (
+        "faf.h2h_total = 0 AND faf.referee_avg_cards = 0"
+    )
+    query = f"""
+        SELECT
+            faf.match_id,
+            m.home_team_id,
+            m.away_team_id,
+            m.mst_utc,
+            m.league_id,
+            m.score_home,
+            m.score_away
+        FROM football_ai_features faf
+        JOIN matches m ON m.id = faf.match_id
+        WHERE m.status = 'FT'
+          AND m.score_home IS NOT NULL
+          AND m.sport = 'football'
+          AND ({enrichment_filter})
+        ORDER BY m.mst_utc ASC
+        LIMIT %s
+    """
+    with conn.cursor(cursor_factory=RealDictCursor) as cur:
+        # LIMIT is bound as a parameter instead of being formatted into the
+        # SQL text; PostgreSQL treats LIMIT NULL as "no limit".
+        cur.execute(query, (limit or None,))
+        return cur.fetchall()
+
+
+def fetch_referee_for_match(
+    cur: RealDictCursor,
+    match_id: str,
+) -> Optional[str]:
+    """
+    Get the referee name for a match from match_officials (role_id = 1).
+
+    Best-effort lookup: returns None when no such official is stored or when
+    the query fails.
+
+    Args:
+        cur: Dict-style cursor used to run the lookup.
+        match_id: Identifier of the match.
+
+    Returns:
+        The official's name, or None.
+    """
+    try:
+        cur.execute("""
+            SELECT mo.name
+            FROM match_officials mo
+            WHERE mo.match_id = %s
+              AND mo.role_id = 1
+            LIMIT 1
+        """, (match_id,))
+        row = cur.fetchone()
+    except psycopg2.Error:
+        # A failed statement leaves the transaction aborted; without a
+        # rollback every later query on this connection would fail as well.
+        # NOTE(review): assumes no caller holds uncommitted writes on this
+        # connection while enrichment reads are running — confirm.
+        cur.connection.rollback()
+        return None
+    return row['name'] if row else None
+
+
+def fetch_implied_odds(
+    cur: RealDictCursor,
+    match_id: str,
+) -> Dict[str, float]:
+    """
+    Get margin-free implied probabilities from odd_categories + odd_selections.
+
+    Reads the match-result (1/X/2), over/under 2.5 and both-teams-to-score
+    prices, converts each complete market to probabilities normalised to sum
+    to 1, and reports the 1X2 overround.  Any market that is missing or has a
+    price <= 1.0 keeps its neutral default (0.33 / 0.50 / overround 0.0).
+
+    Args:
+        cur: Dict-style cursor used to run the lookup.
+        match_id: Identifier of the match.
+
+    Returns:
+        Dict with keys implied_home, implied_draw, implied_away,
+        implied_over25, implied_btts_yes and odds_overround.
+    """
+    implied = {
+        'implied_home': 0.33,
+        'implied_draw': 0.33,
+        'implied_away': 0.33,
+        'implied_over25': 0.50,
+        'implied_btts_yes': 0.50,
+        'odds_overround': 0.0,
+    }
+    try:
+        cur.execute("""
+            SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
+            FROM odd_selections os
+            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
+            WHERE oc.match_id = %s
+        """, (match_id,))
+        rows = cur.fetchall()
+    except psycopg2.Error:
+        # A failed statement leaves the transaction aborted; roll back so the
+        # remaining enrichment queries on this connection keep working.
+        # NOTE(review): assumes no uncommitted writes are pending here — confirm.
+        cur.connection.rollback()
+        return implied
+
+    # Collect the decimal prices we care about, keyed by a short market code.
+    odds: Dict[str, float] = {}
+    for row in rows:
+        try:
+            cat = (row.get('cat_name') or '').lower().strip()
+            sel = (row.get('sel_name') or '').strip()
+            val = float(row.get('odd_value', 0))
+            if val <= 0:
+                continue
+
+            if cat == 'maç sonucu':
+                if sel == '1':
+                    odds['ms_h'] = val
+                elif sel in ('0', 'X'):
+                    odds['ms_d'] = val
+                elif sel == '2':
+                    odds['ms_a'] = val
+            elif cat == '2,5 alt/üst':
+                if 'üst' in sel.lower():
+                    odds['ou25_o'] = val
+                elif 'alt' in sel.lower():
+                    odds['ou25_u'] = val
+            elif cat == 'karşılıklı gol':
+                if 'var' in sel.lower():
+                    odds['btts_y'] = val
+                elif 'yok' in sel.lower():
+                    odds['btts_n'] = val
+        except (ValueError, TypeError):
+            # Unparseable price (e.g. NULL or text) — ignore this selection.
+            continue
+
+    def _two_way(yes_key: str, no_key: str) -> Optional[float]:
+        """Normalised probability of the 'yes' side, or None if incomplete."""
+        yes = odds.get(yes_key, 0)
+        no = odds.get(no_key, 0)
+        if yes > 1.0 and no > 1.0:
+            raw_sum = 1 / yes + 1 / no
+            return round((1 / yes) / raw_sum, 4)
+        return None
+
+    # 1X2: strip the bookmaker margin by normalising the inverse odds.
+    ms_h = odds.get('ms_h', 0)
+    ms_d = odds.get('ms_d', 0)
+    ms_a = odds.get('ms_a', 0)
+    if ms_h > 1.0 and ms_d > 1.0 and ms_a > 1.0:
+        raw_sum = 1 / ms_h + 1 / ms_d + 1 / ms_a
+        implied['implied_home'] = round((1 / ms_h) / raw_sum, 4)
+        implied['implied_draw'] = round((1 / ms_d) / raw_sum, 4)
+        implied['implied_away'] = round((1 / ms_a) / raw_sum, 4)
+        implied['odds_overround'] = round(raw_sum - 1.0, 4)
+
+    over25 = _two_way('ou25_o', 'ou25_u')
+    if over25 is not None:
+        implied['implied_over25'] = over25
+
+    btts_yes = _two_way('btts_y', 'btts_n')
+    if btts_yes is not None:
+        implied['implied_btts_yes'] = btts_yes
+
+    return implied
+
+
+def enrich_single_match(
+    enrichment: FeatureEnrichmentService,
+    cur: RealDictCursor,
+    match: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Build the full enrichment feature row for one match.
+
+    Queries the enrichment service for team stats, H2H, form, referee, league
+    and rolling-goal figures (all evaluated at the match's ``mst_utc``), adds
+    the implied odds, and returns a dict keyed by football_ai_features column
+    names, ready for the bulk upsert.
+    """
+    mid = match['match_id']
+    home, away = str(match['home_team_id']), str(match['away_team_id'])
+    kickoff = int(match['mst_utc']) if match['mst_utc'] else 0
+    league_key = str(match['league_id']) if match['league_id'] else None
+
+    # Gather every input first, in a fixed order (home side before away).
+    stats_h = enrichment.compute_team_stats(cur, home, kickoff)
+    stats_a = enrichment.compute_team_stats(cur, away, kickoff)
+    head_to_head = enrichment.compute_h2h(cur, home, away, kickoff)
+    form_h = enrichment.compute_form_streaks(cur, home, kickoff)
+    form_a = enrichment.compute_form_streaks(cur, away, kickoff)
+    ref_stats = enrichment.compute_referee_stats(
+        cur, fetch_referee_for_match(cur, mid), kickoff,
+    )
+    league_avgs = enrichment.compute_league_averages(cur, league_key, kickoff)
+    roll_h = enrichment.compute_rolling_stats(cur, home, kickoff)
+    roll_a = enrichment.compute_rolling_stats(cur, away, kickoff)
+    odds_implied = fetch_implied_odds(cur, mid)
+
+    features: Dict[str, Any] = {'match_id': mid}
+
+    # Team stats: (stat key, rounding digits); column name is side + key.
+    for stat, digits in (
+        ('avg_possession', 2),
+        ('avg_shots_on_target', 2),
+        ('shot_conversion', 4),
+        ('avg_corners', 2),
+    ):
+        features[f'home_{stat}'] = round(stats_h[stat], digits)
+        features[f'away_{stat}'] = round(stats_a[stat], digits)
+
+    # Head-to-head record
+    features['h2h_total'] = head_to_head['total_matches']
+    features['h2h_home_win_rate'] = round(head_to_head['home_win_rate'], 4)
+    features['h2h_avg_goals'] = round(head_to_head['avg_goals'], 2)
+    features['h2h_over25_rate'] = round(head_to_head['over25_rate'], 4)
+    features['h2h_btts_rate'] = round(head_to_head['btts_rate'], 4)
+
+    # Form: rates rounded to 4 places, streaks stored as-is
+    for rate in ('clean_sheet_rate', 'scoring_rate'):
+        features[f'home_{rate}'] = round(form_h[rate], 4)
+        features[f'away_{rate}'] = round(form_a[rate], 4)
+    features['home_win_streak'] = form_h['winning_streak']
+    features['away_win_streak'] = form_a['winning_streak']
+
+    # Rolling goals over the last five matches
+    for source, column in (
+        ('rolling5_goals', 'goals_avg_5'),
+        ('rolling5_conceded', 'conceded_avg_5'),
+    ):
+        features[f'home_{column}'] = round(roll_h[source], 2)
+        features[f'away_{column}'] = round(roll_a[source], 2)
+
+    # Referee
+    features['referee_avg_cards'] = round(ref_stats['cards_total'], 2)
+    features['referee_home_bias'] = round(ref_stats['home_bias'], 4)
+    features['referee_avg_goals'] = round(ref_stats['avg_goals'], 2)
+
+    # League
+    features['league_avg_goals'] = round(league_avgs['avg_goals'], 2)
+    features['league_home_win_pct'] = round(league_avgs['home_win_rate'], 4)
+    features['league_over25_pct'] = round(league_avgs['ou25_rate'], 4)
+
+    # Implied odds are copied through unchanged
+    for key in (
+        'implied_home', 'implied_draw', 'implied_away',
+        'implied_over25', 'implied_btts_yes', 'odds_overround',
+    ):
+        features[key] = odds_implied[key]
+
+    # Missing players impact — default (no lineup data for historical)
+    features['missing_players_impact'] = 0.0
+    # Version stamp of the calculator that produced this row
+    features['calculator_ver'] = CALCULATOR_VER
+    return features
+
+
+def flush_enrichment_batch(
+    conn: psycopg2.extensions.connection,
+    rows: List[Dict[str, Any]],
+    dry_run: bool,
+) -> int:
+    """
+    Upsert a batch of enriched rows into football_ai_features and commit.
+
+    Existing rows (same match_id) have every feature column overwritten and
+    ``updated_at`` set to NOW().  Nothing is written, and 0 is returned, when
+    the batch is empty or ``dry_run`` is set; otherwise the return value is
+    the number of rows in the batch.
+    """
+    if dry_run or not rows:
+        return 0
+
+    # Columns refreshed on conflict; match_id is the conflict key itself.
+    update_cols = (
+        'home_avg_possession', 'away_avg_possession',
+        'home_avg_shots_on_target', 'away_avg_shots_on_target',
+        'home_shot_conversion', 'away_shot_conversion',
+        'home_avg_corners', 'away_avg_corners',
+        'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
+        'h2h_over25_rate', 'h2h_btts_rate',
+        'home_clean_sheet_rate', 'away_clean_sheet_rate',
+        'home_scoring_rate', 'away_scoring_rate',
+        'home_win_streak', 'away_win_streak',
+        'home_goals_avg_5', 'away_goals_avg_5',
+        'home_conceded_avg_5', 'away_conceded_avg_5',
+        'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
+        'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
+        'implied_home', 'implied_draw', 'implied_away',
+        'implied_over25', 'implied_btts_yes', 'odds_overround',
+        'missing_players_impact', 'calculator_ver',
+    )
+    columns = ('match_id',) + update_cols
+
+    set_clause = ', '.join(map('{0} = EXCLUDED.{0}'.format, update_cols))
+    placeholders = ', '.join('%s' for _ in columns)
+    # One tuple per row, values in the same order as `columns`.
+    values = [tuple(map(record.__getitem__, columns)) for record in rows]
+
+    with conn.cursor() as cur:
+        execute_values(
+            cur,
+            f"""
+            INSERT INTO football_ai_features ({', '.join(columns)})
+            VALUES %s
+            ON CONFLICT (match_id) DO UPDATE SET
+                {set_clause},
+                updated_at = NOW()
+            """,
+            values,
+            template=f"({placeholders})",
+            page_size=200,
+        )
+    conn.commit()
+    return len(rows)
+
+
+# ────────────────────────── main ───────────────────────────────
+
+def run_enrichment(
+    batch_size: int,
+    dry_run: bool,
+    force: bool,
+    limit: Optional[int],
+) -> None:
+    """
+    Core enrichment loop.
+
+    Fetches the matches that still need enrichment, computes the features for
+    each one, and upserts them in batches of ``batch_size``.  Progress is
+    printed every 500 matches and a summary at the end.
+
+    Args:
+        batch_size: Number of enriched rows buffered before each upsert.
+        dry_run: Compute features but write nothing.
+        force: Re-enrich every row instead of only unenriched ones.
+        limit: Maximum number of matches to process (None = all).
+    """
+    dsn = get_clean_dsn()
+    conn = psycopg2.connect(dsn)
+
+    # try/finally guarantees the connection is released on every exit path.
+    try:
+        print(f"\n{'=' * 60}")
+        print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
+        print(f"   batch_size={batch_size}  dry_run={dry_run}  force={force}")
+        print(f"{'=' * 60}")
+
+        # 1. Fetch unenriched matches
+        t0 = time.time()
+        matches = fetch_unenriched_matches(conn, force=force, limit=limit)
+        print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")
+
+        if not matches:
+            print("✅ Nothing to enrich — all rows already populated.")
+            return
+
+        # 2. Initialize enrichment service
+        enrichment = FeatureEnrichmentService()
+
+        # 3. Process in batches
+        total = len(matches)
+        processed = 0
+        written = 0
+        errors = 0
+        batch_buf: List[Dict[str, Any]] = []
+        t_start = time.time()
+
+        # Use a dedicated cursor with RealDictCursor for all enrichment queries
+        with conn.cursor(cursor_factory=RealDictCursor) as enrich_cur:
+            for match in matches:
+                try:
+                    enriched = enrich_single_match(enrichment, enrich_cur, match)
+                    batch_buf.append(enriched)
+                except Exception as e:
+                    errors += 1
+                    if errors <= 10:
+                        print(f"   ⚠️ Error enriching {match.get('match_id', '?')}: {e}")
+                    # A failed query aborts the open transaction; reset it so
+                    # the remaining matches can still be processed.  Nothing
+                    # is lost: every flushed batch has already been committed.
+                    conn.rollback()
+
+                processed += 1
+
+                # Flush batch
+                if len(batch_buf) >= batch_size:
+                    written += flush_enrichment_batch(conn, batch_buf, dry_run)
+                    batch_buf.clear()
+
+                # Progress reporting
+                if processed % 500 == 0:
+                    elapsed = time.time() - t_start
+                    rate = processed / elapsed if elapsed > 0 else 0
+                    remaining = (total - processed) / rate if rate > 0 else 0
+                    pct = processed / total * 100
+                    print(
+                        f"   [{processed:>8,} / {total:,}] "
+                        f"({pct:.1f}%) | {rate:.0f} matches/s | "
+                        f"ETA: {remaining / 60:.1f} min | "
+                        f"errors: {errors}"
+                    )
+
+            # Flush remaining
+            if batch_buf:
+                written += flush_enrichment_batch(conn, batch_buf, dry_run)
+
+        elapsed = time.time() - t_start
+        # Guard against a zero-length run, as the progress report does.
+        final_rate = processed / elapsed if elapsed > 0 else 0
+        print(f"\n{'=' * 60}")
+        print(f"✅ Enrichment complete:")
+        print(f"   Processed: {processed:,} matches in {elapsed:.1f}s")
+        print(f"   Written:   {written:,} rows")
+        print(f"   Errors:    {errors:,}")
+        print(f"   Rate:      {final_rate:.0f} matches/s")
+        print(f"{'=' * 60}")
+    finally:
+        conn.close()
+
+
+def main() -> None:
+    """Parse the command-line options and run the enrichment with them."""
+    cli = argparse.ArgumentParser(
+        description="Enrich football_ai_features with H2H, referee, stats, and odds data"
+    )
+
+    # (flag, add_argument keyword arguments) — registered in this order.
+    option_specs = (
+        ('--batch-size', {
+            'type': int,
+            'default': DEFAULT_BATCH_SIZE,
+            'help': f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
+        }),
+        ('--dry-run', {
+            'action': 'store_true',
+            'help': 'Compute features but do not write to DB',
+        }),
+        ('--force', {
+            'action': 'store_true',
+            'help': 'Re-enrich ALL rows, not just empty ones',
+        }),
+        ('--limit', {
+            'type': int,
+            'default': None,
+            'help': 'Max number of matches to process',
+        }),
+    )
+    for flag, spec in option_specs:
+        cli.add_argument(flag, **spec)
+
+    opts = cli.parse_args()
+    run_enrichment(
+        batch_size=opts.batch_size,
+        dry_run=opts.dry_run,
+        force=opts.force,
+        limit=opts.limit,
+    )
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
@@ -510,16 +510,24 @@ class FeatureExtractor:
        self.referee_engine = get_referee_engine()
        self.momentum_engine = get_momentum_engine()
    
+    # ── Data Quality Thresholds ──
+    # Matches below these thresholds produce default-only features that
+    # teach the model noise rather than signal.
+    DQ_MIN_FORM_MATCHES = 3          # team must have ≥3 prior matches
+    DQ_MIN_FEATURE_COVERAGE = 0.30   # ≥30% of key features must be non-default
+
    def extract_all(self) -> list:
-        """Extract features for all matches, yield row dicts."""
+        """Extract features for all matches with data quality validation."""
        matches = self.loader.matches
        total = len(matches)
        rows = []
        skipped = 0
+        dq_rejected = 0
+        dq_reasons: dict = defaultdict(int)
        t_start = time.time()
-        
+
        print(f"\n🔄 Extracting features for {total} matches...", flush=True)
-        
+
        # Process chronologically — ELO grows as we go
        for i, m in enumerate(matches):
            (
@@ -536,38 +544,43 @@ class FeatureExtractor:
                away_name,
                league_name,
            ) = m
-            
+
            if i % 100 == 0 and i > 0:
                elapsed = time.time() - t_start
                rate = i / elapsed  # matches per second
                remaining = (total - i) / rate if rate > 0 else 0
                pct = i / total * 100
-                print(f"  [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
-            
+                print(
+                    f"  [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | "
+                    f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | "
+                    f"dq_rejected: {dq_rejected}",
+                    flush=True,
+                )
+
            row = self._extract_one(
-                mid,
-                hid,
-                aid,
-                sh,
-                sa,
-                hth,
-                hta,
-                mst,
-                lid,
-                home_name,
-                away_name,
-                league_name,
+                mid, hid, aid, sh, sa, hth, hta, mst, lid,
+                home_name, away_name, league_name,
            )
-            
+
            if row:
-                rows.append(row)
+                # ── Data Quality Gate ──
+                dq_pass, reason = self._validate_row_quality(row, hid, aid, mst)
+                if dq_pass:
+                    rows.append(row)
+                else:
+                    dq_rejected += 1
+                    dq_reasons[reason] += 1
            else:
                skipped += 1
-            
+
            # Update ELO after processing (so ELO is calculated BEFORE the match)
            self._update_elo(hid, aid, sh, sa)
-        
-        print(f"  ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
+
+        print(f"  ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True)
+        if dq_reasons:
+            print(f"  📊 DQ Rejection reasons:")
+            for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]):
+                print(f"     {reason}: {count}")
        return rows
    
    def _extract_one(
@@ -867,7 +880,58 @@ class FeatureExtractor:
        }
        
        return row
-    
+
+    def _validate_row_quality(
+        self,
+        row: dict,
+        home_id: str,
+        away_id: str,
+        before_date: int,
+    ) -> tuple:
+        """
+        Data quality gate for training rows.
+
+        Ensures the feature vector has enough real signal to be useful for
+        training.  Rejects rows where critical features are all at their
+        default/fallback values — these teach the model noise, not patterns.
+
+        Args:
+            row: Extracted feature dict for the match.
+            home_id: Home team id, used to look up ``loader.team_matches``.
+            away_id: Away team id, used to look up ``loader.team_matches``.
+            before_date: Only history entries whose first element is strictly
+                smaller than this value count as prior matches.
+
+        Returns (pass: bool, reason: str | None).
+        """
+        # 1. Minimum form history: both teams must have enough prior matches.
+        # NOTE(review): assumes each history entry starts with the match
+        # timestamp in the same unit as before_date — confirm against loader.
+        team_matches = self.loader.team_matches
+        for side, team_id in (('home', home_id), ('away', away_id)):
+            prior = sum(
+                1 for m in team_matches.get(team_id, []) if m[0] < before_date
+            )
+            if prior < self.DQ_MIN_FORM_MATCHES:
+                return False, f'{side}_insufficient_history'
+
+        # 2. Feature coverage check: count how many key features are non-default
+        key_features = [
+            ('home_goals_avg', 1.3),
+            ('away_goals_avg', 1.3),
+            ('home_clean_sheet_rate', 0.25),
+            ('away_clean_sheet_rate', 0.25),
+            ('home_avg_possession', 0.50),
+            ('away_avg_possession', 0.50),
+            ('home_avg_shots_on_target', 3.5),
+            ('away_avg_shots_on_target', 3.5),
+            ('h2h_total_matches', 0),
+            ('odds_ms_h', 0.0),
+        ]
+        non_default = 0
+        for feat_name, default_val in key_features:
+            try:
+                value = float(row.get(feat_name, default_val))
+            except (TypeError, ValueError):
+                # None or non-numeric: no usable signal, so it counts as a
+                # default instead of crashing the whole extraction run.
+                continue
+            # 0.01 tolerance absorbs float noise around the fallback value.
+            if abs(value - default_val) > 0.01:
+                non_default += 1
+        coverage = non_default / len(key_features)
+
+        if coverage < self.DQ_MIN_FEATURE_COVERAGE:
+            return False, f'low_feature_coverage_{coverage:.0%}'
+
+        return True, None
+
    # -------------------------------------------------------------------------
    # ELO (simplified inline version — doesn't need DB, grows incrementally)
    # -------------------------------------------------------------------------
@@ -20,7 +20,7 @@ from sklearn.isotonic import IsotonicRegression
 warnings.filterwarnings("ignore")

 AI_DIR = Path(__file__).resolve().parent.parent
-DATA_CSV = AI_DIR / "data" / "training_data_v27.csv"
+DATA_CSV = AI_DIR / "data" / "training_data.csv"
 MODELS_DIR = AI_DIR / "models" / "v27"
 MODELS_DIR.mkdir(parents=True, exist_ok=True)

@@ -373,15 +373,52 @@ def main():
        print("\n" + "─"*65)
        print("  STAGE A.2: Fundamentals-Only O/U 2.5 Model")
        print("─"*65)
-        y_tr_ou = tr["label_ou25"].values
-        y_va_ou = va["label_ou25"].values
+        y_tr_ou = tr['label_ou25'].values
+        y_va_ou = va['label_ou25'].values
        mask_tr = ~np.isnan(y_tr_ou)
        mask_va = ~np.isnan(y_va_ou)
        if mask_tr.sum() > 1000:
            ou_models = train_fundamentals_model(
                X_tr[mask_tr], y_tr_ou[mask_tr].astype(int),
                X_va[mask_va], y_va_ou[mask_va].astype(int),
-                clean_feats, "ou25")
+                clean_feats, 'ou25')
+
+    # ── STAGE A.3: BTTS Model ──
+    btts_models = None
+    if 'label_btts' in tr.columns:
+        print('\n' + '─' * 65)
+        print('  STAGE A.3: Fundamentals-Only BTTS Model')
+        print('─' * 65)
+        y_tr_btts = tr['label_btts'].values
+        y_va_btts = va['label_btts'].values
+        mask_tr_btts = ~np.isnan(y_tr_btts)
+        mask_va_btts = ~np.isnan(y_va_btts)
+        if mask_tr_btts.sum() > 1000:
+            btts_models = train_fundamentals_model(
+                X_tr[mask_tr_btts], y_tr_btts[mask_tr_btts].astype(int),
+                X_va[mask_va_btts], y_va_btts[mask_va_btts].astype(int),
+                clean_feats, 'btts')
+
+            # Quick val accuracy
+            btts_probs = ensemble_predict(
+                btts_models,
+                X_va[mask_va_btts],
+                clean_feats,
+                n_class=2,
+            )
+            btts_acc = accuracy_score(
+                y_va_btts[mask_va_btts].astype(int),
+                btts_probs.argmax(1),
+            )
+            btts_ll = log_loss(
+                y_va_btts[mask_va_btts].astype(int),
+                btts_probs,
+            )
+            print(f'\n  BTTS Ensemble Val: acc={btts_acc:.4f}, logloss={btts_ll:.4f}')
+            # Compare with naive baseline (always predict majority class)
+            btts_majority = y_va_btts[mask_va_btts].astype(int).mean()
+            print(f'  BTTS baseline: {max(btts_majority, 1-btts_majority):.4f} (majority class)')
+            print(f'  Model vs baseline: {btts_acc - max(btts_majority, 1-btts_majority):+.4f}')

    # ── STAGE C: Backtest ──
    print("\n" + "─"*65)
@@ -422,13 +459,58 @@ def main():

    # OU25 backtest
    if ou_models:
-        print("\n  --- O/U 2.5 Backtest ---")
+        print('\n  --- O/U 2.5 Backtest ---')
        for edge in [0.05, 0.07, 0.10]:
-            r = backtest_value(ou_models, te, clean_feats, "ou25",
+            r = backtest_value(ou_models, te, clean_feats, 'ou25',
                               min_edge=edge, min_odds=1.50, max_odds=3.0,
                               use_kelly=True)
-            if r.get("total", 0) > 0:
-                print_backtest(r, f"OU25 edge>{edge}")
+            if r.get('total', 0) > 0:
+                print_backtest(r, f'OU25 edge>{edge}')
+
+    # BTTS backtest
+    if btts_models and 'label_btts' in te.columns:
+        print('\n  --- BTTS Backtest ---')
+        # Build BTTS odds for backtest
+        if 'odds_btts_y' in te.columns and 'odds_btts_n' in te.columns:
+            te_btts = te.copy()
+            te_btts['odds_btts_y'] = pd.to_numeric(
+                te_btts['odds_btts_y'], errors='coerce',
+            ).fillna(1.85)
+            te_btts['odds_btts_n'] = pd.to_numeric(
+                te_btts['odds_btts_n'], errors='coerce',
+            ).fillna(1.85)
+
+            for edge in [0.05, 0.07, 0.10]:
+                X_test = te_btts[clean_feats].values
+                probs = ensemble_predict(btts_models, X_test, clean_feats, 2)
+                y_btts = te_btts['label_btts'].values.astype(int)
+                odds_arr = te_btts[['odds_btts_n', 'odds_btts_y']].values
+                m_arr = 1 / odds_arr
+                impl = m_arr / m_arr.sum(axis=1, keepdims=True)
+
+                total_bets = 0
+                wins = 0
+                pnl = 0.0
+                for i in range(len(y_btts)):
+                    for cls in range(2):
+                        e = probs[i, cls] - impl[i, cls]
+                        o = odds_arr[i, cls]
+                        if e < edge or o < 1.50 or o > 3.0:
+                            continue
+                        total_bets += 1
+                        won = (y_btts[i] == cls)
+                        if won:
+                            wins += 1
+                            pnl += 10 * (o - 1)
+                        else:
+                            pnl -= 10
+                if total_bets > 0:
+                    roi = pnl / (total_bets * 10) * 100
+                    hit = wins / total_bets * 100
+                    print(
+                        f'    Edge>{edge:.2f}: {total_bets} bets, '
+                        f'hit={hit:.1f}%, ROI={roi:+.1f}%'
+                    )

    # ── Feature importance ──
    if "lgb" in ms_models:
@@ -452,25 +534,40 @@ def main():

    if ou_models:
        for name, m in ou_models.items():
-            p = MODELS_DIR / f"v27_ou25_{name}.pkl"
-            with open(p, "wb") as f:
+            p = MODELS_DIR / f'v27_ou25_{name}.pkl'
+            with open(p, 'wb') as f:
                pickle.dump(m, f)
-            print(f"  ✓ {p.name}")
+            print(f'  ✓ {p.name}')
+
+    if btts_models:
+        for name, m in btts_models.items():
+            p = MODELS_DIR / f'v27_btts_{name}.pkl'
+            with open(p, 'wb') as f:
+                pickle.dump(m, f)
+            print(f'  ✓ {p.name}')

    meta = {
-        "version": "v27-pro", "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
-        "approach": "odds-free fundamentals + value edge detection",
-        "feature_count": len(clean_feats),
-        "total_samples": len(df),
-        "val_acc": round(val_acc, 4), "val_ll": round(val_ll, 4),
-        "best_config": {k: v for k, v in best_cfg.items() if k != "result"} if best_cfg else {},
-        "markets": ["ms"] + (["ou25"] if ou_models else []),
+        'version': 'v27-pro',
+        'trained_at': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'approach': 'odds-free fundamentals + value edge detection',
+        'feature_count': len(clean_feats),
+        'total_samples': len(df),
+        'val_acc': round(val_acc, 4),
+        'val_ll': round(val_ll, 4),
+        'best_config': {
+            k: v for k, v in best_cfg.items() if k != 'result'
+        } if best_cfg else {},
+        'markets': (
+            ['ms']
+            + (['ou25'] if ou_models else [])
+            + (['btts'] if btts_models else [])
+        ),
    }
-    with open(MODELS_DIR / "v27_metadata.json", "w") as f:
+    with open(MODELS_DIR / 'v27_metadata.json', 'w') as f:
        json.dump(meta, f, indent=2, default=str)
-    with open(MODELS_DIR / "v27_feature_cols.json", "w") as f:
+    with open(MODELS_DIR / 'v27_feature_cols.json', 'w') as f:
        json.dump(clean_feats, f, indent=2)
-    print(f"  ✓ metadata + feature_cols")
+    print(f'  ✓ metadata + feature_cols')

    print(f"\n  Total time: {(time.time()-t0)/60:.1f} min")
    print("  DONE!")