feat(ai-engine): value sniper thresholds and logic relaxed

This commit is contained in:
2026-05-06 17:44:45 +03:00
parent 5b5f83c8cf
commit 4f7090e2d9
13 changed files with 2040 additions and 382 deletions
+146
View File
@@ -0,0 +1,146 @@
import os
import sys
import psycopg2
from psycopg2.extras import RealDictCursor
# Path setup: put the parent of this script's directory on sys.path so the `services` package can be imported
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from services.single_match_orchestrator import SingleMatchOrchestrator
from services.feature_enrichment import FeatureEnrichmentService
# Connection string for the backtest database.  A BACKTEST_DSN environment
# variable takes precedence; the literal below is only the legacy fallback so
# existing invocations keep working unchanged.
# SECURITY(review): the fallback embeds a password in source control — rotate
# it and remove the literal once BACKTEST_DSN is configured everywhere.
DSN = os.environ.get(
    "BACKTEST_DSN",
    "postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db",
)
def _is_winning_pick(market, pick, home_goals, away_goals):
    """Return True when ``pick`` on ``market`` wins given the final score.

    Supported markets: ``MS`` (match result), ``OU25`` / ``OU15`` (over/under),
    ``BTTS`` (both teams to score) and ``DC`` (double chance).  Any unknown
    market or pick is settled as a loss.
    """
    total_goals = home_goals + away_goals
    if market == "MS":
        return (
            (pick == "1" and home_goals > away_goals)
            or (pick in ("X", "0") and home_goals == away_goals)
            or (pick == "2" and away_goals > home_goals)
        )
    if market == "OU25":
        return (pick == "Üst" and total_goals > 2.5) or (pick == "Alt" and total_goals < 2.5)
    if market == "OU15":
        return (pick == "Üst" and total_goals > 1.5) or (pick == "Alt" and total_goals < 1.5)
    if market == "BTTS":
        return (
            (pick == "KG Var" and home_goals > 0 and away_goals > 0)
            or (pick == "KG Yok" and (home_goals == 0 or away_goals == 0))
        )
    if market == "DC":
        return (
            (pick == "1X" and home_goals >= away_goals)
            or (pick == "12" and home_goals != away_goals)
            or (pick == "X2" and home_goals <= away_goals)
        )
    return False


def run_backtest(target_date="2026-05-03"):
    """Replay the orchestrator over one day's finished matches and print a report.

    Loads every finished match (status FT/AET/PEN with a home score) whose
    kickoff falls on ``target_date``, asks ``SingleMatchOrchestrator`` to
    analyse each one, settles every pick it marks as playable against the
    final score, and prints per-match lines followed by hit-rate / ROI totals.

    Args:
        target_date: Day to backtest, as a ``YYYY-MM-DD`` string.

    Returns:
        None. All results are written to stdout.
    """
    # 1. Fetch the finished matches and team names for the target date.
    conn = psycopg2.connect(DSN)
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT m.id, m.score_home, m.score_away, m.mst_utc,
                t1.name as home_name, t2.name as away_name
                FROM matches m
                LEFT JOIN teams t1 ON m.home_team_id = t1.id
                LEFT JOIN teams t2 ON m.away_team_id = t2.id
                WHERE m.status IN ('FT', 'AET', 'PEN')
                AND to_timestamp(m.mst_utc / 1000.0)::date = %s::date
                AND m.score_home IS NOT NULL
                ORDER BY m.mst_utc ASC
            """, (target_date,))
            matches = cur.fetchall()
    finally:
        # This connection is only used for the query above (nothing below
        # receives it), so release it immediately instead of leaking it.
        conn.close()

    if not matches:
        print(f"{target_date} tarihinde bitmiş maç bulunamadı.")
        return

    print(f"🚀 {target_date} için Orkestratör Backtesti Başlatılıyor... ({len(matches)} maç bulundu)")
    print("-" * 60)

    orchestrator = SingleMatchOrchestrator()
    bets_placed = 0
    won = 0
    lost = 0
    total_odds_won = 0.0

    for match in matches:
        # 2. Let the orchestrator analyse the match; a failure skips the match.
        try:
            package = orchestrator.analyze_match(match['id'])
        except Exception as e:
            print(f"Hata ({match['id']}): {e}")
            continue
        if not package:
            continue

        h = match['score_home']
        a = match['score_away']
        # `or {}` guards against keys that are present but explicitly None.
        bet_advice = package.get("bet_advice") or {}
        main_pick = package.get("main_pick") or {}

        # 3. Did the orchestrator decide to place a bet on this match?
        # `== True` is kept on purpose: it also accepts 1 / numpy.bool_ values.
        if bet_advice.get("playable") == True:  # noqa: E712
            bets_placed += 1
            market = main_pick.get("market")
            pick = main_pick.get("pick")
            odds = float(main_pick.get("odds", 0.0) or 0.0)

            # Settle the pick against the final score.
            if _is_winning_pick(market, pick, h, a):
                won += 1
                total_odds_won += odds
                res = "✅ KAZANDI"
            else:
                lost += 1
                res = "❌ KAYBETTİ"
            print(f"[{res}] {match['home_name']} {h}-{a} {match['away_name']} | Tahmin: {market} {pick} (Oran: {odds})")
            continue

        # 4. Match was passed over: report the rejected pick and why.
        reasons = main_pick.get("reasons", ["Bilinmeyen Neden"]) if main_pick else ["No main pick"]
        if isinstance(reasons, list):
            reason = " | ".join(str(item) for item in reasons)
        else:
            reason = str(reasons)
        rejected_market = main_pick.get("market", "N/A") if main_pick else "N/A"
        rejected_pick = main_pick.get("pick", "N/A") if main_pick else "N/A"
        # Printed once per skipped match (the original emitted this line twice).
        print(f"[PAS] {match['home_name']} {h}-{a} {match['away_name']} | Reddedilen: {rejected_market} {rejected_pick} -> Neden: {reason}")
        if "market_passed_all_gates" in reason:
            print(f" DEBUG: bet_advice = {bet_advice}")

        # Compare the model's most likely match-result outcome with reality.
        market_board = package.get("market_board") or {}
        v25_ms = (market_board.get("MS") or {}).get("probs") or {}
        actual_ms = "1" if h > a else ("X" if h == a else "2")
        v25_top = max(v25_ms, key=v25_ms.get) if v25_ms else "N/A"
        print(f" [V25 MS Raw: {v25_top}] [Gerçek MS: {actual_ms}]")

    # Summary report.
    print("\n" + "=" * 60)
    print(f"📊 BACKTEST SONUÇLARI ({target_date})")
    print("=" * 60)
    print(f"Toplam Maç Sayısı : {len(matches)}")
    print(f"Oynanan Bahis Sayısı: {bets_placed} (Oynama Oranı: %{bets_placed/len(matches)*100:.1f})")
    print(f"Riskli Bulunup Pas Geçilen: {len(matches) - bets_placed}")
    if bets_placed > 0:
        win_rate = won / bets_placed * 100
        # Flat one-unit stakes: returns are the summed winning odds, cost is
        # one unit per bet placed.
        roi = ((total_odds_won - bets_placed) / bets_placed) * 100
        print(f"Kazanılan : {won}")
        print(f"Kaybedilen : {lost}")
        print(f"İsabet Oranı : %{win_rate:.1f}")
        print(f"Net Kar (ROI) : %{roi:.1f} {'📈' if roi > 0 else '📉'}")
# Script entry point: run the backtest for the fixed date below when executed directly.
if __name__ == "__main__":
    run_backtest("2026-05-03")
+459
View File
@@ -0,0 +1,459 @@
#!/usr/bin/env python3
"""
AI Features Full Enrichment Script
====================================
Fills empty/default columns in football_ai_features that were not populated
by the original elo_backfill_v1 script.
Enriches: H2H, referee, team_stats, league_averages, form_streaks,
rolling_goals, implied_odds, and clean_sheet/scoring rates.
Usage:
python scripts/enrich_ai_features.py # enrich all
python scripts/enrich_ai_features.py --batch-size 500 # smaller batches
python scripts/enrich_ai_features.py --dry-run # preview only
python scripts/enrich_ai_features.py --force # re-enrich all rows
python scripts/enrich_ai_features.py --limit 1000 # process N rows max
Designed to be idempotent: uses ON CONFLICT upserts, skips already-enriched rows.
"""
from __future__ import annotations
import os
import sys
import time
import argparse
from typing import Any, Dict, List, Optional, Tuple
# Add ai-engine root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
from data.db import get_clean_dsn
from services.feature_enrichment import FeatureEnrichmentService
# ────────────────────────── constants ──────────────────────────
# Version tag stored in the `calculator_ver` column of every row this script writes.
CALCULATOR_VER = 'enrichment_v2.0'
# Default for --batch-size: number of enriched rows buffered before each bulk upsert.
DEFAULT_BATCH_SIZE = 200
# ────────────────────────── helpers ────────────────────────────
def fetch_unenriched_matches(
    conn: psycopg2.extensions.connection,
    force: bool = False,
    limit: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch finished football matches whose feature rows still need enrichment.

    A row counts as un-enriched when both ``h2h_total`` and
    ``referee_avg_cards`` are still 0.  With ``force=True`` that filter is
    dropped and every finished football match with a feature row is returned.

    Args:
        conn: Open psycopg2 connection.
        force: Return all rows regardless of their enrichment state.
        limit: Maximum number of rows to return; ``None`` (or 0) means no cap.

    Returns:
        Rows ordered by kickoff (``mst_utc`` ascending), each with
        ``match_id``, ``home_team_id``, ``away_team_id``, ``mst_utc``,
        ``league_id``, ``score_home`` and ``score_away``.
    """
    # Only fixed SQL fragments are formatted into the statement; the
    # caller-supplied limit travels as a bound parameter.
    pending_filter = (
        "1=1" if force
        else "(faf.h2h_total = 0 AND faf.referee_avg_cards = 0)"
    )
    query = f"""
        SELECT
            faf.match_id,
            m.home_team_id,
            m.away_team_id,
            m.mst_utc,
            m.league_id,
            m.score_home,
            m.score_away
        FROM football_ai_features faf
        JOIN matches m ON m.id = faf.match_id
        WHERE m.status = 'FT'
          AND m.score_home IS NOT NULL
          AND m.sport = 'football'
          AND ({pending_filter})
        ORDER BY m.mst_utc ASC
    """
    params = None
    if limit:  # falsy limit (None / 0) keeps the original "no cap" meaning
        query += " LIMIT %s"
        params = (int(limit),)

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, params)
        return cur.fetchall()
def fetch_referee_for_match(
    cur: RealDictCursor,
    match_id: str,
) -> Optional[str]:
    """Look up the name of the official with role_id 1 for ``match_id``.

    Best-effort: returns ``None`` when no such official is recorded or when
    the lookup raises for any reason.
    """
    referee_name: Optional[str] = None
    try:
        cur.execute("""
            SELECT mo.name
            FROM match_officials mo
            WHERE mo.match_id = %s
            AND mo.role_id = 1
            LIMIT 1
        """, (match_id,))
        official = cur.fetchone()
        if official:
            referee_name = official['name']
    except Exception:
        # Any failure is treated the same as "no referee on record".
        referee_name = None
    return referee_name
def fetch_implied_odds(
    cur: RealDictCursor,
    match_id: str,
) -> Dict[str, float]:
    """Derive margin-free implied probabilities from a match's bookmaker odds.

    Reads every selection joined from odd_selections / odd_categories for
    ``match_id`` and normalises three markets: match result, over/under 2.5
    and both-teams-to-score.  A market is only used when all of its prices
    are above 1.0; otherwise its neutral default is kept (0.33 per match
    result outcome, 0.50 for the two-way markets, 0.0 overround).  If the
    query itself fails, the neutral defaults are returned unchanged.
    """
    result = {
        'implied_home': 0.33,
        'implied_draw': 0.33,
        'implied_away': 0.33,
        'implied_over25': 0.50,
        'implied_btts_yes': 0.50,
        'odds_overround': 0.0,
    }

    try:
        cur.execute("""
            SELECT oc.name AS cat_name, os.name AS sel_name, os.odd_value
            FROM odd_selections os
            JOIN odd_categories oc ON os.odd_category_db_id = oc.db_id
            WHERE oc.match_id = %s
        """, (match_id,))
        selections = cur.fetchall()
    except Exception:
        return result

    # Collect the latest usable price per outcome; later rows overwrite earlier ones.
    prices: Dict[str, float] = {}
    for selection in selections:
        try:
            category = (selection.get('cat_name') or '').lower().strip()
            label = (selection.get('sel_name') or '').strip()
            price = float(selection.get('odd_value', 0))
        except (ValueError, TypeError):
            continue  # unparsable price: ignore this selection
        if price <= 0:
            continue

        slot = None
        if category == 'maç sonucu':
            # Match result labels must match exactly; the draw appears as '0' or 'X'.
            slot = {'1': 'ms_h', '0': 'ms_d', 'X': 'ms_d', '2': 'ms_a'}.get(label)
        elif category == '2,5 alt/üst':
            lowered = label.lower()
            if 'üst' in lowered:
                slot = 'ou25_o'
            elif 'alt' in lowered:
                slot = 'ou25_u'
        elif category == 'karşılıklı gol':
            lowered = label.lower()
            if 'var' in lowered:
                slot = 'btts_y'
            elif 'yok' in lowered:
                slot = 'btts_n'
        if slot is not None:
            prices[slot] = price

    # Match result: normalise the three inverse prices and record the overround.
    home, draw, away = (prices.get(key, 0) for key in ('ms_h', 'ms_d', 'ms_a'))
    if all(price > 1.0 for price in (home, draw, away)):
        book_sum = 1 / home + 1 / draw + 1 / away
        result['implied_home'] = round((1 / home) / book_sum, 4)
        result['implied_draw'] = round((1 / draw) / book_sum, 4)
        result['implied_away'] = round((1 / away) / book_sum, 4)
        result['odds_overround'] = round(book_sum - 1.0, 4)

    # Two-way markets: only the "over" / "yes" side is reported.
    for yes_key, no_key, target in (
        ('ou25_o', 'ou25_u', 'implied_over25'),
        ('btts_y', 'btts_n', 'implied_btts_yes'),
    ):
        yes_price = prices.get(yes_key, 0)
        no_price = prices.get(no_key, 0)
        if yes_price > 1.0 and no_price > 1.0:
            book_sum = 1 / yes_price + 1 / no_price
            result[target] = round((1 / yes_price) / book_sum, 4)

    return result
def enrich_single_match(
    enrichment: FeatureEnrichmentService,
    cur: RealDictCursor,
    match: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build the full enrichment feature row for one match.

    Queries the enrichment service for team stats, head-to-head, form,
    referee, league and rolling-goal figures as of the match kickoff, adds
    the implied odds, and returns a dict keyed by football_ai_features
    column names, ready for the bulk upsert.
    """
    match_id = match['match_id']
    team_ids = {
        'home': str(match['home_team_id']),
        'away': str(match['away_team_id']),
    }
    kickoff = int(match['mst_utc']) if match['mst_utc'] else 0
    league_id = str(match['league_id']) if match['league_id'] else None
    sides = ('home', 'away')

    # Service calls, in a fixed order: home before away within each family.
    team_stats = {s: enrichment.compute_team_stats(cur, team_ids[s], kickoff) for s in sides}
    h2h = enrichment.compute_h2h(cur, team_ids['home'], team_ids['away'], kickoff)
    form = {s: enrichment.compute_form_streaks(cur, team_ids[s], kickoff) for s in sides}
    referee_name = fetch_referee_for_match(cur, match_id)
    referee = enrichment.compute_referee_stats(cur, referee_name, kickoff)
    league = enrichment.compute_league_averages(cur, league_id, kickoff)
    rolling = {s: enrichment.compute_rolling_stats(cur, team_ids[s], kickoff) for s in sides}
    implied = fetch_implied_odds(cur, match_id)

    features: Dict[str, Any] = {'match_id': match_id}

    def add_per_side(source, spec):
        # spec rows are (column suffix, source key, decimals); home precedes away.
        for suffix, key, digits in spec:
            for side in sides:
                features[f'{side}_{suffix}'] = round(source[side][key], digits)

    def add_prefixed(prefix, source, spec):
        # spec rows are (column suffix, source key, decimals).
        for suffix, key, digits in spec:
            features[f'{prefix}_{suffix}'] = round(source[key], digits)

    # Team stats
    add_per_side(team_stats, (
        ('avg_possession', 'avg_possession', 2),
        ('avg_shots_on_target', 'avg_shots_on_target', 2),
        ('shot_conversion', 'shot_conversion', 4),
        ('avg_corners', 'avg_corners', 2),
    ))

    # H2H (the match count is stored as-is, without rounding)
    features['h2h_total'] = h2h['total_matches']
    add_prefixed('h2h', h2h, (
        ('home_win_rate', 'home_win_rate', 4),
        ('avg_goals', 'avg_goals', 2),
        ('over25_rate', 'over25_rate', 4),
        ('btts_rate', 'btts_rate', 4),
    ))

    # Form (streak lengths are stored as-is, without rounding)
    add_per_side(form, (
        ('clean_sheet_rate', 'clean_sheet_rate', 4),
        ('scoring_rate', 'scoring_rate', 4),
    ))
    for side in sides:
        features[f'{side}_win_streak'] = form[side]['winning_streak']

    # Rolling goals
    add_per_side(rolling, (
        ('goals_avg_5', 'rolling5_goals', 2),
        ('conceded_avg_5', 'rolling5_conceded', 2),
    ))

    # Referee
    add_prefixed('referee', referee, (
        ('avg_cards', 'cards_total', 2),
        ('home_bias', 'home_bias', 4),
        ('avg_goals', 'avg_goals', 2),
    ))

    # League
    add_prefixed('league', league, (
        ('avg_goals', 'avg_goals', 2),
        ('home_win_pct', 'home_win_rate', 4),
        ('over25_pct', 'ou25_rate', 4),
    ))

    # Implied odds are already rounded by fetch_implied_odds; copy them through.
    for column in (
        'implied_home', 'implied_draw', 'implied_away',
        'implied_over25', 'implied_btts_yes', 'odds_overround',
    ):
        features[column] = implied[column]

    # Missing players impact — default (no lineup data for historical)
    features['missing_players_impact'] = 0.0
    # Version
    features['calculator_ver'] = CALCULATOR_VER
    return features
def flush_enrichment_batch(
    conn: psycopg2.extensions.connection,
    rows: List[Dict[str, Any]],
    dry_run: bool,
) -> int:
    """Bulk upsert enriched features into football_ai_features.

    Args:
        conn: Open psycopg2 connection; the batch is committed on success.
        rows: Feature dicts, each containing every column listed below.
        dry_run: When True nothing is written.

    Returns:
        Number of rows written: ``len(rows)`` after a successful commit, or 0
        for an empty batch or a dry run.

    Raises:
        Exception: Whatever the database driver raises is re-raised after the
            failed transaction has been rolled back, so ``conn`` stays usable.
    """
    if not rows or dry_run:
        return 0

    columns = [
        'match_id',
        'home_avg_possession', 'away_avg_possession',
        'home_avg_shots_on_target', 'away_avg_shots_on_target',
        'home_shot_conversion', 'away_shot_conversion',
        'home_avg_corners', 'away_avg_corners',
        'h2h_total', 'h2h_home_win_rate', 'h2h_avg_goals',
        'h2h_over25_rate', 'h2h_btts_rate',
        'home_clean_sheet_rate', 'away_clean_sheet_rate',
        'home_scoring_rate', 'away_scoring_rate',
        'home_win_streak', 'away_win_streak',
        'home_goals_avg_5', 'away_goals_avg_5',
        'home_conceded_avg_5', 'away_conceded_avg_5',
        'referee_avg_cards', 'referee_home_bias', 'referee_avg_goals',
        'league_avg_goals', 'league_home_win_pct', 'league_over25_pct',
        'implied_home', 'implied_draw', 'implied_away',
        'implied_over25', 'implied_btts_yes', 'odds_overround',
        'missing_players_impact', 'calculator_ver',
    ]

    # On conflict every column except the key is overwritten with the new value.
    set_clause = ', '.join(
        f'{c} = EXCLUDED.{c}' for c in columns if c != 'match_id'
    )
    placeholders = ', '.join(['%s'] * len(columns))
    values = [tuple(row[c] for c in columns) for row in rows]

    try:
        with conn.cursor() as cur:
            execute_values(
                cur,
                f"""
                INSERT INTO football_ai_features ({', '.join(columns)})
                VALUES %s
                ON CONFLICT (match_id) DO UPDATE SET
                {set_clause},
                updated_at = NOW()
                """,
                values,
                template=f"({placeholders})",
                page_size=200,
            )
        conn.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so the
        # connection is not stuck in that state, then surface the error.
        conn.rollback()
        raise

    return len(rows)
# ────────────────────────── main ───────────────────────────────
def run_enrichment(
    batch_size: int,
    dry_run: bool,
    force: bool,
    limit: Optional[int],
) -> None:
    """Core enrichment loop.

    Fetches the matches that need enrichment, computes their features one at
    a time, upserts them in batches of ``batch_size`` and prints progress and
    a final summary.

    Args:
        batch_size: Number of enriched rows buffered before each upsert.
        dry_run: Compute features but write nothing (``Written`` stays 0).
        force: Re-enrich every row instead of only un-enriched ones.
        limit: Optional cap on the number of matches processed.
    """
    dsn = get_clean_dsn()
    conn = psycopg2.connect(dsn)
    try:
        print(f"\n{'=' * 60}")
        print(f"🧠 AI Features Full Enrichment — {CALCULATOR_VER}")
        print(f" batch_size={batch_size} dry_run={dry_run} force={force}")
        print(f"{'=' * 60}")

        # 1. Fetch unenriched matches
        t0 = time.time()
        matches = fetch_unenriched_matches(conn, force=force, limit=limit)
        print(f"\n📊 {len(matches):,} matches to enrich ({time.time() - t0:.1f}s)")
        if not matches:
            print("✅ Nothing to enrich — all rows already populated.")
            return

        # 2. Initialize enrichment service
        enrichment = FeatureEnrichmentService()

        # 3. Process in batches
        total = len(matches)
        processed = 0
        written = 0
        errors = 0
        batch_buf: List[Dict[str, Any]] = []
        t_start = time.time()

        # One dedicated RealDictCursor serves all enrichment queries.
        with conn.cursor(cursor_factory=RealDictCursor) as enrich_cur:
            for match in matches:
                try:
                    batch_buf.append(
                        enrich_single_match(enrichment, enrich_cur, match)
                    )
                except Exception as e:
                    errors += 1
                    if errors <= 10:  # cap the noise; the total is reported at the end
                        print(f" ⚠️ Error enriching {match.get('match_id', '?')}: {e}")
                    # A failed SQL statement aborts the open transaction and
                    # would make every following query fail too.  Nothing is
                    # pending (flushes commit themselves), so rolling back is
                    # safe and lets the next match start clean.
                    conn.rollback()
                processed += 1

                # Flush batch
                if len(batch_buf) >= batch_size:
                    written += flush_enrichment_batch(conn, batch_buf, dry_run)
                    batch_buf.clear()

                # Progress reporting
                if processed % 500 == 0:
                    elapsed = time.time() - t_start
                    rate = processed / elapsed if elapsed > 0 else 0
                    remaining = (total - processed) / rate if rate > 0 else 0
                    pct = processed / total * 100
                    print(
                        f" [{processed:>8,} / {total:,}] "
                        f"({pct:.1f}%) | {rate:.0f} matches/s | "
                        f"ETA: {remaining / 60:.1f} min | "
                        f"errors: {errors}"
                    )

            # Flush remaining
            if batch_buf:
                written += flush_enrichment_batch(conn, batch_buf, dry_run)

        elapsed = time.time() - t_start
        # Guard the final rate the same way as the progress line above.
        rate = processed / elapsed if elapsed > 0 else 0
        print(f"\n{'=' * 60}")
        print(f"✅ Enrichment complete:")
        print(f" Processed: {processed:,} matches in {elapsed:.1f}s")
        print(f" Written: {written:,} rows")
        print(f" Errors: {errors:,}")
        print(f" Rate: {rate:.0f} matches/s")
        print(f"{'=' * 60}")
    finally:
        # Always release the connection, including on unexpected errors.
        conn.close()
def main() -> None:
    """Parse the command-line flags and hand them to ``run_enrichment``."""
    cli = argparse.ArgumentParser(
        description="Enrich football_ai_features with H2H, referee, stats, and odds data"
    )
    cli.add_argument(
        '--batch-size',
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f'DB insert batch size (default: {DEFAULT_BATCH_SIZE})',
    )
    # The two boolean switches share the same shape, so register them together.
    for flag, help_text in (
        ('--dry-run', 'Compute features but do not write to DB'),
        ('--force', 'Re-enrich ALL rows, not just empty ones'),
    ):
        cli.add_argument(flag, action='store_true', help=help_text)
    cli.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Max number of matches to process',
    )

    options = cli.parse_args()
    # The namespace holds exactly batch_size, dry_run, force and limit, which
    # are the keyword parameters run_enrichment expects.
    run_enrichment(**vars(options))
# Script entry point: parse CLI flags and run the enrichment when executed directly.
if __name__ == '__main__':
    main()
+88 -24
View File
@@ -510,16 +510,24 @@ class FeatureExtractor:
self.referee_engine = get_referee_engine()
self.momentum_engine = get_momentum_engine()
# ── Data Quality Thresholds ──
# Matches below these thresholds produce default-only features that
# teach the model noise rather than signal.
DQ_MIN_FORM_MATCHES = 3 # team must have ≥3 prior matches
DQ_MIN_FEATURE_COVERAGE = 0.30 # ≥30% of key features must be non-default
def extract_all(self) -> list:
"""Extract features for all matches, yield row dicts."""
"""Extract features for all matches with data quality validation."""
matches = self.loader.matches
total = len(matches)
rows = []
skipped = 0
dq_rejected = 0
dq_reasons: dict = defaultdict(int)
t_start = time.time()
print(f"\n🔄 Extracting features for {total} matches...", flush=True)
# Process chronologically — ELO grows as we go
for i, m in enumerate(matches):
(
@@ -536,38 +544,43 @@ class FeatureExtractor:
away_name,
league_name,
) = m
if i % 100 == 0 and i > 0:
elapsed = time.time() - t_start
rate = i / elapsed # matches per second
remaining = (total - i) / rate if rate > 0 else 0
pct = i / total * 100
print(f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
print(
f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | "
f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | "
f"dq_rejected: {dq_rejected}",
flush=True,
)
row = self._extract_one(
mid,
hid,
aid,
sh,
sa,
hth,
hta,
mst,
lid,
home_name,
away_name,
league_name,
mid, hid, aid, sh, sa, hth, hta, mst, lid,
home_name, away_name, league_name,
)
if row:
rows.append(row)
# ── Data Quality Gate ──
dq_pass, reason = self._validate_row_quality(row, hid, aid, mst)
if dq_pass:
rows.append(row)
else:
dq_rejected += 1
dq_reasons[reason] += 1
else:
skipped += 1
# Update ELO after processing (so ELO is calculated BEFORE the match)
self._update_elo(hid, aid, sh, sa)
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True)
if dq_reasons:
print(f" 📊 DQ Rejection reasons:")
for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]):
print(f" {reason}: {count}")
return rows
def _extract_one(
@@ -867,7 +880,58 @@ class FeatureExtractor:
}
return row
def _validate_row_quality(
self,
row: dict,
home_id: str,
away_id: str,
before_date: int,
) -> tuple:
"""
Data quality gate for training rows.
Ensures the feature vector has enough real signal to be useful for
training. Rejects rows where critical features are all at their
default/fallback values — these teach the model noise, not patterns.
Returns (pass: bool, reason: str | None).
"""
# 1. Minimum form history: both teams must have enough prior matches
home_history = self.loader.team_matches.get(home_id, [])
away_history = self.loader.team_matches.get(away_id, [])
home_prior = sum(1 for m in home_history if m[0] < before_date)
away_prior = sum(1 for m in away_history if m[0] < before_date)
if home_prior < self.DQ_MIN_FORM_MATCHES:
return False, 'home_insufficient_history'
if away_prior < self.DQ_MIN_FORM_MATCHES:
return False, 'away_insufficient_history'
# 2. Feature coverage check: count how many key features are non-default
key_features = [
('home_goals_avg', 1.3),
('away_goals_avg', 1.3),
('home_clean_sheet_rate', 0.25),
('away_clean_sheet_rate', 0.25),
('home_avg_possession', 0.50),
('away_avg_possession', 0.50),
('home_avg_shots_on_target', 3.5),
('away_avg_shots_on_target', 3.5),
('h2h_total_matches', 0),
('odds_ms_h', 0.0),
]
non_default = sum(
1 for feat_name, default_val in key_features
if abs(float(row.get(feat_name, default_val)) - default_val) > 0.01
)
coverage = non_default / len(key_features)
if coverage < self.DQ_MIN_FEATURE_COVERAGE:
return False, f'low_feature_coverage_{coverage:.0%}'
return True, None
# -------------------------------------------------------------------------
# ELO (simplified inline version — doesn't need DB, grows incrementally)
# -------------------------------------------------------------------------
+118 -21
View File
@@ -20,7 +20,7 @@ from sklearn.isotonic import IsotonicRegression
warnings.filterwarnings("ignore")
AI_DIR = Path(__file__).resolve().parent.parent
DATA_CSV = AI_DIR / "data" / "training_data_v27.csv"
DATA_CSV = AI_DIR / "data" / "training_data.csv"
MODELS_DIR = AI_DIR / "models" / "v27"
MODELS_DIR.mkdir(parents=True, exist_ok=True)
@@ -373,15 +373,52 @@ def main():
print("\n" + ""*65)
print(" STAGE A.2: Fundamentals-Only O/U 2.5 Model")
print(""*65)
y_tr_ou = tr["label_ou25"].values
y_va_ou = va["label_ou25"].values
y_tr_ou = tr['label_ou25'].values
y_va_ou = va['label_ou25'].values
mask_tr = ~np.isnan(y_tr_ou)
mask_va = ~np.isnan(y_va_ou)
if mask_tr.sum() > 1000:
ou_models = train_fundamentals_model(
X_tr[mask_tr], y_tr_ou[mask_tr].astype(int),
X_va[mask_va], y_va_ou[mask_va].astype(int),
clean_feats, "ou25")
clean_feats, 'ou25')
# ── STAGE A.3: BTTS Model ──
btts_models = None
if 'label_btts' in tr.columns:
print('\n' + '' * 65)
print(' STAGE A.3: Fundamentals-Only BTTS Model')
print('' * 65)
y_tr_btts = tr['label_btts'].values
y_va_btts = va['label_btts'].values
mask_tr_btts = ~np.isnan(y_tr_btts)
mask_va_btts = ~np.isnan(y_va_btts)
if mask_tr_btts.sum() > 1000:
btts_models = train_fundamentals_model(
X_tr[mask_tr_btts], y_tr_btts[mask_tr_btts].astype(int),
X_va[mask_va_btts], y_va_btts[mask_va_btts].astype(int),
clean_feats, 'btts')
# Quick val accuracy
btts_probs = ensemble_predict(
btts_models,
X_va[mask_va_btts],
clean_feats,
n_class=2,
)
btts_acc = accuracy_score(
y_va_btts[mask_va_btts].astype(int),
btts_probs.argmax(1),
)
btts_ll = log_loss(
y_va_btts[mask_va_btts].astype(int),
btts_probs,
)
print(f'\n BTTS Ensemble Val: acc={btts_acc:.4f}, logloss={btts_ll:.4f}')
# Compare with naive baseline (always predict majority class)
btts_majority = y_va_btts[mask_va_btts].astype(int).mean()
print(f' BTTS baseline: {max(btts_majority, 1-btts_majority):.4f} (majority class)')
print(f' Model vs baseline: {btts_acc - max(btts_majority, 1-btts_majority):+.4f}')
# ── STAGE C: Backtest ──
print("\n" + ""*65)
@@ -422,13 +459,58 @@ def main():
# OU25 backtest
if ou_models:
print("\n --- O/U 2.5 Backtest ---")
print('\n --- O/U 2.5 Backtest ---')
for edge in [0.05, 0.07, 0.10]:
r = backtest_value(ou_models, te, clean_feats, "ou25",
r = backtest_value(ou_models, te, clean_feats, 'ou25',
min_edge=edge, min_odds=1.50, max_odds=3.0,
use_kelly=True)
if r.get("total", 0) > 0:
print_backtest(r, f"OU25 edge>{edge}")
if r.get('total', 0) > 0:
print_backtest(r, f'OU25 edge>{edge}')
# BTTS backtest
if btts_models and 'label_btts' in te.columns:
print('\n --- BTTS Backtest ---')
# Build BTTS odds for backtest
if 'odds_btts_y' in te.columns and 'odds_btts_n' in te.columns:
te_btts = te.copy()
te_btts['odds_btts_y'] = pd.to_numeric(
te_btts['odds_btts_y'], errors='coerce',
).fillna(1.85)
te_btts['odds_btts_n'] = pd.to_numeric(
te_btts['odds_btts_n'], errors='coerce',
).fillna(1.85)
for edge in [0.05, 0.07, 0.10]:
X_test = te_btts[clean_feats].values
probs = ensemble_predict(btts_models, X_test, clean_feats, 2)
y_btts = te_btts['label_btts'].values.astype(int)
odds_arr = te_btts[['odds_btts_n', 'odds_btts_y']].values
m_arr = 1 / odds_arr
impl = m_arr / m_arr.sum(axis=1, keepdims=True)
total_bets = 0
wins = 0
pnl = 0.0
for i in range(len(y_btts)):
for cls in range(2):
e = probs[i, cls] - impl[i, cls]
o = odds_arr[i, cls]
if e < edge or o < 1.50 or o > 3.0:
continue
total_bets += 1
won = (y_btts[i] == cls)
if won:
wins += 1
pnl += 10 * (o - 1)
else:
pnl -= 10
if total_bets > 0:
roi = pnl / (total_bets * 10) * 100
hit = wins / total_bets * 100
print(
f' Edge>{edge:.2f}: {total_bets} bets, '
f'hit={hit:.1f}%, ROI={roi:+.1f}%'
)
# ── Feature importance ──
if "lgb" in ms_models:
@@ -452,25 +534,40 @@ def main():
if ou_models:
for name, m in ou_models.items():
p = MODELS_DIR / f"v27_ou25_{name}.pkl"
with open(p, "wb") as f:
p = MODELS_DIR / f'v27_ou25_{name}.pkl'
with open(p, 'wb') as f:
pickle.dump(m, f)
print(f"{p.name}")
print(f'{p.name}')
if btts_models:
for name, m in btts_models.items():
p = MODELS_DIR / f'v27_btts_{name}.pkl'
with open(p, 'wb') as f:
pickle.dump(m, f)
print(f'{p.name}')
meta = {
"version": "v27-pro", "trained_at": time.strftime("%Y-%m-%d %H:%M:%S"),
"approach": "odds-free fundamentals + value edge detection",
"feature_count": len(clean_feats),
"total_samples": len(df),
"val_acc": round(val_acc, 4), "val_ll": round(val_ll, 4),
"best_config": {k: v for k, v in best_cfg.items() if k != "result"} if best_cfg else {},
"markets": ["ms"] + (["ou25"] if ou_models else []),
'version': 'v27-pro',
'trained_at': time.strftime('%Y-%m-%d %H:%M:%S'),
'approach': 'odds-free fundamentals + value edge detection',
'feature_count': len(clean_feats),
'total_samples': len(df),
'val_acc': round(val_acc, 4),
'val_ll': round(val_ll, 4),
'best_config': {
k: v for k, v in best_cfg.items() if k != 'result'
} if best_cfg else {},
'markets': (
['ms']
+ (['ou25'] if ou_models else [])
+ (['btts'] if btts_models else [])
),
}
with open(MODELS_DIR / "v27_metadata.json", "w") as f:
with open(MODELS_DIR / 'v27_metadata.json', 'w') as f:
json.dump(meta, f, indent=2, default=str)
with open(MODELS_DIR / "v27_feature_cols.json", "w") as f:
with open(MODELS_DIR / 'v27_feature_cols.json', 'w') as f:
json.dump(clean_feats, f, indent=2)
print(f" ✓ metadata + feature_cols")
print(f' ✓ metadata + feature_cols')
print(f"\n Total time: {(time.time()-t0)/60:.1f} min")
print(" DONE!")