feat(ai-engine): value sniper thresholds and logic relaxed
This commit is contained in:
@@ -510,16 +510,24 @@ class FeatureExtractor:
|
||||
self.referee_engine = get_referee_engine()
|
||||
self.momentum_engine = get_momentum_engine()
|
||||
|
||||
# ── Data Quality Thresholds ──
|
||||
# Matches below these thresholds produce default-only features that
|
||||
# teach the model noise rather than signal.
|
||||
# Minimum number of prior matches (dated before the fixture) each side needs;
# _validate_row_quality rejects the row if either team has fewer.
DQ_MIN_FORM_MATCHES = 3 # team must have ≥3 prior matches
|
||||
# Minimum fraction of the key features that must differ from their default
# value; rows below this are rejected as low_feature_coverage.
DQ_MIN_FEATURE_COVERAGE = 0.30 # ≥30% of key features must be non-default
|
||||
|
||||
def extract_all(self) -> list:
|
||||
"""Extract features for all matches, yield row dicts."""
|
||||
"""Extract features for all matches with data quality validation."""
|
||||
matches = self.loader.matches
|
||||
total = len(matches)
|
||||
rows = []
|
||||
skipped = 0
|
||||
dq_rejected = 0
|
||||
dq_reasons: dict = defaultdict(int)
|
||||
t_start = time.time()
|
||||
|
||||
|
||||
print(f"\n🔄 Extracting features for {total} matches...", flush=True)
|
||||
|
||||
|
||||
# Process chronologically — ELO grows as we go
|
||||
for i, m in enumerate(matches):
|
||||
(
|
||||
@@ -536,38 +544,43 @@ class FeatureExtractor:
|
||||
away_name,
|
||||
league_name,
|
||||
) = m
|
||||
|
||||
|
||||
if i % 100 == 0 and i > 0:
|
||||
elapsed = time.time() - t_start
|
||||
rate = i / elapsed # matches per second
|
||||
remaining = (total - i) / rate if rate > 0 else 0
|
||||
pct = i / total * 100
|
||||
print(f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
|
||||
|
||||
print(
|
||||
f" [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | "
|
||||
f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | "
|
||||
f"dq_rejected: {dq_rejected}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
row = self._extract_one(
|
||||
mid,
|
||||
hid,
|
||||
aid,
|
||||
sh,
|
||||
sa,
|
||||
hth,
|
||||
hta,
|
||||
mst,
|
||||
lid,
|
||||
home_name,
|
||||
away_name,
|
||||
league_name,
|
||||
mid, hid, aid, sh, sa, hth, hta, mst, lid,
|
||||
home_name, away_name, league_name,
|
||||
)
|
||||
|
||||
|
||||
if row:
|
||||
rows.append(row)
|
||||
# ── Data Quality Gate ──
|
||||
dq_pass, reason = self._validate_row_quality(row, hid, aid, mst)
|
||||
if dq_pass:
|
||||
rows.append(row)
|
||||
else:
|
||||
dq_rejected += 1
|
||||
dq_reasons[reason] += 1
|
||||
else:
|
||||
skipped += 1
|
||||
|
||||
|
||||
# Update ELO after processing (so ELO is calculated BEFORE the match)
|
||||
self._update_elo(hid, aid, sh, sa)
|
||||
|
||||
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
|
||||
|
||||
print(f" ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True)
|
||||
if dq_reasons:
|
||||
print(f" 📊 DQ Rejection reasons:")
|
||||
for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]):
|
||||
print(f" {reason}: {count}")
|
||||
return rows
|
||||
|
||||
def _extract_one(
|
||||
@@ -867,7 +880,58 @@ class FeatureExtractor:
|
||||
}
|
||||
|
||||
return row
|
||||
|
||||
|
||||
def _validate_row_quality(
    self,
    row: dict,
    home_id: str,
    away_id: str,
    before_date: int,
) -> tuple:
    """
    Data quality gate for training rows.

    Ensures the feature vector has enough real signal to be useful for
    training. Rejects rows where critical features are all at their
    default/fallback values — these teach the model noise, not patterns.

    Args:
        row: Feature dict for one match; only the key features listed
            below are inspected.
        home_id: Key of the home team in ``self.loader.team_matches``.
        away_id: Key of the away team in ``self.loader.team_matches``.
        before_date: Cut-off compared against the first element of each
            history entry (presumably the match timestamp — confirm
            against the loader); only strictly earlier entries count.

    Returns (pass: bool, reason: str | None). ``reason`` is ``None`` when
    the row passes, otherwise one of ``'home_insufficient_history'``,
    ``'away_insufficient_history'`` or ``'low_feature_coverage_<pct>%'``.
    """
    # 1. Minimum form history: both teams must have enough prior matches.
    #    The away side is only scanned once the home side has passed.
    min_matches = self.DQ_MIN_FORM_MATCHES
    team_matches = self.loader.team_matches

    home_prior = sum(
        1 for m in team_matches.get(home_id, []) if m[0] < before_date
    )
    if home_prior < min_matches:
        return False, 'home_insufficient_history'

    away_prior = sum(
        1 for m in team_matches.get(away_id, []) if m[0] < before_date
    )
    if away_prior < min_matches:
        return False, 'away_insufficient_history'

    # 2. Feature coverage check: count how many key features are non-default.
    #    Each entry is (feature name, default/fallback value).
    key_features = [
        ('home_goals_avg', 1.3),
        ('away_goals_avg', 1.3),
        ('home_clean_sheet_rate', 0.25),
        ('away_clean_sheet_rate', 0.25),
        ('home_avg_possession', 0.50),
        ('away_avg_possession', 0.50),
        ('home_avg_shots_on_target', 3.5),
        ('away_avg_shots_on_target', 3.5),
        ('h2h_total_matches', 0),
        ('odds_ms_h', 0.0),
    ]

    non_default = 0
    for feat_name, default_val in key_features:
        try:
            value = float(row.get(feat_name, default_val))
        except (TypeError, ValueError):
            # Key present but None / non-numeric: dict.get does not fall
            # back to the default here, and such a value carries no
            # signal, so count it as default instead of crashing.
            continue
        # 0.01 tolerance absorbs float noise around the default value.
        if abs(value - default_val) > 0.01:
            non_default += 1

    coverage = non_default / len(key_features)

    if coverage < self.DQ_MIN_FEATURE_COVERAGE:
        return False, f'low_feature_coverage_{coverage:.0%}'

    return True, None
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# ELO (simplified inline version — doesn't need DB, grows incrementally)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user