feat(ai-engine): value sniper thresholds and logic relaxed

2026-05-06 17:44:45 +03:00
parent 5b5f83c8cf
commit 4f7090e2d9
13 changed files with 2040 additions and 382 deletions
@@ -510,16 +510,24 @@ class FeatureExtractor:
        self.referee_engine = get_referee_engine()
        self.momentum_engine = get_momentum_engine()
    
+    # ── Data Quality Thresholds ──
+    # Matches below these thresholds produce default-only features that
+    # teach the model noise rather than signal.
+    DQ_MIN_FORM_MATCHES = 3          # team must have ≥3 prior matches
+    DQ_MIN_FEATURE_COVERAGE = 0.30   # ≥30% of key features must be non-default
+
    def extract_all(self) -> list:
-        """Extract features for all matches, yield row dicts."""
+        """Extract features for all matches with data quality validation."""
        matches = self.loader.matches
        total = len(matches)
        rows = []
        skipped = 0
+        dq_rejected = 0
+        dq_reasons: dict = defaultdict(int)
        t_start = time.time()
-        
+
        print(f"\n🔄 Extracting features for {total} matches...", flush=True)
-        
+
        # Process chronologically — ELO grows as we go
        for i, m in enumerate(matches):
            (
@@ -536,38 +544,43 @@ class FeatureExtractor:
                away_name,
                league_name,
            ) = m
-            
+
            if i % 100 == 0 and i > 0:
                elapsed = time.time() - t_start
                rate = i / elapsed  # matches per second
                remaining = (total - i) / rate if rate > 0 else 0
                pct = i / total * 100
-                print(f"  [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | ETA: {remaining/60:.1f} dk | skipped: {skipped}", flush=True)
-            
+                print(
+                    f"  [{i}/{total}] ({pct:.0f}%) | {rate:.1f} maç/s | "
+                    f"ETA: {remaining/60:.1f} dk | skipped: {skipped} | "
+                    f"dq_rejected: {dq_rejected}",
+                    flush=True,
+                )
+
            row = self._extract_one(
-                mid,
-                hid,
-                aid,
-                sh,
-                sa,
-                hth,
-                hta,
-                mst,
-                lid,
-                home_name,
-                away_name,
-                league_name,
+                mid, hid, aid, sh, sa, hth, hta, mst, lid,
+                home_name, away_name, league_name,
            )
-            
+
            if row:
-                rows.append(row)
+                # ── Data Quality Gate ──
+                dq_pass, reason = self._validate_row_quality(row, hid, aid, mst)
+                if dq_pass:
+                    rows.append(row)
+                else:
+                    dq_rejected += 1
+                    dq_reasons[reason] += 1
            else:
                skipped += 1
-            
+
            # Update ELO after processing (so ELO is calculated BEFORE the match)
            self._update_elo(hid, aid, sh, sa)
-        
-        print(f"  ✅ Extracted {len(rows)} rows, skipped {skipped}", flush=True)
+
+        print(f"  ✅ Extracted {len(rows)} rows, skipped {skipped}, DQ rejected {dq_rejected}", flush=True)
+        if dq_reasons:
+            print(f"  📊 DQ Rejection reasons:")
+            for reason, count in sorted(dq_reasons.items(), key=lambda x: -x[1]):
+                print(f"     {reason}: {count}")
        return rows
    
    def _extract_one(
@@ -867,7 +880,58 @@ class FeatureExtractor:
        }
        
        return row
-    
+
+    def _validate_row_quality(
+        self,
+        row: dict,
+        home_id: str,
+        away_id: str,
+        before_date: int,
+    ) -> tuple:
+        """
+        Data quality gate for training rows.
+
+        Ensures the feature vector has enough real signal to be useful for
+        training.  Rejects rows where critical features are all at their
+        default/fallback values — these teach the model noise, not patterns.
+
+        Returns (pass: bool, reason: str | None).
+        """
+        # 1. Minimum form history: both teams must have enough prior matches
+        home_history = self.loader.team_matches.get(home_id, [])
+        away_history = self.loader.team_matches.get(away_id, [])
+        home_prior = sum(1 for m in home_history if m[0] < before_date)
+        away_prior = sum(1 for m in away_history if m[0] < before_date)
+
+        if home_prior < self.DQ_MIN_FORM_MATCHES:
+            return False, 'home_insufficient_history'
+        if away_prior < self.DQ_MIN_FORM_MATCHES:
+            return False, 'away_insufficient_history'
+
+        # 2. Feature coverage check: count how many key features are non-default
+        key_features = [
+            ('home_goals_avg', 1.3),
+            ('away_goals_avg', 1.3),
+            ('home_clean_sheet_rate', 0.25),
+            ('away_clean_sheet_rate', 0.25),
+            ('home_avg_possession', 0.50),
+            ('away_avg_possession', 0.50),
+            ('home_avg_shots_on_target', 3.5),
+            ('away_avg_shots_on_target', 3.5),
+            ('h2h_total_matches', 0),
+            ('odds_ms_h', 0.0),
+        ]
+        non_default = sum(
+            1 for feat_name, default_val in key_features
+            if abs(float(row.get(feat_name, default_val)) - default_val) > 0.01
+        )
+        coverage = non_default / len(key_features)
+
+        if coverage < self.DQ_MIN_FEATURE_COVERAGE:
+            return False, f'low_feature_coverage_{coverage:.0%}'
+
+        return True, None
+
    # -------------------------------------------------------------------------
    # ELO (simplified inline version — doesn't need DB, grows incrementally)
    # -------------------------------------------------------------------------