From 22596e69f2fe24bc4de7099769f495698d48311c Mon Sep 17 00:00:00 2001
From: Fahri Can <fahricansecer@gmail.com>
Date: Tue, 5 May 2026 20:19:25 +0300
Subject: [PATCH] fix(predictions): circuit breaker resilience + graceful
 degradation

- Reset consecutiveFailures on cooldown expiry (half-open state)
  so a single retry failure doesn't immediately re-open the circuit
- Exclude AI Engine app-level 500s from circuit breaker count; only
  network/infra errors count (no response/timeout, 429, 502, 503, 504)
- Return null gracefully instead of throwing 503 when no cache exists
- Add DB fallback, then cache fallback, for non-cooldown AI Engine
  failures
- Remove blocking wait-and-retry that held requests for up to 20s
---
 src/common/utils/ai-engine-client.ts          | 17 +++++-
 .../predictions/predictions.service.ts        | 61 +++++++++----------
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/src/common/utils/ai-engine-client.ts b/src/common/utils/ai-engine-client.ts
index 9ab26bf..c6d9768 100644
--- a/src/common/utils/ai-engine-client.ts
+++ b/src/common/utils/ai-engine-client.ts
@@ -183,8 +183,11 @@ export class AiEngineClient {
     }
 
     this.logger.warn(
-      `[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt`,
+      `[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt (resetting failures from ${this.consecutiveFailures})`,
     );
+    // Half-open state: reset failures so a single retry failure doesn't
+    // immediately re-open the circuit at threshold+1
+    this.consecutiveFailures = 0;
     this.circuitOpenedAt = null;
   }
 
@@ -233,8 +236,18 @@ export class AiEngineClient {
     if (!error.response) {
       return true; // Network error, timeout, etc.
     }
+    // Only count infrastructure-level errors toward circuit breaker:
+    // - No response (network failure) → already handled above
+    // - Timeout (ECONNABORTED) → infrastructure
+    // - 429 (rate limit) → infrastructure
+    // - 502/503/504 (proxy/gateway errors) → infrastructure
+    // Do NOT count 500 (app-level crash in AI Engine) — it may be
+    // match-specific and shouldn't block all other matches.
+    if (error.code === 'ECONNABORTED') {
+      return true;
+    }
     const status = error.response.status;
-    return status >= 500 || status === 429;
+    return status === 429 || status === 502 || status === 503 || status === 504;
   }
 
   private toRequestError(error: unknown): AiEngineRequestError {
diff --git a/src/modules/predictions/predictions.service.ts b/src/modules/predictions/predictions.service.ts
index 9d8a0bf..b2c3dd0 100755
--- a/src/modules/predictions/predictions.service.ts
+++ b/src/modules/predictions/predictions.service.ts
@@ -278,40 +278,36 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
           return this.enrichPredictionResponse(cachedPrediction, matchContext);
         }
 
-        // 4) No cached data at all — wait out cooldown and retry once
-        const cooldownMs = this.extractCooldownMs(detail);
-        if (cooldownMs > 0 && cooldownMs <= 20000) {
-          this.logger.warn(
-            `AI Engine cooldown for ${matchId}; no cached data — waiting ${cooldownMs}ms and retrying...`,
-          );
-          await new Promise((resolve) => setTimeout(resolve, cooldownMs + 500));
-          try {
-            const retryResponse =
-              await this.aiEngineClient.post<MatchPredictionDto>(
-                `/v20plus/analyze/${matchId}`,
-                { simulate: true, is_simulation: true, pre_match_only: true },
-              );
-            const retryPrediction = this.enrichPredictionResponse(
-              retryResponse.data,
-              matchContext,
-            );
-            await this.recordPredictionRun(matchId, retryResponse.data);
-            await this.cachePrediction(matchId, retryPrediction);
-            return retryPrediction;
-          } catch (retryErr: unknown) {
-            this.logger.error(
-              `AI Engine retry after cooldown also failed for ${matchId}`,
-            );
-            // Fall through to error handling below
-          }
-        }
+        // 4) No cached data at all — return null gracefully
+        this.logger.warn(
+          `AI Engine cooldown for ${matchId}; no cached data available — returning null gracefully`,
+        );
+        return null;
+      }
+
+      // ── Non-cooldown errors (e.g. AI Engine 500 for this match) ──
+      // Try DB fallback before giving up
+      const storedFallback = await this.getStoredPrediction(matchId);
+      if (storedFallback) {
+        this.logger.warn(
+          `AI Engine failed for ${matchId} (status=${status}); returning stored prediction as fallback`,
+        );
+        return this.enrichPredictionResponse(storedFallback, matchContext);
+      }
+
+      const cachedFallback = await this.getCachedPrediction(matchId);
+      if (cachedFallback) {
+        this.logger.warn(
+          `AI Engine failed for ${matchId} (status=${status}); returning cached prediction as fallback`,
+        );
+        return this.enrichPredictionResponse(cachedFallback, matchContext);
       }
 
       this.logger.error(
         `Direct AI Engine call failed for ${matchId}: status=${status}, detail=${JSON.stringify(detail)}`,
       );
 
-      // Forward AI Engine's actual error
+      // Forward AI Engine's actual error for client-meaningful statuses
       if (status === 404) {
         throw new HttpException(
           `Match not found in AI Engine: ${matchId}`,
@@ -324,10 +320,13 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
           HttpStatus.UNPROCESSABLE_ENTITY,
         );
       }
-      throw new HttpException(
-        `AI Engine error: ${typeof detail === "string" ? detail : JSON.stringify(detail)}`,
-        status || HttpStatus.SERVICE_UNAVAILABLE,
+
+      // Any failure not thrown above (500, 503, other statuses, or no status)
+      // returns null instead of throwing, so users never see a raw 5xx error
+      this.logger.warn(
+        `AI Engine server error for ${matchId}; returning null gracefully instead of ${status}`,
       );
+      return null;
     }
   }