From 22596e69f2fe24bc4de7099769f495698d48311c Mon Sep 17 00:00:00 2001 From: Fahri Can Date: Tue, 5 May 2026 20:19:25 +0300 Subject: [PATCH] fix(predictions): circuit breaker resilience + graceful degradation - Reset consecutiveFailures on cooldown expiry (half-open state) so a single retry failure doesn't immediately re-open the circuit - Exclude AI Engine app-level 500s from circuit breaker count (only network/infra errors: timeout, 502, 503, 504, 429) - Return null gracefully instead of throwing 503 when no cache exists - Add DB fallback for non-cooldown AI Engine failures - Remove blocking wait-and-retry that held requests for up to 20s --- src/common/utils/ai-engine-client.ts | 17 +++++- .../predictions/predictions.service.ts | 61 +++++++++---------- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/common/utils/ai-engine-client.ts b/src/common/utils/ai-engine-client.ts index 9ab26bf..c6d9768 100644 --- a/src/common/utils/ai-engine-client.ts +++ b/src/common/utils/ai-engine-client.ts @@ -183,8 +183,11 @@ export class AiEngineClient { } this.logger.warn( - `[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt`, + `[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt (resetting failures from ${this.consecutiveFailures})`, ); + // Half-open state: reset failures so a single retry failure doesn't + // immediately re-open the circuit at threshold+1 + this.consecutiveFailures = 0; this.circuitOpenedAt = null; } @@ -233,8 +236,18 @@ export class AiEngineClient { if (!error.response) { return true; // Network error, timeout, etc. } + // Only count infrastructure-level errors toward circuit breaker: + // - No response (network failure) → already handled above + // - Timeout (ECONNABORTED) → infrastructure + // - 429 (rate limit) → infrastructure + // - 502/503/504 (proxy/gateway errors) → infrastructure + // Do NOT count 500 (app-level crash in AI Engine) — it may be + // match-specific and shouldn't block all other matches. + if (error.code === 'ECONNABORTED') { + return true; + } const status = error.response.status; - return status >= 500 || status === 429; + return status === 429 || status === 502 || status === 503 || status === 504; } private toRequestError(error: unknown): AiEngineRequestError { diff --git a/src/modules/predictions/predictions.service.ts b/src/modules/predictions/predictions.service.ts index 9d8a0bf..b2c3dd0 100755 --- a/src/modules/predictions/predictions.service.ts +++ b/src/modules/predictions/predictions.service.ts @@ -278,40 +278,36 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy { return this.enrichPredictionResponse(cachedPrediction, matchContext); } - // 4) No cached data at all — wait out cooldown and retry once - const cooldownMs = this.extractCooldownMs(detail); - if (cooldownMs > 0 && cooldownMs <= 20000) { - this.logger.warn( - `AI Engine cooldown for ${matchId}; no cached data — waiting ${cooldownMs}ms and retrying...`, - ); - await new Promise((resolve) => setTimeout(resolve, cooldownMs + 500)); - try { - const retryResponse = - await this.aiEngineClient.post( - `/v20plus/analyze/${matchId}`, - { simulate: true, is_simulation: true, pre_match_only: true }, - ); - const retryPrediction = this.enrichPredictionResponse( - retryResponse.data, - matchContext, - ); - await this.recordPredictionRun(matchId, retryResponse.data); - await this.cachePrediction(matchId, retryPrediction); - return retryPrediction; - } catch (retryErr: unknown) { - this.logger.error( - `AI Engine retry after cooldown also failed for ${matchId}`, - ); - // Fall through to error handling below - } - } + // 4) No cached data at all — return null gracefully + this.logger.warn( + `AI Engine cooldown for ${matchId}; no cached data available — returning null gracefully`, + ); + return null; + } + + // ── Non-cooldown errors (e.g. AI Engine 500 for this match) ── + // Try DB fallback before giving up + const storedFallback = await this.getStoredPrediction(matchId); + if (storedFallback) { + this.logger.warn( + `AI Engine failed for ${matchId} (status=${status}); returning stored prediction as fallback`, + ); + return this.enrichPredictionResponse(storedFallback, matchContext); + } + + const cachedFallback = await this.getCachedPrediction(matchId); + if (cachedFallback) { + this.logger.warn( + `AI Engine failed for ${matchId} (status=${status}); returning cached prediction as fallback`, + ); + return this.enrichPredictionResponse(cachedFallback, matchContext); } this.logger.error( `Direct AI Engine call failed for ${matchId}: status=${status}, detail=${JSON.stringify(detail)}`, ); - // Forward AI Engine's actual error + // Forward AI Engine's actual error for client-meaningful statuses if (status === 404) { throw new HttpException( `Match not found in AI Engine: ${matchId}`, @@ -324,10 +320,13 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy { HttpStatus.UNPROCESSABLE_ENTITY, ); } - throw new HttpException( - `AI Engine error: ${typeof detail === "string" ? detail : JSON.stringify(detail)}`, - status || HttpStatus.SERVICE_UNAVAILABLE, + + // For server errors (500, 503 etc.) return null instead of throwing + // This prevents the user from seeing raw 503 errors + this.logger.warn( + `AI Engine server error for ${matchId}; returning null gracefully instead of ${status}`, ); + return null; } }