fix(predictions): circuit breaker resilience + graceful degradation
Deploy Iddaai Backend / build-and-deploy (push) Successful in 27s

- Reset consecutiveFailures on cooldown expiry (half-open state)
  so a single retry failure doesn't immediately re-open the circuit
- Exclude AI Engine app-level 500s from the circuit breaker failure count
  (only network/infra errors still count: no response, timeout, 429, 502, 503, 504)
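- Return null gracefully instead of throwing 503 (or another server error) when no stored or cached prediction exists; client-meaningful statuses such as 404 are still thrown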
- Add DB (stored prediction) fallback, then cache fallback, for non-cooldown AI Engine failures
- Remove blocking wait-and-retry that held requests for up to 20s
This commit is contained in:
2026-05-05 20:19:25 +03:00
parent f32badbd8f
commit 22596e69f2
2 changed files with 45 additions and 33 deletions
+15 -2
View File
@@ -183,8 +183,11 @@ export class AiEngineClient {
} }
this.logger.warn( this.logger.warn(
`[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt`, `[${this.serviceName}] AI circuit breaker cooldown elapsed, allowing a recovery attempt (resetting failures from ${this.consecutiveFailures})`,
); );
// Half-open state: reset failures so a single retry failure doesn't
// immediately re-open the circuit at threshold+1
this.consecutiveFailures = 0;
this.circuitOpenedAt = null; this.circuitOpenedAt = null;
} }
@@ -233,8 +236,18 @@ export class AiEngineClient {
if (!error.response) { if (!error.response) {
return true; // Network error, timeout, etc. return true; // Network error, timeout, etc.
} }
// Only count infrastructure-level errors toward circuit breaker:
// - No response (network failure) → already handled above
// - Timeout (ECONNABORTED) → infrastructure
// - 429 (rate limit) → infrastructure
// - 502/503/504 (proxy/gateway errors) → infrastructure
// Do NOT count 500 (app-level crash in AI Engine) — it may be
// match-specific and shouldn't block all other matches.
if (error.code === 'ECONNABORTED') {
return true;
}
const status = error.response.status; const status = error.response.status;
return status >= 500 || status === 429; return status === 429 || status === 502 || status === 503 || status === 504;
} }
private toRequestError(error: unknown): AiEngineRequestError { private toRequestError(error: unknown): AiEngineRequestError {
+26 -27
View File
@@ -278,40 +278,36 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
return this.enrichPredictionResponse(cachedPrediction, matchContext); return this.enrichPredictionResponse(cachedPrediction, matchContext);
} }
// 4) No cached data at all — wait out cooldown and retry once // 4) No cached data at all — return null gracefully
const cooldownMs = this.extractCooldownMs(detail);
if (cooldownMs > 0 && cooldownMs <= 20000) {
this.logger.warn( this.logger.warn(
`AI Engine cooldown for ${matchId}; no cached data — waiting ${cooldownMs}ms and retrying...`, `AI Engine cooldown for ${matchId}; no cached data available — returning null gracefully`,
); );
await new Promise((resolve) => setTimeout(resolve, cooldownMs + 500)); return null;
try {
const retryResponse =
await this.aiEngineClient.post<MatchPredictionDto>(
`/v20plus/analyze/${matchId}`,
{ simulate: true, is_simulation: true, pre_match_only: true },
);
const retryPrediction = this.enrichPredictionResponse(
retryResponse.data,
matchContext,
);
await this.recordPredictionRun(matchId, retryResponse.data);
await this.cachePrediction(matchId, retryPrediction);
return retryPrediction;
} catch (retryErr: unknown) {
this.logger.error(
`AI Engine retry after cooldown also failed for ${matchId}`,
);
// Fall through to error handling below
} }
// ── Non-cooldown errors (e.g. AI Engine 500 for this match) ──
// Try DB fallback before giving up
const storedFallback = await this.getStoredPrediction(matchId);
if (storedFallback) {
this.logger.warn(
`AI Engine failed for ${matchId} (status=${status}); returning stored prediction as fallback`,
);
return this.enrichPredictionResponse(storedFallback, matchContext);
} }
const cachedFallback = await this.getCachedPrediction(matchId);
if (cachedFallback) {
this.logger.warn(
`AI Engine failed for ${matchId} (status=${status}); returning cached prediction as fallback`,
);
return this.enrichPredictionResponse(cachedFallback, matchContext);
} }
this.logger.error( this.logger.error(
`Direct AI Engine call failed for ${matchId}: status=${status}, detail=${JSON.stringify(detail)}`, `Direct AI Engine call failed for ${matchId}: status=${status}, detail=${JSON.stringify(detail)}`,
); );
// Forward AI Engine's actual error // Forward AI Engine's actual error for client-meaningful statuses
if (status === 404) { if (status === 404) {
throw new HttpException( throw new HttpException(
`Match not found in AI Engine: ${matchId}`, `Match not found in AI Engine: ${matchId}`,
@@ -324,10 +320,13 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
HttpStatus.UNPROCESSABLE_ENTITY, HttpStatus.UNPROCESSABLE_ENTITY,
); );
} }
throw new HttpException(
`AI Engine error: ${typeof detail === "string" ? detail : JSON.stringify(detail)}`, // For server errors (500, 503 etc.) return null instead of throwing
status || HttpStatus.SERVICE_UNAVAILABLE, // This prevents the user from seeing raw 503 errors
this.logger.warn(
`AI Engine server error for ${matchId}; returning null gracefully instead of ${status}`,
); );
return null;
} }
} }