fix(predictions): circuit breaker resilience + graceful degradation
Deploy Iddaai Backend / build-and-deploy (push) Successful in 27s
Deploy Iddaai Backend / build-and-deploy (push) Successful in 27s
- Reset consecutiveFailures on cooldown expiry (half-open state) so a single retry failure doesn't immediately re-open the circuit
- Exclude AI Engine app-level 500s from the circuit breaker count (only network/infra errors count: timeout, 502, 503, 504, 429)
- Return null gracefully instead of throwing 503 when no cache exists
- Add DB fallback for non-cooldown AI Engine failures
- Remove blocking wait-and-retry that held requests for up to 20s
This commit is contained in:
@@ -278,40 +278,36 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
|
||||
return this.enrichPredictionResponse(cachedPrediction, matchContext);
|
||||
}
|
||||
|
||||
// 4) No cached data at all — wait out cooldown and retry once
|
||||
const cooldownMs = this.extractCooldownMs(detail);
|
||||
if (cooldownMs > 0 && cooldownMs <= 20000) {
|
||||
this.logger.warn(
|
||||
`AI Engine cooldown for ${matchId}; no cached data — waiting ${cooldownMs}ms and retrying...`,
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, cooldownMs + 500));
|
||||
try {
|
||||
const retryResponse =
|
||||
await this.aiEngineClient.post<MatchPredictionDto>(
|
||||
`/v20plus/analyze/${matchId}`,
|
||||
{ simulate: true, is_simulation: true, pre_match_only: true },
|
||||
);
|
||||
const retryPrediction = this.enrichPredictionResponse(
|
||||
retryResponse.data,
|
||||
matchContext,
|
||||
);
|
||||
await this.recordPredictionRun(matchId, retryResponse.data);
|
||||
await this.cachePrediction(matchId, retryPrediction);
|
||||
return retryPrediction;
|
||||
} catch (retryErr: unknown) {
|
||||
this.logger.error(
|
||||
`AI Engine retry after cooldown also failed for ${matchId}`,
|
||||
);
|
||||
// Fall through to error handling below
|
||||
}
|
||||
}
|
||||
// 4) No cached data at all — return null gracefully
|
||||
this.logger.warn(
|
||||
`AI Engine cooldown for ${matchId}; no cached data available — returning null gracefully`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
// ── Non-cooldown errors (e.g. AI Engine 500 for this match) ──
|
||||
// Try DB fallback before giving up
|
||||
const storedFallback = await this.getStoredPrediction(matchId);
|
||||
if (storedFallback) {
|
||||
this.logger.warn(
|
||||
`AI Engine failed for ${matchId} (status=${status}); returning stored prediction as fallback`,
|
||||
);
|
||||
return this.enrichPredictionResponse(storedFallback, matchContext);
|
||||
}
|
||||
|
||||
const cachedFallback = await this.getCachedPrediction(matchId);
|
||||
if (cachedFallback) {
|
||||
this.logger.warn(
|
||||
`AI Engine failed for ${matchId} (status=${status}); returning cached prediction as fallback`,
|
||||
);
|
||||
return this.enrichPredictionResponse(cachedFallback, matchContext);
|
||||
}
|
||||
|
||||
this.logger.error(
|
||||
`Direct AI Engine call failed for ${matchId}: status=${status}, detail=${JSON.stringify(detail)}`,
|
||||
);
|
||||
|
||||
// Forward AI Engine's actual error
|
||||
// Forward AI Engine's actual error for client-meaningful statuses
|
||||
if (status === 404) {
|
||||
throw new HttpException(
|
||||
`Match not found in AI Engine: ${matchId}`,
|
||||
@@ -324,10 +320,13 @@ export class PredictionsService implements OnModuleInit, OnModuleDestroy {
|
||||
HttpStatus.UNPROCESSABLE_ENTITY,
|
||||
);
|
||||
}
|
||||
throw new HttpException(
|
||||
`AI Engine error: ${typeof detail === "string" ? detail : JSON.stringify(detail)}`,
|
||||
status || HttpStatus.SERVICE_UNAVAILABLE,
|
||||
|
||||
// For server errors (500, 503 etc.) return null instead of throwing
|
||||
// This prevents the user from seeing raw 503 errors
|
||||
this.logger.warn(
|
||||
`AI Engine server error for ${matchId}; returning null gracefully instead of ${status}`,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user