fix: watchdog force-kill with SIGKILL fallback when process.exit is blocked
This commit is contained in:
@@ -12,7 +12,7 @@ import { FeederService } from "../modules/feeder/feeder.service";
|
|||||||
import { Logger } from "@nestjs/common";
|
import { Logger } from "@nestjs/common";
|
||||||
|
|
||||||
const WATCHDOG_INTERVAL_MS = 60_000; // Check every 1 minute
|
const WATCHDOG_INTERVAL_MS = 60_000; // Check every 1 minute
|
||||||
const WATCHDOG_TIMEOUT_MS = 5 * 60_000; // Kill if no activity for 5 minutes
|
const WATCHDOG_TIMEOUT_MS = 3 * 60_000; // Kill if no activity for 3 minutes
|
||||||
|
|
||||||
async function bootstrap() {
|
async function bootstrap() {
|
||||||
process.env.FEEDER_MODE = "historical";
|
process.env.FEEDER_MODE = "historical";
|
||||||
@@ -31,15 +31,31 @@ async function bootstrap() {
|
|||||||
const feederService = app.get(FeederService);
|
const feederService = app.get(FeederService);
|
||||||
|
|
||||||
// ── Watchdog Timer ──────────────────────────────────────────
|
// ── Watchdog Timer ──────────────────────────────────────────
|
||||||
// If the feeder hangs on an API call for 5+ minutes, force-exit
|
// If the feeder hangs on an API call for 3+ minutes, force-kill
|
||||||
// so PM2 can restart and resume from where it left off in DB.
|
// so PM2 can restart and resume from where it left off in DB.
|
||||||
|
// NOTE(review): per the Node.js docs, process.exit(1) exits even with pending
|
||||||
|
// async work, so open handles (DB connections, HTTP sockets) should not block it — confirm the real stall. We use process.kill(SIGKILL)
|
||||||
|
// as an unconditional fallback.
|
||||||
const watchdog = setInterval(() => {
|
const watchdog = setInterval(() => {
|
||||||
const idleMs = Date.now() - feederService.lastActivityAt;
|
const idleMs = Date.now() - feederService.lastActivityAt;
|
||||||
if (idleMs > WATCHDOG_TIMEOUT_MS) {
|
if (idleMs > WATCHDOG_TIMEOUT_MS) {
|
||||||
logger.error(
|
logger.error(
|
||||||
`🐕 WATCHDOG: No activity for ${Math.round(idleMs / 1000)}s. Force-exiting for PM2 restart...`,
|
`🐕 WATCHDOG: No activity for ${Math.round(idleMs / 1000)}s. Force-killing for PM2 restart...`,
|
||||||
);
|
);
|
||||||
process.exit(1);
|
|
||||||
|
// Try graceful exit first
|
||||||
|
try {
|
||||||
|
process.exit(1);
|
||||||
|
} catch {
|
||||||
|
// Ignored – fallback below
|
||||||
|
}
|
||||||
|
|
||||||
|
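// If process.exit didn't work (blocked by open handles),
|
||||||
|
// schedule an unconditional SIGKILL after 2 seconds. NOTE(review): process.exit() does not return on success, so this point is reached only if it throws; a hang inside process.exit never gets here — verify the fallback actually fires.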
|
||||||
|
setTimeout(() => {
|
||||||
|
logger.error("🐕 WATCHDOG: process.exit blocked. Sending SIGKILL...");
|
||||||
|
process.kill(process.pid, "SIGKILL");
|
||||||
|
}, 2_000).unref();
|
||||||
}
|
}
|
||||||
}, WATCHDOG_INTERVAL_MS);
|
}, WATCHDOG_INTERVAL_MS);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user