Kill -9 is violence. Your application deserves a dignified death.
Graceful shutdown means finishing in-flight work before terminating. Without it, deployments cause dropped requests, broken connections, and data corruption. With it, users never notice you restarted.
The Problem
When a process receives SIGTERM:
- Kubernetes/Docker sends the signal
- Your app has a grace period (default 30s)
- After the grace period, SIGKILL terminates forcefully
If your app doesn’t handle SIGTERM, in-flight requests get dropped. Database transactions abort. WebSocket connections die mid-message.
The Pattern
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
| const server = require('http').createServer(app);
let isShuttingDown = false;
// Handle shutdown signals
process.on('SIGTERM', gracefulShutdown);
process.on('SIGINT', gracefulShutdown);
async function gracefulShutdown(signal) {
console.log(`Received ${signal}, starting graceful shutdown`);
isShuttingDown = true;
// 1. Stop accepting new connections
server.close(async () => {
console.log('HTTP server closed');
// 2. Close database connections
await db.end();
console.log('Database connections closed');
// 3. Close other resources
await redis.quit();
await messageQueue.close();
console.log('Graceful shutdown complete');
process.exit(0);
});
// Force exit after timeout
setTimeout(() => {
console.error('Forced shutdown after timeout');
process.exit(1);
}, 25000); // Leave buffer before SIGKILL
}
// Reject new requests during shutdown
app.use((req, res, next) => {
if (isShuttingDown) {
res.set('Connection', 'close');
return res.status(503).json({ error: 'Server shutting down' });
}
next();
});
|
Connection Draining
The key insight: stop accepting new work, but finish existing work.
/**
 * Tracks live connections so shutdown can wait for them to finish,
 * then force-closes any stragglers once a deadline passes.
 */
class ConnectionDrainer {
  constructor() {
    this.activeConnections = new Set();
  }

  /** Register a connection; it removes itself when it emits 'close'. */
  track(connection) {
    this.activeConnections.add(connection);
    connection.on('close', () => {
      this.activeConnections.delete(connection);
    });
  }

  /**
   * Ask clients to close, poll until the set empties or timeoutMs
   * elapses, then destroy whatever is still open.
   */
  async drain(timeoutMs = 10000) {
    console.log(`Draining ${this.activeConnections.size} connections`);

    // Tell clients to close after current request
    this.activeConnections.forEach((conn) => conn.setHeader?.('Connection', 'close'));

    // Wait for connections to close naturally
    const deadline = Date.now() + timeoutMs;
    while (this.activeConnections.size > 0) {
      if (Date.now() > deadline) {
        console.log(`Force closing ${this.activeConnections.size} connections`);
        this.activeConnections.forEach((conn) => conn.destroy());
        break;
      }
      await sleep(100);
    }
  }
}
Health Check Integration
Load balancers need to know when to stop sending traffic:
// Readiness and liveness answer different questions: readiness gates
// traffic; liveness decides whether the process should be restarted.
let isReady = false;
let isHealthy = true;

// Readiness: can accept new traffic?
app.get('/ready', (req, res) => {
  const ready = isReady && !isShuttingDown;
  res.status(ready ? 200 : 503).json({ ready });
});

// Liveness: is the process healthy?
app.get('/health', (req, res) => {
  res.status(isHealthy ? 200 : 503).json({ healthy: isHealthy });
});

// During startup
async function startup() {
  await db.connect();
  await cache.connect();
  isReady = true; // Now accept traffic
}

// During shutdown
async function gracefulShutdown() {
  isReady = false; // Stop accepting new traffic
  // ... drain connections ...
}
Kubernetes Configuration
apiVersion: v1
kind: Pod
spec:
  # Total budget before Kubernetes sends SIGKILL (default is 30s).
  terminationGracePeriodSeconds: 60
  containers:
    - name: app
      lifecycle:
        preStop:
          exec:
            # Delay SIGTERM so the load balancer has time to remove the
            # pod from rotation before the app starts shutting down.
            command: ["/bin/sh", "-c", "sleep 5"]
      # Readiness: gates whether the pod receives traffic.
      readinessProbe:
        httpGet:
          path: /ready
          port: 8080
        initialDelaySeconds: 5
        periodSeconds: 5
      # Liveness: failing this restarts the container.
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        initialDelaySeconds: 10
        periodSeconds: 10
The preStop sleep is crucial: it gives the load balancer time to remove the pod from rotation before the app starts shutting down.
Timeline:
- Kubernetes marks the pod Terminating and begins removing it from Service endpoints
- preStop hook runs (sleep 5), giving the load balancer time to stop sending traffic
- SIGTERM is sent to the container
- App handles SIGTERM, drains connections
- Clean exit
Without the sleep, traffic can arrive after shutdown begins.
Database Transactions
In-flight transactions need special handling:
/**
 * Tracks in-flight database transactions so shutdown can wait for
 * them, and refuses to start new ones once shutdown has begun.
 */
class TransactionManager {
  constructor() {
    this.activeTransactions = new Set();
  }

  /**
   * Run fn inside a transaction: commit on success, roll back on
   * error. Rejects immediately while shutdown is in progress.
   */
  async runInTransaction(fn) {
    if (isShuttingDown) {
      throw new Error('Cannot start transaction during shutdown');
    }

    const transaction = await db.beginTransaction();
    this.activeTransactions.add(transaction);
    try {
      const outcome = await fn(transaction);
      await transaction.commit();
      return outcome;
    } catch (err) {
      await transaction.rollback();
      throw err;
    } finally {
      // Untrack whether we committed or rolled back.
      this.activeTransactions.delete(transaction);
    }
  }

  /** Poll until every tracked transaction finishes or timeoutMs elapses. */
  async waitForTransactions(timeoutMs = 10000) {
    const deadline = Date.now() + timeoutMs;
    while (this.activeTransactions.size > 0) {
      if (Date.now() > deadline) {
        console.warn(`${this.activeTransactions.size} transactions still active`);
        break;
      }
      await sleep(100);
    }
  }
}
Worker Processes
Background workers need graceful shutdown too:
/**
 * Pulls jobs from a queue and processes them one at a time, stopping
 * cleanly on request.
 *
 * NOTE(review): processJob is not defined here — it is assumed to be
 * supplied by a subclass or assigned on the instance. Verify.
 */
class Worker {
  constructor(queue) {
    this.queue = queue;
    this.currentJob = null;   // job currently being processed, if any
    this.shouldStop = false;  // set by stop() to end the loop
  }

  /**
   * Main loop: fetch and process jobs until stop() is called.
   *
   * Fixes over the naive version:
   * - currentJob is cleared in a finally block, so a job that throws
   *   no longer leaves stop() waiting forever on a stale currentJob.
   * - when the queue yields no job, we sleep briefly instead of
   *   re-polling in an unbroken microtask chain, which would starve
   *   timers (including the sleep() used by stop()).
   */
  async start() {
    while (!this.shouldStop) {
      this.currentJob = await this.queue.getNextJob();
      if (this.currentJob) {
        try {
          await this.processJob(this.currentJob);
        } finally {
          this.currentJob = null;
        }
      } else {
        // Queue empty: yield to the event loop before polling again.
        await sleep(100);
      }
    }
  }

  /**
   * Stop the loop, wait for the in-flight job (if any) to finish,
   * then close the queue.
   */
  async stop() {
    this.shouldStop = true;
    // Wait for current job to finish
    while (this.currentJob) {
      await sleep(100);
    }
    // Return unprocessed jobs to queue
    // (assumes queue.close() requeues anything unprocessed — verify)
    await this.queue.close();
  }
}
WebSocket Handling
WebSockets need explicit close messages:
// Every open socket, so shutdown can close them all.
const activeWebSockets = new Set();

wss.on('connection', (ws) => {
  activeWebSockets.add(ws);
  ws.on('close', () => activeWebSockets.delete(ws));
});

/**
 * Politely close every socket (1001 = "going away"), give clients a
 * moment to acknowledge, then hard-terminate anything still open.
 */
async function shutdownWebSockets() {
  // Send close frame with reason
  activeWebSockets.forEach((ws) => ws.close(1001, 'Server shutting down'));

  // Wait for clients to acknowledge
  await sleep(1000);

  // Force close any remaining
  activeWebSockets.forEach((ws) => ws.terminate());
}
Common Mistakes
No SIGTERM handler: Process exits immediately, dropping everything.
Closing server before draining: New connections rejected, but existing ones not tracked.
Infinite grace period: Eventually SIGKILL comes. Plan for it.
Not marking unready first: Load balancer keeps sending traffic during shutdown.
Ignoring background jobs: Worker picks up job, gets killed mid-processing.
Database connections orphaned: Connection pool exhausted on restart.
The Complete Pattern
// Orchestrates the full shutdown sequence. graceful() is idempotent:
// repeated signals after the first are ignored.
const shutdown = {
  isShuttingDown: false,

  async graceful(signal) {
    // Re-entry guard: signals may be delivered more than once.
    if (this.isShuttingDown) return;
    this.isShuttingDown = true;
    console.log(`${signal} received, graceful shutdown starting`);

    // Give up on a phase once its time budget is spent.
    const withTimeout = (work, ms) => Promise.race([work, sleep(ms)]);

    // Phase 1: Stop accepting new work
    server.close();

    // Phase 2: Drain in-flight requests (with timeout)
    await withTimeout(connectionDrainer.drain(), 15000);

    // Phase 3: Wait for background jobs
    await withTimeout(worker.stop(), 10000);

    // Phase 4: Close external connections
    await Promise.all([db.end(), redis.quit(), messageQueue.close()]);

    console.log('Graceful shutdown complete');
    process.exit(0);
  },
};

process.on('SIGTERM', () => shutdown.graceful('SIGTERM'));
process.on('SIGINT', () => shutdown.graceful('SIGINT'));
The Mental Model
Think of graceful shutdown like closing a restaurant:
- Stop seating new guests (stop accepting connections)
- Let current diners finish (drain in-flight requests)
- Close the kitchen (stop background jobs)
- Clean up (close database connections)
- Lock the door (exit process)
You don’t throw people out mid-meal. You let them finish, then close.
Your users deserve the same courtesy. Handle SIGTERM with grace.