From ee59056c6e6170e561960be8723980d7426a4d17 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Thu, 13 Aug 2020 12:12:12 +0100 Subject: [PATCH 1/2] [misc] forcefully disconnect stale clients from shutdown process --- services/real-time/app.js | 15 ++++++++++++++- services/real-time/app/js/DrainManager.js | 14 ++++++++++---- services/real-time/config/settings.defaults.js | 10 ++++++++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/services/real-time/app.js b/services/real-time/app.js index c41743e597..f2f756c21c 100644 --- a/services/real-time/app.js +++ b/services/real-time/app.js @@ -155,7 +155,20 @@ function drainAndShutdown(signal) { { signal }, `received interrupt, starting drain over ${shutdownDrainTimeWindow} mins` ) - DrainManager.startDrainTimeWindow(io, shutdownDrainTimeWindow) + DrainManager.startDrainTimeWindow(io, shutdownDrainTimeWindow, () => { + setTimeout(() => { + const staleClients = io.sockets.clients() + if (staleClients.length !== 0) { + logger.warn( + { staleClients: staleClients.map((client) => client.id) }, + 'forcefully disconnecting stale clients' + ) + staleClients.forEach((client) => { + client.disconnect() + }) + } + }, Settings.gracefulReconnectTimeoutMs) + }) shutdownCleanly(signal) }, statusCheckInterval) } diff --git a/services/real-time/app/js/DrainManager.js b/services/real-time/app/js/DrainManager.js index c605957160..06885f1d28 100644 --- a/services/real-time/app/js/DrainManager.js +++ b/services/real-time/app/js/DrainManager.js @@ -1,13 +1,13 @@ const logger = require('logger-sharelatex') module.exports = { - startDrainTimeWindow(io, minsToDrain) { + startDrainTimeWindow(io, minsToDrain, callback) { const drainPerMin = io.sockets.clients().length / minsToDrain // enforce minimum drain rate - this.startDrain(io, Math.max(drainPerMin / 60, 4)) + this.startDrain(io, Math.max(drainPerMin / 60, 4), callback) }, - startDrain(io, rate) { + startDrain(io, rate, callback) { // Clear out any old interval clearInterval(this.interval) logger.log({ rate }, 'starting drain') @@ -24,7 +24,11 @@ module.exports = { pollingInterval = 1000 } this.interval = setInterval(() => { - this.reconnectNClients(io, rate) + const requestedAllClientsToReconnect = this.reconnectNClients(io, rate) + if (requestedAllClientsToReconnect && callback) { + callback() + callback = undefined + } }, pollingInterval) }, @@ -48,6 +52,8 @@ module.exports = { } if (drainedCount < N) { logger.log('All clients have been told to reconnectGracefully') + return true } + return false } } diff --git a/services/real-time/config/settings.defaults.js b/services/real-time/config/settings.defaults.js index cbf6d3be82..5c0d501b50 100644 --- a/services/real-time/config/settings.defaults.js +++ b/services/real-time/config/settings.defaults.js @@ -123,6 +123,16 @@ const settings = { shutdownDrainTimeWindow: process.env.SHUTDOWN_DRAIN_TIME_WINDOW || 9, + // The shutdown procedure asks clients to reconnect gracefully. + // 3rd-party/buggy clients may not act upon receiving the message and keep + // stale connections alive. We forcefully disconnect them after X ms: + gracefulReconnectTimeoutMs: + parseInt(process.env.GRACEFUL_RECONNECT_TIMEOUT_MS, 10) || + // The frontend allows actively editing users to keep the connection open + // for up-to ConnectionManager.MAX_RECONNECT_GRACEFULLY_INTERVAL=45s + // Permit an extra delay to account for slow/flaky connections. + (45 + 30) * 1000, + continualPubsubTraffic: process.env.CONTINUAL_PUBSUB_TRAFFIC || false, checkEventOrder: process.env.CHECK_EVENT_ORDER || false, From ea75b84eefedaf03847612f3e4c02834e3b700ff Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Thu, 13 Aug 2020 12:39:51 +0100 Subject: [PATCH 2/2] [misc] let the orchestrator handle the process restart Note that there is also the `shutdownCleanly` interval which may notice that the shutdown has completed. There is some network IO required to signal all clients the server disconnect, so we cannot run process.exit immediately. --- services/real-time/app.js | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/services/real-time/app.js b/services/real-time/app.js index f2f756c21c..485937889d 100644 --- a/services/real-time/app.js +++ b/services/real-time/app.js @@ -98,7 +98,16 @@ function healthCheck(req, res) { } }) } -app.get('/health_check', healthCheck) +app.get( + '/health_check', + (req, res, next) => { + if (Settings.shutDownComplete) { + return res.sendStatus(503) + } + next() + }, + healthCheck +) app.get('/health_check/redis', healthCheck) @@ -167,6 +176,8 @@ function drainAndShutdown(signal) { client.disconnect() }) } + // Mark the node as unhealthy. + Settings.shutDownComplete = true }, Settings.gracefulReconnectTimeoutMs) }) shutdownCleanly(signal)