diff --git a/services/real-time/app.js b/services/real-time/app.js index fa6c5f7563..993bb58ff6 100644 --- a/services/real-time/app.js +++ b/services/real-time/app.js @@ -98,7 +98,16 @@ function healthCheck(req, res) { } }) } -app.get('/health_check', healthCheck) +app.get( + '/health_check', + (req, res, next) => { + if (Settings.shutDownComplete) { + return res.sendStatus(503) + } + next() + }, + healthCheck +) app.get('/health_check/redis', healthCheck) @@ -155,7 +164,22 @@ function drainAndShutdown(signal) { { signal }, `received interrupt, starting drain over ${shutdownDrainTimeWindow} mins` ) - DrainManager.startDrainTimeWindow(io, shutdownDrainTimeWindow) + DrainManager.startDrainTimeWindow(io, shutdownDrainTimeWindow, () => { + setTimeout(() => { + const staleClients = io.sockets.clients() + if (staleClients.length !== 0) { + logger.warn( + { staleClients: staleClients.map((client) => client.id) }, + 'forcefully disconnecting stale clients' + ) + staleClients.forEach((client) => { + client.disconnect() + }) + } + // Mark the node as unhealthy. + Settings.shutDownComplete = true + }, Settings.gracefulReconnectTimeoutMs) + }) shutdownCleanly(signal) }, statusCheckInterval) } diff --git a/services/real-time/app/js/DrainManager.js b/services/real-time/app/js/DrainManager.js index c605957160..06885f1d28 100644 --- a/services/real-time/app/js/DrainManager.js +++ b/services/real-time/app/js/DrainManager.js @@ -1,13 +1,13 @@ const logger = require('logger-sharelatex') module.exports = { - startDrainTimeWindow(io, minsToDrain) { + startDrainTimeWindow(io, minsToDrain, callback) { const drainPerMin = io.sockets.clients().length / minsToDrain // enforce minimum drain rate - this.startDrain(io, Math.max(drainPerMin / 60, 4)) + this.startDrain(io, Math.max(drainPerMin / 60, 4), callback) }, - startDrain(io, rate) { + startDrain(io, rate, callback) { // Clear out any old interval clearInterval(this.interval) logger.log({ rate }, 'starting drain') @@ -24,7 +24,11 @@ module.exports = { pollingInterval = 1000 } this.interval = setInterval(() => { - this.reconnectNClients(io, rate) + const requestedAllClientsToReconnect = this.reconnectNClients(io, rate) + if (requestedAllClientsToReconnect && callback) { + callback() + callback = undefined + } }, pollingInterval) }, @@ -48,6 +52,8 @@ module.exports = { } if (drainedCount < N) { logger.log('All clients have been told to reconnectGracefully') + return true } + return false } } diff --git a/services/real-time/config/settings.defaults.js b/services/real-time/config/settings.defaults.js index cbf6d3be82..5c0d501b50 100644 --- a/services/real-time/config/settings.defaults.js +++ b/services/real-time/config/settings.defaults.js @@ -123,6 +123,16 @@ const settings = { shutdownDrainTimeWindow: process.env.SHUTDOWN_DRAIN_TIME_WINDOW || 9, + // The shutdown procedure asks clients to reconnect gracefully. + // 3rd-party/buggy clients may not act upon receiving the message and keep + // stale connections alive. We forcefully disconnect them after X ms: + gracefulReconnectTimeoutMs: + parseInt(process.env.GRACEFUL_RECONNECT_TIMEOUT_MS, 10) || + // The frontend allows actively editing users to keep the connection open + // for up-to ConnectionManager.MAX_RECONNECT_GRACEFULLY_INTERVAL=45s + // Permit an extra delay to account for slow/flaky connections. + (45 + 30) * 1000, + continualPubsubTraffic: process.env.CONTINUAL_PUBSUB_TRAFFIC || false, checkEventOrder: process.env.CHECK_EVENT_ORDER || false,