diff --git a/services/clsi/app/js/CLSICacheHandler.js b/services/clsi/app/js/CLSICacheHandler.js index dbe5127e1c..2e5f139f7b 100644 --- a/services/clsi/app/js/CLSICacheHandler.js +++ b/services/clsi/app/js/CLSICacheHandler.js @@ -17,6 +17,11 @@ const { MeteredStream } = require('@overleaf/stream-utils') const { CACHE_SUBDIR } = require('./OutputCacheManager') const { isExtraneousFile } = require('./ResourceWriter') +const TIMEOUT = 5_000 +/** + * @type {Map} + */ +const lastFailures = new Map() const TIMING_BUCKETS = [ 0, 10, 100, 1000, 2000, 5000, 10000, 15000, 20000, 30000, ] @@ -35,6 +40,25 @@ function getShard(projectId) { return Settings.apis.clsiCache.shards[idx] } +function checkCircuitBreaker(url) { + const lastFailure = lastFailures.get(url) ?? 0 + if (lastFailure) { + // Circuit breaker that avoids retries for 5-20s. + const retryDelay = TIMEOUT * (1 + 3 * Math.random()) + if (performance.now() - lastFailure < retryDelay) { + return true + } + } + return false +} + +function tripCircuitBreaker(url) { + lastFailures.set(url, performance.now()) // The shard is unhealthy. Refresh timestamp of last failure. +} +function closeCircuitBreaker(url) { + lastFailures.delete(url) // The shard is back up. +} + /** * @param {string} projectId * @param {string} userId @@ -61,6 +85,7 @@ function notifyCLSICacheAboutBuild({ if (!Settings.apis.clsiCache.enabled) return undefined if (!OBJECT_ID_REGEX.test(projectId)) return undefined const { url, shard } = getShard(projectId) + if (checkCircuitBreaker(url)) return undefined /** * @param {[{path: string}]} files @@ -102,13 +127,18 @@ function notifyCLSICacheAboutBuild({ method: 'POST', body, headers: { 'Content-Type': 'application/json' }, - signal: AbortSignal.timeout(15_000), - }).catch(err => { - logger.warn( - { err, projectId, userId, buildId }, - 'enqueue for clsi cache failed' - ) + signal: AbortSignal.timeout(TIMEOUT), }) + .then(() => { + closeCircuitBreaker() + }) + .catch(err => { + tripCircuitBreaker() + logger.warn( + { err, projectId, userId, buildId }, + 'enqueue for clsi cache failed' + ) + }) } // PDF preview @@ -201,6 +231,8 @@ async function downloadOutputDotSynctexFromCompileCache( ) { if (!Settings.apis.clsiCache.enabled) return false if (!OBJECT_ID_REGEX.test(projectId)) return false + const { url } = getShard(projectId) + if (checkCircuitBreaker(url)) return false const timer = new Metrics.Timer( 'clsi_cache_download', @@ -211,19 +243,21 @@ async function downloadOutputDotSynctexFromCompileCache( let stream try { stream = await fetchStream( - `${getShard(projectId).url}/project/${projectId}/${ + `${url}/project/${projectId}/${ userId ? `user/${userId}/` : '' }build/${editorId}-${buildId}/search/output/output.synctex.gz`, { method: 'GET', - signal: AbortSignal.timeout(10_000), + signal: AbortSignal.timeout(TIMEOUT), } ) } catch (err) { if (err instanceof RequestFailedError && err.response.status === 404) { + closeCircuitBreaker() timer.done({ status: 'not-found' }) return false } + tripCircuitBreaker() timer.done({ status: 'error' }) throw err } @@ -240,11 +274,13 @@ async function downloadOutputDotSynctexFromCompileCache( ) await fs.promises.rename(tmp, dst) } catch (err) { + tripCircuitBreaker() try { await fs.promises.unlink(tmp) } catch {} throw err } + closeCircuitBreaker() timer.done({ status: 'success' }) return true } @@ -258,10 +294,9 @@ async function downloadOutputDotSynctexFromCompileCache( async function downloadLatestCompileCache(projectId, userId, compileDir) { if (!Settings.apis.clsiCache.enabled) return false if (!OBJECT_ID_REGEX.test(projectId)) return false + const { url } = getShard(projectId) + if (checkCircuitBreaker(url)) return false - const url = `${getShard(projectId).url}/project/${projectId}/${ - userId ? `user/${userId}/` : '' - }latest/output/output.tar.gz` const timer = new Metrics.Timer( 'clsi_cache_download', 1, @@ -270,54 +305,71 @@ async function downloadLatestCompileCache(projectId, userId, compileDir) { ) let stream try { - stream = await fetchStream(url, { - method: 'GET', - signal: AbortSignal.timeout(10_000), - }) + stream = await fetchStream( + `${url}/project/${projectId}/${ + userId ? `user/${userId}/` : '' + }latest/output/output.tar.gz`, + { + method: 'GET', + signal: AbortSignal.timeout(TIMEOUT), + } + ) } catch (err) { if (err instanceof RequestFailedError && err.response.status === 404) { + closeCircuitBreaker() timer.done({ status: 'not-found' }) return false } + tripCircuitBreaker() timer.done({ status: 'error' }) throw err } let n = 0 let abort = false - await pipeline( - stream, - new MeteredStream(Metrics, 'clsi_cache_egress', { path: 'output.tar.gz' }), - createGunzip(), - tarFs.extract(compileDir, { - // use ignore hook for counting entries (files+folders) and validation. - // Include folders as they incur mkdir calls. - ignore(_, header) { - if (abort) return true // log once - n++ - if (n > MAX_ENTRIES_IN_OUTPUT_TAR) { - abort = true - logger.warn( - { - url, - compileDir, - }, - 'too many entries in tar-ball from clsi-cache' - ) - } else if (header.type !== 'file' && header.type !== 'directory') { - abort = true - logger.warn( - { - url, - compileDir, - entryType: header.type, - }, - 'unexpected entry in tar-ball from clsi-cache' - ) - } - return abort - }, - }) - ) + try { + await pipeline( + stream, + new MeteredStream(Metrics, 'clsi_cache_egress', { + path: 'output.tar.gz', + }), + createGunzip(), + tarFs.extract(compileDir, { + // use ignore hook for counting entries (files+folders) and validation. + // Include folders as they incur mkdir calls. + ignore(_, header) { + if (abort) return true // log once + n++ + if (n > MAX_ENTRIES_IN_OUTPUT_TAR) { + abort = true + logger.warn( + { + projectId, + userId, + compileDir, + }, + 'too many entries in tar-ball from clsi-cache' + ) + } else if (header.type !== 'file' && header.type !== 'directory') { + abort = true + logger.warn( + { + projectId, + userId, + compileDir, + entryType: header.type, + }, + 'unexpected entry in tar-ball from clsi-cache' + ) + } + return abort + }, + }) + ) + } catch (err) { + tripCircuitBreaker() + throw err + } + closeCircuitBreaker() Metrics.count('clsi_cache_download_entries', n) timer.done({ status: 'success' }) return !abort diff --git a/services/web/app/src/Features/Compile/ClsiCacheHandler.js b/services/web/app/src/Features/Compile/ClsiCacheHandler.js index 4f74d237d6..4c9a39fbef 100644 --- a/services/web/app/src/Features/Compile/ClsiCacheHandler.js +++ b/services/web/app/src/Features/Compile/ClsiCacheHandler.js @@ -9,6 +9,13 @@ const Settings = require('@overleaf/settings') const OError = require('@overleaf/o-error') const { NotFoundError, InvalidNameError } = require('../Errors/Errors') +const TIMEOUT = 4_000 + +/** + * @type {Map} + */ +const lastFailures = new Map() + /** * Keep in sync with validateFilename in services/clsi-cache/app/js/utils.js * @@ -70,7 +77,7 @@ async function clearCache(projectId, userId) { try { await fetchNothing(u, { method: 'DELETE', - signal: AbortSignal.timeout(15_000), + signal: AbortSignal.timeout(TIMEOUT), }) } catch (err) { throw OError.tag(err, 'clear clsi-cache', { url, shard }) @@ -94,7 +101,7 @@ async function getOutputFile( userId, buildId, filename, - signal = AbortSignal.timeout(15_000) + signal = AbortSignal.timeout(TIMEOUT) ) { validateFilename(filename) if (!/^[a-f0-9-]+$/.test(buildId)) { @@ -122,7 +129,7 @@ async function getLatestOutputFile( projectId, userId, filename, - signal = AbortSignal.timeout(15_000) + signal = AbortSignal.timeout(TIMEOUT) ) { validateFilename(filename) @@ -154,11 +161,23 @@ async function getRedirectWithFallback( projectId, userId, path, - signal = AbortSignal.timeout(15_000) + signal = AbortSignal.timeout(TIMEOUT) ) { // Avoid hitting the same instance first all the time. const instances = _.shuffle(Settings.apis.clsiCache.instances) for (const { url, shard } of instances) { + if (signal.aborted) { + break // Stop trying the next backend when the signal has expired. + } + const lastFailure = lastFailures.get(url) ?? 0 + if (lastFailure) { + // Circuit breaker that avoids retries for 4-16s. + const retryDelay = TIMEOUT * (1 + 3 * Math.random()) + if (performance.now() - lastFailure < retryDelay) { + continue + } + } + const u = new URL(url) u.pathname = path try { @@ -168,6 +187,7 @@ async function getRedirectWithFallback( } = await fetchRedirectWithResponse(u, { signal, }) + lastFailures.delete(url) // The shard is back up. let allFilesRaw = headers.get('X-All-Files') if (!allFilesRaw.startsWith('[')) { allFilesRaw = Buffer.from(allFilesRaw, 'base64url').toString() @@ -183,8 +203,10 @@ async function getRedirectWithFallback( } } catch (err) { if (err instanceof RequestFailedError && err.response.status === 404) { + lastFailures.delete(url) // The shard is back up. break // No clsi-cache instance has cached something for this project/user. } + lastFailures.set(url, performance.now()) // The shard is unhealthy. Refresh timestamp of last failure. logger.warn( { err, projectId, userId, url, shard }, 'getLatestOutputFile from clsi-cache failed' @@ -239,6 +261,7 @@ async function prepareCacheSource( } module.exports = { + TIMEOUT, getEgressLabel, clearCache, getOutputFile, diff --git a/services/web/app/src/Features/Compile/ClsiCacheManager.mjs b/services/web/app/src/Features/Compile/ClsiCacheManager.mjs index 492b1e10bf..9ac45db8fd 100644 --- a/services/web/app/src/Features/Compile/ClsiCacheManager.mjs +++ b/services/web/app/src/Features/Compile/ClsiCacheManager.mjs @@ -195,7 +195,7 @@ async function prepareClsiCache( const features = await UserGetter.promises.getUserFeatures(userId) if (features.compileGroup !== 'priority') return - const signal = AbortSignal.timeout(5_000) + const signal = AbortSignal.timeout(ClsiCacheHandler.TIMEOUT) let lastUpdated let shard = _.shuffle(Settings.apis.clsiCache.instances)[0].shard if (sourceProjectId) { diff --git a/services/web/frontend/js/shared/context/local-compile-context.tsx b/services/web/frontend/js/shared/context/local-compile-context.tsx index e2d5eb65bb..1bfc34923a 100644 --- a/services/web/frontend/js/shared/context/local-compile-context.tsx +++ b/services/web/frontend/js/shared/context/local-compile-context.tsx @@ -378,7 +378,9 @@ export const LocalCompileProvider: FC = ({ if (initialCompileFromCache && !pendingInitialCompileFromCache) { setPendingInitialCompileFromCache(true) setCompileFromCacheStartedAt(performance.now()) - getJSON(`/project/${projectId}/output/cached/output.overleaf.json`) + getJSON(`/project/${projectId}/output/cached/output.overleaf.json`, { + signal: AbortSignal.timeout(5_000), + }) .then((data: any) => { // Hand data over to next effect, it will wait for project/doc loading. setDataFromCache(data)