diff --git a/services/clsi/app/js/CompileManager.js b/services/clsi/app/js/CompileManager.js index c771082bfd..1d681e8de7 100644 --- a/services/clsi/app/js/CompileManager.js +++ b/services/clsi/app/js/CompileManager.js @@ -292,6 +292,8 @@ module.exports = CompileManager = { timings['cpu-time'] / stats['latex-runs'] ) } + // Emit compile time. + timings.compile = ts return OutputFileFinder.findOutputFiles( resourceList, @@ -317,8 +319,7 @@ module.exports = CompileManager = { ) } - // Emit compile time. - timings.compile = ts + // Emit e2e compile time. timings.compileE2E = timerE2E.done() if (stats['pdf-size']) { diff --git a/services/clsi/app/js/ContentCacheManager.js b/services/clsi/app/js/ContentCacheManager.js index ab73c61d93..cbd6f3cde0 100644 --- a/services/clsi/app/js/ContentCacheManager.js +++ b/services/clsi/app/js/ContentCacheManager.js @@ -10,21 +10,26 @@ const Settings = require('settings-sharelatex') const OError = require('@overleaf/o-error') const pLimit = require('p-limit') const { parseXrefTable } = require('../lib/pdfjs/parseXrefTable') +const { TimedOutError } = require('./Errors') /** * * @param {String} contentDir path to directory where content hash files are cached * @param {String} filePath the pdf file to scan for streams * @param {number} size the pdf size + * @param {number} compileTime */ -async function update(contentDir, filePath, size) { +async function update(contentDir, filePath, size, compileTime) { + const checkDeadline = getDeadlineChecker(compileTime) const ranges = [] const newRanges = [] // keep track of hashes expire old ones when they reach a generation > N. const tracker = await HashFileTracker.from(contentDir) tracker.updateAge() - const rawTable = await parseXrefTable(filePath, size) + checkDeadline('after init HashFileTracker') + + const rawTable = await parseXrefTable(filePath, size, checkDeadline) rawTable.sort((a, b) => { return a.offset - b.offset }) @@ -32,6 +37,8 @@ async function update(contentDir, filePath, size) { obj.idx = idx }) + checkDeadline('after parsing') + const uncompressedObjects = [] for (const object of rawTable) { if (!object.uncompressed) { @@ -50,12 +57,14 @@ async function update(contentDir, filePath, size) { if (size < Settings.pdfCachingMinChunkSize) { continue } - uncompressedObjects.push(object) + uncompressedObjects.push({ object, idx: uncompressedObjects.length }) } + checkDeadline('after finding uncompressed') + const handle = await fs.promises.open(filePath) try { - for (const object of uncompressedObjects) { + for (const { object, idx } of uncompressedObjects) { let buffer = Buffer.alloc(object.size, 0) const { bytesRead } = await handle.read( buffer, @@ -63,6 +72,7 @@ async function update(contentDir, filePath, size) { object.size, object.offset ) + checkDeadline('after read ' + idx) if (bytesRead !== object.size) { throw new OError('could not read full chunk', { object, @@ -80,6 +90,7 @@ async function update(contentDir, filePath, size) { buffer = buffer.subarray(objectIdRaw.byteLength) const hash = pdfStreamHash(buffer) + checkDeadline('after hash ' + idx) const range = { objectId: objectIdRaw.toString(), start: object.offset + objectIdRaw.byteLength, @@ -92,12 +103,15 @@ async function update(contentDir, filePath, size) { if (tracker.track(range)) continue await writePdfStream(contentDir, hash, buffer) + checkDeadline('after write ' + idx) newRanges.push(range) } } finally { await handle.close() } + // NOTE: Bailing out below does not make sense. + // Let the next compile use the already written ranges. const reclaimedSpace = await tracker.deleteStaleHashes(5) await tracker.flush() return [ranges, newRanges, reclaimedSpace] @@ -219,6 +233,28 @@ async function writePdfStream(dir, hash, buffer) { } } +function getDeadlineChecker(compileTime) { + const maxOverhead = Math.min( + // Adding 10s to a 40s compile time is OK. + compileTime / 4, + // Adding 30s to a 120s compile time is not OK, limit to 10s. + Settings.pdfCachingMaxProcessingTime + ) + + const deadline = Date.now() + maxOverhead + let lastStage = { stage: 'start', now: Date.now() } + return function (stage) { + const now = Date.now() + if (now > deadline) { + throw new TimedOutError(stage, { + lastStage: lastStage.stage, + diffToLastStage: now - lastStage.now + }) + } + lastStage = { stage, now } + } +} + function promiseMapWithLimit(concurrency, array, fn) { const limit = pLimit(concurrency) return Promise.all(array.map((x) => limit(() => fn(x)))) diff --git a/services/clsi/app/js/ContentCacheMetrics.js b/services/clsi/app/js/ContentCacheMetrics.js index 0078406926..6b7a33de8a 100644 --- a/services/clsi/app/js/ContentCacheMetrics.js +++ b/services/clsi/app/js/ContentCacheMetrics.js @@ -19,6 +19,9 @@ function getSystemLoad() { const ONE_MB = 1024 * 1024 function emitPdfStats(stats, timings) { + if (stats['pdf-caching-timed-out']) { + Metrics.inc('pdf-caching-timed-out') + } if (timings['compute-pdf-caching']) { emitPdfCachingStats(stats, timings) } else { diff --git a/services/clsi/app/js/Errors.js b/services/clsi/app/js/Errors.js index 9b014f8be0..0b16803414 100644 --- a/services/clsi/app/js/Errors.js +++ b/services/clsi/app/js/Errors.js @@ -4,6 +4,8 @@ */ // TODO: This file was created by bulk-decaffeinate. // Fix any style issues and re-enable lint. +const OError = require('@overleaf/o-error') + let Errors var NotFoundError = function (message) { const error = new Error(message) @@ -29,7 +31,10 @@ var AlreadyCompilingError = function (message) { } AlreadyCompilingError.prototype.__proto__ = Error.prototype +class TimedOutError extends OError {} + module.exports = Errors = { + TimedOutError, NotFoundError, FilesOutOfSyncError, AlreadyCompilingError diff --git a/services/clsi/app/js/OutputCacheManager.js b/services/clsi/app/js/OutputCacheManager.js index 4d2e64132b..539b1bc99d 100644 --- a/services/clsi/app/js/OutputCacheManager.js +++ b/services/clsi/app/js/OutputCacheManager.js @@ -26,6 +26,7 @@ const Metrics = require('./Metrics') const OutputFileOptimiser = require('./OutputFileOptimiser') const ContentCacheManager = require('./ContentCacheManager') +const { TimedOutError } = require('./Errors') module.exports = OutputCacheManager = { CONTENT_SUBDIR: 'content', @@ -281,7 +282,16 @@ module.exports = OutputCacheManager = { contentDir, outputFilePath, pdfSize, + timings.compile, function (err, result) { + if (err && err instanceof TimedOutError) { + logger.warn( + { err, outputDir, stats, timings }, + 'pdf caching timed out' + ) + stats['pdf-caching-timed-out'] = 1 + return callback(null, outputFiles) + } if (err) return callback(err, outputFiles) const [contentRanges, newContentRanges, reclaimedSpace] = result diff --git a/services/clsi/app/lib/pdfjs/FSPdfManager.js b/services/clsi/app/lib/pdfjs/FSPdfManager.js index a0450c1e85..8fb1606bf0 100644 --- a/services/clsi/app/lib/pdfjs/FSPdfManager.js +++ b/services/clsi/app/lib/pdfjs/FSPdfManager.js @@ -4,10 +4,10 @@ const { MissingDataException } = require('pdfjs-dist/lib/core/core_utils') const { FSStream } = require('./FSStream') class FSPdfManager extends LocalPdfManager { - constructor(docId, options) { + constructor(docId, { fh, size, checkDeadline }) { const nonEmptyDummyBuffer = Buffer.alloc(1, 0) super(docId, nonEmptyDummyBuffer) - this.stream = new FSStream(options.fh, 0, options.size) + this.stream = new FSStream(fh, 0, size, null, null, checkDeadline) this.pdfDocument = new PDFDocument(this, this.stream) } diff --git a/services/clsi/app/lib/pdfjs/FSStream.js b/services/clsi/app/lib/pdfjs/FSStream.js index 9179e83f0e..748d74362a 100644 --- a/services/clsi/app/lib/pdfjs/FSStream.js +++ b/services/clsi/app/lib/pdfjs/FSStream.js @@ -4,11 +4,12 @@ const { MissingDataException } = require('pdfjs-dist/lib/core/core_utils') const BUF_SIZE = 1024 // read from the file in 1024 byte pages class FSStream extends Stream { - constructor(fh, start, length, dict, cachedBytes) { + constructor(fh, start, length, dict, cachedBytes, checkDeadline) { const nonEmptyDummyBuffer = Buffer.alloc(1, 0) super(nonEmptyDummyBuffer, start, length, dict) delete this.bytes this.fh = fh + this.checkDeadline = checkDeadline this.cachedBytes = cachedBytes || [] } @@ -23,6 +24,7 @@ class FSStream extends Stream { // Manage cached reads from the file requestRange(begin, end) { + this.checkDeadline(`request range ${begin} - ${end}`) // expand small ranges to read a larger amount if (end - begin < BUF_SIZE) { end = begin + BUF_SIZE @@ -123,6 +125,7 @@ class FSStream extends Stream { } makeSubStream(start, length, dict = null) { + this.checkDeadline(`make sub stream start=${start}/length=${length}`) // BG: had to add this check for null length, it is being called with only // the start value at one point in the xref decoding. The intent is clear // enough @@ -131,7 +134,14 @@ class FSStream extends Stream { if (!length) { length = this.end - start } - return new FSStream(this.fh, start, length, dict, this.cachedBytes) + return new FSStream( + this.fh, + start, + length, + dict, + this.cachedBytes, + this.checkDeadline + ) } } diff --git a/services/clsi/app/lib/pdfjs/parseXrefTable.js b/services/clsi/app/lib/pdfjs/parseXrefTable.js index 4db1e0cfc7..de7a386f48 100644 --- a/services/clsi/app/lib/pdfjs/parseXrefTable.js +++ b/services/clsi/app/lib/pdfjs/parseXrefTable.js @@ -1,18 +1,21 @@ const fs = require('fs') const { FSPdfManager } = require('./FSPdfManager') -async function parseXrefTable(path, size) { +async function parseXrefTable(path, size, checkDeadline) { if (size === 0) { return [] } const file = await fs.promises.open(path) try { - const manager = new FSPdfManager(0, { fh: file, size }) + const manager = new FSPdfManager(0, { fh: file, size, checkDeadline }) await manager.ensureDoc('checkHeader') + checkDeadline('pdfjs: after checkHeader') await manager.ensureDoc('parseStartXRef') + checkDeadline('pdfjs: after parseStartXRef') await manager.ensureDoc('parse') + checkDeadline('pdfjs: after parse') return manager.pdfDocument.catalog.xref.entries } finally { file.close() diff --git a/services/clsi/config/settings.defaults.js b/services/clsi/config/settings.defaults.js index b65a019365..ae6086f1d4 100644 --- a/services/clsi/config/settings.defaults.js +++ b/services/clsi/config/settings.defaults.js @@ -68,7 +68,9 @@ module.exports = { enablePdfCaching: process.env.ENABLE_PDF_CACHING === 'true', enablePdfCachingDark: process.env.ENABLE_PDF_CACHING_DARK === 'true', pdfCachingMinChunkSize: - parseInt(process.env.PDF_CACHING_MIN_CHUNK_SIZE, 10) || 1024 + parseInt(process.env.PDF_CACHING_MIN_CHUNK_SIZE, 10) || 1024, + pdfCachingMaxProcessingTime: + parseInt(process.env.PDF_CACHING_MAX_PROCESSING_TIME, 10) || 10 * 1000 } if (process.env.ALLOWED_COMPILE_GROUPS) { diff --git a/services/clsi/test/unit/lib/pdfjsTests.js b/services/clsi/test/unit/lib/pdfjsTests.js index 64f062c0b8..72b36a3355 100644 --- a/services/clsi/test/unit/lib/pdfjsTests.js +++ b/services/clsi/test/unit/lib/pdfjsTests.js @@ -33,7 +33,7 @@ async function loadContext(example) { } async function backFillSnapshot(example, size) { - const table = await parseXrefTable(pdfPath(example), size) + const table = await parseXrefTable(pdfPath(example), size, () => {}) await fs.promises.mkdir(Path.dirname(snapshotPath(example)), { recursive: true }) @@ -53,6 +53,24 @@ describe('pdfjs', function () { }) }) + describe('when the operation times out', function () { + it('should bail out', async function () { + const path = pdfPath(EXAMPLES[0]) + const { size } = await loadContext(EXAMPLES[0]) + const err = new Error() + let table + try { + table = await parseXrefTable(path, size, () => { + throw err + }) + } catch (e) { + expect(e).to.equal(err) + return + } + expect(table).to.not.exist + }) + }) + for (const example of EXAMPLES) { describe(example, function () { let size, snapshot @@ -70,7 +88,7 @@ describe('pdfjs', function () { }) it('should produce the expected xRef table', async function () { - const table = await parseXrefTable(pdfPath(example), size) + const table = await parseXrefTable(pdfPath(example), size, () => {}) expect(table).to.deep.equal(snapshot) }) })