From a9f10f013e86ea04a93b8c5bb7d7eac51355a0ab Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Thu, 28 Nov 2024 17:39:31 +0100 Subject: [PATCH] Merge pull request #22208 from overleaf/jpa-clsi-hash [misc] clsi: read files from history-v1 with fallback to filestore GitOrigin-RevId: c54bb128780198c14e7a63818f39fad62ce65d4e --- services/clsi/app/js/RequestParser.js | 4 + services/clsi/app/js/ResourceWriter.js | 1 + services/clsi/app/js/UrlCache.js | 29 ++- services/clsi/app/js/UrlFetcher.js | 71 ++++++-- .../test/acceptance/js/UrlCachingTests.js | 172 +++++++++++++++++- .../clsi/test/unit/js/ResourceWriterTests.js | 6 +- services/clsi/test/unit/js/UrlCacheTests.js | 29 ++- .../filestore/config/settings.defaults.js | 4 + .../history-v1/storage/lib/project_key.js | 1 + services/web/app.mjs | 13 +- .../app/src/Features/Compile/ClsiManager.js | 10 +- .../src/Features/History/HistoryManager.js | 40 +++- .../app/src/Features/History/project_key.js | 24 +++ .../web/app/src/infrastructure/mongodb.js | 1 + services/web/config/settings.defaults.js | 17 ++ .../test/unit/src/Compile/ClsiManagerTests.js | 37 +++- 16 files changed, 431 insertions(+), 28 deletions(-) create mode 100644 services/web/app/src/Features/History/project_key.js diff --git a/services/clsi/app/js/RequestParser.js b/services/clsi/app/js/RequestParser.js index 61d3b9d229..28e182ea44 100644 --- a/services/clsi/app/js/RequestParser.js +++ b/services/clsi/app/js/RequestParser.js @@ -169,11 +169,15 @@ function _parseResource(resource) { if (resource.url != null && typeof resource.url !== 'string') { throw new Error('url attribute should be a string') } + if (resource.fallbackURL && typeof resource.fallbackURL !== 'string') { + throw new Error('fallbackURL attribute should be a string') + } return { path: resource.path, modified, url: resource.url, + fallbackURL: resource.fallbackURL, content: resource.content, } } diff --git a/services/clsi/app/js/ResourceWriter.js b/services/clsi/app/js/ResourceWriter.js index c6ff0caf26..6fa6f85e1f 100644 --- a/services/clsi/app/js/ResourceWriter.js +++ b/services/clsi/app/js/ResourceWriter.js @@ -333,6 +333,7 @@ module.exports = ResourceWriter = { return UrlCache.downloadUrlToFile( projectId, resource.url, + resource.fallbackURL, path, resource.modified, function (err) { diff --git a/services/clsi/app/js/UrlCache.js b/services/clsi/app/js/UrlCache.js index 65a0f21b46..36703e7091 100644 --- a/services/clsi/app/js/UrlCache.js +++ b/services/clsi/app/js/UrlCache.js @@ -47,14 +47,29 @@ async function createProjectDir(projectId) { await fs.promises.mkdir(getProjectDir(projectId), { recursive: true }) } -async function downloadUrlToFile(projectId, url, destPath, lastModified) { +async function downloadUrlToFile( + projectId, + url, + fallbackURL, + destPath, + lastModified +) { const cachePath = getCachePath(projectId, url, lastModified) try { const timer = new Metrics.Timer('url_cache', { status: 'cache-hit', path: 'copy', }) - await fs.promises.copyFile(cachePath, destPath) + try { + await fs.promises.copyFile(cachePath, destPath) + } catch (err) { + if (err.code === 'ENOENT' && fallbackURL) { + const fallbackPath = getCachePath(projectId, fallbackURL, lastModified) + await fs.promises.copyFile(fallbackPath, destPath) + } else { + throw err + } + } // the metric is only updated if the file is present in the cache timer.done() return @@ -70,7 +85,7 @@ async function downloadUrlToFile(projectId, url, destPath, lastModified) { path: 'download', }) try { - await download(url, cachePath) + await download(url, fallbackURL, cachePath) } finally { timer.done() } @@ -86,13 +101,17 @@ async function downloadUrlToFile(projectId, url, destPath, lastModified) { } } -async function download(url, cachePath) { +async function download(url, fallbackURL, cachePath) { let pending = PENDING_DOWNLOADS.get(cachePath) if (pending) { return pending } - pending = UrlFetcher.promises.pipeUrlToFileWithRetry(url, cachePath) + pending = UrlFetcher.promises.pipeUrlToFileWithRetry( + url, + fallbackURL, + cachePath + ) PENDING_DOWNLOADS.set(cachePath, pending) try { await pending diff --git a/services/clsi/app/js/UrlFetcher.js b/services/clsi/app/js/UrlFetcher.js index 2f48dc306a..2c44f3a6dd 100644 --- a/services/clsi/app/js/UrlFetcher.js +++ b/services/clsi/app/js/UrlFetcher.js @@ -5,6 +5,7 @@ const { CustomHttpAgent, CustomHttpsAgent, fetchStream, + RequestFailedError, } = require('@overleaf/fetch-utils') const { URL } = require('node:url') const { pipeline } = require('node:stream/promises') @@ -14,7 +15,7 @@ const MAX_CONNECT_TIME = 1000 const httpAgent = new CustomHttpAgent({ connectTimeout: MAX_CONNECT_TIME }) const httpsAgent = new CustomHttpsAgent({ connectTimeout: MAX_CONNECT_TIME }) -async function pipeUrlToFileWithRetry(url, filePath) { +async function pipeUrlToFileWithRetry(url, fallbackURL, filePath) { let remainingAttempts = 3 let lastErr while (remainingAttempts-- > 0) { @@ -22,7 +23,7 @@ async function pipeUrlToFileWithRetry(url, filePath) { path: lastErr ? ' retry' : 'fetch', }) try { - await pipeUrlToFile(url, filePath) + await pipeUrlToFile(url, fallbackURL, filePath) timer.done({ status: 'success' }) return } catch (err) { @@ -37,7 +38,7 @@ async function pipeUrlToFileWithRetry(url, filePath) { throw lastErr } -async function pipeUrlToFile(url, filePath) { +async function pipeUrlToFile(url, fallbackURL, filePath) { const u = new URL(url) if ( Settings.filestoreDomainOveride && @@ -45,21 +46,55 @@ async function pipeUrlToFile(url, filePath) { ) { url = `${Settings.filestoreDomainOveride}${u.pathname}${u.search}` } + if (fallbackURL) { + const u2 = new URL(fallbackURL) + if ( + Settings.filestoreDomainOveride && + u2.host !== Settings.apis.clsiPerf.host + ) { + fallbackURL = `${Settings.filestoreDomainOveride}${u2.pathname}${u2.search}` + } + } - const stream = await fetchStream(url, { - signal: AbortSignal.timeout(60 * 1000), - // provide a function to get the agent for each request - // as there may be multiple requests with different protocols - // due to redirects. - agent: _url => (_url.protocol === 'https:' ? httpsAgent : httpAgent), - }) + let stream + try { + stream = await fetchStream(url, { + signal: AbortSignal.timeout(60 * 1000), + // provide a function to get the agent for each request + // as there may be multiple requests with different protocols + // due to redirects. + agent: _url => (_url.protocol === 'https:' ? httpsAgent : httpAgent), + }) + } catch (err) { + if ( + fallbackURL && + err instanceof RequestFailedError && + err.response.status === 404 + ) { + stream = await fetchStream(fallbackURL, { + signal: AbortSignal.timeout(60 * 1000), + // provide a function to get the agent for each request + // as there may be multiple requests with different protocols + // due to redirects. + agent: _url => (_url.protocol === 'https:' ? httpsAgent : httpAgent), + }) + url = fallbackURL + } else { + throw err + } + } + + const source = inferSource(url) + Metrics.inc('url_source', 1, { path: source }) const atomicWrite = filePath + '~' try { const output = fs.createWriteStream(atomicWrite) await pipeline(stream, output) await fs.promises.rename(atomicWrite, filePath) - Metrics.count('UrlFetcher.downloaded_bytes', output.bytesWritten) + Metrics.count('UrlFetcher.downloaded_bytes', output.bytesWritten, { + path: source, + }) } catch (err) { try { await fs.promises.unlink(atomicWrite) @@ -68,6 +103,20 @@ async function pipeUrlToFile(url, filePath) { } } +const BUCKET_REGEX = /\/bucket\/([^/]+)\/key\// + +function inferSource(url) { + if (url.includes(Settings.apis.clsiPerf.host)) { + return 'clsi-perf' + } else if (url.includes('/project/') && url.includes('/file/')) { + return 'user-files' + } else if (url.includes('/key/')) { + const match = url.match(BUCKET_REGEX) + if (match) return match[1] + } + return 'unknown' +} + module.exports.promises = { pipeUrlToFileWithRetry, } diff --git a/services/clsi/test/acceptance/js/UrlCachingTests.js b/services/clsi/test/acceptance/js/UrlCachingTests.js index e1f70b655c..9fc9608204 100644 --- a/services/clsi/test/acceptance/js/UrlCachingTests.js +++ b/services/clsi/test/acceptance/js/UrlCachingTests.js @@ -14,6 +14,8 @@ const Path = require('node:path') const Client = require('./helpers/Client') const sinon = require('sinon') const ClsiApp = require('./helpers/ClsiApp') +const request = require('request') +const Settings = require('@overleaf/settings') const Server = { run() { @@ -35,6 +37,21 @@ const Server = { } }) + app.get('/not-found', (req, res, next) => { + this.getFile(req.url) + res.status(404).end() + }) + + app.get('/project/:projectId/file/:fileId', (req, res, next) => { + this.getFile(req.url) + return res.send(`${req.params.projectId}:${req.params.fileId}`) + }) + + app.get('/bucket/:bucket/key/*', (req, res, next) => { + this.getFile(req.url) + return res.send(`${req.params.bucket}:${req.params[0]}`) + }) + app.get('/:random_id/*', (req, res, next) => { this.getFile(req.url) req.url = `/${req.params[0]}` @@ -218,9 +235,24 @@ describe('Url Caching', function () { return Server.getFile.restore() }) - return it('should not download the image again', function () { + it('should not download the image again', function () { return Server.getFile.called.should.equal(false) }) + + it('should gather metrics', function (done) { + request.get(`${Settings.apis.clsi.url}/metrics`, (err, res, body) => { + if (err) return done(err) + body + .split('\n') + .some(line => { + return ( + line.startsWith('url_source') && line.includes('path="unknown"') + ) + }) + .should.equal(true) + done() + }) + }) }) describe('When an image is in the cache and the last modified date is advanced', function () { @@ -391,7 +423,7 @@ describe('Url Caching', function () { }) }) - return describe('After clearing the cache', function () { + describe('After clearing the cache', function () { before(function (done) { this.project_id = Client.randomId() this.file = `${Server.randomId()}/lion.png` @@ -446,4 +478,140 @@ describe('Url Caching', function () { return Server.getFile.called.should.equal(true) }) }) + + describe('fallbackURL', function () { + describe('when the primary resource is available', function () { + before(function (done) { + this.project_id = Client.randomId() + this.file = `/project/${Server.randomId()}/file/${Server.randomId()}` + this.fallback = `/bucket/project-blobs/key/ab/cd/${Server.randomId()}` + this.request = { + resources: [ + { + path: 'main.tex', + content: `\ +\\documentclass{article} +\\usepackage{graphicx} +\\begin{document} +\\includegraphics{lion.png} +\\end{document}\ +`, + }, + { + path: 'lion.png', + url: `http://filestore${this.file}`, + fallbackURL: `http://filestore${this.fallback}`, + }, + ], + } + + sinon.spy(Server, 'getFile') + return ClsiApp.ensureRunning(() => { + return Client.compile( + this.project_id, + this.request, + (error, res, body) => { + this.error = error + this.res = res + this.body = body + return done() + } + ) + }) + }) + + after(function () { + return Server.getFile.restore() + }) + + it('should download from the primary', function () { + Server.getFile.calledWith(this.file).should.equal(true) + }) + it('should not download from the fallback', function () { + Server.getFile.calledWith(this.fallback).should.equal(false) + }) + + it('should gather metrics', function (done) { + request.get(`${Settings.apis.clsi.url}/metrics`, (err, res, body) => { + if (err) return done(err) + body + .split('\n') + .some(line => { + return ( + line.startsWith('url_source') && + line.includes('path="user-files"') + ) + }) + .should.equal(true) + done() + }) + }) + }) + + describe('when the primary resource is not available', function () { + before(function (done) { + this.project_id = Client.randomId() + this.file = `/project/${Server.randomId()}/file/${Server.randomId()}` + this.fallback = `/bucket/project-blobs/key/ab/cd/${Server.randomId()}` + this.request = { + resources: [ + { + path: 'main.tex', + content: `\ +\\documentclass{article} +\\usepackage{graphicx} +\\begin{document} +\\includegraphics{lion.png} +\\end{document}\ +`, + }, + { + path: 'lion.png', + url: `http://filestore/not-found`, + fallbackURL: `http://filestore${this.fallback}`, + }, + ], + } + + sinon.spy(Server, 'getFile') + return ClsiApp.ensureRunning(() => { + return Client.compile( + this.project_id, + this.request, + (error, res, body) => { + this.error = error + this.res = res + this.body = body + return done() + } + ) + }) + }) + + after(function () { + return Server.getFile.restore() + }) + + it('should download from the fallback', function () { + Server.getFile.calledWith(`/not-found`).should.equal(true) + Server.getFile.calledWith(this.fallback).should.equal(true) + }) + + it('should gather metrics', function (done) { + request.get(`${Settings.apis.clsi.url}/metrics`, (err, res, body) => { + if (err) return done(err) + body + .split('\n') + .some(line => { + return ( + line.startsWith('url_source') && + line.includes('path="project-blobs"') + ) + }) + .should.equal(true) + done() + }) + }) + }) + }) }) diff --git a/services/clsi/test/unit/js/ResourceWriterTests.js b/services/clsi/test/unit/js/ResourceWriterTests.js index c7db8b98f2..c2e09ce9cf 100644 --- a/services/clsi/test/unit/js/ResourceWriterTests.js +++ b/services/clsi/test/unit/js/ResourceWriterTests.js @@ -378,12 +378,13 @@ describe('ResourceWriter', function () { this.fs.mkdir = sinon.stub().callsArg(2) this.resource = { path: 'main.tex', - url: 'http://www.example.com/main.tex', + url: 'http://www.example.com/primary/main.tex', + fallbackURL: 'http://fallback.example.com/fallback/main.tex', modified: Date.now(), } this.UrlCache.downloadUrlToFile = sinon .stub() - .callsArgWith(4, 'fake error downloading file') + .callsArgWith(5, 'fake error downloading file') return this.ResourceWriter._writeResourceToDisk( this.project_id, this.resource, @@ -405,6 +406,7 @@ describe('ResourceWriter', function () { .calledWith( this.project_id, this.resource.url, + this.resource.fallbackURL, path.join(this.basePath, this.resource.path), this.resource.modified ) diff --git a/services/clsi/test/unit/js/UrlCacheTests.js b/services/clsi/test/unit/js/UrlCacheTests.js index 7a3a8f6f00..a3dc2fac3c 100644 --- a/services/clsi/test/unit/js/UrlCacheTests.js +++ b/services/clsi/test/unit/js/UrlCacheTests.js @@ -23,6 +23,7 @@ describe('UrlCache', function () { this.callback = sinon.stub() this.url = 'http://filestore/project/60b0dd39c418bc00598a0d22/file/60ae721ffb1d920027d3201f' + this.fallbackURL = 'http://filestore/bucket/project-blobs/key/ab/cd/ef' this.project_id = '60b0dd39c418bc00598a0d22' return (this.UrlCache = SandboxedModule.require(modulePath, { requires: { @@ -54,6 +55,29 @@ describe('UrlCache', function () { this.UrlCache.downloadUrlToFile( this.project_id, this.url, + this.fallbackURL, + this.destPath, + this.lastModified, + error => { + expect(error).to.not.exist + expect( + this.UrlFetcher.promises.pipeUrlToFileWithRetry.called + ).to.equal(false) + done() + } + ) + }) + + it('should not download on the semi-happy path', function (done) { + const codedError = new Error() + codedError.code = 'ENOENT' + this.fs.promises.copyFile.onCall(0).rejects(codedError) + this.fs.promises.copyFile.onCall(1).resolves() + + this.UrlCache.downloadUrlToFile( + this.project_id, + this.url, + this.fallbackURL, this.destPath, this.lastModified, error => { @@ -70,11 +94,13 @@ describe('UrlCache', function () { const codedError = new Error() codedError.code = 'ENOENT' this.fs.promises.copyFile.onCall(0).rejects(codedError) - this.fs.promises.copyFile.onCall(1).resolves() + this.fs.promises.copyFile.onCall(1).rejects(codedError) + this.fs.promises.copyFile.onCall(2).resolves() this.UrlCache.downloadUrlToFile( this.project_id, this.url, + this.fallbackURL, this.destPath, this.lastModified, error => { @@ -94,6 +120,7 @@ describe('UrlCache', function () { this.UrlCache.downloadUrlToFile( this.project_id, this.url, + this.fallbackURL, this.destPath, this.lastModified, error => { diff --git a/services/filestore/config/settings.defaults.js b/services/filestore/config/settings.defaults.js index a926115b5c..9a08bb197e 100644 --- a/services/filestore/config/settings.defaults.js +++ b/services/filestore/config/settings.defaults.js @@ -73,6 +73,10 @@ const settings = { stores: { user_files: process.env.USER_FILES_BUCKET_NAME, template_files: process.env.TEMPLATE_FILES_BUCKET_NAME, + + // allow signed links to be generated for these buckets + project_blobs: process.env.OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET, + global_blobs: process.env.OVERLEAF_EDITOR_BLOBS_BUCKET, }, fallback: process.env.FALLBACK_BACKEND diff --git a/services/history-v1/storage/lib/project_key.js b/services/history-v1/storage/lib/project_key.js index d4b03783aa..03fb2a5141 100644 --- a/services/history-v1/storage/lib/project_key.js +++ b/services/history-v1/storage/lib/project_key.js @@ -1,3 +1,4 @@ +// Keep in sync with services/web/app/src/Features/History/project_key.js const _ = require('lodash') const path = require('node:path') diff --git a/services/web/app.mjs b/services/web/app.mjs index 418d0582cd..528f2c079b 100644 --- a/services/web/app.mjs +++ b/services/web/app.mjs @@ -6,6 +6,7 @@ import metrics from '@overleaf/metrics' import Settings from '@overleaf/settings' import logger from '@overleaf/logger' import PlansLocator from './app/src/Features/Subscription/PlansLocator.js' +import HistoryManager from './app/src/Features/History/HistoryManager.js' import SiteAdminHandler from './app/src/infrastructure/SiteAdminHandler.js' import http from 'node:http' import https from 'node:https' @@ -58,7 +59,11 @@ if (process.argv[1] === fileURLToPath(import.meta.url)) { PlansLocator.ensurePlansAreSetupCorrectly() - Promise.all([mongodb.connectionPromise, mongoose.connectionPromise]) + Promise.all([ + mongodb.connectionPromise, + mongoose.connectionPromise, + HistoryManager.promises.loadGlobalBlobs(), + ]) .then(async () => { Server.server.listen(port, host, function () { logger.debug(`web starting up, listening on ${host}:${port}`) @@ -76,7 +81,11 @@ if (process.argv[1] === fileURLToPath(import.meta.url)) { } // initialise site admin tasks -Promise.all([mongodb.connectionPromise, mongoose.connectionPromise]) +Promise.all([ + mongodb.connectionPromise, + mongoose.connectionPromise, + HistoryManager.promises.loadGlobalBlobs(), +]) .then(() => SiteAdminHandler.initialise()) .catch(err => { logger.fatal({ err }, 'Cannot connect to mongo. Exiting.') diff --git a/services/web/app/src/Features/Compile/ClsiManager.js b/services/web/app/src/Features/Compile/ClsiManager.js index 7b6f9d3e4e..0b5156bfa2 100644 --- a/services/web/app/src/Features/Compile/ClsiManager.js +++ b/services/web/app/src/Features/Compile/ClsiManager.js @@ -24,6 +24,7 @@ const ClsiFormatChecker = require('./ClsiFormatChecker') const DocumentUpdaterHandler = require('../DocumentUpdater/DocumentUpdaterHandler') const Metrics = require('@overleaf/metrics') const Errors = require('../Errors/Errors') +const { getBlobLocation } = require('../History/HistoryManager') const VALID_COMPILERS = ['pdflatex', 'latex', 'xelatex', 'lualatex'] const OUTPUT_FILE_TIMEOUT_MS = 60000 @@ -532,6 +533,7 @@ async function _buildRequest(projectId, options) { rootDoc_id: 1, imageName: 1, rootFolder: 1, + 'overleaf.history.id': 1, }) if (project == null) { throw new Errors.NotFoundError(`project does not exist: ${projectId}`) @@ -731,12 +733,18 @@ function _finaliseRequest(projectId, options, project, docs, files) { } } + const historyId = project.overleaf.history.id + if (!historyId) { + throw new OError('project does not have a history id', { projectId }) + } for (let path in files) { const file = files[path] path = path.replace(/^\//, '') // Remove leading / + const { bucket, key } = getBlobLocation(historyId, file.hash) resources.push({ path, - url: `${Settings.apis.filestore.url}/project/${project._id}/file/${file._id}`, + url: `${Settings.apis.filestore.url}/bucket/${bucket}/key/${key}`, + fallbackURL: `${Settings.apis.filestore.url}/project/${project._id}/file/${file._id}`, modified: file.created?.getTime(), }) } diff --git a/services/web/app/src/Features/History/HistoryManager.js b/services/web/app/src/Features/History/HistoryManager.js index 707771ca0b..78730c4782 100644 --- a/services/web/app/src/Features/History/HistoryManager.js +++ b/services/web/app/src/Features/History/HistoryManager.js @@ -11,10 +11,46 @@ const OError = require('@overleaf/o-error') const UserGetter = require('../User/UserGetter') const ProjectGetter = require('../Project/ProjectGetter') const HistoryBackupDeletionHandler = require('./HistoryBackupDeletionHandler') -const { ObjectId } = require('../../infrastructure/mongodb') +const { db, ObjectId } = require('../../infrastructure/mongodb') const Metrics = require('@overleaf/metrics') const logger = require('@overleaf/logger') const { NotFoundError } = require('../Errors/Errors') +const projectKey = require('./project_key') + +// BEGIN copy from services/history-v1/storage/lib/blob_store/index.js + +const GLOBAL_BLOBS = new Set() // CHANGE FROM SOURCE: only store hashes. + +function makeGlobalKey(hash) { + return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}` +} + +function makeProjectKey(projectId, hash) { + return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}` +} + +function getBlobLocation(projectId, hash) { + if (GLOBAL_BLOBS.has(hash)) { + return { + bucket: settings.apis.v1_history.buckets.globalBlobs, + key: makeGlobalKey(hash), + } + } else { + return { + bucket: settings.apis.v1_history.buckets.projectBlobs, + key: makeProjectKey(projectId, hash), + } + } +} + +async function loadGlobalBlobs() { + const blobs = db.projectHistoryGlobalBlobs.find() + for await (const blob of blobs) { + GLOBAL_BLOBS.add(blob._id) // CHANGE FROM SOURCE: only store hashes. + } +} + +// END copy from services/history-v1/storage/lib/blob_store/index.js async function initializeProject(projectId) { const body = await fetchJson(`${settings.apis.project_history.url}/project`, { @@ -357,6 +393,7 @@ function _userView(user) { } module.exports = { + getBlobLocation, initializeProject: callbackify(initializeProject), flushProject: callbackify(flushProject), resyncProject: callbackify(resyncProject), @@ -368,6 +405,7 @@ module.exports = { copyBlob: callbackify(copyBlob), requestBlobWithFallback: callbackify(requestBlobWithFallback), promises: { + loadGlobalBlobs, initializeProject, flushProject, resyncProject, diff --git a/services/web/app/src/Features/History/project_key.js b/services/web/app/src/Features/History/project_key.js new file mode 100644 index 0000000000..a4722db09a --- /dev/null +++ b/services/web/app/src/Features/History/project_key.js @@ -0,0 +1,24 @@ +// Keep in sync with services/history-v1/storage/lib/project_key.js +const _ = require('lodash') +const path = require('node:path') + +// +// The advice in http://docs.aws.amazon.com/AmazonS3/latest/dev/ +// request-rate-perf-considerations.html is to avoid sequential key prefixes, +// so we reverse the project ID part of the key as they suggest. +// +function format(projectId) { + const prefix = naiveReverse(pad(projectId)) + return path.join(prefix.slice(0, 3), prefix.slice(3, 6), prefix.slice(6)) +} + +function pad(number) { + return _.padStart(number, 9, '0') +} + +function naiveReverse(string) { + return string.split('').reverse().join('') +} + +exports.format = format +exports.pad = pad diff --git a/services/web/app/src/infrastructure/mongodb.js b/services/web/app/src/infrastructure/mongodb.js index acc7f4781b..9309054d16 100644 --- a/services/web/app/src/infrastructure/mongodb.js +++ b/services/web/app/src/infrastructure/mongodb.js @@ -68,6 +68,7 @@ const db = { projectAuditLogEntries: internalDb.collection('projectAuditLogEntries'), projectHistoryChunks: internalDb.collection('projectHistoryChunks'), projectHistoryFailures: internalDb.collection('projectHistoryFailures'), + projectHistoryGlobalBlobs: internalDb.collection('projectHistoryGlobalBlobs'), projectHistoryLabels: internalDb.collection('projectHistoryLabels'), projectHistoryMetaData: internalDb.collection('projectHistoryMetaData'), projectHistorySyncState: internalDb.collection('projectHistorySyncState'), diff --git a/services/web/config/settings.defaults.js b/services/web/config/settings.defaults.js index 0625a31dfd..33f65c97e3 100644 --- a/services/web/config/settings.defaults.js +++ b/services/web/config/settings.defaults.js @@ -275,6 +275,23 @@ module.exports = { process.env.HAVE_I_BEEN_PWNED_URL || 'https://api.pwnedpasswords.com', timeout: parseInt(process.env.HAVE_I_BEEN_PWNED_TIMEOUT, 10) || 5 * 1000, }, + v1_history: { + url: + process.env.V1_HISTORY_URL || + `http://${process.env.V1_HISTORY_HOST || '127.0.0.1'}:${ + process.env.V1_HISTORY_PORT || '3100' + }/api`, + user: process.env.V1_HISTORY_USER || 'staging', + pass: + process.env.V1_HISTORY_PASS || + process.env.V1_HISTORY_PASSWORD || + 'password', + + buckets: { + globalBlobs: process.env.OVERLEAF_EDITOR_BLOBS_BUCKET, + projectBlobs: process.env.OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET, + }, + }, // For legacy reasons, we need to populate the below objects. v1: {}, diff --git a/services/web/test/unit/src/Compile/ClsiManagerTests.js b/services/web/test/unit/src/Compile/ClsiManagerTests.js index a962ce2dc8..cb532acdef 100644 --- a/services/web/test/unit/src/Compile/ClsiManagerTests.js +++ b/services/web/test/unit/src/Compile/ClsiManagerTests.js @@ -9,6 +9,8 @@ const FILESTORE_URL = 'http://filestore.example.com' const CLSI_HOST = 'clsi.example.com' const MODULE_PATH = '../../../../app/src/Features/Compile/ClsiManager.js' +const GLOBAL_BLOB_HASH = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + describe('ClsiManager', function () { beforeEach(function () { this.user_id = 'user-id' @@ -17,6 +19,7 @@ describe('ClsiManager', function () { compiler: 'latex', rootDoc_id: 'mock-doc-id-1', imageName: 'mock-image-name', + overleaf: { history: { id: 42 } }, } this.docs = { '/main.tex': { @@ -31,10 +34,17 @@ describe('ClsiManager', function () { }, } this.files = { - '/images/image.png': { - name: 'image.png', + '/images/frog.png': { + name: 'frog.png', _id: 'mock-file-id-1', created: new Date(), + hash: GLOBAL_BLOB_HASH, + }, + '/images/image.png': { + name: 'image.png', + _id: 'mock-file-id-2', + created: new Date(), + hash: 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb', }, } this.clsiCookieKey = 'clsiserver' @@ -129,6 +139,17 @@ describe('ClsiManager', function () { enablePdfCaching: true, clsiCookie: { key: 'clsiserver' }, } + this.HistoryManager = { + getBlobLocation: sinon.stub().callsFake((historyId, hash) => { + if (hash === GLOBAL_BLOB_HASH) { + return { + bucket: 'global-blobs', + key: 'aa/aa/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', + } + } + return { bucket: 'project-blobs', key: `${historyId}/${hash}` } + }), + } this.ClsiManager = SandboxedModule.require(MODULE_PATH, { requires: { @@ -145,6 +166,7 @@ describe('ClsiManager', function () { '@overleaf/fetch-utils': this.FetchUtils, './ClsiFormatChecker': this.ClsiFormatChecker, '@overleaf/metrics': this.Metrics, + '../History/HistoryManager': this.HistoryManager, }, }) tk.freeze(Date.now()) @@ -238,6 +260,7 @@ describe('ClsiManager', function () { rootDoc_id: 1, imageName: 1, rootFolder: 1, + 'overleaf.history.id': 1, } ) }) @@ -372,6 +395,7 @@ describe('ClsiManager', function () { rootDoc_id: 1, imageName: 1, rootFolder: 1, + 'overleaf.history.id': 1, } ) }) @@ -1003,9 +1027,16 @@ function _makeResources(project, docs, files) { }) } for (const [path, file] of Object.entries(files)) { + let url + if (file.hash === GLOBAL_BLOB_HASH) { + url = `${FILESTORE_URL}/bucket/global-blobs/key/aa/aa/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa` + } else { + url = `${FILESTORE_URL}/bucket/project-blobs/key/${project.overleaf.history.id}/${file.hash}` + } resources.push({ path: path.replace(/^\//, ''), - url: `${FILESTORE_URL}/project/${project._id}/file/${file._id}`, + url, + fallbackURL: `${FILESTORE_URL}/project/${project._id}/file/${file._id}`, modified: file.created.getTime(), }) }