From 7a57ef00cbcb73e6b91d964aec0eccbdfa646216 Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Fri, 5 Dec 2025 08:53:54 +0000 Subject: [PATCH] Merge pull request #30045 from overleaf/bg-object-persistor-make-list-directory-safer Improve safety of object persistor GitOrigin-RevId: bced9814de6613b388ca288a5f72cd42cff6c1d3 --- .../object-persistor/src/AbstractPersistor.js | 17 ++ libraries/object-persistor/src/FSPersistor.js | 41 +++- .../object-persistor/src/GcsPersistor.js | 11 + .../src/PerProjectEncryptedS3Persistor.js | 16 +- libraries/object-persistor/src/S3Persistor.js | 57 ++++- .../test/unit/FSPersistorTests.js | 50 +++++ .../test/unit/GcsPersistorTests.js | 123 ++++++++++ .../test/unit/S3PersistorTests.js | 211 ++++++++++++++++++ .../history-v1/storage/lib/backupArchiver.mjs | 38 ++-- .../history-v1/storage/scripts/backup.mjs | 26 ++- 10 files changed, 550 insertions(+), 40 deletions(-) diff --git a/libraries/object-persistor/src/AbstractPersistor.js b/libraries/object-persistor/src/AbstractPersistor.js index 519fc7f954..f64d9bae64 100644 --- a/libraries/object-persistor/src/AbstractPersistor.js +++ b/libraries/object-persistor/src/AbstractPersistor.js @@ -177,4 +177,21 @@ module.exports = class AbstractPersistor { prefix, }) } + + /** + * List objects in a directory, returning key and size information. + * + * Suitable only for directories where the number of keys is known to be small. + * + * @param {string} location + * @param {string} prefix + * @returns {Promise>} + */ + async listDirectoryStats(location, prefix) { + throw new NotImplementedError('method not implemented in persistor', { + method: 'listDirectoryStats', + location, + prefix, + }) + } } diff --git a/libraries/object-persistor/src/FSPersistor.js b/libraries/object-persistor/src/FSPersistor.js index 549a9a8c68..df0cca68a4 100644 --- a/libraries/object-persistor/src/FSPersistor.js +++ b/libraries/object-persistor/src/FSPersistor.js @@ -196,7 +196,46 @@ module.exports = class FSPersistor extends AbstractPersistor { async listDirectoryKeys(location, name) { const fsPath = this._getFsPath(location, name) - return await this._listDirectory(fsPath) + const paths = await this._listDirectory(fsPath) + + // Filter to only return files, not directories + const files = [] + for (const path of paths) { + try { + const stat = await fsPromises.stat(path) + if (stat.isFile()) { + files.push(path) + } + } catch (err) { + // ignore files that may have just been deleted + if (err.code !== 'ENOENT') { + throw err + } + } + } + return files + } + + async listDirectoryStats(location, name) { + const fsPath = this._getFsPath(location, name) + const paths = await this._listDirectory(fsPath) + + // Filter to only return files, not directories, with their sizes + const stats = [] + for (const path of paths) { + try { + const stat = await fsPromises.stat(path) + if (stat.isFile()) { + stats.push({ key: path, size: stat.size }) + } + } catch (err) { + // ignore files that may have just been deleted + if (err.code !== 'ENOENT') { + throw err + } + } + } + return stats } async checkIfObjectExists(location, name) { diff --git a/libraries/object-persistor/src/GcsPersistor.js b/libraries/object-persistor/src/GcsPersistor.js index c90978eff7..c9dac47d7e 100644 --- a/libraries/object-persistor/src/GcsPersistor.js +++ b/libraries/object-persistor/src/GcsPersistor.js @@ -322,6 +322,17 @@ module.exports = class GcsPersistor extends AbstractPersistor { return files.map(file => file.name) } + async listDirectoryStats(bucketName, prefix) { + const files = await this.#listDirectory( + bucketName, + ensurePrefixIsDirectory(prefix) + ) + return files.map(file => ({ + key: file.name, + size: parseInt(file.metadata.size, 10), + })) + } + async checkIfObjectExists(bucketName, key) { try { const [response] = await this.storage diff --git a/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js b/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js index 946ff0d480..a14c5eda2d 100644 --- a/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js +++ b/libraries/object-persistor/src/PerProjectEncryptedS3Persistor.js @@ -417,10 +417,20 @@ class CachedPerProjectEncryptedS3Persistor { * * @param {string} bucketName * @param {string} path - * @return {Promise} + * @return {Promise} */ - async listDirectory(bucketName, path) { - return await this.#parent.listDirectory(bucketName, path) + async listDirectoryKeys(bucketName, path) { + return await this.#parent.listDirectoryKeys(bucketName, path) + } + + /** + * + * @param {string} bucketName + * @param {string} path + * @return {Promise>} + */ + async listDirectoryStats(bucketName, path) { + return await this.#parent.listDirectoryStats(bucketName, path) } /** diff --git a/libraries/object-persistor/src/S3Persistor.js b/libraries/object-persistor/src/S3Persistor.js index eac758d760..ac0075c718 100644 --- a/libraries/object-persistor/src/S3Persistor.js +++ b/libraries/object-persistor/src/S3Persistor.js @@ -266,7 +266,7 @@ class S3Persistor extends AbstractPersistor { * @return {Promise} */ async deleteDirectory(bucketName, key, continuationToken) { - const { contents, response } = await this.listDirectory( + const { contents, response } = await this.#listDirectory( bucketName, key, continuationToken @@ -309,7 +309,7 @@ class S3Persistor extends AbstractPersistor { * @param {string} [continuationToken] * @return {Promise} */ - async listDirectory(bucketName, key, continuationToken) { + async #listDirectory(bucketName, key, continuationToken) { let response const options = { Bucket: bucketName, Prefix: key } if (continuationToken) { @@ -535,6 +535,59 @@ class S3Persistor extends AbstractPersistor { } } + /** + * @param {string} bucketName + * @param {string} prefix + * @return {Promise>} + */ + async listDirectoryKeys(bucketName, prefix) { + const keys = [] + let continuationToken + + do { + const { contents, response } = await this.#listDirectory( + bucketName, + prefix, + continuationToken + ) + keys.push(...contents.map(item => item.Key || '')) + continuationToken = response.IsTruncated + ? response.NextContinuationToken + : undefined + } while (continuationToken) + + return keys + } + + /** + * @param {string} bucketName + * @param {string} prefix + * @return {Promise>} + */ + async listDirectoryStats(bucketName, prefix) { + const stats = [] + let continuationToken + + do { + const { contents, response } = await this.#listDirectory( + bucketName, + prefix, + continuationToken + ) + stats.push( + ...contents.map(item => ({ + key: item.Key || '', + size: item.Size ?? -1, + })) + ) + continuationToken = response.IsTruncated + ? response.NextContinuationToken + : undefined + } while (continuationToken) + + return stats + } + /** * @param {string} bucket * @return {S3Client} diff --git a/libraries/object-persistor/test/unit/FSPersistorTests.js b/libraries/object-persistor/test/unit/FSPersistorTests.js index dc7a5c50d7..386878930c 100644 --- a/libraries/object-persistor/test/unit/FSPersistorTests.js +++ b/libraries/object-persistor/test/unit/FSPersistorTests.js @@ -400,6 +400,56 @@ describe('FSPersistorTests', function () { ).to.equal(0) }) }) + + describe('listDirectoryKeys', function () { + beforeEach(async function () { + for (const file of Object.values(files)) { + await persistor.sendFile(location, file, '/uploads/info.txt') + } + }) + + it('should list directory keys', async function () { + const keys = await persistor.listDirectoryKeys(location, 'animals') + expect(keys).to.have.lengthOf(2) + expect(keys).to.include(scenario.fsPath(files.wombat)) + expect(keys).to.include(scenario.fsPath(files.giraffe)) + }) + + it('should return empty array for non-existing directories', async function () { + const keys = await persistor.listDirectoryKeys( + location, + 'does-not-exist' + ) + expect(keys).to.deep.equal([]) + }) + }) + + describe('listDirectoryStats', function () { + beforeEach(async function () { + for (const file of Object.values(files)) { + await persistor.sendFile(location, file, '/uploads/info.txt') + } + }) + + it('should list directory stats', async function () { + const stats = await persistor.listDirectoryStats(location, 'animals') + expect(stats).to.have.lengthOf(2) + const keys = stats.map(s => s.key) + expect(keys).to.include(scenario.fsPath(files.wombat)) + expect(keys).to.include(scenario.fsPath(files.giraffe)) + for (const stat of stats) { + expect(stat.size).to.equal(localFiles['/uploads/info.txt'].length) + } + }) + + it('should return empty array for non-existing directories', async function () { + const stats = await persistor.listDirectoryStats( + location, + 'does-not-exist' + ) + expect(stats).to.deep.equal([]) + }) + }) }) } }) diff --git a/libraries/object-persistor/test/unit/GcsPersistorTests.js b/libraries/object-persistor/test/unit/GcsPersistorTests.js index 16a42c772c..527e49594d 100644 --- a/libraries/object-persistor/test/unit/GcsPersistorTests.js +++ b/libraries/object-persistor/test/unit/GcsPersistorTests.js @@ -723,4 +723,127 @@ describe('GcsPersistorTests', function () { }) }) }) + + describe('listDirectoryKeys', function () { + describe('with valid parameters', function () { + let keys + + beforeEach(async function () { + const filesWithNames = files.map((file, i) => ({ + ...file, + name: i === 0 ? 'llama' : 'hippo', + })) + GcsBucket.getFiles.resolves([filesWithNames]) + keys = await GcsPersistor.listDirectoryKeys(bucket, key) + }) + + it('should list the objects in the directory', function () { + expect(Storage.prototype.bucket).to.have.been.calledWith(bucket) + expect(GcsBucket.getFiles).to.have.been.calledWith({ + prefix: `${key}/`, + }) + }) + + it('should return the keys', function () { + expect(keys).to.deep.equal(['llama', 'hippo']) + }) + }) + + describe('when there are no files', function () { + let keys + + beforeEach(async function () { + GcsBucket.getFiles.resolves([[]]) + keys = await GcsPersistor.listDirectoryKeys(bucket, key) + }) + + it('should return an empty array', function () { + expect(keys).to.deep.equal([]) + }) + }) + + describe('when there is an error listing the objects', function () { + let error + + beforeEach(async function () { + GcsBucket.getFiles.rejects(genericError) + try { + await GcsPersistor.listDirectoryKeys(bucket, key) + } catch (err) { + error = err + } + }) + + it('should generate a ReadError', function () { + expect(error).to.be.an.instanceOf(Errors.ReadError) + }) + + it('should wrap the error', function () { + expect(error.cause).to.equal(genericError) + }) + }) + }) + + describe('listDirectoryStats', function () { + describe('with valid parameters', function () { + let stats + + beforeEach(async function () { + const filesWithNames = files.map((file, i) => ({ + ...file, + name: i === 0 ? 'llama' : 'hippo', + })) + GcsBucket.getFiles.resolves([filesWithNames]) + stats = await GcsPersistor.listDirectoryStats(bucket, key) + }) + + it('should list the objects in the directory', function () { + expect(Storage.prototype.bucket).to.have.been.calledWith(bucket) + expect(GcsBucket.getFiles).to.have.been.calledWith({ + prefix: `${key}/`, + }) + }) + + it('should return the stats', function () { + expect(stats).to.deep.equal([ + { key: 'llama', size: 11 }, + { key: 'hippo', size: 22 }, + ]) + }) + }) + + describe('when there are no files', function () { + let stats + + beforeEach(async function () { + GcsBucket.getFiles.resolves([[]]) + stats = await GcsPersistor.listDirectoryStats(bucket, key) + }) + + it('should return an empty array', function () { + expect(stats).to.deep.equal([]) + }) + }) + + describe('when there is an error listing the objects', function () { + let error + + beforeEach(async function () { + GcsBucket.getFiles.rejects(genericError) + try { + await GcsPersistor.listDirectoryStats(bucket, key) + } catch (err) { + error = err + } + }) + + it('should generate a ReadError', function () { + expect(error).to.be.an.instanceOf(Errors.ReadError) + }) + + it('should wrap the error', function () { + expect(error.cause).to.equal(genericError) + }) + }) + }) }) diff --git a/libraries/object-persistor/test/unit/S3PersistorTests.js b/libraries/object-persistor/test/unit/S3PersistorTests.js index 944ac4504d..f586cf7735 100644 --- a/libraries/object-persistor/test/unit/S3PersistorTests.js +++ b/libraries/object-persistor/test/unit/S3PersistorTests.js @@ -1046,4 +1046,215 @@ describe('S3PersistorTests', function () { }) }) }) + + describe('listDirectoryKeys', function () { + describe('with valid parameters', function () { + let keys + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, { + Contents: files.map(file => ({ Key: file.Key })), + IsTruncated: false, + }) + keys = await S3Persistor.listDirectoryKeys(bucket, key) + }) + + it('should list the objects in the directory', function () { + S3.assertSendCalledWith(S3.ListObjectsV2Command, { + Bucket: bucket, + Prefix: key, + }) + }) + + it('should return the keys', function () { + expect(keys).to.deep.equal(['llama', 'hippo']) + }) + }) + + describe('when there are no files', function () { + let keys + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, { + Contents: [], + IsTruncated: false, + }) + keys = await S3Persistor.listDirectoryKeys(bucket, key) + }) + + it('should return an empty array', function () { + expect(keys).to.deep.equal([]) + }) + }) + + describe('when there are more files available', function () { + const continuationToken = 'wombat' + let keys + + beforeEach(async function () { + S3.mockSend( + S3.ListObjectsV2Command, + { + Contents: [files[0]].map(file => ({ Key: file.Key })), + IsTruncated: true, + NextContinuationToken: continuationToken, + }, + { + nextResponses: [ + { + Contents: [files[1]].map(file => ({ Key: file.Key })), + IsTruncated: false, + }, + ], + } + ) + keys = await S3Persistor.listDirectoryKeys(bucket, key) + }) + + it('should list the objects in multiple calls', function () { + S3.assertSendCallCount(S3.ListObjectsV2Command, 2) + expect(S3.s3ClientStub.send.firstCall.args[0].payload).to.deep.equal({ + Bucket: bucket, + Prefix: key, + }) + expect(S3.s3ClientStub.send.secondCall.args[0].payload).to.deep.equal({ + Bucket: bucket, + Prefix: key, + ContinuationToken: continuationToken, + }) + }) + + it('should return all keys', function () { + expect(keys).to.deep.equal(['llama', 'hippo']) + }) + }) + + describe('when there is an error listing the objects', function () { + let error + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, genericError, { rejects: true }) + try { + await S3Persistor.listDirectoryKeys(bucket, key) + } catch (err) { + error = err + } + }) + + it('should generate a ReadError', function () { + expect(error).to.be.an.instanceOf(Errors.ReadError) + }) + + it('should wrap the error', function () { + expect(error.cause).to.exist + }) + }) + }) + + describe('listDirectoryStats', function () { + describe('with valid parameters', function () { + let stats + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, { + Contents: files.map(file => ({ Key: file.Key, Size: file.Size })), + IsTruncated: false, + }) + stats = await S3Persistor.listDirectoryStats(bucket, key) + }) + + it('should list the objects in the directory', function () { + S3.assertSendCalledWith(S3.ListObjectsV2Command, { + Bucket: bucket, + Prefix: key, + }) + }) + + it('should return the stats', function () { + expect(stats).to.deep.equal([ + { key: 'llama', size: 11 }, + { key: 'hippo', size: 22 }, + ]) + }) + }) + + describe('when there are no files', function () { + let stats + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, { + Contents: [], + IsTruncated: false, + }) + stats = await S3Persistor.listDirectoryStats(bucket, key) + }) + + it('should return an empty array', function () { + expect(stats).to.deep.equal([]) + }) + }) + + describe('when there are more files available', function () { + const continuationToken = 'wombat' + let stats + + beforeEach(async function () { + S3.mockSend( + S3.ListObjectsV2Command, + { + Contents: [files[0]].map(file => ({ + Key: file.Key, + Size: file.Size, + })), + IsTruncated: true, + NextContinuationToken: continuationToken, + }, + { + nextResponses: [ + { + Contents: [files[1]].map(file => ({ + Key: file.Key, + Size: file.Size, + })), + IsTruncated: false, + }, + ], + } + ) + stats = await S3Persistor.listDirectoryStats(bucket, key) + }) + + it('should list the objects in multiple calls', function () { + S3.assertSendCallCount(S3.ListObjectsV2Command, 2) + }) + + it('should return all stats', function () { + expect(stats).to.deep.equal([ + { key: 'llama', size: 11 }, + { key: 'hippo', size: 22 }, + ]) + }) + }) + + describe('when there is an error listing the objects', function () { + let error + + beforeEach(async function () { + S3.mockSend(S3.ListObjectsV2Command, genericError, { rejects: true }) + try { + await S3Persistor.listDirectoryStats(bucket, key) + } catch (err) { + error = err + } + }) + + it('should generate a ReadError', function () { + expect(error).to.be.an.instanceOf(Errors.ReadError) + }) + + it('should wrap the error', function () { + expect(error.cause).to.exist + }) + }) + }) }) diff --git a/services/history-v1/storage/lib/backupArchiver.mjs b/services/history-v1/storage/lib/backupArchiver.mjs index c6f0e3755d..12ee9d8ab2 100644 --- a/services/history-v1/storage/lib/backupArchiver.mjs +++ b/services/history-v1/storage/lib/backupArchiver.mjs @@ -400,28 +400,22 @@ export async function archiveLatestChunk( * @return {Promise} */ async function addRawBlobsToArchive(historyId, archive, projectCache) { - const key = projectKey.format(historyId) - const { contents } = await projectCache.listDirectory(projectBlobsBucket, key) - for (const blobRecord of contents) { - if (!blobRecord.Key) { - logger.debug({ blobRecord }, 'no key') - continue - } - const blobKey = blobRecord.Key + const blobKeys = await projectCache.listDirectoryKeys( + projectBlobsBucket, + projectKey.format(historyId) + ) + for (const key of blobKeys) { try { const stream = await projectCache.getObjectStream( projectBlobsBucket, - blobKey, + key, { autoGunzip: true } ) archive.append(stream, { - name: path.join(historyId, 'blobs', blobKey), + name: path.join(historyId, 'blobs', key), }) } catch (err) { - logger.warn( - { err, path: blobRecord.Key }, - 'Failed to append blob to archive' - ) + logger.warn({ err, path: key }, 'Failed to append blob to archive') } } } @@ -445,24 +439,20 @@ export async function archiveRawProject( ) { const projectCache = await getProjectPersistor(historyId) - const { contents: chunks } = await projectCache.listDirectory( + const chunkKeys = await projectCache.listDirectoryKeys( chunksBucket, projectKey.format(historyId) ) - if (chunks.length === 0) { + if (chunkKeys.length === 0) { throw new Error('No chunks found') } - for (const chunkRecord of chunks) { - if (!chunkRecord.Key) { - logger.debug({ chunkRecord }, 'no key') - continue - } - const chunkId = chunkRecord.Key.split('/').pop() - logger.debug({ chunkId, key: chunkRecord.Key }, 'Processing chunk') + for (const key of chunkKeys) { + const chunkId = key.split('/').pop() + logger.debug({ chunkId, key }, 'Processing chunk') - const { buffer } = await loadChunkByKey(projectCache, chunkRecord.Key) + const { buffer } = await loadChunkByKey(projectCache, key) archive.append(buffer, { name: `${historyId}/chunks/${chunkId}/chunk.json`, diff --git a/services/history-v1/storage/scripts/backup.mjs b/services/history-v1/storage/scripts/backup.mjs index 037a720de1..974f6112c4 100644 --- a/services/history-v1/storage/scripts/backup.mjs +++ b/services/history-v1/storage/scripts/backup.mjs @@ -900,6 +900,11 @@ class BlobComparator { const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/ +/** + * Get a listing of all blobs for a project + * @param {string} historyId - The history ID + * @returns {Promise>} Map of blob hash to blob metadata + */ async function getBlobListing(historyId) { const backupPersistorForProject = await backupPersistor.forProject( projectBlobsBucket, @@ -909,7 +914,7 @@ async function getBlobListing(historyId) { // get the blob listing const projectBlobsPath = projectKey.format(historyId) - const { contents: blobList } = await backupPersistorForProject.listDirectory( + const blobList = await backupPersistorForProject.listDirectoryStats( projectBlobsBucket, projectBlobsPath ) @@ -918,21 +923,22 @@ async function getBlobListing(historyId) { return new Map() } + /** @type {Map} */ const remoteBlobs = new Map() for (const blobRecord of blobList) { - if (!blobRecord.Key) { - logger.debug({ blobRecord }, 'no key') + if (!blobRecord.key || typeof blobRecord.size !== 'number') { + logger.debug({ blobRecord }, 'invalid blob record') continue } - const parts = blobRecord.Key.split('/') + const parts = blobRecord.key.split('/') const hash = parts[3] + parts[4] if (!SHA1_HEX_REGEX.test(hash)) { console.warn(`Invalid SHA1 hash for project ${historyId}: ${hash}`) continue } - remoteBlobs.set(hash, blobRecord) + remoteBlobs.set(hash, { key: blobRecord.key, size: blobRecord.size }) } return remoteBlobs } @@ -1075,26 +1081,26 @@ async function compareBackups(projectId, options, log = console.log) { const blobListEntry = blobsFromListing.get(blob.hash) if (options.fast) { if (blobListEntry) { - if (blob.byteLength === blobListEntry.Size) { + if (blob.byteLength === blobListEntry.size) { // Size matches exactly log( ` ✓ Blob ${blob.hash} exists on remote with expected size (${blob.byteLength} bytes)` ) totalBlobMatches++ continue - } else if (blob.stringLength > 0 && blobListEntry.Size > 0) { + } else if (blob.stringLength > 0 && blobListEntry.size > 0) { // Text file present with compressed size, assume valid as we are in --fast comparison mode const compressionRatio = ( - blobListEntry.Size / blob.byteLength + blobListEntry.size / blob.byteLength ).toFixed(2) log( - ` ✓ Blob ${blob.hash} consistent with compressed data on remote (${blob.byteLength} bytes => ${blobListEntry.Size} bytes, ratio=${compressionRatio})` + ` ✓ Blob ${blob.hash} consistent with compressed data on remote (${blob.byteLength} bytes => ${blobListEntry.size} bytes, ratio=${compressionRatio})` ) totalBlobMatches++ continue } else { log( - ` ✗ Blob ${blob.hash} size mismatch (original: ${blob.byteLength} bytes, stringLength: ${blob.stringLength}, backup: ${blobListEntry.Size} bytes)` + ` ✗ Blob ${blob.hash} size mismatch (original: ${blob.byteLength} bytes, stringLength: ${blob.stringLength}, backup: ${blobListEntry.size} bytes)` ) totalBlobMismatches++ errors.push({