From c368d446091884e5e885c24347aff2b2583dd17d Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Wed, 16 Jul 2025 15:24:16 +0100 Subject: [PATCH] Merge pull request #27147 from overleaf/bg-filestore-migration-for-server-pro-II add support for fetching files via http from filestore in back_fill_file_hash script and tests GitOrigin-RevId: 8dea6383ed6fe9ee6786a5695e2deee93b1cdd84 --- .../storage/scripts/back_fill_file_hash.mjs | 48 +++++- .../js/storage/back_fill_file_hash.test.mjs | 153 +++++++++--------- 2 files changed, 119 insertions(+), 82 deletions(-) diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs index ba3e0d4359..5a590e347a 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs @@ -33,7 +33,6 @@ import { makeProjectKey, } from '../lib/blob_store/index.js' import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js' -import filestorePersistor from '../lib/persistor.js' import commandLineArgs from 'command-line-args' import readline from 'node:readline' @@ -179,6 +178,37 @@ const STREAM_HIGH_WATER_MARK = parseInt( const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10) const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10) +// Filestore endpoint location +const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1' +const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009' + +async function fetchFromFilestore(projectId, fileId) { + const url = `http://${FILESTORE_HOST}:${FILESTORE_PORT}/project/${projectId}/file/${fileId}` + const response = await fetch(url) + if (!response.ok) { + if (response.status === 404) { + throw new NotFoundError('file not found in filestore', { + status: response.status, + }) + } + const body = await response.text() + throw new OError('fetchFromFilestore failed', { + projectId, + fileId, + status: response.status, + body, + }) + } + if (!response.body) { + throw new OError('fetchFromFilestore response has no body', { + projectId, + fileId, + status: response.status, + }) + } + return response.body +} + const projectsCollection = db.collection('projects') /** @type {ProjectsCollection} */ const typedProjectsCollection = db.collection('projects') @@ -348,8 +378,7 @@ async function processFile(entry, filePath) { } catch (err) { if (gracefulShutdownInitiated) throw err if (err instanceof NotFoundError) { - const { bucketName } = OError.getFullInfo(err) - if (bucketName === USER_FILES_BUCKET_NAME && !RETRY_FILESTORE_404) { + if (!RETRY_FILESTORE_404) { throw err // disable retries for not found in filestore bucket case } } @@ -416,10 +445,8 @@ async function processFileOnce(entry, filePath) { } STATS.readFromGCSCount++ - const src = await filestorePersistor.getObjectStream( - USER_FILES_BUCKET_NAME, - `${projectId}/${fileId}` - ) + // make a fetch request to filestore itself + const src = await fetchFromFilestore(projectId, fileId) const dst = fs.createWriteStream(filePath, { highWaterMark: STREAM_HIGH_WATER_MARK, }) @@ -1327,14 +1354,21 @@ async function processDeletedProjects() { } async function main() { + console.log('Starting project file backup...') await loadGlobalBlobs() + console.log('Loaded global blobs:', GLOBAL_BLOBS.size) if (PROJECT_IDS_FROM) { + console.log( + `Processing projects from file: ${PROJECT_IDS_FROM}, this may take a while...` + ) await processProjectsFromFile() } else { if (PROCESS_NON_DELETED_PROJECTS) { + console.log('Processing non-deleted projects...') await processNonDeletedProjects() } if (PROCESS_DELETED_PROJECTS) { + console.log('Processing deleted projects...') await processDeletedProjects() } } diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs index fd39369a71..8f861d3934 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs @@ -15,7 +15,6 @@ import { execFile } from 'node:child_process' import chai, { expect } from 'chai' import chaiExclude from 'chai-exclude' import config from 'config' -import ObjectPersistor from '@overleaf/object-persistor' import { WritableBuffer } from '@overleaf/stream-utils' import { backupPersistor, @@ -27,6 +26,8 @@ import { makeProjectKey, } from '../../../../storage/lib/blob_store/index.js' +import express from 'express' + chai.use(chaiExclude) const TIMEOUT = 20 * 1_000 @@ -36,15 +37,58 @@ const { tieringStorageClass } = config.get('backupPersistor') const projectsCollection = db.collection('projects') const deletedProjectsCollection = db.collection('deletedProjects') -const FILESTORE_PERSISTOR = ObjectPersistor({ - backend: 'gcs', - gcs: { - endpoint: { - apiEndpoint: process.env.GCS_API_ENDPOINT, - projectId: process.env.GCS_PROJECT_ID, - }, - }, -}) +class MockFilestore { + constructor() { + this.host = process.env.FILESTORE_HOST || '127.0.0.1' + this.port = process.env.FILESTORE_PORT || 3009 + // create a server listening on this.host and this.port + this.files = {} + + this.app = express() + + this.app.get('/project/:projectId/file/:fileId', (req, res) => { + const { projectId, fileId } = req.params + const content = this.files[projectId]?.[fileId] + if (!content) return res.status(404).end() + res.status(200).end(content) + }) + } + + start() { + // reset stored files + this.files = {} + // start the server + if (this.serverPromise) { + return this.serverPromise + } else { + this.serverPromise = new Promise((resolve, reject) => { + this.server = this.app.listen(this.port, this.host, err => { + if (err) return reject(err) + resolve() + }) + }) + return this.serverPromise + } + } + + addFile(projectId, fileId, fileContent) { + if (!this.files[projectId]) { + this.files[projectId] = {} + } + this.files[projectId][fileId] = fileContent + } + + deleteObject(projectId, fileId) { + if (this.files[projectId]) { + delete this.files[projectId][fileId] + if (Object.keys(this.files[projectId]).length === 0) { + delete this.files[projectId] + } + } + } +} + +const mockFilestore = new MockFilestore() /** * @param {ObjectId} objectId @@ -472,67 +516,36 @@ describe('back_fill_file_hash script', function () { } async function populateFilestore() { - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId0}`, - Stream.Readable.from([fileId0.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId6}`, - Stream.Readable.from([fileId6.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId7}`, - Stream.Readable.from([contentFile7]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId1}/${fileId1}`, - Stream.Readable.from([fileId1.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId2}/${fileId2}`, - Stream.Readable.from([fileId2.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId3}/${fileId3}`, - Stream.Readable.from([fileId3.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId3}/${fileId10}`, + await mockFilestore.addFile(projectId0, fileId0, fileId0.toString()) + await mockFilestore.addFile(projectId0, fileId6, fileId6.toString()) + await mockFilestore.addFile(projectId0, fileId7, contentFile7) + await mockFilestore.addFile(projectId1, fileId1, fileId1.toString()) + await mockFilestore.addFile(projectId2, fileId2, fileId2.toString()) + await mockFilestore.addFile(projectId3, fileId3, fileId3.toString()) + await mockFilestore.addFile( + projectId3, + fileId10, // fileId10 is dupe of fileId3 - Stream.Readable.from([fileId3.toString()]) + fileId3.toString() ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId3}/${fileId11}`, + await mockFilestore.addFile( + projectId3, + fileId11, // fileId11 is dupe of fileId3 - Stream.Readable.from([fileId3.toString()]) + fileId3.toString() ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectIdDeleted0}/${fileId4}`, - Stream.Readable.from([fileId4.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectIdDeleted1}/${fileId5}`, - Stream.Readable.from([fileId5.toString()]) - ) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectIdBadFileTree3}/${fileId9}`, - Stream.Readable.from([fileId9.toString()]) + await mockFilestore.addFile(projectIdDeleted0, fileId4, fileId4.toString()) + await mockFilestore.addFile(projectIdDeleted1, fileId5, fileId5.toString()) + await mockFilestore.addFile( + projectIdBadFileTree3, + fileId9, + fileId9.toString() ) } async function prepareEnvironment() { await cleanup.everything() + await mockFilestore.start() await populateMongo() await populateHistoryV1() await populateFilestore() @@ -1117,10 +1130,7 @@ describe('back_fill_file_hash script', function () { beforeEach('prepare environment', prepareEnvironment) it('should gracefully handle fatal errors', async function () { - await FILESTORE_PERSISTOR.deleteObject( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId0}` - ) + mockFilestore.deleteObject(projectId0, fileId0) const t0 = Date.now() const { stats, result } = await tryRunScript([], { RETRIES: '10', @@ -1148,17 +1158,10 @@ describe('back_fill_file_hash script', function () { }) it('should retry on error', async function () { - await FILESTORE_PERSISTOR.deleteObject( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId0}` - ) + mockFilestore.deleteObject(projectId0, fileId0) const restoreFileAfter5s = async () => { await setTimeout(5_000) - await FILESTORE_PERSISTOR.sendStream( - USER_FILES_BUCKET_NAME, - `${projectId0}/${fileId0}`, - Stream.Readable.from([fileId0.toString()]) - ) + mockFilestore.addFile(projectId0, fileId0, fileId0.toString()) } // use Promise.allSettled to ensure the above sendStream call finishes before this test completes const [