Merge pull request #27147 from overleaf/bg-filestore-migration-for-server-pro-II

add support for fetching files via http from filestore in back_fill_file_hash script and tests

GitOrigin-RevId: 8dea6383ed6fe9ee6786a5695e2deee93b1cdd84
This commit is contained in:
Brian Gough
2025-07-16 15:24:16 +01:00
committed by Copybot
parent 37ccc379e8
commit c368d44609
2 changed files with 119 additions and 82 deletions

View File

@@ -33,7 +33,6 @@ import {
makeProjectKey,
} from '../lib/blob_store/index.js'
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
import filestorePersistor from '../lib/persistor.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
@@ -179,6 +178,37 @@ const STREAM_HIGH_WATER_MARK = parseInt(
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
// Filestore endpoint location
// Host and port used by fetchFromFilestore to build the request URL.
// Each can be overridden through the environment variable of the same name;
// the fallbacks are 127.0.0.1 and 3009.
const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
/**
 * Fetch a project file from the filestore service over HTTP.
 *
 * Issues a GET request to
 * `http://FILESTORE_HOST:FILESTORE_PORT/project/:projectId/file/:fileId`
 * and hands back the response body stream without buffering it.
 *
 * @param {*} projectId - project id, interpolated into the URL path
 * @param {*} fileId - file id, interpolated into the URL path
 * @return {Promise<ReadableStream>} the (web) stream of the response body
 * @throws {NotFoundError} when filestore answers with a 404
 * @throws {OError} for any other non-2xx status (the response text is
 *   attached as `body`), or when a successful response carries no body
 */
async function fetchFromFilestore(projectId, fileId) {
  const url = `http://${FILESTORE_HOST}:${FILESTORE_PORT}/project/${projectId}/file/${fileId}`
  const response = await fetch(url)
  if (!response.ok) {
    if (response.status === 404) {
      // Release the connection: an unread fetch body keeps the underlying
      // socket checked out until it is consumed or cancelled. This is a
      // best-effort cleanup, so a failure here must not mask the 404.
      await response.body?.cancel().catch(() => {})
      throw new NotFoundError('file not found in filestore', {
        projectId,
        fileId,
        status: response.status,
      })
    }
    // Reading the text both drains the body and gives us context to log.
    const body = await response.text()
    throw new OError('fetchFromFilestore failed', {
      projectId,
      fileId,
      status: response.status,
      body,
    })
  }
  if (!response.body) {
    throw new OError('fetchFromFilestore response has no body', {
      projectId,
      fileId,
      status: response.status,
    })
  }
  return response.body
}
const projectsCollection = db.collection('projects')
/** @type {ProjectsCollection} */
const typedProjectsCollection = db.collection('projects')
@@ -348,8 +378,7 @@ async function processFile(entry, filePath) {
} catch (err) {
if (gracefulShutdownInitiated) throw err
if (err instanceof NotFoundError) {
const { bucketName } = OError.getFullInfo(err)
if (bucketName === USER_FILES_BUCKET_NAME && !RETRY_FILESTORE_404) {
if (!RETRY_FILESTORE_404) {
throw err // disable retries for not found in filestore bucket case
}
}
@@ -416,10 +445,8 @@ async function processFileOnce(entry, filePath) {
}
STATS.readFromGCSCount++
const src = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
`${projectId}/${fileId}`
)
// make a fetch request to filestore itself
const src = await fetchFromFilestore(projectId, fileId)
const dst = fs.createWriteStream(filePath, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
@@ -1327,14 +1354,21 @@ async function processDeletedProjects() {
}
async function main() {
console.log('Starting project file backup...')
await loadGlobalBlobs()
console.log('Loaded global blobs:', GLOBAL_BLOBS.size)
if (PROJECT_IDS_FROM) {
console.log(
`Processing projects from file: ${PROJECT_IDS_FROM}, this may take a while...`
)
await processProjectsFromFile()
} else {
if (PROCESS_NON_DELETED_PROJECTS) {
console.log('Processing non-deleted projects...')
await processNonDeletedProjects()
}
if (PROCESS_DELETED_PROJECTS) {
console.log('Processing deleted projects...')
await processDeletedProjects()
}
}

View File

@@ -15,7 +15,6 @@ import { execFile } from 'node:child_process'
import chai, { expect } from 'chai'
import chaiExclude from 'chai-exclude'
import config from 'config'
import ObjectPersistor from '@overleaf/object-persistor'
import { WritableBuffer } from '@overleaf/stream-utils'
import {
backupPersistor,
@@ -27,6 +26,8 @@ import {
makeProjectKey,
} from '../../../../storage/lib/blob_store/index.js'
import express from 'express'
chai.use(chaiExclude)
const TIMEOUT = 20 * 1_000
@@ -36,15 +37,58 @@ const { tieringStorageClass } = config.get('backupPersistor')
const projectsCollection = db.collection('projects')
const deletedProjectsCollection = db.collection('deletedProjects')
const FILESTORE_PERSISTOR = ObjectPersistor({
backend: 'gcs',
gcs: {
endpoint: {
apiEndpoint: process.env.GCS_API_ENDPOINT,
projectId: process.env.GCS_PROJECT_ID,
},
},
})
/**
 * Minimal in-memory stand-in for the filestore HTTP service.
 *
 * Serves `GET /project/:projectId/file/:fileId` from the `files` lookup
 * (projectId -> fileId -> content) and answers 404 for unknown files.
 */
class MockFilestore {
  constructor() {
    this.host = process.env.FILESTORE_HOST || '127.0.0.1'
    this.port = process.env.FILESTORE_PORT || 3009
    // projectId -> fileId -> content; ids are used as (string) object keys
    this.files = {}
    this.app = express()
    this.app.get('/project/:projectId/file/:fileId', (req, res) => {
      const { projectId, fileId } = req.params
      const content = this.files[projectId]?.[fileId]
      // Only a missing entry is a 404; an empty file is still a valid file.
      if (content == null) return res.status(404).end()
      res.status(200).end(content)
    })
  }

  /**
   * Clear all stored files and make sure the HTTP server is listening.
   * The server is only started once; later calls reuse the same promise.
   *
   * @return {Promise<void>} resolves once the server is listening, rejects
   *   if it fails to bind (e.g. the port is already in use)
   */
  start() {
    // reset stored files
    this.files = {}
    // start the server (once)
    if (!this.serverPromise) {
      this.serverPromise = new Promise((resolve, reject) => {
        const onError = err => reject(err)
        const server = this.app.listen(this.port, this.host, err => {
          if (err) return reject(err)
          // startup succeeded: stop routing later errors into this promise
          server.off('error', onError)
          resolve()
        })
        // Express 4 registers the callback as a plain 'listening' listener,
        // so bind failures only surface as an 'error' event on the server.
        server.once('error', onError)
        this.server = server
      })
    }
    return this.serverPromise
  }

  /**
   * Register (or overwrite) the content served for a file.
   *
   * @param {*} projectId - used as a key, so it is coerced to a string
   * @param {*} fileId - used as a key, so it is coerced to a string
   * @param {string|Buffer} fileContent - payload passed to `res.end`
   */
  addFile(projectId, fileId, fileContent) {
    if (!this.files[projectId]) {
      this.files[projectId] = {}
    }
    this.files[projectId][fileId] = fileContent
  }

  /**
   * Remove a file; the project entry is dropped once it has no files left.
   * Unknown projects or files are ignored.
   *
   * @param {*} projectId
   * @param {*} fileId
   */
  deleteObject(projectId, fileId) {
    const projectFiles = this.files[projectId]
    if (!projectFiles) return
    delete projectFiles[fileId]
    if (Object.keys(projectFiles).length === 0) {
      delete this.files[projectId]
    }
  }
}
const mockFilestore = new MockFilestore()
/**
* @param {ObjectId} objectId
@@ -472,67 +516,36 @@ describe('back_fill_file_hash script', function () {
}
async function populateFilestore() {
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`,
Stream.Readable.from([fileId0.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId6}`,
Stream.Readable.from([fileId6.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId7}`,
Stream.Readable.from([contentFile7])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId1}/${fileId1}`,
Stream.Readable.from([fileId1.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId2}/${fileId2}`,
Stream.Readable.from([fileId2.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId3}/${fileId3}`,
Stream.Readable.from([fileId3.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId3}/${fileId10}`,
await mockFilestore.addFile(projectId0, fileId0, fileId0.toString())
await mockFilestore.addFile(projectId0, fileId6, fileId6.toString())
await mockFilestore.addFile(projectId0, fileId7, contentFile7)
await mockFilestore.addFile(projectId1, fileId1, fileId1.toString())
await mockFilestore.addFile(projectId2, fileId2, fileId2.toString())
await mockFilestore.addFile(projectId3, fileId3, fileId3.toString())
await mockFilestore.addFile(
projectId3,
fileId10,
// fileId10 is dupe of fileId3
Stream.Readable.from([fileId3.toString()])
fileId3.toString()
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId3}/${fileId11}`,
await mockFilestore.addFile(
projectId3,
fileId11,
// fileId11 is dupe of fileId3
Stream.Readable.from([fileId3.toString()])
fileId3.toString()
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectIdDeleted0}/${fileId4}`,
Stream.Readable.from([fileId4.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectIdDeleted1}/${fileId5}`,
Stream.Readable.from([fileId5.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectIdBadFileTree3}/${fileId9}`,
Stream.Readable.from([fileId9.toString()])
await mockFilestore.addFile(projectIdDeleted0, fileId4, fileId4.toString())
await mockFilestore.addFile(projectIdDeleted1, fileId5, fileId5.toString())
await mockFilestore.addFile(
projectIdBadFileTree3,
fileId9,
fileId9.toString()
)
}
async function prepareEnvironment() {
await cleanup.everything()
await mockFilestore.start()
await populateMongo()
await populateHistoryV1()
await populateFilestore()
@@ -1117,10 +1130,7 @@ describe('back_fill_file_hash script', function () {
beforeEach('prepare environment', prepareEnvironment)
it('should gracefully handle fatal errors', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`
)
mockFilestore.deleteObject(projectId0, fileId0)
const t0 = Date.now()
const { stats, result } = await tryRunScript([], {
RETRIES: '10',
@@ -1148,17 +1158,10 @@ describe('back_fill_file_hash script', function () {
})
it('should retry on error', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`
)
mockFilestore.deleteObject(projectId0, fileId0)
const restoreFileAfter5s = async () => {
await setTimeout(5_000)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`,
Stream.Readable.from([fileId0.toString()])
)
mockFilestore.addFile(projectId0, fileId0, fileId0.toString())
}
// use Promise.allSettled to ensure the above addFile call finishes before this test completes
const [