From f8b2cc7ce84bc3d38dec4625eeae5f647e883edd Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Tue, 21 Jan 2025 16:43:14 +0000 Subject: [PATCH] [history-v1] add flag for back-filling hashes for projectIds from file (#23005) * [history-v1] delay process exit to give logging time to flush * [history-v1] add flag for back-filling hashes for projectIds from file GitOrigin-RevId: 887a1e1c72d6f5a13bfc8d0e54023afbf5bc671c --- .../storage/scripts/back_fill_file_hash.mjs | 74 +++++++++++++++++-- .../scripts/back_fill_file_hash_fix_up.mjs | 3 + .../js/storage/back_fill_file_hash.test.mjs | 71 ++++++++++++++++++ .../back_fill_file_hash_fix_up.test.mjs | 1 + 4 files changed, 143 insertions(+), 6 deletions(-) diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs index a0abfa52a7..96dfd79e38 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs @@ -35,6 +35,7 @@ import { import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js' import filestorePersistor from '../lib/persistor.js' import commandLineArgs from 'command-line-args' +import readline from 'node:readline' // Silence warning. Events.setMaxListeners(20) @@ -46,6 +47,8 @@ ObjectId.cacheHexString = true * @typedef {import("overleaf-editor-core").Blob} Blob * @typedef {import("perf_hooks").EventLoopUtilization} EventLoopUtilization * @typedef {import("mongodb").Collection} Collection + * @typedef {import("mongodb").Collection} ProjectsCollection + * @typedef {import("mongodb").Collection<{project:Project}>} DeletedProjectsCollection * @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor */ @@ -86,7 +89,7 @@ ObjectId.cacheHexString = true */ /** - * @return {{PROCESS_HASHED_FILES: boolean, PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}} + * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}} */ function parseArgs() { const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') @@ -96,6 +99,7 @@ function parseArgs() { { name: 'processDeletedFiles', type: String, defaultValue: 'false' }, { name: 'processHashedFiles', type: String, defaultValue: 'false' }, { name: 'processBlobs', type: String, defaultValue: 'true' }, + { name: 'projectIdsFrom', type: String, defaultValue: '' }, { name: 'collectBackedUpBlobs', type: String, defaultValue: 'true' }, { name: 'BATCH_RANGE_START', @@ -133,6 +137,7 @@ function parseArgs() { BATCH_RANGE_START, BATCH_RANGE_END, LOGGING_IDENTIFIER: args['LOGGING_IDENTIFIER'] || BATCH_RANGE_START, + PROJECT_IDS_FROM: args['projectIdsFrom'], } } @@ -146,6 +151,7 @@ const { BATCH_RANGE_START, BATCH_RANGE_END, LOGGING_IDENTIFIER, + PROJECT_IDS_FROM, } = parseArgs() // We need to handle the start and end differently as ids of deleted projects are created at time of deletion. @@ -174,9 +180,14 @@ const STREAM_HIGH_WATER_MARK = parseInt( 10 ) const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10) +const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10) const projectsCollection = db.collection('projects') +/** @type {ProjectsCollection} */ +const typedProjectsCollection = db.collection('projects') const deletedProjectsCollection = db.collection('deletedProjects') +/** @type {DeletedProjectsCollection} */ +const typedDeletedProjectsCollection = db.collection('deletedProjects') const deletedFilesCollection = db.collection('deletedFiles') const concurrencyLimit = pLimit(CONCURRENCY) @@ -1316,6 +1327,51 @@ function estimateBlobSize(blob) { return size } +async function processProjectsFromFile() { + const rl = readline.createInterface({ + input: fs.createReadStream(PROJECT_IDS_FROM), + }) + for await (const projectId of rl) { + if (!projectId) continue // skip over trailing new line + let project = await typedProjectsCollection.findOne( + { _id: new ObjectId(projectId) }, + { projection: { rootFolder: 1, _id: 1, 'overleaf.history.id': 1 } } + ) + let prefix = 'rootFolder.0' + if (!project) { + const deletedProject = await typedDeletedProjectsCollection.findOne( + { 'deleterData.deletedProjectId': new ObjectId(projectId) }, + { + projection: { + 'project.rootFolder': 1, + 'project._id': 1, + 'project.overleaf.history.id': 1, + }, + } + ) + if (!deletedProject?.project) { + logger.warn({ projectId }, 'project hard-deleted') + continue + } + project = deletedProject.project + prefix = 'project.rootFolder.0' + } + if (!project?.overleaf?.history?.id) { + logger.warn({ projectId }, 'project has no history id') + continue + } + try { + await queueNextBatch([project], prefix) + } catch (err) { + gracefulShutdownInitiated = true + await waitForDeferredQueues() + throw err + } + } + await waitForDeferredQueues() + console.warn('Done updating projects from input file') +} + async function processNonDeletedProjects() { try { await batchedUpdate( @@ -1367,11 +1423,15 @@ async function processDeletedProjects() { async function main() { await loadGlobalBlobs() - if (PROCESS_NON_DELETED_PROJECTS) { - await processNonDeletedProjects() - } - if (PROCESS_DELETED_PROJECTS) { - await processDeletedProjects() + if (PROJECT_IDS_FROM) { + await processProjectsFromFile() + } else { + if (PROCESS_NON_DELETED_PROJECTS) { + await processNonDeletedProjects() + } + if (PROCESS_DELETED_PROJECTS) { + await processDeletedProjects() + } } console.warn('Done.') } @@ -1407,8 +1467,10 @@ try { ) code++ } + await setTimeout(SLEEP_BEFORE_EXIT) process.exit(code) } catch (err) { console.error(err) + await setTimeout(SLEEP_BEFORE_EXIT) process.exit(1) } diff --git a/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs index e6ecdc7b2f..7bab794692 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs @@ -18,6 +18,7 @@ import readline from 'node:readline' import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs' import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js' import filestorePersistor from '../lib/persistor.js' +import { setTimeout } from 'node:timers/promises' // Silence warning. Events.setMaxListeners(20) @@ -102,6 +103,7 @@ const STREAM_HIGH_WATER_MARK = parseInt( process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(), 10 ) +const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10) /** @type {ProjectsCollection} */ const projectsCollection = db.collection('projects') @@ -630,6 +632,7 @@ async function main() { } } const { skipped, failed, unmatched } = STATS + await setTimeout(SLEEP_BEFORE_EXIT) if (failed > 0) { process.exit(Math.min(failed, 99)) } else if (unmatched > 0) { diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs index 5e9bf2edaf..72eaa8676d 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs @@ -585,6 +585,7 @@ describe('back_fill_file_hash script', function () { env: { ...process.env, USER_FILES_BUCKET_NAME, + SLEEP_BEFORE_EXIT: '1', ...env, LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests }, @@ -1436,4 +1437,74 @@ describe('back_fill_file_hash script', function () { }) commonAssertions() }) + + describe('projectIds from file', () => { + const path0 = '/tmp/project-ids-0.txt' + const path1 = '/tmp/project-ids-1.txt' + beforeEach('create project-ids.txt files', async function () { + await fs.promises.writeFile( + path0, + [projectId0, projectId1].map(id => id.toString()).join('\n') + ) + await fs.promises.writeFile( + path1, + [ + projectId2, + projectId3, + projectIdDeleted0, + projectIdDeleted1, + projectIdNoHistory, + projectIdNoHistoryDeleted, + projectIdHardDeleted, + projectIdNoOverleaf, + projectIdNoOverleafDeleted, + projectIdBadFileTree0, + projectIdBadFileTree1, + projectIdBadFileTree2, + projectIdBadFileTree3, + ] + .map(id => id.toString()) + .join('\n') + ) + }) + + let outputPart0, outputPart1 + beforeEach('run script on part 0', async function () { + outputPart0 = await runScript([`--projectIdsFrom=${path0}`]) + }) + beforeEach('run script on part 1', async function () { + outputPart1 = await runScript([`--projectIdsFrom=${path1}`]) + }) + + /** + * @param {string} msg + * @param {ObjectId} projectId + */ + function expectLogEntry(msg, projectId) { + expect(outputPart1.result.stdout).to.include(msg) + const log = JSON.parse( + outputPart1.result.stdout + .split('\n') + .find(l => l.includes(`"${msg}"`) && l.includes(projectId.toString())) + ) + expect(log).to.contain({ + projectId: projectId.toString(), + msg, + }) + } + it('should flag the hard-deleted project', function () { + expectLogEntry('project hard-deleted', projectIdHardDeleted) + }) + it('should flag the projects without history id', function () { + expectLogEntry('project has no history id', projectIdNoOverleaf) + expectLogEntry('project has no history id', projectIdNoOverleafDeleted) + expectLogEntry('project has no history id', projectIdNoHistory) + expectLogEntry('project has no history id', projectIdNoHistoryDeleted) + }) + it('should print stats', function () { + expect(outputPart0.stats).to.deep.equal(STATS_UP_TO_PROJECT1) + expect(outputPart1.stats).to.deep.equal(STATS_UP_FROM_PROJECT1_ONWARD) + }) + commonAssertions() + }) }) diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs index 8828eb8e39..ce9b0e7d59 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs @@ -505,6 +505,7 @@ describe('back_fill_file_hash_fix_up script', function () { env: { ...process.env, USER_FILES_BUCKET_NAME, + SLEEP_BEFORE_EXIT: '1', ...env, LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests },