// @ts-check

// services/web/scripts/history/resync_projects.mjs
//
// Batch history resync script.
//
// Provenance: commit 92bbba5e507532c19aefcb2f7cdf7c463ca8279b
// (Mathias Jakobsen, Wed, 29 Apr 2026 08:28:47 +0100)
// "Merge pull request #33046 from overleaf/mj-batch-history-resync-script
// [web] Add script for batch history resyncs"
// GitOrigin-RevId: 2409475fa1ba12dadfaae9641a5fafdaa6c88e47

import minimist from 'minimist'
import { scriptRunner } from '../lib/ScriptRunner.mjs'
import logger from '@overleaf/logger'
import ProjectGetter from '../../app/src/Features/Project/ProjectGetter.mjs'
import {
  db,
  ObjectId,
  READ_PREFERENCE_SECONDARY,
} from '../../app/src/infrastructure/mongodb.mjs'
import HistoryManager from '../../app/src/Features/History/HistoryManager.mjs'
import DocstoreManager from '../../app/src/Features/Docstore/DocstoreManager.mjs'
import DocumentUpdaterHandler from '../../app/src/Features/DocumentUpdater/DocumentUpdaterHandler.mjs'

/**
 * Print the command-line help text to stderr.
 */
function usage() {
  console.error(`Usage: resync_projects.mjs [OPTIONS]

Options:

    --help                    Print this help
    --project-id              Migrate this project
    --min-id                  Migrate projects from this id
    --max-id                  Migrate projects to this id
    --last-updated-after      Migrate projects last updated after this date
    --last-updated-before     Migrate projects last updated before this date
    --commit                  Actually perform the resync, instead of just checking which projects would be resynced
    --skip-metadata-checks    Skip doing Mongo/Redis-level only checks to determine if the projects needs resyncing (has ranges or linked file data)
    --concurrency             How many jobs to run in parallel
`)
}

/**
 * Parse and validate the command-line arguments.
 *
 * Exits the process with status 0 after printing the help when `--help` is
 * given, and with status 1 when no filter is specified or when a date option
 * cannot be parsed.
 *
 * @returns {{ projectIds?: string[]; minId?: string; maxId?: string; concurrency: number; commit: boolean; skipMetadataChecks: boolean; lastUpdatedAfter?: string; lastUpdatedBefore?: string; }}
 */
function parseArgs() {
  const args = minimist(process.argv.slice(2), {
    boolean: ['help', 'commit', 'skip-metadata-checks'],
    string: [
      'project-id',
      'min-id',
      'max-id',
      'last-updated-after',
      'last-updated-before',
    ],
  })

  if (args.help) {
    usage()
    process.exit(0)
  }

  const projectIds = arrayOpt(args['project-id'])
  // A string option given without a value (e.g. `--min-id=` or
  // `--min-id=$UNSET`) is parsed as ''. Treat it as absent so that it cannot
  // satisfy the "at least one filter" check below without actually
  // contributing a filter clause in main().
  const minId = emptyToUndefined(args['min-id'])
  const maxId = emptyToUndefined(args['max-id'])
  const lastUpdatedAfter = emptyToUndefined(args['last-updated-after'])
  const lastUpdatedBefore = emptyToUndefined(args['last-updated-before'])
  // Anything that is not a positive integer falls back to 1. A value below 1
  // would make main() wait on an empty set of jobs forever.
  const requestedConcurrency = parseInt(args.concurrency, 10)
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 1
  const commit = args.commit
  const skipMetadataChecks = args['skip-metadata-checks']

  if (
    projectIds == null &&
    minId == null &&
    maxId == null &&
    lastUpdatedAfter == null &&
    lastUpdatedBefore == null
  ) {
    console.error('Please specify at least one filter\n')
    usage()
    process.exit(1)
  }

  // Reject unparseable dates up front instead of sending an Invalid Date to
  // the database as part of the query.
  const dateOptions = {
    'last-updated-after': lastUpdatedAfter,
    'last-updated-before': lastUpdatedBefore,
  }
  for (const [name, value] of Object.entries(dateOptions)) {
    if (value != null && Number.isNaN(new Date(value).getTime())) {
      console.error(`Invalid date for --${name}: ${value}\n`)
      usage()
      process.exit(1)
    }
  }

  return {
    projectIds,
    minId,
    maxId,
    concurrency,
    commit,
    skipMetadataChecks,
    lastUpdatedAfter,
    lastUpdatedBefore,
  }
}

/**
 * Find the projects matching the command-line filters and process them with
 * at most `concurrency` projects in flight at a time.
 *
 * SIGINT/SIGTERM stop new projects from being started; projects already in
 * flight are allowed to finish. Failures of individual projects are logged
 * and counted but do not abort the run.
 */
async function main() {
  const {
    projectIds,
    minId,
    maxId,
    concurrency,
    commit,
    skipMetadataChecks,
    lastUpdatedAfter,
    lastUpdatedBefore,
  } = parseArgs()

  // skip projects that don't have full project history
  /** @type {any[]} */
  const clauses = [{ 'overleaf.history.id': { $exists: true } }]

  if (projectIds != null) {
    clauses.push({ _id: { $in: projectIds.map(id => new ObjectId(id)) } })
  }
  if (minId) {
    clauses.push({ _id: { $gte: new ObjectId(minId) } })
  }
  if (maxId) {
    clauses.push({ _id: { $lte: new ObjectId(maxId) } })
  }
  if (lastUpdatedAfter) {
    clauses.push({ lastUpdated: { $gt: new Date(lastUpdatedAfter) } })
  }
  if (lastUpdatedBefore) {
    clauses.push({ lastUpdated: { $lt: new Date(lastUpdatedBefore) } })
  }
  const filter = { $and: clauses }

  // Iterate from the highest _id downwards.
  const projects = db.projects
    .find(filter, {
      readPreference: READ_PREFERENCE_SECONDARY,
      projection: { _id: 1, overleaf: 1 },
    })
    .sort({ _id: -1 })

  /** @type {{ skipped: number; resync: number; total: number;}} */
  const projectsProcessed = {
    skipped: 0,
    resync: 0,
    total: 0,
  }
  // In-flight jobs, keyed by project id. Each job removes itself from the map
  // when it settles and never rejects (failures are handled in its .catch).
  /** @type {Map<string, Promise<void>>} */
  const jobsByProjectId = new Map()
  let errors = 0

  let terminating = false
  /**
   * @param {any} signal
   */
  const handleSignal = signal => {
    logger.info({ signal }, 'History resync job received signal')
    terminating = true
  }
  process.on('SIGINT', handleSignal)
  process.on('SIGTERM', handleSignal)

  try {
    for await (const project of projects) {
      if (terminating) {
        break
      }

      const projectId = project._id.toString()
      if (jobsByProjectId.size >= concurrency) {
        // Wait until the next job finishes
        await Promise.race(jobsByProjectId.values())
      }
      const job = processProject(projectId, { commit, skipMetadataChecks })
        .then(
          /** @param {'skipped' | 'resync'} migrationType */ migrationType => {
            jobsByProjectId.delete(projectId)
            projectsProcessed[migrationType] += 1
            projectsProcessed.total += 1
            logger.debug(
              {
                projectId,
                projectsProcessed,
                errors,
                migrationType,
              },
              'History resync'
            )
            if (projectsProcessed.total % 10000 === 0) {
              logger.info(
                { projectsProcessed, errors, lastProjectId: projectId },
                'History resync progress'
              )
            }
          }
        )
        .catch(
          /** @param {any} err */ err => {
            jobsByProjectId.delete(projectId)
            errors += 1
            logger.error(
              { err, projectId, projectsProcessed, errors },
              'Failed to resync project history'
            )
          }
        )

      jobsByProjectId.set(projectId, job)
    }
  } finally {
    // Clear the remaining backlog of jobs. This runs even when iterating the
    // cursor throws: hasHistoryMetadata() calls blockProject() and only calls
    // unblockProject() in its own `finally`, so in-flight jobs must be allowed
    // to settle before the error propagates and the process exits.
    await Promise.all(jobsByProjectId.values())
  }
  logger.info({ projectsProcessed, errors }, 'History resync completion')
}

/**
 * Decide whether a single project needs a resync and, when `opts.commit` is
 * set, flush and resync its history.
 *
 * Unless `opts.skipMetadataChecks` is set, the metadata check runs in both
 * dry-run and commit mode.
 *
 * @param {string} projectId
 * @param {{skipMetadataChecks: boolean, commit: boolean}} opts
 * @returns {Promise<'skipped' | 'resync'>} 'skipped' when the metadata check
 *   found nothing history relevant, 'resync' when the project was (or, in
 *   dry-run mode, would be) resynced
 */
async function processProject(projectId, opts) {
  const shouldProceed = opts.skipMetadataChecks
    ? true
    : await hasHistoryMetadata(projectId)

  if (!shouldProceed) {
    logger.debug(
      { projectId },
      'Skipping project as it has no history relevant data in Mongo'
    )
    return 'skipped'
  }

  if (opts.commit) {
    logger.debug({ projectId }, 'Resyncing project')
    await HistoryManager.promises.flushProject(projectId)
    await HistoryManager.promises.resyncProject(projectId)
  } else {
    logger.debug({ projectId }, 'Project would be resynced')
  }

  return 'resync'
}

/**
 * Check whether a project has linked file data or ranges.
 *
 * The check errs on the side of resyncing: it returns true whenever the
 * project cannot be blocked or any of the lookups fails.
 *
 * @param {string} projectId
 * @returns {Promise<boolean>} false only when the project was blocked
 *   successfully and neither linked file data nor ranges were found
 */
async function hasHistoryMetadata(projectId) {
  try {
    const blockSuccess =
      await DocumentUpdaterHandler.promises.blockProject(projectId)
    if (!blockSuccess) {
      logger.debug(
        { projectId },
        'Project is currently active, so we cannot skip'
      )
      return true
    }
  } catch (err) {
    logger.warn(
      { projectId, err },
      'Error thrown while acquiring block for project'
    )
    return true
  }

  // From here on the block was acquired, so it is always released in the
  // `finally` below, whatever the outcome of the checks.
  try {
    if (await hasLinkedFileData(projectId)) {
      return true
    }
    if (await DocstoreManager.promises.projectHasRanges(projectId)) {
      return true
    }
    return false
  } catch (err) {
    logger.warn(
      { projectId, err },
      'Error checking for history data in Mongo, proceeding with resync just in case'
    )
  } finally {
    try {
      await DocumentUpdaterHandler.promises.unblockProject(projectId)
    } catch (err) {
      logger.warn(
        { projectId, err },
        'Error unblocking project after checking for history data in Mongo'
      )
    }
  }
  // Only reached when one of the checks above threw.
  return true
}

/**
 * Check whether any file in the project's file tree has linked file data.
 *
 * @param {string} projectId
 * @returns {Promise<boolean>} false when the project cannot be found or no
 *   file ref in its root folder tree has `linkedFileData`
 */
async function hasLinkedFileData(projectId) {
  const project = await ProjectGetter.promises.getProjectWithoutLock(
    projectId,
    {
      rootFolder: 1,
    }
  )
  if (!project) {
    return false
  }
  return hasLinkedDataInFileTree(project.rootFolder?.[0])
}

/**
 * Recursively search a folder and its subfolders for a file ref that has
 * `linkedFileData` set.
 *
 * @param {any} folder a folder with optional `fileRefs` and `folders` arrays;
 *   a missing folder counts as having no linked data
 * @returns {boolean}
 */
function hasLinkedDataInFileTree(folder) {
  if (!folder) {
    return false
  }
  if (Array.isArray(folder.fileRefs)) {
    for (const fileRef of folder.fileRefs) {
      if (fileRef.linkedFileData) {
        return true
      }
    }
  }
  if (Array.isArray(folder.folders)) {
    for (const subfolder of folder.folders) {
      if (hasLinkedDataInFileTree(subfolder)) {
        return true
      }
    }
  }
  return false
}

/**
 * Normalise an option that may be given once (string) or several times
 * (array) into an array.
 *
 * @param {any} value
 * @returns {Array<string> | undefined} undefined when the option is neither a
 *   string nor an array
 */
function arrayOpt(value) {
  if (typeof value === 'string') {
    return [value]
  } else if (Array.isArray(value)) {
    return value
  } else {
    return undefined
  }
}

/**
 * Map an empty-string option value to undefined; any other value is returned
 * unchanged.
 *
 * @param {any} value
 * @returns {any}
 */
function emptyToUndefined(value) {
  return value === '' ? undefined : value
}

try {
  await scriptRunner(main)
  process.exit(0)
} catch (error) {
  console.error(error)
  process.exit(1)
}