From 93c04fe240105aca8944c191ad678c43332507a2 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Tue, 29 Jul 2025 15:58:28 +0200 Subject: [PATCH] [history-v1] extend --report with number of files to check/missing (#27397) * [history-v1] put logs of back_fill_file_hash script in /var/log/overleaf * [history-v1] do not leave tmp dir behind when using --report * [history-v1] extend --report with number of files to check/missing * [history-v1] move all the options into command line flags Change default values: - concurrency to 10 - concurrent batches to 1 - stream high water mark to 1MiB * [web] find_malformed_filetrees: log progress on stderr * [history-v1] fix percentage of projects that need back-filling GitOrigin-RevId: ebd7889840f97b24eda43774ffe7524998fb634b --- .../storage/scripts/back_fill_file_hash.mjs | 142 +++++++++++++----- .../js/storage/back_fill_file_hash.test.mjs | 142 ++++++++++-------- .../web/scripts/find_malformed_filetrees.mjs | 2 +- 3 files changed, 188 insertions(+), 98 deletions(-) diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs index c0fdda35d8..33962c5da7 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs @@ -79,11 +79,11 @@ ObjectId.cacheHexString = true */ /** - * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean}} + * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean, CONCURRENCY: number, CONCURRENT_BATCHES: number, RETRIES: number, RETRY_DELAY_MS: number, RETRY_FILESTORE_404: boolean, BUFFER_DIR_PREFIX: string, STREAM_HIGH_WATER_MARK: number, LOGGING_INTERVAL: number, SLEEP_BEFORE_EXIT: number }} */ function parseArgs() { const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') - const DEFAULT_OUTPUT_FILE = `file-migration-${new Date() + const DEFAULT_OUTPUT_FILE = `/var/log/overleaf/file-migration-${new Date() .toISOString() .replace(/[:.]/g, '_')}.log` @@ -95,6 +95,12 @@ function parseArgs() { { name: 'skip-hashed-files', type: Boolean }, { name: 'skip-existing-blobs', type: Boolean }, { name: 'from-file', type: String, defaultValue: '' }, + { name: 'concurrency', type: Number, defaultValue: 10 }, + { name: 'concurrent-batches', type: Number, defaultValue: 1 }, + { name: 'stream-high-water-mark', type: Number, defaultValue: 1024 * 1024 }, + { name: 'retries', type: Number, defaultValue: 10 }, + { name: 'retry-delay-ms', type: Number, defaultValue: 100 }, + { name: 'retry-filestore-404', type: Boolean }, { name: 'dry-run', alias: 'n', type: Boolean }, { name: 'output', @@ -114,6 +120,13 @@ function parseArgs() { defaultValue: new Date().toISOString(), }, { name: 'logging-id', type: String, defaultValue: '' }, + { name: 'logging-interval-ms', type: Number, defaultValue: 60_000 }, + { + name: 'buffer-dir-prefix', + type: String, + defaultValue: '/tmp/back_fill_file_hash-', + }, + { name: 'sleep-before-exit-ms', type: Number, defaultValue: 1_000 }, ]) // If no arguments are provided, display a usage message @@ -143,6 +156,8 @@ Logging options: (default: file-migration-.log) --logging-id Identifier for logging (default: BATCH_RANGE_START) + --logging-interval-ms Interval for logging progres stats + (default: 60000, 1min) Batch range options: --BATCH_RANGE_START Start date for processing @@ -150,10 +165,30 @@ Batch range options: --BATCH_RANGE_END End date for processing (default: ${args.BATCH_RANGE_END}) +Concurrency: + --concurrency Number of files to process concurrently + (default: 10) + --concurrent-batches Number of project batches to process concurrently + (default: 1) + --stream-high-water-mark n In-Memory buffering threshold + (default: 1MiB) + +Retries: + --retries Number of times to retry processing a file + (default: 10) + --retry-delay-ms How long to wait before processing a file again + (default: 100, 100ms) + --retry-filestore-404 Retry downloading a file when receiving a 404 + (default: false) + Other options: --report Display a report of the current status --dry-run, -n Perform a dry run without making changes --help, -h Show this help message + --buffer-dir-prefix

Folder/prefix for buffering files on disk + (default: ${args['buffer-dir-prefix']}) + --sleep-before-exit-ms Defer exiting from the script + (default: 1000, 1s) Typical usage: @@ -208,12 +243,21 @@ is equivalent to PROCESS_HASHED_FILES: !args['skip-hashed-files'], PROCESS_BLOBS: !args['skip-existing-blobs'], DRY_RUN: args['dry-run'], - OUTPUT_FILE: args.output, + OUTPUT_FILE: args.report ? '-' : args.output, BATCH_RANGE_START, BATCH_RANGE_END, LOGGING_IDENTIFIER: args['logging-id'] || BATCH_RANGE_START, + LOGGING_INTERVAL: args['logging-interval-ms'], PROJECT_IDS_FROM: args['from-file'], DISPLAY_REPORT: args.report, + CONCURRENCY: args.concurrency, + CONCURRENT_BATCHES: args['concurrent-batches'], + STREAM_HIGH_WATER_MARK: args['stream-high-water-mark'], + RETRIES: args.retries, + RETRY_DELAY_MS: args['retry-delay-ms'], + RETRY_FILESTORE_404: args['retry-filestore-404'], + BUFFER_DIR_PREFIX: args['buffer-dir-prefix'], + SLEEP_BEFORE_EXIT: args['sleep-before-exit-ms'], } } @@ -229,6 +273,15 @@ const { LOGGING_IDENTIFIER, PROJECT_IDS_FROM, DISPLAY_REPORT, + CONCURRENCY, + CONCURRENT_BATCHES, + RETRIES, + RETRY_DELAY_MS, + RETRY_FILESTORE_404, + BUFFER_DIR_PREFIX, + STREAM_HIGH_WATER_MARK, + LOGGING_INTERVAL, + SLEEP_BEFORE_EXIT, } = parseArgs() // We need to handle the start and end differently as ids of deleted projects are created at time of deletion. @@ -236,26 +289,12 @@ if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) { throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END') } -// Concurrency for downloading from GCS and updating hashes in mongo -const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10) -const CONCURRENT_BATCHES = parseInt(process.env.CONCURRENT_BATCHES || '2', 10) -// Retries for processing a given file -const RETRIES = parseInt(process.env.RETRIES || '10', 10) -const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10) - -const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true' -const BUFFER_DIR = fs.mkdtempSync( - process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-' -) -// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode -const STREAM_HIGH_WATER_MARK = parseInt( - process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(), - 10 -) -const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10) -const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10) +const BUFFER_DIR = fs.mkdtempSync(BUFFER_DIR_PREFIX) // Log output to a file +if (OUTPUT_FILE !== '-') { + console.warn(`Writing logs into ${OUTPUT_FILE}`) +} logger.initialize('file-migration', { streams: [ { @@ -313,6 +352,7 @@ async function getStatsForCollection( projectsWithAllHashes: 0, fileCount: 0, fileWithHashCount: 0, + fileMissingInHistoryCount: 0, } // Pick a random sample of projects and estimate the number of files without hashes const result = await collection @@ -339,25 +379,44 @@ async function getStatsForCollection( const filesWithoutHash = fileTree.match(/\{"_id":"[0-9a-f]{24}"\}/g) || [] // count the number of files with a hash, these are uniquely identified // by the number of "hash" strings due to the filtering - const filesWithHash = fileTree.match(/"hash"/g) || [] + const filesWithHash = fileTree.match(/"hash":"[0-9a-f]{40}"/g) || [] stats.fileCount += filesWithoutHash.length + filesWithHash.length stats.fileWithHashCount += filesWithHash.length stats.projectCount++ stats.projectsWithAllHashes += filesWithoutHash.length === 0 ? 1 : 0 + const projectId = project._id.toString() + const { blobs: perProjectBlobs } = await getProjectBlobsBatch([projectId]) + const blobs = new Set( + (perProjectBlobs.get(projectId) || []).map(b => b.getHash()) + ) + const uniqueHashes = new Set(filesWithHash.map(m => m.slice(8, 48))) + for (const hash of uniqueHashes) { + if (blobs.has(hash) || GLOBAL_BLOBS.has(hash)) continue + stats.fileMissingInHistoryCount++ + } } console.log(`Sampled stats for ${name}:`) const fractionSampled = stats.projectCount / collectionCount - const percentageSampled = (fractionSampled * 100).toFixed(1) + const percentageSampled = (fractionSampled * 100).toFixed(0) const fractionConverted = stats.projectsWithAllHashes / stats.projectCount - const percentageConverted = (fractionConverted * 100).toFixed(1) + const fractionToBackFill = 1 - fractionConverted + const percentageToBackFill = (fractionToBackFill * 100).toFixed(0) + const fractionMissing = stats.fileMissingInHistoryCount / stats.fileCount + const percentageMissing = (fractionMissing * 100).toFixed(0) console.log( - `- Sampled ${name}: ${stats.projectCount} (${percentageSampled}%)` + `- Sampled ${name}: ${stats.projectCount} (${percentageSampled}% of all ${name})` ) console.log( `- Sampled ${name} with all hashes present: ${stats.projectsWithAllHashes}` ) console.log( - `- Percentage of ${name} converted: ${percentageConverted}% (estimated)` + `- Percentage of ${name} that need back-filling hashes: ${percentageToBackFill}% (estimated)` + ) + console.log( + `- Sampled ${name} have ${stats.fileCount} files that need to be checked against the full project history system.` + ) + console.log( + `- Sampled ${name} have ${stats.fileMissingInHistoryCount} files that need to be uploaded to the full project history system (estimating ${percentageMissing}% of all files).` ) } @@ -366,13 +425,15 @@ async function getStatsForCollection( * including counts and estimated progress based on a sample. */ async function displayReport() { - const projectsCountResult = await projectsCollection.countDocuments() + const projectsCountResult = await projectsCollection.estimatedDocumentCount() const deletedProjectsCountResult = - await deletedProjectsCollection.countDocuments() + await deletedProjectsCollection.estimatedDocumentCount() const sampleSize = 1000 console.log('Current status:') - console.log(`- Projects: ${projectsCountResult}`) - console.log(`- Deleted projects: ${deletedProjectsCountResult}`) + console.log(`- Total number of projects: ${projectsCountResult}`) + console.log( + `- Total number of deleted projects: ${deletedProjectsCountResult}` + ) console.log(`Sampling ${sampleSize} projects to estimate progress...`) await getStatsForCollection( sampleSize, @@ -392,7 +453,7 @@ async function displayReport() { ) } -// Filestore endpoint location +// Filestore endpoint location (configured by /etc/overleaf/env.sh) const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1' const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009' @@ -1370,7 +1431,18 @@ async function main() { console.warn('Done.') } +async function cleanupBufferDir() { + try { + // Perform non-recursive removal of the BUFFER_DIR. Individual files + // should get removed in parallel as part of batch processing. + await fs.promises.rmdir(BUFFER_DIR) + } catch (err) { + console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err) + } +} + if (DISPLAY_REPORT) { + await cleanupBufferDir() console.warn('Displaying report...') await displayReport() process.exit(0) @@ -1381,13 +1453,7 @@ try { await main() } finally { printStats(true) - try { - // Perform non-recursive removal of the BUFFER_DIR. Individual files - // should get removed in parallel as part of batch processing. - await fs.promises.rmdir(BUFFER_DIR) - } catch (err) { - console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err) - } + await cleanupBufferDir() } let code = 0 diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs index f6f4a6fb76..bfcf93228d 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs @@ -63,7 +63,6 @@ const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false describe('back_fill_file_hash script', function () { this.timeout(TIMEOUT) - const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs' const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z') const projectId1 = objectIdFromTime('2017-01-01T00:01:00Z') @@ -480,20 +479,16 @@ describe('back_fill_file_hash script', function () { /** * @param {Array} args - * @param {Record} env - * @param {boolean} shouldHaveWritten - * @return {Promise<{result, stats: any}>} + * @return {Promise<{result: { stdout: string, stderr: string, status: number }, stats: any}>} */ - async function tryRunScript(args = [], env = {}, shouldHaveWritten) { + async function rawRunScript(args = []) { let result try { result = await promisify(execFile)( process.argv0, [ 'storage/scripts/back_fill_file_hash.mjs', - '--output=-', - '--projects', - '--deleted-projects', + '--sleep-before-exit-ms=1', ...args, ], { @@ -501,9 +496,7 @@ describe('back_fill_file_hash script', function () { timeout: TIMEOUT - 500, env: { ...process.env, - USER_FILES_BUCKET_NAME, - SLEEP_BEFORE_EXIT: '1', - ...env, + AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE: '1', LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests }, } @@ -516,9 +509,25 @@ describe('back_fill_file_hash script', function () { } result = { stdout, stderr, status: code } } + // Ensure no tmp folder is left behind. expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match( /back_fill_file_hash/ ) + return result + } + + /** + * @param {Array} args + * @param {boolean} shouldHaveWritten + * @return {Promise<{result, stats: any}>} + */ + async function tryRunScript(args = [], shouldHaveWritten) { + const result = await rawRunScript([ + '--output=-', + '--projects', + '--deleted-projects', + ...args, + ]) const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond'] const stats = JSON.parse( result.stderr @@ -549,12 +558,11 @@ describe('back_fill_file_hash script', function () { /** * @param {Array} args - * @param {Record} env * @param {boolean} shouldHaveWritten * @return {Promise<{result, stats: any}>} */ - async function runScript(args = [], env = {}, shouldHaveWritten = true) { - const { stats, result } = await tryRunScript(args, env, shouldHaveWritten) + async function runScript(args = [], shouldHaveWritten = true) { + const { stats, result } = await tryRunScript(args, shouldHaveWritten) if (result.status !== 0) { console.log(result) expect(result).to.have.property('status', 0) @@ -803,7 +811,6 @@ describe('back_fill_file_hash script', function () { it('should process nothing on re-run', async function () { const rerun = await runScript( !processHashedFiles ? ['--skip-hashed-files'] : [], - {}, false ) let stats = { @@ -928,10 +935,11 @@ describe('back_fill_file_hash script', function () { it('should gracefully handle fatal errors', async function () { mockFilestore.deleteObject(projectId0, fileId0) const t0 = Date.now() - const { stats, result } = await tryRunScript(['--skip-hashed-files'], { - RETRIES: '10', - RETRY_DELAY_MS: '1000', - }) + const { stats, result } = await tryRunScript([ + '--skip-hashed-files', + '--retries=10', + '--retry-delay-ms=1000', + ]) const t1 = Date.now() expectNotFoundError(result, 'failed to process file') expect(result.status).to.equal(1) @@ -963,11 +971,12 @@ describe('back_fill_file_hash script', function () { value: { stats, result }, }, ] = await Promise.allSettled([ - tryRunScript(['--skip-hashed-files'], { - RETRY_DELAY_MS: '100', - RETRIES: '60', - RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests - }), + tryRunScript([ + '--skip-hashed-files', + '--retries=60', + '--retry-delay-ms=1000', + '--retry-filestore-404', + ]), restoreFileAfter5s(), ]) expectNotFoundError(result, 'failed to process file, trying again') @@ -989,9 +998,7 @@ describe('back_fill_file_hash script', function () { let output before('prepare environment', prepareEnvironment) before('run script', async function () { - output = await runScript(['--skip-hashed-files'], { - CONCURRENCY: '1', - }) + output = await runScript(['--skip-hashed-files', '--concurrency=1']) }) /** @@ -1058,10 +1065,10 @@ describe('back_fill_file_hash script', function () { let output1, output2 before('prepare environment', prepareEnvironment) before('run script without hashed files', async function () { - output1 = await runScript(['--skip-hashed-files'], {}) + output1 = await runScript(['--skip-hashed-files']) }) before('run script with hashed files', async function () { - output2 = await runScript([], {}) + output2 = await runScript([]) }) it('should print stats for the first run without hashed files', function () { expect(output1.stats).deep.equal(STATS_ALL) @@ -1076,6 +1083,34 @@ describe('back_fill_file_hash script', function () { }) commonAssertions(true) }) + describe('report mode', function () { + let output + before('prepare environment', prepareEnvironment) + before('run script', async function () { + output = await rawRunScript(['--report']) + }) + it('should print the report', () => { + expect(output.status).to.equal(0) + expect(output.stdout).to.equal(`\ +Current status: +- Total number of projects: 10 +- Total number of deleted projects: 5 +Sampling 1000 projects to estimate progress... +Sampled stats for projects: +- Sampled projects: 9 (90% of all projects) +- Sampled projects with all hashes present: 5 +- Percentage of projects that need back-filling hashes: 44% (estimated) +- Sampled projects have 11 files that need to be checked against the full project history system. +- Sampled projects have 3 files that need to be uploaded to the full project history system (estimating 27% of all files). +Sampled stats for deleted projects: +- Sampled deleted projects: 4 (80% of all deleted projects) +- Sampled deleted projects with all hashes present: 3 +- Percentage of deleted projects that need back-filling hashes: 25% (estimated) +- Sampled deleted projects have 2 files that need to be checked against the full project history system. +- Sampled deleted projects have 1 files that need to be uploaded to the full project history system (estimating 50% of all files). +`) + }) + }) describe('full run in dry-run mode', function () { let output @@ -1089,13 +1124,7 @@ describe('back_fill_file_hash script', function () { .toArray() }) before('run script', async function () { - output = await runScript( - ['--dry-run'], - { - CONCURRENCY: '1', - }, - false - ) + output = await runScript(['--dry-run', '--concurrency=1'], false) }) it('should print stats for dry-run mode', function () { @@ -1136,9 +1165,7 @@ describe('back_fill_file_hash script', function () { let output before('prepare environment', prepareEnvironment) before('run script', async function () { - output = await runScript(['--skip-hashed-files'], { - CONCURRENCY: '10', - }) + output = await runScript(['--skip-hashed-files', '--concurrency=10']) }) it('should print stats', function () { expect(output.stats).deep.equal(STATS_ALL) @@ -1146,13 +1173,14 @@ describe('back_fill_file_hash script', function () { commonAssertions() }) - describe('full run STREAM_HIGH_WATER_MARK=1MB', function () { + describe('full run STREAM_HIGH_WATER_MARK=64kiB', function () { let output before('prepare environment', prepareEnvironment) before('run script', async function () { - output = await runScript(['--skip-hashed-files'], { - STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(), - }) + output = await runScript([ + '--skip-hashed-files', + `--stream-high-water-mark=${64 * 1024}`, + ]) }) it('should print stats', function () { expect(output.stats).deep.equal(STATS_ALL) @@ -1164,7 +1192,7 @@ describe('back_fill_file_hash script', function () { let output before('prepare environment', prepareEnvironment) before('run script', async function () { - output = await runScript([], {}) + output = await runScript([]) }) it('should print stats', function () { expect(output.stats).deep.equal( @@ -1193,9 +1221,7 @@ describe('back_fill_file_hash script', function () { }) let output before('run script', async function () { - output = await runScript(['--skip-hashed-files'], { - CONCURRENCY: '1', - }) + output = await runScript(['--skip-hashed-files', '--concurrency=1']) }) it('should print stats', function () { @@ -1214,20 +1240,18 @@ describe('back_fill_file_hash script', function () { let outputPart0, outputPart1 before('prepare environment', prepareEnvironment) before('run script on part 0', async function () { - outputPart0 = await runScript( - ['--skip-hashed-files', `--BATCH_RANGE_END=${edge}`], - { - CONCURRENCY: '1', - } - ) + outputPart0 = await runScript([ + '--skip-hashed-files', + `--BATCH_RANGE_END=${edge}`, + '--concurrency=1', + ]) }) before('run script on part 1', async function () { - outputPart1 = await runScript( - ['--skip-hashed-files', `--BATCH_RANGE_START=${edge}`], - { - CONCURRENCY: '1', - } - ) + outputPart1 = await runScript([ + '--skip-hashed-files', + `--BATCH_RANGE_START=${edge}`, + '--concurrency=1', + ]) }) it('should print stats for part 0', function () { diff --git a/services/web/scripts/find_malformed_filetrees.mjs b/services/web/scripts/find_malformed_filetrees.mjs index 2614c7d622..9f0f6323ab 100644 --- a/services/web/scripts/find_malformed_filetrees.mjs +++ b/services/web/scripts/find_malformed_filetrees.mjs @@ -49,7 +49,7 @@ async function main(trackProgress) { for (const project of projects) { projectsProcessed += 1 if (projectsProcessed % 100000 === 0) { - console.log(projectsProcessed, 'projects processed') + console.warn(projectsProcessed, 'projects processed') } const projectId = project._id.toString() for (const { reason, path, _id } of processProject(project)) {