diff --git a/services/history-v1/Dockerfile b/services/history-v1/Dockerfile index 5646a1529e..be43ce553c 100644 --- a/services/history-v1/Dockerfile +++ b/services/history-v1/Dockerfile @@ -5,6 +5,8 @@ FROM node:20.18.0 AS base WORKDIR /overleaf/services/history-v1 +COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/ +RUN chmod 0755 ./install_deps.sh && ./install_deps.sh # Google Cloud Storage needs a writable $HOME/.config for resumable uploads # (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream) diff --git a/services/history-v1/docker-compose.yml b/services/history-v1/docker-compose.yml index 580e8b2a78..17c41a1dad 100644 --- a/services/history-v1/docker-compose.yml +++ b/services/history-v1/docker-compose.yml @@ -6,7 +6,10 @@ version: "2.3" services: test_unit: - image: node:20.18.0 + build: + context: ../.. + dockerfile: services/history-v1/Dockerfile + target: base volumes: - .:/overleaf/services/history-v1 - ../../node_modules:/overleaf/node_modules @@ -20,7 +23,10 @@ services: user: node test_acceptance: - image: node:20.18.0 + build: + context: ../.. + dockerfile: services/history-v1/Dockerfile + target: base volumes: - .:/overleaf/services/history-v1 - ../../node_modules:/overleaf/node_modules diff --git a/services/history-v1/install_deps.sh b/services/history-v1/install_deps.sh new file mode 100755 index 0000000000..fecf44c8bd --- /dev/null +++ b/services/history-v1/install_deps.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +set -ex + +apt-get update + +apt-get install parallel --yes + +rm -rf /var/lib/apt/lists/* diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs index 1aaf533b16..65a4b64897 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs @@ -34,6 +34,7 @@ import { } from '../lib/blob_store/index.js' import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js' import filestorePersistor from '../lib/persistor.js' +import commandLineArgs from 'command-line-args' // Silence warning. Events.setMaxListeners(20) @@ -84,20 +85,70 @@ ObjectId.cacheHexString = true * @property {Blob} [blob] */ -const COLLECT_BLOBS = process.argv.includes('blobs') +/** + * @return {{PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}} + */ +function parseArgs() { + const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') + const args = commandLineArgs([ + { name: 'processNonDeletedProjects', type: String, defaultValue: 'false' }, + { name: 'processDeletedProjects', type: String, defaultValue: 'false' }, + { name: 'processDeletedFiles', type: String, defaultValue: 'false' }, + { name: 'processBlobs', type: String, defaultValue: 'true' }, + { name: 'collectBackedUpBlobs', type: String, defaultValue: 'true' }, + { + name: 'BATCH_RANGE_START', + type: String, + defaultValue: PUBLIC_LAUNCH_DATE.toISOString(), + }, + { + name: 'BATCH_RANGE_END', + type: String, + defaultValue: new Date().toISOString(), + }, + { name: 'LOGGING_IDENTIFIER', type: String, defaultValue: '' }, + ]) + /** + * commandLineArgs cannot handle --foo=false, so go the long way + * @param {string} name + * @return {boolean} + */ + function boolVal(name) { + const v = args[name] + if (['true', 'false'].includes(v)) return v === 'true' + throw new Error(`expected "true" or "false" for boolean option ${name}`) + } + const BATCH_RANGE_START = objectIdFromInput( + args['BATCH_RANGE_START'] + ).toString() + const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString() + return { + PROCESS_NON_DELETED_PROJECTS: boolVal('processNonDeletedProjects'), + PROCESS_DELETED_PROJECTS: boolVal('processDeletedProjects'), + PROCESS_BLOBS: boolVal('processBlobs'), + PROCESS_DELETED_FILES: boolVal('processDeletedFiles'), + COLLECT_BACKED_UP_BLOBS: boolVal('collectBackedUpBlobs'), + BATCH_RANGE_START, + BATCH_RANGE_END, + LOGGING_IDENTIFIER: args['LOGGING_IDENTIFIER'] || BATCH_RANGE_START, + } +} -const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') -const BATCH_RANGE_START = objectIdFromInput( - process.env.BATCH_RANGE_START || PUBLIC_LAUNCH_DATE.toISOString() -).toString() -const BATCH_RANGE_END = objectIdFromInput( - process.env.BATCH_RANGE_END || new Date().toISOString() -).toString() -// We need to control the start and end as ids of deleted projects are created at time of deletion. -delete process.env.BATCH_RANGE_START -delete process.env.BATCH_RANGE_END +const { + PROCESS_NON_DELETED_PROJECTS, + PROCESS_DELETED_PROJECTS, + PROCESS_BLOBS, + PROCESS_DELETED_FILES, + COLLECT_BACKED_UP_BLOBS, + BATCH_RANGE_START, + BATCH_RANGE_END, + LOGGING_IDENTIFIER, +} = parseArgs() -const LOGGING_IDENTIFIER = process.env.LOGGING_IDENTIFIER || BATCH_RANGE_START +// We need to handle the start and end differently as ids of deleted projects are created at time of deletion. +if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) { + throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END') +} // Concurrency for downloading from GCS and updating hashes in mongo const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10) @@ -396,7 +447,7 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) { if (entry.ctx.hasHistoryBlob(hash)) { return // fast-path using hint from pre-fetched blobs } - if (!COLLECT_BLOBS && (await blobStore.getBlob(hash))) { + if (!PROCESS_BLOBS && (await blobStore.getBlob(hash))) { entry.ctx.recordHistoryBlob(hash) return // round trip to postgres/mongo when not pre-fetched } @@ -817,7 +868,7 @@ function* findFileInBatch( * @return {Promise<{nBlobs: number, blobs: Map>}>} */ async function collectProjectBlobs(batch) { - if (!COLLECT_BLOBS) return { nBlobs: 0, blobs: new Map() } + if (!PROCESS_BLOBS) return { nBlobs: 0, blobs: new Map() } return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id)) } @@ -827,7 +878,7 @@ async function collectProjectBlobs(batch) { */ async function collectDeletedFiles(projects) { const deletedFiles = new Map() - if (!process.argv.includes('deletedFiles')) return deletedFiles + if (!PROCESS_DELETED_FILES) return deletedFiles const cursor = deletedFilesCollection.find( { @@ -860,9 +911,8 @@ async function collectDeletedFiles(projects) { async function collectBackedUpBlobs(projects) { let nBackedUpBlobs = 0 const backedUpBlobs = new Map() - if (!process.argv.includes('collectBackedUpBlobs')) { - return { nBackedUpBlobs, backedUpBlobs } - } + if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs } + const cursor = backedUpBlobsCollection.find( { _id: { $in: projects.map(p => p._id) } }, { @@ -1122,7 +1172,7 @@ function estimateBlobSize(blob) { return size } -async function updateLiveFileTrees() { +async function processNonDeletedProjects() { try { await batchedUpdate( projectsCollection, @@ -1144,7 +1194,7 @@ async function updateLiveFileTrees() { console.warn('Done updating live projects') } -async function updateDeletedFileTrees() { +async function processDeletedProjects() { try { await batchedUpdate( deletedProjectsCollection, @@ -1173,11 +1223,11 @@ async function updateDeletedFileTrees() { async function main() { await loadGlobalBlobs() - if (process.argv.includes('live')) { - await updateLiveFileTrees() + if (PROCESS_NON_DELETED_PROJECTS) { + await processNonDeletedProjects() } - if (process.argv.includes('deleted')) { - await updateDeletedFileTrees() + if (PROCESS_DELETED_PROJECTS) { + await processDeletedProjects() } console.warn('Done.') } diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs index bc77df7422..f6ef5f1165 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs @@ -476,22 +476,22 @@ describe('back_fill_file_hash script', function () { }) /** + * @param {Array} args * @param {Record} env * @param {boolean} shouldHaveWritten * @return {Promise<{result, stats: any}>} */ - async function tryRunScript(env = {}, shouldHaveWritten) { + async function tryRunScript(args = [], env = {}, shouldHaveWritten) { let result try { result = await promisify(execFile)( process.argv0, [ 'storage/scripts/back_fill_file_hash.mjs', - 'collectBackedUpBlobs', - 'live', - 'blobs', - 'deleted', - 'deletedFiles', + '--processNonDeletedProjects=true', + '--processDeletedProjects=true', + '--processDeletedFiles=true', + ...args, ], { encoding: 'utf-8', @@ -549,12 +549,13 @@ describe('back_fill_file_hash script', function () { } /** + * @param {Array} args * @param {Record} env * @param {boolean} shouldHaveWritten * @return {Promise<{result, stats: any}>} */ - async function runScript(env = {}, shouldHaveWritten = true) { - const { stats, result } = await tryRunScript(env, shouldHaveWritten) + async function runScript(args = [], env = {}, shouldHaveWritten = true) { + const { stats, result } = await tryRunScript(args, env, shouldHaveWritten) if (result.status !== 0) { console.log(result) expect(result).to.have.property('status', 0) @@ -804,7 +805,7 @@ describe('back_fill_file_hash script', function () { ]) }) it('should process nothing on re-run', async function () { - const rerun = await runScript({}, false) + const rerun = await runScript([], {}, false) expect(rerun.stats).deep.equal({ ...STATS_ALL_ZERO, // We still need to iterate over all the projects and blobs. @@ -983,7 +984,7 @@ describe('back_fill_file_hash script', function () { `${projectId0}/${fileId0}` ) const t0 = Date.now() - const { stats, result } = await tryRunScript({ + const { stats, result } = await tryRunScript([], { RETRIES: '10', RETRY_DELAY_MS: '1000', }) @@ -1025,7 +1026,7 @@ describe('back_fill_file_hash script', function () { value: { stats, result }, }, ] = await Promise.allSettled([ - tryRunScript({ + tryRunScript([], { RETRY_DELAY_MS: '100', RETRIES: '60', RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests @@ -1049,7 +1050,7 @@ describe('back_fill_file_hash script', function () { describe('full run CONCURRENCY=1', function () { let output beforeEach('run script', async function () { - output = await runScript({ + output = await runScript([], { CONCURRENCY: '1', }) }) @@ -1063,7 +1064,7 @@ describe('back_fill_file_hash script', function () { describe('full run CONCURRENCY=10', function () { let output beforeEach('run script', async function () { - output = await runScript({ + output = await runScript([], { CONCURRENCY: '10', }) }) @@ -1076,7 +1077,7 @@ describe('back_fill_file_hash script', function () { describe('full run STREAM_HIGH_WATER_MARK=1MB', function () { let output beforeEach('run script', async function () { - output = await runScript({ + output = await runScript([], { STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(), }) }) @@ -1098,7 +1099,7 @@ describe('back_fill_file_hash script', function () { }) let output beforeEach('run script', async function () { - output = await runScript({ + output = await runScript([], { CONCURRENCY: '1', }) }) @@ -1122,15 +1123,13 @@ describe('back_fill_file_hash script', function () { const edge = projectId1.toString() let outputPart0, outputPart1 beforeEach('run script on part 0', async function () { - outputPart0 = await runScript({ + outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], { CONCURRENCY: '1', - BATCH_RANGE_END: edge, }) }) beforeEach('run script on part 1', async function () { - outputPart1 = await runScript({ + outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], { CONCURRENCY: '1', - BATCH_RANGE_START: edge, }) })