Merge pull request #22070 from overleaf/jpa-args

[history-v1] refactor env vars to args for GNU-parallel usage

GitOrigin-RevId: 1ebfa3dfc25d36f2f86c22fa22e4864d55b511b2
This commit is contained in:
Jakob Ackermann
2024-11-21 18:21:08 +01:00
committed by Copybot
parent 77831b60bf
commit c2b876372b
5 changed files with 111 additions and 45 deletions

View File

@@ -5,6 +5,8 @@
FROM node:20.18.0 AS base FROM node:20.18.0 AS base
WORKDIR /overleaf/services/history-v1 WORKDIR /overleaf/services/history-v1
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads # Google Cloud Storage needs a writable $HOME/.config for resumable uploads
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream) # (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)

View File

@@ -6,7 +6,10 @@ version: "2.3"
services: services:
test_unit: test_unit:
image: node:20.18.0 build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes: volumes:
- .:/overleaf/services/history-v1 - .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules - ../../node_modules:/overleaf/node_modules
@@ -20,7 +23,10 @@ services:
user: node user: node
test_acceptance: test_acceptance:
image: node:20.18.0 build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes: volumes:
- .:/overleaf/services/history-v1 - .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules - ../../node_modules:/overleaf/node_modules

View File

@@ -0,0 +1,9 @@
#!/bin/sh
# Installs the extra OS-level packages for the history-v1 image.
# -e: abort on the first failing command; -x: echo each command as it runs.
set -ex
# Refresh the apt package index so the install below can resolve packages.
apt-get update
# GNU parallel; --yes answers the confirmation prompt automatically.
apt-get install parallel --yes
# Drop the downloaded apt package lists (presumably to keep the image layer small).
rm -rf /var/lib/apt/lists/*

View File

@@ -34,6 +34,7 @@ import {
} from '../lib/blob_store/index.js' } from '../lib/blob_store/index.js'
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js' import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
import filestorePersistor from '../lib/persistor.js' import filestorePersistor from '../lib/persistor.js'
import commandLineArgs from 'command-line-args'
// Silence warning. // Silence warning.
Events.setMaxListeners(20) Events.setMaxListeners(20)
@@ -84,20 +85,70 @@ ObjectId.cacheHexString = true
* @property {Blob} [blob] * @property {Blob} [blob]
*/ */
const COLLECT_BLOBS = process.argv.includes('blobs') /**
* @return {{PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
*/
function parseArgs() {
  // Default lower bound of the batch range when --BATCH_RANGE_START is omitted.
  const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')

  // Every option is declared as a string. Boolean flags are spelled out as
  // "true"/"false" because commandLineArgs cannot handle --foo=false, and
  // they are converted (and validated) by toBoolean() below.
  const stringOption = (name, defaultValue) => ({
    name,
    type: String,
    defaultValue,
  })
  const optionDefinitions = [
    stringOption('processNonDeletedProjects', 'false'),
    stringOption('processDeletedProjects', 'false'),
    stringOption('processDeletedFiles', 'false'),
    stringOption('processBlobs', 'true'),
    stringOption('collectBackedUpBlobs', 'true'),
    stringOption('BATCH_RANGE_START', PUBLIC_LAUNCH_DATE.toISOString()),
    stringOption('BATCH_RANGE_END', new Date().toISOString()),
    stringOption('LOGGING_IDENTIFIER', ''),
  ]
  const parsed = commandLineArgs(optionDefinitions)

  /**
   * Convert the textual value of a boolean option into a real boolean.
   * @param {string} name
   * @return {boolean}
   */
  const toBoolean = name => {
    switch (parsed[name]) {
      case 'true':
        return true
      case 'false':
        return false
      default:
        throw new Error(`expected "true" or "false" for boolean option ${name}`)
    }
  }

  // The range bounds are resolved before the boolean flags are validated.
  const rangeStart = objectIdFromInput(parsed.BATCH_RANGE_START).toString()
  const rangeEnd = objectIdFromInput(parsed.BATCH_RANGE_END).toString()

  return {
    PROCESS_NON_DELETED_PROJECTS: toBoolean('processNonDeletedProjects'),
    PROCESS_DELETED_PROJECTS: toBoolean('processDeletedProjects'),
    PROCESS_BLOBS: toBoolean('processBlobs'),
    PROCESS_DELETED_FILES: toBoolean('processDeletedFiles'),
    COLLECT_BACKED_UP_BLOBS: toBoolean('collectBackedUpBlobs'),
    BATCH_RANGE_START: rangeStart,
    BATCH_RANGE_END: rangeEnd,
    // An empty identifier falls back to the start of the range.
    LOGGING_IDENTIFIER: parsed.LOGGING_IDENTIFIER || rangeStart,
  }
}
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') const {
const BATCH_RANGE_START = objectIdFromInput( PROCESS_NON_DELETED_PROJECTS,
process.env.BATCH_RANGE_START || PUBLIC_LAUNCH_DATE.toISOString() PROCESS_DELETED_PROJECTS,
).toString() PROCESS_BLOBS,
const BATCH_RANGE_END = objectIdFromInput( PROCESS_DELETED_FILES,
process.env.BATCH_RANGE_END || new Date().toISOString() COLLECT_BACKED_UP_BLOBS,
).toString() BATCH_RANGE_START,
// We need to control the start and end as ids of deleted projects are created at time of deletion. BATCH_RANGE_END,
delete process.env.BATCH_RANGE_START LOGGING_IDENTIFIER,
delete process.env.BATCH_RANGE_END } = parseArgs()
const LOGGING_IDENTIFIER = process.env.LOGGING_IDENTIFIER || BATCH_RANGE_START // We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
// Concurrency for downloading from GCS and updating hashes in mongo // Concurrency for downloading from GCS and updating hashes in mongo
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10) const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
@@ -396,7 +447,7 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) {
if (entry.ctx.hasHistoryBlob(hash)) { if (entry.ctx.hasHistoryBlob(hash)) {
return // fast-path using hint from pre-fetched blobs return // fast-path using hint from pre-fetched blobs
} }
if (!COLLECT_BLOBS && (await blobStore.getBlob(hash))) { if (!PROCESS_BLOBS && (await blobStore.getBlob(hash))) {
entry.ctx.recordHistoryBlob(hash) entry.ctx.recordHistoryBlob(hash)
return // round trip to postgres/mongo when not pre-fetched return // round trip to postgres/mongo when not pre-fetched
} }
@@ -817,7 +868,7 @@ function* findFileInBatch(
* @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>} * @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>}
*/ */
async function collectProjectBlobs(batch) { async function collectProjectBlobs(batch) {
if (!COLLECT_BLOBS) return { nBlobs: 0, blobs: new Map() } if (!PROCESS_BLOBS) return { nBlobs: 0, blobs: new Map() }
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id)) return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
} }
@@ -827,7 +878,7 @@ async function collectProjectBlobs(batch) {
*/ */
async function collectDeletedFiles(projects) { async function collectDeletedFiles(projects) {
const deletedFiles = new Map() const deletedFiles = new Map()
if (!process.argv.includes('deletedFiles')) return deletedFiles if (!PROCESS_DELETED_FILES) return deletedFiles
const cursor = deletedFilesCollection.find( const cursor = deletedFilesCollection.find(
{ {
@@ -860,9 +911,8 @@ async function collectDeletedFiles(projects) {
async function collectBackedUpBlobs(projects) { async function collectBackedUpBlobs(projects) {
let nBackedUpBlobs = 0 let nBackedUpBlobs = 0
const backedUpBlobs = new Map() const backedUpBlobs = new Map()
if (!process.argv.includes('collectBackedUpBlobs')) { if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs }
return { nBackedUpBlobs, backedUpBlobs }
}
const cursor = backedUpBlobsCollection.find( const cursor = backedUpBlobsCollection.find(
{ _id: { $in: projects.map(p => p._id) } }, { _id: { $in: projects.map(p => p._id) } },
{ {
@@ -1122,7 +1172,7 @@ function estimateBlobSize(blob) {
return size return size
} }
async function updateLiveFileTrees() { async function processNonDeletedProjects() {
try { try {
await batchedUpdate( await batchedUpdate(
projectsCollection, projectsCollection,
@@ -1144,7 +1194,7 @@ async function updateLiveFileTrees() {
console.warn('Done updating live projects') console.warn('Done updating live projects')
} }
async function updateDeletedFileTrees() { async function processDeletedProjects() {
try { try {
await batchedUpdate( await batchedUpdate(
deletedProjectsCollection, deletedProjectsCollection,
@@ -1173,11 +1223,11 @@ async function updateDeletedFileTrees() {
async function main() { async function main() {
await loadGlobalBlobs() await loadGlobalBlobs()
if (process.argv.includes('live')) { if (PROCESS_NON_DELETED_PROJECTS) {
await updateLiveFileTrees() await processNonDeletedProjects()
} }
if (process.argv.includes('deleted')) { if (PROCESS_DELETED_PROJECTS) {
await updateDeletedFileTrees() await processDeletedProjects()
} }
console.warn('Done.') console.warn('Done.')
} }

View File

@@ -476,22 +476,22 @@ describe('back_fill_file_hash script', function () {
}) })
/** /**
* @param {Array<string>} args
* @param {Record<string, string>} env * @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten * @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>} * @return {Promise<{result, stats: any}>}
*/ */
async function tryRunScript(env = {}, shouldHaveWritten) { async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
let result let result
try { try {
result = await promisify(execFile)( result = await promisify(execFile)(
process.argv0, process.argv0,
[ [
'storage/scripts/back_fill_file_hash.mjs', 'storage/scripts/back_fill_file_hash.mjs',
'collectBackedUpBlobs', '--processNonDeletedProjects=true',
'live', '--processDeletedProjects=true',
'blobs', '--processDeletedFiles=true',
'deleted', ...args,
'deletedFiles',
], ],
{ {
encoding: 'utf-8', encoding: 'utf-8',
@@ -549,12 +549,13 @@ describe('back_fill_file_hash script', function () {
} }
/** /**
* @param {Array<string>} args
* @param {Record<string, string>} env * @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten * @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>} * @return {Promise<{result, stats: any}>}
*/ */
async function runScript(env = {}, shouldHaveWritten = true) { async function runScript(args = [], env = {}, shouldHaveWritten = true) {
const { stats, result } = await tryRunScript(env, shouldHaveWritten) const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
if (result.status !== 0) { if (result.status !== 0) {
console.log(result) console.log(result)
expect(result).to.have.property('status', 0) expect(result).to.have.property('status', 0)
@@ -804,7 +805,7 @@ describe('back_fill_file_hash script', function () {
]) ])
}) })
it('should process nothing on re-run', async function () { it('should process nothing on re-run', async function () {
const rerun = await runScript({}, false) const rerun = await runScript([], {}, false)
expect(rerun.stats).deep.equal({ expect(rerun.stats).deep.equal({
...STATS_ALL_ZERO, ...STATS_ALL_ZERO,
// We still need to iterate over all the projects and blobs. // We still need to iterate over all the projects and blobs.
@@ -983,7 +984,7 @@ describe('back_fill_file_hash script', function () {
`${projectId0}/${fileId0}` `${projectId0}/${fileId0}`
) )
const t0 = Date.now() const t0 = Date.now()
const { stats, result } = await tryRunScript({ const { stats, result } = await tryRunScript([], {
RETRIES: '10', RETRIES: '10',
RETRY_DELAY_MS: '1000', RETRY_DELAY_MS: '1000',
}) })
@@ -1025,7 +1026,7 @@ describe('back_fill_file_hash script', function () {
value: { stats, result }, value: { stats, result },
}, },
] = await Promise.allSettled([ ] = await Promise.allSettled([
tryRunScript({ tryRunScript([], {
RETRY_DELAY_MS: '100', RETRY_DELAY_MS: '100',
RETRIES: '60', RETRIES: '60',
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
@@ -1049,7 +1050,7 @@ describe('back_fill_file_hash script', function () {
describe('full run CONCURRENCY=1', function () { describe('full run CONCURRENCY=1', function () {
let output let output
beforeEach('run script', async function () { beforeEach('run script', async function () {
output = await runScript({ output = await runScript([], {
CONCURRENCY: '1', CONCURRENCY: '1',
}) })
}) })
@@ -1063,7 +1064,7 @@ describe('back_fill_file_hash script', function () {
describe('full run CONCURRENCY=10', function () { describe('full run CONCURRENCY=10', function () {
let output let output
beforeEach('run script', async function () { beforeEach('run script', async function () {
output = await runScript({ output = await runScript([], {
CONCURRENCY: '10', CONCURRENCY: '10',
}) })
}) })
@@ -1076,7 +1077,7 @@ describe('back_fill_file_hash script', function () {
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () { describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
let output let output
beforeEach('run script', async function () { beforeEach('run script', async function () {
output = await runScript({ output = await runScript([], {
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(), STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
}) })
}) })
@@ -1098,7 +1099,7 @@ describe('back_fill_file_hash script', function () {
}) })
let output let output
beforeEach('run script', async function () { beforeEach('run script', async function () {
output = await runScript({ output = await runScript([], {
CONCURRENCY: '1', CONCURRENCY: '1',
}) })
}) })
@@ -1122,15 +1123,13 @@ describe('back_fill_file_hash script', function () {
const edge = projectId1.toString() const edge = projectId1.toString()
let outputPart0, outputPart1 let outputPart0, outputPart1
beforeEach('run script on part 0', async function () { beforeEach('run script on part 0', async function () {
outputPart0 = await runScript({ outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], {
CONCURRENCY: '1', CONCURRENCY: '1',
BATCH_RANGE_END: edge,
}) })
}) })
beforeEach('run script on part 1', async function () { beforeEach('run script on part 1', async function () {
outputPart1 = await runScript({ outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], {
CONCURRENCY: '1', CONCURRENCY: '1',
BATCH_RANGE_START: edge,
}) })
}) })