Merge pull request #22070 from overleaf/jpa-args

[history-v1] refactor env vars to args for GNU-parallel usage

GitOrigin-RevId: 1ebfa3dfc25d36f2f86c22fa22e4864d55b511b2
This commit is contained in:
Jakob Ackermann
2024-11-21 18:21:08 +01:00
committed by Copybot
parent 77831b60bf
commit c2b876372b
5 changed files with 111 additions and 45 deletions

View File

@@ -5,6 +5,8 @@
FROM node:20.18.0 AS base
WORKDIR /overleaf/services/history-v1
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)

View File

@@ -6,7 +6,10 @@ version: "2.3"
services:
test_unit:
image: node:20.18.0
build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes:
- .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules
@@ -20,7 +23,10 @@ services:
user: node
test_acceptance:
image: node:20.18.0
build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes:
- .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules

View File

@@ -0,0 +1,9 @@
#!/bin/sh
# Installs the extra OS-level packages needed on top of the base image.

# Abort on the first failing command and echo every command as it runs.
set -o errexit
set -o xtrace

# Refresh the package index, then install the "parallel" package.
apt-get update
apt-get install parallel --yes

# Remove the downloaded package index files again
# (presumably to keep the resulting image layer small).
rm -rf /var/lib/apt/lists/*

View File

@@ -34,6 +34,7 @@ import {
} from '../lib/blob_store/index.js'
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
import filestorePersistor from '../lib/persistor.js'
import commandLineArgs from 'command-line-args'
// Silence warning.
Events.setMaxListeners(20)
@@ -84,20 +85,70 @@ ObjectId.cacheHexString = true
* @property {Blob} [blob]
*/
const COLLECT_BLOBS = process.argv.includes('blobs')
/**
 * Read the script options from the command line and normalise them.
 *
 * Boolean switches are passed as `--name=true` / `--name=false`; the batch
 * range bounds are passed through objectIdFromInput and returned in their
 * string form. When no logging identifier is given, the start of the batch
 * range is used instead.
 *
 * @return {{PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
 */
function parseArgs() {
  const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')

  /**
   * Build a string-typed option definition.
   * @param {string} name
   * @param {string} defaultValue
   */
  const stringOption = (name, defaultValue) => ({
    name,
    type: String,
    defaultValue,
  })

  const options = commandLineArgs([
    stringOption('processNonDeletedProjects', 'false'),
    stringOption('processDeletedProjects', 'false'),
    stringOption('processDeletedFiles', 'false'),
    stringOption('processBlobs', 'true'),
    stringOption('collectBackedUpBlobs', 'true'),
    stringOption('BATCH_RANGE_START', PUBLIC_LAUNCH_DATE.toISOString()),
    stringOption('BATCH_RANGE_END', new Date().toISOString()),
    stringOption('LOGGING_IDENTIFIER', ''),
  ])

  /**
   * commandLineArgs cannot handle --foo=false, so the switches are parsed as
   * strings and converted to booleans here.
   * @param {string} name
   * @return {boolean}
   */
  const toBoolean = name => {
    switch (options[name]) {
      case 'true':
        return true
      case 'false':
        return false
      default:
        throw new Error(
          `expected "true" or "false" for boolean option ${name}`
        )
    }
  }

  // Resolve the range bounds before the switches, start first.
  const rangeStartId = objectIdFromInput(options.BATCH_RANGE_START)
  const BATCH_RANGE_START = rangeStartId.toString()
  const rangeEndId = objectIdFromInput(options.BATCH_RANGE_END)
  const BATCH_RANGE_END = rangeEndId.toString()

  const PROCESS_NON_DELETED_PROJECTS = toBoolean('processNonDeletedProjects')
  const PROCESS_DELETED_PROJECTS = toBoolean('processDeletedProjects')
  const PROCESS_BLOBS = toBoolean('processBlobs')
  const PROCESS_DELETED_FILES = toBoolean('processDeletedFiles')
  const COLLECT_BACKED_UP_BLOBS = toBoolean('collectBackedUpBlobs')

  // An empty identifier falls back to the start of the batch range.
  const LOGGING_IDENTIFIER = options.LOGGING_IDENTIFIER || BATCH_RANGE_START

  return {
    PROCESS_NON_DELETED_PROJECTS,
    PROCESS_DELETED_PROJECTS,
    PROCESS_BLOBS,
    PROCESS_DELETED_FILES,
    COLLECT_BACKED_UP_BLOBS,
    BATCH_RANGE_START,
    BATCH_RANGE_END,
    LOGGING_IDENTIFIER,
  }
}
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
const BATCH_RANGE_START = objectIdFromInput(
process.env.BATCH_RANGE_START || PUBLIC_LAUNCH_DATE.toISOString()
).toString()
const BATCH_RANGE_END = objectIdFromInput(
process.env.BATCH_RANGE_END || new Date().toISOString()
).toString()
// We need to control the start and end as ids of deleted projects are created at time of deletion.
delete process.env.BATCH_RANGE_START
delete process.env.BATCH_RANGE_END
const {
PROCESS_NON_DELETED_PROJECTS,
PROCESS_DELETED_PROJECTS,
PROCESS_BLOBS,
PROCESS_DELETED_FILES,
COLLECT_BACKED_UP_BLOBS,
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER,
} = parseArgs()
const LOGGING_IDENTIFIER = process.env.LOGGING_IDENTIFIER || BATCH_RANGE_START
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
// Concurrency for downloading from GCS and updating hashes in mongo
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
@@ -396,7 +447,7 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) {
if (entry.ctx.hasHistoryBlob(hash)) {
return // fast-path using hint from pre-fetched blobs
}
if (!COLLECT_BLOBS && (await blobStore.getBlob(hash))) {
if (!PROCESS_BLOBS && (await blobStore.getBlob(hash))) {
entry.ctx.recordHistoryBlob(hash)
return // round trip to postgres/mongo when not pre-fetched
}
@@ -817,7 +868,7 @@ function* findFileInBatch(
* @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>}
*/
async function collectProjectBlobs(batch) {
if (!COLLECT_BLOBS) return { nBlobs: 0, blobs: new Map() }
if (!PROCESS_BLOBS) return { nBlobs: 0, blobs: new Map() }
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
}
@@ -827,7 +878,7 @@ async function collectProjectBlobs(batch) {
*/
async function collectDeletedFiles(projects) {
const deletedFiles = new Map()
if (!process.argv.includes('deletedFiles')) return deletedFiles
if (!PROCESS_DELETED_FILES) return deletedFiles
const cursor = deletedFilesCollection.find(
{
@@ -860,9 +911,8 @@ async function collectDeletedFiles(projects) {
async function collectBackedUpBlobs(projects) {
let nBackedUpBlobs = 0
const backedUpBlobs = new Map()
if (!process.argv.includes('collectBackedUpBlobs')) {
return { nBackedUpBlobs, backedUpBlobs }
}
if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs }
const cursor = backedUpBlobsCollection.find(
{ _id: { $in: projects.map(p => p._id) } },
{
@@ -1122,7 +1172,7 @@ function estimateBlobSize(blob) {
return size
}
async function updateLiveFileTrees() {
async function processNonDeletedProjects() {
try {
await batchedUpdate(
projectsCollection,
@@ -1144,7 +1194,7 @@ async function updateLiveFileTrees() {
console.warn('Done updating live projects')
}
async function updateDeletedFileTrees() {
async function processDeletedProjects() {
try {
await batchedUpdate(
deletedProjectsCollection,
@@ -1173,11 +1223,11 @@ async function updateDeletedFileTrees() {
async function main() {
await loadGlobalBlobs()
if (process.argv.includes('live')) {
await updateLiveFileTrees()
if (PROCESS_NON_DELETED_PROJECTS) {
await processNonDeletedProjects()
}
if (process.argv.includes('deleted')) {
await updateDeletedFileTrees()
if (PROCESS_DELETED_PROJECTS) {
await processDeletedProjects()
}
console.warn('Done.')
}

View File

@@ -476,22 +476,22 @@ describe('back_fill_file_hash script', function () {
})
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
async function tryRunScript(env = {}, shouldHaveWritten) {
async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
let result
try {
result = await promisify(execFile)(
process.argv0,
[
'storage/scripts/back_fill_file_hash.mjs',
'collectBackedUpBlobs',
'live',
'blobs',
'deleted',
'deletedFiles',
'--processNonDeletedProjects=true',
'--processDeletedProjects=true',
'--processDeletedFiles=true',
...args,
],
{
encoding: 'utf-8',
@@ -549,12 +549,13 @@ describe('back_fill_file_hash script', function () {
}
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
async function runScript(env = {}, shouldHaveWritten = true) {
const { stats, result } = await tryRunScript(env, shouldHaveWritten)
async function runScript(args = [], env = {}, shouldHaveWritten = true) {
const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
if (result.status !== 0) {
console.log(result)
expect(result).to.have.property('status', 0)
@@ -804,7 +805,7 @@ describe('back_fill_file_hash script', function () {
])
})
it('should process nothing on re-run', async function () {
const rerun = await runScript({}, false)
const rerun = await runScript([], {}, false)
expect(rerun.stats).deep.equal({
...STATS_ALL_ZERO,
// We still need to iterate over all the projects and blobs.
@@ -983,7 +984,7 @@ describe('back_fill_file_hash script', function () {
`${projectId0}/${fileId0}`
)
const t0 = Date.now()
const { stats, result } = await tryRunScript({
const { stats, result } = await tryRunScript([], {
RETRIES: '10',
RETRY_DELAY_MS: '1000',
})
@@ -1025,7 +1026,7 @@ describe('back_fill_file_hash script', function () {
value: { stats, result },
},
] = await Promise.allSettled([
tryRunScript({
tryRunScript([], {
RETRY_DELAY_MS: '100',
RETRIES: '60',
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
@@ -1049,7 +1050,7 @@ describe('back_fill_file_hash script', function () {
describe('full run CONCURRENCY=1', function () {
let output
beforeEach('run script', async function () {
output = await runScript({
output = await runScript([], {
CONCURRENCY: '1',
})
})
@@ -1063,7 +1064,7 @@ describe('back_fill_file_hash script', function () {
describe('full run CONCURRENCY=10', function () {
let output
beforeEach('run script', async function () {
output = await runScript({
output = await runScript([], {
CONCURRENCY: '10',
})
})
@@ -1076,7 +1077,7 @@ describe('back_fill_file_hash script', function () {
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
let output
beforeEach('run script', async function () {
output = await runScript({
output = await runScript([], {
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
})
})
@@ -1098,7 +1099,7 @@ describe('back_fill_file_hash script', function () {
})
let output
beforeEach('run script', async function () {
output = await runScript({
output = await runScript([], {
CONCURRENCY: '1',
})
})
@@ -1122,15 +1123,13 @@ describe('back_fill_file_hash script', function () {
const edge = projectId1.toString()
let outputPart0, outputPart1
beforeEach('run script on part 0', async function () {
outputPart0 = await runScript({
outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], {
CONCURRENCY: '1',
BATCH_RANGE_END: edge,
})
})
beforeEach('run script on part 1', async function () {
outputPart1 = await runScript({
outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], {
CONCURRENCY: '1',
BATCH_RANGE_START: edge,
})
})