mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-23 09:09:36 +02:00
Merge pull request #22070 from overleaf/jpa-args
[history-v1] refactor env vars to args for GNU-parallel usage

GitOrigin-RevId: 1ebfa3dfc25d36f2f86c22fa22e4864d55b511b2
This commit is contained in:
@@ -5,6 +5,8 @@
|
||||
FROM node:20.18.0 AS base
|
||||
|
||||
WORKDIR /overleaf/services/history-v1
|
||||
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
|
||||
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
|
||||
|
||||
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
|
||||
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)
|
||||
|
||||
@@ -6,7 +6,10 @@ version: "2.3"
|
||||
|
||||
services:
|
||||
test_unit:
|
||||
image: node:20.18.0
|
||||
build:
|
||||
context: ../..
|
||||
dockerfile: services/history-v1/Dockerfile
|
||||
target: base
|
||||
volumes:
|
||||
- .:/overleaf/services/history-v1
|
||||
- ../../node_modules:/overleaf/node_modules
|
||||
@@ -20,7 +23,10 @@ services:
|
||||
user: node
|
||||
|
||||
test_acceptance:
|
||||
image: node:20.18.0
|
||||
build:
|
||||
context: ../..
|
||||
dockerfile: services/history-v1/Dockerfile
|
||||
target: base
|
||||
volumes:
|
||||
- .:/overleaf/services/history-v1
|
||||
- ../../node_modules:/overleaf/node_modules
|
||||
|
||||
9
services/history-v1/install_deps.sh
Executable file
9
services/history-v1/install_deps.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/sh

# Installs GNU parallel via apt.
# -e: stop at the first command that fails; -x: echo each command as it runs.
set -e -x

# Refresh the package index, then install without prompting.
apt-get update
apt-get install --yes parallel

# Remove the package lists that `apt-get update` downloaded.
rm -rf /var/lib/apt/lists/*
|
||||
@@ -34,6 +34,7 @@ import {
|
||||
} from '../lib/blob_store/index.js'
|
||||
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
|
||||
import filestorePersistor from '../lib/persistor.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
@@ -84,20 +85,70 @@ ObjectId.cacheHexString = true
|
||||
* @property {Blob} [blob]
|
||||
*/
|
||||
|
||||
const COLLECT_BLOBS = process.argv.includes('blobs')
|
||||
/**
 * Read the script's command line flags and turn them into the settings
 * object consumed by the rest of the script.
 *
 * - The boolean-like flags are declared as strings and must be given as
 *   `--flag=true` or `--flag=false`; anything else raises an Error.
 * - Both batch range bounds are passed through objectIdFromInput() and
 *   returned in their string form.
 * - LOGGING_IDENTIFIER falls back to BATCH_RANGE_START when it is empty.
 *
 * @return {{PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
 */
function parseArgs() {
  const launchDate = new Date('2012-01-01T00:00:00Z')

  /**
   * Build one string-typed option definition.
   * @param {string} name
   * @param {string} defaultValue
   */
  const stringOption = (name, defaultValue) => ({
    name,
    type: String,
    defaultValue,
  })

  const args = commandLineArgs([
    stringOption('processNonDeletedProjects', 'false'),
    stringOption('processDeletedProjects', 'false'),
    stringOption('processDeletedFiles', 'false'),
    stringOption('processBlobs', 'true'),
    stringOption('collectBackedUpBlobs', 'true'),
    stringOption('BATCH_RANGE_START', launchDate.toISOString()),
    stringOption('BATCH_RANGE_END', new Date().toISOString()),
    stringOption('LOGGING_IDENTIFIER', ''),
  ])

  /**
   * commandLineArgs cannot handle --foo=false, so the flag is taken as a
   * string and converted to a boolean here.
   * @param {string} name
   * @return {boolean}
   */
  const boolVal = name => {
    switch (args[name]) {
      case 'true':
        return true
      case 'false':
        return false
      default:
        throw new Error(`expected "true" or "false" for boolean option ${name}`)
    }
  }

  // Resolve the range bounds first, ahead of the boolean flags.
  const rangeStart = objectIdFromInput(args.BATCH_RANGE_START).toString()
  const rangeEnd = objectIdFromInput(args.BATCH_RANGE_END).toString()

  return {
    PROCESS_NON_DELETED_PROJECTS: boolVal('processNonDeletedProjects'),
    PROCESS_DELETED_PROJECTS: boolVal('processDeletedProjects'),
    PROCESS_BLOBS: boolVal('processBlobs'),
    PROCESS_DELETED_FILES: boolVal('processDeletedFiles'),
    COLLECT_BACKED_UP_BLOBS: boolVal('collectBackedUpBlobs'),
    BATCH_RANGE_START: rangeStart,
    BATCH_RANGE_END: rangeEnd,
    LOGGING_IDENTIFIER: args.LOGGING_IDENTIFIER || rangeStart,
  }
}
|
||||
|
||||
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
||||
const BATCH_RANGE_START = objectIdFromInput(
|
||||
process.env.BATCH_RANGE_START || PUBLIC_LAUNCH_DATE.toISOString()
|
||||
).toString()
|
||||
const BATCH_RANGE_END = objectIdFromInput(
|
||||
process.env.BATCH_RANGE_END || new Date().toISOString()
|
||||
).toString()
|
||||
// We need to control the start and end as ids of deleted projects are created at time of deletion.
|
||||
delete process.env.BATCH_RANGE_START
|
||||
delete process.env.BATCH_RANGE_END
|
||||
const {
|
||||
PROCESS_NON_DELETED_PROJECTS,
|
||||
PROCESS_DELETED_PROJECTS,
|
||||
PROCESS_BLOBS,
|
||||
PROCESS_DELETED_FILES,
|
||||
COLLECT_BACKED_UP_BLOBS,
|
||||
BATCH_RANGE_START,
|
||||
BATCH_RANGE_END,
|
||||
LOGGING_IDENTIFIER,
|
||||
} = parseArgs()
|
||||
|
||||
const LOGGING_IDENTIFIER = process.env.LOGGING_IDENTIFIER || BATCH_RANGE_START
|
||||
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
|
||||
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
|
||||
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
|
||||
}
|
||||
|
||||
// Concurrency for downloading from GCS and updating hashes in mongo
|
||||
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
|
||||
@@ -396,7 +447,7 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) {
|
||||
if (entry.ctx.hasHistoryBlob(hash)) {
|
||||
return // fast-path using hint from pre-fetched blobs
|
||||
}
|
||||
if (!COLLECT_BLOBS && (await blobStore.getBlob(hash))) {
|
||||
if (!PROCESS_BLOBS && (await blobStore.getBlob(hash))) {
|
||||
entry.ctx.recordHistoryBlob(hash)
|
||||
return // round trip to postgres/mongo when not pre-fetched
|
||||
}
|
||||
@@ -817,7 +868,7 @@ function* findFileInBatch(
|
||||
* @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>}
|
||||
*/
|
||||
async function collectProjectBlobs(batch) {
|
||||
if (!COLLECT_BLOBS) return { nBlobs: 0, blobs: new Map() }
|
||||
if (!PROCESS_BLOBS) return { nBlobs: 0, blobs: new Map() }
|
||||
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
|
||||
}
|
||||
|
||||
@@ -827,7 +878,7 @@ async function collectProjectBlobs(batch) {
|
||||
*/
|
||||
async function collectDeletedFiles(projects) {
|
||||
const deletedFiles = new Map()
|
||||
if (!process.argv.includes('deletedFiles')) return deletedFiles
|
||||
if (!PROCESS_DELETED_FILES) return deletedFiles
|
||||
|
||||
const cursor = deletedFilesCollection.find(
|
||||
{
|
||||
@@ -860,9 +911,8 @@ async function collectDeletedFiles(projects) {
|
||||
async function collectBackedUpBlobs(projects) {
|
||||
let nBackedUpBlobs = 0
|
||||
const backedUpBlobs = new Map()
|
||||
if (!process.argv.includes('collectBackedUpBlobs')) {
|
||||
return { nBackedUpBlobs, backedUpBlobs }
|
||||
}
|
||||
if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs }
|
||||
|
||||
const cursor = backedUpBlobsCollection.find(
|
||||
{ _id: { $in: projects.map(p => p._id) } },
|
||||
{
|
||||
@@ -1122,7 +1172,7 @@ function estimateBlobSize(blob) {
|
||||
return size
|
||||
}
|
||||
|
||||
async function updateLiveFileTrees() {
|
||||
async function processNonDeletedProjects() {
|
||||
try {
|
||||
await batchedUpdate(
|
||||
projectsCollection,
|
||||
@@ -1144,7 +1194,7 @@ async function updateLiveFileTrees() {
|
||||
console.warn('Done updating live projects')
|
||||
}
|
||||
|
||||
async function updateDeletedFileTrees() {
|
||||
async function processDeletedProjects() {
|
||||
try {
|
||||
await batchedUpdate(
|
||||
deletedProjectsCollection,
|
||||
@@ -1173,11 +1223,11 @@ async function updateDeletedFileTrees() {
|
||||
|
||||
async function main() {
|
||||
await loadGlobalBlobs()
|
||||
if (process.argv.includes('live')) {
|
||||
await updateLiveFileTrees()
|
||||
if (PROCESS_NON_DELETED_PROJECTS) {
|
||||
await processNonDeletedProjects()
|
||||
}
|
||||
if (process.argv.includes('deleted')) {
|
||||
await updateDeletedFileTrees()
|
||||
if (PROCESS_DELETED_PROJECTS) {
|
||||
await processDeletedProjects()
|
||||
}
|
||||
console.warn('Done.')
|
||||
}
|
||||
|
||||
@@ -476,22 +476,22 @@ describe('back_fill_file_hash script', function () {
|
||||
})
|
||||
|
||||
/**
|
||||
* @param {Array<string>} args
|
||||
* @param {Record<string, string>} env
|
||||
* @param {boolean} shouldHaveWritten
|
||||
* @return {Promise<{result, stats: any}>}
|
||||
*/
|
||||
async function tryRunScript(env = {}, shouldHaveWritten) {
|
||||
async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
|
||||
let result
|
||||
try {
|
||||
result = await promisify(execFile)(
|
||||
process.argv0,
|
||||
[
|
||||
'storage/scripts/back_fill_file_hash.mjs',
|
||||
'collectBackedUpBlobs',
|
||||
'live',
|
||||
'blobs',
|
||||
'deleted',
|
||||
'deletedFiles',
|
||||
'--processNonDeletedProjects=true',
|
||||
'--processDeletedProjects=true',
|
||||
'--processDeletedFiles=true',
|
||||
...args,
|
||||
],
|
||||
{
|
||||
encoding: 'utf-8',
|
||||
@@ -549,12 +549,13 @@ describe('back_fill_file_hash script', function () {
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<string>} args
|
||||
* @param {Record<string, string>} env
|
||||
* @param {boolean} shouldHaveWritten
|
||||
* @return {Promise<{result, stats: any}>}
|
||||
*/
|
||||
async function runScript(env = {}, shouldHaveWritten = true) {
|
||||
const { stats, result } = await tryRunScript(env, shouldHaveWritten)
|
||||
async function runScript(args = [], env = {}, shouldHaveWritten = true) {
|
||||
const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
|
||||
if (result.status !== 0) {
|
||||
console.log(result)
|
||||
expect(result).to.have.property('status', 0)
|
||||
@@ -804,7 +805,7 @@ describe('back_fill_file_hash script', function () {
|
||||
])
|
||||
})
|
||||
it('should process nothing on re-run', async function () {
|
||||
const rerun = await runScript({}, false)
|
||||
const rerun = await runScript([], {}, false)
|
||||
expect(rerun.stats).deep.equal({
|
||||
...STATS_ALL_ZERO,
|
||||
// We still need to iterate over all the projects and blobs.
|
||||
@@ -983,7 +984,7 @@ describe('back_fill_file_hash script', function () {
|
||||
`${projectId0}/${fileId0}`
|
||||
)
|
||||
const t0 = Date.now()
|
||||
const { stats, result } = await tryRunScript({
|
||||
const { stats, result } = await tryRunScript([], {
|
||||
RETRIES: '10',
|
||||
RETRY_DELAY_MS: '1000',
|
||||
})
|
||||
@@ -1025,7 +1026,7 @@ describe('back_fill_file_hash script', function () {
|
||||
value: { stats, result },
|
||||
},
|
||||
] = await Promise.allSettled([
|
||||
tryRunScript({
|
||||
tryRunScript([], {
|
||||
RETRY_DELAY_MS: '100',
|
||||
RETRIES: '60',
|
||||
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
|
||||
@@ -1049,7 +1050,7 @@ describe('back_fill_file_hash script', function () {
|
||||
describe('full run CONCURRENCY=1', function () {
|
||||
let output
|
||||
beforeEach('run script', async function () {
|
||||
output = await runScript({
|
||||
output = await runScript([], {
|
||||
CONCURRENCY: '1',
|
||||
})
|
||||
})
|
||||
@@ -1063,7 +1064,7 @@ describe('back_fill_file_hash script', function () {
|
||||
describe('full run CONCURRENCY=10', function () {
|
||||
let output
|
||||
beforeEach('run script', async function () {
|
||||
output = await runScript({
|
||||
output = await runScript([], {
|
||||
CONCURRENCY: '10',
|
||||
})
|
||||
})
|
||||
@@ -1076,7 +1077,7 @@ describe('back_fill_file_hash script', function () {
|
||||
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
|
||||
let output
|
||||
beforeEach('run script', async function () {
|
||||
output = await runScript({
|
||||
output = await runScript([], {
|
||||
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
|
||||
})
|
||||
})
|
||||
@@ -1098,7 +1099,7 @@ describe('back_fill_file_hash script', function () {
|
||||
})
|
||||
let output
|
||||
beforeEach('run script', async function () {
|
||||
output = await runScript({
|
||||
output = await runScript([], {
|
||||
CONCURRENCY: '1',
|
||||
})
|
||||
})
|
||||
@@ -1122,15 +1123,13 @@ describe('back_fill_file_hash script', function () {
|
||||
const edge = projectId1.toString()
|
||||
let outputPart0, outputPart1
|
||||
beforeEach('run script on part 0', async function () {
|
||||
outputPart0 = await runScript({
|
||||
outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], {
|
||||
CONCURRENCY: '1',
|
||||
BATCH_RANGE_END: edge,
|
||||
})
|
||||
})
|
||||
beforeEach('run script on part 1', async function () {
|
||||
outputPart1 = await runScript({
|
||||
outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], {
|
||||
CONCURRENCY: '1',
|
||||
BATCH_RANGE_START: edge,
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user