mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-23 17:19:37 +02:00
Merge pull request #22070 from overleaf/jpa-args
[history-v1] refactor env vars to args for GNU-parallel usage GitOrigin-RevId: 1ebfa3dfc25d36f2f86c22fa22e4864d55b511b2
This commit is contained in:
@@ -5,6 +5,8 @@
|
|||||||
FROM node:20.18.0 AS base
|
FROM node:20.18.0 AS base
|
||||||
|
|
||||||
WORKDIR /overleaf/services/history-v1
|
WORKDIR /overleaf/services/history-v1
|
||||||
|
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
|
||||||
|
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
|
||||||
|
|
||||||
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
|
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
|
||||||
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)
|
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)
|
||||||
|
|||||||
@@ -6,7 +6,10 @@ version: "2.3"
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
test_unit:
|
test_unit:
|
||||||
image: node:20.18.0
|
build:
|
||||||
|
context: ../..
|
||||||
|
dockerfile: services/history-v1/Dockerfile
|
||||||
|
target: base
|
||||||
volumes:
|
volumes:
|
||||||
- .:/overleaf/services/history-v1
|
- .:/overleaf/services/history-v1
|
||||||
- ../../node_modules:/overleaf/node_modules
|
- ../../node_modules:/overleaf/node_modules
|
||||||
@@ -20,7 +23,10 @@ services:
|
|||||||
user: node
|
user: node
|
||||||
|
|
||||||
test_acceptance:
|
test_acceptance:
|
||||||
image: node:20.18.0
|
build:
|
||||||
|
context: ../..
|
||||||
|
dockerfile: services/history-v1/Dockerfile
|
||||||
|
target: base
|
||||||
volumes:
|
volumes:
|
||||||
- .:/overleaf/services/history-v1
|
- .:/overleaf/services/history-v1
|
||||||
- ../../node_modules:/overleaf/node_modules
|
- ../../node_modules:/overleaf/node_modules
|
||||||
|
|||||||
9
services/history-v1/install_deps.sh
Executable file
9
services/history-v1/install_deps.sh
Executable file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
apt-get update
|
||||||
|
|
||||||
|
apt-get install parallel --yes
|
||||||
|
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
@@ -34,6 +34,7 @@ import {
|
|||||||
} from '../lib/blob_store/index.js'
|
} from '../lib/blob_store/index.js'
|
||||||
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
|
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
|
||||||
import filestorePersistor from '../lib/persistor.js'
|
import filestorePersistor from '../lib/persistor.js'
|
||||||
|
import commandLineArgs from 'command-line-args'
|
||||||
|
|
||||||
// Silence warning.
|
// Silence warning.
|
||||||
Events.setMaxListeners(20)
|
Events.setMaxListeners(20)
|
||||||
@@ -84,20 +85,70 @@ ObjectId.cacheHexString = true
|
|||||||
* @property {Blob} [blob]
|
* @property {Blob} [blob]
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const COLLECT_BLOBS = process.argv.includes('blobs')
|
/**
|
||||||
|
* @return {{PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
|
||||||
|
*/
|
||||||
|
function parseArgs() {
|
||||||
|
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
||||||
|
const args = commandLineArgs([
|
||||||
|
{ name: 'processNonDeletedProjects', type: String, defaultValue: 'false' },
|
||||||
|
{ name: 'processDeletedProjects', type: String, defaultValue: 'false' },
|
||||||
|
{ name: 'processDeletedFiles', type: String, defaultValue: 'false' },
|
||||||
|
{ name: 'processBlobs', type: String, defaultValue: 'true' },
|
||||||
|
{ name: 'collectBackedUpBlobs', type: String, defaultValue: 'true' },
|
||||||
|
{
|
||||||
|
name: 'BATCH_RANGE_START',
|
||||||
|
type: String,
|
||||||
|
defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'BATCH_RANGE_END',
|
||||||
|
type: String,
|
||||||
|
defaultValue: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
{ name: 'LOGGING_IDENTIFIER', type: String, defaultValue: '' },
|
||||||
|
])
|
||||||
|
/**
|
||||||
|
* commandLineArgs cannot handle --foo=false, so go the long way
|
||||||
|
* @param {string} name
|
||||||
|
* @return {boolean}
|
||||||
|
*/
|
||||||
|
function boolVal(name) {
|
||||||
|
const v = args[name]
|
||||||
|
if (['true', 'false'].includes(v)) return v === 'true'
|
||||||
|
throw new Error(`expected "true" or "false" for boolean option ${name}`)
|
||||||
|
}
|
||||||
|
const BATCH_RANGE_START = objectIdFromInput(
|
||||||
|
args['BATCH_RANGE_START']
|
||||||
|
).toString()
|
||||||
|
const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
|
||||||
|
return {
|
||||||
|
PROCESS_NON_DELETED_PROJECTS: boolVal('processNonDeletedProjects'),
|
||||||
|
PROCESS_DELETED_PROJECTS: boolVal('processDeletedProjects'),
|
||||||
|
PROCESS_BLOBS: boolVal('processBlobs'),
|
||||||
|
PROCESS_DELETED_FILES: boolVal('processDeletedFiles'),
|
||||||
|
COLLECT_BACKED_UP_BLOBS: boolVal('collectBackedUpBlobs'),
|
||||||
|
BATCH_RANGE_START,
|
||||||
|
BATCH_RANGE_END,
|
||||||
|
LOGGING_IDENTIFIER: args['LOGGING_IDENTIFIER'] || BATCH_RANGE_START,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
const {
|
||||||
const BATCH_RANGE_START = objectIdFromInput(
|
PROCESS_NON_DELETED_PROJECTS,
|
||||||
process.env.BATCH_RANGE_START || PUBLIC_LAUNCH_DATE.toISOString()
|
PROCESS_DELETED_PROJECTS,
|
||||||
).toString()
|
PROCESS_BLOBS,
|
||||||
const BATCH_RANGE_END = objectIdFromInput(
|
PROCESS_DELETED_FILES,
|
||||||
process.env.BATCH_RANGE_END || new Date().toISOString()
|
COLLECT_BACKED_UP_BLOBS,
|
||||||
).toString()
|
BATCH_RANGE_START,
|
||||||
// We need to control the start and end as ids of deleted projects are created at time of deletion.
|
BATCH_RANGE_END,
|
||||||
delete process.env.BATCH_RANGE_START
|
LOGGING_IDENTIFIER,
|
||||||
delete process.env.BATCH_RANGE_END
|
} = parseArgs()
|
||||||
|
|
||||||
const LOGGING_IDENTIFIER = process.env.LOGGING_IDENTIFIER || BATCH_RANGE_START
|
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
|
||||||
|
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
|
||||||
|
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
|
||||||
|
}
|
||||||
|
|
||||||
// Concurrency for downloading from GCS and updating hashes in mongo
|
// Concurrency for downloading from GCS and updating hashes in mongo
|
||||||
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
|
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
|
||||||
@@ -396,7 +447,7 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) {
|
|||||||
if (entry.ctx.hasHistoryBlob(hash)) {
|
if (entry.ctx.hasHistoryBlob(hash)) {
|
||||||
return // fast-path using hint from pre-fetched blobs
|
return // fast-path using hint from pre-fetched blobs
|
||||||
}
|
}
|
||||||
if (!COLLECT_BLOBS && (await blobStore.getBlob(hash))) {
|
if (!PROCESS_BLOBS && (await blobStore.getBlob(hash))) {
|
||||||
entry.ctx.recordHistoryBlob(hash)
|
entry.ctx.recordHistoryBlob(hash)
|
||||||
return // round trip to postgres/mongo when not pre-fetched
|
return // round trip to postgres/mongo when not pre-fetched
|
||||||
}
|
}
|
||||||
@@ -817,7 +868,7 @@ function* findFileInBatch(
|
|||||||
* @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>}
|
* @return {Promise<{nBlobs: number, blobs: Map<string, Array<Blob>>}>}
|
||||||
*/
|
*/
|
||||||
async function collectProjectBlobs(batch) {
|
async function collectProjectBlobs(batch) {
|
||||||
if (!COLLECT_BLOBS) return { nBlobs: 0, blobs: new Map() }
|
if (!PROCESS_BLOBS) return { nBlobs: 0, blobs: new Map() }
|
||||||
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
|
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -827,7 +878,7 @@ async function collectProjectBlobs(batch) {
|
|||||||
*/
|
*/
|
||||||
async function collectDeletedFiles(projects) {
|
async function collectDeletedFiles(projects) {
|
||||||
const deletedFiles = new Map()
|
const deletedFiles = new Map()
|
||||||
if (!process.argv.includes('deletedFiles')) return deletedFiles
|
if (!PROCESS_DELETED_FILES) return deletedFiles
|
||||||
|
|
||||||
const cursor = deletedFilesCollection.find(
|
const cursor = deletedFilesCollection.find(
|
||||||
{
|
{
|
||||||
@@ -860,9 +911,8 @@ async function collectDeletedFiles(projects) {
|
|||||||
async function collectBackedUpBlobs(projects) {
|
async function collectBackedUpBlobs(projects) {
|
||||||
let nBackedUpBlobs = 0
|
let nBackedUpBlobs = 0
|
||||||
const backedUpBlobs = new Map()
|
const backedUpBlobs = new Map()
|
||||||
if (!process.argv.includes('collectBackedUpBlobs')) {
|
if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs }
|
||||||
return { nBackedUpBlobs, backedUpBlobs }
|
|
||||||
}
|
|
||||||
const cursor = backedUpBlobsCollection.find(
|
const cursor = backedUpBlobsCollection.find(
|
||||||
{ _id: { $in: projects.map(p => p._id) } },
|
{ _id: { $in: projects.map(p => p._id) } },
|
||||||
{
|
{
|
||||||
@@ -1122,7 +1172,7 @@ function estimateBlobSize(blob) {
|
|||||||
return size
|
return size
|
||||||
}
|
}
|
||||||
|
|
||||||
async function updateLiveFileTrees() {
|
async function processNonDeletedProjects() {
|
||||||
try {
|
try {
|
||||||
await batchedUpdate(
|
await batchedUpdate(
|
||||||
projectsCollection,
|
projectsCollection,
|
||||||
@@ -1144,7 +1194,7 @@ async function updateLiveFileTrees() {
|
|||||||
console.warn('Done updating live projects')
|
console.warn('Done updating live projects')
|
||||||
}
|
}
|
||||||
|
|
||||||
async function updateDeletedFileTrees() {
|
async function processDeletedProjects() {
|
||||||
try {
|
try {
|
||||||
await batchedUpdate(
|
await batchedUpdate(
|
||||||
deletedProjectsCollection,
|
deletedProjectsCollection,
|
||||||
@@ -1173,11 +1223,11 @@ async function updateDeletedFileTrees() {
|
|||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
await loadGlobalBlobs()
|
await loadGlobalBlobs()
|
||||||
if (process.argv.includes('live')) {
|
if (PROCESS_NON_DELETED_PROJECTS) {
|
||||||
await updateLiveFileTrees()
|
await processNonDeletedProjects()
|
||||||
}
|
}
|
||||||
if (process.argv.includes('deleted')) {
|
if (PROCESS_DELETED_PROJECTS) {
|
||||||
await updateDeletedFileTrees()
|
await processDeletedProjects()
|
||||||
}
|
}
|
||||||
console.warn('Done.')
|
console.warn('Done.')
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -476,22 +476,22 @@ describe('back_fill_file_hash script', function () {
|
|||||||
})
|
})
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param {Array<string>} args
|
||||||
* @param {Record<string, string>} env
|
* @param {Record<string, string>} env
|
||||||
* @param {boolean} shouldHaveWritten
|
* @param {boolean} shouldHaveWritten
|
||||||
* @return {Promise<{result, stats: any}>}
|
* @return {Promise<{result, stats: any}>}
|
||||||
*/
|
*/
|
||||||
async function tryRunScript(env = {}, shouldHaveWritten) {
|
async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
|
||||||
let result
|
let result
|
||||||
try {
|
try {
|
||||||
result = await promisify(execFile)(
|
result = await promisify(execFile)(
|
||||||
process.argv0,
|
process.argv0,
|
||||||
[
|
[
|
||||||
'storage/scripts/back_fill_file_hash.mjs',
|
'storage/scripts/back_fill_file_hash.mjs',
|
||||||
'collectBackedUpBlobs',
|
'--processNonDeletedProjects=true',
|
||||||
'live',
|
'--processDeletedProjects=true',
|
||||||
'blobs',
|
'--processDeletedFiles=true',
|
||||||
'deleted',
|
...args,
|
||||||
'deletedFiles',
|
|
||||||
],
|
],
|
||||||
{
|
{
|
||||||
encoding: 'utf-8',
|
encoding: 'utf-8',
|
||||||
@@ -549,12 +549,13 @@ describe('back_fill_file_hash script', function () {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param {Array<string>} args
|
||||||
* @param {Record<string, string>} env
|
* @param {Record<string, string>} env
|
||||||
* @param {boolean} shouldHaveWritten
|
* @param {boolean} shouldHaveWritten
|
||||||
* @return {Promise<{result, stats: any}>}
|
* @return {Promise<{result, stats: any}>}
|
||||||
*/
|
*/
|
||||||
async function runScript(env = {}, shouldHaveWritten = true) {
|
async function runScript(args = [], env = {}, shouldHaveWritten = true) {
|
||||||
const { stats, result } = await tryRunScript(env, shouldHaveWritten)
|
const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
|
||||||
if (result.status !== 0) {
|
if (result.status !== 0) {
|
||||||
console.log(result)
|
console.log(result)
|
||||||
expect(result).to.have.property('status', 0)
|
expect(result).to.have.property('status', 0)
|
||||||
@@ -804,7 +805,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
])
|
])
|
||||||
})
|
})
|
||||||
it('should process nothing on re-run', async function () {
|
it('should process nothing on re-run', async function () {
|
||||||
const rerun = await runScript({}, false)
|
const rerun = await runScript([], {}, false)
|
||||||
expect(rerun.stats).deep.equal({
|
expect(rerun.stats).deep.equal({
|
||||||
...STATS_ALL_ZERO,
|
...STATS_ALL_ZERO,
|
||||||
// We still need to iterate over all the projects and blobs.
|
// We still need to iterate over all the projects and blobs.
|
||||||
@@ -983,7 +984,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
`${projectId0}/${fileId0}`
|
`${projectId0}/${fileId0}`
|
||||||
)
|
)
|
||||||
const t0 = Date.now()
|
const t0 = Date.now()
|
||||||
const { stats, result } = await tryRunScript({
|
const { stats, result } = await tryRunScript([], {
|
||||||
RETRIES: '10',
|
RETRIES: '10',
|
||||||
RETRY_DELAY_MS: '1000',
|
RETRY_DELAY_MS: '1000',
|
||||||
})
|
})
|
||||||
@@ -1025,7 +1026,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
value: { stats, result },
|
value: { stats, result },
|
||||||
},
|
},
|
||||||
] = await Promise.allSettled([
|
] = await Promise.allSettled([
|
||||||
tryRunScript({
|
tryRunScript([], {
|
||||||
RETRY_DELAY_MS: '100',
|
RETRY_DELAY_MS: '100',
|
||||||
RETRIES: '60',
|
RETRIES: '60',
|
||||||
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
|
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
|
||||||
@@ -1049,7 +1050,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
describe('full run CONCURRENCY=1', function () {
|
describe('full run CONCURRENCY=1', function () {
|
||||||
let output
|
let output
|
||||||
beforeEach('run script', async function () {
|
beforeEach('run script', async function () {
|
||||||
output = await runScript({
|
output = await runScript([], {
|
||||||
CONCURRENCY: '1',
|
CONCURRENCY: '1',
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@@ -1063,7 +1064,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
describe('full run CONCURRENCY=10', function () {
|
describe('full run CONCURRENCY=10', function () {
|
||||||
let output
|
let output
|
||||||
beforeEach('run script', async function () {
|
beforeEach('run script', async function () {
|
||||||
output = await runScript({
|
output = await runScript([], {
|
||||||
CONCURRENCY: '10',
|
CONCURRENCY: '10',
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@@ -1076,7 +1077,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
|
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
|
||||||
let output
|
let output
|
||||||
beforeEach('run script', async function () {
|
beforeEach('run script', async function () {
|
||||||
output = await runScript({
|
output = await runScript([], {
|
||||||
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
|
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@@ -1098,7 +1099,7 @@ describe('back_fill_file_hash script', function () {
|
|||||||
})
|
})
|
||||||
let output
|
let output
|
||||||
beforeEach('run script', async function () {
|
beforeEach('run script', async function () {
|
||||||
output = await runScript({
|
output = await runScript([], {
|
||||||
CONCURRENCY: '1',
|
CONCURRENCY: '1',
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@@ -1122,15 +1123,13 @@ describe('back_fill_file_hash script', function () {
|
|||||||
const edge = projectId1.toString()
|
const edge = projectId1.toString()
|
||||||
let outputPart0, outputPart1
|
let outputPart0, outputPart1
|
||||||
beforeEach('run script on part 0', async function () {
|
beforeEach('run script on part 0', async function () {
|
||||||
outputPart0 = await runScript({
|
outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], {
|
||||||
CONCURRENCY: '1',
|
CONCURRENCY: '1',
|
||||||
BATCH_RANGE_END: edge,
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
beforeEach('run script on part 1', async function () {
|
beforeEach('run script on part 1', async function () {
|
||||||
outputPart1 = await runScript({
|
outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], {
|
||||||
CONCURRENCY: '1',
|
CONCURRENCY: '1',
|
||||||
BATCH_RANGE_START: edge,
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user