[history-v1] add flag for back-filling hashes for projectIds from file (#23005)

* [history-v1] delay process exit to give logging time to flush

* [history-v1] add flag for back-filling hashes for projectIds from file

GitOrigin-RevId: 887a1e1c72d6f5a13bfc8d0e54023afbf5bc671c
This commit is contained in:
Jakob Ackermann
2025-01-21 16:43:14 +00:00
committed by Copybot
parent fdf33f0ed7
commit f8b2cc7ce8
4 changed files with 143 additions and 6 deletions

View File

@@ -35,6 +35,7 @@ import {
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
import filestorePersistor from '../lib/persistor.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
// Silence warning.
Events.setMaxListeners(20)
@@ -46,6 +47,8 @@ ObjectId.cacheHexString = true
* @typedef {import("overleaf-editor-core").Blob} Blob
* @typedef {import("perf_hooks").EventLoopUtilization} EventLoopUtilization
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project:Project}>} DeletedProjectsCollection
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
*/
@@ -86,7 +89,7 @@ ObjectId.cacheHexString = true
*/
/**
* @return {{PROCESS_HASHED_FILES: boolean, PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
* @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, PROCESS_DELETED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
*/
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
@@ -96,6 +99,7 @@ function parseArgs() {
{ name: 'processDeletedFiles', type: String, defaultValue: 'false' },
{ name: 'processHashedFiles', type: String, defaultValue: 'false' },
{ name: 'processBlobs', type: String, defaultValue: 'true' },
{ name: 'projectIdsFrom', type: String, defaultValue: '' },
{ name: 'collectBackedUpBlobs', type: String, defaultValue: 'true' },
{
name: 'BATCH_RANGE_START',
@@ -133,6 +137,7 @@ function parseArgs() {
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER: args['LOGGING_IDENTIFIER'] || BATCH_RANGE_START,
PROJECT_IDS_FROM: args['projectIdsFrom'],
}
}
@@ -146,6 +151,7 @@ const {
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER,
PROJECT_IDS_FROM,
} = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
@@ -174,9 +180,14 @@ const STREAM_HIGH_WATER_MARK = parseInt(
10
)
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
const projectsCollection = db.collection('projects')
/** @type {ProjectsCollection} */
const typedProjectsCollection = db.collection('projects')
const deletedProjectsCollection = db.collection('deletedProjects')
/** @type {DeletedProjectsCollection} */
const typedDeletedProjectsCollection = db.collection('deletedProjects')
const deletedFilesCollection = db.collection('deletedFiles')
const concurrencyLimit = pLimit(CONCURRENCY)
@@ -1316,6 +1327,51 @@ function estimateBlobSize(blob) {
return size
}
async function processProjectsFromFile() {
const rl = readline.createInterface({
input: fs.createReadStream(PROJECT_IDS_FROM),
})
for await (const projectId of rl) {
if (!projectId) continue // skip over trailing new line
let project = await typedProjectsCollection.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { rootFolder: 1, _id: 1, 'overleaf.history.id': 1 } }
)
let prefix = 'rootFolder.0'
if (!project) {
const deletedProject = await typedDeletedProjectsCollection.findOne(
{ 'deleterData.deletedProjectId': new ObjectId(projectId) },
{
projection: {
'project.rootFolder': 1,
'project._id': 1,
'project.overleaf.history.id': 1,
},
}
)
if (!deletedProject?.project) {
logger.warn({ projectId }, 'project hard-deleted')
continue
}
project = deletedProject.project
prefix = 'project.rootFolder.0'
}
if (!project?.overleaf?.history?.id) {
logger.warn({ projectId }, 'project has no history id')
continue
}
try {
await queueNextBatch([project], prefix)
} catch (err) {
gracefulShutdownInitiated = true
await waitForDeferredQueues()
throw err
}
}
await waitForDeferredQueues()
console.warn('Done updating projects from input file')
}
async function processNonDeletedProjects() {
try {
await batchedUpdate(
@@ -1367,11 +1423,15 @@ async function processDeletedProjects() {
async function main() {
await loadGlobalBlobs()
if (PROCESS_NON_DELETED_PROJECTS) {
await processNonDeletedProjects()
}
if (PROCESS_DELETED_PROJECTS) {
await processDeletedProjects()
if (PROJECT_IDS_FROM) {
await processProjectsFromFile()
} else {
if (PROCESS_NON_DELETED_PROJECTS) {
await processNonDeletedProjects()
}
if (PROCESS_DELETED_PROJECTS) {
await processDeletedProjects()
}
}
console.warn('Done.')
}
@@ -1407,8 +1467,10 @@ try {
)
code++
}
await setTimeout(SLEEP_BEFORE_EXIT)
process.exit(code)
} catch (err) {
console.error(err)
await setTimeout(SLEEP_BEFORE_EXIT)
process.exit(1)
}

View File

@@ -18,6 +18,7 @@ import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'
// Silence warning.
Events.setMaxListeners(20)
@@ -102,6 +103,7 @@ const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
@@ -630,6 +632,7 @@ async function main() {
}
}
const { skipped, failed, unmatched } = STATS
await setTimeout(SLEEP_BEFORE_EXIT)
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {

View File

@@ -585,6 +585,7 @@ describe('back_fill_file_hash script', function () {
env: {
...process.env,
USER_FILES_BUCKET_NAME,
SLEEP_BEFORE_EXIT: '1',
...env,
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
},
@@ -1436,4 +1437,74 @@ describe('back_fill_file_hash script', function () {
})
commonAssertions()
})
describe('projectIds from file', () => {
const path0 = '/tmp/project-ids-0.txt'
const path1 = '/tmp/project-ids-1.txt'
beforeEach('create project-ids.txt files', async function () {
await fs.promises.writeFile(
path0,
[projectId0, projectId1].map(id => id.toString()).join('\n')
)
await fs.promises.writeFile(
path1,
[
projectId2,
projectId3,
projectIdDeleted0,
projectIdDeleted1,
projectIdNoHistory,
projectIdNoHistoryDeleted,
projectIdHardDeleted,
projectIdNoOverleaf,
projectIdNoOverleafDeleted,
projectIdBadFileTree0,
projectIdBadFileTree1,
projectIdBadFileTree2,
projectIdBadFileTree3,
]
.map(id => id.toString())
.join('\n')
)
})
let outputPart0, outputPart1
beforeEach('run script on part 0', async function () {
outputPart0 = await runScript([`--projectIdsFrom=${path0}`])
})
beforeEach('run script on part 1', async function () {
outputPart1 = await runScript([`--projectIdsFrom=${path1}`])
})
/**
* @param {string} msg
* @param {ObjectId} projectId
*/
function expectLogEntry(msg, projectId) {
expect(outputPart1.result.stdout).to.include(msg)
const log = JSON.parse(
outputPart1.result.stdout
.split('\n')
.find(l => l.includes(`"${msg}"`) && l.includes(projectId.toString()))
)
expect(log).to.contain({
projectId: projectId.toString(),
msg,
})
}
it('should flag the hard-deleted project', function () {
expectLogEntry('project hard-deleted', projectIdHardDeleted)
})
it('should flag the projects without history id', function () {
expectLogEntry('project has no history id', projectIdNoOverleaf)
expectLogEntry('project has no history id', projectIdNoOverleafDeleted)
expectLogEntry('project has no history id', projectIdNoHistory)
expectLogEntry('project has no history id', projectIdNoHistoryDeleted)
})
it('should print stats', function () {
expect(outputPart0.stats).to.deep.equal(STATS_UP_TO_PROJECT1)
expect(outputPart1.stats).to.deep.equal(STATS_UP_FROM_PROJECT1_ONWARD)
})
commonAssertions()
})
})

View File

@@ -505,6 +505,7 @@ describe('back_fill_file_hash_fix_up script', function () {
env: {
...process.env,
USER_FILES_BUCKET_NAME,
SLEEP_BEFORE_EXIT: '1',
...env,
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
},