From d6e89c7338bdba692ff599253290f7fd85d2b48b Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 23 Jan 2025 11:12:25 +0000 Subject: [PATCH] Merge pull request #23058 from overleaf/bg-export-backed-up-blobs export backed up blobs to csv file GitOrigin-RevId: 5a176055dc85c56ced64cbf13aa705b56071cdae --- .../scripts/fix_string_backedUpBlobs_ids.mjs | 8 +- .../scripts/verify_backed_up_blobs.mjs | 149 ++++++++++++++++++ 2 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 services/history-v1/storage/scripts/verify_backed_up_blobs.mjs diff --git a/services/history-v1/storage/scripts/fix_string_backedUpBlobs_ids.mjs b/services/history-v1/storage/scripts/fix_string_backedUpBlobs_ids.mjs index adb5028643..007eebea77 100644 --- a/services/history-v1/storage/scripts/fix_string_backedUpBlobs_ids.mjs +++ b/services/history-v1/storage/scripts/fix_string_backedUpBlobs_ids.mjs @@ -20,7 +20,13 @@ async function processRecord(record) { mongoId(record._id) const newId = new ObjectId(record._id) if (config.commit) { - await backedUpBlobs.insertOne({ _id: newId, blobs: record.blobs }) + await backedUpBlobs.updateOne( + { _id: newId }, + { + $addToSet: { blobs: { $each: record.blobs } }, + }, + { upsert: true } + ) await backedUpBlobs.deleteOne({ _id: record._id }) } STATS.replaced++ diff --git a/services/history-v1/storage/scripts/verify_backed_up_blobs.mjs b/services/history-v1/storage/scripts/verify_backed_up_blobs.mjs new file mode 100644 index 0000000000..f4778c382f --- /dev/null +++ b/services/history-v1/storage/scripts/verify_backed_up_blobs.mjs @@ -0,0 +1,149 @@ +// @ts-check +import { ObjectId } from 'mongodb' +import knex from '../lib/knex.js' +import { + batchedUpdate, + objectIdFromInput, + READ_PREFERENCE_SECONDARY, +} from '@overleaf/mongo-utils/batchedUpdate.js' +import { + GLOBAL_BLOBS, + loadGlobalBlobs, + makeProjectKey, +} from '../lib/blob_store/index.js' +import { + backedUpBlobs as backedUpBlobsCollection, + db, + client, +} from '../lib/mongodb.js' +import commandLineArgs from 'command-line-args' +import fs from 'node:fs' + +const projectsCollection = db.collection('projects') + +// Enable caching for ObjectId.toString() +ObjectId.cacheHexString = true + +function parseArgs() { + const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z') + const args = commandLineArgs([ + { + name: 'BATCH_RANGE_START', + type: String, + defaultValue: PUBLIC_LAUNCH_DATE.toISOString(), + }, + { + name: 'BATCH_RANGE_END', + type: String, + defaultValue: new Date().toISOString(), + }, + { + name: 'output', + type: String, + alias: 'o', + }, + ]) + const BATCH_RANGE_START = objectIdFromInput( + args['BATCH_RANGE_START'] + ).toString() + const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString() + if (!args['output']) { + throw new Error('missing --output') + } + const OUTPUT_STREAM = fs.createWriteStream(args['output']) + + return { + BATCH_RANGE_START, + BATCH_RANGE_END, + OUTPUT_STREAM, + } +} + +const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs() + +// We need to handle the start and end differently as ids of deleted projects are created at time of deletion. +if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) { + throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END') +} + +let gracefulShutdownInitiated = false + +process.on('SIGINT', handleSignal) +process.on('SIGTERM', handleSignal) + +function handleSignal() { + gracefulShutdownInitiated = true + console.warn('graceful shutdown initiated, draining queue') +} + +async function processBatch(batch) { + if (gracefulShutdownInitiated) { + throw new Error('graceful shutdown: aborting batch processing') + } + + const N = batch.length + const firstId = batch[0]._id + const lastId = batch[N - 1]._id + const projectCursor = await projectsCollection.find( + { _id: { $gte: firstId, $lte: lastId } }, + { + projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 }, + readPreference: READ_PREFERENCE_SECONDARY, + } + ) + const projectMap = new Map() + for await (const project of projectCursor) { + projectMap.set(project._id.toString(), project) + } + for (const project of batch) { + const projectId = project._id.toString() + const projectRecord = projectMap.get(projectId) + if (!projectRecord) { + console.error(`project not found: ${projectId}`) + continue + } + if (!projectRecord.overleaf?.history?.id) { + console.error(`project missing history: ${projectId}`) + continue + } + const historyId = projectRecord.overleaf.history.id.toString() + const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},` + const hashes = project.blobs.map(blob => blob.toString('hex')) + const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash)) + if (projectBlobHashes.length < hashes.length) { + console.warn( + `project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs` + ) + } + const rows = projectBlobHashes.map( + hash => prefix + makeProjectKey(historyId, hash) + '\n' + ) + OUTPUT_STREAM.write(rows.join('')) + } +} + +async function main() { + await loadGlobalBlobs() + OUTPUT_STREAM.write('projectId,lastUpdated,path\n') + await batchedUpdate( + backedUpBlobsCollection, + {}, + processBatch, + {}, + {}, + { BATCH_RANGE_START, BATCH_RANGE_END } + ) +} + +main() + .then(() => console.log('Done.')) + .catch(err => { + console.error('Error:', err) + process.exitCode = 1 + }) + .finally(() => { + knex.destroy().catch(err => { + console.error('Error closing Postgres connection:', err) + }) + client.close().catch(err => console.error('Error closing MongoDB:', err)) + })