// Provenance (from the patch this file was extracted from):
//   commit 9a69ed9d902b4ec9904d02048aee1a3b4e87714b
//   Author: Brian Gough, Fri, 31 Jan 2025 15:55:53 +0000
//   Subject: Merge pull request #23169 from overleaf/bg-export-global-blobs
//            Add script to export global blobs to CSV
//   GitOrigin-RevId: 86c885981b150aae4541770a094c822e1fb364b9
//   Path: services/history-v1/storage/scripts/export_global_blobs.mjs

/**
 * A script to export the global blobs from mongo to a CSV file.
 *
 *     node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
 *
 * The output file is opened with the 'wx' flag, so the script fails rather
 * than overwriting a file that already exists.
 *
 * The output CSV has the following format:
 *
 *     hash,path,byteLength,stringLength,demoted
 *
 * hash: the hash of the blob
 * path: the path of the blob in the blob store
 * byteLength: the byte length of the blob, or empty if unknown
 * stringLength: the string length of the blob, or empty if unknown
 * demoted: true if the blob has been demoted to a reference, false otherwise
 */

// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
import { finished } from 'node:stream/promises'

// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

const USAGE =
  'Usage: node storage/scripts/export_global_blobs.mjs --output <file.csv>'

/**
 * Parse the command line and open the output file.
 *
 * Exits the process with status 1 and a usage message when `--output` (`-o`)
 * is missing or empty, instead of letting `fs.createWriteStream` throw an
 * opaque argument-type error.
 *
 * @returns {{ OUTPUT_STREAM: fs.WriteStream }} stream for the CSV output,
 *   opened with the 'wx' flag (exclusive create)
 */
function parseArgs() {
  const args = commandLineArgs([
    {
      name: 'output',
      type: String,
      alias: 'o',
    },
  ])
  const outputPath = args.output
  if (typeof outputPath !== 'string' || outputPath === '') {
    console.error(USAGE)
    process.exit(1)
  }
  const OUTPUT_STREAM = fs.createWriteStream(outputPath, { flags: 'wx' })

  return {
    OUTPUT_STREAM,
  }
}

const { OUTPUT_STREAM } = parseArgs()

/**
 * Build one CSV row (including the trailing newline) for a global blob.
 *
 * @param {string} hash - key under which the blob is registered
 * @param {{ hash: string, byteLength?: number | null, stringLength?: number | null }} blob
 * @param {boolean} demoted
 * @returns {string}
 * @throws {Error} if the blob's own hash differs from its registry key
 */
function formatRow(hash, blob, demoted) {
  const { hash: blobHash, byteLength, stringLength } = blob
  if (blobHash !== hash) {
    throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
  }
  const path = `${blobHash.slice(0, 2)}/${blobHash.slice(2)}`
  // `??` maps both null and undefined to an empty field, so a missing length
  // never ends up as the literal text "undefined" in the CSV.
  return `${hash},${path},${byteLength ?? ''},${stringLength ?? ''},${demoted}\n`
}

/**
 * Load the global blobs and write the header plus one row per blob, then end
 * the stream. On failure the stream is destroyed so the file descriptor is
 * released before the error propagates.
 *
 * Backpressure is not handled: every row comes from the already in-memory
 * GLOBAL_BLOBS collection, so the stream's buffer is bounded by data that is
 * resident anyway.
 *
 * @returns {Promise<void>}
 */
async function writeGlobalBlobs() {
  try {
    await loadGlobalBlobs()
    OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
    for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
      OUTPUT_STREAM.write(formatRow(hash, blob, demoted))
    }
    OUTPUT_STREAM.end()
  } catch (err) {
    OUTPUT_STREAM.destroy()
    throw err
  }
}

/**
 * Export all global blobs to the output stream and resolve only once the
 * stream has finished, so "Done." is not reported before the data is flushed.
 *
 * @returns {Promise<void>} rejects on a load failure, a hash mismatch, or any
 *   output stream error (e.g. the output file already exists)
 */
async function main() {
  // Start watching the stream before the first await: the file is opened
  // asynchronously, and an 'error' event with no listener would otherwise
  // crash the process and skip the cleanup in the promise chain below.
  const outputFinished = finished(OUTPUT_STREAM)
  await Promise.all([writeGlobalBlobs(), outputFinished])
}

main()
  .then(() => console.log('Done.'))
  .catch(err => {
    console.error('Error:', err)
    process.exitCode = 1
  })
  .finally(() => {
    client.close().catch(err => console.error('Error closing MongoDB:', err))
  })