// @ts-check

/*
 * Provenance (from the git patch this file was extracted from):
 *   Commit:  a0d79d5871ebd21d411f0426d1bcd67f617e5cf0
 *   Author:  Andrew Rumble
 *   Date:    Fri, 21 Feb 2025 13:57:00 +0000
 *   Subject: Merge pull request #23742 from
 *            overleaf/ar-script-for-removing-blobs-in-wrong-place
 *            [history-v1] Script for deleting blobs backed up to wrong location
 *   GitOrigin-RevId: 6abe1dba5164f8fa7d41c4ee3e4a07764e73b7e1
 *   Path:    services/history-v1/storage/scripts/remove_backup_blobs_from_wrong_path.mjs
 */

/**
 * This script is used to remove blobs that have been backed up under the project ID
 * instead of the history ID (where those are different).
 *
 * This script reads a CSV file with the following format:
 * ```
 * project_id,hash
 * <project_id>,<hash>
 * ```
 *
 * The header row is optional (pass --header when it is present). All rows will be
 * checked for conformance to the format; blank lines are ignored and rows that do
 * not conform are reported and skipped.
 *
 * Options:
 *   --input   path of the CSV file (required)
 *   --commit  actually delete; without it the script only prints what it would delete
 *   --header  the first non-blank line of the CSV is a header row
 *   --force   skip the safety checks performed by canDeleteBlob
 */

import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import check from 'check-types'

const argsSchema = [
  {
    name: 'input',
    type: String,
  },
  {
    name: 'commit',
    type: Boolean,
  },
  {
    name: 'header',
    type: Boolean,
  },
  {
    name: 'force',
    type: Boolean,
  },
]

const args = commandLineArgs(argsSchema)

/**
 * Close the MongoDB client and terminate the process.
 *
 * @param {number} [code] - process exit code (defaults to 0)
 * @return {Promise<void>} never settles in practice: process.exit() is called
 */
async function gracefulClose(code = 0) {
  await client.close()
  process.exit(code)
}

/**
 * Negate a throwing assertion: report whether `fn` rejects `value`.
 *
 * @param {(value: unknown) => void} fn - assertion that throws on invalid input
 * @param {unknown} value
 * @return {boolean} true when `fn(value)` throws, false when it passes
 */
function not(fn, value) {
  try {
    fn(value)
    return false
  } catch {
    return true
  }
}

/**
 * Parse and validate one CSV data row.
 *
 * Only the first two comma-separated fields are used. Surrounding whitespace
 * is trimmed so that `id, hash` is accepted.
 *
 * @param {string} row
 * @return {{projectId: string, hash: string}}
 * @throws when the first field fails assert.mongoId or the second fails
 *   assert.blobHash
 */
function parseCSVRow(row) {
  const [projectId, hash] = row.split(',').map(field => field.trim())
  assert.mongoId(projectId, `invalid projectId ${projectId}`)
  assert.blobHash(hash, `invalid hash ${hash}`)
  return { projectId, hash }
}

/**
 * Stream the CSV file line by line, yielding one validated entry per data row.
 *
 * Blank lines are ignored. Invalid rows are reported on the console and
 * skipped. If the file cannot be opened, or a header row was announced but the
 * first non-blank line looks like data, the process is terminated with exit
 * code 1.
 *
 * @param {string} path
 * @param {boolean} hasHeader
 * @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
 */
async function* readCSV(path, hasHeader) {
  let seenHeader = !hasHeader
  let fh
  try {
    fh = await fs.promises.open(path, 'r')
  } catch (error) {
    console.error(`Could not open file: ${error}`)
    return await gracefulClose(1)
  }
  for await (const line of fh.readLines()) {
    if (line.trim() === '') {
      // Blank lines carry no data; skip them without reporting an error.
      continue
    }
    if (!seenHeader) {
      const [first, second] = line.split(',').map(field => field.trim())
      // A genuine header has neither a valid project ID nor a valid hash. If
      // either field validates, the caller most likely passed --header for a
      // file without one, and a real entry would be dropped silently.
      const noDataInHeader =
        not(assert.mongoId, first) && not(assert.blobHash, second)
      if (!noDataInHeader) {
        console.error('Data found in header row')
        return await gracefulClose(1)
      }
      seenHeader = true
      continue
    }
    try {
      yield parseCSVRow(line)
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
      console.info(`Skipping invalid row: ${line}`)
    }
  }
}

/**
 * Print the command-line synopsis.
 */
function usage() {
  console.info(
    'Usage: remove_backup_blobs_from_wrong_path.mjs --input <csv-file> [--commit] [--header] [--force]'
  )
}

if (!args.input) {
  console.error('--input was missing')
  usage()
  await gracefulClose(1)
}

/**
 * Look up the history ID stored on a project document.
 *
 * Taken from backup store (in PR currently), will switch to using that when it lands.
 *
 * NOTE(review): the type parameter of the original `@return {Promise<...>}` was
 * lost; `string` is presumed. The TODO in canDeleteBlob suggests the stored
 * value may be an integer at runtime - confirm.
 *
 * @param {string} projectId
 * @return {Promise<string>}
 * @throws {Error} when the project does not exist or has no history ID
 */
async function getHistoryId(projectId) {
  const project = await projects.findOne(
    { _id: new ObjectId(projectId) },
    {
      projection: {
        'overleaf.history.id': 1,
      },
    }
  )
  if (!project) {
    throw new Error('Project not found')
  }
  const historyId = project.overleaf?.history?.id
  if (historyId == null) {
    // Fail with a clear message instead of a TypeError or an undefined result.
    throw new Error('History ID not found')
  }
  return historyId
}

/**
 * Delete the backup object stored under the key derived from the project ID
 * and hash, or, without --commit, only print the key that would be deleted.
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 */
async function deleteBlob(projectId, hash) {
  const path = makeProjectKey(projectId, hash)
  if (args.commit) {
    await backupPersistor.deleteObject(projectBlobsBucket, path)
  } else {
    console.log(`DELETE: ${path}`)
  }
}

/**
 * Safety checks run before a deletion (bypassed by --force). Resolves when the
 * blob may be deleted and throws an Error describing the reason otherwise.
 *
 * Checks, in order:
 *  1. the project exists and has a history ID;
 *  2. the history ID differs from the project ID;
 *  3. the history ID passes assert.postgresId;
 *  4. verifyBlobs succeeds for the hash under the history ID.
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 */
async function canDeleteBlob(projectId, hash) {
  let historyId
  try {
    historyId = await getHistoryId(projectId)
  } catch {
    throw new Error(`No history ID found for project ${projectId}, skipping`)
  }
  // Compare string forms: the value read from MongoDB is not necessarily a
  // string (it may be an ObjectId or an integer), in which case a strict
  // comparison against the string project ID could never match.
  if (String(historyId) === projectId) {
    throw new Error(
      `Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
    )
  }

  // TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
  assert.postgresId(
    `${historyId}`,
    `History ID ${historyId} does not appear to be for a postgres project`
  )

  try {
    await verifyBlobs(historyId, [hash])
  } catch {
    throw new Error(
      `Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
    )
  }
}

if (!args.commit) {
  console.log('DRY RUN: provide --commit to perform operations')
}

if (args.force) {
  console.log(
    'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
  )
  // Pause so the operator has a chance to abort before anything is deleted.
  await setTimeout(5_000)
}

let deleted = 0
// Entries rejected by canDeleteBlob. Invalid CSV rows are reported by readCSV
// and are not included in this count.
let skipped = 0
let errors = 0

for await (const { projectId, hash } of readCSV(args.input, args.header)) {
  if (!args.force) {
    try {
      await canDeleteBlob(projectId, hash)
    } catch (error) {
      skipped++
      console.error(error instanceof Error ? error.message : error)
      continue
    }
  }
  try {
    await deleteBlob(projectId, hash)
    deleted++
  } catch (error) {
    errors++
    console.error(error)
  }
}

// In a dry run nothing has been removed, so do not claim that it was.
console.log(`${args.commit ? 'Deleted' : 'Would delete'}: ${deleted}`)
console.log(`Skipped: ${skipped}`)
console.log(`Errors: ${errors}`)

await gracefulClose()