Merge pull request #23742 from overleaf/ar-script-for-removing-blobs-in-wrong-place

[history-v1] Script for deleting blobs backed up to wrong location

GitOrigin-RevId: 6abe1dba5164f8fa7d41c4ee3e4a07764e73b7e1
This commit is contained in:
Andrew Rumble
2025-02-21 13:57:00 +00:00
committed by Copybot
parent df671b2869
commit a0d79d5871

View File

@@ -0,0 +1,233 @@
// @ts-check
/**
* This script is used to remove blobs that have been backed up under the project ID
* instead of the history ID (where those are different).
*
* This script reads a CSV file with the following format:
* ```
* project_id,hash
* <mongo ID>,<hash>
* ```
*
* The header row is optional; pass --header when the file has one. Every data row is
* checked for conformance to the format, and rows that do not conform are skipped.
*/
import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import check from 'check-types'
// Command line options understood by this script.
const argsSchema = [
  // Path of the CSV file to read.
  { name: 'input', type: String },
  // Perform the deletions; without it the script only logs what it would delete.
  { name: 'commit', type: Boolean },
  // The first line of the CSV file is a header row and must be skipped.
  { name: 'header', type: Boolean },
  // Skip the per-row safety checks done by canDeleteBlob.
  { name: 'force', type: Boolean },
]
const args = commandLineArgs(argsSchema)
/**
 * Close the client imported from ../lib/mongodb.js, then terminate the process.
 *
 * @param {number} [code] exit code handed to process.exit (defaults to 0)
 * @return {Promise<void>}
 */
async function gracefulClose(code = 0) {
  // Wait for the client to close before exiting.
  await client.close()
  process.exit(code)
}
/**
 * Invert an assertion-style function: call `fn` with `value` and report
 * whether it threw.
 *
 * @param {(value: unknown) => void} fn function that throws when `value` is rejected
 * @param {unknown} value value passed to `fn`
 * @return {boolean} true when `fn` threw, false when it returned normally
 */
function not(fn, value) {
  let threw = false
  try {
    fn(value)
  } catch {
    threw = true
  }
  return threw
}
/**
 * Turn one CSV line into a `{ projectId, hash }` record.
 *
 * The line is split on commas and only the first two fields are used. Both
 * fields are passed through the assert helpers (mongoId / blobHash), which
 * are expected to throw when a field does not have the right format.
 *
 * @param {string} row raw line from the CSV file
 * @return {{projectId: string, hash: string}}
 */
function parseCSVRow(row) {
  const columns = row.split(',')
  const projectId = columns[0]
  const hash = columns[1]
  assert.mongoId(projectId, `invalid projectId ${projectId}`)
  assert.blobHash(hash, `invalid hash ${hash}`)
  return { projectId, hash }
}
/**
 * Stream the rows of the input CSV file as `{ projectId, hash }` records.
 *
 * If the file cannot be opened, or a declared header row looks like data,
 * the problem is logged and gracefulClose(1) is called. Data rows that fail
 * to parse are logged and skipped.
 *
 * @param {string} path location of the CSV file
 * @param {boolean} hasHeader whether the first line is a header to discard
 * @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
 */
async function* readCSV(path, hasHeader) {
  let fh
  try {
    fh = await fs.promises.open(path, 'r')
  } catch (error) {
    console.error(`Could not open file: ${error}`)
    return await gracefulClose(1)
  }

  // Stays truthy until the header line has been consumed.
  let headerPending = hasHeader

  for await (const line of fh.readLines()) {
    if (headerPending) {
      const [first, second] = line.split(',')
      // A header must not parse as data: if either column validates, the
      // file most likely has no header and --header was passed by mistake.
      const dataInHeader =
        !not(assert.mongoId, first) || !not(assert.blobHash, second)
      if (dataInHeader) {
        console.error('Data found in header row')
        return await gracefulClose(1)
      }
      headerPending = false
      continue
    }

    try {
      yield parseCSVRow(line)
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
      console.info(`Skipping invalid row: ${line}`)
    }
  }
}
/**
 * Print the command line synopsis of this script.
 */
function usage() {
  const synopsis =
    'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force]'
  console.info(synopsis)
}
// --input is mandatory: without it, report the problem, show the usage
// line and exit with status 1.
if (!args.input) {
  console.error('--input was missing')
  usage()
  await gracefulClose(1)
}
/**
 * Look up the history ID stored at `overleaf.history.id` on a project.
 *
 * Taken from backup store (in PR currently), will switch to using that when it lands.
 *
 * @param {string} projectId
 * @return {Promise<string>}
 * @throws {Error} 'Project not found' when no project has this ID
 */
async function getHistoryId(projectId) {
  const filter = { _id: new ObjectId(projectId) }
  // Only the history ID is needed, so fetch nothing else.
  const options = { projection: { 'overleaf.history.id': 1 } }

  const project = await projects.findOne(filter, options)
  if (!project) {
    throw new Error('Project not found')
  }
  return project.overleaf.history.id
}
/**
 * Delete the backup object stored under the key built from the project ID
 * and blob hash. Without --commit nothing is deleted and the key is only
 * logged.
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 */
async function deleteBlob(projectId, hash) {
  const path = makeProjectKey(projectId, hash)

  // Dry run: report what would have been removed and stop here.
  if (!args.commit) {
    console.log(`DELETE: ${path}`)
    return
  }

  await backupPersistor.deleteObject(projectBlobsBucket, path)
}
/**
 * Check that the copy of a blob stored under the project ID may be deleted.
 *
 * Resolves when every check passes and throws otherwise. The checks are:
 *  - a history ID can be looked up for the project;
 *  - the history ID differs from the project ID;
 *  - the history ID passes assert.postgresId;
 *  - verifyBlobs succeeds for the blob under the history ID.
 *
 * Errors raised by the lookup and by the verification are attached as the
 * `cause` of the thrown error so the underlying failure is not lost.
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 * @throws {Error} when any of the checks fails
 */
async function canDeleteBlob(projectId, hash) {
  let historyId
  try {
    historyId = await getHistoryId(projectId)
  } catch (error) {
    throw new Error(`No history ID found for project ${projectId}, skipping`, {
      cause: error,
    })
  }

  // A project document without a history ID yields undefined rather than an
  // error; report it with the same message instead of a postgres ID failure.
  if (historyId == null) {
    throw new Error(`No history ID found for project ${projectId}, skipping`)
  }

  // The stored history ID is not guaranteed to be a string (see the TODO
  // below), so compare string forms; a strict comparison against a
  // non-string value would never match.
  if (String(historyId) === projectId) {
    throw new Error(
      `Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
    )
  }

  // TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
  assert.postgresId(
    `${historyId}`,
    `History ID ${historyId} does not appear to be for a postgres project`
  )

  try {
    await verifyBlobs(historyId, [hash])
  } catch (error) {
    throw new Error(
      `Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`,
      { cause: error }
    )
  }
}
// Announce the mode up front so the operator knows whether anything will be removed.
if (!args.commit) {
  console.log('DRY RUN: provide --commit to perform operations')
}

if (args.force) {
  console.log(
    'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
  )
  // Pause for five seconds after the warning before any row is processed.
  await setTimeout(5_000)
}

// `deleted` counts rows for which deleteBlob returned without throwing; in a
// dry run deleteBlob only logs, so these are rows that would be deleted.
let deleted = 0
let errors = 0

for await (const row of readCSV(args.input, args.header)) {
  const { projectId, hash } = row

  // Without --force every row has to pass the safety checks first.
  let approved = Boolean(args.force)
  if (!approved) {
    try {
      await canDeleteBlob(projectId, hash)
      approved = true
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
    }
  }
  if (!approved) {
    continue
  }

  try {
    await deleteBlob(projectId, hash)
    deleted += 1
  } catch (error) {
    errors += 1
    console.error(error)
  }
}

console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)
await gracefulClose()