Merge pull request #32702 from overleaf/bg-allow-redacting-blobs

add script for redacting unwanted blobs

GitOrigin-RevId: cddbeb4d27546b7cb98634ab364cc8dad0ada76c
This commit is contained in:
Brian Gough
2026-04-09 09:41:30 +01:00
committed by Copybot
parent c83d37437d
commit 367e8d53b2
2 changed files with 363 additions and 0 deletions
@@ -0,0 +1,144 @@
import fs from 'node:fs'
import { Readable } from 'node:stream'
import { createRequire } from 'node:module'
import * as readline from 'node:readline/promises'
import commandLineArgs from 'command-line-args'
import { makeProjectKey } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import knex from '../lib/knex.js'
import redis from '../lib/redis.js'
const require = createRequire(import.meta.url)
const config = require('config')
const persistor = require('../lib/persistor.js')
const { Errors } = require('@overleaf/object-persistor')
// Command-line flags understood by this script, listed as
// [name, alias, type] rows and expanded into command-line-args definitions.
//   historyId / blob : identify the blob to redact (both are required by main)
//   file / empty / delete / message : alternative redaction modes
//   yes : skip the interactive confirmation prompt
const optionDefinitions = [
  ['historyId', 'p', String],
  ['blob', 'b', String],
  ['file', 'f', String],
  ['empty', 'e', Boolean],
  ['delete', 'd', Boolean],
  ['yes', 'y', Boolean],
  ['message', 'm', String],
].map(([name, alias, type]) => ({ name, alias, type }))
/**
 * Delete a blob from the project blob bucket, or overwrite it in place with
 * replacement content.
 *
 * The action is chosen from `options`: `delete` removes the object, `empty`
 * writes zero bytes, `file` uploads the contents of a local file, and
 * otherwise a short message is written (`options.message` or 'REDACTED',
 * followed by the current ISO timestamp). Unless `options.yes` is set, the
 * user is asked to confirm on stdin before anything is changed.
 *
 * @param {string} historyId - project history id used to build the object key
 * @param {string} blobHash - hash of the blob to redact
 * @param {{delete?: boolean, empty?: boolean, file?: string, message?: string, yes?: boolean}} options
 * @returns {Promise<void>}
 * @throws {Error} if the blob does not exist (the persistor error is attached
 *   as `cause`) or if `options.file` is not a regular file
 */
async function replaceBlob(historyId, blobHash, options) {
  const bucket = config.get('blobStore.projectBucket')
  const key = makeProjectKey(historyId, blobHash)

  // 1. Check existence
  try {
    const originalSize = await persistor.getObjectSize(bucket, key)
    console.log(`Found blob ${blobHash} of size ${originalSize} bytes`)
  } catch (err) {
    if (
      err instanceof Errors.NotFoundError ||
      err.code === 'NoSuchKey' ||
      err.name === 'NoSuchKey'
    ) {
      throw new Error(`Blob ${blobHash} not found in project ${historyId}`, {
        cause: err,
      })
    }
    throw err
  }

  // 2. Prepare action
  // The stream is created lazily (after confirmation) so that an aborted run
  // does not leave an opened-but-never-consumed file stream behind.
  let openStream
  let streamSize
  let actionDesc
  if (!options.delete) {
    if (options.empty) {
      openStream = () => Readable.from([])
      streamSize = 0
      actionDesc = 'empty file'
    } else if (options.file) {
      const replacementPath = options.file
      // statSync throws (e.g. ENOENT) before the prompt if the path is missing
      const stat = fs.statSync(replacementPath)
      if (!stat.isFile()) {
        throw new Error(
          `Replacement path ${replacementPath} is not a regular file`
        )
      }
      openStream = () => fs.createReadStream(replacementPath)
      streamSize = stat.size
      actionDesc = `file ${replacementPath}`
    } else {
      const baseMessage = options.message || 'REDACTED'
      const msg = `${baseMessage} ${new Date().toISOString()}`
      const buf = Buffer.from(msg, 'utf8')
      openStream = () => Readable.from([buf])
      streamSize = buf.length
      actionDesc = `message "${msg}"`
    }
  }

  const actionLog = options.delete
    ? `Deleting blob ${blobHash} in ${historyId}`
    : `Replacing blob ${blobHash} in ${historyId} with ${actionDesc} (${streamSize} bytes)`
  console.log(actionLog)

  if (!options.yes) {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
    })
    let answer
    try {
      answer = await rl.question('Proceed (Y/N)? ')
    } finally {
      // always release stdin, even if the prompt fails
      rl.close()
    }
    // tolerate surrounding whitespace; anything other than "y"/"Y" aborts
    if (answer.trim().toLowerCase() !== 'y') {
      console.log('Aborted.')
      return
    }
  }

  // 3. Execute action
  if (options.delete) {
    await persistor.deleteObject(bucket, key)
    console.log('Blob deleted successfully.')
  } else {
    await persistor.sendStream(bucket, key, openStream(), {
      contentType: 'application/octet-stream',
      contentLength: streamSize,
    })
    console.log('Blob replaced successfully.')
  }
}
/**
 * Parse and validate the command-line options, then redact the requested blob.
 *
 * Requires --historyId and --blob. At most one of --delete, --empty, --file
 * and --message may be given; with none of them the blob is replaced by the
 * default message. Any validation failure is reported on stderr and the
 * process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = commandLineArgs(optionDefinitions)
  if (!options.historyId) {
    console.error('Error: --historyId is required.')
    process.exit(1)
  }
  if (!options.blob) {
    console.error('Error: --blob is required.')
    process.exit(1)
  }
  // `--file` given without a value is present in the parsed options but
  // falsy (command-line-args documents null for a valueless String option).
  // Without this check it would not count as a mode and the blob would be
  // silently overwritten with the default message instead of the file.
  if (options.file !== undefined && !options.file) {
    console.error('Error: --file requires a path to the replacement file.')
    process.exit(1)
  }
  const activeModes = [
    options.delete ? '--delete' : null,
    options.empty ? '--empty' : null,
    options.file ? '--file' : null,
    options.message !== undefined ? '--message' : null,
  ].filter(Boolean)
  if (activeModes.length > 1) {
    console.error(
      `Error: Conflicting options provided (${activeModes.join(
        ', '
      )}). Please select exactly one redaction mode.`
    )
    process.exit(1)
  }
  await replaceBlob(options.historyId, options.blob, options)
}
// Script entry point: run main(), report the outcome, then release the
// database connections so the process can terminate.

function reportSuccess() {
  console.log('Done.')
}

// Print the failure and exit with status 1. process.exit() ends the process
// right here, so the connection cleanup below is not reached on failure.
function reportFailure(err) {
  console.error('Error:', err.message)
  process.exit(1)
}

// Close the Postgres, MongoDB and Redis connections. The three shutdowns are
// started together and not awaited; a failure in any of them is only logged.
function closeConnections() {
  const logPostgresError = err => console.error('Error closing Postgres:', err)
  const logMongoError = err => console.error('Error closing MongoDB:', err)
  const logRedisError = err =>
    console.error('Error disconnecting Redis:', err)

  knex.destroy().catch(logPostgresError)
  client.close().catch(logMongoError)
  redis.disconnect().catch(logRedisError)
}

main().then(reportSuccess).catch(reportFailure).finally(closeConnections)
@@ -0,0 +1,219 @@
import { expect } from 'chai'
import { promisify } from 'node:util'
import { execFile } from 'node:child_process'
import { ObjectId } from 'mongodb'
import { BlobStore } from '../../../../storage/lib/blob_store/index.js'
import cleanup from './support/cleanup.js'
// Acceptance tests for storage/scripts/redact.mjs: each test stores a blob,
// runs the script in a child process and inspects the blob store afterwards.
describe('redact.mjs script', function () {
  const TIMEOUT = 20 * 1000

  beforeEach(cleanup.everything)

  /**
   * Run the redact script in a child process with the given CLI arguments.
   *
   * Resolves to `{ stdout, stderr, status }`; `status` is 0 on success and
   * otherwise the `code` reported on the execFile error.
   */
  async function runScript(args = []) {
    try {
      const execFileAsync = promisify(execFile)
      const output = await execFileAsync(
        process.argv0,
        ['storage/scripts/redact.mjs', ...args],
        {
          encoding: 'utf-8',
          timeout: TIMEOUT,
          env: { ...process.env, LOG_LEVEL: 'warn' },
        }
      )
      output.status = 0
      return output
    } catch (err) {
      const { stdout, stderr, code } = err
      // A non-numeric code means the child did not finish with a plain exit
      // status, so dump the whole error to help debugging.
      if (typeof code !== 'number') {
        console.log(err)
      }
      return { stdout, stderr, status: code }
    }
  }

  // Fresh history id together with a BlobStore bound to it.
  function createProject() {
    const historyId = new ObjectId().toString()
    return { historyId, blobStore: new BlobStore(historyId) }
  }

  // CLI arguments that select which blob the script should act on.
  function targetArgs(historyId, hash) {
    return ['--historyId', historyId, '--blob', hash]
  }

  it('should redact one blob completely (via delete) and leave other unmodified', async function () {
    const { historyId, blobStore } = createProject()
    const confidentialBlob = await blobStore.putString('Confidential data')
    const publicBlob = await blobStore.putString('Public data')
    const confidentialHash = confidentialBlob.getHash()
    const publicHash = publicBlob.getHash()

    // Remove the confidential blob entirely
    const { status, stdout } = await runScript([
      ...targetArgs(historyId, confidentialHash),
      '--delete',
      '--yes',
    ])
    expect(status).to.equal(0)
    expect(stdout).to.include(`Deleting blob ${confidentialHash}`)

    // The deleted blob must be gone. getStream is used for the lookup because
    // getString can mask the specific NotFoundError.
    let fetchError
    try {
      await blobStore.getStream(confidentialHash)
    } catch (err) {
      fetchError = err
    }
    expect(fetchError).to.exist
    expect(fetchError.message).to.match(/not found/i)

    // The other blob must be untouched
    expect(await blobStore.getString(publicHash)).to.equal('Public data')
  })

  it('should redact a blob with a default message if no flag is provided', async function () {
    const { historyId, blobStore } = createProject()
    const storedBlob = await blobStore.putString('Confidential data')
    const hash = storedBlob.getHash()

    const { status, stdout } = await runScript([
      ...targetArgs(historyId, hash),
      '--yes',
    ])
    expect(status).to.equal(0)
    expect(stdout).to.include(`Replacing blob ${hash}`)

    // Content is now the default marker followed by an ISO timestamp
    expect(await blobStore.getString(hash)).to.match(
      /^REDACTED \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
    )
  })

  it('should redact a blob with a custom message', async function () {
    const { historyId, blobStore } = createProject()
    const storedBlob = await blobStore.putString('Confidential data')
    const hash = storedBlob.getHash()

    const { status, stdout } = await runScript([
      ...targetArgs(historyId, hash),
      '--message',
      'MY_CUSTOM_MSG',
      '--yes',
    ])
    expect(status).to.equal(0)
    expect(stdout).to.include(`Replacing blob ${hash}`)

    // Content is now the custom marker followed by an ISO timestamp
    expect(await blobStore.getString(hash)).to.match(
      /^MY_CUSTOM_MSG \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/
    )
  })

  it('should redact a blob with an empty file if --empty is used', async function () {
    const { historyId, blobStore } = createProject()
    const storedBlob = await blobStore.putString('Confidential data')
    const hash = storedBlob.getHash()

    const { status, stdout } = await runScript([
      ...targetArgs(historyId, hash),
      '--empty',
      '--yes',
    ])
    expect(status).to.equal(0)
    expect(stdout).to.include(`Replacing blob ${hash}`)

    // The blob still exists but has no content
    expect(await blobStore.getString(hash)).to.equal('')
  })

  it('should redact a blob with a specific file if --file is used', async function () {
    const { historyId, blobStore } = createProject()
    const storedBlob = await blobStore.putString('Confidential data')
    const hash = storedBlob.getHash()

    // Write the replacement content to a throw-away file
    const { mkdtemp, writeFile, rm } = await import('node:fs/promises')
    const { tmpdir } = await import('node:os')
    const { join } = await import('node:path')
    const tmpDir = await mkdtemp(join(tmpdir(), 'redact-test-'))
    const tmpFile = join(tmpDir, 'replacement.txt')
    await writeFile(tmpFile, 'Replacement file content')

    try {
      const { status, stdout } = await runScript([
        ...targetArgs(historyId, hash),
        '--file',
        tmpFile,
        '--yes',
      ])
      expect(status).to.equal(0)
      expect(stdout).to.include(`Replacing blob ${hash}`)

      // The blob now holds the file's content
      expect(await blobStore.getString(hash)).to.equal(
        'Replacement file content'
      )
    } finally {
      await rm(tmpDir, { recursive: true, force: true })
    }
  })

  it('should error when conflicting options are provided', async function () {
    const { historyId, blobStore } = createProject()
    const storedBlob = await blobStore.putString('Confidential data')
    const hash = storedBlob.getHash()

    // --delete and --file are mutually exclusive
    const { status, stderr } = await runScript([
      ...targetArgs(historyId, hash),
      '--delete',
      '--file',
      'dummy.txt',
    ])
    expect(status).to.equal(1)
    expect(stderr).to.include('Error: Conflicting options provided')
    expect(stderr).to.include('--delete')
    expect(stderr).to.include('--file')
  })
})