mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-29 12:01:32 +02:00
Merge pull request #23058 from overleaf/bg-export-backed-up-blobs
export backed up blobs to csv file GitOrigin-RevId: 5a176055dc85c56ced64cbf13aa705b56071cdae
This commit is contained in:
@@ -20,7 +20,13 @@ async function processRecord(record) {
|
||||
mongoId(record._id)
|
||||
const newId = new ObjectId(record._id)
|
||||
if (config.commit) {
|
||||
await backedUpBlobs.insertOne({ _id: newId, blobs: record.blobs })
|
||||
await backedUpBlobs.updateOne(
|
||||
{ _id: newId },
|
||||
{
|
||||
$addToSet: { blobs: { $each: record.blobs } },
|
||||
},
|
||||
{ upsert: true }
|
||||
)
|
||||
await backedUpBlobs.deleteOne({ _id: record._id })
|
||||
}
|
||||
STATS.replaced++
|
||||
|
||||
149
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
149
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,149 @@
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import knex from '../lib/knex.js'
|
||||
import {
|
||||
batchedUpdate,
|
||||
objectIdFromInput,
|
||||
READ_PREFERENCE_SECONDARY,
|
||||
} from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import {
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
makeProjectKey,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import {
|
||||
backedUpBlobs as backedUpBlobsCollection,
|
||||
db,
|
||||
client,
|
||||
} from '../lib/mongodb.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import fs from 'node:fs'
|
||||
|
||||
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

// Handle on the `projects` collection; processBatch reads each project's
// history id and lastUpdated timestamp from it.
const projectsCollection = db.collection('projects')
|
||||
|
||||
/**
 * Parse the command line options for this script.
 *
 * Options:
 *   --BATCH_RANGE_START  defaults to 2012-01-01T00:00:00Z (as an ISO string)
 *   --BATCH_RANGE_END    defaults to the time the script was started
 *   --output / -o        path of the file to write; required
 *
 * Both range bounds are passed through objectIdFromInput() and stringified.
 *
 * @returns {{BATCH_RANGE_START: string, BATCH_RANGE_END: string, OUTPUT_STREAM: fs.WriteStream}}
 * @throws {Error} when --output is not given
 */
function parseArgs() {
  const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
  const optionDefinitions = [
    {
      name: 'BATCH_RANGE_START',
      type: String,
      defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
    },
    {
      name: 'BATCH_RANGE_END',
      type: String,
      defaultValue: new Date().toISOString(),
    },
    { name: 'output', type: String, alias: 'o' },
  ]
  const args = commandLineArgs(optionDefinitions)

  const rangeStart = objectIdFromInput(args['BATCH_RANGE_START']).toString()
  const rangeEnd = objectIdFromInput(args['BATCH_RANGE_END']).toString()

  const outputPath = args['output']
  if (!outputPath) {
    throw new Error('missing --output')
  }

  return {
    BATCH_RANGE_START: rangeStart,
    BATCH_RANGE_END: rangeEnd,
    OUTPUT_STREAM: fs.createWriteStream(outputPath),
  }
}
|
||||
|
||||
// The range bounds must be given as command line flags, not environment
// variables. This guard runs before parseArgs() because parseArgs() opens the
// --output file for writing; a rejected invocation should not touch that file.
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
  throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}

const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()
|
||||
|
||||
// Set to true by handleSignal; processBatch checks it before starting a batch.
let gracefulShutdownInitiated = false

// Route both termination signals to the same handler.
for (const signal of ['SIGINT', 'SIGTERM']) {
  process.on(signal, handleSignal)
}
|
||||
|
||||
/**
 * Signal handler: record that a shutdown was requested and log a warning.
 * The process is not terminated here; processBatch aborts on its next call
 * once the flag is set.
 */
function handleSignal() {
  gracefulShutdownInitiated = true
  console.warn('graceful shutdown initiated, draining queue')
}
|
||||
|
||||
/**
 * Write CSV rows for every project-scoped blob of the records in a batch.
 *
 * For each record, the matching project is looked up in the projects
 * collection. Records without a project, or whose project has no
 * `overleaf.history.id`, are logged and skipped. Hashes present in
 * GLOBAL_BLOBS are excluded (with a warning). Each remaining hash yields one
 * row: `projectId,lastUpdated,<makeProjectKey(historyId, hash)>`.
 *
 * @param {Array<any>} batch - records from the backed-up-blobs collection,
 *   each with `_id` and `blobs`. Assumes the batch is ordered by `_id`, since
 *   the first and last ids bound the project query — TODO confirm against
 *   batchedUpdate.
 * @returns {Promise<void>}
 * @throws {Error} when a graceful shutdown has been requested
 */
async function processBatch(batch) {
  if (gracefulShutdownInitiated) {
    throw new Error('graceful shutdown: aborting batch processing')
  }
  // Nothing to look up or write; also avoids dereferencing batch[0] below.
  if (batch.length === 0) {
    return
  }

  const firstId = batch[0]._id
  const lastId = batch[batch.length - 1]._id
  // Fetch all projects in the batch's id range with a single query.
  const projectCursor = projectsCollection.find(
    { _id: { $gte: firstId, $lte: lastId } },
    {
      projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    }
  )
  const projectMap = new Map()
  for await (const project of projectCursor) {
    projectMap.set(project._id.toString(), project)
  }

  for (const project of batch) {
    const projectId = project._id.toString()
    const projectRecord = projectMap.get(projectId)
    if (!projectRecord) {
      console.error(`project not found: ${projectId}`)
      continue
    }
    if (!projectRecord.overleaf?.history?.id) {
      console.error(`project missing history: ${projectId}`)
      continue
    }
    const historyId = projectRecord.overleaf.history.id.toString()
    const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
    // NOTE(review): blobs are presumably binary hash values — confirm.
    const hashes = project.blobs.map(blob => blob.toString('hex'))
    const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
    if (projectBlobHashes.length < hashes.length) {
      console.warn(
        `project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
      )
    }
    if (projectBlobHashes.length === 0) {
      continue
    }
    const rows = projectBlobHashes.map(
      hash => prefix + makeProjectKey(historyId, hash) + '\n'
    )
    // Respect backpressure: write() returns false once the stream's internal
    // buffer is full, so wait for 'drain' instead of buffering without bound.
    if (!OUTPUT_STREAM.write(rows.join(''))) {
      await new Promise(resolve => OUTPUT_STREAM.once('drain', resolve))
    }
  }
}
|
||||
|
||||
/**
 * Run the export: load the global blobs, write the CSV header line, then hand
 * processBatch to batchedUpdate over the backed-up-blobs collection.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await loadGlobalBlobs()
  OUTPUT_STREAM.write('projectId,lastUpdated,path\n')

  const rangeOptions = { BATCH_RANGE_START, BATCH_RANGE_END }
  // NOTE(review): the three empty objects are positional arguments of
  // batchedUpdate (presumably query, projection and find options) — confirm
  // against @overleaf/mongo-utils.
  await batchedUpdate(
    backedUpBlobsCollection,
    {},
    processBatch,
    {},
    {},
    rangeOptions
  )
}
|
||||
|
||||
/**
 * Log a fatal error and mark the process as failed without exiting
 * immediately.
 *
 * @param {unknown} err
 */
function reportFailure(err) {
  console.error('Error:', err)
  process.exitCode = 1
}

/**
 * Start closing the Postgres and MongoDB connections. Neither close is
 * awaited; a failure to close is only logged.
 */
function closeConnections() {
  knex.destroy().catch(err => {
    console.error('Error closing Postgres connection:', err)
  })
  client.close().catch(err => console.error('Error closing MongoDB:', err))
}

// Script entry point: run, report the outcome, then release connections.
main()
  .then(() => console.log('Done.'))
  .catch(reportFailure)
  .finally(closeConnections)
|
||||
Reference in New Issue
Block a user