From 06f696ced05e22edd30b2039abb68ade16fa16d0 Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Fri, 28 Nov 2025 16:13:47 +0000 Subject: [PATCH] Merge pull request #29980 from overleaf/bg-history-extend-backup-comparison-III Check file tree hashes in backup comparison GitOrigin-RevId: 4bd1f36afa34f326d4b8934c8bb0ea00a52cf1d9 --- .../storage/lib/backup_store/index.js | 44 ++++++++++++++-- .../history-v1/storage/scripts/backup.mjs | 51 ++++++++++++++++++- 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/services/history-v1/storage/lib/backup_store/index.js b/services/history-v1/storage/lib/backup_store/index.js index 06bd16c139..4f9653bf7c 100644 --- a/services/history-v1/storage/lib/backup_store/index.js +++ b/services/history-v1/storage/lib/backup_store/index.js @@ -68,14 +68,18 @@ async function getHistoryId(projectId) { return project.overleaf.history.id } -async function getBackupStatus(projectId) { +async function getBackupStatus(projectId, options = {}) { + const projection = { + 'overleaf.history': 1, + 'overleaf.backup': 1, + } + if (options.includeRootFolder) { + projection.rootFolder = 1 + } const project = await projects.findOne( { _id: new ObjectId(projectId) }, { - projection: { - 'overleaf.history': 1, - 'overleaf.backup': 1, - }, + projection, } ) if (!project) { @@ -93,9 +97,38 @@ async function getBackupStatus(projectId) { historyId: `${project.overleaf.history.id}`, currentEndVersion: project.overleaf.history.currentEndVersion, currentEndTimestamp: project.overleaf.history.currentEndTimestamp, + ...(options.includeRootFolder && { rootFolder: project.rootFolder?.[0] }), } } +/** + * Recursively traverses the file tree and collects file hashes into a Set. + * + * @param {object} rootFolder - The root folder object of the file tree. + * @returns {Set} A Set containing all unique file hashes found in the file tree. + */ +function getHashesFromFileTree(rootFolder) { + const hashSet = new Set() + + function processFolder(folder) { + for (const file of folder.fileRefs || []) { + if (file?.hash) { + hashSet.add(file.hash) + } + } + + for (const subfolder of folder.folders || []) { + if (subfolder?._id) { + processFolder(subfolder) + } + } + } + + processFolder(rootFolder) + + return hashSet +} + async function setBackupVersion( projectId, previousBackedUpVersion, @@ -216,4 +249,5 @@ module.exports = { listUninitializedBackups, getBackedUpBlobHashes, unsetBackedUpBlobHashes, + getHashesFromFileTree, } diff --git a/services/history-v1/storage/scripts/backup.mjs b/services/history-v1/storage/scripts/backup.mjs index dbb5d22127..037a720de1 100644 --- a/services/history-v1/storage/scripts/backup.mjs +++ b/services/history-v1/storage/scripts/backup.mjs @@ -7,6 +7,7 @@ import { getProjectChunks, getLatestChunkMetadata, create, + getBackend, } from '../lib/chunk_store/index.js' import { client } from '../lib/mongodb.js' import redis from '../lib/redis.js' @@ -27,6 +28,7 @@ import { updatePendingChangeTimestamp, getBackedUpBlobHashes, unsetBackedUpBlobHashes, + getHashesFromFileTree, } from '../lib/backup_store/index.js' import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs' import { @@ -949,8 +951,19 @@ async function getBlobListing(historyId) { */ async function compareBackups(projectId, options, log = console.log) { - log(`Comparing backups for project ${projectId}`) - const { historyId } = await getBackupStatus(projectId) + // Convert any postgres history ids to mongo project ids + const backend = getBackend(projectId) + projectId = await backend.resolveHistoryIdToMongoProjectId(projectId) + const { historyId, rootFolder } = await getBackupStatus(projectId, { + includeRootFolder: true, + }) + + log(`Comparing backups for project ${projectId} historyId ${historyId}`) + const hashesFromFileTree = rootFolder + ? getHashesFromFileTree(rootFolder) + : new Set() + const hashesFromHistory = new Set() + const chunks = await getProjectChunks(historyId) const blobStore = new BlobStore(historyId) const backupPersistorForProject = await backupPersistor.forProject( @@ -1047,6 +1060,9 @@ async function compareBackups(projectId, options, log = console.log) { throw new Error('interrupted') } + // Track all the hashes in the history + hashesFromHistory.add(blob.hash) + if (GLOBAL_BLOBS.has(blob.hash)) { const globalBlob = GLOBAL_BLOBS.get(blob.hash) log( @@ -1158,6 +1174,31 @@ async function compareBackups(projectId, options, log = console.log) { } } + if (gracefulShutdownInitiated) { + throw new Error('interrupted') + } + // Reconcile hashes in file tree with history + log(`Comparing file hashes from file tree with history`) + if (hashesFromFileTree.size > 0) { + for (const hash of hashesFromFileTree) { + const presentInHistory = hashesFromHistory.has(hash) + if (presentInHistory) { + log(` ✓ File tree hash ${hash} present in history`) + } else { + log(` ✗ File tree hash ${hash} not found in history`) + totalBlobsNotFound++ + errors.push({ + type: 'file-not-found', + historyId, + blobHash: hash, + error: `File tree hash ${hash} not found in history`, + }) + } + } + } else { + log(` ✓ File tree does not contain any binary files`) + } + // Print summary log('\nComparison Summary:') log('==================') @@ -1236,6 +1277,9 @@ async function compareProjectAndEmitResult( return false } catch (err) { + if (gracefulShutdownInitiated) { + throw err + } console.log(`FAIL: ${projectId}`) // Output buffered logs on error when verbose @@ -1276,6 +1320,9 @@ async function compareProjectAndEmitResult( case 'blob-size-mismatch': console.log(`size-mismatch: ${projectId},${historyId},${blobHash}`) break + case 'file-not-found': + console.log(`file-not-found: ${projectId},${historyId},${blobHash}`) + break case 'chunk-mismatch': console.log(`chunk-mismatch: ${projectId},${historyId},${chunkId}`) break