From 6884338a3462460c802ed5be35f2074fd7a8bb7a Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Tue, 21 Jan 2025 16:42:56 +0000 Subject: [PATCH] [history-v1] fix remaining files with missing hashes (#22985) GitOrigin-RevId: df382732954538f9db177180aee0a44c5d5679ca --- .../scripts/back_fill_file_hash_fix_up.mjs | 119 ++++++++++++++---- .../back_fill_file_hash_fix_up.test.mjs | 62 ++++++++- 2 files changed, 154 insertions(+), 27 deletions(-) diff --git a/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs index fc9ad2aa0f..e6ecdc7b2f 100644 --- a/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs +++ b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs @@ -5,6 +5,7 @@ import Stream from 'node:stream' import { ObjectId } from 'mongodb' import logger from '@overleaf/logger' import OError from '@overleaf/o-error' +import { Blob } from 'overleaf-editor-core' import { BlobStore, getStringLengthOfFile, @@ -25,7 +26,6 @@ Events.setMaxListeners(20) ObjectId.cacheHexString = true /** - * @typedef {import("overleaf-editor-core").Blob} Blob * @typedef {import("mongodb").Collection} Collection * @typedef {import("mongodb").Collection} ProjectsCollection * @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection @@ -51,13 +51,14 @@ ObjectId.cacheHexString = true */ /** - * @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, LOGS: string}} + * @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}} */ function parseArgs() { const args = commandLineArgs([ { name: 'fixNotFound', type: String, defaultValue: 'true' }, { name: 'fixDeletePermission', type: String, defaultValue: 'true' }, { name: 'fixHashMismatch', type: String, defaultValue: 'true' }, + { name: 'fixMissingHash', type: String, defaultValue: 'true' }, { name: 'logs', type: String, defaultValue: '' }, ]) /** @@ -74,12 +75,18 @@ function parseArgs() { FIX_HASH_MISMATCH: boolVal('fixNotFound'), FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'), FIX_NOT_FOUND: boolVal('fixHashMismatch'), + FIX_MISSING_HASH: boolVal('fixMissingHash'), LOGS: args.logs, } } -const { FIX_HASH_MISMATCH, FIX_DELETE_PERMISSION, FIX_NOT_FOUND, LOGS } = - parseArgs() +const { + FIX_HASH_MISMATCH, + FIX_DELETE_PERMISSION, + FIX_NOT_FOUND, + FIX_MISSING_HASH, + LOGS, +} = parseArgs() if (!LOGS) { throw new Error('--logs parameter missing') } @@ -326,37 +333,75 @@ async function importRestoredFilestoreFile(projectId, fileId, historyId) { /** * @param {string} projectId * @param {string} fileId - * @return {Promise} + * @param {string} path + * @return {Promise} */ -async function computeFilestoreFileHash(projectId, fileId) { +async function bufferFilestoreFileToDisk(projectId, fileId, path) { const filestoreKey = `${projectId}/${fileId}` - const path = `${BUFFER_DIR}/${projectId}_${fileId}` try { - let s - try { - s = await filestorePersistor.getObjectStream( + await Stream.promises.pipeline( + await filestorePersistor.getObjectStream( USER_FILES_BUCKET_NAME, filestoreKey - ) - } catch (err) { - if (err instanceof NotFoundError) { - throw new OError('missing blob, need to restore filestore file', { - filestoreKey, - }) - } - throw err - } - await Stream.promises.pipeline( - s, + ), fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK }) ) const blob = await makeBlobForFile(path) + blob.setStringLength( + await getStringLengthOfFile(blob.getByteLength(), path) + ) + return blob + } catch (err) { + if (err instanceof NotFoundError) { + throw new OError('missing blob, need to restore filestore file', { + filestoreKey, + }) + } + throw err + } +} + +/** + * @param {string} projectId + * @param {string} fileId + * @return {Promise} + */ +async function computeFilestoreFileHash(projectId, fileId) { + const path = `${BUFFER_DIR}/${projectId}_${fileId}` + try { + const blob = await bufferFilestoreFileToDisk(projectId, fileId, path) return blob.getHash() } finally { await fs.promises.rm(path, { force: true }) } } +/** + * @param {string} projectId + * @param {string} fileId + * @return {Promise} + */ +async function uploadFilestoreFile(projectId, fileId) { + const path = `${BUFFER_DIR}/${projectId}_${fileId}` + try { + const blob = await bufferFilestoreFileToDisk(projectId, fileId, path) + const hash = blob.getHash() + try { + await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) + } catch (err) { + if (!(err instanceof Blob.NotFoundError)) throw err + + const { project } = await getProject(projectId) + const historyId = project.overleaf.history.id.toString() + const blobStore = new BlobStore(historyId) + await blobStore.putBlob(path, blob) + await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) + } + } finally { + await fs.promises.rm(path, { force: true }) + } +} + /** * @param {string} line * @return {Promise} @@ -468,6 +513,23 @@ async function fixDeletePermission(line) { return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) } +/** + * @param {string} line + * @return {Promise} + */ +async function fixMissingHash(line) { + let { projectId, _id: fileId } = JSON.parse(line) + const { + fileRef: { hash }, + } = await findFile(projectId, fileId) + if (hash) { + // processed, double check + return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) + } + await uploadFilestoreFile(projectId, fileId) + return true +} + const CASES = { 'not found': { match: 'NotFoundError', @@ -484,6 +546,11 @@ const CASES = { flag: FIX_DELETE_PERMISSION, action: fixDeletePermission, }, + 'missing file hash': { + match: '"bad file hash"', + flag: FIX_MISSING_HASH, + action: fixMissingHash, + }, } const STATS = { @@ -513,7 +580,15 @@ async function processLog() { nextLine: for await (const line of rl) { if (gracefulShutdownInitiated) break STATS.processedLines++ - if (!line.includes('"failed to process file"')) continue + if ( + !( + line.includes('"failed to process file"') || + // Process missing hashes as flagged by find_malformed_filetrees.mjs + line.includes('"bad file-tree path"') + ) + ) { + continue + } for (const [name, { match, flag, action }] of Object.entries(CASES)) { if (!line.includes(match)) continue diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs index 37b0fd5350..8828eb8e39 100644 --- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs @@ -113,6 +113,8 @@ describe('back_fill_file_hash_fix_up script', function () { const fileIdRestoreFromFilestore0 = objectIdFromTime('2017-02-01T00:10:00Z') const fileIdRestoreFromFilestore1 = objectIdFromTime('2017-02-01T00:11:00Z') const fileIdMissing2 = objectIdFromTime('2017-02-01T00:12:00Z') + const fileIdHashMissing0 = objectIdFromTime('2017-02-01T00:13:00Z') + const fileIdHashMissing1 = objectIdFromTime('2017-02-01T00:14:00Z') const contentCorruptedBlob = 'string that produces another hash' const contentDoesNotExistAsBlob = 'does not exist as blob' const hashDoesNotExistAsBlob = gitBlobHashBuffer( @@ -145,6 +147,16 @@ describe('back_fill_file_hash_fix_up script', function () { historyId: historyId0, fileId: fileIdRestoreFromFilestore1, }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdHashMissing0, + }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdHashMissing1, + }, { projectId: projectIdDeleted0, historyId: historyIdDeleted0, @@ -263,6 +275,19 @@ describe('back_fill_file_hash_fix_up script', function () { err: { message: 'some other error' }, msg: 'failed to process file', }, + // from find_malformed_filetrees.mjs + { + projectId: projectId0, + _id: fileIdHashMissing0, + reason: 'bad file hash', + msg: 'bad file-tree path', + }, + { + projectId: projectId0, + _id: fileIdHashMissing1, + reason: 'bad file hash', + msg: 'bad file-tree path', + }, ] if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) { const fileIds = { @@ -279,6 +304,8 @@ describe('back_fill_file_hash_fix_up script', function () { fileIdWithDifferentHashRestore, fileIdRestoreFromFilestore0, fileIdRestoreFromFilestore1, + fileIdHashMissing0, + fileIdHashMissing1, } console.log({ projectId0, @@ -316,6 +343,19 @@ describe('back_fill_file_hash_fix_up script', function () { `${projectId0}/${fileIdRestoreFromFilestore1}`, Stream.Readable.from([fileIdRestoreFromFilestore1.toString()]) ) + await FILESTORE_PERSISTOR.sendStream( + USER_FILES_BUCKET_NAME, + `${projectId0}/${fileIdHashMissing0}`, + Stream.Readable.from([fileIdHashMissing0.toString()]) + ) + await FILESTORE_PERSISTOR.sendStream( + USER_FILES_BUCKET_NAME, + `${projectId0}/${fileIdHashMissing1}`, + Stream.Readable.from([fileIdHashMissing1.toString()]) + ) + await new BlobStore(historyId0.toString()).putString( + fileIdHashMissing1.toString() // partially processed + ) await new BlobStore(historyId0.toString()).putString( fileIdBlobExistsInGCS0.toString() ) @@ -362,6 +402,8 @@ describe('back_fill_file_hash_fix_up script', function () { { _id: fileIdMissing0 }, { _id: fileIdMissing0 }, // bad file-tree, duplicated fileRef. { _id: fileIdMissing2 }, + { _id: fileIdHashMissing0 }, + { _id: fileIdHashMissing1 }, { _id: fileIdWithDifferentHashFound, hash: gitBlobHash(fileIdInGoodState), @@ -506,8 +548,8 @@ describe('back_fill_file_hash_fix_up script', function () { }) it('should print stats', function () { expect(stats).to.contain({ - processedLines: 14, - success: 9, + processedLines: 16, + success: 11, alreadyProcessed: 0, fileDeleted: 0, skipped: 0, @@ -518,9 +560,9 @@ describe('back_fill_file_hash_fix_up script', function () { it('should handle re-run on same logs', async function () { ;({ stats } = await runScriptWithLogs()) expect(stats).to.contain({ - processedLines: 14, + processedLines: 16, success: 0, - alreadyProcessed: 6, + alreadyProcessed: 8, fileDeleted: 3, skipped: 0, failed: 3, @@ -577,6 +619,16 @@ describe('back_fill_file_hash_fix_up script', function () { // { _id: fileIdMissing0 }, // Removed // { _id: fileIdMissing2 }, + // Added hash + { + _id: fileIdHashMissing0, + hash: gitBlobHash(fileIdHashMissing0), + }, + // Added hash + { + _id: fileIdHashMissing1, + hash: gitBlobHash(fileIdHashMissing1), + }, // No change, should warn about the find. { _id: fileIdWithDifferentHashFound, @@ -639,7 +691,7 @@ describe('back_fill_file_hash_fix_up script', function () { ], overleaf: { history: { id: historyId0 } }, // Incremented when removing file/updating hash - version: 6, + version: 8, }, ]) expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([