From 50cd0f35205896674afc3c86de1e052dfababcd2 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Fri, 17 Jan 2025 11:22:37 +0000 Subject: [PATCH] Merge pull request #22870 from overleaf/jpa-back-fill-fix-up [history-v1] add script for fixing up back-fill errors GitOrigin-RevId: 118992a32c1f6da4289cd35399ddd07a741da4ee --- .../scripts/back_fill_file_hash_fix_up.mjs | 569 +++++++++++++ .../back_fill_file_hash_fix_up.test.mjs | 761 ++++++++++++++++++ 2 files changed, 1330 insertions(+) create mode 100644 services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs create mode 100644 services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs diff --git a/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs new file mode 100644 index 0000000000..fc9ad2aa0f --- /dev/null +++ b/services/history-v1/storage/scripts/back_fill_file_hash_fix_up.mjs @@ -0,0 +1,569 @@ +// @ts-check +import Events from 'node:events' +import fs from 'node:fs' +import Stream from 'node:stream' +import { ObjectId } from 'mongodb' +import logger from '@overleaf/logger' +import OError from '@overleaf/o-error' +import { + BlobStore, + getStringLengthOfFile, + GLOBAL_BLOBS, + makeBlobForFile, +} from '../lib/blob_store/index.js' +import { db } from '../lib/mongodb.js' +import commandLineArgs from 'command-line-args' +import readline from 'node:readline' +import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs' +import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js' +import filestorePersistor from '../lib/persistor.js' + +// Silence warning. +Events.setMaxListeners(20) + +// Enable caching for ObjectId.toString() +ObjectId.cacheHexString = true + +/** + * @typedef {import("overleaf-editor-core").Blob} Blob + * @typedef {import("mongodb").Collection} Collection + * @typedef {import("mongodb").Collection} ProjectsCollection + * @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection + */ + +/** + * @typedef {Object} FileRef + * @property {ObjectId} _id + * @property {string} hash + */ + +/** + * @typedef {Object} Folder + * @property {Array} folders + * @property {Array} fileRefs + */ + +/** + * @typedef {Object} Project + * @property {ObjectId} _id + * @property {Array} rootFolder + * @property {{history: {id: (number|string)}}} overleaf + */ + +/** + * @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, LOGS: string}} + */ +function parseArgs() { + const args = commandLineArgs([ + { name: 'fixNotFound', type: String, defaultValue: 'true' }, + { name: 'fixDeletePermission', type: String, defaultValue: 'true' }, + { name: 'fixHashMismatch', type: String, defaultValue: 'true' }, + { name: 'logs', type: String, defaultValue: '' }, + ]) + /** + * commandLineArgs cannot handle --foo=false, so go the long way + * @param {string} name + * @return {boolean} + */ + function boolVal(name) { + const v = args[name] + if (['true', 'false'].includes(v)) return v === 'true' + throw new Error(`expected "true" or "false" for boolean option ${name}`) + } + return { + FIX_HASH_MISMATCH: boolVal('fixNotFound'), + FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'), + FIX_NOT_FOUND: boolVal('fixHashMismatch'), + LOGS: args.logs, + } +} + +const { FIX_HASH_MISMATCH, FIX_DELETE_PERMISSION, FIX_NOT_FOUND, LOGS } = + parseArgs() +if (!LOGS) { + throw new Error('--logs parameter missing') +} +const BUFFER_DIR = fs.mkdtempSync( + process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-' +) +const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || '' +if (!USER_FILES_BUCKET_NAME) { + throw new Error('env var USER_FILES_BUCKET_NAME is missing') +} +// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode +const STREAM_HIGH_WATER_MARK = parseInt( + process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(), + 10 +) + +/** @type {ProjectsCollection} */ +const projectsCollection = db.collection('projects') +/** @type {DeletedProjectsCollection} */ +const deletedProjectsCollection = db.collection('deletedProjects') + +let gracefulShutdownInitiated = false + +process.on('SIGINT', handleSignal) +process.on('SIGTERM', handleSignal) + +function handleSignal() { + gracefulShutdownInitiated = true + console.warn('graceful shutdown initiated, draining queue') +} + +class FileDeletedError extends OError {} + +/** @type {Map} */ +const PROJECT_CACHE = new Map() + +/** + * @param {string} projectId + * @return {Promise<{project: Project, projectSoftDeleted: boolean}>} + */ +async function getProject(projectId) { + const cached = PROJECT_CACHE.get(projectId) + if (cached) return cached + + let projectSoftDeleted + let project = await projectsCollection.findOne({ + _id: new ObjectId(projectId), + }) + if (project) { + projectSoftDeleted = false + } else { + const softDeleted = await deletedProjectsCollection.findOne({ + 'deleterData.deletedProjectId': new ObjectId(projectId), + project: { $exists: true }, + }) + if (!softDeleted) { + throw new OError('project hard-deleted') + } + project = softDeleted.project + projectSoftDeleted = true + } + PROJECT_CACHE.set(projectId, { projectSoftDeleted, project }) + return { projectSoftDeleted, project } +} + +/** + * @param {Folder} folder + * @param {string} fileId + * @return {{path: string, fileRef: FileRef, folder: Folder}|null} + */ +function getFileTreePath(folder, fileId) { + if (!folder) return null + let idx = 0 + if (Array.isArray(folder.fileRefs)) { + for (const fileRef of folder.fileRefs) { + if (fileRef?._id.toString() === fileId) { + return { + fileRef, + path: `.fileRefs.${idx}`, + folder, + } + } + idx++ + } + } + idx = 0 + if (Array.isArray(folder.folders)) { + for (const child of folder.folders) { + const match = getFileTreePath(child, fileId) + if (match) { + return { + fileRef: match.fileRef, + folder: match.folder, + path: `.folders.${idx}${match.path}`, + } + } + idx++ + } + } + return null +} + +/** + * @param {string} projectId + * @param {string} fileId + * @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>} + */ +async function findFile(projectId, fileId) { + const { projectSoftDeleted, project } = await getProject(projectId) + const match = getFileTreePath(project.rootFolder[0], fileId) + if (!match) { + throw new FileDeletedError('file not found in file-tree', { + projectSoftDeleted, + }) + } + const { path, fileRef, folder } = match + let fullPath + let query + if (projectSoftDeleted) { + fullPath = `project.rootFolder.0${path}` + query = { + 'deleterData.deletedProjectId': new ObjectId(projectId), + [`${fullPath}._id`]: new ObjectId(fileId), + } + } else { + fullPath = `rootFolder.0${path}` + query = { + _id: new ObjectId(projectId), + [`${fullPath}._id`]: new ObjectId(fileId), + } + } + return { + projectSoftDeleted, + query, + fullPath, + fileRef, + folder, + } +} + +/** + * @param {string} line + * @return {Promise} + */ +async function fixNotFound(line) { + const { projectId, fileId, bucketName } = JSON.parse(line) + if (bucketName !== USER_FILES_BUCKET_NAME) { + throw new OError('not found case for another bucket') + } + + const { projectSoftDeleted, query, fullPath, fileRef, folder } = + await findFile(projectId, fileId) + logger.info({ projectId, fileId, fileRef }, 'removing fileRef') + // Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js) + const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.')) + let result + if (projectSoftDeleted) { + result = await deletedProjectsCollection.updateOne(query, { + $pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } }, + $inc: { 'project.version': 1 }, + }) + } else { + result = await projectsCollection.updateOne(query, { + $pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } }, + $inc: { version: 1 }, + }) + } + if (result.matchedCount !== 1) { + throw new OError('file-tree write did not match', { result }) + } + // Update the cache. The mongo-path of the next file will be off otherwise. + folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId)) + return true +} + +/** + * @param {string} projectId + * @param {string} fileId + * @param {string} hash + * @return {Promise} + */ +async function setHashInMongo(projectId, fileId, hash) { + const { projectSoftDeleted, query, fullPath, fileRef } = await findFile( + projectId, + fileId + ) + if (fileRef.hash === hash) return + logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash') + let result + if (projectSoftDeleted) { + result = await deletedProjectsCollection.updateOne(query, { + $set: { [`${fullPath}.hash`]: hash }, + $inc: { 'project.version': 1 }, + }) + } else { + result = await projectsCollection.updateOne(query, { + $set: { [`${fullPath}.hash`]: hash }, + $inc: { version: 1 }, + }) + } + if (result.matchedCount !== 1) { + throw new OError('file-tree write did not match', { result }) + } + fileRef.hash = hash // Update cache for completeness. +} + +/** + * @param {string} projectId + * @param {string} fileId + * @param {string} historyId + * @return {Promise} + */ +async function importRestoredFilestoreFile(projectId, fileId, historyId) { + const filestoreKey = `${projectId}/${fileId}` + const path = `${BUFFER_DIR}/${projectId}_${fileId}` + try { + let s + try { + s = await filestorePersistor.getObjectStream( + USER_FILES_BUCKET_NAME, + filestoreKey + ) + } catch (err) { + if (err instanceof NotFoundError) { + throw new OError('missing blob, need to restore filestore file', { + filestoreKey, + }) + } + throw err + } + await Stream.promises.pipeline( + s, + fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK }) + ) + const blobStore = new BlobStore(historyId) + const blob = await blobStore.putFile(path) + await backupBlob(historyId, blob, path) + await setHashInMongo(projectId, fileId, blob.getHash()) + } finally { + await fs.promises.rm(path, { force: true }) + } +} + +/** + * @param {string} projectId + * @param {string} fileId + * @return {Promise} + */ +async function computeFilestoreFileHash(projectId, fileId) { + const filestoreKey = `${projectId}/${fileId}` + const path = `${BUFFER_DIR}/${projectId}_${fileId}` + try { + let s + try { + s = await filestorePersistor.getObjectStream( + USER_FILES_BUCKET_NAME, + filestoreKey + ) + } catch (err) { + if (err instanceof NotFoundError) { + throw new OError('missing blob, need to restore filestore file', { + filestoreKey, + }) + } + throw err + } + await Stream.promises.pipeline( + s, + fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK }) + ) + const blob = await makeBlobForFile(path) + return blob.getHash() + } finally { + await fs.promises.rm(path, { force: true }) + } +} + +/** + * @param {string} line + * @return {Promise} + */ +async function fixHashMismatch(line) { + const { + projectId, + fileId, + hash: computedHash, + entry: { + hash: fileTreeHash, + ctx: { historyId }, + }, + } = JSON.parse(line) + const blobStore = new BlobStore(historyId) + if (await blobStore.getBlob(fileTreeHash)) { + throw new OError('found blob with computed filestore object hash') + } + if (!(await blobStore.getBlob(computedHash))) { + await importRestoredFilestoreFile(projectId, fileId, historyId) + return true + } + return await ensureBlobExistsForFileAndUploadToAWS( + projectId, + fileId, + computedHash + ) +} + +/** + * @param {string} projectId + * @param {string} fileId + * @param {string} hash + * @return {Promise} + */ +async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) { + const { fileRef } = await findFile(projectId, fileId) + return fileRef.hash === hash +} + +/** + * @param {string} projectId + * @param {string} hash + * @return {Promise} + */ +async function needsBackingUpToAWS(projectId, hash) { + if (GLOBAL_BLOBS.has(hash)) return false + return !(await _blobIsBackedUp(projectId, hash)) +} + +/** + * @param {string} projectId + * @param {string} fileId + * @param {string} hash + * @return {Promise} + */ +async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) { + const { project } = await getProject(projectId) + const historyId = project.overleaf.history.id.toString() + const blobStore = new BlobStore(historyId) + if ( + (await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) && + (await blobStore.getBlob(hash)) && + !(await needsBackingUpToAWS(projectId, hash)) + ) { + return false // already processed + } + + const stream = await blobStore.getStream(hash) + const path = `${BUFFER_DIR}/${historyId}_${hash}` + try { + await Stream.promises.pipeline( + stream, + fs.createWriteStream(path, { + highWaterMark: STREAM_HIGH_WATER_MARK, + }) + ) + + const writtenBlob = await makeBlobForFile(path) + writtenBlob.setStringLength( + await getStringLengthOfFile(writtenBlob.getByteLength(), path) + ) + if (writtenBlob.getHash() !== hash) { + // Double check download, better safe than sorry. + throw new OError('blob corrupted', { writtenBlob }) + } + + let blob = await blobStore.getBlob(hash) + if (!blob) { + // Calling blobStore.putBlob would result in the same error again. + // HACK: Skip upload to GCS and finalize putBlob operation directly. + await blobStore.backend.insertBlob(historyId, writtenBlob) + } + await backupBlob(historyId, writtenBlob, path) + } finally { + await fs.promises.rm(path, { force: true }) + } + await setHashInMongo(projectId, fileId, hash) + return true +} + +/** + * @param {string} line + * @return {Promise} + */ +async function fixDeletePermission(line) { + let { projectId, fileId, hash } = JSON.parse(line) + if (!hash) hash = await computeFilestoreFileHash(projectId, fileId) + return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) +} + +const CASES = { + 'not found': { + match: 'NotFoundError', + flag: FIX_NOT_FOUND, + action: fixNotFound, + }, + 'hash mismatch': { + match: 'OError: hash mismatch', + flag: FIX_HASH_MISMATCH, + action: fixHashMismatch, + }, + 'delete permission': { + match: 'storage.objects.delete', + flag: FIX_DELETE_PERMISSION, + action: fixDeletePermission, + }, +} + +const STATS = { + processedLines: 0, + success: 0, + alreadyProcessed: 0, + fileDeleted: 0, + skipped: 0, + failed: 0, + unmatched: 0, +} +function logStats() { + console.log( + JSON.stringify({ + time: new Date(), + gracefulShutdownInitiated, + ...STATS, + }) + ) +} +setInterval(logStats, 10_000) + +async function processLog() { + const rl = readline.createInterface({ + input: fs.createReadStream(LOGS), + }) + nextLine: for await (const line of rl) { + if (gracefulShutdownInitiated) break + STATS.processedLines++ + if (!line.includes('"failed to process file"')) continue + + for (const [name, { match, flag, action }] of Object.entries(CASES)) { + if (!line.includes(match)) continue + if (flag) { + try { + if (await action(line)) { + STATS.success++ + } else { + STATS.alreadyProcessed++ + } + } catch (err) { + if (err instanceof FileDeletedError) { + STATS.fileDeleted++ + logger.info({ err, line }, 'file deleted, skipping') + } else { + STATS.failed++ + logger.error({ err, line }, `failed to fix ${name}`) + } + } + } else { + STATS.skipped++ + } + continue nextLine + } + STATS.unmatched++ + logger.warn({ line }, 'unknown fatal error') + } +} + +async function main() { + try { + await processLog() + } finally { + logStats() + try { + await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true }) + } catch (err) { + console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err) + } + } + const { skipped, failed, unmatched } = STATS + if (failed > 0) { + process.exit(Math.min(failed, 99)) + } else if (unmatched > 0) { + process.exit(100) + } else if (skipped > 0) { + process.exit(101) + } else { + process.exit(0) + } +} + +await main() diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs new file mode 100644 index 0000000000..37b0fd5350 --- /dev/null +++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash_fix_up.test.mjs @@ -0,0 +1,761 @@ +import fs from 'node:fs' +import Crypto from 'node:crypto' +import Stream from 'node:stream' +import { promisify } from 'node:util' +import { Binary, ObjectId } from 'mongodb' +import { Blob } from 'overleaf-editor-core' +import { backedUpBlobs, blobs, db } from '../../../../storage/lib/mongodb.js' +import cleanup from './support/cleanup.js' +import testProjects from '../api/support/test_projects.js' +import { execFile } from 'node:child_process' +import { expect } from 'chai' +import config from 'config' +import { WritableBuffer } from '@overleaf/stream-utils' +import { + backupPersistor, + projectBlobsBucket, +} from '../../../../storage/lib/backupPersistor.mjs' +import projectKey from '../../../../storage/lib/project_key.js' +import { + BlobStore, + makeProjectKey, +} from '../../../../storage/lib/blob_store/index.js' +import ObjectPersistor from '@overleaf/object-persistor' + +const TIMEOUT = 20 * 1_000 + +const { deksBucket } = config.get('backupStore') +const { tieringStorageClass } = config.get('backupPersistor') + +const projectsCollection = db.collection('projects') +const deletedProjectsCollection = db.collection('deletedProjects') + +const FILESTORE_PERSISTOR = ObjectPersistor({ + backend: 'gcs', + gcs: { + endpoint: { + apiEndpoint: process.env.GCS_API_ENDPOINT, + projectId: process.env.GCS_PROJECT_ID, + }, + }, +}) + +/** + * @param {ObjectId} objectId + * @return {string} + */ +function gitBlobHash(objectId) { + return gitBlobHashBuffer(Buffer.from(objectId.toString())) +} + +/** + * @param {Buffer} buf + * @return {string} + */ +function gitBlobHashBuffer(buf) { + const sha = Crypto.createHash('sha1') + sha.update(`blob ${buf.byteLength}\x00`) + sha.update(buf) + return sha.digest('hex') +} + +/** + * @param {string} gitBlobHash + * @return {Binary} + */ +function binaryForGitBlobHash(gitBlobHash) { + return new Binary(Buffer.from(gitBlobHash, 'hex')) +} + +async function listS3Bucket(bucket, wantStorageClass) { + const client = backupPersistor._getClientForBucket(bucket) + const response = await client.listObjectsV2({ Bucket: bucket }).promise() + + for (const object of response.Contents || []) { + expect(object).to.have.property('StorageClass', wantStorageClass) + } + + return (response.Contents || []).map(item => item.Key || '') +} + +function objectIdFromTime(timestamp) { + return ObjectId.createFromTime(new Date(timestamp).getTime() / 1000) +} + +const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false + +describe('back_fill_file_hash_fix_up script', function () { + this.timeout(TIMEOUT) + const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs' + + const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z') + const projectIdDeleted0 = objectIdFromTime('2017-01-01T00:04:00Z') + const historyId0 = 42 // stored as number is mongo + const historyIdDeleted0 = projectIdDeleted0.toString() + const fileIdWithDifferentHashFound = objectIdFromTime('2017-02-01T00:00:00Z') + const fileIdInGoodState = objectIdFromTime('2017-02-01T00:01:00Z') + const fileIdBlobExistsInGCS0 = objectIdFromTime('2017-02-01T00:02:00Z') + const fileIdWithDifferentHashNotFound0 = objectIdFromTime( + '2017-02-01T00:03:00Z' + ) + const fileIdWithDifferentHashNotFound1 = objectIdFromTime( + '2017-02-01T00:04:00Z' + ) + const fileIdBlobExistsInGCSCorrupted = objectIdFromTime( + '2017-02-01T00:05:00Z' + ) + const fileIdMissing0 = objectIdFromTime('2024-02-01T00:06:00Z') + const fileIdMissing1 = objectIdFromTime('2017-02-01T00:07:00Z') + const fileIdWithDifferentHashRestore = objectIdFromTime( + '2017-02-01T00:08:00Z' + ) + const fileIdBlobExistsInGCS1 = objectIdFromTime('2017-02-01T00:09:00Z') + const fileIdRestoreFromFilestore0 = objectIdFromTime('2017-02-01T00:10:00Z') + const fileIdRestoreFromFilestore1 = objectIdFromTime('2017-02-01T00:11:00Z') + const fileIdMissing2 = objectIdFromTime('2017-02-01T00:12:00Z') + const contentCorruptedBlob = 'string that produces another hash' + const contentDoesNotExistAsBlob = 'does not exist as blob' + const hashDoesNotExistAsBlob = gitBlobHashBuffer( + Buffer.from(contentDoesNotExistAsBlob) + ) + const deleteProjectsRecordId0 = new ObjectId() + const writtenBlobs = [ + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdBlobExistsInGCS0, + }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdBlobExistsInGCS1, + }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdWithDifferentHashNotFound0, + }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdRestoreFromFilestore0, + }, + { + projectId: projectId0, + historyId: historyId0, + fileId: fileIdRestoreFromFilestore1, + }, + { + projectId: projectIdDeleted0, + historyId: historyIdDeleted0, + fileId: fileIdWithDifferentHashNotFound1, + }, + ] + const logs = [ + { + projectId: projectId0, + fileId: fileIdWithDifferentHashFound, + err: { message: 'OError: hash mismatch' }, + hash: gitBlobHash(fileIdMissing0), // does not matter + entry: { + ctx: { historyId: historyId0.toString() }, + hash: gitBlobHash(fileIdInGoodState), + }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdWithDifferentHashRestore, + err: { message: 'OError: hash mismatch' }, + hash: hashDoesNotExistAsBlob, + entry: { + ctx: { historyId: historyId0.toString() }, + hash: gitBlobHash(fileIdMissing0), // does not matter + }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdWithDifferentHashNotFound0, + err: { message: 'OError: hash mismatch' }, + hash: gitBlobHash(fileIdWithDifferentHashNotFound0), + entry: { + ctx: { historyId: historyId0.toString() }, + hash: hashDoesNotExistAsBlob, + }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdRestoreFromFilestore0, + err: { message: 'OError: hash mismatch' }, + hash: gitBlobHash(fileIdRestoreFromFilestore0), + entry: { + ctx: { historyId: historyId0.toString() }, + hash: hashDoesNotExistAsBlob, + }, + msg: 'failed to process file', + }, + { + projectId: projectIdDeleted0, + fileId: fileIdWithDifferentHashNotFound1, + err: { message: 'OError: hash mismatch' }, + hash: gitBlobHash(fileIdWithDifferentHashNotFound1), + entry: { + ctx: { historyId: historyIdDeleted0.toString() }, + hash: hashDoesNotExistAsBlob, + }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdMissing0, + bucketName: USER_FILES_BUCKET_NAME, + err: { message: 'NotFoundError' }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdMissing2, + bucketName: USER_FILES_BUCKET_NAME, + err: { message: 'NotFoundError' }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdBlobExistsInGCS0, + hash: gitBlobHash(fileIdBlobExistsInGCS0), + err: { message: 'storage.objects.delete' }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdBlobExistsInGCSCorrupted, + hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted), + err: { message: 'storage.objects.delete' }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdBlobExistsInGCS1, + hash: gitBlobHash(fileIdBlobExistsInGCS1), + err: { message: 'storage.objects.delete' }, + msg: 'failed to process file', + }, + { + projectId: projectId0, + fileId: fileIdRestoreFromFilestore1, + err: { message: 'storage.objects.delete' }, + msg: 'failed to process file', + }, + { + projectId: projectIdDeleted0, + fileId: fileIdMissing1, + bucketName: USER_FILES_BUCKET_NAME, + err: { message: 'NotFoundError' }, + msg: 'failed to process file', + }, + { + err: { message: 'spurious error' }, + msg: 'failed to process file, trying again', + }, + { + err: { message: 'some other error' }, + msg: 'failed to process file', + }, + ] + if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) { + const fileIds = { + fileIdWithDifferentHashFound, + fileIdInGoodState, + fileIdBlobExistsInGCS0, + fileIdBlobExistsInGCS1, + fileIdWithDifferentHashNotFound0, + fileIdWithDifferentHashNotFound1, + fileIdBlobExistsInGCSCorrupted, + fileIdMissing0, + fileIdMissing1, + fileIdMissing2, + fileIdWithDifferentHashRestore, + fileIdRestoreFromFilestore0, + fileIdRestoreFromFilestore1, + } + console.log({ + projectId0, + projectIdDeleted0, + historyId0, + historyIdDeleted0, + ...fileIds, + hashDoesNotExistAsBlob, + }) + for (const [name, v] of Object.entries(fileIds)) { + console.log( + name, + gitBlobHash(v), + Array.from(binaryForGitBlobHash(gitBlobHash(v)).value()) + ) + } + } + + before(cleanup.everything) + before('cleanup s3 buckets', async function () { + await backupPersistor.deleteDirectory(deksBucket, '') + await backupPersistor.deleteDirectory(projectBlobsBucket, '') + expect(await listS3Bucket(deksBucket)).to.have.length(0) + expect(await listS3Bucket(projectBlobsBucket)).to.have.length(0) + }) + + before('populate blobs/GCS', async function () { + await FILESTORE_PERSISTOR.sendStream( + USER_FILES_BUCKET_NAME, + `${projectId0}/${fileIdRestoreFromFilestore0}`, + Stream.Readable.from([fileIdRestoreFromFilestore0.toString()]) + ) + await FILESTORE_PERSISTOR.sendStream( + USER_FILES_BUCKET_NAME, + `${projectId0}/${fileIdRestoreFromFilestore1}`, + Stream.Readable.from([fileIdRestoreFromFilestore1.toString()]) + ) + await new BlobStore(historyId0.toString()).putString( + fileIdBlobExistsInGCS0.toString() + ) + await new BlobStore(historyId0.toString()).putString( + fileIdBlobExistsInGCS1.toString() + ) + await new BlobStore(historyId0.toString()).putString( + fileIdRestoreFromFilestore1.toString() + ) + const path = '/tmp/test-blob-corrupted' + try { + await fs.promises.writeFile(path, contentCorruptedBlob) + await new BlobStore(historyId0.toString()).putBlob( + path, + new Blob(gitBlobHash(fileIdBlobExistsInGCSCorrupted), 42) + ) + } finally { + await fs.promises.rm(path, { force: true }) + } + await cleanup.postgres() + await cleanup.mongo() + await Promise.all([ + testProjects.createEmptyProject(historyId0.toString()), + testProjects.createEmptyProject(historyIdDeleted0), + ]) + await new BlobStore(historyId0.toString()).putString( + fileIdWithDifferentHashNotFound0.toString() + ) + await new BlobStore(historyIdDeleted0.toString()).putString( + fileIdWithDifferentHashNotFound1.toString() + ) + await new BlobStore(historyId0.toString()).putString( + fileIdInGoodState.toString() + ) + }) + + before('populate mongo', async function () { + await projectsCollection.insertMany([ + { + _id: projectId0, + rootFolder: [ + { + fileRefs: [ + { _id: fileIdMissing0 }, + { _id: fileIdMissing0 }, // bad file-tree, duplicated fileRef. + { _id: fileIdMissing2 }, + { + _id: fileIdWithDifferentHashFound, + hash: gitBlobHash(fileIdInGoodState), + }, + { + _id: fileIdWithDifferentHashRestore, + hash: gitBlobHash(fileIdMissing0), + }, + ], + folders: [ + { + docs: [], + }, + null, + { + fileRefs: [ + null, + { + _id: fileIdInGoodState, + hash: gitBlobHash(fileIdInGoodState), + }, + { + _id: fileIdWithDifferentHashNotFound0, + hash: hashDoesNotExistAsBlob, + }, + { + _id: fileIdRestoreFromFilestore0, + hash: hashDoesNotExistAsBlob, + }, + { + _id: fileIdRestoreFromFilestore1, + }, + { + _id: fileIdBlobExistsInGCS0, + hash: gitBlobHash(fileIdBlobExistsInGCS0), + }, + { + _id: fileIdBlobExistsInGCSCorrupted, + hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted), + }, + { _id: fileIdBlobExistsInGCS1 }, + ], + folders: [], + }, + ], + }, + ], + overleaf: { history: { id: historyId0 } }, + version: 0, + }, + ]) + await deletedProjectsCollection.insertMany([ + { + _id: deleteProjectsRecordId0, + project: { + _id: projectIdDeleted0, + rootFolder: [ + { + fileRefs: [ + { + _id: fileIdWithDifferentHashNotFound1, + hash: hashDoesNotExistAsBlob, + }, + ], + folders: [ + { + fileRefs: [], + folders: [ + { fileRefs: [{ _id: fileIdMissing1 }], folders: [] }, + ], + }, + ], + }, + ], + overleaf: { history: { id: historyIdDeleted0 } }, + version: 100, + }, + deleterData: { + deletedProjectId: projectIdDeleted0, + }, + }, + ]) + }) + + /** + * @param {Array} args + * @param {Record} env + * @return {Promise<{ stdout: string, stderr: string, status: number }>} + */ + async function tryRunScript(args = [], env = {}) { + let result + try { + result = await promisify(execFile)( + process.argv0, + ['storage/scripts/back_fill_file_hash_fix_up.mjs', ...args], + { + encoding: 'utf-8', + timeout: TIMEOUT - 500, + env: { + ...process.env, + USER_FILES_BUCKET_NAME, + ...env, + LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests + }, + } + ) + result.status = 0 + } catch (err) { + const { stdout, stderr, code } = err + if (typeof code !== 'number') { + console.log(err) + } + result = { stdout, stderr, status: code } + } + expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match( + /back_fill_file_hash/ + ) + return result + } + async function runScriptWithLogs() { + const logsPath = '/tmp/test-script-logs' + let result + try { + await fs.promises.writeFile( + logsPath, + logs.map(e => JSON.stringify(e)).join('\n') + ) + result = await tryRunScript([`--logs=${logsPath}`]) + } finally { + await fs.promises.rm(logsPath, { force: true }) + } + const stats = JSON.parse(result.stdout.trim().split('\n').pop()) + return { + result, + stats, + } + } + + let result, stats + before(async function () { + ;({ result, stats } = await runScriptWithLogs()) + }) + it('should print stats', function () { + expect(stats).to.contain({ + processedLines: 14, + success: 9, + alreadyProcessed: 0, + fileDeleted: 0, + skipped: 0, + failed: 3, + unmatched: 1, + }) + }) + it('should handle re-run on same logs', async function () { + ;({ stats } = await runScriptWithLogs()) + expect(stats).to.contain({ + processedLines: 14, + success: 0, + alreadyProcessed: 6, + fileDeleted: 3, + skipped: 0, + failed: 3, + unmatched: 1, + }) + }) + it('should flag the unknown fatal error', function () { + const unknown = result.stdout + .split('\n') + .filter(l => l.includes('unknown fatal error')) + expect(unknown).to.have.length(1) + const [line] = unknown + expect(line).to.exist + expect(line).to.include('some other error') + }) + it('should flag the unexpected blob on mismatched hash', function () { + const line = result.stdout + .split('\n') + .find(l => l.includes('found blob with computed filestore object hash')) + expect(line).to.exist + expect(line).to.include(projectId0.toString()) + expect(line).to.include(fileIdWithDifferentHashFound.toString()) + expect(line).to.include(gitBlobHash(fileIdInGoodState)) + }) + it('should flag the need to restore', function () { + const line = result.stdout + .split('\n') + .find(l => l.includes('missing blob, need to restore filestore file')) + expect(line).to.exist + expect(line).to.include(projectId0.toString()) + expect(line).to.include(fileIdWithDifferentHashRestore.toString()) + expect(line).to.include(hashDoesNotExistAsBlob) + }) + it('should flag the corrupted blob', function () { + const line = result.stdout + .split('\n') + .find(l => l.includes('blob corrupted')) + expect(line).to.exist + expect(line).to.include(projectId0.toString()) + expect(line).to.include(fileIdBlobExistsInGCSCorrupted.toString()) + expect(line).to.include( + gitBlobHashBuffer(Buffer.from(contentCorruptedBlob)) + ) + expect(line).to.include(gitBlobHash(fileIdBlobExistsInGCSCorrupted)) + }) + it('should update mongo', async function () { + expect(await projectsCollection.find({}).toArray()).to.deep.equal([ + { + _id: projectId0, + rootFolder: [ + { + fileRefs: [ + // Removed + // { _id: fileIdMissing0 }, + // Removed + // { _id: fileIdMissing2 }, + // No change, should warn about the find. + { + _id: fileIdWithDifferentHashFound, + hash: gitBlobHash(fileIdInGoodState), + }, + // No change, should warn about the need to restore. + { + _id: fileIdWithDifferentHashRestore, + hash: gitBlobHash(fileIdMissing0), + }, + ], + folders: [ + { + docs: [], + }, + null, + { + fileRefs: [ + null, + // No change + { + _id: fileIdInGoodState, + hash: gitBlobHash(fileIdInGoodState), + }, + // Updated hash + { + _id: fileIdWithDifferentHashNotFound0, + hash: gitBlobHash(fileIdWithDifferentHashNotFound0), + }, + // Updated hash + { + _id: fileIdRestoreFromFilestore0, + hash: gitBlobHash(fileIdRestoreFromFilestore0), + }, + // Added hash + { + _id: fileIdRestoreFromFilestore1, + hash: gitBlobHash(fileIdRestoreFromFilestore1), + }, + // No change, blob created + { + _id: fileIdBlobExistsInGCS0, + hash: gitBlobHash(fileIdBlobExistsInGCS0), + }, + // No change, flagged + { + _id: fileIdBlobExistsInGCSCorrupted, + hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted), + }, + // Added hash + { + _id: fileIdBlobExistsInGCS1, + hash: gitBlobHash(fileIdBlobExistsInGCS1), + }, + ], + folders: [], + }, + ], + }, + ], + overleaf: { history: { id: historyId0 } }, + // Incremented when removing file/updating hash + version: 6, + }, + ]) + expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([ + { + _id: deleteProjectsRecordId0, + project: { + _id: projectIdDeleted0, + rootFolder: [ + { + fileRefs: [ + // Updated hash + { + _id: fileIdWithDifferentHashNotFound1, + hash: gitBlobHash(fileIdWithDifferentHashNotFound1), + }, + ], + folders: [ + { + fileRefs: [], + folders: [ + { + fileRefs: [ + // Removed + // { _id: fileIdMissing1 }, + ], + folders: [], + }, + ], + }, + ], + }, + ], + overleaf: { history: { id: historyIdDeleted0 } }, + // Incremented when removing file/updating hash + version: 102, + }, + deleterData: { + deletedProjectId: projectIdDeleted0, + }, + }, + ]) + const writtenBlobsByProject = new Map() + for (const { projectId, fileId } of writtenBlobs) { + writtenBlobsByProject.set( + projectId, + (writtenBlobsByProject.get(projectId) || []).concat([fileId]) + ) + } + expect( + (await backedUpBlobs.find({}, { sort: { _id: 1 } }).toArray()).map( + entry => { + // blobs are pushed unordered into mongo. Sort the list for consistency. + entry.blobs.sort() + return entry + } + ) + ).to.deep.equal( + Array.from(writtenBlobsByProject.entries()).map( + ([projectId, fileIds]) => { + return { + _id: projectId, + blobs: fileIds + .map(fileId => binaryForGitBlobHash(gitBlobHash(fileId))) + .sort(), + } + } + ) + ) + }) + it('should have backed up all the files', async function () { + expect(tieringStorageClass).to.exist + const objects = await listS3Bucket(projectBlobsBucket, tieringStorageClass) + expect(objects.sort()).to.deep.equal( + writtenBlobs + .map(({ historyId, fileId, hash }) => + makeProjectKey(historyId, hash || gitBlobHash(fileId)) + ) + .sort() + ) + for (let { historyId, fileId } of writtenBlobs) { + const hash = gitBlobHash(fileId.toString()) + const s = await backupPersistor.getObjectStream( + projectBlobsBucket, + makeProjectKey(historyId, hash), + { autoGunzip: true } + ) + const buf = new WritableBuffer() + await Stream.promises.pipeline(s, buf) + expect(gitBlobHashBuffer(buf.getContents())).to.equal(hash) + const id = buf.getContents().toString('utf-8') + expect(id).to.equal(fileId.toString()) + // double check we are not comparing 'undefined' or '[object Object]' above + expect(id).to.match(/^[a-f0-9]{24}$/) + } + const deks = await listS3Bucket(deksBucket, 'STANDARD') + expect(deks.sort()).to.deep.equal( + Array.from( + new Set( + writtenBlobs.map( + ({ historyId }) => projectKey.format(historyId) + '/dek' + ) + ) + ).sort() + ) + }) + it('should have written the back filled files to history v1', async function () { + for (const { historyId, fileId } of writtenBlobs) { + const blobStore = new BlobStore(historyId.toString()) + const hash = gitBlobHash(fileId.toString()) + const blob = await blobStore.getBlob(hash) + expect(blob).to.exist + expect(blob.getByteLength()).to.equal(24) + const id = await blobStore.getString(hash) + expect(id).to.equal(fileId.toString()) + // double check we are not comparing 'undefined' or '[object Object]' above + expect(id).to.match(/^[a-f0-9]{24}$/) + } + }) +})