[history-v1] fix remaining files with missing hashes (#22985)

GitOrigin-RevId: df382732954538f9db177180aee0a44c5d5679ca
This commit is contained in:
Jakob Ackermann
2025-01-21 16:42:56 +00:00
committed by Copybot
parent 04c1497673
commit 6884338a34
2 changed files with 154 additions and 27 deletions
@@ -5,6 +5,7 @@ import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
BlobStore,
getStringLengthOfFile,
@@ -25,7 +26,6 @@ Events.setMaxListeners(20)
ObjectId.cacheHexString = true
/**
* @typedef {import("overleaf-editor-core").Blob} Blob
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
@@ -51,13 +51,14 @@ ObjectId.cacheHexString = true
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, LOGS: string}}
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
@@ -74,12 +75,18 @@ function parseArgs() {
FIX_HASH_MISMATCH: boolVal('fixNotFound'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_NOT_FOUND: boolVal('fixHashMismatch'),
FIX_MISSING_HASH: boolVal('fixMissingHash'),
LOGS: args.logs,
}
}
const { FIX_HASH_MISMATCH, FIX_DELETE_PERMISSION, FIX_NOT_FOUND, LOGS } =
parseArgs()
const {
FIX_HASH_MISMATCH,
FIX_DELETE_PERMISSION,
FIX_NOT_FOUND,
FIX_MISSING_HASH,
LOGS,
} = parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
@@ -326,37 +333,75 @@ async function importRestoredFilestoreFile(projectId, fileId, historyId) {
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
* @param {string} path
* @return {Promise<Blob>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
await Stream.promises.pipeline(
await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
),
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
blob.setStringLength(
await getStringLengthOfFile(blob.getByteLength(), path)
)
return blob
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<void>}
*/
async function uploadFilestoreFile(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
const hash = blob.getHash()
try {
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
} catch (err) {
if (!(err instanceof Blob.NotFoundError)) throw err
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
await blobStore.putBlob(path, blob)
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
@@ -468,6 +513,23 @@ async function fixDeletePermission(line) {
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixMissingHash(line) {
let { projectId, _id: fileId } = JSON.parse(line)
const {
fileRef: { hash },
} = await findFile(projectId, fileId)
if (hash) {
// processed, double check
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
await uploadFilestoreFile(projectId, fileId)
return true
}
const CASES = {
'not found': {
match: 'NotFoundError',
@@ -484,6 +546,11 @@ const CASES = {
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
'missing file hash': {
match: '"bad file hash"',
flag: FIX_MISSING_HASH,
action: fixMissingHash,
},
}
const STATS = {
@@ -513,7 +580,15 @@ async function processLog() {
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (!line.includes('"failed to process file"')) continue
if (
!(
line.includes('"failed to process file"') ||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
line.includes('"bad file-tree path"')
)
) {
continue
}
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
@@ -113,6 +113,8 @@ describe('back_fill_file_hash_fix_up script', function () {
const fileIdRestoreFromFilestore0 = objectIdFromTime('2017-02-01T00:10:00Z')
const fileIdRestoreFromFilestore1 = objectIdFromTime('2017-02-01T00:11:00Z')
const fileIdMissing2 = objectIdFromTime('2017-02-01T00:12:00Z')
const fileIdHashMissing0 = objectIdFromTime('2017-02-01T00:13:00Z')
const fileIdHashMissing1 = objectIdFromTime('2017-02-01T00:14:00Z')
const contentCorruptedBlob = 'string that produces another hash'
const contentDoesNotExistAsBlob = 'does not exist as blob'
const hashDoesNotExistAsBlob = gitBlobHashBuffer(
@@ -145,6 +147,16 @@ describe('back_fill_file_hash_fix_up script', function () {
historyId: historyId0,
fileId: fileIdRestoreFromFilestore1,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdHashMissing0,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdHashMissing1,
},
{
projectId: projectIdDeleted0,
historyId: historyIdDeleted0,
@@ -263,6 +275,19 @@ describe('back_fill_file_hash_fix_up script', function () {
err: { message: 'some other error' },
msg: 'failed to process file',
},
// from find_malformed_filetrees.mjs
{
projectId: projectId0,
_id: fileIdHashMissing0,
reason: 'bad file hash',
msg: 'bad file-tree path',
},
{
projectId: projectId0,
_id: fileIdHashMissing1,
reason: 'bad file hash',
msg: 'bad file-tree path',
},
]
if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) {
const fileIds = {
@@ -279,6 +304,8 @@ describe('back_fill_file_hash_fix_up script', function () {
fileIdWithDifferentHashRestore,
fileIdRestoreFromFilestore0,
fileIdRestoreFromFilestore1,
fileIdHashMissing0,
fileIdHashMissing1,
}
console.log({
projectId0,
@@ -316,6 +343,19 @@ describe('back_fill_file_hash_fix_up script', function () {
`${projectId0}/${fileIdRestoreFromFilestore1}`,
Stream.Readable.from([fileIdRestoreFromFilestore1.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileIdHashMissing0}`,
Stream.Readable.from([fileIdHashMissing0.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileIdHashMissing1}`,
Stream.Readable.from([fileIdHashMissing1.toString()])
)
await new BlobStore(historyId0.toString()).putString(
fileIdHashMissing1.toString() // partially processed
)
await new BlobStore(historyId0.toString()).putString(
fileIdBlobExistsInGCS0.toString()
)
@@ -362,6 +402,8 @@ describe('back_fill_file_hash_fix_up script', function () {
{ _id: fileIdMissing0 },
{ _id: fileIdMissing0 }, // bad file-tree, duplicated fileRef.
{ _id: fileIdMissing2 },
{ _id: fileIdHashMissing0 },
{ _id: fileIdHashMissing1 },
{
_id: fileIdWithDifferentHashFound,
hash: gitBlobHash(fileIdInGoodState),
@@ -506,8 +548,8 @@ describe('back_fill_file_hash_fix_up script', function () {
})
it('should print stats', function () {
expect(stats).to.contain({
processedLines: 14,
success: 9,
processedLines: 16,
success: 11,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
@@ -518,9 +560,9 @@ describe('back_fill_file_hash_fix_up script', function () {
it('should handle re-run on same logs', async function () {
;({ stats } = await runScriptWithLogs())
expect(stats).to.contain({
processedLines: 14,
processedLines: 16,
success: 0,
alreadyProcessed: 6,
alreadyProcessed: 8,
fileDeleted: 3,
skipped: 0,
failed: 3,
@@ -577,6 +619,16 @@ describe('back_fill_file_hash_fix_up script', function () {
// { _id: fileIdMissing0 },
// Removed
// { _id: fileIdMissing2 },
// Added hash
{
_id: fileIdHashMissing0,
hash: gitBlobHash(fileIdHashMissing0),
},
// Added hash
{
_id: fileIdHashMissing1,
hash: gitBlobHash(fileIdHashMissing1),
},
// No change, should warn about the find.
{
_id: fileIdWithDifferentHashFound,
@@ -639,7 +691,7 @@ describe('back_fill_file_hash_fix_up script', function () {
],
overleaf: { history: { id: historyId0 } },
// Incremented when removing file/updating hash
version: 6,
version: 8,
},
])
expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([