Merge pull request #22870 from overleaf/jpa-back-fill-fix-up

[history-v1] add script for fixing up back-fill errors

GitOrigin-RevId: 118992a32c1f6da4289cd35399ddd07a741da4ee
This commit is contained in:
Jakob Ackermann
2025-01-17 11:22:37 +00:00
committed by Copybot
parent 413c108c28
commit ab4d1e0986
2 changed files with 1330 additions and 0 deletions
@@ -0,0 +1,569 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import {
BlobStore,
getStringLengthOfFile,
GLOBAL_BLOBS,
makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
/**
* @typedef {import("overleaf-editor-core").Blob} Blob
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
*/
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {string} hash
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
* commandLineArgs cannot handle --foo=false, so go the long way
* @param {string} name
* @return {boolean}
*/
function boolVal(name) {
const v = args[name]
if (['true', 'false'].includes(v)) return v === 'true'
throw new Error(`expected "true" or "false" for boolean option ${name}`)
}
return {
FIX_HASH_MISMATCH: boolVal('fixNotFound'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_NOT_FOUND: boolVal('fixHashMismatch'),
LOGS: args.logs,
}
}
const { FIX_HASH_MISMATCH, FIX_DELETE_PERMISSION, FIX_NOT_FOUND, LOGS } =
parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
if (!USER_FILES_BUCKET_NAME) {
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
}
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
/** @type {DeletedProjectsCollection} */
const deletedProjectsCollection = db.collection('deletedProjects')
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
class FileDeletedError extends OError {}
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
const PROJECT_CACHE = new Map()
/**
* @param {string} projectId
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
*/
async function getProject(projectId) {
const cached = PROJECT_CACHE.get(projectId)
if (cached) return cached
let projectSoftDeleted
let project = await projectsCollection.findOne({
_id: new ObjectId(projectId),
})
if (project) {
projectSoftDeleted = false
} else {
const softDeleted = await deletedProjectsCollection.findOne({
'deleterData.deletedProjectId': new ObjectId(projectId),
project: { $exists: true },
})
if (!softDeleted) {
throw new OError('project hard-deleted')
}
project = softDeleted.project
projectSoftDeleted = true
}
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
return { projectSoftDeleted, project }
}
/**
* @param {Folder} folder
* @param {string} fileId
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
*/
function getFileTreePath(folder, fileId) {
if (!folder) return null
let idx = 0
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef?._id.toString() === fileId) {
return {
fileRef,
path: `.fileRefs.${idx}`,
folder,
}
}
idx++
}
}
idx = 0
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
const match = getFileTreePath(child, fileId)
if (match) {
return {
fileRef: match.fileRef,
folder: match.folder,
path: `.folders.${idx}${match.path}`,
}
}
idx++
}
}
return null
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
*/
async function findFile(projectId, fileId) {
const { projectSoftDeleted, project } = await getProject(projectId)
const match = getFileTreePath(project.rootFolder[0], fileId)
if (!match) {
throw new FileDeletedError('file not found in file-tree', {
projectSoftDeleted,
})
}
const { path, fileRef, folder } = match
let fullPath
let query
if (projectSoftDeleted) {
fullPath = `project.rootFolder.0${path}`
query = {
'deleterData.deletedProjectId': new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
} else {
fullPath = `rootFolder.0${path}`
query = {
_id: new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
}
return {
projectSoftDeleted,
query,
fullPath,
fileRef,
folder,
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixNotFound(line) {
const { projectId, fileId, bucketName } = JSON.parse(line)
if (bucketName !== USER_FILES_BUCKET_NAME) {
throw new OError('not found case for another bucket')
}
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
await findFile(projectId, fileId)
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
// Update the cache. The mongo-path of the next file will be off otherwise.
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
return true
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<void>}
*/
async function setHashInMongo(projectId, fileId, hash) {
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
projectId,
fileId
)
if (fileRef.hash === hash) return
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
fileRef.hash = hash // Update cache for completeness.
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} historyId
* @return {Promise<void>}
*/
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blobStore = new BlobStore(historyId)
const blob = await blobStore.putFile(path)
await backupBlob(historyId, blob, path)
await setHashInMongo(projectId, fileId, blob.getHash())
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixHashMismatch(line) {
const {
projectId,
fileId,
hash: computedHash,
entry: {
hash: fileTreeHash,
ctx: { historyId },
},
} = JSON.parse(line)
const blobStore = new BlobStore(historyId)
if (await blobStore.getBlob(fileTreeHash)) {
throw new OError('found blob with computed filestore object hash')
}
if (!(await blobStore.getBlob(computedHash))) {
await importRestoredFilestoreFile(projectId, fileId, historyId)
return true
}
return await ensureBlobExistsForFileAndUploadToAWS(
projectId,
fileId,
computedHash
)
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
const { fileRef } = await findFile(projectId, fileId)
return fileRef.hash === hash
}
/**
* @param {string} projectId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function needsBackingUpToAWS(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) return false
return !(await _blobIsBackedUp(projectId, hash))
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
if (
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
(await blobStore.getBlob(hash)) &&
!(await needsBackingUpToAWS(projectId, hash))
) {
return false // already processed
}
const stream = await blobStore.getStream(hash)
const path = `${BUFFER_DIR}/${historyId}_${hash}`
try {
await Stream.promises.pipeline(
stream,
fs.createWriteStream(path, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
)
const writtenBlob = await makeBlobForFile(path)
writtenBlob.setStringLength(
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
)
if (writtenBlob.getHash() !== hash) {
// Double check download, better safe than sorry.
throw new OError('blob corrupted', { writtenBlob })
}
let blob = await blobStore.getBlob(hash)
if (!blob) {
// Calling blobStore.putBlob would result in the same error again.
// HACK: Skip upload to GCS and finalize putBlob operation directly.
await blobStore.backend.insertBlob(historyId, writtenBlob)
}
await backupBlob(historyId, writtenBlob, path)
} finally {
await fs.promises.rm(path, { force: true })
}
await setHashInMongo(projectId, fileId, hash)
return true
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixDeletePermission(line) {
let { projectId, fileId, hash } = JSON.parse(line)
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
const CASES = {
'not found': {
match: 'NotFoundError',
flag: FIX_NOT_FOUND,
action: fixNotFound,
},
'hash mismatch': {
match: 'OError: hash mismatch',
flag: FIX_HASH_MISMATCH,
action: fixHashMismatch,
},
'delete permission': {
match: 'storage.objects.delete',
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
}
const STATS = {
processedLines: 0,
success: 0,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 0,
unmatched: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
setInterval(logStats, 10_000)
async function processLog() {
const rl = readline.createInterface({
input: fs.createReadStream(LOGS),
})
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (!line.includes('"failed to process file"')) continue
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
if (flag) {
try {
if (await action(line)) {
STATS.success++
} else {
STATS.alreadyProcessed++
}
} catch (err) {
if (err instanceof FileDeletedError) {
STATS.fileDeleted++
logger.info({ err, line }, 'file deleted, skipping')
} else {
STATS.failed++
logger.error({ err, line }, `failed to fix ${name}`)
}
}
} else {
STATS.skipped++
}
continue nextLine
}
STATS.unmatched++
logger.warn({ line }, 'unknown fatal error')
}
}
async function main() {
try {
await processLog()
} finally {
logStats()
try {
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
} catch (err) {
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
const { skipped, failed, unmatched } = STATS
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {
process.exit(100)
} else if (skipped > 0) {
process.exit(101)
} else {
process.exit(0)
}
}
await main()
@@ -0,0 +1,761 @@
import fs from 'node:fs'
import Crypto from 'node:crypto'
import Stream from 'node:stream'
import { promisify } from 'node:util'
import { Binary, ObjectId } from 'mongodb'
import { Blob } from 'overleaf-editor-core'
import { backedUpBlobs, blobs, db } from '../../../../storage/lib/mongodb.js'
import cleanup from './support/cleanup.js'
import testProjects from '../api/support/test_projects.js'
import { execFile } from 'node:child_process'
import { expect } from 'chai'
import config from 'config'
import { WritableBuffer } from '@overleaf/stream-utils'
import {
backupPersistor,
projectBlobsBucket,
} from '../../../../storage/lib/backupPersistor.mjs'
import projectKey from '../../../../storage/lib/project_key.js'
import {
BlobStore,
makeProjectKey,
} from '../../../../storage/lib/blob_store/index.js'
import ObjectPersistor from '@overleaf/object-persistor'
const TIMEOUT = 20 * 1_000
const { deksBucket } = config.get('backupStore')
const { tieringStorageClass } = config.get('backupPersistor')
const projectsCollection = db.collection('projects')
const deletedProjectsCollection = db.collection('deletedProjects')
const FILESTORE_PERSISTOR = ObjectPersistor({
backend: 'gcs',
gcs: {
endpoint: {
apiEndpoint: process.env.GCS_API_ENDPOINT,
projectId: process.env.GCS_PROJECT_ID,
},
},
})
/**
* @param {ObjectId} objectId
* @return {string}
*/
function gitBlobHash(objectId) {
return gitBlobHashBuffer(Buffer.from(objectId.toString()))
}
/**
* @param {Buffer} buf
* @return {string}
*/
function gitBlobHashBuffer(buf) {
const sha = Crypto.createHash('sha1')
sha.update(`blob ${buf.byteLength}\x00`)
sha.update(buf)
return sha.digest('hex')
}
/**
* @param {string} gitBlobHash
* @return {Binary}
*/
function binaryForGitBlobHash(gitBlobHash) {
return new Binary(Buffer.from(gitBlobHash, 'hex'))
}
async function listS3Bucket(bucket, wantStorageClass) {
const client = backupPersistor._getClientForBucket(bucket)
const response = await client.listObjectsV2({ Bucket: bucket }).promise()
for (const object of response.Contents || []) {
expect(object).to.have.property('StorageClass', wantStorageClass)
}
return (response.Contents || []).map(item => item.Key || '')
}
function objectIdFromTime(timestamp) {
return ObjectId.createFromTime(new Date(timestamp).getTime() / 1000)
}
const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false
describe('back_fill_file_hash_fix_up script', function () {
this.timeout(TIMEOUT)
const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs'
const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z')
const projectIdDeleted0 = objectIdFromTime('2017-01-01T00:04:00Z')
const historyId0 = 42 // stored as number is mongo
const historyIdDeleted0 = projectIdDeleted0.toString()
const fileIdWithDifferentHashFound = objectIdFromTime('2017-02-01T00:00:00Z')
const fileIdInGoodState = objectIdFromTime('2017-02-01T00:01:00Z')
const fileIdBlobExistsInGCS0 = objectIdFromTime('2017-02-01T00:02:00Z')
const fileIdWithDifferentHashNotFound0 = objectIdFromTime(
'2017-02-01T00:03:00Z'
)
const fileIdWithDifferentHashNotFound1 = objectIdFromTime(
'2017-02-01T00:04:00Z'
)
const fileIdBlobExistsInGCSCorrupted = objectIdFromTime(
'2017-02-01T00:05:00Z'
)
const fileIdMissing0 = objectIdFromTime('2024-02-01T00:06:00Z')
const fileIdMissing1 = objectIdFromTime('2017-02-01T00:07:00Z')
const fileIdWithDifferentHashRestore = objectIdFromTime(
'2017-02-01T00:08:00Z'
)
const fileIdBlobExistsInGCS1 = objectIdFromTime('2017-02-01T00:09:00Z')
const fileIdRestoreFromFilestore0 = objectIdFromTime('2017-02-01T00:10:00Z')
const fileIdRestoreFromFilestore1 = objectIdFromTime('2017-02-01T00:11:00Z')
const fileIdMissing2 = objectIdFromTime('2017-02-01T00:12:00Z')
const contentCorruptedBlob = 'string that produces another hash'
const contentDoesNotExistAsBlob = 'does not exist as blob'
const hashDoesNotExistAsBlob = gitBlobHashBuffer(
Buffer.from(contentDoesNotExistAsBlob)
)
const deleteProjectsRecordId0 = new ObjectId()
const writtenBlobs = [
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdBlobExistsInGCS0,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdBlobExistsInGCS1,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdWithDifferentHashNotFound0,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdRestoreFromFilestore0,
},
{
projectId: projectId0,
historyId: historyId0,
fileId: fileIdRestoreFromFilestore1,
},
{
projectId: projectIdDeleted0,
historyId: historyIdDeleted0,
fileId: fileIdWithDifferentHashNotFound1,
},
]
const logs = [
{
projectId: projectId0,
fileId: fileIdWithDifferentHashFound,
err: { message: 'OError: hash mismatch' },
hash: gitBlobHash(fileIdMissing0), // does not matter
entry: {
ctx: { historyId: historyId0.toString() },
hash: gitBlobHash(fileIdInGoodState),
},
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdWithDifferentHashRestore,
err: { message: 'OError: hash mismatch' },
hash: hashDoesNotExistAsBlob,
entry: {
ctx: { historyId: historyId0.toString() },
hash: gitBlobHash(fileIdMissing0), // does not matter
},
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdWithDifferentHashNotFound0,
err: { message: 'OError: hash mismatch' },
hash: gitBlobHash(fileIdWithDifferentHashNotFound0),
entry: {
ctx: { historyId: historyId0.toString() },
hash: hashDoesNotExistAsBlob,
},
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdRestoreFromFilestore0,
err: { message: 'OError: hash mismatch' },
hash: gitBlobHash(fileIdRestoreFromFilestore0),
entry: {
ctx: { historyId: historyId0.toString() },
hash: hashDoesNotExistAsBlob,
},
msg: 'failed to process file',
},
{
projectId: projectIdDeleted0,
fileId: fileIdWithDifferentHashNotFound1,
err: { message: 'OError: hash mismatch' },
hash: gitBlobHash(fileIdWithDifferentHashNotFound1),
entry: {
ctx: { historyId: historyIdDeleted0.toString() },
hash: hashDoesNotExistAsBlob,
},
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdMissing0,
bucketName: USER_FILES_BUCKET_NAME,
err: { message: 'NotFoundError' },
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdMissing2,
bucketName: USER_FILES_BUCKET_NAME,
err: { message: 'NotFoundError' },
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdBlobExistsInGCS0,
hash: gitBlobHash(fileIdBlobExistsInGCS0),
err: { message: 'storage.objects.delete' },
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdBlobExistsInGCSCorrupted,
hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted),
err: { message: 'storage.objects.delete' },
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdBlobExistsInGCS1,
hash: gitBlobHash(fileIdBlobExistsInGCS1),
err: { message: 'storage.objects.delete' },
msg: 'failed to process file',
},
{
projectId: projectId0,
fileId: fileIdRestoreFromFilestore1,
err: { message: 'storage.objects.delete' },
msg: 'failed to process file',
},
{
projectId: projectIdDeleted0,
fileId: fileIdMissing1,
bucketName: USER_FILES_BUCKET_NAME,
err: { message: 'NotFoundError' },
msg: 'failed to process file',
},
{
err: { message: 'spurious error' },
msg: 'failed to process file, trying again',
},
{
err: { message: 'some other error' },
msg: 'failed to process file',
},
]
if (PRINT_IDS_AND_HASHES_FOR_DEBUGGING) {
const fileIds = {
fileIdWithDifferentHashFound,
fileIdInGoodState,
fileIdBlobExistsInGCS0,
fileIdBlobExistsInGCS1,
fileIdWithDifferentHashNotFound0,
fileIdWithDifferentHashNotFound1,
fileIdBlobExistsInGCSCorrupted,
fileIdMissing0,
fileIdMissing1,
fileIdMissing2,
fileIdWithDifferentHashRestore,
fileIdRestoreFromFilestore0,
fileIdRestoreFromFilestore1,
}
console.log({
projectId0,
projectIdDeleted0,
historyId0,
historyIdDeleted0,
...fileIds,
hashDoesNotExistAsBlob,
})
for (const [name, v] of Object.entries(fileIds)) {
console.log(
name,
gitBlobHash(v),
Array.from(binaryForGitBlobHash(gitBlobHash(v)).value())
)
}
}
before(cleanup.everything)
before('cleanup s3 buckets', async function () {
await backupPersistor.deleteDirectory(deksBucket, '')
await backupPersistor.deleteDirectory(projectBlobsBucket, '')
expect(await listS3Bucket(deksBucket)).to.have.length(0)
expect(await listS3Bucket(projectBlobsBucket)).to.have.length(0)
})
before('populate blobs/GCS', async function () {
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileIdRestoreFromFilestore0}`,
Stream.Readable.from([fileIdRestoreFromFilestore0.toString()])
)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileIdRestoreFromFilestore1}`,
Stream.Readable.from([fileIdRestoreFromFilestore1.toString()])
)
await new BlobStore(historyId0.toString()).putString(
fileIdBlobExistsInGCS0.toString()
)
await new BlobStore(historyId0.toString()).putString(
fileIdBlobExistsInGCS1.toString()
)
await new BlobStore(historyId0.toString()).putString(
fileIdRestoreFromFilestore1.toString()
)
const path = '/tmp/test-blob-corrupted'
try {
await fs.promises.writeFile(path, contentCorruptedBlob)
await new BlobStore(historyId0.toString()).putBlob(
path,
new Blob(gitBlobHash(fileIdBlobExistsInGCSCorrupted), 42)
)
} finally {
await fs.promises.rm(path, { force: true })
}
await cleanup.postgres()
await cleanup.mongo()
await Promise.all([
testProjects.createEmptyProject(historyId0.toString()),
testProjects.createEmptyProject(historyIdDeleted0),
])
await new BlobStore(historyId0.toString()).putString(
fileIdWithDifferentHashNotFound0.toString()
)
await new BlobStore(historyIdDeleted0.toString()).putString(
fileIdWithDifferentHashNotFound1.toString()
)
await new BlobStore(historyId0.toString()).putString(
fileIdInGoodState.toString()
)
})
before('populate mongo', async function () {
await projectsCollection.insertMany([
{
_id: projectId0,
rootFolder: [
{
fileRefs: [
{ _id: fileIdMissing0 },
{ _id: fileIdMissing0 }, // bad file-tree, duplicated fileRef.
{ _id: fileIdMissing2 },
{
_id: fileIdWithDifferentHashFound,
hash: gitBlobHash(fileIdInGoodState),
},
{
_id: fileIdWithDifferentHashRestore,
hash: gitBlobHash(fileIdMissing0),
},
],
folders: [
{
docs: [],
},
null,
{
fileRefs: [
null,
{
_id: fileIdInGoodState,
hash: gitBlobHash(fileIdInGoodState),
},
{
_id: fileIdWithDifferentHashNotFound0,
hash: hashDoesNotExistAsBlob,
},
{
_id: fileIdRestoreFromFilestore0,
hash: hashDoesNotExistAsBlob,
},
{
_id: fileIdRestoreFromFilestore1,
},
{
_id: fileIdBlobExistsInGCS0,
hash: gitBlobHash(fileIdBlobExistsInGCS0),
},
{
_id: fileIdBlobExistsInGCSCorrupted,
hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted),
},
{ _id: fileIdBlobExistsInGCS1 },
],
folders: [],
},
],
},
],
overleaf: { history: { id: historyId0 } },
version: 0,
},
])
await deletedProjectsCollection.insertMany([
{
_id: deleteProjectsRecordId0,
project: {
_id: projectIdDeleted0,
rootFolder: [
{
fileRefs: [
{
_id: fileIdWithDifferentHashNotFound1,
hash: hashDoesNotExistAsBlob,
},
],
folders: [
{
fileRefs: [],
folders: [
{ fileRefs: [{ _id: fileIdMissing1 }], folders: [] },
],
},
],
},
],
overleaf: { history: { id: historyIdDeleted0 } },
version: 100,
},
deleterData: {
deletedProjectId: projectIdDeleted0,
},
},
])
})
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
* @return {Promise<{ stdout: string, stderr: string, status: number }>}
*/
async function tryRunScript(args = [], env = {}) {
let result
try {
result = await promisify(execFile)(
process.argv0,
['storage/scripts/back_fill_file_hash_fix_up.mjs', ...args],
{
encoding: 'utf-8',
timeout: TIMEOUT - 500,
env: {
...process.env,
USER_FILES_BUCKET_NAME,
...env,
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
},
}
)
result.status = 0
} catch (err) {
const { stdout, stderr, code } = err
if (typeof code !== 'number') {
console.log(err)
}
result = { stdout, stderr, status: code }
}
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
/back_fill_file_hash/
)
return result
}
async function runScriptWithLogs() {
const logsPath = '/tmp/test-script-logs'
let result
try {
await fs.promises.writeFile(
logsPath,
logs.map(e => JSON.stringify(e)).join('\n')
)
result = await tryRunScript([`--logs=${logsPath}`])
} finally {
await fs.promises.rm(logsPath, { force: true })
}
const stats = JSON.parse(result.stdout.trim().split('\n').pop())
return {
result,
stats,
}
}
let result, stats
before(async function () {
;({ result, stats } = await runScriptWithLogs())
})
it('should print stats', function () {
expect(stats).to.contain({
processedLines: 14,
success: 9,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 3,
unmatched: 1,
})
})
it('should handle re-run on same logs', async function () {
;({ stats } = await runScriptWithLogs())
expect(stats).to.contain({
processedLines: 14,
success: 0,
alreadyProcessed: 6,
fileDeleted: 3,
skipped: 0,
failed: 3,
unmatched: 1,
})
})
it('should flag the unknown fatal error', function () {
const unknown = result.stdout
.split('\n')
.filter(l => l.includes('unknown fatal error'))
expect(unknown).to.have.length(1)
const [line] = unknown
expect(line).to.exist
expect(line).to.include('some other error')
})
it('should flag the unexpected blob on mismatched hash', function () {
const line = result.stdout
.split('\n')
.find(l => l.includes('found blob with computed filestore object hash'))
expect(line).to.exist
expect(line).to.include(projectId0.toString())
expect(line).to.include(fileIdWithDifferentHashFound.toString())
expect(line).to.include(gitBlobHash(fileIdInGoodState))
})
it('should flag the need to restore', function () {
const line = result.stdout
.split('\n')
.find(l => l.includes('missing blob, need to restore filestore file'))
expect(line).to.exist
expect(line).to.include(projectId0.toString())
expect(line).to.include(fileIdWithDifferentHashRestore.toString())
expect(line).to.include(hashDoesNotExistAsBlob)
})
it('should flag the corrupted blob', function () {
const line = result.stdout
.split('\n')
.find(l => l.includes('blob corrupted'))
expect(line).to.exist
expect(line).to.include(projectId0.toString())
expect(line).to.include(fileIdBlobExistsInGCSCorrupted.toString())
expect(line).to.include(
gitBlobHashBuffer(Buffer.from(contentCorruptedBlob))
)
expect(line).to.include(gitBlobHash(fileIdBlobExistsInGCSCorrupted))
})
it('should update mongo', async function () {
expect(await projectsCollection.find({}).toArray()).to.deep.equal([
{
_id: projectId0,
rootFolder: [
{
fileRefs: [
// Removed
// { _id: fileIdMissing0 },
// Removed
// { _id: fileIdMissing2 },
// No change, should warn about the find.
{
_id: fileIdWithDifferentHashFound,
hash: gitBlobHash(fileIdInGoodState),
},
// No change, should warn about the need to restore.
{
_id: fileIdWithDifferentHashRestore,
hash: gitBlobHash(fileIdMissing0),
},
],
folders: [
{
docs: [],
},
null,
{
fileRefs: [
null,
// No change
{
_id: fileIdInGoodState,
hash: gitBlobHash(fileIdInGoodState),
},
// Updated hash
{
_id: fileIdWithDifferentHashNotFound0,
hash: gitBlobHash(fileIdWithDifferentHashNotFound0),
},
// Updated hash
{
_id: fileIdRestoreFromFilestore0,
hash: gitBlobHash(fileIdRestoreFromFilestore0),
},
// Added hash
{
_id: fileIdRestoreFromFilestore1,
hash: gitBlobHash(fileIdRestoreFromFilestore1),
},
// No change, blob created
{
_id: fileIdBlobExistsInGCS0,
hash: gitBlobHash(fileIdBlobExistsInGCS0),
},
// No change, flagged
{
_id: fileIdBlobExistsInGCSCorrupted,
hash: gitBlobHash(fileIdBlobExistsInGCSCorrupted),
},
// Added hash
{
_id: fileIdBlobExistsInGCS1,
hash: gitBlobHash(fileIdBlobExistsInGCS1),
},
],
folders: [],
},
],
},
],
overleaf: { history: { id: historyId0 } },
// Incremented when removing file/updating hash
version: 6,
},
])
expect(await deletedProjectsCollection.find({}).toArray()).to.deep.equal([
{
_id: deleteProjectsRecordId0,
project: {
_id: projectIdDeleted0,
rootFolder: [
{
fileRefs: [
// Updated hash
{
_id: fileIdWithDifferentHashNotFound1,
hash: gitBlobHash(fileIdWithDifferentHashNotFound1),
},
],
folders: [
{
fileRefs: [],
folders: [
{
fileRefs: [
// Removed
// { _id: fileIdMissing1 },
],
folders: [],
},
],
},
],
},
],
overleaf: { history: { id: historyIdDeleted0 } },
// Incremented when removing file/updating hash
version: 102,
},
deleterData: {
deletedProjectId: projectIdDeleted0,
},
},
])
const writtenBlobsByProject = new Map()
for (const { projectId, fileId } of writtenBlobs) {
writtenBlobsByProject.set(
projectId,
(writtenBlobsByProject.get(projectId) || []).concat([fileId])
)
}
expect(
(await backedUpBlobs.find({}, { sort: { _id: 1 } }).toArray()).map(
entry => {
// blobs are pushed unordered into mongo. Sort the list for consistency.
entry.blobs.sort()
return entry
}
)
).to.deep.equal(
Array.from(writtenBlobsByProject.entries()).map(
([projectId, fileIds]) => {
return {
_id: projectId,
blobs: fileIds
.map(fileId => binaryForGitBlobHash(gitBlobHash(fileId)))
.sort(),
}
}
)
)
})
it('should have backed up all the files', async function () {
expect(tieringStorageClass).to.exist
const objects = await listS3Bucket(projectBlobsBucket, tieringStorageClass)
expect(objects.sort()).to.deep.equal(
writtenBlobs
.map(({ historyId, fileId, hash }) =>
makeProjectKey(historyId, hash || gitBlobHash(fileId))
)
.sort()
)
for (let { historyId, fileId } of writtenBlobs) {
const hash = gitBlobHash(fileId.toString())
const s = await backupPersistor.getObjectStream(
projectBlobsBucket,
makeProjectKey(historyId, hash),
{ autoGunzip: true }
)
const buf = new WritableBuffer()
await Stream.promises.pipeline(s, buf)
expect(gitBlobHashBuffer(buf.getContents())).to.equal(hash)
const id = buf.getContents().toString('utf-8')
expect(id).to.equal(fileId.toString())
// double check we are not comparing 'undefined' or '[object Object]' above
expect(id).to.match(/^[a-f0-9]{24}$/)
}
const deks = await listS3Bucket(deksBucket, 'STANDARD')
expect(deks.sort()).to.deep.equal(
Array.from(
new Set(
writtenBlobs.map(
({ historyId }) => projectKey.format(historyId) + '/dek'
)
)
).sort()
)
})
it('should have written the back filled files to history v1', async function () {
for (const { historyId, fileId } of writtenBlobs) {
const blobStore = new BlobStore(historyId.toString())
const hash = gitBlobHash(fileId.toString())
const blob = await blobStore.getBlob(hash)
expect(blob).to.exist
expect(blob.getByteLength()).to.equal(24)
const id = await blobStore.getString(hash)
expect(id).to.equal(fileId.toString())
// double check we are not comparing 'undefined' or '[object Object]' above
expect(id).to.match(/^[a-f0-9]{24}$/)
}
})
})