diff --git a/services/history-v1/backupVerifier/ProjectSampler.mjs b/services/history-v1/backupVerifier/ProjectSampler.mjs new file mode 100644 index 0000000000..f1f697bd6d --- /dev/null +++ b/services/history-v1/backupVerifier/ProjectSampler.mjs @@ -0,0 +1,61 @@ +// @ts-check +import { objectIdFromDate } from './utils.mjs' +import { db } from '../storage/lib/mongodb.js' +import config from 'config' + +const projectsCollection = db.collection('projects') + +const HAS_PROJECTS_WITHOUT_HISTORY = + config.get('hasProjectsWithoutHistory') === 'true' + +/** + * @param {Date} start + * @param {Date} end + * @param {number} N + * @return {Promise>} + */ +export async function selectProjectsInDateRange(start, end, N) { + let projects = await projectsCollection + .aggregate([ + { + $match: { + _id: { + $gt: objectIdFromDate(start), + $lte: objectIdFromDate(end), + }, + }, + }, + { $sample: { size: N } }, + { $project: { 'overleaf.history.id': 1 } }, + ]) + .toArray() + if (HAS_PROJECTS_WITHOUT_HISTORY) { + projects = projects.filter(p => Boolean(p.overleaf?.history?.id)) + if (projects.length === 0) { + // Very unlucky sample. Try again. + return await selectProjectsInDateRange(start, end, N) + } + } + return projects.map(p => p.overleaf.history.id.toString()) +} + +export async function* getSampleProjectsCursor(N) { + const cursor = projectsCollection.aggregate([ + { $sample: { size: N } }, + { $project: { 'overleaf.history.id': 1 } }, + ]) + + let validProjects = 0 + + for await (const project of cursor) { + if (HAS_PROJECTS_WITHOUT_HISTORY) { + continue + } + validProjects++ + yield project.overleaf.history.id.toString() + } + + if (validProjects === 0) { + yield* getSampleProjectsCursor(N) + } +} diff --git a/services/history-v1/backupVerifier/ProjectVerifier.mjs b/services/history-v1/backupVerifier/ProjectVerifier.mjs new file mode 100644 index 0000000000..33887de805 --- /dev/null +++ b/services/history-v1/backupVerifier/ProjectVerifier.mjs @@ -0,0 +1,272 @@ +// @ts-check +import { + BackupCorruptedError, + BackupCorruptedInvalidBlobError, + BackupCorruptedMissingBlobError, + BackupRPOViolationChunkNotBackedUpError, + BackupRPOViolationError, + verifyProjectWithErrorContext, +} from '../storage/lib/backupVerifier.mjs' +import { promiseMapSettledWithLimit } from '@overleaf/promise-utils' +import logger from '@overleaf/logger' +import metrics from '@overleaf/metrics' +import { + getSampleProjectsCursor, + selectProjectsInDateRange, +} from './ProjectSampler.mjs' +import OError from '@overleaf/o-error' + +const MS_PER_30_DAYS = 30 * 24 * 60 * 60 * 1000 + +const METRICS = { + backup_project_verification_failed: 'backup_project_verification_failed', + backup_project_verification_succeeded: + 'backup_project_verification_succeeded', +} + +let WRITE_METRICS = false + +/** + * Allows writing metrics to be enabled or disabled. + * @param {Boolean} writeMetrics + */ +export function setWriteMetrics(writeMetrics) { + WRITE_METRICS = writeMetrics +} + +/** + * + * @param {Error|unknown} error + * @param {string} historyId + */ +function handleVerificationError(error, historyId) { + let errorType = 'unknown' + // Additional detail for verbose/development + if (error instanceof Error) { + if (error instanceof BackupCorruptedInvalidBlobError) { + errorType = 'invalid-blob' + } else if (error instanceof BackupCorruptedMissingBlobError) { + errorType = 'missing-blob' + } else if (error instanceof BackupCorruptedError) { + errorType = 'backup-corrupted-unknown' + } else if (error instanceof BackupRPOViolationChunkNotBackedUpError) { + errorType = 'rpo-violation-chunk-not-backed-up' + } else if (error instanceof BackupRPOViolationError) { + errorType = 'rpo-violation' + } + } + + logger.error( + { errorType, historyId, error }, + 'error verifying project backup' + ) + + WRITE_METRICS && + metrics.inc(METRICS.backup_project_verification_failed, 1, { errorType }) + + return errorType +} + +/** + * + * @param {Date} startDate + * @param {Date} endDate + * @param {number} interval + * @returns {Array} + */ +function splitJobs(startDate, endDate, interval) { + /** @type {Array} */ + const jobs = [] + while (startDate < endDate) { + const nextStart = new Date( + Math.min(startDate.getTime() + interval, endDate.getTime()) + ) + jobs.push({ startDate, endDate: nextStart }) + startDate = nextStart + } + return jobs +} + +/** + * + * @param {Array} historyIds + * @return {Promise} + */ +async function verifyProjects(historyIds) { + let verified = 0 + const errorTypes = [] + for (const historyId of historyIds) { + try { + await verifyProjectWithErrorContext(historyId) + logger.debug({ historyId }, 'verified project backup successfully') + WRITE_METRICS && + metrics.inc(METRICS.backup_project_verification_succeeded) + verified++ + } catch (error) { + errorTypes.push(handleVerificationError(error, historyId)) + } + } + return { + verified, + errorTypes, + hasFailure: errorTypes.length > 0, + total: historyIds.length, + } +} + +/** + * + * @param {number} nProjectsToSample + * @return {Promise} + */ +export async function verifyRandomProjectSample(nProjectsToSample) { + const historyIds = await getSampleProjectsCursor(nProjectsToSample) + + const errorTypes = [] + let verified = 0 + let total = 0 + for await (const historyId of historyIds) { + total++ + try { + await verifyProjectWithErrorContext(historyId) + logger.debug({ historyId }, 'verified project backup successfully') + WRITE_METRICS && + metrics.inc(METRICS.backup_project_verification_succeeded) + verified++ + } catch (error) { + errorTypes.push(handleVerificationError(error, historyId)) + } + } + return { + verified, + total, + errorTypes, + hasFailure: errorTypes.length > 0, + } +} + +/** + * Samples projects with history IDs between the specified dates and verifies them. + * + * @param {Date} startDate + * @param {Date} endDate + * @param {number} projectsPerRange + * @return {Promise} + */ +async function verifyRange(startDate, endDate, projectsPerRange) { + logger.info({ startDate, endDate }, 'verifying range') + const historyIds = await selectProjectsInDateRange( + startDate, + endDate, + projectsPerRange + ) + if (historyIds.length === 0) { + logger.debug( + { start: startDate, end: endDate }, + 'No projects found in range' + ) + return { + startDate, + endDate, + verified: 0, + total: 0, + hasFailure: false, + errorTypes: [], + } + } + logger.debug( + { startDate, endDate, total: historyIds.length }, + 'Verifying projects in range' + ) + + const { errorTypes, hasFailure, verified } = await verifyProjects(historyIds) + + const jobStatus = { + verified, + total: historyIds.length, + hasFailure, + startDate, + endDate, + errorTypes, + } + + logger.debug(jobStatus, 'verified range') + return jobStatus +} + +/** + * @typedef {Object} VerificationJobSpecification + * @property {Date} startDate + * @property {Date} endDate + */ + +/** + * @typedef {import('./types.d.ts').VerificationJobStatus} VerificationJobStatus + */ + +/** + * @typedef {Object} VerifyDateRangeOptions + * @property {Date} startDate + * @property {Date} endDate + * @property {number} [interval] + * @property {number} [projectsPerRange] + * @property {number} [concurrency] + */ + +/** + * + * @param {VerifyDateRangeOptions} options + * @return {Promise} + */ +export async function verifyProjectsInDateRange({ + concurrency = 0, + projectsPerRange = 10, + startDate, + endDate, + interval = MS_PER_30_DAYS, +}) { + const jobs = splitJobs(startDate, endDate, interval) + if (jobs.length === 0) { + throw new OError('Time range could not be split into jobs', { + start: startDate, + end: endDate, + interval, + }) + } + const settlements = await promiseMapSettledWithLimit( + concurrency, + jobs, + ({ startDate, endDate }) => + verifyRange(startDate, endDate, projectsPerRange) + ) + return settlements.reduce( + /** + * + * @param {VerificationJobStatus} acc + * @param settlement + * @return {VerificationJobStatus} + */ + (acc, settlement) => { + if (settlement.status !== 'rejected') { + if (settlement.value.hasFailure) { + acc.hasFailure = true + } + acc.total += settlement.value.total + acc.verified += settlement.value.verified + acc.errorTypes = acc.errorTypes.concat(settlement.value.errorTypes) + } else { + logger.error({ ...settlement.reason }, 'Error processing range') + } + return acc + }, + /** @type {VerificationJobStatus} */ + { + startDate, + endDate, + verified: 0, + total: 0, + hasFailure: false, + errorTypes: [], + } + ) +} diff --git a/services/history-v1/backupVerifier/types.d.ts b/services/history-v1/backupVerifier/types.d.ts new file mode 100644 index 0000000000..7bfa4a85ff --- /dev/null +++ b/services/history-v1/backupVerifier/types.d.ts @@ -0,0 +1,8 @@ +export type VerificationJobStatus = { + verified: number + total: number + startDate?: Date + endDate?: Date + hasFailure: boolean + errorTypes: Array +} diff --git a/services/history-v1/backupVerifier/utils.mjs b/services/history-v1/backupVerifier/utils.mjs new file mode 100644 index 0000000000..ba3f564eb8 --- /dev/null +++ b/services/history-v1/backupVerifier/utils.mjs @@ -0,0 +1,9 @@ +import { ObjectId } from 'mongodb' + +/** + * @param {Date} time + * @return {ObjectId} + */ +export function objectIdFromDate(time) { + return ObjectId.createFromTime(time.getTime() / 1000) +} diff --git a/services/history-v1/config/default.json b/services/history-v1/config/default.json index c609730650..81ae057171 100644 --- a/services/history-v1/config/default.json +++ b/services/history-v1/config/default.json @@ -30,6 +30,7 @@ "zipStore": { "zipTimeoutMs": "360000" }, + "hasProjectsWithoutHistory": false, "minSoftDeletionPeriodDays": "90", "maxDeleteKeys": "1000", "useDeleteObjects": "true", diff --git a/services/history-v1/storage/scripts/verify_sampled_projects.mjs b/services/history-v1/storage/scripts/verify_sampled_projects.mjs new file mode 100644 index 0000000000..6756ce095b --- /dev/null +++ b/services/history-v1/storage/scripts/verify_sampled_projects.mjs @@ -0,0 +1,182 @@ +// @ts-check +import commandLineArgs from 'command-line-args' +import { + setWriteMetrics, + verifyProjectsInDateRange, + verifyRandomProjectSample, +} from '../../backupVerifier/ProjectVerifier.mjs' +import knex from '../lib/knex.js' +import { client } from '../lib/mongodb.js' +import { setTimeout } from 'node:timers/promises' +import logger from '@overleaf/logger' + +logger.logger.level('fatal') + +const usageMessage = [ + 'Usage: node verify_range.mjs [--startDate ] [--endDate ] [--nProjects ] [--verbose] [--usage] [--writeMetrics] [--concurrency ] [--strategy ]', + 'strategy: defaults to "range"; startDate and endDate are required for "range" strategy', +].join('\n') + +/** + * Gracefully shutdown the process + * @param code + * @return {Promise} + */ +async function gracefulShutdown(code = process.exitCode) { + await knex.destroy() + await client.close() + await setTimeout(1_000) + process.exit(code) +} + +const STATS = { + verifiable: 0, + unverifiable: 0, +} + +/** + * @typedef {Object} CLIOptions + * @property {() => Promise} projectVerifier + * @property {boolean} verbose + */ + +/** + * @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus + */ + +/** + * + * @return {CLIOptions} + */ +function getOptions() { + const { + startDate, + endDate, + concurrency, + writeMetrics, + verbose, + nProjects, + strategy, + usage, + } = commandLineArgs([ + { name: 'startDate', type: String }, + { name: 'endDate', type: String }, + { name: 'concurrency', type: Number, defaultValue: 1 }, + { name: 'verbose', type: Boolean, defaultValue: false }, + { name: 'nProjects', type: Number, defaultValue: 10 }, + { name: 'usage', type: Boolean, defaultValue: false }, + { name: 'writeMetrics', type: Boolean, defaultValue: false }, + { name: 'strategy', type: String, defaultValue: 'range' }, + ]) + + if (usage) { + console.log(usageMessage) + process.exit(0) + } + + if (!['range', 'random'].includes(strategy)) { + throw new Error(`Invalid strategy: ${strategy}`) + } + + setWriteMetrics(writeMetrics) + + switch (strategy) { + case 'random': + console.log('Verifying random projects') + return { + verbose, + projectVerifier: () => verifyRandomProjectSample(nProjects), + } + case 'range': + default: { + if (!startDate || !endDate) { + throw new Error(usageMessage) + } + const start = Date.parse(startDate) + const end = Date.parse(endDate) + if (Number.isNaN(start)) { + throw new Error(`Invalid start date: ${startDate}`) + } + + if (Number.isNaN(end)) { + throw new Error(`Invalid end date: ${endDate}`) + } + if (verbose) { + console.log(`Verifying from ${startDate} to ${endDate}`) + console.log(`Concurrency: ${concurrency}`) + } + STATS.ranges = 0 + return { + projectVerifier: () => + verifyProjectsInDateRange({ + startDate: new Date(start), + endDate: new Date(end), + projectsPerRange: nProjects, + concurrency, + }), + verbose, + } + } + } +} + +/** + * @type {CLIOptions} + */ +let options +try { + options = getOptions() +} catch (error) { + console.error(error) + process.exitCode = 1 + await gracefulShutdown(1) + process.exit() // just here so the type checker knows that the process will exit +} + +const { projectVerifier, verbose } = options + +if (verbose) { + logger.logger.level('debug') +} else { + logger.logger.level('error') +} + +/** + * + * @param {Array} array + * @param {string} matchString + * @return {*} + */ +function sumStringInstances(array, matchString) { + return array.reduce((total, string) => { + return string === matchString ? total + 1 : total + }, 0) +} + +/** + * + * @param {VerificationJobStatus} stats + */ +function displayStats(stats) { + console.log(`Verified projects: ${stats.verified}`) + console.log(`Total projects sampled: ${stats.total}`) + if (stats.errorTypes.length > 0) { + console.log('Errors:') + for (const error of new Set(stats.errorTypes)) { + console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`) + } + } +} + +try { + const stats = await projectVerifier() + displayStats(stats) + console.log(`completed`) +} catch (error) { + console.error(error) + console.log('completed with errors') + process.exitCode = 1 +} finally { + console.log('shutting down') + await gracefulShutdown() +}