From d7d7cd85be93717631b8577bdd4210e27d4a8c3f Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Mon, 17 Mar 2025 15:46:23 +0000 Subject: [PATCH] Merge pull request #24335 from overleaf/bg-warn-old-pending-changes add warning for backups outside rpo threshold GitOrigin-RevId: a8421529ae64693d860b0325961b010a132426da --- .../storage/scripts/backup_scheduler.mjs | 37 +++++++++++++++++-- .../storage/scripts/backup_worker.mjs | 12 +++++- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/services/history-v1/storage/scripts/backup_scheduler.mjs b/services/history-v1/storage/scripts/backup_scheduler.mjs index 096e64225a..32edc1d0af 100644 --- a/services/history-v1/storage/scripts/backup_scheduler.mjs +++ b/services/history-v1/storage/scripts/backup_scheduler.mjs @@ -2,7 +2,10 @@ import Queue from 'bull' import config from 'config' import commandLineArgs from 'command-line-args' import logger from '@overleaf/logger' -import { listPendingBackups } from '../lib/backup_store/index.js' +import { + listPendingBackups, + getBackupStatus, +} from '../lib/backup_store/index.js' logger.initialize('backup-queue') @@ -65,6 +68,12 @@ const optionDefinitions = [ description: 'Number of retry attempts for failed jobs (default: 3)', defaultValue: 3, }, + { + name: 'warn-threshold', + type: Number, + description: 'Warn about any project exceeding this pending age', + defaultValue: 2 * 3600, // 2 hours + }, { name: 'verbose', alias: 'v', @@ -75,6 +84,7 @@ const optionDefinitions = [ // Parse command line arguments const options = commandLineArgs(optionDefinitions) +const WARN_THRESHOLD = options['warn-threshold'] // Helper to validate date format function isValidDateFormat(dateStr) { @@ -209,11 +219,27 @@ async function processPendingProjects( let existingCount = 0 // Pass the limit directly to MongoDB query for better performance const pendingCursor = listPendingBackups(timeIntervalMs, limit) - + const changeTimes = [] for await (const project of pendingCursor) { const projectId = project._id.toHexString() const pendingAt = project.overleaf?.backup?.pendingChangeAt - + if (pendingAt) { + changeTimes.push(pendingAt) + const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000) + if (pendingAge > WARN_THRESHOLD) { + const backupStatus = await getBackupStatus(projectId) + logger.warn( + { + projectId, + pendingAt, + pendingAge, + backupStatus, + warnThreshold: WARN_THRESHOLD, + }, + `pending change exceeds rpo warning threshold` + ) + } + } if (showOnly && verbose) { console.log( `Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})` @@ -252,6 +278,10 @@ async function processPendingProjects( } } + const oldestChange = changeTimes.reduce((min, time) => + time < min ? time : min + ) + if (showOnly) { console.log( `Found ${count} projects with pending changes (not added to queue)` @@ -260,6 +290,7 @@ async function processPendingProjects( console.log(`Found ${count} projects with pending changes:`) console.log(` ${addedCount} jobs added to queue`) console.log(` ${existingCount} jobs already existed in queue`) + console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`) } } diff --git a/services/history-v1/storage/scripts/backup_worker.mjs b/services/history-v1/storage/scripts/backup_worker.mjs index 90a81c40b8..1097bb04b9 100644 --- a/services/history-v1/storage/scripts/backup_worker.mjs +++ b/services/history-v1/storage/scripts/backup_worker.mjs @@ -9,6 +9,7 @@ import { } from './backup.mjs' const CONCURRENCY = 15 +const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this const redisOptions = config.get('redis.queue') const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds const LAG_TIME_BUCKETS_HRS = [ @@ -72,7 +73,7 @@ backupQueue.process(CONCURRENCY, async job => { const { projectId, startDate, endDate } = job.data if (projectId) { - return await runBackup(projectId, job.data) + return await runBackup(projectId, job.data, job) } else if (startDate && endDate) { return await runInit(startDate, endDate) } else { @@ -80,7 +81,7 @@ backupQueue.process(CONCURRENCY, async job => { } }) -async function runBackup(projectId, data) { +async function runBackup(projectId, data, job) { const { pendingChangeAt } = data // record the time it takes to run the backup job const timer = new metrics.Timer( @@ -89,6 +90,13 @@ async function runBackup(projectId, data) { {}, JOB_TIME_BUCKETS ) + const pendingAge = Date.now() - pendingChangeAt + if (pendingAge > WARN_THRESHOLD) { + logger.warn( + { projectId, pendingAge, job }, + 'project has been pending for a long time' + ) + } try { logger.debug({ projectId }, 'processing backup for project') await backupProject(projectId, {})