Merge pull request #24335 from overleaf/bg-warn-old-pending-changes

Add warning for backups outside the RPO (recovery point objective) threshold

GitOrigin-RevId: a8421529ae64693d860b0325961b010a132426da
This commit is contained in:
Brian Gough
2025-03-17 15:46:23 +00:00
committed by Copybot
parent 1fdf4f866f
commit d7d7cd85be
2 changed files with 44 additions and 5 deletions
@@ -2,7 +2,10 @@ import Queue from 'bull'
import config from 'config'
import commandLineArgs from 'command-line-args'
import logger from '@overleaf/logger'
import { listPendingBackups } from '../lib/backup_store/index.js'
import {
listPendingBackups,
getBackupStatus,
} from '../lib/backup_store/index.js'
logger.initialize('backup-queue')
@@ -65,6 +68,12 @@ const optionDefinitions = [
description: 'Number of retry attempts for failed jobs (default: 3)',
defaultValue: 3,
},
{
name: 'warn-threshold',
type: Number,
description: 'Warn about any project exceeding this pending age',
defaultValue: 2 * 3600, // 2 hours
},
{
name: 'verbose',
alias: 'v',
@@ -75,6 +84,7 @@ const optionDefinitions = [
// Parse command line arguments
const options = commandLineArgs(optionDefinitions)
// Pending-age limit for warnings, taken from the --warn-threshold option.
// It is compared against a pending age computed in whole seconds
// (milliseconds divided by 1000), and the option default is 2 * 3600 (2 hours).
const WARN_THRESHOLD = options['warn-threshold']
// Helper to validate date format
function isValidDateFormat(dateStr) {
@@ -209,11 +219,27 @@ async function processPendingProjects(
let existingCount = 0
// Pass the limit directly to MongoDB query for better performance
const pendingCursor = listPendingBackups(timeIntervalMs, limit)
const changeTimes = []
for await (const project of pendingCursor) {
const projectId = project._id.toHexString()
const pendingAt = project.overleaf?.backup?.pendingChangeAt
if (pendingAt) {
changeTimes.push(pendingAt)
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
if (pendingAge > WARN_THRESHOLD) {
const backupStatus = await getBackupStatus(projectId)
logger.warn(
{
projectId,
pendingAt,
pendingAge,
backupStatus,
warnThreshold: WARN_THRESHOLD,
},
`pending change exceeds rpo warning threshold`
)
}
}
if (showOnly && verbose) {
console.log(
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
@@ -252,6 +278,10 @@ async function processPendingProjects(
}
}
// Earliest pendingChangeAt collected in changeTimes, or undefined when no
// project supplied one. The explicit initial value is required: reduce()
// without it throws a TypeError on an empty array, which would abort the
// summary even in show-only mode where this value is never printed.
const oldestChange = changeTimes.reduce(
  (min, time) => (min === undefined || time < min ? time : min),
  undefined
)
if (showOnly) {
console.log(
`Found ${count} projects with pending changes (not added to queue)`
@@ -260,6 +290,7 @@ async function processPendingProjects(
console.log(`Found ${count} projects with pending changes:`)
console.log(` ${addedCount} jobs added to queue`)
console.log(` ${existingCount} jobs already existed in queue`)
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
}
}
@@ -9,6 +9,7 @@ import {
} from './backup.mjs'
// Passed as the first argument to backupQueue.process
const CONCURRENCY = 15
// Age limit in milliseconds (2 hours). runBackup logs a warning when
// Date.now() - pendingChangeAt exceeds this value.
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
// Redis settings read from the 'redis.queue' config key
const redisOptions = config.get('redis.queue')
// Buckets handed to the metrics.Timer that times each backup job
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
@@ -72,7 +73,7 @@ backupQueue.process(CONCURRENCY, async job => {
const { projectId, startDate, endDate } = job.data
if (projectId) {
return await runBackup(projectId, job.data)
return await runBackup(projectId, job.data, job)
} else if (startDate && endDate) {
return await runInit(startDate, endDate)
} else {
@@ -80,7 +81,7 @@ backupQueue.process(CONCURRENCY, async job => {
}
})
async function runBackup(projectId, data) {
async function runBackup(projectId, data, job) {
const { pendingChangeAt } = data
// record the time it takes to run the backup job
const timer = new metrics.Timer(
@@ -89,6 +90,13 @@ async function runBackup(projectId, data) {
{},
JOB_TIME_BUCKETS
)
const pendingAge = Date.now() - pendingChangeAt
if (pendingAge > WARN_THRESHOLD) {
logger.warn(
{ projectId, pendingAge, job },
'project has been pending for a long time'
)
}
try {
logger.debug({ projectId }, 'processing backup for project')
await backupProject(projectId, {})