Merge pull request #32530 from overleaf/mg-stuck-sync-repro

Detect and auto-clear stuck sync states

GitOrigin-RevId: 680861a33e42432dab7d40ad421980b707eb6089
This commit is contained in:
Malik Glossop
2026-04-28 13:35:30 +02:00
committed by Copybot
parent ba182f8275
commit da027f46cf
8 changed files with 538 additions and 18 deletions

View File

@@ -1,5 +1,7 @@
import OError from '@overleaf/o-error'
export const SYNC_ONGOING_ERROR_MESSAGE = 'sync ongoing'
export class NotFoundError extends OError {}
export class BadRequestError extends OError {}
export class SyncError extends OError {}

View File

@@ -374,6 +374,7 @@ export function getResyncPending(req, res, next) {
if (err) return next(err)
res.json({
resyncPending: state.isSyncOngoing(),
syncStuck: state.isSyncStuck(),
})
})
}

View File

@@ -4,6 +4,7 @@ import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import * as UpdatesProcessor from './UpdatesProcessor.js'
import * as SyncManager from './SyncManager.js'
import { SYNC_ONGOING_ERROR_MESSAGE } from './Errors.js'
import * as WebApiManager from './WebApiManager.js'
import * as RedisManager from './RedisManager.js'
import * as ErrorRecorder from './ErrorRecorder.js'
@@ -44,9 +45,11 @@ promises.retryFailures = async (options = {}) => {
} else if (failureType === 'hard') {
const batch = await getFailureBatch(hardErrorSelector, limit)
const result = await retryFailureBatch(batch, timeout, async failure => {
await resyncProject(failure.project_id, {
hard: failureRequiresHardResync(failure),
})
// Ongoing-sync failures always use soft resync to preserve sync state.
// SyncManager needs existing state to detect and clear stuck syncs.
const hard =
failureRequiresHardResync(failure) && !isOngoingSyncFailure(failure)
await resyncProject(failure.project_id, { hard })
})
return result
}
@@ -62,6 +65,11 @@ function softErrorSelector(failure) {
}
function hardErrorSelector(failure) {
// Ongoing-sync failures are always retried via soft resync.
// SyncManager's stuck detection handles the retry limit (stuckClearCount).
if (isOngoingSyncFailure(failure)) return true
// Other failures: retry hard/repeated ones, but stop after MAX_RESYNC_ATTEMPTS
return (
(isHardFailure(failure) || isRepeatedFailure(failure)) &&
!isStuckFailure(failure)
@@ -84,6 +92,10 @@ function isRepeatedFailure(failure) {
return failure.attempts > 3
}
export function isOngoingSyncFailure(failure) {
return failure.error?.includes(SYNC_ONGOING_ERROR_MESSAGE) ?? false
}
function isStuckFailure(failure) {
return (
failure.resyncAttempts != null &&

View File

@@ -8,7 +8,11 @@ import logger from '@overleaf/logger'
import Metrics from '@overleaf/metrics'
import OError from '@overleaf/o-error'
import { File, Range } from 'overleaf-editor-core'
import { NeedFullProjectStructureResyncError, SyncError } from './Errors.js'
import {
NeedFullProjectStructureResyncError,
SYNC_ONGOING_ERROR_MESSAGE,
SyncError,
} from './Errors.js'
import { db, ObjectId } from './mongodb.js'
import * as SnapshotManager from './SnapshotManager.js'
import * as LockManager from './LockManager.js'
@@ -30,6 +34,8 @@ import { isInsert, isDelete } from './Utils.js'
*/
const MAX_RESYNC_HISTORY_RECORDS = 100 // keep this many records of previous resyncs
const EXPIRE_RESYNC_HISTORY_INTERVAL_MS = 90 * 24 * 3600 * 1000 // 90 days
const SYNC_STUCK_TIMEOUT_MS = 4 * 60 * 60 * 1000 // 4 hours
const MAX_STUCK_CLEAR_ATTEMPTS = 5
const keys = Settings.redis.lock.key_schema
@@ -93,7 +99,42 @@ async function startResyncWithoutLock(projectId, options) {
const syncState = await getResyncState(projectId)
if (syncState.isSyncOngoing()) {
throw new OError('sync ongoing')
if (syncState.isSyncStuck()) {
const stuckDocPaths = Array.from(syncState.resyncDocContents)
const stuckClearCount = syncState.stuckClearCount
if (stuckClearCount >= MAX_STUCK_CLEAR_ATTEMPTS) {
await _recordStuckClearInSyncState(projectId, stuckDocPaths)
Metrics.inc('project_history_sync_stuck_permanent')
// Log error only on first permanent-stuck detection to avoid spam
if (stuckClearCount === MAX_STUCK_CLEAR_ATTEMPTS) {
logger.error(
{
projectId,
stuckClearCount: stuckClearCount + 1,
stuckDocPaths,
resyncPendingSince: syncState.resyncPendingSince,
},
'sync permanently stuck — exceeded auto-clear limit'
)
}
throw new OError('sync permanently stuck')
}
logger.warn(
{
projectId,
stuckClearCount: stuckClearCount + 1,
stuckDocPaths,
resyncPendingSince: syncState.resyncPendingSince,
},
'sync stuck, clearing state and restarting'
)
Metrics.inc('project_history_sync_stuck_cleared')
await _recordStuckClearInSyncState(projectId, stuckDocPaths)
} else {
throw new OError(SYNC_ONGOING_ERROR_MESSAGE)
}
}
syncState.setOrigin(options.origin || { kind: 'history-resync' })
syncState.startProjectStructureSync()
@@ -167,7 +208,12 @@ async function setResyncState(projectId, syncState) {
update.$set.expiresAt = new Date(
Date.now() + EXPIRE_RESYNC_HISTORY_INTERVAL_MS
)
update.$unset = { resyncPendingSince: 1 }
update.$unset = {
resyncPendingSince: 1,
stuckClearCount: 1,
lastStuckClearAt: 1,
lastStuckDocPaths: 1,
}
}
// apply the update
@@ -184,6 +230,17 @@ async function clearResyncState(projectId) {
})
}
async function _recordStuckClearInSyncState(projectId, stuckDocPaths) {
await db.projectHistorySyncState.updateOne(
{ project_id: new ObjectId(projectId) },
{
$inc: { stuckClearCount: 1 },
$set: { lastStuckClearAt: new Date(), lastStuckDocPaths: stuckDocPaths },
$unset: { resyncPendingSince: 1 },
}
)
}
/**
* @param {string} projectId
* @param {Date} date
@@ -223,6 +280,7 @@ async function skipUpdatesDuringSync(projectId, updates) {
if (!shouldSkipUpdate) {
filteredUpdates.push(update)
} else {
Metrics.inc('project_history_sync_update_skipped')
logger.debug({ projectId, update }, 'skipping update due to resync')
}
}
@@ -297,7 +355,10 @@ class SyncState {
resyncCount,
resyncPendingSince,
lastUpdated,
history
history,
stuckClearCount,
lastStuckClearAt,
lastStuckDocPaths
) {
this.projectId = projectId
this.resyncProjectStructure = resyncProjectStructure
@@ -307,6 +368,9 @@ class SyncState {
this.resyncPendingSince = resyncPendingSince
this.lastUpdated = lastUpdated
this.history = history
this.stuckClearCount = stuckClearCount
this.lastStuckClearAt = lastStuckClearAt
this.lastStuckDocPaths = lastStuckDocPaths
}
static fromRaw(projectId, rawSyncState) {
@@ -336,6 +400,9 @@ class SyncState {
}
}
const lastUpdated = rawSyncState.lastUpdated
const stuckClearCount = rawSyncState.stuckClearCount ?? 0
const lastStuckClearAt = rawSyncState.lastStuckClearAt
const lastStuckDocPaths = rawSyncState.lastStuckDocPaths
return new SyncState(
projectId,
resyncProjectStructure,
@@ -344,7 +411,10 @@ class SyncState {
resyncCount,
resyncPendingSince,
lastUpdated,
history
history,
stuckClearCount,
lastStuckClearAt,
lastStuckDocPaths
)
}
@@ -459,6 +529,20 @@ class SyncState {
isSyncOngoing() {
return this.isProjectStructureSyncing() || this.isAnyDocContentSyncing()
}
isSyncStuck() {
if (!this.isSyncOngoing()) {
return false
}
if (!this.resyncPendingSince) {
// No timestamp recorded — treat long-running syncs without a timestamp
// as potentially stuck (legacy state from before this field was added)
return true
}
return (
Date.now() - this.resyncPendingSince.getTime() > SYNC_STUCK_TIMEOUT_MS
)
}
}
class SyncUpdateExpander {

View File

@@ -2385,6 +2385,204 @@ describe('Syncing with web and doc-updater', function () {
)
})
})
describe('stuck sync state (missing resyncDocContent updates)', function () {
beforeEach(function () {
MockHistoryStore()
.get(`/api/projects/${historyId}/latest/history`)
.reply(200, {
chunk: {
history: {
snapshot: {
files: {
'main.tex': {
hash: '0a207c060e61f3b88eaee0a8cd0696f46fb155eb',
stringLength: 3,
},
},
},
changes: [],
},
startVersion: 0,
},
})
MockHistoryStore()
.get(
`/api/projects/${historyId}/blobs/0a207c060e61f3b88eaee0a8cd0696f46fb155eb`
)
.reply(200, 'a\nb')
})
it('should get stuck when resyncDocContent update is missing', async function () {
// Step 1: Start a resync
await ProjectHistoryClient.resyncHistory(this.project_id)
// Step 2: Push resyncProjectStructure update (lists docs to sync)
// but do NOT push the corresponding resyncDocContent update
const structureUpdate = {
projectHistoryId: historyId,
resyncProjectStructure: {
docs: [{ path: '/main.tex' }],
files: [],
},
meta: {
ts: this.timestamp,
},
}
await ProjectHistoryClient.pushRawUpdate(
this.project_id,
structureUpdate
)
// Step 3: Flush — processes structure update, adds /main.tex to resyncDocContents
// This succeeds (204) but leaves sync in an incomplete state
await ProjectHistoryClient.flushProject(this.project_id)
// Step 4: Verify the sync state is stuck
const syncState = await ProjectHistoryClient.getSyncState(
this.project_id
)
expect(syncState).to.not.be.null
expect(syncState.resyncProjectStructure).to.equal(false)
expect(syncState.resyncDocContents).to.deep.equal(['/main.tex'])
expect(syncState.resyncPendingSince).to.be.a('date')
// Sync thinks it's ongoing because resyncDocContents is not empty
// Step 5: Push a normal text update (simulating a user edit)
const textUpdate = {
doc: this.doc_id,
op: [{ p: 3, i: '\nc' }],
v: 1,
meta: {
ts: this.timestamp,
pathname: '/main.tex',
doc_length: 3,
},
}
await ProjectHistoryClient.pushRawUpdate(this.project_id, textUpdate)
// Step 6: Flush again — the text update should be silently skipped
// because the doc is in resyncDocContents (sync "ongoing")
await ProjectHistoryClient.flushProject(this.project_id)
// Step 7: Verify the sync state is STILL stuck (text update was skipped)
const syncStateAfter = await ProjectHistoryClient.getSyncState(
this.project_id
)
expect(syncStateAfter.resyncProjectStructure).to.equal(false)
expect(syncStateAfter.resyncDocContents).to.deep.equal(['/main.tex'])
// Step 7b: Verify resync-pending reports ongoing but not yet stuck
const pending = await ProjectHistoryClient.getResyncPending(
this.project_id
)
expect(pending.resyncPending).to.equal(true)
expect(pending.syncStuck).to.equal(false)
// Step 8: Verify a normal resync FAILS because sync is "ongoing"
try {
await fetchNothing(
`http://127.0.0.1:3054/project/${this.project_id}/resync`,
{
method: 'POST',
json: { origin: { kind: 'test-origin' } },
}
)
assert.fail('resync should have failed')
} catch (error) {
if (error instanceof RequestFailedError) {
expect(error.response.status).to.equal(500)
} else {
throw error
}
}
})
it('should recover with a hard resync (force=true)', async function () {
// Set up the stuck state (same as above)
await ProjectHistoryClient.resyncHistory(this.project_id)
const structureUpdate = {
projectHistoryId: historyId,
resyncProjectStructure: {
docs: [{ path: '/main.tex' }],
files: [],
},
meta: {
ts: this.timestamp,
},
}
await ProjectHistoryClient.pushRawUpdate(
this.project_id,
structureUpdate
)
await ProjectHistoryClient.flushProject(this.project_id)
// Verify stuck
const syncState = await ProjectHistoryClient.getSyncState(
this.project_id
)
expect(syncState.resyncDocContents).to.deep.equal(['/main.tex'])
// Mock the web resync endpoint for the hard resync
MockWeb()
.post(`/project/${this.project_id}/history/resync`)
.reply(204)
// Hard resync should clear the stuck state and succeed
const response = await fetchNothing(
`http://127.0.0.1:3054/project/${this.project_id}/resync?force=true`,
{
method: 'POST',
json: { origin: { kind: 'test-origin' } },
}
)
expect(response.status).to.equal(204)
// After hard resync, sync state should be cleared or reset
const syncStateAfter = await ProjectHistoryClient.getSyncState(
this.project_id
)
// Hard resync clears and restarts — at this point the new resync
// should be in progress (resyncProjectStructure = true) or cleared
// depending on whether web sent updates
if (syncStateAfter) {
expect(syncStateAfter.resyncProjectStructure).to.equal(true)
}
})
it('should auto-recover when stuck (null resyncPendingSince)', async function () {
// Inject a stuck sync state: ongoing but no resyncPendingSince
// (legacy state from before the timestamp field was added)
await ProjectHistoryClient.injectStuckSyncState(this.project_id, [
'/main.tex',
])
// Verify the state is recognised as stuck
const pending = await ProjectHistoryClient.getResyncPending(
this.project_id
)
expect(pending.resyncPending).to.equal(true)
expect(pending.syncStuck).to.equal(true)
// Mock the web callback that requestResync triggers
MockWeb()
.post(`/project/${this.project_id}/history/resync`)
.reply(204)
// A plain resync (no force) should detect stuck, clear, and restart
await ProjectHistoryClient.resyncHistory(this.project_id)
// stuckClearCount incremented and a new structure sync started
const syncState = await ProjectHistoryClient.getSyncState(
this.project_id
)
expect(syncState.stuckClearCount).to.equal(1)
expect(syncState.resyncProjectStructure).to.equal(true)
expect(syncState.lastStuckDocPaths).to.deep.equal(['/main.tex'])
})
})
})
})

View File

@@ -1,7 +1,7 @@
import { expect } from 'chai'
import Settings from '@overleaf/settings'
import RedisWrapper from '@overleaf/redis-wrapper'
import { db } from '../../../../app/js/mongodb.js'
import { db, ObjectId } from '../../../../app/js/mongodb.js'
import {
fetchJson,
fetchJsonWithResponse,
@@ -223,6 +223,32 @@ export async function getFailures() {
return failures
}
export async function getSyncState(projectId) {
return await db.projectHistorySyncState.findOne({
project_id: new ObjectId(projectId),
})
}
export async function getResyncPending(projectId) {
return await fetchJson(
`http://127.0.0.1:3054/project/${projectId}/resync-pending`
)
}
export async function injectStuckSyncState(projectId, docPaths) {
await db.projectHistorySyncState.replaceOne(
{ project_id: new ObjectId(projectId) },
{
project_id: new ObjectId(projectId),
resyncProjectStructure: false,
resyncDocContents: docPaths,
stuckClearCount: 0,
history: [],
},
{ upsert: true }
)
}
export async function deleteProject(projectId) {
const response = await fetchNothing(
`http://127.0.0.1:3054/project/${projectId}`,

View File

@@ -12,6 +12,8 @@ describe('RetryManager', function () {
this.projectId2 = new ObjectId().toString()
this.projectId3 = new ObjectId().toString()
this.projectId4 = new ObjectId().toString()
this.projectId5 = new ObjectId().toString()
this.projectId6 = new ObjectId().toString()
this.historyId = 12345
this.WebApiManager = {
@@ -39,13 +41,25 @@ describe('RetryManager', function () {
},
{
project_id: this.projectId3,
error: 'sync ongoing',
attempts: 10,
error: 'OpsOutOfOrderError: doc version out of order',
attempts: 5,
resyncAttempts: 1,
},
{
project_id: this.projectId4,
error: 'sync ongoing',
error: 'OpsOutOfOrderError: doc version out of order',
attempts: 5,
resyncAttempts: 2,
},
{
project_id: this.projectId5,
error: 'OError: sync ongoing',
attempts: 10,
resyncAttempts: 1,
},
{
project_id: this.projectId6,
error: 'OError: sync ongoing',
attempts: 10,
resyncAttempts: 2,
},
@@ -129,6 +143,24 @@ describe('RetryManager', function () {
).not.to.have.been.calledWith(this.projectId4)
})
it('should use soft resync for sync ongoing failures regardless of resyncAttempts', function () {
expect(this.SyncManager.promises.startResync).to.have.been.calledWith(
this.projectId5
)
expect(
this.SyncManager.promises.startHardResync
).not.to.have.been.calledWith(this.projectId5)
})
it('should retry stuck sync ongoing failures via soft resync', function () {
expect(this.SyncManager.promises.startResync).to.have.been.calledWith(
this.projectId6
)
expect(
this.SyncManager.promises.startHardResync
).not.to.have.been.calledWith(this.projectId6)
})
it('should count the unprocessed updates', function () {
expect(
this.RedisManager.promises.countUnprocessedUpdates

View File

@@ -194,11 +194,11 @@ describe('SyncManager', function () {
project_id: new ObjectId(this.projectId),
},
sinon.match({
$set: {
$set: sinon.match({
resyncProjectStructure: true,
resyncDocContents: [],
origin: { kind: 'history-resync' },
},
}),
$currentDate: { lastUpdated: true },
$inc: { resyncCount: 1 },
$unset: { expiresAt: true },
@@ -212,7 +212,10 @@ describe('SyncManager', function () {
describe('if project structure sync is in progress', function () {
beforeEach(function () {
const syncState = { resyncProjectStructure: true }
const syncState = {
resyncProjectStructure: true,
resyncPendingSince: new Date(),
}
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
@@ -225,7 +228,10 @@ describe('SyncManager', function () {
describe('if doc content sync in is progress', function () {
beforeEach(async function () {
const syncState = { resyncDocContents: ['/foo.tex'] }
const syncState = {
resyncDocContents: ['/foo.tex'],
resyncPendingSince: new Date(),
}
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
@@ -235,6 +241,112 @@ describe('SyncManager', function () {
).to.be.rejectedWith('sync ongoing')
})
})
describe('if sync is stuck (no resyncPendingSince — legacy state)', function () {
beforeEach(function () {
const syncState = { resyncDocContents: ['/foo.tex'] }
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
it('records the stuck clear, does not delete state, and starts a new resync', async function () {
await this.SyncManager.promises.startResync(this.projectId)
expect(
this.db.projectHistorySyncState.updateOne
).to.have.been.calledWith(
{ project_id: new ObjectId(this.projectId) },
sinon.match({
$inc: { stuckClearCount: 1 },
$unset: { resyncPendingSince: 1 },
})
)
expect(this.WebApiManager.promises.requestResync.calledOnce).to.be.true
expect(
this.Metrics.inc.calledWith('project_history_sync_stuck_cleared')
).to.be.true
})
})
describe('if sync is stuck (resyncPendingSince older than timeout)', function () {
beforeEach(function () {
const fiveHoursAgo = new Date(Date.now() - 5 * 60 * 60 * 1000)
const syncState = {
resyncDocContents: ['/foo.tex'],
resyncPendingSince: fiveHoursAgo,
}
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
it('records the stuck clear and starts a new resync', async function () {
await this.SyncManager.promises.startResync(this.projectId)
expect(
this.db.projectHistorySyncState.updateOne
).to.have.been.calledWith(
{ project_id: new ObjectId(this.projectId) },
sinon.match({
$inc: { stuckClearCount: 1 },
$unset: { resyncPendingSince: 1 },
})
)
expect(this.WebApiManager.promises.requestResync.calledOnce).to.be.true
})
})
describe('if sync is not stuck (resyncPendingSince within timeout)', function () {
beforeEach(function () {
const threeHoursAgo = new Date(Date.now() - 3 * 60 * 60 * 1000)
const syncState = {
resyncDocContents: ['/foo.tex'],
resyncPendingSince: threeHoursAgo,
}
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
it('throws sync ongoing and does not clear', async function () {
await expect(
this.SyncManager.promises.startResync(this.projectId)
).to.be.rejectedWith('sync ongoing')
expect(
this.Metrics.inc.calledWith('project_history_sync_stuck_cleared')
).to.be.false
expect(
this.Metrics.inc.calledWith('project_history_sync_stuck_permanent')
).to.be.false
})
})
describe('if sync is permanently stuck (exceeded retry limit)', function () {
beforeEach(function () {
const syncState = {
resyncDocContents: ['/foo.tex'],
stuckClearCount: 5,
}
this.db.projectHistorySyncState.findOne.resolves(syncState)
})
it('throws permanently stuck, emits metric, and records the attempt', async function () {
await expect(
this.SyncManager.promises.startResync(this.projectId)
).to.be.rejectedWith('sync permanently stuck')
expect(
this.Metrics.inc.calledWith('project_history_sync_stuck_permanent')
).to.be.true
expect(
this.db.projectHistorySyncState.updateOne
).to.have.been.calledWith(
{ project_id: new ObjectId(this.projectId) },
sinon.match({ $inc: { stuckClearCount: 1 } })
)
})
it('does not attempt a resync', async function () {
try {
await this.SyncManager.promises.startResync(this.projectId)
} catch (e) {
// expected
}
expect(this.WebApiManager.promises.requestResync.called).to.be.false
})
})
})
describe('setResyncState', function () {
@@ -265,7 +377,11 @@ describe('SyncManager', function () {
).to.have.been.calledWith(
{ project_id: new ObjectId(this.projectId) },
sinon.match({
$set: this.syncState.toRaw(),
$set: sinon.match({
resyncProjectStructure: true,
resyncDocContents: [],
origin: { kind: 'history-resync' },
}),
$currentDate: { lastUpdated: true },
$inc: { resyncCount: 1 },
$unset: { expiresAt: true },
@@ -289,7 +405,7 @@ describe('SyncManager', function () {
}
})
it('sets the sync state entry in mongo to expire', async function () {
it('sets the sync state entry in mongo to expire and clears stuck fields', async function () {
await this.SyncManager.promises.setResyncState(
this.projectId,
this.syncState
@@ -307,6 +423,12 @@ describe('SyncManager', function () {
expiresAt: new Date(this.now.getTime() + 90 * 24 * 3600 * 1000),
},
$currentDate: { lastUpdated: true },
$unset: {
resyncPendingSince: 1,
stuckClearCount: 1,
lastStuckClearAt: 1,
lastStuckDocPaths: 1,
},
}),
{ upsert: true }
)
@@ -2005,4 +2127,47 @@ describe('SyncManager', function () {
})
})
})
describe('stuck sync state reproduction', function () {
describe('when resyncDocContents has entries but no resyncDocContent update arrives', function () {
beforeEach(function () {
// Simulate the stuck state: project structure sync is done,
// but doc content sync never completed for /main.tex
const stuckSyncState = {
resyncProjectStructure: false,
resyncDocContents: ['/main.tex'],
origin: { kind: 'history-resync' },
}
this.db.projectHistorySyncState.findOne.resolves(stuckSyncState)
})
it('should skip text updates for a doc stuck in resyncDocContents', async function () {
const textUpdate = {
doc: new ObjectId(),
op: [{ i: 'hello', p: 0 }],
meta: {
pathname: '/main.tex',
doc_length: 0,
},
}
this.UpdateTranslator.isTextUpdate.returns(false)
this.UpdateTranslator.isTextUpdate.withArgs(textUpdate).returns(true)
const { updates: filteredUpdates, syncState } =
await this.SyncManager.promises.skipUpdatesDuringSync(
this.projectId,
[textUpdate]
)
// The text update is silently dropped — this is the stuck state bug
expect(filteredUpdates).to.deep.equal([])
// Sync state still shows /main.tex as syncing
expect(syncState.toRaw().resyncDocContents).to.deep.equal(['/main.tex'])
// Metric is emitted for the skipped update
expect(
this.Metrics.inc.calledWith('project_history_sync_update_skipped')
).to.be.true
})
})
})
})