[web] add admin page for project history failures (#32751)

GitOrigin-RevId: c9564e07239b761851c9a86894f9ef3023dfcf2b
This commit is contained in:
Jakob Ackermann
2026-04-15 09:14:15 +02:00
committed by Copybot
parent 7faa4c5db4
commit 3dc8899334
8 changed files with 142 additions and 45 deletions

View File

@@ -11,15 +11,18 @@ import { db } from './mongodb.js'
*/
/**
* @template {{error?: string}} T
* @template {{error: string}|{}} T
* @param {T} failure
* @return {T}
*/
function normalizeFailure(failure) {
return {
...failure,
error: failure.error?.replace('OError:', 'Error:'),
if ('error' in failure && failure.error?.includes('OError:')) {
return {
...failure,
error: failure.error.replace('OError:', 'Error:'),
}
}
return failure
}
/**
@@ -149,10 +152,10 @@ async function getFailuresByType() {
const maxQueueSize = {}
// count all the failures and number of attempts by type
for (const result of results || []) {
const failureType = result.error
const failureType = 'error' in result ? result.error : 'resync'
const attempts = result.attempts || 1 // allow for field to be absent
const requests = result.requestCount || 0
const queueSize = result.queueSize || 0
const queueSize = 'queueSize' in result ? result.queueSize : 0
if (failureCounts[failureType] > 0) {
failureCounts[failureType]++
failureAttempts[failureType] += attempts
@@ -169,45 +172,60 @@ async function getFailuresByType() {
return { failureCounts, failureAttempts, failureRequests, maxQueueSize }
}
/**
* Mapping between error messages and short labels.
* @type {Record<string, string>}
*/
const SHORT_ERROR_NAMES = {
'Error: bad response from filestore: 404': 'filestore-404',
'Error: bad response from filestore: 500': 'filestore-500',
'NotFoundError: got a 404 from web api': 'web-api-404',
'Error: history store a non-success status code: 413': 'history-store-413',
'Error: history store a non-success status code: 422': 'history-store-422',
'Error: history store a non-success status code: 500': 'history-store-500',
'Error: history store a non-success status code: 503': 'history-store-503',
'Error: web returned a non-success status code: 500 (attempts: 2)': 'web-500',
'Error: ESOCKETTIMEDOUT': 'socket-timeout',
'Error: no project found': 'no-project-found',
'OpsOutOfOrderError: project structure version out of order on incoming updates':
'incoming-project-version-out-of-order',
'OpsOutOfOrderError: doc version out of order on incoming updates':
'incoming-doc-version-out-of-order',
'OpsOutOfOrderError: project structure version out of order':
'chunk-project-version-out-of-order',
'OpsOutOfOrderError: doc version out of order':
'chunk-doc-version-out-of-order',
'Error: failed to extend lock': 'lock-overrun',
'Error: tried to release timed out lock': 'lock-overrun',
'Error: Timeout': 'lock-overrun',
'Error: sync ongoing': 'sync-ongoing',
'SyncError: unexpected resyncProjectStructure update': 'sync-error',
'[object Error]': 'unknown-error-object',
'UpdateWithUnknownFormatError: update with unknown format': 'unknown-format',
'Error: update with unknown format': 'unknown-format',
'TextOperationError: The base length of the second operation has to be the target length of the first operation':
'text-op-error',
'Error: ENOSPC: no space left on device, write': 'ENOSPC',
'*': 'other',
}
async function getFailuresFull() {
const results = []
for await (const failure of await getFailedProjects()) {
results.push({
category:
'error' in failure ? SHORT_ERROR_NAMES[failure.error] : undefined,
...failure,
})
}
return results
}
async function getFailures() {
const { failureCounts, failureAttempts, failureRequests, maxQueueSize } =
await getFailuresByType()
let attempts, failureType, label, requests
const shortNames = {
'Error: bad response from filestore: 404': 'filestore-404',
'Error: bad response from filestore: 500': 'filestore-500',
'NotFoundError: got a 404 from web api': 'web-api-404',
'Error: history store a non-success status code: 413': 'history-store-413',
'Error: history store a non-success status code: 422': 'history-store-422',
'Error: history store a non-success status code: 500': 'history-store-500',
'Error: history store a non-success status code: 503': 'history-store-503',
'Error: web returned a non-success status code: 500 (attempts: 2)':
'web-500',
'Error: ESOCKETTIMEDOUT': 'socket-timeout',
'Error: no project found': 'no-project-found',
'OpsOutOfOrderError: project structure version out of order on incoming updates':
'incoming-project-version-out-of-order',
'OpsOutOfOrderError: doc version out of order on incoming updates':
'incoming-doc-version-out-of-order',
'OpsOutOfOrderError: project structure version out of order':
'chunk-project-version-out-of-order',
'OpsOutOfOrderError: doc version out of order':
'chunk-doc-version-out-of-order',
'Error: failed to extend lock': 'lock-overrun',
'Error: tried to release timed out lock': 'lock-overrun',
'Error: Timeout': 'lock-overrun',
'Error: sync ongoing': 'sync-ongoing',
'SyncError: unexpected resyncProjectStructure update': 'sync-error',
'[object Error]': 'unknown-error-object',
'UpdateWithUnknownFormatError: update with unknown format':
'unknown-format',
'Error: update with unknown format': 'unknown-format',
'TextOperationError: The base length of the second operation has to be the target length of the first operation':
'text-op-error',
'Error: ENOSPC: no space left on device, write': 'ENOSPC',
'*': 'other',
}
// set all the known errors to zero if not present (otherwise gauges stay on their last value)
const summaryCounts = {}
@@ -215,8 +233,8 @@ async function getFailures() {
const summaryRequests = {}
const summaryMaxQueueSize = {}
for (failureType in shortNames) {
label = shortNames[failureType]
for (failureType in SHORT_ERROR_NAMES) {
label = SHORT_ERROR_NAMES[failureType]
summaryCounts[label] = 0
summaryAttempts[label] = 0
summaryRequests[label] = 0
@@ -226,7 +244,7 @@ async function getFailures() {
// record a metric for each type of failure
for (failureType in failureCounts) {
const failureCount = failureCounts[failureType]
label = shortNames[failureType] || shortNames['*']
label = SHORT_ERROR_NAMES[failureType] || SHORT_ERROR_NAMES['*']
summaryCounts[label] += failureCount
summaryAttempts[label] += failureAttempts[failureType]
summaryRequests[label] += failureRequests[failureType]
@@ -266,6 +284,7 @@ async function getFailures() {
// EXPORTS
const getFailuresFullCb = callbackify(getFailuresFull)
const getFailedProjectsCb = callbackify(getFailedProjects)
const getFailureRecordCb = callbackify(getFailureRecord)
const getFailuresCb = callbackify(getFailures)
@@ -278,6 +297,7 @@ const setForceDebugCb = callbackify(setForceDebug)
export {
cloneFailureCb as cloneFailure,
getFailuresFullCb as getFailuresFull,
getFailedProjectsCb as getFailedProjects,
getFailureRecordCb as getFailureRecord,
getLastFailureCb as getLastFailure,

View File

@@ -719,6 +719,13 @@ export function getFailures(req, res, next) {
})
}
export function getFailuresFull(req, res, next) {
ErrorRecorder.getFailuresFull((error, result) => {
if (error) return next(error)
res.send(result)
})
}
export function getQueueCounts(req, res, next) {
RedisManager.getProjectIdsWithHistoryOpsCount((err, queuedProjectsCount) => {
if (err != null) {

View File

@@ -87,6 +87,8 @@ export function initialize(app) {
app.get('/status/failures', HttpController.getFailures)
app.get('/status/failures-full', HttpController.getFailuresFull)
app.get('/status/queue', HttpController.getQueueCounts)
app.post('/retry/failures', HttpController.retryFailures)

View File

@@ -7,8 +7,10 @@ export type ProjectHistoryFailure = {
resyncAttempts: number
resyncStartedAt: Date
requestCount?: number
history: (ErrorRecord | SyncStartRecord)[]
} & ErrorRecord
history: FailureRecord[]
} & FailureRecord
type FailureRecord = ErrorRecord | SyncStartRecord
type ErrorRecord = {
error: string

View File

@@ -70,6 +70,11 @@ describe('Retrying failed projects', function () {
attempts: 1,
error: 'OError: ESOCKETTIMEDOUT',
},
{
project_id: new ObjectId(),
attempts: 1,
resyncStartedAt: new Date(),
},
])
const body = await ProjectHistoryClient.getFailures()
@@ -77,9 +82,11 @@ describe('Retrying failed projects', function () {
_.merge(baseLineFailures, {
attempts: {
'socket-timeout': 2,
other: 1,
},
counts: {
'socket-timeout': 2,
other: 1,
},
})
)

View File

@@ -316,6 +316,12 @@ async function getDebugInfo(projectId) {
)
}
async function getHistoryFailures() {
return await fetchJson(
`${settings.apis.project_history.url}/status/failures-full`
)
}
/**
* Get history changes since a given version
*
@@ -492,5 +498,6 @@ export default {
getLatestHistoryWithHistoryId,
ensureNoResyncPending,
getDebugInfo,
getHistoryFailures,
},
}

View File

@@ -68,6 +68,7 @@ import { AlgoliaConfig } from '../../../modules/algolia-search/frontend/js/types
import { WritefullPublicEnv } from '@wf/domain/writefull-public-env'
import { UserNotificationPreferences } from '../../../types/notifications'
import { SharingPermissions } from '../../../modules/sharing-permissions/app/src/types'
import { FullHistoryFailure } from '@ol-types/history/projectHistory'
export interface Meta {
'ol-ExposedSettings': ExposedSettings
@@ -249,6 +250,7 @@ export interface Meta {
'ol-primaryEmail': { email: string; confirmed: boolean }
'ol-project': any // TODO
'ol-projectEntityCounts': { files: number; docs: number }
'ol-projectHistoryFailures': FullHistoryFailure[]
'ol-projectName': string
'ol-projectSyncSuccessMessage': string
'ol-projectTags': Tag[]

View File

@@ -0,0 +1,50 @@
export type FullHistoryFailure = {
category?: string
} & ProjectHistoryFailureRecordSchema
export type ProjectHistoryFailureRecordSchema = {
_id: string
project_id: string
attempts: number
resyncAttempts: number
resyncStartedAt: string
requestCount?: number
history: ProjectHistoryFailureEntrySchema[]
} & ProjectHistoryFailureEntrySchema
export type ProjectHistoryFailureEntrySchema =
| SyncStartRecordSchema
| ErrorRecordSchema
export type ErrorRecordSchema = {
error: string
stack: string
queueSize: number
ts: string
}
export type SyncStartRecordSchema = {
resyncStartedAt: string
}
export type SyncStateSchema = {
resyncProjectStructure: boolean
resyncDocContents: string[]
origin: { kind: string }
}
export type SyncStateHistoryEntry = {
syncState: SyncStateSchema
timestamp: string
}
export type HistoryDebugInfoResponse = {
failureRecord?: ProjectHistoryFailureRecordSchema
syncState: SyncStateSchema & {
resyncPending: boolean
resyncCount: number
resyncPendingSince?: string
lastUpdated: string
history: SyncStateHistoryEntry[]
}
}