diff --git a/services/project-history/app/js/ErrorRecorder.js b/services/project-history/app/js/ErrorRecorder.js index fbce06081f..4977229779 100644 --- a/services/project-history/app/js/ErrorRecorder.js +++ b/services/project-history/app/js/ErrorRecorder.js @@ -11,15 +11,18 @@ import { db } from './mongodb.js' */ /** - * @template {{error?: string}} T + * @template {{error: string}|{}} T * @param {T} failure * @return {T} */ function normalizeFailure(failure) { - return { - ...failure, - error: failure.error?.replace('OError:', 'Error:'), + if ('error' in failure && failure.error?.includes('OError:')) { + return { + ...failure, + error: failure.error.replace('OError:', 'Error:'), + } } + return failure } /** @@ -149,10 +152,10 @@ async function getFailuresByType() { const maxQueueSize = {} // count all the failures and number of attempts by type for (const result of results || []) { - const failureType = result.error + const failureType = 'error' in result ? result.error : 'resync' const attempts = result.attempts || 1 // allow for field to be absent const requests = result.requestCount || 0 - const queueSize = result.queueSize || 0 + const queueSize = 'queueSize' in result ? result.queueSize : 0 if (failureCounts[failureType] > 0) { failureCounts[failureType]++ failureAttempts[failureType] += attempts @@ -169,45 +172,60 @@ async function getFailuresByType() { return { failureCounts, failureAttempts, failureRequests, maxQueueSize } } +/** + * Mapping between error messages and short labels. + * @type {Record} + */ +const SHORT_ERROR_NAMES = { + 'Error: bad response from filestore: 404': 'filestore-404', + 'Error: bad response from filestore: 500': 'filestore-500', + 'NotFoundError: got a 404 from web api': 'web-api-404', + 'Error: history store a non-success status code: 413': 'history-store-413', + 'Error: history store a non-success status code: 422': 'history-store-422', + 'Error: history store a non-success status code: 500': 'history-store-500', + 'Error: history store a non-success status code: 503': 'history-store-503', + 'Error: web returned a non-success status code: 500 (attempts: 2)': 'web-500', + 'Error: ESOCKETTIMEDOUT': 'socket-timeout', + 'Error: no project found': 'no-project-found', + 'OpsOutOfOrderError: project structure version out of order on incoming updates': + 'incoming-project-version-out-of-order', + 'OpsOutOfOrderError: doc version out of order on incoming updates': + 'incoming-doc-version-out-of-order', + 'OpsOutOfOrderError: project structure version out of order': + 'chunk-project-version-out-of-order', + 'OpsOutOfOrderError: doc version out of order': + 'chunk-doc-version-out-of-order', + 'Error: failed to extend lock': 'lock-overrun', + 'Error: tried to release timed out lock': 'lock-overrun', + 'Error: Timeout': 'lock-overrun', + 'Error: sync ongoing': 'sync-ongoing', + 'SyncError: unexpected resyncProjectStructure update': 'sync-error', + '[object Error]': 'unknown-error-object', + 'UpdateWithUnknownFormatError: update with unknown format': 'unknown-format', + 'Error: update with unknown format': 'unknown-format', + 'TextOperationError: The base length of the second operation has to be the target length of the first operation': + 'text-op-error', + 'Error: ENOSPC: no space left on device, write': 'ENOSPC', + '*': 'other', +} + +async function getFailuresFull() { + const results = [] + for await (const failure of await getFailedProjects()) { + results.push({ + category: + 'error' in failure ? SHORT_ERROR_NAMES[failure.error] : undefined, + ...failure, + }) + } + return results +} + async function getFailures() { const { failureCounts, failureAttempts, failureRequests, maxQueueSize } = await getFailuresByType() let attempts, failureType, label, requests - const shortNames = { - 'Error: bad response from filestore: 404': 'filestore-404', - 'Error: bad response from filestore: 500': 'filestore-500', - 'NotFoundError: got a 404 from web api': 'web-api-404', - 'Error: history store a non-success status code: 413': 'history-store-413', - 'Error: history store a non-success status code: 422': 'history-store-422', - 'Error: history store a non-success status code: 500': 'history-store-500', - 'Error: history store a non-success status code: 503': 'history-store-503', - 'Error: web returned a non-success status code: 500 (attempts: 2)': - 'web-500', - 'Error: ESOCKETTIMEDOUT': 'socket-timeout', - 'Error: no project found': 'no-project-found', - 'OpsOutOfOrderError: project structure version out of order on incoming updates': - 'incoming-project-version-out-of-order', - 'OpsOutOfOrderError: doc version out of order on incoming updates': - 'incoming-doc-version-out-of-order', - 'OpsOutOfOrderError: project structure version out of order': - 'chunk-project-version-out-of-order', - 'OpsOutOfOrderError: doc version out of order': - 'chunk-doc-version-out-of-order', - 'Error: failed to extend lock': 'lock-overrun', - 'Error: tried to release timed out lock': 'lock-overrun', - 'Error: Timeout': 'lock-overrun', - 'Error: sync ongoing': 'sync-ongoing', - 'SyncError: unexpected resyncProjectStructure update': 'sync-error', - '[object Error]': 'unknown-error-object', - 'UpdateWithUnknownFormatError: update with unknown format': - 'unknown-format', - 'Error: update with unknown format': 'unknown-format', - 'TextOperationError: The base length of the second operation has to be the target length of the first operation': - 'text-op-error', - 'Error: ENOSPC: no space left on device, write': 'ENOSPC', - '*': 'other', - } // set all the known errors to zero if not present (otherwise gauges stay on their last value) const summaryCounts = {} @@ -215,8 +233,8 @@ async function getFailures() { const summaryRequests = {} const summaryMaxQueueSize = {} - for (failureType in shortNames) { - label = shortNames[failureType] + for (failureType in SHORT_ERROR_NAMES) { + label = SHORT_ERROR_NAMES[failureType] summaryCounts[label] = 0 summaryAttempts[label] = 0 summaryRequests[label] = 0 @@ -226,7 +244,7 @@ async function getFailures() { // record a metric for each type of failure for (failureType in failureCounts) { const failureCount = failureCounts[failureType] - label = shortNames[failureType] || shortNames['*'] + label = SHORT_ERROR_NAMES[failureType] || SHORT_ERROR_NAMES['*'] summaryCounts[label] += failureCount summaryAttempts[label] += failureAttempts[failureType] summaryRequests[label] += failureRequests[failureType] @@ -266,6 +284,7 @@ async function getFailures() { // EXPORTS +const getFailuresFullCb = callbackify(getFailuresFull) const getFailedProjectsCb = callbackify(getFailedProjects) const getFailureRecordCb = callbackify(getFailureRecord) const getFailuresCb = callbackify(getFailures) @@ -278,6 +297,7 @@ const setForceDebugCb = callbackify(setForceDebug) export { cloneFailureCb as cloneFailure, + getFailuresFullCb as getFailuresFull, getFailedProjectsCb as getFailedProjects, getFailureRecordCb as getFailureRecord, getLastFailureCb as getLastFailure, diff --git a/services/project-history/app/js/HttpController.js b/services/project-history/app/js/HttpController.js index bf4f1e7c9a..d38856bc64 100644 --- a/services/project-history/app/js/HttpController.js +++ b/services/project-history/app/js/HttpController.js @@ -719,6 +719,13 @@ export function getFailures(req, res, next) { }) } +export function getFailuresFull(req, res, next) { + ErrorRecorder.getFailuresFull((error, result) => { + if (error) return next(error) + res.send(result) + }) +} + export function getQueueCounts(req, res, next) { RedisManager.getProjectIdsWithHistoryOpsCount((err, queuedProjectsCount) => { if (err != null) { diff --git a/services/project-history/app/js/Router.js b/services/project-history/app/js/Router.js index 0907cb8e1f..dffd9ac39e 100644 --- a/services/project-history/app/js/Router.js +++ b/services/project-history/app/js/Router.js @@ -87,6 +87,8 @@ export function initialize(app) { app.get('/status/failures', HttpController.getFailures) + app.get('/status/failures-full', HttpController.getFailuresFull) + app.get('/status/queue', HttpController.getQueueCounts) app.post('/retry/failures', HttpController.retryFailures) diff --git a/services/project-history/app/js/mongo-types.ts b/services/project-history/app/js/mongo-types.ts index 9894e653d2..f0e63c7b5b 100644 --- a/services/project-history/app/js/mongo-types.ts +++ b/services/project-history/app/js/mongo-types.ts @@ -7,8 +7,10 @@ export type ProjectHistoryFailure = { resyncAttempts: number resyncStartedAt: Date requestCount?: number - history: (ErrorRecord | SyncStartRecord)[] -} & ErrorRecord + history: FailureRecord[] +} & FailureRecord + +type FailureRecord = ErrorRecord | SyncStartRecord type ErrorRecord = { error: string diff --git a/services/project-history/test/acceptance/js/RetryTests.js b/services/project-history/test/acceptance/js/RetryTests.js index 3e5c5d66f8..eac1e2b71c 100644 --- a/services/project-history/test/acceptance/js/RetryTests.js +++ b/services/project-history/test/acceptance/js/RetryTests.js @@ -70,6 +70,11 @@ describe('Retrying failed projects', function () { attempts: 1, error: 'OError: ESOCKETTIMEDOUT', }, + { + project_id: new ObjectId(), + attempts: 1, + resyncStartedAt: new Date(), + }, ]) const body = await ProjectHistoryClient.getFailures() @@ -77,9 +82,11 @@ describe('Retrying failed projects', function () { _.merge(baseLineFailures, { attempts: { 'socket-timeout': 2, + other: 1, }, counts: { 'socket-timeout': 2, + other: 1, }, }) ) diff --git a/services/web/app/src/Features/History/HistoryManager.mjs b/services/web/app/src/Features/History/HistoryManager.mjs index c8db62c176..36d4897c59 100644 --- a/services/web/app/src/Features/History/HistoryManager.mjs +++ b/services/web/app/src/Features/History/HistoryManager.mjs @@ -316,6 +316,12 @@ async function getDebugInfo(projectId) { ) } +async function getHistoryFailures() { + return await fetchJson( + `${settings.apis.project_history.url}/status/failures-full` + ) +} + /** * Get history changes since a given version * @@ -492,5 +498,6 @@ export default { getLatestHistoryWithHistoryId, ensureNoResyncPending, getDebugInfo, + getHistoryFailures, }, } diff --git a/services/web/frontend/js/utils/meta.ts b/services/web/frontend/js/utils/meta.ts index 3706a0911a..9779371f78 100644 --- a/services/web/frontend/js/utils/meta.ts +++ b/services/web/frontend/js/utils/meta.ts @@ -68,6 +68,7 @@ import { AlgoliaConfig } from '../../../modules/algolia-search/frontend/js/types import { WritefullPublicEnv } from '@wf/domain/writefull-public-env' import { UserNotificationPreferences } from '../../../types/notifications' import { SharingPermissions } from '../../../modules/sharing-permissions/app/src/types' +import { FullHistoryFailure } from '@ol-types/history/projectHistory' export interface Meta { 'ol-ExposedSettings': ExposedSettings @@ -249,6 +250,7 @@ export interface Meta { 'ol-primaryEmail': { email: string; confirmed: boolean } 'ol-project': any // TODO 'ol-projectEntityCounts': { files: number; docs: number } + 'ol-projectHistoryFailures': FullHistoryFailure[] 'ol-projectName': string 'ol-projectSyncSuccessMessage': string 'ol-projectTags': Tag[] diff --git a/services/web/types/history/projectHistory.ts b/services/web/types/history/projectHistory.ts new file mode 100644 index 0000000000..74c89b04af --- /dev/null +++ b/services/web/types/history/projectHistory.ts @@ -0,0 +1,50 @@ +export type FullHistoryFailure = { + category?: string +} & ProjectHistoryFailureRecordSchema + +export type ProjectHistoryFailureRecordSchema = { + _id: string + project_id: string + attempts: number + resyncAttempts: number + resyncStartedAt: string + requestCount?: number + history: ProjectHistoryFailureEntrySchema[] +} & ProjectHistoryFailureEntrySchema + +export type ProjectHistoryFailureEntrySchema = + | SyncStartRecordSchema + | ErrorRecordSchema + +export type ErrorRecordSchema = { + error: string + stack: string + queueSize: number + ts: string +} + +export type SyncStartRecordSchema = { + resyncStartedAt: string +} + +export type SyncStateSchema = { + resyncProjectStructure: boolean + resyncDocContents: string[] + origin: { kind: string } +} + +export type SyncStateHistoryEntry = { + syncState: SyncStateSchema + timestamp: string +} + +export type HistoryDebugInfoResponse = { + failureRecord?: ProjectHistoryFailureRecordSchema + syncState: SyncStateSchema & { + resyncPending: boolean + resyncCount: number + resyncPendingSince?: string + lastUpdated: string + history: SyncStateHistoryEntry[] + } +}