From f85fdd3a97d07409fda04b3231b74af931b9550b Mon Sep 17 00:00:00 2001 From: Andrew Rumble Date: Wed, 12 Mar 2025 14:55:13 +0000 Subject: [PATCH] Refactor project sampler and add new sampler type GitOrigin-RevId: 984aa35cef1165e1c8342073b9211a387bd6089e --- .../backupVerifier/ProjectSampler.mjs | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/services/history-v1/backupVerifier/ProjectSampler.mjs b/services/history-v1/backupVerifier/ProjectSampler.mjs index f1f697bd6d..93d9a1a31f 100644 --- a/services/history-v1/backupVerifier/ProjectSampler.mjs +++ b/services/history-v1/backupVerifier/ProjectSampler.mjs @@ -12,50 +12,68 @@ const HAS_PROJECTS_WITHOUT_HISTORY = * @param {Date} start * @param {Date} end * @param {number} N - * @return {Promise<Array<string>>} + * @yields {string} */ -export async function selectProjectsInDateRange(start, end, N) { - let projects = await projectsCollection - .aggregate([ - { - $match: { - _id: { - $gt: objectIdFromDate(start), - $lte: objectIdFromDate(end), - }, +export async function* getProjectsCreatedInDateRangeCursor(start, end, N) { + yield* getSampleProjectsCursor(N, [ + { + $match: { + _id: { + $gt: objectIdFromDate(start), + $lte: objectIdFromDate(end), }, }, - { $sample: { size: N } }, - { $project: { 'overleaf.history.id': 1 } }, - ]) - .toArray() - if (HAS_PROJECTS_WITHOUT_HISTORY) { - projects = projects.filter(p => Boolean(p.overleaf?.history?.id)) - if (projects.length === 0) { - // Very unlucky sample. Try again. 
- return await selectProjectsInDateRange(start, end, N) - } - } - return projects.map(p => p.overleaf.history.id.toString()) + }, + ]) } -export async function* getSampleProjectsCursor(N) { +export async function* getProjectsUpdatedInDateRangeCursor(start, end, N) { + yield* getSampleProjectsCursor(N, [ + { + $match: { + 'overleaf.history.updatedAt': { + $gt: start, + $lte: end, + }, + }, + }, + ]) +} + +/** + * @typedef {import('mongodb').Document} Document + */ + +/** + * + * @generator + * @param {number} N + * @param {Array<Document>} preSampleAggregationStages + * @yields {string} + */ +export async function* getSampleProjectsCursor( + N, + preSampleAggregationStages = [] +) { const cursor = projectsCollection.aggregate([ + ...preSampleAggregationStages, { $sample: { size: N } }, { $project: { 'overleaf.history.id': 1 } }, ]) let validProjects = 0 + let hasInvalidProject = false for await (const project of cursor) { - if (HAS_PROJECTS_WITHOUT_HISTORY) { + if (HAS_PROJECTS_WITHOUT_HISTORY && !project.overleaf?.history?.id) { + hasInvalidProject = true continue } validProjects++ yield project.overleaf.history.id.toString() } - if (validProjects === 0) { - yield* getSampleProjectsCursor(N) + if (validProjects === 0 && hasInvalidProject) { + yield* getSampleProjectsCursor(N, preSampleAggregationStages) } }