From be970526f4eefbe00dc76e186f5bfbc9db71eaa9 Mon Sep 17 00:00:00 2001
From: Brian Gough
Date: Tue, 30 Nov 2021 15:06:33 +0000
Subject: [PATCH] Merge pull request #5961 from overleaf/bg-remove-deleted-docs

[document-updater] remove deleted docs from redis

GitOrigin-RevId: ec9ad55d3c5cd9b55f56599de671068c00442f49
---
Review notes: removeDeletedDoc now awaits the redis removal, and the entry
point handles a failed mongo connection. The blob id on the index line below
is the one from the original commit.

 .../scripts/remove_deleted_docs.js            | 177 ++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 services/document-updater/scripts/remove_deleted_docs.js

diff --git a/services/document-updater/scripts/remove_deleted_docs.js b/services/document-updater/scripts/remove_deleted_docs.js
new file mode 100644
index 0000000000..5e0393dd93
--- /dev/null
+++ b/services/document-updater/scripts/remove_deleted_docs.js
@@ -0,0 +1,177 @@
+// Maintenance script: scans redis for doc keys whose doc is no longer in the
+// mongo `docs` collection and removes them from redis, but only when the
+// owning project is missing from mongo as well.
+//
+// Runs in dry-run mode unless the DRY_RUN environment variable is exactly
+// 'false'.
+const Settings = require('@overleaf/settings')
+const logger = require('@overleaf/logger')
+const rclient = require('@overleaf/redis-wrapper').createClient(
+  Settings.redis.documentupdater
+)
+const keys = Settings.redis.documentupdater.key_schema
+const ProjectFlusher = require('app/js/ProjectFlusher')
+const RedisManager = require('app/js/RedisManager')
+const util = require('util')
+// RedisManager.getDoc passes several values to its callback; gather them into
+// one array so that the promisified version resolves with all of them.
+const getDoc = util.promisify((projectId, docId, cb) =>
+  RedisManager.getDoc(projectId, docId, (err, ...args) => cb(err, args))
+)
+const removeDocFromMemory = util.promisify(RedisManager.removeDocFromMemory)
+const { MongoClient, ObjectId } = require('mongodb')
+
+const clientPromise = MongoClient.connect(
+  Settings.mongo.url,
+  Settings.mongo.options
+)
+
+// Running totals, reported in the dry-run and removal log entries.
+const summary = { totalDocs: 0, deletedDocs: 0, skippedDocs: 0 }
+
+// Collection handles, filled in once the mongo connection is established.
+// This handler is registered before the one at the bottom of the file that
+// starts the scan, so the handles are set before any query runs.
+const db = {}
+clientPromise.then(client => {
+  db.docs = client.db().collection('docs')
+  db.projects = client.db().collection('projects')
+})
+
+/**
+ * Check each doc referenced by a batch of redis keys and try to remove those
+ * that have no matching document in the mongo `docs` collection.
+ *
+ * A failure on one doc is logged and does not stop the rest of the batch.
+ *
+ * @param {string[]} dockeys - redis keys as returned by SCAN
+ * @param {{dryRun: boolean}} options - passed through to removeDeletedDoc
+ */
+async function removeDeletedDocs(dockeys, options) {
+  const docIds = ProjectFlusher._extractIds(dockeys)
+  for (const docId of docIds) {
+    summary.totalDocs++
+    const docCount = await db.docs.find({ _id: ObjectId(docId) }).count()
+    if (!docCount) {
+      try {
+        await removeDeletedDoc(docId, options)
+      } catch (err) {
+        logger.error({ docId, err }, 'error removing deleted doc')
+      }
+    }
+  }
+}
+
+/**
+ * Remove a single doc from redis, provided its project no longer exists in
+ * mongo. If the project still exists the doc is left alone and counted as
+ * skipped.
+ *
+ * @param {string} docId
+ * @param {{dryRun: boolean}} options - when dryRun is set, only log what
+ *   would be removed
+ */
+async function removeDeletedDoc(docId, options) {
+  const projectId = await rclient.get(keys.projectKey({ doc_id: docId }))
+
+  const [
+    docLines,
+    version,
+    ranges,
+    pathname,
+    projectHistoryId,
+    unflushedTime,
+    lastUpdatedAt,
+    lastUpdatedBy,
+  ] = await getDoc(projectId, docId)
+
+  // Fields describing the doc, shared by every log entry below. Only the
+  // lengths of docLines and ranges are logged, not their contents.
+  const docInfo = {
+    projectId,
+    docId,
+    docLinesBytes: docLines && docLines.length,
+    version,
+    rangesBytes: ranges && ranges.length,
+    pathname,
+    projectHistoryId,
+    unflushedTime,
+    lastUpdatedAt,
+    lastUpdatedBy,
+  }
+
+  // NOTE(review): if the project key is missing, projectId is null here and
+  // ObjectId(null) appears to generate a fresh id, so the lookup finds
+  // nothing and the doc is treated as belonging to a deleted project -
+  // confirm that this is intended.
+  const project = await db.projects.findOne({ _id: ObjectId(projectId) })
+
+  if (project) {
+    const projectJSON = JSON.stringify(project.rootFolder)
+    const containsDoc = projectJSON.includes(docId)
+    logger.warn(
+      docInfo,
+      containsDoc
+        ? 'refusing to delete doc, project contains docId'
+        : 'refusing to delete doc, project still exists'
+    )
+    summary.skippedDocs++
+    return
+  }
+
+  const status = 'projectDeleted'
+  if (options.dryRun) {
+    summary.deletedDocs++
+    logger.info(
+      { ...docInfo, status, summary },
+      'dry run mode - would remove doc from redis'
+    )
+    return
+  }
+  // Wait for the removal to finish so that a failure is reported by the
+  // caller, the doc is only counted and logged as removed once it really has
+  // been, and the redis connection is not closed while it is in flight.
+  await removeDocFromMemory(projectId, docId)
+  summary.deletedDocs++
+  logger.info({ ...docInfo, status, summary }, 'removed doc from redis')
+}
+
+/**
+ * Walk over all docLines keys in redis with SCAN, handling one batch of keys
+ * at a time until the cursor returns to '0'.
+ *
+ * @param {{limit: number, dryRun: boolean}} options - limit is passed to SCAN
+ *   as its COUNT argument
+ */
+async function findAndProcessDocs(options) {
+  logger.info({ options }, 'removing deleted docs')
+  let cursor = 0
+  do {
+    const [newCursor, doclinesKeys] = await rclient.scan(
+      cursor,
+      'MATCH',
+      keys.docLines({ doc_id: '*' }),
+      'COUNT',
+      options.limit
+    )
+    await removeDeletedDocs(doclinesKeys, options)
+    cursor = newCursor
+  } while (cursor !== '0')
+}
+
+// Entry point. Everything runs in a single promise chain so that a failed
+// mongo connection is reported and exits non-zero, like any other error.
+clientPromise
+  .then(async client => {
+    await findAndProcessDocs({
+      limit: 1000,
+      dryRun: process.env.DRY_RUN !== 'false',
+    })
+    rclient.quit()
+    client.close()
+    console.log('DONE')
+  })
+  .catch(function (error) {
+    console.error(error)
+    process.exit(1)
+  })