From 98ea3664f2b7f7dd9f82dbb1f65132ab27085f95 Mon Sep 17 00:00:00 2001 From: Antoine Clausse Date: Mon, 17 Feb 2025 16:56:20 +0100 Subject: [PATCH] Create script: remove_unconfirmed_emails.mjs (#23079) * Create script: remove_unconfirmed_emails.mjs * Update script remove_unconfirmed_emails.mjs after pairing Co-authored-by: Rebeka * Add user counts. Add timing * Revert previous changes: just remove the fully unconfirmed emails * Add tests on scripts/remove_unconfirmed_emails.mjs * Update audit log so `removedEmail` is a string and doesn't break the admin-panel audit logs list * Update remove_unconfirmed_emails to have a `--generate` and a `--consume` mode * Update tests on remove_unconfirmed_emails * Add tests checking that `--consume` doesn't delete any email * Update script so `--consume` checks that emails shouldn't be deleted again! * Update CSV path to `/tmp/...` * Add test cases: deleted users, deleted email, comma in email --------- Co-authored-by: Rebeka GitOrigin-RevId: 8c60b56bcdfa33bc6143d66c32a5f430fb76f6d7 --- .../src/Features/User/UserAuditLogHandler.js | 3 +- .../web/scripts/remove_unconfirmed_emails.mjs | 262 ++++++++++++++++ .../RemoveUnconfirmedEmailsScriptTests.mjs | 290 ++++++++++++++++++ 3 files changed, 554 insertions(+), 1 deletion(-) create mode 100644 services/web/scripts/remove_unconfirmed_emails.mjs create mode 100644 services/web/test/acceptance/src/RemoveUnconfirmedEmailsScriptTests.mjs diff --git a/services/web/app/src/Features/User/UserAuditLogHandler.js b/services/web/app/src/Features/User/UserAuditLogHandler.js index 63babcd639..6984c4f034 100644 --- a/services/web/app/src/Features/User/UserAuditLogHandler.js +++ b/services/web/app/src/Features/User/UserAuditLogHandler.js @@ -2,10 +2,11 @@ const OError = require('@overleaf/o-error') const { UserAuditLogEntry } = require('../../models/UserAuditLogEntry') const { callbackify } = require('util') -function _canHaveNoIpAddressId(operation) { +function _canHaveNoIpAddressId(operation, info) { if (operation === 'join-group-subscription') return true if (operation === 'leave-group-subscription') return true if (operation === 'must-reset-password-set') return true + if (operation === 'remove-email' && info.script) return true return false } diff --git a/services/web/scripts/remove_unconfirmed_emails.mjs b/services/web/scripts/remove_unconfirmed_emails.mjs new file mode 100644 index 0000000000..e86540dba6 --- /dev/null +++ b/services/web/scripts/remove_unconfirmed_emails.mjs @@ -0,0 +1,262 @@ +// @ts-check + +import minimist from 'minimist' +import { batchedUpdate } from '@overleaf/mongo-utils/batchedUpdate.js' +import { db, ObjectId } from '../app/src/infrastructure/mongodb.js' +import UserAuditLogHandler from '../app/src/Features/User/UserAuditLogHandler.js' +import fs from 'node:fs/promises' +import * as csv from 'csv' +import { promisify } from 'node:util' +import _ from 'lodash' + +const CSV_FILENAME = '/tmp/remove_unconfirmed_emails.csv' +/** + * @type {(records: string[][]) => Promise} + */ +const stringifyAsync = promisify(csv.stringify) +/** + * @type {(csvString: string) => Promise} + */ +const parseAsync = promisify(csv.parse) + +function usage() { + console.log('Usage: node remove_unconfirmed_emails.mjs') + console.log('Removes unconfirmed emails from users') + console.log('Options:') + console.log( + '' + + ' --generate generate the CSV file (remove_unconfirmed_emails.csv) containing the emails to remove\n' + + ' --consume consume the CSV file (remove_unconfirmed_emails.csv) and remove the emails (by default it is a dry-run)\n' + + ' --commit apply the changes (to be used with --consume)\n' + ) + process.exit(0) +} + +const { generate, consume, commit, help } = minimist(process.argv.slice(2), { + boolean: ['generate', 'consume', 'commit', 'help'], + alias: { help: 'h' }, + default: { generate: false, consume: false, commit: false }, +}) + +async function generateCsvFile() { + console.time('generate_csv') + + let processedUsersCount = 0 + let skippedUnconfirmedPrimaries = 0 + let totalEmailsToRemove = 0 + let totalUsersInCsv = 0 + + const records = [['User ID', 'Email', 'Sign Up Date']] + + await batchedUpdate( + db.users, + { + $and: [ + { emails: { $exists: true } }, + { emails: { $not: { $size: 0 } } }, + // Warning: this also matches unconfirmed primary emails + { + emails: { + $elemMatch: { + $or: [{ confirmedAt: { $exists: false } }, { confirmedAt: null }], + }, + }, + }, + ], + }, + async users => { + console.log('Process', users.length, 'users') + processedUsersCount += users.length + + for (const user of users) { + const unconfirmedSecondaries = user.emails.filter( + email => !email.confirmedAt && email.email !== user.email + ) + + if (unconfirmedSecondaries.length === 0) { + // Users can have been selected because of their unconfirmed primary email + // we don't want to remove those + skippedUnconfirmedPrimaries++ + continue + } + + for (const email of unconfirmedSecondaries) { + records.push([ + user._id.toString(), + email.email, + user.signUpDate.toISOString(), + ]) + } + + totalUsersInCsv++ + totalEmailsToRemove += unconfirmedSecondaries.length + } + }, + { _id: 1, signUpDate: 1, emails: 1, email: 1 } + ) + + const csvContent = await stringifyAsync(records) + await fs.writeFile(CSV_FILENAME, csvContent) + + console.log() + console.log('Processed users:', processedUsersCount) + console.log() + console.log('Generated CSV file:', CSV_FILENAME) + console.log('Total emails in the CSV:', totalEmailsToRemove) + console.log('Total users in the CSV:', totalUsersInCsv) + console.log( + 'Unconfirmed primary emails (skipped):', + skippedUnconfirmedPrimaries + ) + console.log() + console.timeEnd('generate_csv') + console.log() +} + +async function consumeCsvFile() { + console.time('consume_csv') + + const csvContent = await fs.readFile(CSV_FILENAME, 'utf8') + const rows = await parseAsync(csvContent) + rows.shift() // Remove header row + const emailsByUserId = {} + + for (const [userId, email] of rows) { + if (!emailsByUserId[userId]) { + emailsByUserId[userId] = [] + } + emailsByUserId[userId].push(email) + } + + const userIds = Object.keys(emailsByUserId) + let processedUsersCount = 0 + let removedEmailsCount = 0 + let totalModifiedUsersCount = 0 + const skippedEmail = { + userNotFound: 0, + nowConfirmed: 0, + nowPrimary: 0, + nowRemoved: 0, + } + + console.log('Total emails in the CSV:', rows.length) + console.log('Total users in the CSV:', userIds.length) + + for (const userId of userIds) { + const emailsToRemove = emailsByUserId[userId] + + const user = await db.users.findOne({ _id: new ObjectId(userId) }) + if (!user) { + skippedEmail.userNotFound += emailsToRemove.length + continue + } + + const emailsToRemoveNow = emailsToRemove.filter(email => { + const currentEmail = user.emails.find(e => e.email === email) + if (!currentEmail) { + skippedEmail.nowRemoved++ + return false + } + if (currentEmail.confirmedAt) { + skippedEmail.nowConfirmed++ + return false + } + if (currentEmail.email === user.email) { + skippedEmail.nowPrimary++ + return false + } + return true + }) + + removedEmailsCount += emailsToRemoveNow.length + + if (commit && emailsToRemoveNow.length > 0) { + for (const email of emailsToRemove) { + await UserAuditLogHandler.promises.addEntry( + userId, + 'remove-email', + undefined, + undefined, + { + removedEmail: email, + script: true, + note: 'remove unconfirmed secondary emails', + } + ) + } + + const updated = await db.users.updateOne( + { _id: new ObjectId(userId) }, + { $pull: { emails: { email: { $in: emailsToRemove } } } } + ) + totalModifiedUsersCount += updated.modifiedCount + } + + processedUsersCount++ + if (processedUsersCount % 100 === 0) { + console.log('Processed', processedUsersCount, 'users') + } + } + + console.log() + if (!commit) { + console.log('Dry-run, use --commit to apply changes') + console.log('This would be the result:') + console.log() + } + + console.log('Total emails in the CSV:', rows.length) + console.log('Total users in the CSV:', userIds.length) + console.log('Total users processed:', processedUsersCount) + console.log('Total emails removed:', removedEmailsCount) + console.log('Skipped emails:', _.sum(Object.values(skippedEmail))) + console.log(' - User not found:', skippedEmail.userNotFound) + console.log(' - Email now confirmed:', skippedEmail.nowConfirmed) + console.log(' - Email now primary:', skippedEmail.nowPrimary) + console.log(' - Email now removed:', skippedEmail.nowRemoved) + console.log() + + if (commit) { + console.log('Total users modified:', totalModifiedUsersCount) + } else { + console.log('Note: this was a dry-run. No changes were made.') + } + console.log() + console.timeEnd('consume_csv') + console.log() +} + +async function main() { + if (help) { + return usage() + } + + if (!generate && !consume) { + console.error('Error: Either --generate or --consume must be specified') + return usage() + } + + if (generate && consume) { + console.error('Error: Cannot use both --generate and --consume together') + return usage() + } + + if (commit && !consume) { + console.error('Error: --commit can only be used with --consume') + return usage() + } + + if (generate) { + await generateCsvFile() + } else if (consume) { + await consumeCsvFile() + } +} + +try { + await main() + process.exit(0) +} catch (error) { + console.error(error) + process.exit(1) +} diff --git a/services/web/test/acceptance/src/RemoveUnconfirmedEmailsScriptTests.mjs b/services/web/test/acceptance/src/RemoveUnconfirmedEmailsScriptTests.mjs new file mode 100644 index 0000000000..6bb59965cc --- /dev/null +++ b/services/web/test/acceptance/src/RemoveUnconfirmedEmailsScriptTests.mjs @@ -0,0 +1,290 @@ +import { promisify } from 'node:util' +import { exec } from 'node:child_process' +import { expect } from 'chai' +import { filterOutput } from './helpers/settings.mjs' +import { db, ObjectId } from '../../../app/src/infrastructure/mongodb.js' +import fs from 'node:fs/promises' + +const CSV_FILENAME = '/tmp/remove_unconfirmed_emails.csv' + +async function runScript(mode, commit) { + const result = await promisify(exec)( + [ + 'node', + 'scripts/remove_unconfirmed_emails.mjs', + mode === 'generate' ? '--generate' : '--consume', + commit && '--commit', + ] + .filter(Boolean) + .join(' ') + ) + return { + ...result, + stdout: result.stdout.split('\n').filter(filterOutput), + } +} + +function createUser(signUpDate, emails, userIdx) { + const email = `primary${userIdx ?? ''}@overleaf.com` + return { + _id: new ObjectId(), + email, + emails, + signUpDate, + } +} + +describe('scripts/remove_unconfirmed_emails', function () { + let user + + afterEach(async function () { + try { + await fs.unlink(CSV_FILENAME) + } catch (err) { + // Ignore errors if file doesn't exist + } + }) + + describe('when removing unconfirmed secondary emails', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com', confirmedAt: new Date() }, + { email: 'unconfirmed1@overleaf.com' }, + { email: 'unconfirmed-special-,\'"@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should remove all unconfirmed secondary emails', async function () { + await runScript('generate') + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 2') + expect(r.stdout).to.include('Total users processed: 1') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(1) + expect(updatedUser.emails[0].email).to.equal(user.email) + }) + + it('should not modify anything in dry run mode', async function () { + await runScript('generate') + const r = await runScript('consume', false) + + expect(r.stdout).to.include('Total emails in the CSV: 2') + expect(r.stdout).to.include('Total users processed: 1') + expect(r.stdout).to.include( + 'Note: this was a dry-run. No changes were made.' + ) + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(3) + }) + }) + + describe('when handling confirmed secondary emails', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com', confirmedAt: new Date() }, + { email: 'confirmed@overleaf.com', confirmedAt: new Date() }, + ]) + await db.users.insertOne(user) + }) + + it('should preserve confirmed secondary emails', async function () { + await runScript('generate') + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 0') + expect(r.stdout).to.include('Total users processed: 0') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(2) + expect(updatedUser.emails[1].confirmedAt).to.exist + }) + }) + + describe('when handling unconfirmed primary emails', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should not remove unconfirmed primary emails', async function () { + await runScript('generate') + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 0') + expect(r.stdout).to.include('Total users processed: 0') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(1) + expect(updatedUser.emails[0].email).to.equal('primary@overleaf.com') + }) + }) + + describe('when users confirmed their email in between', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com' }, + { email: 'secondary@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should not remove emails from users who confirmed their email in between', async function () { + await runScript('generate') + + await db.users.updateOne( + { _id: user._id }, + { $set: { 'emails.1.confirmedAt': new Date() } } + ) + + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 1') + expect(r.stdout).to.include('Skipped emails: 1') + expect(r.stdout).to.include(' - Email now confirmed: 1') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(2) + }) + }) + + describe('when users changed their primary email in between', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com' }, + { email: 'secondary@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should not remove emails from users who changed their primary email in between', async function () { + await runScript('generate') + + await db.users.updateOne( + { _id: user._id }, + { $set: { email: 'secondary@overleaf.com' } } + ) + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 1') + expect(r.stdout).to.include('Skipped emails: 1') + expect(r.stdout).to.include(' - Email now primary: 1') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(2) + }) + }) + + describe('when users account was deleted in between', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com' }, + { email: 'secondary@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should skip emails from users whose account was deleted', async function () { + await runScript('generate') + + // Delete the user + await db.users.deleteOne({ _id: user._id }) + + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 1') + expect(r.stdout).to.include('Skipped emails: 1') + expect(r.stdout).to.include(' - User not found: 1') + }) + }) + + describe('when users email was deleted in between', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com' }, + { email: 'secondary@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should skip emails that were already removed', async function () { + await runScript('generate') + + // Remove the secondary email + await db.users.updateOne( + { _id: user._id }, + { $pull: { emails: { email: 'secondary@overleaf.com' } } } + ) + + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 1') + expect(r.stdout).to.include('Skipped emails: 1') + expect(r.stdout).to.include(' - Email now removed: 1') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(1) + expect(updatedUser.emails[0].email).to.equal('primary@overleaf.com') + }) + }) + + describe('when handling confirmation field edge cases', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com', confirmedAt: new Date() }, + { email: 'secondary1@overleaf.com', confirmedAt: null }, + { email: 'secondary2@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should remove emails with both missing and null confirmedAt', async function () { + await runScript('generate') + const r = await runScript('consume', true) + + expect(r.stdout).to.include('Total emails in the CSV: 2') + expect(r.stdout).to.include('Total users processed: 1') + + const updatedUser = await db.users.findOne({ _id: user._id }) + expect(updatedUser.emails).to.have.length(1) + expect(updatedUser.emails[0].email).to.equal(user.email) + }) + }) + + describe('CSV file generation', function () { + beforeEach(async function () { + user = createUser(new Date('2000-01-01'), [ + { email: 'primary@overleaf.com', confirmedAt: new Date() }, + { email: 'unconfirmed1@overleaf.com' }, + { email: 'confirmed1@overleaf.com', confirmedAt: new Date() }, + { email: 'unconfirmed2@overleaf.com' }, + { email: '!,@overleaf.com' }, + { email: "!'@overleaf.com" }, + { email: '!,\'"@overleaf.com' }, + ]) + await db.users.insertOne(user) + }) + + it('should generate a valid CSV file', async function () { + const r = await runScript('generate') + + expect(r.stdout).to.include( + 'Generated CSV file: /tmp/remove_unconfirmed_emails.csv' + ) + expect(r.stdout).to.include('Total emails in the CSV: 5') + const csvContent = await fs.readFile(CSV_FILENAME, 'utf8') + expect(csvContent).to.equal(`User ID,Email,Sign Up Date +${user._id},unconfirmed1@overleaf.com,2000-01-01T00:00:00.000Z +${user._id},unconfirmed2@overleaf.com,2000-01-01T00:00:00.000Z +${user._id},"!,@overleaf.com",2000-01-01T00:00:00.000Z +${user._id},!'@overleaf.com,2000-01-01T00:00:00.000Z +${user._id},"!,'""@overleaf.com",2000-01-01T00:00:00.000Z +`) + }) + }) +})