Create script: remove_unconfirmed_emails.mjs (#23079)

* Create script: remove_unconfirmed_emails.mjs

* Update script remove_unconfirmed_emails.mjs after pairing

Co-authored-by: Rebeka <rebeka.dekany@overleaf.com>

* Add user counts. Add timing

* Revert previous changes: just remove the fully unconfirmed emails

* Add tests on scripts/remove_unconfirmed_emails.mjs

* Update audit log so `removedEmail` is a string and doesn't break the admin-panel audit logs list

* Update remove_unconfirmed_emails to have a `--generate` and a `--consume` mode

* Update tests on remove_unconfirmed_emails

* Add tests checking that `--consume` doesn't delete any email

* Update script so `--consume` checks that emails shouldn't be deleted again!

* Update CSV path to `/tmp/...`

* Add test cases: deleted users, deleted email, comma in email

---------

Co-authored-by: Rebeka <rebeka.dekany@overleaf.com>
GitOrigin-RevId: 8c60b56bcdfa33bc6143d66c32a5f430fb76f6d7
This commit is contained in:
Antoine Clausse
2025-02-17 16:56:20 +01:00
committed by Copybot
parent b79af7f6d3
commit 98ea3664f2
3 changed files with 554 additions and 1 deletions
@@ -2,10 +2,11 @@ const OError = require('@overleaf/o-error')
const { UserAuditLogEntry } = require('../../models/UserAuditLogEntry')
const { callbackify } = require('util')
/**
 * Tells whether `operation` is one of the audit log operations allowed by
 * this allow-list (judging by the name: entries that may be recorded without
 * an IP address / initiator id — confirm against the caller).
 *
 * @param {string} operation - audit log operation name
 * @param {{ script?: boolean } | undefined} [info] - extra details of the
 *   entry; only `script` is inspected, and only for `remove-email`
 * @returns {boolean} true when the operation is on the allow-list
 */
function _canHaveNoIpAddressId(operation, info) {
  if (operation === 'join-group-subscription') return true
  if (operation === 'leave-group-subscription') return true
  if (operation === 'must-reset-password-set') return true
  // `remove-email` only qualifies when the entry is flagged as coming from a
  // script. `info` may be omitted by callers, so it must not be dereferenced
  // unconditionally.
  if (operation === 'remove-email' && info?.script) return true
  return false
}
@@ -0,0 +1,262 @@
// @ts-check
import minimist from 'minimist'
import { batchedUpdate } from '@overleaf/mongo-utils/batchedUpdate.js'
import { db, ObjectId } from '../app/src/infrastructure/mongodb.js'
import UserAuditLogHandler from '../app/src/Features/User/UserAuditLogHandler.js'
import fs from 'node:fs/promises'
import * as csv from 'csv'
import { promisify } from 'node:util'
import _ from 'lodash'
// Path of the CSV file written by `--generate` and read back by `--consume`
const CSV_FILENAME = '/tmp/remove_unconfirmed_emails.csv'
/**
 * Promise-returning wrapper around the callback-style `csv.stringify`:
 * serialises an array of rows (each row an array of strings) to CSV text.
 *
 * @type {(records: string[][]) => Promise<string>}
 */
const stringifyAsync = promisify(csv.stringify)
/**
 * Promise-returning wrapper around the callback-style `csv.parse`:
 * parses CSV text into an array of rows (each row an array of strings).
 *
 * @type {(csvString: string) => Promise<string[][]>}
 */
const parseAsync = promisify(csv.parse)
/**
 * Prints the command-line help of this script on stdout and then terminates
 * the process with exit code 0.
 */
function usage() {
  // The three option descriptions are emitted as one multi-line message
  const optionsHelp = [
    ' --generate generate the CSV file (remove_unconfirmed_emails.csv) containing the emails to remove\n',
    ' --consume consume the CSV file (remove_unconfirmed_emails.csv) and remove the emails (by default it is a dry-run)\n',
    ' --commit apply the changes (to be used with --consume)\n',
  ].join('')
  const helpMessages = [
    'Usage: node remove_unconfirmed_emails.mjs',
    'Removes unconfirmed emails from users',
    'Options:',
    optionsHelp,
  ]
  for (const message of helpMessages) {
    console.log(message)
  }
  process.exit(0)
}
// Command-line flags: all of them are booleans, and `-h` is an alias of `--help`
const cliOptions = {
  boolean: ['generate', 'consume', 'commit', 'help'],
  alias: { help: 'h' },
  default: { generate: false, consume: false, commit: false },
}
const { generate, consume, commit, help } = minimist(
  process.argv.slice(2),
  cliOptions
)
/**
 * Scans the users collection for unconfirmed secondary emails and writes
 * them to CSV_FILENAME, one row per email: user id, email, sign-up date.
 *
 * An email counts as unconfirmed when its `confirmedAt` is missing or falsy,
 * and as secondary when it differs from the user's primary `email`.
 * Unconfirmed primary emails are never listed; users selected only because
 * of one are counted as skipped.
 *
 * All rows are collected in memory and written once the scan is complete.
 * A summary and the elapsed time are printed on stdout.
 *
 * @returns {Promise<void>}
 */
async function generateCsvFile() {
  console.time('generate_csv')

  let processedUsersCount = 0
  let skippedUnconfirmedPrimaries = 0
  let totalEmailsToRemove = 0
  let totalUsersInCsv = 0

  const records = [['User ID', 'Email', 'Sign Up Date']]

  await batchedUpdate(
    db.users,
    {
      $and: [
        { emails: { $exists: true } },
        { emails: { $not: { $size: 0 } } },
        // Warning: this also matches unconfirmed primary emails
        {
          emails: {
            $elemMatch: {
              $or: [{ confirmedAt: { $exists: false } }, { confirmedAt: null }],
            },
          },
        },
      ],
    },
    async users => {
      console.log('Process', users.length, 'users')
      processedUsersCount += users.length

      for (const user of users) {
        const unconfirmedSecondaries = user.emails.filter(
          email => !email.confirmedAt && email.email !== user.email
        )

        if (unconfirmedSecondaries.length === 0) {
          // Users can have been selected because of their unconfirmed primary email
          // we don't want to remove those
          skippedUnconfirmedPrimaries++
          continue
        }

        for (const email of unconfirmedSecondaries) {
          records.push([
            user._id.toString(),
            email.email,
            // The query does not require `signUpDate`, and this column is
            // informational only (`--consume` reads just the id and the
            // email), so a user without one must not abort the whole scan
            user.signUpDate?.toISOString() ?? '',
          ])
        }
        totalUsersInCsv++
        totalEmailsToRemove += unconfirmedSecondaries.length
      }
    },
    // Projection: only the fields read above
    { _id: 1, signUpDate: 1, emails: 1, email: 1 }
  )

  const csvContent = await stringifyAsync(records)
  await fs.writeFile(CSV_FILENAME, csvContent)

  console.log()
  console.log('Processed users:', processedUsersCount)
  console.log()
  console.log('Generated CSV file:', CSV_FILENAME)
  console.log('Total emails in the CSV:', totalEmailsToRemove)
  console.log('Total users in the CSV:', totalUsersInCsv)
  console.log(
    'Unconfirmed primary emails (skipped):',
    skippedUnconfirmedPrimaries
  )
  console.log()
  console.timeEnd('generate_csv')
  console.log()
}
/**
 * Reads the CSV written by `--generate` (CSV_FILENAME) and removes the listed
 * emails from their users, re-checking every email against the user's
 * current state first.
 *
 * An email is skipped, and counted in the summary, when:
 *  - the user no longer exists,
 *  - the email is no longer on the user,
 *  - the email now has a truthy `confirmedAt`,
 *  - the email is now the user's primary email.
 *
 * Without the `--commit` flag this is a dry-run: the counts are computed and
 * printed, but no audit log entry is written and no user is updated.
 * With `--commit`, a `remove-email` audit log entry is added for each email
 * that is actually removed, then those emails are pulled from the user.
 *
 * @returns {Promise<void>}
 */
async function consumeCsvFile() {
  console.time('consume_csv')

  const csvContent = await fs.readFile(CSV_FILENAME, 'utf8')
  const rows = await parseAsync(csvContent)
  rows.shift() // Remove header row

  // Group the emails by user id so each user is loaded and updated only once.
  // The third CSV column (sign-up date) is not used here.
  const emailsByUserId = {}
  for (const [userId, email] of rows) {
    if (!emailsByUserId[userId]) {
      emailsByUserId[userId] = []
    }
    emailsByUserId[userId].push(email)
  }
  const userIds = Object.keys(emailsByUserId)

  let processedUsersCount = 0
  let removedEmailsCount = 0
  let totalModifiedUsersCount = 0
  const skippedEmail = {
    userNotFound: 0,
    nowConfirmed: 0,
    nowPrimary: 0,
    nowRemoved: 0,
  }

  console.log('Total emails in the CSV:', rows.length)
  console.log('Total users in the CSV:', userIds.length)

  for (const userId of userIds) {
    const emailsToRemove = emailsByUserId[userId]

    const user = await db.users.findOne({ _id: new ObjectId(userId) })
    if (!user) {
      skippedEmail.userNotFound += emailsToRemove.length
      continue
    }

    // The user may have changed since the CSV was generated: keep only the
    // emails that are still present, still unconfirmed and still secondary
    const emailsToRemoveNow = emailsToRemove.filter(email => {
      const currentEmail = user.emails.find(e => e.email === email)
      if (!currentEmail) {
        skippedEmail.nowRemoved++
        return false
      }
      if (currentEmail.confirmedAt) {
        skippedEmail.nowConfirmed++
        return false
      }
      if (currentEmail.email === user.email) {
        skippedEmail.nowPrimary++
        return false
      }
      return true
    })

    removedEmailsCount += emailsToRemoveNow.length

    if (commit && emailsToRemoveNow.length > 0) {
      // Only the re-checked list may be logged and removed. Using the raw CSV
      // list here would also delete emails that were confirmed or made
      // primary in the meantime, as soon as the user has one removable email.
      for (const email of emailsToRemoveNow) {
        await UserAuditLogHandler.promises.addEntry(
          userId,
          'remove-email',
          undefined,
          undefined,
          {
            removedEmail: email,
            script: true,
            note: 'remove unconfirmed secondary emails',
          }
        )
      }

      const updated = await db.users.updateOne(
        { _id: new ObjectId(userId) },
        { $pull: { emails: { email: { $in: emailsToRemoveNow } } } }
      )
      totalModifiedUsersCount += updated.modifiedCount
    }

    processedUsersCount++
    if (processedUsersCount % 100 === 0) {
      console.log('Processed', processedUsersCount, 'users')
    }
  }

  console.log()
  if (!commit) {
    console.log('Dry-run, use --commit to apply changes')
    console.log('This would be the result:')
    console.log()
  }
  console.log('Total emails in the CSV:', rows.length)
  console.log('Total users in the CSV:', userIds.length)
  console.log('Total users processed:', processedUsersCount)
  console.log('Total emails removed:', removedEmailsCount)
  console.log('Skipped emails:', _.sum(Object.values(skippedEmail)))
  console.log(' - User not found:', skippedEmail.userNotFound)
  console.log(' - Email now confirmed:', skippedEmail.nowConfirmed)
  console.log(' - Email now primary:', skippedEmail.nowPrimary)
  console.log(' - Email now removed:', skippedEmail.nowRemoved)
  console.log()
  if (commit) {
    console.log('Total users modified:', totalModifiedUsersCount)
  } else {
    console.log('Note: this was a dry-run. No changes were made.')
  }
  console.log()
  console.timeEnd('consume_csv')
  console.log()
}
/**
 * Entry point: validates the combination of command-line flags, then runs
 * either the generate or the consume step.
 *
 * `--help`, or any invalid combination of flags, prints the usage (an invalid
 * combination prints an error message on stderr first).
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (help) {
    return usage()
  }

  // Checked in order; only the first failing rule is reported
  const argumentRules = [
    [
      !generate && !consume,
      'Error: Either --generate or --consume must be specified',
    ],
    [
      generate && consume,
      'Error: Cannot use both --generate and --consume together',
    ],
    [commit && !consume, 'Error: --commit can only be used with --consume'],
  ]
  const violatedRule = argumentRules.find(([isViolated]) => isViolated)
  if (violatedRule) {
    console.error(violatedRule[1])
    return usage()
  }

  if (generate) {
    await generateCsvFile()
    return
  }
  if (consume) {
    await consumeCsvFile()
  }
}
// Run the script and always terminate explicitly: exit code 0 on success,
// 1 (after printing the error) when main() throws
let exitCode = 0
try {
  await main()
} catch (error) {
  console.error(error)
  exitCode = 1
}
process.exit(exitCode)
@@ -0,0 +1,290 @@
import { promisify } from 'node:util'
import { exec } from 'node:child_process'
import { expect } from 'chai'
import { filterOutput } from './helpers/settings.mjs'
import { db, ObjectId } from '../../../app/src/infrastructure/mongodb.js'
import fs from 'node:fs/promises'
// Same path as CSV_FILENAME in scripts/remove_unconfirmed_emails.mjs: the file
// the script writes with --generate and reads back with --consume
const CSV_FILENAME = '/tmp/remove_unconfirmed_emails.csv'
/**
 * Runs scripts/remove_unconfirmed_emails.mjs in a child process.
 *
 * @param mode - 'generate' runs the script with `--generate`; any other value
 *   runs it with `--consume`
 * @param commit - when truthy, `--commit` is appended to the command line
 * @returns the result of `exec`, with `stdout` replaced by the array of its
 *   lines that pass `filterOutput`
 */
async function runScript(mode, commit) {
  const modeFlag = mode === 'generate' ? '--generate' : '--consume'
  const commandParts = [
    'node',
    'scripts/remove_unconfirmed_emails.mjs',
    modeFlag,
  ]
  if (commit) {
    commandParts.push('--commit')
  }

  const execAsync = promisify(exec)
  const output = await execAsync(commandParts.join(' '))
  const stdoutLines = output.stdout.split('\n').filter(filterOutput)

  return { ...output, stdout: stdoutLines }
}
/**
 * Builds a user document (not inserted) with a fresh `_id` and the primary
 * email `primary<userIdx>@overleaf.com`.
 *
 * @param signUpDate - stored as the user's `signUpDate`
 * @param emails - stored as-is as the user's `emails` array
 * @param userIdx - optional suffix for the primary email; nothing is appended
 *   when it is null or undefined
 */
function createUser(signUpDate, emails, userIdx) {
  const suffix = userIdx ?? ''
  return {
    _id: new ObjectId(),
    email: `primary${suffix}@overleaf.com`,
    emails,
    signUpDate,
  }
}
describe('scripts/remove_unconfirmed_emails', function () {
// User document inserted by each suite's beforeEach
let user
// Remove the CSV written by `--generate` so that no test can consume the
// file left behind by a previous one
afterEach(async function () {
  try {
    await fs.unlink(CSV_FILENAME)
  } catch (err) {
    // A missing file only means nothing was generated. Any other failure
    // would let a stale CSV leak into the next test, so it must surface.
    if (err.code !== 'ENOENT') {
      throw err
    }
  }
})
// A confirmed primary email plus two unconfirmed secondaries, one of which
// contains characters that need quoting in a CSV file
describe('when removing unconfirmed secondary emails', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com', confirmedAt: new Date() },
      { email: 'unconfirmed1@overleaf.com' },
      { email: 'unconfirmed-special-,\'"@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should remove all unconfirmed secondary emails', async function () {
    await runScript('generate')
    const { stdout } = await runScript('consume', true)

    expect(stdout).to.include('Total emails in the CSV: 2')
    expect(stdout).to.include('Total users processed: 1')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(1)
    expect(storedUser.emails[0].email).to.equal(user.email)
  })

  it('should not modify anything in dry run mode', async function () {
    await runScript('generate')
    const { stdout } = await runScript('consume', false)

    expect(stdout).to.include('Total emails in the CSV: 2')
    expect(stdout).to.include('Total users processed: 1')
    expect(stdout).to.include(
      'Note: this was a dry-run. No changes were made.'
    )

    // All three emails must still be on the user
    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(3)
  })
})
// Both the primary and the secondary email are confirmed: nothing to remove
describe('when handling confirmed secondary emails', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com', confirmedAt: new Date() },
      { email: 'confirmed@overleaf.com', confirmedAt: new Date() },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should preserve confirmed secondary emails', async function () {
    await runScript('generate')
    const { stdout } = await runScript('consume', true)

    expect(stdout).to.include('Total emails in the CSV: 0')
    expect(stdout).to.include('Total users processed: 0')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(2)
    expect(storedUser.emails[1].confirmedAt).to.exist
  })
})
// The only email is the (unconfirmed) primary one, which must be kept
describe('when handling unconfirmed primary emails', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [{ email: 'primary@overleaf.com' }]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should not remove unconfirmed primary emails', async function () {
    await runScript('generate')
    const { stdout } = await runScript('consume', true)

    expect(stdout).to.include('Total emails in the CSV: 0')
    expect(stdout).to.include('Total users processed: 0')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(1)
    expect(storedUser.emails[0].email).to.equal('primary@overleaf.com')
  })
})
// The secondary email gets confirmed after the CSV was generated
describe('when users confirmed their email in between', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com' },
      { email: 'secondary@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should not remove emails from users who confirmed their email in between', async function () {
    await runScript('generate')

    // Confirm the secondary email (index 1) between the two script runs
    const confirmSecondary = { $set: { 'emails.1.confirmedAt': new Date() } }
    await db.users.updateOne({ _id: user._id }, confirmSecondary)

    const { stdout } = await runScript('consume', true)
    expect(stdout).to.include('Total emails in the CSV: 1')
    expect(stdout).to.include('Skipped emails: 1')
    expect(stdout).to.include(' - Email now confirmed: 1')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(2)
  })
})
// The secondary email becomes the primary one after the CSV was generated
describe('when users changed their primary email in between', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com' },
      { email: 'secondary@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should not remove emails from users who changed their primary email in between', async function () {
    await runScript('generate')

    // Promote the secondary email to primary between the two script runs
    const promoteSecondary = { $set: { email: 'secondary@overleaf.com' } }
    await db.users.updateOne({ _id: user._id }, promoteSecondary)

    const { stdout } = await runScript('consume', true)
    expect(stdout).to.include('Total emails in the CSV: 1')
    expect(stdout).to.include('Skipped emails: 1')
    expect(stdout).to.include(' - Email now primary: 1')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(2)
  })
})
// The whole user document disappears after the CSV was generated
describe('when users account was deleted in between', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com' },
      { email: 'secondary@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should skip emails from users whose account was deleted', async function () {
    await runScript('generate')

    // Delete the user between the two script runs
    await db.users.deleteOne({ _id: user._id })

    const { stdout } = await runScript('consume', true)
    expect(stdout).to.include('Total emails in the CSV: 1')
    expect(stdout).to.include('Skipped emails: 1')
    expect(stdout).to.include(' - User not found: 1')
  })
})
// The secondary email is removed from the user after the CSV was generated
describe('when users email was deleted in between', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com' },
      { email: 'secondary@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should skip emails that were already removed', async function () {
    await runScript('generate')

    // Remove the secondary email between the two script runs
    const dropSecondary = {
      $pull: { emails: { email: 'secondary@overleaf.com' } },
    }
    await db.users.updateOne({ _id: user._id }, dropSecondary)

    const { stdout } = await runScript('consume', true)
    expect(stdout).to.include('Total emails in the CSV: 1')
    expect(stdout).to.include('Skipped emails: 1')
    expect(stdout).to.include(' - Email now removed: 1')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(1)
    expect(storedUser.emails[0].email).to.equal('primary@overleaf.com')
  })
})
// `confirmedAt: null` and a missing `confirmedAt` must both count as unconfirmed
describe('when handling confirmation field edge cases', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com', confirmedAt: new Date() },
      { email: 'secondary1@overleaf.com', confirmedAt: null },
      { email: 'secondary2@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should remove emails with both missing and null confirmedAt', async function () {
    await runScript('generate')
    const { stdout } = await runScript('consume', true)

    expect(stdout).to.include('Total emails in the CSV: 2')
    expect(stdout).to.include('Total users processed: 1')

    const storedUser = await db.users.findOne({ _id: user._id })
    expect(storedUser.emails).to.have.lengthOf(1)
    expect(storedUser.emails[0].email).to.equal(user.email)
  })
})
// Checks the exact content of the generated file, including the quoting of
// emails that contain commas and double quotes
describe('CSV file generation', function () {
  beforeEach(async function () {
    const signUpDate = new Date('2000-01-01')
    const emails = [
      { email: 'primary@overleaf.com', confirmedAt: new Date() },
      { email: 'unconfirmed1@overleaf.com' },
      { email: 'confirmed1@overleaf.com', confirmedAt: new Date() },
      { email: 'unconfirmed2@overleaf.com' },
      { email: '!,@overleaf.com' },
      { email: "!'@overleaf.com" },
      { email: '!,\'"@overleaf.com' },
    ]
    user = createUser(signUpDate, emails)
    await db.users.insertOne(user)
  })

  it('should generate a valid CSV file', async function () {
    const { stdout } = await runScript('generate')
    expect(stdout).to.include(
      'Generated CSV file: /tmp/remove_unconfirmed_emails.csv'
    )
    expect(stdout).to.include('Total emails in the CSV: 5')

    const csvContent = await fs.readFile(CSV_FILENAME, 'utf8')
    // One row per unconfirmed secondary email, in the order of `user.emails`
    const expectedCsv = `User ID,Email,Sign Up Date
${user._id},unconfirmed1@overleaf.com,2000-01-01T00:00:00.000Z
${user._id},unconfirmed2@overleaf.com,2000-01-01T00:00:00.000Z
${user._id},"!,@overleaf.com",2000-01-01T00:00:00.000Z
${user._id},!'@overleaf.com,2000-01-01T00:00:00.000Z
${user._id},"!,'""@overleaf.com",2000-01-01T00:00:00.000Z
`
    expect(csvContent).to.equal(expectedCsv)
  })
})
})