From 4712a2d541d81047b0402dca74fac8c200768d99 Mon Sep 17 00:00:00 2001 From: Olzhas Askar Date: Fri, 12 Sep 2025 19:11:26 +0200 Subject: [PATCH] [web] Back to school user extraction script (#28416) * fix: the script was not exiting when no users were found * feat: scaffold a new script * fix: make the script actually runnable * refactor: exit at the same place * feat: filter out subscriptions and without subscriptions * fix: filter out groupies * feat: from IDs to emails docs: fix wording refactor: will not need this script anymore fix: don't use concurrency feat: add id files fix: remove the data files fix: this indeed works! * fix: review comments * fix: prettier complaints GitOrigin-RevId: 839ca850ff0df6972afafbfc25d4b89c10284dc6 --- .../web/scripts/extract_day1_churn_users.js | 2 +- services/web/scripts/get_emails_by_ids.mjs | 130 ++++++++++++++++++ 2 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 services/web/scripts/get_emails_by_ids.mjs diff --git a/services/web/scripts/extract_day1_churn_users.js b/services/web/scripts/extract_day1_churn_users.js index 3b7b6b4cb7..5bc17c5481 100644 --- a/services/web/scripts/extract_day1_churn_users.js +++ b/services/web/scripts/extract_day1_churn_users.js @@ -232,7 +232,7 @@ async function runScript() { if (churnUsers.length === 0) { console.log('No day 1 churn users found for the specified period') - return + process.exit(0) } console.log(`Writing ${churnUsers.length} users to ${args.outputPath}...`) diff --git a/services/web/scripts/get_emails_by_ids.mjs b/services/web/scripts/get_emails_by_ids.mjs new file mode 100644 index 0000000000..4a41486244 --- /dev/null +++ b/services/web/scripts/get_emails_by_ids.mjs @@ -0,0 +1,130 @@ +import { scriptRunner } from './lib/ScriptRunner.mjs' +import fs from 'node:fs' +import readline from 'node:readline' +import minimist from 'minimist' +import { db, ObjectId } from '../app/src/infrastructure/mongodb.js' + +/** + * This script extracts user emails given a list of newline separated IDs + * + * Usage: + * - Locally: + * - docker compose exec web bash + * - node scripts/get_emails_by_ids.mjs + * - On the server: + * - rake run:pod[staging,web] + * - node scripts/get_emails_by_ids.mjs + * - exit + * - kubectl cp web-standalone-prod-XXXXX:/tmp/emails.txt ~/emails.txt + */ + +function usage() { + console.log( + ` + User email extraction, outputs to /tmp/emails.txt + + Usage: + node scripts/get_emails_by_ids.js [--inputPath=] [--outputPath=] [--batchSize=] + + Options: + --help Show this screen + + --inputPath= Input file path (default: ids.txt) + + --outputPath= Output file path (default: /tmp/emails.txt) + + --batchSize= Number of emails to be fetched in one query + + Description: + This script extracts user emails given a list of newline separated IDs + ` + ) +} + +function parseArgs() { + const argv = minimist(process.argv.slice(2), { + string: ['inputPath', 'outputPath'], + bool: ['help'], + number: ['batchSize'], + default: { + help: false, + inputPath: 'ids.txt', + outputPath: '/tmp/emails.txt', + batchSize: 1000, + }, + }) + + if (argv.help) { + usage() + process.exit(0) + } + + return argv +} + +async function processBatch(idBatch, writeStream) { + try { + const cursor = db.users + .find({ + _id: { $in: idBatch }, + }) + .project({ + _id: 0, + email: 1, + }) + + for await (const doc of cursor) { + if (doc.email) { + writeStream.write(doc.email + '\n') + } + } + } catch (err) { + console.error('Error processing batch:', err) + } +} + +async function main(trackProgress) { + const args = parseArgs() + + const readStream = fs.createReadStream(args.inputPath) + const writeStream = fs.createWriteStream(args.outputPath) + const rl = readline.createInterface({ + input: readStream, + crlfDelay: Infinity, + }) + + let idBatch = [] + + for await (const line of rl) { + const id = line.trim() + if (id) { + try { + idBatch.push(new ObjectId(id)) + } catch (e) { + console.warn(`Skipping invalid ObjectId: ${id}`) + } + } + + if (idBatch.length >= args.batchSize) { + await processBatch(idBatch, writeStream) + idBatch = [] + } + } + + if (idBatch.length > 0) { + await processBatch(idBatch, writeStream) + } + + writeStream.end() + + console.log(`✅ Success! Found emails written to ${args.outputPath}`) + await trackProgress('Job finished') +} + +try { + await scriptRunner(main) + process.exit(0) +} catch (error) { + console.error(error) + process.exit(1) +}