mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-06-06 15:49:01 +02:00
[web] Back to school user extraction script (#28416)
* fix: the script was not exiting when no users were found * feat: scaffold a new script * fix: make the script actually runnable * refactor: exit at the same place * feat: filter out subscriptions and without subscriptions * fix: filter out groupies * feat: from IDs to emails docs: fix wording refactor: will not need this script anymore fix: don't use concurrency feat: add id files fix: remove the data files fix: this indeed works! * fix: review comments * fix: prettier complaints GitOrigin-RevId: 839ca850ff0df6972afafbfc25d4b89c10284dc6
This commit is contained in:
@@ -232,7 +232,7 @@ async function runScript() {
|
||||
|
||||
if (churnUsers.length === 0) {
|
||||
console.log('No day 1 churn users found for the specified period')
|
||||
return
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
console.log(`Writing ${churnUsers.length} users to ${args.outputPath}...`)
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
import { scriptRunner } from './lib/ScriptRunner.mjs'
|
||||
import fs from 'node:fs'
|
||||
import readline from 'node:readline'
|
||||
import minimist from 'minimist'
|
||||
import { db, ObjectId } from '../app/src/infrastructure/mongodb.js'
|
||||
|
||||
/**
 * This script extracts user emails given a file of newline-separated user IDs.
 *
 * Usage:
 *   - Locally:
 *     - docker compose exec web bash
 *     - node scripts/get_emails_by_ids.mjs
 *   - On the server:
 *     - rake run:pod[staging,web]
 *     - node scripts/get_emails_by_ids.mjs
 *     - exit
 *     - kubectl cp web-standalone-prod-XXXXX:/tmp/emails.txt ~/emails.txt
 */
|
||||
|
||||
/**
 * Print the command-line help text for this script to stdout.
 *
 * The defaults shown here mirror the `default` map passed to minimist in
 * parseArgs(); keep the two in sync when changing either.
 */
function usage() {
  console.log(
    `
User email extraction, outputs to /tmp/emails.txt

Usage:
  node scripts/get_emails_by_ids.mjs [--inputPath=<path>] [--outputPath=<path>] [--batchSize=<number>]

Options:
  --help                    Show this screen

  --inputPath=<path>        Input file path (default: ids.txt)

  --outputPath=<path>       Output file path (default: /tmp/emails.txt)

  --batchSize=<number>      Number of emails to be fetched in one query (default: 1000)

Description:
  This script extracts user emails given a list of newline separated IDs
`
  )
}
|
||||
|
||||
/**
 * Parse and validate the command-line flags for this script.
 *
 * Recognised flags: `--help`, `--inputPath`, `--outputPath`, `--batchSize`.
 * When `--help` is given, the usage text is printed and the process exits
 * with status 0.
 *
 * @returns {object} the minimist result, with `inputPath`, `outputPath` and
 *   `batchSize` populated from the command line or from the defaults below
 * @throws {Error} if `--batchSize` is not a positive integer
 */
function parseArgs() {
  const argv = minimist(process.argv.slice(2), {
    string: ['inputPath', 'outputPath'],
    // minimist's option key is `boolean`; it has no `bool` or `number`
    // option. Numeric-looking values are converted to numbers automatically.
    boolean: ['help'],
    default: {
      help: false,
      inputPath: 'ids.txt',
      outputPath: '/tmp/emails.txt',
      batchSize: 1000,
    },
  })

  if (argv.help) {
    usage()
    process.exit(0)
  }

  // A non-numeric batch size would make the `length >= batchSize` check in
  // the caller always false (one unbounded query), and a size below 1 would
  // trigger a query per input line, so reject both up front.
  if (!Number.isInteger(argv.batchSize) || argv.batchSize < 1) {
    throw new Error(
      `--batchSize must be a positive integer, got: ${argv.batchSize}`
    )
  }

  return argv
}
|
||||
|
||||
/**
 * Query `db.users` for the documents whose `_id` is in `idBatch` and append
 * each email found to `writeStream`, one per line.
 *
 * Documents without a truthy `email` are skipped. Any error raised while
 * querying or writing is logged with console.error and not rethrown, so the
 * returned promise always resolves.
 *
 * @param {Array} idBatch - values matched against `_id` via `$in`
 * @param {object} writeStream - destination; only its `write` method is used
 * @returns {Promise<void>}
 */
async function processBatch(idBatch, writeStream) {
  const query = { _id: { $in: idBatch } }
  // Only the email field is needed, so leave everything else out.
  const projection = { _id: 0, email: 1 }

  try {
    const users = db.users.find(query).project(projection)

    for await (const { email } of users) {
      if (!email) {
        continue
      }
      writeStream.write(email + '\n')
    }
  } catch (err) {
    console.error('Error processing batch:', err)
  }
}
|
||||
|
||||
/**
 * Read IDs line by line from `args.inputPath`, look up the matching emails
 * in batches of `args.batchSize`, and write them to `args.outputPath`.
 *
 * Blank lines are ignored. Lines for which `new ObjectId(...)` throws are
 * skipped with a warning. The function resolves only after the output stream
 * has emitted 'finish', so the output has been flushed by the time the caller
 * exits the process.
 *
 * @param {Function} trackProgress - progress callback (presumably supplied
 *   by scriptRunner); awaited once with 'Job finished'
 * @returns {Promise<void>}
 */
async function main(trackProgress) {
  const args = parseArgs()

  const readStream = fs.createReadStream(args.inputPath)
  const writeStream = fs.createWriteStream(args.outputPath)
  const rl = readline.createInterface({
    input: readStream,
    // Treat \r\n as a single line break regardless of timing between chunks.
    crlfDelay: Infinity,
  })

  let idBatch = []

  for await (const line of rl) {
    const id = line.trim()
    if (id) {
      try {
        idBatch.push(new ObjectId(id))
      } catch (e) {
        console.warn(`Skipping invalid ObjectId: ${id}`)
      }
    }

    if (idBatch.length >= args.batchSize) {
      await processBatch(idBatch, writeStream)
      idBatch = []
    }
  }

  // Flush whatever is left over from the last, partially filled batch.
  if (idBatch.length > 0) {
    await processBatch(idBatch, writeStream)
  }

  // end() only schedules the final flush. Wait for 'finish' so that the
  // caller's process.exit() cannot truncate the output file.
  await new Promise((resolve, reject) => {
    writeStream.once('finish', resolve)
    writeStream.once('error', reject)
    writeStream.end()
  })

  console.log(`✅ Success! Found emails written to ${args.outputPath}`)
  await trackProgress('Job finished')
}
|
||||
|
||||
// Entry point: run main() through scriptRunner, then terminate the process
// explicitly. Exit status is 0 when the run resolves and 1 when it rejects;
// the rejection reason is printed to stderr first.
let exitCode = 0
try {
  await scriptRunner(main)
} catch (error) {
  console.error(error)
  exitCode = 1
}
process.exit(exitCode)
|
||||
Reference in New Issue
Block a user