Merge pull request #30093 from overleaf/oa-sub-marketing

[web] Enrich subscription data with details for marketing

GitOrigin-RevId: b2a07264e516ed4fac4643d0a64a8a1656c6fd13
This commit is contained in:
Olzhas Askar
2025-12-05 16:14:53 +01:00
committed by Copybot
parent f959329337
commit 6a93b6e76a
2 changed files with 174 additions and 19 deletions

View File

@@ -0,0 +1,165 @@
#!/usr/bin/env node
/* eslint-disable camelcase */
import { scriptRunner } from './lib/ScriptRunner.mjs'
import fs from 'node:fs'
import minimist from 'minimist'
import {
db,
ObjectId,
READ_PREFERENCE_SECONDARY,
} from '../app/src/infrastructure/mongodb.mjs'
// https://github.com/import-js/eslint-plugin-import/issues/1810
// eslint-disable-next-line import/no-unresolved
import * as csv from 'csv/sync'
function usage() {
console.log(
`
This script enriches price and subscription data with user emails and first names, given a csv containing user IDs, outputs to /tmp/output.csv
Usage:
- Locally:
docker compose exec web bash
node scripts/add_marketing_details_to_csv.mjs [--input=<path>] [--output=<path>] [--batchSize=<number>]
- On the server:
rake run:pod[staging,web]
node scripts/add_marketing_details_to_csv.mjs [--input=<path>] [--output=<path>] [--batchSize=<number>]
exit
kubectl cp web-standalone-prod-XXXXX:/tmp/output.csv ~/output.csv
Options:
--help Show this screen
--input=<path> Input file path (default: input.csv)
--output=<path> Output file path (default: /tmp/output.csv)
--batchSize=<number> Number of users to be fetched in one query
`
)
}
function parseArgs() {
const result = minimist(process.argv.slice(2), {
string: ['input', 'output'],
bool: ['help'],
number: ['batchSize'],
default: {
help: false,
input: 'input.csv',
output: '/tmp/output.csv',
batchSize: 1000,
},
})
if (result.help) {
usage()
process.exit(0)
}
return result
}
async function processBatch(trackProgress, idBatch) {
const result = {}
try {
const cursor = db.users.find(
{
_id: { $in: idBatch },
},
{
projection: {
_id: 1,
first_name: 1,
email: 1,
},
readPreference: READ_PREFERENCE_SECONDARY,
}
)
for await (const doc of cursor) {
result[doc._id.toString()] = {
email: doc.email,
first_name: doc.first_name,
}
}
} catch (err) {
await trackProgress(`ERROR Processing batch: ${err}`)
}
return result
}
async function enrichRecords(trackProgress, records, users) {
const result = []
for (const record of records) {
const user = users[record.user_id]
let email = ''
let first_name = ''
if (user) {
if (user.email === '' || user.first_name === '') {
await trackProgress(`WARNING Incomplete data for: ${record.user_id}`)
}
email = user.email
first_name = user.first_name
} else {
await trackProgress(`WARNING Didn't find: ${record.user_id}`)
}
result.push({
...record,
email,
first_name,
})
}
return result
}
async function main(trackProgress) {
const args = parseArgs()
const input = fs.readFileSync(args.input, 'utf8')
const records = csv.parse(input, { columns: true, skipEmptyLines: true })
await trackProgress(`INFO Starting to process ${records.length} records`)
let objectIdBatch = []
let recordBatch = []
const outputRecords = []
for (const record of records) {
try {
objectIdBatch.push(new ObjectId(record.user_id))
recordBatch.push(record)
} catch (e) {
await trackProgress(`ERROR Skipping invalid user ID: ${record.user_id}`)
outputRecords.push({ ...record, email: '', first_name: '' })
}
if (objectIdBatch.length >= args.batchSize) {
const users = await processBatch(trackProgress, objectIdBatch)
const enriched = await enrichRecords(trackProgress, recordBatch, users)
outputRecords.push(...enriched)
objectIdBatch = []
recordBatch = []
}
}
if (objectIdBatch.length > 0) {
const users = await processBatch(trackProgress, objectIdBatch)
const enriched = await enrichRecords(trackProgress, recordBatch, users)
outputRecords.push(...enriched)
}
const output = csv.stringify(outputRecords, { header: true })
fs.writeFileSync(args.output, output)
await trackProgress(
`INFO Finished processing of ${outputRecords.length} records`
)
}
try {
await scriptRunner(main)
process.exit(0)
} catch (error) {
console.error(error)
process.exit(1)
}

View File

@@ -8,27 +8,20 @@ import {
READ_PREFERENCE_SECONDARY,
} from '../app/src/infrastructure/mongodb.mjs'
/**
* This script extracts user emails given a list of newline separated IDs
*
* Usage:
* - Locally:
* - docker compose exec web bash
* - node scripts/get_emails_by_ids.mjs
* - On the server:
* - rake run:pod[staging,web]
* - node scripts/get_emails_by_ids.mjs
* - exit
* - kubectl cp web-standalone-prod-XXXXX:/tmp/emails.txt ~/emails.txt
*/
function usage() {
console.log(
`
User email extraction, outputs to /tmp/emails.txt
This script extracts user emails given a list of newline separated IDs, outputs to /tmp/emails.txt
Usage:
node scripts/get_emails_by_ids.js [--inputPath=<path>] [--outputPath=<path>] [--batchSize=<number>]
- Locally:
docker compose exec web bash
node scripts/get_emails_by_ids.js [--inputPath=<path>] [--outputPath=<path>] [--batchSize=<number>]
- On the server:
rake run:pod[staging,web]
node scripts/get_emails_by_ids.js [--inputPath=<path>] [--outputPath=<path>] [--batchSize=<number>]
exit
kubectl cp web-standalone-prod-XXXXX:/tmp/emails.txt ~/emails.txt
Options:
--help Show this screen
@@ -38,9 +31,6 @@ function usage() {
--outputPath=<path> Output file path (default: /tmp/emails.txt)
--batchSize=<number> Number of emails to be fetched in one query
Description:
This script extracts user emails given a list of newline separated IDs
`
)
}