mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-06-06 15:49:01 +02:00
Merge pull request #27849 from overleaf/rh-day-one-churn-extract
Extract users who churned after first day GitOrigin-RevId: 2bcbee177926c262e8c0d71f1861945842b35db6
This commit is contained in:
@@ -0,0 +1,274 @@
|
||||
const csv = require('csv')
|
||||
const fs = require('fs')
|
||||
const minimist = require('minimist')
|
||||
const { User } = require('../app/src/models/User')
|
||||
|
||||
/**
|
||||
* This script extracts users who churned after day 1 - ie. their last session was within 24 hours of registering
|
||||
*
|
||||
* It will:
|
||||
* — Find users whose lastActive is within 24 hours of their signUpDate
|
||||
* — Filter for a configurable lookback period (default: 6 months)
|
||||
* — Export user IDs and email addresses to CSV
|
||||
*
|
||||
* Usage:
|
||||
* - Locally:
|
||||
* - docker compose exec web bash
|
||||
* - node scripts/extract_day1_churn_users.js
|
||||
* - On the server:
|
||||
* - rake connect:app[staging,web]
|
||||
* - node scripts/extract_day1_churn_users.js
|
||||
* - exit
|
||||
* - kubectl cp web-standalone-prod-XXXXX:/tmp/day1_churn_users.csv ~/day1_churn_users.csv
|
||||
*/
|
||||
|
||||
function usage() {
|
||||
console.log(
|
||||
`
|
||||
Day 1 Churn Users extraction, outputs to /tmp/day1_churn_users.csv
|
||||
|
||||
Usage:
|
||||
node scripts/extract_day1_churn_users.js [--lookbackMonths=<months>] [--outputPath=<path>] [--sampleSize=<number>] [--excludeRecentDays=<days>] [--includeLastActive] [--includeHoursActive]
|
||||
|
||||
Options:
|
||||
--help Show this screen
|
||||
|
||||
--lookbackMonths=<months> Number of months to look back for registrations (default: 6)
|
||||
|
||||
--outputPath=<path> Output file path (default: /tmp/day1_churn_users.csv)
|
||||
|
||||
--sampleSize=<number> Maximum number of users to randomly sample per month (default: all users)
|
||||
|
||||
--excludeRecentDays=<days> Exclude users who registered in the last X days to avoid premature churn classification (default: 7)
|
||||
|
||||
--includeLastActive Include lastActive column in the output CSV (default: false)
|
||||
|
||||
--includeHoursActive Include hoursActive column in the output CSV (default: false)
|
||||
|
||||
Description:
|
||||
This script identifies users who churned after day 1, meaning their last activity
|
||||
was within 24 hours of their registration date. It looks for users who:
|
||||
1. Registered within the specified lookback period
|
||||
2. Have a lastActive timestamp
|
||||
3. Their lastActive is <= 24 hours after their signUpDate
|
||||
4. Did not register within the recent exclusion period
|
||||
|
||||
Grace Period:
|
||||
The --excludeRecentDays parameter prevents prematurely marking users as churned.
|
||||
For example, with --excludeRecentDays=7, users who registered in the last 7 days
|
||||
will be excluded from the analysis.
|
||||
|
||||
Sampling:
|
||||
When --sampleSize is specified, the script will add a MongoDB $sample stage to
|
||||
randomly sample up to that number of users from each month within the lookback
|
||||
period. For example, with --sampleSize=100 and --lookbackMonths=6, you'll get
|
||||
up to 100 randomly selected users for each of the 6 months, for a maximum of
|
||||
600 users total.
|
||||
`
|
||||
)
|
||||
}
|
||||
|
||||
function parseArgs() {
|
||||
const argv = minimist(process.argv.slice(2), {
|
||||
string: ['outputPath'],
|
||||
number: ['lookbackMonths', 'sampleSize', 'excludeRecentDays'],
|
||||
bool: ['help', 'includeLastActive', 'includeHoursActive'],
|
||||
default: {
|
||||
help: false,
|
||||
lookbackMonths: 6,
|
||||
outputPath: '/tmp/day1_churn_users.csv',
|
||||
sampleSize: null, // null => return all users
|
||||
excludeRecentDays: 7, // Exclude users who registered in the last 7 days
|
||||
includeLastActive: false,
|
||||
includeHoursActive: false,
|
||||
},
|
||||
})
|
||||
|
||||
if (argv.help) {
|
||||
usage()
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
return argv
|
||||
}
|
||||
|
||||
async function getDay1ChurnUsers({
|
||||
lookbackMonths,
|
||||
sampleSize,
|
||||
excludeRecentDays,
|
||||
}) {
|
||||
// Calculate the actual lookback date used in queries (first day of the oldest month)
|
||||
const lookbackDate = new Date()
|
||||
lookbackDate.setMonth(lookbackDate.getMonth() - lookbackMonths)
|
||||
lookbackDate.setDate(1)
|
||||
lookbackDate.setHours(0, 0, 0, 0)
|
||||
|
||||
const exclusionDate = new Date()
|
||||
exclusionDate.setDate(exclusionDate.getDate() - excludeRecentDays)
|
||||
|
||||
console.log(
|
||||
`Looking for users who registered after: ${lookbackDate.toISOString()}`
|
||||
)
|
||||
console.log(
|
||||
`Excluding users who registered after: ${exclusionDate.toISOString()} (last ${excludeRecentDays} days)`
|
||||
)
|
||||
|
||||
const allChurnUsers = []
|
||||
|
||||
for (let monthOffset = 0; monthOffset < lookbackMonths; monthOffset++) {
|
||||
const monthStart = new Date()
|
||||
monthStart.setMonth(monthStart.getMonth() - monthOffset - 1)
|
||||
monthStart.setDate(1)
|
||||
monthStart.setHours(0, 0, 0, 0)
|
||||
|
||||
const monthEnd = new Date(monthStart)
|
||||
monthEnd.setMonth(monthEnd.getMonth() + 1)
|
||||
|
||||
// Skip months that would include users in the exclusion period
|
||||
if (monthEnd > exclusionDate) {
|
||||
// Adjust monthEnd to the exclusion date if the month overlaps
|
||||
if (monthStart < exclusionDate) {
|
||||
monthEnd.setTime(exclusionDate.getTime())
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
const monthKey = `${monthStart.getFullYear()}-${String(monthStart.getMonth() + 1).padStart(2, '0')}`
|
||||
|
||||
console.log(
|
||||
`Processing month ${monthKey} (${monthStart.toISOString()} to ${monthEnd.toISOString()})`
|
||||
)
|
||||
|
||||
const pipeline = [
|
||||
// Match users who registered in this month and have a lastActive property
|
||||
{
|
||||
$match: {
|
||||
signUpDate: {
|
||||
$gte: monthStart,
|
||||
$lt: monthEnd,
|
||||
},
|
||||
lastActive: { $exists: true, $ne: null },
|
||||
},
|
||||
},
|
||||
// Compute the time between registration and last active
|
||||
{
|
||||
$addFields: {
|
||||
timeDiffHours: {
|
||||
$divide: [
|
||||
{ $subtract: ['$lastActive', '$signUpDate'] },
|
||||
1000 * 60 * 60, // Convert milliseconds to hours
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
// Filter for day 1 churn (0-24 hours)
|
||||
{
|
||||
$match: {
|
||||
timeDiffHours: { $gte: 0, $lte: 24 },
|
||||
},
|
||||
},
|
||||
{
|
||||
$project: {
|
||||
_id: 1,
|
||||
email: 1,
|
||||
signUpDate: 1,
|
||||
lastActive: 1,
|
||||
timeDiffHours: 1,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
// Add sampling stage if specified
|
||||
if (sampleSize && sampleSize > 0) {
|
||||
pipeline.push({ $sample: { size: sampleSize } })
|
||||
}
|
||||
|
||||
const monthUsers = await User.aggregate(pipeline).exec()
|
||||
|
||||
console.log(
|
||||
`Month ${monthKey}: Found ${monthUsers.length} day 1 churn users`
|
||||
)
|
||||
|
||||
const formattedUsers = monthUsers.map(user => ({
|
||||
userId: user._id.toString(),
|
||||
email: user.email,
|
||||
signUpDate: new Date(user.signUpDate).toISOString(),
|
||||
lastActive: new Date(user.lastActive).toISOString(),
|
||||
hoursActive: user.timeDiffHours.toFixed(2),
|
||||
}))
|
||||
|
||||
allChurnUsers.push(...formattedUsers)
|
||||
}
|
||||
|
||||
console.log(`Total users collected: ${allChurnUsers.length}`)
|
||||
return allChurnUsers
|
||||
}
|
||||
|
||||
async function runScript() {
|
||||
const args = parseArgs()
|
||||
|
||||
console.log(
|
||||
`Starting Day 1 churn extraction with lookback period: ${args.lookbackMonths} months`
|
||||
)
|
||||
console.log(
|
||||
`Excluding users who registered in the last ${args.excludeRecentDays} days`
|
||||
)
|
||||
if (args.sampleSize) {
|
||||
console.log(`Sampling enabled: maximum ${args.sampleSize} users per month`)
|
||||
} else {
|
||||
console.log('No sampling - returning all users')
|
||||
}
|
||||
|
||||
if (args.includeLastActive) {
|
||||
console.log('Including lastActive column in output')
|
||||
}
|
||||
|
||||
if (args.includeHoursActive) {
|
||||
console.log('Including hoursActive column in output')
|
||||
}
|
||||
|
||||
const churnUsers = await getDay1ChurnUsers(args)
|
||||
|
||||
if (churnUsers.length === 0) {
|
||||
console.log('No day 1 churn users found for the specified period')
|
||||
return
|
||||
}
|
||||
|
||||
console.log(`Writing ${churnUsers.length} users to ${args.outputPath}...`)
|
||||
|
||||
const columns = ['userId', 'email', 'signUpDate']
|
||||
|
||||
if (args.includeLastActive) {
|
||||
columns.push('lastActive')
|
||||
}
|
||||
|
||||
if (args.includeHoursActive) {
|
||||
columns.push('hoursActive')
|
||||
}
|
||||
|
||||
csv.stringify(
|
||||
churnUsers,
|
||||
{
|
||||
header: true,
|
||||
columns,
|
||||
},
|
||||
function (err, output) {
|
||||
if (err) {
|
||||
console.error('Error writing CSV output:', err)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
fs.writeFileSync(args.outputPath, output)
|
||||
console.log(
|
||||
`Successfully wrote ${churnUsers.length} day 1 churn users to ${args.outputPath}`
|
||||
)
|
||||
process.exit(0)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
runScript().catch(err => {
|
||||
console.error('Script failed:', err)
|
||||
process.exit(1)
|
||||
})
|
||||
Reference in New Issue
Block a user