[history-v1] extend --report with number of files to check/missing (#27397)

* [history-v1] put logs of back_fill_file_hash script in /var/log/overleaf

* [history-v1] do not leave tmp dir behind when using --report

* [history-v1] extend --report with number of files to check/missing

* [history-v1] move all the options into command line flags

Change default values:
- concurrency to 10
- concurrent batches to 1
- stream high water mark to 1MiB

* [web] find_malformed_filetrees: log progress on stderr

* [history-v1] fix percentage of projects that need back-filling

GitOrigin-RevId: ebd7889840f97b24eda43774ffe7524998fb634b
This commit is contained in:
Jakob Ackermann
2025-07-29 15:58:28 +02:00
committed by Copybot
parent 3a17c1dc61
commit 93c04fe240
3 changed files with 188 additions and 98 deletions

View File

@@ -79,11 +79,11 @@ ObjectId.cacheHexString = true
*/
/**
* @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean}}
* @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean, CONCURRENCY: number, CONCURRENT_BATCHES: number, RETRIES: number, RETRY_DELAY_MS: number, RETRY_FILESTORE_404: boolean, BUFFER_DIR_PREFIX: string, STREAM_HIGH_WATER_MARK: number, LOGGING_INTERVAL: number, SLEEP_BEFORE_EXIT: number }}
*/
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
const DEFAULT_OUTPUT_FILE = `file-migration-${new Date()
const DEFAULT_OUTPUT_FILE = `/var/log/overleaf/file-migration-${new Date()
.toISOString()
.replace(/[:.]/g, '_')}.log`
@@ -95,6 +95,12 @@ function parseArgs() {
{ name: 'skip-hashed-files', type: Boolean },
{ name: 'skip-existing-blobs', type: Boolean },
{ name: 'from-file', type: String, defaultValue: '' },
{ name: 'concurrency', type: Number, defaultValue: 10 },
{ name: 'concurrent-batches', type: Number, defaultValue: 1 },
{ name: 'stream-high-water-mark', type: Number, defaultValue: 1024 * 1024 },
{ name: 'retries', type: Number, defaultValue: 10 },
{ name: 'retry-delay-ms', type: Number, defaultValue: 100 },
{ name: 'retry-filestore-404', type: Boolean },
{ name: 'dry-run', alias: 'n', type: Boolean },
{
name: 'output',
@@ -114,6 +120,13 @@ function parseArgs() {
defaultValue: new Date().toISOString(),
},
{ name: 'logging-id', type: String, defaultValue: '' },
{ name: 'logging-interval-ms', type: Number, defaultValue: 60_000 },
{
name: 'buffer-dir-prefix',
type: String,
defaultValue: '/tmp/back_fill_file_hash-',
},
{ name: 'sleep-before-exit-ms', type: Number, defaultValue: 1_000 },
])
// If no arguments are provided, display a usage message
@@ -143,6 +156,8 @@ Logging options:
(default: file-migration-<timestamp>.log)
--logging-id <id> Identifier for logging
(default: BATCH_RANGE_START)
--logging-interval-ms <ms> Interval for logging progres stats
(default: 60000, 1min)
Batch range options:
--BATCH_RANGE_START <date> Start date for processing
@@ -150,10 +165,30 @@ Batch range options:
--BATCH_RANGE_END <date> End date for processing
(default: ${args.BATCH_RANGE_END})
Concurrency:
--concurrency <n> Number of files to process concurrently
(default: 10)
--concurrent-batches <n> Number of project batches to process concurrently
(default: 1)
--stream-high-water-mark n In-Memory buffering threshold
(default: 1MiB)
Retries:
--retries <n> Number of times to retry processing a file
(default: 10)
--retry-delay-ms <ms> How long to wait before processing a file again
(default: 100, 100ms)
--retry-filestore-404 Retry downloading a file when receiving a 404
(default: false)
Other options:
--report Display a report of the current status
--dry-run, -n Perform a dry run without making changes
--help, -h Show this help message
--buffer-dir-prefix <p> Folder/prefix for buffering files on disk
(default: ${args['buffer-dir-prefix']})
--sleep-before-exit-ms <n> Defer exiting from the script
(default: 1000, 1s)
Typical usage:
@@ -208,12 +243,21 @@ is equivalent to
PROCESS_HASHED_FILES: !args['skip-hashed-files'],
PROCESS_BLOBS: !args['skip-existing-blobs'],
DRY_RUN: args['dry-run'],
OUTPUT_FILE: args.output,
OUTPUT_FILE: args.report ? '-' : args.output,
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER: args['logging-id'] || BATCH_RANGE_START,
LOGGING_INTERVAL: args['logging-interval-ms'],
PROJECT_IDS_FROM: args['from-file'],
DISPLAY_REPORT: args.report,
CONCURRENCY: args.concurrency,
CONCURRENT_BATCHES: args['concurrent-batches'],
STREAM_HIGH_WATER_MARK: args['stream-high-water-mark'],
RETRIES: args.retries,
RETRY_DELAY_MS: args['retry-delay-ms'],
RETRY_FILESTORE_404: args['retry-filestore-404'],
BUFFER_DIR_PREFIX: args['buffer-dir-prefix'],
SLEEP_BEFORE_EXIT: args['sleep-before-exit-ms'],
}
}
@@ -229,6 +273,15 @@ const {
LOGGING_IDENTIFIER,
PROJECT_IDS_FROM,
DISPLAY_REPORT,
CONCURRENCY,
CONCURRENT_BATCHES,
RETRIES,
RETRY_DELAY_MS,
RETRY_FILESTORE_404,
BUFFER_DIR_PREFIX,
STREAM_HIGH_WATER_MARK,
LOGGING_INTERVAL,
SLEEP_BEFORE_EXIT,
} = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
@@ -236,26 +289,12 @@ if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
// Concurrency for downloading from GCS and updating hashes in mongo
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
const CONCURRENT_BATCHES = parseInt(process.env.CONCURRENT_BATCHES || '2', 10)
// Retries for processing a given file
const RETRIES = parseInt(process.env.RETRIES || '10', 10)
const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10)
const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true'
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
const BUFFER_DIR = fs.mkdtempSync(BUFFER_DIR_PREFIX)
// Log output to a file
if (OUTPUT_FILE !== '-') {
console.warn(`Writing logs into ${OUTPUT_FILE}`)
}
logger.initialize('file-migration', {
streams: [
{
@@ -313,6 +352,7 @@ async function getStatsForCollection(
projectsWithAllHashes: 0,
fileCount: 0,
fileWithHashCount: 0,
fileMissingInHistoryCount: 0,
}
// Pick a random sample of projects and estimate the number of files without hashes
const result = await collection
@@ -339,25 +379,44 @@ async function getStatsForCollection(
const filesWithoutHash = fileTree.match(/\{"_id":"[0-9a-f]{24}"\}/g) || []
// count the number of files with a hash, these are uniquely identified
// by the number of "hash" strings due to the filtering
const filesWithHash = fileTree.match(/"hash"/g) || []
const filesWithHash = fileTree.match(/"hash":"[0-9a-f]{40}"/g) || []
stats.fileCount += filesWithoutHash.length + filesWithHash.length
stats.fileWithHashCount += filesWithHash.length
stats.projectCount++
stats.projectsWithAllHashes += filesWithoutHash.length === 0 ? 1 : 0
const projectId = project._id.toString()
const { blobs: perProjectBlobs } = await getProjectBlobsBatch([projectId])
const blobs = new Set(
(perProjectBlobs.get(projectId) || []).map(b => b.getHash())
)
const uniqueHashes = new Set(filesWithHash.map(m => m.slice(8, 48)))
for (const hash of uniqueHashes) {
if (blobs.has(hash) || GLOBAL_BLOBS.has(hash)) continue
stats.fileMissingInHistoryCount++
}
}
console.log(`Sampled stats for ${name}:`)
const fractionSampled = stats.projectCount / collectionCount
const percentageSampled = (fractionSampled * 100).toFixed(1)
const percentageSampled = (fractionSampled * 100).toFixed(0)
const fractionConverted = stats.projectsWithAllHashes / stats.projectCount
const percentageConverted = (fractionConverted * 100).toFixed(1)
const fractionToBackFill = 1 - fractionConverted
const percentageToBackFill = (fractionToBackFill * 100).toFixed(0)
const fractionMissing = stats.fileMissingInHistoryCount / stats.fileCount
const percentageMissing = (fractionMissing * 100).toFixed(0)
console.log(
`- Sampled ${name}: ${stats.projectCount} (${percentageSampled}%)`
`- Sampled ${name}: ${stats.projectCount} (${percentageSampled}% of all ${name})`
)
console.log(
`- Sampled ${name} with all hashes present: ${stats.projectsWithAllHashes}`
)
console.log(
`- Percentage of ${name} converted: ${percentageConverted}% (estimated)`
`- Percentage of ${name} that need back-filling hashes: ${percentageToBackFill}% (estimated)`
)
console.log(
`- Sampled ${name} have ${stats.fileCount} files that need to be checked against the full project history system.`
)
console.log(
`- Sampled ${name} have ${stats.fileMissingInHistoryCount} files that need to be uploaded to the full project history system (estimating ${percentageMissing}% of all files).`
)
}
@@ -366,13 +425,15 @@ async function getStatsForCollection(
* including counts and estimated progress based on a sample.
*/
async function displayReport() {
const projectsCountResult = await projectsCollection.countDocuments()
const projectsCountResult = await projectsCollection.estimatedDocumentCount()
const deletedProjectsCountResult =
await deletedProjectsCollection.countDocuments()
await deletedProjectsCollection.estimatedDocumentCount()
const sampleSize = 1000
console.log('Current status:')
console.log(`- Projects: ${projectsCountResult}`)
console.log(`- Deleted projects: ${deletedProjectsCountResult}`)
console.log(`- Total number of projects: ${projectsCountResult}`)
console.log(
`- Total number of deleted projects: ${deletedProjectsCountResult}`
)
console.log(`Sampling ${sampleSize} projects to estimate progress...`)
await getStatsForCollection(
sampleSize,
@@ -392,7 +453,7 @@ async function displayReport() {
)
}
// Filestore endpoint location
// Filestore endpoint location (configured by /etc/overleaf/env.sh)
const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
@@ -1370,7 +1431,18 @@ async function main() {
console.warn('Done.')
}
async function cleanupBufferDir() {
try {
// Perform non-recursive removal of the BUFFER_DIR. Individual files
// should get removed in parallel as part of batch processing.
await fs.promises.rmdir(BUFFER_DIR)
} catch (err) {
console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
if (DISPLAY_REPORT) {
await cleanupBufferDir()
console.warn('Displaying report...')
await displayReport()
process.exit(0)
@@ -1381,13 +1453,7 @@ try {
await main()
} finally {
printStats(true)
try {
// Perform non-recursive removal of the BUFFER_DIR. Individual files
// should get removed in parallel as part of batch processing.
await fs.promises.rmdir(BUFFER_DIR)
} catch (err) {
console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
await cleanupBufferDir()
}
let code = 0

View File

@@ -63,7 +63,6 @@ const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false
describe('back_fill_file_hash script', function () {
this.timeout(TIMEOUT)
const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs'
const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z')
const projectId1 = objectIdFromTime('2017-01-01T00:01:00Z')
@@ -480,20 +479,16 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
* @return {Promise<{result: { stdout: string, stderr: string, status: number }, stats: any}>}
*/
async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
async function rawRunScript(args = []) {
let result
try {
result = await promisify(execFile)(
process.argv0,
[
'storage/scripts/back_fill_file_hash.mjs',
'--output=-',
'--projects',
'--deleted-projects',
'--sleep-before-exit-ms=1',
...args,
],
{
@@ -501,9 +496,7 @@ describe('back_fill_file_hash script', function () {
timeout: TIMEOUT - 500,
env: {
...process.env,
USER_FILES_BUCKET_NAME,
SLEEP_BEFORE_EXIT: '1',
...env,
AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE: '1',
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
},
}
@@ -516,9 +509,25 @@ describe('back_fill_file_hash script', function () {
}
result = { stdout, stderr, status: code }
}
// Ensure no tmp folder is left behind.
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
/back_fill_file_hash/
)
return result
}
/**
* @param {Array<string>} args
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
async function tryRunScript(args = [], shouldHaveWritten) {
const result = await rawRunScript([
'--output=-',
'--projects',
'--deleted-projects',
...args,
])
const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond']
const stats = JSON.parse(
result.stderr
@@ -549,12 +558,11 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
async function runScript(args = [], env = {}, shouldHaveWritten = true) {
const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
async function runScript(args = [], shouldHaveWritten = true) {
const { stats, result } = await tryRunScript(args, shouldHaveWritten)
if (result.status !== 0) {
console.log(result)
expect(result).to.have.property('status', 0)
@@ -803,7 +811,6 @@ describe('back_fill_file_hash script', function () {
it('should process nothing on re-run', async function () {
const rerun = await runScript(
!processHashedFiles ? ['--skip-hashed-files'] : [],
{},
false
)
let stats = {
@@ -928,10 +935,11 @@ describe('back_fill_file_hash script', function () {
it('should gracefully handle fatal errors', async function () {
mockFilestore.deleteObject(projectId0, fileId0)
const t0 = Date.now()
const { stats, result } = await tryRunScript(['--skip-hashed-files'], {
RETRIES: '10',
RETRY_DELAY_MS: '1000',
})
const { stats, result } = await tryRunScript([
'--skip-hashed-files',
'--retries=10',
'--retry-delay-ms=1000',
])
const t1 = Date.now()
expectNotFoundError(result, 'failed to process file')
expect(result.status).to.equal(1)
@@ -963,11 +971,12 @@ describe('back_fill_file_hash script', function () {
value: { stats, result },
},
] = await Promise.allSettled([
tryRunScript(['--skip-hashed-files'], {
RETRY_DELAY_MS: '100',
RETRIES: '60',
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
}),
tryRunScript([
'--skip-hashed-files',
'--retries=60',
'--retry-delay-ms=1000',
'--retry-filestore-404',
]),
restoreFileAfter5s(),
])
expectNotFoundError(result, 'failed to process file, trying again')
@@ -989,9 +998,7 @@ describe('back_fill_file_hash script', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript(['--skip-hashed-files'], {
CONCURRENCY: '1',
})
output = await runScript(['--skip-hashed-files', '--concurrency=1'])
})
/**
@@ -1058,10 +1065,10 @@ describe('back_fill_file_hash script', function () {
let output1, output2
before('prepare environment', prepareEnvironment)
before('run script without hashed files', async function () {
output1 = await runScript(['--skip-hashed-files'], {})
output1 = await runScript(['--skip-hashed-files'])
})
before('run script with hashed files', async function () {
output2 = await runScript([], {})
output2 = await runScript([])
})
it('should print stats for the first run without hashed files', function () {
expect(output1.stats).deep.equal(STATS_ALL)
@@ -1076,6 +1083,34 @@ describe('back_fill_file_hash script', function () {
})
commonAssertions(true)
})
describe('report mode', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await rawRunScript(['--report'])
})
it('should print the report', () => {
expect(output.status).to.equal(0)
expect(output.stdout).to.equal(`\
Current status:
- Total number of projects: 10
- Total number of deleted projects: 5
Sampling 1000 projects to estimate progress...
Sampled stats for projects:
- Sampled projects: 9 (90% of all projects)
- Sampled projects with all hashes present: 5
- Percentage of projects that need back-filling hashes: 44% (estimated)
- Sampled projects have 11 files that need to be checked against the full project history system.
- Sampled projects have 3 files that need to be uploaded to the full project history system (estimating 27% of all files).
Sampled stats for deleted projects:
- Sampled deleted projects: 4 (80% of all deleted projects)
- Sampled deleted projects with all hashes present: 3
- Percentage of deleted projects that need back-filling hashes: 25% (estimated)
- Sampled deleted projects have 2 files that need to be checked against the full project history system.
- Sampled deleted projects have 1 files that need to be uploaded to the full project history system (estimating 50% of all files).
`)
})
})
describe('full run in dry-run mode', function () {
let output
@@ -1089,13 +1124,7 @@ describe('back_fill_file_hash script', function () {
.toArray()
})
before('run script', async function () {
output = await runScript(
['--dry-run'],
{
CONCURRENCY: '1',
},
false
)
output = await runScript(['--dry-run', '--concurrency=1'], false)
})
it('should print stats for dry-run mode', function () {
@@ -1136,9 +1165,7 @@ describe('back_fill_file_hash script', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript(['--skip-hashed-files'], {
CONCURRENCY: '10',
})
output = await runScript(['--skip-hashed-files', '--concurrency=10'])
})
it('should print stats', function () {
expect(output.stats).deep.equal(STATS_ALL)
@@ -1146,13 +1173,14 @@ describe('back_fill_file_hash script', function () {
commonAssertions()
})
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
describe('full run STREAM_HIGH_WATER_MARK=64kiB', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript(['--skip-hashed-files'], {
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
})
output = await runScript([
'--skip-hashed-files',
`--stream-high-water-mark=${64 * 1024}`,
])
})
it('should print stats', function () {
expect(output.stats).deep.equal(STATS_ALL)
@@ -1164,7 +1192,7 @@ describe('back_fill_file_hash script', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript([], {})
output = await runScript([])
})
it('should print stats', function () {
expect(output.stats).deep.equal(
@@ -1193,9 +1221,7 @@ describe('back_fill_file_hash script', function () {
})
let output
before('run script', async function () {
output = await runScript(['--skip-hashed-files'], {
CONCURRENCY: '1',
})
output = await runScript(['--skip-hashed-files', '--concurrency=1'])
})
it('should print stats', function () {
@@ -1214,20 +1240,18 @@ describe('back_fill_file_hash script', function () {
let outputPart0, outputPart1
before('prepare environment', prepareEnvironment)
before('run script on part 0', async function () {
outputPart0 = await runScript(
['--skip-hashed-files', `--BATCH_RANGE_END=${edge}`],
{
CONCURRENCY: '1',
}
)
outputPart0 = await runScript([
'--skip-hashed-files',
`--BATCH_RANGE_END=${edge}`,
'--concurrency=1',
])
})
before('run script on part 1', async function () {
outputPart1 = await runScript(
['--skip-hashed-files', `--BATCH_RANGE_START=${edge}`],
{
CONCURRENCY: '1',
}
)
outputPart1 = await runScript([
'--skip-hashed-files',
`--BATCH_RANGE_START=${edge}`,
'--concurrency=1',
])
})
it('should print stats for part 0', function () {

View File

@@ -49,7 +49,7 @@ async function main(trackProgress) {
for (const project of projects) {
projectsProcessed += 1
if (projectsProcessed % 100000 === 0) {
console.log(projectsProcessed, 'projects processed')
console.warn(projectsProcessed, 'projects processed')
}
const projectId = project._id.toString()
for (const { reason, path, _id } of processProject(project)) {