mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-31 21:01:33 +02:00
[history-v1] extend --report with number of files to check/missing (#27397)
* [history-v1] put logs of back_fill_file_hash script in /var/log/overleaf * [history-v1] do not leave tmp dir behind when using --report * [history-v1] extend --report with number of files to check/missing * [history-v1] move all the options into command line flags Change default values: - concurrency to 10 - concurrent batches to 1 - stream high water mark to 1MiB * [web] find_malformed_filetrees: log progress on stderr * [history-v1] fix percentage of projects that need back-filling GitOrigin-RevId: ebd7889840f97b24eda43774ffe7524998fb634b
This commit is contained in:
@@ -79,11 +79,11 @@ ObjectId.cacheHexString = true
|
||||
*/
|
||||
|
||||
/**
|
||||
* @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean}}
|
||||
* @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean, CONCURRENCY: number, CONCURRENT_BATCHES: number, RETRIES: number, RETRY_DELAY_MS: number, RETRY_FILESTORE_404: boolean, BUFFER_DIR_PREFIX: string, STREAM_HIGH_WATER_MARK: number, LOGGING_INTERVAL: number, SLEEP_BEFORE_EXIT: number }}
|
||||
*/
|
||||
function parseArgs() {
|
||||
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
||||
const DEFAULT_OUTPUT_FILE = `file-migration-${new Date()
|
||||
const DEFAULT_OUTPUT_FILE = `/var/log/overleaf/file-migration-${new Date()
|
||||
.toISOString()
|
||||
.replace(/[:.]/g, '_')}.log`
|
||||
|
||||
@@ -95,6 +95,12 @@ function parseArgs() {
|
||||
{ name: 'skip-hashed-files', type: Boolean },
|
||||
{ name: 'skip-existing-blobs', type: Boolean },
|
||||
{ name: 'from-file', type: String, defaultValue: '' },
|
||||
{ name: 'concurrency', type: Number, defaultValue: 10 },
|
||||
{ name: 'concurrent-batches', type: Number, defaultValue: 1 },
|
||||
{ name: 'stream-high-water-mark', type: Number, defaultValue: 1024 * 1024 },
|
||||
{ name: 'retries', type: Number, defaultValue: 10 },
|
||||
{ name: 'retry-delay-ms', type: Number, defaultValue: 100 },
|
||||
{ name: 'retry-filestore-404', type: Boolean },
|
||||
{ name: 'dry-run', alias: 'n', type: Boolean },
|
||||
{
|
||||
name: 'output',
|
||||
@@ -114,6 +120,13 @@ function parseArgs() {
|
||||
defaultValue: new Date().toISOString(),
|
||||
},
|
||||
{ name: 'logging-id', type: String, defaultValue: '' },
|
||||
{ name: 'logging-interval-ms', type: Number, defaultValue: 60_000 },
|
||||
{
|
||||
name: 'buffer-dir-prefix',
|
||||
type: String,
|
||||
defaultValue: '/tmp/back_fill_file_hash-',
|
||||
},
|
||||
{ name: 'sleep-before-exit-ms', type: Number, defaultValue: 1_000 },
|
||||
])
|
||||
|
||||
// If no arguments are provided, display a usage message
|
||||
@@ -143,6 +156,8 @@ Logging options:
|
||||
(default: file-migration-<timestamp>.log)
|
||||
--logging-id <id> Identifier for logging
|
||||
(default: BATCH_RANGE_START)
|
||||
--logging-interval-ms <ms> Interval for logging progres stats
|
||||
(default: 60000, 1min)
|
||||
|
||||
Batch range options:
|
||||
--BATCH_RANGE_START <date> Start date for processing
|
||||
@@ -150,10 +165,30 @@ Batch range options:
|
||||
--BATCH_RANGE_END <date> End date for processing
|
||||
(default: ${args.BATCH_RANGE_END})
|
||||
|
||||
Concurrency:
|
||||
--concurrency <n> Number of files to process concurrently
|
||||
(default: 10)
|
||||
--concurrent-batches <n> Number of project batches to process concurrently
|
||||
(default: 1)
|
||||
--stream-high-water-mark n In-Memory buffering threshold
|
||||
(default: 1MiB)
|
||||
|
||||
Retries:
|
||||
--retries <n> Number of times to retry processing a file
|
||||
(default: 10)
|
||||
--retry-delay-ms <ms> How long to wait before processing a file again
|
||||
(default: 100, 100ms)
|
||||
--retry-filestore-404 Retry downloading a file when receiving a 404
|
||||
(default: false)
|
||||
|
||||
Other options:
|
||||
--report Display a report of the current status
|
||||
--dry-run, -n Perform a dry run without making changes
|
||||
--help, -h Show this help message
|
||||
--buffer-dir-prefix <p> Folder/prefix for buffering files on disk
|
||||
(default: ${args['buffer-dir-prefix']})
|
||||
--sleep-before-exit-ms <n> Defer exiting from the script
|
||||
(default: 1000, 1s)
|
||||
|
||||
Typical usage:
|
||||
|
||||
@@ -208,12 +243,21 @@ is equivalent to
|
||||
PROCESS_HASHED_FILES: !args['skip-hashed-files'],
|
||||
PROCESS_BLOBS: !args['skip-existing-blobs'],
|
||||
DRY_RUN: args['dry-run'],
|
||||
OUTPUT_FILE: args.output,
|
||||
OUTPUT_FILE: args.report ? '-' : args.output,
|
||||
BATCH_RANGE_START,
|
||||
BATCH_RANGE_END,
|
||||
LOGGING_IDENTIFIER: args['logging-id'] || BATCH_RANGE_START,
|
||||
LOGGING_INTERVAL: args['logging-interval-ms'],
|
||||
PROJECT_IDS_FROM: args['from-file'],
|
||||
DISPLAY_REPORT: args.report,
|
||||
CONCURRENCY: args.concurrency,
|
||||
CONCURRENT_BATCHES: args['concurrent-batches'],
|
||||
STREAM_HIGH_WATER_MARK: args['stream-high-water-mark'],
|
||||
RETRIES: args.retries,
|
||||
RETRY_DELAY_MS: args['retry-delay-ms'],
|
||||
RETRY_FILESTORE_404: args['retry-filestore-404'],
|
||||
BUFFER_DIR_PREFIX: args['buffer-dir-prefix'],
|
||||
SLEEP_BEFORE_EXIT: args['sleep-before-exit-ms'],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -229,6 +273,15 @@ const {
|
||||
LOGGING_IDENTIFIER,
|
||||
PROJECT_IDS_FROM,
|
||||
DISPLAY_REPORT,
|
||||
CONCURRENCY,
|
||||
CONCURRENT_BATCHES,
|
||||
RETRIES,
|
||||
RETRY_DELAY_MS,
|
||||
RETRY_FILESTORE_404,
|
||||
BUFFER_DIR_PREFIX,
|
||||
STREAM_HIGH_WATER_MARK,
|
||||
LOGGING_INTERVAL,
|
||||
SLEEP_BEFORE_EXIT,
|
||||
} = parseArgs()
|
||||
|
||||
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
|
||||
@@ -236,26 +289,12 @@ if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
|
||||
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
|
||||
}
|
||||
|
||||
// Concurrency for downloading from GCS and updating hashes in mongo
|
||||
const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
|
||||
const CONCURRENT_BATCHES = parseInt(process.env.CONCURRENT_BATCHES || '2', 10)
|
||||
// Retries for processing a given file
|
||||
const RETRIES = parseInt(process.env.RETRIES || '10', 10)
|
||||
const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10)
|
||||
|
||||
const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true'
|
||||
const BUFFER_DIR = fs.mkdtempSync(
|
||||
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
|
||||
)
|
||||
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
|
||||
const STREAM_HIGH_WATER_MARK = parseInt(
|
||||
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
|
||||
10
|
||||
)
|
||||
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
|
||||
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
|
||||
const BUFFER_DIR = fs.mkdtempSync(BUFFER_DIR_PREFIX)
|
||||
|
||||
// Log output to a file
|
||||
if (OUTPUT_FILE !== '-') {
|
||||
console.warn(`Writing logs into ${OUTPUT_FILE}`)
|
||||
}
|
||||
logger.initialize('file-migration', {
|
||||
streams: [
|
||||
{
|
||||
@@ -313,6 +352,7 @@ async function getStatsForCollection(
|
||||
projectsWithAllHashes: 0,
|
||||
fileCount: 0,
|
||||
fileWithHashCount: 0,
|
||||
fileMissingInHistoryCount: 0,
|
||||
}
|
||||
// Pick a random sample of projects and estimate the number of files without hashes
|
||||
const result = await collection
|
||||
@@ -339,25 +379,44 @@ async function getStatsForCollection(
|
||||
const filesWithoutHash = fileTree.match(/\{"_id":"[0-9a-f]{24}"\}/g) || []
|
||||
// count the number of files with a hash, these are uniquely identified
|
||||
// by the number of "hash" strings due to the filtering
|
||||
const filesWithHash = fileTree.match(/"hash"/g) || []
|
||||
const filesWithHash = fileTree.match(/"hash":"[0-9a-f]{40}"/g) || []
|
||||
stats.fileCount += filesWithoutHash.length + filesWithHash.length
|
||||
stats.fileWithHashCount += filesWithHash.length
|
||||
stats.projectCount++
|
||||
stats.projectsWithAllHashes += filesWithoutHash.length === 0 ? 1 : 0
|
||||
const projectId = project._id.toString()
|
||||
const { blobs: perProjectBlobs } = await getProjectBlobsBatch([projectId])
|
||||
const blobs = new Set(
|
||||
(perProjectBlobs.get(projectId) || []).map(b => b.getHash())
|
||||
)
|
||||
const uniqueHashes = new Set(filesWithHash.map(m => m.slice(8, 48)))
|
||||
for (const hash of uniqueHashes) {
|
||||
if (blobs.has(hash) || GLOBAL_BLOBS.has(hash)) continue
|
||||
stats.fileMissingInHistoryCount++
|
||||
}
|
||||
}
|
||||
console.log(`Sampled stats for ${name}:`)
|
||||
const fractionSampled = stats.projectCount / collectionCount
|
||||
const percentageSampled = (fractionSampled * 100).toFixed(1)
|
||||
const percentageSampled = (fractionSampled * 100).toFixed(0)
|
||||
const fractionConverted = stats.projectsWithAllHashes / stats.projectCount
|
||||
const percentageConverted = (fractionConverted * 100).toFixed(1)
|
||||
const fractionToBackFill = 1 - fractionConverted
|
||||
const percentageToBackFill = (fractionToBackFill * 100).toFixed(0)
|
||||
const fractionMissing = stats.fileMissingInHistoryCount / stats.fileCount
|
||||
const percentageMissing = (fractionMissing * 100).toFixed(0)
|
||||
console.log(
|
||||
`- Sampled ${name}: ${stats.projectCount} (${percentageSampled}%)`
|
||||
`- Sampled ${name}: ${stats.projectCount} (${percentageSampled}% of all ${name})`
|
||||
)
|
||||
console.log(
|
||||
`- Sampled ${name} with all hashes present: ${stats.projectsWithAllHashes}`
|
||||
)
|
||||
console.log(
|
||||
`- Percentage of ${name} converted: ${percentageConverted}% (estimated)`
|
||||
`- Percentage of ${name} that need back-filling hashes: ${percentageToBackFill}% (estimated)`
|
||||
)
|
||||
console.log(
|
||||
`- Sampled ${name} have ${stats.fileCount} files that need to be checked against the full project history system.`
|
||||
)
|
||||
console.log(
|
||||
`- Sampled ${name} have ${stats.fileMissingInHistoryCount} files that need to be uploaded to the full project history system (estimating ${percentageMissing}% of all files).`
|
||||
)
|
||||
}
|
||||
|
||||
@@ -366,13 +425,15 @@ async function getStatsForCollection(
|
||||
* including counts and estimated progress based on a sample.
|
||||
*/
|
||||
async function displayReport() {
|
||||
const projectsCountResult = await projectsCollection.countDocuments()
|
||||
const projectsCountResult = await projectsCollection.estimatedDocumentCount()
|
||||
const deletedProjectsCountResult =
|
||||
await deletedProjectsCollection.countDocuments()
|
||||
await deletedProjectsCollection.estimatedDocumentCount()
|
||||
const sampleSize = 1000
|
||||
console.log('Current status:')
|
||||
console.log(`- Projects: ${projectsCountResult}`)
|
||||
console.log(`- Deleted projects: ${deletedProjectsCountResult}`)
|
||||
console.log(`- Total number of projects: ${projectsCountResult}`)
|
||||
console.log(
|
||||
`- Total number of deleted projects: ${deletedProjectsCountResult}`
|
||||
)
|
||||
console.log(`Sampling ${sampleSize} projects to estimate progress...`)
|
||||
await getStatsForCollection(
|
||||
sampleSize,
|
||||
@@ -392,7 +453,7 @@ async function displayReport() {
|
||||
)
|
||||
}
|
||||
|
||||
// Filestore endpoint location
|
||||
// Filestore endpoint location (configured by /etc/overleaf/env.sh)
|
||||
const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
|
||||
const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
|
||||
|
||||
@@ -1370,7 +1431,18 @@ async function main() {
|
||||
console.warn('Done.')
|
||||
}
|
||||
|
||||
async function cleanupBufferDir() {
|
||||
try {
|
||||
// Perform non-recursive removal of the BUFFER_DIR. Individual files
|
||||
// should get removed in parallel as part of batch processing.
|
||||
await fs.promises.rmdir(BUFFER_DIR)
|
||||
} catch (err) {
|
||||
console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
|
||||
}
|
||||
}
|
||||
|
||||
if (DISPLAY_REPORT) {
|
||||
await cleanupBufferDir()
|
||||
console.warn('Displaying report...')
|
||||
await displayReport()
|
||||
process.exit(0)
|
||||
@@ -1381,13 +1453,7 @@ try {
|
||||
await main()
|
||||
} finally {
|
||||
printStats(true)
|
||||
try {
|
||||
// Perform non-recursive removal of the BUFFER_DIR. Individual files
|
||||
// should get removed in parallel as part of batch processing.
|
||||
await fs.promises.rmdir(BUFFER_DIR)
|
||||
} catch (err) {
|
||||
console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
|
||||
}
|
||||
await cleanupBufferDir()
|
||||
}
|
||||
|
||||
let code = 0
|
||||
|
||||
@@ -63,7 +63,6 @@ const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false
|
||||
|
||||
describe('back_fill_file_hash script', function () {
|
||||
this.timeout(TIMEOUT)
|
||||
const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs'
|
||||
|
||||
const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z')
|
||||
const projectId1 = objectIdFromTime('2017-01-01T00:01:00Z')
|
||||
@@ -480,20 +479,16 @@ describe('back_fill_file_hash script', function () {
|
||||
|
||||
/**
|
||||
* @param {Array<string>} args
|
||||
* @param {Record<string, string>} env
|
||||
* @param {boolean} shouldHaveWritten
|
||||
* @return {Promise<{result, stats: any}>}
|
||||
* @return {Promise<{result: { stdout: string, stderr: string, status: number }, stats: any}>}
|
||||
*/
|
||||
async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
|
||||
async function rawRunScript(args = []) {
|
||||
let result
|
||||
try {
|
||||
result = await promisify(execFile)(
|
||||
process.argv0,
|
||||
[
|
||||
'storage/scripts/back_fill_file_hash.mjs',
|
||||
'--output=-',
|
||||
'--projects',
|
||||
'--deleted-projects',
|
||||
'--sleep-before-exit-ms=1',
|
||||
...args,
|
||||
],
|
||||
{
|
||||
@@ -501,9 +496,7 @@ describe('back_fill_file_hash script', function () {
|
||||
timeout: TIMEOUT - 500,
|
||||
env: {
|
||||
...process.env,
|
||||
USER_FILES_BUCKET_NAME,
|
||||
SLEEP_BEFORE_EXIT: '1',
|
||||
...env,
|
||||
AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE: '1',
|
||||
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
|
||||
},
|
||||
}
|
||||
@@ -516,9 +509,25 @@ describe('back_fill_file_hash script', function () {
|
||||
}
|
||||
result = { stdout, stderr, status: code }
|
||||
}
|
||||
// Ensure no tmp folder is left behind.
|
||||
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
|
||||
/back_fill_file_hash/
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Array<string>} args
|
||||
* @param {boolean} shouldHaveWritten
|
||||
* @return {Promise<{result, stats: any}>}
|
||||
*/
|
||||
async function tryRunScript(args = [], shouldHaveWritten) {
|
||||
const result = await rawRunScript([
|
||||
'--output=-',
|
||||
'--projects',
|
||||
'--deleted-projects',
|
||||
...args,
|
||||
])
|
||||
const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond']
|
||||
const stats = JSON.parse(
|
||||
result.stderr
|
||||
@@ -549,12 +558,11 @@ describe('back_fill_file_hash script', function () {
|
||||
|
||||
/**
|
||||
* @param {Array<string>} args
|
||||
* @param {Record<string, string>} env
|
||||
* @param {boolean} shouldHaveWritten
|
||||
* @return {Promise<{result, stats: any}>}
|
||||
*/
|
||||
async function runScript(args = [], env = {}, shouldHaveWritten = true) {
|
||||
const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
|
||||
async function runScript(args = [], shouldHaveWritten = true) {
|
||||
const { stats, result } = await tryRunScript(args, shouldHaveWritten)
|
||||
if (result.status !== 0) {
|
||||
console.log(result)
|
||||
expect(result).to.have.property('status', 0)
|
||||
@@ -803,7 +811,6 @@ describe('back_fill_file_hash script', function () {
|
||||
it('should process nothing on re-run', async function () {
|
||||
const rerun = await runScript(
|
||||
!processHashedFiles ? ['--skip-hashed-files'] : [],
|
||||
{},
|
||||
false
|
||||
)
|
||||
let stats = {
|
||||
@@ -928,10 +935,11 @@ describe('back_fill_file_hash script', function () {
|
||||
it('should gracefully handle fatal errors', async function () {
|
||||
mockFilestore.deleteObject(projectId0, fileId0)
|
||||
const t0 = Date.now()
|
||||
const { stats, result } = await tryRunScript(['--skip-hashed-files'], {
|
||||
RETRIES: '10',
|
||||
RETRY_DELAY_MS: '1000',
|
||||
})
|
||||
const { stats, result } = await tryRunScript([
|
||||
'--skip-hashed-files',
|
||||
'--retries=10',
|
||||
'--retry-delay-ms=1000',
|
||||
])
|
||||
const t1 = Date.now()
|
||||
expectNotFoundError(result, 'failed to process file')
|
||||
expect(result.status).to.equal(1)
|
||||
@@ -963,11 +971,12 @@ describe('back_fill_file_hash script', function () {
|
||||
value: { stats, result },
|
||||
},
|
||||
] = await Promise.allSettled([
|
||||
tryRunScript(['--skip-hashed-files'], {
|
||||
RETRY_DELAY_MS: '100',
|
||||
RETRIES: '60',
|
||||
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
|
||||
}),
|
||||
tryRunScript([
|
||||
'--skip-hashed-files',
|
||||
'--retries=60',
|
||||
'--retry-delay-ms=1000',
|
||||
'--retry-filestore-404',
|
||||
]),
|
||||
restoreFileAfter5s(),
|
||||
])
|
||||
expectNotFoundError(result, 'failed to process file, trying again')
|
||||
@@ -989,9 +998,7 @@ describe('back_fill_file_hash script', function () {
|
||||
let output
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script', async function () {
|
||||
output = await runScript(['--skip-hashed-files'], {
|
||||
CONCURRENCY: '1',
|
||||
})
|
||||
output = await runScript(['--skip-hashed-files', '--concurrency=1'])
|
||||
})
|
||||
|
||||
/**
|
||||
@@ -1058,10 +1065,10 @@ describe('back_fill_file_hash script', function () {
|
||||
let output1, output2
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script without hashed files', async function () {
|
||||
output1 = await runScript(['--skip-hashed-files'], {})
|
||||
output1 = await runScript(['--skip-hashed-files'])
|
||||
})
|
||||
before('run script with hashed files', async function () {
|
||||
output2 = await runScript([], {})
|
||||
output2 = await runScript([])
|
||||
})
|
||||
it('should print stats for the first run without hashed files', function () {
|
||||
expect(output1.stats).deep.equal(STATS_ALL)
|
||||
@@ -1076,6 +1083,34 @@ describe('back_fill_file_hash script', function () {
|
||||
})
|
||||
commonAssertions(true)
|
||||
})
|
||||
describe('report mode', function () {
|
||||
let output
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script', async function () {
|
||||
output = await rawRunScript(['--report'])
|
||||
})
|
||||
it('should print the report', () => {
|
||||
expect(output.status).to.equal(0)
|
||||
expect(output.stdout).to.equal(`\
|
||||
Current status:
|
||||
- Total number of projects: 10
|
||||
- Total number of deleted projects: 5
|
||||
Sampling 1000 projects to estimate progress...
|
||||
Sampled stats for projects:
|
||||
- Sampled projects: 9 (90% of all projects)
|
||||
- Sampled projects with all hashes present: 5
|
||||
- Percentage of projects that need back-filling hashes: 44% (estimated)
|
||||
- Sampled projects have 11 files that need to be checked against the full project history system.
|
||||
- Sampled projects have 3 files that need to be uploaded to the full project history system (estimating 27% of all files).
|
||||
Sampled stats for deleted projects:
|
||||
- Sampled deleted projects: 4 (80% of all deleted projects)
|
||||
- Sampled deleted projects with all hashes present: 3
|
||||
- Percentage of deleted projects that need back-filling hashes: 25% (estimated)
|
||||
- Sampled deleted projects have 2 files that need to be checked against the full project history system.
|
||||
- Sampled deleted projects have 1 files that need to be uploaded to the full project history system (estimating 50% of all files).
|
||||
`)
|
||||
})
|
||||
})
|
||||
|
||||
describe('full run in dry-run mode', function () {
|
||||
let output
|
||||
@@ -1089,13 +1124,7 @@ describe('back_fill_file_hash script', function () {
|
||||
.toArray()
|
||||
})
|
||||
before('run script', async function () {
|
||||
output = await runScript(
|
||||
['--dry-run'],
|
||||
{
|
||||
CONCURRENCY: '1',
|
||||
},
|
||||
false
|
||||
)
|
||||
output = await runScript(['--dry-run', '--concurrency=1'], false)
|
||||
})
|
||||
|
||||
it('should print stats for dry-run mode', function () {
|
||||
@@ -1136,9 +1165,7 @@ describe('back_fill_file_hash script', function () {
|
||||
let output
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script', async function () {
|
||||
output = await runScript(['--skip-hashed-files'], {
|
||||
CONCURRENCY: '10',
|
||||
})
|
||||
output = await runScript(['--skip-hashed-files', '--concurrency=10'])
|
||||
})
|
||||
it('should print stats', function () {
|
||||
expect(output.stats).deep.equal(STATS_ALL)
|
||||
@@ -1146,13 +1173,14 @@ describe('back_fill_file_hash script', function () {
|
||||
commonAssertions()
|
||||
})
|
||||
|
||||
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
|
||||
describe('full run STREAM_HIGH_WATER_MARK=64kiB', function () {
|
||||
let output
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script', async function () {
|
||||
output = await runScript(['--skip-hashed-files'], {
|
||||
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
|
||||
})
|
||||
output = await runScript([
|
||||
'--skip-hashed-files',
|
||||
`--stream-high-water-mark=${64 * 1024}`,
|
||||
])
|
||||
})
|
||||
it('should print stats', function () {
|
||||
expect(output.stats).deep.equal(STATS_ALL)
|
||||
@@ -1164,7 +1192,7 @@ describe('back_fill_file_hash script', function () {
|
||||
let output
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script', async function () {
|
||||
output = await runScript([], {})
|
||||
output = await runScript([])
|
||||
})
|
||||
it('should print stats', function () {
|
||||
expect(output.stats).deep.equal(
|
||||
@@ -1193,9 +1221,7 @@ describe('back_fill_file_hash script', function () {
|
||||
})
|
||||
let output
|
||||
before('run script', async function () {
|
||||
output = await runScript(['--skip-hashed-files'], {
|
||||
CONCURRENCY: '1',
|
||||
})
|
||||
output = await runScript(['--skip-hashed-files', '--concurrency=1'])
|
||||
})
|
||||
|
||||
it('should print stats', function () {
|
||||
@@ -1214,20 +1240,18 @@ describe('back_fill_file_hash script', function () {
|
||||
let outputPart0, outputPart1
|
||||
before('prepare environment', prepareEnvironment)
|
||||
before('run script on part 0', async function () {
|
||||
outputPart0 = await runScript(
|
||||
['--skip-hashed-files', `--BATCH_RANGE_END=${edge}`],
|
||||
{
|
||||
CONCURRENCY: '1',
|
||||
}
|
||||
)
|
||||
outputPart0 = await runScript([
|
||||
'--skip-hashed-files',
|
||||
`--BATCH_RANGE_END=${edge}`,
|
||||
'--concurrency=1',
|
||||
])
|
||||
})
|
||||
before('run script on part 1', async function () {
|
||||
outputPart1 = await runScript(
|
||||
['--skip-hashed-files', `--BATCH_RANGE_START=${edge}`],
|
||||
{
|
||||
CONCURRENCY: '1',
|
||||
}
|
||||
)
|
||||
outputPart1 = await runScript([
|
||||
'--skip-hashed-files',
|
||||
`--BATCH_RANGE_START=${edge}`,
|
||||
'--concurrency=1',
|
||||
])
|
||||
})
|
||||
|
||||
it('should print stats for part 0', function () {
|
||||
|
||||
@@ -49,7 +49,7 @@ async function main(trackProgress) {
|
||||
for (const project of projects) {
|
||||
projectsProcessed += 1
|
||||
if (projectsProcessed % 100000 === 0) {
|
||||
console.log(projectsProcessed, 'projects processed')
|
||||
console.warn(projectsProcessed, 'projects processed')
|
||||
}
|
||||
const projectId = project._id.toString()
|
||||
for (const { reason, path, _id } of processProject(project)) {
|
||||
|
||||
Reference in New Issue
Block a user