[history-v1] speed up tests for back-fill script (#23440)

* [history-v1] speed up tests for back-fill script

* [history-v1] move "should process nothing on re-run" test to the end

GitOrigin-RevId: d5180cfaefca3c46c440230eb3566539de1582bd
This commit is contained in:
Jakob Ackermann
2025-02-06 11:40:16 +00:00
committed by Copybot
parent dd379487be
commit 48448c626d
@@ -260,9 +260,7 @@ describe('back_fill_file_hash script', function () {
}
}
beforeEach(cleanup.everything)
beforeEach('populate mongo', async function () {
async function populateMongo() {
await globalBlobs.insertMany([
{ _id: gitBlobHash(fileId6), byteLength: 24, stringLength: 24 },
{ _id: gitBlobHash(fileId8), byteLength: 24, stringLength: 24 },
@@ -457,7 +455,9 @@ describe('back_fill_file_hash script', function () {
},
{ _id: fileIdDeleted5, projectId: projectId0 },
])
}
async function populateHistoryV1() {
await Promise.all([
testProjects.createEmptyProject(historyId0.toString()),
testProjects.createEmptyProject(historyId1),
@@ -479,9 +479,9 @@ describe('back_fill_file_hash script', function () {
await blobStore2.putString(contentTextBlob2.toString())
const blobStoreBadFileTree = new BlobStore(historyIdBadFileTree0.toString())
await blobStoreBadFileTree.putString(contentTextBlob3.toString())
})
}
beforeEach('populate filestore', async function () {
async function populateFilestore() {
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`,
@@ -553,7 +553,14 @@ describe('back_fill_file_hash script', function () {
`${projectIdBadFileTree3}/${fileId9}`,
Stream.Readable.from([fileId9.toString()])
)
})
}
async function prepareEnvironment() {
await cleanup.everything()
await populateMongo()
await populateHistoryV1()
await populateFilestore()
}
/**
* @param {Array<string>} args
@@ -945,29 +952,6 @@ describe('back_fill_file_hash script', function () {
},
])
})
it('should process nothing on re-run', async function () {
const rerun = await runScript(
processHashedFiles ? ['--processHashedFiles=true'] : [],
{},
false
)
let stats = {
...STATS_ALL_ZERO,
// We still need to iterate over all the projects and blobs.
projects: 10,
blobs: 13,
backedUpBlobs: 13,
badFileTrees: 4,
}
if (processHashedFiles) {
stats = sumStats(stats, {
...STATS_ALL_ZERO,
blobs: 3,
backedUpBlobs: 3,
})
}
expect(rerun.stats).deep.equal(stats)
})
it('should have backed up all the files', async function () {
expect(tieringStorageClass).to.exist
const blobs = await listS3Bucket(projectBlobsBucket, tieringStorageClass)
@@ -1026,6 +1010,31 @@ describe('back_fill_file_hash script', function () {
expect(id).to.match(/^[a-f0-9]{24}$/)
}
})
// Technically, we should move the below test into its own environment to ensure it does not impact any assertions.
// Practically, this is slow and moving it to the end of the tests gets us there most of the way.
it('should process nothing on re-run', async function () {
const rerun = await runScript(
processHashedFiles ? ['--processHashedFiles=true'] : [],
{},
false
)
let stats = {
...STATS_ALL_ZERO,
// We still need to iterate over all the projects and blobs.
projects: 10,
blobs: 13,
backedUpBlobs: 13,
badFileTrees: 4,
}
if (processHashedFiles) {
stats = sumStats(stats, {
...STATS_ALL_ZERO,
blobs: 3,
backedUpBlobs: 3,
})
}
expect(rerun.stats).deep.equal(stats)
})
}
function expectNotFoundError(result, msg) {
@@ -1149,78 +1158,85 @@ describe('back_fill_file_hash script', function () {
STATS_UP_FROM_PROJECT1_ONWARD
)
it('should gracefully handle fatal errors', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`
)
const t0 = Date.now()
const { stats, result } = await tryRunScript([], {
RETRIES: '10',
RETRY_DELAY_MS: '1000',
})
const t1 = Date.now()
expectNotFoundError(result, 'failed to process file')
expect(result.status).to.equal(1)
expect(stats).to.deep.equal(
sumStats(STATS_ALL, {
...STATS_ALL_ZERO,
filesFailed: 1,
readFromGCSIngress: -24,
writeToAWSCount: -1,
writeToAWSEgress: -28,
writeToGCSCount: -1,
writeToGCSEgress: -24,
})
)
// should not retry 404
expect(result.stdout).to.not.include('failed to process file, trying again')
expect(t1 - t0).to.be.below(10_000)
})
describe('error cases', () => {
beforeEach('prepare environment', prepareEnvironment)
it('should retry on error', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`
)
const restoreFileAfter5s = async () => {
await setTimeout(5_000)
await FILESTORE_PERSISTOR.sendStream(
it('should gracefully handle fatal errors', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`,
Stream.Readable.from([fileId0.toString()])
`${projectId0}/${fileId0}`
)
const t0 = Date.now()
const { stats, result } = await tryRunScript([], {
RETRIES: '10',
RETRY_DELAY_MS: '1000',
})
const t1 = Date.now()
expectNotFoundError(result, 'failed to process file')
expect(result.status).to.equal(1)
expect(stats).to.deep.equal(
sumStats(STATS_ALL, {
...STATS_ALL_ZERO,
filesFailed: 1,
readFromGCSIngress: -24,
writeToAWSCount: -1,
writeToAWSEgress: -28,
writeToGCSCount: -1,
writeToGCSEgress: -24,
})
)
// should not retry 404
expect(result.stdout).to.not.include(
'failed to process file, trying again'
)
expect(t1 - t0).to.be.below(10_000)
})
it('should retry on error', async function () {
await FILESTORE_PERSISTOR.deleteObject(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`
)
const restoreFileAfter5s = async () => {
await setTimeout(5_000)
await FILESTORE_PERSISTOR.sendStream(
USER_FILES_BUCKET_NAME,
`${projectId0}/${fileId0}`,
Stream.Readable.from([fileId0.toString()])
)
}
// use Promise.allSettled to ensure the above sendStream call finishes before this test completes
const [
{
value: { stats, result },
},
] = await Promise.allSettled([
tryRunScript([], {
RETRY_DELAY_MS: '100',
RETRIES: '60',
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
}),
restoreFileAfter5s(),
])
expectNotFoundError(result, 'failed to process file, trying again')
expect(result.status).to.equal(0)
expect({ ...stats, filesRetries: 0, readFromGCSCount: 0 }).to.deep.equal({
...STATS_ALL,
filesRetries: 0,
readFromGCSCount: 0,
})
expect(stats.filesRetries).to.be.greaterThan(0, 'should have retried')
expect(stats.readFromGCSCount).to.be.greaterThan(
STATS_ALL.readFromGCSCount,
'should have read more times from GCS compared to normal operations'
)
}
// use Promise.allSettled to ensure the above sendStream call finishes before this test completes
const [
{
value: { stats, result },
},
] = await Promise.allSettled([
tryRunScript([], {
RETRY_DELAY_MS: '100',
RETRIES: '60',
RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
}),
restoreFileAfter5s(),
])
expectNotFoundError(result, 'failed to process file, trying again')
expect(result.status).to.equal(0)
expect({ ...stats, filesRetries: 0, readFromGCSCount: 0 }).to.deep.equal({
...STATS_ALL,
filesRetries: 0,
readFromGCSCount: 0,
})
expect(stats.filesRetries).to.be.greaterThan(0, 'should have retried')
expect(stats.readFromGCSCount).to.be.greaterThan(
STATS_ALL.readFromGCSCount,
'should have read more times from GCS compared to normal operations'
)
})
describe('full run CONCURRENCY=1', function () {
let output
beforeEach('run script', async function () {
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript([], {
CONCURRENCY: '1',
})
@@ -1229,6 +1245,7 @@ describe('back_fill_file_hash script', function () {
/**
* @param {ObjectId} projectId
* @param {string} msg
* @param {string} path
*/
function expectBadFileTreeMessage(projectId, msg, path) {
const line = output.result.stdout
@@ -1283,29 +1300,35 @@ describe('back_fill_file_hash script', function () {
)
})
commonAssertions()
})
describe('when processing hashed files later', function () {
let output
beforeEach('run script', async function () {
output = await runScript(['--processHashedFiles=true'], {})
})
it('should print stats', function () {
expect(output.stats).deep.equal({
...STATS_FILES_HASHED_EXTRA,
projects: 10,
blobs: 13,
backedUpBlobs: 13,
badFileTrees: 4,
mongoUpdates: 3,
})
})
commonAssertions(true)
describe('when processing hashed files later', function () {
let output1, output2
before('prepare environment', prepareEnvironment)
before('run script without hashed files', async function () {
output1 = await runScript([], {})
})
before('run script with hashed files', async function () {
output2 = await runScript(['--processHashedFiles=true'], {})
})
it('should print stats', function () {
expect(output1.stats).deep.equal(STATS_ALL)
expect(output2.stats).deep.equal({
...STATS_FILES_HASHED_EXTRA,
projects: 10,
blobs: 13,
backedUpBlobs: 13,
badFileTrees: 4,
mongoUpdates: 3,
})
})
commonAssertions(true)
})
describe('full run CONCURRENCY=10', function () {
let output
beforeEach('run script', async function () {
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript([], {
CONCURRENCY: '10',
})
@@ -1318,7 +1341,8 @@ describe('back_fill_file_hash script', function () {
describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
let output
beforeEach('run script', async function () {
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript([], {
STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
})
@@ -1331,7 +1355,8 @@ describe('back_fill_file_hash script', function () {
describe('when processing hashed files', function () {
let output
beforeEach('run script', async function () {
before('prepare environment', prepareEnvironment)
before('run script', async function () {
output = await runScript(['--processHashedFiles=true'], {})
})
it('should print stats', function () {
@@ -1343,7 +1368,8 @@ describe('back_fill_file_hash script', function () {
})
describe('with something in the bucket already', function () {
beforeEach('create a file in s3', async function () {
before('prepare environment', prepareEnvironment)
before('create a file in s3', async function () {
const buf = Buffer.from(fileId0.toString())
await backupPersistor.sendStream(
projectBlobsBucket,
@@ -1353,7 +1379,7 @@ describe('back_fill_file_hash script', function () {
)
})
let output
beforeEach('run script', async function () {
before('run script', async function () {
output = await runScript([], {
CONCURRENCY: '1',
})
@@ -1374,7 +1400,8 @@ describe('back_fill_file_hash script', function () {
})
describe('with something in the bucket and marked as processed', function () {
beforeEach('create a file in s3', async function () {
before('prepare environment', prepareEnvironment)
before('create a file in s3', async function () {
await backupPersistor.sendStream(
projectBlobsBucket,
makeProjectKey(historyId0, hashTextBlob0),
@@ -1389,7 +1416,7 @@ describe('back_fill_file_hash script', function () {
])
})
let output
beforeEach('run script', async function () {
before('run script', async function () {
output = await runScript([], {
CONCURRENCY: '1',
})
@@ -1414,12 +1441,13 @@ describe('back_fill_file_hash script', function () {
// part0: project0+project1, part1: project2 onwards
const edge = projectId1.toString()
let outputPart0, outputPart1
beforeEach('run script on part 0', async function () {
before('prepare environment', prepareEnvironment)
before('run script on part 0', async function () {
outputPart0 = await runScript([`--BATCH_RANGE_END=${edge}`], {
CONCURRENCY: '1',
})
})
beforeEach('run script on part 1', async function () {
before('run script on part 1', async function () {
outputPart1 = await runScript([`--BATCH_RANGE_START=${edge}`], {
CONCURRENCY: '1',
})
@@ -1435,7 +1463,8 @@ describe('back_fill_file_hash script', function () {
describe('projectIds from file', () => {
const path0 = '/tmp/project-ids-0.txt'
const path1 = '/tmp/project-ids-1.txt'
beforeEach('create project-ids.txt files', async function () {
before('prepare environment', prepareEnvironment)
before('create project-ids.txt files', async function () {
await fs.promises.writeFile(
path0,
[projectId0, projectId1].map(id => id.toString()).join('\n')
@@ -1463,10 +1492,10 @@ describe('back_fill_file_hash script', function () {
})
let outputPart0, outputPart1
beforeEach('run script on part 0', async function () {
before('run script on part 0', async function () {
outputPart0 = await runScript([`--projectIdsFrom=${path0}`])
})
beforeEach('run script on part 1', async function () {
before('run script on part 1', async function () {
outputPart1 = await runScript([`--projectIdsFrom=${path1}`])
})