From b918e234197309c06082fe48f2d683a6f46c894d Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Fri, 17 Apr 2026 09:11:23 +0100 Subject: [PATCH] Merge pull request #32780 from overleaf/bg-history-v1-check-chunk-script add script for checking chunks with resyncs GitOrigin-RevId: bf17bf262cf0a691a66c0d23256d53b606007461 --- .../storage/scripts/check_chunk.mjs | 290 ++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 services/history-v1/storage/scripts/check_chunk.mjs diff --git a/services/history-v1/storage/scripts/check_chunk.mjs b/services/history-v1/storage/scripts/check_chunk.mjs new file mode 100644 index 0000000000..ccc0a980d2 --- /dev/null +++ b/services/history-v1/storage/scripts/check_chunk.mjs @@ -0,0 +1,290 @@ +import commandLineArgs from 'command-line-args' +import { + loadAtVersion, + getProjectChunksFromVersion, +} from '../lib/chunk_store/index.js' +import { client } from '../lib/mongodb.js' +import knex from '../lib/knex.js' +import redis from '../lib/redis.js' +import { loadGlobalBlobs, BlobStore } from '../lib/blob_store/index.js' +import { getContentHash } from '../lib/content_hash.js' +import core from 'overleaf-editor-core' +import Events from 'node:events' +Events.setMaxListeners(20) + +const { StringFileData, LazyStringFileData } = core + +const optionDefinitions = [ + { name: 'historyId', alias: 'p', type: String }, + { name: 'version', alias: 'v', type: Number }, + { name: 'persistedOnly', alias: 'o', type: Boolean }, +] + +async function ensureFileLoaded(file, blobStore, currentVersion, path) { + if ( + typeof file.load === 'function' && + file.data instanceof LazyStringFileData + ) { + if (file.data.rangesHash) { + console.log( + `Loading rangesHash ${file.data.rangesHash} for ${path} at version ${currentVersion}` + ) + } else { + console.log( + `No rangesHash found for ${path} at version ${currentVersion}` + ) + } + await file.load('eager', blobStore) + console.log('=> file', file.toRaw()) + } +} + +function checkFileTrackedChanges(path, file, currentVersion) { + let violations = false + const positions = [] + if (!(file.data instanceof StringFileData)) { + return { violations, positions } + } + + let tcList + try { + tcList = file.getTrackedChanges() + } catch (err) { + return { violations, positions } + } + + if (!tcList) return { violations, positions } + + const changesArr = Array.from(tcList) + let prevTc = null + + for (const tc of changesArr) { + positions.push(`(${tc.range.start}, ${tc.range.end})`) + if (prevTc) { + if (prevTc.range.start > tc.range.start) { + console.error( + `VIOLATION: Unsorted ranges in ${path} at version ${currentVersion}: [${prevTc.range.start}, ${prevTc.range.end}] comes before [${tc.range.start}, ${tc.range.end}]` + ) + violations = true + } + if (prevTc.range.overlaps(tc.range)) { + console.error( + `VIOLATION: Overlapping ranges in ${path} at version ${currentVersion}: [${prevTc.range.start}, ${prevTc.range.end}] overlaps [${tc.range.start}, ${tc.range.end}]` + ) + violations = true + } + } + prevTc = tc + } + + return { violations, positions } +} + +async function checkSnapshot(snapshot, blobStore, currentVersion) { + let containsViolations = false + const pathnames = snapshot.getFilePathnames() + const diagnostics = [] + + for (const path of pathnames) { + const file = snapshot.getFile(path) + if (!file) continue + + try { + await ensureFileLoaded(file, blobStore, currentVersion, path) + } catch (err) { + console.error( + `Failed to load file ${path} at version ${currentVersion}:`, + err + ) + continue + } + + const { violations, positions } = checkFileTrackedChanges( + path, + file, + currentVersion + ) + if (violations) containsViolations = true + if (positions.length > 0) { + diagnostics.push(` ${path}: changes at [${positions.join(', ')}]`) + } + } + + if (diagnostics.length > 0) { + console.log(`Version ${currentVersion} tracked changes summary:`) + console.log(diagnostics.join('\n')) + } + + return containsViolations +} + +async function validateContentHash( + operation, + snapshot, + currentVersion, + blobStore +) { + if (operation instanceof core.EditFileOperation) { + const editOperation = operation.getOperation() + if ( + editOperation instanceof core.TextOperation && + editOperation.contentHash != null + ) { + const path = operation.getPathname() + const file = snapshot.getFile(path) + if (file == null) { + console.error( + `VIOLATION: file ${path} not found for hash validation at version ${currentVersion}` + ) + return true + } + try { + await ensureFileLoaded(file, blobStore, currentVersion, path) + } catch (err) { + console.error( + `Failed to load file ${path} for hash validation at version ${currentVersion}:`, + err + ) + return true + } + const content = file.getContent({ filterTrackedDeletes: true }) + const expectedHash = editOperation.contentHash + const actualHash = content != null ? getContentHash(content) : null + + if (actualHash !== expectedHash) { + console.error( + `VIOLATION: content hash mismatch in ${path} at version ${currentVersion}: expected ${expectedHash}, got ${actualHash}` + ) + return true + } + } + } + return false +} + +async function checkChunkChanges(historyId, chunk) { + const snapshot = chunk.getSnapshot().clone() + const blobStore = new BlobStore(historyId) + + const changes = chunk.getChanges() + let currentVersion = chunk.getStartVersion() + console.log( + `Checking chunk starting at version ${currentVersion} with ${changes.length} changes.` + ) + + const initialViolations = await checkSnapshot( + snapshot, + blobStore, + currentVersion + ) + if (initialViolations) { + console.log( + `Tracked changes corrupted at initial snapshot version ${currentVersion}` + ) + } + + for (const change of changes) { + currentVersion += 1 + let localViolations = false + if (change?.origin?.kind === 'history-resync') { + console.log('-'.repeat(16), 'history-resync', '-'.repeat(16)) + } + console.log( + `Version ${currentVersion} change:`, + JSON.stringify(change.toRaw()) + ) + if (change?.origin?.kind === 'history-resync') { + process.exit() + } + + try { + for (const _operation of change.iterativelyApplyTo(snapshot, { + strict: true, + })) { + console.log( + `Version ${currentVersion} operation:`, + JSON.stringify(_operation.toRaw()) + ) + const hashErr = await validateContentHash( + _operation, + snapshot, + currentVersion, + blobStore + ) + if (hashErr) localViolations = true + } + } catch (err) { + console.error(`Failed to apply change at version ${currentVersion}:`, err) + continue + } + + const snapViolations = await checkSnapshot( + snapshot, + blobStore, + currentVersion + ) + if (snapViolations) localViolations = true + + if (localViolations) { + console.log( + `Tracked changes corrupted or hash mismatch at version ${currentVersion}` + ) + console.log('Change was:', JSON.stringify(change.toRaw(), null, 2)) + } + } +} + +async function main() { + const options = commandLineArgs(optionDefinitions) + const { historyId, version, persistedOnly } = options + + if (!historyId) { + console.error('Error: --historyId is required.') + process.exit(1) + } + + await loadGlobalBlobs() + + if (version != null) { + const chunk = await loadAtVersion(historyId, version, { + persistedOnly: persistedOnly || false, + }) + if (!chunk) { + console.error(`Chunk not found at version ${version}`) + process.exit(1) + } + await checkChunkChanges(historyId, chunk) + } else { + let checkedAny = false + for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) { + const chunk = await loadAtVersion(historyId, chunkRecord.startVersion, { + persistedOnly: persistedOnly || false, + }) + if (chunk) { + checkedAny = true + await checkChunkChanges(historyId, chunk) + } else { + console.error( + `Failed to load chunk starting at ${chunkRecord.startVersion}` + ) + } + } + if (!checkedAny) { + console.log(`No chunks found for project ${historyId}`) + } + } +} + +main() + .then(() => console.log('Done.')) + .catch(err => { + console.error('Error:', err) + process.exit(1) + }) + .finally(() => { + knex.destroy().catch(err => console.error('Error closing Postgres:', err)) + client.close().catch(err => console.error('Error closing MongoDB:', err)) + redis + .disconnect() + .catch(err => console.error('Error disconnecting Redis:', err)) + })