diff --git a/libraries/overleaf-editor-core/lib/change.js b/libraries/overleaf-editor-core/lib/change.js index 63b49265fb..e36cda9ad3 100644 --- a/libraries/overleaf-editor-core/lib/change.js +++ b/libraries/overleaf-editor-core/lib/change.js @@ -100,6 +100,9 @@ class Change { ) } + /** + * @return {Operation[]} + */ getOperations() { return this.operations } @@ -248,6 +251,24 @@ class Change { * @param {boolean} [opts.strict] - Do not ignore recoverable errors */ applyTo(snapshot, opts = {}) { + // eslint-disable-next-line no-unused-vars + for (const operation of this.iterativelyApplyTo(snapshot, opts)) { + // Nothing to do: we're just consuming the iterator for the side effects + } + } + + /** + * Generator that applies this change to a snapshot and yields each + * operation after it has been applied. + * + * Recoverable errors (caused by historical bad data) are ignored unless + * opts.strict is true + * + * @param {Snapshot} snapshot modified in place + * @param {object} opts + * @param {boolean} [opts.strict] - Do not ignore recoverable errors + */ + *iterativelyApplyTo(snapshot, opts = {}) { assert.object(snapshot, 'bad snapshot') for (const operation of this.operations) { @@ -261,6 +282,7 @@ class Change { throw err } } + yield operation } // update project version if present in change diff --git a/services/history-v1/storage/lib/content_hash.js b/services/history-v1/storage/lib/content_hash.js new file mode 100644 index 0000000000..a381babc04 --- /dev/null +++ b/services/history-v1/storage/lib/content_hash.js @@ -0,0 +1,18 @@ +// @ts-check + +const { createHash } = require('node:crypto') + +/** + * Compute a SHA-1 hash of the content + * + * This is used to validate incoming updates. + * + * @param {string} content + */ +function getContentHash(content) { + const hash = createHash('sha-1') + hash.update(content) + return hash.digest('hex') +} + +module.exports = { getContentHash } diff --git a/services/history-v1/storage/lib/errors.js b/services/history-v1/storage/lib/errors.js new file mode 100644 index 0000000000..626536b079 --- /dev/null +++ b/services/history-v1/storage/lib/errors.js @@ -0,0 +1,5 @@ +const OError = require('@overleaf/o-error') + +class InvalidChangeError extends OError {} + +module.exports = { InvalidChangeError } diff --git a/services/history-v1/storage/lib/persist_changes.js b/services/history-v1/storage/lib/persist_changes.js index e01bf75b46..03d36fda1c 100644 --- a/services/history-v1/storage/lib/persist_changes.js +++ b/services/history-v1/storage/lib/persist_changes.js @@ -3,6 +3,7 @@ 'use strict' const _ = require('lodash') +const logger = require('@overleaf/logger') const core = require('overleaf-editor-core') const Chunk = core.Chunk @@ -10,6 +11,9 @@ const History = core.History const assert = require('./assert') const chunkStore = require('./chunk_store') +const { BlobStore } = require('./blob_store') +const { InvalidChangeError } = require('./errors') +const { getContentHash } = require('./content_hash') function countChangeBytes(change) { // Note: This is not quite accurate, because the raw change may contain raw @@ -59,10 +63,18 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) { assert.maybe.object(limits) assert.integer(clientEndVersion) + const blobStore = new BlobStore(projectId) + let currentChunk - // currentSnapshot tracks the latest change that we're applying; we use it to - // check that the changes we are persisting are valid. + + /** + * currentSnapshot tracks the latest change that we're applying; we use it to + * check that the changes we are persisting are valid. + * + * @type {core.Snapshot} + */ let currentSnapshot + let originalEndVersion let changesToPersist @@ -83,22 +95,95 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) { } } - function fillChunk(chunk, changes) { + /** + * Add changes to a chunk until the chunk is full + * + * The chunk is full if it reaches a certain number of changes or a certain + * size in bytes + * + * @param {core.Chunk} chunk + * @param {core.Change[]} changes + */ + async function fillChunk(chunk, changes) { let totalBytes = totalChangeBytes(chunk.getChanges()) let changesPushed = false while (changes.length > 0) { - if (chunk.getChanges().length >= limits.maxChunkChanges) break - const changeBytes = countChangeBytes(changes[0]) - if (totalBytes + changeBytes > limits.maxChunkChangeBytes) break - const changesToFill = changes.splice(0, 1) - currentSnapshot.applyAll(changesToFill, { strict: true }) - chunk.pushChanges(changesToFill) + if (chunk.getChanges().length >= limits.maxChunkChanges) { + break + } + + const change = changes[0] + const changeBytes = countChangeBytes(change) + + if (totalBytes + changeBytes > limits.maxChunkChangeBytes) { + break + } + + for (const operation of change.iterativelyApplyTo(currentSnapshot, { + strict: true, + })) { + try { + await validateContentHash(operation) + } catch (err) { + // Temporary: skip validation errors + if (err instanceof InvalidChangeError) { + logger.warn( + { err, projectId }, + 'content snapshot mismatch (ignored)' + ) + } else { + throw err + } + } + } + + chunk.pushChanges([change]) + changes.shift() totalBytes += changeBytes changesPushed = true } return changesPushed } + /** + * Check that the operation is valid and can be incorporated to the history. + * + * For now, this checks content hashes when they are provided. + * + * @param {core.Operation} operation + */ + async function validateContentHash(operation) { + if (operation instanceof core.EditFileOperation) { + const editOperation = operation.getOperation() + if ( + editOperation instanceof core.TextOperation && + editOperation.contentHash != null + ) { + const path = operation.getPathname() + const file = currentSnapshot.getFile(path) + if (file == null) { + throw new InvalidChangeError('file not found for hash validation', { + projectId, + path, + }) + } + await file.load('eager', blobStore) + const content = file.getContent({ filterTrackedDeletes: true }) + const expectedHash = editOperation.contentHash + const actualHash = content != null ? getContentHash(content) : null + logger.debug({ expectedHash, actualHash }, 'validating content hash') + if (actualHash !== expectedHash) { + throw new InvalidChangeError('content hash mismatch', { + projectId, + path, + expectedHash, + actualHash, + }) + } + } + } + } + async function extendLastChunkIfPossible() { const latestChunk = await chunkStore.loadLatest(projectId) @@ -115,7 +200,11 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) { const timer = new Timer() currentSnapshot.applyAll(latestChunk.getChanges()) - if (!fillChunk(currentChunk, changesToPersist)) return + const changesPushed = await fillChunk(currentChunk, changesToPersist) + if (!changesPushed) { + return + } + checkElapsedTime(timer) await chunkStore.update(projectId, originalEndVersion, currentChunk) @@ -127,7 +216,9 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) { const history = new History(currentSnapshot.clone(), []) const chunk = new Chunk(history, endVersion) const timer = new Timer() - if (fillChunk(chunk, changesToPersist)) { + + const changesPushed = await fillChunk(chunk, changesToPersist) + if (changesPushed) { checkElapsedTime(timer) currentChunk = chunk await chunkStore.create(projectId, chunk) diff --git a/services/history-v1/test/acceptance/js/storage/persist_changes.test.js b/services/history-v1/test/acceptance/js/storage/persist_changes.test.js index d661007c39..50eb505681 100644 --- a/services/history-v1/test/acceptance/js/storage/persist_changes.test.js +++ b/services/history-v1/test/acceptance/js/storage/persist_changes.test.js @@ -1,5 +1,6 @@ 'use strict' +const { createHash } = require('node:crypto') const { expect } = require('chai') const cleanup = require('./support/cleanup') @@ -11,6 +12,8 @@ const persistChanges = storage.persistChanges const core = require('overleaf-editor-core') const AddFileOperation = core.AddFileOperation +const EditFileOperation = core.EditFileOperation +const TextOperation = core.TextOperation const Change = core.Change const Chunk = core.Chunk const File = core.File @@ -28,7 +31,7 @@ describe('persistChanges', function () { farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000) }) - it('persists changes', function () { + it('persists changes', async function () { const limitsToPersistImmediately = { minChangeTimestamp: farFuture, maxChangeTimestamp: farFuture, @@ -41,29 +44,29 @@ describe('persistChanges', function () { ) const changes = [change] - return chunkStore - .initializeProject(projectId) - .then(() => { - return persistChanges(projectId, changes, limitsToPersistImmediately, 0) - }) - .then(result => { - const history = new History(new Snapshot(), changes) - const currentChunk = new Chunk(history, 0) - expect(result).to.deep.equal({ - numberOfChangesPersisted: 1, - originalEndVersion: 0, - currentChunk, - }) - return chunkStore.loadLatest(projectId) - }) - .then(chunk => { - expect(chunk.getStartVersion()).to.equal(0) - expect(chunk.getEndVersion()).to.equal(1) - expect(chunk.getChanges().length).to.equal(1) - }) + await chunkStore.initializeProject(projectId) + const result = await persistChanges( + projectId, + changes, + limitsToPersistImmediately, + 0 + ) + + const history = new History(new Snapshot(), changes) + const currentChunk = new Chunk(history, 0) + expect(result).to.deep.equal({ + numberOfChangesPersisted: 1, + originalEndVersion: 0, + currentChunk, + }) + + const chunk = await chunkStore.loadLatest(projectId) + expect(chunk.getStartVersion()).to.equal(0) + expect(chunk.getEndVersion()).to.equal(1) + expect(chunk.getChanges().length).to.equal(1) }) - it('persists changes in two chunks', function () { + it('persists changes in two chunks', async function () { const limitsToPersistImmediately = { maxChunkChanges: 1, minChangeTimestamp: farFuture, @@ -82,36 +85,36 @@ describe('persistChanges', function () { ) const changes = [firstChange, secondChange] - return chunkStore - .initializeProject(projectId) - .then(() => { - return persistChanges(projectId, changes, limitsToPersistImmediately, 0) - }) - .then(result => { - const snapshot = Snapshot.fromRaw({ - files: { - 'a.tex': { - content: '', - }, - }, - }) - const history = new History(snapshot, [secondChange]) - const currentChunk = new Chunk(history, 1) - expect(result).to.deep.equal({ - numberOfChangesPersisted: 2, - originalEndVersion: 0, - currentChunk, - }) - return chunkStore.loadLatest(projectId) - }) - .then(chunk => { - expect(chunk.getStartVersion()).to.equal(1) - expect(chunk.getEndVersion()).to.equal(2) - expect(chunk.getChanges().length).to.equal(1) - }) + await chunkStore.initializeProject(projectId) + const result = await persistChanges( + projectId, + changes, + limitsToPersistImmediately, + 0 + ) + + const snapshot = Snapshot.fromRaw({ + files: { + 'a.tex': { + content: '', + }, + }, + }) + const history = new History(snapshot, [secondChange]) + const currentChunk = new Chunk(history, 1) + expect(result).to.deep.equal({ + numberOfChangesPersisted: 2, + originalEndVersion: 0, + currentChunk, + }) + + const chunk = await chunkStore.loadLatest(projectId) + expect(chunk.getStartVersion()).to.equal(1) + expect(chunk.getEndVersion()).to.equal(2) + expect(chunk.getChanges().length).to.equal(1) }) - it('persists the snapshot at the start of the chunk', function () { + it('persists the snapshot at the start of the chunk', async function () { const limitsToPersistImmediately = { maxChunkChanges: 2, minChangeTimestamp: farFuture, @@ -130,29 +133,29 @@ describe('persistChanges', function () { ) const changes = [firstChange, secondChange] - return chunkStore - .initializeProject(projectId) - .then(() => { - return persistChanges(projectId, changes, limitsToPersistImmediately, 0) - }) - .then(result => { - const history = new History(new Snapshot(), changes) - const currentChunk = new Chunk(history, 0) - expect(result).to.deep.equal({ - numberOfChangesPersisted: 2, - originalEndVersion: 0, - currentChunk, - }) - return chunkStore.loadLatest(projectId) - }) - .then(chunk => { - expect(chunk.getStartVersion()).to.equal(0) - expect(chunk.getEndVersion()).to.equal(2) - expect(chunk.getChanges().length).to.equal(2) - }) + await chunkStore.initializeProject(projectId) + const result = await persistChanges( + projectId, + changes, + limitsToPersistImmediately, + 0 + ) + + const history = new History(new Snapshot(), changes) + const currentChunk = new Chunk(history, 0) + expect(result).to.deep.equal({ + numberOfChangesPersisted: 2, + originalEndVersion: 0, + currentChunk, + }) + + const chunk = await chunkStore.loadLatest(projectId) + expect(chunk.getStartVersion()).to.equal(0) + expect(chunk.getEndVersion()).to.equal(2) + expect(chunk.getChanges().length).to.equal(2) }) - it("errors if the version doesn't match the latest chunk", function () { + it("errors if the version doesn't match the latest chunk", async function () { const limitsToPersistImmediately = { minChangeTimestamp: farFuture, maxChangeTimestamp: farFuture, @@ -169,18 +172,82 @@ describe('persistChanges', function () { [] ) const changes = [firstChange, secondChange] - return chunkStore - .initializeProject(projectId) - .then(() => { - return persistChanges(projectId, changes, limitsToPersistImmediately, 1) - }) - .then(() => { - expect.fail() - }) - .catch(err => { - expect(err.message).to.equal( - 'client sent updates with end_version 1 but latest chunk has end_version 0' - ) - }) + + await chunkStore.initializeProject(projectId) + await expect( + persistChanges(projectId, changes, limitsToPersistImmediately, 1) + ).to.be.rejectedWith( + 'client sent updates with end_version 1 but latest chunk has end_version 0' + ) + }) + + describe('content hash validation', function () { + it('acccepts a change with a valid hash', async function () { + const limitsToPersistImmediately = { + minChangeTimestamp: farFuture, + maxChangeTimestamp: farFuture, + } + + const projectId = fixtures.docs.uninitializedProject.id + await chunkStore.initializeProject(projectId) + const textOperation = new TextOperation() + textOperation.insert('hello ') + textOperation.retain(5) + textOperation.contentHash = hashString('hello world') + const change = new Change( + [ + new AddFileOperation('a.tex', File.fromString('world')), + new EditFileOperation('a.tex', textOperation), + ], + new Date(), + [] + ) + const changes = [change] + + const result = await persistChanges( + projectId, + changes, + limitsToPersistImmediately, + 0 + ) + expect(result.numberOfChangesPersisted).to.equal(1) + }) + + it('acccepts a change with an invalid hash (only logs for now)', async function () { + const limitsToPersistImmediately = { + minChangeTimestamp: farFuture, + maxChangeTimestamp: farFuture, + } + + const projectId = fixtures.docs.uninitializedProject.id + await chunkStore.initializeProject(projectId) + const textOperation = new TextOperation() + textOperation.insert('hello ') + textOperation.retain(5) + textOperation.contentHash = hashString('bad hash') + const change = new Change( + [ + new AddFileOperation('a.tex', File.fromString('world')), + new EditFileOperation('a.tex', textOperation), + ], + new Date(), + [] + ) + const changes = [change] + + const result = await persistChanges( + projectId, + changes, + limitsToPersistImmediately, + 0 + ) + expect(result.numberOfChangesPersisted).to.equal(1) + }) }) }) + +function hashString(s) { + const hash = createHash('sha-1') + hash.update(s) + return hash.digest('hex') +}