Merge pull request #23398 from overleaf/em-log-doc-hash-mismatches

Validate content hashes in history (log only)

GitOrigin-RevId: ed772fc4e4d0aa9e980f9693a759647bd937e13a
This commit is contained in:
Eric Mc Sween
2025-02-06 08:12:24 -05:00
committed by Copybot
parent 735fd761cd
commit ce4c8a4e47
5 changed files with 298 additions and 95 deletions

View File

@@ -100,6 +100,9 @@ class Change {
) )
} }
/**
* @return {Operation[]}
*/
getOperations() { getOperations() {
return this.operations return this.operations
} }
@@ -248,6 +251,24 @@ class Change {
* @param {boolean} [opts.strict] - Do not ignore recoverable errors * @param {boolean} [opts.strict] - Do not ignore recoverable errors
*/ */
applyTo(snapshot, opts = {}) { applyTo(snapshot, opts = {}) {
// eslint-disable-next-line no-unused-vars
for (const operation of this.iterativelyApplyTo(snapshot, opts)) {
// Nothing to do: we're just consuming the iterator for the side effects
}
}
/**
* Generator that applies this change to a snapshot and yields each
* operation after it has been applied.
*
* Recoverable errors (caused by historical bad data) are ignored unless
* opts.strict is true
*
* @param {Snapshot} snapshot modified in place
* @param {object} opts
* @param {boolean} [opts.strict] - Do not ignore recoverable errors
*/
*iterativelyApplyTo(snapshot, opts = {}) {
assert.object(snapshot, 'bad snapshot') assert.object(snapshot, 'bad snapshot')
for (const operation of this.operations) { for (const operation of this.operations) {
@@ -261,6 +282,7 @@ class Change {
throw err throw err
} }
} }
yield operation
} }
// update project version if present in change // update project version if present in change

View File

@@ -0,0 +1,18 @@
// @ts-check
const { createHash } = require('node:crypto')
/**
* Compute a SHA-1 hash of the content
*
* This is used to validate incoming updates.
*
* @param {string} content
*/
function getContentHash(content) {
const hash = createHash('sha-1')
hash.update(content)
return hash.digest('hex')
}
module.exports = { getContentHash }

View File

@@ -0,0 +1,5 @@
const OError = require('@overleaf/o-error')
class InvalidChangeError extends OError {}
module.exports = { InvalidChangeError }

View File

@@ -3,6 +3,7 @@
'use strict' 'use strict'
const _ = require('lodash') const _ = require('lodash')
const logger = require('@overleaf/logger')
const core = require('overleaf-editor-core') const core = require('overleaf-editor-core')
const Chunk = core.Chunk const Chunk = core.Chunk
@@ -10,6 +11,9 @@ const History = core.History
const assert = require('./assert') const assert = require('./assert')
const chunkStore = require('./chunk_store') const chunkStore = require('./chunk_store')
const { BlobStore } = require('./blob_store')
const { InvalidChangeError } = require('./errors')
const { getContentHash } = require('./content_hash')
function countChangeBytes(change) { function countChangeBytes(change) {
// Note: This is not quite accurate, because the raw change may contain raw // Note: This is not quite accurate, because the raw change may contain raw
@@ -59,10 +63,18 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
assert.maybe.object(limits) assert.maybe.object(limits)
assert.integer(clientEndVersion) assert.integer(clientEndVersion)
const blobStore = new BlobStore(projectId)
let currentChunk let currentChunk
// currentSnapshot tracks the latest change that we're applying; we use it to
// check that the changes we are persisting are valid. /**
* currentSnapshot tracks the latest change that we're applying; we use it to
* check that the changes we are persisting are valid.
*
* @type {core.Snapshot}
*/
let currentSnapshot let currentSnapshot
let originalEndVersion let originalEndVersion
let changesToPersist let changesToPersist
@@ -83,22 +95,95 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
} }
} }
function fillChunk(chunk, changes) { /**
* Add changes to a chunk until the chunk is full
*
* The chunk is full if it reaches a certain number of changes or a certain
* size in bytes
*
* @param {core.Chunk} chunk
* @param {core.Change[]} changes
*/
async function fillChunk(chunk, changes) {
let totalBytes = totalChangeBytes(chunk.getChanges()) let totalBytes = totalChangeBytes(chunk.getChanges())
let changesPushed = false let changesPushed = false
while (changes.length > 0) { while (changes.length > 0) {
if (chunk.getChanges().length >= limits.maxChunkChanges) break if (chunk.getChanges().length >= limits.maxChunkChanges) {
const changeBytes = countChangeBytes(changes[0]) break
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) break }
const changesToFill = changes.splice(0, 1)
currentSnapshot.applyAll(changesToFill, { strict: true }) const change = changes[0]
chunk.pushChanges(changesToFill) const changeBytes = countChangeBytes(change)
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
break
}
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
strict: true,
})) {
try {
await validateContentHash(operation)
} catch (err) {
// Temporary: skip validation errors
if (err instanceof InvalidChangeError) {
logger.warn(
{ err, projectId },
'content snapshot mismatch (ignored)'
)
} else {
throw err
}
}
}
chunk.pushChanges([change])
changes.shift()
totalBytes += changeBytes totalBytes += changeBytes
changesPushed = true changesPushed = true
} }
return changesPushed return changesPushed
} }
/**
* Check that the operation is valid and can be incorporated to the history.
*
* For now, this checks content hashes when they are provided.
*
* @param {core.Operation} operation
*/
async function validateContentHash(operation) {
if (operation instanceof core.EditFileOperation) {
const editOperation = operation.getOperation()
if (
editOperation instanceof core.TextOperation &&
editOperation.contentHash != null
) {
const path = operation.getPathname()
const file = currentSnapshot.getFile(path)
if (file == null) {
throw new InvalidChangeError('file not found for hash validation', {
projectId,
path,
})
}
await file.load('eager', blobStore)
const content = file.getContent({ filterTrackedDeletes: true })
const expectedHash = editOperation.contentHash
const actualHash = content != null ? getContentHash(content) : null
logger.debug({ expectedHash, actualHash }, 'validating content hash')
if (actualHash !== expectedHash) {
throw new InvalidChangeError('content hash mismatch', {
projectId,
path,
expectedHash,
actualHash,
})
}
}
}
}
async function extendLastChunkIfPossible() { async function extendLastChunkIfPossible() {
const latestChunk = await chunkStore.loadLatest(projectId) const latestChunk = await chunkStore.loadLatest(projectId)
@@ -115,7 +200,11 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
const timer = new Timer() const timer = new Timer()
currentSnapshot.applyAll(latestChunk.getChanges()) currentSnapshot.applyAll(latestChunk.getChanges())
if (!fillChunk(currentChunk, changesToPersist)) return const changesPushed = await fillChunk(currentChunk, changesToPersist)
if (!changesPushed) {
return
}
checkElapsedTime(timer) checkElapsedTime(timer)
await chunkStore.update(projectId, originalEndVersion, currentChunk) await chunkStore.update(projectId, originalEndVersion, currentChunk)
@@ -127,7 +216,9 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
const history = new History(currentSnapshot.clone(), []) const history = new History(currentSnapshot.clone(), [])
const chunk = new Chunk(history, endVersion) const chunk = new Chunk(history, endVersion)
const timer = new Timer() const timer = new Timer()
if (fillChunk(chunk, changesToPersist)) {
const changesPushed = await fillChunk(chunk, changesToPersist)
if (changesPushed) {
checkElapsedTime(timer) checkElapsedTime(timer)
currentChunk = chunk currentChunk = chunk
await chunkStore.create(projectId, chunk) await chunkStore.create(projectId, chunk)

View File

@@ -1,5 +1,6 @@
'use strict' 'use strict'
const { createHash } = require('node:crypto')
const { expect } = require('chai') const { expect } = require('chai')
const cleanup = require('./support/cleanup') const cleanup = require('./support/cleanup')
@@ -11,6 +12,8 @@ const persistChanges = storage.persistChanges
const core = require('overleaf-editor-core') const core = require('overleaf-editor-core')
const AddFileOperation = core.AddFileOperation const AddFileOperation = core.AddFileOperation
const EditFileOperation = core.EditFileOperation
const TextOperation = core.TextOperation
const Change = core.Change const Change = core.Change
const Chunk = core.Chunk const Chunk = core.Chunk
const File = core.File const File = core.File
@@ -28,7 +31,7 @@ describe('persistChanges', function () {
farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000) farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000)
}) })
it('persists changes', function () { it('persists changes', async function () {
const limitsToPersistImmediately = { const limitsToPersistImmediately = {
minChangeTimestamp: farFuture, minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture, maxChangeTimestamp: farFuture,
@@ -41,29 +44,29 @@ describe('persistChanges', function () {
) )
const changes = [change] const changes = [change]
return chunkStore await chunkStore.initializeProject(projectId)
.initializeProject(projectId) const result = await persistChanges(
.then(() => { projectId,
return persistChanges(projectId, changes, limitsToPersistImmediately, 0) changes,
}) limitsToPersistImmediately,
.then(result => { 0
const history = new History(new Snapshot(), changes) )
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({ const history = new History(new Snapshot(), changes)
numberOfChangesPersisted: 1, const currentChunk = new Chunk(history, 0)
originalEndVersion: 0, expect(result).to.deep.equal({
currentChunk, numberOfChangesPersisted: 1,
}) originalEndVersion: 0,
return chunkStore.loadLatest(projectId) currentChunk,
}) })
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(0) const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getEndVersion()).to.equal(1) expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getChanges().length).to.equal(1) expect(chunk.getEndVersion()).to.equal(1)
}) expect(chunk.getChanges().length).to.equal(1)
}) })
it('persists changes in two chunks', function () { it('persists changes in two chunks', async function () {
const limitsToPersistImmediately = { const limitsToPersistImmediately = {
maxChunkChanges: 1, maxChunkChanges: 1,
minChangeTimestamp: farFuture, minChangeTimestamp: farFuture,
@@ -82,36 +85,36 @@ describe('persistChanges', function () {
) )
const changes = [firstChange, secondChange] const changes = [firstChange, secondChange]
return chunkStore await chunkStore.initializeProject(projectId)
.initializeProject(projectId) const result = await persistChanges(
.then(() => { projectId,
return persistChanges(projectId, changes, limitsToPersistImmediately, 0) changes,
}) limitsToPersistImmediately,
.then(result => { 0
const snapshot = Snapshot.fromRaw({ )
files: {
'a.tex': { const snapshot = Snapshot.fromRaw({
content: '', files: {
}, 'a.tex': {
}, content: '',
}) },
const history = new History(snapshot, [secondChange]) },
const currentChunk = new Chunk(history, 1) })
expect(result).to.deep.equal({ const history = new History(snapshot, [secondChange])
numberOfChangesPersisted: 2, const currentChunk = new Chunk(history, 1)
originalEndVersion: 0, expect(result).to.deep.equal({
currentChunk, numberOfChangesPersisted: 2,
}) originalEndVersion: 0,
return chunkStore.loadLatest(projectId) currentChunk,
}) })
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(1) const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getEndVersion()).to.equal(2) expect(chunk.getStartVersion()).to.equal(1)
expect(chunk.getChanges().length).to.equal(1) expect(chunk.getEndVersion()).to.equal(2)
}) expect(chunk.getChanges().length).to.equal(1)
}) })
it('persists the snapshot at the start of the chunk', function () { it('persists the snapshot at the start of the chunk', async function () {
const limitsToPersistImmediately = { const limitsToPersistImmediately = {
maxChunkChanges: 2, maxChunkChanges: 2,
minChangeTimestamp: farFuture, minChangeTimestamp: farFuture,
@@ -130,29 +133,29 @@ describe('persistChanges', function () {
) )
const changes = [firstChange, secondChange] const changes = [firstChange, secondChange]
return chunkStore await chunkStore.initializeProject(projectId)
.initializeProject(projectId) const result = await persistChanges(
.then(() => { projectId,
return persistChanges(projectId, changes, limitsToPersistImmediately, 0) changes,
}) limitsToPersistImmediately,
.then(result => { 0
const history = new History(new Snapshot(), changes) )
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({ const history = new History(new Snapshot(), changes)
numberOfChangesPersisted: 2, const currentChunk = new Chunk(history, 0)
originalEndVersion: 0, expect(result).to.deep.equal({
currentChunk, numberOfChangesPersisted: 2,
}) originalEndVersion: 0,
return chunkStore.loadLatest(projectId) currentChunk,
}) })
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(0) const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getEndVersion()).to.equal(2) expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getChanges().length).to.equal(2) expect(chunk.getEndVersion()).to.equal(2)
}) expect(chunk.getChanges().length).to.equal(2)
}) })
it("errors if the version doesn't match the latest chunk", function () { it("errors if the version doesn't match the latest chunk", async function () {
const limitsToPersistImmediately = { const limitsToPersistImmediately = {
minChangeTimestamp: farFuture, minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture, maxChangeTimestamp: farFuture,
@@ -169,18 +172,82 @@ describe('persistChanges', function () {
[] []
) )
const changes = [firstChange, secondChange] const changes = [firstChange, secondChange]
return chunkStore
.initializeProject(projectId) await chunkStore.initializeProject(projectId)
.then(() => { await expect(
return persistChanges(projectId, changes, limitsToPersistImmediately, 1) persistChanges(projectId, changes, limitsToPersistImmediately, 1)
}) ).to.be.rejectedWith(
.then(() => { 'client sent updates with end_version 1 but latest chunk has end_version 0'
expect.fail() )
}) })
.catch(err => {
expect(err.message).to.equal( describe('content hash validation', function () {
'client sent updates with end_version 1 but latest chunk has end_version 0' it('acccepts a change with a valid hash', async function () {
) const limitsToPersistImmediately = {
}) minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
}
const projectId = fixtures.docs.uninitializedProject.id
await chunkStore.initializeProject(projectId)
const textOperation = new TextOperation()
textOperation.insert('hello ')
textOperation.retain(5)
textOperation.contentHash = hashString('hello world')
const change = new Change(
[
new AddFileOperation('a.tex', File.fromString('world')),
new EditFileOperation('a.tex', textOperation),
],
new Date(),
[]
)
const changes = [change]
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
expect(result.numberOfChangesPersisted).to.equal(1)
})
it('acccepts a change with an invalid hash (only logs for now)', async function () {
const limitsToPersistImmediately = {
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
}
const projectId = fixtures.docs.uninitializedProject.id
await chunkStore.initializeProject(projectId)
const textOperation = new TextOperation()
textOperation.insert('hello ')
textOperation.retain(5)
textOperation.contentHash = hashString('bad hash')
const change = new Change(
[
new AddFileOperation('a.tex', File.fromString('world')),
new EditFileOperation('a.tex', textOperation),
],
new Date(),
[]
)
const changes = [change]
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
expect(result.numberOfChangesPersisted).to.equal(1)
})
}) })
}) })
function hashString(s) {
const hash = createHash('sha-1')
hash.update(s)
return hash.digest('hex')
}