Merge pull request #23398 from overleaf/em-log-doc-hash-mismatches

Validate content hashes in history (log only)

GitOrigin-RevId: ed772fc4e4d0aa9e980f9693a759647bd937e13a
This commit is contained in:
Eric Mc Sween
2025-02-06 08:12:24 -05:00
committed by Copybot
parent 735fd761cd
commit ce4c8a4e47
5 changed files with 298 additions and 95 deletions

View File

@@ -100,6 +100,9 @@ class Change {
)
}
/**
* @return {Operation[]}
*/
getOperations() {
return this.operations
}
@@ -248,6 +251,24 @@ class Change {
* @param {boolean} [opts.strict] - Do not ignore recoverable errors
*/
applyTo(snapshot, opts = {}) {
// eslint-disable-next-line no-unused-vars
for (const operation of this.iterativelyApplyTo(snapshot, opts)) {
// Nothing to do: we're just consuming the iterator for the side effects
}
}
/**
* Generator that applies this change to a snapshot and yields each
* operation after it has been applied.
*
* Recoverable errors (caused by historical bad data) are ignored unless
* opts.strict is true
*
* @param {Snapshot} snapshot modified in place
* @param {object} opts
* @param {boolean} [opts.strict] - Do not ignore recoverable errors
*/
*iterativelyApplyTo(snapshot, opts = {}) {
assert.object(snapshot, 'bad snapshot')
for (const operation of this.operations) {
@@ -261,6 +282,7 @@ class Change {
throw err
}
}
yield operation
}
// update project version if present in change

View File

@@ -0,0 +1,18 @@
// @ts-check
const { createHash } = require('node:crypto')
/**
* Compute a SHA-1 hash of the content
*
* This is used to validate incoming updates.
*
* @param {string} content
*/
function getContentHash(content) {
const hash = createHash('sha-1')
hash.update(content)
return hash.digest('hex')
}
module.exports = { getContentHash }

View File

@@ -0,0 +1,5 @@
const OError = require('@overleaf/o-error')
class InvalidChangeError extends OError {}
module.exports = { InvalidChangeError }

View File

@@ -3,6 +3,7 @@
'use strict'
const _ = require('lodash')
const logger = require('@overleaf/logger')
const core = require('overleaf-editor-core')
const Chunk = core.Chunk
@@ -10,6 +11,9 @@ const History = core.History
const assert = require('./assert')
const chunkStore = require('./chunk_store')
const { BlobStore } = require('./blob_store')
const { InvalidChangeError } = require('./errors')
const { getContentHash } = require('./content_hash')
function countChangeBytes(change) {
// Note: This is not quite accurate, because the raw change may contain raw
@@ -59,10 +63,18 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
assert.maybe.object(limits)
assert.integer(clientEndVersion)
const blobStore = new BlobStore(projectId)
let currentChunk
// currentSnapshot tracks the latest change that we're applying; we use it to
// check that the changes we are persisting are valid.
/**
* currentSnapshot tracks the latest change that we're applying; we use it to
* check that the changes we are persisting are valid.
*
* @type {core.Snapshot}
*/
let currentSnapshot
let originalEndVersion
let changesToPersist
@@ -83,22 +95,95 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
}
}
function fillChunk(chunk, changes) {
/**
* Add changes to a chunk until the chunk is full
*
* The chunk is full if it reaches a certain number of changes or a certain
* size in bytes
*
* @param {core.Chunk} chunk
* @param {core.Change[]} changes
*/
async function fillChunk(chunk, changes) {
let totalBytes = totalChangeBytes(chunk.getChanges())
let changesPushed = false
while (changes.length > 0) {
if (chunk.getChanges().length >= limits.maxChunkChanges) break
const changeBytes = countChangeBytes(changes[0])
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) break
const changesToFill = changes.splice(0, 1)
currentSnapshot.applyAll(changesToFill, { strict: true })
chunk.pushChanges(changesToFill)
if (chunk.getChanges().length >= limits.maxChunkChanges) {
break
}
const change = changes[0]
const changeBytes = countChangeBytes(change)
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
break
}
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
strict: true,
})) {
try {
await validateContentHash(operation)
} catch (err) {
// Temporary: skip validation errors
if (err instanceof InvalidChangeError) {
logger.warn(
{ err, projectId },
'content snapshot mismatch (ignored)'
)
} else {
throw err
}
}
}
chunk.pushChanges([change])
changes.shift()
totalBytes += changeBytes
changesPushed = true
}
return changesPushed
}
/**
* Check that the operation is valid and can be incorporated to the history.
*
* For now, this checks content hashes when they are provided.
*
* @param {core.Operation} operation
*/
async function validateContentHash(operation) {
if (operation instanceof core.EditFileOperation) {
const editOperation = operation.getOperation()
if (
editOperation instanceof core.TextOperation &&
editOperation.contentHash != null
) {
const path = operation.getPathname()
const file = currentSnapshot.getFile(path)
if (file == null) {
throw new InvalidChangeError('file not found for hash validation', {
projectId,
path,
})
}
await file.load('eager', blobStore)
const content = file.getContent({ filterTrackedDeletes: true })
const expectedHash = editOperation.contentHash
const actualHash = content != null ? getContentHash(content) : null
logger.debug({ expectedHash, actualHash }, 'validating content hash')
if (actualHash !== expectedHash) {
throw new InvalidChangeError('content hash mismatch', {
projectId,
path,
expectedHash,
actualHash,
})
}
}
}
}
async function extendLastChunkIfPossible() {
const latestChunk = await chunkStore.loadLatest(projectId)
@@ -115,7 +200,11 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
const timer = new Timer()
currentSnapshot.applyAll(latestChunk.getChanges())
if (!fillChunk(currentChunk, changesToPersist)) return
const changesPushed = await fillChunk(currentChunk, changesToPersist)
if (!changesPushed) {
return
}
checkElapsedTime(timer)
await chunkStore.update(projectId, originalEndVersion, currentChunk)
@@ -127,7 +216,9 @@ async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
const history = new History(currentSnapshot.clone(), [])
const chunk = new Chunk(history, endVersion)
const timer = new Timer()
if (fillChunk(chunk, changesToPersist)) {
const changesPushed = await fillChunk(chunk, changesToPersist)
if (changesPushed) {
checkElapsedTime(timer)
currentChunk = chunk
await chunkStore.create(projectId, chunk)

View File

@@ -1,5 +1,6 @@
'use strict'
const { createHash } = require('node:crypto')
const { expect } = require('chai')
const cleanup = require('./support/cleanup')
@@ -11,6 +12,8 @@ const persistChanges = storage.persistChanges
const core = require('overleaf-editor-core')
const AddFileOperation = core.AddFileOperation
const EditFileOperation = core.EditFileOperation
const TextOperation = core.TextOperation
const Change = core.Change
const Chunk = core.Chunk
const File = core.File
@@ -28,7 +31,7 @@ describe('persistChanges', function () {
farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000)
})
it('persists changes', function () {
it('persists changes', async function () {
const limitsToPersistImmediately = {
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
@@ -41,29 +44,29 @@ describe('persistChanges', function () {
)
const changes = [change]
return chunkStore
.initializeProject(projectId)
.then(() => {
return persistChanges(projectId, changes, limitsToPersistImmediately, 0)
})
.then(result => {
const history = new History(new Snapshot(), changes)
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({
numberOfChangesPersisted: 1,
originalEndVersion: 0,
currentChunk,
})
return chunkStore.loadLatest(projectId)
})
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getEndVersion()).to.equal(1)
expect(chunk.getChanges().length).to.equal(1)
})
await chunkStore.initializeProject(projectId)
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
const history = new History(new Snapshot(), changes)
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({
numberOfChangesPersisted: 1,
originalEndVersion: 0,
currentChunk,
})
const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getEndVersion()).to.equal(1)
expect(chunk.getChanges().length).to.equal(1)
})
it('persists changes in two chunks', function () {
it('persists changes in two chunks', async function () {
const limitsToPersistImmediately = {
maxChunkChanges: 1,
minChangeTimestamp: farFuture,
@@ -82,36 +85,36 @@ describe('persistChanges', function () {
)
const changes = [firstChange, secondChange]
return chunkStore
.initializeProject(projectId)
.then(() => {
return persistChanges(projectId, changes, limitsToPersistImmediately, 0)
})
.then(result => {
const snapshot = Snapshot.fromRaw({
files: {
'a.tex': {
content: '',
},
},
})
const history = new History(snapshot, [secondChange])
const currentChunk = new Chunk(history, 1)
expect(result).to.deep.equal({
numberOfChangesPersisted: 2,
originalEndVersion: 0,
currentChunk,
})
return chunkStore.loadLatest(projectId)
})
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(1)
expect(chunk.getEndVersion()).to.equal(2)
expect(chunk.getChanges().length).to.equal(1)
})
await chunkStore.initializeProject(projectId)
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
const snapshot = Snapshot.fromRaw({
files: {
'a.tex': {
content: '',
},
},
})
const history = new History(snapshot, [secondChange])
const currentChunk = new Chunk(history, 1)
expect(result).to.deep.equal({
numberOfChangesPersisted: 2,
originalEndVersion: 0,
currentChunk,
})
const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getStartVersion()).to.equal(1)
expect(chunk.getEndVersion()).to.equal(2)
expect(chunk.getChanges().length).to.equal(1)
})
it('persists the snapshot at the start of the chunk', function () {
it('persists the snapshot at the start of the chunk', async function () {
const limitsToPersistImmediately = {
maxChunkChanges: 2,
minChangeTimestamp: farFuture,
@@ -130,29 +133,29 @@ describe('persistChanges', function () {
)
const changes = [firstChange, secondChange]
return chunkStore
.initializeProject(projectId)
.then(() => {
return persistChanges(projectId, changes, limitsToPersistImmediately, 0)
})
.then(result => {
const history = new History(new Snapshot(), changes)
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({
numberOfChangesPersisted: 2,
originalEndVersion: 0,
currentChunk,
})
return chunkStore.loadLatest(projectId)
})
.then(chunk => {
expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getEndVersion()).to.equal(2)
expect(chunk.getChanges().length).to.equal(2)
})
await chunkStore.initializeProject(projectId)
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
const history = new History(new Snapshot(), changes)
const currentChunk = new Chunk(history, 0)
expect(result).to.deep.equal({
numberOfChangesPersisted: 2,
originalEndVersion: 0,
currentChunk,
})
const chunk = await chunkStore.loadLatest(projectId)
expect(chunk.getStartVersion()).to.equal(0)
expect(chunk.getEndVersion()).to.equal(2)
expect(chunk.getChanges().length).to.equal(2)
})
it("errors if the version doesn't match the latest chunk", function () {
it("errors if the version doesn't match the latest chunk", async function () {
const limitsToPersistImmediately = {
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
@@ -169,18 +172,82 @@ describe('persistChanges', function () {
[]
)
const changes = [firstChange, secondChange]
return chunkStore
.initializeProject(projectId)
.then(() => {
return persistChanges(projectId, changes, limitsToPersistImmediately, 1)
})
.then(() => {
expect.fail()
})
.catch(err => {
expect(err.message).to.equal(
'client sent updates with end_version 1 but latest chunk has end_version 0'
)
})
await chunkStore.initializeProject(projectId)
await expect(
persistChanges(projectId, changes, limitsToPersistImmediately, 1)
).to.be.rejectedWith(
'client sent updates with end_version 1 but latest chunk has end_version 0'
)
})
describe('content hash validation', function () {
it('acccepts a change with a valid hash', async function () {
const limitsToPersistImmediately = {
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
}
const projectId = fixtures.docs.uninitializedProject.id
await chunkStore.initializeProject(projectId)
const textOperation = new TextOperation()
textOperation.insert('hello ')
textOperation.retain(5)
textOperation.contentHash = hashString('hello world')
const change = new Change(
[
new AddFileOperation('a.tex', File.fromString('world')),
new EditFileOperation('a.tex', textOperation),
],
new Date(),
[]
)
const changes = [change]
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
expect(result.numberOfChangesPersisted).to.equal(1)
})
it('acccepts a change with an invalid hash (only logs for now)', async function () {
const limitsToPersistImmediately = {
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
}
const projectId = fixtures.docs.uninitializedProject.id
await chunkStore.initializeProject(projectId)
const textOperation = new TextOperation()
textOperation.insert('hello ')
textOperation.retain(5)
textOperation.contentHash = hashString('bad hash')
const change = new Change(
[
new AddFileOperation('a.tex', File.fromString('world')),
new EditFileOperation('a.tex', textOperation),
],
new Date(),
[]
)
const changes = [change]
const result = await persistChanges(
projectId,
changes,
limitsToPersistImmediately,
0
)
expect(result.numberOfChangesPersisted).to.equal(1)
})
})
})
function hashString(s) {
const hash = createHash('sha-1')
hash.update(s)
return hash.digest('hex')
}