diff --git a/services/web/frontend/js/infrastructure/project-snapshot.ts b/services/web/frontend/js/infrastructure/project-snapshot.ts
index 3d8247437f..55ac882cd3 100644
--- a/services/web/frontend/js/infrastructure/project-snapshot.ts
+++ b/services/web/frontend/js/infrastructure/project-snapshot.ts
@@ -347,5 +347,13 @@ async function fetchBlob(
   if (!res.ok) {
     throw new FetchError('Failed to fetch blob', url, undefined, res)
   }
-  return await res.text()
+
+  // Decode with arrayBuffer + TextDecoder instead of res.text() so that a
+  // leading UTF-8 BOM (U+FEFF) is kept in the blob content. The server stores
+  // blobs as-is and counts the BOM in stringLength, so text operations are
+  // built against a BOM-inclusive length. Response.text() strips the BOM per
+  // the Encoding spec, leaving the string 1 char short and causing ApplyError
+  // when the operations are applied. `ignoreBOM: true` keeps the BOM in place.
+  const buffer = await res.arrayBuffer()
+  return new TextDecoder('utf-8', { ignoreBOM: true }).decode(buffer)
 }
diff --git a/services/web/test/frontend/infrastructure/project-snapshot.test.ts b/services/web/test/frontend/infrastructure/project-snapshot.test.ts
index aba3c833a6..3fa1aea527 100644
--- a/services/web/test/frontend/infrastructure/project-snapshot.test.ts
+++ b/services/web/test/frontend/infrastructure/project-snapshot.test.ts
@@ -477,4 +477,241 @@ describe('ProjectSnapshot', function () {
       expect(fetchMock.callHistory.calls('changes-2')).to.have.length(1)
     })
   })
+
+  describe('blob with UTF-8 BOM', function () {
+    // Files uploaded from Windows editors often have a UTF-8 BOM (U+FEFF) at
+    // the start. The server stores the blob as-is and counts the BOM in
+    // stringLength. TextOperations are built against that length.
+    //
+    // Response.text() strips the BOM per the Encoding spec, making the content
+    // 1 char shorter than expected — causing ApplyError on every page load.
+    // fetchBlob decodes via arrayBuffer() + TextDecoder({ ignoreBOM: true }) to
+    // keep the BOM, matching how the server counts stringLength.
+
+    const bomHash = '1111111111111111111111111111111111111111'
+    const bomHash2 = '2222222222222222222222222222222222222222'
+    const noBomHash = '3333333333333333333333333333333333333333'
+
+    const bomContent = '\uFEFF@article{Test2020,\n author = {Smith, J},\n}\n'
+    const bomContent2 = '\uFEFF@article{Other2021,\n author = {Jones, A},\n}\n'
+    const noBomContent = '@article{NoBom2022,\n author = {Lee, B},\n}\n'
+
+    afterEach(function () {
+      fetchMock.removeRoutes().clearHistory()
+    })
+
+    it('loads a doc whose blob starts with a UTF-8 BOM', async function () {
+      // The main production bug: upload a BOM file, make one edit, reload.
+      const insertedText = '% comment\n'
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // baseLength includes BOM — matches server stringLength
+                  textOperation: [bomContent.length, insertedText],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs.bib')).to.equal(
+        bomContent + insertedText
+      )
+    })
+
+    it('loads multiple BOM files in the same project', async function () {
+      const insert1 = '% first\n'
+      const insert2 = '% second\n'
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs1.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+                {
+                  pathname: 'refs2.bib',
+                  file: { hash: bomHash2, stringLength: bomContent2.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs1.bib',
+                  textOperation: [bomContent.length, insert1],
+                },
+                {
+                  pathname: 'refs2.bib',
+                  textOperation: [bomContent2.length, insert2],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+      fetchMock.get(`/project/${projectId}/blob/${bomHash2}`, bomContent2)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs1.bib')).to.equal(
+        bomContent + insert1
+      )
+      expect(snapshot.getDocContents('refs2.bib')).to.equal(
+        bomContent2 + insert2
+      )
+    })
+
+    it('loads a BOM file with multiple accumulated textOps', async function () {
+      // Multiple edits accumulate in the lazy operations list before toEager
+      // is called. All ops use BOM-inclusive baseLengths.
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // first edit: insert text at end
+                  textOperation: [bomContent.length, '% edit1\n'],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // second edit: insert more text at end
+                  textOperation: [
+                    bomContent.length + '% edit1\n'.length,
+                    '% edit2\n',
+                  ],
+                },
+              ],
+              timestamp: '2025-01-01T12:02:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs.bib')).to.equal(
+        bomContent + '% edit1\n' + '% edit2\n'
+      )
+    })
+
+    it('does not affect files without a BOM', async function () {
+      // BOM handling is per-file; non-BOM files must not be broken.
+      const insertedText = '% added\n'
+      const mixedChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'bom.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+                {
+                  pathname: 'nobom.bib',
+                  file: {
+                    hash: noBomHash,
+                    stringLength: noBomContent.length,
+                  },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'bom.bib',
+                  textOperation: [bomContent.length, insertedText],
+                },
+                {
+                  pathname: 'nobom.bib',
+                  textOperation: [noBomContent.length, insertedText],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: mixedChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+      fetchMock.get(`/project/${projectId}/blob/${noBomHash}`, noBomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('bom.bib')).to.equal(
+        bomContent + insertedText
+      )
+      expect(snapshot.getDocContents('nobom.bib')).to.equal(
+        noBomContent + insertedText
+      )
+    })
+  })
 })