Merge pull request #32877 from overleaf/mg-fix-bom-applyerror

Preserve UTF-8 BOM when fetching blobs in ProjectSnapshot

GitOrigin-RevId: 758156f8c9ac1d02b5ea06447f759012118e1905
2026-06-03 22:29:01 +02:00 · 2026-04-17 11:16:06 +02:00
parent ecda482213
commit 99c8bd2e1c
2 changed files with 246 additions and 1 deletion
@@ -347,5 +347,13 @@ async function fetchBlob(
  if (!res.ok) {
    throw new FetchError('Failed to fetch blob', url, undefined, res)
  }
-  return await res.text()
+
+  // Use arrayBuffer + TextDecoder rather than res.text() to preserve any
+  // UTF-8 BOM (U+FEFF) in the blob content; with ignoreBOM: true the decoder
+  // keeps the BOM in its output. The server stores blobs as-is and counts the
+  // BOM in stringLength, so text operations are built against a BOM-inclusive
+  // length. Response.text() strips the BOM per the Encoding spec, making the
+  // string 1 char shorter than expected, so applying them throws ApplyError.
+  const buffer = await res.arrayBuffer()
+  return new TextDecoder('utf-8', { ignoreBOM: true }).decode(buffer)
 }
@@ -477,4 +477,241 @@ describe('ProjectSnapshot', function () {
      expect(fetchMock.callHistory.calls('changes-2')).to.have.length(1)
    })
  })
+
+  describe('blob with UTF-8 BOM', function () {
+    // Files uploaded from Windows editors often have a UTF-8 BOM (U+FEFF) at
+    // the start. The server stores the blob as-is and counts the BOM in
+    // stringLength. TextOperations are built against that length.
+    //
+    // Response.text() strips the BOM per the Encoding spec, making the content
+    // 1 char shorter than expected — causing ApplyError on every page load.
+    // The fix uses arrayBuffer() + TextDecoder({ ignoreBOM: true }) to preserve
+    // the BOM, matching how the server counts stringLength.
+
+    const bomHash = '1111111111111111111111111111111111111111'
+    const bomHash2 = '2222222222222222222222222222222222222222'
+    const noBomHash = '3333333333333333333333333333333333333333'
+
+    const bomContent = '\uFEFF@article{Test2020,\n  author = {Smith, J},\n}\n'
+    const bomContent2 = '\uFEFF@article{Other2021,\n  author = {Jones, A},\n}\n'
+    const noBomContent = '@article{NoBom2022,\n  author = {Lee, B},\n}\n'
+
+    afterEach(function () {
+      fetchMock.removeRoutes().clearHistory()
+    })
+
+    it('loads a doc whose blob starts with a UTF-8 BOM', async function () {
+      // The main production bug: upload a BOM file, make one edit, reload.
+      const insertedText = '% comment\n'
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // baseLength includes BOM — matches server stringLength
+                  textOperation: [bomContent.length, insertedText],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs.bib')).to.equal(
+        bomContent + insertedText
+      )
+    })
+
+    it('loads multiple BOM files in the same project', async function () {
+      const insert1 = '% first\n'
+      const insert2 = '% second\n'
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs1.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+                {
+                  pathname: 'refs2.bib',
+                  file: { hash: bomHash2, stringLength: bomContent2.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs1.bib',
+                  textOperation: [bomContent.length, insert1],
+                },
+                {
+                  pathname: 'refs2.bib',
+                  textOperation: [bomContent2.length, insert2],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+      fetchMock.get(`/project/${projectId}/blob/${bomHash2}`, bomContent2)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs1.bib')).to.equal(
+        bomContent + insert1
+      )
+      expect(snapshot.getDocContents('refs2.bib')).to.equal(
+        bomContent2 + insert2
+      )
+    })
+
+    it('loads a BOM file with multiple accumulated textOps', async function () {
+      // Multiple edits accumulate in the lazy operations list before toEager
+      // is called. All ops use BOM-inclusive baseLengths.
+      const bomChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // first edit: insert text at end
+                  textOperation: [bomContent.length, '% edit1\n'],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'refs.bib',
+                  // second edit: insert more text at end
+                  textOperation: [
+                    bomContent.length + '% edit1\n'.length,
+                    '% edit2\n',
+                  ],
+                },
+              ],
+              timestamp: '2025-01-01T12:02:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: bomChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('refs.bib')).to.equal(
+        bomContent + '% edit1\n' + '% edit2\n'
+      )
+    })
+
+    it('does not affect files without a BOM', async function () {
+      // BOM handling is per-file; non-BOM files must not be broken.
+      const insertedText = '% added\n'
+      const mixedChunk = {
+        history: {
+          snapshot: { files: {} },
+          changes: [
+            {
+              operations: [
+                {
+                  pathname: 'bom.bib',
+                  file: { hash: bomHash, stringLength: bomContent.length },
+                },
+                {
+                  pathname: 'nobom.bib',
+                  file: {
+                    hash: noBomHash,
+                    stringLength: noBomContent.length,
+                  },
+                },
+              ],
+              timestamp: '2025-01-01T12:00:00.000Z',
+            },
+            {
+              operations: [
+                {
+                  pathname: 'bom.bib',
+                  textOperation: [bomContent.length, insertedText],
+                },
+                {
+                  pathname: 'nobom.bib',
+                  textOperation: [noBomContent.length, insertedText],
+                },
+              ],
+              timestamp: '2025-01-01T12:01:00.000Z',
+            },
+          ],
+        },
+        startVersion: 0,
+      }
+
+      fetchMock.post(`/project/${projectId}/flush`, 200)
+      fetchMock.getOnce(`/project/${projectId}/latest/history`, {
+        chunk: mixedChunk,
+      })
+      fetchMock.get(`/project/${projectId}/blob/${bomHash}`, bomContent)
+      fetchMock.get(`/project/${projectId}/blob/${noBomHash}`, noBomContent)
+
+      await snapshot.refresh()
+
+      expect(snapshot.getDocContents('bom.bib')).to.equal(
+        bomContent + insertedText
+      )
+      expect(snapshot.getDocContents('nobom.bib')).to.equal(
+        noBomContent + insertedText
+      )
+    })
+  })
 })