Merge pull request #28533 from overleaf/mj-blob-ranges-maxsize

[web+history-v1] Use range request for binary blobs in project snapshot

GitOrigin-RevId: e72a2d481bf163b3e85361b42e30e82cf867f1cf
This commit is contained in:
Mathias Jakobsen
2025-09-19 09:42:30 +01:00
committed by Copybot
parent 6ed14c56ae
commit 6c79ce97cc
9 changed files with 313 additions and 55 deletions

View File

@@ -284,21 +284,22 @@ async function headProjectBlob(req, res) {
}
// Support simple, singular ranges starting from zero only, up-to 2MB = 2_000_000, 7 digits
const RANGE_HEADER = /^bytes=0-(\d{1,7})$/
const RANGE_HEADER = /^bytes=(\d{1,7})-(\d{1,7})$/
/**
* @param {string} header
* @return {{}|{start: number, end: number}}
* @return {undefined | {start: number, end: number}}
* @private
*/
function _getRangeOpts(header) {
if (!header) return {}
if (!header) return undefined
const match = header.match(RANGE_HEADER)
if (match) {
const end = parseInt(match[1], 10)
return { start: 0, end }
const start = parseInt(match[1], 10)
const end = parseInt(match[2], 10)
return { start, end }
}
return {}
return undefined
}
async function getProjectBlob(req, res, next) {
@@ -311,6 +312,31 @@ async function getProjectBlob(req, res, next) {
try {
let stream
try {
if (opts) {
// This is a range request, so we need to set the appropriate headers
// Browser caching only works if the total size is known, so we have
// to fetch the blob metadata first.
const metaData = await blobStore.getBlob(hash)
if (metaData) {
const blobLength = metaData.getByteLength()
if (opts.start > opts.end || opts.start >= blobLength) {
return res
.status(416) // Requested Range Not Satisfiable
.set('Content-Range', `bytes */${blobLength}`)
.set('Content-Length', '0')
.end()
}
// Valid range request
const actualEnd = Math.min(opts.end, blobLength - 1)
const returnedSize = actualEnd - opts.start + 1
res.set('Content-Length', returnedSize)
res.set(
'Content-Range',
`bytes ${opts.start}-${actualEnd}/${blobLength}`
)
res.status(206)
}
}
stream = await blobStore.getStream(hash, opts)
} catch (err) {
if (err instanceof Blob.NotFoundError) {

View File

@@ -190,6 +190,12 @@ exports.paths = {
type: 'file',
},
},
206: {
description: 'Success',
schema: {
type: 'file',
},
},
404: {
description: 'Not Found',
schema: {

View File

@@ -116,7 +116,7 @@ describe('Project blobs API', function () {
expect(payload).to.equal(fileContents.toString())
})
it('supports range request', async function () {
it('supports range request from beginning', async function () {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
@@ -125,7 +125,127 @@ describe('Project blobs API', function () {
url.searchParams.append('token', token)
const response = await fetch(url, { headers: { Range: 'bytes=0-4' } })
const payload = await response.text()
expect(payload).to.equal(fileContents.toString().slice(0, 4))
// 0-4 is inclusive, so 5 bytes
expect(payload).to.equal(fileContents.slice(0, 5).toString())
expect(response.headers.get('Content-Range')).to.equal(
`bytes 0-4/${testFiles.HELLO_TXT_BYTE_LENGTH}`
)
expect(response.headers.get('Content-Length')).to.equal('5')
expect(response.status).to.equal(HTTPStatus.PARTIAL_CONTENT)
})
it('supports range request in middle', async function () {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
)
)
url.searchParams.append('token', token)
const response = await fetch(url, { headers: { Range: 'bytes=5-9' } })
const payload = await response.text()
// 5-9 is inclusive, so 5 bytes
expect(payload).to.equal(fileContents.slice(5, 10).toString())
expect(response.headers.get('Content-Range')).to.equal(
`bytes 5-9/${testFiles.HELLO_TXT_BYTE_LENGTH}`
)
expect(response.headers.get('Content-Length')).to.equal('5')
expect(response.status).to.equal(HTTPStatus.PARTIAL_CONTENT)
})
it('supports range request past end', async function () {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
)
)
url.searchParams.append('token', token)
const response = await fetch(url, { headers: { Range: 'bytes=5-100' } })
const payload = await response.text()
expect(payload).to.equal(fileContents.slice(5).toString())
// Response range should be capped to the actual file size
expect(response.headers.get('Content-Range')).to.equal(
`bytes 5-10/${testFiles.HELLO_TXT_BYTE_LENGTH}`
)
expect(response.headers.get('Content-Length')).to.equal('6')
expect(response.status).to.equal(HTTPStatus.PARTIAL_CONTENT)
})
describe('with invalid request ranges', async function () {
async function testInvalidRange(range) {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.HELLO_TXT_HASH}`
)
)
url.searchParams.append('token', token)
const response = await fetch(url, { headers: { Range: range } })
expect(response.headers.get('Content-Range')).to.equal(
`bytes */${testFiles.HELLO_TXT_BYTE_LENGTH}`
)
expect(response.headers.get('Content-Length')).to.equal('0')
expect(response.status).to.equal(
HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE
)
}
it('should fail on invalid range where start > end', async function () {
testInvalidRange('bytes=10-5')
})
it('should fail on invalid range where start = length', async function () {
testInvalidRange(`bytes=${testFiles.HELLO_TXT_BYTE_LENGTH + 1}-105`)
})
it('should fail on invalid range where start > length', async function () {
testInvalidRange(`bytes=${testFiles.HELLO_TXT_BYTE_LENGTH}-105`)
})
})
describe('with an empty blob', async function () {
beforeEach(async function () {
const response = await fetch(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.EMPTY_FILE_HASH}`
),
{
method: 'PUT',
headers: { Authorization: `Bearer ${token}` },
body: '',
}
)
expect(response.ok).to.be.true
})
it('should not handle range requests', async function () {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.EMPTY_FILE_HASH}`
)
)
url.searchParams.append('token', token)
const response = await fetch(url, { headers: { Range: 'bytes=0-0' } })
expect(response.headers.get('Content-Range')).to.equal(
`bytes */${testFiles.EMPTY_FILE_BYTE_LENGTH}`
)
expect(response.headers.get('Content-Length')).to.equal('0')
expect(response.status).to.equal(
HTTPStatus.REQUESTED_RANGE_NOT_SATISFIABLE
)
})
it('should handle full requests', async function () {
const url = new URL(
testServer.url(
`/api/projects/${projectId}/blobs/${testFiles.EMPTY_FILE_HASH}`
)
)
url.searchParams.append('token', token)
const response = await fetch(url)
const payload = await response.text()
expect(payload).to.equal('')
expect(response.status).to.equal(HTTPStatus.OK)
})
})
it('supports HEAD request', async function () {

View File

@@ -19,6 +19,9 @@ exports.NON_BMP_TXT_BYTE_LENGTH = 57
exports.NULL_CHARACTERS_TXT_HASH = '4227ca4e8736af63036e7457e2db376ddf7e5795'
exports.NULL_CHARACTERS_TXT_BYTE_LENGTH = 3
exports.EMPTY_FILE_HASH = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
exports.EMPTY_FILE_BYTE_LENGTH = 0
// git hashes of some short strings for testing
exports.STRING_A_HASH = '2e65efe2a145dda7ee51d1741299f848e5bf752e'
exports.STRING_AB_HASH = '9ae9e86b7bd6cb1472d9373702d8249973da0832'

View File

@@ -66,9 +66,9 @@ async function requestBlob(method, req, res) {
}
const range = req.get('Range')
let stream, contentLength
let stream, contentLength, contentRange
try {
;({ stream, contentLength } =
;({ stream, contentLength, contentRange } =
await HistoryManager.promises.requestBlobWithProjectId(
projectId,
hash,
@@ -80,7 +80,11 @@ async function requestBlob(method, req, res) {
throw err
}
if (contentLength) res.setHeader('Content-Length', contentLength) // set on HEAD
if (contentLength) res.setHeader('Content-Length', contentLength)
if (contentRange) {
res.setHeader('Content-Range', contentRange)
res.status(206) // Partial Content
}
res.setHeader('Content-Type', 'application/octet-stream')
setBlobCacheHeaders(res, hash)

View File

@@ -207,6 +207,7 @@ async function requestBlob(historyId, hash, method = 'GET', range = '') {
url,
stream,
contentLength: parseInt(response.headers.get('Content-Length'), 10),
contentRange: response.headers.get('Content-Range'),
}
}

View File

@@ -143,10 +143,18 @@ export class ProjectSnapshot {
): Promise<any> {
const file = this.snapshot.getFile(path)
const hash = file?.getHash()
const byteLength = file?.getByteLength()
if (hash == null) {
return null
}
return await this.blobStore.getString(hash, options)
if (byteLength == null) {
return null
}
let blobStoreOptions
if (options?.maxSize != null && byteLength > options?.maxSize) {
blobStoreOptions = { maxSize: options.maxSize }
}
return await this.blobStore.getString(hash, blobStoreOptions)
}
getDocs(): Map<string, File> {
@@ -323,44 +331,20 @@ async function fetchBlob(
options?: { maxSize?: number }
): Promise<string> {
const url = `/project/${projectId}/blob/${hash}`
if (options?.maxSize) {
return await fetchTextFileWithSizeLimit(url, options.maxSize)
let fetchOpts
if (options?.maxSize === 0) {
return ''
}
const res = await fetch(url)
if (options?.maxSize) {
fetchOpts = {
headers: {
Range: `bytes=0-${options.maxSize - 1}`,
},
}
}
const res = await fetch(url, fetchOpts)
if (!res.ok) {
throw new FetchError('Failed to fetch blob', url, undefined, res)
}
return await res.text()
}
async function fetchTextFileWithSizeLimit(url: string, maxSize: number) {
let result = ''
try {
const abortController = new AbortController()
const response = await fetch(url, {
signal: abortController.signal,
})
if (!response.ok) {
throw new Error('Failed to fetch blob')
}
if (!response.body) {
throw new Error('Response body is empty')
}
const reader = response.body.pipeThrough(new TextDecoderStream())
for await (const chunk of reader) {
result += chunk
if (result.length > maxSize) {
abortController.abort()
}
}
} catch (error: any) {
if (error?.name === 'AbortError') {
// This is fine, we just return the result we have so far
} else {
throw error
}
}
return result.slice(0, maxSize)
}

View File

@@ -44,6 +44,10 @@ describe('ProjectSnapshot', function () {
), // 6.5MB
hash: 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee',
},
'empty.png': {
contents: '',
hash: 'ffffffffffffffffffffffffffffffffffffffff',
},
}
const chunk = {
@@ -82,6 +86,13 @@ describe('ProjectSnapshot', function () {
byteLength: files['bibliography.bib'].contents.length,
},
},
{
pathname: 'empty.png',
file: {
hash: files['empty.png'].hash,
byteLength: files['empty.png'].contents.length,
},
},
],
timestamp: '2025-01-01T12:00:00.000Z',
},
@@ -145,10 +156,24 @@ describe('ProjectSnapshot', function () {
})
}
// fetch-mock doesn't seem to expose the header to the response function,
// so we just use a constant here
const MOCKED_MAX_SIZE = 100
function mockBlobs(paths = Object.keys(files) as (keyof typeof files)[]) {
for (const path of paths) {
const file = files[path]
fetchMock.get(`/project/${projectId}/blob/${file.hash}`, file.contents)
fetchMock
.get({
url: `/project/${projectId}/blob/${file.hash}`,
missingHeaders: ['Range'],
response: file.contents,
})
.get({
url: `/project/${projectId}/blob/${file.hash}`,
headers: { Range: `bytes=0-${MOCKED_MAX_SIZE - 1}` },
response: file.contents.slice(0, MOCKED_MAX_SIZE),
})
}
}
@@ -243,13 +268,18 @@ describe('ProjectSnapshot', function () {
hash: files['bibliography.bib'].hash,
size: files['bibliography.bib'].contents.length,
},
{
path: 'empty.png',
hash: 'ffffffffffffffffffffffffffffffffffffffff',
size: 0,
},
])
})
})
describe('getBinaryFileContents', function () {
beforeEach(function () {
mockBlobs(['bibliography.bib'])
mockBlobs(['bibliography.bib', 'empty.png'])
})
it('can fetch whole file', async function () {
@@ -257,14 +287,20 @@ describe('ProjectSnapshot', function () {
expect(blob).to.equal(files['bibliography.bib'].contents)
})
// NOTE: fetch-mock does not support the .response.body.pipeThrough API,
// so this test is skipped for now.
// eslint-disable-next-line mocha/no-skipped-tests
it.skip('can fetch part of file', async function () {
it('can fetch part of file', async function () {
const blob = await snapshot.getBinaryFileContents('bibliography.bib', {
maxSize: 100,
maxSize: MOCKED_MAX_SIZE,
})
expect(blob).to.equal(files['bibliography.bib'].contents.slice(0, 100))
expect(blob).to.equal(
files['bibliography.bib'].contents.slice(0, MOCKED_MAX_SIZE)
)
})
it('can fetch empty file with maxSize', async function () {
const blob = await snapshot.getBinaryFileContents('empty.png', {
maxSize: 200,
})
expect(blob).to.equal(files['empty.png'].contents)
})
})
})

View File

@@ -32,6 +32,7 @@ describe('HistoryController', function () {
ctx.HistoryManager = {
promises: {
injectUserDetails: sinon.stub(),
requestBlobWithProjectId: sinon.stub(),
},
}
@@ -297,4 +298,81 @@ describe('HistoryController', function () {
})
})
})
describe('requestBlob', function () {
describe('With Range header', function () {
beforeEach(async function (ctx) {
ctx.req = {
params: { project_id: ctx.project_id, hash: 'foo' },
body: {},
get: sinon.stub(),
}
ctx.req.get.withArgs('Range').returns('bytes=0-42')
ctx.res = { setHeader: sinon.stub(), status: sinon.stub() }
ctx.HistoryManager.promises.requestBlobWithProjectId.resolves({
stream: null,
contentLength: '43',
contentRange: 'bytes 0-42/100',
})
await ctx.HistoryController.getBlob(ctx.req, ctx.res, ctx.next)
})
it('should forward the range request', function (ctx) {
ctx.HistoryManager.promises.requestBlobWithProjectId.should.have.been.calledWith(
ctx.project_id,
'foo',
'GET',
'bytes=0-42'
)
})
it('should forward the Content-Range header', function (ctx) {
ctx.res.setHeader.should.have.been.calledWith(
'Content-Range',
'bytes 0-42/100'
)
})
it('should forward the Content-Length header', function (ctx) {
ctx.res.setHeader.should.have.been.calledWith('Content-Length', '43')
})
it('should have status 206', function (ctx) {
ctx.res.status.should.have.been.calledWith(206)
})
})
describe('Without Range header', function () {
beforeEach(async function (ctx) {
ctx.req = {
params: { project_id: ctx.project_id, hash: 'foo' },
body: {},
get: sinon.stub(),
}
ctx.req.get.withArgs('Range').returns(null)
ctx.res = { setHeader: sinon.stub(), status: sinon.stub() }
ctx.HistoryManager.promises.requestBlobWithProjectId.resolves({
stream: null,
contentLength: '100',
range: null,
})
await ctx.HistoryController.getBlob(ctx.req, ctx.res, ctx.next)
})
it('should not have a Content-Range header', function (ctx) {
expect(ctx.res.setHeader).to.not.have.been.calledWith(
'Content-Range',
sinon.match.string
)
})
it('should forward the Content-Length header', function (ctx) {
ctx.res.setHeader.should.have.been.calledWith('Content-Length', '100')
})
it('should not have status 206', function (ctx) {
ctx.res.status.should.not.have.been.calledWith(206)
})
})
})
})