From e4574effbf770b77659615ae79b5c3dcf79463f9 Mon Sep 17 00:00:00 2001 From: David <33458145+davidmcpowell@users.noreply.github.com> Date: Mon, 8 Sep 2025 15:26:51 +0100 Subject: [PATCH] Merge pull request #27928 from overleaf/dp-pdf-caching-typescript-2 Convert pdf-caching file to typescript GitOrigin-RevId: 9acd2fc0697490008d82abfad0994df362814bad --- .../js/features/pdf-preview/util/metrics.ts | 6 +- .../pdf-preview/util/pdf-caching-transport.ts | 20 +- .../util/{pdf-caching.js => pdf-caching.ts} | 621 ++++++++++-------- .../js/features/pdf-preview/util/types.ts | 29 + services/web/types/compile.ts | 46 +- 5 files changed, 408 insertions(+), 314 deletions(-) rename services/web/frontend/js/features/pdf-preview/util/{pdf-caching.js => pdf-caching.ts} (71%) diff --git a/services/web/frontend/js/features/pdf-preview/util/metrics.ts b/services/web/frontend/js/features/pdf-preview/util/metrics.ts index b9bdf8d7a7..ea38e29958 100644 --- a/services/web/frontend/js/features/pdf-preview/util/metrics.ts +++ b/services/web/frontend/js/features/pdf-preview/util/metrics.ts @@ -2,7 +2,7 @@ import { v4 as uuid } from 'uuid' import { sendMB } from '../../../infrastructure/event-tracking' import { trackPdfDownloadEnabled } from './pdf-caching-flags' import { debugConsole } from '@/utils/debugging' -import { DeliveryLatencies } from './types' +import { DeliveryLatencies, PdfCachingMetrics } from './types' import { CompileResponseData } from '@ol-types/compile' // VERSION should get incremented when making changes to caching behavior or @@ -12,10 +12,6 @@ const VERSION = 9 // editing session id export const EDITOR_SESSION_ID = uuid() -type PdfCachingMetrics = { - viewerId: string -} - const pdfCachingMetrics: PdfCachingMetrics = { viewerId: EDITOR_SESSION_ID, } diff --git a/services/web/frontend/js/features/pdf-preview/util/pdf-caching-transport.ts b/services/web/frontend/js/features/pdf-preview/util/pdf-caching-transport.ts index 1e58ec61e8..a5af9fca64 100644 --- a/services/web/frontend/js/features/pdf-preview/util/pdf-caching-transport.ts +++ b/services/web/frontend/js/features/pdf-preview/util/pdf-caching-transport.ts @@ -1,5 +1,5 @@ import OError from '@overleaf/o-error' -import { fallbackRequest, fetchRange } from './pdf-caching' +import { fallbackRequest, fetchRange, preprocessFileOnce } from './pdf-caching' import { captureException } from '@/infrastructure/error-reporter' import { EDITOR_SESSION_ID, getPdfCachingMetrics } from './metrics' import { @@ -15,7 +15,8 @@ import { debugConsole } from '@/utils/debugging' import { PDFJS } from './pdf-js' import { sendMB } from '@/infrastructure/event-tracking' import getMeta from '@/utils/meta' -import { PDFFile, PDFRange } from '@ol-types/compile' +import { PDFFile, PDFRange, ProcessedPDFFile } from '@ol-types/compile' +import { PdfCachingMetricsFull } from './types' // 30 seconds: The shutdown grace period of a clsi pre-emp instance. const STALE_OUTPUT_REQUEST_THRESHOLD_MS = 30 * 1000 @@ -28,7 +29,7 @@ export function generatePdfCachingTransportFactory() { const projectId = getMeta('ol-project_id') const usageScore = new Map() const cachedUrls = new Map() - const metrics = Object.assign(getPdfCachingMetrics(), { + const metrics: PdfCachingMetricsFull = Object.assign(getPdfCachingMetrics(), { failedCount: 0, failedOnce: false, tooMuchBandwidthCount: 0, @@ -53,7 +54,7 @@ export function generatePdfCachingTransportFactory() { class PDFDataRangeTransport extends PDFJS.PDFDataRangeTransport { url: string - pdfFile: PDFFile + pdfFile: ProcessedPDFFile abortController: AbortController leanPdfRanges: PDFRange[] handleFetchError: (error: any) => void @@ -76,7 +77,13 @@ export function generatePdfCachingTransportFactory() { this.url = url pdfFile.ranges = pdfFile.ranges || [] pdfFile.editorId = pdfFile.editorId || EDITOR_SESSION_ID - this.pdfFile = pdfFile + preprocessFileOnce({ + file: pdfFile, + usageScore, + cachedUrls, + }) + // We can safely cast as preprocessFileOnce mutates the file object into a ProcessedPDFFile + this.pdfFile = pdfFile as unknown as ProcessedPDFFile // Clone the chunks as the objectId field is encoded to a Uint8Array. this.leanPdfRanges = pdfFile.ranges.map(r => Object.assign({}, r)) this.handleFetchError = handleFetchError @@ -192,8 +199,6 @@ export function generatePdfCachingTransportFactory() { }) } - // @ts-ignore is incorrectly inferring the type of fetchRange. - // Remove this when we convert pdf-caching.js to typescript. fetchRange({ url: this.url, start, @@ -248,6 +253,7 @@ export function generatePdfCachingTransportFactory() { isFromOutputPDFRequest: isFromOutputPDFRequest(err), }, }) + return fallbackRequest({ file: this.pdfFile, url: this.url, diff --git a/services/web/frontend/js/features/pdf-preview/util/pdf-caching.js b/services/web/frontend/js/features/pdf-preview/util/pdf-caching.ts similarity index 71% rename from services/web/frontend/js/features/pdf-preview/util/pdf-caching.js rename to services/web/frontend/js/features/pdf-preview/util/pdf-caching.ts index c3dba41d8b..bf63867905 100644 --- a/services/web/frontend/js/features/pdf-preview/util/pdf-caching.js +++ b/services/web/frontend/js/features/pdf-preview/util/pdf-caching.ts @@ -1,4 +1,12 @@ +import { + Chunk, + PartiallyProcessedPDFFile, + PDFRange, + PrefetchedChunk, + ProcessedPDFFile, +} from '@ol-types/compile' import OError from '@overleaf/o-error' +import { PdfCachingMetricsFull } from './types' const PDF_JS_CHUNK_SIZE = 128 * 1024 const MAX_SUB_REQUEST_COUNT = 4 @@ -21,20 +29,19 @@ const CHUNK_USAGE_THRESHOLD_CACHED = 42 // 42 * 0.7^11 < 1, aka we keep stale entries around for 11 compiles. const CHUNK_USAGE_STALE_DECAY_RATE = 0.7 -let cacheFlag = 'default' +let cacheFlag: RequestCache = 'default' // Work around a Chrome bug: https://issues.chromium.org/issues/40542704 // Multiple simultaneous requests to same URL with Range header cause failure (block backend returns ERR_CACHE_OPERATION_NOT_SUPPORTED) const CACHE_NO_STORE = 'no-store' -/** - * @param {string} url - * @param {RequestInit} init - */ -async function fetchWithBrowserCacheFallback(url, init) { +async function fetchWithBrowserCacheFallback(url: string, init: RequestInit) { try { return await fetch(url, init) } catch (err) { - if (init.headers?.has('Range') && init.cache !== CACHE_NO_STORE) { + if ( + (init.headers as Headers | undefined)?.has('Range') && + init.cache !== CACHE_NO_STORE + ) { cacheFlag = CACHE_NO_STORE init.cache = CACHE_NO_STORE return await fetch(url, init) @@ -43,30 +50,29 @@ async function fetchWithBrowserCacheFallback(url, init) { } } -/** - * @param {Object} file - */ -function backfillEdgeBounds(file) { +function backfillEdgeBounds(file: PartiallyProcessedPDFFile) { const encoder = new TextEncoder() for (const chunk of file.ranges) { if (chunk.objectId) { - chunk.objectId = encoder.encode(chunk.objectId) + chunk.objectId = encoder.encode(chunk.objectId as string) chunk.start -= chunk.objectId.byteLength chunk.size = chunk.end - chunk.start } } } -/** - * @param {Map} usageScore - * @param {Map} cachedUrls - */ -function trimState({ usageScore, cachedUrls }) { - for (const hash of usageScore) { +function trimState({ + usageScore, + cachedUrls, +}: { + usageScore: Map + cachedUrls: Map +}) { + for (const [hash, score] of usageScore) { if (usageScore.size < INCREMENTAL_CACHE_SIZE) { break } - const score = usageScore.get(hash) + if (score >= CHUNK_USAGE_THRESHOLD_TRIGGER_PREFERRED) { // Keep entries that are worth caching around for longer. usageScore.set(hash, score * CHUNK_USAGE_STALE_DECAY_RATE) @@ -77,25 +83,25 @@ function trimState({ usageScore, cachedUrls }) { } } -/** - * @param {Object} file - * @param {Map} usageScore - * @param {Map} cachedUrls - */ -function preprocessFileOnce({ file, usageScore, cachedUrls }) { - if (file.preprocessed) return +export function preprocessFileOnce({ + file, + usageScore, + cachedUrls, +}: { + file: PartiallyProcessedPDFFile + usageScore: Map + cachedUrls: Map +}) { + if ('preprocessed' in file && file.preprocessed) return file file.preprocessed = true - file.createdAt = new Date(file.createdAt) + file.createdAt = new Date(file.createdAt || '') file.prefetched = file.prefetched || [] trimState({ usageScore, cachedUrls }) backfillEdgeBounds(file) } -/** - * @param {Array} chunks - */ -export function estimateSizeOfMultipartResponse(chunks) { +function estimateSizeOfMultipartResponse(chunks: Chunk[]) { /* --boundary HEADER @@ -116,18 +122,21 @@ export function estimateSizeOfMultipartResponse(chunks) { ) } -/** - * - * @param {Object} metrics - * @param {number} size - * @param {number} cachedCount - * @param {number} cachedBytes - * @param {number} fetchedCount - * @param {number} fetchedBytes - */ function trackDownloadStats( - metrics, - { size, cachedCount, cachedBytes, fetchedCount, fetchedBytes } + metrics: PdfCachingMetricsFull, + { + size, + cachedCount, + cachedBytes, + fetchedCount, + fetchedBytes, + }: { + size: number + cachedCount: number + cachedBytes: number + fetchedCount: number + fetchedBytes: number + } ) { metrics.cachedCount += cachedCount metrics.cachedBytes += cachedBytes @@ -137,34 +146,34 @@ function trackDownloadStats( metrics.requestedBytes += size } -/** - * @param {Object} metrics - * @param {boolean} sizeDiffers - * @param {boolean} mismatch - * @param {boolean} success - */ -function trackChunkVerify(metrics, { sizeDiffers, mismatch, success }) { +function trackChunkVerify( + metrics: PdfCachingMetricsFull, + { + sizeDiffers, + mismatch, + success, + }: { + sizeDiffers: boolean + mismatch: boolean + success: boolean + } +) { if (sizeDiffers) { - metrics.chunkVerifySizeDiffers |= 0 - metrics.chunkVerifySizeDiffers += 1 + incrementMetric(metrics, 'chunkVerifySizeDiffers') } if (mismatch) { - metrics.chunkVerifyMismatch |= 0 - metrics.chunkVerifyMismatch += 1 + incrementMetric(metrics, 'chunkVerifyMismatch') } if (success) { - metrics.chunkVerifySuccess |= 0 - metrics.chunkVerifySuccess += 1 + incrementMetric(metrics, 'chunkVerifySuccess') } } -/** - * @param chunk - * @param {ArrayBuffer} arrayBuffer - * @return {Uint8Array} - */ -function backFillObjectContext(chunk, arrayBuffer) { - if (!chunk.objectId) { +function backFillObjectContext( + chunk: Chunk | Chunk[] | PDFRange, + arrayBuffer: ArrayBuffer +) { + if (!('objectId' in chunk)) { // This is a dynamic chunk return new Uint8Array(arrayBuffer) } @@ -185,14 +194,12 @@ function backFillObjectContext(chunk, arrayBuffer) { return fullBuffer } -/** - * @param {Array} chunks - * @param {number} start - * @param {number} end - * @returns {Array} - */ -function getMatchingChunks(chunks, start, end) { - const matchingChunks = [] +function getMatchingChunks( + chunks: ChunkType[], + start: number, + end: number +) { + const matchingChunks: ChunkType[] = [] for (const chunk of chunks) { if (chunk.end <= start) { // no overlap: @@ -211,39 +218,19 @@ function getMatchingChunks(chunks, start, end) { return matchingChunks } -/** - * @param {Object} a - * @param {Object} b - */ -function sortBySizeDESC(a, b) { +function sortBySizeDESC(a: { size: number }, b: { size: number }) { return a.size > b.size ? -1 : 1 } -/** - * @param {Object} a - * @param {Object} b - */ -function sortByStartASC(a, b) { +function sortByStartASC(a: { start: number }, b: { start: number }) { return a.start > b.start ? 1 : -1 } -/** - * @param {Object} chunk - */ -function usageAboveThreshold(chunk) { +function usageAboveThreshold(chunk: PDFRange) { // We fetched enough shards of this chunk. Cache it in full now. return chunk.totalUsage > CHUNK_USAGE_THRESHOLD_TRIGGER_PREFERRED } -/** - * @param {Array} potentialChunks - * @param {Map} usageScore - * @param {Map} cachedUrls - * @param {Object} metrics - * @param {number} start - * @param {number} end - * @param {boolean} prefetchLargeEnabled - */ function cutRequestAmplification({ potentialChunks, usageScore, @@ -251,12 +238,19 @@ function cutRequestAmplification({ start, end, prefetchLargeEnabled, +}: { + potentialChunks: PDFRange[] + usageScore: Map + metrics: PdfCachingMetricsFull + start: number + end: number + prefetchLargeEnabled: boolean }) { // NOTE: Map keys are stored in insertion order. // We re-insert keys on cache hit and turn 'usageScore' into a cheap LRU. - const chunks = [] - const skipAlreadyAdded = chunk => !chunks.includes(chunk) + const chunks: PDFRange[] = [] + const skipAlreadyAdded = (chunk: PDFRange) => !chunks.includes(chunk) let tooManyRequests = false let tooMuchBandwidth = false let newChunks = 0 @@ -332,14 +326,12 @@ function cutRequestAmplification({ return chunks } -/** - * @param {Array} chunks - * @param {number} start - * @param {number} end - * @returns {Array} - */ -function getInterleavingDynamicChunks(chunks, start, end) { - const dynamicChunks = [] +function getInterleavingDynamicChunks( + chunks: Chunk[], + start: number, + end: number +) { + const dynamicChunks: Chunk[] = [] for (const chunk of chunks) { if (start < chunk.start) { dynamicChunks.push({ start, end: chunk.start }) @@ -353,36 +345,24 @@ function getInterleavingDynamicChunks(chunks, start, end) { return dynamicChunks } -/** - * - * @param {Response} response - */ -function getServerTime(response) { +function getServerTime(response: Response) { const raw = response.headers.get('Date') if (!raw) return new Date() return new Date(raw) } -/** - * - * @param {Response} response - */ -function getResponseSize(response) { +function getResponseSize(response: Response) { const raw = response.headers.get('Content-Length') if (!raw) return 0 return parseInt(raw, 10) } -/** - * - * @param {Response} response - * @param chunk - */ -export function getMultipartBoundary(response, chunk) { +function getMultipartBoundary(response: Response, chunk: Chunk | Chunk[]) { if (!Array.isArray(chunk)) return '' const raw = response.headers.get('Content-Type') - if (raw.includes('multipart/byteranges')) { + + if (raw?.includes('multipart/byteranges')) { const idx = raw.indexOf('boundary=') if (idx !== -1) return raw.slice(idx + 'boundary='.length) } @@ -393,32 +373,45 @@ export function getMultipartBoundary(response, chunk) { }) } -/** - * @param {string} boundary - * @param {number} start - * @param {number} end - * @param {number} size - * @return {string} - */ -function composeMultipartHeader({ boundary, start, end, size }) { +function composeMultipartHeader({ + boundary, + start, + end, + size, +}: { + boundary: string + start: number + end: number + size: number +}) { return `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${start}-${ end - 1 }/${size}\r\n\r\n` } -/** - * @param {Object} file - * @param {Array} chunks - * @param {Uint8Array} data - * @param {string} boundary - * @param {Object} metrics - */ -export function resolveMultiPartResponses({ +function incrementMetric( + metrics: PdfCachingMetricsFull, + key: keyof PdfCachingMetricsFull +) { + if (key in metrics) { + metrics[key]++ + } else { + ;(metrics[key] as number) = 1 + } +} + +function resolveMultiPartResponses({ file, chunks, data, boundary, metrics, +}: { + file: ProcessedPDFFile + chunks: Chunk[] + data: Uint8Array + boundary: string + metrics: PdfCachingMetricsFull }) { const responses = [] let offsetStart = 0 @@ -439,8 +432,7 @@ export function resolveMultiPartResponses({ .subarray(offsetStart, offsetStart + headerSize) .every((v, idx) => v === headerRaw[idx]) ) { - metrics.headerVerifyFailure |= 0 - metrics.headerVerifyFailure++ + incrementMetric(metrics, 'headerVerifyFailure') throw new OError('multipart response header does not match', { actual: new TextDecoder().decode( data.subarray(offsetStart, offsetStart + headerSize) @@ -457,16 +449,15 @@ export function resolveMultiPartResponses({ }) offsetStart += chunkSize } + return responses } -/** - * - * @param {Response} response - * @param {number} estimatedSize - * @param {RequestInit} init - */ -export function checkChunkResponse(response, estimatedSize, init) { +function checkChunkResponse( + response: Response, + estimatedSize: number, + init: RequestInit +) { if (!(response.status === 206 || response.status === 200)) { throw new OError('non successful response status: ' + response.status, { statusCode: response.status, @@ -491,7 +482,17 @@ export function checkChunkResponse(response, estimatedSize, init) { } } -function getDynamicChunkInit({ file, start, end, signal }) { +function getDynamicChunkInit({ + file, + start, + end, + signal, +}: { + file: ProcessedPDFFile + start: number + end: number + signal?: AbortSignal | null +}): RequestInit { // Avoid making range request when downloading the PDF file in full. const isFullFile = start === 0 && end === file.size return { @@ -503,15 +504,19 @@ function getDynamicChunkInit({ file, start, end, signal }) { } } -/** - * - * @param {Object} file - * @param {string} url - * @param {number} start - * @param {number} end - * @param {AbortSignal} abortSignal - */ -export async function fallbackRequest({ file, url, start, end, abortSignal }) { +export async function fallbackRequest({ + file, + url, + start, + end, + abortSignal, +}: { + file: ProcessedPDFFile + url: string + start: number + end: number + abortSignal: AbortSignal +}) { try { const init = getDynamicChunkInit({ file, start, end, signal: abortSignal }) const response = await fetchWithBrowserCacheFallback(url, init) @@ -522,16 +527,6 @@ export async function fallbackRequest({ file, url, start, end, abortSignal }) { } } -/** - * - * @param {Object} file - * @param {string} url - * @param {number} start - * @param {number} end - * @param {Object} metrics - * @param {Uint8Array} actual - * @param {AbortSignal} abortSignal - */ async function verifyRange({ file, url, @@ -540,6 +535,14 @@ async function verifyRange({ metrics, actual, abortSignal, +}: { + file: ProcessedPDFFile + url: string + start: number + end: number + metrics: PdfCachingMetricsFull + actual: Uint8Array + abortSignal: AbortSignal }) { let expectedRaw try { @@ -554,7 +557,11 @@ async function verifyRange({ throw OError.tag(error, 'cannot verify range', { url, start, end }) } const expected = new Uint8Array(expectedRaw) - const stats = {} + const stats = { + sizeDiffers: false, + mismatch: false, + success: false, + } if (actual.byteLength !== expected.byteLength) { stats.sizeDiffers = true } else if (!expected.every((v, idx) => v === actual[idx])) { @@ -566,13 +573,12 @@ async function verifyRange({ return expected } -/** - * @param {Array} chunks - * @param {Array} prefetched - * @param {number} start - * @param {number} end - */ -function skipPrefetched(chunks, prefetched, start, end) { +function skipPrefetched( + chunks: ChunkType[], + prefetched: PDFRange[], + start: number, + end: number +): ChunkType[] { return chunks.filter(chunk => { return !prefetched.find( c => @@ -582,18 +588,6 @@ function skipPrefetched(chunks, prefetched, start, end) { }) } -/** - * @param {Object|Object[]} chunk - * @param {string} url - * @param {RequestInit} init - * @param {Map} cachedUrls - * @param {Object} metrics - * @param {boolean} cachedUrlLookupEnabled - * @param {() => boolean} canTryFromCache - * @param {string} fallbackToCacheURL - * @param {Object} file - * @param {() => void} recordFallbackToClsiCache - */ async function fetchChunk({ chunk, url, @@ -605,47 +599,61 @@ async function fetchChunk({ fallbackToCacheURL, file, recordFallbackToClsiCache, +}: { + chunk: Chunk | PDFRange | Chunk[] + url: string + init: RequestInit + cachedUrls: Map + metrics: PdfCachingMetricsFull + cachedUrlLookupEnabled: boolean + canTryFromCache: (error: any) => boolean + fallbackToCacheURL: string + file: ProcessedPDFFile + recordFallbackToClsiCache: () => void }) { const estimatedSize = Array.isArray(chunk) ? estimateSizeOfMultipartResponse(chunk) : chunk.end - chunk.start - const oldUrl = cachedUrls.get(chunk.hash) - if (cachedUrlLookupEnabled && chunk.hash && oldUrl && oldUrl.url !== url) { - // When the clsi server id changes, the content id changes too and as a - // result all the browser cache keys (aka urls) get invalidated. - // We memorize the previous browser cache keys in `cachedUrls`. - try { - oldUrl.init.signal = init.signal - const response = await fetchWithBrowserCacheFallback( - oldUrl.url, - oldUrl.init - ) - if (response.status === 200) { - checkChunkResponse(response, estimatedSize, init) - metrics.oldUrlHitCount += 1 - return response + if ('hash' in chunk) { + const oldUrl = cachedUrls.get(chunk.hash) + if (cachedUrlLookupEnabled && chunk.hash && oldUrl && oldUrl.url !== url) { + // When the clsi server id changes, the content id changes too and as a + // result all the browser cache keys (aka urls) get invalidated. + // We memorize the previous browser cache keys in `cachedUrls`. + try { + oldUrl.init.signal = init.signal + const response = await fetchWithBrowserCacheFallback( + oldUrl.url, + oldUrl.init + ) + if (response.status === 200) { + checkChunkResponse(response, estimatedSize, init) + metrics.oldUrlHitCount += 1 + return response + } + if (response.status === 404) { + // The old browser cache entry is gone and the old file is gone too. + metrics.oldUrlMissCount += 1 + } + // Fallback to the latest url. + } catch (e) { + // Fallback to the latest url. } - if (response.status === 404) { - // The old browser cache entry is gone and the old file is gone too. - metrics.oldUrlMissCount += 1 - } - // Fallback to the latest url. - } catch (e) { - // Fallback to the latest url. + cachedUrls.delete(chunk.hash) // clear cached state } - cachedUrls.delete(chunk.hash) // clear cached state } + let response try { response = await fetchWithBrowserCacheFallback(url, init) checkChunkResponse(response, estimatedSize, init) - if (chunk.hash) { + if ('hash' in chunk && chunk.hash) { delete init.signal // omit the signal from the cache cachedUrls.set(chunk.hash, { url, init }) } } catch (err1) { - if (chunk.hash) { + if ('hash' in chunk && chunk.hash) { cachedUrls.delete(chunk.hash) } const hasOthersCached = cachedUrls.size > 0 @@ -656,7 +664,7 @@ async function fetchChunk({ file.ranges = file.ranges.filter(r => cachedUrls.has(r.hash)) // Try harder at fetching the chunk, fallback to cache url = fallbackToCacheURL - if (chunk.hash) { + if ('hash' in chunk && chunk.hash) { init = getDynamicChunkInit({ file, // skip object id prefix @@ -679,14 +687,6 @@ async function fetchChunk({ return response } -/** - * @param {Object} file - * @param {number} start - * @param {number} end - * @param {Array} dynamicChunks - * @param {boolean} prefetchXRefTable - * @param {number} startXRefTableRange - */ function addPrefetchingChunks({ file, start, @@ -694,6 +694,13 @@ function addPrefetchingChunks({ dynamicChunks, prefetchXRefTable, startXRefTableRange, +}: { + file: ProcessedPDFFile + start: number + end: number + dynamicChunks: Chunk[] + prefetchXRefTable: boolean + startXRefTableRange: number }) { // Prefetch in case this is the first range, or we are fetching dynamic // chunks anyway (so we can ride-share the round trip). @@ -703,7 +710,7 @@ function addPrefetchingChunks({ return } - let extraChunks = [] + let extraChunks: Chunk[] = [] if (prefetchXRefTable) { // Prefetch the dynamic chunks around the xref table. extraChunks = skipPrefetched( @@ -779,6 +786,10 @@ function addPrefetchingChunks({ } class Timer { + max: number + total: number + lastStart: number + constructor() { this.max = 0 this.total = 0 @@ -799,7 +810,7 @@ class Timer { this.lastStart = 0 } - reportInto(metrics) { + reportInto(metrics: PdfCachingMetricsFull) { const max = Math.ceil(this.max) const total = Math.ceil(this.total) if (max > metrics.latencyComputeMax) { @@ -809,25 +820,6 @@ class Timer { } } -/** - * - * @param {string} url - * @param {number} start - * @param {number} end - * @param {Object} file - * @param {queryForChunks} start - * @param {Object} metrics - * @param {Map} usageScore - * @param {Map} cachedUrls - * @param {boolean} verifyChunks - * @param {boolean} prefetchingEnabled - * @param {boolean} prefetchLargeEnabled - * @param {boolean} tryOldCachedUrlEnabled - * @param {AbortSignal} abortSignal - * @param {() => boolean} canTryFromCache - * @param {string} fallbackToCacheURL - * @param {() => void} recordFallbackToClsiCache - */ export async function fetchRange({ url, start, @@ -845,19 +837,39 @@ export async function fetchRange({ canTryFromCache, fallbackToCacheURL, recordFallbackToClsiCache, +}: { + url: string + start: number + end: number + file: ProcessedPDFFile + queryForChunks: string + metrics: PdfCachingMetricsFull + usageScore: Map + cachedUrls: Map + verifyChunks: boolean + prefetchingEnabled: boolean + prefetchLargeEnabled: boolean + cachedUrlLookupEnabled: boolean + abortSignal: AbortSignal + canTryFromCache: (error: any) => boolean + fallbackToCacheURL: string + recordFallbackToClsiCache: () => void }) { const timer = new Timer() timer.startBlockingCompute() - preprocessFileOnce({ file, usageScore, cachedUrls }) + const startXRefTableRange = - Math.floor(file.startXRefTable / PDF_JS_CHUNK_SIZE) * PDF_JS_CHUNK_SIZE + file.startXRefTable !== undefined + ? Math.floor(file.startXRefTable / PDF_JS_CHUNK_SIZE) * PDF_JS_CHUNK_SIZE + : 0 + const prefetchXRefTable = prefetchingEnabled && startXRefTableRange > 0 && start === 0 const prefetched = getMatchingChunks(file.prefetched, start, end) // Check that handling the range request won't trigger excessive sub-requests, // (to avoid unwanted latency compared to the original request). - const chunks = cutRequestAmplification({ + const chunks: PDFRange[] = cutRequestAmplification({ potentialChunks: skipPrefetched( getMatchingChunks(file.ranges, start, end), prefetched, @@ -865,7 +877,6 @@ export async function fetchRange({ end ), usageScore, - cachedUrls, metrics, start, end, @@ -913,7 +924,13 @@ export async function fetchRange({ const byteRanges = dynamicChunks .map(chunk => `${chunk.start}-${chunk.end - 1}`) .join(',') - const coalescedDynamicChunks = [] + + const coalescedDynamicChunks: { + chunk: Chunk | Chunk[] + url: string + init: RequestInit + }[] = [] + switch (true) { case dynamicChunks.length === 0: break @@ -962,11 +979,16 @@ export async function fetchRange({ const perUserPrefix = url.slice(0, url.indexOf('/build/')) const requests = chunks .map(chunk => ({ - chunk, + chunk: chunk as Chunk | Chunk[], url: `${perUserPrefix}/content/${file.contentId}/${chunk.hash}?${queryForChunks}`, init: {}, })) - .concat(coalescedDynamicChunks) + .concat(coalescedDynamicChunks) as { + chunk: PDFRange | Chunk | Chunk[] + url: string + init: RequestInit + }[] + let cachedCount = 0 let cachedBytes = 0 let fetchedCount = 0 @@ -1000,7 +1022,7 @@ export async function fetchRange({ // | pdf.js chunk | // | A BIG IMAGE BLOB | // | THE FULL PDF | - if (chunk.hash && blobFetchDate < file.createdAt) { + if ('hash' in chunk && chunk.hash && blobFetchDate < file.createdAt) { const usedChunkSection = Math.min(end, chunk.end) - Math.max(start, chunk.start) cachedCount++ @@ -1021,6 +1043,7 @@ export async function fetchRange({ if (!Array.isArray(chunk)) { return [{ chunk, data }] } + return resolveMultiPartResponses({ file, chunks: chunk, @@ -1041,52 +1064,65 @@ export async function fetchRange({ rawResponses .flat() // flatten after splitting multipart responses .concat(prefetched.map(chunk => ({ chunk, data: chunk.buffer }))) - .forEach(({ chunk, data }) => { - if (!chunk.hash && chunk.end > end) { - // This is a (partially) prefetched chunk. - chunk.buffer = data - file.prefetched.push(chunk) - if (chunk.start > end) return // This is a fully prefetched chunk. - } - // overlap: - // | REQUESTED_RANGE | - // | CHUNK | - const offsetStart = Math.max(start - chunk.start, 0) - // overlap: - // | REQUESTED_RANGE | - // | CHUNK | - const offsetEnd = Math.max(chunk.end - end, 0) - const oldDataLength = data.length - if (offsetStart > 0 || offsetEnd > 0) { - // compute index positions for slice to handle case where offsetEnd=0 - const chunkSize = chunk.end - chunk.start - data = data.subarray(offsetStart, chunkSize - offsetEnd) - } - const newDataLength = data.length - const insertPosition = Math.max(chunk.start - start, 0) - try { - reassembledBlob.set(data, insertPosition) - } catch (err) { - const reassembledBlobLength = reassembledBlob.length - const trimmedChunk = { - start: chunk.start, - end: chunk.end, - hash: chunk.hash, - objectId: new TextDecoder().decode(chunk.objectId), + .forEach( + ({ + chunk, + data, + }: { + chunk: Chunk | PrefetchedChunk> + data: Uint8Array + }) => { + if ('hash' in chunk && chunk.end > end) { + // This is a (partially) prefetched chunk. + chunk.buffer = data + file.prefetched.push(chunk) + if (chunk.start > end) return // This is a fully prefetched chunk. + } + // overlap: + // | REQUESTED_RANGE | + // | CHUNK | + const offsetStart = Math.max(start - chunk.start, 0) + // overlap: + // | REQUESTED_RANGE | + // | CHUNK | + const offsetEnd = Math.max(chunk.end - end, 0) + const oldDataLength = data.length + if (offsetStart > 0 || offsetEnd > 0) { + // compute index positions for slice to handle case where offsetEnd=0 + const chunkSize = chunk.end - chunk.start + data = data.subarray(offsetStart, chunkSize - offsetEnd) + } + const newDataLength = data.length + const insertPosition = Math.max(chunk.start - start, 0) + + try { + reassembledBlob.set(data, insertPosition) + } catch (err) { + const reassembledBlobLength = reassembledBlob.length + + const trimmedChunk = { + start: chunk.start, + end: chunk.end, + hash: 'hash' in chunk ? chunk.hash : undefined, + objectId: + 'objectId' in chunk + ? new TextDecoder().decode(chunk.objectId) + : undefined, + } + throw OError.tag(err, 'broken reassembly', { + start, + end, + chunk: trimmedChunk, + oldDataLength, + newDataLength, + offsetStart, + offsetEnd, + insertPosition, + reassembledBlobLength, + }) } - throw OError.tag(err, 'broken reassembly', { - start, - end, - chunk: trimmedChunk, - oldDataLength, - newDataLength, - offsetStart, - offsetEnd, - insertPosition, - reassembledBlobLength, - }) } - }) + ) timer.finishBlockingCompute() timer.reportInto(metrics) @@ -1100,6 +1136,7 @@ export async function fetchRange({ if (verifyChunks) { return await verifyRange({ + file, url, start, end, diff --git a/services/web/frontend/js/features/pdf-preview/util/types.ts b/services/web/frontend/js/features/pdf-preview/util/types.ts index 1168f6979b..39c72193c9 100644 --- a/services/web/frontend/js/features/pdf-preview/util/types.ts +++ b/services/web/frontend/js/features/pdf-preview/util/types.ts @@ -56,3 +56,32 @@ export type DeliveryLatencies = { latencyFetch?: number latencyRender?: number } + +export type PdfCachingMetrics = { + viewerId: string +} + +export type PdfCachingMetricsFull = PdfCachingMetrics & { + failedCount: number + failedOnce: boolean + tooMuchBandwidthCount: number + tooManyRequestsCount: number + cachedCount: number + cachedBytes: number + fetchedCount: number + fetchedBytes: number + latencyComputeMax: number + latencyComputeTotal: number + requestedCount: number + requestedBytes: number + oldUrlHitCount: number + oldUrlMissCount: number + enablePdfCaching: boolean + prefetchingEnabled: boolean + prefetchLargeEnabled: boolean + cachedUrlLookupEnabled: boolean + chunkVerifySizeDiffers?: number + chunkVerifyMismatch?: number + chunkVerifySuccess?: number + headerVerifyFailure?: number +} diff --git a/services/web/types/compile.ts b/services/web/types/compile.ts index 469695d292..f666b850c7 100644 --- a/services/web/types/compile.ts +++ b/services/web/types/compile.ts @@ -1,9 +1,16 @@ -export type PDFRange = { - objectId: Uint8Array +export type Chunk = { + start: number end: number +} + +export type PrefetchedChunk = ChunkType & { + buffer: Uint8Array +} + +export type PDFRange = Chunk & { + objectId: ObjectIdType hash: string size: number - start: number totalUsage: number } @@ -18,17 +25,36 @@ type OutputFileBase = { main?: boolean } -export type PDFFile = OutputFileBase & { +type PDFFileBase = OutputFileBase & { clsiCacheShard: string contentId: string - createdAt: Date editorId: string - pdfDownloadURL: string - pdfURL: string - prefetched: any[] - preprocessed: boolean - ranges: PDFRange[] + pdfDownloadUrl: string + pdfUrl: string size: number + startXRefTable?: number +} + +export type PDFFile = PDFFileBase & { + createdAt?: string + ranges: PDFRange[] + prefetched?: PrefetchedChunk>[] +} + +export type ProcessedPDFFile = PDFFileBase & { + preprocessed: true + createdAt: Date + prefetched: PrefetchedChunk>[] + ranges: PDFRange[] +} + +// This type is a little bit of a hack to work around the fact that we mutate +// the PDFFile object directly when processed into a ProcessedPDFFile +export type PartiallyProcessedPDFFile = PDFFileBase & { + preprocessed?: boolean + createdAt?: Date | string + prefetched?: PrefetchedChunk>[] + ranges: PDFRange[] | PDFRange[] } export type CompileOutputFile = OutputFileBase | PDFFile