[clsi-cache] shard each zone into three instances (#25301)

* [clsi-cache] shard per zone into three instances

Keep the old instance as a read fallback. We can remove it in 4 days.

Disk size: 2Ti gives us the maximum write throughput of 240MiB/s on an
N2D instance with fewer than 8 vCPUs.

* [clsi] fix format

* [k8s] clsi-cache: bring back storage-classes

* [k8s] clsi-cache: fix reference to zonal storage-classes

* [k8s] clsi-cache: add logging configs

* [clsi] improve sharding

Co-authored-by: Brian Gough <brian.gough@overleaf.com>

* [clsi] fix sharding

Index needs to be positive.

* [clsi] fix sharding

The random part is static per machine/process.

* [clsi] restrict clsi-cache to user projects

Co-authored-by: Brian Gough <brian.gough@overleaf.com>

* [k8s] clsi-cache: align CLSI_CACHE_NGINX_HOST with service LB

---------

Co-authored-by: Brian Gough <brian.gough@overleaf.com>
GitOrigin-RevId: 1efb1b3245c8194c305420b25e774ea735251fb3
This commit is contained in:
Jakob Ackermann
2025-05-06 13:32:00 +02:00
committed by Copybot
parent 18e05fca60
commit bcceca0dbe
15 changed files with 87 additions and 34 deletions

View File

@@ -20,6 +20,19 @@ const TIMING_BUCKETS = [
0, 10, 100, 1000, 2000, 5000, 10000, 15000, 20000, 30000,
]
const MAX_ENTRIES_IN_OUTPUT_TAR = 100
const OBJECT_ID_REGEX = /^[0-9a-f]{24}$/
/**
* @param {string} projectId
* @return {{shard: string, url: string}}
*/
function getShard(projectId) {
// [timestamp 4bytes][random per machine 5bytes][counter 3bytes]
// [32bit 4bytes]
const last4Bytes = Buffer.from(projectId, 'hex').subarray(8, 12)
const idx = last4Bytes.readUInt32BE() % Settings.apis.clsiCache.shards.length
return Settings.apis.clsiCache.shards[idx]
}
/**
* @param {string} projectId
@@ -29,6 +42,7 @@ const MAX_ENTRIES_IN_OUTPUT_TAR = 100
* @param {[{path: string}]} outputFiles
* @param {string} compileGroup
* @param {Record<string, any>} options
* @return {string | undefined}
*/
function notifyCLSICacheAboutBuild({
projectId,
@@ -39,14 +53,16 @@ function notifyCLSICacheAboutBuild({
compileGroup,
options,
}) {
if (!Settings.apis.clsiCache.enabled) return
if (!Settings.apis.clsiCache.enabled) return undefined
if (!OBJECT_ID_REGEX.test(projectId)) return undefined
const { url, shard } = getShard(projectId)
/**
* @param {[{path: string}]} files
*/
const enqueue = files => {
Metrics.count('clsi_cache_enqueue_files', files.length)
fetchNothing(`${Settings.apis.clsiCache.url}/enqueue`, {
fetchNothing(`${url}/enqueue`, {
method: 'POST',
json: {
projectId,
@@ -97,6 +113,8 @@ function notifyCLSICacheAboutBuild({
'build output.tar.gz for clsi cache failed'
)
})
return shard
}
/**
@@ -155,6 +173,7 @@ async function downloadOutputDotSynctexFromCompileCache(
outputDir
) {
if (!Settings.apis.clsiCache.enabled) return false
if (!OBJECT_ID_REGEX.test(projectId)) return false
const timer = new Metrics.Timer(
'clsi_cache_download',
@@ -165,7 +184,7 @@ async function downloadOutputDotSynctexFromCompileCache(
let stream
try {
stream = await fetchStream(
`${Settings.apis.clsiCache.url}/project/${projectId}/${
`${getShard(projectId).url}/project/${projectId}/${
userId ? `user/${userId}/` : ''
}build/${editorId}-${buildId}/search/output/output.synctex.gz`,
{
@@ -205,8 +224,9 @@ async function downloadOutputDotSynctexFromCompileCache(
*/
async function downloadLatestCompileCache(projectId, userId, compileDir) {
if (!Settings.apis.clsiCache.enabled) return false
if (!OBJECT_ID_REGEX.test(projectId)) return false
const url = `${Settings.apis.clsiCache.url}/project/${projectId}/${
const url = `${getShard(projectId).url}/project/${projectId}/${
userId ? `user/${userId}/` : ''
}latest/output/output.tar.gz`
const timer = new Metrics.Timer(

View File

@@ -112,12 +112,13 @@ function compile(req, res, next) {
buildId = error.buildId
}
let clsiCacheShard
if (
status === 'success' &&
request.editorId &&
request.populateClsiCache
) {
notifyCLSICacheAboutBuild({
clsiCacheShard = notifyCLSICacheAboutBuild({
projectId: request.project_id,
userId: request.user_id,
buildId: outputFiles[0].build,
@@ -144,6 +145,7 @@ function compile(req, res, next) {
stats,
timings,
buildId,
clsiCacheShard,
outputUrlPrefix: Settings.apis.clsi.outputUrlPrefix,
outputFiles: outputFiles.map(file => ({
url:

View File

@@ -60,9 +60,15 @@ module.exports = {
}`,
},
clsiCache: {
enabled: !!process.env.CLSI_CACHE_HOST,
url: `http://${process.env.CLSI_CACHE_HOST}:3044`,
downloadURL: `http://${process.env.CLSI_CACHE_NGINX_HOST || process.env.CLSI_CACHE_HOST}:8080`,
enabled: !!(process.env.CLSI_CACHE_SHARDS || process.env.CLSI_CACHE_HOST),
shards: process.env.CLSI_CACHE_SHARDS
? JSON.parse(process.env.CLSI_CACHE_SHARDS)
: [
{
url: `http://${process.env.CLSI_CACHE_HOST}:3044`,
shard: 'cache',
},
],
},
},

View File

@@ -129,6 +129,7 @@ describe('CompileController', function () {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
...file,
})),
clsiCacheShard: undefined,
},
})
.should.equal(true)
@@ -156,6 +157,7 @@ describe('CompileController', function () {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
...file,
})),
clsiCacheShard: undefined,
},
})
.should.equal(true)
@@ -203,6 +205,7 @@ describe('CompileController', function () {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
...file,
})),
clsiCacheShard: undefined,
},
})
})
@@ -250,6 +253,7 @@ describe('CompileController', function () {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
...file,
})),
clsiCacheShard: undefined,
},
})
})
@@ -281,6 +285,7 @@ describe('CompileController', function () {
buildId: this.buildId,
stats: this.stats,
timings: this.timings,
clsiCacheShard: undefined,
},
})
.should.equal(true)
@@ -315,6 +320,7 @@ describe('CompileController', function () {
timings: this.timings,
// JSON.stringify will omit these undefined values
buildId: undefined,
clsiCacheShard: undefined,
},
})
.should.equal(true)
@@ -348,6 +354,7 @@ describe('CompileController', function () {
timings: this.timings,
// JSON.stringify will omit these undefined values
buildId: undefined,
clsiCacheShard: undefined,
},
})
.should.equal(true)
@@ -379,6 +386,7 @@ describe('CompileController', function () {
timings: this.timings,
// JSON.stringify will omit these undefined values
buildId: undefined,
clsiCacheShard: undefined,
},
})
.should.equal(true)

View File

@@ -110,8 +110,8 @@ async function getLatestBuildFromCache(req, res) {
const userId = CompileController._getUserIdForCompile(req)
try {
const {
internal: { location: metaLocation, zone },
external: { isUpToDate, allFiles },
internal: { location: metaLocation },
external: { isUpToDate, allFiles, zone, shard },
} = await ClsiCacheManager.getLatestBuildFromCache(
projectId,
userId,
@@ -153,7 +153,7 @@ async function getLatestBuildFromCache(req, res) {
size,
editorId,
})
if (clsiServerId !== 'cache') {
if (clsiServerId !== shard) {
// Enable PDF caching and attempt to download from VM first.
// (clsi VMs do not have the editorId in the path on disk, omit it).
Object.assign(f, {
@@ -174,6 +174,7 @@ async function getLatestBuildFromCache(req, res) {
outputFiles,
compileGroup,
clsiServerId,
clsiCacheShard: shard,
pdfDownloadDomain,
pdfCachingMinChunkSize,
options,

View File

@@ -41,7 +41,7 @@ async function clearCache(projectId, userId) {
path += '/output'
await Promise.all(
Settings.apis.clsiCache.instances.map(async ({ url, zone }) => {
Settings.apis.clsiCache.instances.map(async ({ url, shard }) => {
const u = new URL(url)
u.pathname = path
try {
@@ -50,7 +50,7 @@ async function clearCache(projectId, userId) {
signal: AbortSignal.timeout(15_000),
})
} catch (err) {
throw OError.tag(err, 'clear clsi-cache', { url, zone })
throw OError.tag(err, 'clear clsi-cache', { url, shard })
}
})
)
@@ -64,7 +64,7 @@ async function clearCache(projectId, userId) {
* @param buildId
* @param filename
* @param signal
* @return {Promise<{size: number, zone: string, location: string, lastModified: Date, allFiles: string[]}>}
* @return {Promise<{size: number, zone: string, shard: string, location: string, lastModified: Date, allFiles: string[]}>}
*/
async function getOutputFile(
projectId,
@@ -93,7 +93,7 @@ async function getOutputFile(
* @param userId
* @param filename
* @param signal
* @return {Promise<{size: number, zone: string, location: string, lastModified: Date, allFiles: string[]}>}
* @return {Promise<{size: number, zone: string, shard: string, location: string, lastModified: Date, allFiles: string[]}>}
*/
async function getLatestOutputFile(
projectId,
@@ -125,7 +125,7 @@ async function getLatestOutputFile(
* @param userId
* @param path
* @param signal
* @return {Promise<{size: number, zone: string, location: string, lastModified: Date, allFiles: string[]}>}
* @return {Promise<{size: number, zone: string, shard: string, location: string, lastModified: Date, allFiles: string[]}>}
*/
async function getRedirectWithFallback(
projectId,
@@ -135,7 +135,7 @@ async function getRedirectWithFallback(
) {
// Avoid hitting the same instance first all the time.
const instances = _.shuffle(Settings.apis.clsiCache.instances)
for (const { url, zone } of instances) {
for (const { url, shard } of instances) {
const u = new URL(url)
u.pathname = path
try {
@@ -149,6 +149,7 @@ async function getRedirectWithFallback(
return {
location,
zone: headers.get('X-Zone'),
shard: headers.get('X-Shard') || 'cache',
lastModified: new Date(headers.get('X-Last-Modified')),
size: parseInt(headers.get('X-Content-Length'), 10),
allFiles: JSON.parse(headers.get('X-All-Files')),
@@ -158,7 +159,7 @@ async function getRedirectWithFallback(
break // No clsi-cache instance has cached something for this project/user.
}
logger.warn(
{ err, projectId, userId, url, zone },
{ err, projectId, userId, url, shard },
'getLatestOutputFile from clsi-cache failed'
)
// This clsi-cache instance is down, try the next backend.
@@ -178,18 +179,18 @@ async function getRedirectWithFallback(
* @param templateId
* @param templateVersionId
* @param lastUpdated
* @param zone
* @param shard
* @param signal
* @return {Promise<void>}
*/
async function prepareCacheSource(
projectId,
userId,
{ sourceProjectId, templateId, templateVersionId, lastUpdated, zone, signal }
{ sourceProjectId, templateId, templateVersionId, lastUpdated, shard, signal }
) {
const url = new URL(
`/project/${projectId}/user/${userId}/import-from`,
Settings.apis.clsiCache.instances.find(i => i.zone === zone).url
Settings.apis.clsiCache.instances.find(i => i.shard === shard).url
)
try {
await fetchNothing(url, {

View File

@@ -1,9 +1,11 @@
const _ = require('lodash')
const { NotFoundError } = require('../Errors/Errors')
const ClsiCacheHandler = require('./ClsiCacheHandler')
const DocumentUpdaterHandler = require('../DocumentUpdater/DocumentUpdaterHandler')
const ProjectGetter = require('../Project/ProjectGetter')
const SplitTestHandler = require('../SplitTests/SplitTestHandler')
const UserGetter = require('../User/UserGetter')
const Settings = require('@overleaf/settings')
/**
* Get the most recent build and metadata
@@ -14,11 +16,11 @@ const UserGetter = require('../User/UserGetter')
* @param userId
* @param filename
* @param signal
* @return {Promise<{internal: {zone: string, location: string}, external: {isUpToDate: boolean, lastUpdated: Date, size: number, allFiles: string[]}}>}
* @return {Promise<{internal: {location: string}, external: {zone: string, shard: string, isUpToDate: boolean, lastUpdated: Date, size: number, allFiles: string[]}}>}
*/
async function getLatestBuildFromCache(projectId, userId, filename, signal) {
const [
{ location, lastModified: lastCompiled, zone, size, allFiles },
{ location, lastModified: lastCompiled, zone, shard, size, allFiles },
lastUpdatedInRedis,
{ lastUpdated: lastUpdatedInMongo },
] = await Promise.all([
@@ -36,13 +38,14 @@ async function getLatestBuildFromCache(projectId, userId, filename, signal) {
return {
internal: {
location,
zone,
},
external: {
isUpToDate,
lastUpdated,
size,
allFiles,
shard,
zone,
},
}
}
@@ -73,12 +76,11 @@ async function prepareClsiCache(
const signal = AbortSignal.timeout(5_000)
let lastUpdated
let zone = 'b' // populate template data on zone b
let shard = _.shuffle(Settings.apis.clsiCache.instances)[0].shard
if (sourceProjectId) {
try {
;({
internal: { zone },
external: { lastUpdated },
external: { lastUpdated, shard },
} = await getLatestBuildFromCache(
sourceProjectId,
userId,
@@ -95,7 +97,7 @@ async function prepareClsiCache(
sourceProjectId,
templateId,
templateVersionId,
zone,
shard,
lastUpdated,
signal,
})

View File

@@ -207,6 +207,7 @@ async function _sendBuiltRequest(projectId, userId, req, options, callback) {
stats: compile.stats,
timings: compile.timings,
outputUrlPrefix: compile.outputUrlPrefix,
clsiCacheShard: compile.clsiCacheShard,
}
}
@@ -853,6 +854,7 @@ module.exports = {
'timings',
'outputUrlPrefix',
'buildId',
'clsiCacheShard',
]),
sendExternalRequest: callbackifyMultiResult(sendExternalRequest, [
'status',

View File

@@ -192,7 +192,8 @@ module.exports = CompileController = {
stats,
timings,
outputUrlPrefix,
buildId
buildId,
clsiCacheShard
) => {
if (error) {
Metrics.inc('compile-error')
@@ -236,6 +237,7 @@ module.exports = CompileController = {
outputFilesArchive,
compileGroup: limits?.compileGroup,
clsiServerId,
clsiCacheShard,
validationProblems,
stats,
timings,

View File

@@ -86,6 +86,7 @@ async function compile(projectId, userId, options = {}) {
timings,
outputUrlPrefix,
buildId,
clsiCacheShard,
} = await ClsiManager.promises.sendRequest(projectId, compileAsUser, options)
return {
@@ -98,6 +99,7 @@ async function compile(projectId, userId, options = {}) {
timings,
outputUrlPrefix,
buildId,
clsiCacheShard,
}
}
@@ -184,6 +186,7 @@ module.exports = CompileManager = {
'timings',
'outputUrlPrefix',
'buildId',
'clsiCacheShard',
]),
stopCompile: callbackify(stopCompile),

View File

@@ -48,6 +48,7 @@ const compileFromCacheResponse = () => {
fromCache: true,
status: 'success',
clsiServerId: 'foo',
clsiCacheShard: 'clsi-cache-zone-b-shard-1',
compileGroup: 'priority',
pdfDownloadDomain: 'https://clsi.test-overleaf.com',
outputFiles: outputFiles(),
@@ -166,10 +167,10 @@ export const waitForCompileOutput = ({
} = {}) => {
cy.wait(`@${prefix}-log`)
.its('request.query.clsiserverid')
.should('eq', cached ? 'cache' : 'foo') // straight from cache if cached
.should('eq', cached ? 'clsi-cache-zone-b-shard-1' : 'foo') // straight from cache if cached
cy.wait(`@${prefix}-blg`)
.its('request.query.clsiserverid')
.should('eq', cached ? 'cache' : 'foo') // straight from cache if cached
.should('eq', cached ? 'clsi-cache-zone-b-shard-1' : 'foo') // straight from cache if cached
if (pdf) {
cy.wait(`@${prefix}-pdf`)
.its('request.query.clsiserverid')

View File

@@ -13,6 +13,7 @@ export function buildFileList(
outputFiles: Map<string, CompileOutputFile>,
{
clsiServerId,
clsiCacheShard,
compileGroup,
outputFilesArchive,
fromCache = false,
@@ -24,7 +25,7 @@ export function buildFileList(
const params = new URLSearchParams()
if (fromCache) {
params.set('clsiserverid', 'cache')
params.set('clsiserverid', clsiCacheShard || 'cache')
} else if (clsiServerId) {
params.set('clsiserverid', clsiServerId)
}

View File

@@ -17,6 +17,7 @@ export function handleOutputFiles(outputFiles, projectId, data) {
if (!outputFile) return null
outputFile.editorId = outputFile.editorId || EDITOR_SESSION_ID
outputFile.clsiCacheShard = data.clsiCacheShard || 'cache'
// build the URL for viewing the PDF in the preview UI
const params = new URLSearchParams()

View File

@@ -116,7 +116,9 @@ export function generatePdfCachingTransportFactory() {
return (
u.pathname.endsWith(
`build/${this.pdfFile.editorId}-${this.pdfFile.build}/output/output.pdf`
) && u.searchParams.get('clsiserverid') === 'cache'
) &&
(u.searchParams.get('clsiserverid') === 'cache' ||
u.searchParams.get('clsiserverid')?.startsWith('clsi-cache-'))
)
}
const canTryFromCache = err => {
@@ -127,7 +129,7 @@ export function generatePdfCachingTransportFactory() {
const getOutputPDFURLFromCache = () => {
if (usesCache(this.url)) return this.url
const u = new URL(this.url)
u.searchParams.set('clsiserverid', 'cache')
u.searchParams.set('clsiserverid', this.pdfFile.clsiCacheShard)
u.pathname = u.pathname.replace(
/build\/[a-f0-9-]+\//,
`build/${this.pdfFile.editorId}-${this.pdfFile.build}/`

View File

@@ -23,6 +23,7 @@ export type CompileResponseData = {
outputFiles: CompileOutputFile[]
compileGroup?: string
clsiServerId?: string
clsiCacheShard?: string
pdfDownloadDomain?: string
pdfCachingMinChunkSize: number
validationProblems: any