From 09057984bf39d4e37506deaa5726099c0c6d89b9 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Tue, 20 May 2025 11:26:00 +0100 Subject: [PATCH] [clsi] mark VM as unhealthy when detecting out-of-disk condition (#25721) * [clsi] shed load when detecting out-of-disk condition * [clsi] mark VM as unhealthy when detecting out-of-disk condition GitOrigin-RevId: 25cda6785c0d973f50ec6206bee389804f35917e --- services/clsi/app.js | 14 +++++++--- .../clsi/app/js/ProjectPersistenceManager.js | 26 +++++++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/services/clsi/app.js b/services/clsi/app.js index 8de9d89b9b..872f612d9c 100644 --- a/services/clsi/app.js +++ b/services/clsi/app.js @@ -249,6 +249,9 @@ app.get('/health_check', function (req, res) { if (Settings.processTooOld) { return res.status(500).json({ processTooOld: true }) } + if (ProjectPersistenceManager.isAnyDiskCriticalLow()) { + return res.status(500).json({ diskCritical: true }) + } smokeTest.sendLastResult(res) }) @@ -296,9 +299,14 @@ const loadTcpServer = net.createServer(function (socket) { } const freeLoad = availableWorkingCpus - currentLoad - const freeLoadPercentage = Math.round( - (freeLoad / availableWorkingCpus) * 100 - ) + let freeLoadPercentage = Math.round((freeLoad / availableWorkingCpus) * 100) + if (ProjectPersistenceManager.isAnyDiskCriticalLow()) { + freeLoadPercentage = 0 + } + if (ProjectPersistenceManager.isAnyDiskLow()) { + freeLoadPercentage = freeLoadPercentage / 2 + } + if ( Settings.internal.load_balancer_agent.allow_maintenance && freeLoadPercentage <= 0 diff --git a/services/clsi/app/js/ProjectPersistenceManager.js b/services/clsi/app/js/ProjectPersistenceManager.js index e96a4591c3..41cdd07f4d 100644 --- a/services/clsi/app/js/ProjectPersistenceManager.js +++ b/services/clsi/app/js/ProjectPersistenceManager.js @@ -22,6 +22,9 @@ const fs = require('node:fs') // projectId -> timestamp mapping. 
const LAST_ACCESS = new Map() +let ANY_DISK_LOW = false +let ANY_DISK_CRITICAL_LOW = false + async function collectDiskStats() { const paths = [ Settings.path.compilesDir, @@ -30,6 +33,8 @@ async function collectDiskStats() { ] const diskStats = {} + let anyDiskLow = false + let anyDiskCriticalLow = false for (const path of paths) { try { const { blocks, bavail, bsize } = await fs.promises.statfs(path) @@ -45,10 +50,16 @@ async function collectDiskStats() { }) const lowDisk = diskAvailablePercent < 10 diskStats[path] = { stats, lowDisk } + + const criticalLowDisk = diskAvailablePercent < 3 + anyDiskLow = anyDiskLow || lowDisk + anyDiskCriticalLow = anyDiskCriticalLow || criticalLowDisk } catch (err) { logger.err({ err, path }, 'error getting disk usage') } } + ANY_DISK_LOW = anyDiskLow + ANY_DISK_CRITICAL_LOW = anyDiskCriticalLow return diskStats } @@ -70,11 +81,22 @@ async function refreshExpiryTimeout() { break } } + Metrics.gauge( + 'project_persistence_expiry_timeout', + ProjectPersistenceManager.EXPIRY_TIMEOUT + ) } module.exports = ProjectPersistenceManager = { EXPIRY_TIMEOUT: Settings.project_cache_length_ms || oneDay * 2.5, + isAnyDiskLow() { + return ANY_DISK_LOW + }, + isAnyDiskCriticalLow() { + return ANY_DISK_CRITICAL_LOW + }, + promises: { refreshExpiryTimeout, }, @@ -125,12 +147,12 @@ module.exports = ProjectPersistenceManager = { ) }) - // Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter). + // Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter) or every 5th scrape of the load agent (3s +- jitter). setInterval(() => { collectDiskStats().catch(err => { logger.err({ err }, 'low level error collecting disk stats') }) - }, 50_000) + }, 15_000) }, markProjectAsJustAccessed(projectId, callback) {