[clsi] mark VM as unhealthy when detecting out-of-disk condition (#25721)

* [clsi] shed load when detecting out-of-disk condition

* [clsi] mark VM as unhealthy when detecting out-of-disk condition

GitOrigin-RevId: 25cda6785c0d973f50ec6206bee389804f35917e
This commit is contained in:
Jakob Ackermann
2025-05-20 11:26:00 +01:00
committed by Copybot
parent 3bcd4bd349
commit 09057984bf
2 changed files with 35 additions and 5 deletions
+11 -3
View File
@@ -249,6 +249,9 @@ app.get('/health_check', function (req, res) {
if (Settings.processTooOld) {
return res.status(500).json({ processTooOld: true })
}
if (ProjectPersistenceManager.isAnyDiskCriticalLow()) {
return res.status(500).json({ diskCritical: true })
}
smokeTest.sendLastResult(res)
})
@@ -296,9 +299,14 @@ const loadTcpServer = net.createServer(function (socket) {
}
const freeLoad = availableWorkingCpus - currentLoad
const freeLoadPercentage = Math.round(
(freeLoad / availableWorkingCpus) * 100
)
let freeLoadPercentage = Math.round((freeLoad / availableWorkingCpus) * 100)
if (ProjectPersistenceManager.isAnyDiskCriticalLow()) {
freeLoadPercentage = 0
}
if (ProjectPersistenceManager.isAnyDiskLow()) {
freeLoadPercentage = freeLoadPercentage / 2
}
if (
Settings.internal.load_balancer_agent.allow_maintenance &&
freeLoadPercentage <= 0
@@ -22,6 +22,9 @@ const fs = require('node:fs')
// projectId -> timestamp mapping.
const LAST_ACCESS = new Map()
let ANY_DISK_LOW = false
let ANY_DISK_CRITICAL_LOW = false
async function collectDiskStats() {
const paths = [
Settings.path.compilesDir,
@@ -30,6 +33,8 @@ async function collectDiskStats() {
]
const diskStats = {}
let anyDiskLow = false
let anyDiskCriticalLow = false
for (const path of paths) {
try {
const { blocks, bavail, bsize } = await fs.promises.statfs(path)
@@ -45,10 +50,16 @@ async function collectDiskStats() {
})
const lowDisk = diskAvailablePercent < 10
diskStats[path] = { stats, lowDisk }
const criticalLowDisk = diskAvailablePercent < 3
anyDiskLow = anyDiskLow || lowDisk
anyDiskCriticalLow = anyDiskCriticalLow || criticalLowDisk
} catch (err) {
logger.err({ err, path }, 'error getting disk usage')
}
}
ANY_DISK_LOW = anyDiskLow
ANY_DISK_CRITICAL_LOW = anyDiskCriticalLow
return diskStats
}
@@ -70,11 +81,22 @@ async function refreshExpiryTimeout() {
break
}
}
Metrics.gauge(
'project_persistence_expiry_timeout',
ProjectPersistenceManager.EXPIRY_TIMEOUT
)
}
module.exports = ProjectPersistenceManager = {
EXPIRY_TIMEOUT: Settings.project_cache_length_ms || oneDay * 2.5,
/**
 * Report whether the most recent disk stats collection saw a low disk.
 *
 * Reads the module-level ANY_DISK_LOW flag, which collectDiskStats() sets
 * after checking each path; a path counts as low when its available disk
 * percentage is below 10.
 *
 * @returns {boolean} the current value of ANY_DISK_LOW
 */
isAnyDiskLow() {
return ANY_DISK_LOW
},
/**
 * Report whether the most recent disk stats collection saw a critically
 * low disk.
 *
 * Reads the module-level ANY_DISK_CRITICAL_LOW flag, which
 * collectDiskStats() sets after checking each path; a path counts as
 * critically low when its available disk percentage is below 3.
 *
 * @returns {boolean} the current value of ANY_DISK_CRITICAL_LOW
 */
isAnyDiskCriticalLow() {
return ANY_DISK_CRITICAL_LOW
},
promises: {
refreshExpiryTimeout,
},
@@ -125,12 +147,12 @@ module.exports = ProjectPersistenceManager = {
)
})
// Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter).
// Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter) or every 5th scrape of the load agent (3s +- jitter).
setInterval(() => {
collectDiskStats().catch(err => {
logger.err({ err }, 'low level error collecting disk stats')
})
}, 50_000)
}, 15_000)
},
markProjectAsJustAccessed(projectId, callback) {