mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-06-04 14:49:01 +02:00
[clsi] mark VM as unhealthy when detecting out-of-disk condition (#25721)
* [clsi] shed load when detecting out-of-disk condition * [clsi] mark VM as unhealthy when detecting out-of-disk condition GitOrigin-RevId: 25cda6785c0d973f50ec6206bee389804f35917e
This commit is contained in:
+11
-3
@@ -249,6 +249,9 @@ app.get('/health_check', function (req, res) {
|
||||
if (Settings.processTooOld) {
|
||||
return res.status(500).json({ processTooOld: true })
|
||||
}
|
||||
if (ProjectPersistenceManager.isAnyDiskCriticalLow()) {
|
||||
return res.status(500).json({ diskCritical: true })
|
||||
}
|
||||
smokeTest.sendLastResult(res)
|
||||
})
|
||||
|
||||
@@ -296,9 +299,14 @@ const loadTcpServer = net.createServer(function (socket) {
|
||||
}
|
||||
|
||||
const freeLoad = availableWorkingCpus - currentLoad
|
||||
const freeLoadPercentage = Math.round(
|
||||
(freeLoad / availableWorkingCpus) * 100
|
||||
)
|
||||
let freeLoadPercentage = Math.round((freeLoad / availableWorkingCpus) * 100)
|
||||
if (ProjectPersistenceManager.isAnyDiskCriticalLow()) {
|
||||
freeLoadPercentage = 0
|
||||
}
|
||||
if (ProjectPersistenceManager.isAnyDiskLow()) {
|
||||
freeLoadPercentage = freeLoadPercentage / 2
|
||||
}
|
||||
|
||||
if (
|
||||
Settings.internal.load_balancer_agent.allow_maintenance &&
|
||||
freeLoadPercentage <= 0
|
||||
|
||||
@@ -22,6 +22,9 @@ const fs = require('node:fs')
|
||||
// projectId -> timestamp mapping.
|
||||
const LAST_ACCESS = new Map()
|
||||
|
||||
let ANY_DISK_LOW = false
|
||||
let ANY_DISK_CRITICAL_LOW = false
|
||||
|
||||
async function collectDiskStats() {
|
||||
const paths = [
|
||||
Settings.path.compilesDir,
|
||||
@@ -30,6 +33,8 @@ async function collectDiskStats() {
|
||||
]
|
||||
|
||||
const diskStats = {}
|
||||
let anyDiskLow = false
|
||||
let anyDiskCriticalLow = false
|
||||
for (const path of paths) {
|
||||
try {
|
||||
const { blocks, bavail, bsize } = await fs.promises.statfs(path)
|
||||
@@ -45,10 +50,16 @@ async function collectDiskStats() {
|
||||
})
|
||||
const lowDisk = diskAvailablePercent < 10
|
||||
diskStats[path] = { stats, lowDisk }
|
||||
|
||||
const criticalLowDisk = diskAvailablePercent < 3
|
||||
anyDiskLow = anyDiskLow || lowDisk
|
||||
anyDiskCriticalLow = anyDiskCriticalLow || criticalLowDisk
|
||||
} catch (err) {
|
||||
logger.err({ err, path }, 'error getting disk usage')
|
||||
}
|
||||
}
|
||||
ANY_DISK_LOW = anyDiskLow
|
||||
ANY_DISK_CRITICAL_LOW = anyDiskCriticalLow
|
||||
return diskStats
|
||||
}
|
||||
|
||||
@@ -70,11 +81,22 @@ async function refreshExpiryTimeout() {
|
||||
break
|
||||
}
|
||||
}
|
||||
Metrics.gauge(
|
||||
'project_persistence_expiry_timeout',
|
||||
ProjectPersistenceManager.EXPIRY_TIMEOUT
|
||||
)
|
||||
}
|
||||
|
||||
module.exports = ProjectPersistenceManager = {
|
||||
EXPIRY_TIMEOUT: Settings.project_cache_length_ms || oneDay * 2.5,
|
||||
|
||||
isAnyDiskLow() {
|
||||
return ANY_DISK_LOW
|
||||
},
|
||||
isAnyDiskCriticalLow() {
|
||||
return ANY_DISK_CRITICAL_LOW
|
||||
},
|
||||
|
||||
promises: {
|
||||
refreshExpiryTimeout,
|
||||
},
|
||||
@@ -125,12 +147,12 @@ module.exports = ProjectPersistenceManager = {
|
||||
)
|
||||
})
|
||||
|
||||
// Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter).
|
||||
// Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter) or every 5th scrape of the load agent (3s +- jitter).
|
||||
setInterval(() => {
|
||||
collectDiskStats().catch(err => {
|
||||
logger.err({ err }, 'low level error collecting disk stats')
|
||||
})
|
||||
}, 50_000)
|
||||
}, 15_000)
|
||||
},
|
||||
|
||||
markProjectAsJustAccessed(projectId, callback) {
|
||||
|
||||
Reference in New Issue
Block a user