Merge pull request #33277 from overleaf/mj-pandoc-clsi-two-step-download

[clsi] Use clsi-nginx for downloading pandoc exports

GitOrigin-RevId: b6013fae6f53c7af714634d700ceed491d724653
This commit is contained in:
Mathias Jakobsen
2026-05-07 12:33:52 +01:00
committed by Copybot
parent ae31ad218c
commit 32da6548c8
7 changed files with 273 additions and 8 deletions

View File

@@ -1,8 +1,11 @@
import crypto from 'node:crypto'
import logger from '@overleaf/logger'
import { expressify } from '@overleaf/promise-utils'
import fs from 'node:fs/promises'
import fsSync from 'node:fs'
import ConversionManager from './ConversionManager.js'
import ConversionOutputCleaner from './ConversionOutputCleaner.js'
import OutputCacheManager from './OutputCacheManager.js'
import ResourceWriter from './ResourceWriter.js'
import RequestParser from './RequestParser.js'
import { pipeline } from 'node:stream/promises'
@@ -70,6 +73,8 @@ async function convertProjectToDocument(req, res) {
request.user_id = req.params.user_id
request.metricsOpts = {}
const responseFormat = req.query.responseFormat === 'json' ? 'json' : 'stream'
const conversionId = crypto.randomUUID()
const conversionDir = Path.join(Settings.path.compilesDir, conversionId)
@@ -94,12 +99,31 @@ async function convertProjectToDocument(req, res) {
type
)
const outputName = `output.${config.extension}`
if (responseFormat === 'json') {
// TODO: drop the streaming branch once web is migrated to the two-step flow
const buildId = await OutputCacheManager.promises.generateBuildId()
const buildDir = Path.join(
Settings.path.outputDir,
conversionId,
OutputCacheManager.CACHE_SUBDIR,
buildId
)
try {
await fs.mkdir(buildDir, { recursive: true })
await fs.copyFile(documentPath, Path.join(buildDir, outputName))
res.json({ conversionId, buildId, file: outputName })
} finally {
ConversionOutputCleaner.scheduleCleanup(conversionId)
}
} else {
const documentStat = await fs.stat(documentPath)
res.setHeader('Content-Length', documentStat.size)
res.attachment(`output.${config.extension}`)
res.attachment(outputName)
res.setHeader('X-Content-Type-Options', 'nosniff')
const readStream = fsSync.createReadStream(documentPath)
await pipeline(readStream, res)
}
} finally {
await fs.rm(conversionDir, { recursive: true, force: true }).catch(() => {})
}

View File

@@ -0,0 +1,23 @@
import fs from 'node:fs/promises'
import Path from 'node:path'
import logger from '@overleaf/logger'
import Settings from '@overleaf/settings'
const TTL_MS = 60 * 1000
function scheduleCleanup(conversionId, ttlMs = TTL_MS) {
const conversionOutputDir = Path.join(Settings.path.outputDir, conversionId)
setTimeout(() => {
fs.rm(conversionOutputDir, { recursive: true, force: true }).catch(err => {
logger.warn(
{ err, conversionId, conversionOutputDir },
'failed to clean up conversion output directory'
)
})
}, ttlMs)
}
export default {
TTL_MS,
scheduleCleanup,
}

View File

@@ -684,4 +684,5 @@ OutputCacheManager.promises = {
OutputCacheManager.saveOutputFilesInBuildDir
),
queueDirOperation,
generateBuildId: promisify(OutputCacheManager.generateBuildId),
}

View File

@@ -3,8 +3,11 @@ import ClsiApp from './helpers/ClsiApp.js'
import Path from 'node:path'
import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import { buffer } from 'node:stream/consumers'
import yauzl from 'yauzl'
import { expect } from 'chai'
import { fetchStreamWithResponse } from '@overleaf/fetch-utils'
import Settings from '@overleaf/settings'
describe('Conversions', function () {
describe('docx conversion', function () {
@@ -81,4 +84,58 @@ describe('Conversions', function () {
.rejected
})
})
describe('project-to-document conversion (responseFormat=json)', function () {
before(async function () {
await ClsiApp.ensureRunning()
})
it('returns ids and serves the docx output via nginx', async function () {
const projectId = Client.randomId()
const userId = '0123456789abcdef01234567'
const request = {
rootResourcePath: 'main.tex',
resources: [
{
path: 'main.tex',
content: `\
\\documentclass{article}
\\begin{document}
Hello world
\\end{document}\
`,
},
],
}
const { conversionId, buildId, file } =
await Client.convertProjectToDocument(
projectId,
userId,
'docx',
request,
'json'
)
expect(conversionId).to.match(
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/
)
expect(buildId).to.match(/^[0-9a-f]+-[0-9a-f]+$/)
const downloadUrl = new URL(Settings.apis.clsi.downloadHost)
downloadUrl.pathname = `/project/${conversionId}/build/${buildId}/output/${file}`
const { stream, response } = await fetchStreamWithResponse(
downloadUrl.href
)
expect(response.status).to.equal(200)
const body = await buffer(stream)
expect(body.length).to.be.greaterThan(0)
// .docx is a zip archive — verify the PK\x03\x04 magic bytes
expect(body[0]).to.equal(0x50)
expect(body[1]).to.equal(0x4b)
expect(body[2]).to.equal(0x03)
expect(body[3]).to.equal(0x04)
})
})
})

View File

@@ -39,6 +39,27 @@ async function convertDocument(path, type) {
})
}
async function convertProjectToDocument(
projectId,
userId,
type,
request,
responseFormat
) {
const url = new URL(
`${host}/project/${projectId}/user/${userId}/download/project-to-document`
)
url.searchParams.set('type', type)
if (responseFormat) {
url.searchParams.set('responseFormat', responseFormat)
}
const opts = { method: 'POST', json: { compile: request } }
if (responseFormat === 'json') {
return await fetchJson(url.href, opts)
}
return await fetchStream(url.href, opts)
}
async function stopCompile(projectId) {
return await fetchNothing(`${host}/project/${projectId}/compile/stop`, {
method: 'POST',
@@ -202,6 +223,7 @@ function smokeTest() {
export default {
randomId,
compile,
convertProjectToDocument,
convertDocument,
stopCompile,
clearCache,

View File

@@ -17,7 +17,16 @@ describe('ConversionController', function () {
ctx.documentStat = { size: 5678 }
ctx.Settings = {
enablePandocConversions: true,
path: { compilesDir: '/compiles' },
path: { compilesDir: '/compiles', outputDir: '/output' },
}
ctx.OutputCacheManager = {
CACHE_SUBDIR: 'generated-files',
promises: {
generateBuildId: sinon.stub().resolves('00000000001-0000000000000001'),
},
}
ctx.ConversionOutputCleaner = {
scheduleCleanup: sinon.stub(),
}
ctx.parsedRequest = { rootResourcePath: 'main.tex' }
ctx.ConversionManager = {
@@ -43,6 +52,8 @@ describe('ConversionController', function () {
stat: sinon.stub().resolves(ctx.zipStat),
unlink: sinon.stub().resolves(),
rm: sinon.stub().resolves(),
mkdir: sinon.stub().resolves(),
copyFile: sinon.stub().resolves(),
}
ctx.readStream = new PassThrough()
@@ -79,9 +90,18 @@ describe('ConversionController', function () {
default: ctx.RequestParser,
}))
vi.doMock('../../../app/js/OutputCacheManager', () => ({
default: ctx.OutputCacheManager,
}))
vi.doMock('../../../app/js/ConversionOutputCleaner', () => ({
default: ctx.ConversionOutputCleaner,
}))
ctx.res = new PassThrough()
ctx.res.attachment = sinon.stub()
ctx.res.setHeader = sinon.stub()
ctx.res.json = sinon.stub()
ctx.ConversionController = (await import(MODULE_PATH)).default
})
@@ -322,7 +342,7 @@ describe('ConversionController', function () {
const uuidDirPattern =
/^\/compiles\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/
describe('successfully', function () {
describe('successfully (default streaming response)', function () {
beforeEach(async function (ctx) {
await ctx.ConversionController.convertProjectToDocument(
ctx.req,
@@ -376,6 +396,11 @@ describe('ConversionController', function () {
sinon.assert.calledWith(ctx.pipeline, ctx.readStream, ctx.res)
})
it('should not move the document or schedule cleanup', function (ctx) {
sinon.assert.notCalled(ctx.fs.copyFile)
sinon.assert.notCalled(ctx.ConversionOutputCleaner.scheduleCleanup)
})
it('should clean up the conversion directory', function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, sinon.match(uuidDirPattern), {
recursive: true,
@@ -384,6 +409,71 @@ describe('ConversionController', function () {
})
})
describe('successfully (responseFormat=json)', function () {
beforeEach(async function (ctx) {
ctx.req.query.responseFormat = 'json'
await ctx.ConversionController.convertProjectToDocument(
ctx.req,
ctx.res,
sinon.stub()
)
})
it('should move the document into the conversion output build dir', function (ctx) {
const outputBuildDirPattern =
/^\/output\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/generated-files\/[0-9a-f]+-[0-9a-f]+$/
sinon.assert.calledWith(
ctx.fs.mkdir,
sinon.match(outputBuildDirPattern),
{ recursive: true }
)
sinon.assert.calledWith(
ctx.fs.copyFile,
ctx.documentPath,
sinon.match(filePath => {
return (
filePath.startsWith('/output/') &&
filePath.endsWith('/output.docx')
)
})
)
})
it('should schedule cleanup of the conversion output dir', function (ctx) {
sinon.assert.calledWith(
ctx.ConversionOutputCleaner.scheduleCleanup,
sinon.match(
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/
)
)
})
it('should respond with the conversion id, build id, and file name', function (ctx) {
sinon.assert.calledWith(
ctx.res.json,
sinon.match({
conversionId: sinon.match(
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/
),
buildId: sinon.match(/^[0-9a-f]+-[0-9a-f]+$/),
file: 'output.docx',
})
)
})
it('should not stream the document', function (ctx) {
sinon.assert.notCalled(ctx.fsSync.createReadStream)
sinon.assert.notCalled(ctx.pipeline)
})
it('should clean up the working conversion directory', function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, sinon.match(uuidDirPattern), {
recursive: true,
force: true,
})
})
})
describe('with conversionType=markdown', function () {
beforeEach(async function (ctx) {
ctx.req.query = { type: 'markdown', projectName: 'My_Project' }

View File

@@ -0,0 +1,48 @@
import sinon from 'sinon'
import { vi, describe, it, beforeEach, afterEach } from 'vitest'
import Path from 'node:path'
const MODULE_PATH = Path.join(
import.meta.dirname,
'../../../app/js/ConversionOutputCleaner'
)
describe('ConversionOutputCleaner', function () {
beforeEach(async function (ctx) {
ctx.clock = sinon.useFakeTimers()
ctx.Settings = {
path: { outputDir: '/output' },
}
ctx.fs = {
rm: sinon.stub().resolves(),
}
ctx.logger = {
warn: sinon.stub(),
}
vi.doMock('node:fs/promises', () => ({ default: ctx.fs }))
vi.doMock('@overleaf/settings', () => ({ default: ctx.Settings }))
vi.doMock('@overleaf/logger', () => ({ default: ctx.logger }))
ctx.ConversionOutputCleaner = (await import(MODULE_PATH)).default
})
afterEach(function (ctx) {
ctx.clock.restore()
})
it('does not remove the directory before the TTL elapses', function (ctx) {
ctx.ConversionOutputCleaner.scheduleCleanup('test-conversion-id')
ctx.clock.tick(ctx.ConversionOutputCleaner.TTL_MS - 1)
sinon.assert.notCalled(ctx.fs.rm)
})
it('removes the conversion output directory once the TTL elapses', function (ctx) {
ctx.ConversionOutputCleaner.scheduleCleanup('test-conversion-id')
ctx.clock.tick(ctx.ConversionOutputCleaner.TTL_MS)
sinon.assert.calledWith(ctx.fs.rm, '/output/test-conversion-id', {
recursive: true,
force: true,
})
})
})