diff --git a/services/clsi/app/js/ConversionController.js b/services/clsi/app/js/ConversionController.js index 7214143fe5..6bb37a36a7 100644 --- a/services/clsi/app/js/ConversionController.js +++ b/services/clsi/app/js/ConversionController.js @@ -1,8 +1,11 @@ +import crypto from 'node:crypto' import logger from '@overleaf/logger' import { expressify } from '@overleaf/promise-utils' import fs from 'node:fs/promises' import fsSync from 'node:fs' import ConversionManager from './ConversionManager.js' +import ConversionOutputCleaner from './ConversionOutputCleaner.js' +import OutputCacheManager from './OutputCacheManager.js' import ResourceWriter from './ResourceWriter.js' import RequestParser from './RequestParser.js' import { pipeline } from 'node:stream/promises' @@ -70,6 +73,8 @@ async function convertProjectToDocument(req, res) { request.user_id = req.params.user_id request.metricsOpts = {} + const responseFormat = req.query.responseFormat === 'json' ? 'json' : 'stream' + const conversionId = crypto.randomUUID() const conversionDir = Path.join(Settings.path.compilesDir, conversionId) @@ -94,12 +99,31 @@ async function convertProjectToDocument(req, res) { type ) - const documentStat = await fs.stat(documentPath) - res.setHeader('Content-Length', documentStat.size) - res.attachment(`output.${config.extension}`) - res.setHeader('X-Content-Type-Options', 'nosniff') - const readStream = fsSync.createReadStream(documentPath) - await pipeline(readStream, res) + const outputName = `output.${config.extension}` + if (responseFormat === 'json') { + // TODO: drop the streaming branch once web is migrated to the two-step flow + const buildId = await OutputCacheManager.promises.generateBuildId() + const buildDir = Path.join( + Settings.path.outputDir, + conversionId, + OutputCacheManager.CACHE_SUBDIR, + buildId + ) + try { + await fs.mkdir(buildDir, { recursive: true }) + await fs.copyFile(documentPath, Path.join(buildDir, outputName)) + res.json({ conversionId, buildId, file: outputName }) + } finally { + ConversionOutputCleaner.scheduleCleanup(conversionId) + } + } else { + const documentStat = await fs.stat(documentPath) + res.setHeader('Content-Length', documentStat.size) + res.attachment(outputName) + res.setHeader('X-Content-Type-Options', 'nosniff') + const readStream = fsSync.createReadStream(documentPath) + await pipeline(readStream, res) + } } finally { await fs.rm(conversionDir, { recursive: true, force: true }).catch(() => {}) } diff --git a/services/clsi/app/js/ConversionOutputCleaner.js b/services/clsi/app/js/ConversionOutputCleaner.js new file mode 100644 index 0000000000..ef5c16b398 --- /dev/null +++ b/services/clsi/app/js/ConversionOutputCleaner.js @@ -0,0 +1,23 @@ +import fs from 'node:fs/promises' +import Path from 'node:path' +import logger from '@overleaf/logger' +import Settings from '@overleaf/settings' + +const TTL_MS = 60 * 1000 + +function scheduleCleanup(conversionId, ttlMs = TTL_MS) { + const conversionOutputDir = Path.join(Settings.path.outputDir, conversionId) + setTimeout(() => { + fs.rm(conversionOutputDir, { recursive: true, force: true }).catch(err => { + logger.warn( + { err, conversionId, conversionOutputDir }, + 'failed to clean up conversion output directory' + ) + }) + }, ttlMs) +} + +export default { + TTL_MS, + scheduleCleanup, +} diff --git a/services/clsi/app/js/OutputCacheManager.js b/services/clsi/app/js/OutputCacheManager.js index 1abdc7a6fc..7ef7ead354 100644 --- a/services/clsi/app/js/OutputCacheManager.js +++ b/services/clsi/app/js/OutputCacheManager.js @@ -684,4 +684,5 @@ OutputCacheManager.promises = { OutputCacheManager.saveOutputFilesInBuildDir ), queueDirOperation, + generateBuildId: promisify(OutputCacheManager.generateBuildId), } diff --git a/services/clsi/test/acceptance/js/ConversionTests.js b/services/clsi/test/acceptance/js/ConversionTests.js index 56e93290ec..a855c34cd3 100644 --- a/services/clsi/test/acceptance/js/ConversionTests.js +++ b/services/clsi/test/acceptance/js/ConversionTests.js @@ -3,8 +3,11 @@ import ClsiApp from './helpers/ClsiApp.js' import Path from 'node:path' import fs from 'node:fs' import { pipeline } from 'node:stream/promises' +import { buffer } from 'node:stream/consumers' import yauzl from 'yauzl' import { expect } from 'chai' +import { fetchStreamWithResponse } from '@overleaf/fetch-utils' +import Settings from '@overleaf/settings' describe('Conversions', function () { describe('docx conversion', function () { @@ -81,4 +84,58 @@ describe('Conversions', function () { .rejected }) }) + + describe('project-to-document conversion (responseFormat=json)', function () { + before(async function () { + await ClsiApp.ensureRunning() + }) + + it('returns ids and serves the docx output via nginx', async function () { + const projectId = Client.randomId() + const userId = '0123456789abcdef01234567' + const request = { + rootResourcePath: 'main.tex', + resources: [ + { + path: 'main.tex', + content: `\ +\\documentclass{article} +\\begin{document} +Hello world +\\end{document}\ +`, + }, + ], + } + + const { conversionId, buildId, file } = + await Client.convertProjectToDocument( + projectId, + userId, + 'docx', + request, + 'json' + ) + + expect(conversionId).to.match( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/ + ) + expect(buildId).to.match(/^[0-9a-f]+-[0-9a-f]+$/) + + const downloadUrl = new URL(Settings.apis.clsi.downloadHost) + downloadUrl.pathname = `/project/${conversionId}/build/${buildId}/output/${file}` + const { stream, response } = await fetchStreamWithResponse( + downloadUrl.href + ) + expect(response.status).to.equal(200) + + const body = await buffer(stream) + expect(body.length).to.be.greaterThan(0) + // .docx is a zip archive — verify the PK\x03\x04 magic bytes + expect(body[0]).to.equal(0x50) + expect(body[1]).to.equal(0x4b) + expect(body[2]).to.equal(0x03) + expect(body[3]).to.equal(0x04) + }) + }) }) diff --git a/services/clsi/test/acceptance/js/helpers/Client.js b/services/clsi/test/acceptance/js/helpers/Client.js index fac1d9a26a..d9b0a81eb7 100644 --- a/services/clsi/test/acceptance/js/helpers/Client.js +++ b/services/clsi/test/acceptance/js/helpers/Client.js @@ -39,6 +39,27 @@ async function convertDocument(path, type) { }) } +async function convertProjectToDocument( + projectId, + userId, + type, + request, + responseFormat +) { + const url = new URL( + `${host}/project/${projectId}/user/${userId}/download/project-to-document` + ) + url.searchParams.set('type', type) + if (responseFormat) { + url.searchParams.set('responseFormat', responseFormat) + } + const opts = { method: 'POST', json: { compile: request } } + if (responseFormat === 'json') { + return await fetchJson(url.href, opts) + } + return await fetchStream(url.href, opts) +} + async function stopCompile(projectId) { return await fetchNothing(`${host}/project/${projectId}/compile/stop`, { method: 'POST', @@ -202,6 +223,7 @@ function smokeTest() { export default { randomId, compile, + convertProjectToDocument, convertDocument, stopCompile, clearCache, diff --git a/services/clsi/test/unit/js/ConversionController.test.js b/services/clsi/test/unit/js/ConversionController.test.js index a785d2139f..94138ab1f3 100644 --- a/services/clsi/test/unit/js/ConversionController.test.js +++ b/services/clsi/test/unit/js/ConversionController.test.js @@ -17,7 +17,16 @@ describe('ConversionController', function () { ctx.documentStat = { size: 5678 } ctx.Settings = { enablePandocConversions: true, - path: { compilesDir: '/compiles' }, + path: { compilesDir: '/compiles', outputDir: '/output' }, + } + ctx.OutputCacheManager = { + CACHE_SUBDIR: 'generated-files', + promises: { + generateBuildId: sinon.stub().resolves('00000000001-0000000000000001'), + }, + } + ctx.ConversionOutputCleaner = { + scheduleCleanup: sinon.stub(), } ctx.parsedRequest = { rootResourcePath: 'main.tex' } ctx.ConversionManager = { @@ -43,6 +52,8 @@ describe('ConversionController', function () { stat: sinon.stub().resolves(ctx.zipStat), unlink: sinon.stub().resolves(), rm: sinon.stub().resolves(), + mkdir: sinon.stub().resolves(), + copyFile: sinon.stub().resolves(), } ctx.readStream = new PassThrough() @@ -79,9 +90,18 @@ describe('ConversionController', function () { default: ctx.RequestParser, })) + vi.doMock('../../../app/js/OutputCacheManager', () => ({ + default: ctx.OutputCacheManager, + })) + + vi.doMock('../../../app/js/ConversionOutputCleaner', () => ({ + default: ctx.ConversionOutputCleaner, + })) + ctx.res = new PassThrough() ctx.res.attachment = sinon.stub() ctx.res.setHeader = sinon.stub() + ctx.res.json = sinon.stub() ctx.ConversionController = (await import(MODULE_PATH)).default }) @@ -322,7 +342,7 @@ describe('ConversionController', function () { const uuidDirPattern = /^\/compiles\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/ - describe('successfully', function () { + describe('successfully (default streaming response)', function () { beforeEach(async function (ctx) { await ctx.ConversionController.convertProjectToDocument( ctx.req, @@ -376,6 +396,11 @@ describe('ConversionController', function () { sinon.assert.calledWith(ctx.pipeline, ctx.readStream, ctx.res) }) + it('should not move the document or schedule cleanup', function (ctx) { + sinon.assert.notCalled(ctx.fs.copyFile) + sinon.assert.notCalled(ctx.ConversionOutputCleaner.scheduleCleanup) + }) + it('should clean up the conversion directory', function (ctx) { sinon.assert.calledWith(ctx.fs.rm, sinon.match(uuidDirPattern), { recursive: true, @@ -384,6 +409,71 @@ describe('ConversionController', function () { }) }) + describe('successfully (responseFormat=json)', function () { + beforeEach(async function (ctx) { + ctx.req.query.responseFormat = 'json' + await ctx.ConversionController.convertProjectToDocument( + ctx.req, + ctx.res, + sinon.stub() + ) + }) + + it('should move the document into the conversion output build dir', function (ctx) { + const outputBuildDirPattern = + /^\/output\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/generated-files\/[0-9a-f]+-[0-9a-f]+$/ + sinon.assert.calledWith( + ctx.fs.mkdir, + sinon.match(outputBuildDirPattern), + { recursive: true } + ) + sinon.assert.calledWith( + ctx.fs.copyFile, + ctx.documentPath, + sinon.match(filePath => { + return ( + filePath.startsWith('/output/') && + filePath.endsWith('/output.docx') + ) + }) + ) + }) + + it('should schedule cleanup of the conversion output dir', function (ctx) { + sinon.assert.calledWith( + ctx.ConversionOutputCleaner.scheduleCleanup, + sinon.match( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/ + ) + ) + }) + + it('should respond with the conversion id, build id, and file name', function (ctx) { + sinon.assert.calledWith( + ctx.res.json, + sinon.match({ + conversionId: sinon.match( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/ + ), + buildId: sinon.match(/^[0-9a-f]+-[0-9a-f]+$/), + file: 'output.docx', + }) + ) + }) + + it('should not stream the document', function (ctx) { + sinon.assert.notCalled(ctx.fsSync.createReadStream) + sinon.assert.notCalled(ctx.pipeline) + }) + + it('should clean up the working conversion directory', function (ctx) { + sinon.assert.calledWith(ctx.fs.rm, sinon.match(uuidDirPattern), { + recursive: true, + force: true, + }) + }) + }) + describe('with conversionType=markdown', function () { beforeEach(async function (ctx) { ctx.req.query = { type: 'markdown', projectName: 'My_Project' } diff --git a/services/clsi/test/unit/js/ConversionOutputCleaner.test.js b/services/clsi/test/unit/js/ConversionOutputCleaner.test.js new file mode 100644 index 0000000000..73de1a5922 --- /dev/null +++ b/services/clsi/test/unit/js/ConversionOutputCleaner.test.js @@ -0,0 +1,48 @@ +import sinon from 'sinon' +import { vi, describe, it, beforeEach, afterEach } from 'vitest' +import Path from 'node:path' + +const MODULE_PATH = Path.join( + import.meta.dirname, + '../../../app/js/ConversionOutputCleaner' +) + +describe('ConversionOutputCleaner', function () { + beforeEach(async function (ctx) { + ctx.clock = sinon.useFakeTimers() + ctx.Settings = { + path: { outputDir: '/output' }, + } + ctx.fs = { + rm: sinon.stub().resolves(), + } + ctx.logger = { + warn: sinon.stub(), + } + + vi.doMock('node:fs/promises', () => ({ default: ctx.fs })) + vi.doMock('@overleaf/settings', () => ({ default: ctx.Settings })) + vi.doMock('@overleaf/logger', () => ({ default: ctx.logger })) + + ctx.ConversionOutputCleaner = (await import(MODULE_PATH)).default + }) + + afterEach(function (ctx) { + ctx.clock.restore() + }) + + it('does not remove the directory before the TTL elapses', function (ctx) { + ctx.ConversionOutputCleaner.scheduleCleanup('test-conversion-id') + ctx.clock.tick(ctx.ConversionOutputCleaner.TTL_MS - 1) + sinon.assert.notCalled(ctx.fs.rm) + }) + + it('removes the conversion output directory once the TTL elapses', function (ctx) { + ctx.ConversionOutputCleaner.scheduleCleanup('test-conversion-id') + ctx.clock.tick(ctx.ConversionOutputCleaner.TTL_MS) + sinon.assert.calledWith(ctx.fs.rm, '/output/test-conversion-id', { + recursive: true, + force: true, + }) + }) +})