Merge pull request #32857 from overleaf/ds-pandoc-import-md

[WEB + CLSI] Import markdown files using pandoc

GitOrigin-RevId: adad7831ddb13a8fcb8063871166bde13cbbf1b6
This commit is contained in:
Mathias Jakobsen
2026-05-07 12:09:10 +01:00
committed by Copybot
parent 44efc9d745
commit eddcc5a42e
26 changed files with 813 additions and 312 deletions

View File

@@ -25,7 +25,7 @@ describe('Conversions', function () {
const outputStream = fs.createWriteStream(
'/tmp/clsi_acceptance_tests_' + crypto.randomUUID() + '.zip'
)
const stream = await Client.convertDocx(sourcePath)
const stream = await Client.convertDocument(sourcePath, 'docx')
await pipeline(stream, outputStream)
await new Promise((resolve, reject) => {
@@ -77,7 +77,8 @@ describe('Conversions', function () {
import.meta.dirname,
'../fixtures/minimal.pdf'
)
await expect(Client.convertDocx(sourcePath)).to.eventually.be.rejected
await expect(Client.convertDocument(sourcePath, 'docx')).to.eventually.be
.rejected
})
})
})

View File

@@ -30,10 +30,10 @@ function compile(projectId, data) {
})
}
async function convertDocx(path) {
async function convertDocument(path, type) {
const formData = new FormData()
formData.append('qqfile', fs.createReadStream(path))
return await fetchStream(`${host}/convert/docx-to-latex`, {
return await fetchStream(`${host}/convert/document-to-latex?type=${type}`, {
method: 'POST',
body: formData,
})
@@ -202,7 +202,7 @@ function smokeTest() {
export default {
randomId,
compile,
convertDocx,
convertDocument,
stopCompile,
clearCache,
getOutputFile,

View File

@@ -22,7 +22,7 @@ describe('ConversionController', function () {
ctx.parsedRequest = { rootResourcePath: 'main.tex' }
ctx.ConversionManager = {
promises: {
convertDocxToLaTeXWithLock: sinon.stub().resolves(ctx.zipPath),
convertToLaTeXWithLock: sinon.stub().resolves(ctx.zipPath),
convertLaTeXToDocumentInDirWithLock: sinon
.stub()
.resolves(ctx.documentPath),
@@ -86,16 +86,17 @@ describe('ConversionController', function () {
ctx.ConversionController = (await import(MODULE_PATH)).default
})
describe('convertDocxToLaTeX', function () {
describe('convertDocumentToLaTeX', function () {
describe('when conversions are disabled', function () {
beforeEach(async function (ctx) {
ctx.Settings.enablePandocConversions = false
ctx.req = {
file: { path: '/path/to/uploaded/file.docx' },
query: { type: 'docx' },
}
ctx.res.sendStatus = sinon.stub()
await ctx.ConversionController.convertDocxToLaTeX(ctx.req, ctx.res)
await ctx.ConversionController.convertDocumentToLaTeX(ctx.req, ctx.res)
})
it('should remove the uploaded file', function (ctx) {
@@ -108,7 +109,59 @@ describe('ConversionController', function () {
it('should not call the conversion manager', function (ctx) {
sinon.assert.notCalled(
ctx.ConversionManager.promises.convertDocxToLaTeXWithLock
ctx.ConversionManager.promises.convertToLaTeXWithLock
)
})
})
// Request whose query string carries no `type` at all. The three cases below
// pin the expected handling: the uploaded file is unlinked, the response is a
// 400, and the conversion manager is never invoked.
describe('when conversionType is missing', function () {
beforeEach(async function (ctx) {
ctx.req = {
file: { path: '/path/to/uploaded/file.docx' },
// Empty query object: the `type` key is absent.
query: {},
}
// Stubbed so the status code passed by the controller can be inspected.
ctx.res.sendStatus = sinon.stub()
await ctx.ConversionController.convertDocumentToLaTeX(ctx.req, ctx.res)
})
it('should remove the uploaded file', function (ctx) {
sinon.assert.calledWith(ctx.fs.unlink, ctx.req.file.path)
})
it('should return 400', function (ctx) {
sinon.assert.calledWith(ctx.res.sendStatus, 400)
})
it('should not call the conversion manager', function (ctx) {
sinon.assert.notCalled(
ctx.ConversionManager.promises.convertToLaTeXWithLock
)
})
})
// Request whose `type` query value is 'invalid'. The expectations mirror the
// missing-type case: the uploaded file is unlinked, the response is a 400, and
// the conversion manager is never invoked.
describe('when conversionType is unsupported', function () {
beforeEach(async function (ctx) {
ctx.req = {
file: { path: '/path/to/uploaded/file.docx' },
// A `type` is present but is expected to be rejected.
query: { type: 'invalid' },
}
// Stubbed so the status code passed by the controller can be inspected.
ctx.res.sendStatus = sinon.stub()
await ctx.ConversionController.convertDocumentToLaTeX(ctx.req, ctx.res)
})
it('should remove the uploaded file', function (ctx) {
sinon.assert.calledWith(ctx.fs.unlink, ctx.req.file.path)
})
it('should return 400', function (ctx) {
sinon.assert.calledWith(ctx.res.sendStatus, 400)
})
it('should not call the conversion manager', function (ctx) {
sinon.assert.notCalled(
ctx.ConversionManager.promises.convertToLaTeXWithLock
)
})
})
@@ -117,18 +170,20 @@ describe('ConversionController', function () {
beforeEach(async function (ctx) {
ctx.req = {
file: { path: '/path/to/uploaded/file.docx' },
query: { type: 'docx' },
}
await ctx.ConversionController.convertDocxToLaTeX(ctx.req, ctx.res)
await ctx.ConversionController.convertDocumentToLaTeX(ctx.req, ctx.res)
})
it('should call the conversion manager with the uploaded file path', function (ctx) {
it('should call the conversion manager with the uploaded file path and type', function (ctx) {
sinon.assert.calledWith(
ctx.ConversionManager.promises.convertDocxToLaTeXWithLock,
ctx.ConversionManager.promises.convertToLaTeXWithLock,
sinon.match(
/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/
),
ctx.req.file.path
ctx.req.file.path,
'docx'
)
})
@@ -160,6 +215,28 @@ describe('ConversionController', function () {
})
})
// Request with `type=markdown` and an uploaded `.md` file: checks that the
// controller forwards the upload path and the 'markdown' type to the manager.
describe('with conversionType=markdown', function () {
beforeEach(async function (ctx) {
ctx.req = {
file: { path: '/path/to/uploaded/file.md' },
query: { type: 'markdown' },
}
await ctx.ConversionController.convertDocumentToLaTeX(ctx.req, ctx.res)
})
it('should call the conversion manager with the uploaded file path and markdown type', function (ctx) {
sinon.assert.calledWith(
ctx.ConversionManager.promises.convertToLaTeXWithLock,
// First argument must be a lowercase UUID with version nibble 4.
// NOTE(review): presumably the conversion id generated by the controller; confirm.
sinon.match(
/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/
),
ctx.req.file.path,
'markdown'
)
})
})
describe('unsuccessfully', function () {
describe('on streaming error', function () {
it('should propagate the error and still clean up', async function (ctx) {
@@ -169,10 +246,13 @@ describe('ConversionController', function () {
res.attachment = sinon.stub()
res.setHeader = sinon.stub()
const req = { file: { path: '/path/to/uploaded/file.docx' } }
const req = {
file: { path: '/path/to/uploaded/file.docx' },
query: { type: 'docx' },
}
await expect(
ctx.ConversionController.convertDocxToLaTeX(req, res)
ctx.ConversionController.convertDocumentToLaTeX(req, res)
).to.be.rejectedWith('mock stream error')
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir)

View File

@@ -36,7 +36,6 @@ describe('ConversionManager', function () {
}
ctx.conversionId = 'test-conversion-id'
ctx.inputPath = '/path/to/input.docx'
ctx.conversionDir = '/compiles/test-conversion-id'
ctx.outputPath = '/compiles/test-conversion-id/output-uuid.zip'
@@ -65,188 +64,287 @@ describe('ConversionManager', function () {
ctx.uuidStub.restore()
})
describe('convertDocxToLaTeXWithLock', function () {
describe('general behavior', function () {
beforeEach(async function (ctx) {
ctx.result =
await ctx.ConversionManager.promises.convertDocxToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath
describe('convertToLaTeXWithLock', function () {
describe('with conversionType=docx', function () {
beforeEach(function (ctx) {
ctx.inputPath = '/path/to/input.docx'
})
describe('file setup and pandoc args', function () {
beforeEach(async function (ctx) {
ctx.result =
await ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'docx'
)
})
it('should acquire a lock', async function (ctx) {
sinon.assert.calledWith(ctx.LockManager.acquire, ctx.conversionDir)
})
it('should copy the input file to the conversion directory with docx filename', async function (ctx) {
sinon.assert.calledWith(ctx.fs.mkdir, ctx.conversionDir, {
recursive: true,
})
sinon.assert.calledWith(
ctx.fs.copyFile,
ctx.inputPath,
Path.join(ctx.conversionDir, 'input.docx')
)
})
it('should acquire a lock', async function (ctx) {
sinon.assert.calledWith(ctx.LockManager.acquire, ctx.conversionDir)
})
it('should copy the input file to the conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.mkdir, ctx.conversionDir, {
recursive: true,
})
sinon.assert.calledWith(
ctx.fs.copyFile,
ctx.inputPath,
Path.join(ctx.conversionDir, 'input.docx')
)
})
it('should convert conversion timeout to milliseconds', async function (ctx) {
expect(ctx.CommandRunner.promises.run.firstCall.args[4]).toBe(60_000)
expect(ctx.CommandRunner.promises.run.secondCall.args[4]).toBe(60_000)
})
it('should run pandoc followed by zip in the conversion directory', function (ctx) {
expect(ctx.CommandRunner.promises.run.callCount).toBe(2)
expect(ctx.CommandRunner.promises.run.firstCall.args).toEqual([
ctx.conversionId,
[
'pandoc',
'input.docx',
'--output',
'main.tex',
'--extract-media=.',
'--from',
'docx+citations',
'--to',
'latex',
'--citeproc',
'--standalone',
],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
expect(ctx.CommandRunner.promises.run.secondCall.args).toEqual([
ctx.conversionId,
['zip', '-r', 'output-uuid.zip', '.'],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
})
})
describe('successful conversion', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.resolves({
stdout: 'mock-stdout',
stderr: 'mock-stderr',
exitCode: 0,
})
ctx.result =
await ctx.ConversionManager.promises.convertDocxToLaTeXWithLock(
it('should convert conversion timeout to milliseconds', async function (ctx) {
expect(ctx.CommandRunner.promises.run.firstCall.args[4]).toBe(60_000)
expect(ctx.CommandRunner.promises.run.secondCall.args[4]).toBe(60_000)
})
it('should run pandoc with docx args followed by zip', function (ctx) {
expect(ctx.CommandRunner.promises.run.callCount).toBe(2)
expect(ctx.CommandRunner.promises.run.firstCall.args).toEqual([
ctx.conversionId,
ctx.inputPath
)
})
it('should remove the source document after conversion', async function (ctx) {
sinon.assert.calledWith(
ctx.fs.unlink,
Path.join(ctx.conversionDir, 'input.docx')
)
})
it('should return the conversion directory', function (ctx) {
expect(ctx.result).toBe(ctx.outputPath)
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
describe('unsuccessful conversion (exitcode)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.resolves({
stdout: 'mock-stdout',
stderr: 'mock-stderr',
exitCode: 63,
})
await expect(
ctx.ConversionManager.promises.convertDocxToLaTeXWithLock(
[
'pandoc',
'input.docx',
'--output',
'main.tex',
'--to',
'latex',
'--standalone',
'--extract-media=.',
'--from',
'docx+citations',
'--citeproc',
],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
expect(ctx.CommandRunner.promises.run.secondCall.args).toEqual([
ctx.conversionId,
ctx.inputPath
)
).to.be.rejectedWith('pandoc conversion failed')
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
['zip', '-r', 'output-uuid.zip', '.'],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
describe('unsuccessful compression (exitcode)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run
.onFirstCall()
.resolves({
stdout: 'mock-pandoc-stdout',
stderr: 'mock-pandoc-stderr',
describe('successful conversion', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.resolves({
stdout: 'mock-stdout',
stderr: 'mock-stderr',
exitCode: 0,
})
.onSecondCall()
.resolves({
stdout: 'mock-zip-stdout',
stderr: 'mock-zip-stderr',
exitCode: 12,
})
await expect(
ctx.ConversionManager.promises.convertDocxToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath
ctx.result =
await ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'docx'
)
})
it('should remove the source document after conversion', async function (ctx) {
sinon.assert.calledWith(
ctx.fs.unlink,
Path.join(ctx.conversionDir, 'input.docx')
)
).to.be.rejectedWith('pandoc conversion failed')
})
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
it('should return the output zip path', function (ctx) {
expect(ctx.result).toBe(ctx.outputPath)
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
describe('unsuccessful conversion (exitcode)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.resolves({
stdout: 'mock-stdout',
stderr: 'mock-stderr',
exitCode: 63,
})
await expect(
ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'docx'
)
).to.be.rejectedWith('pandoc conversion failed')
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
describe('unsuccessful compression (exitcode)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run
.onFirstCall()
.resolves({
stdout: 'mock-pandoc-stdout',
stderr: 'mock-pandoc-stderr',
exitCode: 0,
})
.onSecondCall()
.resolves({
stdout: 'mock-zip-stdout',
stderr: 'mock-zip-stderr',
exitCode: 12,
})
await expect(
ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'docx'
)
).to.be.rejectedWith('pandoc conversion failed')
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
describe('unsuccessful conversion (throws)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.rejects(
new Error('mock conversion error')
)
await expect(
ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'docx'
)
).to.be.rejectedWith('pandoc conversion failed')
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
})
})
})
describe('unsuccessful conversion (throws)', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.rejects(
new Error('mock conversion error')
)
await expect(
ctx.ConversionManager.promises.convertDocxToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath
)
).to.be.rejectedWith('pandoc conversion failed')
describe('with conversionType=markdown', function () {
beforeEach(function (ctx) {
ctx.inputPath = '/path/to/input.md'
})
it('should remove the entire conversion directory', async function (ctx) {
sinon.assert.calledWith(ctx.fs.rm, ctx.conversionDir, {
force: true,
recursive: true,
describe('file setup and pandoc args', function () {
beforeEach(async function (ctx) {
ctx.result =
await ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'markdown'
)
})
it('should copy the input file to the conversion directory with md filename', async function (ctx) {
sinon.assert.calledWith(ctx.fs.mkdir, ctx.conversionDir, {
recursive: true,
})
sinon.assert.calledWith(
ctx.fs.copyFile,
ctx.inputPath,
Path.join(ctx.conversionDir, 'input.md')
)
})
it('should run pandoc with markdown args followed by zip', function (ctx) {
expect(ctx.CommandRunner.promises.run.callCount).toBe(2)
expect(ctx.CommandRunner.promises.run.firstCall.args).toEqual([
ctx.conversionId,
[
'pandoc',
'input.md',
'--output',
'main.tex',
'--to',
'latex',
'--standalone',
'--from',
'markdown',
],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
expect(ctx.CommandRunner.promises.run.secondCall.args).toEqual([
ctx.conversionId,
['zip', '-r', 'output-uuid.zip', '.'],
ctx.conversionDir,
ctx.Settings.pandocImage,
60_000,
{},
'conversions',
])
})
})
it('should release the lock', function (ctx) {
sinon.assert.called(ctx.lock.release)
describe('successful conversion', function () {
beforeEach(async function (ctx) {
ctx.CommandRunner.promises.run.resolves({
stdout: 'mock-stdout',
stderr: 'mock-stderr',
exitCode: 0,
})
ctx.result =
await ctx.ConversionManager.promises.convertToLaTeXWithLock(
ctx.conversionId,
ctx.inputPath,
'markdown'
)
})
it('should remove the source document after conversion', async function (ctx) {
sinon.assert.calledWith(
ctx.fs.unlink,
Path.join(ctx.conversionDir, 'input.md')
)
})
it('should return the output zip path', function (ctx) {
expect(ctx.result).toBe(ctx.outputPath)
})
})
})
})