From 9970dd907a164000de214716a53ca65b9e2c5674 Mon Sep 17 00:00:00 2001 From: Jakob Ackermann Date: Fri, 30 Jan 2026 12:42:32 +0000 Subject: [PATCH] [e2e] use plain pdf.js for extracting PDF text (#31125) * [monorepo] also cycle ssl_proxy to avoid hitting circuit break in nginx * [e2e] use plain pdf.js for extracting PDF text * [monorepo] manually download canvas binding for MacOS GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e --- package-lock.json | 33 ++--------------------------- server-ce/test/helpers/read-file.ts | 20 ++++++++++------- server-ce/test/package.json | 2 +- 3 files changed, 15 insertions(+), 40 deletions(-) diff --git a/package-lock.json b/package-lock.json index 51150bc543..c2b46cf03a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -40447,35 +40447,6 @@ "resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz", "integrity": "sha1-HUCLP9t2kjuVQ9lvtMnf1TXZy10=" }, - "node_modules/pdf-parse": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.3.0.tgz", - "integrity": "sha512-VRKvhqdZ694CjdR1vusZ7VIA7ZuMN/GQ7eKz+e3z9ujCQdCQMOEG9x6cHfq8ddS7XspXVrruWuKmXm8g0hFlSQ==", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "pdfjs-dist": "^5.4.296" - }, - "engines": { - "node": ">=20.16.0 <21 || >=22.3.0" - }, - "optionalDependencies": { - "@napi-rs/canvas": "^0.1.80" - } - }, - "node_modules/pdf-parse/node_modules/pdfjs-dist": { - "version": "5.4.296", - "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz", - "integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=20.16.0 || >=22.3.0" - }, - "optionalDependencies": { - "@napi-rs/canvas": "^0.1.80" - } - }, "node_modules/pdfjs-dist": { "version": "5.1.91", "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.1.91.tgz", @@ -52774,7 +52745,7 @@ "isomorphic-git": "^1.33.1", "js-yaml": "^4.1.1", "mocha-junit-reporter": "^2.2.1", - "pdf-parse": "^2.3.0", + "pdfjs-dist": "^5.1.91", "uuid": "^9.0.1", "zod-validation-error": "^4.0.1" } @@ -60906,7 +60877,7 @@ "isomorphic-git": "^1.33.1", "mailtrap": "^4.3.0", "mocha-junit-reporter": "^2.2.1", - "pdf-parse": "^2.3.0", + "pdfjs-dist": "^5.1.91", "typescript": "^5.0.4", "uuid": "^9.0.1" } diff --git a/server-ce/test/helpers/read-file.ts b/server-ce/test/helpers/read-file.ts index a1c5a1260e..bdd2ad11a5 100644 --- a/server-ce/test/helpers/read-file.ts +++ b/server-ce/test/helpers/read-file.ts @@ -1,6 +1,6 @@ import fs from 'node:fs' import path from 'node:path' -import { PDFParse } from 'pdf-parse' +import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs' import AdmZip from 'adm-zip' import { setTimeout } from 'node:timers/promises' @@ -39,15 +39,19 @@ export async function readPdf(file: string) { let attempt = 0 while (attempt < MAX_ATTEMPTS) { if (fs.existsSync(file)) { - const dataBuffer = fs.readFileSync(path.resolve(file)) - const parser = new PDFParse({ data: dataBuffer }) + const pdf = await getDocument(file).promise + const text = [] try { - const result = await parser.getText() - return result.text - } catch (error) { - console.error('PDF parsing failed:', error) + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i) + const content = await page.getTextContent() + for (const item of content.items) { + if ('str' in item) text.push(item.str) + } + } + return text.join('\n') } finally { - await parser.destroy() + await pdf.destroy() } } await setTimeout(POLL_INTERVAL) diff --git a/server-ce/test/package.json b/server-ce/test/package.json index e0610cbb2e..dc20715c6a 100644 --- a/server-ce/test/package.json +++ b/server-ce/test/package.json @@ -26,7 +26,7 @@ "isomorphic-git": "^1.33.1", "js-yaml": "^4.1.1", "mocha-junit-reporter": "^2.2.1", - "pdf-parse": "^2.3.0", + "pdfjs-dist": "^5.1.91", "uuid": "^9.0.1", "zod-validation-error": "^4.0.1" }