[e2e] use plain pdf.js for extracting PDF text (#31125)

* [monorepo] also cycle ssl_proxy to avoid hitting the circuit breaker in nginx

* [e2e] use plain pdf.js for extracting PDF text

* [monorepo] manually download canvas binding for macOS

GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
This commit is contained in:
Jakob Ackermann
2026-01-30 12:42:32 +00:00
committed by Copybot
parent 4d6c1138c7
commit 9970dd907a
3 changed files with 15 additions and 40 deletions

33
package-lock.json generated
View File

@@ -40447,35 +40447,6 @@
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz", "resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
"integrity": "sha1-HUCLP9t2kjuVQ9lvtMnf1TXZy10=" "integrity": "sha1-HUCLP9t2kjuVQ9lvtMnf1TXZy10="
}, },
"node_modules/pdf-parse": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.3.0.tgz",
"integrity": "sha512-VRKvhqdZ694CjdR1vusZ7VIA7ZuMN/GQ7eKz+e3z9ujCQdCQMOEG9x6cHfq8ddS7XspXVrruWuKmXm8g0hFlSQ==",
"dev": true,
"license": "Apache-2.0",
"dependencies": {
"pdfjs-dist": "^5.4.296"
},
"engines": {
"node": ">=20.16.0 <21 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.80"
}
},
"node_modules/pdf-parse/node_modules/pdfjs-dist": {
"version": "5.4.296",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
"integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==",
"dev": true,
"license": "Apache-2.0",
"engines": {
"node": ">=20.16.0 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.80"
}
},
"node_modules/pdfjs-dist": { "node_modules/pdfjs-dist": {
"version": "5.1.91", "version": "5.1.91",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.1.91.tgz", "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.1.91.tgz",
@@ -52774,7 +52745,7 @@
"isomorphic-git": "^1.33.1", "isomorphic-git": "^1.33.1",
"js-yaml": "^4.1.1", "js-yaml": "^4.1.1",
"mocha-junit-reporter": "^2.2.1", "mocha-junit-reporter": "^2.2.1",
"pdf-parse": "^2.3.0", "pdfjs-dist": "^5.1.91",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"zod-validation-error": "^4.0.1" "zod-validation-error": "^4.0.1"
} }
@@ -60906,7 +60877,7 @@
"isomorphic-git": "^1.33.1", "isomorphic-git": "^1.33.1",
"mailtrap": "^4.3.0", "mailtrap": "^4.3.0",
"mocha-junit-reporter": "^2.2.1", "mocha-junit-reporter": "^2.2.1",
"pdf-parse": "^2.3.0", "pdfjs-dist": "^5.1.91",
"typescript": "^5.0.4", "typescript": "^5.0.4",
"uuid": "^9.0.1" "uuid": "^9.0.1"
} }

View File

@@ -1,6 +1,6 @@
import fs from 'node:fs' import fs from 'node:fs'
import path from 'node:path' import path from 'node:path'
import { PDFParse } from 'pdf-parse' import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
import AdmZip from 'adm-zip' import AdmZip from 'adm-zip'
import { setTimeout } from 'node:timers/promises' import { setTimeout } from 'node:timers/promises'
@@ -39,15 +39,19 @@ export async function readPdf(file: string) {
let attempt = 0 let attempt = 0
while (attempt < MAX_ATTEMPTS) { while (attempt < MAX_ATTEMPTS) {
if (fs.existsSync(file)) { if (fs.existsSync(file)) {
const dataBuffer = fs.readFileSync(path.resolve(file)) const pdf = await getDocument(file).promise
const parser = new PDFParse({ data: dataBuffer }) const text = []
try { try {
const result = await parser.getText() for (let i = 1; i <= pdf.numPages; i++) {
return result.text const page = await pdf.getPage(i)
} catch (error) { const content = await page.getTextContent()
console.error('PDF parsing failed:', error) for (const item of content.items) {
if ('str' in item) text.push(item.str)
}
}
return text.join('\n')
} finally { } finally {
await parser.destroy() await pdf.destroy()
} }
} }
await setTimeout(POLL_INTERVAL) await setTimeout(POLL_INTERVAL)

View File

@@ -26,7 +26,7 @@
"isomorphic-git": "^1.33.1", "isomorphic-git": "^1.33.1",
"js-yaml": "^4.1.1", "js-yaml": "^4.1.1",
"mocha-junit-reporter": "^2.2.1", "mocha-junit-reporter": "^2.2.1",
"pdf-parse": "^2.3.0", "pdfjs-dist": "^5.1.91",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"zod-validation-error": "^4.0.1" "zod-validation-error": "^4.0.1"
} }