[e2e] use plain pdf.js for extracting PDF text (#31125)

* [monorepo] also cycle ssl_proxy to avoid hitting the circuit breaker in nginx

* [e2e] use plain pdf.js for extracting PDF text

* [monorepo] manually download canvas binding for macOS

GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
This commit is contained in:
Jakob Ackermann
2026-01-30 12:42:32 +00:00
committed by Copybot
parent 4d6c1138c7
commit 9970dd907a
3 changed files with 15 additions and 40 deletions

View File

@@ -1,6 +1,6 @@
import fs from 'node:fs'
import path from 'node:path'
import { PDFParse } from 'pdf-parse'
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
import AdmZip from 'adm-zip'
import { setTimeout } from 'node:timers/promises'
@@ -39,15 +39,19 @@ export async function readPdf(file: string) {
let attempt = 0
while (attempt < MAX_ATTEMPTS) {
if (fs.existsSync(file)) {
const dataBuffer = fs.readFileSync(path.resolve(file))
const parser = new PDFParse({ data: dataBuffer })
const pdf = await getDocument(file).promise
const text = []
try {
const result = await parser.getText()
return result.text
} catch (error) {
console.error('PDF parsing failed:', error)
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i)
const content = await page.getTextContent()
for (const item of content.items) {
if ('str' in item) text.push(item.str)
}
}
return text.join('\n')
} finally {
await parser.destroy()
await pdf.destroy()
}
}
await setTimeout(POLL_INTERVAL)

View File

@@ -26,7 +26,7 @@
"isomorphic-git": "^1.33.1",
"js-yaml": "^4.1.1",
"mocha-junit-reporter": "^2.2.1",
"pdf-parse": "^2.3.0",
"pdfjs-dist": "^5.1.91",
"uuid": "^9.0.1",
"zod-validation-error": "^4.0.1"
}