mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-23 09:09:36 +02:00
[e2e] use plain pdf.js for extracting PDF text (#31125)
* [monorepo] also cycle ssl_proxy to avoid hitting circuit break in nginx
* [e2e] use plain pdf.js for extracting PDF text
* [monorepo] manually download canvas binding for MacOS

GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
import { PDFParse } from 'pdf-parse'
|
||||
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
|
||||
import AdmZip from 'adm-zip'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
|
||||
@@ -39,15 +39,19 @@ export async function readPdf(file: string) {
|
||||
let attempt = 0
|
||||
while (attempt < MAX_ATTEMPTS) {
|
||||
if (fs.existsSync(file)) {
|
||||
const dataBuffer = fs.readFileSync(path.resolve(file))
|
||||
const parser = new PDFParse({ data: dataBuffer })
|
||||
const pdf = await getDocument(file).promise
|
||||
const text = []
|
||||
try {
|
||||
const result = await parser.getText()
|
||||
return result.text
|
||||
} catch (error) {
|
||||
console.error('PDF parsing failed:', error)
|
||||
for (let i = 1; i <= pdf.numPages; i++) {
|
||||
const page = await pdf.getPage(i)
|
||||
const content = await page.getTextContent()
|
||||
for (const item of content.items) {
|
||||
if ('str' in item) text.push(item.str)
|
||||
}
|
||||
}
|
||||
return text.join('\n')
|
||||
} finally {
|
||||
await parser.destroy()
|
||||
await pdf.destroy()
|
||||
}
|
||||
}
|
||||
await setTimeout(POLL_INTERVAL)
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
"isomorphic-git": "^1.33.1",
|
||||
"js-yaml": "^4.1.1",
|
||||
"mocha-junit-reporter": "^2.2.1",
|
||||
"pdf-parse": "^2.3.0",
|
||||
"pdfjs-dist": "^5.1.91",
|
||||
"uuid": "^9.0.1",
|
||||
"zod-validation-error": "^4.0.1"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user