mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-23 09:09:36 +02:00
[e2e] use plain pdf.js for extracting PDF text (#31125)
* [monorepo] also cycle ssl_proxy to avoid hitting the circuit breaker in nginx
* [e2e] use plain pdf.js for extracting PDF text
* [monorepo] manually download canvas binding for macOS

GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
This commit is contained in:
33
package-lock.json
generated
33
package-lock.json
generated
@@ -40447,35 +40447,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
|
||||||
"integrity": "sha1-HUCLP9t2kjuVQ9lvtMnf1TXZy10="
|
"integrity": "sha1-HUCLP9t2kjuVQ9lvtMnf1TXZy10="
|
||||||
},
|
},
|
||||||
"node_modules/pdf-parse": {
|
|
||||||
"version": "2.3.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.3.0.tgz",
|
|
||||||
"integrity": "sha512-VRKvhqdZ694CjdR1vusZ7VIA7ZuMN/GQ7eKz+e3z9ujCQdCQMOEG9x6cHfq8ddS7XspXVrruWuKmXm8g0hFlSQ==",
|
|
||||||
"dev": true,
|
|
||||||
"license": "Apache-2.0",
|
|
||||||
"dependencies": {
|
|
||||||
"pdfjs-dist": "^5.4.296"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=20.16.0 <21 || >=22.3.0"
|
|
||||||
},
|
|
||||||
"optionalDependencies": {
|
|
||||||
"@napi-rs/canvas": "^0.1.80"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/pdf-parse/node_modules/pdfjs-dist": {
|
|
||||||
"version": "5.4.296",
|
|
||||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
|
|
||||||
"integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==",
|
|
||||||
"dev": true,
|
|
||||||
"license": "Apache-2.0",
|
|
||||||
"engines": {
|
|
||||||
"node": ">=20.16.0 || >=22.3.0"
|
|
||||||
},
|
|
||||||
"optionalDependencies": {
|
|
||||||
"@napi-rs/canvas": "^0.1.80"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/pdfjs-dist": {
|
"node_modules/pdfjs-dist": {
|
||||||
"version": "5.1.91",
|
"version": "5.1.91",
|
||||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.1.91.tgz",
|
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.1.91.tgz",
|
||||||
@@ -52774,7 +52745,7 @@
|
|||||||
"isomorphic-git": "^1.33.1",
|
"isomorphic-git": "^1.33.1",
|
||||||
"js-yaml": "^4.1.1",
|
"js-yaml": "^4.1.1",
|
||||||
"mocha-junit-reporter": "^2.2.1",
|
"mocha-junit-reporter": "^2.2.1",
|
||||||
"pdf-parse": "^2.3.0",
|
"pdfjs-dist": "^5.1.91",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
"zod-validation-error": "^4.0.1"
|
"zod-validation-error": "^4.0.1"
|
||||||
}
|
}
|
||||||
@@ -60906,7 +60877,7 @@
|
|||||||
"isomorphic-git": "^1.33.1",
|
"isomorphic-git": "^1.33.1",
|
||||||
"mailtrap": "^4.3.0",
|
"mailtrap": "^4.3.0",
|
||||||
"mocha-junit-reporter": "^2.2.1",
|
"mocha-junit-reporter": "^2.2.1",
|
||||||
"pdf-parse": "^2.3.0",
|
"pdfjs-dist": "^5.1.91",
|
||||||
"typescript": "^5.0.4",
|
"typescript": "^5.0.4",
|
||||||
"uuid": "^9.0.1"
|
"uuid": "^9.0.1"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import fs from 'node:fs'
|
import fs from 'node:fs'
|
||||||
import path from 'node:path'
|
import path from 'node:path'
|
||||||
import { PDFParse } from 'pdf-parse'
|
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
|
||||||
import AdmZip from 'adm-zip'
|
import AdmZip from 'adm-zip'
|
||||||
import { setTimeout } from 'node:timers/promises'
|
import { setTimeout } from 'node:timers/promises'
|
||||||
|
|
||||||
@@ -39,15 +39,19 @@ export async function readPdf(file: string) {
|
|||||||
let attempt = 0
|
let attempt = 0
|
||||||
while (attempt < MAX_ATTEMPTS) {
|
while (attempt < MAX_ATTEMPTS) {
|
||||||
if (fs.existsSync(file)) {
|
if (fs.existsSync(file)) {
|
||||||
const dataBuffer = fs.readFileSync(path.resolve(file))
|
const pdf = await getDocument(file).promise
|
||||||
const parser = new PDFParse({ data: dataBuffer })
|
const text = []
|
||||||
try {
|
try {
|
||||||
const result = await parser.getText()
|
for (let i = 1; i <= pdf.numPages; i++) {
|
||||||
return result.text
|
const page = await pdf.getPage(i)
|
||||||
} catch (error) {
|
const content = await page.getTextContent()
|
||||||
console.error('PDF parsing failed:', error)
|
for (const item of content.items) {
|
||||||
|
if ('str' in item) text.push(item.str)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return text.join('\n')
|
||||||
} finally {
|
} finally {
|
||||||
await parser.destroy()
|
await pdf.destroy()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
await setTimeout(POLL_INTERVAL)
|
await setTimeout(POLL_INTERVAL)
|
||||||
|
|||||||
@@ -26,7 +26,7 @@
|
|||||||
"isomorphic-git": "^1.33.1",
|
"isomorphic-git": "^1.33.1",
|
||||||
"js-yaml": "^4.1.1",
|
"js-yaml": "^4.1.1",
|
||||||
"mocha-junit-reporter": "^2.2.1",
|
"mocha-junit-reporter": "^2.2.1",
|
||||||
"pdf-parse": "^2.3.0",
|
"pdfjs-dist": "^5.1.91",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
"zod-validation-error": "^4.0.1"
|
"zod-validation-error": "^4.0.1"
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user