mirror of
https://github.com/yu-i-i/overleaf-cep.git
synced 2026-05-23 09:09:36 +02:00
* [monorepo] also cycle ssl_proxy to avoid hitting circuit break in nginx * [e2e] use plain pdf.js for extracting PDF text * [monorepo] manually download canvas binding for MacOS GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
62 lines
1.5 KiB
TypeScript
62 lines
1.5 KiB
TypeScript
import fs from 'node:fs'
|
|
import path from 'node:path'
|
|
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
|
|
import AdmZip from 'adm-zip'
|
|
import { setTimeout } from 'node:timers/promises'
|
|
|
|
/** Maximum number of times a file's existence is checked before giving up. */
const MAX_ATTEMPTS = 15
|
|
/** Pause after each unsuccessful existence check, in milliseconds. */
const POLL_INTERVAL = 500
|
|
|
|
/** Arguments accepted by readFileInZip. */
type ReadFileInZipArgs = {
|
|
/** Path of the zip archive; checked with fs.existsSync and resolved with path.resolve before it is opened. */
pathToZip: string
|
|
/** Entry name to read; compared for exact equality against each entry's entryName. */
fileToRead: string
|
|
}
|
|
|
|
/**
 * Waits for a zip archive to show up on disk and returns one of its entries
 * as text.
 *
 * The archive path is checked up to MAX_ATTEMPTS times, pausing POLL_INTERVAL
 * milliseconds after every check that finds nothing. As soon as the archive
 * exists it is opened and searched; no further polling happens after that.
 *
 * @param pathToZip - location of the zip archive
 * @param fileToRead - exact entry name to look up inside the archive
 * @returns the entry's data decoded as UTF-8
 * @throws Error when the archive exists but holds no entry with that name,
 *   or when the archive is still missing after the last attempt
 */
export async function readFileInZip({
  pathToZip,
  fileToRead,
}: ReadFileInZipArgs) {
  for (let tries = 0; tries < MAX_ATTEMPTS; tries++) {
    if (!fs.existsSync(pathToZip)) {
      // Not there yet: wait and check again.
      await setTimeout(POLL_INTERVAL)
      continue
    }

    const archive = new AdmZip(path.resolve(pathToZip))
    const entries = archive.getEntries()
    const match = entries.find(candidate => candidate.entryName === fileToRead)

    // A present archive without the requested entry is a hard failure,
    // not something to retry.
    if (!match) {
      throw new Error(`${fileToRead} not found in ${pathToZip}`)
    }
    return match.getData().toString('utf8')
  }

  throw new Error(`${pathToZip} not found`)
}
|
|
|
|
/**
 * Waits for a PDF file to show up on disk and returns its text content.
 *
 * The path is checked up to MAX_ATTEMPTS times, pausing POLL_INTERVAL
 * milliseconds after every check that finds nothing. Once the file exists it
 * is loaded with pdf.js and every page is read in order, collecting the `str`
 * of each text-content item that has one.
 *
 * @param file - path of the PDF to read
 * @returns the collected strings joined with newline characters
 * @throws Error when the file is still missing after the last attempt
 */
export async function readPdf(file: string) {
  for (let tries = 0; tries < MAX_ATTEMPTS; tries++) {
    if (!fs.existsSync(file)) {
      // Not there yet: wait and check again.
      await setTimeout(POLL_INTERVAL)
      continue
    }

    const doc = await getDocument(file).promise
    const chunks: string[] = []
    try {
      // pdf.js page numbers start at 1.
      let pageNo = 1
      while (pageNo <= doc.numPages) {
        const currentPage = await doc.getPage(pageNo)
        const { items } = await currentPage.getTextContent()
        for (const item of items) {
          // Items without a `str` property carry no text and are skipped.
          if (!('str' in item)) continue
          chunks.push(item.str)
        }
        pageNo++
      }
      return chunks.join('\n')
    } finally {
      // Release the document whether extraction succeeded or threw.
      await doc.destroy()
    }
  }

  throw new Error(`${file} not found`)
}
|