Files
overleaf-cep/server-ce/test/helpers/read-file.ts
Jakob Ackermann 9970dd907a [e2e] use plain pdf.js for extracting PDF text (#31125)
* [monorepo] also cycle ssl_proxy to avoid hitting circuit break in nginx

* [e2e] use plain pdf.js for extracting PDF text

* [monorepo] manually download canvas binding for MacOS

GitOrigin-RevId: dbadbbaa0e121953c06ab4be9241bf361997865e
2026-02-02 09:05:29 +00:00

62 lines
1.5 KiB
TypeScript

import fs from 'node:fs'
import path from 'node:path'
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'
import AdmZip from 'adm-zip'
import { setTimeout } from 'node:timers/promises'
/** Upper bound on how many times the helpers in this file check for a file before giving up. */
const MAX_ATTEMPTS = 15

/** Pause between two consecutive existence checks, in milliseconds. */
const POLL_INTERVAL = 500

/** Arguments accepted by `readFileInZip`. */
interface ReadFileInZipArgs {
  /** Location of the zip archive on disk. */
  pathToZip: string
  /** Entry name inside the archive whose contents should be returned. */
  fileToRead: string
}
/**
 * Returns the UTF-8 decoded contents of one entry of a zip archive, waiting
 * for the archive to show up on disk first.
 *
 * The archive path is checked up to `MAX_ATTEMPTS` times, sleeping
 * `POLL_INTERVAL` milliseconds after every unsuccessful check. As soon as the
 * path exists the archive is opened and searched once; a missing entry is
 * reported immediately and is not retried.
 *
 * @param pathToZip - Path of the zip archive to wait for and open.
 * @param fileToRead - Exact entry name to look up inside the archive.
 * @returns The entry's data decoded as a UTF-8 string.
 * @throws Error when the archive exists but has no entry named `fileToRead`,
 *   or when the archive never appears within the polling budget.
 */
export async function readFileInZip({
  pathToZip,
  fileToRead,
}: ReadFileInZipArgs) {
  for (let tries = 0; tries < MAX_ATTEMPTS; tries++) {
    if (!fs.existsSync(pathToZip)) {
      // Not there yet: wait one interval, then check again.
      await setTimeout(POLL_INTERVAL)
      continue
    }

    const archive = new AdmZip(path.resolve(pathToZip))
    const allEntries = archive.getEntries()
    const match = allEntries.find(
      candidate => candidate.entryName === fileToRead
    )
    if (!match) {
      throw new Error(`${fileToRead} not found in ${pathToZip}`)
    }
    return match.getData().toString('utf8')
  }

  throw new Error(`${pathToZip} not found`)
}
/**
 * Extracts the text of a PDF file, waiting for the file to show up on disk
 * first.
 *
 * The path is checked up to `MAX_ATTEMPTS` times, sleeping `POLL_INTERVAL`
 * milliseconds after every unsuccessful check. Once the file exists it is
 * loaded with pdf.js and every page from 1 to `numPages` is visited in order.
 * The `str` value of each text-content item that has one is collected, and
 * the collected strings are joined with newlines.
 *
 * @param file - Path of the PDF file to wait for and read.
 * @returns All collected text strings joined by `\n`.
 * @throws Error when the file never appears within the polling budget; errors
 *   raised while loading or reading the document propagate unchanged.
 */
export async function readPdf(file: string) {
  for (let tries = 0; tries < MAX_ATTEMPTS; tries++) {
    if (!fs.existsSync(file)) {
      // Not there yet: wait one interval, then check again.
      await setTimeout(POLL_INTERVAL)
      continue
    }

    const doc = await getDocument(file).promise
    try {
      const fragments: string[] = []
      for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
        const currentPage = await doc.getPage(pageNumber)
        const { items } = await currentPage.getTextContent()
        for (const item of items) {
          // Only items that carry a `str` property contribute text.
          if (!('str' in item)) continue
          fragments.push(item.str)
        }
      }
      return fragments.join('\n')
    } finally {
      // Release the document whether extraction succeeded or threw.
      await doc.destroy()
    }
  }

  throw new Error(`${file} not found`)
}