Skip to content

Commit

Permalink
feat: resolve PDF.js build for browser context
Browse files Browse the repository at this point in the history
  • Loading branch information
johannschopplich committed Aug 12, 2023
1 parent 66a2fc7 commit 325d450
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 35 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# unpdf

A collection of utilities to work with PDFs.
A collection of utilities to work with PDFs. Uses Mozilla's [PDF.js](https://github.com/mozilla/pdf.js) under the hood.

`unpdf` takes advantage of [export conditions](https://nodejs.org/api/packages.html#packages_conditional_exports) to provide a minimal bundle size for the browser. As of now, the available methods are only supported in Node contexts.
`unpdf` takes advantage of [export conditions](https://nodejs.org/api/packages.html#packages_conditional_exports) to circumvent build issues in serverless environments. PDF.js depends on the optional `canvas` module, which [doesn't work inside worker threads](https://github.com/Automattic/node-canvas/issues/1394).

**Why this package then?**
This library is also intended as a modern alternative to the unmaintained [`pdf-parse`](https://www.npmjs.com/package/pdf-parse).

- To circumvent build issues in serverless environments, where the `canvas` package used by `PDF.js` is not supported.
- WIP and more to come.
## Features

- 🏗️ Conditional exports for browser, Node, and worker environments
- 💬 Extract text from PDFs

## Installation

Expand Down
2 changes: 1 addition & 1 deletion build.config.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { defineBuildConfig } from 'unbuild'

export default defineBuildConfig({
entries: ['src/index.node', 'src/index.web.ts'],
entries: ['src/index.web.ts', 'src/index.worker', 'src/index.node'],
clean: true,
declaration: true,
externals: ['pdfjs-dist'],
Expand Down
22 changes: 11 additions & 11 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"type": "module",
"version": "0.2.2",
"packageManager": "pnpm@8.6.12",
"description": "Utilities to work with PDFs",
"description": "Utilities to work with PDFs, like extracting text",
"author": "Johann Schopplich <pkg@johannschopplich.com>",
"license": "MIT",
"homepage": "https://github.com/johannschopplich/unpdf#readme",
Expand All @@ -21,16 +21,16 @@
"sideEffects": false,
"exports": {
"browser": "./dist/index.web.mjs",
"bun": "./dist/index.web.mjs",
"deno": "./dist/index.web.mjs",
"edge-light": "./dist/index.web.mjs",
"edge-routine": "./dist/index.web.mjs",
"lagon": "./dist/index.web.mjs",
"netlify": "./dist/index.web.mjs",
"react-native": "./dist/index.web.mjs",
"wintercg": "./dist/index.web.mjs",
"worker": "./dist/index.web.mjs",
"workerd": "./dist/index.web.mjs",
"bun": "./dist/index.worker.mjs",
"deno": "./dist/index.worker.mjs",
"edge-light": "./dist/index.worker.mjs",
"edge-routine": "./dist/index.worker.mjs",
"lagon": "./dist/index.worker.mjs",
"netlify": "./dist/index.worker.mjs",
"react-native": "./dist/index.worker.mjs",
"wintercg": "./dist/index.worker.mjs",
"worker": "./dist/index.worker.mjs",
"workerd": "./dist/index.worker.mjs",
"node": {
"types": "./dist/index.node.d.ts",
"import": "./dist/index.node.mjs",
Expand Down
4 changes: 2 additions & 2 deletions src/image.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getDocumentProxy, getPDFJSImports } from './utils'
import { getDocumentProxy, getResolvedPDFJSInstance } from './utils'

export async function getImagesFromPage(
data: ArrayBuffer,
Expand All @@ -7,7 +7,7 @@ export async function getImagesFromPage(
const pdf = await getDocumentProxy(data)
const page = await pdf.getPage(pageNumber)
const operatorList = await page.getOperatorList()
const { OPS } = await getPDFJSImports()
const { OPS } = getResolvedPDFJSInstance()

const images: ArrayBuffer[] = []
for (const op of operatorList.fnArray) {
Expand Down
15 changes: 13 additions & 2 deletions src/index.node.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,13 @@
export * from './text'
export * from './image'
import { decodePDFText as _decodePDFText } from './text'
import { getImagesFromPage as _getImagesFromPage } from './image'
import { resolvePDFJSNodeImports } from './utils'

/**
 * Node entry point for `decodePDFText`.
 *
 * Waits for `resolvePDFJSNodeImports()` to finish, then forwards every
 * argument unchanged to the implementation imported from `./text` and
 * returns whatever that implementation resolves to.
 */
export const decodePDFText: typeof _decodePDFText = async (...params) => {
  // Resolve the Node build of PDF.js before delegating.
  await resolvePDFJSNodeImports()

  const content = await _decodePDFText(...params)
  return content
}

/**
 * Node entry point for `getImagesFromPage`.
 *
 * Waits for `resolvePDFJSNodeImports()` to finish, then forwards every
 * argument unchanged to the implementation imported from `./image` and
 * returns whatever that implementation resolves to.
 */
export const getImagesFromPage: typeof _getImagesFromPage = async (...params) => {
  // Resolve the Node build of PDF.js before delegating.
  await resolvePDFJSNodeImports()

  const images = await _getImagesFromPage(...params)
  return images
}
17 changes: 9 additions & 8 deletions src/index.web.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import type {
decodePDFText as _decodePDFText,
getImagesFromPage as _getImagesFromPage,
} from './index.node'
import { decodePDFText as _decodePDFText } from './text'
import { getImagesFromPage as _getImagesFromPage } from './image'
import { resolvePDFJSWebImports } from './utils'

export const decodePDFText: typeof _decodePDFText = async () => {
throw new Error('Not supported in browser context yet')
/**
 * Browser entry point for `decodePDFText`.
 *
 * Waits for `resolvePDFJSWebImports()` to finish, then forwards every
 * argument unchanged to the implementation imported from `./text` and
 * returns whatever that implementation resolves to.
 */
export const decodePDFText: typeof _decodePDFText = async (...params) => {
  // Resolve the web build of PDF.js before delegating.
  await resolvePDFJSWebImports()

  const content = await _decodePDFText(...params)
  return content
}

export const getImagesFromPage: typeof _getImagesFromPage = async () => {
throw new Error('Not supported in browser context yet')
/**
 * Browser entry point for `getImagesFromPage`.
 *
 * Waits for `resolvePDFJSWebImports()` to finish, then forwards every
 * argument unchanged to the implementation imported from `./image` and
 * returns whatever that implementation resolves to.
 */
export const getImagesFromPage: typeof _getImagesFromPage = async (...params) => {
  // Resolve the web build of PDF.js before delegating.
  await resolvePDFJSWebImports()

  const images = await _getImagesFromPage(...params)
  return images
}
10 changes: 10 additions & 0 deletions src/index.worker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import type { decodePDFText as _decodePDFText } from './text'
import type { getImagesFromPage as _getImagesFromPage } from './image'

/**
 * Worker-context placeholder for `decodePDFText`.
 *
 * Shares the signature of the implementation in `./text` but always yields a
 * rejected promise, because text extraction is not implemented for worker
 * environments yet.
 */
export const decodePDFText: typeof _decodePDFText = () =>
  Promise.reject(new Error('Not implemented in worker context yet'))

/**
 * Worker-context placeholder for `getImagesFromPage`.
 *
 * Shares the signature of the implementation in `./image` but always yields a
 * rejected promise, because image extraction is not implemented for worker
 * environments yet.
 */
export const getImagesFromPage: typeof _getImagesFromPage = () =>
  Promise.reject(new Error('Not implemented in worker context yet'))
4 changes: 4 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@ export interface PDFContent {
metadata?: any
text: string | string[]
}

/**
 * Identity-style mapped type: copies every key `K` of `T` with its original
 * value type `T[K]`, then intersects the result with the empty object type
 * `Record<never, never>`.
 *
 * NOTE(review): presumably meant to make editors display the expanded object
 * shape of intersection or otherwise computed types, as the name suggests —
 * no usage is visible here, confirm against callers.
 */
export type Prettify<T> = {
[K in keyof T]: T[K];
} & Record<never, never>
37 changes: 31 additions & 6 deletions src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import type * as PDFJS from 'pdfjs-dist'

let instance: typeof PDFJS | undefined

export async function getDocumentProxy(data: ArrayBuffer) {
const { getDocument } = await getPDFJSImports()
const { getDocument } = instance!
const pdf = await getDocument({
data,
useWorkerFetch: false,
Expand All @@ -10,15 +14,36 @@ export async function getDocumentProxy(data: ArrayBuffer) {
return pdf
}

export async function getPDFJSImports() {
/**
 * Returns the PDF.js module cached in the module-level `instance` variable.
 *
 * This accessor is synchronous and never loads PDF.js itself; the cache is
 * filled by `resolvePDFJSWebImports()` or `resolvePDFJSNodeImports()`.
 *
 * @returns The cached PDF.js module.
 * @throws {Error} If no PDF.js module has been resolved yet. Without this
 * guard the caller would receive `undefined` and fail later with an opaque
 * `TypeError` when destructuring the result.
 */
export function getResolvedPDFJSInstance() {
  if (!instance) {
    throw new Error(
      'PDF.js has not been resolved yet. Call `resolvePDFJSWebImports()` or `resolvePDFJSNodeImports()` first.',
    )
  }

  return instance
}

/**
 * Loads the `pdfjs-dist` entry point and caches its default export in the
 * module-level `instance` variable.
 *
 * Does nothing when `instance` is already set, so an earlier resolution is
 * kept and repeated calls are cheap.
 *
 * @throws {Error} If the dynamic import fails. The underlying error is
 * discarded and replaced with a fixed message.
 */
export async function resolvePDFJSWebImports() {
  if (!instance) {
    try {
      const pdfjs = await import('pdfjs-dist')
      instance = pdfjs.default
    }
    catch {
      throw new Error('PDF.js is not available. Please add the package as a dependency.')
    }
  }
}

export async function resolvePDFJSNodeImports() {
if (instance)
return

try {
const { default: mod } = await import('pdfjs-dist/legacy/build/pdf')
return mod
instance = mod
}
catch (e) {
console.error(e)
catch (error) {
throw new Error(
'PDF.js is not available. Please run `pnpm add -D pdfjs-dist` and try again.',
'PDF.js is not available. Please add the package as a dependency.',
)
}
}

0 comments on commit 325d450

Please sign in to comment.