Files
openclaw/extensions/document-extract/document-extractor.ts
Vincent Koc e3cba98f39 refactor(pdf): move document extraction to plugin
* refactor(pdf): move document extraction to plugin

* fix(deps): sync document extract lockfile

* fix(pdf): harden document extraction plugin
2026-04-24 17:15:05 -07:00

217 lines
6.0 KiB
TypeScript

import type {
DocumentExtractedImage,
DocumentExtractionRequest,
DocumentExtractionResult,
DocumentExtractorPlugin,
} from "openclaw/plugin-sdk/document-extractor";
/**
 * Structural subset of a canvas instance.
 * Only the PNG export that this file calls after rendering a page is declared.
 */
type CanvasLike = {
toBuffer(type: "image/png"): Buffer;
};
/**
 * Structural subset of the dynamically imported canvas module (see CANVAS_MODULE).
 * Only the factory used to allocate a render target is declared.
 */
type CanvasModule = {
createCanvas(width: number, height: number): CanvasLike;
};
/** A text-content entry that carries a run of text in `str`. */
type PdfTextItem = {
str: string;
};
/**
 * Result of `PdfPage.getTextContent()`.
 * Entries are either text items or other objects without a `str` field; callers
 * in this file distinguish them with an `"str" in item` check.
 */
type PdfTextContent = {
items: Array<PdfTextItem | object>;
};
/** Page dimensions as returned by `PdfPage.getViewport()` for the requested scale. */
type PdfViewport = {
width: number;
height: number;
};
/** Structural subset of a PDF page: text extraction, viewport sizing and rendering. */
type PdfPage = {
getTextContent(): Promise<PdfTextContent>;
getViewport(params: { scale: number }): PdfViewport;
// Rendering is asynchronous; completion is signalled through the returned `promise`.
render(params: { canvas: unknown; viewport: PdfViewport }): { promise: Promise<void> };
};
/**
 * Structural subset of a loaded PDF document.
 * This file only requests page numbers in the range 1..numPages.
 */
type PdfDocument = {
numPages: number;
getPage(pageNumber: number): Promise<PdfPage>;
};
/**
 * Structural subset of the dynamically imported PDF module (see PDFJS_MODULE).
 * `getDocument` returns a task object whose `promise` resolves to the document.
 */
type PdfJsModule = {
getDocument(params: { data: Uint8Array; disableWorker?: boolean }): {
promise: Promise<PdfDocument>;
};
};
// Module specifiers handed to dynamic import() by the loader functions below.
// NOTE(review): presumably kept in constants so the optional dependencies are
// only resolved at runtime — confirm against the build setup.
const CANVAS_MODULE = "@napi-rs/canvas";
const PDFJS_MODULE = "pdfjs-dist/legacy/build/pdf.mjs";
// Upper bound on the total number of characters collected by appendTextWithinLimit.
const MAX_EXTRACTED_TEXT_CHARS = 200_000;
// Upper bound applied to both the width and the height computed by resolveRenderPlan.
const MAX_RENDER_DIMENSION = 10_000;
// Cached import promises; each loader resets its entry to null when the import
// fails so that a later call attempts the import again.
let canvasModulePromise: Promise<CanvasModule> | null = null;
let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
/**
 * Lazily imports the canvas module and memoises the import promise.
 *
 * A failed import clears the memoised promise, so the next call retries, and
 * surfaces as an `Error` whose `cause` is the original import failure.
 *
 * @returns The canvas module used to allocate render targets.
 * @throws Error when the optional canvas dependency cannot be imported.
 */
async function loadCanvasModule(): Promise<CanvasModule> {
  if (canvasModulePromise !== null) {
    return canvasModulePromise;
  }
  const pendingImport = (import(CANVAS_MODULE) as Promise<CanvasModule>).catch((cause) => {
    // Drop the rejected promise so a later call can try the import again.
    canvasModulePromise = null;
    throw new Error("Optional dependency @napi-rs/canvas is required for PDF image extraction", {
      cause,
    });
  });
  canvasModulePromise = pendingImport;
  return pendingImport;
}
/**
 * Lazily imports the PDF parsing module and memoises the import promise.
 *
 * A failed import clears the memoised promise, so the next call retries, and
 * surfaces as an `Error` whose `cause` is the original import failure.
 *
 * @returns The PDF module exposing `getDocument`.
 * @throws Error when the optional PDF dependency cannot be imported.
 */
async function loadPdfJsModule(): Promise<PdfJsModule> {
  if (pdfJsModulePromise !== null) {
    return pdfJsModulePromise;
  }
  const pendingImport = (import(PDFJS_MODULE) as Promise<PdfJsModule>).catch((cause) => {
    // Drop the rejected promise so a later call can try the import again.
    pdfJsModulePromise = null;
    throw new Error("Optional dependency pdfjs-dist is required for PDF extraction", {
      cause,
    });
  });
  pdfJsModulePromise = pendingImport;
  return pendingImport;
}
/**
 * Appends one page of text to `parts` without letting the running total exceed
 * the character cap.
 *
 * When the page has to be truncated, the cut never separates a UTF-16
 * surrogate pair: if the last unit that fits is a high surrogate whose low
 * surrogate would be cut off, that unit is dropped as well, so the collected
 * text never ends in half a code point.
 *
 * @param parts - Accumulator that receives the (possibly truncated) page text.
 * @param pageText - Text of the current page; an empty string is ignored.
 * @param currentLength - Number of characters already collected in `parts`.
 * @param maxChars - Character cap; defaults to MAX_EXTRACTED_TEXT_CHARS.
 * @returns The updated running length. Whenever truncation happened this is
 *   exactly `maxChars` (even if one trailing half-pair unit was dropped), so
 *   callers comparing against the cap stop collecting.
 */
function appendTextWithinLimit(
  parts: string[],
  pageText: string,
  currentLength: number,
  maxChars: number = MAX_EXTRACTED_TEXT_CHARS,
): number {
  if (!pageText) {
    return currentLength;
  }
  const remaining = maxChars - currentLength;
  if (remaining <= 0) {
    return currentLength;
  }
  if (pageText.length <= remaining) {
    parts.push(pageText);
    return currentLength + pageText.length;
  }

  // Truncation path: back off by one unit when the cut would split a pair.
  let end = remaining;
  const lastKept = pageText.charCodeAt(end - 1);
  const firstDropped = pageText.charCodeAt(end);
  const splitsSurrogatePair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsSurrogatePair) {
    end -= 1;
  }
  if (end > 0) {
    parts.push(pageText.slice(0, end));
  }
  // Report the cap as reached so the caller does not try to squeeze in more text.
  return maxChars;
}
/**
 * Chooses the largest render scale (at most 1) at which a page fits both the
 * remaining pixel budget and the per-side dimension cap.
 *
 * Because the output size is rounded up to whole pixels, the analytically
 * derived upper bound can overshoot the budget by a few pixels. The bound is
 * therefore tried first (so a page that fits unscaled is planned at exactly
 * that scale), and only if it does not fit is the scale refined by bisection.
 *
 * @param viewport - Page dimensions at scale 1.
 * @param remainingPixels - Pixel budget still available for rendering.
 * @param maxDimension - Cap for both width and height; defaults to
 *   MAX_RENDER_DIMENSION.
 * @returns The chosen scale with the resulting integer width, height and pixel
 *   count, or `null` when the inputs are non-finite or non-positive, or no
 *   scale fits.
 */
function resolveRenderPlan(
  viewport: PdfViewport,
  remainingPixels: number,
  maxDimension: number = MAX_RENDER_DIMENSION,
): { scale: number; width: number; height: number; pixels: number } | null {
  type Plan = { scale: number; width: number; height: number; pixels: number };

  if (
    remainingPixels <= 0 ||
    !Number.isFinite(viewport.width) ||
    !Number.isFinite(viewport.height) ||
    viewport.width <= 0 ||
    viewport.height <= 0
  ) {
    return null;
  }

  const pagePixels = Math.max(1, viewport.width * viewport.height);
  const maxScale = Math.min(
    1,
    Math.sqrt(remainingPixels / pagePixels),
    maxDimension / viewport.width,
    maxDimension / viewport.height,
  );
  // A NaN budget or dimension cap propagates to here and is rejected.
  if (!Number.isFinite(maxScale) || maxScale <= 0) {
    return null;
  }

  // Computes the whole-pixel output size for a scale; null when it breaks a limit.
  const planAt = (scale: number): Plan | null => {
    const width = Math.max(1, Math.ceil(viewport.width * scale));
    const height = Math.max(1, Math.ceil(viewport.height * scale));
    const pixels = width * height;
    const fits = width <= maxDimension && height <= maxDimension && pixels <= remainingPixels;
    return fits ? { scale, width, height, pixels } : null;
  };

  // Fast path: the upper bound itself is the best possible answer when it fits.
  const atUpperBound = planAt(maxScale);
  if (atUpperBound) {
    return atUpperBound;
  }

  // Rounding pushed the upper bound over a limit; bisect for the largest fit.
  let best: Plan | null = null;
  let low = 0;
  let high = maxScale;
  for (let i = 0; i < 32; i += 1) {
    const scale = (low + high) / 2;
    const candidate = planAt(scale);
    if (candidate) {
      best = candidate;
      low = scale;
    } else {
      high = scale;
    }
  }
  return best;
}
/**
 * Releases a loaded PDF document if it exposes a `destroy()` method.
 * Cleanup is best effort: a failure here must not replace the extraction
 * result or the error that is already propagating.
 */
async function releasePdfDocument(pdf: PdfDocument): Promise<void> {
  const destroy = (pdf as PdfDocument & { destroy?: () => unknown }).destroy;
  if (typeof destroy !== "function") {
    return;
  }
  try {
    await destroy.call(pdf);
  } catch {
    // Intentionally ignored; see the function comment.
  }
}

/**
 * Extracts text from a PDF and, when too little text is found, renders the
 * selected pages to PNG images instead.
 *
 * Steps:
 * 1. Load the document and select pages: the valid entries of
 *    `request.pageNumbers` if given, otherwise pages 1..numPages, in both cases
 *    limited to `request.maxPages`.
 * 2. Collect page text until all pages are read or the text cap is reached.
 * 3. If the trimmed text has at least `request.minTextChars` characters,
 *    return it without images.
 * 4. Otherwise render pages within the `request.maxPixels` budget. If the
 *    canvas module cannot be loaded, the error is reported through
 *    `request.onImageExtractionError` and the text is returned without images.
 *
 * The loaded document is released on every exit path.
 *
 * @param request - Buffer, page selection and limits for the extraction.
 * @returns Extracted text plus zero or more base64-encoded PNG page images.
 * @throws Error when the PDF module cannot be imported, or whatever the PDF
 *   module throws while loading, reading or rendering the document.
 */
async function extractPdfContent(
  request: DocumentExtractionRequest,
): Promise<DocumentExtractionResult> {
  const pdfJsModule = await loadPdfJsModule();
  const pdf = await pdfJsModule.getDocument({
    data: new Uint8Array(request.buffer),
    disableWorker: true,
  }).promise;

  try {
    const effectivePages: number[] = request.pageNumbers
      ? request.pageNumbers.filter((p) => p >= 1 && p <= pdf.numPages).slice(0, request.maxPages)
      : Array.from({ length: Math.min(pdf.numPages, request.maxPages) }, (_, i) => i + 1);

    // Pass 1: text extraction, stopping early once the text cap is reached.
    const textParts: string[] = [];
    let extractedTextLength = 0;
    for (const pageNum of effectivePages) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item) => ("str" in item ? item.str : ""))
        .filter(Boolean)
        .join(" ");
      if (pageText) {
        extractedTextLength = appendTextWithinLimit(textParts, pageText, extractedTextLength);
        if (extractedTextLength >= MAX_EXTRACTED_TEXT_CHARS) {
          break;
        }
      }
    }
    const text = textParts.join("\n\n");
    if (text.trim().length >= request.minTextChars) {
      return { text, images: [] };
    }

    // Pass 2: too little text, so fall back to page images.
    let canvasModule: CanvasModule;
    try {
      canvasModule = await loadCanvasModule();
    } catch (err) {
      request.onImageExtractionError?.(err);
      return { text, images: [] };
    }

    const images: DocumentExtractedImage[] = [];
    // The pixel budget is shared by all pages and shrinks as pages are rendered.
    let remainingPixels = Math.max(1, Math.floor(request.maxPixels));
    for (const pageNum of effectivePages) {
      if (remainingPixels <= 0) {
        break;
      }
      const page = await pdf.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1 });
      const plan = resolveRenderPlan(viewport, remainingPixels);
      if (!plan) {
        break;
      }
      const scaled = page.getViewport({ scale: plan.scale });
      const canvas = canvasModule.createCanvas(plan.width, plan.height);
      await page.render({
        canvas: canvas as unknown as HTMLCanvasElement,
        viewport: scaled,
      }).promise;
      const png = canvas.toBuffer("image/png");
      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
      remainingPixels -= plan.pixels;
    }
    return { text, images };
  } finally {
    // Free the parsed document regardless of how extraction ended.
    await releasePdfDocument(pdf);
  }
}
/**
 * Builds the plugin descriptor for PDF extraction.
 *
 * @returns A new descriptor with id `"pdf"`, registered for the
 *   `application/pdf` MIME type and backed by `extractPdfContent`.
 */
export function createPdfDocumentExtractor(): DocumentExtractorPlugin {
  const pdfExtractor: DocumentExtractorPlugin = {
    id: "pdf",
    label: "PDF",
    mimeTypes: ["application/pdf"],
    autoDetectOrder: 10,
    extract: extractPdfContent,
  };
  return pdfExtractor;
}