/**
* PDF Parsing Provider Implementation
*
* Factory pattern for routing PDF parsing requests to appropriate provider implementations.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - unpdf: Built-in Node.js PDF parser with text and image extraction
* - MinerU: Advanced commercial service with OCR, formula, and table extraction.
* Exposed as two provider IDs: 'mineru' (self-hosted mineru-api server) and
* 'mineru-cloud' (hosted service, https://mineru.ai)
*
* HOW TO ADD A NEW PROVIDER:
*
* 1. Add provider ID to PDFProviderId in lib/pdf/types.ts
* Example: | 'tesseract-ocr'
*
* 2. Add provider configuration to lib/pdf/constants.ts
* Example:
* 'tesseract-ocr': {
* id: 'tesseract-ocr',
* name: 'Tesseract OCR',
* requiresApiKey: false,
* icon: '/tesseract.svg',
* features: ['text', 'images', 'ocr']
* }
*
* 3. Implement provider function in this file
* Pattern: async function parseWithXxx(config, pdfBuffer): Promise<ParsedPdfContent>
* - Accept PDF as Buffer
* - Extract text, images, tables, formulas as needed
* - Return unified format:
* {
* text: string, // Markdown or plain text
* images: string[], // Base64 data URLs
* metadata: {
* pageCount: number,
* parser: string,
* ... // Provider-specific metadata
* }
* }
*
* Example:
* async function parseWithTesseractOCR(
* config: PDFParserConfig,
* pdfBuffer: Buffer
* ): Promise<ParsedPdfContent> {
* const { createWorker } = await import('tesseract.js');
*
* // Convert PDF pages to images
* const pdf = await getDocumentProxy(new Uint8Array(pdfBuffer));
* const numPages = pdf.numPages;
*
* const texts: string[] = [];
* const images: string[] = [];
*
* for (let pageNum = 1; pageNum <= numPages; pageNum++) {
* // Render page to canvas/image
* const page = await pdf.getPage(pageNum);
* const viewport = page.getViewport({ scale: 2.0 });
* const canvas = createCanvas(viewport.width, viewport.height);
* const context = canvas.getContext('2d');
* await page.render({ canvasContext: context, viewport }).promise;
*
* // OCR the image
* const worker = await createWorker('eng+chi_sim');
* const { data: { text } } = await worker.recognize(canvas.toBuffer());
* texts.push(text);
* await worker.terminate();
*
* // Save image
* images.push(canvas.toDataURL());
* }
*
* return {
* text: texts.join('\n\n'),
* images,
* metadata: {
* pageCount: numPages,
* parser: 'tesseract-ocr',
* },
* };
* }
*
* 4. Add case to parsePDF() switch statement
* case 'tesseract-ocr':
* result = await parseWithTesseractOCR(config, pdfBuffer);
* break;
*
* 5. Add i18n translations in lib/i18n.ts
* providerTesseractOCR: { zh: 'Tesseract OCR', en: 'Tesseract OCR' }
*
* 6. Update features in constants.ts to reflect parser capabilities
* features: ['text', 'images', 'ocr'] // OCR-capable
*
* Provider Implementation Patterns:
*
* Pattern 1: Local Node.js Parser (like unpdf)
* - Import parsing library
* - Process Buffer directly
* - Extract text and images synchronously or asynchronously
* - Convert images to base64 data URLs
* - Return immediately
*
* Pattern 2: Remote API (like MinerU)
* - Upload PDF or provide URL
* - Create task and get task ID
* - Poll for completion (with timeout)
* - Download results (text, images, metadata)
* - Parse and convert to unified format
*
* Pattern 3: OCR-based Parser (Tesseract, Google Vision)
* - Render PDF pages to images
* - Send images to OCR service
* - Collect text from all pages
* - Combine with layout analysis if available
* - Return combined text and original images
*
* Image Extraction Best Practices:
* - Always convert to base64 data URLs (data:image/png;base64,...)
* - Use PNG for lossless quality
* - Use sharp for efficient image processing
* - Handle errors per image (don't fail entire parsing)
* - Log extraction failures but continue processing
*
* Metadata Recommendations:
* - pageCount: Number of pages in PDF
* - parser: Provider ID for debugging
* - processingTime: Elapsed parse time in milliseconds (added automatically by parsePDF() when the result has metadata)
* - taskId/jobId: For async providers (useful for troubleshooting)
* - Custom fields: imageMapping, pdfImages, tables, formulas, etc.
*
* Error Handling:
* - Validate API key if requiresApiKey is true
* - Throw descriptive errors for missing configuration
* - For async providers, handle timeout and polling errors
* - Log warnings for non-critical failures (e.g., single page errors)
* - Always include provider name in error messages
*/
import { extractText, getDocumentProxy, extractImages } from 'unpdf';
import sharp from 'sharp';
import type { PDFParserConfig } from './types';
import type { ParsedPdfContent } from '@/lib/types/pdf';
import { PDF_PROVIDERS } from './constants';
import { createLogger } from '@/lib/logger';
import { extractMinerUResult } from './mineru-parser';
import { parseWithMinerUCloud } from './mineru-cloud';
// Logger used by the parsers in this module, created under the name 'PDFProviders'.
const log = createLogger('PDFProviders');
/**
 * Route a PDF to the parser selected by `config.providerId`.
 *
 * The provider must be registered in `PDF_PROVIDERS`, and an API key must be
 * supplied whenever the provider entry sets `requiresApiKey`. If the parser's
 * result carries a `metadata` object, the elapsed parsing time in milliseconds
 * is written to `metadata.processingTime` before returning.
 *
 * @param config - Provider selection plus optional API key / base URL.
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns The content produced by the selected provider.
 * @throws Error when the provider is not registered, when a required API key
 *   is missing, or when no parser is wired up for the provider ID.
 */
export async function parsePDF(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  const providerInfo = PDF_PROVIDERS[config.providerId];
  if (!providerInfo) {
    throw new Error(`Unknown PDF provider: ${config.providerId}`);
  }

  const apiKeyMissing = providerInfo.requiresApiKey && !config.apiKey;
  if (apiKeyMissing) {
    throw new Error(`API key required for PDF provider: ${config.providerId}`);
  }

  // Dispatch to the concrete parser for the selected provider.
  const runSelectedParser = async (): Promise<ParsedPdfContent> => {
    switch (config.providerId) {
      case 'unpdf':
        return parseWithUnpdf(pdfBuffer);
      case 'mineru':
        return parseWithMinerU(config, pdfBuffer);
      case 'mineru-cloud':
        return parseWithMinerUCloud(config, pdfBuffer);
      default:
        throw new Error(`Unsupported PDF provider: ${config.providerId}`);
    }
  };

  // The clock starts after validation so only parser work is measured.
  const startedAt = Date.now();
  const parsed = await runSelectedParser();

  if (parsed.metadata) {
    parsed.metadata.processingTime = Date.now() - startedAt;
  }
  return parsed;
}
/**
 * Parse a PDF locally with unpdf, returning its text and embedded images.
 *
 * Text is extracted with all pages merged into one string. Images are pulled
 * page by page, converted from raw pixel data to PNG with sharp, and exposed
 * as base64 data URLs. A failure on a single image or a single page is logged
 * and skipped; it does not abort the parse.
 *
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns Merged text, the image data URLs, and metadata holding the page
 *   count, the parser name, an id-to-data-URL map, and per-image details.
 */
async function parseWithUnpdf(pdfBuffer: Buffer): Promise<ParsedPdfContent> {
  const pdf = await getDocumentProxy(new Uint8Array(pdfBuffer));
  const pageCount = pdf.numPages;

  // Text and images are both read from the same document proxy.
  const { text } = await extractText(pdf, { mergePages: true });

  const pdfImages: Array<{
    id: string;
    src: string;
    pageNumber: number;
    width: number;
    height: number;
  }> = [];

  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
    try {
      const rawImages = await extractImages(pdf, pageNumber);
      // 1-based position within the page, used only for error reporting.
      let position = 0;

      for (const raw of rawImages) {
        position += 1;
        try {
          const { width, height, channels } = raw;
          // Re-encode the raw pixel data as PNG.
          const png = await sharp(Buffer.from(raw.data), {
            raw: { width, height, channels },
          })
            .png()
            .toBuffer();
          const src = `data:image/png;base64,${png.toString('base64')}`;

          // IDs count only images that converted successfully, so they stay
          // contiguous (img_1, img_2, ...) even when some images are skipped.
          pdfImages.push({
            id: `img_${pdfImages.length + 1}`,
            src,
            pageNumber,
            width,
            height,
          });
        } catch (sharpError) {
          log.error(`Failed to convert image ${position} from page ${pageNumber}:`, sharpError);
        }
      }
    } catch (pageError) {
      log.error(`Failed to extract images from page ${pageNumber}:`, pageError);
    }
  }

  return {
    text,
    images: pdfImages.map((image) => image.src),
    metadata: {
      pageCount,
      parser: 'unpdf',
      imageMapping: Object.fromEntries(pdfImages.map((image) => [image.id, image.src])),
      pdfImages,
    },
  };
}
/**
 * Parse PDF using self-hosted MinerU service (mineru-api)
 *
 * Official MinerU API endpoint:
 *   POST /file_parse (multipart/form-data)
 *
 * Response format:
 *   { results: { "document.pdf": { md_content, images, content_list, ... } } }
 *
 * The configured base URL is trimmed and stripped of trailing slashes before
 * the endpoint path is appended, so "http://host:8000" and "http://host:8000/"
 * both resolve to "http://host:8000/file_parse".
 *
 * @param config - Parser configuration; `baseUrl` is required, `apiKey` is
 *   sent as a Bearer token when present.
 * @param pdfBuffer - Raw bytes of the PDF document.
 * @returns The parsed content produced by `extractMinerUResult` from the
 *   server's per-file result.
 * @throws Error if the base URL is missing or empty after normalization, if
 *   the server answers with a non-2xx status, or if the response contains no
 *   results.
 * @see https://github.com/opendatalab/MinerU
 */
async function parseWithMinerU(
  config: PDFParserConfig,
  pdfBuffer: Buffer,
): Promise<ParsedPdfContent> {
  // Normalize before validating: surrounding whitespace and trailing slashes
  // would otherwise produce a malformed request URL such as "host//file_parse".
  const baseUrl = config.baseUrl?.trim().replace(/\/+$/, '');
  if (!baseUrl) {
    throw new Error(
      'MinerU base URL is required. ' +
        'Please deploy MinerU locally or specify the server URL. ' +
        'See: https://github.com/opendatalab/MinerU',
    );
  }

  log.info('[MinerU] Parsing PDF with MinerU server:', baseUrl);

  const fileName = 'document.pdf';

  // Create FormData for file upload
  const formData = new FormData();

  // Convert Buffer to Blob. The slice copies exactly this Buffer's bytes; the
  // underlying ArrayBuffer can be larger than the Buffer view.
  const arrayBuffer = pdfBuffer.buffer.slice(
    pdfBuffer.byteOffset,
    pdfBuffer.byteOffset + pdfBuffer.byteLength,
  );
  const blob = new Blob([arrayBuffer as ArrayBuffer], {
    type: 'application/pdf',
  });
  formData.append('files', blob, fileName);

  // MinerU API form fields
  // Defaults already: return_md=true, formula_enable=true, table_enable=true
  formData.append('parse_method', 'auto');
  // hybrid-auto-engine: best accuracy, uses VLM for layout understanding (requires GPU)
  // pipeline: basic mode, no VLM, faster but lower quality image extraction
  formData.append('backend', 'hybrid-auto-engine');
  formData.append('return_content_list', 'true');
  formData.append('return_images', 'true');

  // API key (if required by deployment)
  const headers: Record<string, string> = {};
  if (config.apiKey) {
    headers['Authorization'] = `Bearer ${config.apiKey}`;
  }

  // POST /file_parse
  const response = await fetch(`${baseUrl}/file_parse`, {
    method: 'POST',
    headers,
    body: formData,
  });

  if (!response.ok) {
    // Prefer the response body for diagnostics; fall back to the status text
    // when the body cannot be read.
    const errorText = await response.text().catch(() => response.statusText);
    throw new Error(`MinerU API error (${response.status}): ${errorText}`);
  }

  const json = await response.json();

  // Response: { results: { "<fileName>": { md_content, images, content_list, ... } } }
  // Null-safe access so a `null` JSON body reports "no results" rather than a TypeError.
  const results = json?.results;
  const fileResult = results?.[fileName];
  if (fileResult) {
    return extractMinerUResult(fileResult);
  }

  // Try first available key in case filename doesn't match exactly
  const keys: string[] = results ? Object.keys(results) : [];
  const [firstKey] = keys;
  const fallback = firstKey !== undefined ? results[firstKey] : null;
  if (!fallback) {
    throw new Error(`MinerU returned no results. Response keys: ${JSON.stringify(keys)}`);
  }

  log.warn(`[MinerU] Filename mismatch, using key "${firstKey}" instead of "${fileName}"`);
  return extractMinerUResult(fallback);
}
/**
 * Build the PDF parser configuration for the provider currently selected in
 * the settings store.
 *
 * Browser-only: the function throws when `window` is undefined. The settings
 * store is loaded with a dynamic import to avoid a circular dependency.
 *
 * @returns The selected provider ID together with that provider's stored API
 *   key and base URL (each `undefined` when not configured).
 * @throws Error when called outside a browser context.
 */
export async function getCurrentPDFConfig(): Promise<PDFParserConfig> {
  const inBrowser = typeof window !== 'undefined';
  if (!inBrowser) {
    throw new Error('getCurrentPDFConfig() can only be called in browser context');
  }

  // Loaded lazily to avoid a circular dependency with the settings module.
  const settingsModule = await import('@/lib/store/settings');
  const settings = settingsModule.useSettingsStore.getState();

  const activeProviderId = settings.pdfProviderId;
  const storedEntry = settings.pdfProvidersConfig?.[activeProviderId];

  return {
    providerId: activeProviderId,
    apiKey: storedEntry?.apiKey,
    baseUrl: storedEntry?.baseUrl,
  };
}
// Re-export the provider lookup helpers from ./constants so callers can import
// them from this module alongside parsePDF().
export { getAllPDFProviders, getPDFProvider } from './constants';
|