File size: 3,617 Bytes
/**
 * Shared MinerU result parser.
 * Used by both the self-hosted (pdf-providers.ts) and cloud (mineru-cloud.ts) paths.
 * Normalizes MinerU output (markdown + images dict + content_list) into ParsedPdfContent.
 */
import type { ParsedPdfContent } from '@/lib/types/pdf';
import { createLogger } from '@/lib/logger';
// Module-level logger created with the 'MinerUParser' tag; used by the parser below
// for its warning and summary messages.
const log = createLogger('MinerUParser');
/** Placement details gathered from an `image` entry of MinerU's content_list. */
interface MinerUImageMeta {
  pageIdx: number;
  width: number;
  height: number;
  caption?: string;
}

/**
 * Bounding box read as `[x0, y0, x1, y1]` (width = [2] - [0], height = [3] - [1]).
 * Extra trailing elements are tolerated and ignored.
 */
type MinerUBBox = readonly [number, number, number, number, ...unknown[]];

/** Fallback box used when an image entry carries no usable `bbox`. */
const MINERU_DEFAULT_BBOX: MinerUBBox = [0, 0, 1000, 1000];

/** True for any non-null object, i.e. something whose properties can be read safely. */
function isMinerURecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/** True when `value` is an array whose first four elements are finite numbers. */
function isMinerUBBox(value: unknown): value is MinerUBBox {
  return (
    Array.isArray(value) &&
    value.length >= 4 &&
    value.slice(0, 4).every((n: unknown) => typeof n === 'number' && Number.isFinite(n))
  );
}

/**
 * Normalizes `content_list` (either a JSON string or an already-parsed value)
 * into an array of object entries. Unparseable JSON, non-array values and
 * non-object entries (null, strings, numbers) are dropped instead of throwing.
 */
function readMinerUContentList(raw: unknown): Record<string, unknown>[] {
  let parsed: unknown = raw;
  if (typeof raw === 'string') {
    try {
      parsed = JSON.parse(raw);
    } catch {
      log.warn('[MinerU] content_list JSON parse failed, continuing without metadata');
      return [];
    }
  }
  return Array.isArray(parsed) ? parsed.filter(isMinerURecord) : [];
}

/** Builds the img_path → metadata lookup from the `image` entries of content_list. */
function collectMinerUImageMeta(
  entries: Record<string, unknown>[],
): Map<string, MinerUImageMeta> {
  const lookup = new Map<string, MinerUImageMeta>();
  for (const entry of entries) {
    const imgPath = entry.img_path;
    if (entry.type !== 'image' || typeof imgPath !== 'string' || imgPath === '') {
      continue;
    }

    const pageIdx = entry.page_idx;
    const [x0, y0, x1, y1] = isMinerUBBox(entry.bbox) ? entry.bbox : MINERU_DEFAULT_BBOX;
    const captions = entry.image_caption;
    const firstCaption: unknown = Array.isArray(captions) ? captions[0] : undefined;

    const meta: MinerUImageMeta = {
      // A non-numeric page_idx would turn `pageIdx + 1` into string concatenation.
      pageIdx: typeof pageIdx === 'number' && Number.isFinite(pageIdx) ? pageIdx : 0,
      width: x1 - x0,
      height: y1 - y0,
      caption: typeof firstCaption === 'string' ? firstCaption : undefined,
    };

    // Store under both the full path and basename so lookup works
    // regardless of whether images dict uses "abc.jpg" or "images/abc.jpg"
    lookup.set(imgPath, meta);
    const basename = imgPath.slice(imgPath.lastIndexOf('/') + 1);
    if (basename !== '' && basename !== imgPath) {
      lookup.set(basename, meta);
    }
  }
  return lookup;
}

/**
 * Finds the metadata for an images-dict key. Tries the exact key, then the key
 * with an `images/` prefix, then the key's own basename (covers a prefixed dict
 * key when content_list recorded only the bare file name).
 */
function findMinerUImageMeta(
  lookup: Map<string, MinerUImageMeta>,
  key: string,
): MinerUImageMeta | undefined {
  const direct = lookup.get(key) ?? lookup.get(`images/${key}`);
  if (direct) {
    return direct;
  }
  const basename = key.slice(key.lastIndexOf('/') + 1);
  return basename !== '' && basename !== key ? lookup.get(basename) : undefined;
}

/**
 * Extract ParsedPdfContent from a single MinerU file result.
 *
 * Reads three fields of `fileResult`:
 * - `md_content`: the markdown text (anything that is not a string becomes `''`);
 * - `images`: an object mapping image key → base64 payload or ready-made data URL
 *   (non-string values are skipped with a warning);
 * - `content_list`: an array, or a JSON string encoding one, whose `image`
 *   entries supply page index, bounding box and caption for each image.
 *
 * Malformed input never throws: bad JSON, null entries, and wrongly typed
 * fields are ignored and the affected images simply come back without metadata
 * (`pageNumber: 0`, no description/width/height).
 *
 * @param fileResult - Raw per-file result object returned by MinerU.
 * @returns The markdown text, the image data URLs, and metadata containing
 *   `pageCount`, `parser: 'mineru'`, `imageMapping` (id → data URL) and `pdfImages`.
 */
export function extractMinerUResult(fileResult: Record<string, unknown>): ParsedPdfContent {
  const markdown = typeof fileResult.md_content === 'string' ? fileResult.md_content : '';

  // Extract images from the images object (key → base64 string)
  const imageData: Record<string, string> = {};
  if (isMinerURecord(fileResult.images)) {
    for (const [key, value] of Object.entries(fileResult.images)) {
      if (typeof value !== 'string') {
        log.warn(`[MinerU] Skipping image "${key}": expected a base64 string`);
        continue;
      }
      // NOTE(review): bare base64 payloads are always labelled image/png, even when
      // the key suggests another format (e.g. ".jpg") — confirm consumers do not
      // rely on the declared MIME type before changing this.
      imageData[key] = value.startsWith('data:') ? value : `data:image/png;base64,${value}`;
    }
  }

  const entries = readMinerUContentList(fileResult.content_list);

  // pageCount is the number of distinct page_idx values that appear in content_list,
  // so a page with no entries at all is not counted.
  const pagesSeen = new Set<unknown>();
  for (const entry of entries) {
    if (entry.page_idx != null) {
      pagesSeen.add(entry.page_idx);
    }
  }
  const pageCount = pagesSeen.size;

  const imageMetaLookup = collectMinerUImageMeta(entries);

  // Build image mapping and pdfImages array
  const imageMapping: Record<string, string> = {};
  const pdfImages: Array<{
    id: string;
    src: string;
    pageNumber: number;
    description?: string;
    width?: number;
    height?: number;
  }> = [];

  Object.entries(imageData).forEach(([key, dataUrl], index) => {
    // Keys already shaped like "img_*" are kept as ids; everything else is numbered.
    const imageId = key.startsWith('img_') ? key : `img_${index + 1}`;
    imageMapping[imageId] = dataUrl;

    const meta = findMinerUImageMeta(imageMetaLookup, key);
    pdfImages.push({
      id: imageId,
      src: dataUrl,
      // 1-based page number; 0 signals "page unknown" when no metadata matched.
      pageNumber: meta ? meta.pageIdx + 1 : 0,
      description: meta?.caption,
      width: meta?.width,
      height: meta?.height,
    });
  });

  const images = Object.values(imageMapping);
  log.info(
    `[MinerU] Parsed successfully: ${images.length} images, ` +
      `${markdown.length} chars of markdown`,
  );

  return {
    text: markdown,
    images,
    metadata: {
      pageCount,
      parser: 'mineru',
      imageMapping,
      pdfImages,
    },
  };
}
|