Spaces:

ibm-granite
/

granite-vision-document-intelligence

Running on Zero

granite-vision-document-intelligence / src /document_parser.py

Deploybot

Deploy from stable branch

49574d5 7 days ago

5.63 kB

	"""Docling document parsing with figure extraction and markdown export."""

	import os
	import tempfile
	from collections.abc import Callable
	from typing import Any


	# Maps a lowercase file extension (leading dot included) to the name of the
	# Docling ``InputFormat`` member used for it; the name is resolved with
	# ``getattr`` at parse time so this module does not import Docling eagerly.
	_EXT_TO_INPUT_FORMAT: dict[str, str] = dict(
	    zip(
	        (".pdf", ".docx", ".xlsx", ".pptx"),
	        ("PDF", "DOCX", "XLSX", "PPTX"),
	    )
	)


	def parse_document(
	file_bytes: bytes,
	file_ext: str = ".pdf",
	on_progress: Callable[[str], None] \| None = None,
	) -> dict[str, Any]:
	"""Parse a document with Docling and extract markdown, text, and figure regions.

	Args:
	file_bytes: Document file content as bytes.
	file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``).
	on_progress: Optional callback ``(phase_message) -> None`` for progress reporting.

	Returns:
	Dictionary with keys:
	- ``html``: HTML-wrapped markdown representation of the document.
	- ``text``: Full extracted plain text.
	- ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``.
	"""
	def _report(msg: str) -> None:
	if on_progress:
	on_progress(msg)

	try:
	from docling.datamodel.base_models import InputFormat
	from docling.document_converter import DocumentConverter, PdfFormatOption

	ext = file_ext.lower()
	with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
	tmp.write(file_bytes)
	tmp_path = tmp.name

	try:
	format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF")
	input_format = getattr(InputFormat, format_name)

	_report("Initializing document converter...")

	if input_format == InputFormat.PDF:
	pdf_format_option = PdfFormatOption()
	pdf_format_option.pipeline_options.generate_picture_images = True
	pdf_format_option.pipeline_options.images_scale = 2.0

	# Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces
	try:
	from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
	pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions(
	device=AcceleratorDevice.CPU
	)
	except Exception: # noqa: BLE001
	pass # Older docling versions without AcceleratorOptions

	converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
	else:
	converter = DocumentConverter()

	_report("Converting document (this may take a moment)...")
	result = converter.convert(tmp_path)
	doc = result.document

	_report("Exporting document content...")
	markdown_text = doc.export_to_markdown()
	html = markdown_text
	text = doc.export_to_text()

	_report("Processing figures...")
	figures: list[dict[str, Any]] = []
	try:
	if hasattr(doc, "pictures"):
	for figure in doc.pictures:
	if figure.content_layer.value != "body":
	continue

	page_num = 0
	bbox_list = None

	if figure.prov:
	page_num = figure.prov[0].page_no - 1 # Docling is 1-based
	bbox = figure.prov[0].bbox
	bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

	caption = ""
	if figure.captions:
	for cap_ref in figure.captions:
	try:
	if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
	idx = int(cap_ref.cref.split("/")[-1])
	if idx < len(doc.texts):
	caption = doc.texts[idx].text
	break
	except Exception: # noqa: BLE001
	pass

	if figure.image:
	try:
	pil_image = figure.image.pil_image
	figures.append({
	"bbox": bbox_list,
	"page": page_num,
	"caption": caption,
	"image": pil_image,
	})
	except Exception: # noqa: BLE001
	pass

	except Exception: # noqa: BLE001
	figures = []

	return {"html": html, "text": text, "figures": figures}

	finally:
	if os.path.exists(tmp_path):
	os.unlink(tmp_path)

	except ImportError as e:
	print(f"Docling import error: {e}, using placeholder")
	return {
	"html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
	"text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
	"figures": [],
	}
	except Exception as e: # noqa: BLE001
	import traceback

	print(f"Docling parse error: {e}")
	traceback.print_exc()
	return {
	"html": f"<h1>Error</h1><pre>{e!s}</pre>",
	"text": f"Error: {e!s}",
	"figures": [],
	}