"""Docling document parsing with figure extraction and markdown export.""" import os import tempfile from collections.abc import Callable from typing import Any _EXT_TO_INPUT_FORMAT = { ".pdf": "PDF", ".docx": "DOCX", ".xlsx": "XLSX", ".pptx": "PPTX", } def parse_document( file_bytes: bytes, file_ext: str = ".pdf", on_progress: Callable[[str], None] | None = None, ) -> dict[str, Any]: """Parse a document with Docling and extract markdown, text, and figure regions. Args: file_bytes: Document file content as bytes. file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``). on_progress: Optional callback ``(phase_message) -> None`` for progress reporting. Returns: Dictionary with keys: - ``html``: HTML-wrapped markdown representation of the document. - ``text``: Full extracted plain text. - ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``. """ def _report(msg: str) -> None: if on_progress: on_progress(msg) try: from docling.datamodel.base_models import InputFormat from docling.document_converter import DocumentConverter, PdfFormatOption ext = file_ext.lower() with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: tmp.write(file_bytes) tmp_path = tmp.name try: format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF") input_format = getattr(InputFormat, format_name) _report("Initializing document converter...") if input_format == InputFormat.PDF: pdf_format_option = PdfFormatOption() pdf_format_option.pipeline_options.generate_picture_images = True pdf_format_option.pipeline_options.images_scale = 2.0 # Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces try: from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions( device=AcceleratorDevice.CPU ) except Exception: # noqa: BLE001 pass # Older docling versions without AcceleratorOptions converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option}) else: converter = DocumentConverter() _report("Converting document (this may take a moment)...") result = converter.convert(tmp_path) doc = result.document _report("Exporting document content...") markdown_text = doc.export_to_markdown() html = markdown_text text = doc.export_to_text() _report("Processing figures...") figures: list[dict[str, Any]] = [] try: if hasattr(doc, "pictures"): for figure in doc.pictures: if figure.content_layer.value != "body": continue page_num = 0 bbox_list = None if figure.prov: page_num = figure.prov[0].page_no - 1 # Docling is 1-based bbox = figure.prov[0].bbox bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height] caption = "" if figure.captions: for cap_ref in figure.captions: try: if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"): idx = int(cap_ref.cref.split("/")[-1]) if idx < len(doc.texts): caption = doc.texts[idx].text break except Exception: # noqa: BLE001 pass if figure.image: try: pil_image = figure.image.pil_image figures.append({ "bbox": bbox_list, "page": page_num, "caption": caption, "image": pil_image, }) except Exception: # noqa: BLE001 pass except Exception: # noqa: BLE001 figures = [] return {"html": html, "text": text, "figures": figures} finally: if os.path.exists(tmp_path): os.unlink(tmp_path) except ImportError as e: print(f"Docling import error: {e}, using placeholder") return { "html": "
Docling not available - using placeholder.
", "text": "Sample text from PDF.\n\nDocling not available - using placeholder.", "figures": [], } except Exception as e: # noqa: BLE001 import traceback print(f"Docling parse error: {e}") traceback.print_exc() return { "html": f"{e!s}",
"text": f"Error: {e!s}",
"figures": [],
}