Spaces:

ibm-granite
/

granite-vision-document-intelligence

Running on Zero

File size: 5,631 Bytes

49574d5

"""Docling document parsing with figure extraction and markdown export."""

import os
import tempfile
from collections.abc import Callable
from typing import Any


# Maps a lower-cased file extension (with leading dot) to the *name* of the
# matching ``docling`` ``InputFormat`` member.  Names are stored as strings so
# this module can be imported even when docling itself is not installed; the
# member is looked up with ``getattr`` at parse time.
_EXT_TO_INPUT_FORMAT: dict[str, str] = {
    f".{format_name.lower()}": format_name
    for format_name in ("PDF", "DOCX", "XLSX", "PPTX")
}


def parse_document(
    file_bytes: bytes,
    file_ext: str = ".pdf",
    on_progress: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Parse a document with Docling and extract markdown, text, and figure regions.

    Args:
        file_bytes: Document file content as bytes.
        file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``).
        on_progress: Optional callback ``(phase_message) -> None`` for progress reporting.

    Returns:
        Dictionary with keys:
            - ``html``: HTML-wrapped markdown representation of the document.
            - ``text``: Full extracted plain text.
            - ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``.
    """
    def _report(msg: str) -> None:
        if on_progress:
            on_progress(msg)

    try:
        from docling.datamodel.base_models import InputFormat
        from docling.document_converter import DocumentConverter, PdfFormatOption

        ext = file_ext.lower()
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(file_bytes)
            tmp_path = tmp.name

        try:
            format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF")
            input_format = getattr(InputFormat, format_name)

            _report("Initializing document converter...")

            if input_format == InputFormat.PDF:
                pdf_format_option = PdfFormatOption()
                pdf_format_option.pipeline_options.generate_picture_images = True
                pdf_format_option.pipeline_options.images_scale = 2.0

                # Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces
                try:
                    from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
                    pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions(
                        device=AcceleratorDevice.CPU
                    )
                except Exception:  # noqa: BLE001
                    pass  # Older docling versions without AcceleratorOptions

                converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
            else:
                converter = DocumentConverter()

            _report("Converting document (this may take a moment)...")
            result = converter.convert(tmp_path)
            doc = result.document

            _report("Exporting document content...")
            markdown_text = doc.export_to_markdown()
            html = markdown_text
            text = doc.export_to_text()

            _report("Processing figures...")
            figures: list[dict[str, Any]] = []
            try:
                if hasattr(doc, "pictures"):
                    for figure in doc.pictures:
                        if figure.content_layer.value != "body":
                            continue

                        page_num = 0
                        bbox_list = None

                        if figure.prov:
                            page_num = figure.prov[0].page_no - 1  # Docling is 1-based
                            bbox = figure.prov[0].bbox
                            bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

                        caption = ""
                        if figure.captions:
                            for cap_ref in figure.captions:
                                try:
                                    if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
                                        idx = int(cap_ref.cref.split("/")[-1])
                                        if idx < len(doc.texts):
                                            caption = doc.texts[idx].text
                                            break
                                except Exception:  # noqa: BLE001
                                    pass

                        if figure.image:
                            try:
                                pil_image = figure.image.pil_image
                                figures.append({
                                    "bbox": bbox_list,
                                    "page": page_num,
                                    "caption": caption,
                                    "image": pil_image,
                                })
                            except Exception:  # noqa: BLE001
                                pass

            except Exception:  # noqa: BLE001
                figures = []

            return {"html": html, "text": text, "figures": figures}

        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except ImportError as e:
        print(f"Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:  # noqa: BLE001
        import traceback

        print(f"Docling parse error: {e}")
        traceback.print_exc()
        return {
            "html": f"<h1>Error</h1><pre>{e!s}</pre>",
            "text": f"Error: {e!s}",
            "figures": [],
        }