Spaces:
Running on Zero
Running on Zero
| """Docling document parsing with figure extraction and markdown export.""" | |
| import os | |
| import tempfile | |
| from collections.abc import Callable | |
| from typing import Any | |
| _EXT_TO_INPUT_FORMAT = { | |
| ".pdf": "PDF", | |
| ".docx": "DOCX", | |
| ".xlsx": "XLSX", | |
| ".pptx": "PPTX", | |
| } | |
| def parse_document( | |
| file_bytes: bytes, | |
| file_ext: str = ".pdf", | |
| on_progress: Callable[[str], None] | None = None, | |
| ) -> dict[str, Any]: | |
| """Parse a document with Docling and extract markdown, text, and figure regions. | |
| Args: | |
| file_bytes: Document file content as bytes. | |
| file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``). | |
| on_progress: Optional callback ``(phase_message) -> None`` for progress reporting. | |
| Returns: | |
| Dictionary with keys: | |
| - ``html``: HTML-wrapped markdown representation of the document. | |
| - ``text``: Full extracted plain text. | |
| - ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``. | |
| """ | |
| def _report(msg: str) -> None: | |
| if on_progress: | |
| on_progress(msg) | |
| try: | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| ext = file_ext.lower() | |
| with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: | |
| tmp.write(file_bytes) | |
| tmp_path = tmp.name | |
| try: | |
| format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF") | |
| input_format = getattr(InputFormat, format_name) | |
| _report("Initializing document converter...") | |
| if input_format == InputFormat.PDF: | |
| pdf_format_option = PdfFormatOption() | |
| pdf_format_option.pipeline_options.generate_picture_images = True | |
| pdf_format_option.pipeline_options.images_scale = 2.0 | |
| # Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces | |
| try: | |
| from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions | |
| pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions( | |
| device=AcceleratorDevice.CPU | |
| ) | |
| except Exception: # noqa: BLE001 | |
| pass # Older docling versions without AcceleratorOptions | |
| converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option}) | |
| else: | |
| converter = DocumentConverter() | |
| _report("Converting document (this may take a moment)...") | |
| result = converter.convert(tmp_path) | |
| doc = result.document | |
| _report("Exporting document content...") | |
| markdown_text = doc.export_to_markdown() | |
| html = markdown_text | |
| text = doc.export_to_text() | |
| _report("Processing figures...") | |
| figures: list[dict[str, Any]] = [] | |
| try: | |
| if hasattr(doc, "pictures"): | |
| for figure in doc.pictures: | |
| if figure.content_layer.value != "body": | |
| continue | |
| page_num = 0 | |
| bbox_list = None | |
| if figure.prov: | |
| page_num = figure.prov[0].page_no - 1 # Docling is 1-based | |
| bbox = figure.prov[0].bbox | |
| bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height] | |
| caption = "" | |
| if figure.captions: | |
| for cap_ref in figure.captions: | |
| try: | |
| if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"): | |
| idx = int(cap_ref.cref.split("/")[-1]) | |
| if idx < len(doc.texts): | |
| caption = doc.texts[idx].text | |
| break | |
| except Exception: # noqa: BLE001 | |
| pass | |
| if figure.image: | |
| try: | |
| pil_image = figure.image.pil_image | |
| figures.append({ | |
| "bbox": bbox_list, | |
| "page": page_num, | |
| "caption": caption, | |
| "image": pil_image, | |
| }) | |
| except Exception: # noqa: BLE001 | |
| pass | |
| except Exception: # noqa: BLE001 | |
| figures = [] | |
| return {"html": html, "text": text, "figures": figures} | |
| finally: | |
| if os.path.exists(tmp_path): | |
| os.unlink(tmp_path) | |
| except ImportError as e: | |
| print(f"Docling import error: {e}, using placeholder") | |
| return { | |
| "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>", | |
| "text": "Sample text from PDF.\n\nDocling not available - using placeholder.", | |
| "figures": [], | |
| } | |
| except Exception as e: # noqa: BLE001 | |
| import traceback | |
| print(f"Docling parse error: {e}") | |
| traceback.print_exc() | |
| return { | |
| "html": f"<h1>Error</h1><pre>{e!s}</pre>", | |
| "text": f"Error: {e!s}", | |
| "figures": [], | |
| } | |