# Deploybot
# Deploy from stable branch
# 49574d5
"""Docling document parsing with figure extraction and markdown export."""
import os
import tempfile
from collections.abc import Callable
from typing import Any
# Maps a lower-cased file extension (including the leading dot) to the name of
# the ``InputFormat`` member that handles it; the name is resolved with
# ``getattr`` at parse time so this table needs no Docling import.
_EXT_TO_INPUT_FORMAT = {
    f".{format_name.lower()}": format_name
    for format_name in ("PDF", "DOCX", "XLSX", "PPTX")
}
def parse_document(
file_bytes: bytes,
file_ext: str = ".pdf",
on_progress: Callable[[str], None] | None = None,
) -> dict[str, Any]:
"""Parse a document with Docling and extract markdown, text, and figure regions.
Args:
file_bytes: Document file content as bytes.
file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``).
on_progress: Optional callback ``(phase_message) -> None`` for progress reporting.
Returns:
Dictionary with keys:
- ``html``: HTML-wrapped markdown representation of the document.
- ``text``: Full extracted plain text.
- ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``.
"""
def _report(msg: str) -> None:
if on_progress:
on_progress(msg)
try:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
ext = file_ext.lower()
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(file_bytes)
tmp_path = tmp.name
try:
format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF")
input_format = getattr(InputFormat, format_name)
_report("Initializing document converter...")
if input_format == InputFormat.PDF:
pdf_format_option = PdfFormatOption()
pdf_format_option.pipeline_options.generate_picture_images = True
pdf_format_option.pipeline_options.images_scale = 2.0
# Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces
try:
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions(
device=AcceleratorDevice.CPU
)
except Exception: # noqa: BLE001
pass # Older docling versions without AcceleratorOptions
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
else:
converter = DocumentConverter()
_report("Converting document (this may take a moment)...")
result = converter.convert(tmp_path)
doc = result.document
_report("Exporting document content...")
markdown_text = doc.export_to_markdown()
html = markdown_text
text = doc.export_to_text()
_report("Processing figures...")
figures: list[dict[str, Any]] = []
try:
if hasattr(doc, "pictures"):
for figure in doc.pictures:
if figure.content_layer.value != "body":
continue
page_num = 0
bbox_list = None
if figure.prov:
page_num = figure.prov[0].page_no - 1 # Docling is 1-based
bbox = figure.prov[0].bbox
bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]
caption = ""
if figure.captions:
for cap_ref in figure.captions:
try:
if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
idx = int(cap_ref.cref.split("/")[-1])
if idx < len(doc.texts):
caption = doc.texts[idx].text
break
except Exception: # noqa: BLE001
pass
if figure.image:
try:
pil_image = figure.image.pil_image
figures.append({
"bbox": bbox_list,
"page": page_num,
"caption": caption,
"image": pil_image,
})
except Exception: # noqa: BLE001
pass
except Exception: # noqa: BLE001
figures = []
return {"html": html, "text": text, "figures": figures}
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
except ImportError as e:
print(f"Docling import error: {e}, using placeholder")
return {
"html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
"text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
"figures": [],
}
except Exception as e: # noqa: BLE001
import traceback
print(f"Docling parse error: {e}")
traceback.print_exc()
return {
"html": f"<h1>Error</h1><pre>{e!s}</pre>",
"text": f"Error: {e!s}",
"figures": [],
}