Spaces:
Running on Zero
Running on Zero
File size: 5,631 Bytes
"""Docling document parsing with figure extraction and markdown export."""
import os
import tempfile
from collections.abc import Callable
from typing import Any
# Lowercase file extension (leading dot included) -> name of the attribute that
# ``parse_document`` looks up on docling's ``InputFormat``. For every supported
# type that name is simply the extension, upper-cased, without the dot.
_EXT_TO_INPUT_FORMAT = {
    suffix: suffix.lstrip(".").upper()
    for suffix in (".pdf", ".docx", ".xlsx", ".pptx")
}
def parse_document(
file_bytes: bytes,
file_ext: str = ".pdf",
on_progress: Callable[[str], None] | None = None,
) -> dict[str, Any]:
"""Parse a document with Docling and extract markdown, text, and figure regions.
Args:
file_bytes: Document file content as bytes.
file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``).
on_progress: Optional callback ``(phase_message) -> None`` for progress reporting.
Returns:
Dictionary with keys:
- ``html``: HTML-wrapped markdown representation of the document.
- ``text``: Full extracted plain text.
- ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``.
"""
def _report(msg: str) -> None:
if on_progress:
on_progress(msg)
try:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
ext = file_ext.lower()
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(file_bytes)
tmp_path = tmp.name
try:
format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF")
input_format = getattr(InputFormat, format_name)
_report("Initializing document converter...")
if input_format == InputFormat.PDF:
pdf_format_option = PdfFormatOption()
pdf_format_option.pipeline_options.generate_picture_images = True
pdf_format_option.pipeline_options.images_scale = 2.0
# Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces
try:
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions(
device=AcceleratorDevice.CPU
)
except Exception: # noqa: BLE001
pass # Older docling versions without AcceleratorOptions
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
else:
converter = DocumentConverter()
_report("Converting document (this may take a moment)...")
result = converter.convert(tmp_path)
doc = result.document
_report("Exporting document content...")
markdown_text = doc.export_to_markdown()
html = markdown_text
text = doc.export_to_text()
_report("Processing figures...")
figures: list[dict[str, Any]] = []
try:
if hasattr(doc, "pictures"):
for figure in doc.pictures:
if figure.content_layer.value != "body":
continue
page_num = 0
bbox_list = None
if figure.prov:
page_num = figure.prov[0].page_no - 1 # Docling is 1-based
bbox = figure.prov[0].bbox
bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]
caption = ""
if figure.captions:
for cap_ref in figure.captions:
try:
if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
idx = int(cap_ref.cref.split("/")[-1])
if idx < len(doc.texts):
caption = doc.texts[idx].text
break
except Exception: # noqa: BLE001
pass
if figure.image:
try:
pil_image = figure.image.pil_image
figures.append({
"bbox": bbox_list,
"page": page_num,
"caption": caption,
"image": pil_image,
})
except Exception: # noqa: BLE001
pass
except Exception: # noqa: BLE001
figures = []
return {"html": html, "text": text, "figures": figures}
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
except ImportError as e:
print(f"Docling import error: {e}, using placeholder")
return {
"html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
"text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
"figures": [],
}
except Exception as e: # noqa: BLE001
import traceback
print(f"Docling parse error: {e}")
traceback.print_exc()
return {
"html": f"<h1>Error</h1><pre>{e!s}</pre>",
"text": f"Error: {e!s}",
"figures": [],
}
|