File size: 5,631 Bytes
49574d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Docling document parsing with figure extraction and markdown export."""

import os
import tempfile
from collections.abc import Callable
from typing import Any


# Maps a lower-case file extension (leading dot included) to the *name* of the
# docling ``InputFormat`` member used for it. Each key is just the dotted,
# lower-cased member name, so the table is derived from the names themselves.
_EXT_TO_INPUT_FORMAT = {
    f".{format_name.lower()}": format_name
    for format_name in ("PDF", "DOCX", "XLSX", "PPTX")
}


def parse_document(
    file_bytes: bytes,
    file_ext: str = ".pdf",
    on_progress: Callable[[str], None] | None = None,
) -> dict[str, Any]:
    """Parse a document with Docling and extract markdown, text, and figure regions.

    Args:
        file_bytes: Document file content as bytes.
        file_ext: File extension (e.g. ``".pdf"``, ``".docx"``, ``".xlsx"``, ``".pptx"``).
        on_progress: Optional callback ``(phase_message) -> None`` for progress reporting.

    Returns:
        Dictionary with keys:
            - ``html``: HTML-wrapped markdown representation of the document.
            - ``text``: Full extracted plain text.
            - ``figures``: List of figure dicts with ``bbox``, ``page``, ``caption``, and ``image``.
    """
    def _report(msg: str) -> None:
        if on_progress:
            on_progress(msg)

    try:
        from docling.datamodel.base_models import InputFormat
        from docling.document_converter import DocumentConverter, PdfFormatOption

        ext = file_ext.lower()
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(file_bytes)
            tmp_path = tmp.name

        try:
            format_name = _EXT_TO_INPUT_FORMAT.get(ext, "PDF")
            input_format = getattr(InputFormat, format_name)

            _report("Initializing document converter...")

            if input_format == InputFormat.PDF:
                pdf_format_option = PdfFormatOption()
                pdf_format_option.pipeline_options.generate_picture_images = True
                pdf_format_option.pipeline_options.images_scale = 2.0

                # Force CPU to avoid CUDA init outside @spaces.GPU on ZeroGPU spaces
                try:
                    from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
                    pdf_format_option.pipeline_options.accelerator_options = AcceleratorOptions(
                        device=AcceleratorDevice.CPU
                    )
                except Exception:  # noqa: BLE001
                    pass  # Older docling versions without AcceleratorOptions

                converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})
            else:
                converter = DocumentConverter()

            _report("Converting document (this may take a moment)...")
            result = converter.convert(tmp_path)
            doc = result.document

            _report("Exporting document content...")
            markdown_text = doc.export_to_markdown()
            html = markdown_text
            text = doc.export_to_text()

            _report("Processing figures...")
            figures: list[dict[str, Any]] = []
            try:
                if hasattr(doc, "pictures"):
                    for figure in doc.pictures:
                        if figure.content_layer.value != "body":
                            continue

                        page_num = 0
                        bbox_list = None

                        if figure.prov:
                            page_num = figure.prov[0].page_no - 1  # Docling is 1-based
                            bbox = figure.prov[0].bbox
                            bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

                        caption = ""
                        if figure.captions:
                            for cap_ref in figure.captions:
                                try:
                                    if hasattr(cap_ref, "cref") and cap_ref.cref.startswith("#/texts/"):
                                        idx = int(cap_ref.cref.split("/")[-1])
                                        if idx < len(doc.texts):
                                            caption = doc.texts[idx].text
                                            break
                                except Exception:  # noqa: BLE001
                                    pass

                        if figure.image:
                            try:
                                pil_image = figure.image.pil_image
                                figures.append({
                                    "bbox": bbox_list,
                                    "page": page_num,
                                    "caption": caption,
                                    "image": pil_image,
                                })
                            except Exception:  # noqa: BLE001
                                pass

            except Exception:  # noqa: BLE001
                figures = []

            return {"html": html, "text": text, "figures": figures}

        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except ImportError as e:
        print(f"Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:  # noqa: BLE001
        import traceback

        print(f"Docling parse error: {e}")
        traceback.print_exc()
        return {
            "html": f"<h1>Error</h1><pre>{e!s}</pre>",
            "text": f"Error: {e!s}",
            "figures": [],
        }