File size: 2,305 Bytes
49574d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""Figure extraction and processing using PIL images from Docling."""

from typing import Any

from PIL import Image


def _make_thumbnail(image: Image.Image, size: tuple[int, int]) -> Image.Image:
    """Return a downscaled copy of *image* produced by ``Image.thumbnail``."""
    # ``thumbnail`` resizes in place, so work on a copy to keep the
    # full-resolution image untouched for the caller.
    thumb = image.copy()
    thumb.thumbnail(size, Image.Resampling.LANCZOS)
    return thumb


def extract_figures(
    page_images: list[Image.Image],
    figures_info: list[dict[str, Any]],
    *,
    thumbnail_size: tuple[int, int] = (200, 200),
    max_fallback_pages: int = 3,
) -> list[dict[str, Any]]:
    """Process figures extracted by Docling into a consistent format.

    Each entry of ``figures_info`` is validated and paired with a thumbnail.
    Entries are processed independently: an entry that has no image, whose
    image is not a PIL image, or that raises while being processed is
    reported on stdout and skipped, and the remaining entries are still
    processed. When no figure could be produced, the first pages of
    ``page_images`` are returned as previews instead.

    Args:
        page_images: List of rendered page images (used as fallback only).
        figures_info: List of figure metadata dicts from Docling, each containing
            ``image``, ``bbox``, ``page``, and ``caption`` fields. ``None`` is
            treated as an empty list.
        thumbnail_size: Size passed to ``Image.thumbnail`` for every thumbnail.
        max_fallback_pages: Maximum number of page previews returned when no
            figure was extracted. Negative values are treated as 0.

    Returns:
        List of dicts with ``image`` (PIL), ``thumbnail``, ``bbox``, ``page``, and
        ``caption``. For fallback entries ``bbox`` is ``None``, ``page`` is the
        index of the page in ``page_images`` and ``caption`` is ``"Page N"``
        (1-based).
    """
    results: list[dict[str, Any]] = []

    for fig_info in figures_info or []:
        # The try lives inside the loop on purpose: one malformed entry must
        # not prevent the figures that follow it from being processed.
        try:
            pil_image = fig_info.get("image")

            if pil_image is None:
                print(f"Figure missing image: {fig_info}")
                continue

            if not isinstance(pil_image, Image.Image):
                print(f"Figure image is not PIL Image: {type(pil_image)}")
                continue

            results.append({
                "image": pil_image,
                "thumbnail": _make_thumbnail(pil_image, thumbnail_size),
                "bbox": fig_info.get("bbox"),
                "page": fig_info.get("page", 0),
                "caption": fig_info.get("caption", ""),
            })

            print(f"Processed figure {len(results)}")

        except Exception as e:  # noqa: BLE001
            # Best-effort by design: report the failure and move on.
            import traceback

            print(f"Error processing figures: {e}")
            traceback.print_exc()

    if not results and page_images:
        print("No figures extracted, using page previews as fallback")
        # Clamp so a negative limit cannot turn into a "all but the last N" slice.
        page_limit = max(max_fallback_pages, 0)
        for i, page_img in enumerate(page_images[:page_limit]):
            results.append({
                "image": page_img,
                "thumbnail": _make_thumbnail(page_img, thumbnail_size),
                "bbox": None,
                "page": i,
                "caption": f"Page {i + 1}",
            })

    return results