Spaces:
Running on Zero
Running on Zero
File size: 2,305 Bytes
"""Figure extraction and processing using PIL images from Docling."""
from typing import Any
from PIL import Image
# Bounding box (width, height) that thumbnails are shrunk to fit within.
_THUMBNAIL_SIZE = (200, 200)

# Maximum number of page previews returned when no figure could be processed.
_MAX_FALLBACK_PAGES = 3


def _build_figure_entry(
    image: Image.Image,
    bbox: Any,
    page: Any,
    caption: Any,
) -> dict[str, Any]:
    """Return one result dict for ``image``, including a freshly made thumbnail."""
    # ``thumbnail`` resizes in place, so work on a copy to keep the original intact.
    thumb = image.copy()
    thumb.thumbnail(_THUMBNAIL_SIZE, Image.Resampling.LANCZOS)
    return {
        "image": image,
        "thumbnail": thumb,
        "bbox": bbox,
        "page": page,
        "caption": caption,
    }


def extract_figures(
    page_images: list[Image.Image],
    figures_info: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Process figures extracted by Docling into a consistent format.

    Docling extracts figure images directly. This function validates them,
    creates thumbnails, and falls back to page previews when no figures exist.

    Processing is best-effort: a figure (or fallback page) that raises while
    being processed is reported on stdout and skipped, and the remaining
    entries are still processed. The function itself does not raise for a
    malformed entry.

    Args:
        page_images: List of rendered page images (used as fallback only).
        figures_info: List of figure metadata dicts from Docling, each containing
            ``image``, ``bbox``, ``page``, and ``caption`` fields.

    Returns:
        List of dicts with ``image`` (PIL), ``thumbnail``, ``bbox``, ``page``, and
        ``caption``. When no figure could be processed and ``page_images`` is
        non-empty, the list instead holds up to three page previews with
        ``bbox`` set to ``None``, a zero-based ``page`` and a ``"Page N"`` caption.
    """
    results: list[dict[str, Any]] = []

    # Materialise the input defensively: a ``None`` or otherwise non-iterable
    # ``figures_info`` must lead to the fallback path, not to an exception.
    try:
        figure_items = list(figures_info or [])
    except Exception as e:  # noqa: BLE001
        import traceback

        print(f"Error processing figures: {e}")
        traceback.print_exc()
        figure_items = []

    for index, fig_info in enumerate(figure_items):
        # The try lives inside the loop so that one bad figure cannot discard
        # all the figures that come after it.
        try:
            pil_image = fig_info.get("image")
            if pil_image is None:
                print(f"Figure missing image: {fig_info}")
                continue
            if not isinstance(pil_image, Image.Image):
                print(f"Figure image is not PIL Image: {type(pil_image)}")
                continue

            results.append(
                _build_figure_entry(
                    pil_image,
                    fig_info.get("bbox"),
                    fig_info.get("page", 0),
                    fig_info.get("caption", ""),
                )
            )
            print(f"Processed figure {len(results)}")
        except Exception as e:  # noqa: BLE001
            import traceback

            print(f"Error processing figure {index}: {e}")
            traceback.print_exc()

    if not results and page_images:
        print("No figures extracted, using page previews as fallback")
        for i, page_img in enumerate(page_images[:_MAX_FALLBACK_PAGES]):
            # Same best-effort policy as above: skip a page that cannot be
            # thumbnailed instead of failing the whole call.
            try:
                results.append(
                    _build_figure_entry(page_img, None, i, f"Page {i + 1}")
                )
            except Exception as e:  # noqa: BLE001
                import traceback

                print(f"Error creating preview for page {i}: {e}")
                traceback.print_exc()

    return results
|