Deploybot
Deploy from stable branch
49574d5
"""Figure extraction and processing using PIL images from Docling."""
from typing import Any
from PIL import Image
def _make_thumbnail(image: Image.Image, size: tuple[int, int]) -> Image.Image:
    """Return a downscaled copy of *image* bounded by *size* (width, height)."""
    # Image.thumbnail() resizes in place, so work on a copy to leave the
    # caller's full-resolution image untouched.
    thumb = image.copy()
    thumb.thumbnail(size, Image.Resampling.LANCZOS)
    return thumb


def extract_figures(
    page_images: list[Image.Image],
    figures_info: list[dict[str, Any]],
    *,
    thumbnail_size: tuple[int, int] = (200, 200),
    max_fallback_pages: int = 3,
) -> list[dict[str, Any]]:
    """Process figures extracted by Docling into a consistent format.

    Each entry of ``figures_info`` is validated (it must carry a PIL image
    under ``image``), a thumbnail is generated for it, and a normalized dict
    is appended to the result. Entries are processed independently: a figure
    that is missing its image, holds a non-PIL object, or raises while being
    processed is reported on stdout and skipped, and the remaining figures
    are still processed.

    When no figure could be produced and ``page_images`` is non-empty, the
    first ``max_fallback_pages`` page images are returned as previews instead.

    Args:
        page_images: List of rendered page images (used as fallback only).
        figures_info: List of figure metadata dicts from Docling, each
            containing ``image``, ``bbox``, ``page``, and ``caption`` fields.
        thumbnail_size: Maximum ``(width, height)`` of generated thumbnails.
        max_fallback_pages: Maximum number of page previews returned when no
            figure was extracted. Negative values are treated as ``0``.

    Returns:
        List of dicts with ``image`` (PIL), ``thumbnail``, ``bbox``, ``page``,
        and ``caption``. For fallback page previews ``bbox`` is ``None``,
        ``page`` is the zero-based index into ``page_images`` and ``caption``
        is ``"Page <index + 1>"``.
    """
    import traceback

    results: list[dict[str, Any]] = []

    # Materialize the input up front so that a non-iterable (e.g. ``None``)
    # is reported and degrades to the page-preview fallback instead of
    # raising, as before.
    try:
        pending = list(figures_info)
    except Exception as e:  # noqa: BLE001
        print(f"Error processing figures: {e}")
        traceback.print_exc()
        pending = []

    for fig_info in pending:
        # Isolate failures per figure: one corrupt or malformed entry must
        # not discard the figures that follow it.
        try:
            pil_image = fig_info.get("image")
            if pil_image is None:
                print(f"Figure missing image: {fig_info}")
                continue
            if not isinstance(pil_image, Image.Image):
                print(f"Figure image is not PIL Image: {type(pil_image)}")
                continue

            thumb = _make_thumbnail(pil_image, thumbnail_size)
            results.append({
                "image": pil_image,
                "thumbnail": thumb,
                "bbox": fig_info.get("bbox"),
                "page": fig_info.get("page", 0),
                "caption": fig_info.get("caption", ""),
            })
            print(f"Processed figure {len(results)}")
        except Exception as e:  # noqa: BLE001
            print(f"Error processing figures: {e}")
            traceback.print_exc()

    if not results and page_images:
        print("No figures extracted, using page previews as fallback")
        # Clamp so a negative limit cannot turn into a "all but the last N"
        # slice.
        preview_count = max(max_fallback_pages, 0)
        for i, page_img in enumerate(page_images[:preview_count]):
            results.append({
                "image": page_img,
                "thumbnail": _make_thumbnail(page_img, thumbnail_size),
                "bbox": None,
                "page": i,
                "caption": f"Page {i + 1}",
            })

    return results