| from PIL import Image |
| import numpy as np |
| import base64 |
| import io |
| from io import BytesIO |
| from PIL import Image, ImageFile |
| from pdf2image import convert_from_path |
| import tempfile |
| from multiprocessing import Pool |
| import os |
| from loguru import logger |
| import uuid |
|
|
| from typing import Any, List, Tuple, Type, Literal, Optional, Union, Dict |
|
|
def encode_image(image_path):
    """
    Read a file from disk and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        str: The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
|
def load_image_from_base64(image):
    """
    Decode a base64 payload and open it as a PIL image.

    Args:
        image: Base64-encoded image data (anything accepted by
            ``base64.b64decode``).

    Returns:
        The object returned by ``Image.open`` on the decoded bytes.
    """
    decoded_bytes = base64.b64decode(image)
    stream = BytesIO(decoded_bytes)
    return Image.open(stream)
|
|
def pil_image_to_base64(image: "Image.Image") -> str:
    """
    Convert a PIL Image object to its base64 representation.

    The image is serialized as PNG into an in-memory buffer and the resulting
    bytes are base64-encoded.

    Args:
        image (Image.Image): The PIL Image object to be converted. Only its
            ``save(file, format=...)`` method is used.

    Returns:
        str: The base64 representation of the PNG-encoded image.
    """
    # The context manager releases the buffer as soon as the bytes are encoded.
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
def scale_image(image: "Image.Image", new_height: int = 1024) -> "Image.Image":
    """
    Scale an image to a new height while maintaining the aspect ratio.

    Args:
        image (Image.Image): Source image; its ``size`` and ``resize`` are used.
        new_height (int): Target height in pixels. Must be positive.

    Returns:
        Image.Image: The resized image returned by ``image.resize``. Its width
        is ``new_height * width / height`` truncated to an integer, but never
        less than one pixel.

    Raises:
        ValueError: If ``new_height`` is not positive or the source image has
            zero height (the aspect ratio would be undefined).
    """
    if new_height <= 0:
        raise ValueError(f"new_height must be positive, got {new_height}")

    width, height = image.size
    if height <= 0:
        raise ValueError("Cannot scale an image whose height is zero")

    aspect_ratio = width / height
    # Truncation yields 0 for very tall, narrow images; clamp so the resize
    # request always describes a non-empty image.
    new_width = max(1, int(new_height * aspect_ratio))

    return image.resize((new_width, new_height))
|
|
def unflatten_array(flat_list, vector_size=128):
    """
    Reshape a flat sequence of values into rows of ``vector_size`` elements.

    Args:
        flat_list: Flat sequence whose length is a multiple of ``vector_size``.
        vector_size: Number of elements per row (defaults to 128).

    Returns:
        A NumPy array of shape ``(len(flat_list) // vector_size, vector_size)``.
    """
    flat = np.array(flat_list)
    target_shape = (-1, vector_size)
    return flat.reshape(target_shape)
|
|
def get_image_embedding(image_list: "list[Image.Image]", openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more images.

    Each image is PNG-encoded, wrapped in a ``data:image/png;base64,...`` URI
    and sent in a single ``openai_client.embeddings.create`` request.

    Args:
        image_list (list[Image.Image]): The images to be embedded. A single
            image (any non-list value) is also accepted and is wrapped in a
            one-element list.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Name of the embedding model passed to the client.
        flatten (bool): When True the request asks for ``encoding_format``
            ``"base64"``, otherwise ``"float"``.

    Returns:
        list: The ``embedding`` attribute of every item in the response's
        ``data``, in response order. Per the original author's notes this is
        list[list[float]] if flatten, else list[list[list[float]]] with
        shape = (number of images (m), number of vectors per image (n),
        vector dim = 128) -- NOTE(review): confirm against the serving backend.
    """
    if not isinstance(image_list, list):
        image_list = [image_list]

    input_base64_list = [f"data:image/png;base64,{pil_image_to_base64(image)}" for image in image_list]

    embedding = openai_client.embeddings.create(
        input=input_base64_list,
        model=model,
        # Backend-specific options travel in extra_body rather than as
        # first-class arguments of the client call.
        extra_body={
            "modality": "image",
            "encoding_format": "float" if not flatten else "base64",
        },
    )

    return [embed.embedding for embed in embedding.data]
|
|
def get_text_embedding(texts: list[str], openai_client, model: str, flatten: bool = False) -> list:
    """
    Get the embeddings of one or more texts.

    Args:
        texts (list[str]): The texts to be embedded. Any non-list value is
            wrapped in a one-element list before the request is made.
        openai_client: Client object exposing ``embeddings.create``.
        model (str): Name of the embedding model passed to the client.
        flatten (bool): When True the request asks for ``encoding_format``
            ``"base64"``, otherwise ``"float"``.

    Returns:
        list: The ``embedding`` attribute of every item in the response's
        ``data``, in response order. Per the original author's notes this is
        list[list[float]] if flatten, else list[list[list[float]]] with
        shape = (number of texts (m), number of vectors per text (n),
        vector dim = 128) -- NOTE(review): confirm against the serving backend.
    """
    batch = texts if isinstance(texts, list) else [texts]
    encoding_format = "base64" if flatten else "float"

    response = openai_client.embeddings.create(
        input=batch,
        model=model,
        extra_body={"encoding_format": encoding_format},
    )

    return [item.embedding for item in response.data]
|
|
def load_images(image_paths):
    """
    Open every image in ``image_paths`` and collect the resulting PIL objects.

    Paths that fail to open are reported through the logger and skipped, so
    the returned list may be shorter than the input.

    Args:
        image_paths (list): Paths of the images to open.

    Returns:
        list: PIL image objects for the paths that opened successfully, in
        input order.
    """
    loaded = []
    for path in image_paths:
        try:
            opened = Image.open(path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the batch.
            logger.error(f"Error loading image at path {path}: {str(e)}")
        else:
            loaded.append(opened)
    return loaded
| |
|
|
def process_pdf(pdf_path: str, output_folder: str, thread_count=1):
    """
    Render every page of a PDF to a PIL image at 200 DPI.

    Pages are rendered by ``convert_from_path`` into a temporary directory
    that is removed before this function returns, so the pixel data of each
    page is loaded into memory first.

    Args:
        pdf_path (str): Path of the PDF file to convert.
        output_folder (str): Currently unused; kept so existing callers
            (which pass it positionally) keep working.
        thread_count (int): Forwarded to ``convert_from_path``.

    Returns:
        list: One PIL image per rendered page, as returned by
        ``convert_from_path``.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(pdf_path, dpi=200, output_folder=temp_dir, thread_count=thread_count)

        # With an explicit output_folder the images are expected to be backed
        # by files inside temp_dir and read lazily. Force the read now, while
        # the files still exist, so the returned images stay usable.
        for image in images:
            image.load()

    return images
|
|
|
|
def pdf_folder_to_images(pdf_folder: str, output_folder: str, process_count: Optional[int] = 2):
    """
    Convert every PDF in a folder to page images using a process pool.

    Args:
        pdf_folder (str): Folder scanned (non-recursively) for files whose
            name ends with ``.pdf``, case-insensitively.
        output_folder (str): Forwarded unchanged to ``process_pdf`` for each
            PDF.
        process_count (Optional[int]): Number of worker processes. ``None``
            means ``os.cpu_count()``.

    Returns:
        list | None: All page images of all PDFs in one flat list, PDFs taken
        in sorted path order. ``None`` if any error occurred; the error is
        logged with its traceback rather than raised.
    """
    try:
        if process_count is None:
            process_count = os.cpu_count()

        # os.listdir order is arbitrary; sorting makes the output reproducible.
        pdf_files = sorted(
            os.path.join(pdf_folder, name)
            for name in os.listdir(pdf_folder)
            if name.lower().endswith('.pdf')
        )

        if pdf_files:
            args = [(pdf_file, output_folder) for pdf_file in pdf_files]
            with Pool(process_count) as pool:
                all_images = pool.starmap(process_pdf, args)
        else:
            # Nothing to convert: skip spawning worker processes.
            all_images = []

        result = [img for sublist in all_images for img in sublist]

        logger.debug(f"Number of pdfs processed: {len(all_images)} - Number of images: {len(result)}")
        return result
    except Exception as e:
        logger.exception(f"Error during processing pdf: {e}")
        # Failure is signalled to callers with an explicit None.
        return None
|
|
|
|
|
|