| import logging |
|
|
| from typing import List, Optional, Union |
|
|
| import torch |
| from PIL import Image |
| from transformers import BatchEncoding, BatchFeature |
| from transformers.models.qwen3_vl import Qwen3VLProcessor |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def get_torch_device(device: str = "auto") -> str:
    """
    Resolve the device string that PyTorch should run on.

    Any value other than "auto" is handed back unchanged. With "auto" (the default)
    the first available backend is selected, in this order:
    - "cuda:0"
    - "mps"
    - "cpu"

    The automatically selected device is reported through the module logger.
    """
    # An explicit request needs no probing (and is not logged).
    if device != "auto":
        return device

    if torch.cuda.is_available():
        device = "cuda:0"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    logger.info(f"Using device: {device}")
    return device
|
|
|
|
class OpsColQwen3Processor(Qwen3VLProcessor):
    """
    Processor for OpsColQwen3 model.

    Builds on `Qwen3VLProcessor` and adds:
    - `process_images`: turns PIL images into a model-ready batch,
    - `process_queries`: turns text queries into a model-ready batch,
    - `score_multi_vector`: late-interaction (MaxSim) scoring of multi-vector embeddings.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # Text prepended to every query in `process_queries`.
    query_prefix: str = "Query: "
    # Prompt paired with each image in `process_images`.
    visual_prompt_prefix: str = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|im_start|>assistant\n<|endoftext|>"
    # Token repeated and appended to every query in `process_queries`.
    query_augmentation_token: str = "<|endoftext|>"
    # Image placeholder token string (the same string appears inside `visual_prompt_prefix`).
    image_token: str = "<|image_pad|>"

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        """
        Initialize the processor.

        Args:
            image_processor: Image processor instance
            tokenizer: Tokenizer instance
            chat_template: Optional chat template
            **kwargs: Additional arguments forwarded to the parent constructor
        """
        super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template, **kwargs)

        # Batches built by this processor are padded on the left.
        if self.tokenizer is not None:
            self.tokenizer.padding_side = "left"

    def process_images(self, images: List[Image.Image], return_tensors: str = "pt", **kwargs) -> Union[BatchFeature, BatchEncoding]:
        """
        Process a batch of PIL images for the model.

        Each image is converted to RGB and paired with `visual_prompt_prefix`. When the
        resulting `pixel_values` is non-empty, it is split per image and re-padded so that
        its first dimension indexes the images of the batch.

        Args:
            images: PIL images to process.
            return_tensors: Tensor format requested from the underlying processor.
            **kwargs: Additional arguments forwarded to the underlying processor call.

        Returns:
            The batch produced by the underlying processor, with `pixel_values` re-padded
            per image (unless it was empty).
        """
        images = [image.convert("RGB") for image in images]

        batch_doc = self(
            text=[self.visual_prompt_prefix] * len(images),
            images=images,
            padding="longest",
            return_tensors=return_tensors,
            **kwargs,
        )

        # Nothing to split or pad when no pixel data was produced.
        if batch_doc["pixel_values"].numel() == 0:
            return batch_doc

        # One split size per image: the product of its `image_grid_thw` row.
        # NOTE(review): assumes `pixel_values` is concatenated along dim 0 across images — confirm.
        offsets = batch_doc["image_grid_thw"].prod(dim=1)
        pixel_values = list(torch.split(batch_doc["pixel_values"], offsets.tolist()))

        # Zero-pad the per-image chunks to a common length and stack them batch-first.
        batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True)

        return batch_doc

    def process_queries(self, queries: List[str], return_tensors: str = "pt", **kwargs) -> Union[BatchFeature, BatchEncoding]:
        """
        Process a list of text queries.

        Every query is wrapped as `query_prefix + query + query_augmentation_token * 10`
        before being passed to the underlying processor with `padding="longest"`.

        Args:
            queries: Raw query strings.
            return_tensors: Tensor format requested from the underlying processor.
            **kwargs: Additional arguments forwarded to the underlying processor call.

        Returns:
            The batch produced by the underlying processor.
        """
        processed_queries = [self.query_prefix + q + self.query_augmentation_token * 10 for q in queries]
        return self(text=processed_queries, return_tensors=return_tensors, padding="longest", **kwargs)

    @staticmethod
    def score_multi_vector(
        qs: Union[torch.Tensor, List[torch.Tensor]],
        ps: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            qs (`Union[torch.Tensor, List[torch.Tensor]]`): Query embeddings.
            ps (`Union[torch.Tensor, List[torch.Tensor]]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
                Must be strictly positive.
            device (`Union[str, torch.device]`, *optional*): Device to use for computation. If not
                provided, uses `get_torch_device("auto")`.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.

        Raises:
            ValueError: If `qs` or `ps` is empty, or if `batch_size` is not strictly positive.
        """
        # `len(...) == 0` rather than truthiness: `qs`/`ps` may be tensors, whose truth
        # value is ambiguous.
        if len(qs) == 0:
            raise ValueError("No queries provided")
        if len(ps) == 0:
            raise ValueError("No passages provided")
        # A zero step makes `range` fail and a negative step skips the loop entirely, which
        # would only surface later as an opaque `torch.cat` error.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Resolve the device only once the inputs are known to be usable.
        device = device or get_torch_device("auto")

        scores_list: List[torch.Tensor] = []

        for i in range(0, len(qs), batch_size):
            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(device)

            scores_batch = []
            for j in range(0, len(ps), batch_size):
                ps_batch = torch.nn.utils.rnn.pad_sequence(ps[j : j + batch_size], batch_first=True, padding_value=0).to(device)
                # (b, n, d) x (c, s, d) -> (b, c, n, s): every query token against every passage token.
                similarities = torch.einsum("bnd,csd->bcns", qs_batch, ps_batch)
                # MaxSim: best passage token per query token, then summed over query tokens.
                scores_batch.append(similarities.max(dim=3)[0].sum(dim=2))

            # Stitch the passage chunks back together and move the row block off the device.
            scores_list.append(torch.cat(scores_batch, dim=1).cpu())

        scores = torch.cat(scores_list, dim=0)
        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

        return scores.to(torch.float32)
|
|