import math from typing import List, Optional, Union from PIL import Image import torch from transformers.image_processing_base import BatchFeature from transformers.image_processing_utils_fast import BaseImageProcessorFast from transformers.image_utils import make_list_of_images, get_image_type, ImageInput, ImageType from transformers.utils import TensorType import torchvision.transforms as T class NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(BaseImageProcessorFast): """ Dynamic-resolution image processor for the V3 omni model. Each image is resized to a single tile whose patch-grid `(h_patches, w_patches)` is chosen to land between `min_num_patches` and `max_num_patches` (on a 16×16-pixel grid), respecting aspect ratio. This matches the algorithm in vLLM's `DynamicResolutionImageTiler` (`vllm/model_executor/models/nano_nemotron_vl.py`) so HF and vLLM inference see identical pixel inputs. """ model_input_names = ["pixel_values"] def __init__( self, norm_mean=None, norm_std=None, patch_size=16, downsample_ratio=0.5, min_num_patches=1024, max_num_patches=13312, max_model_len=16384, video_target_num_patches=1024, video_maintain_aspect_ratio=True, **kwargs, ): super().__init__(**kwargs) self.norm_mean = norm_mean self.norm_std = norm_std self.patch_size = patch_size self.downsample_ratio = downsample_ratio # Integer reduction factor for pixel_shuffle (downsample_ratio = 0.5 → factor 2). self._downsample_factor = int(round(1.0 / downsample_ratio)) # Per-image patch-grid bounds (on the pre-pixel-shuffle 16×16 grid). self.min_num_patches = min_num_patches self.max_num_patches = max_num_patches self.max_model_len = max_model_len # Video frames use a separate (fixed) target-patch budget with aspect-ratio preserved. # Matches vLLM's `_compute_aspect_preserving_size` in `nano_nemotron_vl.py`. self.video_target_num_patches = video_target_num_patches self.video_maintain_aspect_ratio = video_maintain_aspect_ratio # Keep the PIL image through to `_preprocess` — we need PIL.resize (bicubic) to match vLLM's # algorithm exactly; resizing a tensor via `torchvision.transforms.Resize` uses different # kernels and breaks bit-exact agreement. def _process_image(self, image: ImageInput, **kwargs): if get_image_type(image) == ImageType.PIL: if image.mode != "RGB": image = image.convert("RGB") return image # transformers 5.6 renamed this hook from `_process_image` to `process_image`; alias both. process_image = _process_image # Toggled by `processing.py` around video calls (the strict `ImagesKwargs` validator won't let # us thread a new kwarg down, so we use a flag on the instance instead). _is_video_mode: bool = False def _preprocess( self, images, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ) -> BatchFeature: """Port of vLLM's `DynamicResolutionImageTiler._images_to_pixel_values_lst`. When `self._is_video_mode=True` (flipped by `processing.py` before the video call), each input is resized using the **video** target-size rule (`video_target_num_patches`, aspect-ratio preserved) instead of the image dynamic-res rule. This matches vLLM's split between `video_to_pixel_values` (video path) and `DynamicResolutionImageTiler` (image path). """ is_video = self._is_video_mode images = make_list_of_images(images) target_sizes = [] if is_video: for img in images: target_w_patches, target_h_patches = self._compute_target_patches_video(img) target_sizes.append((target_w_patches, target_h_patches)) else: # Image path: per-image budget bounded by [min_num_patches, max_num_patches], with a # global cap derived from `max_model_len` × pixel-shuffle factor². num_tokens_available = self.max_model_len - 4 # match vLLM's reserve budget = num_tokens_available * (self._downsample_factor ** 2) budget = max(budget, self.min_num_patches * len(images)) max_budget = self.max_num_patches if (self.max_num_patches and self.max_num_patches > 0) else float("inf") per_image_budget = [max(min(budget, max_budget), self.min_num_patches) for _ in images] # Single-pass — vLLM has an iterative scale-down for the batch, but it rarely binds in # single-image / small-batch inference. for img, tokens_for_media in zip(images, per_image_budget): target_w_patches, target_h_patches = self._compute_target_patches(img, tokens_for_media) target_sizes.append((target_w_patches, target_h_patches)) import numpy as np norm_mean = torch.tensor(self.norm_mean).view(1, 3, 1, 1) norm_std = torch.tensor(self.norm_std).view(1, 3, 1, 1) pixel_values_list = [] num_tokens_per_image = [] imgs_sizes = [] for img, (wp, hp) in zip(images, target_sizes): target_w = wp * self.patch_size target_h = hp * self.patch_size # Use torch's antialiased bicubic interpolation to match vLLM's # `_bicubic_resize_and_normalize` (`torch.nn.functional.interpolate`, `antialias=True`). # PIL's bicubic uses a different kernel (and no antialiasing), producing visibly different # pixel values that amplify through the 52-layer ViT / mamba stack and cause HF/vLLM # outputs to diverge past the first few tokens. arr = np.asarray(img, dtype=np.uint8) # (H, W, 3) t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(dtype=torch.float32) # (1, 3, H, W) if t.shape[-2] != target_h or t.shape[-1] != target_w: t = torch.nn.functional.interpolate( t, size=(target_h, target_w), mode="bicubic", align_corners=False, antialias=True ) t = (t / 255.0 - norm_mean) / norm_std pixel_values_list.append(t.squeeze(0)) # (3, H, W) num_tokens_per_image.append((wp * hp) // (self._downsample_factor ** 2)) imgs_sizes.append((target_h, target_w)) # Stack if all images have the same target size (common for same-aspect-ratio batches); # otherwise keep as a list of (3, H_i, W_i) tensors. The outer model's `extract_feature` # handles both. all_same_shape = all(t.shape == pixel_values_list[0].shape for t in pixel_values_list) if all_same_shape: pixel_values = torch.stack(pixel_values_list, dim=0) else: pixel_values = pixel_values_list return BatchFeature( data={ "pixel_values": pixel_values, # One tile per image in dynamic mode — `num_tokens` is what the text-side # placeholder expansion should use. "num_patches": [1] * len(images), "num_tokens": num_tokens_per_image, "imgs_sizes": imgs_sizes, }, tensor_type=(return_tensors if all_same_shape else None), ) def _compute_target_patches(self, img: Image.Image, tokens_available: int): """Port of `DynamicResolutionImageTiler.process_media` (image-only, no thumbnail).""" orig_w, orig_h = img.width, img.height # Ceil-ish: `round(x + 0.5)` == `floor(x) + 1` for non-integer x, `x` for integer. closest_patch_h = round(orig_h / self.patch_size + 0.5) closest_patch_w = round(orig_w / self.patch_size + 0.5) patches = closest_patch_h * closest_patch_w # Downscale to fit the token budget. factor = min(math.sqrt(tokens_available / patches), 1.0) target_h = math.floor(factor * closest_patch_h) target_w = math.floor(factor * closest_patch_w) # Scale up if below the per-image minimum. if ( tokens_available > self.min_num_patches and target_h * target_w < self.min_num_patches ): up = math.sqrt(self.min_num_patches / (target_h * target_w)) target_h = math.ceil(up * target_h) target_w = math.ceil(up * target_w) # Round each dim to a multiple of the pixel_shuffle factor so tokens divide evenly. divisor = self._downsample_factor rem_h = target_h % divisor if rem_h: inc_h = divisor - rem_h if (target_h + inc_h) * target_w <= tokens_available: target_h += inc_h else: target_h = max(divisor, target_h - rem_h) rem_w = target_w % divisor if rem_w: inc_w = divisor - rem_w if target_h * (target_w + inc_w) <= tokens_available: target_w += inc_w else: target_w = max(divisor, target_w - rem_w) return target_w, target_h def _compute_target_patches_video(self, img: Image.Image): """Port of vLLM's `_compute_aspect_preserving_size` for video frames. Each frame is resized to roughly `video_target_num_patches` (default 1024) on the 16×16 grid, with aspect ratio preserved and dims snapped to a multiple of the pixel_shuffle factor. For `maintain_aspect_ratio=False`, it falls back to a square of sqrt(target) patches. """ orig_w, orig_h = img.width, img.height target = self.video_target_num_patches divisor = self._downsample_factor # 2 for pixel_shuffle if self.video_maintain_aspect_ratio: aspect_wh = orig_w / max(orig_h, 1) ph = max(round(math.sqrt(target / aspect_wh)), 1) pw = max(round(math.sqrt(target * aspect_wh)), 1) if divisor > 1: rem_h = ph % divisor rem_w = pw % divisor ph_up = ph + (divisor - rem_h if rem_h else 0) ph_down = ph - rem_h pw_up = pw + (divisor - rem_w if rem_w else 0) pw_down = pw - rem_w # Prefer rounding up when the up-rounded patch count still fits the target; # otherwise round down (mirrors vLLM's logic exactly). if ph_up * pw_up <= target: ph, pw = ph_up, pw_up else: ph = max(divisor, ph_down) pw = max(divisor, pw_down) else: side = int(math.sqrt(target)) side = max(divisor, (side // divisor) * divisor) ph = pw = side return pw, ph