import math
from typing import Optional, Union

import numpy as np
from PIL import Image
import torch
from transformers.image_processing_base import BatchFeature
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import make_list_of_images, get_image_type, ImageInput, ImageType
from transformers.utils import TensorType


class NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(BaseImageProcessorFast):
    """
    Dynamic-resolution image processor for the V3 omni model.

    Each image is resized to a single tile whose patch-grid `(h_patches, w_patches)` is chosen to
    land between `min_num_patches` and `max_num_patches` (on a 16×16-pixel grid), respecting
    aspect ratio. This matches the algorithm in vLLM's `DynamicResolutionImageTiler`
    (`vllm/model_executor/models/nano_nemotron_vl.py`) so HF and vLLM inference see identical pixel
    inputs.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        norm_mean=None,
        norm_std=None,
        patch_size=16,
        downsample_ratio=0.5,
        min_num_patches=1024,
        max_num_patches=13312,
        max_model_len=16384,
        video_target_num_patches=1024,
        video_maintain_aspect_ratio=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.norm_mean = norm_mean
        self.norm_std = norm_std
        self.patch_size = patch_size
        self.downsample_ratio = downsample_ratio
        # Integer reduction factor for pixel_shuffle (downsample_ratio = 0.5 → factor 2).
        self._downsample_factor = int(round(1.0 / downsample_ratio))
        # Per-image patch-grid bounds (on the pre-pixel-shuffle 16×16 grid).
        self.min_num_patches = min_num_patches
        self.max_num_patches = max_num_patches
        self.max_model_len = max_model_len
        # Video frames use a separate (fixed) target-patch budget with aspect-ratio preserved.
        # Matches vLLM's `_compute_aspect_preserving_size` in `nano_nemotron_vl.py`.
        self.video_target_num_patches = video_target_num_patches
        self.video_maintain_aspect_ratio = video_maintain_aspect_ratio

    # Keep the image as a PIL object all the way to `_preprocess`: the resize happens there using
    # torch's antialiased bicubic interpolation (matching vLLM), so the base class must not resize
    # or convert it earlier with a different kernel and break bit-exact agreement.
    def _process_image(self, image: ImageInput, **kwargs):
        if get_image_type(image) == ImageType.PIL:
            if image.mode != "RGB":
                image = image.convert("RGB")
        return image

    # Some transformers versions expose this hook as `process_image` rather than `_process_image`;
    # alias both so either entry point works.
    process_image = _process_image

    # Toggled by `processing.py` around video calls (the strict `ImagesKwargs` validator won't let
    # us thread a new kwarg down, so we use a flag on the instance instead).
    _is_video_mode: bool = False
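
    # A sketch of the assumed call pattern on the `processing.py` side (hypothetical call site,
    # not part of this class):
    #
    #     image_processor._is_video_mode = True
    #     try:
    #         video_features = image_processor._preprocess(frames, return_tensors="pt")
    #     finally:
    #         image_processor._is_video_mode = False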

    def _preprocess(
        self,
        images,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Port of vLLM's `DynamicResolutionImageTiler._images_to_pixel_values_lst`.

        When `self._is_video_mode=True` (flipped by `processing.py` before the video call), each
        input is resized using the **video** target-size rule (`video_target_num_patches`,
        aspect-ratio preserved) instead of the image dynamic-res rule. This matches vLLM's split
        between `video_to_pixel_values` (video path) and `DynamicResolutionImageTiler` (image
        path).
        """
        is_video = self._is_video_mode
        images = make_list_of_images(images)

        target_sizes = []
        if is_video:
            for img in images:
                target_w_patches, target_h_patches = self._compute_target_patches_video(img)
                target_sizes.append((target_w_patches, target_h_patches))
        else:
            # Image path: per-image budget bounded by [min_num_patches, max_num_patches], with a
            # global cap derived from `max_model_len` × pixel-shuffle factor².
            num_tokens_available = self.max_model_len - 4  # match vLLM's reserve
            budget = num_tokens_available * (self._downsample_factor ** 2)
            budget = max(budget, self.min_num_patches * len(images))
            max_budget = self.max_num_patches if (self.max_num_patches and self.max_num_patches > 0) else float("inf")
            per_image_budget = [max(min(budget, max_budget), self.min_num_patches) for _ in images]
            # Single-pass — vLLM has an iterative scale-down for the batch, but it rarely binds in
            # single-image / small-batch inference.
            for img, tokens_for_media in zip(images, per_image_budget):
                target_w_patches, target_h_patches = self._compute_target_patches(img, tokens_for_media)
                target_sizes.append((target_w_patches, target_h_patches))


        norm_mean = torch.tensor(self.norm_mean).view(1, 3, 1, 1)
        norm_std = torch.tensor(self.norm_std).view(1, 3, 1, 1)

        pixel_values_list = []
        num_tokens_per_image = []
        imgs_sizes = []
        for img, (wp, hp) in zip(images, target_sizes):
            target_w = wp * self.patch_size
            target_h = hp * self.patch_size
            # Use torch's antialiased bicubic interpolation to match vLLM's
            # `_bicubic_resize_and_normalize` (`torch.nn.functional.interpolate`, `antialias=True`).
            # PIL's bicubic uses a different kernel (and no antialiasing), producing visibly different
            # pixel values that amplify through the 52-layer ViT / mamba stack and cause HF/vLLM
            # outputs to diverge past the first few tokens.
            arr = np.asarray(img, dtype=np.uint8)  # (H, W, 3)
            t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(dtype=torch.float32)  # (1, 3, H, W)
            if t.shape[-2] != target_h or t.shape[-1] != target_w:
                t = torch.nn.functional.interpolate(
                    t, size=(target_h, target_w), mode="bicubic", align_corners=False, antialias=True
                )
            t = (t / 255.0 - norm_mean) / norm_std
            pixel_values_list.append(t.squeeze(0))  # (3, H, W)
            num_tokens_per_image.append((wp * hp) // (self._downsample_factor ** 2))
            imgs_sizes.append((target_h, target_w))

        # Stack if all images have the same target size (common for same-aspect-ratio batches);
        # otherwise keep as a list of (3, H_i, W_i) tensors. The outer model's `extract_feature`
        # handles both.
        all_same_shape = all(t.shape == pixel_values_list[0].shape for t in pixel_values_list)
        if all_same_shape:
            pixel_values = torch.stack(pixel_values_list, dim=0)
        else:
            pixel_values = pixel_values_list

        return BatchFeature(
            data={
                "pixel_values": pixel_values,
                # One tile per image in dynamic mode — `num_tokens` is what the text-side
                # placeholder expansion should use.
                "num_patches": [1] * len(images),
                "num_tokens": num_tokens_per_image,
                "imgs_sizes": imgs_sizes,
            },
            tensor_type=(return_tensors if all_same_shape else None),
        )

    def _compute_target_patches(self, img: Image.Image, tokens_available: int):
        """Port of `DynamicResolutionImageTiler.process_media` (image-only, no thumbnail)."""
        orig_w, orig_h = img.width, img.height
        # Effectively ceil: `round(x + 0.5)` equals `floor(x) + 1` for non-integer x; for exact
        # integers Python's round-half-to-even keeps even values at `x` and bumps odd values to `x + 1`.
        closest_patch_h = round(orig_h / self.patch_size + 0.5)
        closest_patch_w = round(orig_w / self.patch_size + 0.5)
        patches = closest_patch_h * closest_patch_w

        # Downscale to fit the token budget.
        factor = min(math.sqrt(tokens_available / patches), 1.0)
        target_h = math.floor(factor * closest_patch_h)
        target_w = math.floor(factor * closest_patch_w)

        # Scale up if below the per-image minimum.
        if (
            tokens_available > self.min_num_patches
            and target_h * target_w < self.min_num_patches
        ):
            up = math.sqrt(self.min_num_patches / (target_h * target_w))
            target_h = math.ceil(up * target_h)
            target_w = math.ceil(up * target_w)

        # Round each dim to a multiple of the pixel_shuffle factor so tokens divide evenly.
        divisor = self._downsample_factor
        rem_h = target_h % divisor
        if rem_h:
            inc_h = divisor - rem_h
            if (target_h + inc_h) * target_w <= tokens_available:
                target_h += inc_h
            else:
                target_h = max(divisor, target_h - rem_h)
        rem_w = target_w % divisor
        if rem_w:
            inc_w = divisor - rem_w
            if target_h * (target_w + inc_w) <= tokens_available:
                target_w += inc_w
            else:
                target_w = max(divisor, target_w - rem_w)

        return target_w, target_h

    def _compute_target_patches_video(self, img: Image.Image):
        """Port of vLLM's `_compute_aspect_preserving_size` for video frames.

        Each frame is resized to roughly `video_target_num_patches` (default 1024) on the 16×16
        grid, with aspect ratio preserved and dims snapped to a multiple of the pixel_shuffle
        factor. With `video_maintain_aspect_ratio=False`, it falls back to a square grid of roughly
        sqrt(target) patches per side, snapped down to a multiple of that factor.
        """
        orig_w, orig_h = img.width, img.height
        target = self.video_target_num_patches
        divisor = self._downsample_factor  # 2 for pixel_shuffle
        if self.video_maintain_aspect_ratio:
            aspect_wh = orig_w / max(orig_h, 1)
            ph = max(round(math.sqrt(target / aspect_wh)), 1)
            pw = max(round(math.sqrt(target * aspect_wh)), 1)
            if divisor > 1:
                rem_h = ph % divisor
                rem_w = pw % divisor
                ph_up = ph + (divisor - rem_h if rem_h else 0)
                ph_down = ph - rem_h
                pw_up = pw + (divisor - rem_w if rem_w else 0)
                pw_down = pw - rem_w
                # Prefer rounding up when the up-rounded patch count still fits the target;
                # otherwise round down (mirrors vLLM's logic exactly).
                if ph_up * pw_up <= target:
                    ph, pw = ph_up, pw_up
                else:
                    ph = max(divisor, ph_down)
                    pw = max(divisor, pw_down)
        else:
            side = int(math.sqrt(target))
            side = max(divisor, (side // divisor) * divisor)
            ph = pw = side
        return pw, ph
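

if __name__ == "__main__":
    # Minimal smoke test; illustrative only. The normalization statistics below are assumed
    # ImageNet values, not necessarily the ones shipped with the model's config, and the private
    # `_preprocess` hook is called directly to bypass the base-class kwargs pipeline.
    processor = NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(
        norm_mean=[0.485, 0.456, 0.406],
        norm_std=[0.229, 0.224, 0.225],
    )
    demo = Image.new("RGB", (1920, 1080), color=(128, 64, 32))

    # Image path: dynamic resolution bounded by [min_num_patches, max_num_patches].
    image_feats = processor._preprocess([demo], return_tensors="pt")
    print("image pixel_values:", tuple(image_feats["pixel_values"].shape))
    print("image num_tokens:", image_feats["num_tokens"])

    # Video path: fixed per-frame patch budget with aspect ratio preserved.
    processor._is_video_mode = True
    try:
        video_feats = processor._preprocess([demo], return_tensors="pt")
    finally:
        processor._is_video_mode = False
    print("video pixel_values:", tuple(video_feats["pixel_values"].shape))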