Re-converted with official allenai/molmo2 conversion script
- added_tokens.json +4 -0
- config.json +3 -3
- generation_config.json +1 -1
- image_processing_molmo2.py +25 -5
- processing_molmo2.py +33 -8
- processor_config.json +1 -0
- special_tokens_map.json +5 -1
- tokenizer.json +2 -2
- tokenizer_config.json +37 -1
- video_preprocessor_config.json +2 -1
- video_processing_molmo2.py +14 -5
added_tokens.json
CHANGED
@@ -27,14 +27,18 @@
   "<|image|>": 151941,
   "<|object_ref_end|>": 151647,
   "<|object_ref_start|>": 151646,
+  "<|points|>": 151946,
   "<|quad_end|>": 151651,
   "<|quad_start|>": 151650,
   "<|repo_name|>": 151663,
+  "<|token_index|>": 151947,
   "<|video_pad|>": 151656,
   "<|video|>": 151945,
   "<|vision_end|>": 151653,
   "<|vision_pad|>": 151654,
   "<|vision_start|>": 151652,
+  "<|vit_index|>": 151948,
+  "<|vit_loc|>": 151949,
   "|<EXTRA_TOKENS_0>|": 151669,
   "|<EXTRA_TOKENS_100>|": 151769,
   "|<EXTRA_TOKENS_101>|": 151770,
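The four new special tokens take ids 151946-151949. A quick post-conversion sanity check, as a minimal sketch (the local checkpoint path "./molmo2-checkpoint" is hypothetical; the expected ids come straight from the diff above):

    from transformers import AutoTokenizer

    # Hypothetical local path to this converted checkpoint.
    tok = AutoTokenizer.from_pretrained("./molmo2-checkpoint", trust_remote_code=True)

    # Expected ids taken directly from added_tokens.json.
    expected = {
        "<|points|>": 151946,
        "<|token_index|>": 151947,
        "<|vit_index|>": 151948,
        "<|vit_loc|>": 151949,
    }
    for token, token_id in expected.items():
        assert tok.convert_tokens_to_ids(token) == token_id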
config.json
CHANGED
@@ -50,7 +50,7 @@
     "initializer_range": 0.02,
     "intermediate_size": 9728,
     "layer_norm_eps": 1e-06,
-    "max_position_embeddings":
+    "max_position_embeddings": 16384,
     "model_type": "molmo2_text",
     "norm_after": false,
     "num_attention_heads": 32,
@@ -67,9 +67,9 @@
     "vocab_size": 151936
   },
   "tie_word_embeddings": false,
-  "transformers_version": "4.
+  "transformers_version": "4.56.2",
   "use_cache": true,
-  "use_frame_special_tokens":
+  "use_frame_special_tokens": false,
   "vit_config": {
     "attention_dropout": 0.0,
     "attn_implementation": "sdpa",
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "bos_token_id": 151645,
   "eos_token_id": 151645,
   "pad_token_id": 151643,
-  "transformers_version": "4.
+  "transformers_version": "4.56.2"
 }
image_processing_molmo2.py
CHANGED
@@ -259,12 +259,13 @@ def image_to_patches_and_grids(
     image_patch_size: int,
     image_pooling_w: int,
     image_pooling_h: int,
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     """
     :return image_grids, the shape of each (low-res, high-res) image after pooling
     :return crops, the image crops to process with the ViT
     :return pooled_patch_idx, for each patch_id token in `image_tokens`, the indices of the
         patches in `crops` to pool for that token, masked with -1
+    :return patch_idx_arr, maps patch coordinates to patch ids
     """
     if isinstance(base_image_input_size, int):
         base_image_input_size = (base_image_input_size, base_image_input_size)
@@ -298,6 +299,7 @@ def image_to_patches_and_grids(
         image_std,
         image_patch_size,
     )
+    patch_idx_arr += crop_patch_h * crop_patch_w
    crop_arr = np.concatenate([resized, crop_arr], 0)

    resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
@@ -316,7 +318,8 @@ def image_to_patches_and_grids(
    return (
        np.stack(image_grid, 0),
        batch_pixels_to_patches(crop_arr, image_patch_size),
-        pooling_idx
+        pooling_idx,
+        patch_idx_arr
    )
@@ -395,6 +398,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_pointing_metadata: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -428,6 +432,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            return_pointing_metadata (bool, optional):
+                Whether to return patch mappings used for decoding MolmoPoint points

        Returns:
            A `BatchFeature` containing the following keys:
@@ -473,6 +479,9 @@ class Molmo2ImageProcessor(BaseImageProcessor):
        images = [to_numpy_array(image) for image in images]

        data = {}
+        patch_mappings = []
+        absolute_token_pooling = []
+        offset = 0
        if images is not None:
            batch_grids = []
            batch_crops = []
@@ -480,7 +489,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
            batch_num_crops = []

            for image in images:
-                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                image_grid, crops, pooled_idx, patch_mapping = image_to_patches_and_grids(
                    image,
                    max_crops,
                    overlap_margins,
@@ -496,6 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                batch_crops.append(crops)
                batch_pooled_patches_idx.append(pooled_idx)
                batch_num_crops.append(crops.shape[0])
+                if return_pointing_metadata:
+                    absolute_token_pooling.append(
+                        np.where(pooled_idx >= 0, pooled_idx + offset, -1))
+                    patch_mappings.append(patch_mapping + offset)
+                    n_patches = np.prod(crops.shape[:2])
+                    offset += n_patches

            pixel_values = np.concatenate(batch_crops, 0)
            image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
@@ -509,7 +524,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                image_num_crops=image_num_crops,
            )

-        return BatchFeature(data, tensor_type=return_tensors)
+        data = BatchFeature(data, tensor_type=return_tensors)
+        if return_pointing_metadata:
+            data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
+            data["subpatch_mapping"] = patch_mappings
+            data["image_sizes"] = [x.shape[:2][::-1] for x in images]
+        return data


-Molmo2ImageProcessor.register_for_auto_class()
+Molmo2ImageProcessor.register_for_auto_class()
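The new offset bookkeeping in Molmo2ImageProcessor rebases each image's pooled patch indices into absolute indices over the concatenated crops, while keeping -1 as the mask value. A self-contained numpy sketch of that rebasing, with toy shapes standing in for the real crop geometry:

    import numpy as np

    # Toy pooled indices for two images; -1 marks masked slots that must stay -1.
    pooled_per_image = [np.array([0, 3, -1, 5]), np.array([1, -1, 2])]
    n_patches_per_image = [8, 4]  # per image: np.prod(crops.shape[:2]) in the real code

    absolute, offset = [], 0
    for pooled_idx, n_patches in zip(pooled_per_image, n_patches_per_image):
        # Shift valid indices by the running offset; leave masked entries at -1.
        absolute.append(np.where(pooled_idx >= 0, pooled_idx + offset, -1))
        offset += n_patches

    print(np.concatenate(absolute))  # [ 0  3 -1  5  9 -1 10]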
processing_molmo2.py
CHANGED
@@ -11,7 +11,7 @@ from transformers.video_utils import VideoInput
 from transformers.processing_utils import (
     Unpack,
     ProcessingKwargs,
-    ProcessorMixin,
+    ProcessorMixin, AllKwargsForChatTemplate,
 )
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
        use_single_crop_start_token: Optional[bool] = True,
        video_use_col_tokens: Optional[bool] = False,
        use_frame_special_tokens: Optional[bool] = True,
+        use_low_res_token_for_global_crops: bool = False,
        **kwargs
    ) -> None:
        super().__init__(
@@ -101,13 +102,14 @@ class Molmo2Processor(ProcessorMixin):
            video_use_col_tokens=video_use_col_tokens,
            use_frame_special_tokens=use_frame_special_tokens,
        )
-
        self.image_placeholder_token = IMAGE_PROMPT
        self.video_placeholder_token = VIDEO_PROMPT
        self.image_token_ids = [
            tokenizer.convert_tokens_to_ids(token)
            for token in IMAGE_TOKENS
        ]
+        self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
+        self._patch_metadata = None

    def get_image_tokens(self, image_grid: np.ndarray):
        resized_h, resized_w, height, width = image_grid
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
            np.tile(per_row, [height]),
            [IM_END_TOKEN],
        ]
-        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+        if self.use_low_res_token_for_global_crops:
+            per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
+        else:
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
        use_single_crop_col_tokens = (
            self.image_use_col_tokens
            if self.use_single_crop_col_tokens is None
@@ -247,6 +252,8 @@ class Molmo2Processor(ProcessorMixin):
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: ImageInput = None,
        videos: VideoInput = None,
+        return_pointing_metadata: bool = False,
+        use_low_res_token_for_global_crops: bool = False,
        **kwargs: Unpack[Molmo2ProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -287,22 +294,37 @@ class Molmo2Processor(ProcessorMixin):
                Returned when `videos` is not `None`.
            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
        """
-
        output_kwargs = self._merge_kwargs(
            Molmo2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
-
+        patch_metadata = {}
        if images is not None:
-            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"],
+                                                return_pointing_metadata=return_pointing_metadata)
+            if return_pointing_metadata:
+                patch_metadata["token_pooling"] = image_inputs.pop("image_token_pooling_np")
+                patch_metadata["subpatch_mapping"] = image_inputs.pop("subpatch_mapping")
+                patch_metadata["image_sizes"] = image_inputs.pop("image_sizes")
            image_grids = image_inputs["image_grids"]
        else:
            image_inputs = {}
            image_grids = None

        if videos is not None:
-            videos_inputs = self.video_processor(
+            videos_inputs = self.video_processor(
+                videos=videos, **output_kwargs["videos_kwargs"],
+                return_pointing_metadata=return_pointing_metadata
+            )
+            if return_pointing_metadata:
+                assert len(videos_inputs['video_metadata']) == 1
+                vd_metadata = videos_inputs['video_metadata'][0]
+                patch_metadata["token_pooling"] = videos_inputs.pop("video_token_pooling_np")
+                patch_metadata["subpatch_mapping"] = videos_inputs.pop("subpatch_mapping")
+                patch_metadata["timestamps"] = vd_metadata.timestamps
+                patch_metadata["video_size"] = (vd_metadata.width, vd_metadata.height)
+
            video_grids = videos_inputs["video_grids"]
            # If user has not requested video metadata, pop it
            if "return_metadata" not in kwargs:
@@ -367,10 +389,13 @@ class Molmo2Processor(ProcessorMixin):
        text_inputs["input_ids"] = input_ids.tolist()
        text_inputs["attention_mask"] = attention_mask.tolist()

-        return BatchFeature(
+        features = BatchFeature(
            data={**text_inputs, **image_inputs, **videos_inputs},
            tensor_type=return_tensors,
        )
+        if return_pointing_metadata:
+            features["metadata"] = patch_metadata
+        return features

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
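Together these changes add an opt-in return_pointing_metadata path through the processor. A minimal usage sketch, assuming the converted files load via AutoProcessor with trust_remote_code=True (the local path and the stand-in image are hypothetical; the key names follow the diff above):

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("./molmo2-checkpoint", trust_remote_code=True)
    image = Image.new("RGB", (640, 480))  # stand-in image

    inputs = processor(
        text="Point to the dog.",
        images=[image],
        return_pointing_metadata=True,
    )
    # Pointing metadata rides along under "metadata", outside the tensor fields.
    meta = inputs["metadata"]
    print(meta["image_sizes"])            # [(640, 480)] -- (width, height) per image
    print(meta["token_pooling"].shape)    # absolute pooled-patch indices, -1 masked
    print(len(meta["subpatch_mapping"]))  # one patch-id map per image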
processor_config.json
CHANGED
@@ -5,6 +5,7 @@
   "image_use_col_tokens": true,
   "processor_class": "Molmo2Processor",
   "use_frame_special_tokens": true,
+  "use_low_res_token_for_global_crops": false,
   "use_single_crop_col_tokens": false,
   "use_single_crop_start_token": true,
   "video_use_col_tokens": false
special_tokens_map.json
CHANGED
@@ -276,7 +276,11 @@
     "<im_low>",
     "<frame_start>",
     "<frame_end>",
-    "<|video|>"
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
   ],
   "bos_token": "<|im_end|>",
   "eos_token": {
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3ef24b8f324674c3a1163e2c205b9e2b1a230fd2d4294de7b779bb419fc23914
+size 11477774
tokenizer_config.json
CHANGED
@@ -2425,6 +2425,38 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "151946": {
+      "content": "<|points|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151947": {
+      "content": "<|token_index|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151948": {
+      "content": "<|vit_index|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151949": {
+      "content": "<|vit_loc|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
@@ -2704,7 +2736,11 @@
     "<im_low>",
     "<frame_start>",
     "<frame_end>",
-    "<|video|>"
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
   ],
   "auto_map": {
     "AutoProcessor": "processing_molmo2.Molmo2Processor"
video_preprocessor_config.json
CHANGED
@@ -10,6 +10,7 @@
   "do_center_crop": null,
   "do_convert_rgb": true,
   "do_normalize": true,
+  "do_pad": null,
   "do_rescale": true,
   "do_resize": true,
   "do_sample_frames": true,
@@ -28,7 +29,6 @@
   "input_data_format": null,
   "max_fps": 2.0,
   "num_frames": 128,
-  "pad_size": null,
   "patch_size": 14,
   "pooling_size": [
     3,
@@ -43,6 +43,7 @@
     "height": 378,
     "width": 378
   },
+  "size_divisor": null,
   "video_metadata": null,
   "video_processor_type": "Molmo2VideoProcessor"
 }
video_processing_molmo2.py
CHANGED
@@ -826,7 +826,8 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
    ) -> BatchFeature:
        validate_kwargs(
            captured_kwargs=kwargs.keys(),
-            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
+                ["return_tensors", "return_pointing_metadata"],
        )

        # Set default kwargs from self. This ensures that if a kwarg is not provided
@@ -867,6 +868,7 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_pointing_metadata: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -955,13 +957,20 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
        pixel_values_videos = np.concatenate(batch_crops, 0)
        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)

-        data =dict(
+        data = BatchFeature(dict(
            pixel_values_videos=pixel_values_videos,
            video_token_pooling=video_token_pooling,
            video_grids=video_grids,
-        )
-
-
+        ), tensor_type=return_tensors)
+        if return_pointing_metadata:
+            t = pixel_values_videos.shape[0]
+            assert base_image_input_size[0] % self.patch_size == 0
+            assert base_image_input_size[1] % self.patch_size == 0
+            crop_w = base_image_input_size[0] // self.patch_size
+            crop_h = base_image_input_size[1] // self.patch_size
+            data["subpatch_mapping"] = np.arange(t*crop_w*crop_h).reshape([t, crop_h, crop_w])
+            data["video_token_pooling_np"] = video_token_pooling
+        return data


 Molmo2VideoProcessor.register_for_auto_class()
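For videos every frame is one fixed-size crop, so subpatch_mapping reduces to a dense index grid over frames. A self-contained numpy sketch of the shape arithmetic, using the 378x378 crop size and patch size 14 from video_preprocessor_config.json (the frame count is a toy value):

    import numpy as np

    base_image_input_size = (378, 378)  # from video_preprocessor_config.json
    patch_size = 14                     # ditto
    t = 4                               # sampled frames (toy value)

    crop_w = base_image_input_size[0] // patch_size  # 27 patches per row
    crop_h = base_image_input_size[1] // patch_size  # 27 patches per column

    # Frame f, patch row r, patch column c -> absolute patch index.
    subpatch_mapping = np.arange(t * crop_w * crop_h).reshape([t, crop_h, crop_w])
    print(subpatch_mapping.shape)     # (4, 27, 27)
    print(subpatch_mapping[1, 0, 0])  # 729 -- first patch of the second frame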