weikaih commited on
Commit
413c18f
·
verified ·
1 Parent(s): 1e5a25f

Re-converted with official allenai/molmo2 conversion script

Browse files
added_tokens.json CHANGED
@@ -27,14 +27,18 @@
27
  "<|image|>": 151941,
28
  "<|object_ref_end|>": 151647,
29
  "<|object_ref_start|>": 151646,
 
30
  "<|quad_end|>": 151651,
31
  "<|quad_start|>": 151650,
32
  "<|repo_name|>": 151663,
 
33
  "<|video_pad|>": 151656,
34
  "<|video|>": 151945,
35
  "<|vision_end|>": 151653,
36
  "<|vision_pad|>": 151654,
37
  "<|vision_start|>": 151652,
 
 
38
  "|<EXTRA_TOKENS_0>|": 151669,
39
  "|<EXTRA_TOKENS_100>|": 151769,
40
  "|<EXTRA_TOKENS_101>|": 151770,
 
27
  "<|image|>": 151941,
28
  "<|object_ref_end|>": 151647,
29
  "<|object_ref_start|>": 151646,
30
+ "<|points|>": 151946,
31
  "<|quad_end|>": 151651,
32
  "<|quad_start|>": 151650,
33
  "<|repo_name|>": 151663,
34
+ "<|token_index|>": 151947,
35
  "<|video_pad|>": 151656,
36
  "<|video|>": 151945,
37
  "<|vision_end|>": 151653,
38
  "<|vision_pad|>": 151654,
39
  "<|vision_start|>": 151652,
40
+ "<|vit_index|>": 151948,
41
+ "<|vit_loc|>": 151949,
42
  "|<EXTRA_TOKENS_0>|": 151669,
43
  "|<EXTRA_TOKENS_100>|": 151769,
44
  "|<EXTRA_TOKENS_101>|": 151770,
config.json CHANGED
@@ -50,7 +50,7 @@
50
  "initializer_range": 0.02,
51
  "intermediate_size": 9728,
52
  "layer_norm_eps": 1e-06,
53
- "max_position_embeddings": 36864,
54
  "model_type": "molmo2_text",
55
  "norm_after": false,
56
  "num_attention_heads": 32,
@@ -67,9 +67,9 @@
67
  "vocab_size": 151936
68
  },
69
  "tie_word_embeddings": false,
70
- "transformers_version": "4.57.1",
71
  "use_cache": true,
72
- "use_frame_special_tokens": true,
73
  "vit_config": {
74
  "attention_dropout": 0.0,
75
  "attn_implementation": "sdpa",
 
50
  "initializer_range": 0.02,
51
  "intermediate_size": 9728,
52
  "layer_norm_eps": 1e-06,
53
+ "max_position_embeddings": 16384,
54
  "model_type": "molmo2_text",
55
  "norm_after": false,
56
  "num_attention_heads": 32,
 
67
  "vocab_size": 151936
68
  },
69
  "tie_word_embeddings": false,
70
+ "transformers_version": "4.56.2",
71
  "use_cache": true,
72
+ "use_frame_special_tokens": false,
73
  "vit_config": {
74
  "attention_dropout": 0.0,
75
  "attn_implementation": "sdpa",
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "bos_token_id": 151645,
3
  "eos_token_id": 151645,
4
  "pad_token_id": 151643,
5
- "transformers_version": "4.57.1"
6
  }
 
2
  "bos_token_id": 151645,
3
  "eos_token_id": 151645,
4
  "pad_token_id": 151643,
5
+ "transformers_version": "4.56.2"
6
  }
image_processing_molmo2.py CHANGED
@@ -259,12 +259,13 @@ def image_to_patches_and_grids(
259
  image_patch_size: int,
260
  image_pooling_w: int,
261
  image_pooling_h: int,
262
- ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
263
  """
264
  :return image_grids, the shape of each (low-res, high-res) image after pooling
265
  :return crops, the image crops to process with the ViT
266
  :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
267
  patches in `crops` to pool for that token, masked with -1
 
268
  """
269
  if isinstance(base_image_input_size, int):
270
  base_image_input_size = (base_image_input_size, base_image_input_size)
@@ -298,6 +299,7 @@ def image_to_patches_and_grids(
298
  image_std,
299
  image_patch_size,
300
  )
 
301
  crop_arr = np.concatenate([resized, crop_arr], 0)
302
 
303
  resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
@@ -316,7 +318,8 @@ def image_to_patches_and_grids(
316
  return (
317
  np.stack(image_grid, 0),
318
  batch_pixels_to_patches(crop_arr, image_patch_size),
319
- pooling_idx
 
320
  )
321
 
322
 
@@ -395,6 +398,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
395
  patch_size: Optional[int] = None,
396
  pooling_size: Optional[list[int]] = None,
397
  return_tensors: Optional[Union[str, TensorType]] = None,
 
398
  **kwargs,
399
  ) -> BatchFeature:
400
  """
@@ -428,6 +432,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
428
  - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
429
  - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
430
  - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
 
 
431
 
432
  Returns:
433
  A `BatchFeature` containing the following keys:
@@ -473,6 +479,9 @@ class Molmo2ImageProcessor(BaseImageProcessor):
473
  images = [to_numpy_array(image) for image in images]
474
 
475
  data = {}
 
 
 
476
  if images is not None:
477
  batch_grids = []
478
  batch_crops = []
@@ -480,7 +489,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
480
  batch_num_crops = []
481
 
482
  for image in images:
483
- image_grid, crops, pooled_idx = image_to_patches_and_grids(
484
  image,
485
  max_crops,
486
  overlap_margins,
@@ -496,6 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
496
  batch_crops.append(crops)
497
  batch_pooled_patches_idx.append(pooled_idx)
498
  batch_num_crops.append(crops.shape[0])
 
 
 
 
 
 
499
 
500
  pixel_values = np.concatenate(batch_crops, 0)
501
  image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
@@ -509,7 +524,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
509
  image_num_crops=image_num_crops,
510
  )
511
 
512
- return BatchFeature(data, tensor_type=return_tensors)
 
 
 
 
 
513
 
514
 
515
- Molmo2ImageProcessor.register_for_auto_class()
 
259
  image_patch_size: int,
260
  image_pooling_w: int,
261
  image_pooling_h: int,
262
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
263
  """
264
  :return image_grids, the shape of each (low-res, high-res) image after pooling
265
  :return crops, the image crops to process with the ViT
266
  :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
267
  patches in `crops` to pool for that token, masked with -1
268
+ :return patch_idx_arr, map patch coordinates to patch ids
269
  """
270
  if isinstance(base_image_input_size, int):
271
  base_image_input_size = (base_image_input_size, base_image_input_size)
 
299
  image_std,
300
  image_patch_size,
301
  )
302
+ patch_idx_arr += crop_patch_h*crop_patch_w
303
  crop_arr = np.concatenate([resized, crop_arr], 0)
304
 
305
  resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
 
318
  return (
319
  np.stack(image_grid, 0),
320
  batch_pixels_to_patches(crop_arr, image_patch_size),
321
+ pooling_idx,
322
+ patch_idx_arr
323
  )
324
 
325
 
 
398
  patch_size: Optional[int] = None,
399
  pooling_size: Optional[list[int]] = None,
400
  return_tensors: Optional[Union[str, TensorType]] = None,
401
+ return_pointing_metadata: bool = False,
402
  **kwargs,
403
  ) -> BatchFeature:
404
  """
 
432
  - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
433
  - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
434
  - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
435
+ return_pointing_metadata (bool, optional):
436
+ Whether to return patch mappings used for decoding MolmoPoint points
437
 
438
  Returns:
439
  A `BatchFeature` containing the following keys:
 
479
  images = [to_numpy_array(image) for image in images]
480
 
481
  data = {}
482
+ patch_mappings = []
483
+ absolute_token_pooling = []
484
+ offset = 0
485
  if images is not None:
486
  batch_grids = []
487
  batch_crops = []
 
489
  batch_num_crops = []
490
 
491
  for image in images:
492
+ image_grid, crops, pooled_idx, patch_mapping = image_to_patches_and_grids(
493
  image,
494
  max_crops,
495
  overlap_margins,
 
505
  batch_crops.append(crops)
506
  batch_pooled_patches_idx.append(pooled_idx)
507
  batch_num_crops.append(crops.shape[0])
508
+ if return_pointing_metadata:
509
+ absolute_token_pooling.append(
510
+ np.where(pooled_idx >= 0, pooled_idx + offset, -1))
511
+ patch_mappings.append(patch_mapping + offset)
512
+ n_patches = np.prod(crops.shape[:2])
513
+ offset += n_patches
514
 
515
  pixel_values = np.concatenate(batch_crops, 0)
516
  image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
 
524
  image_num_crops=image_num_crops,
525
  )
526
 
527
+ data = BatchFeature(data, tensor_type=return_tensors)
528
+ if return_pointing_metadata:
529
+ data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
530
+ data["subpatch_mapping"] = patch_mappings
531
+ data["image_sizes"] = [x.shape[:2][::-1] for x in images]
532
+ return data
533
 
534
 
535
+ Molmo2ImageProcessor.register_for_auto_class()
processing_molmo2.py CHANGED
@@ -11,7 +11,7 @@ from transformers.video_utils import VideoInput
11
  from transformers.processing_utils import (
12
  Unpack,
13
  ProcessingKwargs,
14
- ProcessorMixin,
15
  )
16
  from transformers.feature_extraction_utils import BatchFeature
17
  from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
88
  use_single_crop_start_token: Optional[bool] = True,
89
  video_use_col_tokens: Optional[bool] = False,
90
  use_frame_special_tokens: Optional[bool] = True,
 
91
  **kwargs
92
  ) -> None:
93
  super().__init__(
@@ -101,13 +102,14 @@ class Molmo2Processor(ProcessorMixin):
101
  video_use_col_tokens=video_use_col_tokens,
102
  use_frame_special_tokens=use_frame_special_tokens,
103
  )
104
-
105
  self.image_placeholder_token = IMAGE_PROMPT
106
  self.video_placeholder_token = VIDEO_PROMPT
107
  self.image_token_ids = [
108
  tokenizer.convert_tokens_to_ids(token)
109
  for token in IMAGE_TOKENS
110
  ]
 
 
111
 
112
  def get_image_tokens(self, image_grid: np.ndarray):
113
  resized_h, resized_w, height, width = image_grid
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
119
  np.tile(per_row, [height]),
120
  [IM_END_TOKEN],
121
  ]
122
- per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
 
 
 
123
  use_single_crop_col_tokens = (
124
  self.image_use_col_tokens
125
  if self.use_single_crop_col_tokens is None
@@ -247,6 +252,8 @@ class Molmo2Processor(ProcessorMixin):
247
  text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
248
  images: ImageInput = None,
249
  videos: VideoInput = None,
 
 
250
  **kwargs: Unpack[Molmo2ProcessorKwargs],
251
  ) -> BatchFeature:
252
  """
@@ -287,22 +294,37 @@ class Molmo2Processor(ProcessorMixin):
287
  Returned when `videos` is not `None`.
288
  - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
289
  """
290
-
291
  output_kwargs = self._merge_kwargs(
292
  Molmo2ProcessorKwargs,
293
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
294
  **kwargs,
295
  )
296
-
297
  if images is not None:
298
- image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
 
 
 
 
 
299
  image_grids = image_inputs["image_grids"]
300
  else:
301
  image_inputs = {}
302
  image_grids = None
303
 
304
  if videos is not None:
305
- videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
 
 
 
 
 
 
 
 
 
 
 
306
  video_grids = videos_inputs["video_grids"]
307
  # If user has not requested video metadata, pop it
308
  if "return_metadata" not in kwargs:
@@ -367,10 +389,13 @@ class Molmo2Processor(ProcessorMixin):
367
  text_inputs["input_ids"] = input_ids.tolist()
368
  text_inputs["attention_mask"] = attention_mask.tolist()
369
 
370
- return BatchFeature(
371
  data={**text_inputs, **image_inputs, **videos_inputs},
372
  tensor_type=return_tensors,
373
  )
 
 
 
374
 
375
  def post_process_image_text_to_text(
376
  self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
 
11
  from transformers.processing_utils import (
12
  Unpack,
13
  ProcessingKwargs,
14
+ ProcessorMixin, AllKwargsForChatTemplate,
15
  )
16
  from transformers.feature_extraction_utils import BatchFeature
17
  from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
 
88
  use_single_crop_start_token: Optional[bool] = True,
89
  video_use_col_tokens: Optional[bool] = False,
90
  use_frame_special_tokens: Optional[bool] = True,
91
+ use_low_res_token_for_global_crops: bool = False,
92
  **kwargs
93
  ) -> None:
94
  super().__init__(
 
102
  video_use_col_tokens=video_use_col_tokens,
103
  use_frame_special_tokens=use_frame_special_tokens,
104
  )
 
105
  self.image_placeholder_token = IMAGE_PROMPT
106
  self.video_placeholder_token = VIDEO_PROMPT
107
  self.image_token_ids = [
108
  tokenizer.convert_tokens_to_ids(token)
109
  for token in IMAGE_TOKENS
110
  ]
111
+ self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
112
+ self._patch_metadata = None
113
 
114
  def get_image_tokens(self, image_grid: np.ndarray):
115
  resized_h, resized_w, height, width = image_grid
 
121
  np.tile(per_row, [height]),
122
  [IM_END_TOKEN],
123
  ]
124
+ if self.use_low_res_token_for_global_crops:
125
+ per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
126
+ else:
127
+ per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
128
  use_single_crop_col_tokens = (
129
  self.image_use_col_tokens
130
  if self.use_single_crop_col_tokens is None
 
252
  text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
253
  images: ImageInput = None,
254
  videos: VideoInput = None,
255
+ return_pointing_metadata: bool = False,
256
+ use_low_res_token_for_global_crops: bool = False,
257
  **kwargs: Unpack[Molmo2ProcessorKwargs],
258
  ) -> BatchFeature:
259
  """
 
294
  Returned when `videos` is not `None`.
295
  - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
296
  """
 
297
  output_kwargs = self._merge_kwargs(
298
  Molmo2ProcessorKwargs,
299
  tokenizer_init_kwargs=self.tokenizer.init_kwargs,
300
  **kwargs,
301
  )
302
+ patch_metadata = {}
303
  if images is not None:
304
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"],
305
+ return_pointing_metadata=return_pointing_metadata)
306
+ if return_pointing_metadata:
307
+ patch_metadata["token_pooling"] = image_inputs.pop("image_token_pooling_np")
308
+ patch_metadata["subpatch_mapping"] = image_inputs.pop("subpatch_mapping")
309
+ patch_metadata["image_sizes"] = image_inputs.pop("image_sizes")
310
  image_grids = image_inputs["image_grids"]
311
  else:
312
  image_inputs = {}
313
  image_grids = None
314
 
315
  if videos is not None:
316
+ videos_inputs = self.video_processor(
317
+ videos=videos, **output_kwargs["videos_kwargs"],
318
+ return_pointing_metadata=return_pointing_metadata
319
+ )
320
+ if return_pointing_metadata:
321
+ assert len(videos_inputs['video_metadata']) == 1
322
+ vd_metadata = videos_inputs['video_metadata'][0]
323
+ patch_metadata["token_pooling"] = videos_inputs.pop("video_token_pooling_np")
324
+ patch_metadata["subpatch_mapping"] = videos_inputs.pop("subpatch_mapping")
325
+ patch_metadata["timestamps"] = vd_metadata.timestamps
326
+ patch_metadata["video_size"] = (vd_metadata.width, vd_metadata.height)
327
+
328
  video_grids = videos_inputs["video_grids"]
329
  # If user has not requested video metadata, pop it
330
  if "return_metadata" not in kwargs:
 
389
  text_inputs["input_ids"] = input_ids.tolist()
390
  text_inputs["attention_mask"] = attention_mask.tolist()
391
 
392
+ features = BatchFeature(
393
  data={**text_inputs, **image_inputs, **videos_inputs},
394
  tensor_type=return_tensors,
395
  )
396
+ if return_pointing_metadata:
397
+ features["metadata"] = patch_metadata
398
+ return features
399
 
400
  def post_process_image_text_to_text(
401
  self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
processor_config.json CHANGED
@@ -5,6 +5,7 @@
5
  "image_use_col_tokens": true,
6
  "processor_class": "Molmo2Processor",
7
  "use_frame_special_tokens": true,
 
8
  "use_single_crop_col_tokens": false,
9
  "use_single_crop_start_token": true,
10
  "video_use_col_tokens": false
 
5
  "image_use_col_tokens": true,
6
  "processor_class": "Molmo2Processor",
7
  "use_frame_special_tokens": true,
8
+ "use_low_res_token_for_global_crops": false,
9
  "use_single_crop_col_tokens": false,
10
  "use_single_crop_start_token": true,
11
  "video_use_col_tokens": false
special_tokens_map.json CHANGED
@@ -276,7 +276,11 @@
276
  "<im_low>",
277
  "<frame_start>",
278
  "<frame_end>",
279
- "<|video|>"
 
 
 
 
280
  ],
281
  "bos_token": "<|im_end|>",
282
  "eos_token": {
 
276
  "<im_low>",
277
  "<frame_start>",
278
  "<frame_end>",
279
+ "<|video|>",
280
+ "<|points|>",
281
+ "<|token_index|>",
282
+ "<|vit_index|>",
283
+ "<|vit_loc|>"
284
  ],
285
  "bos_token": "<|im_end|>",
286
  "eos_token": {
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95e80901c901584f416b8fd4349fd60022774b89ba4377626511f0562cc599f7
3
- size 11477017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef24b8f324674c3a1163e2c205b9e2b1a230fd2d4294de7b779bb419fc23914
3
+ size 11477774
tokenizer_config.json CHANGED
@@ -2425,6 +2425,38 @@
2425
  "rstrip": false,
2426
  "single_word": false,
2427
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2428
  }
2429
  },
2430
  "additional_special_tokens": [
@@ -2704,7 +2736,11 @@
2704
  "<im_low>",
2705
  "<frame_start>",
2706
  "<frame_end>",
2707
- "<|video|>"
 
 
 
 
2708
  ],
2709
  "auto_map": {
2710
  "AutoProcessor": "processing_molmo2.Molmo2Processor"
 
2425
  "rstrip": false,
2426
  "single_word": false,
2427
  "special": true
2428
+ },
2429
+ "151946": {
2430
+ "content": "<|points|>",
2431
+ "lstrip": false,
2432
+ "normalized": false,
2433
+ "rstrip": false,
2434
+ "single_word": false,
2435
+ "special": true
2436
+ },
2437
+ "151947": {
2438
+ "content": "<|token_index|>",
2439
+ "lstrip": false,
2440
+ "normalized": false,
2441
+ "rstrip": false,
2442
+ "single_word": false,
2443
+ "special": true
2444
+ },
2445
+ "151948": {
2446
+ "content": "<|vit_index|>",
2447
+ "lstrip": false,
2448
+ "normalized": false,
2449
+ "rstrip": false,
2450
+ "single_word": false,
2451
+ "special": true
2452
+ },
2453
+ "151949": {
2454
+ "content": "<|vit_loc|>",
2455
+ "lstrip": false,
2456
+ "normalized": false,
2457
+ "rstrip": false,
2458
+ "single_word": false,
2459
+ "special": true
2460
  }
2461
  },
2462
  "additional_special_tokens": [
 
2736
  "<im_low>",
2737
  "<frame_start>",
2738
  "<frame_end>",
2739
+ "<|video|>",
2740
+ "<|points|>",
2741
+ "<|token_index|>",
2742
+ "<|vit_index|>",
2743
+ "<|vit_loc|>"
2744
  ],
2745
  "auto_map": {
2746
  "AutoProcessor": "processing_molmo2.Molmo2Processor"
video_preprocessor_config.json CHANGED
@@ -10,6 +10,7 @@
10
  "do_center_crop": null,
11
  "do_convert_rgb": true,
12
  "do_normalize": true,
 
13
  "do_rescale": true,
14
  "do_resize": true,
15
  "do_sample_frames": true,
@@ -28,7 +29,6 @@
28
  "input_data_format": null,
29
  "max_fps": 2.0,
30
  "num_frames": 128,
31
- "pad_size": null,
32
  "patch_size": 14,
33
  "pooling_size": [
34
  3,
@@ -43,6 +43,7 @@
43
  "height": 378,
44
  "width": 378
45
  },
 
46
  "video_metadata": null,
47
  "video_processor_type": "Molmo2VideoProcessor"
48
  }
 
10
  "do_center_crop": null,
11
  "do_convert_rgb": true,
12
  "do_normalize": true,
13
+ "do_pad": null,
14
  "do_rescale": true,
15
  "do_resize": true,
16
  "do_sample_frames": true,
 
29
  "input_data_format": null,
30
  "max_fps": 2.0,
31
  "num_frames": 128,
 
32
  "patch_size": 14,
33
  "pooling_size": [
34
  3,
 
43
  "height": 378,
44
  "width": 378
45
  },
46
+ "size_divisor": null,
47
  "video_metadata": null,
48
  "video_processor_type": "Molmo2VideoProcessor"
49
  }
video_processing_molmo2.py CHANGED
@@ -826,7 +826,8 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
826
  ) -> BatchFeature:
827
  validate_kwargs(
828
  captured_kwargs=kwargs.keys(),
829
- valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
 
830
  )
831
 
832
  # Set default kwargs from self. This ensures that if a kwarg is not provided
@@ -867,6 +868,7 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
867
  patch_size: Optional[int] = None,
868
  pooling_size: Optional[list[int]] = None,
869
  return_tensors: Optional[Union[str, TensorType]] = None,
 
870
  **kwargs,
871
  ) -> BatchFeature:
872
  """
@@ -955,13 +957,20 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
955
  pixel_values_videos = np.concatenate(batch_crops, 0)
956
  video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
957
 
958
- data =dict(
959
  pixel_values_videos=pixel_values_videos,
960
  video_token_pooling=video_token_pooling,
961
  video_grids=video_grids,
962
- )
963
-
964
- return BatchFeature(data, tensor_type=return_tensors)
 
 
 
 
 
 
 
965
 
966
 
967
  Molmo2VideoProcessor.register_for_auto_class()
 
826
  ) -> BatchFeature:
827
  validate_kwargs(
828
  captured_kwargs=kwargs.keys(),
829
+ valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
830
+ ["return_tensors", "return_pointing_metadata"],
831
  )
832
 
833
  # Set default kwargs from self. This ensures that if a kwarg is not provided
 
868
  patch_size: Optional[int] = None,
869
  pooling_size: Optional[list[int]] = None,
870
  return_tensors: Optional[Union[str, TensorType]] = None,
871
+ return_pointing_metadata: bool = False,
872
  **kwargs,
873
  ) -> BatchFeature:
874
  """
 
957
  pixel_values_videos = np.concatenate(batch_crops, 0)
958
  video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
959
 
960
+ data = BatchFeature(dict(
961
  pixel_values_videos=pixel_values_videos,
962
  video_token_pooling=video_token_pooling,
963
  video_grids=video_grids,
964
+ ), tensor_type=return_tensors)
965
+ if return_pointing_metadata:
966
+ t = pixel_values_videos.shape[0]
967
+ assert base_image_input_size[0] % self.patch_size == 0
968
+ assert base_image_input_size[1] % self.patch_size == 0
969
+ crop_w = base_image_input_size[0] // self.patch_size
970
+ crop_h = base_image_input_size[1] // self.patch_size
971
+ data["subpatch_mapping"] = np.arange(t*crop_w*crop_h).reshape([t, crop_h, crop_w])
972
+ data["video_token_pooling_np"] = video_token_pooling
973
+ return data
974
 
975
 
976
  Molmo2VideoProcessor.register_for_auto_class()