chrisc36 committed on
Commit
807b58f
·
verified ·
1 Parent(s): 4d0968d

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -36,6 +36,7 @@
36
  "image_col_id": 151939,
37
  "image_end_token_id": 151937,
38
  "image_high_res_id": 151938,
 
39
  "image_patch_id": 151938,
40
  "image_start_token_id": 151936,
41
  "initializer_range": 0.02,
 
36
  "image_col_id": 151939,
37
  "image_end_token_id": 151937,
38
  "image_high_res_id": 151938,
39
+ "image_non_indexable_patch_id": 151942,
40
  "image_patch_id": 151938,
41
  "image_start_token_id": 151936,
42
  "initializer_range": 0.02,
configuration_molmo_point.py CHANGED
@@ -141,6 +141,7 @@ class MolmoPointConfig(PretrainedConfig):
141
  low_res_image_start_token_id: int = None,
142
  image_end_token_id: int = None,
143
  image_patch_id: int = None,
 
144
  image_col_id: int = None,
145
  frame_start_token_id: int = None,
146
  frame_end_token_id: int = None,
@@ -190,6 +191,7 @@ class MolmoPointConfig(PretrainedConfig):
190
  self.low_res_image_start_token_id = low_res_image_start_token_id
191
  self.image_end_token_id = image_end_token_id
192
  self.image_high_res_id = image_patch_id
 
193
  self.image_patch_id = image_patch_id
194
  self.image_col_id = image_col_id
195
  self.frame_start_token_id = frame_start_token_id
 
141
  low_res_image_start_token_id: int = None,
142
  image_end_token_id: int = None,
143
  image_patch_id: int = None,
144
+ image_non_indexable_patch_id: int = None,
145
  image_col_id: int = None,
146
  frame_start_token_id: int = None,
147
  frame_end_token_id: int = None,
 
191
  self.low_res_image_start_token_id = low_res_image_start_token_id
192
  self.image_end_token_id = image_end_token_id
193
  self.image_high_res_id = image_patch_id
194
+ self.image_non_indexable_patch_id = image_non_indexable_patch_id
195
  self.image_patch_id = image_patch_id
196
  self.image_col_id = image_col_id
197
  self.frame_start_token_id = frame_start_token_id
convert_molmo_point_to_hf.py CHANGED
@@ -170,6 +170,7 @@ def convert_config(
170
  patch_token_id=tokenizer.token_index_token_id,
171
  location_token_id=tokenizer.subpatch_loc_token_id,
172
  subpatch_token_id=tokenizer.subpatch_index_token_id,
 
173
  frame_start_token_id=frame_start_token_id,
174
  frame_end_token_id=frame_end_token_id,
175
  use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
@@ -382,6 +383,7 @@ def save(
382
  use_single_crop_start_token=use_single_crop_start_token,
383
  video_use_col_tokens=False,
384
  use_frame_special_tokens=use_frame_special_tokens,
 
385
  )
386
  processor.audio_tokenizer = None
387
  processor.save_pretrained(output_dir)
 
170
  patch_token_id=tokenizer.token_index_token_id,
171
  location_token_id=tokenizer.subpatch_loc_token_id,
172
  subpatch_token_id=tokenizer.subpatch_index_token_id,
173
+ image_non_indexable_patch_id=tokenizer.image_low_res_token_id,
174
  frame_start_token_id=frame_start_token_id,
175
  frame_end_token_id=frame_end_token_id,
176
  use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
 
383
  use_single_crop_start_token=use_single_crop_start_token,
384
  video_use_col_tokens=False,
385
  use_frame_special_tokens=use_frame_special_tokens,
386
+ use_low_res_token_for_global_crops=True
387
  )
388
  processor.audio_tokenizer = None
389
  processor.save_pretrained(output_dir)
image_processing_molmo2.py CHANGED
@@ -480,6 +480,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
480
 
481
  data = {}
482
  patch_mappings = []
 
 
483
  if images is not None:
484
  batch_grids = []
485
  batch_crops = []
@@ -503,7 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
503
  batch_crops.append(crops)
504
  batch_pooled_patches_idx.append(pooled_idx)
505
  batch_num_crops.append(crops.shape[0])
506
- patch_mappings.append(patch_mapping)
 
 
 
 
 
507
 
508
  pixel_values = np.concatenate(batch_crops, 0)
509
  image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
@@ -519,7 +526,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
519
 
520
  data = BatchFeature(data, tensor_type=return_tensors)
521
  if return_pointing_metadata:
522
- data["image_token_pooling_np"] = image_token_pooling if len(images) else None
523
  data["subpatch_mapping"] = patch_mappings
524
  data["image_sizes"] = [x.shape[:2][::-1] for x in images]
525
  return data
 
480
 
481
  data = {}
482
  patch_mappings = []
483
+ absolute_token_pooling = []
484
+ offset = 0
485
  if images is not None:
486
  batch_grids = []
487
  batch_crops = []
 
505
  batch_crops.append(crops)
506
  batch_pooled_patches_idx.append(pooled_idx)
507
  batch_num_crops.append(crops.shape[0])
508
+ if return_pointing_metadata:
509
+ absolute_token_pooling.append(
510
+ np.where(pooled_idx >= 0, pooled_idx + offset, -1))
511
+ patch_mappings.append(patch_mapping + offset)
512
+ n_patches = np.prod(crops.shape[:2])
513
+ offset += n_patches
514
 
515
  pixel_values = np.concatenate(batch_crops, 0)
516
  image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
 
526
 
527
  data = BatchFeature(data, tensor_type=return_tensors)
528
  if return_pointing_metadata:
529
+ data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
530
  data["subpatch_mapping"] = patch_mappings
531
  data["image_sizes"] = [x.shape[:2][::-1] for x in images]
532
  return data
modeling_molmo_point.py CHANGED
@@ -1312,10 +1312,12 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1312
  batch_idx = torch.arange(batch_size, device=self.device)
1313
 
1314
  # TODO update embeddings for patch/subpatch tokens
1315
-
1316
  vit_features_flat: Optional[torch.FloatTensor] = None
1317
  if images is not None:
1318
- is_image_token = input_ids == self.config.image_patch_id
 
 
 
1319
  images = images.to(device=self.device, dtype=self.dtype)
1320
  B, T, N, D = images.shape
1321
  images = images.view(B * T, N, D)
@@ -1346,15 +1348,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1346
 
1347
  # Build position ids for the image features, which we might need for rotary
1348
  # embeddings
1349
- image_token_indices = torch.cumsum(is_image_token, dim=-1) - 1
1350
- if image_grids is not None:
1351
- # Global crop is always the first 196 images tokens and cannot be pointed to
1352
- is_indexable_image_token = is_image_token & (image_token_indices >= 196)
1353
- is_non_indexable_image_token = is_image_token & (image_token_indices < 196)
1354
- image_token_indices = torch.clip(image_token_indices - 196, min=0)
1355
- else:
1356
- is_indexable_image_token = is_image_token
1357
- is_non_indexable_image_token = torch.zeros_like(is_indexable_image_token)
1358
  image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
1359
  image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
1360
  image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
 
1312
  batch_idx = torch.arange(batch_size, device=self.device)
1313
 
1314
  # TODO update embeddings for patch/subpatch tokens
 
1315
  vit_features_flat: Optional[torch.FloatTensor] = None
1316
  if images is not None:
1317
+ is_indexable_image_token = input_ids == self.config.image_patch_id
1318
+ is_non_indexable_image_token = input_ids == self.config.image_non_indexable_patch_id
1319
+ is_image_token = is_indexable_image_token | is_non_indexable_image_token
1320
+
1321
  images = images.to(device=self.device, dtype=self.dtype)
1322
  B, T, N, D = images.shape
1323
  images = images.view(B * T, N, D)
 
1348
 
1349
  # Build position ids for the image features, which we might need for rotary
1350
  # embeddings
1351
+ image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
 
 
 
 
 
 
 
 
1352
  image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
1353
  image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
1354
  image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
processing_molmo2.py CHANGED
@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
88
  use_single_crop_start_token: Optional[bool] = True,
89
  video_use_col_tokens: Optional[bool] = False,
90
  use_frame_special_tokens: Optional[bool] = True,
 
91
  **kwargs
92
  ) -> None:
93
  super().__init__(
@@ -107,6 +108,7 @@ class Molmo2Processor(ProcessorMixin):
107
  tokenizer.convert_tokens_to_ids(token)
108
  for token in IMAGE_TOKENS
109
  ]
 
110
  self._patch_metadata = None
111
 
112
  def get_image_tokens(self, image_grid: np.ndarray):
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
119
  np.tile(per_row, [height]),
120
  [IM_END_TOKEN],
121
  ]
122
- per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
 
 
 
123
  use_single_crop_col_tokens = (
124
  self.image_use_col_tokens
125
  if self.use_single_crop_col_tokens is None
@@ -248,6 +253,7 @@ class Molmo2Processor(ProcessorMixin):
248
  images: ImageInput = None,
249
  videos: VideoInput = None,
250
  return_pointing_metadata: bool = False,
 
251
  **kwargs: Unpack[Molmo2ProcessorKwargs],
252
  ) -> BatchFeature:
253
  """
 
88
  use_single_crop_start_token: Optional[bool] = True,
89
  video_use_col_tokens: Optional[bool] = False,
90
  use_frame_special_tokens: Optional[bool] = True,
91
+ use_low_res_token_for_global_crops: bool = False,
92
  **kwargs
93
  ) -> None:
94
  super().__init__(
 
108
  tokenizer.convert_tokens_to_ids(token)
109
  for token in IMAGE_TOKENS
110
  ]
111
+ self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
112
  self._patch_metadata = None
113
 
114
  def get_image_tokens(self, image_grid: np.ndarray):
 
121
  np.tile(per_row, [height]),
122
  [IM_END_TOKEN],
123
  ]
124
+ if self.use_low_res_token_for_global_crops:
125
+ per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
126
+ else:
127
+ per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
128
  use_single_crop_col_tokens = (
129
  self.image_use_col_tokens
130
  if self.use_single_crop_col_tokens is None
 
253
  images: ImageInput = None,
254
  videos: VideoInput = None,
255
  return_pointing_metadata: bool = False,
256
+ use_low_res_token_for_global_crops: bool = False,
257
  **kwargs: Unpack[Molmo2ProcessorKwargs],
258
  ) -> BatchFeature:
259
  """
processor_config.json CHANGED
@@ -5,6 +5,7 @@
5
  "image_use_col_tokens": true,
6
  "processor_class": "Molmo2Processor",
7
  "use_frame_special_tokens": true,
 
8
  "use_single_crop_col_tokens": false,
9
  "use_single_crop_start_token": true,
10
  "video_use_col_tokens": false
 
5
  "image_use_col_tokens": true,
6
  "processor_class": "Molmo2Processor",
7
  "use_frame_special_tokens": true,
8
+ "use_low_res_token_for_global_crops": true,
9
  "use_single_crop_col_tokens": false,
10
  "use_single_crop_start_token": true,
11
  "video_use_col_tokens": false