Upload folder using huggingface_hub

Browse files

Files changed (7)

config.json +1 -0
configuration_molmo_point.py +2 -0
convert_molmo_point_to_hf.py +2 -0
image_processing_molmo2.py +9 -2
modeling_molmo_point.py +5 -11
processing_molmo2.py +7 -1
processor_config.json +1 -0

config.json CHANGED Viewed

@@ -36,6 +36,7 @@
   "image_col_id": 151939,
   "image_end_token_id": 151937,
   "image_high_res_id": 151938,
   "image_patch_id": 151938,
   "image_start_token_id": 151936,
   "initializer_range": 0.02,

   "image_col_id": 151939,
   "image_end_token_id": 151937,
   "image_high_res_id": 151938,
+  "image_non_indexable_patch_id": 151942,
   "image_patch_id": 151938,
   "image_start_token_id": 151936,
   "initializer_range": 0.02,

configuration_molmo_point.py CHANGED Viewed

@@ -141,6 +141,7 @@ class MolmoPointConfig(PretrainedConfig):
         low_res_image_start_token_id: int = None,
         image_end_token_id: int = None,
         image_patch_id: int = None,
         image_col_id: int = None,
         frame_start_token_id: int = None,
         frame_end_token_id: int = None,
@@ -190,6 +191,7 @@ class MolmoPointConfig(PretrainedConfig):
         self.low_res_image_start_token_id = low_res_image_start_token_id
         self.image_end_token_id = image_end_token_id
         self.image_high_res_id = image_patch_id
         self.image_patch_id = image_patch_id
         self.image_col_id = image_col_id
         self.frame_start_token_id = frame_start_token_id

         low_res_image_start_token_id: int = None,
         image_end_token_id: int = None,
         image_patch_id: int = None,
+        image_non_indexable_patch_id: int = None,
         image_col_id: int = None,
         frame_start_token_id: int = None,
         frame_end_token_id: int = None,
         self.low_res_image_start_token_id = low_res_image_start_token_id
         self.image_end_token_id = image_end_token_id
         self.image_high_res_id = image_patch_id
+        self.image_non_indexable_patch_id = image_non_indexable_patch_id
         self.image_patch_id = image_patch_id
         self.image_col_id = image_col_id
         self.frame_start_token_id = frame_start_token_id

convert_molmo_point_to_hf.py CHANGED Viewed

@@ -170,6 +170,7 @@ def convert_config(
         patch_token_id=tokenizer.token_index_token_id,
         location_token_id=tokenizer.subpatch_loc_token_id,
         subpatch_token_id=tokenizer.subpatch_index_token_id,
         frame_start_token_id=frame_start_token_id,
         frame_end_token_id=frame_end_token_id,
         use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
@@ -382,6 +383,7 @@ def save(
         use_single_crop_start_token=use_single_crop_start_token,
         video_use_col_tokens=False,
         use_frame_special_tokens=use_frame_special_tokens,
     )
     processor.audio_tokenizer = None
     processor.save_pretrained(output_dir)

         patch_token_id=tokenizer.token_index_token_id,
         location_token_id=tokenizer.subpatch_loc_token_id,
         subpatch_token_id=tokenizer.subpatch_index_token_id,
+        image_non_indexable_patch_id=tokenizer.image_low_res_token_id,
         frame_start_token_id=frame_start_token_id,
         frame_end_token_id=frame_end_token_id,
         use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
         use_single_crop_start_token=use_single_crop_start_token,
         video_use_col_tokens=False,
         use_frame_special_tokens=use_frame_special_tokens,
+        use_low_res_token_for_global_crops=True
     )
     processor.audio_tokenizer = None
     processor.save_pretrained(output_dir)

image_processing_molmo2.py CHANGED Viewed

@@ -480,6 +480,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
         data = {}
         patch_mappings = []
         if images is not None:
             batch_grids = []
             batch_crops = []
@@ -503,7 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                 batch_crops.append(crops)
                 batch_pooled_patches_idx.append(pooled_idx)
                 batch_num_crops.append(crops.shape[0])
-                patch_mappings.append(patch_mapping)
             pixel_values = np.concatenate(batch_crops, 0)
             image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
@@ -519,7 +526,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
         data = BatchFeature(data, tensor_type=return_tensors)
         if return_pointing_metadata:
-            data["image_token_pooling_np"] = image_token_pooling if len(images) else None
             data["subpatch_mapping"] = patch_mappings
             data["image_sizes"] = [x.shape[:2][::-1] for x in images]
         return data

         data = {}
         patch_mappings = []
+        absolute_token_pooling = []
+        offset = 0
         if images is not None:
             batch_grids = []
             batch_crops = []
                 batch_crops.append(crops)
                 batch_pooled_patches_idx.append(pooled_idx)
                 batch_num_crops.append(crops.shape[0])
+                if return_pointing_metadata:
+                    absolute_token_pooling.append(
+                        np.where(pooled_idx >= 0, pooled_idx + offset, -1))
+                    patch_mappings.append(patch_mapping + offset)
+                    n_patches = np.prod(crops.shape[:2])
+                    offset += n_patches
             pixel_values = np.concatenate(batch_crops, 0)
             image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
         data = BatchFeature(data, tensor_type=return_tensors)
         if return_pointing_metadata:
+            data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
             data["subpatch_mapping"] = patch_mappings
             data["image_sizes"] = [x.shape[:2][::-1] for x in images]
         return data

modeling_molmo_point.py CHANGED Viewed

@@ -1312,10 +1312,12 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
         batch_idx = torch.arange(batch_size, device=self.device)
         # TODO update embeddings for patch/subpatch tokens
         vit_features_flat: Optional[torch.FloatTensor] = None
         if images is not None:
-            is_image_token = input_ids == self.config.image_patch_id
             images = images.to(device=self.device, dtype=self.dtype)
             B, T, N, D = images.shape
             images = images.view(B * T, N, D)
@@ -1346,15 +1348,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
             # Build position ids for the image features, which we might need for rotary
             # embeddings
-            image_token_indices = torch.cumsum(is_image_token, dim=-1) - 1
-            if image_grids is not None:
-                # Global crop is always the first 196 images tokens and cannot be pointed to
-                is_indexable_image_token = is_image_token & (image_token_indices >= 196)
-                is_non_indexable_image_token = is_image_token & (image_token_indices < 196)
-                image_token_indices = torch.clip(image_token_indices - 196, min=0)
-            else:
-                is_indexable_image_token = is_image_token
-                is_non_indexable_image_token = torch.zeros_like(is_indexable_image_token)
             image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
             image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
             image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat

         batch_idx = torch.arange(batch_size, device=self.device)
         # TODO update embeddings for patch/subpatch tokens
         vit_features_flat: Optional[torch.FloatTensor] = None
         if images is not None:
+            is_indexable_image_token = input_ids == self.config.image_patch_id
+            is_non_indexable_image_token = input_ids == self.config.image_non_indexable_patch_id
+            is_image_token = is_indexable_image_token | is_non_indexable_image_token
             images = images.to(device=self.device, dtype=self.dtype)
             B, T, N, D = images.shape
             images = images.view(B * T, N, D)
             # Build position ids for the image features, which we might need for rotary
             # embeddings
+            image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
             image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
             image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
             image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat

processing_molmo2.py CHANGED Viewed

@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
         use_single_crop_start_token: Optional[bool] = True,
         video_use_col_tokens: Optional[bool] = False,
         use_frame_special_tokens: Optional[bool] = True,
         **kwargs
     ) -> None:
         super().__init__(
@@ -107,6 +108,7 @@ class Molmo2Processor(ProcessorMixin):
             tokenizer.convert_tokens_to_ids(token)
             for token in IMAGE_TOKENS
         ]
         self._patch_metadata = None
     def get_image_tokens(self, image_grid: np.ndarray):
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
             np.tile(per_row, [height]),
             [IM_END_TOKEN],
         ]
-        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
         use_single_crop_col_tokens = (
             self.image_use_col_tokens
             if self.use_single_crop_col_tokens is None
@@ -248,6 +253,7 @@ class Molmo2Processor(ProcessorMixin):
         images: ImageInput = None,
         videos: VideoInput = None,
         return_pointing_metadata: bool = False,
         **kwargs: Unpack[Molmo2ProcessorKwargs],
     ) -> BatchFeature:
         """

         use_single_crop_start_token: Optional[bool] = True,
         video_use_col_tokens: Optional[bool] = False,
         use_frame_special_tokens: Optional[bool] = True,
+        use_low_res_token_for_global_crops: bool = False,
         **kwargs
     ) -> None:
         super().__init__(
             tokenizer.convert_tokens_to_ids(token)
             for token in IMAGE_TOKENS
         ]
+        self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
         self._patch_metadata = None
     def get_image_tokens(self, image_grid: np.ndarray):
             np.tile(per_row, [height]),
             [IM_END_TOKEN],
         ]
+        if self.use_low_res_token_for_global_crops:
+            per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
+        else:
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
         use_single_crop_col_tokens = (
             self.image_use_col_tokens
             if self.use_single_crop_col_tokens is None
         images: ImageInput = None,
         videos: VideoInput = None,
         return_pointing_metadata: bool = False,
+        use_low_res_token_for_global_crops: bool = False,
         **kwargs: Unpack[Molmo2ProcessorKwargs],
     ) -> BatchFeature:
         """

processor_config.json CHANGED Viewed

@@ -5,6 +5,7 @@
   "image_use_col_tokens": true,
   "processor_class": "Molmo2Processor",
   "use_frame_special_tokens": true,
   "use_single_crop_col_tokens": false,
   "use_single_crop_start_token": true,
   "video_use_col_tokens": false

   "image_use_col_tokens": true,
   "processor_class": "Molmo2Processor",
   "use_frame_special_tokens": true,
+  "use_low_res_token_for_global_crops": true,
   "use_single_crop_col_tokens": false,
   "use_single_crop_start_token": true,
   "video_use_col_tokens": false