Upload folder using huggingface_hub
Browse files
- config.json +1 -0
- configuration_molmo_point.py +2 -0
- convert_molmo_point_to_hf.py +2 -0
- image_processing_molmo2.py +9 -2
- modeling_molmo_point.py +5 -11
- processing_molmo2.py +7 -1
- processor_config.json +1 -0
config.json
CHANGED
|
@@ -36,6 +36,7 @@
|
|
| 36 |
"image_col_id": 151939,
|
| 37 |
"image_end_token_id": 151937,
|
| 38 |
"image_high_res_id": 151938,
|
|
|
|
| 39 |
"image_patch_id": 151938,
|
| 40 |
"image_start_token_id": 151936,
|
| 41 |
"initializer_range": 0.02,
|
|
|
|
| 36 |
"image_col_id": 151939,
|
| 37 |
"image_end_token_id": 151937,
|
| 38 |
"image_high_res_id": 151938,
|
| 39 |
+
"image_non_indexable_patch_id": 151942,
|
| 40 |
"image_patch_id": 151938,
|
| 41 |
"image_start_token_id": 151936,
|
| 42 |
"initializer_range": 0.02,
|
configuration_molmo_point.py
CHANGED
|
@@ -141,6 +141,7 @@ class MolmoPointConfig(PretrainedConfig):
|
|
| 141 |
low_res_image_start_token_id: int = None,
|
| 142 |
image_end_token_id: int = None,
|
| 143 |
image_patch_id: int = None,
|
|
|
|
| 144 |
image_col_id: int = None,
|
| 145 |
frame_start_token_id: int = None,
|
| 146 |
frame_end_token_id: int = None,
|
|
@@ -190,6 +191,7 @@ class MolmoPointConfig(PretrainedConfig):
|
|
| 190 |
self.low_res_image_start_token_id = low_res_image_start_token_id
|
| 191 |
self.image_end_token_id = image_end_token_id
|
| 192 |
self.image_high_res_id = image_patch_id
|
|
|
|
| 193 |
self.image_patch_id = image_patch_id
|
| 194 |
self.image_col_id = image_col_id
|
| 195 |
self.frame_start_token_id = frame_start_token_id
|
|
|
|
| 141 |
low_res_image_start_token_id: int = None,
|
| 142 |
image_end_token_id: int = None,
|
| 143 |
image_patch_id: int = None,
|
| 144 |
+
image_non_indexable_patch_id: int = None,
|
| 145 |
image_col_id: int = None,
|
| 146 |
frame_start_token_id: int = None,
|
| 147 |
frame_end_token_id: int = None,
|
|
|
|
| 191 |
self.low_res_image_start_token_id = low_res_image_start_token_id
|
| 192 |
self.image_end_token_id = image_end_token_id
|
| 193 |
self.image_high_res_id = image_patch_id
|
| 194 |
+
self.image_non_indexable_patch_id = image_non_indexable_patch_id
|
| 195 |
self.image_patch_id = image_patch_id
|
| 196 |
self.image_col_id = image_col_id
|
| 197 |
self.frame_start_token_id = frame_start_token_id
|
convert_molmo_point_to_hf.py
CHANGED
|
@@ -170,6 +170,7 @@ def convert_config(
|
|
| 170 |
patch_token_id=tokenizer.token_index_token_id,
|
| 171 |
location_token_id=tokenizer.subpatch_loc_token_id,
|
| 172 |
subpatch_token_id=tokenizer.subpatch_index_token_id,
|
|
|
|
| 173 |
frame_start_token_id=frame_start_token_id,
|
| 174 |
frame_end_token_id=frame_end_token_id,
|
| 175 |
use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
|
|
@@ -382,6 +383,7 @@ def save(
|
|
| 382 |
use_single_crop_start_token=use_single_crop_start_token,
|
| 383 |
video_use_col_tokens=False,
|
| 384 |
use_frame_special_tokens=use_frame_special_tokens,
|
|
|
|
| 385 |
)
|
| 386 |
processor.audio_tokenizer = None
|
| 387 |
processor.save_pretrained(output_dir)
|
|
|
|
| 170 |
patch_token_id=tokenizer.token_index_token_id,
|
| 171 |
location_token_id=tokenizer.subpatch_loc_token_id,
|
| 172 |
subpatch_token_id=tokenizer.subpatch_index_token_id,
|
| 173 |
+
image_non_indexable_patch_id=tokenizer.image_low_res_token_id,
|
| 174 |
frame_start_token_id=frame_start_token_id,
|
| 175 |
frame_end_token_id=frame_end_token_id,
|
| 176 |
use_frame_special_tokens=model_config.mm_preprocessor.video.use_frame_special_tokens,
|
|
|
|
| 383 |
use_single_crop_start_token=use_single_crop_start_token,
|
| 384 |
video_use_col_tokens=False,
|
| 385 |
use_frame_special_tokens=use_frame_special_tokens,
|
| 386 |
+
use_low_res_token_for_global_crops=True
|
| 387 |
)
|
| 388 |
processor.audio_tokenizer = None
|
| 389 |
processor.save_pretrained(output_dir)
|
image_processing_molmo2.py
CHANGED
|
@@ -480,6 +480,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 480 |
|
| 481 |
data = {}
|
| 482 |
patch_mappings = []
|
|
|
|
|
|
|
| 483 |
if images is not None:
|
| 484 |
batch_grids = []
|
| 485 |
batch_crops = []
|
|
@@ -503,7 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 503 |
batch_crops.append(crops)
|
| 504 |
batch_pooled_patches_idx.append(pooled_idx)
|
| 505 |
batch_num_crops.append(crops.shape[0])
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
pixel_values = np.concatenate(batch_crops, 0)
|
| 509 |
image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
|
|
@@ -519,7 +526,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
|
|
| 519 |
|
| 520 |
data = BatchFeature(data, tensor_type=return_tensors)
|
| 521 |
if return_pointing_metadata:
|
| 522 |
-
data["image_token_pooling_np"] =
|
| 523 |
data["subpatch_mapping"] = patch_mappings
|
| 524 |
data["image_sizes"] = [x.shape[:2][::-1] for x in images]
|
| 525 |
return data
|
|
|
|
| 480 |
|
| 481 |
data = {}
|
| 482 |
patch_mappings = []
|
| 483 |
+
absolute_token_pooling = []
|
| 484 |
+
offset = 0
|
| 485 |
if images is not None:
|
| 486 |
batch_grids = []
|
| 487 |
batch_crops = []
|
|
|
|
| 505 |
batch_crops.append(crops)
|
| 506 |
batch_pooled_patches_idx.append(pooled_idx)
|
| 507 |
batch_num_crops.append(crops.shape[0])
|
| 508 |
+
if return_pointing_metadata:
|
| 509 |
+
absolute_token_pooling.append(
|
| 510 |
+
np.where(pooled_idx >= 0, pooled_idx + offset, -1))
|
| 511 |
+
patch_mappings.append(patch_mapping + offset)
|
| 512 |
+
n_patches = np.prod(crops.shape[:2])
|
| 513 |
+
offset += n_patches
|
| 514 |
|
| 515 |
pixel_values = np.concatenate(batch_crops, 0)
|
| 516 |
image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
|
|
|
|
| 526 |
|
| 527 |
data = BatchFeature(data, tensor_type=return_tensors)
|
| 528 |
if return_pointing_metadata:
|
| 529 |
+
data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
|
| 530 |
data["subpatch_mapping"] = patch_mappings
|
| 531 |
data["image_sizes"] = [x.shape[:2][::-1] for x in images]
|
| 532 |
return data
|
modeling_molmo_point.py
CHANGED
|
@@ -1312,10 +1312,12 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
|
|
| 1312 |
batch_idx = torch.arange(batch_size, device=self.device)
|
| 1313 |
|
| 1314 |
# TODO update embeddings for patch/subpatch tokens
|
| 1315 |
-
|
| 1316 |
vit_features_flat: Optional[torch.FloatTensor] = None
|
| 1317 |
if images is not None:
|
| 1318 |
-
|
|
|
|
|
|
|
|
|
|
| 1319 |
images = images.to(device=self.device, dtype=self.dtype)
|
| 1320 |
B, T, N, D = images.shape
|
| 1321 |
images = images.view(B * T, N, D)
|
|
@@ -1346,15 +1348,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
|
|
| 1346 |
|
| 1347 |
# Build position ids for the image features, which we might need for rotary
|
| 1348 |
# embeddings
|
| 1349 |
-
image_token_indices = torch.cumsum(
|
| 1350 |
-
if image_grids is not None:
|
| 1351 |
-
# Global crop is always the first 196 images tokens and cannot be pointed to
|
| 1352 |
-
is_indexable_image_token = is_image_token & (image_token_indices >= 196)
|
| 1353 |
-
is_non_indexable_image_token = is_image_token & (image_token_indices < 196)
|
| 1354 |
-
image_token_indices = torch.clip(image_token_indices - 196, min=0)
|
| 1355 |
-
else:
|
| 1356 |
-
is_indexable_image_token = is_image_token
|
| 1357 |
-
is_non_indexable_image_token = torch.zeros_like(is_indexable_image_token)
|
| 1358 |
image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
|
| 1359 |
image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
|
| 1360 |
image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
|
|
|
|
| 1312 |
batch_idx = torch.arange(batch_size, device=self.device)
|
| 1313 |
|
| 1314 |
# TODO update embeddings for patch/subpatch tokens
|
|
|
|
| 1315 |
vit_features_flat: Optional[torch.FloatTensor] = None
|
| 1316 |
if images is not None:
|
| 1317 |
+
is_indexable_image_token = input_ids == self.config.image_patch_id
|
| 1318 |
+
is_non_indexable_image_token = input_ids == self.config.image_non_indexable_patch_id
|
| 1319 |
+
is_image_token = is_indexable_image_token | is_non_indexable_image_token
|
| 1320 |
+
|
| 1321 |
images = images.to(device=self.device, dtype=self.dtype)
|
| 1322 |
B, T, N, D = images.shape
|
| 1323 |
images = images.view(B * T, N, D)
|
|
|
|
| 1348 |
|
| 1349 |
# Build position ids for the image features, which we might need for rotary
|
| 1350 |
# embeddings
|
| 1351 |
+
image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1352 |
image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
|
| 1353 |
image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
|
| 1354 |
image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
|
processing_molmo2.py
CHANGED
|
@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
|
|
| 88 |
use_single_crop_start_token: Optional[bool] = True,
|
| 89 |
video_use_col_tokens: Optional[bool] = False,
|
| 90 |
use_frame_special_tokens: Optional[bool] = True,
|
|
|
|
| 91 |
**kwargs
|
| 92 |
) -> None:
|
| 93 |
super().__init__(
|
|
@@ -107,6 +108,7 @@ class Molmo2Processor(ProcessorMixin):
|
|
| 107 |
tokenizer.convert_tokens_to_ids(token)
|
| 108 |
for token in IMAGE_TOKENS
|
| 109 |
]
|
|
|
|
| 110 |
self._patch_metadata = None
|
| 111 |
|
| 112 |
def get_image_tokens(self, image_grid: np.ndarray):
|
|
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
|
|
| 119 |
np.tile(per_row, [height]),
|
| 120 |
[IM_END_TOKEN],
|
| 121 |
]
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
| 123 |
use_single_crop_col_tokens = (
|
| 124 |
self.image_use_col_tokens
|
| 125 |
if self.use_single_crop_col_tokens is None
|
|
@@ -248,6 +253,7 @@ class Molmo2Processor(ProcessorMixin):
|
|
| 248 |
images: ImageInput = None,
|
| 249 |
videos: VideoInput = None,
|
| 250 |
return_pointing_metadata: bool = False,
|
|
|
|
| 251 |
**kwargs: Unpack[Molmo2ProcessorKwargs],
|
| 252 |
) -> BatchFeature:
|
| 253 |
"""
|
|
|
|
| 88 |
use_single_crop_start_token: Optional[bool] = True,
|
| 89 |
video_use_col_tokens: Optional[bool] = False,
|
| 90 |
use_frame_special_tokens: Optional[bool] = True,
|
| 91 |
+
use_low_res_token_for_global_crops: bool = False,
|
| 92 |
**kwargs
|
| 93 |
) -> None:
|
| 94 |
super().__init__(
|
|
|
|
| 108 |
tokenizer.convert_tokens_to_ids(token)
|
| 109 |
for token in IMAGE_TOKENS
|
| 110 |
]
|
| 111 |
+
self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
|
| 112 |
self._patch_metadata = None
|
| 113 |
|
| 114 |
def get_image_tokens(self, image_grid: np.ndarray):
|
|
|
|
| 121 |
np.tile(per_row, [height]),
|
| 122 |
[IM_END_TOKEN],
|
| 123 |
]
|
| 124 |
+
if self.use_low_res_token_for_global_crops:
|
| 125 |
+
per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
|
| 126 |
+
else:
|
| 127 |
+
per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
|
| 128 |
use_single_crop_col_tokens = (
|
| 129 |
self.image_use_col_tokens
|
| 130 |
if self.use_single_crop_col_tokens is None
|
|
|
|
| 253 |
images: ImageInput = None,
|
| 254 |
videos: VideoInput = None,
|
| 255 |
return_pointing_metadata: bool = False,
|
| 256 |
+
use_low_res_token_for_global_crops: bool = False,
|
| 257 |
**kwargs: Unpack[Molmo2ProcessorKwargs],
|
| 258 |
) -> BatchFeature:
|
| 259 |
"""
|
processor_config.json
CHANGED
|
@@ -5,6 +5,7 @@
|
|
| 5 |
"image_use_col_tokens": true,
|
| 6 |
"processor_class": "Molmo2Processor",
|
| 7 |
"use_frame_special_tokens": true,
|
|
|
|
| 8 |
"use_single_crop_col_tokens": false,
|
| 9 |
"use_single_crop_start_token": true,
|
| 10 |
"video_use_col_tokens": false
|
|
|
|
| 5 |
"image_use_col_tokens": true,
|
| 6 |
"processor_class": "Molmo2Processor",
|
| 7 |
"use_frame_special_tokens": true,
|
| 8 |
+
"use_low_res_token_for_global_crops": true,
|
| 9 |
"use_single_crop_col_tokens": false,
|
| 10 |
"use_single_crop_start_token": true,
|
| 11 |
"video_use_col_tokens": false
|