Re-converted with official allenai/molmo2 conversion script
- added_tokens.json +4 -0
- config.json +3 -3
- generation_config.json +1 -1
- image_processing_molmo2.py +25 -5
- processing_molmo2.py +33 -8
- processor_config.json +1 -0
- special_tokens_map.json +5 -1
- tokenizer.json +2 -2
- tokenizer_config.json +37 -1
- video_preprocessor_config.json +2 -1
- video_processing_molmo2.py +14 -5
added_tokens.json
CHANGED
@@ -27,14 +27,18 @@
   "<|image|>": 151941,
   "<|object_ref_end|>": 151647,
   "<|object_ref_start|>": 151646,
+  "<|points|>": 151946,
   "<|quad_end|>": 151651,
   "<|quad_start|>": 151650,
   "<|repo_name|>": 151663,
+  "<|token_index|>": 151947,
   "<|video_pad|>": 151656,
   "<|video|>": 151945,
   "<|vision_end|>": 151653,
   "<|vision_pad|>": 151654,
   "<|vision_start|>": 151652,
+  "<|vit_index|>": 151948,
+  "<|vit_loc|>": 151949,
   "|<EXTRA_TOKENS_0>|": 151669,
   "|<EXTRA_TOKENS_100>|": 151769,
   "|<EXTRA_TOKENS_101>|": 151770,
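The four new special tokens take ids 151946-151949. A quick post-conversion sanity check, as a minimal sketch (the local checkpoint path "./molmo2-checkpoint" is hypothetical; the expected ids come straight from the diff above):

    from transformers import AutoTokenizer

    # Hypothetical local path to this converted checkpoint.
    tok = AutoTokenizer.from_pretrained("./molmo2-checkpoint", trust_remote_code=True)

    # Expected ids taken directly from added_tokens.json.
    expected = {
        "<|points|>": 151946,
        "<|token_index|>": 151947,
        "<|vit_index|>": 151948,
        "<|vit_loc|>": 151949,
    }
    for token, token_id in expected.items():
        assert tok.convert_tokens_to_ids(token) == token_id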
config.json
CHANGED
@@ -50,7 +50,7 @@
     "initializer_range": 0.02,
     "intermediate_size": 9728,
     "layer_norm_eps": 1e-06,
-    "max_position_embeddings":
+    "max_position_embeddings": 16384,
     "model_type": "molmo2_text",
     "norm_after": false,
     "num_attention_heads": 32,
@@ -67,9 +67,9 @@
     "vocab_size": 151936
   },
   "tie_word_embeddings": false,
-  "transformers_version": "4.
+  "transformers_version": "4.56.2",
   "use_cache": true,
-  "use_frame_special_tokens":
+  "use_frame_special_tokens": false,
   "vit_config": {
     "attention_dropout": 0.0,
     "attn_implementation": "sdpa",
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "bos_token_id": 151645,
   "eos_token_id": 151645,
   "pad_token_id": 151643,
-  "transformers_version": "4.
+  "transformers_version": "4.56.2"
 }
image_processing_molmo2.py
CHANGED
@@ -259,12 +259,13 @@ def image_to_patches_and_grids(
     image_patch_size: int,
     image_pooling_w: int,
     image_pooling_h: int,
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     """
     :return image_grids, the shape of each (low-res, high-res) image after pooling
     :return crops, the image crops to process with the ViT
     :return pooled_patch_idx, for each patch_id token in `image_tokens`, the indices of the
         patches in `crops` to pool for that token, masked with -1
+    :return patch_idx_arr, maps patch coordinates to patch ids
     """
     if isinstance(base_image_input_size, int):
         base_image_input_size = (base_image_input_size, base_image_input_size)
@@ -298,6 +299,7 @@ def image_to_patches_and_grids(
         image_std,
         image_patch_size,
     )
+    patch_idx_arr += crop_patch_h * crop_patch_w
    crop_arr = np.concatenate([resized, crop_arr], 0)

    resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
@@ -316,7 +318,8 @@ def image_to_patches_and_grids(
    return (
        np.stack(image_grid, 0),
        batch_pixels_to_patches(crop_arr, image_patch_size),
-        pooling_idx
+        pooling_idx,
+        patch_idx_arr
    )
@@ -395,6 +398,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_pointing_metadata: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -428,6 +432,8 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            return_pointing_metadata (bool, optional):
+                Whether to return patch mappings used for decoding MolmoPoint points

        Returns:
            A `BatchFeature` containing the following keys:
@@ -473,6 +479,9 @@ class Molmo2ImageProcessor(BaseImageProcessor):
        images = [to_numpy_array(image) for image in images]

        data = {}
+        patch_mappings = []
+        absolute_token_pooling = []
+        offset = 0
        if images is not None:
            batch_grids = []
            batch_crops = []
@@ -480,7 +489,7 @@ class Molmo2ImageProcessor(BaseImageProcessor):
            batch_num_crops = []

            for image in images:
-                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                image_grid, crops, pooled_idx, patch_mapping = image_to_patches_and_grids(
                    image,
                    max_crops,
                    overlap_margins,
@@ -496,6 +505,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                batch_crops.append(crops)
                batch_pooled_patches_idx.append(pooled_idx)
                batch_num_crops.append(crops.shape[0])
+                if return_pointing_metadata:
+                    absolute_token_pooling.append(
+                        np.where(pooled_idx >= 0, pooled_idx + offset, -1))
+                    patch_mappings.append(patch_mapping + offset)
+                    n_patches = np.prod(crops.shape[:2])
+                    offset += n_patches

            pixel_values = np.concatenate(batch_crops, 0)
            image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
@@ -509,7 +524,12 @@ class Molmo2ImageProcessor(BaseImageProcessor):
                image_num_crops=image_num_crops,
            )

-        return BatchFeature(data, tensor_type=return_tensors)
+        data = BatchFeature(data, tensor_type=return_tensors)
+        if return_pointing_metadata:
+            data["image_token_pooling_np"] = np.concatenate(absolute_token_pooling, 0) if len(images) else None
+            data["subpatch_mapping"] = patch_mappings
+            data["image_sizes"] = [x.shape[:2][::-1] for x in images]
+        return data


-Molmo2ImageProcessor.register_for_auto_class()
+Molmo2ImageProcessor.register_for_auto_class()
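The new offset bookkeeping in Molmo2ImageProcessor rebases each image's pooled patch indices into absolute indices over the concatenated crops, while keeping -1 as the mask value. A self-contained numpy sketch of that rebasing, with toy shapes standing in for the real crop geometry:

    import numpy as np

    # Toy pooled indices for two images; -1 marks masked slots that must stay -1.
    pooled_per_image = [np.array([0, 3, -1, 5]), np.array([1, -1, 2])]
    n_patches_per_image = [8, 4]  # per image: np.prod(crops.shape[:2]) in the real code

    absolute, offset = [], 0
    for pooled_idx, n_patches in zip(pooled_per_image, n_patches_per_image):
        # Shift valid indices by the running offset; leave masked entries at -1.
        absolute.append(np.where(pooled_idx >= 0, pooled_idx + offset, -1))
        offset += n_patches

    print(np.concatenate(absolute))  # [ 0  3 -1  5  9 -1 10]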
processing_molmo2.py
CHANGED
@@ -11,7 +11,7 @@ from transformers.video_utils import VideoInput
 from transformers.processing_utils import (
     Unpack,
     ProcessingKwargs,
-    ProcessorMixin,
+    ProcessorMixin, AllKwargsForChatTemplate,
 )
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
@@ -88,6 +88,7 @@ class Molmo2Processor(ProcessorMixin):
        use_single_crop_start_token: Optional[bool] = True,
        video_use_col_tokens: Optional[bool] = False,
        use_frame_special_tokens: Optional[bool] = True,
+        use_low_res_token_for_global_crops: bool = False,
        **kwargs
    ) -> None:
        super().__init__(
@@ -101,13 +102,14 @@ class Molmo2Processor(ProcessorMixin):
            video_use_col_tokens=video_use_col_tokens,
            use_frame_special_tokens=use_frame_special_tokens,
        )
-
        self.image_placeholder_token = IMAGE_PROMPT
        self.video_placeholder_token = VIDEO_PROMPT
        self.image_token_ids = [
            tokenizer.convert_tokens_to_ids(token)
            for token in IMAGE_TOKENS
        ]
+        self.use_low_res_token_for_global_crops = use_low_res_token_for_global_crops
+        self._patch_metadata = None

    def get_image_tokens(self, image_grid: np.ndarray):
        resized_h, resized_w, height, width = image_grid
@@ -119,7 +121,10 @@ class Molmo2Processor(ProcessorMixin):
            np.tile(per_row, [height]),
            [IM_END_TOKEN],
        ]
-        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
+        if self.use_low_res_token_for_global_crops:
+            per_row = np.full(resized_w, IMAGE_LOW_RES_TOKEN)
+        else:
+            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
        use_single_crop_col_tokens = (
            self.image_use_col_tokens
            if self.use_single_crop_col_tokens is None
@@ -247,6 +252,8 @@ class Molmo2Processor(ProcessorMixin):
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: ImageInput = None,
        videos: VideoInput = None,
+        return_pointing_metadata: bool = False,
+        use_low_res_token_for_global_crops: bool = False,
        **kwargs: Unpack[Molmo2ProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -287,22 +294,37 @@ class Molmo2Processor(ProcessorMixin):
                Returned when `videos` is not `None`.
            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
        """
-
        output_kwargs = self._merge_kwargs(
            Molmo2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
-
+        patch_metadata = {}
        if images is not None:
-            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"],
+                                                return_pointing_metadata=return_pointing_metadata)
+            if return_pointing_metadata:
+                patch_metadata["token_pooling"] = image_inputs.pop("image_token_pooling_np")
+                patch_metadata["subpatch_mapping"] = image_inputs.pop("subpatch_mapping")
+                patch_metadata["image_sizes"] = image_inputs.pop("image_sizes")
            image_grids = image_inputs["image_grids"]
        else:
            image_inputs = {}
            image_grids = None

        if videos is not None:
-            videos_inputs = self.video_processor(
+            videos_inputs = self.video_processor(
+                videos=videos, **output_kwargs["videos_kwargs"],
+                return_pointing_metadata=return_pointing_metadata
+            )
+            if return_pointing_metadata:
+                assert len(videos_inputs['video_metadata']) == 1
+                vd_metadata = videos_inputs['video_metadata'][0]
+                patch_metadata["token_pooling"] = videos_inputs.pop("video_token_pooling_np")
+                patch_metadata["subpatch_mapping"] = videos_inputs.pop("subpatch_mapping")
+                patch_metadata["timestamps"] = vd_metadata.timestamps
+                patch_metadata["video_size"] = (vd_metadata.width, vd_metadata.height)
+
            video_grids = videos_inputs["video_grids"]
            # If user has not requested video metadata, pop it
            if "return_metadata" not in kwargs:
@@ -367,10 +389,13 @@ class Molmo2Processor(ProcessorMixin):
        text_inputs["input_ids"] = input_ids.tolist()
        text_inputs["attention_mask"] = attention_mask.tolist()

-        return BatchFeature(
+        features = BatchFeature(
            data={**text_inputs, **image_inputs, **videos_inputs},
            tensor_type=return_tensors,
        )
+        if return_pointing_metadata:
+            features["metadata"] = patch_metadata
+        return features

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
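Together these changes add an opt-in return_pointing_metadata path through the processor. A minimal usage sketch, assuming the converted files load via AutoProcessor with trust_remote_code=True (the local path and the stand-in image are hypothetical; the key names follow the diff above):

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("./molmo2-checkpoint", trust_remote_code=True)
    image = Image.new("RGB", (640, 480))  # stand-in image

    inputs = processor(
        text="Point to the dog.",
        images=[image],
        return_pointing_metadata=True,
    )
    # Pointing metadata rides along under "metadata", outside the tensor fields.
    meta = inputs["metadata"]
    print(meta["image_sizes"])            # [(640, 480)] -- (width, height) per image
    print(meta["token_pooling"].shape)    # absolute pooled-patch indices, -1 masked
    print(len(meta["subpatch_mapping"]))  # one patch-id map per image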
processor_config.json
CHANGED
@@ -5,6 +5,7 @@
   "image_use_col_tokens": true,
   "processor_class": "Molmo2Processor",
   "use_frame_special_tokens": true,
+  "use_low_res_token_for_global_crops": false,
   "use_single_crop_col_tokens": false,
   "use_single_crop_start_token": true,
   "video_use_col_tokens": false
special_tokens_map.json
CHANGED
@@ -276,7 +276,11 @@
     "<im_low>",
     "<frame_start>",
     "<frame_end>",
-    "<|video|>"
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
   ],
   "bos_token": "<|im_end|>",
   "eos_token": {
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3ef24b8f324674c3a1163e2c205b9e2b1a230fd2d4294de7b779bb419fc23914
+size 11477774
tokenizer_config.json
CHANGED
@@ -2425,6 +2425,38 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "151946": {
+      "content": "<|points|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151947": {
+      "content": "<|token_index|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151948": {
+      "content": "<|vit_index|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151949": {
+      "content": "<|vit_loc|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
@@ -2704,7 +2736,11 @@
     "<im_low>",
     "<frame_start>",
     "<frame_end>",
-    "<|video|>"
+    "<|video|>",
+    "<|points|>",
+    "<|token_index|>",
+    "<|vit_index|>",
+    "<|vit_loc|>"
   ],
   "auto_map": {
     "AutoProcessor": "processing_molmo2.Molmo2Processor"
video_preprocessor_config.json
CHANGED
@@ -10,6 +10,7 @@
   "do_center_crop": null,
   "do_convert_rgb": true,
   "do_normalize": true,
+  "do_pad": null,
   "do_rescale": true,
   "do_resize": true,
   "do_sample_frames": true,
@@ -28,7 +29,6 @@
   "input_data_format": null,
   "max_fps": 2.0,
   "num_frames": 128,
-  "pad_size": null,
   "patch_size": 14,
   "pooling_size": [
     3,
@@ -43,6 +43,7 @@
     "height": 378,
     "width": 378
   },
+  "size_divisor": null,
   "video_metadata": null,
   "video_processor_type": "Molmo2VideoProcessor"
 }
video_processing_molmo2.py
CHANGED
@@ -826,7 +826,8 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
    ) -> BatchFeature:
        validate_kwargs(
            captured_kwargs=kwargs.keys(),
-            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
+                ["return_tensors", "return_pointing_metadata"],
        )

        # Set default kwargs from self. This ensures that if a kwarg is not provided
@@ -867,6 +868,7 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_pointing_metadata: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -955,13 +957,20 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
        pixel_values_videos = np.concatenate(batch_crops, 0)
        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)

-        data =dict(
+        data = BatchFeature(dict(
            pixel_values_videos=pixel_values_videos,
            video_token_pooling=video_token_pooling,
            video_grids=video_grids,
-        )
-
-
+        ), tensor_type=return_tensors)
+        if return_pointing_metadata:
+            t = pixel_values_videos.shape[0]
+            assert base_image_input_size[0] % self.patch_size == 0
+            assert base_image_input_size[1] % self.patch_size == 0
+            crop_w = base_image_input_size[0] // self.patch_size
+            crop_h = base_image_input_size[1] // self.patch_size
+            data["subpatch_mapping"] = np.arange(t*crop_w*crop_h).reshape([t, crop_h, crop_w])
+            data["video_token_pooling_np"] = video_token_pooling
+        return data


 Molmo2VideoProcessor.register_for_auto_class()
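For videos every frame is one fixed-size crop, so subpatch_mapping reduces to a dense index grid over frames. A self-contained numpy sketch of the shape arithmetic, using the 378x378 crop size and patch size 14 from video_preprocessor_config.json (the frame count is a toy value):

    import numpy as np

    base_image_input_size = (378, 378)  # from video_preprocessor_config.json
    patch_size = 14                     # ditto
    t = 4                               # sampled frames (toy value)

    crop_w = base_image_input_size[0] // patch_size  # 27 patches per row
    crop_h = base_image_input_size[1] // patch_size  # 27 patches per column

    # Frame f, patch row r, patch column c -> absolute patch index.
    subpatch_mapping = np.arange(t * crop_w * crop_h).reshape([t, crop_h, crop_w])
    print(subpatch_mapping.shape)     # (4, 27, 27)
    print(subpatch_mapping[1, 0, 0])  # 729 -- first patch of the second frame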