hqfang commited on
Commit
048fc26
·
verified ·
1 Parent(s): 2478201

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,31 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - molmoact2
5
+ - robotics
6
+ - image-text-to-text
7
+ - depth-reasoning
8
+ ---
9
+
10
+ <img src="assets/MolmoAct2-Think.svg" alt="MolmoAct Think Logo" style="width: auto; height: 50px;">
11
+
12
+ # **MolmoAct2-Think**
13
+
14
+ MolmoAct2-Think extends MolmoAct2 with depth-token reasoning. Before producing an action, the model can predict a compact 10 x 10 discrete depth representation and condition the action expert on the resulting depth-aware VLM cache.
15
+
16
+ This checkpoint is the post-trained, multi-embodiment depth-reasoning model. It is intended as a foundation checkpoint for further robot fine-tuning rather than as a ready-to-run policy for a single deployment setting.
17
+
18
+ ## Quick Links
19
+
20
+ - 📂 Models: [Models](https://huggingface.co/collections/allenai/molmoact2-models), [Finetuned Models](https://huggingface.co/collections/allenai/molmoact2-finetuned-models)
21
+ - 📂 Datasets: [MolmoAct2-BimanualYAM Dataset](https://huggingface.co/collections/allenai/molmoact2-datasets), [MolmoAct2 Datasets](https://huggingface.co/collections/allenai/molmoact2-datasets), [Molmo2-ER Datasets](https://huggingface.co/collections/allenai/molmo2-er-datasets)
22
+ - 📄 Paper: coming soon
23
+ - 💻 Code: [allenai/molmoact2](https://github.com/allenai/molmoact2)
24
+ - 🎥 Blog Post: [MolmoAct2](https://allenai.org/blog/molmoact2)
25
+
26
+ ## Intended Use
27
+
28
+ Use this checkpoint for further fine-tuning when the downstream policy should use depth reasoning. It contains the VLM, action expert, and depth-token weights, plus normalization metadata for the post-training mixture in `norm_stats.json`.
29
+
30
+ This model card intentionally does not include direct policy inference code. For ready-to-run depth-reasoning inference, use the fine-tuned `MolmoAct2-Think-LIBERO` checkpoint.
31
+
assets/MolmoAct2-Think.svg ADDED
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% set DEMO_STYLES = ['point_count','pointing','cosyn_point','user_qa','long_caption','short_caption','video_long_caption','video_short_caption','video_point_track_per_frame','video_point_track_start_end','video_point_track_all_frames','video_single_point_track_start_end','video_transcript','video_clip_caption_start_end','video_clip_caption_start_end_in_seconds','video_clip_transcript_start_end','video_clip_transcript_start_end_in_seconds','video_frame_caption_timestamp','video_frame_caption_timestamp_in_seconds','correction_qa','text_sft','video_point','video_point_count','video_count','video_count_point','multi_image_pointing','multi_image_counting','multi_image_point_then_count','multi_image_count_then_point','demo','a_okvqa_mc','ai2_diagram_no_letter','ai2_diagram','science_qa','multi_image_mc','multi_image_mc_exp','mantis_instruct_mc','video_multiple_choice','video_multiple_choice_count_without_pointing','video_multiple_choice_multiple_correct','video_multiple_choice_w_subtitle'] %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set has_subtitle = messages and messages[0]['role'].lower() == 'subtitle' %}{% for message in messages %}{% if message['content'] is not string %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}{% set video_count.value = video_count.value + 1 %}{% endif %}{% endfor %}{% endif %}{% endfor %}{% if image_count.value == 1 %}{{ '<|image|>' }}{% elif image_count.value > 1 %}{% for i in range(image_count.value) %}{{ 'Image ' ~ (i + 1) ~ '<|image|>' }}{% endfor %}{% endif %}{% for _ in range(video_count.value) %}{{ '<|video|>' }}{% endfor %}{% if has_subtitle %}{{ messages[0]['content'] }}{% endif %}{% for message in messages %}{% set role = message['role'].lower() %}{% if role == 'subtitle' %}{% 
continue %}{% endif %}{% set conv_index = loop.index - (1 if has_subtitle else 0) %}{%- if (conv_index % 2 == 1 and role != 'user') or (conv_index % 2 == 0 and role != 'assistant') -%}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{%- endif -%}{% if message['content'] is string %}{% set text_content = message['content'] %}{% else %}{% set m = namespace(text='') %}{% for content in message['content'] %}{% if content['type'] == 'text' %}{% if content['style'] is defined and content['style'] not in DEMO_STYLES %}{% set seg = content['style'] ~ ': ' ~ content['text'] %}{% else %}{% set seg = content['text'] %}{% endif %}{% set m.text = m.text ~ ('' if not m.text else ' ') ~ seg %}{% endif %}{% endfor %}{% set text_content = m.text %}{% endif %}{% if role == 'user' %}{% if not (has_subtitle and loop.index == 2) and not (not has_subtitle and loop.first) %}{{ '<|im_end|>\n' }}{% endif %}{{ '<|im_start|>user\n' }}{{ text_content }}{{ '<|im_end|>\n' }}{% else %} {# assistant #}{{ '<|im_start|>assistant\n' }}{{ text_content }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
config.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_end_token_id": 151933,
3
+ "action_expert_condition_source": "kv_cache",
4
+ "action_expert_config": {
5
+ "attn_dropout": 0.0,
6
+ "causal_attn": false,
7
+ "compile": "blocks",
8
+ "context_layer_norm": true,
9
+ "dropout": 0.0,
10
+ "ffn_multiple_of": 256,
11
+ "hidden_size": 768,
12
+ "implementation": "new",
13
+ "max_action_dim": 32,
14
+ "max_horizon": 32,
15
+ "mlp_ratio": 4.0,
16
+ "model_type": "molmoact2_action_expert",
17
+ "num_heads": 8,
18
+ "num_layers": 36,
19
+ "qk_norm": true,
20
+ "qk_norm_eps": 1e-06,
21
+ "rope": true,
22
+ "rope_on_cross_attention": true,
23
+ "timestep_embed_dim": 256
24
+ },
25
+ "action_expert_depth_gate": false,
26
+ "action_expert_depth_gate_init_bias": -4.0,
27
+ "action_expert_depth_gate_per_layer": false,
28
+ "action_expert_layer_mode": "per_layer",
29
+ "action_format": "both",
30
+ "action_horizon": 30,
31
+ "action_output_token_id": 151931,
32
+ "action_start_token_id": 151932,
33
+ "action_token_start_id": 151934,
34
+ "adapter_config": {
35
+ "attention_dropout": 0.0,
36
+ "attn_implementation": "sdpa",
37
+ "float32_attention": true,
38
+ "head_dim": 72,
39
+ "hidden_act": "silu",
40
+ "hidden_size": 1152,
41
+ "image_feature_dropout": 0.0,
42
+ "initializer_range": 0.02,
43
+ "intermediate_size": 9728,
44
+ "model_type": "molmoact2",
45
+ "num_attention_heads": 16,
46
+ "num_key_value_heads": 16,
47
+ "pooling_attention_mask": true,
48
+ "residual_dropout": 0.0,
49
+ "text_hidden_size": 2560,
50
+ "vit_layers": [
51
+ -3,
52
+ -9
53
+ ]
54
+ },
55
+ "add_action_expert": true,
56
+ "add_control_tokens": true,
57
+ "add_setup_tokens": true,
58
+ "architectures": [
59
+ "MolmoAct2ForConditionalGeneration"
60
+ ],
61
+ "auto_map": {
62
+ "AutoConfig": "configuration_molmoact2.MolmoAct2Config",
63
+ "AutoModelForImageTextToText": "modeling_molmoact2.MolmoAct2ForConditionalGeneration"
64
+ },
65
+ "depth_end_token_id": 153984,
66
+ "depth_mode": 2,
67
+ "depth_output_token_id": 153982,
68
+ "depth_start_token_id": 153983,
69
+ "depth_token_start_id": 153985,
70
+ "dtype": "float32",
71
+ "enable_depth_reasoning": true,
72
+ "flow_matching_beta_alpha": 1.0,
73
+ "flow_matching_beta_beta": 1.5,
74
+ "flow_matching_cutoff": 1.0,
75
+ "flow_matching_num_steps": 10,
76
+ "flow_matching_time_offset": 0.001,
77
+ "flow_matching_time_scale": 0.999,
78
+ "frame_end_token_id": 155656,
79
+ "frame_start_token_id": 155655,
80
+ "image_col_id": 155651,
81
+ "image_end_token_id": 155649,
82
+ "image_high_res_id": 155650,
83
+ "image_low_res_id": 155654,
84
+ "image_patch_id": 155650,
85
+ "image_start_token_id": 155648,
86
+ "initializer_range": 0.02,
87
+ "low_res_image_start_token_id": 155652,
88
+ "mask_action_dim_padding": true,
89
+ "max_action_dim": 32,
90
+ "model_type": "molmoact2",
91
+ "n_obs_steps": 1,
92
+ "norm_stats_filename": "norm_stats.json",
93
+ "num_action_tokens": 2048,
94
+ "num_depth_codes": 100,
95
+ "num_depth_tokens": 128,
96
+ "num_state_tokens": 256,
97
+ "state_end_token_id": 151674,
98
+ "state_format": "discrete",
99
+ "state_start_token_id": 151673,
100
+ "state_token_start_id": 151675,
101
+ "text_config": {
102
+ "additional_vocab_size": 128,
103
+ "attention_dropout": 0.0,
104
+ "attn_implementation": "sdpa",
105
+ "embedding_dropout": 0.0,
106
+ "head_dim": 128,
107
+ "hidden_act": "silu",
108
+ "hidden_size": 2560,
109
+ "initializer_range": 0.02,
110
+ "intermediate_size": 9728,
111
+ "layer_norm_eps": 1e-06,
112
+ "max_position_embeddings": 16384,
113
+ "model_type": "molmoact2_text",
114
+ "norm_after": false,
115
+ "num_attention_heads": 32,
116
+ "num_hidden_layers": 36,
117
+ "num_key_value_heads": 8,
118
+ "qk_norm_type": "qwen3",
119
+ "qkv_bias": false,
120
+ "residual_dropout": 0.0,
121
+ "rope_parameters": {
122
+ "rope_theta": 5000000.0,
123
+ "rope_type": "default"
124
+ },
125
+ "rope_scaling_layers": null,
126
+ "rope_theta": 5000000.0,
127
+ "tie_word_embeddings": false,
128
+ "use_cache": true,
129
+ "use_qk_norm": true,
130
+ "vocab_size": 155648
131
+ },
132
+ "tie_word_embeddings": false,
133
+ "transformers_version": "5.3.0",
134
+ "use_frame_special_tokens": true,
135
+ "vit_config": {
136
+ "attention_dropout": 0.0,
137
+ "attn_implementation": "sdpa",
138
+ "float32_attention": true,
139
+ "head_dim": 72,
140
+ "hidden_act": "gelu_pytorch_tanh",
141
+ "hidden_size": 1152,
142
+ "image_default_input_size": [
143
+ 378,
144
+ 378
145
+ ],
146
+ "image_num_pos": 729,
147
+ "image_patch_size": 14,
148
+ "initializer_range": 0.02,
149
+ "intermediate_size": 4304,
150
+ "layer_norm_eps": 1e-06,
151
+ "model_type": "molmoact2",
152
+ "num_attention_heads": 16,
153
+ "num_hidden_layers": 27,
154
+ "num_key_value_heads": 16,
155
+ "residual_dropout": 0.0
156
+ },
157
+ "bos_token_id": 151645,
158
+ "eos_token_id": 151645,
159
+ "pad_token_id": 151643
160
+ }
configuration_molmoact2.py ADDED
@@ -0,0 +1,565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MolmoAct2 configuration
3
+ """
4
+
5
+ from typing import Optional, Any
6
+
7
+ from transformers import PretrainedConfig
8
+ from transformers.modeling_rope_utils import rope_config_validation
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
class MolmoAct2VitConfig(PretrainedConfig):
    r"""
    Configuration class for a [`MolmoAct2VisionTransformer`].

    Holds the hyperparameters that define the vision backbone: transformer
    geometry (layers, heads, hidden sizes), the image patching scheme, dropout
    rates, and attention options. Instantiating with no arguments yields the
    default architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Example:
    ```python
    >>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer

    >>> # Initializing a MolmoAct2VitConfig
    >>> configuration = MolmoAct2VitConfig()

    >>> # Initializing a MolmoAct2VisionTransformer (with random weights)
    >>> model = MolmoAct2VisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "molmoact2"
    base_config_key = "vit_config"

    def __init__(
        self,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        num_hidden_layers: int = 27,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        hidden_act: str = "gelu_pytorch_tanh",
        layer_norm_eps: float = 1e-6,
        image_default_input_size: tuple[int, int] = (378, 378),
        image_patch_size: int = 14,
        image_num_pos: int = 577,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        initializer_range: float = 0.02,
        float32_attention: bool = True,
        attn_implementation: str = "eager",
        **kwargs,
    ):
        # Record the attention backend on the instance and also forward it to
        # the base class so PretrainedConfig tracks it as well.
        self.attn_implementation = attn_implementation
        super().__init__(attn_implementation=attn_implementation, **kwargs)

        # Transformer geometry.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps

        # Image patching scheme.
        self.image_default_input_size = image_default_input_size
        self.image_patch_size = image_patch_size
        self.image_num_pos = image_num_pos

        # Regularization, init, and numerics.
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.initializer_range = initializer_range
        self.float32_attention = float32_attention

    @property
    def image_num_patch(self):
        """Patch-grid size (rows, cols) implied by the default input size."""
        height, width = self.image_default_input_size
        patch = self.image_patch_size
        return height // patch, width // patch
85
+
86
+
87
class MolmoAct2AdapterConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of MolmoAct2Adapter. With MolmoAct2VitConfig,
    It is used to instantiate an MolmoAct2VisionBackbone according to the specified arguments,
    defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone

    >>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
    >>> vit_config = MolmoAct2VitConfig()
    >>> adapter_config = MolmoAct2AdapterConfig()

    >>> # Initializing a MolmoAct2VisionBackbone (with random weights)
    >>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)

    >>> # Accessing the model configuration
    >>> vit_configuration = model.vit_config
    >>> adapter_configuration = model.adapter_config
    ```"""

    model_type = "molmoact2"
    base_config_key = "adapter_config"

    def __init__(
        self,
        vit_layers: tuple = (-3, -9),
        pooling_attention_mask: bool = False,
        hidden_size: int = 1152,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        float32_attention: bool = True,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        hidden_act: str = "silu",
        intermediate_size: int = 18944,
        text_hidden_size: int = 3584,
        image_feature_dropout: float = 0.0,
        initializer_range: float = 0.02,
        attn_implementation: str = "eager",
        **kwargs,
    ):
        """Build the adapter config.

        Args:
            vit_layers: Indices (negative = from the end) of the ViT layers whose
                features the adapter consumes.
            pooling_attention_mask: Whether the pooling attention uses a mask.
            text_hidden_size: Hidden size of the language model the adapter
                projects image features into.
            attn_implementation: Attention backend name, forwarded to
                `PretrainedConfig` and also stored on the instance.
        """
        self.attn_implementation = attn_implementation
        super().__init__(
            attn_implementation=attn_implementation,
            **kwargs
        )
        self.vit_layers = vit_layers
        self.pooling_attention_mask = pooling_attention_mask
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.float32_attention = float32_attention
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.text_hidden_size = text_hidden_size
        self.image_feature_dropout = image_feature_dropout
        self.initializer_range = initializer_range
154
+
155
+
156
class MolmoAct2TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
    `MolmoAct2TextModel` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:
    ```python
    >>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel

    >>> # Initializing a MolmoAct2TextConfig
    >>> configuration = MolmoAct2TextConfig()

    >>> # Initializing a MolmoAct2TextModel (with random weights)
    >>> model = MolmoAct2TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "molmoact2_text"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Tensor-parallel plan: column-wise split the fused QKV and FFN up-projections,
    # row-wise split the corresponding output projections.
    base_model_tp_plan = {
        "blocks.*.self_attn.att_proj": "colwise",
        "blocks.*.self_attn.attn_out": "rowwise",
        "blocks.*.mlp.ff_proj": "colwise",
        "blocks.*.mlp.ff_out": "rowwise",
    }
    # Pipeline-parallel plan: embedding -> transformer blocks -> final layer norm.
    base_model_pp_plan = {
        "wte": (["input_ids"], ["inputs_embeds"]),
        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "ln_f": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        hidden_size: int = 3584,
        num_attention_heads: int = 28,
        num_key_value_heads: Optional[int] = 4,
        head_dim: int = 128,
        vocab_size: int = 152064,
        additional_vocab_size: int = 128,
        qkv_bias: bool = True,
        num_hidden_layers: int = 48,
        intermediate_size: int = 18944,
        hidden_act: str = "silu",
        embedding_dropout: float = 0.0,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        max_position_embeddings: int = 4096,
        rope_theta: float = 1000000.0,
        rope_scaling: Optional[dict[str, Any]] = None,
        rope_scaling_layers: Optional[list[int]] = None,
        use_qk_norm: bool = False,
        qk_norm_type: str = "olmo",
        layer_norm_eps: float = 1e-6,
        norm_after: bool = False,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        attn_implementation: str = "eager",
        **kwargs,
    ):
        """Build the text-model config.

        Args:
            num_key_value_heads: Number of KV heads for grouped-query attention;
                `None` falls back to `num_attention_heads` (i.e. full MHA).
            additional_vocab_size: Extra embedding rows appended beyond
                `vocab_size` (e.g. for special/control tokens).
            rope_scaling: Optional RoPE scaling dict, validated by
                `rope_config_validation`.
            rope_scaling_layers: Optional subset of layer indices the RoPE
                scaling applies to.
        """
        self.attn_implementation = attn_implementation
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            attn_implementation=attn_implementation,
            **kwargs
        )
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        # GQA: default the KV-head count to full multi-head attention.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.qkv_bias = qkv_bias
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.rope_scaling_layers = rope_scaling_layers
        self.use_qk_norm = use_qk_norm
        self.qk_norm_type = qk_norm_type
        self.layer_norm_eps = layer_norm_eps
        self.norm_after = norm_after
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)
256
+
257
+
258
class MolmoAct2ActionExpertConfig(PretrainedConfig):
    r"""Configuration for the MolmoAct2 modern action expert.

    Describes the flow-matching action head: its transformer geometry, the
    maximum action horizon/dimension it supports, and attention options
    (QK-norm, RoPE, causal masking). Only the ``"new"`` implementation is
    accepted by the HF export.
    """

    model_type = "molmoact2_action_expert"
    base_config_key = "action_expert_config"

    def __init__(
        self,
        max_horizon: int = 32,
        max_action_dim: int = 14,
        hidden_size: int = 1024,
        num_layers: int = 32,
        num_heads: int = 16,
        mlp_ratio: float = 8.0 / 3.0,
        ffn_multiple_of: int = 256,
        timestep_embed_dim: int = 256,
        dropout: float = 0.0,
        attn_dropout: float = 0.0,
        context_layer_norm: bool = True,
        qk_norm: bool = True,
        qk_norm_eps: float = 1e-6,
        rope: bool = True,
        rope_on_cross_attention: bool = False,
        causal_attn: bool = False,
        compile: str = "blocks",
        implementation: str = "new",
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Guard clause: the HF export only ships the rewritten expert.
        if implementation != "new":
            raise ValueError(
                "MolmoAct2 HF export supports only action_expert.implementation='new'."
            )

        # Action-chunk capacity.
        self.max_horizon = max_horizon
        self.max_action_dim = max_action_dim

        # Transformer geometry.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.ffn_multiple_of = ffn_multiple_of
        self.timestep_embed_dim = timestep_embed_dim

        # Regularization and attention options.
        self.dropout = dropout
        self.attn_dropout = attn_dropout
        self.context_layer_norm = context_layer_norm
        self.qk_norm = qk_norm
        self.qk_norm_eps = qk_norm_eps
        self.rope = rope
        self.rope_on_cross_attention = rope_on_cross_attention
        self.causal_attn = causal_attn

        # Compilation / implementation selection.
        self.compile = compile
        self.implementation = implementation
309
+
310
+
311
class MolmoAct2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
    It is used to instantiate an MolmoAct2 model according to the specified arguments, defining the model architecture.

    Example:

    ```python
    >>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig

    >>> # Initializing a MolmoAct2VitConfig
    >>> vit_config = MolmoAct2VitConfig()

    >>> # Initializing a MolmoAct2AdapterConfig
    >>> adapter_config = MolmoAct2AdapterConfig()

    >>> # Initializing a MolmoAct2TextConfig
    >>> text_config = MolmoAct2TextConfig()

    >>> # Initializing a MolmoAct2Config
    >>> configuration = MolmoAct2Config(
    >>>     vit_config=vit_config,
    >>>     adapter_config=adapter_config,
    >>>     text_config=text_config,
    >>>     image_start_token_id=151936,
    >>>     image_end_token_id=151937,
    >>>     image_patch_id=151938,
    >>>     image_col_id=151939,
    >>>     low_res_image_start_token_id=151940,
    >>>     image_low_res_id=151942,
    >>>     frame_start_token_id=151943,
    >>>     frame_end_token_id=151944,
    >>> )

    >>> # Initializing a model
    >>> model = MolmoAct2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "molmoact2"
    # Sub-config classes used to (de)serialize the nested configs below.
    sub_configs = {
        "text_config": MolmoAct2TextConfig,
        "vit_config": MolmoAct2VitConfig,
        "adapter_config": MolmoAct2AdapterConfig,
        "action_expert_config": MolmoAct2ActionExpertConfig,
    }

    def __init__(
        self,
        vit_config: MolmoAct2VitConfig = None,
        adapter_config: MolmoAct2AdapterConfig = None,
        text_config: MolmoAct2TextConfig = None,
        action_expert_config: MolmoAct2ActionExpertConfig = None,
        image_start_token_id: int = None,
        low_res_image_start_token_id: int = None,
        image_end_token_id: int = None,
        image_low_res_id: int = None,
        image_patch_id: int = None,
        image_col_id: int = None,
        frame_start_token_id: int = None,
        frame_end_token_id: int = None,
        use_frame_special_tokens: bool = True,
        initializer_range: float = 0.02,
        add_action_expert: bool = True,
        max_action_dim: int = 7,
        action_horizon: int = 16,
        n_obs_steps: int = 1,
        action_format: str = "continuous",
        state_format: str = "discrete",
        action_expert_condition_source: str = "kv_cache",
        action_expert_layer_mode: str = "per_layer",
        flow_matching_num_steps: int = 10,
        flow_matching_cutoff: float = 1.0,
        flow_matching_time_offset: float = 0.001,
        flow_matching_time_scale: float = 0.999,
        flow_matching_beta_alpha: float = 1.0,
        flow_matching_beta_beta: float = 1.5,
        mask_action_dim_padding: bool = True,
        enable_depth_reasoning: bool = False,
        depth_mode: int = 2,
        num_depth_codes: int = 100,
        action_expert_depth_gate: bool = False,
        action_expert_depth_gate_per_layer: bool = False,
        action_expert_depth_gate_init_bias: float = -4.0,
        action_output_token_id: int = None,
        action_start_token_id: int = None,
        action_end_token_id: int = None,
        action_token_start_id: int = None,
        num_action_tokens: int = 0,
        depth_output_token_id: int = None,
        depth_start_token_id: int = None,
        depth_end_token_id: int = None,
        depth_token_start_id: int = None,
        num_depth_tokens: int = 0,
        state_start_token_id: int = None,
        state_end_token_id: int = None,
        state_token_start_id: int = None,
        num_state_tokens: int = 0,
        add_setup_tokens: bool = True,
        add_control_tokens: bool = True,
        norm_stats_filename: str = "norm_stats.json",
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Each sub-config may arrive as None (use defaults), as a dict
        # (deserialized from config.json), or as an already-built config object.
        if vit_config is None:
            self.vit_config = MolmoAct2VitConfig()
        elif isinstance(vit_config, dict):
            self.vit_config = MolmoAct2VitConfig(**vit_config)
        else:
            self.vit_config = vit_config
        if adapter_config is None:
            self.adapter_config = MolmoAct2AdapterConfig()
        elif isinstance(adapter_config, dict):
            self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
        else:
            self.adapter_config = adapter_config
        if text_config is None:
            self.text_config = MolmoAct2TextConfig()
        elif isinstance(text_config, dict):
            self.text_config = MolmoAct2TextConfig(**text_config)
        else:
            self.text_config = text_config
        self.add_action_expert = bool(add_action_expert)
        # Action expert: disabled -> None; otherwise build from defaults (one
        # expert layer per text layer), from a dict, or use the given config.
        if not self.add_action_expert:
            self.action_expert_config = None
        elif action_expert_config is None:
            self.action_expert_config = MolmoAct2ActionExpertConfig(
                max_horizon=action_horizon,
                max_action_dim=max_action_dim,
                num_layers=self.text_config.num_hidden_layers,
            )
        elif isinstance(action_expert_config, dict):
            self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
        else:
            self.action_expert_config = action_expert_config
        # Enforce the combinations supported by the HF export (raises otherwise).
        if self.add_action_expert:
            self._validate_release_action_config(
                action_expert_config=self.action_expert_config,
                action_expert_condition_source=action_expert_condition_source,
                action_expert_layer_mode=action_expert_layer_mode,
                state_format=state_format,
            )
        # Image / frame special-token ids.
        self.image_start_token_id = image_start_token_id
        self.low_res_image_start_token_id = low_res_image_start_token_id
        self.image_end_token_id = image_end_token_id
        self.image_low_res_id = image_low_res_id
        # High-res patch id is the same token as image_patch_id.
        self.image_high_res_id = image_patch_id
        self.image_patch_id = image_patch_id
        self.image_col_id = image_col_id
        self.frame_start_token_id = frame_start_token_id
        self.frame_end_token_id = frame_end_token_id
        self.use_frame_special_tokens = use_frame_special_tokens
        self.initializer_range = initializer_range
        # Action-head shape and formats.
        self.max_action_dim = max_action_dim
        self.action_horizon = action_horizon
        self.n_obs_steps = n_obs_steps
        self.action_format = action_format
        self.state_format = state_format
        self.action_expert_condition_source = action_expert_condition_source
        self.action_expert_layer_mode = action_expert_layer_mode
        # Flow-matching sampler parameters.
        self.flow_matching_num_steps = flow_matching_num_steps
        self.flow_matching_cutoff = flow_matching_cutoff
        self.flow_matching_time_offset = flow_matching_time_offset
        self.flow_matching_time_scale = flow_matching_time_scale
        self.flow_matching_beta_alpha = flow_matching_beta_alpha
        self.flow_matching_beta_beta = flow_matching_beta_beta
        self.mask_action_dim_padding = mask_action_dim_padding
        # Depth-reasoning options.
        self.enable_depth_reasoning = enable_depth_reasoning
        self.depth_mode = depth_mode
        self.num_depth_codes = num_depth_codes
        self.action_expert_depth_gate = action_expert_depth_gate
        self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
        self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
        # Action / depth / state token-id ranges (start id + count each).
        self.action_output_token_id = action_output_token_id
        self.action_start_token_id = action_start_token_id
        self.action_end_token_id = action_end_token_id
        self.action_token_start_id = action_token_start_id
        self.num_action_tokens = num_action_tokens
        self.depth_output_token_id = depth_output_token_id
        self.depth_start_token_id = depth_start_token_id
        self.depth_end_token_id = depth_end_token_id
        self.depth_token_start_id = depth_token_start_id
        self.num_depth_tokens = num_depth_tokens
        self.state_start_token_id = state_start_token_id
        self.state_end_token_id = state_end_token_id
        self.state_token_start_id = state_token_start_id
        self.num_state_tokens = num_state_tokens
        self.add_setup_tokens = add_setup_tokens
        self.add_control_tokens = add_control_tokens
        self.norm_stats_filename = norm_stats_filename

    @staticmethod
    def _validate_release_action_config(
        *,
        action_expert_config: MolmoAct2ActionExpertConfig,
        action_expert_condition_source: str,
        action_expert_layer_mode: str,
        state_format: str,
    ) -> None:
        """Raise ValueError unless the action settings match the HF-export release."""
        if action_expert_config.implementation != "new":
            raise ValueError(
                "MolmoAct2 HF export supports only action_expert.implementation='new'."
            )
        if action_expert_condition_source != "kv_cache":
            raise ValueError(
                "MolmoAct2 HF export supports only action_expert_condition_source='kv_cache'."
            )
        if action_expert_layer_mode != "per_layer":
            raise ValueError(
                "MolmoAct2 HF export supports only action_expert_layer_mode='per_layer'."
            )
        if state_format != "discrete":
            raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")

    # The properties below forward to the sub-configs so callers can read the
    # commonly needed dimensions off the top-level config directly.

    @property
    def image_num_patch(self):
        assert self.vit_config is not None
        return self.vit_config.image_num_patch

    @property
    def num_attention_heads(self):
        return self.text_config.num_attention_heads

    @property
    def num_key_value_heads(self):
        return self.text_config.num_key_value_heads

    @property
    def head_dim(self):
        return self.text_config.head_dim

    @property
    def num_hidden_layers(self):
        return self.text_config.num_hidden_layers

    @property
    def hidden_size(self):
        return self.text_config.hidden_size

    @property
    def vocab_size(self):
        return self.text_config.vocab_size

    @property
    def max_position_embeddings(self):
        return self.text_config.max_position_embeddings
559
+
560
+
561
# Register every config class with the Auto* machinery so they resolve when
# the checkpoint is loaded with `trust_remote_code=True`.
for _config_cls in (
    MolmoAct2VitConfig,
    MolmoAct2AdapterConfig,
    MolmoAct2TextConfig,
    MolmoAct2ActionExpertConfig,
    MolmoAct2Config,
):
    _config_cls.register_for_auto_class()
del _config_cls
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151645,
3
+ "eos_token_id": 151645,
4
+ "pad_token_id": 151643,
5
+ "transformers_version": "5.3.0"
6
+ }
image_processing_molmoact2.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image processor class for MolmoAct2"""
2
+ from typing import Optional, Union
3
+ import numpy as np
4
+ import einops
5
+ import torch
6
+ import torchvision.transforms
7
+
8
+ from transformers.image_utils import (
9
+ IMAGENET_STANDARD_MEAN,
10
+ IMAGENET_STANDARD_STD,
11
+ ImageInput,
12
+ PILImageResampling,
13
+ make_flat_list_of_images,
14
+ valid_images,
15
+ to_numpy_array,
16
+ )
17
+ from transformers.image_transforms import convert_to_rgb
18
+ from transformers.processing_utils import ImagesKwargs
19
+ from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
20
+ from transformers.utils import logging
21
+ from transformers.feature_extraction_utils import BatchFeature
22
+ from transformers.utils import TensorType, logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
def normalize_image(
    image: np.ndarray,
    image_mean: list[float],
    image_std: list[float],
) -> np.ndarray:
    """Channel-wise normalize an HWC float image: (image - mean) / std.

    A fast path handles the common mean = std = [0.5, 0.5, 0.5] case as
    2*x - 1.  Unlike the previous in-place version (`image -=` / `image /=`),
    the caller's array is never mutated.
    """
    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
        return image * np.float32(2.0) - np.float32(1.0)
    mean = np.asarray(image_mean, dtype=np.float32)[None, None, :]
    std = np.asarray(image_std, dtype=np.float32)[None, None, :]
    # Out-of-place so the input buffer is left untouched.
    return (image - mean) / std
38
+
39
+
40
def resize_image(
    image: np.ndarray,
    desired_output_size: list[int],
    resample: PILImageResampling,
) -> np.ndarray:
    """Resize an HWC numpy image to `desired_output_size`, returning float32 in [0, 1].

    Float inputs are assumed to lie in [0, 1]; uint8 inputs in [0, 255].
    Any other integer dtype is rejected.
    """
    chw = torch.from_numpy(image).permute(2, 0, 1)
    input_dtype = chw.dtype
    resize = torchvision.transforms.Resize(desired_output_size, resample, antialias=False)
    if torch.is_floating_point(chw):
        in_min, in_max = 0.0, 1.0
        resized = resize(chw).clamp(0.0, 1.0).to(input_dtype)
    else:
        assert chw.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(chw.dtype)
        in_min, in_max = 0.0, 255.0
        resized = resize(chw).clamp(0, 255).to(input_dtype)

    # Rescale to float32 in [0, 1] regardless of the input value range.
    resized = resized.to(torch.float32)
    resized = (resized - in_min) / (in_max - in_min)

    return resized.permute(1, 2, 0).numpy()
73
+
74
+
75
def select_tiling(h, w, patch_size, max_num_crops):
    """Choose an (n_rows, n_cols) tiling of `patch_size` crops for an (h, w) image.

    At most `max_num_crops` crops in total are used.  Returns an int32 array
    [rows, cols].  Ties prefer smaller tilings.
    """
    # Enumerate every tiling whose crop count fits the budget; sort so that
    # argmin/argmax below favour smaller tilings in the event of a tie.
    tilings = [
        (rows, cols)
        for rows in range(1, max_num_crops + 1)
        for cols in range(1, max_num_crops + 1)
        if rows * cols <= max_num_crops
    ]
    tilings.sort(key=lambda t: (t[0] * t[1], t[0]))
    candidate_tilings = np.array(tilings, dtype=np.int32)    # [n_tilings, 2]
    candidate_resolutions = candidate_tilings * patch_size   # [n_tilings, 2]

    original_size = np.array([h, w], dtype=np.float32)       # [2]

    # How much we would need to scale the image to fit exactly in each tiling.
    # The original size can be zero in rare cases if the image is smaller than
    # the margin; letting the scale become infinite means the tiling is based
    # on the other side, or falls back to the smallest tiling.
    # (The previous version had a stray trailing comma here that wrapped the
    # result in a 1-tuple and only worked by accidental numpy broadcasting.)
    with np.errstate(divide='ignore'):
        required_scale_d = candidate_resolutions.astype(np.float32) / original_size
    required_scale = np.min(required_scale_d, axis=-1, keepdims=True)  # [n_tilings, 1]
    if np.all(required_scale < 1):
        # We are forced to downscale, so try to minimize the amount of downscaling
        ix = np.argmax(required_scale)
    else:
        # Pick the resolution that requires the least upscaling so that it most
        # closely fits the image
        required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
        ix = np.argmin(required_scale)
    return candidate_tilings[ix]
106
+
107
+
108
def build_resized_image(
    image: np.ndarray,
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Resize and normalize the full image, plus its patch-index grid.

    Returns the [1, h, w, 3] resized image and an [h_patches, w_patches]
    array mapping each patch position to its flat patch index.
    """
    resized = normalize_image(
        resize_image(image, base_image_input_size, resample),
        image_mean,
        image_std,
    )
    if resized.ndim == 3:
        resized = resized[None]
    patch_h = base_image_input_size[0] // image_patch_size
    patch_w = base_image_input_size[1] // image_patch_size
    resize_idx = np.arange(patch_w * patch_h).reshape([patch_h, patch_w])
    return resized, resize_idx
126
+
127
+
128
def build_overlapping_crops(
    image: np.ndarray,
    max_crops: int,
    overlap_margins: list[int],
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Decompose an image into a set of overlapping crops

    :return crop_arr: [n_crops, h, w, 3] The crops
    :return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch in the resized image
        the crops were extracted from, what patch in `crop_arr` it corresponds to
    """
    # (The previous version computed original_image_h/w and crop_size twice;
    # the duplicates were dead code and have been removed.)
    original_image_h, original_image_w = image.shape[:2]
    crop_size = base_image_input_size[0]
    assert base_image_input_size[0] == base_image_input_size[1]

    left_margin, right_margin = overlap_margins
    total_margin_pixels = image_patch_size * (right_margin + left_margin)  # pixels removed per dim
    crop_patches = base_image_input_size[0] // image_patch_size  # patches per crop dim
    crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
    crop_window_size = crop_window_patches * image_patch_size
    crop_patch_w = base_image_input_size[1] // image_patch_size
    crop_patch_h = base_image_input_size[0] // image_patch_size

    # Decide how to tile the image; to account for the overlap margins we compute
    # the tiling as if we had an image without the margins and were using a crop
    # size without the margins
    tiling = select_tiling(
        original_image_h - total_margin_pixels,
        original_image_w - total_margin_pixels,
        crop_window_size,
        max_crops,
    )

    src = resize_image(
        image,
        [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
        resample,
    )
    src = normalize_image(src, image_mean, image_std)

    # Now we have to split the image into crops, and track what patches came from
    # where in `patch_idx_arr`
    n_crops = tiling[0] * tiling[1]
    crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
    patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
    on_crop = 0
    for i in range(tiling[0]):
        # Slide over `src` by `crop_window_size` steps, but extract crops of size
        # `crop_size`, which results in overlapping crop windows
        y0 = i*crop_window_size
        for j in range(tiling[1]):
            x0 = j*crop_window_size
            crop_arr[on_crop] = src[y0:y0+crop_size, x0:x0+crop_size]
            patch_idx = np.arange(crop_patch_w*crop_patch_h).reshape(crop_patch_h, crop_patch_w)
            patch_idx += on_crop * crop_patch_h * crop_patch_w

            # Mask out idx that are in the overlap region
            if i != 0:
                patch_idx[:left_margin, :] = -1
            if j != 0:
                patch_idx[:, :left_margin] = -1
            if i != tiling[0]-1:
                patch_idx[-right_margin:, :] = -1
            if j != tiling[1]-1:
                patch_idx[:, -right_margin:] = -1
            patch_idx_arr[on_crop] = patch_idx
            on_crop += 1

    # `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
    # so it is ordered left-to-right order
    patch_idx_arr = np.reshape(
        patch_idx_arr,
        [tiling[0], tiling[1], crop_patch_h, crop_patch_w]
    )
    patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
    patch_idx_arr = np.reshape(patch_idx_arr, [-1])

    # Now get the parts not in the overlap region, so it should map each patch in
    # `src` to the correct patch it should come from in `crop_arr`
    patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
        src.shape[0]//image_patch_size,
        src.shape[1]//image_patch_size,
    )
    return crop_arr, patch_idx_arr
218
+
219
+
220
def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
    """Reshape images of [n_images, h, w(, c)] -> [n_images, n_patches, pixels_per_patch]."""
    n_crops = array.shape[0]
    h_patches = array.shape[1] // patch_size
    w_patches = array.shape[2] // patch_size
    if array.ndim == 3:
        # Channel-less layout [n, h, w]: each patch flattens to patch_size**2 values.
        grid = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
        grid = np.transpose(grid, [0, 1, 3, 2, 4])
        return np.reshape(grid, [n_crops, h_patches * w_patches, patch_size * patch_size])
    # Channelled layout [n, h, w, c]: the channel dim rides along into each patch.
    c = array.shape[3]
    grid = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
    grid = np.transpose(grid, [0, 1, 3, 2, 4, 5])
    return np.reshape(grid, [n_crops, h_patches * w_patches, patch_size * patch_size * c])
238
+
239
+
240
def arange_for_pooling(
    idx_arr: np.ndarray,
    pool_h: int,
    pool_w: int,
) -> np.ndarray:
    """Group a 2D patch-index grid into pooling windows.

    Pads `idx_arr` with -1 (split evenly on both sides) so each dimension is a
    multiple of the pool size, then returns an [h, w, pool_h*pool_w] array
    where each cell lists the patch indices pooled into that output position.
    """
    h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
    w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
    idx_arr = np.pad(
        idx_arr,
        [[h_pad // 2, (h_pad + 1) // 2], [w_pad // 2, (w_pad + 1) // 2]],
        mode='constant',
        constant_values=-1,
    )
    # Plain-numpy equivalent of einops.rearrange("(h dh) (w dw) -> h w (dh dw)"),
    # dropping the einops dependency for what is a simple reshape/transpose.
    h = idx_arr.shape[0] // pool_h
    w = idx_arr.shape[1] // pool_w
    windows = idx_arr.reshape(h, pool_h, w, pool_w)
    windows = windows.transpose(0, 2, 1, 3)
    return windows.reshape(h, w, pool_h * pool_w)
251
+
252
+
253
def image_to_patches_and_grids(
    image: np.ndarray,
    max_crops: int,
    overlap_margins: list[int],
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
    image_pooling_w: int,
    image_pooling_h: int,
    crop_mode: str = "overlap-and-resize-c2",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    :return image_grids, the shape of each (low-res, high-res) image after pooling
    :return crops, the image crops to processes with the ViT
    :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
        patches in `crops` to pool for that token, masked with -1
    """
    if isinstance(base_image_input_size, int):
        base_image_input_size = (base_image_input_size, base_image_input_size)

    crop_patch_w = base_image_input_size[1] // image_patch_size
    crop_patch_h = base_image_input_size[0] // image_patch_size

    if crop_mode == "resize":
        # Single global resize only — no high-res crops.
        resized, resize_idx = build_resized_image(
            image,
            base_image_input_size,
            resample,
            image_mean,
            image_std,
            image_patch_size,
        )
        resize_idx = arange_for_pooling(resize_idx, image_pooling_h, image_pooling_w)
        resized_h, resized_w = resize_idx.shape[:2]
        resize_idx = resize_idx.reshape([-1, image_pooling_h * image_pooling_w])
        image_grid = np.stack([np.array([resized_h, resized_w, 0, 0])], 0)
        return (
            image_grid,
            batch_pixels_to_patches(resized, image_patch_size),
            resize_idx,
        )

    if crop_mode not in {"overlap-and-resize-c2", "overlap-and-resize"}:
        raise ValueError(f"Unsupported MolmoAct2 image crop_mode {crop_mode!r}.")

    crop_arr, patch_idx_arr = build_overlapping_crops(
        image,
        max_crops,
        overlap_margins,
        base_image_input_size,
        resample,
        image_mean,
        image_std,
        image_patch_size,
    )
    pooling_idx = arange_for_pooling(patch_idx_arr, image_pooling_h, image_pooling_w)
    h, w = pooling_idx.shape[:2]
    pooling_idx = pooling_idx.reshape([-1, image_pooling_h * image_pooling_w])

    # The global (low-resolution) image gets the same treatment.
    resized, resize_idx = build_resized_image(
        image,
        base_image_input_size,
        resample,
        image_mean,
        image_std,
        image_patch_size,
    )
    crop_arr = np.concatenate([resized, crop_arr], 0)

    resize_idx = arange_for_pooling(resize_idx, image_pooling_h, image_pooling_w)
    resized_h, resized_w = resize_idx.shape[:2]
    resize_idx = resize_idx.reshape([-1, image_pooling_h * image_pooling_w])

    # Global image goes first, so indices into the high-res crops shift up.
    pooling_idx = np.where(
        pooling_idx >= 0,
        pooling_idx + crop_patch_h * crop_patch_w,
        -1,
    )
    pooling_idx = np.concatenate([resize_idx, pooling_idx])
    image_grid = np.stack([np.array([resized_h, resized_w, h, w])], 0)

    return (
        image_grid,
        batch_pixels_to_patches(crop_arr, image_patch_size),
        pooling_idx,
    )
346
+
347
+
348
class MolmoAct2ImagesKwargs(ImagesKwargs, total=False):
    """Optional image-processing keyword arguments accepted by MolmoAct2."""

    max_crops: Optional[int]
    overlap_margins: Optional[list[int]]
    crop_mode: Optional[str]
    patch_size: Optional[int]
    pooling_size: Optional[list[int]]
354
+
355
+
356
class MolmoAct2ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MolmoAct2 image processor that preprocesses images for the model.

    Args:
        size (`dict[str, int]` *optional*, defaults to `{"height": 378, "width": 378}`):
            Size of the image after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use when resizing the image.
        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        max_crops (`int`, *optional*, defaults to `8`):
            Maximum number of crops to use per image.
        overlap_margins (`list[int]`, *optional*, defaults to `[4, 4]`):
            Overlap margins to use.
        crop_mode (`str`, *optional*, defaults to `"overlap-and-resize-c2"`):
            Cropping strategy passed through to `image_to_patches_and_grids`.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        pooling_size (`list[int]`, *optional*, defaults to `[2, 2]`):
            The pooling size of the vision adapter.
    """

    model_input_names = ["pixel_values", "image_token_pooling", "image_grids", "image_num_crops"]

    def __init__(
        self,
        size: Optional[dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: bool = True,
        max_crops: int = 8,
        overlap_margins: Optional[list[int]] = None,
        crop_mode: str = "overlap-and-resize-c2",
        patch_size: int = 14,
        pooling_size: Optional[list[int]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 378, "width": 378}
        size = get_size_dict(size, default_to_square=True)
        self.size = size

        self.resample = resample
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        self.do_convert_rgb = do_convert_rgb

        self.max_crops = max_crops
        # List defaults ([4, 4] / [2, 2]) are materialized here rather than in the
        # signature to avoid the shared-mutable-default-argument pitfall.
        self.overlap_margins = overlap_margins if overlap_margins is not None else [4, 4]
        self.crop_mode = crop_mode
        self.patch_size = patch_size
        self.pooling_size = pooling_size if pooling_size is not None else [2, 2]

    def preprocess(
        self,
        images: ImageInput,
        size: Optional[dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        max_crops: Optional[int] = None,
        overlap_margins: Optional[list[int]] = None,
        crop_mode: Optional[str] = None,
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Args:
            images (`ImageInput`):
                Image to preprocess.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use when resizing the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            max_crops (`int`, *optional*, defaults to `self.max_crops`):
                Maximum number of crops to use per image.
            overlap_margins (`list[int]`, *optional*, defaults to `self.overlap_margins`):
                Overlap margins to use.
            crop_mode (`str`, *optional*, defaults to `self.crop_mode`):
                Cropping strategy.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
                The pooling size of the vision adapter.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return (`'pt'`, `'np'`, `'tf'`, `'jax'`, or unset).

        Returns:
            A `BatchFeature` containing the following keys:
            - `pixel_values`: The preprocessed images.
            - `image_token_pooling`: The indices of the patches in `crops` to pool for each token in `image_tokens`.
            - `image_grids`: The image grids.
            - `image_num_crops`: The number of crops for each image.
        """
        if size is not None:
            if "height" not in size or "width" not in size:
                raise ValueError("size must contain 'height' and 'width' keys.")
        else:
            size = {**self.size}

        base_image_input_size = [size["height"], size["width"]]

        # Use `is None` checks (not `x or self.x`) so explicit falsy overrides
        # such as do_convert_rgb=False or resample=PILImageResampling.NEAREST
        # (which equals 0) are honoured instead of silently replaced.
        resample = self.resample if resample is None else resample
        image_mean = self.image_mean if image_mean is None else image_mean
        image_std = self.image_std if image_std is None else image_std
        do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb

        max_crops = self.max_crops if max_crops is None else max_crops
        overlap_margins = self.overlap_margins if overlap_margins is None else overlap_margins
        crop_mode = self.crop_mode if crop_mode is None else crop_mode
        patch_size = self.patch_size if patch_size is None else patch_size
        pooling_size = self.pooling_size if pooling_size is None else pooling_size

        image_pooling_h, image_pooling_w = pooling_size

        if images is not None:
            images = self.fetch_images(images)
            images = make_flat_list_of_images(images)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        data = {}
        if images is not None:
            batch_grids = []
            batch_crops = []
            batch_pooled_patches_idx = []
            batch_num_crops = []

            for image in images:
                image_grid, crops, pooled_idx = image_to_patches_and_grids(
                    image,
                    max_crops,
                    overlap_margins,
                    base_image_input_size,
                    resample,
                    image_mean,
                    image_std,
                    patch_size,
                    image_pooling_w,
                    image_pooling_h,
                    crop_mode,
                )
                batch_grids.append(image_grid)
                batch_crops.append(crops)
                batch_pooled_patches_idx.append(pooled_idx)
                batch_num_crops.append(crops.shape[0])

            pixel_values = np.concatenate(batch_crops, 0)
            image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
            image_grids = np.concatenate(batch_grids, 0)
            image_num_crops = np.array(batch_num_crops)

            data.update(
                pixel_values=pixel_values,
                image_token_pooling=image_token_pooling,
                image_grids=image_grids,
                image_num_crops=image_num_crops,
            )

        return BatchFeature(data, tensor_type=return_tensors)
544
+
545
+
546
+ MolmoAct2ImageProcessor.register_for_auto_class()
inference.py ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Inference utilities for MolmoAct2"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Iterable, Optional, Sequence, Tuple
5
+
6
+ import torch
7
+ from torch.nn import functional as F
8
+ from transformers.cache_utils import Cache
9
+ from transformers.configuration_utils import PretrainedConfig
10
+
11
+
12
@dataclass
class _ActionFlowInputs:
    """Tensor bundle consumed by one action-flow denoising run."""
    trajectory: torch.Tensor  # action trajectory tensor iteratively refined by the flow loop
    context: Any  # conditioning context; exposes kv_contexts, cross_mask, self_mask, valid_action, rope_cache
    modulations: Sequence[Any]  # per-step bundles exposing conditioning, block_modulations, final_modulation
    action_dim_is_pad: Optional[torch.Tensor]  # optional mask marking padded action dimensions


@dataclass
class _ActionFlowCudaGraph:
    """A captured CUDA graph for the action-flow loop and its static I/O buffers."""
    key: Tuple[Any, ...]  # identity key; a mismatch forces re-capture
    graph: torch.cuda.CUDAGraph
    static_inputs: _ActionFlowInputs  # graph-owned buffers new inputs are copied into before replay
    output: torch.Tensor  # graph-owned output buffer (cloned before being handed to callers)


@dataclass
class _DepthDecodeCudaGraphLayerStage:
    """Static per-layer tensors for the graphed depth-decode step.

    NOTE(review): field names suggest residual + Q/K/V produced before
    attention; confirm against the capture code (outside this view).
    """
    residual: torch.Tensor
    query: torch.Tensor
    key: torch.Tensor
    value: torch.Tensor


@dataclass
class _DepthDecodeCudaGraphPostStage:
    """A captured per-layer graph plus its attention-context input buffer."""
    graph: torch.cuda.CUDAGraph
    attn_context: torch.Tensor


@dataclass
class _DepthDecodeCudaGraph:
    """Full set of captured graphs and static buffers for single-token depth decoding."""
    cache_key: Tuple[Any, ...]  # identity key; a mismatch forces re-capture
    pre_graph: torch.cuda.CUDAGraph
    token_ids: torch.Tensor  # static input buffer for the next token id
    cos: torch.Tensor  # static buffers — presumably rotary cos/sin; confirm against capture code
    sin: torch.Tensor
    positions: torch.Tensor  # static position-id buffer
    stages: Sequence[_DepthDecodeCudaGraphLayerStage]
    post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
    output: torch.Tensor  # static output buffer


@dataclass
class _DepthDecodeCudaGraphSpec:
    """Once-computed eligibility and identity info for depth-decode graph capture."""
    eligible: bool  # whether the backbone config permits CUDA-graph decoding
    cache_key_prefix: Tuple[Any, ...]  # config-derived part of the graph cache key
    num_hidden_layers: int
    head_dim: int
    num_attention_heads: int
62
+
63
+
64
+ def _cache_seq_len_int(past_key_values: Optional[Cache]) -> int:
65
+ if past_key_values is None:
66
+ return 0
67
+ seq_len = past_key_values.get_seq_length()
68
+ if torch.is_tensor(seq_len):
69
+ return int(seq_len.item())
70
+ return int(seq_len)
71
+
72
+
73
+ def _cache_max_len_int(past_key_values: Optional[Cache]) -> int:
74
+ if past_key_values is None:
75
+ return -1
76
+ max_len = past_key_values.get_max_cache_shape()
77
+ if torch.is_tensor(max_len):
78
+ return int(max_len.item())
79
+ return int(max_len)
80
+
81
+
82
+ def _iter_cache_key_values(
83
+ past_key_values: Cache,
84
+ ) -> Iterable[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]:
85
+ layers = getattr(past_key_values, "layers", None)
86
+ if layers is not None:
87
+ for layer in layers:
88
+ yield getattr(layer, "keys", None), getattr(layer, "values", None)
89
+ return
90
+ for layer in past_key_values:
91
+ yield layer[0], layer[1]
92
+
93
+
94
+ class _DepthDecodeStaticLayerCache:
95
+ is_compileable = False
96
+ is_sliding = False
97
+
98
+ def __init__(self, max_cache_len: int) -> None:
99
+ self.max_cache_len = int(max_cache_len)
100
+ self.cumulative_length = 0
101
+ self.keys: Optional[torch.Tensor] = None
102
+ self.values: Optional[torch.Tensor] = None
103
+
104
+ def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
105
+ bsz, n_heads = key_states.shape[:2]
106
+ self.keys = torch.empty(
107
+ (bsz, n_heads, self.max_cache_len, key_states.shape[-1]),
108
+ dtype=key_states.dtype,
109
+ device=key_states.device,
110
+ )
111
+ self.values = torch.empty(
112
+ (bsz, n_heads, self.max_cache_len, value_states.shape[-1]),
113
+ dtype=value_states.dtype,
114
+ device=value_states.device,
115
+ )
116
+
117
+ def update(
118
+ self,
119
+ key_states: torch.Tensor,
120
+ value_states: torch.Tensor,
121
+ *args,
122
+ **kwargs,
123
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
124
+ if self.keys is None:
125
+ self._allocate(key_states, value_states)
126
+ start = self.cumulative_length
127
+ end = start + key_states.shape[-2]
128
+ if end > self.max_cache_len:
129
+ raise RuntimeError(
130
+ f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}."
131
+ )
132
+ self.keys[:, :, start:end, :].copy_(key_states)
133
+ self.values[:, :, start:end, :].copy_(value_states)
134
+ self.cumulative_length = end
135
+ return self.keys[:, :, :end, :], self.values[:, :, :end, :]
136
+
137
+ def get_seq_length(self) -> int:
138
+ return self.cumulative_length
139
+
140
+ def get_max_cache_shape(self) -> int:
141
+ return -1
142
+
143
+ def reset(self) -> None:
144
+ self.cumulative_length = 0
145
+
146
+
147
class _DepthDecodeStaticCache(Cache):
    """Whole-model static KV cache: one fixed-capacity layer cache per decoder layer.

    Built on `_DepthDecodeStaticLayerCache`, so storage addresses stay stable
    across steps — required for CUDA-graph capture/replay.
    """

    def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
        # NOTE(review): callers pass `config.text_config` here, and this call
        # then applies get_text_config(decoder=True) on top — relies on that
        # being a no-op for a text config; confirm.
        text_config = config.get_text_config(decoder=True)
        super().__init__(
            layers=[
                _DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
                for _ in range(text_config.num_hidden_layers)
            ]
        )

    def get_seq_length(self, layer_idx: int = 0) -> int:
        """Current filled length of the given layer's cache."""
        return self.layers[layer_idx].get_seq_length()

    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
        """Delegates to the layer cache (which reports -1, i.e. 'unbounded')."""
        return self.layers[layer_idx].get_max_cache_shape()

    def reset(self) -> None:
        """Mark every layer cache empty without freeing its buffers."""
        for layer in self.layers:
            layer.reset()
166
+
167
+
168
class ActionCudaGraphManager:
    """Captures and replays the action-expert flow loop as a CUDA graph.

    Re-capture happens whenever the identity key of the inputs changes;
    otherwise new inputs are copied into the captured static buffers and the
    recorded graph is replayed.
    """

    def __init__(self, model: Any) -> None:
        self.model = model
        self.enabled = True
        self.action_flow_graph: Optional[_ActionFlowCudaGraph] = None

    def set_enabled(self, enabled: bool) -> None:
        """Globally enable or disable CUDA-graph capture for the action flow."""
        self.enabled = bool(enabled)

    def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
        """Return True when graph capture/replay is safe for these inputs.

        Requires eval mode on both the model and its action expert, and every
        tensor the captured graph would touch to live on a CUDA device.
        """
        action_model = self.model
        if not self.enabled:
            return False
        if action_model.training or action_model._require_action_expert().training:
            return False
        if inputs.trajectory.device.type != "cuda":
            return False

        def all_on_cuda():
            # Enumerate every tensor the captured graph would read.
            yield inputs.trajectory
            for k, v in inputs.context.kv_contexts:
                yield k
                yield v
            for t in (
                inputs.context.cross_mask,
                inputs.context.self_mask,
                inputs.context.valid_action,
                inputs.action_dim_is_pad,
            ):
                if t is not None:
                    yield t
            if inputs.context.rope_cache is not None:
                yield from inputs.context.rope_cache
            for step in inputs.modulations:
                yield step.conditioning
                for block_modulation in step.block_modulations:
                    yield from block_modulation
                yield from step.final_modulation

        return all(t.device.type == "cuda" for t in all_on_cuda())

    def run_action_flow(
        self,
        inputs: _ActionFlowInputs,
        steps: int,
        run_loop,
    ) -> torch.Tensor:
        """Run `run_loop(inputs, steps)` under a CUDA graph, (re)capturing on key change.

        `_cuda_graph_key`, `_clone_static_inputs`, `_capture_cuda_graph` and
        `_copy_inputs_` are module-level helpers defined elsewhere in this file.
        """
        key = _cuda_graph_key(inputs, steps)
        cache = self.action_flow_graph
        if cache is None or cache.key != key:
            # Capture path: clone inputs into graph-owned static storage and
            # record the loop. after_warmup re-copies the live trajectory into
            # static storage (warmup iterations presumably consume/overwrite it).
            static_inputs = _clone_static_inputs(inputs)
            graph, output = _capture_cuda_graph(
                lambda: run_loop(static_inputs, steps),
                inputs.trajectory.device,
                after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
            )
            cache = _ActionFlowCudaGraph(
                key=key,
                graph=graph,
                static_inputs=static_inputs,
                output=output,
            )
            self.action_flow_graph = cache
        else:
            # Replay path: refresh the static buffers with the new input values.
            _copy_inputs_(cache.static_inputs, inputs)

        cache.graph.replay()
        # Clone so callers don't hold a view into the graph's static output buffer.
        return cache.output.clone()
236
+
237
+
238
+ class DepthDecodeCudaGraphManager:
239
    def __init__(self, model: Any) -> None:
        self.model = model
        self.backbone = model.model  # the language backbone that owns `.transformer`
        self.enabled = True
        # Captured graph and memoized eligibility spec; both lazily populated.
        self.graph: Optional[_DepthDecodeCudaGraph] = None
        self.graph_spec: Optional[_DepthDecodeCudaGraphSpec] = None

    def set_enabled(self, enabled: bool) -> None:
        """Globally enable or disable CUDA-graph depth decoding."""
        self.enabled = bool(enabled)

    def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
        """Build the fixed-capacity KV cache that the graphed decode path requires.

        NOTE(review): passes `config.text_config` while the cache __init__ then
        calls `get_text_config(decoder=True)` on it — relies on that call being
        a no-op for a text config; confirm.
        """
        return _DepthDecodeStaticCache(
            config=self.model.config.text_config,
            max_cache_len=max_cache_len,
        )
254
+
255
+ def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
256
+ static = self.graph_spec
257
+ if static is None:
258
+ cfg = self.backbone.transformer.config
259
+ rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
260
+ static = _DepthDecodeCudaGraphSpec(
261
+ eligible=(
262
+ not cfg.norm_after
263
+ and cfg.rope_scaling_layers is None
264
+ and getattr(rotary_emb, "rope_type", None) == "default"
265
+ and cfg._attn_implementation == "sdpa"
266
+ ),
267
+ cache_key_prefix=(
268
+ cfg.hidden_size,
269
+ cfg.num_attention_heads,
270
+ cfg.num_key_value_heads,
271
+ cfg.head_dim,
272
+ cfg.num_hidden_layers,
273
+ cfg.use_qk_norm,
274
+ cfg.qk_norm_type,
275
+ cfg._attn_implementation,
276
+ ),
277
+ num_hidden_layers=cfg.num_hidden_layers,
278
+ head_dim=cfg.head_dim,
279
+ num_attention_heads=cfg.num_attention_heads,
280
+ )
281
+ self.graph_spec = static
282
+ return static
283
+
284
+ def can_use(
285
+ self,
286
+ next_input_ids: torch.Tensor,
287
+ *,
288
+ past_key_values: Cache,
289
+ attention_bias: torch.Tensor,
290
+ ) -> bool:
291
+ if (
292
+ not self.enabled
293
+ or self.model.training
294
+ or self.backbone.transformer.training
295
+ ):
296
+ return False
297
+ if next_input_ids.device.type != "cuda":
298
+ return False
299
+ if (
300
+ next_input_ids.ndim != 2
301
+ or next_input_ids.shape[0] != 1
302
+ or next_input_ids.shape[1] != 1
303
+ ):
304
+ return False
305
+ if not isinstance(past_key_values, _DepthDecodeStaticCache):
306
+ return False
307
+ if (
308
+ not torch.is_tensor(attention_bias)
309
+ or attention_bias.device != next_input_ids.device
310
+ ):
311
+ return False
312
+ return self._depth_decode_spec().eligible
313
+
314
+ def _depth_decode_key(
315
+ self,
316
+ next_input_ids: torch.Tensor,
317
+ attention_bias: torch.Tensor,
318
+ ) -> Tuple[Any, ...]:
319
+ device = next_input_ids.device
320
+ return (
321
+ self._depth_decode_spec().cache_key_prefix,
322
+ device.type,
323
+ device.index,
324
+ self.model.lm_head.weight.dtype,
325
+ attention_bias.shape[-1],
326
+ )
327
+
328
+ def _select_depth_decode_rope(
329
+ self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int
330
+ ) -> None:
331
+ emb = self.backbone.transformer.rotary_emb
332
+ cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
333
+ sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
334
+
335
+ def _depth_decode_pre_layer(
336
+ self,
337
+ layer_idx: int,
338
+ hidden_states: torch.Tensor,
339
+ cos: torch.Tensor,
340
+ sin: torch.Tensor,
341
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
342
+ block = self.backbone.transformer.blocks[layer_idx]
343
+ attention = block.self_attn
344
+ residual = hidden_states
345
+ hidden_states = block.attn_norm(hidden_states)
346
+
347
+ input_shape = hidden_states.shape[:-1]
348
+ hidden_shape = (*input_shape, -1, attention.head_dim)
349
+ qkv = attention.att_proj(hidden_states)
350
+ query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
351
+ value_states = value_states.view(hidden_shape)
352
+
353
+ apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
354
+ norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"
355
+
356
+ if apply_qk_norm and not norm_after_view:
357
+ query_states = attention.q_norm(query_states)
358
+ key_states = attention.k_norm(key_states)
359
+
360
+ query_states = query_states.view(hidden_shape)
361
+ key_states = key_states.view(hidden_shape)
362
+
363
+ if norm_after_view:
364
+ query_states = attention.q_norm(query_states)
365
+ key_states = attention.k_norm(key_states)
366
+
367
+ query_states = query_states.transpose(1, 2)
368
+ key_states = key_states.transpose(1, 2)
369
+ value_states = value_states.transpose(1, 2)
370
+ query_states, key_states = _apply_rotary_pos_emb(
371
+ query_states, key_states, cos, sin
372
+ )
373
+ return residual, query_states, key_states, value_states
374
+
375
+ def _depth_decode_pre0(
376
+ self,
377
+ token_ids: torch.Tensor,
378
+ cos: torch.Tensor,
379
+ sin: torch.Tensor,
380
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
381
+ inputs_embeds = self.model._embed_base_tokens(token_ids)
382
+ return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)
383
+
384
+ def _depth_decode_post_layer(
385
+ self,
386
+ layer_idx: int,
387
+ residual: torch.Tensor,
388
+ attn_context: torch.Tensor,
389
+ ) -> torch.Tensor:
390
+ block = self.backbone.transformer.blocks[layer_idx]
391
+ attention = block.self_attn
392
+ input_shape = residual.shape[:-1]
393
+ attn_output = attn_context.reshape(*input_shape, -1).contiguous()
394
+ attn_output = attention.attn_out(attn_output)
395
+ hidden_states = residual + block.dropout(attn_output)
396
+
397
+ residual = hidden_states
398
+ hidden_states = block.ff_norm(hidden_states)
399
+ hidden_states = block.mlp(hidden_states)
400
+ hidden_states = residual + block.dropout(hidden_states)
401
+ return hidden_states
402
+
403
+ def _depth_decode_post_and_pre_next(
404
+ self,
405
+ layer_idx: int,
406
+ residual: torch.Tensor,
407
+ attn_context: torch.Tensor,
408
+ cos: torch.Tensor,
409
+ sin: torch.Tensor,
410
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
411
+ hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
412
+ return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)
413
+
414
+ def _depth_decode_last_post(
415
+ self,
416
+ layer_idx: int,
417
+ residual: torch.Tensor,
418
+ attn_context: torch.Tensor,
419
+ ) -> torch.Tensor:
420
+ hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
421
+ return self.backbone.transformer.ln_f(hidden_states)
422
+
423
+ def _build_depth_decode_graph(
424
+ self,
425
+ next_input_ids: torch.Tensor,
426
+ *,
427
+ past_length: int,
428
+ attention_bias: torch.Tensor,
429
+ ) -> _DepthDecodeCudaGraph:
430
+ text_config = self.backbone.transformer.config
431
+ device = next_input_ids.device
432
+ dtype = self.model.lm_head.weight.dtype
433
+ static = self._depth_decode_spec()
434
+ num_layers = static.num_hidden_layers
435
+ head_dim = static.head_dim
436
+ max_cache_len = int(attention_bias.shape[-1])
437
+ max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
438
+ self.backbone.transformer.prepare_rope_cache(
439
+ device=device, max_seq_len=max_rope_len
440
+ )
441
+
442
+ token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
443
+ cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
444
+ sin = torch.empty_like(cos)
445
+ positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
446
+ context_shape = (1, 1, static.num_attention_heads, head_dim)
447
+
448
+ token_ids.copy_(next_input_ids)
449
+ self._select_depth_decode_rope(cos, sin, past_length=past_length)
450
+
451
+ pre_graph, pre_output = _capture_cuda_graph(
452
+ lambda: self._depth_decode_pre0(token_ids, cos, sin),
453
+ device,
454
+ )
455
+ stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
456
+ post_graphs = []
457
+ for layer_idx in range(num_layers - 1):
458
+ stage = stages[-1]
459
+ attn_context = torch.empty(context_shape, device=device, dtype=dtype)
460
+ graph, output = _capture_cuda_graph(
461
+ lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
462
+ self._depth_decode_post_and_pre_next(
463
+ layer_idx,
464
+ stage.residual,
465
+ attn_context,
466
+ cos,
467
+ sin,
468
+ )
469
+ ),
470
+ device,
471
+ )
472
+ post_graphs.append(
473
+ _DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context)
474
+ )
475
+ stages.append(_DepthDecodeCudaGraphLayerStage(*output))
476
+
477
+ last_stage = stages[-1]
478
+ last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
479
+ last_graph, last_output = _capture_cuda_graph(
480
+ lambda: self._depth_decode_last_post(
481
+ num_layers - 1,
482
+ last_stage.residual,
483
+ last_attn_context,
484
+ ),
485
+ device,
486
+ )
487
+ post_graphs.append(
488
+ _DepthDecodeCudaGraphPostStage(
489
+ graph=last_graph, attn_context=last_attn_context
490
+ )
491
+ )
492
+ return _DepthDecodeCudaGraph(
493
+ cache_key=self._depth_decode_key(next_input_ids, attention_bias),
494
+ pre_graph=pre_graph,
495
+ token_ids=token_ids,
496
+ cos=cos,
497
+ sin=sin,
498
+ positions=positions,
499
+ stages=tuple(stages),
500
+ post_graphs=tuple(post_graphs),
501
+ output=last_output,
502
+ )
503
+
504
+ def _get_depth_decode_graph(
505
+ self,
506
+ next_input_ids: torch.Tensor,
507
+ *,
508
+ past_length: int,
509
+ attention_bias: torch.Tensor,
510
+ ) -> _DepthDecodeCudaGraph:
511
+ key = self._depth_decode_key(next_input_ids, attention_bias)
512
+ decode_graph = self.graph
513
+ if decode_graph is None or decode_graph.cache_key != key:
514
+ decode_graph = self._build_depth_decode_graph(
515
+ next_input_ids,
516
+ past_length=past_length,
517
+ attention_bias=attention_bias,
518
+ )
519
+ self.graph = decode_graph
520
+ else:
521
+ decode_graph.token_ids.copy_(next_input_ids)
522
+ self._select_depth_decode_rope(
523
+ decode_graph.cos, decode_graph.sin, past_length=past_length
524
+ )
525
+ return decode_graph
526
+
527
+ def _run_depth_decode_attention_core(
528
+ self,
529
+ layer_idx: int,
530
+ stage: _DepthDecodeCudaGraphLayerStage,
531
+ *,
532
+ past_key_values: Cache,
533
+ attention_bias: torch.Tensor,
534
+ cache_position: torch.Tensor,
535
+ cos: torch.Tensor,
536
+ sin: torch.Tensor,
537
+ ) -> torch.Tensor:
538
+ attention = self.backbone.transformer.blocks[layer_idx].self_attn
539
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
540
+ key_states, value_states = past_key_values.update(
541
+ stage.key,
542
+ stage.value,
543
+ layer_idx,
544
+ cache_kwargs,
545
+ )
546
+ key_states = _repeat_kv(key_states, attention.num_key_value_groups)
547
+ value_states = _repeat_kv(value_states, attention.num_key_value_groups)
548
+ attn_output = F.scaled_dot_product_attention(
549
+ stage.query,
550
+ key_states,
551
+ value_states,
552
+ attn_mask=attention_bias,
553
+ dropout_p=0.0,
554
+ is_causal=False,
555
+ )
556
+ return attn_output.transpose(1, 2)
557
+
558
+ def run(
559
+ self,
560
+ next_input_ids: torch.Tensor,
561
+ *,
562
+ past_key_values: Cache,
563
+ attention_bias: torch.Tensor,
564
+ past_length: int,
565
+ ) -> Tuple[torch.Tensor, Cache]:
566
+ end = past_length + 1
567
+ decode_graph = self._get_depth_decode_graph(
568
+ next_input_ids,
569
+ past_length=past_length,
570
+ attention_bias=attention_bias,
571
+ )
572
+ cache_position = decode_graph.positions[past_length:end]
573
+ attention_bias_q = attention_bias[:, :, past_length:end, :end]
574
+
575
+ decode_graph.pre_graph.replay()
576
+
577
+ for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
578
+ attn_context = self._run_depth_decode_attention_core(
579
+ layer_idx,
580
+ decode_graph.stages[layer_idx],
581
+ past_key_values=past_key_values,
582
+ attention_bias=attention_bias_q,
583
+ cache_position=cache_position,
584
+ cos=decode_graph.cos,
585
+ sin=decode_graph.sin,
586
+ )
587
+ post_graph.attn_context.copy_(attn_context)
588
+ post_graph.graph.replay()
589
+
590
+ return decode_graph.output, past_key_values
591
+
592
+
593
+ def _cuda_graph_tensor_signature(
594
+ tensor: Optional[torch.Tensor],
595
+ ) -> Optional[Tuple[Any, ...]]:
596
+ if tensor is None:
597
+ return None
598
+ return (
599
+ tuple(tensor.shape),
600
+ tuple(tensor.stride()),
601
+ str(tensor.dtype),
602
+ str(tensor.device),
603
+ )
604
+
605
+
606
+ def _cuda_graph_context_signature(context: Any) -> Tuple[Any, ...]:
607
+ sig = _cuda_graph_tensor_signature
608
+ return (
609
+ tuple((sig(k), sig(v)) for k, v in context.kv_contexts),
610
+ sig(context.cross_mask),
611
+ sig(context.self_mask),
612
+ sig(context.valid_action),
613
+ None
614
+ if context.rope_cache is None
615
+ else tuple(sig(t) for t in context.rope_cache),
616
+ )
617
+
618
+
619
+ def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> Tuple[Any, ...]:
620
+ sig = _cuda_graph_tensor_signature
621
+ return tuple(
622
+ (
623
+ sig(step.conditioning),
624
+ tuple(
625
+ tuple(sig(t) for t in block_modulation)
626
+ for block_modulation in step.block_modulations
627
+ ),
628
+ tuple(sig(t) for t in step.final_modulation),
629
+ )
630
+ for step in modulations
631
+ )
632
+
633
+
634
+ def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> Tuple[Any, ...]:
635
+ sig = _cuda_graph_tensor_signature
636
+ return (
637
+ sig(inputs.trajectory),
638
+ _cuda_graph_context_signature(inputs.context),
639
+ _cuda_graph_modulation_signature(inputs.modulations),
640
+ sig(inputs.action_dim_is_pad),
641
+ int(steps),
642
+ )
643
+
644
+
645
+ def _clone_static_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
646
+ if tensor is None:
647
+ return None
648
+ static = torch.empty_strided(
649
+ tuple(tensor.shape),
650
+ tuple(tensor.stride()),
651
+ device=tensor.device,
652
+ dtype=tensor.dtype,
653
+ )
654
+ static.copy_(tensor)
655
+ return static
656
+
657
+
658
+ def _clone_static_context(context: Any) -> Any:
659
+ rope_cache = None
660
+ if context.rope_cache is not None:
661
+ rope_cache = tuple(_clone_static_tensor(t) for t in context.rope_cache)
662
+ return context.__class__(
663
+ kv_contexts=tuple(
664
+ (_clone_static_tensor(k), _clone_static_tensor(v))
665
+ for k, v in context.kv_contexts
666
+ ),
667
+ cross_mask=_clone_static_tensor(context.cross_mask),
668
+ self_mask=_clone_static_tensor(context.self_mask),
669
+ valid_action=_clone_static_tensor(context.valid_action),
670
+ rope_cache=rope_cache,
671
+ )
672
+
673
+
674
+ def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
675
+ return tuple(
676
+ step.__class__(
677
+ conditioning=_clone_static_tensor(step.conditioning),
678
+ block_modulations=tuple(
679
+ tuple(_clone_static_tensor(t) for t in block_modulation)
680
+ for block_modulation in step.block_modulations
681
+ ),
682
+ final_modulation=tuple(
683
+ _clone_static_tensor(t) for t in step.final_modulation
684
+ ),
685
+ )
686
+ for step in modulations
687
+ )
688
+
689
+
690
+ def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
691
+ return _ActionFlowInputs(
692
+ trajectory=_clone_static_tensor(inputs.trajectory),
693
+ context=_clone_static_context(inputs.context),
694
+ modulations=_clone_static_modulations(inputs.modulations),
695
+ action_dim_is_pad=_clone_static_tensor(inputs.action_dim_is_pad),
696
+ )
697
+
698
+
699
+ def _copy_context_(dst: Any, src: Any) -> None:
700
+ for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
701
+ dst_k.copy_(src_k)
702
+ dst_v.copy_(src_v)
703
+ if src.cross_mask is not None:
704
+ dst.cross_mask.copy_(src.cross_mask)
705
+ if src.self_mask is not None:
706
+ dst.self_mask.copy_(src.self_mask)
707
+ if src.valid_action is not None:
708
+ dst.valid_action.copy_(src.valid_action)
709
+ if src.rope_cache is not None:
710
+ for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
711
+ dst_tensor.copy_(src_tensor)
712
+
713
+
714
+ def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
715
+ dst.trajectory.copy_(src.trajectory)
716
+ _copy_context_(dst.context, src.context)
717
+ if src.action_dim_is_pad is not None:
718
+ dst.action_dim_is_pad.copy_(src.action_dim_is_pad)
719
+
720
+
721
+ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
722
+ x1 = x[..., : x.shape[-1] // 2]
723
+ x2 = x[..., x.shape[-1] // 2 :]
724
+ return torch.cat((-x2, x1), dim=-1)
725
+
726
+
727
+ def _apply_rotary_pos_emb(
728
+ q: torch.Tensor,
729
+ k: torch.Tensor,
730
+ cos: torch.Tensor,
731
+ sin: torch.Tensor,
732
+ unsqueeze_dim: int = 1,
733
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
734
+ cos = cos.unsqueeze(unsqueeze_dim)
735
+ sin = sin.unsqueeze(unsqueeze_dim)
736
+ q_embed = (q * cos) + (_rotate_half(q) * sin)
737
+ k_embed = (k * cos) + (_rotate_half(k) * sin)
738
+ return q_embed, k_embed
739
+
740
+
741
+ def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
742
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
743
+ if n_rep == 1:
744
+ return hidden_states
745
+ hidden_states = hidden_states[:, :, None, :, :].expand(
746
+ batch, num_key_value_heads, n_rep, slen, head_dim
747
+ )
748
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
749
+
750
+
751
+ def _capture_cuda_graph(
752
+ fn,
753
+ device: torch.device,
754
+ *,
755
+ after_warmup=None,
756
+ ) -> Tuple[torch.cuda.CUDAGraph, Any]:
757
+ warmup_stream = torch.cuda.Stream(device=device)
758
+ warmup_stream.wait_stream(torch.cuda.current_stream(device))
759
+ with torch.cuda.stream(warmup_stream):
760
+ fn()
761
+ torch.cuda.current_stream(device).wait_stream(warmup_stream)
762
+ if after_warmup is not None:
763
+ after_warmup()
764
+
765
+ graph = torch.cuda.CUDAGraph()
766
+ with torch.cuda.graph(graph):
767
+ output = fn()
768
+ return graph, output
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08663a88bc5c1f6c1cf8534a8cdf7971eb2fd66979ac42d38752d5209b971e6b
3
+ size 4929809880
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9282d1390ca3fe81f31eaa4a925bc881ecb7c56c315eb8d3d1e2f6616cba7af9
3
+ size 4844690992
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f35fdd96126d04975d1feae1b715a875d03ad9d07c7de25fa91a6573cceb1e7
3
+ size 4844691024
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1b546c537a6eb6b5e281e5b8d6819ffe8ee8f88c55da5025e1e6ee2721cc907
3
+ size 4998106920
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a925159b5e363f72007c38a2c4c2fc7cdac6e8b6ae990ddaa51f3abe526beb77
3
+ size 2345090936
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_molmoact2.py ADDED
The diff for this file is too large to render. See raw diff
 
norm_stats.json ADDED
@@ -0,0 +1,1739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format": "molmoact2_norm_stats.v1",
3
+ "norm_mode": "q01_q99",
4
+ "metadata_by_tag": {
5
+ "franka_molmoact": {
6
+ "action_key": "action.del_ee_action",
7
+ "state_key": "observation.state",
8
+ "camera_keys": [
9
+ "observation.images.primary",
10
+ "observation.images.secondary"
11
+ ],
12
+ "normalize_gripper": false,
13
+ "action_horizon": 10,
14
+ "n_action_steps": 10,
15
+ "setup_type": "single franka robotic arm in molmoact",
16
+ "control_mode": "delta end-effector pose",
17
+ "action_stats": {
18
+ "min": [
19
+ -0.07434078305959702,
20
+ -0.07339745759963989,
21
+ -0.06539416313171387,
22
+ -0.1688285619020462,
23
+ -0.10289879888296127,
24
+ -0.2667275667190552,
25
+ 0.0
26
+ ],
27
+ "max": [
28
+ 0.06042003631591797,
29
+ 0.09417290985584259,
30
+ 0.07019275426864624,
31
+ 0.2616892158985138,
32
+ 0.11751057207584381,
33
+ 0.16968433558940887,
34
+ 1.0
35
+ ],
36
+ "mean": [
37
+ 0.0005923698136522352,
38
+ 0.000245022598131832,
39
+ -4.604843771714063e-05,
40
+ 0.00022562421486693225,
41
+ -0.0005166618849942836,
42
+ -0.0002193919428051152,
43
+ 0.557619424517478
44
+ ],
45
+ "std": [
46
+ 0.005274540883280089,
47
+ 0.007662320435387572,
48
+ 0.006516662891595147,
49
+ 0.013564563259375743,
50
+ 0.011179215063905077,
51
+ 0.015195633113705318,
52
+ 0.49666890583432166
53
+ ],
54
+ "count": [
55
+ 1482599.0
56
+ ],
57
+ "q01": [
58
+ -0.011251236153566059,
59
+ -0.014918113203115847,
60
+ -0.011753186696798671,
61
+ -0.02785908205770074,
62
+ -0.025679744407356857,
63
+ -0.03279371599275369,
64
+ 3.7096921558780464e-05
65
+ ],
66
+ "q10": [
67
+ -0.005157558671709432,
68
+ -0.007627389508279324,
69
+ -0.006774633516067545,
70
+ -0.013867640389035468,
71
+ -0.01314412247667587,
72
+ -0.016390209597024155,
73
+ 0.012615970474925397
74
+ ],
75
+ "q50": [
76
+ 0.00047587496704591567,
77
+ -5.756867525949417e-05,
78
+ -0.0004126053693703461,
79
+ 0.00010505655624582394,
80
+ 6.41251115100509e-05,
81
+ -0.00028445035571581385,
82
+ 0.6884608295876035
83
+ ],
84
+ "q90": [
85
+ 0.006329953764757322,
86
+ 0.008685542226677301,
87
+ 0.008054135204293992,
88
+ 0.01397800046720906,
89
+ 0.010417940135392682,
90
+ 0.016052135597642597,
91
+ 0.9523435823006378
92
+ ],
93
+ "q99": [
94
+ 0.01117553552099105,
95
+ 0.016859041899882184,
96
+ 0.015590574732817865,
97
+ 0.029286192888436802,
98
+ 0.023178728984454205,
99
+ 0.031348125431223534,
100
+ 0.9523741512556912
101
+ ],
102
+ "names": [
103
+ "x",
104
+ "y",
105
+ "z",
106
+ "rx",
107
+ "ry",
108
+ "rz",
109
+ "gripper"
110
+ ],
111
+ "mask": [
112
+ true,
113
+ true,
114
+ true,
115
+ true,
116
+ true,
117
+ true,
118
+ false
119
+ ]
120
+ },
121
+ "state_stats": {
122
+ "min": [
123
+ -0.26428329944610596,
124
+ -0.6690786480903625,
125
+ -0.11737073212862015,
126
+ -3.141592264175415,
127
+ -1.4651211500167847,
128
+ -2.9524343013763428,
129
+ -0.014517154544591904
130
+ ],
131
+ "max": [
132
+ 0.8226616978645325,
133
+ 0.7252005338668823,
134
+ 0.9137527346611023,
135
+ 3.141592264175415,
136
+ 1.3202887773513794,
137
+ 1.35053551197052,
138
+ 1.0004678964614868
139
+ ],
140
+ "mean": [
141
+ 0.524974531443455,
142
+ -0.009077995640631331,
143
+ 0.37626277677807307,
144
+ -1.1230985182050761,
145
+ -0.15037831955429493,
146
+ -0.8360877101239638,
147
+ 0.4828000792054066
148
+ ],
149
+ "std": [
150
+ 0.108734747557998,
151
+ 0.19003219833018514,
152
+ 0.13128520583933115,
153
+ 2.684752550519181,
154
+ 0.2757065272207643,
155
+ 0.41552838131031417,
156
+ 0.44164051887464084
157
+ ],
158
+ "count": [
159
+ 1482599.0
160
+ ],
161
+ "q01": [
162
+ 0.3835934249761093,
163
+ -0.16975945635008685,
164
+ 0.26948875059068883,
165
+ -3.1154851500608602,
166
+ -0.3736599588300681,
167
+ -1.1556019879922976,
168
+ -0.010862173339892917
169
+ ],
170
+ "q10": [
171
+ 0.4262234785864046,
172
+ -0.14936716135148972,
173
+ 0.28255691882796946,
174
+ -2.886297848869603,
175
+ -0.33795875325689667,
176
+ -1.111211596662902,
177
+ -0.010235056183139664
178
+ ],
179
+ "q50": [
180
+ 0.526748316869364,
181
+ -0.018291417601920976,
182
+ 0.37479483390901625,
183
+ -1.3501052773711595,
184
+ -0.15482441515331163,
185
+ -0.8602460018117026,
186
+ 0.5755928858365665
187
+ ],
188
+ "q90": [
189
+ 0.6151325476883934,
190
+ 0.13938787377123313,
191
+ 0.47141213932315906,
192
+ 0.9833625012077128,
193
+ 0.044654971996918924,
194
+ -0.5307531964313489,
195
+ 0.8665980232471624
196
+ ],
197
+ "q99": [
198
+ 0.6251391330078412,
199
+ 0.16582465215431033,
200
+ 0.5049115299029577,
201
+ 1.351274663819693,
202
+ 0.09734563994616442,
203
+ -0.4656737437923123,
204
+ 0.8676457869434169
205
+ ],
206
+ "names": [
207
+ "x",
208
+ "y",
209
+ "z",
210
+ "rx",
211
+ "ry",
212
+ "rz",
213
+ "gripper"
214
+ ],
215
+ "mask": [
216
+ true,
217
+ true,
218
+ true,
219
+ true,
220
+ true,
221
+ true,
222
+ false
223
+ ]
224
+ }
225
+ },
226
+ "franka_droid": {
227
+ "action_key": "action",
228
+ "state_key": "observation.state",
229
+ "camera_keys": [
230
+ "observation.images.exterior_1_left",
231
+ "observation.images.exterior_2_left",
232
+ "observation.images.wrist_left"
233
+ ],
234
+ "normalize_gripper": false,
235
+ "action_horizon": 15,
236
+ "n_action_steps": 15,
237
+ "setup_type": "single franka robotic arm in droid",
238
+ "control_mode": "absolute joint pose",
239
+ "action_stats": {
240
+ "min": [
241
+ -2.781099557876587,
242
+ -1.6407934427261353,
243
+ -2.7493984699249268,
244
+ -2.9508564472198486,
245
+ -2.7826988697052,
246
+ 0.17983438074588776,
247
+ -2.901715040206909,
248
+ 0.0
249
+ ],
250
+ "max": [
251
+ 2.7449073791503906,
252
+ 1.6668277978897095,
253
+ 2.7546653747558594,
254
+ -0.1936211884021759,
255
+ 2.7786083221435547,
256
+ 4.402013778686523,
257
+ 2.90183162689209,
258
+ 1.0
259
+ ],
260
+ "mean": [
261
+ 0.010418229819396566,
262
+ 0.28233935319840636,
263
+ -0.015346633420959944,
264
+ -2.0060878874674715,
265
+ -0.029448930257783886,
266
+ 2.350942437431684,
267
+ 0.09820869537671756,
268
+ 0.4390250813949694
269
+ ],
270
+ "std": [
271
+ 0.3170372143097277,
272
+ 0.4863630998896905,
273
+ 0.27477375809610444,
274
+ 0.48806966037647037,
275
+ 0.528105567983804,
276
+ 0.4517944470893175,
277
+ 0.7430287051319469,
278
+ 0.44171628153080567
279
+ ],
280
+ "count": [
281
+ 17758044.0
282
+ ],
283
+ "q01": [
284
+ -0.2879620949867506,
285
+ -0.5702219304684566,
286
+ -0.31101638810433413,
287
+ -2.5622234922052725,
288
+ -0.5101021838814974,
289
+ 1.7376836093987995,
290
+ -0.5227783063045004,
291
+ 8.3274762776141e-05
292
+ ],
293
+ "q10": [
294
+ -0.2014563967491066,
295
+ -0.1953558605308627,
296
+ -0.20948523622127932,
297
+ -2.402722104277799,
298
+ -0.3766226599271436,
299
+ 1.9723782378158212,
300
+ -0.35517365133256956,
301
+ 0.006328163222238114
302
+ ],
303
+ "q50": [
304
+ 0.007162417319678469,
305
+ 0.336456475452052,
306
+ -0.013974891825914252,
307
+ -2.008015245005848,
308
+ -0.025656272672692895,
309
+ 2.3675065323600304,
310
+ 0.09545267517159627,
311
+ 0.4136583280016901
312
+ ],
313
+ "q90": [
314
+ 0.226670788411882,
315
+ 0.6598602025239771,
316
+ 0.17603345191458397,
317
+ -1.6119685483207011,
318
+ 0.3092560943750579,
319
+ 2.7008573894589345,
320
+ 0.5521874183259908,
321
+ 0.8737818408325101
322
+ ],
323
+ "q99": [
324
+ 0.32190872731317344,
325
+ 0.7405054873177153,
326
+ 0.2737893247287367,
327
+ -1.5075067942029405,
328
+ 0.4329542718063284,
329
+ 2.804162424656418,
330
+ 0.7128911284154664,
331
+ 0.8917437724235555
332
+ ],
333
+ "names": [
334
+ "joint_0",
335
+ "joint_1",
336
+ "joint_2",
337
+ "joint_3",
338
+ "joint_4",
339
+ "joint_5",
340
+ "joint_6",
341
+ "gripper"
342
+ ],
343
+ "mask": [
344
+ true,
345
+ true,
346
+ true,
347
+ true,
348
+ true,
349
+ true,
350
+ true,
351
+ false
352
+ ]
353
+ },
354
+ "state_stats": {
355
+ "min": [
356
+ -2.6536705493927,
357
+ -1.6156227588653564,
358
+ -2.6781487464904785,
359
+ -2.9409868717193604,
360
+ -2.6705946922302246,
361
+ 0.24893812835216522,
362
+ -2.757359266281128,
363
+ 0.0
364
+ ],
365
+ "max": [
366
+ 2.6687583923339844,
367
+ 1.5840554237365723,
368
+ 2.666306734085083,
369
+ -0.29779934883117676,
370
+ 2.6624162197113037,
371
+ 4.272191524505615,
372
+ 2.755643367767334,
373
+ 1.0
374
+ ],
375
+ "mean": [
376
+ 0.011081824850861873,
377
+ 0.27280296447760194,
378
+ -0.01550719225628586,
379
+ -2.01647228106023,
380
+ -0.029620826332964655,
381
+ 2.3483866081585507,
382
+ 0.09636965416886735,
383
+ 0.3927326432557614
384
+ ],
385
+ "std": [
386
+ 0.31291266868924655,
387
+ 0.4934370267472678,
388
+ 0.2728791258795487,
389
+ 0.48437020229024425,
390
+ 0.521435680610052,
391
+ 0.44821751701382595,
392
+ 0.7352730961005634,
393
+ 0.4070640216658998
394
+ ],
395
+ "count": [
396
+ 17758044.0
397
+ ],
398
+ "q01": [
399
+ -0.2793009809782748,
400
+ -0.5873924424866738,
401
+ -0.3058546817065916,
402
+ -2.5639055042030354,
403
+ -0.491431808753978,
404
+ 1.7381500993283228,
405
+ -0.5086147192989775,
406
+ 1.6414552399718753e-05
407
+ ],
408
+ "q10": [
409
+ -0.1994457930505723,
410
+ -0.2381088441987148,
411
+ -0.2103897594636481,
412
+ -2.421918892949847,
413
+ -0.3725951094142233,
414
+ 1.961410109454104,
415
+ -0.35782982482940473,
416
+ 0.005005809072924616
417
+ ],
418
+ "q50": [
419
+ 0.007891181486763803,
420
+ 0.3376595448103942,
421
+ -0.014280627673021464,
422
+ -2.0134951539128574,
423
+ -0.025990006808582142,
424
+ 2.3690656185268972,
425
+ 0.09443906823538496,
426
+ 0.38343357074070045
427
+ ],
428
+ "q90": [
429
+ 0.22605189533019984,
430
+ 0.6543162155730768,
431
+ 0.17689204963635444,
432
+ -1.6243810394635305,
433
+ 0.30497772553178637,
434
+ 2.696376125344824,
435
+ 0.5494813775877777,
436
+ 0.7734412581580631
437
+ ],
438
+ "q99": [
439
+ 0.3148177895778054,
440
+ 0.7235689468221655,
441
+ 0.2683897323238184,
442
+ -1.530780071911146,
443
+ 0.415067150345451,
444
+ 2.7863710743039887,
445
+ 0.6952765173061115,
446
+ 0.7968550629755542
447
+ ],
448
+ "names": [
449
+ "joint_0",
450
+ "joint_1",
451
+ "joint_2",
452
+ "joint_3",
453
+ "joint_4",
454
+ "joint_5",
455
+ "joint_6",
456
+ "gripper"
457
+ ],
458
+ "mask": [
459
+ true,
460
+ true,
461
+ true,
462
+ true,
463
+ true,
464
+ true,
465
+ true,
466
+ false
467
+ ]
468
+ }
469
+ },
470
+ "google_robot_fractal": {
471
+ "action_key": "action",
472
+ "state_key": "observation.state",
473
+ "camera_keys": [
474
+ "observation.images.image"
475
+ ],
476
+ "normalize_gripper": false,
477
+ "action_horizon": 3,
478
+ "n_action_steps": 3,
479
+ "setup_type": "google robot in rt_1",
480
+ "control_mode": "delta end-effector pose",
481
+ "action_stats": {
482
+ "min": [
483
+ -2.0204520225524902,
484
+ -5.497899532318115,
485
+ -2.031663417816162,
486
+ -1.569917917251587,
487
+ -1.569892168045044,
488
+ -1.570419430732727,
489
+ 0.0
490
+ ],
491
+ "max": [
492
+ 2.9984593391418457,
493
+ 22.09052848815918,
494
+ 2.7507524490356445,
495
+ 1.570636510848999,
496
+ 1.5321086645126343,
497
+ 1.5691522359848022,
498
+ 1.0
499
+ ],
500
+ "mean": [
501
+ 0.006986742172085001,
502
+ 0.006266400645656189,
503
+ -0.012625619452946994,
504
+ 0.04333477176605177,
505
+ -0.005755843126369106,
506
+ 0.0009133710921551742,
507
+ 0.5354204546016331
508
+ ],
509
+ "std": [
510
+ 0.06943342828666754,
511
+ 0.05987580207886052,
512
+ 0.07384291122356837,
513
+ 0.15697640227077467,
514
+ 0.13192376844373777,
515
+ 0.1463219229157086,
516
+ 0.49874381100185294
517
+ ],
518
+ "count": [
519
+ 3786400.0
520
+ ],
521
+ "q01": [
522
+ -0.22488493870935375,
523
+ -0.14842987771463928,
524
+ -0.23165991540148315,
525
+ -0.3518507387123856,
526
+ -0.4191961375830685,
527
+ -0.43642424734739155,
528
+ -1.000000013351432e-10
529
+ ],
530
+ "q10": [
531
+ -0.057097137110108394,
532
+ -0.04180085777840345,
533
+ -0.08797302699742898,
534
+ -0.08695764133325046,
535
+ -0.14987822626697328,
536
+ -0.14407043696379337,
537
+ -1.000000013351432e-10
538
+ ],
539
+ "q50": [
540
+ 0.0024323156617234785,
541
+ 0.001999621430072272,
542
+ -0.006186507557852898,
543
+ 0.010844173385829027,
544
+ 9.716094932283909e-05,
545
+ 0.00029282634304717123,
546
+ 0.9998131999001298
547
+ ],
548
+ "q90": [
549
+ 0.0799327921066265,
550
+ 0.06281248479995295,
551
+ 0.05719906967641521,
552
+ 0.2181351081319081,
553
+ 0.12581539646577725,
554
+ 0.14653933152766907,
555
+ 0.999962639980026
556
+ ],
557
+ "q99": [
558
+ 0.1780379284730618,
559
+ 0.1492598341805028,
560
+ 0.2184954847280796,
561
+ 0.5894017219543457,
562
+ 0.3527610110385077,
563
+ 0.4478335709948289,
564
+ 0.9999962639980026
565
+ ],
566
+ "names": [
567
+ "x",
568
+ "y",
569
+ "z",
570
+ "roll",
571
+ "pitch",
572
+ "yaw",
573
+ "gripper"
574
+ ],
575
+ "mask": [
576
+ true,
577
+ true,
578
+ true,
579
+ true,
580
+ true,
581
+ true,
582
+ false
583
+ ]
584
+ },
585
+ "state_stats": {
586
+ "min": [
587
+ -0.4436439275741577,
588
+ -0.9970501065254211,
589
+ -0.006579156965017319,
590
+ -0.8643477559089661,
591
+ -0.7079970240592957,
592
+ -0.7688722014427185,
593
+ -0.4999994933605194,
594
+ 0.0
595
+ ],
596
+ "max": [
597
+ 1.0534898042678833,
598
+ 0.48018959164619446,
599
+ 1.6896663904190063,
600
+ 0.9999993443489075,
601
+ 0.9999874830245972,
602
+ 0.9554369449615479,
603
+ 0.9914546012878418,
604
+ 1.0
605
+ ],
606
+ "mean": [
607
+ 0.5582046028643476,
608
+ -0.08324323429555826,
609
+ 0.7708198142579598,
610
+ -0.24752762586024715,
611
+ 0.4959921774813562,
612
+ 0.0925577145133276,
613
+ 0.20941890216560163,
614
+ 0.42619563761216767
615
+ ],
616
+ "std": [
617
+ 0.12440319799919354,
618
+ 0.11571359399631491,
619
+ 0.2458943611771509,
620
+ 0.5132342578001884,
621
+ 0.5223439094545202,
622
+ 0.1666598633276366,
623
+ 0.27617123901287927,
624
+ 0.4538753441706389
625
+ ],
626
+ "count": [
627
+ 3786400.0
628
+ ],
629
+ "q01": [
630
+ 0.3249422830693862,
631
+ -0.28341992821874495,
632
+ 0.14102827969076331,
633
+ -0.6864852132802142,
634
+ -0.6809632829655476,
635
+ -0.36044700054021983,
636
+ -0.4542378536110671,
637
+ -1.000000013351432e-10
638
+ ],
639
+ "q10": [
640
+ 0.42490653590113253,
641
+ -0.2163404740670024,
642
+ 0.37762326560147996,
643
+ -0.6294334687684712,
644
+ -0.5920843577131312,
645
+ -0.09803071723264807,
646
+ -0.23202098126670248,
647
+ -1.000000013351432e-10
648
+ ],
649
+ "q50": [
650
+ 0.5389458633818717,
651
+ -0.10059445446247807,
652
+ 0.8738477700690715,
653
+ -0.4849259061727551,
654
+ 0.7293306254210121,
655
+ 0.09137287071030761,
656
+ 0.23796976550241536,
657
+ 0.1832136750707287
658
+ ],
659
+ "q90": [
660
+ 0.7370583820538442,
661
+ 0.08210784119164745,
662
+ 0.9798527660285249,
663
+ 0.7291734785677116,
664
+ 0.84104651841686,
665
+ 0.3032210107222038,
666
+ 0.5373912158511455,
667
+ 0.9999365636178622
668
+ ],
669
+ "q99": [
670
+ 0.8750117781915163,
671
+ 0.21252014598261149,
672
+ 1.0727446933587392,
673
+ 0.9378297494636977,
674
+ 0.9562844548524763,
675
+ 0.46002622460251424,
676
+ 0.721691133425786,
677
+ 0.9999936563617862
678
+ ],
679
+ "names": [
680
+ "x",
681
+ "y",
682
+ "z",
683
+ "rx",
684
+ "ry",
685
+ "rz",
686
+ "rw",
687
+ "gripper"
688
+ ],
689
+ "mask": [
690
+ true,
691
+ true,
692
+ true,
693
+ true,
694
+ true,
695
+ true,
696
+ true,
697
+ false
698
+ ]
699
+ }
700
+ },
701
+ "widowx_bridge": {
702
+ "action_key": "action",
703
+ "state_key": "observation.state",
704
+ "camera_keys": [
705
+ "observation.images.image_0",
706
+ "observation.images.image_1",
707
+ "observation.images.image_2",
708
+ "observation.images.image_3"
709
+ ],
710
+ "normalize_gripper": false,
711
+ "action_horizon": 5,
712
+ "n_action_steps": 5,
713
+ "setup_type": "single widowx robotic arm in bridge",
714
+ "control_mode": "delta end-effector pose",
715
+ "action_stats": {
716
+ "min": [
717
+ -0.4007510244846344,
718
+ -0.13874775171279907,
719
+ -0.22553899884223938,
720
+ -3.2010786533355713,
721
+ -1.8618112802505493,
722
+ -6.279075622558594,
723
+ 0.0
724
+ ],
725
+ "max": [
726
+ 0.41691166162490845,
727
+ 0.25864794850349426,
728
+ 0.21218234300613403,
729
+ 3.122201919555664,
730
+ 1.8618112802505493,
731
+ 6.272472858428955,
732
+ 1.0
733
+ ],
734
+ "mean": [
735
+ 0.00022731789976267202,
736
+ 0.0001311203695138562,
737
+ -0.00012641641264803482,
738
+ -0.00014410962647987843,
739
+ -0.0003903070519037156,
740
+ 0.00024063480455490454,
741
+ 0.5765894392570026
742
+ ],
743
+ "std": [
744
+ 0.009782343005332487,
745
+ 0.013714070718580267,
746
+ 0.012687395519404626,
747
+ 0.02848996416069207,
748
+ 0.030552792886390234,
749
+ 0.07751153262919225,
750
+ 0.49409209255711634
751
+ ],
752
+ "count": [
753
+ 1893026.0
754
+ ],
755
+ "q01": [
756
+ -0.02871995611488819,
757
+ -0.04170781908448411,
758
+ -0.02608340910386921,
759
+ -0.0808367313719228,
760
+ -0.09246813206247581,
761
+ -0.20693750972396757,
762
+ -1.000000013351432e-10
763
+ ],
764
+ "q10": [
765
+ -0.010151055597043716,
766
+ -0.014922217821287087,
767
+ -0.01393665282931255,
768
+ -0.029593090264604636,
769
+ -0.03406380769665256,
770
+ -0.06413116391050117,
771
+ -1.000000013351432e-10
772
+ ],
773
+ "q50": [
774
+ 2.1248139354103056e-05,
775
+ -9.382913823534339e-06,
776
+ -0.0008275577521357758,
777
+ -0.00014731252460737677,
778
+ 0.00047152176188271845,
779
+ 0.0012537133528066303,
780
+ 0.9998265319000765
781
+ ],
782
+ "q90": [
783
+ 0.011082387395765428,
784
+ 0.015737555353724994,
785
+ 0.016874204550636374,
786
+ 0.02832893788750676,
787
+ 0.0322629905973504,
788
+ 0.06417266804375155,
789
+ 0.9999653063800154
790
+ ],
791
+ "q99": [
792
+ 0.028291364035668985,
793
+ 0.040898679036702676,
794
+ 0.04018220331768194,
795
+ 0.08177042032653538,
796
+ 0.07759675528459531,
797
+ 0.203201938256362,
798
+ 0.9999965306380015
799
+ ],
800
+ "names": [
801
+ "x",
802
+ "y",
803
+ "z",
804
+ "roll",
805
+ "pitch",
806
+ "yaw",
807
+ "gripper"
808
+ ],
809
+ "mask": [
810
+ true,
811
+ true,
812
+ true,
813
+ true,
814
+ true,
815
+ true,
816
+ false
817
+ ]
818
+ },
819
+ "state_stats": {
820
+ "min": [
821
+ -0.04167502000927925,
822
+ -0.3563207685947418,
823
+ -0.15537554025650024,
824
+ -3.141592502593994,
825
+ -1.4992541074752808,
826
+ -3.14153790473938,
827
+ 0.0,
828
+ 0.04637829214334488
829
+ ],
830
+ "max": [
831
+ 0.5862360596656799,
832
+ 0.4034728705883026,
833
+ 0.3568263053894043,
834
+ 1.3517684936523438,
835
+ 1.570796251296997,
836
+ 3.141204357147217,
837
+ 0.0,
838
+ 1.1121242046356201
839
+ ],
840
+ "mean": [
841
+ 0.3094503633235095,
842
+ 0.030725376723448255,
843
+ 0.06443996750169499,
844
+ 0.0064906683342908335,
845
+ -0.07720050195254197,
846
+ 0.10766038148835028,
847
+ 0.0,
848
+ 0.7081244810708762
849
+ ],
850
+ "std": [
851
+ 0.06060302901710459,
852
+ 0.0919536927343182,
853
+ 0.05159382707079282,
854
+ 0.1312174751351825,
855
+ 0.16924010047039229,
856
+ 0.5787203550709503,
857
+ 0.0,
858
+ 0.35365012001260804
859
+ ],
860
+ "count": [
861
+ 1893026.0
862
+ ],
863
+ "q01": [
864
+ 0.17102651970064053,
865
+ -0.16977934478310977,
866
+ -0.05565095783375642,
867
+ -0.3649685841887744,
868
+ -0.5418705685890239,
869
+ -1.3540046312592247,
870
+ 0.0,
871
+ 0.05212163980268402
872
+ ],
873
+ "q10": [
874
+ 0.234054275333357,
875
+ -0.08584102855009192,
876
+ 0.007129108058706664,
877
+ -0.13279207613930774,
878
+ -0.2879179685802783,
879
+ -0.47590377710082316,
880
+ 0.0,
881
+ 0.08160105384386226
882
+ ],
883
+ "q50": [
884
+ 0.30824996150509265,
885
+ 0.02806205006373531,
886
+ 0.061364141277506515,
887
+ 0.003477529234181987,
888
+ -0.06586482997881163,
889
+ 0.033681061760553146,
890
+ 0.0,
891
+ 0.9850432498405283
892
+ ],
893
+ "q90": [
894
+ 0.3866535994382209,
895
+ 0.15225549791502352,
896
+ 0.1303319111924363,
897
+ 0.14920492884702988,
898
+ 0.11511126950562722,
899
+ 0.8206040455663128,
900
+ 0.0,
901
+ 1.0013512433353218
902
+ ],
903
+ "q99": [
904
+ 0.453255677819252,
905
+ 0.23543677111215228,
906
+ 0.19489739182202712,
907
+ 0.378015822982788,
908
+ 0.27597790842706504,
909
+ 1.8504199743270873,
910
+ 0.0,
911
+ 1.0106366157291133
912
+ ],
913
+ "names": [
914
+ "x",
915
+ "y",
916
+ "z",
917
+ "roll",
918
+ "pitch",
919
+ "yaw",
920
+ "pad",
921
+ "gripper"
922
+ ],
923
+ "mask": [
924
+ true,
925
+ true,
926
+ true,
927
+ true,
928
+ true,
929
+ true,
930
+ true,
931
+ false
932
+ ]
933
+ }
934
+ },
935
+ "so100_so101_molmoact2": {
936
+ "action_key": "action",
937
+ "state_key": "observation.state",
938
+ "camera_keys": [],
939
+ "normalize_gripper": true,
940
+ "action_horizon": 30,
941
+ "n_action_steps": 30,
942
+ "setup_type": "single so100/so101 robotic arm in molmoact2",
943
+ "control_mode": "absolute joint pose",
944
+ "action_stats": {
945
+ "min": [
946
+ -122.607421875,
947
+ -270.0,
948
+ -269.208984375,
949
+ -125.771484375,
950
+ -269.912109375,
951
+ -31.57327651977539
952
+ ],
953
+ "max": [
954
+ 179.208984375,
955
+ 219.638671875,
956
+ 195.380859375,
957
+ 178.9453125,
958
+ 269.82421875,
959
+ 119.40789031982422
960
+ ],
961
+ "mean": [
962
+ 3.343996486826433,
963
+ 125.7905980370996,
964
+ 120.20220128113388,
965
+ 55.88144220174933,
966
+ -11.543010633027725,
967
+ 11.25886240824774
968
+ ],
969
+ "std": [
970
+ 28.909870406169997,
971
+ 52.25069634659296,
972
+ 47.94432906599221,
973
+ 36.01019142727721,
974
+ 69.35504013212369,
975
+ 17.116239869449775
976
+ ],
977
+ "count": [
978
+ 19619650.0
979
+ ],
980
+ "q01": [
981
+ -42.1300246338976,
982
+ 45.18258358164995,
983
+ 35.40059182962813,
984
+ 4.929781836327758,
985
+ -65.57568617645342,
986
+ -0.3016556932619033
987
+ ],
988
+ "q10": [
989
+ -25.040070398997557,
990
+ 68.27827215165794,
991
+ 65.76540485606242,
992
+ 26.58811186925123,
993
+ -39.81868441470048,
994
+ 0.26123181871944706
995
+ ],
996
+ "q50": [
997
+ 3.0828094324713105,
998
+ 124.5495736487354,
999
+ 122.75175717637279,
1000
+ 57.77960070056314,
1001
+ -11.094802886190045,
1002
+ 4.866634607477139
1003
+ ],
1004
+ "q90": [
1005
+ 31.591544866079253,
1006
+ 181.76986724267596,
1007
+ 168.5741215400282,
1008
+ 82.4353358815596,
1009
+ 16.05609349144359,
1010
+ 32.12324970648343
1011
+ ],
1012
+ "q99": [
1013
+ 48.55349563198916,
1014
+ 186.10646680077767,
1015
+ 173.6076722013997,
1016
+ 93.41056417929472,
1017
+ 43.53107398260694,
1018
+ 44.74649336930881
1019
+ ],
1020
+ "names": [
1021
+ "shoulder_pan",
1022
+ "shoulder_lift",
1023
+ "elbow_flex",
1024
+ "wrist_flex",
1025
+ "wrist_roll",
1026
+ "gripper"
1027
+ ],
1028
+ "mask": [
1029
+ true,
1030
+ true,
1031
+ true,
1032
+ true,
1033
+ true,
1034
+ true
1035
+ ]
1036
+ },
1037
+ "state_stats": {
1038
+ "min": [
1039
+ -115.048828125,
1040
+ -270.0,
1041
+ -235.8984375,
1042
+ -113.818359375,
1043
+ -268.9453125,
1044
+ -8.521058082580566
1045
+ ],
1046
+ "max": [
1047
+ 178.505859375,
1048
+ 218.49609375,
1049
+ 192.041015625,
1050
+ 207.861328125,
1051
+ 250.048828125,
1052
+ 118.2519302368164
1053
+ ],
1054
+ "mean": [
1055
+ 3.3225097946752244,
1056
+ 124.40594064960378,
1057
+ 121.59550610749059,
1058
+ 55.903039878016074,
1059
+ -11.41740021122887,
1060
+ 13.358497334686597
1061
+ ],
1062
+ "std": [
1063
+ 28.79265204113751,
1064
+ 52.702867303079756,
1065
+ 47.00596021941705,
1066
+ 35.53803566355756,
1067
+ 69.12836626047817,
1068
+ 16.333280282904557
1069
+ ],
1070
+ "count": [
1071
+ 19619650.0
1072
+ ],
1073
+ "q01": [
1074
+ -41.90962240941357,
1075
+ 43.66791235922949,
1076
+ 38.38770483255723,
1077
+ 5.711740446834044,
1078
+ -63.44539045209019,
1079
+ 0.9435577790191543
1080
+ ],
1081
+ "q10": [
1082
+ -24.949315993050774,
1083
+ 66.30007546431412,
1084
+ 68.16816985859437,
1085
+ 27.120731646136054,
1086
+ -39.50255020332888,
1087
+ 1.6190225837869365
1088
+ ],
1089
+ "q50": [
1090
+ 3.066375725640164,
1091
+ 123.16482094240277,
1092
+ 124.39930058290133,
1093
+ 57.88605464633133,
1094
+ -11.037436711677765,
1095
+ 9.241478261568748
1096
+ ],
1097
+ "q90": [
1098
+ 31.472920732960127,
1099
+ 180.87158401301218,
1100
+ 168.5699720215359,
1101
+ 81.64709150074712,
1102
+ 15.887605114617852,
1103
+ 31.887861734718296
1104
+ ],
1105
+ "q99": [
1106
+ 48.29435703371732,
1107
+ 185.2611055842669,
1108
+ 173.13578487933165,
1109
+ 91.78122415137209,
1110
+ 42.94491979114059,
1111
+ 44.13755601580974
1112
+ ],
1113
+ "names": [
1114
+ "shoulder_pan",
1115
+ "shoulder_lift",
1116
+ "elbow_flex",
1117
+ "wrist_flex",
1118
+ "wrist_roll",
1119
+ "gripper"
1120
+ ],
1121
+ "mask": [
1122
+ true,
1123
+ true,
1124
+ true,
1125
+ true,
1126
+ true,
1127
+ true
1128
+ ]
1129
+ }
1130
+ },
1131
+ "google_robot_bc_z": {
1132
+ "action_key": "action",
1133
+ "state_key": "observation.state",
1134
+ "camera_keys": [
1135
+ "observation.images.image"
1136
+ ],
1137
+ "normalize_gripper": false,
1138
+ "action_horizon": 10,
1139
+ "n_action_steps": 10,
1140
+ "setup_type": "google robot in bc_z",
1141
+ "control_mode": "delta end-effector pose",
1142
+ "action_stats": {
1143
+ "min": [
1144
+ -0.1677047461271286,
1145
+ -0.14630407094955444,
1146
+ -0.10066790133714676,
1147
+ -0.29421567916870117,
1148
+ -0.32101404666900635,
1149
+ -0.4635624885559082,
1150
+ 0.0
1151
+ ],
1152
+ "max": [
1153
+ 0.2165454924106598,
1154
+ 0.1251407265663147,
1155
+ 0.09988310933113098,
1156
+ 0.33544227480888367,
1157
+ 0.28117990493774414,
1158
+ 0.40614867210388184,
1159
+ 1.0
1160
+ ],
1161
+ "mean": [
1162
+ -0.009960200864471745,
1163
+ 0.0009084977087131892,
1164
+ 0.00499393515302369,
1165
+ 0.00028739003438370427,
1166
+ -0.00871610909893306,
1167
+ -0.030692461306736755,
1168
+ 0.8343520005664466
1169
+ ],
1170
+ "std": [
1171
+ 0.03080177058689462,
1172
+ 0.023236620172139833,
1173
+ 0.020777592916798007,
1174
+ 0.041763587623031895,
1175
+ 0.046686683400427,
1176
+ 0.07753463216688747,
1177
+ 0.3717643553432202
1178
+ ],
1179
+ "count": [
1180
+ 5471693.0
1181
+ ],
1182
+ "q01": [
1183
+ -0.09213472068957661,
1184
+ -0.06450906318665113,
1185
+ -0.04912072456744037,
1186
+ -0.11609895664024446,
1187
+ -0.1413486404610977,
1188
+ -0.22517701597416145,
1189
+ -1.000000013351432e-10
1190
+ ],
1191
+ "q10": [
1192
+ -0.05253115985050928,
1193
+ -0.028533985817234882,
1194
+ -0.021736428190829056,
1195
+ -0.04809403695382897,
1196
+ -0.0664864549799673,
1197
+ -0.1391167833364122,
1198
+ -1.000000013351432e-10
1199
+ ],
1200
+ "q50": [
1201
+ -0.0031453596109414592,
1202
+ 0.0004054125482836473,
1203
+ 0.0023481391860319715,
1204
+ -8.489440239357886e-05,
1205
+ -0.002574837787014793,
1206
+ -0.014108526356650069,
1207
+ 0.9998801266205536
1208
+ ],
1209
+ "q90": [
1210
+ 0.019494707527676427,
1211
+ 0.029460992205482695,
1212
+ 0.032557826189659966,
1213
+ 0.04931595102291217,
1214
+ 0.042994841552155126,
1215
+ 0.05302803170853769,
1216
+ 0.9999760253241107
1217
+ ],
1218
+ "q99": [
1219
+ 0.07630278211772451,
1220
+ 0.05802308552485688,
1221
+ 0.052553275338456634,
1222
+ 0.1173714221625478,
1223
+ 0.11711249897425843,
1224
+ 0.1673988100025391,
1225
+ 0.9999976025324111
1226
+ ],
1227
+ "names": [
1228
+ "x",
1229
+ "y",
1230
+ "z",
1231
+ "roll",
1232
+ "pitch",
1233
+ "yaw",
1234
+ "gripper"
1235
+ ],
1236
+ "mask": [
1237
+ true,
1238
+ true,
1239
+ true,
1240
+ true,
1241
+ true,
1242
+ true,
1243
+ false
1244
+ ]
1245
+ },
1246
+ "state_stats": {
1247
+ "min": [
1248
+ -0.7190948724746704,
1249
+ -0.3756217360496521,
1250
+ -0.281008243560791,
1251
+ -2.400146484375,
1252
+ -2.500656843185425,
1253
+ -3.1274476051330566,
1254
+ 0.0,
1255
+ 0.0
1256
+ ],
1257
+ "max": [
1258
+ 0.6597589254379272,
1259
+ 0.7259413599967957,
1260
+ 1.1217665672302246,
1261
+ 2.2803165912628174,
1262
+ 1.8151572942733765,
1263
+ 3.1237573623657227,
1264
+ 0.0,
1265
+ 1.0
1266
+ ],
1267
+ "mean": [
1268
+ 0.0176884768449917,
1269
+ 0.10948195169606133,
1270
+ 0.784290845584472,
1271
+ -0.5290053991424425,
1272
+ -0.22605912165135514,
1273
+ -0.17858785012278866,
1274
+ 0.0,
1275
+ 0.5600556496096702
1276
+ ],
1277
+ "std": [
1278
+ 0.1841601172406892,
1279
+ 0.09627411033983578,
1280
+ 0.08699189118288073,
1281
+ 0.24700645691257475,
1282
+ 0.4286554852012691,
1283
+ 1.0001615516228195,
1284
+ 0.0,
1285
+ 0.3586031013748201
1286
+ ],
1287
+ "count": [
1288
+ 5471693.0
1289
+ ],
1290
+ "q01": [
1291
+ -0.38789819221198557,
1292
+ -0.1118956928319213,
1293
+ 0.6110697470322705,
1294
+ -1.0415028765133625,
1295
+ -1.1876200204022105,
1296
+ -2.3808376895782,
1297
+ 0.0,
1298
+ 0.19986777120588917
1299
+ ],
1300
+ "q10": [
1301
+ -0.2318964688694949,
1302
+ -0.015558046064633315,
1303
+ 0.6822309043992328,
1304
+ -0.7563316012340816,
1305
+ -0.7533119325741587,
1306
+ -1.3938289285869132,
1307
+ 0.0,
1308
+ 0.2000496453831541
1309
+ ],
1310
+ "q50": [
1311
+ 0.022859327635303822,
1312
+ 0.10637610222856157,
1313
+ 0.776611691927557,
1314
+ -0.5671171062825059,
1315
+ -0.24114911945667813,
1316
+ -0.25162686787881255,
1317
+ 0.0,
1318
+ 0.3501994619818789
1319
+ ],
1320
+ "q90": [
1321
+ 0.2666238156546802,
1322
+ 0.23844897018337458,
1323
+ 0.9059002565684082,
1324
+ -0.26983885858517637,
1325
+ 0.3994129877275485,
1326
+ 1.374448904817122,
1327
+ 0.0,
1328
+ 0.999900866490248
1329
+ ],
1330
+ "q99": [
1331
+ 0.3325375374171561,
1332
+ 0.31715197447407467,
1333
+ 0.982179447052214,
1334
+ 0.34632693633800826,
1335
+ 0.7713777675821983,
1336
+ 2.029990628516839,
1337
+ 0.0,
1338
+ 0.9999900866490248
1339
+ ],
1340
+ "names": [
1341
+ "x",
1342
+ "y",
1343
+ "z",
1344
+ "roll",
1345
+ "pitch",
1346
+ "yaw",
1347
+ "pad",
1348
+ "gripper"
1349
+ ],
1350
+ "mask": [
1351
+ true,
1352
+ true,
1353
+ true,
1354
+ true,
1355
+ true,
1356
+ true,
1357
+ true,
1358
+ false
1359
+ ]
1360
+ }
1361
+ },
1362
+ "yam_dual_molmoact2": {
1363
+ "action_key": "action",
1364
+ "state_key": "observation.state",
1365
+ "camera_keys": [
1366
+ "observation.images.top",
1367
+ "observation.images.left",
1368
+ "observation.images.right"
1369
+ ],
1370
+ "normalize_gripper": false,
1371
+ "action_horizon": 30,
1372
+ "n_action_steps": 30,
1373
+ "setup_type": "bimanual yam robotic arms in molmoact2",
1374
+ "control_mode": "absolute joint pose",
1375
+ "action_stats": {
1376
+ "min": [
1377
+ -1.9876782894134521,
1378
+ -0.007057297509163618,
1379
+ -0.002861066721379757,
1380
+ -1.6958495378494263,
1381
+ -1.5730143785476685,
1382
+ -2.184138298034668,
1383
+ 0.0,
1384
+ -1.6771572828292847,
1385
+ -0.00667582219466567,
1386
+ -0.0032425422687083483,
1387
+ -1.7061493396759033,
1388
+ -1.6287097930908203,
1389
+ -2.143320322036743,
1390
+ 0.0
1391
+ ],
1392
+ "max": [
1393
+ 1.808003306388855,
1394
+ 3.1988632678985596,
1395
+ 3.1507973670959473,
1396
+ 1.592851161956787,
1397
+ 1.5890363454818726,
1398
+ 2.2081711292266846,
1399
+ 1.0,
1400
+ 2.440871238708496,
1401
+ 3.1084535121917725,
1402
+ 3.1530861854553223,
1403
+ 1.6649500131607056,
1404
+ 1.5947585105895996,
1405
+ 2.1639199256896973,
1406
+ 1.0
1407
+ ],
1408
+ "mean": [
1409
+ -0.08857854148141169,
1410
+ 1.3813960226201991,
1411
+ 1.2242081192216245,
1412
+ -0.7456114034786908,
1413
+ 0.15342910390834139,
1414
+ -0.2406550926649683,
1415
+ 0.6405881969404109,
1416
+ 0.11816370494944337,
1417
+ 1.3440412881232742,
1418
+ 1.1275448419933234,
1419
+ -0.6567647967296087,
1420
+ -0.15745777770921981,
1421
+ 0.20879381691599022,
1422
+ 0.5971762495146153
1423
+ ],
1424
+ "std": [
1425
+ 0.31549225693975164,
1426
+ 0.7241109409894698,
1427
+ 0.6724976443740277,
1428
+ 0.4912531895036823,
1429
+ 0.3766601597067631,
1430
+ 0.3683009171682207,
1431
+ 0.41042883365599214,
1432
+ 0.33538355728349317,
1433
+ 0.8035033283123882,
1434
+ 0.7129305114483252,
1435
+ 0.5147389512393373,
1436
+ 0.37362261558635523,
1437
+ 0.35878804842243267,
1438
+ 0.42346789755808983
1439
+ ],
1440
+ "count": [
1441
+ 76046658.0
1442
+ ],
1443
+ "q01": [
1444
+ -0.6603105582072047,
1445
+ 0.0041340051935240115,
1446
+ 0.013831665477596221,
1447
+ -1.3744044717113109,
1448
+ -0.3593570239425977,
1449
+ -0.9302641712677729,
1450
+ 0.051016362361406005,
1451
+ -0.49367228465810536,
1452
+ 0.004744360313868616,
1453
+ 0.017154297804418434,
1454
+ -1.4240273823045295,
1455
+ -0.9737084779331572,
1456
+ -0.4719268433374943,
1457
+ 0.033350514024370274
1458
+ ],
1459
+ "q10": [
1460
+ -0.4158939180171844,
1461
+ 0.49040349295087926,
1462
+ 0.48318427047331663,
1463
+ -1.1595704371830307,
1464
+ -0.13299944787425266,
1465
+ -0.5670792130135129,
1466
+ 0.11117863560492024,
1467
+ -0.19067792775434206,
1468
+ 0.19335683280594596,
1469
+ 0.1783492294932824,
1470
+ -1.165289828212844,
1471
+ -0.5363078842413471,
1472
+ -0.11410713925580458,
1473
+ 0.054251135868839034
1474
+ ],
1475
+ "q50": [
1476
+ -0.07347940057883112,
1477
+ 1.4486934996424023,
1478
+ 1.2826819985862519,
1479
+ -0.8018464396181274,
1480
+ 0.11333067563787286,
1481
+ -0.22188306769880142,
1482
+ 0.7333514901431821,
1483
+ 0.08159376899519756,
1484
+ 1.542016049355695,
1485
+ 1.2518141457542857,
1486
+ -0.6816567194944295,
1487
+ -0.12921257250905716,
1488
+ 0.19217648232095094,
1489
+ 0.6965966006454063
1490
+ ],
1491
+ "q90": [
1492
+ 0.21224325405051755,
1493
+ 2.0044457220962184,
1494
+ 1.7599272535504926,
1495
+ -0.17992348512991949,
1496
+ 0.5121005560866031,
1497
+ 0.06588770556098025,
1498
+ 0.9798257827982823,
1499
+ 0.49762827627115913,
1500
+ 2.062871328579572,
1501
+ 1.7914606668876476,
1502
+ -0.07308204053490945,
1503
+ 0.182291301998786,
1504
+ 0.5569780500008801,
1505
+ 0.9922195168313757
1506
+ ],
1507
+ "q99": [
1508
+ 0.4704245731743921,
1509
+ 2.244327078820327,
1510
+ 2.0080105207169177,
1511
+ 0.13399061379118773,
1512
+ 0.8834156417282395,
1513
+ 0.334483290041328,
1514
+ 0.987078674113364,
1515
+ 0.7377501348730936,
1516
+ 2.285076596429336,
1517
+ 2.0605540868103542,
1518
+ 0.23968854170206916,
1519
+ 0.5304791687465945,
1520
+ 0.9621494841801348,
1521
+ 0.9953596816858612
1522
+ ],
1523
+ "names": [
1524
+ "left_joint_0.pos",
1525
+ "left_joint_1.pos",
1526
+ "left_joint_2.pos",
1527
+ "left_joint_3.pos",
1528
+ "left_joint_4.pos",
1529
+ "left_joint_5.pos",
1530
+ "left_gripper.pos",
1531
+ "right_joint_0.pos",
1532
+ "right_joint_1.pos",
1533
+ "right_joint_2.pos",
1534
+ "right_joint_3.pos",
1535
+ "right_joint_4.pos",
1536
+ "right_joint_5.pos",
1537
+ "right_gripper.pos"
1538
+ ],
1539
+ "mask": [
1540
+ true,
1541
+ true,
1542
+ true,
1543
+ true,
1544
+ true,
1545
+ true,
1546
+ false,
1547
+ true,
1548
+ true,
1549
+ true,
1550
+ true,
1551
+ true,
1552
+ true,
1553
+ false
1554
+ ]
1555
+ },
1556
+ "state_stats": {
1557
+ "min": [
1558
+ -1.971656322479248,
1559
+ 0.00019073777366429567,
1560
+ 0.001716639962978661,
1561
+ -1.7023346424102783,
1562
+ -1.576829195022583,
1563
+ -2.0963988304138184,
1564
+ 0.0005250918911769986,
1565
+ -1.6741054058074951,
1566
+ -0.0009536888683214784,
1567
+ 0.004386968910694122,
1568
+ -1.737811803817749,
1569
+ -1.574158787727356,
1570
+ -2.0941100120544434,
1571
+ 0.003973988350480795
1572
+ ],
1573
+ "max": [
1574
+ 1.813725471496582,
1575
+ 3.101205348968506,
1576
+ 3.1466009616851807,
1577
+ 1.5821698904037476,
1578
+ 1.6222248077392578,
1579
+ 2.1040284633636475,
1580
+ 0.9997128844261169,
1581
+ 2.4343862533569336,
1582
+ 3.11112380027771,
1583
+ 3.1492714881896973,
1584
+ 1.5836957693099976,
1585
+ 1.6062028408050537,
1586
+ 2.1452276706695557,
1587
+ 1.0
1588
+ ],
1589
+ "mean": [
1590
+ -0.08969431138176573,
1591
+ 1.3833397954729871,
1592
+ 1.2214299123909826,
1593
+ -0.7438162535789633,
1594
+ 0.15467924320885904,
1595
+ -0.2444551331990551,
1596
+ 0.6477599794157677,
1597
+ 0.11772745375836342,
1598
+ 1.3475698442605246,
1599
+ 1.1241839262647857,
1600
+ -0.657754523106273,
1601
+ -0.16024992695882134,
1602
+ 0.2095172679704065,
1603
+ 0.6019240399143698
1604
+ ],
1605
+ "std": [
1606
+ 0.3152726802877428,
1607
+ 0.7215555774539155,
1608
+ 0.6677525379386945,
1609
+ 0.49249044506684236,
1610
+ 0.3669531426180722,
1611
+ 0.36500773276171394,
1612
+ 0.4034043094483581,
1613
+ 0.3350780291739786,
1614
+ 0.8015514140140498,
1615
+ 0.7087483761552382,
1616
+ 0.5140769455948587,
1617
+ 0.36485948060191936,
1618
+ 0.35558886385685473,
1619
+ 0.4187505380995499
1620
+ ],
1621
+ "count": [
1622
+ 76046658.0
1623
+ ],
1624
+ "q01": [
1625
+ -0.6603467782218314,
1626
+ 0.012553692652370085,
1627
+ 0.021776265158983142,
1628
+ -1.3705572057237516,
1629
+ -0.3332034826366618,
1630
+ -0.9193192400336088,
1631
+ 0.059239047676073166,
1632
+ -0.4935656974138795,
1633
+ 0.012780929401173773,
1634
+ 0.022236669213863816,
1635
+ -1.4227596196972356,
1636
+ -0.9434528867624581,
1637
+ -0.4598343195103144,
1638
+ 0.037835498581155064
1639
+ ],
1640
+ "q10": [
1641
+ -0.41642163282166217,
1642
+ 0.49507907198249584,
1643
+ 0.486584320872561,
1644
+ -1.1582997707602973,
1645
+ -0.12275828541607876,
1646
+ -0.5663963402767317,
1647
+ 0.1261316463154828,
1648
+ -0.1908506486628405,
1649
+ 0.1993559996076043,
1650
+ 0.18204643795012038,
1651
+ -1.1656159852054215,
1652
+ -0.5295295866303873,
1653
+ -0.10955673634265617,
1654
+ 0.06449180996120647
1655
+ ],
1656
+ "q50": [
1657
+ -0.07460956719060403,
1658
+ 1.4518741988602484,
1659
+ 1.2790339607814287,
1660
+ -0.8004009901188069,
1661
+ 0.11633919425925929,
1662
+ -0.2256239564587,
1663
+ 0.7410515739838786,
1664
+ 0.08125296212737787,
1665
+ 1.546374492933441,
1666
+ 1.2473645258976782,
1667
+ -0.6826830989658852,
1668
+ -0.13268823647237576,
1669
+ 0.19324335769817771,
1670
+ 0.6975293719700979
1671
+ ],
1672
+ "q90": [
1673
+ 0.21065792154366084,
1674
+ 2.001783199519663,
1675
+ 1.7536904322237028,
1676
+ -0.17570477043577734,
1677
+ 0.5016373270395832,
1678
+ 0.057081945863381375,
1679
+ 0.9793483311612012,
1680
+ 0.496661138954089,
1681
+ 2.0633422575822404,
1682
+ 1.784104252873167,
1683
+ -0.07449674242785952,
1684
+ 0.17045548433242785,
1685
+ 0.5532139533377123,
1686
+ 0.9916430884848699
1687
+ ],
1688
+ "q99": [
1689
+ 0.4683004661020414,
1690
+ 2.2309715341843326,
1691
+ 1.9982285068319416,
1692
+ 0.13319204881075056,
1693
+ 0.8574646079271142,
1694
+ 0.31881311685642116,
1695
+ 0.9862640952345495,
1696
+ 0.736253091937041,
1697
+ 2.276675221510269,
1698
+ 2.0496951704229227,
1699
+ 0.23446313153252643,
1700
+ 0.503194049828884,
1701
+ 0.9489437100128476,
1702
+ 0.9945109907992316
1703
+ ],
1704
+ "names": [
1705
+ "left_joint_0.pos",
1706
+ "left_joint_1.pos",
1707
+ "left_joint_2.pos",
1708
+ "left_joint_3.pos",
1709
+ "left_joint_4.pos",
1710
+ "left_joint_5.pos",
1711
+ "left_gripper.pos",
1712
+ "right_joint_0.pos",
1713
+ "right_joint_1.pos",
1714
+ "right_joint_2.pos",
1715
+ "right_joint_3.pos",
1716
+ "right_joint_4.pos",
1717
+ "right_joint_5.pos",
1718
+ "right_gripper.pos"
1719
+ ],
1720
+ "mask": [
1721
+ true,
1722
+ true,
1723
+ true,
1724
+ true,
1725
+ true,
1726
+ true,
1727
+ false,
1728
+ true,
1729
+ true,
1730
+ true,
1731
+ true,
1732
+ true,
1733
+ true,
1734
+ false
1735
+ ]
1736
+ }
1737
+ }
1738
+ }
1739
+ }
processing_molmoact2.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processor class for MolmoAct2.
3
+ """
4
+ from typing import Optional, Union
5
+ import dataclasses
6
+
7
+ import numpy as np
8
+
9
+ from transformers.image_utils import ImageInput
10
+ from transformers.video_utils import VideoInput
11
+ from transformers.processing_utils import (
12
+ Unpack,
13
+ ProcessingKwargs,
14
+ ProcessorMixin,
15
+ )
16
+ from transformers.feature_extraction_utils import BatchFeature
17
+ from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
18
+ from transformers.utils import logging
19
+
20
+ from transformers import AutoTokenizer
21
+ from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
22
+ from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
# Special tokens, these should be present in any tokenizer we use since the preprocessor uses them.
# Plain string literals: the previous f-strings had no placeholders (ruff F541).
IMAGE_PATCH_TOKEN = "<im_patch>"  # Where to insert high-res tokens
IMAGE_LOW_RES_TOKEN = "<im_low>"  # Where to insert low-res tokens
IM_START_TOKEN = "<im_start>"
LOW_RES_IMAGE_START_TOKEN = "<low_res_im_start>"
FRAME_START_TOKEN = "<frame_start>"
IM_END_TOKEN = "<im_end>"
FRAME_END_TOKEN = "<frame_end>"
IM_COL_TOKEN = "<im_col>"
# Placeholders the user writes in prompts; expanded by the processor.
IMAGE_PROMPT = "<|image|>"
VIDEO_PROMPT = "<|video|>"

# All image-structure special tokens; used to build `token_type_ids` marking
# which positions of `input_ids` belong to an image.
IMAGE_TOKENS = [
    IMAGE_PATCH_TOKEN,
    IM_COL_TOKEN,
    IM_START_TOKEN,
    LOW_RES_IMAGE_START_TOKEN,
    FRAME_START_TOKEN,
    IM_END_TOKEN,
    FRAME_END_TOKEN,
    IMAGE_LOW_RES_TOKEN,
]
50
+
51
+
52
class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
    """MolmoAct2 processor kwargs: typed kwarg groups accepted by `MolmoAct2Processor.__call__`."""
    # Kwargs forwarded to the image / video sub-processors.
    images_kwargs: MolmoAct2ImagesKwargs
    videos_kwargs: MolmoAct2VideoProcessorKwargs
    # Defaults merged in by ProcessorMixin._merge_kwargs unless the caller overrides them.
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": True,
        },
        # Video metadata (timestamps) is required internally to build the
        # per-frame time prefixes; popped before returning unless requested.
        "videos_kwargs": {"return_metadata": True},
    }
63
+
64
+
65
class MolmoAct2Processor(ProcessorMixin):
    """
    Wraps a MolmoAct2 image processor, video processor and tokenizer into one
    multimodal processor.

    Prompts contain `<|image|>` / `<|video|>` placeholders; `__call__` expands each
    placeholder into the grid of image special tokens matching the processed
    crops/frames, tokenizes the result, and prepends a BOS token if missing.
    """

    # Sub-processors managed/saved by ProcessorMixin.
    attributes = ["image_processor", "video_processor", "tokenizer"]
    # Extra configuration persisted with the processor config.
    optional_attributes = [
        "chat_template",
        "time_mode",
        "image_use_col_tokens",
        "use_single_crop_col_tokens",
        "use_single_crop_start_token",
        "video_use_col_tokens",
        "use_frame_special_tokens",
    ]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor: MolmoAct2ImageProcessor = None,
        video_processor: MolmoAct2VideoProcessor = None,
        tokenizer: AutoTokenizer = None,
        chat_template: Optional[str] = None,
        image_use_col_tokens: Optional[bool] = True,
        use_single_crop_col_tokens: Optional[bool] = None,
        use_single_crop_start_token: Optional[bool] = True,
        video_use_col_tokens: Optional[bool] = False,
        use_frame_special_tokens: Optional[bool] = True,
        **kwargs
    ) -> None:
        """
        Args:
            image_processor: produces image crops and `image_grids`.
            video_processor: produces video frames and `video_grids`.
            tokenizer: must contain every token in `IMAGE_TOKENS`.
            chat_template: optional chat template forwarded to ProcessorMixin.
            image_use_col_tokens: append `<im_col>` after each row of high-res
                image patch tokens.
            use_single_crop_col_tokens: same, for the low-res (single-crop)
                image; `None` means "follow `image_use_col_tokens`".
            use_single_crop_start_token: start the low-res image with
                `<low_res_im_start>` instead of `<im_start>`.
            video_use_col_tokens: append `<im_col>` after each row of video
                frame patch tokens.
            use_frame_special_tokens: wrap frames with `<frame_start>`/`<frame_end>`
                instead of `<im_start>`/`<im_end>`.
            **kwargs: ignored (accepted for config forward-compatibility).
        """
        super().__init__(
            image_processor,
            video_processor,
            tokenizer,
            chat_template=chat_template,
        )
        self.image_use_col_tokens = image_use_col_tokens
        self.use_single_crop_col_tokens = use_single_crop_col_tokens
        self.use_single_crop_start_token = use_single_crop_start_token
        self.video_use_col_tokens = video_use_col_tokens
        self.use_frame_special_tokens = use_frame_special_tokens

        self.image_placeholder_token = IMAGE_PROMPT
        self.video_placeholder_token = VIDEO_PROMPT
        # Token ids of the image-structure special tokens, used to build
        # `token_type_ids` marking multimodal positions.
        self.image_token_ids = [
            tokenizer.convert_tokens_to_ids(token)
            for token in IMAGE_TOKENS
        ]

    def get_image_tokens(self, image_grid: np.ndarray):
        """Build the flat array of special-token strings standing in for one image.

        `image_grid` holds four ints: the token grid of the low-res resized
        image (`resized_h`, `resized_w`) followed by the token grid of the
        high-res crops (`height`, `width`). `height`/`width` of 0 means there
        are no high-res crops, so only the low-res tokens are emitted.
        """
        resized_h, resized_w, height, width = image_grid
        if int(height) == 0 or int(width) == 0:
            # Low-res only: one <im_patch> per token, optional <im_col> per row.
            per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
            use_single_crop_col_tokens = (
                self.image_use_col_tokens
                if self.use_single_crop_col_tokens is None
                else self.use_single_crop_col_tokens
            )
            if use_single_crop_col_tokens:
                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
            joint = [
                [IM_START_TOKEN],
                np.tile(per_row, [resized_h]),
                [IM_END_TOKEN],
            ]
            return np.concatenate(joint)
        # High-res crop tokens.
        per_row = np.full(width, IMAGE_PATCH_TOKEN)
        if self.image_use_col_tokens:
            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
        joint = [
            [IM_START_TOKEN],
            np.tile(per_row, [height]),
            [IM_END_TOKEN],
        ]
        # Low-res tokens, prepended in front of the high-res tokens below.
        per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
        use_single_crop_col_tokens = (
            self.image_use_col_tokens
            if self.use_single_crop_col_tokens is None
            else self.use_single_crop_col_tokens
        )
        image_start_token = (
            LOW_RES_IMAGE_START_TOKEN
            if self.use_single_crop_start_token
            else IM_START_TOKEN
        )
        if use_single_crop_col_tokens:
            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
        joint = [
            [image_start_token],
            np.tile(per_row, [resized_h]),
            [IM_END_TOKEN],
        ] + joint

        return np.concatenate(joint)

    def get_video_string(
        self,
        video_grid: np.ndarray,
        timestamps: np.ndarray,
    ) -> str:
        """Build the text standing in for one video: for each frame, a
        `per-frame-compact` time prefix ("<t.t> ") followed by that frame's
        grid of patch special tokens wrapped in start/end tokens."""
        if self.use_frame_special_tokens:
            start_token_id = FRAME_START_TOKEN
            end_token_id = FRAME_END_TOKEN
        else:
            start_token_id = IM_START_TOKEN
            end_token_id = IM_END_TOKEN

        num_frames, h, w = video_grid
        video_string: str = ""
        for frame_idx, frame_time in enumerate(timestamps):
            # `per-frame-compact` time mode
            prev_space = " " if frame_idx > 0 else ""
            frame_prefix = prev_space + f"{frame_time:.1f} "  # explicit whitespace before/after image tokens

            video_string += frame_prefix
            per_row = np.full(w, IMAGE_PATCH_TOKEN)
            if self.video_use_col_tokens:
                per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
            extra_tokens = np.tile(per_row, [h])
            video_tokens = [
                [start_token_id],
                extra_tokens,
                [end_token_id],
            ]
            video_string += "".join(np.concatenate(video_tokens, 0))

        return video_string

    def insert_bos(
        self,
        input_ids: np.ndarray,
        attention_mask: np.ndarray,
        bos_token_id: int,
        pad_token_id: int,
    ):
        """
        Insert a BOS token at the first valid (non-pad) position of each row,
        shifting the valid tokens right by one, unless BOS is already there.

        Args:
            input_ids: [B, S] array with left padding (a 1-D [S] array is also
                accepted and returned in the same shape)
            attention_mask: [B, S] array (0 for pad, 1 for valid)
            bos_token_id: int
            pad_token_id: int
        Returns:
            input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
            attention_mask_out: same shape as input_ids_out

        NOTE(review): BOS is treated as already present only if EVERY row
        starts with it; a mixed batch gets BOS inserted into all rows.
        """

        # Accept a single unbatched sequence; restore its shape on return.
        need_to_expand = len(input_ids.shape) == 1
        if need_to_expand:
            input_ids = input_ids[None, :]
            attention_mask = attention_mask[None, :]

        B, S = input_ids.shape

        # Handle zero-length sequence
        if S == 0:
            new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
            new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
            if need_to_expand:
                new_input_ids = new_input_ids[0]
                new_attention_mask = new_attention_mask[0]
            return new_input_ids, new_attention_mask

        # With left padding, argmax finds the first 1 in each row's mask.
        first_valid_index = (attention_mask == 1).argmax(axis=-1)  # [B]
        bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)

        if bos_already_present:
            if need_to_expand:
                input_ids = input_ids[0]
                attention_mask = attention_mask[0]
            return input_ids, attention_mask
        else:
            # Grow the sequence by one and scatter the valid tokens one slot right.
            new_input_ids = np.full((B, S+1), pad_token_id, dtype=input_ids.dtype)
            new_attention_mask = np.zeros((B, S+1), dtype=attention_mask.dtype)

            src_idx = np.tile(np.arange(S), (B, 1))  # [B, S]
            valid_mask = src_idx >= first_valid_index[:, None]  # [B, S]
            tgt_idx = src_idx + 1  # shift right
            batch_idx = np.tile(np.arange(B)[:, None], (1, S))  # [B, S]

            # flatten valid_positions
            flat_vals = input_ids[valid_mask]
            flat_batch = batch_idx[valid_mask]
            flat_tgt = tgt_idx[valid_mask]

            new_input_ids[flat_batch, flat_tgt] = flat_vals
            new_attention_mask[flat_batch, flat_tgt] = 1

            # BOS goes into the slot freed up at each row's original start.
            insert_pos = first_valid_index
            new_input_ids[np.arange(B), insert_pos] = bos_token_id
            new_attention_mask[np.arange(B), insert_pos] = 1

            if need_to_expand:
                new_input_ids = new_input_ids[0]
                new_attention_mask = new_attention_mask[0]

            return new_input_ids, new_attention_mask

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        images: ImageInput = None,
        videos: VideoInput = None,
        **kwargs: Unpack[MolmoAct2ProcessorKwargs],
    ) -> BatchFeature:
        """
        Process text with optional images/videos into model inputs.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            videos (`dict[str, Any]` or `list[dict[str, Any]]`):
                The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
                - `"frames"`: `np.ndarray` of shape (T, H, W, 3)
                - `"timestamps"`: `np.ndarray` of shape (T,)
                - `"sampled_fps"`: `float` (optional)
                - `"sampling_augmentation"`: `str` (optional)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            `BatchFeature`: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
              Returned when `images` is not `None`.
            - **image_grids** -- Grids of images. Returned when `images` is not `None`.
            - **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
              Returned when `videos` is not `None`.
            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
        """

        output_kwargs = self._merge_kwargs(
            MolmoAct2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is not None:
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            image_grids = image_inputs["image_grids"]
        else:
            image_inputs = {}
            image_grids = None

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            video_grids = videos_inputs["video_grids"]
            # If user has not requested video metadata, pop it
            if "return_metadata" not in kwargs:
                video_metadata = videos_inputs.pop("video_metadata")
            else:
                video_metadata = videos_inputs["video_metadata"]
        else:
            videos_inputs = {}
            video_grids = None

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # below lines change text in-place

        # Expand each <|image|> placeholder into its image's special tokens,
        # consuming image grids in order across the batch.
        if image_grids is not None:
            index = 0
            for i in range(len(text)):
                num_images = text[i].count(self.image_placeholder_token)
                image_grids_i = image_grids[index:index+num_images]
                for image_grid in image_grids_i:
                    image_tokens = self.get_image_tokens(image_grid)
                    image_string = "".join(image_tokens)
                    text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
                index += num_images

        # Expand each <|video|> placeholder likewise (at most one per text).
        if video_grids is not None:
            index = 0
            for i in range(len(text)):
                num_videos = text[i].count(self.video_placeholder_token)
                assert num_videos in {0, 1}, "At most one video is supported for now"
                video_grids_i = video_grids[index:index+num_videos]
                metadata_i = video_metadata[index:index+num_videos]
                for video_grid, metadata in zip(video_grids_i, metadata_i):
                    video_string = self.get_video_string(
                        video_grid,
                        metadata.timestamps,
                    )
                    text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
                index += num_videos

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        input_ids = text_inputs["input_ids"]
        attention_mask = text_inputs["attention_mask"]

        input_ids = np.array(input_ids)
        attention_mask = np.array(attention_mask)

        # Fall back to EOS as the BOS token when the tokenizer defines none.
        bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
        input_ids, attention_mask = self.insert_bos(
            input_ids, attention_mask, bos, self.tokenizer.pad_token_id
        )

        if return_mm_token_type_ids:
            # 1 where input_ids is any image special token, else 0.
            image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
            token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)
            text_inputs["token_type_ids"] = token_type_ids.tolist()

        text_inputs["input_ids"] = input_ids.tolist()
        text_inputs["attention_mask"] = attention_mask.tolist()

        return BatchFeature(
            data={**text_inputs, **image_inputs, **videos_inputs},
            tensor_type=return_tensors,
        )

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
416
+
417
+
418
+ MolmoAct2Processor.register_for_auto_class()
processor_config.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
4
+ },
5
+ "image_processor": {
6
+ "auto_map": {
7
+ "AutoImageProcessor": "image_processing_molmoact2.MolmoAct2ImageProcessor",
8
+ "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
9
+ },
10
+ "crop_mode": "resize",
11
+ "do_convert_rgb": true,
12
+ "image_mean": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "image_processor_type": "MolmoAct2ImageProcessor",
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "max_crops": 8,
24
+ "overlap_margins": [
25
+ 4,
26
+ 4
27
+ ],
28
+ "patch_size": 14,
29
+ "pooling_size": [
30
+ 2,
31
+ 2
32
+ ],
33
+ "resample": 2,
34
+ "size": {
35
+ "height": 378,
36
+ "width": 378
37
+ }
38
+ },
39
+ "image_use_col_tokens": true,
40
+ "processor_class": "MolmoAct2Processor",
41
+ "use_frame_special_tokens": true,
42
+ "use_single_crop_col_tokens": false,
43
+ "use_single_crop_start_token": true,
44
+ "video_processor": {
45
+ "auto_map": {
46
+ "AutoProcessor": "processing_molmoact2.MolmoAct2Processor",
47
+ "AutoVideoProcessor": "video_processing_molmoact2.MolmoAct2VideoProcessor"
48
+ },
49
+ "data_format": "channels_first",
50
+ "default_to_square": true,
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "frame_sample_mode": "uniform_last_frame",
57
+ "image_mean": [
58
+ 0.5,
59
+ 0.5,
60
+ 0.5
61
+ ],
62
+ "image_std": [
63
+ 0.5,
64
+ 0.5,
65
+ 0.5
66
+ ],
67
+ "max_fps": 2.0,
68
+ "num_frames": 8,
69
+ "patch_size": 14,
70
+ "pooling_size": [
71
+ 3,
72
+ 3
73
+ ],
74
+ "resample": 2,
75
+ "rescale_factor": 0.00392156862745098,
76
+ "return_metadata": false,
77
+ "sampling_fps": 2,
78
+ "size": {
79
+ "height": 378,
80
+ "width": 378
81
+ },
82
+ "video_processor_type": "MolmoAct2VideoProcessor"
83
+ },
84
+ "video_use_col_tokens": false
85
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b6aeec78de2b0c7e95d7ae9d71cd04eba3d57351045a86c95520730e9c80d83
3
+ size 12176547
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_molmoact2.MolmoAct2Processor"
5
+ },
6
+ "backend": "tokenizers",
7
+ "bos_token": "<|im_end|>",
8
+ "clean_up_tokenization_spaces": false,
9
+ "eos_token": "<|im_end|>",
10
+ "errors": "replace",
11
+ "extra_special_tokens": [
12
+ "<im_start>",
13
+ "<im_end>",
14
+ "<im_patch>",
15
+ "<im_col>",
16
+ "<low_res_im_start>",
17
+ "<|image|>",
18
+ "<im_low>",
19
+ "<frame_start>",
20
+ "<frame_end>",
21
+ "<|video|>",
22
+ "<|points|>",
23
+ "<|token_index|>",
24
+ "<|vit_index|>",
25
+ "<|vit_loc|>"
26
+ ],
27
+ "is_local": false,
28
+ "model_max_length": 1010000,
29
+ "pad_token": "<|endoftext|>",
30
+ "processor_class": "MolmoAct2Processor",
31
+ "split_special_tokens": false,
32
+ "tokenizer_class": "Qwen2Tokenizer",
33
+ "unk_token": null
34
+ }
video_processing_molmoact2.py ADDED
@@ -0,0 +1,969 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Video processor class for MolmoAct2"""
2
+ from functools import partial
3
+ import os
4
+ import warnings
5
+ from contextlib import redirect_stdout
6
+ from io import BytesIO
7
+ from urllib.parse import urlparse
8
+ from typing import Optional, Union, Callable
9
+
10
+ import numpy as np
11
+ import requests
12
+ import einops
13
+ import torch
14
+ import torchvision.transforms
15
+
16
+ from transformers.image_utils import (
17
+ IMAGENET_STANDARD_MEAN,
18
+ IMAGENET_STANDARD_STD,
19
+ ImageInput,
20
+ PILImageResampling,
21
+ SizeDict,
22
+ validate_kwargs,
23
+ )
24
+ from transformers.video_utils import (
25
+ VideoInput,
26
+ is_valid_video,
27
+ make_batched_videos,
28
+ make_batched_metadata,
29
+ VideoMetadata,
30
+ )
31
+ from transformers.processing_utils import Unpack, VideosKwargs
32
+ from transformers.video_processing_utils import BaseVideoProcessor
33
+ from transformers.utils import logging
34
+ from transformers.feature_extraction_utils import BatchFeature
35
+ from transformers.utils import (
36
+ is_av_available,
37
+ is_decord_available,
38
+ is_torchcodec_available,
39
+ is_yt_dlp_available,
40
+ TensorType,
41
+ logging,
42
+ to_numpy,
43
+ )
44
+
45
+
46
logger = logging.get_logger(__name__)

# Upper bound (frames per second) on the candidate sampling rates considered
# by `get_candidate_target_fps`.
MAX_VIDEO_FPS = 8
49
+
50
+
51
def normalize_image(
    image: np.ndarray,
    image_mean: list[float],
    image_std: list[float],
) -> np.ndarray:
    """Channel-wise normalize a [..., H, W, 3] image: (image - mean) / std.

    Fix over the previous version: the general path no longer mutates the
    caller's array in place (`-=` / `/=`), which also raised an unsafe-cast
    error for integer inputs.

    Args:
        image: channels-last image (or stack of frames), typically float32 in [0, 1].
        image_mean: per-channel mean.
        image_std: per-channel std.
    Returns:
        The normalized image as a new array (input is left untouched).
    """
    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
        # Fast path: (x - 0.5) / 0.5 == 2x - 1.
        return image * np.float32(2.0) - np.float32(1.0)
    mean = np.array(image_mean, dtype=np.float32)[None, None, :]
    std = np.array(image_std, dtype=np.float32)[None, None, :]
    # Out-of-place arithmetic: never clobber the caller's buffer.
    return (image - mean) / std
61
+
62
+
63
def resize_image(
    image: np.ndarray,
    desired_output_size: list[int],
    resample: PILImageResampling,
) -> np.ndarray:
    """Resize an image [H, W, C] or video [T, H, W, C] to `desired_output_size`.

    Accepts float (range [0, 1]) or uint8 (range [0, 255]) inputs and always
    returns float32 scaled to [0, 1], in channels-last layout.
    """
    is_video = len(image.shape) != 3
    # Channels first for torchvision: HWC -> CHW, or THWC -> TCHW.
    tensor = torch.permute(
        torch.from_numpy(image),
        [0, 3, 1, 2] if is_video else [2, 0, 1],
    )
    original_dtype = tensor.dtype
    resizer = torchvision.transforms.Resize(
        desired_output_size,
        resample,
        antialias=False,
    )
    if torch.is_floating_point(tensor):
        in_min, in_max = 0.0, 1.0
        resized = torch.clip(resizer(tensor), 0.0, 1.0).to(original_dtype)
    else:
        assert tensor.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(tensor.dtype)
        in_min, in_max = 0.0, 255.0
        resized = torch.clip(resizer(tensor), 0, 255).to(original_dtype)

    # Rescale to [0, 1] float32.
    resized = (resized.to(torch.float32) - in_min) / (in_max - in_min)

    # Back to channels-last numpy.
    resized = torch.permute(
        resized,
        [0, 2, 3, 1] if is_video else [1, 2, 0],
    )
    return resized.numpy()
104
+
105
+
106
def build_resized_image(
    image: np.ndarray,
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Resize + normalize an image to the base input size and build its patch index grid.

    Returns:
        A pair of (normalized image with a leading crop axis [1, H, W, C],
        [h_patches, w_patches] array enumerating the patch indices row-major).
    """
    prepared = normalize_image(
        resize_image(image, base_image_input_size, resample),
        image_mean,
        image_std,
    )
    if prepared.ndim == 3:
        # Add the singleton "crops" axis expected downstream.
        prepared = prepared[None]
    patches_h = base_image_input_size[0] // image_patch_size
    patches_w = base_image_input_size[1] // image_patch_size
    index_grid = np.arange(patches_h * patches_w).reshape([patches_h, patches_w])
    return prepared, index_grid
124
+
125
+
126
def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
    """Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch].

    Also accepts a channel-less [n_images, h, w] array. Patches are ordered
    row-major; pixels within a patch are ordered row-major as well.
    """
    if len(array.shape) == 3:
        n_images, height, width = array.shape
        rows = height // patch_size
        cols = width // patch_size
        patched = np.reshape(array, [n_images, rows, patch_size, cols, patch_size])
        # Bring the two patch-grid axes together, then the in-patch axes.
        patched = np.transpose(patched, [0, 1, 3, 2, 4])
        return np.reshape(patched, [n_images, rows * cols, patch_size * patch_size])

    n_images, height, width, channels = array.shape
    rows = height // patch_size
    cols = width // patch_size
    patched = np.reshape(array, [n_images, rows, patch_size, cols, patch_size, channels])
    patched = np.transpose(patched, [0, 1, 3, 2, 4, 5])
    return np.reshape(patched, [n_images, rows * cols, patch_size * patch_size * channels])
144
+
145
+
146
def arange_for_pooling(
    idx_arr: np.ndarray,
    pool_h: int,
    pool_w: int,
) -> np.ndarray:
    """Group a 2D patch-index grid into (pool_h, pool_w) pooling windows.

    The grid is padded with -1 (split as evenly as possible between the two
    sides of each axis) so its dimensions become multiples of the pooling
    window; -1 marks padded positions a pooled token should ignore.

    Improvement: the trailing `einops.rearrange` call is replaced with the
    equivalent pure-numpy reshape/transpose, dropping the einops dependency
    for this helper.

    Returns:
        Array of shape [ceil(h/pool_h), ceil(w/pool_w), pool_h*pool_w] giving,
        for each pooled cell, the patch indices it covers (-1 for padding).
    """
    h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
    w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
    idx_arr = np.pad(
        idx_arr,
        [[h_pad//2, (h_pad+1)//2], [w_pad//2, (w_pad+1)//2]],
        mode='constant', constant_values=-1,
    )
    # Equivalent to einops.rearrange(idx_arr, "(h dh) (w dw) -> h w (dh dw)",
    # dh=pool_h, dw=pool_w).
    out_h = idx_arr.shape[0] // pool_h
    out_w = idx_arr.shape[1] // pool_w
    pooled = idx_arr.reshape(out_h, pool_h, out_w, pool_w)
    pooled = pooled.transpose(0, 2, 1, 3)
    return pooled.reshape(out_h, out_w, pool_h * pool_w)
157
+
158
+
159
def image_to_patches_and_grids(
    image: ImageInput,
    base_image_input_size: list[int],
    resample: PILImageResampling,
    image_mean: list[float],
    image_std: list[float],
    image_patch_size: int,
    image_pooling_w: int,
    image_pooling_h: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    :return image_grids, the shape of each image after pooling
    :return crops, the image crops to processes with the ViT
    :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
        patches in `crops` to pool for that token, masked with -1
    """
    if isinstance(base_image_input_size, int):
        # A single int means a square input size.
        base_image_input_size = (base_image_input_size, base_image_input_size)

    crops, patch_index_grid = build_resized_image(
        image,
        base_image_input_size,
        resample,
        image_mean,
        image_std,
        image_patch_size,
    )
    pooled_idx = arange_for_pooling(patch_index_grid, image_pooling_h, image_pooling_w)
    grid_h, grid_w = pooled_idx.shape[:2]
    # Flatten the pooled grid into [n_tokens, window_size] for gathering.
    pooled_idx = pooled_idx.reshape([-1, image_pooling_h * image_pooling_w])
    return (
        [grid_h, grid_w],
        batch_pixels_to_patches(crops, image_patch_size),
        pooled_idx,
    )
198
+
199
+
200
def get_candidate_target_fps(
    video_fps: Union[int, float],
    sampling_fps: Union[int, float],
    max_fps: Optional[Union[int, float]] = None,
) -> list[float]:
    """
    Return the subset of `video_fps` factors that remain multiples of `sampling_fps`.

    `max_fps` defaults to the module-level `MAX_VIDEO_FPS` cap.

    Fixes over the previous version: the `sampling_fps is None` guard now runs
    BEFORE `int(sampling_fps)` (previously it was dead code — `int(None)`
    raised a TypeError first), and the doctest examples match the actual
    float return values and error message.

    Examples:
        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
        [2.0, 6.0]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
        [1.0, 5.0]
        >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
        [2.0]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
        Traceback (most recent call last):
        ...
        ValueError: sampling_fps=2 must divide video_fps=5.
    """
    # Validate before converting: int(None) would raise a confusing TypeError.
    if sampling_fps is None:
        raise ValueError("sampling_fps must be provided")
    if max_fps is None:
        max_fps = MAX_VIDEO_FPS

    video_fps = int(video_fps)
    sampling_fps = int(sampling_fps)
    max_fps = int(max_fps)

    if video_fps <= 0 or sampling_fps <= 0:
        raise ValueError(f"video_fps and sampling_fps must be positive (got {video_fps}, {sampling_fps})")
    if video_fps % sampling_fps != 0:
        raise ValueError(f"sampling_fps={sampling_fps} must divide video_fps={video_fps}.")

    # Multiples of sampling_fps that also divide video_fps evenly, capped at max_fps.
    candidates = []
    for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
        if candidate > max_fps:
            break
        if video_fps % candidate == 0:
            candidates.append(float(candidate))

    return candidates
239
+
240
+
241
def read_video_decord(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> tuple[np.ndarray, VideoMetadata]:
    """
    Decode a video using the Decord backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            A callable function that will return timestamps at which the video should be sampled.
            It is called with `metadata=` plus any extra `**kwargs`.

    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import from decord
    import importlib
    decord = importlib.import_module("decord")

    vr = decord.VideoReader(uri=video_path, ctx=decord.cpu(0))  # decord has problems with gpu
    video_fps = vr.get_avg_fps()
    total_num_frames = len(vr)
    # Per-frame [start, end] presentation times, shape [num_frames, 2].
    time_stamps = vr.get_frame_timestamp(list(range(len(vr))))
    duration = time_stamps[-1][1] - time_stamps[0][0]

    metadata = VideoMetadata(
        total_num_frames=int(total_num_frames),
        fps=float(video_fps),
        duration=float(duration),
        video_backend="decord",
    )

    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
    target_timestamps = np.array(target_timestamps)
    # Requested timestamps are relative to the first frame; shift them into
    # the stream's absolute time base.
    offset = time_stamps[0, 0]

    # For each target time, pick the first frame whose end time exceeds it,
    # i.e. the frame being displayed at that moment; clamp to the last frame.
    ix = np.searchsorted(time_stamps[:, 1], target_timestamps + offset, side='right')
    ix = np.minimum(ix, len(time_stamps) - 1)

    video = vr.get_batch(ix).asnumpy()
    # NOTE(review): assumes the installed transformers' VideoMetadata supports
    # dict-style `update`; `frames_indices` are float approximations
    # (timestamp * avg_fps), not the exact integer indices in `ix` — confirm
    # downstream consumers expect this.
    metadata.update(
        {
            "frames_indices": target_timestamps * video_fps,
            "height": video.shape[1],
            "width": video.shape[2],
        }
    )
    return video, metadata
293
+
294
+
295
def read_video_torchcodec(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> np.ndarray:
    """
    Decode a video using torchcodec decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            A callable function that will return timestamps at which the video should be sampled.

    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import so torchcodec is only required when this backend is actually used
    import importlib
    torchcodec = importlib.import_module("torchcodec")

    decoder = torchcodec.decoders.VideoDecoder(
        video_path,
        # Interestingly `exact` mode takes less than approximate when we load the whole video
        seek_mode="exact",
        # Allow FFmpeg decide on the number of threads for efficiency
        num_ffmpeg_threads=0,
    )
    # If the first frame starts at > 0, we effectively clip the video starting at that time
    # since (most) video players would also skip to that time
    time_offset = decoder.metadata.begin_stream_seconds_from_content
    # Note this duration does assume we started playing at `time_offset`
    duration = decoder.metadata.duration_seconds

    metadata = VideoMetadata(
        total_num_frames=decoder.metadata.num_frames,
        fps=decoder.metadata.average_fps,
        duration=duration,
        video_backend="torchcodec",
        height=decoder.metadata.height,
        width=decoder.metadata.width,
    )

    # The sampler sees the metadata and decides which (relative) times to decode
    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)

    # Floating point/rounding issues might cause `target_timestamps` to be very slightly
    # out-of-bounds, to handle this we sanity check then clip them
    assert all(x >= 0 for x in target_timestamps)
    assert all(x < duration+1e-6 for x in target_timestamps)
    # 1e-6 padding since torchcodec can throw out-of-bounds errors even if you ask for the
    # exact boundary value, we should still get the first/last frame anyway
    max_timestamp = decoder.metadata.end_stream_seconds_from_content - 1e-6
    min_timestamp = decoder.metadata.begin_stream_seconds_from_content + 1e-6
    # Note we avoid using numpy ops here to reduce floating precision issues
    timestamps = [x + time_offset for x in target_timestamps]
    timestamps = [max(min_timestamp, min(max_timestamp, x)) for x in timestamps]

    video = decoder.get_frames_played_at(timestamps).data.numpy().transpose(0, 2, 3, 1) # Convert to THWC format
    target_timestamps = np.array(target_timestamps)
    # Fractional frame indices corresponding to the requested times
    metadata.frames_indices = target_timestamps * metadata.fps

    return video, metadata
359
+
360
+
361
def read_video_pyav(
    video_path,
    sample_timestamps_fn: Callable,
    **kwargs,
) -> np.ndarray:
    """
    Decode a video using the PyAV backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_timestamps_fn (`Callable`):
            A callable function that will return timestamps at which the video should be sampled.

    Returns:
        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    """
    # Lazy import so PyAV (`av`) is only required when this backend is actually used
    import importlib
    av = importlib.import_module("av")

    with av.open(video_path) as container:
        video_stream = container.streams.video[0]
        # Prefer the container-reported rate; fall back to the guessed one
        fps = video_stream.average_rate or video_stream.guessed_rate
        it = container.decode(video=0)
        # Decodes the entire stream up front; memory grows with video length
        frames = list(it)

        stream = container.streams.video[0]
        # pts * time_base converts raw ticks to seconds
        start = frames[0].pts * stream.time_base
        container_end = stream.duration
        if container_end is not None:
            container_end *= stream.time_base
        # NOTE(review): `container_end` is in seconds here but `frames[-1].pts` is in raw
        # time_base ticks — this comparison looks unit-mismatched (likely intended
        # `frames[-1].pts * stream.time_base`); confirm against PyAV semantics.
        if container_end is None or container_end < frames[-1].pts:
            # Some problem with stream duration, so use the frame PTS directly
            # and guess the duration of the last frame
            end = frames[-1].pts * stream.time_base + 1/fps
        else:
            end = container_end
        duration = float(end - start)

        metadata = VideoMetadata(
            total_num_frames=len(frames),
            fps=float(fps),
            duration=float(duration),
            video_backend="pyav",
            height=video_stream.height,
            width=video_stream.width,
        )

        # The sampler sees the metadata and decides which (relative) times to decode
        target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
        offset = float(start)

        target_timestamps = np.array(target_timestamps)
        # End time of each frame = start time of the next frame; the last frame ends at `duration`.
        # NOTE(review): entries from frame PTS are absolute stream times while `duration` is
        # relative to `start` — if the stream starts at > 0 these mix frames of reference; verify.
        end_time_stamps = np.array([float(frame.pts * stream.time_base) for frame in frames[1:]] + [duration])
        # Pick the frame whose end time is the first one strictly after the target, clamped
        indices = np.searchsorted(end_time_stamps, target_timestamps + offset, side='right')
        indices = np.minimum(indices, len(end_time_stamps) - 1)

        video = np.stack(
            [frames[i].to_ndarray(format="rgb24", channel_last=True) for i in indices],
            axis=0,
        )

        # Fractional frame indices corresponding to the requested times
        metadata.frames_indices = target_timestamps * fps

        return video, metadata
428
+
429
+
430
# Maps backend name -> decoder function; `load_video` dispatches through this table.
VIDEO_DECODERS = {
    "decord": read_video_decord,
    "torchcodec": read_video_torchcodec,
    "pyav": read_video_pyav,
}
435
+
436
+
437
def load_video(
    video: VideoInput,
    backend: str = "decord",
    sample_timestamps_fn: Optional[Callable] = None,
    **kwargs,
):
    """
    Loads `video` to a numpy array.

    Args:
        video (`VideoInput`):
            The video to convert to the numpy array format. Can be a link to video or local path.
        backend (`str`, *optional*, defaults to `"decord"`):
            The backend to use when loading the video. Can be any of ["decord", "pyav", "torchcodec"]. Defaults to "decord".
        sample_timestamps_fn (`Callable`):
            A callable function that will return timestamps at which the video should be sampled.

    Returns:
        tuple: `(video, metadata)` as produced by the selected backend decoder, or the
        input unchanged (with `None` metadata per frame) when `video` is already decoded.

    Raises:
        ImportError: if the required downloader/decoder library is not installed.
        TypeError: if `video` is a string that is neither a URL nor an existing file.
        ValueError: if `backend` is unsupported or incompatible with a URL input.
    """

    # Early exit if provided an array or `PIL` frames
    if not isinstance(video, str):
        metadata = [None] * len(video)
        return video, metadata

    if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
        if not is_yt_dlp_available():
            raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
        # Lazy import from yt_dlp
        import importlib
        yt_dlp = importlib.import_module("yt_dlp")

        # yt_dlp prints progress to stdout; capture it so it does not pollute caller output
        buffer = BytesIO()
        with redirect_stdout(buffer), yt_dlp.YoutubeDL() as f:
            f.download([video])
        bytes_obj = buffer.getvalue()
        file_obj = BytesIO(bytes_obj)
    elif video.startswith("http://") or video.startswith("https://"):
        file_obj = BytesIO(requests.get(video).content)
    elif os.path.isfile(video):
        file_obj = video
    else:
        raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.")

    # can also load with decord, but not cv2/torchvision
    # both will fail in case of url links
    video_is_url = video.startswith("http://") or video.startswith("https://")
    if video_is_url and backend == "opencv":
        raise ValueError("If you are trying to load a video from URL, you cannot use 'opencv' as backend")

    # Fail with a clear error instead of a bare KeyError at dispatch time below
    if backend not in VIDEO_DECODERS:
        raise ValueError(
            f"Unsupported video backend: {backend!r}. Supported backends: {sorted(VIDEO_DECODERS)}"
        )

    if (
        (not is_decord_available() and backend == "decord")
        or (not is_torchcodec_available() and backend == "torchcodec")
        or (not is_av_available() and backend == "pyav")
    ):
        raise ImportError(
            f"You chose backend={backend} for loading the video but the required library is not found in your environment "
            f"Make sure to install {backend} before loading the video."
        )

    video_decoder = VIDEO_DECODERS[backend]
    video, metadata = video_decoder(file_obj, sample_timestamps_fn, **kwargs)
    return video, metadata
498
+
499
+
500
def get_target_fps(
    video_fps: float,
    max_frames: int,
    total_frames: int,
    frame_sample_mode: str,
    candidate_target_fps: tuple[float],
) -> float:
    """
    Pick the candidate fps that best spans the video while sampling the most frames.

    Candidates must be in increasing order; returns `None` when no candidate is viable.
    """
    best_fps = None
    best_count = 0
    for fps in candidate_target_fps:
        stride = max(int(video_fps / fps), 1)
        count = int(total_frames / stride)
        if best_count == 0:
            # No viable candidate yet. In uniform modes, refuse any fps that
            # would already oversample beyond max_frames.
            if "uniform" in frame_sample_mode and count > max_frames:
                break
            best_fps, best_count = fps, count
            continue
        # Candidate fps increases across the loop, so the frame count cannot decrease
        assert best_count <= count
        # Skip candidates that overshoot max_frames (they would not span the video);
        # among the rest, keep the one with the densest sampling.
        if count <= max_frames and count > best_count:
            best_fps, best_count = fps, count
    return best_fps
534
+
535
+
536
def get_frame_times_and_chosen_fps(
    selected_target_fps,
    total_frames,
    max_frames,
    video_fps
):
    """Return `(selected_target_fps, frame_indices)` for the chosen sampling rate.

    With no viable fps (`None`), fall back to `max_frames` uniformly spaced indices;
    otherwise take every `stride`-th frame, truncated to at most `max_frames`.
    """
    if selected_target_fps is None:
        # Uniform fallback over the whole clip
        indices = np.linspace(0, total_frames, max_frames, endpoint=False, dtype=int)
        return selected_target_fps, indices
    stride = max(int(video_fps / selected_target_fps), 1)
    indices = np.arange(0, total_frames, stride)
    # Slicing is a no-op when there are already <= max_frames indices
    return selected_target_fps, indices[:max_frames]
550
+
551
+
552
class MolmoAct2VideoProcessorKwargs(VideosKwargs, total=False):
    # Extra kwargs accepted by MolmoAct2VideoProcessor on top of the base `VideosKwargs`:
    #   patch_size: spatial patch size of the vision encoder.
    #   pooling_size: [h, w] pooling factors of the vision adapter.
    #   frame_sample_mode: frame sampling strategy ("fps" or "uniform_last_frame").
    #   max_fps: upper bound on sampled frames per second.
    #   sampling_fps: base sampling rate, used when frame_sample_mode == "fps".
    patch_size: Optional[int]
    pooling_size: Optional[list[int]]
    frame_sample_mode: Optional[str]
    max_fps: Optional[int]
    sampling_fps: Optional[int]
558
+
559
+
560
class MolmoAct2VideoProcessor(BaseVideoProcessor):
    """Video processor for MolmoAct2.

    Decodes/samples video frames and converts them into model inputs:
    `pixel_values_videos`, `video_token_pooling` and `video_grids`.
    """

    resample = PILImageResampling.BILINEAR
    size = {"height": 378, "width": 378}
    image_mean = IMAGENET_STANDARD_MEAN
    image_std = IMAGENET_STANDARD_STD
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    patch_size = 14
    pooling_size = [3, 3]
    do_sample_frames = True
    frame_sample_mode = "uniform_last_frame"
    max_fps = 2
    sampling_fps = 2
    valid_kwargs = MolmoAct2VideoProcessorKwargs
    model_input_names = ["pixel_values_videos", "video_token_pooling", "video_grids"]

    def __init__(self, **kwargs: Unpack[MolmoAct2VideoProcessorKwargs]):
        """Initialize the processor, validating that `size` has both dimensions."""
        super().__init__(**kwargs)
        if self.size is not None and (
            self.size.get("height", None) is None or self.size.get("width", None) is None
        ):
            raise ValueError("size must contain 'height' and 'width' keys.")

    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        **kwargs,
    ) -> dict:
        """
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        """
        if size is not None and ("height" not in size or "width" not in size):
            raise ValueError("size must contain 'height' and 'width' keys.")

        return super()._further_process_kwargs(size=size, **kwargs)

    def sample_times(
        self,
        metadata: VideoMetadata,
        frame_sample_mode: str,
        num_frames: int,
        max_fps: Optional[int] = None,
        sampling_fps: Optional[int] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Time-based sampling if an array video is passed
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            frame_sample_mode (`str`, *optional*):
                Mode to sample frames. Defaults to `self.frame_sample_mode`.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            max_fps (`int`, *optional*):
                Maximum frames per second to sample.
            sampling_fps (`int`, *optional*):
                Sampling frames per second. Defaults to `self.sampling_fps`.
                Used when `frame_sample_mode` is `"fps"`.
        """
        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
        num_frames = num_frames or self.num_frames
        sampling_fps = sampling_fps or self.sampling_fps

        duration = metadata.duration or metadata.total_num_frames / metadata.fps
        if frame_sample_mode == "fps":
            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
            # Try larger and larger FPSs until we hit one that can't span the video
            target_fps = candidate_target_fps[0]
            for candidate_fps in candidate_target_fps[1:]:
                if num_frames / candidate_fps < duration:
                    break
                target_fps = candidate_fps
            times = np.arange(0, num_frames) / target_fps
            times = times[times < duration]
            return times
        elif frame_sample_mode == "uniform_last_frame":
            if max_fps is not None:
                max_duration = (num_frames-1) / max_fps  # -1 to include the last frame
                if max_duration < duration:
                    # Video too long to honor max_fps: spread frames uniformly instead
                    times = np.linspace(
                        0, duration, num=num_frames, endpoint=True, dtype=np.float64
                    )
                else:
                    times = np.arange(0.0, stop=duration, step=1/max_fps)
                    # Always include the final timestamp so the last frame is sampled
                    times = np.concatenate([times, [duration]], axis=0)
                    assert len(times) <= num_frames
            else:
                times = np.linspace(
                    0, duration, num=num_frames, endpoint=True, dtype=np.float64
                )
            return times
        else:
            raise NotImplementedError(frame_sample_mode)

    def sample_frames(
        self,
        metadata: VideoMetadata,
        frame_sample_mode: Optional[str] = None,
        num_frames: Optional[int] = None,
        max_fps: Optional[int] = None,
        sampling_fps: Optional[int] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Frame-based sampling if an array video is passed
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            frame_sample_mode (`str`, *optional*):
                Mode to sample frames. Defaults to `self.frame_sample_mode`.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            max_fps (`int`, *optional*):
                Maximum frames per second to sample.
            sampling_fps (`int`, *optional*):
                Sampling frames per second. Defaults to `self.sampling_fps`.
                Used when `frame_sample_mode` is `"fps"`.
        """
        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
        num_frames = num_frames or self.num_frames
        sampling_fps = sampling_fps or self.sampling_fps

        total_num_frames = metadata.total_num_frames
        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
            duration = total_num_frames / metadata.fps
            if total_num_frames <= 2:
                return np.arange(total_num_frames).astype(int)
            if duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
                # uniform fallback
                indices = np.linspace(
                    0,
                    total_num_frames - 1,
                    num=min(num_frames, total_num_frames),
                    endpoint=True,
                ).astype(int)
                return indices
            else:
                float_indices = np.arange(
                    0.0, stop=total_num_frames - 1, step=float(metadata.fps / max_fps),
                )
                # Ensure the last frame is always included
                if np.round(float_indices[-1]) != total_num_frames - 1:
                    float_indices = np.concatenate([float_indices, [total_num_frames - 1]], axis=0)
                indices = np.round(float_indices).astype(int)
                assert indices[-1] < total_num_frames
                assert len(float_indices) <= num_frames
                return indices
        elif frame_sample_mode == "uniform_last_frame":
            indices = np.linspace(
                0, total_num_frames - 1, num=min(num_frames, total_num_frames), endpoint=True,
            ).astype(int)
            return indices
        elif frame_sample_mode == "fps":
            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
            selected_target_fps = get_target_fps(
                metadata.fps,
                num_frames,
                total_num_frames,
                frame_sample_mode,
                candidate_target_fps,
            )
            _, indices = get_frame_times_and_chosen_fps(
                selected_target_fps,
                total_num_frames,
                num_frames,
                metadata.fps,
            )
            return indices
        else:
            raise NotImplementedError(frame_sample_mode)

    def fetch_videos(
        self,
        video_url_or_urls: Union[str, list[str], list[list[str]]],
        sample_timestamps_fn=None
    ):
        """
        Convert a single or a list of urls into the corresponding `np.array` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned.
        """
        if (
            (not is_decord_available())
            and (not is_torchcodec_available())
            and (not is_av_available())
        ):
            raise ImportError(
                "MolmoAct2VideoProcessor requires `decord`, `torchcodec`, or `av` to be installed."
            )

        # Backend preference order: decord > torchcodec > pyav
        if is_decord_available():
            backend = "decord"
        elif is_torchcodec_available():
            warnings.warn(
                "`decord` is not installed and cannot be used to decode the video by default. "
                "Falling back to `torchcodec`."
            )
            backend = "torchcodec"
        else:
            warnings.warn(
                "`decord` is not installed and cannot be used to decode the video by default. "
                "Falling back to `PyAV`."
            )
            backend = "pyav"

        if isinstance(video_url_or_urls, list):
            return list(zip(*[self.fetch_videos(x, sample_timestamps_fn=sample_timestamps_fn) for x in video_url_or_urls]))
        else:
            return load_video(video_url_or_urls, backend=backend, sample_timestamps_fn=sample_timestamps_fn)

    def _decode_and_sample_videos(
        self,
        videos: VideoInput,
        video_metadata: Union[VideoMetadata, dict],
        do_sample_frames: Optional[bool] = None,
        sample_indices_fn: Optional[Callable] = None,
        sample_timestamps_fn: Optional[Callable] = None,
    ):
        """
        Decode input videos and sample frames if needed.
        """
        videos = make_batched_videos(videos)
        video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)

        # Framed-based sampling if an array video is passed
        # Otherwise, time-based sampling with decoding
        if is_valid_video(videos[0]) and do_sample_frames:
            assert video_metadata[0].fps is not None, "FPS must be provided for video input"
            sampled_videos = []
            sampled_metadata = []
            for video, metadata in zip(videos, video_metadata):
                indices = sample_indices_fn(metadata=metadata)
                metadata.frames_indices = indices
                sampled_videos.append(video[indices])
                sampled_metadata.append(metadata)
            videos = sampled_videos
            video_metadata = sampled_metadata
        elif not is_valid_video(videos[0]):
            if sample_indices_fn is None:
                logger.warning(
                    "do_sample_frames is False, but video array is not provided: "
                    "Will decode the video and sample frames using MolmoAct2's default sampling mode"
                )
            if isinstance(videos[0], list):
                raise ValueError(
                    "A list of images is not supported for video input!"
                )
            else:
                videos, video_metadata = self.fetch_videos(videos, sample_timestamps_fn=sample_timestamps_fn)

        return videos, video_metadata

    def _prepare_input_videos(
        self,
        videos: VideoInput,
        **kwargs,
    ) -> list[np.ndarray]:
        """Convert each input video to a numpy array."""
        processed_videos = [to_numpy(video) for video in videos]
        return processed_videos

    def preprocess(
        self,
        videos: VideoInput,
        **kwargs: Unpack[MolmoAct2VideoProcessorKwargs],
    ) -> BatchFeature:
        """Validate kwargs, decode/sample the videos and produce model inputs.

        Returns a `BatchFeature` from `_preprocess`, with `video_metadata`
        attached when `return_metadata` is set.
        """
        validate_kwargs(
            captured_kwargs=kwargs.keys(),
            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
        )

        # Set default kwargs from self. This ensures that if a kwarg is not provided
        # by the user, it gets its default value from the instance, or is set to None.
        for kwarg_name in self.valid_kwargs.__annotations__:
            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

        do_sample_frames = kwargs.pop("do_sample_frames")
        video_metadata = kwargs.pop("video_metadata")

        sample_indices_fn = partial(self.sample_frames, **kwargs) if do_sample_frames else None
        sample_timestamps_fn = partial(self.sample_times, **kwargs)
        videos, video_metadata = self._decode_and_sample_videos(
            videos,
            video_metadata=video_metadata,
            do_sample_frames=do_sample_frames,
            sample_indices_fn=sample_indices_fn,
            sample_timestamps_fn=sample_timestamps_fn,
        )
        videos = self._prepare_input_videos(videos=videos)

        kwargs = self._further_process_kwargs(**kwargs)

        return_metadata = kwargs.pop("return_metadata")
        preprocessed_videos = self._preprocess(videos=videos, **kwargs)
        if return_metadata:
            preprocessed_videos["video_metadata"] = video_metadata
        return preprocessed_videos

    def _preprocess(
        self,
        videos: list[np.ndarray],
        size: Optional[SizeDict] = None,
        resample: Optional[PILImageResampling] = None,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        patch_size: Optional[int] = None,
        pooling_size: Optional[list[int]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess a video for the model.
        Args:
            videos (`VideoInput`):
                Video to preprocess.
            size (`SizeDict`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
                The pooling size of the vision adapter.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.

        Returns:
            A `BatchFeature` containing the following keys:
            - `pixel_values_videos`: The preprocessed videos.
            - `video_token_pooling`: The indices of the patches in `crops` to pool for each token in `video_tokens`.
            - `video_grids`: The video grids.
        """
        if size.height is None or size.width is None:
            raise ValueError("size must contain 'height' and 'width' keys.")

        base_image_input_size = [size.height, size.width]

        resample = resample or self.resample
        image_mean = image_mean or self.image_mean
        image_std = image_std or self.image_std
        do_convert_rgb = do_convert_rgb or self.do_convert_rgb

        patch_size = patch_size or self.patch_size
        pooling_size = pooling_size or self.pooling_size

        image_pooling_h, image_pooling_w = pooling_size

        batch_grids = []
        batch_crops = []
        batch_pooled_patches_idx = []

        for video in videos:
            all_crops = []
            pooled_patches_idx = []

            for frame in video:
                image_grid, crops, pooled_idx = image_to_patches_and_grids(
                    frame,
                    base_image_input_size,
                    resample,
                    image_mean,
                    image_std,
                    patch_size,
                    image_pooling_w,
                    image_pooling_h,
                )
                # Shift this frame's pooled indices past all patches emitted so far;
                # negative indices are padding markers and must stay negative
                offset = sum(np.prod(x.shape[:2]) for x in all_crops)
                pooled_idx_with_offset = np.where(pooled_idx >= 0, pooled_idx + offset, pooled_idx)
                pooled_patches_idx.append(pooled_idx_with_offset)
                all_crops.append(crops)

            video_grid = np.array([len(video), image_grid[0], image_grid[1]])
            all_crops = np.concatenate(all_crops, 0)
            pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)

            batch_grids.append(video_grid)
            batch_crops.append(all_crops)
            batch_pooled_patches_idx.append(pooled_patches_idx)

        video_grids = np.stack(batch_grids, 0)
        pixel_values_videos = np.concatenate(batch_crops, 0)
        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)

        data = dict(
            pixel_values_videos=pixel_values_videos,
            video_token_pooling=video_token_pooling,
            video_grids=video_grids,
        )

        return BatchFeature(data, tensor_type=return_tensors)
967
+
968
+
969
+ MolmoAct2VideoProcessor.register_for_auto_class()