{
  "action_end_token_id": 151933,
  "action_expert_condition_source": "kv_cache",
  "action_expert_config": {
    "attn_dropout": 0.0,
    "causal_attn": false,
    "compile": "blocks",
    "context_layer_norm": true,
    "dropout": 0.0,
    "ffn_multiple_of": 256,
    "hidden_size": 768,
    "implementation": "new",
    "max_action_dim": 32,
    "max_horizon": 32,
    "mlp_ratio": 4.0,
    "model_type": "molmoact2_action_expert",
    "num_heads": 8,
    "num_layers": 36,
    "qk_norm": true,
    "qk_norm_eps": 1e-06,
    "rope": true,
    "rope_on_cross_attention": true,
    "timestep_embed_dim": 256
  },
  "action_expert_depth_gate": false,
  "action_expert_depth_gate_init_bias": -4.0,
  "action_expert_depth_gate_per_layer": false,
  "action_expert_layer_mode": "per_layer",
  "action_format": "both",
  "action_horizon": 30,
  "action_output_token_id": 151931,
  "action_start_token_id": 151932,
  "action_token_start_id": 151934,
  "adapter_config": {
    "attention_dropout": 0.0,
    "attn_implementation": "sdpa",
    "float32_attention": true,
    "head_dim": 72,
    "hidden_act": "silu",
    "hidden_size": 1152,
    "image_feature_dropout": 0.0,
    "initializer_range": 0.02,
    "intermediate_size": 9728,
    "model_type": "molmoact2",
    "num_attention_heads": 16,
    "num_key_value_heads": 16,
    "pooling_attention_mask": true,
    "residual_dropout": 0.0,
    "text_hidden_size": 2560,
    "vit_layers": [
      -3,
      -9
    ]
  },
  "add_action_expert": true,
  "add_control_tokens": true,
  "add_setup_tokens": true,
  "architectures": [
    "MolmoAct2ForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_molmoact2.MolmoAct2Config",
    "AutoModelForImageTextToText": "modeling_molmoact2.MolmoAct2ForConditionalGeneration"
  },
  "depth_end_token_id": 153984,
  "depth_mode": 2,
  "depth_output_token_id": 153982,
  "depth_start_token_id": 153983,
  "depth_token_start_id": 153985,
  "dtype": "float32",
  "enable_depth_reasoning": true,
  "flow_matching_beta_alpha": 1.0,
  "flow_matching_beta_beta": 1.5,
  "flow_matching_cutoff": 1.0,
  "flow_matching_num_steps": 10,
  "flow_matching_time_offset": 0.001,
  "flow_matching_time_scale": 0.999,
  "frame_end_token_id": 155656,
  "frame_start_token_id": 155655,
  "image_col_id": 155651,
  "image_end_token_id": 155649,
  "image_high_res_id": 155650,
  "image_low_res_id": 155654,
  "image_patch_id": 155650,
  "image_start_token_id": 155648,
  "initializer_range": 0.02,
  "low_res_image_start_token_id": 155652,
  "mask_action_dim_padding": true,
  "max_action_dim": 32,
  "model_type": "molmoact2",
  "n_obs_steps": 1,
  "norm_stats_filename": "norm_stats.json",
  "num_action_tokens": 2048,
  "num_depth_codes": 100,
  "num_depth_tokens": 128,
  "num_state_tokens": 256,
  "state_end_token_id": 151674,
  "state_format": "discrete",
  "state_start_token_id": 151673,
  "state_token_start_id": 151675,
  "text_config": {
    "additional_vocab_size": 128,
    "attention_dropout": 0.0,
    "attn_implementation": "sdpa",
    "embedding_dropout": 0.0,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2560,
    "initializer_range": 0.02,
    "intermediate_size": 9728,
    "layer_norm_eps": 1e-06,
    "max_position_embeddings": 16384,
    "model_type": "molmoact2_text",
    "norm_after": false,
    "num_attention_heads": 32,
    "num_hidden_layers": 36,
    "num_key_value_heads": 8,
    "qk_norm_type": "qwen3",
    "qkv_bias": false,
    "residual_dropout": 0.0,
    "rope_parameters": {
      "rope_theta": 5000000.0,
      "rope_type": "default"
    },
    "rope_scaling_layers": null,
    "rope_theta": 5000000.0,
    "tie_word_embeddings": false,
    "use_cache": true,
    "use_qk_norm": true,
    "vocab_size": 155648
  },
  "tie_word_embeddings": false,
  "transformers_version": "5.3.0",
  "use_frame_special_tokens": true,
  "vit_config": {
    "attention_dropout": 0.0,
    "attn_implementation": "sdpa",
    "float32_attention": true,
    "head_dim": 72,
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1152,
    "image_default_input_size": [
      378,
      378
    ],
    "image_num_pos": 729,
    "image_patch_size": 14,
    "initializer_range": 0.02,
    "intermediate_size": 4304,
    "layer_norm_eps": 1e-06,
    "model_type": "molmoact2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "num_key_value_heads": 16,
    "residual_dropout": 0.0
  },
  "bos_token_id": 151645,
  "eos_token_id": 151645,
  "pad_token_id": 151643
}