{
"attention_projection_layout": "fused_qkv",
"vocab_size": 152576,
"max_position_embeddings": 1048576,
"hidden_size": 4096,
"intermediate_size": 16384,
"num_hidden_layers": 48,
"num_attention_heads": 64,
"num_key_value_heads": 4,
"hidden_act": "silu",
"initializer_range": 0.02,
"layernorm_epsilon": 1e-05,
"use_cache": true,
"rope_theta": 10000000,
"rope_parameters": {
"rope_type": "default",
"type": "default",
"rope_theta": 10000000,
"partial_rotary_factor": 0.334
},
"auto_map": {
"AutoConfig": "configuration_mimo_v2.MiMoV2Config",
"AutoModel": "modeling_mimo_v2.MiMoV2Model",
"AutoModelForCausalLM": "modeling_mimo_v2.MiMoV2ForCausalLM"
},
"attention_dropout": 0.0,
"attention_bias": false,
"attention_value_scale": 0.707,
"head_dim": 192,
"v_head_dim": 128,
"swa_num_attention_heads": 64,
"swa_num_key_value_heads": 8,
"swa_head_dim": 192,
"swa_v_head_dim": 128,
"swa_rope_theta": 10000,
"sliding_window": 128,
"sliding_window_size": 128,
"add_full_attention_sink_bias": false,
"add_swa_attention_sink_bias": true,
"hybrid_block_size": null,
"hybrid_layer_pattern": [
0,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
1,
1,
1,
0
],
"partial_rotary_factor": 0.334,
"n_routed_experts": 256,
"moe_intermediate_size": 2048,
"num_experts_per_tok": 8,
"routed_scaling_factor": null,
"scoring_func": "sigmoid",
"topk_method": "noaux_tc",
"n_group": 1,
"topk_group": 1,
"norm_topk_prob": true,
"moe_layer_freq": [
0,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"vision_config": {
"depth": 28,
"fullatt_block_indexes": [
0,
9,
18,
27
],
"hidden_act": "silu",
"hidden_size": 1280,
"in_chans": 3,
"intermediate_size": 4608,
"num_heads": 32,
"num_key_value_heads": 8,
"num_query_groups": 4,
"out_hidden_size": 4096,
"patch_size": 16,
"spatial_merge_size": 2,
"spatial_patch_size": 16,
"temporal_patch_size": 2,
"tokens_per_second": 2,
"use_sink": true,
"visual_token_window_size": 64,
"vit_window_attn_types": [
-1,
0,
0,
0,
0,
1,
1,
1,
1,
-1,
0,
0,
0,
0,
1,
1,
1,
1,
-1,
0,
0,
0,
0,
1,
1,
1,
1,
-1
],
"window_size": 128
},
"audio_config": {
"add_post_norm": true,
"audio_channels": 20,
"audio_segment_size": 6000,
"group_size": 4,
"input_full_attention": true,
"input_local_attn_heads": 16,
"input_local_dim": 1024,
"input_local_head_dim": 64,
"input_local_hidden_dropout": 0.0,
"input_local_intermediate_size": 4096,
"input_local_layers": 6,
"out_hidden_size": 4096,
"partial_rotary_factor": 1.0,
"projection_layers": 2,
"rope_theta": 640000,
"speech_vocab_size": 1280,
"speech_zeroemb_idx": 1024
},
"processor_config": {
"audio_avg_pooler": 2,
"audio_channels": 20,
"audio_end_token_id": 151674,
"audio_fmax": null,
"audio_fmin": 0,
"audio_group_size": 4,
"audio_hop_length": 240,
"audio_input_id_per_second": 25.0,
"audio_kernel_size": 3,
"audio_n_mels": 128,
"audio_nfft": 960,
"audio_sampling_rate": 24000,
"audio_segment_size": 6000,
"audio_start_token_id": 151673,
"audio_stride_size": 2,
"audio_token_id": 151669,
"audio_window_size": 960,
"audio_zeroemb_idx": [
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024
],
"fps": 1.0,
"image_max_pixels": 8388608,
"image_min_pixels": 8192,
"image_token_id": 151655,
"max_frames": 3600,
"merge_size": 2,
"min_frames": null,
"num_frames": null,
"pad_token_id": 151643,
"patch_size": 16,
"rope_type": "rope",
"temporal_compression_ratio": 1,
"temporal_patch_size": 2,
"use_per_grid_t_timestamps": false,
"use_video_timestamps": true,
"video_audio_interleave_length": 0.0,
"video_end_token_id": 151671,
"video_max_pixels": 8388608,
"video_min_pixels": 8192,
"video_process_num_threads": 16,
"video_start_token_id": 151670,
"video_token_id": 151656,
"video_tokens_per_second": 2,
"video_total_max_pixels": 268435456,
"vision_end_token_id": 151653,
"vision_start_token_id": 151652
},
"image_token_id": 151655,
"video_token_id": 151656,
"vision_start_token_id": 151652,
"vision_end_token_id": 151653,
"vision_model_type": "mimovl",
"audio_token_id": 151669,
"audio_start_token_id": 151673,
"audio_end_token_id": 151674,
"transformers_version": "5.6.2",
"architectures": [
"MiMoV2ForCausalLM"
],
"output_hidden_states": false,
"return_dict": true,
"dtype": "bfloat16",
"chunk_size_feed_forward": 0,
"is_encoder_decoder": false,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"problem_type": null,
"_name_or_path": "",
"tie_word_embeddings": false,
"attention_chunk_size": 128,
"eos_token_id": 151645,
"model_type": "mimo_v2",
"n_shared_experts": null,
"pad_token_id": 151643,
"layer_types": [
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention"
],
"output_attentions": false,
"quantization_config": {
"config_groups": {
"group_0": {
"input_activations": {
"dynamic": false,
"num_bits": 4,
"type": "float",
"group_size": 16
},
"weights": {
"dynamic": false,
"num_bits": 4,
"type": "float",
"group_size": 16
},
"targets": [
"Linear"
]
}
},
"ignore": [
"audio_encoder*",
"speech_embeddings*",
"visual*",
"lm_head",
"model.embed_tokens",
"model.norm",
"model.rotary_emb*",
"model.swa_rotary_emb*",
"model.mtp*",
"model.layers.0*",
"model.layers.1.self_attn*",
"model.layers.1.mlp.gate*",
"model.layers.1.mlp.shared_expert*",
"model.layers.1.mlp.shared_expert_gate*",
"model.layers.2.self_attn*",
"model.layers.2.mlp.gate*",
"model.layers.2.mlp.shared_expert*",
"model.layers.2.mlp.shared_expert_gate*",
"model.layers.3.self_attn*",
"model.layers.3.mlp.gate*",
"model.layers.3.mlp.shared_expert*",
"model.layers.3.mlp.shared_expert_gate*",
"model.layers.4.self_attn*",
"model.layers.4.mlp.gate*",
"model.layers.4.mlp.shared_expert*",
"model.layers.4.mlp.shared_expert_gate*",
"model.layers.5.self_attn*",
"model.layers.5.mlp.gate*",
"model.layers.5.mlp.shared_expert*",
"model.layers.5.mlp.shared_expert_gate*",
"model.layers.6.self_attn*",
"model.layers.6.mlp.gate*",
"model.layers.6.mlp.shared_expert*",
"model.layers.6.mlp.shared_expert_gate*",
"model.layers.7.self_attn*",
"model.layers.7.mlp.gate*",
"model.layers.7.mlp.shared_expert*",
"model.layers.7.mlp.shared_expert_gate*",
"model.layers.8.self_attn*",
"model.layers.8.mlp.gate*",
"model.layers.8.mlp.shared_expert*",
"model.layers.8.mlp.shared_expert_gate*",
"model.layers.9.self_attn*",
"model.layers.9.mlp.gate*",
"model.layers.9.mlp.shared_expert*",
"model.layers.9.mlp.shared_expert_gate*",
"model.layers.10.self_attn*",
"model.layers.10.mlp.gate*",
"model.layers.10.mlp.shared_expert*",
"model.layers.10.mlp.shared_expert_gate*",
"model.layers.11.self_attn*",
"model.layers.11.mlp.gate*",
"model.layers.11.mlp.shared_expert*",
"model.layers.11.mlp.shared_expert_gate*",
"model.layers.12.self_attn*",
"model.layers.12.mlp.gate*",
"model.layers.12.mlp.shared_expert*",
"model.layers.12.mlp.shared_expert_gate*",
"model.layers.13.self_attn*",
"model.layers.13.mlp.gate*",
"model.layers.13.mlp.shared_expert*",
"model.layers.13.mlp.shared_expert_gate*",
"model.layers.14.self_attn*",
"model.layers.14.mlp.gate*",
"model.layers.14.mlp.shared_expert*",
"model.layers.14.mlp.shared_expert_gate*",
"model.layers.15.self_attn*",
"model.layers.15.mlp.gate*",
"model.layers.15.mlp.shared_expert*",
"model.layers.15.mlp.shared_expert_gate*",
"model.layers.16.self_attn*",
"model.layers.16.mlp.gate*",
"model.layers.16.mlp.shared_expert*",
"model.layers.16.mlp.shared_expert_gate*",
"model.layers.17.self_attn*",
"model.layers.17.mlp.gate*",
"model.layers.17.mlp.shared_expert*",
"model.layers.17.mlp.shared_expert_gate*",
"model.layers.18.self_attn*",
"model.layers.18.mlp.gate*",
"model.layers.18.mlp.shared_expert*",
"model.layers.18.mlp.shared_expert_gate*",
"model.layers.19.self_attn*",
"model.layers.19.mlp.gate*",
"model.layers.19.mlp.shared_expert*",
"model.layers.19.mlp.shared_expert_gate*",
"model.layers.20.self_attn*",
"model.layers.20.mlp.gate*",
"model.layers.20.mlp.shared_expert*",
"model.layers.20.mlp.shared_expert_gate*",
"model.layers.21.self_attn*",
"model.layers.21.mlp.gate*",
"model.layers.21.mlp.shared_expert*",
"model.layers.21.mlp.shared_expert_gate*",
"model.layers.22.self_attn*",
"model.layers.22.mlp.gate*",
"model.layers.22.mlp.shared_expert*",
"model.layers.22.mlp.shared_expert_gate*",
"model.layers.23.self_attn*",
"model.layers.23.mlp.gate*",
"model.layers.23.mlp.shared_expert*",
"model.layers.23.mlp.shared_expert_gate*",
"model.layers.24.self_attn*",
"model.layers.24.mlp.gate*",
"model.layers.24.mlp.shared_expert*",
"model.layers.24.mlp.shared_expert_gate*",
"model.layers.25.self_attn*",
"model.layers.25.mlp.gate*",
"model.layers.25.mlp.shared_expert*",
"model.layers.25.mlp.shared_expert_gate*",
"model.layers.26.self_attn*",
"model.layers.26.mlp.gate*",
"model.layers.26.mlp.shared_expert*",
"model.layers.26.mlp.shared_expert_gate*",
"model.layers.27.self_attn*",
"model.layers.27.mlp.gate*",
"model.layers.27.mlp.shared_expert*",
"model.layers.27.mlp.shared_expert_gate*",
"model.layers.28.self_attn*",
"model.layers.28.mlp.gate*",
"model.layers.28.mlp.shared_expert*",
"model.layers.28.mlp.shared_expert_gate*",
"model.layers.29.self_attn*",
"model.layers.29.mlp.gate*",
"model.layers.29.mlp.shared_expert*",
"model.layers.29.mlp.shared_expert_gate*",
"model.layers.30.self_attn*",
"model.layers.30.mlp.gate*",
"model.layers.30.mlp.shared_expert*",
"model.layers.30.mlp.shared_expert_gate*",
"model.layers.31.self_attn*",
"model.layers.31.mlp.gate*",
"model.layers.31.mlp.shared_expert*",
"model.layers.31.mlp.shared_expert_gate*",
"model.layers.32.self_attn*",
"model.layers.32.mlp.gate*",
"model.layers.32.mlp.shared_expert*",
"model.layers.32.mlp.shared_expert_gate*",
"model.layers.33.self_attn*",
"model.layers.33.mlp.gate*",
"model.layers.33.mlp.shared_expert*",
"model.layers.33.mlp.shared_expert_gate*",
"model.layers.34.self_attn*",
"model.layers.34.mlp.gate*",
"model.layers.34.mlp.shared_expert*",
"model.layers.34.mlp.shared_expert_gate*",
"model.layers.35.self_attn*",
"model.layers.35.mlp.gate*",
"model.layers.35.mlp.shared_expert*",
"model.layers.35.mlp.shared_expert_gate*",
"model.layers.36.self_attn*",
"model.layers.36.mlp.gate*",
"model.layers.36.mlp.shared_expert*",
"model.layers.36.mlp.shared_expert_gate*",
"model.layers.37.self_attn*",
"model.layers.37.mlp.gate*",
"model.layers.37.mlp.shared_expert*",
"model.layers.37.mlp.shared_expert_gate*",
"model.layers.38.self_attn*",
"model.layers.38.mlp.gate*",
"model.layers.38.mlp.shared_expert*",
"model.layers.38.mlp.shared_expert_gate*",
"model.layers.39.self_attn*",
"model.layers.39.mlp.gate*",
"model.layers.39.mlp.shared_expert*",
"model.layers.39.mlp.shared_expert_gate*",
"model.layers.40.self_attn*",
"model.layers.40.mlp.gate*",
"model.layers.40.mlp.shared_expert*",
"model.layers.40.mlp.shared_expert_gate*",
"model.layers.41.self_attn*",
"model.layers.41.mlp.gate*",
"model.layers.41.mlp.shared_expert*",
"model.layers.41.mlp.shared_expert_gate*",
"model.layers.42.self_attn*",
"model.layers.42.mlp.gate*",
"model.layers.42.mlp.shared_expert*",
"model.layers.42.mlp.shared_expert_gate*",
"model.layers.43.self_attn*",
"model.layers.43.mlp.gate*",
"model.layers.43.mlp.shared_expert*",
"model.layers.43.mlp.shared_expert_gate*",
"model.layers.44.self_attn*",
"model.layers.44.mlp.gate*",
"model.layers.44.mlp.shared_expert*",
"model.layers.44.mlp.shared_expert_gate*",
"model.layers.45.self_attn*",
"model.layers.45.mlp.gate*",
"model.layers.45.mlp.shared_expert*",
"model.layers.45.mlp.shared_expert_gate*",
"model.layers.46.self_attn*",
"model.layers.46.mlp.gate*",
"model.layers.46.mlp.shared_expert*",
"model.layers.46.mlp.shared_expert_gate*",
"model.layers.47.self_attn*",
"model.layers.47.mlp.gate*",
"model.layers.47.mlp.shared_expert*",
"model.layers.47.mlp.shared_expert_gate*"
],
"quant_algo": "NVFP4",
"producer": {
"name": "modelopt",
"version": "0.39.0.dev290+gf9d9a71de.d20260407"
},
"quant_method": "modelopt"
}
}