MiMo-V2.5-ASR-MLX-8bit / config.json
ailuntz's picture
Add metadata files for 8bit export
5b2f3b0 verified
{
"add_input_local_transformer": true,
"add_speech_sosp_eosp": false,
"architectures": [
"MiMoV2ASRForCausalLM"
],
"attention_bias": true,
"attention_dropout": 0.0,
"audio_channels": 8,
"delay_pattern": "0-1-2-3-4-5-6-7",
"dtype": "bfloat16",
"empty_loss_weight": 0.01,
"group_size": 4,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"input_full_attention": true,
"input_local_dim": 1024,
"input_local_layers": 6,
"intermediate_size": 11008,
"layer_types": [
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention"
],
"local_attn_dropout": 0.1,
"local_attn_heads": 64,
"local_dim": 1024,
"local_ffn_dim": 4096,
"local_hidden_dropout": 0.1,
"local_layers": 16,
"local_rotary_base": 640000,
"max_position_embeddings": 8192,
"max_window_layers": 28,
"mlp_layers": 1,
"model_type": "qwen2",
"n_rvq": 20,
"no_speech_loss": false,
"no_text_loss": false,
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 640000,
"sliding_window": null,
"speech_vocab_size": "1025-1025-129-129-129-129-129-129",
"speech_zeroemb_idx": "1024-1024-128-128-128-128-128-128",
"tie_word_embeddings": false,
"transformers_version": "4.57.1",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151680,
"audio_config": {
"tokenizer_version": "v1",
"speech_vocab_size": "1025-1025-129-129-129-129-129-129",
"speech_zeroemb_idx": "1024-1024-128-128-128-128-128-128",
"group_size": 4,
"audio_channels": 8,
"input_local_layers": 6,
"input_local_dim": 1024,
"input_full_attention": true,
"input_local_attn_heads": 64,
"input_local_head_dim": 16,
"input_local_intermediate_size": 4096,
"input_local_hidden_dropout": 0.1,
"out_hidden_size": 4096,
"rope_theta": 640000,
"partial_rotary_factor": 1.0,
"projection_layers": 1,
"add_post_norm": true,
"audio_segment_size": 6000
},
"quantization": {
"group_size": 64,
"bits": 8,
"mode": "affine"
},
"quantization_config": {
"group_size": 64,
"bits": 8,
"mode": "affine"
}
}