{
"model_id": "deepseek-ai/DeepSeek-V4-Pro",
"moe_config": {
"model_type": "deepseek_v4",
"num_layers": 61,
"hidden_size": 7168,
"intermediate_size": 18432,
"moe_intermediate_size": 3072,
"n_routed_experts": 384,
"n_shared_experts": 1,
"num_experts_per_tok": 6,
"first_k_dense_replace": 0,
"torch_dtype": "bfloat16",
"quant_method": "fp8",
"templates": {
"expert_gate_proj": "layers.{layer}.ffn.experts.{e}.w1.weight",
"expert_up_proj": "layers.{layer}.ffn.experts.{e}.w3.weight",
"expert_down_proj": "layers.{layer}.ffn.experts.{e}.w2.weight",
"fused_gate_proj": "layers.{layer}.ffn.experts.w1",
"fused_down_proj": "layers.{layer}.ffn.experts.w2",
"shared_down_proj": [
"layers.{layer}.ffn.shared_experts.w2.weight"
],
"router": [
"layers.{layer}.ffn.gate.weight",
"layers.{layer}.ffn.router.weight"
],
"dense_down_proj": [
"layers.{layer}.ffn.w2.weight"
]
}
},
"num_feats": 64,
"provenance": {
"aggregator_type": "moe",
"quant_format": "fp8",
"probe_mode": "weight_svd_per_expert",
"fp8_handling": "cast_to_bfloat16_for_svd"
}
}