{
  "model_id": "deepseek-ai/DeepSeek-V4-Pro",
  "moe_config": {
    "model_type": "deepseek_v4",
    "num_layers": 61,
    "hidden_size": 7168,
    "intermediate_size": 18432,
    "moe_intermediate_size": 3072,
    "n_routed_experts": 384,
    "n_shared_experts": 1,
    "num_experts_per_tok": 6,
    "first_k_dense_replace": 0,
    "torch_dtype": "bfloat16",
    "quant_method": "fp8",
    "templates": {
      "expert_gate_proj": "layers.{layer}.ffn.experts.{e}.w1.weight",
      "expert_up_proj": "layers.{layer}.ffn.experts.{e}.w3.weight",
      "expert_down_proj": "layers.{layer}.ffn.experts.{e}.w2.weight",
      "fused_gate_proj": "layers.{layer}.ffn.experts.w1",
      "fused_down_proj": "layers.{layer}.ffn.experts.w2",
      "shared_down_proj": [
        "layers.{layer}.ffn.shared_experts.w2.weight"
      ],
      "router": [
        "layers.{layer}.ffn.gate.weight",
        "layers.{layer}.ffn.router.weight"
      ],
      "dense_down_proj": [
        "layers.{layer}.ffn.w2.weight"
      ]
    }
  },
  "num_feats": 64,
  "provenance": {
    "aggregator_type": "moe",
    "quant_format": "fp8",
    "probe_mode": "weight_svd_per_expert",
    "fp8_handling": "cast_to_bfloat16_for_svd"
  }
}