Files changed (3) hide show
  1. DeepSeek_V4.pdf +2 -2
  2. config.json +0 -1
  3. inference/model.py +2 -1
DeepSeek_V4.pdf CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a03dadae71894de1515da33e296f0df1dbeed3e7f4bf0badd087f9af77f29e9
3
- size 4480407
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4cbe4fcbd2888b25b2890a98cc6ef4ce0489df7c93e140b6f853c451d3f5c52
3
+ size 4479907
config.json CHANGED
@@ -6,7 +6,6 @@
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 0,
8
  "eos_token_id": 1,
9
- "expert_dtype": "fp4",
10
  "hc_eps": 1e-06,
11
  "hc_mult": 4,
12
  "hc_sinkhorn_iters": 20,
 
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 0,
8
  "eos_token_id": 1,
 
9
  "hc_eps": 1e-06,
10
  "hc_mult": 4,
11
  "hc_sinkhorn_iters": 20,
inference/model.py CHANGED
@@ -624,7 +624,8 @@ class MoE(nn.Module):
624
  self.experts = nn.ModuleList([Expert(args.dim, args.moe_inter_dim, dtype=expert_dtype, swiglu_limit=args.swiglu_limit) if self.experts_start_idx <= i < self.experts_end_idx else None
625
  for i in range(self.n_routed_experts)])
626
  assert args.n_shared_experts == 1
627
- self.shared_experts = Expert(args.dim, args.moe_inter_dim, swiglu_limit=args.swiglu_limit)
 
628
 
629
  def forward(self, x: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
630
  shape = x.size()
 
624
  self.experts = nn.ModuleList([Expert(args.dim, args.moe_inter_dim, dtype=expert_dtype, swiglu_limit=args.swiglu_limit) if self.experts_start_idx <= i < self.experts_end_idx else None
625
  for i in range(self.n_routed_experts)])
626
  assert args.n_shared_experts == 1
627
+ # no swiglu_limit
628
+ self.shared_experts = Expert(args.dim, args.moe_inter_dim)
629
 
630
  def forward(self, x: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
631
  shape = x.size()