deepseek-ai/DeepSeek-V4-Pro
#153
by cmax123 - opened
- DeepSeek_V4.pdf +2 -2
- config.json +0 -1
- inference/model.py +2 -1
DeepSeek_V4.pdf
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4cbe4fcbd2888b25b2890a98cc6ef4ce0489df7c93e140b6f853c451d3f5c52
|
| 3 |
+
size 4479907
|
config.json
CHANGED
|
@@ -6,7 +6,6 @@
|
|
| 6 |
"attention_dropout": 0.0,
|
| 7 |
"bos_token_id": 0,
|
| 8 |
"eos_token_id": 1,
|
| 9 |
-
"expert_dtype": "fp4",
|
| 10 |
"hc_eps": 1e-06,
|
| 11 |
"hc_mult": 4,
|
| 12 |
"hc_sinkhorn_iters": 20,
|
|
|
|
| 6 |
"attention_dropout": 0.0,
|
| 7 |
"bos_token_id": 0,
|
| 8 |
"eos_token_id": 1,
|
|
|
|
| 9 |
"hc_eps": 1e-06,
|
| 10 |
"hc_mult": 4,
|
| 11 |
"hc_sinkhorn_iters": 20,
|
inference/model.py
CHANGED
|
@@ -624,7 +624,8 @@ class MoE(nn.Module):
|
|
| 624 |
self.experts = nn.ModuleList([Expert(args.dim, args.moe_inter_dim, dtype=expert_dtype, swiglu_limit=args.swiglu_limit) if self.experts_start_idx <= i < self.experts_end_idx else None
|
| 625 |
for i in range(self.n_routed_experts)])
|
| 626 |
assert args.n_shared_experts == 1
|
| 627 |
-
|
|
|
|
| 628 |
|
| 629 |
def forward(self, x: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
|
| 630 |
shape = x.size()
|
|
|
|
| 624 |
self.experts = nn.ModuleList([Expert(args.dim, args.moe_inter_dim, dtype=expert_dtype, swiglu_limit=args.swiglu_limit) if self.experts_start_idx <= i < self.experts_end_idx else None
|
| 625 |
for i in range(self.n_routed_experts)])
|
| 626 |
assert args.n_shared_experts == 1
|
| 627 |
+
# no swiglu_limit
|
| 628 |
+
self.shared_experts = Expert(args.dim, args.moe_inter_dim)
|
| 629 |
|
| 630 |
def forward(self, x: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
|
| 631 |
shape = x.size()
|