qgallouedec HF Staff committed on
Commit
7f3a06f
·
verified ·
1 Parent(s): 5db63c1

Upload DeepseekV4ForCausalLM

Browse files
Files changed (4) hide show
  1. chat_template.jinja +10 -0
  2. config.json +51 -35
  3. generation_config.json +1 -1
  4. model.safetensors +2 -2
chat_template.jinja CHANGED
@@ -60,6 +60,16 @@
60
  {%- endif -%}
61
  {%- endfor -%}
62
 
 
 
 
 
 
 
 
 
 
 
63
  {%- set lu = namespace(idx=-1) -%}
64
  {%- for m in mns.list -%}
65
  {%- if m.role == "user" or m.role == "developer" -%}
 
60
  {%- endif -%}
61
  {%- endfor -%}
62
 
63
+ {%- if tools -%}
64
+ {%- set anchor_ns = namespace(found=false) -%}
65
+ {%- for m in mns.list -%}
66
+ {%- if m.role == "system" or m.role == "developer" -%}{%- set anchor_ns.found = true -%}{%- endif -%}
67
+ {%- endfor -%}
68
+ {%- if not anchor_ns.found -%}
69
+ {%- set mns.list = [{"role": "system", "content": ""}] + mns.list -%}
70
+ {%- endif -%}
71
+ {%- endif -%}
72
+
73
  {%- set lu = namespace(idx=-1) -%}
74
  {%- for m in mns.list -%}
75
  {%- if m.role == "user" or m.role == "developer" -%}
config.json CHANGED
@@ -5,26 +5,13 @@
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 0,
8
- "compress_ratios": [
9
- 0,
10
- 0,
11
- 4,
12
- 128
13
- ],
14
- "compress_rope_parameters": {
15
- "beta_fast": 32,
16
- "beta_slow": 1,
17
- "factor": 16,
18
- "original_max_position_embeddings": 65536,
19
- "partial_rotary_factor": 0.125,
20
- "rope_theta": 160000.0,
21
- "rope_type": "yarn",
22
- "type": "yarn"
23
  },
24
  "compress_rope_theta": 160000.0,
25
  "dtype": "bfloat16",
26
  "eos_token_id": 1,
27
- "first_k_dense_replace": 2,
28
  "hc_eps": 1e-06,
29
  "hc_mult": 4,
30
  "hc_sinkhorn_iters": 20,
@@ -35,18 +22,27 @@
35
  "index_n_heads": 64,
36
  "index_topk": 512,
37
  "initializer_range": 0.02,
38
- "intermediate_size": 32,
39
- "kv_lora_rank": null,
 
 
 
 
40
  "max_position_embeddings": 1048576,
 
 
 
 
 
 
 
41
  "model_type": "deepseek_v4",
42
- "moe_intermediate_size": 2048,
43
- "n_group": null,
44
- "n_routed_experts": 4,
45
  "n_shared_experts": 1,
46
  "norm_topk_prob": true,
47
  "num_attention_heads": 4,
48
- "num_experts_per_tok": 2,
49
- "num_hash_layers": 3,
50
  "num_hidden_layers": 4,
51
  "num_key_value_heads": 1,
52
  "num_nextn_predict_layers": 1,
@@ -55,21 +51,43 @@
55
  "output_router_logits": false,
56
  "pad_token_id": null,
57
  "partial_rotary_factor": 0.125,
58
- "pretraining_tp": 1,
59
  "q_lora_rank": 1024,
60
- "qk_nope_head_dim": 448,
61
  "qk_rope_head_dim": 64,
 
 
 
 
 
 
 
 
 
 
62
  "rms_norm_eps": 1e-06,
63
- "rope_interleave": true,
64
  "rope_parameters": {
65
- "beta_fast": 32,
66
- "beta_slow": 1,
67
- "factor": 16,
68
- "original_max_position_embeddings": 65536,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  "partial_rotary_factor": 0.125,
70
  "rope_theta": 10000.0,
71
- "rope_type": "yarn",
72
- "type": "yarn"
73
  },
74
  "rope_theta": 10000.0,
75
  "routed_scaling_factor": 1.5,
@@ -79,10 +97,8 @@
79
  "sliding_window": 128,
80
  "swiglu_limit": 10.0,
81
  "tie_word_embeddings": false,
82
- "topk_group": null,
83
  "topk_method": "noaux_tc",
84
- "transformers_version": "5.7.0.dev0",
85
  "use_cache": true,
86
- "v_head_dim": null,
87
  "vocab_size": 129280
88
  }
 
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 0,
8
+ "compress_rates": {
9
+ "compressed_sparse_attention": 4,
10
+ "heavily_compressed_attention": 128
 
 
 
 
 
 
 
 
 
 
 
 
11
  },
12
  "compress_rope_theta": 160000.0,
13
  "dtype": "bfloat16",
14
  "eos_token_id": 1,
 
15
  "hc_eps": 1e-06,
16
  "hc_mult": 4,
17
  "hc_sinkhorn_iters": 20,
 
22
  "index_n_heads": 64,
23
  "index_topk": 512,
24
  "initializer_range": 0.02,
25
+ "layer_types": [
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "compressed_sparse_attention",
29
+ "heavily_compressed_attention"
30
+ ],
31
  "max_position_embeddings": 1048576,
32
+ "mlp_bias": false,
33
+ "mlp_layer_types": [
34
+ "hash_moe",
35
+ "hash_moe",
36
+ "hash_moe",
37
+ "moe"
38
+ ],
39
  "model_type": "deepseek_v4",
40
+ "moe_intermediate_size": 32,
41
+ "n_routed_experts": 256,
 
42
  "n_shared_experts": 1,
43
  "norm_topk_prob": true,
44
  "num_attention_heads": 4,
45
+ "num_experts_per_tok": 6,
 
46
  "num_hidden_layers": 4,
47
  "num_key_value_heads": 1,
48
  "num_nextn_predict_layers": 1,
 
51
  "output_router_logits": false,
52
  "pad_token_id": null,
53
  "partial_rotary_factor": 0.125,
 
54
  "q_lora_rank": 1024,
 
55
  "qk_rope_head_dim": 64,
56
+ "quantization_config": {
57
+ "activation_scheme": "dynamic",
58
+ "fmt": "e4m3",
59
+ "quant_method": "fp8",
60
+ "scale_fmt": "ue8m0",
61
+ "weight_block_size": [
62
+ 128,
63
+ 128
64
+ ]
65
+ },
66
  "rms_norm_eps": 1e-06,
 
67
  "rope_parameters": {
68
+ "compress": {
69
+ "beta_fast": 32,
70
+ "beta_slow": 1,
71
+ "factor": 16,
72
+ "original_max_position_embeddings": 65536,
73
+ "partial_rotary_factor": 0.125,
74
+ "rope_theta": 160000,
75
+ "rope_type": "yarn",
76
+ "type": "yarn"
77
+ },
78
+ "main": {
79
+ "beta_fast": 32,
80
+ "beta_slow": 1,
81
+ "factor": 16,
82
+ "original_max_position_embeddings": 65536,
83
+ "partial_rotary_factor": 0.125,
84
+ "rope_theta": 10000,
85
+ "rope_type": "yarn",
86
+ "type": "yarn"
87
+ },
88
  "partial_rotary_factor": 0.125,
89
  "rope_theta": 10000.0,
90
+ "rope_type": "default"
 
91
  },
92
  "rope_theta": 10000.0,
93
  "routed_scaling_factor": 1.5,
 
97
  "sliding_window": 128,
98
  "swiglu_limit": 10.0,
99
  "tie_word_embeddings": false,
 
100
  "topk_method": "noaux_tc",
101
+ "transformers_version": "5.8.0",
102
  "use_cache": true,
 
103
  "vocab_size": 129280
104
  }
generation_config.json CHANGED
@@ -5,5 +5,5 @@
5
  "eos_token_id": 1,
6
  "temperature": 1.0,
7
  "top_p": 1.0,
8
- "transformers_version": "5.7.0.dev0"
9
  }
 
5
  "eos_token_id": 1,
6
  "temperature": 1.0,
7
  "top_p": 1.0,
8
+ "transformers_version": "5.8.0"
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4076740c031a3b6670e03573ec7f9ea4095e54c5dde682ac8e2eb629572cd39c
3
- size 37758460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c87b575cc8bfd39f7ded48954bb03fc673799196e342d1a49332a849a309d664
3
+ size 75882058