ganeshnanduru committed on
Commit
6d96c78
·
verified ·
1 Parent(s): 3c60a0d

Upload ZayaForCausalLM

Browse files
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_func": "swiglu",
3
+ "activation_func_fp8_input_store": false,
4
+ "add_bias_linear": false,
5
+ "architectures": [
6
+ "ZayaForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "bias_activation_fusion": true,
11
+ "bos_token_id": 2,
12
+ "cca": true,
13
+ "cca_num_q_heads": 8,
14
+ "dtype": "float32",
15
+ "eos_token_id": 106,
16
+ "ffn_hidden_size": 4096,
17
+ "gated_linear_unit": true,
18
+ "hidden_size": 2048,
19
+ "kv_channels": 128,
20
+ "lm_head_bias": false,
21
+ "mamba_cache_dtype": "float32",
22
+ "max_position_embeddings": 131072,
23
+ "model_type": "zaya",
24
+ "moe_router_topk": 1,
25
+ "norm_epsilon": 1e-05,
26
+ "normalization": "RMSNorm",
27
+ "num_attention_heads": 16,
28
+ "num_experts": 16,
29
+ "num_hidden_layers": 80,
30
+ "num_key_value_heads": 2,
31
+ "num_query_groups": 2,
32
+ "pad_token_id": 0,
33
+ "partial_rotary_factor": 0.5,
34
+ "residual_in_fp32": true,
35
+ "rope_scaling": false,
36
+ "rope_theta": 5000000,
37
+ "scale_residual_merge": true,
38
+ "sliding_window": null,
39
+ "transformers_version": "4.57.1",
40
+ "use_cache": true,
41
+ "vocab_size": 262272,
42
+ "zaya_mlp_expansion": 256,
43
+ "zaya_use_eda": true,
44
+ "zaya_use_mod": true
45
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.1"
7
+ }
model-00001-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad50424422025ebc1c5e92894b40b1e93fb0f860bb72c8fb9687a501ca405a4
3
+ size 4966526688
model-00002-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57db4bb23638465972e88f33cfc41b06ca2e91f2e8991cef4521a04d5e9a0340
3
+ size 4982059552
model-00003-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90976cf2a55ab79609b115f82f11f279052abcb72a2b831a1f3045ed992d74c7
3
+ size 4982059728
model-00004-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8acd76fe82af6c5b72819c72ff971ea7446d526380497aff995c0bb1aea5406
3
+ size 4982059728
model-00005-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aa5f30fc4dfa664da6b40ba602647d739bee99dae46c351cd0c23df6bb69159
3
+ size 4982059728
model-00006-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70f92b9c9624b96fdf4b2a6fe2d07141b7ac1e9de5364395c0656b11646db6c7
3
+ size 4982059728
model-00007-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dc765ba1addf95bc98496c0869006ef27f4bc65d65edb9be6ec5aa0050e76e
3
+ size 4982059728
model-00008-of-00008.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24f3bff5171578f453b47f20780fd691f7f4aba8d2e442c94a5a1ed0003a7619
3
+ size 503402336
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff