Upload ZayaForCausalLM
Browse files- config.json +45 -0
- generation_config.json +7 -0
- model-00001-of-00008.safetensors +3 -0
- model-00002-of-00008.safetensors +3 -0
- model-00003-of-00008.safetensors +3 -0
- model-00004-of-00008.safetensors +3 -0
- model-00005-of-00008.safetensors +3 -0
- model-00006-of-00008.safetensors +3 -0
- model-00007-of-00008.safetensors +3 -0
- model-00008-of-00008.safetensors +3 -0
- model.safetensors.index.json +0 -0
config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_func": "swiglu",
|
| 3 |
+
"activation_func_fp8_input_store": false,
|
| 4 |
+
"add_bias_linear": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"ZayaForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"bias_activation_fusion": true,
|
| 11 |
+
"bos_token_id": 2,
|
| 12 |
+
"cca": true,
|
| 13 |
+
"cca_num_q_heads": 8,
|
| 14 |
+
"dtype": "float32",
|
| 15 |
+
"eos_token_id": 106,
|
| 16 |
+
"ffn_hidden_size": 4096,
|
| 17 |
+
"gated_linear_unit": true,
|
| 18 |
+
"hidden_size": 2048,
|
| 19 |
+
"kv_channels": 128,
|
| 20 |
+
"lm_head_bias": false,
|
| 21 |
+
"mamba_cache_dtype": "float32",
|
| 22 |
+
"max_position_embeddings": 131072,
|
| 23 |
+
"model_type": "zaya",
|
| 24 |
+
"moe_router_topk": 1,
|
| 25 |
+
"norm_epsilon": 1e-05,
|
| 26 |
+
"normalization": "RMSNorm",
|
| 27 |
+
"num_attention_heads": 16,
|
| 28 |
+
"num_experts": 16,
|
| 29 |
+
"num_hidden_layers": 80,
|
| 30 |
+
"num_key_value_heads": 2,
|
| 31 |
+
"num_query_groups": 2,
|
| 32 |
+
"pad_token_id": 0,
|
| 33 |
+
"partial_rotary_factor": 0.5,
|
| 34 |
+
"residual_in_fp32": true,
|
| 35 |
+
"rope_scaling": false,
|
| 36 |
+
"rope_theta": 5000000,
|
| 37 |
+
"scale_residual_merge": true,
|
| 38 |
+
"sliding_window": null,
|
| 39 |
+
"transformers_version": "4.57.1",
|
| 40 |
+
"use_cache": true,
|
| 41 |
+
"vocab_size": 262272,
|
| 42 |
+
"zaya_mlp_expansion": 256,
|
| 43 |
+
"zaya_use_eda": true,
|
| 44 |
+
"zaya_use_mod": true
|
| 45 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 2,
|
| 4 |
+
"eos_token_id": 1,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.57.1"
|
| 7 |
+
}
|
model-00001-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ad50424422025ebc1c5e92894b40b1e93fb0f860bb72c8fb9687a501ca405a4
|
| 3 |
+
size 4966526688
|
model-00002-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57db4bb23638465972e88f33cfc41b06ca2e91f2e8991cef4521a04d5e9a0340
|
| 3 |
+
size 4982059552
|
model-00003-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90976cf2a55ab79609b115f82f11f279052abcb72a2b831a1f3045ed992d74c7
|
| 3 |
+
size 4982059728
|
model-00004-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8acd76fe82af6c5b72819c72ff971ea7446d526380497aff995c0bb1aea5406
|
| 3 |
+
size 4982059728
|
model-00005-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aa5f30fc4dfa664da6b40ba602647d739bee99dae46c351cd0c23df6bb69159
|
| 3 |
+
size 4982059728
|
model-00006-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70f92b9c9624b96fdf4b2a6fe2d07141b7ac1e9de5364395c0656b11646db6c7
|
| 3 |
+
size 4982059728
|
model-00007-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11dc765ba1addf95bc98496c0869006ef27f4bc65d65edb9be6ec5aa0050e76e
|
| 3 |
+
size 4982059728
|
model-00008-of-00008.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24f3bff5171578f453b47f20780fd691f7f4aba8d2e442c94a5a1ed0003a7619
|
| 3 |
+
size 503402336
|
model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|