Upload ZayaForCausalLM

Files changed (11) hide show

config.json ADDED Viewed

+{
+  "activation_func": "swiglu",
+  "activation_func_fp8_input_store": false,
+  "add_bias_linear": false,
+  "architectures": [
+    "ZayaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bias_activation_fusion": true,
+  "bos_token_id": 2,
+  "cca": true,
+  "cca_num_q_heads": 8,
+  "dtype": "float32",
+  "eos_token_id": 106,
+  "ffn_hidden_size": 4096,
+  "gated_linear_unit": true,
+  "hidden_size": 2048,
+  "kv_channels": 128,
+  "lm_head_bias": false,
+  "mamba_cache_dtype": "float32",
+  "max_position_embeddings": 131072,
+  "model_type": "zaya",
+  "moe_router_topk": 1,
+  "norm_epsilon": 1e-05,
+  "normalization": "RMSNorm",
+  "num_attention_heads": 16,
+  "num_experts": 16,
+  "num_hidden_layers": 80,
+  "num_key_value_heads": 2,
+  "num_query_groups": 2,
+  "pad_token_id": 0,
+  "partial_rotary_factor": 0.5,
+  "residual_in_fp32": true,
+  "rope_scaling": false,
+  "rope_theta": 5000000,
+  "scale_residual_merge": true,
+  "sliding_window": null,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "vocab_size": 262272,
+  "zaya_mlp_expansion": 256,
+  "zaya_use_eda": true,
+  "zaya_use_mod": true
+}

generation_config.json ADDED Viewed

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.1"
+}

model-00001-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ad50424422025ebc1c5e92894b40b1e93fb0f860bb72c8fb9687a501ca405a4
+size 4966526688

model-00002-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:57db4bb23638465972e88f33cfc41b06ca2e91f2e8991cef4521a04d5e9a0340
+size 4982059552

model-00003-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:90976cf2a55ab79609b115f82f11f279052abcb72a2b831a1f3045ed992d74c7
+size 4982059728

model-00004-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8acd76fe82af6c5b72819c72ff971ea7446d526380497aff995c0bb1aea5406
+size 4982059728

model-00005-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:9aa5f30fc4dfa664da6b40ba602647d739bee99dae46c351cd0c23df6bb69159
+size 4982059728

model-00006-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:70f92b9c9624b96fdf4b2a6fe2d07141b7ac1e9de5364395c0656b11646db6c7
+size 4982059728

model-00007-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:11dc765ba1addf95bc98496c0869006ef27f4bc65d65edb9be6ec5aa0050e76e
+size 4982059728

model-00008-of-00008.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:24f3bff5171578f453b47f20780fd691f7f4aba8d2e442c94a5a1ed0003a7619
+size 503402336

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff