kylesayrs committed
Commit 34c667c · verified · 1 Parent(s): 90ffbfe

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,142 @@
+ {
+   "architectures": [
+     "DeepseekV4ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "compress_ratios": [
+     0,
+     0,
+     4,
+     128,
+     4
+   ],
+   "compress_rope_theta": 160000.0,
+   "eos_token_id": 1,
+   "hc_eps": 1e-06,
+   "hc_mult": 4,
+   "hc_sinkhorn_iters": 20,
+   "head_dim": 512,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "index_head_dim": 128,
+   "index_n_heads": 64,
+   "index_topk": 512,
+   "initializer_range": 0.02,
+   "max_position_embeddings": 1048576,
+   "model_type": "deepseek_v4",
+   "moe_intermediate_size": 2048,
+   "n_routed_experts": 256,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 64,
+   "num_experts_per_tok": 6,
+   "num_hash_layers": 3,
+   "num_hidden_layers": 5,
+   "num_key_value_heads": 1,
+   "num_nextn_predict_layers": 0,
+   "o_groups": 8,
+   "o_lora_rank": 1024,
+   "q_lora_rank": 1024,
+   "qk_rope_head_dim": 64,
+   "quantization": {
+     "group_size": 32,
+     "bits": 4,
+     "mode": "mxfp4",
+     "model.layers.0.ffn.switch_mlp.gate_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.0.ffn.switch_mlp.up_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.0.ffn.switch_mlp.down_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.1.ffn.switch_mlp.gate_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.1.ffn.switch_mlp.up_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.1.ffn.switch_mlp.down_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.2.ffn.switch_mlp.gate_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.2.ffn.switch_mlp.up_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.2.ffn.switch_mlp.down_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.3.ffn.switch_mlp.gate_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.3.ffn.switch_mlp.up_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.3.ffn.switch_mlp.down_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.4.ffn.switch_mlp.gate_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.4.ffn.switch_mlp.up_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     },
+     "model.layers.4.ffn.switch_mlp.down_proj": {
+       "group_size": 32,
+       "bits": 4,
+       "mode": "mxfp4"
+     }
+   },
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "beta_fast": 32,
+     "beta_slow": 1,
+     "factor": 16,
+     "original_max_position_embeddings": 65536,
+     "type": "yarn"
+   },
+   "rope_theta": 10000.0,
+   "routed_scaling_factor": 1.5,
+   "scoring_func": "sqrtsoftplus",
+   "sliding_window": 128,
+   "swiglu_limit": 10.0,
+   "tie_word_embeddings": false,
+   "topk_method": "noaux_tc",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.57.1",
+   "use_cache": true,
+   "vocab_size": 129280
+ }
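
A minimal sketch of inspecting the config.json added above once the snapshot is downloaded; the local directory ./DeepSeek-V4-checkpoint is an assumption, and plain json is used rather than AutoConfig since the deepseek_v4 model type may not be registered in the installed transformers version.

import json

# Assumption: the repo snapshot has been downloaded to this local directory.
model_dir = "./DeepSeek-V4-checkpoint"

with open(f"{model_dir}/config.json") as f:
    cfg = json.load(f)

# A few of the fields recorded in this commit.
print(cfg["model_type"])            # deepseek_v4
print(cfg["num_hidden_layers"])     # 5
print(cfg["n_routed_experts"])      # 256
print(cfg["quantization"]["mode"])  # mxfp4 (4-bit, group_size 32)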
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 1,
+   "do_sample": true,
+   "temperature": 1.0,
+   "top_p": 1.0,
+   "transformers_version": "4.46.3"
+ }
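
A short sketch, assuming the same local snapshot directory, of reading these sampling defaults through transformers; GenerationConfig.from_pretrained accepts a local path containing generation_config.json.

from transformers import GenerationConfig

# Assumption: local snapshot directory as above.
gen_cfg = GenerationConfig.from_pretrained("./DeepSeek-V4-checkpoint")
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p)  # True 1.0 1.0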
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6aef629b91b2830c6cdcc53310a06ff535e1e37d3600339083fe9a43106b997e
+ size 67994148004
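
The entry above is a Git LFS pointer, not the weights themselves. A minimal sketch (local path assumed) for checking that a downloaded model.safetensors matches the recorded oid and size:

import hashlib
import os

path = "./DeepSeek-V4-checkpoint/model.safetensors"  # assumed local path
expected_oid = "6aef629b91b2830c6cdcc53310a06ff535e1e37d3600339083fe9a43106b997e"
expected_size = 67994148004

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch with LFS pointer"
assert h.hexdigest() == expected_oid, "sha256 mismatch with LFS pointer"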
model.safetensors.index.json ADDED
@@ -0,0 +1,157 @@
+ {
+   "metadata": {
+     "total_size": 67994131276
+   },
+   "weight_map": {
+     "lm_head.weight": "model.safetensors",
+     "model.embed_tokens.weight": "model.safetensors",
+     "model.hc_head.base": "model.safetensors",
+     "model.hc_head.fn": "model.safetensors",
+     "model.hc_head.scale": "model.safetensors",
+     "model.layers.0.attn.attn_sink": "model.safetensors",
+     "model.layers.0.attn.kv_norm.weight": "model.safetensors",
+     "model.layers.0.attn.q_norm.weight": "model.safetensors",
+     "model.layers.0.attn.wkv.weight": "model.safetensors",
+     "model.layers.0.attn.wo_a.weight": "model.safetensors",
+     "model.layers.0.attn.wo_b.weight": "model.safetensors",
+     "model.layers.0.attn.wq_a.weight": "model.safetensors",
+     "model.layers.0.attn.wq_b.weight": "model.safetensors",
+     "model.layers.0.attn_hc.base": "model.safetensors",
+     "model.layers.0.attn_hc.fn": "model.safetensors",
+     "model.layers.0.attn_hc.scale": "model.safetensors",
+     "model.layers.0.attn_norm.weight": "model.safetensors",
+     "model.layers.0.ffn.gate.tid2eid": "model.safetensors",
+     "model.layers.0.ffn.gate.weight": "model.safetensors",
+     "model.layers.0.ffn.shared_experts.down_proj.weight": "model.safetensors",
+     "model.layers.0.ffn.shared_experts.gate_proj.weight": "model.safetensors",
+     "model.layers.0.ffn.shared_experts.up_proj.weight": "model.safetensors",
+     "model.layers.0.ffn.switch_mlp.down_proj.weight": "model.safetensors",
+     "model.layers.0.ffn.switch_mlp.gate_proj.weight": "model.safetensors",
+     "model.layers.0.ffn.switch_mlp.up_proj.weight": "model.safetensors",
+     "model.layers.0.ffn_hc.base": "model.safetensors",
+     "model.layers.0.ffn_hc.fn": "model.safetensors",
+     "model.layers.0.ffn_hc.scale": "model.safetensors",
+     "model.layers.0.ffn_norm.weight": "model.safetensors",
+     "model.layers.1.attn.attn_sink": "model.safetensors",
+     "model.layers.1.attn.kv_norm.weight": "model.safetensors",
+     "model.layers.1.attn.q_norm.weight": "model.safetensors",
+     "model.layers.1.attn.wkv.weight": "model.safetensors",
+     "model.layers.1.attn.wo_a.weight": "model.safetensors",
+     "model.layers.1.attn.wo_b.weight": "model.safetensors",
+     "model.layers.1.attn.wq_a.weight": "model.safetensors",
+     "model.layers.1.attn.wq_b.weight": "model.safetensors",
+     "model.layers.1.attn_hc.base": "model.safetensors",
+     "model.layers.1.attn_hc.fn": "model.safetensors",
+     "model.layers.1.attn_hc.scale": "model.safetensors",
+     "model.layers.1.attn_norm.weight": "model.safetensors",
+     "model.layers.1.ffn.gate.tid2eid": "model.safetensors",
+     "model.layers.1.ffn.gate.weight": "model.safetensors",
+     "model.layers.1.ffn.shared_experts.down_proj.weight": "model.safetensors",
+     "model.layers.1.ffn.shared_experts.gate_proj.weight": "model.safetensors",
+     "model.layers.1.ffn.shared_experts.up_proj.weight": "model.safetensors",
+     "model.layers.1.ffn.switch_mlp.down_proj.weight": "model.safetensors",
+     "model.layers.1.ffn.switch_mlp.gate_proj.weight": "model.safetensors",
+     "model.layers.1.ffn.switch_mlp.up_proj.weight": "model.safetensors",
+     "model.layers.1.ffn_hc.base": "model.safetensors",
+     "model.layers.1.ffn_hc.fn": "model.safetensors",
+     "model.layers.1.ffn_hc.scale": "model.safetensors",
+     "model.layers.1.ffn_norm.weight": "model.safetensors",
+     "model.layers.2.attn.attn_sink": "model.safetensors",
+     "model.layers.2.attn.compressor.ape": "model.safetensors",
+     "model.layers.2.attn.compressor.norm.weight": "model.safetensors",
+     "model.layers.2.attn.compressor.wgate.weight": "model.safetensors",
+     "model.layers.2.attn.compressor.wkv.weight": "model.safetensors",
+     "model.layers.2.attn.indexer.compressor.ape": "model.safetensors",
+     "model.layers.2.attn.indexer.compressor.norm.weight": "model.safetensors",
+     "model.layers.2.attn.indexer.compressor.wgate.weight": "model.safetensors",
+     "model.layers.2.attn.indexer.compressor.wkv.weight": "model.safetensors",
+     "model.layers.2.attn.indexer.weights_proj.weight": "model.safetensors",
+     "model.layers.2.attn.indexer.wq_b.weight": "model.safetensors",
+     "model.layers.2.attn.kv_norm.weight": "model.safetensors",
+     "model.layers.2.attn.q_norm.weight": "model.safetensors",
+     "model.layers.2.attn.wkv.weight": "model.safetensors",
+     "model.layers.2.attn.wo_a.weight": "model.safetensors",
+     "model.layers.2.attn.wo_b.weight": "model.safetensors",
+     "model.layers.2.attn.wq_a.weight": "model.safetensors",
+     "model.layers.2.attn.wq_b.weight": "model.safetensors",
+     "model.layers.2.attn_hc.base": "model.safetensors",
+     "model.layers.2.attn_hc.fn": "model.safetensors",
+     "model.layers.2.attn_hc.scale": "model.safetensors",
+     "model.layers.2.attn_norm.weight": "model.safetensors",
+     "model.layers.2.ffn.gate.tid2eid": "model.safetensors",
+     "model.layers.2.ffn.gate.weight": "model.safetensors",
+     "model.layers.2.ffn.shared_experts.down_proj.weight": "model.safetensors",
+     "model.layers.2.ffn.shared_experts.gate_proj.weight": "model.safetensors",
+     "model.layers.2.ffn.shared_experts.up_proj.weight": "model.safetensors",
+     "model.layers.2.ffn.switch_mlp.down_proj.weight": "model.safetensors",
+     "model.layers.2.ffn.switch_mlp.gate_proj.weight": "model.safetensors",
+     "model.layers.2.ffn.switch_mlp.up_proj.weight": "model.safetensors",
+     "model.layers.2.ffn_hc.base": "model.safetensors",
+     "model.layers.2.ffn_hc.fn": "model.safetensors",
+     "model.layers.2.ffn_hc.scale": "model.safetensors",
+     "model.layers.2.ffn_norm.weight": "model.safetensors",
+     "model.layers.3.attn.attn_sink": "model.safetensors",
+     "model.layers.3.attn.compressor.ape": "model.safetensors",
+     "model.layers.3.attn.compressor.norm.weight": "model.safetensors",
+     "model.layers.3.attn.compressor.wgate.weight": "model.safetensors",
+     "model.layers.3.attn.compressor.wkv.weight": "model.safetensors",
+     "model.layers.3.attn.kv_norm.weight": "model.safetensors",
+     "model.layers.3.attn.q_norm.weight": "model.safetensors",
+     "model.layers.3.attn.wkv.weight": "model.safetensors",
+     "model.layers.3.attn.wo_a.weight": "model.safetensors",
+     "model.layers.3.attn.wo_b.weight": "model.safetensors",
+     "model.layers.3.attn.wq_a.weight": "model.safetensors",
+     "model.layers.3.attn.wq_b.weight": "model.safetensors",
+     "model.layers.3.attn_hc.base": "model.safetensors",
+     "model.layers.3.attn_hc.fn": "model.safetensors",
+     "model.layers.3.attn_hc.scale": "model.safetensors",
+     "model.layers.3.attn_norm.weight": "model.safetensors",
+     "model.layers.3.ffn.gate.e_score_correction_bias": "model.safetensors",
+     "model.layers.3.ffn.gate.weight": "model.safetensors",
+     "model.layers.3.ffn.shared_experts.down_proj.weight": "model.safetensors",
+     "model.layers.3.ffn.shared_experts.gate_proj.weight": "model.safetensors",
+     "model.layers.3.ffn.shared_experts.up_proj.weight": "model.safetensors",
+     "model.layers.3.ffn.switch_mlp.down_proj.weight": "model.safetensors",
+     "model.layers.3.ffn.switch_mlp.gate_proj.weight": "model.safetensors",
+     "model.layers.3.ffn.switch_mlp.up_proj.weight": "model.safetensors",
+     "model.layers.3.ffn_hc.base": "model.safetensors",
+     "model.layers.3.ffn_hc.fn": "model.safetensors",
+     "model.layers.3.ffn_hc.scale": "model.safetensors",
+     "model.layers.3.ffn_norm.weight": "model.safetensors",
+     "model.layers.4.attn.attn_sink": "model.safetensors",
+     "model.layers.4.attn.compressor.ape": "model.safetensors",
+     "model.layers.4.attn.compressor.norm.weight": "model.safetensors",
+     "model.layers.4.attn.compressor.wgate.weight": "model.safetensors",
+     "model.layers.4.attn.compressor.wkv.weight": "model.safetensors",
+     "model.layers.4.attn.indexer.compressor.ape": "model.safetensors",
+     "model.layers.4.attn.indexer.compressor.norm.weight": "model.safetensors",
+     "model.layers.4.attn.indexer.compressor.wgate.weight": "model.safetensors",
+     "model.layers.4.attn.indexer.compressor.wkv.weight": "model.safetensors",
+     "model.layers.4.attn.indexer.weights_proj.weight": "model.safetensors",
+     "model.layers.4.attn.indexer.wq_b.weight": "model.safetensors",
+     "model.layers.4.attn.kv_norm.weight": "model.safetensors",
+     "model.layers.4.attn.q_norm.weight": "model.safetensors",
+     "model.layers.4.attn.wkv.weight": "model.safetensors",
+     "model.layers.4.attn.wo_a.weight": "model.safetensors",
+     "model.layers.4.attn.wo_b.weight": "model.safetensors",
+     "model.layers.4.attn.wq_a.weight": "model.safetensors",
+     "model.layers.4.attn.wq_b.weight": "model.safetensors",
+     "model.layers.4.attn_hc.base": "model.safetensors",
+     "model.layers.4.attn_hc.fn": "model.safetensors",
+     "model.layers.4.attn_hc.scale": "model.safetensors",
+     "model.layers.4.attn_norm.weight": "model.safetensors",
+     "model.layers.4.ffn.gate.e_score_correction_bias": "model.safetensors",
+     "model.layers.4.ffn.gate.weight": "model.safetensors",
+     "model.layers.4.ffn.shared_experts.down_proj.weight": "model.safetensors",
+     "model.layers.4.ffn.shared_experts.gate_proj.weight": "model.safetensors",
+     "model.layers.4.ffn.shared_experts.up_proj.weight": "model.safetensors",
+     "model.layers.4.ffn.switch_mlp.down_proj.weight": "model.safetensors",
+     "model.layers.4.ffn.switch_mlp.gate_proj.weight": "model.safetensors",
+     "model.layers.4.ffn.switch_mlp.up_proj.weight": "model.safetensors",
+     "model.layers.4.ffn_hc.base": "model.safetensors",
+     "model.layers.4.ffn_hc.fn": "model.safetensors",
+     "model.layers.4.ffn_hc.scale": "model.safetensors",
+     "model.layers.4.ffn_norm.weight": "model.safetensors",
+     "model.norm.weight": "model.safetensors"
+   }
+ }
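
A sketch (local path assumed) of reading the index committed above; every tensor in this weight_map points at the single model.safetensors shard, so the counter below should report one file.

import json
from collections import Counter

with open("./DeepSeek-V4-checkpoint/model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])        # 67994131276 bytes of tensor data
print(len(index["weight_map"]))               # number of tensors in the map
print(Counter(index["weight_map"].values()))  # all entries -> model.safetensors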
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "backend": "tokenizers",
+   "bos_token": "<|begin▁of▁sentence|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|end▁of▁sentence|>",
+   "is_local": true,
+   "legacy": true,
+   "local_files_only": false,
+   "model_max_length": 1048576,
+   "pad_token": "<|end▁of▁sentence|>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "TokenizersBackend",
+   "unk_token": null
+ }
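
Since tokenizer_config.json declares the tokenizers backend, one way (a sketch, local path assumed) to exercise the uploaded tokenizer.json directly is through the tokenizers library rather than AutoTokenizer:

from tokenizers import Tokenizer

# Assumption: local snapshot directory containing the tokenizer.json added above.
tok = Tokenizer.from_file("./DeepSeek-V4-checkpoint/tokenizer.json")
ids = tok.encode("Hello, world!").ids
print(ids)             # token ids from the 129280-entry vocabulary
print(tok.decode(ids)) # round-trips back to the input text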