Instructions to use SparseLLM/DECO-1.2B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use SparseLLM/DECO-1.2B with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# NOTE: trust_remote_code=True allows Transformers to import and run the Python
# modules shipped in the model repository; review that code before enabling it.
pipe = pipeline("text-generation", model="SparseLLM/DECO-1.2B", trust_remote_code=True)
# Chat-style input: a list of message dicts, each with a "role" and a "content" key.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
# dtype="auto" is passed through to from_pretrained; presumably it selects the
# dtype recorded with the checkpoint — confirm against the Transformers docs.
model = AutoModelForCausalLM.from_pretrained("SparseLLM/DECO-1.2B", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use SparseLLM/DECO-1.2B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "SparseLLM/DECO-1.2B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SparseLLM/DECO-1.2B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/SparseLLM/DECO-1.2B

SGLang

How to use SparseLLM/DECO-1.2B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "SparseLLM/DECO-1.2B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SparseLLM/DECO-1.2B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "SparseLLM/DECO-1.2B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SparseLLM/DECO-1.2B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use SparseLLM/DECO-1.2B with Docker Model Runner:
```
docker model run hf.co/SparseLLM/DECO-1.2B
```

Raincleared committed about 22 hours ago

Commit

8563fb4

verified ·

1 Parent(s): 7758f0a

Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

c4_validation.json +0 -0
config.json +93 -0
configuration_blockffn.py +169 -0
evaluation.log +0 -0
evaluation/results__hf_ckpts__blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512__/results_2026-03-09T14-06-28.603901.json +609 -0
generation_config.json +8 -0
modeling_blockffn.py +1040 -0
pytorch_model.bin +3 -0
special_tokens_map.json +81 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +116 -0

c4_validation.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,93 @@

+{
+    "architectures": [
+        "BlockFFNForCausalLM"
+    ],
+    "auto_map": {
+        "AutoConfig": "configuration_blockffn.BlockFFNConfig",
+        "AutoModel": "modeling_blockffn.BlockFFNModel",
+        "AutoModelForCausalLM": "modeling_blockffn.BlockFFNForCausalLM"
+    },
+    "bos_token_id": 1,
+    "eos_token_id": [
+        2,
+        73440
+    ],
+    "pad_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 1792,
+    "initializer_range": 0.1,
+    "intermediate_size": 10240,
+    "head_dim": 128,
+    "max_position_embeddings": 4096,
+    "model_type": "blockffn",
+    "num_attention_heads": 14,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 10000.0,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.36.0",
+    "use_cache": true,
+    "vocab_size": 73448,
+    "use_mup": false,
+    "num_experts": 102,
+    "moe_ffn_hidden_size": 64,
+    "moe_shared_expert_intermediate_size": 128,
+    "moe_layer_freq": [
+        0,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1
+    ],
+    "moe_router_dtype": "fp32",
+    "router_act_func": "relu",
+    "router_norm_type": "simple",
+    "expert_act_func": "norm_silu",
+    "expert_act_norm_type": "normal",
+    "num_layers": 32,
+    "ffn_hidden_size": 4480,
+    "num_query_groups": 14,
+    "norm_epsilon": 1e-05,
+    "router_norm_fixed": false,
+    "router_norm_scalar": false,
+    "router_norm_init_var": 0.1,
+    "moe_expert_bias_apply_method": "rms",
+    "use_blockffn": true,
+    "router_type": "topk",
+    "moe_router_enable_expert_bias": false,
+    "expert_not_gated": true,
+    "moe_router_pre_softmax": false,
+    "moe_router_topk": 2,
+    "moe_router_topp": 0.5,
+    "moe_router_score_function": "softmax",
+    "moe_router_topk_scaling_factor": null
+}

configuration_blockffn.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BlockFFN model configuration"""
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+class BlockFFNConfig(PretrainedConfig):  # Configuration object storing the BlockFFN hyperparameters accepted by __init__.
+    model_type = "blockffn"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `BlockFFNModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {  # per-submodule (input names, output names); presumably the pipeline-parallel plan - confirm
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(  # stores each keyword as a same-named attribute; token ids, tie_word_embeddings and kwargs go to PretrainedConfig
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        ffn_hidden_size=11008,
+        num_layers=32,
+        num_attention_heads=32,
+        num_query_groups=None,  # None falls back to num_attention_heads
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        norm_epsilon=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,  # optional dict; a legacy "type" key is mirrored into "rope_type" below
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        head_dim=None,  # None falls back to hidden_size // num_attention_heads
+        use_mup=True,  # together with mup_base_hidden_size, gates the mup_width_scale property
+        mup_emb_scale=12,
+        mup_depth_scale=1.4,
+        mup_base_hidden_size=256,
+        num_experts=180,
+        moe_ffn_hidden_size=128,
+        moe_shared_expert_intermediate_size=128,
+        moe_layer_freq="([0]*3+[1]*29)",  # a str or list is stored unchanged; any other type becomes [0] * num_layers
+        moe_router_dtype="fp32",
+        router_act_func="relu",
+        router_norm_type="simple",
+        router_norm_fixed=False,
+        router_norm_scalar=False,
+        router_norm_init_var=0.1,
+        expert_act_func="norm_silu",
+        expert_act_norm_type="normal",
+        use_blockffn=False,
+        router_type="topk",
+        moe_router_topk=0,
+        moe_router_topp=0,
+        moe_router_enable_expert_bias=False,
+        moe_expert_bias_apply_method="base",
+        moe_router_score_function="sigmoid",
+        moe_router_topk_scaling_factor=2.5,
+        expert_not_gated=False,
+        moe_router_pre_softmax=False,
+        **kwargs,  # forwarded unchanged to PretrainedConfig.__init__
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility: an unset num_query_groups defaults to num_attention_heads
+        if num_query_groups is None:
+            num_query_groups = num_attention_heads
+        self.num_query_groups = num_query_groups
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_epsilon = norm_epsilon
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.use_mup = use_mup
+        self.mup_emb_scale = mup_emb_scale
+        self.mup_depth_scale = mup_depth_scale
+        self.mup_base_hidden_size = mup_base_hidden_size
+        self.num_experts = num_experts
+        self.moe_ffn_hidden_size = moe_ffn_hidden_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+        self.moe_layer_freq = moe_layer_freq if isinstance(moe_layer_freq, (str, list)) else ([0] * num_layers)  # string form is kept as-is, not parsed here
+        self.moe_router_dtype = moe_router_dtype
+        self.router_act_func = router_act_func
+        self.router_norm_type = router_norm_type
+        self.router_norm_fixed = router_norm_fixed
+        self.router_norm_scalar = router_norm_scalar
+        self.router_norm_init_var = router_norm_init_var
+        self.expert_act_func = expert_act_func
+        self.expert_act_norm_type = expert_act_norm_type
+        self.use_blockffn = use_blockffn
+        self.router_type = router_type
+        self.moe_router_topk = moe_router_topk
+        self.moe_router_topp = moe_router_topp
+        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+        self.moe_expert_bias_apply_method = moe_expert_bias_apply_method
+        self.moe_router_score_function = moe_router_score_function
+        self.moe_router_topk_scaling_factor = moe_router_topk_scaling_factor
+        self.expert_not_gated = expert_not_gated
+        self.moe_router_pre_softmax = moe_router_pre_softmax
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]  # writes into the dict object passed by the caller
+        rope_config_validation(self)  # called after the rope_* attributes are set and before super().__init__()
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    @property
+    def mup_width_scale(self):  # hidden_size / mup_base_hidden_size when use_mup is truthy and the base size is > 0, otherwise 1
+        return (self.hidden_size / self.mup_base_hidden_size) if (self.use_mup and self.mup_base_hidden_size > 0) else 1
+__all__ = ["BlockFFNConfig"]

evaluation.log ADDED Viewed

The diff for this file is too large to render. See raw diff

evaluation/results__hf_ckpts__blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512__/results_2026-03-09T14-06-28.603901.json ADDED Viewed

	@@ -0,0 +1,609 @@

+{
+  "results": {
+    "arc_challenge": {
+      "alias": "arc_challenge",
+      "acc,none": 0.29692832764505117,
+      "acc_stderr,none": 0.013352025976725228,
+      "acc_norm,none": 0.30802047781569963,
+      "acc_norm_stderr,none": 0.01349142951729204
+    },
+    "arc_easy": {
+      "alias": "arc_easy",
+      "acc,none": 0.6296296296296297,
+      "acc_stderr,none": 0.009908978578665757,
+      "acc_norm,none": 0.577020202020202,
+      "acc_norm_stderr,none": 0.010137328382209107
+    },
+    "boolq": {
+      "alias": "boolq",
+      "acc,none": 0.6256880733944954,
+      "acc_stderr,none": 0.008464246656443236
+    },
+    "hellaswag": {
+      "alias": "hellaswag",
+      "acc,none": 0.37422824138617805,
+      "acc_stderr,none": 0.004829339926388328,
+      "acc_norm,none": 0.4738099980083649,
+      "acc_norm_stderr,none": 0.004982931565945946
+    },
+    "lambada_openai": {
+      "alias": "lambada_openai",
+      "perplexity,none": 16.99295013502752,
+      "perplexity_stderr,none": 0.5693576969205276,
+      "acc,none": 0.443042887638269,
+      "acc_stderr,none": 0.0069206332420079265
+    },
+    "lambada_standard": {
+      "alias": "lambada_standard",
+      "perplexity,none": 27.533257711233524,
+      "perplexity_stderr,none": 0.9519324571988527,
+      "acc,none": 0.35532699398408696,
+      "acc_stderr,none": 0.006668004996315348
+    },
+    "piqa": {
+      "alias": "piqa",
+      "acc,none": 0.7023939064200218,
+      "acc_stderr,none": 0.01066735379238821,
+      "acc_norm,none": 0.6980413492927094,
+      "acc_norm_stderr,none": 0.010711732891588352
+    },
+    "social_iqa": {
+      "alias": "social_iqa",
+      "acc,none": 0.40890481064483114,
+      "acc_stderr,none": 0.011124710055682845
+    },
+    "wikitext": {
+      "alias": "wikitext",
+      "word_perplexity,none": 21.603753499994546,
+      "word_perplexity_stderr,none": "N/A",
+      "byte_perplexity,none": 1.7764935452161108,
+      "byte_perplexity_stderr,none": "N/A",
+      "bits_per_byte,none": 0.8290324467318287,
+      "bits_per_byte_stderr,none": "N/A"
+    },
+    "winogrande": {
+      "alias": "winogrande",
+      "acc,none": 0.5493291239147593,
+      "acc_stderr,none": 0.013983928869040235
+    }
+  },
+  "group_subtasks": {
+    "arc_challenge": [],
+    "arc_easy": [],
+    "boolq": [],
+    "hellaswag": [],
+    "lambada_openai": [],
+    "lambada_standard": [],
+    "piqa": [],
+    "social_iqa": [],
+    "wikitext": [],
+    "winogrande": []
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "boolq": {
+      "task": "boolq",
+      "tag": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "tag": [
+        "multiple_choice"
+      ],
+      "dataset_path": "Rowan/hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
+      "doc_to_text": "{{query}}",
+      "doc_to_target": "{{label}}",
+      "unsafe_code": false,
+      "doc_to_choice": "choices",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_standard": {
+      "task": "lambada_standard",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "lambada",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "piqa": {
+      "task": "piqa",
+      "dataset_path": "baber/piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "social_iqa": {
+      "task": "social_iqa",
+      "dataset_path": "social_i_qa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Q: {{context}} {{question}}\nA:",
+      "doc_to_target": "{{ (label|int) - 1 }}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[answerA, answerB, answerC]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 0.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "def wikitext_detokenizer(doc):\n    string = doc[\"page\"]\n    # contractions\n    string = string.replace(\"s '\", \"s'\")\n    string = re.sub(r\"/' [0-9]/\", r\"/'[0-9]/\", string)\n    # number separators\n    string = string.replace(\" @-@ \", \"-\")\n    string = string.replace(\" @,@ \", \",\")\n    string = string.replace(\" @.@ \", \".\")\n    # punctuation\n    string = string.replace(\" : \", \": \")\n    string = string.replace(\" ; \", \"; \")\n    string = string.replace(\" . \", \". \")\n    string = string.replace(\" ! \", \"! \")\n    string = string.replace(\" ? \", \"? \")\n    string = string.replace(\" , \", \", \")\n    # double brackets\n    string = re.sub(r\"\\(\\s*([^\\)]*?)\\s*\\)\", r\"(\\1)\", string)\n    string = re.sub(r\"\\[\\s*([^\\]]*?)\\s*\\]\", r\"[\\1]\", string)\n    string = re.sub(r\"{\\s*([^}]*?)\\s*}\", r\"{\\1}\", string)\n    string = re.sub(r\"\\\"\\s*([^\\\"]*?)\\s*\\\"\", r'\"\\1\"', string)\n    string = re.sub(r\"'\\s*([^']*?)\\s*'\", r\"'\\1'\", string)\n    # miscellaneous\n    string = string.replace(\"= = = =\", \"====\")\n    string = string.replace(\"= = =\", \"===\")\n    string = string.replace(\"= =\", \"==\")\n    string = string.replace(\" \" + chr(176) + \" \", chr(176))\n    string = string.replace(\" \\n\", \"\\n\")\n    string = string.replace(\"\\n \", \"\\n\")\n    string = string.replace(\" N \", \" 1 \")\n    string = string.replace(\" 's\", \"'s\")\n\n    return string\n",
+      "unsafe_code": false,
+      "process_results": "def process_results(doc, results):\n    (loglikelihood,) = results\n    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*\n    _words = len(re.split(r\"\\s+\", doc[\"page\"]))\n    _bytes = len(doc[\"page\"].encode(\"utf-8\"))\n    return {\n        \"word_perplexity\": (loglikelihood, _words),\n        \"byte_perplexity\": (loglikelihood, _bytes),\n        \"bits_per_byte\": (loglikelihood, _bytes),\n    }\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
+      "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
+      "unsafe_code": false,
+      "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    }
+  },
+  "versions": {
+    "arc_challenge": 1.0,
+    "arc_easy": 1.0,
+    "boolq": 2.0,
+    "hellaswag": 1.0,
+    "lambada_openai": 1.0,
+    "lambada_standard": 1.0,
+    "piqa": 1.0,
+    "social_iqa": 0.0,
+    "wikitext": 2.0,
+    "winogrande": 1.0
+  },
+  "n-shot": {
+    "arc_challenge": 0,
+    "arc_easy": 0,
+    "boolq": 0,
+    "hellaswag": 0,
+    "lambada_openai": 0,
+    "lambada_standard": 0,
+    "piqa": 0,
+    "social_iqa": 0,
+    "wikitext": 0,
+    "winogrande": 0
+  },
+  "higher_is_better": {
+    "arc_challenge": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "arc_easy": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "boolq": {
+      "acc": true
+    },
+    "hellaswag": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "lambada_openai": {
+      "perplexity": false,
+      "acc": true
+    },
+    "lambada_standard": {
+      "perplexity": false,
+      "acc": true
+    },
+    "piqa": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "social_iqa": {
+      "acc": true
+    },
+    "wikitext": {
+      "word_perplexity": false,
+      "byte_perplexity": false,
+      "bits_per_byte": false
+    },
+    "winogrande": {
+      "acc": true
+    }
+  },
+  "n-samples": {
+    "winogrande": {
+      "original": 1267,
+      "effective": 1267
+    },
+    "wikitext": {
+      "original": 62,
+      "effective": 62
+    },
+    "social_iqa": {
+      "original": 1954,
+      "effective": 1954
+    },
+    "piqa": {
+      "original": 1838,
+      "effective": 1838
+    },
+    "lambada_standard": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "lambada_openai": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "hellaswag": {
+      "original": 10042,
+      "effective": 10042
+    },
+    "boolq": {
+      "original": 3270,
+      "effective": 3270
+    },
+    "arc_easy": {
+      "original": 2376,
+      "effective": 2376
+    },
+    "arc_challenge": {
+      "original": 1172,
+      "effective": 1172
+    }
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/,dtype=bfloat16,trust_remote_code=True,trust_remote_code=True",
+    "model_num_parameters": 1450770714,
+    "model_dtype": "torch.bfloat16",
+    "model_revision": "main",
+    "model_sha": "",
+    "batch_size": "8",
+    "batch_sizes": [],
+    "device": "cuda:0",
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "gen_kwargs": null,
+    "random_seed": 0,
+    "numpy_seed": 1234,
+    "torch_seed": 1234,
+    "fewshot_seed": 1234
+  },
+  "git_hash": "core_v0.12.0-135-g7f33517",
+  "date": 1773035718.973999,
+  "pretty_env_info": "PyTorch version: 2.6.0+cu124\nIs debug build: False\nCUDA used to build PyTorch: 12.4\nROCM used to build PyTorch: N/A\n\nOS: CentOS Linux 7 (Core) (x86_64)\nGCC version: (conda-forge gcc 9.5.0-19) 9.5.0\nClang version: Could not collect\nCMake version: version 3.30.1\nLibc version: glibc-2.17\n\nPython version: 3.10.14 (main, May  6 2024, 19:42:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-3.10.0-1160.el7.x86_64-x86_64-with-glibc2.17\nIs CUDA available: True\nCUDA runtime version: 12.4.131\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A800-SXM4-80GB\nGPU 1: NVIDIA A800-SXM4-80GB\nGPU 2: NVIDIA A800-SXM4-80GB\nGPU 3: NVIDIA A800-SXM4-80GB\nGPU 4: NVIDIA A800-SXM4-80GB\nGPU 5: NVIDIA A800-SXM4-80GB\nGPU 6: NVIDIA A800-SXM4-80GB\nGPU 7: NVIDIA A800-SXM4-80GB\n\nNvidia driver version: 535.183.06\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:          x86_64\nCPU op-mode(s):        32-bit, 64-bit\nByte Order:            Little Endian\nCPU(s):                104\nOn-line CPU(s) list:   0-103\nThread(s) per core:    1\nCore(s) per socket:    52\nSocket(s):             2\nNUMA node(s):          2\nVendor ID:             GenuineIntel\nCPU family:            6\nModel:                 143\nModel name:            Intel(R) Xeon(R) Platinum 8470\nStepping:              8\nCPU MHz:               800.048\nCPU max MHz:           3800.0000\nCPU min MHz:           800.0000\nBogoMIPS:              4000.00\nVirtualization:        VT-x\nL1d cache:             48K\nL1i cache:             32K\nL2 cache:              2048K\nL3 cache:              107520K\nNUMA node0 CPU(s):     0-51\nNUMA node1 CPU(s):     52-103\nFlags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts 
rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_pt cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq cldemote movdiri movdir64b md_clear pconfig spec_ctrl intel_stibp flush_l1d arch_capabilities\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] nvidia-cublas-cu12==12.4.5.8\n[pip3] nvidia-cuda-cupti-cu12==12.4.127\n[pip3] nvidia-cuda-nvrtc-cu12==12.4.127\n[pip3] nvidia-cuda-runtime-cu12==12.4.127\n[pip3] nvidia-cudnn-cu12==9.1.0.70\n[pip3] nvidia-cufft-cu12==11.2.1.3\n[pip3] nvidia-curand-cu12==10.3.5.147\n[pip3] nvidia-cusolver-cu12==11.6.1.9\n[pip3] nvidia-cusparse-cu12==12.3.1.170\n[pip3] nvidia-cusparselt-cu12==0.6.2\n[pip3] nvidia-nccl-cu11==2.21.5\n[pip3] nvidia-nccl-cu12==2.21.5\n[pip3] nvidia-nvjitlink-cu12==12.4.127\n[pip3] nvidia-nvtx-cu12==12.4.127\n[pip3] torch==2.6.0\n[pip3] torchaudio==2.6.0\n[pip3] torchdata==0.11.0\n[pip3] torchvision==0.21.0\n[pip3] triton==3.2.0\n[conda] cuda-cudart               12.4.99              hd3aeb46_0    conda-forge\n[conda] cuda-cudart_linux-64      12.4.99              h59595ed_0    conda-forge\n[conda] cuda-cupti                12.4.127             he02047a_2    conda-forge\n[conda] cuda-libraries            12.4.0               ha770c72_0    conda-forge\n[conda] cuda-nvrtc                12.4.99              hd3aeb46_0    
conda-forge\n[conda] cuda-nvtx                 12.4.127             he02047a_2    conda-forge\n[conda] cuda-opencl               12.4.99              h59595ed_0    conda-forge\n[conda] cuda-runtime              12.4.0               ha804496_0    conda-forge\n[conda] ffmpeg                    4.3                  hf484d3e_0    pytorch\n[conda] libcublas                 12.4.2.65            hd3aeb46_0    conda-forge\n[conda] libcufft                  11.2.0.44            hd3aeb46_0    conda-forge\n[conda] libcurand                 10.3.5.119           hd3aeb46_0    conda-forge\n[conda] libcusolver               11.6.0.99            hd3aeb46_0    conda-forge\n[conda] libcusparse               12.3.0.142           hd3aeb46_0    conda-forge\n[conda] libjpeg-turbo             2.0.0                h9bf148f_0    pytorch\n[conda] libnvjitlink              12.4.99              hd3aeb46_0    conda-forge\n[conda] mkl                       2023.1.0         h213fc3f_46344    defaults\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] nvidia-cublas-cu12        12.4.5.8                 pypi_0    pypi\n[conda] nvidia-cuda-cupti-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-nvrtc-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-runtime-cu12  12.4.127                 pypi_0    pypi\n[conda] nvidia-cudnn-cu12         9.1.0.70                 pypi_0    pypi\n[conda] nvidia-cufft-cu12         11.2.1.3                 pypi_0    pypi\n[conda] nvidia-curand-cu12        10.3.5.147               pypi_0    pypi\n[conda] nvidia-cusolver-cu12      11.6.1.9                 pypi_0    pypi\n[conda] nvidia-cusparse-cu12      12.3.1.170               pypi_0    pypi\n[conda] nvidia-cusparselt-cu12    0.6.2                    pypi_0    pypi\n[conda] nvidia-nccl-cu11          2.21.5                   pypi_0    pypi\n[conda] nvidia-nccl-cu12          2.21.5                   pypi_0    pypi\n[conda] nvidia-nvjitlink-cu12     
12.4.127                 pypi_0    pypi\n[conda] nvidia-nvtx-cu12          12.4.127                 pypi_0    pypi\n[conda] pytorch-cuda              12.4                 hc786d27_6    pytorch\n[conda] pytorch-mutex             1.0                        cuda    pytorch\n[conda] torch                     2.6.0                    pypi_0    pypi\n[conda] torchaudio                2.6.0                    pypi_0    pypi\n[conda] torchdata                 0.11.0                   pypi_0    pypi\n[conda] torchvision               0.21.0                   pypi_0    pypi\n[conda] triton                    3.2.0                    pypi_0    pypi",
+  "transformers_version": "4.55.2",
+  "lm_eval_version": "0.4.9.1",
+  "upper_git_hash": null,
+  "tokenizer_pad_token": [
+    "<unk>",
+    "0"
+  ],
+  "tokenizer_eos_token": [
+    "<|im_end|>",
+    "73440"
+  ],
+  "tokenizer_bos_token": [
+    "<s>",
+    "1"
+  ],
+  "eot_token_id": 73440,
+  "max_length": 4096,
+  "task_hashes": {},
+  "model_source": "hf",
+  "model_name": "results/hf_ckpts/blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512/",
+  "model_name_sanitized": "results__hf_ckpts__blockffn_12b_mul1001_withmean_d64_s128_lr654e4_b512__",
+  "system_instruction": null,
+  "system_instruction_sha": null,
+  "fewshot_as_multiturn": false,
+  "chat_template": null,
+  "chat_template_sha": null,
+  "start_time": 11591492.163990611,
+  "end_time": 11592289.97671808,
+  "total_evaluation_time_seconds": "797.8127274680883"
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "do_sample": true,
+    "top_p": 0.8,
+    "temperature": 0.8,
+    "bos_token_id": 1,
+    "eos_token_id": [2,73440],
+    "pad_token_id": 2
+}

modeling_blockffn.py ADDED Viewed

	@@ -0,0 +1,1040 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+import math
+import torch
+from torch import nn
+import tree
+from abc import ABC, abstractmethod
+from fmoe.linear import MOELinear
+from fmoe.functions import prepare_forward, MOEScatter, MOEGather
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from transformers.utils.generic import check_model_inputs
+from .configuration_blockffn import BlockFFNConfig
+logger = logging.get_logger(__name__)
+@use_kernel_forward_from_hub("RMSNorm")
+class BlockFFNRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class BlockFFNRotaryEmbedding(nn.Module):
+    """Compute the cos/sin tables of a rotary position embedding.
+    The inverse frequencies and the attention scaling factor are produced by the
+    `ROPE_INIT_FUNCTIONS` entry selected by the config's RoPE type.
+    """
+    def __init__(self, config: BlockFFNConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            # No usable `rope_scaling` dict on the config: use the "default" init function.
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent buffer: follows the module across devices but is not saved in the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Keeps a handle on the initial frequencies.
+        # NOTE(review): presumably read back by `dynamic_rope_update` -- confirm against transformers.
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.
+        `x` is only used for its device and dtype. The indexing below treats `inv_freq` as 1-D
+        and `position_ids` as 2-D (first axis = batch).
+        """
+        # (batch, num_freqs, 1) @ (batch, 1, seq_len) -> (batch, num_freqs, seq_len)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # "mps" (and any non-string device type) is mapped to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the frequencies so the last axis is twice the number of frequencies.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class SimpleLayerNorm(nn.Module):
+    def __init__(self, dim_norm: int, fixed: bool = False, init_var: float = 1.0):
+        super().__init__()
+        self.dim_norm = dim_norm
+        self.fixed = fixed
+        if self.fixed:
+            self.weight = init_var
+        else:
+            self.weight = torch.nn.Parameter(torch.full((self.dim_norm,), init_var))
+    @torch.compile
+    def forward(self, x: torch.Tensor):
+        return  x * self.weight
+class BlockFFNMLP(nn.Module):
+    def __init__(self, config: BlockFFNConfig, intermediate_size: int = None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.ffn_hidden_size if intermediate_size is None else intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class BlockFFNRouter(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.config.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def forward(self, x: torch.Tensor):
+        return nn.functional.linear(x.to(self.router_dtype), self.weight)
+class NormSiLU(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.num_blocks, self.block_size = config.num_experts, config.moe_ffn_hidden_size
+        self.activate_fn_type = config.expert_act_func
+        assert self.activate_fn_type in ["norm_silu", "norm_silu_norms", "norm_silu_nomean", "silu"]
+        self.rms_norm = None
+        if self.activate_fn_type not in ["norm_silu_norms", "silu"]:
+            self.rms_norm = BlockFFNRMSNorm(config.moe_ffn_hidden_size, eps=config.norm_epsilon)
+        self.silu = torch.nn.SiLU()
+    @torch.compile
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        assert hidden.ndim == 2
+        if self.activate_fn_type not in ["norm_silu_nomean", "silu"]:
+            hidden = hidden - torch.mean(hidden, dim=-1, keepdim=True)
+        if self.activate_fn_type not in ["norm_silu_norms", "silu"]:
+            return self.silu(self.rms_norm(hidden.view(hidden.shape[0], self.num_blocks, self.block_size)))
+        else:
+            return self.silu(hidden)
+class BlockFFNLayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(BlockFFNLayer, self).__init__()
+        self.config = config
+        self.num_experts, self.dim_expert, self.hidden_size = \
+            config.num_experts, config.moe_ffn_hidden_size, config.hidden_size
+        self.dim_shared_expert = config.moe_shared_expert_intermediate_size
+        self.router_norm_type = config.router_norm_type
+        self.moe_router = BlockFFNRouter(self.config)
+        assert config.router_act_func == "relu"
+        self.router_act = nn.ReLU()
+        if config.router_norm_type == "simple":
+            self.router_norm = SimpleLayerNorm(
+                dim_norm=(1 if self.config.router_norm_scalar else config.num_experts),
+                fixed=config.router_norm_fixed,
+                init_var=config.router_norm_init_var,
+            )
+        elif config.router_norm_type == "rms":
+            self.router_norm = BlockFFNRMSNorm(self.config.num_experts, eps=config.norm_epsilon)
+        else:
+            raise NotImplementedError
+        self.expert_gated = not config.expert_not_gated
+        if self.expert_gated:
+            self.expert_gate_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        self.expert_up_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        assert config.expert_act_norm_type == "normal"
+        self.expert_act = NormSiLU(self.config)
+        self.expert_down_proj = nn.Linear(self.num_experts * self.dim_expert, self.hidden_size, bias=config.mlp_bias)
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+        self.enable_expert_bias = config.moe_router_enable_expert_bias
+        if self.enable_expert_bias:
+            self.expert_bias = torch.nn.Parameter(torch.zeros(self.num_experts, dtype=torch.float32))
+        self.expert_bias_apply_method = config.moe_expert_bias_apply_method
+    def apply_expert_bias(self, router_scores: torch.Tensor) -> torch.Tensor:
+        if self.expert_bias_apply_method == "base":
+            scores_for_routing = router_scores + self.expert_bias
+        elif self.expert_bias_apply_method == "rms":
+            variance = router_scores.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+            scores_for_routing = router_scores + self.expert_bias.unsqueeze(0) * torch.sqrt(variance)
+        else:
+            raise NotImplementedError(f"invalid apply method: {self.expert_bias_apply_method}")
+        return scores_for_routing
+    def forward(self, hidden_states: torch.Tensor):
+        ori_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        seq_len = hidden_states.shape[0]
+        # router module forward
+        raw_router_score = self.moe_router(hidden_states)  # [seq_len, num_experts]
+        if self.enable_expert_bias:
+            scores_for_routing = self.apply_expert_bias(raw_router_score)
+            router_score = self.router_act(raw_router_score) * torch.gt(scores_for_routing, 0).type_as(raw_router_score)
+        else:
+            router_score = self.router_act(raw_router_score)
+        router_score = self.router_norm(router_score)
+        # expert module forward
+        x_in = self.expert_up_proj(hidden_states)  # [seq_len, num_experts * dim_expert]
+        if self.expert_gated:
+            x_gate = self.expert_gate_proj(hidden_states)
+            x_gate = self.expert_act(x_gate)
+            if x_gate.ndim == 3:
+                x_in = x_in.view(seq_len, self.num_experts, self.dim_expert)
+            x_in = x_in * x_gate
+        else:
+            x_in = self.expert_act(x_in)
+        if x_in.ndim == 3:
+            scored_x_in = x_in * router_score.type_as(hidden_states).unsqueeze(-1)
+        else:
+            scored_x_in = x_in.view(seq_len, self.num_experts, self.dim_expert) * router_score.type_as(hidden_states).unsqueeze(-1)
+        output = self.expert_down_proj(scored_x_in.view(seq_len, self.num_experts * self.dim_expert))
+        if self.use_shared_expert:
+            output = output + self.shared_experts(hidden_states)
+        return output.view(*ori_shape)
+class BaseRouter(ABC, nn.Module):
+    """Base Router class"""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def gating(self, input: torch.Tensor):
+        return torch.nn.functional.linear(input.to(self.router_dtype), self.weight.to(self.router_dtype))
+    @abstractmethod
+    def routing(self, logits: torch.Tensor):
+        """Routing function.
+        Args:
+            logits (torch.Tensor): Logits tensor.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing token assignment
+            probabilities and mapping.
+        """
+        raise NotImplementedError("Routing function not implemented.")
+    @abstractmethod
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        raise NotImplementedError("Forward function not implemented.")
+class TopKRouter(BaseRouter):
+    """Route each token to the top-k experts."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.topk = self.config.moe_router_topk
+        self.score_function = self.config.moe_router_score_function
+        self.use_pre_softmax = self.config.moe_router_pre_softmax
+        self.scaling_factor = self.config.moe_router_topk_scaling_factor
+        self.enable_expert_bias = self.config.moe_router_enable_expert_bias
+        if self.enable_expert_bias:
+            self.expert_bias = torch.nn.Parameter(torch.zeros(self.num_experts, dtype=torch.float32))
+        else:
+            self.expert_bias = None
+    def _maintain_float32_expert_bias(self):
+        """
+        Maintain the expert bias in float32.
+        When using bf16/fp16, the expert bias gets converted to lower precision in Float16Module.
+        We keep it in float32 to avoid routing errors when updating the expert_bias.
+        """
+        if hasattr(self, 'expert_bias') and self.expert_bias is not None:
+            if self.expert_bias.dtype != torch.float32:
+                self.expert_bias.data = self.expert_bias.data.to(torch.float32)
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating.
+        Returns:
+            probs (torch.Tensor): The probabilities of token to experts assignment.
+            routing_map (torch.Tensor): The mapping of token to experts assignment,
+                with shape [num_tokens, num_experts].
+        """
+        logits = logits.view(-1, self.num_experts)
+        if self.score_function == "softmax":
+            if self.use_pre_softmax:
+                scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+                probs, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            else:
+                scores, top_indices = torch.topk(logits, k=self.topk, dim=1)
+                probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
+        elif self.score_function == "sigmoid":
+            scores = torch.sigmoid(logits.float()).type_as(logits)
+            if self.expert_bias is not None:
+                scores_for_routing = scores + self.expert_bias
+                _, top_indices = torch.topk(scores_for_routing, k=self.topk, dim=1)
+                scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
+            else:
+                scores, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if self.topk > 1 else scores
+        else:
+            raise ValueError(f"Invalid score_function: {self.score_function}")
+        if self.scaling_factor:
+            probs = probs * self.scaling_factor
+        return probs, top_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        self._maintain_float32_expert_bias()
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class ReMoERouter(BaseRouter):
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.router_act = torch.nn.ReLU()
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating.
+        Returns:
+            probs (torch.Tensor): The probabilities of token to experts assignment.
+            routing_map (torch.Tensor): The mapping of token to experts assignment,
+                with shape [num_tokens, num_experts].
+        """
+        logits = logits.view(-1, self.num_experts)
+        router_score = self.router_act(logits)
+        routing_map = router_score > 0
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        sorted_map = sorted_probs <= 0
+        sorted_indices = torch.where(sorted_map, -1, sorted_indices)
+        max_valid_num = max(sorted_probs.size(-1) - torch.min(torch.sum(sorted_map, dim=-1)).item(), 1)
+        assert torch.all(sorted_map[:, max_valid_num:])
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        assert torch.sum(routing_map) == torch.sum(sorted_indices != -1)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class TopPRouter(BaseRouter):
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.top_p = config.moe_router_topp
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating.
+        Returns:
+            probs (torch.Tensor): The probabilities of token to experts assignment.
+            routing_map (torch.Tensor): The mapping of token to experts assignment,
+                with shape [num_tokens, num_experts].
+        """
+        logits = logits.view(-1, self.num_experts)
+        router_score = torch.abs(logits)
+        router_score = router_score / (router_score.sum(dim=-1, keepdim=True) + 1e-20)
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+        mask = cumulative_probs > self.top_p
+        threshold_indices = mask.long().argmax(dim=-1)
+        threshold_mask = torch.nn.functional.one_hot(threshold_indices, num_classes=sorted_indices.size(-1)).bool()
+        mask = mask & ~threshold_mask
+        sorted_indices = torch.where(mask, -1, sorted_indices)
+        sorted_probs = torch.where(mask, 0.0, sorted_probs)
+        max_valid_num = max(mask.size(-1) - torch.min(torch.sum(mask, dim=-1)).item(), 1)
+        assert torch.all(mask[:, max_valid_num:])
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class FastTopKCalculator:
+    def __init__(self, num_experts: int):
+        self.num_experts = num_experts
+    def fmoe_sparse_topk_forward(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, experts: torch.nn.Module):
+        (
+            pos,
+            local_expert_count,
+            global_expert_count,
+            fwd_expert_count,
+            fwd_batch_size,
+        ) = prepare_forward(topk_indices, self.num_experts, 1)
+        topk = 1
+        if len(topk_indices.shape) == 2:
+            topk = topk_indices.shape[1]
+        def scatter_func(tensor):
+            return MOEScatter.apply(
+                tensor,
+                torch.div(pos, topk, rounding_mode='floor'),
+                local_expert_count,
+                global_expert_count,
+                fwd_batch_size,
+                1,
+            )
+        x = tree.map_structure(scatter_func, hidden_states)
+        x = experts(x, fwd_expert_count, topk_indices=topk_indices)
+        out_batch_size = tree.flatten(hidden_states)[0].shape[0]
+        if len(topk_indices.shape) == 2:
+            out_batch_size *= topk_indices.shape[1]
+        def gather_func(tensor):
+            return MOEGather.apply(
+                tensor,
+                pos,
+                local_expert_count,
+                global_expert_count,
+                out_batch_size,
+                1,
+            )
+        outp = tree.map_structure(gather_func, x)
+        return outp
+    def forward(self, hidden_states, topk_indices, topk_weights, experts):
+        assert topk_indices.shape == topk_weights.shape
+        top_k = topk_indices.shape[-1]
+        dim3 = hidden_states.ndim == 3
+        if dim3:
+            batch_size, seq_len, dim = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size * seq_len, dim)
+        else:
+            assert hidden_states.ndim == 2
+            batch_size, (seq_len, dim) = -1, hidden_states.shape
+        fwd = self.fmoe_sparse_topk_forward(hidden_states, topk_indices, experts)
+        def view_func(tensor):
+            n_dim = tensor.shape[-1]
+            tensor = tensor.view(-1, top_k, n_dim)
+            return tensor
+        moe_output = tree.map_structure(view_func, fwd)
+        topk_weights = topk_weights.unsqueeze(1)
+        def bmm_func(tensor):
+            n_dim = tensor.shape[-1]
+            tensor = torch.bmm(topk_weights, tensor).reshape(-1, n_dim)
+            return tensor
+        moe_output = tree.map_structure(bmm_func, moe_output)
+        if dim3:
+            moe_output = moe_output.view(batch_size, seq_len, -1)
+        return moe_output
+class MoELinearExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        num_experts: int,
+        ffn_bias: bool,
+    ):
+        super().__init__()
+        self.dim_in = self.in_features = dim_in
+        self.dim_out = self.out_features = dim_out
+        self.weight = torch.nn.Parameter(torch.empty(num_experts, dim_out, dim_in))
+        self.bias = None
+        if ffn_bias:
+            self.bias = torch.nn.Parameter(torch.empty(num_experts, dim_out))
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor):
+        x = MOELinear.apply(x, fwd_expert_count, self.weight, self.bias)
+        return x
+class MoEGatedExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_ff: int,
+        is_gated: bool,
+        act_name: str,
+        num_experts: int,
+        ffn_bias: bool = False,
+    ):
+        super().__init__()
+        self.is_gated = is_gated
+        self.dim_in, self.dim_ff, self.num_experts = dim_in, dim_ff, num_experts
+        if self.is_gated:
+            self.gate_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.up_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.down_proj = MoELinearExperts(dim_ff, dim_in, num_experts, ffn_bias)
+        self.act_fn = ACT2FN[act_name]
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor, **kwargs) -> torch.Tensor:
+        if self.is_gated:
+            gate_score = self.gate_proj(x, fwd_expert_count)
+            up_proj = self.up_proj(x, fwd_expert_count)
+            x = up_proj * self.act_fn(gate_score)
+        else:
+            up_score = self.up_proj(x, fwd_expert_count)
+            x = self.act_fn(up_score)
+        x = self.down_proj(x, fwd_expert_count)
+        return x
+class VanillaMoELayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(VanillaMoELayer, self).__init__()
+        self.config = config
+        # Initialize router
+        if config.router_type == "topk":
+            self.router = TopKRouter(config=self.config)
+        elif config.router_type == "remoe":
+            self.router = ReMoERouter(config=self.config)
+        elif config.router_type == "topp":
+            self.router = TopPRouter(config=self.config)
+        else:
+            raise NotImplementedError(f"Router type {config.router_type} not implemented.")
+        self.mix_calculator = FastTopKCalculator(num_experts=self.config.num_experts)
+        # Initialize experts
+        self.experts = MoEGatedExperts(
+            dim_in=self.config.hidden_size,
+            dim_ff=self.config.moe_ffn_hidden_size,
+            is_gated=not self.config.expert_not_gated,
+            act_name="silu",
+            num_experts=self.config.num_experts,
+        )
+        self.dim_shared_expert = self.config.moe_shared_expert_intermediate_size
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+    def forward(self, hidden_states: torch.Tensor):
+        top_scores, top_indices = self.router(hidden_states)
+        y = self.mix_calculator.forward(
+            hidden_states=hidden_states,
+            topk_indices=top_indices.contiguous(),
+            topk_weights=top_scores.type_as(hidden_states),
+            experts=self.experts,
+        )
+        if self.shared_experts is not None:
+            y = y + self.shared_experts(hidden_states)
+        return y
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+class BlockFFNAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: BlockFFNConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class BlockFFNDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BlockFFNConfig, layer_idx: int, is_moe_layer: bool):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.self_attn = BlockFFNAttention(config=config, layer_idx=layer_idx)
+        if is_moe_layer:
+            if config.use_blockffn:
+                self.mlp = BlockFFNLayer(config)
+            elif config.router_type in ["topk", "remoe", "topp"]:
+                self.mlp = VanillaMoELayer(config)
+            else:
+                raise NotImplementedError
+        else:
+            self.mlp = BlockFFNMLP(config)
+        self.input_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        return hidden_states
+@auto_docstring
+class BlockFFNPreTrainedModel(PreTrainedModel):
+    # Shared base class for the BlockFFN models in this file. It declares only
+    # class-level settings that the inherited `PreTrainedModel` machinery reads.
+    config: BlockFFNConfig
+    # Attribute name under which `BlockFFNForCausalLM` stores its `BlockFFNModel`.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # NOTE(review): presumably keeps each decoder layer on a single device when
+    # the model is sharded -- semantics are defined by transformers; confirm.
+    _no_split_modules = ["BlockFFNDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Capability flags; their exact meaning is defined by the transformers base class.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class associated with it.
+    # NOTE(review): presumably consumed by `check_model_inputs` to collect
+    # per-layer outputs -- confirm against the transformers version in use.
+    _can_record_outputs = {
+        "hidden_states": BlockFFNDecoderLayer,
+        "attentions": BlockFFNAttention,
+    }
+@auto_docstring
+class BlockFFNModel(BlockFFNPreTrainedModel):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.moe_layer_freq = eval(config.moe_layer_freq) if isinstance(config.moe_layer_freq, str) else config.moe_layer_freq
+        assert len(self.moe_layer_freq) == config.num_layers
+        self.layers = nn.ModuleList(
+            [BlockFFNDecoderLayer(config, layer_idx, bool(self.moe_layer_freq[layer_idx])) for layer_idx in range(config.num_layers)]
+        )
+        self.norm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.rotary_emb = BlockFFNRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        if self.config.use_mup:
+            inputs_embeds = inputs_embeds * self.config.mup_emb_scale
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+@auto_docstring
+class BlockFFNForCausalLM(BlockFFNPreTrainedModel, GenerationMixin):
+    # Causal language model: a `BlockFFNModel` backbone followed by a bias-free
+    # linear head projecting hidden states to vocabulary logits.
+    # NOTE(review): the three class attributes below are read by transformers
+    # (weight tying / tensor- and pipeline-parallel plans, presumably) -- confirm.
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = BlockFFNModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def set_decoder(self, decoder):
+        # Replace the backbone stored in `self.model`.
+        self.model = decoder
+    def get_decoder(self):
+        # Return the backbone stored in `self.model`.
+        return self.model
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        # Run the backbone; all inputs are forwarded unchanged.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # muP: divide the final hidden states by the width scale before the LM head.
+        if self.config.use_mup:
+            hidden_states = hidden_states / self.config.mup_width_scale
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; the default 0 gives
+        # slice(0, None), i.e. every position. A tensor is used directly as the index.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        # The loss is computed only when labels are supplied.
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Public API of this module: the names exported by a star-import.
+__all__ = [
+    "BlockFFNForCausalLM",
+    "BlockFFNModel",
+    "BlockFFNPreTrainedModel",
+]

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21d500ba074f2d0d0e202a56c2869c13f554f4fa1348148c22cbb5c012d5b2a3
+size 2901693453

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,81 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|execute_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|execute_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb74d51116831c3bf65db812c553f94ab0c88dcf97a5bbb37e3504f6d359c530
+size 1181204

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,116 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73440": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73441": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73442": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73443": {
+      "content": "<|execute_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73444": {
+      "content": "<|execute_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73445": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73446": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73447": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>",
+    "<|tool_call|>",
+    "<|execute_start|>",
+    "<|execute_end|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+}