Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

c4_validation.json +0 -0
config.json +77 -0
configuration_blockffn.py +161 -0
evaluation.log +0 -0
evaluation.log.bak +0 -0
evaluation/results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__/results_2026-01-23T14-52-19.555032.json +609 -0
evaluation/results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__/results_2026-03-30T20-55-02.181492.json +609 -0
evaluation2.log +0 -0
generation_config.json +8 -0
modeling_blockffn.py +1014 -0
modeling_blockffn.py.bak +1024 -0
pytorch_model.bin +3 -0
special_tokens_map.json +81 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +116 -0

c4_validation.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+    "architectures": [
+        "BlockFFNForCausalLM"
+    ],
+    "auto_map": {
+        "AutoConfig": "configuration_blockffn.BlockFFNConfig",
+        "AutoModel": "modeling_blockffn.BlockFFNModel",
+        "AutoModelForCausalLM": "modeling_blockffn.BlockFFNForCausalLM"
+    },
+    "bos_token_id": 1,
+    "eos_token_id": [
+        2,
+        73440
+    ],
+    "pad_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.1,
+    "intermediate_size": 10240,
+    "head_dim": 128,
+    "max_position_embeddings": 4096,
+    "model_type": "blockffn",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 10000.0,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.36.0",
+    "use_cache": true,
+    "vocab_size": 73448,
+    "use_mup": false,
+    "num_experts": 57,
+    "moe_ffn_hidden_size": 64,
+    "moe_shared_expert_intermediate_size": 128,
+    "moe_layer_freq": [
+        0,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1
+    ],
+    "moe_router_dtype": "fp32",
+    "router_act_func": "relu",
+    "router_norm_type": "simple",
+    "expert_act_func": "norm_silu",
+    "expert_act_norm_type": "normal",
+    "num_layers": 20,
+    "ffn_hidden_size": 2560,
+    "num_query_groups": 8,
+    "norm_epsilon": 1e-05,
+    "use_blockffn": true,
+    "router_type": "topk",
+    "moe_router_enable_expert_bias": false,
+    "expert_not_gated": true,
+    "moe_router_pre_softmax": false,
+    "moe_router_topk": 2,
+    "moe_router_topp": 0.5,
+    "moe_router_score_function": "softmax",
+    "moe_router_topk_scaling_factor": null
+}

configuration_blockffn.py ADDED Viewed

	@@ -0,0 +1,161 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BlockFFN model configuration"""
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+class BlockFFNConfig(PretrainedConfig):
+    model_type = "blockffn"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `BlockFFNModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        ffn_hidden_size=11008,
+        num_layers=32,
+        num_attention_heads=32,
+        num_query_groups=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        norm_epsilon=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        head_dim=None,
+        use_mup=True,
+        mup_emb_scale=12,
+        mup_depth_scale=1.4,
+        mup_base_hidden_size=256,
+        num_experts=180,
+        moe_ffn_hidden_size=128,
+        moe_shared_expert_intermediate_size=128,
+        moe_layer_freq="([0]*3+[1]*29)",
+        moe_router_dtype="fp32",
+        router_act_func="relu",
+        router_norm_type="simple",
+        expert_act_func="norm_silu",
+        expert_act_norm_type="normal",
+        use_blockffn=False,
+        router_type="topk",
+        moe_router_topk=0,
+        moe_router_topp=0,
+        moe_router_enable_expert_bias=False,
+        moe_router_score_function="sigmoid",
+        moe_router_topk_scaling_factor=2.5,
+        expert_not_gated=False,
+        moe_router_pre_softmax=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_query_groups is None:
+            num_query_groups = num_attention_heads
+        self.num_query_groups = num_query_groups
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_epsilon = norm_epsilon
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.use_mup = use_mup
+        self.mup_emb_scale = mup_emb_scale
+        self.mup_depth_scale = mup_depth_scale
+        self.mup_base_hidden_size = mup_base_hidden_size
+        self.num_experts = num_experts
+        self.moe_ffn_hidden_size = moe_ffn_hidden_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+        self.moe_layer_freq = moe_layer_freq if isinstance(moe_layer_freq, (str, list)) else ([0] * num_layers)
+        self.moe_router_dtype = moe_router_dtype
+        self.router_act_func = router_act_func
+        self.router_norm_type = router_norm_type
+        self.expert_act_func = expert_act_func
+        self.expert_act_norm_type = expert_act_norm_type
+        self.use_blockffn = use_blockffn
+        self.router_type = router_type
+        self.moe_router_topk = moe_router_topk
+        self.moe_router_topp = moe_router_topp
+        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+        self.moe_router_score_function = moe_router_score_function
+        self.moe_router_topk_scaling_factor = moe_router_topk_scaling_factor
+        self.expert_not_gated = expert_not_gated
+        self.moe_router_pre_softmax = moe_router_pre_softmax
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    @property
+    def mup_width_scale(self):
+        return (self.hidden_size / self.mup_base_hidden_size) if (self.use_mup and self.mup_base_hidden_size > 0) else 1
+__all__ = ["BlockFFNConfig"]

evaluation.log ADDED Viewed

The diff for this file is too large to render. See raw diff

evaluation.log.bak ADDED Viewed

The diff for this file is too large to render. See raw diff

evaluation/results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__/results_2026-01-23T14-52-19.555032.json ADDED Viewed

	@@ -0,0 +1,609 @@

+{
+  "results": {
+    "arc_challenge": {
+      "alias": "arc_challenge",
+      "acc,none": 0.20733788395904437,
+      "acc_stderr,none": 0.011846905782971337,
+      "acc_norm,none": 0.24914675767918087,
+      "acc_norm_stderr,none": 0.012639407111926433
+    },
+    "arc_easy": {
+      "alias": "arc_easy",
+      "acc,none": 0.5197811447811448,
+      "acc_stderr,none": 0.010251751199542735,
+      "acc_norm,none": 0.44907407407407407,
+      "acc_norm_stderr,none": 0.010206428316323365
+    },
+    "boolq": {
+      "alias": "boolq",
+      "acc,none": 0.6113149847094801,
+      "acc_stderr,none": 0.008525580498982967
+    },
+    "hellaswag": {
+      "alias": "hellaswag",
+      "acc,none": 0.29466241784505076,
+      "acc_stderr,none": 0.004549591490046219,
+      "acc_norm,none": 0.3293168691495718,
+      "acc_norm_stderr,none": 0.004690047021719822
+    },
+    "lambada_openai": {
+      "alias": "lambada_openai",
+      "perplexity,none": 66.63752818011969,
+      "perplexity_stderr,none": 2.753797550415125,
+      "acc,none": 0.27653793906462254,
+      "acc_stderr,none": 0.006231567654090107
+    },
+    "lambada_standard": {
+      "alias": "lambada_standard",
+      "perplexity,none": 198.31426320573388,
+      "perplexity_stderr,none": 8.56348322119257,
+      "acc,none": 0.19483795847079372,
+      "acc_stderr,none": 0.005518111913121867
+    },
+    "piqa": {
+      "alias": "piqa",
+      "acc,none": 0.6436343852013058,
+      "acc_stderr,none": 0.011174109865864717,
+      "acc_norm,none": 0.6131664853101197,
+      "acc_norm_stderr,none": 0.011363095931902848
+    },
+    "social_iqa": {
+      "alias": "social_iqa",
+      "acc,none": 0.3618219037871034,
+      "acc_stderr,none": 0.010873447266941618
+    },
+    "wikitext": {
+      "alias": "wikitext",
+      "word_perplexity,none": 39.00052118822888,
+      "word_perplexity_stderr,none": "N/A",
+      "byte_perplexity,none": 1.9839837931891766,
+      "byte_perplexity_stderr,none": "N/A",
+      "bits_per_byte,none": 0.9884002406536678,
+      "bits_per_byte_stderr,none": "N/A"
+    },
+    "winogrande": {
+      "alias": "winogrande",
+      "acc,none": 0.5209155485398579,
+      "acc_stderr,none": 0.014040185494212945
+    }
+  },
+  "group_subtasks": {
+    "arc_challenge": [],
+    "arc_easy": [],
+    "boolq": [],
+    "hellaswag": [],
+    "lambada_openai": [],
+    "lambada_standard": [],
+    "piqa": [],
+    "social_iqa": [],
+    "wikitext": [],
+    "winogrande": []
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "boolq": {
+      "task": "boolq",
+      "tag": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "tag": [
+        "multiple_choice"
+      ],
+      "dataset_path": "Rowan/hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
+      "doc_to_text": "{{query}}",
+      "doc_to_target": "{{label}}",
+      "unsafe_code": false,
+      "doc_to_choice": "choices",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_standard": {
+      "task": "lambada_standard",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "lambada",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "piqa": {
+      "task": "piqa",
+      "dataset_path": "baber/piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "social_iqa": {
+      "task": "social_iqa",
+      "dataset_path": "social_i_qa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Q: {{context}} {{question}}\nA:",
+      "doc_to_target": "{{ (label|int) - 1 }}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[answerA, answerB, answerC]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 0.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "def wikitext_detokenizer(doc):\n    string = doc[\"page\"]\n    # contractions\n    string = string.replace(\"s '\", \"s'\")\n    string = re.sub(r\"/' [0-9]/\", r\"/'[0-9]/\", string)\n    # number separators\n    string = string.replace(\" @-@ \", \"-\")\n    string = string.replace(\" @,@ \", \",\")\n    string = string.replace(\" @.@ \", \".\")\n    # punctuation\n    string = string.replace(\" : \", \": \")\n    string = string.replace(\" ; \", \"; \")\n    string = string.replace(\" . \", \". \")\n    string = string.replace(\" ! \", \"! \")\n    string = string.replace(\" ? \", \"? \")\n    string = string.replace(\" , \", \", \")\n    # double brackets\n    string = re.sub(r\"\\(\\s*([^\\)]*?)\\s*\\)\", r\"(\\1)\", string)\n    string = re.sub(r\"\\[\\s*([^\\]]*?)\\s*\\]\", r\"[\\1]\", string)\n    string = re.sub(r\"{\\s*([^}]*?)\\s*}\", r\"{\\1}\", string)\n    string = re.sub(r\"\\\"\\s*([^\\\"]*?)\\s*\\\"\", r'\"\\1\"', string)\n    string = re.sub(r\"'\\s*([^']*?)\\s*'\", r\"'\\1'\", string)\n    # miscellaneous\n    string = string.replace(\"= = = =\", \"====\")\n    string = string.replace(\"= = =\", \"===\")\n    string = string.replace(\"= =\", \"==\")\n    string = string.replace(\" \" + chr(176) + \" \", chr(176))\n    string = string.replace(\" \\n\", \"\\n\")\n    string = string.replace(\"\\n \", \"\\n\")\n    string = string.replace(\" N \", \" 1 \")\n    string = string.replace(\" 's\", \"'s\")\n\n    return string\n",
+      "unsafe_code": false,
+      "process_results": "def process_results(doc, results):\n    (loglikelihood,) = results\n    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*\n    _words = len(re.split(r\"\\s+\", doc[\"page\"]))\n    _bytes = len(doc[\"page\"].encode(\"utf-8\"))\n    return {\n        \"word_perplexity\": (loglikelihood, _words),\n        \"byte_perplexity\": (loglikelihood, _bytes),\n        \"bits_per_byte\": (loglikelihood, _bytes),\n    }\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
+      "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
+      "unsafe_code": false,
+      "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    }
+  },
+  "versions": {
+    "arc_challenge": 1.0,
+    "arc_easy": 1.0,
+    "boolq": 2.0,
+    "hellaswag": 1.0,
+    "lambada_openai": 1.0,
+    "lambada_standard": 1.0,
+    "piqa": 1.0,
+    "social_iqa": 0.0,
+    "wikitext": 2.0,
+    "winogrande": 1.0
+  },
+  "n-shot": {
+    "arc_challenge": 0,
+    "arc_easy": 0,
+    "boolq": 0,
+    "hellaswag": 0,
+    "lambada_openai": 0,
+    "lambada_standard": 0,
+    "piqa": 0,
+    "social_iqa": 0,
+    "wikitext": 0,
+    "winogrande": 0
+  },
+  "higher_is_better": {
+    "arc_challenge": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "arc_easy": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "boolq": {
+      "acc": true
+    },
+    "hellaswag": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "lambada_openai": {
+      "perplexity": false,
+      "acc": true
+    },
+    "lambada_standard": {
+      "perplexity": false,
+      "acc": true
+    },
+    "piqa": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "social_iqa": {
+      "acc": true
+    },
+    "wikitext": {
+      "word_perplexity": false,
+      "byte_perplexity": false,
+      "bits_per_byte": false
+    },
+    "winogrande": {
+      "acc": true
+    }
+  },
+  "n-samples": {
+    "winogrande": {
+      "original": 1267,
+      "effective": 1267
+    },
+    "wikitext": {
+      "original": 62,
+      "effective": 62
+    },
+    "social_iqa": {
+      "original": 1954,
+      "effective": 1954
+    },
+    "piqa": {
+      "original": 1838,
+      "effective": 1838
+    },
+    "lambada_standard": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "lambada_openai": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "hellaswag": {
+      "original": 10042,
+      "effective": 10042
+    },
+    "boolq": {
+      "original": 3270,
+      "effective": 3270
+    },
+    "arc_easy": {
+      "original": 2376,
+      "effective": 2376
+    },
+    "arc_challenge": {
+      "original": 1172,
+      "effective": 1172
+    }
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/,dtype=bfloat16,trust_remote_code=True,trust_remote_code=True",
+    "model_num_parameters": 392747259,
+    "model_dtype": "torch.bfloat16",
+    "model_revision": "main",
+    "model_sha": "",
+    "batch_size": "8",
+    "batch_sizes": [],
+    "device": "cuda:0",
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "gen_kwargs": null,
+    "random_seed": 0,
+    "numpy_seed": 1234,
+    "torch_seed": 1234,
+    "fewshot_seed": 1234
+  },
+  "git_hash": "core_v0.12.0-111-g418d5cb59",
+  "date": 1769150810.054304,
+  "pretty_env_info": "PyTorch version: 2.6.0+cu124\nIs debug build: False\nCUDA used to build PyTorch: 12.4\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (conda-forge gcc 9.5.0-19) 9.5.0\nClang version: Could not collect\nCMake version: version 3.30.1\nLibc version: glibc-2.35\n\nPython version: 3.10.14 (main, May  6 2024, 19:42:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-6.5.0-18-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.4.131\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A800-SXM4-80GB\nGPU 1: NVIDIA A800-SXM4-80GB\nGPU 2: NVIDIA A800-SXM4-80GB\nGPU 3: NVIDIA A800-SXM4-80GB\nGPU 4: NVIDIA A800-SXM4-80GB\nGPU 5: NVIDIA A800-SXM4-80GB\nGPU 6: NVIDIA A800-SXM4-80GB\nGPU 7: NVIDIA A800-SXM4-80GB\n\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nAddress sizes:                      52 bits physical, 57 bits virtual\nByte Order:                         Little Endian\nCPU(s):                             104\nOn-line CPU(s) list:                0-103\nVendor ID:                          GenuineIntel\nModel name:                         Intel(R) Xeon(R) Platinum 8470\nCPU family:                         6\nModel:                              143\nThread(s) per core:                 1\nCore(s) per socket:                 52\nSocket(s):                          2\nStepping:                           8\nCPU max MHz:                        3800.0000\nCPU min MHz:                        800.0000\nBogoMIPS:                           4000.00\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon 
pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req vnmi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization:                     VT-x\nL1d cache:                          4.9 MiB (104 instances)\nL1i cache:                          3.3 MiB (104 instances)\nL2 cache:                           208 MiB (104 instances)\nL3 cache:                           210 MiB (2 instances)\nNUMA node(s):                       2\nNUMA node0 CPU(s):                  0-51\nNUMA node1 CPU(s):                  52-103\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability 
Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Enhanced / Automatic IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] nvidia-cublas-cu12==12.4.5.8\n[pip3] nvidia-cuda-cupti-cu12==12.4.127\n[pip3] nvidia-cuda-nvrtc-cu12==12.4.127\n[pip3] nvidia-cuda-runtime-cu12==12.4.127\n[pip3] nvidia-cudnn-cu12==9.1.0.70\n[pip3] nvidia-cufft-cu12==11.2.1.3\n[pip3] nvidia-curand-cu12==10.3.5.147\n[pip3] nvidia-cusolver-cu12==11.6.1.9\n[pip3] nvidia-cusparse-cu12==12.3.1.170\n[pip3] nvidia-cusparselt-cu12==0.6.2\n[pip3] nvidia-nccl-cu11==2.21.5\n[pip3] nvidia-nccl-cu12==2.21.5\n[pip3] nvidia-nvjitlink-cu12==12.4.127\n[pip3] nvidia-nvtx-cu12==12.4.127\n[pip3] torch==2.6.0\n[pip3] torchaudio==2.6.0\n[pip3] torchdata==0.11.0\n[pip3] torchvision==0.21.0\n[pip3] triton==3.2.0\n[conda] cuda-cudart               12.4.99              hd3aeb46_0    conda-forge\n[conda] cuda-cudart_linux-64      12.4.99              h59595ed_0    conda-forge\n[conda] cuda-cupti                12.4.127             he02047a_2    conda-forge\n[conda] cuda-libraries            12.4.0               ha770c72_0    conda-forge\n[conda] cuda-nvrtc                12.4.99              hd3aeb46_0    conda-forge\n[conda] cuda-nvtx                 12.4.127             he02047a_2    conda-forge\n[conda] cuda-opencl               12.4.99              h59595ed_0    conda-forge\n[conda] cuda-runtime              12.4.0               ha804496_0    conda-forge\n[conda] ffmpeg                    4.3                  hf484d3e_0    pytorch\n[conda] libcublas                 12.4.2.65            hd3aeb46_0    conda-forge\n[conda] libcufft                  11.2.0.44            hd3aeb46_0    conda-forge\n[conda] libcurand                 10.3.5.119           hd3aeb46_0  
  conda-forge\n[conda] libcusolver               11.6.0.99            hd3aeb46_0    conda-forge\n[conda] libcusparse               12.3.0.142           hd3aeb46_0    conda-forge\n[conda] libjpeg-turbo             2.0.0                h9bf148f_0    pytorch\n[conda] libnvjitlink              12.4.99              hd3aeb46_0    conda-forge\n[conda] mkl                       2023.1.0         h213fc3f_46344    defaults\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] nvidia-cublas-cu12        12.4.5.8                 pypi_0    pypi\n[conda] nvidia-cuda-cupti-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-nvrtc-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-runtime-cu12  12.4.127                 pypi_0    pypi\n[conda] nvidia-cudnn-cu12         9.1.0.70                 pypi_0    pypi\n[conda] nvidia-cufft-cu12         11.2.1.3                 pypi_0    pypi\n[conda] nvidia-curand-cu12        10.3.5.147               pypi_0    pypi\n[conda] nvidia-cusolver-cu12      11.6.1.9                 pypi_0    pypi\n[conda] nvidia-cusparse-cu12      12.3.1.170               pypi_0    pypi\n[conda] nvidia-cusparselt-cu12    0.6.2                    pypi_0    pypi\n[conda] nvidia-nccl-cu11          2.21.5                   pypi_0    pypi\n[conda] nvidia-nccl-cu12          2.21.5                   pypi_0    pypi\n[conda] nvidia-nvjitlink-cu12     12.4.127                 pypi_0    pypi\n[conda] nvidia-nvtx-cu12          12.4.127                 pypi_0    pypi\n[conda] pytorch-cuda              12.4                 hc786d27_6    pytorch\n[conda] pytorch-mutex             1.0                        cuda    pytorch\n[conda] torch                     2.6.0                    pypi_0    pypi\n[conda] torchaudio                2.6.0                    pypi_0    pypi\n[conda] torchdata                 0.11.0                   pypi_0    pypi\n[conda] torchvision               0.21.0                   pypi_0    
pypi\n[conda] triton                    3.2.0                    pypi_0    pypi",
+  "transformers_version": "4.55.2",
+  "lm_eval_version": "0.4.9.1",
+  "upper_git_hash": null,
+  "tokenizer_pad_token": [
+    "<unk>",
+    "0"
+  ],
+  "tokenizer_eos_token": [
+    "<|im_end|>",
+    "73440"
+  ],
+  "tokenizer_bos_token": [
+    "<s>",
+    "1"
+  ],
+  "eot_token_id": 73440,
+  "max_length": 4096,
+  "task_hashes": {},
+  "model_source": "hf",
+  "model_name": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+  "model_name_sanitized": "results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__",
+  "system_instruction": null,
+  "system_instruction_sha": null,
+  "fewshot_as_multiturn": false,
+  "chat_template": null,
+  "chat_template_sha": null,
+  "start_time": 684014.828182741,
+  "end_time": 684465.591942707,
+  "total_evaluation_time_seconds": "450.76375996600837"
+}

evaluation/results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__/results_2026-03-30T20-55-02.181492.json ADDED Viewed

	@@ -0,0 +1,609 @@

+{
+  "results": {
+    "arc_challenge": {
+      "alias": "arc_challenge",
+      "acc,none": 0.20733788395904437,
+      "acc_stderr,none": 0.011846905782971337,
+      "acc_norm,none": 0.24914675767918087,
+      "acc_norm_stderr,none": 0.012639407111926433
+    },
+    "arc_easy": {
+      "alias": "arc_easy",
+      "acc,none": 0.5197811447811448,
+      "acc_stderr,none": 0.010251751199542735,
+      "acc_norm,none": 0.44907407407407407,
+      "acc_norm_stderr,none": 0.010206428316323365
+    },
+    "boolq": {
+      "alias": "boolq",
+      "acc,none": 0.6113149847094801,
+      "acc_stderr,none": 0.008525580498982967
+    },
+    "hellaswag": {
+      "alias": "hellaswag",
+      "acc,none": 0.29466241784505076,
+      "acc_stderr,none": 0.004549591490046219,
+      "acc_norm,none": 0.3293168691495718,
+      "acc_norm_stderr,none": 0.004690047021719822
+    },
+    "lambada_openai": {
+      "alias": "lambada_openai",
+      "perplexity,none": 66.63752818011969,
+      "perplexity_stderr,none": 2.753797550415125,
+      "acc,none": 0.27653793906462254,
+      "acc_stderr,none": 0.006231567654090107
+    },
+    "lambada_standard": {
+      "alias": "lambada_standard",
+      "perplexity,none": 198.31426320573388,
+      "perplexity_stderr,none": 8.56348322119257,
+      "acc,none": 0.19483795847079372,
+      "acc_stderr,none": 0.005518111913121867
+    },
+    "piqa": {
+      "alias": "piqa",
+      "acc,none": 0.6436343852013058,
+      "acc_stderr,none": 0.011174109865864717,
+      "acc_norm,none": 0.6131664853101197,
+      "acc_norm_stderr,none": 0.011363095931902848
+    },
+    "social_iqa": {
+      "alias": "social_iqa",
+      "acc,none": 0.3618219037871034,
+      "acc_stderr,none": 0.010873447266941618
+    },
+    "wikitext": {
+      "alias": "wikitext",
+      "word_perplexity,none": 39.00052118822888,
+      "word_perplexity_stderr,none": "N/A",
+      "byte_perplexity,none": 1.9839837931891766,
+      "byte_perplexity_stderr,none": "N/A",
+      "bits_per_byte,none": 0.9884002406536678,
+      "bits_per_byte_stderr,none": "N/A"
+    },
+    "winogrande": {
+      "alias": "winogrande",
+      "acc,none": 0.5209155485398579,
+      "acc_stderr,none": 0.014040185494212945
+    }
+  },
+  "group_subtasks": {
+    "arc_challenge": [],
+    "arc_easy": [],
+    "boolq": [],
+    "hellaswag": [],
+    "lambada_openai": [],
+    "lambada_standard": [],
+    "piqa": [],
+    "social_iqa": [],
+    "wikitext": [],
+    "winogrande": []
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "tag": [
+        "ai2_arc"
+      ],
+      "dataset_path": "allenai/ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "boolq": {
+      "task": "boolq",
+      "tag": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "tag": [
+        "multiple_choice"
+      ],
+      "dataset_path": "Rowan/hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
+      "doc_to_text": "{{query}}",
+      "doc_to_target": "{{label}}",
+      "unsafe_code": false,
+      "doc_to_choice": "choices",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "lambada_standard": {
+      "task": "lambada_standard",
+      "tag": [
+        "lambada"
+      ],
+      "dataset_path": "lambada",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "unsafe_code": false,
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "piqa": {
+      "task": "piqa",
+      "dataset_path": "baber/piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "social_iqa": {
+      "task": "social_iqa",
+      "dataset_path": "social_i_qa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Q: {{context}} {{question}}\nA:",
+      "doc_to_target": "{{ (label|int) - 1 }}",
+      "unsafe_code": false,
+      "doc_to_choice": "{{[answerA, answerB, answerC]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 0.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "def wikitext_detokenizer(doc):\n    string = doc[\"page\"]\n    # contractions\n    string = string.replace(\"s '\", \"s'\")\n    string = re.sub(r\"/' [0-9]/\", r\"/'[0-9]/\", string)\n    # number separators\n    string = string.replace(\" @-@ \", \"-\")\n    string = string.replace(\" @,@ \", \",\")\n    string = string.replace(\" @.@ \", \".\")\n    # punctuation\n    string = string.replace(\" : \", \": \")\n    string = string.replace(\" ; \", \"; \")\n    string = string.replace(\" . \", \". \")\n    string = string.replace(\" ! \", \"! \")\n    string = string.replace(\" ? \", \"? \")\n    string = string.replace(\" , \", \", \")\n    # double brackets\n    string = re.sub(r\"\\(\\s*([^\\)]*?)\\s*\\)\", r\"(\\1)\", string)\n    string = re.sub(r\"\\[\\s*([^\\]]*?)\\s*\\]\", r\"[\\1]\", string)\n    string = re.sub(r\"{\\s*([^}]*?)\\s*}\", r\"{\\1}\", string)\n    string = re.sub(r\"\\\"\\s*([^\\\"]*?)\\s*\\\"\", r'\"\\1\"', string)\n    string = re.sub(r\"'\\s*([^']*?)\\s*'\", r\"'\\1'\", string)\n    # miscellaneous\n    string = string.replace(\"= = = =\", \"====\")\n    string = string.replace(\"= = =\", \"===\")\n    string = string.replace(\"= =\", \"==\")\n    string = string.replace(\" \" + chr(176) + \" \", chr(176))\n    string = string.replace(\" \\n\", \"\\n\")\n    string = string.replace(\"\\n \", \"\\n\")\n    string = string.replace(\" N \", \" 1 \")\n    string = string.replace(\" 's\", \"'s\")\n\n    return string\n",
+      "unsafe_code": false,
+      "process_results": "def process_results(doc, results):\n    (loglikelihood,) = results\n    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*\n    _words = len(re.split(r\"\\s+\", doc[\"page\"]))\n    _bytes = len(doc[\"page\"].encode(\"utf-8\"))\n    return {\n        \"word_perplexity\": (loglikelihood, _words),\n        \"byte_perplexity\": (loglikelihood, _bytes),\n        \"bits_per_byte\": (loglikelihood, _bytes),\n    }\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}",
+      "metadata": {
+        "version": 2.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
+      "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
+      "unsafe_code": false,
+      "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence",
+      "metadata": {
+        "version": 1.0,
+        "pretrained": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+        "dtype": "bfloat16",
+        "trust_remote_code": true
+      }
+    }
+  },
+  "versions": {
+    "arc_challenge": 1.0,
+    "arc_easy": 1.0,
+    "boolq": 2.0,
+    "hellaswag": 1.0,
+    "lambada_openai": 1.0,
+    "lambada_standard": 1.0,
+    "piqa": 1.0,
+    "social_iqa": 0.0,
+    "wikitext": 2.0,
+    "winogrande": 1.0
+  },
+  "n-shot": {
+    "arc_challenge": 0,
+    "arc_easy": 0,
+    "boolq": 0,
+    "hellaswag": 0,
+    "lambada_openai": 0,
+    "lambada_standard": 0,
+    "piqa": 0,
+    "social_iqa": 0,
+    "wikitext": 0,
+    "winogrande": 0
+  },
+  "higher_is_better": {
+    "arc_challenge": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "arc_easy": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "boolq": {
+      "acc": true
+    },
+    "hellaswag": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "lambada_openai": {
+      "perplexity": false,
+      "acc": true
+    },
+    "lambada_standard": {
+      "perplexity": false,
+      "acc": true
+    },
+    "piqa": {
+      "acc": true,
+      "acc_norm": true
+    },
+    "social_iqa": {
+      "acc": true
+    },
+    "wikitext": {
+      "word_perplexity": false,
+      "byte_perplexity": false,
+      "bits_per_byte": false
+    },
+    "winogrande": {
+      "acc": true
+    }
+  },
+  "n-samples": {
+    "winogrande": {
+      "original": 1267,
+      "effective": 1267
+    },
+    "wikitext": {
+      "original": 62,
+      "effective": 62
+    },
+    "social_iqa": {
+      "original": 1954,
+      "effective": 1954
+    },
+    "piqa": {
+      "original": 1838,
+      "effective": 1838
+    },
+    "lambada_standard": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "lambada_openai": {
+      "original": 5153,
+      "effective": 5153
+    },
+    "hellaswag": {
+      "original": 10042,
+      "effective": 10042
+    },
+    "boolq": {
+      "original": 3270,
+      "effective": 3270
+    },
+    "arc_easy": {
+      "original": 2376,
+      "effective": 2376
+    },
+    "arc_challenge": {
+      "original": 1172,
+      "effective": 1172
+    }
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/,dtype=bfloat16,trust_remote_code=True,trust_remote_code=True",
+    "model_num_parameters": 392747259,
+    "model_dtype": "torch.bfloat16",
+    "model_revision": "main",
+    "model_sha": "",
+    "batch_size": "8",
+    "batch_sizes": [],
+    "device": "cuda:0",
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "gen_kwargs": null,
+    "random_seed": 0,
+    "numpy_seed": 1234,
+    "torch_seed": 1234,
+    "fewshot_seed": 1234
+  },
+  "git_hash": "core_v0.12.0-147-g5c103f4",
+  "date": 1774874949.7520695,
+  "pretty_env_info": "PyTorch version: 2.6.0+cu124\nIs debug build: False\nCUDA used to build PyTorch: 12.4\nROCM used to build PyTorch: N/A\n\nOS: CentOS Linux 7 (Core) (x86_64)\nGCC version: (conda-forge gcc 9.5.0-19) 9.5.0\nClang version: Could not collect\nCMake version: version 3.30.1\nLibc version: glibc-2.17\n\nPython version: 3.10.14 (main, May  6 2024, 19:42:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-3.10.0-1160.el7.x86_64-x86_64-with-glibc2.17\nIs CUDA available: True\nCUDA runtime version: 12.4.131\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A800-SXM4-80GB\nGPU 1: NVIDIA A800-SXM4-80GB\nGPU 2: NVIDIA A800-SXM4-80GB\nGPU 3: NVIDIA A800-SXM4-80GB\nGPU 4: NVIDIA A800-SXM4-80GB\nGPU 5: NVIDIA A800-SXM4-80GB\nGPU 6: NVIDIA A800-SXM4-80GB\nGPU 7: NVIDIA A800-SXM4-80GB\n\nNvidia driver version: 550.163.01\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:          x86_64\nCPU op-mode(s):        32-bit, 64-bit\nByte Order:            Little Endian\nCPU(s):                104\nOn-line CPU(s) list:   0-103\nThread(s) per core:    1\nCore(s) per socket:    52\n座：                 2\nNUMA 节点：         2\n厂商 ID：           GenuineIntel\nCPU 系列：          6\n型号：              143\n型号名称：        Intel(R) Xeon(R) Platinum 8470\n步进：              8\nCPU MHz：             799.926\nCPU max MHz:           3800.0000\nCPU min MHz:           800.0000\nBogoMIPS：            4000.00\n虚拟化：           VT-x\nL1d 缓存：          48K\nL1i 缓存：          32K\nL2 缓存：           2048K\nL3 缓存：           107520K\nNUMA 节点0 CPU：    0-51\nNUMA 节点1 CPU：    52-103\nFlags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx 
est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_pt cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq cldemote movdiri movdir64b md_clear pconfig spec_ctrl intel_stibp flush_l1d arch_capabilities\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] nvidia-cublas-cu12==12.4.5.8\n[pip3] nvidia-cuda-cupti-cu12==12.4.127\n[pip3] nvidia-cuda-nvrtc-cu12==12.4.127\n[pip3] nvidia-cuda-runtime-cu12==12.4.127\n[pip3] nvidia-cudnn-cu12==9.1.0.70\n[pip3] nvidia-cufft-cu12==11.2.1.3\n[pip3] nvidia-curand-cu12==10.3.5.147\n[pip3] nvidia-cusolver-cu12==11.6.1.9\n[pip3] nvidia-cusparse-cu12==12.3.1.170\n[pip3] nvidia-cusparselt-cu12==0.6.2\n[pip3] nvidia-nccl-cu11==2.21.5\n[pip3] nvidia-nccl-cu12==2.21.5\n[pip3] nvidia-nvjitlink-cu12==12.4.127\n[pip3] nvidia-nvtx-cu12==12.4.127\n[pip3] torch==2.6.0\n[pip3] torchaudio==2.6.0\n[pip3] torchdata==0.11.0\n[pip3] torchvision==0.21.0\n[pip3] triton==3.2.0\n[conda] cuda-cudart               12.4.99              hd3aeb46_0    conda-forge\n[conda] cuda-cudart_linux-64      12.4.99              h59595ed_0    conda-forge\n[conda] cuda-cupti                12.4.127             he02047a_2    conda-forge\n[conda] cuda-libraries            12.4.0               ha770c72_0    conda-forge\n[conda] cuda-nvrtc                12.4.99              hd3aeb46_0    conda-forge\n[conda] cuda-nvtx                 12.4.127             he02047a_2    conda-forge\n[conda] 
cuda-opencl               12.4.99              h59595ed_0    conda-forge\n[conda] cuda-runtime              12.4.0               ha804496_0    conda-forge\n[conda] ffmpeg                    4.3                  hf484d3e_0    pytorch\n[conda] libcublas                 12.4.2.65            hd3aeb46_0    conda-forge\n[conda] libcufft                  11.2.0.44            hd3aeb46_0    conda-forge\n[conda] libcurand                 10.3.5.119           hd3aeb46_0    conda-forge\n[conda] libcusolver               11.6.0.99            hd3aeb46_0    conda-forge\n[conda] libcusparse               12.3.0.142           hd3aeb46_0    conda-forge\n[conda] libjpeg-turbo             2.0.0                h9bf148f_0    pytorch\n[conda] libnvjitlink              12.4.99              hd3aeb46_0    conda-forge\n[conda] mkl                       2023.1.0         h213fc3f_46344    defaults\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] nvidia-cublas-cu12        12.4.5.8                 pypi_0    pypi\n[conda] nvidia-cuda-cupti-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-nvrtc-cu12    12.4.127                 pypi_0    pypi\n[conda] nvidia-cuda-runtime-cu12  12.4.127                 pypi_0    pypi\n[conda] nvidia-cudnn-cu12         9.1.0.70                 pypi_0    pypi\n[conda] nvidia-cufft-cu12         11.2.1.3                 pypi_0    pypi\n[conda] nvidia-curand-cu12        10.3.5.147               pypi_0    pypi\n[conda] nvidia-cusolver-cu12      11.6.1.9                 pypi_0    pypi\n[conda] nvidia-cusparse-cu12      12.3.1.170               pypi_0    pypi\n[conda] nvidia-cusparselt-cu12    0.6.2                    pypi_0    pypi\n[conda] nvidia-nccl-cu11          2.21.5                   pypi_0    pypi\n[conda] nvidia-nccl-cu12          2.21.5                   pypi_0    pypi\n[conda] nvidia-nvjitlink-cu12     12.4.127                 pypi_0    pypi\n[conda] nvidia-nvtx-cu12          12.4.127                 pypi_0  
  pypi\n[conda] pytorch-cuda              12.4                 hc786d27_6    pytorch\n[conda] pytorch-mutex             1.0                        cuda    pytorch\n[conda] torch                     2.6.0                    pypi_0    pypi\n[conda] torchaudio                2.6.0                    pypi_0    pypi\n[conda] torchdata                 0.11.0                   pypi_0    pypi\n[conda] torchvision               0.21.0                   pypi_0    pypi\n[conda] triton                    3.2.0                    pypi_0    pypi",
+  "transformers_version": "4.55.2",
+  "lm_eval_version": "0.4.9.1",
+  "upper_git_hash": null,
+  "tokenizer_pad_token": [
+    "<unk>",
+    "0"
+  ],
+  "tokenizer_eos_token": [
+    "<|im_end|>",
+    "73440"
+  ],
+  "tokenizer_bos_token": [
+    "<s>",
+    "1"
+  ],
+  "eot_token_id": 73440,
+  "max_length": 4096,
+  "task_hashes": {},
+  "model_source": "hf",
+  "model_name": "results/hf_ckpts/blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128/",
+  "model_name_sanitized": "results__hf_ckpts__blockffn_02b_mul1002_withmean_d64_s128_lr93e4_b128__",
+  "system_instruction": null,
+  "system_instruction_sha": null,
+  "fewshot_as_multiturn": false,
+  "chat_template": null,
+  "chat_template_sha": null,
+  "start_time": 1822696.504315611,
+  "end_time": 1823059.519498931,
+  "total_evaluation_time_seconds": "363.01518332003616"
+}

evaluation2.log ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "do_sample": true,
+    "top_p": 0.8,
+    "temperature": 0.8,
+    "bos_token_id": 1,
+    "eos_token_id": [2,73440],
+    "pad_token_id": 2
+}

modeling_blockffn.py ADDED Viewed

	@@ -0,0 +1,1014 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+import math
+import torch
+from torch import nn
+import tree
+from abc import ABC, abstractmethod
+from fmoe.linear import MOELinear
+from fmoe.functions import prepare_forward, MOEScatter, MOEGather
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from transformers.utils.generic import check_model_inputs
+from .configuration_blockffn import BlockFFNConfig
+logger = logging.get_logger(__name__)
+@use_kernel_forward_from_hub("RMSNorm")
+class BlockFFNRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class BlockFFNRotaryEmbedding(nn.Module):
+    """Builds the cos/sin tables consumed by `apply_rotary_pos_emb`.
+    The inverse frequencies and the attention scaling factor are produced by the `ROPE_INIT_FUNCTIONS` entry selected
+    through `config.rope_scaling` (or `"default"` when no scaling dict is configured).
+    """
+    def __init__(self, config: BlockFFNConfig, device=None):
+        """
+        Args:
+            config (BlockFFNConfig): Model config; `rope_scaling` (optional dict) and `max_position_embeddings` are read here.
+            device: Forwarded to the RoPE init function.
+        """
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: the buffer is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # NOTE(review): presumably kept so `dynamic_rope_update` can restore the initial frequencies - confirm.
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Compute rotary cos/sin tables for the given positions.
+        Args:
+            x (torch.Tensor): Only its device and dtype are used.
+            position_ids (torch.Tensor): Indexed as a 2-D `[batch, seq_len]` tensor.
+        Returns:
+            tuple(torch.Tensor, torch.Tensor): `cos` and `sin`, each `[batch, seq_len, 2 * len(inv_freq)]`, cast to `x.dtype`.
+        """
+        # [1, n_freq, 1] broadcast over the batch -> [batch, n_freq, 1]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        # [batch, 1, seq_len]
+        position_ids_expanded = position_ids[:, None, :].float()
+        # "mps" (or a non-string device type) falls back to the "cpu" autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product position * frequency: [batch, n_freq, seq_len] -> [batch, seq_len, n_freq]
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Frequencies are duplicated so both halves of the head dimension share the same angles.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class SimpleLayerNorm(nn.Module):
+    def __init__(self, dim_norm: int):
+        super().__init__()
+        self.dim_norm = dim_norm
+        self.weight = torch.nn.Parameter(torch.empty(self.dim_norm))
+    @torch.compile
+    def forward(self, x: torch.Tensor):
+        return  x * self.weight
+class BlockFFNMLP(nn.Module):
+    def __init__(self, config: BlockFFNConfig, intermediate_size: int = None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.ffn_hidden_size if intermediate_size is None else intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class BlockFFNRouter(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.config.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def forward(self, x: torch.Tensor):
+        return nn.functional.linear(x.to(self.router_dtype), self.weight)
+class NormSiLU(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.num_blocks, self.block_size = config.num_experts, config.moe_ffn_hidden_size
+        self.activate_fn_type = config.expert_act_func
+        assert self.activate_fn_type in ["norm_silu", "norm_silu_norms", "norm_silu_nomean"]
+        self.rms_norm = None
+        if self.activate_fn_type != "norm_silu_norms":
+            self.rms_norm = BlockFFNRMSNorm(config.moe_ffn_hidden_size, eps=config.norm_epsilon)
+        self.silu = torch.nn.SiLU()
+    @torch.compile
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        assert hidden.ndim == 2
+        if self.activate_fn_type != "norm_silu_nomean":
+            hidden = hidden - torch.mean(hidden, dim=-1, keepdim=True)
+        if self.activate_fn_type != "norm_silu_norms":
+            return self.silu(self.rms_norm(hidden.view(hidden.shape[0], self.num_blocks, self.block_size)))
+        else:
+            return self.silu(hidden)
+class BlockFFNLayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(BlockFFNLayer, self).__init__()
+        self.config = config
+        self.num_experts, self.dim_expert, self.hidden_size = \
+            config.num_experts, config.moe_ffn_hidden_size, config.hidden_size
+        self.dim_shared_expert = config.moe_shared_expert_intermediate_size
+        self.router_norm_type = config.router_norm_type
+        self.moe_router = BlockFFNRouter(self.config)
+        assert config.router_act_func == "relu"
+        self.router_act = nn.ReLU()
+        if config.router_norm_type == "simple":
+            self.router_norm = SimpleLayerNorm(self.config.num_experts)
+        elif config.router_norm_type == "rms":
+            self.router_norm = BlockFFNRMSNorm(self.config.num_experts, eps=config.norm_epsilon)
+        else:
+            raise NotImplementedError
+        self.expert_gated = not config.expert_not_gated
+        if self.expert_gated:
+            self.expert_gate_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        self.expert_up_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        assert config.expert_act_norm_type == "normal"
+        if config.expert_act_func == "norm_silu":
+            self.expert_act = NormSiLU(self.config)
+        elif config.expert_act_func == "silu":
+            self.expert_act = nn.SiLU()
+        else:
+            raise NotImplementedError
+        self.expert_down_proj = nn.Linear(self.num_experts * self.dim_expert, self.hidden_size, bias=config.mlp_bias)
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+    def forward(self, hidden_states: torch.Tensor):
+        ori_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        seq_len = hidden_states.shape[0]
+        # router module forward
+        raw_router_score = self.moe_router(hidden_states)  # [seq_len, num_experts]
+        router_score = self.router_act(raw_router_score)
+        router_score = self.router_norm(router_score)
+        # expert module forward
+        x_in = self.expert_up_proj(hidden_states)  # [seq_len, num_experts * dim_expert]
+        if self.expert_gated:
+            x_gate = self.expert_gate_proj(hidden_states)
+            x_in = x_in * self.expert_act(x_gate)
+        else:
+            x_in = self.expert_act(x_in)
+        if x_in.ndim == 3:
+            scored_x_in = x_in * router_score.type_as(hidden_states).unsqueeze(-1)
+        else:
+            scored_x_in = x_in.view(seq_len, self.num_experts, self.dim_expert) * router_score.type_as(hidden_states).unsqueeze(-1)
+        output = self.expert_down_proj(scored_x_in.view(seq_len, self.num_experts * self.dim_expert))
+        if self.use_shared_expert:
+            output = output + self.shared_experts(hidden_states)
+        return output.view(*ori_shape)
+class BaseRouter(ABC, nn.Module):
+    """Base Router class"""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def gating(self, input: torch.Tensor):
+        return torch.nn.functional.linear(input.to(self.router_dtype), self.weight.to(self.router_dtype))
+    @abstractmethod
+    def routing(self, logits: torch.Tensor):
+        """Routing function.
+        Args:
+            logits (torch.Tensor): Logits tensor.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing token assignment
+            probabilities and mapping.
+        """
+        raise NotImplementedError("Routing function not implemented.")
+    @abstractmethod
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        raise NotImplementedError("Forward function not implemented.")
+class TopKRouter(BaseRouter):
+    """Route each token to the top-k experts."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        """
+        Args:
+            config (BlockFFNConfig): Reads `moe_router_topk`, `moe_router_score_function`, `moe_router_pre_softmax`,
+                `moe_router_topk_scaling_factor` and `moe_router_enable_expert_bias`.
+        """
+        super().__init__(config)
+        self.config = config
+        self.topk = self.config.moe_router_topk
+        self.score_function = self.config.moe_router_score_function
+        self.use_pre_softmax = self.config.moe_router_pre_softmax
+        self.scaling_factor = self.config.moe_router_topk_scaling_factor
+        self.enable_expert_bias = self.config.moe_router_enable_expert_bias
+        if self.enable_expert_bias:
+            # One float32 bias per expert; it is only read in the sigmoid branch of `routing`.
+            self.expert_bias = torch.nn.Parameter(torch.zeros(self.num_experts, dtype=torch.float32))
+        else:
+            self.expert_bias = None
+    def _maintain_float32_expert_bias(self):
+        """
+        Maintain the expert bias in float32.
+        When using bf16/fp16, the expert bias gets converted to lower precision in Float16Module.
+        We keep it in float32 to avoid routing errors when updating the expert_bias.
+        """
+        if hasattr(self, 'expert_bias') and self.expert_bias is not None:
+            if self.expert_bias.dtype != torch.float32:
+                self.expert_bias.data = self.expert_bias.data.to(torch.float32)
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating; flattened to [num_tokens, num_experts].
+        Returns:
+            probs (torch.Tensor): Weights of the selected experts, with shape [num_tokens, topk].
+            top_indices (torch.Tensor): Indices of the selected experts, with shape [num_tokens, topk].
+        Raises:
+            ValueError: If `score_function` is neither "softmax" nor "sigmoid".
+        """
+        logits = logits.view(-1, self.num_experts)
+        if self.score_function == "softmax":
+            if self.use_pre_softmax:
+                # Softmax over all experts first, then keep the k largest probabilities (they do not sum to 1).
+                scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+                probs, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            else:
+                # Select the k largest logits first, then softmax over just those.
+                scores, top_indices = torch.topk(logits, k=self.topk, dim=1)
+                probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
+        elif self.score_function == "sigmoid":
+            scores = torch.sigmoid(logits.float()).type_as(logits)
+            if self.expert_bias is not None:
+                # The bias only influences which experts are selected; the returned weights are the unbiased scores.
+                scores_for_routing = scores + self.expert_bias
+                _, top_indices = torch.topk(scores_for_routing, k=self.topk, dim=1)
+                scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
+            else:
+                scores, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            # Normalise the selected scores to sum to 1 (skipped for top-1); 1e-20 guards against a zero sum.
+            probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if self.topk > 1 else scores
+        else:
+            raise ValueError(f"Invalid score_function: {self.score_function}")
+        # A falsy scaling factor (None or 0) leaves the weights untouched.
+        if self.scaling_factor:
+            probs = probs * self.scaling_factor
+        return probs, top_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        Returns:
+            The `(probs, top_indices)` pair produced by `routing`.
+        """
+        self._maintain_float32_expert_bias()
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class ReMoERouter(BaseRouter):
+    """Route each token to every expert whose ReLU-activated router logit is strictly positive."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.router_act = torch.nn.ReLU()
+    def routing(self, logits: torch.Tensor):
+        """ReLU routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating; flattened to [num_tokens, num_experts].
+        Returns:
+            sorted_probs (torch.Tensor): ReLU scores sorted in descending order per token and truncated to
+                [num_tokens, max_valid_num]; padding positions hold 0.
+            sorted_indices (torch.Tensor): Matching expert indices, with -1 wherever the score is not positive.
+        """
+        logits = logits.view(-1, self.num_experts)
+        router_score = self.router_act(logits)
+        routing_map = router_score > 0
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        # True where the (sorted) score is zero, i.e. the expert is not selected for that token.
+        sorted_map = sorted_probs <= 0
+        sorted_indices = torch.where(sorted_map, -1, sorted_indices)
+        # Largest number of active experts over all tokens, but at least one column is always kept.
+        # `.item()` pulls the count to the host as a Python number.
+        max_valid_num = max(sorted_probs.size(-1) - torch.min(torch.sum(sorted_map, dim=-1)).item(), 1)
+        # Everything beyond that column must be inactive for every token.
+        assert torch.all(sorted_map[:, max_valid_num:])
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        # Sanity check: truncation must not have dropped any active (token, expert) pair.
+        assert torch.sum(routing_map) == torch.sum(sorted_indices != -1)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        Returns:
+            The `(sorted_probs, sorted_indices)` pair produced by `routing`.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class TopPRouter(BaseRouter):
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.top_p = config.moe_router_topp
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating.
+        Returns:
+            probs (torch.Tensor): The probabilities of token to experts assignment.
+            routing_map (torch.Tensor): The mapping of token to experts assignment,
+                with shape [num_tokens, num_experts].
+        """
+        logits = logits.view(-1, self.num_experts)
+        router_score = torch.abs(logits)
+        router_score = router_score / (router_score.sum(dim=-1, keepdim=True) + 1e-20)
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+        mask = cumulative_probs > self.top_p
+        threshold_indices = mask.long().argmax(dim=-1)
+        threshold_mask = torch.nn.functional.one_hot(threshold_indices, num_classes=sorted_indices.size(-1)).bool()
+        mask = mask & ~threshold_mask
+        sorted_indices = torch.where(mask, -1, sorted_indices)
+        sorted_probs = torch.where(mask, 0.0, sorted_probs)
+        max_valid_num = max(mask.size(-1) - torch.min(torch.sum(mask, dim=-1)).item(), 1)
+        assert torch.all(mask[:, max_valid_num:])
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class FastTopKCalculator:
+    """Dispatches tokens to experts with the fmoe scatter/gather ops and mixes the expert outputs by routing weight."""
+    def __init__(self, num_experts: int):
+        self.num_experts = num_experts
+    def fmoe_sparse_topk_forward(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, experts: torch.nn.Module):
+        """Scatter tokens to experts, run `experts`, and gather the results back.
+        Args:
+            hidden_states: Tensor (or nested structure of tensors) with tokens along dim 0.
+            topk_indices: Expert index per token, or per (token, slot) when 2-D.
+            experts: Module called as `experts(x, fwd_expert_count, topk_indices=topk_indices)`.
+        Returns:
+            The gathered expert outputs, with `tokens * topk` rows when `topk_indices` is 2-D.
+        """
+        # NOTE(review): the trailing 1 here and in the scatter/gather calls is presumably fmoe's world size
+        # (single process) - confirm against the fmoe API.
+        (
+            pos,
+            local_expert_count,
+            global_expert_count,
+            fwd_expert_count,
+            fwd_batch_size,
+        ) = prepare_forward(topk_indices, self.num_experts, 1)
+        topk = 1
+        if len(topk_indices.shape) == 2:
+            topk = topk_indices.shape[1]
+        def scatter_func(tensor):
+            # Presumably `pos` indexes the flattened (token, slot) assignments, so floor-dividing by topk
+            # recovers the source token row - verify against fmoe.
+            return MOEScatter.apply(
+                tensor,
+                torch.div(pos, topk, rounding_mode='floor'),
+                local_expert_count,
+                global_expert_count,
+                fwd_batch_size,
+                1,
+            )
+        x = tree.map_structure(scatter_func, hidden_states)
+        x = experts(x, fwd_expert_count, topk_indices=topk_indices)
+        # One output row per (token, slot) pair.
+        out_batch_size = tree.flatten(hidden_states)[0].shape[0]
+        if len(topk_indices.shape) == 2:
+            out_batch_size *= topk_indices.shape[1]
+        def gather_func(tensor):
+            return MOEGather.apply(
+                tensor,
+                pos,
+                local_expert_count,
+                global_expert_count,
+                out_batch_size,
+                1,
+            )
+        outp = tree.map_structure(gather_func, x)
+        return outp
+    def forward(self, hidden_states, topk_indices, topk_weights, experts):
+        """Run the experts and combine their outputs with `topk_weights`.
+        Args:
+            hidden_states: 2-D [tokens, dim] or 3-D [batch, seq_len, dim] tensor.
+            topk_indices: Selected expert indices; must have the same shape as `topk_weights`.
+            topk_weights: Mixing weight per selected expert.
+            experts: Expert module passed on to `fmoe_sparse_topk_forward`.
+        Returns:
+            The weighted sum of expert outputs; reshaped to [batch, seq_len, -1] for 3-D input.
+        """
+        assert topk_indices.shape == topk_weights.shape
+        top_k = topk_indices.shape[-1]
+        dim3 = hidden_states.ndim == 3
+        if dim3:
+            batch_size, seq_len, dim = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size * seq_len, dim)
+        else:
+            assert hidden_states.ndim == 2
+            # batch_size is only a placeholder here; it is not used on the 2-D path.
+            batch_size, (seq_len, dim) = -1, hidden_states.shape
+        fwd = self.fmoe_sparse_topk_forward(hidden_states, topk_indices, experts)
+        def view_func(tensor):
+            # [tokens * top_k, n_dim] -> [tokens, top_k, n_dim]
+            n_dim = tensor.shape[-1]
+            tensor = tensor.view(-1, top_k, n_dim)
+            return tensor
+        moe_output = tree.map_structure(view_func, fwd)
+        # [tokens, top_k] -> [tokens, 1, top_k] so bmm performs the weighted sum over the top_k axis.
+        topk_weights = topk_weights.unsqueeze(1)
+        def bmm_func(tensor):
+            n_dim = tensor.shape[-1]
+            tensor = torch.bmm(topk_weights, tensor).reshape(-1, n_dim)
+            return tensor
+        moe_output = tree.map_structure(bmm_func, moe_output)
+        if dim3:
+            moe_output = moe_output.view(batch_size, seq_len, -1)
+        return moe_output
+class MoELinearExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        num_experts: int,
+        ffn_bias: bool,
+    ):
+        super().__init__()
+        self.dim_in = self.in_features = dim_in
+        self.dim_out = self.out_features = dim_out
+        self.weight = torch.nn.Parameter(torch.empty(num_experts, dim_out, dim_in))
+        self.bias = None
+        if ffn_bias:
+            self.bias = torch.nn.Parameter(torch.empty(num_experts, dim_out))
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor):
+        x = MOELinear.apply(x, fwd_expert_count, self.weight, self.bias)
+        return x
+class MoEGatedExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_ff: int,
+        is_gated: bool,
+        act_name: str,
+        num_experts: int,
+        ffn_bias: bool = False,
+    ):
+        super().__init__()
+        self.is_gated = is_gated
+        self.dim_in, self.dim_ff, self.num_experts = dim_in, dim_ff, num_experts
+        if self.is_gated:
+            self.gate_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.up_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.down_proj = MoELinearExperts(dim_ff, dim_in, num_experts, ffn_bias)
+        self.act_fn = ACT2FN[act_name]
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor, **kwargs) -> torch.Tensor:
+        if self.is_gated:
+            gate_score = self.gate_proj(x, fwd_expert_count)
+            up_proj = self.up_proj(x, fwd_expert_count)
+            x = up_proj * self.act_fn(gate_score)
+        else:
+            up_score = self.up_proj(x, fwd_expert_count)
+            x = self.act_fn(up_score)
+        x = self.down_proj(x, fwd_expert_count)
+        return x
+class VanillaMoELayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(VanillaMoELayer, self).__init__()
+        self.config = config
+        # Initialize router
+        if config.router_type == "topk":
+            self.router = TopKRouter(config=self.config)
+        elif config.router_type == "remoe":
+            self.router = ReMoERouter(config=self.config)
+        elif config.router_type == "topp":
+            self.router = TopPRouter(config=self.config)
+        else:
+            raise NotImplementedError(f"Router type {config.router_type} not implemented.")
+        self.mix_calculator = FastTopKCalculator(num_experts=self.config.num_experts)
+        # Initialize experts
+        self.experts = MoEGatedExperts(
+            dim_in=self.config.hidden_size,
+            dim_ff=self.config.moe_ffn_hidden_size,
+            is_gated=not self.config.expert_not_gated,
+            act_name="silu",
+            num_experts=self.config.num_experts,
+        )
+        self.dim_shared_expert = self.config.moe_shared_expert_intermediate_size
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+    def forward(self, hidden_states: torch.Tensor):
+        top_scores, top_indices = self.router(hidden_states)
+        y = self.mix_calculator.forward(
+            hidden_states=hidden_states,
+            topk_indices=top_indices.contiguous(),
+            topk_weights=top_scores.type_as(hidden_states),
+            experts=self.experts,
+        )
+        if self.shared_experts is not None:
+            y = y + self.shared_experts(hidden_states)
+        return y
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+class BlockFFNAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with grouped key/value heads and rotary
+    position embeddings applied to the queries and keys."""
+    def __init__(self, config: BlockFFNConfig, layer_idx: int):
+        """
+        Args:
+            config (BlockFFNConfig): Model config; `num_attention_heads`, `num_query_groups`, `hidden_size`,
+                `attention_dropout` and `attention_bias` are read here.
+            layer_idx (int): Index handed to the key/value cache when it is updated.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Falls back to hidden_size // num_attention_heads when the config has no `head_dim` attribute.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run self-attention over `hidden_states`.
+        Args:
+            hidden_states (torch.Tensor): Input whose last dimension is the hidden size.
+            position_embeddings (tuple): The `(cos, sin)` rotary tables.
+            attention_mask (torch.Tensor, *optional*): Passed through to the attention function.
+            past_key_value (Cache, *optional*): When given, it is updated with the new keys/values and the
+                returned cached states are used for attention.
+            cache_position (torch.LongTensor, *optional*): Forwarded to the cache update.
+            **kwargs: Any other keyword arguments are forwarded unchanged to the attention function.
+        Returns:
+            tuple(torch.Tensor, torch.Tensor): The projected attention output and the attention weights as
+            returned by the attention function.
+        """
+        input_shape = hidden_states.shape[:-1]
+        # Split the projection output into heads; -1 resolves to the number of heads of each projection.
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Eager attention is the fallback; any other configured implementation is looked up in the registry.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Merge the head and head_dim axes back into one feature axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class BlockFFNDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BlockFFNConfig, layer_idx: int, is_moe_layer: bool):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.self_attn = BlockFFNAttention(config=config, layer_idx=layer_idx)
+        if is_moe_layer:
+            if config.use_blockffn:
+                self.mlp = BlockFFNLayer(config)
+            elif config.router_type in ["topk", "remoe", "topp"]:
+                self.mlp = VanillaMoELayer(config)
+            else:
+                raise NotImplementedError
+        else:
+            self.mlp = BlockFFNMLP(config)
+        self.input_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        return hidden_states
+@auto_docstring
+class BlockFFNPreTrainedModel(PreTrainedModel):
+    # Shared base class of the BlockFFN models. It defines no behaviour of its own, only class-level
+    # declarations that the inherited `PreTrainedModel` machinery is expected to read.
+    config: BlockFFNConfig
+    # Name of the attribute holding the bare decoder; BlockFFNForCausalLM stores it as `self.model`.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # presumably: module classes that must not be split across devices when sharding — confirm in the transformers docs
+    _no_split_modules = ["BlockFFNDecoderLayer"]
+    # presumably: forward kwargs left in place when inputs are moved between devices — confirm in the transformers docs
+    _skip_keys_device_placement = ["past_key_values"]
+    # Capability flags advertising which attention back-ends / compilation modes the model claims to support.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output field name to the module class that produces it.
+    # NOTE(review): looks like this is what lets BlockFFNModel.forward return only `last_hidden_state` and
+    # `past_key_values` while BlockFFNForCausalLM still reads `outputs.hidden_states` / `outputs.attentions` — confirm.
+    _can_record_outputs = {
+        "hidden_states": BlockFFNDecoderLayer,
+        "attentions": BlockFFNAttention,
+    }
+@auto_docstring
+class BlockFFNModel(BlockFFNPreTrainedModel):
+    # Bare BlockFFN decoder: token embedding -> stack of BlockFFNDecoderLayer -> final RMSNorm.
+    # It returns hidden states only; the vocabulary projection lives in BlockFFNForCausalLM.
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        # `moe_layer_freq` is a per-layer sequence of flags (truthy = MoE layer). A string value is evaluated as a
+        # Python expression.
+        # SECURITY NOTE(review): eval() executes arbitrary code, so a config from an untrusted source can run code
+        # here. ast.literal_eval would be safer but would reject expression-style values such as "[0]+[1]*31";
+        # confirm which string forms must be supported before replacing it.
+        self.moe_layer_freq = eval(config.moe_layer_freq) if isinstance(config.moe_layer_freq, str) else config.moe_layer_freq
+        # NOTE(review): this check disappears under `python -O`.
+        assert len(self.moe_layer_freq) == config.num_layers
+        self.layers = nn.ModuleList(
+            [BlockFFNDecoderLayer(config, layer_idx, bool(self.moe_layer_freq[layer_idx])) for layer_idx in range(config.num_layers)]
+        )
+        self.norm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.rotary_emb = BlockFFNRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # Exactly one of `input_ids` / `inputs_embeds` must be given; the XOR is true when both or neither are set.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        # The muP embedding scale is applied to caller-supplied `inputs_embeds` as well as to looked-up embeddings.
+        if self.config.use_mup:
+            inputs_embeds = inputs_embeds * self.config.mup_emb_scale
+        # Create a fresh cache when caching is requested and the caller did not pass one.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+        # Default cache positions continue from the number of tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # Default position ids are the cache positions with a leading axis of size 1.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        # Rotary (cos, sin) tables are computed once and passed to every layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+@auto_docstring
+class BlockFFNForCausalLM(BlockFFNPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = BlockFFNModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        if self.config.use_mup:
+            hidden_states = hidden_states / self.config.mup_width_scale
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Public API of this module: the causal-LM wrapper, the bare decoder, and their shared base class.
+__all__ = [
+    "BlockFFNForCausalLM",
+    "BlockFFNModel",
+    "BlockFFNPreTrainedModel",
+]

modeling_blockffn.py.bak ADDED Viewed

	@@ -0,0 +1,1024 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Union
+import math
+import torch
+from torch import nn
+import tree
+from abc import ABC, abstractmethod
+from fmoe.linear import MOELinear
+from fmoe.functions import prepare_forward, MOEScatter, MOEGather
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from transformers.utils.generic import check_model_inputs
+from .configuration_blockffn import BlockFFNConfig
+logger = logging.get_logger(__name__)
+@use_kernel_forward_from_hub("RMSNorm")
+class BlockFFNRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class BlockFFNRotaryEmbedding(nn.Module):
+    """Computes the rotary position embedding tables (cos, sin) for a batch of position ids."""
+    def __init__(self, config: BlockFFNConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        # The init function registered for the selected RoPE variant returns the inverse frequencies and a
+        # scaling factor that `forward` multiplies into cos and sin.
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Registered with persistent=False, so the buffer is not part of the module's state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # presumably kept so `dynamic_rope_update` can restore the initial frequencies — TODO confirm
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`.
+        Args:
+            x: tensor from which only the device and dtype are read.
+            position_ids: 2-D tensor of positions, indexed here as (batch, sequence).
+        Returns:
+            Two tensors of shape (batch, sequence, 2 * len(inv_freq)), cast to `x.dtype`.
+        """
+        # [batch, n_freq, 1] @ [batch, 1, seq] -> [batch, n_freq, seq]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # The autocast context below is opened for "cpu" when the device type is "mps" (or is not a string).
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # The frequency table is repeated twice along the last axis to cover both halves used by `rotate_half`.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class SimpleLayerNorm(nn.Module):
+    def __init__(self, dim_norm: int):
+        super().__init__()
+        self.dim_norm = dim_norm
+        self.weight = torch.nn.Parameter(torch.empty(self.dim_norm))
+    @torch.compile
+    def forward(self, x: torch.Tensor):
+        return  x * self.weight
+class BlockFFNMLP(nn.Module):
+    def __init__(self, config: BlockFFNConfig, intermediate_size: int = None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.ffn_hidden_size if intermediate_size is None else intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class BlockFFNRouter(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.config.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def forward(self, x: torch.Tensor):
+        return nn.functional.linear(x.to(self.router_dtype), self.weight)
+class NormSiLU(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__()
+        self.num_blocks, self.block_size = config.num_experts, config.moe_ffn_hidden_size
+        self.activate_fn_type = config.expert_act_func
+        assert self.activate_fn_type in ["norm_silu", "norm_silu_norms", "norm_silu_nomean"]
+        self.rms_norm = None
+        if self.activate_fn_type != "norm_silu_norms":
+            self.rms_norm = BlockFFNRMSNorm(config.moe_ffn_hidden_size, eps=config.norm_epsilon)
+        self.silu = torch.nn.SiLU()
+    @torch.compile
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        assert hidden.ndim == 2
+        if self.activate_fn_type != "norm_silu_nomean":
+            hidden = hidden - torch.mean(hidden, dim=-1, keepdim=True)
+        if self.activate_fn_type != "norm_silu_norms":
+            return self.silu(self.rms_norm(hidden.view(hidden.shape[0], self.num_blocks, self.block_size)))
+        else:
+            return self.silu(hidden)
+class BlockFFNLayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(BlockFFNLayer, self).__init__()
+        self.config = config
+        self.num_experts, self.dim_expert, self.hidden_size = \
+            config.num_experts, config.moe_ffn_hidden_size, config.hidden_size
+        self.dim_shared_expert = config.moe_shared_expert_intermediate_size
+        self.router_norm_type = config.router_norm_type
+        self.moe_router = BlockFFNRouter(self.config)
+        assert config.router_act_func == "relu"
+        self.router_act = nn.ReLU()
+        if config.router_norm_type == "simple":
+            self.router_norm = SimpleLayerNorm(self.config.num_experts)
+        elif config.router_norm_type == "rms":
+            self.router_norm = BlockFFNRMSNorm(self.config.num_experts, eps=config.norm_epsilon)
+        else:
+            raise NotImplementedError
+        self.expert_gated = not config.expert_not_gated
+        if self.expert_gated:
+            self.expert_gate_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        self.expert_up_proj = nn.Linear(self.hidden_size, self.num_experts * self.dim_expert, bias=config.mlp_bias)
+        assert config.expert_act_norm_type == "normal"
+        if config.expert_act_func == "norm_silu":
+            self.expert_act = NormSiLU(self.config)
+        elif config.expert_act_func == "silu":
+            self.expert_act = nn.SiLU()
+        else:
+            raise NotImplementedError
+        self.expert_down_proj = nn.Linear(self.num_experts * self.dim_expert, self.hidden_size, bias=config.mlp_bias)
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+        self.expert_wise_scales = []
+    def forward(self, hidden_states: torch.Tensor):
+        ori_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        seq_len = hidden_states.shape[0]
+        # router module forward
+        raw_router_score = self.moe_router(hidden_states)  # [seq_len, num_experts]
+        router_score = self.router_act(raw_router_score)
+        router_score = self.router_norm(router_score)
+        # expert module forward
+        x_in = self.expert_up_proj(hidden_states)  # [seq_len, num_experts * dim_expert]
+        ori_x_in = x_in
+        if self.expert_gated:
+            x_gate = self.expert_gate_proj(hidden_states)
+            x_in = x_in * self.expert_act(x_gate)
+        else:
+            x_in = self.expert_act(x_in)
+        if x_in.ndim == 3:
+            scored_x_in = x_in * router_score.type_as(hidden_states).unsqueeze(-1)
+        else:
+            scored_x_in = x_in.view(seq_len, self.num_experts, self.dim_expert) * router_score.type_as(hidden_states).unsqueeze(-1)
+        output = self.expert_down_proj(scored_x_in.view(seq_len, self.num_experts * self.dim_expert))
+        with torch.no_grad():
+            ori_x_in = ori_x_in.view(seq_len, self.num_experts, self.dim_expert)
+            down_proj_weight = self.expert_down_proj.weight.view(self.hidden_size, self.num_experts, self.dim_expert)
+            expert_wise_outputs = torch.einsum("sed,hed->seh", ori_x_in, down_proj_weight).transpose(0, 1).reshape(self.num_experts, seq_len * self.hidden_size)
+            expert_wise_scale = torch.norm(expert_wise_outputs, p=2, dim=1) / seq_len
+            self.expert_wise_scales.append(expert_wise_scale.tolist())
+        if self.use_shared_expert:
+            output = output + self.shared_experts(hidden_states)
+        return output.view(*ori_shape)
+class BaseRouter(ABC, nn.Module):
+    """Base Router class"""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.num_experts = self.config.num_experts
+        if self.config.moe_router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        elif self.config.moe_router_dtype == "fp64":
+            self.router_dtype = torch.float64
+        elif self.config.moe_router_dtype == "bf16":
+            self.router_dtype = torch.bfloat16
+        else:
+            raise NotImplementedError(f"{self.config.moe_router_dtype} is not supported.")
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.num_experts, self.config.hidden_size), dtype=self.router_dtype)
+        )
+    def gating(self, input: torch.Tensor):
+        return torch.nn.functional.linear(input.to(self.router_dtype), self.weight.to(self.router_dtype))
+    @abstractmethod
+    def routing(self, logits: torch.Tensor):
+        """Routing function.
+        Args:
+            logits (torch.Tensor): Logits tensor.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing token assignment
+            probabilities and mapping.
+        """
+        raise NotImplementedError("Routing function not implemented.")
+    @abstractmethod
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        """
+        raise NotImplementedError("Forward function not implemented.")
+class TopKRouter(BaseRouter):
+    """Route each token to the top-k experts."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.topk = self.config.moe_router_topk
+        self.score_function = self.config.moe_router_score_function
+        self.use_pre_softmax = self.config.moe_router_pre_softmax
+        self.scaling_factor = self.config.moe_router_topk_scaling_factor
+        self.enable_expert_bias = self.config.moe_router_enable_expert_bias
+        # Optional per-expert bias; `routing` uses it only on the sigmoid path and only to select experts.
+        if self.enable_expert_bias:
+            self.expert_bias = torch.nn.Parameter(torch.zeros(self.num_experts, dtype=torch.float32))
+        else:
+            self.expert_bias = None
+    def _maintain_float32_expert_bias(self):
+        """
+        Maintain the expert bias in float32.
+        When using bf16/fp16, the expert bias gets converted to lower precision in Float16Module.
+        We keep it in float32 to avoid routing errors when updating the expert_bias.
+        """
+        if hasattr(self, 'expert_bias') and self.expert_bias is not None:
+            if self.expert_bias.dtype != torch.float32:
+                self.expert_bias.data = self.expert_bias.data.to(torch.float32)
+    def routing(self, logits: torch.Tensor):
+        """Top-k routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating; viewed as [num_tokens, num_experts].
+        Returns:
+            probs (torch.Tensor): The weights of the selected experts, shape [num_tokens, topk].
+            top_indices (torch.Tensor): The indices of the selected experts, shape [num_tokens, topk].
+        Raises:
+            ValueError: if `score_function` is neither "softmax" nor "sigmoid".
+        """
+        logits = logits.view(-1, self.num_experts)
+        if self.score_function == "softmax":
+            if self.use_pre_softmax:
+                # Softmax over all experts first, then keep the k largest probabilities (they need not sum to 1).
+                scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+                probs, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            else:
+                # Keep the k largest logits first, then softmax over just those k.
+                scores, top_indices = torch.topk(logits, k=self.topk, dim=1)
+                probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
+        elif self.score_function == "sigmoid":
+            scores = torch.sigmoid(logits.float()).type_as(logits)
+            if self.expert_bias is not None:
+                # The bias influences which experts are picked, but the returned weights are the unbiased scores.
+                scores_for_routing = scores + self.expert_bias
+                _, top_indices = torch.topk(scores_for_routing, k=self.topk, dim=1)
+                scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
+            else:
+                scores, top_indices = torch.topk(scores, k=self.topk, dim=1)
+            # With more than one expert per token the selected scores are renormalised to sum to 1.
+            probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if self.topk > 1 else scores
+        else:
+            raise ValueError(f"Invalid score_function: {self.score_function}")
+        # A falsy scaling factor (None or 0) leaves the weights unscaled.
+        if self.scaling_factor:
+            probs = probs * self.scaling_factor
+        return probs, top_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        Returns:
+            The `(probs, top_indices)` pair produced by `routing`.
+        """
+        self._maintain_float32_expert_bias()
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class ReMoERouter(BaseRouter):
+    """Route each token to every expert whose ReLU-activated gate score is positive."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.router_act = torch.nn.ReLU()
+    def routing(self, logits: torch.Tensor):
+        """ReLU routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating; viewed as [num_tokens, num_experts].
+        Returns:
+            sorted_probs (torch.Tensor): ReLU scores sorted in descending order per token and truncated to
+                `max_valid_num` columns, the largest number of active experts of any token (at least 1).
+            sorted_indices (torch.Tensor): The matching expert indices, with -1 in slots whose score is not positive.
+        """
+        logits = logits.view(-1, self.num_experts)
+        router_score = self.router_act(logits)
+        routing_map = router_score > 0
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        # After the descending sort, the inactive (non-positive) slots sit at the end of each row.
+        sorted_map = sorted_probs <= 0
+        sorted_indices = torch.where(sorted_map, -1, sorted_indices)
+        # Width needed by the token with the most active experts; `.item()` pulls the count into Python.
+        max_valid_num = max(sorted_probs.size(-1) - torch.min(torch.sum(sorted_map, dim=-1)).item(), 1)
+        # Sanity check: everything beyond that width is inactive for every token.
+        assert torch.all(sorted_map[:, max_valid_num:])
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        # Sanity check: truncation dropped no active assignment.
+        assert torch.sum(routing_map) == torch.sum(sorted_indices != -1)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        Returns:
+            The `(sorted_probs, sorted_indices)` pair produced by `routing`.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class TopPRouter(BaseRouter):
+    """Route each token to the highest-scoring experts until their cumulative normalised score exceeds `top_p`."""
+    def __init__(self, config: BlockFFNConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.top_p = config.moe_router_topp
+    def routing(self, logits: torch.Tensor):
+        """Top-p routing function
+        Args:
+            logits (torch.Tensor): Logits tensor after gating; viewed as [num_tokens, num_experts].
+        Returns:
+            sorted_probs (torch.Tensor): Scores of the kept experts in descending order, renormalised to sum
+                to 1 per token, with 0 in dropped slots; truncated to the largest kept count of any token.
+            sorted_indices (torch.Tensor): The matching expert indices, with -1 in dropped slots.
+        """
+        logits = logits.view(-1, self.num_experts)
+        # Scores are the absolute logits normalised to sum to 1 per token.
+        router_score = torch.abs(logits)
+        router_score = router_score / (router_score.sum(dim=-1, keepdim=True) + 1e-20)
+        sorted_probs, sorted_indices = torch.sort(router_score, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+        mask = cumulative_probs > self.top_p
+        # First position whose cumulative score exceeds top_p. That expert is kept too (its mask bit is
+        # cleared below), so the kept set's total score passes the threshold.
+        # NOTE(review): when no position exceeds top_p the mask is all False and argmax presumably returns 0,
+        # in which case clearing that bit changes nothing — confirm.
+        threshold_indices = mask.long().argmax(dim=-1)
+        threshold_mask = torch.nn.functional.one_hot(threshold_indices, num_classes=sorted_indices.size(-1)).bool()
+        mask = mask & ~threshold_mask
+        sorted_indices = torch.where(mask, -1, sorted_indices)
+        sorted_probs = torch.where(mask, 0.0, sorted_probs)
+        # Width needed by the token that keeps the most experts; `.item()` pulls the count into Python.
+        max_valid_num = max(mask.size(-1) - torch.min(torch.sum(mask, dim=-1)).item(), 1)
+        # Sanity check: everything beyond that width is dropped for every token.
+        assert torch.all(mask[:, max_valid_num:])
+        sorted_indices = sorted_indices[:, :max_valid_num]
+        sorted_probs = sorted_probs[:, :max_valid_num]
+        # NOTE(review): this renormalisation has no epsilon; a token whose logits are all zero would divide 0 by 0.
+        sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
+        return sorted_probs, sorted_indices
+    def forward(self, input: torch.Tensor):
+        """
+        Forward pass of the router.
+        Args:
+            input (torch.Tensor): Input tensor.
+        Returns:
+            The `(sorted_probs, sorted_indices)` pair produced by `routing`.
+        """
+        logits = self.gating(input)
+        top_scores, top_indices = self.routing(logits)
+        return top_scores, top_indices
+class FastTopKCalculator:
+    """Runs an expert module on the tokens routed to each expert (via the `fmoe` scatter/gather ops) and
+    combines the per-expert outputs with the routing weights."""
+    def __init__(self, num_experts: int):
+        self.num_experts = num_experts
+    def fmoe_sparse_topk_forward(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, experts: torch.nn.Module):
+        """Scatter tokens to experts, run `experts`, and gather the results back.
+        Args:
+            hidden_states: a tensor, or a nested structure of tensors (handled through `tree.map_structure`).
+            topk_indices: expert index per token, or per (token, slot) when 2-D.
+            experts: callable invoked as `experts(x, fwd_expert_count, topk_indices=topk_indices)`.
+        Returns:
+            The gathered expert outputs, with the same nesting as what `experts` returned.
+        """
+        # presumably `pos` is the scatter order and the counts are tokens per expert; the trailing 1 looks like
+        # a world size of one (no expert parallelism) — TODO confirm against the fmoe documentation
+        (
+            pos,
+            local_expert_count,
+            global_expert_count,
+            fwd_expert_count,
+            fwd_batch_size,
+        ) = prepare_forward(topk_indices, self.num_experts, 1)
+        topk = 1
+        if len(topk_indices.shape) == 2:
+            topk = topk_indices.shape[1]
+        def scatter_func(tensor):
+            # `pos` is divided by topk, presumably mapping each (token, slot) position back to its token row.
+            return MOEScatter.apply(
+                tensor,
+                torch.div(pos, topk, rounding_mode='floor'),
+                local_expert_count,
+                global_expert_count,
+                fwd_batch_size,
+                1,
+            )
+        x = tree.map_structure(scatter_func, hidden_states)
+        x = experts(x, fwd_expert_count, topk_indices=topk_indices)
+        # One output row per (token, slot) pair: the first dimension of the input times the number of slots.
+        out_batch_size = tree.flatten(hidden_states)[0].shape[0]
+        if len(topk_indices.shape) == 2:
+            out_batch_size *= topk_indices.shape[1]
+        def gather_func(tensor):
+            return MOEGather.apply(
+                tensor,
+                pos,
+                local_expert_count,
+                global_expert_count,
+                out_batch_size,
+                1,
+            )
+        outp = tree.map_structure(gather_func, x)
+        return outp
+    def forward(self, hidden_states, topk_indices, topk_weights, experts):
+        """Compute the weighted sum of the selected experts' outputs for every token.
+        Args:
+            hidden_states: 2-D [tokens, dim] or 3-D [batch, seq, dim] tensor.
+            topk_indices: selected expert indices; must have the same shape as `topk_weights`.
+            topk_weights: routing weights; the last dimension is the number of slots per token.
+            experts: expert module forwarded to `fmoe_sparse_topk_forward`.
+        Returns:
+            The combined output, reshaped to [batch, seq, -1] when the input was 3-D.
+        """
+        assert topk_indices.shape == topk_weights.shape
+        top_k = topk_indices.shape[-1]
+        dim3 = hidden_states.ndim == 3
+        if dim3:
+            batch_size, seq_len, dim = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size * seq_len, dim)
+        else:
+            assert hidden_states.ndim == 2
+            batch_size, (seq_len, dim) = -1, hidden_states.shape
+        fwd = self.fmoe_sparse_topk_forward(hidden_states, topk_indices, experts)
+        def view_func(tensor):
+            # [tokens * top_k, n_dim] -> [tokens, top_k, n_dim]
+            n_dim = tensor.shape[-1]
+            tensor = tensor.view(-1, top_k, n_dim)
+            return tensor
+        moe_output = tree.map_structure(view_func, fwd)
+        # assumes topk_weights is 2-D [tokens, top_k] so the unsqueeze gives [tokens, 1, top_k] for bmm — TODO confirm
+        topk_weights = topk_weights.unsqueeze(1)
+        def bmm_func(tensor):
+            # [tokens, 1, top_k] @ [tokens, top_k, n_dim] -> [tokens, 1, n_dim] -> [tokens, n_dim]
+            n_dim = tensor.shape[-1]
+            tensor = torch.bmm(topk_weights, tensor).reshape(-1, n_dim)
+            return tensor
+        moe_output = tree.map_structure(bmm_func, moe_output)
+        # `.view` is called on the result directly, so the 3-D path requires `experts` to return a single tensor.
+        if dim3:
+            moe_output = moe_output.view(batch_size, seq_len, -1)
+        return moe_output
+class MoELinearExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        num_experts: int,
+        ffn_bias: bool,
+    ):
+        super().__init__()
+        self.dim_in = self.in_features = dim_in
+        self.dim_out = self.out_features = dim_out
+        self.weight = torch.nn.Parameter(torch.empty(num_experts, dim_out, dim_in))
+        self.bias = None
+        if ffn_bias:
+            self.bias = torch.nn.Parameter(torch.empty(num_experts, dim_out))
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor):
+        x = MOELinear.apply(x, fwd_expert_count, self.weight, self.bias)
+        return x
+class MoEGatedExperts(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_ff: int,
+        is_gated: bool,
+        act_name: str,
+        num_experts: int,
+        ffn_bias: bool = False,
+    ):
+        super().__init__()
+        self.is_gated = is_gated
+        self.dim_in, self.dim_ff, self.num_experts = dim_in, dim_ff, num_experts
+        if self.is_gated:
+            self.gate_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.up_proj = MoELinearExperts(dim_in, dim_ff, num_experts, ffn_bias)
+        self.down_proj = MoELinearExperts(dim_ff, dim_in, num_experts, ffn_bias)
+        self.act_fn = ACT2FN[act_name]
+    def forward(self, x: torch.Tensor, fwd_expert_count: torch.Tensor, **kwargs) -> torch.Tensor:
+        if self.is_gated:
+            gate_score = self.gate_proj(x, fwd_expert_count)
+            up_proj = self.up_proj(x, fwd_expert_count)
+            x = up_proj * self.act_fn(gate_score)
+        else:
+            up_score = self.up_proj(x, fwd_expert_count)
+            x = self.act_fn(up_score)
+        x = self.down_proj(x, fwd_expert_count)
+        return x
+class VanillaMoELayer(nn.Module):
+    def __init__(self, config: BlockFFNConfig):
+        super(VanillaMoELayer, self).__init__()
+        self.config = config
+        # Initialize router
+        if config.router_type == "topk":
+            self.router = TopKRouter(config=self.config)
+        elif config.router_type == "remoe":
+            self.router = ReMoERouter(config=self.config)
+        elif config.router_type == "topp":
+            self.router = TopPRouter(config=self.config)
+        else:
+            raise NotImplementedError(f"Router type {config.router_type} not implemented.")
+        self.mix_calculator = FastTopKCalculator(num_experts=self.config.num_experts)
+        # Initialize experts
+        self.experts = MoEGatedExperts(
+            dim_in=self.config.hidden_size,
+            dim_ff=self.config.moe_ffn_hidden_size,
+            is_gated=not self.config.expert_not_gated,
+            act_name="silu",
+            num_experts=self.config.num_experts,
+        )
+        self.dim_shared_expert = self.config.moe_shared_expert_intermediate_size
+        self.use_shared_expert = self.dim_shared_expert is not None and self.dim_shared_expert > 0
+        if self.use_shared_expert:
+            self.shared_experts = BlockFFNMLP(self.config, intermediate_size=self.dim_shared_expert)
+    def forward(self, hidden_states: torch.Tensor):
+        top_scores, top_indices = self.router(hidden_states)
+        y = self.mix_calculator.forward(
+            hidden_states=hidden_states,
+            topk_indices=top_indices.contiguous(),
+            topk_weights=top_scores.type_as(hidden_states),
+            experts=self.experts,
+        )
+        if self.shared_experts is not None:
+            y = y + self.shared_experts(hidden_states)
+        return y
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+class BlockFFNAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: BlockFFNConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_query_groups * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class BlockFFNDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BlockFFNConfig, layer_idx: int, is_moe_layer: bool):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.self_attn = BlockFFNAttention(config=config, layer_idx=layer_idx)
+        if is_moe_layer:
+            if config.use_blockffn:
+                self.mlp = BlockFFNLayer(config)
+            elif config.router_type in ["topk", "remoe", "topp"]:
+                self.mlp = VanillaMoELayer(config)
+            else:
+                raise NotImplementedError
+        else:
+            self.mlp = BlockFFNMLP(config)
+        self.input_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.config.use_mup:
+            hidden_states = residual + hidden_states * (self.config.mup_depth_scale / math.sqrt(self.config.num_layers))
+        else:
+            hidden_states = residual + hidden_states
+        return hidden_states
+# Shared base class for the BlockFFN models: it only declares class-level settings that the
+# PreTrainedModel machinery reads; no behaviour is implemented here.
+@auto_docstring
+class BlockFFNPreTrainedModel(PreTrainedModel):
+    config: BlockFFNConfig
+    # Attribute name under which BlockFFNForCausalLM stores the bare decoder stack.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Presumably keeps each decoder layer on a single device when the model is sharded -- TODO confirm.
+    _no_split_modules = ["BlockFFNDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Attention backends this model declares support for.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    # NOTE(review): the routing code earlier in this file calls ``.item()`` on a tensor, which
+    # usually forces a graph break under torch.compile -- confirm that full-graph compile really works.
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Module classes whose outputs are captured for the "hidden_states" / "attentions" outputs.
+    _can_record_outputs = {
+        "hidden_states": BlockFFNDecoderLayer,
+        "attentions": BlockFFNAttention,
+    }
+# Bare decoder stack: token embedding, ``config.num_layers`` decoder layers, final RMSNorm.
+# It returns the last hidden states and the key/value cache, with no language-modeling head.
+@auto_docstring
+class BlockFFNModel(BlockFFNPreTrainedModel):
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # The third positional argument of nn.Embedding is padding_idx.
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        # moe_layer_freq holds one entry per layer; a truthy entry makes that layer an MoE layer.
+        # SECURITY NOTE(review): eval() executes arbitrary code when moe_layer_freq is a string, so a
+        # config from an untrusted source can run code at load time. ast.literal_eval would be safer
+        # if only literal lists need to be supported -- confirm before changing.
+        self.moe_layer_freq = eval(config.moe_layer_freq) if isinstance(config.moe_layer_freq, str) else config.moe_layer_freq
+        # NOTE(review): assert is stripped under ``python -O``; consider raising ValueError instead.
+        assert len(self.moe_layer_freq) == config.num_layers
+        self.layers = nn.ModuleList(
+            [BlockFFNDecoderLayer(config, layer_idx, bool(self.moe_layer_freq[layer_idx])) for layer_idx in range(config.num_layers)]
+        )
+        self.norm = BlockFFNRMSNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.rotary_emb = BlockFFNRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # The XOR is true when both inputs are given or both are missing.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        # The muP embedding scale is applied to caller-supplied inputs_embeds as well.
+        if self.config.use_mup:
+            inputs_embeds = inputs_embeds * self.config.mup_emb_scale
+        # Create a cache on demand when caching is requested but none was passed in.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+        # Default positions continue right after the tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        # The rotary embeddings are computed once and shared by every layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+# Causal language model: the BlockFFNModel decoder stack followed by a bias-free linear head
+# that maps hidden states to vocabulary logits.
+@auto_docstring
+class BlockFFNForCausalLM(BlockFFNPreTrainedModel, GenerationMixin):
+    # Declared for the PreTrainedModel weight-tying / parallelism machinery
+    # (presumably lm_head.weight may be tied to the input embeddings -- TODO confirm).
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config: BlockFFNConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = BlockFFNModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def set_decoder(self, decoder):
+        # Replace the wrapped decoder stack.
+        self.model = decoder
+    def get_decoder(self):
+        # Return the wrapped decoder stack.
+        return self.model
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # With muP enabled, the hidden states are divided by the width scale before the head.
+        if self.config.use_mup:
+            hidden_states = hidden_states / self.config.mup_width_scale
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last ``logits_to_keep`` positions (0 gives slice(0, None), i.e. all of them);
+        # a tensor is used directly as an index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Names exported by ``from modeling_blockffn import *``.
+__all__ = [
+    "BlockFFNForCausalLM",
+    "BlockFFNModel",
+    "BlockFFNPreTrainedModel",
+]

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74b0e11eedf5a22c1bf2c1a55805f9fbbc4a859404d3530cddd6ed16f8292166
+size 785588525

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,81 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|execute_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|execute_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb74d51116831c3bf65db812c553f94ab0c88dcf97a5bbb37e3504f6d359c530
+size 1181204

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,116 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73440": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73441": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73442": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73443": {
+      "content": "<|execute_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73444": {
+      "content": "<|execute_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73445": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73446": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73447": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>",
+    "<|tool_call|>",
+    "<|execute_start|>",
+    "<|execute_end|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+}