| """BlockFFN model configuration""" |
|
|
| from transformers import PretrainedConfig |
| from transformers.modeling_rope_utils import rope_config_validation |
|
|
|
|
class BlockFFNConfig(PretrainedConfig):
    """Configuration class for a BlockFFN model, storing the attention, RoPE, muP, and
    MoE/BlockFFN routing hyperparameters used to instantiate the model."""

    model_type = "blockffn"
    keys_to_ignore_at_inference = ["past_key_values"]

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
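    # Note (assumption about usage): the two plans above follow the Hugging Face transformers
    # convention for tensor parallelism ("colwise"/"rowwise" sharding per projection) and
    # pipeline parallelism (per-module input/output names); they are read by the framework
    # rather than by this class directly.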
|
|
    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        ffn_hidden_size=11008,
        num_layers=32,
        num_attention_heads=32,
        num_query_groups=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        norm_epsilon=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        head_dim=None,
        use_mup=True,
        mup_emb_scale=12,
        mup_depth_scale=1.4,
        mup_base_hidden_size=256,
        num_experts=180,
        moe_ffn_hidden_size=128,
        moe_shared_expert_intermediate_size=128,
        moe_layer_freq="([0]*3+[1]*29)",
        moe_router_dtype="fp32",
        router_act_func="relu",
        router_norm_type="simple",
        router_norm_fixed=False,
        router_norm_scalar=False,
        router_norm_init_var=0.1,
        expert_act_func="norm_silu",
        expert_act_norm_type="normal",
        use_blockffn=False,
        router_type="topk",
        moe_router_topk=0,
        moe_router_topp=0,
        moe_router_enable_expert_bias=False,
        moe_router_score_function="sigmoid",
        moe_router_topk_scaling_factor=2.5,
        expert_not_gated=False,
        moe_router_pre_softmax=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
|
|
        # Default to standard multi-head attention (one KV group per head) when
        # grouped-query attention is not configured explicitly.
        if num_query_groups is None:
            num_query_groups = num_attention_heads
|
|
        self.num_query_groups = num_query_groups
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_epsilon = norm_epsilon
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
        self.use_mup = use_mup
        self.mup_emb_scale = mup_emb_scale
        self.mup_depth_scale = mup_depth_scale
        self.mup_base_hidden_size = mup_base_hidden_size
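        # Note (derived from the defaults above): the muP-style width multiplier exposed by
        # `mup_width_scale` below is hidden_size / mup_base_hidden_size = 4096 / 256 = 16.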
|
|
        # MoE / expert FFN settings.
        self.num_experts = num_experts
        self.moe_ffn_hidden_size = moe_ffn_hidden_size
        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
        # `moe_layer_freq` may be a per-layer list or a string pattern; the default
        # "([0]*3+[1]*29)" is assumed to mark 3 dense layers followed by 29 MoE layers.
        # Any other value falls back to an all-zero list of length num_layers.
        self.moe_layer_freq = moe_layer_freq if isinstance(moe_layer_freq, (str, list)) else ([0] * num_layers)
        self.moe_router_dtype = moe_router_dtype
        self.router_act_func = router_act_func
        self.router_norm_type = router_norm_type
        self.router_norm_fixed = router_norm_fixed
        self.router_norm_scalar = router_norm_scalar
        self.router_norm_init_var = router_norm_init_var
        self.expert_act_func = expert_act_func
        self.expert_act_norm_type = expert_act_norm_type
|
|
        # BlockFFN / router settings.
        self.use_blockffn = use_blockffn
        self.router_type = router_type
        self.moe_router_topk = moe_router_topk
        self.moe_router_topp = moe_router_topp
        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
        self.moe_router_score_function = moe_router_score_function
        self.moe_router_topk_scaling_factor = moe_router_topk_scaling_factor
        self.expert_not_gated = expert_not_gated
        self.moe_router_pre_softmax = moe_router_pre_softmax
|
|
        # Back-compat: accept the legacy `rope_scaling["type"]` key, then validate the RoPE config.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)
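        # Illustrative example (schema assumed from the Hugging Face RoPE utilities): passing
        # rope_scaling={"type": "linear", "factor": 2.0} is rewritten above so that
        # rope_config_validation sees rope_scaling["rope_type"] == "linear".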
|
|
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
|
    @property
    def mup_width_scale(self):
        """Width multiplier relative to the muP base width (1 when muP is disabled or the base width is not positive)."""
        return (self.hidden_size / self.mup_base_hidden_size) if (self.use_mup and self.mup_base_hidden_size > 0) else 1


__all__ = ["BlockFFNConfig"]
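

# Minimal usage sketch (illustrative): construct the config while overriding a couple of MoE
# fields, then read the derived attributes defined above. The field names and defaults are
# taken from this module; the printed values assume the default widths.
if __name__ == "__main__":
    config = BlockFFNConfig(num_experts=64, moe_router_topk=8)
    print(config.model_type)       # "blockffn"
    print(config.head_dim)         # hidden_size // num_attention_heads -> 128
    print(config.mup_width_scale)  # hidden_size / mup_base_hidden_size -> 16.0
    print(config.num_experts, config.moe_router_topk)  # 64 8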
|
|