{ "architectures": [ "BlockFFNForCausalLM" ], "auto_map": { "AutoConfig": "configuration_blockffn.BlockFFNConfig", "AutoModel": "modeling_blockffn.BlockFFNModel", "AutoModelForCausalLM": "modeling_blockffn.BlockFFNForCausalLM" }, "bos_token_id": 1, "eos_token_id": [ 2, 73440 ], "pad_token_id": 2, "hidden_act": "silu", "hidden_size": 1792, "initializer_range": 0.1, "intermediate_size": 10240, "head_dim": 128, "max_position_embeddings": 4096, "model_type": "blockffn", "num_attention_heads": 14, "num_hidden_layers": 32, "num_key_value_heads": 2, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000.0, "torch_dtype": "bfloat16", "transformers_version": "4.36.0", "use_cache": true, "vocab_size": 73448, "use_mup": false, "num_experts": 102, "moe_ffn_hidden_size": 64, "moe_shared_expert_intermediate_size": 128, "moe_layer_freq": [ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ], "moe_router_dtype": "fp32", "router_act_func": "relu", "router_norm_type": "simple", "expert_act_func": "norm_silu", "expert_act_norm_type": "normal", "num_layers": 32, "ffn_hidden_size": 4480, "num_query_groups": 14, "norm_epsilon": 1e-05, "router_norm_fixed": false, "router_norm_scalar": false, "router_norm_init_var": 0.1, "moe_expert_bias_apply_method": "rms", "use_blockffn": true, "router_type": "topk", "moe_router_enable_expert_bias": false, "expert_not_gated": true, "moe_router_pre_softmax": false, "moe_router_topk": 2, "moe_router_topp": 0.5, "moe_router_score_function": "softmax", "moe_router_topk_scaling_factor": null }