| """GravityMoE model configuration — inherits from DeepSeek V3.""" |
|
|
| from transformers import DeepseekV3Config |
|
|
|
|
class GravityMoEConfig(DeepseekV3Config):
    r"""
    Configuration class for the GravityMoE model, inheriting from
    [`DeepseekV3Config`]. GravityMoE shares the same architecture as
    DeepSeek V3 (sparse MoE with MLA) but uses different hyperparameters.

    Only default values that differ from DeepSeek V3 are overridden here.
    See [`DeepseekV3Config`] for full documentation of all parameters.

    Example:

    ```python
    >>> from configuration_gravity_moe import GravityMoEConfig

    >>> configuration = GravityMoEConfig()
    >>> configuration.model_type
    'gravity_moe'
    ```
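
    Any default can also be overridden at construction time; the values below
    are illustrative only (for example, a small configuration for testing):

    ```python
    >>> small_configuration = GravityMoEConfig(num_hidden_layers=2)
    >>> small_configuration.num_hidden_layers
    2
    ```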
| """ |
|
|
| model_type = "gravity_moe" |
|
|
    def __init__(
        self,
        # Core model and MoE expert sizes
        vocab_size=151552,
        hidden_size=2048,
        intermediate_size=8192,
        moe_intermediate_size=1408,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        # Expert counts and routed-output scaling
        n_shared_experts=1,
        n_routed_experts=64,
        routed_scaling_factor=2.446,
        # Multi-head latent attention (MLA) projection dimensions
        kv_lora_rank=512,
        q_lora_rank=None,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # MoE routing: expert groups, top-k selection, and leading dense layers
        n_group=1,
        topk_group=1,
        num_experts_per_tok=8,
        first_k_dense_replace=1,
        norm_topk_prob=True,
        # Activation, context length, and initialization
        hidden_act="silu",
        max_position_embeddings=65536,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        # Special tokens and embedding tying
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        # Rotary position embedding and attention settings
        rope_theta=1000000.0,
        rope_scaling=None,
        rope_interleave=True,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            moe_intermediate_size=moe_intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            n_shared_experts=n_shared_experts,
            n_routed_experts=n_routed_experts,
            routed_scaling_factor=routed_scaling_factor,
            kv_lora_rank=kv_lora_rank,
            q_lora_rank=q_lora_rank,
            qk_rope_head_dim=qk_rope_head_dim,
            v_head_dim=v_head_dim,
            qk_nope_head_dim=qk_nope_head_dim,
            n_group=n_group,
            topk_group=topk_group,
            num_experts_per_tok=num_experts_per_tok,
            first_k_dense_replace=first_k_dense_replace,
            norm_topk_prob=norm_topk_prob,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            rope_interleave=rope_interleave,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            **kwargs,
        )


__all__ = ["GravityMoEConfig"]
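

if __name__ == "__main__":
    # Minimal sanity-check sketch, not part of the model API: when this file is
    # run directly, it compares a few GravityMoE defaults against the DeepSeek V3
    # baseline they override. The attribute names below are standard
    # DeepseekV3Config fields; the selection is illustrative only.
    _baseline = DeepseekV3Config()
    _gravity = GravityMoEConfig()
    for _name in (
        "vocab_size",
        "hidden_size",
        "n_routed_experts",
        "num_experts_per_tok",
        "max_position_embeddings",
    ):
        print(f"{_name}: gravity_moe={getattr(_gravity, _name)!r} deepseek_v3={getattr(_baseline, _name)!r}")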