# Copyright 2026 Trillion Labs and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GravityMoE model configuration — inherits from DeepSeek V3."""

from transformers import DeepseekV3Config


class GravityMoEConfig(DeepseekV3Config):
    r"""
    Configuration class for the GravityMoE model, inheriting from [`DeepseekV3Config`].

    GravityMoE shares the same architecture as DeepSeek V3 (sparse MoE with MLA) but uses different
    hyperparameters. Only default values that differ from DeepSeek V3 are overridden here. See
    [`DeepseekV3Config`] for full documentation of all parameters.

    Example:

    ```python
    >>> from configuration_gravity_moe import GravityMoEConfig

    >>> configuration = GravityMoEConfig()
    >>> configuration.model_type
    'gravity_moe'
    ```
    """

    model_type = "gravity_moe"

    def __init__(
        self,
        vocab_size=151552,
        hidden_size=2048,
        intermediate_size=8192,
        moe_intermediate_size=1408,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        n_shared_experts=1,
        n_routed_experts=64,
        routed_scaling_factor=2.446,
        kv_lora_rank=512,
        q_lora_rank=None,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        n_group=1,
        topk_group=1,
        num_experts_per_tok=8,
        first_k_dense_replace=1,
        norm_topk_prob=True,
        hidden_act="silu",
        max_position_embeddings=65536,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        rope_scaling=None,
        rope_interleave=True,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            moe_intermediate_size=moe_intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            n_shared_experts=n_shared_experts,
            n_routed_experts=n_routed_experts,
            routed_scaling_factor=routed_scaling_factor,
            kv_lora_rank=kv_lora_rank,
            q_lora_rank=q_lora_rank,
            qk_rope_head_dim=qk_rope_head_dim,
            v_head_dim=v_head_dim,
            qk_nope_head_dim=qk_nope_head_dim,
            n_group=n_group,
            topk_group=topk_group,
            num_experts_per_tok=num_experts_per_tok,
            first_k_dense_replace=first_k_dense_replace,
            norm_topk_prob=norm_topk_prob,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            rope_interleave=rope_interleave,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            **kwargs,
        )


__all__ = ["GravityMoEConfig"]
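

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the upstream configuration file):
# GravityMoEConfig behaves like any `transformers` PretrainedConfig, so it can
# be instantiated, inspected, and round-tripped through a plain dict. The
# `__main__` guard below is an assumption added for illustration only; the
# printed values come from the defaults defined in the class above.
if __name__ == "__main__":
    config = GravityMoEConfig()
    print(config.model_type)           # 'gravity_moe'
    print(config.n_routed_experts)     # 64 routed experts in total
    print(config.num_experts_per_tok)  # 8 experts active per token

    # Round-trip through a dict, as PretrainedConfig supports out of the box.
    restored = GravityMoEConfig.from_dict(config.to_dict())
    assert restored.kv_lora_rank == 512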