from transformers.configuration_utils import PretrainedConfig


class SmallThinkerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`SmallThinkerModel`].
    It is used to instantiate a SmallThinker model according to the specified arguments, defining the model architecture.
    The default values for each of the parameters are the same as the ones used in the original SmallThinker 4B model.

    General configs:
    - model_type: "smallthinker"
    - model_name
    - num_hidden_layers
    - hidden_size

    Tokenizer configs:
    - pad_token_id
    - bos_token_id
    - eos_token_id

    Embedding configs:
    - vocab_size

    RMSNorm configs:
    - rms_norm_eps

    Attention configs:
    - num_attention_heads
    - num_key_value_heads
    - head_dim
    - use_cache
    - rope_layout: per-layer array of 0s and 1s, 0 for NoPE (no positional embedding), 1 for RoPE
    - rope_theta
    - max_position_embeddings
    - sliding_window_layout: per-layer array of 0s and 1s, 0 for full attention, 1 for sliding-window attention (SWA)
    - sliding_window_size

    MoE FFN configs:
    - moe_num_primary_experts
    - moe_ffn_hidden_size
    - moe_primary_router_apply_softmax: if True, route with top-k softmax instead of top-k sigmoid followed by renormalization (see the sketch below)
    - moe_num_active_primary_experts
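
    The two routing modes differ only in how the per-token expert weights are derived from the router
    logits. A minimal sketch of both (illustrative only; the actual SmallThinker implementation may
    differ in detail, and `route` is a hypothetical helper, not part of this module's API):

    ```python
    import torch

    def route(router_logits: torch.Tensor, k: int, apply_softmax: bool):
        # router_logits: [num_tokens, moe_num_primary_experts]
        if apply_softmax:
            # topk-softmax: softmax over all experts, then keep the top-k weights
            probs = torch.softmax(router_logits, dim=-1)
            weights, indices = torch.topk(probs, k, dim=-1)
        else:
            # topk-sigmoid-normalize: per-expert sigmoid, keep the top-k,
            # then renormalize the kept weights to sum to 1
            scores = torch.sigmoid(router_logits)
            weights, indices = torch.topk(scores, k, dim=-1)
            weights = weights / weights.sum(dim=-1, keepdim=True)
        return weights, indices
    ```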

    LM Head configs:
    - tie_word_embeddings

    Other configs:
    - initializer_range
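
    Example (a minimal usage sketch; the top-level `transformers` import path is an assumption and
    depends on how this model is registered):

    ```python
    >>> from transformers import SmallThinkerConfig

    >>> # Default configuration matching the SmallThinker 4B base model
    >>> configuration = SmallThinkerConfig()

    >>> # Custom per-layer layouts: RoPE on every layer, sliding-window attention on odd layers
    >>> configuration = SmallThinkerConfig(
    ...     num_hidden_layers=24,
    ...     rope_layout=[1] * 24,
    ...     sliding_window_layout=[i % 2 for i in range(24)],
    ... )
    ```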
| """ |
    def __init__(
        self,
        model_type="smallthinker",
        model_name="smallthinker_4b_base",
        num_hidden_layers=32,
        hidden_size=1536,
        pad_token_id=None,
        bos_token_id=151643,
        eos_token_id=None,
        vocab_size=151936,
        rms_norm_eps=1e-6,
        num_attention_heads=12,
        num_key_value_heads=2,
        head_dim=128,
        use_cache=True,
        rope_layout=None,
        rope_theta=1e6,
        max_position_embeddings=4096 * 32,
        sliding_window_layout=None,
        sliding_window_size=4096,
        moe_num_primary_experts=32,
        moe_ffn_hidden_size=768,
        moe_primary_router_apply_softmax=False,
        moe_num_active_primary_experts=4,
        tie_word_embeddings=True,
        initializer_range=0.02,
        **kwargs,
    ):
        # Resolve mutable defaults (avoids the shared-list default-argument pitfall)
        if eos_token_id is None:
            eos_token_id = [151643, 151645]
        if rope_layout is None:
            rope_layout = [1] * num_hidden_layers
        if sliding_window_layout is None:
            sliding_window_layout = [0] * num_hidden_layers

        # Config sanitization: fail early on inconsistent settings
        if num_attention_heads % num_key_value_heads != 0:
            raise ValueError("[SmallThinker config sanitizer] num_attention_heads must be divisible by num_key_value_heads")
        if len(rope_layout) != num_hidden_layers:
            raise ValueError("[SmallThinker config sanitizer] rope_layout must have the same length as num_hidden_layers")
        if len(sliding_window_layout) != num_hidden_layers:
            raise ValueError("[SmallThinker config sanitizer] sliding_window_layout must have the same length as num_hidden_layers")

        # General configs
        self.model_type = model_type
        self.model_name = model_name
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size

        # Tokenizer configs
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Embedding configs
        self.vocab_size = vocab_size

        # RMSNorm configs
        self.rms_norm_eps = rms_norm_eps

        # Attention configs
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.use_cache = use_cache
        self.rope_layout = rope_layout
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.sliding_window_layout = sliding_window_layout
        self.sliding_window_size = sliding_window_size

        # MoE FFN configs
        self.moe_num_primary_experts = moe_num_primary_experts
        self.moe_ffn_hidden_size = moe_ffn_hidden_size
        self.moe_primary_router_apply_softmax = moe_primary_router_apply_softmax
        self.moe_num_active_primary_experts = moe_num_active_primary_experts

        # Other configs
        self.initializer_range = initializer_range

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # Attribute names expected elsewhere in transformers for sliding-window attention
        self.sliding_window = sliding_window_size
        self.sliding_window_pattern = sliding_window_layout

        # Default attention backend
        self._attn_implementation = "sdpa"


__all__ = ["SmallThinkerConfig"]
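

# Optional smoke test when running this file directly. A minimal sketch, not part of the
# original module: it only checks that the default per-layer layouts are consistent.
if __name__ == "__main__":
    cfg = SmallThinkerConfig()
    assert len(cfg.rope_layout) == cfg.num_hidden_layers
    assert len(cfg.sliding_window_layout) == cfg.num_hidden_layers
    print(cfg)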
|
|