| """ Transnormer configuration""" |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| class TransnormerConfig(PretrainedConfig): |
| model_type = "transnormer" |
| keys_to_ignore_at_inference = ["past_key_values"] |
|
|
| def __init__( |
| self, |
| pad_token_id=0, |
| bos_token_id=1, |
| eos_token_id=2, |
| vocab_size=64000, |
| use_cache=True, |
| init_std=0.02, |
| |
| decoder_embed_dim=1024, |
| decoder_layers=24, |
| decoder_attention_heads=8, |
| no_scale_embedding=False, |
| add_bos_token=False, |
| norm_type="simplermsnorm", |
| linear_use_lrpe_list=[], |
| hidden_dim=1024, |
| linear_act_fun="silu", |
| glu_dim=2816, |
| bias=False, |
| **kwargs, |
| ): |
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.use_cache = use_cache
        self.init_std = init_std

        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.no_scale_embedding = no_scale_embedding
        self.add_bos_token = add_bos_token
        self.norm_type = norm_type
        self.linear_use_lrpe_list = linear_use_lrpe_list
        self.hidden_dim = hidden_dim
        self.linear_act_fun = linear_act_fun
        self.glu_dim = glu_dim
        self.bias = bias
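

# Minimal usage sketch (assumes the `transformers` package is installed): it
# exercises only the generic PretrainedConfig save/load round trip that this
# class inherits, with a couple of overridden architecture sizes.
if __name__ == "__main__":
    config = TransnormerConfig(decoder_layers=12, decoder_embed_dim=768)
    config.save_pretrained("./transnormer-config")  # writes config.json
    reloaded = TransnormerConfig.from_pretrained("./transnormer-config")
    print(reloaded.decoder_layers, reloaded.norm_type)  # 12 simplermsnorm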