from typing import Optional

from transformers import PretrainedConfig


class VeronicaConfig(PretrainedConfig):
    """Configuration for the Veronica model.

    Follows Hugging Face ``PretrainedConfig`` conventions; extra keyword
    arguments are forwarded to the base class.
    """

    model_type = "veronica"

    def __init__(
        self,
        vocab_size: int = 50257,
        n_layer: int = 24,
        n_head: int = 12,
        n_embd: int = 768,
        mlp_mult: float = 4.0,
        num_funcs: int = 3,
        router_dim: Optional[int] = None,
        dropout: float = 0.0,
        use_channel_attention: bool = False,
        max_position_embeddings: int = 4096,
        layer_norm_epsilon: float = 1e-5,
        gradient_checkpointing: bool = False,
        # Weight of the router's auxiliary loss (purpose inferred from
        # the parameter name, e.g. a load-balancing term).
        router_aux_weight: float = 0.02,
        # Softmax temperature for the router (inferred from the name).
        router_tau: float = 1.0,
        # Base frequency for rotary position embeddings (RoPE).
        rope_theta: float = 10000.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Core architecture hyperparameters.
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.mlp_mult = mlp_mult
        self.num_funcs = num_funcs
        self.router_dim = router_dim
        self.dropout = dropout
        self.use_channel_attention = use_channel_attention
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_epsilon = layer_norm_epsilon
        self.gradient_checkpointing = gradient_checkpointing

        # Aliases so generic Hugging Face utilities that expect the
        # standard attribute names keep working.
        self.num_hidden_layers = n_layer
        self.num_attention_heads = n_head
        self.hidden_size = n_embd

        # Router behaviour.
        self.router_aux_weight = router_aux_weight
        self.router_tau = router_tau

        # Rotary position embedding base frequency.
        self.rope_theta = rope_theta
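

# Minimal usage sketch: register the config with AutoConfig and round-trip
# it through the standard PretrainedConfig serialization helpers
# (to_dict / from_dict / to_json_string). The override values below are
# illustrative only, not recommended settings.
if __name__ == "__main__":
    from transformers import AutoConfig

    # model_type "veronica" must match VeronicaConfig.model_type.
    AutoConfig.register("veronica", VeronicaConfig)

    config = VeronicaConfig(n_layer=12, num_funcs=4)
    restored = VeronicaConfig.from_dict(config.to_dict())
    assert restored.n_layer == 12 and restored.num_funcs == 4
    print(config.to_json_string())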