from transformers import PretrainedConfig


class SlimMoEConfig(PretrainedConfig):
    """Configuration for SlimMoE, a small mixture-of-experts transformer."""

    model_type = "slim_moe"
    def __init__(
        self,
        vocab_size: int = 50257,
        dim: int = 768,
        num_hidden_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 2048,
        num_experts: int = 4,
        max_seq_len: int = 2048,
        dropout: float = 0.1,
        adaptive_routing: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.num_experts = num_experts
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.adaptive_routing = adaptive_routing

        # PretrainedConfig.__init__ pops tie_word_embeddings from kwargs, so an
        # attribute set before the super() call would be overwritten. Route the
        # default through kwargs instead, keeping embeddings tied by default
        # while still allowing an explicit override.
        kwargs.setdefault("tie_word_embeddings", True)

        super().__init__(**kwargs)

    @classmethod
    def for_250m(cls, vocab_size: int = 50257, max_seq_len: int = 2048, dropout: float = 0.1):
        """
        Create a configuration for a ~250M parameter model.

        Uses: dim=768, layers=16, heads=12, hidden_dim=1536, experts=4.
        With tied input/output embeddings this comes to roughly 230M
        parameters, safely under the 250M budget.
        """
        return cls(
            vocab_size=vocab_size,
            dim=768,
            num_hidden_layers=16,
            num_heads=12,
            hidden_dim=1536,
            num_experts=4,
            max_seq_len=max_seq_len,
            dropout=dropout,
        )
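

# Illustrative usage: a minimal sketch, not part of the library. The estimator
# below assumes a standard decoder-only MoE layout (tied embeddings, bias-free
# projections, one up/down FFN pair per expert); the real model may differ,
# so treat the number as a sanity check rather than an exact count.
def _rough_param_count(cfg: SlimMoEConfig) -> int:
    """Back-of-the-envelope check of the budget claimed in for_250m()."""
    embed = cfg.vocab_size * cfg.dim                # tied input/output embedding
    attn = 4 * cfg.dim * cfg.dim                    # Q, K, V, O projections
    expert = 2 * cfg.dim * cfg.hidden_dim           # up + down projection per expert
    moe = cfg.num_experts * expert + cfg.dim * cfg.num_experts  # experts + router
    return embed + cfg.num_hidden_layers * (attn + moe)


if __name__ == "__main__":
    config = SlimMoEConfig.for_250m(max_seq_len=1024)
    print(f"~{_rough_param_count(config) / 1e6:.0f}M parameters")  # ~227M

    # PretrainedConfig provides dict/JSON round-tripping out of the box.
    restored = SlimMoEConfig.from_dict(config.to_dict())
    assert restored.hidden_dim == config.hidden_dim == 1536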