from transformers import PretrainedConfig


class SlimMoEConfig(PretrainedConfig):
    model_type = "slim_moe"

    def __init__(
        self,
        vocab_size: int = 50257,
        dim: int = 768,
        num_hidden_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 2048,
        num_experts: int = 4,
        max_seq_len: int = 2048,
        dropout: float = 0.1,
        adaptive_routing: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.num_experts = num_experts
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.adaptive_routing = adaptive_routing

        # Enable automatic weight tying by the framework: PreTrainedModel's
        # post_init() ties the input and output embeddings when this flag is set.
        # Route it through kwargs so PretrainedConfig.__init__ does not overwrite it.
        kwargs.setdefault("tie_word_embeddings", True)

        super().__init__(**kwargs)

    @classmethod
    def for_250m(cls, vocab_size: int = 50257, max_seq_len: int = 2048, dropout: float = 0.1):
        """
        Create a configuration targeting roughly 250M parameters.

        Uses: dim=768, layers=16, heads=12, hidden_dim=1536, experts=4.
        """
        return cls(
            vocab_size=vocab_size,
            dim=768,
            num_hidden_layers=16,
            num_heads=12,
            hidden_dim=1536,
            num_experts=4,
            max_seq_len=max_seq_len,
            dropout=dropout,
        )
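

# Illustrative usage sketch (an assumption, not part of the original file): it shows
# how the config is expected to be built via the preset and round-tripped to disk
# using the standard Hugging Face PretrainedConfig save/load API. The output path
# "./slim_moe_250m" is a placeholder.
if __name__ == "__main__":
    # Default 12-layer configuration.
    base_cfg = SlimMoEConfig()

    # ~250M-parameter preset with a longer context window.
    cfg_250m = SlimMoEConfig.for_250m(max_seq_len=4096)

    # Serialize and reload through the standard config API.
    cfg_250m.save_pretrained("./slim_moe_250m")
    reloaded = SlimMoEConfig.from_pretrained("./slim_moe_250m")
    assert reloaded.num_hidden_layers == 16
    assert reloaded.tie_word_embeddings is True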