SlimMoE-250M-instruct / configuration_slim_moe.py
SlimFactory's picture
Upload folder using huggingface_hub
e65ee65 verified
raw
history blame
1.72 kB
from transformers import PretrainedConfig
class SlimMoEConfig(PretrainedConfig):
    """Configuration for the SlimMoE mixture-of-experts language model.

    Args:
        vocab_size: Size of the token vocabulary.
        dim: Hidden (embedding) dimension of the transformer.
        num_hidden_layers: Number of transformer blocks.
        num_heads: Number of attention heads per block.
        hidden_dim: Inner dimension of each expert feed-forward network.
        num_experts: Number of experts per MoE layer.
        max_seq_len: Maximum supported sequence length.
        dropout: Dropout probability.
        adaptive_routing: Whether adaptive expert routing is enabled.
        **kwargs: Additional options forwarded to ``PretrainedConfig``.
    """

    model_type = "slim_moe"

    def __init__(
        self,
        vocab_size: int = 50257,
        dim: int = 768,
        num_hidden_layers: int = 12,
        num_heads: int = 12,
        hidden_dim: int = 2048,
        num_experts: int = 4,
        max_seq_len: int = 2048,
        dropout: float = 0.1,
        adaptive_routing: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.num_experts = num_experts
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.adaptive_routing = adaptive_routing
        # Force automatic weight tying so PreTrainedModel's post_init ties the
        # input/output embeddings.  PretrainedConfig.__init__ pops
        # "tie_word_embeddings" from kwargs (default True) and re-assigns the
        # attribute, so merely setting self.tie_word_embeddings before the
        # super() call is silently overwritten whenever the caller passes a
        # value.  Injecting the override into kwargs makes the tie stick.
        kwargs["tie_word_embeddings"] = True
        super().__init__(**kwargs)

    @classmethod
    def for_250m(
        cls,
        vocab_size: int = 50257,
        max_seq_len: int = 2048,
        dropout: float = 0.1,
    ) -> "SlimMoEConfig":
        """Create the preset configuration targeting roughly 250M parameters.

        Uses dim=768, num_hidden_layers=16, num_heads=12, hidden_dim=1536,
        num_experts=4.  NOTE(review): the exact parameter count depends on the
        model implementation and is not computable from this file — the
        previous docstring's "280-290M … safely under 250M" was
        self-contradictory; verify the real count against the model code.
        """
        return cls(
            vocab_size=vocab_size,
            dim=768,
            num_hidden_layers=16,
            num_heads=12,
            hidden_dim=1536,
            num_experts=4,
            max_seq_len=max_seq_len,
            dropout=dropout,
        )