| """ |
| DomainTransformer Configuration. |
| |
| HF-compatible config following Nubank nuFormer architecture choices: |
| - GPT-style causal decoder |
| - NoPE (no positional encoding) by default — Kazemnejad et al. 2023 |
| - Pre-norm (LayerNorm before attention and FFN) |
| - Weight-tied embedding ↔ LM head |
- Preset sizes: 24M (d=512, L=6), 85M (d=768, L=12), and 330M (d=1024, L=24)
| """ |
|
from typing import Optional

from transformers import PretrainedConfig
|
|
| class DomainTransformerConfig(PretrainedConfig): |
| """Configuration for DomainTransformer causal language model. |
| |
| This config produces a GPT-style decoder-only Transformer with: |
| - No positional encoding (NoPE) by default |
| - Pre-norm architecture (LayerNorm before attn/FFN) |
| - GELU activation in FFN |
| - Weight tying between token embeddings and LM head |
| |
| Predefined sizes following Nubank nuFormer (arXiv:2507.23267): |
| - "24m": 6 layers, d_model=512, 8 heads, FFN=2048 (~24M params) |
| - "85m": 12 layers, d_model=768, 12 heads, FFN=3072 (~85M params) |
| - "330m": 24 layers, d_model=1024, 16 heads, FFN=4096 (~330M params) |
| |
| Args: |
| vocab_size: Size of the token vocabulary. |
| hidden_size: Dimension of hidden representations (d_model). |
| num_hidden_layers: Number of transformer blocks. |
| num_attention_heads: Number of attention heads. |
| intermediate_size: FFN intermediate dimension (default: 4 * hidden_size). |
| hidden_act: Activation function in FFN. |
| hidden_dropout_prob: Dropout rate for embeddings and residual connections. |
| attention_probs_dropout_prob: Dropout rate for attention weights. |
| max_position_embeddings: Maximum sequence length (for buffer sizing, not PE). |
| initializer_range: Std for weight initialization (normal distribution). |
| layer_norm_eps: Epsilon for LayerNorm. |
| use_cache: Whether to return past key values for generation. |
| tie_word_embeddings: Whether to tie input/output embeddings. |
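
    Example:
        >>> config = DomainTransformerConfig.from_preset("24m")
        >>> (config.hidden_size, config.num_hidden_layers, config.intermediate_size)
        (512, 6, 2048)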
| """ |
|
| model_type = "domain_transformer" |
|
| def __init__( |
| self, |
| vocab_size: int = 32000, |
| hidden_size: int = 512, |
| num_hidden_layers: int = 6, |
| num_attention_heads: int = 8, |
        intermediate_size: Optional[int] = None,
| hidden_act: str = "gelu", |
| hidden_dropout_prob: float = 0.0, |
| attention_probs_dropout_prob: float = 0.0, |
| max_position_embeddings: int = 2048, |
| initializer_range: float = 0.02, |
| layer_norm_eps: float = 1e-5, |
| use_cache: bool = True, |
| tie_word_embeddings: bool = True, |
| **kwargs, |
| ): |
| self.vocab_size = vocab_size |
| self.hidden_size = hidden_size |
| self.num_hidden_layers = num_hidden_layers |
| self.num_attention_heads = num_attention_heads |
| self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size |
| self.hidden_act = hidden_act |
| self.hidden_dropout_prob = hidden_dropout_prob |
| self.attention_probs_dropout_prob = attention_probs_dropout_prob |
| self.max_position_embeddings = max_position_embeddings |
| self.initializer_range = initializer_range |
| self.layer_norm_eps = layer_norm_eps |
| self.use_cache = use_cache |
|
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by "
                f"num_attention_heads ({num_attention_heads})"
            )
|
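        # tie_word_embeddings is consumed by PretrainedConfig.__init__, which
        # stores it along with any remaining HF kwargs (e.g. bos/eos token ids).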
| super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) |
|
| @classmethod |
| def from_preset(cls, name: str, vocab_size: int = 32000, **overrides) -> "DomainTransformerConfig": |
| """Create config from a named preset. |
| |
| Presets: |
| "24m": ~24M params (6 layers, d=512, 8 heads) |
| "85m": ~85M params (12 layers, d=768, 12 heads) |
| "330m": ~330M params (24 layers, d=1024, 16 heads) |
| """ |
| presets = { |
| "24m": dict(hidden_size=512, num_hidden_layers=6, num_attention_heads=8, intermediate_size=2048), |
| "85m": dict(hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072), |
| "330m": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096), |
| } |
| if name not in presets: |
| raise ValueError(f"Unknown preset '{name}'. Available: {list(presets.keys())}") |
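        # Dict merge order means explicit overrides take precedence over
        # both the preset values and the vocab_size argument.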
| params = {**presets[name], "vocab_size": vocab_size, **overrides} |
| return cls(**params) |
|
|
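if __name__ == "__main__":
    # Minimal usage sketch (not part of the library API): build each preset,
    # round-trip it through the standard PretrainedConfig
    # save_pretrained/from_pretrained serialization, and print key dimensions.
    import tempfile

    for preset in ("24m", "85m", "330m"):
        config = DomainTransformerConfig.from_preset(preset)
        with tempfile.TemporaryDirectory() as tmp:
            config.save_pretrained(tmp)
            reloaded = DomainTransformerConfig.from_pretrained(tmp)
        assert reloaded.hidden_size == config.hidden_size
        print(
            f"{preset}: layers={config.num_hidden_layers}, "
            f"d_model={config.hidden_size}, heads={config.num_attention_heads}, "
            f"ffn={config.intermediate_size}"
        )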