joerowell committed
Commit 94107a2 · verified · 1 Parent(s): 7a9028a

Sync bundled HF code with upstream Laguna PR (v5 schema)

Files changed (1)
  1. configuration_laguna.py +172 -146
configuration_laguna.py CHANGED
@@ -1,5 +1,4 @@
-# ruff: noqa
-# Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2026 Poolside and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,79 +11,44 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any, Literal
+
+from huggingface_hub.dataclasses import strict
+
 from transformers.configuration_utils import PreTrainedConfig
 from transformers.modeling_rope_utils import RopeParameters
+from transformers.utils import auto_docstring


+@auto_docstring(checkpoint="poolside/laguna-XS.2")
+@strict
 class LagunaConfig(PreTrainedConfig):
     r"""
-    Configuration class for Laguna model.
-
-    Laguna is Poolside's MoE architecture with:
-    - Attention output gating (softplus gate)
-    - Sigmoid routing instead of softmax
-    - No QKV bias
-    - Explicit head_dim parameter
-
-    Args:
-        head_dim (`int`, *optional*, defaults to 128):
-            Dimension of attention heads. Laguna uses explicit head_dim rather than
-            computing it from hidden_size // num_attention_heads.
-        qkv_bias (`bool`, *optional*, defaults to `False`):
-            Whether to add bias to QKV projections. Laguna uses no QKV bias.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to add bias to attention output projection. Laguna uses no attention bias.
-        gating (`bool`, *optional*, defaults to `True`):
-            Whether to use softplus output gating on attention. When True, a g_proj linear
-            layer is added and attn_output = attn_output * softplus(g_proj(x)).
-        sliding_window (`int`, *optional*):
-            Sliding window attention size. Used by layers whose type in ``layer_types``
-            is ``"sliding_attention"``. When ``None``, all layers use full attention.
-        layer_types (`list[str]`, *optional*):
-            Per-layer attention type. Each element should be ``"sliding_attention"`` or
-            ``"global_attention"``. Length must equal ``num_hidden_layers``. When ``None``,
-            all layers default to global attention.
-        swa_attention_sink_enabled (`bool`, *optional*, defaults to `False`):
-            Whether to enable learnable attention sinks on sliding-window attention layers.
-            When enabled, a per-head bias parameter is added that allows the model to attend
-            to position 0 even when it falls outside the sliding window.
-        swa_rope_parameters (`RopeParameters`, *optional*):
-            Separate RoPE configuration for sliding-window attention layers. When ``None``,
-            SWA layers use the same RoPE as global attention layers.
-        vocab_size (`int`, *optional*, defaults to 100352):
-            Vocabulary size of the Laguna model.
-        hidden_size (`int`, *optional*, defaults to 2048):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 8192):
-            Dimension of the MLP representations for dense layers.
-        num_hidden_layers (`int`, *optional*, defaults to 48):
-            Number of hidden layers in the Transformer.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads.
-        num_key_value_heads (`int`, *optional*, defaults to 8):
-            Number of key-value heads for GQA.
-        max_position_embeddings (`int`, *optional*, defaults to 4096):
-            Maximum sequence length.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
-            Epsilon for RMSNorm layers.
-        num_experts (`int`, *optional*, defaults to 256):
-            Number of routed experts.
-        num_experts_per_tok (`int`, *optional*, defaults to 16):
-            Number of experts selected per token (top-k).
-        moe_intermediate_size (`int`, *optional*, defaults to 1024):
-            Intermediate size of routed experts.
-        shared_expert_intermediate_size (`int`, *optional*, defaults to 1024):
-            Intermediate size of the shared expert.
-        norm_topk_prob (`bool`, *optional*, defaults to `True`):
-            Whether to normalize top-k routing probabilities.
-        decoder_sparse_step (`int`, *optional*, defaults to 1):
-            Frequency of MoE layers (1 = every layer is MoE after mlp_only_layers).
-        mlp_only_layers (`list[int]`, *optional*, defaults to `[0]`):
-            Layer indices that use dense MLP instead of MoE.
-        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
-            Auxiliary loss coefficient for load balancing.
-        rope_parameters (`RopeParameters`, *optional*):
-            RoPE configuration. Defaults to rope_theta=500000.0.
+    partial_rotary_factor (`float`, *optional*):
+        Fraction of ``head_dim`` to rotate. Folded into each ``rope_parameters[layer_type]``
+        entry by ``__post_init__``.
+    num_attention_heads_per_layer (`list[int]`, *optional*):
+        Per-layer override for ``num_attention_heads``. Length must equal ``num_hidden_layers``.
+    mlp_layer_types (`list[str]`, *optional*):
+        Per-layer MLP type — ``"dense"`` or ``"sparse"``. Length must equal
+        ``num_hidden_layers``. Defaults to first layer dense, rest sparse.
+    moe_routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+        Scalar applied to routed-expert output before combining with the shared-expert output.
+    moe_apply_router_weight_on_input (`bool`, *optional*, defaults to `False`):
+        Whether to apply router weights to the MoE input rather than the output. Not supported
+        in transformers yet; ``True`` will raise a ``NotImplementedError`` for now.
+    moe_router_logit_softcapping (`float`, *optional*, defaults to 0.0):
+        Scaling factor used when applying tanh softcapping to the MoE router logits.
+
+    Example:
+
+    ```python
+    >>> from transformers import LagunaModel, LagunaConfig
+
+    >>> configuration = LagunaConfig()
+    >>> model = LagunaModel(configuration)
+    >>> configuration = model.config
+    ```
     """

     model_type = "laguna"
@@ -93,11 +57,19 @@ class LagunaConfig(PreTrainedConfig):
         "layers.*.self_attn.q_proj": "colwise",
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.g_proj": "colwise",  # Laguna-specific gating projection
+        "layers.*.self_attn.g_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
+        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
+        "layers.*.mlp.experts": "moe_tp_experts",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
@@ -105,83 +77,137 @@ class LagunaConfig(PreTrainedConfig):
         "norm": (["hidden_states"], ["hidden_states"]),
     }

-    def __init__(
-        self,
-        vocab_size: int = 100352,
-        hidden_size: int = 2048,
-        intermediate_size: int = 8192,
-        num_hidden_layers: int = 48,
-        num_attention_heads: int = 32,
-        num_key_value_heads: int = 8,
-        head_dim: int = 128,
-        qkv_bias: bool = False,
-        attention_bias: bool = False,
-        gating: bool = True,
-        hidden_act: str = "silu",
-        max_position_embeddings: int = 4096,
-        initializer_range: float = 0.02,
-        rms_norm_eps: float = 1e-6,
-        use_cache: bool = True,
-        tie_word_embeddings: bool = False,
-        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
-        attention_dropout: float = 0.0,
-        sliding_window: int | None = None,
-        layer_types: list[str] | None = None,
-        swa_attention_sink_enabled: bool = False,
-        swa_rope_parameters: RopeParameters | None = None,
-        num_experts: int = 256,
-        num_experts_per_tok: int = 16,
-        moe_intermediate_size: int = 1024,
-        shared_expert_intermediate_size: int = 1024,
-        norm_topk_prob: bool = True,
-        decoder_sparse_step: int = 1,
-        mlp_only_layers: list[int] | None = None,
-        router_aux_loss_coef: float = 0.001,
-        output_router_logits: bool = False,
-        **kwargs,
-    ):
-        # Default mlp_only_layers: first layer is dense (moe_first_k_dense_replace=1)
-        if mlp_only_layers is None:
-            mlp_only_layers = [0]
-
-        # Default rope_parameters with Laguna's theta
-        if rope_parameters is None:
-            rope_parameters = {"rope_type": "default", "rope_theta": 500000.0}
-
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.head_dim = head_dim
-        self.qkv_bias = qkv_bias
-        self.attention_bias = attention_bias
-        self.gating = gating
-        self.hidden_act = hidden_act
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
+    # Qwen2Moe-inherited defaults we want to override for Laguna's typical shape.
+    vocab_size: int = 100352
+    hidden_size: int = 2048
+    intermediate_size: int = 8192
+    num_hidden_layers: int = 40
+    num_attention_heads: int = 48
+    num_key_value_heads: int = 8
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 131072
+    initializer_range: float = 0.02
+    rms_norm_eps: float = 1e-6
+    use_cache: bool = True
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    sliding_window: int | None = None
+    attention_dropout: float | int = 0.0
+    moe_intermediate_size: int = 512
+    shared_expert_intermediate_size: int = 512
+    num_experts_per_tok: int = 8
+    num_experts: int = 256
+    output_router_logits: bool = False
+    router_aux_loss_coef: float = 0.001
+    layer_types: list[str] | None = None
+    pad_token_id: int | None = None
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+
+    # Laguna-specific attention
+    head_dim: int = 128
+    attention_bias: bool = False
+    partial_rotary_factor: float | None = None
+    num_attention_heads_per_layer: list[int] | None = None
+    # Laguna-specific MoE
+    mlp_layer_types: list[str] | None = None
+    moe_routed_scaling_factor: float = 1.0
+    moe_apply_router_weight_on_input: bool = False
+    moe_router_logit_softcapping: float = 0.0
+
+    def __post_init__(self, **kwargs):
+        if self.layer_types is None:
+            self.layer_types = ["full_attention"] * self.num_hidden_layers
+        if self.mlp_layer_types is None:
+            self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1)
+        if self.num_attention_heads_per_layer is None:
+            self.num_attention_heads_per_layer = [self.num_attention_heads] * self.num_hidden_layers
+
+        default_rope_params: dict[Literal["full_attention", "sliding_attention"], dict[str, Any]] = {
+            "full_attention": {"rope_type": "default", "rope_theta": 500000.0},
+            "sliding_attention": {"rope_type": "default", "rope_theta": 10000.0},
+        }
+        if self.rope_parameters is None:
+            self.rope_parameters = default_rope_params
+
+        self._normalize_rope_parameters()
+        # Skip ``Qwen2MoeConfig.__post_init__``: it references ``mlp_only_layers`` /
+        # ``use_sliding_window`` / ``max_window_layers``, which Laguna drops above.
+        super().__post_init__(**kwargs)
+
+    def _normalize_rope_parameters(self):
+        """Coerce ``rope_parameters`` to the nested ``{layer_type: {...}}`` shape.
+
+        Accepts an already-nested dict as-is, or a flat dict that gets broadcast to every
+        layer type. A top-level ``partial_rotary_factor`` is folded into each sub-dict as
+        a default.
+        """
+        layer_types = set(self.layer_types)
+        rope_params = self.rope_parameters or {}
+        is_nested = isinstance(rope_params, dict) and any(k in layer_types for k in rope_params)
+        if is_nested:
+            nested = {lt: dict(rope_params.get(lt, {})) for lt in layer_types}
+        else:
+            nested = {lt: dict(rope_params) for lt in layer_types}
+
+        if self.partial_rotary_factor is not None:
+            for params in nested.values():
+                params.setdefault("partial_rotary_factor", self.partial_rotary_factor)
+
+        for params in nested.values():
+            params.setdefault("rope_type", "default")
+
+        self.rope_parameters = nested
+        # Null the top-level field now that its value lives in each sub-dict — otherwise
+        # ``standardize_rope_params`` would overwrite per-type values with the global one.
+        self.partial_rotary_factor = None
+
+    def convert_rope_params_to_dict(self, **kwargs):
+        # No need to handle BC for new models, because they have no old-format `rope_scaling`
+        return kwargs
+
+    def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys=None):
+        """Override: parent reads ``self.rope_parameters["original_max_position_embeddings"]``
+        for its post-hoc factor sanity-check, which works for flat rope configs but raises
+        ``KeyError`` when ``self.rope_parameters`` is the Laguna/Gemma3-style per-layer-type
+        map (its keys are layer types like ``"full_attention"``). Fix locally by reading
+        from the per-call ``rope_parameters`` dict that ``validate_rope`` already passes in.
+        """
+        # Delegate to parent for the shared checks by temporarily swapping in a flat
+        # ``self.rope_parameters`` that has the key the parent expects. Cheapest way to
+        # share the parent's logic without reimplementing it here.
+        flat = getattr(self, "rope_parameters", None)
         self.rope_parameters = rope_parameters
-        self.attention_dropout = attention_dropout
-        # Sliding window attention arguments
-        self.sliding_window = sliding_window
-        self.layer_types = layer_types
-        self.swa_attention_sink_enabled = swa_attention_sink_enabled
-        self.swa_rope_parameters = swa_rope_parameters
-        # MoE arguments
-        self.num_experts = num_experts
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_intermediate_size = moe_intermediate_size
-        self.shared_expert_intermediate_size = shared_expert_intermediate_size
-        self.norm_topk_prob = norm_topk_prob
-        self.decoder_sparse_step = decoder_sparse_step
-        self.mlp_only_layers = mlp_only_layers
-        self.router_aux_loss_coef = router_aux_loss_coef
-        self.output_router_logits = output_router_logits
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        try:
+            super()._validate_yarn_rope_parameters(rope_parameters, ignore_keys=ignore_keys)
+        finally:
+            self.rope_parameters = flat
+
+    def validate_architecture(self):
+        """Part of ``@strict``-powered validation."""
+        if self.moe_apply_router_weight_on_input:
+            raise NotImplementedError(
+                "moe_apply_router_weight_on_input=True is not yet supported in the "
+                "transformers implementation of Laguna."
+            )
+        if (
+            self.num_attention_heads_per_layer is not None
+            and len(self.num_attention_heads_per_layer) != self.num_hidden_layers
+        ):
+            raise ValueError(
                f"num_attention_heads_per_layer length ({len(self.num_attention_heads_per_layer)}) "
                f"must equal num_hidden_layers ({self.num_hidden_layers})."
            )
+        if len(self.layer_types) != self.num_hidden_layers:
+            raise ValueError(
                f"layer_types length ({len(self.layer_types)}) "
                f"must equal num_hidden_layers ({self.num_hidden_layers})."
            )
+        if len(self.mlp_layer_types) != self.num_hidden_layers:
+            raise ValueError(
                f"mlp_layer_types length ({len(self.mlp_layer_types)}) "
                f"must equal num_hidden_layers ({self.num_hidden_layers})."
            )


 __all__ = ["LagunaConfig"]
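
For readers skimming the new `__post_init__` / `_normalize_rope_parameters` path above: a minimal standalone sketch of the flat-to-nested broadcast it performs. This re-implements the logic outside the class purely for illustration, and the example values are made up.

```python
from typing import Any


def normalize_rope(rope_params: dict[str, Any] | None,
                   layer_types: list[str],
                   partial_rotary_factor: float | None = None) -> dict[str, dict[str, Any]]:
    """Broadcast a flat rope dict to {layer_type: {...}}, mirroring _normalize_rope_parameters."""
    type_set = set(layer_types)
    rope_params = rope_params or {}
    # Already nested (keys are layer types) -> keep the per-type entries as-is.
    if any(key in type_set for key in rope_params):
        nested = {lt: dict(rope_params.get(lt, {})) for lt in type_set}
    else:
        # Flat (e.g. {"rope_theta": 500000.0}) -> same settings for every layer type.
        nested = {lt: dict(rope_params) for lt in type_set}
    for params in nested.values():
        if partial_rotary_factor is not None:
            params.setdefault("partial_rotary_factor", partial_rotary_factor)
        params.setdefault("rope_type", "default")
    return nested


# A flat dict is broadcast to both attention layer types; a nested one passes through.
print(normalize_rope({"rope_theta": 500000.0}, ["full_attention", "sliding_attention"], 0.5))
```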
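The removed docstring is the only place the softplus output gating is spelled out (`attn_output = attn_output * softplus(g_proj(x))`), while `g_proj` itself stays in the new `base_model_tp_plan`. A rough PyTorch sketch of that formula, with illustrative shapes taken from the config defaults; this is not the bundled modeling code.

```python
import torch
import torch.nn.functional as F

hidden_size, num_heads, head_dim = 2048, 48, 128   # defaults from the new config
g_proj = torch.nn.Linear(hidden_size, num_heads * head_dim, bias=False)

x = torch.randn(1, 16, hidden_size)                      # residual-stream input to attention
attn_output = torch.randn(1, 16, num_heads * head_dim)   # stand-in for the attention output
gated = attn_output * F.softplus(g_proj(x))              # softplus gate from the removed docstring
print(gated.shape)                                       # torch.Size([1, 16, 6144])
```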
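The docstrings only describe the router hyperparameters (sigmoid routing, top-k selection, tanh softcapping, routed scaling); the modeling code is not part of this diff. A hedged sketch of how such a router is commonly wired, using the config defaults where they exist and a made-up non-zero softcap.

```python
import torch

hidden_size, num_experts, top_k = 2048, 256, 8     # defaults from the new config
softcap = 30.0                                     # hypothetical; the default 0.0 disables capping
routed_scaling_factor = 1.0                        # moe_routed_scaling_factor default

tokens = torch.randn(4, hidden_size)               # 4 example tokens
router = torch.nn.Linear(hidden_size, num_experts, bias=False)

logits = router(tokens)
if softcap > 0:                                    # moe_router_logit_softcapping
    logits = softcap * torch.tanh(logits / softcap)
scores = torch.sigmoid(logits)                     # sigmoid routing, per the removed docstring
weights, expert_ids = scores.topk(top_k, dim=-1)   # num_experts_per_tok
weights = weights / weights.sum(-1, keepdim=True)  # renormalize the kept routing weights

# Selected experts' outputs would be weighted by `weights`, summed, multiplied by
# routed_scaling_factor, and added to the shared expert's output.
print(expert_ids.shape, weights.shape)
```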