joerowell committed
Commit 4d48297 · verified · 1 parent: a4fde8e

Upload Laguna-XS.2 checkpoint

chat_template.jinja ADDED
@@ -0,0 +1,132 @@
+ {#- Copied from laguna_glm_thinking_v4/chat_template.jinja -#}
+ {#- Removes prefix that references <think> token, and replaces message.reasoning_content reference with message.reasoning -#}
+ {{- "〈|EOS|〉" -}}
+ {%- set enable_thinking = enable_thinking | default(false) -%}
+ {%- set render_assistant_messages_raw = render_assistant_messages_raw | default(false) -%}
+ {%- set add_generation_prompt = add_generation_prompt | default(false) -%}
+
+ {#- ───── header (system message) ───── -#}
+ {%- set system_message = "" -%}
+ {%- if messages and messages[0].role == "system" -%}
+ {%- set system_message = messages[0].content -%}
+ {%- endif -%}
+
+ {%- if (system_message and system_message.strip()) or tools -%}
+ {{- "<system>\n" -}}
+
+ {%- if system_message and system_message.strip() -%}
+ {{- "\n" -}}
+ {{- system_message.rstrip() -}}
+ {%- endif -%}
+
+ {%- if tools -%}
+ {{- "\n\n### Tools\n\n" -}}
+ {%- set ns = namespace(tool_string="You may call functions to assist with the user query.\n"
+ ~ "All available function signatures are listed below:\n"
+ ~ "<available_tools>\n") -%}
+ {%- for tool in tools -%}
+ {%- set ns.tool_string = ns.tool_string ~ (tool | tojson) ~ "\n" -%}
+ {%- endfor -%}
+ {%- if enable_thinking -%}
+ {%- set tool_string = ns.tool_string + "</available_tools>\n\n" ~
+ "Wrap your thinking in '<think>', '</think>' tags, followed by a function call. For each function call, return an unescaped XML-like object with function name and arguments within '<tool_call>' and '</tool_call>' tags, like here:\n" ~
+ "<think> your thoughts here </think>\n" ~
+ "<tool_call>function-name\n<arg_key>argument-key</arg_key>\n<arg_value>value-of-argument-key</arg_value>\n" ~
+ "</tool_call>" -%}
+ {%- else -%}
+ {%- set tool_string = ns.tool_string + "</available_tools>\n\n" ~
+ "For each function call, return an unescaped XML-like object " ~
+ "with function name and arguments within '<tool_call>' and '</tool_call>' tags, like here:\n" ~
+ "<tool_call>function-name\n<arg_key>argument-key</arg_key>\n<arg_value>value-of-argument-key</arg_value>\n" ~
+ "</tool_call>" -%}
+ {%- endif -%}
+ {{- tool_string -}}
+ {%- endif -%}
+
+ {{- "\n</system>\n" -}}
+ {%- endif -%}
+
+ {#- ───── main loop ───── -#}
+ {%- for message in messages -%}
+ {%- set content = message.content if message.content is string else "" -%}
+ {%- if message.role == "user" -%}
+ {{- "<user>\n" + content + "\n</user>\n" -}}
+ {%- elif message.role == "assistant" -%}
+ {%- generation -%}
+ {{- "<assistant>\n" -}}
+ {%- if render_assistant_messages_raw -%}
+ {#- Raw mode: prepend the generation prompt token, then dump content verbatim. -#}
+ {#- The generation prompt is <think> when enable_thinking, </think> otherwise. -#}
+ {#- Only prepend if content doesn't already start with it. -#}
+ {%- if enable_thinking -%}
+ {%- if not content.startswith('<think>') -%}
+ {{- '<think>' -}}
+ {%- endif -%}
+ {%- else -%}
+ {%- if not content.startswith('</think>') -%}
+ {{- '</think>' -}}
+ {%- endif -%}
+ {%- endif -%}
+ {{- content -}}
+ {#- Append closing tag if content doesn't already end with it. -#}
+ {%- if not content.endswith('</assistant>\n') and not content.endswith('</assistant>') -%}
+ {{- '\n</assistant>' -}}
+ {%- endif -%}
+ {{- "\n" -}}
+ {%- else -%}
+ {#- Extract reasoning content from message.reasoning (vLLM field name) or message.reasoning_content, or from <think> tags -#}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning is string %}
+ {%- set reasoning_content = message.reasoning %}
+ {%- elif message.reasoning_content is string %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- endif %}
+ {#- Always strip <think> tags from content if present to avoid duplication -#}
+ {%- if '</think>' in content %}
+ {%- if not reasoning_content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {#- Display reasoning content for all messages -#}
+ {%- if reasoning_content -%}
+ {{- '<think>\n' + reasoning_content.strip() + '\n</think>\n' -}}
+ {%- else -%}
+ {{- '</think>\n' -}}
+ {%- endif -%}
+ {#- Display main content -#}
+ {%- if content.strip() -%}
+ {{- content.strip() ~ "\n" -}}
+ {%- endif -%}
+ {%- if message.tool_calls -%}
+ {%- for tool_call in message.tool_calls -%}
+ {%- set function_data = tool_call.function -%}
+ {{- '<tool_call>' + function_data.name }}
+ {% set _args = function_data.arguments %}
+ {%- for k, v in _args.items() -%}
+ {{- "<arg_key>" ~ k ~ "</arg_key>\n" -}}
+ {{- "<arg_value>"}}{{ v | tojson(ensure_ascii=False) if v is not string else v }}{{ "</arg_value>\n" -}}
+ {%- endfor -%}
+ {{- "</tool_call>\n" -}}
+ {%- endfor -%}
+ {%- endif -%}
+ {{- "</assistant>\n" -}}
+ {%- endif -%}
+ {%- endgeneration -%}
+ {%- elif message.role == "tool" -%}
+ {{- "<tool_response>\n" + content + "\n</tool_response>\n" -}}
+ {%- elif message.role == "system" and loop.index0 != 0 -%}
+ {#- Render additional system messages (skip the first one which is handled separately in the header) -#}
+ {{- "<system>\n" + content + "\n</system>\n" -}}
+ {%- endif -%}
+ {%- endfor -%}
+ {#- ───── generation prompt ───── -#}
+ {%- if add_generation_prompt -%}
+ {{- "<assistant>\n" -}}
+ {#- ───── Include reasoning mode directive ───── -#}
+ {%- if not enable_thinking %}
+ {{- '</think>' -}}
+ {%- else %}
+ {{- '<think>' -}}
+ {%- endif %}
+ {%- endif -%}
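
A minimal usage sketch for the new template, for orientation: the local path "./Laguna-XS.2" is hypothetical, and it assumes the installed transformers version forwards extra keyword arguments such as `enable_thinking` to the Jinja template.

```python
# Hedged sketch: render the chat template added above via transformers.
# "./Laguna-XS.2" is a hypothetical local path to this checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./Laguna-XS.2")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # template variable; defaults to false in the template
)
print(prompt)
```

With `enable_thinking=True` and `add_generation_prompt=True`, the rendered prompt should end with `<assistant>\n<think>`, matching the generation-prompt branch at the bottom of the template.
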
config.json CHANGED
@@ -15,7 +15,6 @@
15
  "num_key_value_heads": 8,
16
  "head_dim": 128,
17
  "max_position_embeddings": 131072,
18
- "qkv_bias": false,
19
  "attention_bias": false,
20
  "attention_dropout": 0.0,
21
  "rms_norm_eps": 1e-06,
@@ -23,12 +22,7 @@
23
  "num_experts_per_tok": 8,
24
  "moe_intermediate_size": 512,
25
  "shared_expert_intermediate_size": 512,
26
- "norm_topk_prob": true,
27
- "router_aux_loss_coef": 0.001,
28
- "decoder_sparse_step": 1,
29
- "mlp_only_layers": [
30
- 0
31
- ],
32
  "bos_token_id": 2,
33
  "eos_token_id": [
34
  2,
@@ -38,16 +32,24 @@
38
  "tie_word_embeddings": false,
39
  "use_cache": true,
40
  "torch_dtype": "bfloat16",
41
- "gating": "per-head",
42
  "sliding_window": 512,
43
  "rope_parameters": {
44
- "rope_theta": 500000.0,
45
- "rope_type": "yarn",
46
- "factor": 32.0,
47
- "original_max_position_embeddings": 4096,
48
- "beta_slow": 1.0,
49
- "beta_fast": 64.0,
50
- "attention_factor": 1.0
51
  },
52
  "layer_types": [
53
  "full_attention",
@@ -91,6 +93,52 @@
91
  "sliding_attention",
92
  "sliding_attention"
93
  ],
94
  "num_attention_heads_per_layer": [
95
  48,
96
  64,
@@ -133,19 +181,22 @@
133
  64,
134
  64
135
  ],
136
- "swa_rope_parameters": {
137
- "rope_theta": 10000.0,
138
- "rope_type": "linear",
139
- "factor": 1.0,
140
- "partial_rotary_factor": 1.0
 
 
 
141
  },
142
- "moe_router_use_sigmoid": true,
143
- "moe_apply_router_weight_on_input": false,
144
- "moe_shared_gate": false,
145
- "moe_routed_scaling_factor": 2.5,
146
- "qk_norm_type": "rmsnorm",
147
- "norm_type": "rmsnorm",
148
- "rope_style": "rotate-half",
149
- "partial_rotary_factor": 0.5,
150
- "swa_attention_sink_enabled": false
151
  }
 
15
  "num_key_value_heads": 8,
16
  "head_dim": 128,
17
  "max_position_embeddings": 131072,
 
18
  "attention_bias": false,
19
  "attention_dropout": 0.0,
20
  "rms_norm_eps": 1e-06,
 
22
  "num_experts_per_tok": 8,
23
  "moe_intermediate_size": 512,
24
  "shared_expert_intermediate_size": 512,
25
+ "router_aux_loss_coef": 0.0,
 
 
 
 
 
26
  "bos_token_id": 2,
27
  "eos_token_id": [
28
  2,
 
32
  "tie_word_embeddings": false,
33
  "use_cache": true,
34
  "torch_dtype": "bfloat16",
35
+ "gating": true,
36
  "sliding_window": 512,
37
  "rope_parameters": {
38
+ "full_attention": {
39
+ "rope_theta": 500000.0,
40
+ "rope_type": "yarn",
41
+ "factor": 32.0,
42
+ "original_max_position_embeddings": 4096,
43
+ "beta_slow": 1.0,
44
+ "beta_fast": 64.0,
45
+ "attention_factor": 1.0,
46
+ "partial_rotary_factor": 0.5
47
+ },
48
+ "sliding_attention": {
49
+ "rope_type": "default",
50
+ "rope_theta": 10000.0,
51
+ "partial_rotary_factor": 1.0
52
+ }
53
  },
54
  "layer_types": [
55
  "full_attention",
 
93
  "sliding_attention",
94
  "sliding_attention"
95
  ],
96
+ "moe_apply_router_weight_on_input": false,
97
+ "partial_rotary_factor": 0.5,
98
+ "mlp_layer_types": [
99
+ "dense",
100
+ "sparse",
101
+ "sparse",
102
+ "sparse",
103
+ "sparse",
104
+ "sparse",
105
+ "sparse",
106
+ "sparse",
107
+ "sparse",
108
+ "sparse",
109
+ "sparse",
110
+ "sparse",
111
+ "sparse",
112
+ "sparse",
113
+ "sparse",
114
+ "sparse",
115
+ "sparse",
116
+ "sparse",
117
+ "sparse",
118
+ "sparse",
119
+ "sparse",
120
+ "sparse",
121
+ "sparse",
122
+ "sparse",
123
+ "sparse",
124
+ "sparse",
125
+ "sparse",
126
+ "sparse",
127
+ "sparse",
128
+ "sparse",
129
+ "sparse",
130
+ "sparse",
131
+ "sparse",
132
+ "sparse",
133
+ "sparse",
134
+ "sparse",
135
+ "sparse",
136
+ "sparse",
137
+ "sparse",
138
+ "sparse"
139
+ ],
140
+ "use_bidirectional_attention": false,
141
+ "moe_routed_scaling_factor": 2.5,
142
  "num_attention_heads_per_layer": [
143
  48,
144
  64,
 
181
  64,
182
  64
183
  ],
184
+ "compression_config": {
185
+ "mode": null,
186
+ "group_size": 32,
187
+ "eps": 1e-05,
188
+ "filter_fqns": [
189
+ "output"
190
+ ],
191
+ "recompute_fake_quantize": false
192
  },
193
+ "quantization_config": {
194
+ "mode": null,
195
+ "group_size": 32,
196
+ "eps": 1e-05,
197
+ "filter_fqns": [
198
+ "output"
199
+ ],
200
+ "recompute_fake_quantize": false
201
+ }
202
  }
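
The rope_parameters block is now keyed by attention type rather than being flat. A small sketch of consuming it per layer, based only on the JSON shown above and assuming a local copy of this config.json:

```python
# Hedged sketch: pick the RoPE parameters for each layer from the nested
# rope_parameters block, using the layer_types list in the same file.
import json

with open("config.json") as f:
    cfg = json.load(f)

for idx, layer_type in enumerate(cfg["layer_types"][:4]):
    rope = cfg["rope_parameters"][layer_type]
    print(idx, layer_type, rope["rope_theta"], rope.get("partial_rotary_factor"))
```
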
configuration_laguna.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -41,7 +42,7 @@ class LagunaConfig(PreTrainedConfig):
41
  is ``"sliding_attention"``. When ``None``, all layers use full attention.
42
  layer_types (`list[str]`, *optional*):
43
  Per-layer attention type. Each element should be ``"sliding_attention"`` or
44
- ``"full_attention"``. Length must equal ``num_hidden_layers``. When ``None``,
45
  all layers default to global attention.
46
  swa_attention_sink_enabled (`bool`, *optional*, defaults to `False`):
47
  Whether to enable learnable attention sinks on sliding-window attention layers.
@@ -115,7 +116,7 @@ class LagunaConfig(PreTrainedConfig):
115
  head_dim: int = 128,
116
  qkv_bias: bool = False,
117
  attention_bias: bool = False,
118
- gating: bool | str = True,
119
  hidden_act: str = "silu",
120
  max_position_embeddings: int = 4096,
121
  initializer_range: float = 0.02,
@@ -123,13 +124,11 @@ class LagunaConfig(PreTrainedConfig):
123
  use_cache: bool = True,
124
  tie_word_embeddings: bool = False,
125
  rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
126
- partial_rotary_factor: float = 1.0,
127
  attention_dropout: float = 0.0,
128
  sliding_window: int | None = None,
129
  layer_types: list[str] | None = None,
130
  swa_attention_sink_enabled: bool = False,
131
  swa_rope_parameters: RopeParameters | None = None,
132
- num_attention_heads_per_layer: list[int] | None = None,
133
  num_experts: int = 256,
134
  num_experts_per_tok: int = 16,
135
  moe_intermediate_size: int = 1024,
@@ -139,8 +138,6 @@ class LagunaConfig(PreTrainedConfig):
139
  mlp_only_layers: list[int] | None = None,
140
  router_aux_loss_coef: float = 0.001,
141
  output_router_logits: bool = False,
142
- moe_routed_scaling_factor: float = 1.0,
143
- moe_apply_router_weight_on_input: bool = False,
144
  **kwargs,
145
  ):
146
  # Default mlp_only_layers: first layer is dense (moe_first_k_dense_replace=1)
@@ -167,14 +164,12 @@ class LagunaConfig(PreTrainedConfig):
167
  self.rms_norm_eps = rms_norm_eps
168
  self.use_cache = use_cache
169
  self.rope_parameters = rope_parameters
170
- self.partial_rotary_factor = partial_rotary_factor
171
  self.attention_dropout = attention_dropout
172
  # Sliding window attention arguments
173
  self.sliding_window = sliding_window
174
  self.layer_types = layer_types
175
  self.swa_attention_sink_enabled = swa_attention_sink_enabled
176
  self.swa_rope_parameters = swa_rope_parameters
177
- self.num_attention_heads_per_layer = num_attention_heads_per_layer
178
  # MoE arguments
179
  self.num_experts = num_experts
180
  self.num_experts_per_tok = num_experts_per_tok
@@ -185,8 +180,6 @@ class LagunaConfig(PreTrainedConfig):
185
  self.mlp_only_layers = mlp_only_layers
186
  self.router_aux_loss_coef = router_aux_loss_coef
187
  self.output_router_logits = output_router_logits
188
- self.moe_routed_scaling_factor = moe_routed_scaling_factor
189
- self.moe_apply_router_weight_on_input = moe_apply_router_weight_on_input
190
 
191
  super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
192
 
 
1
+ # ruff: noqa
2
  # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
 
42
  is ``"sliding_attention"``. When ``None``, all layers use full attention.
43
  layer_types (`list[str]`, *optional*):
44
  Per-layer attention type. Each element should be ``"sliding_attention"`` or
45
+ ``"global_attention"``. Length must equal ``num_hidden_layers``. When ``None``,
46
  all layers default to global attention.
47
  swa_attention_sink_enabled (`bool`, *optional*, defaults to `False`):
48
  Whether to enable learnable attention sinks on sliding-window attention layers.
 
116
  head_dim: int = 128,
117
  qkv_bias: bool = False,
118
  attention_bias: bool = False,
119
+ gating: bool = True,
120
  hidden_act: str = "silu",
121
  max_position_embeddings: int = 4096,
122
  initializer_range: float = 0.02,
 
124
  use_cache: bool = True,
125
  tie_word_embeddings: bool = False,
126
  rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
 
127
  attention_dropout: float = 0.0,
128
  sliding_window: int | None = None,
129
  layer_types: list[str] | None = None,
130
  swa_attention_sink_enabled: bool = False,
131
  swa_rope_parameters: RopeParameters | None = None,
 
132
  num_experts: int = 256,
133
  num_experts_per_tok: int = 16,
134
  moe_intermediate_size: int = 1024,
 
138
  mlp_only_layers: list[int] | None = None,
139
  router_aux_loss_coef: float = 0.001,
140
  output_router_logits: bool = False,
 
 
141
  **kwargs,
142
  ):
143
  # Default mlp_only_layers: first layer is dense (moe_first_k_dense_replace=1)
 
164
  self.rms_norm_eps = rms_norm_eps
165
  self.use_cache = use_cache
166
  self.rope_parameters = rope_parameters
 
167
  self.attention_dropout = attention_dropout
168
  # Sliding window attention arguments
169
  self.sliding_window = sliding_window
170
  self.layer_types = layer_types
171
  self.swa_attention_sink_enabled = swa_attention_sink_enabled
172
  self.swa_rope_parameters = swa_rope_parameters
 
173
  # MoE arguments
174
  self.num_experts = num_experts
175
  self.num_experts_per_tok = num_experts_per_tok
 
180
  self.mlp_only_layers = mlp_only_layers
181
  self.router_aux_loss_coef = router_aux_loss_coef
182
  self.output_router_logits = output_router_logits
 
 
183
 
184
  super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
185
 
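For orientation, a hedged construction example for the simplified config: it passes only fields visible in the diff above, the values are illustrative rather than the checkpoint's shipped defaults, and it assumes the module is importable from a local checkout.

```python
# Hedged sketch: instantiate the simplified LagunaConfig with illustrative values.
from configuration_laguna import LagunaConfig

config = LagunaConfig(
    head_dim=128,
    gating=True,            # now a plain bool instead of bool | "per-head"
    sliding_window=512,
    num_experts=256,
    num_experts_per_tok=8,
    router_aux_loss_coef=0.0,
)
print(config.sliding_window, config.num_experts_per_tok)
```
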
model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:096bec47fccb4e593cda439e96c441b4df24da603f6996ad4cc2f42b07b62979
3
  size 5120041576
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3abd724208b29f3db5e9f4cc30e7eaa34184a9fa8eb371398bceba4cbfd5c5d
3
  size 5120041576
model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b033cde77d0dfc467217228ac1fe56955da6f6f0539d217c0e87bc9c6141a02
3
  size 5119449520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca23cb6e0d937ebc639873501cec52901d2b6cb533287d8dc6665ca3ee867cd2
3
  size 5119449520
model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4322f9a3659ac1b3f1aa6445d23e00294b876d76c2dcb940b103a94afb68290
3
  size 5119449504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e178aeae8b9e195af1cee84a229ca61c5d08151949130f861b771ca14400de9
3
  size 5119449504
model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc9a1c934aa3e438031f7272ab103fc42d8dbbaad5b35a6a9041fe8b2615c03b
3
  size 5119450272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60e319fd813ae3277bc065a00495bb4baf34ed34c503bd16d3b30d006b2ca120
3
  size 5119450272
model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52aac8a7fb885688771a7c74a9d06e62b57cdbbecb5282347e7d9c9ad0ebf59c
3
  size 5119451824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691181b3106c2aa23d1203e636cb53f689b6fdc3525ce6b39d9f8b1673f030d6
3
  size 5119451824
model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05f9030c4d16a4b858e31cd470511784d3917a2a6f023ed8a5362bb239b7997c
3
  size 5119451944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f154810446ac59484e37d322098203ddb1f26753705c3a7bbf0ec483f5b35251
3
  size 5119451944
model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c5fb7baabed09175615fe9d9fd93544bfb8c70b24d81a139719eeaae0b105ab
3
  size 5119451960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7859544da0dfea93221c1366f6eff2c00eb81e8f36ec2ea809262cb712c33fb
3
  size 5119451960
model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:452ccc8d15c66187c90845a504b8eb66105ed185da996d180ff2a93aea19889b
3
  size 5119451960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fa2b6d8763e7efda98440ef37bada8369fc5e0ed965d9817fc97e714a289959
3
  size 5119451960
model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91343e6489e08e6b8c1f94ad333dade5c1dd34ff10b9bcd7600aff346337c7e5
3
  size 5119451872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71ec0467818b22fa8d452e3e7247d3f3631291394e3462f6c08ef94555e79dc2
3
  size 5119451872
model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fd9ba3702aff6e57e11362b8347382279ceef6a9ef0896571771ea5c3d3da08
3
  size 5119451824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31011cb26eee41bd6ad61da46f50482b2c5af9e10b37ddf0bfb353e80aa41d84
3
  size 5119451824
model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b131998e1f04900a4c809675ccbbb33ee2d3fd8237ab364d809331d59c0f09bb
3
  size 5119451856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d3048c464aef99424696588aacdf835e873d75ac705af9552674450c079db43
3
  size 5119451856
model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07749587fc5f27ce84ca6889afd840b68dd5878019d37e22881ec727cdbf59aa
3
  size 5119451960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61ab2dc43c00edc6c8531c57b68f6606cf2dfeec296ccfa2c0ead7ce34fd20dd
3
  size 5119451960
model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:933f10a0e0b31fb9f9904f21a2bbd0beaf5ec211ad1ebb7ff91b6086a304d243
3
  size 5119451960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:114944b680e6e7cc25fe74c98e60a7e9ec92597cf8463676fe6287a32389ad04
3
  size 5119451960
model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52bf6dae97176c8476d198fb820912f9f6a6b51b682b10560befd88f2969c384
3
  size 335563984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f61c87abb39348f3b07d92ee31dc2aa5c1521d5a0aa408f283f849d00df24690
3
  size 335563984
modeling_laguna.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,51 +13,34 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
15
- import copy
16
- from collections.abc import Callable
17
  from typing import Optional
 
18
 
19
  import torch
20
  import torch.nn.functional as F
21
  from torch import nn
22
-
23
  from transformers import initialization as init
 
 
24
  from transformers.activations import ACT2FN
25
  from transformers.cache_utils import Cache, DynamicCache
26
- from transformers.generation import GenerationMixin
27
  from transformers.integrations import (
28
- use_kernel_forward_from_hub,
29
- use_kernel_func_from_hub,
30
  use_kernelized_func,
 
 
31
  )
32
- from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
33
- from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
34
- from transformers.modeling_layers import GradientCheckpointingLayer
35
- from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
36
- from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
37
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 
 
38
  from transformers.processing_utils import Unpack
39
- from transformers.utils import auto_docstring, can_return_tuple, is_grouped_mm_available
40
- from transformers.utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
41
 
42
- try:
43
- # transformers >= 5.5 relocated OutputRecorder to a dedicated module.
44
- from transformers.utils.output_capturing import OutputRecorder
45
- except ImportError:
46
- from transformers.utils.generic import OutputRecorder # type: ignore[no-redef]
47
  from .configuration_laguna import LagunaConfig
48
 
49
 
50
- def _build_rope_config(base_config, rope_params, partial_rotary_factor):
51
- """Shallow-copy the config with rope_parameters / partial_rotary_factor overridden."""
52
- cfg = copy.copy(base_config)
53
- if rope_params is not None:
54
- cfg.rope_parameters = dict(rope_params)
55
- if partial_rotary_factor is not None:
56
- cfg.partial_rotary_factor = float(partial_rotary_factor)
57
- return cfg
58
-
59
-
60
  @use_kernel_forward_from_hub("RMSNorm")
61
  class LagunaRMSNorm(nn.Module):
62
  def __init__(self, hidden_size, eps=1e-6):
@@ -112,14 +96,14 @@ class LagunaRotaryEmbedding(nn.Module):
112
  The device to use for initialization of the inverse frequencies.
113
  seq_len (`int`, *optional*):
114
  The current sequence length. Unused for this type of RoPE.
115
- Returns:
 
 
116
  Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
117
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
118
  """
119
  base = config.rope_parameters["rope_theta"]
120
- head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
121
- partial = getattr(config, "partial_rotary_factor", 1.0)
122
- dim = int(head_dim * partial)
123
 
124
  attention_factor = 1.0 # Unused in this type of RoPE
125
 
@@ -172,17 +156,11 @@ class LagunaTopKRouter(nn.Module):
172
  self.hidden_dim = config.hidden_size
173
  self.weight = nn.Parameter(torch.zeros(self.num_experts, self.hidden_dim))
174
 
175
- def forward(
176
- self,
177
- hidden_states: torch.Tensor,
178
- e_score_correction_bias: torch.Tensor | None = None,
179
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
180
  hidden_states = hidden_states.reshape(-1, self.hidden_dim)
181
  router_logits = F.linear(hidden_states, self.weight)
182
  # Laguna-specific: sigmoid routing in float32 for precision
183
  routing_weights = torch.sigmoid(router_logits.float())
184
- if e_score_correction_bias is not None:
185
- routing_weights = routing_weights + e_score_correction_bias.float()
186
  routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
187
  if self.norm_topk_prob:
188
  routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
@@ -197,42 +175,42 @@ class LagunaSparseMoeBlock(nn.Module):
197
  super().__init__()
198
  self.num_experts = config.num_experts
199
  self.top_k = config.num_experts_per_tok
200
- self.routed_scaling_factor = float(getattr(config, "moe_routed_scaling_factor", 1.0))
201
- self.apply_router_weight_on_input = bool(getattr(config, "moe_apply_router_weight_on_input", False))
202
  self.gate = LagunaTopKRouter(config)
203
  self.experts = nn.ModuleList(
204
  [LagunaMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
205
  )
206
- self.experts.e_score_correction_bias = nn.Parameter(torch.zeros(self.num_experts))
207
  self.shared_expert = LagunaMLP(config, intermediate_size=config.shared_expert_intermediate_size)
 
 
 
208
 
209
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
210
  batch_size, sequence_length, hidden_dim = hidden_states.shape
211
  hidden_states = hidden_states.view(-1, hidden_dim)
212
 
213
  shared_expert_output = self.shared_expert(hidden_states)
 
 
214
 
215
- _, routing_weights, selected_experts = self.gate(
216
- hidden_states, e_score_correction_bias=self.experts.e_score_correction_bias
217
- )
218
- routed_output = torch.zeros_like(hidden_states)
219
 
220
- expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
 
221
 
222
  for expert_idx in range(self.num_experts):
223
  top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
224
  if token_idx.shape[0] == 0:
225
  continue
226
- w = routing_weights[token_idx, top_k_pos, None]
227
- if self.apply_router_weight_on_input:
228
- current = self.experts[expert_idx](hidden_states[token_idx] * w)
229
- else:
230
- current = self.experts[expert_idx](hidden_states[token_idx]) * w
231
- routed_output.index_add_(0, token_idx, current.to(routed_output.dtype))
232
 
233
- routed_output = routed_output * self.routed_scaling_factor
234
- final_hidden_states = routed_output + shared_expert_output
235
- return final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
236
 
237
 
238
  def rotate_half(x):
@@ -258,21 +236,16 @@ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
258
  k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
259
  cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
260
  the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
261
- Returns:
 
 
262
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
263
  """
264
  cos = cos.unsqueeze(unsqueeze_dim)
265
  sin = sin.unsqueeze(unsqueeze_dim)
266
- rot_dim = cos.shape[-1]
267
- if rot_dim == q.shape[-1]:
268
- q_embed = (q * cos) + (rotate_half(q) * sin)
269
- k_embed = (k * cos) + (rotate_half(k) * sin)
270
- return q_embed, k_embed
271
- q_rot, q_pass = q[..., :rot_dim], q[..., rot_dim:]
272
- k_rot, k_pass = k[..., :rot_dim], k[..., rot_dim:]
273
- q_rot = (q_rot * cos) + (rotate_half(q_rot) * sin)
274
- k_rot = (k_rot * cos) + (rotate_half(k_rot) * sin)
275
- return torch.cat([q_rot, q_pass], dim=-1), torch.cat([k_rot, k_pass], dim=-1)
276
 
277
 
278
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -325,28 +298,19 @@ class LagunaAttention(nn.Module):
325
  self.config = config
326
  self.layer_idx = layer_idx
327
  self.head_dim = config.head_dim
328
-
329
- per_layer_heads = getattr(config, "num_attention_heads_per_layer", None)
330
- num_heads = per_layer_heads[layer_idx] if per_layer_heads is not None else config.num_attention_heads
331
- self.num_heads = num_heads
332
- self.num_key_value_heads = config.num_key_value_heads
333
- self.num_key_value_groups = num_heads // config.num_key_value_heads
334
  self.scaling = self.head_dim**-0.5
335
  self.attention_dropout = config.attention_dropout
336
  self.is_causal = True
337
 
338
- self.q_proj = nn.Linear(config.hidden_size, num_heads * self.head_dim, bias=False)
339
- self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
340
- self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
341
- self.o_proj = nn.Linear(num_heads * self.head_dim, config.hidden_size, bias=False)
342
-
343
- gating = getattr(config, "gating", True)
344
- self.gating = bool(gating)
345
- self.gate_per_head = gating == "per-head"
346
- if self.gating:
347
- g_out = num_heads if self.gate_per_head else num_heads * self.head_dim
348
- self.g_proj = nn.Linear(config.hidden_size, g_out, bias=False)
349
-
350
  self.q_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)
351
  self.k_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)
352
 
@@ -399,15 +363,10 @@ class LagunaAttention(nn.Module):
399
 
400
  attn_output = attn_output.reshape(*input_shape, -1).contiguous()
401
 
402
- if self.gating:
403
- gate = F.softplus(self.g_proj(hidden_states).float()).to(attn_output.dtype)
404
- if self.gate_per_head:
405
- shape = attn_output.shape
406
- attn_output = (
407
- attn_output.view(*shape[:-1], self.num_heads, self.head_dim) * gate.unsqueeze(-1)
408
- ).view(shape)
409
- else:
410
- attn_output = attn_output * gate
411
 
412
  attn_output = self.o_proj(attn_output)
413
 
@@ -419,12 +378,8 @@ class LagunaDecoderLayer(GradientCheckpointingLayer):
419
 
420
  def __init__(self, config: LagunaConfig, layer_idx: int):
421
  super().__init__()
422
- self.layer_idx = layer_idx
423
- layer_types = getattr(config, "layer_types", None)
424
- self.attention_type = (
425
- layer_types[layer_idx] if layer_types is not None else "full_attention"
426
- )
427
  self.self_attn = LagunaAttention(config, layer_idx)
 
428
  if (layer_idx not in config.mlp_only_layers) and (
429
  config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
430
  ):
@@ -435,11 +390,6 @@ class LagunaDecoderLayer(GradientCheckpointingLayer):
435
  self.post_attention_layernorm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
436
  self.hidden_size = config.hidden_size
437
 
438
- def _pick(self, obj):
439
- if isinstance(obj, dict):
440
- return obj.get(self.attention_type, obj.get("full_attention"))
441
- return obj
442
-
443
  def forward(
444
  self,
445
  hidden_states: torch.Tensor,
@@ -456,12 +406,12 @@ class LagunaDecoderLayer(GradientCheckpointingLayer):
456
  # Self Attention
457
  hidden_states, _ = self.self_attn(
458
  hidden_states=hidden_states,
459
- attention_mask=self._pick(attention_mask),
460
  position_ids=position_ids,
461
  past_key_values=past_key_values,
462
  use_cache=use_cache,
463
  cache_position=cache_position,
464
- position_embeddings=self._pick(position_embeddings),
465
  **kwargs,
466
  )
467
  hidden_states = residual + hidden_states
@@ -514,18 +464,6 @@ class LagunaModel(LagunaPreTrainedModel):
514
  )
515
  self.norm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
516
  self.rotary_emb = LagunaRotaryEmbedding(config=config)
517
-
518
- self._has_swa = (
519
- config.layer_types is not None and "sliding_attention" in config.layer_types
520
- )
521
- swa_rp = getattr(config, "swa_rope_parameters", None)
522
- if self._has_swa and swa_rp is not None:
523
- swa_partial = swa_rp.get("partial_rotary_factor", None)
524
- swa_cfg = _build_rope_config(config, swa_rp, swa_partial)
525
- self.swa_rotary_emb = LagunaRotaryEmbedding(config=swa_cfg)
526
- else:
527
- self.swa_rotary_emb = None
528
-
529
  self.gradient_checkpointing = False
530
 
531
  # Initialize weights and apply final processing
@@ -543,7 +481,6 @@ class LagunaModel(LagunaPreTrainedModel):
543
  cache_position: torch.LongTensor | None = None,
544
  **kwargs: Unpack[TransformersKwargs],
545
  ):
546
-
547
  if (input_ids is None) ^ (inputs_embeds is not None):
548
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
549
 
@@ -562,7 +499,8 @@ class LagunaModel(LagunaPreTrainedModel):
562
  if position_ids is None:
563
  position_ids = cache_position.unsqueeze(0)
564
 
565
- global_mask = create_causal_mask(
 
566
  config=self.config,
567
  input_embeds=inputs_embeds,
568
  attention_mask=attention_mask,
@@ -572,23 +510,7 @@ class LagunaModel(LagunaPreTrainedModel):
572
  )
573
 
574
  hidden_states = inputs_embeds
575
- global_pe = self.rotary_emb(hidden_states, position_ids)
576
-
577
- if self._has_swa:
578
- swa_mask = create_sliding_window_causal_mask(
579
- config=self.config,
580
- input_embeds=inputs_embeds,
581
- attention_mask=attention_mask,
582
- cache_position=cache_position,
583
- past_key_values=past_key_values,
584
- position_ids=position_ids,
585
- )
586
- causal_mask = {"full_attention": global_mask, "sliding_attention": swa_mask}
587
- swa_pe = self.swa_rotary_emb(hidden_states, position_ids) if self.swa_rotary_emb is not None else global_pe
588
- position_embeddings = {"full_attention": global_pe, "sliding_attention": swa_pe}
589
- else:
590
- causal_mask = global_mask
591
- position_embeddings = global_pe
592
 
593
  for decoder_layer in self.layers[: self.config.num_hidden_layers]:
594
  hidden_states = decoder_layer(
@@ -636,7 +558,8 @@ def load_balancing_loss_func(
636
  The attention_mask used in forward function
637
  shape [batch_size X sequence_length] if not None.
638
 
639
- Returns:
 
640
  The auxiliary loss.
641
  """
642
  if gate_logits is None or not isinstance(gate_logits, tuple):
@@ -727,7 +650,7 @@ class LagunaForCausalLM(LagunaPreTrainedModel, GenerationMixin):
727
  **kwargs: Unpack[TransformersKwargs],
728
  ) -> MoeCausalLMOutputWithPast:
729
  r"""
730
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
731
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
732
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
733
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
@@ -768,8 +691,8 @@ class LagunaForCausalLM(LagunaPreTrainedModel, GenerationMixin):
768
  self.num_experts_per_tok,
769
  attention_mask,
770
  )
771
- if labels is not None:
772
- loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
773
 
774
  return MoeCausalLMOutputWithPast(
775
  loss=loss,
 
1
+ # ruff: noqa
2
  # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
 
 
16
  from typing import Optional
17
+ from collections.abc import Callable
18
 
19
  import torch
20
  import torch.nn.functional as F
21
  from torch import nn
 
22
  from transformers import initialization as init
23
+ from transformers.utils import auto_docstring, can_return_tuple, is_grouped_mm_available
24
+ from transformers.generation import GenerationMixin
25
  from transformers.activations import ACT2FN
26
  from transformers.cache_utils import Cache, DynamicCache
 
27
  from transformers.integrations import (
 
 
28
  use_kernelized_func,
29
+ use_kernel_func_from_hub,
30
+ use_kernel_forward_from_hub,
31
  )
32
+ from transformers.masking_utils import create_causal_mask
33
+ from transformers.utils.generic import OutputRecorder, TransformersKwargs, maybe_autocast, check_model_inputs
 
 
 
34
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+ from transformers.modeling_outputs import MoeModelOutputWithPast, MoeCausalLMOutputWithPast
37
  from transformers.processing_utils import Unpack
38
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
39
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
40
 
 
 
 
 
 
41
  from .configuration_laguna import LagunaConfig
42
 
43
 
44
  @use_kernel_forward_from_hub("RMSNorm")
45
  class LagunaRMSNorm(nn.Module):
46
  def __init__(self, hidden_size, eps=1e-6):
 
96
  The device to use for initialization of the inverse frequencies.
97
  seq_len (`int`, *optional*):
98
  The current sequence length. Unused for this type of RoPE.
99
+
100
+ Returns
101
+ -------
102
  Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
103
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
104
  """
105
  base = config.rope_parameters["rope_theta"]
106
+ dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
 
 
107
 
108
  attention_factor = 1.0 # Unused in this type of RoPE
109
 
 
156
  self.hidden_dim = config.hidden_size
157
  self.weight = nn.Parameter(torch.zeros(self.num_experts, self.hidden_dim))
158
 
159
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
 
160
  hidden_states = hidden_states.reshape(-1, self.hidden_dim)
161
  router_logits = F.linear(hidden_states, self.weight)
162
  # Laguna-specific: sigmoid routing in float32 for precision
163
  routing_weights = torch.sigmoid(router_logits.float())
 
 
164
  routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
165
  if self.norm_topk_prob:
166
  routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
 
175
  super().__init__()
176
  self.num_experts = config.num_experts
177
  self.top_k = config.num_experts_per_tok
 
 
178
  self.gate = LagunaTopKRouter(config)
179
  self.experts = nn.ModuleList(
180
  [LagunaMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(self.num_experts)]
181
  )
 
182
  self.shared_expert = LagunaMLP(config, intermediate_size=config.shared_expert_intermediate_size)
183
+ self.shared_expert_gate = (
184
+ nn.Linear(config.hidden_size, 1, bias=False) if getattr(config, "moe_shared_gate", False) else None
185
+ )
186
 
187
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
188
  batch_size, sequence_length, hidden_dim = hidden_states.shape
189
  hidden_states = hidden_states.view(-1, hidden_dim)
190
 
191
  shared_expert_output = self.shared_expert(hidden_states)
192
+ if self.shared_expert_gate is not None:
193
+ shared_expert_output = shared_expert_output * torch.sigmoid(self.shared_expert_gate(hidden_states))
194
 
195
+ # Routed experts
196
+ _, routing_weights, selected_experts = self.gate(hidden_states)
197
+ final_hidden_states = torch.zeros_like(hidden_states)
 
198
 
199
+ expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts)
200
+ expert_mask = expert_mask.permute(2, 1, 0)
201
 
202
  for expert_idx in range(self.num_experts):
203
  top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
204
  if token_idx.shape[0] == 0:
205
  continue
206
+ current_state = hidden_states[token_idx]
207
+ current_hidden_states = self.experts[expert_idx](current_state)
208
+ current_hidden_states = current_hidden_states * routing_weights[token_idx, top_k_pos, None]
209
+ final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
 
 
210
 
211
+ final_hidden_states = final_hidden_states + shared_expert_output
212
+ final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
213
+ return final_hidden_states
214
 
215
 
216
  def rotate_half(x):
 
236
  k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
237
  cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
238
  the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
239
+
240
+ Returns
241
+ -------
242
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
243
  """
244
  cos = cos.unsqueeze(unsqueeze_dim)
245
  sin = sin.unsqueeze(unsqueeze_dim)
246
+ q_embed = (q * cos) + (rotate_half(q) * sin)
247
+ k_embed = (k * cos) + (rotate_half(k) * sin)
248
+ return q_embed, k_embed
 
 
 
 
 
 
 
249
 
250
 
251
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
298
  self.config = config
299
  self.layer_idx = layer_idx
300
  self.head_dim = config.head_dim
301
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
 
 
 
 
 
302
  self.scaling = self.head_dim**-0.5
303
  self.attention_dropout = config.attention_dropout
304
  self.is_causal = True
305
 
306
+ # Laguna: no QKV bias, explicit head_dim
307
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=False)
308
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=False)
309
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=False)
310
+ self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=False)
311
+ # Laguna-specific: gating projection
312
+ self.g_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=False)
313
+ # QK normalization (RMSNorm applied per-head after reshape, before RoPE)
 
 
 
 
314
  self.q_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)
315
  self.k_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)
316
 
 
363
 
364
  attn_output = attn_output.reshape(*input_shape, -1).contiguous()
365
 
366
+ # Laguna-specific: apply gating BEFORE o_proj
367
+ # gate values are computed from original hidden_states, applied in attention dimension
368
+ gate = F.softplus(self.g_proj(hidden_states).float()).to(attn_output.dtype)
369
+ attn_output = attn_output * gate
 
 
 
 
 
370
 
371
  attn_output = self.o_proj(attn_output)
372
 
 
378
 
379
  def __init__(self, config: LagunaConfig, layer_idx: int):
380
  super().__init__()
 
 
 
 
 
381
  self.self_attn = LagunaAttention(config, layer_idx)
382
+ # Use MoE or dense MLP based on layer configuration
383
  if (layer_idx not in config.mlp_only_layers) and (
384
  config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
385
  ):
 
390
  self.post_attention_layernorm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
391
  self.hidden_size = config.hidden_size
392
 
 
 
 
 
 
393
  def forward(
394
  self,
395
  hidden_states: torch.Tensor,
 
406
  # Self Attention
407
  hidden_states, _ = self.self_attn(
408
  hidden_states=hidden_states,
409
+ attention_mask=attention_mask,
410
  position_ids=position_ids,
411
  past_key_values=past_key_values,
412
  use_cache=use_cache,
413
  cache_position=cache_position,
414
+ position_embeddings=position_embeddings,
415
  **kwargs,
416
  )
417
  hidden_states = residual + hidden_states
 
464
  )
465
  self.norm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
466
  self.rotary_emb = LagunaRotaryEmbedding(config=config)
 
467
  self.gradient_checkpointing = False
468
 
469
  # Initialize weights and apply final processing
 
481
  cache_position: torch.LongTensor | None = None,
482
  **kwargs: Unpack[TransformersKwargs],
483
  ):
 
484
  if (input_ids is None) ^ (inputs_embeds is not None):
485
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
486
 
 
499
  if position_ids is None:
500
  position_ids = cache_position.unsqueeze(0)
501
 
502
+ # Laguna uses full attention only (no sliding window)
503
+ causal_mask = create_causal_mask(
504
  config=self.config,
505
  input_embeds=inputs_embeds,
506
  attention_mask=attention_mask,
 
510
  )
511
 
512
  hidden_states = inputs_embeds
513
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
514
 
515
  for decoder_layer in self.layers[: self.config.num_hidden_layers]:
516
  hidden_states = decoder_layer(
 
558
  The attention_mask used in forward function
559
  shape [batch_size X sequence_length] if not None.
560
 
561
+ Returns
562
+ -------
563
  The auxiliary loss.
564
  """
565
  if gate_logits is None or not isinstance(gate_logits, tuple):
 
650
  **kwargs: Unpack[TransformersKwargs],
651
  ) -> MoeCausalLMOutputWithPast:
652
  r"""
653
+ Labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
654
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
655
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
656
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
691
  self.num_experts_per_tok,
692
  attention_mask,
693
  )
694
+ if labels is not None and isinstance(aux_loss, torch.Tensor):
695
+ loss += self.router_aux_loss_coef * aux_loss.to(loss.device)
696
 
697
  return MoeCausalLMOutputWithPast(
698
  loss=loss,
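
The router change is easiest to see in isolation. Below is a standalone sketch of the sigmoid top-k routing that LagunaTopKRouter.forward now performs (sigmoid in float32, top-k selection, renormalisation when norm_topk_prob is set); shapes and weights are made up for illustration, and this is not the shipped module.

```python
# Hedged sketch of the sigmoid top-k routing shown in the diff above.
import torch
import torch.nn.functional as F

num_experts, top_k, hidden = 8, 2, 16
tokens = torch.randn(5, hidden)                    # 5 flattened tokens
router_weight = torch.randn(num_experts, hidden)   # stand-in for the router weight

logits = F.linear(tokens, router_weight)           # (5, num_experts)
weights = torch.sigmoid(logits.float())            # sigmoid routing in float32
weights, selected = torch.topk(weights, top_k, dim=-1)
weights = weights / weights.sum(dim=-1, keepdim=True)  # norm_topk_prob path
print(selected.shape, weights.sum(dim=-1))         # torch.Size([5, 2]), all ones
```
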
special_tokens_map.json CHANGED
@@ -6,4 +6,4 @@
6
  "pad_token": "〈|PAD|〉",
7
  "sep_token": "〈|SEP|〉",
8
  "unk_token": "〈|UNK|〉"
9
- }
 
6
  "pad_token": "〈|PAD|〉",
7
  "sep_token": "〈|SEP|〉",
8
  "unk_token": "〈|UNK|〉"
9
+ }
tokenizer.json CHANGED
@@ -167,21 +167,21 @@
167
  },
168
  {
169
  "id": 18,
170
- "content": "〈|THINK_START|〉",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
174
  "normalized": false,
175
- "special": true
176
  },
177
  {
178
  "id": 19,
179
- "content": "〈|THINK_END|〉",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
183
  "normalized": false,
184
- "special": true
185
  },
186
  {
187
  "id": 20,
@@ -212,39 +212,39 @@
212
  },
213
  {
214
  "id": 23,
215
- "content": "〈|SPECIAL_4|〉",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
219
  "normalized": false,
220
- "special": true
221
  },
222
  {
223
  "id": 24,
224
- "content": "〈|SPECIAL_5|〉",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
228
  "normalized": false,
229
- "special": true
230
  },
231
  {
232
  "id": 25,
233
- "content": "〈|SPECIAL_6|〉",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
  "normalized": false,
238
- "special": true
239
  },
240
  {
241
  "id": 26,
242
- "content": "〈|SPECIAL_7|〉",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
246
  "normalized": false,
247
- "special": true
248
  },
249
  {
250
  "id": 27,
@@ -750,15 +750,9 @@
750
  "|〉": 15,
751
  "〈|/": 16,
752
  "/|〉": 17,
753
- "〈|THINK_START|〉": 18,
754
- "〈|THINK_END|〉": 19,
755
  "〈|SPECIAL_1|〉": 20,
756
  "〈|SPECIAL_2|〉": 21,
757
  "〈|SPECIAL_3|〉": 22,
758
- "〈|SPECIAL_4|〉": 23,
759
- "〈|SPECIAL_5|〉": 24,
760
- "〈|SPECIAL_6|〉": 25,
761
- "〈|SPECIAL_7|〉": 26,
762
  "〈|SPECIAL_8|〉": 27,
763
  "〈|SPECIAL_9|〉": 28,
764
  "〈|SPECIAL_10|〉": 29,
@@ -101083,7 +101077,13 @@
101083
  "wagon": 100348,
101084
  "/lldb": 100349,
101085
  "CHANGED": 100350,
101086
- "IsNotNull": 100351
 
 
 
 
 
 
101087
  },
101088
  "merges": [
101089
  [
@@ -501192,4 +501192,4 @@
501192
  ]
501193
  ]
501194
  }
501195
- }
 
167
  },
168
  {
169
  "id": 18,
170
+ "content": "<think>",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
174
  "normalized": false,
175
+ "special": false
176
  },
177
  {
178
  "id": 19,
179
+ "content": "</think>",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
183
  "normalized": false,
184
+ "special": false
185
  },
186
  {
187
  "id": 20,
 
212
  },
213
  {
214
  "id": 23,
215
+ "content": "<assistant>",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
219
  "normalized": false,
220
+ "special": false
221
  },
222
  {
223
  "id": 24,
224
+ "content": "</assistant>",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
228
  "normalized": false,
229
+ "special": false
230
  },
231
  {
232
  "id": 25,
233
+ "content": "<tool_call>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
  "normalized": false,
238
+ "special": false
239
  },
240
  {
241
  "id": 26,
242
+ "content": "</tool_call>",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
246
  "normalized": false,
247
+ "special": false
248
  },
249
  {
250
  "id": 27,
 
750
  "|〉": 15,
751
  "〈|/": 16,
752
  "/|〉": 17,
 
 
753
  "〈|SPECIAL_1|〉": 20,
754
  "〈|SPECIAL_2|〉": 21,
755
  "〈|SPECIAL_3|〉": 22,
 
 
 
 
756
  "〈|SPECIAL_8|〉": 27,
757
  "〈|SPECIAL_9|〉": 28,
758
  "〈|SPECIAL_10|〉": 29,
 
101077
  "wagon": 100348,
101078
  "/lldb": 100349,
101079
  "CHANGED": 100350,
101080
+ "IsNotNull": 100351,
101081
+ "<think>": 18,
101082
+ "</think>": 19,
101083
+ "<assistant>": 23,
101084
+ "</assistant>": 24,
101085
+ "<tool_call>": 25,
101086
+ "</tool_call>": 26
101087
  },
101088
  "merges": [
101089
  [
 
501192
  ]
501193
  ]
501194
  }
501195
+ }
tokenizer_config.json CHANGED
@@ -144,22 +144,6 @@
144
  "single_word": false,
145
  "special": true
146
  },
147
- "18": {
148
- "content": "〈|THINK_START|〉",
149
- "lstrip": false,
150
- "normalized": false,
151
- "rstrip": false,
152
- "single_word": false,
153
- "special": true
154
- },
155
- "19": {
156
- "content": "〈|THINK_END|〉",
157
- "lstrip": false,
158
- "normalized": false,
159
- "rstrip": false,
160
- "single_word": false,
161
- "special": true
162
- },
163
  "20": {
164
  "content": "〈|SPECIAL_1|〉",
165
  "lstrip": false,
@@ -184,38 +168,6 @@
184
  "single_word": false,
185
  "special": true
186
  },
187
- "23": {
188
- "content": "〈|SPECIAL_4|〉",
189
- "lstrip": false,
190
- "normalized": false,
191
- "rstrip": false,
192
- "single_word": false,
193
- "special": true
194
- },
195
- "24": {
196
- "content": "〈|SPECIAL_5|〉",
197
- "lstrip": false,
198
- "normalized": false,
199
- "rstrip": false,
200
- "single_word": false,
201
- "special": true
202
- },
203
- "25": {
204
- "content": "〈|SPECIAL_6|〉",
205
- "lstrip": false,
206
- "normalized": false,
207
- "rstrip": false,
208
- "single_word": false,
209
- "special": true
210
- },
211
- "26": {
212
- "content": "〈|SPECIAL_7|〉",
213
- "lstrip": false,
214
- "normalized": false,
215
- "rstrip": false,
216
- "single_word": false,
217
- "special": true
218
- },
219
  "27": {
220
  "content": "〈|SPECIAL_8|〉",
221
  "lstrip": false,
@@ -559,6 +511,54 @@
559
  "rstrip": false,
560
  "single_word": false,
561
  "special": true
562
  }
563
  },
564
  "bos_token": "〈|EOS|〉",
@@ -571,5 +571,6 @@
571
  "pad_token": "〈|PAD|〉",
572
  "sep_token": "〈|SEP|〉",
573
  "tokenizer_class": "PreTrainedTokenizerFast",
574
- "unk_token": "〈|UNK|〉"
575
- }
 
 
144
  "single_word": false,
145
  "special": true
146
  },
147
  "20": {
148
  "content": "〈|SPECIAL_1|〉",
149
  "lstrip": false,
 
168
  "single_word": false,
169
  "special": true
170
  },
171
  "27": {
172
  "content": "〈|SPECIAL_8|〉",
173
  "lstrip": false,
 
511
  "rstrip": false,
512
  "single_word": false,
513
  "special": true
514
+ },
515
+ "18": {
516
+ "content": "<think>",
517
+ "single_word": false,
518
+ "lstrip": false,
519
+ "rstrip": false,
520
+ "normalized": false,
521
+ "special": false
522
+ },
523
+ "19": {
524
+ "content": "</think>",
525
+ "single_word": false,
526
+ "lstrip": false,
527
+ "rstrip": false,
528
+ "normalized": false,
529
+ "special": false
530
+ },
531
+ "23": {
532
+ "content": "<assistant>",
533
+ "single_word": false,
534
+ "lstrip": false,
535
+ "rstrip": false,
536
+ "normalized": false,
537
+ "special": false
538
+ },
539
+ "24": {
540
+ "content": "</assistant>",
541
+ "single_word": false,
542
+ "lstrip": false,
543
+ "rstrip": false,
544
+ "normalized": false,
545
+ "special": false
546
+ },
547
+ "25": {
548
+ "content": "<tool_call>",
549
+ "single_word": false,
550
+ "lstrip": false,
551
+ "rstrip": false,
552
+ "normalized": false,
553
+ "special": false
554
+ },
555
+ "26": {
556
+ "content": "</tool_call>",
557
+ "single_word": false,
558
+ "lstrip": false,
559
+ "rstrip": false,
560
+ "normalized": false,
561
+ "special": false
562
  }
563
  },
564
  "bos_token": "〈|EOS|〉",
 
571
  "pad_token": "〈|PAD|〉",
572
  "sep_token": "〈|SEP|〉",
573
  "tokenizer_class": "PreTrainedTokenizerFast",
574
+ "unk_token": "〈|UNK|〉",
575
+ "chat_template": "{% include 'chat_template.jinja' %}"
576
+ }
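
Finally, a quick sanity check of the remapped tokens, assuming the updated tokenizer files are loaded from a local checkout ("./Laguna-XS.2" is a hypothetical path); the expected ids come from the vocab entries added above.

```python
# Hedged sketch: confirm the remapped token ids after loading the tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./Laguna-XS.2")
for token in ["<think>", "</think>", "<assistant>", "</assistant>", "<tool_call>", "</tool_call>"]:
    print(token, tok.convert_tokens_to_ids(token))  # expected: 18, 19, 23, 24, 25, 26
```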