Upload quantized model HRM-Text-1B-autoround-MXFP4

Browse files

Files changed (9) hide show

README.md +118 -0
config.json +61 -0
configuration_hrm_text.py +146 -0
generation_config.json +8 -0
model.safetensors +3 -0
modeling_hrm_text.py +644 -0
quantization_config.json +16 -0
tokenizer.json +0 -0
tokenizer_config.json +12 -0

README.md ADDED Viewed

	@@ -0,0 +1,118 @@

+---
+base_model:
+- sapientinc/HRM-Text-1B
+pipeline_tag: text-generation
+tags:
+- quantized
+- mxfp4
+- autoround
+- low-bit-open-llm-leaderboard
+---
+# HRM-Text-1B-autoround-MXFP4
+## Model Details
+This model is an MXFP4 (Microscaling FP4) quantization of [sapientinc/HRM-Text-1B](https://huggingface.co/sapientinc/HRM-Text-1B) generated by [AutoRound](https://github.com/intel/auto-round). Please follow the license of the original model.
+## Quantization Details
+| Attribute | Value |
+|-----------|-------|
+| Base Model | [sapientinc/HRM-Text-1B](https://huggingface.co/sapientinc/HRM-Text-1B) |
+| Quantization Tool | [AutoRound](https://github.com/intel/auto-round) |
+| Quantization Scheme | MXFP4 |
+| Original Size | 2256 MB |
+| Quantized Size | 886 MB |
+## Evaluation Results
+| Task | Accuracy |
+|------|----------|
+| hellaswag | 0.2504 |
+| mmlu | 0.2309 |
+| piqa | 0.4951 |
+## How to Use
+### HF Usage
+**Step 1: Install [AutoRound](https://github.com/intel/auto-round)**
+```bash
+pip install auto-round
+```
+**Step 2: Load and run the quantized model**
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "HRM-Text-1B-autoround-MXFP4"
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+# prepare the model input
+prompt = "Write a quick sort algorithm."
+messages = [{"role": "user", "content": prompt}]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+# conduct text completion
+generated_ids = model.generate(**model_inputs, max_new_tokens=512)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
+content = tokenizer.decode(output_ids, skip_special_tokens=True)
+print("content:", content)
+```
+### vLLM Usage
+```bash
+vllm serve HRM-Text-1B-autoround-MXFP4 \
+    --trust-remote-code \
+    --dtype bfloat16 \
+    --tensor_parallel_size 1
+```
+If you encounter any issues, feel free to open an issue on the [AutoRound GitHub repo](https://github.com/intel/auto-round/issues) or provide feedback on the [Low-Bit Open LLM Leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard).
+## Ethical Considerations and Limitations
+The model can produce factually incorrect output, and should not be relied on to produce factually accurate information. Because of the limitations of the pretrained model and the finetuning datasets, it is possible that this model could generate lewd, biased or otherwise offensive outputs.
+Therefore, before deploying any applications of the model, developers should perform safety testing.
+## Caveats and Recommendations
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model.
+Here are a couple of useful links to learn more about Intel's AI software:
+- [Intel Neural Compressor](https://github.com/intel/neural-compressor)
+- [AutoRound](https://github.com/intel/auto-round)
+## Disclaimer
+The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
+## Cite
+```
+@article{cheng2023optimize,
+  title={Optimize weight rounding via signed gradient descent for the quantization of llms},
+  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi},
+  journal={arXiv preprint arXiv:2309.05516},
+  year={2023}
+}
+```
+[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
+---
+*This model is part of the [Intel Low-Bit Open LLM Leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard) initiative.*

config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "H_cycles": 2,
+  "L_bp_cycles": [
+    0,
+    3
+  ],
+  "L_cycles": 3,
+  "architectures": [
+    "HrmTextForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_hrm_text.HrmTextConfig",
+    "AutoModel": "modeling_hrm_text.HrmTextModel",
+    "AutoModelForCausalLM": "modeling_hrm_text.HrmTextForCausalLM"
+  },
+  "bos_token_id": 6,
+  "dtype": "bfloat16",
+  "embedding_scale": 39.191835884530846,
+  "eos_token_id": 11,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.025515518153991442,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "hrm_text",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 128,
+  "num_key_value_heads": 12,
+  "num_layers_per_stack": 16,
+  "pad_token_id": 5,
+  "prefix_lm": true,
+  "quantization_config": {
+    "act_bits": 4,
+    "act_data_type": "mx_fp",
+    "act_dynamic": true,
+    "act_group_size": 32,
+    "act_sym": true,
+    "autoround_version": "0.12.3",
+    "bits": 4,
+    "data_type": "mx_fp",
+    "group_size": 32,
+    "iters": 0,
+    "low_gpu_mem_usage": true,
+    "packing_format": "auto_round:llm_compressor",
+    "quant_method": "auto-round",
+    "sym": true
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 10000.0,
+    "rope_type": "default"
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.9.0",
+  "use_cache": true,
+  "vocab_size": 65536
+}

configuration_hrm_text.py ADDED Viewed

	@@ -0,0 +1,146 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hrm_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from huggingface_hub.dataclasses import strict
+from transformers.configuration_utils import PreTrainedConfig
+from transformers.modeling_rope_utils import RopeParameters
+from transformers.utils import auto_docstring
+from transformers.utils.generic import is_flash_attention_requested, split_attention_implementation
+from transformers.utils.type_validators import interval
+@auto_docstring(checkpoint="sapientinc/HRM-Text-1B")
+@strict
+class HrmTextConfig(PreTrainedConfig):
+    r"""
+    H_cycles (`int`, *optional*, defaults to 2):
+        Number of high-level cycles.
+    L_cycles (`int`, *optional*, defaults to 3):
+        Number of low-level cycles per H-cycle.
+    L_bp_cycles (`list[int]`, *optional*, defaults to `[2]`):
+        Training-time gradient-routing list; left-padded with `1`s up to `H_cycles` entries inside the model
+        (see `HrmTextModel.__init__`). Inference-time no-op.
+    embedding_scale (`float`, *optional*):
+        Token-embedding multiplier. If `None`, defaults to `1 / initializer_range`.
+    prefix_lm (`bool`, *optional*, defaults to `True`):
+        Instruction tokens attend bidirectionally, response tokens attend causally.
+    num_layers_per_stack (`int`, *optional*):
+        Real number of transformer blocks inside each
+        of the H / L stacks. Set automatically on first construction: the value passed as
+        `num_hidden_layers` is remembered here and `num_hidden_layers` is then rewritten to
+        `num_layers_per_stack * H_cycles * (L_cycles + 1)` so that
+        `DynamicCache(config=...)` pre-allocates one slot per unique attention invocation
+        under the recurrent forward. Do not set this directly on first construction — pass
+        the real per-stack count as `num_hidden_layers` and let `__post_init__` split it.
+    """
+    model_type = "hrm_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {  # same sharding rule for both stacks: q/k/v/gate/up column-wise, o/down row-wise
+        **{f"{stack}.layers.*.self_attn.q_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.k_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.v_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.gate_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.o_proj": "rowwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.gate_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.up_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.down_proj": "rowwise" for stack in ("L_module", "H_module")},
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    vocab_size: int = 151808
+    hidden_size: int = 1536
+    intermediate_size: int = 4096
+    num_hidden_layers: int = 16  # per-stack count on first construction; rewritten to the inflated total in `__post_init__`
+    num_attention_heads: int = 12
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 2048
+    initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
+    rms_norm_eps: float = 1e-6
+    use_cache: bool = True
+    pad_token_id: int | None = None
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    attention_bias: bool = False
+    attention_dropout: int | float | None = 0.0
+    mlp_bias: bool = False
+    head_dim: int = 128
+    H_cycles: int = 2
+    L_cycles: int = 3
+    L_bp_cycles: list[int] | None = None  # `None` is replaced by `[2]` in `__post_init__`
+    embedding_scale: float | None = None  # `None` is replaced by `1 / initializer_range` in `__post_init__`
+    prefix_lm: bool = True
+    num_layers_per_stack: int | None = None  # Usually inferred in post init
+    def __post_init__(self, **kwargs):  # fill derived defaults, then run the parent hook
+        if self.L_bp_cycles is None:
+            # Default `[2]` = backprop only the last 2 L-iterations per H-cycle (training-time
+            # gradient-routing knob). Left-padding is performed inside [`HrmTextModel`], whose
+            # `__init__` pads the list with `1`s up to `H_cycles` entries.
+            self.L_bp_cycles = [2]
+        if self.embedding_scale is None:
+            self.embedding_scale = 1.0 / self.initializer_range
+        if self.num_layers_per_stack is None:
+            # Initial construction, or legacy checkpoint where `num_hidden_layers` carries the
+            # real per-stack count: remember that value and rewrite `num_hidden_layers` to the
+            # inflated total, so standard HF cache allocation gives us one slot per unique
+            # attention invocation. Serialised configs round-trip as (inflated, real) pairs.
+            self.num_layers_per_stack = self.num_hidden_layers
+            self.num_hidden_layers = self.num_layers_per_stack * self.H_cycles * (self.L_cycles + 1)  # e.g. 16 * 2 * (3 + 1) = 128 with the class defaults
+        super().__post_init__(**kwargs)
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        if self.hidden_size % self.num_attention_heads != 0:  # only divisibility is checked here; `head_dim` is a separate field
+            raise ValueError(
+                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({self.num_attention_heads})."
+            )
+    @property
+    def _attn_implementation(self):
+        return self._attn_implementation_internal  # read-through to the internally stored value
+    @_attn_implementation.setter
+    def _attn_implementation(self, value: str | dict | None):
+        if value is not None and self.prefix_lm:  # the FlashAttention restriction applies only to PrefixLM configs
+            _, base_implementation = split_attention_implementation(value)
+            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
+                raise ValueError(
+                    f"`attn_implementation={value!r}` is not supported when "
+                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
+                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
+                )
+        PreTrainedConfig._attn_implementation.__set__(self, value)  # hand the accepted value to the parent class's setter
+__all__ = ["HrmTextConfig"]

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 6,
+  "do_sample": true,
+  "eos_token_id": 11,
+  "pad_token_id": 5,
+  "transformers_version": "5.9.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:905484c08c12cf186265a02a23b6f780366999d2caf2b6614fafd6b684fd2499
+size 924093024

modeling_hrm_text.py ADDED Viewed

	@@ -0,0 +1,644 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hrm_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from contextlib import nullcontext
+from typing import Optional
+import torch
+from torch import nn
+from transformers import initialization as init
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.configuration_utils import PreTrainedConfig
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_func_from_hub, use_kernelized_func
+from transformers.masking_utils import create_causal_mask, create_masks_for_generate
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import auto_docstring, can_return_tuple, logging
+from transformers.utils.generic import (
+    TransformersKwargs,
+    is_flash_attention_requested,
+    maybe_autocast,
+    merge_with_config_defaults,
+    split_attention_implementation,
+)
+from transformers.utils.output_capturing import capture_outputs
+from .configuration_hrm_text import HrmTextConfig
+logger = logging.get_logger(__name__)
+class HrmTextRMSNorm(torch.nn.Module):  # RMS normalisation over the last dim; stores only `eps`, no learnable weight
+    def __init__(self, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps  # added to the mean square inside `_norm`
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)  # x / sqrt(mean(x^2) + eps)
+    def forward(self, x):
+        return self._norm(x.float()).type_as(x)  # normalise in float32, then cast back to the input's dtype
+    def extra_repr(self):
+        return f"eps={self.eps}"  # shown in the module's printed representation
+class HrmTextMLP(nn.Module):  # gated feed-forward block: down_proj(act(gate_proj(x)) * up_proj(x))
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)  # hidden -> intermediate
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)  # hidden -> intermediate
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)  # intermediate -> hidden
+        self.act_fn = ACT2FN[config.hidden_act]  # activation looked up by its config name
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))  # activated gate multiplies the up projection elementwise
+        return down_proj
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dimension
+    x2 = x[..., x.shape[-1] // 2 :]  # second half of the last dimension
+    return torch.cat((-x2, x1), dim=-1)  # result is [-x2, x1] along the last dimension
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)  # add a size-1 axis so cos broadcasts against q and k
+    sin = sin.unsqueeze(unsqueeze_dim)  # same for sin
+    q_embed = (q * cos) + (rotate_half(q) * sin)  # rotation written as x*cos + rotate_half(x)*sin
+    k_embed = (k * cos) + (rotate_half(k) * sin)  # identical rotation applied to the keys
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape  # unpacking requires a 4-D input
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: the input tensor itself is returned
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new axis after the head axis, expanded to n_rep
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the head axis
+def eager_attention_forward(  # plain-PyTorch attention; the fallback passed to `ALL_ATTENTION_FUNCTIONS.get_interface`
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)  # expand KV heads by the module's group count
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled query-key scores
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask  # the mask is additive, applied before softmax
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, cast back to the query dtype
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)  # active only while the module is in training mode
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap axes 1 and 2 before returning
+    return attn_output, attn_weights  # returns both the output and the post-dropout weights
+@use_kernelized_func(apply_rotary_pos_emb)
+class HrmTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: HrmTextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # index within the stack; combined with `cycle_offset` to pick the cache slot in `forward`
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)  # falls back to hidden_size // heads when the config has no `head_dim`
+        self.num_key_value_groups = 1  # Uses MHA instead of GQA
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        # Additional sigmoid gate applied at the end
+        self.gate_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        cycle_offset: int = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]  # every dim except the feature dim
+        hidden_shape = (*input_shape, -1, self.head_dim)  # split the feature dim into (heads, head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # presumably (batch, heads, seq, head_dim) for 3-D input — TODO confirm
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        gate_states = self.gate_proj(hidden_states).view(hidden_shape)  # not transposed: kept in the layout the attention output is returned in
+        cos, sin = position_embeddings  # declared optional, but unpacked unconditionally here
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            # Adjust cache slot by `cycle_offset` which is determined by its current recurrent step through the stacks
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx + cycle_offset)
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout is disabled outside training
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Additional sigmoid gating (similar to Qwen3Next)
+        attn_output = torch.sigmoid(gate_states) * attn_output  # elementwise gate, applied before the head dims are merged
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge (heads, head_dim) back into one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class HrmTextDecoderLayer(GradientCheckpointingLayer):  # pre-norm block: attention and MLP, each wrapped in a residual connection
+    def __init__(self, config: HrmTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = HrmTextAttention(config=config, layer_idx=layer_idx)
+        self.mlp = HrmTextMLP(config)
+        self.input_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)  # applied before attention
+        self.post_attention_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)  # applied before the MLP
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs: Unpack[TransformersKwargs],  # `cycle_offset` from `HrmTextStack.forward` is not a named parameter here, so it travels in these kwargs
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(  # the attention weights are discarded
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states  # first residual connection
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states  # second residual connection
+        return hidden_states
+class HrmTextStack(nn.Module):
+    """A single transformer stack — used twice inside, once as H module and once as L module"""
+    def __init__(self, config: HrmTextConfig):
+        super().__init__()
+        self.layers = nn.ModuleList(  # `num_layers_per_stack` decoder layers, indexed from 0 within this stack
+            [HrmTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_layers_per_stack)]
+        )
+        self.final_norm = HrmTextRMSNorm(eps=config.rms_norm_eps)  # applied once, after the last layer
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        cycle_offset: int = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        for layer in self.layers:  # layers run in order; every layer receives the same mask, cache and offset
+            hidden_states = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                position_embeddings=position_embeddings,
+                cycle_offset=cycle_offset,  # added to each attention's `layer_idx` when it updates the cache
+                **kwargs,
+            )
+        return self.final_norm(hidden_states)
+@auto_docstring
+class HrmTextPreTrainedModel(PreTrainedModel):  # shared base class: capability flags, attention-backend check, weight init
+    config: HrmTextConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HrmTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True  # still rejected at runtime when `config.prefix_lm` is true (see the check below)
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": HrmTextDecoderLayer,
+        "attentions": HrmTextAttention,
+    }
+    def _check_and_adjust_attn_implementation(
+        self, attn_implementation: str | None, is_init_check: bool = False, allow_all_kernels: bool = False
+    ) -> str:
+        if attn_implementation is not None and self.config.prefix_lm:  # same guard as the config's `_attn_implementation` setter
+            _, base_implementation = split_attention_implementation(attn_implementation)
+            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
+                raise ValueError(
+                    f"`attn_implementation={attn_implementation!r}` is not supported when "
+                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
+                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
+                )
+        return super()._check_and_adjust_attn_implementation(attn_implementation, is_init_check, allow_all_kernels)  # otherwise defer to the parent check
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)  # parent initialisation runs first
+        if isinstance(module, HrmTextModel):
+            init.zeros_(module.z_L_init)  # the initial low-level state is reset to zeros
+            # `z_L_init` is the frozen low-cycle initial state and never trains.
+            module.z_L_init.requires_grad_(False)  # trf-ignore: TRF012
+class HrmTextRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: HrmTextConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_type = self.config.rope_parameters["rope_type"]  # indexed directly, so `rope_parameters` must be set
+        rope_init_fn: Callable = self.compute_default_rope_parameters  # used when rope_type is "default"
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # any other type is looked up in the shared registry
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not written to the state dict
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)  # separate copy of the initial frequencies
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: HrmTextConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads  # `head_dim` if set and non-zero, else hidden_size // heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)  # (batch, dim/2, 1), on x's device
+        position_ids_expanded = position_ids[:, None, :].float()  # (batch, 1, seq) given 2-D position_ids
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"  # "mps" is mapped to "cpu" for the autocast context
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)  # product of positions and inverse frequencies, sequence axis moved to dim 1
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast back to the dtype of `x`
+# Backbone of the HRM text model: a token embedding followed by a nested recurrence in which an
+# "L" (low-level) stack is applied `config.L_cycles` times per outer step and an "H" (high-level)
+# stack is applied once per outer step, for `config.H_cycles` outer steps.
+@auto_docstring
+class HrmTextModel(HrmTextPreTrainedModel):
+    def __init__(self, config: HrmTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.rotary_emb = HrmTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Multiplier applied to the input embeddings at the start of `forward`.
+        self.embedding_scale = config.embedding_scale
+        # Recursive module structures
+        # Both stacks are built from the same config; they are separate module instances.
+        self.L_module = HrmTextStack(config)
+        self.H_module = HrmTextStack(config)
+        # Initial state for the low cycle module
+        # All-zero vector of size `hidden_size`, registered as a parameter that does not receive gradients.
+        self.z_L_init = nn.Parameter(torch.zeros(config.hidden_size), requires_grad=False)
+        # Left-pad `config.L_bp_cycles` with 1s so the list has at least `H_cycles` entries.
+        # If the config supplies more than `H_cycles` entries, nothing is trimmed.
+        raw_bp = list(config.L_bp_cycles)
+        self.L_bp_cycles_padded = [1] * max(0, config.H_cycles - len(raw_bp)) + raw_bp
+        # Initialize weights and apply final processing
+        self.post_init()
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
+            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
+            form a single bidirectional block; all other positions are causal.
+        """
+        # Exactly one of `input_ids` / `inputs_embeds` must be given: the XOR is true when both
+        # are missing or both are present.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Additional scaling on the input embeds
+        # Note: this is applied to caller-supplied `inputs_embeds` as well as to looked-up embeddings.
+        inputs_embeds = inputs_embeds * self.embedding_scale
+        # Lazily create a cache when caching is requested but none was passed in.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Default positions continue from the number of tokens already held in the cache;
+        # the result has a leading batch dimension of size 1.
+        if position_ids is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        # Create mask with optional prefix-based bidirectionality
+        mask_kwargs = {
+            "config": self.config,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # `token_type_ids` is only honoured while the cache is absent or not yet initialized,
+        # and only when `config.prefix_lm` is set. Positions with `token_type_ids == 1` get
+        # block id 0; every other position gets -1.
+        is_first_iteration = past_key_values is None or not past_key_values.is_initialized
+        if token_type_ids is not None and is_first_iteration:
+            if self.config.prefix_lm:
+                mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
+            else:
+                logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
+        attention_mask = create_causal_mask(**mask_kwargs)
+        # The same mask and rotary embeddings are reused by every stack invocation below.
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+        # Hierarchical (H/L)-cycle recurrence
+        #
+        # `z_H` - slow / high-level state
+        hidden_states_high_cycle = inputs_embeds
+        # `z_L` - fast / low-level state
+        # Broadcast the initial vector to the shape of `z_H` and materialize it as its own tensor.
+        hidden_states_low_cycle = (
+            self.z_L_init.to(dtype=hidden_states_high_cycle.dtype, device=hidden_states_high_cycle.device)
+            .expand_as(hidden_states_high_cycle)
+            .contiguous()
+        )
+        # Cache-slot layout under the recurrent forward:
+        #
+        #   slot(h, l, layer)   = (h * (L_cycles + 1) + l) * num_layers_per_stack + layer
+        #                                                       ^— L-stack invocation at (h, l)
+        #   slot(h, H, layer)   = (h * (L_cycles + 1) + L_cycles) * num_layers_per_stack + layer
+        #                                                       ^— trailing H-stack invocation
+        #
+        # That totals `num_layers_per_stack * H_cycles * (L_cycles + 1)` slots, i.e. the `config.num_hidden_layers`.
+        num_layers_per_stack = self.config.num_layers_per_stack
+        for high_cycle_idx in range(self.config.H_cycles):
+            # `L_bp_cycles` k-step grad trick: only the trailing `num_grad_iterations` of the
+            # `L_cycles` inner iterations propagate gradients; earlier iterations run under
+            # `torch.no_grad()` to bound activation memory.
+            # NOTE(review): `L_bp_cycles_padded` is built in `__init__` with at least `H_cycles`
+            # entries, so the `else 1` fallback looks unreachable unless the attribute or the
+            # config is changed after construction - confirm before relying on it.
+            num_grad_iterations = (
+                self.L_bp_cycles_padded[high_cycle_idx] if high_cycle_idx < len(self.L_bp_cycles_padded) else 1
+            )
+            # Inner iterations with index below this threshold run without gradient tracking.
+            grad_threshold = self.config.L_cycles - num_grad_iterations
+            for low_cycle_idx in range(self.config.L_cycles):
+                # First cache slot used by this (h, l) invocation of the L stack.
+                cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + low_cycle_idx) * num_layers_per_stack
+                ctx = nullcontext() if low_cycle_idx >= grad_threshold else torch.no_grad()
+                with ctx:
+                    # The L stack consumes the sum of both states and produces the new `z_L`.
+                    hidden_states_low_cycle = self.L_module(
+                        hidden_states_low_cycle.to(hidden_states_high_cycle.device) + hidden_states_high_cycle,
+                        attention_mask=attention_mask,
+                        past_key_values=past_key_values,
+                        position_embeddings=position_embeddings,
+                        position_ids=position_ids,
+                        cycle_offset=cycle_offset,
+                        **kwargs,
+                    )
+            # After the inner loop, one H-stack invocation (always outside the `no_grad` context)
+            # consumes the sum of both states and produces the new `z_H`. It uses the last group
+            # of slots reserved for this outer step.
+            cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + self.config.L_cycles) * num_layers_per_stack
+            hidden_states_high_cycle = self.H_module(
+                hidden_states_high_cycle + hidden_states_low_cycle.to(hidden_states_high_cycle.device),
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                cycle_offset=cycle_offset,
+                **kwargs,
+            )
+        # The final high-level state is the model output; the low-level state is discarded.
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states_high_cycle,
+            past_key_values=past_key_values,
+        )
+# Causal language-modeling wrapper: runs `HrmTextModel` and projects its final hidden states to
+# vocabulary logits with a bias-free linear head, optionally computing a loss from `labels`.
+@auto_docstring
+class HrmTextForCausalLM(HrmTextPreTrainedModel, GenerationMixin):
+    # Class-level declarations read by the base classes (their handling is not shown in this file
+    # section): `lm_head.weight` is declared as tied to `model.embed_tokens.weight`, followed by
+    # the tensor-parallel and pipeline-parallel plans for `lm_head`.
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HrmTextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
+            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
+            form a single bidirectional block; all other positions are causal.
+        """
+        # All inputs except `labels` and `logits_to_keep` are forwarded to the backbone unchanged.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # An int `k` keeps the last `k` sequence positions; the default 0 gives `slice(0, None)`,
+        # i.e. every position. Any non-int value is used directly as the index on the sequence axis.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        # The loss is only computed when labels are supplied; `**kwargs` is passed through to it.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    # Mask-construction hook that builds the same keyword set as `HrmTextModel.forward` and adds
+    # `block_sequence_ids` under the same conditions. Here `is_first_iteration` is supplied by the
+    # caller instead of being derived from the cache. Extra `**kwargs` are accepted but not forwarded.
+    @staticmethod
+    def create_masks_for_generate(
+        config: PreTrainedConfig,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None = None,
+        is_first_iteration: bool | None = False,
+        **kwargs,
+    ) -> dict:
+        mask_kwargs = {
+            "config": config,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # Positions with `token_type_ids == 1` get block id 0; every other position gets -1.
+        if token_type_ids is not None and is_first_iteration:
+            if config.prefix_lm:
+                mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
+            else:
+                logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
+        # Not a recursive call: names defined in a class body are not visible inside its methods,
+        # so this resolves to the module-level `create_masks_for_generate` (presumably imported
+        # at the top of the file - confirm).
+        return create_masks_for_generate(**mask_kwargs)
+# Public names exported by a star-import of this module.
+__all__ = ["HrmTextForCausalLM", "HrmTextModel", "HrmTextPreTrainedModel"]

quantization_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "bits": 4,
+  "act_bits": 4,
+  "data_type": "mx_fp",
+  "act_data_type": "mx_fp",
+  "group_size": 32,
+  "act_group_size": 32,
+  "sym": true,
+  "act_sym": true,
+  "act_dynamic": true,
+  "iters": 0,
+  "low_gpu_mem_usage": true,
+  "autoround_version": "0.12.3",
+  "quant_method": "auto-round",
+  "packing_format": "auto_round:llm_compressor"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_prefix_space": null,
+  "backend": "tokenizers",
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|box_end|>",
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}