Switch to native transformers hrm_text support

transformers 5.9.0 ships native HrmTextForCausalLM. Drop the custom
modeling code and trust_remote_code path: bump install requirement to
>=5.9.0, remove auto_map from config, and delete the Python sources.

Files changed (5)

README.md +2 -3
__init__.py +0 -15
config.json +1 -6
configuration_hrm_text.py +0 -146
modeling_hrm_text.py +0 -644

README.md CHANGED Viewed

@@ -46,10 +46,10 @@ The four single condition tags and their assigned tokenizer special tokens (toke
 ## Requirements
-Use a Transformers build that includes the `hrm_text` model class. If your installed release does not include it yet, install Transformers directly from the upstream `main` branch:
 ```bash
-pip install --upgrade "git+https://github.com/huggingface/transformers.git@main"
 ```
 ## Model details
@@ -85,7 +85,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True,
 ).cuda().eval()
 # synth,cot composite — reasoning / CoT style (see Disclaimer for other modes)

 ## Requirements
+Requires `transformers >= 5.9.0`, which ships native support for the `hrm_text` model class:
 ```bash
+pip install --upgrade "transformers>=5.9.0"
 ```
 ## Model details
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     dtype=torch.bfloat16,
 ).cuda().eval()
 # synth,cot composite — reasoning / CoT style (see Disclaimer for other modes)

__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .configuration_hrm_text import *
-from .modeling_hrm_text import *

config.json CHANGED Viewed

@@ -25,10 +25,5 @@
   "prefix_lm": true,
   "pad_token_id": 5,
   "bos_token_id": 6,
-  "eos_token_id": 11,
-  "auto_map": {
-    "AutoConfig": "configuration_hrm_text.HrmTextConfig",
-    "AutoModel": "modeling_hrm_text.HrmTextModel",
-    "AutoModelForCausalLM": "modeling_hrm_text.HrmTextForCausalLM"
-  }
 }

   "prefix_lm": true,
   "pad_token_id": 5,
   "bos_token_id": 6,
+  "eos_token_id": 11
 }

configuration_hrm_text.py DELETED Viewed

@@ -1,146 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_hrm_text.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from huggingface_hub.dataclasses import strict
-from transformers.configuration_utils import PreTrainedConfig
-from transformers.modeling_rope_utils import RopeParameters
-from transformers.utils import auto_docstring
-from transformers.utils.generic import is_flash_attention_requested, split_attention_implementation
-from transformers.utils.type_validators import interval
-@auto_docstring(checkpoint="sapientinc/HRM-Text-1B")
-@strict
-class HrmTextConfig(PreTrainedConfig):
-    r"""
-    H_cycles (`int`, *optional*, defaults to 2):
-        Number of high-level cycles.
-    L_cycles (`int`, *optional*, defaults to 3):
-        Number of low-level cycles per H-cycle.
-    L_bp_cycles (`list[int]`, *optional*, defaults to `[2]`):
-        Training-time gradient-routing list; left-padded with `1`s up to `H_cycles` entries (one per H-cycle) inside the model.
-        Inference-time no-op.
-    embedding_scale (`float`, *optional*):
-        Token-embedding multiplier. If `None`, defaults to `1 / initializer_range`.
-    prefix_lm (`bool`, *optional*, defaults to `True`):
-        Instruction tokens attend bidirectionally, response tokens attend causally.
-    num_layers_per_stack (`int`, *optional*):
-        Real number of transformer blocks inside each
-        of the H / L stacks. Set automatically on first construction: the value passed as
-        `num_hidden_layers` is remembered here and `num_hidden_layers` is then rewritten to
-        `num_layers_per_stack * H_cycles * (L_cycles + 1)` so that
-        `DynamicCache(config=...)` pre-allocates one slot per unique attention invocation
-        under the recurrent forward. Do not set this directly on first construction — pass
-        the real per-stack count as `num_hidden_layers` and let `__post_init__` split it.
-    """
-    model_type = "hrm_text"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    base_model_tp_plan = {
-        **{f"{stack}.layers.*.self_attn.q_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.self_attn.k_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.self_attn.v_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.self_attn.gate_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.self_attn.o_proj": "rowwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.mlp.gate_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.mlp.up_proj": "colwise" for stack in ("L_module", "H_module")},
-        **{f"{stack}.layers.*.mlp.down_proj": "rowwise" for stack in ("L_module", "H_module")},
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-    vocab_size: int = 151808
-    hidden_size: int = 1536
-    intermediate_size: int = 4096
-    num_hidden_layers: int = 16
-    num_attention_heads: int = 12
-    hidden_act: str = "silu"
-    max_position_embeddings: int = 2048
-    initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
-    rms_norm_eps: float = 1e-6
-    use_cache: bool = True
-    pad_token_id: int | None = None
-    bos_token_id: int | None = None
-    eos_token_id: int | list[int] | None = None
-    tie_word_embeddings: bool = False
-    rope_parameters: RopeParameters | dict | None = None
-    attention_bias: bool = False
-    attention_dropout: int | float | None = 0.0
-    mlp_bias: bool = False
-    head_dim: int = 128
-    H_cycles: int = 2
-    L_cycles: int = 3
-    L_bp_cycles: list[int] | None = None
-    embedding_scale: float | None = None
-    prefix_lm: bool = True
-    num_layers_per_stack: int | None = None  # Usually inferred in post init
-    def __post_init__(self, **kwargs):
-        if self.L_bp_cycles is None:
-            # Default `[2]` = backprop only the last 2 L-iterations per H-cycle (training-time
-            # gradient-routing knob). Left-padding with `1`s to length `H_cycles` is performed inside
-            # [`HrmTextModel`] since the padded list is indexed by H-cycle.
-            self.L_bp_cycles = [2]
-        if self.embedding_scale is None:
-            self.embedding_scale = 1.0 / self.initializer_range
-        if self.num_layers_per_stack is None:
-            # Initial construction, or legacy checkpoint where `num_hidden_layers` carries the
-            # real per-stack count: remember that value and rewrite `num_hidden_layers` to the
-            # inflated total, so standard HF cache allocation gives us one slot per unique
-            # attention invocation. Serialised configs round-trip as (inflated, real) pairs.
-            self.num_layers_per_stack = self.num_hidden_layers
-            self.num_hidden_layers = self.num_layers_per_stack * self.H_cycles * (self.L_cycles + 1)
-        super().__post_init__(**kwargs)
-    def validate_architecture(self):
-        """Part of `@strict`-powered validation. Validates the architecture of the config."""
-        if self.hidden_size % self.num_attention_heads != 0:
-            raise ValueError(
-                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
-                f"heads ({self.num_attention_heads})."
-            )
-    @property
-    def _attn_implementation(self):
-        return self._attn_implementation_internal
-    @_attn_implementation.setter
-    def _attn_implementation(self, value: str | dict | None):
-        if value is not None and self.prefix_lm:
-            _, base_implementation = split_attention_implementation(value)
-            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
-                raise ValueError(
-                    f"`attn_implementation={value!r}` is not supported when "
-                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
-                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
-                )
-        PreTrainedConfig._attn_implementation.__set__(self, value)
-__all__ = ["HrmTextConfig"]

modeling_hrm_text.py DELETED Viewed

@@ -1,644 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_hrm_text.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from collections.abc import Callable
-from contextlib import nullcontext
-from typing import Optional
-import torch
-from torch import nn
-from transformers import initialization as init
-from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.configuration_utils import PreTrainedConfig
-from transformers.generation import GenerationMixin
-from transformers.integrations import use_kernel_func_from_hub, use_kernelized_func
-from transformers.masking_utils import create_causal_mask, create_masks_for_generate
-from transformers.modeling_layers import GradientCheckpointingLayer
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from transformers.processing_utils import Unpack
-from transformers.utils import auto_docstring, can_return_tuple, logging
-from transformers.utils.generic import (
-    TransformersKwargs,
-    is_flash_attention_requested,
-    maybe_autocast,
-    merge_with_config_defaults,
-    split_attention_implementation,
-)
-from transformers.utils.output_capturing import capture_outputs
-from .configuration_hrm_text import HrmTextConfig
-logger = logging.get_logger(__name__)
-class HrmTextRMSNorm(torch.nn.Module):
-    def __init__(self, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-    def forward(self, x):
-        return self._norm(x.float()).type_as(x)
-    def extra_repr(self):
-        return f"eps={self.eps}"
-class HrmTextMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
-        self.act_fn = ACT2FN[config.hidden_act]
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-@use_kernel_func_from_hub("rotary_pos_emb")
-def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: torch.Tensor | None,
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        attn_weights = attn_weights + attention_mask
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-    return attn_output, attn_weights
-@use_kernelized_func(apply_rotary_pos_emb)
-class HrmTextAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-    def __init__(self, config: HrmTextConfig, layer_idx: int):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = 1  # Uses MHA instead of GQA
-        self.scaling = self.head_dim**-0.5
-        self.attention_dropout = config.attention_dropout
-        self.is_causal = True
-        self.q_proj = nn.Linear(
-            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.k_proj = nn.Linear(
-            config.hidden_size,
-            config.num_attention_heads * self.head_dim,
-            bias=config.attention_bias,
-        )
-        self.v_proj = nn.Linear(
-            config.hidden_size,
-            config.num_attention_heads * self.head_dim,
-            bias=config.attention_bias,
-        )
-        self.o_proj = nn.Linear(
-            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
-        )
-        # Additional sigmoid gate applied at the end
-        self.gate_proj = nn.Linear(
-            config.hidden_size,
-            config.num_attention_heads * self.head_dim,
-            bias=config.attention_bias,
-        )
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-        attention_mask: torch.Tensor | None = None,
-        past_key_values: Cache | None = None,
-        cycle_offset: int = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        gate_states = self.gate_proj(hidden_states).view(hidden_shape)
-        cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-        if past_key_values is not None:
-            # Adjust the cache slot by `cycle_offset`, which is determined by the current recurrent step through the stacks
-            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx + cycle_offset)
-        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
-            self.config._attn_implementation, eager_attention_forward
-        )
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            scaling=self.scaling,
-            **kwargs,
-        )
-        # Additional sigmoid gating (similar to Qwen3Next)
-        attn_output = torch.sigmoid(gate_states) * attn_output
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
-class HrmTextDecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: HrmTextConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.self_attn = HrmTextAttention(config=config, layer_idx=layer_idx)
-        self.mlp = HrmTextMLP(config)
-        self.input_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)
-        self.post_attention_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        use_cache: bool | None = False,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-        # Self Attention
-        hidden_states, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-        return hidden_states
-class HrmTextStack(nn.Module):
-    """A single transformer stack — instantiated twice inside `HrmTextModel`, once as the H module and once as the L module."""
-    def __init__(self, config: HrmTextConfig):
-        super().__init__()
-        self.layers = nn.ModuleList(
-            [HrmTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_layers_per_stack)]
-        )
-        self.final_norm = HrmTextRMSNorm(eps=config.rms_norm_eps)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor | None = None,
-        past_key_values: Cache | None = None,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
-        cycle_offset: int = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.Tensor:
-        for layer in self.layers:
-            hidden_states = layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-                position_embeddings=position_embeddings,
-                cycle_offset=cycle_offset,
-                **kwargs,
-            )
-        return self.final_norm(hidden_states)
-@auto_docstring
-class HrmTextPreTrainedModel(PreTrainedModel):
-    config: HrmTextConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["HrmTextDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": HrmTextDecoderLayer,
-        "attentions": HrmTextAttention,
-    }
-    def _check_and_adjust_attn_implementation(
-        self, attn_implementation: str | None, is_init_check: bool = False, allow_all_kernels: bool = False
-    ) -> str:
-        if attn_implementation is not None and self.config.prefix_lm:
-            _, base_implementation = split_attention_implementation(attn_implementation)
-            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
-                raise ValueError(
-                    f"`attn_implementation={attn_implementation!r}` is not supported when "
-                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
-                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
-                )
-        return super()._check_and_adjust_attn_implementation(attn_implementation, is_init_check, allow_all_kernels)
-    @torch.no_grad()
-    def _init_weights(self, module):
-        super()._init_weights(module)
-        if isinstance(module, HrmTextModel):
-            init.zeros_(module.z_L_init)
-            # `z_L_init` is the frozen low-cycle initial state and never trains.
-            module.z_L_init.requires_grad_(False)  # trf-ignore: TRF012
-class HrmTextRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-    def __init__(self, config: HrmTextConfig, device=None):
-        super().__init__()
-        self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len = config.max_position_embeddings
-        self.config = config
-        self.rope_type = self.config.rope_parameters["rope_type"]
-        rope_init_fn: Callable = self.compute_default_rope_parameters
-        if self.rope_type != "default":
-            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
-    @staticmethod
-    def compute_default_rope_parameters(
-        config: HrmTextConfig | None = None,
-        device: Optional["torch.device"] = None,
-        seq_len: int | None = None,
-    ) -> tuple["torch.Tensor", float]:
-        """
-        Computes the inverse frequencies according to the original RoPE implementation
-        Args:
-            config ([`~transformers.PreTrainedConfig`]):
-                The model configuration.
-            device (`torch.device`):
-                The device to use for initialization of the inverse frequencies.
-            seq_len (`int`, *optional*):
-                The current sequence length. Unused for this type of RoPE.
-        Returns:
-            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
-            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
-        """
-        base = config.rope_parameters["rope_theta"]
-        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
-        attention_factor = 1.0  # Unused in this type of RoPE
-        # Compute the inverse frequencies
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
-        )
-        return inv_freq, attention_factor
-    @torch.no_grad()
-    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-    def forward(self, x, position_ids):
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
-        position_ids_expanded = position_ids[:, None, :].float()
-        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos() * self.attention_scaling
-            sin = emb.sin() * self.attention_scaling
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-@auto_docstring
-class HrmTextModel(HrmTextPreTrainedModel):
-    def __init__(self, config: HrmTextConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.rotary_emb = HrmTextRotaryEmbedding(config=config)
-        self.gradient_checkpointing = False
-        self.embedding_scale = config.embedding_scale
-        # Recursive module structures
-        self.L_module = HrmTextStack(config)
-        self.H_module = HrmTextStack(config)
-        # Initial state for the low cycle module
-        self.z_L_init = nn.Parameter(torch.zeros(config.hidden_size), requires_grad=False)
-        raw_bp = list(config.L_bp_cycles)
-        self.L_bp_cycles_padded = [1] * max(0, config.H_cycles - len(raw_bp)) + raw_bp
-        # Initialize weights and apply final processing
-        self.post_init()
-    @merge_with_config_defaults
-    @capture_outputs
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        token_type_ids: torch.LongTensor | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        use_cache: bool | None = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> BaseModelOutputWithPast:
-        r"""
-        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
-            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
-            form a single bidirectional block; all other positions are causal.
-        """
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        # Additional scaling on the input embeds
-        inputs_embeds = inputs_embeds * self.embedding_scale
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache(config=self.config)
-        if position_ids is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
-            position_ids = position_ids.unsqueeze(0)
-        # Create mask with optional prefix-based bidirectionality
-        mask_kwargs = {
-            "config": self.config,
-            "inputs_embeds": inputs_embeds,
-            "attention_mask": attention_mask,
-            "past_key_values": past_key_values,
-            "position_ids": position_ids,
-        }
-        is_first_iteration = past_key_values is None or not past_key_values.is_initialized
-        if token_type_ids is not None and is_first_iteration:
-            if self.config.prefix_lm:
-                mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
-            else:
-                logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
-        attention_mask = create_causal_mask(**mask_kwargs)
-        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
-        # Hierarchical (H/L)-cycle recurrence
-        #
-        # `z_H` - slow / high-level state
-        hidden_states_high_cycle = inputs_embeds
-        # `z_L` - fast / low-level state
-        hidden_states_low_cycle = (
-            self.z_L_init.to(dtype=hidden_states_high_cycle.dtype, device=hidden_states_high_cycle.device)
-            .expand_as(hidden_states_high_cycle)
-            .contiguous()
-        )
-        # Cache-slot layout under the recurrent forward:
-        #
-        #   slot(h, l, layer)   = (h * (L_cycles + 1) + l) * num_layers_per_stack + layer
-        #                                                       ^— L-stack invocation at (h, l)
-        #   slot(h, H, layer)   = (h * (L_cycles + 1) + L_cycles) * num_layers_per_stack + layer
-        #                                                       ^— trailing H-stack invocation
-        #
-        # That totals `num_layers_per_stack * H_cycles * (L_cycles + 1)` slots, i.e. the `config.num_hidden_layers`.
-        num_layers_per_stack = self.config.num_layers_per_stack
-        for high_cycle_idx in range(self.config.H_cycles):
-            # `L_bp_cycles` k-step grad trick: only the trailing `num_grad_iterations` of the
-            # `L_cycles` inner iterations propagate gradients; earlier iterations run under
-            # `torch.no_grad()` to bound activation memory.
-            num_grad_iterations = (
-                self.L_bp_cycles_padded[high_cycle_idx] if high_cycle_idx < len(self.L_bp_cycles_padded) else 1
-            )
-            grad_threshold = self.config.L_cycles - num_grad_iterations
-            for low_cycle_idx in range(self.config.L_cycles):
-                cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + low_cycle_idx) * num_layers_per_stack
-                ctx = nullcontext() if low_cycle_idx >= grad_threshold else torch.no_grad()
-                with ctx:
-                    hidden_states_low_cycle = self.L_module(
-                        hidden_states_low_cycle.to(hidden_states_high_cycle.device) + hidden_states_high_cycle,
-                        attention_mask=attention_mask,
-                        past_key_values=past_key_values,
-                        position_embeddings=position_embeddings,
-                        position_ids=position_ids,
-                        cycle_offset=cycle_offset,
-                        **kwargs,
-                    )
-            cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + self.config.L_cycles) * num_layers_per_stack
-            hidden_states_high_cycle = self.H_module(
-                hidden_states_high_cycle + hidden_states_low_cycle.to(hidden_states_high_cycle.device),
-                attention_mask=attention_mask,
-                past_key_values=past_key_values,
-                position_embeddings=position_embeddings,
-                position_ids=position_ids,
-                cycle_offset=cycle_offset,
-                **kwargs,
-            )
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states_high_cycle,
-            past_key_values=past_key_values,
-        )
-@auto_docstring
-class HrmTextForCausalLM(HrmTextPreTrainedModel, GenerationMixin):
-    # Causal-LM wrapper: an `HrmTextModel` backbone followed by a bias-free linear projection
-    # from `config.hidden_size` to `config.vocab_size`.
-    # (Kept as `#` comments rather than a class docstring so the text seen by `@auto_docstring` is unchanged.)
-    #
-    # Declares `lm_head.weight` -> `model.embed_tokens.weight` as the tied-weight mapping.
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    # Tensor-parallel / pipeline-parallel plans for the projection head.
-    _tp_plan = {"lm_head": "colwise_gather_output"}
-    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
-    def __init__(self, config):
-        super().__init__(config)
-        self.vocab_size = config.vocab_size
-        # Backbone is registered before the head, as in the original module order.
-        self.model = HrmTextModel(config)
-        self.lm_head = nn.Linear(
-            in_features=config.hidden_size,
-            out_features=config.vocab_size,
-            bias=False,
-        )
-        # Initialize weights and apply final processing
-        self.post_init()
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: torch.LongTensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: Cache | None = None,
-        token_type_ids: torch.LongTensor | None = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        labels: torch.LongTensor | None = None,
-        use_cache: bool | None = None,
-        logits_to_keep: int | torch.Tensor = 0,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> CausalLMOutputWithPast:
-        r"""
-        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
-            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
-            form a single bidirectional block; all other positions are causal.
-        """
-        # Run the backbone. `labels` and `logits_to_keep` are consumed in this method and
-        # are not forwarded; the remaining `**kwargs` go to both the backbone and the loss.
-        backbone_out: BaseModelOutputWithPast = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            token_type_ids=token_type_ids,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            **kwargs,
-        )
-        # An int selects the trailing `logits_to_keep` positions (`0` yields `slice(0, None)`,
-        # i.e. every position); any non-int value is used directly as the sequence index.
-        if isinstance(logits_to_keep, int):
-            kept_positions = slice(-logits_to_keep, None)
-        else:
-            kept_positions = logits_to_keep
-        logits = self.lm_head(backbone_out.last_hidden_state[:, kept_positions, :])
-        # The loss is only computed when labels are supplied.
-        loss = (
-            self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
-            if labels is not None
-            else None
-        )
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=backbone_out.past_key_values,
-            hidden_states=backbone_out.hidden_states,
-            attentions=backbone_out.attentions,
-        )
-    @staticmethod
-    def create_masks_for_generate(
-        config: PreTrainedConfig,
-        inputs_embeds: torch.Tensor,
-        attention_mask: torch.Tensor | None,
-        past_key_values: Cache | None,
-        position_ids: torch.Tensor | None,
-        token_type_ids: torch.Tensor | None = None,
-        is_first_iteration: bool | None = False,
-        **kwargs,
-    ) -> dict:
-        # Assembles the mask arguments and delegates to the mask builder, optionally adding
-        # `block_sequence_ids` derived from `token_type_ids`. Extra `**kwargs` are accepted
-        # but not used.
-        mask_kwargs = dict(
-            config=config,
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-        )
-        # `token_type_ids` is only considered when it is given and this is the first iteration.
-        wants_prefix_block = token_type_ids is not None and bool(is_first_iteration)
-        if wants_prefix_block and config.prefix_lm:
-            # Positions with `token_type_ids == 1` get block id 0; all other positions get -1.
-            mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
-        elif wants_prefix_block:
-            logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
-        # Inside a method body this name resolves to the module-level `create_masks_for_generate`
-        # (class scope is not searched), so this is delegation, not recursion.
-        return create_masks_for_generate(**mask_kwargs)
-__all__ = ["HrmTextForCausalLM", "HrmTextModel", "HrmTextPreTrainedModel"]