HRM-Text-1B

Browse files

Files changed (12) hide show

.gitattributes +37 -0
LICENSE +202 -0
README.md +147 -0
__init__.py +15 -0
banner.jpg +3 -0
benchmark_scatter.png +3 -0
config.json +33 -0
configuration_hrm_text.py +146 -0
model.safetensors +3 -0
modeling_hrm_text.py +644 -0
tokenizer.json +0 -0
tokenizer_config.json +12 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+banner.jpg filter=lfs diff=lfs merge=lfs -text
+benchmark_scatter.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,147 @@

+---
+license: apache-2.0
+language:
+- en
+library_name: transformers
+pipeline_tag: text-generation
+tags:
+- hrm
+- hierarchical-reasoning
+- prefix-lm
+- base-model
+---
+![HRM-Text banner](banner.jpg)
+![Benchmark scatter: FLOPs and tokens vs benchmark average for HRM-Text-1B vs comparable models](benchmark_scatter.png)
+<p align="center">
+  <a href="https://github.com/sapientinc/HRM-Text"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-sapientinc%2FHRM--Text-181717?logo=github&logoColor=white"></a>
+</p>
+# HRM-Text-1B
+A 1B-parameter base language model built on the **Hierarchical Reasoning Model (HRM)** architecture, trained from scratch on a curated text corpus by Sapient Intelligence.
+HRM is a dual-timescale recurrent architecture: two Transformer modules (H = high-level / slow, L = low-level / fast) iterate over the same input embeddings for `H_cycles × L_cycles` steps, with additive state injection (`z_L + z_H`). This gives effectively unbounded compute depth at bounded parameter count.
+## Disclaimer
+This is a **base** model. It is pre-trained on a PrefixLM objective with condition prefix tokens and has **not** been instruction-tuned, RLHF'd, or otherwise post-trained. For any serious downstream use we recommend post-training (SFT and/or RL) on task-specific data; the base checkpoint is meant as a starting point, not a finished assistant.
+Practical guidance for prompting the raw base model:
+- **NLP tasks (classification, extraction, structured output, short-form QA)**: use the `direct` condition with 2–8 few-shot in-context examples. `direct` + few-shot is the strongest zero-extra-training setup we have measured; pure zero-shot is noticeably weaker.
+- **Reasoning / math / open-ended generation**: use the **composite condition** `synth,cot`. This is *one* composite prefix, not two alternatives — at tokenization time the comma-separated tags are mapped to their prefix tokens and concatenated, in order, into a single prefix block. So `synth,cot` produces the two-token prefix `<|quad_end|><|object_ref_end|>` (synth first, then cot), wrapped in the usual `<|im_start|>` … `<|im_end|>` envelope. Under this composite the model exhibits some chain-of-thought / instruct-like behavior — enough to answer many zero-shot math and reasoning prompts in a step-by-step style — but quality is uneven and below an instruction-tuned model of comparable size. Treat this "instruct" ability as a side effect of the pre-training mix, not a guaranteed capability.
+The four single tags and their prefix tokens (for reference; you can compose any subset, comma-separated, in the order you want them emitted):
+- `direct` → `<|object_ref_start|>` — direct answer, no CoT
+- `cot` → `<|object_ref_end|>` — chain-of-thought
+- `noisy` → `<|quad_start|>` — noisy / web-crawl style
+- `synth` → `<|quad_end|>` — synthetic / curated style
+## Requirements
+The `hrm_text` model class has been merged into Transformers `main`. The PyPI release containing it may still be in flight; until then, install Transformers directly from the upstream `main` branch:
+```bash
+pip install --upgrade "git+https://github.com/huggingface/transformers.git@main"
+```
+## Model details
+| Field | Value |
+|---|---|
+| Parameters | ~1 B |
+| Hidden size | 1536 |
+| Layers (per H / L stack) | 16 |
+| Attention heads | 12 (MHA, head_dim 128) |
+| H_cycles × L_cycles | 2 × 3 |
+| Max sequence length | 4096 |
+| Vocabulary | 65,536 |
+| Embedding | Scaled (lecun_normal) |
+| Position encoding | RoPE (theta 10000) |
+| Activation | SwiGLU |
+| Normalization | Parameterless Pre-RMSNorm |
+| Attention | Gated (sigmoid output gate) |
+| Training unique tokens | 40 B |
+| Optimizer | AdamATan2 (beta 0.9 / 0.95, wd 0.1, EMA 0.9999) |
+| LR | 2.2e-4 (warmup 2000 steps) |
+| Global batch | 196,608 tokens |
+| dtype | bfloat16 |
+## Usage
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+model_id = "sapientinc/HRM-Text-1B"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True,
+).cuda().eval()
+# synth,cot composite — reasoning / CoT style (see Disclaimer for other modes)
+condition = "<|quad_end|><|object_ref_end|>"
+prompt = f"<|im_start|>{condition}Explain why the sky is blue.<|im_end|>"
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+# Mark the prompt as a single bidirectional prefix block — see "PrefixLM mask" below.
+inputs["token_type_ids"] = torch.ones_like(inputs["input_ids"])
+with torch.no_grad():
+    out = model.generate(**inputs, max_new_tokens=256, do_sample=False)
+print(tokenizer.decode(out[0], skip_special_tokens=False))
+```
+### PrefixLM mask — pass `token_type_ids`
+HRM-Text was pre-trained with a PrefixLM mask: prompt tokens attend bidirectionally to each other, response tokens attend causally. To match the training-time forward at inference you must tell the model which positions are prefix.
+In the current Transformers port the mask is controlled by `token_type_ids`:
+- `token_type_ids[i] == 1` → position `i` is part of the prefix block (bidirectional within the block).
+- otherwise → causal.
+If you omit `token_type_ids`, attention falls back to **pure causal**, which does **not** match the pre-training distribution and will give noticeably worse logits. The simplest correct call passes `token_type_ids = torch.ones_like(input_ids)`, marking the entire input prompt as one bidirectional prefix block — exactly how training-time prefill ran.
+## Architecture
+The recurrent core (per forward pass, in inference mode):
+```
+z_H = embed(input_ids) * embedding_scale
+z_L = z_L_init.expand_as(z_H)
+for _ in range(H_cycles):
+    for _ in range(L_cycles):
+        z_L = L_module(z_L + z_H)
+    z_H = H_module(z_H + z_L)
+return z_H
+```
+Both stacks share the same Transformer block design (gated attention, RoPE, SwiGLU, pre-RMSNorm); see Model details above for shapes.
+## Training data
+Pre-trained on a sampled mixture of publicly available text corpora. The full dataset composition, sampling weights, and preprocessing pipeline are open-sourced:
+<p align="center">
+  <a href="https://github.com/sapientinc/data_io"><img alt="data_io" src="https://img.shields.io/badge/GitHub-sapientinc%2Fdata__io-181717?logo=github&logoColor=white"></a>
+</p>
+## Limitations
+- English only: the training corpus is predominantly English, so other languages are not supported.
+- Outputs may be inaccurate, biased, or unsafe.
+## License
+[Apache License 2.0](LICENSE).
+## Citation
+Citation information will be added with the accompanying paper.

__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .configuration_hrm_text import *
+from .modeling_hrm_text import *

banner.jpg ADDED Viewed

Git LFS Details

SHA256: 4ec52ca7bc19373cbf999451bf79c55c3ff09c3d1fd24a46c2f467b852abf420
Pointer size: 131 Bytes
Size of remote file: 516 kB

benchmark_scatter.png ADDED Viewed

Git LFS Details

SHA256: f0eddfa5c28e0069bfd0b2d50a8c9e4b2a7c720144f4ea9abb38105f77e2ba20
Pointer size: 131 Bytes
Size of remote file: 413 kB

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "model_type": "hrm_text",
+  "architectures": [
+    "HrmTextForCausalLM"
+  ],
+  "vocab_size": 65536,
+  "hidden_size": 1536,
+  "intermediate_size": 4096,
+  "num_hidden_layers": 16,
+  "num_attention_heads": 12,
+  "num_key_value_heads": 12,
+  "head_dim": 128,
+  "H_cycles": 2,
+  "L_cycles": 3,
+  "L_bp_cycles": [
+    2
+  ],
+  "max_position_embeddings": 4096,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "initializer_range": 0.025515518153991442,
+  "embedding_scale": 39.191835884530846,
+  "prefix_lm": true,
+  "pad_token_id": 5,
+  "bos_token_id": 6,
+  "eos_token_id": 11,
+  "auto_map": {
+    "AutoConfig": "configuration_hrm_text.HrmTextConfig",
+    "AutoModel": "modeling_hrm_text.HrmTextModel",
+    "AutoModelForCausalLM": "modeling_hrm_text.HrmTextForCausalLM"
+  }
+}

configuration_hrm_text.py ADDED Viewed

	@@ -0,0 +1,146 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hrm_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from huggingface_hub.dataclasses import strict
+from transformers.configuration_utils import PreTrainedConfig
+from transformers.modeling_rope_utils import RopeParameters
+from transformers.utils import auto_docstring
+from transformers.utils.generic import is_flash_attention_requested, split_attention_implementation
+from transformers.utils.type_validators import interval
+# Configuration for HRM-Text models (`model_type = "hrm_text"`). Declares the architecture
+# hyper-parameters as typed fields, fills in derived defaults in `__post_init__`, and rejects
+# FlashAttention backends whenever `prefix_lm` is enabled.
+@auto_docstring(checkpoint="sapientinc/HRM-Text-1B")
+@strict
+class HrmTextConfig(PreTrainedConfig):
+    r"""
+    H_cycles (`int`, *optional*, defaults to 2):
+        Number of high-level cycles.
+    L_cycles (`int`, *optional*, defaults to 3):
+        Number of low-level cycles per H-cycle.
+    L_bp_cycles (`list[int]`, *optional*, defaults to `[2]`):
+        Training-time gradient-routing list; left-padded with `1`s up to `L_cycles` inside the model.
+        Inference-time no-op.
+    embedding_scale (`float`, *optional*):
+        Token-embedding multiplier. If `None`, defaults to `1 / initializer_range`.
+    prefix_lm (`bool`, *optional*, defaults to `True`):
+        Instruction tokens attend bidirectionally, response tokens attend causally.
+    num_layers_per_stack (`int`, *optional*):
+        Real number of transformer blocks inside each
+        of the H / L stacks. Set automatically on first construction: the value passed as
+        `num_hidden_layers` is remembered here and `num_hidden_layers` is then rewritten to
+        `num_layers_per_stack * H_cycles * (L_cycles + 1)` so that
+        `DynamicCache(config=...)` pre-allocates one slot per unique attention invocation
+        under the recurrent forward. Do not set this directly on first construction — pass
+        the real per-stack count as `num_hidden_layers` and let `__post_init__` split it.
+    """
+    model_type = "hrm_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Tensor-parallel plan, built for both the `L_module` and `H_module` stacks: the q/k/v/gate
+    # attention projections and the MLP gate/up projections are "colwise"; the attention `o_proj`
+    # and the MLP `down_proj` are "rowwise".
+    base_model_tp_plan = {
+        **{f"{stack}.layers.*.self_attn.q_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.k_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.v_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.gate_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.self_attn.o_proj": "rowwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.gate_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.up_proj": "colwise" for stack in ("L_module", "H_module")},
+        **{f"{stack}.layers.*.mlp.down_proj": "rowwise" for stack in ("L_module", "H_module")},
+    }
+    # Pipeline-parallel plan: module name -> (input names, output names).
+    # NOTE(review): this is keyed on a top-level `layers` module, while the TP plan above
+    # addresses `L_module` / `H_module` stacks — confirm it matches the model's module names.
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    # Class-level defaults. They are not the released checkpoint's values: its config.json
+    # overrides several of them (e.g. `vocab_size` and `max_position_embeddings`).
+    vocab_size: int = 151808
+    hidden_size: int = 1536
+    intermediate_size: int = 4096
+    # On input this is the per-stack block count; `__post_init__` rewrites it to the inflated total.
+    num_hidden_layers: int = 16
+    num_attention_heads: int = 12
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 2048
+    initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
+    rms_norm_eps: float = 1e-6
+    use_cache: bool = True
+    pad_token_id: int | None = None
+    bos_token_id: int | None = None
+    eos_token_id: int | list[int] | None = None
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    attention_bias: bool = False
+    attention_dropout: int | float | None = 0.0
+    mlp_bias: bool = False
+    head_dim: int = 128
+    # HRM-specific fields; the three `None` defaults below are resolved in `__post_init__`.
+    H_cycles: int = 2
+    L_cycles: int = 3
+    L_bp_cycles: list[int] | None = None
+    embedding_scale: float | None = None
+    prefix_lm: bool = True
+    num_layers_per_stack: int | None = None  # Usually inferred in post init
+    def __post_init__(self, **kwargs):
+        """Resolve the `None` defaults and split `num_hidden_layers`, then defer to the base class.
+
+        Fills `L_bp_cycles` and `embedding_scale` when they were not provided and, when
+        `num_layers_per_stack` is unset, moves the incoming `num_hidden_layers` into it and
+        replaces `num_hidden_layers` with the inflated total. `kwargs` are forwarded unchanged
+        to `super().__post_init__`.
+        """
+        if self.L_bp_cycles is None:
+            # Default `[2]` = backprop only the last 2 L-iterations per H-cycle (training-time
+            # gradient-routing knob). Left-padding to length `L_cycles` is performed inside
+            # [`HrmTextModel`] since it depends on `L_cycles`.
+            self.L_bp_cycles = [2]
+        if self.embedding_scale is None:
+            # NOTE(review): `initializer_range` is declared with `interval(min=0.0, ...)`; if 0.0
+            # passes that validator this division raises ZeroDivisionError — confirm whether the
+            # lower bound is inclusive.
+            self.embedding_scale = 1.0 / self.initializer_range
+        if self.num_layers_per_stack is None:
+            # Initial construction, or legacy checkpoint where `num_hidden_layers` carries the
+            # real per-stack count: remember that value and rewrite `num_hidden_layers` to the
+            # inflated total, so standard HF cache allocation gives us one slot per unique
+            # attention invocation. Serialised configs round-trip as (inflated, real) pairs.
+            self.num_layers_per_stack = self.num_hidden_layers
+            # When `num_layers_per_stack` is already set (a re-loaded serialised config), this
+            # branch is skipped, so the inflation is applied only once.
+            self.num_hidden_layers = self.num_layers_per_stack * self.H_cycles * (self.L_cycles + 1)
+        super().__post_init__(**kwargs)
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config."""
+        # Raises ValueError when `hidden_size` is not an exact multiple of `num_attention_heads`.
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({self.num_attention_heads})."
+            )
+    # Re-declared so the setter below can be attached to it; reads the stored
+    # `_attn_implementation_internal` value.
+    @property
+    def _attn_implementation(self):
+        return self._attn_implementation_internal
+    @_attn_implementation.setter
+    def _attn_implementation(self, value: str | dict | None):
+        """Store the attention implementation, refusing FlashAttention when `prefix_lm` is on.
+
+        Raises:
+            ValueError: if `value` is not `None`, `self.prefix_lm` is true, and the base
+                implementation extracted from `value` is a FlashAttention request.
+        """
+        if value is not None and self.prefix_lm:
+            # Only the second element of the split (the base implementation) is inspected.
+            _, base_implementation = split_attention_implementation(value)
+            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
+                raise ValueError(
+                    f"`attn_implementation={value!r}` is not supported when "
+                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
+                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
+                )
+        # Delegate the actual assignment to the base-class descriptor for this attribute.
+        PreTrainedConfig._attn_implementation.__set__(self, value)
+__all__ = ["HrmTextConfig"]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8fe2b2bf6948414e8e8d6538659198726d98f967c55b533b7aabe8a1fa9a584
+size 2365606568

modeling_hrm_text.py ADDED Viewed

	@@ -0,0 +1,644 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hrm_text.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from contextlib import nullcontext
+from typing import Optional
+import torch
+from torch import nn
+from transformers import initialization as init
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.configuration_utils import PreTrainedConfig
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_func_from_hub, use_kernelized_func
+from transformers.masking_utils import create_causal_mask, create_masks_for_generate
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import auto_docstring, can_return_tuple, logging
+from transformers.utils.generic import (
+    TransformersKwargs,
+    is_flash_attention_requested,
+    maybe_autocast,
+    merge_with_config_defaults,
+    split_attention_implementation,
+)
+from transformers.utils.output_capturing import capture_outputs
+from .configuration_hrm_text import HrmTextConfig
+logger = logging.get_logger(__name__)
+class HrmTextRMSNorm(torch.nn.Module):
+    def __init__(self, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        return self._norm(x.float()).type_as(x)
+    def extra_repr(self):
+        return f"eps={self.eps}"
+class HrmTextMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+@use_kernelized_func(apply_rotary_pos_emb)
+class HrmTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: HrmTextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = 1  # Uses MHA instead of GQA
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        # Additional sigmoid gate applied at the end
+        self.gate_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        cycle_offset: int = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        gate_states = self.gate_proj(hidden_states).view(hidden_shape)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            # Adjust cache slot by `cycle_offset` which is determined by it's current recurrent step through the stacks
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx + cycle_offset)
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Additional sigmoid gating (similar to Qwen3Next)
+        attn_output = torch.sigmoid(gate_states) * attn_output
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class HrmTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: HrmTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = HrmTextAttention(config=config, layer_idx=layer_idx)
+        self.mlp = HrmTextMLP(config)
+        self.input_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HrmTextRMSNorm(eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+class HrmTextStack(nn.Module):
+    """A single transformer stack — used twice inside, once as H module and once as L module"""
+    def __init__(self, config: HrmTextConfig):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [HrmTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_layers_per_stack)]
+        )
+        self.final_norm = HrmTextRMSNorm(eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        cycle_offset: int = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        for layer in self.layers:
+            hidden_states = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                position_embeddings=position_embeddings,
+                cycle_offset=cycle_offset,
+                **kwargs,
+            )
+        return self.final_norm(hidden_states)
+@auto_docstring
+class HrmTextPreTrainedModel(PreTrainedModel):
+    config: HrmTextConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HrmTextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": HrmTextDecoderLayer,
+        "attentions": HrmTextAttention,
+    }
+    def _check_and_adjust_attn_implementation(
+        self, attn_implementation: str | None, is_init_check: bool = False, allow_all_kernels: bool = False
+    ) -> str:
+        if attn_implementation is not None and self.config.prefix_lm:
+            _, base_implementation = split_attention_implementation(attn_implementation)
+            if is_flash_attention_requested(requested_attention_implementation=base_implementation):
+                raise ValueError(
+                    f"`attn_implementation={attn_implementation!r}` is not supported when "
+                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
+                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
+                )
+        return super()._check_and_adjust_attn_implementation(attn_implementation, is_init_check, allow_all_kernels)
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, HrmTextModel):
+            init.zeros_(module.z_L_init)
+            # `z_L_init` is the frozen low-cycle initial state and never trains.
+            module.z_L_init.requires_grad_(False)  # trf-ignore: TRF012
+class HrmTextRotaryEmbedding(nn.Module):
+    """Produces the `(cos, sin)` rotary position embeddings for a set of position ids.
+    The inverse frequencies come from `compute_default_rope_parameters` when `rope_type` is `"default"`, and from
+    `ROPE_INIT_FUNCTIONS[rope_type]` otherwise.
+    """
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: HrmTextConfig, device=None):
+        super().__init__()
+        # Both lengths start from the configured maximum.
+        # NOTE(review): presumably consumed by `dynamic_rope_update` for dynamic RoPE types — confirm.
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        # Pick the initializer: the local static method for "default", the registry for anything else.
+        rope_init_fn: Callable = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+        # Non-persistent buffers: they follow the module across devices but are not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: HrmTextConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        # A missing or falsy `head_dim` falls back to `hidden_size // num_attention_heads`.
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        # One frequency per pair of dimensions: 1 / base^(i / dim) for i = 0, 2, 4, ... < dim.
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # `x` is only read for its device and dtype; the returned cos/sin are cast to `x.dtype`.
+        # Broadcast `inv_freq` to (batch, n_freq, 1) and positions to (batch, 1, seq) so the matmul below
+        # yields one angle per (position, frequency) pair.
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Autocast is disabled under the device type of `x`, except that "mps" (or a non-string type) uses "cpu".
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the angles so the last dimension covers both halves used by `rotate_half`.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+@auto_docstring
+class HrmTextModel(HrmTextPreTrainedModel):
+    # Backbone: token embeddings plus two decoder stacks (`L_module`, `H_module`) that are applied repeatedly
+    # in a nested H-cycle / L-cycle loop inside `forward`.
+    def __init__(self, config: HrmTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.rotary_emb = HrmTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Multiplier applied to the input embeddings at the start of `forward`.
+        self.embedding_scale = config.embedding_scale
+        # Recursive module structures
+        self.L_module = HrmTextStack(config)
+        self.H_module = HrmTextStack(config)
+        # Initial state for the low cycle module
+        self.z_L_init = nn.Parameter(torch.zeros(config.hidden_size), requires_grad=False)
+        # Left-pad `L_bp_cycles` with 1s until it has at least `L_cycles` entries.
+        # NOTE(review): the list is padded to `L_cycles` entries but `forward` indexes it by the H-cycle
+        # index (falling back to 1 when out of range) — confirm the padding length is intended.
+        raw_bp = list(config.L_bp_cycles)
+        self.L_bp_cycles_padded = [1] * max(0, config.L_cycles - len(raw_bp)) + raw_bp
+        # Initialize weights and apply final processing
+        self.post_init()
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
+            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
+            form a single bidirectional block; all other positions are causal.
+        """
+        # Raises when both or neither of `input_ids` / `inputs_embeds` are given.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Additional scaling on the input embeds
+        # (applied to caller-supplied `inputs_embeds` as well, since it runs after the lookup branch)
+        inputs_embeds = inputs_embeds * self.embedding_scale
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if position_ids is None:
+            # Default positions continue from the number of tokens already held by the cache.
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        # Create mask with optional prefix-based bidirectionality
+        mask_kwargs = {
+            "config": self.config,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # `token_type_ids` is only honoured when there is no cache or the cache is not yet initialized.
+        is_first_iteration = past_key_values is None or not past_key_values.is_initialized
+        if token_type_ids is not None and is_first_iteration:
+            if self.config.prefix_lm:
+                # Positions with `token_type_ids == 1` get block id 0, all other positions get -1.
+                mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
+            else:
+                logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
+        attention_mask = create_causal_mask(**mask_kwargs)
+        # The same mask and (cos, sin) pair are reused by every stack invocation below.
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+        # Hierarchical (H/L)-cycle recurrence
+        #
+        # `z_H` - slow / high-level state
+        hidden_states_high_cycle = inputs_embeds
+        # `z_L` - fast / low-level state
+        # (`z_L_init` broadcast to the shape, dtype and device of `z_H`)
+        hidden_states_low_cycle = (
+            self.z_L_init.to(dtype=hidden_states_high_cycle.dtype, device=hidden_states_high_cycle.device)
+            .expand_as(hidden_states_high_cycle)
+            .contiguous()
+        )
+        # Cache-slot layout under the recurrent forward:
+        #
+        #   slot(h, l, layer)   = (h * (L_cycles + 1) + l) * num_layers_per_stack + layer
+        #                                                       ^— L-stack invocation at (h, l)
+        #   slot(h, H, layer)   = (h * (L_cycles + 1) + L_cycles) * num_layers_per_stack + layer
+        #                                                       ^— trailing H-stack invocation
+        #
+        # That totals `num_layers_per_stack * H_cycles * (L_cycles + 1)` slots, i.e. the `config.num_hidden_layers`.
+        num_layers_per_stack = self.config.num_layers_per_stack
+        for high_cycle_idx in range(self.config.H_cycles):
+            # `L_bp_cycles` k-step grad trick: only the trailing `num_grad_iterations` of the
+            # `L_cycles` inner iterations propagate gradients; earlier iterations run under
+            # `torch.no_grad()` to bound activation memory.
+            num_grad_iterations = (
+                self.L_bp_cycles_padded[high_cycle_idx] if high_cycle_idx < len(self.L_bp_cycles_padded) else 1
+            )
+            grad_threshold = self.config.L_cycles - num_grad_iterations
+            for low_cycle_idx in range(self.config.L_cycles):
+                cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + low_cycle_idx) * num_layers_per_stack
+                # Inner iterations at or past the threshold track gradients; earlier ones do not.
+                ctx = nullcontext() if low_cycle_idx >= grad_threshold else torch.no_grad()
+                with ctx:
+                    # L update: the L stack consumes `z_L + z_H` and its output becomes the new `z_L`.
+                    hidden_states_low_cycle = self.L_module(
+                        hidden_states_low_cycle.to(hidden_states_high_cycle.device) + hidden_states_high_cycle,
+                        attention_mask=attention_mask,
+                        past_key_values=past_key_values,
+                        position_embeddings=position_embeddings,
+                        position_ids=position_ids,
+                        cycle_offset=cycle_offset,
+                        **kwargs,
+                    )
+            # H update: runs once per H cycle, after all L iterations, always outside `torch.no_grad()`.
+            cycle_offset = (high_cycle_idx * (self.config.L_cycles + 1) + self.config.L_cycles) * num_layers_per_stack
+            hidden_states_high_cycle = self.H_module(
+                hidden_states_high_cycle + hidden_states_low_cycle.to(hidden_states_high_cycle.device),
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                cycle_offset=cycle_offset,
+                **kwargs,
+            )
+        # The final high-level state is the model output.
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states_high_cycle,
+            past_key_values=past_key_values,
+        )
+@auto_docstring
+class HrmTextForCausalLM(HrmTextPreTrainedModel, GenerationMixin):
+    # Causal language model: `HrmTextModel` backbone followed by a bias-free linear head over the vocabulary.
+    # Maps `lm_head.weight` to `model.embed_tokens.weight`.
+    # NOTE(review): presumably used by the framework to tie the two weights — confirm.
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HrmTextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch, seq_len)`, *optional*):
+            Per-position bidirectional/causal indicator. Tokens with `token_type_ids == 1`
+            form a single bidirectional block; all other positions are causal.
+        """
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # An int keeps the last `logits_to_keep` positions (0 gives `slice(0, None)`, i.e. every position);
+        # any other value is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        # The loss is only computed when `labels` are supplied.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    @staticmethod
+    def create_masks_for_generate(
+        config: PreTrainedConfig,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None = None,
+        is_first_iteration: bool | None = False,
+        **kwargs,
+    ) -> dict:
+        # Builds the mask arguments the same way `HrmTextModel.forward` does, then delegates to the
+        # module-level `create_masks_for_generate` imported from `transformers.masking_utils` (inside this
+        # function body the name resolves to that import, not to this static method).
+        # Extra `**kwargs` are accepted but not forwarded.
+        mask_kwargs = {
+            "config": config,
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        # The prefix block is only added on the first iteration and only when `config.prefix_lm` is set.
+        if token_type_ids is not None and is_first_iteration:
+            if config.prefix_lm:
+                # Positions with `token_type_ids == 1` get block id 0, all other positions get -1.
+                mask_kwargs["block_sequence_ids"] = torch.where(token_type_ids == 1, 0, -1)
+            else:
+                logger.warning_once("`token_type_ids` was provided but `config.prefix_lm=False`; ignoring it.")
+        return create_masks_for_generate(**mask_kwargs)
+# Names exported by `from <module> import *`.
+__all__ = ["HrmTextForCausalLM", "HrmTextModel", "HrmTextPreTrainedModel"]

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_prefix_space": null,
+  "backend": "tokenizers",
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|box_end|>",
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}