Safetensors
English
hrm_text
custom_code
SassyDiffusion committed on
Commit
3dcb68a
·
verified ·
1 Parent(s): 4a79f6e

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - teknium/OpenHermes-2.5
5
+ - HuggingFaceH4/ultrachat_200k
6
+ - Magpie-Align/Magpie-Air-MT-300K-v0.1
7
+ language:
8
+ - en
9
+ base_model:
10
+ - sapientinc/HRM-Text-1B
11
+ ---
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "H_cycles": 2,
3
+ "L_bp_cycles": [
4
+ 0,
5
+ 3
6
+ ],
7
+ "L_cycles": 3,
8
+ "architectures": [
9
+ "HrmTextForCausalLM"
10
+ ],
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "auto_map": {
14
+ "AutoConfig": "configuration_hrm_text.HrmTextConfig",
15
+ "AutoModel": "modeling_hrm_text.HrmTextModel",
16
+ "AutoModelForCausalLM": "modeling_hrm_text.HrmTextForCausalLM"
17
+ },
18
+ "bos_token_id": 6,
19
+ "dtype": "bfloat16",
20
+ "embedding_scale": 39.191835884530846,
21
+ "eos_token_id": 11,
22
+ "head_dim": 128,
23
+ "hidden_act": "silu",
24
+ "hidden_size": 1536,
25
+ "initializer_range": 0.025515518153991442,
26
+ "intermediate_size": 4096,
27
+ "max_position_embeddings": 4096,
28
+ "mlp_bias": false,
29
+ "model_type": "hrm_text",
30
+ "num_attention_heads": 12,
31
+ "num_hidden_layers": 128,
32
+ "num_key_value_heads": 12,
33
+ "num_layers_per_stack": 16,
34
+ "pad_token_id": 11,
35
+ "prefix_lm": true,
36
+ "rms_norm_eps": 1e-06,
37
+ "rope_parameters": {
38
+ "rope_theta": 10000.0,
39
+ "rope_type": "default"
40
+ },
41
+ "tie_word_embeddings": false,
42
+ "transformers_version": "5.8.0.dev0",
43
+ "use_cache": false,
44
+ "vocab_size": 65536
45
+ }
configuration_hrm_text.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/hrm_text/modular_hrm_text.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_hrm_text.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2026 The Sapient AI Authors and the HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from huggingface_hub.dataclasses import strict
22
+
23
+ from transformers.configuration_utils import PreTrainedConfig
24
+ from transformers.modeling_rope_utils import RopeParameters
25
+ from transformers.utils import auto_docstring
26
+ from transformers.utils.generic import is_flash_attention_requested, split_attention_implementation
27
+ from transformers.utils.type_validators import interval
28
+
29
+
30
@auto_docstring(checkpoint="sapientinc/HRM-Text-1B")
@strict
class HrmTextConfig(PreTrainedConfig):
    r"""
    H_cycles (`int`, *optional*, defaults to 2):
        Count of high-level (H) cycles.
    L_cycles (`int`, *optional*, defaults to 3):
        Count of low-level (L) cycles executed within each H-cycle.
    L_bp_cycles (`list[int]`, *optional*, defaults to `[2]`):
        Gradient-routing list that only matters during training; inside the model it is left-padded with
        `1`s until it reaches `L_cycles` entries. It is a no-op at inference time.
    embedding_scale (`float`, *optional*):
        Factor the token embeddings are multiplied by. When `None`, it falls back to
        `1 / initializer_range`.
    prefix_lm (`bool`, *optional*, defaults to `True`):
        When enabled, instruction tokens attend bidirectionally and response tokens attend causally.
    num_layers_per_stack (`int`, *optional*):
        Actual number of transformer blocks in each of the H / L stacks. It is filled in
        automatically the first time the config is built: the value given as `num_hidden_layers`
        is stored here, and `num_hidden_layers` is then overwritten with
        `num_layers_per_stack * H_cycles * (L_cycles + 1)` so that `DynamicCache(config=...)`
        pre-allocates one slot per unique attention invocation under the recurrent forward.
        Do not set this directly on first construction — pass the real per-stack count as
        `num_hidden_layers` and let `__post_init__` split it.
    """

    model_type = "hrm_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Tensor-parallel plan: every projection is listed once and expanded for both recurrent
    # stacks. Insertion order is (projection, L_module), (projection, H_module), ...
    base_model_tp_plan = {
        f"{stack_name}.layers.*.{projection}": sharding
        for projection, sharding in (
            ("self_attn.q_proj", "colwise"),
            ("self_attn.k_proj", "colwise"),
            ("self_attn.v_proj", "colwise"),
            ("self_attn.gate_proj", "colwise"),
            ("self_attn.o_proj", "rowwise"),
            ("mlp.gate_proj", "colwise"),
            ("mlp.up_proj", "colwise"),
            ("mlp.down_proj", "rowwise"),
        )
        for stack_name in ("L_module", "H_module")
    }
    # Pipeline-parallel plan: module name -> (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (
            ["input_ids"],
            ["inputs_embeds"],
        ),
        "layers": (
            ["hidden_states", "attention_mask"],
            ["hidden_states"],
        ),
        "norm": (
            ["hidden_states"],
            ["hidden_states"],
        ),
    }

    # Standard transformer hyper-parameters.
    vocab_size: int = 151808
    hidden_size: int = 1536
    intermediate_size: int = 4096
    num_hidden_layers: int = 16
    num_attention_heads: int = 12
    hidden_act: str = "silu"
    max_position_embeddings: int = 2048
    initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: int | float | None = 0.0
    mlp_bias: bool = False
    head_dim: int = 128

    # HRM-specific knobs; the `None` defaults are resolved in `__post_init__`.
    H_cycles: int = 2
    L_cycles: int = 3
    L_bp_cycles: list[int] | None = None
    embedding_scale: float | None = None
    prefix_lm: bool = True
    num_layers_per_stack: int | None = None  # Usually inferred in post init

    def __post_init__(self, **kwargs):
        """Resolve the `None` defaults, then hand over to the parent `__post_init__`.

        Args:
            **kwargs: Forwarded unchanged to `PreTrainedConfig.__post_init__`.
        """
        # `[2]` means: backprop only the last 2 L-iterations per H-cycle (a training-time
        # gradient-routing knob). The left-padding to length `L_cycles` happens inside
        # [`HrmTextModel`] because it depends on `L_cycles`.
        if self.L_bp_cycles is None:
            self.L_bp_cycles = [2]

        # An unset scale is derived from the initializer range.
        if self.embedding_scale is None:
            self.embedding_scale = 1.0 / self.initializer_range

        # First construction, or a legacy checkpoint whose `num_hidden_layers` still holds the
        # real per-stack count: keep that count in `num_layers_per_stack` and replace
        # `num_hidden_layers` with the inflated total, so the standard HF cache allocation
        # yields one slot per unique attention invocation. Serialised configs therefore
        # round-trip as (inflated, real) pairs.
        if self.num_layers_per_stack is None:
            per_stack = self.num_hidden_layers
            self.num_layers_per_stack = per_stack
            self.num_hidden_layers = per_stack * self.H_cycles * (self.L_cycles + 1)

        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Checks that the head count divides the hidden size.

        Raises:
            ValueError: If `hidden_size` is not a multiple of `num_attention_heads`.
        """
        heads_divide_hidden = self.hidden_size % self.num_attention_heads == 0
        if heads_divide_hidden:
            return
        raise ValueError(
            f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
            f"heads ({self.num_attention_heads})."
        )

    @property
    def _attn_implementation(self):
        """Return the value held in `_attn_implementation_internal`."""
        return self._attn_implementation_internal

    @_attn_implementation.setter
    def _attn_implementation(self, value: str | dict | None):
        """Reject FlashAttention when `prefix_lm` is on, then delegate to the parent setter.

        Raises:
            ValueError: If `value` requests a flash-attention implementation while
                `self.prefix_lm` is truthy.
        """
        if value is not None and self.prefix_lm:
            _unused_prefix, base_impl = split_attention_implementation(value)
            flash_requested = is_flash_attention_requested(requested_attention_implementation=base_impl)
            if flash_requested:
                message = (
                    f"`attn_implementation={value!r}` is not supported when "
                    "`config.prefix_lm=True`: FlashAttention cannot represent the PrefixLM 4-D mask "
                    "overlay. Use `'sdpa'` (default) or `'flex_attention'`, or set `config.prefix_lm=False`."
                )
                raise ValueError(message)
        # Store the value through the parent class's property setter.
        PreTrainedConfig._attn_implementation.__set__(self, value)
144
+
145
+
146
+ __all__ = ["HrmTextConfig"]
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 6,
4
+ "eos_token_id": [
5
+ 11
6
+ ],
7
+ "pad_token_id": 11,
8
+ "transformers_version": "5.8.0.dev0"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adb082b2815f710454c8dab3b9bc488655fe09f5f417481f59ab95e476e21789
3
+ size 2365606600
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|im_start|>",
5
+ "eos_token": "<|box_end|>",
6
+ "is_local": false,
7
+ "local_files_only": false,
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "<|box_end|>",
10
+ "tokenizer_class": "Qwen2Tokenizer",
11
+ "unk_token": "<|endoftext|>"
12
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fbf7e7531cf51bb6223a2b1646e5ce1d4272e24f7eaf9dc16566e4ab9d10e12
3
+ size 5521