Instructions to use sinimiini/HRM-Text-1B-GGUF with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use sinimiini/HRM-Text-1B-GGUF with llama-cpp-python:

# !pip install llama-cpp-python

from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="sinimiini/HRM-Text-1B-GGUF",
	filename="HRM-Text-1B-BF16.gguf",
)

output = llm(
	"Once upon a time,",
	max_tokens=512,
	echo=True
)
print(output)

Notebooks
Google Colab
Kaggle
Local Apps

llama.cpp

How to use sinimiini/HRM-Text-1B-GGUF with llama.cpp:

Install from brew

brew install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Install from WinGet (Windows)

winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Use pre-built binary

# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
./llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Build from source code

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
./build/bin/llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Use Docker

docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16

LM Studio
Jan

vLLM

How to use sinimiini/HRM-Text-1B-GGUF with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "sinimiini/HRM-Text-1B-GGUF"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "sinimiini/HRM-Text-1B-GGUF",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16

Ollama
How to use sinimiini/HRM-Text-1B-GGUF with Ollama:
```
ollama run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16
```

Unsloth Studio (new)

How to use sinimiini/HRM-Text-1B-GGUF with Unsloth Studio:

Install Unsloth Studio (macOS, Linux, WSL)

curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Install Unsloth Studio (Windows)

irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Using HuggingFace Spaces for Unsloth

# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Docker Model Runner
How to use sinimiini/HRM-Text-1B-GGUF with Docker Model Runner:
```
docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16
```

Lemonade

How to use sinimiini/HRM-Text-1B-GGUF with Lemonade:

Pull the model

# Download Lemonade from https://lemonade-server.ai/
lemonade pull sinimiini/HRM-Text-1B-GGUF:BF16

Run and chat with the model

lemonade run user.HRM-Text-1B-GGUF-BF16

List all available models

lemonade list

HRM-Text-1B-GGUF / runtime / llama.cpp-hrm_text.patch

sinimiini

Upload folder using huggingface_hub

1c10575 verified about 19 hours ago

raw

history blame contribute delete

25 kB

	diff --git a/conversion/__init__.py b/conversion/__init__.py
	index 2c38123df..ecf1be2db 100644
	--- a/conversion/__init__.py
	+++ b/conversion/__init__.py
	@@ -95,6 +95,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
	"HunYuanDenseV1ForCausalLM": "hunyuan",
	"HunYuanMoEV1ForCausalLM": "hunyuan",
	"HunYuanVLForConditionalGeneration": "hunyuan",
	+ "HrmTextForCausalLM": "hrm_text",
	"IQuestCoderForCausalLM": "llama",
	"InternLM2ForCausalLM": "internlm",
	"InternLM3ForCausalLM": "internlm",
	diff --git a/conversion/hrm_text.py b/conversion/hrm_text.py
	new file mode 100644
	index 000000000..1f29ab55e
	--- /dev/null
	+++ b/conversion/hrm_text.py
	@@ -0,0 +1,120 @@
	+from __future__ import annotations
	+
	+import re
	+import json
	+
	+from typing import Iterable, TYPE_CHECKING
	+
	+import torch
	+
	+if TYPE_CHECKING:
	+ from torch import Tensor
	+
	+from .base import ModelBase, TextModel, gguf, logger
	+
	+
	+@ModelBase.register("HrmTextForCausalLM")
	+class HrmTextModel(TextModel):
	+ model_arch = gguf.MODEL_ARCH.HRM_TEXT
	+
	+ def __init__(self, args, *kwargs):
	+ super().__init__(args, *kwargs)
	+
	+ with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
	+ self.raw_hparams = json.load(f)
	+
	+ self.layers_per_stack = self.raw_hparams["num_hidden_layers"]
	+ self.h_cycles = self.raw_hparams["H_cycles"]
	+ self.l_cycles = self.raw_hparams["L_cycles"]
	+ self.physical_block_count = self.layers_per_stack * 2
	+ self.cache_block_count = self.layers_per_stack * self.h_cycles * (self.l_cycles + 1)
	+
	+ # GGUF tensors store one physical L stack followed by one physical H stack.
	+ # The runtime expands these 32 physical layers across 128 KV-cache slots.
	+ self.block_count = self.physical_block_count
	+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
	+
	+ def set_vocab(self):
	+ # HRM-Text ships a Qwen2-style tokenizer.json. Keep it as a plain tokenizer;
	+ # do not add a chat template for validation GGUFs.
	+ self._set_vocab_gpt2()
	+
	+ def get_vocab_base_pre(self, tokenizer) -> str:
	+ del tokenizer
	+ return "qwen2"
	+
	+ def set_gguf_parameters(self):
	+ hp = self.raw_hparams
	+ head_dim = hp["head_dim"]
	+
	+ self.gguf_writer.add_context_length(hp["max_position_embeddings"])
	+ self.gguf_writer.add_embedding_length(hp["hidden_size"])
	+ self.gguf_writer.add_block_count(self.cache_block_count)
	+ self.gguf_writer.add_feed_forward_length(hp["intermediate_size"])
	+ self.gguf_writer.add_head_count(hp["num_attention_heads"])
	+ self.gguf_writer.add_head_count_kv(hp["num_key_value_heads"])
	+ self.gguf_writer.add_key_length(head_dim)
	+ self.gguf_writer.add_value_length(head_dim)
	+ self.gguf_writer.add_rope_dimension_count(head_dim)
	+ self.gguf_writer.add_rope_freq_base(hp.get("rope_theta", 10000.0))
	+ self.gguf_writer.add_layer_norm_rms_eps(hp["rms_norm_eps"])
	+ self.gguf_writer.add_embedding_scale(hp["embedding_scale"])
	+
	+ arch = self.gguf_writer.arch
	+ self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_LAYERS_PER_STACK.format(arch=arch), self.layers_per_stack)
	+ self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_H_CYCLES.format(arch=arch), self.h_cycles)
	+ self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_L_CYCLES.format(arch=arch), self.l_cycles)
	+ self.gguf_writer.add_bool(gguf.Keys.LLM.HRM_PREFIX_LM.format(arch=arch), bool(hp.get("prefix_lm", False)))
	+
	+ def _format(self, key: gguf.MODEL_TENSOR, bid: int \| None = None, suffix: str = ".weight") -> str:
	+ return self.format_tensor_name(key, bid=bid, suffix=suffix)
	+
	+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int \| None) -> Iterable[tuple[str, Tensor]]:
	+ if name == "model.embed_tokens.weight":
	+ yield self._format(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
	+ return
	+
	+ if name == "lm_head.weight":
	+ yield self._format(gguf.MODEL_TENSOR.OUTPUT), data_torch
	+ return
	+
	+ if name == "model.z_L_init":
	+ yield self._format(gguf.MODEL_TENSOR.HRM_Z_L_INIT, suffix=""), data_torch
	+ return
	+
	+ match = re.fullmatch(r"model\.([LH])_module\.layers\.(\d+)\.(.+)", name)
	+ if match is None:
	+ raise ValueError(f"Can not map tensor {name!r}")
	+
	+ stack, layer_s, tensor_name = match.groups()
	+ layer_idx = int(layer_s)
	+ if layer_idx >= self.layers_per_stack:
	+ raise ValueError(f"Layer index {layer_idx} outside HRM stack size {self.layers_per_stack}")
	+
	+ physical_bid = layer_idx + (self.layers_per_stack if stack == "H" else 0)
	+
	+ if tensor_name == "attn.gqkv_proj.weight":
	+ gate, q, k, v = torch.chunk(data_torch, 4, dim=0)
	+ logger.debug("Split %s as gate, q, k, v", name)
	+ yield self._format(gguf.MODEL_TENSOR.ATTN_GATE, physical_bid), gate.contiguous()
	+ yield self._format(gguf.MODEL_TENSOR.ATTN_Q, physical_bid), q.contiguous()
	+ yield self._format(gguf.MODEL_TENSOR.ATTN_K, physical_bid), k.contiguous()
	+ yield self._format(gguf.MODEL_TENSOR.ATTN_V, physical_bid), v.contiguous()
	+ return
	+
	+ if tensor_name == "attn.o_proj.weight":
	+ yield self._format(gguf.MODEL_TENSOR.ATTN_OUT, physical_bid), data_torch
	+ return
	+
	+ if tensor_name == "mlp.gate_up_proj.weight":
	+ gate, up = torch.chunk(data_torch, 2, dim=0)
	+ logger.debug("Split %s as gate, up", name)
	+ yield self._format(gguf.MODEL_TENSOR.FFN_GATE, physical_bid), gate.contiguous()
	+ yield self._format(gguf.MODEL_TENSOR.FFN_UP, physical_bid), up.contiguous()
	+ return
	+
	+ if tensor_name == "mlp.down_proj.weight":
	+ yield self._format(gguf.MODEL_TENSOR.FFN_DOWN, physical_bid), data_torch
	+ return
	+
	+ raise ValueError(f"Can not map tensor {name!r}")
	diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
	index 7fdcf03d7..b84cc8827 100644
	--- a/gguf-py/gguf/constants.py
	+++ b/gguf-py/gguf/constants.py
	@@ -144,6 +144,10 @@ class Keys:
	TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
	INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
	FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval"
	+ HRM_LAYERS_PER_STACK = "{arch}.layers_per_stack"
	+ HRM_H_CYCLES = "{arch}.h_cycles"
	+ HRM_L_CYCLES = "{arch}.l_cycles"
	+ HRM_PREFIX_LM = "{arch}.prefix_lm"
	ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
	ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
	ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
	@@ -410,6 +414,7 @@ class MODEL_ARCH(IntEnum):
	QWEN3 = auto()
	QWEN3MOE = auto()
	QWEN3NEXT = auto()
	+ HRM_TEXT = auto()
	QWEN3VL = auto()
	QWEN3VLMOE = auto()
	QWEN35 = auto()
	@@ -527,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
	TOKEN_TYPES = auto()
	POS_EMBD = auto()
	OUTPUT = auto()
	+ HRM_Z_L_INIT = auto()
	DENSE_2_OUT = auto() # embeddinggemma 2_Dense
	DENSE_3_OUT = auto() # embeddinggemma 3_Dense
	OUTPUT_NORM = auto()
	@@ -925,6 +931,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
	MODEL_ARCH.QWEN3: "qwen3",
	MODEL_ARCH.QWEN3MOE: "qwen3moe",
	MODEL_ARCH.QWEN3NEXT: "qwen3next",
	+ MODEL_ARCH.HRM_TEXT: "hrm_text",
	MODEL_ARCH.QWEN3VL: "qwen3vl",
	MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
	MODEL_ARCH.QWEN35: "qwen35",
	@@ -1042,6 +1049,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
	MODEL_TENSOR.POS_EMBD: "position_embd",
	MODEL_TENSOR.OUTPUT_NORM: "output_norm",
	MODEL_TENSOR.OUTPUT: "output",
	+ MODEL_TENSOR.HRM_Z_L_INIT: "hrm.z_l_init",
	MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
	MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
	MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
	@@ -2057,6 +2065,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
	MODEL_TENSOR.SSM_BETA_ALPHA,
	MODEL_TENSOR.SSM_OUT
	],
	+ MODEL_ARCH.HRM_TEXT: [
	+ MODEL_TENSOR.TOKEN_EMBD,
	+ MODEL_TENSOR.OUTPUT,
	+ MODEL_TENSOR.HRM_Z_L_INIT,
	+ MODEL_TENSOR.ATTN_Q,
	+ MODEL_TENSOR.ATTN_K,
	+ MODEL_TENSOR.ATTN_V,
	+ MODEL_TENSOR.ATTN_GATE,
	+ MODEL_TENSOR.ATTN_OUT,
	+ MODEL_TENSOR.FFN_GATE,
	+ MODEL_TENSOR.FFN_DOWN,
	+ MODEL_TENSOR.FFN_UP,
	+ ],
	MODEL_ARCH.QWEN3VL: [
	MODEL_TENSOR.TOKEN_EMBD,
	MODEL_TENSOR.OUTPUT_NORM,
	diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
	index c9eead18a..5b8ee3781 100644
	--- a/src/llama-arch.cpp
	+++ b/src/llama-arch.cpp
	@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
	{ LLM_ARCH_QWEN3, "qwen3" },
	{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
	{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
	+ { LLM_ARCH_HRM_TEXT, "hrm_text" },
	{ LLM_ARCH_QWEN3VL, "qwen3vl" },
	{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
	{ LLM_ARCH_QWEN35, "qwen35" },
	@@ -209,6 +210,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
	{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
	{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
	{ LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },
	+ { LLM_KV_HRM_LAYERS_PER_STACK, "%s.layers_per_stack" },
	+ { LLM_KV_HRM_H_CYCLES, "%s.h_cycles" },
	+ { LLM_KV_HRM_L_CYCLES, "%s.l_cycles" },
	+ { LLM_KV_HRM_PREFIX_LM, "%s.prefix_lm" },

	{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
	{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
	@@ -346,6 +351,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
	{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
	{ LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name
	{ LLM_TENSOR_OUTPUT, "output" },
	+ { LLM_TENSOR_HRM_Z_L_INIT, "hrm.z_l_init" },
	{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
	{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
	{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
	@@ -565,6 +571,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
	{LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
	{LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
	{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // do the norms on the first layer (not the input layer)
	+ {LLM_TENSOR_HRM_Z_L_INIT, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
	{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
	{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
	{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
	diff --git a/src/llama-arch.h b/src/llama-arch.h
	index 89cf16cc3..fa04b684b 100644
	--- a/src/llama-arch.h
	+++ b/src/llama-arch.h
	@@ -41,6 +41,7 @@ enum llm_arch {
	LLM_ARCH_QWEN3,
	LLM_ARCH_QWEN3MOE,
	LLM_ARCH_QWEN3NEXT,
	+ LLM_ARCH_HRM_TEXT,
	LLM_ARCH_QWEN3VL,
	LLM_ARCH_QWEN3VLMOE,
	LLM_ARCH_QWEN35,
	@@ -213,6 +214,10 @@ enum llm_kv {
	LLM_KV_TOKEN_SHIFT_COUNT,
	LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
	LLM_KV_FULL_ATTENTION_INTERVAL,
	+ LLM_KV_HRM_LAYERS_PER_STACK,
	+ LLM_KV_HRM_H_CYCLES,
	+ LLM_KV_HRM_L_CYCLES,
	+ LLM_KV_HRM_PREFIX_LM,

	LLM_KV_ATTENTION_HEAD_COUNT,
	LLM_KV_ATTENTION_HEAD_COUNT_KV,
	@@ -354,6 +359,7 @@ enum llm_tensor {
	LLM_TENSOR_DENSE_2_OUT,
	LLM_TENSOR_DENSE_3_OUT,
	LLM_TENSOR_OUTPUT,
	+ LLM_TENSOR_HRM_Z_L_INIT,
	LLM_TENSOR_OUTPUT_NORM,
	LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
	LLM_TENSOR_ROPE_FREQS,
	diff --git a/src/llama-context.cpp b/src/llama-context.cpp
	index ad36c0666..fa80f4260 100644
	--- a/src/llama-context.cpp
	+++ b/src/llama-context.cpp
	@@ -2208,6 +2208,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
	if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
	return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
	}
	+ if (model.arch == LLM_ARCH_HRM_TEXT) {
	+ return std::max<uint32_t>(n_tokens * 80, 64u * model.n_tensors());
	+ }
	uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
	for (const auto & lora : model.loras) {
	res += lora->get_n_nodes();
	diff --git a/src/llama-hparams.h b/src/llama-hparams.h
	index e2d051edc..812598f69 100644
	--- a/src/llama-hparams.h
	+++ b/src/llama-hparams.h
	@@ -164,6 +164,12 @@ struct llama_hparams {
	float f_embedding_scale = 0.0f;
	float f_attention_scale = 0.0f;

	+ // HRM-Text recurrence metadata. n_layer remains the expanded KV-cache slot count.
	+ uint32_t n_hrm_layer_per_stack = 0;
	+ uint32_t n_hrm_h_cycles = 0;
	+ uint32_t n_hrm_l_cycles = 0;
	+ bool hrm_prefix_lm = false;
	+
	// grok-2
	float f_attn_out_scale = 0.0f;
	uint32_t attn_temp_length = 0;
	diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
	index 528e4c9c0..8a6e009c6 100644
	--- a/src/llama-model-saver.cpp
	+++ b/src/llama-model-saver.cpp
	@@ -245,6 +245,10 @@ void llama_model_saver::add_kv_from_model() {
	add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
	add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
	// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);
	+ add_kv(LLM_KV_HRM_LAYERS_PER_STACK, hparams.n_hrm_layer_per_stack);
	+ add_kv(LLM_KV_HRM_H_CYCLES, hparams.n_hrm_h_cycles);
	+ add_kv(LLM_KV_HRM_L_CYCLES, hparams.n_hrm_l_cycles);
	+ add_kv(LLM_KV_HRM_PREFIX_LM, hparams.hrm_prefix_lm);

	add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
	add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
	diff --git a/src/llama-model.cpp b/src/llama-model.cpp
	index 8bf20a716..a3cc996aa 100644
	--- a/src/llama-model.cpp
	+++ b/src/llama-model.cpp
	@@ -96,6 +96,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
	return new llama_model_qwen2moe(params);
	case LLM_ARCH_QWEN3:
	return new llama_model_qwen3(params);
	+ case LLM_ARCH_HRM_TEXT:
	+ return new llama_model_hrm_text(params);
	case LLM_ARCH_QWEN3MOE:
	return new llama_model_qwen3moe(params);
	case LLM_ARCH_QWEN3VL:
	@@ -2339,6 +2341,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
	case LLM_ARCH_PANGU_EMBED:
	case LLM_ARCH_AFMOE:
	case LLM_ARCH_QWEN3NEXT:
	+ case LLM_ARCH_HRM_TEXT:
	case LLM_ARCH_MIMO2:
	case LLM_ARCH_STEP35:
	return LLAMA_ROPE_TYPE_NEOX;
	diff --git a/src/models/hrm-text.cpp b/src/models/hrm-text.cpp
	new file mode 100644
	index 000000000..e0a3e9f59
	--- /dev/null
	+++ b/src/models/hrm-text.cpp
	@@ -0,0 +1,183 @@
	+#include "models.h"
	+
	+#include <cmath>
	+#include <vector>
	+
	+void llama_model_hrm_text::load_arch_hparams(llama_model_loader & ml) {
	+    // Pull the norm epsilon, embedding scale and HRM recurrence metadata out of the model file.
	+    auto & hp = hparams;
	+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hp.f_norm_rms_eps);
	+    ml.get_key(LLM_KV_EMBEDDING_SCALE,             hp.f_embedding_scale);
	+    ml.get_key(LLM_KV_HRM_LAYERS_PER_STACK,        hp.n_hrm_layer_per_stack);
	+    ml.get_key(LLM_KV_HRM_H_CYCLES,                hp.n_hrm_h_cycles);
	+    ml.get_key(LLM_KV_HRM_L_CYCLES,                hp.n_hrm_l_cycles);
	+    ml.get_key(LLM_KV_HRM_PREFIX_LM,               hp.hrm_prefix_lm, false); // NOTE(review): trailing `false` presumably marks the key optional - confirm
	+
	+    // Only the 1536-wide configuration has a named size; every other width is reported as unknown.
	+    type = (hp.n_embd == 1536) ? LLM_TYPE_1B : LLM_TYPE_UNKNOWN;
	+}
	+
	+void llama_model_hrm_text::load_arch_tensors(llama_model_loader &) {
	+    // Creates the global tensors and binds the shared physical L/H stack weights to every expanded slot in layers[].
	+    LLAMA_LOAD_LOCALS;
	+
	+    const int64_t n_stack       = hparams.n_hrm_layer_per_stack;
	+    const int64_t n_cycle_slots = n_stack * (hparams.n_hrm_l_cycles + 1); // slots per H cycle: l_cycles L passes + 1 H pass
	+    // Fail fast on inconsistent metadata: n_stack is a divisor below, and the graph builder
	+    // indexes layers[] up to n_stack * h_cycles * (l_cycles + 1) - 1.
	+    GGML_ASSERT(n_stack > 0 && "hrm_text: layers_per_stack must be > 0");
	+    GGML_ASSERT(int64_t(n_layer) == n_cycle_slots * int64_t(hparams.n_hrm_h_cycles) && "hrm_text: block_count != layers_per_stack * h_cycles * (l_cycles + 1)");
	+
	+    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
	+    output       = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
	+    hrm_z_l_init = create_tensor(tn(LLM_TENSOR_HRM_Z_L_INIT), {n_embd}, 0);
	+
	+    // One flag per physical layer (L stack first, then H stack): set once the layer has been bound to a slot.
	+    std::vector<bool> loaded_physical(2 * n_stack, false);
	+
	+    for (int il = 0; il < n_layer; ++il) {
	+        auto & layer = layers[il];
	+        // Slot layout inside one H cycle: l_cycles runs of the L stack, then one run of the H stack.
	+        const int64_t layer_in_stack = il % n_stack;
	+        const int64_t phase          = (il % n_cycle_slots) / n_stack;
	+        const bool    is_h_stack     = phase == int64_t(hparams.n_hrm_l_cycles);
	+        const int     physical_bid   = int((is_h_stack ? n_stack : 0) + layer_in_stack);
	+
	+        // Every binding after the first one of a physical layer is flagged TENSOR_DUPLICATED.
	+        const int flags = loaded_physical[physical_bid] ? TENSOR_DUPLICATED : 0;
	+        loaded_physical[physical_bid] = true;
	+
	+        create_tensor_qkv(layer, physical_bid, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
	+        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", physical_bid), {n_embd, n_embd_head_k * n_head}, flags);
	+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", physical_bid), {n_embd_head_k * n_head, n_embd}, flags);
	+
	+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", physical_bid), {n_embd, n_ff}, flags);
	+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", physical_bid), {n_ff, n_embd}, flags);
	+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", physical_bid), {n_embd, n_ff}, flags);
	+    }
	+}
	+
	+std::unique_ptr<llm_graph_context> llama_model_hrm_text::build_arch_graph(const llm_graph_params & params) const {
	+ return std::make_unique<graph>(*this, params); // the nested graph type assembles the whole compute graph in its constructor
	+}
	+
	+llama_model_hrm_text::graph::graph(const llama_model & model_, const llm_graph_params & params) : llm_graph_context(params) { // builds the HRM-Text forward graph
	+ const auto & model = static_cast<const llama_model_hrm_text &>(model_);
	+ // The global tensors used directly below must have been loaded.
	+ GGML_ASSERT(model.tok_embd != nullptr);
	+ GGML_ASSERT(model.output != nullptr);
	+ GGML_ASSERT(model.hrm_z_l_init != nullptr);
	+
	+ const int64_t n_embd_head = hparams.n_embd_head_v();
	+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
	+ GGML_ASSERT(n_embd_head == n_rot);
	+
	+ const int64_t n_stack = hparams.n_hrm_layer_per_stack;
	+ const int64_t h_cycles = hparams.n_hrm_h_cycles;
	+ const int64_t l_cycles = hparams.n_hrm_l_cycles;
	+ // NOTE(review): hparams.hrm_prefix_lm is never read in this constructor - confirm that is intended.
	+ ggml_tensor * inp_pos = build_inp_pos();
	+ auto * inp_attn = build_attn_inp_kv();
	+ ggml_tensor * inp_out_ids = build_inp_out_ids();
	+ // Two recurrent states: hidden_high starts as the token embeddings, hidden_low as z_l_init repeated to the same shape.
	+ ggml_tensor * hidden_high = build_inp_embd(model.tok_embd);
	+ ggml_tensor * hidden_low = ggml_repeat(ctx0, model.hrm_z_l_init, hidden_high);
	+ cb(hidden_low, "hrm_z_l_init", -1);
	+
	+ const float kq_scale = 1.0f / std::sqrt(float(n_embd_head));
	+ // build_stack: one pass over the n_stack layers whose slots start at slot_offset, then a final RMS norm.
	+ auto build_stack = [&](ggml_tensor * stack_inp, int slot_offset) -> ggml_tensor * {
	+ ggml_tensor * stack_cur = stack_inp;
	+
	+ for (int layer_idx = 0; layer_idx < n_stack; ++layer_idx) {
	+ const int il = slot_offset + layer_idx;
	+ const auto & layer = model.layers[il];
	+ // Pre-attention RMS norm; the two tensor arguments of build_norm are passed as nullptr.
	+ ggml_tensor * inpSA = stack_cur;
	+ ggml_tensor * cur = build_norm(stack_cur, nullptr, nullptr, LLM_NORM_RMS, il);
	+ cb(cur, "attn_norm", il);
	+
	+ {
	+ ggml_tensor * attn_inp = cur;
	+ auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il);
	+ // The gate projection is computed from the same normed input as Q/K/V.
	+ ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, attn_inp, layer.wqkv_gate_s);
	+ cb(gate, "attn_gate_proj", il);
	+
	+ Qcur = ggml_rope_ext(
	+ ctx0, Qcur, inp_pos, nullptr,
	+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
	+ ext_factor, attn_factor, beta_fast, beta_slow);
	+ cb(Qcur, "Qcur_rope", il);
	+
	+ Kcur = ggml_rope_ext(
	+ ctx0, Kcur, inp_pos, nullptr,
	+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
	+ ext_factor, attn_factor, beta_fast, beta_slow);
	+ cb(Kcur, "Kcur_rope", il);
	+
	+ cur = build_attn(inp_attn,
	+ nullptr, nullptr, nullptr,
	+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
	+ cb(cur, "attn_out", il);
	+ // Multiply the attention output by sigmoid(gate) before the output projection.
	+ gate = ggml_sigmoid(ctx0, gate);
	+ cb(gate, "attn_gate_sig", il);
	+
	+ cur = ggml_mul(ctx0, cur, gate);
	+ cb(cur, "attn_gated", il);
	+
	+ cur = build_lora_mm(layer.wo, cur, layer.wo_s);
	+ cb(cur, "attn_o_proj", il);
	+ }
	+ // Residual connection around the attention block.
	+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
	+ cb(ffn_inp, "ffn_inp", il);
	+
	+ cur = build_norm(ffn_inp, nullptr, nullptr, LLM_NORM_RMS, il);
	+ cb(cur, "ffn_norm", il);
	+
	+ cur = build_ffn(cur,
	+ layer.ffn_up, nullptr, layer.ffn_up_s,
	+ layer.ffn_gate, nullptr, layer.ffn_gate_s,
	+ layer.ffn_down, nullptr, layer.ffn_down_s,
	+ nullptr,
	+ LLM_FFN_SILU, LLM_FFN_PAR, il);
	+ cb(cur, "ffn_out", il);
	+ // Residual connection around the FFN, followed by build_cvec for this slot.
	+ cur = ggml_add(ctx0, cur, ffn_inp);
	+ cur = build_cvec(cur, il);
	+ cb(cur, "hrm_layer_out", il);
	+
	+ stack_cur = cur;
	+ }
	+
	+ stack_cur = build_norm(stack_cur, nullptr, nullptr, LLM_NORM_RMS, slot_offset);
	+ cb(stack_cur, "stack_final_norm", slot_offset);
	+ return stack_cur;
	+ };
	+ // Each H cycle: l_cycles passes updating hidden_low, then one pass updating hidden_high; every pass gets its own slot range.
	+ for (int h = 0; h < h_cycles; ++h) {
	+ for (int l = 0; l < l_cycles; ++l) {
	+ const int slot_offset = int((h * (l_cycles + 1) + l) * n_stack);
	+ hidden_low = build_stack(ggml_add(ctx0, hidden_low, hidden_high), slot_offset);
	+ }
	+
	+ const int slot_offset = int((h * (l_cycles + 1) + l_cycles) * n_stack);
	+ hidden_high = build_stack(ggml_add(ctx0, hidden_high, hidden_low), slot_offset);
	+ }
	+
	+ ggml_tensor * cur = hidden_high;
	+ // Keep only the rows selected by inp_out_ids, when that input is present.
	+ if (inp_out_ids) {
	+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
	+ }
	+
	+ res->t_embd = cur;
	+
	+ cur = build_lora_mm(model.output, cur, model.output_s);
	+ cb(cur, "result_output", -1);
	+
	+ res->t_logits = cur;
	+ ggml_build_forward_expand(gf, cur);
	+}
	diff --git a/src/models/models.h b/src/models/models.h
	index 7e551eb96..7da6b7f7f 100644
	--- a/src/models/models.h
	+++ b/src/models/models.h
	@@ -515,6 +515,20 @@ struct llama_model_qwen3 : public llama_model_base {
	std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
	};

	+struct llama_model_hrm_text : public llama_model_base { // model type instantiated for LLM_ARCH_HRM_TEXT
	+ llama_model_hrm_text(const struct llama_model_params & params) : llama_model_base(params) {}
	+ void load_arch_hparams(llama_model_loader & ml) override;
	+ void load_arch_tensors(llama_model_loader & ml) override;
	+ // Created by load_arch_tensors with shape {n_embd}; the graph repeats it to form the initial low-level state.
	+ ggml_tensor * hrm_z_l_init = nullptr;
	+
	+ struct graph : public llm_graph_context { // graph context whose constructor builds the forward pass
	+ graph(const llama_model & model, const llm_graph_params & params);
	+ };
	+
	+ std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
	+};
	+

	struct llama_model_qwen3moe : public llama_model_base {
	llama_model_qwen3moe(const struct llama_model_params & params) : llama_model_base(params) {}