Instructions to use sinimiini/HRM-Text-1B-GGUF with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use sinimiini/HRM-Text-1B-GGUF with llama-cpp-python:

# !pip install llama-cpp-python

from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="sinimiini/HRM-Text-1B-GGUF",
	filename="HRM-Text-1B-BF16.gguf",
)

output = llm(
	"Once upon a time,",
	max_tokens=512,
	echo=True
)
print(output)

Notebooks
Google Colab
Kaggle
Local Apps

llama.cpp

How to use sinimiini/HRM-Text-1B-GGUF with llama.cpp:

Install from brew

brew install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Install from WinGet (Windows)

winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Use pre-built binary

# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
./llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Build from source code

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf sinimiini/HRM-Text-1B-GGUF:BF16
# Run inference directly in the terminal:
./build/bin/llama-cli -hf sinimiini/HRM-Text-1B-GGUF:BF16

Use Docker

docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16

LM Studio
Jan

vLLM

How to use sinimiini/HRM-Text-1B-GGUF with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "sinimiini/HRM-Text-1B-GGUF"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "sinimiini/HRM-Text-1B-GGUF",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16

Ollama
How to use sinimiini/HRM-Text-1B-GGUF with Ollama:
```
ollama run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16
```

Unsloth Studio (new)

How to use sinimiini/HRM-Text-1B-GGUF with Unsloth Studio:

Install Unsloth Studio (macOS, Linux, WSL)

curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Install Unsloth Studio (Windows)

irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Using HuggingFace Spaces for Unsloth

# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for sinimiini/HRM-Text-1B-GGUF to start chatting

Docker Model Runner
How to use sinimiini/HRM-Text-1B-GGUF with Docker Model Runner:
```
docker model run hf.co/sinimiini/HRM-Text-1B-GGUF:BF16
```

Lemonade

How to use sinimiini/HRM-Text-1B-GGUF with Lemonade:

Pull the model

# Download Lemonade from https://lemonade-server.ai/
lemonade pull sinimiini/HRM-Text-1B-GGUF:BF16

Run and chat with the model

lemonade run user.HRM-Text-1B-GGUF-BF16

List all available models

lemonade list

HRM-Text-1B-GGUF

File size: 25,011 Bytes

1c10575

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123df..ecf1be2db 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -95,6 +95,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
     "HunYuanDenseV1ForCausalLM": "hunyuan",
     "HunYuanMoEV1ForCausalLM": "hunyuan",
     "HunYuanVLForConditionalGeneration": "hunyuan",
+    "HrmTextForCausalLM": "hrm_text",
     "IQuestCoderForCausalLM": "llama",
     "InternLM2ForCausalLM": "internlm",
     "InternLM3ForCausalLM": "internlm",
diff --git a/conversion/hrm_text.py b/conversion/hrm_text.py
new file mode 100644
index 000000000..1f29ab55e
--- /dev/null
+++ b/conversion/hrm_text.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import re
+import json
+
+from typing import Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, TextModel, gguf, logger
+
+
+@ModelBase.register("HrmTextForCausalLM")
+class HrmTextModel(TextModel):
+    """Converter for ``HrmTextForCausalLM`` checkpoints.
+
+    Tensors of the L stack are written as blocks ``0..n-1`` and tensors of the
+    H stack as blocks ``n..2n-1`` (``n = num_hidden_layers``), while the block
+    count written to the metadata is ``n * H_cycles * (L_cycles + 1)``.
+    """
+
+    model_arch = gguf.MODEL_ARCH.HRM_TEXT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Load config.json; the HRM-specific keys used below are read from it.
+        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+            self.raw_hparams = json.load(f)
+
+        self.layers_per_stack = self.raw_hparams["num_hidden_layers"]
+        self.h_cycles = self.raw_hparams["H_cycles"]
+        self.l_cycles = self.raw_hparams["L_cycles"]
+        self.physical_block_count = self.layers_per_stack * 2
+        self.cache_block_count = self.layers_per_stack * self.h_cycles * (self.l_cycles + 1)
+
+        # GGUF tensors store one physical L stack followed by one physical H stack.
+        # The runtime expands these 2 * layers_per_stack physical layers across
+        # cache_block_count KV-cache slots.
+        self.block_count = self.physical_block_count
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        # HRM-Text ships a Qwen2-style tokenizer.json. Keep it as a plain tokenizer;
+        # do not add a chat template for validation GGUFs.
+        self._set_vocab_gpt2()
+
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        """Return the fixed pre-tokenizer name ``qwen2``; the ``tokenizer`` argument is unused."""
+        del tokenizer
+        return "qwen2"
+
+    def set_gguf_parameters(self):
+        """Write the hyperparameters and the HRM recurrence keys to the GGUF writer."""
+        hp = self.raw_hparams
+        head_dim = hp["head_dim"]
+
+        self.gguf_writer.add_context_length(hp["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hp["hidden_size"])
+        # The stored block count is the expanded slot count, not the physical block count.
+        self.gguf_writer.add_block_count(self.cache_block_count)
+        self.gguf_writer.add_feed_forward_length(hp["intermediate_size"])
+        self.gguf_writer.add_head_count(hp["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hp["num_key_value_heads"])
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_rope_dimension_count(head_dim)
+        self.gguf_writer.add_rope_freq_base(hp.get("rope_theta", 10000.0))
+        self.gguf_writer.add_layer_norm_rms_eps(hp["rms_norm_eps"])
+        self.gguf_writer.add_embedding_scale(hp["embedding_scale"])
+
+        arch = self.gguf_writer.arch
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_LAYERS_PER_STACK.format(arch=arch), self.layers_per_stack)
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_H_CYCLES.format(arch=arch), self.h_cycles)
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.HRM_L_CYCLES.format(arch=arch), self.l_cycles)
+        self.gguf_writer.add_bool(gguf.Keys.LLM.HRM_PREFIX_LM.format(arch=arch), bool(hp.get("prefix_lm", False)))
+
+    def _format(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
+        """Build the GGUF tensor name for ``key``, optionally for block ``bid``."""
+        return self.format_tensor_name(key, bid=bid, suffix=suffix)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Map one checkpoint tensor to its GGUF tensor(s).
+
+        Fused ``gqkv_proj`` and ``gate_up_proj`` weights are split along dim 0.
+        The incoming ``bid`` is not used; the block id is derived from the stack
+        letter and layer index in ``name``. Raises ValueError for unknown names,
+        out-of-range layer indices, or an unexpected ``gqkv_proj`` row count.
+        """
+        if name == "model.embed_tokens.weight":
+            yield self._format(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
+            return
+
+        if name == "lm_head.weight":
+            yield self._format(gguf.MODEL_TENSOR.OUTPUT), data_torch
+            return
+
+        if name == "model.z_L_init":
+            yield self._format(gguf.MODEL_TENSOR.HRM_Z_L_INIT, suffix=""), data_torch
+            return
+
+        match = re.fullmatch(r"model\.([LH])_module\.layers\.(\d+)\.(.+)", name)
+        if match is None:
+            raise ValueError(f"Can not map tensor {name!r}")
+
+        stack, layer_s, tensor_name = match.groups()
+        layer_idx = int(layer_s)
+        if layer_idx >= self.layers_per_stack:
+            raise ValueError(f"Layer index {layer_idx} outside HRM stack size {self.layers_per_stack}")
+
+        # L-stack layers keep their index; H-stack layers are placed after them.
+        physical_bid = layer_idx + (self.layers_per_stack if stack == "H" else 0)
+
+        if tensor_name == "attn.gqkv_proj.weight":
+            # Rows are stacked as gate, q, k, v. Gate and q cover
+            # num_attention_heads * head_dim rows, k and v cover
+            # num_key_value_heads * head_dim rows, so split by explicit sizes;
+            # equal chunks are only correct when both head counts match.
+            hp = self.raw_hparams
+            q_rows = hp["num_attention_heads"] * hp["head_dim"]
+            kv_rows = hp["num_key_value_heads"] * hp["head_dim"]
+            split_sizes = [q_rows, q_rows, kv_rows, kv_rows]
+            if data_torch.shape[0] != sum(split_sizes):
+                raise ValueError(
+                    f"Unexpected row count {data_torch.shape[0]} for {name!r}; expected {sum(split_sizes)}"
+                )
+            gate, q, k, v = torch.split(data_torch, split_sizes, dim=0)
+            logger.debug("Split %s as gate, q, k, v", name)
+            yield self._format(gguf.MODEL_TENSOR.ATTN_GATE, physical_bid), gate.contiguous()
+            yield self._format(gguf.MODEL_TENSOR.ATTN_Q, physical_bid), q.contiguous()
+            yield self._format(gguf.MODEL_TENSOR.ATTN_K, physical_bid), k.contiguous()
+            yield self._format(gguf.MODEL_TENSOR.ATTN_V, physical_bid), v.contiguous()
+            return
+
+        if tensor_name == "attn.o_proj.weight":
+            yield self._format(gguf.MODEL_TENSOR.ATTN_OUT, physical_bid), data_torch
+            return
+
+        if tensor_name == "mlp.gate_up_proj.weight":
+            gate, up = torch.chunk(data_torch, 2, dim=0)
+            logger.debug("Split %s as gate, up", name)
+            yield self._format(gguf.MODEL_TENSOR.FFN_GATE, physical_bid), gate.contiguous()
+            yield self._format(gguf.MODEL_TENSOR.FFN_UP, physical_bid), up.contiguous()
+            return
+
+        if tensor_name == "mlp.down_proj.weight":
+            yield self._format(gguf.MODEL_TENSOR.FFN_DOWN, physical_bid), data_torch
+            return
+
+        raise ValueError(f"Can not map tensor {name!r}")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 7fdcf03d7..b84cc8827 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -144,6 +144,10 @@ class Keys:
         TOKEN_SHIFT_COUNT                 = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP         = "{arch}.interleave_moe_layer_step"
         FULL_ATTENTION_INTERVAL           = "{arch}.full_attention_interval"
+        HRM_LAYERS_PER_STACK              = "{arch}.layers_per_stack"
+        HRM_H_CYCLES                      = "{arch}.h_cycles"
+        HRM_L_CYCLES                      = "{arch}.l_cycles"
+        HRM_PREFIX_LM                     = "{arch}.prefix_lm"
         ACTIVATION_SPARSITY_SCALE         = "{arch}.activation_sparsity_scale"
         ALTUP_ACTIVE_IDX                  = "{arch}.altup.active_idx"
         ALTUP_NUM_INPUTS                  = "{arch}.altup.num_inputs"
@@ -410,6 +414,7 @@ class MODEL_ARCH(IntEnum):
     QWEN3            = auto()
     QWEN3MOE         = auto()
     QWEN3NEXT        = auto()
+    HRM_TEXT         = auto()
     QWEN3VL          = auto()
     QWEN3VLMOE       = auto()
     QWEN35           = auto()
@@ -527,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
     TOKEN_TYPES          = auto()
     POS_EMBD             = auto()
     OUTPUT               = auto()
+    HRM_Z_L_INIT         = auto()
     DENSE_2_OUT          = auto() # embeddinggemma 2_Dense
     DENSE_3_OUT          = auto() # embeddinggemma 3_Dense
     OUTPUT_NORM          = auto()
@@ -925,6 +931,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.QWEN3:            "qwen3",
     MODEL_ARCH.QWEN3MOE:         "qwen3moe",
     MODEL_ARCH.QWEN3NEXT:        "qwen3next",
+    MODEL_ARCH.HRM_TEXT:         "hrm_text",
     MODEL_ARCH.QWEN3VL:          "qwen3vl",
     MODEL_ARCH.QWEN3VLMOE:       "qwen3vlmoe",
     MODEL_ARCH.QWEN35:           "qwen35",
@@ -1042,6 +1049,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
     MODEL_TENSOR.OUTPUT:                    "output",
+    MODEL_TENSOR.HRM_Z_L_INIT:              "hrm.z_l_init",
     MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
     MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
@@ -2057,6 +2065,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.SSM_BETA_ALPHA,
         MODEL_TENSOR.SSM_OUT
     ],
+    MODEL_ARCH.HRM_TEXT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.HRM_Z_L_INIT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_GATE,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.QWEN3VL: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18a..5b8ee3781 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN3,            "qwen3"            },
     { LLM_ARCH_QWEN3MOE,         "qwen3moe"         },
     { LLM_ARCH_QWEN3NEXT,        "qwen3next"        },
+    { LLM_ARCH_HRM_TEXT,         "hrm_text"         },
     { LLM_ARCH_QWEN3VL,          "qwen3vl"          },
     { LLM_ARCH_QWEN3VLMOE,       "qwen3vlmoe"       },
     { LLM_ARCH_QWEN35,           "qwen35"           },
@@ -209,6 +210,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },
     { LLM_KV_FULL_ATTENTION_INTERVAL,           "%s.full_attention_interval"           },
+    { LLM_KV_HRM_LAYERS_PER_STACK,              "%s.layers_per_stack"                  },
+    { LLM_KV_HRM_H_CYCLES,                      "%s.h_cycles"                          },
+    { LLM_KV_HRM_L_CYCLES,                      "%s.l_cycles"                          },
+    { LLM_KV_HRM_PREFIX_LM,                     "%s.prefix_lm"                         },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
@@ -346,6 +351,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_OUTPUT_NORM,                            "output_norm" },
     { LLM_TENSOR_OUTPUT_NORM_LFM2,                       "token_embd_norm" }, // fix for wrong tensor name
     { LLM_TENSOR_OUTPUT,                                 "output" },
+    { LLM_TENSOR_HRM_Z_L_INIT,                           "hrm.z_l_init" },
     { LLM_TENSOR_ROPE_FREQS,                             "rope_freqs" },
     { LLM_TENSOR_ATTN_NORM,                              "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q,                                 "blk.%d.attn_q" },
@@ -565,6 +571,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},  // do the norms on the first layer (not the input layer)
+    {LLM_TENSOR_HRM_Z_L_INIT,               {LLM_TENSOR_LAYER_INPUT,     GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc3..fa04b684b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -41,6 +41,7 @@ enum llm_arch {
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
     LLM_ARCH_QWEN3NEXT,
+    LLM_ARCH_HRM_TEXT,
     LLM_ARCH_QWEN3VL,
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_QWEN35,
@@ -213,6 +214,10 @@ enum llm_kv {
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
     LLM_KV_FULL_ATTENTION_INTERVAL,
+    LLM_KV_HRM_LAYERS_PER_STACK,
+    LLM_KV_HRM_H_CYCLES,
+    LLM_KV_HRM_L_CYCLES,
+    LLM_KV_HRM_PREFIX_LM,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -354,6 +359,7 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_2_OUT,
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_HRM_Z_L_INIT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index ad36c0666..fa80f4260 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2208,6 +2208,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
+    if (model.arch == LLM_ARCH_HRM_TEXT) {
+        return std::max<uint32_t>(n_tokens * 80, 64u * model.n_tensors());
+    }
     uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
     for (const auto & lora : model.loras) {
         res += lora->get_n_nodes();
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e2d051edc..812598f69 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -164,6 +164,12 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
 
+    // HRM-Text recurrence metadata. n_layer remains the expanded KV-cache slot count.
+    uint32_t n_hrm_layer_per_stack = 0;
+    uint32_t n_hrm_h_cycles        = 0;
+    uint32_t n_hrm_l_cycles        = 0;
+    bool     hrm_prefix_lm         = false;
+
     // grok-2
     float    f_attn_out_scale = 0.0f;
     uint32_t attn_temp_length = 0;
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 528e4c9c0..8a6e009c6 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -245,6 +245,10 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_TOKEN_SHIFT_COUNT,                 hparams.token_shift_count);
     add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
     // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL,           ???);
+    add_kv(LLM_KV_HRM_LAYERS_PER_STACK,              hparams.n_hrm_layer_per_stack);
+    add_kv(LLM_KV_HRM_H_CYCLES,                      hparams.n_hrm_h_cycles);
+    add_kv(LLM_KV_HRM_L_CYCLES,                      hparams.n_hrm_l_cycles);
+    add_kv(LLM_KV_HRM_PREFIX_LM,                     hparams.hrm_prefix_lm);
 
     add_kv(LLM_KV_ATTENTION_HEAD_COUNT,              hparams.n_head_arr, true);
     add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8bf20a716..a3cc996aa 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -96,6 +96,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_qwen2moe(params);
         case LLM_ARCH_QWEN3:
             return new llama_model_qwen3(params);
+        case LLM_ARCH_HRM_TEXT:
+            return new llama_model_hrm_text(params);
         case LLM_ARCH_QWEN3MOE:
             return new llama_model_qwen3moe(params);
         case LLM_ARCH_QWEN3VL:
@@ -2339,6 +2341,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
         case LLM_ARCH_QWEN3NEXT:
+        case LLM_ARCH_HRM_TEXT:
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_STEP35:
             return LLAMA_ROPE_TYPE_NEOX;
diff --git a/src/models/hrm-text.cpp b/src/models/hrm-text.cpp
new file mode 100644
index 000000000..e0a3e9f59
--- /dev/null
+++ b/src/models/hrm-text.cpp
@@ -0,0 +1,224 @@
+#include "models.h"
+
+#include <cmath>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+// Reads the HRM-Text metadata keys into hparams and picks the model type from n_embd.
+void llama_model_hrm_text::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
+    ml.get_key(LLM_KV_HRM_LAYERS_PER_STACK,        hparams.n_hrm_layer_per_stack);
+    ml.get_key(LLM_KV_HRM_H_CYCLES,                hparams.n_hrm_h_cycles);
+    ml.get_key(LLM_KV_HRM_L_CYCLES,                hparams.n_hrm_l_cycles);
+    // NOTE(review): the trailing `false` presumably marks this key as optional - confirm.
+    ml.get_key(LLM_KV_HRM_PREFIX_LM,               hparams.hrm_prefix_lm, false);
+
+    switch (hparams.n_embd) {
+        case 1536: type = LLM_TYPE_1B; break;
+        default:   type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+// Creates the model tensors. The file stores 2 * n_stack physical blocks (L stack first,
+// then H stack); every one of the n_layer expanded slots is bound to one of them. The
+// first slot that uses a physical block creates it with flags 0, later slots pass
+// TENSOR_DUPLICATED. Throws std::runtime_error when the HRM metadata is inconsistent.
+void llama_model_hrm_text::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_stack       = hparams.n_hrm_layer_per_stack;
+    const int64_t n_h_cycles    = hparams.n_hrm_h_cycles;
+    const int64_t n_l_cycles    = hparams.n_hrm_l_cycles;
+    const int64_t n_cycle_slots = n_stack * (n_l_cycles + 1);
+
+    // Validate before any index math: n_stack == 0 would make the modulo/division below
+    // undefined, and the graph indexes model.layers up to n_h_cycles * n_cycle_slots - 1.
+    if (n_stack <= 0 || n_h_cycles <= 0) {
+        throw std::runtime_error("hrm_text: layers_per_stack and h_cycles must be greater than zero");
+    }
+    if (int64_t(n_layer) != n_h_cycles * n_cycle_slots) {
+        throw std::runtime_error(
+                "hrm_text: block count " + std::to_string(n_layer) +
+                " does not match layers_per_stack * h_cycles * (l_cycles + 1) = " +
+                std::to_string(n_h_cycles * n_cycle_slots));
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    output   = create_tensor(tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab}, 0);
+
+    hrm_z_l_init = create_tensor(tn(LLM_TENSOR_HRM_Z_L_INIT), {n_embd}, 0);
+
+    std::vector<bool> loaded_physical(2 * n_stack, false);
+
+    for (int il = 0; il < n_layer; ++il) {
+        auto & layer = layers[il];
+
+        // Within one H cycle the first n_l_cycles phases use the L stack, the last one the H stack.
+        const int64_t layer_in_stack = il % n_stack;
+        const int64_t phase = (il % n_cycle_slots) / n_stack;
+        const bool is_h_stack = phase == n_l_cycles;
+        const int physical_bid = int((is_h_stack ? n_stack : 0) + layer_in_stack);
+
+        const int flags = loaded_physical[physical_bid] ? TENSOR_DUPLICATED : 0;
+        loaded_physical[physical_bid] = true;
+
+        create_tensor_qkv(layer, physical_bid,
+                n_embd,
+                n_embd_head_k * n_head,
+                n_embd_k_gqa,
+                n_embd_v_gqa,
+                flags);
+
+        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", physical_bid), {n_embd, n_embd_head_k * n_head}, flags);
+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "weight", physical_bid), {n_embd_head_k * n_head, n_embd}, flags);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", physical_bid), {n_embd, n_ff}, flags);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", physical_bid), {n_ff, n_embd}, flags);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", physical_bid), {n_embd, n_ff}, flags);
+    }
+}
+
+// Builds a new HRM-Text graph for the given parameters.
+std::unique_ptr<llm_graph_context> llama_model_hrm_text::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+// Builds the HRM-Text forward graph.
+//
+// Two states are carried: hidden_high starts as the token embeddings and hidden_low as
+// z_l_init repeated to the same shape. Each of the h_cycles iterations runs the L stack
+// l_cycles times on (hidden_low + hidden_high), updating hidden_low, then runs the H stack
+// once on (hidden_high + hidden_low), updating hidden_high. Every stack pass uses its own
+// range of n_stack consecutive layer slots. The logits are computed from hidden_high.
+llama_model_hrm_text::graph::graph(const llama_model & model_, const llm_graph_params & params) : llm_graph_context(params) {
+    const auto & model = static_cast<const llama_model_hrm_text &>(model_);
+
+    GGML_ASSERT(model.tok_embd != nullptr);
+    GGML_ASSERT(model.output != nullptr);
+    GGML_ASSERT(model.hrm_z_l_init != nullptr);
+
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    const int64_t n_stack = hparams.n_hrm_layer_per_stack;
+    const int64_t h_cycles = hparams.n_hrm_h_cycles;
+    const int64_t l_cycles = hparams.n_hrm_l_cycles;
+
+    // Every slot addressed by the loops below must exist in model.layers.
+    GGML_ASSERT(n_stack > 0 && h_cycles > 0);
+    GGML_ASSERT(size_t(h_cycles * (l_cycles + 1) * n_stack) <= model.layers.size());
+
+    ggml_tensor * inp_pos = build_inp_pos();
+    auto * inp_attn = build_attn_inp_kv();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    ggml_tensor * hidden_high = build_inp_embd(model.tok_embd);
+    ggml_tensor * hidden_low = ggml_repeat(ctx0, model.hrm_z_l_init, hidden_high);
+    cb(hidden_low, "hrm_z_l_init", -1);
+
+    const float kq_scale = 1.0f / std::sqrt(float(n_embd_head));
+
+    // Runs one pass over n_stack layers whose slots start at slot_offset and returns the
+    // RMS-normalized result.
+    auto build_stack = [&](ggml_tensor * stack_inp, int slot_offset) -> ggml_tensor * {
+        ggml_tensor * stack_cur = stack_inp;
+
+        for (int layer_idx = 0; layer_idx < n_stack; ++layer_idx) {
+            const int il = slot_offset + layer_idx;
+            const auto & layer = model.layers[il];
+
+            ggml_tensor * inpSA = stack_cur;
+            ggml_tensor * cur = build_norm(stack_cur, nullptr, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            {
+                ggml_tensor * attn_inp = cur;
+                auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il);
+
+                // The gate is projected from the same normalized input as q/k/v.
+                ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, attn_inp, layer.wqkv_gate_s);
+                cb(gate, "attn_gate_proj", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = build_attn(inp_attn,
+                        nullptr, nullptr, nullptr,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+
+                // Attention output is multiplied by sigmoid(gate) before the output projection.
+                gate = ggml_sigmoid(ctx0, gate);
+                cb(gate, "attn_gate_sig", il);
+
+                cur = ggml_mul(ctx0, cur, gate);
+                cb(cur, "attn_gated", il);
+
+                cur = build_lora_mm(layer.wo, cur, layer.wo_s);
+                cb(cur, "attn_o_proj", il);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp, nullptr, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    layer.ffn_up,   nullptr, layer.ffn_up_s,
+                    layer.ffn_gate, nullptr, layer.ffn_gate_s,
+                    layer.ffn_down, nullptr, layer.ffn_down_s,
+                    nullptr,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = build_cvec(cur, il);
+            cb(cur, "hrm_layer_out", il);
+
+            stack_cur = cur;
+        }
+
+        stack_cur = build_norm(stack_cur, nullptr, nullptr, LLM_NORM_RMS, slot_offset);
+        cb(stack_cur, "stack_final_norm", slot_offset);
+        return stack_cur;
+    };
+
+    for (int h = 0; h < h_cycles; ++h) {
+        for (int l = 0; l < l_cycles; ++l) {
+            const int slot_offset = int((h * (l_cycles + 1) + l) * n_stack);
+            hidden_low = build_stack(ggml_add(ctx0, hidden_low, hidden_high), slot_offset);
+        }
+
+        // The H stack uses the last n_stack slots of this cycle.
+        const int slot_offset = int((h * (l_cycles + 1) + l_cycles) * n_stack);
+        hidden_high = build_stack(ggml_add(ctx0, hidden_high, hidden_low), slot_offset);
+    }
+
+    ggml_tensor * cur = hidden_high;
+
+    // Keep only the rows selected by inp_out_ids when that input is present.
+    if (inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur, model.output_s);
+    cb(cur, "result_output", -1);
+
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index 7e551eb96..7da6b7f7f 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -515,6 +515,20 @@ struct llama_model_qwen3 : public llama_model_base {
     std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
+struct llama_model_hrm_text : public llama_model_base { // HRM-Text model: loader hooks and graph builder
+    llama_model_hrm_text(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override; // reads the HRM metadata keys into hparams
+    void load_arch_tensors(llama_model_loader & ml) override; // creates the tensors for every layer slot
+    // Created from LLM_TENSOR_HRM_Z_L_INIT; the graph repeats it to form the initial hidden_low state.
+    ggml_tensor * hrm_z_l_init = nullptr;
+
+    struct graph : public llm_graph_context { // the whole compute graph is built in the constructor
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; // returns a new graph
+};
+
 
 struct llama_model_qwen3moe : public llama_model_base {
     llama_model_qwen3moe(const struct llama_model_params & params) : llama_model_base(params) {}