gyung committed on
Commit
857a0d8
·
verified ·
1 Parent(s): f9f5786

Upload HRM Ko-Terminal tokenizer v1

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HRM Ko-Terminal 131K Tokenizer v1
2
+
3
+ Built on 2026-05-23 for HRM-Text Korean terminal/tool-call pre-training.
4
+
5
+ ## Training
6
+
7
+ - Algorithm: byte-level BPE
8
+ - Vocabulary size: 131,072
9
+ - Normalization: NFC
10
+ - Corpus cap: 2.5 GiB total, 256 MiB per top-level input source
11
+ - Goal: Korean, English, code, terminal commands, JSON/tool-call formats
12
+
13
+ ## Efficiency Check
14
+
15
+ | sample | chars/token |
16
+ |---|---:|
17
+ | Korean general | 2.60 |
18
+ | Korean legal | 2.36 |
19
+ | Korean terminal instruction | 2.18 |
20
+ | Shell command | 2.68 |
21
+ | Tool-call JSON | 3.32 |
22
+ | Python code | 3.37 |
23
+ | English | 4.40 |
24
+
25
+ Each of the following core HRM/chat/tool special tokens encodes as a single token:
26
+
27
+ - `<|im_start|>`
28
+ - `<|im_end|>`
29
+ - `<|assistant|>`
30
+ - `<|tool_call|>`
31
+ - `<|terminal|>`
32
+ - `<|box_end|>`
33
+
34
+ ## Notes
35
+
36
+ This tokenizer keeps HRM-Text control tokens used by `scripts/prepare_sft_data.py`,
37
+ including the default condition mapping (written as `condition=control token`):
38
+
39
+ - `direct=<|object_ref_start|>`
40
+ - `cot=<|object_ref_end|>`
41
+ - `noisy=<|quad_start|>`
42
+ - `synth=<|quad_end|>`
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6d5204ebfb25e992926714af88ad6b77e12a90ea6f3eb0f200e1a1f8712d5c
3
+ size 11457812
tokenizer_training_manifest.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 131072,
3
+ "requested_vocab_size": 131072,
4
+ "max_gib": 2.5,
5
+ "max_mib_per_input": 256.0,
6
+ "min_frequency": 2,
7
+ "special_tokens": [
8
+ "<|PAD|>",
9
+ "<|unk|>",
10
+ "<|im_start|>",
11
+ "<|im_end|>",
12
+ "<|system|>",
13
+ "<|user|>",
14
+ "<|assistant|>",
15
+ "<|tool_call|>",
16
+ "<|/tool_call|>",
17
+ "<|tool_response|>",
18
+ "<|function|>",
19
+ "<|/function|>",
20
+ "<|execute|>",
21
+ "<|result|>",
22
+ "<|terminal|>",
23
+ "<|/terminal|>",
24
+ "<|command|>",
25
+ "<|output|>",
26
+ "<|error|>",
27
+ "<|exit_code|>",
28
+ "<|json_start|>",
29
+ "<|json_end|>",
30
+ "<|xml_start|>",
31
+ "<|xml_end|>",
32
+ "<|code_start|>",
33
+ "<|code_end|>",
34
+ "<think>",
35
+ "</think>",
36
+ "<|direct|>",
37
+ "<|cot|>",
38
+ "<|noisy|>",
39
+ "<|synth|>",
40
+ "<|object_ref_start|>",
41
+ "<|object_ref_end|>",
42
+ "<|box_start|>",
43
+ "<|box_end|>",
44
+ "<|quad_start|>",
45
+ "<|quad_end|>",
46
+ "<|vision_start|>",
47
+ "<|vision_end|>",
48
+ "<|vision_pad|>",
49
+ "<|image_pad|>",
50
+ "<|video_pad|>",
51
+ "<|fim_prefix|>",
52
+ "<|fim_middle|>",
53
+ "<|fim_suffix|>"
54
+ ],
55
+ "inputs": [
56
+ "HRM-Text/legalize-kr",
57
+ "HRM-Text/ordinance-kr",
58
+ "admrule-kr",
59
+ "precedent-kr",
60
+ "dataset",
61
+ "HRM-Text/data_toolbench/data",
62
+ "/home/work/.data/huggingface/hrm_text_extra/sft",
63
+ "/home/work/.data/huggingface/hrm_text_extra/tokenizer_corpus",
64
+ "/home/work/.data/huggingface/hrm_text_extra/raw/angrygiraffe__claude-opus-4.6-4.7-reasoning-8.7k"
65
+ ]
66
+ }