feat: add custom tokenizer with multi-char Chinese token splitting

#8
Files changed (2) hide show
  1. tokenization_voxcpm2.py +72 -0
  2. tokenizer_config.json +9 -3
tokenization_voxcpm2.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom tokenizer for VoxCPM2 that splits multi-character Chinese tokens.
2
+
3
+ VoxCPM2 was trained with ``mask_multichar_chinese_tokens`` which splits
4
+ multi-character Chinese tokens (e.g. "你好" -> ["你", "好"]) into individual
5
+ character IDs before embedding. The base LlamaTokenizerFast produces
6
+ multi-character Chinese tokens that the model has never seen during training,
7
+ yielding garbled Chinese audio output in downstream inference frameworks.
8
+
9
+ This module provides ``VoxCPM2Tokenizer`` which transparently applies the
10
+ character splitting inside ``encode()`` and ``__call__()``, so any downstream
11
+ consumer (vLLM, vLLM-Omni, Nano-vLLM, etc.) gets correct single-character
12
+ IDs without code changes.
13
+ """
14
+
15
+ from transformers import LlamaTokenizerFast
16
+
17
+
18
class VoxCPM2Tokenizer(LlamaTokenizerFast):
    """LlamaTokenizerFast that expands multi-character Chinese tokens.

    VoxCPM2 was trained on per-character Chinese token ids, so every
    multi-character CJK token produced by the base tokenizer is replaced
    by the ids of its individual characters in ``encode()`` and
    ``__call__()``. Downstream consumers therefore get correct
    single-character ids without code changes.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Precompute the token-id -> [char-id, ...] table once at load
        # time; vocab scanning is too slow to repeat per call.
        self._split_map = self._build_split_map()

    def _build_split_map(self) -> dict[int, list[int]]:
        """Return a map from multi-char CJK token ids to per-char ids.

        A token is only added when *every* constituent character has a
        real id of its own in the vocabulary; otherwise the original
        multi-character token is left untouched at encode time.
        """
        vocab = self.get_vocab()
        split_map: dict[int, list[int]] = {}
        for token, tid in vocab.items():
            # Strip the SentencePiece word-boundary marker (U+2581).
            clean = token.replace("\u2581", "")
            if len(clean) >= 2 and all(self._is_cjk(c) for c in clean):
                char_ids = self.convert_tokens_to_ids(list(clean))
                # Fast tokenizers return None -- not unk_token_id -- for
                # tokens missing from the vocab, so reject both. The
                # previous check only compared against unk_token_id and
                # could let None ids leak into input_ids.
                if all(
                    cid is not None and cid != self.unk_token_id
                    for cid in char_ids
                ):
                    split_map[tid] = char_ids
        return split_map

    @staticmethod
    def _is_cjk(c: str) -> bool:
        """True if ``c`` is a CJK ideograph (basic block, Ext-A/B, compat)."""
        return (
            "\u4e00" <= c <= "\u9fff"        # CJK Unified Ideographs
            or "\u3400" <= c <= "\u4dbf"     # Extension A
            or "\uf900" <= c <= "\ufaff"     # Compatibility Ideographs
            or "\U00020000" <= c <= "\U0002a6df"  # Extension B
        )

    def _expand_ids(self, ids: list[int]) -> list[int]:
        """Replace each splittable token id with its per-character ids."""
        result: list[int] = []
        for tid in ids:
            expansion = self._split_map.get(tid)
            if expansion is not None:
                result.extend(expansion)
            else:
                result.append(tid)
        return result

    def encode(self, text, *args, **kwargs):
        """Encode ``text``, then expand multi-char CJK token ids.

        NOTE(review): expansion happens after truncation, so the returned
        sequence may exceed a requested ``max_length`` when CJK tokens
        are split — confirm downstream length handling.
        """
        ids = super().encode(text, *args, **kwargs)
        return self._expand_ids(ids)

    def __call__(self, text, *args, **kwargs):
        """Tokenize via the base class, then expand CJK ids in place.

        Handles both single (flat list) and batched (list of lists)
        outputs. The attention mask is rebuilt as all ones to match the
        new lengths — this assumes no padding was requested; a padded
        batch would lose its 0 entries. NOTE(review): tensor outputs
        (``return_tensors=...``) are returned unexpanded because the
        isinstance checks below only match plain lists — confirm callers
        request list output.
        """
        result = super().__call__(text, *args, **kwargs)
        if hasattr(result, "input_ids"):
            ids = result["input_ids"]
            if isinstance(ids, list) and ids and isinstance(ids[0], list):
                # Batched input: expand each sequence independently.
                result["input_ids"] = [self._expand_ids(x) for x in ids]
                if "attention_mask" in result:
                    result["attention_mask"] = [
                        [1] * len(x) for x in result["input_ids"]
                    ]
            elif isinstance(ids, list):
                # Single sequence.
                result["input_ids"] = self._expand_ids(ids)
                if "attention_mask" in result:
                    result["attention_mask"] = [1] * len(result["input_ids"])
        return result
tokenizer_config.json CHANGED
@@ -97,7 +97,7 @@
97
  "rstrip": false,
98
  "single_word": false,
99
  "special": true
100
- },
101
  "110": {
102
  "content": "<|/speaker_id|>",
103
  "lstrip": false,
@@ -205,8 +205,14 @@
205
  "pad_token": null,
206
  "sp_model_kwargs": {},
207
  "spaces_between_special_tokens": false,
208
- "tokenizer_class": "LlamaTokenizer",
209
  "unk_token": "<unk>",
210
  "use_default_system_prompt": false,
211
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 
 
 
 
 
 
212
  }
 
97
  "rstrip": false,
98
  "single_word": false,
99
  "special": true
100
+ },
101
  "110": {
102
  "content": "<|/speaker_id|>",
103
  "lstrip": false,
 
205
  "pad_token": null,
206
  "sp_model_kwargs": {},
207
  "spaces_between_special_tokens": false,
208
+ "tokenizer_class": "VoxCPM2Tokenizer",
209
  "unk_token": "<unk>",
210
  "use_default_system_prompt": false,
211
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
212
+ "auto_map": {
213
+ "AutoTokenizer": [
214
+ "tokenization_voxcpm2.VoxCPM2Tokenizer",
215
+ null
216
+ ]
217
+ }
218
  }