thekusaldarshana committed on
Commit
e59ea28
·
1 Parent(s): fa9c240
Files changed (6) hide show
  1. EVALUATION.md +7 -7
  2. encoder.py +80 -41
  3. linguis_trie.py +55 -35
  4. router.py +53 -115
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
EVALUATION.md CHANGED
@@ -153,15 +153,15 @@ Evaluating 1,499,950 sentences...
153
  ====== Sinhala Results ======
154
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
155
  ----------------------------------------------------------------------
156
- SGPE | 6,665,177 | 1.276 | 4.83 | -
157
- OpenAI (o200k_base) | 17,360,196 | 3.324 | 1.85 | 61.6%
158
- Llama 4 Scout | 18,157,707 | 3.476 | 1.77 | 63.3%
159
- DeepSeek V3 | 29,152,698 | 5.581 | 1.10 | 77.1%
160
 
161
  ====== Hindi Results ======
162
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
163
  ----------------------------------------------------------------------
164
- SGPE | 13,432,763 | 1.181 | 4.29 | -
165
  OpenAI (o200k_base) | 18,394,075 | 1.617 | 3.13 | 27.0%
166
  Llama 4 Scout | 19,566,121 | 1.720 | 2.94 | 31.3%
167
  DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
@@ -169,7 +169,7 @@ DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
169
  ====== English Results ======
170
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
171
  ----------------------------------------------------------------------
172
- SGPE | 7,240,151 | 1.330 | 4.46 | -
173
  OpenAI (o200k_base) | 7,420,527 | 1.364 | 4.35 | 2.4%
174
  Llama 4 Scout | 7,512,843 | 1.381 | 4.30 | 3.6%
175
  DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
@@ -177,7 +177,7 @@ DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
177
  ========================= OVERALL Results =========================
178
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
179
  ----------------------------------------------------------------------
180
- SGPE | 27,338,091 | 1.241 | 4.47 | -
181
  OpenAI (o200k_base) | 43,174,798 | 1.959 | 2.83 | 36.7%
182
  Llama 4 Scout | 45,236,671 | 2.053 | 2.70 | 39.6%
183
  DeepSeek V3 | 68,739,586 | 3.119 | 1.78 | 60.2%
 
153
  ====== Sinhala Results ======
154
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
155
  ----------------------------------------------------------------------
156
+ SGPE | 6,654,288 | 1.274 | 4.83 | -
157
+ OpenAI (o200k_base) | 17,360,196 | 3.324 | 1.85 | 61.7%
158
+ Llama 4 Scout | 18,157,707 | 3.476 | 1.77 | 63.4%
159
+ DeepSeek V3 | 29,152,698 | 5.581 | 1.10 | 77.2%
160
 
161
  ====== Hindi Results ======
162
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
163
  ----------------------------------------------------------------------
164
+ SGPE | 13,433,554 | 1.181 | 4.29 | -
165
  OpenAI (o200k_base) | 18,394,075 | 1.617 | 3.13 | 27.0%
166
  Llama 4 Scout | 19,566,121 | 1.720 | 2.94 | 31.3%
167
  DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
 
169
  ====== English Results ======
170
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
171
  ----------------------------------------------------------------------
172
+ SGPE | 7,240,147 | 1.330 | 4.46 | -
173
  OpenAI (o200k_base) | 7,420,527 | 1.364 | 4.35 | 2.4%
174
  Llama 4 Scout | 7,512,843 | 1.381 | 4.30 | 3.6%
175
  DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
 
177
  ========================= OVERALL Results =========================
178
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
179
  ----------------------------------------------------------------------
180
+ SGPE | 27,327,989 | 1.240 | 4.47 | -
181
  OpenAI (o200k_base) | 43,174,798 | 1.959 | 2.83 | 36.7%
182
  Llama 4 Scout | 45,236,671 | 2.053 | 2.70 | 39.6%
183
  DeepSeek V3 | 68,739,586 | 3.119 | 1.78 | 60.2%
encoder.py CHANGED
@@ -10,8 +10,35 @@ import argparse
10
  import json
11
  from typing import Optional
12
 
13
- from linguis_trie import LinguisTrie, build_sinhala_linguis_trie
14
- from gpe_trainer import segment_into_words, _is_boundary_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SGPEEncoder:
17
 
@@ -22,9 +49,17 @@ class SGPEEncoder:
22
  self.vocab: dict[str, int] = data["vocab"]
23
  self.merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
24
  self.special_tokens: list[str] = data["special_tokens"]
25
- self.tokenizer = build_sinhala_linguis_trie()
26
- self.unk_id = self.vocab.get("[UNK]", 1)
27
  self.leading_space: bool = data.get("leading_space", False)
 
 
 
 
 
 
 
 
 
 
28
 
29
  self._merge_priority: dict[tuple[str, str], int] = {
30
  (a, b): rank for rank, (a, b) in enumerate(self.merges)
@@ -55,19 +90,24 @@ class SGPEEncoder:
55
  return tokens
56
 
57
  def tokenize(self, text: str) -> list[str]:
58
- syllables = self.layer1_tokenize(text)
59
- words = segment_into_words(syllables)
60
- result: list[str] = []
61
- for word_tokens in words:
62
- if len(word_tokens) == 1 and _is_boundary_token(word_tokens[0]):
63
- result.append(word_tokens[0])
64
- continue
65
- cleaned = [t if t in self.vocab else "[UNK]" for t in word_tokens]
66
- result.extend(self._apply_merges_to_word(cleaned))
67
- return result
68
-
69
- def layer1_tokenize(self, text: str) -> list[str]:
70
- return self.tokenizer.tokenize(text, leading_space=self.leading_space)
 
 
 
 
 
71
 
72
  def decode(self, ids: list[int]) -> str:
73
  id_to_token = {v: k for k, v in self.vocab.items()}
@@ -155,15 +195,15 @@ class WWHOMetaEncoder:
155
  self._meta = MetaVocab(sgpe_vocab, self._tik.n_vocab)
156
  self._space_id: int = self._meta._sgpe_offset[" "]
157
 
158
- # Router
159
- from router import CodeSwitchSegmenter, Script
160
- self._segmenter = CodeSwitchSegmenter()
161
- self._Script = Script
162
-
163
  # Indic LinguisTries
164
- from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie
165
- self._sinhala_dfa = build_sinhala_linguis_trie()
166
- self._devanagari_dfa = build_devanagari_linguis_trie()
 
 
 
 
 
167
 
168
  # ------------------------------------------------------------------
169
  # Public API
@@ -184,18 +224,17 @@ class WWHOMetaEncoder:
184
  def encode(self, text: str) -> list[int]:
185
  ids: list[int] = []
186
  for seg in self._segmenter.segment(text):
187
- if seg.script == self._Script.LATIN:
188
  ids.extend(self._tik.encode(seg.text))
189
  else:
190
- dfa = (
191
- self._sinhala_dfa
192
- if seg.script == self._Script.SINHALA
193
- else self._devanagari_dfa
194
- )
195
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
196
- words = segment_into_words(syllables)
197
  for word_toks in words:
198
- if len(word_toks) == 1 and _is_boundary_token(word_toks[0]):
199
  ids.extend(self._tik.encode(word_toks[0]))
200
  continue
201
  merged = self._apply_merges(word_toks)
@@ -226,19 +265,19 @@ class WWHOMetaEncoder:
226
  def tokenize(self, text: str) -> list[str]:
227
  tokens: list[str] = []
228
  for seg in self._segmenter.segment(text):
229
- if seg.script == self._Script.LATIN:
230
  ids = self._tik.encode(seg.text)
231
  tokens.extend(self._tik.decode([i]) for i in ids)
232
  else:
233
- dfa = (
234
- self._sinhala_dfa
235
- if seg.script == self._Script.SINHALA
236
- else self._devanagari_dfa
237
- )
238
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
239
- words = segment_into_words(syllables)
240
  for word_toks in words:
241
- if len(word_toks) == 1 and _is_boundary_token(word_toks[0]):
242
  ids = self._tik.encode(word_toks[0])
243
  tokens.extend(self._tik.decode([i]) for i in ids)
244
  continue
 
10
  import json
11
  from typing import Optional
12
 
13
+ from linguis_trie import LinguisTrie
14
+
15
+ def _is_boundary_token(token: str, segmenter) -> bool:
16
+ for ch in token:
17
+ if segmenter:
18
+ lang = segmenter._get_char_language(ch)
19
+ if lang is not None and lang != "latin":
20
+ return False
21
+ return True
22
+
23
+ def segment_into_words(syllables: list[str], segmenter) -> list[list[str]]:
24
+ words: list[list[str]] = []
25
+ current: list[str] = []
26
+
27
+ for tok in syllables:
28
+ if _is_boundary_token(tok, segmenter):
29
+ if current:
30
+ words.append(current)
31
+ current = []
32
+ words.append([tok])
33
+ else:
34
+ if tok[0] in (' ', '\t', '\n', '\r') and current:
35
+ words.append(current)
36
+ current = []
37
+ current.append(tok)
38
+
39
+ if current:
40
+ words.append(current)
41
+ return words
42
 
43
  class SGPEEncoder:
44
 
 
49
  self.vocab: dict[str, int] = data["vocab"]
50
  self.merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
51
  self.special_tokens: list[str] = data["special_tokens"]
 
 
52
  self.leading_space: bool = data.get("leading_space", False)
53
+
54
+ script_mode = data.get("script_mode", "mixed")
55
+
56
+ from linguis_trie import load_dfa_map
57
+ from router import CodeSwitchSegmenter
58
+
59
+ self._dfa_map = load_dfa_map(script_mode)
60
+
61
+ language_blocks = {lang: dfa.unicode_blocks for lang, dfa in self._dfa_map.items()}
62
+ self._segmenter = CodeSwitchSegmenter(language_blocks)
63
 
64
  self._merge_priority: dict[tuple[str, str], int] = {
65
  (a, b): rank for rank, (a, b) in enumerate(self.merges)
 
90
  return tokens
91
 
92
  def tokenize(self, text: str) -> list[str]:
93
+ tokens: list[str] = []
94
+ for seg in self._segmenter.segment(text):
95
+ if seg.language == "latin":
96
+ tokens.append(seg.text)
97
+ else:
98
+ dfa = self._dfa_map.get(seg.language)
99
+ if not dfa:
100
+ tokens.append(seg.text)
101
+ continue
102
+ syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
103
+ words = segment_into_words(syllables, self._segmenter)
104
+ for word_toks in words:
105
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
106
+ tokens.append(word_toks[0])
107
+ continue
108
+ cleaned = [t if t in self.vocab else "[UNK]" for t in word_toks]
109
+ tokens.extend(self._apply_merges_to_word(cleaned))
110
+ return tokens
111
 
112
  def decode(self, ids: list[int]) -> str:
113
  id_to_token = {v: k for k, v in self.vocab.items()}
 
195
  self._meta = MetaVocab(sgpe_vocab, self._tik.n_vocab)
196
  self._space_id: int = self._meta._sgpe_offset[" "]
197
 
 
 
 
 
 
198
  # Indic LinguisTries
199
+ from linguis_trie import load_dfa_map, LinguisTrie
200
+
201
+ self._dfa_map: dict[str, LinguisTrie] = load_dfa_map("mixed")
202
+
203
+ # Router Segmenter
204
+ from router import CodeSwitchSegmenter
205
+ language_blocks = {lang: dfa.unicode_blocks for lang, dfa in self._dfa_map.items()}
206
+ self._segmenter = CodeSwitchSegmenter(language_blocks)
207
 
208
  # ------------------------------------------------------------------
209
  # Public API
 
224
  def encode(self, text: str) -> list[int]:
225
  ids: list[int] = []
226
  for seg in self._segmenter.segment(text):
227
+ if seg.language == "latin":
228
  ids.extend(self._tik.encode(seg.text))
229
  else:
230
+ dfa = self._dfa_map.get(seg.language)
231
+ if not dfa:
232
+ ids.extend(self._tik.encode(seg.text))
233
+ continue
 
234
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
235
+ words = segment_into_words(syllables, self._segmenter)
236
  for word_toks in words:
237
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
238
  ids.extend(self._tik.encode(word_toks[0]))
239
  continue
240
  merged = self._apply_merges(word_toks)
 
265
  def tokenize(self, text: str) -> list[str]:
266
  tokens: list[str] = []
267
  for seg in self._segmenter.segment(text):
268
+ if seg.language == "latin":
269
  ids = self._tik.encode(seg.text)
270
  tokens.extend(self._tik.decode([i]) for i in ids)
271
  else:
272
+ dfa = self._dfa_map.get(seg.language)
273
+ if not dfa:
274
+ ids = self._tik.encode(seg.text)
275
+ tokens.extend(self._tik.decode([i]) for i in ids)
276
+ continue
277
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
278
+ words = segment_into_words(syllables, self._segmenter)
279
  for word_toks in words:
280
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
281
  ids = self._tik.encode(word_toks[0])
282
  tokens.extend(self._tik.decode([i]) for i in ids)
283
  continue
linguis_trie.py CHANGED
@@ -23,8 +23,9 @@ class SchemaError(ValueError):
23
  class LanguageSchema:
24
  language: str
25
  grammar_notation: str
26
- char_classes: dict[str, set[int]] # class-label → set of codepoints
27
- transitions: dict[str, dict[str, Optional[str]]] # state → (class → next_state | None)
 
28
  start_state: str
29
  accept_states: set[str]
30
  emit_states: set[str]
@@ -62,6 +63,10 @@ class SchemaLoader:
62
  if "dfa" not in raw:
63
  raise SchemaError(f"[{path}] Missing 'dfa' key.")
64
 
 
 
 
 
65
  char_classes: dict[str, set[int]] = {}
66
  for label, definition in raw["char_classes"].items():
67
  if label.startswith("_"):
@@ -83,6 +88,7 @@ class SchemaLoader:
83
  return LanguageSchema(
84
  language=language,
85
  grammar_notation=grammar,
 
86
  char_classes=char_classes,
87
  transitions=transitions,
88
  start_state=start_state,
@@ -192,7 +198,7 @@ class LinguisTrie:
192
  if last_accept_pos > span_start:
193
  emit_end = last_accept_pos
194
  else:
195
- emit_end = pos
196
 
197
  tokens.append(pending_space + text[span_start:emit_end])
198
  pending_space = ""
@@ -211,6 +217,10 @@ class LinguisTrie:
211
  def language(self) -> str:
212
  return self._schema.language
213
 
 
 
 
 
214
  @property
215
  def regex(self) -> str:
216
  return self._schema.get_regex()
@@ -237,12 +247,18 @@ def build_linguis_trie(schema_path: str) -> LinguisTrie:
237
  return _dfa_cache[schema_path]
238
 
239
 
240
- def build_sinhala_linguis_trie() -> LinguisTrie:
241
- return build_linguis_trie(os.path.join(_SCHEMA_DIR, "sinhala.json"))
242
-
243
-
244
- def build_devanagari_linguis_trie() -> LinguisTrie:
245
- return build_linguis_trie(os.path.join(_SCHEMA_DIR, "devanagari.json"))
 
 
 
 
 
 
246
 
247
 
248
  # ---------------------------------------------------------------------------
@@ -256,9 +272,12 @@ if __name__ == "__main__":
256
  print("DFA Tokenizer — self-test")
257
  print("=" * 65)
258
 
259
- # --- Sinhala ---
260
- sinhala_dfa = build_sinhala_linguis_trie()
261
- print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
 
 
 
262
 
263
  sinhala_tests = [
264
  "ශ්‍රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්‍රිවිධ හමුදාව.",
@@ -279,29 +298,30 @@ if __name__ == "__main__":
279
  print("-" * 65)
280
 
281
  # --- Devanagari ---
282
- deva_dfa = build_devanagari_linguis_trie()
283
- print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
284
-
285
- deva_tests = [
286
- "नमस्ते",
287
- "भारत",
288
- "हिन्दी",
289
- "संसकृत",
290
- "क़िला",
291
- "़िंदगी",
292
- "प्रेम",
293
- "वा",
294
- "्रीमन्",
295
- "हिनदुस्तान",
296
- "नमसकार दुनिया",
297
- "मैं ठीहूँ",
298
- "विद्यालय पढ़ाई होती है।",
299
- ]
300
-
301
- for text in deva_tests:
302
- toks = deva_dfa.tokenize(text, leading_space=True)
303
- print(f" Input : {text}")
304
- print(f" Syllables: {toks}")
 
305
  print(f" Count : {len(toks)}")
306
  print("-" * 65)
307
 
 
23
  class LanguageSchema:
24
  language: str
25
  grammar_notation: str
26
+ unicode_blocks: list[tuple[int, int]]
27
+ char_classes: dict[str, set[int]]
28
+ transitions: dict[str, dict[str, Optional[str]]]
29
  start_state: str
30
  accept_states: set[str]
31
  emit_states: set[str]
 
63
  if "dfa" not in raw:
64
  raise SchemaError(f"[{path}] Missing 'dfa' key.")
65
 
66
+ unicode_blocks = []
67
+ for rng in raw.get("unicode_blocks", []):
68
+ unicode_blocks.append((int(rng[0], 16), int(rng[1], 16)))
69
+
70
  char_classes: dict[str, set[int]] = {}
71
  for label, definition in raw["char_classes"].items():
72
  if label.startswith("_"):
 
88
  return LanguageSchema(
89
  language=language,
90
  grammar_notation=grammar,
91
+ unicode_blocks=unicode_blocks,
92
  char_classes=char_classes,
93
  transitions=transitions,
94
  start_state=start_state,
 
198
  if last_accept_pos > span_start:
199
  emit_end = last_accept_pos
200
  else:
201
+ emit_end = span_start + 1 # Fallback: Emit only the first character as an ORPHAN
202
 
203
  tokens.append(pending_space + text[span_start:emit_end])
204
  pending_space = ""
 
217
  def language(self) -> str:
218
  return self._schema.language
219
 
220
+ @property
221
+ def unicode_blocks(self) -> list[tuple[int, int]]:
222
+ return self._schema.unicode_blocks
223
+
224
  @property
225
  def regex(self) -> str:
226
  return self._schema.get_regex()
 
247
  return _dfa_cache[schema_path]
248
 
249
 
250
def load_dfa_map(script_mode: str) -> dict[str, LinguisTrie]:
    """Build a language-name → LinguisTrie map from the schema directory.

    Every ``*.json`` schema under ``_SCHEMA_DIR`` is loaded; ``script_mode``
    of "mixed" or "all" keeps every language, any other value keeps only the
    schema whose language matches it. Loading is best-effort: a schema that
    fails to build is skipped with a warning printed to stdout.
    """
    import glob

    load_everything = script_mode in ("mixed", "all")
    tries: dict[str, LinguisTrie] = {}
    for schema_file in glob.glob(os.path.join(_SCHEMA_DIR, "*.json")):
        try:
            trie = build_linguis_trie(schema_file)
            if load_everything or script_mode == trie.language:
                tries[trie.language] = trie
        except Exception as exc:
            print(f"Warning: Failed to load schema {schema_file}: {exc}")
    return tries
262
 
263
 
264
  # ---------------------------------------------------------------------------
 
272
  print("DFA Tokenizer — self-test")
273
  print("=" * 65)
274
 
275
+ # --- Load All Schemas ---
276
+ dfas = load_dfa_map("all")
277
+ sinhala_dfa = dfas.get("sinhala")
278
+
279
+ if sinhala_dfa:
280
+ print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
281
 
282
  sinhala_tests = [
283
  "ශ්‍රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්‍රිවිධ හමුදාව.",
 
298
  print("-" * 65)
299
 
300
  # --- Devanagari ---
301
+ deva_dfa = dfas.get("devanagari")
302
+ if deva_dfa:
303
+ print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
304
+
305
+ deva_tests = [
306
+ "नमस्",
307
+ "भारत",
308
+ "हिनदी",
309
+ "संस्ृत",
310
+ "़िला",
311
+ "ज़िंदगी",
312
+ "्रेम",
313
+ "वारा",
314
+ "रीमान",
315
+ "हिन्दुस्त",
316
+ "स्ार दुनिया",
317
+ "मूँ",
318
+ "विद्यालय में पढ़ाई होती है।",
319
+ ]
320
+
321
+ for text in deva_tests:
322
+ toks = deva_dfa.tokenize(text, leading_space=True)
323
+ print(f" Input : {text}")
324
+ print(f" Syllables: {toks}")
325
  print(f" Count : {len(toks)}")
326
  print("-" * 65)
327
 
router.py CHANGED
@@ -8,37 +8,12 @@ from __future__ import annotations
8
 
9
  import re
10
  from dataclasses import dataclass
11
- from enum import Enum, auto
12
- from typing import Optional
13
 
14
  import tiktoken
15
-
16
- from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie, LinguisTrie
17
-
18
-
19
  # ---------------------------------------------------------------------------
20
  # Script-block detection
21
  # ---------------------------------------------------------------------------
22
 
23
- class Script(Enum):
24
- LATIN = auto() # ASCII, Latin, digits, punctuation, code, emoji, etc.
25
- SINHALA = auto()
26
- DEVANAGARI = auto()
27
-
28
- _sinhala_dfa = build_sinhala_linguis_trie()
29
- _devanagari_dfa = build_devanagari_linguis_trie()
30
-
31
- _INDIC_PUNCT_CHARS = "\u0964\u0965"
32
-
33
- def _get_char_script(ch: str) -> Optional[Script]:
34
- if '\u0D80' <= ch <= '\u0DFF':
35
- return Script.SINHALA
36
- if '\u0900' <= ch <= '\u097F':
37
- return Script.DEVANAGARI
38
- if ch in _INDIC_PUNCT_CHARS:
39
- return Script.SINHALA # Dandas handled identically by both schemas
40
- return None
41
-
42
  def _is_indic_joiner(ch: str) -> bool:
43
  # True if ZWJ or ZWNJ
44
  return ch in ('\u200C', '\u200D')
@@ -51,7 +26,7 @@ def _is_indic_joiner(ch: str) -> bool:
51
  @dataclass
52
  class TextSegment:
53
  text: str
54
- script: Script
55
  has_leading_space: bool = False # True if a boundary space was absorbed
56
 
57
 
@@ -60,6 +35,25 @@ class TextSegment:
60
  # ---------------------------------------------------------------------------
61
 
62
  class CodeSwitchSegmenter:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def segment(self, text: str) -> list[TextSegment]:
64
  if not text:
65
  return []
@@ -70,40 +64,45 @@ class CodeSwitchSegmenter:
70
 
71
  while pos < n:
72
  ch = text[pos]
73
- ch_script = _get_char_script(ch)
74
 
75
- is_indic_start = (ch_script is not None)
76
 
77
  if not is_indic_start:
78
  # ─── 1. Accumulate Latin block ───
79
  start = pos
80
  while pos < n:
81
  ch2 = text[pos]
82
- if _get_char_script(ch2) is not None:
 
83
  break # Found distinct Indic start
84
  pos += 1
85
 
86
- latin_chunk = text[start:pos]
87
 
88
  has_ls = False
89
- if pos < n and latin_chunk.endswith(" "):
90
- latin_chunk = latin_chunk[:-1]
91
  has_ls = True
92
 
93
- if latin_chunk:
94
- segments.append(TextSegment(text=latin_chunk, script=Script.LATIN))
95
 
96
  if has_ls and pos < n:
97
  indic_start = pos
98
- current_script = _get_char_script(text[pos]) or Script.SINHALA
 
 
99
 
100
  while pos < n:
101
  c = text[pos]
102
- c_script = _get_char_script(c)
103
- if _is_indic_joiner(c):
104
  pos += 1
105
- elif c_script is not None:
106
- if c_script != current_script and c not in _INDIC_PUNCT_CHARS:
 
 
107
  break
108
  pos += 1
109
  else:
@@ -111,21 +110,21 @@ class CodeSwitchSegmenter:
111
 
112
  segments.append(TextSegment(
113
  text=text[indic_start:pos],
114
- script=current_script,
115
  has_leading_space=True
116
  ))
117
  else:
118
  # ─── 2. Accumulate Indic block (no prior Latin with space) ───
119
  indic_start = pos
120
- current_script = ch_script
121
 
122
  while pos < n:
123
  c = text[pos]
124
- c_script = _get_char_script(c)
125
- if _is_indic_joiner(c):
126
  pos += 1
127
- elif c_script is not None:
128
- if c_script != current_script and c not in _INDIC_PUNCT_CHARS:
129
  break
130
  pos += 1
131
  else:
@@ -133,7 +132,7 @@ class CodeSwitchSegmenter:
133
 
134
  segments.append(TextSegment(
135
  text=text[indic_start:pos],
136
- script=current_script,
137
  has_leading_space=False
138
  ))
139
 
@@ -145,71 +144,11 @@ class CodeSwitchSegmenter:
145
  # Router
146
  # ---------------------------------------------------------------------------
147
 
148
- class CodeSwitchRouter:
149
- def __init__(
150
- self,
151
- tiktoken_model: str = "o200k_base",
152
- sinhala_schema: Optional[str] = None,
153
- devanagari_schema: Optional[str] = None,
154
- ):
155
- # Indic DFAs
156
- self._sinhala_dfa: LinguisTrie = build_sinhala_linguis_trie()
157
- self._devanagari_dfa: LinguisTrie = build_devanagari_linguis_trie()
158
-
159
- self._enc = tiktoken.get_encoding(tiktoken_model)
160
-
161
- self._segmenter = CodeSwitchSegmenter()
162
-
163
- # ------------------------------------------------------------------
164
- # Public API
165
- # ------------------------------------------------------------------
166
-
167
- def tokenize_to_strings(self, text: str) -> list[str]:
168
- result: list[str] = []
169
- for seg in self._segmenter.segment(text):
170
- result.extend(self._route_segment_strings(seg))
171
- return result
172
-
173
- def tokenize_to_ids(self, text: str) -> list[int]:
174
- raise NotImplementedError(
175
- "Use WWHOMetaEncoder.encode() for unified IDs. "
176
- "tokenize_to_ids() on the raw router is intentionally not implemented "
177
- "to prevent accidental ID space collision."
178
- )
179
-
180
- return self._enc.encode(text)
181
-
182
- def tiktoken_decode(self, ids: list[int]) -> str:
183
- return self._enc.decode(ids)
184
-
185
- def tiktoken_vocab_size(self) -> int:
186
- return self._enc.n_vocab
187
-
188
- # ------------------------------------------------------------------
189
- # Internal routing
190
- # ------------------------------------------------------------------
191
-
192
- def _route_segment_strings(self, seg: TextSegment) -> list[str]:
193
- if seg.script == Script.LATIN:
194
- ids = self._enc.encode(seg.text)
195
- return [self._enc.decode([i]) for i in ids]
196
-
197
- # Indic — route to appropriate DFA
198
- dfa = (
199
- self._sinhala_dfa
200
- if seg.script == Script.SINHALA
201
- else self._devanagari_dfa
202
- )
203
- return dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
204
-
205
-
206
  # ---------------------------------------------------------------------------
207
  # Self-test
208
  # ---------------------------------------------------------------------------
209
 
210
  if __name__ == "__main__":
211
- router = CodeSwitchRouter()
212
-
213
  test_cases = [
214
  # Pure Sinhala
215
  "ශ්‍රී ලංකාව",
@@ -233,15 +172,14 @@ if __name__ == "__main__":
233
  "AI (Artificial Intelligence) සහ देवनागरी text.",
234
  ]
235
 
236
- print("=" * 70)
237
- print("CodeSwitchRouter self-test")
238
- print("=" * 70)
239
-
240
- seg = CodeSwitchSegmenter()
 
 
241
  for text in test_cases:
242
- tokens = router.tokenize_to_strings(text)
243
  blocks = seg.segment(text)
244
  print(f"\n Input : {text!r}")
245
- print(f" Blocks : {[(b.text, b.script.name, b.has_leading_space) for b in blocks]}")
246
- print(f" Tokens : {tokens}")
247
- print(f" Count : {len(tokens)}")
 
8
 
9
import re
from dataclasses import dataclass
from typing import Optional

import tiktoken
 
 
 
 
13
  # ---------------------------------------------------------------------------
14
  # Script-block detection
15
  # ---------------------------------------------------------------------------
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def _is_indic_joiner(ch: str) -> bool:
18
  # True if ZWJ or ZWNJ
19
  return ch in ('\u200C', '\u200D')
 
26
@dataclass
class TextSegment:
    """A maximal run of text in one script, emitted by CodeSwitchSegmenter."""

    text: str
    language: str  # "latin", "sinhala", "devanagari", etc
    has_leading_space: bool = False  # True if a boundary space was absorbed
31
 
32
 
 
35
  # ---------------------------------------------------------------------------
36
 
37
  class CodeSwitchSegmenter:
38
+ def __init__(self, language_blocks: dict[str, list[tuple[int, int]]] = None):
39
+ """
40
+ language_blocks: maps language name (e.g. 'sinhala') to a list of (start_cp, end_cp) inclusive
41
+ """
42
+ self._ranges: list[tuple[int, int, str]] = []
43
+ if language_blocks:
44
+ for lang, blocks in language_blocks.items():
45
+ for start, end in blocks:
46
+ self._ranges.append((start, end, lang))
47
+
48
+ def _get_char_language(self, ch: str) -> Optional[str]:
49
+ if ch in ('\u200C', '\u200D'):
50
+ return "__joiner__"
51
+ cp = ord(ch)
52
+ for start, end, lang in self._ranges:
53
+ if start <= cp <= end:
54
+ return lang
55
+ return None
56
+
57
  def segment(self, text: str) -> list[TextSegment]:
58
  if not text:
59
  return []
 
64
 
65
  while pos < n:
66
  ch = text[pos]
67
+ ch_lang = self._get_char_language(ch)
68
 
69
+ is_indic_start = (ch_lang is not None)
70
 
71
  if not is_indic_start:
72
  # ─── 1. Accumulate Latin block ───
73
  start = pos
74
  while pos < n:
75
  ch2 = text[pos]
76
+ lang2 = self._get_char_language(ch2)
77
+ if lang2 is not None and lang2 != "__joiner__":
78
  break # Found distinct Indic start
79
  pos += 1
80
 
81
+ latino_only = text[start:pos]
82
 
83
  has_ls = False
84
+ if pos < n and latino_only.endswith(" "):
85
+ latino_only = latino_only[:-1]
86
  has_ls = True
87
 
88
+ if latino_only:
89
+ segments.append(TextSegment(text=latino_only, language="latin"))
90
 
91
  if has_ls and pos < n:
92
  indic_start = pos
93
+ current_lang = self._get_char_language(text[pos])
94
+ if current_lang == "__joiner__" or current_lang is None:
95
+ current_lang = "__unknown__" # fallback
96
 
97
  while pos < n:
98
  c = text[pos]
99
+ c_lang = self._get_char_language(c)
100
+ if c_lang == "__joiner__":
101
  pos += 1
102
+ elif c_lang is not None:
103
+ if current_lang == "__unknown__":
104
+ current_lang = c_lang # adapt
105
+ elif c_lang != current_lang:
106
  break
107
  pos += 1
108
  else:
 
110
 
111
  segments.append(TextSegment(
112
  text=text[indic_start:pos],
113
+ language=current_lang,
114
  has_leading_space=True
115
  ))
116
  else:
117
  # ─── 2. Accumulate Indic block (no prior Latin with space) ───
118
  indic_start = pos
119
+ current_lang = ch_lang
120
 
121
  while pos < n:
122
  c = text[pos]
123
+ c_lang = self._get_char_language(c)
124
+ if c_lang == "__joiner__":
125
  pos += 1
126
+ elif c_lang is not None:
127
+ if c_lang != current_lang:
128
  break
129
  pos += 1
130
  else:
 
132
 
133
  segments.append(TextSegment(
134
  text=text[indic_start:pos],
135
+ language=current_lang,
136
  has_leading_space=False
137
  ))
138
 
 
144
  # Router
145
  # ---------------------------------------------------------------------------
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # ---------------------------------------------------------------------------
148
  # Self-test
149
  # ---------------------------------------------------------------------------
150
 
151
  if __name__ == "__main__":
 
 
152
  test_cases = [
153
  # Pure Sinhala
154
  "ශ්‍රී ලංකාව",
 
172
  "AI (Artificial Intelligence) සහ देवनागरी text.",
173
  ]
174
 
175
+ # _test segmenter independently
176
+ language_blocks = {
177
+ "sinhala": [(0x0d80, 0x0dff)],
178
+ "devanagari": [(0x0900, 0x097f)]
179
+ }
180
+ seg = CodeSwitchSegmenter(language_blocks)
181
+
182
  for text in test_cases:
 
183
  blocks = seg.segment(text)
184
  print(f"\n Input : {text!r}")
185
+ print(f" Blocks : {[(b.text, b.language, b.has_leading_space) for b in blocks]}")
 
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff