thekusaldarshana committed on
Commit
e59ea28
·
1 Parent(s): fa9c240
Files changed (6) hide show
  1. EVALUATION.md +7 -7
  2. encoder.py +80 -41
  3. linguis_trie.py +55 -35
  4. router.py +53 -115
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
EVALUATION.md CHANGED
@@ -153,15 +153,15 @@ Evaluating 1,499,950 sentences...
153
  ====== Sinhala Results ======
154
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
155
  ----------------------------------------------------------------------
156
- SGPE | 6,665,177 | 1.276 | 4.83 | -
157
- OpenAI (o200k_base) | 17,360,196 | 3.324 | 1.85 | 61.6%
158
- Llama 4 Scout | 18,157,707 | 3.476 | 1.77 | 63.3%
159
- DeepSeek V3 | 29,152,698 | 5.581 | 1.10 | 77.1%
160
 
161
  ====== Hindi Results ======
162
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
163
  ----------------------------------------------------------------------
164
- SGPE | 13,432,763 | 1.181 | 4.29 | -
165
  OpenAI (o200k_base) | 18,394,075 | 1.617 | 3.13 | 27.0%
166
  Llama 4 Scout | 19,566,121 | 1.720 | 2.94 | 31.3%
167
  DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
@@ -169,7 +169,7 @@ DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
169
  ====== English Results ======
170
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
171
  ----------------------------------------------------------------------
172
- SGPE | 7,240,151 | 1.330 | 4.46 | -
173
  OpenAI (o200k_base) | 7,420,527 | 1.364 | 4.35 | 2.4%
174
  Llama 4 Scout | 7,512,843 | 1.381 | 4.30 | 3.6%
175
  DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
@@ -177,7 +177,7 @@ DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
177
  ========================= OVERALL Results =========================
178
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
179
  ----------------------------------------------------------------------
180
- SGPE | 27,338,091 | 1.241 | 4.47 | -
181
  OpenAI (o200k_base) | 43,174,798 | 1.959 | 2.83 | 36.7%
182
  Llama 4 Scout | 45,236,671 | 2.053 | 2.70 | 39.6%
183
  DeepSeek V3 | 68,739,586 | 3.119 | 1.78 | 60.2%
 
153
  ====== Sinhala Results ======
154
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
155
  ----------------------------------------------------------------------
156
+ SGPE | 6,654,288 | 1.274 | 4.83 | -
157
+ OpenAI (o200k_base) | 17,360,196 | 3.324 | 1.85 | 61.7%
158
+ Llama 4 Scout | 18,157,707 | 3.476 | 1.77 | 63.4%
159
+ DeepSeek V3 | 29,152,698 | 5.581 | 1.10 | 77.2%
160
 
161
  ====== Hindi Results ======
162
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
163
  ----------------------------------------------------------------------
164
+ SGPE | 13,433,554 | 1.181 | 4.29 | -
165
  OpenAI (o200k_base) | 18,394,075 | 1.617 | 3.13 | 27.0%
166
  Llama 4 Scout | 19,566,121 | 1.720 | 2.94 | 31.3%
167
  DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
 
169
  ====== English Results ======
170
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
171
  ----------------------------------------------------------------------
172
+ SGPE | 7,240,147 | 1.330 | 4.46 | -
173
  OpenAI (o200k_base) | 7,420,527 | 1.364 | 4.35 | 2.4%
174
  Llama 4 Scout | 7,512,843 | 1.381 | 4.30 | 3.6%
175
  DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
 
177
  ========================= OVERALL Results =========================
178
  Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
179
  ----------------------------------------------------------------------
180
+ SGPE | 27,327,989 | 1.240 | 4.47 | -
181
  OpenAI (o200k_base) | 43,174,798 | 1.959 | 2.83 | 36.7%
182
  Llama 4 Scout | 45,236,671 | 2.053 | 2.70 | 39.6%
183
  DeepSeek V3 | 68,739,586 | 3.119 | 1.78 | 60.2%
encoder.py CHANGED
@@ -10,8 +10,35 @@ import argparse
10
  import json
11
  from typing import Optional
12
 
13
- from linguis_trie import LinguisTrie, build_sinhala_linguis_trie
14
- from gpe_trainer import segment_into_words, _is_boundary_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SGPEEncoder:
17
 
@@ -22,9 +49,17 @@ class SGPEEncoder:
22
  self.vocab: dict[str, int] = data["vocab"]
23
  self.merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
24
  self.special_tokens: list[str] = data["special_tokens"]
25
- self.tokenizer = build_sinhala_linguis_trie()
26
- self.unk_id = self.vocab.get("[UNK]", 1)
27
  self.leading_space: bool = data.get("leading_space", False)
 
 
 
 
 
 
 
 
 
 
28
 
29
  self._merge_priority: dict[tuple[str, str], int] = {
30
  (a, b): rank for rank, (a, b) in enumerate(self.merges)
@@ -55,19 +90,24 @@ class SGPEEncoder:
55
  return tokens
56
 
57
  def tokenize(self, text: str) -> list[str]:
58
- syllables = self.layer1_tokenize(text)
59
- words = segment_into_words(syllables)
60
- result: list[str] = []
61
- for word_tokens in words:
62
- if len(word_tokens) == 1 and _is_boundary_token(word_tokens[0]):
63
- result.append(word_tokens[0])
64
- continue
65
- cleaned = [t if t in self.vocab else "[UNK]" for t in word_tokens]
66
- result.extend(self._apply_merges_to_word(cleaned))
67
- return result
68
-
69
- def layer1_tokenize(self, text: str) -> list[str]:
70
- return self.tokenizer.tokenize(text, leading_space=self.leading_space)
 
 
 
 
 
71
 
72
  def decode(self, ids: list[int]) -> str:
73
  id_to_token = {v: k for k, v in self.vocab.items()}
@@ -155,15 +195,15 @@ class WWHOMetaEncoder:
155
  self._meta = MetaVocab(sgpe_vocab, self._tik.n_vocab)
156
  self._space_id: int = self._meta._sgpe_offset[" "]
157
 
158
- # Router
159
- from router import CodeSwitchSegmenter, Script
160
- self._segmenter = CodeSwitchSegmenter()
161
- self._Script = Script
162
-
163
  # Indic LinguisTries
164
- from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie
165
- self._sinhala_dfa = build_sinhala_linguis_trie()
166
- self._devanagari_dfa = build_devanagari_linguis_trie()
 
 
 
 
 
167
 
168
  # ------------------------------------------------------------------
169
  # Public API
@@ -184,18 +224,17 @@ class WWHOMetaEncoder:
184
  def encode(self, text: str) -> list[int]:
185
  ids: list[int] = []
186
  for seg in self._segmenter.segment(text):
187
- if seg.script == self._Script.LATIN:
188
  ids.extend(self._tik.encode(seg.text))
189
  else:
190
- dfa = (
191
- self._sinhala_dfa
192
- if seg.script == self._Script.SINHALA
193
- else self._devanagari_dfa
194
- )
195
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
196
- words = segment_into_words(syllables)
197
  for word_toks in words:
198
- if len(word_toks) == 1 and _is_boundary_token(word_toks[0]):
199
  ids.extend(self._tik.encode(word_toks[0]))
200
  continue
201
  merged = self._apply_merges(word_toks)
@@ -226,19 +265,19 @@ class WWHOMetaEncoder:
226
  def tokenize(self, text: str) -> list[str]:
227
  tokens: list[str] = []
228
  for seg in self._segmenter.segment(text):
229
- if seg.script == self._Script.LATIN:
230
  ids = self._tik.encode(seg.text)
231
  tokens.extend(self._tik.decode([i]) for i in ids)
232
  else:
233
- dfa = (
234
- self._sinhala_dfa
235
- if seg.script == self._Script.SINHALA
236
- else self._devanagari_dfa
237
- )
238
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
239
- words = segment_into_words(syllables)
240
  for word_toks in words:
241
- if len(word_toks) == 1 and _is_boundary_token(word_toks[0]):
242
  ids = self._tik.encode(word_toks[0])
243
  tokens.extend(self._tik.decode([i]) for i in ids)
244
  continue
 
10
  import json
11
  from typing import Optional
12
 
13
+ from linguis_trie import LinguisTrie
14
+
15
+ def _is_boundary_token(token: str, segmenter) -> bool:
16
+ for ch in token:
17
+ if segmenter:
18
+ lang = segmenter._get_char_language(ch)
19
+ if lang is not None and lang != "latin":
20
+ return False
21
+ return True
22
+
23
+ def segment_into_words(syllables: list[str], segmenter) -> list[list[str]]:
24
+ words: list[list[str]] = []
25
+ current: list[str] = []
26
+
27
+ for tok in syllables:
28
+ if _is_boundary_token(tok, segmenter):
29
+ if current:
30
+ words.append(current)
31
+ current = []
32
+ words.append([tok])
33
+ else:
34
+ if tok[0] in (' ', '\t', '\n', '\r') and current:
35
+ words.append(current)
36
+ current = []
37
+ current.append(tok)
38
+
39
+ if current:
40
+ words.append(current)
41
+ return words
42
 
43
  class SGPEEncoder:
44
 
 
49
  self.vocab: dict[str, int] = data["vocab"]
50
  self.merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
51
  self.special_tokens: list[str] = data["special_tokens"]
 
 
52
  self.leading_space: bool = data.get("leading_space", False)
53
+
54
+ script_mode = data.get("script_mode", "mixed")
55
+
56
+ from linguis_trie import load_dfa_map
57
+ from router import CodeSwitchSegmenter
58
+
59
+ self._dfa_map = load_dfa_map(script_mode)
60
+
61
+ language_blocks = {lang: dfa.unicode_blocks for lang, dfa in self._dfa_map.items()}
62
+ self._segmenter = CodeSwitchSegmenter(language_blocks)
63
 
64
  self._merge_priority: dict[tuple[str, str], int] = {
65
  (a, b): rank for rank, (a, b) in enumerate(self.merges)
 
90
  return tokens
91
 
92
  def tokenize(self, text: str) -> list[str]:
93
+ tokens: list[str] = []
94
+ for seg in self._segmenter.segment(text):
95
+ if seg.language == "latin":
96
+ tokens.append(seg.text)
97
+ else:
98
+ dfa = self._dfa_map.get(seg.language)
99
+ if not dfa:
100
+ tokens.append(seg.text)
101
+ continue
102
+ syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
103
+ words = segment_into_words(syllables, self._segmenter)
104
+ for word_toks in words:
105
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
106
+ tokens.append(word_toks[0])
107
+ continue
108
+ cleaned = [t if t in self.vocab else "[UNK]" for t in word_toks]
109
+ tokens.extend(self._apply_merges_to_word(cleaned))
110
+ return tokens
111
 
112
  def decode(self, ids: list[int]) -> str:
113
  id_to_token = {v: k for k, v in self.vocab.items()}
 
195
  self._meta = MetaVocab(sgpe_vocab, self._tik.n_vocab)
196
  self._space_id: int = self._meta._sgpe_offset[" "]
197
 
 
 
 
 
 
198
  # Indic LinguisTries
199
+ from linguis_trie import load_dfa_map, LinguisTrie
200
+
201
+ self._dfa_map: dict[str, LinguisTrie] = load_dfa_map("mixed")
202
+
203
+ # Router Segmenter
204
+ from router import CodeSwitchSegmenter
205
+ language_blocks = {lang: dfa.unicode_blocks for lang, dfa in self._dfa_map.items()}
206
+ self._segmenter = CodeSwitchSegmenter(language_blocks)
207
 
208
  # ------------------------------------------------------------------
209
  # Public API
 
224
  def encode(self, text: str) -> list[int]:
225
  ids: list[int] = []
226
  for seg in self._segmenter.segment(text):
227
+ if seg.language == "latin":
228
  ids.extend(self._tik.encode(seg.text))
229
  else:
230
+ dfa = self._dfa_map.get(seg.language)
231
+ if not dfa:
232
+ ids.extend(self._tik.encode(seg.text))
233
+ continue
 
234
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
235
+ words = segment_into_words(syllables, self._segmenter)
236
  for word_toks in words:
237
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
238
  ids.extend(self._tik.encode(word_toks[0]))
239
  continue
240
  merged = self._apply_merges(word_toks)
 
265
  def tokenize(self, text: str) -> list[str]:
266
  tokens: list[str] = []
267
  for seg in self._segmenter.segment(text):
268
+ if seg.language == "latin":
269
  ids = self._tik.encode(seg.text)
270
  tokens.extend(self._tik.decode([i]) for i in ids)
271
  else:
272
+ dfa = self._dfa_map.get(seg.language)
273
+ if not dfa:
274
+ ids = self._tik.encode(seg.text)
275
+ tokens.extend(self._tik.decode([i]) for i in ids)
276
+ continue
277
  syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
278
+ words = segment_into_words(syllables, self._segmenter)
279
  for word_toks in words:
280
+ if len(word_toks) == 1 and _is_boundary_token(word_toks[0], self._segmenter):
281
  ids = self._tik.encode(word_toks[0])
282
  tokens.extend(self._tik.decode([i]) for i in ids)
283
  continue
linguis_trie.py CHANGED
@@ -23,8 +23,9 @@ class SchemaError(ValueError):
23
  class LanguageSchema:
24
  language: str
25
  grammar_notation: str
26
- char_classes: dict[str, set[int]] # class-label → set of codepoints
27
- transitions: dict[str, dict[str, Optional[str]]] # state → (class → next_state | None)
 
28
  start_state: str
29
  accept_states: set[str]
30
  emit_states: set[str]
@@ -62,6 +63,10 @@ class SchemaLoader:
62
  if "dfa" not in raw:
63
  raise SchemaError(f"[{path}] Missing 'dfa' key.")
64
 
 
 
 
 
65
  char_classes: dict[str, set[int]] = {}
66
  for label, definition in raw["char_classes"].items():
67
  if label.startswith("_"):
@@ -83,6 +88,7 @@ class SchemaLoader:
83
  return LanguageSchema(
84
  language=language,
85
  grammar_notation=grammar,
 
86
  char_classes=char_classes,
87
  transitions=transitions,
88
  start_state=start_state,
@@ -192,7 +198,7 @@ class LinguisTrie:
192
  if last_accept_pos > span_start:
193
  emit_end = last_accept_pos
194
  else:
195
- emit_end = pos
196
 
197
  tokens.append(pending_space + text[span_start:emit_end])
198
  pending_space = ""
@@ -211,6 +217,10 @@ class LinguisTrie:
211
  def language(self) -> str:
212
  return self._schema.language
213
 
 
 
 
 
214
  @property
215
  def regex(self) -> str:
216
  return self._schema.get_regex()
@@ -237,12 +247,18 @@ def build_linguis_trie(schema_path: str) -> LinguisTrie:
237
  return _dfa_cache[schema_path]
238
 
239
 
240
- def build_sinhala_linguis_trie() -> LinguisTrie:
241
- return build_linguis_trie(os.path.join(_SCHEMA_DIR, "sinhala.json"))
242
-
243
-
244
- def build_devanagari_linguis_trie() -> LinguisTrie:
245
- return build_linguis_trie(os.path.join(_SCHEMA_DIR, "devanagari.json"))
 
 
 
 
 
 
246
 
247
 
248
  # ---------------------------------------------------------------------------
@@ -256,9 +272,12 @@ if __name__ == "__main__":
256
  print("DFA Tokenizer — self-test")
257
  print("=" * 65)
258
 
259
- # --- Sinhala ---
260
- sinhala_dfa = build_sinhala_linguis_trie()
261
- print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
 
 
 
262
 
263
  sinhala_tests = [
264
  "ශ්‍රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්‍රිවිධ හමුදාව.",
@@ -279,29 +298,30 @@ if __name__ == "__main__":
279
  print("-" * 65)
280
 
281
  # --- Devanagari ---
282
- deva_dfa = build_devanagari_linguis_trie()
283
- print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
284
-
285
- deva_tests = [
286
- "नमस्ते",
287
- "भारत",
288
- "हिन्दी",
289
- "संसकृत",
290
- "क़िला",
291
- "़िंदगी",
292
- "प्रेम",
293
- "वा",
294
- "्रीमन्",
295
- "हिनदुस्तान",
296
- "नमसकार दुनिया",
297
- "मैं ठीहूँ",
298
- "विद्यालय पढ़ाई होती है।",
299
- ]
300
-
301
- for text in deva_tests:
302
- toks = deva_dfa.tokenize(text, leading_space=True)
303
- print(f" Input : {text}")
304
- print(f" Syllables: {toks}")
 
305
  print(f" Count : {len(toks)}")
306
  print("-" * 65)
307
 
 
23
  class LanguageSchema:
24
  language: str
25
  grammar_notation: str
26
+ unicode_blocks: list[tuple[int, int]]
27
+ char_classes: dict[str, set[int]]
28
+ transitions: dict[str, dict[str, Optional[str]]]
29
  start_state: str
30
  accept_states: set[str]
31
  emit_states: set[str]
 
63
  if "dfa" not in raw:
64
  raise SchemaError(f"[{path}] Missing 'dfa' key.")
65
 
66
+ unicode_blocks = []
67
+ for rng in raw.get("unicode_blocks", []):
68
+ unicode_blocks.append((int(rng[0], 16), int(rng[1], 16)))
69
+
70
  char_classes: dict[str, set[int]] = {}
71
  for label, definition in raw["char_classes"].items():
72
  if label.startswith("_"):
 
88
  return LanguageSchema(
89
  language=language,
90
  grammar_notation=grammar,
91
+ unicode_blocks=unicode_blocks,
92
  char_classes=char_classes,
93
  transitions=transitions,
94
  start_state=start_state,
 
198
  if last_accept_pos > span_start:
199
  emit_end = last_accept_pos
200
  else:
201
+ emit_end = span_start + 1 # Fallback: Emit only the first character as an ORPHAN
202
 
203
  tokens.append(pending_space + text[span_start:emit_end])
204
  pending_space = ""
 
217
  def language(self) -> str:
218
  return self._schema.language
219
 
220
+ @property
221
+ def unicode_blocks(self) -> list[tuple[int, int]]:
222
+ return self._schema.unicode_blocks
223
+
224
  @property
225
  def regex(self) -> str:
226
  return self._schema.get_regex()
 
247
  return _dfa_cache[schema_path]
248
 
249
 
250
def load_dfa_map(script_mode: str) -> dict[str, LinguisTrie]:
    """Build a language-name → LinguisTrie map from the schema directory.

    Every ``*.json`` schema under ``_SCHEMA_DIR`` is loaded; ``script_mode``
    of "mixed" or "all" keeps every language, any other value keeps only the
    schema whose language matches it. Loading is best-effort: a schema that
    fails to build is skipped with a warning printed to stdout.
    """
    import glob

    load_everything = script_mode in ("mixed", "all")
    tries: dict[str, LinguisTrie] = {}
    for schema_file in glob.glob(os.path.join(_SCHEMA_DIR, "*.json")):
        try:
            trie = build_linguis_trie(schema_file)
            if load_everything or script_mode == trie.language:
                tries[trie.language] = trie
        except Exception as exc:
            print(f"Warning: Failed to load schema {schema_file}: {exc}")
    return tries
262
 
263
 
264
  # ---------------------------------------------------------------------------
 
272
  print("DFA Tokenizer — self-test")
273
  print("=" * 65)
274
 
275
+ # --- Load All Schemas ---
276
+ dfas = load_dfa_map("all")
277
+ sinhala_dfa = dfas.get("sinhala")
278
+
279
+ if sinhala_dfa:
280
+ print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
281
 
282
  sinhala_tests = [
283
  "ශ්‍රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්‍රිවිධ හමුදාව.",
 
298
  print("-" * 65)
299
 
300
  # --- Devanagari ---
301
+ deva_dfa = dfas.get("devanagari")
302
+ if deva_dfa:
303
+ print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
304
+
305
+ deva_tests = [
306
+ "नमस्",
307
+ "भारत",
308
+ "हिनदी",
309
+ "संस्ृत",
310
+ "़िला",
311
+ "ज़िंदगी",
312
+ "्रेम",
313
+ "वारा",
314
+ "रीमान",
315
+ "हिन्दुस्त",
316
+ "स्ार दुनिया",
317
+ "मूँ",
318
+ "विद्यालय में पढ़ाई होती है।",
319
+ ]
320
+
321
+ for text in deva_tests:
322
+ toks = deva_dfa.tokenize(text, leading_space=True)
323
+ print(f" Input : {text}")
324
+ print(f" Syllables: {toks}")
325
  print(f" Count : {len(toks)}")
326
  print("-" * 65)
327
 
router.py CHANGED
@@ -8,37 +8,12 @@ from __future__ import annotations
8
 
9
  import re
10
  from dataclasses import dataclass
11
- from enum import Enum, auto
12
- from typing import Optional
13
 
14
  import tiktoken
15
-
16
- from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie, LinguisTrie
17
-
18
-
19
  # ---------------------------------------------------------------------------
20
  # Script-block detection
21
  # ---------------------------------------------------------------------------
22
 
23
- class Script(Enum):
24
- LATIN = auto() # ASCII, Latin, digits, punctuation, code, emoji, etc.
25
- SINHALA = auto()
26
- DEVANAGARI = auto()
27
-
28
- _sinhala_dfa = build_sinhala_linguis_trie()
29
- _devanagari_dfa = build_devanagari_linguis_trie()
30
-
31
- _INDIC_PUNCT_CHARS = "\u0964\u0965"
32
-
33
- def _get_char_script(ch: str) -> Optional[Script]:
34
- if '\u0D80' <= ch <= '\u0DFF':
35
- return Script.SINHALA
36
- if '\u0900' <= ch <= '\u097F':
37
- return Script.DEVANAGARI
38
- if ch in _INDIC_PUNCT_CHARS:
39
- return Script.SINHALA # Dandas handled identically by both schemas
40
- return None
41
-
42
  def _is_indic_joiner(ch: str) -> bool:
43
  # True if ZWJ or ZWNJ
44
  return ch in ('\u200C', '\u200D')
@@ -51,7 +26,7 @@ def _is_indic_joiner(ch: str) -> bool:
51
  @dataclass
52
  class TextSegment:
53
  text: str
54
- script: Script
55
  has_leading_space: bool = False # True if a boundary space was absorbed
56
 
57
 
@@ -60,6 +35,25 @@ class TextSegment:
60
  # ---------------------------------------------------------------------------
61
 
62
  class CodeSwitchSegmenter:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def segment(self, text: str) -> list[TextSegment]:
64
  if not text:
65
  return []
@@ -70,40 +64,45 @@ class CodeSwitchSegmenter:
70
 
71
  while pos < n:
72
  ch = text[pos]
73
- ch_script = _get_char_script(ch)
74
 
75
- is_indic_start = (ch_script is not None)
76
 
77
  if not is_indic_start:
78
  # ─── 1. Accumulate Latin block ───
79
  start = pos
80
  while pos < n:
81
  ch2 = text[pos]
82
- if _get_char_script(ch2) is not None:
 
83
  break # Found distinct Indic start
84
  pos += 1
85
 
86
- latin_chunk = text[start:pos]
87
 
88
  has_ls = False
89
- if pos < n and latin_chunk.endswith(" "):
90
- latin_chunk = latin_chunk[:-1]
91
  has_ls = True
92
 
93
- if latin_chunk:
94
- segments.append(TextSegment(text=latin_chunk, script=Script.LATIN))
95
 
96
  if has_ls and pos < n:
97
  indic_start = pos
98
- current_script = _get_char_script(text[pos]) or Script.SINHALA
 
 
99
 
100
  while pos < n:
101
  c = text[pos]
102
- c_script = _get_char_script(c)
103
- if _is_indic_joiner(c):
104
  pos += 1
105
- elif c_script is not None:
106
- if c_script != current_script and c not in _INDIC_PUNCT_CHARS:
 
 
107
  break
108
  pos += 1
109
  else:
@@ -111,21 +110,21 @@ class CodeSwitchSegmenter:
111
 
112
  segments.append(TextSegment(
113
  text=text[indic_start:pos],
114
- script=current_script,
115
  has_leading_space=True
116
  ))
117
  else:
118
  # ─── 2. Accumulate Indic block (no prior Latin with space) ───
119
  indic_start = pos
120
- current_script = ch_script
121
 
122
  while pos < n:
123
  c = text[pos]
124
- c_script = _get_char_script(c)
125
- if _is_indic_joiner(c):
126
  pos += 1
127
- elif c_script is not None:
128
- if c_script != current_script and c not in _INDIC_PUNCT_CHARS:
129
  break
130
  pos += 1
131
  else:
@@ -133,7 +132,7 @@ class CodeSwitchSegmenter:
133
 
134
  segments.append(TextSegment(
135
  text=text[indic_start:pos],
136
- script=current_script,
137
  has_leading_space=False
138
  ))
139
 
@@ -145,71 +144,11 @@ class CodeSwitchSegmenter:
145
  # Router
146
  # ---------------------------------------------------------------------------
147
 
148
- class CodeSwitchRouter:
149
- def __init__(
150
- self,
151
- tiktoken_model: str = "o200k_base",
152
- sinhala_schema: Optional[str] = None,
153
- devanagari_schema: Optional[str] = None,
154
- ):
155
- # Indic DFAs
156
- self._sinhala_dfa: LinguisTrie = build_sinhala_linguis_trie()
157
- self._devanagari_dfa: LinguisTrie = build_devanagari_linguis_trie()
158
-
159
- self._enc = tiktoken.get_encoding(tiktoken_model)
160
-
161
- self._segmenter = CodeSwitchSegmenter()
162
-
163
- # ------------------------------------------------------------------
164
- # Public API
165
- # ------------------------------------------------------------------
166
-
167
- def tokenize_to_strings(self, text: str) -> list[str]:
168
- result: list[str] = []
169
- for seg in self._segmenter.segment(text):
170
- result.extend(self._route_segment_strings(seg))
171
- return result
172
-
173
- def tokenize_to_ids(self, text: str) -> list[int]:
174
- raise NotImplementedError(
175
- "Use WWHOMetaEncoder.encode() for unified IDs. "
176
- "tokenize_to_ids() on the raw router is intentionally not implemented "
177
- "to prevent accidental ID space collision."
178
- )
179
-
180
- return self._enc.encode(text)
181
-
182
- def tiktoken_decode(self, ids: list[int]) -> str:
183
- return self._enc.decode(ids)
184
-
185
- def tiktoken_vocab_size(self) -> int:
186
- return self._enc.n_vocab
187
-
188
- # ------------------------------------------------------------------
189
- # Internal routing
190
- # ------------------------------------------------------------------
191
-
192
- def _route_segment_strings(self, seg: TextSegment) -> list[str]:
193
- if seg.script == Script.LATIN:
194
- ids = self._enc.encode(seg.text)
195
- return [self._enc.decode([i]) for i in ids]
196
-
197
- # Indic — route to appropriate DFA
198
- dfa = (
199
- self._sinhala_dfa
200
- if seg.script == Script.SINHALA
201
- else self._devanagari_dfa
202
- )
203
- return dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
204
-
205
-
206
  # ---------------------------------------------------------------------------
207
  # Self-test
208
  # ---------------------------------------------------------------------------
209
 
210
  if __name__ == "__main__":
211
- router = CodeSwitchRouter()
212
-
213
  test_cases = [
214
  # Pure Sinhala
215
  "ශ්‍රී ලංකාව",
@@ -233,15 +172,14 @@ if __name__ == "__main__":
233
  "AI (Artificial Intelligence) සහ देवनागरी text.",
234
  ]
235
 
236
- print("=" * 70)
237
- print("CodeSwitchRouter self-test")
238
- print("=" * 70)
239
-
240
- seg = CodeSwitchSegmenter()
 
 
241
  for text in test_cases:
242
- tokens = router.tokenize_to_strings(text)
243
  blocks = seg.segment(text)
244
  print(f"\n Input : {text!r}")
245
- print(f" Blocks : {[(b.text, b.script.name, b.has_leading_space) for b in blocks]}")
246
- print(f" Tokens : {tokens}")
247
- print(f" Count : {len(tokens)}")
 
8
 
9
import re
from dataclasses import dataclass
from typing import Optional

import tiktoken
 
 
 
 
13
  # ---------------------------------------------------------------------------
14
  # Script-block detection
15
  # ---------------------------------------------------------------------------
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def _is_indic_joiner(ch: str) -> bool:
18
  # True if ZWJ or ZWNJ
19
  return ch in ('\u200C', '\u200D')
 
26
@dataclass
class TextSegment:
    """A maximal run of text in one script, emitted by CodeSwitchSegmenter."""

    text: str
    language: str  # "latin", "sinhala", "devanagari", etc
    has_leading_space: bool = False  # True if a boundary space was absorbed
31
 
32
 
 
35
  # ---------------------------------------------------------------------------
36
 
37
  class CodeSwitchSegmenter:
38
+ def __init__(self, language_blocks: dict[str, list[tuple[int, int]]] = None):
39
+ """
40
+ language_blocks: maps language name (e.g. 'sinhala') to a list of (start_cp, end_cp) inclusive
41
+ """
42
+ self._ranges: list[tuple[int, int, str]] = []
43
+ if language_blocks:
44
+ for lang, blocks in language_blocks.items():
45
+ for start, end in blocks:
46
+ self._ranges.append((start, end, lang))
47
+
48
+ def _get_char_language(self, ch: str) -> Optional[str]:
49
+ if ch in ('\u200C', '\u200D'):
50
+ return "__joiner__"
51
+ cp = ord(ch)
52
+ for start, end, lang in self._ranges:
53
+ if start <= cp <= end:
54
+ return lang
55
+ return None
56
+
57
  def segment(self, text: str) -> list[TextSegment]:
58
  if not text:
59
  return []
 
64
 
65
  while pos < n:
66
  ch = text[pos]
67
+ ch_lang = self._get_char_language(ch)
68
 
69
+ is_indic_start = (ch_lang is not None)
70
 
71
  if not is_indic_start:
72
  # ─── 1. Accumulate Latin block ───
73
  start = pos
74
  while pos < n:
75
  ch2 = text[pos]
76
+ lang2 = self._get_char_language(ch2)
77
+ if lang2 is not None and lang2 != "__joiner__":
78
  break # Found distinct Indic start
79
  pos += 1
80
 
81
+ latino_only = text[start:pos]
82
 
83
  has_ls = False
84
+ if pos < n and latino_only.endswith(" "):
85
+ latino_only = latino_only[:-1]
86
  has_ls = True
87
 
88
+ if latino_only:
89
+ segments.append(TextSegment(text=latino_only, language="latin"))
90
 
91
  if has_ls and pos < n:
92
  indic_start = pos
93
+ current_lang = self._get_char_language(text[pos])
94
+ if current_lang == "__joiner__" or current_lang is None:
95
+ current_lang = "__unknown__" # fallback
96
 
97
  while pos < n:
98
  c = text[pos]
99
+ c_lang = self._get_char_language(c)
100
+ if c_lang == "__joiner__":
101
  pos += 1
102
+ elif c_lang is not None:
103
+ if current_lang == "__unknown__":
104
+ current_lang = c_lang # adapt
105
+ elif c_lang != current_lang:
106
  break
107
  pos += 1
108
  else:
 
110
 
111
  segments.append(TextSegment(
112
  text=text[indic_start:pos],
113
+ language=current_lang,
114
  has_leading_space=True
115
  ))
116
  else:
117
  # ─── 2. Accumulate Indic block (no prior Latin with space) ───
118
  indic_start = pos
119
+ current_lang = ch_lang
120
 
121
  while pos < n:
122
  c = text[pos]
123
+ c_lang = self._get_char_language(c)
124
+ if c_lang == "__joiner__":
125
  pos += 1
126
+ elif c_lang is not None:
127
+ if c_lang != current_lang:
128
  break
129
  pos += 1
130
  else:
 
132
 
133
  segments.append(TextSegment(
134
  text=text[indic_start:pos],
135
+ language=current_lang,
136
  has_leading_space=False
137
  ))
138
 
 
144
  # Router
145
  # ---------------------------------------------------------------------------
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # ---------------------------------------------------------------------------
148
  # Self-test
149
  # ---------------------------------------------------------------------------
150
 
151
  if __name__ == "__main__":
 
 
152
  test_cases = [
153
  # Pure Sinhala
154
  "ශ්‍රී ලංකාව",
 
172
  "AI (Artificial Intelligence) සහ देवनागरी text.",
173
  ]
174
 
175
+ # _test segmenter independently
176
+ language_blocks = {
177
+ "sinhala": [(0x0d80, 0x0dff)],
178
+ "devanagari": [(0x0900, 0x097f)]
179
+ }
180
+ seg = CodeSwitchSegmenter(language_blocks)
181
+
182
  for text in test_cases:
 
183
  blocks = seg.segment(text)
184
  print(f"\n Input : {text!r}")
185
+ print(f" Blocks : {[(b.text, b.language, b.has_leading_space) for b in blocks]}")
 
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff