Migrate to zemberek-python, remove JVM dependency and 31MB JAR, apply O(N^2) init fix

Browse files

Files changed (11) hide show

README.md +2 -10
nedo_turkish_tokenizer/_compound.py +3 -4
nedo_turkish_tokenizer/_context_aware.py +12 -11
nedo_turkish_tokenizer/_java_check.py +0 -57
nedo_turkish_tokenizer/_preprocessor.py +3 -2
nedo_turkish_tokenizer/_root_validator.py +47 -48
nedo_turkish_tokenizer/data/zemberek-full.jar +0 -3
nedo_turkish_tokenizer/tokenizer.py +79 -2
pyproject.toml +2 -2
test_lattice.py +72 -0
tokenizer_config.json +2 -2

README.md CHANGED Viewed

@@ -26,7 +26,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
 | **Language** | Turkish (`tr`) |
 | **License** | MIT |
 | **Benchmark** | TR-MMLU **95.45%** (world record) |
-| **Morphological engine** | Zemberek NLP (bundled) |
 ---
@@ -38,15 +38,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
 pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
 ```
-> **Java is required** for Zemberek morphological analysis.
-> If you get a Java error, install it first:
->
-> | OS | Command |
-> |---|---|
-> | Ubuntu / Debian | `sudo apt install default-jre` |
-> | Fedora / RHEL | `sudo dnf install java-latest-openjdk` |
-> | macOS | `brew install openjdk` |
-> | Windows | `winget install Microsoft.OpenJDK.21` |
 ---

 | **Language** | Turkish (`tr`) |
 | **License** | MIT |
 | **Benchmark** | TR-MMLU **95.45%** (world record) |
+| **Morphological engine** | zemberek-python |
 ---
 pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
 ```
 ---

nedo_turkish_tokenizer/_compound.py CHANGED Viewed

@@ -41,10 +41,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
-        import jpype  # noqa: PLC0415
-        wa = morphology.analyze(jpype.JString(word))
-        for sa in wa.getAnalysisResults():
-            morphemes = [str(m) for m in sa.getMorphemes()]
             roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
             if len(roots) > 1:
                 return roots

 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
+        wa = morphology.analyze(word)
+        for sa in wa:
+            morphemes = [str(m) for m in sa.get_morphemes()]
             roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
             if len(roots) > 1:
                 return roots

nedo_turkish_tokenizer/_context_aware.py CHANGED Viewed

@@ -1,8 +1,11 @@
-"""Fix 12: Context-aware Zemberek disambiguation."""
 from __future__ import annotations
-from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr
 AMBIGUOUS_WORDS = {
     "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
         return tokens
     try:
-        sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip()))
-        best_list = sa_result.bestAnalysis()
         analyses: dict[str, dict] = {}
-        for idx in range(best_list.size()):
             try:
-                sa   = best_list.get(idx)
-                item = sa.getDictionaryItem()
-                sf   = str(sa.surfaceForm()).lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
-                        "lemma":     str(item.lemma),
-                        "pos":       str(sa.getPos().shortForm),
-                        "morphemes": [str(m) for m in sa.getMorphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue

+"""Fix 12: Context-aware Zemberek disambiguation.
+Uses zemberek-python (pure Python) — no JVM required.
+"""
 from __future__ import annotations
+from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
 AMBIGUOUS_WORDS = {
     "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
         return tokens
     try:
+        sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
+        best_list = sa_result.best_analysis()
         analyses: dict[str, dict] = {}
+        for sa in best_list:
             try:
+                sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
+                        "lemma":     str(sa.item.lemma),
+                        "pos":       str(sa.item.primary_pos.short_form),
+                        "morphemes": [str(m) for m in sa.get_morphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue

nedo_turkish_tokenizer/_java_check.py DELETED Viewed

@@ -1,57 +0,0 @@
-"""Java/JVM presence check with actionable error messages."""
-from __future__ import annotations
-import shutil
-import subprocess
-import sys
-def ensure_java() -> None:
-    """Raise a clear RuntimeError if Java is not installed."""
-    if shutil.which("java") is not None:
-        return
-    # Try jpype's own detection as a fallback
-    try:
-        import jpype  # noqa: PLC0415
-        jpype.getDefaultJVMPath()
-        return
-    except Exception:  # noqa: BLE001
-        pass
-    _install_cmd = _get_install_cmd()
-    raise RuntimeError(
-        "\n"
-        "╔══════════════════════════════════════════════════════════════╗\n"
-        "║  NedoTurkishTokenizer requires Java (JVM) — not found on this system  ║\n"
-        "╠══════════════════════════════════════════════════════════════╣\n"
-        f"║  Install Java with:                                          ║\n"
-        f"║    {_install_cmd:<58}║\n"
-        "║                                                              ║\n"
-        "║  Then re-run your script.                                    ║\n"
-        "╚══════════════════════════════════════════════════════════════╝\n"
-    )
-def _get_install_cmd() -> str:
-    if sys.platform == "linux":
-        # Try to detect distro
-        try:
-            out = subprocess.check_output(
-                ["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
-            )
-            if "ubuntu" in out.lower() or "debian" in out.lower():
-                return "sudo apt install default-jre"
-            if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
-                return "sudo dnf install java-latest-openjdk"
-            if "arch" in out.lower():
-                return "sudo pacman -S jre-openjdk"
-        except Exception:  # noqa: BLE001
-            pass
-        return "sudo apt install default-jre"
-    if sys.platform == "darwin":
-        return "brew install openjdk"
-    if sys.platform == "win32":
-        return "winget install Microsoft.OpenJDK.21"
-    return "Install Java from https://adoptium.net"

nedo_turkish_tokenizer/_preprocessor.py CHANGED Viewed

@@ -71,8 +71,9 @@ def _is_turkish_base(word: str) -> bool:
     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
-            for analysis in _morphology.analyze(wl):
-                lemma = str(analysis).split("]")[0].lstrip("[")
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001

     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
+            wa = _morphology.analyze(wl)
+            for sa in wa:
+                lemma = str(sa.item.lemma)
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001

nedo_turkish_tokenizer/_root_validator.py CHANGED Viewed

@@ -1,46 +1,51 @@
-"""Zemberek-based root validation and correction (Fix 4)."""
-from __future__ import annotations
-import os
-from pathlib import Path
-# ── Zemberek JAR: bundled with package ───────────────────────────────────────
-_DATA_DIR = Path(__file__).parent / "data"
-JAR_PATH  = _DATA_DIR / "zemberek-full.jar"
 ZEMBEREK_AVAILABLE = False
 _morphology = None
 def _init_zemberek() -> None:
     global ZEMBEREK_AVAILABLE, _morphology
-    if not JAR_PATH.exists():
-        print(
-            f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
-            "  Root validation disabled — morphological fixes will be limited."
-        )
-        return
     try:
-        import jpype  # noqa: PLC0415
-        if not jpype.isJVMStarted():
-            jpype.startJVM(
-                jpype.getDefaultJVMPath(),
-                "-ea",
-                f"-Djava.class.path={JAR_PATH}",
-                convertStrings=False,
-            )
-        TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
-        _morphology = TurkishMorphology.createWithDefaults()
         ZEMBEREK_AVAILABLE = True
     except ImportError:
-        print("[NedoTurkishTokenizer] jpype1 not installed → pip install jpype1")
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
@@ -50,25 +55,20 @@ _init_zemberek()
 # ── Zemberek API helpers ──────────────────────────────────────────────────────
-def _jstr(s: str):
-    import jpype  # noqa: PLC0415
-    return jpype.JString(s)
 def analyze_word(word: str) -> list[dict]:
     """Return all Zemberek analyses for a single word."""
     if not ZEMBEREK_AVAILABLE:
         return []
     try:
-        wa = _morphology.analyze(_jstr(word))
         return [
             {
-                "lemma":    str(sa.getDictionaryItem().lemma),
-                "pos":      str(sa.getPos().shortForm),
-                "morphemes":[str(m) for m in sa.getMorphemes()],
-                "surface":  str(sa.surfaceForm()),
             }
-            for sa in wa.getAnalysisResults()
         ]
     except Exception:  # noqa: BLE001
         return []
@@ -185,17 +185,16 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
     if not ZEMBEREK_AVAILABLE:
         return [None] * len(words)
     try:
-        sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
-        best = sa_result.bestAnalysis()
         out = []
-        for i in range(best.size()):
             try:
-                sa = best.get(i)
-                item = sa.getDictionaryItem()
                 out.append({
-                    "lemma":     str(item.lemma),
-                    "pos":       str(sa.getPos().shortForm),
-                    "morphemes": [str(m) for m in sa.getMorphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)

+"""Zemberek-based root validation and correction (Fix 4).
+Uses zemberek-python (pure Python) — no JVM or JPype required.
+"""
+from __future__ import annotations
 ZEMBEREK_AVAILABLE = False
 _morphology = None
+def _apply_zemberek_patch() -> None:
+    """Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
+    import csv
+    import zemberek.morphology.lexicon.root_lexicon as rl
+    # Replacement loader: reads the tab-separated lexicon file at
+    # resource_path and returns an rl.RootLexicon built from its rows.
+    def fast_load_from_resources(resource_path: str):
+        items = list()
+        # Raises the csv module's per-field size cap; this setting is
+        # module-wide, so it affects every later csv use in the process.
+        csv.field_size_limit(100000000)
+        with open(resource_path, 'r', encoding='utf-8') as f:
+            lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
+            # O(1) dictionary lookup instead of O(N) iteration per reference
+            # Rows are keyed by column 0; if two rows share a key, the later
+            # row wins. NOTE(review): confirm this matches the upstream
+            # loader's choice when ids are duplicated.
+            lex_dict = {line[0]: line for line in lex}
+            # NOTE(review): the index i is never used below.
+            for i, line in enumerate(lex):
+                item = rl.DictionaryReader.make_dict_item_from_line(line)
+                # Column 7 is treated as the key of a referenced row unless it
+                # is the literal string 'null'.
+                if line[7] != 'null':
+                    reference_item_line = lex_dict.get(line[7])
+                    # A reference to a key that is not in the file is silently
+                    # ignored; the item is still added without a reference.
+                    if reference_item_line is not None:
+                        # The referenced row is parsed into a fresh item here
+                        # rather than reusing the item appended for that row.
+                        item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
+                items.append(item)
+        return rl.RootLexicon(items)
+    # Monkey-patch: swaps the library's loader for the rest of the process.
+    # NOTE(review): assigned as a plain function — confirm the library only
+    # calls it through the class, since an instance call would bind `self`
+    # to resource_path.
+    rl.DictionaryReader.load_from_resources = fast_load_from_resources
 def _init_zemberek() -> None:
+    # Sets the module globals _morphology and ZEMBEREK_AVAILABLE. On any
+    # failure a message is printed and both keep their initial values
+    # (None / False), so callers can fall back instead of crashing.
     global ZEMBEREK_AVAILABLE, _morphology
     try:
+        from zemberek import TurkishMorphology  # noqa: PLC0415
+        # Install the patched lexicon loader before the morphology object is
+        # created (presumably creation is what triggers the lexicon load).
+        _apply_zemberek_patch()
+        _morphology = TurkishMorphology.create_with_defaults()
         ZEMBEREK_AVAILABLE = True
     except ImportError:
+        print("[NedoTurkishTokenizer] zemberek-python not installed → pip install zemberek-python")
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 # ── Zemberek API helpers ──────────────────────────────────────────────────────
 def analyze_word(word: str) -> list[dict]:
     """Return all Zemberek analyses for a single word."""
+    # Each returned dict has the keys "lemma", "pos", "morphemes", "surface".
     if not ZEMBEREK_AVAILABLE:
         return []
     try:
+        wa = _morphology.analyze(word)
         return [
             {
+                "lemma":    str(sa.item.lemma),
+                "pos":      str(sa.item.primary_pos.short_form),
+                "morphemes": [str(m) for m in sa.get_morphemes()],
+                # Surface form is rebuilt by concatenating stem and ending.
+                "surface":  str(sa.get_stem()) + str(sa.get_ending()),
             }
+            for sa in wa
         ]
+    # Best-effort: any error during analysis yields an empty list instead of
+    # propagating to the caller.
     except Exception:  # noqa: BLE001
         return []
     if not ZEMBEREK_AVAILABLE:
         return [None] * len(words)
     try:
+        sentence = " ".join(words)
+        sa_result = _morphology.analyze_and_disambiguate(sentence)
+        best = sa_result.best_analysis()
         out = []
+        for sa in best:
             try:
                 out.append({
+                    "lemma":     str(sa.item.lemma),
+                    "pos":       str(sa.item.primary_pos.short_form),
+                    "morphemes": [str(m) for m in sa.get_morphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)

nedo_turkish_tokenizer/data/zemberek-full.jar DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
-size 31644792

nedo_turkish_tokenizer/tokenizer.py CHANGED Viewed

@@ -26,11 +26,12 @@ Output fields per token:
 from __future__ import annotations
 import os
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
-from ._java_check import ensure_java
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
@@ -98,7 +99,7 @@ class NedoTurkishTokenizer:
     """
     def __init__(self) -> None:
-        ensure_java()
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE
@@ -236,6 +237,82 @@ class NedoTurkishTokenizer:
             json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
         )
     # ── Utility ───────────────────────────────────────────────────────────────
     def stats(self, tokens: list[dict]) -> dict:

 from __future__ import annotations
 import os
+import re
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
     """
     def __init__(self) -> None:
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE
             json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
         )
+    # ── Morphological Lattice API ─────────────────────────────────────────────
+    def get_morphological_lattice(self, word: str) -> list[dict]:
+        """Return all possible morphological analyses for *word* as a lattice.
+        Each entry in the returned list is a dict with:
+            ``root``          – the lemma / root form
+            ``suffixes``      – list of surface-form suffixes
+            ``pos``           – abbreviated POS tag (Noun, Verb, Adj, …)
+            ``lexical_form``  – full lexical representation from Zemberek
+        Returns an **empty list** when Zemberek cannot analyse the word
+        (unknown word) or when Zemberek is not available.
+        """
+        # NOTE(review): `_zemb_morphology` does not appear in the import hunk
+        # visible in this diff (only `validate_roots, ZEMBEREK_AVAILABLE` are
+        # imported from ._root_validator). Confirm it is defined at module
+        # level; otherwise this guard raises NameError outside the try.
+        if _zemb_morphology is None:
+            return []
+        try:
+            word_analysis = _zemb_morphology.analyze(word)
+            lattice: list[dict] = []
+            for sa in word_analysis:
+                try:
+                    root = str(sa.item.lemma)
+                    pos = str(sa.item.primary_pos.short_form)
+                    lexical_form = str(sa.format_string())
+                    # Build suffix list from morpheme chain (skip the root morpheme)
+                    # NOTE(review): entries are str() of the morpheme objects,
+                    # while the docstring says "surface-form suffixes" —
+                    # confirm which one callers expect.
+                    morphemes = list(sa.get_morphemes())
+                    suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
+                    lattice.append({
+                        "root": root,
+                        "suffixes": suffixes,
+                        "pos": pos,
+                        "lexical_form": lexical_form,
+                    })
+                # Skip an analysis whose fields cannot be read; keep the rest.
+                except Exception:  # noqa: BLE001
+                    continue
+            return lattice
+        # Any failure of the analysis call itself yields an empty lattice.
+        except Exception:  # noqa: BLE001
+            return []
+    def tokenize_lattice(self, text: str) -> dict:
+        """Tokenize *text* and return a morphological lattice for every word.
+        Returns a dict with:
+            ``input``   – the original text
+            ``words``   – list of per-word dicts, each containing
+                          ``word`` (str) and ``lattice`` (list of analyses)
+        Example::
+            tok  = NedoTurkishTokenizer()
+            data = tok.tokenize_lattice("Evin güzel gelir")
+            for w in data["words"]:
+                print(w["word"], "→", len(w["lattice"]), "analysis(es)")
+        """
+        # Split on runs of non-whitespace only. Punctuation is NOT separated:
+        # it stays attached to its word (e.g. "gelir." is one token) and is
+        # passed to the analyser as-is.
+        words = re.findall(r"\S+", text)
+        result_words: list[dict] = []
+        for w in words:
+            lattice = self.get_morphological_lattice(w)
+            result_words.append({
+                "word": w,
+                "lattice": lattice,
+            })
+        return {
+            "input": text,
+            "words": result_words,
+        }
     # ── Utility ───────────────────────────────────────────────────────────────
     def stats(self, tokens: list[dict]) -> dict:

pyproject.toml CHANGED Viewed

@@ -20,7 +20,7 @@ classifiers = [
 ]
 dependencies = [
     "turkish-tokenizer>=0.1.0",
-    "jpype1>=1.4.0",
     "requests>=2.28.0",
 ]
@@ -36,4 +36,4 @@ where = ["."]
 include = ["nedo_turkish_tokenizer*"]
 [tool.setuptools.package-data]
-nedo_turkish_tokenizer = ["data/*.jar"]

 ]
 dependencies = [
     "turkish-tokenizer>=0.1.0",
+    "zemberek-python>=0.2.3",
     "requests>=2.28.0",
 ]
 include = ["nedo_turkish_tokenizer*"]
 [tool.setuptools.package-data]
+nedo_turkish_tokenizer = ["data/*.txt"]

test_lattice.py ADDED Viewed

	@@ -0,0 +1,72 @@

+#!/usr/bin/env python3
+"""
+Test / demo script for the Morphological Lattice API.
+Shows how ambiguous Turkish words like "evin" and "gelir" produce
+multiple alternative analyses in the lattice.
+"""
+import json
+from nedo_turkish_tokenizer import NedoTurkishTokenizer
+def section(title: str) -> None:
+    # Print *title* framed above and below by a 60-character rule of '═',
+    # preceded by a blank line.
+    print(f"\n{'═' * 60}")
+    print(f"  {title}")
+    print(f"{'═' * 60}")
+def main() -> None:
+    # Demo driver: builds one tokenizer and prints lattice output for fixed
+    # sample inputs. NOTE(review): nothing here asserts on the results, so the
+    # final success message is printed unconditionally.
+    tok = NedoTurkishTokenizer()
+    # ── 1. Single-word lattice test ──────────────────────────────────────
+    section("1) get_morphological_lattice — tek kelime örnekleri")
+    test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
+    for word in test_words:
+        lattice = tok.get_morphological_lattice(word)
+        print(f"\n▸ \"{word}\" → {len(lattice)} analiz:")
+        for i, entry in enumerate(lattice):
+            print(f"    [{i}] root={entry['root']:<12} "
+                  f"pos={entry['pos']:<6} "
+                  f"suffixes={entry['suffixes']}")
+            print(f"         lexical_form = {entry['lexical_form']}")
+    # ── 2. Unknown word (empty lattice) ──────────────────────────────────
+    section("2) Bilinmeyen / yabancı kelime → boş lattice")
+    unknown_words = ["xyzfoo", "meeting", "blockchain"]
+    for word in unknown_words:
+        lattice = tok.get_morphological_lattice(word)
+        print(f"  \"{word}\" → lattice boş mu? {len(lattice) == 0}  (len={len(lattice)})")
+    # ── 3. tokenize_lattice — sentence-level test ────────────────────────
+    section("3) tokenize_lattice — cümle testi")
+    sentences = [
+        "Evin güzel gelir",
+        "Çocuk okula koşar adım gitti",
+        "Yüz yıllık çınar",
+    ]
+    for sent in sentences:
+        print(f"\n▸ Input: \"{sent}\"")
+        result = tok.tokenize_lattice(sent)
+        for winfo in result["words"]:
+            n = len(winfo["lattice"])
+            print(f"    {winfo['word']:<16} → {n} analiz(ler)")
+            for entry in winfo["lattice"]:
+                print(f"        root={entry['root']:<12} pos={entry['pos']:<6} "
+                      f"suffixes={entry['suffixes']}")
+    # ── 4. JSON output format ────────────────────────────────────────────
+    section("4) tokenize_lattice JSON çıktı")
+    data = tok.tokenize_lattice("evin gelir")
+    print(json.dumps(data, ensure_ascii=False, indent=2))
+    print("\n✅ Tüm testler başarıyla tamamlandı.")
+if __name__ == "__main__":
+    main()

tokenizer_config.json CHANGED Viewed

@@ -7,6 +7,6 @@
   "version": "1.0.0",
   "language": "tr",
   "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
-  "requires_java": true,
-  "dependencies": ["turkish-tokenizer", "jpype1"]
 }

   "version": "1.0.0",
   "language": "tr",
   "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
+  "requires_java": false,
+  "dependencies": ["turkish-tokenizer", "zemberek-python"]
 }