Use zemberek-python instead of the JPype/Java Zemberek bridge and add regression tests

Browse files

Files changed (12)

nedo_turkish_tokenizer/_compound.py +4 -5
nedo_turkish_tokenizer/_context_aware.py +10 -10
nedo_turkish_tokenizer/_java_check.py +0 -57
nedo_turkish_tokenizer/_preprocessor.py +4 -3
nedo_turkish_tokenizer/_root_validator.py +22 -48
nedo_turkish_tokenizer/_tdk_vocab.py +33 -7
nedo_turkish_tokenizer/data/zemberek-full.jar +0 -3
nedo_turkish_tokenizer/tokenizer.py +0 -2
pyproject.toml +3 -3
tests/test_tdk_vocab.py +31 -0
tests/test_zemberek_integration.py +58 -0
tokenizer_config.json +2 -2

nedo_turkish_tokenizer/_compound.py CHANGED Viewed

@@ -41,11 +41,10 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
-        import jpype  # noqa: PLC0415
-        wa = morphology.analyze(jpype.JString(word))
-        for sa in wa.getAnalysisResults():
-            morphemes = [str(m) for m in sa.getMorphemes()]
-            roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
             if len(roots) > 1:
                 return roots
     except Exception:  # noqa: BLE001

 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
+        wa = morphology.analyze(word)
+        for sa in wa.analysis_results:
+            morphemes = [m.id_ for m in sa.get_morphemes()]
+            roots = [m for m in morphemes if m in ("Noun", "Verb", "Adj")]
             if len(roots) > 1:
                 return roots
     except Exception:  # noqa: BLE001

nedo_turkish_tokenizer/_context_aware.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr
 AMBIGUOUS_WORDS = {
     "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
@@ -17,20 +17,20 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
         return tokens
     try:
-        sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip()))
-        best_list = sa_result.bestAnalysis()
         analyses: dict[str, dict] = {}
-        for idx in range(best_list.size()):
             try:
-                sa   = best_list.get(idx)
-                item = sa.getDictionaryItem()
-                sf   = str(sa.surfaceForm()).lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
-                        "lemma":     str(item.lemma),
-                        "pos":       str(sa.getPos().shortForm),
-                        "morphemes": [str(m) for m in sa.getMorphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue

 from __future__ import annotations
+from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
 AMBIGUOUS_WORDS = {
     "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
         return tokens
     try:
+        sentence = original_text.strip()
+        analysis = _morphology.analyze_sentence(sentence)
+        after = _morphology.disambiguate(sentence, analysis)
+        best_list = after.best_analysis()
         analyses: dict[str, dict] = {}
+        for sa in best_list:
             try:
+                sf = sa.surface_form().lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
+                        "lemma":     sa.item.lemma,
+                        "pos":       sa.item.primary_pos.short_form,
+                        "morphemes": [m.id_ for m in sa.get_morphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue

nedo_turkish_tokenizer/_java_check.py DELETED Viewed

@@ -1,57 +0,0 @@
-"""Java/JVM presence check with actionable error messages."""
-from __future__ import annotations
-import shutil
-import subprocess
-import sys
-def ensure_java() -> None:
-    """Raise a clear RuntimeError if Java is not installed."""
-    if shutil.which("java") is not None:
-        return
-    # Try jpype's own detection as a fallback
-    try:
-        import jpype  # noqa: PLC0415
-        jpype.getDefaultJVMPath()
-        return
-    except Exception:  # noqa: BLE001
-        pass
-    _install_cmd = _get_install_cmd()
-    raise RuntimeError(
-        "\n"
-        "╔══════════════════════════════════════════════════════════════╗\n"
-        "║  NedoTurkishTokenizer requires Java (JVM) — not found on this system  ║\n"
-        "╠══════════════════════════════════════════════════════════════╣\n"
-        f"║  Install Java with:                                          ║\n"
-        f"║    {_install_cmd:<58}║\n"
-        "║                                                              ║\n"
-        "║  Then re-run your script.                                    ║\n"
-        "╚══════════════════════════════════════════════════════════════╝\n"
-    )
-def _get_install_cmd() -> str:
-    if sys.platform == "linux":
-        # Try to detect distro
-        try:
-            out = subprocess.check_output(
-                ["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
-            )
-            if "ubuntu" in out.lower() or "debian" in out.lower():
-                return "sudo apt install default-jre"
-            if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
-                return "sudo dnf install java-latest-openjdk"
-            if "arch" in out.lower():
-                return "sudo pacman -S jre-openjdk"
-        except Exception:  # noqa: BLE001
-            pass
-        return "sudo apt install default-jre"
-    if sys.platform == "darwin":
-        return "brew install openjdk"
-    if sys.platform == "win32":
-        return "winget install Microsoft.OpenJDK.21"
-    return "Install Java from https://adoptium.net"

nedo_turkish_tokenizer/_preprocessor.py CHANGED Viewed

@@ -67,12 +67,13 @@ def _is_turkish_base(word: str) -> bool:
     tdk = load_tdk_words()
     if tdk and wl in tdk:
         return True
-    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
-            for analysis in _morphology.analyze(wl):
-                lemma = str(analysis).split("]")[0].lstrip("[")
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001

     tdk = load_tdk_words()
     if tdk and wl in tdk:
         return True
+    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir...)
     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
+            wa = _morphology.analyze(wl)
+            for sa in wa.analysis_results:
+                lemma = sa.item.lemma
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001

nedo_turkish_tokenizer/_root_validator.py CHANGED Viewed

@@ -2,14 +2,6 @@
 from __future__ import annotations
-import os
-from pathlib import Path
-# ── Zemberek JAR: bundled with package ───────────────────────────────────────
-_DATA_DIR = Path(__file__).parent / "data"
-JAR_PATH  = _DATA_DIR / "zemberek-full.jar"
 ZEMBEREK_AVAILABLE = False
 _morphology = None
@@ -17,30 +9,17 @@ _morphology = None
 def _init_zemberek() -> None:
     global ZEMBEREK_AVAILABLE, _morphology
-    if not JAR_PATH.exists():
-        print(
-            f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
-            "  Root validation disabled — morphological fixes will be limited."
-        )
-        return
     try:
-        import jpype  # noqa: PLC0415
-        if not jpype.isJVMStarted():
-            jpype.startJVM(
-                jpype.getDefaultJVMPath(),
-                "-ea",
-                f"-Djava.class.path={JAR_PATH}",
-                convertStrings=False,
-            )
-        TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
-        _morphology = TurkishMorphology.createWithDefaults()
         ZEMBEREK_AVAILABLE = True
     except ImportError:
-        print("[NedoTurkishTokenizer] jpype1 not installed → pip install jpype1")
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
@@ -50,25 +29,20 @@ _init_zemberek()
 # ── Zemberek API helpers ──────────────────────────────────────────────────────
-def _jstr(s: str):
-    import jpype  # noqa: PLC0415
-    return jpype.JString(s)
 def analyze_word(word: str) -> list[dict]:
     """Return all Zemberek analyses for a single word."""
     if not ZEMBEREK_AVAILABLE:
         return []
     try:
-        wa = _morphology.analyze(_jstr(word))
         return [
             {
-                "lemma":    str(sa.getDictionaryItem().lemma),
-                "pos":      str(sa.getPos().shortForm),
-                "morphemes":[str(m) for m in sa.getMorphemes()],
-                "surface":  str(sa.surfaceForm()),
             }
-            for sa in wa.getAnalysisResults()
         ]
     except Exception:  # noqa: BLE001
         return []
@@ -99,7 +73,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
 def build_correction_map(
     original_words: list[str], base_tokenizer
 ) -> dict[str, str]:
-    """Build a {tokenizer_root → zemberek_root} correction map."""
     correction_map: dict[str, str] = {}
     for word in original_words:
@@ -172,7 +146,7 @@ def validate_roots(
                 "token":           leading + correct,
                 "_original_token": tok["token"],
                 "_root_corrected": True,
-                "_note":           f"root corrected: '{surface}' → '{correct}'",
             }
         result.append(tok)
@@ -185,17 +159,17 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
     if not ZEMBEREK_AVAILABLE:
         return [None] * len(words)
     try:
-        sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
-        best = sa_result.bestAnalysis()
         out = []
-        for i in range(best.size()):
             try:
-                sa = best.get(i)
-                item = sa.getDictionaryItem()
                 out.append({
-                    "lemma":     str(item.lemma),
-                    "pos":       str(sa.getPos().shortForm),
-                    "morphemes": [str(m) for m in sa.getMorphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)

 from __future__ import annotations
 ZEMBEREK_AVAILABLE = False
 _morphology = None
 def _init_zemberek() -> None:
     global ZEMBEREK_AVAILABLE, _morphology
     try:
+        from zemberek import TurkishMorphology  # noqa: PLC0415
+        _morphology = TurkishMorphology.create_with_defaults()
         ZEMBEREK_AVAILABLE = True
     except ImportError:
+        print(
+            "[NedoTurkishTokenizer] zemberek-python not installed\n"
+            "  pip install zemberek-python"
+        )
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 # ── Zemberek API helpers ──────────────────────────────────────────────────────
 def analyze_word(word: str) -> list[dict]:
     """Return all Zemberek analyses for a single word."""
     if not ZEMBEREK_AVAILABLE:
         return []
     try:
+        wa = _morphology.analyze(word)
         return [
             {
+                "lemma":    sa.item.lemma,
+                "pos":      sa.item.primary_pos.short_form,
+                "morphemes": [m.id_ for m in sa.get_morphemes()],
+                "surface":  sa.surface_form(),
             }
+            for sa in wa.analysis_results
         ]
     except Exception:  # noqa: BLE001
         return []
 def build_correction_map(
     original_words: list[str], base_tokenizer
 ) -> dict[str, str]:
+    """Build a {tokenizer_root -> zemberek_root} correction map."""
     correction_map: dict[str, str] = {}
     for word in original_words:
                 "token":           leading + correct,
                 "_original_token": tok["token"],
                 "_root_corrected": True,
+                "_note":           f"root corrected: '{surface}' -> '{correct}'",
             }
         result.append(tok)
     if not ZEMBEREK_AVAILABLE:
         return [None] * len(words)
     try:
+        sentence = " ".join(words)
+        analysis = _morphology.analyze_sentence(sentence)
+        after = _morphology.disambiguate(sentence, analysis)
+        best = after.best_analysis()
         out = []
+        for sa in best:
             try:
                 out.append({
+                    "lemma":     sa.item.lemma,
+                    "pos":       sa.item.primary_pos.short_form,
+                    "morphemes": [m.id_ for m in sa.get_morphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)

nedo_turkish_tokenizer/_tdk_vocab.py CHANGED Viewed

@@ -9,6 +9,7 @@ from pathlib import Path
 _CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
 _CACHE_DIR.mkdir(parents=True, exist_ok=True)
 TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
 TR_CHARS = set("çğışöüÇĞİŞÖÜ")
@@ -21,21 +22,46 @@ _HF_TDK_URL = (
 )
 def load_tdk_words() -> set:
     global _TDK_WORDS
     if _TDK_WORDS is not None:
         return _TDK_WORDS
     if not os.path.exists(TDK_CACHE_FILE):
-        print("[NedoTurkishTokenizer] TDK word list not found — downloading...")
         words = _download_from_hf() or _download_from_tdk()
         if not words:
             _TDK_WORDS = set()
             return _TDK_WORDS
-    with open(TDK_CACHE_FILE, encoding="utf-8") as f:
-        _TDK_WORDS = {line.strip().lower() for line in f if line.strip()}
-    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded ✓")
     return _TDK_WORDS
@@ -51,11 +77,11 @@ def _download_from_hf() -> list[str]:
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
-        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace ✓")
         return words
     except Exception as exc:  # noqa: BLE001
-        print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} — trying TDK API...")
         return []
@@ -72,7 +98,7 @@ def _download_from_tdk() -> list[str]:
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
-        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API ✓")
         return words
     except Exception as exc:  # noqa: BLE001

 _CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
 _CACHE_DIR.mkdir(parents=True, exist_ok=True)
 TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
+_BUNDLED_TDK_FILE = Path(__file__).parent / "data" / "tdk_words.txt"
 TR_CHARS = set("çğışöüÇĞİŞÖÜ")
 )
+def _read_word_file(path: Path) -> set[str]:
+    with path.open(encoding="utf-8") as f:
+        return {line.strip().lower() for line in f if line.strip()}
+def _load_cached_or_bundled_words() -> tuple[set[str] | None, str | None]:
+    candidates = (
+        (Path(TDK_CACHE_FILE), "cache"),
+        (_BUNDLED_TDK_FILE, "package bundle"),
+    )
+    for path, source in candidates:
+        if path.exists():
+            return _read_word_file(path), source
+    return None, None
 def load_tdk_words() -> set:
     global _TDK_WORDS
     if _TDK_WORDS is not None:
         return _TDK_WORDS
+    words, source = _load_cached_or_bundled_words()
+    if words is not None:
+        _TDK_WORDS = words
+        print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
+        return _TDK_WORDS
     if not os.path.exists(TDK_CACHE_FILE):
+        print("[NedoTurkishTokenizer] TDK word list not found - downloading...")
         words = _download_from_hf() or _download_from_tdk()
         if not words:
             _TDK_WORDS = set()
             return _TDK_WORDS
+    _TDK_WORDS, source = _load_cached_or_bundled_words()
+    if _TDK_WORDS is None:
+        _TDK_WORDS = set()
+        return _TDK_WORDS
+    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
     return _TDK_WORDS
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
+        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace [ok]")
         return words
     except Exception as exc:  # noqa: BLE001
+        print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} - trying TDK API...")
         return []
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
+        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API [ok]")
         return words
     except Exception as exc:  # noqa: BLE001

nedo_turkish_tokenizer/data/zemberek-full.jar DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
-size 31644792

nedo_turkish_tokenizer/tokenizer.py CHANGED Viewed

@@ -30,7 +30,6 @@ import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
-from ._java_check import ensure_java
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
@@ -98,7 +97,6 @@ class NedoTurkishTokenizer:
     """
     def __init__(self) -> None:
-        ensure_java()
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE

 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
     """
     def __init__(self) -> None:
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE

pyproject.toml CHANGED Viewed

@@ -20,7 +20,7 @@ classifiers = [
 ]
 dependencies = [
     "turkish-tokenizer>=0.1.0",
-    "jpype1>=1.4.0",
     "requests>=2.28.0",
 ]
@@ -29,11 +29,11 @@ dev = ["pytest", "huggingface_hub"]
 [project.urls]
 Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
-Repository = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["nedo_turkish_tokenizer*"]
 [tool.setuptools.package-data]
-nedo_turkish_tokenizer = ["data/*.jar"]

 ]
 dependencies = [
     "turkish-tokenizer>=0.1.0",
+    "zemberek-python>=0.2.3",
     "requests>=2.28.0",
 ]
 [project.urls]
 Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
+Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["nedo_turkish_tokenizer*"]
 [tool.setuptools.package-data]
+nedo_turkish_tokenizer = ["data/*.txt"]

tests/test_tdk_vocab.py ADDED Viewed

@@ -0,0 +1,31 @@

+from __future__ import annotations
+import tempfile
+import unittest
+from pathlib import Path
+from unittest import mock
+from nedo_turkish_tokenizer import _tdk_vocab
+class TdkVocabTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self._original_words = _tdk_vocab._TDK_WORDS
+        _tdk_vocab._TDK_WORDS = None
+    def tearDown(self) -> None:
+        _tdk_vocab._TDK_WORDS = self._original_words
+    def test_load_tdk_words_uses_bundled_file_before_network(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_path = str(Path(tmpdir) / "tdk_words.txt")
+            with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
+                with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
+                    with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
+                        words = _tdk_vocab.load_tdk_words()
+        self.assertGreater(len(words), 50_000)
+        self.assertIn("zemberek", words)
+        download_hf.assert_not_called()
+        download_tdk.assert_not_called()

tests/test_zemberek_integration.py ADDED Viewed

@@ -0,0 +1,58 @@

+from __future__ import annotations
+import tempfile
+import unittest
+from pathlib import Path
+from unittest import mock
+from nedo_turkish_tokenizer import NedoTurkishTokenizer, _tdk_vocab
+from nedo_turkish_tokenizer._root_validator import (
+    ZEMBEREK_AVAILABLE,
+    disambiguate_sentence,
+)
+@unittest.skipUnless(ZEMBEREK_AVAILABLE, "zemberek-python is required for these tests")
+class ZemberekIntegrationTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self._original_words = _tdk_vocab._TDK_WORDS
+        _tdk_vocab._TDK_WORDS = None
+    def tearDown(self) -> None:
+        _tdk_vocab._TDK_WORDS = self._original_words
+    def test_sentence_disambiguation_uses_zemberek_python(self) -> None:
+        analyses = disambiguate_sentence(["Bug\u00fcn", "geldi"])
+        self.assertEqual(2, len(analyses))
+        self.assertEqual("bug\u00fcn", analyses[0]["lemma"])
+        self.assertEqual("gelmek", analyses[1]["lemma"])
+        self.assertEqual("Verb", analyses[1]["pos"])
+    def test_tokenizer_smoke_uses_bundled_tdk_words(self) -> None:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_path = str(Path(tmpdir) / "tdk_words.txt")
+            with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
+                with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
+                    with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
+                        tokenizer = NedoTurkishTokenizer()
+                        tokens = tokenizer.tokenize("Bug\u00fcn \u0130stanbul'a gidiyorum.")
+        self.assertTrue(
+            any(t["token"].strip() == "bug\u00fcn" and t["token_type"] == "ROOT" for t in tokens)
+        )
+        self.assertTrue(any(t["token"] == "'" and t["token_type"] == "PUNCT" for t in tokens))
+        self.assertTrue(
+            any(
+                t["token"].strip() == "a"
+                and t["token_type"] == "SUFFIX"
+                and t["morph_pos"] == 1
+                for t in tokens
+            )
+        )
+        self.assertTrue(
+            any(t["token"].strip() == "gitmek" and t.get("_root_corrected") for t in tokens)
+        )
+        download_hf.assert_not_called()
+        download_tdk.assert_not_called()

tokenizer_config.json CHANGED Viewed

@@ -7,6 +7,6 @@
   "version": "1.0.0",
   "language": "tr",
   "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
-  "requires_java": true,
-  "dependencies": ["turkish-tokenizer", "jpype1"]
 }

   "version": "1.0.0",
   "language": "tr",
   "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
+  "requires_java": false,
+  "dependencies": ["turkish-tokenizer", "zemberek-python"]
 }