nmstech committed on
Commit
5a6f887
·
1 Parent(s): 532470d

use zemberek-python and add regression tests

Browse files
nedo_turkish_tokenizer/_compound.py CHANGED
@@ -41,11 +41,10 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
41
 
42
  def _decompose_zemberek(word: str, morphology) -> list[str] | None:
43
  try:
44
- import jpype # noqa: PLC0415
45
- wa = morphology.analyze(jpype.JString(word))
46
- for sa in wa.getAnalysisResults():
47
- morphemes = [str(m) for m in sa.getMorphemes()]
48
- roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
49
  if len(roots) > 1:
50
  return roots
51
  except Exception: # noqa: BLE001
 
41
 
42
  def _decompose_zemberek(word: str, morphology) -> list[str] | None:
43
  try:
44
+ wa = morphology.analyze(word)
45
+ for sa in wa.analysis_results:
46
+ morphemes = [m.id_ for m in sa.get_morphemes()]
47
+ roots = [m for m in morphemes if m in ("Noun", "Verb", "Adj")]
 
48
  if len(roots) > 1:
49
  return roots
50
  except Exception: # noqa: BLE001
nedo_turkish_tokenizer/_context_aware.py CHANGED
@@ -2,7 +2,7 @@
2
 
3
  from __future__ import annotations
4
 
5
- from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr
6
 
7
  AMBIGUOUS_WORDS = {
8
  "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
@@ -17,20 +17,20 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
17
  return tokens
18
 
19
  try:
20
- sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip()))
21
- best_list = sa_result.bestAnalysis()
 
 
22
 
23
  analyses: dict[str, dict] = {}
24
- for idx in range(best_list.size()):
25
  try:
26
- sa = best_list.get(idx)
27
- item = sa.getDictionaryItem()
28
- sf = str(sa.surfaceForm()).lower().strip()
29
  if sf not in analyses:
30
  analyses[sf] = {
31
- "lemma": str(item.lemma),
32
- "pos": str(sa.getPos().shortForm),
33
- "morphemes": [str(m) for m in sa.getMorphemes()],
34
  }
35
  except Exception: # noqa: BLE001
36
  continue
 
2
 
3
  from __future__ import annotations
4
 
5
+ from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
6
 
7
  AMBIGUOUS_WORDS = {
8
  "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
 
17
  return tokens
18
 
19
  try:
20
+ sentence = original_text.strip()
21
+ analysis = _morphology.analyze_sentence(sentence)
22
+ after = _morphology.disambiguate(sentence, analysis)
23
+ best_list = after.best_analysis()
24
 
25
  analyses: dict[str, dict] = {}
26
+ for sa in best_list:
27
  try:
28
+ sf = sa.surface_form().lower().strip()
 
 
29
  if sf not in analyses:
30
  analyses[sf] = {
31
+ "lemma": sa.item.lemma,
32
+ "pos": sa.item.primary_pos.short_form,
33
+ "morphemes": [m.id_ for m in sa.get_morphemes()],
34
  }
35
  except Exception: # noqa: BLE001
36
  continue
nedo_turkish_tokenizer/_java_check.py DELETED
@@ -1,57 +0,0 @@
1
- """Java/JVM presence check with actionable error messages."""
2
-
3
- from __future__ import annotations
4
-
5
- import shutil
6
- import subprocess
7
- import sys
8
-
9
-
10
- def ensure_java() -> None:
11
- """Raise a clear RuntimeError if Java is not installed."""
12
- if shutil.which("java") is not None:
13
- return
14
-
15
- # Try jpype's own detection as a fallback
16
- try:
17
- import jpype # noqa: PLC0415
18
- jpype.getDefaultJVMPath()
19
- return
20
- except Exception: # noqa: BLE001
21
- pass
22
-
23
- _install_cmd = _get_install_cmd()
24
- raise RuntimeError(
25
- "\n"
26
- "╔══════════════════════════════════════════════════════════════╗\n"
27
- "║ NedoTurkishTokenizer requires Java (JVM) — not found on this system ║\n"
28
- "╠══════════════════════════════════════════════════════════════╣\n"
29
- f"║ Install Java with: ║\n"
30
- f"║ {_install_cmd:<58}║\n"
31
- "║ ║\n"
32
- "║ Then re-run your script. ║\n"
33
- "╚══════════════════════════════════════════════════════════════╝\n"
34
- )
35
-
36
-
37
- def _get_install_cmd() -> str:
38
- if sys.platform == "linux":
39
- # Try to detect distro
40
- try:
41
- out = subprocess.check_output(
42
- ["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
43
- )
44
- if "ubuntu" in out.lower() or "debian" in out.lower():
45
- return "sudo apt install default-jre"
46
- if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
47
- return "sudo dnf install java-latest-openjdk"
48
- if "arch" in out.lower():
49
- return "sudo pacman -S jre-openjdk"
50
- except Exception: # noqa: BLE001
51
- pass
52
- return "sudo apt install default-jre"
53
- if sys.platform == "darwin":
54
- return "brew install openjdk"
55
- if sys.platform == "win32":
56
- return "winget install Microsoft.OpenJDK.21"
57
- return "Install Java from https://adoptium.net"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
nedo_turkish_tokenizer/_preprocessor.py CHANGED
@@ -67,12 +67,13 @@ def _is_turkish_base(word: str) -> bool:
67
  tdk = load_tdk_words()
68
  if tdk and wl in tdk:
69
  return True
70
- # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir)
71
  try:
72
  from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
73
  if ZEMBEREK_AVAILABLE and _morphology:
74
- for analysis in _morphology.analyze(wl):
75
- lemma = str(analysis).split("]")[0].lstrip("[")
 
76
  if any(c in TR_CHARS for c in lemma):
77
  return True
78
  except Exception: # noqa: BLE001
 
67
  tdk = load_tdk_words()
68
  if tdk and wl in tdk:
69
  return True
70
+ # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir...)
71
  try:
72
  from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
73
  if ZEMBEREK_AVAILABLE and _morphology:
74
+ wa = _morphology.analyze(wl)
75
+ for sa in wa.analysis_results:
76
+ lemma = sa.item.lemma
77
  if any(c in TR_CHARS for c in lemma):
78
  return True
79
  except Exception: # noqa: BLE001
nedo_turkish_tokenizer/_root_validator.py CHANGED
@@ -2,14 +2,6 @@
2
 
3
  from __future__ import annotations
4
 
5
- import os
6
- from pathlib import Path
7
-
8
- # ── Zemberek JAR: bundled with package ───────────────────────────────────────
9
-
10
- _DATA_DIR = Path(__file__).parent / "data"
11
- JAR_PATH = _DATA_DIR / "zemberek-full.jar"
12
-
13
  ZEMBEREK_AVAILABLE = False
14
  _morphology = None
15
 
@@ -17,30 +9,17 @@ _morphology = None
17
  def _init_zemberek() -> None:
18
  global ZEMBEREK_AVAILABLE, _morphology
19
 
20
- if not JAR_PATH.exists():
21
- print(
22
- f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
23
- " Root validation disabled — morphological fixes will be limited."
24
- )
25
- return
26
-
27
  try:
28
- import jpype # noqa: PLC0415
29
-
30
- if not jpype.isJVMStarted():
31
- jpype.startJVM(
32
- jpype.getDefaultJVMPath(),
33
- "-ea",
34
- f"-Djava.class.path={JAR_PATH}",
35
- convertStrings=False,
36
- )
37
 
38
- TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
39
- _morphology = TurkishMorphology.createWithDefaults()
40
  ZEMBEREK_AVAILABLE = True
41
 
42
  except ImportError:
43
- print("[NedoTurkishTokenizer] jpype1 not installed → pip install jpype1")
 
 
 
44
  except Exception as exc: # noqa: BLE001
45
  print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
46
 
@@ -50,25 +29,20 @@ _init_zemberek()
50
 
51
  # ── Zemberek API helpers ──────────────────────────────────────────────────────
52
 
53
- def _jstr(s: str):
54
- import jpype # noqa: PLC0415
55
- return jpype.JString(s)
56
-
57
-
58
  def analyze_word(word: str) -> list[dict]:
59
  """Return all Zemberek analyses for a single word."""
60
  if not ZEMBEREK_AVAILABLE:
61
  return []
62
  try:
63
- wa = _morphology.analyze(_jstr(word))
64
  return [
65
  {
66
- "lemma": str(sa.getDictionaryItem().lemma),
67
- "pos": str(sa.getPos().shortForm),
68
- "morphemes":[str(m) for m in sa.getMorphemes()],
69
- "surface": str(sa.surfaceForm()),
70
  }
71
- for sa in wa.getAnalysisResults()
72
  ]
73
  except Exception: # noqa: BLE001
74
  return []
@@ -99,7 +73,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
99
  def build_correction_map(
100
  original_words: list[str], base_tokenizer
101
  ) -> dict[str, str]:
102
- """Build a {tokenizer_root → zemberek_root} correction map."""
103
  correction_map: dict[str, str] = {}
104
 
105
  for word in original_words:
@@ -172,7 +146,7 @@ def validate_roots(
172
  "token": leading + correct,
173
  "_original_token": tok["token"],
174
  "_root_corrected": True,
175
- "_note": f"root corrected: '{surface}' → '{correct}'",
176
  }
177
 
178
  result.append(tok)
@@ -185,17 +159,17 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
185
  if not ZEMBEREK_AVAILABLE:
186
  return [None] * len(words)
187
  try:
188
- sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
189
- best = sa_result.bestAnalysis()
 
 
190
  out = []
191
- for i in range(best.size()):
192
  try:
193
- sa = best.get(i)
194
- item = sa.getDictionaryItem()
195
  out.append({
196
- "lemma": str(item.lemma),
197
- "pos": str(sa.getPos().shortForm),
198
- "morphemes": [str(m) for m in sa.getMorphemes()],
199
  })
200
  except Exception: # noqa: BLE001
201
  out.append(None)
 
2
 
3
  from __future__ import annotations
4
 
 
 
 
 
 
 
 
 
5
  ZEMBEREK_AVAILABLE = False
6
  _morphology = None
7
 
 
9
  def _init_zemberek() -> None:
10
  global ZEMBEREK_AVAILABLE, _morphology
11
 
 
 
 
 
 
 
 
12
  try:
13
+ from zemberek import TurkishMorphology # noqa: PLC0415
 
 
 
 
 
 
 
 
14
 
15
+ _morphology = TurkishMorphology.create_with_defaults()
 
16
  ZEMBEREK_AVAILABLE = True
17
 
18
  except ImportError:
19
+ print(
20
+ "[NedoTurkishTokenizer] zemberek-python not installed\n"
21
+ " pip install zemberek-python"
22
+ )
23
  except Exception as exc: # noqa: BLE001
24
  print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
25
 
 
29
 
30
  # ── Zemberek API helpers ──────────────────────────────────────────────────────
31
 
 
 
 
 
 
32
  def analyze_word(word: str) -> list[dict]:
33
  """Return all Zemberek analyses for a single word."""
34
  if not ZEMBEREK_AVAILABLE:
35
  return []
36
  try:
37
+ wa = _morphology.analyze(word)
38
  return [
39
  {
40
+ "lemma": sa.item.lemma,
41
+ "pos": sa.item.primary_pos.short_form,
42
+ "morphemes": [m.id_ for m in sa.get_morphemes()],
43
+ "surface": sa.surface_form(),
44
  }
45
+ for sa in wa.analysis_results
46
  ]
47
  except Exception: # noqa: BLE001
48
  return []
 
73
  def build_correction_map(
74
  original_words: list[str], base_tokenizer
75
  ) -> dict[str, str]:
76
+ """Build a {tokenizer_root -> zemberek_root} correction map."""
77
  correction_map: dict[str, str] = {}
78
 
79
  for word in original_words:
 
146
  "token": leading + correct,
147
  "_original_token": tok["token"],
148
  "_root_corrected": True,
149
+ "_note": f"root corrected: '{surface}' -> '{correct}'",
150
  }
151
 
152
  result.append(tok)
 
159
  if not ZEMBEREK_AVAILABLE:
160
  return [None] * len(words)
161
  try:
162
+ sentence = " ".join(words)
163
+ analysis = _morphology.analyze_sentence(sentence)
164
+ after = _morphology.disambiguate(sentence, analysis)
165
+ best = after.best_analysis()
166
  out = []
167
+ for sa in best:
168
  try:
 
 
169
  out.append({
170
+ "lemma": sa.item.lemma,
171
+ "pos": sa.item.primary_pos.short_form,
172
+ "morphemes": [m.id_ for m in sa.get_morphemes()],
173
  })
174
  except Exception: # noqa: BLE001
175
  out.append(None)
nedo_turkish_tokenizer/_tdk_vocab.py CHANGED
@@ -9,6 +9,7 @@ from pathlib import Path
9
  _CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
10
  _CACHE_DIR.mkdir(parents=True, exist_ok=True)
11
  TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
 
12
 
13
  TR_CHARS = set("çğışöüÇĞİŞÖÜ")
14
 
@@ -21,21 +22,46 @@ _HF_TDK_URL = (
21
  )
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def load_tdk_words() -> set:
25
  global _TDK_WORDS
26
  if _TDK_WORDS is not None:
27
  return _TDK_WORDS
28
 
 
 
 
 
 
 
29
  if not os.path.exists(TDK_CACHE_FILE):
30
- print("[NedoTurkishTokenizer] TDK word list not found — downloading...")
31
  words = _download_from_hf() or _download_from_tdk()
32
  if not words:
33
  _TDK_WORDS = set()
34
  return _TDK_WORDS
35
 
36
- with open(TDK_CACHE_FILE, encoding="utf-8") as f:
37
- _TDK_WORDS = {line.strip().lower() for line in f if line.strip()}
38
- print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded ✓")
 
 
 
39
  return _TDK_WORDS
40
 
41
 
@@ -51,11 +77,11 @@ def _download_from_hf() -> list[str]:
51
  with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
52
  f.write("\n".join(words))
53
 
54
- print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace ✓")
55
  return words
56
 
57
  except Exception as exc: # noqa: BLE001
58
- print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} — trying TDK API...")
59
  return []
60
 
61
 
@@ -72,7 +98,7 @@ def _download_from_tdk() -> list[str]:
72
  with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
73
  f.write("\n".join(words))
74
 
75
- print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API ")
76
  return words
77
 
78
  except Exception as exc: # noqa: BLE001
 
9
  _CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
10
  _CACHE_DIR.mkdir(parents=True, exist_ok=True)
11
  TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
12
+ _BUNDLED_TDK_FILE = Path(__file__).parent / "data" / "tdk_words.txt"
13
 
14
  TR_CHARS = set("çğışöüÇĞİŞÖÜ")
15
 
 
22
  )
23
 
24
 
25
+ def _read_word_file(path: Path) -> set[str]:
26
+ with path.open(encoding="utf-8") as f:
27
+ return {line.strip().lower() for line in f if line.strip()}
28
+
29
+
30
+ def _load_cached_or_bundled_words() -> tuple[set[str] | None, str | None]:
31
+ candidates = (
32
+ (Path(TDK_CACHE_FILE), "cache"),
33
+ (_BUNDLED_TDK_FILE, "package bundle"),
34
+ )
35
+ for path, source in candidates:
36
+ if path.exists():
37
+ return _read_word_file(path), source
38
+ return None, None
39
+
40
+
41
  def load_tdk_words() -> set:
42
  global _TDK_WORDS
43
  if _TDK_WORDS is not None:
44
  return _TDK_WORDS
45
 
46
+ words, source = _load_cached_or_bundled_words()
47
+ if words is not None:
48
+ _TDK_WORDS = words
49
+ print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
50
+ return _TDK_WORDS
51
+
52
  if not os.path.exists(TDK_CACHE_FILE):
53
+ print("[NedoTurkishTokenizer] TDK word list not found - downloading...")
54
  words = _download_from_hf() or _download_from_tdk()
55
  if not words:
56
  _TDK_WORDS = set()
57
  return _TDK_WORDS
58
 
59
+ _TDK_WORDS, source = _load_cached_or_bundled_words()
60
+ if _TDK_WORDS is None:
61
+ _TDK_WORDS = set()
62
+ return _TDK_WORDS
63
+
64
+ print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
65
  return _TDK_WORDS
66
 
67
 
 
77
  with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
78
  f.write("\n".join(words))
79
 
80
+ print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace [ok]")
81
  return words
82
 
83
  except Exception as exc: # noqa: BLE001
84
+ print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} - trying TDK API...")
85
  return []
86
 
87
 
 
98
  with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
99
  f.write("\n".join(words))
100
 
101
+ print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API [ok]")
102
  return words
103
 
104
  except Exception as exc: # noqa: BLE001
nedo_turkish_tokenizer/data/zemberek-full.jar DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
3
- size 31644792
 
 
 
 
nedo_turkish_tokenizer/tokenizer.py CHANGED
@@ -30,7 +30,6 @@ import multiprocessing
30
  from concurrent.futures import ProcessPoolExecutor, as_completed
31
  from pathlib import Path
32
 
33
- from ._java_check import ensure_java
34
  from ._preprocessor import preprocess, postprocess
35
  from ._suffix_expander import reclassify_bpe_suffixes
36
  from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
@@ -98,7 +97,6 @@ class NedoTurkishTokenizer:
98
  """
99
 
100
  def __init__(self) -> None:
101
- ensure_java()
102
  from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
103
  self._base = TurkishTokenizer()
104
  self.zemberek_available = ZEMBEREK_AVAILABLE
 
30
  from concurrent.futures import ProcessPoolExecutor, as_completed
31
  from pathlib import Path
32
 
 
33
  from ._preprocessor import preprocess, postprocess
34
  from ._suffix_expander import reclassify_bpe_suffixes
35
  from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
 
97
  """
98
 
99
  def __init__(self) -> None:
 
100
  from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
101
  self._base = TurkishTokenizer()
102
  self.zemberek_available = ZEMBEREK_AVAILABLE
pyproject.toml CHANGED
@@ -20,7 +20,7 @@ classifiers = [
20
  ]
21
  dependencies = [
22
  "turkish-tokenizer>=0.1.0",
23
- "jpype1>=1.4.0",
24
  "requests>=2.28.0",
25
  ]
26
 
@@ -29,11 +29,11 @@ dev = ["pytest", "huggingface_hub"]
29
 
30
  [project.urls]
31
  Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
32
- Repository = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
33
 
34
  [tool.setuptools.packages.find]
35
  where = ["."]
36
  include = ["nedo_turkish_tokenizer*"]
37
 
38
  [tool.setuptools.package-data]
39
- nedo_turkish_tokenizer = ["data/*.jar"]
 
20
  ]
21
  dependencies = [
22
  "turkish-tokenizer>=0.1.0",
23
+ "zemberek-python>=0.2.3",
24
  "requests>=2.28.0",
25
  ]
26
 
 
29
 
30
  [project.urls]
31
  Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
32
+ Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
33
 
34
  [tool.setuptools.packages.find]
35
  where = ["."]
36
  include = ["nedo_turkish_tokenizer*"]
37
 
38
  [tool.setuptools.package-data]
39
+ nedo_turkish_tokenizer = ["data/*.txt"]
tests/test_tdk_vocab.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import tempfile
4
+ import unittest
5
+ from pathlib import Path
6
+ from unittest import mock
7
+
8
+ from nedo_turkish_tokenizer import _tdk_vocab
9
+
10
+
11
+ class TdkVocabTests(unittest.TestCase):
12
+ def setUp(self) -> None:
13
+ self._original_words = _tdk_vocab._TDK_WORDS
14
+ _tdk_vocab._TDK_WORDS = None
15
+
16
+ def tearDown(self) -> None:
17
+ _tdk_vocab._TDK_WORDS = self._original_words
18
+
19
+ def test_load_tdk_words_uses_bundled_file_before_network(self) -> None:
20
+ with tempfile.TemporaryDirectory() as tmpdir:
21
+ cache_path = str(Path(tmpdir) / "tdk_words.txt")
22
+
23
+ with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
24
+ with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
25
+ with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
26
+ words = _tdk_vocab.load_tdk_words()
27
+
28
+ self.assertGreater(len(words), 50_000)
29
+ self.assertIn("zemberek", words)
30
+ download_hf.assert_not_called()
31
+ download_tdk.assert_not_called()
tests/test_zemberek_integration.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import tempfile
4
+ import unittest
5
+ from pathlib import Path
6
+ from unittest import mock
7
+
8
+ from nedo_turkish_tokenizer import NedoTurkishTokenizer, _tdk_vocab
9
+ from nedo_turkish_tokenizer._root_validator import (
10
+ ZEMBEREK_AVAILABLE,
11
+ disambiguate_sentence,
12
+ )
13
+
14
+
15
+ @unittest.skipUnless(ZEMBEREK_AVAILABLE, "zemberek-python is required for these tests")
16
+ class ZemberekIntegrationTests(unittest.TestCase):
17
+ def setUp(self) -> None:
18
+ self._original_words = _tdk_vocab._TDK_WORDS
19
+ _tdk_vocab._TDK_WORDS = None
20
+
21
+ def tearDown(self) -> None:
22
+ _tdk_vocab._TDK_WORDS = self._original_words
23
+
24
+ def test_sentence_disambiguation_uses_zemberek_python(self) -> None:
25
+ analyses = disambiguate_sentence(["Bug\u00fcn", "geldi"])
26
+
27
+ self.assertEqual(2, len(analyses))
28
+ self.assertEqual("bug\u00fcn", analyses[0]["lemma"])
29
+ self.assertEqual("gelmek", analyses[1]["lemma"])
30
+ self.assertEqual("Verb", analyses[1]["pos"])
31
+
32
+ def test_tokenizer_smoke_uses_bundled_tdk_words(self) -> None:
33
+ with tempfile.TemporaryDirectory() as tmpdir:
34
+ cache_path = str(Path(tmpdir) / "tdk_words.txt")
35
+
36
+ with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
37
+ with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
38
+ with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
39
+ tokenizer = NedoTurkishTokenizer()
40
+ tokens = tokenizer.tokenize("Bug\u00fcn \u0130stanbul'a gidiyorum.")
41
+
42
+ self.assertTrue(
43
+ any(t["token"].strip() == "bug\u00fcn" and t["token_type"] == "ROOT" for t in tokens)
44
+ )
45
+ self.assertTrue(any(t["token"] == "'" and t["token_type"] == "PUNCT" for t in tokens))
46
+ self.assertTrue(
47
+ any(
48
+ t["token"].strip() == "a"
49
+ and t["token_type"] == "SUFFIX"
50
+ and t["morph_pos"] == 1
51
+ for t in tokens
52
+ )
53
+ )
54
+ self.assertTrue(
55
+ any(t["token"].strip() == "gitmek" and t.get("_root_corrected") for t in tokens)
56
+ )
57
+ download_hf.assert_not_called()
58
+ download_tdk.assert_not_called()
tokenizer_config.json CHANGED
@@ -7,6 +7,6 @@
7
  "version": "1.0.0",
8
  "language": "tr",
9
  "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
10
- "requires_java": true,
11
- "dependencies": ["turkish-tokenizer", "jpype1"]
12
  }
 
7
  "version": "1.0.0",
8
  "language": "tr",
9
  "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
10
+ "requires_java": false,
11
+ "dependencies": ["turkish-tokenizer", "zemberek-python"]
12
  }