nmstech committed on
Commit
8c72d18
·
1 Parent(s): 3e2daf4

Migrate to zemberek-python, remove JVM dependency and 31MB JAR, apply O(N^2) init fix

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
26
  | **Language** | Turkish (`tr`) |
27
  | **License** | MIT |
28
  | **Benchmark** | TR-MMLU **95.45%** (world record) |
29
- | **Morphological engine** | Zemberek NLP (bundled) |
30
 
31
  ---
32
 
@@ -38,15 +38,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
38
  pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
39
  ```
40
 
41
- > **Java is required** for Zemberek morphological analysis.
42
- > If you get a Java error, install it first:
43
- >
44
- > | OS | Command |
45
- > |---|---|
46
- > | Ubuntu / Debian | `sudo apt install default-jre` |
47
- > | Fedora / RHEL | `sudo dnf install java-latest-openjdk` |
48
- > | macOS | `brew install openjdk` |
49
- > | Windows | `winget install Microsoft.OpenJDK.21` |
50
 
51
  ---
52
 
 
26
  | **Language** | Turkish (`tr`) |
27
  | **License** | MIT |
28
  | **Benchmark** | TR-MMLU **95.45%** (world record) |
29
+ | **Morphological engine** | zemberek-python |
30
 
31
  ---
32
 
 
38
  pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
39
  ```
40
 
41
+
 
 
 
 
 
 
 
 
42
 
43
  ---
44
 
nedo_turkish_tokenizer/_compound.py CHANGED
@@ -41,10 +41,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
41
 
42
  def _decompose_zemberek(word: str, morphology) -> list[str] | None:
43
  try:
44
- import jpype # noqa: PLC0415
45
- wa = morphology.analyze(jpype.JString(word))
46
- for sa in wa.getAnalysisResults():
47
- morphemes = [str(m) for m in sa.getMorphemes()]
48
  roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
49
  if len(roots) > 1:
50
  return roots
 
41
 
42
  def _decompose_zemberek(word: str, morphology) -> list[str] | None:
43
  try:
44
+ wa = morphology.analyze(word)
45
+ for sa in wa:
46
+ morphemes = [str(m) for m in sa.get_morphemes()]
 
47
  roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
48
  if len(roots) > 1:
49
  return roots
nedo_turkish_tokenizer/_context_aware.py CHANGED
@@ -1,8 +1,11 @@
1
- """Fix 12: Context-aware Zemberek disambiguation."""
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
- from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr
6
 
7
  AMBIGUOUS_WORDS = {
8
  "yΓΌz", "gelir", "yazar", "geΓ§er", "Γ§Δ±kar", "gider",
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
17
  return tokens
18
 
19
  try:
20
- sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip()))
21
- best_list = sa_result.bestAnalysis()
22
 
23
  analyses: dict[str, dict] = {}
24
- for idx in range(best_list.size()):
25
  try:
26
- sa = best_list.get(idx)
27
- item = sa.getDictionaryItem()
28
- sf = str(sa.surfaceForm()).lower().strip()
29
  if sf not in analyses:
30
  analyses[sf] = {
31
- "lemma": str(item.lemma),
32
- "pos": str(sa.getPos().shortForm),
33
- "morphemes": [str(m) for m in sa.getMorphemes()],
34
  }
35
  except Exception: # noqa: BLE001
36
  continue
 
1
+ """Fix 12: Context-aware Zemberek disambiguation.
2
+
3
+ Uses zemberek-python (pure Python) β€” no JVM required.
4
+ """
5
 
6
  from __future__ import annotations
7
 
8
+ from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
9
 
10
  AMBIGUOUS_WORDS = {
11
  "yΓΌz", "gelir", "yazar", "geΓ§er", "Γ§Δ±kar", "gider",
 
20
  return tokens
21
 
22
  try:
23
+ sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
24
+ best_list = sa_result.best_analysis()
25
 
26
  analyses: dict[str, dict] = {}
27
+ for sa in best_list:
28
  try:
29
+ sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
 
 
30
  if sf not in analyses:
31
  analyses[sf] = {
32
+ "lemma": str(sa.item.lemma),
33
+ "pos": str(sa.item.primary_pos.short_form),
34
+ "morphemes": [str(m) for m in sa.get_morphemes()],
35
  }
36
  except Exception: # noqa: BLE001
37
  continue
nedo_turkish_tokenizer/_java_check.py DELETED
@@ -1,57 +0,0 @@
1
- """Java/JVM presence check with actionable error messages."""
2
-
3
- from __future__ import annotations
4
-
5
- import shutil
6
- import subprocess
7
- import sys
8
-
9
-
10
def ensure_java() -> None:
    """Raise a clear RuntimeError if Java is not installed.

    Checks PATH first, then falls back to jpype's own JVM discovery
    (which can find a JVM via JAVA_HOME even when `java` is not on PATH).
    Returns silently when a JVM is found.
    """
    if shutil.which("java") is not None:
        return

    # Fallback: jpype's own detection.
    try:
        import jpype  # noqa: PLC0415
        jpype.getDefaultJVMPath()
        return
    except Exception:  # noqa: BLE001
        # Best-effort probe only; any failure means "no JVM found".
        pass

    install_cmd = _get_install_cmd()

    # Build the banner programmatically so the borders always line up.
    # (The previous hand-padded version was misaligned: the header row was
    # wider than the fixed 62-column borders, and the install command was
    # padded to 58 columns regardless of the border width.)
    header = "NedoTurkishTokenizer requires Java (JVM) — not found on this system"
    rows = [
        "Install Java with:",
        f"  {install_cmd}",
        "",
        "Then re-run your script.",
    ]
    inner = max(len(header), max(len(r) for r in rows))
    rule = "═" * (inner + 2)
    banner = (
        "\n╔" + rule + "╗\n"
        + "║ " + header.ljust(inner) + " ║\n"
        + "╠" + rule + "╣\n"
        + "".join("║ " + r.ljust(inner) + " ║\n" for r in rows)
        + "╚" + rule + "╝\n"
    )
    raise RuntimeError(banner)
35
-
36
-
37
- def _get_install_cmd() -> str:
38
- if sys.platform == "linux":
39
- # Try to detect distro
40
- try:
41
- out = subprocess.check_output(
42
- ["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
43
- )
44
- if "ubuntu" in out.lower() or "debian" in out.lower():
45
- return "sudo apt install default-jre"
46
- if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
47
- return "sudo dnf install java-latest-openjdk"
48
- if "arch" in out.lower():
49
- return "sudo pacman -S jre-openjdk"
50
- except Exception: # noqa: BLE001
51
- pass
52
- return "sudo apt install default-jre"
53
- if sys.platform == "darwin":
54
- return "brew install openjdk"
55
- if sys.platform == "win32":
56
- return "winget install Microsoft.OpenJDK.21"
57
- return "Install Java from https://adoptium.net"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
nedo_turkish_tokenizer/_preprocessor.py CHANGED
@@ -71,8 +71,9 @@ def _is_turkish_base(word: str) -> bool:
71
  try:
72
  from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
73
  if ZEMBEREK_AVAILABLE and _morphology:
74
- for analysis in _morphology.analyze(wl):
75
- lemma = str(analysis).split("]")[0].lstrip("[")
 
76
  if any(c in TR_CHARS for c in lemma):
77
  return True
78
  except Exception: # noqa: BLE001
 
71
  try:
72
  from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
73
  if ZEMBEREK_AVAILABLE and _morphology:
74
+ wa = _morphology.analyze(wl)
75
+ for sa in wa:
76
+ lemma = str(sa.item.lemma)
77
  if any(c in TR_CHARS for c in lemma):
78
  return True
79
  except Exception: # noqa: BLE001
nedo_turkish_tokenizer/_root_validator.py CHANGED
@@ -1,46 +1,51 @@
1
- """Zemberek-based root validation and correction (Fix 4)."""
2
 
3
- from __future__ import annotations
4
-
5
- import os
6
- from pathlib import Path
7
-
8
- # ── Zemberek JAR: bundled with package ───────────────────────────────────────
9
 
10
- _DATA_DIR = Path(__file__).parent / "data"
11
- JAR_PATH = _DATA_DIR / "zemberek-full.jar"
12
 
13
  ZEMBEREK_AVAILABLE = False
14
  _morphology = None
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def _init_zemberek() -> None:
18
  global ZEMBEREK_AVAILABLE, _morphology
19
 
20
- if not JAR_PATH.exists():
21
- print(
22
- f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
23
- " Root validation disabled β€” morphological fixes will be limited."
24
- )
25
- return
26
-
27
  try:
28
- import jpype # noqa: PLC0415
29
-
30
- if not jpype.isJVMStarted():
31
- jpype.startJVM(
32
- jpype.getDefaultJVMPath(),
33
- "-ea",
34
- f"-Djava.class.path={JAR_PATH}",
35
- convertStrings=False,
36
- )
37
-
38
- TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
39
- _morphology = TurkishMorphology.createWithDefaults()
40
  ZEMBEREK_AVAILABLE = True
41
 
42
  except ImportError:
43
- print("[NedoTurkishTokenizer] jpype1 not installed β†’ pip install jpype1")
44
  except Exception as exc: # noqa: BLE001
45
  print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
46
 
@@ -50,25 +55,20 @@ _init_zemberek()
50
 
51
  # ── Zemberek API helpers ──────────────────────────────────────────────────────
52
 
53
- def _jstr(s: str):
54
- import jpype # noqa: PLC0415
55
- return jpype.JString(s)
56
-
57
-
58
  def analyze_word(word: str) -> list[dict]:
59
  """Return all Zemberek analyses for a single word."""
60
  if not ZEMBEREK_AVAILABLE:
61
  return []
62
  try:
63
- wa = _morphology.analyze(_jstr(word))
64
  return [
65
  {
66
- "lemma": str(sa.getDictionaryItem().lemma),
67
- "pos": str(sa.getPos().shortForm),
68
- "morphemes":[str(m) for m in sa.getMorphemes()],
69
- "surface": str(sa.surfaceForm()),
70
  }
71
- for sa in wa.getAnalysisResults()
72
  ]
73
  except Exception: # noqa: BLE001
74
  return []
@@ -185,17 +185,16 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
185
  if not ZEMBEREK_AVAILABLE:
186
  return [None] * len(words)
187
  try:
188
- sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
189
- best = sa_result.bestAnalysis()
 
190
  out = []
191
- for i in range(best.size()):
192
  try:
193
- sa = best.get(i)
194
- item = sa.getDictionaryItem()
195
  out.append({
196
- "lemma": str(item.lemma),
197
- "pos": str(sa.getPos().shortForm),
198
- "morphemes": [str(m) for m in sa.getMorphemes()],
199
  })
200
  except Exception: # noqa: BLE001
201
  out.append(None)
 
1
+ """Zemberek-based root validation and correction (Fix 4).
2
 
3
+ Uses zemberek-python (pure Python) β€” no JVM or JPype required.
4
+ """
 
 
 
 
5
 
6
+ from __future__ import annotations
 
7
 
8
  ZEMBEREK_AVAILABLE = False
9
  _morphology = None
10
 
11
 
12
def _apply_zemberek_patch() -> None:
    """Monkey-patch the O(N^2) lexicon loader in zemberek-python 0.2.3.

    The stock ``DictionaryReader.load_from_resources`` resolves each
    entry's reference item with a linear scan over the whole lexicon,
    making startup O(N^2) in the lexicon size. This replacement indexes
    the lexicon by its first column once, so each reference lookup is
    O(1). Call before ``TurkishMorphology.create_with_defaults()``.
    """
    import csv  # noqa: PLC0415
    import zemberek.morphology.lexicon.root_lexicon as rl  # noqa: PLC0415

    def fast_load_from_resources(resource_path: str):
        # Lexicon fields can be very long; raise the csv field cap.
        csv.field_size_limit(100_000_000)
        with open(resource_path, 'r', encoding='utf-8') as f:
            lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))

        # O(1) dictionary lookup instead of O(N) iteration per reference.
        lex_by_key = {line[0]: line for line in lex}

        items = []
        for line in lex:
            item = rl.DictionaryReader.make_dict_item_from_line(line)
            # Column 7 names a reference entry; 'null' means "none".
            if line[7] != 'null':
                reference_item_line = lex_by_key.get(line[7])
                if reference_item_line is not None:
                    item.set_reference_item(
                        rl.DictionaryReader.make_dict_item_from_line(reference_item_line)
                    )
            items.append(item)
        return rl.RootLexicon(items)

    rl.DictionaryReader.load_from_resources = fast_load_from_resources
36
+
37
def _init_zemberek() -> None:
    """Initialise the shared TurkishMorphology instance (module side effect).

    Sets the module globals ``ZEMBEREK_AVAILABLE`` and ``_morphology``.
    Failure is non-fatal: a message is printed and the tokenizer runs
    without morphological validation.
    """
    global ZEMBEREK_AVAILABLE, _morphology

    try:
        from zemberek import TurkishMorphology  # noqa: PLC0415

        # Apply the O(N^2) lexicon-loader fix before the lexicon is built.
        _apply_zemberek_patch()
        _morphology = TurkishMorphology.create_with_defaults()
        ZEMBEREK_AVAILABLE = True

    except ImportError:
        # Dependency missing entirely — tell the user how to get it.
        print("[NedoTurkishTokenizer] zemberek-python not installed β†’ pip install zemberek-python")
    except Exception as exc:  # noqa: BLE001
        # Any other init failure: degrade gracefully, report the cause.
        print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
51
 
 
55
 
56
  # ── Zemberek API helpers ──────────────────────────────────────────────────────
57
 
 
 
 
 
 
58
def analyze_word(word: str) -> list[dict]:
    """Return all Zemberek analyses for a single word."""
    if not ZEMBEREK_AVAILABLE:
        return []
    try:
        analyses = []
        for sa in _morphology.analyze(word):
            analyses.append({
                "lemma": str(sa.item.lemma),
                "pos": str(sa.item.primary_pos.short_form),
                "morphemes": [str(m) for m in sa.get_morphemes()],
                "surface": str(sa.get_stem()) + str(sa.get_ending()),
            })
        return analyses
    except Exception:  # noqa: BLE001
        # Analysis failure is treated as "no analyses".
        return []
 
185
  if not ZEMBEREK_AVAILABLE:
186
  return [None] * len(words)
187
  try:
188
+ sentence = " ".join(words)
189
+ sa_result = _morphology.analyze_and_disambiguate(sentence)
190
+ best = sa_result.best_analysis()
191
  out = []
192
+ for sa in best:
193
  try:
 
 
194
  out.append({
195
+ "lemma": str(sa.item.lemma),
196
+ "pos": str(sa.item.primary_pos.short_form),
197
+ "morphemes": [str(m) for m in sa.get_morphemes()],
198
  })
199
  except Exception: # noqa: BLE001
200
  out.append(None)
nedo_turkish_tokenizer/data/zemberek-full.jar DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
3
- size 31644792
 
 
 
 
nedo_turkish_tokenizer/tokenizer.py CHANGED
@@ -26,11 +26,12 @@ Output fields per token:
26
  from __future__ import annotations
27
 
28
  import os
 
29
  import multiprocessing
30
  from concurrent.futures import ProcessPoolExecutor, as_completed
31
  from pathlib import Path
32
 
33
- from ._java_check import ensure_java
34
  from ._preprocessor import preprocess, postprocess
35
  from ._suffix_expander import reclassify_bpe_suffixes
36
  from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
@@ -98,7 +99,7 @@ class NedoTurkishTokenizer:
98
  """
99
 
100
  def __init__(self) -> None:
101
- ensure_java()
102
  from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
103
  self._base = TurkishTokenizer()
104
  self.zemberek_available = ZEMBEREK_AVAILABLE
@@ -236,6 +237,82 @@ class NedoTurkishTokenizer:
236
  json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
237
  )
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # ── Utility ───────────────────────────────────────────────────────────────
240
 
241
  def stats(self, tokens: list[dict]) -> dict:
 
26
  from __future__ import annotations
27
 
28
  import os
29
+ import re
30
  import multiprocessing
31
  from concurrent.futures import ProcessPoolExecutor, as_completed
32
  from pathlib import Path
33
 
34
+
35
  from ._preprocessor import preprocess, postprocess
36
  from ._suffix_expander import reclassify_bpe_suffixes
37
  from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
 
99
  """
100
 
101
  def __init__(self) -> None:
102
+
103
  from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
104
  self._base = TurkishTokenizer()
105
  self.zemberek_available = ZEMBEREK_AVAILABLE
 
237
  json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
238
  )
239
 
240
+ # ── Morphological Lattice API ─────────────────────────────────────────────
241
+
242
def get_morphological_lattice(self, word: str) -> list[dict]:
    """Return all possible morphological analyses for *word* as a lattice.

    Each entry in the returned list is a dict with:
        ``root``         – the lemma / root form
        ``suffixes``     – list of surface-form suffixes
        ``pos``          – abbreviated POS tag (Noun, Verb, Adj, …)
        ``lexical_form`` – full lexical representation from Zemberek

    Returns an **empty list** when Zemberek cannot analyse the word
    (unknown word) or when Zemberek is not available.
    """
    # Bug fix: the original body referenced ``_zemb_morphology``, a name
    # never defined or imported in this module, so every call raised
    # NameError. The shared morphology instance lives in _root_validator
    # as ``_morphology``; import it lazily like the rest of this package.
    try:
        from ._root_validator import _morphology  # noqa: PLC0415
    except ImportError:
        return []
    if _morphology is None:
        return []

    try:
        lattice: list[dict] = []
        for sa in _morphology.analyze(word):
            try:
                # First morpheme is the root; the rest are suffixes.
                morphemes = [str(m) for m in sa.get_morphemes()]
                lattice.append({
                    "root": str(sa.item.lemma),
                    "suffixes": morphemes[1:],
                    "pos": str(sa.item.primary_pos.short_form),
                    "lexical_form": str(sa.format_string()),
                })
            except Exception:  # noqa: BLE001
                continue
        return lattice

    except Exception:  # noqa: BLE001
        return []
284
+
285
def tokenize_lattice(self, text: str) -> dict:
    """Tokenize *text* and return a morphological lattice for every word.

    Returns a dict with:
        ``input`` – the original text
        ``words`` – list of per-word dicts, each containing
                    ``word`` (str) and ``lattice`` (list of analyses)

    Example::

        tok = NedoTurkishTokenizer()
        data = tok.tokenize_lattice("Evin gΓΌzel gelir")
        for w in data["words"]:
            print(w["word"], "β†’", len(w["lattice"]), "analysis(es)")
    """
    # str.split() with no arguments splits on runs of whitespace —
    # equivalent to matching every maximal non-whitespace run.
    return {
        "input": text,
        "words": [
            {"word": token, "lattice": self.get_morphological_lattice(token)}
            for token in text.split()
        ],
    }
315
+
316
  # ── Utility ───────────────────────────────────────────────────────────────
317
 
318
  def stats(self, tokens: list[dict]) -> dict:
pyproject.toml CHANGED
@@ -20,7 +20,7 @@ classifiers = [
20
  ]
21
  dependencies = [
22
  "turkish-tokenizer>=0.1.0",
23
- "jpype1>=1.4.0",
24
  "requests>=2.28.0",
25
  ]
26
 
@@ -36,4 +36,4 @@ where = ["."]
36
  include = ["nedo_turkish_tokenizer*"]
37
 
38
  [tool.setuptools.package-data]
39
- nedo_turkish_tokenizer = ["data/*.jar"]
 
20
  ]
21
  dependencies = [
22
  "turkish-tokenizer>=0.1.0",
23
+ "zemberek-python>=0.2.3",
24
  "requests>=2.28.0",
25
  ]
26
 
 
36
  include = ["nedo_turkish_tokenizer*"]
37
 
38
  [tool.setuptools.package-data]
39
+ nedo_turkish_tokenizer = ["data/*.txt"]
test_lattice.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test / demo script for the Morphological Lattice API.
4
+
5
+ Shows how ambiguous Turkish words like "evin" and "gelir" produce
6
+ multiple alternative analyses in the lattice.
7
+ """
8
+
9
+ import json
10
+ from nedo_turkish_tokenizer import NedoTurkishTokenizer
11
+
12
+
13
def section(title: str) -> None:
    """Print *title* framed between two heavy horizontal rules."""
    rule = "═" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
17
+
18
+
19
def main() -> None:
    """Demo driver for the Morphological Lattice API.

    Exercises single ambiguous words, unknown words, whole sentences,
    and finally dumps one result as JSON.

    NOTE(review): several string literals below appear mojibake-encoded
    (e.g. "β†’" where "→" was likely intended) — verify the file's encoding.
    """
    tok = NedoTurkishTokenizer()

    # ── 1. Single-word lattice test ──────────────────────────────────────
    section("1) get_morphological_lattice β€” tek kelime ΓΆrnekleri")

    # Ambiguous Turkish words expected to yield multiple analyses.
    test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
    for word in test_words:
        lattice = tok.get_morphological_lattice(word)
        print(f"\nβ–Έ \"{word}\" β†’ {len(lattice)} analiz:")
        for i, entry in enumerate(lattice):
            print(f" [{i}] root={entry['root']:<12} "
                  f"pos={entry['pos']:<6} "
                  f"suffixes={entry['suffixes']}")
            print(f" lexical_form = {entry['lexical_form']}")

    # ── 2. Unknown / foreign word → expect an empty lattice ──────────────
    section("2) Bilinmeyen / yabancΔ± kelime β†’ boş lattice")

    unknown_words = ["xyzfoo", "meeting", "blockchain"]
    for word in unknown_words:
        lattice = tok.get_morphological_lattice(word)
        print(f" \"{word}\" β†’ lattice boş mu? {len(lattice) == 0} (len={len(lattice)})")

    # ── 3. tokenize_lattice — sentence-level test ────────────────────────
    section("3) tokenize_lattice β€” cΓΌmle testi")

    sentences = [
        "Evin gΓΌzel gelir",
        "Γ‡ocuk okula koşar adΔ±m gitti",
        "YΓΌz yΔ±llΔ±k Γ§Δ±nar",
    ]

    for sent in sentences:
        print(f"\nβ–Έ Input: \"{sent}\"")
        result = tok.tokenize_lattice(sent)
        for winfo in result["words"]:
            # Number of alternative analyses for this word.
            n = len(winfo["lattice"])
            print(f" {winfo['word']:<16} β†’ {n} analiz(ler)")
            for entry in winfo["lattice"]:
                print(f" root={entry['root']:<12} pos={entry['pos']:<6} "
                      f"suffixes={entry['suffixes']}")

    # ── 4. JSON output format ────────────────────────────────────────────
    section("4) tokenize_lattice JSON Γ§Δ±ktΔ±")

    data = tok.tokenize_lattice("evin gelir")
    print(json.dumps(data, ensure_ascii=False, indent=2))

    print("\nβœ… TΓΌm testler başarΔ±yla tamamlandΔ±.")
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
tokenizer_config.json CHANGED
@@ -7,6 +7,6 @@
7
  "version": "1.0.0",
8
  "language": "tr",
9
  "description": "Turkish morphological tokenizer β€” TR-MMLU world record 92%",
10
- "requires_java": true,
11
- "dependencies": ["turkish-tokenizer", "jpype1"]
12
  }
 
7
  "version": "1.0.0",
8
  "language": "tr",
9
  "description": "Turkish morphological tokenizer β€” TR-MMLU world record 92%",
10
+ "requires_java": false,
11
+ "dependencies": ["turkish-tokenizer", "zemberek-python"]
12
  }