Migrate to zemberek-python, remove JVM dependency and 31MB JAR, apply O(N^2) init fix
Browse files- README.md +2 -10
- nedo_turkish_tokenizer/_compound.py +3 -4
- nedo_turkish_tokenizer/_context_aware.py +12 -11
- nedo_turkish_tokenizer/_java_check.py +0 -57
- nedo_turkish_tokenizer/_preprocessor.py +3 -2
- nedo_turkish_tokenizer/_root_validator.py +47 -48
- nedo_turkish_tokenizer/data/zemberek-full.jar +0 -3
- nedo_turkish_tokenizer/tokenizer.py +79 -2
- pyproject.toml +2 -2
- test_lattice.py +72 -0
- tokenizer_config.json +2 -2
README.md
CHANGED
|
@@ -26,7 +26,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
|
|
| 26 |
| **Language** | Turkish (`tr`) |
|
| 27 |
| **License** | MIT |
|
| 28 |
| **Benchmark** | TR-MMLU **95.45%** (world record) |
|
| 29 |
-
| **Morphological engine** |
|
| 30 |
|
| 31 |
---
|
| 32 |
|
|
@@ -38,15 +38,7 @@ NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text
|
|
| 38 |
pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
|
| 39 |
```
|
| 40 |
|
| 41 |
-
|
| 42 |
-
> If you get a Java error, install it first:
|
| 43 |
-
>
|
| 44 |
-
> | OS | Command |
|
| 45 |
-
> |---|---|
|
| 46 |
-
> | Ubuntu / Debian | `sudo apt install default-jre` |
|
| 47 |
-
> | Fedora / RHEL | `sudo dnf install java-latest-openjdk` |
|
| 48 |
-
> | macOS | `brew install openjdk` |
|
| 49 |
-
> | Windows | `winget install Microsoft.OpenJDK.21` |
|
| 50 |
|
| 51 |
---
|
| 52 |
|
|
|
|
| 26 |
| **Language** | Turkish (`tr`) |
|
| 27 |
| **License** | MIT |
|
| 28 |
| **Benchmark** | TR-MMLU **95.45%** (world record) |
|
| 29 |
+
| **Morphological engine** | zemberek-python |
|
| 30 |
|
| 31 |
---
|
| 32 |
|
|
|
|
| 38 |
pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
|
| 39 |
```
|
| 40 |
|
| 41 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
---
|
| 44 |
|
nedo_turkish_tokenizer/_compound.py
CHANGED
|
@@ -41,10 +41,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
|
|
| 41 |
|
| 42 |
def _decompose_zemberek(word: str, morphology) -> list[str] | None:
|
| 43 |
try:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
morphemes = [str(m) for m in sa.getMorphemes()]
|
| 48 |
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
|
| 49 |
if len(roots) > 1:
|
| 50 |
return roots
|
|
|
|
| 41 |
|
| 42 |
def _decompose_zemberek(word: str, morphology) -> list[str] | None:
|
| 43 |
try:
|
| 44 |
+
wa = morphology.analyze(word)
|
| 45 |
+
for sa in wa:
|
| 46 |
+
morphemes = [str(m) for m in sa.get_morphemes()]
|
|
|
|
| 47 |
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
|
| 48 |
if len(roots) > 1:
|
| 49 |
return roots
|
nedo_turkish_tokenizer/_context_aware.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
| 1 |
-
"""Fix 12: Context-aware Zemberek disambiguation.
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
|
| 6 |
|
| 7 |
AMBIGUOUS_WORDS = {
|
| 8 |
"yΓΌz", "gelir", "yazar", "geΓ§er", "Γ§Δ±kar", "gider",
|
|
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
|
|
| 17 |
return tokens
|
| 18 |
|
| 19 |
try:
|
| 20 |
-
sa_result = _morphology.
|
| 21 |
-
best_list = sa_result.
|
| 22 |
|
| 23 |
analyses: dict[str, dict] = {}
|
| 24 |
-
for
|
| 25 |
try:
|
| 26 |
-
|
| 27 |
-
item = sa.getDictionaryItem()
|
| 28 |
-
sf = str(sa.surfaceForm()).lower().strip()
|
| 29 |
if sf not in analyses:
|
| 30 |
analyses[sf] = {
|
| 31 |
-
"lemma": str(item.lemma),
|
| 32 |
-
"pos": str(sa.
|
| 33 |
-
"morphemes": [str(m) for m in sa.
|
| 34 |
}
|
| 35 |
except Exception: # noqa: BLE001
|
| 36 |
continue
|
|
|
|
| 1 |
+
"""Fix 12: Context-aware Zemberek disambiguation.
|
| 2 |
+
|
| 3 |
+
Uses zemberek-python (pure Python) — no JVM required.
|
| 4 |
+
"""
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
+
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
|
| 9 |
|
| 10 |
AMBIGUOUS_WORDS = {
|
| 11 |
"yΓΌz", "gelir", "yazar", "geΓ§er", "Γ§Δ±kar", "gider",
|
|
|
|
| 20 |
return tokens
|
| 21 |
|
| 22 |
try:
|
| 23 |
+
sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
|
| 24 |
+
best_list = sa_result.best_analysis()
|
| 25 |
|
| 26 |
analyses: dict[str, dict] = {}
|
| 27 |
+
for sa in best_list:
|
| 28 |
try:
|
| 29 |
+
sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
|
|
|
|
|
|
|
| 30 |
if sf not in analyses:
|
| 31 |
analyses[sf] = {
|
| 32 |
+
"lemma": str(sa.item.lemma),
|
| 33 |
+
"pos": str(sa.item.primary_pos.short_form),
|
| 34 |
+
"morphemes": [str(m) for m in sa.get_morphemes()],
|
| 35 |
}
|
| 36 |
except Exception: # noqa: BLE001
|
| 37 |
continue
|
nedo_turkish_tokenizer/_java_check.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
"""Java/JVM presence check with actionable error messages."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import shutil
|
| 6 |
-
import subprocess
|
| 7 |
-
import sys
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def ensure_java() -> None:
|
| 11 |
-
"""Raise a clear RuntimeError if Java is not installed."""
|
| 12 |
-
if shutil.which("java") is not None:
|
| 13 |
-
return
|
| 14 |
-
|
| 15 |
-
# Try jpype's own detection as a fallback
|
| 16 |
-
try:
|
| 17 |
-
import jpype # noqa: PLC0415
|
| 18 |
-
jpype.getDefaultJVMPath()
|
| 19 |
-
return
|
| 20 |
-
except Exception: # noqa: BLE001
|
| 21 |
-
pass
|
| 22 |
-
|
| 23 |
-
_install_cmd = _get_install_cmd()
|
| 24 |
-
raise RuntimeError(
|
| 25 |
-
"\n"
|
| 26 |
-
"ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"
|
| 27 |
-
"β NedoTurkishTokenizer requires Java (JVM) β not found on this system β\n"
|
| 28 |
-
"β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"
|
| 29 |
-
f"β Install Java with: β\n"
|
| 30 |
-
f"β {_install_cmd:<58}β\n"
|
| 31 |
-
"β β\n"
|
| 32 |
-
"β Then re-run your script. β\n"
|
| 33 |
-
"ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"
|
| 34 |
-
)
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def _get_install_cmd() -> str:
|
| 38 |
-
if sys.platform == "linux":
|
| 39 |
-
# Try to detect distro
|
| 40 |
-
try:
|
| 41 |
-
out = subprocess.check_output(
|
| 42 |
-
["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
|
| 43 |
-
)
|
| 44 |
-
if "ubuntu" in out.lower() or "debian" in out.lower():
|
| 45 |
-
return "sudo apt install default-jre"
|
| 46 |
-
if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
|
| 47 |
-
return "sudo dnf install java-latest-openjdk"
|
| 48 |
-
if "arch" in out.lower():
|
| 49 |
-
return "sudo pacman -S jre-openjdk"
|
| 50 |
-
except Exception: # noqa: BLE001
|
| 51 |
-
pass
|
| 52 |
-
return "sudo apt install default-jre"
|
| 53 |
-
if sys.platform == "darwin":
|
| 54 |
-
return "brew install openjdk"
|
| 55 |
-
if sys.platform == "win32":
|
| 56 |
-
return "winget install Microsoft.OpenJDK.21"
|
| 57 |
-
return "Install Java from https://adoptium.net"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_preprocessor.py
CHANGED
|
@@ -71,8 +71,9 @@ def _is_turkish_base(word: str) -> bool:
|
|
| 71 |
try:
|
| 72 |
from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
|
| 73 |
if ZEMBEREK_AVAILABLE and _morphology:
|
| 74 |
-
|
| 75 |
-
|
|
|
|
| 76 |
if any(c in TR_CHARS for c in lemma):
|
| 77 |
return True
|
| 78 |
except Exception: # noqa: BLE001
|
|
|
|
| 71 |
try:
|
| 72 |
from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
|
| 73 |
if ZEMBEREK_AVAILABLE and _morphology:
|
| 74 |
+
wa = _morphology.analyze(wl)
|
| 75 |
+
for sa in wa:
|
| 76 |
+
lemma = str(sa.item.lemma)
|
| 77 |
if any(c in TR_CHARS for c in lemma):
|
| 78 |
return True
|
| 79 |
except Exception: # noqa: BLE001
|
nedo_turkish_tokenizer/_root_validator.py
CHANGED
|
@@ -1,46 +1,51 @@
|
|
| 1 |
-
"""Zemberek-based root validation and correction (Fix 4).
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
# ── Zemberek JAR: bundled with package ───────────────────────────────────────
|
| 9 |
|
| 10 |
-
|
| 11 |
-
JAR_PATH = _DATA_DIR / "zemberek-full.jar"
|
| 12 |
|
| 13 |
ZEMBEREK_AVAILABLE = False
|
| 14 |
_morphology = None
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def _init_zemberek() -> None:
|
| 18 |
global ZEMBEREK_AVAILABLE, _morphology
|
| 19 |
|
| 20 |
-
if not JAR_PATH.exists():
|
| 21 |
-
print(
|
| 22 |
-
f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
|
| 23 |
-
" Root validation disabled β morphological fixes will be limited."
|
| 24 |
-
)
|
| 25 |
-
return
|
| 26 |
-
|
| 27 |
try:
|
| 28 |
-
import
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
jpype.getDefaultJVMPath(),
|
| 33 |
-
"-ea",
|
| 34 |
-
f"-Djava.class.path={JAR_PATH}",
|
| 35 |
-
convertStrings=False,
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
|
| 39 |
-
_morphology = TurkishMorphology.createWithDefaults()
|
| 40 |
ZEMBEREK_AVAILABLE = True
|
| 41 |
|
| 42 |
except ImportError:
|
| 43 |
-
print("[NedoTurkishTokenizer]
|
| 44 |
except Exception as exc: # noqa: BLE001
|
| 45 |
print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
|
| 46 |
|
|
@@ -50,25 +55,20 @@ _init_zemberek()
|
|
| 50 |
|
| 51 |
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 52 |
|
| 53 |
-
def _jstr(s: str):
|
| 54 |
-
import jpype # noqa: PLC0415
|
| 55 |
-
return jpype.JString(s)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
def analyze_word(word: str) -> list[dict]:
|
| 59 |
"""Return all Zemberek analyses for a single word."""
|
| 60 |
if not ZEMBEREK_AVAILABLE:
|
| 61 |
return []
|
| 62 |
try:
|
| 63 |
-
wa = _morphology.analyze(
|
| 64 |
return [
|
| 65 |
{
|
| 66 |
-
"lemma": str(sa.
|
| 67 |
-
"pos": str(sa.
|
| 68 |
-
"morphemes":[str(m) for m in sa.
|
| 69 |
-
"surface": str(sa.
|
| 70 |
}
|
| 71 |
-
for sa in wa
|
| 72 |
]
|
| 73 |
except Exception: # noqa: BLE001
|
| 74 |
return []
|
|
@@ -185,17 +185,16 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
|
|
| 185 |
if not ZEMBEREK_AVAILABLE:
|
| 186 |
return [None] * len(words)
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
|
|
|
| 190 |
out = []
|
| 191 |
-
for
|
| 192 |
try:
|
| 193 |
-
sa = best.get(i)
|
| 194 |
-
item = sa.getDictionaryItem()
|
| 195 |
out.append({
|
| 196 |
-
"lemma": str(item.lemma),
|
| 197 |
-
"pos": str(sa.
|
| 198 |
-
"morphemes": [str(m) for m in sa.
|
| 199 |
})
|
| 200 |
except Exception: # noqa: BLE001
|
| 201 |
out.append(None)
|
|
|
|
| 1 |
+
"""Zemberek-based root validation and correction (Fix 4).
|
| 2 |
|
| 3 |
+
Uses zemberek-python (pure Python) — no JVM or JPype required.
|
| 4 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
from __future__ import annotations
|
|
|
|
| 7 |
|
| 8 |
ZEMBEREK_AVAILABLE = False
|
| 9 |
_morphology = None
|
| 10 |
|
| 11 |
|
| 12 |
+
def _apply_zemberek_patch() -> None:
|
| 13 |
+
"""Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
|
| 14 |
+
import csv
|
| 15 |
+
import zemberek.morphology.lexicon.root_lexicon as rl
|
| 16 |
+
|
| 17 |
+
def fast_load_from_resources(resource_path: str):
|
| 18 |
+
items = list()
|
| 19 |
+
csv.field_size_limit(100000000)
|
| 20 |
+
with open(resource_path, 'r', encoding='utf-8') as f:
|
| 21 |
+
lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
|
| 22 |
+
|
| 23 |
+
# O(1) dictionary lookup instead of O(N) iteration per reference
|
| 24 |
+
lex_dict = {line[0]: line for line in lex}
|
| 25 |
+
|
| 26 |
+
for i, line in enumerate(lex):
|
| 27 |
+
item = rl.DictionaryReader.make_dict_item_from_line(line)
|
| 28 |
+
if line[7] != 'null':
|
| 29 |
+
reference_item_line = lex_dict.get(line[7])
|
| 30 |
+
if reference_item_line is not None:
|
| 31 |
+
item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
|
| 32 |
+
items.append(item)
|
| 33 |
+
return rl.RootLexicon(items)
|
| 34 |
+
|
| 35 |
+
rl.DictionaryReader.load_from_resources = fast_load_from_resources
|
| 36 |
+
|
| 37 |
def _init_zemberek() -> None:
|
| 38 |
global ZEMBEREK_AVAILABLE, _morphology
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
try:
|
| 41 |
+
from zemberek import TurkishMorphology # noqa: PLC0415
|
| 42 |
+
|
| 43 |
+
_apply_zemberek_patch()
|
| 44 |
+
_morphology = TurkishMorphology.create_with_defaults()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
ZEMBEREK_AVAILABLE = True
|
| 46 |
|
| 47 |
except ImportError:
|
| 48 |
+
print("[NedoTurkishTokenizer] zemberek-python not installed β pip install zemberek-python")
|
| 49 |
except Exception as exc: # noqa: BLE001
|
| 50 |
print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
|
| 51 |
|
|
|
|
| 55 |
|
| 56 |
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
def analyze_word(word: str) -> list[dict]:
|
| 59 |
"""Return all Zemberek analyses for a single word."""
|
| 60 |
if not ZEMBEREK_AVAILABLE:
|
| 61 |
return []
|
| 62 |
try:
|
| 63 |
+
wa = _morphology.analyze(word)
|
| 64 |
return [
|
| 65 |
{
|
| 66 |
+
"lemma": str(sa.item.lemma),
|
| 67 |
+
"pos": str(sa.item.primary_pos.short_form),
|
| 68 |
+
"morphemes": [str(m) for m in sa.get_morphemes()],
|
| 69 |
+
"surface": str(sa.get_stem()) + str(sa.get_ending()),
|
| 70 |
}
|
| 71 |
+
for sa in wa
|
| 72 |
]
|
| 73 |
except Exception: # noqa: BLE001
|
| 74 |
return []
|
|
|
|
| 185 |
if not ZEMBEREK_AVAILABLE:
|
| 186 |
return [None] * len(words)
|
| 187 |
try:
|
| 188 |
+
sentence = " ".join(words)
|
| 189 |
+
sa_result = _morphology.analyze_and_disambiguate(sentence)
|
| 190 |
+
best = sa_result.best_analysis()
|
| 191 |
out = []
|
| 192 |
+
for sa in best:
|
| 193 |
try:
|
|
|
|
|
|
|
| 194 |
out.append({
|
| 195 |
+
"lemma": str(sa.item.lemma),
|
| 196 |
+
"pos": str(sa.item.primary_pos.short_form),
|
| 197 |
+
"morphemes": [str(m) for m in sa.get_morphemes()],
|
| 198 |
})
|
| 199 |
except Exception: # noqa: BLE001
|
| 200 |
out.append(None)
|
nedo_turkish_tokenizer/data/zemberek-full.jar
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
|
| 3 |
-
size 31644792
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/tokenizer.py
CHANGED
|
@@ -26,11 +26,12 @@ Output fields per token:
|
|
| 26 |
from __future__ import annotations
|
| 27 |
|
| 28 |
import os
|
|
|
|
| 29 |
import multiprocessing
|
| 30 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 31 |
from pathlib import Path
|
| 32 |
|
| 33 |
-
|
| 34 |
from ._preprocessor import preprocess, postprocess
|
| 35 |
from ._suffix_expander import reclassify_bpe_suffixes
|
| 36 |
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
|
@@ -98,7 +99,7 @@ class NedoTurkishTokenizer:
|
|
| 98 |
"""
|
| 99 |
|
| 100 |
def __init__(self) -> None:
|
| 101 |
-
|
| 102 |
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
|
| 103 |
self._base = TurkishTokenizer()
|
| 104 |
self.zemberek_available = ZEMBEREK_AVAILABLE
|
|
@@ -236,6 +237,82 @@ class NedoTurkishTokenizer:
|
|
| 236 |
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
|
| 237 |
)
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
# ── Utility ───────────────────────────────────────────────────────────────
|
| 240 |
|
| 241 |
def stats(self, tokens: list[dict]) -> dict:
|
|
|
|
| 26 |
from __future__ import annotations
|
| 27 |
|
| 28 |
import os
|
| 29 |
+
import re
|
| 30 |
import multiprocessing
|
| 31 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 32 |
from pathlib import Path
|
| 33 |
|
| 34 |
+
|
| 35 |
from ._preprocessor import preprocess, postprocess
|
| 36 |
from ._suffix_expander import reclassify_bpe_suffixes
|
| 37 |
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
|
|
|
| 99 |
"""
|
| 100 |
|
| 101 |
def __init__(self) -> None:
|
| 102 |
+
|
| 103 |
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
|
| 104 |
self._base = TurkishTokenizer()
|
| 105 |
self.zemberek_available = ZEMBEREK_AVAILABLE
|
|
|
|
| 237 |
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
|
| 238 |
)
|
| 239 |
|
| 240 |
+
# ── Morphological Lattice API ─────────────────────────────────────────────
|
| 241 |
+
|
| 242 |
+
def get_morphological_lattice(self, word: str) -> list[dict]:
|
| 243 |
+
"""Return all possible morphological analyses for *word* as a lattice.
|
| 244 |
+
|
| 245 |
+
Each entry in the returned list is a dict with:
|
| 246 |
+
``root`` — the lemma / root form
|
| 247 |
+
``suffixes`` — list of surface-form suffixes
|
| 248 |
+
``pos`` — abbreviated POS tag (Noun, Verb, Adj, …)
|
| 249 |
+
``lexical_form`` — full lexical representation from Zemberek
|
| 250 |
+
|
| 251 |
+
Returns an **empty list** when Zemberek cannot analyse the word
|
| 252 |
+
(unknown word) or when Zemberek is not available.
|
| 253 |
+
"""
|
| 254 |
+
if _zemb_morphology is None:
|
| 255 |
+
return []
|
| 256 |
+
|
| 257 |
+
try:
|
| 258 |
+
word_analysis = _zemb_morphology.analyze(word)
|
| 259 |
+
|
| 260 |
+
lattice: list[dict] = []
|
| 261 |
+
for sa in word_analysis:
|
| 262 |
+
try:
|
| 263 |
+
root = str(sa.item.lemma)
|
| 264 |
+
pos = str(sa.item.primary_pos.short_form)
|
| 265 |
+
lexical_form = str(sa.format_string())
|
| 266 |
+
|
| 267 |
+
# Build suffix list from morpheme chain (skip the root morpheme)
|
| 268 |
+
morphemes = list(sa.get_morphemes())
|
| 269 |
+
suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
|
| 270 |
+
|
| 271 |
+
lattice.append({
|
| 272 |
+
"root": root,
|
| 273 |
+
"suffixes": suffixes,
|
| 274 |
+
"pos": pos,
|
| 275 |
+
"lexical_form": lexical_form,
|
| 276 |
+
})
|
| 277 |
+
except Exception: # noqa: BLE001
|
| 278 |
+
continue
|
| 279 |
+
|
| 280 |
+
return lattice
|
| 281 |
+
|
| 282 |
+
except Exception: # noqa: BLE001
|
| 283 |
+
return []
|
| 284 |
+
|
| 285 |
+
def tokenize_lattice(self, text: str) -> dict:
|
| 286 |
+
"""Tokenize *text* and return a morphological lattice for every word.
|
| 287 |
+
|
| 288 |
+
Returns a dict with:
|
| 289 |
+
``input`` — the original text
|
| 290 |
+
``words`` — list of per-word dicts, each containing
|
| 291 |
+
``word`` (str) and ``lattice`` (list of analyses)
|
| 292 |
+
|
| 293 |
+
Example::
|
| 294 |
+
|
| 295 |
+
tok = NedoTurkishTokenizer()
|
| 296 |
+
data = tok.tokenize_lattice("Evin gΓΌzel gelir")
|
| 297 |
+
for w in data["words"]:
|
| 298 |
+
print(w["word"], "β", len(w["lattice"]), "analysis(es)")
|
| 299 |
+
"""
|
| 300 |
+
# Split text on whitespace, respecting punctuation
|
| 301 |
+
words = re.findall(r"\S+", text)
|
| 302 |
+
|
| 303 |
+
result_words: list[dict] = []
|
| 304 |
+
for w in words:
|
| 305 |
+
lattice = self.get_morphological_lattice(w)
|
| 306 |
+
result_words.append({
|
| 307 |
+
"word": w,
|
| 308 |
+
"lattice": lattice,
|
| 309 |
+
})
|
| 310 |
+
|
| 311 |
+
return {
|
| 312 |
+
"input": text,
|
| 313 |
+
"words": result_words,
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
# ── Utility ───────────────────────────────────────────────────────────────
|
| 317 |
|
| 318 |
def stats(self, tokens: list[dict]) -> dict:
|
pyproject.toml
CHANGED
|
@@ -20,7 +20,7 @@ classifiers = [
|
|
| 20 |
]
|
| 21 |
dependencies = [
|
| 22 |
"turkish-tokenizer>=0.1.0",
|
| 23 |
-
"
|
| 24 |
"requests>=2.28.0",
|
| 25 |
]
|
| 26 |
|
|
@@ -36,4 +36,4 @@ where = ["."]
|
|
| 36 |
include = ["nedo_turkish_tokenizer*"]
|
| 37 |
|
| 38 |
[tool.setuptools.package-data]
|
| 39 |
-
nedo_turkish_tokenizer = ["data/*.
|
|
|
|
| 20 |
]
|
| 21 |
dependencies = [
|
| 22 |
"turkish-tokenizer>=0.1.0",
|
| 23 |
+
"zemberek-python>=0.2.3",
|
| 24 |
"requests>=2.28.0",
|
| 25 |
]
|
| 26 |
|
|
|
|
| 36 |
include = ["nedo_turkish_tokenizer*"]
|
| 37 |
|
| 38 |
[tool.setuptools.package-data]
|
| 39 |
+
nedo_turkish_tokenizer = ["data/*.txt"]
|
test_lattice.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test / demo script for the Morphological Lattice API.
|
| 4 |
+
|
| 5 |
+
Shows how ambiguous Turkish words like "evin" and "gelir" produce
|
| 6 |
+
multiple alternative analyses in the lattice.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def section(title: str) -> None:
|
| 14 |
+
print(f"\n{'β' * 60}")
|
| 15 |
+
print(f" {title}")
|
| 16 |
+
print(f"{'β' * 60}")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def main() -> None:
|
| 20 |
+
tok = NedoTurkishTokenizer()
|
| 21 |
+
|
| 22 |
+
# ── 1. Tek kelime lattice testi ──────────────────────────────────────
|
| 23 |
+
section("1) get_morphological_lattice β tek kelime ΓΆrnekleri")
|
| 24 |
+
|
| 25 |
+
test_words = ["evin", "gelir", "yΓΌz", "Γ§Δ±kar", "koΕar"]
|
| 26 |
+
for word in test_words:
|
| 27 |
+
lattice = tok.get_morphological_lattice(word)
|
| 28 |
+
print(f"\nβΈ \"{word}\" β {len(lattice)} analiz:")
|
| 29 |
+
for i, entry in enumerate(lattice):
|
| 30 |
+
print(f" [{i}] root={entry['root']:<12} "
|
| 31 |
+
f"pos={entry['pos']:<6} "
|
| 32 |
+
f"suffixes={entry['suffixes']}")
|
| 33 |
+
print(f" lexical_form = {entry['lexical_form']}")
|
| 34 |
+
|
| 35 |
+
# ── 2. Bilinmeyen kelime (boş lattice) ───────────────────────────────
|
| 36 |
+
section("2) Bilinmeyen / yabancΔ± kelime β boΕ lattice")
|
| 37 |
+
|
| 38 |
+
unknown_words = ["xyzfoo", "meeting", "blockchain"]
|
| 39 |
+
for word in unknown_words:
|
| 40 |
+
lattice = tok.get_morphological_lattice(word)
|
| 41 |
+
print(f" \"{word}\" β lattice boΕ mu? {len(lattice) == 0} (len={len(lattice)})")
|
| 42 |
+
|
| 43 |
+
# ── 3. tokenize_lattice — cümle bazlı test ──────────────────────────
|
| 44 |
+
section("3) tokenize_lattice β cΓΌmle testi")
|
| 45 |
+
|
| 46 |
+
sentences = [
|
| 47 |
+
"Evin gΓΌzel gelir",
|
| 48 |
+
"Γocuk okula koΕar adΔ±m gitti",
|
| 49 |
+
"YΓΌz yΔ±llΔ±k Γ§Δ±nar",
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
for sent in sentences:
|
| 53 |
+
print(f"\nβΈ Input: \"{sent}\"")
|
| 54 |
+
result = tok.tokenize_lattice(sent)
|
| 55 |
+
for winfo in result["words"]:
|
| 56 |
+
n = len(winfo["lattice"])
|
| 57 |
+
print(f" {winfo['word']:<16} β {n} analiz(ler)")
|
| 58 |
+
for entry in winfo["lattice"]:
|
| 59 |
+
print(f" root={entry['root']:<12} pos={entry['pos']:<6} "
|
| 60 |
+
f"suffixes={entry['suffixes']}")
|
| 61 |
+
|
| 62 |
+
# ── 4. JSON çıktı formatı ────────────────────────────────────────────
|
| 63 |
+
section("4) tokenize_lattice JSON Γ§Δ±ktΔ±")
|
| 64 |
+
|
| 65 |
+
data = tok.tokenize_lattice("evin gelir")
|
| 66 |
+
print(json.dumps(data, ensure_ascii=False, indent=2))
|
| 67 |
+
|
| 68 |
+
print("\n✅ Tüm testler başarıyla tamamlandı.")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
main()
|
tokenizer_config.json
CHANGED
|
@@ -7,6 +7,6 @@
|
|
| 7 |
"version": "1.0.0",
|
| 8 |
"language": "tr",
|
| 9 |
"description": "Turkish morphological tokenizer β TR-MMLU world record 92%",
|
| 10 |
-
"requires_java":
|
| 11 |
-
"dependencies": ["turkish-tokenizer", "
|
| 12 |
}
|
|
|
|
| 7 |
"version": "1.0.0",
|
| 8 |
"language": "tr",
|
| 9 |
"description": "Turkish morphological tokenizer β TR-MMLU world record 92%",
|
| 10 |
+
"requires_java": false,
|
| 11 |
+
"dependencies": ["turkish-tokenizer", "zemberek-python"]
|
| 12 |
}
|