use zemberek-python and add regression tests
Browse files
- nedo_turkish_tokenizer/_compound.py +4 -5
- nedo_turkish_tokenizer/_context_aware.py +10 -10
- nedo_turkish_tokenizer/_java_check.py +0 -57
- nedo_turkish_tokenizer/_preprocessor.py +4 -3
- nedo_turkish_tokenizer/_root_validator.py +22 -48
- nedo_turkish_tokenizer/_tdk_vocab.py +33 -7
- nedo_turkish_tokenizer/data/zemberek-full.jar +0 -3
- nedo_turkish_tokenizer/tokenizer.py +0 -2
- pyproject.toml +3 -3
- tests/test_tdk_vocab.py +31 -0
- tests/test_zemberek_integration.py +58 -0
- tokenizer_config.json +2 -2
nedo_turkish_tokenizer/_compound.py
CHANGED
|
@@ -41,11 +41,10 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
|
|
| 41 |
|
| 42 |
def _decompose_zemberek(word: str, morphology) -> list[str] | None:
|
| 43 |
try:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
|
| 49 |
if len(roots) > 1:
|
| 50 |
return roots
|
| 51 |
except Exception: # noqa: BLE001
|
|
|
|
| 41 |
|
| 42 |
def _decompose_zemberek(word: str, morphology) -> list[str] | None:
|
| 43 |
try:
|
| 44 |
+
wa = morphology.analyze(word)
|
| 45 |
+
for sa in wa.analysis_results:
|
| 46 |
+
morphemes = [m.id_ for m in sa.get_morphemes()]
|
| 47 |
+
roots = [m for m in morphemes if m in ("Noun", "Verb", "Adj")]
|
|
|
|
| 48 |
if len(roots) > 1:
|
| 49 |
return roots
|
| 50 |
except Exception: # noqa: BLE001
|
nedo_turkish_tokenizer/_context_aware.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
|
| 6 |
|
| 7 |
AMBIGUOUS_WORDS = {
|
| 8 |
"yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
|
|
@@ -17,20 +17,20 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
|
|
| 17 |
return tokens
|
| 18 |
|
| 19 |
try:
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
|
| 23 |
analyses: dict[str, dict] = {}
|
| 24 |
-
for
|
| 25 |
try:
|
| 26 |
-
|
| 27 |
-
item = sa.getDictionaryItem()
|
| 28 |
-
sf = str(sa.surfaceForm()).lower().strip()
|
| 29 |
if sf not in analyses:
|
| 30 |
analyses[sf] = {
|
| 31 |
-
"lemma":
|
| 32 |
-
"pos":
|
| 33 |
-
"morphemes": [
|
| 34 |
}
|
| 35 |
except Exception: # noqa: BLE001
|
| 36 |
continue
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
|
| 6 |
|
| 7 |
AMBIGUOUS_WORDS = {
|
| 8 |
"yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
|
|
|
|
| 17 |
return tokens
|
| 18 |
|
| 19 |
try:
|
| 20 |
+
sentence = original_text.strip()
|
| 21 |
+
analysis = _morphology.analyze_sentence(sentence)
|
| 22 |
+
after = _morphology.disambiguate(sentence, analysis)
|
| 23 |
+
best_list = after.best_analysis()
|
| 24 |
|
| 25 |
analyses: dict[str, dict] = {}
|
| 26 |
+
for sa in best_list:
|
| 27 |
try:
|
| 28 |
+
sf = sa.surface_form().lower().strip()
|
|
|
|
|
|
|
| 29 |
if sf not in analyses:
|
| 30 |
analyses[sf] = {
|
| 31 |
+
"lemma": sa.item.lemma,
|
| 32 |
+
"pos": sa.item.primary_pos.short_form,
|
| 33 |
+
"morphemes": [m.id_ for m in sa.get_morphemes()],
|
| 34 |
}
|
| 35 |
except Exception: # noqa: BLE001
|
| 36 |
continue
|
nedo_turkish_tokenizer/_java_check.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
"""Java/JVM presence check with actionable error messages."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import shutil
|
| 6 |
-
import subprocess
|
| 7 |
-
import sys
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def ensure_java() -> None:
|
| 11 |
-
"""Raise a clear RuntimeError if Java is not installed."""
|
| 12 |
-
if shutil.which("java") is not None:
|
| 13 |
-
return
|
| 14 |
-
|
| 15 |
-
# Try jpype's own detection as a fallback
|
| 16 |
-
try:
|
| 17 |
-
import jpype # noqa: PLC0415
|
| 18 |
-
jpype.getDefaultJVMPath()
|
| 19 |
-
return
|
| 20 |
-
except Exception: # noqa: BLE001
|
| 21 |
-
pass
|
| 22 |
-
|
| 23 |
-
_install_cmd = _get_install_cmd()
|
| 24 |
-
raise RuntimeError(
|
| 25 |
-
"\n"
|
| 26 |
-
"╔══════════════════════════════════════════════════════════════╗\n"
|
| 27 |
-
"║ NedoTurkishTokenizer requires Java (JVM) — not found on this system ║\n"
|
| 28 |
-
"╠══════════════════════════════════════════════════════════════╣\n"
|
| 29 |
-
f"║ Install Java with: ║\n"
|
| 30 |
-
f"║ {_install_cmd:<58}║\n"
|
| 31 |
-
"║ ║\n"
|
| 32 |
-
"║ Then re-run your script. ║\n"
|
| 33 |
-
"╚══════════════════════════════════════════════════════════════╝\n"
|
| 34 |
-
)
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def _get_install_cmd() -> str:
|
| 38 |
-
if sys.platform == "linux":
|
| 39 |
-
# Try to detect distro
|
| 40 |
-
try:
|
| 41 |
-
out = subprocess.check_output(
|
| 42 |
-
["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
|
| 43 |
-
)
|
| 44 |
-
if "ubuntu" in out.lower() or "debian" in out.lower():
|
| 45 |
-
return "sudo apt install default-jre"
|
| 46 |
-
if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
|
| 47 |
-
return "sudo dnf install java-latest-openjdk"
|
| 48 |
-
if "arch" in out.lower():
|
| 49 |
-
return "sudo pacman -S jre-openjdk"
|
| 50 |
-
except Exception: # noqa: BLE001
|
| 51 |
-
pass
|
| 52 |
-
return "sudo apt install default-jre"
|
| 53 |
-
if sys.platform == "darwin":
|
| 54 |
-
return "brew install openjdk"
|
| 55 |
-
if sys.platform == "win32":
|
| 56 |
-
return "winget install Microsoft.OpenJDK.21"
|
| 57 |
-
return "Install Java from https://adoptium.net"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_preprocessor.py
CHANGED
|
@@ -67,12 +67,13 @@ def _is_turkish_base(word: str) -> bool:
|
|
| 67 |
tdk = load_tdk_words()
|
| 68 |
if tdk and wl in tdk:
|
| 69 |
return True
|
| 70 |
-
# Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir
|
| 71 |
try:
|
| 72 |
from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
|
| 73 |
if ZEMBEREK_AVAILABLE and _morphology:
|
| 74 |
-
|
| 75 |
-
|
|
|
|
| 76 |
if any(c in TR_CHARS for c in lemma):
|
| 77 |
return True
|
| 78 |
except Exception: # noqa: BLE001
|
|
|
|
| 67 |
tdk = load_tdk_words()
|
| 68 |
if tdk and wl in tdk:
|
| 69 |
return True
|
| 70 |
+
# Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir...)
|
| 71 |
try:
|
| 72 |
from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
|
| 73 |
if ZEMBEREK_AVAILABLE and _morphology:
|
| 74 |
+
wa = _morphology.analyze(wl)
|
| 75 |
+
for sa in wa.analysis_results:
|
| 76 |
+
lemma = sa.item.lemma
|
| 77 |
if any(c in TR_CHARS for c in lemma):
|
| 78 |
return True
|
| 79 |
except Exception: # noqa: BLE001
|
nedo_turkish_tokenizer/_root_validator.py
CHANGED
|
@@ -2,14 +2,6 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
import os
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
# ── Zemberek JAR: bundled with package ───────────────────────────────────────
|
| 9 |
-
|
| 10 |
-
_DATA_DIR = Path(__file__).parent / "data"
|
| 11 |
-
JAR_PATH = _DATA_DIR / "zemberek-full.jar"
|
| 12 |
-
|
| 13 |
ZEMBEREK_AVAILABLE = False
|
| 14 |
_morphology = None
|
| 15 |
|
|
@@ -17,30 +9,17 @@ _morphology = None
|
|
| 17 |
def _init_zemberek() -> None:
|
| 18 |
global ZEMBEREK_AVAILABLE, _morphology
|
| 19 |
|
| 20 |
-
if not JAR_PATH.exists():
|
| 21 |
-
print(
|
| 22 |
-
f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
|
| 23 |
-
" Root validation disabled — morphological fixes will be limited."
|
| 24 |
-
)
|
| 25 |
-
return
|
| 26 |
-
|
| 27 |
try:
|
| 28 |
-
import
|
| 29 |
-
|
| 30 |
-
if not jpype.isJVMStarted():
|
| 31 |
-
jpype.startJVM(
|
| 32 |
-
jpype.getDefaultJVMPath(),
|
| 33 |
-
"-ea",
|
| 34 |
-
f"-Djava.class.path={JAR_PATH}",
|
| 35 |
-
convertStrings=False,
|
| 36 |
-
)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
_morphology = TurkishMorphology.createWithDefaults()
|
| 40 |
ZEMBEREK_AVAILABLE = True
|
| 41 |
|
| 42 |
except ImportError:
|
| 43 |
-
print(
|
|
|
|
|
|
|
|
|
|
| 44 |
except Exception as exc: # noqa: BLE001
|
| 45 |
print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
|
| 46 |
|
|
@@ -50,25 +29,20 @@ _init_zemberek()
|
|
| 50 |
|
| 51 |
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 52 |
|
| 53 |
-
def _jstr(s: str):
|
| 54 |
-
import jpype # noqa: PLC0415
|
| 55 |
-
return jpype.JString(s)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
def analyze_word(word: str) -> list[dict]:
|
| 59 |
"""Return all Zemberek analyses for a single word."""
|
| 60 |
if not ZEMBEREK_AVAILABLE:
|
| 61 |
return []
|
| 62 |
try:
|
| 63 |
-
wa = _morphology.analyze(
|
| 64 |
return [
|
| 65 |
{
|
| 66 |
-
"lemma":
|
| 67 |
-
"pos":
|
| 68 |
-
"morphemes":[
|
| 69 |
-
"surface":
|
| 70 |
}
|
| 71 |
-
for sa in wa.
|
| 72 |
]
|
| 73 |
except Exception: # noqa: BLE001
|
| 74 |
return []
|
|
@@ -99,7 +73,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
|
|
| 99 |
def build_correction_map(
|
| 100 |
original_words: list[str], base_tokenizer
|
| 101 |
) -> dict[str, str]:
|
| 102 |
-
"""Build a {tokenizer_root
|
| 103 |
correction_map: dict[str, str] = {}
|
| 104 |
|
| 105 |
for word in original_words:
|
|
@@ -172,7 +146,7 @@ def validate_roots(
|
|
| 172 |
"token": leading + correct,
|
| 173 |
"_original_token": tok["token"],
|
| 174 |
"_root_corrected": True,
|
| 175 |
-
"_note": f"root corrected: '{surface}'
|
| 176 |
}
|
| 177 |
|
| 178 |
result.append(tok)
|
|
@@ -185,17 +159,17 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
|
|
| 185 |
if not ZEMBEREK_AVAILABLE:
|
| 186 |
return [None] * len(words)
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
| 190 |
out = []
|
| 191 |
-
for
|
| 192 |
try:
|
| 193 |
-
sa = best.get(i)
|
| 194 |
-
item = sa.getDictionaryItem()
|
| 195 |
out.append({
|
| 196 |
-
"lemma":
|
| 197 |
-
"pos":
|
| 198 |
-
"morphemes": [
|
| 199 |
})
|
| 200 |
except Exception: # noqa: BLE001
|
| 201 |
out.append(None)
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
ZEMBEREK_AVAILABLE = False
|
| 6 |
_morphology = None
|
| 7 |
|
|
|
|
| 9 |
def _init_zemberek() -> None:
    """Try to create the shared Zemberek morphology instance.

    On success the module globals ``_morphology`` and ``ZEMBEREK_AVAILABLE``
    are set. On failure both are left untouched and a diagnostic is printed,
    so importing this module never raises because of Zemberek.
    """
    global ZEMBEREK_AVAILABLE, _morphology

    try:
        # Imported lazily so a missing package only disables this feature.
        from zemberek import TurkishMorphology  # noqa: PLC0415

        morphology = TurkishMorphology.create_with_defaults()
        _morphology = morphology
        ZEMBEREK_AVAILABLE = True
    except ImportError:
        missing_msg = (
            "[NedoTurkishTokenizer] zemberek-python not installed\n"
            " pip install zemberek-python"
        )
        print(missing_msg)
    except Exception as exc:  # noqa: BLE001
        print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
|
| 25 |
|
|
|
|
| 29 |
|
| 30 |
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def analyze_word(word: str) -> list[dict]:
    """Return all Zemberek analyses for a single word.

    Each analysis is a dict with the keys ``"lemma"``, ``"pos"``,
    ``"morphemes"`` and ``"surface"``. An empty list is returned when
    Zemberek is unavailable or when any step of the analysis raises.
    """
    if not ZEMBEREK_AVAILABLE:
        return []

    collected: list[dict] = []
    try:
        for parse in _morphology.analyze(word).analysis_results:
            collected.append(
                {
                    "lemma": parse.item.lemma,
                    "pos": parse.item.primary_pos.short_form,
                    "morphemes": [morpheme.id_ for morpheme in parse.get_morphemes()],
                    "surface": parse.surface_form(),
                }
            )
    except Exception:  # noqa: BLE001
        # All-or-nothing: a failure part-way through discards partial results.
        return []
    return collected
|
|
|
|
| 73 |
def build_correction_map(
|
| 74 |
original_words: list[str], base_tokenizer
|
| 75 |
) -> dict[str, str]:
|
| 76 |
+
"""Build a {tokenizer_root -> zemberek_root} correction map."""
|
| 77 |
correction_map: dict[str, str] = {}
|
| 78 |
|
| 79 |
for word in original_words:
|
|
|
|
| 146 |
"token": leading + correct,
|
| 147 |
"_original_token": tok["token"],
|
| 148 |
"_root_corrected": True,
|
| 149 |
+
"_note": f"root corrected: '{surface}' -> '{correct}'",
|
| 150 |
}
|
| 151 |
|
| 152 |
result.append(tok)
|
|
|
|
| 159 |
if not ZEMBEREK_AVAILABLE:
|
| 160 |
return [None] * len(words)
|
| 161 |
try:
|
| 162 |
+
sentence = " ".join(words)
|
| 163 |
+
analysis = _morphology.analyze_sentence(sentence)
|
| 164 |
+
after = _morphology.disambiguate(sentence, analysis)
|
| 165 |
+
best = after.best_analysis()
|
| 166 |
out = []
|
| 167 |
+
for sa in best:
|
| 168 |
try:
|
|
|
|
|
|
|
| 169 |
out.append({
|
| 170 |
+
"lemma": sa.item.lemma,
|
| 171 |
+
"pos": sa.item.primary_pos.short_form,
|
| 172 |
+
"morphemes": [m.id_ for m in sa.get_morphemes()],
|
| 173 |
})
|
| 174 |
except Exception: # noqa: BLE001
|
| 175 |
out.append(None)
|
nedo_turkish_tokenizer/_tdk_vocab.py
CHANGED
|
@@ -9,6 +9,7 @@ from pathlib import Path
|
|
| 9 |
_CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
|
| 10 |
_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 11 |
TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
|
|
|
|
| 12 |
|
| 13 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 14 |
|
|
@@ -21,21 +22,46 @@ _HF_TDK_URL = (
|
|
| 21 |
)
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def load_tdk_words() -> set:
|
| 25 |
global _TDK_WORDS
|
| 26 |
if _TDK_WORDS is not None:
|
| 27 |
return _TDK_WORDS
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if not os.path.exists(TDK_CACHE_FILE):
|
| 30 |
-
print("[NedoTurkishTokenizer] TDK word list not found
|
| 31 |
words = _download_from_hf() or _download_from_tdk()
|
| 32 |
if not words:
|
| 33 |
_TDK_WORDS = set()
|
| 34 |
return _TDK_WORDS
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
return _TDK_WORDS
|
| 40 |
|
| 41 |
|
|
@@ -51,11 +77,11 @@ def _download_from_hf() -> list[str]:
|
|
| 51 |
with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
|
| 52 |
f.write("\n".join(words))
|
| 53 |
|
| 54 |
-
print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace
|
| 55 |
return words
|
| 56 |
|
| 57 |
except Exception as exc: # noqa: BLE001
|
| 58 |
-
print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc}
|
| 59 |
return []
|
| 60 |
|
| 61 |
|
|
@@ -72,7 +98,7 @@ def _download_from_tdk() -> list[str]:
|
|
| 72 |
with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
|
| 73 |
f.write("\n".join(words))
|
| 74 |
|
| 75 |
-
print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API
|
| 76 |
return words
|
| 77 |
|
| 78 |
except Exception as exc: # noqa: BLE001
|
|
|
|
| 9 |
_CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
|
| 10 |
_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 11 |
TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
|
| 12 |
+
_BUNDLED_TDK_FILE = Path(__file__).parent / "data" / "tdk_words.txt"
|
| 13 |
|
| 14 |
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 15 |
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
|
| 25 |
+
def _read_word_file(path: Path) -> set[str]:
|
| 26 |
+
with path.open(encoding="utf-8") as f:
|
| 27 |
+
return {line.strip().lower() for line in f if line.strip()}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _load_cached_or_bundled_words() -> tuple[set[str] | None, str | None]:
|
| 31 |
+
candidates = (
|
| 32 |
+
(Path(TDK_CACHE_FILE), "cache"),
|
| 33 |
+
(_BUNDLED_TDK_FILE, "package bundle"),
|
| 34 |
+
)
|
| 35 |
+
for path, source in candidates:
|
| 36 |
+
if path.exists():
|
| 37 |
+
return _read_word_file(path), source
|
| 38 |
+
return None, None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
def load_tdk_words() -> set:
    """Return the TDK word set, loading it on first use.

    The result is memoized in the module-level ``_TDK_WORDS``. Local files
    (cache, then package bundle) are tried first. Only when neither yields a
    word list, and no cache file exists, are the HuggingFace and TDK
    downloaders tried in that order. If nothing can be loaded an empty set
    is memoized and returned.
    """
    global _TDK_WORDS
    if _TDK_WORDS is not None:
        return _TDK_WORDS

    loaded, origin = _load_cached_or_bundled_words()
    if loaded is None:
        if not os.path.exists(TDK_CACHE_FILE):
            print("[NedoTurkishTokenizer] TDK word list not found - downloading...")
            downloaded = _download_from_hf() or _download_from_tdk()
            if not downloaded:
                _TDK_WORDS = set()
                return _TDK_WORDS

        # The downloaders write the cache file; re-read it so the in-memory
        # set goes through the same normalization as a cached load.
        loaded, origin = _load_cached_or_bundled_words()
        if loaded is None:
            _TDK_WORDS = set()
            return _TDK_WORDS

    _TDK_WORDS = loaded
    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {origin} [ok]")
    return _TDK_WORDS
|
| 66 |
|
| 67 |
|
|
|
|
| 77 |
with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
|
| 78 |
f.write("\n".join(words))
|
| 79 |
|
| 80 |
+
print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace [ok]")
|
| 81 |
return words
|
| 82 |
|
| 83 |
except Exception as exc: # noqa: BLE001
|
| 84 |
+
print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} - trying TDK API...")
|
| 85 |
return []
|
| 86 |
|
| 87 |
|
|
|
|
| 98 |
with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
|
| 99 |
f.write("\n".join(words))
|
| 100 |
|
| 101 |
+
print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API [ok]")
|
| 102 |
return words
|
| 103 |
|
| 104 |
except Exception as exc: # noqa: BLE001
|
nedo_turkish_tokenizer/data/zemberek-full.jar
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
|
| 3 |
-
size 31644792
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/tokenizer.py
CHANGED
|
@@ -30,7 +30,6 @@ import multiprocessing
|
|
| 30 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 31 |
from pathlib import Path
|
| 32 |
|
| 33 |
-
from ._java_check import ensure_java
|
| 34 |
from ._preprocessor import preprocess, postprocess
|
| 35 |
from ._suffix_expander import reclassify_bpe_suffixes
|
| 36 |
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
|
@@ -98,7 +97,6 @@ class NedoTurkishTokenizer:
|
|
| 98 |
"""
|
| 99 |
|
| 100 |
def __init__(self) -> None:
|
| 101 |
-
ensure_java()
|
| 102 |
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
|
| 103 |
self._base = TurkishTokenizer()
|
| 104 |
self.zemberek_available = ZEMBEREK_AVAILABLE
|
|
|
|
| 30 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 31 |
from pathlib import Path
|
| 32 |
|
|
|
|
| 33 |
from ._preprocessor import preprocess, postprocess
|
| 34 |
from ._suffix_expander import reclassify_bpe_suffixes
|
| 35 |
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
|
|
|
| 97 |
"""
|
| 98 |
|
| 99 |
def __init__(self) -> None:
|
|
|
|
| 100 |
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
|
| 101 |
self._base = TurkishTokenizer()
|
| 102 |
self.zemberek_available = ZEMBEREK_AVAILABLE
|
pyproject.toml
CHANGED
|
@@ -20,7 +20,7 @@ classifiers = [
|
|
| 20 |
]
|
| 21 |
dependencies = [
|
| 22 |
"turkish-tokenizer>=0.1.0",
|
| 23 |
-
"
|
| 24 |
"requests>=2.28.0",
|
| 25 |
]
|
| 26 |
|
|
@@ -29,11 +29,11 @@ dev = ["pytest", "huggingface_hub"]
|
|
| 29 |
|
| 30 |
[project.urls]
|
| 31 |
Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
|
| 32 |
-
Repository = "https://
|
| 33 |
|
| 34 |
[tool.setuptools.packages.find]
|
| 35 |
where = ["."]
|
| 36 |
include = ["nedo_turkish_tokenizer*"]
|
| 37 |
|
| 38 |
[tool.setuptools.package-data]
|
| 39 |
-
nedo_turkish_tokenizer = ["data/*.
|
|
|
|
| 20 |
]
|
| 21 |
dependencies = [
|
| 22 |
"turkish-tokenizer>=0.1.0",
|
| 23 |
+
"zemberek-python>=0.2.3",
|
| 24 |
"requests>=2.28.0",
|
| 25 |
]
|
| 26 |
|
|
|
|
| 29 |
|
| 30 |
[project.urls]
|
| 31 |
Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
|
| 32 |
+
Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
|
| 33 |
|
| 34 |
[tool.setuptools.packages.find]
|
| 35 |
where = ["."]
|
| 36 |
include = ["nedo_turkish_tokenizer*"]
|
| 37 |
|
| 38 |
[tool.setuptools.package-data]
|
| 39 |
+
nedo_turkish_tokenizer = ["data/*.txt"]
|
tests/test_tdk_vocab.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import tempfile
|
| 4 |
+
import unittest
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from unittest import mock
|
| 7 |
+
|
| 8 |
+
from nedo_turkish_tokenizer import _tdk_vocab
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TdkVocabTests(unittest.TestCase):
    """Regression tests for loading the TDK word list."""

    def setUp(self) -> None:
        # Clear the memoized word set so the test exercises a real load.
        self._original_words = _tdk_vocab._TDK_WORDS
        _tdk_vocab._TDK_WORDS = None

    def tearDown(self) -> None:
        # Restore whatever was memoized before the test ran.
        _tdk_vocab._TDK_WORDS = self._original_words

    def test_load_tdk_words_uses_bundled_file_before_network(self) -> None:
        """With an empty cache dir, words load without calling a downloader."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # Point the cache at a file that does not exist.
            cache_path = str(Path(tmpdir) / "tdk_words.txt")
            patch_cache = mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path)
            patch_hf = mock.patch.object(_tdk_vocab, "_download_from_hf")
            patch_tdk = mock.patch.object(_tdk_vocab, "_download_from_tdk")

            with patch_cache, patch_hf as download_hf, patch_tdk as download_tdk:
                words = _tdk_vocab.load_tdk_words()

            self.assertGreater(len(words), 50_000)
            self.assertIn("zemberek", words)
            download_hf.assert_not_called()
            download_tdk.assert_not_called()
|
tests/test_zemberek_integration.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import tempfile
|
| 4 |
+
import unittest
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from unittest import mock
|
| 7 |
+
|
| 8 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer, _tdk_vocab
|
| 9 |
+
from nedo_turkish_tokenizer._root_validator import (
|
| 10 |
+
ZEMBEREK_AVAILABLE,
|
| 11 |
+
disambiguate_sentence,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@unittest.skipUnless(ZEMBEREK_AVAILABLE, "zemberek-python is required for these tests")
class ZemberekIntegrationTests(unittest.TestCase):
    """End-to-end checks that run only when zemberek-python is available."""

    def setUp(self) -> None:
        # Clear the memoized word set so each test exercises a real load.
        self._original_words = _tdk_vocab._TDK_WORDS
        _tdk_vocab._TDK_WORDS = None

    def tearDown(self) -> None:
        # Restore whatever was memoized before the test ran.
        _tdk_vocab._TDK_WORDS = self._original_words

    def test_sentence_disambiguation_uses_zemberek_python(self) -> None:
        """Sentence disambiguation returns one analysis per input word."""
        analyses = disambiguate_sentence(["Bug\u00fcn", "geldi"])

        self.assertEqual(2, len(analyses))
        first, second = analyses
        self.assertEqual("bug\u00fcn", first["lemma"])
        self.assertEqual("gelmek", second["lemma"])
        self.assertEqual("Verb", second["pos"])

    def test_tokenizer_smoke_uses_bundled_tdk_words(self) -> None:
        """Tokenizing a short sentence works without calling a downloader."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # Point the cache at a file that does not exist.
            cache_path = str(Path(tmpdir) / "tdk_words.txt")
            patch_cache = mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path)
            patch_hf = mock.patch.object(_tdk_vocab, "_download_from_hf")
            patch_tdk = mock.patch.object(_tdk_vocab, "_download_from_tdk")

            with patch_cache, patch_hf as download_hf, patch_tdk as download_tdk:
                tokenizer = NedoTurkishTokenizer()
                tokens = tokenizer.tokenize("Bug\u00fcn \u0130stanbul'a gidiyorum.")

            has_root = any(
                t["token"].strip() == "bug\u00fcn" and t["token_type"] == "ROOT"
                for t in tokens
            )
            has_apostrophe = any(
                t["token"] == "'" and t["token_type"] == "PUNCT" for t in tokens
            )
            has_suffix = any(
                t["token"].strip() == "a"
                and t["token_type"] == "SUFFIX"
                and t["morph_pos"] == 1
                for t in tokens
            )
            has_corrected_root = any(
                t["token"].strip() == "gitmek" and t.get("_root_corrected")
                for t in tokens
            )

            self.assertTrue(has_root)
            self.assertTrue(has_apostrophe)
            self.assertTrue(has_suffix)
            self.assertTrue(has_corrected_root)
            download_hf.assert_not_called()
            download_tdk.assert_not_called()
|
tokenizer_config.json
CHANGED
|
@@ -7,6 +7,6 @@
|
|
| 7 |
"version": "1.0.0",
|
| 8 |
"language": "tr",
|
| 9 |
"description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
|
| 10 |
-
"requires_java":
|
| 11 |
-
"dependencies": ["turkish-tokenizer", "
|
| 12 |
}
|
|
|
|
| 7 |
"version": "1.0.0",
|
| 8 |
"language": "tr",
|
| 9 |
"description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
|
| 10 |
+
"requires_java": false,
|
| 11 |
+
"dependencies": ["turkish-tokenizer", "zemberek-python"]
|
| 12 |
}
|