merge github main before publish
- hf_benchmark.py +324 -0
- nedo_turkish_tokenizer/_compound.py +3 -3
- nedo_turkish_tokenizer/_context_aware.py +10 -9
- nedo_turkish_tokenizer/_preprocessor.py +3 -3
- nedo_turkish_tokenizer/_root_validator.py +44 -19
- nedo_turkish_tokenizer/tokenizer.py +79 -0
- paper_baseline_check.py +103 -0
- results/tr_mmlu_per_sample.csv +0 -0
- results/tr_mmlu_report.md +85 -0
- results/tr_mmlu_summary.json +16 -0
- test_lattice.py +72 -0
hf_benchmark.py
ADDED
@@ -0,0 +1,324 @@
"""
hf_benchmark.py
---------------
Benchmarks NedoTurkishTokenizer on TR-MMLU.
Pulls both the tokenizer and the dataset from HuggingFace.

Usage:
    cd NedoTurkishTokenizer/
    pip install huggingface_hub datasets
    python hf_benchmark.py

    # Only 2000 samples (quick test):
    python hf_benchmark.py --samples 2000

    # A specific category:
    python hf_benchmark.py --category TUS
"""

import argparse
import json
import os
import sys
import csv
from pathlib import Path

# ── HuggingFace token ─────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "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")

# ── Arguments ─────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser()
parser.add_argument("--samples", default="all", help="Kaç örnek (all veya sayı)")
parser.add_argument("--category", default=None, help="Kategori filtresi (ör. TUS)")
parser.add_argument("--out", default=".", help="Çıktı klasörü")
args = parser.parse_args()

OUT = Path(args.out)
OUT.mkdir(parents=True, exist_ok=True)

# ── HuggingFace login ─────────────────────────────────────────────────────────
from huggingface_hub import login
login(token=HF_TOKEN, add_to_git_credential=False)
print("HuggingFace login ✓")

# ── Load tokenizer ────────────────────────────────────────────────────────────
print("NedoTurkishTokenizer yükleniyor...")
sys.path.insert(0, str(Path(__file__).parent))
from nedo_turkish_tokenizer import NedoTurkishTokenizer
tok = NedoTurkishTokenizer()
print(f"  Zemberek: {'✓' if tok.zemberek_available else '✗ (devre dışı)'}")

# ── Load dataset ──────────────────────────────────────────────────────────────
print("TR-MMLU dataset yükleniyor...")
from datasets import load_dataset
ds = load_dataset(
    "alibayram/turkish_mmlu",
    split="test",
    token=HF_TOKEN,
)
print(f"  {len(ds)} örnek")

# ── Category filter ───────────────────────────────────────────────────────────
samples = list(ds)
if args.category:
    samples = [r for r in samples
               if str(r.get("bolum", "")).strip() == args.category]
    print(f"  Kategori '{args.category}': {len(samples)} örnek")

if args.samples != "all":
    n = int(args.samples)
    samples = samples[:n]
    print(f"  Kısıtlı: {len(samples)} örnek")

# ── Merge the text fields ─────────────────────────────────────────────────────
def get_text(row: dict) -> str:
    parts = []
    for field in ["soru", "question"]:
        if row.get(field):
            parts.append(str(row[field]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    for lbl in ["A", "B", "C", "D"]:
        if row.get(lbl):
            parts.append(str(row[lbl]))
    if row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    return " ".join(parts)

# ── Token statistics ──────────────────────────────────────────────────────────
def token_stats(tokens: list[dict]) -> dict:
    """Use NedoTurkishTokenizer's token_type field."""
    total = len(tokens)
    if total == 0:
        return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0, "orig_tr_pct": 0.0}

    # Enhanced stats
    roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
    suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
    foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
    punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
    bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
    special = sum(1 for t in tokens
                  if t["token_type"] in ("NUM", "DATE", "UNIT", "URL",
                                         "MENTION", "HASHTAG", "EMOJI", "ACRONYM"))
    tr = roots + suffixes + foreign + punct + special
    pure = sum(1 for t in tokens
               if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
               and not t["token"].strip().startswith("<"))
    return {
        "total": total,
        "roots": roots,
        "suffixes": suffixes,
        "foreign": foreign,
        "bpe": bpe,
        "punct": punct,
        "special": special,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

def orig_stats(tokens: list[dict]) -> dict:
    """Original tokenizer statistics (for comparison)."""
    total = len(tokens)
    if total == 0:
        return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0}
    roots = sum(1 for t in tokens if t.get("type") == "ROOT")
    suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
    bpe = sum(1 for t in tokens if t.get("type") == "BPE")
    tr = roots + suffixes
    pure = sum(1 for t in tokens
               if t.get("type") in ("ROOT", "SUFFIX")
               and not t.get("token", "").strip().startswith("<"))
    return {
        "total": total,
        "bpe": bpe,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

# ── Main benchmark loop ───────────────────────────────────────────────────────
print(f"\nBenchmark başlıyor: {len(samples)} örnek...")

per_sample = []
orig_tr_sum = 0.0
enh_tr_sum = 0.0
orig_pur_sum = 0.0
enh_pur_sum = 0.0
orig_tok_sum = 0
enh_tok_sum = 0
orig_bpe_sum = 0
enh_bpe_sum = 0
improved = 0
regressed = 0
unchanged = 0

REPORT_EVERY = 500

for idx, row in enumerate(samples):
    text = get_text(row)
    if not text.strip():
        continue

    # Original tokenizer
    orig_toks = tok._base.tokenize_text(text)
    os_ = orig_stats(orig_toks)

    # NedoTurkishTokenizer
    enh_toks = tok.tokenize(text)
    es_ = token_stats(enh_toks)

    d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)

    per_sample.append({
        "idx": idx,
        "bolum": str(row.get("bolum", "")),
        "orig_tr": os_["tr_pct"],
        "enh_tr": es_["tr_pct"],
        "d_tr": d_tr,
        "orig_pure": os_["pure_pct"],
        "enh_pure": es_["pure_pct"],
        "orig_tok": os_["total"],
        "enh_tok": es_["total"],
        "orig_bpe": os_["bpe"],
        "enh_bpe": es_["bpe"],
    })

    orig_tr_sum += os_["tr_pct"]
    enh_tr_sum += es_["tr_pct"]
    orig_pur_sum += os_["pure_pct"]
    enh_pur_sum += es_["pure_pct"]
    orig_tok_sum += os_["total"]
    enh_tok_sum += es_["total"]
    orig_bpe_sum += os_["bpe"]
    enh_bpe_sum += es_["bpe"]

    if d_tr > 0: improved += 1
    elif d_tr < 0: regressed += 1
    else: unchanged += 1

    if (idx + 1) % REPORT_EVERY == 0:
        n = idx + 1
        print(f"  [{n:>6}/{len(samples)}] "
              f"TR%: {enh_tr_sum/n:.2f}% "
              f"BPE/örnek: {enh_bpe_sum/n:.2f} "
              f"Regressed: {regressed}")

n = len(per_sample)
if n == 0:
    print("Hiç örnek işlenmedi!")
    sys.exit(1)

# ── Summary ───────────────────────────────────────────────────────────────────
summary = {
    "n_samples": n,
    "orig_tr_pct": round(orig_tr_sum / n, 4),
    "orig_pure_pct": round(orig_pur_sum / n, 4),
    "enh_tr_pct": round(enh_tr_sum / n, 4),
    "enh_pure_pct": round(enh_pur_sum / n, 4),
    "delta_tr_pct": round((enh_tr_sum - orig_tr_sum) / n, 4),
    "delta_pure_pct": round((enh_pur_sum - orig_pur_sum) / n, 4),
    "orig_avg_tokens": round(orig_tok_sum / n, 2),
    "enh_avg_tokens": round(enh_tok_sum / n, 2),
    "orig_avg_bpe": round(orig_bpe_sum / n, 2),
    "enh_avg_bpe": round(enh_bpe_sum / n, 2),
    "pct_improved": round(improved / n * 100, 2),
    "pct_regressed": round(regressed / n * 100, 2),
    "pct_unchanged": round(unchanged / n * 100, 2),
}

# ── Per-category breakdown ────────────────────────────────────────────────────
from collections import defaultdict
cat_scores = defaultdict(list)
for row in per_sample:
    cat_scores[row["bolum"]].append(row["enh_tr"])

cat_summary = {
    cat: round(sum(v) / len(v), 2)
    for cat, v in cat_scores.items()
    if len(v) >= 3
}
cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])

# ── Print the report ──────────────────────────────────────────────────────────
SEP = "─" * 65
print(f"\n{SEP}")
print("  NedoTurkishTokenizer – TR-MMLU Benchmark Sonuçları")
print(SEP)
print(f"  N = {n:,} örnek\n")
print(f"  {'Metrik':30s} {'Orijinal':>10} {'Enhanced':>10} {'Δ':>8}")
print("  " + "─" * 55)
for label, orig, enh, delta in [
    ("TR%", summary["orig_tr_pct"], summary["enh_tr_pct"], summary["delta_tr_pct"]),
    ("Pure%", summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
    ("Avg token/örn", summary["orig_avg_tokens"], summary["enh_avg_tokens"],
     round(summary["enh_avg_tokens"] - summary["orig_avg_tokens"], 2)),
    ("Avg BPE/örn", summary["orig_avg_bpe"], summary["enh_avg_bpe"],
     round(summary["enh_avg_bpe"] - summary["orig_avg_bpe"], 2)),
]:
    print(f"  {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")

print(f"\n  İyileşen : {improved:>6,} (%{summary['pct_improved']:.2f})")
print(f"  Gerileyen: {regressed:>6,} (%{summary['pct_regressed']:.2f})")
print(f"  Değişmeyen:{unchanged:>6,} (%{summary['pct_unchanged']:.2f})")

print(f"\n  En düşük TR% kategoriler:")
for cat, avg in cat_sorted[:10]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print(f"\n  En yüksek TR% kategoriler:")
for cat, avg in cat_sorted[-8:]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print(SEP)

# ── Write the output files ────────────────────────────────────────────────────
# Summary JSON
summary_path = OUT / "tr_mmlu_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)
print(f"\n  ✓ {summary_path}")

# Report Markdown
paper_tr = 90.29
paper_pur = 85.80
report_path = OUT / "tr_mmlu_report.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# TR-MMLU Benchmark – NedoTurkishTokenizer\n\n")
    f.write(f"**N = {n:,} örnek**\n\n")
    f.write("## Ana Metrikler\n\n")
    f.write("| Metrik | Orijinal | Enhanced | Δ |\n")
    f.write("|--------|----------|----------|---|\n")
    f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
    f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
    f.write(f"| Avg token/örnek | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
    f.write(f"| Avg BPE/örnek | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
    f.write("\n## Paper ile Karşılaştırma\n\n")
    f.write("| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |\n")
    f.write("|--------|-----------------|-----------------|---------------------|\n")
    f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
    f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
    f.write("\n## Örnek Dağılımı\n\n")
    f.write(f"- İyileşen: {improved:,} (%{summary['pct_improved']:.2f})\n")
    f.write(f"- Gerileyen: {regressed:,} (%{summary['pct_regressed']:.2f})\n")
    f.write(f"- Değişmeyen: {unchanged:,} (%{summary['pct_unchanged']:.2f})\n")
    f.write("\n## Kategori Bazında TR%\n\n")
    f.write("| Kategori | TR% | N |\n")
    f.write("|----------|-----|---|\n")
    for cat, avg in cat_sorted:
        n_cat = len(cat_scores[cat])
        f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
print(f"  ✓ {report_path}")

# Per-sample CSV
csv_path = OUT / "tr_mmlu_per_sample.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
    writer.writeheader()
    writer.writerows(per_sample)
print(f"  ✓ {csv_path}")

print("\nTamamlandı.\n")
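Editor's note: a quick way to sanity-check the committed results against this script is to recompute the aggregates from the per-sample CSV. A minimal sketch, assuming only the column names that hf_benchmark.py writes above (orig_tr, enh_tr):

import csv

with open("results/tr_mmlu_per_sample.csv", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

# Per-sample averages, mirroring how the summary above is computed
orig_tr = sum(float(r["orig_tr"]) for r in rows) / len(rows)
enh_tr = sum(float(r["enh_tr"]) for r in rows) / len(rows)
print(f"orig TR% {orig_tr:.2f}  enh TR% {enh_tr:.2f}  delta {enh_tr - orig_tr:+.2f}")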
nedo_turkish_tokenizer/_compound.py
CHANGED
@@ -42,9 +42,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
         wa = morphology.analyze(word)
-        for sa in wa
-        morphemes = [m
-        roots = [m for m in morphemes if
+        for sa in wa:
+            morphemes = [str(m) for m in sa.get_morphemes()]
+            roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
             if len(roots) > 1:
                 return roots
     except Exception:  # noqa: BLE001
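Editor's note: a hedged illustration of the new root filter above. The morpheme strings are made up; the real format depends on zemberek-python.

morphemes = ["hanım:Noun", "eli:Noun", "A3sg:Agreement"]   # hypothetical example values
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
print(roots)  # ['hanım:Noun', 'eli:Noun'] -> more than one root, so the word is treated as a compound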
nedo_turkish_tokenizer/_context_aware.py
CHANGED
@@ -1,4 +1,7 @@
-"""Fix 12: Context-aware Zemberek disambiguation.
+"""Fix 12: Context-aware Zemberek disambiguation.
+
+Uses zemberek-python (pure Python) – no JVM required.
+"""
 
 from __future__ import annotations
 
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
         return tokens
 
     try:
-
-
-        after = _morphology.disambiguate(sentence, analysis)
-        best_list = after.best_analysis()
+        sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
+        best_list = sa_result.best_analysis()
 
         analyses: dict[str, dict] = {}
         for sa in best_list:
             try:
-                sf = sa.
+                sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
-                        "lemma": sa.item.lemma,
-                        "pos": sa.item.primary_pos.short_form,
-                        "morphemes": [m
+                        "lemma": str(sa.item.lemma),
+                        "pos": str(sa.item.primary_pos.short_form),
+                        "morphemes": [str(m) for m in sa.get_morphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue
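Editor's note: a small sketch of the lookup key introduced above — the disambiguated analyses are indexed by stem plus ending, lowercased. The stem/ending values are hypothetical.

stem, ending = "Ev", "den"          # hypothetical Zemberek stem/ending for the surface form "Evden"
sf = (str(stem) + str(ending)).lower().strip()
print(sf)  # 'evden' -> the key used for the `analyses` dict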
nedo_turkish_tokenizer/_preprocessor.py
CHANGED
@@ -67,13 +67,13 @@ def _is_turkish_base(word: str) -> bool:
     tdk = load_tdk_words()
     if tdk and wl in tdk:
         return True
-    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir
+    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
             wa = _morphology.analyze(wl)
-            for sa in wa
-            lemma = sa.item.lemma
+            for sa in wa:
+                lemma = str(sa.item.lemma)
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001
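Editor's note: the heuristic above accepts a word when any Zemberek lemma contains a Turkish-specific letter. A minimal sketch, assuming TR_CHARS is such a letter set (its real definition lives elsewhere in the module):

TR_CHARS = set("çğıöşüÇĞİÖŞÜ")   # assumption for this illustration only
lemma = "İstanbul"                # a lemma Zemberek could return for "istanbul"
print(any(c in TR_CHARS for c in lemma))  # True -> treated as a Turkish base word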
nedo_turkish_tokenizer/_root_validator.py
CHANGED
@@ -1,4 +1,7 @@
-"""Zemberek-based root validation and correction (Fix 4).
+"""Zemberek-based root validation and correction (Fix 4).
+
+Uses zemberek-python (pure Python) – no JVM or JPype required.
+"""
 
 from __future__ import annotations
 
@@ -6,20 +9,43 @@ ZEMBEREK_AVAILABLE = False
 _morphology = None
 
 
+def _apply_zemberek_patch() -> None:
+    """Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
+    import csv
+    import zemberek.morphology.lexicon.root_lexicon as rl
+
+    def fast_load_from_resources(resource_path: str):
+        items = list()
+        csv.field_size_limit(100000000)
+        with open(resource_path, 'r', encoding='utf-8') as f:
+            lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
+
+        # O(1) dictionary lookup instead of O(N) iteration per reference
+        lex_dict = {line[0]: line for line in lex}
+
+        for i, line in enumerate(lex):
+            item = rl.DictionaryReader.make_dict_item_from_line(line)
+            if line[7] != 'null':
+                reference_item_line = lex_dict.get(line[7])
+                if reference_item_line is not None:
+                    item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
+            items.append(item)
+        return rl.RootLexicon(items)
+
+    rl.DictionaryReader.load_from_resources = fast_load_from_resources
+
+
 def _init_zemberek() -> None:
     global ZEMBEREK_AVAILABLE, _morphology
 
     try:
         from zemberek import TurkishMorphology  # noqa: PLC0415
-
+
+        _apply_zemberek_patch()
         _morphology = TurkishMorphology.create_with_defaults()
         ZEMBEREK_AVAILABLE = True
 
     except ImportError:
-        print(
-            "[NedoTurkishTokenizer] zemberek-python not installed\n"
-            "    pip install zemberek-python"
-        )
+        print("[NedoTurkishTokenizer] zemberek-python not installed – pip install zemberek-python")
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 
@@ -37,12 +63,12 @@ def analyze_word(word: str) -> list[dict]:
         wa = _morphology.analyze(word)
         return [
             {
-                "lemma": sa.item.lemma,
-                "pos": sa.item.primary_pos.short_form,
-                "morphemes": [m
-                "surface": sa.
+                "lemma": str(sa.item.lemma),
+                "pos": str(sa.item.primary_pos.short_form),
+                "morphemes": [str(m) for m in sa.get_morphemes()],
+                "surface": str(sa.get_stem()) + str(sa.get_ending()),
             }
-            for sa in wa
+            for sa in wa
         ]
     except Exception:  # noqa: BLE001
         return []

@@ -73,7 +99,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
 def build_correction_map(
     original_words: list[str], base_tokenizer
 ) -> dict[str, str]:
-    """Build a {tokenizer_root
+    """Build a {tokenizer_root → zemberek_root} correction map."""
     correction_map: dict[str, str] = {}
 
     for word in original_words:

@@ -146,7 +172,7 @@ def validate_roots(
                 "token": leading + correct,
                 "_original_token": tok["token"],
                 "_root_corrected": True,
-                "_note": f"root corrected: '{surface}'
+                "_note": f"root corrected: '{surface}' → '{correct}'",
             }
 
         result.append(tok)

@@ -160,16 +186,15 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
         return [None] * len(words)
     try:
         sentence = " ".join(words)
-
-
-        best = after.best_analysis()
+        sa_result = _morphology.analyze_and_disambiguate(sentence)
+        best = sa_result.best_analysis()
         out = []
         for sa in best:
             try:
                 out.append({
-                    "lemma": sa.item.lemma,
-                    "pos": sa.item.primary_pos.short_form,
-                    "morphemes": [m
+                    "lemma": str(sa.item.lemma),
+                    "pos": str(sa.item.primary_pos.short_form),
+                    "morphemes": [str(m) for m in sa.get_morphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)
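Editor's note on why _apply_zemberek_patch helps: resolving each lexicon row's reference by scanning the whole list is quadratic overall, while a one-time dict index makes each lookup constant-time. A self-contained sketch on synthetic rows (not the real lexicon format):

import time

rows = [(f"entry{i}", f"entry{i - 1}" if i else "null") for i in range(5_000)]

t0 = time.perf_counter()
slow = [next((r for r in rows if r[0] == ref), None) if ref != "null" else None
        for _, ref in rows]                                   # scan the list per reference: O(N^2)
t1 = time.perf_counter()

index = {r[0]: r for r in rows}                               # build once: O(N)
fast = [index.get(ref) if ref != "null" else None for _, ref in rows]   # O(1) per lookup
t2 = time.perf_counter()

assert slow == fast
print(f"linear scan {t1 - t0:.3f}s  vs  dict index {t2 - t1:.4f}s")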
nedo_turkish_tokenizer/tokenizer.py
CHANGED
@@ -26,10 +26,12 @@ Output fields per token:
 from __future__ import annotations
 
 import os
+import re
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 
+
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE

@@ -97,6 +99,7 @@ class NedoTurkishTokenizer:
     """
 
     def __init__(self) -> None:
+
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE

@@ -234,6 +237,82 @@ class NedoTurkishTokenizer:
             json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
         )
 
+    # ── Morphological Lattice API ───────────────────────────────────────────
+
+    def get_morphological_lattice(self, word: str) -> list[dict]:
+        """Return all possible morphological analyses for *word* as a lattice.
+
+        Each entry in the returned list is a dict with:
+            ``root``         – the lemma / root form
+            ``suffixes``     – list of surface-form suffixes
+            ``pos``          – abbreviated POS tag (Noun, Verb, Adj, …)
+            ``lexical_form`` – full lexical representation from Zemberek
+
+        Returns an **empty list** when Zemberek cannot analyse the word
+        (unknown word) or when Zemberek is not available.
+        """
+        if _zemb_morphology is None:
+            return []
+
+        try:
+            word_analysis = _zemb_morphology.analyze(word)
+
+            lattice: list[dict] = []
+            for sa in word_analysis:
+                try:
+                    root = str(sa.item.lemma)
+                    pos = str(sa.item.primary_pos.short_form)
+                    lexical_form = str(sa.format_string())
+
+                    # Build suffix list from morpheme chain (skip the root morpheme)
+                    morphemes = list(sa.get_morphemes())
+                    suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
+
+                    lattice.append({
+                        "root": root,
+                        "suffixes": suffixes,
+                        "pos": pos,
+                        "lexical_form": lexical_form,
+                    })
+                except Exception:  # noqa: BLE001
+                    continue
+
+            return lattice
+
+        except Exception:  # noqa: BLE001
+            return []
+
+    def tokenize_lattice(self, text: str) -> dict:
+        """Tokenize *text* and return a morphological lattice for every word.
+
+        Returns a dict with:
+            ``input``  – the original text
+            ``words``  – list of per-word dicts, each containing
+                         ``word`` (str) and ``lattice`` (list of analyses)
+
+        Example::
+
+            tok = NedoTurkishTokenizer()
+            data = tok.tokenize_lattice("Evin güzel gelir")
+            for w in data["words"]:
+                print(w["word"], "→", len(w["lattice"]), "analysis(es)")
+        """
+        # Split text on whitespace, respecting punctuation
+        words = re.findall(r"\S+", text)
+
+        result_words: list[dict] = []
+        for w in words:
+            lattice = self.get_morphological_lattice(w)
+            result_words.append({
+                "word": w,
+                "lattice": lattice,
+            })
+
+        return {
+            "input": text,
+            "words": result_words,
+        }
+
     # ── Utility ──────────────────────────────────────────────────────────────
 
     def stats(self, tokens: list[dict]) -> dict:
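Editor's note: a hedged sketch of consuming the new lattice API. The two analyses shown for "gelir" are illustrative placeholders (real output depends on Zemberek's lexicon), and pick_simplest is just one possible downstream heuristic, not part of the commit.

hypothetical_lattice = [
    {"root": "gelir", "suffixes": [], "pos": "Noun", "lexical_form": "[gelir:Noun]"},
    {"root": "gel", "suffixes": ["ir"], "pos": "Verb", "lexical_form": "[gel:Verb]+Aor"},
]

def pick_simplest(lattice: list[dict]) -> dict | None:
    """Prefer the analysis with the fewest suffixes."""
    return min(lattice, key=lambda a: len(a["suffixes"]), default=None)

print(pick_simplest(hypothetical_lattice))  # -> the zero-suffix Noun reading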
paper_baseline_check.py
ADDED
@@ -0,0 +1,103 @@

"""
paper_baseline_check.py
-----------------------
Why does the paper's 90.29% baseline show up as 75.57% for us?
Demonstrates the difference between the two methodologies concretely.

Usage:
    cd NedoTurkishTokenizer/
    python paper_baseline_check.py
"""

import os
from huggingface_hub import login
from datasets import load_dataset
from turkish_tokenizer import TurkishTokenizer

HF_TOKEN = os.environ.get("HF_TOKEN", "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")
login(token=HF_TOKEN, add_to_git_credential=False)

base = TurkishTokenizer()

print("TR-MMLU yükleniyor...")
ds = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
rows = list(ds)
print(f"{len(rows)} örnek\n")

def orig_tr_pct(tokens):
    total = len(tokens)
    if total == 0: return 0.0
    tr = sum(1 for t in tokens if t["type"] in ("ROOT", "SUFFIX"))
    return tr / total * 100

# ── Methodology A: question only (what the paper did) ─────────────────────────
a_sum = 0.0
for row in rows:
    text = str(row.get("soru") or row.get("question") or "")
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    a_sum += orig_tr_pct(toks)
a_avg = a_sum / len(rows)

# ── Methodology B: question + choices (what we did) ───────────────────────────
b_sum = 0.0
for row in rows:
    parts = []
    for f in ["soru", "question"]:
        if row.get(f):
            parts.append(str(row[f]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    text = " ".join(parts)
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    b_sum += orig_tr_pct(toks)
b_avg = b_sum / len(rows)

# ── Methodology C: question + choices + explanation (full) ────────────────────
c_sum = 0.0
for row in rows:
    parts = []
    for f in ["soru", "question"]:
        if row.get(f):
            parts.append(str(row[f]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    if row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    text = " ".join(parts)
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    c_sum += orig_tr_pct(toks)
c_avg = c_sum / len(rows)

# ── Content analysis of the answer choices ─────────────────────────────────────
# Look at the choices of the first 20 questions
print("İlk 20 sorunun şık örnekleri:")
for row in rows[:20]:
    choices = row.get("secenekler") or []
    if isinstance(choices, list) and choices:
        sample = " | ".join(str(c)[:20] for c in choices[:4])
        print(f"  {sample}")

print(f"""
{'='*60}
METODOLOJİ KARŞILAŞTIRMASI (TürkishTokenizer baseline)
{'='*60}

A) Sadece soru alanı (paper'ın yöntemi): {a_avg:.2f}%
B) Soru + şıklar (kısmi): {b_avg:.2f}%
C) Soru + şıklar + açıkl. (bizim yöntemimiz): {c_avg:.2f}%

Paper değeri: 90.29%
Bizim A değeri: {a_avg:.2f}% → paper ile fark: {a_avg-90.29:+.2f}
Bizim C değeri: {c_avg:.2f}% → biz bunu kullanıyoruz

Sonuç: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} puan fark
Bu fark şıklardaki kısaltmalar ve yabancı terimlerden kaynaklanıyor.
{'='*60}
""")
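Editor's note: the three methodology loops above differ only in which fields they concatenate. A possible refactor sketch (reusing rows, base and orig_tr_pct from the script above; not part of the commit):

def avg_tr_pct(rows, include_choices=False, include_explanation=False):
    total = 0.0
    for row in rows:
        # First available question field, then optional choices and explanation
        parts = [str(row[f]) for f in ("soru", "question") if row.get(f)][:1]
        if include_choices and isinstance(row.get("secenekler"), list):
            parts.extend(str(c) for c in row["secenekler"])
        if include_explanation and row.get("aciklama"):
            parts.append(str(row["aciklama"]))
        text = " ".join(parts)
        if text.strip():
            total += orig_tr_pct(base.tokenize_text(text))
    return total / len(rows)

a_avg = avg_tr_pct(rows)                                                   # question only
b_avg = avg_tr_pct(rows, include_choices=True)                             # + choices
c_avg = avg_tr_pct(rows, include_choices=True, include_explanation=True)   # + explanation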
results/tr_mmlu_per_sample.csv
ADDED
The diff for this file is too large to render.
See raw diff
results/tr_mmlu_report.md
ADDED
@@ -0,0 +1,85 @@

# TR-MMLU Benchmark – NedoTurkishTokenizer

**N = 40,966 örnek**

## Ana Metrikler

| Metrik | Orijinal | Enhanced | Δ |
|--------|----------|----------|---|
| TR% | 75.57% | 92.64% | +17.07% |
| Pure% | 60.50% | 68.30% | +7.80% |
| Avg token/örnek | 84.75 | 75.38 | -9.37 |
| Avg BPE/örnek | 21.95 | 5.66 | -16.29 |

## Paper ile Karşılaştırma

| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |
|--------|-----------------|-----------------|---------------------|
| TR% | 90.29% | 75.57% | **92.64%** |
| Pure% | 85.8% | 60.50% | **68.30%** |

## Örnek Dağılımı

- İyileşen: 40,964 (%100.00)
- Gerileyen: 0 (%0.00)
- Değişmeyen: 2 (%0.00)

## Kategori Bazında TR%

| Kategori | TR% | N |
|----------|-----|---|
| TUS | 80.25% | 1510 |
| Turizm ve Otel İşletmeciliği | 82.30% | 185 |
| Futbol | 89.63% | 283 |
| DHBT | 90.72% | 336 |
| Genel Kültür | 91.16% | 84 |
| Tarım | 91.41% | 586 |
| Siyer | 91.45% | 1067 |
| Laborant ve Veteriner Sağlık | 91.91% | 1485 |
| YGS Denemeleri | 91.92% | 4154 |
| Kim 500 Milyar İster | 92.06% | 12968 |
| Tarih | 93.11% | 571 |
| Parakende Satış ve Mağaza Yöneticiliği | 93.38% | 255 |
| Dini Bilgiler | 93.49% | 122 |
| Kültürel Miras ve Turizm | 93.76% | 859 |
| Uluslar Arası İlişkiler | 93.84% | 117 |
| Radyo ve Televizyon Programcılığı | 93.88% | 531 |
| Okul Öncesi Öğretmenliği | 93.92% | 9 |
| İktisat | 94.01% | 346 |
| Yaşlı Bakımı | 94.29% | 214 |
| Özel Koruma ve Güvenlik | 94.32% | 759 |
| Tıbbi Dökümantasyon ve Sekreterlik | 94.33% | 63 |
| Türk Dili ve Edebiyatı | 94.33% | 340 |
| KPSS Denemeleri | 94.33% | 2674 |
| Yönetim Bİlişim Sistemleri | 94.40% | 294 |
| Felsefe | 94.43% | 502 |
| Ehliyet | 94.44% | 271 |
| Sosyal Hizmet | 94.56% | 64 |
| Çocuk Gelişimi | 94.58% | 2597 |
| KPSS | 94.60% | 299 |
| Sosyal Hizmetler | 94.61% | 935 |
| Sağlık Yönetimi | 94.63% | 313 |
| Halkla İlişkiler ve Reklamcılık | 94.70% | 158 |
| İnsan Kaynakları Yönetimi | 94.92% | 237 |
| Lojistik | 94.99% | 474 |
| Menkul Kıymetler ve Sermaye Piyasası | 95.04% | 435 |
| Konaklama İşletmeciliği | 95.04% | 13 |
| Medya ve İletişim | 95.05% | 898 |
| Sosyoloji | 95.06% | 393 |
| İşletme Yönetimi | 95.20% | 501 |
| Sağlık Kurumları İşletmeciliği | 95.26% | 575 |
| Turizm ve Seyehat Hizmetleri | 95.30% | 70 |
| Dış Ticaret | 95.36% | 1173 |
| Muhasebe ve Vergi Uygulamaları | 95.49% | 474 |
| Spor Yönetimi | 95.49% | 80 |
| Marka İletişimi | 95.62% | 180 |
| Yerel Yönetimler | 95.67% | 196 |
| Acil Durum ve Afet Yönetimi | 95.67% | 33 |
| Maliye | 95.68% | 13 |
| AUZEF | 95.82% | 47 |
| Kamu Yönetimi | 95.92% | 54 |
| Uluslararası Ticaret ve Lojistik Yönetimi | 95.95% | 65 |
| Havacılık Yönetimi | 96.06% | 15 |
| İşletme | 96.09% | 24 |
| Çalışma Ekonomisi ve Endüstri İlişkileri | 96.42% | 62 |
| Sosyal Bilimler | 96.56% | 3 |
results/tr_mmlu_summary.json
ADDED
@@ -0,0 +1,16 @@

{
  "n_samples": 40966,
  "orig_tr_pct": 75.5727,
  "orig_pure_pct": 60.499,
  "enh_tr_pct": 92.6413,
  "enh_pure_pct": 68.3004,
  "delta_tr_pct": 17.0686,
  "delta_pure_pct": 7.8014,
  "orig_avg_tokens": 84.75,
  "enh_avg_tokens": 75.38,
  "orig_avg_bpe": 21.95,
  "enh_avg_bpe": 5.66,
  "pct_improved": 100.0,
  "pct_regressed": 0.0,
  "pct_unchanged": 0.0
}
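Editor's note: a quick internal-consistency check on the summary above — the delta fields should equal enhanced minus original, which they do (92.6413 - 75.5727 = 17.0686 and 68.3004 - 60.499 = 7.8014).

import json

with open("results/tr_mmlu_summary.json", encoding="utf-8") as f:
    s = json.load(f)

assert round(s["enh_tr_pct"] - s["orig_tr_pct"], 4) == s["delta_tr_pct"]
assert round(s["enh_pure_pct"] - s["orig_pure_pct"], 4) == s["delta_pure_pct"]
print("summary deltas are internally consistent")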
test_lattice.py
ADDED
@@ -0,0 +1,72 @@

#!/usr/bin/env python3
"""
Test / demo script for the Morphological Lattice API.

Shows how ambiguous Turkish words like "evin" and "gelir" produce
multiple alternative analyses in the lattice.
"""

import json
from nedo_turkish_tokenizer import NedoTurkishTokenizer


def section(title: str) -> None:
    print(f"\n{'─' * 60}")
    print(f"  {title}")
    print(f"{'─' * 60}")


def main() -> None:
    tok = NedoTurkishTokenizer()

    # ── 1. Single-word lattice test ───────────────────────────────────────
    section("1) get_morphological_lattice – tek kelime örnekleri")

    test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
    for word in test_words:
        lattice = tok.get_morphological_lattice(word)
        print(f"\n▸ \"{word}\" → {len(lattice)} analiz:")
        for i, entry in enumerate(lattice):
            print(f"   [{i}] root={entry['root']:<12} "
                  f"pos={entry['pos']:<6} "
                  f"suffixes={entry['suffixes']}")
            print(f"       lexical_form = {entry['lexical_form']}")

    # ── 2. Unknown word (empty lattice) ───────────────────────────────────
    section("2) Bilinmeyen / yabancı kelime – boş lattice")

    unknown_words = ["xyzfoo", "meeting", "blockchain"]
    for word in unknown_words:
        lattice = tok.get_morphological_lattice(word)
        print(f"   \"{word}\" → lattice boş mu? {len(lattice) == 0} (len={len(lattice)})")

    # ── 3. tokenize_lattice – sentence-level test ─────────────────────────
    section("3) tokenize_lattice – cümle testi")

    sentences = [
        "Evin güzel gelir",
        "Çocuk okula koşar adım gitti",
        "Yüz yıllık çınar",
    ]

    for sent in sentences:
        print(f"\n▸ Input: \"{sent}\"")
        result = tok.tokenize_lattice(sent)
        for winfo in result["words"]:
            n = len(winfo["lattice"])
            print(f"   {winfo['word']:<16} → {n} analiz(ler)")
            for entry in winfo["lattice"]:
                print(f"      root={entry['root']:<12} pos={entry['pos']:<6} "
                      f"suffixes={entry['suffixes']}")

    # ── 4. JSON output format ─────────────────────────────────────────────
    section("4) tokenize_lattice JSON çıktı")

    data = tok.tokenize_lattice("evin gelir")
    print(json.dumps(data, ensure_ascii=False, indent=2))

    print("\n✓ Tüm testler başarıyla tamamlandı.")


if __name__ == "__main__":
    main()