nmstech committed on
Commit
15d9001
·
2 Parent(s): 5a6f887b209242

merge github main before publish

hf_benchmark.py ADDED
@@ -0,0 +1,324 @@
+ """
+ hf_benchmark.py
+ ---------------
+ Benchmarks NedoTurkishTokenizer on TR-MMLU.
+ Pulls both the tokenizer and the dataset from HuggingFace.
+
+ Usage:
+     cd NedoTurkishTokenizer/
+     pip install huggingface_hub datasets
+     python hf_benchmark.py
+
+     # Only 2000 samples (quick test):
+     python hf_benchmark.py --samples 2000
+
+     # A specific category:
+     python hf_benchmark.py --category TUS
+ """
+
+ import argparse
+ import json
+ import os
+ import sys
+ import csv
+ from pathlib import Path
+
+ # ── HuggingFace token ─────────────────────────────────────────────────────────
+ HF_TOKEN = os.environ.get("HF_TOKEN", "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")
+
+ # ── Arguments ─────────────────────────────────────────────────────────────────
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--samples", default="all", help="How many samples (all or a number)")
+ parser.add_argument("--category", default=None, help="Category filter (e.g. TUS)")
+ parser.add_argument("--out", default=".", help="Output directory")
+ args = parser.parse_args()
+
+ OUT = Path(args.out)
+ OUT.mkdir(parents=True, exist_ok=True)
+
+ # ── HuggingFace login ─────────────────────────────────────────────────────────
+ from huggingface_hub import login
+ login(token=HF_TOKEN, add_to_git_credential=False)
+ print("HuggingFace login ✓")
+
+ # ── Load the tokenizer ────────────────────────────────────────────────────────
+ print("Loading NedoTurkishTokenizer...")
+ sys.path.insert(0, str(Path(__file__).parent))
+ from nedo_turkish_tokenizer import NedoTurkishTokenizer
+ tok = NedoTurkishTokenizer()
+ print(f"  Zemberek: {'✓' if tok.zemberek_available else '✗ (disabled)'}")
+
+ # ── Load the dataset ──────────────────────────────────────────────────────────
+ print("Loading TR-MMLU dataset...")
+ from datasets import load_dataset
+ ds = load_dataset(
+     "alibayram/turkish_mmlu",
+     split="test",
+     token=HF_TOKEN,
+ )
+ print(f"  {len(ds)} samples")
+
+ # ── Category filter ───────────────────────────────────────────────────────────
+ samples = list(ds)
+ if args.category:
+     samples = [r for r in samples
+                if str(r.get("bolum", "")).strip() == args.category]
+     print(f"  Category '{args.category}': {len(samples)} samples")
+
+ if args.samples != "all":
+     n = int(args.samples)
+     samples = samples[:n]
+     print(f"  Limited to: {len(samples)} samples")
+
+ # ── Join the text fields ──────────────────────────────────────────────────────
+ def get_text(row: dict) -> str:
+     parts = []
+     for field in ["soru", "question"]:
+         if row.get(field):
+             parts.append(str(row[field]))
+             break
+     choices = row.get("secenekler") or []
+     if isinstance(choices, list):
+         parts.extend(str(c) for c in choices)
+     for lbl in ["A", "B", "C", "D"]:
+         if row.get(lbl):
+             parts.append(str(row[lbl]))
+     if row.get("aciklama"):
+         parts.append(str(row["aciklama"]))
+     return " ".join(parts)
+
+ # ── Token statistics ──────────────────────────────────────────────────────────
+ def token_stats(tokens: list[dict]) -> dict:
+     """Use NedoTurkishTokenizer's token_type field."""
+     total = len(tokens)
+     if total == 0:
+         return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0, "orig_tr_pct": 0.0}
+
+     # Enhanced stats
+     roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
+     suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
+     foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
+     punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
+     bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
+     special = sum(1 for t in tokens
+                   if t["token_type"] in ("NUM", "DATE", "UNIT", "URL",
+                                          "MENTION", "HASHTAG", "EMOJI", "ACRONYM"))
+     tr = roots + suffixes + foreign + punct + special
+     pure = sum(1 for t in tokens
+                if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
+                and not t["token"].strip().startswith("<"))
+     return {
+         "total": total,
+         "roots": roots,
+         "suffixes": suffixes,
+         "foreign": foreign,
+         "bpe": bpe,
+         "punct": punct,
+         "special": special,
+         "tr_pct": round(tr / total * 100, 4),
+         "pure_pct": round(pure / total * 100, 4),
+     }
+
+ def orig_stats(tokens: list[dict]) -> dict:
+     """Statistics for the original tokenizer (for comparison)."""
+     total = len(tokens)
+     if total == 0:
+         return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0}
+     roots = sum(1 for t in tokens if t.get("type") == "ROOT")
+     suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
+     bpe = sum(1 for t in tokens if t.get("type") == "BPE")
+     tr = roots + suffixes
+     pure = sum(1 for t in tokens
+                if t.get("type") in ("ROOT", "SUFFIX")
+                and not t.get("token", "").strip().startswith("<"))
+     return {
+         "total": total,
+         "bpe": bpe,
+         "tr_pct": round(tr / total * 100, 4),
+         "pure_pct": round(pure / total * 100, 4),
+     }
+
+ # ── Main benchmark loop ───────────────────────────────────────────────────────
+ print(f"\nStarting benchmark: {len(samples)} samples...")
+
+ per_sample = []
+ orig_tr_sum = 0.0
+ enh_tr_sum = 0.0
+ orig_pur_sum = 0.0
+ enh_pur_sum = 0.0
+ orig_tok_sum = 0
+ enh_tok_sum = 0
+ orig_bpe_sum = 0
+ enh_bpe_sum = 0
+ improved = 0
+ regressed = 0
+ unchanged = 0
+
+ REPORT_EVERY = 500
+
+ for idx, row in enumerate(samples):
+     text = get_text(row)
+     if not text.strip():
+         continue
+
+     # Original tokenizer
+     orig_toks = tok._base.tokenize_text(text)
+     os_ = orig_stats(orig_toks)
+
+     # NedoTurkishTokenizer
+     enh_toks = tok.tokenize(text)
+     es_ = token_stats(enh_toks)
+
+     d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)
+
+     per_sample.append({
+         "idx": idx,
+         "bolum": str(row.get("bolum", "")),
+         "orig_tr": os_["tr_pct"],
+         "enh_tr": es_["tr_pct"],
+         "d_tr": d_tr,
+         "orig_pure": os_["pure_pct"],
+         "enh_pure": es_["pure_pct"],
+         "orig_tok": os_["total"],
+         "enh_tok": es_["total"],
+         "orig_bpe": os_["bpe"],
+         "enh_bpe": es_["bpe"],
+     })
+
+     orig_tr_sum += os_["tr_pct"]
+     enh_tr_sum += es_["tr_pct"]
+     orig_pur_sum += os_["pure_pct"]
+     enh_pur_sum += es_["pure_pct"]
+     orig_tok_sum += os_["total"]
+     enh_tok_sum += es_["total"]
+     orig_bpe_sum += os_["bpe"]
+     enh_bpe_sum += es_["bpe"]
+
+     if d_tr > 0: improved += 1
+     elif d_tr < 0: regressed += 1
+     else: unchanged += 1
+
+     if (idx + 1) % REPORT_EVERY == 0:
+         n = idx + 1
+         print(f"  [{n:>6}/{len(samples)}] "
+               f"TR%: {enh_tr_sum/n:.2f}% "
+               f"BPE/sample: {enh_bpe_sum/n:.2f} "
+               f"Regressed: {regressed}")
+
+ n = len(per_sample)
+ if n == 0:
+     print("No samples were processed!")
+     sys.exit(1)
+
+ # ── Summary ───────────────────────────────────────────────────────────────────
+ summary = {
+     "n_samples": n,
+     "orig_tr_pct": round(orig_tr_sum / n, 4),
+     "orig_pure_pct": round(orig_pur_sum / n, 4),
+     "enh_tr_pct": round(enh_tr_sum / n, 4),
+     "enh_pure_pct": round(enh_pur_sum / n, 4),
+     "delta_tr_pct": round((enh_tr_sum - orig_tr_sum) / n, 4),
+     "delta_pure_pct": round((enh_pur_sum - orig_pur_sum) / n, 4),
+     "orig_avg_tokens": round(orig_tok_sum / n, 2),
+     "enh_avg_tokens": round(enh_tok_sum / n, 2),
+     "orig_avg_bpe": round(orig_bpe_sum / n, 2),
+     "enh_avg_bpe": round(enh_bpe_sum / n, 2),
+     "pct_improved": round(improved / n * 100, 2),
+     "pct_regressed": round(regressed / n * 100, 2),
+     "pct_unchanged": round(unchanged / n * 100, 2),
+ }
+
+ # ── Per-category breakdown ────────────────────────────────────────────────────
+ from collections import defaultdict
+ cat_scores = defaultdict(list)
+ for row in per_sample:
+     cat_scores[row["bolum"]].append(row["enh_tr"])
+
+ cat_summary = {
+     cat: round(sum(v)/len(v), 2)
+     for cat, v in cat_scores.items()
+     if len(v) >= 3
+ }
+ cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])
+
+ # ── Print the report ──────────────────────────────────────────────────────────
+ SEP = "═" * 65
+ print(f"\n{SEP}")
+ print("  NedoTurkishTokenizer — TR-MMLU Benchmark Results")
+ print(SEP)
+ print(f"  N = {n:,} samples\n")
+ print(f"  {'Metric':30s} {'Original':>10} {'Enhanced':>10} {'Δ':>8}")
+ print("  " + "─"*55)
+ for label, orig, enh, delta in [
+     ("TR%", summary["orig_tr_pct"], summary["enh_tr_pct"], summary["delta_tr_pct"]),
+     ("Pure%", summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
+     ("Avg tokens/sample", summary["orig_avg_tokens"], summary["enh_avg_tokens"],
+      round(summary["enh_avg_tokens"] - summary["orig_avg_tokens"], 2)),
+     ("Avg BPE/sample", summary["orig_avg_bpe"], summary["enh_avg_bpe"],
+      round(summary["enh_avg_bpe"] - summary["orig_avg_bpe"], 2)),
+ ]:
+     print(f"  {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")
+
+ print(f"\n  Improved : {improved:>6,} ({summary['pct_improved']:.2f}%)")
+ print(f"  Regressed: {regressed:>6,} ({summary['pct_regressed']:.2f}%)")
+ print(f"  Unchanged: {unchanged:>6,} ({summary['pct_unchanged']:.2f}%)")
+
+ print("\n  Lowest TR% categories:")
+ for cat, avg in cat_sorted[:10]:
+     n_cat = len(cat_scores[cat])
+     print(f"    {cat:<35} {avg:>6.2f}%  (n={n_cat})")
+
+ print("\n  Highest TR% categories:")
+ for cat, avg in cat_sorted[-8:]:
+     n_cat = len(cat_scores[cat])
+     print(f"    {cat:<35} {avg:>6.2f}%  (n={n_cat})")
+
+ print(SEP)
+
+ # ── Write the output files ────────────────────────────────────────────────────
+ # Summary JSON
+ summary_path = OUT / "tr_mmlu_summary.json"
+ with open(summary_path, "w", encoding="utf-8") as f:
+     json.dump(summary, f, ensure_ascii=False, indent=2)
+ print(f"\n  ✓ {summary_path}")
+
+ # Report Markdown
+ paper_tr = 90.29
+ paper_pur = 85.80
+ report_path = OUT / "tr_mmlu_report.md"
+ with open(report_path, "w", encoding="utf-8") as f:
+     f.write("# TR-MMLU Benchmark — NedoTurkishTokenizer\n\n")
+     f.write(f"**N = {n:,} samples**\n\n")
+     f.write("## Main Metrics\n\n")
+     f.write("| Metric | Original | Enhanced | Δ |\n")
+     f.write("|--------|----------|----------|---|\n")
+     f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
+     f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
+     f.write(f"| Avg tokens/sample | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
+     f.write(f"| Avg BPE/sample | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
+     f.write("\n## Comparison with the Paper\n\n")
+     f.write("| Metric | Paper (original) | Ours (original) | NedoTurkishTokenizer |\n")
+     f.write("|--------|-----------------|-----------------|---------------------|\n")
+     f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
+     f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
+     f.write("\n## Sample Distribution\n\n")
+     f.write(f"- Improved: {improved:,} ({summary['pct_improved']:.2f}%)\n")
+     f.write(f"- Regressed: {regressed:,} ({summary['pct_regressed']:.2f}%)\n")
+     f.write(f"- Unchanged: {unchanged:,} ({summary['pct_unchanged']:.2f}%)\n")
+     f.write("\n## TR% by Category\n\n")
+     f.write("| Category | TR% | N |\n")
+     f.write("|----------|-----|---|\n")
+     for cat, avg in cat_sorted:
+         n_cat = len(cat_scores[cat])
+         f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
+ print(f"  ✓ {report_path}")
+
+ # Per-sample CSV
+ csv_path = OUT / "tr_mmlu_per_sample.csv"
+ with open(csv_path, "w", newline="", encoding="utf-8") as f:
+     writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
+     writer.writeheader()
+     writer.writerows(per_sample)
+ print(f"  ✓ {csv_path}")
+
+ print("\nDone.\n")
nedo_turkish_tokenizer/_compound.py CHANGED
@@ -42,9 +42,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
  def _decompose_zemberek(word: str, morphology) -> list[str] | None:
      try:
          wa = morphology.analyze(word)
-         for sa in wa.analysis_results:
-             morphemes = [m.id_ for m in sa.get_morphemes()]
-             roots = [m for m in morphemes if m in ("Noun", "Verb", "Adj")]
+         for sa in wa:
+             morphemes = [str(m) for m in sa.get_morphemes()]
+             roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
              if len(roots) > 1:
                  return roots
      except Exception:  # noqa: BLE001
nedo_turkish_tokenizer/_context_aware.py CHANGED
@@ -1,4 +1,7 @@
- """Fix 12: Context-aware Zemberek disambiguation."""
+ """Fix 12: Context-aware Zemberek disambiguation.
+
+ Uses zemberek-python (pure Python) — no JVM required.
+ """
 
  from __future__ import annotations
 
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
          return tokens
 
      try:
-         sentence = original_text.strip()
-         analysis = _morphology.analyze_sentence(sentence)
-         after = _morphology.disambiguate(sentence, analysis)
-         best_list = after.best_analysis()
+         sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
+         best_list = sa_result.best_analysis()
 
          analyses: dict[str, dict] = {}
          for sa in best_list:
              try:
-                 sf = sa.surface_form().lower().strip()
+                 sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
                  if sf not in analyses:
                      analyses[sf] = {
-                         "lemma": sa.item.lemma,
-                         "pos": sa.item.primary_pos.short_form,
-                         "morphemes": [m.id_ for m in sa.get_morphemes()],
+                         "lemma": str(sa.item.lemma),
+                         "pos": str(sa.item.primary_pos.short_form),
+                         "morphemes": [str(m) for m in sa.get_morphemes()],
                      }
              except Exception:  # noqa: BLE001
                  continue
nedo_turkish_tokenizer/_preprocessor.py CHANGED
@@ -67,13 +67,13 @@ def _is_turkish_base(word: str) -> bool:
      tdk = load_tdk_words()
      if tdk and wl in tdk:
          return True
-     # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir...)
+     # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
      try:
          from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
          if ZEMBEREK_AVAILABLE and _morphology:
              wa = _morphology.analyze(wl)
-             for sa in wa.analysis_results:
-                 lemma = sa.item.lemma
+             for sa in wa:
+                 lemma = str(sa.item.lemma)
                  if any(c in TR_CHARS for c in lemma):
                      return True
      except Exception:  # noqa: BLE001
nedo_turkish_tokenizer/_root_validator.py CHANGED
@@ -1,4 +1,7 @@
- """Zemberek-based root validation and correction (Fix 4)."""
+ """Zemberek-based root validation and correction (Fix 4).
+
+ Uses zemberek-python (pure Python) — no JVM or JPype required.
+ """
 
  from __future__ import annotations
 
@@ -6,20 +9,43 @@ ZEMBEREK_AVAILABLE = False
  _morphology = None
 
 
+ def _apply_zemberek_patch() -> None:
+     """Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
+     import csv
+     import zemberek.morphology.lexicon.root_lexicon as rl
+
+     def fast_load_from_resources(resource_path: str):
+         items = list()
+         csv.field_size_limit(100000000)
+         with open(resource_path, 'r', encoding='utf-8') as f:
+             lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
+
+         # O(1) dictionary lookup instead of O(N) iteration per reference
+         lex_dict = {line[0]: line for line in lex}
+
+         for i, line in enumerate(lex):
+             item = rl.DictionaryReader.make_dict_item_from_line(line)
+             if line[7] != 'null':
+                 reference_item_line = lex_dict.get(line[7])
+                 if reference_item_line is not None:
+                     item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
+             items.append(item)
+         return rl.RootLexicon(items)
+
+     rl.DictionaryReader.load_from_resources = fast_load_from_resources
+
  def _init_zemberek() -> None:
      global ZEMBEREK_AVAILABLE, _morphology
 
      try:
          from zemberek import TurkishMorphology  # noqa: PLC0415
 
+         _apply_zemberek_patch()
          _morphology = TurkishMorphology.create_with_defaults()
          ZEMBEREK_AVAILABLE = True
 
      except ImportError:
-         print(
-             "[NedoTurkishTokenizer] zemberek-python not installed\n"
-             "    pip install zemberek-python"
-         )
+         print("[NedoTurkishTokenizer] zemberek-python not installed → pip install zemberek-python")
      except Exception as exc:  # noqa: BLE001
          print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 
@@ -37,12 +63,12 @@ def analyze_word(word: str) -> list[dict]:
          wa = _morphology.analyze(word)
          return [
              {
-                 "lemma": sa.item.lemma,
-                 "pos": sa.item.primary_pos.short_form,
-                 "morphemes": [m.id_ for m in sa.get_morphemes()],
-                 "surface": sa.surface_form(),
+                 "lemma": str(sa.item.lemma),
+                 "pos": str(sa.item.primary_pos.short_form),
+                 "morphemes": [str(m) for m in sa.get_morphemes()],
+                 "surface": str(sa.get_stem()) + str(sa.get_ending()),
              }
-             for sa in wa.analysis_results
+             for sa in wa
          ]
      except Exception:  # noqa: BLE001
          return []
@@ -73,7 +99,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
  def build_correction_map(
      original_words: list[str], base_tokenizer
  ) -> dict[str, str]:
-     """Build a {tokenizer_root -> zemberek_root} correction map."""
+     """Build a {tokenizer_root → zemberek_root} correction map."""
      correction_map: dict[str, str] = {}
 
      for word in original_words:
@@ -146,7 +172,7 @@ def validate_roots(
                  "token": leading + correct,
                  "_original_token": tok["token"],
                  "_root_corrected": True,
-                 "_note": f"root corrected: '{surface}' -> '{correct}'",
+                 "_note": f"root corrected: '{surface}' → '{correct}'",
              }
 
          result.append(tok)
@@ -160,16 +186,15 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
          return [None] * len(words)
      try:
          sentence = " ".join(words)
-         analysis = _morphology.analyze_sentence(sentence)
-         after = _morphology.disambiguate(sentence, analysis)
-         best = after.best_analysis()
+         sa_result = _morphology.analyze_and_disambiguate(sentence)
+         best = sa_result.best_analysis()
          out = []
          for sa in best:
              try:
                  out.append({
-                     "lemma": sa.item.lemma,
-                     "pos": sa.item.primary_pos.short_form,
-                     "morphemes": [m.id_ for m in sa.get_morphemes()],
+                     "lemma": str(sa.item.lemma),
+                     "pos": str(sa.item.primary_pos.short_form),
+                     "morphemes": [str(m) for m in sa.get_morphemes()],
                  })
              except Exception:  # noqa: BLE001
                  out.append(None)
nedo_turkish_tokenizer/tokenizer.py CHANGED
@@ -26,10 +26,12 @@ Output fields per token:
  from __future__ import annotations
 
  import os
+ import re
  import multiprocessing
  from concurrent.futures import ProcessPoolExecutor, as_completed
  from pathlib import Path
 
+
  from ._preprocessor import preprocess, postprocess
  from ._suffix_expander import reclassify_bpe_suffixes
  from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
@@ -97,6 +99,7 @@ class NedoTurkishTokenizer:
      """
 
      def __init__(self) -> None:
+
          from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
          self._base = TurkishTokenizer()
          self.zemberek_available = ZEMBEREK_AVAILABLE
@@ -234,6 +237,82 @@ class NedoTurkishTokenizer:
              json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
          )
 
+     # ── Morphological Lattice API ─────────────────────────────────────────────
+
+     def get_morphological_lattice(self, word: str) -> list[dict]:
+         """Return all possible morphological analyses for *word* as a lattice.
+
+         Each entry in the returned list is a dict with:
+         ``root``         – the lemma / root form
+         ``suffixes``     – list of surface-form suffixes
+         ``pos``          – abbreviated POS tag (Noun, Verb, Adj, …)
+         ``lexical_form`` – full lexical representation from Zemberek
+
+         Returns an **empty list** when Zemberek cannot analyse the word
+         (unknown word) or when Zemberek is not available.
+         """
+         if _zemb_morphology is None:
+             return []
+
+         try:
+             word_analysis = _zemb_morphology.analyze(word)
+
+             lattice: list[dict] = []
+             for sa in word_analysis:
+                 try:
+                     root = str(sa.item.lemma)
+                     pos = str(sa.item.primary_pos.short_form)
+                     lexical_form = str(sa.format_string())
+
+                     # Build suffix list from morpheme chain (skip the root morpheme)
+                     morphemes = list(sa.get_morphemes())
+                     suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
+
+                     lattice.append({
+                         "root": root,
+                         "suffixes": suffixes,
+                         "pos": pos,
+                         "lexical_form": lexical_form,
+                     })
+                 except Exception:  # noqa: BLE001
+                     continue
+
+             return lattice
+
+         except Exception:  # noqa: BLE001
+             return []
+
+     def tokenize_lattice(self, text: str) -> dict:
+         """Tokenize *text* and return a morphological lattice for every word.
+
+         Returns a dict with:
+         ``input`` – the original text
+         ``words`` – list of per-word dicts, each containing
+                     ``word`` (str) and ``lattice`` (list of analyses)
+
+         Example::
+
+             tok = NedoTurkishTokenizer()
+             data = tok.tokenize_lattice("Evin güzel gelir")
+             for w in data["words"]:
+                 print(w["word"], "→", len(w["lattice"]), "analysis(es)")
+         """
+         # Split text on whitespace, respecting punctuation
+         words = re.findall(r"\S+", text)
+
+         result_words: list[dict] = []
+         for w in words:
+             lattice = self.get_morphological_lattice(w)
+             result_words.append({
+                 "word": w,
+                 "lattice": lattice,
+             })
+
+         return {
+             "input": text,
+             "words": result_words,
+         }
+
      # ── Utility ───────────────────────────────────────────────────────────────
 
      def stats(self, tokens: list[dict]) -> dict:
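A minimal caller-side sketch of the lattice API added above (assuming the package imports as in test_lattice.py below and that Zemberek initialises; the helper name `first_root` is illustrative and not part of this commit):

    from nedo_turkish_tokenizer import NedoTurkishTokenizer

    def first_root(tok: NedoTurkishTokenizer, word: str) -> str | None:
        # Naive disambiguation: take the first analysis when the lattice is non-empty.
        lattice = tok.get_morphological_lattice(word)
        return lattice[0]["root"] if lattice else None

    tok = NedoTurkishTokenizer()
    for w in ["evin", "gelir", "blockchain"]:
        print(w, "->", first_root(tok, w))  # unknown/foreign words yield None (empty lattice)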
paper_baseline_check.py ADDED
@@ -0,0 +1,103 @@
+ """
+ paper_baseline_check.py
+ -----------------------
+ Why does the paper's 90.29% baseline show up as 75.57% for us?
+ Demonstrates the difference between the two methodologies concretely.
+
+ Usage:
+     cd NedoTurkishTokenizer/
+     python paper_baseline_check.py
+ """
+
+ import os
+ from huggingface_hub import login
+ from datasets import load_dataset
+ from turkish_tokenizer import TurkishTokenizer
+
+ HF_TOKEN = os.environ.get("HF_TOKEN", "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")
+ login(token=HF_TOKEN, add_to_git_credential=False)
+
+ base = TurkishTokenizer()
+
+ print("Loading TR-MMLU...")
+ ds = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
+ rows = list(ds)
+ print(f"{len(rows)} samples\n")
+
+ def orig_tr_pct(tokens):
+     total = len(tokens)
+     if total == 0: return 0.0
+     tr = sum(1 for t in tokens if t["type"] in ("ROOT", "SUFFIX"))
+     return tr / total * 100
+
+ # ── Methodology A: question only (what the paper does) ───────────────────────
+ a_sum = 0.0
+ for row in rows:
+     text = str(row.get("soru") or row.get("question") or "")
+     if not text.strip(): continue
+     toks = base.tokenize_text(text)
+     a_sum += orig_tr_pct(toks)
+ a_avg = a_sum / len(rows)
+
+ # ── Methodology B: question + choices (partial) ───────────────────────────────
+ b_sum = 0.0
+ for row in rows:
+     parts = []
+     for f in ["soru", "question"]:
+         if row.get(f):
+             parts.append(str(row[f]))
+             break
+     choices = row.get("secenekler") or []
+     if isinstance(choices, list):
+         parts.extend(str(c) for c in choices)
+     text = " ".join(parts)
+     if not text.strip(): continue
+     toks = base.tokenize_text(text)
+     b_sum += orig_tr_pct(toks)
+ b_avg = b_sum / len(rows)
+
+ # ── Methodology C: question + choices + explanation (full, what we use) ──────
+ c_sum = 0.0
+ for row in rows:
+     parts = []
+     for f in ["soru", "question"]:
+         if row.get(f):
+             parts.append(str(row[f]))
+             break
+     choices = row.get("secenekler") or []
+     if isinstance(choices, list):
+         parts.extend(str(c) for c in choices)
+     if row.get("aciklama"):
+         parts.append(str(row["aciklama"]))
+     text = " ".join(parts)
+     if not text.strip(): continue
+     toks = base.tokenize_text(text)
+     c_sum += orig_tr_pct(toks)
+ c_avg = c_sum / len(rows)
+
+ # ── What the answer choices contain ───────────────────────────────────────────
+ # Look at the choices of the first 20 questions
+ print("Sample choices from the first 20 questions:")
+ for row in rows[:20]:
+     choices = row.get("secenekler") or []
+     if isinstance(choices, list) and choices:
+         sample = " | ".join(str(c)[:20] for c in choices[:4])
+         print(f"  {sample}")
+
+ print(f"""
+ {'='*60}
+ METHODOLOGY COMPARISON (TurkishTokenizer baseline)
+ {'='*60}
+
+ A) Question field only (the paper's method):        {a_avg:.2f}%
+ B) Question + choices (partial):                     {b_avg:.2f}%
+ C) Question + choices + explanation (our method):    {c_avg:.2f}%
+
+ Paper value:   90.29%
+ Our A value:   {a_avg:.2f}%  ← difference from the paper: {a_avg-90.29:+.2f}
+ Our C value:   {c_avg:.2f}%  ← this is what we use
+
+ Result: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} points of difference.
+ The gap comes from the abbreviations and foreign terms in the answer choices.
+ {'='*60}
+ """)
results/tr_mmlu_per_sample.csv ADDED
The diff for this file is too large to render. See raw diff
 
results/tr_mmlu_report.md ADDED
@@ -0,0 +1,85 @@
+ # TR-MMLU Benchmark — NedoTurkishTokenizer
+
+ **N = 40,966 samples**
+
+ ## Main Metrics
+
+ | Metric | Original | Enhanced | Δ |
+ |--------|----------|----------|---|
+ | TR% | 75.57% | 92.64% | +17.07% |
+ | Pure% | 60.50% | 68.30% | +7.80% |
+ | Avg tokens/sample | 84.75 | 75.38 | -9.37 |
+ | Avg BPE/sample | 21.95 | 5.66 | -16.29 |
+
+ ## Comparison with the Paper
+
+ | Metric | Paper (original) | Ours (original) | NedoTurkishTokenizer |
+ |--------|-----------------|-----------------|---------------------|
+ | TR% | 90.29% | 75.57% | **92.64%** |
+ | Pure% | 85.8% | 60.50% | **68.30%** |
+
+ ## Sample Distribution
+
+ - Improved: 40,964 (100.00%)
+ - Regressed: 0 (0.00%)
+ - Unchanged: 2 (0.00%)
+
+ ## TR% by Category
+
+ | Category | TR% | N |
+ |----------|-----|---|
+ | TUS | 80.25% | 1510 |
+ | Turizm ve Otel İşletmeciliği | 82.30% | 185 |
+ | Futbol | 89.63% | 283 |
+ | DHBT | 90.72% | 336 |
+ | Genel Kültür | 91.16% | 84 |
+ | Tarım | 91.41% | 586 |
+ | Siyer | 91.45% | 1067 |
+ | Laborant ve Veteriner Sağlık | 91.91% | 1485 |
+ | YGS Denemeleri | 91.92% | 4154 |
+ | Kim 500 Milyar İster | 92.06% | 12968 |
+ | Tarih | 93.11% | 571 |
+ | Parakende Satış ve Mağaza Yöneticiliği | 93.38% | 255 |
+ | Dini Bilgiler | 93.49% | 122 |
+ | Kültürel Miras ve Turizm | 93.76% | 859 |
+ | Uluslar Arası İlişkiler | 93.84% | 117 |
+ | Radyo ve Televizyon Programcılığı | 93.88% | 531 |
+ | Okul Öncesi Öğretmenliği | 93.92% | 9 |
+ | İktisat | 94.01% | 346 |
+ | Yaşlı Bakımı | 94.29% | 214 |
+ | Özel Koruma ve Güvenlik | 94.32% | 759 |
+ | Tıbbi Dökümantasyon ve Sekreterlik | 94.33% | 63 |
+ | Türk Dili ve Edebiyatı | 94.33% | 340 |
+ | KPSS Denemeleri | 94.33% | 2674 |
+ | Yönetim Bİlişim Sistemleri | 94.40% | 294 |
+ | Felsefe | 94.43% | 502 |
+ | Ehliyet | 94.44% | 271 |
+ | Sosyal Hizmet | 94.56% | 64 |
+ | Çocuk Gelişimi | 94.58% | 2597 |
+ | KPSS | 94.60% | 299 |
+ | Sosyal Hizmetler | 94.61% | 935 |
+ | Sağlık Yönetimi | 94.63% | 313 |
+ | Halkla İlişkiler ve Reklamcılık | 94.70% | 158 |
+ | İnsan Kaynakları Yönetimi | 94.92% | 237 |
+ | Lojistik | 94.99% | 474 |
+ | Menkul Kıymetler ve Sermaye Piyasası | 95.04% | 435 |
+ | Konaklama İşletmeciliği | 95.04% | 13 |
+ | Medya ve İletişim | 95.05% | 898 |
+ | Sosyoloji | 95.06% | 393 |
+ | İşletme Yönetimi | 95.20% | 501 |
+ | Sağlık Kurumları İşletmeciliği | 95.26% | 575 |
+ | Turizm ve Seyehat Hizmetleri | 95.30% | 70 |
+ | Dış Ticaret | 95.36% | 1173 |
+ | Muhasebe ve Vergi Uygulamaları | 95.49% | 474 |
+ | Spor Yönetimi | 95.49% | 80 |
+ | Marka İletişimi | 95.62% | 180 |
+ | Yerel Yönetimler | 95.67% | 196 |
+ | Acil Durum ve Afet Yönetimi | 95.67% | 33 |
+ | Maliye | 95.68% | 13 |
+ | AUZEF | 95.82% | 47 |
+ | Kamu Yönetimi | 95.92% | 54 |
+ | Uluslararası Ticaret ve Lojistik Yönetimi | 95.95% | 65 |
+ | Havacılık Yönetimi | 96.06% | 15 |
+ | İşletme | 96.09% | 24 |
+ | Çalışma Ekonomisi ve Endüstri İlişkileri | 96.42% | 62 |
+ | Sosyal Bilimler | 96.56% | 3 |
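The per-category rows above can be recomputed from the per-sample CSV added in this commit. A minimal sketch, assuming the file path results/tr_mmlu_per_sample.csv and the same n >= 3 cutoff used by hf_benchmark.py; the column names `bolum` and `enh_tr` come from the per_sample dicts built there:

    import csv
    from collections import defaultdict

    cats: dict[str, list[float]] = defaultdict(list)
    with open("results/tr_mmlu_per_sample.csv", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            cats[row["bolum"]].append(float(row["enh_tr"]))

    for cat, vals in sorted(cats.items(), key=lambda kv: sum(kv[1]) / len(kv[1])):
        if len(vals) >= 3:  # same cutoff as cat_summary in hf_benchmark.py
            print(f"{cat}: {sum(vals) / len(vals):.2f}% (n={len(vals)})")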
results/tr_mmlu_summary.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "n_samples": 40966,
+   "orig_tr_pct": 75.5727,
+   "orig_pure_pct": 60.499,
+   "enh_tr_pct": 92.6413,
+   "enh_pure_pct": 68.3004,
+   "delta_tr_pct": 17.0686,
+   "delta_pure_pct": 7.8014,
+   "orig_avg_tokens": 84.75,
+   "enh_avg_tokens": 75.38,
+   "orig_avg_bpe": 21.95,
+   "enh_avg_bpe": 5.66,
+   "pct_improved": 100.0,
+   "pct_regressed": 0.0,
+   "pct_unchanged": 0.0
+ }
test_lattice.py ADDED
@@ -0,0 +1,72 @@
+ #!/usr/bin/env python3
+ """
+ Test / demo script for the Morphological Lattice API.
+
+ Shows how ambiguous Turkish words like "evin" and "gelir" produce
+ multiple alternative analyses in the lattice.
+ """
+
+ import json
+ from nedo_turkish_tokenizer import NedoTurkishTokenizer
+
+
+ def section(title: str) -> None:
+     print(f"\n{'═' * 60}")
+     print(f"  {title}")
+     print(f"{'═' * 60}")
+
+
+ def main() -> None:
+     tok = NedoTurkishTokenizer()
+
+     # ── 1. Single-word lattice test ──────────────────────────────────────
+     section("1) get_morphological_lattice — single-word examples")
+
+     test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
+     for word in test_words:
+         lattice = tok.get_morphological_lattice(word)
+         print(f"\n▸ \"{word}\" → {len(lattice)} analyses:")
+         for i, entry in enumerate(lattice):
+             print(f"  [{i}] root={entry['root']:<12} "
+                   f"pos={entry['pos']:<6} "
+                   f"suffixes={entry['suffixes']}")
+             print(f"      lexical_form = {entry['lexical_form']}")
+
+     # ── 2. Unknown word (empty lattice) ──────────────────────────────────
+     section("2) Unknown / foreign word → empty lattice")
+
+     unknown_words = ["xyzfoo", "meeting", "blockchain"]
+     for word in unknown_words:
+         lattice = tok.get_morphological_lattice(word)
+         print(f"  \"{word}\" → lattice empty? {len(lattice) == 0} (len={len(lattice)})")
+
+     # ── 3. tokenize_lattice — sentence-level test ───────────────────────
+     section("3) tokenize_lattice — sentence test")
+
+     sentences = [
+         "Evin güzel gelir",
+         "Çocuk okula koşar adım gitti",
+         "Yüz yıllık çınar",
+     ]
+
+     for sent in sentences:
+         print(f"\n▸ Input: \"{sent}\"")
+         result = tok.tokenize_lattice(sent)
+         for winfo in result["words"]:
+             n = len(winfo["lattice"])
+             print(f"  {winfo['word']:<16} → {n} analysis(es)")
+             for entry in winfo["lattice"]:
+                 print(f"      root={entry['root']:<12} pos={entry['pos']:<6} "
+                       f"suffixes={entry['suffixes']}")
+
+     # ── 4. JSON output format ────────────────────────────────────────────
+     section("4) tokenize_lattice JSON output")
+
+     data = tok.tokenize_lattice("evin gelir")
+     print(json.dumps(data, ensure_ascii=False, indent=2))
+
+     print("\n✅ All tests completed successfully.")
+
+
+ if __name__ == "__main__":
+     main()