merge github main before publish
- hf_benchmark.py +324 -0
- nedo_turkish_tokenizer/_compound.py +3 -3
- nedo_turkish_tokenizer/_context_aware.py +10 -9
- nedo_turkish_tokenizer/_preprocessor.py +3 -3
- nedo_turkish_tokenizer/_root_validator.py +44 -19
- nedo_turkish_tokenizer/tokenizer.py +79 -0
- paper_baseline_check.py +103 -0
- results/tr_mmlu_per_sample.csv +0 -0
- results/tr_mmlu_report.md +85 -0
- results/tr_mmlu_summary.json +16 -0
- test_lattice.py +72 -0
hf_benchmark.py
ADDED
@@ -0,0 +1,324 @@
"""
hf_benchmark.py
---------------
Benchmarks NedoTurkishTokenizer on TR-MMLU.
Pulls both the tokenizer and the dataset from HuggingFace.

Usage:
    cd NedoTurkishTokenizer/
    pip install huggingface_hub datasets
    python hf_benchmark.py

    # Only 2000 samples (quick test):
    python hf_benchmark.py --samples 2000

    # A specific category:
    python hf_benchmark.py --category TUS
"""

import argparse
import json
import os
import sys
import csv
from pathlib import Path

# ── HuggingFace token ─────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "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")

# ── Arguments ─────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser()
parser.add_argument("--samples", default="all", help="Kaç örnek (all veya sayı)")
parser.add_argument("--category", default=None, help="Kategori filtresi (ör. TUS)")
parser.add_argument("--out", default=".", help="Çıktı klasörü")
args = parser.parse_args()

OUT = Path(args.out)
OUT.mkdir(parents=True, exist_ok=True)

# ── HuggingFace login ─────────────────────────────────────────────────────────
from huggingface_hub import login
login(token=HF_TOKEN, add_to_git_credential=False)
print("HuggingFace login ✓")

# ── Load tokenizer ────────────────────────────────────────────────────────────
print("NedoTurkishTokenizer yükleniyor...")
sys.path.insert(0, str(Path(__file__).parent))
from nedo_turkish_tokenizer import NedoTurkishTokenizer
tok = NedoTurkishTokenizer()
print(f"  Zemberek: {'✓' if tok.zemberek_available else '✗ (devre dışı)'}")

# ── Load dataset ──────────────────────────────────────────────────────────────
print("TR-MMLU dataset yükleniyor...")
from datasets import load_dataset
ds = load_dataset(
    "alibayram/turkish_mmlu",
    split="test",
    token=HF_TOKEN,
)
print(f"  {len(ds)} örnek")

# ── Category filter ───────────────────────────────────────────────────────────
samples = list(ds)
if args.category:
    samples = [r for r in samples
               if str(r.get("bolum", "")).strip() == args.category]
    print(f"  Kategori '{args.category}': {len(samples)} örnek")

if args.samples != "all":
    n = int(args.samples)
    samples = samples[:n]
    print(f"  Kısıtlı: {len(samples)} örnek")

# ── Merge the text fields ─────────────────────────────────────────────────────
def get_text(row: dict) -> str:
    parts = []
    for field in ["soru", "question"]:
        if row.get(field):
            parts.append(str(row[field]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    for lbl in ["A", "B", "C", "D"]:
        if row.get(lbl):
            parts.append(str(row[lbl]))
    if row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    return " ".join(parts)

# ── Token statistics ──────────────────────────────────────────────────────────
def token_stats(tokens: list[dict]) -> dict:
    """Use NedoTurkishTokenizer's token_type field."""
    total = len(tokens)
    if total == 0:
        return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0, "orig_tr_pct": 0.0}

    # Enhanced stats
    roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
    suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
    foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
    punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
    bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
    special = sum(1 for t in tokens
                  if t["token_type"] in ("NUM", "DATE", "UNIT", "URL",
                                         "MENTION", "HASHTAG", "EMOJI", "ACRONYM"))
    tr = roots + suffixes + foreign + punct + special
    pure = sum(1 for t in tokens
               if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
               and not t["token"].strip().startswith("<"))
    return {
        "total": total,
        "roots": roots,
        "suffixes": suffixes,
        "foreign": foreign,
        "bpe": bpe,
        "punct": punct,
        "special": special,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

def orig_stats(tokens: list[dict]) -> dict:
    """Original tokenizer statistics (for comparison)."""
    total = len(tokens)
    if total == 0:
        return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0}
    roots = sum(1 for t in tokens if t.get("type") == "ROOT")
    suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
    bpe = sum(1 for t in tokens if t.get("type") == "BPE")
    tr = roots + suffixes
    pure = sum(1 for t in tokens
               if t.get("type") in ("ROOT", "SUFFIX")
               and not t.get("token", "").strip().startswith("<"))
    return {
        "total": total,
        "bpe": bpe,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

# ── Main benchmark loop ───────────────────────────────────────────────────────
print(f"\nBenchmark başlıyor: {len(samples)} örnek...")

per_sample = []
orig_tr_sum = 0.0
enh_tr_sum = 0.0
orig_pur_sum = 0.0
enh_pur_sum = 0.0
orig_tok_sum = 0
enh_tok_sum = 0
orig_bpe_sum = 0
enh_bpe_sum = 0
improved = 0
regressed = 0
unchanged = 0

REPORT_EVERY = 500

for idx, row in enumerate(samples):
    text = get_text(row)
    if not text.strip():
        continue

    # Original tokenizer
    orig_toks = tok._base.tokenize_text(text)
    os_ = orig_stats(orig_toks)

    # NedoTurkishTokenizer
    enh_toks = tok.tokenize(text)
    es_ = token_stats(enh_toks)

    d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)

    per_sample.append({
        "idx": idx,
        "bolum": str(row.get("bolum", "")),
        "orig_tr": os_["tr_pct"],
        "enh_tr": es_["tr_pct"],
        "d_tr": d_tr,
        "orig_pure": os_["pure_pct"],
        "enh_pure": es_["pure_pct"],
        "orig_tok": os_["total"],
        "enh_tok": es_["total"],
        "orig_bpe": os_["bpe"],
        "enh_bpe": es_["bpe"],
    })

    orig_tr_sum += os_["tr_pct"]
    enh_tr_sum += es_["tr_pct"]
    orig_pur_sum += os_["pure_pct"]
    enh_pur_sum += es_["pure_pct"]
    orig_tok_sum += os_["total"]
    enh_tok_sum += es_["total"]
    orig_bpe_sum += os_["bpe"]
    enh_bpe_sum += es_["bpe"]

    if d_tr > 0: improved += 1
    elif d_tr < 0: regressed += 1
    else: unchanged += 1

    if (idx + 1) % REPORT_EVERY == 0:
        n = idx + 1
        print(f"  [{n:>6}/{len(samples)}] "
              f"TR%: {enh_tr_sum/n:.2f}% "
              f"BPE/örnek: {enh_bpe_sum/n:.2f} "
              f"Regressed: {regressed}")

n = len(per_sample)
if n == 0:
    print("Hiç örnek işlenmedi!")
    sys.exit(1)

# ── Summary ───────────────────────────────────────────────────────────────────
summary = {
    "n_samples": n,
    "orig_tr_pct": round(orig_tr_sum / n, 4),
    "orig_pure_pct": round(orig_pur_sum / n, 4),
    "enh_tr_pct": round(enh_tr_sum / n, 4),
    "enh_pure_pct": round(enh_pur_sum / n, 4),
    "delta_tr_pct": round((enh_tr_sum - orig_tr_sum) / n, 4),
    "delta_pure_pct": round((enh_pur_sum - orig_pur_sum) / n, 4),
    "orig_avg_tokens": round(orig_tok_sum / n, 2),
    "enh_avg_tokens": round(enh_tok_sum / n, 2),
    "orig_avg_bpe": round(orig_bpe_sum / n, 2),
    "enh_avg_bpe": round(enh_bpe_sum / n, 2),
    "pct_improved": round(improved / n * 100, 2),
    "pct_regressed": round(regressed / n * 100, 2),
    "pct_unchanged": round(unchanged / n * 100, 2),
}

# ── Per-category breakdown ────────────────────────────────────────────────────
from collections import defaultdict
cat_scores = defaultdict(list)
for row in per_sample:
    cat_scores[row["bolum"]].append(row["enh_tr"])

cat_summary = {
    cat: round(sum(v) / len(v), 2)
    for cat, v in cat_scores.items()
    if len(v) >= 3
}
cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])

# ── Print the report ──────────────────────────────────────────────────────────
SEP = "─" * 65
print(f"\n{SEP}")
print("  NedoTurkishTokenizer – TR-MMLU Benchmark Sonuçları")
print(SEP)
print(f"  N = {n:,} örnek\n")
print(f"  {'Metrik':30s} {'Orijinal':>10} {'Enhanced':>10} {'Δ':>8}")
print("  " + "─" * 55)
for label, orig, enh, delta in [
    ("TR%", summary["orig_tr_pct"], summary["enh_tr_pct"], summary["delta_tr_pct"]),
    ("Pure%", summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
    ("Avg token/örn", summary["orig_avg_tokens"], summary["enh_avg_tokens"],
     round(summary["enh_avg_tokens"] - summary["orig_avg_tokens"], 2)),
    ("Avg BPE/örn", summary["orig_avg_bpe"], summary["enh_avg_bpe"],
     round(summary["enh_avg_bpe"] - summary["orig_avg_bpe"], 2)),
]:
    print(f"  {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")

print(f"\n  İyileşen : {improved:>6,} (%{summary['pct_improved']:.2f})")
print(f"  Gerileyen: {regressed:>6,} (%{summary['pct_regressed']:.2f})")
print(f"  Değişmeyen:{unchanged:>6,} (%{summary['pct_unchanged']:.2f})")

print(f"\n  En düşük TR% kategoriler:")
for cat, avg in cat_sorted[:10]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print(f"\n  En yüksek TR% kategoriler:")
for cat, avg in cat_sorted[-8:]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print(SEP)

# ── Write the output files ────────────────────────────────────────────────────
# Summary JSON
summary_path = OUT / "tr_mmlu_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)
print(f"\n  ✓ {summary_path}")

# Report Markdown
paper_tr = 90.29
paper_pur = 85.80
report_path = OUT / "tr_mmlu_report.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# TR-MMLU Benchmark – NedoTurkishTokenizer\n\n")
    f.write(f"**N = {n:,} örnek**\n\n")
    f.write("## Ana Metrikler\n\n")
    f.write("| Metrik | Orijinal | Enhanced | Δ |\n")
    f.write("|--------|----------|----------|---|\n")
    f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
    f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
    f.write(f"| Avg token/örnek | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
    f.write(f"| Avg BPE/örnek | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
    f.write("\n## Paper ile Karşılaştırma\n\n")
    f.write("| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |\n")
    f.write("|--------|-----------------|-----------------|---------------------|\n")
    f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
    f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
    f.write("\n## Örnek Dağılımı\n\n")
    f.write(f"- İyileşen: {improved:,} (%{summary['pct_improved']:.2f})\n")
    f.write(f"- Gerileyen: {regressed:,} (%{summary['pct_regressed']:.2f})\n")
    f.write(f"- Değişmeyen: {unchanged:,} (%{summary['pct_unchanged']:.2f})\n")
    f.write("\n## Kategori Bazında TR%\n\n")
    f.write("| Kategori | TR% | N |\n")
    f.write("|----------|-----|---|\n")
    for cat, avg in cat_sorted:
        n_cat = len(cat_scores[cat])
        f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
print(f"  ✓ {report_path}")

# Per-sample CSV
csv_path = OUT / "tr_mmlu_per_sample.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
    writer.writeheader()
    writer.writerows(per_sample)
print(f"  ✓ {csv_path}")

print("\nTamamlandı.\n")
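Editor's note: a quick way to sanity-check the committed results against this script is to recompute the aggregates from the per-sample CSV. A minimal sketch, assuming only the column names that hf_benchmark.py writes above (orig_tr, enh_tr):

import csv

with open("results/tr_mmlu_per_sample.csv", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

# Per-sample averages, mirroring how the summary above is computed
orig_tr = sum(float(r["orig_tr"]) for r in rows) / len(rows)
enh_tr = sum(float(r["enh_tr"]) for r in rows) / len(rows)
print(f"orig TR% {orig_tr:.2f}  enh TR% {enh_tr:.2f}  delta {enh_tr - orig_tr:+.2f}")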
nedo_turkish_tokenizer/_compound.py
CHANGED
@@ -42,9 +42,9 @@ KNOWN_COMPOUNDS: dict[str, list[str]] = {
 def _decompose_zemberek(word: str, morphology) -> list[str] | None:
     try:
         wa = morphology.analyze(word)
-        for sa in wa
-        morphemes = [m
-        roots = [m for m in morphemes if
+        for sa in wa:
+            morphemes = [str(m) for m in sa.get_morphemes()]
+            roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
             if len(roots) > 1:
                 return roots
     except Exception:  # noqa: BLE001
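Editor's note: a hedged illustration of the new root filter above. The morpheme strings are made up; the real format depends on zemberek-python.

morphemes = ["hanım:Noun", "eli:Noun", "A3sg:Agreement"]   # hypothetical example values
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
print(roots)  # ['hanım:Noun', 'eli:Noun'] -> more than one root, so the word is treated as a compound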
nedo_turkish_tokenizer/_context_aware.py
CHANGED
@@ -1,4 +1,7 @@
-"""Fix 12: Context-aware Zemberek disambiguation.
+"""Fix 12: Context-aware Zemberek disambiguation.
+
+Uses zemberek-python (pure Python) – no JVM required.
+"""
 
 from __future__ import annotations
 
@@ -17,20 +20,18 @@ def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
         return tokens
 
     try:
-
-
-        after = _morphology.disambiguate(sentence, analysis)
-        best_list = after.best_analysis()
+        sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
+        best_list = sa_result.best_analysis()
 
         analyses: dict[str, dict] = {}
         for sa in best_list:
             try:
-                sf = sa.
+                sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
                 if sf not in analyses:
                     analyses[sf] = {
-                        "lemma": sa.item.lemma,
-                        "pos": sa.item.primary_pos.short_form,
-                        "morphemes": [m
+                        "lemma": str(sa.item.lemma),
+                        "pos": str(sa.item.primary_pos.short_form),
+                        "morphemes": [str(m) for m in sa.get_morphemes()],
                     }
             except Exception:  # noqa: BLE001
                 continue
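Editor's note: a small sketch of the lookup key introduced above — the disambiguated analyses are indexed by stem plus ending, lowercased. The stem/ending values are hypothetical.

stem, ending = "Ev", "den"          # hypothetical Zemberek stem/ending for the surface form "Evden"
sf = (str(stem) + str(ending)).lower().strip()
print(sf)  # 'evden' -> the key used for the `analyses` dict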
nedo_turkish_tokenizer/_preprocessor.py
CHANGED
@@ -67,13 +67,13 @@ def _is_turkish_base(word: str) -> bool:
     tdk = load_tdk_words()
     if tdk and wl in tdk:
         return True
-    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir
+    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
     try:
         from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
         if ZEMBEREK_AVAILABLE and _morphology:
             wa = _morphology.analyze(wl)
-            for sa in wa
-            lemma = sa.item.lemma
+            for sa in wa:
+                lemma = str(sa.item.lemma)
                 if any(c in TR_CHARS for c in lemma):
                     return True
     except Exception:  # noqa: BLE001
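Editor's note: the heuristic above accepts a word when any Zemberek lemma contains a Turkish-specific letter. A minimal sketch, assuming TR_CHARS is such a letter set (its real definition lives elsewhere in the module):

TR_CHARS = set("çğıöşüÇĞİÖŞÜ")   # assumption for this illustration only
lemma = "İstanbul"                # a lemma Zemberek could return for "istanbul"
print(any(c in TR_CHARS for c in lemma))  # True -> treated as a Turkish base word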
nedo_turkish_tokenizer/_root_validator.py
CHANGED
@@ -1,4 +1,7 @@
-"""Zemberek-based root validation and correction (Fix 4).
+"""Zemberek-based root validation and correction (Fix 4).
+
+Uses zemberek-python (pure Python) – no JVM or JPype required.
+"""
 
 from __future__ import annotations
 
@@ -6,20 +9,43 @@ ZEMBEREK_AVAILABLE = False
 _morphology = None
 
 
+def _apply_zemberek_patch() -> None:
+    """Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
+    import csv
+    import zemberek.morphology.lexicon.root_lexicon as rl
+
+    def fast_load_from_resources(resource_path: str):
+        items = list()
+        csv.field_size_limit(100000000)
+        with open(resource_path, 'r', encoding='utf-8') as f:
+            lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
+
+        # O(1) dictionary lookup instead of O(N) iteration per reference
+        lex_dict = {line[0]: line for line in lex}
+
+        for i, line in enumerate(lex):
+            item = rl.DictionaryReader.make_dict_item_from_line(line)
+            if line[7] != 'null':
+                reference_item_line = lex_dict.get(line[7])
+                if reference_item_line is not None:
+                    item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
+            items.append(item)
+        return rl.RootLexicon(items)
+
+    rl.DictionaryReader.load_from_resources = fast_load_from_resources
+
+
 def _init_zemberek() -> None:
     global ZEMBEREK_AVAILABLE, _morphology
 
     try:
         from zemberek import TurkishMorphology  # noqa: PLC0415
-
+
+        _apply_zemberek_patch()
         _morphology = TurkishMorphology.create_with_defaults()
         ZEMBEREK_AVAILABLE = True
 
     except ImportError:
-        print(
-            "[NedoTurkishTokenizer] zemberek-python not installed\n"
-            "    pip install zemberek-python"
-        )
+        print("[NedoTurkishTokenizer] zemberek-python not installed – pip install zemberek-python")
     except Exception as exc:  # noqa: BLE001
         print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 
@@ -37,12 +63,12 @@ def analyze_word(word: str) -> list[dict]:
         wa = _morphology.analyze(word)
         return [
             {
-                "lemma": sa.item.lemma,
-                "pos": sa.item.primary_pos.short_form,
-                "morphemes": [m
-                "surface": sa.
+                "lemma": str(sa.item.lemma),
+                "pos": str(sa.item.primary_pos.short_form),
+                "morphemes": [str(m) for m in sa.get_morphemes()],
+                "surface": str(sa.get_stem()) + str(sa.get_ending()),
             }
-            for sa in wa
+            for sa in wa
         ]
     except Exception:  # noqa: BLE001
         return []

@@ -73,7 +99,7 @@ def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
 def build_correction_map(
     original_words: list[str], base_tokenizer
 ) -> dict[str, str]:
-    """Build a {tokenizer_root
+    """Build a {tokenizer_root → zemberek_root} correction map."""
     correction_map: dict[str, str] = {}
 
     for word in original_words:

@@ -146,7 +172,7 @@ def validate_roots(
                 "token": leading + correct,
                 "_original_token": tok["token"],
                 "_root_corrected": True,
-                "_note": f"root corrected: '{surface}'
+                "_note": f"root corrected: '{surface}' → '{correct}'",
             }
 
         result.append(tok)

@@ -160,16 +186,15 @@ def disambiguate_sentence(words: list[str]) -> list[dict | None]:
         return [None] * len(words)
     try:
         sentence = " ".join(words)
-
-
-        best = after.best_analysis()
+        sa_result = _morphology.analyze_and_disambiguate(sentence)
+        best = sa_result.best_analysis()
         out = []
         for sa in best:
             try:
                 out.append({
-                    "lemma": sa.item.lemma,
-                    "pos": sa.item.primary_pos.short_form,
-                    "morphemes": [m
+                    "lemma": str(sa.item.lemma),
+                    "pos": str(sa.item.primary_pos.short_form),
+                    "morphemes": [str(m) for m in sa.get_morphemes()],
                 })
             except Exception:  # noqa: BLE001
                 out.append(None)
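Editor's note on why _apply_zemberek_patch helps: resolving each lexicon row's reference by scanning the whole list is quadratic overall, while a one-time dict index makes each lookup constant-time. A self-contained sketch on synthetic rows (not the real lexicon format):

import time

rows = [(f"entry{i}", f"entry{i - 1}" if i else "null") for i in range(5_000)]

t0 = time.perf_counter()
slow = [next((r for r in rows if r[0] == ref), None) if ref != "null" else None
        for _, ref in rows]                                   # scan the list per reference: O(N^2)
t1 = time.perf_counter()

index = {r[0]: r for r in rows}                               # build once: O(N)
fast = [index.get(ref) if ref != "null" else None for _, ref in rows]   # O(1) per lookup
t2 = time.perf_counter()

assert slow == fast
print(f"linear scan {t1 - t0:.3f}s  vs  dict index {t2 - t1:.4f}s")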
nedo_turkish_tokenizer/tokenizer.py
CHANGED
@@ -26,10 +26,12 @@ Output fields per token:
 from __future__ import annotations
 
 import os
+import re
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from pathlib import Path
 
+
 from ._preprocessor import preprocess, postprocess
 from ._suffix_expander import reclassify_bpe_suffixes
 from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE

@@ -97,6 +99,7 @@ class NedoTurkishTokenizer:
     """
 
     def __init__(self) -> None:
+
         from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
         self._base = TurkishTokenizer()
         self.zemberek_available = ZEMBEREK_AVAILABLE

@@ -234,6 +237,82 @@ class NedoTurkishTokenizer:
             json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
         )
 
+    # ── Morphological Lattice API ───────────────────────────────────────────
+
+    def get_morphological_lattice(self, word: str) -> list[dict]:
+        """Return all possible morphological analyses for *word* as a lattice.
+
+        Each entry in the returned list is a dict with:
+            ``root``         – the lemma / root form
+            ``suffixes``     – list of surface-form suffixes
+            ``pos``          – abbreviated POS tag (Noun, Verb, Adj, …)
+            ``lexical_form`` – full lexical representation from Zemberek
+
+        Returns an **empty list** when Zemberek cannot analyse the word
+        (unknown word) or when Zemberek is not available.
+        """
+        if _zemb_morphology is None:
+            return []
+
+        try:
+            word_analysis = _zemb_morphology.analyze(word)
+
+            lattice: list[dict] = []
+            for sa in word_analysis:
+                try:
+                    root = str(sa.item.lemma)
+                    pos = str(sa.item.primary_pos.short_form)
+                    lexical_form = str(sa.format_string())
+
+                    # Build suffix list from morpheme chain (skip the root morpheme)
+                    morphemes = list(sa.get_morphemes())
+                    suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
+
+                    lattice.append({
+                        "root": root,
+                        "suffixes": suffixes,
+                        "pos": pos,
+                        "lexical_form": lexical_form,
+                    })
+                except Exception:  # noqa: BLE001
+                    continue
+
+            return lattice
+
+        except Exception:  # noqa: BLE001
+            return []
+
+    def tokenize_lattice(self, text: str) -> dict:
+        """Tokenize *text* and return a morphological lattice for every word.
+
+        Returns a dict with:
+            ``input``  – the original text
+            ``words``  – list of per-word dicts, each containing
+                         ``word`` (str) and ``lattice`` (list of analyses)
+
+        Example::
+
+            tok = NedoTurkishTokenizer()
+            data = tok.tokenize_lattice("Evin güzel gelir")
+            for w in data["words"]:
+                print(w["word"], "→", len(w["lattice"]), "analysis(es)")
+        """
+        # Split text on whitespace, respecting punctuation
+        words = re.findall(r"\S+", text)
+
+        result_words: list[dict] = []
+        for w in words:
+            lattice = self.get_morphological_lattice(w)
+            result_words.append({
+                "word": w,
+                "lattice": lattice,
+            })
+
+        return {
+            "input": text,
+            "words": result_words,
+        }
+
     # ── Utility ──────────────────────────────────────────────────────────────
 
     def stats(self, tokens: list[dict]) -> dict:
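Editor's note: a hedged sketch of consuming the new lattice API. The two analyses shown for "gelir" are illustrative placeholders (real output depends on Zemberek's lexicon), and pick_simplest is just one possible downstream heuristic, not part of the commit.

hypothetical_lattice = [
    {"root": "gelir", "suffixes": [], "pos": "Noun", "lexical_form": "[gelir:Noun]"},
    {"root": "gel", "suffixes": ["ir"], "pos": "Verb", "lexical_form": "[gel:Verb]+Aor"},
]

def pick_simplest(lattice: list[dict]) -> dict | None:
    """Prefer the analysis with the fewest suffixes."""
    return min(lattice, key=lambda a: len(a["suffixes"]), default=None)

print(pick_simplest(hypothetical_lattice))  # -> the zero-suffix Noun reading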
paper_baseline_check.py
ADDED
@@ -0,0 +1,103 @@

"""
paper_baseline_check.py
-----------------------
Why does the paper's 90.29% baseline show up as 75.57% for us?
Demonstrates the difference between the two methodologies concretely.

Usage:
    cd NedoTurkishTokenizer/
    python paper_baseline_check.py
"""

import os
from huggingface_hub import login
from datasets import load_dataset
from turkish_tokenizer import TurkishTokenizer

HF_TOKEN = os.environ.get("HF_TOKEN", "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")
login(token=HF_TOKEN, add_to_git_credential=False)

base = TurkishTokenizer()

print("TR-MMLU yükleniyor...")
ds = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
rows = list(ds)
print(f"{len(rows)} örnek\n")

def orig_tr_pct(tokens):
    total = len(tokens)
    if total == 0: return 0.0
    tr = sum(1 for t in tokens if t["type"] in ("ROOT", "SUFFIX"))
    return tr / total * 100

# ── Methodology A: question only (what the paper did) ─────────────────────────
a_sum = 0.0
for row in rows:
    text = str(row.get("soru") or row.get("question") or "")
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    a_sum += orig_tr_pct(toks)
a_avg = a_sum / len(rows)

# ── Methodology B: question + choices (what we did) ───────────────────────────
b_sum = 0.0
for row in rows:
    parts = []
    for f in ["soru", "question"]:
        if row.get(f):
            parts.append(str(row[f]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    text = " ".join(parts)
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    b_sum += orig_tr_pct(toks)
b_avg = b_sum / len(rows)

# ── Methodology C: question + choices + explanation (full) ────────────────────
c_sum = 0.0
for row in rows:
    parts = []
    for f in ["soru", "question"]:
        if row.get(f):
            parts.append(str(row[f]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    if row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    text = " ".join(parts)
    if not text.strip(): continue
    toks = base.tokenize_text(text)
    c_sum += orig_tr_pct(toks)
c_avg = c_sum / len(rows)

# ── Content analysis of the answer choices ─────────────────────────────────────
# Look at the choices of the first 20 questions
print("İlk 20 sorunun şık örnekleri:")
for row in rows[:20]:
    choices = row.get("secenekler") or []
    if isinstance(choices, list) and choices:
        sample = " | ".join(str(c)[:20] for c in choices[:4])
        print(f"  {sample}")

print(f"""
{'='*60}
METODOLOJİ KARŞILAŞTIRMASI (TürkishTokenizer baseline)
{'='*60}

A) Sadece soru alanı (paper'ın yöntemi): {a_avg:.2f}%
B) Soru + şıklar (kısmi): {b_avg:.2f}%
C) Soru + şıklar + açıkl. (bizim yöntemimiz): {c_avg:.2f}%

Paper değeri: 90.29%
Bizim A değeri: {a_avg:.2f}% → paper ile fark: {a_avg-90.29:+.2f}
Bizim C değeri: {c_avg:.2f}% → biz bunu kullanıyoruz

Sonuç: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} puan fark
Bu fark şıklardaki kısaltmalar ve yabancı terimlerden kaynaklanıyor.
{'='*60}
""")
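Editor's note: the three methodology loops above differ only in which fields they concatenate. A possible refactor sketch (reusing rows, base and orig_tr_pct from the script above; not part of the commit):

def avg_tr_pct(rows, include_choices=False, include_explanation=False):
    total = 0.0
    for row in rows:
        # First available question field, then optional choices and explanation
        parts = [str(row[f]) for f in ("soru", "question") if row.get(f)][:1]
        if include_choices and isinstance(row.get("secenekler"), list):
            parts.extend(str(c) for c in row["secenekler"])
        if include_explanation and row.get("aciklama"):
            parts.append(str(row["aciklama"]))
        text = " ".join(parts)
        if text.strip():
            total += orig_tr_pct(base.tokenize_text(text))
    return total / len(rows)

a_avg = avg_tr_pct(rows)                                                   # question only
b_avg = avg_tr_pct(rows, include_choices=True)                             # + choices
c_avg = avg_tr_pct(rows, include_choices=True, include_explanation=True)   # + explanation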
results/tr_mmlu_per_sample.csv
ADDED
The diff for this file is too large to render.
See raw diff
results/tr_mmlu_report.md
ADDED
@@ -0,0 +1,85 @@

# TR-MMLU Benchmark – NedoTurkishTokenizer

**N = 40,966 örnek**

## Ana Metrikler

| Metrik | Orijinal | Enhanced | Δ |
|--------|----------|----------|---|
| TR% | 75.57% | 92.64% | +17.07% |
| Pure% | 60.50% | 68.30% | +7.80% |
| Avg token/örnek | 84.75 | 75.38 | -9.37 |
| Avg BPE/örnek | 21.95 | 5.66 | -16.29 |

## Paper ile Karşılaştırma

| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |
|--------|-----------------|-----------------|---------------------|
| TR% | 90.29% | 75.57% | **92.64%** |
| Pure% | 85.8% | 60.50% | **68.30%** |

## Örnek Dağılımı

- İyileşen: 40,964 (%100.00)
- Gerileyen: 0 (%0.00)
- Değişmeyen: 2 (%0.00)

## Kategori Bazında TR%

| Kategori | TR% | N |
|----------|-----|---|
| TUS | 80.25% | 1510 |
| Turizm ve Otel İşletmeciliği | 82.30% | 185 |
| Futbol | 89.63% | 283 |
| DHBT | 90.72% | 336 |
| Genel Kültür | 91.16% | 84 |
| Tarım | 91.41% | 586 |
| Siyer | 91.45% | 1067 |
| Laborant ve Veteriner Sağlık | 91.91% | 1485 |
| YGS Denemeleri | 91.92% | 4154 |
| Kim 500 Milyar İster | 92.06% | 12968 |
| Tarih | 93.11% | 571 |
| Parakende Satış ve Mağaza Yöneticiliği | 93.38% | 255 |
| Dini Bilgiler | 93.49% | 122 |
| Kültürel Miras ve Turizm | 93.76% | 859 |
| Uluslar Arası İlişkiler | 93.84% | 117 |
| Radyo ve Televizyon Programcılığı | 93.88% | 531 |
| Okul Öncesi Öğretmenliği | 93.92% | 9 |
| İktisat | 94.01% | 346 |
| Yaşlı Bakımı | 94.29% | 214 |
| Özel Koruma ve Güvenlik | 94.32% | 759 |
| Tıbbi Dökümantasyon ve Sekreterlik | 94.33% | 63 |
| Türk Dili ve Edebiyatı | 94.33% | 340 |
| KPSS Denemeleri | 94.33% | 2674 |
| Yönetim Bİlişim Sistemleri | 94.40% | 294 |
| Felsefe | 94.43% | 502 |
| Ehliyet | 94.44% | 271 |
| Sosyal Hizmet | 94.56% | 64 |
| Çocuk Gelişimi | 94.58% | 2597 |
| KPSS | 94.60% | 299 |
| Sosyal Hizmetler | 94.61% | 935 |
| Sağlık Yönetimi | 94.63% | 313 |
| Halkla İlişkiler ve Reklamcılık | 94.70% | 158 |
| İnsan Kaynakları Yönetimi | 94.92% | 237 |
| Lojistik | 94.99% | 474 |
| Menkul Kıymetler ve Sermaye Piyasası | 95.04% | 435 |
| Konaklama İşletmeciliği | 95.04% | 13 |
| Medya ve İletişim | 95.05% | 898 |
| Sosyoloji | 95.06% | 393 |
| İşletme Yönetimi | 95.20% | 501 |
| Sağlık Kurumları İşletmeciliği | 95.26% | 575 |
| Turizm ve Seyehat Hizmetleri | 95.30% | 70 |
| Dış Ticaret | 95.36% | 1173 |
| Muhasebe ve Vergi Uygulamaları | 95.49% | 474 |
| Spor Yönetimi | 95.49% | 80 |
| Marka İletişimi | 95.62% | 180 |
| Yerel Yönetimler | 95.67% | 196 |
| Acil Durum ve Afet Yönetimi | 95.67% | 33 |
| Maliye | 95.68% | 13 |
| AUZEF | 95.82% | 47 |
| Kamu Yönetimi | 95.92% | 54 |
| Uluslararası Ticaret ve Lojistik Yönetimi | 95.95% | 65 |
| Havacılık Yönetimi | 96.06% | 15 |
| İşletme | 96.09% | 24 |
| Çalışma Ekonomisi ve Endüstri İlişkileri | 96.42% | 62 |
| Sosyal Bilimler | 96.56% | 3 |
results/tr_mmlu_summary.json
ADDED
@@ -0,0 +1,16 @@

{
  "n_samples": 40966,
  "orig_tr_pct": 75.5727,
  "orig_pure_pct": 60.499,
  "enh_tr_pct": 92.6413,
  "enh_pure_pct": 68.3004,
  "delta_tr_pct": 17.0686,
  "delta_pure_pct": 7.8014,
  "orig_avg_tokens": 84.75,
  "enh_avg_tokens": 75.38,
  "orig_avg_bpe": 21.95,
  "enh_avg_bpe": 5.66,
  "pct_improved": 100.0,
  "pct_regressed": 0.0,
  "pct_unchanged": 0.0
}
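Editor's note: a quick internal-consistency check on the summary above — the delta fields should equal enhanced minus original, which they do (92.6413 - 75.5727 = 17.0686 and 68.3004 - 60.499 = 7.8014).

import json

with open("results/tr_mmlu_summary.json", encoding="utf-8") as f:
    s = json.load(f)

assert round(s["enh_tr_pct"] - s["orig_tr_pct"], 4) == s["delta_tr_pct"]
assert round(s["enh_pure_pct"] - s["orig_pure_pct"], 4) == s["delta_pure_pct"]
print("summary deltas are internally consistent")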
test_lattice.py
ADDED
@@ -0,0 +1,72 @@

#!/usr/bin/env python3
"""
Test / demo script for the Morphological Lattice API.

Shows how ambiguous Turkish words like "evin" and "gelir" produce
multiple alternative analyses in the lattice.
"""

import json
from nedo_turkish_tokenizer import NedoTurkishTokenizer


def section(title: str) -> None:
    print(f"\n{'─' * 60}")
    print(f"  {title}")
    print(f"{'─' * 60}")


def main() -> None:
    tok = NedoTurkishTokenizer()

    # ── 1. Single-word lattice test ───────────────────────────────────────
    section("1) get_morphological_lattice – tek kelime örnekleri")

    test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
    for word in test_words:
        lattice = tok.get_morphological_lattice(word)
        print(f"\n▸ \"{word}\" → {len(lattice)} analiz:")
        for i, entry in enumerate(lattice):
            print(f"   [{i}] root={entry['root']:<12} "
                  f"pos={entry['pos']:<6} "
                  f"suffixes={entry['suffixes']}")
            print(f"       lexical_form = {entry['lexical_form']}")

    # ── 2. Unknown word (empty lattice) ───────────────────────────────────
    section("2) Bilinmeyen / yabancı kelime – boş lattice")

    unknown_words = ["xyzfoo", "meeting", "blockchain"]
    for word in unknown_words:
        lattice = tok.get_morphological_lattice(word)
        print(f"   \"{word}\" → lattice boş mu? {len(lattice) == 0} (len={len(lattice)})")

    # ── 3. tokenize_lattice – sentence-level test ─────────────────────────
    section("3) tokenize_lattice – cümle testi")

    sentences = [
        "Evin güzel gelir",
        "Çocuk okula koşar adım gitti",
        "Yüz yıllık çınar",
    ]

    for sent in sentences:
        print(f"\n▸ Input: \"{sent}\"")
        result = tok.tokenize_lattice(sent)
        for winfo in result["words"]:
            n = len(winfo["lattice"])
            print(f"   {winfo['word']:<16} → {n} analiz(ler)")
            for entry in winfo["lattice"]:
                print(f"      root={entry['root']:<12} pos={entry['pos']:<6} "
                      f"suffixes={entry['suffixes']}")

    # ── 4. JSON output format ─────────────────────────────────────────────
    section("4) tokenize_lattice JSON çıktı")

    data = tok.tokenize_lattice("evin gelir")
    print(json.dumps(data, ensure_ascii=False, indent=2))

    print("\n✓ Tüm testler başarıyla tamamlandı.")


if __name__ == "__main__":
    main()