zemberek cleanup
Browse files

- .claude/settings.local.json +4 -1
- hf_benchmark.py +324 -0
- nedo_turkish_tokenizer/data/zemberek-full.jar +0 -3
- paper_baseline_check.py +103 -0
- results/tr_mmlu_per_sample.csv +0 -0
- results/tr_mmlu_report.md +85 -0
- results/tr_mmlu_summary.json +16 -0
.claude/settings.local.json
CHANGED

```diff
@@ -9,7 +9,10 @@
       "Bash(huggingface-cli whoami:*)",
       "Bash(python3:*)",
       "Bash(git lfs:*)",
-      "Bash(sudo apt-get:*)"
+      "Bash(sudo apt-get:*)",
+      "Bash(grep:*)",
+      "Bash(git mv:*)",
+      "Bash(git commit:*)"
     ]
   }
 }
```
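For orientation, the merged allow-list after this change would look roughly like the sketch below, assuming the usual Claude Code settings.local.json shape (only the entries visible in the diff are shown; any other keys in the real file are omitted):

```json
{
  "permissions": {
    "allow": [
      "Bash(huggingface-cli whoami:*)",
      "Bash(python3:*)",
      "Bash(git lfs:*)",
      "Bash(sudo apt-get:*)",
      "Bash(grep:*)",
      "Bash(git mv:*)",
      "Bash(git commit:*)"
    ]
  }
}
```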
hf_benchmark.py
ADDED (324 lines)
```python
"""
hf_benchmark.py
---------------
Benchmarks NedoTurkishTokenizer on TR-MMLU.
Pulls both the tokenizer and the dataset from HuggingFace.

Usage:
    cd NedoTurkishTokenizer/
    pip install huggingface_hub datasets
    python hf_benchmark.py

    # Only 2000 samples (quick test):
    python hf_benchmark.py --samples 2000

    # A specific category:
    python hf_benchmark.py --category TUS
"""

import argparse
import csv
import json
import os
import sys
from collections import defaultdict
from pathlib import Path

# ── HuggingFace token ──────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "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")

# ── Arguments ──────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser()
parser.add_argument("--samples", default="all", help="How many samples ('all' or a number)")
parser.add_argument("--category", default=None, help="Category filter (e.g. TUS)")
parser.add_argument("--out", default=".", help="Output directory")
args = parser.parse_args()

OUT = Path(args.out)
OUT.mkdir(parents=True, exist_ok=True)

# ── HuggingFace login ──────────────────────────────────────────────────────────
from huggingface_hub import login
login(token=HF_TOKEN, add_to_git_credential=False)
print("HuggingFace login ✓")

# ── Load the tokenizer ─────────────────────────────────────────────────────────
print("Loading NedoTurkishTokenizer...")
sys.path.insert(0, str(Path(__file__).parent))
from nedo_turkish_tokenizer import NedoTurkishTokenizer
tok = NedoTurkishTokenizer()
print(f"  Zemberek: {'✓' if tok.zemberek_available else '✗ (disabled)'}")

# ── Load the dataset ───────────────────────────────────────────────────────────
print("Loading TR-MMLU dataset...")
from datasets import load_dataset
ds = load_dataset(
    "alibayram/turkish_mmlu",
    split="test",
    token=HF_TOKEN,
)
print(f"  {len(ds)} samples")

# ── Category filter ────────────────────────────────────────────────────────────
samples = list(ds)
if args.category:
    samples = [r for r in samples
               if str(r.get("bolum", "")).strip() == args.category]
    print(f"  Category '{args.category}': {len(samples)} samples")

if args.samples != "all":
    n = int(args.samples)
    samples = samples[:n]
    print(f"  Capped at: {len(samples)} samples")

# ── Join the text fields ───────────────────────────────────────────────────────
def get_text(row: dict) -> str:
    parts = []
    for field in ["soru", "question"]:
        if row.get(field):
            parts.append(str(row[field]))
            break
    choices = row.get("secenekler") or []
    if isinstance(choices, list):
        parts.extend(str(c) for c in choices)
    for lbl in ["A", "B", "C", "D"]:
        if row.get(lbl):
            parts.append(str(row[lbl]))
    if row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    return " ".join(parts)

# ── Token statistics ───────────────────────────────────────────────────────────
def token_stats(tokens: list[dict]) -> dict:
    """Use NedoTurkishTokenizer's token_type field."""
    total = len(tokens)
    if total == 0:
        # Same keys as the non-empty case, so downstream lookups never fail.
        return {"total": 0, "roots": 0, "suffixes": 0, "foreign": 0, "bpe": 0,
                "punct": 0, "special": 0, "tr_pct": 0.0, "pure_pct": 0.0}

    roots    = sum(1 for t in tokens if t["token_type"] == "ROOT")
    suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
    foreign  = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
    punct    = sum(1 for t in tokens if t["token_type"] == "PUNCT")
    bpe      = sum(1 for t in tokens if t["token_type"] == "BPE")
    special  = sum(1 for t in tokens
                   if t["token_type"] in ("NUM", "DATE", "UNIT", "URL",
                                          "MENTION", "HASHTAG", "EMOJI", "ACRONYM"))
    tr = roots + suffixes + foreign + punct + special
    pure = sum(1 for t in tokens
               if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
               and not t["token"].strip().startswith("<"))
    return {
        "total": total,
        "roots": roots,
        "suffixes": suffixes,
        "foreign": foreign,
        "bpe": bpe,
        "punct": punct,
        "special": special,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

def orig_stats(tokens: list[dict]) -> dict:
    """Statistics for the original tokenizer (for comparison)."""
    total = len(tokens)
    if total == 0:
        return {"total": 0, "bpe": 0, "tr_pct": 0.0, "pure_pct": 0.0}
    roots    = sum(1 for t in tokens if t.get("type") == "ROOT")
    suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
    bpe      = sum(1 for t in tokens if t.get("type") == "BPE")
    tr = roots + suffixes
    pure = sum(1 for t in tokens
               if t.get("type") in ("ROOT", "SUFFIX")
               and not t.get("token", "").strip().startswith("<"))
    return {
        "total": total,
        "bpe": bpe,
        "tr_pct": round(tr / total * 100, 4),
        "pure_pct": round(pure / total * 100, 4),
    }

# ── Main benchmark loop ────────────────────────────────────────────────────────
print(f"\nStarting benchmark: {len(samples)} samples...")

per_sample = []
orig_tr_sum = 0.0
enh_tr_sum = 0.0
orig_pur_sum = 0.0
enh_pur_sum = 0.0
orig_tok_sum = 0
enh_tok_sum = 0
orig_bpe_sum = 0
enh_bpe_sum = 0
improved = 0
regressed = 0
unchanged = 0

REPORT_EVERY = 500

for idx, row in enumerate(samples):
    text = get_text(row)
    if not text.strip():
        continue

    # Original tokenizer
    orig_toks = tok._base.tokenize_text(text)
    os_ = orig_stats(orig_toks)

    # NedoTurkishTokenizer
    enh_toks = tok.tokenize(text)
    es_ = token_stats(enh_toks)

    d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)

    per_sample.append({
        "idx": idx,
        "bolum": str(row.get("bolum", "")),
        "orig_tr": os_["tr_pct"],
        "enh_tr": es_["tr_pct"],
        "d_tr": d_tr,
        "orig_pure": os_["pure_pct"],
        "enh_pure": es_["pure_pct"],
        "orig_tok": os_["total"],
        "enh_tok": es_["total"],
        "orig_bpe": os_["bpe"],
        "enh_bpe": es_["bpe"],
    })

    orig_tr_sum += os_["tr_pct"]
    enh_tr_sum += es_["tr_pct"]
    orig_pur_sum += os_["pure_pct"]
    enh_pur_sum += es_["pure_pct"]
    orig_tok_sum += os_["total"]
    enh_tok_sum += es_["total"]
    orig_bpe_sum += os_["bpe"]
    enh_bpe_sum += es_["bpe"]

    if d_tr > 0:
        improved += 1
    elif d_tr < 0:
        regressed += 1
    else:
        unchanged += 1

    if (idx + 1) % REPORT_EVERY == 0:
        n = idx + 1
        print(f"  [{n:>6}/{len(samples)}] "
              f"TR%: {enh_tr_sum/n:.2f}% "
              f"BPE/sample: {enh_bpe_sum/n:.2f} "
              f"Regressed: {regressed}")

n = len(per_sample)
if n == 0:
    print("No samples were processed!")
    sys.exit(1)

# ── Summary ────────────────────────────────────────────────────────────────────
summary = {
    "n_samples": n,
    "orig_tr_pct": round(orig_tr_sum / n, 4),
    "orig_pure_pct": round(orig_pur_sum / n, 4),
    "enh_tr_pct": round(enh_tr_sum / n, 4),
    "enh_pure_pct": round(enh_pur_sum / n, 4),
    "delta_tr_pct": round((enh_tr_sum - orig_tr_sum) / n, 4),
    "delta_pure_pct": round((enh_pur_sum - orig_pur_sum) / n, 4),
    "orig_avg_tokens": round(orig_tok_sum / n, 2),
    "enh_avg_tokens": round(enh_tok_sum / n, 2),
    "orig_avg_bpe": round(orig_bpe_sum / n, 2),
    "enh_avg_bpe": round(enh_bpe_sum / n, 2),
    "pct_improved": round(improved / n * 100, 2),
    "pct_regressed": round(regressed / n * 100, 2),
    "pct_unchanged": round(unchanged / n * 100, 2),
}

# ── Per-category breakdown ─────────────────────────────────────────────────────
cat_scores = defaultdict(list)
for row in per_sample:
    cat_scores[row["bolum"]].append(row["enh_tr"])

cat_summary = {
    cat: round(sum(v) / len(v), 2)
    for cat, v in cat_scores.items()
    if len(v) >= 3
}
cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])

# ── Print the report ───────────────────────────────────────────────────────────
SEP = "═" * 65
print(f"\n{SEP}")
print("  NedoTurkishTokenizer — TR-MMLU Benchmark Results")
print(SEP)
print(f"  N = {n:,} samples\n")
print(f"  {'Metric':30s} {'Original':>10} {'Enhanced':>10} {'Δ':>8}")
print("  " + "─" * 55)
for label, orig, enh, delta in [
    ("TR%", summary["orig_tr_pct"], summary["enh_tr_pct"], summary["delta_tr_pct"]),
    ("Pure%", summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
    ("Avg tokens/sample", summary["orig_avg_tokens"], summary["enh_avg_tokens"],
     round(summary["enh_avg_tokens"] - summary["orig_avg_tokens"], 2)),
    ("Avg BPE/sample", summary["orig_avg_bpe"], summary["enh_avg_bpe"],
     round(summary["enh_avg_bpe"] - summary["orig_avg_bpe"], 2)),
]:
    print(f"  {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")

print(f"\n  Improved : {improved:>6,} ({summary['pct_improved']:.2f}%)")
print(f"  Regressed: {regressed:>6,} ({summary['pct_regressed']:.2f}%)")
print(f"  Unchanged: {unchanged:>6,} ({summary['pct_unchanged']:.2f}%)")

print("\n  Lowest TR% categories:")
for cat, avg in cat_sorted[:10]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print("\n  Highest TR% categories:")
for cat, avg in cat_sorted[-8:]:
    n_cat = len(cat_scores[cat])
    print(f"    {cat:<35} {avg:>6.2f}% (n={n_cat})")

print(SEP)

# ── Write the output files ─────────────────────────────────────────────────────
# Summary JSON
summary_path = OUT / "tr_mmlu_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)
print(f"\n  ✓ {summary_path}")

# Markdown report
paper_tr = 90.29
paper_pur = 85.80
report_path = OUT / "tr_mmlu_report.md"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("# TR-MMLU Benchmark — NedoTurkishTokenizer\n\n")
    f.write(f"**N = {n:,} samples**\n\n")
    f.write("## Main Metrics\n\n")
    f.write("| Metric | Original | Enhanced | Δ |\n")
    f.write("|--------|----------|----------|---|\n")
    f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
    f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
    f.write(f"| Avg tokens/sample | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
    f.write(f"| Avg BPE/sample | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
    f.write("\n## Comparison with the Paper\n\n")
    f.write("| Metric | Paper (original) | Ours (original) | NedoTurkishTokenizer |\n")
    f.write("|--------|-----------------|-----------------|---------------------|\n")
    f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
    f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
    f.write("\n## Sample Distribution\n\n")
    f.write(f"- Improved: {improved:,} ({summary['pct_improved']:.2f}%)\n")
    f.write(f"- Regressed: {regressed:,} ({summary['pct_regressed']:.2f}%)\n")
    f.write(f"- Unchanged: {unchanged:,} ({summary['pct_unchanged']:.2f}%)\n")
    f.write("\n## TR% by Category\n\n")
    f.write("| Category | TR% | N |\n")
    f.write("|----------|-----|---|\n")
    for cat, avg in cat_sorted:
        n_cat = len(cat_scores[cat])
        f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
print(f"  ✓ {report_path}")

# Per-sample CSV
csv_path = OUT / "tr_mmlu_per_sample.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
    writer.writeheader()
    writer.writerows(per_sample)
print(f"  ✓ {csv_path}")

print("\nDone.\n")
```
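To make the TR%/Pure% arithmetic concrete, here is a minimal worked example of the same counting logic on a hand-written token list (the tokens and their types are hypothetical; real ones come from NedoTurkishTokenizer):

```python
# Hypothetical token stream; token_type values follow the buckets counted
# in token_stats above (everything except BPE counts toward TR%).
tokens = [
    {"token": "ev",  "token_type": "ROOT"},    # Turkish root
    {"token": "de",  "token_type": "SUFFIX"},  # locative suffix
    {"token": "yim", "token_type": "SUFFIX"},  # copular suffix
    {"token": ":)",  "token_type": "EMOJI"},   # one of the "special" types
    {"token": "qx",  "token_type": "BPE"},     # byte-pair fallback piece
]

total = len(tokens)
tr    = sum(1 for t in tokens if t["token_type"] != "BPE")                           # 4
pure  = sum(1 for t in tokens if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN"))   # 3

print(f"TR% = {tr/total*100:.0f}%, Pure% = {pure/total*100:.0f}%")  # TR% = 80%, Pure% = 60%
```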
nedo_turkish_tokenizer/data/zemberek-full.jar
DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
-size 31644792
```

The deleted file was only a Git LFS pointer; the ~31.6 MB Zemberek jar it referenced lived in LFS storage, so this commit drops the jar from the package data.
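For reference, a small sketch of how an LFS pointer decomposes into fields (the parser is illustrative, not something in this repo):

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),  # bytes of the real object in LFS storage
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
size 31644792"""

info = parse_lfs_pointer(pointer)
print(f"{info['size']/1e6:.1f} MB object, oid {info['oid'][:12]}…")  # 31.6 MB
```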
paper_baseline_check.py
ADDED (103 lines)
```python
"""
paper_baseline_check.py
-----------------------
Why do we measure the paper's 90.29% baseline as 75.57%?
Shows the difference between the two methodologies concretely.

Usage:
    cd NedoTurkishTokenizer/
    python paper_baseline_check.py
"""

import os

from datasets import load_dataset
from huggingface_hub import login
from turkish_tokenizer import TurkishTokenizer

HF_TOKEN = os.environ.get("HF_TOKEN", "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")
login(token=HF_TOKEN, add_to_git_credential=False)

base = TurkishTokenizer()

print("Loading TR-MMLU...")
ds = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
rows = list(ds)
print(f"{len(rows)} samples\n")

def orig_tr_pct(tokens):
    total = len(tokens)
    if total == 0:
        return 0.0
    tr = sum(1 for t in tokens if t["type"] in ("ROOT", "SUFFIX"))
    return tr / total * 100

def build_text(row, with_choices, with_aciklama):
    """Assemble the benchmark text for one row under a given methodology."""
    parts = []
    for f in ["soru", "question"]:
        if row.get(f):
            parts.append(str(row[f]))
            break
    if with_choices:
        choices = row.get("secenekler") or []
        if isinstance(choices, list):
            parts.extend(str(c) for c in choices)
    if with_aciklama and row.get("aciklama"):
        parts.append(str(row["aciklama"]))
    return " ".join(parts)

def avg_tr(with_choices, with_aciklama):
    total, count = 0.0, 0
    for row in rows:
        text = build_text(row, with_choices, with_aciklama)
        if not text.strip():
            continue
        total += orig_tr_pct(base.tokenize_text(text))
        count += 1  # average only over rows that actually produced text
    return total / count

# ── Methodology A: question only (what the paper does) ─────────────────────────
a_avg = avg_tr(with_choices=False, with_aciklama=False)

# ── Methodology B: question + choices (partial) ────────────────────────────────
b_avg = avg_tr(with_choices=True, with_aciklama=False)

# ── Methodology C: question + choices + explanation (full) ─────────────────────
c_avg = avg_tr(with_choices=True, with_aciklama=True)

# ── What is actually in the choices? ───────────────────────────────────────────
print("Choice samples from the first 20 questions:")
for row in rows[:20]:
    choices = row.get("secenekler") or []
    if isinstance(choices, list) and choices:
        sample = " | ".join(str(c)[:20] for c in choices[:4])
        print(f"  {sample}")

print(f"""
{'='*60}
METHODOLOGY COMPARISON (TurkishTokenizer baseline)
{'='*60}

A) Question field only (the paper's method):  {a_avg:.2f}%
B) Question + choices (partial):              {b_avg:.2f}%
C) Question + choices + expl. (our method):   {c_avg:.2f}%

Paper value:  90.29%
Our A value:  {a_avg:.2f}%  ← gap vs paper: {a_avg-90.29:+.2f}
Our C value:  {c_avg:.2f}%  ← this is what we use

Bottom line: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} point gap.
The gap comes from abbreviations and foreign terms in the choices.
{'='*60}
""")
```
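A toy illustration of why the choice of methodology moves the number so much (the row below is made up; the real effect comes from the dataset itself):

```python
# Hypothetical TUS-style row: the question is plain Turkish, while the
# choices are Latin medical terms that mostly fall back to BPE pieces.
row = {
    "soru": "Hangisi en olası etkendir?",
    "secenekler": ["Pneumocystis jirovecii", "Mycobacterium tuberculosis",
                   "Candida albicans", "Aspergillus fumigatus"],
}

text_a = row["soru"]                                  # methodology A: question only
text_c = " ".join([row["soru"], *row["secenekler"]])  # methodology C: question + choices

# Tokenizing text_a yields almost entirely ROOT/SUFFIX tokens (high TR%);
# text_c adds foreign terms that the baseline can only BPE-split, so the
# same tokenizer scores lower on the same question under methodology C.
```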
results/tr_mmlu_per_sample.csv
ADDED

The diff for this file is too large to render. See the raw diff.
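The columns match the per_sample dict written by hf_benchmark.py: idx, bolum, orig_tr, enh_tr, d_tr, orig_pure, enh_pure, orig_tok, enh_tok, orig_bpe, enh_bpe. A quick way to slice the file, assuming pandas is available:

```python
import pandas as pd

df = pd.read_csv("results/tr_mmlu_per_sample.csv")

# Largest per-sample drops first (d_tr = enh_tr - orig_tr)
print(df.nsmallest(10, "d_tr")[["idx", "bolum", "orig_tr", "enh_tr", "d_tr"]])

# Mean enhanced TR% per category, ascending — should mirror the report's table
print(df.groupby("bolum")["enh_tr"].mean().sort_values().head(10))
```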
results/tr_mmlu_report.md
ADDED (85 lines)
```markdown
# TR-MMLU Benchmark — NedoTurkishTokenizer

**N = 40,966 samples**

## Main Metrics

| Metric | Original | Enhanced | Δ |
|--------|----------|----------|---|
| TR% | 75.57% | 92.64% | +17.07% |
| Pure% | 60.50% | 68.30% | +7.80% |
| Avg tokens/sample | 84.75 | 75.38 | -9.37 |
| Avg BPE/sample | 21.95 | 5.66 | -16.29 |

## Comparison with the Paper

| Metric | Paper (original) | Ours (original) | NedoTurkishTokenizer |
|--------|-----------------|-----------------|---------------------|
| TR% | 90.29% | 75.57% | **92.64%** |
| Pure% | 85.8% | 60.50% | **68.30%** |

## Sample Distribution

- Improved: 40,964 (100.00%)
- Regressed: 0 (0.00%)
- Unchanged: 2 (0.00%)

## TR% by Category

| Category | TR% | N |
|----------|-----|---|
| TUS | 80.25% | 1510 |
| Turizm ve Otel İşletmeciliği | 82.30% | 185 |
| Futbol | 89.63% | 283 |
| DHBT | 90.72% | 336 |
| Genel Kültür | 91.16% | 84 |
| Tarım | 91.41% | 586 |
| Siyer | 91.45% | 1067 |
| Laborant ve Veteriner Sağlık | 91.91% | 1485 |
| YGS Denemeleri | 91.92% | 4154 |
| Kim 500 Milyar İster | 92.06% | 12968 |
| Tarih | 93.11% | 571 |
| Parakende Satış ve Mağaza Yöneticiliği | 93.38% | 255 |
| Dini Bilgiler | 93.49% | 122 |
| Kültürel Miras ve Turizm | 93.76% | 859 |
| Uluslar Arası İlişkiler | 93.84% | 117 |
| Radyo ve Televizyon Programcılığı | 93.88% | 531 |
| Okul Öncesi Öğretmenliği | 93.92% | 9 |
| İktisat | 94.01% | 346 |
| Yaşlı Bakımı | 94.29% | 214 |
| Özel Koruma ve Güvenlik | 94.32% | 759 |
| Tıbbi Dökümantasyon ve Sekreterlik | 94.33% | 63 |
| Türk Dili ve Edebiyatı | 94.33% | 340 |
| KPSS Denemeleri | 94.33% | 2674 |
| Yönetim Bİlişim Sistemleri | 94.40% | 294 |
| Felsefe | 94.43% | 502 |
| Ehliyet | 94.44% | 271 |
| Sosyal Hizmet | 94.56% | 64 |
| Çocuk Gelişimi | 94.58% | 2597 |
| KPSS | 94.60% | 299 |
| Sosyal Hizmetler | 94.61% | 935 |
| Sağlık Yönetimi | 94.63% | 313 |
| Halkla İlişkiler ve Reklamcılık | 94.70% | 158 |
| İnsan Kaynakları Yönetimi | 94.92% | 237 |
| Lojistik | 94.99% | 474 |
| Menkul Kıymetler ve Sermaye Piyasası | 95.04% | 435 |
| Konaklama İşletmeciliği | 95.04% | 13 |
| Medya ve İletişim | 95.05% | 898 |
| Sosyoloji | 95.06% | 393 |
| İşletme Yönetimi | 95.20% | 501 |
| Sağlık Kurumları İşletmeciliği | 95.26% | 575 |
| Turizm ve Seyehat Hizmetleri | 95.30% | 70 |
| Dış Ticaret | 95.36% | 1173 |
| Muhasebe ve Vergi Uygulamaları | 95.49% | 474 |
| Spor Yönetimi | 95.49% | 80 |
| Marka İletişimi | 95.62% | 180 |
| Yerel Yönetimler | 95.67% | 196 |
| Acil Durum ve Afet Yönetimi | 95.67% | 33 |
| Maliye | 95.68% | 13 |
| AUZEF | 95.82% | 47 |
| Kamu Yönetimi | 95.92% | 54 |
| Uluslararası Ticaret ve Lojistik Yönetimi | 95.95% | 65 |
| Havacılık Yönetimi | 96.06% | 15 |
| İşletme | 96.09% | 24 |
| Çalışma Ekonomisi ve Endüstri İlişkileri | 96.42% | 62 |
| Sosyal Bilimler | 96.56% | 3 |
```
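A quick arithmetic pass over the headline numbers above (plain Python, nothing repo-specific):

```python
# Relative changes implied by the Main Metrics table.
orig_bpe, enh_bpe = 21.95, 5.66
orig_tok, enh_tok = 84.75, 75.38

print(f"BPE tokens/sample drop by {(1 - enh_bpe/orig_bpe)*100:.1f}%")    # ≈ 74.2%
print(f"Total tokens/sample drop by {(1 - enh_tok/orig_tok)*100:.1f}%")  # ≈ 11.1%
```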
results/tr_mmlu_summary.json
ADDED (16 lines)
```json
{
  "n_samples": 40966,
  "orig_tr_pct": 75.5727,
  "orig_pure_pct": 60.499,
  "enh_tr_pct": 92.6413,
  "enh_pure_pct": 68.3004,
  "delta_tr_pct": 17.0686,
  "delta_pure_pct": 7.8014,
  "orig_avg_tokens": 84.75,
  "enh_avg_tokens": 75.38,
  "orig_avg_bpe": 21.95,
  "enh_avg_bpe": 5.66,
  "pct_improved": 100.0,
  "pct_regressed": 0.0,
  "pct_unchanged": 0.0
}
```
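The delta fields are internally consistent with the original/enhanced values; a two-assert check (reading the file exactly as hf_benchmark.py wrote it):

```python
import json

with open("results/tr_mmlu_summary.json", encoding="utf-8") as f:
    s = json.load(f)

# delta_* should equal enhanced minus original, up to rounding
assert abs(s["delta_tr_pct"]   - (s["enh_tr_pct"]   - s["orig_tr_pct"]))   < 1e-3
assert abs(s["delta_pure_pct"] - (s["enh_pure_pct"] - s["orig_pure_pct"])) < 1e-3
print("summary deltas check out")
```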