ronnengmail commited on
Commit
e06a0c9
·
verified ·
1 Parent(s): cbe7577

Upload exp_c_tokenizer_ablation.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. exp_c_tokenizer_ablation.json +203 -0
exp_c_tokenizer_ablation.json ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "experiment": "C_tokenizer_ablation",
3
+ "timestamp": "2026-04-13 09:30:52 UTC",
4
+ "tokenizers": {
5
+ "Ours-32K": {
6
+ "vocab_size": 32000
7
+ },
8
+ "Llama-2": {
9
+ "vocab_size": 32000
10
+ },
11
+ "HebrewGPT": {
12
+ "vocab_size": 32000
13
+ }
14
+ },
15
+ "vocabulary_composition": {
16
+ "Ours-32K": {
17
+ "Arabic": {
18
+ "count": 14945,
19
+ "pct": 46.7
20
+ },
21
+ "Hebrew": {
22
+ "count": 8888,
23
+ "pct": 27.8
24
+ },
25
+ "Latin": {
26
+ "count": 7778,
27
+ "pct": 24.3
28
+ },
29
+ "Other": {
30
+ "count": 278,
31
+ "pct": 0.9
32
+ },
33
+ "Digit": {
34
+ "count": 110,
35
+ "pct": 0.3
36
+ },
37
+ "Special": {
38
+ "count": 1,
39
+ "pct": 0.0
40
+ }
41
+ },
42
+ "Llama-2": {
43
+ "Latin": {
44
+ "count": 25900,
45
+ "pct": 80.9
46
+ },
47
+ "Other": {
48
+ "count": 5848,
49
+ "pct": 18.3
50
+ },
51
+ "Digit": {
52
+ "count": 133,
53
+ "pct": 0.4
54
+ },
55
+ "Arabic": {
56
+ "count": 54,
57
+ "pct": 0.2
58
+ },
59
+ "Hebrew": {
60
+ "count": 36,
61
+ "pct": 0.1
62
+ },
63
+ "Special": {
64
+ "count": 16,
65
+ "pct": 0.1
66
+ },
67
+ "Space": {
68
+ "count": 13,
69
+ "pct": 0.0
70
+ }
71
+ },
72
+ "HebrewGPT": {
73
+ "Hebrew": {
74
+ "count": 23101,
75
+ "pct": 72.2
76
+ },
77
+ "Other": {
78
+ "count": 6399,
79
+ "pct": 20.0
80
+ },
81
+ "Latin": {
82
+ "count": 2238,
83
+ "pct": 7.0
84
+ },
85
+ "Arabic": {
86
+ "count": 137,
87
+ "pct": 0.4
88
+ },
89
+ "Digit": {
90
+ "count": 124,
91
+ "pct": 0.4
92
+ },
93
+ "Special": {
94
+ "count": 1,
95
+ "pct": 0.0
96
+ }
97
+ }
98
+ },
99
+ "fertility": {
100
+ "en": {
101
+ "Ours-32K": {
102
+ "fertility": 1.544,
103
+ "bytes_per_token": 3.71,
104
+ "total_tokens": 2785,
105
+ "total_bytes": 10322
106
+ },
107
+ "Llama-2": {
108
+ "fertility": 1.51,
109
+ "bytes_per_token": 3.79,
110
+ "total_tokens": 2724,
111
+ "total_bytes": 10322
112
+ },
113
+ "HebrewGPT": {
114
+ "fertility": 2.419,
115
+ "bytes_per_token": 2.37,
116
+ "total_tokens": 4364,
117
+ "total_bytes": 10322
118
+ }
119
+ },
120
+ "he": {
121
+ "Ours-32K": {
122
+ "fertility": 1.343,
123
+ "bytes_per_token": 5.12,
124
+ "total_tokens": 8866,
125
+ "total_bytes": 45378
126
+ },
127
+ "Llama-2": {
128
+ "fertility": 3.909,
129
+ "bytes_per_token": 1.76,
130
+ "total_tokens": 25806,
131
+ "total_bytes": 45378
132
+ },
133
+ "HebrewGPT": {
134
+ "fertility": 1.255,
135
+ "bytes_per_token": 5.48,
136
+ "total_tokens": 8283,
137
+ "total_bytes": 45378
138
+ }
139
+ },
140
+ "ar": {
141
+ "Ours-32K": {
142
+ "fertility": 2.222,
143
+ "bytes_per_token": 3.48,
144
+ "total_tokens": 7776,
145
+ "total_bytes": 27023
146
+ },
147
+ "Llama-2": {
148
+ "fertility": 4.363,
149
+ "bytes_per_token": 1.77,
150
+ "total_tokens": 15266,
151
+ "total_bytes": 27023
152
+ },
153
+ "HebrewGPT": {
154
+ "fertility": 4.154,
155
+ "bytes_per_token": 1.86,
156
+ "total_tokens": 14535,
157
+ "total_bytes": 27023
158
+ }
159
+ },
160
+ "fa": {
161
+ "Ours-32K": {
162
+ "fertility": 1.52,
163
+ "bytes_per_token": 5.72,
164
+ "total_tokens": 5302,
165
+ "total_bytes": 30327
166
+ },
167
+ "Llama-2": {
168
+ "fertility": 4.876,
169
+ "bytes_per_token": 1.78,
170
+ "total_tokens": 17014,
171
+ "total_bytes": 30327
172
+ },
173
+ "HebrewGPT": {
174
+ "fertility": 4.508,
175
+ "bytes_per_token": 1.93,
176
+ "total_tokens": 15727,
177
+ "total_bytes": 30327
178
+ }
179
+ }
180
+ },
181
+ "efficiency": {
182
+ "en": {
183
+ "Ours-32K": 0.2698,
184
+ "Llama-2": 0.2639,
185
+ "HebrewGPT": 0.4228
186
+ },
187
+ "he": {
188
+ "Ours-32K": 0.1954,
189
+ "Llama-2": 0.5687,
190
+ "HebrewGPT": 0.1825
191
+ },
192
+ "ar": {
193
+ "Ours-32K": 0.2878,
194
+ "Llama-2": 0.5649,
195
+ "HebrewGPT": 0.5379
196
+ },
197
+ "fa": {
198
+ "Ours-32K": 0.1748,
199
+ "Llama-2": 0.561,
200
+ "HebrewGPT": 0.5186
201
+ }
202
+ }
203
+ }