Shree10 committed on
Commit
f259577
·
verified ·
1 Parent(s): 22a0251

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -65,3 +65,5 @@ results/lb_comparison/expert_balance_comparison.png filter=lfs diff=lfs merge=lf
65
  results/lb_comparison/loss_comparison.png filter=lfs diff=lfs merge=lfs -text
66
  results/moe_hash/moe_hash_test_predictions.json filter=lfs diff=lfs merge=lfs -text
67
  results/moe_hash/training_curves.png filter=lfs diff=lfs merge=lfs -text
 
 
 
65
  results/lb_comparison/loss_comparison.png filter=lfs diff=lfs merge=lfs -text
66
  results/moe_hash/moe_hash_test_predictions.json filter=lfs diff=lfs merge=lfs -text
67
  results/moe_hash/training_curves.png filter=lfs diff=lfs merge=lfs -text
68
+ results/moe_token_choice/moe_token_choice_test_predictions.json filter=lfs diff=lfs merge=lfs -text
69
+ results/moe_token_choice/training_curves.png filter=lfs diff=lfs merge=lfs -text
results/moe_hash_with_lb/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec6145602df946505b720c0f174564accf1b6eabeee36fb85c86008af9ed553
3
+ size 2159273731
results/moe_hash_with_lb/training_metrics.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_losses": [
3
+ 7.284618572235107,
4
+ 6.295677210235596,
5
+ 5.8652772613525395,
6
+ 5.56426231765747,
7
+ 5.3282255226135256
8
+ ],
9
+ "train_nll_losses": [
10
+ 7.188395218658448,
11
+ 6.206792694091797,
12
+ 5.777505061340332,
13
+ 5.47729527053833,
14
+ 5.241658354949951
15
+ ],
16
+ "train_lb_losses": [
17
+ 0.0962233625292778,
18
+ 0.08888451331853867,
19
+ 0.08777220013141632,
20
+ 0.08696705160140991,
21
+ 0.08656716544628143
22
+ ],
23
+ "val_losses": [
24
+ 6.504456277877566,
25
+ 5.991641816638765,
26
+ 5.692757720039005,
27
+ 5.5011179106576105,
28
+ 5.334456035069057
29
+ ],
30
+ "use_load_balancer": true,
31
+ "expert_balance_history": [
32
+ {
33
+ "epoch": 1,
34
+ "cv_scores": {
35
+ "encoder_layer_0": 0.9899597063375694,
36
+ "encoder_layer_1": 0.7990518141604487,
37
+ "encoder_layer_2": 0.5562532583831334,
38
+ "encoder_layer_3": 0.9468495465629408,
39
+ "decoder_layer_0": 0.21338860568053053,
40
+ "decoder_layer_1": 0.434910226517034,
41
+ "decoder_layer_2": 0.9061288024067989,
42
+ "decoder_layer_3": 0.8562114158633968
43
+ }
44
+ },
45
+ {
46
+ "epoch": 2,
47
+ "cv_scores": {
48
+ "encoder_layer_0": 0.9942137591217549,
49
+ "encoder_layer_1": 0.9034253039673751,
50
+ "encoder_layer_2": 0.7761734095185961,
51
+ "encoder_layer_3": 0.9762803535163799,
52
+ "decoder_layer_0": 0.1967641643874591,
53
+ "decoder_layer_1": 0.24658043997511517,
54
+ "decoder_layer_2": 0.9482812902409864,
55
+ "decoder_layer_3": 0.9350409567436199
56
+ }
57
+ },
58
+ {
59
+ "epoch": 3,
60
+ "cv_scores": {
61
+ "encoder_layer_0": 0.9979227118321601,
62
+ "encoder_layer_1": 0.9387816740785491,
63
+ "encoder_layer_2": 0.853512993646018,
64
+ "encoder_layer_3": 0.9866695375658572,
65
+ "decoder_layer_0": 0.20424998009552645,
66
+ "decoder_layer_1": 0.23413879522609943,
67
+ "decoder_layer_2": 0.9657628202085438,
68
+ "decoder_layer_3": 0.9512014746936421
69
+ }
70
+ },
71
+ {
72
+ "epoch": 4,
73
+ "cv_scores": {
74
+ "encoder_layer_0": 1.0002050910590432,
75
+ "encoder_layer_1": 0.9561595695534928,
76
+ "encoder_layer_2": 0.8924241572167828,
77
+ "encoder_layer_3": 0.9920346981354939,
78
+ "decoder_layer_0": 0.21548889996954693,
79
+ "decoder_layer_1": 0.23986793581997143,
80
+ "decoder_layer_2": 0.9749462380467006,
81
+ "decoder_layer_3": 0.9621878323536873
82
+ }
83
+ },
84
+ {
85
+ "epoch": 5,
86
+ "cv_scores": {
87
+ "encoder_layer_0": 1.0016175709974362,
88
+ "encoder_layer_1": 0.9665781082053273,
89
+ "encoder_layer_2": 0.9152427659781934,
90
+ "encoder_layer_3": 0.9951101417418388,
91
+ "decoder_layer_0": 0.2274634215213949,
92
+ "decoder_layer_1": 0.24629436643568103,
93
+ "decoder_layer_2": 0.9802201580055626,
94
+ "decoder_layer_3": 0.9680230320659131
95
+ }
96
+ }
97
+ ]
98
+ }
results/moe_token_choice/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a497228c41fbb0013e1d9f5a1514316e8c7f171f2b28625340468200505c4f06
3
+ size 2159613443
results/moe_token_choice/expert_usage_history.json ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "epoch": 1,
4
+ "usage": {
5
+ "encoder_layer_0": {
6
+ "0": 140993937,
7
+ "1": 135712261,
8
+ "2": 139518919,
9
+ "3": 139655991,
10
+ "4": 136781290,
11
+ "5": 137254831,
12
+ "6": 135476784,
13
+ "7": 137302659
14
+ },
15
+ "encoder_layer_1": {
16
+ "0": 153395951,
17
+ "1": 155997088,
18
+ "2": 135728108,
19
+ "3": 115885780,
20
+ "4": 147534633,
21
+ "5": 113059645,
22
+ "6": 139938622,
23
+ "7": 141156845
24
+ },
25
+ "encoder_layer_2": {
26
+ "0": 135360959,
27
+ "1": 143571480,
28
+ "2": 130426240,
29
+ "3": 138121467,
30
+ "4": 143331476,
31
+ "5": 143239623,
32
+ "6": 143669414,
33
+ "7": 124976013
34
+ },
35
+ "encoder_layer_3": {
36
+ "0": 138108424,
37
+ "1": 143624638,
38
+ "2": 136522770,
39
+ "3": 143495589,
40
+ "4": 136073789,
41
+ "5": 135460995,
42
+ "6": 133884692,
43
+ "7": 135525775
44
+ },
45
+ "decoder_layer_0": {
46
+ "0": 11885357,
47
+ "1": 11801320,
48
+ "2": 13060220,
49
+ "3": 12342825,
50
+ "4": 12361295,
51
+ "5": 13069915,
52
+ "6": 11787056,
53
+ "7": 13554748
54
+ },
55
+ "decoder_layer_1": {
56
+ "0": 11862357,
57
+ "1": 12259029,
58
+ "2": 11620687,
59
+ "3": 14837367,
60
+ "4": 11779914,
61
+ "5": 13099878,
62
+ "6": 11683397,
63
+ "7": 12720107
64
+ },
65
+ "decoder_layer_2": {
66
+ "0": 12201210,
67
+ "1": 12112379,
68
+ "2": 14274767,
69
+ "3": 12285646,
70
+ "4": 12123051,
71
+ "5": 12171337,
72
+ "6": 12135313,
73
+ "7": 12559033
74
+ },
75
+ "decoder_layer_3": {
76
+ "0": 12632673,
77
+ "1": 15462911,
78
+ "2": 11907600,
79
+ "3": 12328118,
80
+ "4": 11866897,
81
+ "5": 11858883,
82
+ "6": 11895272,
83
+ "7": 11910382
84
+ }
85
+ }
86
+ },
87
+ {
88
+ "epoch": 2,
89
+ "usage": {
90
+ "encoder_layer_0": {
91
+ "0": 140993937,
92
+ "1": 135712261,
93
+ "2": 139518919,
94
+ "3": 139655991,
95
+ "4": 136781290,
96
+ "5": 137254831,
97
+ "6": 135476784,
98
+ "7": 137302659
99
+ },
100
+ "encoder_layer_1": {
101
+ "0": 153395951,
102
+ "1": 155997088,
103
+ "2": 135728108,
104
+ "3": 115885780,
105
+ "4": 147534633,
106
+ "5": 113059645,
107
+ "6": 139938622,
108
+ "7": 141156845
109
+ },
110
+ "encoder_layer_2": {
111
+ "0": 135360959,
112
+ "1": 143571480,
113
+ "2": 130426240,
114
+ "3": 138121467,
115
+ "4": 143331476,
116
+ "5": 143239623,
117
+ "6": 143669414,
118
+ "7": 124976013
119
+ },
120
+ "encoder_layer_3": {
121
+ "0": 138108424,
122
+ "1": 143624638,
123
+ "2": 136522770,
124
+ "3": 143495589,
125
+ "4": 136073789,
126
+ "5": 135460995,
127
+ "6": 133884692,
128
+ "7": 135525775
129
+ },
130
+ "decoder_layer_0": {
131
+ "0": 11885357,
132
+ "1": 11801320,
133
+ "2": 13060220,
134
+ "3": 12342825,
135
+ "4": 12361295,
136
+ "5": 13069915,
137
+ "6": 11787056,
138
+ "7": 13554748
139
+ },
140
+ "decoder_layer_1": {
141
+ "0": 11862357,
142
+ "1": 12259029,
143
+ "2": 11620687,
144
+ "3": 14837367,
145
+ "4": 11779914,
146
+ "5": 13099878,
147
+ "6": 11683397,
148
+ "7": 12720107
149
+ },
150
+ "decoder_layer_2": {
151
+ "0": 12201210,
152
+ "1": 12112379,
153
+ "2": 14274767,
154
+ "3": 12285646,
155
+ "4": 12123051,
156
+ "5": 12171337,
157
+ "6": 12135313,
158
+ "7": 12559033
159
+ },
160
+ "decoder_layer_3": {
161
+ "0": 12632673,
162
+ "1": 15462911,
163
+ "2": 11907600,
164
+ "3": 12328118,
165
+ "4": 11866897,
166
+ "5": 11858883,
167
+ "6": 11895272,
168
+ "7": 11910382
169
+ }
170
+ }
171
+ },
172
+ {
173
+ "epoch": 3,
174
+ "usage": {
175
+ "encoder_layer_0": {
176
+ "0": 140993937,
177
+ "1": 135712261,
178
+ "2": 139518919,
179
+ "3": 139655991,
180
+ "4": 136781290,
181
+ "5": 137254831,
182
+ "6": 135476784,
183
+ "7": 137302659
184
+ },
185
+ "encoder_layer_1": {
186
+ "0": 153395951,
187
+ "1": 155997088,
188
+ "2": 135728108,
189
+ "3": 115885780,
190
+ "4": 147534633,
191
+ "5": 113059645,
192
+ "6": 139938622,
193
+ "7": 141156845
194
+ },
195
+ "encoder_layer_2": {
196
+ "0": 135360959,
197
+ "1": 143571480,
198
+ "2": 130426240,
199
+ "3": 138121467,
200
+ "4": 143331476,
201
+ "5": 143239623,
202
+ "6": 143669414,
203
+ "7": 124976013
204
+ },
205
+ "encoder_layer_3": {
206
+ "0": 138108424,
207
+ "1": 143624638,
208
+ "2": 136522770,
209
+ "3": 143495589,
210
+ "4": 136073789,
211
+ "5": 135460995,
212
+ "6": 133884692,
213
+ "7": 135525775
214
+ },
215
+ "decoder_layer_0": {
216
+ "0": 11885357,
217
+ "1": 11801320,
218
+ "2": 13060220,
219
+ "3": 12342825,
220
+ "4": 12361295,
221
+ "5": 13069915,
222
+ "6": 11787056,
223
+ "7": 13554748
224
+ },
225
+ "decoder_layer_1": {
226
+ "0": 11862357,
227
+ "1": 12259029,
228
+ "2": 11620687,
229
+ "3": 14837367,
230
+ "4": 11779914,
231
+ "5": 13099878,
232
+ "6": 11683397,
233
+ "7": 12720107
234
+ },
235
+ "decoder_layer_2": {
236
+ "0": 12201210,
237
+ "1": 12112379,
238
+ "2": 14274767,
239
+ "3": 12285646,
240
+ "4": 12123051,
241
+ "5": 12171337,
242
+ "6": 12135313,
243
+ "7": 12559033
244
+ },
245
+ "decoder_layer_3": {
246
+ "0": 12632673,
247
+ "1": 15462911,
248
+ "2": 11907600,
249
+ "3": 12328118,
250
+ "4": 11866897,
251
+ "5": 11858883,
252
+ "6": 11895272,
253
+ "7": 11910382
254
+ }
255
+ }
256
+ },
257
+ {
258
+ "epoch": 4,
259
+ "usage": {
260
+ "encoder_layer_0": {
261
+ "0": 140993937,
262
+ "1": 135712261,
263
+ "2": 139518919,
264
+ "3": 139655991,
265
+ "4": 136781290,
266
+ "5": 137254831,
267
+ "6": 135476784,
268
+ "7": 137302659
269
+ },
270
+ "encoder_layer_1": {
271
+ "0": 153395951,
272
+ "1": 155997088,
273
+ "2": 135728108,
274
+ "3": 115885780,
275
+ "4": 147534633,
276
+ "5": 113059645,
277
+ "6": 139938622,
278
+ "7": 141156845
279
+ },
280
+ "encoder_layer_2": {
281
+ "0": 135360959,
282
+ "1": 143571480,
283
+ "2": 130426240,
284
+ "3": 138121467,
285
+ "4": 143331476,
286
+ "5": 143239623,
287
+ "6": 143669414,
288
+ "7": 124976013
289
+ },
290
+ "encoder_layer_3": {
291
+ "0": 138108424,
292
+ "1": 143624638,
293
+ "2": 136522770,
294
+ "3": 143495589,
295
+ "4": 136073789,
296
+ "5": 135460995,
297
+ "6": 133884692,
298
+ "7": 135525775
299
+ },
300
+ "decoder_layer_0": {
301
+ "0": 11885357,
302
+ "1": 11801320,
303
+ "2": 13060220,
304
+ "3": 12342825,
305
+ "4": 12361295,
306
+ "5": 13069915,
307
+ "6": 11787056,
308
+ "7": 13554748
309
+ },
310
+ "decoder_layer_1": {
311
+ "0": 11862357,
312
+ "1": 12259029,
313
+ "2": 11620687,
314
+ "3": 14837367,
315
+ "4": 11779914,
316
+ "5": 13099878,
317
+ "6": 11683397,
318
+ "7": 12720107
319
+ },
320
+ "decoder_layer_2": {
321
+ "0": 12201210,
322
+ "1": 12112379,
323
+ "2": 14274767,
324
+ "3": 12285646,
325
+ "4": 12123051,
326
+ "5": 12171337,
327
+ "6": 12135313,
328
+ "7": 12559033
329
+ },
330
+ "decoder_layer_3": {
331
+ "0": 12632673,
332
+ "1": 15462911,
333
+ "2": 11907600,
334
+ "3": 12328118,
335
+ "4": 11866897,
336
+ "5": 11858883,
337
+ "6": 11895272,
338
+ "7": 11910382
339
+ }
340
+ }
341
+ },
342
+ {
343
+ "epoch": 5,
344
+ "usage": {
345
+ "encoder_layer_0": {
346
+ "0": 140993937,
347
+ "1": 135712261,
348
+ "2": 139518919,
349
+ "3": 139655991,
350
+ "4": 136781290,
351
+ "5": 137254831,
352
+ "6": 135476784,
353
+ "7": 137302659
354
+ },
355
+ "encoder_layer_1": {
356
+ "0": 153395951,
357
+ "1": 155997088,
358
+ "2": 135728108,
359
+ "3": 115885780,
360
+ "4": 147534633,
361
+ "5": 113059645,
362
+ "6": 139938622,
363
+ "7": 141156845
364
+ },
365
+ "encoder_layer_2": {
366
+ "0": 135360959,
367
+ "1": 143571480,
368
+ "2": 130426240,
369
+ "3": 138121467,
370
+ "4": 143331476,
371
+ "5": 143239623,
372
+ "6": 143669414,
373
+ "7": 124976013
374
+ },
375
+ "encoder_layer_3": {
376
+ "0": 138108424,
377
+ "1": 143624638,
378
+ "2": 136522770,
379
+ "3": 143495589,
380
+ "4": 136073789,
381
+ "5": 135460995,
382
+ "6": 133884692,
383
+ "7": 135525775
384
+ },
385
+ "decoder_layer_0": {
386
+ "0": 11885357,
387
+ "1": 11801320,
388
+ "2": 13060220,
389
+ "3": 12342825,
390
+ "4": 12361295,
391
+ "5": 13069915,
392
+ "6": 11787056,
393
+ "7": 13554748
394
+ },
395
+ "decoder_layer_1": {
396
+ "0": 11862357,
397
+ "1": 12259029,
398
+ "2": 11620687,
399
+ "3": 14837367,
400
+ "4": 11779914,
401
+ "5": 13099878,
402
+ "6": 11683397,
403
+ "7": 12720107
404
+ },
405
+ "decoder_layer_2": {
406
+ "0": 12201210,
407
+ "1": 12112379,
408
+ "2": 14274767,
409
+ "3": 12285646,
410
+ "4": 12123051,
411
+ "5": 12171337,
412
+ "6": 12135313,
413
+ "7": 12559033
414
+ },
415
+ "decoder_layer_3": {
416
+ "0": 12632673,
417
+ "1": 15462911,
418
+ "2": 11907600,
419
+ "3": 12328118,
420
+ "4": 11866897,
421
+ "5": 11858883,
422
+ "6": 11895272,
423
+ "7": 11910382
424
+ }
425
+ }
426
+ }
427
+ ]
results/moe_token_choice/moe_token_choice_test_predictions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aebe85737a13dc11530e579868d68fa7fd58f6f8d6cebea9fde02914242db6f4
3
+ size 27675266
results/moe_token_choice/training_curves.png ADDED

Git LFS Details

  • SHA256: 4166935164a8afdd3a5ee32c2453e5c131446ff02fe516d807de0e9396a57b22
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB
results/moe_token_choice_with_lb/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fafe0d552784dc65ed6caa6870f8689c28b465d177ef3d7daf596c1fed171cd
3
+ size 2159617603
results/moe_token_choice_with_lb/training_metrics.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_losses": [
3
+ 7.0997042610168455,
4
+ 6.098364649200439,
5
+ 5.656473810577393,
6
+ 5.355114995574951,
7
+ 5.108737034606934
8
+ ],
9
+ "train_nll_losses": [
10
+ 7.0122965110778805,
11
+ 6.014897843170166,
12
+ 5.573608850860595,
13
+ 5.272693781280518,
14
+ 5.026728285980225
15
+ ],
16
+ "train_lb_losses": [
17
+ 0.08740775455236435,
18
+ 0.08346681421995163,
19
+ 0.08286495605707168,
20
+ 0.08242120879888534,
21
+ 0.08200874214172363
22
+ ],
23
+ "val_losses": [
24
+ 6.270021446167477,
25
+ 5.788443149082244,
26
+ 5.507700261615572,
27
+ 5.303510824839274,
28
+ 5.156257523430718
29
+ ],
30
+ "use_load_balancer": true,
31
+ "expert_balance_history": [
32
+ {
33
+ "epoch": 1,
34
+ "cv_scores": {
35
+ "encoder_layer_0": 0.02239848952710913,
36
+ "encoder_layer_1": 0.011894839454450919,
37
+ "encoder_layer_2": 0.014883410628884161,
38
+ "encoder_layer_3": 0.03680761393831419,
39
+ "decoder_layer_0": 0.07264102945333789,
40
+ "decoder_layer_1": 0.09091776210725255,
41
+ "decoder_layer_2": 0.07472949755270192,
42
+ "decoder_layer_3": 0.10471215665870788
43
+ }
44
+ },
45
+ {
46
+ "epoch": 2,
47
+ "cv_scores": {
48
+ "encoder_layer_0": 0.02293213506202102,
49
+ "encoder_layer_1": 0.01635183137600546,
50
+ "encoder_layer_2": 0.014198928648696423,
51
+ "encoder_layer_3": 0.024659876381341785,
52
+ "decoder_layer_0": 0.05170881803637949,
53
+ "decoder_layer_1": 0.058714955010434354,
54
+ "decoder_layer_2": 0.08058705108512597,
55
+ "decoder_layer_3": 0.051674624864711804
56
+ }
57
+ },
58
+ {
59
+ "epoch": 3,
60
+ "cv_scores": {
61
+ "encoder_layer_0": 0.01790022341165699,
62
+ "encoder_layer_1": 0.03365750521155693,
63
+ "encoder_layer_2": 0.020147216041616556,
64
+ "encoder_layer_3": 0.028766130854617978,
65
+ "decoder_layer_0": 0.06359621508791356,
66
+ "decoder_layer_1": 0.0664411660102753,
67
+ "decoder_layer_2": 0.08864758361507494,
68
+ "decoder_layer_3": 0.030620321433448113
69
+ }
70
+ },
71
+ {
72
+ "epoch": 4,
73
+ "cv_scores": {
74
+ "encoder_layer_0": 0.016398893255619877,
75
+ "encoder_layer_1": 0.03261250392944865,
76
+ "encoder_layer_2": 0.02526192742709379,
77
+ "encoder_layer_3": 0.03652289327142253,
78
+ "decoder_layer_0": 0.0825089900217896,
79
+ "decoder_layer_1": 0.07279900493183922,
80
+ "decoder_layer_2": 0.09745846649526438,
81
+ "decoder_layer_3": 0.0278281985663954
82
+ }
83
+ },
84
+ {
85
+ "epoch": 5,
86
+ "cv_scores": {
87
+ "encoder_layer_0": 0.015321057262938304,
88
+ "encoder_layer_1": 0.027677770571459974,
89
+ "encoder_layer_2": 0.036855431757315545,
90
+ "encoder_layer_3": 0.03604612322392209,
91
+ "decoder_layer_0": 0.09712146666634115,
92
+ "decoder_layer_1": 0.07458012114520648,
93
+ "decoder_layer_2": 0.10375502366450497,
94
+ "decoder_layer_3": 0.027890078551332173
95
+ }
96
+ }
97
+ ]
98
+ }
results/moe_token_choice_without_lb/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdb1c7a7e405d1836d99ee7a323c6a17a1d44dd810e89ce3b31aeddaed1dbdb6
3
+ size 2159617603
results/moe_token_choice_without_lb/training_metrics.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_losses": [
3
+ 6.9957088943481445,
4
+ 5.946418069458008,
5
+ 5.539425620269776,
6
+ 5.257929954528809,
7
+ 5.025598110198975
8
+ ],
9
+ "train_nll_losses": [
10
+ 6.9957088943481445,
11
+ 5.946418069458008,
12
+ 5.539425620269776,
13
+ 5.257929954528809,
14
+ 5.025598110198975
15
+ ],
16
+ "train_lb_losses": [
17
+ 0.0,
18
+ 0.0,
19
+ 0.0,
20
+ 0.0,
21
+ 0.0
22
+ ],
23
+ "val_losses": [
24
+ 6.208024940793476,
25
+ 5.756550130389986,
26
+ 5.503833710201203,
27
+ 5.309833534180172,
28
+ 5.157609190259661
29
+ ],
30
+ "use_load_balancer": false,
31
+ "expert_balance_history": [
32
+ {
33
+ "epoch": 1,
34
+ "cv_scores": {
35
+ "encoder_layer_0": 0.47773956186164035,
36
+ "encoder_layer_1": 1.2283993740562353,
37
+ "encoder_layer_2": 1.2066402900273532,
38
+ "encoder_layer_3": 1.1506108166570812,
39
+ "decoder_layer_0": 1.1311583795369742,
40
+ "decoder_layer_1": 1.031120870253164,
41
+ "decoder_layer_2": 1.2149890152604113,
42
+ "decoder_layer_3": 1.359569701461952
43
+ }
44
+ },
45
+ {
46
+ "epoch": 2,
47
+ "cv_scores": {
48
+ "encoder_layer_0": 0.5317931290413349,
49
+ "encoder_layer_1": 1.1592380871171228,
50
+ "encoder_layer_2": 1.048192812774701,
51
+ "encoder_layer_3": 1.0848774885298542,
52
+ "decoder_layer_0": 1.2000397098222888,
53
+ "decoder_layer_1": 0.973016538594419,
54
+ "decoder_layer_2": 1.2892343882605066,
55
+ "decoder_layer_3": 1.272053691280787
56
+ }
57
+ },
58
+ {
59
+ "epoch": 3,
60
+ "cv_scores": {
61
+ "encoder_layer_0": 0.5360830744020194,
62
+ "encoder_layer_1": 1.1426186109630772,
63
+ "encoder_layer_2": 0.9655670214289462,
64
+ "encoder_layer_3": 1.0901768668859588,
65
+ "decoder_layer_0": 1.2455368246203251,
66
+ "decoder_layer_1": 0.9588685599967219,
67
+ "decoder_layer_2": 1.3105986019784088,
68
+ "decoder_layer_3": 1.2299858663796666
69
+ }
70
+ },
71
+ {
72
+ "epoch": 4,
73
+ "cv_scores": {
74
+ "encoder_layer_0": 0.5186909345945561,
75
+ "encoder_layer_1": 1.1361607950311845,
76
+ "encoder_layer_2": 0.9141624769231046,
77
+ "encoder_layer_3": 1.1254102549849538,
78
+ "decoder_layer_0": 1.2753951147459726,
79
+ "decoder_layer_1": 0.9487367217372488,
80
+ "decoder_layer_2": 1.3281710137430134,
81
+ "decoder_layer_3": 1.205282174955813
82
+ }
83
+ },
84
+ {
85
+ "epoch": 5,
86
+ "cv_scores": {
87
+ "encoder_layer_0": 0.5177979354789254,
88
+ "encoder_layer_1": 1.1347360622293354,
89
+ "encoder_layer_2": 0.8721613047751007,
90
+ "encoder_layer_3": 1.1427533114030508,
91
+ "decoder_layer_0": 1.2888345008752309,
92
+ "decoder_layer_1": 0.9444531131949591,
93
+ "decoder_layer_2": 1.3459587847130505,
94
+ "decoder_layer_3": 1.18326321074551
95
+ }
96
+ }
97
+ ]
98
+ }
results/standard_mha_moe/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdbdbfa8c37f4df106382200b2501689f11a215806c88cfdd7eb1b6b254e1a34
3
+ size 720563261