umer07 commited on
Commit
c64f373
·
verified ·
1 Parent(s): 5d04bbc

Fathom: upload expert-e3-network/training_log.json

Browse files
adapters/expert-e3-network/training_log.json ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss": 1.6756,
4
+ "grad_norm": 0.9338335990905762,
5
+ "learning_rate": 1.8e-05,
6
+ "entropy": 1.2653020560741424,
7
+ "num_tokens": 211745.0,
8
+ "mean_token_accuracy": 0.626320019364357,
9
+ "epoch": 0.016,
10
+ "step": 10
11
+ },
12
+ {
13
+ "loss": 1.6675,
14
+ "grad_norm": 0.4600104093551636,
15
+ "learning_rate": 3.8e-05,
16
+ "entropy": 1.5156079709529877,
17
+ "num_tokens": 413783.0,
18
+ "mean_token_accuracy": 0.6156355381011963,
19
+ "epoch": 0.032,
20
+ "step": 20
21
+ },
22
+ {
23
+ "loss": 1.4313,
24
+ "grad_norm": 0.5679482221603394,
25
+ "learning_rate": 5.8e-05,
26
+ "entropy": 1.511820811033249,
27
+ "num_tokens": 618078.0,
28
+ "mean_token_accuracy": 0.6563311725854873,
29
+ "epoch": 0.048,
30
+ "step": 30
31
+ },
32
+ {
33
+ "loss": 1.1182,
34
+ "grad_norm": 0.5529033541679382,
35
+ "learning_rate": 7.800000000000001e-05,
36
+ "entropy": 1.1426804274320603,
37
+ "num_tokens": 824579.0,
38
+ "mean_token_accuracy": 0.7136697113513947,
39
+ "epoch": 0.064,
40
+ "step": 40
41
+ },
42
+ {
43
+ "loss": 0.9574,
44
+ "grad_norm": 0.4290783405303955,
45
+ "learning_rate": 9.8e-05,
46
+ "entropy": 0.9684652209281921,
47
+ "num_tokens": 1022359.0,
48
+ "mean_token_accuracy": 0.7489378124475479,
49
+ "epoch": 0.08,
50
+ "step": 50
51
+ },
52
+ {
53
+ "loss": 0.8724,
54
+ "grad_norm": 0.34012866020202637,
55
+ "learning_rate": 9.993956318446873e-05,
56
+ "entropy": 0.8785539418458939,
57
+ "num_tokens": 1237697.0,
58
+ "mean_token_accuracy": 0.7662121266126632,
59
+ "epoch": 0.096,
60
+ "step": 60
61
+ },
62
+ {
63
+ "loss": 0.8526,
64
+ "grad_norm": 0.4122697710990906,
65
+ "learning_rate": 9.973083336646172e-05,
66
+ "entropy": 0.8518908679485321,
67
+ "num_tokens": 1438438.0,
68
+ "mean_token_accuracy": 0.7753641337156296,
69
+ "epoch": 0.112,
70
+ "step": 70
71
+ },
72
+ {
73
+ "loss": 0.7437,
74
+ "grad_norm": 0.3228837251663208,
75
+ "learning_rate": 9.937368719576374e-05,
76
+ "entropy": 0.7489307045936584,
77
+ "num_tokens": 1631589.0,
78
+ "mean_token_accuracy": 0.7941762745380402,
79
+ "epoch": 0.128,
80
+ "step": 80
81
+ },
82
+ {
83
+ "loss": 0.7492,
84
+ "grad_norm": 0.33696669340133667,
85
+ "learning_rate": 9.886919053691883e-05,
86
+ "entropy": 0.7552473366260528,
87
+ "num_tokens": 1836258.0,
88
+ "mean_token_accuracy": 0.7946388304233551,
89
+ "epoch": 0.144,
90
+ "step": 90
91
+ },
92
+ {
93
+ "loss": 0.7372,
94
+ "grad_norm": 0.2882266640663147,
95
+ "learning_rate": 9.82188490062415e-05,
96
+ "entropy": 0.7464645385742188,
97
+ "num_tokens": 2046902.0,
98
+ "mean_token_accuracy": 0.7958032578229904,
99
+ "epoch": 0.16,
100
+ "step": 100
101
+ },
102
+ {
103
+ "loss": 0.7528,
104
+ "grad_norm": 0.34871771931648254,
105
+ "learning_rate": 9.742460347846587e-05,
106
+ "entropy": 0.7529336363077164,
107
+ "num_tokens": 2254427.0,
108
+ "mean_token_accuracy": 0.7927632719278336,
109
+ "epoch": 0.176,
110
+ "step": 110
111
+ },
112
+ {
113
+ "loss": 0.6854,
114
+ "grad_norm": 0.344471275806427,
115
+ "learning_rate": 9.648882429441257e-05,
116
+ "entropy": 0.6884129703044891,
117
+ "num_tokens": 2452097.0,
118
+ "mean_token_accuracy": 0.8083581000566482,
119
+ "epoch": 0.192,
120
+ "step": 120
121
+ },
122
+ {
123
+ "loss": 0.7035,
124
+ "grad_norm": 0.2790994942188263,
125
+ "learning_rate": 9.541430418696018e-05,
126
+ "entropy": 0.712130931019783,
127
+ "num_tokens": 2647959.0,
128
+ "mean_token_accuracy": 0.8047784239053726,
129
+ "epoch": 0.208,
130
+ "step": 130
131
+ },
132
+ {
133
+ "loss": 0.7175,
134
+ "grad_norm": 0.3251378536224365,
135
+ "learning_rate": 9.420424994643266e-05,
136
+ "entropy": 0.7225965172052383,
137
+ "num_tokens": 2851644.0,
138
+ "mean_token_accuracy": 0.8003872334957123,
139
+ "epoch": 0.224,
140
+ "step": 140
141
+ },
142
+ {
143
+ "loss": 0.6633,
144
+ "grad_norm": 0.3178017735481262,
145
+ "learning_rate": 9.28622728502766e-05,
146
+ "entropy": 0.6729940563440323,
147
+ "num_tokens": 3053179.0,
148
+ "mean_token_accuracy": 0.8133321523666381,
149
+ "epoch": 0.24,
150
+ "step": 150
151
+ },
152
+ {
153
+ "loss": 0.7078,
154
+ "grad_norm": 0.32298043370246887,
155
+ "learning_rate": 9.139237788559e-05,
156
+ "entropy": 0.716013514995575,
157
+ "num_tokens": 3244569.0,
158
+ "mean_token_accuracy": 0.8037430495023727,
159
+ "epoch": 0.256,
160
+ "step": 160
161
+ },
162
+ {
163
+ "loss": 0.7011,
164
+ "grad_norm": 0.3123107850551605,
165
+ "learning_rate": 8.979895179666673e-05,
166
+ "entropy": 0.6998003959655762,
167
+ "num_tokens": 3448693.0,
168
+ "mean_token_accuracy": 0.8040744125843048,
169
+ "epoch": 0.272,
170
+ "step": 170
171
+ },
172
+ {
173
+ "loss": 0.6887,
174
+ "grad_norm": 0.3589050769805908,
175
+ "learning_rate": 8.808674999322728e-05,
176
+ "entropy": 0.6893662005662918,
177
+ "num_tokens": 3650299.0,
178
+ "mean_token_accuracy": 0.8075590372085572,
179
+ "epoch": 0.288,
180
+ "step": 180
181
+ },
182
+ {
183
+ "loss": 0.7397,
184
+ "grad_norm": 0.3437199890613556,
185
+ "learning_rate": 8.626088235840726e-05,
186
+ "entropy": 0.7490889757871628,
187
+ "num_tokens": 3860706.0,
188
+ "mean_token_accuracy": 0.7960624009370804,
189
+ "epoch": 0.304,
190
+ "step": 190
191
+ },
192
+ {
193
+ "loss": 0.7104,
194
+ "grad_norm": 0.31088024377822876,
195
+ "learning_rate": 8.43267979988576e-05,
196
+ "entropy": 0.7115123689174652,
197
+ "num_tokens": 4057498.0,
198
+ "mean_token_accuracy": 0.8006591260433197,
199
+ "epoch": 0.32,
200
+ "step": 200
201
+ },
202
+ {
203
+ "loss": 0.6628,
204
+ "grad_norm": 0.3589341640472412,
205
+ "learning_rate": 8.229026898246885e-05,
206
+ "entropy": 0.6689569860696792,
207
+ "num_tokens": 4251824.0,
208
+ "mean_token_accuracy": 0.812098690867424,
209
+ "epoch": 0.336,
210
+ "step": 210
211
+ },
212
+ {
213
+ "loss": 0.676,
214
+ "grad_norm": 0.31112033128738403,
215
+ "learning_rate": 8.015737311225172e-05,
216
+ "entropy": 0.6726008355617523,
217
+ "num_tokens": 4465389.0,
218
+ "mean_token_accuracy": 0.8103758335113526,
219
+ "epoch": 0.352,
220
+ "step": 220
221
+ },
222
+ {
223
+ "loss": 0.6668,
224
+ "grad_norm": 0.33138927817344666,
225
+ "learning_rate": 7.793447578778427e-05,
226
+ "entropy": 0.6735879182815552,
227
+ "num_tokens": 4685518.0,
228
+ "mean_token_accuracy": 0.8112154036760331,
229
+ "epoch": 0.368,
230
+ "step": 230
231
+ },
232
+ {
233
+ "loss": 0.6414,
234
+ "grad_norm": 0.31951209902763367,
235
+ "learning_rate": 7.56282110083577e-05,
236
+ "entropy": 0.6442272961139679,
237
+ "num_tokens": 4886383.0,
238
+ "mean_token_accuracy": 0.8160840034484863,
239
+ "epoch": 0.384,
240
+ "step": 240
241
+ },
242
+ {
243
+ "loss": 0.6817,
244
+ "grad_norm": 0.3405151665210724,
245
+ "learning_rate": 7.324546157451508e-05,
246
+ "entropy": 0.6818776607513428,
247
+ "num_tokens": 5086529.0,
248
+ "mean_token_accuracy": 0.8091985791921615,
249
+ "epoch": 0.4,
250
+ "step": 250
251
+ },
252
+ {
253
+ "loss": 0.6843,
254
+ "grad_norm": 0.30915334820747375,
255
+ "learning_rate": 7.079333854706938e-05,
256
+ "entropy": 0.6939111232757569,
257
+ "num_tokens": 5298416.0,
258
+ "mean_token_accuracy": 0.8071506917476654,
259
+ "epoch": 0.416,
260
+ "step": 260
261
+ },
262
+ {
263
+ "loss": 0.6805,
264
+ "grad_norm": 0.3196154832839966,
265
+ "learning_rate": 6.827916002490304e-05,
266
+ "entropy": 0.6853002905845642,
267
+ "num_tokens": 5502693.0,
268
+ "mean_token_accuracy": 0.8068861842155457,
269
+ "epoch": 0.432,
270
+ "step": 270
271
+ },
272
+ {
273
+ "loss": 0.6497,
274
+ "grad_norm": 0.3364115059375763,
275
+ "learning_rate": 6.571042930488474e-05,
276
+ "entropy": 0.6549761325120926,
277
+ "num_tokens": 5697863.0,
278
+ "mean_token_accuracy": 0.81393081843853,
279
+ "epoch": 0.448,
280
+ "step": 280
281
+ },
282
+ {
283
+ "loss": 0.6697,
284
+ "grad_norm": 0.33196505904197693,
285
+ "learning_rate": 6.309481248908235e-05,
286
+ "entropy": 0.6747944802045822,
287
+ "num_tokens": 5910487.0,
288
+ "mean_token_accuracy": 0.8128439605236053,
289
+ "epoch": 0.464,
290
+ "step": 290
291
+ },
292
+ {
293
+ "loss": 0.6535,
294
+ "grad_norm": 0.30423638224601746,
295
+ "learning_rate": 6.044011560610153e-05,
296
+ "entropy": 0.6566255927085877,
297
+ "num_tokens": 6127477.0,
298
+ "mean_token_accuracy": 0.8121924549341202,
299
+ "epoch": 0.48,
300
+ "step": 300
301
+ },
302
+ {
303
+ "loss": 0.6518,
304
+ "grad_norm": 0.3487173914909363,
305
+ "learning_rate": 5.775426131482811e-05,
306
+ "entropy": 0.6593502879142761,
307
+ "num_tokens": 6328197.0,
308
+ "mean_token_accuracy": 0.814925542473793,
309
+ "epoch": 0.496,
310
+ "step": 310
311
+ },
312
+ {
313
+ "loss": 0.6931,
314
+ "grad_norm": 0.3142063021659851,
315
+ "learning_rate": 5.5045265260100234e-05,
316
+ "entropy": 0.700766509771347,
317
+ "num_tokens": 6536365.0,
318
+ "mean_token_accuracy": 0.8070295184850693,
319
+ "epoch": 0.512,
320
+ "step": 320
321
+ },
322
+ {
323
+ "loss": 0.6785,
324
+ "grad_norm": 0.3409480154514313,
325
+ "learning_rate": 5.232121215087369e-05,
326
+ "entropy": 0.6776748090982437,
327
+ "num_tokens": 6742650.0,
328
+ "mean_token_accuracy": 0.809091717004776,
329
+ "epoch": 0.528,
330
+ "step": 330
331
+ },
332
+ {
333
+ "loss": 0.6438,
334
+ "grad_norm": 0.3378758132457733,
335
+ "learning_rate": 4.959023163227284e-05,
336
+ "entropy": 0.6490969121456146,
337
+ "num_tokens": 6946868.0,
338
+ "mean_token_accuracy": 0.8178076684474945,
339
+ "epoch": 0.544,
340
+ "step": 340
341
+ },
342
+ {
343
+ "loss": 0.6481,
344
+ "grad_norm": 0.3258441388607025,
345
+ "learning_rate": 4.6860474023534335e-05,
346
+ "entropy": 0.6563442766666412,
347
+ "num_tokens": 7138705.0,
348
+ "mean_token_accuracy": 0.817442661523819,
349
+ "epoch": 0.56,
350
+ "step": 350
351
+ },
352
+ {
353
+ "loss": 0.6622,
354
+ "grad_norm": 0.32307952642440796,
355
+ "learning_rate": 4.4140085994251136e-05,
356
+ "entropy": 0.6632782518863678,
357
+ "num_tokens": 7348846.0,
358
+ "mean_token_accuracy": 0.812923377752304,
359
+ "epoch": 0.576,
360
+ "step": 360
361
+ },
362
+ {
363
+ "loss": 0.6436,
364
+ "grad_norm": 0.35711175203323364,
365
+ "learning_rate": 4.143718625150854e-05,
366
+ "entropy": 0.6440205127000809,
367
+ "num_tokens": 7556952.0,
368
+ "mean_token_accuracy": 0.8172848135232925,
369
+ "epoch": 0.592,
370
+ "step": 370
371
+ },
372
+ {
373
+ "loss": 0.6299,
374
+ "grad_norm": 0.3042103052139282,
375
+ "learning_rate": 3.875984131047135e-05,
376
+ "entropy": 0.6329021096229553,
377
+ "num_tokens": 7758339.0,
378
+ "mean_token_accuracy": 0.8194414287805557,
379
+ "epoch": 0.608,
380
+ "step": 380
381
+ },
382
+ {
383
+ "loss": 0.6559,
384
+ "grad_norm": 0.35361942648887634,
385
+ "learning_rate": 3.6116041420732103e-05,
386
+ "entropy": 0.6580730825662613,
387
+ "num_tokens": 7961705.0,
388
+ "mean_token_accuracy": 0.8145245909690857,
389
+ "epoch": 0.624,
390
+ "step": 390
391
+ },
392
+ {
393
+ "loss": 0.6394,
394
+ "grad_norm": 0.35071322321891785,
395
+ "learning_rate": 3.3513676720265934e-05,
396
+ "entropy": 0.6357898592948914,
397
+ "num_tokens": 8160317.0,
398
+ "mean_token_accuracy": 0.8177996665239334,
399
+ "epoch": 0.64,
400
+ "step": 400
401
+ },
402
+ {
403
+ "loss": 0.6545,
404
+ "grad_norm": 0.31700801849365234,
405
+ "learning_rate": 3.0960513688157754e-05,
406
+ "entropy": 0.649863663315773,
407
+ "num_tokens": 8372896.0,
408
+ "mean_token_accuracy": 0.8160782635211945,
409
+ "epoch": 0.656,
410
+ "step": 410
411
+ },
412
+ {
413
+ "loss": 0.656,
414
+ "grad_norm": 0.3056493401527405,
415
+ "learning_rate": 2.846417196637613e-05,
416
+ "entropy": 0.6630118876695633,
417
+ "num_tokens": 8582698.0,
418
+ "mean_token_accuracy": 0.8151620656251908,
419
+ "epoch": 0.672,
420
+ "step": 420
421
+ },
422
+ {
423
+ "loss": 0.6869,
424
+ "grad_norm": 0.3303000330924988,
425
+ "learning_rate": 2.603210161976687e-05,
426
+ "entropy": 0.6842911779880524,
427
+ "num_tokens": 8794332.0,
428
+ "mean_token_accuracy": 0.805791437625885,
429
+ "epoch": 0.688,
430
+ "step": 430
431
+ },
432
+ {
433
+ "loss": 0.6602,
434
+ "grad_norm": 0.3385516405105591,
435
+ "learning_rate": 2.3671560902131447e-05,
436
+ "entropy": 0.6720612704753876,
437
+ "num_tokens": 9008268.0,
438
+ "mean_token_accuracy": 0.811782768368721,
439
+ "epoch": 0.704,
440
+ "step": 440
441
+ },
442
+ {
443
+ "loss": 0.6425,
444
+ "grad_norm": 0.3051205277442932,
445
+ "learning_rate": 2.1389594594744864e-05,
446
+ "entropy": 0.6423384606838226,
447
+ "num_tokens": 9216146.0,
448
+ "mean_token_accuracy": 0.81755151450634,
449
+ "epoch": 0.72,
450
+ "step": 450
451
+ },
452
+ {
453
+ "loss": 0.6262,
454
+ "grad_norm": 0.3302038013935089,
455
+ "learning_rate": 1.9193012981959475e-05,
456
+ "entropy": 0.6281730026006699,
457
+ "num_tokens": 9418629.0,
458
+ "mean_token_accuracy": 0.820314571261406,
459
+ "epoch": 0.736,
460
+ "step": 460
461
+ },
462
+ {
463
+ "loss": 0.6204,
464
+ "grad_norm": 0.2998092472553253,
465
+ "learning_rate": 1.7088371526639853e-05,
466
+ "entropy": 0.6200591921806335,
467
+ "num_tokens": 9624430.0,
468
+ "mean_token_accuracy": 0.8230848222970962,
469
+ "epoch": 0.752,
470
+ "step": 470
471
+ },
472
+ {
473
+ "loss": 0.6133,
474
+ "grad_norm": 0.3248184025287628,
475
+ "learning_rate": 1.508195130608504e-05,
476
+ "entropy": 0.6175663053989411,
477
+ "num_tokens": 9824038.0,
478
+ "mean_token_accuracy": 0.8229408711194992,
479
+ "epoch": 0.768,
480
+ "step": 480
481
+ },
482
+ {
483
+ "loss": 0.6471,
484
+ "grad_norm": 0.3508250117301941,
485
+ "learning_rate": 1.3179740266825253e-05,
486
+ "entropy": 0.6427986294031143,
487
+ "num_tokens": 10029090.0,
488
+ "mean_token_accuracy": 0.8153195798397064,
489
+ "epoch": 0.784,
490
+ "step": 490
491
+ },
492
+ {
493
+ "loss": 0.6494,
494
+ "grad_norm": 0.3246026933193207,
495
+ "learning_rate": 1.1387415354235887e-05,
496
+ "entropy": 0.6497834831476211,
497
+ "num_tokens": 10233813.0,
498
+ "mean_token_accuracy": 0.8152256548404694,
499
+ "epoch": 0.8,
500
+ "step": 500
501
+ },
502
+ {
503
+ "loss": 0.621,
504
+ "grad_norm": 0.3171588182449341,
505
+ "learning_rate": 9.710325570301348e-06,
506
+ "entropy": 0.6244612663984299,
507
+ "num_tokens": 10435960.0,
508
+ "mean_token_accuracy": 0.8210752457380295,
509
+ "epoch": 0.816,
510
+ "step": 510
511
+ },
512
+ {
513
+ "loss": 0.5986,
514
+ "grad_norm": 0.32061895728111267,
515
+ "learning_rate": 8.153476010090788e-06,
516
+ "entropy": 0.5975286513566971,
517
+ "num_tokens": 10636030.0,
518
+ "mean_token_accuracy": 0.827234199643135,
519
+ "epoch": 0.832,
520
+ "step": 520
521
+ },
522
+ {
523
+ "loss": 0.6372,
524
+ "grad_norm": 0.3552604615688324,
525
+ "learning_rate": 6.721512924587242e-06,
526
+ "entropy": 0.6376388996839524,
527
+ "num_tokens": 10838906.0,
528
+ "mean_token_accuracy": 0.818820309638977,
529
+ "epoch": 0.848,
530
+ "step": 530
531
+ },
532
+ {
533
+ "loss": 0.6304,
534
+ "grad_norm": 0.33599594235420227,
535
+ "learning_rate": 5.418709854448489e-06,
536
+ "entropy": 0.6320249021053315,
537
+ "num_tokens": 11026457.0,
538
+ "mean_token_accuracy": 0.8198111385107041,
539
+ "epoch": 0.864,
540
+ "step": 540
541
+ },
542
+ {
543
+ "loss": 0.6235,
544
+ "grad_norm": 0.41409239172935486,
545
+ "learning_rate": 4.248954876082195e-06,
546
+ "entropy": 0.6273589804768562,
547
+ "num_tokens": 11211997.0,
548
+ "mean_token_accuracy": 0.8219926416873932,
549
+ "epoch": 0.88,
550
+ "step": 550
551
+ },
552
+ {
553
+ "loss": 0.6511,
554
+ "grad_norm": 0.337541401386261,
555
+ "learning_rate": 3.2157389980976783e-06,
556
+ "entropy": 0.6515613377094269,
557
+ "num_tokens": 11421486.0,
558
+ "mean_token_accuracy": 0.815014871954918,
559
+ "epoch": 0.896,
560
+ "step": 560
561
+ },
562
+ {
563
+ "loss": 0.6209,
564
+ "grad_norm": 0.3296094536781311,
565
+ "learning_rate": 2.322145742764181e-06,
566
+ "entropy": 0.6218553513288498,
567
+ "num_tokens": 11621269.0,
568
+ "mean_token_accuracy": 0.8230124861001968,
569
+ "epoch": 0.912,
570
+ "step": 570
571
+ },
572
+ {
573
+ "loss": 0.634,
574
+ "grad_norm": 0.34123408794403076,
575
+ "learning_rate": 1.5708419435684462e-06,
576
+ "entropy": 0.6355304539203643,
577
+ "num_tokens": 11830087.0,
578
+ "mean_token_accuracy": 0.818019899725914,
579
+ "epoch": 0.928,
580
+ "step": 580
581
+ },
582
+ {
583
+ "loss": 0.6276,
584
+ "grad_norm": 0.2921477258205414,
585
+ "learning_rate": 9.640697863354698e-07,
586
+ "entropy": 0.6308085203170777,
587
+ "num_tokens": 12030300.0,
588
+ "mean_token_accuracy": 0.8203556030988693,
589
+ "epoch": 0.944,
590
+ "step": 590
591
+ },
592
+ {
593
+ "loss": 0.623,
594
+ "grad_norm": 0.3348216712474823,
595
+ "learning_rate": 5.036401176647332e-07,
596
+ "entropy": 0.6214668571949005,
597
+ "num_tokens": 12226666.0,
598
+ "mean_token_accuracy": 0.8228818953037262,
599
+ "epoch": 0.96,
600
+ "step": 600
601
+ },
602
+ {
603
+ "loss": 0.6305,
604
+ "grad_norm": 0.3399432301521301,
605
+ "learning_rate": 1.9092704065222788e-07,
606
+ "entropy": 0.6307568103075027,
607
+ "num_tokens": 12433050.0,
608
+ "mean_token_accuracy": 0.8195924639701844,
609
+ "epoch": 0.976,
610
+ "step": 610
611
+ },
612
+ {
613
+ "loss": 0.6685,
614
+ "grad_norm": 0.3575892746448517,
615
+ "learning_rate": 2.6863814026734102e-08,
616
+ "entropy": 0.6648932933807373,
617
+ "num_tokens": 12635860.0,
618
+ "mean_token_accuracy": 0.8116440147161483,
619
+ "epoch": 0.992,
620
+ "step": 620
621
+ },
622
+ {
623
+ "train_runtime": 6933.2338,
624
+ "train_samples_per_second": 2.883,
625
+ "train_steps_per_second": 0.09,
626
+ "total_flos": 7.776202046473175e+18,
627
+ "train_loss": 0.7265998420715332,
628
+ "entropy": 0.6022609889507293,
629
+ "num_tokens": 12730954.0,
630
+ "mean_token_accuracy": 0.8287682414054871,
631
+ "epoch": 1.0,
632
+ "step": 625
633
+ }
634
+ ]