umer07 commited on
Commit
d1b5c77
·
verified ·
1 Parent(s): 34779fd

Fathom: upload expert-e8-analyst/training_log.json

Browse files
adapters/expert-e8-analyst/training_log.json ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss": 1.9181,
4
+ "grad_norm": 0.8695156574249268,
5
+ "learning_rate": 1.8e-05,
6
+ "entropy": 1.457271361351013,
7
+ "num_tokens": 223301.0,
8
+ "mean_token_accuracy": 0.5781058162450791,
9
+ "epoch": 0.016406890894175553,
10
+ "step": 10
11
+ },
12
+ {
13
+ "loss": 1.7798,
14
+ "grad_norm": 0.6769833564758301,
15
+ "learning_rate": 3.8e-05,
16
+ "entropy": 1.6721824526786804,
17
+ "num_tokens": 449106.0,
18
+ "mean_token_accuracy": 0.585400715470314,
19
+ "epoch": 0.03281378178835111,
20
+ "step": 20
21
+ },
22
+ {
23
+ "loss": 1.5073,
24
+ "grad_norm": 0.903017520904541,
25
+ "learning_rate": 5.8e-05,
26
+ "entropy": 1.568133682012558,
27
+ "num_tokens": 681766.0,
28
+ "mean_token_accuracy": 0.6311331987380981,
29
+ "epoch": 0.04922067268252666,
30
+ "step": 30
31
+ },
32
+ {
33
+ "loss": 1.1072,
34
+ "grad_norm": 0.4333101511001587,
35
+ "learning_rate": 7.800000000000001e-05,
36
+ "entropy": 1.0811064839363098,
37
+ "num_tokens": 919022.0,
38
+ "mean_token_accuracy": 0.7193854421377182,
39
+ "epoch": 0.06562756357670221,
40
+ "step": 40
41
+ },
42
+ {
43
+ "loss": 0.9973,
44
+ "grad_norm": 0.33554309606552124,
45
+ "learning_rate": 9.8e-05,
46
+ "entropy": 0.9973530560731888,
47
+ "num_tokens": 1148282.0,
48
+ "mean_token_accuracy": 0.7452630162239074,
49
+ "epoch": 0.08203445447087777,
50
+ "step": 50
51
+ },
52
+ {
53
+ "loss": 0.8928,
54
+ "grad_norm": 0.338527649641037,
55
+ "learning_rate": 9.993628283308581e-05,
56
+ "entropy": 0.8918034493923187,
57
+ "num_tokens": 1372211.0,
58
+ "mean_token_accuracy": 0.7673784762620925,
59
+ "epoch": 0.09844134536505332,
60
+ "step": 60
61
+ },
62
+ {
63
+ "loss": 0.8782,
64
+ "grad_norm": 0.32297801971435547,
65
+ "learning_rate": 9.971623444249021e-05,
66
+ "entropy": 0.8743007063865662,
67
+ "num_tokens": 1604226.0,
68
+ "mean_token_accuracy": 0.7686779618263244,
69
+ "epoch": 0.11484823625922888,
70
+ "step": 70
71
+ },
72
+ {
73
+ "loss": 0.8258,
74
+ "grad_norm": 0.33724743127822876,
75
+ "learning_rate": 9.933976038510333e-05,
76
+ "entropy": 0.8295134097337723,
77
+ "num_tokens": 1838081.0,
78
+ "mean_token_accuracy": 0.7803715646266938,
79
+ "epoch": 0.13125512715340443,
80
+ "step": 80
81
+ },
82
+ {
83
+ "loss": 0.7891,
84
+ "grad_norm": 0.3447152078151703,
85
+ "learning_rate": 9.88080451875917e-05,
86
+ "entropy": 0.788675120472908,
87
+ "num_tokens": 2071855.0,
88
+ "mean_token_accuracy": 0.7867679089307785,
89
+ "epoch": 0.14766201804757997,
90
+ "step": 90
91
+ },
92
+ {
93
+ "loss": 0.8272,
94
+ "grad_norm": 0.3010920286178589,
95
+ "learning_rate": 9.812276182268236e-05,
96
+ "entropy": 0.8329822063446045,
97
+ "num_tokens": 2299745.0,
98
+ "mean_token_accuracy": 0.7769433975219726,
99
+ "epoch": 0.16406890894175555,
100
+ "step": 100
101
+ },
102
+ {
103
+ "loss": 0.797,
104
+ "grad_norm": 0.3699570298194885,
105
+ "learning_rate": 9.728606644537178e-05,
106
+ "entropy": 0.8039814531803131,
107
+ "num_tokens": 2528567.0,
108
+ "mean_token_accuracy": 0.784355628490448,
109
+ "epoch": 0.1804757998359311,
110
+ "step": 110
111
+ },
112
+ {
113
+ "loss": 0.7743,
114
+ "grad_norm": 0.3595486581325531,
115
+ "learning_rate": 9.63005916088644e-05,
116
+ "entropy": 0.7768057614564896,
117
+ "num_tokens": 2751275.0,
118
+ "mean_token_accuracy": 0.7889596104621888,
119
+ "epoch": 0.19688269073010664,
120
+ "step": 120
121
+ },
122
+ {
123
+ "loss": 0.7735,
124
+ "grad_norm": 0.3556647002696991,
125
+ "learning_rate": 9.516943798158649e-05,
126
+ "entropy": 0.776089146733284,
127
+ "num_tokens": 2978023.0,
128
+ "mean_token_accuracy": 0.7897650897502899,
129
+ "epoch": 0.2132895816242822,
130
+ "step": 130
131
+ },
132
+ {
133
+ "loss": 0.7622,
134
+ "grad_norm": 0.3334764242172241,
135
+ "learning_rate": 9.389616459133597e-05,
136
+ "entropy": 0.7803491950035095,
137
+ "num_tokens": 3204517.0,
138
+ "mean_token_accuracy": 0.7920954555273056,
139
+ "epoch": 0.22969647251845776,
140
+ "step": 140
141
+ },
142
+ {
143
+ "loss": 0.7576,
144
+ "grad_norm": 0.4009985029697418,
145
+ "learning_rate": 9.248477762726437e-05,
146
+ "entropy": 0.7722930639982224,
147
+ "num_tokens": 3434323.0,
148
+ "mean_token_accuracy": 0.792155721783638,
149
+ "epoch": 0.2461033634126333,
150
+ "step": 150
151
+ },
152
+ {
153
+ "loss": 0.7741,
154
+ "grad_norm": 0.3408016264438629,
155
+ "learning_rate": 9.093971783492355e-05,
156
+ "entropy": 0.7827848076820374,
157
+ "num_tokens": 3661534.0,
158
+ "mean_token_accuracy": 0.7909984678030014,
159
+ "epoch": 0.26251025430680885,
160
+ "step": 160
161
+ },
162
+ {
163
+ "loss": 0.7409,
164
+ "grad_norm": 0.3399813175201416,
165
+ "learning_rate": 8.926584654403724e-05,
166
+ "entropy": 0.7509049952030182,
167
+ "num_tokens": 3890921.0,
168
+ "mean_token_accuracy": 0.7963089287281037,
169
+ "epoch": 0.27891714520098443,
170
+ "step": 170
171
+ },
172
+ {
173
+ "loss": 0.7478,
174
+ "grad_norm": 0.30348479747772217,
175
+ "learning_rate": 8.746843037295937e-05,
176
+ "entropy": 0.7562560260295867,
177
+ "num_tokens": 4119537.0,
178
+ "mean_token_accuracy": 0.794661870598793,
179
+ "epoch": 0.29532403609515995,
180
+ "step": 180
181
+ },
182
+ {
183
+ "loss": 0.737,
184
+ "grad_norm": 0.3403272330760956,
185
+ "learning_rate": 8.555312465794403e-05,
186
+ "entropy": 0.7475169450044632,
187
+ "num_tokens": 4357977.0,
188
+ "mean_token_accuracy": 0.796341797709465,
189
+ "epoch": 0.3117309269893355,
190
+ "step": 190
191
+ },
192
+ {
193
+ "loss": 0.7512,
194
+ "grad_norm": 0.3219321072101593,
195
+ "learning_rate": 8.352595565936554e-05,
196
+ "entropy": 0.7560538798570633,
197
+ "num_tokens": 4581046.0,
198
+ "mean_token_accuracy": 0.7952380329370499,
199
+ "epoch": 0.3281378178835111,
200
+ "step": 200
201
+ },
202
+ {
203
+ "loss": 0.7317,
204
+ "grad_norm": 0.3038958013057709,
205
+ "learning_rate": 8.139330160087374e-05,
206
+ "entropy": 0.7322431743144989,
207
+ "num_tokens": 4814227.0,
208
+ "mean_token_accuracy": 0.7974914610385895,
209
+ "epoch": 0.3445447087776866,
210
+ "step": 210
211
+ },
212
+ {
213
+ "loss": 0.7205,
214
+ "grad_norm": 0.32955309748649597,
215
+ "learning_rate": 7.916187260114263e-05,
216
+ "entropy": 0.7275226473808288,
217
+ "num_tokens": 5048157.0,
218
+ "mean_token_accuracy": 0.8014717370271682,
219
+ "epoch": 0.3609515996718622,
220
+ "step": 220
221
+ },
222
+ {
223
+ "loss": 0.7282,
224
+ "grad_norm": 0.4224933683872223,
225
+ "learning_rate": 7.68386895613546e-05,
226
+ "entropy": 0.7310106873512268,
227
+ "num_tokens": 5274071.0,
228
+ "mean_token_accuracy": 0.7992997527122497,
229
+ "epoch": 0.37735849056603776,
230
+ "step": 230
231
+ },
232
+ {
233
+ "loss": 0.6971,
234
+ "grad_norm": 0.33381229639053345,
235
+ "learning_rate": 7.443106207484776e-05,
236
+ "entropy": 0.6995288044214248,
237
+ "num_tokens": 5509360.0,
238
+ "mean_token_accuracy": 0.8060027480125427,
239
+ "epoch": 0.3937653814602133,
240
+ "step": 240
241
+ },
242
+ {
243
+ "loss": 0.711,
244
+ "grad_norm": 0.31851011514663696,
245
+ "learning_rate": 7.194656542843102e-05,
246
+ "entropy": 0.7142476379871369,
247
+ "num_tokens": 5738285.0,
248
+ "mean_token_accuracy": 0.8031993210315704,
249
+ "epoch": 0.41017227235438886,
250
+ "step": 250
251
+ },
252
+ {
253
+ "loss": 0.7237,
254
+ "grad_norm": 0.33157357573509216,
255
+ "learning_rate": 6.939301676772927e-05,
256
+ "entropy": 0.7256091266870499,
257
+ "num_tokens": 5962580.0,
258
+ "mean_token_accuracy": 0.8013624370098114,
259
+ "epoch": 0.4265791632485644,
260
+ "step": 260
261
+ },
262
+ {
263
+ "loss": 0.721,
264
+ "grad_norm": 0.3370811641216278,
265
+ "learning_rate": 6.677845050155107e-05,
266
+ "entropy": 0.7265744864940643,
267
+ "num_tokens": 6196094.0,
268
+ "mean_token_accuracy": 0.7991349190473557,
269
+ "epoch": 0.44298605414273995,
270
+ "step": 270
271
+ },
272
+ {
273
+ "loss": 0.6843,
274
+ "grad_norm": 0.38120874762535095,
275
+ "learning_rate": 6.411109302266616e-05,
276
+ "entropy": 0.6908316820859909,
277
+ "num_tokens": 6420601.0,
278
+ "mean_token_accuracy": 0.8087756901979446,
279
+ "epoch": 0.4593929450369155,
280
+ "step": 280
281
+ },
282
+ {
283
+ "loss": 0.7072,
284
+ "grad_norm": 0.38430851697921753,
285
+ "learning_rate": 6.139933682453036e-05,
286
+ "entropy": 0.7136244118213654,
287
+ "num_tokens": 6655119.0,
288
+ "mean_token_accuracy": 0.8047497570514679,
289
+ "epoch": 0.47579983593109104,
290
+ "step": 290
291
+ },
292
+ {
293
+ "loss": 0.7252,
294
+ "grad_norm": 0.3509667217731476,
295
+ "learning_rate": 5.8651714095396135e-05,
296
+ "entropy": 0.7334865719079972,
297
+ "num_tokens": 6876910.0,
298
+ "mean_token_accuracy": 0.799770200252533,
299
+ "epoch": 0.4922067268252666,
300
+ "step": 300
301
+ },
302
+ {
303
+ "loss": 0.6821,
304
+ "grad_norm": 0.3153151571750641,
305
+ "learning_rate": 5.587686987289189e-05,
306
+ "entropy": 0.6873683601617813,
307
+ "num_tokens": 7112299.0,
308
+ "mean_token_accuracy": 0.8088241666555405,
309
+ "epoch": 0.5086136177194421,
310
+ "step": 310
311
+ },
312
+ {
313
+ "loss": 0.7196,
314
+ "grad_norm": 0.34774187207221985,
315
+ "learning_rate": 5.3083534843535074e-05,
316
+ "entropy": 0.7214434593915939,
317
+ "num_tokens": 7346455.0,
318
+ "mean_token_accuracy": 0.8039845436811447,
319
+ "epoch": 0.5250205086136177,
320
+ "step": 320
321
+ },
322
+ {
323
+ "loss": 0.6638,
324
+ "grad_norm": 0.387768030166626,
325
+ "learning_rate": 5.028049787276249e-05,
326
+ "entropy": 0.6638175457715988,
327
+ "num_tokens": 7571791.0,
328
+ "mean_token_accuracy": 0.812444058060646,
329
+ "epoch": 0.5414273995077933,
330
+ "step": 330
331
+ },
332
+ {
333
+ "loss": 0.6766,
334
+ "grad_norm": 0.3517005741596222,
335
+ "learning_rate": 4.7476578351907954e-05,
336
+ "entropy": 0.6799941658973694,
337
+ "num_tokens": 7801695.0,
338
+ "mean_token_accuracy": 0.811230742931366,
339
+ "epoch": 0.5578342904019689,
340
+ "step": 340
341
+ },
342
+ {
343
+ "loss": 0.6779,
344
+ "grad_norm": 0.32577675580978394,
345
+ "learning_rate": 4.468059844913444e-05,
346
+ "entropy": 0.6814499109983444,
347
+ "num_tokens": 8039821.0,
348
+ "mean_token_accuracy": 0.8104382246732712,
349
+ "epoch": 0.5742411812961444,
350
+ "step": 350
351
+ },
352
+ {
353
+ "loss": 0.6539,
354
+ "grad_norm": 0.35933127999305725,
355
+ "learning_rate": 4.1901355351628945e-05,
356
+ "entropy": 0.6585495263338089,
357
+ "num_tokens": 8273149.0,
358
+ "mean_token_accuracy": 0.8166852772235871,
359
+ "epoch": 0.5906480721903199,
360
+ "step": 360
361
+ },
362
+ {
363
+ "loss": 0.6843,
364
+ "grad_norm": 0.31598055362701416,
365
+ "learning_rate": 3.914759358639719e-05,
366
+ "entropy": 0.6861207246780395,
367
+ "num_tokens": 8503164.0,
368
+ "mean_token_accuracy": 0.8086160510778427,
369
+ "epoch": 0.6070549630844955,
370
+ "step": 370
371
+ },
372
+ {
373
+ "loss": 0.7094,
374
+ "grad_norm": 0.3427006006240845,
375
+ "learning_rate": 3.642797750674629e-05,
376
+ "entropy": 0.7133786290884018,
377
+ "num_tokens": 8726435.0,
378
+ "mean_token_accuracy": 0.8027824640274048,
379
+ "epoch": 0.623461853978671,
380
+ "step": 380
381
+ },
382
+ {
383
+ "loss": 0.6877,
384
+ "grad_norm": 0.34877264499664307,
385
+ "learning_rate": 3.375106403102389e-05,
386
+ "entropy": 0.6881168276071549,
387
+ "num_tokens": 8954291.0,
388
+ "mean_token_accuracy": 0.8073496133089065,
389
+ "epoch": 0.6398687448728466,
390
+ "step": 390
391
+ },
392
+ {
393
+ "loss": 0.6835,
394
+ "grad_norm": 0.3225726783275604,
395
+ "learning_rate": 3.112527571938717e-05,
396
+ "entropy": 0.6862167656421662,
397
+ "num_tokens": 9177163.0,
398
+ "mean_token_accuracy": 0.8089945495128632,
399
+ "epoch": 0.6562756357670222,
400
+ "step": 400
401
+ },
402
+ {
403
+ "loss": 0.7008,
404
+ "grad_norm": 0.329756498336792,
405
+ "learning_rate": 2.8558874273312674e-05,
406
+ "entropy": 0.7071986079216004,
407
+ "num_tokens": 9404151.0,
408
+ "mean_token_accuracy": 0.8044474363327027,
409
+ "epoch": 0.6726825266611977,
410
+ "step": 410
411
+ },
412
+ {
413
+ "loss": 0.6947,
414
+ "grad_norm": 0.3715651035308838,
415
+ "learning_rate": 2.605993454122687e-05,
416
+ "entropy": 0.69432153403759,
417
+ "num_tokens": 9639400.0,
418
+ "mean_token_accuracy": 0.8064981371164321,
419
+ "epoch": 0.6890894175553732,
420
+ "step": 420
421
+ },
422
+ {
423
+ "loss": 0.7066,
424
+ "grad_norm": 0.3599180281162262,
425
+ "learning_rate": 2.3636319112045496e-05,
426
+ "entropy": 0.7111173301935196,
427
+ "num_tokens": 9867668.0,
428
+ "mean_token_accuracy": 0.8044642627239227,
429
+ "epoch": 0.7054963084495488,
430
+ "step": 430
431
+ },
432
+ {
433
+ "loss": 0.7259,
434
+ "grad_norm": 0.2912443280220032,
435
+ "learning_rate": 2.1295653576560163e-05,
436
+ "entropy": 0.7254415988922119,
437
+ "num_tokens": 10100826.0,
438
+ "mean_token_accuracy": 0.8003069430589675,
439
+ "epoch": 0.7219031993437244,
440
+ "step": 440
441
+ },
442
+ {
443
+ "loss": 0.6761,
444
+ "grad_norm": 0.30693626403808594,
445
+ "learning_rate": 1.9045302534508297e-05,
446
+ "entropy": 0.6833124309778214,
447
+ "num_tokens": 10332359.0,
448
+ "mean_token_accuracy": 0.8109049916267395,
449
+ "epoch": 0.7383100902379,
450
+ "step": 450
451
+ },
452
+ {
453
+ "loss": 0.736,
454
+ "grad_norm": 0.3155220150947571,
455
+ "learning_rate": 1.6892346422817946e-05,
456
+ "entropy": 0.736938726902008,
457
+ "num_tokens": 10563841.0,
458
+ "mean_token_accuracy": 0.7979681819677353,
459
+ "epoch": 0.7547169811320755,
460
+ "step": 460
461
+ },
462
+ {
463
+ "loss": 0.6945,
464
+ "grad_norm": 0.3748078942298889,
465
+ "learning_rate": 1.4843559237933473e-05,
466
+ "entropy": 0.7031238079071045,
467
+ "num_tokens": 10788876.0,
468
+ "mean_token_accuracy": 0.8057133972644805,
469
+ "epoch": 0.771123872026251,
470
+ "step": 470
471
+ },
472
+ {
473
+ "loss": 0.6776,
474
+ "grad_norm": 0.3635546565055847,
475
+ "learning_rate": 1.2905387222316822e-05,
476
+ "entropy": 0.6805126667022705,
477
+ "num_tokens": 11015156.0,
478
+ "mean_token_accuracy": 0.8101104766130447,
479
+ "epoch": 0.7875307629204266,
480
+ "step": 480
481
+ },
482
+ {
483
+ "loss": 0.676,
484
+ "grad_norm": 0.3111382722854614,
485
+ "learning_rate": 1.1083928582183711e-05,
486
+ "entropy": 0.6774959295988083,
487
+ "num_tokens": 11245860.0,
488
+ "mean_token_accuracy": 0.8107922226190567,
489
+ "epoch": 0.8039376538146021,
490
+ "step": 490
491
+ },
492
+ {
493
+ "loss": 0.6742,
494
+ "grad_norm": 0.32188844680786133,
495
+ "learning_rate": 9.384914300290748e-06,
496
+ "entropy": 0.6842435419559478,
497
+ "num_tokens": 11476241.0,
498
+ "mean_token_accuracy": 0.8111602008342743,
499
+ "epoch": 0.8203445447087777,
500
+ "step": 500
501
+ },
502
+ {
503
+ "loss": 0.6544,
504
+ "grad_norm": 0.36185422539711,
505
+ "learning_rate": 7.813690104143557e-06,
506
+ "entropy": 0.6514311820268631,
507
+ "num_tokens": 11708112.0,
508
+ "mean_token_accuracy": 0.8149820327758789,
509
+ "epoch": 0.8367514356029533,
510
+ "step": 510
511
+ },
512
+ {
513
+ "loss": 0.6765,
514
+ "grad_norm": 0.3183876574039459,
515
+ "learning_rate": 6.375199646360142e-06,
516
+ "entropy": 0.6856429934501648,
517
+ "num_tokens": 11939337.0,
518
+ "mean_token_accuracy": 0.8090052843093872,
519
+ "epoch": 0.8531583264971287,
520
+ "step": 520
521
+ },
522
+ {
523
+ "loss": 0.6761,
524
+ "grad_norm": 0.3287002742290497,
525
+ "learning_rate": 5.073968950110941e-06,
526
+ "entropy": 0.6834310472011567,
527
+ "num_tokens": 12174723.0,
528
+ "mean_token_accuracy": 0.8104397505521774,
529
+ "epoch": 0.8695652173913043,
530
+ "step": 530
531
+ },
532
+ {
533
+ "loss": 0.6751,
534
+ "grad_norm": 0.35229238867759705,
535
+ "learning_rate": 3.914092168575306e-06,
536
+ "entropy": 0.6824660181999207,
537
+ "num_tokens": 12398555.0,
538
+ "mean_token_accuracy": 0.8104325562715531,
539
+ "epoch": 0.8859721082854799,
540
+ "step": 540
541
+ },
542
+ {
543
+ "loss": 0.6834,
544
+ "grad_norm": 0.38912639021873474,
545
+ "learning_rate": 2.8992187032210518e-06,
546
+ "entropy": 0.682240468263626,
547
+ "num_tokens": 12624846.0,
548
+ "mean_token_accuracy": 0.8091065347194671,
549
+ "epoch": 0.9023789991796555,
550
+ "step": 550
551
+ },
552
+ {
553
+ "loss": 0.696,
554
+ "grad_norm": 0.306355744600296,
555
+ "learning_rate": 2.032541721437209e-06,
556
+ "entropy": 0.7058492481708527,
557
+ "num_tokens": 12859015.0,
558
+ "mean_token_accuracy": 0.8039765357971191,
559
+ "epoch": 0.918785890073831,
560
+ "step": 560
561
+ },
562
+ {
563
+ "loss": 0.6727,
564
+ "grad_norm": 0.38508960604667664,
565
+ "learning_rate": 1.3167881096480372e-06,
566
+ "entropy": 0.681548210978508,
567
+ "num_tokens": 13083551.0,
568
+ "mean_token_accuracy": 0.8100948423147202,
569
+ "epoch": 0.9351927809680065,
570
+ "step": 570
571
+ },
572
+ {
573
+ "loss": 0.7208,
574
+ "grad_norm": 0.33893731236457825,
575
+ "learning_rate": 7.542098935195918e-07,
576
+ "entropy": 0.7220237284898758,
577
+ "num_tokens": 13308857.0,
578
+ "mean_token_accuracy": 0.8005945891141891,
579
+ "epoch": 0.9515996718621821,
580
+ "step": 580
581
+ },
582
+ {
583
+ "loss": 0.6759,
584
+ "grad_norm": 0.3534739911556244,
585
+ "learning_rate": 3.465771522536854e-07,
586
+ "entropy": 0.6739370882511139,
587
+ "num_tokens": 13543857.0,
588
+ "mean_token_accuracy": 0.8097480118274689,
589
+ "epoch": 0.9680065627563577,
590
+ "step": 590
591
+ },
592
+ {
593
+ "loss": 0.6865,
594
+ "grad_norm": 0.3553875982761383,
595
+ "learning_rate": 9.517244926393609e-08,
596
+ "entropy": 0.6908959478139878,
597
+ "num_tokens": 13769574.0,
598
+ "mean_token_accuracy": 0.806584045290947,
599
+ "epoch": 0.9844134536505332,
600
+ "step": 600
601
+ },
602
+ {
603
+ "loss": 0.6525,
604
+ "grad_norm": 0.5078703761100769,
605
+ "learning_rate": 7.867967567354306e-10,
606
+ "entropy": 0.6598060852602908,
607
+ "num_tokens": 13978118.0,
608
+ "mean_token_accuracy": 0.8165042933664823,
609
+ "epoch": 1.0,
610
+ "step": 610
611
+ },
612
+ {
613
+ "train_runtime": 6449.4338,
614
+ "train_samples_per_second": 3.024,
615
+ "train_steps_per_second": 0.095,
616
+ "total_flos": 7.156995496917074e+18,
617
+ "train_loss": 0.7796625786140317,
618
+ "epoch": 1.0,
619
+ "step": 610
620
+ }
621
+ ]