Gege24 commited on
Commit
7bd0ca4
·
verified ·
1 Parent(s): 3739dc8

Upload task output 22683721-f995-4c95-ad47-2bd47697bc9d

Browse files
Files changed (4) hide show
  1. loss.txt +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +376 -376
  4. training_args.bin +1 -1
loss.txt CHANGED
@@ -1 +1 @@
1
- 600,0.12757942080497742
 
1
+ 600,0.1242036521434784
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de788c65166c635cb384206107dc0c6c7b9053ce108d4d43a15b90fcaa90ad09
3
  size 324662984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da3dd006f7b033ce16aea3b41d53313e0f1f5fc5a315238d0db5334aa24fa47
3
  size 324662984
trainer_state.json CHANGED
@@ -11,874 +11,874 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.025,
14
- "grad_norm": 172.0,
15
- "learning_rate": 1.4913733827943399e-05,
16
- "loss": 0.5808,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.05,
21
- "grad_norm": 175.0,
22
- "learning_rate": 3.355590111287265e-05,
23
- "loss": 0.4361,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.075,
28
- "grad_norm": 195.0,
29
- "learning_rate": 5.21980683978019e-05,
30
- "loss": 0.3371,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1,
35
- "grad_norm": 108.5,
36
- "learning_rate": 7.084023568273114e-05,
37
- "loss": 0.3287,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.125,
42
- "grad_norm": 95.5,
43
- "learning_rate": 8.94824029676604e-05,
44
- "loss": 0.3412,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.15,
49
- "grad_norm": 244.0,
50
- "learning_rate": 0.00010812457025258965,
51
- "loss": 0.8152,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.175,
56
- "grad_norm": 52.5,
57
- "learning_rate": 0.0001267667375375189,
58
- "loss": 1.1844,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.2,
63
- "grad_norm": 15.6875,
64
- "learning_rate": 0.00013048306778245657,
65
- "loss": 0.4712,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.225,
70
- "grad_norm": 17.75,
71
- "learning_rate": 0.00013043390874416683,
72
- "loss": 0.336,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.25,
77
- "grad_norm": 12.25,
78
- "learning_rate": 0.00013034697539393133,
79
- "loss": 0.3254,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.275,
84
- "grad_norm": 15.6875,
85
- "learning_rate": 0.00013022233492123342,
86
- "loss": 0.32,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.3,
91
- "grad_norm": 8.0,
92
- "learning_rate": 0.00013006008365882616,
93
- "loss": 0.2932,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.325,
98
- "grad_norm": 6.03125,
99
- "learning_rate": 0.0001298603470082783,
100
- "loss": 0.2688,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.35,
105
- "grad_norm": 4.0,
106
- "learning_rate": 0.00012962327934305313,
107
- "loss": 0.2377,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.375,
112
- "grad_norm": 13.4375,
113
- "learning_rate": 0.00012934906388919573,
114
- "loss": 0.2255,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.4,
119
- "grad_norm": 3.75,
120
- "learning_rate": 0.0001290379125837201,
121
- "loss": 0.2182,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.425,
126
- "grad_norm": 3.015625,
127
- "learning_rate": 0.00012869006591080637,
128
- "loss": 0.2016,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.45,
133
- "grad_norm": 6.46875,
134
- "learning_rate": 0.0001283057927159341,
135
- "loss": 0.22,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.475,
140
- "grad_norm": 4.40625,
141
- "learning_rate": 0.00012788538999809575,
142
- "loss": 0.2246,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.5,
147
- "grad_norm": 6.3125,
148
- "learning_rate": 0.00012742918268025043,
149
- "loss": 0.2283,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.525,
154
- "grad_norm": 4.6875,
155
- "learning_rate": 0.00012693752335819598,
156
- "loss": 0.2131,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.55,
161
- "grad_norm": 4.03125,
162
- "learning_rate": 0.0001264107920280529,
163
- "loss": 0.2147,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.575,
168
- "grad_norm": 2.734375,
169
- "learning_rate": 0.00012584939579257098,
170
- "loss": 0.2142,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.6,
175
- "grad_norm": 3.46875,
176
- "learning_rate": 0.00012525376854648584,
177
- "loss": 0.2131,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.625,
182
- "grad_norm": 2.4375,
183
- "learning_rate": 0.000124624370641168,
184
- "loss": 0.2029,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.65,
189
- "grad_norm": 2.21875,
190
- "learning_rate": 0.00012396168852882444,
191
- "loss": 0.1917,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.675,
196
- "grad_norm": 2.21875,
197
- "learning_rate": 0.00012326623438652673,
198
- "loss": 0.1877,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.7,
203
- "grad_norm": 2.3125,
204
- "learning_rate": 0.00012253854572035722,
205
- "loss": 0.1902,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.725,
210
- "grad_norm": 2.625,
211
- "learning_rate": 0.00012177918494997859,
212
- "loss": 0.1907,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.75,
217
- "grad_norm": 2.234375,
218
- "learning_rate": 0.00012098873897394814,
219
- "loss": 0.1798,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 0.775,
224
- "grad_norm": 2.4375,
225
- "learning_rate": 0.00012016781871611264,
226
- "loss": 0.1809,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 0.8,
231
- "grad_norm": 14.0625,
232
- "learning_rate": 0.00011931705865343452,
233
- "loss": 0.1879,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 0.825,
238
- "grad_norm": 8.5625,
239
- "learning_rate": 0.00011843711632561409,
240
- "loss": 0.2072,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 0.85,
245
- "grad_norm": 9.375,
246
- "learning_rate": 0.00011752867182688697,
247
- "loss": 0.2098,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 0.875,
252
- "grad_norm": 4.78125,
253
- "learning_rate": 0.00011659242728038948,
254
- "loss": 0.2244,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 0.9,
259
- "grad_norm": 16.75,
260
- "learning_rate": 0.00011562910629549808,
261
- "loss": 0.248,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 0.925,
266
- "grad_norm": 3.15625,
267
- "learning_rate": 0.0001146394534085627,
268
- "loss": 0.2085,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 0.95,
273
- "grad_norm": 9.6875,
274
- "learning_rate": 0.00011362423350746566,
275
- "loss": 0.2193,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 0.975,
280
- "grad_norm": 7.875,
281
- "learning_rate": 0.00011258423124045127,
282
- "loss": 0.2158,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 1.0,
287
- "grad_norm": 7.65625,
288
- "learning_rate": 0.00011152025040968297,
289
- "loss": 0.2198,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 1.0,
294
- "eval_loss": 0.23302562534809113,
295
- "eval_runtime": 0.6083,
296
- "eval_samples_per_second": 42.739,
297
- "eval_steps_per_second": 42.739,
298
  "step": 200
299
  },
300
  {
301
  "epoch": 1.025,
302
- "grad_norm": 10.625,
303
- "learning_rate": 0.00011043311334999674,
304
- "loss": 0.2557,
305
  "step": 205
306
  },
307
  {
308
  "epoch": 1.05,
309
- "grad_norm": 7.46875,
310
- "learning_rate": 0.00010932366029333083,
311
- "loss": 0.244,
312
  "step": 210
313
  },
314
  {
315
  "epoch": 1.075,
316
- "grad_norm": 4.5625,
317
- "learning_rate": 0.00010819274871932301,
318
- "loss": 0.2418,
319
  "step": 215
320
  },
321
  {
322
  "epoch": 1.1,
323
- "grad_norm": 4.375,
324
- "learning_rate": 0.00010704125269257747,
325
- "loss": 0.2404,
326
  "step": 220
327
  },
328
  {
329
  "epoch": 1.125,
330
- "grad_norm": 3.796875,
331
- "learning_rate": 0.00010587006218711337,
332
- "loss": 0.2088,
333
  "step": 225
334
  },
335
  {
336
  "epoch": 1.15,
337
- "grad_norm": 3.15625,
338
- "learning_rate": 0.00010468008239851731,
339
- "loss": 0.2046,
340
  "step": 230
341
  },
342
  {
343
  "epoch": 1.175,
344
- "grad_norm": 6.375,
345
- "learning_rate": 0.00010347223304433115,
346
- "loss": 0.2003,
347
  "step": 235
348
  },
349
  {
350
  "epoch": 1.2,
351
- "grad_norm": 2.71875,
352
- "learning_rate": 0.00010224744765321614,
353
- "loss": 0.1991,
354
  "step": 240
355
  },
356
  {
357
  "epoch": 1.225,
358
- "grad_norm": 3.09375,
359
- "learning_rate": 0.00010100667284344267,
360
- "loss": 0.2017,
361
  "step": 245
362
  },
363
  {
364
  "epoch": 1.25,
365
- "grad_norm": 2.078125,
366
- "learning_rate": 9.975086759126306e-05,
367
- "loss": 0.1945,
368
  "step": 250
369
  },
370
  {
371
  "epoch": 1.275,
372
- "grad_norm": 2.0625,
373
- "learning_rate": 9.848100248973335e-05,
374
- "loss": 0.1827,
375
  "step": 255
376
  },
377
  {
378
  "epoch": 1.3,
379
- "grad_norm": 4.625,
380
- "learning_rate": 9.719805899855635e-05,
381
- "loss": 0.1792,
382
  "step": 260
383
  },
384
  {
385
  "epoch": 1.325,
386
- "grad_norm": 1.6328125,
387
- "learning_rate": 9.590302868552622e-05,
388
- "loss": 0.1768,
389
  "step": 265
390
  },
391
  {
392
  "epoch": 1.35,
393
- "grad_norm": 1.4140625,
394
- "learning_rate": 9.459691246016056e-05,
395
- "loss": 0.1692,
396
  "step": 270
397
  },
398
  {
399
  "epoch": 1.375,
400
- "grad_norm": 1.5078125,
401
- "learning_rate": 9.328071980011245e-05,
402
- "loss": 0.1717,
403
  "step": 275
404
  },
405
  {
406
  "epoch": 1.4,
407
- "grad_norm": 1.328125,
408
- "learning_rate": 9.195546797096022e-05,
409
- "loss": 0.1651,
410
  "step": 280
411
  },
412
  {
413
  "epoch": 1.425,
414
- "grad_norm": 1.3671875,
415
- "learning_rate": 9.062218123997836e-05,
416
- "loss": 0.1637,
417
  "step": 285
418
  },
419
  {
420
  "epoch": 1.45,
421
- "grad_norm": 1.2265625,
422
- "learning_rate": 8.928189008449641e-05,
423
- "loss": 0.1659,
424
  "step": 290
425
  },
426
  {
427
  "epoch": 1.475,
428
- "grad_norm": 1.5703125,
429
- "learning_rate": 8.793563039545874e-05,
430
- "loss": 0.1616,
431
  "step": 295
432
  },
433
  {
434
  "epoch": 1.5,
435
- "grad_norm": 1.8671875,
436
- "learning_rate": 8.658444267679969e-05,
437
- "loss": 0.1642,
438
  "step": 300
439
  },
440
  {
441
  "epoch": 1.525,
442
- "grad_norm": 2.46875,
443
- "learning_rate": 8.522937124125397e-05,
444
- "loss": 0.1674,
445
  "step": 305
446
  },
447
  {
448
  "epoch": 1.55,
449
- "grad_norm": 1.5625,
450
- "learning_rate": 8.387146340322277e-05,
451
- "loss": 0.1613,
452
  "step": 310
453
  },
454
  {
455
  "epoch": 1.575,
456
- "grad_norm": 1.3125,
457
- "learning_rate": 8.251176866932034e-05,
458
- "loss": 0.1595,
459
  "step": 315
460
  },
461
  {
462
  "epoch": 1.6,
463
- "grad_norm": 1.078125,
464
- "learning_rate": 8.115133792722579e-05,
465
- "loss": 0.1512,
466
  "step": 320
467
  },
468
  {
469
  "epoch": 1.625,
470
- "grad_norm": 1.515625,
471
- "learning_rate": 7.979122263346816e-05,
472
- "loss": 0.152,
473
  "step": 325
474
  },
475
  {
476
  "epoch": 1.65,
477
- "grad_norm": 1.7265625,
478
- "learning_rate": 7.8432474000771e-05,
479
- "loss": 0.1557,
480
  "step": 330
481
  },
482
  {
483
  "epoch": 1.675,
484
- "grad_norm": 1.453125,
485
- "learning_rate": 7.707614218558612e-05,
486
- "loss": 0.1531,
487
  "step": 335
488
  },
489
  {
490
  "epoch": 1.7,
491
- "grad_norm": 1.3046875,
492
- "learning_rate": 7.572327547644329e-05,
493
- "loss": 0.152,
494
  "step": 340
495
  },
496
  {
497
  "epoch": 1.725,
498
- "grad_norm": 1.390625,
499
- "learning_rate": 7.437491948374369e-05,
500
- "loss": 0.1491,
501
  "step": 345
502
  },
503
  {
504
  "epoch": 1.75,
505
- "grad_norm": 6.46875,
506
- "learning_rate": 7.303211633162347e-05,
507
- "loss": 0.1506,
508
  "step": 350
509
  },
510
  {
511
  "epoch": 1.775,
512
- "grad_norm": 1.6796875,
513
- "learning_rate": 7.169590385251147e-05,
514
- "loss": 0.1479,
515
  "step": 355
516
  },
517
  {
518
  "epoch": 1.8,
519
- "grad_norm": 1.2421875,
520
- "learning_rate": 7.036731478500415e-05,
521
- "loss": 0.1539,
522
  "step": 360
523
  },
524
  {
525
  "epoch": 1.825,
526
- "grad_norm": 1.046875,
527
- "learning_rate": 6.904737597567746e-05,
528
- "loss": 0.1461,
529
  "step": 365
530
  },
531
  {
532
  "epoch": 1.85,
533
- "grad_norm": 1.1171875,
534
- "learning_rate": 6.773710758545238e-05,
535
- "loss": 0.1483,
536
  "step": 370
537
  },
538
  {
539
  "epoch": 1.875,
540
- "grad_norm": 0.94140625,
541
- "learning_rate": 6.643752230112798e-05,
542
- "loss": 0.1473,
543
  "step": 375
544
  },
545
  {
546
  "epoch": 1.9,
547
- "grad_norm": 1.03125,
548
- "learning_rate": 6.514962455269088e-05,
549
- "loss": 0.1488,
550
  "step": 380
551
  },
552
  {
553
  "epoch": 1.925,
554
- "grad_norm": 0.8046875,
555
- "learning_rate": 6.38744097370064e-05,
556
- "loss": 0.1497,
557
  "step": 385
558
  },
559
  {
560
  "epoch": 1.95,
561
- "grad_norm": 0.95703125,
562
- "learning_rate": 6.261286344849127e-05,
563
- "loss": 0.1457,
564
  "step": 390
565
  },
566
  {
567
  "epoch": 1.975,
568
- "grad_norm": 0.85546875,
569
- "learning_rate": 6.136596071736244e-05,
570
- "loss": 0.1485,
571
  "step": 395
572
  },
573
  {
574
  "epoch": 2.0,
575
- "grad_norm": 1.1171875,
576
- "learning_rate": 6.0134665256050806e-05,
577
- "loss": 0.1456,
578
  "step": 400
579
  },
580
  {
581
  "epoch": 2.0,
582
- "eval_loss": 0.1398458033800125,
583
- "eval_runtime": 0.6061,
584
- "eval_samples_per_second": 42.895,
585
- "eval_steps_per_second": 42.895,
586
  "step": 400
587
  },
588
  {
589
  "epoch": 2.025,
590
- "grad_norm": 0.87109375,
591
- "learning_rate": 5.891992871436244e-05,
592
- "loss": 0.1469,
593
  "step": 405
594
  },
595
  {
596
  "epoch": 2.05,
597
- "grad_norm": 0.8125,
598
- "learning_rate": 5.772268994396255e-05,
599
- "loss": 0.1343,
600
  "step": 410
601
  },
602
  {
603
  "epoch": 2.075,
604
- "grad_norm": 0.83984375,
605
- "learning_rate": 5.6543874272751244e-05,
606
- "loss": 0.14,
607
  "step": 415
608
  },
609
  {
610
  "epoch": 2.1,
611
- "grad_norm": 0.80078125,
612
- "learning_rate": 5.5384392789691435e-05,
613
- "loss": 0.1368,
614
  "step": 420
615
  },
616
  {
617
  "epoch": 2.125,
618
- "grad_norm": 0.828125,
619
- "learning_rate": 5.4245141640641774e-05,
620
- "loss": 0.1431,
621
  "step": 425
622
  },
623
  {
624
  "epoch": 2.15,
625
- "grad_norm": 0.83203125,
626
- "learning_rate": 5.312700133573899e-05,
627
- "loss": 0.1394,
628
  "step": 430
629
  },
630
  {
631
  "epoch": 2.175,
632
- "grad_norm": 0.76171875,
633
- "learning_rate": 5.203083606886482e-05,
634
- "loss": 0.1379,
635
  "step": 435
636
  },
637
  {
638
  "epoch": 2.2,
639
- "grad_norm": 0.8984375,
640
- "learning_rate": 5.095749304972349e-05,
641
- "loss": 0.1365,
642
  "step": 440
643
  },
644
  {
645
  "epoch": 2.225,
646
- "grad_norm": 0.84375,
647
- "learning_rate": 4.990780184904607e-05,
648
- "loss": 0.1347,
649
  "step": 445
650
  },
651
  {
652
  "epoch": 2.25,
653
- "grad_norm": 0.8203125,
654
- "learning_rate": 4.888257375742759e-05,
655
- "loss": 0.1415,
656
  "step": 450
657
  },
658
  {
659
  "epoch": 2.275,
660
- "grad_norm": 0.91796875,
661
- "learning_rate": 4.788260115829281e-05,
662
- "loss": 0.1417,
663
  "step": 455
664
  },
665
  {
666
  "epoch": 2.3,
667
- "grad_norm": 0.984375,
668
- "learning_rate": 4.690865691547493e-05,
669
- "loss": 0.1354,
670
  "step": 460
671
  },
672
  {
673
  "epoch": 2.325,
674
- "grad_norm": 0.87890625,
675
- "learning_rate": 4.596149377588066e-05,
676
- "loss": 0.1346,
677
  "step": 465
678
  },
679
  {
680
  "epoch": 2.35,
681
- "grad_norm": 0.9453125,
682
- "learning_rate": 4.504184378770344e-05,
683
- "loss": 0.1389,
684
  "step": 470
685
  },
686
  {
687
  "epoch": 2.375,
688
- "grad_norm": 1.109375,
689
- "learning_rate": 4.415041773463443e-05,
690
- "loss": 0.1374,
691
  "step": 475
692
  },
693
  {
694
  "epoch": 2.4,
695
- "grad_norm": 0.87890625,
696
- "learning_rate": 4.3287904586508334e-05,
697
- "loss": 0.135,
698
  "step": 480
699
  },
700
  {
701
  "epoch": 2.425,
702
- "grad_norm": 0.83203125,
703
- "learning_rate": 4.2454970966809075e-05,
704
- "loss": 0.1333,
705
  "step": 485
706
  },
707
  {
708
  "epoch": 2.45,
709
- "grad_norm": 0.73046875,
710
- "learning_rate": 4.165226063744636e-05,
711
- "loss": 0.1387,
712
  "step": 490
713
  },
714
  {
715
  "epoch": 2.475,
716
- "grad_norm": 0.71484375,
717
- "learning_rate": 4.088039400120184e-05,
718
- "loss": 0.1302,
719
  "step": 495
720
  },
721
  {
722
  "epoch": 2.5,
723
- "grad_norm": 0.9375,
724
- "learning_rate": 4.0139967622229077e-05,
725
- "loss": 0.1379,
726
  "step": 500
727
  },
728
  {
729
  "epoch": 2.5,
730
- "eval_loss": 0.12946058809757233,
731
- "eval_runtime": 0.6048,
732
- "eval_samples_per_second": 42.989,
733
- "eval_steps_per_second": 42.989,
734
  "step": 500
735
  },
736
  {
737
  "epoch": 2.525,
738
- "grad_norm": 0.73046875,
739
- "learning_rate": 3.943155376497806e-05,
740
- "loss": 0.1328,
741
  "step": 505
742
  },
743
  {
744
  "epoch": 2.55,
745
- "grad_norm": 0.93359375,
746
- "learning_rate": 3.8755699951900555e-05,
747
- "loss": 0.134,
748
  "step": 510
749
  },
750
  {
751
  "epoch": 2.575,
752
- "grad_norm": 0.74609375,
753
- "learning_rate": 3.811292854027826e-05,
754
- "loss": 0.1327,
755
  "step": 515
756
  },
757
  {
758
  "epoch": 2.6,
759
- "grad_norm": 0.734375,
760
- "learning_rate": 3.750373631850063e-05,
761
- "loss": 0.1328,
762
  "step": 520
763
  },
764
  {
765
  "epoch": 2.625,
766
- "grad_norm": 0.74609375,
767
- "learning_rate": 3.692859412210464e-05,
768
- "loss": 0.1329,
769
  "step": 525
770
  },
771
  {
772
  "epoch": 2.65,
773
- "grad_norm": 0.69921875,
774
- "learning_rate": 3.6387946469873e-05,
775
- "loss": 0.1333,
776
  "step": 530
777
  },
778
  {
779
  "epoch": 2.675,
780
- "grad_norm": 0.69921875,
781
- "learning_rate": 3.588221122027231e-05,
782
- "loss": 0.1287,
783
  "step": 535
784
  },
785
  {
786
  "epoch": 2.7,
787
- "grad_norm": 0.59765625,
788
- "learning_rate": 3.541177924849646e-05,
789
- "loss": 0.1325,
790
  "step": 540
791
  },
792
  {
793
  "epoch": 2.725,
794
- "grad_norm": 0.77734375,
795
- "learning_rate": 3.497701414436508e-05,
796
- "loss": 0.1298,
797
  "step": 545
798
  },
799
  {
800
  "epoch": 2.75,
801
- "grad_norm": 0.65234375,
802
- "learning_rate": 3.457825193131042e-05,
803
- "loss": 0.1307,
804
  "step": 550
805
  },
806
  {
807
  "epoch": 2.775,
808
- "grad_norm": 0.73828125,
809
- "learning_rate": 3.4215800806669854e-05,
810
- "loss": 0.1326,
811
  "step": 555
812
  },
813
  {
814
  "epoch": 2.8,
815
- "grad_norm": 0.5859375,
816
- "learning_rate": 3.388994090348479e-05,
817
- "loss": 0.1356,
818
  "step": 560
819
  },
820
  {
821
  "epoch": 2.825,
822
- "grad_norm": 0.69921875,
823
- "learning_rate": 3.360092407399007e-05,
824
- "loss": 0.1277,
825
  "step": 565
826
  },
827
  {
828
  "epoch": 2.85,
829
- "grad_norm": 0.7109375,
830
- "learning_rate": 3.334897369496107e-05,
831
- "loss": 0.1295,
832
  "step": 570
833
  },
834
  {
835
  "epoch": 2.875,
836
- "grad_norm": 0.72265625,
837
- "learning_rate": 3.313428449506927e-05,
838
- "loss": 0.1286,
839
  "step": 575
840
  },
841
  {
842
  "epoch": 2.9,
843
- "grad_norm": 0.63671875,
844
- "learning_rate": 3.295702240437926e-05,
845
- "loss": 0.1357,
846
  "step": 580
847
  },
848
  {
849
  "epoch": 2.925,
850
- "grad_norm": 0.68359375,
851
- "learning_rate": 3.2817324426103896e-05,
852
- "loss": 0.131,
853
  "step": 585
854
  },
855
  {
856
  "epoch": 2.95,
857
- "grad_norm": 0.7109375,
858
- "learning_rate": 3.271529853071668e-05,
859
- "loss": 0.1293,
860
  "step": 590
861
  },
862
  {
863
  "epoch": 2.975,
864
- "grad_norm": 0.7578125,
865
- "learning_rate": 3.265102357250287e-05,
866
- "loss": 0.1286,
867
  "step": 595
868
  },
869
  {
870
  "epoch": 3.0,
871
- "grad_norm": 0.69140625,
872
- "learning_rate": 3.2624549228614246e-05,
873
- "loss": 0.132,
874
  "step": 600
875
  },
876
  {
877
  "epoch": 3.0,
878
- "eval_loss": 0.12757942080497742,
879
- "eval_runtime": 0.6043,
880
- "eval_samples_per_second": 43.022,
881
- "eval_steps_per_second": 43.022,
882
  "step": 600
883
  }
884
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.025,
14
+ "grad_norm": 198.0,
15
+ "learning_rate": 1.747190540874941e-05,
16
+ "loss": 0.5751,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.05,
21
+ "grad_norm": 186.0,
22
+ "learning_rate": 3.931178716968617e-05,
23
+ "loss": 0.4175,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.075,
28
+ "grad_norm": 270.0,
29
+ "learning_rate": 6.115166893062294e-05,
30
+ "loss": 0.3509,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1,
35
+ "grad_norm": 110.5,
36
+ "learning_rate": 8.29915506915597e-05,
37
+ "loss": 0.3508,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.125,
42
+ "grad_norm": 111.0,
43
+ "learning_rate": 0.00010483143245249646,
44
+ "loss": 0.4222,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.15,
49
+ "grad_norm": 83.5,
50
+ "learning_rate": 0.00012667131421343323,
51
+ "loss": 0.6498,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.175,
56
+ "grad_norm": 65.0,
57
+ "learning_rate": 0.00014851119597437,
58
+ "loss": 0.8894,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.2,
63
+ "grad_norm": 104.5,
64
+ "learning_rate": 0.0001528649930352754,
65
+ "loss": 0.6775,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.225,
70
+ "grad_norm": 29.5,
71
+ "learning_rate": 0.00015280740168512177,
72
+ "loss": 0.6998,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.25,
77
+ "grad_norm": 17.0,
78
+ "learning_rate": 0.00015270555654763282,
79
+ "loss": 0.6402,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.275,
84
+ "grad_norm": 16.75,
85
+ "learning_rate": 0.00015255953633738878,
86
+ "loss": 0.4757,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.3,
91
+ "grad_norm": 11.75,
92
+ "learning_rate": 0.0001523694539112214,
93
+ "loss": 0.3831,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.325,
98
+ "grad_norm": 6.5625,
99
+ "learning_rate": 0.00015213545618098876,
100
+ "loss": 0.3437,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.35,
105
+ "grad_norm": 6.03125,
106
+ "learning_rate": 0.00015185772400002907,
107
+ "loss": 0.2986,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.375,
112
+ "grad_norm": 4.46875,
113
+ "learning_rate": 0.00015153647202338207,
114
+ "loss": 0.2774,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.4,
119
+ "grad_norm": 7.6875,
120
+ "learning_rate": 0.00015117194854188525,
121
+ "loss": 0.2703,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.425,
126
+ "grad_norm": 5.65625,
127
+ "learning_rate": 0.00015076443529027353,
128
+ "loss": 0.2501,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.45,
133
+ "grad_norm": 26.375,
134
+ "learning_rate": 0.00015031424722943083,
135
+ "loss": 0.3738,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.475,
140
+ "grad_norm": 8.1875,
141
+ "learning_rate": 0.00014982173230296148,
142
+ "loss": 0.2765,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.5,
147
+ "grad_norm": 7.125,
148
+ "learning_rate": 0.00014928727116826976,
149
+ "loss": 0.2929,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.525,
154
+ "grad_norm": 7.8125,
155
+ "learning_rate": 0.00014871127690235564,
156
+ "loss": 0.2699,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.55,
161
+ "grad_norm": 6.125,
162
+ "learning_rate": 0.00014809419468255356,
163
+ "loss": 0.269,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.575,
168
+ "grad_norm": 4.1875,
169
+ "learning_rate": 0.00014743650144246167,
170
+ "loss": 0.2615,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.6,
175
+ "grad_norm": 3.828125,
176
+ "learning_rate": 0.00014673870550332703,
177
+ "loss": 0.2325,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.625,
182
+ "grad_norm": 5.375,
183
+ "learning_rate": 0.00014600134618117166,
184
+ "loss": 0.2295,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.65,
189
+ "grad_norm": 3.953125,
190
+ "learning_rate": 0.0001452249933699633,
191
+ "loss": 0.2407,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.675,
196
+ "grad_norm": 2.09375,
197
+ "learning_rate": 0.0001444102471011529,
198
+ "loss": 0.2192,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.7,
203
+ "grad_norm": 1.8515625,
204
+ "learning_rate": 0.00014355773707991926,
205
+ "loss": 0.2133,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.725,
210
+ "grad_norm": 2.078125,
211
+ "learning_rate": 0.00014266812219847945,
212
+ "loss": 0.2014,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.75,
217
+ "grad_norm": 2.0625,
218
+ "learning_rate": 0.00014174209002684087,
219
+ "loss": 0.191,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 0.775,
224
+ "grad_norm": 2.828125,
225
+ "learning_rate": 0.00014078035628138847,
226
+ "loss": 0.2024,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 0.8,
231
+ "grad_norm": 2.015625,
232
+ "learning_rate": 0.00013978366427171864,
233
+ "loss": 0.1871,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 0.825,
238
+ "grad_norm": 1.90625,
239
+ "learning_rate": 0.00013875278432614612,
240
+ "loss": 0.1751,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 0.85,
245
+ "grad_norm": 1.1171875,
246
+ "learning_rate": 0.00013768851319632887,
247
+ "loss": 0.1679,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 0.875,
252
+ "grad_norm": 2.015625,
253
+ "learning_rate": 0.00013659167344147067,
254
+ "loss": 0.1818,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 0.9,
259
+ "grad_norm": 1.8671875,
260
+ "learning_rate": 0.0001354631127925774,
261
+ "loss": 0.1692,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 0.925,
266
+ "grad_norm": 1.5078125,
267
+ "learning_rate": 0.0001343037034972584,
268
+ "loss": 0.1619,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 0.95,
273
+ "grad_norm": 1.6328125,
274
+ "learning_rate": 0.0001331143416455796,
275
+ "loss": 0.1617,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 0.975,
280
+ "grad_norm": 1.6953125,
281
+ "learning_rate": 0.00013189594647748868,
282
+ "loss": 0.1615,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 1.0,
287
+ "grad_norm": 1.5,
288
+ "learning_rate": 0.00013064945967234835,
289
+ "loss": 0.1689,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 1.0,
294
+ "eval_loss": 0.16072207689285278,
295
+ "eval_runtime": 0.5972,
296
+ "eval_samples_per_second": 43.533,
297
+ "eval_steps_per_second": 43.533,
298
  "step": 200
299
  },
300
  {
301
  "epoch": 1.025,
302
+ "grad_norm": 1.828125,
303
+ "learning_rate": 0.0001293758446211266,
304
+ "loss": 0.1629,
305
  "step": 205
306
  },
307
  {
308
  "epoch": 1.05,
309
+ "grad_norm": 1.59375,
310
+ "learning_rate": 0.00012807608568180618,
311
+ "loss": 0.1624,
312
  "step": 210
313
  },
314
  {
315
  "epoch": 1.075,
316
+ "grad_norm": 1.0859375,
317
+ "learning_rate": 0.00012675118741858906,
318
+ "loss": 0.1614,
319
  "step": 215
320
  },
321
  {
322
  "epoch": 1.1,
323
+ "grad_norm": 1.265625,
324
+ "learning_rate": 0.00012540217382548384,
325
+ "loss": 0.1636,
326
  "step": 220
327
  },
328
  {
329
  "epoch": 1.125,
330
+ "grad_norm": 1.46875,
331
+ "learning_rate": 0.0001240300875348761,
332
+ "loss": 0.1567,
333
  "step": 225
334
  },
335
  {
336
  "epoch": 1.15,
337
+ "grad_norm": 2.609375,
338
+ "learning_rate": 0.0001226359890116935,
339
+ "loss": 0.1663,
340
  "step": 230
341
  },
342
  {
343
  "epoch": 1.175,
344
+ "grad_norm": 1.9140625,
345
+ "learning_rate": 0.00012122095573378837,
346
+ "loss": 0.1774,
347
  "step": 235
348
  },
349
  {
350
  "epoch": 1.2,
351
+ "grad_norm": 3.015625,
352
+ "learning_rate": 0.00011978608135917105,
353
+ "loss": 0.1701,
354
  "step": 240
355
  },
356
  {
357
  "epoch": 1.225,
358
+ "grad_norm": 2.59375,
359
+ "learning_rate": 0.00011833247488073823,
360
+ "loss": 0.1853,
361
  "step": 245
362
  },
363
  {
364
  "epoch": 1.25,
365
+ "grad_norm": 2.453125,
366
+ "learning_rate": 0.00011686125976914878,
367
+ "loss": 0.1948,
368
  "step": 250
369
  },
370
  {
371
  "epoch": 1.275,
372
+ "grad_norm": 1.3359375,
373
+ "learning_rate": 0.00011537357310451031,
374
+ "loss": 0.1733,
375
  "step": 255
376
  },
377
  {
378
  "epoch": 1.3,
379
+ "grad_norm": 1.890625,
380
+ "learning_rate": 0.00011387056469754679,
381
+ "loss": 0.1624,
382
  "step": 260
383
  },
384
  {
385
  "epoch": 1.325,
386
+ "grad_norm": 2.390625,
387
+ "learning_rate": 0.00011235339620092721,
388
+ "loss": 0.1684,
389
  "step": 265
390
  },
391
  {
392
  "epoch": 1.35,
393
+ "grad_norm": 2.390625,
394
+ "learning_rate": 0.0001108232402114416,
395
+ "loss": 0.169,
396
  "step": 270
397
  },
398
  {
399
  "epoch": 1.375,
400
+ "grad_norm": 2.65625,
401
+ "learning_rate": 0.0001092812793637186,
402
+ "loss": 0.1789,
403
  "step": 275
404
  },
405
  {
406
  "epoch": 1.4,
407
+ "grad_norm": 1.6875,
408
+ "learning_rate": 0.0001077287054161847,
409
+ "loss": 0.1695,
410
  "step": 280
411
  },
412
  {
413
  "epoch": 1.425,
414
+ "grad_norm": 1.4375,
415
+ "learning_rate": 0.00010616671832997237,
416
+ "loss": 0.1671,
417
  "step": 285
418
  },
419
  {
420
  "epoch": 1.45,
421
+ "grad_norm": 1.0390625,
422
+ "learning_rate": 0.00010459652534148764,
423
+ "loss": 0.1675,
424
  "step": 290
425
  },
426
  {
427
  "epoch": 1.475,
428
+ "grad_norm": 0.97265625,
429
+ "learning_rate": 0.00010301934002935564,
430
+ "loss": 0.159,
431
  "step": 295
432
  },
433
  {
434
  "epoch": 1.5,
435
+ "grad_norm": 1.7109375,
436
+ "learning_rate": 0.00010143638137646338,
437
+ "loss": 0.1578,
438
  "step": 300
439
  },
440
  {
441
  "epoch": 1.525,
442
+ "grad_norm": 1.171875,
443
+ "learning_rate": 9.984887282782665e-05,
444
+ "loss": 0.1588,
445
  "step": 305
446
  },
447
  {
448
  "epoch": 1.55,
449
+ "grad_norm": 1.0546875,
450
+ "learning_rate": 9.825804134500727e-05,
451
+ "loss": 0.1535,
452
  "step": 310
453
  },
454
  {
455
  "epoch": 1.575,
456
+ "grad_norm": 1.296875,
457
+ "learning_rate": 9.666511645781328e-05,
458
+ "loss": 0.1587,
459
  "step": 315
460
  },
461
  {
462
  "epoch": 1.6,
463
+ "grad_norm": 1.109375,
464
+ "learning_rate": 9.507132931401333e-05,
465
+ "loss": 0.1514,
466
  "step": 320
467
  },
468
  {
469
  "epoch": 1.625,
470
+ "grad_norm": 1.21875,
471
+ "learning_rate": 9.347791172780155e-05,
472
+ "loss": 0.1486,
473
  "step": 325
474
  },
475
  {
476
  "epoch": 1.65,
477
+ "grad_norm": 1.0703125,
478
+ "learning_rate": 9.188609522774628e-05,
479
+ "loss": 0.149,
480
  "step": 330
481
  },
482
  {
483
  "epoch": 1.675,
484
+ "grad_norm": 0.87109375,
485
+ "learning_rate": 9.029711010496061e-05,
486
+ "loss": 0.1467,
487
  "step": 335
488
  },
489
  {
490
  "epoch": 1.7,
491
+ "grad_norm": 0.8984375,
492
+ "learning_rate": 8.871218446222844e-05,
493
+ "loss": 0.1456,
494
  "step": 340
495
  },
496
  {
497
  "epoch": 1.725,
498
+ "grad_norm": 0.7734375,
499
+ "learning_rate": 8.713254326482237e-05,
500
+ "loss": 0.1439,
501
  "step": 345
502
  },
503
  {
504
  "epoch": 1.75,
505
+ "grad_norm": 1.0625,
506
+ "learning_rate": 8.555940739374653e-05,
507
+ "loss": 0.1426,
508
  "step": 350
509
  },
510
  {
511
  "epoch": 1.775,
512
+ "grad_norm": 0.9453125,
513
+ "learning_rate": 8.399399270213575e-05,
514
+ "loss": 0.1399,
515
  "step": 355
516
  },
517
  {
518
  "epoch": 1.8,
519
+ "grad_norm": 0.953125,
520
+ "learning_rate": 8.243750907554097e-05,
521
+ "loss": 0.1436,
522
  "step": 360
523
  },
524
  {
525
  "epoch": 1.825,
526
+ "grad_norm": 0.7421875,
527
+ "learning_rate": 8.089115949682696e-05,
528
+ "loss": 0.1375,
529
  "step": 365
530
  },
531
  {
532
  "epoch": 1.85,
533
+ "grad_norm": 0.6640625,
534
+ "learning_rate": 7.935613911640464e-05,
535
+ "loss": 0.1384,
536
  "step": 370
537
  },
538
  {
539
  "epoch": 1.875,
540
+ "grad_norm": 0.828125,
541
+ "learning_rate": 7.783363432851746e-05,
542
+ "loss": 0.1366,
543
  "step": 375
544
  },
545
  {
546
  "epoch": 1.9,
547
+ "grad_norm": 0.859375,
548
+ "learning_rate": 7.632482185429501e-05,
549
+ "loss": 0.1374,
550
  "step": 380
551
  },
552
  {
553
  "epoch": 1.925,
554
+ "grad_norm": 0.87890625,
555
+ "learning_rate": 7.483086783228284e-05,
556
+ "loss": 0.1413,
557
  "step": 385
558
  },
559
  {
560
  "epoch": 1.95,
561
+ "grad_norm": 1.1015625,
562
+ "learning_rate": 7.335292691715154e-05,
563
+ "loss": 0.1371,
564
  "step": 390
565
  },
566
  {
567
  "epoch": 1.975,
568
+ "grad_norm": 0.8359375,
569
+ "learning_rate": 7.189214138728142e-05,
570
+ "loss": 0.1413,
571
  "step": 395
572
  },
573
  {
574
  "epoch": 2.0,
575
+ "grad_norm": 0.8359375,
576
+ "learning_rate": 7.044964026191261e-05,
577
+ "loss": 0.1378,
578
  "step": 400
579
  },
580
  {
581
  "epoch": 2.0,
582
+ "eval_loss": 0.13336719572544098,
583
+ "eval_runtime": 0.5924,
584
+ "eval_samples_per_second": 43.887,
585
+ "eval_steps_per_second": 43.887,
586
  "step": 400
587
  },
588
  {
589
  "epoch": 2.025,
590
+ "grad_norm": 0.8125,
591
+ "learning_rate": 6.902653842854314e-05,
592
+ "loss": 0.1382,
593
  "step": 405
594
  },
595
  {
596
  "epoch": 2.05,
597
+ "grad_norm": 0.6640625,
598
+ "learning_rate": 6.762393578124894e-05,
599
+ "loss": 0.1273,
600
  "step": 410
601
  },
602
  {
603
  "epoch": 2.075,
604
+ "grad_norm": 0.6640625,
605
+ "learning_rate": 6.624291637059237e-05,
606
+ "loss": 0.1319,
607
  "step": 415
608
  },
609
  {
610
  "epoch": 2.1,
611
+ "grad_norm": 0.6796875,
612
+ "learning_rate": 6.48845475657757e-05,
613
+ "loss": 0.1296,
614
  "step": 420
615
  },
616
  {
617
  "epoch": 2.125,
618
+ "grad_norm": 0.68359375,
619
+ "learning_rate": 6.354987922968741e-05,
620
+ "loss": 0.136,
621
  "step": 425
622
  },
623
  {
624
  "epoch": 2.15,
625
+ "grad_norm": 0.8203125,
626
+ "learning_rate": 6.223994290747898e-05,
627
+ "loss": 0.1332,
628
  "step": 430
629
  },
630
  {
631
  "epoch": 2.175,
632
+ "grad_norm": 0.79296875,
633
+ "learning_rate": 6.095575102929907e-05,
634
+ "loss": 0.1337,
635
  "step": 435
636
  },
637
  {
638
  "epoch": 2.2,
639
+ "grad_norm": 0.7421875,
640
+ "learning_rate": 5.969829612780141e-05,
641
+ "loss": 0.1295,
642
  "step": 440
643
  },
644
  {
645
  "epoch": 2.225,
646
+ "grad_norm": 0.85546875,
647
+ "learning_rate": 5.8468550071031296e-05,
648
+ "loss": 0.1291,
649
  "step": 445
650
  },
651
  {
652
  "epoch": 2.25,
653
+ "grad_norm": 1.0390625,
654
+ "learning_rate": 5.726746331128316e-05,
655
+ "loss": 0.1364,
656
  "step": 450
657
  },
658
  {
659
  "epoch": 2.275,
660
+ "grad_norm": 1.0,
661
+ "learning_rate": 5.609596415051039e-05,
662
+ "loss": 0.1365,
663
  "step": 455
664
  },
665
  {
666
  "epoch": 2.3,
667
+ "grad_norm": 1.0390625,
668
+ "learning_rate": 5.495495802285465e-05,
669
+ "loss": 0.1317,
670
  "step": 460
671
  },
672
  {
673
  "epoch": 2.325,
674
+ "grad_norm": 0.7890625,
675
+ "learning_rate": 5.384532679484933e-05,
676
+ "loss": 0.1296,
677
  "step": 465
678
  },
679
  {
680
  "epoch": 2.35,
681
+ "grad_norm": 0.66796875,
682
+ "learning_rate": 5.276792808383817e-05,
683
+ "loss": 0.1355,
684
  "step": 470
685
  },
686
  {
687
  "epoch": 2.375,
688
+ "grad_norm": 0.875,
689
+ "learning_rate": 5.1723594595135666e-05,
690
+ "loss": 0.1324,
691
  "step": 475
692
  },
693
  {
694
  "epoch": 2.4,
695
+ "grad_norm": 0.77734375,
696
+ "learning_rate": 5.07131334784416e-05,
697
+ "loss": 0.1315,
698
  "step": 480
699
  },
700
  {
701
  "epoch": 2.425,
702
+ "grad_norm": 0.8359375,
703
+ "learning_rate": 4.973732570400718e-05,
704
+ "loss": 0.1284,
705
  "step": 485
706
  },
707
  {
708
  "epoch": 2.45,
709
+ "grad_norm": 0.79296875,
710
+ "learning_rate": 4.879692545903476e-05,
711
+ "loss": 0.1345,
712
  "step": 490
713
  },
714
  {
715
  "epoch": 2.475,
716
+ "grad_norm": 0.875,
717
+ "learning_rate": 4.789265956477791e-05,
718
+ "loss": 0.126,
719
  "step": 495
720
  },
721
  {
722
  "epoch": 2.5,
723
+ "grad_norm": 0.953125,
724
+ "learning_rate": 4.702522691479217e-05,
725
+ "loss": 0.1345,
726
  "step": 500
727
  },
728
  {
729
  "epoch": 2.5,
730
+ "eval_loss": 0.12545974552631378,
731
+ "eval_runtime": 0.5917,
732
+ "eval_samples_per_second": 43.943,
733
+ "eval_steps_per_second": 43.943,
734
  "step": 500
735
  },
736
  {
737
  "epoch": 2.525,
738
+ "grad_norm": 0.8125,
739
+ "learning_rate": 4.619529793477068e-05,
740
+ "loss": 0.1291,
741
  "step": 505
742
  },
743
  {
744
  "epoch": 2.55,
745
+ "grad_norm": 1.078125,
746
+ "learning_rate": 4.540351406438219e-05,
747
+ "loss": 0.1301,
748
  "step": 510
749
  },
750
  {
751
  "epoch": 2.575,
752
+ "grad_norm": 0.921875,
753
+ "learning_rate": 4.465048726151201e-05,
754
+ "loss": 0.128,
755
  "step": 515
756
  },
757
  {
758
  "epoch": 2.6,
759
+ "grad_norm": 1.1875,
760
+ "learning_rate": 4.393679952928885e-05,
761
+ "loss": 0.1288,
762
  "step": 520
763
  },
764
  {
765
  "epoch": 2.625,
766
+ "grad_norm": 1.203125,
767
+ "learning_rate": 4.3263002466263436e-05,
768
+ "loss": 0.1281,
769
  "step": 525
770
  },
771
  {
772
  "epoch": 2.65,
773
+ "grad_norm": 0.9609375,
774
+ "learning_rate": 4.262961684008613e-05,
775
+ "loss": 0.1305,
776
  "step": 530
777
  },
778
  {
779
  "epoch": 2.675,
780
+ "grad_norm": 0.88671875,
781
+ "learning_rate": 4.203713218501353e-05,
782
+ "loss": 0.1258,
783
  "step": 535
784
  },
785
  {
786
  "epoch": 2.7,
787
+ "grad_norm": 0.9921875,
788
+ "learning_rate": 4.1486006423554745e-05,
789
+ "loss": 0.1284,
790
  "step": 540
791
  },
792
  {
793
  "epoch": 2.725,
794
+ "grad_norm": 0.84765625,
795
+ "learning_rate": 4.097666551254989e-05,
796
+ "loss": 0.1269,
797
  "step": 545
798
  },
799
  {
800
  "epoch": 2.75,
801
+ "grad_norm": 0.92578125,
802
+ "learning_rate": 4.0509503113954545e-05,
803
+ "loss": 0.128,
804
  "step": 550
805
  },
806
  {
807
  "epoch": 2.775,
808
+ "grad_norm": 0.90234375,
809
+ "learning_rate": 4.008488029058422e-05,
810
+ "loss": 0.1293,
811
  "step": 555
812
  },
813
  {
814
  "epoch": 2.8,
815
+ "grad_norm": 0.85546875,
816
+ "learning_rate": 3.9703125227054376e-05,
817
+ "loss": 0.132,
818
  "step": 560
819
  },
820
  {
821
  "epoch": 2.825,
822
+ "grad_norm": 0.859375,
823
+ "learning_rate": 3.9364532976131475e-05,
824
+ "loss": 0.1247,
825
  "step": 565
826
  },
827
  {
828
  "epoch": 2.85,
829
+ "grad_norm": 1.6796875,
830
+ "learning_rate": 3.906936523069101e-05,
831
+ "loss": 0.1246,
832
  "step": 570
833
  },
834
  {
835
  "epoch": 2.875,
836
+ "grad_norm": 1.0390625,
837
+ "learning_rate": 3.8817850121459174e-05,
838
+ "loss": 0.1246,
839
  "step": 575
840
  },
841
  {
842
  "epoch": 2.9,
843
+ "grad_norm": 1.03125,
844
+ "learning_rate": 3.861018204069391e-05,
845
+ "loss": 0.1322,
846
  "step": 580
847
  },
848
  {
849
  "epoch": 2.925,
850
+ "grad_norm": 0.859375,
851
+ "learning_rate": 3.8446521491942034e-05,
852
+ "loss": 0.1277,
853
  "step": 585
854
  },
855
  {
856
  "epoch": 2.95,
857
+ "grad_norm": 0.984375,
858
+ "learning_rate": 3.832699496598859e-05,
859
+ "loss": 0.1264,
860
  "step": 590
861
  },
862
  {
863
  "epoch": 2.975,
864
+ "grad_norm": 0.81640625,
865
+ "learning_rate": 3.8251694843093894e-05,
866
+ "loss": 0.1277,
867
  "step": 595
868
  },
869
  {
870
  "epoch": 3.0,
871
+ "grad_norm": 0.78515625,
872
+ "learning_rate": 3.8220679321594226e-05,
873
+ "loss": 0.1288,
874
  "step": 600
875
  },
876
  {
877
  "epoch": 3.0,
878
+ "eval_loss": 0.1242036521434784,
879
+ "eval_runtime": 0.5964,
880
+ "eval_samples_per_second": 43.593,
881
+ "eval_steps_per_second": 43.593,
882
  "step": 600
883
  }
884
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67a1019718a4de0599994b980cfc030edc112fa7937d0aaf986d18a627026760
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ded8479bbdb5f158c653056fc1b2defb99d73e60bfc8e9a594082dbf0c979c26
3
  size 5688