{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9952830188679245,
"eval_steps": 500,
"global_step": 846,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01179245283018868,
"grad_norm": 6.4375,
"learning_rate": 1.3142857142857143e-06,
"loss": 1.9687,
"step": 5
},
{
"epoch": 0.02358490566037736,
"grad_norm": 4.25,
"learning_rate": 2.9571428571428567e-06,
"loss": 1.9487,
"step": 10
},
{
"epoch": 0.03537735849056604,
"grad_norm": 3.09375,
"learning_rate": 4.6e-06,
"loss": 1.8763,
"step": 15
},
{
"epoch": 0.04716981132075472,
"grad_norm": 2.78125,
"learning_rate": 6.242857142857142e-06,
"loss": 1.8502,
"step": 20
},
{
"epoch": 0.0589622641509434,
"grad_norm": 2.546875,
"learning_rate": 7.885714285714286e-06,
"loss": 1.8808,
"step": 25
},
{
"epoch": 0.07075471698113207,
"grad_norm": 2.625,
"learning_rate": 9.528571428571429e-06,
"loss": 1.815,
"step": 30
},
{
"epoch": 0.08254716981132075,
"grad_norm": 2.53125,
"learning_rate": 1.1171428571428571e-05,
"loss": 1.7632,
"step": 35
},
{
"epoch": 0.09433962264150944,
"grad_norm": 2.390625,
"learning_rate": 1.1499777476590145e-05,
"loss": 1.7819,
"step": 40
},
{
"epoch": 0.10613207547169812,
"grad_norm": 2.3125,
"learning_rate": 1.1498873514595152e-05,
"loss": 1.7612,
"step": 45
},
{
"epoch": 0.1179245283018868,
"grad_norm": 2.421875,
"learning_rate": 1.1497274351951891e-05,
"loss": 1.7435,
"step": 50
},
{
"epoch": 0.12971698113207547,
"grad_norm": 2.28125,
"learning_rate": 1.1494980246522254e-05,
"loss": 1.7239,
"step": 55
},
{
"epoch": 0.14150943396226415,
"grad_norm": 2.390625,
"learning_rate": 1.1491991568226304e-05,
"loss": 1.7142,
"step": 60
},
{
"epoch": 0.15330188679245282,
"grad_norm": 2.453125,
"learning_rate": 1.1488308798982647e-05,
"loss": 1.7514,
"step": 65
},
{
"epoch": 0.1650943396226415,
"grad_norm": 2.40625,
"learning_rate": 1.14839325326307e-05,
"loss": 1.7355,
"step": 70
},
{
"epoch": 0.17688679245283018,
"grad_norm": 2.375,
"learning_rate": 1.1478863474834962e-05,
"loss": 1.7629,
"step": 75
},
{
"epoch": 0.18867924528301888,
"grad_norm": 2.265625,
"learning_rate": 1.1473102442971205e-05,
"loss": 1.7157,
"step": 80
},
{
"epoch": 0.20047169811320756,
"grad_norm": 2.375,
"learning_rate": 1.1466650365994684e-05,
"loss": 1.7379,
"step": 85
},
{
"epoch": 0.21226415094339623,
"grad_norm": 2.265625,
"learning_rate": 1.1459508284290345e-05,
"loss": 1.7371,
"step": 90
},
{
"epoch": 0.2240566037735849,
"grad_norm": 2.359375,
"learning_rate": 1.145167734950506e-05,
"loss": 1.7067,
"step": 95
},
{
"epoch": 0.2358490566037736,
"grad_norm": 2.375,
"learning_rate": 1.1443158824361937e-05,
"loss": 1.738,
"step": 100
},
{
"epoch": 0.24764150943396226,
"grad_norm": 2.375,
"learning_rate": 1.1433954082456689e-05,
"loss": 1.7135,
"step": 105
},
{
"epoch": 0.25943396226415094,
"grad_norm": 2.34375,
"learning_rate": 1.1424064608036165e-05,
"loss": 1.6967,
"step": 110
},
{
"epoch": 0.27122641509433965,
"grad_norm": 2.265625,
"learning_rate": 1.1413491995759002e-05,
"loss": 1.7233,
"step": 115
},
{
"epoch": 0.2830188679245283,
"grad_norm": 2.34375,
"learning_rate": 1.1402237950438503e-05,
"loss": 1.7345,
"step": 120
},
{
"epoch": 0.294811320754717,
"grad_norm": 2.390625,
"learning_rate": 1.139030428676773e-05,
"loss": 1.6991,
"step": 125
},
{
"epoch": 0.30660377358490565,
"grad_norm": 2.421875,
"learning_rate": 1.1377692929026888e-05,
"loss": 1.6991,
"step": 130
},
{
"epoch": 0.31839622641509435,
"grad_norm": 2.375,
"learning_rate": 1.1364405910773045e-05,
"loss": 1.7055,
"step": 135
},
{
"epoch": 0.330188679245283,
"grad_norm": 2.21875,
"learning_rate": 1.135044537451222e-05,
"loss": 1.6739,
"step": 140
},
{
"epoch": 0.3419811320754717,
"grad_norm": 2.359375,
"learning_rate": 1.1335813571353907e-05,
"loss": 1.7265,
"step": 145
},
{
"epoch": 0.35377358490566035,
"grad_norm": 2.1875,
"learning_rate": 1.1320512860648091e-05,
"loss": 1.6901,
"step": 150
},
{
"epoch": 0.36556603773584906,
"grad_norm": 2.21875,
"learning_rate": 1.130454570960481e-05,
"loss": 1.7123,
"step": 155
},
{
"epoch": 0.37735849056603776,
"grad_norm": 2.234375,
"learning_rate": 1.1287914692896318e-05,
"loss": 1.7072,
"step": 160
},
{
"epoch": 0.3891509433962264,
"grad_norm": 2.140625,
"learning_rate": 1.1270622492241917e-05,
"loss": 1.7096,
"step": 165
},
{
"epoch": 0.4009433962264151,
"grad_norm": 2.25,
"learning_rate": 1.1252671895975553e-05,
"loss": 1.7145,
"step": 170
},
{
"epoch": 0.41273584905660377,
"grad_norm": 2.28125,
"learning_rate": 1.1234065798596185e-05,
"loss": 1.6668,
"step": 175
},
{
"epoch": 0.42452830188679247,
"grad_norm": 2.328125,
"learning_rate": 1.1214807200301065e-05,
"loss": 1.6786,
"step": 180
},
{
"epoch": 0.4363207547169811,
"grad_norm": 2.28125,
"learning_rate": 1.1194899206501954e-05,
"loss": 1.7117,
"step": 185
},
{
"epoch": 0.4481132075471698,
"grad_norm": 2.28125,
"learning_rate": 1.1174345027324379e-05,
"loss": 1.6979,
"step": 190
},
{
"epoch": 0.45990566037735847,
"grad_norm": 2.21875,
"learning_rate": 1.115314797709002e-05,
"loss": 1.6748,
"step": 195
},
{
"epoch": 0.4716981132075472,
"grad_norm": 2.25,
"learning_rate": 1.1131311473782265e-05,
"loss": 1.6787,
"step": 200
},
{
"epoch": 0.4834905660377358,
"grad_norm": 2.28125,
"learning_rate": 1.1108839038495079e-05,
"loss": 1.6916,
"step": 205
},
{
"epoch": 0.49528301886792453,
"grad_norm": 2.21875,
"learning_rate": 1.1085734294865228e-05,
"loss": 1.6689,
"step": 210
},
{
"epoch": 0.5070754716981132,
"grad_norm": 2.359375,
"learning_rate": 1.1062000968487975e-05,
"loss": 1.7052,
"step": 215
},
{
"epoch": 0.5188679245283019,
"grad_norm": 2.234375,
"learning_rate": 1.1037642886316339e-05,
"loss": 1.6683,
"step": 220
},
{
"epoch": 0.5306603773584906,
"grad_norm": 2.234375,
"learning_rate": 1.1012663976044003e-05,
"loss": 1.6763,
"step": 225
},
{
"epoch": 0.5424528301886793,
"grad_norm": 2.3125,
"learning_rate": 1.0987068265471978e-05,
"loss": 1.6811,
"step": 230
},
{
"epoch": 0.5542452830188679,
"grad_norm": 2.265625,
"learning_rate": 1.0960859881859139e-05,
"loss": 1.6863,
"step": 235
},
{
"epoch": 0.5660377358490566,
"grad_norm": 2.25,
"learning_rate": 1.0934043051256698e-05,
"loss": 1.6881,
"step": 240
},
{
"epoch": 0.5778301886792453,
"grad_norm": 2.234375,
"learning_rate": 1.0906622097826771e-05,
"loss": 1.6482,
"step": 245
},
{
"epoch": 0.589622641509434,
"grad_norm": 2.3125,
"learning_rate": 1.0878601443145113e-05,
"loss": 1.6766,
"step": 250
},
{
"epoch": 0.6014150943396226,
"grad_norm": 2.265625,
"learning_rate": 1.0849985605488146e-05,
"loss": 1.675,
"step": 255
},
{
"epoch": 0.6132075471698113,
"grad_norm": 2.34375,
"learning_rate": 1.0820779199104397e-05,
"loss": 1.6626,
"step": 260
},
{
"epoch": 0.625,
"grad_norm": 2.375,
"learning_rate": 1.079098693347046e-05,
"loss": 1.6515,
"step": 265
},
{
"epoch": 0.6367924528301887,
"grad_norm": 2.234375,
"learning_rate": 1.07606136125316e-05,
"loss": 1.7015,
"step": 270
},
{
"epoch": 0.6485849056603774,
"grad_norm": 2.1875,
"learning_rate": 1.0729664133927129e-05,
"loss": 1.6355,
"step": 275
},
{
"epoch": 0.660377358490566,
"grad_norm": 2.171875,
"learning_rate": 1.0698143488200662e-05,
"loss": 1.6389,
"step": 280
},
{
"epoch": 0.6721698113207547,
"grad_norm": 2.265625,
"learning_rate": 1.0666056757995418e-05,
"loss": 1.6465,
"step": 285
},
{
"epoch": 0.6839622641509434,
"grad_norm": 2.265625,
"learning_rate": 1.0633409117234644e-05,
"loss": 1.6423,
"step": 290
},
{
"epoch": 0.6957547169811321,
"grad_norm": 2.1875,
"learning_rate": 1.0600205830287322e-05,
"loss": 1.6425,
"step": 295
},
{
"epoch": 0.7075471698113207,
"grad_norm": 2.3125,
"learning_rate": 1.0566452251119316e-05,
"loss": 1.6575,
"step": 300
},
{
"epoch": 0.7193396226415094,
"grad_norm": 2.296875,
"learning_rate": 1.053215382243004e-05,
"loss": 1.6459,
"step": 305
},
{
"epoch": 0.7311320754716981,
"grad_norm": 2.234375,
"learning_rate": 1.0497316074774848e-05,
"loss": 1.7007,
"step": 310
},
{
"epoch": 0.7429245283018868,
"grad_norm": 2.21875,
"learning_rate": 1.0461944625673232e-05,
"loss": 1.6596,
"step": 315
},
{
"epoch": 0.7547169811320755,
"grad_norm": 2.359375,
"learning_rate": 1.0426045178703008e-05,
"loss": 1.67,
"step": 320
},
{
"epoch": 0.7665094339622641,
"grad_norm": 2.234375,
"learning_rate": 1.038962352258063e-05,
"loss": 1.671,
"step": 325
},
{
"epoch": 0.7783018867924528,
"grad_norm": 2.234375,
"learning_rate": 1.0352685530227774e-05,
"loss": 1.6433,
"step": 330
},
{
"epoch": 0.7900943396226415,
"grad_norm": 2.328125,
"learning_rate": 1.0315237157824327e-05,
"loss": 1.6605,
"step": 335
},
{
"epoch": 0.8018867924528302,
"grad_norm": 2.21875,
"learning_rate": 1.0277284443847979e-05,
"loss": 1.6467,
"step": 340
},
{
"epoch": 0.8136792452830188,
"grad_norm": 2.390625,
"learning_rate": 1.0238833508100518e-05,
"loss": 1.6315,
"step": 345
},
{
"epoch": 0.8254716981132075,
"grad_norm": 2.171875,
"learning_rate": 1.0199890550721037e-05,
"loss": 1.619,
"step": 350
},
{
"epoch": 0.8372641509433962,
"grad_norm": 2.21875,
"learning_rate": 1.0160461851186164e-05,
"loss": 1.667,
"step": 355
},
{
"epoch": 0.8490566037735849,
"grad_norm": 2.234375,
"learning_rate": 1.0120553767297507e-05,
"loss": 1.6504,
"step": 360
},
{
"epoch": 0.8608490566037735,
"grad_norm": 2.1875,
"learning_rate": 1.0080172734156478e-05,
"loss": 1.6727,
"step": 365
},
{
"epoch": 0.8726415094339622,
"grad_norm": 2.203125,
"learning_rate": 1.0039325263126645e-05,
"loss": 1.6469,
"step": 370
},
{
"epoch": 0.8844339622641509,
"grad_norm": 2.1875,
"learning_rate": 9.998017940783778e-06,
"loss": 1.6638,
"step": 375
},
{
"epoch": 0.8962264150943396,
"grad_norm": 2.25,
"learning_rate": 9.956257427853788e-06,
"loss": 1.6652,
"step": 380
},
{
"epoch": 0.9080188679245284,
"grad_norm": 2.203125,
"learning_rate": 9.914050458138687e-06,
"loss": 1.6456,
"step": 385
},
{
"epoch": 0.9198113207547169,
"grad_norm": 2.125,
"learning_rate": 9.871403837430787e-06,
"loss": 1.6336,
"step": 390
},
{
"epoch": 0.9316037735849056,
"grad_norm": 2.109375,
"learning_rate": 9.828324442415267e-06,
"loss": 1.6592,
"step": 395
},
{
"epoch": 0.9433962264150944,
"grad_norm": 2.25,
"learning_rate": 9.784819219561335e-06,
"loss": 1.6251,
"step": 400
},
{
"epoch": 0.9551886792452831,
"grad_norm": 2.265625,
"learning_rate": 9.740895184002105e-06,
"loss": 1.638,
"step": 405
},
{
"epoch": 0.9669811320754716,
"grad_norm": 2.125,
"learning_rate": 9.696559418403438e-06,
"loss": 1.6297,
"step": 410
},
{
"epoch": 0.9787735849056604,
"grad_norm": 2.296875,
"learning_rate": 9.651819071821867e-06,
"loss": 1.6373,
"step": 415
},
{
"epoch": 0.9905660377358491,
"grad_norm": 2.25,
"learning_rate": 9.606681358551822e-06,
"loss": 1.6636,
"step": 420
},
{
"epoch": 0.9976415094339622,
"eval_loss": 1.6890636682510376,
"eval_runtime": 5.777,
"eval_samples_per_second": 14.54,
"eval_steps_per_second": 14.54,
"step": 423
},
{
"epoch": 1.0023584905660377,
"grad_norm": 2.546875,
"learning_rate": 9.56115355696235e-06,
"loss": 1.6559,
"step": 425
},
{
"epoch": 1.0141509433962264,
"grad_norm": 2.453125,
"learning_rate": 9.515243008323482e-06,
"loss": 1.5576,
"step": 430
},
{
"epoch": 1.025943396226415,
"grad_norm": 2.1875,
"learning_rate": 9.468957115622473e-06,
"loss": 1.5884,
"step": 435
},
{
"epoch": 1.0377358490566038,
"grad_norm": 2.21875,
"learning_rate": 9.42230334237008e-06,
"loss": 1.5567,
"step": 440
},
{
"epoch": 1.0495283018867925,
"grad_norm": 2.25,
"learning_rate": 9.37528921139709e-06,
"loss": 1.577,
"step": 445
},
{
"epoch": 1.0613207547169812,
"grad_norm": 2.21875,
"learning_rate": 9.327922303641277e-06,
"loss": 1.5942,
"step": 450
},
{
"epoch": 1.0731132075471699,
"grad_norm": 2.15625,
"learning_rate": 9.280210256924987e-06,
"loss": 1.565,
"step": 455
},
{
"epoch": 1.0849056603773586,
"grad_norm": 2.1875,
"learning_rate": 9.23216076472356e-06,
"loss": 1.5633,
"step": 460
},
{
"epoch": 1.0966981132075473,
"grad_norm": 2.265625,
"learning_rate": 9.183781574924765e-06,
"loss": 1.5821,
"step": 465
},
{
"epoch": 1.1084905660377358,
"grad_norm": 2.328125,
"learning_rate": 9.135080488579473e-06,
"loss": 1.569,
"step": 470
},
{
"epoch": 1.1202830188679245,
"grad_norm": 2.265625,
"learning_rate": 9.086065358643754e-06,
"loss": 1.5876,
"step": 475
},
{
"epoch": 1.1320754716981132,
"grad_norm": 2.296875,
"learning_rate": 9.036744088712591e-06,
"loss": 1.5964,
"step": 480
},
{
"epoch": 1.1438679245283019,
"grad_norm": 2.15625,
"learning_rate": 8.98712463174546e-06,
"loss": 1.5798,
"step": 485
},
{
"epoch": 1.1556603773584906,
"grad_norm": 2.15625,
"learning_rate": 8.937214988783914e-06,
"loss": 1.5605,
"step": 490
},
{
"epoch": 1.1674528301886793,
"grad_norm": 2.15625,
"learning_rate": 8.887023207661441e-06,
"loss": 1.5753,
"step": 495
},
{
"epoch": 1.179245283018868,
"grad_norm": 2.203125,
"learning_rate": 8.83655738170576e-06,
"loss": 1.575,
"step": 500
},
{
"epoch": 1.179245283018868,
"eval_loss": 1.6877530813217163,
"eval_runtime": 5.5993,
"eval_samples_per_second": 15.002,
"eval_steps_per_second": 15.002,
"step": 500
},
{
"epoch": 1.1910377358490567,
"grad_norm": 2.421875,
"learning_rate": 8.78582564843379e-06,
"loss": 1.5861,
"step": 505
},
{
"epoch": 1.2028301886792452,
"grad_norm": 2.21875,
"learning_rate": 8.734836188239491e-06,
"loss": 1.5701,
"step": 510
},
{
"epoch": 1.2146226415094339,
"grad_norm": 2.25,
"learning_rate": 8.68359722307479e-06,
"loss": 1.5372,
"step": 515
},
{
"epoch": 1.2264150943396226,
"grad_norm": 2.109375,
"learning_rate": 8.632117015123812e-06,
"loss": 1.5532,
"step": 520
},
{
"epoch": 1.2382075471698113,
"grad_norm": 2.21875,
"learning_rate": 8.580403865470608e-06,
"loss": 1.5902,
"step": 525
},
{
"epoch": 1.25,
"grad_norm": 2.390625,
"learning_rate": 8.528466112760638e-06,
"loss": 1.5525,
"step": 530
},
{
"epoch": 1.2617924528301887,
"grad_norm": 2.1875,
"learning_rate": 8.476312131856164e-06,
"loss": 1.5284,
"step": 535
},
{
"epoch": 1.2735849056603774,
"grad_norm": 2.109375,
"learning_rate": 8.42395033248583e-06,
"loss": 1.5249,
"step": 540
},
{
"epoch": 1.2853773584905661,
"grad_norm": 2.15625,
"learning_rate": 8.371389157888602e-06,
"loss": 1.5715,
"step": 545
},
{
"epoch": 1.2971698113207548,
"grad_norm": 2.25,
"learning_rate": 8.318637083452323e-06,
"loss": 1.5713,
"step": 550
},
{
"epoch": 1.3089622641509435,
"grad_norm": 2.25,
"learning_rate": 8.265702615347056e-06,
"loss": 1.5727,
"step": 555
},
{
"epoch": 1.320754716981132,
"grad_norm": 2.265625,
"learning_rate": 8.212594289153501e-06,
"loss": 1.5815,
"step": 560
},
{
"epoch": 1.3325471698113207,
"grad_norm": 2.265625,
"learning_rate": 8.159320668486633e-06,
"loss": 1.5349,
"step": 565
},
{
"epoch": 1.3443396226415094,
"grad_norm": 2.1875,
"learning_rate": 8.105890343614842e-06,
"loss": 1.5543,
"step": 570
},
{
"epoch": 1.3561320754716981,
"grad_norm": 2.3125,
"learning_rate": 8.052311930074767e-06,
"loss": 1.5759,
"step": 575
},
{
"epoch": 1.3679245283018868,
"grad_norm": 2.328125,
"learning_rate": 7.998594067282067e-06,
"loss": 1.5568,
"step": 580
},
{
"epoch": 1.3797169811320755,
"grad_norm": 2.140625,
"learning_rate": 7.944745417138312e-06,
"loss": 1.5792,
"step": 585
},
{
"epoch": 1.3915094339622642,
"grad_norm": 2.296875,
"learning_rate": 7.890774662634284e-06,
"loss": 1.5814,
"step": 590
},
{
"epoch": 1.4033018867924527,
"grad_norm": 2.140625,
"learning_rate": 7.83669050644986e-06,
"loss": 1.5262,
"step": 595
},
{
"epoch": 1.4150943396226414,
"grad_norm": 2.140625,
"learning_rate": 7.782501669550717e-06,
"loss": 1.5395,
"step": 600
},
{
"epoch": 1.4268867924528301,
"grad_norm": 2.328125,
"learning_rate": 7.728216889782096e-06,
"loss": 1.5824,
"step": 605
},
{
"epoch": 1.4386792452830188,
"grad_norm": 2.28125,
"learning_rate": 7.673844920459834e-06,
"loss": 1.5917,
"step": 610
},
{
"epoch": 1.4504716981132075,
"grad_norm": 2.1875,
"learning_rate": 7.619394528958923e-06,
"loss": 1.5323,
"step": 615
},
{
"epoch": 1.4622641509433962,
"grad_norm": 2.171875,
"learning_rate": 7.56487449529978e-06,
"loss": 1.5509,
"step": 620
},
{
"epoch": 1.474056603773585,
"grad_norm": 2.0625,
"learning_rate": 7.510293610732478e-06,
"loss": 1.5435,
"step": 625
},
{
"epoch": 1.4858490566037736,
"grad_norm": 2.109375,
"learning_rate": 7.4556606763191854e-06,
"loss": 1.5378,
"step": 630
},
{
"epoch": 1.4976415094339623,
"grad_norm": 2.234375,
"learning_rate": 7.400984501515011e-06,
"loss": 1.5521,
"step": 635
},
{
"epoch": 1.509433962264151,
"grad_norm": 2.234375,
"learning_rate": 7.346273902747486e-06,
"loss": 1.5588,
"step": 640
},
{
"epoch": 1.5212264150943398,
"grad_norm": 2.28125,
"learning_rate": 7.291537701994948e-06,
"loss": 1.5433,
"step": 645
},
{
"epoch": 1.5330188679245285,
"grad_norm": 2.203125,
"learning_rate": 7.236784725363994e-06,
"loss": 1.5735,
"step": 650
},
{
"epoch": 1.544811320754717,
"grad_norm": 2.171875,
"learning_rate": 7.182023801666313e-06,
"loss": 1.5861,
"step": 655
},
{
"epoch": 1.5566037735849056,
"grad_norm": 2.09375,
"learning_rate": 7.127263760995028e-06,
"loss": 1.576,
"step": 660
},
{
"epoch": 1.5683962264150944,
"grad_norm": 2.234375,
"learning_rate": 7.072513433300889e-06,
"loss": 1.5781,
"step": 665
},
{
"epoch": 1.580188679245283,
"grad_norm": 2.296875,
"learning_rate": 7.017781646968438e-06,
"loss": 1.579,
"step": 670
},
{
"epoch": 1.5919811320754715,
"grad_norm": 2.15625,
"learning_rate": 6.963077227392465e-06,
"loss": 1.5744,
"step": 675
},
{
"epoch": 1.6037735849056602,
"grad_norm": 2.171875,
"learning_rate": 6.908408995554915e-06,
"loss": 1.5747,
"step": 680
},
{
"epoch": 1.615566037735849,
"grad_norm": 2.25,
"learning_rate": 6.853785766602541e-06,
"loss": 1.5786,
"step": 685
},
{
"epoch": 1.6273584905660377,
"grad_norm": 2.15625,
"learning_rate": 6.799216348425456e-06,
"loss": 1.5559,
"step": 690
},
{
"epoch": 1.6391509433962264,
"grad_norm": 2.125,
"learning_rate": 6.744709540236898e-06,
"loss": 1.5324,
"step": 695
},
{
"epoch": 1.650943396226415,
"grad_norm": 2.234375,
"learning_rate": 6.690274131154364e-06,
"loss": 1.5782,
"step": 700
},
{
"epoch": 1.6627358490566038,
"grad_norm": 2.1875,
"learning_rate": 6.63591889878238e-06,
"loss": 1.5768,
"step": 705
},
{
"epoch": 1.6745283018867925,
"grad_norm": 2.234375,
"learning_rate": 6.581652607797136e-06,
"loss": 1.5614,
"step": 710
},
{
"epoch": 1.6863207547169812,
"grad_norm": 2.15625,
"learning_rate": 6.527484008533192e-06,
"loss": 1.5675,
"step": 715
},
{
"epoch": 1.6981132075471699,
"grad_norm": 2.09375,
"learning_rate": 6.473421835572508e-06,
"loss": 1.5621,
"step": 720
},
{
"epoch": 1.7099056603773586,
"grad_norm": 2.171875,
"learning_rate": 6.419474806336007e-06,
"loss": 1.5508,
"step": 725
},
{
"epoch": 1.7216981132075473,
"grad_norm": 2.265625,
"learning_rate": 6.365651619677913e-06,
"loss": 1.5503,
"step": 730
},
{
"epoch": 1.733490566037736,
"grad_norm": 2.234375,
"learning_rate": 6.311960954483071e-06,
"loss": 1.5419,
"step": 735
},
{
"epoch": 1.7452830188679245,
"grad_norm": 2.28125,
"learning_rate": 6.258411468267494e-06,
"loss": 1.5326,
"step": 740
},
{
"epoch": 1.7570754716981132,
"grad_norm": 2.203125,
"learning_rate": 6.205011795782359e-06,
"loss": 1.5288,
"step": 745
},
{
"epoch": 1.7688679245283019,
"grad_norm": 2.21875,
"learning_rate": 6.151770547621672e-06,
"loss": 1.5505,
"step": 750
},
{
"epoch": 1.7806603773584906,
"grad_norm": 2.171875,
"learning_rate": 6.098696308833817e-06,
"loss": 1.5524,
"step": 755
},
{
"epoch": 1.7924528301886793,
"grad_norm": 2.171875,
"learning_rate": 6.045797637537242e-06,
"loss": 1.5685,
"step": 760
},
{
"epoch": 1.8042452830188678,
"grad_norm": 2.1875,
"learning_rate": 5.993083063540479e-06,
"loss": 1.5341,
"step": 765
},
{
"epoch": 1.8160377358490565,
"grad_norm": 2.21875,
"learning_rate": 5.94056108696672e-06,
"loss": 1.5715,
"step": 770
},
{
"epoch": 1.8278301886792452,
"grad_norm": 2.15625,
"learning_rate": 5.8882401768832e-06,
"loss": 1.5531,
"step": 775
},
{
"epoch": 1.8396226415094339,
"grad_norm": 2.125,
"learning_rate": 5.836128769935567e-06,
"loss": 1.5836,
"step": 780
},
{
"epoch": 1.8514150943396226,
"grad_norm": 2.15625,
"learning_rate": 5.784235268987498e-06,
"loss": 1.5577,
"step": 785
},
{
"epoch": 1.8632075471698113,
"grad_norm": 2.25,
"learning_rate": 5.7325680417657444e-06,
"loss": 1.5722,
"step": 790
},
{
"epoch": 1.875,
"grad_norm": 2.09375,
"learning_rate": 5.681135419510858e-06,
"loss": 1.5645,
"step": 795
},
{
"epoch": 1.8867924528301887,
"grad_norm": 2.140625,
"learning_rate": 5.629945695633794e-06,
"loss": 1.5979,
"step": 800
},
{
"epoch": 1.8985849056603774,
"grad_norm": 2.21875,
"learning_rate": 5.579007124378612e-06,
"loss": 1.5541,
"step": 805
},
{
"epoch": 1.9103773584905661,
"grad_norm": 2.15625,
"learning_rate": 5.528327919491496e-06,
"loss": 1.5456,
"step": 810
},
{
"epoch": 1.9221698113207548,
"grad_norm": 2.109375,
"learning_rate": 5.477916252896307e-06,
"loss": 1.5731,
"step": 815
},
{
"epoch": 1.9339622641509435,
"grad_norm": 2.1875,
"learning_rate": 5.4277802533768706e-06,
"loss": 1.5789,
"step": 820
},
{
"epoch": 1.9457547169811322,
"grad_norm": 2.1875,
"learning_rate": 5.377928005266233e-06,
"loss": 1.5654,
"step": 825
},
{
"epoch": 1.9575471698113207,
"grad_norm": 2.265625,
"learning_rate": 5.328367547143068e-06,
"loss": 1.5347,
"step": 830
},
{
"epoch": 1.9693396226415094,
"grad_norm": 2.25,
"learning_rate": 5.279106870535483e-06,
"loss": 1.5501,
"step": 835
},
{
"epoch": 1.9811320754716981,
"grad_norm": 2.15625,
"learning_rate": 5.2301539186323925e-06,
"loss": 1.5676,
"step": 840
},
{
"epoch": 1.9929245283018868,
"grad_norm": 2.1875,
"learning_rate": 5.1815165850027e-06,
"loss": 1.5831,
"step": 845
},
{
"epoch": 1.9952830188679245,
"eval_loss": 1.6711690425872803,
"eval_runtime": 5.5851,
"eval_samples_per_second": 15.04,
"eval_steps_per_second": 15.04,
"step": 846
}
],
"logging_steps": 5,
"max_steps": 1272,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0098432568637522e+18,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}