{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9952830188679245, "eval_steps": 500, "global_step": 846, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01179245283018868, "grad_norm": 6.4375, "learning_rate": 1.3142857142857143e-06, "loss": 1.9687, "step": 5 }, { "epoch": 0.02358490566037736, "grad_norm": 4.25, "learning_rate": 2.9571428571428567e-06, "loss": 1.9487, "step": 10 }, { "epoch": 0.03537735849056604, "grad_norm": 3.09375, "learning_rate": 4.6e-06, "loss": 1.8763, "step": 15 }, { "epoch": 0.04716981132075472, "grad_norm": 2.78125, "learning_rate": 6.242857142857142e-06, "loss": 1.8502, "step": 20 }, { "epoch": 0.0589622641509434, "grad_norm": 2.546875, "learning_rate": 7.885714285714286e-06, "loss": 1.8808, "step": 25 }, { "epoch": 0.07075471698113207, "grad_norm": 2.625, "learning_rate": 9.528571428571429e-06, "loss": 1.815, "step": 30 }, { "epoch": 0.08254716981132075, "grad_norm": 2.53125, "learning_rate": 1.1171428571428571e-05, "loss": 1.7632, "step": 35 }, { "epoch": 0.09433962264150944, "grad_norm": 2.390625, "learning_rate": 1.1499777476590145e-05, "loss": 1.7819, "step": 40 }, { "epoch": 0.10613207547169812, "grad_norm": 2.3125, "learning_rate": 1.1498873514595152e-05, "loss": 1.7612, "step": 45 }, { "epoch": 0.1179245283018868, "grad_norm": 2.421875, "learning_rate": 1.1497274351951891e-05, "loss": 1.7435, "step": 50 }, { "epoch": 0.12971698113207547, "grad_norm": 2.28125, "learning_rate": 1.1494980246522254e-05, "loss": 1.7239, "step": 55 }, { "epoch": 0.14150943396226415, "grad_norm": 2.390625, "learning_rate": 1.1491991568226304e-05, "loss": 1.7142, "step": 60 }, { "epoch": 0.15330188679245282, "grad_norm": 2.453125, "learning_rate": 1.1488308798982647e-05, "loss": 1.7514, "step": 65 }, { "epoch": 0.1650943396226415, "grad_norm": 2.40625, "learning_rate": 1.14839325326307e-05, "loss": 1.7355, "step": 70 }, { "epoch": 0.17688679245283018, "grad_norm": 2.375, "learning_rate": 1.1478863474834962e-05, "loss": 1.7629, "step": 75 }, { "epoch": 0.18867924528301888, "grad_norm": 2.265625, "learning_rate": 1.1473102442971205e-05, "loss": 1.7157, "step": 80 }, { "epoch": 0.20047169811320756, "grad_norm": 2.375, "learning_rate": 1.1466650365994684e-05, "loss": 1.7379, "step": 85 }, { "epoch": 0.21226415094339623, "grad_norm": 2.265625, "learning_rate": 1.1459508284290345e-05, "loss": 1.7371, "step": 90 }, { "epoch": 0.2240566037735849, "grad_norm": 2.359375, "learning_rate": 1.145167734950506e-05, "loss": 1.7067, "step": 95 }, { "epoch": 0.2358490566037736, "grad_norm": 2.375, "learning_rate": 1.1443158824361937e-05, "loss": 1.738, "step": 100 }, { "epoch": 0.24764150943396226, "grad_norm": 2.375, "learning_rate": 1.1433954082456689e-05, "loss": 1.7135, "step": 105 }, { "epoch": 0.25943396226415094, "grad_norm": 2.34375, "learning_rate": 1.1424064608036165e-05, "loss": 1.6967, "step": 110 }, { "epoch": 0.27122641509433965, "grad_norm": 2.265625, "learning_rate": 1.1413491995759002e-05, "loss": 1.7233, "step": 115 }, { "epoch": 0.2830188679245283, "grad_norm": 2.34375, "learning_rate": 1.1402237950438503e-05, "loss": 1.7345, "step": 120 }, { "epoch": 0.294811320754717, "grad_norm": 2.390625, "learning_rate": 1.139030428676773e-05, "loss": 1.6991, "step": 125 }, { "epoch": 0.30660377358490565, "grad_norm": 2.421875, "learning_rate": 1.1377692929026888e-05, "loss": 1.6991, "step": 130 }, { "epoch": 0.31839622641509435, "grad_norm": 2.375, "learning_rate": 1.1364405910773045e-05, "loss": 1.7055, "step": 135 }, { "epoch": 0.330188679245283, "grad_norm": 2.21875, "learning_rate": 1.135044537451222e-05, "loss": 1.6739, "step": 140 }, { "epoch": 0.3419811320754717, "grad_norm": 2.359375, "learning_rate": 1.1335813571353907e-05, "loss": 1.7265, "step": 145 }, { "epoch": 0.35377358490566035, "grad_norm": 2.1875, "learning_rate": 1.1320512860648091e-05, "loss": 1.6901, "step": 150 }, { "epoch": 0.36556603773584906, "grad_norm": 2.21875, "learning_rate": 1.130454570960481e-05, "loss": 1.7123, "step": 155 }, { "epoch": 0.37735849056603776, "grad_norm": 2.234375, "learning_rate": 1.1287914692896318e-05, "loss": 1.7072, "step": 160 }, { "epoch": 0.3891509433962264, "grad_norm": 2.140625, "learning_rate": 1.1270622492241917e-05, "loss": 1.7096, "step": 165 }, { "epoch": 0.4009433962264151, "grad_norm": 2.25, "learning_rate": 1.1252671895975553e-05, "loss": 1.7145, "step": 170 }, { "epoch": 0.41273584905660377, "grad_norm": 2.28125, "learning_rate": 1.1234065798596185e-05, "loss": 1.6668, "step": 175 }, { "epoch": 0.42452830188679247, "grad_norm": 2.328125, "learning_rate": 1.1214807200301065e-05, "loss": 1.6786, "step": 180 }, { "epoch": 0.4363207547169811, "grad_norm": 2.28125, "learning_rate": 1.1194899206501954e-05, "loss": 1.7117, "step": 185 }, { "epoch": 0.4481132075471698, "grad_norm": 2.28125, "learning_rate": 1.1174345027324379e-05, "loss": 1.6979, "step": 190 }, { "epoch": 0.45990566037735847, "grad_norm": 2.21875, "learning_rate": 1.115314797709002e-05, "loss": 1.6748, "step": 195 }, { "epoch": 0.4716981132075472, "grad_norm": 2.25, "learning_rate": 1.1131311473782265e-05, "loss": 1.6787, "step": 200 }, { "epoch": 0.4834905660377358, "grad_norm": 2.28125, "learning_rate": 1.1108839038495079e-05, "loss": 1.6916, "step": 205 }, { "epoch": 0.49528301886792453, "grad_norm": 2.21875, "learning_rate": 1.1085734294865228e-05, "loss": 1.6689, "step": 210 }, { "epoch": 0.5070754716981132, "grad_norm": 2.359375, "learning_rate": 1.1062000968487975e-05, "loss": 1.7052, "step": 215 }, { "epoch": 0.5188679245283019, "grad_norm": 2.234375, "learning_rate": 1.1037642886316339e-05, "loss": 1.6683, "step": 220 }, { "epoch": 0.5306603773584906, "grad_norm": 2.234375, "learning_rate": 1.1012663976044003e-05, "loss": 1.6763, "step": 225 }, { "epoch": 0.5424528301886793, "grad_norm": 2.3125, "learning_rate": 1.0987068265471978e-05, "loss": 1.6811, "step": 230 }, { "epoch": 0.5542452830188679, "grad_norm": 2.265625, "learning_rate": 1.0960859881859139e-05, "loss": 1.6863, "step": 235 }, { "epoch": 0.5660377358490566, "grad_norm": 2.25, "learning_rate": 1.0934043051256698e-05, "loss": 1.6881, "step": 240 }, { "epoch": 0.5778301886792453, "grad_norm": 2.234375, "learning_rate": 1.0906622097826771e-05, "loss": 1.6482, "step": 245 }, { "epoch": 0.589622641509434, "grad_norm": 2.3125, "learning_rate": 1.0878601443145113e-05, "loss": 1.6766, "step": 250 }, { "epoch": 0.6014150943396226, "grad_norm": 2.265625, "learning_rate": 1.0849985605488146e-05, "loss": 1.675, "step": 255 }, { "epoch": 0.6132075471698113, "grad_norm": 2.34375, "learning_rate": 1.0820779199104397e-05, "loss": 1.6626, "step": 260 }, { "epoch": 0.625, "grad_norm": 2.375, "learning_rate": 1.079098693347046e-05, "loss": 1.6515, "step": 265 }, { "epoch": 0.6367924528301887, "grad_norm": 2.234375, "learning_rate": 1.07606136125316e-05, "loss": 1.7015, "step": 270 }, { "epoch": 0.6485849056603774, "grad_norm": 2.1875, "learning_rate": 1.0729664133927129e-05, "loss": 1.6355, "step": 275 }, { "epoch": 0.660377358490566, "grad_norm": 2.171875, "learning_rate": 1.0698143488200662e-05, "loss": 1.6389, "step": 280 }, { "epoch": 0.6721698113207547, "grad_norm": 2.265625, "learning_rate": 1.0666056757995418e-05, "loss": 1.6465, "step": 285 }, { "epoch": 0.6839622641509434, "grad_norm": 2.265625, "learning_rate": 1.0633409117234644e-05, "loss": 1.6423, "step": 290 }, { "epoch": 0.6957547169811321, "grad_norm": 2.1875, "learning_rate": 1.0600205830287322e-05, "loss": 1.6425, "step": 295 }, { "epoch": 0.7075471698113207, "grad_norm": 2.3125, "learning_rate": 1.0566452251119316e-05, "loss": 1.6575, "step": 300 }, { "epoch": 0.7193396226415094, "grad_norm": 2.296875, "learning_rate": 1.053215382243004e-05, "loss": 1.6459, "step": 305 }, { "epoch": 0.7311320754716981, "grad_norm": 2.234375, "learning_rate": 1.0497316074774848e-05, "loss": 1.7007, "step": 310 }, { "epoch": 0.7429245283018868, "grad_norm": 2.21875, "learning_rate": 1.0461944625673232e-05, "loss": 1.6596, "step": 315 }, { "epoch": 0.7547169811320755, "grad_norm": 2.359375, "learning_rate": 1.0426045178703008e-05, "loss": 1.67, "step": 320 }, { "epoch": 0.7665094339622641, "grad_norm": 2.234375, "learning_rate": 1.038962352258063e-05, "loss": 1.671, "step": 325 }, { "epoch": 0.7783018867924528, "grad_norm": 2.234375, "learning_rate": 1.0352685530227774e-05, "loss": 1.6433, "step": 330 }, { "epoch": 0.7900943396226415, "grad_norm": 2.328125, "learning_rate": 1.0315237157824327e-05, "loss": 1.6605, "step": 335 }, { "epoch": 0.8018867924528302, "grad_norm": 2.21875, "learning_rate": 1.0277284443847979e-05, "loss": 1.6467, "step": 340 }, { "epoch": 0.8136792452830188, "grad_norm": 2.390625, "learning_rate": 1.0238833508100518e-05, "loss": 1.6315, "step": 345 }, { "epoch": 0.8254716981132075, "grad_norm": 2.171875, "learning_rate": 1.0199890550721037e-05, "loss": 1.619, "step": 350 }, { "epoch": 0.8372641509433962, "grad_norm": 2.21875, "learning_rate": 1.0160461851186164e-05, "loss": 1.667, "step": 355 }, { "epoch": 0.8490566037735849, "grad_norm": 2.234375, "learning_rate": 1.0120553767297507e-05, "loss": 1.6504, "step": 360 }, { "epoch": 0.8608490566037735, "grad_norm": 2.1875, "learning_rate": 1.0080172734156478e-05, "loss": 1.6727, "step": 365 }, { "epoch": 0.8726415094339622, "grad_norm": 2.203125, "learning_rate": 1.0039325263126645e-05, "loss": 1.6469, "step": 370 }, { "epoch": 0.8844339622641509, "grad_norm": 2.1875, "learning_rate": 9.998017940783778e-06, "loss": 1.6638, "step": 375 }, { "epoch": 0.8962264150943396, "grad_norm": 2.25, "learning_rate": 9.956257427853788e-06, "loss": 1.6652, "step": 380 }, { "epoch": 0.9080188679245284, "grad_norm": 2.203125, "learning_rate": 9.914050458138687e-06, "loss": 1.6456, "step": 385 }, { "epoch": 0.9198113207547169, "grad_norm": 2.125, "learning_rate": 9.871403837430787e-06, "loss": 1.6336, "step": 390 }, { "epoch": 0.9316037735849056, "grad_norm": 2.109375, "learning_rate": 9.828324442415267e-06, "loss": 1.6592, "step": 395 }, { "epoch": 0.9433962264150944, "grad_norm": 2.25, "learning_rate": 9.784819219561335e-06, "loss": 1.6251, "step": 400 }, { "epoch": 0.9551886792452831, "grad_norm": 2.265625, "learning_rate": 9.740895184002105e-06, "loss": 1.638, "step": 405 }, { "epoch": 0.9669811320754716, "grad_norm": 2.125, "learning_rate": 9.696559418403438e-06, "loss": 1.6297, "step": 410 }, { "epoch": 0.9787735849056604, "grad_norm": 2.296875, "learning_rate": 9.651819071821867e-06, "loss": 1.6373, "step": 415 }, { "epoch": 0.9905660377358491, "grad_norm": 2.25, "learning_rate": 9.606681358551822e-06, "loss": 1.6636, "step": 420 }, { "epoch": 0.9976415094339622, "eval_loss": 1.6890636682510376, "eval_runtime": 5.777, "eval_samples_per_second": 14.54, "eval_steps_per_second": 14.54, "step": 423 }, { "epoch": 1.0023584905660377, "grad_norm": 2.546875, "learning_rate": 9.56115355696235e-06, "loss": 1.6559, "step": 425 }, { "epoch": 1.0141509433962264, "grad_norm": 2.453125, "learning_rate": 9.515243008323482e-06, "loss": 1.5576, "step": 430 }, { "epoch": 1.025943396226415, "grad_norm": 2.1875, "learning_rate": 9.468957115622473e-06, "loss": 1.5884, "step": 435 }, { "epoch": 1.0377358490566038, "grad_norm": 2.21875, "learning_rate": 9.42230334237008e-06, "loss": 1.5567, "step": 440 }, { "epoch": 1.0495283018867925, "grad_norm": 2.25, "learning_rate": 9.37528921139709e-06, "loss": 1.577, "step": 445 }, { "epoch": 1.0613207547169812, "grad_norm": 2.21875, "learning_rate": 9.327922303641277e-06, "loss": 1.5942, "step": 450 }, { "epoch": 1.0731132075471699, "grad_norm": 2.15625, "learning_rate": 9.280210256924987e-06, "loss": 1.565, "step": 455 }, { "epoch": 1.0849056603773586, "grad_norm": 2.1875, "learning_rate": 9.23216076472356e-06, "loss": 1.5633, "step": 460 }, { "epoch": 1.0966981132075473, "grad_norm": 2.265625, "learning_rate": 9.183781574924765e-06, "loss": 1.5821, "step": 465 }, { "epoch": 1.1084905660377358, "grad_norm": 2.328125, "learning_rate": 9.135080488579473e-06, "loss": 1.569, "step": 470 }, { "epoch": 1.1202830188679245, "grad_norm": 2.265625, "learning_rate": 9.086065358643754e-06, "loss": 1.5876, "step": 475 }, { "epoch": 1.1320754716981132, "grad_norm": 2.296875, "learning_rate": 9.036744088712591e-06, "loss": 1.5964, "step": 480 }, { "epoch": 1.1438679245283019, "grad_norm": 2.15625, "learning_rate": 8.98712463174546e-06, "loss": 1.5798, "step": 485 }, { "epoch": 1.1556603773584906, "grad_norm": 2.15625, "learning_rate": 8.937214988783914e-06, "loss": 1.5605, "step": 490 }, { "epoch": 1.1674528301886793, "grad_norm": 2.15625, "learning_rate": 8.887023207661441e-06, "loss": 1.5753, "step": 495 }, { "epoch": 1.179245283018868, "grad_norm": 2.203125, "learning_rate": 8.83655738170576e-06, "loss": 1.575, "step": 500 }, { "epoch": 1.179245283018868, "eval_loss": 1.6877530813217163, "eval_runtime": 5.5993, "eval_samples_per_second": 15.002, "eval_steps_per_second": 15.002, "step": 500 }, { "epoch": 1.1910377358490567, "grad_norm": 2.421875, "learning_rate": 8.78582564843379e-06, "loss": 1.5861, "step": 505 }, { "epoch": 1.2028301886792452, "grad_norm": 2.21875, "learning_rate": 8.734836188239491e-06, "loss": 1.5701, "step": 510 }, { "epoch": 1.2146226415094339, "grad_norm": 2.25, "learning_rate": 8.68359722307479e-06, "loss": 1.5372, "step": 515 }, { "epoch": 1.2264150943396226, "grad_norm": 2.109375, "learning_rate": 8.632117015123812e-06, "loss": 1.5532, "step": 520 }, { "epoch": 1.2382075471698113, "grad_norm": 2.21875, "learning_rate": 8.580403865470608e-06, "loss": 1.5902, "step": 525 }, { "epoch": 1.25, "grad_norm": 2.390625, "learning_rate": 8.528466112760638e-06, "loss": 1.5525, "step": 530 }, { "epoch": 1.2617924528301887, "grad_norm": 2.1875, "learning_rate": 8.476312131856164e-06, "loss": 1.5284, "step": 535 }, { "epoch": 1.2735849056603774, "grad_norm": 2.109375, "learning_rate": 8.42395033248583e-06, "loss": 1.5249, "step": 540 }, { "epoch": 1.2853773584905661, "grad_norm": 2.15625, "learning_rate": 8.371389157888602e-06, "loss": 1.5715, "step": 545 }, { "epoch": 1.2971698113207548, "grad_norm": 2.25, "learning_rate": 8.318637083452323e-06, "loss": 1.5713, "step": 550 }, { "epoch": 1.3089622641509435, "grad_norm": 2.25, "learning_rate": 8.265702615347056e-06, "loss": 1.5727, "step": 555 }, { "epoch": 1.320754716981132, "grad_norm": 2.265625, "learning_rate": 8.212594289153501e-06, "loss": 1.5815, "step": 560 }, { "epoch": 1.3325471698113207, "grad_norm": 2.265625, "learning_rate": 8.159320668486633e-06, "loss": 1.5349, "step": 565 }, { "epoch": 1.3443396226415094, "grad_norm": 2.1875, "learning_rate": 8.105890343614842e-06, "loss": 1.5543, "step": 570 }, { "epoch": 1.3561320754716981, "grad_norm": 2.3125, "learning_rate": 8.052311930074767e-06, "loss": 1.5759, "step": 575 }, { "epoch": 1.3679245283018868, "grad_norm": 2.328125, "learning_rate": 7.998594067282067e-06, "loss": 1.5568, "step": 580 }, { "epoch": 1.3797169811320755, "grad_norm": 2.140625, "learning_rate": 7.944745417138312e-06, "loss": 1.5792, "step": 585 }, { "epoch": 1.3915094339622642, "grad_norm": 2.296875, "learning_rate": 7.890774662634284e-06, "loss": 1.5814, "step": 590 }, { "epoch": 1.4033018867924527, "grad_norm": 2.140625, "learning_rate": 7.83669050644986e-06, "loss": 1.5262, "step": 595 }, { "epoch": 1.4150943396226414, "grad_norm": 2.140625, "learning_rate": 7.782501669550717e-06, "loss": 1.5395, "step": 600 }, { "epoch": 1.4268867924528301, "grad_norm": 2.328125, "learning_rate": 7.728216889782096e-06, "loss": 1.5824, "step": 605 }, { "epoch": 1.4386792452830188, "grad_norm": 2.28125, "learning_rate": 7.673844920459834e-06, "loss": 1.5917, "step": 610 }, { "epoch": 1.4504716981132075, "grad_norm": 2.1875, "learning_rate": 7.619394528958923e-06, "loss": 1.5323, "step": 615 }, { "epoch": 1.4622641509433962, "grad_norm": 2.171875, "learning_rate": 7.56487449529978e-06, "loss": 1.5509, "step": 620 }, { "epoch": 1.474056603773585, "grad_norm": 2.0625, "learning_rate": 7.510293610732478e-06, "loss": 1.5435, "step": 625 }, { "epoch": 1.4858490566037736, "grad_norm": 2.109375, "learning_rate": 7.4556606763191854e-06, "loss": 1.5378, "step": 630 }, { "epoch": 1.4976415094339623, "grad_norm": 2.234375, "learning_rate": 7.400984501515011e-06, "loss": 1.5521, "step": 635 }, { "epoch": 1.509433962264151, "grad_norm": 2.234375, "learning_rate": 7.346273902747486e-06, "loss": 1.5588, "step": 640 }, { "epoch": 1.5212264150943398, "grad_norm": 2.28125, "learning_rate": 7.291537701994948e-06, "loss": 1.5433, "step": 645 }, { "epoch": 1.5330188679245285, "grad_norm": 2.203125, "learning_rate": 7.236784725363994e-06, "loss": 1.5735, "step": 650 }, { "epoch": 1.544811320754717, "grad_norm": 2.171875, "learning_rate": 7.182023801666313e-06, "loss": 1.5861, "step": 655 }, { "epoch": 1.5566037735849056, "grad_norm": 2.09375, "learning_rate": 7.127263760995028e-06, "loss": 1.576, "step": 660 }, { "epoch": 1.5683962264150944, "grad_norm": 2.234375, "learning_rate": 7.072513433300889e-06, "loss": 1.5781, "step": 665 }, { "epoch": 1.580188679245283, "grad_norm": 2.296875, "learning_rate": 7.017781646968438e-06, "loss": 1.579, "step": 670 }, { "epoch": 1.5919811320754715, "grad_norm": 2.15625, "learning_rate": 6.963077227392465e-06, "loss": 1.5744, "step": 675 }, { "epoch": 1.6037735849056602, "grad_norm": 2.171875, "learning_rate": 6.908408995554915e-06, "loss": 1.5747, "step": 680 }, { "epoch": 1.615566037735849, "grad_norm": 2.25, "learning_rate": 6.853785766602541e-06, "loss": 1.5786, "step": 685 }, { "epoch": 1.6273584905660377, "grad_norm": 2.15625, "learning_rate": 6.799216348425456e-06, "loss": 1.5559, "step": 690 }, { "epoch": 1.6391509433962264, "grad_norm": 2.125, "learning_rate": 6.744709540236898e-06, "loss": 1.5324, "step": 695 }, { "epoch": 1.650943396226415, "grad_norm": 2.234375, "learning_rate": 6.690274131154364e-06, "loss": 1.5782, "step": 700 }, { "epoch": 1.6627358490566038, "grad_norm": 2.1875, "learning_rate": 6.63591889878238e-06, "loss": 1.5768, "step": 705 }, { "epoch": 1.6745283018867925, "grad_norm": 2.234375, "learning_rate": 6.581652607797136e-06, "loss": 1.5614, "step": 710 }, { "epoch": 1.6863207547169812, "grad_norm": 2.15625, "learning_rate": 6.527484008533192e-06, "loss": 1.5675, "step": 715 }, { "epoch": 1.6981132075471699, "grad_norm": 2.09375, "learning_rate": 6.473421835572508e-06, "loss": 1.5621, "step": 720 }, { "epoch": 1.7099056603773586, "grad_norm": 2.171875, "learning_rate": 6.419474806336007e-06, "loss": 1.5508, "step": 725 }, { "epoch": 1.7216981132075473, "grad_norm": 2.265625, "learning_rate": 6.365651619677913e-06, "loss": 1.5503, "step": 730 }, { "epoch": 1.733490566037736, "grad_norm": 2.234375, "learning_rate": 6.311960954483071e-06, "loss": 1.5419, "step": 735 }, { "epoch": 1.7452830188679245, "grad_norm": 2.28125, "learning_rate": 6.258411468267494e-06, "loss": 1.5326, "step": 740 }, { "epoch": 1.7570754716981132, "grad_norm": 2.203125, "learning_rate": 6.205011795782359e-06, "loss": 1.5288, "step": 745 }, { "epoch": 1.7688679245283019, "grad_norm": 2.21875, "learning_rate": 6.151770547621672e-06, "loss": 1.5505, "step": 750 }, { "epoch": 1.7806603773584906, "grad_norm": 2.171875, "learning_rate": 6.098696308833817e-06, "loss": 1.5524, "step": 755 }, { "epoch": 1.7924528301886793, "grad_norm": 2.171875, "learning_rate": 6.045797637537242e-06, "loss": 1.5685, "step": 760 }, { "epoch": 1.8042452830188678, "grad_norm": 2.1875, "learning_rate": 5.993083063540479e-06, "loss": 1.5341, "step": 765 }, { "epoch": 1.8160377358490565, "grad_norm": 2.21875, "learning_rate": 5.94056108696672e-06, "loss": 1.5715, "step": 770 }, { "epoch": 1.8278301886792452, "grad_norm": 2.15625, "learning_rate": 5.8882401768832e-06, "loss": 1.5531, "step": 775 }, { "epoch": 1.8396226415094339, "grad_norm": 2.125, "learning_rate": 5.836128769935567e-06, "loss": 1.5836, "step": 780 }, { "epoch": 1.8514150943396226, "grad_norm": 2.15625, "learning_rate": 5.784235268987498e-06, "loss": 1.5577, "step": 785 }, { "epoch": 1.8632075471698113, "grad_norm": 2.25, "learning_rate": 5.7325680417657444e-06, "loss": 1.5722, "step": 790 }, { "epoch": 1.875, "grad_norm": 2.09375, "learning_rate": 5.681135419510858e-06, "loss": 1.5645, "step": 795 }, { "epoch": 1.8867924528301887, "grad_norm": 2.140625, "learning_rate": 5.629945695633794e-06, "loss": 1.5979, "step": 800 }, { "epoch": 1.8985849056603774, "grad_norm": 2.21875, "learning_rate": 5.579007124378612e-06, "loss": 1.5541, "step": 805 }, { "epoch": 1.9103773584905661, "grad_norm": 2.15625, "learning_rate": 5.528327919491496e-06, "loss": 1.5456, "step": 810 }, { "epoch": 1.9221698113207548, "grad_norm": 2.109375, "learning_rate": 5.477916252896307e-06, "loss": 1.5731, "step": 815 }, { "epoch": 1.9339622641509435, "grad_norm": 2.1875, "learning_rate": 5.4277802533768706e-06, "loss": 1.5789, "step": 820 }, { "epoch": 1.9457547169811322, "grad_norm": 2.1875, "learning_rate": 5.377928005266233e-06, "loss": 1.5654, "step": 825 }, { "epoch": 1.9575471698113207, "grad_norm": 2.265625, "learning_rate": 5.328367547143068e-06, "loss": 1.5347, "step": 830 }, { "epoch": 1.9693396226415094, "grad_norm": 2.25, "learning_rate": 5.279106870535483e-06, "loss": 1.5501, "step": 835 }, { "epoch": 1.9811320754716981, "grad_norm": 2.15625, "learning_rate": 5.2301539186323925e-06, "loss": 1.5676, "step": 840 }, { "epoch": 1.9929245283018868, "grad_norm": 2.1875, "learning_rate": 5.1815165850027e-06, "loss": 1.5831, "step": 845 }, { "epoch": 1.9952830188679245, "eval_loss": 1.6711690425872803, "eval_runtime": 5.5851, "eval_samples_per_second": 15.04, "eval_steps_per_second": 15.04, "step": 846 } ], "logging_steps": 5, "max_steps": 1272, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0098432568637522e+18, "train_batch_size": 48, "trial_name": null, "trial_params": null }