| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9952830188679245, |
| "eval_steps": 500, |
| "global_step": 846, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01179245283018868, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.3142857142857143e-06, |
| "loss": 1.9687, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02358490566037736, |
| "grad_norm": 4.25, |
| "learning_rate": 2.9571428571428567e-06, |
| "loss": 1.9487, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03537735849056604, |
| "grad_norm": 3.09375, |
| "learning_rate": 4.6e-06, |
| "loss": 1.8763, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04716981132075472, |
| "grad_norm": 2.78125, |
| "learning_rate": 6.242857142857142e-06, |
| "loss": 1.8502, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0589622641509434, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.885714285714286e-06, |
| "loss": 1.8808, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07075471698113207, |
| "grad_norm": 2.625, |
| "learning_rate": 9.528571428571429e-06, |
| "loss": 1.815, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08254716981132075, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.1171428571428571e-05, |
| "loss": 1.7632, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09433962264150944, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.1499777476590145e-05, |
| "loss": 1.7819, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10613207547169812, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.1498873514595152e-05, |
| "loss": 1.7612, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1179245283018868, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.1497274351951891e-05, |
| "loss": 1.7435, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12971698113207547, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.1494980246522254e-05, |
| "loss": 1.7239, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.14150943396226415, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.1491991568226304e-05, |
| "loss": 1.7142, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15330188679245282, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.1488308798982647e-05, |
| "loss": 1.7514, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.1650943396226415, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.14839325326307e-05, |
| "loss": 1.7355, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.17688679245283018, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1478863474834962e-05, |
| "loss": 1.7629, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.18867924528301888, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.1473102442971205e-05, |
| "loss": 1.7157, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.20047169811320756, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1466650365994684e-05, |
| "loss": 1.7379, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.21226415094339623, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.1459508284290345e-05, |
| "loss": 1.7371, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2240566037735849, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.145167734950506e-05, |
| "loss": 1.7067, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2358490566037736, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1443158824361937e-05, |
| "loss": 1.738, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.24764150943396226, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1433954082456689e-05, |
| "loss": 1.7135, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.25943396226415094, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.1424064608036165e-05, |
| "loss": 1.6967, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.27122641509433965, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.1413491995759002e-05, |
| "loss": 1.7233, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2830188679245283, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.1402237950438503e-05, |
| "loss": 1.7345, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.294811320754717, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.139030428676773e-05, |
| "loss": 1.6991, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.30660377358490565, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.1377692929026888e-05, |
| "loss": 1.6991, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.31839622641509435, |
| "grad_norm": 2.375, |
| "learning_rate": 1.1364405910773045e-05, |
| "loss": 1.7055, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.330188679245283, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.135044537451222e-05, |
| "loss": 1.6739, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3419811320754717, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.1335813571353907e-05, |
| "loss": 1.7265, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.35377358490566035, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.1320512860648091e-05, |
| "loss": 1.6901, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.36556603773584906, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.130454570960481e-05, |
| "loss": 1.7123, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.1287914692896318e-05, |
| "loss": 1.7072, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3891509433962264, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.1270622492241917e-05, |
| "loss": 1.7096, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.4009433962264151, |
| "grad_norm": 2.25, |
| "learning_rate": 1.1252671895975553e-05, |
| "loss": 1.7145, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.41273584905660377, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.1234065798596185e-05, |
| "loss": 1.6668, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.42452830188679247, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.1214807200301065e-05, |
| "loss": 1.6786, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4363207547169811, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.1194899206501954e-05, |
| "loss": 1.7117, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4481132075471698, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.1174345027324379e-05, |
| "loss": 1.6979, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.45990566037735847, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.115314797709002e-05, |
| "loss": 1.6748, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4716981132075472, |
| "grad_norm": 2.25, |
| "learning_rate": 1.1131311473782265e-05, |
| "loss": 1.6787, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4834905660377358, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.1108839038495079e-05, |
| "loss": 1.6916, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.49528301886792453, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.1085734294865228e-05, |
| "loss": 1.6689, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5070754716981132, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.1062000968487975e-05, |
| "loss": 1.7052, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5188679245283019, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.1037642886316339e-05, |
| "loss": 1.6683, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5306603773584906, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.1012663976044003e-05, |
| "loss": 1.6763, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5424528301886793, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.0987068265471978e-05, |
| "loss": 1.6811, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5542452830188679, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.0960859881859139e-05, |
| "loss": 1.6863, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5660377358490566, |
| "grad_norm": 2.25, |
| "learning_rate": 1.0934043051256698e-05, |
| "loss": 1.6881, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5778301886792453, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.0906622097826771e-05, |
| "loss": 1.6482, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.589622641509434, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.0878601443145113e-05, |
| "loss": 1.6766, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6014150943396226, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.0849985605488146e-05, |
| "loss": 1.675, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6132075471698113, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.0820779199104397e-05, |
| "loss": 1.6626, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 2.375, |
| "learning_rate": 1.079098693347046e-05, |
| "loss": 1.6515, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.6367924528301887, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.07606136125316e-05, |
| "loss": 1.7015, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6485849056603774, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.0729664133927129e-05, |
| "loss": 1.6355, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.660377358490566, |
| "grad_norm": 2.171875, |
| "learning_rate": 1.0698143488200662e-05, |
| "loss": 1.6389, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6721698113207547, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.0666056757995418e-05, |
| "loss": 1.6465, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6839622641509434, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.0633409117234644e-05, |
| "loss": 1.6423, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6957547169811321, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.0600205830287322e-05, |
| "loss": 1.6425, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7075471698113207, |
| "grad_norm": 2.3125, |
| "learning_rate": 1.0566452251119316e-05, |
| "loss": 1.6575, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7193396226415094, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.053215382243004e-05, |
| "loss": 1.6459, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.7311320754716981, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.0497316074774848e-05, |
| "loss": 1.7007, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.7429245283018868, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.0461944625673232e-05, |
| "loss": 1.6596, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.0426045178703008e-05, |
| "loss": 1.67, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7665094339622641, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.038962352258063e-05, |
| "loss": 1.671, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7783018867924528, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.0352685530227774e-05, |
| "loss": 1.6433, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7900943396226415, |
| "grad_norm": 2.328125, |
| "learning_rate": 1.0315237157824327e-05, |
| "loss": 1.6605, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.8018867924528302, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.0277284443847979e-05, |
| "loss": 1.6467, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8136792452830188, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.0238833508100518e-05, |
| "loss": 1.6315, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8254716981132075, |
| "grad_norm": 2.171875, |
| "learning_rate": 1.0199890550721037e-05, |
| "loss": 1.619, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8372641509433962, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.0160461851186164e-05, |
| "loss": 1.667, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.8490566037735849, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.0120553767297507e-05, |
| "loss": 1.6504, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8608490566037735, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.0080172734156478e-05, |
| "loss": 1.6727, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.8726415094339622, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.0039325263126645e-05, |
| "loss": 1.6469, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8844339622641509, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.998017940783778e-06, |
| "loss": 1.6638, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8962264150943396, |
| "grad_norm": 2.25, |
| "learning_rate": 9.956257427853788e-06, |
| "loss": 1.6652, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9080188679245284, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.914050458138687e-06, |
| "loss": 1.6456, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9198113207547169, |
| "grad_norm": 2.125, |
| "learning_rate": 9.871403837430787e-06, |
| "loss": 1.6336, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.9316037735849056, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.828324442415267e-06, |
| "loss": 1.6592, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.9433962264150944, |
| "grad_norm": 2.25, |
| "learning_rate": 9.784819219561335e-06, |
| "loss": 1.6251, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9551886792452831, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.740895184002105e-06, |
| "loss": 1.638, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.9669811320754716, |
| "grad_norm": 2.125, |
| "learning_rate": 9.696559418403438e-06, |
| "loss": 1.6297, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9787735849056604, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.651819071821867e-06, |
| "loss": 1.6373, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.9905660377358491, |
| "grad_norm": 2.25, |
| "learning_rate": 9.606681358551822e-06, |
| "loss": 1.6636, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9976415094339622, |
| "eval_loss": 1.6890636682510376, |
| "eval_runtime": 5.777, |
| "eval_samples_per_second": 14.54, |
| "eval_steps_per_second": 14.54, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.0023584905660377, |
| "grad_norm": 2.546875, |
| "learning_rate": 9.56115355696235e-06, |
| "loss": 1.6559, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0141509433962264, |
| "grad_norm": 2.453125, |
| "learning_rate": 9.515243008323482e-06, |
| "loss": 1.5576, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.025943396226415, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.468957115622473e-06, |
| "loss": 1.5884, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.0377358490566038, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.42230334237008e-06, |
| "loss": 1.5567, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.0495283018867925, |
| "grad_norm": 2.25, |
| "learning_rate": 9.37528921139709e-06, |
| "loss": 1.577, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.0613207547169812, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.327922303641277e-06, |
| "loss": 1.5942, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.0731132075471699, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.280210256924987e-06, |
| "loss": 1.565, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.0849056603773586, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.23216076472356e-06, |
| "loss": 1.5633, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0966981132075473, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.183781574924765e-06, |
| "loss": 1.5821, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.1084905660377358, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.135080488579473e-06, |
| "loss": 1.569, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.1202830188679245, |
| "grad_norm": 2.265625, |
| "learning_rate": 9.086065358643754e-06, |
| "loss": 1.5876, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.1320754716981132, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.036744088712591e-06, |
| "loss": 1.5964, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.1438679245283019, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.98712463174546e-06, |
| "loss": 1.5798, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.1556603773584906, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.937214988783914e-06, |
| "loss": 1.5605, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.1674528301886793, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.887023207661441e-06, |
| "loss": 1.5753, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.179245283018868, |
| "grad_norm": 2.203125, |
| "learning_rate": 8.83655738170576e-06, |
| "loss": 1.575, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.179245283018868, |
| "eval_loss": 1.6877530813217163, |
| "eval_runtime": 5.5993, |
| "eval_samples_per_second": 15.002, |
| "eval_steps_per_second": 15.002, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1910377358490567, |
| "grad_norm": 2.421875, |
| "learning_rate": 8.78582564843379e-06, |
| "loss": 1.5861, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.2028301886792452, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.734836188239491e-06, |
| "loss": 1.5701, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.2146226415094339, |
| "grad_norm": 2.25, |
| "learning_rate": 8.68359722307479e-06, |
| "loss": 1.5372, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.2264150943396226, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.632117015123812e-06, |
| "loss": 1.5532, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.2382075471698113, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.580403865470608e-06, |
| "loss": 1.5902, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 2.390625, |
| "learning_rate": 8.528466112760638e-06, |
| "loss": 1.5525, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.2617924528301887, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.476312131856164e-06, |
| "loss": 1.5284, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.2735849056603774, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.42395033248583e-06, |
| "loss": 1.5249, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.2853773584905661, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.371389157888602e-06, |
| "loss": 1.5715, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.2971698113207548, |
| "grad_norm": 2.25, |
| "learning_rate": 8.318637083452323e-06, |
| "loss": 1.5713, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3089622641509435, |
| "grad_norm": 2.25, |
| "learning_rate": 8.265702615347056e-06, |
| "loss": 1.5727, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.320754716981132, |
| "grad_norm": 2.265625, |
| "learning_rate": 8.212594289153501e-06, |
| "loss": 1.5815, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.3325471698113207, |
| "grad_norm": 2.265625, |
| "learning_rate": 8.159320668486633e-06, |
| "loss": 1.5349, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.3443396226415094, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.105890343614842e-06, |
| "loss": 1.5543, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.3561320754716981, |
| "grad_norm": 2.3125, |
| "learning_rate": 8.052311930074767e-06, |
| "loss": 1.5759, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.3679245283018868, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.998594067282067e-06, |
| "loss": 1.5568, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.3797169811320755, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.944745417138312e-06, |
| "loss": 1.5792, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.3915094339622642, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.890774662634284e-06, |
| "loss": 1.5814, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.4033018867924527, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.83669050644986e-06, |
| "loss": 1.5262, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.4150943396226414, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.782501669550717e-06, |
| "loss": 1.5395, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.4268867924528301, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.728216889782096e-06, |
| "loss": 1.5824, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.4386792452830188, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.673844920459834e-06, |
| "loss": 1.5917, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.4504716981132075, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.619394528958923e-06, |
| "loss": 1.5323, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.4622641509433962, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.56487449529978e-06, |
| "loss": 1.5509, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.474056603773585, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.510293610732478e-06, |
| "loss": 1.5435, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.4858490566037736, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.4556606763191854e-06, |
| "loss": 1.5378, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.4976415094339623, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.400984501515011e-06, |
| "loss": 1.5521, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.346273902747486e-06, |
| "loss": 1.5588, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.5212264150943398, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.291537701994948e-06, |
| "loss": 1.5433, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.5330188679245285, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.236784725363994e-06, |
| "loss": 1.5735, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.544811320754717, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.182023801666313e-06, |
| "loss": 1.5861, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.5566037735849056, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.127263760995028e-06, |
| "loss": 1.576, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.5683962264150944, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.072513433300889e-06, |
| "loss": 1.5781, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.580188679245283, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.017781646968438e-06, |
| "loss": 1.579, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.5919811320754715, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.963077227392465e-06, |
| "loss": 1.5744, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.6037735849056602, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.908408995554915e-06, |
| "loss": 1.5747, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.615566037735849, |
| "grad_norm": 2.25, |
| "learning_rate": 6.853785766602541e-06, |
| "loss": 1.5786, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.6273584905660377, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.799216348425456e-06, |
| "loss": 1.5559, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.6391509433962264, |
| "grad_norm": 2.125, |
| "learning_rate": 6.744709540236898e-06, |
| "loss": 1.5324, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.650943396226415, |
| "grad_norm": 2.234375, |
| "learning_rate": 6.690274131154364e-06, |
| "loss": 1.5782, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.6627358490566038, |
| "grad_norm": 2.1875, |
| "learning_rate": 6.63591889878238e-06, |
| "loss": 1.5768, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.6745283018867925, |
| "grad_norm": 2.234375, |
| "learning_rate": 6.581652607797136e-06, |
| "loss": 1.5614, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.6863207547169812, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.527484008533192e-06, |
| "loss": 1.5675, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.6981132075471699, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.473421835572508e-06, |
| "loss": 1.5621, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.7099056603773586, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.419474806336007e-06, |
| "loss": 1.5508, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.7216981132075473, |
| "grad_norm": 2.265625, |
| "learning_rate": 6.365651619677913e-06, |
| "loss": 1.5503, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.733490566037736, |
| "grad_norm": 2.234375, |
| "learning_rate": 6.311960954483071e-06, |
| "loss": 1.5419, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.7452830188679245, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.258411468267494e-06, |
| "loss": 1.5326, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.7570754716981132, |
| "grad_norm": 2.203125, |
| "learning_rate": 6.205011795782359e-06, |
| "loss": 1.5288, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.7688679245283019, |
| "grad_norm": 2.21875, |
| "learning_rate": 6.151770547621672e-06, |
| "loss": 1.5505, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.7806603773584906, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.098696308833817e-06, |
| "loss": 1.5524, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.7924528301886793, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.045797637537242e-06, |
| "loss": 1.5685, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.8042452830188678, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.993083063540479e-06, |
| "loss": 1.5341, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.8160377358490565, |
| "grad_norm": 2.21875, |
| "learning_rate": 5.94056108696672e-06, |
| "loss": 1.5715, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.8278301886792452, |
| "grad_norm": 2.15625, |
| "learning_rate": 5.8882401768832e-06, |
| "loss": 1.5531, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.8396226415094339, |
| "grad_norm": 2.125, |
| "learning_rate": 5.836128769935567e-06, |
| "loss": 1.5836, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.8514150943396226, |
| "grad_norm": 2.15625, |
| "learning_rate": 5.784235268987498e-06, |
| "loss": 1.5577, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.8632075471698113, |
| "grad_norm": 2.25, |
| "learning_rate": 5.7325680417657444e-06, |
| "loss": 1.5722, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 2.09375, |
| "learning_rate": 5.681135419510858e-06, |
| "loss": 1.5645, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "grad_norm": 2.140625, |
| "learning_rate": 5.629945695633794e-06, |
| "loss": 1.5979, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.8985849056603774, |
| "grad_norm": 2.21875, |
| "learning_rate": 5.579007124378612e-06, |
| "loss": 1.5541, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.9103773584905661, |
| "grad_norm": 2.15625, |
| "learning_rate": 5.528327919491496e-06, |
| "loss": 1.5456, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.9221698113207548, |
| "grad_norm": 2.109375, |
| "learning_rate": 5.477916252896307e-06, |
| "loss": 1.5731, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.9339622641509435, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.4277802533768706e-06, |
| "loss": 1.5789, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.9457547169811322, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.377928005266233e-06, |
| "loss": 1.5654, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.9575471698113207, |
| "grad_norm": 2.265625, |
| "learning_rate": 5.328367547143068e-06, |
| "loss": 1.5347, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.9693396226415094, |
| "grad_norm": 2.25, |
| "learning_rate": 5.279106870535483e-06, |
| "loss": 1.5501, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.9811320754716981, |
| "grad_norm": 2.15625, |
| "learning_rate": 5.2301539186323925e-06, |
| "loss": 1.5676, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.9929245283018868, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.1815165850027e-06, |
| "loss": 1.5831, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.9952830188679245, |
| "eval_loss": 1.6711690425872803, |
| "eval_runtime": 5.5851, |
| "eval_samples_per_second": 15.04, |
| "eval_steps_per_second": 15.04, |
| "step": 846 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1272, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0098432568637522e+18, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |