{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 544,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00919963201471941,
      "grad_norm": 4.493361949920654,
      "learning_rate": 7e-06,
      "loss": 2.5772,
      "step": 5
    },
    {
      "epoch": 0.01839926402943882,
      "grad_norm": 2.119591236114502,
      "learning_rate": 1.575e-05,
      "loss": 2.35,
      "step": 10
    },
    {
      "epoch": 0.027598896044158234,
      "grad_norm": 0.7079921364784241,
      "learning_rate": 2.4499999999999996e-05,
      "loss": 1.9809,
      "step": 15
    },
    {
      "epoch": 0.03679852805887764,
      "grad_norm": 9.288702964782715,
      "learning_rate": 3.3249999999999995e-05,
      "loss": 1.8818,
      "step": 20
    },
    {
      "epoch": 0.045998160073597055,
      "grad_norm": 0.5875306129455566,
      "learning_rate": 4.2e-05,
      "loss": 1.8314,
      "step": 25
    },
    {
      "epoch": 0.05519779208831647,
      "grad_norm": 0.5792304873466492,
      "learning_rate": 5.0749999999999994e-05,
      "loss": 1.6966,
      "step": 30
    },
    {
      "epoch": 0.06439742410303588,
      "grad_norm": 0.5320371985435486,
      "learning_rate": 5.9499999999999996e-05,
      "loss": 1.7159,
      "step": 35
    },
    {
      "epoch": 0.07359705611775529,
      "grad_norm": 0.41402390599250793,
      "learning_rate": 6.824999999999999e-05,
      "loss": 1.6542,
      "step": 40
    },
    {
      "epoch": 0.0827966881324747,
      "grad_norm": 0.46447646617889404,
      "learning_rate": 6.98237885462555e-05,
      "loss": 1.5679,
      "step": 45
    },
    {
      "epoch": 0.09199632014719411,
      "grad_norm": 0.44029220938682556,
      "learning_rate": 6.960352422907488e-05,
      "loss": 1.6419,
      "step": 50
    },
    {
      "epoch": 0.10119595216191353,
      "grad_norm": 0.425231009721756,
      "learning_rate": 6.938325991189428e-05,
      "loss": 1.5516,
      "step": 55
    },
    {
      "epoch": 0.11039558417663294,
      "grad_norm": 0.4705358147621155,
      "learning_rate": 6.916299559471366e-05,
      "loss": 1.572,
      "step": 60
    },
    {
      "epoch": 0.11959521619135234,
      "grad_norm": 0.5355105400085449,
      "learning_rate": 6.894273127753303e-05,
      "loss": 1.5585,
      "step": 65
    },
    {
      "epoch": 0.12879484820607176,
      "grad_norm": 0.4822757840156555,
      "learning_rate": 6.872246696035241e-05,
      "loss": 1.5078,
      "step": 70
    },
    {
      "epoch": 0.13799448022079117,
      "grad_norm": 0.4612935781478882,
      "learning_rate": 6.85022026431718e-05,
      "loss": 1.5677,
      "step": 75
    },
    {
      "epoch": 0.14719411223551057,
      "grad_norm": 0.5228879451751709,
      "learning_rate": 6.828193832599119e-05,
      "loss": 1.5224,
      "step": 80
    },
    {
      "epoch": 0.15639374425023,
      "grad_norm": 0.4631298780441284,
      "learning_rate": 6.806167400881057e-05,
      "loss": 1.4413,
      "step": 85
    },
    {
      "epoch": 0.1655933762649494,
      "grad_norm": 0.5098944306373596,
      "learning_rate": 6.784140969162995e-05,
      "loss": 1.4965,
      "step": 90
    },
    {
      "epoch": 0.17479300827966882,
      "grad_norm": 0.5739426016807556,
      "learning_rate": 6.762114537444933e-05,
      "loss": 1.4526,
      "step": 95
    },
    {
      "epoch": 0.18399264029438822,
      "grad_norm": 0.5409196019172668,
      "learning_rate": 6.740088105726871e-05,
      "loss": 1.4667,
      "step": 100
    },
    {
      "epoch": 0.19319227230910763,
      "grad_norm": 0.6061224341392517,
      "learning_rate": 6.71806167400881e-05,
      "loss": 1.5092,
      "step": 105
    },
    {
      "epoch": 0.20239190432382706,
      "grad_norm": 4.403282642364502,
      "learning_rate": 6.696035242290749e-05,
      "loss": 1.42,
      "step": 110
    },
    {
      "epoch": 0.21159153633854647,
      "grad_norm": 0.5433526635169983,
      "learning_rate": 6.674008810572687e-05,
      "loss": 1.5852,
      "step": 115
    },
    {
      "epoch": 0.22079116835326587,
      "grad_norm": 0.5382786393165588,
      "learning_rate": 6.651982378854625e-05,
      "loss": 1.4788,
      "step": 120
    },
    {
      "epoch": 0.22999080036798528,
      "grad_norm": 0.5642833709716797,
      "learning_rate": 6.629955947136563e-05,
      "loss": 1.4694,
      "step": 125
    },
    {
      "epoch": 0.23919043238270468,
      "grad_norm": 0.5912417769432068,
      "learning_rate": 6.607929515418502e-05,
      "loss": 1.4685,
      "step": 130
    },
    {
      "epoch": 0.24839006439742412,
      "grad_norm": 0.584441602230072,
      "learning_rate": 6.58590308370044e-05,
      "loss": 1.4401,
      "step": 135
    },
    {
      "epoch": 0.2575896964121435,
      "grad_norm": 0.5686805844306946,
      "learning_rate": 6.563876651982378e-05,
      "loss": 1.4622,
      "step": 140
    },
    {
      "epoch": 0.2667893284268629,
      "grad_norm": 0.5634583234786987,
      "learning_rate": 6.541850220264316e-05,
      "loss": 1.4674,
      "step": 145
    },
    {
      "epoch": 0.27598896044158233,
      "grad_norm": 0.5362507700920105,
      "learning_rate": 6.519823788546254e-05,
      "loss": 1.4368,
      "step": 150
    },
    {
      "epoch": 0.28518859245630174,
      "grad_norm": 0.528346836566925,
      "learning_rate": 6.497797356828193e-05,
      "loss": 1.4645,
      "step": 155
    },
    {
      "epoch": 0.29438822447102114,
      "grad_norm": 0.6441851258277893,
      "learning_rate": 6.475770925110131e-05,
      "loss": 1.4359,
      "step": 160
    },
    {
      "epoch": 0.30358785648574055,
      "grad_norm": 0.6480420231819153,
      "learning_rate": 6.45374449339207e-05,
      "loss": 1.5072,
      "step": 165
    },
    {
      "epoch": 0.31278748850046,
      "grad_norm": 0.6448924541473389,
      "learning_rate": 6.431718061674008e-05,
      "loss": 1.3991,
      "step": 170
    },
    {
      "epoch": 0.3219871205151794,
      "grad_norm": 232.7066650390625,
      "learning_rate": 6.409691629955947e-05,
      "loss": 1.4437,
      "step": 175
    },
    {
      "epoch": 0.3311867525298988,
      "grad_norm": 0.6596648097038269,
      "learning_rate": 6.387665198237885e-05,
      "loss": 1.4778,
      "step": 180
    },
    {
      "epoch": 0.3403863845446182,
      "grad_norm": 0.6211815476417542,
      "learning_rate": 6.365638766519823e-05,
      "loss": 1.4235,
      "step": 185
    },
    {
      "epoch": 0.34958601655933763,
      "grad_norm": 0.597537636756897,
      "learning_rate": 6.343612334801761e-05,
      "loss": 1.4416,
      "step": 190
    },
    {
      "epoch": 0.35878564857405704,
      "grad_norm": 0.5805206894874573,
      "learning_rate": 6.321585903083701e-05,
      "loss": 1.4412,
      "step": 195
    },
    {
      "epoch": 0.36798528058877644,
      "grad_norm": 0.6511718034744263,
      "learning_rate": 6.299559471365639e-05,
      "loss": 1.4406,
      "step": 200
    },
    {
      "epoch": 0.37718491260349585,
      "grad_norm": 0.6224706768989563,
      "learning_rate": 6.277533039647577e-05,
      "loss": 1.428,
      "step": 205
    },
    {
      "epoch": 0.38638454461821525,
      "grad_norm": 0.7181910276412964,
      "learning_rate": 6.255506607929515e-05,
      "loss": 1.4363,
      "step": 210
    },
    {
      "epoch": 0.39558417663293466,
      "grad_norm": 0.6790558695793152,
      "learning_rate": 6.233480176211453e-05,
      "loss": 1.4516,
      "step": 215
    },
    {
      "epoch": 0.4047838086476541,
      "grad_norm": 0.9428783655166626,
      "learning_rate": 6.211453744493392e-05,
      "loss": 1.4527,
      "step": 220
    },
    {
      "epoch": 0.4139834406623735,
      "grad_norm": 0.7599747180938721,
      "learning_rate": 6.18942731277533e-05,
      "loss": 1.4756,
      "step": 225
    },
    {
      "epoch": 0.42318307267709293,
      "grad_norm": 0.5899630784988403,
      "learning_rate": 6.167400881057268e-05,
      "loss": 1.4126,
      "step": 230
    },
    {
      "epoch": 0.43238270469181234,
      "grad_norm": 0.7069624066352844,
      "learning_rate": 6.145374449339206e-05,
      "loss": 1.3768,
      "step": 235
    },
    {
      "epoch": 0.44158233670653174,
      "grad_norm": 1.1761517524719238,
      "learning_rate": 6.123348017621144e-05,
      "loss": 1.3948,
      "step": 240
    },
    {
      "epoch": 0.45078196872125115,
      "grad_norm": 0.8364585041999817,
      "learning_rate": 6.101321585903083e-05,
      "loss": 1.4601,
      "step": 245
    },
    {
      "epoch": 0.45998160073597055,
      "grad_norm": 0.7272374033927917,
      "learning_rate": 6.0792951541850214e-05,
      "loss": 1.4054,
      "step": 250
    },
    {
      "epoch": 0.46918123275068996,
      "grad_norm": 0.722029983997345,
      "learning_rate": 6.0572687224669595e-05,
      "loss": 1.4382,
      "step": 255
    },
    {
      "epoch": 0.47838086476540936,
      "grad_norm": 0.7180947065353394,
      "learning_rate": 6.0352422907488984e-05,
      "loss": 1.414,
      "step": 260
    },
    {
      "epoch": 0.48758049678012877,
      "grad_norm": 0.8569415211677551,
      "learning_rate": 6.0132158590308366e-05,
      "loss": 1.3801,
      "step": 265
    },
    {
      "epoch": 0.49678012879484823,
      "grad_norm": 0.6971564292907715,
      "learning_rate": 5.991189427312775e-05,
      "loss": 1.4264,
      "step": 270
    },
    {
      "epoch": 0.5059797608095676,
      "grad_norm": 1.1400465965270996,
      "learning_rate": 5.9691629955947136e-05,
      "loss": 1.4571,
      "step": 275
    },
    {
      "epoch": 0.515179392824287,
      "grad_norm": 0.6640013456344604,
      "learning_rate": 5.947136563876652e-05,
      "loss": 1.4494,
      "step": 280
    },
    {
      "epoch": 0.5243790248390064,
      "grad_norm": 0.612842857837677,
      "learning_rate": 5.92511013215859e-05,
      "loss": 1.3979,
      "step": 285
    },
    {
      "epoch": 0.5335786568537259,
      "grad_norm": 6.370260715484619,
      "learning_rate": 5.903083700440528e-05,
      "loss": 1.4008,
      "step": 290
    },
    {
      "epoch": 0.5427782888684453,
      "grad_norm": 0.6938475966453552,
      "learning_rate": 5.881057268722466e-05,
      "loss": 1.3901,
      "step": 295
    },
    {
      "epoch": 0.5519779208831647,
      "grad_norm": 0.7252101302146912,
      "learning_rate": 5.8590308370044045e-05,
      "loss": 1.4129,
      "step": 300
    },
    {
      "epoch": 0.5611775528978841,
      "grad_norm": 0.7261694669723511,
      "learning_rate": 5.837004405286343e-05,
      "loss": 1.4504,
      "step": 305
    },
    {
      "epoch": 0.5703771849126035,
      "grad_norm": 1.128242015838623,
      "learning_rate": 5.8149779735682815e-05,
      "loss": 1.4392,
      "step": 310
    },
    {
      "epoch": 0.5795768169273229,
      "grad_norm": 0.7343323230743408,
      "learning_rate": 5.79295154185022e-05,
      "loss": 1.4001,
      "step": 315
    },
    {
      "epoch": 0.5887764489420423,
      "grad_norm": 0.6398953795433044,
      "learning_rate": 5.770925110132158e-05,
      "loss": 1.4533,
      "step": 320
    },
    {
      "epoch": 0.5979760809567617,
      "grad_norm": 0.9343668222427368,
      "learning_rate": 5.748898678414096e-05,
      "loss": 1.4464,
      "step": 325
    },
    {
      "epoch": 0.6071757129714811,
      "grad_norm": 0.7744137644767761,
      "learning_rate": 5.726872246696035e-05,
      "loss": 1.3802,
      "step": 330
    },
    {
      "epoch": 0.6163753449862005,
      "grad_norm": 0.7824112772941589,
      "learning_rate": 5.704845814977973e-05,
      "loss": 1.3885,
      "step": 335
    },
    {
      "epoch": 0.62557497700092,
      "grad_norm": 0.7310079336166382,
      "learning_rate": 5.682819383259911e-05,
      "loss": 1.3351,
      "step": 340
    },
    {
      "epoch": 0.6347746090156394,
      "grad_norm": 0.6795840859413147,
      "learning_rate": 5.66079295154185e-05,
      "loss": 1.4371,
      "step": 345
    },
    {
      "epoch": 0.6439742410303588,
      "grad_norm": 0.7529902458190918,
      "learning_rate": 5.638766519823788e-05,
      "loss": 1.4985,
      "step": 350
    },
    {
      "epoch": 0.6531738730450782,
      "grad_norm": 0.7649794220924377,
      "learning_rate": 5.6167400881057265e-05,
      "loss": 1.4007,
      "step": 355
    },
    {
      "epoch": 0.6623735050597976,
      "grad_norm": 0.8272032141685486,
      "learning_rate": 5.594713656387665e-05,
      "loss": 1.4233,
      "step": 360
    },
    {
      "epoch": 0.671573137074517,
      "grad_norm": 0.7489705085754395,
      "learning_rate": 5.5726872246696035e-05,
      "loss": 1.3739,
      "step": 365
    },
    {
      "epoch": 0.6807727690892365,
      "grad_norm": 0.8254089951515198,
      "learning_rate": 5.550660792951541e-05,
      "loss": 1.5,
      "step": 370
    },
    {
      "epoch": 0.6899724011039559,
      "grad_norm": 0.7875320911407471,
      "learning_rate": 5.528634361233479e-05,
      "loss": 1.3998,
      "step": 375
    },
    {
      "epoch": 0.6991720331186753,
      "grad_norm": 0.7118289470672607,
      "learning_rate": 5.506607929515418e-05,
      "loss": 1.4549,
      "step": 380
    },
    {
      "epoch": 0.7083716651333947,
      "grad_norm": 0.7725639343261719,
      "learning_rate": 5.484581497797356e-05,
      "loss": 1.3842,
      "step": 385
    },
    {
      "epoch": 0.7175712971481141,
      "grad_norm": 1.2431738376617432,
      "learning_rate": 5.4625550660792944e-05,
      "loss": 1.4364,
      "step": 390
    },
    {
      "epoch": 0.7267709291628335,
      "grad_norm": 0.9481919407844543,
      "learning_rate": 5.4405286343612326e-05,
      "loss": 1.3786,
      "step": 395
    },
    {
      "epoch": 0.7359705611775529,
      "grad_norm": 0.7844451069831848,
      "learning_rate": 5.4185022026431715e-05,
      "loss": 1.3388,
      "step": 400
    },
    {
      "epoch": 0.7451701931922723,
      "grad_norm": 0.8081017136573792,
      "learning_rate": 5.3964757709251096e-05,
      "loss": 1.4376,
      "step": 405
    },
    {
      "epoch": 0.7543698252069917,
      "grad_norm": 13030255.0,
      "learning_rate": 5.374449339207048e-05,
      "loss": 1.439,
      "step": 410
    },
    {
      "epoch": 0.7635694572217111,
      "grad_norm": 0.7132210731506348,
      "learning_rate": 5.352422907488987e-05,
      "loss": 1.392,
      "step": 415
    },
    {
      "epoch": 0.7727690892364305,
      "grad_norm": 0.8951236009597778,
      "learning_rate": 5.330396475770925e-05,
      "loss": 1.3945,
      "step": 420
    },
    {
      "epoch": 0.7819687212511499,
      "grad_norm": 0.7916706800460815,
      "learning_rate": 5.308370044052863e-05,
      "loss": 1.3567,
      "step": 425
    },
    {
      "epoch": 0.7911683532658693,
      "grad_norm": 0.8253033757209778,
      "learning_rate": 5.286343612334801e-05,
      "loss": 1.3707,
      "step": 430
    },
    {
      "epoch": 0.8003679852805887,
      "grad_norm": 0.8816981911659241,
      "learning_rate": 5.26431718061674e-05,
      "loss": 1.3911,
      "step": 435
    },
    {
      "epoch": 0.8095676172953082,
      "grad_norm": 0.7913665175437927,
      "learning_rate": 5.242290748898678e-05,
      "loss": 1.3613,
      "step": 440
    },
    {
      "epoch": 0.8187672493100276,
      "grad_norm": 0.7874677777290344,
      "learning_rate": 5.2202643171806164e-05,
      "loss": 1.4487,
      "step": 445
    },
    {
      "epoch": 0.827966881324747,
      "grad_norm": 0.7003588080406189,
      "learning_rate": 5.1982378854625546e-05,
      "loss": 1.3712,
      "step": 450
    },
    {
      "epoch": 0.8371665133394665,
      "grad_norm": 0.7311933040618896,
      "learning_rate": 5.176211453744493e-05,
      "loss": 1.3422,
      "step": 455
    },
    {
      "epoch": 0.8463661453541859,
      "grad_norm": 0.823244571685791,
      "learning_rate": 5.154185022026431e-05,
      "loss": 1.3605,
      "step": 460
    },
    {
      "epoch": 0.8555657773689053,
      "grad_norm": 0.8285578489303589,
      "learning_rate": 5.132158590308369e-05,
      "loss": 1.418,
      "step": 465
    },
    {
      "epoch": 0.8647654093836247,
      "grad_norm": 0.8007466197013855,
      "learning_rate": 5.110132158590308e-05,
      "loss": 1.3539,
      "step": 470
    },
    {
      "epoch": 0.8739650413983441,
      "grad_norm": 0.7975384593009949,
      "learning_rate": 5.088105726872246e-05,
      "loss": 1.3959,
      "step": 475
    },
    {
      "epoch": 0.8831646734130635,
      "grad_norm": 0.931978702545166,
      "learning_rate": 5.0660792951541843e-05,
      "loss": 1.3734,
      "step": 480
    },
    {
      "epoch": 0.8923643054277829,
      "grad_norm": 0.9099324345588684,
      "learning_rate": 5.044052863436123e-05,
      "loss": 1.3722,
      "step": 485
    },
    {
      "epoch": 0.9015639374425023,
      "grad_norm": 0.7978657484054565,
      "learning_rate": 5.0220264317180614e-05,
      "loss": 1.3741,
      "step": 490
    },
    {
      "epoch": 0.9107635694572217,
      "grad_norm": 0.8722144365310669,
      "learning_rate": 4.9999999999999996e-05,
      "loss": 1.3869,
      "step": 495
    },
    {
      "epoch": 0.9199632014719411,
      "grad_norm": 1309.4061279296875,
      "learning_rate": 4.977973568281938e-05,
      "loss": 1.4329,
      "step": 500
    },
    {
      "epoch": 0.9291628334866605,
      "grad_norm": 0.807162880897522,
      "learning_rate": 4.9559471365638766e-05,
      "loss": 1.3755,
      "step": 505
    },
    {
      "epoch": 0.9383624655013799,
      "grad_norm": 0.9719377160072327,
      "learning_rate": 4.933920704845815e-05,
      "loss": 1.4036,
      "step": 510
    },
    {
      "epoch": 0.9475620975160993,
      "grad_norm": 3.86576247215271,
      "learning_rate": 4.911894273127753e-05,
      "loss": 1.3866,
      "step": 515
    },
    {
      "epoch": 0.9567617295308187,
      "grad_norm": 0.8033193945884705,
      "learning_rate": 4.889867841409692e-05,
      "loss": 1.3861,
      "step": 520
    },
    {
      "epoch": 0.9659613615455381,
      "grad_norm": 0.8535065650939941,
      "learning_rate": 4.867841409691629e-05,
      "loss": 1.2592,
      "step": 525
    },
    {
      "epoch": 0.9751609935602575,
      "grad_norm": 0.8155761957168579,
      "learning_rate": 4.8458149779735675e-05,
      "loss": 1.3123,
      "step": 530
    },
    {
      "epoch": 0.984360625574977,
      "grad_norm": 155.9907989501953,
      "learning_rate": 4.823788546255506e-05,
      "loss": 1.3567,
      "step": 535
    },
    {
      "epoch": 0.9935602575896965,
      "grad_norm": 2.225149631500244,
      "learning_rate": 4.8017621145374445e-05,
      "loss": 1.3736,
      "step": 540
    }
  ],
  "logging_steps": 5,
  "max_steps": 1629,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.125792658549078e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}