diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20812 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 14835, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016854879487611663, + "grad_norm": 45.635078873413356, + "learning_rate": 8.968609865470853e-08, + "loss": 2.018, + "step": 5 + }, + { + "epoch": 0.0033709758975223325, + "grad_norm": 54.815981003563216, + "learning_rate": 2.0179372197309417e-07, + "loss": 1.9624, + "step": 10 + }, + { + "epoch": 0.005056463846283499, + "grad_norm": 47.342183607767005, + "learning_rate": 3.1390134529147985e-07, + "loss": 1.9596, + "step": 15 + }, + { + "epoch": 0.006741951795044665, + "grad_norm": 58.86327972981608, + "learning_rate": 4.2600896860986547e-07, + "loss": 1.8431, + "step": 20 + }, + { + "epoch": 0.008427439743805831, + "grad_norm": 46.738743972646034, + "learning_rate": 5.381165919282512e-07, + "loss": 1.5319, + "step": 25 + }, + { + "epoch": 0.010112927692566998, + "grad_norm": 20.284827182581076, + "learning_rate": 6.502242152466367e-07, + "loss": 1.2438, + "step": 30 + }, + { + "epoch": 0.011798415641328164, + "grad_norm": 12.172972360337889, + "learning_rate": 7.623318385650225e-07, + "loss": 1.0174, + "step": 35 + }, + { + "epoch": 0.01348390359008933, + "grad_norm": 9.808958694438337, + "learning_rate": 8.744394618834082e-07, + "loss": 0.9619, + "step": 40 + }, + { + "epoch": 0.015169391538850498, + "grad_norm": 5.846992899686357, + "learning_rate": 9.865470852017938e-07, + "loss": 0.936, + "step": 45 + }, + { + "epoch": 0.016854879487611663, + "grad_norm": 4.407895578972216, + "learning_rate": 1.0986547085201794e-06, + "loss": 0.7907, + "step": 50 + }, + { + "epoch": 0.01854036743637283, + "grad_norm": 4.4229026540725895, + "learning_rate": 1.2107623318385651e-06, + "loss": 0.8571, + "step": 55 + }, + { + "epoch": 0.020225855385133995, + "grad_norm": 3.193351753212815, + "learning_rate": 1.3228699551569508e-06, + "loss": 0.755, + "step": 60 + }, + { + "epoch": 0.02191134333389516, + "grad_norm": 3.897422918904646, + "learning_rate": 1.4349775784753365e-06, + "loss": 0.7606, + "step": 65 + }, + { + "epoch": 0.023596831282656328, + "grad_norm": 2.920303290471853, + "learning_rate": 1.5470852017937221e-06, + "loss": 0.7553, + "step": 70 + }, + { + "epoch": 0.025282319231417494, + "grad_norm": 3.177702608453161, + "learning_rate": 1.6591928251121078e-06, + "loss": 0.7082, + "step": 75 + }, + { + "epoch": 0.02696780718017866, + "grad_norm": 3.8072115047524786, + "learning_rate": 1.7713004484304935e-06, + "loss": 0.7497, + "step": 80 + }, + { + "epoch": 0.02865329512893983, + "grad_norm": 4.215753700800818, + "learning_rate": 1.883408071748879e-06, + "loss": 0.6929, + "step": 85 + }, + { + "epoch": 0.030338783077700996, + "grad_norm": 3.120813016518093, + "learning_rate": 1.9955156950672647e-06, + "loss": 0.6795, + "step": 90 + }, + { + "epoch": 0.03202427102646216, + "grad_norm": 3.7925518313582076, + "learning_rate": 2.1076233183856503e-06, + "loss": 0.6336, + "step": 95 + }, + { + "epoch": 0.033709758975223325, + "grad_norm": 3.9090323792139565, + "learning_rate": 2.219730941704036e-06, + "loss": 0.6241, + "step": 100 + }, + { + "epoch": 0.03539524692398449, + "grad_norm": 3.2401672745432233, + "learning_rate": 2.3318385650224217e-06, + "loss": 0.5761, + "step": 105 + }, + { + "epoch": 0.03708073487274566, + "grad_norm": 2.6831203132181436, + "learning_rate": 2.4439461883408074e-06, + "loss": 0.5994, + "step": 110 + }, + { + "epoch": 0.038766222821506824, + "grad_norm": 3.1421487063706506, + "learning_rate": 2.556053811659193e-06, + "loss": 0.5819, + "step": 115 + }, + { + "epoch": 0.04045171077026799, + "grad_norm": 3.05063467620266, + "learning_rate": 2.6681614349775787e-06, + "loss": 0.6341, + "step": 120 + }, + { + "epoch": 0.042137198719029156, + "grad_norm": 3.0042373128046185, + "learning_rate": 2.7802690582959644e-06, + "loss": 0.5358, + "step": 125 + }, + { + "epoch": 0.04382268666779032, + "grad_norm": 2.6261508492463865, + "learning_rate": 2.89237668161435e-06, + "loss": 0.5549, + "step": 130 + }, + { + "epoch": 0.04550817461655149, + "grad_norm": 2.438153401729035, + "learning_rate": 3.0044843049327353e-06, + "loss": 0.5276, + "step": 135 + }, + { + "epoch": 0.047193662565312655, + "grad_norm": 3.161425452337587, + "learning_rate": 3.1165919282511214e-06, + "loss": 0.5024, + "step": 140 + }, + { + "epoch": 0.04887915051407382, + "grad_norm": 2.724770652328943, + "learning_rate": 3.2286995515695067e-06, + "loss": 0.502, + "step": 145 + }, + { + "epoch": 0.05056463846283499, + "grad_norm": 3.501981992101439, + "learning_rate": 3.340807174887893e-06, + "loss": 0.4506, + "step": 150 + }, + { + "epoch": 0.052250126411596154, + "grad_norm": 3.643587579316495, + "learning_rate": 3.4529147982062785e-06, + "loss": 0.4734, + "step": 155 + }, + { + "epoch": 0.05393561436035732, + "grad_norm": 3.2491410588044594, + "learning_rate": 3.5650224215246637e-06, + "loss": 0.47, + "step": 160 + }, + { + "epoch": 0.055621102309118486, + "grad_norm": 2.4074659243017456, + "learning_rate": 3.67713004484305e-06, + "loss": 0.4135, + "step": 165 + }, + { + "epoch": 0.05730659025787966, + "grad_norm": 4.057738750671516, + "learning_rate": 3.789237668161435e-06, + "loss": 0.4755, + "step": 170 + }, + { + "epoch": 0.058992078206640826, + "grad_norm": 3.0465958363209715, + "learning_rate": 3.901345291479821e-06, + "loss": 0.4529, + "step": 175 + }, + { + "epoch": 0.06067756615540199, + "grad_norm": 2.95941371221577, + "learning_rate": 4.0134529147982064e-06, + "loss": 0.453, + "step": 180 + }, + { + "epoch": 0.06236305410416316, + "grad_norm": 3.6181830736645773, + "learning_rate": 4.125560538116592e-06, + "loss": 0.4474, + "step": 185 + }, + { + "epoch": 0.06404854205292432, + "grad_norm": 3.390695544104659, + "learning_rate": 4.237668161434978e-06, + "loss": 0.4435, + "step": 190 + }, + { + "epoch": 0.06573403000168548, + "grad_norm": 3.283255105498521, + "learning_rate": 4.3497757847533635e-06, + "loss": 0.4575, + "step": 195 + }, + { + "epoch": 0.06741951795044665, + "grad_norm": 5.873175991018478, + "learning_rate": 4.461883408071749e-06, + "loss": 0.4202, + "step": 200 + }, + { + "epoch": 0.06910500589920782, + "grad_norm": 3.032967552769786, + "learning_rate": 4.573991031390135e-06, + "loss": 0.4152, + "step": 205 + }, + { + "epoch": 0.07079049384796898, + "grad_norm": 4.450464722208425, + "learning_rate": 4.6860986547085205e-06, + "loss": 0.3909, + "step": 210 + }, + { + "epoch": 0.07247598179673015, + "grad_norm": 2.6249909571371406, + "learning_rate": 4.798206278026906e-06, + "loss": 0.3833, + "step": 215 + }, + { + "epoch": 0.07416146974549132, + "grad_norm": 2.7675155415343666, + "learning_rate": 4.910313901345292e-06, + "loss": 0.4347, + "step": 220 + }, + { + "epoch": 0.07584695769425248, + "grad_norm": 2.705989833169705, + "learning_rate": 5.0224215246636775e-06, + "loss": 0.4053, + "step": 225 + }, + { + "epoch": 0.07753244564301365, + "grad_norm": 2.593711819126537, + "learning_rate": 5.134529147982063e-06, + "loss": 0.4229, + "step": 230 + }, + { + "epoch": 0.07921793359177481, + "grad_norm": 2.4338770361517734, + "learning_rate": 5.246636771300448e-06, + "loss": 0.4162, + "step": 235 + }, + { + "epoch": 0.08090342154053598, + "grad_norm": 3.003588069957525, + "learning_rate": 5.358744394618835e-06, + "loss": 0.3861, + "step": 240 + }, + { + "epoch": 0.08258890948929715, + "grad_norm": 2.1502174031404246, + "learning_rate": 5.47085201793722e-06, + "loss": 0.4081, + "step": 245 + }, + { + "epoch": 0.08427439743805831, + "grad_norm": 3.6222165866591824, + "learning_rate": 5.582959641255605e-06, + "loss": 0.4212, + "step": 250 + }, + { + "epoch": 0.08595988538681948, + "grad_norm": 2.488014950380137, + "learning_rate": 5.695067264573992e-06, + "loss": 0.4316, + "step": 255 + }, + { + "epoch": 0.08764537333558065, + "grad_norm": 2.035292151113688, + "learning_rate": 5.807174887892377e-06, + "loss": 0.4135, + "step": 260 + }, + { + "epoch": 0.08933086128434181, + "grad_norm": 1.971410200056052, + "learning_rate": 5.919282511210763e-06, + "loss": 0.4468, + "step": 265 + }, + { + "epoch": 0.09101634923310298, + "grad_norm": 2.3089000226070056, + "learning_rate": 6.031390134529148e-06, + "loss": 0.4009, + "step": 270 + }, + { + "epoch": 0.09270183718186414, + "grad_norm": 2.612380102704638, + "learning_rate": 6.143497757847534e-06, + "loss": 0.4016, + "step": 275 + }, + { + "epoch": 0.09438732513062531, + "grad_norm": 2.3993241428609906, + "learning_rate": 6.25560538116592e-06, + "loss": 0.4309, + "step": 280 + }, + { + "epoch": 0.09607281307938648, + "grad_norm": 4.233998484786202, + "learning_rate": 6.367713004484305e-06, + "loss": 0.4054, + "step": 285 + }, + { + "epoch": 0.09775830102814764, + "grad_norm": 2.322719482657519, + "learning_rate": 6.479820627802691e-06, + "loss": 0.4196, + "step": 290 + }, + { + "epoch": 0.09944378897690881, + "grad_norm": 5.679288461330974, + "learning_rate": 6.591928251121077e-06, + "loss": 0.4146, + "step": 295 + }, + { + "epoch": 0.10112927692566998, + "grad_norm": 4.9794233931360425, + "learning_rate": 6.704035874439463e-06, + "loss": 0.4195, + "step": 300 + }, + { + "epoch": 0.10281476487443114, + "grad_norm": 1.9120380706775335, + "learning_rate": 6.8161434977578476e-06, + "loss": 0.4317, + "step": 305 + }, + { + "epoch": 0.10450025282319231, + "grad_norm": 2.044216254914569, + "learning_rate": 6.928251121076234e-06, + "loss": 0.4266, + "step": 310 + }, + { + "epoch": 0.10618574077195347, + "grad_norm": 1.9008300446356923, + "learning_rate": 7.04035874439462e-06, + "loss": 0.3963, + "step": 315 + }, + { + "epoch": 0.10787122872071464, + "grad_norm": 2.0936197065546254, + "learning_rate": 7.152466367713005e-06, + "loss": 0.396, + "step": 320 + }, + { + "epoch": 0.1095567166694758, + "grad_norm": 2.504640994538501, + "learning_rate": 7.26457399103139e-06, + "loss": 0.3755, + "step": 325 + }, + { + "epoch": 0.11124220461823697, + "grad_norm": 2.0338356261514616, + "learning_rate": 7.376681614349777e-06, + "loss": 0.397, + "step": 330 + }, + { + "epoch": 0.11292769256699814, + "grad_norm": 2.07093018887604, + "learning_rate": 7.4887892376681625e-06, + "loss": 0.3631, + "step": 335 + }, + { + "epoch": 0.11461318051575932, + "grad_norm": 2.4353106198358168, + "learning_rate": 7.600896860986547e-06, + "loss": 0.3918, + "step": 340 + }, + { + "epoch": 0.11629866846452049, + "grad_norm": 1.884231089705885, + "learning_rate": 7.713004484304933e-06, + "loss": 0.3803, + "step": 345 + }, + { + "epoch": 0.11798415641328165, + "grad_norm": 2.5897406971523105, + "learning_rate": 7.825112107623319e-06, + "loss": 0.3917, + "step": 350 + }, + { + "epoch": 0.11966964436204282, + "grad_norm": 2.307975389115971, + "learning_rate": 7.937219730941704e-06, + "loss": 0.3866, + "step": 355 + }, + { + "epoch": 0.12135513231080398, + "grad_norm": 2.467091283833354, + "learning_rate": 8.04932735426009e-06, + "loss": 0.4249, + "step": 360 + }, + { + "epoch": 0.12304062025956515, + "grad_norm": 2.0244456034927913, + "learning_rate": 8.161434977578476e-06, + "loss": 0.4185, + "step": 365 + }, + { + "epoch": 0.12472610820832632, + "grad_norm": 2.4621737674260573, + "learning_rate": 8.273542600896861e-06, + "loss": 0.3617, + "step": 370 + }, + { + "epoch": 0.12641159615708747, + "grad_norm": 2.1008353268942312, + "learning_rate": 8.385650224215247e-06, + "loss": 0.3871, + "step": 375 + }, + { + "epoch": 0.12809708410584864, + "grad_norm": 2.3606333816070326, + "learning_rate": 8.497757847533633e-06, + "loss": 0.3869, + "step": 380 + }, + { + "epoch": 0.1297825720546098, + "grad_norm": 1.7221519588815626, + "learning_rate": 8.609865470852018e-06, + "loss": 0.433, + "step": 385 + }, + { + "epoch": 0.13146806000337097, + "grad_norm": 2.4647014732321466, + "learning_rate": 8.721973094170404e-06, + "loss": 0.4185, + "step": 390 + }, + { + "epoch": 0.13315354795213213, + "grad_norm": 3.6377876835528564, + "learning_rate": 8.83408071748879e-06, + "loss": 0.42, + "step": 395 + }, + { + "epoch": 0.1348390359008933, + "grad_norm": 2.4800482154662538, + "learning_rate": 8.946188340807175e-06, + "loss": 0.4019, + "step": 400 + }, + { + "epoch": 0.13652452384965447, + "grad_norm": 1.932932709959026, + "learning_rate": 9.058295964125561e-06, + "loss": 0.3937, + "step": 405 + }, + { + "epoch": 0.13821001179841563, + "grad_norm": 2.459462459379769, + "learning_rate": 9.170403587443947e-06, + "loss": 0.3757, + "step": 410 + }, + { + "epoch": 0.1398954997471768, + "grad_norm": 2.634402740022879, + "learning_rate": 9.282511210762332e-06, + "loss": 0.4211, + "step": 415 + }, + { + "epoch": 0.14158098769593797, + "grad_norm": 2.302486918235084, + "learning_rate": 9.394618834080718e-06, + "loss": 0.4243, + "step": 420 + }, + { + "epoch": 0.14326647564469913, + "grad_norm": 2.699871424522218, + "learning_rate": 9.506726457399104e-06, + "loss": 0.4298, + "step": 425 + }, + { + "epoch": 0.1449519635934603, + "grad_norm": 2.5220003454274806, + "learning_rate": 9.61883408071749e-06, + "loss": 0.4489, + "step": 430 + }, + { + "epoch": 0.14663745154222146, + "grad_norm": 2.3013583855642326, + "learning_rate": 9.730941704035875e-06, + "loss": 0.4052, + "step": 435 + }, + { + "epoch": 0.14832293949098263, + "grad_norm": 1.8343159139166734, + "learning_rate": 9.843049327354261e-06, + "loss": 0.4012, + "step": 440 + }, + { + "epoch": 0.1500084274397438, + "grad_norm": 1.878587520427869, + "learning_rate": 9.955156950672647e-06, + "loss": 0.3653, + "step": 445 + }, + { + "epoch": 0.15169391538850496, + "grad_norm": 2.9319680956052454, + "learning_rate": 9.999998927441416e-06, + "loss": 0.4264, + "step": 450 + }, + { + "epoch": 0.15337940333726613, + "grad_norm": 2.6134158541749732, + "learning_rate": 9.999992372918407e-06, + "loss": 0.4095, + "step": 455 + }, + { + "epoch": 0.1550648912860273, + "grad_norm": 3.832484233411444, + "learning_rate": 9.999979859746068e-06, + "loss": 0.3817, + "step": 460 + }, + { + "epoch": 0.15675037923478846, + "grad_norm": 2.3653735728210186, + "learning_rate": 9.999961387939312e-06, + "loss": 0.4119, + "step": 465 + }, + { + "epoch": 0.15843586718354963, + "grad_norm": 4.4364055824071915, + "learning_rate": 9.999936957520153e-06, + "loss": 0.4003, + "step": 470 + }, + { + "epoch": 0.1601213551323108, + "grad_norm": 35.14512330840862, + "learning_rate": 9.999906568517708e-06, + "loss": 0.3873, + "step": 475 + }, + { + "epoch": 0.16180684308107196, + "grad_norm": 1.7048667253980463, + "learning_rate": 9.999870220968187e-06, + "loss": 0.3879, + "step": 480 + }, + { + "epoch": 0.16349233102983313, + "grad_norm": 3.3372208072716085, + "learning_rate": 9.99982791491491e-06, + "loss": 0.4556, + "step": 485 + }, + { + "epoch": 0.1651778189785943, + "grad_norm": 2.554132462618261, + "learning_rate": 9.999779650408294e-06, + "loss": 0.3869, + "step": 490 + }, + { + "epoch": 0.16686330692735546, + "grad_norm": 1.9937809824466266, + "learning_rate": 9.999725427505858e-06, + "loss": 0.3923, + "step": 495 + }, + { + "epoch": 0.16854879487611663, + "grad_norm": 2.548047443247034, + "learning_rate": 9.999665246272222e-06, + "loss": 0.3806, + "step": 500 + }, + { + "epoch": 0.1702342828248778, + "grad_norm": 2.2010906932226457, + "learning_rate": 9.999599106779102e-06, + "loss": 0.3966, + "step": 505 + }, + { + "epoch": 0.17191977077363896, + "grad_norm": 1.828420796921307, + "learning_rate": 9.999527009105322e-06, + "loss": 0.3922, + "step": 510 + }, + { + "epoch": 0.17360525872240012, + "grad_norm": 1.8298755244344242, + "learning_rate": 9.999448953336801e-06, + "loss": 0.4076, + "step": 515 + }, + { + "epoch": 0.1752907466711613, + "grad_norm": 2.5032192982353183, + "learning_rate": 9.999364939566563e-06, + "loss": 0.3873, + "step": 520 + }, + { + "epoch": 0.17697623461992246, + "grad_norm": 3.012693421833828, + "learning_rate": 9.999274967894728e-06, + "loss": 0.354, + "step": 525 + }, + { + "epoch": 0.17866172256868362, + "grad_norm": 1.6295732315281872, + "learning_rate": 9.999179038428518e-06, + "loss": 0.4048, + "step": 530 + }, + { + "epoch": 0.1803472105174448, + "grad_norm": 2.5804997377771266, + "learning_rate": 9.999077151282255e-06, + "loss": 0.3822, + "step": 535 + }, + { + "epoch": 0.18203269846620596, + "grad_norm": 1.9847562423733942, + "learning_rate": 9.998969306577364e-06, + "loss": 0.3847, + "step": 540 + }, + { + "epoch": 0.18371818641496712, + "grad_norm": 2.004242607918728, + "learning_rate": 9.998855504442363e-06, + "loss": 0.406, + "step": 545 + }, + { + "epoch": 0.1854036743637283, + "grad_norm": 1.7878967975539106, + "learning_rate": 9.998735745012876e-06, + "loss": 0.3858, + "step": 550 + }, + { + "epoch": 0.18708916231248945, + "grad_norm": 2.1773847589443522, + "learning_rate": 9.998610028431622e-06, + "loss": 0.4115, + "step": 555 + }, + { + "epoch": 0.18877465026125062, + "grad_norm": 2.1881165596445347, + "learning_rate": 9.998478354848425e-06, + "loss": 0.4017, + "step": 560 + }, + { + "epoch": 0.1904601382100118, + "grad_norm": 2.3909666157372187, + "learning_rate": 9.998340724420202e-06, + "loss": 0.3909, + "step": 565 + }, + { + "epoch": 0.19214562615877295, + "grad_norm": 3.8789092474821865, + "learning_rate": 9.998197137310972e-06, + "loss": 0.4149, + "step": 570 + }, + { + "epoch": 0.19383111410753412, + "grad_norm": 2.8013687374027234, + "learning_rate": 9.99804759369185e-06, + "loss": 0.3584, + "step": 575 + }, + { + "epoch": 0.19551660205629529, + "grad_norm": 2.03183171805555, + "learning_rate": 9.997892093741058e-06, + "loss": 0.3531, + "step": 580 + }, + { + "epoch": 0.19720209000505645, + "grad_norm": 2.1752647617401273, + "learning_rate": 9.997730637643904e-06, + "loss": 0.4119, + "step": 585 + }, + { + "epoch": 0.19888757795381762, + "grad_norm": 2.1615427772313223, + "learning_rate": 9.997563225592803e-06, + "loss": 0.3716, + "step": 590 + }, + { + "epoch": 0.20057306590257878, + "grad_norm": 2.9336226572334425, + "learning_rate": 9.997389857787266e-06, + "loss": 0.3913, + "step": 595 + }, + { + "epoch": 0.20225855385133995, + "grad_norm": 1.613836729716596, + "learning_rate": 9.997210534433899e-06, + "loss": 0.3838, + "step": 600 + }, + { + "epoch": 0.20394404180010112, + "grad_norm": 1.8362314282533876, + "learning_rate": 9.997025255746409e-06, + "loss": 0.4097, + "step": 605 + }, + { + "epoch": 0.20562952974886228, + "grad_norm": 2.172122875659069, + "learning_rate": 9.996834021945599e-06, + "loss": 0.4342, + "step": 610 + }, + { + "epoch": 0.20731501769762345, + "grad_norm": 1.6195979347911462, + "learning_rate": 9.996636833259365e-06, + "loss": 0.4161, + "step": 615 + }, + { + "epoch": 0.20900050564638462, + "grad_norm": 2.2138247618272477, + "learning_rate": 9.996433689922705e-06, + "loss": 0.3973, + "step": 620 + }, + { + "epoch": 0.21068599359514578, + "grad_norm": 2.009981169025572, + "learning_rate": 9.996224592177713e-06, + "loss": 0.4017, + "step": 625 + }, + { + "epoch": 0.21237148154390695, + "grad_norm": 1.9803622020128537, + "learning_rate": 9.996009540273574e-06, + "loss": 0.3629, + "step": 630 + }, + { + "epoch": 0.21405696949266811, + "grad_norm": 1.9702872751272344, + "learning_rate": 9.995788534466576e-06, + "loss": 0.3765, + "step": 635 + }, + { + "epoch": 0.21574245744142928, + "grad_norm": 2.026744637961682, + "learning_rate": 9.995561575020096e-06, + "loss": 0.3779, + "step": 640 + }, + { + "epoch": 0.21742794539019045, + "grad_norm": 2.356624382171085, + "learning_rate": 9.995328662204609e-06, + "loss": 0.3791, + "step": 645 + }, + { + "epoch": 0.2191134333389516, + "grad_norm": 1.5683166625062384, + "learning_rate": 9.995089796297686e-06, + "loss": 0.3977, + "step": 650 + }, + { + "epoch": 0.22079892128771278, + "grad_norm": 2.101603290295796, + "learning_rate": 9.994844977583989e-06, + "loss": 0.3855, + "step": 655 + }, + { + "epoch": 0.22248440923647395, + "grad_norm": 1.7565292897025453, + "learning_rate": 9.994594206355277e-06, + "loss": 0.3956, + "step": 660 + }, + { + "epoch": 0.2241698971852351, + "grad_norm": 3.10293180125714, + "learning_rate": 9.994337482910403e-06, + "loss": 0.4191, + "step": 665 + }, + { + "epoch": 0.22585538513399628, + "grad_norm": 1.6005337791639043, + "learning_rate": 9.994074807555312e-06, + "loss": 0.366, + "step": 670 + }, + { + "epoch": 0.22754087308275747, + "grad_norm": 15.746106890836087, + "learning_rate": 9.993806180603042e-06, + "loss": 0.366, + "step": 675 + }, + { + "epoch": 0.22922636103151864, + "grad_norm": 2.3584243465745467, + "learning_rate": 9.993531602373725e-06, + "loss": 0.3551, + "step": 680 + }, + { + "epoch": 0.2309118489802798, + "grad_norm": 2.911460621086186, + "learning_rate": 9.993251073194582e-06, + "loss": 0.3765, + "step": 685 + }, + { + "epoch": 0.23259733692904097, + "grad_norm": 1.918519286748289, + "learning_rate": 9.992964593399935e-06, + "loss": 0.4172, + "step": 690 + }, + { + "epoch": 0.23428282487780214, + "grad_norm": 1.7248234236040525, + "learning_rate": 9.992672163331183e-06, + "loss": 0.3669, + "step": 695 + }, + { + "epoch": 0.2359683128265633, + "grad_norm": 1.8252665725881347, + "learning_rate": 9.992373783336829e-06, + "loss": 0.4019, + "step": 700 + }, + { + "epoch": 0.23765380077532447, + "grad_norm": 1.624513733101269, + "learning_rate": 9.992069453772462e-06, + "loss": 0.3597, + "step": 705 + }, + { + "epoch": 0.23933928872408564, + "grad_norm": 1.5577357173237931, + "learning_rate": 9.991759175000759e-06, + "loss": 0.4212, + "step": 710 + }, + { + "epoch": 0.2410247766728468, + "grad_norm": 2.1786930696130327, + "learning_rate": 9.99144294739149e-06, + "loss": 0.3955, + "step": 715 + }, + { + "epoch": 0.24271026462160797, + "grad_norm": 1.6246094297359752, + "learning_rate": 9.991120771321513e-06, + "loss": 0.4389, + "step": 720 + }, + { + "epoch": 0.24439575257036913, + "grad_norm": 8.408346377550648, + "learning_rate": 9.990792647174777e-06, + "loss": 0.3853, + "step": 725 + }, + { + "epoch": 0.2460812405191303, + "grad_norm": 1.595421372399142, + "learning_rate": 9.990458575342315e-06, + "loss": 0.3976, + "step": 730 + }, + { + "epoch": 0.24776672846789147, + "grad_norm": 4.514848965218263, + "learning_rate": 9.990118556222254e-06, + "loss": 0.384, + "step": 735 + }, + { + "epoch": 0.24945221641665263, + "grad_norm": 2.218357541423892, + "learning_rate": 9.989772590219805e-06, + "loss": 0.3979, + "step": 740 + }, + { + "epoch": 0.25113770436541377, + "grad_norm": 1.6358343818606282, + "learning_rate": 9.989420677747266e-06, + "loss": 0.3754, + "step": 745 + }, + { + "epoch": 0.25282319231417494, + "grad_norm": 1.6530000481044254, + "learning_rate": 9.98906281922402e-06, + "loss": 0.4156, + "step": 750 + }, + { + "epoch": 0.2545086802629361, + "grad_norm": 1.9639962472652386, + "learning_rate": 9.988699015076545e-06, + "loss": 0.3869, + "step": 755 + }, + { + "epoch": 0.25619416821169727, + "grad_norm": 2.7981924236148674, + "learning_rate": 9.98832926573839e-06, + "loss": 0.3844, + "step": 760 + }, + { + "epoch": 0.25787965616045844, + "grad_norm": 1.7533436277070946, + "learning_rate": 9.987953571650201e-06, + "loss": 0.3883, + "step": 765 + }, + { + "epoch": 0.2595651441092196, + "grad_norm": 2.4253975139504873, + "learning_rate": 9.987571933259705e-06, + "loss": 0.4124, + "step": 770 + }, + { + "epoch": 0.26125063205798077, + "grad_norm": 1.5158341449175288, + "learning_rate": 9.98718435102171e-06, + "loss": 0.3703, + "step": 775 + }, + { + "epoch": 0.26293612000674194, + "grad_norm": 1.5959370141299707, + "learning_rate": 9.986790825398113e-06, + "loss": 0.3745, + "step": 780 + }, + { + "epoch": 0.2646216079555031, + "grad_norm": 2.34082579635864, + "learning_rate": 9.98639135685789e-06, + "loss": 0.3761, + "step": 785 + }, + { + "epoch": 0.26630709590426427, + "grad_norm": 3.467698805677363, + "learning_rate": 9.985985945877099e-06, + "loss": 0.3659, + "step": 790 + }, + { + "epoch": 0.26799258385302543, + "grad_norm": 2.8517981978610885, + "learning_rate": 9.985574592938883e-06, + "loss": 0.3812, + "step": 795 + }, + { + "epoch": 0.2696780718017866, + "grad_norm": 5.14428705280153, + "learning_rate": 9.985157298533463e-06, + "loss": 0.414, + "step": 800 + }, + { + "epoch": 0.27136355975054777, + "grad_norm": 1.8396394543962131, + "learning_rate": 9.984734063158142e-06, + "loss": 0.3949, + "step": 805 + }, + { + "epoch": 0.27304904769930893, + "grad_norm": 2.106355774259703, + "learning_rate": 9.984304887317304e-06, + "loss": 0.371, + "step": 810 + }, + { + "epoch": 0.2747345356480701, + "grad_norm": 2.015825945564459, + "learning_rate": 9.983869771522411e-06, + "loss": 0.3947, + "step": 815 + }, + { + "epoch": 0.27642002359683127, + "grad_norm": 1.9906050791876728, + "learning_rate": 9.983428716292002e-06, + "loss": 0.3896, + "step": 820 + }, + { + "epoch": 0.27810551154559243, + "grad_norm": 1.6262166951573869, + "learning_rate": 9.9829817221517e-06, + "loss": 0.3593, + "step": 825 + }, + { + "epoch": 0.2797909994943536, + "grad_norm": 1.9751342061528858, + "learning_rate": 9.982528789634203e-06, + "loss": 0.3721, + "step": 830 + }, + { + "epoch": 0.28147648744311476, + "grad_norm": 2.2090109354710075, + "learning_rate": 9.98206991927928e-06, + "loss": 0.382, + "step": 835 + }, + { + "epoch": 0.28316197539187593, + "grad_norm": 2.0014148005086265, + "learning_rate": 9.981605111633785e-06, + "loss": 0.3788, + "step": 840 + }, + { + "epoch": 0.2848474633406371, + "grad_norm": 1.9487777824799606, + "learning_rate": 9.981134367251644e-06, + "loss": 0.3848, + "step": 845 + }, + { + "epoch": 0.28653295128939826, + "grad_norm": 1.7877424705020015, + "learning_rate": 9.980657686693856e-06, + "loss": 0.3661, + "step": 850 + }, + { + "epoch": 0.28821843923815943, + "grad_norm": 1.8189700902269383, + "learning_rate": 9.980175070528496e-06, + "loss": 0.3997, + "step": 855 + }, + { + "epoch": 0.2899039271869206, + "grad_norm": 1.7885705129483607, + "learning_rate": 9.979686519330715e-06, + "loss": 0.4111, + "step": 860 + }, + { + "epoch": 0.29158941513568176, + "grad_norm": 2.0788763558907988, + "learning_rate": 9.979192033682737e-06, + "loss": 0.3867, + "step": 865 + }, + { + "epoch": 0.29327490308444293, + "grad_norm": 2.553827465281633, + "learning_rate": 9.97869161417385e-06, + "loss": 0.3748, + "step": 870 + }, + { + "epoch": 0.2949603910332041, + "grad_norm": 2.5115523800996615, + "learning_rate": 9.978185261400423e-06, + "loss": 0.4072, + "step": 875 + }, + { + "epoch": 0.29664587898196526, + "grad_norm": 2.1087588940179467, + "learning_rate": 9.977672975965895e-06, + "loss": 0.4132, + "step": 880 + }, + { + "epoch": 0.2983313669307264, + "grad_norm": 1.970566507249276, + "learning_rate": 9.977154758480771e-06, + "loss": 0.3851, + "step": 885 + }, + { + "epoch": 0.3000168548794876, + "grad_norm": 1.741373008703597, + "learning_rate": 9.976630609562626e-06, + "loss": 0.386, + "step": 890 + }, + { + "epoch": 0.30170234282824876, + "grad_norm": 2.154351386848767, + "learning_rate": 9.976100529836106e-06, + "loss": 0.4082, + "step": 895 + }, + { + "epoch": 0.3033878307770099, + "grad_norm": 2.9479101759832638, + "learning_rate": 9.975564519932922e-06, + "loss": 0.3882, + "step": 900 + }, + { + "epoch": 0.3050733187257711, + "grad_norm": 1.9012524201786816, + "learning_rate": 9.975022580491859e-06, + "loss": 0.3836, + "step": 905 + }, + { + "epoch": 0.30675880667453226, + "grad_norm": 1.659302263817562, + "learning_rate": 9.974474712158757e-06, + "loss": 0.3788, + "step": 910 + }, + { + "epoch": 0.3084442946232934, + "grad_norm": 6.390788623860543, + "learning_rate": 9.973920915586533e-06, + "loss": 0.3849, + "step": 915 + }, + { + "epoch": 0.3101297825720546, + "grad_norm": 2.646923190514544, + "learning_rate": 9.97336119143516e-06, + "loss": 0.4239, + "step": 920 + }, + { + "epoch": 0.31181527052081576, + "grad_norm": 1.692234615605573, + "learning_rate": 9.972795540371682e-06, + "loss": 0.4225, + "step": 925 + }, + { + "epoch": 0.3135007584695769, + "grad_norm": 2.6257286681434158, + "learning_rate": 9.972223963070202e-06, + "loss": 0.3833, + "step": 930 + }, + { + "epoch": 0.3151862464183381, + "grad_norm": 1.4468130664510528, + "learning_rate": 9.971646460211888e-06, + "loss": 0.3817, + "step": 935 + }, + { + "epoch": 0.31687173436709926, + "grad_norm": 2.9343853277028225, + "learning_rate": 9.971063032484966e-06, + "loss": 0.3818, + "step": 940 + }, + { + "epoch": 0.3185572223158604, + "grad_norm": 2.874297049113011, + "learning_rate": 9.970473680584729e-06, + "loss": 0.3912, + "step": 945 + }, + { + "epoch": 0.3202427102646216, + "grad_norm": 2.3679632099037353, + "learning_rate": 9.969878405213522e-06, + "loss": 0.3451, + "step": 950 + }, + { + "epoch": 0.32192819821338275, + "grad_norm": 1.8485431472741025, + "learning_rate": 9.969277207080757e-06, + "loss": 0.3826, + "step": 955 + }, + { + "epoch": 0.3236136861621439, + "grad_norm": 1.8160514122580558, + "learning_rate": 9.968670086902898e-06, + "loss": 0.4119, + "step": 960 + }, + { + "epoch": 0.3252991741109051, + "grad_norm": 1.6310654165501828, + "learning_rate": 9.968057045403473e-06, + "loss": 0.3888, + "step": 965 + }, + { + "epoch": 0.32698466205966625, + "grad_norm": 1.9293470572012084, + "learning_rate": 9.967438083313057e-06, + "loss": 0.3776, + "step": 970 + }, + { + "epoch": 0.3286701500084274, + "grad_norm": 2.064165266260474, + "learning_rate": 9.966813201369294e-06, + "loss": 0.4023, + "step": 975 + }, + { + "epoch": 0.3303556379571886, + "grad_norm": 1.6291463422904702, + "learning_rate": 9.96618240031687e-06, + "loss": 0.3711, + "step": 980 + }, + { + "epoch": 0.33204112590594975, + "grad_norm": 1.4760897650643887, + "learning_rate": 9.965545680907534e-06, + "loss": 0.372, + "step": 985 + }, + { + "epoch": 0.3337266138547109, + "grad_norm": 1.5516691875793427, + "learning_rate": 9.964903043900085e-06, + "loss": 0.379, + "step": 990 + }, + { + "epoch": 0.3354121018034721, + "grad_norm": 1.7123969322933874, + "learning_rate": 9.96425449006037e-06, + "loss": 0.3717, + "step": 995 + }, + { + "epoch": 0.33709758975223325, + "grad_norm": 2.1244926392838357, + "learning_rate": 9.963600020161294e-06, + "loss": 0.3504, + "step": 1000 + }, + { + "epoch": 0.3387830777009944, + "grad_norm": 3.7906827553852067, + "learning_rate": 9.962939634982808e-06, + "loss": 0.3701, + "step": 1005 + }, + { + "epoch": 0.3404685656497556, + "grad_norm": 1.869057502602457, + "learning_rate": 9.962273335311918e-06, + "loss": 0.3632, + "step": 1010 + }, + { + "epoch": 0.34215405359851675, + "grad_norm": 3.366608851411368, + "learning_rate": 9.961601121942667e-06, + "loss": 0.3724, + "step": 1015 + }, + { + "epoch": 0.3438395415472779, + "grad_norm": 4.54722354217597, + "learning_rate": 9.96092299567616e-06, + "loss": 0.4175, + "step": 1020 + }, + { + "epoch": 0.3455250294960391, + "grad_norm": 1.5304066770240292, + "learning_rate": 9.96023895732054e-06, + "loss": 0.3698, + "step": 1025 + }, + { + "epoch": 0.34721051744480025, + "grad_norm": 1.6504649468386736, + "learning_rate": 9.959549007690996e-06, + "loss": 0.3713, + "step": 1030 + }, + { + "epoch": 0.3488960053935614, + "grad_norm": 1.9237813388224598, + "learning_rate": 9.958853147609762e-06, + "loss": 0.3711, + "step": 1035 + }, + { + "epoch": 0.3505814933423226, + "grad_norm": 1.557187949710829, + "learning_rate": 9.958151377906116e-06, + "loss": 0.3845, + "step": 1040 + }, + { + "epoch": 0.35226698129108375, + "grad_norm": 2.1410759348004813, + "learning_rate": 9.957443699416382e-06, + "loss": 0.3689, + "step": 1045 + }, + { + "epoch": 0.3539524692398449, + "grad_norm": 2.6413903943547763, + "learning_rate": 9.956730112983922e-06, + "loss": 0.3884, + "step": 1050 + }, + { + "epoch": 0.3556379571886061, + "grad_norm": 2.0042783331507796, + "learning_rate": 9.956010619459138e-06, + "loss": 0.3993, + "step": 1055 + }, + { + "epoch": 0.35732344513736725, + "grad_norm": 1.9178390205816611, + "learning_rate": 9.955285219699476e-06, + "loss": 0.3912, + "step": 1060 + }, + { + "epoch": 0.3590089330861284, + "grad_norm": 2.033424630243459, + "learning_rate": 9.954553914569414e-06, + "loss": 0.3509, + "step": 1065 + }, + { + "epoch": 0.3606944210348896, + "grad_norm": 2.8573924965488304, + "learning_rate": 9.953816704940475e-06, + "loss": 0.3833, + "step": 1070 + }, + { + "epoch": 0.36237990898365074, + "grad_norm": 1.454160800765969, + "learning_rate": 9.953073591691214e-06, + "loss": 0.3607, + "step": 1075 + }, + { + "epoch": 0.3640653969324119, + "grad_norm": 1.436069191284367, + "learning_rate": 9.952324575707222e-06, + "loss": 0.3671, + "step": 1080 + }, + { + "epoch": 0.3657508848811731, + "grad_norm": 1.528433914963016, + "learning_rate": 9.951569657881124e-06, + "loss": 0.398, + "step": 1085 + }, + { + "epoch": 0.36743637282993424, + "grad_norm": 2.0721957509355833, + "learning_rate": 9.950808839112583e-06, + "loss": 0.3697, + "step": 1090 + }, + { + "epoch": 0.3691218607786954, + "grad_norm": 3.774799286122688, + "learning_rate": 9.950042120308287e-06, + "loss": 0.379, + "step": 1095 + }, + { + "epoch": 0.3708073487274566, + "grad_norm": 2.260780199791883, + "learning_rate": 9.949269502381961e-06, + "loss": 0.3915, + "step": 1100 + }, + { + "epoch": 0.37249283667621774, + "grad_norm": 1.9800273567119477, + "learning_rate": 9.94849098625436e-06, + "loss": 0.3854, + "step": 1105 + }, + { + "epoch": 0.3741783246249789, + "grad_norm": 1.4559740597276838, + "learning_rate": 9.947706572853262e-06, + "loss": 0.3934, + "step": 1110 + }, + { + "epoch": 0.3758638125737401, + "grad_norm": 1.6009456809223352, + "learning_rate": 9.946916263113482e-06, + "loss": 0.3832, + "step": 1115 + }, + { + "epoch": 0.37754930052250124, + "grad_norm": 1.7072821817014892, + "learning_rate": 9.946120057976853e-06, + "loss": 0.387, + "step": 1120 + }, + { + "epoch": 0.3792347884712624, + "grad_norm": 2.3391026420804013, + "learning_rate": 9.945317958392243e-06, + "loss": 0.4232, + "step": 1125 + }, + { + "epoch": 0.3809202764200236, + "grad_norm": 1.8763405027701638, + "learning_rate": 9.944509965315532e-06, + "loss": 0.3743, + "step": 1130 + }, + { + "epoch": 0.38260576436878474, + "grad_norm": 1.8203563045694253, + "learning_rate": 9.943696079709637e-06, + "loss": 0.3919, + "step": 1135 + }, + { + "epoch": 0.3842912523175459, + "grad_norm": 1.9188335442110622, + "learning_rate": 9.94287630254449e-06, + "loss": 0.3803, + "step": 1140 + }, + { + "epoch": 0.3859767402663071, + "grad_norm": 1.9193760465596672, + "learning_rate": 9.942050634797044e-06, + "loss": 0.3717, + "step": 1145 + }, + { + "epoch": 0.38766222821506824, + "grad_norm": 1.2184450527027415, + "learning_rate": 9.941219077451276e-06, + "loss": 0.3101, + "step": 1150 + }, + { + "epoch": 0.3893477161638294, + "grad_norm": 1.9082738737781184, + "learning_rate": 9.940381631498175e-06, + "loss": 0.3794, + "step": 1155 + }, + { + "epoch": 0.39103320411259057, + "grad_norm": 1.6941062415529424, + "learning_rate": 9.939538297935756e-06, + "loss": 0.395, + "step": 1160 + }, + { + "epoch": 0.39271869206135174, + "grad_norm": 1.4999086936872892, + "learning_rate": 9.938689077769046e-06, + "loss": 0.3511, + "step": 1165 + }, + { + "epoch": 0.3944041800101129, + "grad_norm": 1.5354307336843596, + "learning_rate": 9.937833972010084e-06, + "loss": 0.3779, + "step": 1170 + }, + { + "epoch": 0.39608966795887407, + "grad_norm": 1.6148043731147916, + "learning_rate": 9.93697298167793e-06, + "loss": 0.3677, + "step": 1175 + }, + { + "epoch": 0.39777515590763524, + "grad_norm": 3.547101819036569, + "learning_rate": 9.936106107798654e-06, + "loss": 0.367, + "step": 1180 + }, + { + "epoch": 0.3994606438563964, + "grad_norm": 1.4256930815524032, + "learning_rate": 9.935233351405333e-06, + "loss": 0.3292, + "step": 1185 + }, + { + "epoch": 0.40114613180515757, + "grad_norm": 1.8463182510574727, + "learning_rate": 9.934354713538061e-06, + "loss": 0.3472, + "step": 1190 + }, + { + "epoch": 0.40283161975391873, + "grad_norm": 1.8713630909886005, + "learning_rate": 9.933470195243939e-06, + "loss": 0.3633, + "step": 1195 + }, + { + "epoch": 0.4045171077026799, + "grad_norm": 1.9909097858371128, + "learning_rate": 9.932579797577075e-06, + "loss": 0.3978, + "step": 1200 + }, + { + "epoch": 0.40620259565144107, + "grad_norm": 1.922000086644586, + "learning_rate": 9.931683521598585e-06, + "loss": 0.3831, + "step": 1205 + }, + { + "epoch": 0.40788808360020223, + "grad_norm": 1.6376595060664128, + "learning_rate": 9.930781368376588e-06, + "loss": 0.3613, + "step": 1210 + }, + { + "epoch": 0.4095735715489634, + "grad_norm": 1.8682072536073533, + "learning_rate": 9.929873338986208e-06, + "loss": 0.3803, + "step": 1215 + }, + { + "epoch": 0.41125905949772457, + "grad_norm": 1.641010884397326, + "learning_rate": 9.928959434509576e-06, + "loss": 0.3855, + "step": 1220 + }, + { + "epoch": 0.41294454744648573, + "grad_norm": 2.811707066345706, + "learning_rate": 9.928039656035817e-06, + "loss": 0.3824, + "step": 1225 + }, + { + "epoch": 0.4146300353952469, + "grad_norm": 2.1669607720074064, + "learning_rate": 9.927114004661063e-06, + "loss": 0.3467, + "step": 1230 + }, + { + "epoch": 0.41631552334400806, + "grad_norm": 2.5149830559716504, + "learning_rate": 9.92618248148844e-06, + "loss": 0.3436, + "step": 1235 + }, + { + "epoch": 0.41800101129276923, + "grad_norm": 2.881848018303243, + "learning_rate": 9.925245087628073e-06, + "loss": 0.3659, + "step": 1240 + }, + { + "epoch": 0.4196864992415304, + "grad_norm": 1.5245821389573595, + "learning_rate": 9.924301824197087e-06, + "loss": 0.3723, + "step": 1245 + }, + { + "epoch": 0.42137198719029156, + "grad_norm": 1.6791218367042129, + "learning_rate": 9.923352692319595e-06, + "loss": 0.3941, + "step": 1250 + }, + { + "epoch": 0.42305747513905273, + "grad_norm": 1.5564687504140988, + "learning_rate": 9.922397693126712e-06, + "loss": 0.3646, + "step": 1255 + }, + { + "epoch": 0.4247429630878139, + "grad_norm": 1.74390718403418, + "learning_rate": 9.921436827756539e-06, + "loss": 0.3543, + "step": 1260 + }, + { + "epoch": 0.42642845103657506, + "grad_norm": 1.3414046416119982, + "learning_rate": 9.920470097354166e-06, + "loss": 0.3319, + "step": 1265 + }, + { + "epoch": 0.42811393898533623, + "grad_norm": 1.2425173294583747, + "learning_rate": 9.919497503071682e-06, + "loss": 0.3642, + "step": 1270 + }, + { + "epoch": 0.4297994269340974, + "grad_norm": 1.4527031016286536, + "learning_rate": 9.918519046068157e-06, + "loss": 0.3602, + "step": 1275 + }, + { + "epoch": 0.43148491488285856, + "grad_norm": 1.8868133059382373, + "learning_rate": 9.917534727509647e-06, + "loss": 0.354, + "step": 1280 + }, + { + "epoch": 0.4331704028316197, + "grad_norm": 1.5610447769025018, + "learning_rate": 9.916544548569195e-06, + "loss": 0.3737, + "step": 1285 + }, + { + "epoch": 0.4348558907803809, + "grad_norm": 1.3021664391906702, + "learning_rate": 9.915548510426833e-06, + "loss": 0.3592, + "step": 1290 + }, + { + "epoch": 0.43654137872914206, + "grad_norm": 1.3810259490145473, + "learning_rate": 9.91454661426957e-06, + "loss": 0.3886, + "step": 1295 + }, + { + "epoch": 0.4382268666779032, + "grad_norm": 1.3304099160027794, + "learning_rate": 9.913538861291391e-06, + "loss": 0.3855, + "step": 1300 + }, + { + "epoch": 0.4399123546266644, + "grad_norm": 1.7264723009040814, + "learning_rate": 9.912525252693276e-06, + "loss": 0.339, + "step": 1305 + }, + { + "epoch": 0.44159784257542556, + "grad_norm": 1.3667437431870433, + "learning_rate": 9.911505789683169e-06, + "loss": 0.3839, + "step": 1310 + }, + { + "epoch": 0.4432833305241867, + "grad_norm": 1.6196648417481219, + "learning_rate": 9.910480473475998e-06, + "loss": 0.3786, + "step": 1315 + }, + { + "epoch": 0.4449688184729479, + "grad_norm": 1.4133305673198646, + "learning_rate": 9.909449305293665e-06, + "loss": 0.3583, + "step": 1320 + }, + { + "epoch": 0.44665430642170906, + "grad_norm": 1.5553388389804803, + "learning_rate": 9.908412286365047e-06, + "loss": 0.3739, + "step": 1325 + }, + { + "epoch": 0.4483397943704702, + "grad_norm": 1.247977985211659, + "learning_rate": 9.90736941792599e-06, + "loss": 0.3379, + "step": 1330 + }, + { + "epoch": 0.4500252823192314, + "grad_norm": 1.447620792005676, + "learning_rate": 9.906320701219314e-06, + "loss": 0.3799, + "step": 1335 + }, + { + "epoch": 0.45171077026799256, + "grad_norm": 2.0817192065173318, + "learning_rate": 9.90526613749481e-06, + "loss": 0.4061, + "step": 1340 + }, + { + "epoch": 0.4533962582167537, + "grad_norm": 1.5729182550945877, + "learning_rate": 9.90420572800923e-06, + "loss": 0.3808, + "step": 1345 + }, + { + "epoch": 0.45508174616551494, + "grad_norm": 1.657457509921655, + "learning_rate": 9.903139474026304e-06, + "loss": 0.3586, + "step": 1350 + }, + { + "epoch": 0.4567672341142761, + "grad_norm": 1.5236374339069958, + "learning_rate": 9.902067376816716e-06, + "loss": 0.3858, + "step": 1355 + }, + { + "epoch": 0.4584527220630373, + "grad_norm": 1.5226480730500296, + "learning_rate": 9.90098943765812e-06, + "loss": 0.3519, + "step": 1360 + }, + { + "epoch": 0.46013821001179844, + "grad_norm": 1.477229921055985, + "learning_rate": 9.89990565783513e-06, + "loss": 0.3773, + "step": 1365 + }, + { + "epoch": 0.4618236979605596, + "grad_norm": 1.5683976956525734, + "learning_rate": 9.898816038639318e-06, + "loss": 0.3779, + "step": 1370 + }, + { + "epoch": 0.4635091859093208, + "grad_norm": 1.2774876859518138, + "learning_rate": 9.897720581369223e-06, + "loss": 0.3855, + "step": 1375 + }, + { + "epoch": 0.46519467385808194, + "grad_norm": 1.742934808160777, + "learning_rate": 9.896619287330333e-06, + "loss": 0.3442, + "step": 1380 + }, + { + "epoch": 0.4668801618068431, + "grad_norm": 2.0269501427448446, + "learning_rate": 9.895512157835096e-06, + "loss": 0.3817, + "step": 1385 + }, + { + "epoch": 0.4685656497556043, + "grad_norm": 1.4335424967789996, + "learning_rate": 9.894399194202913e-06, + "loss": 0.402, + "step": 1390 + }, + { + "epoch": 0.47025113770436544, + "grad_norm": 1.7165362334668752, + "learning_rate": 9.893280397760137e-06, + "loss": 0.3837, + "step": 1395 + }, + { + "epoch": 0.4719366256531266, + "grad_norm": 2.976000196795781, + "learning_rate": 9.892155769840075e-06, + "loss": 0.3682, + "step": 1400 + }, + { + "epoch": 0.4736221136018878, + "grad_norm": 1.6994878217649543, + "learning_rate": 9.891025311782981e-06, + "loss": 0.3762, + "step": 1405 + }, + { + "epoch": 0.47530760155064894, + "grad_norm": 1.6583356777220375, + "learning_rate": 9.889889024936055e-06, + "loss": 0.3779, + "step": 1410 + }, + { + "epoch": 0.4769930894994101, + "grad_norm": 1.3449016512465066, + "learning_rate": 9.888746910653451e-06, + "loss": 0.4038, + "step": 1415 + }, + { + "epoch": 0.47867857744817127, + "grad_norm": 2.3117344706889558, + "learning_rate": 9.88759897029626e-06, + "loss": 0.403, + "step": 1420 + }, + { + "epoch": 0.48036406539693244, + "grad_norm": 1.66354971908194, + "learning_rate": 9.886445205232518e-06, + "loss": 0.3639, + "step": 1425 + }, + { + "epoch": 0.4820495533456936, + "grad_norm": 1.3978047454441775, + "learning_rate": 9.885285616837204e-06, + "loss": 0.3937, + "step": 1430 + }, + { + "epoch": 0.48373504129445477, + "grad_norm": 1.8222955243579932, + "learning_rate": 9.884120206492239e-06, + "loss": 0.3815, + "step": 1435 + }, + { + "epoch": 0.48542052924321594, + "grad_norm": 1.5312912667688126, + "learning_rate": 9.882948975586475e-06, + "loss": 0.3379, + "step": 1440 + }, + { + "epoch": 0.4871060171919771, + "grad_norm": 1.5579909761514894, + "learning_rate": 9.881771925515708e-06, + "loss": 0.3753, + "step": 1445 + }, + { + "epoch": 0.48879150514073827, + "grad_norm": 1.3262651786348019, + "learning_rate": 9.880589057682666e-06, + "loss": 0.3532, + "step": 1450 + }, + { + "epoch": 0.49047699308949944, + "grad_norm": 1.4867089897459638, + "learning_rate": 9.879400373497009e-06, + "loss": 0.3578, + "step": 1455 + }, + { + "epoch": 0.4921624810382606, + "grad_norm": 3.9915201721688596, + "learning_rate": 9.878205874375327e-06, + "loss": 0.3997, + "step": 1460 + }, + { + "epoch": 0.49384796898702177, + "grad_norm": 1.5637136327033112, + "learning_rate": 9.877005561741147e-06, + "loss": 0.3645, + "step": 1465 + }, + { + "epoch": 0.49553345693578293, + "grad_norm": 1.2642443367640699, + "learning_rate": 9.875799437024918e-06, + "loss": 0.3381, + "step": 1470 + }, + { + "epoch": 0.4972189448845441, + "grad_norm": 1.96578843307235, + "learning_rate": 9.874587501664018e-06, + "loss": 0.3517, + "step": 1475 + }, + { + "epoch": 0.49890443283330527, + "grad_norm": 1.1903686232767572, + "learning_rate": 9.873369757102744e-06, + "loss": 0.396, + "step": 1480 + }, + { + "epoch": 0.5005899207820664, + "grad_norm": 1.6148672292269124, + "learning_rate": 9.872146204792327e-06, + "loss": 0.3747, + "step": 1485 + }, + { + "epoch": 0.5022754087308275, + "grad_norm": 1.3729016663750981, + "learning_rate": 9.87091684619091e-06, + "loss": 0.354, + "step": 1490 + }, + { + "epoch": 0.5039608966795888, + "grad_norm": 1.4477298289237992, + "learning_rate": 9.86968168276356e-06, + "loss": 0.4039, + "step": 1495 + }, + { + "epoch": 0.5056463846283499, + "grad_norm": 1.2921266896305665, + "learning_rate": 9.868440715982257e-06, + "loss": 0.4031, + "step": 1500 + }, + { + "epoch": 0.5073318725771111, + "grad_norm": 1.6101208365324928, + "learning_rate": 9.867193947325904e-06, + "loss": 0.3496, + "step": 1505 + }, + { + "epoch": 0.5090173605258722, + "grad_norm": 1.21317726468614, + "learning_rate": 9.865941378280312e-06, + "loss": 0.3883, + "step": 1510 + }, + { + "epoch": 0.5107028484746334, + "grad_norm": 1.379275079849456, + "learning_rate": 9.864683010338212e-06, + "loss": 0.3744, + "step": 1515 + }, + { + "epoch": 0.5123883364233945, + "grad_norm": 1.411176846999733, + "learning_rate": 9.863418844999235e-06, + "loss": 0.3888, + "step": 1520 + }, + { + "epoch": 0.5140738243721558, + "grad_norm": 1.5172650618960308, + "learning_rate": 9.862148883769931e-06, + "loss": 0.3596, + "step": 1525 + }, + { + "epoch": 0.5157593123209169, + "grad_norm": 1.875551642818612, + "learning_rate": 9.86087312816375e-06, + "loss": 0.3736, + "step": 1530 + }, + { + "epoch": 0.5174448002696781, + "grad_norm": 1.6286132129606317, + "learning_rate": 9.859591579701053e-06, + "loss": 0.3615, + "step": 1535 + }, + { + "epoch": 0.5191302882184392, + "grad_norm": 1.5249569684369995, + "learning_rate": 9.858304239909102e-06, + "loss": 0.4015, + "step": 1540 + }, + { + "epoch": 0.5208157761672004, + "grad_norm": 1.6254620292539275, + "learning_rate": 9.85701111032206e-06, + "loss": 0.344, + "step": 1545 + }, + { + "epoch": 0.5225012641159615, + "grad_norm": 1.9407647903942409, + "learning_rate": 9.855712192480986e-06, + "loss": 0.3528, + "step": 1550 + }, + { + "epoch": 0.5241867520647228, + "grad_norm": 1.6535005702127823, + "learning_rate": 9.854407487933849e-06, + "loss": 0.3453, + "step": 1555 + }, + { + "epoch": 0.5258722400134839, + "grad_norm": 1.5395599773328799, + "learning_rate": 9.853096998235502e-06, + "loss": 0.381, + "step": 1560 + }, + { + "epoch": 0.5275577279622451, + "grad_norm": 1.9864671010107728, + "learning_rate": 9.8517807249477e-06, + "loss": 0.3959, + "step": 1565 + }, + { + "epoch": 0.5292432159110062, + "grad_norm": 1.9812177121423384, + "learning_rate": 9.850458669639083e-06, + "loss": 0.3507, + "step": 1570 + }, + { + "epoch": 0.5309287038597674, + "grad_norm": 1.5164728376351555, + "learning_rate": 9.849130833885192e-06, + "loss": 0.3736, + "step": 1575 + }, + { + "epoch": 0.5326141918085285, + "grad_norm": 1.3996020796112227, + "learning_rate": 9.847797219268447e-06, + "loss": 0.3794, + "step": 1580 + }, + { + "epoch": 0.5342996797572898, + "grad_norm": 1.8305921371448441, + "learning_rate": 9.84645782737816e-06, + "loss": 0.3831, + "step": 1585 + }, + { + "epoch": 0.5359851677060509, + "grad_norm": 2.265248065963422, + "learning_rate": 9.845112659810525e-06, + "loss": 0.3453, + "step": 1590 + }, + { + "epoch": 0.5376706556548121, + "grad_norm": 1.4395690545235373, + "learning_rate": 9.843761718168625e-06, + "loss": 0.3796, + "step": 1595 + }, + { + "epoch": 0.5393561436035732, + "grad_norm": 1.8254751868748305, + "learning_rate": 9.842405004062417e-06, + "loss": 0.3774, + "step": 1600 + }, + { + "epoch": 0.5410416315523344, + "grad_norm": 2.5465244378890706, + "learning_rate": 9.841042519108739e-06, + "loss": 0.3794, + "step": 1605 + }, + { + "epoch": 0.5427271195010955, + "grad_norm": 1.242660191167484, + "learning_rate": 9.83967426493131e-06, + "loss": 0.3404, + "step": 1610 + }, + { + "epoch": 0.5444126074498568, + "grad_norm": 1.3639736726141871, + "learning_rate": 9.838300243160722e-06, + "loss": 0.3568, + "step": 1615 + }, + { + "epoch": 0.5460980953986179, + "grad_norm": 1.5872074532119953, + "learning_rate": 9.836920455434437e-06, + "loss": 0.3759, + "step": 1620 + }, + { + "epoch": 0.5477835833473791, + "grad_norm": 1.348686027258727, + "learning_rate": 9.835534903396795e-06, + "loss": 0.3994, + "step": 1625 + }, + { + "epoch": 0.5494690712961402, + "grad_norm": 1.4816598182200131, + "learning_rate": 9.834143588699002e-06, + "loss": 0.3843, + "step": 1630 + }, + { + "epoch": 0.5511545592449014, + "grad_norm": 1.8712193283757328, + "learning_rate": 9.83274651299913e-06, + "loss": 0.39, + "step": 1635 + }, + { + "epoch": 0.5528400471936625, + "grad_norm": 3.4046833656269104, + "learning_rate": 9.831343677962121e-06, + "loss": 0.3869, + "step": 1640 + }, + { + "epoch": 0.5545255351424238, + "grad_norm": 2.9435101818151463, + "learning_rate": 9.829935085259775e-06, + "loss": 0.3811, + "step": 1645 + }, + { + "epoch": 0.5562110230911849, + "grad_norm": 4.462125272620029, + "learning_rate": 9.82852073657076e-06, + "loss": 0.3831, + "step": 1650 + }, + { + "epoch": 0.5578965110399461, + "grad_norm": 1.3067101655202602, + "learning_rate": 9.827100633580595e-06, + "loss": 0.3707, + "step": 1655 + }, + { + "epoch": 0.5595819989887072, + "grad_norm": 1.4289922522073135, + "learning_rate": 9.825674777981666e-06, + "loss": 0.3855, + "step": 1660 + }, + { + "epoch": 0.5612674869374684, + "grad_norm": 1.2486461237777289, + "learning_rate": 9.824243171473208e-06, + "loss": 0.3665, + "step": 1665 + }, + { + "epoch": 0.5629529748862295, + "grad_norm": 1.587809195793634, + "learning_rate": 9.822805815761316e-06, + "loss": 0.3592, + "step": 1670 + }, + { + "epoch": 0.5646384628349908, + "grad_norm": 1.1920623778903174, + "learning_rate": 9.821362712558926e-06, + "loss": 0.3434, + "step": 1675 + }, + { + "epoch": 0.5663239507837519, + "grad_norm": 1.7118935425245672, + "learning_rate": 9.819913863585836e-06, + "loss": 0.3979, + "step": 1680 + }, + { + "epoch": 0.5680094387325131, + "grad_norm": 1.4144090637086981, + "learning_rate": 9.818459270568682e-06, + "loss": 0.3681, + "step": 1685 + }, + { + "epoch": 0.5696949266812742, + "grad_norm": 1.2924185332094549, + "learning_rate": 9.816998935240946e-06, + "loss": 0.3748, + "step": 1690 + }, + { + "epoch": 0.5713804146300354, + "grad_norm": 1.2253496181098722, + "learning_rate": 9.81553285934296e-06, + "loss": 0.3675, + "step": 1695 + }, + { + "epoch": 0.5730659025787965, + "grad_norm": 2.8092012770674164, + "learning_rate": 9.814061044621894e-06, + "loss": 0.3841, + "step": 1700 + }, + { + "epoch": 0.5747513905275577, + "grad_norm": 1.2314566428669753, + "learning_rate": 9.812583492831751e-06, + "loss": 0.3676, + "step": 1705 + }, + { + "epoch": 0.5764368784763189, + "grad_norm": 1.7990990576328856, + "learning_rate": 9.811100205733381e-06, + "loss": 0.3731, + "step": 1710 + }, + { + "epoch": 0.5781223664250801, + "grad_norm": 1.6232984129821095, + "learning_rate": 9.809611185094463e-06, + "loss": 0.3862, + "step": 1715 + }, + { + "epoch": 0.5798078543738412, + "grad_norm": 1.2766438891686642, + "learning_rate": 9.808116432689509e-06, + "loss": 0.3728, + "step": 1720 + }, + { + "epoch": 0.5814933423226024, + "grad_norm": 1.8194884822119324, + "learning_rate": 9.806615950299865e-06, + "loss": 0.3613, + "step": 1725 + }, + { + "epoch": 0.5831788302713635, + "grad_norm": 1.182041198701511, + "learning_rate": 9.8051097397137e-06, + "loss": 0.3711, + "step": 1730 + }, + { + "epoch": 0.5848643182201247, + "grad_norm": 1.1813270514141903, + "learning_rate": 9.803597802726015e-06, + "loss": 0.339, + "step": 1735 + }, + { + "epoch": 0.5865498061688859, + "grad_norm": 1.1615217782525067, + "learning_rate": 9.802080141138634e-06, + "loss": 0.335, + "step": 1740 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.2225454930609114, + "learning_rate": 9.8005567567602e-06, + "loss": 0.3641, + "step": 1745 + }, + { + "epoch": 0.5899207820664082, + "grad_norm": 1.5721864635841314, + "learning_rate": 9.799027651406181e-06, + "loss": 0.3579, + "step": 1750 + }, + { + "epoch": 0.5916062700151694, + "grad_norm": 1.3119639724883532, + "learning_rate": 9.79749282689886e-06, + "loss": 0.4099, + "step": 1755 + }, + { + "epoch": 0.5932917579639305, + "grad_norm": 1.371061205922346, + "learning_rate": 9.795952285067334e-06, + "loss": 0.3643, + "step": 1760 + }, + { + "epoch": 0.5949772459126917, + "grad_norm": 1.413448565845774, + "learning_rate": 9.794406027747516e-06, + "loss": 0.3714, + "step": 1765 + }, + { + "epoch": 0.5966627338614529, + "grad_norm": 1.621548282743646, + "learning_rate": 9.79285405678213e-06, + "loss": 0.3993, + "step": 1770 + }, + { + "epoch": 0.5983482218102141, + "grad_norm": 1.5758278424522085, + "learning_rate": 9.791296374020711e-06, + "loss": 0.3643, + "step": 1775 + }, + { + "epoch": 0.6000337097589752, + "grad_norm": 1.2732012081933695, + "learning_rate": 9.789732981319597e-06, + "loss": 0.3622, + "step": 1780 + }, + { + "epoch": 0.6017191977077364, + "grad_norm": 1.724149417614084, + "learning_rate": 9.788163880541933e-06, + "loss": 0.3734, + "step": 1785 + }, + { + "epoch": 0.6034046856564975, + "grad_norm": 1.2459101202905065, + "learning_rate": 9.786589073557665e-06, + "loss": 0.3759, + "step": 1790 + }, + { + "epoch": 0.6050901736052587, + "grad_norm": 1.7345209199449638, + "learning_rate": 9.785008562243544e-06, + "loss": 0.364, + "step": 1795 + }, + { + "epoch": 0.6067756615540199, + "grad_norm": 1.3365285983048494, + "learning_rate": 9.783422348483109e-06, + "loss": 0.364, + "step": 1800 + }, + { + "epoch": 0.6084611495027811, + "grad_norm": 1.3189215105489214, + "learning_rate": 9.781830434166707e-06, + "loss": 0.3861, + "step": 1805 + }, + { + "epoch": 0.6101466374515422, + "grad_norm": 2.453366829111078, + "learning_rate": 9.78023282119147e-06, + "loss": 0.3235, + "step": 1810 + }, + { + "epoch": 0.6118321254003034, + "grad_norm": 1.4264031253919447, + "learning_rate": 9.778629511461327e-06, + "loss": 0.3873, + "step": 1815 + }, + { + "epoch": 0.6135176133490645, + "grad_norm": 1.7280331748498687, + "learning_rate": 9.77702050688699e-06, + "loss": 0.3574, + "step": 1820 + }, + { + "epoch": 0.6152031012978257, + "grad_norm": 1.1882150276334564, + "learning_rate": 9.77540580938596e-06, + "loss": 0.3795, + "step": 1825 + }, + { + "epoch": 0.6168885892465868, + "grad_norm": 1.9950862244196361, + "learning_rate": 9.773785420882527e-06, + "loss": 0.3462, + "step": 1830 + }, + { + "epoch": 0.6185740771953481, + "grad_norm": 1.4034758002974188, + "learning_rate": 9.772159343307755e-06, + "loss": 0.3389, + "step": 1835 + }, + { + "epoch": 0.6202595651441092, + "grad_norm": 1.293250344875036, + "learning_rate": 9.770527578599495e-06, + "loss": 0.3715, + "step": 1840 + }, + { + "epoch": 0.6219450530928704, + "grad_norm": 1.6660907631550192, + "learning_rate": 9.76889012870237e-06, + "loss": 0.3613, + "step": 1845 + }, + { + "epoch": 0.6236305410416315, + "grad_norm": 1.1473922763213351, + "learning_rate": 9.767246995567785e-06, + "loss": 0.3307, + "step": 1850 + }, + { + "epoch": 0.6253160289903927, + "grad_norm": 3.230850340977098, + "learning_rate": 9.765598181153913e-06, + "loss": 0.3629, + "step": 1855 + }, + { + "epoch": 0.6270015169391538, + "grad_norm": 1.787900324765984, + "learning_rate": 9.763943687425695e-06, + "loss": 0.3826, + "step": 1860 + }, + { + "epoch": 0.6286870048879151, + "grad_norm": 1.7198065823109134, + "learning_rate": 9.762283516354845e-06, + "loss": 0.3972, + "step": 1865 + }, + { + "epoch": 0.6303724928366762, + "grad_norm": 1.5591004076120165, + "learning_rate": 9.760617669919843e-06, + "loss": 0.3501, + "step": 1870 + }, + { + "epoch": 0.6320579807854374, + "grad_norm": 2.687947841313869, + "learning_rate": 9.758946150105929e-06, + "loss": 0.3625, + "step": 1875 + }, + { + "epoch": 0.6337434687341985, + "grad_norm": 1.1741314827927007, + "learning_rate": 9.757268958905108e-06, + "loss": 0.3781, + "step": 1880 + }, + { + "epoch": 0.6354289566829597, + "grad_norm": 1.5664751237434484, + "learning_rate": 9.755586098316141e-06, + "loss": 0.3558, + "step": 1885 + }, + { + "epoch": 0.6371144446317208, + "grad_norm": 1.240015207550192, + "learning_rate": 9.753897570344546e-06, + "loss": 0.3551, + "step": 1890 + }, + { + "epoch": 0.6387999325804821, + "grad_norm": 1.8229664319251397, + "learning_rate": 9.752203377002596e-06, + "loss": 0.3603, + "step": 1895 + }, + { + "epoch": 0.6404854205292432, + "grad_norm": 2.83552526946451, + "learning_rate": 9.750503520309315e-06, + "loss": 0.3496, + "step": 1900 + }, + { + "epoch": 0.6421709084780044, + "grad_norm": 2.0244644558655023, + "learning_rate": 9.748798002290474e-06, + "loss": 0.3244, + "step": 1905 + }, + { + "epoch": 0.6438563964267655, + "grad_norm": 1.1760571210531725, + "learning_rate": 9.747086824978595e-06, + "loss": 0.3799, + "step": 1910 + }, + { + "epoch": 0.6455418843755267, + "grad_norm": 1.225229598191833, + "learning_rate": 9.745369990412943e-06, + "loss": 0.3927, + "step": 1915 + }, + { + "epoch": 0.6472273723242878, + "grad_norm": 1.9260704719551835, + "learning_rate": 9.743647500639521e-06, + "loss": 0.3632, + "step": 1920 + }, + { + "epoch": 0.6489128602730491, + "grad_norm": 1.4680818662991428, + "learning_rate": 9.741919357711078e-06, + "loss": 0.3867, + "step": 1925 + }, + { + "epoch": 0.6505983482218102, + "grad_norm": 1.6477683490735457, + "learning_rate": 9.740185563687095e-06, + "loss": 0.3606, + "step": 1930 + }, + { + "epoch": 0.6522838361705714, + "grad_norm": 5.509552602446063, + "learning_rate": 9.738446120633788e-06, + "loss": 0.3527, + "step": 1935 + }, + { + "epoch": 0.6539693241193325, + "grad_norm": 1.3651510345103504, + "learning_rate": 9.736701030624109e-06, + "loss": 0.3342, + "step": 1940 + }, + { + "epoch": 0.6556548120680937, + "grad_norm": 1.4065274870238285, + "learning_rate": 9.734950295737736e-06, + "loss": 0.4101, + "step": 1945 + }, + { + "epoch": 0.6573403000168548, + "grad_norm": 1.5133391638671063, + "learning_rate": 9.733193918061072e-06, + "loss": 0.3496, + "step": 1950 + }, + { + "epoch": 0.6590257879656161, + "grad_norm": 1.2800300431439677, + "learning_rate": 9.731431899687254e-06, + "loss": 0.3433, + "step": 1955 + }, + { + "epoch": 0.6607112759143772, + "grad_norm": 1.226593168456227, + "learning_rate": 9.729664242716133e-06, + "loss": 0.3288, + "step": 1960 + }, + { + "epoch": 0.6623967638631384, + "grad_norm": 1.603857320637352, + "learning_rate": 9.727890949254279e-06, + "loss": 0.3873, + "step": 1965 + }, + { + "epoch": 0.6640822518118995, + "grad_norm": 1.4776997381891348, + "learning_rate": 9.726112021414985e-06, + "loss": 0.36, + "step": 1970 + }, + { + "epoch": 0.6657677397606607, + "grad_norm": 1.4498019018029922, + "learning_rate": 9.724327461318253e-06, + "loss": 0.3559, + "step": 1975 + }, + { + "epoch": 0.6674532277094218, + "grad_norm": 1.5304816496173508, + "learning_rate": 9.722537271090801e-06, + "loss": 0.378, + "step": 1980 + }, + { + "epoch": 0.6691387156581831, + "grad_norm": 1.2841465124541995, + "learning_rate": 9.720741452866059e-06, + "loss": 0.3502, + "step": 1985 + }, + { + "epoch": 0.6708242036069442, + "grad_norm": 1.341839612536046, + "learning_rate": 9.718940008784154e-06, + "loss": 0.3732, + "step": 1990 + }, + { + "epoch": 0.6725096915557054, + "grad_norm": 1.5679954311607547, + "learning_rate": 9.71713294099193e-06, + "loss": 0.3595, + "step": 1995 + }, + { + "epoch": 0.6741951795044665, + "grad_norm": 1.8242560324395702, + "learning_rate": 9.715320251642924e-06, + "loss": 0.3437, + "step": 2000 + }, + { + "epoch": 0.6758806674532277, + "grad_norm": 1.4721199342053406, + "learning_rate": 9.713501942897374e-06, + "loss": 0.3633, + "step": 2005 + }, + { + "epoch": 0.6775661554019888, + "grad_norm": 1.903966919235931, + "learning_rate": 9.71167801692222e-06, + "loss": 0.3738, + "step": 2010 + }, + { + "epoch": 0.6792516433507501, + "grad_norm": 1.3353416843282138, + "learning_rate": 9.709848475891087e-06, + "loss": 0.3367, + "step": 2015 + }, + { + "epoch": 0.6809371312995112, + "grad_norm": 1.1024900446804993, + "learning_rate": 9.708013321984303e-06, + "loss": 0.3337, + "step": 2020 + }, + { + "epoch": 0.6826226192482724, + "grad_norm": 1.8295367652286423, + "learning_rate": 9.706172557388873e-06, + "loss": 0.3774, + "step": 2025 + }, + { + "epoch": 0.6843081071970335, + "grad_norm": 1.543116586118428, + "learning_rate": 9.7043261842985e-06, + "loss": 0.3655, + "step": 2030 + }, + { + "epoch": 0.6859935951457947, + "grad_norm": 1.6317209425057675, + "learning_rate": 9.702474204913563e-06, + "loss": 0.3475, + "step": 2035 + }, + { + "epoch": 0.6876790830945558, + "grad_norm": 1.63484679625352, + "learning_rate": 9.700616621441123e-06, + "loss": 0.3972, + "step": 2040 + }, + { + "epoch": 0.689364571043317, + "grad_norm": 2.1873559252815555, + "learning_rate": 9.698753436094924e-06, + "loss": 0.3736, + "step": 2045 + }, + { + "epoch": 0.6910500589920782, + "grad_norm": 1.2890286528794461, + "learning_rate": 9.69688465109538e-06, + "loss": 0.3469, + "step": 2050 + }, + { + "epoch": 0.6927355469408394, + "grad_norm": 1.3239260464350429, + "learning_rate": 9.695010268669585e-06, + "loss": 0.3704, + "step": 2055 + }, + { + "epoch": 0.6944210348896005, + "grad_norm": 1.8709605453949443, + "learning_rate": 9.693130291051296e-06, + "loss": 0.356, + "step": 2060 + }, + { + "epoch": 0.6961065228383617, + "grad_norm": 2.9573522315636724, + "learning_rate": 9.691244720480945e-06, + "loss": 0.3742, + "step": 2065 + }, + { + "epoch": 0.6977920107871228, + "grad_norm": 1.3980185260901286, + "learning_rate": 9.689353559205623e-06, + "loss": 0.3648, + "step": 2070 + }, + { + "epoch": 0.699477498735884, + "grad_norm": 1.228705181126694, + "learning_rate": 9.68745680947909e-06, + "loss": 0.3039, + "step": 2075 + }, + { + "epoch": 0.7011629866846452, + "grad_norm": 1.2220680611193167, + "learning_rate": 9.685554473561762e-06, + "loss": 0.3474, + "step": 2080 + }, + { + "epoch": 0.7028484746334064, + "grad_norm": 2.59974825327748, + "learning_rate": 9.683646553720712e-06, + "loss": 0.3501, + "step": 2085 + }, + { + "epoch": 0.7045339625821675, + "grad_norm": 1.6467232454475285, + "learning_rate": 9.681733052229669e-06, + "loss": 0.3787, + "step": 2090 + }, + { + "epoch": 0.7062194505309287, + "grad_norm": 1.8096540134742791, + "learning_rate": 9.679813971369012e-06, + "loss": 0.3648, + "step": 2095 + }, + { + "epoch": 0.7079049384796898, + "grad_norm": 1.2647471741504637, + "learning_rate": 9.677889313425773e-06, + "loss": 0.3788, + "step": 2100 + }, + { + "epoch": 0.709590426428451, + "grad_norm": 1.35976378763326, + "learning_rate": 9.675959080693627e-06, + "loss": 0.3433, + "step": 2105 + }, + { + "epoch": 0.7112759143772122, + "grad_norm": 1.308598282396058, + "learning_rate": 9.67402327547289e-06, + "loss": 0.378, + "step": 2110 + }, + { + "epoch": 0.7129614023259734, + "grad_norm": 1.4775263796977416, + "learning_rate": 9.672081900070528e-06, + "loss": 0.3574, + "step": 2115 + }, + { + "epoch": 0.7146468902747345, + "grad_norm": 1.1577557054374856, + "learning_rate": 9.670134956800137e-06, + "loss": 0.3733, + "step": 2120 + }, + { + "epoch": 0.7163323782234957, + "grad_norm": 1.466750065549959, + "learning_rate": 9.668182447981952e-06, + "loss": 0.4092, + "step": 2125 + }, + { + "epoch": 0.7180178661722568, + "grad_norm": 1.1565828725521126, + "learning_rate": 9.666224375942837e-06, + "loss": 0.3731, + "step": 2130 + }, + { + "epoch": 0.719703354121018, + "grad_norm": 1.350218441493612, + "learning_rate": 9.664260743016292e-06, + "loss": 0.3865, + "step": 2135 + }, + { + "epoch": 0.7213888420697792, + "grad_norm": 1.3358360503846247, + "learning_rate": 9.662291551542438e-06, + "loss": 0.3427, + "step": 2140 + }, + { + "epoch": 0.7230743300185404, + "grad_norm": 1.4332616915804153, + "learning_rate": 9.660316803868021e-06, + "loss": 0.3474, + "step": 2145 + }, + { + "epoch": 0.7247598179673015, + "grad_norm": 1.3259332446942924, + "learning_rate": 9.658336502346417e-06, + "loss": 0.3336, + "step": 2150 + }, + { + "epoch": 0.7264453059160627, + "grad_norm": 2.3158674017755065, + "learning_rate": 9.656350649337607e-06, + "loss": 0.3839, + "step": 2155 + }, + { + "epoch": 0.7281307938648238, + "grad_norm": 1.0949111862764238, + "learning_rate": 9.6543592472082e-06, + "loss": 0.3773, + "step": 2160 + }, + { + "epoch": 0.729816281813585, + "grad_norm": 1.5097948334639733, + "learning_rate": 9.65236229833141e-06, + "loss": 0.4078, + "step": 2165 + }, + { + "epoch": 0.7315017697623462, + "grad_norm": 1.2594652703608187, + "learning_rate": 9.650359805087065e-06, + "loss": 0.3567, + "step": 2170 + }, + { + "epoch": 0.7331872577111074, + "grad_norm": 1.8804252015418728, + "learning_rate": 9.648351769861602e-06, + "loss": 0.3646, + "step": 2175 + }, + { + "epoch": 0.7348727456598685, + "grad_norm": 1.3073981671846944, + "learning_rate": 9.646338195048056e-06, + "loss": 0.3504, + "step": 2180 + }, + { + "epoch": 0.7365582336086297, + "grad_norm": 2.7949276977131086, + "learning_rate": 9.64431908304607e-06, + "loss": 0.4033, + "step": 2185 + }, + { + "epoch": 0.7382437215573908, + "grad_norm": 1.2942219503947912, + "learning_rate": 9.642294436261885e-06, + "loss": 0.3619, + "step": 2190 + }, + { + "epoch": 0.739929209506152, + "grad_norm": 1.6775973408111522, + "learning_rate": 9.640264257108335e-06, + "loss": 0.3347, + "step": 2195 + }, + { + "epoch": 0.7416146974549132, + "grad_norm": 1.3588786223993885, + "learning_rate": 9.638228548004849e-06, + "loss": 0.3595, + "step": 2200 + }, + { + "epoch": 0.7433001854036744, + "grad_norm": 1.6838109268179358, + "learning_rate": 9.636187311377447e-06, + "loss": 0.3373, + "step": 2205 + }, + { + "epoch": 0.7449856733524355, + "grad_norm": 2.3830060953911167, + "learning_rate": 9.634140549658735e-06, + "loss": 0.3926, + "step": 2210 + }, + { + "epoch": 0.7466711613011967, + "grad_norm": 1.7907822839020726, + "learning_rate": 9.632088265287903e-06, + "loss": 0.3472, + "step": 2215 + }, + { + "epoch": 0.7483566492499578, + "grad_norm": 1.3471671246020636, + "learning_rate": 9.630030460710722e-06, + "loss": 0.3811, + "step": 2220 + }, + { + "epoch": 0.750042137198719, + "grad_norm": 1.5810646120081224, + "learning_rate": 9.627967138379547e-06, + "loss": 0.3545, + "step": 2225 + }, + { + "epoch": 0.7517276251474801, + "grad_norm": 1.507254101559988, + "learning_rate": 9.625898300753302e-06, + "loss": 0.3859, + "step": 2230 + }, + { + "epoch": 0.7534131130962414, + "grad_norm": 1.2779664893101494, + "learning_rate": 9.623823950297486e-06, + "loss": 0.355, + "step": 2235 + }, + { + "epoch": 0.7550986010450025, + "grad_norm": 1.2886964830711667, + "learning_rate": 9.621744089484169e-06, + "loss": 0.3449, + "step": 2240 + }, + { + "epoch": 0.7567840889937637, + "grad_norm": 1.7316231667525117, + "learning_rate": 9.619658720791987e-06, + "loss": 0.3417, + "step": 2245 + }, + { + "epoch": 0.7584695769425248, + "grad_norm": 1.1667612064431105, + "learning_rate": 9.617567846706139e-06, + "loss": 0.3474, + "step": 2250 + }, + { + "epoch": 0.760155064891286, + "grad_norm": 1.1291994882095613, + "learning_rate": 9.615471469718388e-06, + "loss": 0.3388, + "step": 2255 + }, + { + "epoch": 0.7618405528400471, + "grad_norm": 1.1256981178350294, + "learning_rate": 9.61336959232705e-06, + "loss": 0.3448, + "step": 2260 + }, + { + "epoch": 0.7635260407888084, + "grad_norm": 1.427389213381377, + "learning_rate": 9.611262217037004e-06, + "loss": 0.3491, + "step": 2265 + }, + { + "epoch": 0.7652115287375695, + "grad_norm": 1.324667521608873, + "learning_rate": 9.609149346359668e-06, + "loss": 0.3757, + "step": 2270 + }, + { + "epoch": 0.7668970166863307, + "grad_norm": 1.3031507899917336, + "learning_rate": 9.607030982813023e-06, + "loss": 0.34, + "step": 2275 + }, + { + "epoch": 0.7685825046350918, + "grad_norm": 1.4842518042803345, + "learning_rate": 9.604907128921589e-06, + "loss": 0.3929, + "step": 2280 + }, + { + "epoch": 0.770267992583853, + "grad_norm": 1.294454455888665, + "learning_rate": 9.602777787216429e-06, + "loss": 0.3541, + "step": 2285 + }, + { + "epoch": 0.7719534805326141, + "grad_norm": 1.4341792182231619, + "learning_rate": 9.600642960235147e-06, + "loss": 0.3933, + "step": 2290 + }, + { + "epoch": 0.7736389684813754, + "grad_norm": 1.3954686669018816, + "learning_rate": 9.598502650521883e-06, + "loss": 0.3218, + "step": 2295 + }, + { + "epoch": 0.7753244564301365, + "grad_norm": 1.3345111705629027, + "learning_rate": 9.596356860627314e-06, + "loss": 0.3313, + "step": 2300 + }, + { + "epoch": 0.7770099443788977, + "grad_norm": 1.616643582212815, + "learning_rate": 9.594205593108645e-06, + "loss": 0.3691, + "step": 2305 + }, + { + "epoch": 0.7786954323276588, + "grad_norm": 1.3596525923308695, + "learning_rate": 9.59204885052961e-06, + "loss": 0.3705, + "step": 2310 + }, + { + "epoch": 0.78038092027642, + "grad_norm": 2.4609659088354725, + "learning_rate": 9.589886635460466e-06, + "loss": 0.3684, + "step": 2315 + }, + { + "epoch": 0.7820664082251811, + "grad_norm": 1.2843009758418336, + "learning_rate": 9.587718950477993e-06, + "loss": 0.3784, + "step": 2320 + }, + { + "epoch": 0.7837518961739424, + "grad_norm": 1.2577183022471217, + "learning_rate": 9.585545798165494e-06, + "loss": 0.3596, + "step": 2325 + }, + { + "epoch": 0.7854373841227035, + "grad_norm": 1.6144024381724416, + "learning_rate": 9.583367181112778e-06, + "loss": 0.3741, + "step": 2330 + }, + { + "epoch": 0.7871228720714647, + "grad_norm": 1.3461516966835854, + "learning_rate": 9.581183101916176e-06, + "loss": 0.3483, + "step": 2335 + }, + { + "epoch": 0.7888083600202258, + "grad_norm": 1.6872425081360545, + "learning_rate": 9.578993563178522e-06, + "loss": 0.3757, + "step": 2340 + }, + { + "epoch": 0.790493847968987, + "grad_norm": 1.4105158607965087, + "learning_rate": 9.57679856750916e-06, + "loss": 0.3575, + "step": 2345 + }, + { + "epoch": 0.7921793359177481, + "grad_norm": 1.3965878147473851, + "learning_rate": 9.574598117523935e-06, + "loss": 0.368, + "step": 2350 + }, + { + "epoch": 0.7938648238665094, + "grad_norm": 1.7944400734908867, + "learning_rate": 9.572392215845194e-06, + "loss": 0.3496, + "step": 2355 + }, + { + "epoch": 0.7955503118152705, + "grad_norm": 1.4613174593711948, + "learning_rate": 9.570180865101778e-06, + "loss": 0.3449, + "step": 2360 + }, + { + "epoch": 0.7972357997640317, + "grad_norm": 1.505128651251869, + "learning_rate": 9.567964067929026e-06, + "loss": 0.3404, + "step": 2365 + }, + { + "epoch": 0.7989212877127928, + "grad_norm": 1.3414032802645093, + "learning_rate": 9.565741826968766e-06, + "loss": 0.3785, + "step": 2370 + }, + { + "epoch": 0.800606775661554, + "grad_norm": 1.860016833945816, + "learning_rate": 9.56351414486931e-06, + "loss": 0.3272, + "step": 2375 + }, + { + "epoch": 0.8022922636103151, + "grad_norm": 1.323878703458128, + "learning_rate": 9.561281024285459e-06, + "loss": 0.3437, + "step": 2380 + }, + { + "epoch": 0.8039777515590764, + "grad_norm": 2.4691539168218952, + "learning_rate": 9.559042467878492e-06, + "loss": 0.351, + "step": 2385 + }, + { + "epoch": 0.8056632395078375, + "grad_norm": 1.9483757444215413, + "learning_rate": 9.556798478316169e-06, + "loss": 0.3496, + "step": 2390 + }, + { + "epoch": 0.8073487274565987, + "grad_norm": 1.3091971783922076, + "learning_rate": 9.554549058272725e-06, + "loss": 0.3908, + "step": 2395 + }, + { + "epoch": 0.8090342154053598, + "grad_norm": 1.7075499068288942, + "learning_rate": 9.552294210428863e-06, + "loss": 0.3599, + "step": 2400 + }, + { + "epoch": 0.810719703354121, + "grad_norm": 1.541184995336931, + "learning_rate": 9.550033937471756e-06, + "loss": 0.3517, + "step": 2405 + }, + { + "epoch": 0.8124051913028821, + "grad_norm": 1.440864567708475, + "learning_rate": 9.547768242095046e-06, + "loss": 0.3198, + "step": 2410 + }, + { + "epoch": 0.8140906792516434, + "grad_norm": 1.3114303025122667, + "learning_rate": 9.545497126998832e-06, + "loss": 0.3658, + "step": 2415 + }, + { + "epoch": 0.8157761672004045, + "grad_norm": 1.2693919934972264, + "learning_rate": 9.543220594889672e-06, + "loss": 0.3648, + "step": 2420 + }, + { + "epoch": 0.8174616551491657, + "grad_norm": 1.2480260474966405, + "learning_rate": 9.540938648480584e-06, + "loss": 0.3736, + "step": 2425 + }, + { + "epoch": 0.8191471430979268, + "grad_norm": 1.2583908882869537, + "learning_rate": 9.538651290491037e-06, + "loss": 0.4143, + "step": 2430 + }, + { + "epoch": 0.820832631046688, + "grad_norm": 1.3219514485911716, + "learning_rate": 9.536358523646943e-06, + "loss": 0.3573, + "step": 2435 + }, + { + "epoch": 0.8225181189954491, + "grad_norm": 1.2563958548761411, + "learning_rate": 9.53406035068067e-06, + "loss": 0.3755, + "step": 2440 + }, + { + "epoch": 0.8242036069442104, + "grad_norm": 1.300659464438313, + "learning_rate": 9.53175677433102e-06, + "loss": 0.3635, + "step": 2445 + }, + { + "epoch": 0.8258890948929715, + "grad_norm": 1.2746373477428425, + "learning_rate": 9.529447797343241e-06, + "loss": 0.3644, + "step": 2450 + }, + { + "epoch": 0.8275745828417327, + "grad_norm": 1.5446920899150354, + "learning_rate": 9.527133422469013e-06, + "loss": 0.3234, + "step": 2455 + }, + { + "epoch": 0.8292600707904938, + "grad_norm": 1.2127750323326822, + "learning_rate": 9.524813652466448e-06, + "loss": 0.3503, + "step": 2460 + }, + { + "epoch": 0.830945558739255, + "grad_norm": 1.0847472392184307, + "learning_rate": 9.52248849010009e-06, + "loss": 0.3797, + "step": 2465 + }, + { + "epoch": 0.8326310466880161, + "grad_norm": 1.1563548773181405, + "learning_rate": 9.520157938140912e-06, + "loss": 0.3296, + "step": 2470 + }, + { + "epoch": 0.8343165346367774, + "grad_norm": 1.2070003204546937, + "learning_rate": 9.517821999366302e-06, + "loss": 0.3823, + "step": 2475 + }, + { + "epoch": 0.8360020225855385, + "grad_norm": 1.3126220997097682, + "learning_rate": 9.515480676560073e-06, + "loss": 0.3321, + "step": 2480 + }, + { + "epoch": 0.8376875105342997, + "grad_norm": 1.231455314986524, + "learning_rate": 9.513133972512458e-06, + "loss": 0.3147, + "step": 2485 + }, + { + "epoch": 0.8393729984830608, + "grad_norm": 1.2280471168780553, + "learning_rate": 9.510781890020093e-06, + "loss": 0.3706, + "step": 2490 + }, + { + "epoch": 0.841058486431822, + "grad_norm": 1.3044595910355357, + "learning_rate": 9.508424431886034e-06, + "loss": 0.3394, + "step": 2495 + }, + { + "epoch": 0.8427439743805831, + "grad_norm": 3.6679846130198843, + "learning_rate": 9.506061600919734e-06, + "loss": 0.3809, + "step": 2500 + }, + { + "epoch": 0.8444294623293443, + "grad_norm": 1.1082238616328808, + "learning_rate": 9.503693399937058e-06, + "loss": 0.3014, + "step": 2505 + }, + { + "epoch": 0.8461149502781055, + "grad_norm": 1.237984837643209, + "learning_rate": 9.50131983176026e-06, + "loss": 0.348, + "step": 2510 + }, + { + "epoch": 0.8478004382268667, + "grad_norm": 1.5155978687734502, + "learning_rate": 9.498940899218004e-06, + "loss": 0.3623, + "step": 2515 + }, + { + "epoch": 0.8494859261756278, + "grad_norm": 1.2146803088451354, + "learning_rate": 9.496556605145335e-06, + "loss": 0.346, + "step": 2520 + }, + { + "epoch": 0.851171414124389, + "grad_norm": 1.0883708659526483, + "learning_rate": 9.494166952383692e-06, + "loss": 0.3711, + "step": 2525 + }, + { + "epoch": 0.8528569020731501, + "grad_norm": 1.220308294992252, + "learning_rate": 9.491771943780898e-06, + "loss": 0.3464, + "step": 2530 + }, + { + "epoch": 0.8545423900219113, + "grad_norm": 1.2710239748342407, + "learning_rate": 9.489371582191164e-06, + "loss": 0.3763, + "step": 2535 + }, + { + "epoch": 0.8562278779706725, + "grad_norm": 1.1543955875726446, + "learning_rate": 9.486965870475077e-06, + "loss": 0.3861, + "step": 2540 + }, + { + "epoch": 0.8579133659194337, + "grad_norm": 1.1142956892465583, + "learning_rate": 9.484554811499598e-06, + "loss": 0.3364, + "step": 2545 + }, + { + "epoch": 0.8595988538681948, + "grad_norm": 1.660796741752221, + "learning_rate": 9.482138408138064e-06, + "loss": 0.3489, + "step": 2550 + }, + { + "epoch": 0.861284341816956, + "grad_norm": 1.4461577988844987, + "learning_rate": 9.479716663270178e-06, + "loss": 0.3459, + "step": 2555 + }, + { + "epoch": 0.8629698297657171, + "grad_norm": 1.4317652743708382, + "learning_rate": 9.47728957978201e-06, + "loss": 0.3726, + "step": 2560 + }, + { + "epoch": 0.8646553177144783, + "grad_norm": 1.1694623802018054, + "learning_rate": 9.474857160565993e-06, + "loss": 0.3445, + "step": 2565 + }, + { + "epoch": 0.8663408056632395, + "grad_norm": 1.1697324300325993, + "learning_rate": 9.472419408520919e-06, + "loss": 0.3244, + "step": 2570 + }, + { + "epoch": 0.8680262936120007, + "grad_norm": 1.5911590885082034, + "learning_rate": 9.469976326551933e-06, + "loss": 0.3301, + "step": 2575 + }, + { + "epoch": 0.8697117815607618, + "grad_norm": 1.2785690573279693, + "learning_rate": 9.467527917570533e-06, + "loss": 0.3666, + "step": 2580 + }, + { + "epoch": 0.871397269509523, + "grad_norm": 1.1744870803715373, + "learning_rate": 9.465074184494566e-06, + "loss": 0.3516, + "step": 2585 + }, + { + "epoch": 0.8730827574582841, + "grad_norm": 1.2360136206234111, + "learning_rate": 9.462615130248223e-06, + "loss": 0.3634, + "step": 2590 + }, + { + "epoch": 0.8747682454070453, + "grad_norm": 1.4682989830244326, + "learning_rate": 9.460150757762039e-06, + "loss": 0.3503, + "step": 2595 + }, + { + "epoch": 0.8764537333558065, + "grad_norm": 1.3774980387894733, + "learning_rate": 9.45768106997288e-06, + "loss": 0.3713, + "step": 2600 + }, + { + "epoch": 0.8781392213045677, + "grad_norm": 1.294704912429981, + "learning_rate": 9.455206069823959e-06, + "loss": 0.3655, + "step": 2605 + }, + { + "epoch": 0.8798247092533288, + "grad_norm": 1.327601776477407, + "learning_rate": 9.452725760264805e-06, + "loss": 0.3234, + "step": 2610 + }, + { + "epoch": 0.88151019720209, + "grad_norm": 1.1802264296178286, + "learning_rate": 9.450240144251284e-06, + "loss": 0.3358, + "step": 2615 + }, + { + "epoch": 0.8831956851508511, + "grad_norm": 1.748228754243499, + "learning_rate": 9.447749224745583e-06, + "loss": 0.3537, + "step": 2620 + }, + { + "epoch": 0.8848811730996123, + "grad_norm": 1.4045870902450581, + "learning_rate": 9.445253004716209e-06, + "loss": 0.3663, + "step": 2625 + }, + { + "epoch": 0.8865666610483734, + "grad_norm": 1.1940240009449117, + "learning_rate": 9.442751487137989e-06, + "loss": 0.3674, + "step": 2630 + }, + { + "epoch": 0.8882521489971347, + "grad_norm": 1.319656512149089, + "learning_rate": 9.440244674992058e-06, + "loss": 0.357, + "step": 2635 + }, + { + "epoch": 0.8899376369458958, + "grad_norm": 1.3350902364965938, + "learning_rate": 9.437732571265866e-06, + "loss": 0.3357, + "step": 2640 + }, + { + "epoch": 0.891623124894657, + "grad_norm": 1.1765261943953333, + "learning_rate": 9.435215178953164e-06, + "loss": 0.3323, + "step": 2645 + }, + { + "epoch": 0.8933086128434181, + "grad_norm": 1.4058080931394112, + "learning_rate": 9.43269250105401e-06, + "loss": 0.3527, + "step": 2650 + }, + { + "epoch": 0.8949941007921793, + "grad_norm": 2.253523941662195, + "learning_rate": 9.43016454057476e-06, + "loss": 0.3649, + "step": 2655 + }, + { + "epoch": 0.8966795887409404, + "grad_norm": 1.2420314384556268, + "learning_rate": 9.427631300528061e-06, + "loss": 0.3085, + "step": 2660 + }, + { + "epoch": 0.8983650766897017, + "grad_norm": 1.7457147421616308, + "learning_rate": 9.425092783932859e-06, + "loss": 0.3691, + "step": 2665 + }, + { + "epoch": 0.9000505646384628, + "grad_norm": 1.0959408810962532, + "learning_rate": 9.422548993814382e-06, + "loss": 0.3544, + "step": 2670 + }, + { + "epoch": 0.901736052587224, + "grad_norm": 1.3553626323545842, + "learning_rate": 9.419999933204149e-06, + "loss": 0.3872, + "step": 2675 + }, + { + "epoch": 0.9034215405359851, + "grad_norm": 1.2666777257862378, + "learning_rate": 9.417445605139952e-06, + "loss": 0.3353, + "step": 2680 + }, + { + "epoch": 0.9051070284847463, + "grad_norm": 1.4493189109284006, + "learning_rate": 9.414886012665867e-06, + "loss": 0.3425, + "step": 2685 + }, + { + "epoch": 0.9067925164335074, + "grad_norm": 1.5303749783674303, + "learning_rate": 9.41232115883224e-06, + "loss": 0.3597, + "step": 2690 + }, + { + "epoch": 0.9084780043822687, + "grad_norm": 5.455372188432605, + "learning_rate": 9.409751046695692e-06, + "loss": 0.3434, + "step": 2695 + }, + { + "epoch": 0.9101634923310299, + "grad_norm": 1.4618420145930104, + "learning_rate": 9.407175679319103e-06, + "loss": 0.343, + "step": 2700 + }, + { + "epoch": 0.911848980279791, + "grad_norm": 1.1877451757817232, + "learning_rate": 9.404595059771621e-06, + "loss": 0.342, + "step": 2705 + }, + { + "epoch": 0.9135344682285522, + "grad_norm": 1.2667632479796593, + "learning_rate": 9.402009191128653e-06, + "loss": 0.3261, + "step": 2710 + }, + { + "epoch": 0.9152199561773133, + "grad_norm": 1.1273263571237337, + "learning_rate": 9.39941807647186e-06, + "loss": 0.3782, + "step": 2715 + }, + { + "epoch": 0.9169054441260746, + "grad_norm": 1.3475581960058018, + "learning_rate": 9.396821718889158e-06, + "loss": 0.3665, + "step": 2720 + }, + { + "epoch": 0.9185909320748357, + "grad_norm": 1.1018041204366436, + "learning_rate": 9.394220121474703e-06, + "loss": 0.3373, + "step": 2725 + }, + { + "epoch": 0.9202764200235969, + "grad_norm": 1.196917924601841, + "learning_rate": 9.391613287328908e-06, + "loss": 0.3568, + "step": 2730 + }, + { + "epoch": 0.921961907972358, + "grad_norm": 1.1647769372411554, + "learning_rate": 9.389001219558413e-06, + "loss": 0.3323, + "step": 2735 + }, + { + "epoch": 0.9236473959211192, + "grad_norm": 1.550251090743837, + "learning_rate": 9.386383921276106e-06, + "loss": 0.3511, + "step": 2740 + }, + { + "epoch": 0.9253328838698803, + "grad_norm": 1.661099673054384, + "learning_rate": 9.383761395601103e-06, + "loss": 0.3535, + "step": 2745 + }, + { + "epoch": 0.9270183718186416, + "grad_norm": 1.4237022182652403, + "learning_rate": 9.381133645658751e-06, + "loss": 0.3475, + "step": 2750 + }, + { + "epoch": 0.9287038597674027, + "grad_norm": 1.1476681905519996, + "learning_rate": 9.378500674580624e-06, + "loss": 0.3464, + "step": 2755 + }, + { + "epoch": 0.9303893477161639, + "grad_norm": 1.4250656646807038, + "learning_rate": 9.375862485504517e-06, + "loss": 0.3385, + "step": 2760 + }, + { + "epoch": 0.932074835664925, + "grad_norm": 1.079610280655471, + "learning_rate": 9.37321908157444e-06, + "loss": 0.3547, + "step": 2765 + }, + { + "epoch": 0.9337603236136862, + "grad_norm": 1.2807702653709614, + "learning_rate": 9.370570465940626e-06, + "loss": 0.3507, + "step": 2770 + }, + { + "epoch": 0.9354458115624473, + "grad_norm": 1.268775963488638, + "learning_rate": 9.367916641759514e-06, + "loss": 0.3766, + "step": 2775 + }, + { + "epoch": 0.9371312995112085, + "grad_norm": 1.2173670686821865, + "learning_rate": 9.365257612193746e-06, + "loss": 0.3408, + "step": 2780 + }, + { + "epoch": 0.9388167874599697, + "grad_norm": 1.2329741216099723, + "learning_rate": 9.362593380412175e-06, + "loss": 0.349, + "step": 2785 + }, + { + "epoch": 0.9405022754087309, + "grad_norm": 1.1646655692947196, + "learning_rate": 9.359923949589848e-06, + "loss": 0.3115, + "step": 2790 + }, + { + "epoch": 0.942187763357492, + "grad_norm": 1.3937233536880058, + "learning_rate": 9.357249322908016e-06, + "loss": 0.375, + "step": 2795 + }, + { + "epoch": 0.9438732513062532, + "grad_norm": 1.2562016101527584, + "learning_rate": 9.354569503554108e-06, + "loss": 0.3383, + "step": 2800 + }, + { + "epoch": 0.9455587392550143, + "grad_norm": 1.636938094391326, + "learning_rate": 9.351884494721755e-06, + "loss": 0.352, + "step": 2805 + }, + { + "epoch": 0.9472442272037755, + "grad_norm": 1.4514271135210426, + "learning_rate": 9.349194299610768e-06, + "loss": 0.3403, + "step": 2810 + }, + { + "epoch": 0.9489297151525367, + "grad_norm": 1.1700167615062402, + "learning_rate": 9.346498921427133e-06, + "loss": 0.3415, + "step": 2815 + }, + { + "epoch": 0.9506152031012979, + "grad_norm": 1.3363274751129468, + "learning_rate": 9.343798363383023e-06, + "loss": 0.3811, + "step": 2820 + }, + { + "epoch": 0.952300691050059, + "grad_norm": 1.1253900331786755, + "learning_rate": 9.341092628696775e-06, + "loss": 0.3484, + "step": 2825 + }, + { + "epoch": 0.9539861789988202, + "grad_norm": 1.2107918686939847, + "learning_rate": 9.3383817205929e-06, + "loss": 0.3701, + "step": 2830 + }, + { + "epoch": 0.9556716669475813, + "grad_norm": 1.1469262673195784, + "learning_rate": 9.335665642302072e-06, + "loss": 0.2904, + "step": 2835 + }, + { + "epoch": 0.9573571548963425, + "grad_norm": 1.239341659965055, + "learning_rate": 9.33294439706113e-06, + "loss": 0.3238, + "step": 2840 + }, + { + "epoch": 0.9590426428451037, + "grad_norm": 1.1345842333861247, + "learning_rate": 9.330217988113065e-06, + "loss": 0.3203, + "step": 2845 + }, + { + "epoch": 0.9607281307938649, + "grad_norm": 1.8066563146598626, + "learning_rate": 9.327486418707027e-06, + "loss": 0.3571, + "step": 2850 + }, + { + "epoch": 0.962413618742626, + "grad_norm": 1.3574794785080313, + "learning_rate": 9.324749692098314e-06, + "loss": 0.3468, + "step": 2855 + }, + { + "epoch": 0.9640991066913872, + "grad_norm": 1.1539937926435926, + "learning_rate": 9.322007811548368e-06, + "loss": 0.3475, + "step": 2860 + }, + { + "epoch": 0.9657845946401483, + "grad_norm": 1.223891872303014, + "learning_rate": 9.319260780324776e-06, + "loss": 0.3376, + "step": 2865 + }, + { + "epoch": 0.9674700825889095, + "grad_norm": 1.2001868694768643, + "learning_rate": 9.316508601701262e-06, + "loss": 0.3292, + "step": 2870 + }, + { + "epoch": 0.9691555705376707, + "grad_norm": 1.6611575000957008, + "learning_rate": 9.313751278957685e-06, + "loss": 0.3418, + "step": 2875 + }, + { + "epoch": 0.9708410584864319, + "grad_norm": 1.2208847867854693, + "learning_rate": 9.310988815380032e-06, + "loss": 0.356, + "step": 2880 + }, + { + "epoch": 0.972526546435193, + "grad_norm": 1.0958379301416534, + "learning_rate": 9.308221214260422e-06, + "loss": 0.3028, + "step": 2885 + }, + { + "epoch": 0.9742120343839542, + "grad_norm": 1.2366470625056762, + "learning_rate": 9.30544847889709e-06, + "loss": 0.3607, + "step": 2890 + }, + { + "epoch": 0.9758975223327153, + "grad_norm": 2.3762930639598774, + "learning_rate": 9.302670612594395e-06, + "loss": 0.3319, + "step": 2895 + }, + { + "epoch": 0.9775830102814765, + "grad_norm": 1.2431508247950915, + "learning_rate": 9.299887618662805e-06, + "loss": 0.3503, + "step": 2900 + }, + { + "epoch": 0.9792684982302376, + "grad_norm": 1.4436262664920567, + "learning_rate": 9.297099500418905e-06, + "loss": 0.3621, + "step": 2905 + }, + { + "epoch": 0.9809539861789989, + "grad_norm": 1.1419148133150692, + "learning_rate": 9.29430626118538e-06, + "loss": 0.3326, + "step": 2910 + }, + { + "epoch": 0.98263947412776, + "grad_norm": 1.153375332782516, + "learning_rate": 9.291507904291026e-06, + "loss": 0.3543, + "step": 2915 + }, + { + "epoch": 0.9843249620765212, + "grad_norm": 1.2425388348474078, + "learning_rate": 9.288704433070731e-06, + "loss": 0.3487, + "step": 2920 + }, + { + "epoch": 0.9860104500252823, + "grad_norm": 1.317597576468785, + "learning_rate": 9.285895850865483e-06, + "loss": 0.3547, + "step": 2925 + }, + { + "epoch": 0.9876959379740435, + "grad_norm": 1.2615415762585858, + "learning_rate": 9.283082161022356e-06, + "loss": 0.3728, + "step": 2930 + }, + { + "epoch": 0.9893814259228046, + "grad_norm": 1.1166740430987043, + "learning_rate": 9.280263366894514e-06, + "loss": 0.3274, + "step": 2935 + }, + { + "epoch": 0.9910669138715659, + "grad_norm": 1.1510088475240097, + "learning_rate": 9.277439471841203e-06, + "loss": 0.3593, + "step": 2940 + }, + { + "epoch": 0.992752401820327, + "grad_norm": 1.1971651350588435, + "learning_rate": 9.274610479227748e-06, + "loss": 0.3319, + "step": 2945 + }, + { + "epoch": 0.9944378897690882, + "grad_norm": 1.380371721659796, + "learning_rate": 9.271776392425551e-06, + "loss": 0.3314, + "step": 2950 + }, + { + "epoch": 0.9961233777178493, + "grad_norm": 2.3108768666864035, + "learning_rate": 9.26893721481208e-06, + "loss": 0.3586, + "step": 2955 + }, + { + "epoch": 0.9978088656666105, + "grad_norm": 1.045938904984177, + "learning_rate": 9.266092949770875e-06, + "loss": 0.3319, + "step": 2960 + }, + { + "epoch": 0.9994943536153716, + "grad_norm": 1.271576090306851, + "learning_rate": 9.263243600691538e-06, + "loss": 0.3502, + "step": 2965 + }, + { + "epoch": 1.0010112927692567, + "grad_norm": 1.1986475582635325, + "learning_rate": 9.260389170969726e-06, + "loss": 0.3169, + "step": 2970 + }, + { + "epoch": 1.0026967807180178, + "grad_norm": 1.1417963899354506, + "learning_rate": 9.257529664007154e-06, + "loss": 0.3256, + "step": 2975 + }, + { + "epoch": 1.004382268666779, + "grad_norm": 1.8801962520324713, + "learning_rate": 9.254665083211587e-06, + "loss": 0.3699, + "step": 2980 + }, + { + "epoch": 1.0060677566155403, + "grad_norm": 1.1225537502833858, + "learning_rate": 9.251795431996837e-06, + "loss": 0.3463, + "step": 2985 + }, + { + "epoch": 1.0077532445643014, + "grad_norm": 1.1927612026553454, + "learning_rate": 9.248920713782759e-06, + "loss": 0.3152, + "step": 2990 + }, + { + "epoch": 1.0094387325130625, + "grad_norm": 1.792026765241136, + "learning_rate": 9.246040931995246e-06, + "loss": 0.345, + "step": 2995 + }, + { + "epoch": 1.0111242204618236, + "grad_norm": 1.3165032042139777, + "learning_rate": 9.243156090066222e-06, + "loss": 0.3033, + "step": 3000 + }, + { + "epoch": 1.012809708410585, + "grad_norm": 1.5271171902753329, + "learning_rate": 9.24026619143365e-06, + "loss": 0.3451, + "step": 3005 + }, + { + "epoch": 1.014495196359346, + "grad_norm": 1.349154410739946, + "learning_rate": 9.237371239541507e-06, + "loss": 0.3178, + "step": 3010 + }, + { + "epoch": 1.0161806843081072, + "grad_norm": 1.5345825719059862, + "learning_rate": 9.234471237839804e-06, + "loss": 0.3129, + "step": 3015 + }, + { + "epoch": 1.0178661722568683, + "grad_norm": 1.1463231622867909, + "learning_rate": 9.231566189784562e-06, + "loss": 0.2886, + "step": 3020 + }, + { + "epoch": 1.0195516602056296, + "grad_norm": 1.1444509789566124, + "learning_rate": 9.228656098837823e-06, + "loss": 0.3396, + "step": 3025 + }, + { + "epoch": 1.0212371481543907, + "grad_norm": 1.23581179979636, + "learning_rate": 9.22574096846763e-06, + "loss": 0.336, + "step": 3030 + }, + { + "epoch": 1.0229226361031518, + "grad_norm": 1.1249953312800807, + "learning_rate": 9.222820802148038e-06, + "loss": 0.3101, + "step": 3035 + }, + { + "epoch": 1.024608124051913, + "grad_norm": 1.216249650136864, + "learning_rate": 9.219895603359103e-06, + "loss": 0.3602, + "step": 3040 + }, + { + "epoch": 1.0262936120006743, + "grad_norm": 1.0811514174168881, + "learning_rate": 9.216965375586875e-06, + "loss": 0.3225, + "step": 3045 + }, + { + "epoch": 1.0279790999494354, + "grad_norm": 1.0821060654433754, + "learning_rate": 9.214030122323398e-06, + "loss": 0.3273, + "step": 3050 + }, + { + "epoch": 1.0296645878981965, + "grad_norm": 1.192607743749468, + "learning_rate": 9.211089847066712e-06, + "loss": 0.3053, + "step": 3055 + }, + { + "epoch": 1.0313500758469576, + "grad_norm": 1.3329700065622327, + "learning_rate": 9.20814455332083e-06, + "loss": 0.3543, + "step": 3060 + }, + { + "epoch": 1.033035563795719, + "grad_norm": 1.1292006779440744, + "learning_rate": 9.205194244595756e-06, + "loss": 0.3375, + "step": 3065 + }, + { + "epoch": 1.03472105174448, + "grad_norm": 1.0572796874473291, + "learning_rate": 9.202238924407464e-06, + "loss": 0.3286, + "step": 3070 + }, + { + "epoch": 1.0364065396932411, + "grad_norm": 1.3958436199927817, + "learning_rate": 9.199278596277903e-06, + "loss": 0.3279, + "step": 3075 + }, + { + "epoch": 1.0380920276420023, + "grad_norm": 1.0901119258919272, + "learning_rate": 9.196313263734992e-06, + "loss": 0.3265, + "step": 3080 + }, + { + "epoch": 1.0397775155907636, + "grad_norm": 1.7433779848202735, + "learning_rate": 9.193342930312609e-06, + "loss": 0.3264, + "step": 3085 + }, + { + "epoch": 1.0414630035395247, + "grad_norm": 1.0532495750269206, + "learning_rate": 9.190367599550595e-06, + "loss": 0.3527, + "step": 3090 + }, + { + "epoch": 1.0431484914882858, + "grad_norm": 1.3723820144680032, + "learning_rate": 9.187387274994748e-06, + "loss": 0.3388, + "step": 3095 + }, + { + "epoch": 1.044833979437047, + "grad_norm": 2.2437791281647352, + "learning_rate": 9.184401960196812e-06, + "loss": 0.357, + "step": 3100 + }, + { + "epoch": 1.0465194673858083, + "grad_norm": 1.4617104242350367, + "learning_rate": 9.181411658714484e-06, + "loss": 0.2893, + "step": 3105 + }, + { + "epoch": 1.0482049553345694, + "grad_norm": 1.3252145371158623, + "learning_rate": 9.178416374111399e-06, + "loss": 0.3202, + "step": 3110 + }, + { + "epoch": 1.0498904432833305, + "grad_norm": 1.0885819214840982, + "learning_rate": 9.175416109957136e-06, + "loss": 0.3402, + "step": 3115 + }, + { + "epoch": 1.0515759312320916, + "grad_norm": 1.19227178930355, + "learning_rate": 9.1724108698272e-06, + "loss": 0.305, + "step": 3120 + }, + { + "epoch": 1.053261419180853, + "grad_norm": 1.6085625561651227, + "learning_rate": 9.169400657303033e-06, + "loss": 0.3194, + "step": 3125 + }, + { + "epoch": 1.054946907129614, + "grad_norm": 1.2166238467446886, + "learning_rate": 9.166385475972002e-06, + "loss": 0.3435, + "step": 3130 + }, + { + "epoch": 1.0566323950783751, + "grad_norm": 1.1891858057777183, + "learning_rate": 9.163365329427392e-06, + "loss": 0.3026, + "step": 3135 + }, + { + "epoch": 1.0583178830271363, + "grad_norm": 1.3225904722574446, + "learning_rate": 9.160340221268408e-06, + "loss": 0.3191, + "step": 3140 + }, + { + "epoch": 1.0600033709758976, + "grad_norm": 1.2519048096971406, + "learning_rate": 9.15731015510017e-06, + "loss": 0.3054, + "step": 3145 + }, + { + "epoch": 1.0616888589246587, + "grad_norm": 2.1095956559646316, + "learning_rate": 9.154275134533699e-06, + "loss": 0.2828, + "step": 3150 + }, + { + "epoch": 1.0633743468734198, + "grad_norm": 1.274722011141106, + "learning_rate": 9.151235163185929e-06, + "loss": 0.352, + "step": 3155 + }, + { + "epoch": 1.065059834822181, + "grad_norm": 1.1965108294556865, + "learning_rate": 9.14819024467969e-06, + "loss": 0.3126, + "step": 3160 + }, + { + "epoch": 1.0667453227709423, + "grad_norm": 1.1996063029528807, + "learning_rate": 9.145140382643703e-06, + "loss": 0.3547, + "step": 3165 + }, + { + "epoch": 1.0684308107197034, + "grad_norm": 1.02099250189113, + "learning_rate": 9.142085580712591e-06, + "loss": 0.304, + "step": 3170 + }, + { + "epoch": 1.0701162986684645, + "grad_norm": 1.164709913815049, + "learning_rate": 9.139025842526856e-06, + "loss": 0.3449, + "step": 3175 + }, + { + "epoch": 1.0718017866172258, + "grad_norm": 1.486414637369902, + "learning_rate": 9.135961171732884e-06, + "loss": 0.3583, + "step": 3180 + }, + { + "epoch": 1.073487274565987, + "grad_norm": 1.1493998901839066, + "learning_rate": 9.132891571982942e-06, + "loss": 0.3236, + "step": 3185 + }, + { + "epoch": 1.075172762514748, + "grad_norm": 1.2765353448642704, + "learning_rate": 9.12981704693517e-06, + "loss": 0.3555, + "step": 3190 + }, + { + "epoch": 1.0768582504635091, + "grad_norm": 1.753797510882966, + "learning_rate": 9.126737600253574e-06, + "loss": 0.3679, + "step": 3195 + }, + { + "epoch": 1.0785437384122702, + "grad_norm": 1.2563728938227616, + "learning_rate": 9.12365323560803e-06, + "loss": 0.3175, + "step": 3200 + }, + { + "epoch": 1.0802292263610316, + "grad_norm": 1.2304012448682435, + "learning_rate": 9.120563956674272e-06, + "loss": 0.3376, + "step": 3205 + }, + { + "epoch": 1.0819147143097927, + "grad_norm": 1.2474689276169746, + "learning_rate": 9.117469767133894e-06, + "loss": 0.3478, + "step": 3210 + }, + { + "epoch": 1.0836002022585538, + "grad_norm": 1.3455991754168237, + "learning_rate": 9.114370670674337e-06, + "loss": 0.3143, + "step": 3215 + }, + { + "epoch": 1.0852856902073151, + "grad_norm": 1.133241200911727, + "learning_rate": 9.111266670988893e-06, + "loss": 0.3331, + "step": 3220 + }, + { + "epoch": 1.0869711781560762, + "grad_norm": 1.1508043345397088, + "learning_rate": 9.108157771776698e-06, + "loss": 0.3373, + "step": 3225 + }, + { + "epoch": 1.0886566661048374, + "grad_norm": 1.2732854414203183, + "learning_rate": 9.105043976742724e-06, + "loss": 0.2919, + "step": 3230 + }, + { + "epoch": 1.0903421540535985, + "grad_norm": 1.2997747548615537, + "learning_rate": 9.101925289597781e-06, + "loss": 0.3021, + "step": 3235 + }, + { + "epoch": 1.0920276420023596, + "grad_norm": 1.3231619815669389, + "learning_rate": 9.098801714058506e-06, + "loss": 0.3271, + "step": 3240 + }, + { + "epoch": 1.093713129951121, + "grad_norm": 2.41195593545421, + "learning_rate": 9.095673253847364e-06, + "loss": 0.327, + "step": 3245 + }, + { + "epoch": 1.095398617899882, + "grad_norm": 1.6405851933675375, + "learning_rate": 9.092539912692639e-06, + "loss": 0.3385, + "step": 3250 + }, + { + "epoch": 1.0970841058486431, + "grad_norm": 1.8352324988894357, + "learning_rate": 9.089401694328436e-06, + "loss": 0.3542, + "step": 3255 + }, + { + "epoch": 1.0987695937974045, + "grad_norm": 1.2345015508784225, + "learning_rate": 9.086258602494662e-06, + "loss": 0.3333, + "step": 3260 + }, + { + "epoch": 1.1004550817461656, + "grad_norm": 1.6093942418773934, + "learning_rate": 9.083110640937048e-06, + "loss": 0.3461, + "step": 3265 + }, + { + "epoch": 1.1021405696949267, + "grad_norm": 1.3363574509349696, + "learning_rate": 9.079957813407112e-06, + "loss": 0.2914, + "step": 3270 + }, + { + "epoch": 1.1038260576436878, + "grad_norm": 1.203001719749076, + "learning_rate": 9.076800123662185e-06, + "loss": 0.3163, + "step": 3275 + }, + { + "epoch": 1.105511545592449, + "grad_norm": 1.4431081253221059, + "learning_rate": 9.073637575465379e-06, + "loss": 0.3336, + "step": 3280 + }, + { + "epoch": 1.1071970335412102, + "grad_norm": 1.257268413635979, + "learning_rate": 9.070470172585611e-06, + "loss": 0.3237, + "step": 3285 + }, + { + "epoch": 1.1088825214899714, + "grad_norm": 1.2842426361889514, + "learning_rate": 9.067297918797567e-06, + "loss": 0.3212, + "step": 3290 + }, + { + "epoch": 1.1105680094387325, + "grad_norm": 1.192735317660996, + "learning_rate": 9.064120817881729e-06, + "loss": 0.3514, + "step": 3295 + }, + { + "epoch": 1.1122534973874938, + "grad_norm": 1.6647526277638496, + "learning_rate": 9.060938873624346e-06, + "loss": 0.3335, + "step": 3300 + }, + { + "epoch": 1.113938985336255, + "grad_norm": 1.0740569540218787, + "learning_rate": 9.05775208981744e-06, + "loss": 0.3038, + "step": 3305 + }, + { + "epoch": 1.115624473285016, + "grad_norm": 1.3657359178357789, + "learning_rate": 9.054560470258805e-06, + "loss": 0.3076, + "step": 3310 + }, + { + "epoch": 1.1173099612337771, + "grad_norm": 1.2527517404187796, + "learning_rate": 9.051364018751996e-06, + "loss": 0.3026, + "step": 3315 + }, + { + "epoch": 1.1189954491825382, + "grad_norm": 1.607673503435768, + "learning_rate": 9.048162739106322e-06, + "loss": 0.3026, + "step": 3320 + }, + { + "epoch": 1.1206809371312996, + "grad_norm": 1.381814453274146, + "learning_rate": 9.044956635136853e-06, + "loss": 0.3404, + "step": 3325 + }, + { + "epoch": 1.1223664250800607, + "grad_norm": 1.5004775481227122, + "learning_rate": 9.0417457106644e-06, + "loss": 0.3211, + "step": 3330 + }, + { + "epoch": 1.1240519130288218, + "grad_norm": 1.0621641080464284, + "learning_rate": 9.038529969515529e-06, + "loss": 0.3296, + "step": 3335 + }, + { + "epoch": 1.1257374009775831, + "grad_norm": 1.294817961155883, + "learning_rate": 9.035309415522537e-06, + "loss": 0.3133, + "step": 3340 + }, + { + "epoch": 1.1274228889263442, + "grad_norm": 1.437956315448835, + "learning_rate": 9.032084052523462e-06, + "loss": 0.3247, + "step": 3345 + }, + { + "epoch": 1.1291083768751053, + "grad_norm": 1.1717633173338764, + "learning_rate": 9.02885388436207e-06, + "loss": 0.3131, + "step": 3350 + }, + { + "epoch": 1.1307938648238665, + "grad_norm": 6.425086001865455, + "learning_rate": 9.025618914887853e-06, + "loss": 0.3044, + "step": 3355 + }, + { + "epoch": 1.1324793527726276, + "grad_norm": 1.2857207290031247, + "learning_rate": 9.022379147956032e-06, + "loss": 0.3179, + "step": 3360 + }, + { + "epoch": 1.134164840721389, + "grad_norm": 1.0364430803760774, + "learning_rate": 9.019134587427535e-06, + "loss": 0.316, + "step": 3365 + }, + { + "epoch": 1.13585032867015, + "grad_norm": 1.9246333037407806, + "learning_rate": 9.01588523716901e-06, + "loss": 0.3534, + "step": 3370 + }, + { + "epoch": 1.1375358166189111, + "grad_norm": 1.5172240239210515, + "learning_rate": 9.01263110105281e-06, + "loss": 0.3145, + "step": 3375 + }, + { + "epoch": 1.1392213045676725, + "grad_norm": 1.250729854508053, + "learning_rate": 9.009372182956993e-06, + "loss": 0.337, + "step": 3380 + }, + { + "epoch": 1.1409067925164336, + "grad_norm": 1.286048477470388, + "learning_rate": 9.006108486765312e-06, + "loss": 0.329, + "step": 3385 + }, + { + "epoch": 1.1425922804651947, + "grad_norm": 1.316957374362688, + "learning_rate": 9.00284001636722e-06, + "loss": 0.3195, + "step": 3390 + }, + { + "epoch": 1.1442777684139558, + "grad_norm": 1.4232912394152366, + "learning_rate": 8.999566775657855e-06, + "loss": 0.3139, + "step": 3395 + }, + { + "epoch": 1.145963256362717, + "grad_norm": 1.373486456426613, + "learning_rate": 8.996288768538044e-06, + "loss": 0.3267, + "step": 3400 + }, + { + "epoch": 1.1476487443114782, + "grad_norm": 2.204423812126712, + "learning_rate": 8.99300599891429e-06, + "loss": 0.3288, + "step": 3405 + }, + { + "epoch": 1.1493342322602393, + "grad_norm": 1.4328944647787816, + "learning_rate": 8.989718470698776e-06, + "loss": 0.3438, + "step": 3410 + }, + { + "epoch": 1.1510197202090005, + "grad_norm": 1.7727307773185965, + "learning_rate": 8.98642618780935e-06, + "loss": 0.3037, + "step": 3415 + }, + { + "epoch": 1.1527052081577618, + "grad_norm": 1.3942250547499695, + "learning_rate": 8.98312915416953e-06, + "loss": 0.3231, + "step": 3420 + }, + { + "epoch": 1.154390696106523, + "grad_norm": 1.3782199716782548, + "learning_rate": 8.979827373708499e-06, + "loss": 0.3477, + "step": 3425 + }, + { + "epoch": 1.156076184055284, + "grad_norm": 1.4768237198049075, + "learning_rate": 8.97652085036109e-06, + "loss": 0.3403, + "step": 3430 + }, + { + "epoch": 1.1577616720040451, + "grad_norm": 1.2846348096176017, + "learning_rate": 8.973209588067794e-06, + "loss": 0.3194, + "step": 3435 + }, + { + "epoch": 1.1594471599528062, + "grad_norm": 1.3310634826315841, + "learning_rate": 8.969893590774745e-06, + "loss": 0.3325, + "step": 3440 + }, + { + "epoch": 1.1611326479015676, + "grad_norm": 1.5129127846833883, + "learning_rate": 8.966572862433724e-06, + "loss": 0.2987, + "step": 3445 + }, + { + "epoch": 1.1628181358503287, + "grad_norm": 1.2467326527956635, + "learning_rate": 8.963247407002148e-06, + "loss": 0.3037, + "step": 3450 + }, + { + "epoch": 1.1645036237990898, + "grad_norm": 1.4479209643129705, + "learning_rate": 8.959917228443067e-06, + "loss": 0.37, + "step": 3455 + }, + { + "epoch": 1.1661891117478511, + "grad_norm": 1.439340391515246, + "learning_rate": 8.956582330725158e-06, + "loss": 0.3276, + "step": 3460 + }, + { + "epoch": 1.1678745996966122, + "grad_norm": 1.259754683808496, + "learning_rate": 8.953242717822727e-06, + "loss": 0.3288, + "step": 3465 + }, + { + "epoch": 1.1695600876453733, + "grad_norm": 1.2098043401681866, + "learning_rate": 8.9498983937157e-06, + "loss": 0.3194, + "step": 3470 + }, + { + "epoch": 1.1712455755941344, + "grad_norm": 1.1930692751081917, + "learning_rate": 8.946549362389605e-06, + "loss": 0.3269, + "step": 3475 + }, + { + "epoch": 1.1729310635428956, + "grad_norm": 1.131719558028194, + "learning_rate": 8.943195627835597e-06, + "loss": 0.3231, + "step": 3480 + }, + { + "epoch": 1.174616551491657, + "grad_norm": 1.5891565143727822, + "learning_rate": 8.939837194050422e-06, + "loss": 0.3252, + "step": 3485 + }, + { + "epoch": 1.176302039440418, + "grad_norm": 1.047800343701104, + "learning_rate": 8.936474065036435e-06, + "loss": 0.3068, + "step": 3490 + }, + { + "epoch": 1.1779875273891791, + "grad_norm": 1.2966874445214869, + "learning_rate": 8.933106244801584e-06, + "loss": 0.3234, + "step": 3495 + }, + { + "epoch": 1.1796730153379404, + "grad_norm": 1.3096370074171433, + "learning_rate": 8.929733737359406e-06, + "loss": 0.3364, + "step": 3500 + }, + { + "epoch": 1.1813585032867016, + "grad_norm": 1.1199138577516985, + "learning_rate": 8.926356546729025e-06, + "loss": 0.2954, + "step": 3505 + }, + { + "epoch": 1.1830439912354627, + "grad_norm": 1.1605643433971202, + "learning_rate": 8.922974676935145e-06, + "loss": 0.3243, + "step": 3510 + }, + { + "epoch": 1.1847294791842238, + "grad_norm": 1.1419222736041674, + "learning_rate": 8.919588132008048e-06, + "loss": 0.3448, + "step": 3515 + }, + { + "epoch": 1.1864149671329849, + "grad_norm": 1.2423183879943918, + "learning_rate": 8.916196915983588e-06, + "loss": 0.3151, + "step": 3520 + }, + { + "epoch": 1.1881004550817462, + "grad_norm": 1.4313330517460534, + "learning_rate": 8.912801032903183e-06, + "loss": 0.3006, + "step": 3525 + }, + { + "epoch": 1.1897859430305073, + "grad_norm": 1.31627280105742, + "learning_rate": 8.909400486813817e-06, + "loss": 0.32, + "step": 3530 + }, + { + "epoch": 1.1914714309792684, + "grad_norm": 1.3249246461026147, + "learning_rate": 8.905995281768024e-06, + "loss": 0.3635, + "step": 3535 + }, + { + "epoch": 1.1931569189280298, + "grad_norm": 1.0531573968679886, + "learning_rate": 8.902585421823901e-06, + "loss": 0.2969, + "step": 3540 + }, + { + "epoch": 1.1948424068767909, + "grad_norm": 1.2781551220592962, + "learning_rate": 8.899170911045081e-06, + "loss": 0.3148, + "step": 3545 + }, + { + "epoch": 1.196527894825552, + "grad_norm": 2.2616317341010084, + "learning_rate": 8.895751753500745e-06, + "loss": 0.3649, + "step": 3550 + }, + { + "epoch": 1.198213382774313, + "grad_norm": 1.0111562114447625, + "learning_rate": 8.892327953265616e-06, + "loss": 0.3189, + "step": 3555 + }, + { + "epoch": 1.1998988707230742, + "grad_norm": 1.6597569641402565, + "learning_rate": 8.888899514419939e-06, + "loss": 0.3191, + "step": 3560 + }, + { + "epoch": 1.2015843586718356, + "grad_norm": 1.31780683860324, + "learning_rate": 8.885466441049497e-06, + "loss": 0.3148, + "step": 3565 + }, + { + "epoch": 1.2032698466205967, + "grad_norm": 1.2291331373676375, + "learning_rate": 8.882028737245592e-06, + "loss": 0.3263, + "step": 3570 + }, + { + "epoch": 1.2049553345693578, + "grad_norm": 1.1658309953207986, + "learning_rate": 8.878586407105043e-06, + "loss": 0.3267, + "step": 3575 + }, + { + "epoch": 1.206640822518119, + "grad_norm": 1.3344425773299426, + "learning_rate": 8.875139454730186e-06, + "loss": 0.3193, + "step": 3580 + }, + { + "epoch": 1.2083263104668802, + "grad_norm": 1.249352254184222, + "learning_rate": 8.87168788422886e-06, + "loss": 0.2952, + "step": 3585 + }, + { + "epoch": 1.2100117984156413, + "grad_norm": 2.036619401521633, + "learning_rate": 8.868231699714416e-06, + "loss": 0.2887, + "step": 3590 + }, + { + "epoch": 1.2116972863644024, + "grad_norm": 2.3502708587080297, + "learning_rate": 8.864770905305695e-06, + "loss": 0.2927, + "step": 3595 + }, + { + "epoch": 1.2133827743131635, + "grad_norm": 1.292105539706738, + "learning_rate": 8.861305505127036e-06, + "loss": 0.3381, + "step": 3600 + }, + { + "epoch": 1.2150682622619249, + "grad_norm": 1.6205141543398498, + "learning_rate": 8.857835503308266e-06, + "loss": 0.3399, + "step": 3605 + }, + { + "epoch": 1.216753750210686, + "grad_norm": 1.2254379988378619, + "learning_rate": 8.854360903984697e-06, + "loss": 0.3415, + "step": 3610 + }, + { + "epoch": 1.218439238159447, + "grad_norm": 1.3846859413438768, + "learning_rate": 8.850881711297117e-06, + "loss": 0.3079, + "step": 3615 + }, + { + "epoch": 1.2201247261082084, + "grad_norm": 1.9821368611156058, + "learning_rate": 8.847397929391793e-06, + "loss": 0.3055, + "step": 3620 + }, + { + "epoch": 1.2218102140569695, + "grad_norm": 1.1642210769863615, + "learning_rate": 8.843909562420456e-06, + "loss": 0.3264, + "step": 3625 + }, + { + "epoch": 1.2234957020057307, + "grad_norm": 2.471738118695882, + "learning_rate": 8.840416614540306e-06, + "loss": 0.3232, + "step": 3630 + }, + { + "epoch": 1.2251811899544918, + "grad_norm": 1.3172602960661328, + "learning_rate": 8.836919089913998e-06, + "loss": 0.3292, + "step": 3635 + }, + { + "epoch": 1.2268666779032529, + "grad_norm": 1.077852869618823, + "learning_rate": 8.83341699270964e-06, + "loss": 0.3032, + "step": 3640 + }, + { + "epoch": 1.2285521658520142, + "grad_norm": 1.4280651642881883, + "learning_rate": 8.8299103271008e-06, + "loss": 0.3203, + "step": 3645 + }, + { + "epoch": 1.2302376538007753, + "grad_norm": 1.1602409372858455, + "learning_rate": 8.826399097266473e-06, + "loss": 0.3161, + "step": 3650 + }, + { + "epoch": 1.2319231417495364, + "grad_norm": 3.894990600706879, + "learning_rate": 8.82288330739111e-06, + "loss": 0.2761, + "step": 3655 + }, + { + "epoch": 1.2336086296982978, + "grad_norm": 1.2689284910708734, + "learning_rate": 8.819362961664586e-06, + "loss": 0.3032, + "step": 3660 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.2822831605625233, + "learning_rate": 8.815838064282208e-06, + "loss": 0.3138, + "step": 3665 + }, + { + "epoch": 1.23697960559582, + "grad_norm": 1.1376184821858066, + "learning_rate": 8.812308619444712e-06, + "loss": 0.3512, + "step": 3670 + }, + { + "epoch": 1.238665093544581, + "grad_norm": 1.2450713686302888, + "learning_rate": 8.808774631358244e-06, + "loss": 0.3082, + "step": 3675 + }, + { + "epoch": 1.2403505814933422, + "grad_norm": 1.099460704911777, + "learning_rate": 8.805236104234372e-06, + "loss": 0.3291, + "step": 3680 + }, + { + "epoch": 1.2420360694421035, + "grad_norm": 1.6632790284896568, + "learning_rate": 8.801693042290071e-06, + "loss": 0.3106, + "step": 3685 + }, + { + "epoch": 1.2437215573908647, + "grad_norm": 2.5895887194216183, + "learning_rate": 8.798145449747721e-06, + "loss": 0.3363, + "step": 3690 + }, + { + "epoch": 1.2454070453396258, + "grad_norm": 2.1677338636579977, + "learning_rate": 8.794593330835099e-06, + "loss": 0.3115, + "step": 3695 + }, + { + "epoch": 1.247092533288387, + "grad_norm": 1.531110066214464, + "learning_rate": 8.79103668978538e-06, + "loss": 0.2923, + "step": 3700 + }, + { + "epoch": 1.2487780212371482, + "grad_norm": 1.6546965442835597, + "learning_rate": 8.78747553083712e-06, + "loss": 0.3206, + "step": 3705 + }, + { + "epoch": 1.2504635091859093, + "grad_norm": 1.197919094195317, + "learning_rate": 8.783909858234272e-06, + "loss": 0.3195, + "step": 3710 + }, + { + "epoch": 1.2521489971346704, + "grad_norm": 4.8612312432634495, + "learning_rate": 8.780339676226156e-06, + "loss": 0.3121, + "step": 3715 + }, + { + "epoch": 1.2538344850834315, + "grad_norm": 1.4903782245448083, + "learning_rate": 8.776764989067474e-06, + "loss": 0.3486, + "step": 3720 + }, + { + "epoch": 1.2555199730321929, + "grad_norm": 2.744059121273202, + "learning_rate": 8.77318580101829e-06, + "loss": 0.3178, + "step": 3725 + }, + { + "epoch": 1.257205460980954, + "grad_norm": 2.2453972057845353, + "learning_rate": 8.769602116344043e-06, + "loss": 0.3479, + "step": 3730 + }, + { + "epoch": 1.258890948929715, + "grad_norm": 1.4520489883541077, + "learning_rate": 8.766013939315518e-06, + "loss": 0.3103, + "step": 3735 + }, + { + "epoch": 1.2605764368784764, + "grad_norm": 1.562303766128593, + "learning_rate": 8.762421274208858e-06, + "loss": 0.3283, + "step": 3740 + }, + { + "epoch": 1.2622619248272375, + "grad_norm": 1.5501877544034917, + "learning_rate": 8.75882412530556e-06, + "loss": 0.3248, + "step": 3745 + }, + { + "epoch": 1.2639474127759986, + "grad_norm": 1.2632632162088169, + "learning_rate": 8.75522249689246e-06, + "loss": 0.3112, + "step": 3750 + }, + { + "epoch": 1.2656329007247598, + "grad_norm": 1.1052986233758186, + "learning_rate": 8.75161639326173e-06, + "loss": 0.3115, + "step": 3755 + }, + { + "epoch": 1.2673183886735209, + "grad_norm": 2.372782363444479, + "learning_rate": 8.748005818710878e-06, + "loss": 0.3051, + "step": 3760 + }, + { + "epoch": 1.2690038766222822, + "grad_norm": 1.3727111203165512, + "learning_rate": 8.744390777542744e-06, + "loss": 0.3242, + "step": 3765 + }, + { + "epoch": 1.2706893645710433, + "grad_norm": 1.8127616056647737, + "learning_rate": 8.740771274065482e-06, + "loss": 0.3297, + "step": 3770 + }, + { + "epoch": 1.2723748525198044, + "grad_norm": 1.3861146040065275, + "learning_rate": 8.737147312592573e-06, + "loss": 0.2945, + "step": 3775 + }, + { + "epoch": 1.2740603404685658, + "grad_norm": 1.2227238010828299, + "learning_rate": 8.733518897442805e-06, + "loss": 0.3144, + "step": 3780 + }, + { + "epoch": 1.2757458284173269, + "grad_norm": 1.2011085550371767, + "learning_rate": 8.729886032940275e-06, + "loss": 0.2985, + "step": 3785 + }, + { + "epoch": 1.277431316366088, + "grad_norm": 2.075825816490112, + "learning_rate": 8.726248723414383e-06, + "loss": 0.31, + "step": 3790 + }, + { + "epoch": 1.279116804314849, + "grad_norm": 1.3373898193491482, + "learning_rate": 8.722606973199826e-06, + "loss": 0.3464, + "step": 3795 + }, + { + "epoch": 1.2808022922636102, + "grad_norm": 1.5819884164270863, + "learning_rate": 8.718960786636594e-06, + "loss": 0.2828, + "step": 3800 + }, + { + "epoch": 1.2824877802123715, + "grad_norm": 4.310723675769661, + "learning_rate": 8.715310168069961e-06, + "loss": 0.3176, + "step": 3805 + }, + { + "epoch": 1.2841732681611326, + "grad_norm": 17.89429726201977, + "learning_rate": 8.711655121850489e-06, + "loss": 0.2898, + "step": 3810 + }, + { + "epoch": 1.2858587561098938, + "grad_norm": 1.5564491525763398, + "learning_rate": 8.707995652334006e-06, + "loss": 0.3089, + "step": 3815 + }, + { + "epoch": 1.287544244058655, + "grad_norm": 1.8115396161375414, + "learning_rate": 8.704331763881624e-06, + "loss": 0.3214, + "step": 3820 + }, + { + "epoch": 1.2892297320074162, + "grad_norm": 1.500306956354656, + "learning_rate": 8.70066346085971e-06, + "loss": 0.3356, + "step": 3825 + }, + { + "epoch": 1.2909152199561773, + "grad_norm": 1.2751789454492453, + "learning_rate": 8.696990747639902e-06, + "loss": 0.3202, + "step": 3830 + }, + { + "epoch": 1.2926007079049384, + "grad_norm": 1.4267113594111158, + "learning_rate": 8.693313628599082e-06, + "loss": 0.3358, + "step": 3835 + }, + { + "epoch": 1.2942861958536995, + "grad_norm": 1.5195776141389108, + "learning_rate": 8.689632108119395e-06, + "loss": 0.3241, + "step": 3840 + }, + { + "epoch": 1.2959716838024609, + "grad_norm": 5.432837666657171, + "learning_rate": 8.685946190588224e-06, + "loss": 0.3043, + "step": 3845 + }, + { + "epoch": 1.297657171751222, + "grad_norm": 1.579641810264342, + "learning_rate": 8.682255880398193e-06, + "loss": 0.3043, + "step": 3850 + }, + { + "epoch": 1.299342659699983, + "grad_norm": 1.4778951459685434, + "learning_rate": 8.678561181947163e-06, + "loss": 0.2978, + "step": 3855 + }, + { + "epoch": 1.3010281476487444, + "grad_norm": 1.1535195636678885, + "learning_rate": 8.674862099638222e-06, + "loss": 0.3269, + "step": 3860 + }, + { + "epoch": 1.3027136355975055, + "grad_norm": 1.3848379551169592, + "learning_rate": 8.671158637879683e-06, + "loss": 0.3638, + "step": 3865 + }, + { + "epoch": 1.3043991235462666, + "grad_norm": 1.0023946409738271, + "learning_rate": 8.667450801085082e-06, + "loss": 0.327, + "step": 3870 + }, + { + "epoch": 1.3060846114950277, + "grad_norm": 1.5710337730488202, + "learning_rate": 8.66373859367316e-06, + "loss": 0.3315, + "step": 3875 + }, + { + "epoch": 1.3077700994437889, + "grad_norm": 1.1486867599777093, + "learning_rate": 8.660022020067878e-06, + "loss": 0.311, + "step": 3880 + }, + { + "epoch": 1.3094555873925502, + "grad_norm": 1.223888384412348, + "learning_rate": 8.65630108469839e-06, + "loss": 0.3192, + "step": 3885 + }, + { + "epoch": 1.3111410753413113, + "grad_norm": 1.1228246106069757, + "learning_rate": 8.652575791999056e-06, + "loss": 0.3141, + "step": 3890 + }, + { + "epoch": 1.3128265632900724, + "grad_norm": 1.2863994611203113, + "learning_rate": 8.648846146409421e-06, + "loss": 0.3361, + "step": 3895 + }, + { + "epoch": 1.3145120512388337, + "grad_norm": 1.5357745404192742, + "learning_rate": 8.645112152374226e-06, + "loss": 0.345, + "step": 3900 + }, + { + "epoch": 1.3161975391875949, + "grad_norm": 3.4188648748199384, + "learning_rate": 8.64137381434339e-06, + "loss": 0.3008, + "step": 3905 + }, + { + "epoch": 1.317883027136356, + "grad_norm": 1.2124253146786814, + "learning_rate": 8.637631136772006e-06, + "loss": 0.3287, + "step": 3910 + }, + { + "epoch": 1.319568515085117, + "grad_norm": 1.2409397938317357, + "learning_rate": 8.633884124120342e-06, + "loss": 0.3032, + "step": 3915 + }, + { + "epoch": 1.3212540030338782, + "grad_norm": 1.2678938727280409, + "learning_rate": 8.630132780853834e-06, + "loss": 0.3302, + "step": 3920 + }, + { + "epoch": 1.3229394909826395, + "grad_norm": 1.1040704379899782, + "learning_rate": 8.626377111443074e-06, + "loss": 0.3057, + "step": 3925 + }, + { + "epoch": 1.3246249789314006, + "grad_norm": 1.254471107174073, + "learning_rate": 8.622617120363815e-06, + "loss": 0.3315, + "step": 3930 + }, + { + "epoch": 1.3263104668801617, + "grad_norm": 1.1285769481318682, + "learning_rate": 8.618852812096957e-06, + "loss": 0.2866, + "step": 3935 + }, + { + "epoch": 1.327995954828923, + "grad_norm": 1.4100019660199357, + "learning_rate": 8.61508419112854e-06, + "loss": 0.316, + "step": 3940 + }, + { + "epoch": 1.3296814427776842, + "grad_norm": 2.244929713448234, + "learning_rate": 8.611311261949757e-06, + "loss": 0.2941, + "step": 3945 + }, + { + "epoch": 1.3313669307264453, + "grad_norm": 2.7078187476516047, + "learning_rate": 8.607534029056923e-06, + "loss": 0.3258, + "step": 3950 + }, + { + "epoch": 1.3330524186752064, + "grad_norm": 2.702068883155361, + "learning_rate": 8.603752496951487e-06, + "loss": 0.3296, + "step": 3955 + }, + { + "epoch": 1.3347379066239675, + "grad_norm": 1.2889997978371053, + "learning_rate": 8.599966670140019e-06, + "loss": 0.2927, + "step": 3960 + }, + { + "epoch": 1.3364233945727289, + "grad_norm": 2.059715520526292, + "learning_rate": 8.59617655313421e-06, + "loss": 0.3187, + "step": 3965 + }, + { + "epoch": 1.33810888252149, + "grad_norm": 1.3868742437237402, + "learning_rate": 8.592382150450865e-06, + "loss": 0.2991, + "step": 3970 + }, + { + "epoch": 1.339794370470251, + "grad_norm": 1.7144049714714598, + "learning_rate": 8.588583466611888e-06, + "loss": 0.349, + "step": 3975 + }, + { + "epoch": 1.3414798584190124, + "grad_norm": 1.271758956176897, + "learning_rate": 8.584780506144299e-06, + "loss": 0.315, + "step": 3980 + }, + { + "epoch": 1.3431653463677735, + "grad_norm": 1.2530317033244227, + "learning_rate": 8.580973273580198e-06, + "loss": 0.3335, + "step": 3985 + }, + { + "epoch": 1.3448508343165346, + "grad_norm": 17.21746848664345, + "learning_rate": 8.57716177345679e-06, + "loss": 0.304, + "step": 3990 + }, + { + "epoch": 1.3465363222652957, + "grad_norm": 1.3140104913474075, + "learning_rate": 8.573346010316359e-06, + "loss": 0.3119, + "step": 3995 + }, + { + "epoch": 1.3482218102140568, + "grad_norm": 1.180402704495193, + "learning_rate": 8.56952598870627e-06, + "loss": 0.3206, + "step": 4000 + }, + { + "epoch": 1.3499072981628182, + "grad_norm": 1.2447229094252958, + "learning_rate": 8.565701713178966e-06, + "loss": 0.327, + "step": 4005 + }, + { + "epoch": 1.3515927861115793, + "grad_norm": 1.4874840559252875, + "learning_rate": 8.561873188291956e-06, + "loss": 0.3348, + "step": 4010 + }, + { + "epoch": 1.3532782740603404, + "grad_norm": 1.45101813485042, + "learning_rate": 8.558040418607814e-06, + "loss": 0.3282, + "step": 4015 + }, + { + "epoch": 1.3549637620091017, + "grad_norm": 1.3054109425443021, + "learning_rate": 8.554203408694173e-06, + "loss": 0.3232, + "step": 4020 + }, + { + "epoch": 1.3566492499578628, + "grad_norm": 1.1488598328474027, + "learning_rate": 8.55036216312372e-06, + "loss": 0.306, + "step": 4025 + }, + { + "epoch": 1.358334737906624, + "grad_norm": 1.2648932973941707, + "learning_rate": 8.546516686474189e-06, + "loss": 0.3141, + "step": 4030 + }, + { + "epoch": 1.360020225855385, + "grad_norm": 1.5703652607250131, + "learning_rate": 8.542666983328355e-06, + "loss": 0.3296, + "step": 4035 + }, + { + "epoch": 1.3617057138041462, + "grad_norm": 1.7291765115916724, + "learning_rate": 8.538813058274033e-06, + "loss": 0.2943, + "step": 4040 + }, + { + "epoch": 1.3633912017529075, + "grad_norm": 1.3473249229094113, + "learning_rate": 8.534954915904068e-06, + "loss": 0.2942, + "step": 4045 + }, + { + "epoch": 1.3650766897016686, + "grad_norm": 1.39226407986307, + "learning_rate": 8.53109256081633e-06, + "loss": 0.3235, + "step": 4050 + }, + { + "epoch": 1.3667621776504297, + "grad_norm": 1.3600299455102163, + "learning_rate": 8.527225997613708e-06, + "loss": 0.3304, + "step": 4055 + }, + { + "epoch": 1.368447665599191, + "grad_norm": 1.5678211829841187, + "learning_rate": 8.52335523090411e-06, + "loss": 0.2921, + "step": 4060 + }, + { + "epoch": 1.3701331535479522, + "grad_norm": 1.1427387474695525, + "learning_rate": 8.519480265300449e-06, + "loss": 0.3099, + "step": 4065 + }, + { + "epoch": 1.3718186414967133, + "grad_norm": 1.1127138274455053, + "learning_rate": 8.515601105420646e-06, + "loss": 0.278, + "step": 4070 + }, + { + "epoch": 1.3735041294454744, + "grad_norm": 2.4216166473917045, + "learning_rate": 8.51171775588762e-06, + "loss": 0.3076, + "step": 4075 + }, + { + "epoch": 1.3751896173942355, + "grad_norm": 1.1495301946401593, + "learning_rate": 8.50783022132928e-06, + "loss": 0.3238, + "step": 4080 + }, + { + "epoch": 1.3768751053429968, + "grad_norm": 1.2296568358077848, + "learning_rate": 8.503938506378524e-06, + "loss": 0.3038, + "step": 4085 + }, + { + "epoch": 1.378560593291758, + "grad_norm": 1.2903799368998459, + "learning_rate": 8.500042615673231e-06, + "loss": 0.3223, + "step": 4090 + }, + { + "epoch": 1.380246081240519, + "grad_norm": 1.4625234888746514, + "learning_rate": 8.496142553856262e-06, + "loss": 0.3045, + "step": 4095 + }, + { + "epoch": 1.3819315691892804, + "grad_norm": 1.2987917187104592, + "learning_rate": 8.49223832557544e-06, + "loss": 0.333, + "step": 4100 + }, + { + "epoch": 1.3836170571380415, + "grad_norm": 1.227452803571335, + "learning_rate": 8.488329935483557e-06, + "loss": 0.348, + "step": 4105 + }, + { + "epoch": 1.3853025450868026, + "grad_norm": 1.3702973529239448, + "learning_rate": 8.484417388238366e-06, + "loss": 0.3035, + "step": 4110 + }, + { + "epoch": 1.3869880330355637, + "grad_norm": 1.479671680120301, + "learning_rate": 8.480500688502577e-06, + "loss": 0.2748, + "step": 4115 + }, + { + "epoch": 1.3886735209843248, + "grad_norm": 1.3561745110078562, + "learning_rate": 8.476579840943841e-06, + "loss": 0.3112, + "step": 4120 + }, + { + "epoch": 1.3903590089330862, + "grad_norm": 1.3105520780187123, + "learning_rate": 8.472654850234759e-06, + "loss": 0.2739, + "step": 4125 + }, + { + "epoch": 1.3920444968818473, + "grad_norm": 1.2673327219983164, + "learning_rate": 8.468725721052865e-06, + "loss": 0.2959, + "step": 4130 + }, + { + "epoch": 1.3937299848306084, + "grad_norm": 2.3825232686318008, + "learning_rate": 8.46479245808063e-06, + "loss": 0.2878, + "step": 4135 + }, + { + "epoch": 1.3954154727793697, + "grad_norm": 1.486716855984244, + "learning_rate": 8.46085506600544e-06, + "loss": 0.323, + "step": 4140 + }, + { + "epoch": 1.3971009607281308, + "grad_norm": 1.4761429312357983, + "learning_rate": 8.456913549519619e-06, + "loss": 0.2893, + "step": 4145 + }, + { + "epoch": 1.398786448676892, + "grad_norm": 1.3937230582771776, + "learning_rate": 8.452967913320392e-06, + "loss": 0.3356, + "step": 4150 + }, + { + "epoch": 1.400471936625653, + "grad_norm": 1.3727669058972003, + "learning_rate": 8.449018162109901e-06, + "loss": 0.2969, + "step": 4155 + }, + { + "epoch": 1.4021574245744142, + "grad_norm": 1.1645151177750952, + "learning_rate": 8.44506430059519e-06, + "loss": 0.3483, + "step": 4160 + }, + { + "epoch": 1.4038429125231755, + "grad_norm": 1.1682677364890428, + "learning_rate": 8.441106333488197e-06, + "loss": 0.3117, + "step": 4165 + }, + { + "epoch": 1.4055284004719366, + "grad_norm": 1.3746259110400163, + "learning_rate": 8.437144265505762e-06, + "loss": 0.3046, + "step": 4170 + }, + { + "epoch": 1.4072138884206977, + "grad_norm": 1.256888048959631, + "learning_rate": 8.433178101369602e-06, + "loss": 0.2916, + "step": 4175 + }, + { + "epoch": 1.408899376369459, + "grad_norm": 1.207649777757582, + "learning_rate": 8.429207845806325e-06, + "loss": 0.3119, + "step": 4180 + }, + { + "epoch": 1.4105848643182202, + "grad_norm": 1.3426705903067313, + "learning_rate": 8.425233503547408e-06, + "loss": 0.3284, + "step": 4185 + }, + { + "epoch": 1.4122703522669813, + "grad_norm": 3.566432978592899, + "learning_rate": 8.4212550793292e-06, + "loss": 0.2875, + "step": 4190 + }, + { + "epoch": 1.4139558402157424, + "grad_norm": 8.895221690453752, + "learning_rate": 8.417272577892916e-06, + "loss": 0.304, + "step": 4195 + }, + { + "epoch": 1.4156413281645035, + "grad_norm": 1.4874260058273723, + "learning_rate": 8.41328600398463e-06, + "loss": 0.3117, + "step": 4200 + }, + { + "epoch": 1.4173268161132648, + "grad_norm": 1.2216137451945772, + "learning_rate": 8.409295362355268e-06, + "loss": 0.2933, + "step": 4205 + }, + { + "epoch": 1.419012304062026, + "grad_norm": 1.0927491924958632, + "learning_rate": 8.405300657760605e-06, + "loss": 0.2808, + "step": 4210 + }, + { + "epoch": 1.420697792010787, + "grad_norm": 1.1514111797365336, + "learning_rate": 8.401301894961253e-06, + "loss": 0.295, + "step": 4215 + }, + { + "epoch": 1.4223832799595484, + "grad_norm": 1.3419551267319916, + "learning_rate": 8.397299078722668e-06, + "loss": 0.3039, + "step": 4220 + }, + { + "epoch": 1.4240687679083095, + "grad_norm": 1.1290326111150712, + "learning_rate": 8.393292213815132e-06, + "loss": 0.3085, + "step": 4225 + }, + { + "epoch": 1.4257542558570706, + "grad_norm": 1.5750789112190187, + "learning_rate": 8.389281305013755e-06, + "loss": 0.2811, + "step": 4230 + }, + { + "epoch": 1.4274397438058317, + "grad_norm": 1.1490987631446123, + "learning_rate": 8.38526635709846e-06, + "loss": 0.3183, + "step": 4235 + }, + { + "epoch": 1.4291252317545928, + "grad_norm": 1.1535140415582725, + "learning_rate": 8.38124737485399e-06, + "loss": 0.3028, + "step": 4240 + }, + { + "epoch": 1.4308107197033542, + "grad_norm": 1.5422149825340228, + "learning_rate": 8.377224363069894e-06, + "loss": 0.337, + "step": 4245 + }, + { + "epoch": 1.4324962076521153, + "grad_norm": 1.0262056181844454, + "learning_rate": 8.37319732654052e-06, + "loss": 0.332, + "step": 4250 + }, + { + "epoch": 1.4341816956008764, + "grad_norm": 1.3775005325975165, + "learning_rate": 8.369166270065017e-06, + "loss": 0.3243, + "step": 4255 + }, + { + "epoch": 1.4358671835496377, + "grad_norm": 1.173509327540304, + "learning_rate": 8.365131198447323e-06, + "loss": 0.3141, + "step": 4260 + }, + { + "epoch": 1.4375526714983988, + "grad_norm": 1.1596006980464375, + "learning_rate": 8.361092116496161e-06, + "loss": 0.3014, + "step": 4265 + }, + { + "epoch": 1.43923815944716, + "grad_norm": 1.32764415589504, + "learning_rate": 8.357049029025031e-06, + "loss": 0.3112, + "step": 4270 + }, + { + "epoch": 1.440923647395921, + "grad_norm": 1.5035233100088548, + "learning_rate": 8.35300194085221e-06, + "loss": 0.2991, + "step": 4275 + }, + { + "epoch": 1.4426091353446822, + "grad_norm": 1.1478608842920697, + "learning_rate": 8.348950856800742e-06, + "loss": 0.2857, + "step": 4280 + }, + { + "epoch": 1.4442946232934435, + "grad_norm": 1.29338444865639, + "learning_rate": 8.34489578169843e-06, + "loss": 0.3481, + "step": 4285 + }, + { + "epoch": 1.4459801112422046, + "grad_norm": 1.0453150271805118, + "learning_rate": 8.340836720377835e-06, + "loss": 0.3193, + "step": 4290 + }, + { + "epoch": 1.4476655991909657, + "grad_norm": 0.9896696042682885, + "learning_rate": 8.336773677676272e-06, + "loss": 0.3134, + "step": 4295 + }, + { + "epoch": 1.449351087139727, + "grad_norm": 2.5935236624647873, + "learning_rate": 8.332706658435797e-06, + "loss": 0.2911, + "step": 4300 + }, + { + "epoch": 1.4510365750884882, + "grad_norm": 1.1536095372386117, + "learning_rate": 8.328635667503202e-06, + "loss": 0.3095, + "step": 4305 + }, + { + "epoch": 1.4527220630372493, + "grad_norm": 1.1073368534973436, + "learning_rate": 8.32456070973002e-06, + "loss": 0.3182, + "step": 4310 + }, + { + "epoch": 1.4544075509860104, + "grad_norm": 1.462814949483858, + "learning_rate": 8.320481789972507e-06, + "loss": 0.2928, + "step": 4315 + }, + { + "epoch": 1.4560930389347715, + "grad_norm": 3.6322373946041457, + "learning_rate": 8.316398913091639e-06, + "loss": 0.3179, + "step": 4320 + }, + { + "epoch": 1.4577785268835328, + "grad_norm": 1.5194226649440357, + "learning_rate": 8.312312083953111e-06, + "loss": 0.3192, + "step": 4325 + }, + { + "epoch": 1.459464014832294, + "grad_norm": 1.3442608838655814, + "learning_rate": 8.308221307427327e-06, + "loss": 0.3272, + "step": 4330 + }, + { + "epoch": 1.461149502781055, + "grad_norm": 1.328468345788671, + "learning_rate": 8.304126588389394e-06, + "loss": 0.2645, + "step": 4335 + }, + { + "epoch": 1.4628349907298164, + "grad_norm": 1.5432668150353683, + "learning_rate": 8.300027931719119e-06, + "loss": 0.2897, + "step": 4340 + }, + { + "epoch": 1.4645204786785775, + "grad_norm": 1.2037066930144322, + "learning_rate": 8.295925342301e-06, + "loss": 0.306, + "step": 4345 + }, + { + "epoch": 1.4662059666273386, + "grad_norm": 1.2263982324594778, + "learning_rate": 8.291818825024224e-06, + "loss": 0.2913, + "step": 4350 + }, + { + "epoch": 1.4678914545760997, + "grad_norm": 1.8086134708791748, + "learning_rate": 8.287708384782659e-06, + "loss": 0.2824, + "step": 4355 + }, + { + "epoch": 1.4695769425248608, + "grad_norm": 1.2635125137788807, + "learning_rate": 8.283594026474841e-06, + "loss": 0.3186, + "step": 4360 + }, + { + "epoch": 1.4712624304736222, + "grad_norm": 2.0545826803258076, + "learning_rate": 8.279475755003989e-06, + "loss": 0.3224, + "step": 4365 + }, + { + "epoch": 1.4729479184223833, + "grad_norm": 1.1489327919842214, + "learning_rate": 8.275353575277973e-06, + "loss": 0.311, + "step": 4370 + }, + { + "epoch": 1.4746334063711444, + "grad_norm": 1.017024562259985, + "learning_rate": 8.271227492209328e-06, + "loss": 0.3084, + "step": 4375 + }, + { + "epoch": 1.4763188943199057, + "grad_norm": 1.2031665589165061, + "learning_rate": 8.267097510715233e-06, + "loss": 0.3313, + "step": 4380 + }, + { + "epoch": 1.4780043822686668, + "grad_norm": 2.673544314333556, + "learning_rate": 8.262963635717523e-06, + "loss": 0.2996, + "step": 4385 + }, + { + "epoch": 1.479689870217428, + "grad_norm": 1.2207254874472724, + "learning_rate": 8.258825872142664e-06, + "loss": 0.3184, + "step": 4390 + }, + { + "epoch": 1.481375358166189, + "grad_norm": 1.4001335668544308, + "learning_rate": 8.254684224921764e-06, + "loss": 0.3066, + "step": 4395 + }, + { + "epoch": 1.4830608461149501, + "grad_norm": 1.012842174095787, + "learning_rate": 8.25053869899055e-06, + "loss": 0.3015, + "step": 4400 + }, + { + "epoch": 1.4847463340637115, + "grad_norm": 1.3371573242587744, + "learning_rate": 8.246389299289383e-06, + "loss": 0.3046, + "step": 4405 + }, + { + "epoch": 1.4864318220124726, + "grad_norm": 1.2286355952808135, + "learning_rate": 8.24223603076323e-06, + "loss": 0.3395, + "step": 4410 + }, + { + "epoch": 1.4881173099612337, + "grad_norm": 1.246783816902124, + "learning_rate": 8.23807889836167e-06, + "loss": 0.2952, + "step": 4415 + }, + { + "epoch": 1.489802797909995, + "grad_norm": 1.1677448910186377, + "learning_rate": 8.233917907038895e-06, + "loss": 0.3079, + "step": 4420 + }, + { + "epoch": 1.4914882858587561, + "grad_norm": 1.272006781957519, + "learning_rate": 8.229753061753688e-06, + "loss": 0.3041, + "step": 4425 + }, + { + "epoch": 1.4931737738075173, + "grad_norm": 1.1268454341228855, + "learning_rate": 8.225584367469426e-06, + "loss": 0.3101, + "step": 4430 + }, + { + "epoch": 1.4948592617562784, + "grad_norm": 1.3493446211359892, + "learning_rate": 8.221411829154076e-06, + "loss": 0.2721, + "step": 4435 + }, + { + "epoch": 1.4965447497050395, + "grad_norm": 1.1799280837923247, + "learning_rate": 8.217235451780183e-06, + "loss": 0.2994, + "step": 4440 + }, + { + "epoch": 1.4982302376538008, + "grad_norm": 1.4177878540916098, + "learning_rate": 8.213055240324868e-06, + "loss": 0.3186, + "step": 4445 + }, + { + "epoch": 1.499915725602562, + "grad_norm": 1.3044048596320261, + "learning_rate": 8.208871199769823e-06, + "loss": 0.3022, + "step": 4450 + }, + { + "epoch": 1.5016012135513233, + "grad_norm": 1.2765856761072953, + "learning_rate": 8.204683335101297e-06, + "loss": 0.3144, + "step": 4455 + }, + { + "epoch": 1.5032867015000844, + "grad_norm": 1.1136046489474536, + "learning_rate": 8.200491651310107e-06, + "loss": 0.3025, + "step": 4460 + }, + { + "epoch": 1.5049721894488455, + "grad_norm": 1.2097049173994694, + "learning_rate": 8.196296153391614e-06, + "loss": 0.3324, + "step": 4465 + }, + { + "epoch": 1.5066576773976066, + "grad_norm": 1.0633724872441128, + "learning_rate": 8.192096846345722e-06, + "loss": 0.2942, + "step": 4470 + }, + { + "epoch": 1.5083431653463677, + "grad_norm": 1.0952722866375602, + "learning_rate": 8.187893735176884e-06, + "loss": 0.2871, + "step": 4475 + }, + { + "epoch": 1.5100286532951288, + "grad_norm": 1.4591350217401189, + "learning_rate": 8.183686824894075e-06, + "loss": 0.2915, + "step": 4480 + }, + { + "epoch": 1.5117141412438901, + "grad_norm": 1.2487660488320904, + "learning_rate": 8.179476120510807e-06, + "loss": 0.2961, + "step": 4485 + }, + { + "epoch": 1.5133996291926513, + "grad_norm": 1.2986555984229826, + "learning_rate": 8.17526162704511e-06, + "loss": 0.3048, + "step": 4490 + }, + { + "epoch": 1.5150851171414126, + "grad_norm": 1.197505715598598, + "learning_rate": 8.171043349519527e-06, + "loss": 0.3099, + "step": 4495 + }, + { + "epoch": 1.5167706050901737, + "grad_norm": 1.4760600880092891, + "learning_rate": 8.166821292961114e-06, + "loss": 0.2877, + "step": 4500 + }, + { + "epoch": 1.5184560930389348, + "grad_norm": 1.0946723400424974, + "learning_rate": 8.16259546240143e-06, + "loss": 0.2639, + "step": 4505 + }, + { + "epoch": 1.520141580987696, + "grad_norm": 1.1784722534823753, + "learning_rate": 8.15836586287653e-06, + "loss": 0.2836, + "step": 4510 + }, + { + "epoch": 1.521827068936457, + "grad_norm": 1.2002264645884866, + "learning_rate": 8.154132499426963e-06, + "loss": 0.2706, + "step": 4515 + }, + { + "epoch": 1.5235125568852181, + "grad_norm": 1.331889955530758, + "learning_rate": 8.149895377097763e-06, + "loss": 0.3046, + "step": 4520 + }, + { + "epoch": 1.5251980448339795, + "grad_norm": 1.0980910173352272, + "learning_rate": 8.14565450093844e-06, + "loss": 0.277, + "step": 4525 + }, + { + "epoch": 1.5268835327827406, + "grad_norm": 1.192117036032157, + "learning_rate": 8.141409876002986e-06, + "loss": 0.3054, + "step": 4530 + }, + { + "epoch": 1.528569020731502, + "grad_norm": 1.1720833863713713, + "learning_rate": 8.13716150734985e-06, + "loss": 0.3341, + "step": 4535 + }, + { + "epoch": 1.530254508680263, + "grad_norm": 1.341175803099967, + "learning_rate": 8.132909400041946e-06, + "loss": 0.3023, + "step": 4540 + }, + { + "epoch": 1.5319399966290241, + "grad_norm": 1.2797385921810815, + "learning_rate": 8.12865355914665e-06, + "loss": 0.3192, + "step": 4545 + }, + { + "epoch": 1.5336254845777852, + "grad_norm": 1.4702793982358064, + "learning_rate": 8.124393989735782e-06, + "loss": 0.2833, + "step": 4550 + }, + { + "epoch": 1.5353109725265464, + "grad_norm": 1.4575853397613314, + "learning_rate": 8.120130696885603e-06, + "loss": 0.2759, + "step": 4555 + }, + { + "epoch": 1.5369964604753075, + "grad_norm": 1.2233305478539482, + "learning_rate": 8.115863685676815e-06, + "loss": 0.2948, + "step": 4560 + }, + { + "epoch": 1.5386819484240688, + "grad_norm": 2.161647783994973, + "learning_rate": 8.11159296119455e-06, + "loss": 0.2764, + "step": 4565 + }, + { + "epoch": 1.54036743637283, + "grad_norm": 1.2056930231950673, + "learning_rate": 8.10731852852837e-06, + "loss": 0.2867, + "step": 4570 + }, + { + "epoch": 1.5420529243215912, + "grad_norm": 1.4206227206884128, + "learning_rate": 8.103040392772245e-06, + "loss": 0.315, + "step": 4575 + }, + { + "epoch": 1.5437384122703524, + "grad_norm": 6.329456167588222, + "learning_rate": 8.098758559024569e-06, + "loss": 0.2887, + "step": 4580 + }, + { + "epoch": 1.5454239002191135, + "grad_norm": 1.1488924467484576, + "learning_rate": 8.094473032388137e-06, + "loss": 0.2988, + "step": 4585 + }, + { + "epoch": 1.5471093881678746, + "grad_norm": 1.230612016638428, + "learning_rate": 8.09018381797015e-06, + "loss": 0.3103, + "step": 4590 + }, + { + "epoch": 1.5487948761166357, + "grad_norm": 1.12166613113956, + "learning_rate": 8.0858909208822e-06, + "loss": 0.2963, + "step": 4595 + }, + { + "epoch": 1.5504803640653968, + "grad_norm": 1.848923325989595, + "learning_rate": 8.081594346240266e-06, + "loss": 0.3398, + "step": 4600 + }, + { + "epoch": 1.5521658520141581, + "grad_norm": 1.47462317560063, + "learning_rate": 8.077294099164714e-06, + "loss": 0.295, + "step": 4605 + }, + { + "epoch": 1.5538513399629192, + "grad_norm": 1.3011799325240747, + "learning_rate": 8.072990184780281e-06, + "loss": 0.2865, + "step": 4610 + }, + { + "epoch": 1.5555368279116806, + "grad_norm": 1.7138001368822846, + "learning_rate": 8.068682608216086e-06, + "loss": 0.2976, + "step": 4615 + }, + { + "epoch": 1.5572223158604417, + "grad_norm": 1.3081338244407001, + "learning_rate": 8.064371374605595e-06, + "loss": 0.2932, + "step": 4620 + }, + { + "epoch": 1.5589078038092028, + "grad_norm": 1.3602376615238971, + "learning_rate": 8.06005648908665e-06, + "loss": 0.3333, + "step": 4625 + }, + { + "epoch": 1.560593291757964, + "grad_norm": 1.2035691797270929, + "learning_rate": 8.05573795680143e-06, + "loss": 0.3063, + "step": 4630 + }, + { + "epoch": 1.562278779706725, + "grad_norm": 1.106948889784737, + "learning_rate": 8.051415782896473e-06, + "loss": 0.3076, + "step": 4635 + }, + { + "epoch": 1.5639642676554861, + "grad_norm": 1.639823888733204, + "learning_rate": 8.047089972522646e-06, + "loss": 0.3059, + "step": 4640 + }, + { + "epoch": 1.5656497556042475, + "grad_norm": 1.118879469886603, + "learning_rate": 8.042760530835158e-06, + "loss": 0.3031, + "step": 4645 + }, + { + "epoch": 1.5673352435530086, + "grad_norm": 1.802512942743598, + "learning_rate": 8.038427462993536e-06, + "loss": 0.283, + "step": 4650 + }, + { + "epoch": 1.56902073150177, + "grad_norm": 1.4894578458392649, + "learning_rate": 8.03409077416164e-06, + "loss": 0.3001, + "step": 4655 + }, + { + "epoch": 1.570706219450531, + "grad_norm": 1.257693654239823, + "learning_rate": 8.029750469507637e-06, + "loss": 0.2925, + "step": 4660 + }, + { + "epoch": 1.5723917073992921, + "grad_norm": 1.6453473842466895, + "learning_rate": 8.025406554204007e-06, + "loss": 0.3115, + "step": 4665 + }, + { + "epoch": 1.5740771953480532, + "grad_norm": 1.2605455880797907, + "learning_rate": 8.02105903342753e-06, + "loss": 0.3098, + "step": 4670 + }, + { + "epoch": 1.5757626832968143, + "grad_norm": 1.1867392634727372, + "learning_rate": 8.016707912359284e-06, + "loss": 0.3432, + "step": 4675 + }, + { + "epoch": 1.5774481712455755, + "grad_norm": 1.2784850432033157, + "learning_rate": 8.01235319618464e-06, + "loss": 0.2897, + "step": 4680 + }, + { + "epoch": 1.5791336591943368, + "grad_norm": 1.7918357849402966, + "learning_rate": 8.007994890093247e-06, + "loss": 0.2791, + "step": 4685 + }, + { + "epoch": 1.580819147143098, + "grad_norm": 1.199205983179323, + "learning_rate": 8.00363299927904e-06, + "loss": 0.2937, + "step": 4690 + }, + { + "epoch": 1.5825046350918592, + "grad_norm": 1.2766814868215317, + "learning_rate": 7.999267528940225e-06, + "loss": 0.3145, + "step": 4695 + }, + { + "epoch": 1.5841901230406203, + "grad_norm": 1.233786588574404, + "learning_rate": 7.994898484279265e-06, + "loss": 0.2929, + "step": 4700 + }, + { + "epoch": 1.5858756109893815, + "grad_norm": 2.0911993189513085, + "learning_rate": 7.990525870502893e-06, + "loss": 0.3028, + "step": 4705 + }, + { + "epoch": 1.5875610989381426, + "grad_norm": 1.1632752318696185, + "learning_rate": 7.986149692822089e-06, + "loss": 0.314, + "step": 4710 + }, + { + "epoch": 1.5892465868869037, + "grad_norm": 1.2976478244391174, + "learning_rate": 7.981769956452085e-06, + "loss": 0.3112, + "step": 4715 + }, + { + "epoch": 1.5909320748356648, + "grad_norm": 1.3494201754916226, + "learning_rate": 7.97738666661235e-06, + "loss": 0.2972, + "step": 4720 + }, + { + "epoch": 1.5926175627844261, + "grad_norm": 1.307208309529107, + "learning_rate": 7.97299982852659e-06, + "loss": 0.2978, + "step": 4725 + }, + { + "epoch": 1.5943030507331872, + "grad_norm": 1.3093524249606938, + "learning_rate": 7.96860944742274e-06, + "loss": 0.2823, + "step": 4730 + }, + { + "epoch": 1.5959885386819486, + "grad_norm": 1.191591564059639, + "learning_rate": 7.964215528532955e-06, + "loss": 0.2814, + "step": 4735 + }, + { + "epoch": 1.5976740266307097, + "grad_norm": 1.2549526964375906, + "learning_rate": 7.959818077093605e-06, + "loss": 0.3136, + "step": 4740 + }, + { + "epoch": 1.5993595145794708, + "grad_norm": 1.321086131577103, + "learning_rate": 7.955417098345277e-06, + "loss": 0.2951, + "step": 4745 + }, + { + "epoch": 1.601045002528232, + "grad_norm": 1.2550529012391063, + "learning_rate": 7.951012597532755e-06, + "loss": 0.3145, + "step": 4750 + }, + { + "epoch": 1.602730490476993, + "grad_norm": 1.1809960009492033, + "learning_rate": 7.94660457990502e-06, + "loss": 0.2839, + "step": 4755 + }, + { + "epoch": 1.6044159784257541, + "grad_norm": 1.106482264060698, + "learning_rate": 7.942193050715248e-06, + "loss": 0.2976, + "step": 4760 + }, + { + "epoch": 1.6061014663745155, + "grad_norm": 1.072157217080442, + "learning_rate": 7.937778015220798e-06, + "loss": 0.2987, + "step": 4765 + }, + { + "epoch": 1.6077869543232766, + "grad_norm": 1.1474457940023302, + "learning_rate": 7.93335947868321e-06, + "loss": 0.2596, + "step": 4770 + }, + { + "epoch": 1.609472442272038, + "grad_norm": 2.0507615381539632, + "learning_rate": 7.92893744636819e-06, + "loss": 0.3031, + "step": 4775 + }, + { + "epoch": 1.611157930220799, + "grad_norm": 1.3830108942662933, + "learning_rate": 7.924511923545615e-06, + "loss": 0.3239, + "step": 4780 + }, + { + "epoch": 1.6128434181695601, + "grad_norm": 1.4290445792176358, + "learning_rate": 7.920082915489521e-06, + "loss": 0.293, + "step": 4785 + }, + { + "epoch": 1.6145289061183212, + "grad_norm": 3.1474688819918732, + "learning_rate": 7.9156504274781e-06, + "loss": 0.3222, + "step": 4790 + }, + { + "epoch": 1.6162143940670823, + "grad_norm": 1.7662624740688317, + "learning_rate": 7.911214464793687e-06, + "loss": 0.2554, + "step": 4795 + }, + { + "epoch": 1.6178998820158434, + "grad_norm": 1.4862287784132144, + "learning_rate": 7.906775032722755e-06, + "loss": 0.2722, + "step": 4800 + }, + { + "epoch": 1.6195853699646048, + "grad_norm": 8.026673170688301, + "learning_rate": 7.90233213655592e-06, + "loss": 0.3004, + "step": 4805 + }, + { + "epoch": 1.621270857913366, + "grad_norm": 1.2758282965884276, + "learning_rate": 7.897885781587924e-06, + "loss": 0.2934, + "step": 4810 + }, + { + "epoch": 1.6229563458621272, + "grad_norm": 1.5636493108431386, + "learning_rate": 7.893435973117625e-06, + "loss": 0.314, + "step": 4815 + }, + { + "epoch": 1.6246418338108883, + "grad_norm": 1.4426404779447306, + "learning_rate": 7.888982716448001e-06, + "loss": 0.2966, + "step": 4820 + }, + { + "epoch": 1.6263273217596494, + "grad_norm": 2.2562300197195286, + "learning_rate": 7.884526016886142e-06, + "loss": 0.3111, + "step": 4825 + }, + { + "epoch": 1.6280128097084106, + "grad_norm": 1.145792464399853, + "learning_rate": 7.880065879743236e-06, + "loss": 0.2822, + "step": 4830 + }, + { + "epoch": 1.6296982976571717, + "grad_norm": 1.209364419222678, + "learning_rate": 7.875602310334571e-06, + "loss": 0.2986, + "step": 4835 + }, + { + "epoch": 1.6313837856059328, + "grad_norm": 1.158602003168086, + "learning_rate": 7.87113531397952e-06, + "loss": 0.3255, + "step": 4840 + }, + { + "epoch": 1.6330692735546941, + "grad_norm": 5.218568637065671, + "learning_rate": 7.86666489600155e-06, + "loss": 0.3227, + "step": 4845 + }, + { + "epoch": 1.6347547615034552, + "grad_norm": 0.9976104195555039, + "learning_rate": 7.862191061728196e-06, + "loss": 0.2865, + "step": 4850 + }, + { + "epoch": 1.6364402494522166, + "grad_norm": 2.569454658042009, + "learning_rate": 7.85771381649107e-06, + "loss": 0.2835, + "step": 4855 + }, + { + "epoch": 1.6381257374009777, + "grad_norm": 1.2081999103176555, + "learning_rate": 7.853233165625846e-06, + "loss": 0.3128, + "step": 4860 + }, + { + "epoch": 1.6398112253497388, + "grad_norm": 1.1248594021068017, + "learning_rate": 7.848749114472258e-06, + "loss": 0.3151, + "step": 4865 + }, + { + "epoch": 1.6414967132984999, + "grad_norm": 2.4270155350719578, + "learning_rate": 7.84426166837409e-06, + "loss": 0.3076, + "step": 4870 + }, + { + "epoch": 1.643182201247261, + "grad_norm": 1.2266730638699697, + "learning_rate": 7.839770832679176e-06, + "loss": 0.2993, + "step": 4875 + }, + { + "epoch": 1.644867689196022, + "grad_norm": 1.1803866832834962, + "learning_rate": 7.835276612739386e-06, + "loss": 0.2779, + "step": 4880 + }, + { + "epoch": 1.6465531771447834, + "grad_norm": 1.4724741668831578, + "learning_rate": 7.830779013910626e-06, + "loss": 0.3018, + "step": 4885 + }, + { + "epoch": 1.6482386650935446, + "grad_norm": 1.347302301416293, + "learning_rate": 7.826278041552824e-06, + "loss": 0.3168, + "step": 4890 + }, + { + "epoch": 1.6499241530423059, + "grad_norm": 1.1208931792241335, + "learning_rate": 7.821773701029934e-06, + "loss": 0.3034, + "step": 4895 + }, + { + "epoch": 1.651609640991067, + "grad_norm": 1.5426746365548927, + "learning_rate": 7.81726599770992e-06, + "loss": 0.2516, + "step": 4900 + }, + { + "epoch": 1.653295128939828, + "grad_norm": 1.1142651926486773, + "learning_rate": 7.812754936964758e-06, + "loss": 0.2831, + "step": 4905 + }, + { + "epoch": 1.6549806168885892, + "grad_norm": 1.259520493787399, + "learning_rate": 7.808240524170414e-06, + "loss": 0.2786, + "step": 4910 + }, + { + "epoch": 1.6566661048373503, + "grad_norm": 1.4754317494045115, + "learning_rate": 7.803722764706865e-06, + "loss": 0.2958, + "step": 4915 + }, + { + "epoch": 1.6583515927861114, + "grad_norm": 1.1171716700419523, + "learning_rate": 7.799201663958066e-06, + "loss": 0.2809, + "step": 4920 + }, + { + "epoch": 1.6600370807348728, + "grad_norm": 1.2908279148621562, + "learning_rate": 7.794677227311954e-06, + "loss": 0.2991, + "step": 4925 + }, + { + "epoch": 1.6617225686836339, + "grad_norm": 1.3433670764531915, + "learning_rate": 7.790149460160445e-06, + "loss": 0.2926, + "step": 4930 + }, + { + "epoch": 1.6634080566323952, + "grad_norm": 1.2218216858309807, + "learning_rate": 7.785618367899421e-06, + "loss": 0.2917, + "step": 4935 + }, + { + "epoch": 1.6650935445811563, + "grad_norm": 1.2990524842912665, + "learning_rate": 7.781083955928732e-06, + "loss": 0.2952, + "step": 4940 + }, + { + "epoch": 1.6667790325299174, + "grad_norm": 1.6785603900156052, + "learning_rate": 7.776546229652175e-06, + "loss": 0.3159, + "step": 4945 + }, + { + "epoch": 1.6684645204786785, + "grad_norm": 1.3172034286260808, + "learning_rate": 7.772005194477506e-06, + "loss": 0.2849, + "step": 4950 + }, + { + "epoch": 1.6701500084274397, + "grad_norm": 1.7712583785514993, + "learning_rate": 7.76746085581642e-06, + "loss": 0.2966, + "step": 4955 + }, + { + "epoch": 1.6718354963762008, + "grad_norm": 1.3680930568241578, + "learning_rate": 7.762913219084549e-06, + "loss": 0.3021, + "step": 4960 + }, + { + "epoch": 1.673520984324962, + "grad_norm": 1.8731405239920322, + "learning_rate": 7.758362289701456e-06, + "loss": 0.2905, + "step": 4965 + }, + { + "epoch": 1.6752064722737232, + "grad_norm": 1.2605432916849957, + "learning_rate": 7.753808073090626e-06, + "loss": 0.3222, + "step": 4970 + }, + { + "epoch": 1.6768919602224845, + "grad_norm": 2.0933592871449385, + "learning_rate": 7.749250574679466e-06, + "loss": 0.3004, + "step": 4975 + }, + { + "epoch": 1.6785774481712457, + "grad_norm": 1.2951973558763388, + "learning_rate": 7.74468979989929e-06, + "loss": 0.3356, + "step": 4980 + }, + { + "epoch": 1.6802629361200068, + "grad_norm": 1.1121691174536121, + "learning_rate": 7.740125754185316e-06, + "loss": 0.2761, + "step": 4985 + }, + { + "epoch": 1.6819484240687679, + "grad_norm": 1.247648286765324, + "learning_rate": 7.735558442976665e-06, + "loss": 0.3005, + "step": 4990 + }, + { + "epoch": 1.683633912017529, + "grad_norm": 1.3107306413155002, + "learning_rate": 7.730987871716343e-06, + "loss": 0.296, + "step": 4995 + }, + { + "epoch": 1.68531939996629, + "grad_norm": 1.6659054480184692, + "learning_rate": 7.72641404585125e-06, + "loss": 0.3117, + "step": 5000 + }, + { + "epoch": 1.6870048879150514, + "grad_norm": 1.1094262335217973, + "learning_rate": 7.721836970832154e-06, + "loss": 0.2652, + "step": 5005 + }, + { + "epoch": 1.6886903758638125, + "grad_norm": 1.9930822705353428, + "learning_rate": 7.717256652113701e-06, + "loss": 0.2807, + "step": 5010 + }, + { + "epoch": 1.6903758638125739, + "grad_norm": 1.1554646427441158, + "learning_rate": 7.712673095154403e-06, + "loss": 0.3323, + "step": 5015 + }, + { + "epoch": 1.692061351761335, + "grad_norm": 1.1397588826811031, + "learning_rate": 7.708086305416633e-06, + "loss": 0.271, + "step": 5020 + }, + { + "epoch": 1.693746839710096, + "grad_norm": 1.5885004049421636, + "learning_rate": 7.703496288366608e-06, + "loss": 0.2656, + "step": 5025 + }, + { + "epoch": 1.6954323276588572, + "grad_norm": 1.146638763639586, + "learning_rate": 7.698903049474402e-06, + "loss": 0.2941, + "step": 5030 + }, + { + "epoch": 1.6971178156076183, + "grad_norm": 1.151903093184447, + "learning_rate": 7.69430659421392e-06, + "loss": 0.305, + "step": 5035 + }, + { + "epoch": 1.6988033035563794, + "grad_norm": 1.21754582792958, + "learning_rate": 7.6897069280629e-06, + "loss": 0.3071, + "step": 5040 + }, + { + "epoch": 1.7004887915051408, + "grad_norm": 1.6576276575339772, + "learning_rate": 7.68510405650292e-06, + "loss": 0.3179, + "step": 5045 + }, + { + "epoch": 1.7021742794539019, + "grad_norm": 1.8043608824075692, + "learning_rate": 7.68049798501936e-06, + "loss": 0.2804, + "step": 5050 + }, + { + "epoch": 1.7038597674026632, + "grad_norm": 1.5070951031781703, + "learning_rate": 7.675888719101422e-06, + "loss": 0.2933, + "step": 5055 + }, + { + "epoch": 1.7055452553514243, + "grad_norm": 1.436825831743727, + "learning_rate": 7.671276264242116e-06, + "loss": 0.2736, + "step": 5060 + }, + { + "epoch": 1.7072307433001854, + "grad_norm": 1.6490603638595753, + "learning_rate": 7.666660625938252e-06, + "loss": 0.3293, + "step": 5065 + }, + { + "epoch": 1.7089162312489465, + "grad_norm": 1.555207873204316, + "learning_rate": 7.662041809690428e-06, + "loss": 0.2872, + "step": 5070 + }, + { + "epoch": 1.7106017191977076, + "grad_norm": 1.4334715461537986, + "learning_rate": 7.657419821003038e-06, + "loss": 0.3013, + "step": 5075 + }, + { + "epoch": 1.7122872071464688, + "grad_norm": 1.4272073998025994, + "learning_rate": 7.652794665384249e-06, + "loss": 0.3176, + "step": 5080 + }, + { + "epoch": 1.71397269509523, + "grad_norm": 1.501271647310636, + "learning_rate": 7.648166348346009e-06, + "loss": 0.2593, + "step": 5085 + }, + { + "epoch": 1.7156581830439912, + "grad_norm": 1.355921776107784, + "learning_rate": 7.643534875404028e-06, + "loss": 0.2972, + "step": 5090 + }, + { + "epoch": 1.7173436709927525, + "grad_norm": 1.255616076107906, + "learning_rate": 7.638900252077778e-06, + "loss": 0.3307, + "step": 5095 + }, + { + "epoch": 1.7190291589415136, + "grad_norm": 1.3006104131596286, + "learning_rate": 7.634262483890487e-06, + "loss": 0.2729, + "step": 5100 + }, + { + "epoch": 1.7207146468902748, + "grad_norm": 6.283131471658594, + "learning_rate": 7.629621576369132e-06, + "loss": 0.2965, + "step": 5105 + }, + { + "epoch": 1.7224001348390359, + "grad_norm": 13.6092368979776, + "learning_rate": 7.624977535044429e-06, + "loss": 0.2836, + "step": 5110 + }, + { + "epoch": 1.724085622787797, + "grad_norm": 1.4047936498728297, + "learning_rate": 7.620330365450828e-06, + "loss": 0.2919, + "step": 5115 + }, + { + "epoch": 1.725771110736558, + "grad_norm": 1.5546612805155546, + "learning_rate": 7.6156800731265076e-06, + "loss": 0.2993, + "step": 5120 + }, + { + "epoch": 1.7274565986853194, + "grad_norm": 3.012391619364554, + "learning_rate": 7.6110266636133725e-06, + "loss": 0.2901, + "step": 5125 + }, + { + "epoch": 1.7291420866340805, + "grad_norm": 1.4088814643004863, + "learning_rate": 7.606370142457033e-06, + "loss": 0.3075, + "step": 5130 + }, + { + "epoch": 1.7308275745828419, + "grad_norm": 1.265940304384725, + "learning_rate": 7.601710515206816e-06, + "loss": 0.2977, + "step": 5135 + }, + { + "epoch": 1.732513062531603, + "grad_norm": 11.535301658136971, + "learning_rate": 7.597047787415746e-06, + "loss": 0.3029, + "step": 5140 + }, + { + "epoch": 1.734198550480364, + "grad_norm": 1.5396122749245849, + "learning_rate": 7.592381964640545e-06, + "loss": 0.2973, + "step": 5145 + }, + { + "epoch": 1.7358840384291252, + "grad_norm": 4.016808949745366, + "learning_rate": 7.587713052441621e-06, + "loss": 0.3195, + "step": 5150 + }, + { + "epoch": 1.7375695263778863, + "grad_norm": 1.530371697029571, + "learning_rate": 7.583041056383063e-06, + "loss": 0.2877, + "step": 5155 + }, + { + "epoch": 1.7392550143266474, + "grad_norm": 1.7382367198655937, + "learning_rate": 7.578365982032637e-06, + "loss": 0.305, + "step": 5160 + }, + { + "epoch": 1.7409405022754088, + "grad_norm": 2.2514394592699647, + "learning_rate": 7.57368783496178e-06, + "loss": 0.3182, + "step": 5165 + }, + { + "epoch": 1.7426259902241699, + "grad_norm": 1.3482588616499345, + "learning_rate": 7.569006620745586e-06, + "loss": 0.3009, + "step": 5170 + }, + { + "epoch": 1.7443114781729312, + "grad_norm": 1.42929400463226, + "learning_rate": 7.5643223449628066e-06, + "loss": 0.2929, + "step": 5175 + }, + { + "epoch": 1.7459969661216923, + "grad_norm": 1.200068728553629, + "learning_rate": 7.559635013195841e-06, + "loss": 0.2762, + "step": 5180 + }, + { + "epoch": 1.7476824540704534, + "grad_norm": 1.086195312675611, + "learning_rate": 7.554944631030732e-06, + "loss": 0.2714, + "step": 5185 + }, + { + "epoch": 1.7493679420192145, + "grad_norm": 1.390530215876357, + "learning_rate": 7.550251204057156e-06, + "loss": 0.2981, + "step": 5190 + }, + { + "epoch": 1.7510534299679756, + "grad_norm": 1.6887386714329724, + "learning_rate": 7.545554737868419e-06, + "loss": 0.2663, + "step": 5195 + }, + { + "epoch": 1.7527389179167367, + "grad_norm": 1.3929222748367942, + "learning_rate": 7.5408552380614486e-06, + "loss": 0.2699, + "step": 5200 + }, + { + "epoch": 1.754424405865498, + "grad_norm": 1.1704820389857176, + "learning_rate": 7.536152710236787e-06, + "loss": 0.2762, + "step": 5205 + }, + { + "epoch": 1.7561098938142592, + "grad_norm": 1.3168149746230169, + "learning_rate": 7.531447159998586e-06, + "loss": 0.295, + "step": 5210 + }, + { + "epoch": 1.7577953817630205, + "grad_norm": 1.1274704294658016, + "learning_rate": 7.526738592954599e-06, + "loss": 0.2718, + "step": 5215 + }, + { + "epoch": 1.7594808697117816, + "grad_norm": 1.4865898178748014, + "learning_rate": 7.522027014716176e-06, + "loss": 0.3192, + "step": 5220 + }, + { + "epoch": 1.7611663576605427, + "grad_norm": 1.351572443814194, + "learning_rate": 7.517312430898252e-06, + "loss": 0.2785, + "step": 5225 + }, + { + "epoch": 1.7628518456093039, + "grad_norm": 1.4318309186592089, + "learning_rate": 7.512594847119345e-06, + "loss": 0.2647, + "step": 5230 + }, + { + "epoch": 1.764537333558065, + "grad_norm": 2.6422620160818133, + "learning_rate": 7.50787426900155e-06, + "loss": 0.3183, + "step": 5235 + }, + { + "epoch": 1.766222821506826, + "grad_norm": 1.9544232382145845, + "learning_rate": 7.503150702170532e-06, + "loss": 0.2868, + "step": 5240 + }, + { + "epoch": 1.7679083094555874, + "grad_norm": 1.3910226812737208, + "learning_rate": 7.498424152255512e-06, + "loss": 0.2983, + "step": 5245 + }, + { + "epoch": 1.7695937974043485, + "grad_norm": 1.4395515103799448, + "learning_rate": 7.493694624889272e-06, + "loss": 0.3066, + "step": 5250 + }, + { + "epoch": 1.7712792853531099, + "grad_norm": 1.1222566371604252, + "learning_rate": 7.488962125708137e-06, + "loss": 0.281, + "step": 5255 + }, + { + "epoch": 1.772964773301871, + "grad_norm": 1.546040473023318, + "learning_rate": 7.484226660351979e-06, + "loss": 0.2995, + "step": 5260 + }, + { + "epoch": 1.774650261250632, + "grad_norm": 1.574776195213442, + "learning_rate": 7.479488234464198e-06, + "loss": 0.3102, + "step": 5265 + }, + { + "epoch": 1.7763357491993932, + "grad_norm": 1.327888821430515, + "learning_rate": 7.47474685369173e-06, + "loss": 0.2739, + "step": 5270 + }, + { + "epoch": 1.7780212371481543, + "grad_norm": 2.4448341717068507, + "learning_rate": 7.470002523685027e-06, + "loss": 0.2823, + "step": 5275 + }, + { + "epoch": 1.7797067250969154, + "grad_norm": 1.546366526166282, + "learning_rate": 7.465255250098059e-06, + "loss": 0.2817, + "step": 5280 + }, + { + "epoch": 1.7813922130456767, + "grad_norm": 1.072940090664181, + "learning_rate": 7.460505038588299e-06, + "loss": 0.2881, + "step": 5285 + }, + { + "epoch": 1.7830777009944379, + "grad_norm": 1.156854830692132, + "learning_rate": 7.4557518948167295e-06, + "loss": 0.2658, + "step": 5290 + }, + { + "epoch": 1.7847631889431992, + "grad_norm": 1.265013609236647, + "learning_rate": 7.450995824447817e-06, + "loss": 0.304, + "step": 5295 + }, + { + "epoch": 1.7864486768919603, + "grad_norm": 1.2731846527562656, + "learning_rate": 7.446236833149527e-06, + "loss": 0.3056, + "step": 5300 + }, + { + "epoch": 1.7881341648407214, + "grad_norm": 1.2328738628513, + "learning_rate": 7.4414749265932955e-06, + "loss": 0.291, + "step": 5305 + }, + { + "epoch": 1.7898196527894825, + "grad_norm": 1.2645077337917259, + "learning_rate": 7.436710110454039e-06, + "loss": 0.2879, + "step": 5310 + }, + { + "epoch": 1.7915051407382436, + "grad_norm": 11.884705169991397, + "learning_rate": 7.431942390410141e-06, + "loss": 0.2696, + "step": 5315 + }, + { + "epoch": 1.7931906286870047, + "grad_norm": 1.4416266618735254, + "learning_rate": 7.427171772143442e-06, + "loss": 0.3085, + "step": 5320 + }, + { + "epoch": 1.794876116635766, + "grad_norm": 1.3103910235201572, + "learning_rate": 7.4223982613392424e-06, + "loss": 0.2962, + "step": 5325 + }, + { + "epoch": 1.7965616045845272, + "grad_norm": 1.297116465661714, + "learning_rate": 7.417621863686283e-06, + "loss": 0.2865, + "step": 5330 + }, + { + "epoch": 1.7982470925332885, + "grad_norm": 1.394746946999585, + "learning_rate": 7.412842584876749e-06, + "loss": 0.2926, + "step": 5335 + }, + { + "epoch": 1.7999325804820496, + "grad_norm": 1.0633688131220695, + "learning_rate": 7.4080604306062605e-06, + "loss": 0.2627, + "step": 5340 + }, + { + "epoch": 1.8016180684308107, + "grad_norm": 1.4620745451668324, + "learning_rate": 7.40327540657386e-06, + "loss": 0.2914, + "step": 5345 + }, + { + "epoch": 1.8033035563795718, + "grad_norm": 1.4085668488993912, + "learning_rate": 7.398487518482013e-06, + "loss": 0.2965, + "step": 5350 + }, + { + "epoch": 1.804989044328333, + "grad_norm": 1.0751425446009926, + "learning_rate": 7.393696772036598e-06, + "loss": 0.2943, + "step": 5355 + }, + { + "epoch": 1.806674532277094, + "grad_norm": 1.2355772902782536, + "learning_rate": 7.388903172946897e-06, + "loss": 0.2725, + "step": 5360 + }, + { + "epoch": 1.8083600202258554, + "grad_norm": 1.46332548505368, + "learning_rate": 7.384106726925597e-06, + "loss": 0.271, + "step": 5365 + }, + { + "epoch": 1.8100455081746165, + "grad_norm": 1.3327958403232665, + "learning_rate": 7.3793074396887735e-06, + "loss": 0.257, + "step": 5370 + }, + { + "epoch": 1.8117309961233778, + "grad_norm": 1.27159069423012, + "learning_rate": 7.374505316955889e-06, + "loss": 0.2997, + "step": 5375 + }, + { + "epoch": 1.813416484072139, + "grad_norm": 1.1136873534765386, + "learning_rate": 7.369700364449783e-06, + "loss": 0.2813, + "step": 5380 + }, + { + "epoch": 1.8151019720209, + "grad_norm": 1.058980131231453, + "learning_rate": 7.364892587896675e-06, + "loss": 0.2851, + "step": 5385 + }, + { + "epoch": 1.8167874599696612, + "grad_norm": 1.153665868223348, + "learning_rate": 7.3600819930261406e-06, + "loss": 0.2807, + "step": 5390 + }, + { + "epoch": 1.8184729479184223, + "grad_norm": 1.094878820896215, + "learning_rate": 7.355268585571119e-06, + "loss": 0.2491, + "step": 5395 + }, + { + "epoch": 1.8201584358671834, + "grad_norm": 3.1535980383592435, + "learning_rate": 7.3504523712679e-06, + "loss": 0.2735, + "step": 5400 + }, + { + "epoch": 1.8218439238159447, + "grad_norm": 1.1158721579623263, + "learning_rate": 7.34563335585612e-06, + "loss": 0.2808, + "step": 5405 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 1.1810109517856728, + "learning_rate": 7.340811545078751e-06, + "loss": 0.2849, + "step": 5410 + }, + { + "epoch": 1.8252148997134672, + "grad_norm": 2.6921207195719474, + "learning_rate": 7.3359869446820985e-06, + "loss": 0.276, + "step": 5415 + }, + { + "epoch": 1.8269003876622283, + "grad_norm": 1.3992211796492626, + "learning_rate": 7.331159560415791e-06, + "loss": 0.2739, + "step": 5420 + }, + { + "epoch": 1.8285858756109894, + "grad_norm": 1.3720595724897235, + "learning_rate": 7.3263293980327765e-06, + "loss": 0.2668, + "step": 5425 + }, + { + "epoch": 1.8302713635597505, + "grad_norm": 1.469896908485536, + "learning_rate": 7.321496463289311e-06, + "loss": 0.2625, + "step": 5430 + }, + { + "epoch": 1.8319568515085116, + "grad_norm": 1.8485122111038519, + "learning_rate": 7.316660761944957e-06, + "loss": 0.291, + "step": 5435 + }, + { + "epoch": 1.8336423394572727, + "grad_norm": 10.340467741077116, + "learning_rate": 7.311822299762573e-06, + "loss": 0.2861, + "step": 5440 + }, + { + "epoch": 1.835327827406034, + "grad_norm": 1.7422431249626944, + "learning_rate": 7.306981082508307e-06, + "loss": 0.2871, + "step": 5445 + }, + { + "epoch": 1.8370133153547952, + "grad_norm": 1.1306638443027448, + "learning_rate": 7.3021371159515915e-06, + "loss": 0.2932, + "step": 5450 + }, + { + "epoch": 1.8386988033035565, + "grad_norm": 1.1140792597783495, + "learning_rate": 7.297290405865136e-06, + "loss": 0.2966, + "step": 5455 + }, + { + "epoch": 1.8403842912523176, + "grad_norm": 1.212632304429544, + "learning_rate": 7.292440958024916e-06, + "loss": 0.2899, + "step": 5460 + }, + { + "epoch": 1.8420697792010787, + "grad_norm": 1.3938375232153972, + "learning_rate": 7.287588778210174e-06, + "loss": 0.2977, + "step": 5465 + }, + { + "epoch": 1.8437552671498398, + "grad_norm": 1.3339246920805004, + "learning_rate": 7.282733872203405e-06, + "loss": 0.3031, + "step": 5470 + }, + { + "epoch": 1.845440755098601, + "grad_norm": 1.6499477818188544, + "learning_rate": 7.277876245790357e-06, + "loss": 0.2944, + "step": 5475 + }, + { + "epoch": 1.847126243047362, + "grad_norm": 1.4968414796972271, + "learning_rate": 7.273015904760014e-06, + "loss": 0.2883, + "step": 5480 + }, + { + "epoch": 1.8488117309961234, + "grad_norm": 1.6850501788811187, + "learning_rate": 7.2681528549046e-06, + "loss": 0.2729, + "step": 5485 + }, + { + "epoch": 1.8504972189448845, + "grad_norm": 1.442047666067804, + "learning_rate": 7.263287102019565e-06, + "loss": 0.2266, + "step": 5490 + }, + { + "epoch": 1.8521827068936458, + "grad_norm": 1.5323602110070451, + "learning_rate": 7.2584186519035815e-06, + "loss": 0.2939, + "step": 5495 + }, + { + "epoch": 1.853868194842407, + "grad_norm": 1.392918605066643, + "learning_rate": 7.253547510358536e-06, + "loss": 0.29, + "step": 5500 + }, + { + "epoch": 1.855553682791168, + "grad_norm": 2.317316796924633, + "learning_rate": 7.248673683189522e-06, + "loss": 0.3041, + "step": 5505 + }, + { + "epoch": 1.8572391707399292, + "grad_norm": 1.1660872257371953, + "learning_rate": 7.243797176204833e-06, + "loss": 0.2885, + "step": 5510 + }, + { + "epoch": 1.8589246586886903, + "grad_norm": 1.180652848669013, + "learning_rate": 7.238917995215957e-06, + "loss": 0.2967, + "step": 5515 + }, + { + "epoch": 1.8606101466374514, + "grad_norm": 1.431117199613125, + "learning_rate": 7.234036146037571e-06, + "loss": 0.2825, + "step": 5520 + }, + { + "epoch": 1.8622956345862127, + "grad_norm": 1.608766155824911, + "learning_rate": 7.229151634487526e-06, + "loss": 0.2869, + "step": 5525 + }, + { + "epoch": 1.8639811225349738, + "grad_norm": 1.194041242448981, + "learning_rate": 7.22426446638685e-06, + "loss": 0.2763, + "step": 5530 + }, + { + "epoch": 1.8656666104837352, + "grad_norm": 1.4822426907606354, + "learning_rate": 7.219374647559737e-06, + "loss": 0.2702, + "step": 5535 + }, + { + "epoch": 1.8673520984324963, + "grad_norm": 1.5813532585445986, + "learning_rate": 7.21448218383354e-06, + "loss": 0.2575, + "step": 5540 + }, + { + "epoch": 1.8690375863812574, + "grad_norm": 4.162789690078896, + "learning_rate": 7.209587081038761e-06, + "loss": 0.2831, + "step": 5545 + }, + { + "epoch": 1.8707230743300185, + "grad_norm": 2.1941907386004, + "learning_rate": 7.2046893450090485e-06, + "loss": 0.302, + "step": 5550 + }, + { + "epoch": 1.8724085622787796, + "grad_norm": 1.2844626457888235, + "learning_rate": 7.199788981581191e-06, + "loss": 0.2751, + "step": 5555 + }, + { + "epoch": 1.8740940502275407, + "grad_norm": 2.1417066213451643, + "learning_rate": 7.194885996595109e-06, + "loss": 0.2634, + "step": 5560 + }, + { + "epoch": 1.875779538176302, + "grad_norm": 1.567906672664597, + "learning_rate": 7.189980395893841e-06, + "loss": 0.2817, + "step": 5565 + }, + { + "epoch": 1.8774650261250632, + "grad_norm": 1.4640132987361372, + "learning_rate": 7.185072185323548e-06, + "loss": 0.2739, + "step": 5570 + }, + { + "epoch": 1.8791505140738245, + "grad_norm": 1.1831500326551032, + "learning_rate": 7.1801613707335015e-06, + "loss": 0.2785, + "step": 5575 + }, + { + "epoch": 1.8808360020225856, + "grad_norm": 1.5229423836919356, + "learning_rate": 7.175247957976075e-06, + "loss": 0.2763, + "step": 5580 + }, + { + "epoch": 1.8825214899713467, + "grad_norm": 1.4154336466192219, + "learning_rate": 7.170331952906737e-06, + "loss": 0.2969, + "step": 5585 + }, + { + "epoch": 1.8842069779201078, + "grad_norm": 1.2176116045878531, + "learning_rate": 7.165413361384046e-06, + "loss": 0.2854, + "step": 5590 + }, + { + "epoch": 1.885892465868869, + "grad_norm": 1.317744924086943, + "learning_rate": 7.1604921892696434e-06, + "loss": 0.3131, + "step": 5595 + }, + { + "epoch": 1.88757795381763, + "grad_norm": 1.210784995396025, + "learning_rate": 7.155568442428248e-06, + "loss": 0.2886, + "step": 5600 + }, + { + "epoch": 1.8892634417663914, + "grad_norm": 1.0466185632829665, + "learning_rate": 7.150642126727642e-06, + "loss": 0.3073, + "step": 5605 + }, + { + "epoch": 1.8909489297151525, + "grad_norm": 1.1694318182213048, + "learning_rate": 7.1457132480386745e-06, + "loss": 0.2686, + "step": 5610 + }, + { + "epoch": 1.8926344176639138, + "grad_norm": 1.0824703576581143, + "learning_rate": 7.140781812235245e-06, + "loss": 0.2691, + "step": 5615 + }, + { + "epoch": 1.894319905612675, + "grad_norm": 1.1655191162602738, + "learning_rate": 7.135847825194303e-06, + "loss": 0.2904, + "step": 5620 + }, + { + "epoch": 1.896005393561436, + "grad_norm": 1.4629293813227948, + "learning_rate": 7.1309112927958345e-06, + "loss": 0.2537, + "step": 5625 + }, + { + "epoch": 1.8976908815101972, + "grad_norm": 1.1877387693585426, + "learning_rate": 7.125972220922864e-06, + "loss": 0.2917, + "step": 5630 + }, + { + "epoch": 1.8993763694589583, + "grad_norm": 1.2354905506595124, + "learning_rate": 7.1210306154614405e-06, + "loss": 0.2866, + "step": 5635 + }, + { + "epoch": 1.9010618574077194, + "grad_norm": 1.1924275755679479, + "learning_rate": 7.116086482300629e-06, + "loss": 0.308, + "step": 5640 + }, + { + "epoch": 1.9027473453564807, + "grad_norm": 1.1118719052510926, + "learning_rate": 7.111139827332511e-06, + "loss": 0.2494, + "step": 5645 + }, + { + "epoch": 1.9044328333052418, + "grad_norm": 1.263291013583244, + "learning_rate": 7.106190656452173e-06, + "loss": 0.2399, + "step": 5650 + }, + { + "epoch": 1.9061183212540032, + "grad_norm": 1.663479763754197, + "learning_rate": 7.1012389755576995e-06, + "loss": 0.2979, + "step": 5655 + }, + { + "epoch": 1.9078038092027643, + "grad_norm": 1.7105749348673616, + "learning_rate": 7.096284790550161e-06, + "loss": 0.2684, + "step": 5660 + }, + { + "epoch": 1.9094892971515254, + "grad_norm": 1.1648848641153158, + "learning_rate": 7.0913281073336215e-06, + "loss": 0.2991, + "step": 5665 + }, + { + "epoch": 1.9111747851002865, + "grad_norm": 4.468619414466493, + "learning_rate": 7.0863689318151156e-06, + "loss": 0.3059, + "step": 5670 + }, + { + "epoch": 1.9128602730490476, + "grad_norm": 1.144369748736879, + "learning_rate": 7.081407269904649e-06, + "loss": 0.262, + "step": 5675 + }, + { + "epoch": 1.9145457609978087, + "grad_norm": 1.7484725420755798, + "learning_rate": 7.076443127515191e-06, + "loss": 0.2877, + "step": 5680 + }, + { + "epoch": 1.91623124894657, + "grad_norm": 1.574402282970591, + "learning_rate": 7.071476510562672e-06, + "loss": 0.2816, + "step": 5685 + }, + { + "epoch": 1.9179167368953312, + "grad_norm": 1.2842659899773619, + "learning_rate": 7.0665074249659605e-06, + "loss": 0.2835, + "step": 5690 + }, + { + "epoch": 1.9196022248440925, + "grad_norm": 1.674707294869782, + "learning_rate": 7.0615358766468776e-06, + "loss": 0.279, + "step": 5695 + }, + { + "epoch": 1.9212877127928536, + "grad_norm": 1.2832555638357301, + "learning_rate": 7.056561871530172e-06, + "loss": 0.3106, + "step": 5700 + }, + { + "epoch": 1.9229732007416147, + "grad_norm": 1.1343021557145014, + "learning_rate": 7.051585415543527e-06, + "loss": 0.2537, + "step": 5705 + }, + { + "epoch": 1.9246586886903758, + "grad_norm": 1.0952067263564165, + "learning_rate": 7.04660651461754e-06, + "loss": 0.2709, + "step": 5710 + }, + { + "epoch": 1.926344176639137, + "grad_norm": 1.2807376898437488, + "learning_rate": 7.041625174685725e-06, + "loss": 0.276, + "step": 5715 + }, + { + "epoch": 1.928029664587898, + "grad_norm": 1.1925516930253768, + "learning_rate": 7.036641401684502e-06, + "loss": 0.272, + "step": 5720 + }, + { + "epoch": 1.9297151525366594, + "grad_norm": 1.5060440497108794, + "learning_rate": 7.031655201553195e-06, + "loss": 0.2821, + "step": 5725 + }, + { + "epoch": 1.9314006404854205, + "grad_norm": 1.2416504816889506, + "learning_rate": 7.026666580234012e-06, + "loss": 0.2823, + "step": 5730 + }, + { + "epoch": 1.9330861284341818, + "grad_norm": 1.5616212828868108, + "learning_rate": 7.021675543672054e-06, + "loss": 0.2799, + "step": 5735 + }, + { + "epoch": 1.934771616382943, + "grad_norm": 1.326752471131693, + "learning_rate": 7.016682097815297e-06, + "loss": 0.2847, + "step": 5740 + }, + { + "epoch": 1.936457104331704, + "grad_norm": 1.338471926432003, + "learning_rate": 7.011686248614588e-06, + "loss": 0.2432, + "step": 5745 + }, + { + "epoch": 1.9381425922804651, + "grad_norm": 1.388370315512864, + "learning_rate": 7.006688002023639e-06, + "loss": 0.2759, + "step": 5750 + }, + { + "epoch": 1.9398280802292263, + "grad_norm": 1.0306100423733313, + "learning_rate": 7.001687363999017e-06, + "loss": 0.2507, + "step": 5755 + }, + { + "epoch": 1.9415135681779874, + "grad_norm": 3.9810134001311814, + "learning_rate": 6.996684340500145e-06, + "loss": 0.285, + "step": 5760 + }, + { + "epoch": 1.9431990561267487, + "grad_norm": 1.218555666408596, + "learning_rate": 6.991678937489281e-06, + "loss": 0.2757, + "step": 5765 + }, + { + "epoch": 1.9448845440755098, + "grad_norm": 1.1798125383973037, + "learning_rate": 6.986671160931523e-06, + "loss": 0.2811, + "step": 5770 + }, + { + "epoch": 1.9465700320242711, + "grad_norm": 1.3774453741493768, + "learning_rate": 6.981661016794799e-06, + "loss": 0.3021, + "step": 5775 + }, + { + "epoch": 1.9482555199730323, + "grad_norm": 1.0889069778871825, + "learning_rate": 6.9766485110498535e-06, + "loss": 0.2511, + "step": 5780 + }, + { + "epoch": 1.9499410079217934, + "grad_norm": 1.0894701060484306, + "learning_rate": 6.971633649670251e-06, + "loss": 0.2571, + "step": 5785 + }, + { + "epoch": 1.9516264958705545, + "grad_norm": 1.050824463414813, + "learning_rate": 6.96661643863236e-06, + "loss": 0.265, + "step": 5790 + }, + { + "epoch": 1.9533119838193156, + "grad_norm": 1.4427860977769666, + "learning_rate": 6.961596883915347e-06, + "loss": 0.3046, + "step": 5795 + }, + { + "epoch": 1.9549974717680767, + "grad_norm": 1.1291478688288843, + "learning_rate": 6.956574991501179e-06, + "loss": 0.2721, + "step": 5800 + }, + { + "epoch": 1.956682959716838, + "grad_norm": 1.8650217088722214, + "learning_rate": 6.951550767374603e-06, + "loss": 0.2753, + "step": 5805 + }, + { + "epoch": 1.9583684476655991, + "grad_norm": 1.1728275649235014, + "learning_rate": 6.946524217523145e-06, + "loss": 0.2727, + "step": 5810 + }, + { + "epoch": 1.9600539356143605, + "grad_norm": 1.1308807502678624, + "learning_rate": 6.941495347937102e-06, + "loss": 0.3103, + "step": 5815 + }, + { + "epoch": 1.9617394235631216, + "grad_norm": 2.1260449540015487, + "learning_rate": 6.936464164609541e-06, + "loss": 0.2682, + "step": 5820 + }, + { + "epoch": 1.9634249115118827, + "grad_norm": 1.1265791238670662, + "learning_rate": 6.9314306735362795e-06, + "loss": 0.2738, + "step": 5825 + }, + { + "epoch": 1.9651103994606438, + "grad_norm": 1.4490476432572892, + "learning_rate": 6.92639488071589e-06, + "loss": 0.2607, + "step": 5830 + }, + { + "epoch": 1.966795887409405, + "grad_norm": 2.711251059874199, + "learning_rate": 6.921356792149686e-06, + "loss": 0.2637, + "step": 5835 + }, + { + "epoch": 1.968481375358166, + "grad_norm": 1.243318693040002, + "learning_rate": 6.916316413841718e-06, + "loss": 0.2924, + "step": 5840 + }, + { + "epoch": 1.9701668633069274, + "grad_norm": 1.1683456924274003, + "learning_rate": 6.9112737517987635e-06, + "loss": 0.2634, + "step": 5845 + }, + { + "epoch": 1.9718523512556885, + "grad_norm": 1.177735087455277, + "learning_rate": 6.906228812030322e-06, + "loss": 0.2815, + "step": 5850 + }, + { + "epoch": 1.9735378392044498, + "grad_norm": 1.5404884111077783, + "learning_rate": 6.901181600548609e-06, + "loss": 0.3032, + "step": 5855 + }, + { + "epoch": 1.975223327153211, + "grad_norm": 1.1985779199195747, + "learning_rate": 6.896132123368547e-06, + "loss": 0.2801, + "step": 5860 + }, + { + "epoch": 1.976908815101972, + "grad_norm": 1.1736034194769236, + "learning_rate": 6.891080386507757e-06, + "loss": 0.2736, + "step": 5865 + }, + { + "epoch": 1.9785943030507331, + "grad_norm": 1.1463302810284723, + "learning_rate": 6.886026395986554e-06, + "loss": 0.2919, + "step": 5870 + }, + { + "epoch": 1.9802797909994942, + "grad_norm": 1.161903655011089, + "learning_rate": 6.880970157827937e-06, + "loss": 0.2879, + "step": 5875 + }, + { + "epoch": 1.9819652789482554, + "grad_norm": 1.0923068614964753, + "learning_rate": 6.8759116780575905e-06, + "loss": 0.2533, + "step": 5880 + }, + { + "epoch": 1.9836507668970167, + "grad_norm": 1.2413766194562732, + "learning_rate": 6.8708509627038585e-06, + "loss": 0.283, + "step": 5885 + }, + { + "epoch": 1.9853362548457778, + "grad_norm": 1.142317082249267, + "learning_rate": 6.865788017797761e-06, + "loss": 0.286, + "step": 5890 + }, + { + "epoch": 1.9870217427945391, + "grad_norm": 1.3953433251211684, + "learning_rate": 6.860722849372967e-06, + "loss": 0.2787, + "step": 5895 + }, + { + "epoch": 1.9887072307433002, + "grad_norm": 1.1307682002997104, + "learning_rate": 6.855655463465798e-06, + "loss": 0.2875, + "step": 5900 + }, + { + "epoch": 1.9903927186920614, + "grad_norm": 1.3477079634225693, + "learning_rate": 6.8505858661152205e-06, + "loss": 0.2915, + "step": 5905 + }, + { + "epoch": 1.9920782066408225, + "grad_norm": 2.540209978415579, + "learning_rate": 6.8455140633628315e-06, + "loss": 0.2762, + "step": 5910 + }, + { + "epoch": 1.9937636945895836, + "grad_norm": 1.1274546132992864, + "learning_rate": 6.840440061252862e-06, + "loss": 0.29, + "step": 5915 + }, + { + "epoch": 1.9954491825383447, + "grad_norm": 1.2145097813660723, + "learning_rate": 6.835363865832159e-06, + "loss": 0.2928, + "step": 5920 + }, + { + "epoch": 1.997134670487106, + "grad_norm": 1.31697819767494, + "learning_rate": 6.830285483150186e-06, + "loss": 0.2852, + "step": 5925 + }, + { + "epoch": 1.9988201584358671, + "grad_norm": 1.262786765571154, + "learning_rate": 6.825204919259013e-06, + "loss": 0.2659, + "step": 5930 + }, + { + "epoch": 2.0003370975897523, + "grad_norm": 1.081679471655034, + "learning_rate": 6.820122180213309e-06, + "loss": 0.2351, + "step": 5935 + }, + { + "epoch": 2.0020225855385134, + "grad_norm": 1.0974569801233274, + "learning_rate": 6.815037272070334e-06, + "loss": 0.2388, + "step": 5940 + }, + { + "epoch": 2.0037080734872745, + "grad_norm": 1.1121862244964253, + "learning_rate": 6.809950200889934e-06, + "loss": 0.2597, + "step": 5945 + }, + { + "epoch": 2.0053935614360356, + "grad_norm": 1.303319950093496, + "learning_rate": 6.804860972734535e-06, + "loss": 0.2713, + "step": 5950 + }, + { + "epoch": 2.0070790493847968, + "grad_norm": 1.1534870595682194, + "learning_rate": 6.799769593669131e-06, + "loss": 0.2565, + "step": 5955 + }, + { + "epoch": 2.008764537333558, + "grad_norm": 1.3382215610294592, + "learning_rate": 6.794676069761278e-06, + "loss": 0.2808, + "step": 5960 + }, + { + "epoch": 2.0104500252823194, + "grad_norm": 10.55053784365816, + "learning_rate": 6.78958040708109e-06, + "loss": 0.2601, + "step": 5965 + }, + { + "epoch": 2.0121355132310805, + "grad_norm": 1.2387882851117746, + "learning_rate": 6.784482611701231e-06, + "loss": 0.2734, + "step": 5970 + }, + { + "epoch": 2.0138210011798416, + "grad_norm": 1.2010925585751586, + "learning_rate": 6.779382689696905e-06, + "loss": 0.2495, + "step": 5975 + }, + { + "epoch": 2.0155064891286028, + "grad_norm": 1.3563958907335463, + "learning_rate": 6.77428064714585e-06, + "loss": 0.2719, + "step": 5980 + }, + { + "epoch": 2.017191977077364, + "grad_norm": 1.5986235511234503, + "learning_rate": 6.769176490128333e-06, + "loss": 0.2724, + "step": 5985 + }, + { + "epoch": 2.018877465026125, + "grad_norm": 1.1796754043878754, + "learning_rate": 6.764070224727137e-06, + "loss": 0.2553, + "step": 5990 + }, + { + "epoch": 2.020562952974886, + "grad_norm": 1.1498771592568715, + "learning_rate": 6.758961857027564e-06, + "loss": 0.2993, + "step": 5995 + }, + { + "epoch": 2.022248440923647, + "grad_norm": 1.210072788515302, + "learning_rate": 6.753851393117414e-06, + "loss": 0.2048, + "step": 6000 + }, + { + "epoch": 2.0239339288724087, + "grad_norm": 1.1455726254272833, + "learning_rate": 6.748738839086992e-06, + "loss": 0.2667, + "step": 6005 + }, + { + "epoch": 2.02561941682117, + "grad_norm": 1.600119557458997, + "learning_rate": 6.743624201029089e-06, + "loss": 0.2686, + "step": 6010 + }, + { + "epoch": 2.027304904769931, + "grad_norm": 1.3810363711281857, + "learning_rate": 6.738507485038981e-06, + "loss": 0.2682, + "step": 6015 + }, + { + "epoch": 2.028990392718692, + "grad_norm": 1.266606651116861, + "learning_rate": 6.733388697214419e-06, + "loss": 0.2173, + "step": 6020 + }, + { + "epoch": 2.030675880667453, + "grad_norm": 1.2614845305669782, + "learning_rate": 6.728267843655628e-06, + "loss": 0.2644, + "step": 6025 + }, + { + "epoch": 2.0323613686162143, + "grad_norm": 1.3900921696692135, + "learning_rate": 6.723144930465288e-06, + "loss": 0.2416, + "step": 6030 + }, + { + "epoch": 2.0340468565649754, + "grad_norm": 1.130968249461293, + "learning_rate": 6.718019963748542e-06, + "loss": 0.2495, + "step": 6035 + }, + { + "epoch": 2.0357323445137365, + "grad_norm": 1.619017723239934, + "learning_rate": 6.71289294961297e-06, + "loss": 0.247, + "step": 6040 + }, + { + "epoch": 2.037417832462498, + "grad_norm": 1.2928331465674285, + "learning_rate": 6.7077638941685994e-06, + "loss": 0.2366, + "step": 6045 + }, + { + "epoch": 2.039103320411259, + "grad_norm": 1.2805082081937507, + "learning_rate": 6.70263280352789e-06, + "loss": 0.2482, + "step": 6050 + }, + { + "epoch": 2.0407888083600203, + "grad_norm": 1.8872078181420409, + "learning_rate": 6.69749968380572e-06, + "loss": 0.2475, + "step": 6055 + }, + { + "epoch": 2.0424742963087814, + "grad_norm": 1.0204046297193239, + "learning_rate": 6.692364541119396e-06, + "loss": 0.2252, + "step": 6060 + }, + { + "epoch": 2.0441597842575425, + "grad_norm": 2.0257825357597796, + "learning_rate": 6.687227381588627e-06, + "loss": 0.2555, + "step": 6065 + }, + { + "epoch": 2.0458452722063036, + "grad_norm": 1.2512342026367647, + "learning_rate": 6.682088211335531e-06, + "loss": 0.2573, + "step": 6070 + }, + { + "epoch": 2.0475307601550647, + "grad_norm": 1.386188385979886, + "learning_rate": 6.676947036484617e-06, + "loss": 0.2644, + "step": 6075 + }, + { + "epoch": 2.049216248103826, + "grad_norm": 1.0861165391839216, + "learning_rate": 6.671803863162789e-06, + "loss": 0.2388, + "step": 6080 + }, + { + "epoch": 2.0509017360525874, + "grad_norm": 2.0772980119317106, + "learning_rate": 6.666658697499329e-06, + "loss": 0.2532, + "step": 6085 + }, + { + "epoch": 2.0525872240013485, + "grad_norm": 1.3067634721841486, + "learning_rate": 6.6615115456258925e-06, + "loss": 0.2635, + "step": 6090 + }, + { + "epoch": 2.0542727119501096, + "grad_norm": 1.6623088119304197, + "learning_rate": 6.656362413676503e-06, + "loss": 0.2502, + "step": 6095 + }, + { + "epoch": 2.0559581998988707, + "grad_norm": 1.2518297326165573, + "learning_rate": 6.651211307787549e-06, + "loss": 0.2388, + "step": 6100 + }, + { + "epoch": 2.057643687847632, + "grad_norm": 1.3434940303637946, + "learning_rate": 6.64605823409776e-06, + "loss": 0.2235, + "step": 6105 + }, + { + "epoch": 2.059329175796393, + "grad_norm": 0.9897277403291037, + "learning_rate": 6.640903198748222e-06, + "loss": 0.2388, + "step": 6110 + }, + { + "epoch": 2.061014663745154, + "grad_norm": 1.45510930813586, + "learning_rate": 6.635746207882349e-06, + "loss": 0.2697, + "step": 6115 + }, + { + "epoch": 2.062700151693915, + "grad_norm": 1.3521089592228548, + "learning_rate": 6.630587267645898e-06, + "loss": 0.2622, + "step": 6120 + }, + { + "epoch": 2.0643856396426767, + "grad_norm": 1.0692698834733692, + "learning_rate": 6.625426384186935e-06, + "loss": 0.2459, + "step": 6125 + }, + { + "epoch": 2.066071127591438, + "grad_norm": 1.4825489235287137, + "learning_rate": 6.620263563655851e-06, + "loss": 0.2695, + "step": 6130 + }, + { + "epoch": 2.067756615540199, + "grad_norm": 1.1681226122016348, + "learning_rate": 6.615098812205342e-06, + "loss": 0.2486, + "step": 6135 + }, + { + "epoch": 2.06944210348896, + "grad_norm": 2.0777535621564756, + "learning_rate": 6.609932135990407e-06, + "loss": 0.252, + "step": 6140 + }, + { + "epoch": 2.071127591437721, + "grad_norm": 1.1701961255204318, + "learning_rate": 6.604763541168336e-06, + "loss": 0.221, + "step": 6145 + }, + { + "epoch": 2.0728130793864823, + "grad_norm": 1.3049019400716555, + "learning_rate": 6.5995930338987095e-06, + "loss": 0.2228, + "step": 6150 + }, + { + "epoch": 2.0744985673352434, + "grad_norm": 1.4136741050753618, + "learning_rate": 6.594420620343383e-06, + "loss": 0.2553, + "step": 6155 + }, + { + "epoch": 2.0761840552840045, + "grad_norm": 1.5809852448132287, + "learning_rate": 6.589246306666486e-06, + "loss": 0.2627, + "step": 6160 + }, + { + "epoch": 2.077869543232766, + "grad_norm": 1.4880142228637652, + "learning_rate": 6.584070099034412e-06, + "loss": 0.2313, + "step": 6165 + }, + { + "epoch": 2.079555031181527, + "grad_norm": 1.2393786050637563, + "learning_rate": 6.578892003615812e-06, + "loss": 0.2262, + "step": 6170 + }, + { + "epoch": 2.0812405191302883, + "grad_norm": 1.3603080402046084, + "learning_rate": 6.573712026581587e-06, + "loss": 0.2494, + "step": 6175 + }, + { + "epoch": 2.0829260070790494, + "grad_norm": 1.166870025568283, + "learning_rate": 6.568530174104878e-06, + "loss": 0.2427, + "step": 6180 + }, + { + "epoch": 2.0846114950278105, + "grad_norm": 1.3565042842232793, + "learning_rate": 6.563346452361064e-06, + "loss": 0.2743, + "step": 6185 + }, + { + "epoch": 2.0862969829765716, + "grad_norm": 1.264171481817194, + "learning_rate": 6.5581608675277496e-06, + "loss": 0.24, + "step": 6190 + }, + { + "epoch": 2.0879824709253327, + "grad_norm": 1.3624315567935192, + "learning_rate": 6.5529734257847636e-06, + "loss": 0.2501, + "step": 6195 + }, + { + "epoch": 2.089667958874094, + "grad_norm": 1.265549615564812, + "learning_rate": 6.54778413331414e-06, + "loss": 0.2535, + "step": 6200 + }, + { + "epoch": 2.0913534468228554, + "grad_norm": 1.2127408077403912, + "learning_rate": 6.542592996300125e-06, + "loss": 0.2537, + "step": 6205 + }, + { + "epoch": 2.0930389347716165, + "grad_norm": 1.1296303663606113, + "learning_rate": 6.537400020929162e-06, + "loss": 0.2611, + "step": 6210 + }, + { + "epoch": 2.0947244227203776, + "grad_norm": 1.3157590868806934, + "learning_rate": 6.532205213389885e-06, + "loss": 0.2588, + "step": 6215 + }, + { + "epoch": 2.0964099106691387, + "grad_norm": 1.6103286145082827, + "learning_rate": 6.527008579873107e-06, + "loss": 0.2485, + "step": 6220 + }, + { + "epoch": 2.0980953986179, + "grad_norm": 1.1648434001401868, + "learning_rate": 6.521810126571825e-06, + "loss": 0.243, + "step": 6225 + }, + { + "epoch": 2.099780886566661, + "grad_norm": 1.1147612768375883, + "learning_rate": 6.516609859681198e-06, + "loss": 0.2536, + "step": 6230 + }, + { + "epoch": 2.101466374515422, + "grad_norm": 1.2655454397441157, + "learning_rate": 6.511407785398549e-06, + "loss": 0.233, + "step": 6235 + }, + { + "epoch": 2.103151862464183, + "grad_norm": 1.536416415087582, + "learning_rate": 6.506203909923357e-06, + "loss": 0.2161, + "step": 6240 + }, + { + "epoch": 2.1048373504129447, + "grad_norm": 1.1697953229322648, + "learning_rate": 6.500998239457241e-06, + "loss": 0.2304, + "step": 6245 + }, + { + "epoch": 2.106522838361706, + "grad_norm": 1.2246499214653337, + "learning_rate": 6.495790780203967e-06, + "loss": 0.2485, + "step": 6250 + }, + { + "epoch": 2.108208326310467, + "grad_norm": 0.9838021800101451, + "learning_rate": 6.490581538369429e-06, + "loss": 0.2532, + "step": 6255 + }, + { + "epoch": 2.109893814259228, + "grad_norm": 1.6274303549369238, + "learning_rate": 6.485370520161643e-06, + "loss": 0.2431, + "step": 6260 + }, + { + "epoch": 2.111579302207989, + "grad_norm": 1.174549942148735, + "learning_rate": 6.480157731790747e-06, + "loss": 0.2401, + "step": 6265 + }, + { + "epoch": 2.1132647901567503, + "grad_norm": 1.298723556089621, + "learning_rate": 6.474943179468986e-06, + "loss": 0.273, + "step": 6270 + }, + { + "epoch": 2.1149502781055114, + "grad_norm": 1.362565090959308, + "learning_rate": 6.469726869410706e-06, + "loss": 0.2425, + "step": 6275 + }, + { + "epoch": 2.1166357660542725, + "grad_norm": 1.2707050963377002, + "learning_rate": 6.464508807832348e-06, + "loss": 0.2492, + "step": 6280 + }, + { + "epoch": 2.118321254003034, + "grad_norm": 1.4398493183564267, + "learning_rate": 6.4592890009524446e-06, + "loss": 0.2208, + "step": 6285 + }, + { + "epoch": 2.120006741951795, + "grad_norm": 2.2097074051300902, + "learning_rate": 6.454067454991602e-06, + "loss": 0.254, + "step": 6290 + }, + { + "epoch": 2.1216922299005563, + "grad_norm": 1.0908970533139506, + "learning_rate": 6.448844176172504e-06, + "loss": 0.2512, + "step": 6295 + }, + { + "epoch": 2.1233777178493174, + "grad_norm": 1.1939799800037705, + "learning_rate": 6.443619170719896e-06, + "loss": 0.253, + "step": 6300 + }, + { + "epoch": 2.1250632057980785, + "grad_norm": 1.6320742143381952, + "learning_rate": 6.438392444860584e-06, + "loss": 0.2553, + "step": 6305 + }, + { + "epoch": 2.1267486937468396, + "grad_norm": 1.2004140859484624, + "learning_rate": 6.433164004823421e-06, + "loss": 0.2279, + "step": 6310 + }, + { + "epoch": 2.1284341816956007, + "grad_norm": 1.2971609743548953, + "learning_rate": 6.427933856839305e-06, + "loss": 0.2563, + "step": 6315 + }, + { + "epoch": 2.130119669644362, + "grad_norm": 1.2299086008240054, + "learning_rate": 6.4227020071411704e-06, + "loss": 0.2605, + "step": 6320 + }, + { + "epoch": 2.1318051575931234, + "grad_norm": 1.312129544332664, + "learning_rate": 6.417468461963978e-06, + "loss": 0.242, + "step": 6325 + }, + { + "epoch": 2.1334906455418845, + "grad_norm": 1.2869067770450704, + "learning_rate": 6.41223322754471e-06, + "loss": 0.2234, + "step": 6330 + }, + { + "epoch": 2.1351761334906456, + "grad_norm": 1.1860377014817758, + "learning_rate": 6.4069963101223575e-06, + "loss": 0.2431, + "step": 6335 + }, + { + "epoch": 2.1368616214394067, + "grad_norm": 1.2427302070054334, + "learning_rate": 6.401757715937924e-06, + "loss": 0.2707, + "step": 6340 + }, + { + "epoch": 2.138547109388168, + "grad_norm": 1.2421934361326865, + "learning_rate": 6.3965174512344074e-06, + "loss": 0.2499, + "step": 6345 + }, + { + "epoch": 2.140232597336929, + "grad_norm": 1.33796945391822, + "learning_rate": 6.391275522256799e-06, + "loss": 0.2491, + "step": 6350 + }, + { + "epoch": 2.14191808528569, + "grad_norm": 1.2661647720517677, + "learning_rate": 6.386031935252068e-06, + "loss": 0.2312, + "step": 6355 + }, + { + "epoch": 2.1436035732344516, + "grad_norm": 3.878190628066829, + "learning_rate": 6.380786696469168e-06, + "loss": 0.2412, + "step": 6360 + }, + { + "epoch": 2.1452890611832127, + "grad_norm": 1.1235382207507927, + "learning_rate": 6.37553981215901e-06, + "loss": 0.2546, + "step": 6365 + }, + { + "epoch": 2.146974549131974, + "grad_norm": 1.131285056452738, + "learning_rate": 6.370291288574479e-06, + "loss": 0.236, + "step": 6370 + }, + { + "epoch": 2.148660037080735, + "grad_norm": 1.148681698062413, + "learning_rate": 6.365041131970401e-06, + "loss": 0.2506, + "step": 6375 + }, + { + "epoch": 2.150345525029496, + "grad_norm": 1.0017329344505117, + "learning_rate": 6.359789348603559e-06, + "loss": 0.2171, + "step": 6380 + }, + { + "epoch": 2.152031012978257, + "grad_norm": 1.2961719987505604, + "learning_rate": 6.354535944732665e-06, + "loss": 0.2551, + "step": 6385 + }, + { + "epoch": 2.1537165009270183, + "grad_norm": 1.7615416802946193, + "learning_rate": 6.3492809266183705e-06, + "loss": 0.2355, + "step": 6390 + }, + { + "epoch": 2.1554019888757794, + "grad_norm": 1.4712566924642663, + "learning_rate": 6.344024300523244e-06, + "loss": 0.24, + "step": 6395 + }, + { + "epoch": 2.1570874768245405, + "grad_norm": 2.5427161626675177, + "learning_rate": 6.338766072711777e-06, + "loss": 0.2329, + "step": 6400 + }, + { + "epoch": 2.158772964773302, + "grad_norm": 1.0873714400857215, + "learning_rate": 6.333506249450363e-06, + "loss": 0.2342, + "step": 6405 + }, + { + "epoch": 2.160458452722063, + "grad_norm": 2.356290105341313, + "learning_rate": 6.328244837007302e-06, + "loss": 0.2207, + "step": 6410 + }, + { + "epoch": 2.1621439406708243, + "grad_norm": 1.0946280353668836, + "learning_rate": 6.322981841652784e-06, + "loss": 0.2415, + "step": 6415 + }, + { + "epoch": 2.1638294286195854, + "grad_norm": 1.2226895675382223, + "learning_rate": 6.317717269658889e-06, + "loss": 0.2397, + "step": 6420 + }, + { + "epoch": 2.1655149165683465, + "grad_norm": 1.1313438840146788, + "learning_rate": 6.312451127299572e-06, + "loss": 0.2209, + "step": 6425 + }, + { + "epoch": 2.1672004045171076, + "grad_norm": 1.149977859921272, + "learning_rate": 6.307183420850666e-06, + "loss": 0.2377, + "step": 6430 + }, + { + "epoch": 2.1688858924658687, + "grad_norm": 1.2413176937109252, + "learning_rate": 6.30191415658986e-06, + "loss": 0.2483, + "step": 6435 + }, + { + "epoch": 2.1705713804146303, + "grad_norm": 1.0624062904086755, + "learning_rate": 6.296643340796704e-06, + "loss": 0.272, + "step": 6440 + }, + { + "epoch": 2.1722568683633914, + "grad_norm": 1.2458917486919447, + "learning_rate": 6.291370979752596e-06, + "loss": 0.2345, + "step": 6445 + }, + { + "epoch": 2.1739423563121525, + "grad_norm": 1.1092889538805586, + "learning_rate": 6.286097079740776e-06, + "loss": 0.2732, + "step": 6450 + }, + { + "epoch": 2.1756278442609136, + "grad_norm": 1.2145308467589744, + "learning_rate": 6.280821647046319e-06, + "loss": 0.237, + "step": 6455 + }, + { + "epoch": 2.1773133322096747, + "grad_norm": 1.1942778026429117, + "learning_rate": 6.2755446879561235e-06, + "loss": 0.252, + "step": 6460 + }, + { + "epoch": 2.178998820158436, + "grad_norm": 1.1936836670649174, + "learning_rate": 6.27026620875891e-06, + "loss": 0.2527, + "step": 6465 + }, + { + "epoch": 2.180684308107197, + "grad_norm": 1.6407158583958905, + "learning_rate": 6.2649862157452075e-06, + "loss": 0.2515, + "step": 6470 + }, + { + "epoch": 2.182369796055958, + "grad_norm": 1.4042006113815277, + "learning_rate": 6.2597047152073535e-06, + "loss": 0.2365, + "step": 6475 + }, + { + "epoch": 2.184055284004719, + "grad_norm": 1.264853171511084, + "learning_rate": 6.254421713439478e-06, + "loss": 0.2481, + "step": 6480 + }, + { + "epoch": 2.1857407719534807, + "grad_norm": 1.1571421488286933, + "learning_rate": 6.2491372167375035e-06, + "loss": 0.2566, + "step": 6485 + }, + { + "epoch": 2.187426259902242, + "grad_norm": 1.1344941892282723, + "learning_rate": 6.243851231399127e-06, + "loss": 0.2642, + "step": 6490 + }, + { + "epoch": 2.189111747851003, + "grad_norm": 1.033522861212516, + "learning_rate": 6.23856376372383e-06, + "loss": 0.2354, + "step": 6495 + }, + { + "epoch": 2.190797235799764, + "grad_norm": 1.3910548577470034, + "learning_rate": 6.233274820012854e-06, + "loss": 0.2417, + "step": 6500 + }, + { + "epoch": 2.192482723748525, + "grad_norm": 1.1187488644271608, + "learning_rate": 6.227984406569202e-06, + "loss": 0.2451, + "step": 6505 + }, + { + "epoch": 2.1941682116972863, + "grad_norm": 1.3938663799017454, + "learning_rate": 6.2226925296976215e-06, + "loss": 0.2482, + "step": 6510 + }, + { + "epoch": 2.1958536996460474, + "grad_norm": 1.2143780836363995, + "learning_rate": 6.217399195704618e-06, + "loss": 0.2628, + "step": 6515 + }, + { + "epoch": 2.197539187594809, + "grad_norm": 1.5102904432753659, + "learning_rate": 6.212104410898419e-06, + "loss": 0.2271, + "step": 6520 + }, + { + "epoch": 2.19922467554357, + "grad_norm": 1.2594634974044248, + "learning_rate": 6.206808181588991e-06, + "loss": 0.2474, + "step": 6525 + }, + { + "epoch": 2.200910163492331, + "grad_norm": 1.2072521354060317, + "learning_rate": 6.201510514088015e-06, + "loss": 0.2626, + "step": 6530 + }, + { + "epoch": 2.2025956514410923, + "grad_norm": 1.1328336914246169, + "learning_rate": 6.196211414708894e-06, + "loss": 0.2193, + "step": 6535 + }, + { + "epoch": 2.2042811393898534, + "grad_norm": 1.245626489237525, + "learning_rate": 6.190910889766727e-06, + "loss": 0.2426, + "step": 6540 + }, + { + "epoch": 2.2059666273386145, + "grad_norm": 1.1860186636748626, + "learning_rate": 6.1856089455783205e-06, + "loss": 0.2277, + "step": 6545 + }, + { + "epoch": 2.2076521152873756, + "grad_norm": 1.2039569888275217, + "learning_rate": 6.18030558846217e-06, + "loss": 0.2015, + "step": 6550 + }, + { + "epoch": 2.2093376032361367, + "grad_norm": 1.3145544977631802, + "learning_rate": 6.175000824738455e-06, + "loss": 0.2499, + "step": 6555 + }, + { + "epoch": 2.211023091184898, + "grad_norm": 1.193321888684212, + "learning_rate": 6.169694660729026e-06, + "loss": 0.2399, + "step": 6560 + }, + { + "epoch": 2.2127085791336594, + "grad_norm": 1.616439088934144, + "learning_rate": 6.164387102757411e-06, + "loss": 0.2361, + "step": 6565 + }, + { + "epoch": 2.2143940670824205, + "grad_norm": 1.4583808239398672, + "learning_rate": 6.1590781571487935e-06, + "loss": 0.2393, + "step": 6570 + }, + { + "epoch": 2.2160795550311816, + "grad_norm": 2.2827973314642036, + "learning_rate": 6.153767830230013e-06, + "loss": 0.2541, + "step": 6575 + }, + { + "epoch": 2.2177650429799427, + "grad_norm": 1.1064912696242977, + "learning_rate": 6.148456128329553e-06, + "loss": 0.2402, + "step": 6580 + }, + { + "epoch": 2.219450530928704, + "grad_norm": 1.1891062258691032, + "learning_rate": 6.143143057777537e-06, + "loss": 0.2361, + "step": 6585 + }, + { + "epoch": 2.221136018877465, + "grad_norm": 1.2681051786950261, + "learning_rate": 6.137828624905722e-06, + "loss": 0.2536, + "step": 6590 + }, + { + "epoch": 2.222821506826226, + "grad_norm": 1.2534210276757634, + "learning_rate": 6.132512836047482e-06, + "loss": 0.2359, + "step": 6595 + }, + { + "epoch": 2.2245069947749876, + "grad_norm": 1.0748329546371043, + "learning_rate": 6.127195697537813e-06, + "loss": 0.252, + "step": 6600 + }, + { + "epoch": 2.2261924827237487, + "grad_norm": 1.1736147397405687, + "learning_rate": 6.1218772157133185e-06, + "loss": 0.2388, + "step": 6605 + }, + { + "epoch": 2.22787797067251, + "grad_norm": 1.0568139309517195, + "learning_rate": 6.116557396912202e-06, + "loss": 0.2311, + "step": 6610 + }, + { + "epoch": 2.229563458621271, + "grad_norm": 1.0798317282856567, + "learning_rate": 6.111236247474257e-06, + "loss": 0.2509, + "step": 6615 + }, + { + "epoch": 2.231248946570032, + "grad_norm": 1.2481887998981374, + "learning_rate": 6.105913773740868e-06, + "loss": 0.2547, + "step": 6620 + }, + { + "epoch": 2.232934434518793, + "grad_norm": 1.3505784539591155, + "learning_rate": 6.100589982054996e-06, + "loss": 0.2395, + "step": 6625 + }, + { + "epoch": 2.2346199224675543, + "grad_norm": 1.5618324719320573, + "learning_rate": 6.095264878761173e-06, + "loss": 0.2466, + "step": 6630 + }, + { + "epoch": 2.2363054104163154, + "grad_norm": 14.96700943268755, + "learning_rate": 6.089938470205491e-06, + "loss": 0.2352, + "step": 6635 + }, + { + "epoch": 2.2379908983650765, + "grad_norm": 1.5468393100897242, + "learning_rate": 6.0846107627356e-06, + "loss": 0.2331, + "step": 6640 + }, + { + "epoch": 2.239676386313838, + "grad_norm": 3.150900044580589, + "learning_rate": 6.079281762700699e-06, + "loss": 0.2291, + "step": 6645 + }, + { + "epoch": 2.241361874262599, + "grad_norm": 1.613263136193192, + "learning_rate": 6.073951476451527e-06, + "loss": 0.2621, + "step": 6650 + }, + { + "epoch": 2.2430473622113603, + "grad_norm": 1.7551864238089545, + "learning_rate": 6.068619910340352e-06, + "loss": 0.2395, + "step": 6655 + }, + { + "epoch": 2.2447328501601214, + "grad_norm": 1.4465600175104945, + "learning_rate": 6.063287070720973e-06, + "loss": 0.255, + "step": 6660 + }, + { + "epoch": 2.2464183381088825, + "grad_norm": 1.258228336774687, + "learning_rate": 6.057952963948702e-06, + "loss": 0.2481, + "step": 6665 + }, + { + "epoch": 2.2481038260576436, + "grad_norm": 1.263143196546373, + "learning_rate": 6.052617596380367e-06, + "loss": 0.2531, + "step": 6670 + }, + { + "epoch": 2.2497893140064047, + "grad_norm": 4.807974012440118, + "learning_rate": 6.047280974374288e-06, + "loss": 0.2444, + "step": 6675 + }, + { + "epoch": 2.2514748019551662, + "grad_norm": 2.100061007257097, + "learning_rate": 6.041943104290292e-06, + "loss": 0.2398, + "step": 6680 + }, + { + "epoch": 2.2531602899039274, + "grad_norm": 1.4269327638298077, + "learning_rate": 6.036603992489686e-06, + "loss": 0.2361, + "step": 6685 + }, + { + "epoch": 2.2548457778526885, + "grad_norm": 1.360058455325419, + "learning_rate": 6.031263645335259e-06, + "loss": 0.2401, + "step": 6690 + }, + { + "epoch": 2.2565312658014496, + "grad_norm": 1.1782327502374301, + "learning_rate": 6.0259220691912716e-06, + "loss": 0.2302, + "step": 6695 + }, + { + "epoch": 2.2582167537502107, + "grad_norm": 1.164819799173591, + "learning_rate": 6.020579270423449e-06, + "loss": 0.2593, + "step": 6700 + }, + { + "epoch": 2.259902241698972, + "grad_norm": 1.2387369444451408, + "learning_rate": 6.015235255398974e-06, + "loss": 0.2592, + "step": 6705 + }, + { + "epoch": 2.261587729647733, + "grad_norm": 1.6254405001299357, + "learning_rate": 6.009890030486479e-06, + "loss": 0.2475, + "step": 6710 + }, + { + "epoch": 2.263273217596494, + "grad_norm": 1.625103328435798, + "learning_rate": 6.004543602056037e-06, + "loss": 0.2305, + "step": 6715 + }, + { + "epoch": 2.264958705545255, + "grad_norm": 1.1636770921013966, + "learning_rate": 5.999195976479157e-06, + "loss": 0.2473, + "step": 6720 + }, + { + "epoch": 2.2666441934940167, + "grad_norm": 11.893545102865916, + "learning_rate": 5.993847160128775e-06, + "loss": 0.2357, + "step": 6725 + }, + { + "epoch": 2.268329681442778, + "grad_norm": 1.1335921386467114, + "learning_rate": 5.988497159379243e-06, + "loss": 0.2193, + "step": 6730 + }, + { + "epoch": 2.270015169391539, + "grad_norm": 1.0642553169784263, + "learning_rate": 5.983145980606326e-06, + "loss": 0.235, + "step": 6735 + }, + { + "epoch": 2.2717006573403, + "grad_norm": 3.786792114465667, + "learning_rate": 5.977793630187195e-06, + "loss": 0.2493, + "step": 6740 + }, + { + "epoch": 2.273386145289061, + "grad_norm": 1.1713137338435256, + "learning_rate": 5.972440114500416e-06, + "loss": 0.2555, + "step": 6745 + }, + { + "epoch": 2.2750716332378222, + "grad_norm": 1.3623208121739148, + "learning_rate": 5.967085439925939e-06, + "loss": 0.2182, + "step": 6750 + }, + { + "epoch": 2.2767571211865834, + "grad_norm": 1.344198239937484, + "learning_rate": 5.961729612845106e-06, + "loss": 0.223, + "step": 6755 + }, + { + "epoch": 2.278442609135345, + "grad_norm": 1.383230976151544, + "learning_rate": 5.956372639640619e-06, + "loss": 0.2359, + "step": 6760 + }, + { + "epoch": 2.280128097084106, + "grad_norm": 1.944799811793341, + "learning_rate": 5.951014526696559e-06, + "loss": 0.2442, + "step": 6765 + }, + { + "epoch": 2.281813585032867, + "grad_norm": 1.0918926349432208, + "learning_rate": 5.945655280398354e-06, + "loss": 0.2138, + "step": 6770 + }, + { + "epoch": 2.2834990729816282, + "grad_norm": 1.1231403105751203, + "learning_rate": 5.940294907132791e-06, + "loss": 0.2417, + "step": 6775 + }, + { + "epoch": 2.2851845609303894, + "grad_norm": 1.3184852064982298, + "learning_rate": 5.9349334132879934e-06, + "loss": 0.2291, + "step": 6780 + }, + { + "epoch": 2.2868700488791505, + "grad_norm": 2.2537021473360648, + "learning_rate": 5.929570805253427e-06, + "loss": 0.2581, + "step": 6785 + }, + { + "epoch": 2.2885555368279116, + "grad_norm": 1.100721585922719, + "learning_rate": 5.924207089419877e-06, + "loss": 0.2191, + "step": 6790 + }, + { + "epoch": 2.2902410247766727, + "grad_norm": 1.1670665658746653, + "learning_rate": 5.918842272179459e-06, + "loss": 0.2654, + "step": 6795 + }, + { + "epoch": 2.291926512725434, + "grad_norm": 1.1589864389994455, + "learning_rate": 5.9134763599255916e-06, + "loss": 0.2471, + "step": 6800 + }, + { + "epoch": 2.2936120006741954, + "grad_norm": 1.0481508446883854, + "learning_rate": 5.908109359053005e-06, + "loss": 0.2255, + "step": 6805 + }, + { + "epoch": 2.2952974886229565, + "grad_norm": 1.047008069868799, + "learning_rate": 5.902741275957721e-06, + "loss": 0.2479, + "step": 6810 + }, + { + "epoch": 2.2969829765717176, + "grad_norm": 1.2058473501896603, + "learning_rate": 5.897372117037059e-06, + "loss": 0.2612, + "step": 6815 + }, + { + "epoch": 2.2986684645204787, + "grad_norm": 1.4300246968276056, + "learning_rate": 5.892001888689612e-06, + "loss": 0.2615, + "step": 6820 + }, + { + "epoch": 2.30035395246924, + "grad_norm": 1.269627297541263, + "learning_rate": 5.88663059731525e-06, + "loss": 0.2318, + "step": 6825 + }, + { + "epoch": 2.302039440418001, + "grad_norm": 1.1374461606786461, + "learning_rate": 5.881258249315116e-06, + "loss": 0.2229, + "step": 6830 + }, + { + "epoch": 2.303724928366762, + "grad_norm": 1.7180410432225457, + "learning_rate": 5.875884851091604e-06, + "loss": 0.2442, + "step": 6835 + }, + { + "epoch": 2.3054104163155236, + "grad_norm": 1.2143351896131052, + "learning_rate": 5.870510409048365e-06, + "loss": 0.2423, + "step": 6840 + }, + { + "epoch": 2.3070959042642847, + "grad_norm": 1.1258442510338162, + "learning_rate": 5.8651349295902896e-06, + "loss": 0.2387, + "step": 6845 + }, + { + "epoch": 2.308781392213046, + "grad_norm": 1.137679132016169, + "learning_rate": 5.859758419123508e-06, + "loss": 0.2403, + "step": 6850 + }, + { + "epoch": 2.310466880161807, + "grad_norm": 1.112321254644545, + "learning_rate": 5.854380884055377e-06, + "loss": 0.262, + "step": 6855 + }, + { + "epoch": 2.312152368110568, + "grad_norm": 1.3640879778961075, + "learning_rate": 5.849002330794478e-06, + "loss": 0.2576, + "step": 6860 + }, + { + "epoch": 2.313837856059329, + "grad_norm": 1.0839730017891585, + "learning_rate": 5.843622765750601e-06, + "loss": 0.2697, + "step": 6865 + }, + { + "epoch": 2.3155233440080902, + "grad_norm": 1.263259455972846, + "learning_rate": 5.838242195334747e-06, + "loss": 0.2596, + "step": 6870 + }, + { + "epoch": 2.3172088319568513, + "grad_norm": 1.1893704341874667, + "learning_rate": 5.832860625959108e-06, + "loss": 0.223, + "step": 6875 + }, + { + "epoch": 2.3188943199056125, + "grad_norm": 1.1540849636190489, + "learning_rate": 5.8274780640370735e-06, + "loss": 0.2375, + "step": 6880 + }, + { + "epoch": 2.320579807854374, + "grad_norm": 1.2223960562080016, + "learning_rate": 5.822094515983213e-06, + "loss": 0.2381, + "step": 6885 + }, + { + "epoch": 2.322265295803135, + "grad_norm": 1.487367547151045, + "learning_rate": 5.816709988213272e-06, + "loss": 0.2148, + "step": 6890 + }, + { + "epoch": 2.3239507837518962, + "grad_norm": 1.2503138457234537, + "learning_rate": 5.811324487144158e-06, + "loss": 0.202, + "step": 6895 + }, + { + "epoch": 2.3256362717006573, + "grad_norm": 1.0493433032609951, + "learning_rate": 5.805938019193951e-06, + "loss": 0.2186, + "step": 6900 + }, + { + "epoch": 2.3273217596494185, + "grad_norm": 1.2202510868363294, + "learning_rate": 5.800550590781868e-06, + "loss": 0.2466, + "step": 6905 + }, + { + "epoch": 2.3290072475981796, + "grad_norm": 1.2202610825295719, + "learning_rate": 5.7951622083282855e-06, + "loss": 0.2062, + "step": 6910 + }, + { + "epoch": 2.3306927355469407, + "grad_norm": 1.8773743398521658, + "learning_rate": 5.789772878254702e-06, + "loss": 0.2512, + "step": 6915 + }, + { + "epoch": 2.3323782234957022, + "grad_norm": 1.4613324066896567, + "learning_rate": 5.784382606983758e-06, + "loss": 0.2153, + "step": 6920 + }, + { + "epoch": 2.3340637114444633, + "grad_norm": 2.7067987953485186, + "learning_rate": 5.77899140093921e-06, + "loss": 0.2469, + "step": 6925 + }, + { + "epoch": 2.3357491993932245, + "grad_norm": 1.1325219005782825, + "learning_rate": 5.773599266545929e-06, + "loss": 0.2341, + "step": 6930 + }, + { + "epoch": 2.3374346873419856, + "grad_norm": 1.4263889886016168, + "learning_rate": 5.7682062102298885e-06, + "loss": 0.2388, + "step": 6935 + }, + { + "epoch": 2.3391201752907467, + "grad_norm": 4.1610607418431815, + "learning_rate": 5.76281223841817e-06, + "loss": 0.2476, + "step": 6940 + }, + { + "epoch": 2.340805663239508, + "grad_norm": 1.5113178559529479, + "learning_rate": 5.757417357538937e-06, + "loss": 0.2155, + "step": 6945 + }, + { + "epoch": 2.342491151188269, + "grad_norm": 1.0816194687734555, + "learning_rate": 5.7520215740214425e-06, + "loss": 0.2247, + "step": 6950 + }, + { + "epoch": 2.34417663913703, + "grad_norm": 1.6654194399233935, + "learning_rate": 5.746624894296011e-06, + "loss": 0.2251, + "step": 6955 + }, + { + "epoch": 2.345862127085791, + "grad_norm": 1.2577144581376345, + "learning_rate": 5.741227324794036e-06, + "loss": 0.2498, + "step": 6960 + }, + { + "epoch": 2.3475476150345527, + "grad_norm": 1.3714996263539714, + "learning_rate": 5.735828871947975e-06, + "loss": 0.2566, + "step": 6965 + }, + { + "epoch": 2.349233102983314, + "grad_norm": 1.3782756750931844, + "learning_rate": 5.730429542191334e-06, + "loss": 0.2025, + "step": 6970 + }, + { + "epoch": 2.350918590932075, + "grad_norm": 1.265353171121104, + "learning_rate": 5.725029341958663e-06, + "loss": 0.2126, + "step": 6975 + }, + { + "epoch": 2.352604078880836, + "grad_norm": 1.2328021537041824, + "learning_rate": 5.719628277685554e-06, + "loss": 0.2225, + "step": 6980 + }, + { + "epoch": 2.354289566829597, + "grad_norm": 20.18868511446677, + "learning_rate": 5.714226355808626e-06, + "loss": 0.2244, + "step": 6985 + }, + { + "epoch": 2.3559750547783582, + "grad_norm": 1.3438543339064466, + "learning_rate": 5.708823582765522e-06, + "loss": 0.2709, + "step": 6990 + }, + { + "epoch": 2.3576605427271193, + "grad_norm": 1.124947070515268, + "learning_rate": 5.703419964994895e-06, + "loss": 0.2225, + "step": 6995 + }, + { + "epoch": 2.359346030675881, + "grad_norm": 1.310836306015827, + "learning_rate": 5.698015508936409e-06, + "loss": 0.2285, + "step": 7000 + }, + { + "epoch": 2.361031518624642, + "grad_norm": 1.584790783446456, + "learning_rate": 5.692610221030725e-06, + "loss": 0.2538, + "step": 7005 + }, + { + "epoch": 2.362717006573403, + "grad_norm": 1.4685376379074737, + "learning_rate": 5.687204107719497e-06, + "loss": 0.2165, + "step": 7010 + }, + { + "epoch": 2.364402494522164, + "grad_norm": 1.269877373502541, + "learning_rate": 5.68179717544536e-06, + "loss": 0.2023, + "step": 7015 + }, + { + "epoch": 2.3660879824709253, + "grad_norm": 1.094382809454758, + "learning_rate": 5.676389430651928e-06, + "loss": 0.2177, + "step": 7020 + }, + { + "epoch": 2.3677734704196864, + "grad_norm": 1.1387050143356454, + "learning_rate": 5.670980879783781e-06, + "loss": 0.2458, + "step": 7025 + }, + { + "epoch": 2.3694589583684476, + "grad_norm": 1.1974323412230972, + "learning_rate": 5.665571529286459e-06, + "loss": 0.2084, + "step": 7030 + }, + { + "epoch": 2.3711444463172087, + "grad_norm": 1.092310961345673, + "learning_rate": 5.660161385606457e-06, + "loss": 0.2047, + "step": 7035 + }, + { + "epoch": 2.3728299342659698, + "grad_norm": 1.527084870640945, + "learning_rate": 5.654750455191218e-06, + "loss": 0.232, + "step": 7040 + }, + { + "epoch": 2.3745154222147313, + "grad_norm": 1.0282336568760309, + "learning_rate": 5.649338744489117e-06, + "loss": 0.2158, + "step": 7045 + }, + { + "epoch": 2.3762009101634924, + "grad_norm": 1.3155101904587596, + "learning_rate": 5.643926259949457e-06, + "loss": 0.213, + "step": 7050 + }, + { + "epoch": 2.3778863981122536, + "grad_norm": 2.866191674783534, + "learning_rate": 5.638513008022474e-06, + "loss": 0.2209, + "step": 7055 + }, + { + "epoch": 2.3795718860610147, + "grad_norm": 1.5414440506664546, + "learning_rate": 5.633098995159309e-06, + "loss": 0.2424, + "step": 7060 + }, + { + "epoch": 2.3812573740097758, + "grad_norm": 1.112557460013863, + "learning_rate": 5.627684227812013e-06, + "loss": 0.201, + "step": 7065 + }, + { + "epoch": 2.382942861958537, + "grad_norm": 2.0114792448001535, + "learning_rate": 5.622268712433534e-06, + "loss": 0.2334, + "step": 7070 + }, + { + "epoch": 2.384628349907298, + "grad_norm": 1.2742428624111608, + "learning_rate": 5.616852455477716e-06, + "loss": 0.2351, + "step": 7075 + }, + { + "epoch": 2.3863138378560595, + "grad_norm": 1.3671795536971996, + "learning_rate": 5.611435463399281e-06, + "loss": 0.2398, + "step": 7080 + }, + { + "epoch": 2.3879993258048207, + "grad_norm": 1.1301205543521131, + "learning_rate": 5.606017742653833e-06, + "loss": 0.2235, + "step": 7085 + }, + { + "epoch": 2.3896848137535818, + "grad_norm": 1.1526152359615127, + "learning_rate": 5.600599299697839e-06, + "loss": 0.2468, + "step": 7090 + }, + { + "epoch": 2.391370301702343, + "grad_norm": 1.453507374800417, + "learning_rate": 5.595180140988632e-06, + "loss": 0.2254, + "step": 7095 + }, + { + "epoch": 2.393055789651104, + "grad_norm": 1.3710186359545902, + "learning_rate": 5.589760272984392e-06, + "loss": 0.2351, + "step": 7100 + }, + { + "epoch": 2.394741277599865, + "grad_norm": 1.754515083381334, + "learning_rate": 5.584339702144152e-06, + "loss": 0.2225, + "step": 7105 + }, + { + "epoch": 2.396426765548626, + "grad_norm": 1.199318449021297, + "learning_rate": 5.57891843492777e-06, + "loss": 0.2079, + "step": 7110 + }, + { + "epoch": 2.3981122534973873, + "grad_norm": 1.4269637236991999, + "learning_rate": 5.573496477795951e-06, + "loss": 0.2234, + "step": 7115 + }, + { + "epoch": 2.3997977414461484, + "grad_norm": 1.1534253539542598, + "learning_rate": 5.568073837210207e-06, + "loss": 0.2386, + "step": 7120 + }, + { + "epoch": 2.40148322939491, + "grad_norm": 1.355162400125902, + "learning_rate": 5.562650519632873e-06, + "loss": 0.2412, + "step": 7125 + }, + { + "epoch": 2.403168717343671, + "grad_norm": 1.1517182571316948, + "learning_rate": 5.557226531527088e-06, + "loss": 0.2236, + "step": 7130 + }, + { + "epoch": 2.404854205292432, + "grad_norm": 1.1364581590979208, + "learning_rate": 5.551801879356789e-06, + "loss": 0.2621, + "step": 7135 + }, + { + "epoch": 2.4065396932411933, + "grad_norm": 1.1700374556331525, + "learning_rate": 5.546376569586709e-06, + "loss": 0.2538, + "step": 7140 + }, + { + "epoch": 2.4082251811899544, + "grad_norm": 1.3523151286119313, + "learning_rate": 5.540950608682359e-06, + "loss": 0.2344, + "step": 7145 + }, + { + "epoch": 2.4099106691387155, + "grad_norm": 1.1825564225868845, + "learning_rate": 5.535524003110031e-06, + "loss": 0.2382, + "step": 7150 + }, + { + "epoch": 2.4115961570874767, + "grad_norm": 1.0797433745032434, + "learning_rate": 5.530096759336779e-06, + "loss": 0.2199, + "step": 7155 + }, + { + "epoch": 2.413281645036238, + "grad_norm": 1.0143081902605937, + "learning_rate": 5.5246688838304266e-06, + "loss": 0.2247, + "step": 7160 + }, + { + "epoch": 2.4149671329849993, + "grad_norm": 1.1118061266983945, + "learning_rate": 5.519240383059537e-06, + "loss": 0.2544, + "step": 7165 + }, + { + "epoch": 2.4166526209337604, + "grad_norm": 1.2286450157516224, + "learning_rate": 5.513811263493436e-06, + "loss": 0.2397, + "step": 7170 + }, + { + "epoch": 2.4183381088825215, + "grad_norm": 1.4927797502510405, + "learning_rate": 5.508381531602171e-06, + "loss": 0.2329, + "step": 7175 + }, + { + "epoch": 2.4200235968312827, + "grad_norm": 1.1002295912499036, + "learning_rate": 5.502951193856527e-06, + "loss": 0.2374, + "step": 7180 + }, + { + "epoch": 2.4217090847800438, + "grad_norm": 1.576560002392886, + "learning_rate": 5.49752025672801e-06, + "loss": 0.2239, + "step": 7185 + }, + { + "epoch": 2.423394572728805, + "grad_norm": 1.1424670041991118, + "learning_rate": 5.49208872668884e-06, + "loss": 0.2029, + "step": 7190 + }, + { + "epoch": 2.425080060677566, + "grad_norm": 1.2484868685933979, + "learning_rate": 5.486656610211943e-06, + "loss": 0.2285, + "step": 7195 + }, + { + "epoch": 2.426765548626327, + "grad_norm": 1.1959207944488555, + "learning_rate": 5.4812239137709465e-06, + "loss": 0.2173, + "step": 7200 + }, + { + "epoch": 2.4284510365750887, + "grad_norm": 1.3496514473746175, + "learning_rate": 5.475790643840162e-06, + "loss": 0.2303, + "step": 7205 + }, + { + "epoch": 2.4301365245238498, + "grad_norm": 1.197822436196079, + "learning_rate": 5.470356806894596e-06, + "loss": 0.234, + "step": 7210 + }, + { + "epoch": 2.431822012472611, + "grad_norm": 1.3377703610214131, + "learning_rate": 5.464922409409918e-06, + "loss": 0.1965, + "step": 7215 + }, + { + "epoch": 2.433507500421372, + "grad_norm": 1.1773728278895432, + "learning_rate": 5.459487457862473e-06, + "loss": 0.2185, + "step": 7220 + }, + { + "epoch": 2.435192988370133, + "grad_norm": 1.3169882505936392, + "learning_rate": 5.454051958729269e-06, + "loss": 0.2292, + "step": 7225 + }, + { + "epoch": 2.436878476318894, + "grad_norm": 1.6848622821363404, + "learning_rate": 5.44861591848796e-06, + "loss": 0.2107, + "step": 7230 + }, + { + "epoch": 2.4385639642676553, + "grad_norm": 1.1165133222451564, + "learning_rate": 5.443179343616846e-06, + "loss": 0.2379, + "step": 7235 + }, + { + "epoch": 2.440249452216417, + "grad_norm": 1.1185248465942157, + "learning_rate": 5.437742240594866e-06, + "loss": 0.2064, + "step": 7240 + }, + { + "epoch": 2.441934940165178, + "grad_norm": 1.1370278202403525, + "learning_rate": 5.4323046159015895e-06, + "loss": 0.2097, + "step": 7245 + }, + { + "epoch": 2.443620428113939, + "grad_norm": 1.374722614390174, + "learning_rate": 5.426866476017205e-06, + "loss": 0.2306, + "step": 7250 + }, + { + "epoch": 2.4453059160627, + "grad_norm": 1.132417679631243, + "learning_rate": 5.421427827422517e-06, + "loss": 0.2525, + "step": 7255 + }, + { + "epoch": 2.4469914040114613, + "grad_norm": 1.141997524164393, + "learning_rate": 5.415988676598933e-06, + "loss": 0.2321, + "step": 7260 + }, + { + "epoch": 2.4486768919602224, + "grad_norm": 1.017487900565632, + "learning_rate": 5.410549030028463e-06, + "loss": 0.2219, + "step": 7265 + }, + { + "epoch": 2.4503623799089835, + "grad_norm": 1.1477724859913059, + "learning_rate": 5.405108894193709e-06, + "loss": 0.2266, + "step": 7270 + }, + { + "epoch": 2.4520478678577446, + "grad_norm": 1.2500636663936444, + "learning_rate": 5.399668275577849e-06, + "loss": 0.2186, + "step": 7275 + }, + { + "epoch": 2.4537333558065058, + "grad_norm": 1.250040587105613, + "learning_rate": 5.39422718066464e-06, + "loss": 0.2474, + "step": 7280 + }, + { + "epoch": 2.4554188437552673, + "grad_norm": 1.1703132501739941, + "learning_rate": 5.3887856159384125e-06, + "loss": 0.2048, + "step": 7285 + }, + { + "epoch": 2.4571043317040284, + "grad_norm": 1.2197533074398348, + "learning_rate": 5.383343587884047e-06, + "loss": 0.2323, + "step": 7290 + }, + { + "epoch": 2.4587898196527895, + "grad_norm": 1.4973472049059546, + "learning_rate": 5.377901102986982e-06, + "loss": 0.2341, + "step": 7295 + }, + { + "epoch": 2.4604753076015506, + "grad_norm": 1.1198616931261667, + "learning_rate": 5.372458167733199e-06, + "loss": 0.2332, + "step": 7300 + }, + { + "epoch": 2.4621607955503118, + "grad_norm": 1.209762979996002, + "learning_rate": 5.367014788609217e-06, + "loss": 0.2009, + "step": 7305 + }, + { + "epoch": 2.463846283499073, + "grad_norm": 2.565199374352355, + "learning_rate": 5.361570972102083e-06, + "loss": 0.2158, + "step": 7310 + }, + { + "epoch": 2.465531771447834, + "grad_norm": 1.2584617419031974, + "learning_rate": 5.356126724699366e-06, + "loss": 0.218, + "step": 7315 + }, + { + "epoch": 2.4672172593965955, + "grad_norm": 19.181266839822687, + "learning_rate": 5.3506820528891466e-06, + "loss": 0.2413, + "step": 7320 + }, + { + "epoch": 2.4689027473453566, + "grad_norm": 1.266880506918883, + "learning_rate": 5.345236963160017e-06, + "loss": 0.2145, + "step": 7325 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 1.5613710090495556, + "learning_rate": 5.339791462001056e-06, + "loss": 0.2164, + "step": 7330 + }, + { + "epoch": 2.472273723242879, + "grad_norm": 1.3876495140019696, + "learning_rate": 5.334345555901845e-06, + "loss": 0.2321, + "step": 7335 + }, + { + "epoch": 2.47395921119164, + "grad_norm": 1.195579075161368, + "learning_rate": 5.328899251352443e-06, + "loss": 0.2234, + "step": 7340 + }, + { + "epoch": 2.475644699140401, + "grad_norm": 1.21658518877199, + "learning_rate": 5.323452554843383e-06, + "loss": 0.2334, + "step": 7345 + }, + { + "epoch": 2.477330187089162, + "grad_norm": 1.341412480385651, + "learning_rate": 5.3180054728656635e-06, + "loss": 0.2103, + "step": 7350 + }, + { + "epoch": 2.4790156750379233, + "grad_norm": 2.3581330846404054, + "learning_rate": 5.312558011910747e-06, + "loss": 0.2441, + "step": 7355 + }, + { + "epoch": 2.4807011629866844, + "grad_norm": 1.281528543535254, + "learning_rate": 5.3071101784705444e-06, + "loss": 0.2495, + "step": 7360 + }, + { + "epoch": 2.482386650935446, + "grad_norm": 1.2361244252686825, + "learning_rate": 5.301661979037412e-06, + "loss": 0.2333, + "step": 7365 + }, + { + "epoch": 2.484072138884207, + "grad_norm": 1.19514127211079, + "learning_rate": 5.296213420104141e-06, + "loss": 0.2239, + "step": 7370 + }, + { + "epoch": 2.485757626832968, + "grad_norm": 1.365035732612306, + "learning_rate": 5.290764508163953e-06, + "loss": 0.2272, + "step": 7375 + }, + { + "epoch": 2.4874431147817293, + "grad_norm": 1.8123799483990297, + "learning_rate": 5.285315249710488e-06, + "loss": 0.2451, + "step": 7380 + }, + { + "epoch": 2.4891286027304904, + "grad_norm": 1.1712964598933269, + "learning_rate": 5.279865651237801e-06, + "loss": 0.226, + "step": 7385 + }, + { + "epoch": 2.4908140906792515, + "grad_norm": 1.3165948441055455, + "learning_rate": 5.274415719240349e-06, + "loss": 0.2172, + "step": 7390 + }, + { + "epoch": 2.4924995786280126, + "grad_norm": 1.1848531526625858, + "learning_rate": 5.268965460212989e-06, + "loss": 0.2164, + "step": 7395 + }, + { + "epoch": 2.494185066576774, + "grad_norm": 1.3803426336906315, + "learning_rate": 5.26351488065097e-06, + "loss": 0.2293, + "step": 7400 + }, + { + "epoch": 2.4958705545255353, + "grad_norm": 1.1146173443586165, + "learning_rate": 5.258063987049919e-06, + "loss": 0.215, + "step": 7405 + }, + { + "epoch": 2.4975560424742964, + "grad_norm": 1.3421795249765416, + "learning_rate": 5.252612785905836e-06, + "loss": 0.2215, + "step": 7410 + }, + { + "epoch": 2.4992415304230575, + "grad_norm": 1.3343865702009974, + "learning_rate": 5.247161283715093e-06, + "loss": 0.2, + "step": 7415 + }, + { + "epoch": 2.5009270183718186, + "grad_norm": 1.2680822744156943, + "learning_rate": 5.241709486974419e-06, + "loss": 0.2448, + "step": 7420 + }, + { + "epoch": 2.5026125063205797, + "grad_norm": 1.2153448944194911, + "learning_rate": 5.23625740218089e-06, + "loss": 0.1976, + "step": 7425 + }, + { + "epoch": 2.504297994269341, + "grad_norm": 1.4596828307055203, + "learning_rate": 5.230805035831928e-06, + "loss": 0.2121, + "step": 7430 + }, + { + "epoch": 2.5059834822181024, + "grad_norm": 1.7781454267742418, + "learning_rate": 5.225352394425293e-06, + "loss": 0.2253, + "step": 7435 + }, + { + "epoch": 2.507668970166863, + "grad_norm": 1.2962141597729508, + "learning_rate": 5.2198994844590716e-06, + "loss": 0.2392, + "step": 7440 + }, + { + "epoch": 2.5093544581156246, + "grad_norm": 1.3549211808148687, + "learning_rate": 5.214446312431664e-06, + "loss": 0.2143, + "step": 7445 + }, + { + "epoch": 2.5110399460643857, + "grad_norm": 1.1777314657407805, + "learning_rate": 5.208992884841794e-06, + "loss": 0.2036, + "step": 7450 + }, + { + "epoch": 2.512725434013147, + "grad_norm": 1.1126328029588632, + "learning_rate": 5.203539208188479e-06, + "loss": 0.2215, + "step": 7455 + }, + { + "epoch": 2.514410921961908, + "grad_norm": 1.2350579187532946, + "learning_rate": 5.198085288971043e-06, + "loss": 0.2393, + "step": 7460 + }, + { + "epoch": 2.516096409910669, + "grad_norm": 1.1473437156684776, + "learning_rate": 5.19263113368909e-06, + "loss": 0.2234, + "step": 7465 + }, + { + "epoch": 2.51778189785943, + "grad_norm": 1.2605124960412208, + "learning_rate": 5.187176748842514e-06, + "loss": 0.2118, + "step": 7470 + }, + { + "epoch": 2.5194673858081913, + "grad_norm": 1.1996583064453163, + "learning_rate": 5.1817221409314755e-06, + "loss": 0.2229, + "step": 7475 + }, + { + "epoch": 2.521152873756953, + "grad_norm": 1.4972450500593595, + "learning_rate": 5.176267316456404e-06, + "loss": 0.2315, + "step": 7480 + }, + { + "epoch": 2.522838361705714, + "grad_norm": 1.2625856422283825, + "learning_rate": 5.170812281917985e-06, + "loss": 0.212, + "step": 7485 + }, + { + "epoch": 2.524523849654475, + "grad_norm": 1.0757073846492482, + "learning_rate": 5.16535704381716e-06, + "loss": 0.2255, + "step": 7490 + }, + { + "epoch": 2.526209337603236, + "grad_norm": 1.3284596906628738, + "learning_rate": 5.159901608655105e-06, + "loss": 0.2002, + "step": 7495 + }, + { + "epoch": 2.5278948255519973, + "grad_norm": 1.197013365424821, + "learning_rate": 5.154445982933238e-06, + "loss": 0.2159, + "step": 7500 + }, + { + "epoch": 2.5295803135007584, + "grad_norm": 1.1471487167123913, + "learning_rate": 5.148990173153198e-06, + "loss": 0.2442, + "step": 7505 + }, + { + "epoch": 2.5312658014495195, + "grad_norm": 1.1133548990083035, + "learning_rate": 5.1435341858168496e-06, + "loss": 0.1944, + "step": 7510 + }, + { + "epoch": 2.532951289398281, + "grad_norm": 2.8631588077757204, + "learning_rate": 5.138078027426263e-06, + "loss": 0.2119, + "step": 7515 + }, + { + "epoch": 2.5346367773470417, + "grad_norm": 1.2632368897607371, + "learning_rate": 5.132621704483718e-06, + "loss": 0.2332, + "step": 7520 + }, + { + "epoch": 2.5363222652958033, + "grad_norm": 1.2282471728320135, + "learning_rate": 5.127165223491684e-06, + "loss": 0.2447, + "step": 7525 + }, + { + "epoch": 2.5380077532445644, + "grad_norm": 1.237329542255208, + "learning_rate": 5.121708590952826e-06, + "loss": 0.1999, + "step": 7530 + }, + { + "epoch": 2.5396932411933255, + "grad_norm": 1.3866702828113873, + "learning_rate": 5.116251813369982e-06, + "loss": 0.2331, + "step": 7535 + }, + { + "epoch": 2.5413787291420866, + "grad_norm": 1.221353394456828, + "learning_rate": 5.1107948972461705e-06, + "loss": 0.2275, + "step": 7540 + }, + { + "epoch": 2.5430642170908477, + "grad_norm": 1.2602921223294226, + "learning_rate": 5.10533784908457e-06, + "loss": 0.2355, + "step": 7545 + }, + { + "epoch": 2.544749705039609, + "grad_norm": 1.400793972666376, + "learning_rate": 5.099880675388516e-06, + "loss": 0.2108, + "step": 7550 + }, + { + "epoch": 2.54643519298837, + "grad_norm": 1.3124538737996967, + "learning_rate": 5.094423382661496e-06, + "loss": 0.2109, + "step": 7555 + }, + { + "epoch": 2.5481206809371315, + "grad_norm": 1.270861270880195, + "learning_rate": 5.0889659774071396e-06, + "loss": 0.2184, + "step": 7560 + }, + { + "epoch": 2.5498061688858926, + "grad_norm": 1.1534545573853237, + "learning_rate": 5.08350846612921e-06, + "loss": 0.1968, + "step": 7565 + }, + { + "epoch": 2.5514916568346537, + "grad_norm": 1.1686319456141216, + "learning_rate": 5.078050855331595e-06, + "loss": 0.2319, + "step": 7570 + }, + { + "epoch": 2.553177144783415, + "grad_norm": 1.1816525144174514, + "learning_rate": 5.0725931515183035e-06, + "loss": 0.2312, + "step": 7575 + }, + { + "epoch": 2.554862632732176, + "grad_norm": 1.2888614690099474, + "learning_rate": 5.0671353611934505e-06, + "loss": 0.2166, + "step": 7580 + }, + { + "epoch": 2.556548120680937, + "grad_norm": 1.4569746507152874, + "learning_rate": 5.061677490861263e-06, + "loss": 0.2158, + "step": 7585 + }, + { + "epoch": 2.558233608629698, + "grad_norm": 1.5332163875387035, + "learning_rate": 5.056219547026055e-06, + "loss": 0.2155, + "step": 7590 + }, + { + "epoch": 2.5599190965784597, + "grad_norm": 1.1573902495188044, + "learning_rate": 5.050761536192231e-06, + "loss": 0.2012, + "step": 7595 + }, + { + "epoch": 2.5616045845272204, + "grad_norm": 1.0696710718777616, + "learning_rate": 5.0453034648642765e-06, + "loss": 0.2019, + "step": 7600 + }, + { + "epoch": 2.563290072475982, + "grad_norm": 1.9820669090703988, + "learning_rate": 5.039845339546749e-06, + "loss": 0.2183, + "step": 7605 + }, + { + "epoch": 2.564975560424743, + "grad_norm": 1.161627647696652, + "learning_rate": 5.034387166744266e-06, + "loss": 0.2155, + "step": 7610 + }, + { + "epoch": 2.566661048373504, + "grad_norm": 1.216790349189924, + "learning_rate": 5.028928952961507e-06, + "loss": 0.2159, + "step": 7615 + }, + { + "epoch": 2.5683465363222653, + "grad_norm": 1.074176202440405, + "learning_rate": 5.023470704703198e-06, + "loss": 0.237, + "step": 7620 + }, + { + "epoch": 2.5700320242710264, + "grad_norm": 1.1430658238276077, + "learning_rate": 5.018012428474108e-06, + "loss": 0.206, + "step": 7625 + }, + { + "epoch": 2.5717175122197875, + "grad_norm": 3.035099123795803, + "learning_rate": 5.012554130779035e-06, + "loss": 0.2132, + "step": 7630 + }, + { + "epoch": 2.5734030001685486, + "grad_norm": 1.164783026358899, + "learning_rate": 5.007095818122807e-06, + "loss": 0.196, + "step": 7635 + }, + { + "epoch": 2.57508848811731, + "grad_norm": 1.2985221587075237, + "learning_rate": 5.001637497010267e-06, + "loss": 0.2204, + "step": 7640 + }, + { + "epoch": 2.5767739760660713, + "grad_norm": 1.3168407527973596, + "learning_rate": 4.996179173946271e-06, + "loss": 0.2184, + "step": 7645 + }, + { + "epoch": 2.5784594640148324, + "grad_norm": 1.2553672938566072, + "learning_rate": 4.990720855435673e-06, + "loss": 0.2124, + "step": 7650 + }, + { + "epoch": 2.5801449519635935, + "grad_norm": 1.5232095565932093, + "learning_rate": 4.9852625479833275e-06, + "loss": 0.2119, + "step": 7655 + }, + { + "epoch": 2.5818304399123546, + "grad_norm": 1.3738872841862542, + "learning_rate": 4.97980425809407e-06, + "loss": 0.2234, + "step": 7660 + }, + { + "epoch": 2.5835159278611157, + "grad_norm": 1.193187891987461, + "learning_rate": 4.974345992272718e-06, + "loss": 0.2179, + "step": 7665 + }, + { + "epoch": 2.585201415809877, + "grad_norm": 1.2178922042183746, + "learning_rate": 4.9688877570240595e-06, + "loss": 0.2166, + "step": 7670 + }, + { + "epoch": 2.5868869037586384, + "grad_norm": 1.195714331610979, + "learning_rate": 4.9634295588528475e-06, + "loss": 0.2087, + "step": 7675 + }, + { + "epoch": 2.588572391707399, + "grad_norm": 1.5491557664737279, + "learning_rate": 4.957971404263787e-06, + "loss": 0.2112, + "step": 7680 + }, + { + "epoch": 2.5902578796561606, + "grad_norm": 1.4409633952231076, + "learning_rate": 4.952513299761536e-06, + "loss": 0.2252, + "step": 7685 + }, + { + "epoch": 2.5919433676049217, + "grad_norm": 1.1824290929772407, + "learning_rate": 4.947055251850692e-06, + "loss": 0.2267, + "step": 7690 + }, + { + "epoch": 2.593628855553683, + "grad_norm": 2.65766670226504, + "learning_rate": 4.94159726703578e-06, + "loss": 0.2183, + "step": 7695 + }, + { + "epoch": 2.595314343502444, + "grad_norm": 1.330761056252785, + "learning_rate": 4.936139351821257e-06, + "loss": 0.2231, + "step": 7700 + }, + { + "epoch": 2.596999831451205, + "grad_norm": 1.2768987847764608, + "learning_rate": 4.930681512711491e-06, + "loss": 0.2047, + "step": 7705 + }, + { + "epoch": 2.598685319399966, + "grad_norm": 2.5788407095683668, + "learning_rate": 4.925223756210762e-06, + "loss": 0.2023, + "step": 7710 + }, + { + "epoch": 2.6003708073487273, + "grad_norm": 1.6971820075392683, + "learning_rate": 4.919766088823253e-06, + "loss": 0.2338, + "step": 7715 + }, + { + "epoch": 2.602056295297489, + "grad_norm": 1.2076456903760784, + "learning_rate": 4.914308517053036e-06, + "loss": 0.2109, + "step": 7720 + }, + { + "epoch": 2.60374178324625, + "grad_norm": 1.3327065551630404, + "learning_rate": 4.908851047404076e-06, + "loss": 0.2316, + "step": 7725 + }, + { + "epoch": 2.605427271195011, + "grad_norm": 1.2027785275920007, + "learning_rate": 4.903393686380212e-06, + "loss": 0.2025, + "step": 7730 + }, + { + "epoch": 2.607112759143772, + "grad_norm": 1.4071669291229922, + "learning_rate": 4.89793644048515e-06, + "loss": 0.2089, + "step": 7735 + }, + { + "epoch": 2.6087982470925333, + "grad_norm": 1.8369380712878773, + "learning_rate": 4.892479316222467e-06, + "loss": 0.2097, + "step": 7740 + }, + { + "epoch": 2.6104837350412944, + "grad_norm": 1.243199599731519, + "learning_rate": 4.88702232009559e-06, + "loss": 0.237, + "step": 7745 + }, + { + "epoch": 2.6121692229900555, + "grad_norm": 1.2769876086429437, + "learning_rate": 4.881565458607793e-06, + "loss": 0.2367, + "step": 7750 + }, + { + "epoch": 2.613854710938817, + "grad_norm": 1.909431590495729, + "learning_rate": 4.876108738262189e-06, + "loss": 0.2087, + "step": 7755 + }, + { + "epoch": 2.6155401988875777, + "grad_norm": 1.2717307641002418, + "learning_rate": 4.870652165561731e-06, + "loss": 0.2147, + "step": 7760 + }, + { + "epoch": 2.6172256868363393, + "grad_norm": 1.137811095272054, + "learning_rate": 4.865195747009183e-06, + "loss": 0.2175, + "step": 7765 + }, + { + "epoch": 2.6189111747851004, + "grad_norm": 1.2006060819624018, + "learning_rate": 4.859739489107137e-06, + "loss": 0.2096, + "step": 7770 + }, + { + "epoch": 2.6205966627338615, + "grad_norm": 1.3185521440511523, + "learning_rate": 4.854283398357983e-06, + "loss": 0.2061, + "step": 7775 + }, + { + "epoch": 2.6222821506826226, + "grad_norm": 1.1883514424084403, + "learning_rate": 4.848827481263922e-06, + "loss": 0.1983, + "step": 7780 + }, + { + "epoch": 2.6239676386313837, + "grad_norm": 1.445064829296447, + "learning_rate": 4.84337174432694e-06, + "loss": 0.2381, + "step": 7785 + }, + { + "epoch": 2.625653126580145, + "grad_norm": 1.164969110462535, + "learning_rate": 4.837916194048814e-06, + "loss": 0.1991, + "step": 7790 + }, + { + "epoch": 2.627338614528906, + "grad_norm": 1.0917276156235423, + "learning_rate": 4.832460836931093e-06, + "loss": 0.2152, + "step": 7795 + }, + { + "epoch": 2.6290241024776675, + "grad_norm": 1.1716749826219557, + "learning_rate": 4.827005679475101e-06, + "loss": 0.2064, + "step": 7800 + }, + { + "epoch": 2.6307095904264286, + "grad_norm": 1.1835409077085433, + "learning_rate": 4.821550728181924e-06, + "loss": 0.2053, + "step": 7805 + }, + { + "epoch": 2.6323950783751897, + "grad_norm": 1.0672841722354087, + "learning_rate": 4.816095989552397e-06, + "loss": 0.2069, + "step": 7810 + }, + { + "epoch": 2.634080566323951, + "grad_norm": 1.1189104397633212, + "learning_rate": 4.8106414700871055e-06, + "loss": 0.2262, + "step": 7815 + }, + { + "epoch": 2.635766054272712, + "grad_norm": 1.4845855781818307, + "learning_rate": 4.805187176286375e-06, + "loss": 0.2088, + "step": 7820 + }, + { + "epoch": 2.637451542221473, + "grad_norm": 1.887706656080745, + "learning_rate": 4.799733114650258e-06, + "loss": 0.2265, + "step": 7825 + }, + { + "epoch": 2.639137030170234, + "grad_norm": 1.4732247431555492, + "learning_rate": 4.794279291678532e-06, + "loss": 0.2106, + "step": 7830 + }, + { + "epoch": 2.6408225181189957, + "grad_norm": 1.0853331771280081, + "learning_rate": 4.788825713870694e-06, + "loss": 0.2184, + "step": 7835 + }, + { + "epoch": 2.6425080060677564, + "grad_norm": 1.2963549201991373, + "learning_rate": 4.783372387725943e-06, + "loss": 0.2398, + "step": 7840 + }, + { + "epoch": 2.644193494016518, + "grad_norm": 1.0496304810572075, + "learning_rate": 4.777919319743182e-06, + "loss": 0.2107, + "step": 7845 + }, + { + "epoch": 2.645878981965279, + "grad_norm": 1.2044158354126433, + "learning_rate": 4.772466516421003e-06, + "loss": 0.2124, + "step": 7850 + }, + { + "epoch": 2.64756446991404, + "grad_norm": 1.2202219357417485, + "learning_rate": 4.767013984257687e-06, + "loss": 0.206, + "step": 7855 + }, + { + "epoch": 2.6492499578628013, + "grad_norm": 1.2914769386909546, + "learning_rate": 4.761561729751186e-06, + "loss": 0.1762, + "step": 7860 + }, + { + "epoch": 2.6509354458115624, + "grad_norm": 1.2146084373160662, + "learning_rate": 4.756109759399127e-06, + "loss": 0.2104, + "step": 7865 + }, + { + "epoch": 2.6526209337603235, + "grad_norm": 1.469151091659385, + "learning_rate": 4.750658079698793e-06, + "loss": 0.2354, + "step": 7870 + }, + { + "epoch": 2.6543064217090846, + "grad_norm": 1.2237814189742386, + "learning_rate": 4.745206697147129e-06, + "loss": 0.197, + "step": 7875 + }, + { + "epoch": 2.655991909657846, + "grad_norm": 1.334011718394007, + "learning_rate": 4.739755618240714e-06, + "loss": 0.2027, + "step": 7880 + }, + { + "epoch": 2.6576773976066073, + "grad_norm": 1.3450770693251022, + "learning_rate": 4.7343048494757765e-06, + "loss": 0.2219, + "step": 7885 + }, + { + "epoch": 2.6593628855553684, + "grad_norm": 1.1713996219825322, + "learning_rate": 4.728854397348166e-06, + "loss": 0.1955, + "step": 7890 + }, + { + "epoch": 2.6610483735041295, + "grad_norm": 1.473575977059867, + "learning_rate": 4.723404268353363e-06, + "loss": 0.2272, + "step": 7895 + }, + { + "epoch": 2.6627338614528906, + "grad_norm": 1.239481438038185, + "learning_rate": 4.717954468986456e-06, + "loss": 0.2136, + "step": 7900 + }, + { + "epoch": 2.6644193494016517, + "grad_norm": 1.2073338033281267, + "learning_rate": 4.712505005742143e-06, + "loss": 0.2061, + "step": 7905 + }, + { + "epoch": 2.666104837350413, + "grad_norm": 1.3312178764641676, + "learning_rate": 4.707055885114725e-06, + "loss": 0.237, + "step": 7910 + }, + { + "epoch": 2.6677903252991744, + "grad_norm": 1.897086284655153, + "learning_rate": 4.7016071135980915e-06, + "loss": 0.1846, + "step": 7915 + }, + { + "epoch": 2.669475813247935, + "grad_norm": 1.2203593846498346, + "learning_rate": 4.696158697685713e-06, + "loss": 0.1925, + "step": 7920 + }, + { + "epoch": 2.6711613011966966, + "grad_norm": 1.0575694355259007, + "learning_rate": 4.690710643870643e-06, + "loss": 0.1937, + "step": 7925 + }, + { + "epoch": 2.6728467891454577, + "grad_norm": 1.1964370422235926, + "learning_rate": 4.685262958645497e-06, + "loss": 0.2243, + "step": 7930 + }, + { + "epoch": 2.674532277094219, + "grad_norm": 1.3247158824635015, + "learning_rate": 4.679815648502455e-06, + "loss": 0.2049, + "step": 7935 + }, + { + "epoch": 2.67621776504298, + "grad_norm": 1.417846174872059, + "learning_rate": 4.67436871993325e-06, + "loss": 0.2225, + "step": 7940 + }, + { + "epoch": 2.677903252991741, + "grad_norm": 1.4640995421446863, + "learning_rate": 4.668922179429156e-06, + "loss": 0.224, + "step": 7945 + }, + { + "epoch": 2.679588740940502, + "grad_norm": 1.1232583515450298, + "learning_rate": 4.6634760334809945e-06, + "loss": 0.2091, + "step": 7950 + }, + { + "epoch": 2.6812742288892633, + "grad_norm": 1.2961341608661843, + "learning_rate": 4.658030288579104e-06, + "loss": 0.2191, + "step": 7955 + }, + { + "epoch": 2.682959716838025, + "grad_norm": 1.128248479176604, + "learning_rate": 4.652584951213354e-06, + "loss": 0.2122, + "step": 7960 + }, + { + "epoch": 2.684645204786786, + "grad_norm": 1.2685530371234115, + "learning_rate": 4.6471400278731245e-06, + "loss": 0.2486, + "step": 7965 + }, + { + "epoch": 2.686330692735547, + "grad_norm": 1.2482213439540817, + "learning_rate": 4.641695525047305e-06, + "loss": 0.1816, + "step": 7970 + }, + { + "epoch": 2.688016180684308, + "grad_norm": 1.1985374443223176, + "learning_rate": 4.63625144922428e-06, + "loss": 0.1891, + "step": 7975 + }, + { + "epoch": 2.6897016686330693, + "grad_norm": 1.2409915480778366, + "learning_rate": 4.630807806891927e-06, + "loss": 0.1994, + "step": 7980 + }, + { + "epoch": 2.6913871565818304, + "grad_norm": 1.299970662285356, + "learning_rate": 4.625364604537607e-06, + "loss": 0.2142, + "step": 7985 + }, + { + "epoch": 2.6930726445305915, + "grad_norm": 1.2035482765561063, + "learning_rate": 4.619921848648161e-06, + "loss": 0.1858, + "step": 7990 + }, + { + "epoch": 2.694758132479353, + "grad_norm": 1.0906507678198634, + "learning_rate": 4.6144795457098876e-06, + "loss": 0.2031, + "step": 7995 + }, + { + "epoch": 2.6964436204281137, + "grad_norm": 1.2642173617342594, + "learning_rate": 4.609037702208556e-06, + "loss": 0.2018, + "step": 8000 + }, + { + "epoch": 2.6981291083768753, + "grad_norm": 1.2095766816767846, + "learning_rate": 4.60359632462938e-06, + "loss": 0.2139, + "step": 8005 + }, + { + "epoch": 2.6998145963256364, + "grad_norm": 1.2460924525904324, + "learning_rate": 4.5981554194570256e-06, + "loss": 0.2017, + "step": 8010 + }, + { + "epoch": 2.7015000842743975, + "grad_norm": 1.1991951728812023, + "learning_rate": 4.592714993175588e-06, + "loss": 0.2038, + "step": 8015 + }, + { + "epoch": 2.7031855722231586, + "grad_norm": 1.1542325631291357, + "learning_rate": 4.587275052268596e-06, + "loss": 0.2162, + "step": 8020 + }, + { + "epoch": 2.7048710601719197, + "grad_norm": 1.3591661714420404, + "learning_rate": 4.581835603219002e-06, + "loss": 0.2031, + "step": 8025 + }, + { + "epoch": 2.706556548120681, + "grad_norm": 1.2550882397550795, + "learning_rate": 4.57639665250917e-06, + "loss": 0.2173, + "step": 8030 + }, + { + "epoch": 2.708242036069442, + "grad_norm": 1.2079248483458591, + "learning_rate": 4.570958206620868e-06, + "loss": 0.2142, + "step": 8035 + }, + { + "epoch": 2.7099275240182035, + "grad_norm": 1.0562215002063613, + "learning_rate": 4.565520272035265e-06, + "loss": 0.1825, + "step": 8040 + }, + { + "epoch": 2.7116130119669646, + "grad_norm": 1.0560766090254274, + "learning_rate": 4.560082855232919e-06, + "loss": 0.1951, + "step": 8045 + }, + { + "epoch": 2.7132984999157257, + "grad_norm": 1.2036721170342266, + "learning_rate": 4.554645962693773e-06, + "loss": 0.2162, + "step": 8050 + }, + { + "epoch": 2.714983987864487, + "grad_norm": 1.2550065545273137, + "learning_rate": 4.549209600897142e-06, + "loss": 0.2054, + "step": 8055 + }, + { + "epoch": 2.716669475813248, + "grad_norm": 1.256922529684213, + "learning_rate": 4.5437737763217135e-06, + "loss": 0.2103, + "step": 8060 + }, + { + "epoch": 2.718354963762009, + "grad_norm": 1.298482568255476, + "learning_rate": 4.538338495445531e-06, + "loss": 0.2225, + "step": 8065 + }, + { + "epoch": 2.72004045171077, + "grad_norm": 1.208277249486905, + "learning_rate": 4.532903764745991e-06, + "loss": 0.1922, + "step": 8070 + }, + { + "epoch": 2.7217259396595317, + "grad_norm": 1.287360635630622, + "learning_rate": 4.5274695906998325e-06, + "loss": 0.2132, + "step": 8075 + }, + { + "epoch": 2.7234114276082924, + "grad_norm": 1.508020618879755, + "learning_rate": 4.522035979783136e-06, + "loss": 0.214, + "step": 8080 + }, + { + "epoch": 2.725096915557054, + "grad_norm": 1.294481237052173, + "learning_rate": 4.516602938471305e-06, + "loss": 0.2101, + "step": 8085 + }, + { + "epoch": 2.726782403505815, + "grad_norm": 1.2667584687823812, + "learning_rate": 4.511170473239069e-06, + "loss": 0.2015, + "step": 8090 + }, + { + "epoch": 2.728467891454576, + "grad_norm": 1.232871861263674, + "learning_rate": 4.505738590560466e-06, + "loss": 0.2095, + "step": 8095 + }, + { + "epoch": 2.7301533794033372, + "grad_norm": 0.9720525051487791, + "learning_rate": 4.500307296908845e-06, + "loss": 0.1847, + "step": 8100 + }, + { + "epoch": 2.7318388673520984, + "grad_norm": 1.2182120125781937, + "learning_rate": 4.494876598756852e-06, + "loss": 0.1968, + "step": 8105 + }, + { + "epoch": 2.7335243553008595, + "grad_norm": 1.2811058412221115, + "learning_rate": 4.4894465025764196e-06, + "loss": 0.1936, + "step": 8110 + }, + { + "epoch": 2.7352098432496206, + "grad_norm": 1.203975144696989, + "learning_rate": 4.484017014838767e-06, + "loss": 0.2234, + "step": 8115 + }, + { + "epoch": 2.736895331198382, + "grad_norm": 1.114820195816211, + "learning_rate": 4.478588142014385e-06, + "loss": 0.2011, + "step": 8120 + }, + { + "epoch": 2.7385808191471432, + "grad_norm": 3.9301821811838704, + "learning_rate": 4.473159890573034e-06, + "loss": 0.22, + "step": 8125 + }, + { + "epoch": 2.7402663070959044, + "grad_norm": 1.1651231107384559, + "learning_rate": 4.4677322669837334e-06, + "loss": 0.1952, + "step": 8130 + }, + { + "epoch": 2.7419517950446655, + "grad_norm": 1.2026231387193682, + "learning_rate": 4.462305277714756e-06, + "loss": 0.2023, + "step": 8135 + }, + { + "epoch": 2.7436372829934266, + "grad_norm": 1.7641807551102524, + "learning_rate": 4.456878929233614e-06, + "loss": 0.224, + "step": 8140 + }, + { + "epoch": 2.7453227709421877, + "grad_norm": 1.3383941911155575, + "learning_rate": 4.451453228007061e-06, + "loss": 0.1947, + "step": 8145 + }, + { + "epoch": 2.747008258890949, + "grad_norm": 1.2695203975579834, + "learning_rate": 4.4460281805010755e-06, + "loss": 0.2028, + "step": 8150 + }, + { + "epoch": 2.7486937468397104, + "grad_norm": 1.4003249747645194, + "learning_rate": 4.44060379318086e-06, + "loss": 0.192, + "step": 8155 + }, + { + "epoch": 2.750379234788471, + "grad_norm": 1.368960866876165, + "learning_rate": 4.435180072510827e-06, + "loss": 0.2332, + "step": 8160 + }, + { + "epoch": 2.7520647227372326, + "grad_norm": 1.2414367283303092, + "learning_rate": 4.429757024954599e-06, + "loss": 0.2099, + "step": 8165 + }, + { + "epoch": 2.7537502106859937, + "grad_norm": 1.2826947437153777, + "learning_rate": 4.424334656974987e-06, + "loss": 0.2271, + "step": 8170 + }, + { + "epoch": 2.755435698634755, + "grad_norm": 1.1016378366259734, + "learning_rate": 4.418912975034008e-06, + "loss": 0.222, + "step": 8175 + }, + { + "epoch": 2.757121186583516, + "grad_norm": 1.1405163621409955, + "learning_rate": 4.413491985592846e-06, + "loss": 0.1995, + "step": 8180 + }, + { + "epoch": 2.758806674532277, + "grad_norm": 1.2888868278377614, + "learning_rate": 4.408071695111868e-06, + "loss": 0.2033, + "step": 8185 + }, + { + "epoch": 2.760492162481038, + "grad_norm": 1.1411197132080275, + "learning_rate": 4.402652110050605e-06, + "loss": 0.199, + "step": 8190 + }, + { + "epoch": 2.7621776504297992, + "grad_norm": 1.1580417718173754, + "learning_rate": 4.3972332368677496e-06, + "loss": 0.2121, + "step": 8195 + }, + { + "epoch": 2.763863138378561, + "grad_norm": 1.1365678268528452, + "learning_rate": 4.391815082021142e-06, + "loss": 0.2036, + "step": 8200 + }, + { + "epoch": 2.765548626327322, + "grad_norm": 1.4277318865557, + "learning_rate": 4.3863976519677725e-06, + "loss": 0.2077, + "step": 8205 + }, + { + "epoch": 2.767234114276083, + "grad_norm": 1.1670065742226998, + "learning_rate": 4.38098095316376e-06, + "loss": 0.1999, + "step": 8210 + }, + { + "epoch": 2.768919602224844, + "grad_norm": 1.1747133532537581, + "learning_rate": 4.375564992064359e-06, + "loss": 0.1808, + "step": 8215 + }, + { + "epoch": 2.7706050901736052, + "grad_norm": 1.331536656284568, + "learning_rate": 4.370149775123942e-06, + "loss": 0.2272, + "step": 8220 + }, + { + "epoch": 2.7722905781223663, + "grad_norm": 1.2305343169614085, + "learning_rate": 4.364735308795995e-06, + "loss": 0.2035, + "step": 8225 + }, + { + "epoch": 2.7739760660711275, + "grad_norm": 1.2877016204789493, + "learning_rate": 4.3593215995331065e-06, + "loss": 0.1835, + "step": 8230 + }, + { + "epoch": 2.775661554019889, + "grad_norm": 1.2191011990272045, + "learning_rate": 4.353908653786968e-06, + "loss": 0.2051, + "step": 8235 + }, + { + "epoch": 2.7773470419686497, + "grad_norm": 1.2161324917089447, + "learning_rate": 4.348496478008357e-06, + "loss": 0.216, + "step": 8240 + }, + { + "epoch": 2.7790325299174112, + "grad_norm": 1.0883390283573073, + "learning_rate": 4.343085078647133e-06, + "loss": 0.2135, + "step": 8245 + }, + { + "epoch": 2.7807180178661723, + "grad_norm": 1.1927476554034464, + "learning_rate": 4.337674462152236e-06, + "loss": 0.2062, + "step": 8250 + }, + { + "epoch": 2.7824035058149335, + "grad_norm": 1.3148578636143122, + "learning_rate": 4.332264634971668e-06, + "loss": 0.2079, + "step": 8255 + }, + { + "epoch": 2.7840889937636946, + "grad_norm": 1.4712378506522028, + "learning_rate": 4.326855603552491e-06, + "loss": 0.1958, + "step": 8260 + }, + { + "epoch": 2.7857744817124557, + "grad_norm": 1.4577330610313746, + "learning_rate": 4.321447374340817e-06, + "loss": 0.2019, + "step": 8265 + }, + { + "epoch": 2.787459969661217, + "grad_norm": 1.096441776769599, + "learning_rate": 4.316039953781809e-06, + "loss": 0.1837, + "step": 8270 + }, + { + "epoch": 2.789145457609978, + "grad_norm": 1.5145781993547742, + "learning_rate": 4.310633348319655e-06, + "loss": 0.1957, + "step": 8275 + }, + { + "epoch": 2.7908309455587395, + "grad_norm": 1.2267818699569348, + "learning_rate": 4.305227564397583e-06, + "loss": 0.2132, + "step": 8280 + }, + { + "epoch": 2.7925164335075006, + "grad_norm": 1.2254078190906166, + "learning_rate": 4.299822608457835e-06, + "loss": 0.204, + "step": 8285 + }, + { + "epoch": 2.7942019214562617, + "grad_norm": 1.3473040118593569, + "learning_rate": 4.29441848694167e-06, + "loss": 0.197, + "step": 8290 + }, + { + "epoch": 2.795887409405023, + "grad_norm": 1.3844537744939152, + "learning_rate": 4.2890152062893475e-06, + "loss": 0.2204, + "step": 8295 + }, + { + "epoch": 2.797572897353784, + "grad_norm": 1.1641308923110405, + "learning_rate": 4.283612772940132e-06, + "loss": 0.2297, + "step": 8300 + }, + { + "epoch": 2.799258385302545, + "grad_norm": 1.2817293621333818, + "learning_rate": 4.278211193332273e-06, + "loss": 0.2029, + "step": 8305 + }, + { + "epoch": 2.800943873251306, + "grad_norm": 1.2037204289050882, + "learning_rate": 4.272810473903003e-06, + "loss": 0.1948, + "step": 8310 + }, + { + "epoch": 2.8026293612000677, + "grad_norm": 1.4330754159672232, + "learning_rate": 4.2674106210885305e-06, + "loss": 0.2328, + "step": 8315 + }, + { + "epoch": 2.8043148491488283, + "grad_norm": 1.2967773614836344, + "learning_rate": 4.262011641324032e-06, + "loss": 0.1938, + "step": 8320 + }, + { + "epoch": 2.80600033709759, + "grad_norm": 1.3772603790369828, + "learning_rate": 4.2566135410436425e-06, + "loss": 0.1697, + "step": 8325 + }, + { + "epoch": 2.807685825046351, + "grad_norm": 1.3138260462934688, + "learning_rate": 4.2512163266804506e-06, + "loss": 0.226, + "step": 8330 + }, + { + "epoch": 2.809371312995112, + "grad_norm": 1.1462076660661356, + "learning_rate": 4.245820004666486e-06, + "loss": 0.1875, + "step": 8335 + }, + { + "epoch": 2.8110568009438732, + "grad_norm": 1.2362333917746737, + "learning_rate": 4.240424581432718e-06, + "loss": 0.1796, + "step": 8340 + }, + { + "epoch": 2.8127422888926343, + "grad_norm": 1.1409090711262735, + "learning_rate": 4.235030063409041e-06, + "loss": 0.1809, + "step": 8345 + }, + { + "epoch": 2.8144277768413954, + "grad_norm": 1.3813250458920163, + "learning_rate": 4.229636457024276e-06, + "loss": 0.1788, + "step": 8350 + }, + { + "epoch": 2.8161132647901566, + "grad_norm": 1.1293414456316424, + "learning_rate": 4.224243768706151e-06, + "loss": 0.2059, + "step": 8355 + }, + { + "epoch": 2.817798752738918, + "grad_norm": 1.8168562582897307, + "learning_rate": 4.218852004881305e-06, + "loss": 0.1933, + "step": 8360 + }, + { + "epoch": 2.819484240687679, + "grad_norm": 1.1886959699440642, + "learning_rate": 4.213461171975277e-06, + "loss": 0.2039, + "step": 8365 + }, + { + "epoch": 2.8211697286364403, + "grad_norm": 1.3212647878701373, + "learning_rate": 4.20807127641249e-06, + "loss": 0.2083, + "step": 8370 + }, + { + "epoch": 2.8228552165852014, + "grad_norm": 1.140767409050604, + "learning_rate": 4.202682324616253e-06, + "loss": 0.2046, + "step": 8375 + }, + { + "epoch": 2.8245407045339626, + "grad_norm": 1.1568938021812396, + "learning_rate": 4.1972943230087535e-06, + "loss": 0.1874, + "step": 8380 + }, + { + "epoch": 2.8262261924827237, + "grad_norm": 1.188451679119585, + "learning_rate": 4.19190727801104e-06, + "loss": 0.2167, + "step": 8385 + }, + { + "epoch": 2.8279116804314848, + "grad_norm": 1.273013068407849, + "learning_rate": 4.186521196043028e-06, + "loss": 0.2013, + "step": 8390 + }, + { + "epoch": 2.8295971683802463, + "grad_norm": 1.2511228585869532, + "learning_rate": 4.18113608352348e-06, + "loss": 0.2021, + "step": 8395 + }, + { + "epoch": 2.831282656329007, + "grad_norm": 1.2595578190181103, + "learning_rate": 4.175751946870005e-06, + "loss": 0.1958, + "step": 8400 + }, + { + "epoch": 2.8329681442777686, + "grad_norm": 1.2433040402270128, + "learning_rate": 4.1703687924990525e-06, + "loss": 0.2099, + "step": 8405 + }, + { + "epoch": 2.8346536322265297, + "grad_norm": 1.110021214593105, + "learning_rate": 4.164986626825894e-06, + "loss": 0.2025, + "step": 8410 + }, + { + "epoch": 2.8363391201752908, + "grad_norm": 1.064850840174882, + "learning_rate": 4.1596054562646294e-06, + "loss": 0.1933, + "step": 8415 + }, + { + "epoch": 2.838024608124052, + "grad_norm": 1.4229645174562606, + "learning_rate": 4.154225287228169e-06, + "loss": 0.1959, + "step": 8420 + }, + { + "epoch": 2.839710096072813, + "grad_norm": 1.2249032864371672, + "learning_rate": 4.148846126128232e-06, + "loss": 0.2047, + "step": 8425 + }, + { + "epoch": 2.841395584021574, + "grad_norm": 1.1301944668252912, + "learning_rate": 4.143467979375332e-06, + "loss": 0.1996, + "step": 8430 + }, + { + "epoch": 2.843081071970335, + "grad_norm": 1.2013755673175146, + "learning_rate": 4.1380908533787796e-06, + "loss": 0.2134, + "step": 8435 + }, + { + "epoch": 2.8447665599190968, + "grad_norm": 1.2307863801871006, + "learning_rate": 4.132714754546666e-06, + "loss": 0.2027, + "step": 8440 + }, + { + "epoch": 2.846452047867858, + "grad_norm": 1.4757993042778519, + "learning_rate": 4.127339689285859e-06, + "loss": 0.1973, + "step": 8445 + }, + { + "epoch": 2.848137535816619, + "grad_norm": 1.2041741783160391, + "learning_rate": 4.121965664001993e-06, + "loss": 0.2024, + "step": 8450 + }, + { + "epoch": 2.84982302376538, + "grad_norm": 1.6283152510947259, + "learning_rate": 4.116592685099464e-06, + "loss": 0.2068, + "step": 8455 + }, + { + "epoch": 2.851508511714141, + "grad_norm": 1.339424989314122, + "learning_rate": 4.111220758981422e-06, + "loss": 0.2173, + "step": 8460 + }, + { + "epoch": 2.8531939996629023, + "grad_norm": 1.227117541980443, + "learning_rate": 4.105849892049762e-06, + "loss": 0.2135, + "step": 8465 + }, + { + "epoch": 2.8548794876116634, + "grad_norm": 1.1500834859952311, + "learning_rate": 4.100480090705114e-06, + "loss": 0.2234, + "step": 8470 + }, + { + "epoch": 2.856564975560425, + "grad_norm": 1.2714701266672594, + "learning_rate": 4.095111361346842e-06, + "loss": 0.1971, + "step": 8475 + }, + { + "epoch": 2.8582504635091857, + "grad_norm": 1.1309754101918799, + "learning_rate": 4.089743710373031e-06, + "loss": 0.1961, + "step": 8480 + }, + { + "epoch": 2.859935951457947, + "grad_norm": 1.2029084595667585, + "learning_rate": 4.084377144180483e-06, + "loss": 0.2034, + "step": 8485 + }, + { + "epoch": 2.8616214394067083, + "grad_norm": 1.3267100492980242, + "learning_rate": 4.0790116691647e-06, + "loss": 0.2068, + "step": 8490 + }, + { + "epoch": 2.8633069273554694, + "grad_norm": 1.0543341592946291, + "learning_rate": 4.0736472917198924e-06, + "loss": 0.1918, + "step": 8495 + }, + { + "epoch": 2.8649924153042305, + "grad_norm": 1.3968526262026753, + "learning_rate": 4.068284018238957e-06, + "loss": 0.2082, + "step": 8500 + }, + { + "epoch": 2.8666779032529917, + "grad_norm": 1.2233220424688536, + "learning_rate": 4.062921855113478e-06, + "loss": 0.1928, + "step": 8505 + }, + { + "epoch": 2.8683633912017528, + "grad_norm": 1.21374727836665, + "learning_rate": 4.057560808733712e-06, + "loss": 0.1785, + "step": 8510 + }, + { + "epoch": 2.870048879150514, + "grad_norm": 1.2407628049003339, + "learning_rate": 4.052200885488591e-06, + "loss": 0.1953, + "step": 8515 + }, + { + "epoch": 2.8717343670992754, + "grad_norm": 1.1712166484264521, + "learning_rate": 4.046842091765706e-06, + "loss": 0.1898, + "step": 8520 + }, + { + "epoch": 2.8734198550480365, + "grad_norm": 1.351947766530135, + "learning_rate": 4.041484433951299e-06, + "loss": 0.227, + "step": 8525 + }, + { + "epoch": 2.8751053429967977, + "grad_norm": 0.9961877131524264, + "learning_rate": 4.036127918430262e-06, + "loss": 0.19, + "step": 8530 + }, + { + "epoch": 2.8767908309455588, + "grad_norm": 1.677230218101421, + "learning_rate": 4.030772551586123e-06, + "loss": 0.2115, + "step": 8535 + }, + { + "epoch": 2.87847631889432, + "grad_norm": 1.2414444268399183, + "learning_rate": 4.025418339801042e-06, + "loss": 0.2211, + "step": 8540 + }, + { + "epoch": 2.880161806843081, + "grad_norm": 1.1178112896661947, + "learning_rate": 4.020065289455803e-06, + "loss": 0.1955, + "step": 8545 + }, + { + "epoch": 2.881847294791842, + "grad_norm": 1.6767137087933748, + "learning_rate": 4.01471340692981e-06, + "loss": 0.191, + "step": 8550 + }, + { + "epoch": 2.8835327827406037, + "grad_norm": 1.2925204021902665, + "learning_rate": 4.009362698601065e-06, + "loss": 0.1837, + "step": 8555 + }, + { + "epoch": 2.8852182706893643, + "grad_norm": 1.2507662452920694, + "learning_rate": 4.00401317084618e-06, + "loss": 0.1778, + "step": 8560 + }, + { + "epoch": 2.886903758638126, + "grad_norm": 1.2101226725952177, + "learning_rate": 3.998664830040355e-06, + "loss": 0.1967, + "step": 8565 + }, + { + "epoch": 2.888589246586887, + "grad_norm": 1.2380819962739664, + "learning_rate": 3.99331768255738e-06, + "loss": 0.1751, + "step": 8570 + }, + { + "epoch": 2.890274734535648, + "grad_norm": 1.2024190079036536, + "learning_rate": 3.987971734769615e-06, + "loss": 0.1933, + "step": 8575 + }, + { + "epoch": 2.891960222484409, + "grad_norm": 1.1166676847965284, + "learning_rate": 3.982626993048001e-06, + "loss": 0.1691, + "step": 8580 + }, + { + "epoch": 2.8936457104331703, + "grad_norm": 1.2434254675121241, + "learning_rate": 3.97728346376203e-06, + "loss": 0.1866, + "step": 8585 + }, + { + "epoch": 2.8953311983819314, + "grad_norm": 1.269478442274771, + "learning_rate": 3.971941153279761e-06, + "loss": 0.195, + "step": 8590 + }, + { + "epoch": 2.8970166863306925, + "grad_norm": 1.217748685649073, + "learning_rate": 3.9666000679677925e-06, + "loss": 0.1996, + "step": 8595 + }, + { + "epoch": 2.898702174279454, + "grad_norm": 1.2592626629594095, + "learning_rate": 3.961260214191265e-06, + "loss": 0.187, + "step": 8600 + }, + { + "epoch": 2.900387662228215, + "grad_norm": 1.3085125127147617, + "learning_rate": 3.9559215983138514e-06, + "loss": 0.1975, + "step": 8605 + }, + { + "epoch": 2.9020731501769763, + "grad_norm": 1.1199222258238601, + "learning_rate": 3.950584226697749e-06, + "loss": 0.1868, + "step": 8610 + }, + { + "epoch": 2.9037586381257374, + "grad_norm": 1.4918499279087545, + "learning_rate": 3.945248105703672e-06, + "loss": 0.2301, + "step": 8615 + }, + { + "epoch": 2.9054441260744985, + "grad_norm": 1.2430268090108927, + "learning_rate": 3.939913241690846e-06, + "loss": 0.1937, + "step": 8620 + }, + { + "epoch": 2.9071296140232596, + "grad_norm": 1.1641142631885228, + "learning_rate": 3.934579641016999e-06, + "loss": 0.2004, + "step": 8625 + }, + { + "epoch": 2.9088151019720208, + "grad_norm": 1.299321455172275, + "learning_rate": 3.929247310038348e-06, + "loss": 0.1977, + "step": 8630 + }, + { + "epoch": 2.9105005899207823, + "grad_norm": 1.1702703265750152, + "learning_rate": 3.9239162551096035e-06, + "loss": 0.1912, + "step": 8635 + }, + { + "epoch": 2.912186077869543, + "grad_norm": 1.3354950999957933, + "learning_rate": 3.918586482583954e-06, + "loss": 0.2121, + "step": 8640 + }, + { + "epoch": 2.9138715658183045, + "grad_norm": 1.3093054335852983, + "learning_rate": 3.913257998813055e-06, + "loss": 0.1822, + "step": 8645 + }, + { + "epoch": 2.9155570537670656, + "grad_norm": 1.057291195670878, + "learning_rate": 3.9079308101470306e-06, + "loss": 0.19, + "step": 8650 + }, + { + "epoch": 2.9172425417158268, + "grad_norm": 1.4219857727997094, + "learning_rate": 3.902604922934461e-06, + "loss": 0.1821, + "step": 8655 + }, + { + "epoch": 2.918928029664588, + "grad_norm": 1.209282061453467, + "learning_rate": 3.897280343522372e-06, + "loss": 0.1832, + "step": 8660 + }, + { + "epoch": 2.920613517613349, + "grad_norm": 1.3582862854870226, + "learning_rate": 3.891957078256239e-06, + "loss": 0.1915, + "step": 8665 + }, + { + "epoch": 2.92229900556211, + "grad_norm": 1.3441310517298874, + "learning_rate": 3.88663513347996e-06, + "loss": 0.1877, + "step": 8670 + }, + { + "epoch": 2.923984493510871, + "grad_norm": 1.2848823431870975, + "learning_rate": 3.881314515535871e-06, + "loss": 0.2063, + "step": 8675 + }, + { + "epoch": 2.9256699814596328, + "grad_norm": 1.231905261181969, + "learning_rate": 3.875995230764715e-06, + "loss": 0.1862, + "step": 8680 + }, + { + "epoch": 2.927355469408394, + "grad_norm": 1.1868346313076998, + "learning_rate": 3.870677285505657e-06, + "loss": 0.1835, + "step": 8685 + }, + { + "epoch": 2.929040957357155, + "grad_norm": 1.194436929729291, + "learning_rate": 3.865360686096258e-06, + "loss": 0.1728, + "step": 8690 + }, + { + "epoch": 2.930726445305916, + "grad_norm": 1.3207718082413187, + "learning_rate": 3.860045438872477e-06, + "loss": 0.2142, + "step": 8695 + }, + { + "epoch": 2.932411933254677, + "grad_norm": 1.3018343404256352, + "learning_rate": 3.854731550168666e-06, + "loss": 0.1856, + "step": 8700 + }, + { + "epoch": 2.9340974212034383, + "grad_norm": 1.2405065418914671, + "learning_rate": 3.8494190263175545e-06, + "loss": 0.1875, + "step": 8705 + }, + { + "epoch": 2.9357829091521994, + "grad_norm": 1.1474049715556274, + "learning_rate": 3.844107873650242e-06, + "loss": 0.1846, + "step": 8710 + }, + { + "epoch": 2.937468397100961, + "grad_norm": 1.3848845959800287, + "learning_rate": 3.838798098496201e-06, + "loss": 0.1787, + "step": 8715 + }, + { + "epoch": 2.9391538850497216, + "grad_norm": 1.2664715246312455, + "learning_rate": 3.833489707183256e-06, + "loss": 0.185, + "step": 8720 + }, + { + "epoch": 2.940839372998483, + "grad_norm": 1.6243977252485549, + "learning_rate": 3.828182706037588e-06, + "loss": 0.198, + "step": 8725 + }, + { + "epoch": 2.9425248609472443, + "grad_norm": 1.5130037637558595, + "learning_rate": 3.8228771013837145e-06, + "loss": 0.1918, + "step": 8730 + }, + { + "epoch": 2.9442103488960054, + "grad_norm": 1.2889800657327404, + "learning_rate": 3.817572899544494e-06, + "loss": 0.1905, + "step": 8735 + }, + { + "epoch": 2.9458958368447665, + "grad_norm": 1.3477371490792038, + "learning_rate": 3.812270106841114e-06, + "loss": 0.1774, + "step": 8740 + }, + { + "epoch": 2.9475813247935276, + "grad_norm": 1.2295097696184567, + "learning_rate": 3.8069687295930803e-06, + "loss": 0.1796, + "step": 8745 + }, + { + "epoch": 2.9492668127422887, + "grad_norm": 1.0474065290922339, + "learning_rate": 3.80166877411821e-06, + "loss": 0.1669, + "step": 8750 + }, + { + "epoch": 2.95095230069105, + "grad_norm": 1.2037522765837032, + "learning_rate": 3.79637024673263e-06, + "loss": 0.2066, + "step": 8755 + }, + { + "epoch": 2.9526377886398114, + "grad_norm": 1.2733518171976481, + "learning_rate": 3.7910731537507616e-06, + "loss": 0.1829, + "step": 8760 + }, + { + "epoch": 2.9543232765885725, + "grad_norm": 1.1867266001542542, + "learning_rate": 3.78577750148532e-06, + "loss": 0.1667, + "step": 8765 + }, + { + "epoch": 2.9560087645373336, + "grad_norm": 1.436934798961873, + "learning_rate": 3.7804832962472985e-06, + "loss": 0.173, + "step": 8770 + }, + { + "epoch": 2.9576942524860947, + "grad_norm": 1.2156057391485697, + "learning_rate": 3.775190544345973e-06, + "loss": 0.2066, + "step": 8775 + }, + { + "epoch": 2.959379740434856, + "grad_norm": 1.3481180264188397, + "learning_rate": 3.7698992520888833e-06, + "loss": 0.1987, + "step": 8780 + }, + { + "epoch": 2.961065228383617, + "grad_norm": 1.119308881281576, + "learning_rate": 3.7646094257818276e-06, + "loss": 0.2078, + "step": 8785 + }, + { + "epoch": 2.962750716332378, + "grad_norm": 1.2703548308382717, + "learning_rate": 3.759321071728861e-06, + "loss": 0.2038, + "step": 8790 + }, + { + "epoch": 2.9644362042811396, + "grad_norm": 1.1091967158693012, + "learning_rate": 3.7540341962322835e-06, + "loss": 0.1812, + "step": 8795 + }, + { + "epoch": 2.9661216922299003, + "grad_norm": 1.3040108337849374, + "learning_rate": 3.74874880559263e-06, + "loss": 0.2036, + "step": 8800 + }, + { + "epoch": 2.967807180178662, + "grad_norm": 1.8342470499943189, + "learning_rate": 3.7434649061086703e-06, + "loss": 0.1728, + "step": 8805 + }, + { + "epoch": 2.969492668127423, + "grad_norm": 1.327575741254867, + "learning_rate": 3.738182504077392e-06, + "loss": 0.1913, + "step": 8810 + }, + { + "epoch": 2.971178156076184, + "grad_norm": 1.2447382092558106, + "learning_rate": 3.732901605794003e-06, + "loss": 0.1884, + "step": 8815 + }, + { + "epoch": 2.972863644024945, + "grad_norm": 1.1298239317913796, + "learning_rate": 3.727622217551918e-06, + "loss": 0.1901, + "step": 8820 + }, + { + "epoch": 2.9745491319737063, + "grad_norm": 0.9757532167636964, + "learning_rate": 3.7223443456427482e-06, + "loss": 0.1554, + "step": 8825 + }, + { + "epoch": 2.9762346199224674, + "grad_norm": 1.2485801386614734, + "learning_rate": 3.717067996356304e-06, + "loss": 0.1849, + "step": 8830 + }, + { + "epoch": 2.9779201078712285, + "grad_norm": 1.1862094486162922, + "learning_rate": 3.711793175980576e-06, + "loss": 0.1902, + "step": 8835 + }, + { + "epoch": 2.97960559581999, + "grad_norm": 1.4458438812909886, + "learning_rate": 3.7065198908017353e-06, + "loss": 0.1956, + "step": 8840 + }, + { + "epoch": 2.981291083768751, + "grad_norm": 1.2143320584081898, + "learning_rate": 3.7012481471041208e-06, + "loss": 0.1917, + "step": 8845 + }, + { + "epoch": 2.9829765717175123, + "grad_norm": 1.1529784288846208, + "learning_rate": 3.695977951170241e-06, + "loss": 0.1837, + "step": 8850 + }, + { + "epoch": 2.9846620596662734, + "grad_norm": 1.2195289935083835, + "learning_rate": 3.6907093092807515e-06, + "loss": 0.1765, + "step": 8855 + }, + { + "epoch": 2.9863475476150345, + "grad_norm": 1.5852245782966363, + "learning_rate": 3.685442227714463e-06, + "loss": 0.1963, + "step": 8860 + }, + { + "epoch": 2.9880330355637956, + "grad_norm": 1.2592376341505946, + "learning_rate": 3.6801767127483207e-06, + "loss": 0.2108, + "step": 8865 + }, + { + "epoch": 2.9897185235125567, + "grad_norm": 1.2329518408245288, + "learning_rate": 3.6749127706574074e-06, + "loss": 0.1821, + "step": 8870 + }, + { + "epoch": 2.9914040114613183, + "grad_norm": 1.1149355513725558, + "learning_rate": 3.669650407714928e-06, + "loss": 0.152, + "step": 8875 + }, + { + "epoch": 2.993089499410079, + "grad_norm": 1.2436052599352312, + "learning_rate": 3.664389630192209e-06, + "loss": 0.1968, + "step": 8880 + }, + { + "epoch": 2.9947749873588405, + "grad_norm": 1.2545273870907585, + "learning_rate": 3.6591304443586826e-06, + "loss": 0.192, + "step": 8885 + }, + { + "epoch": 2.9964604753076016, + "grad_norm": 1.2005253337127357, + "learning_rate": 3.6538728564818903e-06, + "loss": 0.1879, + "step": 8890 + }, + { + "epoch": 2.9981459632563627, + "grad_norm": 1.3896115577203842, + "learning_rate": 3.6486168728274655e-06, + "loss": 0.1753, + "step": 8895 + }, + { + "epoch": 2.999831451205124, + "grad_norm": 1.2651520950003203, + "learning_rate": 3.643362499659131e-06, + "loss": 0.1824, + "step": 8900 + }, + { + "epoch": 3.001348390359009, + "grad_norm": 1.0207205459257762, + "learning_rate": 3.638109743238688e-06, + "loss": 0.1554, + "step": 8905 + }, + { + "epoch": 3.00303387830777, + "grad_norm": 1.2038156734208045, + "learning_rate": 3.6328586098260143e-06, + "loss": 0.1645, + "step": 8910 + }, + { + "epoch": 3.0047193662565315, + "grad_norm": 1.08229609164426, + "learning_rate": 3.6276091056790507e-06, + "loss": 0.1794, + "step": 8915 + }, + { + "epoch": 3.0064048542052926, + "grad_norm": 1.2777016525802405, + "learning_rate": 3.6223612370537965e-06, + "loss": 0.1575, + "step": 8920 + }, + { + "epoch": 3.0080903421540537, + "grad_norm": 1.3440606799362391, + "learning_rate": 3.6171150102043074e-06, + "loss": 0.1899, + "step": 8925 + }, + { + "epoch": 3.009775830102815, + "grad_norm": 1.341990062641125, + "learning_rate": 3.6118704313826735e-06, + "loss": 0.1791, + "step": 8930 + }, + { + "epoch": 3.011461318051576, + "grad_norm": 1.150587983663152, + "learning_rate": 3.60662750683903e-06, + "loss": 0.1747, + "step": 8935 + }, + { + "epoch": 3.013146806000337, + "grad_norm": 1.2060600777552093, + "learning_rate": 3.601386242821532e-06, + "loss": 0.1795, + "step": 8940 + }, + { + "epoch": 3.014832293949098, + "grad_norm": 1.2085366981686378, + "learning_rate": 3.5961466455763617e-06, + "loss": 0.1782, + "step": 8945 + }, + { + "epoch": 3.0165177818978592, + "grad_norm": 1.374761290347725, + "learning_rate": 3.5909087213477134e-06, + "loss": 0.1756, + "step": 8950 + }, + { + "epoch": 3.018203269846621, + "grad_norm": 1.38360173008327, + "learning_rate": 3.5856724763777868e-06, + "loss": 0.1525, + "step": 8955 + }, + { + "epoch": 3.019888757795382, + "grad_norm": 1.3231287738293254, + "learning_rate": 3.58043791690678e-06, + "loss": 0.173, + "step": 8960 + }, + { + "epoch": 3.021574245744143, + "grad_norm": 1.2541823368229426, + "learning_rate": 3.5752050491728863e-06, + "loss": 0.1679, + "step": 8965 + }, + { + "epoch": 3.023259733692904, + "grad_norm": 1.1878348160244694, + "learning_rate": 3.5699738794122775e-06, + "loss": 0.169, + "step": 8970 + }, + { + "epoch": 3.0249452216416652, + "grad_norm": 1.283960050232123, + "learning_rate": 3.5647444138591057e-06, + "loss": 0.1765, + "step": 8975 + }, + { + "epoch": 3.0266307095904263, + "grad_norm": 1.4704810541047644, + "learning_rate": 3.5595166587454894e-06, + "loss": 0.1865, + "step": 8980 + }, + { + "epoch": 3.0283161975391875, + "grad_norm": 1.7976473900655823, + "learning_rate": 3.5542906203015114e-06, + "loss": 0.1714, + "step": 8985 + }, + { + "epoch": 3.0300016854879486, + "grad_norm": 1.1686552991371704, + "learning_rate": 3.5490663047552055e-06, + "loss": 0.1563, + "step": 8990 + }, + { + "epoch": 3.03168717343671, + "grad_norm": 1.124015045321442, + "learning_rate": 3.5438437183325543e-06, + "loss": 0.1593, + "step": 8995 + }, + { + "epoch": 3.0333726613854712, + "grad_norm": 1.2155243699304927, + "learning_rate": 3.5386228672574806e-06, + "loss": 0.165, + "step": 9000 + }, + { + "epoch": 3.0350581493342323, + "grad_norm": 1.143240939695264, + "learning_rate": 3.53340375775184e-06, + "loss": 0.1347, + "step": 9005 + }, + { + "epoch": 3.0367436372829935, + "grad_norm": 1.4439635724868005, + "learning_rate": 3.528186396035407e-06, + "loss": 0.1729, + "step": 9010 + }, + { + "epoch": 3.0384291252317546, + "grad_norm": 1.1233288549598444, + "learning_rate": 3.522970788325879e-06, + "loss": 0.1641, + "step": 9015 + }, + { + "epoch": 3.0401146131805157, + "grad_norm": 1.2295149688056763, + "learning_rate": 3.517756940838859e-06, + "loss": 0.1804, + "step": 9020 + }, + { + "epoch": 3.041800101129277, + "grad_norm": 1.464718560477114, + "learning_rate": 3.5125448597878563e-06, + "loss": 0.173, + "step": 9025 + }, + { + "epoch": 3.043485589078038, + "grad_norm": 1.2826674637909234, + "learning_rate": 3.5073345513842717e-06, + "loss": 0.1721, + "step": 9030 + }, + { + "epoch": 3.0451710770267995, + "grad_norm": 1.2067772736183733, + "learning_rate": 3.5021260218373943e-06, + "loss": 0.1674, + "step": 9035 + }, + { + "epoch": 3.0468565649755606, + "grad_norm": 1.2387239718781364, + "learning_rate": 3.4969192773543968e-06, + "loss": 0.1375, + "step": 9040 + }, + { + "epoch": 3.0485420529243217, + "grad_norm": 1.3775821528176602, + "learning_rate": 3.4917143241403185e-06, + "loss": 0.1546, + "step": 9045 + }, + { + "epoch": 3.050227540873083, + "grad_norm": 1.3526829775589913, + "learning_rate": 3.486511168398068e-06, + "loss": 0.1629, + "step": 9050 + }, + { + "epoch": 3.051913028821844, + "grad_norm": 1.3103433087396437, + "learning_rate": 3.481309816328412e-06, + "loss": 0.155, + "step": 9055 + }, + { + "epoch": 3.053598516770605, + "grad_norm": 1.6886999726326517, + "learning_rate": 3.4761102741299648e-06, + "loss": 0.1448, + "step": 9060 + }, + { + "epoch": 3.055284004719366, + "grad_norm": 1.222040977445654, + "learning_rate": 3.4709125479991867e-06, + "loss": 0.1438, + "step": 9065 + }, + { + "epoch": 3.0569694926681272, + "grad_norm": 1.4398985720510666, + "learning_rate": 3.4657166441303706e-06, + "loss": 0.1554, + "step": 9070 + }, + { + "epoch": 3.058654980616889, + "grad_norm": 1.6404960464778278, + "learning_rate": 3.4605225687156423e-06, + "loss": 0.1512, + "step": 9075 + }, + { + "epoch": 3.06034046856565, + "grad_norm": 1.117768762825777, + "learning_rate": 3.4553303279449463e-06, + "loss": 0.1789, + "step": 9080 + }, + { + "epoch": 3.062025956514411, + "grad_norm": 1.1939243780201165, + "learning_rate": 3.4501399280060383e-06, + "loss": 0.1503, + "step": 9085 + }, + { + "epoch": 3.063711444463172, + "grad_norm": 1.1687677704075992, + "learning_rate": 3.4449513750844843e-06, + "loss": 0.17, + "step": 9090 + }, + { + "epoch": 3.0653969324119332, + "grad_norm": 1.1975196012625673, + "learning_rate": 3.4397646753636447e-06, + "loss": 0.1627, + "step": 9095 + }, + { + "epoch": 3.0670824203606943, + "grad_norm": 1.1203286258112777, + "learning_rate": 3.434579835024676e-06, + "loss": 0.1588, + "step": 9100 + }, + { + "epoch": 3.0687679083094554, + "grad_norm": 1.264716385774105, + "learning_rate": 3.4293968602465164e-06, + "loss": 0.172, + "step": 9105 + }, + { + "epoch": 3.0704533962582166, + "grad_norm": 1.3119060577741284, + "learning_rate": 3.42421575720588e-06, + "loss": 0.167, + "step": 9110 + }, + { + "epoch": 3.072138884206978, + "grad_norm": 1.178899013183934, + "learning_rate": 3.419036532077252e-06, + "loss": 0.1643, + "step": 9115 + }, + { + "epoch": 3.0738243721557392, + "grad_norm": 1.2551607239940359, + "learning_rate": 3.4138591910328817e-06, + "loss": 0.1641, + "step": 9120 + }, + { + "epoch": 3.0755098601045003, + "grad_norm": 1.1797211896509134, + "learning_rate": 3.4086837402427664e-06, + "loss": 0.1802, + "step": 9125 + }, + { + "epoch": 3.0771953480532614, + "grad_norm": 1.4830834279098966, + "learning_rate": 3.4035101858746578e-06, + "loss": 0.1721, + "step": 9130 + }, + { + "epoch": 3.0788808360020226, + "grad_norm": 1.774760111704624, + "learning_rate": 3.398338534094042e-06, + "loss": 0.1745, + "step": 9135 + }, + { + "epoch": 3.0805663239507837, + "grad_norm": 1.1693511782575596, + "learning_rate": 3.393168791064143e-06, + "loss": 0.1685, + "step": 9140 + }, + { + "epoch": 3.082251811899545, + "grad_norm": 1.0183558877902805, + "learning_rate": 3.3880009629459045e-06, + "loss": 0.1424, + "step": 9145 + }, + { + "epoch": 3.083937299848306, + "grad_norm": 1.4007350339055136, + "learning_rate": 3.3828350558979924e-06, + "loss": 0.1654, + "step": 9150 + }, + { + "epoch": 3.0856227877970674, + "grad_norm": 1.2276635073812663, + "learning_rate": 3.3776710760767822e-06, + "loss": 0.175, + "step": 9155 + }, + { + "epoch": 3.0873082757458286, + "grad_norm": 1.282045574865131, + "learning_rate": 3.372509029636353e-06, + "loss": 0.1846, + "step": 9160 + }, + { + "epoch": 3.0889937636945897, + "grad_norm": 1.3442914045267893, + "learning_rate": 3.3673489227284773e-06, + "loss": 0.1494, + "step": 9165 + }, + { + "epoch": 3.0906792516433508, + "grad_norm": 1.2678535007672018, + "learning_rate": 3.3621907615026196e-06, + "loss": 0.1843, + "step": 9170 + }, + { + "epoch": 3.092364739592112, + "grad_norm": 1.287782306366842, + "learning_rate": 3.3570345521059217e-06, + "loss": 0.1732, + "step": 9175 + }, + { + "epoch": 3.094050227540873, + "grad_norm": 1.37481541255487, + "learning_rate": 3.3518803006832047e-06, + "loss": 0.176, + "step": 9180 + }, + { + "epoch": 3.095735715489634, + "grad_norm": 1.1787929412344784, + "learning_rate": 3.34672801337695e-06, + "loss": 0.1532, + "step": 9185 + }, + { + "epoch": 3.097421203438395, + "grad_norm": 1.1753370420040756, + "learning_rate": 3.341577696327304e-06, + "loss": 0.1802, + "step": 9190 + }, + { + "epoch": 3.0991066913871568, + "grad_norm": 1.3630837478662825, + "learning_rate": 3.336429355672063e-06, + "loss": 0.1819, + "step": 9195 + }, + { + "epoch": 3.100792179335918, + "grad_norm": 1.361996292551038, + "learning_rate": 3.331282997546666e-06, + "loss": 0.1705, + "step": 9200 + }, + { + "epoch": 3.102477667284679, + "grad_norm": 1.2526914617444551, + "learning_rate": 3.326138628084192e-06, + "loss": 0.1551, + "step": 9205 + }, + { + "epoch": 3.10416315523344, + "grad_norm": 1.1758149636665614, + "learning_rate": 3.3209962534153493e-06, + "loss": 0.1287, + "step": 9210 + }, + { + "epoch": 3.105848643182201, + "grad_norm": 1.415438669706238, + "learning_rate": 3.3158558796684683e-06, + "loss": 0.1814, + "step": 9215 + }, + { + "epoch": 3.1075341311309623, + "grad_norm": 1.1432014158085264, + "learning_rate": 3.310717512969494e-06, + "loss": 0.1467, + "step": 9220 + }, + { + "epoch": 3.1092196190797234, + "grad_norm": 1.1892849474522582, + "learning_rate": 3.305581159441984e-06, + "loss": 0.1772, + "step": 9225 + }, + { + "epoch": 3.1109051070284846, + "grad_norm": 1.1475388545749203, + "learning_rate": 3.3004468252070908e-06, + "loss": 0.1656, + "step": 9230 + }, + { + "epoch": 3.112590594977246, + "grad_norm": 1.2756817502779538, + "learning_rate": 3.2953145163835655e-06, + "loss": 0.1813, + "step": 9235 + }, + { + "epoch": 3.114276082926007, + "grad_norm": 1.7405597519767244, + "learning_rate": 3.2901842390877403e-06, + "loss": 0.1392, + "step": 9240 + }, + { + "epoch": 3.1159615708747683, + "grad_norm": 1.3434266543562863, + "learning_rate": 3.285055999433531e-06, + "loss": 0.1688, + "step": 9245 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.2182793869948876, + "learning_rate": 3.2799298035324224e-06, + "loss": 0.1576, + "step": 9250 + }, + { + "epoch": 3.1193325467722905, + "grad_norm": 1.30723048266272, + "learning_rate": 3.274805657493466e-06, + "loss": 0.1762, + "step": 9255 + }, + { + "epoch": 3.1210180347210517, + "grad_norm": 1.1708361861411545, + "learning_rate": 3.2696835674232653e-06, + "loss": 0.1626, + "step": 9260 + }, + { + "epoch": 3.1227035226698128, + "grad_norm": 1.200197195129053, + "learning_rate": 3.2645635394259822e-06, + "loss": 0.17, + "step": 9265 + }, + { + "epoch": 3.124389010618574, + "grad_norm": 1.2379102143632188, + "learning_rate": 3.2594455796033144e-06, + "loss": 0.1664, + "step": 9270 + }, + { + "epoch": 3.1260744985673354, + "grad_norm": 1.1989971064565192, + "learning_rate": 3.2543296940544967e-06, + "loss": 0.1853, + "step": 9275 + }, + { + "epoch": 3.1277599865160965, + "grad_norm": 1.2455747680454687, + "learning_rate": 3.2492158888762905e-06, + "loss": 0.1616, + "step": 9280 + }, + { + "epoch": 3.1294454744648577, + "grad_norm": 1.294602311430553, + "learning_rate": 3.2441041701629815e-06, + "loss": 0.173, + "step": 9285 + }, + { + "epoch": 3.1311309624136188, + "grad_norm": 1.1918922901924984, + "learning_rate": 3.2389945440063662e-06, + "loss": 0.1566, + "step": 9290 + }, + { + "epoch": 3.13281645036238, + "grad_norm": 1.0910472071920585, + "learning_rate": 3.233887016495746e-06, + "loss": 0.1464, + "step": 9295 + }, + { + "epoch": 3.134501938311141, + "grad_norm": 1.516106917729951, + "learning_rate": 3.2287815937179277e-06, + "loss": 0.1717, + "step": 9300 + }, + { + "epoch": 3.136187426259902, + "grad_norm": 1.3704725428024847, + "learning_rate": 3.223678281757202e-06, + "loss": 0.1726, + "step": 9305 + }, + { + "epoch": 3.137872914208663, + "grad_norm": 1.2223479549027039, + "learning_rate": 3.2185770866953476e-06, + "loss": 0.1608, + "step": 9310 + }, + { + "epoch": 3.1395584021574248, + "grad_norm": 1.2740998782271278, + "learning_rate": 3.2134780146116225e-06, + "loss": 0.1697, + "step": 9315 + }, + { + "epoch": 3.141243890106186, + "grad_norm": 1.2558127468529559, + "learning_rate": 3.2083810715827495e-06, + "loss": 0.1751, + "step": 9320 + }, + { + "epoch": 3.142929378054947, + "grad_norm": 1.1766271095842882, + "learning_rate": 3.20328626368292e-06, + "loss": 0.1597, + "step": 9325 + }, + { + "epoch": 3.144614866003708, + "grad_norm": 1.3797584416470503, + "learning_rate": 3.198193596983775e-06, + "loss": 0.1542, + "step": 9330 + }, + { + "epoch": 3.146300353952469, + "grad_norm": 1.2567222348584293, + "learning_rate": 3.1931030775544076e-06, + "loss": 0.1632, + "step": 9335 + }, + { + "epoch": 3.1479858419012303, + "grad_norm": 1.1402089659536931, + "learning_rate": 3.1880147114613533e-06, + "loss": 0.1538, + "step": 9340 + }, + { + "epoch": 3.1496713298499914, + "grad_norm": 1.3072830701919846, + "learning_rate": 3.1829285047685775e-06, + "loss": 0.1691, + "step": 9345 + }, + { + "epoch": 3.1513568177987525, + "grad_norm": 1.2698855552382342, + "learning_rate": 3.1778444635374745e-06, + "loss": 0.172, + "step": 9350 + }, + { + "epoch": 3.153042305747514, + "grad_norm": 1.260046757033209, + "learning_rate": 3.172762593826856e-06, + "loss": 0.1455, + "step": 9355 + }, + { + "epoch": 3.154727793696275, + "grad_norm": 1.4774034629075194, + "learning_rate": 3.167682901692948e-06, + "loss": 0.1837, + "step": 9360 + }, + { + "epoch": 3.1564132816450363, + "grad_norm": 1.2717571808725205, + "learning_rate": 3.162605393189381e-06, + "loss": 0.1829, + "step": 9365 + }, + { + "epoch": 3.1580987695937974, + "grad_norm": 1.2880709192467314, + "learning_rate": 3.1575300743671806e-06, + "loss": 0.1653, + "step": 9370 + }, + { + "epoch": 3.1597842575425585, + "grad_norm": 1.4437813142717841, + "learning_rate": 3.1524569512747683e-06, + "loss": 0.1807, + "step": 9375 + }, + { + "epoch": 3.1614697454913196, + "grad_norm": 1.3701463733957, + "learning_rate": 3.147386029957944e-06, + "loss": 0.159, + "step": 9380 + }, + { + "epoch": 3.1631552334400808, + "grad_norm": 1.324744375297103, + "learning_rate": 3.142317316459885e-06, + "loss": 0.1747, + "step": 9385 + }, + { + "epoch": 3.164840721388842, + "grad_norm": 1.3506080024868086, + "learning_rate": 3.137250816821139e-06, + "loss": 0.1632, + "step": 9390 + }, + { + "epoch": 3.1665262093376034, + "grad_norm": 1.3139701825343126, + "learning_rate": 3.1321865370796122e-06, + "loss": 0.1579, + "step": 9395 + }, + { + "epoch": 3.1682116972863645, + "grad_norm": 1.229312512917331, + "learning_rate": 3.12712448327057e-06, + "loss": 0.187, + "step": 9400 + }, + { + "epoch": 3.1698971852351256, + "grad_norm": 1.3010758922423349, + "learning_rate": 3.1220646614266193e-06, + "loss": 0.1694, + "step": 9405 + }, + { + "epoch": 3.1715826731838868, + "grad_norm": 1.1240272769230117, + "learning_rate": 3.11700707757771e-06, + "loss": 0.1795, + "step": 9410 + }, + { + "epoch": 3.173268161132648, + "grad_norm": 1.307436729770644, + "learning_rate": 3.111951737751128e-06, + "loss": 0.165, + "step": 9415 + }, + { + "epoch": 3.174953649081409, + "grad_norm": 1.428189994611883, + "learning_rate": 3.106898647971481e-06, + "loss": 0.1683, + "step": 9420 + }, + { + "epoch": 3.17663913703017, + "grad_norm": 1.1725458533946036, + "learning_rate": 3.1018478142606944e-06, + "loss": 0.1572, + "step": 9425 + }, + { + "epoch": 3.178324624978931, + "grad_norm": 1.2700921188793206, + "learning_rate": 3.096799242638009e-06, + "loss": 0.1579, + "step": 9430 + }, + { + "epoch": 3.1800101129276928, + "grad_norm": 1.2514185126726696, + "learning_rate": 3.091752939119966e-06, + "loss": 0.1718, + "step": 9435 + }, + { + "epoch": 3.181695600876454, + "grad_norm": 1.4999203561023051, + "learning_rate": 3.0867089097204062e-06, + "loss": 0.143, + "step": 9440 + }, + { + "epoch": 3.183381088825215, + "grad_norm": 1.1807717256720542, + "learning_rate": 3.0816671604504567e-06, + "loss": 0.1639, + "step": 9445 + }, + { + "epoch": 3.185066576773976, + "grad_norm": 1.2211558604253152, + "learning_rate": 3.0766276973185326e-06, + "loss": 0.1778, + "step": 9450 + }, + { + "epoch": 3.186752064722737, + "grad_norm": 2.377638369244552, + "learning_rate": 3.0715905263303226e-06, + "loss": 0.1607, + "step": 9455 + }, + { + "epoch": 3.1884375526714983, + "grad_norm": 1.2117524539834534, + "learning_rate": 3.0665556534887807e-06, + "loss": 0.1721, + "step": 9460 + }, + { + "epoch": 3.1901230406202594, + "grad_norm": 1.3975189495173967, + "learning_rate": 3.0615230847941244e-06, + "loss": 0.1712, + "step": 9465 + }, + { + "epoch": 3.1918085285690205, + "grad_norm": 1.8109062696988218, + "learning_rate": 3.0564928262438276e-06, + "loss": 0.1644, + "step": 9470 + }, + { + "epoch": 3.193494016517782, + "grad_norm": 1.209988593116305, + "learning_rate": 3.0514648838326056e-06, + "loss": 0.1683, + "step": 9475 + }, + { + "epoch": 3.195179504466543, + "grad_norm": 1.323539775735678, + "learning_rate": 3.046439263552419e-06, + "loss": 0.1491, + "step": 9480 + }, + { + "epoch": 3.1968649924153043, + "grad_norm": 1.2065094647522658, + "learning_rate": 3.041415971392457e-06, + "loss": 0.149, + "step": 9485 + }, + { + "epoch": 3.1985504803640654, + "grad_norm": 1.2611655717609178, + "learning_rate": 3.0363950133391375e-06, + "loss": 0.1513, + "step": 9490 + }, + { + "epoch": 3.2002359683128265, + "grad_norm": 1.4986209625618618, + "learning_rate": 3.0313763953760957e-06, + "loss": 0.1456, + "step": 9495 + }, + { + "epoch": 3.2019214562615876, + "grad_norm": 1.1304760296305825, + "learning_rate": 3.0263601234841757e-06, + "loss": 0.1571, + "step": 9500 + }, + { + "epoch": 3.2036069442103488, + "grad_norm": 1.5294498865070563, + "learning_rate": 3.0213462036414294e-06, + "loss": 0.1714, + "step": 9505 + }, + { + "epoch": 3.20529243215911, + "grad_norm": 1.353017583083657, + "learning_rate": 3.016334641823102e-06, + "loss": 0.1717, + "step": 9510 + }, + { + "epoch": 3.2069779201078714, + "grad_norm": 1.722681173934253, + "learning_rate": 3.0113254440016325e-06, + "loss": 0.1758, + "step": 9515 + }, + { + "epoch": 3.2086634080566325, + "grad_norm": 1.211881767224924, + "learning_rate": 3.0063186161466384e-06, + "loss": 0.1426, + "step": 9520 + }, + { + "epoch": 3.2103488960053936, + "grad_norm": 1.2367120052202052, + "learning_rate": 3.0013141642249183e-06, + "loss": 0.1735, + "step": 9525 + }, + { + "epoch": 3.2120343839541547, + "grad_norm": 1.2769389133752633, + "learning_rate": 2.996312094200434e-06, + "loss": 0.18, + "step": 9530 + }, + { + "epoch": 3.213719871902916, + "grad_norm": 1.3879869536741454, + "learning_rate": 2.991312412034312e-06, + "loss": 0.1462, + "step": 9535 + }, + { + "epoch": 3.215405359851677, + "grad_norm": 1.2140405718148446, + "learning_rate": 2.98631512368483e-06, + "loss": 0.1712, + "step": 9540 + }, + { + "epoch": 3.217090847800438, + "grad_norm": 1.3180230916425395, + "learning_rate": 2.9813202351074165e-06, + "loss": 0.1577, + "step": 9545 + }, + { + "epoch": 3.218776335749199, + "grad_norm": 1.3600076963285992, + "learning_rate": 2.9763277522546372e-06, + "loss": 0.1657, + "step": 9550 + }, + { + "epoch": 3.2204618236979607, + "grad_norm": 1.4671002267407103, + "learning_rate": 2.971337681076194e-06, + "loss": 0.1566, + "step": 9555 + }, + { + "epoch": 3.222147311646722, + "grad_norm": 1.3709612532883515, + "learning_rate": 2.9663500275189082e-06, + "loss": 0.1449, + "step": 9560 + }, + { + "epoch": 3.223832799595483, + "grad_norm": 1.4384780637244976, + "learning_rate": 2.9613647975267323e-06, + "loss": 0.1672, + "step": 9565 + }, + { + "epoch": 3.225518287544244, + "grad_norm": 1.2647140869055336, + "learning_rate": 2.956381997040717e-06, + "loss": 0.1497, + "step": 9570 + }, + { + "epoch": 3.227203775493005, + "grad_norm": 1.2698793575906924, + "learning_rate": 2.9514016319990257e-06, + "loss": 0.1652, + "step": 9575 + }, + { + "epoch": 3.2288892634417663, + "grad_norm": 1.3023893804465498, + "learning_rate": 2.9464237083369157e-06, + "loss": 0.1668, + "step": 9580 + }, + { + "epoch": 3.2305747513905274, + "grad_norm": 1.1616935191259887, + "learning_rate": 2.941448231986739e-06, + "loss": 0.1502, + "step": 9585 + }, + { + "epoch": 3.2322602393392885, + "grad_norm": 1.2302765723083426, + "learning_rate": 2.9364752088779247e-06, + "loss": 0.1651, + "step": 9590 + }, + { + "epoch": 3.23394572728805, + "grad_norm": 1.261595018073127, + "learning_rate": 2.9315046449369854e-06, + "loss": 0.1739, + "step": 9595 + }, + { + "epoch": 3.235631215236811, + "grad_norm": 1.248918860893241, + "learning_rate": 2.926536546087496e-06, + "loss": 0.1765, + "step": 9600 + }, + { + "epoch": 3.2373167031855723, + "grad_norm": 1.2379293176270614, + "learning_rate": 2.9215709182501007e-06, + "loss": 0.1413, + "step": 9605 + }, + { + "epoch": 3.2390021911343334, + "grad_norm": 1.0823834025280892, + "learning_rate": 2.916607767342494e-06, + "loss": 0.1586, + "step": 9610 + }, + { + "epoch": 3.2406876790830945, + "grad_norm": 1.4161088895383096, + "learning_rate": 2.9116470992794223e-06, + "loss": 0.1677, + "step": 9615 + }, + { + "epoch": 3.2423731670318556, + "grad_norm": 1.1029906470930286, + "learning_rate": 2.9066889199726685e-06, + "loss": 0.1497, + "step": 9620 + }, + { + "epoch": 3.2440586549806167, + "grad_norm": 1.67222310058739, + "learning_rate": 2.90173323533105e-06, + "loss": 0.1632, + "step": 9625 + }, + { + "epoch": 3.245744142929378, + "grad_norm": 1.266332865437193, + "learning_rate": 2.8967800512604183e-06, + "loss": 0.1538, + "step": 9630 + }, + { + "epoch": 3.2474296308781394, + "grad_norm": 1.2965984814533966, + "learning_rate": 2.8918293736636348e-06, + "loss": 0.1493, + "step": 9635 + }, + { + "epoch": 3.2491151188269005, + "grad_norm": 1.2623155830617245, + "learning_rate": 2.886881208440584e-06, + "loss": 0.1859, + "step": 9640 + }, + { + "epoch": 3.2508006067756616, + "grad_norm": 1.3169177658170732, + "learning_rate": 2.8819355614881477e-06, + "loss": 0.1636, + "step": 9645 + }, + { + "epoch": 3.2524860947244227, + "grad_norm": 1.3065877659962761, + "learning_rate": 2.876992438700209e-06, + "loss": 0.1771, + "step": 9650 + }, + { + "epoch": 3.254171582673184, + "grad_norm": 1.2879065086914747, + "learning_rate": 2.8720518459676476e-06, + "loss": 0.164, + "step": 9655 + }, + { + "epoch": 3.255857070621945, + "grad_norm": 1.104712711465004, + "learning_rate": 2.867113789178323e-06, + "loss": 0.162, + "step": 9660 + }, + { + "epoch": 3.257542558570706, + "grad_norm": 1.2515341755135794, + "learning_rate": 2.862178274217073e-06, + "loss": 0.1854, + "step": 9665 + }, + { + "epoch": 3.2592280465194676, + "grad_norm": 1.2025075722868321, + "learning_rate": 2.857245306965706e-06, + "loss": 0.1555, + "step": 9670 + }, + { + "epoch": 3.2609135344682287, + "grad_norm": 1.2352924301964021, + "learning_rate": 2.8523148933029963e-06, + "loss": 0.1615, + "step": 9675 + }, + { + "epoch": 3.26259902241699, + "grad_norm": 1.1225049407440537, + "learning_rate": 2.847387039104677e-06, + "loss": 0.168, + "step": 9680 + }, + { + "epoch": 3.264284510365751, + "grad_norm": 1.1843493670219003, + "learning_rate": 2.842461750243426e-06, + "loss": 0.1441, + "step": 9685 + }, + { + "epoch": 3.265969998314512, + "grad_norm": 1.4223983699398892, + "learning_rate": 2.837539032588864e-06, + "loss": 0.1533, + "step": 9690 + }, + { + "epoch": 3.267655486263273, + "grad_norm": 1.3668018963765802, + "learning_rate": 2.8326188920075535e-06, + "loss": 0.133, + "step": 9695 + }, + { + "epoch": 3.2693409742120343, + "grad_norm": 1.3732328773508795, + "learning_rate": 2.82770133436298e-06, + "loss": 0.152, + "step": 9700 + }, + { + "epoch": 3.2710264621607954, + "grad_norm": 1.3268811184973257, + "learning_rate": 2.822786365515552e-06, + "loss": 0.1628, + "step": 9705 + }, + { + "epoch": 3.2727119501095565, + "grad_norm": 1.0306016554222965, + "learning_rate": 2.817873991322593e-06, + "loss": 0.14, + "step": 9710 + }, + { + "epoch": 3.274397438058318, + "grad_norm": 1.3115593947766953, + "learning_rate": 2.812964217638336e-06, + "loss": 0.1605, + "step": 9715 + }, + { + "epoch": 3.276082926007079, + "grad_norm": 1.3130610428454002, + "learning_rate": 2.808057050313916e-06, + "loss": 0.1698, + "step": 9720 + }, + { + "epoch": 3.2777684139558403, + "grad_norm": 1.4356301555549609, + "learning_rate": 2.8031524951973577e-06, + "loss": 0.1362, + "step": 9725 + }, + { + "epoch": 3.2794539019046014, + "grad_norm": 1.3386960730337292, + "learning_rate": 2.798250558133574e-06, + "loss": 0.1517, + "step": 9730 + }, + { + "epoch": 3.2811393898533625, + "grad_norm": 1.4488180024774386, + "learning_rate": 2.7933512449643605e-06, + "loss": 0.1728, + "step": 9735 + }, + { + "epoch": 3.2828248778021236, + "grad_norm": 1.359578650451188, + "learning_rate": 2.7884545615283837e-06, + "loss": 0.1537, + "step": 9740 + }, + { + "epoch": 3.2845103657508847, + "grad_norm": 1.1897590570490932, + "learning_rate": 2.7835605136611754e-06, + "loss": 0.1435, + "step": 9745 + }, + { + "epoch": 3.2861958536996463, + "grad_norm": 1.2038117092508442, + "learning_rate": 2.778669107195126e-06, + "loss": 0.163, + "step": 9750 + }, + { + "epoch": 3.2878813416484074, + "grad_norm": 1.2761016367040738, + "learning_rate": 2.7737803479594816e-06, + "loss": 0.1829, + "step": 9755 + }, + { + "epoch": 3.2895668295971685, + "grad_norm": 1.283230762590572, + "learning_rate": 2.7688942417803334e-06, + "loss": 0.1551, + "step": 9760 + }, + { + "epoch": 3.2912523175459296, + "grad_norm": 1.159281231366604, + "learning_rate": 2.764010794480606e-06, + "loss": 0.1644, + "step": 9765 + }, + { + "epoch": 3.2929378054946907, + "grad_norm": 1.2045208633888451, + "learning_rate": 2.759130011880058e-06, + "loss": 0.135, + "step": 9770 + }, + { + "epoch": 3.294623293443452, + "grad_norm": 1.3690116658480913, + "learning_rate": 2.7542518997952756e-06, + "loss": 0.1499, + "step": 9775 + }, + { + "epoch": 3.296308781392213, + "grad_norm": 1.234692808086299, + "learning_rate": 2.7493764640396577e-06, + "loss": 0.1682, + "step": 9780 + }, + { + "epoch": 3.297994269340974, + "grad_norm": 1.4082026875072793, + "learning_rate": 2.744503710423413e-06, + "loss": 0.1646, + "step": 9785 + }, + { + "epoch": 3.299679757289735, + "grad_norm": 1.4087003153021567, + "learning_rate": 2.7396336447535617e-06, + "loss": 0.1659, + "step": 9790 + }, + { + "epoch": 3.3013652452384967, + "grad_norm": 1.3565180469630933, + "learning_rate": 2.7347662728339095e-06, + "loss": 0.1717, + "step": 9795 + }, + { + "epoch": 3.303050733187258, + "grad_norm": 1.1499016521512857, + "learning_rate": 2.729901600465064e-06, + "loss": 0.1752, + "step": 9800 + }, + { + "epoch": 3.304736221136019, + "grad_norm": 1.274611088950639, + "learning_rate": 2.7250396334444063e-06, + "loss": 0.1732, + "step": 9805 + }, + { + "epoch": 3.30642170908478, + "grad_norm": 2.2597812181213213, + "learning_rate": 2.7201803775660955e-06, + "loss": 0.1686, + "step": 9810 + }, + { + "epoch": 3.308107197033541, + "grad_norm": 1.2555303991856703, + "learning_rate": 2.71532383862106e-06, + "loss": 0.1699, + "step": 9815 + }, + { + "epoch": 3.3097926849823023, + "grad_norm": 1.2720036046307266, + "learning_rate": 2.710470022396996e-06, + "loss": 0.1668, + "step": 9820 + }, + { + "epoch": 3.3114781729310634, + "grad_norm": 1.2771764919302078, + "learning_rate": 2.7056189346783436e-06, + "loss": 0.1376, + "step": 9825 + }, + { + "epoch": 3.313163660879825, + "grad_norm": 1.2195796633785456, + "learning_rate": 2.7007705812463036e-06, + "loss": 0.1495, + "step": 9830 + }, + { + "epoch": 3.314849148828586, + "grad_norm": 1.3821972998294634, + "learning_rate": 2.695924967878808e-06, + "loss": 0.156, + "step": 9835 + }, + { + "epoch": 3.316534636777347, + "grad_norm": 1.1986652251422123, + "learning_rate": 2.6910821003505317e-06, + "loss": 0.173, + "step": 9840 + }, + { + "epoch": 3.3182201247261083, + "grad_norm": 1.2925895879187035, + "learning_rate": 2.686241984432871e-06, + "loss": 0.1576, + "step": 9845 + }, + { + "epoch": 3.3199056126748694, + "grad_norm": 1.2402445420674666, + "learning_rate": 2.6814046258939463e-06, + "loss": 0.1825, + "step": 9850 + }, + { + "epoch": 3.3215911006236305, + "grad_norm": 1.3769551053540399, + "learning_rate": 2.6765700304985876e-06, + "loss": 0.1636, + "step": 9855 + }, + { + "epoch": 3.3232765885723916, + "grad_norm": 1.345576550638648, + "learning_rate": 2.6717382040083393e-06, + "loss": 0.1552, + "step": 9860 + }, + { + "epoch": 3.3249620765211527, + "grad_norm": 1.2241688048131634, + "learning_rate": 2.666909152181443e-06, + "loss": 0.1401, + "step": 9865 + }, + { + "epoch": 3.326647564469914, + "grad_norm": 1.2490989855449084, + "learning_rate": 2.6620828807728304e-06, + "loss": 0.172, + "step": 9870 + }, + { + "epoch": 3.3283330524186754, + "grad_norm": 1.2430478967631053, + "learning_rate": 2.65725939553412e-06, + "loss": 0.1534, + "step": 9875 + }, + { + "epoch": 3.3300185403674365, + "grad_norm": 1.140887083180735, + "learning_rate": 2.6524387022136177e-06, + "loss": 0.1398, + "step": 9880 + }, + { + "epoch": 3.3317040283161976, + "grad_norm": 1.314994612439792, + "learning_rate": 2.6476208065562924e-06, + "loss": 0.1635, + "step": 9885 + }, + { + "epoch": 3.3333895162649587, + "grad_norm": 1.114088503690688, + "learning_rate": 2.6428057143037842e-06, + "loss": 0.1582, + "step": 9890 + }, + { + "epoch": 3.33507500421372, + "grad_norm": 1.332360206492176, + "learning_rate": 2.637993431194389e-06, + "loss": 0.1445, + "step": 9895 + }, + { + "epoch": 3.336760492162481, + "grad_norm": 1.3748852428537113, + "learning_rate": 2.6331839629630584e-06, + "loss": 0.1485, + "step": 9900 + }, + { + "epoch": 3.338445980111242, + "grad_norm": 1.0516598879651196, + "learning_rate": 2.6283773153413912e-06, + "loss": 0.1748, + "step": 9905 + }, + { + "epoch": 3.3401314680600036, + "grad_norm": 1.3013265817650992, + "learning_rate": 2.6235734940576185e-06, + "loss": 0.1648, + "step": 9910 + }, + { + "epoch": 3.3418169560087647, + "grad_norm": 1.1930122703681216, + "learning_rate": 2.6187725048366064e-06, + "loss": 0.152, + "step": 9915 + }, + { + "epoch": 3.343502443957526, + "grad_norm": 1.016190066299787, + "learning_rate": 2.613974353399845e-06, + "loss": 0.1178, + "step": 9920 + }, + { + "epoch": 3.345187931906287, + "grad_norm": 1.343488871051259, + "learning_rate": 2.6091790454654463e-06, + "loss": 0.1697, + "step": 9925 + }, + { + "epoch": 3.346873419855048, + "grad_norm": 1.2718864711303226, + "learning_rate": 2.604386586748129e-06, + "loss": 0.1578, + "step": 9930 + }, + { + "epoch": 3.348558907803809, + "grad_norm": 1.3144675193353847, + "learning_rate": 2.599596982959216e-06, + "loss": 0.1483, + "step": 9935 + }, + { + "epoch": 3.3502443957525703, + "grad_norm": 1.2868730060576092, + "learning_rate": 2.594810239806632e-06, + "loss": 0.1495, + "step": 9940 + }, + { + "epoch": 3.3519298837013314, + "grad_norm": 1.4535583194642787, + "learning_rate": 2.5900263629948926e-06, + "loss": 0.1645, + "step": 9945 + }, + { + "epoch": 3.3536153716500925, + "grad_norm": 1.3060008542485482, + "learning_rate": 2.5852453582250925e-06, + "loss": 0.1513, + "step": 9950 + }, + { + "epoch": 3.355300859598854, + "grad_norm": 1.210308369586654, + "learning_rate": 2.5804672311949073e-06, + "loss": 0.1612, + "step": 9955 + }, + { + "epoch": 3.356986347547615, + "grad_norm": 1.2239350750221103, + "learning_rate": 2.5756919875985813e-06, + "loss": 0.1463, + "step": 9960 + }, + { + "epoch": 3.3586718354963763, + "grad_norm": 1.2831887242887268, + "learning_rate": 2.570919633126926e-06, + "loss": 0.1721, + "step": 9965 + }, + { + "epoch": 3.3603573234451374, + "grad_norm": 1.1974341824227415, + "learning_rate": 2.566150173467306e-06, + "loss": 0.1463, + "step": 9970 + }, + { + "epoch": 3.3620428113938985, + "grad_norm": 1.7198354586353592, + "learning_rate": 2.5613836143036357e-06, + "loss": 0.14, + "step": 9975 + }, + { + "epoch": 3.3637282993426596, + "grad_norm": 1.2042408241398865, + "learning_rate": 2.5566199613163766e-06, + "loss": 0.165, + "step": 9980 + }, + { + "epoch": 3.3654137872914207, + "grad_norm": 1.2293708423374898, + "learning_rate": 2.5518592201825267e-06, + "loss": 0.1439, + "step": 9985 + }, + { + "epoch": 3.3670992752401823, + "grad_norm": 1.4356486672717312, + "learning_rate": 2.54710139657561e-06, + "loss": 0.1882, + "step": 9990 + }, + { + "epoch": 3.3687847631889434, + "grad_norm": 1.2734891709414193, + "learning_rate": 2.5423464961656753e-06, + "loss": 0.1613, + "step": 9995 + }, + { + "epoch": 3.3704702511377045, + "grad_norm": 1.1945074047496578, + "learning_rate": 2.5375945246192866e-06, + "loss": 0.1344, + "step": 10000 + }, + { + "epoch": 3.3721557390864656, + "grad_norm": 1.213512918584927, + "learning_rate": 2.5328454875995236e-06, + "loss": 0.1203, + "step": 10005 + }, + { + "epoch": 3.3738412270352267, + "grad_norm": 1.2884866166581566, + "learning_rate": 2.5280993907659597e-06, + "loss": 0.1506, + "step": 10010 + }, + { + "epoch": 3.375526714983988, + "grad_norm": 1.1274743652728205, + "learning_rate": 2.523356239774672e-06, + "loss": 0.1457, + "step": 10015 + }, + { + "epoch": 3.377212202932749, + "grad_norm": 1.447649015328168, + "learning_rate": 2.5186160402782224e-06, + "loss": 0.177, + "step": 10020 + }, + { + "epoch": 3.37889769088151, + "grad_norm": 3.5867383838328206, + "learning_rate": 2.5138787979256552e-06, + "loss": 0.1532, + "step": 10025 + }, + { + "epoch": 3.380583178830271, + "grad_norm": 4.75720701305717, + "learning_rate": 2.5091445183624955e-06, + "loss": 0.1286, + "step": 10030 + }, + { + "epoch": 3.3822686667790327, + "grad_norm": 1.3099119747432584, + "learning_rate": 2.5044132072307337e-06, + "loss": 0.1586, + "step": 10035 + }, + { + "epoch": 3.383954154727794, + "grad_norm": 1.4026656590916609, + "learning_rate": 2.499684870168819e-06, + "loss": 0.1493, + "step": 10040 + }, + { + "epoch": 3.385639642676555, + "grad_norm": 1.2735465304947546, + "learning_rate": 2.494959512811666e-06, + "loss": 0.1634, + "step": 10045 + }, + { + "epoch": 3.387325130625316, + "grad_norm": 1.2405189512566897, + "learning_rate": 2.4902371407906285e-06, + "loss": 0.1515, + "step": 10050 + }, + { + "epoch": 3.389010618574077, + "grad_norm": 1.3739165984313435, + "learning_rate": 2.4855177597335105e-06, + "loss": 0.1533, + "step": 10055 + }, + { + "epoch": 3.3906961065228383, + "grad_norm": 1.379104118916509, + "learning_rate": 2.4808013752645466e-06, + "loss": 0.1681, + "step": 10060 + }, + { + "epoch": 3.3923815944715994, + "grad_norm": 1.3603912344666569, + "learning_rate": 2.476087993004399e-06, + "loss": 0.1679, + "step": 10065 + }, + { + "epoch": 3.394067082420361, + "grad_norm": 1.4006661475584379, + "learning_rate": 2.4713776185701587e-06, + "loss": 0.1472, + "step": 10070 + }, + { + "epoch": 3.395752570369122, + "grad_norm": 1.2184416515683658, + "learning_rate": 2.4666702575753264e-06, + "loss": 0.1696, + "step": 10075 + }, + { + "epoch": 3.397438058317883, + "grad_norm": 1.182412536303197, + "learning_rate": 2.461965915629813e-06, + "loss": 0.1446, + "step": 10080 + }, + { + "epoch": 3.3991235462666443, + "grad_norm": 1.1729750586108794, + "learning_rate": 2.457264598339929e-06, + "loss": 0.1481, + "step": 10085 + }, + { + "epoch": 3.4008090342154054, + "grad_norm": 1.1136834210810826, + "learning_rate": 2.4525663113083898e-06, + "loss": 0.1415, + "step": 10090 + }, + { + "epoch": 3.4024945221641665, + "grad_norm": 1.4430532932826554, + "learning_rate": 2.4478710601342894e-06, + "loss": 0.1565, + "step": 10095 + }, + { + "epoch": 3.4041800101129276, + "grad_norm": 1.2267652059194414, + "learning_rate": 2.443178850413107e-06, + "loss": 0.1463, + "step": 10100 + }, + { + "epoch": 3.4058654980616887, + "grad_norm": 1.4949942070434277, + "learning_rate": 2.4384896877366963e-06, + "loss": 0.1544, + "step": 10105 + }, + { + "epoch": 3.40755098601045, + "grad_norm": 1.2604223328760327, + "learning_rate": 2.433803577693285e-06, + "loss": 0.1541, + "step": 10110 + }, + { + "epoch": 3.4092364739592114, + "grad_norm": 1.3060010613190922, + "learning_rate": 2.429120525867456e-06, + "loss": 0.1483, + "step": 10115 + }, + { + "epoch": 3.4109219619079725, + "grad_norm": 1.2146774291782885, + "learning_rate": 2.424440537840152e-06, + "loss": 0.1842, + "step": 10120 + }, + { + "epoch": 3.4126074498567336, + "grad_norm": 1.2621944558371296, + "learning_rate": 2.4197636191886596e-06, + "loss": 0.1313, + "step": 10125 + }, + { + "epoch": 3.4142929378054947, + "grad_norm": 1.4618231091191864, + "learning_rate": 2.415089775486614e-06, + "loss": 0.1549, + "step": 10130 + }, + { + "epoch": 3.415978425754256, + "grad_norm": 1.2494958728003562, + "learning_rate": 2.4104190123039834e-06, + "loss": 0.1504, + "step": 10135 + }, + { + "epoch": 3.417663913703017, + "grad_norm": 1.0466219441983735, + "learning_rate": 2.4057513352070636e-06, + "loss": 0.1552, + "step": 10140 + }, + { + "epoch": 3.419349401651778, + "grad_norm": 1.181381181227899, + "learning_rate": 2.4010867497584717e-06, + "loss": 0.1361, + "step": 10145 + }, + { + "epoch": 3.4210348896005396, + "grad_norm": 1.4330681182373313, + "learning_rate": 2.396425261517144e-06, + "loss": 0.1577, + "step": 10150 + }, + { + "epoch": 3.4227203775493007, + "grad_norm": 1.3772972060192594, + "learning_rate": 2.3917668760383234e-06, + "loss": 0.1639, + "step": 10155 + }, + { + "epoch": 3.424405865498062, + "grad_norm": 1.2095551479791498, + "learning_rate": 2.3871115988735535e-06, + "loss": 0.1427, + "step": 10160 + }, + { + "epoch": 3.426091353446823, + "grad_norm": 1.2021884887059764, + "learning_rate": 2.3824594355706783e-06, + "loss": 0.1538, + "step": 10165 + }, + { + "epoch": 3.427776841395584, + "grad_norm": 1.1313169172852517, + "learning_rate": 2.3778103916738253e-06, + "loss": 0.1441, + "step": 10170 + }, + { + "epoch": 3.429462329344345, + "grad_norm": 1.2177024573507838, + "learning_rate": 2.373164472723411e-06, + "loss": 0.1548, + "step": 10175 + }, + { + "epoch": 3.4311478172931063, + "grad_norm": 1.246213663716035, + "learning_rate": 2.368521684256122e-06, + "loss": 0.1497, + "step": 10180 + }, + { + "epoch": 3.4328333052418674, + "grad_norm": 1.2783660552054863, + "learning_rate": 2.3638820318049147e-06, + "loss": 0.1369, + "step": 10185 + }, + { + "epoch": 3.4345187931906285, + "grad_norm": 1.3268102245928621, + "learning_rate": 2.3592455208990147e-06, + "loss": 0.1508, + "step": 10190 + }, + { + "epoch": 3.43620428113939, + "grad_norm": 1.3666486878269484, + "learning_rate": 2.3546121570638953e-06, + "loss": 0.1464, + "step": 10195 + }, + { + "epoch": 3.437889769088151, + "grad_norm": 1.2657649808599138, + "learning_rate": 2.3499819458212823e-06, + "loss": 0.1485, + "step": 10200 + }, + { + "epoch": 3.4395752570369122, + "grad_norm": 1.3242067408581548, + "learning_rate": 2.345354892689149e-06, + "loss": 0.1532, + "step": 10205 + }, + { + "epoch": 3.4412607449856734, + "grad_norm": 1.3114272086342003, + "learning_rate": 2.3407310031816964e-06, + "loss": 0.1537, + "step": 10210 + }, + { + "epoch": 3.4429462329344345, + "grad_norm": 1.1146493999973337, + "learning_rate": 2.3361102828093647e-06, + "loss": 0.1409, + "step": 10215 + }, + { + "epoch": 3.4446317208831956, + "grad_norm": 1.3527977652436147, + "learning_rate": 2.3314927370788108e-06, + "loss": 0.1477, + "step": 10220 + }, + { + "epoch": 3.4463172088319567, + "grad_norm": 1.3906779215680816, + "learning_rate": 2.3268783714929098e-06, + "loss": 0.1306, + "step": 10225 + }, + { + "epoch": 3.4480026967807182, + "grad_norm": 1.2281823355192947, + "learning_rate": 2.3222671915507466e-06, + "loss": 0.158, + "step": 10230 + }, + { + "epoch": 3.4496881847294794, + "grad_norm": 1.1300870010146995, + "learning_rate": 2.3176592027476115e-06, + "loss": 0.1212, + "step": 10235 + }, + { + "epoch": 3.4513736726782405, + "grad_norm": 1.4417167756613363, + "learning_rate": 2.3130544105749917e-06, + "loss": 0.158, + "step": 10240 + }, + { + "epoch": 3.4530591606270016, + "grad_norm": 1.2018000207545478, + "learning_rate": 2.3084528205205644e-06, + "loss": 0.1592, + "step": 10245 + }, + { + "epoch": 3.4547446485757627, + "grad_norm": 1.2891547151774303, + "learning_rate": 2.303854438068186e-06, + "loss": 0.116, + "step": 10250 + }, + { + "epoch": 3.456430136524524, + "grad_norm": 1.2953658726878892, + "learning_rate": 2.2992592686979e-06, + "loss": 0.1559, + "step": 10255 + }, + { + "epoch": 3.458115624473285, + "grad_norm": 1.2621481779059829, + "learning_rate": 2.294667317885912e-06, + "loss": 0.1526, + "step": 10260 + }, + { + "epoch": 3.459801112422046, + "grad_norm": 1.2171418295041299, + "learning_rate": 2.290078591104597e-06, + "loss": 0.1425, + "step": 10265 + }, + { + "epoch": 3.461486600370807, + "grad_norm": 1.2903836928936998, + "learning_rate": 2.2854930938224828e-06, + "loss": 0.1605, + "step": 10270 + }, + { + "epoch": 3.4631720883195687, + "grad_norm": 1.3229012029215648, + "learning_rate": 2.2809108315042544e-06, + "loss": 0.1753, + "step": 10275 + }, + { + "epoch": 3.46485757626833, + "grad_norm": 1.2216745043943342, + "learning_rate": 2.27633180961074e-06, + "loss": 0.1315, + "step": 10280 + }, + { + "epoch": 3.466543064217091, + "grad_norm": 1.352358245982348, + "learning_rate": 2.271756033598905e-06, + "loss": 0.134, + "step": 10285 + }, + { + "epoch": 3.468228552165852, + "grad_norm": 1.2957046008732787, + "learning_rate": 2.2671835089218424e-06, + "loss": 0.1504, + "step": 10290 + }, + { + "epoch": 3.469914040114613, + "grad_norm": 1.2815829853788925, + "learning_rate": 2.2626142410287805e-06, + "loss": 0.1411, + "step": 10295 + }, + { + "epoch": 3.4715995280633742, + "grad_norm": 1.6271233142109944, + "learning_rate": 2.258048235365057e-06, + "loss": 0.1449, + "step": 10300 + }, + { + "epoch": 3.4732850160121354, + "grad_norm": 1.2974471537100711, + "learning_rate": 2.2534854973721277e-06, + "loss": 0.1723, + "step": 10305 + }, + { + "epoch": 3.474970503960897, + "grad_norm": 1.5151258091900692, + "learning_rate": 2.2489260324875485e-06, + "loss": 0.1473, + "step": 10310 + }, + { + "epoch": 3.476655991909658, + "grad_norm": 1.4934880449965409, + "learning_rate": 2.2443698461449804e-06, + "loss": 0.1642, + "step": 10315 + }, + { + "epoch": 3.478341479858419, + "grad_norm": 1.3723996901532494, + "learning_rate": 2.239816943774176e-06, + "loss": 0.158, + "step": 10320 + }, + { + "epoch": 3.4800269678071802, + "grad_norm": 1.443019268367747, + "learning_rate": 2.2352673308009737e-06, + "loss": 0.1542, + "step": 10325 + }, + { + "epoch": 3.4817124557559413, + "grad_norm": 1.1284300395864866, + "learning_rate": 2.2307210126472895e-06, + "loss": 0.1414, + "step": 10330 + }, + { + "epoch": 3.4833979437047025, + "grad_norm": 1.23855888555975, + "learning_rate": 2.2261779947311135e-06, + "loss": 0.1421, + "step": 10335 + }, + { + "epoch": 3.4850834316534636, + "grad_norm": 1.4084019002786303, + "learning_rate": 2.221638282466508e-06, + "loss": 0.1455, + "step": 10340 + }, + { + "epoch": 3.4867689196022247, + "grad_norm": 1.3362795161936447, + "learning_rate": 2.2171018812635897e-06, + "loss": 0.1381, + "step": 10345 + }, + { + "epoch": 3.488454407550986, + "grad_norm": 1.259134734511089, + "learning_rate": 2.2125687965285304e-06, + "loss": 0.1489, + "step": 10350 + }, + { + "epoch": 3.4901398954997473, + "grad_norm": 1.2212844823353102, + "learning_rate": 2.2080390336635515e-06, + "loss": 0.131, + "step": 10355 + }, + { + "epoch": 3.4918253834485085, + "grad_norm": 1.3002947953988913, + "learning_rate": 2.203512598066918e-06, + "loss": 0.1261, + "step": 10360 + }, + { + "epoch": 3.4935108713972696, + "grad_norm": 1.3574551635687806, + "learning_rate": 2.198989495132925e-06, + "loss": 0.1444, + "step": 10365 + }, + { + "epoch": 3.4951963593460307, + "grad_norm": 1.4212234938341588, + "learning_rate": 2.1944697302518957e-06, + "loss": 0.1332, + "step": 10370 + }, + { + "epoch": 3.496881847294792, + "grad_norm": 1.363582357147013, + "learning_rate": 2.189953308810177e-06, + "loss": 0.1432, + "step": 10375 + }, + { + "epoch": 3.498567335243553, + "grad_norm": 1.8118963505278043, + "learning_rate": 2.1854402361901345e-06, + "loss": 0.1459, + "step": 10380 + }, + { + "epoch": 3.500252823192314, + "grad_norm": 6.001511141669761, + "learning_rate": 2.180930517770136e-06, + "loss": 0.13, + "step": 10385 + }, + { + "epoch": 3.5019383111410756, + "grad_norm": 1.1529911935264896, + "learning_rate": 2.1764241589245604e-06, + "loss": 0.1297, + "step": 10390 + }, + { + "epoch": 3.5036237990898362, + "grad_norm": 1.3124330659110302, + "learning_rate": 2.1719211650237736e-06, + "loss": 0.1399, + "step": 10395 + }, + { + "epoch": 3.505309287038598, + "grad_norm": 1.3060778220339841, + "learning_rate": 2.16742154143414e-06, + "loss": 0.1549, + "step": 10400 + }, + { + "epoch": 3.506994774987359, + "grad_norm": 1.306653702069654, + "learning_rate": 2.1629252935180015e-06, + "loss": 0.1331, + "step": 10405 + }, + { + "epoch": 3.50868026293612, + "grad_norm": 1.4771043963795132, + "learning_rate": 2.1584324266336802e-06, + "loss": 0.1522, + "step": 10410 + }, + { + "epoch": 3.510365750884881, + "grad_norm": 1.3364185791466066, + "learning_rate": 2.1539429461354655e-06, + "loss": 0.1523, + "step": 10415 + }, + { + "epoch": 3.5120512388336422, + "grad_norm": 1.192134034575673, + "learning_rate": 2.149456857373617e-06, + "loss": 0.1224, + "step": 10420 + }, + { + "epoch": 3.5137367267824033, + "grad_norm": 1.3478448597892143, + "learning_rate": 2.144974165694345e-06, + "loss": 0.16, + "step": 10425 + }, + { + "epoch": 3.5154222147311645, + "grad_norm": 1.1420976982959112, + "learning_rate": 2.14049487643982e-06, + "loss": 0.1218, + "step": 10430 + }, + { + "epoch": 3.517107702679926, + "grad_norm": 1.2690054975999607, + "learning_rate": 2.1360189949481497e-06, + "loss": 0.1454, + "step": 10435 + }, + { + "epoch": 3.518793190628687, + "grad_norm": 1.2762089343715004, + "learning_rate": 2.131546526553383e-06, + "loss": 0.1488, + "step": 10440 + }, + { + "epoch": 3.5204786785774482, + "grad_norm": 1.2812913533625703, + "learning_rate": 2.127077476585505e-06, + "loss": 0.141, + "step": 10445 + }, + { + "epoch": 3.5221641665262093, + "grad_norm": 1.2349437337118987, + "learning_rate": 2.122611850370423e-06, + "loss": 0.1486, + "step": 10450 + }, + { + "epoch": 3.5238496544749704, + "grad_norm": 2.0885116018721095, + "learning_rate": 2.118149653229963e-06, + "loss": 0.1349, + "step": 10455 + }, + { + "epoch": 3.5255351424237316, + "grad_norm": 1.1195939342097039, + "learning_rate": 2.11369089048187e-06, + "loss": 0.1481, + "step": 10460 + }, + { + "epoch": 3.5272206303724927, + "grad_norm": 1.3557711996658708, + "learning_rate": 2.109235567439792e-06, + "loss": 0.1385, + "step": 10465 + }, + { + "epoch": 3.5289061183212542, + "grad_norm": 1.1764486853350906, + "learning_rate": 2.104783689413279e-06, + "loss": 0.1315, + "step": 10470 + }, + { + "epoch": 3.530591606270015, + "grad_norm": 1.1336262689502794, + "learning_rate": 2.100335261707774e-06, + "loss": 0.1493, + "step": 10475 + }, + { + "epoch": 3.5322770942187764, + "grad_norm": 1.3801602380775975, + "learning_rate": 2.095890289624608e-06, + "loss": 0.1799, + "step": 10480 + }, + { + "epoch": 3.5339625821675376, + "grad_norm": 1.3471750010356411, + "learning_rate": 2.0914487784609982e-06, + "loss": 0.1467, + "step": 10485 + }, + { + "epoch": 3.5356480701162987, + "grad_norm": 1.4112952717155867, + "learning_rate": 2.0870107335100324e-06, + "loss": 0.1459, + "step": 10490 + }, + { + "epoch": 3.53733355806506, + "grad_norm": 1.71665705731009, + "learning_rate": 2.082576160060669e-06, + "loss": 0.138, + "step": 10495 + }, + { + "epoch": 3.539019046013821, + "grad_norm": 1.4191379470057897, + "learning_rate": 2.078145063397729e-06, + "loss": 0.1399, + "step": 10500 + }, + { + "epoch": 3.540704533962582, + "grad_norm": 1.2610800597148517, + "learning_rate": 2.073717448801894e-06, + "loss": 0.1406, + "step": 10505 + }, + { + "epoch": 3.542390021911343, + "grad_norm": 1.2012880101008747, + "learning_rate": 2.0692933215496906e-06, + "loss": 0.151, + "step": 10510 + }, + { + "epoch": 3.5440755098601047, + "grad_norm": 1.4278325658758595, + "learning_rate": 2.064872686913492e-06, + "loss": 0.166, + "step": 10515 + }, + { + "epoch": 3.5457609978088658, + "grad_norm": 1.472644933927288, + "learning_rate": 2.060455550161506e-06, + "loss": 0.1395, + "step": 10520 + }, + { + "epoch": 3.547446485757627, + "grad_norm": 2.188490090397927, + "learning_rate": 2.056041916557778e-06, + "loss": 0.1343, + "step": 10525 + }, + { + "epoch": 3.549131973706388, + "grad_norm": 1.3879426456272843, + "learning_rate": 2.0516317913621724e-06, + "loss": 0.1442, + "step": 10530 + }, + { + "epoch": 3.550817461655149, + "grad_norm": 1.3428771458764217, + "learning_rate": 2.0472251798303757e-06, + "loss": 0.1447, + "step": 10535 + }, + { + "epoch": 3.55250294960391, + "grad_norm": 1.3932142671509251, + "learning_rate": 2.042822087213886e-06, + "loss": 0.1618, + "step": 10540 + }, + { + "epoch": 3.5541884375526713, + "grad_norm": 1.3509593971795395, + "learning_rate": 2.0384225187600113e-06, + "loss": 0.1533, + "step": 10545 + }, + { + "epoch": 3.555873925501433, + "grad_norm": 1.556290992445429, + "learning_rate": 2.034026479711855e-06, + "loss": 0.1737, + "step": 10550 + }, + { + "epoch": 3.557559413450194, + "grad_norm": 1.2644217179475776, + "learning_rate": 2.029633975308315e-06, + "loss": 0.1376, + "step": 10555 + }, + { + "epoch": 3.559244901398955, + "grad_norm": 1.2423782672725796, + "learning_rate": 2.0252450107840765e-06, + "loss": 0.1418, + "step": 10560 + }, + { + "epoch": 3.560930389347716, + "grad_norm": 1.4317162751893622, + "learning_rate": 2.020859591369612e-06, + "loss": 0.1459, + "step": 10565 + }, + { + "epoch": 3.5626158772964773, + "grad_norm": 1.3862237555473154, + "learning_rate": 2.016477722291163e-06, + "loss": 0.1336, + "step": 10570 + }, + { + "epoch": 3.5643013652452384, + "grad_norm": 1.4254922326020505, + "learning_rate": 2.012099408770739e-06, + "loss": 0.1491, + "step": 10575 + }, + { + "epoch": 3.5659868531939996, + "grad_norm": 1.4712500075169244, + "learning_rate": 2.0077246560261186e-06, + "loss": 0.134, + "step": 10580 + }, + { + "epoch": 3.567672341142761, + "grad_norm": 1.4780926112680588, + "learning_rate": 2.0033534692708306e-06, + "loss": 0.149, + "step": 10585 + }, + { + "epoch": 3.5693578290915218, + "grad_norm": 1.1906661357195025, + "learning_rate": 1.99898585371416e-06, + "loss": 0.1505, + "step": 10590 + }, + { + "epoch": 3.5710433170402833, + "grad_norm": 1.2955525444480056, + "learning_rate": 1.9946218145611298e-06, + "loss": 0.13, + "step": 10595 + }, + { + "epoch": 3.5727288049890444, + "grad_norm": 1.4304391663027463, + "learning_rate": 1.9902613570125028e-06, + "loss": 0.1318, + "step": 10600 + }, + { + "epoch": 3.5744142929378055, + "grad_norm": 1.3314494150846279, + "learning_rate": 1.9859044862647786e-06, + "loss": 0.1368, + "step": 10605 + }, + { + "epoch": 3.5760997808865667, + "grad_norm": 1.1985082105430942, + "learning_rate": 1.9815512075101734e-06, + "loss": 0.1338, + "step": 10610 + }, + { + "epoch": 3.5777852688353278, + "grad_norm": 1.246983396775462, + "learning_rate": 1.977201525936632e-06, + "loss": 0.1346, + "step": 10615 + }, + { + "epoch": 3.579470756784089, + "grad_norm": 1.274464458650494, + "learning_rate": 1.9728554467278043e-06, + "loss": 0.1539, + "step": 10620 + }, + { + "epoch": 3.58115624473285, + "grad_norm": 1.2443256218719754, + "learning_rate": 1.9685129750630506e-06, + "loss": 0.1455, + "step": 10625 + }, + { + "epoch": 3.5828417326816115, + "grad_norm": 1.356276070081595, + "learning_rate": 1.964174116117435e-06, + "loss": 0.1606, + "step": 10630 + }, + { + "epoch": 3.5845272206303727, + "grad_norm": 1.5256136219456349, + "learning_rate": 1.959838875061711e-06, + "loss": 0.1521, + "step": 10635 + }, + { + "epoch": 3.5862127085791338, + "grad_norm": 1.3573621305917005, + "learning_rate": 1.955507257062323e-06, + "loss": 0.1355, + "step": 10640 + }, + { + "epoch": 3.587898196527895, + "grad_norm": 1.2779022604044483, + "learning_rate": 1.9511792672813957e-06, + "loss": 0.1393, + "step": 10645 + }, + { + "epoch": 3.589583684476656, + "grad_norm": 1.3243206119262712, + "learning_rate": 1.946854910876734e-06, + "loss": 0.1328, + "step": 10650 + }, + { + "epoch": 3.591269172425417, + "grad_norm": 1.4054265088833848, + "learning_rate": 1.9425341930018104e-06, + "loss": 0.1392, + "step": 10655 + }, + { + "epoch": 3.592954660374178, + "grad_norm": 1.2994062236654595, + "learning_rate": 1.9382171188057612e-06, + "loss": 0.1455, + "step": 10660 + }, + { + "epoch": 3.5946401483229398, + "grad_norm": 1.3019479247019692, + "learning_rate": 1.9339036934333785e-06, + "loss": 0.1428, + "step": 10665 + }, + { + "epoch": 3.5963256362717004, + "grad_norm": 1.2386064454120294, + "learning_rate": 1.92959392202511e-06, + "loss": 0.14, + "step": 10670 + }, + { + "epoch": 3.598011124220462, + "grad_norm": 2.3205460412573427, + "learning_rate": 1.925287809717048e-06, + "loss": 0.1464, + "step": 10675 + }, + { + "epoch": 3.599696612169223, + "grad_norm": 1.2592526547322387, + "learning_rate": 1.9209853616409202e-06, + "loss": 0.1271, + "step": 10680 + }, + { + "epoch": 3.601382100117984, + "grad_norm": 1.2479378541792572, + "learning_rate": 1.91668658292409e-06, + "loss": 0.142, + "step": 10685 + }, + { + "epoch": 3.6030675880667453, + "grad_norm": 1.2671475006269277, + "learning_rate": 1.912391478689549e-06, + "loss": 0.1342, + "step": 10690 + }, + { + "epoch": 3.6047530760155064, + "grad_norm": 1.2538256473714446, + "learning_rate": 1.9081000540559118e-06, + "loss": 0.1371, + "step": 10695 + }, + { + "epoch": 3.6064385639642675, + "grad_norm": 1.3419396368565297, + "learning_rate": 1.9038123141374026e-06, + "loss": 0.1549, + "step": 10700 + }, + { + "epoch": 3.6081240519130287, + "grad_norm": 1.3731470411140996, + "learning_rate": 1.8995282640438556e-06, + "loss": 0.1351, + "step": 10705 + }, + { + "epoch": 3.60980953986179, + "grad_norm": 1.4855548060217267, + "learning_rate": 1.8952479088807125e-06, + "loss": 0.1438, + "step": 10710 + }, + { + "epoch": 3.6114950278105513, + "grad_norm": 4.153027925748704, + "learning_rate": 1.890971253749006e-06, + "loss": 0.1133, + "step": 10715 + }, + { + "epoch": 3.6131805157593124, + "grad_norm": 1.1804058356152094, + "learning_rate": 1.8866983037453618e-06, + "loss": 0.1493, + "step": 10720 + }, + { + "epoch": 3.6148660037080735, + "grad_norm": 1.2290498082921446, + "learning_rate": 1.882429063961988e-06, + "loss": 0.1369, + "step": 10725 + }, + { + "epoch": 3.6165514916568346, + "grad_norm": 1.342012503195042, + "learning_rate": 1.8781635394866743e-06, + "loss": 0.1346, + "step": 10730 + }, + { + "epoch": 3.6182369796055958, + "grad_norm": 1.2552312593496555, + "learning_rate": 1.8739017354027839e-06, + "loss": 0.1333, + "step": 10735 + }, + { + "epoch": 3.619922467554357, + "grad_norm": 1.2101740541295785, + "learning_rate": 1.8696436567892418e-06, + "loss": 0.1337, + "step": 10740 + }, + { + "epoch": 3.6216079555031184, + "grad_norm": 2.527413991315585, + "learning_rate": 1.8653893087205349e-06, + "loss": 0.128, + "step": 10745 + }, + { + "epoch": 3.623293443451879, + "grad_norm": 1.5371710753339014, + "learning_rate": 1.8611386962667028e-06, + "loss": 0.1345, + "step": 10750 + }, + { + "epoch": 3.6249789314006406, + "grad_norm": 1.4229550620159923, + "learning_rate": 1.8568918244933386e-06, + "loss": 0.1483, + "step": 10755 + }, + { + "epoch": 3.6266644193494018, + "grad_norm": 1.1869181162925668, + "learning_rate": 1.852648698461571e-06, + "loss": 0.1296, + "step": 10760 + }, + { + "epoch": 3.628349907298163, + "grad_norm": 1.2672176145150378, + "learning_rate": 1.8484093232280704e-06, + "loss": 0.1387, + "step": 10765 + }, + { + "epoch": 3.630035395246924, + "grad_norm": 1.5353080778660577, + "learning_rate": 1.8441737038450313e-06, + "loss": 0.1473, + "step": 10770 + }, + { + "epoch": 3.631720883195685, + "grad_norm": 1.1920347312767035, + "learning_rate": 1.8399418453601798e-06, + "loss": 0.1364, + "step": 10775 + }, + { + "epoch": 3.633406371144446, + "grad_norm": 1.090443066617329, + "learning_rate": 1.835713752816753e-06, + "loss": 0.1212, + "step": 10780 + }, + { + "epoch": 3.6350918590932073, + "grad_norm": 1.344111238189381, + "learning_rate": 1.8314894312535026e-06, + "loss": 0.1224, + "step": 10785 + }, + { + "epoch": 3.636777347041969, + "grad_norm": 1.2110775527578468, + "learning_rate": 1.827268885704686e-06, + "loss": 0.1608, + "step": 10790 + }, + { + "epoch": 3.63846283499073, + "grad_norm": 1.128471012097077, + "learning_rate": 1.8230521212000635e-06, + "loss": 0.1178, + "step": 10795 + }, + { + "epoch": 3.640148322939491, + "grad_norm": 1.461130970109866, + "learning_rate": 1.8188391427648832e-06, + "loss": 0.1609, + "step": 10800 + }, + { + "epoch": 3.641833810888252, + "grad_norm": 1.2857101622554097, + "learning_rate": 1.8146299554198894e-06, + "loss": 0.1395, + "step": 10805 + }, + { + "epoch": 3.6435192988370133, + "grad_norm": 1.3471375373402812, + "learning_rate": 1.8104245641813e-06, + "loss": 0.1537, + "step": 10810 + }, + { + "epoch": 3.6452047867857744, + "grad_norm": 1.3133766589746239, + "learning_rate": 1.8062229740608166e-06, + "loss": 0.1624, + "step": 10815 + }, + { + "epoch": 3.6468902747345355, + "grad_norm": 1.2682272594750614, + "learning_rate": 1.802025190065606e-06, + "loss": 0.151, + "step": 10820 + }, + { + "epoch": 3.648575762683297, + "grad_norm": 1.337186038866119, + "learning_rate": 1.7978312171982993e-06, + "loss": 0.1414, + "step": 10825 + }, + { + "epoch": 3.6502612506320578, + "grad_norm": 1.225988906650177, + "learning_rate": 1.7936410604569859e-06, + "loss": 0.1461, + "step": 10830 + }, + { + "epoch": 3.6519467385808193, + "grad_norm": 1.463406152544717, + "learning_rate": 1.7894547248352101e-06, + "loss": 0.1402, + "step": 10835 + }, + { + "epoch": 3.6536322265295804, + "grad_norm": 1.179333130684187, + "learning_rate": 1.785272215321962e-06, + "loss": 0.1521, + "step": 10840 + }, + { + "epoch": 3.6553177144783415, + "grad_norm": 1.274228070915282, + "learning_rate": 1.7810935369016692e-06, + "loss": 0.1295, + "step": 10845 + }, + { + "epoch": 3.6570032024271026, + "grad_norm": 1.2766295202607583, + "learning_rate": 1.7769186945541956e-06, + "loss": 0.1349, + "step": 10850 + }, + { + "epoch": 3.6586886903758638, + "grad_norm": 2.0165099013912973, + "learning_rate": 1.7727476932548304e-06, + "loss": 0.1525, + "step": 10855 + }, + { + "epoch": 3.660374178324625, + "grad_norm": 2.9137908892716666, + "learning_rate": 1.7685805379742921e-06, + "loss": 0.1549, + "step": 10860 + }, + { + "epoch": 3.662059666273386, + "grad_norm": 1.178746265790924, + "learning_rate": 1.7644172336787096e-06, + "loss": 0.1315, + "step": 10865 + }, + { + "epoch": 3.6637451542221475, + "grad_norm": 1.4017366203879746, + "learning_rate": 1.7602577853296237e-06, + "loss": 0.1342, + "step": 10870 + }, + { + "epoch": 3.6654306421709086, + "grad_norm": 1.1986093038186967, + "learning_rate": 1.7561021978839814e-06, + "loss": 0.1449, + "step": 10875 + }, + { + "epoch": 3.6671161301196697, + "grad_norm": 1.3621368224019756, + "learning_rate": 1.7519504762941303e-06, + "loss": 0.1257, + "step": 10880 + }, + { + "epoch": 3.668801618068431, + "grad_norm": 1.229922223662929, + "learning_rate": 1.7478026255078067e-06, + "loss": 0.1504, + "step": 10885 + }, + { + "epoch": 3.670487106017192, + "grad_norm": 1.3218602290130936, + "learning_rate": 1.7436586504681357e-06, + "loss": 0.1293, + "step": 10890 + }, + { + "epoch": 3.672172593965953, + "grad_norm": 1.2186336535728164, + "learning_rate": 1.7395185561136219e-06, + "loss": 0.1354, + "step": 10895 + }, + { + "epoch": 3.673858081914714, + "grad_norm": 1.4301576145878616, + "learning_rate": 1.7353823473781506e-06, + "loss": 0.1412, + "step": 10900 + }, + { + "epoch": 3.6755435698634757, + "grad_norm": 1.3124284024611363, + "learning_rate": 1.7312500291909707e-06, + "loss": 0.1439, + "step": 10905 + }, + { + "epoch": 3.6772290578122364, + "grad_norm": 1.3193249001410443, + "learning_rate": 1.7271216064766955e-06, + "loss": 0.1226, + "step": 10910 + }, + { + "epoch": 3.678914545760998, + "grad_norm": 1.4046165543654832, + "learning_rate": 1.7229970841552985e-06, + "loss": 0.1325, + "step": 10915 + }, + { + "epoch": 3.680600033709759, + "grad_norm": 1.3992901351664337, + "learning_rate": 1.7188764671421055e-06, + "loss": 0.116, + "step": 10920 + }, + { + "epoch": 3.68228552165852, + "grad_norm": 1.0904309846631886, + "learning_rate": 1.7147597603477845e-06, + "loss": 0.1559, + "step": 10925 + }, + { + "epoch": 3.6839710096072813, + "grad_norm": 1.3359699437178203, + "learning_rate": 1.710646968678345e-06, + "loss": 0.1486, + "step": 10930 + }, + { + "epoch": 3.6856564975560424, + "grad_norm": 1.2327647372204082, + "learning_rate": 1.7065380970351304e-06, + "loss": 0.1308, + "step": 10935 + }, + { + "epoch": 3.6873419855048035, + "grad_norm": 1.4799913603094659, + "learning_rate": 1.702433150314816e-06, + "loss": 0.1334, + "step": 10940 + }, + { + "epoch": 3.6890274734535646, + "grad_norm": 1.2621914955513351, + "learning_rate": 1.6983321334093955e-06, + "loss": 0.1263, + "step": 10945 + }, + { + "epoch": 3.690712961402326, + "grad_norm": 1.4318181939663055, + "learning_rate": 1.6942350512061788e-06, + "loss": 0.1223, + "step": 10950 + }, + { + "epoch": 3.6923984493510873, + "grad_norm": 1.252786341936707, + "learning_rate": 1.6901419085877902e-06, + "loss": 0.1268, + "step": 10955 + }, + { + "epoch": 3.6940839372998484, + "grad_norm": 1.3156485619952978, + "learning_rate": 1.6860527104321594e-06, + "loss": 0.1349, + "step": 10960 + }, + { + "epoch": 3.6957694252486095, + "grad_norm": 1.4287223642883522, + "learning_rate": 1.6819674616125109e-06, + "loss": 0.142, + "step": 10965 + }, + { + "epoch": 3.6974549131973706, + "grad_norm": 1.4057183581891188, + "learning_rate": 1.6778861669973661e-06, + "loss": 0.1316, + "step": 10970 + }, + { + "epoch": 3.6991404011461317, + "grad_norm": 1.2378528563096136, + "learning_rate": 1.6738088314505312e-06, + "loss": 0.1116, + "step": 10975 + }, + { + "epoch": 3.700825889094893, + "grad_norm": 1.2196288849624572, + "learning_rate": 1.6697354598310995e-06, + "loss": 0.1373, + "step": 10980 + }, + { + "epoch": 3.7025113770436544, + "grad_norm": 1.4958660574515723, + "learning_rate": 1.6656660569934353e-06, + "loss": 0.127, + "step": 10985 + }, + { + "epoch": 3.704196864992415, + "grad_norm": 1.2609253493145949, + "learning_rate": 1.6616006277871727e-06, + "loss": 0.1317, + "step": 10990 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 1.4129856363806284, + "learning_rate": 1.6575391770572168e-06, + "loss": 0.1329, + "step": 10995 + }, + { + "epoch": 3.7075678408899377, + "grad_norm": 1.3389979570312858, + "learning_rate": 1.6534817096437228e-06, + "loss": 0.1259, + "step": 11000 + }, + { + "epoch": 3.709253328838699, + "grad_norm": 1.2637090064308742, + "learning_rate": 1.6494282303821075e-06, + "loss": 0.1176, + "step": 11005 + }, + { + "epoch": 3.71093881678746, + "grad_norm": 1.2416217473428255, + "learning_rate": 1.6453787441030284e-06, + "loss": 0.1419, + "step": 11010 + }, + { + "epoch": 3.712624304736221, + "grad_norm": 1.2100826998838317, + "learning_rate": 1.6413332556323847e-06, + "loss": 0.1288, + "step": 11015 + }, + { + "epoch": 3.714309792684982, + "grad_norm": 1.168320926301484, + "learning_rate": 1.6372917697913165e-06, + "loss": 0.1239, + "step": 11020 + }, + { + "epoch": 3.7159952806337433, + "grad_norm": 1.2839813029539138, + "learning_rate": 1.6332542913961874e-06, + "loss": 0.1267, + "step": 11025 + }, + { + "epoch": 3.717680768582505, + "grad_norm": 1.353670532278106, + "learning_rate": 1.6292208252585922e-06, + "loss": 0.1476, + "step": 11030 + }, + { + "epoch": 3.719366256531266, + "grad_norm": 1.3220795371446468, + "learning_rate": 1.6251913761853378e-06, + "loss": 0.1398, + "step": 11035 + }, + { + "epoch": 3.721051744480027, + "grad_norm": 1.1553512249234226, + "learning_rate": 1.6211659489784448e-06, + "loss": 0.1544, + "step": 11040 + }, + { + "epoch": 3.722737232428788, + "grad_norm": 1.3130351186312237, + "learning_rate": 1.6171445484351462e-06, + "loss": 0.1414, + "step": 11045 + }, + { + "epoch": 3.7244227203775493, + "grad_norm": 1.142910314927335, + "learning_rate": 1.6131271793478704e-06, + "loss": 0.1173, + "step": 11050 + }, + { + "epoch": 3.7261082083263104, + "grad_norm": 1.2919554545792558, + "learning_rate": 1.6091138465042434e-06, + "loss": 0.1371, + "step": 11055 + }, + { + "epoch": 3.7277936962750715, + "grad_norm": 1.7895386627119003, + "learning_rate": 1.6051045546870791e-06, + "loss": 0.1403, + "step": 11060 + }, + { + "epoch": 3.729479184223833, + "grad_norm": 1.2380015640589026, + "learning_rate": 1.6010993086743804e-06, + "loss": 0.1387, + "step": 11065 + }, + { + "epoch": 3.7311646721725937, + "grad_norm": 1.2733695615773877, + "learning_rate": 1.5970981132393266e-06, + "loss": 0.1429, + "step": 11070 + }, + { + "epoch": 3.7328501601213553, + "grad_norm": 1.394081091787083, + "learning_rate": 1.5931009731502673e-06, + "loss": 0.1352, + "step": 11075 + }, + { + "epoch": 3.7345356480701164, + "grad_norm": 1.2588665514022965, + "learning_rate": 1.5891078931707194e-06, + "loss": 0.126, + "step": 11080 + }, + { + "epoch": 3.7362211360188775, + "grad_norm": 1.36710298121437, + "learning_rate": 1.5851188780593668e-06, + "loss": 0.1466, + "step": 11085 + }, + { + "epoch": 3.7379066239676386, + "grad_norm": 1.3953484795375954, + "learning_rate": 1.581133932570043e-06, + "loss": 0.1349, + "step": 11090 + }, + { + "epoch": 3.7395921119163997, + "grad_norm": 1.1442607060166285, + "learning_rate": 1.5771530614517339e-06, + "loss": 0.1474, + "step": 11095 + }, + { + "epoch": 3.741277599865161, + "grad_norm": 1.3852545739441826, + "learning_rate": 1.5731762694485681e-06, + "loss": 0.127, + "step": 11100 + }, + { + "epoch": 3.742963087813922, + "grad_norm": 1.344833226682109, + "learning_rate": 1.5692035612998163e-06, + "loss": 0.12, + "step": 11105 + }, + { + "epoch": 3.7446485757626835, + "grad_norm": 1.4701799239754092, + "learning_rate": 1.565234941739882e-06, + "loss": 0.134, + "step": 11110 + }, + { + "epoch": 3.7463340637114446, + "grad_norm": 1.3909412719624628, + "learning_rate": 1.5612704154982937e-06, + "loss": 0.129, + "step": 11115 + }, + { + "epoch": 3.7480195516602057, + "grad_norm": 1.1780797760040174, + "learning_rate": 1.557309987299701e-06, + "loss": 0.1346, + "step": 11120 + }, + { + "epoch": 3.749705039608967, + "grad_norm": 1.133305541282998, + "learning_rate": 1.5533536618638755e-06, + "loss": 0.1102, + "step": 11125 + }, + { + "epoch": 3.751390527557728, + "grad_norm": 1.4051303257982724, + "learning_rate": 1.5494014439056931e-06, + "loss": 0.1342, + "step": 11130 + }, + { + "epoch": 3.753076015506489, + "grad_norm": 2.3301186322425256, + "learning_rate": 1.5454533381351378e-06, + "loss": 0.1154, + "step": 11135 + }, + { + "epoch": 3.75476150345525, + "grad_norm": 1.4838583399462095, + "learning_rate": 1.5415093492572902e-06, + "loss": 0.1198, + "step": 11140 + }, + { + "epoch": 3.7564469914040117, + "grad_norm": 1.3862808847131185, + "learning_rate": 1.5375694819723286e-06, + "loss": 0.1336, + "step": 11145 + }, + { + "epoch": 3.7581324793527724, + "grad_norm": 1.4877096838778645, + "learning_rate": 1.5336337409755198e-06, + "loss": 0.1634, + "step": 11150 + }, + { + "epoch": 3.759817967301534, + "grad_norm": 1.205089520477097, + "learning_rate": 1.5297021309572085e-06, + "loss": 0.1457, + "step": 11155 + }, + { + "epoch": 3.761503455250295, + "grad_norm": 1.2930805196199944, + "learning_rate": 1.5257746566028198e-06, + "loss": 0.1343, + "step": 11160 + }, + { + "epoch": 3.763188943199056, + "grad_norm": 1.4997910724555106, + "learning_rate": 1.5218513225928473e-06, + "loss": 0.1419, + "step": 11165 + }, + { + "epoch": 3.7648744311478173, + "grad_norm": 1.326257220590502, + "learning_rate": 1.5179321336028557e-06, + "loss": 0.135, + "step": 11170 + }, + { + "epoch": 3.7665599190965784, + "grad_norm": 3.3805180320738004, + "learning_rate": 1.5140170943034633e-06, + "loss": 0.1551, + "step": 11175 + }, + { + "epoch": 3.7682454070453395, + "grad_norm": 1.502125772594316, + "learning_rate": 1.5101062093603502e-06, + "loss": 0.1246, + "step": 11180 + }, + { + "epoch": 3.7699308949941006, + "grad_norm": 1.3387209752283937, + "learning_rate": 1.506199483434238e-06, + "loss": 0.1531, + "step": 11185 + }, + { + "epoch": 3.771616382942862, + "grad_norm": 1.2512897849676383, + "learning_rate": 1.5022969211808997e-06, + "loss": 0.1313, + "step": 11190 + }, + { + "epoch": 3.7733018708916233, + "grad_norm": 1.3046445675910159, + "learning_rate": 1.4983985272511404e-06, + "loss": 0.1413, + "step": 11195 + }, + { + "epoch": 3.7749873588403844, + "grad_norm": 1.3228199491773467, + "learning_rate": 1.4945043062907993e-06, + "loss": 0.145, + "step": 11200 + }, + { + "epoch": 3.7766728467891455, + "grad_norm": 1.1277066014272603, + "learning_rate": 1.4906142629407421e-06, + "loss": 0.1301, + "step": 11205 + }, + { + "epoch": 3.7783583347379066, + "grad_norm": 1.401023583324405, + "learning_rate": 1.486728401836859e-06, + "loss": 0.1458, + "step": 11210 + }, + { + "epoch": 3.7800438226866677, + "grad_norm": 2.4623045803277965, + "learning_rate": 1.4828467276100516e-06, + "loss": 0.1193, + "step": 11215 + }, + { + "epoch": 3.781729310635429, + "grad_norm": 1.2017613317670408, + "learning_rate": 1.4789692448862364e-06, + "loss": 0.133, + "step": 11220 + }, + { + "epoch": 3.7834147985841904, + "grad_norm": 1.4369487121538838, + "learning_rate": 1.47509595828633e-06, + "loss": 0.1196, + "step": 11225 + }, + { + "epoch": 3.785100286532951, + "grad_norm": 1.2493238247353895, + "learning_rate": 1.4712268724262529e-06, + "loss": 0.1216, + "step": 11230 + }, + { + "epoch": 3.7867857744817126, + "grad_norm": 1.4063951576050708, + "learning_rate": 1.4673619919169168e-06, + "loss": 0.1291, + "step": 11235 + }, + { + "epoch": 3.7884712624304737, + "grad_norm": 1.2906232810719316, + "learning_rate": 1.4635013213642213e-06, + "loss": 0.1164, + "step": 11240 + }, + { + "epoch": 3.790156750379235, + "grad_norm": 1.3767491306273862, + "learning_rate": 1.4596448653690493e-06, + "loss": 0.1525, + "step": 11245 + }, + { + "epoch": 3.791842238327996, + "grad_norm": 1.2489608029596313, + "learning_rate": 1.4557926285272622e-06, + "loss": 0.1228, + "step": 11250 + }, + { + "epoch": 3.793527726276757, + "grad_norm": 1.6760503093343815, + "learning_rate": 1.4519446154296951e-06, + "loss": 0.1373, + "step": 11255 + }, + { + "epoch": 3.795213214225518, + "grad_norm": 1.5353684247978776, + "learning_rate": 1.4481008306621447e-06, + "loss": 0.1596, + "step": 11260 + }, + { + "epoch": 3.7968987021742793, + "grad_norm": 1.4731419789805278, + "learning_rate": 1.44426127880537e-06, + "loss": 0.1284, + "step": 11265 + }, + { + "epoch": 3.798584190123041, + "grad_norm": 1.0635085858618885, + "learning_rate": 1.4404259644350899e-06, + "loss": 0.1425, + "step": 11270 + }, + { + "epoch": 3.800269678071802, + "grad_norm": 1.3188467927039411, + "learning_rate": 1.436594892121968e-06, + "loss": 0.138, + "step": 11275 + }, + { + "epoch": 3.801955166020563, + "grad_norm": 1.2440191333955246, + "learning_rate": 1.4327680664316152e-06, + "loss": 0.1305, + "step": 11280 + }, + { + "epoch": 3.803640653969324, + "grad_norm": 1.2837662103817602, + "learning_rate": 1.4289454919245788e-06, + "loss": 0.1379, + "step": 11285 + }, + { + "epoch": 3.8053261419180853, + "grad_norm": 1.1769279984930598, + "learning_rate": 1.4251271731563438e-06, + "loss": 0.125, + "step": 11290 + }, + { + "epoch": 3.8070116298668464, + "grad_norm": 1.2766168881855045, + "learning_rate": 1.4213131146773229e-06, + "loss": 0.1381, + "step": 11295 + }, + { + "epoch": 3.8086971178156075, + "grad_norm": 1.5157427588705457, + "learning_rate": 1.4175033210328493e-06, + "loss": 0.1276, + "step": 11300 + }, + { + "epoch": 3.810382605764369, + "grad_norm": 1.2398980650711655, + "learning_rate": 1.4136977967631743e-06, + "loss": 0.1245, + "step": 11305 + }, + { + "epoch": 3.8120680937131297, + "grad_norm": 1.3371425887454111, + "learning_rate": 1.4098965464034609e-06, + "loss": 0.1463, + "step": 11310 + }, + { + "epoch": 3.8137535816618913, + "grad_norm": 1.3161589516680494, + "learning_rate": 1.406099574483782e-06, + "loss": 0.1242, + "step": 11315 + }, + { + "epoch": 3.8154390696106524, + "grad_norm": 1.3572046560700748, + "learning_rate": 1.4023068855291082e-06, + "loss": 0.1351, + "step": 11320 + }, + { + "epoch": 3.8171245575594135, + "grad_norm": 1.4045816580208925, + "learning_rate": 1.3985184840593052e-06, + "loss": 0.1121, + "step": 11325 + }, + { + "epoch": 3.8188100455081746, + "grad_norm": 1.283108073289214, + "learning_rate": 1.394734374589133e-06, + "loss": 0.1325, + "step": 11330 + }, + { + "epoch": 3.8204955334569357, + "grad_norm": 1.791579105416178, + "learning_rate": 1.390954561628236e-06, + "loss": 0.1288, + "step": 11335 + }, + { + "epoch": 3.822181021405697, + "grad_norm": 1.3451170525968954, + "learning_rate": 1.3871790496811356e-06, + "loss": 0.1363, + "step": 11340 + }, + { + "epoch": 3.823866509354458, + "grad_norm": 1.3587677057777976, + "learning_rate": 1.3834078432472292e-06, + "loss": 0.1272, + "step": 11345 + }, + { + "epoch": 3.8255519973032195, + "grad_norm": 1.2621330786954483, + "learning_rate": 1.379640946820781e-06, + "loss": 0.1553, + "step": 11350 + }, + { + "epoch": 3.8272374852519806, + "grad_norm": 1.2749397547430936, + "learning_rate": 1.3758783648909246e-06, + "loss": 0.1302, + "step": 11355 + }, + { + "epoch": 3.8289229732007417, + "grad_norm": 1.360132376641049, + "learning_rate": 1.3721201019416458e-06, + "loss": 0.1134, + "step": 11360 + }, + { + "epoch": 3.830608461149503, + "grad_norm": 1.3463797002311013, + "learning_rate": 1.3683661624517847e-06, + "loss": 0.1339, + "step": 11365 + }, + { + "epoch": 3.832293949098264, + "grad_norm": 1.1656547531021166, + "learning_rate": 1.364616550895031e-06, + "loss": 0.1217, + "step": 11370 + }, + { + "epoch": 3.833979437047025, + "grad_norm": 1.2493967090178022, + "learning_rate": 1.3608712717399174e-06, + "loss": 0.1273, + "step": 11375 + }, + { + "epoch": 3.835664924995786, + "grad_norm": 1.3225994130273213, + "learning_rate": 1.35713032944981e-06, + "loss": 0.1312, + "step": 11380 + }, + { + "epoch": 3.8373504129445477, + "grad_norm": 1.4418769066940484, + "learning_rate": 1.353393728482909e-06, + "loss": 0.1313, + "step": 11385 + }, + { + "epoch": 3.8390359008933084, + "grad_norm": 1.4818666743645907, + "learning_rate": 1.3496614732922375e-06, + "loss": 0.1337, + "step": 11390 + }, + { + "epoch": 3.84072138884207, + "grad_norm": 1.459798900231498, + "learning_rate": 1.3459335683256457e-06, + "loss": 0.1485, + "step": 11395 + }, + { + "epoch": 3.842406876790831, + "grad_norm": 1.4538320839709264, + "learning_rate": 1.3422100180257936e-06, + "loss": 0.1604, + "step": 11400 + }, + { + "epoch": 3.844092364739592, + "grad_norm": 1.7742149272289591, + "learning_rate": 1.3384908268301566e-06, + "loss": 0.1211, + "step": 11405 + }, + { + "epoch": 3.8457778526883533, + "grad_norm": 1.1933454350246195, + "learning_rate": 1.3347759991710109e-06, + "loss": 0.1263, + "step": 11410 + }, + { + "epoch": 3.8474633406371144, + "grad_norm": 1.4852947422766494, + "learning_rate": 1.3310655394754335e-06, + "loss": 0.1281, + "step": 11415 + }, + { + "epoch": 3.8491488285858755, + "grad_norm": 1.4595205492510024, + "learning_rate": 1.3273594521652994e-06, + "loss": 0.138, + "step": 11420 + }, + { + "epoch": 3.8508343165346366, + "grad_norm": 1.570597937583335, + "learning_rate": 1.3236577416572682e-06, + "loss": 0.1359, + "step": 11425 + }, + { + "epoch": 3.852519804483398, + "grad_norm": 1.3041151431665867, + "learning_rate": 1.319960412362785e-06, + "loss": 0.1439, + "step": 11430 + }, + { + "epoch": 3.8542052924321593, + "grad_norm": 1.1223750056339685, + "learning_rate": 1.3162674686880778e-06, + "loss": 0.1207, + "step": 11435 + }, + { + "epoch": 3.8558907803809204, + "grad_norm": 1.6708025803319664, + "learning_rate": 1.31257891503414e-06, + "loss": 0.1439, + "step": 11440 + }, + { + "epoch": 3.8575762683296815, + "grad_norm": 1.547027523067389, + "learning_rate": 1.3088947557967412e-06, + "loss": 0.135, + "step": 11445 + }, + { + "epoch": 3.8592617562784426, + "grad_norm": 1.3025590668689755, + "learning_rate": 1.3052149953664107e-06, + "loss": 0.1371, + "step": 11450 + }, + { + "epoch": 3.8609472442272037, + "grad_norm": 1.4324892259690334, + "learning_rate": 1.3015396381284317e-06, + "loss": 0.1335, + "step": 11455 + }, + { + "epoch": 3.862632732175965, + "grad_norm": 1.3895613413035335, + "learning_rate": 1.2978686884628489e-06, + "loss": 0.121, + "step": 11460 + }, + { + "epoch": 3.8643182201247264, + "grad_norm": 1.3093726468294993, + "learning_rate": 1.2942021507444475e-06, + "loss": 0.1215, + "step": 11465 + }, + { + "epoch": 3.866003708073487, + "grad_norm": 1.450148805180268, + "learning_rate": 1.2905400293427555e-06, + "loss": 0.1363, + "step": 11470 + }, + { + "epoch": 3.8676891960222486, + "grad_norm": 1.2912630339556102, + "learning_rate": 1.2868823286220372e-06, + "loss": 0.1119, + "step": 11475 + }, + { + "epoch": 3.8693746839710097, + "grad_norm": 1.251171279718913, + "learning_rate": 1.2832290529412954e-06, + "loss": 0.1342, + "step": 11480 + }, + { + "epoch": 3.871060171919771, + "grad_norm": 1.4914594893258193, + "learning_rate": 1.2795802066542523e-06, + "loss": 0.1179, + "step": 11485 + }, + { + "epoch": 3.872745659868532, + "grad_norm": 1.4830649563474119, + "learning_rate": 1.2759357941093536e-06, + "loss": 0.1293, + "step": 11490 + }, + { + "epoch": 3.874431147817293, + "grad_norm": 1.529046490796919, + "learning_rate": 1.2722958196497599e-06, + "loss": 0.1401, + "step": 11495 + }, + { + "epoch": 3.876116635766054, + "grad_norm": 1.364320473286898, + "learning_rate": 1.2686602876133457e-06, + "loss": 0.137, + "step": 11500 + }, + { + "epoch": 3.8778021237148153, + "grad_norm": 1.2666932819544312, + "learning_rate": 1.26502920233269e-06, + "loss": 0.1053, + "step": 11505 + }, + { + "epoch": 3.879487611663577, + "grad_norm": 1.3644748845747552, + "learning_rate": 1.2614025681350712e-06, + "loss": 0.1279, + "step": 11510 + }, + { + "epoch": 3.881173099612338, + "grad_norm": 1.3713293207300685, + "learning_rate": 1.2577803893424628e-06, + "loss": 0.1174, + "step": 11515 + }, + { + "epoch": 3.882858587561099, + "grad_norm": 1.2780743363842897, + "learning_rate": 1.2541626702715316e-06, + "loss": 0.1244, + "step": 11520 + }, + { + "epoch": 3.88454407550986, + "grad_norm": 1.9396492050763243, + "learning_rate": 1.2505494152336294e-06, + "loss": 0.1284, + "step": 11525 + }, + { + "epoch": 3.8862295634586213, + "grad_norm": 1.274118183453942, + "learning_rate": 1.2469406285347851e-06, + "loss": 0.133, + "step": 11530 + }, + { + "epoch": 3.8879150514073824, + "grad_norm": 1.6720765742990575, + "learning_rate": 1.2433363144757037e-06, + "loss": 0.1339, + "step": 11535 + }, + { + "epoch": 3.8896005393561435, + "grad_norm": 1.3845866516122425, + "learning_rate": 1.2397364773517618e-06, + "loss": 0.157, + "step": 11540 + }, + { + "epoch": 3.891286027304905, + "grad_norm": 1.331781088462336, + "learning_rate": 1.2361411214529995e-06, + "loss": 0.1496, + "step": 11545 + }, + { + "epoch": 3.8929715152536657, + "grad_norm": 1.0935379891948103, + "learning_rate": 1.2325502510641135e-06, + "loss": 0.1165, + "step": 11550 + }, + { + "epoch": 3.8946570032024272, + "grad_norm": 1.5071622175057107, + "learning_rate": 1.2289638704644612e-06, + "loss": 0.14, + "step": 11555 + }, + { + "epoch": 3.8963424911511884, + "grad_norm": 1.328602243598805, + "learning_rate": 1.2253819839280435e-06, + "loss": 0.1403, + "step": 11560 + }, + { + "epoch": 3.8980279790999495, + "grad_norm": 1.2820953430032531, + "learning_rate": 1.221804595723511e-06, + "loss": 0.1464, + "step": 11565 + }, + { + "epoch": 3.8997134670487106, + "grad_norm": 1.4155052163367552, + "learning_rate": 1.2182317101141477e-06, + "loss": 0.1527, + "step": 11570 + }, + { + "epoch": 3.9013989549974717, + "grad_norm": 1.380507169619867, + "learning_rate": 1.2146633313578766e-06, + "loss": 0.1175, + "step": 11575 + }, + { + "epoch": 3.903084442946233, + "grad_norm": 1.14637493772561, + "learning_rate": 1.2110994637072448e-06, + "loss": 0.1168, + "step": 11580 + }, + { + "epoch": 3.904769930894994, + "grad_norm": 1.2904859905703334, + "learning_rate": 1.2075401114094303e-06, + "loss": 0.1238, + "step": 11585 + }, + { + "epoch": 3.9064554188437555, + "grad_norm": 1.379140301488519, + "learning_rate": 1.2039852787062222e-06, + "loss": 0.1157, + "step": 11590 + }, + { + "epoch": 3.9081409067925166, + "grad_norm": 1.779540026975358, + "learning_rate": 1.2004349698340307e-06, + "loss": 0.1331, + "step": 11595 + }, + { + "epoch": 3.9098263947412777, + "grad_norm": 1.2923324777083516, + "learning_rate": 1.1968891890238681e-06, + "loss": 0.1296, + "step": 11600 + }, + { + "epoch": 3.911511882690039, + "grad_norm": 1.483539619591517, + "learning_rate": 1.193347940501357e-06, + "loss": 0.1292, + "step": 11605 + }, + { + "epoch": 3.9131973706388, + "grad_norm": 1.27187117436411, + "learning_rate": 1.1898112284867137e-06, + "loss": 0.119, + "step": 11610 + }, + { + "epoch": 3.914882858587561, + "grad_norm": 1.409407748050843, + "learning_rate": 1.1862790571947502e-06, + "loss": 0.134, + "step": 11615 + }, + { + "epoch": 3.916568346536322, + "grad_norm": 1.3809295704352462, + "learning_rate": 1.1827514308348652e-06, + "loss": 0.1378, + "step": 11620 + }, + { + "epoch": 3.9182538344850837, + "grad_norm": 1.1843572362723038, + "learning_rate": 1.1792283536110444e-06, + "loss": 0.1291, + "step": 11625 + }, + { + "epoch": 3.9199393224338444, + "grad_norm": 1.417513753472395, + "learning_rate": 1.1757098297218523e-06, + "loss": 0.137, + "step": 11630 + }, + { + "epoch": 3.921624810382606, + "grad_norm": 1.2719598733782485, + "learning_rate": 1.172195863360423e-06, + "loss": 0.1504, + "step": 11635 + }, + { + "epoch": 3.923310298331367, + "grad_norm": 1.5040447813022308, + "learning_rate": 1.1686864587144614e-06, + "loss": 0.1341, + "step": 11640 + }, + { + "epoch": 3.924995786280128, + "grad_norm": 1.64275717096149, + "learning_rate": 1.165181619966238e-06, + "loss": 0.1306, + "step": 11645 + }, + { + "epoch": 3.9266812742288892, + "grad_norm": 1.0752213608872785, + "learning_rate": 1.161681351292579e-06, + "loss": 0.1216, + "step": 11650 + }, + { + "epoch": 3.9283667621776504, + "grad_norm": 1.2738139268445927, + "learning_rate": 1.1581856568648658e-06, + "loss": 0.1273, + "step": 11655 + }, + { + "epoch": 3.9300522501264115, + "grad_norm": 1.1415927007120943, + "learning_rate": 1.1546945408490267e-06, + "loss": 0.1439, + "step": 11660 + }, + { + "epoch": 3.9317377380751726, + "grad_norm": 1.48432947909629, + "learning_rate": 1.1512080074055365e-06, + "loss": 0.1314, + "step": 11665 + }, + { + "epoch": 3.933423226023934, + "grad_norm": 1.5177086660710022, + "learning_rate": 1.1477260606894091e-06, + "loss": 0.1317, + "step": 11670 + }, + { + "epoch": 3.9351087139726952, + "grad_norm": 1.2480924669984619, + "learning_rate": 1.1442487048501888e-06, + "loss": 0.1179, + "step": 11675 + }, + { + "epoch": 3.9367942019214563, + "grad_norm": 1.1673574109519713, + "learning_rate": 1.1407759440319504e-06, + "loss": 0.1116, + "step": 11680 + }, + { + "epoch": 3.9384796898702175, + "grad_norm": 1.189692540307074, + "learning_rate": 1.1373077823732948e-06, + "loss": 0.1401, + "step": 11685 + }, + { + "epoch": 3.9401651778189786, + "grad_norm": 1.2474132437518495, + "learning_rate": 1.1338442240073395e-06, + "loss": 0.1399, + "step": 11690 + }, + { + "epoch": 3.9418506657677397, + "grad_norm": 1.3841753764772589, + "learning_rate": 1.130385273061716e-06, + "loss": 0.1153, + "step": 11695 + }, + { + "epoch": 3.943536153716501, + "grad_norm": 1.3670410936056547, + "learning_rate": 1.1269309336585648e-06, + "loss": 0.121, + "step": 11700 + }, + { + "epoch": 3.9452216416652623, + "grad_norm": 4.2566426506681525, + "learning_rate": 1.123481209914533e-06, + "loss": 0.1247, + "step": 11705 + }, + { + "epoch": 3.946907129614023, + "grad_norm": 1.2549136289873755, + "learning_rate": 1.1200361059407665e-06, + "loss": 0.1192, + "step": 11710 + }, + { + "epoch": 3.9485926175627846, + "grad_norm": 1.4272672440381446, + "learning_rate": 1.116595625842905e-06, + "loss": 0.15, + "step": 11715 + }, + { + "epoch": 3.9502781055115457, + "grad_norm": 1.2059470758704138, + "learning_rate": 1.1131597737210758e-06, + "loss": 0.1389, + "step": 11720 + }, + { + "epoch": 3.951963593460307, + "grad_norm": 1.4754281793219044, + "learning_rate": 1.1097285536698922e-06, + "loss": 0.1324, + "step": 11725 + }, + { + "epoch": 3.953649081409068, + "grad_norm": 1.2141992362969727, + "learning_rate": 1.106301969778451e-06, + "loss": 0.111, + "step": 11730 + }, + { + "epoch": 3.955334569357829, + "grad_norm": 1.2918810114814279, + "learning_rate": 1.1028800261303186e-06, + "loss": 0.1549, + "step": 11735 + }, + { + "epoch": 3.95702005730659, + "grad_norm": 1.461639333770653, + "learning_rate": 1.0994627268035324e-06, + "loss": 0.1265, + "step": 11740 + }, + { + "epoch": 3.9587055452553512, + "grad_norm": 1.7712147161462113, + "learning_rate": 1.0960500758705983e-06, + "loss": 0.1234, + "step": 11745 + }, + { + "epoch": 3.960391033204113, + "grad_norm": 1.2394532177536923, + "learning_rate": 1.0926420773984813e-06, + "loss": 0.1277, + "step": 11750 + }, + { + "epoch": 3.962076521152874, + "grad_norm": 1.3595241710627732, + "learning_rate": 1.0892387354486006e-06, + "loss": 0.1334, + "step": 11755 + }, + { + "epoch": 3.963762009101635, + "grad_norm": 1.3596726580758596, + "learning_rate": 1.0858400540768255e-06, + "loss": 0.1183, + "step": 11760 + }, + { + "epoch": 3.965447497050396, + "grad_norm": 1.3995731294824745, + "learning_rate": 1.0824460373334716e-06, + "loss": 0.1298, + "step": 11765 + }, + { + "epoch": 3.9671329849991572, + "grad_norm": 1.585776588487857, + "learning_rate": 1.0790566892632986e-06, + "loss": 0.129, + "step": 11770 + }, + { + "epoch": 3.9688184729479183, + "grad_norm": 1.3895099701140083, + "learning_rate": 1.0756720139054976e-06, + "loss": 0.1348, + "step": 11775 + }, + { + "epoch": 3.9705039608966795, + "grad_norm": 1.500716624265993, + "learning_rate": 1.072292015293696e-06, + "loss": 0.1283, + "step": 11780 + }, + { + "epoch": 3.972189448845441, + "grad_norm": 1.3111959478401685, + "learning_rate": 1.0689166974559427e-06, + "loss": 0.12, + "step": 11785 + }, + { + "epoch": 3.9738749367942017, + "grad_norm": 1.3140798821113782, + "learning_rate": 1.065546064414713e-06, + "loss": 0.1284, + "step": 11790 + }, + { + "epoch": 3.9755604247429632, + "grad_norm": 1.3794025142465844, + "learning_rate": 1.062180120186897e-06, + "loss": 0.127, + "step": 11795 + }, + { + "epoch": 3.9772459126917243, + "grad_norm": 1.235243365041627, + "learning_rate": 1.0588188687837953e-06, + "loss": 0.1025, + "step": 11800 + }, + { + "epoch": 3.9789314006404854, + "grad_norm": 1.3093075471981128, + "learning_rate": 1.055462314211118e-06, + "loss": 0.1162, + "step": 11805 + }, + { + "epoch": 3.9806168885892466, + "grad_norm": 1.4024378987390371, + "learning_rate": 1.0521104604689792e-06, + "loss": 0.1495, + "step": 11810 + }, + { + "epoch": 3.9823023765380077, + "grad_norm": 1.3548800011731634, + "learning_rate": 1.0487633115518869e-06, + "loss": 0.1196, + "step": 11815 + }, + { + "epoch": 3.983987864486769, + "grad_norm": 1.3027882530451373, + "learning_rate": 1.0454208714487475e-06, + "loss": 0.1236, + "step": 11820 + }, + { + "epoch": 3.98567335243553, + "grad_norm": 1.2656768758124717, + "learning_rate": 1.0420831441428502e-06, + "loss": 0.1268, + "step": 11825 + }, + { + "epoch": 3.9873588403842914, + "grad_norm": 1.28518948121888, + "learning_rate": 1.0387501336118712e-06, + "loss": 0.1151, + "step": 11830 + }, + { + "epoch": 3.9890443283330526, + "grad_norm": 1.2394477069103667, + "learning_rate": 1.0354218438278652e-06, + "loss": 0.1432, + "step": 11835 + }, + { + "epoch": 3.9907298162818137, + "grad_norm": 1.2215048085667828, + "learning_rate": 1.0320982787572603e-06, + "loss": 0.1271, + "step": 11840 + }, + { + "epoch": 3.992415304230575, + "grad_norm": 1.194697649390314, + "learning_rate": 1.0287794423608532e-06, + "loss": 0.12, + "step": 11845 + }, + { + "epoch": 3.994100792179336, + "grad_norm": 1.5046819102871702, + "learning_rate": 1.0254653385938074e-06, + "loss": 0.1197, + "step": 11850 + }, + { + "epoch": 3.995786280128097, + "grad_norm": 1.2649916827451366, + "learning_rate": 1.0221559714056462e-06, + "loss": 0.1251, + "step": 11855 + }, + { + "epoch": 3.997471768076858, + "grad_norm": 1.5053061115636008, + "learning_rate": 1.018851344740247e-06, + "loss": 0.1294, + "step": 11860 + }, + { + "epoch": 3.9991572560256197, + "grad_norm": 1.377029121737187, + "learning_rate": 1.015551462535837e-06, + "loss": 0.1253, + "step": 11865 + }, + { + "epoch": 4.000674195179505, + "grad_norm": 1.2686397699680958, + "learning_rate": 1.0122563287249903e-06, + "loss": 0.1323, + "step": 11870 + }, + { + "epoch": 4.002359683128265, + "grad_norm": 1.1816811217186254, + "learning_rate": 1.0089659472346241e-06, + "loss": 0.1159, + "step": 11875 + }, + { + "epoch": 4.004045171077027, + "grad_norm": 1.200704112220642, + "learning_rate": 1.005680321985989e-06, + "loss": 0.0981, + "step": 11880 + }, + { + "epoch": 4.005730659025788, + "grad_norm": 1.2283603724405283, + "learning_rate": 1.0023994568946682e-06, + "loss": 0.1105, + "step": 11885 + }, + { + "epoch": 4.007416146974549, + "grad_norm": 1.2806739817525785, + "learning_rate": 9.991233558705716e-07, + "loss": 0.0968, + "step": 11890 + }, + { + "epoch": 4.009101634923311, + "grad_norm": 1.3201740651821476, + "learning_rate": 9.958520228179364e-07, + "loss": 0.1114, + "step": 11895 + }, + { + "epoch": 4.010787122872071, + "grad_norm": 1.2684535246540378, + "learning_rate": 9.925854616353115e-07, + "loss": 0.11, + "step": 11900 + }, + { + "epoch": 4.012472610820833, + "grad_norm": 1.2094872874491596, + "learning_rate": 9.893236762155611e-07, + "loss": 0.1169, + "step": 11905 + }, + { + "epoch": 4.0141580987695935, + "grad_norm": 1.1351603996712527, + "learning_rate": 9.860666704458578e-07, + "loss": 0.1142, + "step": 11910 + }, + { + "epoch": 4.015843586718355, + "grad_norm": 1.3013391760812418, + "learning_rate": 9.828144482076807e-07, + "loss": 0.1059, + "step": 11915 + }, + { + "epoch": 4.017529074667116, + "grad_norm": 1.211883041428162, + "learning_rate": 9.795670133768047e-07, + "loss": 0.1143, + "step": 11920 + }, + { + "epoch": 4.019214562615877, + "grad_norm": 1.4530157299174902, + "learning_rate": 9.763243698232994e-07, + "loss": 0.1187, + "step": 11925 + }, + { + "epoch": 4.020900050564639, + "grad_norm": 1.3619804571974272, + "learning_rate": 9.730865214115288e-07, + "loss": 0.1057, + "step": 11930 + }, + { + "epoch": 4.0225855385133995, + "grad_norm": 1.517925247056964, + "learning_rate": 9.698534720001362e-07, + "loss": 0.1152, + "step": 11935 + }, + { + "epoch": 4.024271026462161, + "grad_norm": 1.4364199955195414, + "learning_rate": 9.666252254420526e-07, + "loss": 0.1121, + "step": 11940 + }, + { + "epoch": 4.025956514410922, + "grad_norm": 1.5904175933559828, + "learning_rate": 9.634017855844796e-07, + "loss": 0.1206, + "step": 11945 + }, + { + "epoch": 4.027642002359683, + "grad_norm": 1.4386968806132958, + "learning_rate": 9.60183156268892e-07, + "loss": 0.117, + "step": 11950 + }, + { + "epoch": 4.029327490308444, + "grad_norm": 1.0907356500333152, + "learning_rate": 9.569693413310338e-07, + "loss": 0.1214, + "step": 11955 + }, + { + "epoch": 4.0310129782572055, + "grad_norm": 1.366651038847322, + "learning_rate": 9.537603446009098e-07, + "loss": 0.104, + "step": 11960 + }, + { + "epoch": 4.032698466205967, + "grad_norm": 1.4222103737274834, + "learning_rate": 9.505561699027816e-07, + "loss": 0.0969, + "step": 11965 + }, + { + "epoch": 4.034383954154728, + "grad_norm": 1.4179261951176476, + "learning_rate": 9.473568210551681e-07, + "loss": 0.1104, + "step": 11970 + }, + { + "epoch": 4.036069442103489, + "grad_norm": 1.3820881706297394, + "learning_rate": 9.441623018708318e-07, + "loss": 0.1041, + "step": 11975 + }, + { + "epoch": 4.03775493005225, + "grad_norm": 1.3535859735670328, + "learning_rate": 9.409726161567856e-07, + "loss": 0.1168, + "step": 11980 + }, + { + "epoch": 4.0394404180010115, + "grad_norm": 1.216653765426924, + "learning_rate": 9.377877677142777e-07, + "loss": 0.1039, + "step": 11985 + }, + { + "epoch": 4.041125905949772, + "grad_norm": 1.450819171303049, + "learning_rate": 9.346077603387915e-07, + "loss": 0.1088, + "step": 11990 + }, + { + "epoch": 4.042811393898534, + "grad_norm": 1.3264255159063292, + "learning_rate": 9.314325978200451e-07, + "loss": 0.1129, + "step": 11995 + }, + { + "epoch": 4.044496881847294, + "grad_norm": 1.247752590631787, + "learning_rate": 9.282622839419775e-07, + "loss": 0.1127, + "step": 12000 + }, + { + "epoch": 4.046182369796056, + "grad_norm": 1.2718654651906907, + "learning_rate": 9.250968224827544e-07, + "loss": 0.1199, + "step": 12005 + }, + { + "epoch": 4.0478678577448175, + "grad_norm": 1.2784470200033737, + "learning_rate": 9.219362172147567e-07, + "loss": 0.1164, + "step": 12010 + }, + { + "epoch": 4.049553345693578, + "grad_norm": 1.275011129776298, + "learning_rate": 9.187804719045751e-07, + "loss": 0.1103, + "step": 12015 + }, + { + "epoch": 4.05123883364234, + "grad_norm": 1.3483311814842633, + "learning_rate": 9.156295903130141e-07, + "loss": 0.1159, + "step": 12020 + }, + { + "epoch": 4.0529243215911, + "grad_norm": 1.5365317528571418, + "learning_rate": 9.124835761950784e-07, + "loss": 0.1181, + "step": 12025 + }, + { + "epoch": 4.054609809539862, + "grad_norm": 8.089273619285802, + "learning_rate": 9.093424332999723e-07, + "loss": 0.1051, + "step": 12030 + }, + { + "epoch": 4.056295297488623, + "grad_norm": 1.412741400622911, + "learning_rate": 9.06206165371094e-07, + "loss": 0.0904, + "step": 12035 + }, + { + "epoch": 4.057980785437384, + "grad_norm": 1.3451991617246908, + "learning_rate": 9.030747761460351e-07, + "loss": 0.123, + "step": 12040 + }, + { + "epoch": 4.059666273386146, + "grad_norm": 1.3706355009912603, + "learning_rate": 8.99948269356572e-07, + "loss": 0.1072, + "step": 12045 + }, + { + "epoch": 4.061351761334906, + "grad_norm": 1.2813393553915857, + "learning_rate": 8.968266487286609e-07, + "loss": 0.1042, + "step": 12050 + }, + { + "epoch": 4.063037249283668, + "grad_norm": 1.393422851426227, + "learning_rate": 8.937099179824343e-07, + "loss": 0.1081, + "step": 12055 + }, + { + "epoch": 4.064722737232429, + "grad_norm": 1.280841462377404, + "learning_rate": 8.905980808322029e-07, + "loss": 0.1203, + "step": 12060 + }, + { + "epoch": 4.06640822518119, + "grad_norm": 1.8849739487073585, + "learning_rate": 8.874911409864384e-07, + "loss": 0.1233, + "step": 12065 + }, + { + "epoch": 4.068093713129951, + "grad_norm": 1.0980679250876781, + "learning_rate": 8.843891021477813e-07, + "loss": 0.1058, + "step": 12070 + }, + { + "epoch": 4.069779201078712, + "grad_norm": 2.0950751356984587, + "learning_rate": 8.812919680130272e-07, + "loss": 0.1375, + "step": 12075 + }, + { + "epoch": 4.071464689027473, + "grad_norm": 1.2036312089080325, + "learning_rate": 8.781997422731304e-07, + "loss": 0.113, + "step": 12080 + }, + { + "epoch": 4.073150176976235, + "grad_norm": 1.6339619258398572, + "learning_rate": 8.751124286131957e-07, + "loss": 0.1045, + "step": 12085 + }, + { + "epoch": 4.074835664924996, + "grad_norm": 1.4342407357213185, + "learning_rate": 8.720300307124712e-07, + "loss": 0.1166, + "step": 12090 + }, + { + "epoch": 4.076521152873757, + "grad_norm": 1.2787393231188653, + "learning_rate": 8.689525522443471e-07, + "loss": 0.1168, + "step": 12095 + }, + { + "epoch": 4.078206640822518, + "grad_norm": 1.5026794288218963, + "learning_rate": 8.658799968763548e-07, + "loss": 0.1125, + "step": 12100 + }, + { + "epoch": 4.079892128771279, + "grad_norm": 1.4636374697221322, + "learning_rate": 8.628123682701533e-07, + "loss": 0.1222, + "step": 12105 + }, + { + "epoch": 4.081577616720041, + "grad_norm": 1.3854446033934944, + "learning_rate": 8.597496700815344e-07, + "loss": 0.1018, + "step": 12110 + }, + { + "epoch": 4.083263104668801, + "grad_norm": 1.3314914384548686, + "learning_rate": 8.566919059604106e-07, + "loss": 0.1296, + "step": 12115 + }, + { + "epoch": 4.084948592617563, + "grad_norm": 1.2830041524416362, + "learning_rate": 8.536390795508176e-07, + "loss": 0.1069, + "step": 12120 + }, + { + "epoch": 4.086634080566324, + "grad_norm": 1.220959008225962, + "learning_rate": 8.505911944909062e-07, + "loss": 0.1003, + "step": 12125 + }, + { + "epoch": 4.088319568515085, + "grad_norm": 1.1826856191408797, + "learning_rate": 8.475482544129371e-07, + "loss": 0.099, + "step": 12130 + }, + { + "epoch": 4.090005056463847, + "grad_norm": 1.2736778287384793, + "learning_rate": 8.445102629432778e-07, + "loss": 0.0967, + "step": 12135 + }, + { + "epoch": 4.091690544412607, + "grad_norm": 1.4482792394902526, + "learning_rate": 8.414772237023982e-07, + "loss": 0.0951, + "step": 12140 + }, + { + "epoch": 4.093376032361369, + "grad_norm": 3.367883128370919, + "learning_rate": 8.384491403048694e-07, + "loss": 0.1244, + "step": 12145 + }, + { + "epoch": 4.0950615203101295, + "grad_norm": 1.1690680983435247, + "learning_rate": 8.354260163593519e-07, + "loss": 0.1062, + "step": 12150 + }, + { + "epoch": 4.096747008258891, + "grad_norm": 1.2947556372949949, + "learning_rate": 8.32407855468601e-07, + "loss": 0.1059, + "step": 12155 + }, + { + "epoch": 4.098432496207652, + "grad_norm": 1.5373405572019554, + "learning_rate": 8.293946612294523e-07, + "loss": 0.1234, + "step": 12160 + }, + { + "epoch": 4.100117984156413, + "grad_norm": 1.1500599400608853, + "learning_rate": 8.263864372328268e-07, + "loss": 0.114, + "step": 12165 + }, + { + "epoch": 4.101803472105175, + "grad_norm": 1.3361032011907505, + "learning_rate": 8.233831870637188e-07, + "loss": 0.1142, + "step": 12170 + }, + { + "epoch": 4.1034889600539355, + "grad_norm": 1.3763397653453309, + "learning_rate": 8.203849143011977e-07, + "loss": 0.1076, + "step": 12175 + }, + { + "epoch": 4.105174448002697, + "grad_norm": 1.1943015358381728, + "learning_rate": 8.173916225183987e-07, + "loss": 0.108, + "step": 12180 + }, + { + "epoch": 4.106859935951458, + "grad_norm": 1.4212257971050177, + "learning_rate": 8.144033152825243e-07, + "loss": 0.1163, + "step": 12185 + }, + { + "epoch": 4.108545423900219, + "grad_norm": 1.2792428668113918, + "learning_rate": 8.11419996154833e-07, + "loss": 0.1192, + "step": 12190 + }, + { + "epoch": 4.11023091184898, + "grad_norm": 1.414385709276504, + "learning_rate": 8.084416686906426e-07, + "loss": 0.097, + "step": 12195 + }, + { + "epoch": 4.1119163997977415, + "grad_norm": 1.280707167684563, + "learning_rate": 8.054683364393185e-07, + "loss": 0.0976, + "step": 12200 + }, + { + "epoch": 4.113601887746503, + "grad_norm": 1.3413973918750335, + "learning_rate": 8.025000029442776e-07, + "loss": 0.1035, + "step": 12205 + }, + { + "epoch": 4.115287375695264, + "grad_norm": 1.1956122736138755, + "learning_rate": 7.995366717429748e-07, + "loss": 0.1009, + "step": 12210 + }, + { + "epoch": 4.116972863644025, + "grad_norm": 1.351190451029454, + "learning_rate": 7.965783463669063e-07, + "loss": 0.1123, + "step": 12215 + }, + { + "epoch": 4.118658351592786, + "grad_norm": 1.5044448299648419, + "learning_rate": 7.936250303416009e-07, + "loss": 0.1258, + "step": 12220 + }, + { + "epoch": 4.1203438395415475, + "grad_norm": 1.2845839295722064, + "learning_rate": 7.906767271866206e-07, + "loss": 0.1124, + "step": 12225 + }, + { + "epoch": 4.122029327490308, + "grad_norm": 1.225063006901401, + "learning_rate": 7.877334404155518e-07, + "loss": 0.0999, + "step": 12230 + }, + { + "epoch": 4.12371481543907, + "grad_norm": 1.6545267890659516, + "learning_rate": 7.847951735360021e-07, + "loss": 0.12, + "step": 12235 + }, + { + "epoch": 4.12540030338783, + "grad_norm": 1.2484653665854208, + "learning_rate": 7.818619300495978e-07, + "loss": 0.0978, + "step": 12240 + }, + { + "epoch": 4.127085791336592, + "grad_norm": 1.1903596648167274, + "learning_rate": 7.789337134519759e-07, + "loss": 0.1039, + "step": 12245 + }, + { + "epoch": 4.1287712792853535, + "grad_norm": 1.338620620795788, + "learning_rate": 7.760105272327872e-07, + "loss": 0.1177, + "step": 12250 + }, + { + "epoch": 4.130456767234114, + "grad_norm": 1.2803492314820784, + "learning_rate": 7.730923748756852e-07, + "loss": 0.1113, + "step": 12255 + }, + { + "epoch": 4.132142255182876, + "grad_norm": 1.4965664188299521, + "learning_rate": 7.701792598583224e-07, + "loss": 0.1019, + "step": 12260 + }, + { + "epoch": 4.133827743131636, + "grad_norm": 1.2116510236214313, + "learning_rate": 7.672711856523518e-07, + "loss": 0.1174, + "step": 12265 + }, + { + "epoch": 4.135513231080398, + "grad_norm": 1.2323286887036327, + "learning_rate": 7.643681557234189e-07, + "loss": 0.1191, + "step": 12270 + }, + { + "epoch": 4.137198719029159, + "grad_norm": 1.2903856968737564, + "learning_rate": 7.614701735311552e-07, + "loss": 0.0945, + "step": 12275 + }, + { + "epoch": 4.13888420697792, + "grad_norm": 10.433753242491871, + "learning_rate": 7.585772425291776e-07, + "loss": 0.0992, + "step": 12280 + }, + { + "epoch": 4.140569694926682, + "grad_norm": 1.2233233500202267, + "learning_rate": 7.556893661650827e-07, + "loss": 0.1249, + "step": 12285 + }, + { + "epoch": 4.142255182875442, + "grad_norm": 1.2130226964082333, + "learning_rate": 7.528065478804463e-07, + "loss": 0.0972, + "step": 12290 + }, + { + "epoch": 4.143940670824204, + "grad_norm": 1.2063408932815303, + "learning_rate": 7.499287911108132e-07, + "loss": 0.1167, + "step": 12295 + }, + { + "epoch": 4.145626158772965, + "grad_norm": 1.30741382337855, + "learning_rate": 7.470560992856984e-07, + "loss": 0.112, + "step": 12300 + }, + { + "epoch": 4.147311646721726, + "grad_norm": 1.9204573661896698, + "learning_rate": 7.441884758285756e-07, + "loss": 0.1189, + "step": 12305 + }, + { + "epoch": 4.148997134670487, + "grad_norm": 1.6452883785762646, + "learning_rate": 7.413259241568887e-07, + "loss": 0.1306, + "step": 12310 + }, + { + "epoch": 4.150682622619248, + "grad_norm": 1.3835192185313165, + "learning_rate": 7.384684476820281e-07, + "loss": 0.1264, + "step": 12315 + }, + { + "epoch": 4.152368110568009, + "grad_norm": 1.6241808130288877, + "learning_rate": 7.35616049809339e-07, + "loss": 0.114, + "step": 12320 + }, + { + "epoch": 4.154053598516771, + "grad_norm": 1.3217625507298068, + "learning_rate": 7.32768733938114e-07, + "loss": 0.1104, + "step": 12325 + }, + { + "epoch": 4.155739086465532, + "grad_norm": 1.1639592250004458, + "learning_rate": 7.29926503461591e-07, + "loss": 0.0985, + "step": 12330 + }, + { + "epoch": 4.157424574414293, + "grad_norm": 1.3164239348969922, + "learning_rate": 7.27089361766945e-07, + "loss": 0.1089, + "step": 12335 + }, + { + "epoch": 4.159110062363054, + "grad_norm": 1.2830066976673342, + "learning_rate": 7.242573122352875e-07, + "loss": 0.1108, + "step": 12340 + }, + { + "epoch": 4.160795550311815, + "grad_norm": 1.4142314237515623, + "learning_rate": 7.214303582416626e-07, + "loss": 0.1172, + "step": 12345 + }, + { + "epoch": 4.162481038260577, + "grad_norm": 1.3441874811916532, + "learning_rate": 7.1860850315504e-07, + "loss": 0.1071, + "step": 12350 + }, + { + "epoch": 4.164166526209337, + "grad_norm": 1.2360749017856782, + "learning_rate": 7.157917503383149e-07, + "loss": 0.129, + "step": 12355 + }, + { + "epoch": 4.165852014158099, + "grad_norm": 1.5174335899130909, + "learning_rate": 7.129801031483008e-07, + "loss": 0.1179, + "step": 12360 + }, + { + "epoch": 4.16753750210686, + "grad_norm": 1.3359049180292741, + "learning_rate": 7.101735649357244e-07, + "loss": 0.1019, + "step": 12365 + }, + { + "epoch": 4.169222990055621, + "grad_norm": 1.9062074251066823, + "learning_rate": 7.073721390452298e-07, + "loss": 0.1171, + "step": 12370 + }, + { + "epoch": 4.170908478004383, + "grad_norm": 1.3360549920596059, + "learning_rate": 7.045758288153631e-07, + "loss": 0.0992, + "step": 12375 + }, + { + "epoch": 4.172593965953143, + "grad_norm": 1.2666268233549498, + "learning_rate": 7.017846375785742e-07, + "loss": 0.1308, + "step": 12380 + }, + { + "epoch": 4.174279453901905, + "grad_norm": 1.6136590144629126, + "learning_rate": 6.989985686612177e-07, + "loss": 0.1075, + "step": 12385 + }, + { + "epoch": 4.1759649418506655, + "grad_norm": 1.4203178557637688, + "learning_rate": 6.962176253835367e-07, + "loss": 0.1004, + "step": 12390 + }, + { + "epoch": 4.177650429799427, + "grad_norm": 1.5148294344863817, + "learning_rate": 6.934418110596725e-07, + "loss": 0.1307, + "step": 12395 + }, + { + "epoch": 4.179335917748188, + "grad_norm": 1.6901766383164198, + "learning_rate": 6.906711289976492e-07, + "loss": 0.1245, + "step": 12400 + }, + { + "epoch": 4.181021405696949, + "grad_norm": 1.3052650695023462, + "learning_rate": 6.879055824993758e-07, + "loss": 0.1147, + "step": 12405 + }, + { + "epoch": 4.182706893645711, + "grad_norm": 1.4832200914996088, + "learning_rate": 6.851451748606436e-07, + "loss": 0.099, + "step": 12410 + }, + { + "epoch": 4.1843923815944715, + "grad_norm": 1.300074399510205, + "learning_rate": 6.823899093711161e-07, + "loss": 0.1138, + "step": 12415 + }, + { + "epoch": 4.186077869543233, + "grad_norm": 1.3801413673802827, + "learning_rate": 6.79639789314332e-07, + "loss": 0.1055, + "step": 12420 + }, + { + "epoch": 4.187763357491994, + "grad_norm": 1.4405626274680872, + "learning_rate": 6.768948179676959e-07, + "loss": 0.1147, + "step": 12425 + }, + { + "epoch": 4.189448845440755, + "grad_norm": 1.2885362808046412, + "learning_rate": 6.741549986024759e-07, + "loss": 0.102, + "step": 12430 + }, + { + "epoch": 4.191134333389516, + "grad_norm": 1.2631602822200947, + "learning_rate": 6.714203344838033e-07, + "loss": 0.097, + "step": 12435 + }, + { + "epoch": 4.1928198213382775, + "grad_norm": 1.322156569688669, + "learning_rate": 6.686908288706639e-07, + "loss": 0.1068, + "step": 12440 + }, + { + "epoch": 4.194505309287039, + "grad_norm": 1.3443668712005117, + "learning_rate": 6.659664850158948e-07, + "loss": 0.1108, + "step": 12445 + }, + { + "epoch": 4.1961907972358, + "grad_norm": 1.4522694124226216, + "learning_rate": 6.632473061661831e-07, + "loss": 0.1061, + "step": 12450 + }, + { + "epoch": 4.197876285184561, + "grad_norm": 1.8908127442697675, + "learning_rate": 6.605332955620603e-07, + "loss": 0.1261, + "step": 12455 + }, + { + "epoch": 4.199561773133322, + "grad_norm": 1.7695866843146077, + "learning_rate": 6.578244564379005e-07, + "loss": 0.1212, + "step": 12460 + }, + { + "epoch": 4.2012472610820835, + "grad_norm": 1.5133609075771488, + "learning_rate": 6.551207920219121e-07, + "loss": 0.1162, + "step": 12465 + }, + { + "epoch": 4.202932749030844, + "grad_norm": 1.3015009625368679, + "learning_rate": 6.524223055361362e-07, + "loss": 0.1092, + "step": 12470 + }, + { + "epoch": 4.204618236979606, + "grad_norm": 1.4631592452147437, + "learning_rate": 6.497290001964468e-07, + "loss": 0.1347, + "step": 12475 + }, + { + "epoch": 4.206303724928366, + "grad_norm": 1.3343524272579732, + "learning_rate": 6.470408792125404e-07, + "loss": 0.1179, + "step": 12480 + }, + { + "epoch": 4.207989212877128, + "grad_norm": 1.3770180880579295, + "learning_rate": 6.443579457879362e-07, + "loss": 0.1039, + "step": 12485 + }, + { + "epoch": 4.2096747008258895, + "grad_norm": 1.279648638951182, + "learning_rate": 6.416802031199693e-07, + "loss": 0.1087, + "step": 12490 + }, + { + "epoch": 4.21136018877465, + "grad_norm": 1.1611456187276283, + "learning_rate": 6.39007654399792e-07, + "loss": 0.1053, + "step": 12495 + }, + { + "epoch": 4.213045676723412, + "grad_norm": 1.3885249594275164, + "learning_rate": 6.36340302812366e-07, + "loss": 0.1244, + "step": 12500 + }, + { + "epoch": 4.214731164672172, + "grad_norm": 1.4764836670737602, + "learning_rate": 6.336781515364576e-07, + "loss": 0.1185, + "step": 12505 + }, + { + "epoch": 4.216416652620934, + "grad_norm": 1.2273309966155903, + "learning_rate": 6.310212037446361e-07, + "loss": 0.1075, + "step": 12510 + }, + { + "epoch": 4.218102140569695, + "grad_norm": 1.1630542076146055, + "learning_rate": 6.283694626032727e-07, + "loss": 0.0988, + "step": 12515 + }, + { + "epoch": 4.219787628518456, + "grad_norm": 1.5308452378094253, + "learning_rate": 6.257229312725293e-07, + "loss": 0.109, + "step": 12520 + }, + { + "epoch": 4.221473116467218, + "grad_norm": 1.3807175156048481, + "learning_rate": 6.230816129063621e-07, + "loss": 0.1045, + "step": 12525 + }, + { + "epoch": 4.223158604415978, + "grad_norm": 1.2463396384249752, + "learning_rate": 6.204455106525126e-07, + "loss": 0.0938, + "step": 12530 + }, + { + "epoch": 4.22484409236474, + "grad_norm": 1.4154264595396142, + "learning_rate": 6.178146276525082e-07, + "loss": 0.125, + "step": 12535 + }, + { + "epoch": 4.226529580313501, + "grad_norm": 1.2074553419962946, + "learning_rate": 6.151889670416566e-07, + "loss": 0.1112, + "step": 12540 + }, + { + "epoch": 4.228215068262262, + "grad_norm": 1.3745228772374778, + "learning_rate": 6.125685319490399e-07, + "loss": 0.1076, + "step": 12545 + }, + { + "epoch": 4.229900556211023, + "grad_norm": 1.3442914729059194, + "learning_rate": 6.099533254975131e-07, + "loss": 0.1038, + "step": 12550 + }, + { + "epoch": 4.231586044159784, + "grad_norm": 1.4682437757162892, + "learning_rate": 6.073433508037002e-07, + "loss": 0.1189, + "step": 12555 + }, + { + "epoch": 4.233271532108545, + "grad_norm": 1.3775983777635847, + "learning_rate": 6.047386109779929e-07, + "loss": 0.122, + "step": 12560 + }, + { + "epoch": 4.234957020057307, + "grad_norm": 1.4011722199263275, + "learning_rate": 6.021391091245394e-07, + "loss": 0.1079, + "step": 12565 + }, + { + "epoch": 4.236642508006068, + "grad_norm": 1.091308302361115, + "learning_rate": 5.995448483412514e-07, + "loss": 0.0984, + "step": 12570 + }, + { + "epoch": 4.238327995954829, + "grad_norm": 1.404128023644069, + "learning_rate": 5.969558317197882e-07, + "loss": 0.1146, + "step": 12575 + }, + { + "epoch": 4.24001348390359, + "grad_norm": 1.1485118539171184, + "learning_rate": 5.943720623455667e-07, + "loss": 0.1046, + "step": 12580 + }, + { + "epoch": 4.241698971852351, + "grad_norm": 1.3958542990358158, + "learning_rate": 5.917935432977445e-07, + "loss": 0.0974, + "step": 12585 + }, + { + "epoch": 4.243384459801113, + "grad_norm": 1.42446604239913, + "learning_rate": 5.892202776492245e-07, + "loss": 0.1127, + "step": 12590 + }, + { + "epoch": 4.245069947749873, + "grad_norm": 1.3733725009041717, + "learning_rate": 5.866522684666487e-07, + "loss": 0.097, + "step": 12595 + }, + { + "epoch": 4.246755435698635, + "grad_norm": 1.3037524065490758, + "learning_rate": 5.840895188103963e-07, + "loss": 0.103, + "step": 12600 + }, + { + "epoch": 4.248440923647396, + "grad_norm": 1.1394240155820112, + "learning_rate": 5.815320317345758e-07, + "loss": 0.1179, + "step": 12605 + }, + { + "epoch": 4.250126411596157, + "grad_norm": 1.3577679899286457, + "learning_rate": 5.789798102870264e-07, + "loss": 0.112, + "step": 12610 + }, + { + "epoch": 4.251811899544919, + "grad_norm": 1.3351097891936905, + "learning_rate": 5.764328575093109e-07, + "loss": 0.1085, + "step": 12615 + }, + { + "epoch": 4.253497387493679, + "grad_norm": 1.3149442488122238, + "learning_rate": 5.738911764367144e-07, + "loss": 0.0944, + "step": 12620 + }, + { + "epoch": 4.255182875442441, + "grad_norm": 1.3423258175257915, + "learning_rate": 5.713547700982385e-07, + "loss": 0.0846, + "step": 12625 + }, + { + "epoch": 4.2568683633912014, + "grad_norm": 1.2079081445483166, + "learning_rate": 5.688236415165988e-07, + "loss": 0.1101, + "step": 12630 + }, + { + "epoch": 4.258553851339963, + "grad_norm": 1.3488689182947924, + "learning_rate": 5.662977937082204e-07, + "loss": 0.1016, + "step": 12635 + }, + { + "epoch": 4.260239339288724, + "grad_norm": 1.6504673777600523, + "learning_rate": 5.637772296832367e-07, + "loss": 0.1271, + "step": 12640 + }, + { + "epoch": 4.261924827237485, + "grad_norm": 1.3793465158474956, + "learning_rate": 5.612619524454854e-07, + "loss": 0.1313, + "step": 12645 + }, + { + "epoch": 4.263610315186247, + "grad_norm": 1.185490237774572, + "learning_rate": 5.587519649925005e-07, + "loss": 0.0898, + "step": 12650 + }, + { + "epoch": 4.2652958031350074, + "grad_norm": 1.5426151385369398, + "learning_rate": 5.562472703155142e-07, + "loss": 0.1153, + "step": 12655 + }, + { + "epoch": 4.266981291083769, + "grad_norm": 1.2605141841919176, + "learning_rate": 5.537478713994493e-07, + "loss": 0.1182, + "step": 12660 + }, + { + "epoch": 4.26866677903253, + "grad_norm": 1.5645076718578845, + "learning_rate": 5.512537712229199e-07, + "loss": 0.1321, + "step": 12665 + }, + { + "epoch": 4.270352266981291, + "grad_norm": 1.3706046002481285, + "learning_rate": 5.487649727582245e-07, + "loss": 0.1149, + "step": 12670 + }, + { + "epoch": 4.272037754930052, + "grad_norm": 1.3480901250235686, + "learning_rate": 5.462814789713411e-07, + "loss": 0.0937, + "step": 12675 + }, + { + "epoch": 4.2737232428788134, + "grad_norm": 4.894753355003127, + "learning_rate": 5.438032928219289e-07, + "loss": 0.1032, + "step": 12680 + }, + { + "epoch": 4.275408730827575, + "grad_norm": 1.364313335978621, + "learning_rate": 5.413304172633227e-07, + "loss": 0.1244, + "step": 12685 + }, + { + "epoch": 4.277094218776336, + "grad_norm": 1.3873558144374436, + "learning_rate": 5.388628552425251e-07, + "loss": 0.1132, + "step": 12690 + }, + { + "epoch": 4.278779706725097, + "grad_norm": 1.465437984120428, + "learning_rate": 5.364006097002078e-07, + "loss": 0.1133, + "step": 12695 + }, + { + "epoch": 4.280465194673858, + "grad_norm": 1.3070673802862949, + "learning_rate": 5.339436835707063e-07, + "loss": 0.1072, + "step": 12700 + }, + { + "epoch": 4.282150682622619, + "grad_norm": 1.5208917895461296, + "learning_rate": 5.314920797820189e-07, + "loss": 0.101, + "step": 12705 + }, + { + "epoch": 4.28383617057138, + "grad_norm": 1.341952462702398, + "learning_rate": 5.290458012557986e-07, + "loss": 0.1182, + "step": 12710 + }, + { + "epoch": 4.285521658520142, + "grad_norm": 1.2180380643292374, + "learning_rate": 5.266048509073518e-07, + "loss": 0.1021, + "step": 12715 + }, + { + "epoch": 4.287207146468903, + "grad_norm": 1.1508451676006477, + "learning_rate": 5.241692316456381e-07, + "loss": 0.0968, + "step": 12720 + }, + { + "epoch": 4.288892634417664, + "grad_norm": 1.124393382402687, + "learning_rate": 5.217389463732625e-07, + "loss": 0.0937, + "step": 12725 + }, + { + "epoch": 4.290578122366425, + "grad_norm": 1.3368529587946882, + "learning_rate": 5.193139979864726e-07, + "loss": 0.1104, + "step": 12730 + }, + { + "epoch": 4.292263610315186, + "grad_norm": 1.3097972966656428, + "learning_rate": 5.168943893751549e-07, + "loss": 0.1288, + "step": 12735 + }, + { + "epoch": 4.293949098263948, + "grad_norm": 1.2753472082896762, + "learning_rate": 5.144801234228342e-07, + "loss": 0.1071, + "step": 12740 + }, + { + "epoch": 4.295634586212708, + "grad_norm": 1.3107957633148926, + "learning_rate": 5.120712030066688e-07, + "loss": 0.0907, + "step": 12745 + }, + { + "epoch": 4.29732007416147, + "grad_norm": 1.2875216625415367, + "learning_rate": 5.096676309974447e-07, + "loss": 0.1031, + "step": 12750 + }, + { + "epoch": 4.2990055621102305, + "grad_norm": 1.4514402890633926, + "learning_rate": 5.072694102595743e-07, + "loss": 0.1259, + "step": 12755 + }, + { + "epoch": 4.300691050058992, + "grad_norm": 1.3237308164459693, + "learning_rate": 5.048765436510933e-07, + "loss": 0.108, + "step": 12760 + }, + { + "epoch": 4.302376538007754, + "grad_norm": 1.300150145012879, + "learning_rate": 5.024890340236583e-07, + "loss": 0.0981, + "step": 12765 + }, + { + "epoch": 4.304062025956514, + "grad_norm": 1.3855062505625857, + "learning_rate": 5.001068842225387e-07, + "loss": 0.1075, + "step": 12770 + }, + { + "epoch": 4.305747513905276, + "grad_norm": 1.499093249303719, + "learning_rate": 4.977300970866184e-07, + "loss": 0.105, + "step": 12775 + }, + { + "epoch": 4.3074330018540365, + "grad_norm": 1.7111121992372054, + "learning_rate": 4.953586754483891e-07, + "loss": 0.0932, + "step": 12780 + }, + { + "epoch": 4.309118489802798, + "grad_norm": 1.8555027883170325, + "learning_rate": 4.929926221339504e-07, + "loss": 0.1282, + "step": 12785 + }, + { + "epoch": 4.310803977751559, + "grad_norm": 1.4812802235015243, + "learning_rate": 4.906319399630011e-07, + "loss": 0.0971, + "step": 12790 + }, + { + "epoch": 4.31248946570032, + "grad_norm": 1.4355396413152655, + "learning_rate": 4.882766317488435e-07, + "loss": 0.1146, + "step": 12795 + }, + { + "epoch": 4.314174953649081, + "grad_norm": 1.444068572952506, + "learning_rate": 4.859267002983714e-07, + "loss": 0.1251, + "step": 12800 + }, + { + "epoch": 4.3158604415978425, + "grad_norm": 1.278171588280947, + "learning_rate": 4.835821484120723e-07, + "loss": 0.1161, + "step": 12805 + }, + { + "epoch": 4.317545929546604, + "grad_norm": 1.4015777366435573, + "learning_rate": 4.812429788840245e-07, + "loss": 0.1037, + "step": 12810 + }, + { + "epoch": 4.319231417495365, + "grad_norm": 1.267401729605222, + "learning_rate": 4.789091945018892e-07, + "loss": 0.105, + "step": 12815 + }, + { + "epoch": 4.320916905444126, + "grad_norm": 1.0944872659705882, + "learning_rate": 4.765807980469106e-07, + "loss": 0.1151, + "step": 12820 + }, + { + "epoch": 4.322602393392887, + "grad_norm": 1.3974443654687387, + "learning_rate": 4.74257792293914e-07, + "loss": 0.1065, + "step": 12825 + }, + { + "epoch": 4.3242878813416485, + "grad_norm": 1.3294429319356331, + "learning_rate": 4.719401800112977e-07, + "loss": 0.1139, + "step": 12830 + }, + { + "epoch": 4.325973369290409, + "grad_norm": 1.2919635572246846, + "learning_rate": 4.6962796396103514e-07, + "loss": 0.1005, + "step": 12835 + }, + { + "epoch": 4.327658857239171, + "grad_norm": 1.387467405554503, + "learning_rate": 4.6732114689866713e-07, + "loss": 0.1079, + "step": 12840 + }, + { + "epoch": 4.329344345187932, + "grad_norm": 1.2814156861095354, + "learning_rate": 4.6501973157329847e-07, + "loss": 0.1046, + "step": 12845 + }, + { + "epoch": 4.331029833136693, + "grad_norm": 1.632353129530135, + "learning_rate": 4.62723720727602e-07, + "loss": 0.1008, + "step": 12850 + }, + { + "epoch": 4.3327153210854545, + "grad_norm": 1.258261859989475, + "learning_rate": 4.604331170978049e-07, + "loss": 0.1004, + "step": 12855 + }, + { + "epoch": 4.334400809034215, + "grad_norm": 1.7920244077170024, + "learning_rate": 4.581479234136915e-07, + "loss": 0.0945, + "step": 12860 + }, + { + "epoch": 4.336086296982977, + "grad_norm": 1.360200317683404, + "learning_rate": 4.558681423985989e-07, + "loss": 0.104, + "step": 12865 + }, + { + "epoch": 4.337771784931737, + "grad_norm": 1.3475770543917587, + "learning_rate": 4.5359377676941764e-07, + "loss": 0.0963, + "step": 12870 + }, + { + "epoch": 4.339457272880499, + "grad_norm": 1.255107027167233, + "learning_rate": 4.5132482923657904e-07, + "loss": 0.11, + "step": 12875 + }, + { + "epoch": 4.3411427608292605, + "grad_norm": 1.3398219651184247, + "learning_rate": 4.4906130250406024e-07, + "loss": 0.1127, + "step": 12880 + }, + { + "epoch": 4.342828248778021, + "grad_norm": 1.282294836209136, + "learning_rate": 4.4680319926937666e-07, + "loss": 0.1019, + "step": 12885 + }, + { + "epoch": 4.344513736726783, + "grad_norm": 1.1284975586864117, + "learning_rate": 4.4455052222358354e-07, + "loss": 0.0932, + "step": 12890 + }, + { + "epoch": 4.346199224675543, + "grad_norm": 1.5180478849747012, + "learning_rate": 4.4230327405126614e-07, + "loss": 0.1168, + "step": 12895 + }, + { + "epoch": 4.347884712624305, + "grad_norm": 1.269082280456362, + "learning_rate": 4.4006145743054116e-07, + "loss": 0.0948, + "step": 12900 + }, + { + "epoch": 4.349570200573066, + "grad_norm": 1.2232591570968698, + "learning_rate": 4.3782507503305263e-07, + "loss": 0.1044, + "step": 12905 + }, + { + "epoch": 4.351255688521827, + "grad_norm": 1.418462747935375, + "learning_rate": 4.3559412952396796e-07, + "loss": 0.1138, + "step": 12910 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 1.7745767467659825, + "learning_rate": 4.333686235619772e-07, + "loss": 0.1065, + "step": 12915 + }, + { + "epoch": 4.354626664419349, + "grad_norm": 1.2900376660394284, + "learning_rate": 4.311485597992854e-07, + "loss": 0.0998, + "step": 12920 + }, + { + "epoch": 4.356312152368111, + "grad_norm": 1.3765273059531757, + "learning_rate": 4.2893394088161176e-07, + "loss": 0.1156, + "step": 12925 + }, + { + "epoch": 4.357997640316872, + "grad_norm": 1.2580320516333507, + "learning_rate": 4.2672476944818963e-07, + "loss": 0.0989, + "step": 12930 + }, + { + "epoch": 4.359683128265633, + "grad_norm": 1.2994496505304025, + "learning_rate": 4.2452104813175797e-07, + "loss": 0.1011, + "step": 12935 + }, + { + "epoch": 4.361368616214394, + "grad_norm": 1.3774522003838567, + "learning_rate": 4.2232277955855995e-07, + "loss": 0.1031, + "step": 12940 + }, + { + "epoch": 4.363054104163155, + "grad_norm": 1.414295121192504, + "learning_rate": 4.2012996634834434e-07, + "loss": 0.1055, + "step": 12945 + }, + { + "epoch": 4.364739592111916, + "grad_norm": 1.2195219009223808, + "learning_rate": 4.179426111143536e-07, + "loss": 0.0951, + "step": 12950 + }, + { + "epoch": 4.366425080060678, + "grad_norm": 1.2565916088421782, + "learning_rate": 4.1576071646333037e-07, + "loss": 0.1023, + "step": 12955 + }, + { + "epoch": 4.368110568009438, + "grad_norm": 1.802797545212119, + "learning_rate": 4.1358428499550687e-07, + "loss": 0.119, + "step": 12960 + }, + { + "epoch": 4.3697960559582, + "grad_norm": 1.3805722332018802, + "learning_rate": 4.1141331930460546e-07, + "loss": 0.1107, + "step": 12965 + }, + { + "epoch": 4.371481543906961, + "grad_norm": 1.3657834551175823, + "learning_rate": 4.0924782197783344e-07, + "loss": 0.1038, + "step": 12970 + }, + { + "epoch": 4.373167031855722, + "grad_norm": 1.3726640016764247, + "learning_rate": 4.0708779559588463e-07, + "loss": 0.0976, + "step": 12975 + }, + { + "epoch": 4.374852519804484, + "grad_norm": 1.4261444871092577, + "learning_rate": 4.049332427329294e-07, + "loss": 0.1156, + "step": 12980 + }, + { + "epoch": 4.376538007753244, + "grad_norm": 1.6534352594521706, + "learning_rate": 4.0278416595661787e-07, + "loss": 0.1035, + "step": 12985 + }, + { + "epoch": 4.378223495702006, + "grad_norm": 1.2911067985280702, + "learning_rate": 4.006405678280717e-07, + "loss": 0.1066, + "step": 12990 + }, + { + "epoch": 4.3799089836507665, + "grad_norm": 1.3058495586023977, + "learning_rate": 3.9850245090188587e-07, + "loss": 0.0981, + "step": 12995 + }, + { + "epoch": 4.381594471599528, + "grad_norm": 1.7470967650220217, + "learning_rate": 3.963698177261216e-07, + "loss": 0.1203, + "step": 13000 + }, + { + "epoch": 4.38327995954829, + "grad_norm": 1.5335847116198777, + "learning_rate": 3.9424267084230583e-07, + "loss": 0.1147, + "step": 13005 + }, + { + "epoch": 4.38496544749705, + "grad_norm": 1.4703422570854467, + "learning_rate": 3.9212101278542524e-07, + "loss": 0.112, + "step": 13010 + }, + { + "epoch": 4.386650935445812, + "grad_norm": 1.2063754002001417, + "learning_rate": 3.9000484608392786e-07, + "loss": 0.1026, + "step": 13015 + }, + { + "epoch": 4.3883364233945725, + "grad_norm": 1.3935790269241226, + "learning_rate": 3.87894173259718e-07, + "loss": 0.0885, + "step": 13020 + }, + { + "epoch": 4.390021911343334, + "grad_norm": 1.3553381164698273, + "learning_rate": 3.857889968281503e-07, + "loss": 0.1066, + "step": 13025 + }, + { + "epoch": 4.391707399292095, + "grad_norm": 1.2898295004031393, + "learning_rate": 3.83689319298029e-07, + "loss": 0.1043, + "step": 13030 + }, + { + "epoch": 4.393392887240856, + "grad_norm": 1.6708450574654765, + "learning_rate": 3.8159514317160807e-07, + "loss": 0.1101, + "step": 13035 + }, + { + "epoch": 4.395078375189618, + "grad_norm": 1.5281800814265354, + "learning_rate": 3.795064709445834e-07, + "loss": 0.1007, + "step": 13040 + }, + { + "epoch": 4.3967638631383785, + "grad_norm": 1.3074495938444166, + "learning_rate": 3.774233051060916e-07, + "loss": 0.1101, + "step": 13045 + }, + { + "epoch": 4.39844935108714, + "grad_norm": 1.3659165183320154, + "learning_rate": 3.753456481387058e-07, + "loss": 0.1115, + "step": 13050 + }, + { + "epoch": 4.400134839035901, + "grad_norm": 1.3137223877241775, + "learning_rate": 3.7327350251843695e-07, + "loss": 0.1038, + "step": 13055 + }, + { + "epoch": 4.401820326984662, + "grad_norm": 1.2527662849244792, + "learning_rate": 3.712068707147282e-07, + "loss": 0.1237, + "step": 13060 + }, + { + "epoch": 4.403505814933423, + "grad_norm": 1.2192091494581734, + "learning_rate": 3.6914575519044816e-07, + "loss": 0.0897, + "step": 13065 + }, + { + "epoch": 4.4051913028821845, + "grad_norm": 1.3891589894514313, + "learning_rate": 3.6709015840189435e-07, + "loss": 0.0926, + "step": 13070 + }, + { + "epoch": 4.406876790830945, + "grad_norm": 2.168010607688832, + "learning_rate": 3.6504008279878486e-07, + "loss": 0.0947, + "step": 13075 + }, + { + "epoch": 4.408562278779707, + "grad_norm": 1.717699719899326, + "learning_rate": 3.6299553082426274e-07, + "loss": 0.1155, + "step": 13080 + }, + { + "epoch": 4.410247766728468, + "grad_norm": 1.3765641211070212, + "learning_rate": 3.609565049148833e-07, + "loss": 0.1049, + "step": 13085 + }, + { + "epoch": 4.411933254677229, + "grad_norm": 1.4325235530009308, + "learning_rate": 3.589230075006178e-07, + "loss": 0.099, + "step": 13090 + }, + { + "epoch": 4.4136187426259905, + "grad_norm": 1.6787616160478467, + "learning_rate": 3.568950410048505e-07, + "loss": 0.0961, + "step": 13095 + }, + { + "epoch": 4.415304230574751, + "grad_norm": 1.2558170114537444, + "learning_rate": 3.5487260784437483e-07, + "loss": 0.0991, + "step": 13100 + }, + { + "epoch": 4.416989718523513, + "grad_norm": 1.4730828629326196, + "learning_rate": 3.5285571042938615e-07, + "loss": 0.1233, + "step": 13105 + }, + { + "epoch": 4.418675206472273, + "grad_norm": 1.406970977909432, + "learning_rate": 3.5084435116348624e-07, + "loss": 0.1154, + "step": 13110 + }, + { + "epoch": 4.420360694421035, + "grad_norm": 1.5036343587118584, + "learning_rate": 3.488385324436744e-07, + "loss": 0.1089, + "step": 13115 + }, + { + "epoch": 4.422046182369796, + "grad_norm": 1.493600176527841, + "learning_rate": 3.468382566603501e-07, + "loss": 0.104, + "step": 13120 + }, + { + "epoch": 4.423731670318557, + "grad_norm": 1.2744043722365532, + "learning_rate": 3.4484352619730435e-07, + "loss": 0.0984, + "step": 13125 + }, + { + "epoch": 4.425417158267319, + "grad_norm": 1.30950686330533, + "learning_rate": 3.4285434343172e-07, + "loss": 0.0934, + "step": 13130 + }, + { + "epoch": 4.427102646216079, + "grad_norm": 5.954466591873907, + "learning_rate": 3.4087071073416966e-07, + "loss": 0.1017, + "step": 13135 + }, + { + "epoch": 4.428788134164841, + "grad_norm": 1.111782857915883, + "learning_rate": 3.38892630468613e-07, + "loss": 0.11, + "step": 13140 + }, + { + "epoch": 4.430473622113602, + "grad_norm": 1.1780736629033581, + "learning_rate": 3.369201049923887e-07, + "loss": 0.1032, + "step": 13145 + }, + { + "epoch": 4.432159110062363, + "grad_norm": 1.4376665805994704, + "learning_rate": 3.349531366562192e-07, + "loss": 0.1003, + "step": 13150 + }, + { + "epoch": 4.433844598011124, + "grad_norm": 1.0773282584288, + "learning_rate": 3.3299172780420165e-07, + "loss": 0.0975, + "step": 13155 + }, + { + "epoch": 4.435530085959885, + "grad_norm": 1.5123653100906522, + "learning_rate": 3.3103588077381067e-07, + "loss": 0.119, + "step": 13160 + }, + { + "epoch": 4.437215573908647, + "grad_norm": 1.292414589958653, + "learning_rate": 3.2908559789588955e-07, + "loss": 0.0989, + "step": 13165 + }, + { + "epoch": 4.438901061857408, + "grad_norm": 1.1816991727295574, + "learning_rate": 3.271408814946536e-07, + "loss": 0.1126, + "step": 13170 + }, + { + "epoch": 4.440586549806169, + "grad_norm": 1.351515000998513, + "learning_rate": 3.252017338876817e-07, + "loss": 0.1084, + "step": 13175 + }, + { + "epoch": 4.44227203775493, + "grad_norm": 1.2347811208826616, + "learning_rate": 3.232681573859192e-07, + "loss": 0.0946, + "step": 13180 + }, + { + "epoch": 4.443957525703691, + "grad_norm": 1.3385457566974366, + "learning_rate": 3.2134015429366894e-07, + "loss": 0.1054, + "step": 13185 + }, + { + "epoch": 4.445643013652452, + "grad_norm": 1.2580278966688596, + "learning_rate": 3.194177269085946e-07, + "loss": 0.1207, + "step": 13190 + }, + { + "epoch": 4.447328501601214, + "grad_norm": 1.3842959172953215, + "learning_rate": 3.1750087752171145e-07, + "loss": 0.1064, + "step": 13195 + }, + { + "epoch": 4.449013989549975, + "grad_norm": 6.862406860552724, + "learning_rate": 3.1558960841739263e-07, + "loss": 0.1013, + "step": 13200 + }, + { + "epoch": 4.450699477498736, + "grad_norm": 1.1997106922225558, + "learning_rate": 3.1368392187335563e-07, + "loss": 0.1035, + "step": 13205 + }, + { + "epoch": 4.452384965447497, + "grad_norm": 1.3307420909333114, + "learning_rate": 3.1178382016066933e-07, + "loss": 0.1107, + "step": 13210 + }, + { + "epoch": 4.454070453396258, + "grad_norm": 1.457990226696042, + "learning_rate": 3.0988930554374406e-07, + "loss": 0.1176, + "step": 13215 + }, + { + "epoch": 4.45575594134502, + "grad_norm": 1.2801374667672991, + "learning_rate": 3.0800038028033276e-07, + "loss": 0.0986, + "step": 13220 + }, + { + "epoch": 4.45744142929378, + "grad_norm": 1.3629737041739027, + "learning_rate": 3.061170466215285e-07, + "loss": 0.1071, + "step": 13225 + }, + { + "epoch": 4.459126917242542, + "grad_norm": 1.44409171217283, + "learning_rate": 3.0423930681175937e-07, + "loss": 0.1056, + "step": 13230 + }, + { + "epoch": 4.4608124051913025, + "grad_norm": 1.6979002115681163, + "learning_rate": 3.023671630887859e-07, + "loss": 0.1153, + "step": 13235 + }, + { + "epoch": 4.462497893140064, + "grad_norm": 1.2247683478658518, + "learning_rate": 3.0050061768370275e-07, + "loss": 0.1033, + "step": 13240 + }, + { + "epoch": 4.464183381088826, + "grad_norm": 1.2180326978307408, + "learning_rate": 2.986396728209312e-07, + "loss": 0.1036, + "step": 13245 + }, + { + "epoch": 4.465868869037586, + "grad_norm": 1.3691193804229946, + "learning_rate": 2.967843307182183e-07, + "loss": 0.1066, + "step": 13250 + }, + { + "epoch": 4.467554356986348, + "grad_norm": 1.2661150448335052, + "learning_rate": 2.9493459358663325e-07, + "loss": 0.0965, + "step": 13255 + }, + { + "epoch": 4.4692398449351085, + "grad_norm": 1.3806974581332552, + "learning_rate": 2.930904636305659e-07, + "loss": 0.116, + "step": 13260 + }, + { + "epoch": 4.47092533288387, + "grad_norm": 1.493484499947453, + "learning_rate": 2.912519430477256e-07, + "loss": 0.0936, + "step": 13265 + }, + { + "epoch": 4.472610820832631, + "grad_norm": 1.3419874632039008, + "learning_rate": 2.894190340291353e-07, + "loss": 0.1209, + "step": 13270 + }, + { + "epoch": 4.474296308781392, + "grad_norm": 1.207992872749056, + "learning_rate": 2.8759173875913036e-07, + "loss": 0.105, + "step": 13275 + }, + { + "epoch": 4.475981796730153, + "grad_norm": 1.5128141751931952, + "learning_rate": 2.8577005941535563e-07, + "loss": 0.1077, + "step": 13280 + }, + { + "epoch": 4.4776672846789145, + "grad_norm": 1.3212953983652505, + "learning_rate": 2.839539981687661e-07, + "loss": 0.1244, + "step": 13285 + }, + { + "epoch": 4.479352772627676, + "grad_norm": 1.334640731661006, + "learning_rate": 2.821435571836184e-07, + "loss": 0.1055, + "step": 13290 + }, + { + "epoch": 4.481038260576437, + "grad_norm": 1.5011148936411765, + "learning_rate": 2.8033873861747273e-07, + "loss": 0.1034, + "step": 13295 + }, + { + "epoch": 4.482723748525198, + "grad_norm": 1.285105290546476, + "learning_rate": 2.78539544621188e-07, + "loss": 0.0941, + "step": 13300 + }, + { + "epoch": 4.484409236473959, + "grad_norm": 1.0910101062939028, + "learning_rate": 2.767459773389214e-07, + "loss": 0.0961, + "step": 13305 + }, + { + "epoch": 4.4860947244227205, + "grad_norm": 1.4428093730356306, + "learning_rate": 2.74958038908123e-07, + "loss": 0.0899, + "step": 13310 + }, + { + "epoch": 4.487780212371481, + "grad_norm": 1.3092331473422458, + "learning_rate": 2.731757314595362e-07, + "loss": 0.1001, + "step": 13315 + }, + { + "epoch": 4.489465700320243, + "grad_norm": 1.4362122682242904, + "learning_rate": 2.713990571171937e-07, + "loss": 0.1033, + "step": 13320 + }, + { + "epoch": 4.491151188269004, + "grad_norm": 1.454945176553375, + "learning_rate": 2.696280179984134e-07, + "loss": 0.1263, + "step": 13325 + }, + { + "epoch": 4.492836676217765, + "grad_norm": 1.614289102941099, + "learning_rate": 2.678626162138004e-07, + "loss": 0.1093, + "step": 13330 + }, + { + "epoch": 4.4945221641665265, + "grad_norm": 1.2443091025966628, + "learning_rate": 2.6610285386723887e-07, + "loss": 0.1019, + "step": 13335 + }, + { + "epoch": 4.496207652115287, + "grad_norm": 1.408390653946815, + "learning_rate": 2.64348733055893e-07, + "loss": 0.0921, + "step": 13340 + }, + { + "epoch": 4.497893140064049, + "grad_norm": 1.8205675196145494, + "learning_rate": 2.626002558702051e-07, + "loss": 0.0964, + "step": 13345 + }, + { + "epoch": 4.499578628012809, + "grad_norm": 1.2023903854946982, + "learning_rate": 2.608574243938905e-07, + "loss": 0.0948, + "step": 13350 + }, + { + "epoch": 4.501264115961571, + "grad_norm": 1.3600282917815953, + "learning_rate": 2.591202407039356e-07, + "loss": 0.0917, + "step": 13355 + }, + { + "epoch": 4.5029496039103325, + "grad_norm": 1.265455009740431, + "learning_rate": 2.573887068705994e-07, + "loss": 0.1029, + "step": 13360 + }, + { + "epoch": 4.504635091859093, + "grad_norm": 1.2760492932336738, + "learning_rate": 2.556628249574034e-07, + "loss": 0.0996, + "step": 13365 + }, + { + "epoch": 4.506320579807855, + "grad_norm": 1.335186382041331, + "learning_rate": 2.5394259702113787e-07, + "loss": 0.1269, + "step": 13370 + }, + { + "epoch": 4.508006067756615, + "grad_norm": 1.3311672235205472, + "learning_rate": 2.522280251118514e-07, + "loss": 0.1054, + "step": 13375 + }, + { + "epoch": 4.509691555705377, + "grad_norm": 1.4444715505739214, + "learning_rate": 2.5051911127285446e-07, + "loss": 0.0932, + "step": 13380 + }, + { + "epoch": 4.511377043654138, + "grad_norm": 1.467697929063845, + "learning_rate": 2.4881585754071236e-07, + "loss": 0.1005, + "step": 13385 + }, + { + "epoch": 4.513062531602899, + "grad_norm": 1.474115930327594, + "learning_rate": 2.471182659452481e-07, + "loss": 0.1174, + "step": 13390 + }, + { + "epoch": 4.51474801955166, + "grad_norm": 1.4688121836243981, + "learning_rate": 2.454263385095357e-07, + "loss": 0.1044, + "step": 13395 + }, + { + "epoch": 4.516433507500421, + "grad_norm": 1.295857311837337, + "learning_rate": 2.437400772498977e-07, + "loss": 0.0964, + "step": 13400 + }, + { + "epoch": 4.518118995449182, + "grad_norm": 1.4508744717258912, + "learning_rate": 2.42059484175905e-07, + "loss": 0.1097, + "step": 13405 + }, + { + "epoch": 4.519804483397944, + "grad_norm": 1.3250289810648217, + "learning_rate": 2.403845612903738e-07, + "loss": 0.1019, + "step": 13410 + }, + { + "epoch": 4.521489971346705, + "grad_norm": 1.4877412560146837, + "learning_rate": 2.387153105893636e-07, + "loss": 0.0809, + "step": 13415 + }, + { + "epoch": 4.523175459295466, + "grad_norm": 1.7429122138240438, + "learning_rate": 2.3705173406217252e-07, + "loss": 0.109, + "step": 13420 + }, + { + "epoch": 4.524860947244227, + "grad_norm": 1.5071684021912692, + "learning_rate": 2.3539383369133638e-07, + "loss": 0.1163, + "step": 13425 + }, + { + "epoch": 4.526546435192988, + "grad_norm": 1.3675636870248489, + "learning_rate": 2.337416114526292e-07, + "loss": 0.11, + "step": 13430 + }, + { + "epoch": 4.52823192314175, + "grad_norm": 1.528495357739462, + "learning_rate": 2.3209506931505698e-07, + "loss": 0.1026, + "step": 13435 + }, + { + "epoch": 4.52991741109051, + "grad_norm": 1.2744348436124975, + "learning_rate": 2.304542092408546e-07, + "loss": 0.0894, + "step": 13440 + }, + { + "epoch": 4.531602899039272, + "grad_norm": 1.958814366529426, + "learning_rate": 2.2881903318548782e-07, + "loss": 0.1187, + "step": 13445 + }, + { + "epoch": 4.533288386988033, + "grad_norm": 1.3051445388956697, + "learning_rate": 2.271895430976473e-07, + "loss": 0.0999, + "step": 13450 + }, + { + "epoch": 4.534973874936794, + "grad_norm": 2.0114381479862375, + "learning_rate": 2.255657409192491e-07, + "loss": 0.1181, + "step": 13455 + }, + { + "epoch": 4.536659362885556, + "grad_norm": 1.2902828398357575, + "learning_rate": 2.239476285854286e-07, + "loss": 0.1258, + "step": 13460 + }, + { + "epoch": 4.538344850834316, + "grad_norm": 1.3801705209191595, + "learning_rate": 2.223352080245411e-07, + "loss": 0.1299, + "step": 13465 + }, + { + "epoch": 4.540030338783078, + "grad_norm": 1.3151069475180843, + "learning_rate": 2.207284811581606e-07, + "loss": 0.0922, + "step": 13470 + }, + { + "epoch": 4.5417158267318385, + "grad_norm": 1.5134407164527663, + "learning_rate": 2.1912744990107427e-07, + "loss": 0.1054, + "step": 13475 + }, + { + "epoch": 4.5434013146806, + "grad_norm": 2.4600152060600027, + "learning_rate": 2.1753211616128089e-07, + "loss": 0.0979, + "step": 13480 + }, + { + "epoch": 4.545086802629362, + "grad_norm": 2.8002583519383903, + "learning_rate": 2.1594248183999023e-07, + "loss": 0.1057, + "step": 13485 + }, + { + "epoch": 4.546772290578122, + "grad_norm": 1.4688754097757288, + "learning_rate": 2.1435854883162134e-07, + "loss": 0.0991, + "step": 13490 + }, + { + "epoch": 4.548457778526884, + "grad_norm": 1.9176154461189199, + "learning_rate": 2.1278031902379649e-07, + "loss": 0.0885, + "step": 13495 + }, + { + "epoch": 4.5501432664756445, + "grad_norm": 1.4077288663541572, + "learning_rate": 2.1120779429734172e-07, + "loss": 0.1114, + "step": 13500 + }, + { + "epoch": 4.551828754424406, + "grad_norm": 1.149103264917252, + "learning_rate": 2.0964097652628413e-07, + "loss": 0.0907, + "step": 13505 + }, + { + "epoch": 4.553514242373167, + "grad_norm": 1.5061164931200244, + "learning_rate": 2.0807986757785116e-07, + "loss": 0.0974, + "step": 13510 + }, + { + "epoch": 4.555199730321928, + "grad_norm": 1.410563741026411, + "learning_rate": 2.0652446931246573e-07, + "loss": 0.1165, + "step": 13515 + }, + { + "epoch": 4.55688521827069, + "grad_norm": 1.5629610413914392, + "learning_rate": 2.0497478358374567e-07, + "loss": 0.1248, + "step": 13520 + }, + { + "epoch": 4.5585707062194505, + "grad_norm": 1.2541546922911895, + "learning_rate": 2.0343081223849925e-07, + "loss": 0.0916, + "step": 13525 + }, + { + "epoch": 4.560256194168212, + "grad_norm": 1.6012834176835438, + "learning_rate": 2.0189255711672628e-07, + "loss": 0.1293, + "step": 13530 + }, + { + "epoch": 4.561941682116973, + "grad_norm": 1.4217877374746684, + "learning_rate": 2.0036002005161538e-07, + "loss": 0.0972, + "step": 13535 + }, + { + "epoch": 4.563627170065734, + "grad_norm": 1.3252550147324835, + "learning_rate": 1.9883320286953777e-07, + "loss": 0.1002, + "step": 13540 + }, + { + "epoch": 4.565312658014495, + "grad_norm": 1.552802696328646, + "learning_rate": 1.9731210739005134e-07, + "loss": 0.0971, + "step": 13545 + }, + { + "epoch": 4.5669981459632565, + "grad_norm": 1.219037379507953, + "learning_rate": 1.9579673542589273e-07, + "loss": 0.0807, + "step": 13550 + }, + { + "epoch": 4.568683633912017, + "grad_norm": 1.5542895792943663, + "learning_rate": 1.9428708878298008e-07, + "loss": 0.1126, + "step": 13555 + }, + { + "epoch": 4.570369121860779, + "grad_norm": 1.323240352071046, + "learning_rate": 1.9278316926040598e-07, + "loss": 0.1058, + "step": 13560 + }, + { + "epoch": 4.572054609809539, + "grad_norm": 1.4506849782108833, + "learning_rate": 1.912849786504395e-07, + "loss": 0.1071, + "step": 13565 + }, + { + "epoch": 4.573740097758301, + "grad_norm": 1.2973205184715504, + "learning_rate": 1.8979251873852023e-07, + "loss": 0.1052, + "step": 13570 + }, + { + "epoch": 4.5754255857070625, + "grad_norm": 1.1575245632553617, + "learning_rate": 1.8830579130326265e-07, + "loss": 0.0894, + "step": 13575 + }, + { + "epoch": 4.577111073655823, + "grad_norm": 1.4657347968296255, + "learning_rate": 1.868247981164445e-07, + "loss": 0.1062, + "step": 13580 + }, + { + "epoch": 4.578796561604585, + "grad_norm": 1.265978212372819, + "learning_rate": 1.8534954094301449e-07, + "loss": 0.0954, + "step": 13585 + }, + { + "epoch": 4.580482049553345, + "grad_norm": 1.4106536434223835, + "learning_rate": 1.838800215410813e-07, + "loss": 0.1179, + "step": 13590 + }, + { + "epoch": 4.582167537502107, + "grad_norm": 1.253712100573429, + "learning_rate": 1.8241624166191963e-07, + "loss": 0.0942, + "step": 13595 + }, + { + "epoch": 4.583853025450868, + "grad_norm": 1.5665981816419539, + "learning_rate": 1.809582030499607e-07, + "loss": 0.0933, + "step": 13600 + }, + { + "epoch": 4.585538513399629, + "grad_norm": 1.4463990990024052, + "learning_rate": 1.7950590744279682e-07, + "loss": 0.1136, + "step": 13605 + }, + { + "epoch": 4.587224001348391, + "grad_norm": 1.5536933339722785, + "learning_rate": 1.7805935657117246e-07, + "loss": 0.0971, + "step": 13610 + }, + { + "epoch": 4.588909489297151, + "grad_norm": 1.2453426235969909, + "learning_rate": 1.7661855215899083e-07, + "loss": 0.1192, + "step": 13615 + }, + { + "epoch": 4.590594977245913, + "grad_norm": 1.2706847333011229, + "learning_rate": 1.7518349592330176e-07, + "loss": 0.105, + "step": 13620 + }, + { + "epoch": 4.592280465194674, + "grad_norm": 1.3435038098907643, + "learning_rate": 1.7375418957430944e-07, + "loss": 0.1137, + "step": 13625 + }, + { + "epoch": 4.593965953143435, + "grad_norm": 1.6570935440979908, + "learning_rate": 1.723306348153625e-07, + "loss": 0.1007, + "step": 13630 + }, + { + "epoch": 4.595651441092196, + "grad_norm": 1.1470784971673305, + "learning_rate": 1.709128333429555e-07, + "loss": 0.106, + "step": 13635 + }, + { + "epoch": 4.597336929040957, + "grad_norm": 1.5097808842881983, + "learning_rate": 1.6950078684672854e-07, + "loss": 0.1069, + "step": 13640 + }, + { + "epoch": 4.599022416989719, + "grad_norm": 1.4006209172127413, + "learning_rate": 1.6809449700946167e-07, + "loss": 0.1013, + "step": 13645 + }, + { + "epoch": 4.60070790493848, + "grad_norm": 1.3445071398100308, + "learning_rate": 1.6669396550707485e-07, + "loss": 0.1161, + "step": 13650 + }, + { + "epoch": 4.602393392887241, + "grad_norm": 1.3638574619338524, + "learning_rate": 1.652991940086257e-07, + "loss": 0.1053, + "step": 13655 + }, + { + "epoch": 4.604078880836002, + "grad_norm": 1.2097422183788538, + "learning_rate": 1.6391018417630855e-07, + "loss": 0.1107, + "step": 13660 + }, + { + "epoch": 4.605764368784763, + "grad_norm": 1.546073374471397, + "learning_rate": 1.6252693766545036e-07, + "loss": 0.1216, + "step": 13665 + }, + { + "epoch": 4.607449856733524, + "grad_norm": 1.4436844182108657, + "learning_rate": 1.6114945612450915e-07, + "loss": 0.1128, + "step": 13670 + }, + { + "epoch": 4.609135344682286, + "grad_norm": 1.2781279461330373, + "learning_rate": 1.5977774119507294e-07, + "loss": 0.0912, + "step": 13675 + }, + { + "epoch": 4.610820832631047, + "grad_norm": 1.4515965768020278, + "learning_rate": 1.5841179451185907e-07, + "loss": 0.1115, + "step": 13680 + }, + { + "epoch": 4.612506320579808, + "grad_norm": 1.4502547215027042, + "learning_rate": 1.570516177027087e-07, + "loss": 0.113, + "step": 13685 + }, + { + "epoch": 4.614191808528569, + "grad_norm": 1.0850439962631129, + "learning_rate": 1.5569721238858748e-07, + "loss": 0.097, + "step": 13690 + }, + { + "epoch": 4.61587729647733, + "grad_norm": 1.6438707611644572, + "learning_rate": 1.5434858018358257e-07, + "loss": 0.119, + "step": 13695 + }, + { + "epoch": 4.617562784426092, + "grad_norm": 1.4561568075903104, + "learning_rate": 1.5300572269490388e-07, + "loss": 0.0919, + "step": 13700 + }, + { + "epoch": 4.619248272374852, + "grad_norm": 1.226244235817788, + "learning_rate": 1.5166864152287574e-07, + "loss": 0.0983, + "step": 13705 + }, + { + "epoch": 4.620933760323614, + "grad_norm": 1.301093385520065, + "learning_rate": 1.5033733826094077e-07, + "loss": 0.1105, + "step": 13710 + }, + { + "epoch": 4.6226192482723745, + "grad_norm": 1.1850246498855652, + "learning_rate": 1.4901181449565372e-07, + "loss": 0.0946, + "step": 13715 + }, + { + "epoch": 4.624304736221136, + "grad_norm": 1.2985772344897857, + "learning_rate": 1.4769207180668487e-07, + "loss": 0.1026, + "step": 13720 + }, + { + "epoch": 4.625990224169897, + "grad_norm": 1.7816307687608601, + "learning_rate": 1.4637811176681283e-07, + "loss": 0.1072, + "step": 13725 + }, + { + "epoch": 4.627675712118658, + "grad_norm": 1.2727167962544899, + "learning_rate": 1.4506993594192554e-07, + "loss": 0.0973, + "step": 13730 + }, + { + "epoch": 4.62936120006742, + "grad_norm": 1.2803686739478282, + "learning_rate": 1.4376754589101705e-07, + "loss": 0.0935, + "step": 13735 + }, + { + "epoch": 4.6310466880161805, + "grad_norm": 1.4004762869722913, + "learning_rate": 1.4247094316618748e-07, + "loss": 0.0872, + "step": 13740 + }, + { + "epoch": 4.632732175964942, + "grad_norm": 1.448946891965298, + "learning_rate": 1.411801293126397e-07, + "loss": 0.1068, + "step": 13745 + }, + { + "epoch": 4.634417663913703, + "grad_norm": 1.510078583068379, + "learning_rate": 1.398951058686765e-07, + "loss": 0.1117, + "step": 13750 + }, + { + "epoch": 4.636103151862464, + "grad_norm": 1.4684994382171623, + "learning_rate": 1.3861587436570123e-07, + "loss": 0.1101, + "step": 13755 + }, + { + "epoch": 4.637788639811225, + "grad_norm": 1.4349037388555372, + "learning_rate": 1.3734243632821498e-07, + "loss": 0.089, + "step": 13760 + }, + { + "epoch": 4.6394741277599865, + "grad_norm": 1.2204813917194925, + "learning_rate": 1.360747932738149e-07, + "loss": 0.082, + "step": 13765 + }, + { + "epoch": 4.641159615708748, + "grad_norm": 1.5537345001374436, + "learning_rate": 1.3481294671318924e-07, + "loss": 0.0921, + "step": 13770 + }, + { + "epoch": 4.642845103657509, + "grad_norm": 1.4973438628843838, + "learning_rate": 1.3355689815012286e-07, + "loss": 0.1067, + "step": 13775 + }, + { + "epoch": 4.64453059160627, + "grad_norm": 1.4082753630005014, + "learning_rate": 1.323066490814867e-07, + "loss": 0.0992, + "step": 13780 + }, + { + "epoch": 4.646216079555031, + "grad_norm": 1.439678451239427, + "learning_rate": 1.31062200997244e-07, + "loss": 0.102, + "step": 13785 + }, + { + "epoch": 4.6479015675037925, + "grad_norm": 1.2850062094181645, + "learning_rate": 1.2982355538044221e-07, + "loss": 0.111, + "step": 13790 + }, + { + "epoch": 4.649587055452553, + "grad_norm": 1.5424547304187732, + "learning_rate": 1.28590713707214e-07, + "loss": 0.112, + "step": 13795 + }, + { + "epoch": 4.651272543401315, + "grad_norm": 1.4671621130843322, + "learning_rate": 1.2736367744677626e-07, + "loss": 0.0928, + "step": 13800 + }, + { + "epoch": 4.652958031350076, + "grad_norm": 1.467450902677651, + "learning_rate": 1.2614244806142651e-07, + "loss": 0.1053, + "step": 13805 + }, + { + "epoch": 4.654643519298837, + "grad_norm": 1.1730816201958458, + "learning_rate": 1.2492702700654337e-07, + "loss": 0.0904, + "step": 13810 + }, + { + "epoch": 4.6563290072475985, + "grad_norm": 1.469229789590408, + "learning_rate": 1.237174157305826e-07, + "loss": 0.1102, + "step": 13815 + }, + { + "epoch": 4.658014495196359, + "grad_norm": 1.188093502193414, + "learning_rate": 1.2251361567507559e-07, + "loss": 0.0915, + "step": 13820 + }, + { + "epoch": 4.659699983145121, + "grad_norm": 2.2344150063316772, + "learning_rate": 1.2131562827462973e-07, + "loss": 0.1038, + "step": 13825 + }, + { + "epoch": 4.661385471093881, + "grad_norm": 1.014425979160535, + "learning_rate": 1.2012345495692356e-07, + "loss": 0.0945, + "step": 13830 + }, + { + "epoch": 4.663070959042643, + "grad_norm": 1.2416135504428898, + "learning_rate": 1.1893709714270895e-07, + "loss": 0.1178, + "step": 13835 + }, + { + "epoch": 4.6647564469914045, + "grad_norm": 1.832968547848945, + "learning_rate": 1.1775655624580496e-07, + "loss": 0.1018, + "step": 13840 + }, + { + "epoch": 4.666441934940165, + "grad_norm": 1.2650391940888095, + "learning_rate": 1.165818336731006e-07, + "loss": 0.1056, + "step": 13845 + }, + { + "epoch": 4.668127422888927, + "grad_norm": 1.4636818432086411, + "learning_rate": 1.1541293082454941e-07, + "loss": 0.1131, + "step": 13850 + }, + { + "epoch": 4.669812910837687, + "grad_norm": 1.5789184285509004, + "learning_rate": 1.1424984909317038e-07, + "loss": 0.1025, + "step": 13855 + }, + { + "epoch": 4.671498398786449, + "grad_norm": 1.408174467537858, + "learning_rate": 1.1309258986504424e-07, + "loss": 0.1003, + "step": 13860 + }, + { + "epoch": 4.67318388673521, + "grad_norm": 1.285766597093372, + "learning_rate": 1.1194115451931386e-07, + "loss": 0.0995, + "step": 13865 + }, + { + "epoch": 4.674869374683971, + "grad_norm": 1.4648611518155075, + "learning_rate": 1.1079554442818108e-07, + "loss": 0.0975, + "step": 13870 + }, + { + "epoch": 4.676554862632732, + "grad_norm": 1.180007859726227, + "learning_rate": 1.0965576095690656e-07, + "loss": 0.1053, + "step": 13875 + }, + { + "epoch": 4.678240350581493, + "grad_norm": 1.2982904102149746, + "learning_rate": 1.0852180546380486e-07, + "loss": 0.11, + "step": 13880 + }, + { + "epoch": 4.679925838530254, + "grad_norm": 1.1942878040297782, + "learning_rate": 1.0739367930024724e-07, + "loss": 0.1012, + "step": 13885 + }, + { + "epoch": 4.681611326479016, + "grad_norm": 1.2812876636900898, + "learning_rate": 1.0627138381065827e-07, + "loss": 0.1021, + "step": 13890 + }, + { + "epoch": 4.683296814427777, + "grad_norm": 1.216588415005487, + "learning_rate": 1.0515492033251196e-07, + "loss": 0.1077, + "step": 13895 + }, + { + "epoch": 4.684982302376538, + "grad_norm": 1.4167751469266234, + "learning_rate": 1.0404429019633344e-07, + "loss": 0.097, + "step": 13900 + }, + { + "epoch": 4.686667790325299, + "grad_norm": 1.4200775765023221, + "learning_rate": 1.0293949472569676e-07, + "loss": 0.1026, + "step": 13905 + }, + { + "epoch": 4.68835327827406, + "grad_norm": 1.3074665236415486, + "learning_rate": 1.0184053523722093e-07, + "loss": 0.0975, + "step": 13910 + }, + { + "epoch": 4.690038766222822, + "grad_norm": 1.1800306553238262, + "learning_rate": 1.0074741304057056e-07, + "loss": 0.0954, + "step": 13915 + }, + { + "epoch": 4.691724254171582, + "grad_norm": 1.4886420207429685, + "learning_rate": 9.966012943845361e-08, + "loss": 0.1011, + "step": 13920 + }, + { + "epoch": 4.693409742120344, + "grad_norm": 1.3144909160057303, + "learning_rate": 9.857868572662133e-08, + "loss": 0.112, + "step": 13925 + }, + { + "epoch": 4.695095230069105, + "grad_norm": 1.262286624107206, + "learning_rate": 9.750308319386503e-08, + "loss": 0.0828, + "step": 13930 + }, + { + "epoch": 4.696780718017866, + "grad_norm": 1.2707681070455816, + "learning_rate": 9.64333231220127e-08, + "loss": 0.0948, + "step": 13935 + }, + { + "epoch": 4.698466205966628, + "grad_norm": 1.6918903015487379, + "learning_rate": 9.536940678593232e-08, + "loss": 0.1185, + "step": 13940 + }, + { + "epoch": 4.700151693915388, + "grad_norm": 1.4283721193086336, + "learning_rate": 9.431133545352634e-08, + "loss": 0.0989, + "step": 13945 + }, + { + "epoch": 4.70183718186415, + "grad_norm": 1.4124434276758755, + "learning_rate": 9.32591103857322e-08, + "loss": 0.1157, + "step": 13950 + }, + { + "epoch": 4.7035226698129105, + "grad_norm": 1.4791650934256526, + "learning_rate": 9.22127328365191e-08, + "loss": 0.0888, + "step": 13955 + }, + { + "epoch": 4.705208157761672, + "grad_norm": 1.1462947330726747, + "learning_rate": 9.117220405288951e-08, + "loss": 0.095, + "step": 13960 + }, + { + "epoch": 4.706893645710434, + "grad_norm": 1.3097678867679194, + "learning_rate": 9.013752527487374e-08, + "loss": 0.0916, + "step": 13965 + }, + { + "epoch": 4.708579133659194, + "grad_norm": 1.4634185123578487, + "learning_rate": 8.910869773553155e-08, + "loss": 0.0963, + "step": 13970 + }, + { + "epoch": 4.710264621607956, + "grad_norm": 1.3727738856540987, + "learning_rate": 8.808572266094939e-08, + "loss": 0.111, + "step": 13975 + }, + { + "epoch": 4.7119501095567164, + "grad_norm": 1.2320528176036325, + "learning_rate": 8.706860127023875e-08, + "loss": 0.1031, + "step": 13980 + }, + { + "epoch": 4.713635597505478, + "grad_norm": 1.3724045713287618, + "learning_rate": 8.605733477553502e-08, + "loss": 0.12, + "step": 13985 + }, + { + "epoch": 4.715321085454239, + "grad_norm": 1.3648950441040957, + "learning_rate": 8.50519243819975e-08, + "loss": 0.0939, + "step": 13990 + }, + { + "epoch": 4.717006573403, + "grad_norm": 1.3058780044628497, + "learning_rate": 8.405237128780497e-08, + "loss": 0.0873, + "step": 13995 + }, + { + "epoch": 4.718692061351762, + "grad_norm": 1.478244485874207, + "learning_rate": 8.305867668415679e-08, + "loss": 0.1075, + "step": 14000 + }, + { + "epoch": 4.7203775493005224, + "grad_norm": 1.5088070557684299, + "learning_rate": 8.207084175527014e-08, + "loss": 0.1252, + "step": 14005 + }, + { + "epoch": 4.722063037249284, + "grad_norm": 1.374032480063295, + "learning_rate": 8.108886767837998e-08, + "loss": 0.1001, + "step": 14010 + }, + { + "epoch": 4.723748525198045, + "grad_norm": 2.177449526483698, + "learning_rate": 8.011275562373466e-08, + "loss": 0.1014, + "step": 14015 + }, + { + "epoch": 4.725434013146806, + "grad_norm": 1.3136875220028357, + "learning_rate": 7.914250675459867e-08, + "loss": 0.1046, + "step": 14020 + }, + { + "epoch": 4.727119501095567, + "grad_norm": 1.6446241508007466, + "learning_rate": 7.817812222724763e-08, + "loss": 0.1164, + "step": 14025 + }, + { + "epoch": 4.728804989044328, + "grad_norm": 1.254716520691661, + "learning_rate": 7.721960319097e-08, + "loss": 0.0871, + "step": 14030 + }, + { + "epoch": 4.73049047699309, + "grad_norm": 1.1338983059495962, + "learning_rate": 7.626695078806312e-08, + "loss": 0.0969, + "step": 14035 + }, + { + "epoch": 4.732175964941851, + "grad_norm": 1.2403341420737148, + "learning_rate": 7.53201661538322e-08, + "loss": 0.0982, + "step": 14040 + }, + { + "epoch": 4.733861452890612, + "grad_norm": 1.4179221828314692, + "learning_rate": 7.437925041659189e-08, + "loss": 0.0957, + "step": 14045 + }, + { + "epoch": 4.735546940839373, + "grad_norm": 1.2719084829694884, + "learning_rate": 7.344420469765967e-08, + "loss": 0.1126, + "step": 14050 + }, + { + "epoch": 4.737232428788134, + "grad_norm": 1.5907825455167337, + "learning_rate": 7.25150301113603e-08, + "loss": 0.0989, + "step": 14055 + }, + { + "epoch": 4.738917916736895, + "grad_norm": 1.7800432677776008, + "learning_rate": 7.159172776502077e-08, + "loss": 0.104, + "step": 14060 + }, + { + "epoch": 4.740603404685657, + "grad_norm": 1.3184865979313847, + "learning_rate": 7.067429875896815e-08, + "loss": 0.1225, + "step": 14065 + }, + { + "epoch": 4.742288892634417, + "grad_norm": 1.5068797680268022, + "learning_rate": 6.976274418653284e-08, + "loss": 0.1109, + "step": 14070 + }, + { + "epoch": 4.743974380583179, + "grad_norm": 1.2836301539188963, + "learning_rate": 6.885706513404422e-08, + "loss": 0.1141, + "step": 14075 + }, + { + "epoch": 4.7456598685319396, + "grad_norm": 1.2487966488165214, + "learning_rate": 6.79572626808267e-08, + "loss": 0.1003, + "step": 14080 + }, + { + "epoch": 4.747345356480701, + "grad_norm": 1.4740865889940853, + "learning_rate": 6.706333789920527e-08, + "loss": 0.0978, + "step": 14085 + }, + { + "epoch": 4.749030844429463, + "grad_norm": 1.3887974610128597, + "learning_rate": 6.617529185449668e-08, + "loss": 0.0925, + "step": 14090 + }, + { + "epoch": 4.750716332378223, + "grad_norm": 1.3904350765045153, + "learning_rate": 6.529312560501433e-08, + "loss": 0.0889, + "step": 14095 + }, + { + "epoch": 4.752401820326985, + "grad_norm": 1.4133855143041645, + "learning_rate": 6.441684020206452e-08, + "loss": 0.1156, + "step": 14100 + }, + { + "epoch": 4.7540873082757455, + "grad_norm": 1.302962150737466, + "learning_rate": 6.354643668994243e-08, + "loss": 0.1202, + "step": 14105 + }, + { + "epoch": 4.755772796224507, + "grad_norm": 1.372419067721362, + "learning_rate": 6.268191610593666e-08, + "loss": 0.0878, + "step": 14110 + }, + { + "epoch": 4.757458284173268, + "grad_norm": 3.454099385937723, + "learning_rate": 6.182327948032474e-08, + "loss": 0.1014, + "step": 14115 + }, + { + "epoch": 4.759143772122029, + "grad_norm": 1.1870700419863325, + "learning_rate": 6.097052783636925e-08, + "loss": 0.0976, + "step": 14120 + }, + { + "epoch": 4.760829260070791, + "grad_norm": 1.4750296611451297, + "learning_rate": 6.012366219032284e-08, + "loss": 0.1076, + "step": 14125 + }, + { + "epoch": 4.7625147480195515, + "grad_norm": 1.4568738615338586, + "learning_rate": 5.9282683551420974e-08, + "loss": 0.105, + "step": 14130 + }, + { + "epoch": 4.764200235968313, + "grad_norm": 1.4991782213657212, + "learning_rate": 5.8447592921885286e-08, + "loss": 0.1091, + "step": 14135 + }, + { + "epoch": 4.765885723917074, + "grad_norm": 1.3102617851381169, + "learning_rate": 5.7618391296919706e-08, + "loss": 0.0947, + "step": 14140 + }, + { + "epoch": 4.767571211865835, + "grad_norm": 1.618087149916084, + "learning_rate": 5.679507966470932e-08, + "loss": 0.0983, + "step": 14145 + }, + { + "epoch": 4.769256699814596, + "grad_norm": 1.4792429006716927, + "learning_rate": 5.597765900642149e-08, + "loss": 0.1003, + "step": 14150 + }, + { + "epoch": 4.7709421877633575, + "grad_norm": 1.4274182820738144, + "learning_rate": 5.516613029620199e-08, + "loss": 0.1139, + "step": 14155 + }, + { + "epoch": 4.772627675712119, + "grad_norm": 1.20931340010249, + "learning_rate": 5.4360494501176084e-08, + "loss": 0.0837, + "step": 14160 + }, + { + "epoch": 4.77431316366088, + "grad_norm": 1.1641391670837216, + "learning_rate": 5.356075258144411e-08, + "loss": 0.0993, + "step": 14165 + }, + { + "epoch": 4.775998651609641, + "grad_norm": 1.2934022272539012, + "learning_rate": 5.2766905490084784e-08, + "loss": 0.1079, + "step": 14170 + }, + { + "epoch": 4.777684139558402, + "grad_norm": 1.4872495758075495, + "learning_rate": 5.197895417315024e-08, + "loss": 0.1015, + "step": 14175 + }, + { + "epoch": 4.7793696275071635, + "grad_norm": 1.538770319443185, + "learning_rate": 5.119689956966767e-08, + "loss": 0.1162, + "step": 14180 + }, + { + "epoch": 4.781055115455924, + "grad_norm": 1.1982756767293192, + "learning_rate": 5.042074261163599e-08, + "loss": 0.1034, + "step": 14185 + }, + { + "epoch": 4.782740603404686, + "grad_norm": 1.8762319699827408, + "learning_rate": 4.965048422402641e-08, + "loss": 0.1057, + "step": 14190 + }, + { + "epoch": 4.784426091353447, + "grad_norm": 1.451763792493304, + "learning_rate": 4.88861253247791e-08, + "loss": 0.1135, + "step": 14195 + }, + { + "epoch": 4.786111579302208, + "grad_norm": 1.48814623839987, + "learning_rate": 4.812766682480596e-08, + "loss": 0.101, + "step": 14200 + }, + { + "epoch": 4.7877970672509695, + "grad_norm": 1.2203621932114868, + "learning_rate": 4.737510962798564e-08, + "loss": 0.0979, + "step": 14205 + }, + { + "epoch": 4.78948255519973, + "grad_norm": 1.1162606094259975, + "learning_rate": 4.662845463116461e-08, + "loss": 0.0792, + "step": 14210 + }, + { + "epoch": 4.791168043148492, + "grad_norm": 1.3276256419225485, + "learning_rate": 4.5887702724154994e-08, + "loss": 0.0925, + "step": 14215 + }, + { + "epoch": 4.792853531097252, + "grad_norm": 1.464968815034206, + "learning_rate": 4.515285478973397e-08, + "loss": 0.1082, + "step": 14220 + }, + { + "epoch": 4.794539019046014, + "grad_norm": 1.3048458940015526, + "learning_rate": 4.44239117036438e-08, + "loss": 0.1056, + "step": 14225 + }, + { + "epoch": 4.796224506994775, + "grad_norm": 1.4920919238488923, + "learning_rate": 4.370087433458903e-08, + "loss": 0.104, + "step": 14230 + }, + { + "epoch": 4.797909994943536, + "grad_norm": 1.4779132299592195, + "learning_rate": 4.29837435442354e-08, + "loss": 0.1238, + "step": 14235 + }, + { + "epoch": 4.799595482892297, + "grad_norm": 1.348592374411331, + "learning_rate": 4.227252018721151e-08, + "loss": 0.108, + "step": 14240 + }, + { + "epoch": 4.801280970841058, + "grad_norm": 1.4645067823705782, + "learning_rate": 4.1567205111104346e-08, + "loss": 0.1088, + "step": 14245 + }, + { + "epoch": 4.80296645878982, + "grad_norm": 1.3262592748339201, + "learning_rate": 4.086779915645989e-08, + "loss": 0.1064, + "step": 14250 + }, + { + "epoch": 4.804651946738581, + "grad_norm": 1.946159694795386, + "learning_rate": 4.017430315678195e-08, + "loss": 0.1009, + "step": 14255 + }, + { + "epoch": 4.806337434687342, + "grad_norm": 1.4246586217648731, + "learning_rate": 3.948671793853276e-08, + "loss": 0.0802, + "step": 14260 + }, + { + "epoch": 4.808022922636103, + "grad_norm": 1.4962720045677116, + "learning_rate": 3.880504432112908e-08, + "loss": 0.1254, + "step": 14265 + }, + { + "epoch": 4.809708410584864, + "grad_norm": 1.393816825289662, + "learning_rate": 3.812928311694275e-08, + "loss": 0.0929, + "step": 14270 + }, + { + "epoch": 4.811393898533625, + "grad_norm": 1.5256976576165577, + "learning_rate": 3.745943513129957e-08, + "loss": 0.0906, + "step": 14275 + }, + { + "epoch": 4.813079386482387, + "grad_norm": 1.487951662028393, + "learning_rate": 3.679550116247932e-08, + "loss": 0.0899, + "step": 14280 + }, + { + "epoch": 4.814764874431148, + "grad_norm": 1.4882273942351816, + "learning_rate": 3.613748200171241e-08, + "loss": 0.1042, + "step": 14285 + }, + { + "epoch": 4.816450362379909, + "grad_norm": 1.1875714832766766, + "learning_rate": 3.5485378433182117e-08, + "loss": 0.092, + "step": 14290 + }, + { + "epoch": 4.81813585032867, + "grad_norm": 1.4053327302056282, + "learning_rate": 3.4839191234019576e-08, + "loss": 0.111, + "step": 14295 + }, + { + "epoch": 4.819821338277431, + "grad_norm": 1.2613569775368803, + "learning_rate": 3.419892117430712e-08, + "loss": 0.1188, + "step": 14300 + }, + { + "epoch": 4.821506826226193, + "grad_norm": 1.4989874950538131, + "learning_rate": 3.35645690170755e-08, + "loss": 0.0966, + "step": 14305 + }, + { + "epoch": 4.823192314174953, + "grad_norm": 1.3731881461629478, + "learning_rate": 3.293613551830222e-08, + "loss": 0.1122, + "step": 14310 + }, + { + "epoch": 4.824877802123715, + "grad_norm": 1.2788135152005884, + "learning_rate": 3.2313621426909855e-08, + "loss": 0.0913, + "step": 14315 + }, + { + "epoch": 4.826563290072476, + "grad_norm": 1.600960703496493, + "learning_rate": 3.169702748476999e-08, + "loss": 0.1088, + "step": 14320 + }, + { + "epoch": 4.828248778021237, + "grad_norm": 1.466673992777664, + "learning_rate": 3.10863544266965e-08, + "loss": 0.0879, + "step": 14325 + }, + { + "epoch": 4.829934265969999, + "grad_norm": 1.291084556225331, + "learning_rate": 3.048160298044722e-08, + "loss": 0.096, + "step": 14330 + }, + { + "epoch": 4.831619753918759, + "grad_norm": 1.3753486674449276, + "learning_rate": 2.988277386672456e-08, + "loss": 0.1121, + "step": 14335 + }, + { + "epoch": 4.833305241867521, + "grad_norm": 1.3627248642232634, + "learning_rate": 2.928986779917098e-08, + "loss": 0.1223, + "step": 14340 + }, + { + "epoch": 4.8349907298162815, + "grad_norm": 1.134452585061997, + "learning_rate": 2.8702885484372944e-08, + "loss": 0.0951, + "step": 14345 + }, + { + "epoch": 4.836676217765043, + "grad_norm": 1.3874592022831107, + "learning_rate": 2.8121827621855336e-08, + "loss": 0.0927, + "step": 14350 + }, + { + "epoch": 4.838361705713805, + "grad_norm": 1.2983660687207284, + "learning_rate": 2.7546694904082572e-08, + "loss": 0.0928, + "step": 14355 + }, + { + "epoch": 4.840047193662565, + "grad_norm": 1.364783484024969, + "learning_rate": 2.697748801645861e-08, + "loss": 0.0975, + "step": 14360 + }, + { + "epoch": 4.841732681611327, + "grad_norm": 1.2189737120634667, + "learning_rate": 2.6414207637325828e-08, + "loss": 0.0889, + "step": 14365 + }, + { + "epoch": 4.8434181695600875, + "grad_norm": 1.4757354692915603, + "learning_rate": 2.5856854437963376e-08, + "loss": 0.0899, + "step": 14370 + }, + { + "epoch": 4.845103657508849, + "grad_norm": 1.2709892836999879, + "learning_rate": 2.5305429082586597e-08, + "loss": 0.0996, + "step": 14375 + }, + { + "epoch": 4.84678914545761, + "grad_norm": 1.3345061704770749, + "learning_rate": 2.475993222834594e-08, + "loss": 0.1031, + "step": 14380 + }, + { + "epoch": 4.848474633406371, + "grad_norm": 1.305377234298407, + "learning_rate": 2.4220364525328055e-08, + "loss": 0.0994, + "step": 14385 + }, + { + "epoch": 4.850160121355132, + "grad_norm": 1.2231125689931832, + "learning_rate": 2.3686726616553023e-08, + "loss": 0.0981, + "step": 14390 + }, + { + "epoch": 4.8518456093038935, + "grad_norm": 1.653085067087123, + "learning_rate": 2.3159019137973248e-08, + "loss": 0.1072, + "step": 14395 + }, + { + "epoch": 4.853531097252654, + "grad_norm": 1.380812027604419, + "learning_rate": 2.2637242718474562e-08, + "loss": 0.1041, + "step": 14400 + }, + { + "epoch": 4.855216585201416, + "grad_norm": 1.3017367505026471, + "learning_rate": 2.2121397979875114e-08, + "loss": 0.0942, + "step": 14405 + }, + { + "epoch": 4.856902073150177, + "grad_norm": 1.4249052839160983, + "learning_rate": 2.161148553692316e-08, + "loss": 0.095, + "step": 14410 + }, + { + "epoch": 4.858587561098938, + "grad_norm": 1.327283947326012, + "learning_rate": 2.110750599729705e-08, + "loss": 0.1032, + "step": 14415 + }, + { + "epoch": 4.8602730490476995, + "grad_norm": 1.3251995054831043, + "learning_rate": 2.0609459961605794e-08, + "loss": 0.104, + "step": 14420 + }, + { + "epoch": 4.86195853699646, + "grad_norm": 1.2263633559148792, + "learning_rate": 2.0117348023386274e-08, + "loss": 0.11, + "step": 14425 + }, + { + "epoch": 4.863644024945222, + "grad_norm": 3.0158100580245644, + "learning_rate": 1.9631170769103812e-08, + "loss": 0.103, + "step": 14430 + }, + { + "epoch": 4.865329512893982, + "grad_norm": 1.5023783123325296, + "learning_rate": 1.9150928778151613e-08, + "loss": 0.1216, + "step": 14435 + }, + { + "epoch": 4.867015000842744, + "grad_norm": 1.4997010407474924, + "learning_rate": 1.867662262284853e-08, + "loss": 0.1106, + "step": 14440 + }, + { + "epoch": 4.8687004887915055, + "grad_norm": 1.2278695574479974, + "learning_rate": 1.8208252868441302e-08, + "loss": 0.1011, + "step": 14445 + }, + { + "epoch": 4.870385976740266, + "grad_norm": 1.2966558733227076, + "learning_rate": 1.774582007310066e-08, + "loss": 0.102, + "step": 14450 + }, + { + "epoch": 4.872071464689028, + "grad_norm": 1.3403499888324188, + "learning_rate": 1.728932478792189e-08, + "loss": 0.0997, + "step": 14455 + }, + { + "epoch": 4.873756952637788, + "grad_norm": 1.15835870765287, + "learning_rate": 1.6838767556925372e-08, + "loss": 0.0893, + "step": 14460 + }, + { + "epoch": 4.87544244058655, + "grad_norm": 1.4940155219468743, + "learning_rate": 1.639414891705382e-08, + "loss": 0.1077, + "step": 14465 + }, + { + "epoch": 4.877127928535311, + "grad_norm": 1.4463261334234456, + "learning_rate": 1.595546939817394e-08, + "loss": 0.0949, + "step": 14470 + }, + { + "epoch": 4.878813416484072, + "grad_norm": 1.2964682055497974, + "learning_rate": 1.552272952307421e-08, + "loss": 0.1193, + "step": 14475 + }, + { + "epoch": 4.880498904432834, + "grad_norm": 1.3741866763663682, + "learning_rate": 1.5095929807463772e-08, + "loss": 0.11, + "step": 14480 + }, + { + "epoch": 4.882184392381594, + "grad_norm": 1.2238382777882655, + "learning_rate": 1.4675070759974097e-08, + "loss": 0.1011, + "step": 14485 + }, + { + "epoch": 4.883869880330356, + "grad_norm": 1.3926257112820095, + "learning_rate": 1.4260152882155654e-08, + "loss": 0.1143, + "step": 14490 + }, + { + "epoch": 4.885555368279117, + "grad_norm": 2.805845937973201, + "learning_rate": 1.3851176668479571e-08, + "loss": 0.1141, + "step": 14495 + }, + { + "epoch": 4.887240856227878, + "grad_norm": 1.815060779592187, + "learning_rate": 1.3448142606335424e-08, + "loss": 0.094, + "step": 14500 + }, + { + "epoch": 4.888926344176639, + "grad_norm": 1.368144997931563, + "learning_rate": 1.3051051176032336e-08, + "loss": 0.0957, + "step": 14505 + }, + { + "epoch": 4.8906118321254, + "grad_norm": 1.4091654981611477, + "learning_rate": 1.2659902850795659e-08, + "loss": 0.0791, + "step": 14510 + }, + { + "epoch": 4.892297320074162, + "grad_norm": 1.2977749319696585, + "learning_rate": 1.2274698096770287e-08, + "loss": 0.1156, + "step": 14515 + }, + { + "epoch": 4.893982808022923, + "grad_norm": 2.031184872648758, + "learning_rate": 1.1895437373016239e-08, + "loss": 0.1002, + "step": 14520 + }, + { + "epoch": 4.895668295971684, + "grad_norm": 1.2768824289027978, + "learning_rate": 1.152212113151141e-08, + "loss": 0.0904, + "step": 14525 + }, + { + "epoch": 4.897353783920445, + "grad_norm": 1.1316564467207972, + "learning_rate": 1.1154749817147147e-08, + "loss": 0.0892, + "step": 14530 + }, + { + "epoch": 4.899039271869206, + "grad_norm": 1.3727479458180365, + "learning_rate": 1.0793323867733241e-08, + "loss": 0.0965, + "step": 14535 + }, + { + "epoch": 4.900724759817967, + "grad_norm": 1.150793789115667, + "learning_rate": 1.0437843713991258e-08, + "loss": 0.0932, + "step": 14540 + }, + { + "epoch": 4.902410247766729, + "grad_norm": 1.368948355426084, + "learning_rate": 1.008830977955788e-08, + "loss": 0.0926, + "step": 14545 + }, + { + "epoch": 4.904095735715489, + "grad_norm": 1.4709380810659298, + "learning_rate": 9.744722480984903e-09, + "loss": 0.0874, + "step": 14550 + }, + { + "epoch": 4.905781223664251, + "grad_norm": 1.464568349442694, + "learning_rate": 9.407082227735342e-09, + "loss": 0.0983, + "step": 14555 + }, + { + "epoch": 4.9074667116130115, + "grad_norm": 1.4937043474445795, + "learning_rate": 9.07538942218622e-09, + "loss": 0.1176, + "step": 14560 + }, + { + "epoch": 4.909152199561773, + "grad_norm": 1.5056690904887626, + "learning_rate": 8.749644459626338e-09, + "loss": 0.1026, + "step": 14565 + }, + { + "epoch": 4.910837687510535, + "grad_norm": 1.332865071391694, + "learning_rate": 8.429847728255725e-09, + "loss": 0.1052, + "step": 14570 + }, + { + "epoch": 4.912523175459295, + "grad_norm": 1.2683786880575063, + "learning_rate": 8.115999609187298e-09, + "loss": 0.0883, + "step": 14575 + }, + { + "epoch": 4.914208663408057, + "grad_norm": 1.574291409575643, + "learning_rate": 7.808100476442982e-09, + "loss": 0.113, + "step": 14580 + }, + { + "epoch": 4.9158941513568175, + "grad_norm": 1.3436450372767457, + "learning_rate": 7.506150696955927e-09, + "loss": 0.0952, + "step": 14585 + }, + { + "epoch": 4.917579639305579, + "grad_norm": 1.5593660881509166, + "learning_rate": 7.2101506305699565e-09, + "loss": 0.1065, + "step": 14590 + }, + { + "epoch": 4.91926512725434, + "grad_norm": 1.515065980451858, + "learning_rate": 6.920100630036786e-09, + "loss": 0.1008, + "step": 14595 + }, + { + "epoch": 4.920950615203101, + "grad_norm": 1.4768959884413175, + "learning_rate": 6.636001041019357e-09, + "loss": 0.1042, + "step": 14600 + }, + { + "epoch": 4.922636103151863, + "grad_norm": 1.3566671047313197, + "learning_rate": 6.357852202086845e-09, + "loss": 0.1144, + "step": 14605 + }, + { + "epoch": 4.9243215911006235, + "grad_norm": 1.7232660638299566, + "learning_rate": 6.085654444719091e-09, + "loss": 0.1156, + "step": 14610 + }, + { + "epoch": 4.926007079049385, + "grad_norm": 1.245039092600688, + "learning_rate": 5.81940809330217e-09, + "loss": 0.0921, + "step": 14615 + }, + { + "epoch": 4.927692566998146, + "grad_norm": 1.4185565225384973, + "learning_rate": 5.559113465130051e-09, + "loss": 0.1112, + "step": 14620 + }, + { + "epoch": 4.929378054946907, + "grad_norm": 1.7545468415875707, + "learning_rate": 5.304770870405152e-09, + "loss": 0.0885, + "step": 14625 + }, + { + "epoch": 4.931063542895668, + "grad_norm": 1.437941603395581, + "learning_rate": 5.0563806122344575e-09, + "loss": 0.1123, + "step": 14630 + }, + { + "epoch": 4.9327490308444295, + "grad_norm": 1.46177106060379, + "learning_rate": 4.813942986633402e-09, + "loss": 0.1011, + "step": 14635 + }, + { + "epoch": 4.934434518793191, + "grad_norm": 1.4721786423010996, + "learning_rate": 4.5774582825219846e-09, + "loss": 0.1184, + "step": 14640 + }, + { + "epoch": 4.936120006741952, + "grad_norm": 1.3859301059254683, + "learning_rate": 4.34692678172699e-09, + "loss": 0.106, + "step": 14645 + }, + { + "epoch": 4.937805494690713, + "grad_norm": 1.2671351567180096, + "learning_rate": 4.122348758979766e-09, + "loss": 0.1082, + "step": 14650 + }, + { + "epoch": 4.939490982639474, + "grad_norm": 1.9554293400220635, + "learning_rate": 3.903724481916782e-09, + "loss": 0.1058, + "step": 14655 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 1.3596853441968018, + "learning_rate": 3.691054211080736e-09, + "loss": 0.102, + "step": 14660 + }, + { + "epoch": 4.942861958536996, + "grad_norm": 1.470148160698503, + "learning_rate": 3.484338199916115e-09, + "loss": 0.0967, + "step": 14665 + }, + { + "epoch": 4.944547446485758, + "grad_norm": 1.4304674727429603, + "learning_rate": 3.283576694773083e-09, + "loss": 0.0953, + "step": 14670 + }, + { + "epoch": 4.946232934434519, + "grad_norm": 1.5766864717031746, + "learning_rate": 3.088769934906366e-09, + "loss": 0.1193, + "step": 14675 + }, + { + "epoch": 4.94791842238328, + "grad_norm": 1.3503109603634298, + "learning_rate": 2.899918152473036e-09, + "loss": 0.1046, + "step": 14680 + }, + { + "epoch": 4.9496039103320415, + "grad_norm": 1.4917913508850917, + "learning_rate": 2.7170215725336178e-09, + "loss": 0.095, + "step": 14685 + }, + { + "epoch": 4.951289398280802, + "grad_norm": 1.2445698559908476, + "learning_rate": 2.5400804130515377e-09, + "loss": 0.1046, + "step": 14690 + }, + { + "epoch": 4.952974886229564, + "grad_norm": 1.3057329204886383, + "learning_rate": 2.3690948848931195e-09, + "loss": 0.1007, + "step": 14695 + }, + { + "epoch": 4.954660374178324, + "grad_norm": 1.1783829580691276, + "learning_rate": 2.204065191828142e-09, + "loss": 0.0976, + "step": 14700 + }, + { + "epoch": 4.956345862127086, + "grad_norm": 1.3909285323137688, + "learning_rate": 2.044991530526508e-09, + "loss": 0.0987, + "step": 14705 + }, + { + "epoch": 4.958031350075847, + "grad_norm": 1.5336632708956925, + "learning_rate": 1.891874090562129e-09, + "loss": 0.109, + "step": 14710 + }, + { + "epoch": 4.959716838024608, + "grad_norm": 1.5063911298248347, + "learning_rate": 1.7447130544095969e-09, + "loss": 0.1179, + "step": 14715 + }, + { + "epoch": 4.961402325973369, + "grad_norm": 1.3840193028939571, + "learning_rate": 1.6035085974452913e-09, + "loss": 0.1108, + "step": 14720 + }, + { + "epoch": 4.96308781392213, + "grad_norm": 1.5144172111749752, + "learning_rate": 1.4682608879479366e-09, + "loss": 0.1168, + "step": 14725 + }, + { + "epoch": 4.964773301870892, + "grad_norm": 1.276551146771887, + "learning_rate": 1.3389700870952705e-09, + "loss": 0.1021, + "step": 14730 + }, + { + "epoch": 4.966458789819653, + "grad_norm": 1.3137608821564721, + "learning_rate": 1.21563634896793e-09, + "loss": 0.1119, + "step": 14735 + }, + { + "epoch": 4.968144277768414, + "grad_norm": 1.1871070566357258, + "learning_rate": 1.0982598205461215e-09, + "loss": 0.1086, + "step": 14740 + }, + { + "epoch": 4.969829765717175, + "grad_norm": 1.1932765768001778, + "learning_rate": 9.868406417118391e-10, + "loss": 0.1001, + "step": 14745 + }, + { + "epoch": 4.971515253665936, + "grad_norm": 1.2855010367496051, + "learning_rate": 8.81378945246647e-10, + "loss": 0.0962, + "step": 14750 + }, + { + "epoch": 4.973200741614697, + "grad_norm": 1.3684311123014772, + "learning_rate": 7.818748568322321e-10, + "loss": 0.109, + "step": 14755 + }, + { + "epoch": 4.974886229563459, + "grad_norm": 1.1802664955924085, + "learning_rate": 6.883284950509606e-10, + "loss": 0.0986, + "step": 14760 + }, + { + "epoch": 4.97657171751222, + "grad_norm": 1.2218188299467154, + "learning_rate": 6.007399713853224e-10, + "loss": 0.0967, + "step": 14765 + }, + { + "epoch": 4.978257205460981, + "grad_norm": 1.3842376768427533, + "learning_rate": 5.191093902168209e-10, + "loss": 0.1086, + "step": 14770 + }, + { + "epoch": 4.979942693409742, + "grad_norm": 1.4366177776147933, + "learning_rate": 4.434368488276386e-10, + "loss": 0.1103, + "step": 14775 + }, + { + "epoch": 4.981628181358503, + "grad_norm": 1.0757505610798177, + "learning_rate": 3.737224373989712e-10, + "loss": 0.1016, + "step": 14780 + }, + { + "epoch": 4.983313669307265, + "grad_norm": 1.2893166566817769, + "learning_rate": 3.099662390115832e-10, + "loss": 0.1141, + "step": 14785 + }, + { + "epoch": 4.984999157256025, + "grad_norm": 1.5323418018912287, + "learning_rate": 2.5216832964580775e-10, + "loss": 0.0948, + "step": 14790 + }, + { + "epoch": 4.986684645204787, + "grad_norm": 1.3496620760339086, + "learning_rate": 2.003287781809915e-10, + "loss": 0.1029, + "step": 14795 + }, + { + "epoch": 4.988370133153548, + "grad_norm": 1.3974048630503693, + "learning_rate": 1.5444764639660492e-10, + "loss": 0.1074, + "step": 14800 + }, + { + "epoch": 4.990055621102309, + "grad_norm": 1.6672607871290943, + "learning_rate": 1.1452498897057685e-10, + "loss": 0.1034, + "step": 14805 + }, + { + "epoch": 4.991741109051071, + "grad_norm": 1.2365419280318917, + "learning_rate": 8.056085347929454e-11, + "loss": 0.0827, + "step": 14810 + }, + { + "epoch": 4.993426596999831, + "grad_norm": 1.427640809943521, + "learning_rate": 5.255528039926905e-11, + "loss": 0.1091, + "step": 14815 + }, + { + "epoch": 4.995112084948593, + "grad_norm": 1.2181763677593134, + "learning_rate": 3.050830310602493e-11, + "loss": 0.0944, + "step": 14820 + }, + { + "epoch": 4.9967975728973535, + "grad_norm": 1.342276220471211, + "learning_rate": 1.441994787299006e-11, + "loss": 0.0939, + "step": 14825 + }, + { + "epoch": 4.998483060846115, + "grad_norm": 0.9877557358399789, + "learning_rate": 4.2902338737160765e-12, + "loss": 0.0938, + "step": 14830 + }, + { + "epoch": 5.0, + "grad_norm": 2.056041710199563, + "learning_rate": 1.1917317965792764e-13, + "loss": 0.1204, + "step": 14835 + }, + { + "epoch": 5.0, + "step": 14835, + "total_flos": 2.8140820020854784e+16, + "train_loss": 0.23571134310078853, + "train_runtime": 105488.1838, + "train_samples_per_second": 71.99, + "train_steps_per_second": 0.141 + } + ], + "logging_steps": 5, + "max_steps": 14835, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8140820020854784e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}