{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 14835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016854879487611663, "grad_norm": 45.635078873413356, "learning_rate": 8.968609865470853e-08, "loss": 2.018, "step": 5 }, { "epoch": 0.0033709758975223325, "grad_norm": 54.815981003563216, "learning_rate": 2.0179372197309417e-07, "loss": 1.9624, "step": 10 }, { "epoch": 0.005056463846283499, "grad_norm": 47.342183607767005, "learning_rate": 3.1390134529147985e-07, "loss": 1.9596, "step": 15 }, { "epoch": 0.006741951795044665, "grad_norm": 58.86327972981608, "learning_rate": 4.2600896860986547e-07, "loss": 1.8431, "step": 20 }, { "epoch": 0.008427439743805831, "grad_norm": 46.738743972646034, "learning_rate": 5.381165919282512e-07, "loss": 1.5319, "step": 25 }, { "epoch": 0.010112927692566998, "grad_norm": 20.284827182581076, "learning_rate": 6.502242152466367e-07, "loss": 1.2438, "step": 30 }, { "epoch": 0.011798415641328164, "grad_norm": 12.172972360337889, "learning_rate": 7.623318385650225e-07, "loss": 1.0174, "step": 35 }, { "epoch": 0.01348390359008933, "grad_norm": 9.808958694438337, "learning_rate": 8.744394618834082e-07, "loss": 0.9619, "step": 40 }, { "epoch": 0.015169391538850498, "grad_norm": 5.846992899686357, "learning_rate": 9.865470852017938e-07, "loss": 0.936, "step": 45 }, { "epoch": 0.016854879487611663, "grad_norm": 4.407895578972216, "learning_rate": 1.0986547085201794e-06, "loss": 0.7907, "step": 50 }, { "epoch": 0.01854036743637283, "grad_norm": 4.4229026540725895, "learning_rate": 1.2107623318385651e-06, "loss": 0.8571, "step": 55 }, { "epoch": 0.020225855385133995, "grad_norm": 3.193351753212815, "learning_rate": 1.3228699551569508e-06, "loss": 0.755, "step": 60 }, { "epoch": 0.02191134333389516, "grad_norm": 3.897422918904646, "learning_rate": 1.4349775784753365e-06, "loss": 0.7606, "step": 65 }, { "epoch": 0.023596831282656328, "grad_norm": 2.920303290471853, "learning_rate": 1.5470852017937221e-06, "loss": 0.7553, "step": 70 }, { "epoch": 0.025282319231417494, "grad_norm": 3.177702608453161, "learning_rate": 1.6591928251121078e-06, "loss": 0.7082, "step": 75 }, { "epoch": 0.02696780718017866, "grad_norm": 3.8072115047524786, "learning_rate": 1.7713004484304935e-06, "loss": 0.7497, "step": 80 }, { "epoch": 0.02865329512893983, "grad_norm": 4.215753700800818, "learning_rate": 1.883408071748879e-06, "loss": 0.6929, "step": 85 }, { "epoch": 0.030338783077700996, "grad_norm": 3.120813016518093, "learning_rate": 1.9955156950672647e-06, "loss": 0.6795, "step": 90 }, { "epoch": 0.03202427102646216, "grad_norm": 3.7925518313582076, "learning_rate": 2.1076233183856503e-06, "loss": 0.6336, "step": 95 }, { "epoch": 0.033709758975223325, "grad_norm": 3.9090323792139565, "learning_rate": 2.219730941704036e-06, "loss": 0.6241, "step": 100 }, { "epoch": 0.03539524692398449, "grad_norm": 3.2401672745432233, "learning_rate": 2.3318385650224217e-06, "loss": 0.5761, "step": 105 }, { "epoch": 0.03708073487274566, "grad_norm": 2.6831203132181436, "learning_rate": 2.4439461883408074e-06, "loss": 0.5994, "step": 110 }, { "epoch": 0.038766222821506824, "grad_norm": 3.1421487063706506, "learning_rate": 2.556053811659193e-06, "loss": 0.5819, "step": 115 }, { "epoch": 0.04045171077026799, "grad_norm": 3.05063467620266, "learning_rate": 2.6681614349775787e-06, "loss": 0.6341, "step": 120 }, { "epoch": 0.042137198719029156, "grad_norm": 3.0042373128046185, "learning_rate": 2.7802690582959644e-06, "loss": 0.5358, "step": 125 }, { "epoch": 0.04382268666779032, "grad_norm": 2.6261508492463865, "learning_rate": 2.89237668161435e-06, "loss": 0.5549, "step": 130 }, { "epoch": 0.04550817461655149, "grad_norm": 2.438153401729035, "learning_rate": 3.0044843049327353e-06, "loss": 0.5276, "step": 135 }, { "epoch": 0.047193662565312655, "grad_norm": 3.161425452337587, "learning_rate": 3.1165919282511214e-06, "loss": 0.5024, "step": 140 }, { "epoch": 0.04887915051407382, "grad_norm": 2.724770652328943, "learning_rate": 3.2286995515695067e-06, "loss": 0.502, "step": 145 }, { "epoch": 0.05056463846283499, "grad_norm": 3.501981992101439, "learning_rate": 3.340807174887893e-06, "loss": 0.4506, "step": 150 }, { "epoch": 0.052250126411596154, "grad_norm": 3.643587579316495, "learning_rate": 3.4529147982062785e-06, "loss": 0.4734, "step": 155 }, { "epoch": 0.05393561436035732, "grad_norm": 3.2491410588044594, "learning_rate": 3.5650224215246637e-06, "loss": 0.47, "step": 160 }, { "epoch": 0.055621102309118486, "grad_norm": 2.4074659243017456, "learning_rate": 3.67713004484305e-06, "loss": 0.4135, "step": 165 }, { "epoch": 0.05730659025787966, "grad_norm": 4.057738750671516, "learning_rate": 3.789237668161435e-06, "loss": 0.4755, "step": 170 }, { "epoch": 0.058992078206640826, "grad_norm": 3.0465958363209715, "learning_rate": 3.901345291479821e-06, "loss": 0.4529, "step": 175 }, { "epoch": 0.06067756615540199, "grad_norm": 2.95941371221577, "learning_rate": 4.0134529147982064e-06, "loss": 0.453, "step": 180 }, { "epoch": 0.06236305410416316, "grad_norm": 3.6181830736645773, "learning_rate": 4.125560538116592e-06, "loss": 0.4474, "step": 185 }, { "epoch": 0.06404854205292432, "grad_norm": 3.390695544104659, "learning_rate": 4.237668161434978e-06, "loss": 0.4435, "step": 190 }, { "epoch": 0.06573403000168548, "grad_norm": 3.283255105498521, "learning_rate": 4.3497757847533635e-06, "loss": 0.4575, "step": 195 }, { "epoch": 0.06741951795044665, "grad_norm": 5.873175991018478, "learning_rate": 4.461883408071749e-06, "loss": 0.4202, "step": 200 }, { "epoch": 0.06910500589920782, "grad_norm": 3.032967552769786, "learning_rate": 4.573991031390135e-06, "loss": 0.4152, "step": 205 }, { "epoch": 0.07079049384796898, "grad_norm": 4.450464722208425, "learning_rate": 4.6860986547085205e-06, "loss": 0.3909, "step": 210 }, { "epoch": 0.07247598179673015, "grad_norm": 2.6249909571371406, "learning_rate": 4.798206278026906e-06, "loss": 0.3833, "step": 215 }, { "epoch": 0.07416146974549132, "grad_norm": 2.7675155415343666, "learning_rate": 4.910313901345292e-06, "loss": 0.4347, "step": 220 }, { "epoch": 0.07584695769425248, "grad_norm": 2.705989833169705, "learning_rate": 5.0224215246636775e-06, "loss": 0.4053, "step": 225 }, { "epoch": 0.07753244564301365, "grad_norm": 2.593711819126537, "learning_rate": 5.134529147982063e-06, "loss": 0.4229, "step": 230 }, { "epoch": 0.07921793359177481, "grad_norm": 2.4338770361517734, "learning_rate": 5.246636771300448e-06, "loss": 0.4162, "step": 235 }, { "epoch": 0.08090342154053598, "grad_norm": 3.003588069957525, "learning_rate": 5.358744394618835e-06, "loss": 0.3861, "step": 240 }, { "epoch": 0.08258890948929715, "grad_norm": 2.1502174031404246, "learning_rate": 5.47085201793722e-06, "loss": 0.4081, "step": 245 }, { "epoch": 0.08427439743805831, "grad_norm": 3.6222165866591824, "learning_rate": 5.582959641255605e-06, "loss": 0.4212, "step": 250 }, { "epoch": 0.08595988538681948, "grad_norm": 2.488014950380137, "learning_rate": 5.695067264573992e-06, "loss": 0.4316, "step": 255 }, { "epoch": 0.08764537333558065, "grad_norm": 2.035292151113688, "learning_rate": 5.807174887892377e-06, "loss": 0.4135, "step": 260 }, { "epoch": 0.08933086128434181, "grad_norm": 1.971410200056052, "learning_rate": 5.919282511210763e-06, "loss": 0.4468, "step": 265 }, { "epoch": 0.09101634923310298, "grad_norm": 2.3089000226070056, "learning_rate": 6.031390134529148e-06, "loss": 0.4009, "step": 270 }, { "epoch": 0.09270183718186414, "grad_norm": 2.612380102704638, "learning_rate": 6.143497757847534e-06, "loss": 0.4016, "step": 275 }, { "epoch": 0.09438732513062531, "grad_norm": 2.3993241428609906, "learning_rate": 6.25560538116592e-06, "loss": 0.4309, "step": 280 }, { "epoch": 0.09607281307938648, "grad_norm": 4.233998484786202, "learning_rate": 6.367713004484305e-06, "loss": 0.4054, "step": 285 }, { "epoch": 0.09775830102814764, "grad_norm": 2.322719482657519, "learning_rate": 6.479820627802691e-06, "loss": 0.4196, "step": 290 }, { "epoch": 0.09944378897690881, "grad_norm": 5.679288461330974, "learning_rate": 6.591928251121077e-06, "loss": 0.4146, "step": 295 }, { "epoch": 0.10112927692566998, "grad_norm": 4.9794233931360425, "learning_rate": 6.704035874439463e-06, "loss": 0.4195, "step": 300 }, { "epoch": 0.10281476487443114, "grad_norm": 1.9120380706775335, "learning_rate": 6.8161434977578476e-06, "loss": 0.4317, "step": 305 }, { "epoch": 0.10450025282319231, "grad_norm": 2.044216254914569, "learning_rate": 6.928251121076234e-06, "loss": 0.4266, "step": 310 }, { "epoch": 0.10618574077195347, "grad_norm": 1.9008300446356923, "learning_rate": 7.04035874439462e-06, "loss": 0.3963, "step": 315 }, { "epoch": 0.10787122872071464, "grad_norm": 2.0936197065546254, "learning_rate": 7.152466367713005e-06, "loss": 0.396, "step": 320 }, { "epoch": 0.1095567166694758, "grad_norm": 2.504640994538501, "learning_rate": 7.26457399103139e-06, "loss": 0.3755, "step": 325 }, { "epoch": 0.11124220461823697, "grad_norm": 2.0338356261514616, "learning_rate": 7.376681614349777e-06, "loss": 0.397, "step": 330 }, { "epoch": 0.11292769256699814, "grad_norm": 2.07093018887604, "learning_rate": 7.4887892376681625e-06, "loss": 0.3631, "step": 335 }, { "epoch": 0.11461318051575932, "grad_norm": 2.4353106198358168, "learning_rate": 7.600896860986547e-06, "loss": 0.3918, "step": 340 }, { "epoch": 0.11629866846452049, "grad_norm": 1.884231089705885, "learning_rate": 7.713004484304933e-06, "loss": 0.3803, "step": 345 }, { "epoch": 0.11798415641328165, "grad_norm": 2.5897406971523105, "learning_rate": 7.825112107623319e-06, "loss": 0.3917, "step": 350 }, { "epoch": 0.11966964436204282, "grad_norm": 2.307975389115971, "learning_rate": 7.937219730941704e-06, "loss": 0.3866, "step": 355 }, { "epoch": 0.12135513231080398, "grad_norm": 2.467091283833354, "learning_rate": 8.04932735426009e-06, "loss": 0.4249, "step": 360 }, { "epoch": 0.12304062025956515, "grad_norm": 2.0244456034927913, "learning_rate": 8.161434977578476e-06, "loss": 0.4185, "step": 365 }, { "epoch": 0.12472610820832632, "grad_norm": 2.4621737674260573, "learning_rate": 8.273542600896861e-06, "loss": 0.3617, "step": 370 }, { "epoch": 0.12641159615708747, "grad_norm": 2.1008353268942312, "learning_rate": 8.385650224215247e-06, "loss": 0.3871, "step": 375 }, { "epoch": 0.12809708410584864, "grad_norm": 2.3606333816070326, "learning_rate": 8.497757847533633e-06, "loss": 0.3869, "step": 380 }, { "epoch": 0.1297825720546098, "grad_norm": 1.7221519588815626, "learning_rate": 8.609865470852018e-06, "loss": 0.433, "step": 385 }, { "epoch": 0.13146806000337097, "grad_norm": 2.4647014732321466, "learning_rate": 8.721973094170404e-06, "loss": 0.4185, "step": 390 }, { "epoch": 0.13315354795213213, "grad_norm": 3.6377876835528564, "learning_rate": 8.83408071748879e-06, "loss": 0.42, "step": 395 }, { "epoch": 0.1348390359008933, "grad_norm": 2.4800482154662538, "learning_rate": 8.946188340807175e-06, "loss": 0.4019, "step": 400 }, { "epoch": 0.13652452384965447, "grad_norm": 1.932932709959026, "learning_rate": 9.058295964125561e-06, "loss": 0.3937, "step": 405 }, { "epoch": 0.13821001179841563, "grad_norm": 2.459462459379769, "learning_rate": 9.170403587443947e-06, "loss": 0.3757, "step": 410 }, { "epoch": 0.1398954997471768, "grad_norm": 2.634402740022879, "learning_rate": 9.282511210762332e-06, "loss": 0.4211, "step": 415 }, { "epoch": 0.14158098769593797, "grad_norm": 2.302486918235084, "learning_rate": 9.394618834080718e-06, "loss": 0.4243, "step": 420 }, { "epoch": 0.14326647564469913, "grad_norm": 2.699871424522218, "learning_rate": 9.506726457399104e-06, "loss": 0.4298, "step": 425 }, { "epoch": 0.1449519635934603, "grad_norm": 2.5220003454274806, "learning_rate": 9.61883408071749e-06, "loss": 0.4489, "step": 430 }, { "epoch": 0.14663745154222146, "grad_norm": 2.3013583855642326, "learning_rate": 9.730941704035875e-06, "loss": 0.4052, "step": 435 }, { "epoch": 0.14832293949098263, "grad_norm": 1.8343159139166734, "learning_rate": 9.843049327354261e-06, "loss": 0.4012, "step": 440 }, { "epoch": 0.1500084274397438, "grad_norm": 1.878587520427869, "learning_rate": 9.955156950672647e-06, "loss": 0.3653, "step": 445 }, { "epoch": 0.15169391538850496, "grad_norm": 2.9319680956052454, "learning_rate": 9.999998927441416e-06, "loss": 0.4264, "step": 450 }, { "epoch": 0.15337940333726613, "grad_norm": 2.6134158541749732, "learning_rate": 9.999992372918407e-06, "loss": 0.4095, "step": 455 }, { "epoch": 0.1550648912860273, "grad_norm": 3.832484233411444, "learning_rate": 9.999979859746068e-06, "loss": 0.3817, "step": 460 }, { "epoch": 0.15675037923478846, "grad_norm": 2.3653735728210186, "learning_rate": 9.999961387939312e-06, "loss": 0.4119, "step": 465 }, { "epoch": 0.15843586718354963, "grad_norm": 4.4364055824071915, "learning_rate": 9.999936957520153e-06, "loss": 0.4003, "step": 470 }, { "epoch": 0.1601213551323108, "grad_norm": 35.14512330840862, "learning_rate": 9.999906568517708e-06, "loss": 0.3873, "step": 475 }, { "epoch": 0.16180684308107196, "grad_norm": 1.7048667253980463, "learning_rate": 9.999870220968187e-06, "loss": 0.3879, "step": 480 }, { "epoch": 0.16349233102983313, "grad_norm": 3.3372208072716085, "learning_rate": 9.99982791491491e-06, "loss": 0.4556, "step": 485 }, { "epoch": 0.1651778189785943, "grad_norm": 2.554132462618261, "learning_rate": 9.999779650408294e-06, "loss": 0.3869, "step": 490 }, { "epoch": 0.16686330692735546, "grad_norm": 1.9937809824466266, "learning_rate": 9.999725427505858e-06, "loss": 0.3923, "step": 495 }, { "epoch": 0.16854879487611663, "grad_norm": 2.548047443247034, "learning_rate": 9.999665246272222e-06, "loss": 0.3806, "step": 500 }, { "epoch": 0.1702342828248778, "grad_norm": 2.2010906932226457, "learning_rate": 9.999599106779102e-06, "loss": 0.3966, "step": 505 }, { "epoch": 0.17191977077363896, "grad_norm": 1.828420796921307, "learning_rate": 9.999527009105322e-06, "loss": 0.3922, "step": 510 }, { "epoch": 0.17360525872240012, "grad_norm": 1.8298755244344242, "learning_rate": 9.999448953336801e-06, "loss": 0.4076, "step": 515 }, { "epoch": 0.1752907466711613, "grad_norm": 2.5032192982353183, "learning_rate": 9.999364939566563e-06, "loss": 0.3873, "step": 520 }, { "epoch": 0.17697623461992246, "grad_norm": 3.012693421833828, "learning_rate": 9.999274967894728e-06, "loss": 0.354, "step": 525 }, { "epoch": 0.17866172256868362, "grad_norm": 1.6295732315281872, "learning_rate": 9.999179038428518e-06, "loss": 0.4048, "step": 530 }, { "epoch": 0.1803472105174448, "grad_norm": 2.5804997377771266, "learning_rate": 9.999077151282255e-06, "loss": 0.3822, "step": 535 }, { "epoch": 0.18203269846620596, "grad_norm": 1.9847562423733942, "learning_rate": 9.998969306577364e-06, "loss": 0.3847, "step": 540 }, { "epoch": 0.18371818641496712, "grad_norm": 2.004242607918728, "learning_rate": 9.998855504442363e-06, "loss": 0.406, "step": 545 }, { "epoch": 0.1854036743637283, "grad_norm": 1.7878967975539106, "learning_rate": 9.998735745012876e-06, "loss": 0.3858, "step": 550 }, { "epoch": 0.18708916231248945, "grad_norm": 2.1773847589443522, "learning_rate": 9.998610028431622e-06, "loss": 0.4115, "step": 555 }, { "epoch": 0.18877465026125062, "grad_norm": 2.1881165596445347, "learning_rate": 9.998478354848425e-06, "loss": 0.4017, "step": 560 }, { "epoch": 0.1904601382100118, "grad_norm": 2.3909666157372187, "learning_rate": 9.998340724420202e-06, "loss": 0.3909, "step": 565 }, { "epoch": 0.19214562615877295, "grad_norm": 3.8789092474821865, "learning_rate": 9.998197137310972e-06, "loss": 0.4149, "step": 570 }, { "epoch": 0.19383111410753412, "grad_norm": 2.8013687374027234, "learning_rate": 9.99804759369185e-06, "loss": 0.3584, "step": 575 }, { "epoch": 0.19551660205629529, "grad_norm": 2.03183171805555, "learning_rate": 9.997892093741058e-06, "loss": 0.3531, "step": 580 }, { "epoch": 0.19720209000505645, "grad_norm": 2.1752647617401273, "learning_rate": 9.997730637643904e-06, "loss": 0.4119, "step": 585 }, { "epoch": 0.19888757795381762, "grad_norm": 2.1615427772313223, "learning_rate": 9.997563225592803e-06, "loss": 0.3716, "step": 590 }, { "epoch": 0.20057306590257878, "grad_norm": 2.9336226572334425, "learning_rate": 9.997389857787266e-06, "loss": 0.3913, "step": 595 }, { "epoch": 0.20225855385133995, "grad_norm": 1.613836729716596, "learning_rate": 9.997210534433899e-06, "loss": 0.3838, "step": 600 }, { "epoch": 0.20394404180010112, "grad_norm": 1.8362314282533876, "learning_rate": 9.997025255746409e-06, "loss": 0.4097, "step": 605 }, { "epoch": 0.20562952974886228, "grad_norm": 2.172122875659069, "learning_rate": 9.996834021945599e-06, "loss": 0.4342, "step": 610 }, { "epoch": 0.20731501769762345, "grad_norm": 1.6195979347911462, "learning_rate": 9.996636833259365e-06, "loss": 0.4161, "step": 615 }, { "epoch": 0.20900050564638462, "grad_norm": 2.2138247618272477, "learning_rate": 9.996433689922705e-06, "loss": 0.3973, "step": 620 }, { "epoch": 0.21068599359514578, "grad_norm": 2.009981169025572, "learning_rate": 9.996224592177713e-06, "loss": 0.4017, "step": 625 }, { "epoch": 0.21237148154390695, "grad_norm": 1.9803622020128537, "learning_rate": 9.996009540273574e-06, "loss": 0.3629, "step": 630 }, { "epoch": 0.21405696949266811, "grad_norm": 1.9702872751272344, "learning_rate": 9.995788534466576e-06, "loss": 0.3765, "step": 635 }, { "epoch": 0.21574245744142928, "grad_norm": 2.026744637961682, "learning_rate": 9.995561575020096e-06, "loss": 0.3779, "step": 640 }, { "epoch": 0.21742794539019045, "grad_norm": 2.356624382171085, "learning_rate": 9.995328662204609e-06, "loss": 0.3791, "step": 645 }, { "epoch": 0.2191134333389516, "grad_norm": 1.5683166625062384, "learning_rate": 9.995089796297686e-06, "loss": 0.3977, "step": 650 }, { "epoch": 0.22079892128771278, "grad_norm": 2.101603290295796, "learning_rate": 9.994844977583989e-06, "loss": 0.3855, "step": 655 }, { "epoch": 0.22248440923647395, "grad_norm": 1.7565292897025453, "learning_rate": 9.994594206355277e-06, "loss": 0.3956, "step": 660 }, { "epoch": 0.2241698971852351, "grad_norm": 3.10293180125714, "learning_rate": 9.994337482910403e-06, "loss": 0.4191, "step": 665 }, { "epoch": 0.22585538513399628, "grad_norm": 1.6005337791639043, "learning_rate": 9.994074807555312e-06, "loss": 0.366, "step": 670 }, { "epoch": 0.22754087308275747, "grad_norm": 15.746106890836087, "learning_rate": 9.993806180603042e-06, "loss": 0.366, "step": 675 }, { "epoch": 0.22922636103151864, "grad_norm": 2.3584243465745467, "learning_rate": 9.993531602373725e-06, "loss": 0.3551, "step": 680 }, { "epoch": 0.2309118489802798, "grad_norm": 2.911460621086186, "learning_rate": 9.993251073194582e-06, "loss": 0.3765, "step": 685 }, { "epoch": 0.23259733692904097, "grad_norm": 1.918519286748289, "learning_rate": 9.992964593399935e-06, "loss": 0.4172, "step": 690 }, { "epoch": 0.23428282487780214, "grad_norm": 1.7248234236040525, "learning_rate": 9.992672163331183e-06, "loss": 0.3669, "step": 695 }, { "epoch": 0.2359683128265633, "grad_norm": 1.8252665725881347, "learning_rate": 9.992373783336829e-06, "loss": 0.4019, "step": 700 }, { "epoch": 0.23765380077532447, "grad_norm": 1.624513733101269, "learning_rate": 9.992069453772462e-06, "loss": 0.3597, "step": 705 }, { "epoch": 0.23933928872408564, "grad_norm": 1.5577357173237931, "learning_rate": 9.991759175000759e-06, "loss": 0.4212, "step": 710 }, { "epoch": 0.2410247766728468, "grad_norm": 2.1786930696130327, "learning_rate": 9.99144294739149e-06, "loss": 0.3955, "step": 715 }, { "epoch": 0.24271026462160797, "grad_norm": 1.6246094297359752, "learning_rate": 9.991120771321513e-06, "loss": 0.4389, "step": 720 }, { "epoch": 0.24439575257036913, "grad_norm": 8.408346377550648, "learning_rate": 9.990792647174777e-06, "loss": 0.3853, "step": 725 }, { "epoch": 0.2460812405191303, "grad_norm": 1.595421372399142, "learning_rate": 9.990458575342315e-06, "loss": 0.3976, "step": 730 }, { "epoch": 0.24776672846789147, "grad_norm": 4.514848965218263, "learning_rate": 9.990118556222254e-06, "loss": 0.384, "step": 735 }, { "epoch": 0.24945221641665263, "grad_norm": 2.218357541423892, "learning_rate": 9.989772590219805e-06, "loss": 0.3979, "step": 740 }, { "epoch": 0.25113770436541377, "grad_norm": 1.6358343818606282, "learning_rate": 9.989420677747266e-06, "loss": 0.3754, "step": 745 }, { "epoch": 0.25282319231417494, "grad_norm": 1.6530000481044254, "learning_rate": 9.98906281922402e-06, "loss": 0.4156, "step": 750 }, { "epoch": 0.2545086802629361, "grad_norm": 1.9639962472652386, "learning_rate": 9.988699015076545e-06, "loss": 0.3869, "step": 755 }, { "epoch": 0.25619416821169727, "grad_norm": 2.7981924236148674, "learning_rate": 9.98832926573839e-06, "loss": 0.3844, "step": 760 }, { "epoch": 0.25787965616045844, "grad_norm": 1.7533436277070946, "learning_rate": 9.987953571650201e-06, "loss": 0.3883, "step": 765 }, { "epoch": 0.2595651441092196, "grad_norm": 2.4253975139504873, "learning_rate": 9.987571933259705e-06, "loss": 0.4124, "step": 770 }, { "epoch": 0.26125063205798077, "grad_norm": 1.5158341449175288, "learning_rate": 9.98718435102171e-06, "loss": 0.3703, "step": 775 }, { "epoch": 0.26293612000674194, "grad_norm": 1.5959370141299707, "learning_rate": 9.986790825398113e-06, "loss": 0.3745, "step": 780 }, { "epoch": 0.2646216079555031, "grad_norm": 2.34082579635864, "learning_rate": 9.98639135685789e-06, "loss": 0.3761, "step": 785 }, { "epoch": 0.26630709590426427, "grad_norm": 3.467698805677363, "learning_rate": 9.985985945877099e-06, "loss": 0.3659, "step": 790 }, { "epoch": 0.26799258385302543, "grad_norm": 2.8517981978610885, "learning_rate": 9.985574592938883e-06, "loss": 0.3812, "step": 795 }, { "epoch": 0.2696780718017866, "grad_norm": 5.14428705280153, "learning_rate": 9.985157298533463e-06, "loss": 0.414, "step": 800 }, { "epoch": 0.27136355975054777, "grad_norm": 1.8396394543962131, "learning_rate": 9.984734063158142e-06, "loss": 0.3949, "step": 805 }, { "epoch": 0.27304904769930893, "grad_norm": 2.106355774259703, "learning_rate": 9.984304887317304e-06, "loss": 0.371, "step": 810 }, { "epoch": 0.2747345356480701, "grad_norm": 2.015825945564459, "learning_rate": 9.983869771522411e-06, "loss": 0.3947, "step": 815 }, { "epoch": 0.27642002359683127, "grad_norm": 1.9906050791876728, "learning_rate": 9.983428716292002e-06, "loss": 0.3896, "step": 820 }, { "epoch": 0.27810551154559243, "grad_norm": 1.6262166951573869, "learning_rate": 9.9829817221517e-06, "loss": 0.3593, "step": 825 }, { "epoch": 0.2797909994943536, "grad_norm": 1.9751342061528858, "learning_rate": 9.982528789634203e-06, "loss": 0.3721, "step": 830 }, { "epoch": 0.28147648744311476, "grad_norm": 2.2090109354710075, "learning_rate": 9.98206991927928e-06, "loss": 0.382, "step": 835 }, { "epoch": 0.28316197539187593, "grad_norm": 2.0014148005086265, "learning_rate": 9.981605111633785e-06, "loss": 0.3788, "step": 840 }, { "epoch": 0.2848474633406371, "grad_norm": 1.9487777824799606, "learning_rate": 9.981134367251644e-06, "loss": 0.3848, "step": 845 }, { "epoch": 0.28653295128939826, "grad_norm": 1.7877424705020015, "learning_rate": 9.980657686693856e-06, "loss": 0.3661, "step": 850 }, { "epoch": 0.28821843923815943, "grad_norm": 1.8189700902269383, "learning_rate": 9.980175070528496e-06, "loss": 0.3997, "step": 855 }, { "epoch": 0.2899039271869206, "grad_norm": 1.7885705129483607, "learning_rate": 9.979686519330715e-06, "loss": 0.4111, "step": 860 }, { "epoch": 0.29158941513568176, "grad_norm": 2.0788763558907988, "learning_rate": 9.979192033682737e-06, "loss": 0.3867, "step": 865 }, { "epoch": 0.29327490308444293, "grad_norm": 2.553827465281633, "learning_rate": 9.97869161417385e-06, "loss": 0.3748, "step": 870 }, { "epoch": 0.2949603910332041, "grad_norm": 2.5115523800996615, "learning_rate": 9.978185261400423e-06, "loss": 0.4072, "step": 875 }, { "epoch": 0.29664587898196526, "grad_norm": 2.1087588940179467, "learning_rate": 9.977672975965895e-06, "loss": 0.4132, "step": 880 }, { "epoch": 0.2983313669307264, "grad_norm": 1.970566507249276, "learning_rate": 9.977154758480771e-06, "loss": 0.3851, "step": 885 }, { "epoch": 0.3000168548794876, "grad_norm": 1.741373008703597, "learning_rate": 9.976630609562626e-06, "loss": 0.386, "step": 890 }, { "epoch": 0.30170234282824876, "grad_norm": 2.154351386848767, "learning_rate": 9.976100529836106e-06, "loss": 0.4082, "step": 895 }, { "epoch": 0.3033878307770099, "grad_norm": 2.9479101759832638, "learning_rate": 9.975564519932922e-06, "loss": 0.3882, "step": 900 }, { "epoch": 0.3050733187257711, "grad_norm": 1.9012524201786816, "learning_rate": 9.975022580491859e-06, "loss": 0.3836, "step": 905 }, { "epoch": 0.30675880667453226, "grad_norm": 1.659302263817562, "learning_rate": 9.974474712158757e-06, "loss": 0.3788, "step": 910 }, { "epoch": 0.3084442946232934, "grad_norm": 6.390788623860543, "learning_rate": 9.973920915586533e-06, "loss": 0.3849, "step": 915 }, { "epoch": 0.3101297825720546, "grad_norm": 2.646923190514544, "learning_rate": 9.97336119143516e-06, "loss": 0.4239, "step": 920 }, { "epoch": 0.31181527052081576, "grad_norm": 1.692234615605573, "learning_rate": 9.972795540371682e-06, "loss": 0.4225, "step": 925 }, { "epoch": 0.3135007584695769, "grad_norm": 2.6257286681434158, "learning_rate": 9.972223963070202e-06, "loss": 0.3833, "step": 930 }, { "epoch": 0.3151862464183381, "grad_norm": 1.4468130664510528, "learning_rate": 9.971646460211888e-06, "loss": 0.3817, "step": 935 }, { "epoch": 0.31687173436709926, "grad_norm": 2.9343853277028225, "learning_rate": 9.971063032484966e-06, "loss": 0.3818, "step": 940 }, { "epoch": 0.3185572223158604, "grad_norm": 2.874297049113011, "learning_rate": 9.970473680584729e-06, "loss": 0.3912, "step": 945 }, { "epoch": 0.3202427102646216, "grad_norm": 2.3679632099037353, "learning_rate": 9.969878405213522e-06, "loss": 0.3451, "step": 950 }, { "epoch": 0.32192819821338275, "grad_norm": 1.8485431472741025, "learning_rate": 9.969277207080757e-06, "loss": 0.3826, "step": 955 }, { "epoch": 0.3236136861621439, "grad_norm": 1.8160514122580558, "learning_rate": 9.968670086902898e-06, "loss": 0.4119, "step": 960 }, { "epoch": 0.3252991741109051, "grad_norm": 1.6310654165501828, "learning_rate": 9.968057045403473e-06, "loss": 0.3888, "step": 965 }, { "epoch": 0.32698466205966625, "grad_norm": 1.9293470572012084, "learning_rate": 9.967438083313057e-06, "loss": 0.3776, "step": 970 }, { "epoch": 0.3286701500084274, "grad_norm": 2.064165266260474, "learning_rate": 9.966813201369294e-06, "loss": 0.4023, "step": 975 }, { "epoch": 0.3303556379571886, "grad_norm": 1.6291463422904702, "learning_rate": 9.96618240031687e-06, "loss": 0.3711, "step": 980 }, { "epoch": 0.33204112590594975, "grad_norm": 1.4760897650643887, "learning_rate": 9.965545680907534e-06, "loss": 0.372, "step": 985 }, { "epoch": 0.3337266138547109, "grad_norm": 1.5516691875793427, "learning_rate": 9.964903043900085e-06, "loss": 0.379, "step": 990 }, { "epoch": 0.3354121018034721, "grad_norm": 1.7123969322933874, "learning_rate": 9.96425449006037e-06, "loss": 0.3717, "step": 995 }, { "epoch": 0.33709758975223325, "grad_norm": 2.1244926392838357, "learning_rate": 9.963600020161294e-06, "loss": 0.3504, "step": 1000 }, { "epoch": 0.3387830777009944, "grad_norm": 3.7906827553852067, "learning_rate": 9.962939634982808e-06, "loss": 0.3701, "step": 1005 }, { "epoch": 0.3404685656497556, "grad_norm": 1.869057502602457, "learning_rate": 9.962273335311918e-06, "loss": 0.3632, "step": 1010 }, { "epoch": 0.34215405359851675, "grad_norm": 3.366608851411368, "learning_rate": 9.961601121942667e-06, "loss": 0.3724, "step": 1015 }, { "epoch": 0.3438395415472779, "grad_norm": 4.54722354217597, "learning_rate": 9.96092299567616e-06, "loss": 0.4175, "step": 1020 }, { "epoch": 0.3455250294960391, "grad_norm": 1.5304066770240292, "learning_rate": 9.96023895732054e-06, "loss": 0.3698, "step": 1025 }, { "epoch": 0.34721051744480025, "grad_norm": 1.6504649468386736, "learning_rate": 9.959549007690996e-06, "loss": 0.3713, "step": 1030 }, { "epoch": 0.3488960053935614, "grad_norm": 1.9237813388224598, "learning_rate": 9.958853147609762e-06, "loss": 0.3711, "step": 1035 }, { "epoch": 0.3505814933423226, "grad_norm": 1.557187949710829, "learning_rate": 9.958151377906116e-06, "loss": 0.3845, "step": 1040 }, { "epoch": 0.35226698129108375, "grad_norm": 2.1410759348004813, "learning_rate": 9.957443699416382e-06, "loss": 0.3689, "step": 1045 }, { "epoch": 0.3539524692398449, "grad_norm": 2.6413903943547763, "learning_rate": 9.956730112983922e-06, "loss": 0.3884, "step": 1050 }, { "epoch": 0.3556379571886061, "grad_norm": 2.0042783331507796, "learning_rate": 9.956010619459138e-06, "loss": 0.3993, "step": 1055 }, { "epoch": 0.35732344513736725, "grad_norm": 1.9178390205816611, "learning_rate": 9.955285219699476e-06, "loss": 0.3912, "step": 1060 }, { "epoch": 0.3590089330861284, "grad_norm": 2.033424630243459, "learning_rate": 9.954553914569414e-06, "loss": 0.3509, "step": 1065 }, { "epoch": 0.3606944210348896, "grad_norm": 2.8573924965488304, "learning_rate": 9.953816704940475e-06, "loss": 0.3833, "step": 1070 }, { "epoch": 0.36237990898365074, "grad_norm": 1.454160800765969, "learning_rate": 9.953073591691214e-06, "loss": 0.3607, "step": 1075 }, { "epoch": 0.3640653969324119, "grad_norm": 1.436069191284367, "learning_rate": 9.952324575707222e-06, "loss": 0.3671, "step": 1080 }, { "epoch": 0.3657508848811731, "grad_norm": 1.528433914963016, "learning_rate": 9.951569657881124e-06, "loss": 0.398, "step": 1085 }, { "epoch": 0.36743637282993424, "grad_norm": 2.0721957509355833, "learning_rate": 9.950808839112583e-06, "loss": 0.3697, "step": 1090 }, { "epoch": 0.3691218607786954, "grad_norm": 3.774799286122688, "learning_rate": 9.950042120308287e-06, "loss": 0.379, "step": 1095 }, { "epoch": 0.3708073487274566, "grad_norm": 2.260780199791883, "learning_rate": 9.949269502381961e-06, "loss": 0.3915, "step": 1100 }, { "epoch": 0.37249283667621774, "grad_norm": 1.9800273567119477, "learning_rate": 9.94849098625436e-06, "loss": 0.3854, "step": 1105 }, { "epoch": 0.3741783246249789, "grad_norm": 1.4559740597276838, "learning_rate": 9.947706572853262e-06, "loss": 0.3934, "step": 1110 }, { "epoch": 0.3758638125737401, "grad_norm": 1.6009456809223352, "learning_rate": 9.946916263113482e-06, "loss": 0.3832, "step": 1115 }, { "epoch": 0.37754930052250124, "grad_norm": 1.7072821817014892, "learning_rate": 9.946120057976853e-06, "loss": 0.387, "step": 1120 }, { "epoch": 0.3792347884712624, "grad_norm": 2.3391026420804013, "learning_rate": 9.945317958392243e-06, "loss": 0.4232, "step": 1125 }, { "epoch": 0.3809202764200236, "grad_norm": 1.8763405027701638, "learning_rate": 9.944509965315532e-06, "loss": 0.3743, "step": 1130 }, { "epoch": 0.38260576436878474, "grad_norm": 1.8203563045694253, "learning_rate": 9.943696079709637e-06, "loss": 0.3919, "step": 1135 }, { "epoch": 0.3842912523175459, "grad_norm": 1.9188335442110622, "learning_rate": 9.94287630254449e-06, "loss": 0.3803, "step": 1140 }, { "epoch": 0.3859767402663071, "grad_norm": 1.9193760465596672, "learning_rate": 9.942050634797044e-06, "loss": 0.3717, "step": 1145 }, { "epoch": 0.38766222821506824, "grad_norm": 1.2184450527027415, "learning_rate": 9.941219077451276e-06, "loss": 0.3101, "step": 1150 }, { "epoch": 0.3893477161638294, "grad_norm": 1.9082738737781184, "learning_rate": 9.940381631498175e-06, "loss": 0.3794, "step": 1155 }, { "epoch": 0.39103320411259057, "grad_norm": 1.6941062415529424, "learning_rate": 9.939538297935756e-06, "loss": 0.395, "step": 1160 }, { "epoch": 0.39271869206135174, "grad_norm": 1.4999086936872892, "learning_rate": 9.938689077769046e-06, "loss": 0.3511, "step": 1165 }, { "epoch": 0.3944041800101129, "grad_norm": 1.5354307336843596, "learning_rate": 9.937833972010084e-06, "loss": 0.3779, "step": 1170 }, { "epoch": 0.39608966795887407, "grad_norm": 1.6148043731147916, "learning_rate": 9.93697298167793e-06, "loss": 0.3677, "step": 1175 }, { "epoch": 0.39777515590763524, "grad_norm": 3.547101819036569, "learning_rate": 9.936106107798654e-06, "loss": 0.367, "step": 1180 }, { "epoch": 0.3994606438563964, "grad_norm": 1.4256930815524032, "learning_rate": 9.935233351405333e-06, "loss": 0.3292, "step": 1185 }, { "epoch": 0.40114613180515757, "grad_norm": 1.8463182510574727, "learning_rate": 9.934354713538061e-06, "loss": 0.3472, "step": 1190 }, { "epoch": 0.40283161975391873, "grad_norm": 1.8713630909886005, "learning_rate": 9.933470195243939e-06, "loss": 0.3633, "step": 1195 }, { "epoch": 0.4045171077026799, "grad_norm": 1.9909097858371128, "learning_rate": 9.932579797577075e-06, "loss": 0.3978, "step": 1200 }, { "epoch": 0.40620259565144107, "grad_norm": 1.922000086644586, "learning_rate": 9.931683521598585e-06, "loss": 0.3831, "step": 1205 }, { "epoch": 0.40788808360020223, "grad_norm": 1.6376595060664128, "learning_rate": 9.930781368376588e-06, "loss": 0.3613, "step": 1210 }, { "epoch": 0.4095735715489634, "grad_norm": 1.8682072536073533, "learning_rate": 9.929873338986208e-06, "loss": 0.3803, "step": 1215 }, { "epoch": 0.41125905949772457, "grad_norm": 1.641010884397326, "learning_rate": 9.928959434509576e-06, "loss": 0.3855, "step": 1220 }, { "epoch": 0.41294454744648573, "grad_norm": 2.811707066345706, "learning_rate": 9.928039656035817e-06, "loss": 0.3824, "step": 1225 }, { "epoch": 0.4146300353952469, "grad_norm": 2.1669607720074064, "learning_rate": 9.927114004661063e-06, "loss": 0.3467, "step": 1230 }, { "epoch": 0.41631552334400806, "grad_norm": 2.5149830559716504, "learning_rate": 9.92618248148844e-06, "loss": 0.3436, "step": 1235 }, { "epoch": 0.41800101129276923, "grad_norm": 2.881848018303243, "learning_rate": 9.925245087628073e-06, "loss": 0.3659, "step": 1240 }, { "epoch": 0.4196864992415304, "grad_norm": 1.5245821389573595, "learning_rate": 9.924301824197087e-06, "loss": 0.3723, "step": 1245 }, { "epoch": 0.42137198719029156, "grad_norm": 1.6791218367042129, "learning_rate": 9.923352692319595e-06, "loss": 0.3941, "step": 1250 }, { "epoch": 0.42305747513905273, "grad_norm": 1.5564687504140988, "learning_rate": 9.922397693126712e-06, "loss": 0.3646, "step": 1255 }, { "epoch": 0.4247429630878139, "grad_norm": 1.74390718403418, "learning_rate": 9.921436827756539e-06, "loss": 0.3543, "step": 1260 }, { "epoch": 0.42642845103657506, "grad_norm": 1.3414046416119982, "learning_rate": 9.920470097354166e-06, "loss": 0.3319, "step": 1265 }, { "epoch": 0.42811393898533623, "grad_norm": 1.2425173294583747, "learning_rate": 9.919497503071682e-06, "loss": 0.3642, "step": 1270 }, { "epoch": 0.4297994269340974, "grad_norm": 1.4527031016286536, "learning_rate": 9.918519046068157e-06, "loss": 0.3602, "step": 1275 }, { "epoch": 0.43148491488285856, "grad_norm": 1.8868133059382373, "learning_rate": 9.917534727509647e-06, "loss": 0.354, "step": 1280 }, { "epoch": 0.4331704028316197, "grad_norm": 1.5610447769025018, "learning_rate": 9.916544548569195e-06, "loss": 0.3737, "step": 1285 }, { "epoch": 0.4348558907803809, "grad_norm": 1.3021664391906702, "learning_rate": 9.915548510426833e-06, "loss": 0.3592, "step": 1290 }, { "epoch": 0.43654137872914206, "grad_norm": 1.3810259490145473, "learning_rate": 9.91454661426957e-06, "loss": 0.3886, "step": 1295 }, { "epoch": 0.4382268666779032, "grad_norm": 1.3304099160027794, "learning_rate": 9.913538861291391e-06, "loss": 0.3855, "step": 1300 }, { "epoch": 0.4399123546266644, "grad_norm": 1.7264723009040814, "learning_rate": 9.912525252693276e-06, "loss": 0.339, "step": 1305 }, { "epoch": 0.44159784257542556, "grad_norm": 1.3667437431870433, "learning_rate": 9.911505789683169e-06, "loss": 0.3839, "step": 1310 }, { "epoch": 0.4432833305241867, "grad_norm": 1.6196648417481219, "learning_rate": 9.910480473475998e-06, "loss": 0.3786, "step": 1315 }, { "epoch": 0.4449688184729479, "grad_norm": 1.4133305673198646, "learning_rate": 9.909449305293665e-06, "loss": 0.3583, "step": 1320 }, { "epoch": 0.44665430642170906, "grad_norm": 1.5553388389804803, "learning_rate": 9.908412286365047e-06, "loss": 0.3739, "step": 1325 }, { "epoch": 0.4483397943704702, "grad_norm": 1.247977985211659, "learning_rate": 9.90736941792599e-06, "loss": 0.3379, "step": 1330 }, { "epoch": 0.4500252823192314, "grad_norm": 1.447620792005676, "learning_rate": 9.906320701219314e-06, "loss": 0.3799, "step": 1335 }, { "epoch": 0.45171077026799256, "grad_norm": 2.0817192065173318, "learning_rate": 9.90526613749481e-06, "loss": 0.4061, "step": 1340 }, { "epoch": 0.4533962582167537, "grad_norm": 1.5729182550945877, "learning_rate": 9.90420572800923e-06, "loss": 0.3808, "step": 1345 }, { "epoch": 0.45508174616551494, "grad_norm": 1.657457509921655, "learning_rate": 9.903139474026304e-06, "loss": 0.3586, "step": 1350 }, { "epoch": 0.4567672341142761, "grad_norm": 1.5236374339069958, "learning_rate": 9.902067376816716e-06, "loss": 0.3858, "step": 1355 }, { "epoch": 0.4584527220630373, "grad_norm": 1.5226480730500296, "learning_rate": 9.90098943765812e-06, "loss": 0.3519, "step": 1360 }, { "epoch": 0.46013821001179844, "grad_norm": 1.477229921055985, "learning_rate": 9.89990565783513e-06, "loss": 0.3773, "step": 1365 }, { "epoch": 0.4618236979605596, "grad_norm": 1.5683976956525734, "learning_rate": 9.898816038639318e-06, "loss": 0.3779, "step": 1370 }, { "epoch": 0.4635091859093208, "grad_norm": 1.2774876859518138, "learning_rate": 9.897720581369223e-06, "loss": 0.3855, "step": 1375 }, { "epoch": 0.46519467385808194, "grad_norm": 1.742934808160777, "learning_rate": 9.896619287330333e-06, "loss": 0.3442, "step": 1380 }, { "epoch": 0.4668801618068431, "grad_norm": 2.0269501427448446, "learning_rate": 9.895512157835096e-06, "loss": 0.3817, "step": 1385 }, { "epoch": 0.4685656497556043, "grad_norm": 1.4335424967789996, "learning_rate": 9.894399194202913e-06, "loss": 0.402, "step": 1390 }, { "epoch": 0.47025113770436544, "grad_norm": 1.7165362334668752, "learning_rate": 9.893280397760137e-06, "loss": 0.3837, "step": 1395 }, { "epoch": 0.4719366256531266, "grad_norm": 2.976000196795781, "learning_rate": 9.892155769840075e-06, "loss": 0.3682, "step": 1400 }, { "epoch": 0.4736221136018878, "grad_norm": 1.6994878217649543, "learning_rate": 9.891025311782981e-06, "loss": 0.3762, "step": 1405 }, { "epoch": 0.47530760155064894, "grad_norm": 1.6583356777220375, "learning_rate": 9.889889024936055e-06, "loss": 0.3779, "step": 1410 }, { "epoch": 0.4769930894994101, "grad_norm": 1.3449016512465066, "learning_rate": 9.888746910653451e-06, "loss": 0.4038, "step": 1415 }, { "epoch": 0.47867857744817127, "grad_norm": 2.3117344706889558, "learning_rate": 9.88759897029626e-06, "loss": 0.403, "step": 1420 }, { "epoch": 0.48036406539693244, "grad_norm": 1.66354971908194, "learning_rate": 9.886445205232518e-06, "loss": 0.3639, "step": 1425 }, { "epoch": 0.4820495533456936, "grad_norm": 1.3978047454441775, "learning_rate": 9.885285616837204e-06, "loss": 0.3937, "step": 1430 }, { "epoch": 0.48373504129445477, "grad_norm": 1.8222955243579932, "learning_rate": 9.884120206492239e-06, "loss": 0.3815, "step": 1435 }, { "epoch": 0.48542052924321594, "grad_norm": 1.5312912667688126, "learning_rate": 9.882948975586475e-06, "loss": 0.3379, "step": 1440 }, { "epoch": 0.4871060171919771, "grad_norm": 1.5579909761514894, "learning_rate": 9.881771925515708e-06, "loss": 0.3753, "step": 1445 }, { "epoch": 0.48879150514073827, "grad_norm": 1.3262651786348019, "learning_rate": 9.880589057682666e-06, "loss": 0.3532, "step": 1450 }, { "epoch": 0.49047699308949944, "grad_norm": 1.4867089897459638, "learning_rate": 9.879400373497009e-06, "loss": 0.3578, "step": 1455 }, { "epoch": 0.4921624810382606, "grad_norm": 3.9915201721688596, "learning_rate": 9.878205874375327e-06, "loss": 0.3997, "step": 1460 }, { "epoch": 0.49384796898702177, "grad_norm": 1.5637136327033112, "learning_rate": 9.877005561741147e-06, "loss": 0.3645, "step": 1465 }, { "epoch": 0.49553345693578293, "grad_norm": 1.2642443367640699, "learning_rate": 9.875799437024918e-06, "loss": 0.3381, "step": 1470 }, { "epoch": 0.4972189448845441, "grad_norm": 1.96578843307235, "learning_rate": 9.874587501664018e-06, "loss": 0.3517, "step": 1475 }, { "epoch": 0.49890443283330527, "grad_norm": 1.1903686232767572, "learning_rate": 9.873369757102744e-06, "loss": 0.396, "step": 1480 }, { "epoch": 0.5005899207820664, "grad_norm": 1.6148672292269124, "learning_rate": 9.872146204792327e-06, "loss": 0.3747, "step": 1485 }, { "epoch": 0.5022754087308275, "grad_norm": 1.3729016663750981, "learning_rate": 9.87091684619091e-06, "loss": 0.354, "step": 1490 }, { "epoch": 0.5039608966795888, "grad_norm": 1.4477298289237992, "learning_rate": 9.86968168276356e-06, "loss": 0.4039, "step": 1495 }, { "epoch": 0.5056463846283499, "grad_norm": 1.2921266896305665, "learning_rate": 9.868440715982257e-06, "loss": 0.4031, "step": 1500 }, { "epoch": 0.5073318725771111, "grad_norm": 1.6101208365324928, "learning_rate": 9.867193947325904e-06, "loss": 0.3496, "step": 1505 }, { "epoch": 0.5090173605258722, "grad_norm": 1.21317726468614, "learning_rate": 9.865941378280312e-06, "loss": 0.3883, "step": 1510 }, { "epoch": 0.5107028484746334, "grad_norm": 1.379275079849456, "learning_rate": 9.864683010338212e-06, "loss": 0.3744, "step": 1515 }, { "epoch": 0.5123883364233945, "grad_norm": 1.411176846999733, "learning_rate": 9.863418844999235e-06, "loss": 0.3888, "step": 1520 }, { "epoch": 0.5140738243721558, "grad_norm": 1.5172650618960308, "learning_rate": 9.862148883769931e-06, "loss": 0.3596, "step": 1525 }, { "epoch": 0.5157593123209169, "grad_norm": 1.875551642818612, "learning_rate": 9.86087312816375e-06, "loss": 0.3736, "step": 1530 }, { "epoch": 0.5174448002696781, "grad_norm": 1.6286132129606317, "learning_rate": 9.859591579701053e-06, "loss": 0.3615, "step": 1535 }, { "epoch": 0.5191302882184392, "grad_norm": 1.5249569684369995, "learning_rate": 9.858304239909102e-06, "loss": 0.4015, "step": 1540 }, { "epoch": 0.5208157761672004, "grad_norm": 1.6254620292539275, "learning_rate": 9.85701111032206e-06, "loss": 0.344, "step": 1545 }, { "epoch": 0.5225012641159615, "grad_norm": 1.9407647903942409, "learning_rate": 9.855712192480986e-06, "loss": 0.3528, "step": 1550 }, { "epoch": 0.5241867520647228, "grad_norm": 1.6535005702127823, "learning_rate": 9.854407487933849e-06, "loss": 0.3453, "step": 1555 }, { "epoch": 0.5258722400134839, "grad_norm": 1.5395599773328799, "learning_rate": 9.853096998235502e-06, "loss": 0.381, "step": 1560 }, { "epoch": 0.5275577279622451, "grad_norm": 1.9864671010107728, "learning_rate": 9.8517807249477e-06, "loss": 0.3959, "step": 1565 }, { "epoch": 0.5292432159110062, "grad_norm": 1.9812177121423384, "learning_rate": 9.850458669639083e-06, "loss": 0.3507, "step": 1570 }, { "epoch": 0.5309287038597674, "grad_norm": 1.5164728376351555, "learning_rate": 9.849130833885192e-06, "loss": 0.3736, "step": 1575 }, { "epoch": 0.5326141918085285, "grad_norm": 1.3996020796112227, "learning_rate": 9.847797219268447e-06, "loss": 0.3794, "step": 1580 }, { "epoch": 0.5342996797572898, "grad_norm": 1.8305921371448441, "learning_rate": 9.84645782737816e-06, "loss": 0.3831, "step": 1585 }, { "epoch": 0.5359851677060509, "grad_norm": 2.265248065963422, "learning_rate": 9.845112659810525e-06, "loss": 0.3453, "step": 1590 }, { "epoch": 0.5376706556548121, "grad_norm": 1.4395690545235373, "learning_rate": 9.843761718168625e-06, "loss": 0.3796, "step": 1595 }, { "epoch": 0.5393561436035732, "grad_norm": 1.8254751868748305, "learning_rate": 9.842405004062417e-06, "loss": 0.3774, "step": 1600 }, { "epoch": 0.5410416315523344, "grad_norm": 2.5465244378890706, "learning_rate": 9.841042519108739e-06, "loss": 0.3794, "step": 1605 }, { "epoch": 0.5427271195010955, "grad_norm": 1.242660191167484, "learning_rate": 9.83967426493131e-06, "loss": 0.3404, "step": 1610 }, { "epoch": 0.5444126074498568, "grad_norm": 1.3639736726141871, "learning_rate": 9.838300243160722e-06, "loss": 0.3568, "step": 1615 }, { "epoch": 0.5460980953986179, "grad_norm": 1.5872074532119953, "learning_rate": 9.836920455434437e-06, "loss": 0.3759, "step": 1620 }, { "epoch": 0.5477835833473791, "grad_norm": 1.348686027258727, "learning_rate": 9.835534903396795e-06, "loss": 0.3994, "step": 1625 }, { "epoch": 0.5494690712961402, "grad_norm": 1.4816598182200131, "learning_rate": 9.834143588699002e-06, "loss": 0.3843, "step": 1630 }, { "epoch": 0.5511545592449014, "grad_norm": 1.8712193283757328, "learning_rate": 9.83274651299913e-06, "loss": 0.39, "step": 1635 }, { "epoch": 0.5528400471936625, "grad_norm": 3.4046833656269104, "learning_rate": 9.831343677962121e-06, "loss": 0.3869, "step": 1640 }, { "epoch": 0.5545255351424238, "grad_norm": 2.9435101818151463, "learning_rate": 9.829935085259775e-06, "loss": 0.3811, "step": 1645 }, { "epoch": 0.5562110230911849, "grad_norm": 4.462125272620029, "learning_rate": 9.82852073657076e-06, "loss": 0.3831, "step": 1650 }, { "epoch": 0.5578965110399461, "grad_norm": 1.3067101655202602, "learning_rate": 9.827100633580595e-06, "loss": 0.3707, "step": 1655 }, { "epoch": 0.5595819989887072, "grad_norm": 1.4289922522073135, "learning_rate": 9.825674777981666e-06, "loss": 0.3855, "step": 1660 }, { "epoch": 0.5612674869374684, "grad_norm": 1.2486461237777289, "learning_rate": 9.824243171473208e-06, "loss": 0.3665, "step": 1665 }, { "epoch": 0.5629529748862295, "grad_norm": 1.587809195793634, "learning_rate": 9.822805815761316e-06, "loss": 0.3592, "step": 1670 }, { "epoch": 0.5646384628349908, "grad_norm": 1.1920623778903174, "learning_rate": 9.821362712558926e-06, "loss": 0.3434, "step": 1675 }, { "epoch": 0.5663239507837519, "grad_norm": 1.7118935425245672, "learning_rate": 9.819913863585836e-06, "loss": 0.3979, "step": 1680 }, { "epoch": 0.5680094387325131, "grad_norm": 1.4144090637086981, "learning_rate": 9.818459270568682e-06, "loss": 0.3681, "step": 1685 }, { "epoch": 0.5696949266812742, "grad_norm": 1.2924185332094549, "learning_rate": 9.816998935240946e-06, "loss": 0.3748, "step": 1690 }, { "epoch": 0.5713804146300354, "grad_norm": 1.2253496181098722, "learning_rate": 9.81553285934296e-06, "loss": 0.3675, "step": 1695 }, { "epoch": 0.5730659025787965, "grad_norm": 2.8092012770674164, "learning_rate": 9.814061044621894e-06, "loss": 0.3841, "step": 1700 }, { "epoch": 0.5747513905275577, "grad_norm": 1.2314566428669753, "learning_rate": 9.812583492831751e-06, "loss": 0.3676, "step": 1705 }, { "epoch": 0.5764368784763189, "grad_norm": 1.7990990576328856, "learning_rate": 9.811100205733381e-06, "loss": 0.3731, "step": 1710 }, { "epoch": 0.5781223664250801, "grad_norm": 1.6232984129821095, "learning_rate": 9.809611185094463e-06, "loss": 0.3862, "step": 1715 }, { "epoch": 0.5798078543738412, "grad_norm": 1.2766438891686642, "learning_rate": 9.808116432689509e-06, "loss": 0.3728, "step": 1720 }, { "epoch": 0.5814933423226024, "grad_norm": 1.8194884822119324, "learning_rate": 9.806615950299865e-06, "loss": 0.3613, "step": 1725 }, { "epoch": 0.5831788302713635, "grad_norm": 1.182041198701511, "learning_rate": 9.8051097397137e-06, "loss": 0.3711, "step": 1730 }, { "epoch": 0.5848643182201247, "grad_norm": 1.1813270514141903, "learning_rate": 9.803597802726015e-06, "loss": 0.339, "step": 1735 }, { "epoch": 0.5865498061688859, "grad_norm": 1.1615217782525067, "learning_rate": 9.802080141138634e-06, "loss": 0.335, "step": 1740 }, { "epoch": 0.5882352941176471, "grad_norm": 1.2225454930609114, "learning_rate": 9.8005567567602e-06, "loss": 0.3641, "step": 1745 }, { "epoch": 0.5899207820664082, "grad_norm": 1.5721864635841314, "learning_rate": 9.799027651406181e-06, "loss": 0.3579, "step": 1750 }, { "epoch": 0.5916062700151694, "grad_norm": 1.3119639724883532, "learning_rate": 9.79749282689886e-06, "loss": 0.4099, "step": 1755 }, { "epoch": 0.5932917579639305, "grad_norm": 1.371061205922346, "learning_rate": 9.795952285067334e-06, "loss": 0.3643, "step": 1760 }, { "epoch": 0.5949772459126917, "grad_norm": 1.413448565845774, "learning_rate": 9.794406027747516e-06, "loss": 0.3714, "step": 1765 }, { "epoch": 0.5966627338614529, "grad_norm": 1.621548282743646, "learning_rate": 9.79285405678213e-06, "loss": 0.3993, "step": 1770 }, { "epoch": 0.5983482218102141, "grad_norm": 1.5758278424522085, "learning_rate": 9.791296374020711e-06, "loss": 0.3643, "step": 1775 }, { "epoch": 0.6000337097589752, "grad_norm": 1.2732012081933695, "learning_rate": 9.789732981319597e-06, "loss": 0.3622, "step": 1780 }, { "epoch": 0.6017191977077364, "grad_norm": 1.724149417614084, "learning_rate": 9.788163880541933e-06, "loss": 0.3734, "step": 1785 }, { "epoch": 0.6034046856564975, "grad_norm": 1.2459101202905065, "learning_rate": 9.786589073557665e-06, "loss": 0.3759, "step": 1790 }, { "epoch": 0.6050901736052587, "grad_norm": 1.7345209199449638, "learning_rate": 9.785008562243544e-06, "loss": 0.364, "step": 1795 }, { "epoch": 0.6067756615540199, "grad_norm": 1.3365285983048494, "learning_rate": 9.783422348483109e-06, "loss": 0.364, "step": 1800 }, { "epoch": 0.6084611495027811, "grad_norm": 1.3189215105489214, "learning_rate": 9.781830434166707e-06, "loss": 0.3861, "step": 1805 }, { "epoch": 0.6101466374515422, "grad_norm": 2.453366829111078, "learning_rate": 9.78023282119147e-06, "loss": 0.3235, "step": 1810 }, { "epoch": 0.6118321254003034, "grad_norm": 1.4264031253919447, "learning_rate": 9.778629511461327e-06, "loss": 0.3873, "step": 1815 }, { "epoch": 0.6135176133490645, "grad_norm": 1.7280331748498687, "learning_rate": 9.77702050688699e-06, "loss": 0.3574, "step": 1820 }, { "epoch": 0.6152031012978257, "grad_norm": 1.1882150276334564, "learning_rate": 9.77540580938596e-06, "loss": 0.3795, "step": 1825 }, { "epoch": 0.6168885892465868, "grad_norm": 1.9950862244196361, "learning_rate": 9.773785420882527e-06, "loss": 0.3462, "step": 1830 }, { "epoch": 0.6185740771953481, "grad_norm": 1.4034758002974188, "learning_rate": 9.772159343307755e-06, "loss": 0.3389, "step": 1835 }, { "epoch": 0.6202595651441092, "grad_norm": 1.293250344875036, "learning_rate": 9.770527578599495e-06, "loss": 0.3715, "step": 1840 }, { "epoch": 0.6219450530928704, "grad_norm": 1.6660907631550192, "learning_rate": 9.76889012870237e-06, "loss": 0.3613, "step": 1845 }, { "epoch": 0.6236305410416315, "grad_norm": 1.1473922763213351, "learning_rate": 9.767246995567785e-06, "loss": 0.3307, "step": 1850 }, { "epoch": 0.6253160289903927, "grad_norm": 3.230850340977098, "learning_rate": 9.765598181153913e-06, "loss": 0.3629, "step": 1855 }, { "epoch": 0.6270015169391538, "grad_norm": 1.787900324765984, "learning_rate": 9.763943687425695e-06, "loss": 0.3826, "step": 1860 }, { "epoch": 0.6286870048879151, "grad_norm": 1.7198065823109134, "learning_rate": 9.762283516354845e-06, "loss": 0.3972, "step": 1865 }, { "epoch": 0.6303724928366762, "grad_norm": 1.5591004076120165, "learning_rate": 9.760617669919843e-06, "loss": 0.3501, "step": 1870 }, { "epoch": 0.6320579807854374, "grad_norm": 2.687947841313869, "learning_rate": 9.758946150105929e-06, "loss": 0.3625, "step": 1875 }, { "epoch": 0.6337434687341985, "grad_norm": 1.1741314827927007, "learning_rate": 9.757268958905108e-06, "loss": 0.3781, "step": 1880 }, { "epoch": 0.6354289566829597, "grad_norm": 1.5664751237434484, "learning_rate": 9.755586098316141e-06, "loss": 0.3558, "step": 1885 }, { "epoch": 0.6371144446317208, "grad_norm": 1.240015207550192, "learning_rate": 9.753897570344546e-06, "loss": 0.3551, "step": 1890 }, { "epoch": 0.6387999325804821, "grad_norm": 1.8229664319251397, "learning_rate": 9.752203377002596e-06, "loss": 0.3603, "step": 1895 }, { "epoch": 0.6404854205292432, "grad_norm": 2.83552526946451, "learning_rate": 9.750503520309315e-06, "loss": 0.3496, "step": 1900 }, { "epoch": 0.6421709084780044, "grad_norm": 2.0244644558655023, "learning_rate": 9.748798002290474e-06, "loss": 0.3244, "step": 1905 }, { "epoch": 0.6438563964267655, "grad_norm": 1.1760571210531725, "learning_rate": 9.747086824978595e-06, "loss": 0.3799, "step": 1910 }, { "epoch": 0.6455418843755267, "grad_norm": 1.225229598191833, "learning_rate": 9.745369990412943e-06, "loss": 0.3927, "step": 1915 }, { "epoch": 0.6472273723242878, "grad_norm": 1.9260704719551835, "learning_rate": 9.743647500639521e-06, "loss": 0.3632, "step": 1920 }, { "epoch": 0.6489128602730491, "grad_norm": 1.4680818662991428, "learning_rate": 9.741919357711078e-06, "loss": 0.3867, "step": 1925 }, { "epoch": 0.6505983482218102, "grad_norm": 1.6477683490735457, "learning_rate": 9.740185563687095e-06, "loss": 0.3606, "step": 1930 }, { "epoch": 0.6522838361705714, "grad_norm": 5.509552602446063, "learning_rate": 9.738446120633788e-06, "loss": 0.3527, "step": 1935 }, { "epoch": 0.6539693241193325, "grad_norm": 1.3651510345103504, "learning_rate": 9.736701030624109e-06, "loss": 0.3342, "step": 1940 }, { "epoch": 0.6556548120680937, "grad_norm": 1.4065274870238285, "learning_rate": 9.734950295737736e-06, "loss": 0.4101, "step": 1945 }, { "epoch": 0.6573403000168548, "grad_norm": 1.5133391638671063, "learning_rate": 9.733193918061072e-06, "loss": 0.3496, "step": 1950 }, { "epoch": 0.6590257879656161, "grad_norm": 1.2800300431439677, "learning_rate": 9.731431899687254e-06, "loss": 0.3433, "step": 1955 }, { "epoch": 0.6607112759143772, "grad_norm": 1.226593168456227, "learning_rate": 9.729664242716133e-06, "loss": 0.3288, "step": 1960 }, { "epoch": 0.6623967638631384, "grad_norm": 1.603857320637352, "learning_rate": 9.727890949254279e-06, "loss": 0.3873, "step": 1965 }, { "epoch": 0.6640822518118995, "grad_norm": 1.4776997381891348, "learning_rate": 9.726112021414985e-06, "loss": 0.36, "step": 1970 }, { "epoch": 0.6657677397606607, "grad_norm": 1.4498019018029922, "learning_rate": 9.724327461318253e-06, "loss": 0.3559, "step": 1975 }, { "epoch": 0.6674532277094218, "grad_norm": 1.5304816496173508, "learning_rate": 9.722537271090801e-06, "loss": 0.378, "step": 1980 }, { "epoch": 0.6691387156581831, "grad_norm": 1.2841465124541995, "learning_rate": 9.720741452866059e-06, "loss": 0.3502, "step": 1985 }, { "epoch": 0.6708242036069442, "grad_norm": 1.341839612536046, "learning_rate": 9.718940008784154e-06, "loss": 0.3732, "step": 1990 }, { "epoch": 0.6725096915557054, "grad_norm": 1.5679954311607547, "learning_rate": 9.71713294099193e-06, "loss": 0.3595, "step": 1995 }, { "epoch": 0.6741951795044665, "grad_norm": 1.8242560324395702, "learning_rate": 9.715320251642924e-06, "loss": 0.3437, "step": 2000 }, { "epoch": 0.6758806674532277, "grad_norm": 1.4721199342053406, "learning_rate": 9.713501942897374e-06, "loss": 0.3633, "step": 2005 }, { "epoch": 0.6775661554019888, "grad_norm": 1.903966919235931, "learning_rate": 9.71167801692222e-06, "loss": 0.3738, "step": 2010 }, { "epoch": 0.6792516433507501, "grad_norm": 1.3353416843282138, "learning_rate": 9.709848475891087e-06, "loss": 0.3367, "step": 2015 }, { "epoch": 0.6809371312995112, "grad_norm": 1.1024900446804993, "learning_rate": 9.708013321984303e-06, "loss": 0.3337, "step": 2020 }, { "epoch": 0.6826226192482724, "grad_norm": 1.8295367652286423, "learning_rate": 9.706172557388873e-06, "loss": 0.3774, "step": 2025 }, { "epoch": 0.6843081071970335, "grad_norm": 1.543116586118428, "learning_rate": 9.7043261842985e-06, "loss": 0.3655, "step": 2030 }, { "epoch": 0.6859935951457947, "grad_norm": 1.6317209425057675, "learning_rate": 9.702474204913563e-06, "loss": 0.3475, "step": 2035 }, { "epoch": 0.6876790830945558, "grad_norm": 1.63484679625352, "learning_rate": 9.700616621441123e-06, "loss": 0.3972, "step": 2040 }, { "epoch": 0.689364571043317, "grad_norm": 2.1873559252815555, "learning_rate": 9.698753436094924e-06, "loss": 0.3736, "step": 2045 }, { "epoch": 0.6910500589920782, "grad_norm": 1.2890286528794461, "learning_rate": 9.69688465109538e-06, "loss": 0.3469, "step": 2050 }, { "epoch": 0.6927355469408394, "grad_norm": 1.3239260464350429, "learning_rate": 9.695010268669585e-06, "loss": 0.3704, "step": 2055 }, { "epoch": 0.6944210348896005, "grad_norm": 1.8709605453949443, "learning_rate": 9.693130291051296e-06, "loss": 0.356, "step": 2060 }, { "epoch": 0.6961065228383617, "grad_norm": 2.9573522315636724, "learning_rate": 9.691244720480945e-06, "loss": 0.3742, "step": 2065 }, { "epoch": 0.6977920107871228, "grad_norm": 1.3980185260901286, "learning_rate": 9.689353559205623e-06, "loss": 0.3648, "step": 2070 }, { "epoch": 0.699477498735884, "grad_norm": 1.228705181126694, "learning_rate": 9.68745680947909e-06, "loss": 0.3039, "step": 2075 }, { "epoch": 0.7011629866846452, "grad_norm": 1.2220680611193167, "learning_rate": 9.685554473561762e-06, "loss": 0.3474, "step": 2080 }, { "epoch": 0.7028484746334064, "grad_norm": 2.59974825327748, "learning_rate": 9.683646553720712e-06, "loss": 0.3501, "step": 2085 }, { "epoch": 0.7045339625821675, "grad_norm": 1.6467232454475285, "learning_rate": 9.681733052229669e-06, "loss": 0.3787, "step": 2090 }, { "epoch": 0.7062194505309287, "grad_norm": 1.8096540134742791, "learning_rate": 9.679813971369012e-06, "loss": 0.3648, "step": 2095 }, { "epoch": 0.7079049384796898, "grad_norm": 1.2647471741504637, "learning_rate": 9.677889313425773e-06, "loss": 0.3788, "step": 2100 }, { "epoch": 0.709590426428451, "grad_norm": 1.35976378763326, "learning_rate": 9.675959080693627e-06, "loss": 0.3433, "step": 2105 }, { "epoch": 0.7112759143772122, "grad_norm": 1.308598282396058, "learning_rate": 9.67402327547289e-06, "loss": 0.378, "step": 2110 }, { "epoch": 0.7129614023259734, "grad_norm": 1.4775263796977416, "learning_rate": 9.672081900070528e-06, "loss": 0.3574, "step": 2115 }, { "epoch": 0.7146468902747345, "grad_norm": 1.1577557054374856, "learning_rate": 9.670134956800137e-06, "loss": 0.3733, "step": 2120 }, { "epoch": 0.7163323782234957, "grad_norm": 1.466750065549959, "learning_rate": 9.668182447981952e-06, "loss": 0.4092, "step": 2125 }, { "epoch": 0.7180178661722568, "grad_norm": 1.1565828725521126, "learning_rate": 9.666224375942837e-06, "loss": 0.3731, "step": 2130 }, { "epoch": 0.719703354121018, "grad_norm": 1.350218441493612, "learning_rate": 9.664260743016292e-06, "loss": 0.3865, "step": 2135 }, { "epoch": 0.7213888420697792, "grad_norm": 1.3358360503846247, "learning_rate": 9.662291551542438e-06, "loss": 0.3427, "step": 2140 }, { "epoch": 0.7230743300185404, "grad_norm": 1.4332616915804153, "learning_rate": 9.660316803868021e-06, "loss": 0.3474, "step": 2145 }, { "epoch": 0.7247598179673015, "grad_norm": 1.3259332446942924, "learning_rate": 9.658336502346417e-06, "loss": 0.3336, "step": 2150 }, { "epoch": 0.7264453059160627, "grad_norm": 2.3158674017755065, "learning_rate": 9.656350649337607e-06, "loss": 0.3839, "step": 2155 }, { "epoch": 0.7281307938648238, "grad_norm": 1.0949111862764238, "learning_rate": 9.6543592472082e-06, "loss": 0.3773, "step": 2160 }, { "epoch": 0.729816281813585, "grad_norm": 1.5097948334639733, "learning_rate": 9.65236229833141e-06, "loss": 0.4078, "step": 2165 }, { "epoch": 0.7315017697623462, "grad_norm": 1.2594652703608187, "learning_rate": 9.650359805087065e-06, "loss": 0.3567, "step": 2170 }, { "epoch": 0.7331872577111074, "grad_norm": 1.8804252015418728, "learning_rate": 9.648351769861602e-06, "loss": 0.3646, "step": 2175 }, { "epoch": 0.7348727456598685, "grad_norm": 1.3073981671846944, "learning_rate": 9.646338195048056e-06, "loss": 0.3504, "step": 2180 }, { "epoch": 0.7365582336086297, "grad_norm": 2.7949276977131086, "learning_rate": 9.64431908304607e-06, "loss": 0.4033, "step": 2185 }, { "epoch": 0.7382437215573908, "grad_norm": 1.2942219503947912, "learning_rate": 9.642294436261885e-06, "loss": 0.3619, "step": 2190 }, { "epoch": 0.739929209506152, "grad_norm": 1.6775973408111522, "learning_rate": 9.640264257108335e-06, "loss": 0.3347, "step": 2195 }, { "epoch": 0.7416146974549132, "grad_norm": 1.3588786223993885, "learning_rate": 9.638228548004849e-06, "loss": 0.3595, "step": 2200 }, { "epoch": 0.7433001854036744, "grad_norm": 1.6838109268179358, "learning_rate": 9.636187311377447e-06, "loss": 0.3373, "step": 2205 }, { "epoch": 0.7449856733524355, "grad_norm": 2.3830060953911167, "learning_rate": 9.634140549658735e-06, "loss": 0.3926, "step": 2210 }, { "epoch": 0.7466711613011967, "grad_norm": 1.7907822839020726, "learning_rate": 9.632088265287903e-06, "loss": 0.3472, "step": 2215 }, { "epoch": 0.7483566492499578, "grad_norm": 1.3471671246020636, "learning_rate": 9.630030460710722e-06, "loss": 0.3811, "step": 2220 }, { "epoch": 0.750042137198719, "grad_norm": 1.5810646120081224, "learning_rate": 9.627967138379547e-06, "loss": 0.3545, "step": 2225 }, { "epoch": 0.7517276251474801, "grad_norm": 1.507254101559988, "learning_rate": 9.625898300753302e-06, "loss": 0.3859, "step": 2230 }, { "epoch": 0.7534131130962414, "grad_norm": 1.2779664893101494, "learning_rate": 9.623823950297486e-06, "loss": 0.355, "step": 2235 }, { "epoch": 0.7550986010450025, "grad_norm": 1.2886964830711667, "learning_rate": 9.621744089484169e-06, "loss": 0.3449, "step": 2240 }, { "epoch": 0.7567840889937637, "grad_norm": 1.7316231667525117, "learning_rate": 9.619658720791987e-06, "loss": 0.3417, "step": 2245 }, { "epoch": 0.7584695769425248, "grad_norm": 1.1667612064431105, "learning_rate": 9.617567846706139e-06, "loss": 0.3474, "step": 2250 }, { "epoch": 0.760155064891286, "grad_norm": 1.1291994882095613, "learning_rate": 9.615471469718388e-06, "loss": 0.3388, "step": 2255 }, { "epoch": 0.7618405528400471, "grad_norm": 1.1256981178350294, "learning_rate": 9.61336959232705e-06, "loss": 0.3448, "step": 2260 }, { "epoch": 0.7635260407888084, "grad_norm": 1.427389213381377, "learning_rate": 9.611262217037004e-06, "loss": 0.3491, "step": 2265 }, { "epoch": 0.7652115287375695, "grad_norm": 1.324667521608873, "learning_rate": 9.609149346359668e-06, "loss": 0.3757, "step": 2270 }, { "epoch": 0.7668970166863307, "grad_norm": 1.3031507899917336, "learning_rate": 9.607030982813023e-06, "loss": 0.34, "step": 2275 }, { "epoch": 0.7685825046350918, "grad_norm": 1.4842518042803345, "learning_rate": 9.604907128921589e-06, "loss": 0.3929, "step": 2280 }, { "epoch": 0.770267992583853, "grad_norm": 1.294454455888665, "learning_rate": 9.602777787216429e-06, "loss": 0.3541, "step": 2285 }, { "epoch": 0.7719534805326141, "grad_norm": 1.4341792182231619, "learning_rate": 9.600642960235147e-06, "loss": 0.3933, "step": 2290 }, { "epoch": 0.7736389684813754, "grad_norm": 1.3954686669018816, "learning_rate": 9.598502650521883e-06, "loss": 0.3218, "step": 2295 }, { "epoch": 0.7753244564301365, "grad_norm": 1.3345111705629027, "learning_rate": 9.596356860627314e-06, "loss": 0.3313, "step": 2300 }, { "epoch": 0.7770099443788977, "grad_norm": 1.616643582212815, "learning_rate": 9.594205593108645e-06, "loss": 0.3691, "step": 2305 }, { "epoch": 0.7786954323276588, "grad_norm": 1.3596525923308695, "learning_rate": 9.59204885052961e-06, "loss": 0.3705, "step": 2310 }, { "epoch": 0.78038092027642, "grad_norm": 2.4609659088354725, "learning_rate": 9.589886635460466e-06, "loss": 0.3684, "step": 2315 }, { "epoch": 0.7820664082251811, "grad_norm": 1.2843009758418336, "learning_rate": 9.587718950477993e-06, "loss": 0.3784, "step": 2320 }, { "epoch": 0.7837518961739424, "grad_norm": 1.2577183022471217, "learning_rate": 9.585545798165494e-06, "loss": 0.3596, "step": 2325 }, { "epoch": 0.7854373841227035, "grad_norm": 1.6144024381724416, "learning_rate": 9.583367181112778e-06, "loss": 0.3741, "step": 2330 }, { "epoch": 0.7871228720714647, "grad_norm": 1.3461516966835854, "learning_rate": 9.581183101916176e-06, "loss": 0.3483, "step": 2335 }, { "epoch": 0.7888083600202258, "grad_norm": 1.6872425081360545, "learning_rate": 9.578993563178522e-06, "loss": 0.3757, "step": 2340 }, { "epoch": 0.790493847968987, "grad_norm": 1.4105158607965087, "learning_rate": 9.57679856750916e-06, "loss": 0.3575, "step": 2345 }, { "epoch": 0.7921793359177481, "grad_norm": 1.3965878147473851, "learning_rate": 9.574598117523935e-06, "loss": 0.368, "step": 2350 }, { "epoch": 0.7938648238665094, "grad_norm": 1.7944400734908867, "learning_rate": 9.572392215845194e-06, "loss": 0.3496, "step": 2355 }, { "epoch": 0.7955503118152705, "grad_norm": 1.4613174593711948, "learning_rate": 9.570180865101778e-06, "loss": 0.3449, "step": 2360 }, { "epoch": 0.7972357997640317, "grad_norm": 1.505128651251869, "learning_rate": 9.567964067929026e-06, "loss": 0.3404, "step": 2365 }, { "epoch": 0.7989212877127928, "grad_norm": 1.3414032802645093, "learning_rate": 9.565741826968766e-06, "loss": 0.3785, "step": 2370 }, { "epoch": 0.800606775661554, "grad_norm": 1.860016833945816, "learning_rate": 9.56351414486931e-06, "loss": 0.3272, "step": 2375 }, { "epoch": 0.8022922636103151, "grad_norm": 1.323878703458128, "learning_rate": 9.561281024285459e-06, "loss": 0.3437, "step": 2380 }, { "epoch": 0.8039777515590764, "grad_norm": 2.4691539168218952, "learning_rate": 9.559042467878492e-06, "loss": 0.351, "step": 2385 }, { "epoch": 0.8056632395078375, "grad_norm": 1.9483757444215413, "learning_rate": 9.556798478316169e-06, "loss": 0.3496, "step": 2390 }, { "epoch": 0.8073487274565987, "grad_norm": 1.3091971783922076, "learning_rate": 9.554549058272725e-06, "loss": 0.3908, "step": 2395 }, { "epoch": 0.8090342154053598, "grad_norm": 1.7075499068288942, "learning_rate": 9.552294210428863e-06, "loss": 0.3599, "step": 2400 }, { "epoch": 0.810719703354121, "grad_norm": 1.541184995336931, "learning_rate": 9.550033937471756e-06, "loss": 0.3517, "step": 2405 }, { "epoch": 0.8124051913028821, "grad_norm": 1.440864567708475, "learning_rate": 9.547768242095046e-06, "loss": 0.3198, "step": 2410 }, { "epoch": 0.8140906792516434, "grad_norm": 1.3114303025122667, "learning_rate": 9.545497126998832e-06, "loss": 0.3658, "step": 2415 }, { "epoch": 0.8157761672004045, "grad_norm": 1.2693919934972264, "learning_rate": 9.543220594889672e-06, "loss": 0.3648, "step": 2420 }, { "epoch": 0.8174616551491657, "grad_norm": 1.2480260474966405, "learning_rate": 9.540938648480584e-06, "loss": 0.3736, "step": 2425 }, { "epoch": 0.8191471430979268, "grad_norm": 1.2583908882869537, "learning_rate": 9.538651290491037e-06, "loss": 0.4143, "step": 2430 }, { "epoch": 0.820832631046688, "grad_norm": 1.3219514485911716, "learning_rate": 9.536358523646943e-06, "loss": 0.3573, "step": 2435 }, { "epoch": 0.8225181189954491, "grad_norm": 1.2563958548761411, "learning_rate": 9.53406035068067e-06, "loss": 0.3755, "step": 2440 }, { "epoch": 0.8242036069442104, "grad_norm": 1.300659464438313, "learning_rate": 9.53175677433102e-06, "loss": 0.3635, "step": 2445 }, { "epoch": 0.8258890948929715, "grad_norm": 1.2746373477428425, "learning_rate": 9.529447797343241e-06, "loss": 0.3644, "step": 2450 }, { "epoch": 0.8275745828417327, "grad_norm": 1.5446920899150354, "learning_rate": 9.527133422469013e-06, "loss": 0.3234, "step": 2455 }, { "epoch": 0.8292600707904938, "grad_norm": 1.2127750323326822, "learning_rate": 9.524813652466448e-06, "loss": 0.3503, "step": 2460 }, { "epoch": 0.830945558739255, "grad_norm": 1.0847472392184307, "learning_rate": 9.52248849010009e-06, "loss": 0.3797, "step": 2465 }, { "epoch": 0.8326310466880161, "grad_norm": 1.1563548773181405, "learning_rate": 9.520157938140912e-06, "loss": 0.3296, "step": 2470 }, { "epoch": 0.8343165346367774, "grad_norm": 1.2070003204546937, "learning_rate": 9.517821999366302e-06, "loss": 0.3823, "step": 2475 }, { "epoch": 0.8360020225855385, "grad_norm": 1.3126220997097682, "learning_rate": 9.515480676560073e-06, "loss": 0.3321, "step": 2480 }, { "epoch": 0.8376875105342997, "grad_norm": 1.231455314986524, "learning_rate": 9.513133972512458e-06, "loss": 0.3147, "step": 2485 }, { "epoch": 0.8393729984830608, "grad_norm": 1.2280471168780553, "learning_rate": 9.510781890020093e-06, "loss": 0.3706, "step": 2490 }, { "epoch": 0.841058486431822, "grad_norm": 1.3044595910355357, "learning_rate": 9.508424431886034e-06, "loss": 0.3394, "step": 2495 }, { "epoch": 0.8427439743805831, "grad_norm": 3.6679846130198843, "learning_rate": 9.506061600919734e-06, "loss": 0.3809, "step": 2500 }, { "epoch": 0.8444294623293443, "grad_norm": 1.1082238616328808, "learning_rate": 9.503693399937058e-06, "loss": 0.3014, "step": 2505 }, { "epoch": 0.8461149502781055, "grad_norm": 1.237984837643209, "learning_rate": 9.50131983176026e-06, "loss": 0.348, "step": 2510 }, { "epoch": 0.8478004382268667, "grad_norm": 1.5155978687734502, "learning_rate": 9.498940899218004e-06, "loss": 0.3623, "step": 2515 }, { "epoch": 0.8494859261756278, "grad_norm": 1.2146803088451354, "learning_rate": 9.496556605145335e-06, "loss": 0.346, "step": 2520 }, { "epoch": 0.851171414124389, "grad_norm": 1.0883708659526483, "learning_rate": 9.494166952383692e-06, "loss": 0.3711, "step": 2525 }, { "epoch": 0.8528569020731501, "grad_norm": 1.220308294992252, "learning_rate": 9.491771943780898e-06, "loss": 0.3464, "step": 2530 }, { "epoch": 0.8545423900219113, "grad_norm": 1.2710239748342407, "learning_rate": 9.489371582191164e-06, "loss": 0.3763, "step": 2535 }, { "epoch": 0.8562278779706725, "grad_norm": 1.1543955875726446, "learning_rate": 9.486965870475077e-06, "loss": 0.3861, "step": 2540 }, { "epoch": 0.8579133659194337, "grad_norm": 1.1142956892465583, "learning_rate": 9.484554811499598e-06, "loss": 0.3364, "step": 2545 }, { "epoch": 0.8595988538681948, "grad_norm": 1.660796741752221, "learning_rate": 9.482138408138064e-06, "loss": 0.3489, "step": 2550 }, { "epoch": 0.861284341816956, "grad_norm": 1.4461577988844987, "learning_rate": 9.479716663270178e-06, "loss": 0.3459, "step": 2555 }, { "epoch": 0.8629698297657171, "grad_norm": 1.4317652743708382, "learning_rate": 9.47728957978201e-06, "loss": 0.3726, "step": 2560 }, { "epoch": 0.8646553177144783, "grad_norm": 1.1694623802018054, "learning_rate": 9.474857160565993e-06, "loss": 0.3445, "step": 2565 }, { "epoch": 0.8663408056632395, "grad_norm": 1.1697324300325993, "learning_rate": 9.472419408520919e-06, "loss": 0.3244, "step": 2570 }, { "epoch": 0.8680262936120007, "grad_norm": 1.5911590885082034, "learning_rate": 9.469976326551933e-06, "loss": 0.3301, "step": 2575 }, { "epoch": 0.8697117815607618, "grad_norm": 1.2785690573279693, "learning_rate": 9.467527917570533e-06, "loss": 0.3666, "step": 2580 }, { "epoch": 0.871397269509523, "grad_norm": 1.1744870803715373, "learning_rate": 9.465074184494566e-06, "loss": 0.3516, "step": 2585 }, { "epoch": 0.8730827574582841, "grad_norm": 1.2360136206234111, "learning_rate": 9.462615130248223e-06, "loss": 0.3634, "step": 2590 }, { "epoch": 0.8747682454070453, "grad_norm": 1.4682989830244326, "learning_rate": 9.460150757762039e-06, "loss": 0.3503, "step": 2595 }, { "epoch": 0.8764537333558065, "grad_norm": 1.3774980387894733, "learning_rate": 9.45768106997288e-06, "loss": 0.3713, "step": 2600 }, { "epoch": 0.8781392213045677, "grad_norm": 1.294704912429981, "learning_rate": 9.455206069823959e-06, "loss": 0.3655, "step": 2605 }, { "epoch": 0.8798247092533288, "grad_norm": 1.327601776477407, "learning_rate": 9.452725760264805e-06, "loss": 0.3234, "step": 2610 }, { "epoch": 0.88151019720209, "grad_norm": 1.1802264296178286, "learning_rate": 9.450240144251284e-06, "loss": 0.3358, "step": 2615 }, { "epoch": 0.8831956851508511, "grad_norm": 1.748228754243499, "learning_rate": 9.447749224745583e-06, "loss": 0.3537, "step": 2620 }, { "epoch": 0.8848811730996123, "grad_norm": 1.4045870902450581, "learning_rate": 9.445253004716209e-06, "loss": 0.3663, "step": 2625 }, { "epoch": 0.8865666610483734, "grad_norm": 1.1940240009449117, "learning_rate": 9.442751487137989e-06, "loss": 0.3674, "step": 2630 }, { "epoch": 0.8882521489971347, "grad_norm": 1.319656512149089, "learning_rate": 9.440244674992058e-06, "loss": 0.357, "step": 2635 }, { "epoch": 0.8899376369458958, "grad_norm": 1.3350902364965938, "learning_rate": 9.437732571265866e-06, "loss": 0.3357, "step": 2640 }, { "epoch": 0.891623124894657, "grad_norm": 1.1765261943953333, "learning_rate": 9.435215178953164e-06, "loss": 0.3323, "step": 2645 }, { "epoch": 0.8933086128434181, "grad_norm": 1.4058080931394112, "learning_rate": 9.43269250105401e-06, "loss": 0.3527, "step": 2650 }, { "epoch": 0.8949941007921793, "grad_norm": 2.253523941662195, "learning_rate": 9.43016454057476e-06, "loss": 0.3649, "step": 2655 }, { "epoch": 0.8966795887409404, "grad_norm": 1.2420314384556268, "learning_rate": 9.427631300528061e-06, "loss": 0.3085, "step": 2660 }, { "epoch": 0.8983650766897017, "grad_norm": 1.7457147421616308, "learning_rate": 9.425092783932859e-06, "loss": 0.3691, "step": 2665 }, { "epoch": 0.9000505646384628, "grad_norm": 1.0959408810962532, "learning_rate": 9.422548993814382e-06, "loss": 0.3544, "step": 2670 }, { "epoch": 0.901736052587224, "grad_norm": 1.3553626323545842, "learning_rate": 9.419999933204149e-06, "loss": 0.3872, "step": 2675 }, { "epoch": 0.9034215405359851, "grad_norm": 1.2666777257862378, "learning_rate": 9.417445605139952e-06, "loss": 0.3353, "step": 2680 }, { "epoch": 0.9051070284847463, "grad_norm": 1.4493189109284006, "learning_rate": 9.414886012665867e-06, "loss": 0.3425, "step": 2685 }, { "epoch": 0.9067925164335074, "grad_norm": 1.5303749783674303, "learning_rate": 9.41232115883224e-06, "loss": 0.3597, "step": 2690 }, { "epoch": 0.9084780043822687, "grad_norm": 5.455372188432605, "learning_rate": 9.409751046695692e-06, "loss": 0.3434, "step": 2695 }, { "epoch": 0.9101634923310299, "grad_norm": 1.4618420145930104, "learning_rate": 9.407175679319103e-06, "loss": 0.343, "step": 2700 }, { "epoch": 0.911848980279791, "grad_norm": 1.1877451757817232, "learning_rate": 9.404595059771621e-06, "loss": 0.342, "step": 2705 }, { "epoch": 0.9135344682285522, "grad_norm": 1.2667632479796593, "learning_rate": 9.402009191128653e-06, "loss": 0.3261, "step": 2710 }, { "epoch": 0.9152199561773133, "grad_norm": 1.1273263571237337, "learning_rate": 9.39941807647186e-06, "loss": 0.3782, "step": 2715 }, { "epoch": 0.9169054441260746, "grad_norm": 1.3475581960058018, "learning_rate": 9.396821718889158e-06, "loss": 0.3665, "step": 2720 }, { "epoch": 0.9185909320748357, "grad_norm": 1.1018041204366436, "learning_rate": 9.394220121474703e-06, "loss": 0.3373, "step": 2725 }, { "epoch": 0.9202764200235969, "grad_norm": 1.196917924601841, "learning_rate": 9.391613287328908e-06, "loss": 0.3568, "step": 2730 }, { "epoch": 0.921961907972358, "grad_norm": 1.1647769372411554, "learning_rate": 9.389001219558413e-06, "loss": 0.3323, "step": 2735 }, { "epoch": 0.9236473959211192, "grad_norm": 1.550251090743837, "learning_rate": 9.386383921276106e-06, "loss": 0.3511, "step": 2740 }, { "epoch": 0.9253328838698803, "grad_norm": 1.661099673054384, "learning_rate": 9.383761395601103e-06, "loss": 0.3535, "step": 2745 }, { "epoch": 0.9270183718186416, "grad_norm": 1.4237022182652403, "learning_rate": 9.381133645658751e-06, "loss": 0.3475, "step": 2750 }, { "epoch": 0.9287038597674027, "grad_norm": 1.1476681905519996, "learning_rate": 9.378500674580624e-06, "loss": 0.3464, "step": 2755 }, { "epoch": 0.9303893477161639, "grad_norm": 1.4250656646807038, "learning_rate": 9.375862485504517e-06, "loss": 0.3385, "step": 2760 }, { "epoch": 0.932074835664925, "grad_norm": 1.079610280655471, "learning_rate": 9.37321908157444e-06, "loss": 0.3547, "step": 2765 }, { "epoch": 0.9337603236136862, "grad_norm": 1.2807702653709614, "learning_rate": 9.370570465940626e-06, "loss": 0.3507, "step": 2770 }, { "epoch": 0.9354458115624473, "grad_norm": 1.268775963488638, "learning_rate": 9.367916641759514e-06, "loss": 0.3766, "step": 2775 }, { "epoch": 0.9371312995112085, "grad_norm": 1.2173670686821865, "learning_rate": 9.365257612193746e-06, "loss": 0.3408, "step": 2780 }, { "epoch": 0.9388167874599697, "grad_norm": 1.2329741216099723, "learning_rate": 9.362593380412175e-06, "loss": 0.349, "step": 2785 }, { "epoch": 0.9405022754087309, "grad_norm": 1.1646655692947196, "learning_rate": 9.359923949589848e-06, "loss": 0.3115, "step": 2790 }, { "epoch": 0.942187763357492, "grad_norm": 1.3937233536880058, "learning_rate": 9.357249322908016e-06, "loss": 0.375, "step": 2795 }, { "epoch": 0.9438732513062532, "grad_norm": 1.2562016101527584, "learning_rate": 9.354569503554108e-06, "loss": 0.3383, "step": 2800 }, { "epoch": 0.9455587392550143, "grad_norm": 1.636938094391326, "learning_rate": 9.351884494721755e-06, "loss": 0.352, "step": 2805 }, { "epoch": 0.9472442272037755, "grad_norm": 1.4514271135210426, "learning_rate": 9.349194299610768e-06, "loss": 0.3403, "step": 2810 }, { "epoch": 0.9489297151525367, "grad_norm": 1.1700167615062402, "learning_rate": 9.346498921427133e-06, "loss": 0.3415, "step": 2815 }, { "epoch": 0.9506152031012979, "grad_norm": 1.3363274751129468, "learning_rate": 9.343798363383023e-06, "loss": 0.3811, "step": 2820 }, { "epoch": 0.952300691050059, "grad_norm": 1.1253900331786755, "learning_rate": 9.341092628696775e-06, "loss": 0.3484, "step": 2825 }, { "epoch": 0.9539861789988202, "grad_norm": 1.2107918686939847, "learning_rate": 9.3383817205929e-06, "loss": 0.3701, "step": 2830 }, { "epoch": 0.9556716669475813, "grad_norm": 1.1469262673195784, "learning_rate": 9.335665642302072e-06, "loss": 0.2904, "step": 2835 }, { "epoch": 0.9573571548963425, "grad_norm": 1.239341659965055, "learning_rate": 9.33294439706113e-06, "loss": 0.3238, "step": 2840 }, { "epoch": 0.9590426428451037, "grad_norm": 1.1345842333861247, "learning_rate": 9.330217988113065e-06, "loss": 0.3203, "step": 2845 }, { "epoch": 0.9607281307938649, "grad_norm": 1.8066563146598626, "learning_rate": 9.327486418707027e-06, "loss": 0.3571, "step": 2850 }, { "epoch": 0.962413618742626, "grad_norm": 1.3574794785080313, "learning_rate": 9.324749692098314e-06, "loss": 0.3468, "step": 2855 }, { "epoch": 0.9640991066913872, "grad_norm": 1.1539937926435926, "learning_rate": 9.322007811548368e-06, "loss": 0.3475, "step": 2860 }, { "epoch": 0.9657845946401483, "grad_norm": 1.223891872303014, "learning_rate": 9.319260780324776e-06, "loss": 0.3376, "step": 2865 }, { "epoch": 0.9674700825889095, "grad_norm": 1.2001868694768643, "learning_rate": 9.316508601701262e-06, "loss": 0.3292, "step": 2870 }, { "epoch": 0.9691555705376707, "grad_norm": 1.6611575000957008, "learning_rate": 9.313751278957685e-06, "loss": 0.3418, "step": 2875 }, { "epoch": 0.9708410584864319, "grad_norm": 1.2208847867854693, "learning_rate": 9.310988815380032e-06, "loss": 0.356, "step": 2880 }, { "epoch": 0.972526546435193, "grad_norm": 1.0958379301416534, "learning_rate": 9.308221214260422e-06, "loss": 0.3028, "step": 2885 }, { "epoch": 0.9742120343839542, "grad_norm": 1.2366470625056762, "learning_rate": 9.30544847889709e-06, "loss": 0.3607, "step": 2890 }, { "epoch": 0.9758975223327153, "grad_norm": 2.3762930639598774, "learning_rate": 9.302670612594395e-06, "loss": 0.3319, "step": 2895 }, { "epoch": 0.9775830102814765, "grad_norm": 1.2431508247950915, "learning_rate": 9.299887618662805e-06, "loss": 0.3503, "step": 2900 }, { "epoch": 0.9792684982302376, "grad_norm": 1.4436262664920567, "learning_rate": 9.297099500418905e-06, "loss": 0.3621, "step": 2905 }, { "epoch": 0.9809539861789989, "grad_norm": 1.1419148133150692, "learning_rate": 9.29430626118538e-06, "loss": 0.3326, "step": 2910 }, { "epoch": 0.98263947412776, "grad_norm": 1.153375332782516, "learning_rate": 9.291507904291026e-06, "loss": 0.3543, "step": 2915 }, { "epoch": 0.9843249620765212, "grad_norm": 1.2425388348474078, "learning_rate": 9.288704433070731e-06, "loss": 0.3487, "step": 2920 }, { "epoch": 0.9860104500252823, "grad_norm": 1.317597576468785, "learning_rate": 9.285895850865483e-06, "loss": 0.3547, "step": 2925 }, { "epoch": 0.9876959379740435, "grad_norm": 1.2615415762585858, "learning_rate": 9.283082161022356e-06, "loss": 0.3728, "step": 2930 }, { "epoch": 0.9893814259228046, "grad_norm": 1.1166740430987043, "learning_rate": 9.280263366894514e-06, "loss": 0.3274, "step": 2935 }, { "epoch": 0.9910669138715659, "grad_norm": 1.1510088475240097, "learning_rate": 9.277439471841203e-06, "loss": 0.3593, "step": 2940 }, { "epoch": 0.992752401820327, "grad_norm": 1.1971651350588435, "learning_rate": 9.274610479227748e-06, "loss": 0.3319, "step": 2945 }, { "epoch": 0.9944378897690882, "grad_norm": 1.380371721659796, "learning_rate": 9.271776392425551e-06, "loss": 0.3314, "step": 2950 }, { "epoch": 0.9961233777178493, "grad_norm": 2.3108768666864035, "learning_rate": 9.26893721481208e-06, "loss": 0.3586, "step": 2955 }, { "epoch": 0.9978088656666105, "grad_norm": 1.045938904984177, "learning_rate": 9.266092949770875e-06, "loss": 0.3319, "step": 2960 }, { "epoch": 0.9994943536153716, "grad_norm": 1.271576090306851, "learning_rate": 9.263243600691538e-06, "loss": 0.3502, "step": 2965 }, { "epoch": 1.0010112927692567, "grad_norm": 1.1986475582635325, "learning_rate": 9.260389170969726e-06, "loss": 0.3169, "step": 2970 }, { "epoch": 1.0026967807180178, "grad_norm": 1.1417963899354506, "learning_rate": 9.257529664007154e-06, "loss": 0.3256, "step": 2975 }, { "epoch": 1.004382268666779, "grad_norm": 1.8801962520324713, "learning_rate": 9.254665083211587e-06, "loss": 0.3699, "step": 2980 }, { "epoch": 1.0060677566155403, "grad_norm": 1.1225537502833858, "learning_rate": 9.251795431996837e-06, "loss": 0.3463, "step": 2985 }, { "epoch": 1.0077532445643014, "grad_norm": 1.1927612026553454, "learning_rate": 9.248920713782759e-06, "loss": 0.3152, "step": 2990 }, { "epoch": 1.0094387325130625, "grad_norm": 1.792026765241136, "learning_rate": 9.246040931995246e-06, "loss": 0.345, "step": 2995 }, { "epoch": 1.0111242204618236, "grad_norm": 1.3165032042139777, "learning_rate": 9.243156090066222e-06, "loss": 0.3033, "step": 3000 }, { "epoch": 1.012809708410585, "grad_norm": 1.5271171902753329, "learning_rate": 9.24026619143365e-06, "loss": 0.3451, "step": 3005 }, { "epoch": 1.014495196359346, "grad_norm": 1.349154410739946, "learning_rate": 9.237371239541507e-06, "loss": 0.3178, "step": 3010 }, { "epoch": 1.0161806843081072, "grad_norm": 1.5345825719059862, "learning_rate": 9.234471237839804e-06, "loss": 0.3129, "step": 3015 }, { "epoch": 1.0178661722568683, "grad_norm": 1.1463231622867909, "learning_rate": 9.231566189784562e-06, "loss": 0.2886, "step": 3020 }, { "epoch": 1.0195516602056296, "grad_norm": 1.1444509789566124, "learning_rate": 9.228656098837823e-06, "loss": 0.3396, "step": 3025 }, { "epoch": 1.0212371481543907, "grad_norm": 1.23581179979636, "learning_rate": 9.22574096846763e-06, "loss": 0.336, "step": 3030 }, { "epoch": 1.0229226361031518, "grad_norm": 1.1249953312800807, "learning_rate": 9.222820802148038e-06, "loss": 0.3101, "step": 3035 }, { "epoch": 1.024608124051913, "grad_norm": 1.216249650136864, "learning_rate": 9.219895603359103e-06, "loss": 0.3602, "step": 3040 }, { "epoch": 1.0262936120006743, "grad_norm": 1.0811514174168881, "learning_rate": 9.216965375586875e-06, "loss": 0.3225, "step": 3045 }, { "epoch": 1.0279790999494354, "grad_norm": 1.0821060654433754, "learning_rate": 9.214030122323398e-06, "loss": 0.3273, "step": 3050 }, { "epoch": 1.0296645878981965, "grad_norm": 1.192607743749468, "learning_rate": 9.211089847066712e-06, "loss": 0.3053, "step": 3055 }, { "epoch": 1.0313500758469576, "grad_norm": 1.3329700065622327, "learning_rate": 9.20814455332083e-06, "loss": 0.3543, "step": 3060 }, { "epoch": 1.033035563795719, "grad_norm": 1.1292006779440744, "learning_rate": 9.205194244595756e-06, "loss": 0.3375, "step": 3065 }, { "epoch": 1.03472105174448, "grad_norm": 1.0572796874473291, "learning_rate": 9.202238924407464e-06, "loss": 0.3286, "step": 3070 }, { "epoch": 1.0364065396932411, "grad_norm": 1.3958436199927817, "learning_rate": 9.199278596277903e-06, "loss": 0.3279, "step": 3075 }, { "epoch": 1.0380920276420023, "grad_norm": 1.0901119258919272, "learning_rate": 9.196313263734992e-06, "loss": 0.3265, "step": 3080 }, { "epoch": 1.0397775155907636, "grad_norm": 1.7433779848202735, "learning_rate": 9.193342930312609e-06, "loss": 0.3264, "step": 3085 }, { "epoch": 1.0414630035395247, "grad_norm": 1.0532495750269206, "learning_rate": 9.190367599550595e-06, "loss": 0.3527, "step": 3090 }, { "epoch": 1.0431484914882858, "grad_norm": 1.3723820144680032, "learning_rate": 9.187387274994748e-06, "loss": 0.3388, "step": 3095 }, { "epoch": 1.044833979437047, "grad_norm": 2.2437791281647352, "learning_rate": 9.184401960196812e-06, "loss": 0.357, "step": 3100 }, { "epoch": 1.0465194673858083, "grad_norm": 1.4617104242350367, "learning_rate": 9.181411658714484e-06, "loss": 0.2893, "step": 3105 }, { "epoch": 1.0482049553345694, "grad_norm": 1.3252145371158623, "learning_rate": 9.178416374111399e-06, "loss": 0.3202, "step": 3110 }, { "epoch": 1.0498904432833305, "grad_norm": 1.0885819214840982, "learning_rate": 9.175416109957136e-06, "loss": 0.3402, "step": 3115 }, { "epoch": 1.0515759312320916, "grad_norm": 1.19227178930355, "learning_rate": 9.1724108698272e-06, "loss": 0.305, "step": 3120 }, { "epoch": 1.053261419180853, "grad_norm": 1.6085625561651227, "learning_rate": 9.169400657303033e-06, "loss": 0.3194, "step": 3125 }, { "epoch": 1.054946907129614, "grad_norm": 1.2166238467446886, "learning_rate": 9.166385475972002e-06, "loss": 0.3435, "step": 3130 }, { "epoch": 1.0566323950783751, "grad_norm": 1.1891858057777183, "learning_rate": 9.163365329427392e-06, "loss": 0.3026, "step": 3135 }, { "epoch": 1.0583178830271363, "grad_norm": 1.3225904722574446, "learning_rate": 9.160340221268408e-06, "loss": 0.3191, "step": 3140 }, { "epoch": 1.0600033709758976, "grad_norm": 1.2519048096971406, "learning_rate": 9.15731015510017e-06, "loss": 0.3054, "step": 3145 }, { "epoch": 1.0616888589246587, "grad_norm": 2.1095956559646316, "learning_rate": 9.154275134533699e-06, "loss": 0.2828, "step": 3150 }, { "epoch": 1.0633743468734198, "grad_norm": 1.274722011141106, "learning_rate": 9.151235163185929e-06, "loss": 0.352, "step": 3155 }, { "epoch": 1.065059834822181, "grad_norm": 1.1965108294556865, "learning_rate": 9.14819024467969e-06, "loss": 0.3126, "step": 3160 }, { "epoch": 1.0667453227709423, "grad_norm": 1.1996063029528807, "learning_rate": 9.145140382643703e-06, "loss": 0.3547, "step": 3165 }, { "epoch": 1.0684308107197034, "grad_norm": 1.02099250189113, "learning_rate": 9.142085580712591e-06, "loss": 0.304, "step": 3170 }, { "epoch": 1.0701162986684645, "grad_norm": 1.164709913815049, "learning_rate": 9.139025842526856e-06, "loss": 0.3449, "step": 3175 }, { "epoch": 1.0718017866172258, "grad_norm": 1.486414637369902, "learning_rate": 9.135961171732884e-06, "loss": 0.3583, "step": 3180 }, { "epoch": 1.073487274565987, "grad_norm": 1.1493998901839066, "learning_rate": 9.132891571982942e-06, "loss": 0.3236, "step": 3185 }, { "epoch": 1.075172762514748, "grad_norm": 1.2765353448642704, "learning_rate": 9.12981704693517e-06, "loss": 0.3555, "step": 3190 }, { "epoch": 1.0768582504635091, "grad_norm": 1.753797510882966, "learning_rate": 9.126737600253574e-06, "loss": 0.3679, "step": 3195 }, { "epoch": 1.0785437384122702, "grad_norm": 1.2563728938227616, "learning_rate": 9.12365323560803e-06, "loss": 0.3175, "step": 3200 }, { "epoch": 1.0802292263610316, "grad_norm": 1.2304012448682435, "learning_rate": 9.120563956674272e-06, "loss": 0.3376, "step": 3205 }, { "epoch": 1.0819147143097927, "grad_norm": 1.2474689276169746, "learning_rate": 9.117469767133894e-06, "loss": 0.3478, "step": 3210 }, { "epoch": 1.0836002022585538, "grad_norm": 1.3455991754168237, "learning_rate": 9.114370670674337e-06, "loss": 0.3143, "step": 3215 }, { "epoch": 1.0852856902073151, "grad_norm": 1.133241200911727, "learning_rate": 9.111266670988893e-06, "loss": 0.3331, "step": 3220 }, { "epoch": 1.0869711781560762, "grad_norm": 1.1508043345397088, "learning_rate": 9.108157771776698e-06, "loss": 0.3373, "step": 3225 }, { "epoch": 1.0886566661048374, "grad_norm": 1.2732854414203183, "learning_rate": 9.105043976742724e-06, "loss": 0.2919, "step": 3230 }, { "epoch": 1.0903421540535985, "grad_norm": 1.2997747548615537, "learning_rate": 9.101925289597781e-06, "loss": 0.3021, "step": 3235 }, { "epoch": 1.0920276420023596, "grad_norm": 1.3231619815669389, "learning_rate": 9.098801714058506e-06, "loss": 0.3271, "step": 3240 }, { "epoch": 1.093713129951121, "grad_norm": 2.41195593545421, "learning_rate": 9.095673253847364e-06, "loss": 0.327, "step": 3245 }, { "epoch": 1.095398617899882, "grad_norm": 1.6405851933675375, "learning_rate": 9.092539912692639e-06, "loss": 0.3385, "step": 3250 }, { "epoch": 1.0970841058486431, "grad_norm": 1.8352324988894357, "learning_rate": 9.089401694328436e-06, "loss": 0.3542, "step": 3255 }, { "epoch": 1.0987695937974045, "grad_norm": 1.2345015508784225, "learning_rate": 9.086258602494662e-06, "loss": 0.3333, "step": 3260 }, { "epoch": 1.1004550817461656, "grad_norm": 1.6093942418773934, "learning_rate": 9.083110640937048e-06, "loss": 0.3461, "step": 3265 }, { "epoch": 1.1021405696949267, "grad_norm": 1.3363574509349696, "learning_rate": 9.079957813407112e-06, "loss": 0.2914, "step": 3270 }, { "epoch": 1.1038260576436878, "grad_norm": 1.203001719749076, "learning_rate": 9.076800123662185e-06, "loss": 0.3163, "step": 3275 }, { "epoch": 1.105511545592449, "grad_norm": 1.4431081253221059, "learning_rate": 9.073637575465379e-06, "loss": 0.3336, "step": 3280 }, { "epoch": 1.1071970335412102, "grad_norm": 1.257268413635979, "learning_rate": 9.070470172585611e-06, "loss": 0.3237, "step": 3285 }, { "epoch": 1.1088825214899714, "grad_norm": 1.2842426361889514, "learning_rate": 9.067297918797567e-06, "loss": 0.3212, "step": 3290 }, { "epoch": 1.1105680094387325, "grad_norm": 1.192735317660996, "learning_rate": 9.064120817881729e-06, "loss": 0.3514, "step": 3295 }, { "epoch": 1.1122534973874938, "grad_norm": 1.6647526277638496, "learning_rate": 9.060938873624346e-06, "loss": 0.3335, "step": 3300 }, { "epoch": 1.113938985336255, "grad_norm": 1.0740569540218787, "learning_rate": 9.05775208981744e-06, "loss": 0.3038, "step": 3305 }, { "epoch": 1.115624473285016, "grad_norm": 1.3657359178357789, "learning_rate": 9.054560470258805e-06, "loss": 0.3076, "step": 3310 }, { "epoch": 1.1173099612337771, "grad_norm": 1.2527517404187796, "learning_rate": 9.051364018751996e-06, "loss": 0.3026, "step": 3315 }, { "epoch": 1.1189954491825382, "grad_norm": 1.607673503435768, "learning_rate": 9.048162739106322e-06, "loss": 0.3026, "step": 3320 }, { "epoch": 1.1206809371312996, "grad_norm": 1.381814453274146, "learning_rate": 9.044956635136853e-06, "loss": 0.3404, "step": 3325 }, { "epoch": 1.1223664250800607, "grad_norm": 1.5004775481227122, "learning_rate": 9.0417457106644e-06, "loss": 0.3211, "step": 3330 }, { "epoch": 1.1240519130288218, "grad_norm": 1.0621641080464284, "learning_rate": 9.038529969515529e-06, "loss": 0.3296, "step": 3335 }, { "epoch": 1.1257374009775831, "grad_norm": 1.294817961155883, "learning_rate": 9.035309415522537e-06, "loss": 0.3133, "step": 3340 }, { "epoch": 1.1274228889263442, "grad_norm": 1.437956315448835, "learning_rate": 9.032084052523462e-06, "loss": 0.3247, "step": 3345 }, { "epoch": 1.1291083768751053, "grad_norm": 1.1717633173338764, "learning_rate": 9.02885388436207e-06, "loss": 0.3131, "step": 3350 }, { "epoch": 1.1307938648238665, "grad_norm": 6.425086001865455, "learning_rate": 9.025618914887853e-06, "loss": 0.3044, "step": 3355 }, { "epoch": 1.1324793527726276, "grad_norm": 1.2857207290031247, "learning_rate": 9.022379147956032e-06, "loss": 0.3179, "step": 3360 }, { "epoch": 1.134164840721389, "grad_norm": 1.0364430803760774, "learning_rate": 9.019134587427535e-06, "loss": 0.316, "step": 3365 }, { "epoch": 1.13585032867015, "grad_norm": 1.9246333037407806, "learning_rate": 9.01588523716901e-06, "loss": 0.3534, "step": 3370 }, { "epoch": 1.1375358166189111, "grad_norm": 1.5172240239210515, "learning_rate": 9.01263110105281e-06, "loss": 0.3145, "step": 3375 }, { "epoch": 1.1392213045676725, "grad_norm": 1.250729854508053, "learning_rate": 9.009372182956993e-06, "loss": 0.337, "step": 3380 }, { "epoch": 1.1409067925164336, "grad_norm": 1.286048477470388, "learning_rate": 9.006108486765312e-06, "loss": 0.329, "step": 3385 }, { "epoch": 1.1425922804651947, "grad_norm": 1.316957374362688, "learning_rate": 9.00284001636722e-06, "loss": 0.3195, "step": 3390 }, { "epoch": 1.1442777684139558, "grad_norm": 1.4232912394152366, "learning_rate": 8.999566775657855e-06, "loss": 0.3139, "step": 3395 }, { "epoch": 1.145963256362717, "grad_norm": 1.373486456426613, "learning_rate": 8.996288768538044e-06, "loss": 0.3267, "step": 3400 }, { "epoch": 1.1476487443114782, "grad_norm": 2.204423812126712, "learning_rate": 8.99300599891429e-06, "loss": 0.3288, "step": 3405 }, { "epoch": 1.1493342322602393, "grad_norm": 1.4328944647787816, "learning_rate": 8.989718470698776e-06, "loss": 0.3438, "step": 3410 }, { "epoch": 1.1510197202090005, "grad_norm": 1.7727307773185965, "learning_rate": 8.98642618780935e-06, "loss": 0.3037, "step": 3415 }, { "epoch": 1.1527052081577618, "grad_norm": 1.3942250547499695, "learning_rate": 8.98312915416953e-06, "loss": 0.3231, "step": 3420 }, { "epoch": 1.154390696106523, "grad_norm": 1.3782199716782548, "learning_rate": 8.979827373708499e-06, "loss": 0.3477, "step": 3425 }, { "epoch": 1.156076184055284, "grad_norm": 1.4768237198049075, "learning_rate": 8.97652085036109e-06, "loss": 0.3403, "step": 3430 }, { "epoch": 1.1577616720040451, "grad_norm": 1.2846348096176017, "learning_rate": 8.973209588067794e-06, "loss": 0.3194, "step": 3435 }, { "epoch": 1.1594471599528062, "grad_norm": 1.3310634826315841, "learning_rate": 8.969893590774745e-06, "loss": 0.3325, "step": 3440 }, { "epoch": 1.1611326479015676, "grad_norm": 1.5129127846833883, "learning_rate": 8.966572862433724e-06, "loss": 0.2987, "step": 3445 }, { "epoch": 1.1628181358503287, "grad_norm": 1.2467326527956635, "learning_rate": 8.963247407002148e-06, "loss": 0.3037, "step": 3450 }, { "epoch": 1.1645036237990898, "grad_norm": 1.4479209643129705, "learning_rate": 8.959917228443067e-06, "loss": 0.37, "step": 3455 }, { "epoch": 1.1661891117478511, "grad_norm": 1.439340391515246, "learning_rate": 8.956582330725158e-06, "loss": 0.3276, "step": 3460 }, { "epoch": 1.1678745996966122, "grad_norm": 1.259754683808496, "learning_rate": 8.953242717822727e-06, "loss": 0.3288, "step": 3465 }, { "epoch": 1.1695600876453733, "grad_norm": 1.2098043401681866, "learning_rate": 8.9498983937157e-06, "loss": 0.3194, "step": 3470 }, { "epoch": 1.1712455755941344, "grad_norm": 1.1930692751081917, "learning_rate": 8.946549362389605e-06, "loss": 0.3269, "step": 3475 }, { "epoch": 1.1729310635428956, "grad_norm": 1.131719558028194, "learning_rate": 8.943195627835597e-06, "loss": 0.3231, "step": 3480 }, { "epoch": 1.174616551491657, "grad_norm": 1.5891565143727822, "learning_rate": 8.939837194050422e-06, "loss": 0.3252, "step": 3485 }, { "epoch": 1.176302039440418, "grad_norm": 1.047800343701104, "learning_rate": 8.936474065036435e-06, "loss": 0.3068, "step": 3490 }, { "epoch": 1.1779875273891791, "grad_norm": 1.2966874445214869, "learning_rate": 8.933106244801584e-06, "loss": 0.3234, "step": 3495 }, { "epoch": 1.1796730153379404, "grad_norm": 1.3096370074171433, "learning_rate": 8.929733737359406e-06, "loss": 0.3364, "step": 3500 }, { "epoch": 1.1813585032867016, "grad_norm": 1.1199138577516985, "learning_rate": 8.926356546729025e-06, "loss": 0.2954, "step": 3505 }, { "epoch": 1.1830439912354627, "grad_norm": 1.1605643433971202, "learning_rate": 8.922974676935145e-06, "loss": 0.3243, "step": 3510 }, { "epoch": 1.1847294791842238, "grad_norm": 1.1419222736041674, "learning_rate": 8.919588132008048e-06, "loss": 0.3448, "step": 3515 }, { "epoch": 1.1864149671329849, "grad_norm": 1.2423183879943918, "learning_rate": 8.916196915983588e-06, "loss": 0.3151, "step": 3520 }, { "epoch": 1.1881004550817462, "grad_norm": 1.4313330517460534, "learning_rate": 8.912801032903183e-06, "loss": 0.3006, "step": 3525 }, { "epoch": 1.1897859430305073, "grad_norm": 1.31627280105742, "learning_rate": 8.909400486813817e-06, "loss": 0.32, "step": 3530 }, { "epoch": 1.1914714309792684, "grad_norm": 1.3249246461026147, "learning_rate": 8.905995281768024e-06, "loss": 0.3635, "step": 3535 }, { "epoch": 1.1931569189280298, "grad_norm": 1.0531573968679886, "learning_rate": 8.902585421823901e-06, "loss": 0.2969, "step": 3540 }, { "epoch": 1.1948424068767909, "grad_norm": 1.2781551220592962, "learning_rate": 8.899170911045081e-06, "loss": 0.3148, "step": 3545 }, { "epoch": 1.196527894825552, "grad_norm": 2.2616317341010084, "learning_rate": 8.895751753500745e-06, "loss": 0.3649, "step": 3550 }, { "epoch": 1.198213382774313, "grad_norm": 1.0111562114447625, "learning_rate": 8.892327953265616e-06, "loss": 0.3189, "step": 3555 }, { "epoch": 1.1998988707230742, "grad_norm": 1.6597569641402565, "learning_rate": 8.888899514419939e-06, "loss": 0.3191, "step": 3560 }, { "epoch": 1.2015843586718356, "grad_norm": 1.31780683860324, "learning_rate": 8.885466441049497e-06, "loss": 0.3148, "step": 3565 }, { "epoch": 1.2032698466205967, "grad_norm": 1.2291331373676375, "learning_rate": 8.882028737245592e-06, "loss": 0.3263, "step": 3570 }, { "epoch": 1.2049553345693578, "grad_norm": 1.1658309953207986, "learning_rate": 8.878586407105043e-06, "loss": 0.3267, "step": 3575 }, { "epoch": 1.206640822518119, "grad_norm": 1.3344425773299426, "learning_rate": 8.875139454730186e-06, "loss": 0.3193, "step": 3580 }, { "epoch": 1.2083263104668802, "grad_norm": 1.249352254184222, "learning_rate": 8.87168788422886e-06, "loss": 0.2952, "step": 3585 }, { "epoch": 1.2100117984156413, "grad_norm": 2.036619401521633, "learning_rate": 8.868231699714416e-06, "loss": 0.2887, "step": 3590 }, { "epoch": 1.2116972863644024, "grad_norm": 2.3502708587080297, "learning_rate": 8.864770905305695e-06, "loss": 0.2927, "step": 3595 }, { "epoch": 1.2133827743131635, "grad_norm": 1.292105539706738, "learning_rate": 8.861305505127036e-06, "loss": 0.3381, "step": 3600 }, { "epoch": 1.2150682622619249, "grad_norm": 1.6205141543398498, "learning_rate": 8.857835503308266e-06, "loss": 0.3399, "step": 3605 }, { "epoch": 1.216753750210686, "grad_norm": 1.2254379988378619, "learning_rate": 8.854360903984697e-06, "loss": 0.3415, "step": 3610 }, { "epoch": 1.218439238159447, "grad_norm": 1.3846859413438768, "learning_rate": 8.850881711297117e-06, "loss": 0.3079, "step": 3615 }, { "epoch": 1.2201247261082084, "grad_norm": 1.9821368611156058, "learning_rate": 8.847397929391793e-06, "loss": 0.3055, "step": 3620 }, { "epoch": 1.2218102140569695, "grad_norm": 1.1642210769863615, "learning_rate": 8.843909562420456e-06, "loss": 0.3264, "step": 3625 }, { "epoch": 1.2234957020057307, "grad_norm": 2.471738118695882, "learning_rate": 8.840416614540306e-06, "loss": 0.3232, "step": 3630 }, { "epoch": 1.2251811899544918, "grad_norm": 1.3172602960661328, "learning_rate": 8.836919089913998e-06, "loss": 0.3292, "step": 3635 }, { "epoch": 1.2268666779032529, "grad_norm": 1.077852869618823, "learning_rate": 8.83341699270964e-06, "loss": 0.3032, "step": 3640 }, { "epoch": 1.2285521658520142, "grad_norm": 1.4280651642881883, "learning_rate": 8.8299103271008e-06, "loss": 0.3203, "step": 3645 }, { "epoch": 1.2302376538007753, "grad_norm": 1.1602409372858455, "learning_rate": 8.826399097266473e-06, "loss": 0.3161, "step": 3650 }, { "epoch": 1.2319231417495364, "grad_norm": 3.894990600706879, "learning_rate": 8.82288330739111e-06, "loss": 0.2761, "step": 3655 }, { "epoch": 1.2336086296982978, "grad_norm": 1.2689284910708734, "learning_rate": 8.819362961664586e-06, "loss": 0.3032, "step": 3660 }, { "epoch": 1.2352941176470589, "grad_norm": 1.2822831605625233, "learning_rate": 8.815838064282208e-06, "loss": 0.3138, "step": 3665 }, { "epoch": 1.23697960559582, "grad_norm": 1.1376184821858066, "learning_rate": 8.812308619444712e-06, "loss": 0.3512, "step": 3670 }, { "epoch": 1.238665093544581, "grad_norm": 1.2450713686302888, "learning_rate": 8.808774631358244e-06, "loss": 0.3082, "step": 3675 }, { "epoch": 1.2403505814933422, "grad_norm": 1.099460704911777, "learning_rate": 8.805236104234372e-06, "loss": 0.3291, "step": 3680 }, { "epoch": 1.2420360694421035, "grad_norm": 1.6632790284896568, "learning_rate": 8.801693042290071e-06, "loss": 0.3106, "step": 3685 }, { "epoch": 1.2437215573908647, "grad_norm": 2.5895887194216183, "learning_rate": 8.798145449747721e-06, "loss": 0.3363, "step": 3690 }, { "epoch": 1.2454070453396258, "grad_norm": 2.1677338636579977, "learning_rate": 8.794593330835099e-06, "loss": 0.3115, "step": 3695 }, { "epoch": 1.247092533288387, "grad_norm": 1.531110066214464, "learning_rate": 8.79103668978538e-06, "loss": 0.2923, "step": 3700 }, { "epoch": 1.2487780212371482, "grad_norm": 1.6546965442835597, "learning_rate": 8.78747553083712e-06, "loss": 0.3206, "step": 3705 }, { "epoch": 1.2504635091859093, "grad_norm": 1.197919094195317, "learning_rate": 8.783909858234272e-06, "loss": 0.3195, "step": 3710 }, { "epoch": 1.2521489971346704, "grad_norm": 4.8612312432634495, "learning_rate": 8.780339676226156e-06, "loss": 0.3121, "step": 3715 }, { "epoch": 1.2538344850834315, "grad_norm": 1.4903782245448083, "learning_rate": 8.776764989067474e-06, "loss": 0.3486, "step": 3720 }, { "epoch": 1.2555199730321929, "grad_norm": 2.744059121273202, "learning_rate": 8.77318580101829e-06, "loss": 0.3178, "step": 3725 }, { "epoch": 1.257205460980954, "grad_norm": 2.2453972057845353, "learning_rate": 8.769602116344043e-06, "loss": 0.3479, "step": 3730 }, { "epoch": 1.258890948929715, "grad_norm": 1.4520489883541077, "learning_rate": 8.766013939315518e-06, "loss": 0.3103, "step": 3735 }, { "epoch": 1.2605764368784764, "grad_norm": 1.562303766128593, "learning_rate": 8.762421274208858e-06, "loss": 0.3283, "step": 3740 }, { "epoch": 1.2622619248272375, "grad_norm": 1.5501877544034917, "learning_rate": 8.75882412530556e-06, "loss": 0.3248, "step": 3745 }, { "epoch": 1.2639474127759986, "grad_norm": 1.2632632162088169, "learning_rate": 8.75522249689246e-06, "loss": 0.3112, "step": 3750 }, { "epoch": 1.2656329007247598, "grad_norm": 1.1052986233758186, "learning_rate": 8.75161639326173e-06, "loss": 0.3115, "step": 3755 }, { "epoch": 1.2673183886735209, "grad_norm": 2.372782363444479, "learning_rate": 8.748005818710878e-06, "loss": 0.3051, "step": 3760 }, { "epoch": 1.2690038766222822, "grad_norm": 1.3727111203165512, "learning_rate": 8.744390777542744e-06, "loss": 0.3242, "step": 3765 }, { "epoch": 1.2706893645710433, "grad_norm": 1.8127616056647737, "learning_rate": 8.740771274065482e-06, "loss": 0.3297, "step": 3770 }, { "epoch": 1.2723748525198044, "grad_norm": 1.3861146040065275, "learning_rate": 8.737147312592573e-06, "loss": 0.2945, "step": 3775 }, { "epoch": 1.2740603404685658, "grad_norm": 1.2227238010828299, "learning_rate": 8.733518897442805e-06, "loss": 0.3144, "step": 3780 }, { "epoch": 1.2757458284173269, "grad_norm": 1.2011085550371767, "learning_rate": 8.729886032940275e-06, "loss": 0.2985, "step": 3785 }, { "epoch": 1.277431316366088, "grad_norm": 2.075825816490112, "learning_rate": 8.726248723414383e-06, "loss": 0.31, "step": 3790 }, { "epoch": 1.279116804314849, "grad_norm": 1.3373898193491482, "learning_rate": 8.722606973199826e-06, "loss": 0.3464, "step": 3795 }, { "epoch": 1.2808022922636102, "grad_norm": 1.5819884164270863, "learning_rate": 8.718960786636594e-06, "loss": 0.2828, "step": 3800 }, { "epoch": 1.2824877802123715, "grad_norm": 4.310723675769661, "learning_rate": 8.715310168069961e-06, "loss": 0.3176, "step": 3805 }, { "epoch": 1.2841732681611326, "grad_norm": 17.89429726201977, "learning_rate": 8.711655121850489e-06, "loss": 0.2898, "step": 3810 }, { "epoch": 1.2858587561098938, "grad_norm": 1.5564491525763398, "learning_rate": 8.707995652334006e-06, "loss": 0.3089, "step": 3815 }, { "epoch": 1.287544244058655, "grad_norm": 1.8115396161375414, "learning_rate": 8.704331763881624e-06, "loss": 0.3214, "step": 3820 }, { "epoch": 1.2892297320074162, "grad_norm": 1.500306956354656, "learning_rate": 8.70066346085971e-06, "loss": 0.3356, "step": 3825 }, { "epoch": 1.2909152199561773, "grad_norm": 1.2751789454492453, "learning_rate": 8.696990747639902e-06, "loss": 0.3202, "step": 3830 }, { "epoch": 1.2926007079049384, "grad_norm": 1.4267113594111158, "learning_rate": 8.693313628599082e-06, "loss": 0.3358, "step": 3835 }, { "epoch": 1.2942861958536995, "grad_norm": 1.5195776141389108, "learning_rate": 8.689632108119395e-06, "loss": 0.3241, "step": 3840 }, { "epoch": 1.2959716838024609, "grad_norm": 5.432837666657171, "learning_rate": 8.685946190588224e-06, "loss": 0.3043, "step": 3845 }, { "epoch": 1.297657171751222, "grad_norm": 1.579641810264342, "learning_rate": 8.682255880398193e-06, "loss": 0.3043, "step": 3850 }, { "epoch": 1.299342659699983, "grad_norm": 1.4778951459685434, "learning_rate": 8.678561181947163e-06, "loss": 0.2978, "step": 3855 }, { "epoch": 1.3010281476487444, "grad_norm": 1.1535195636678885, "learning_rate": 8.674862099638222e-06, "loss": 0.3269, "step": 3860 }, { "epoch": 1.3027136355975055, "grad_norm": 1.3848379551169592, "learning_rate": 8.671158637879683e-06, "loss": 0.3638, "step": 3865 }, { "epoch": 1.3043991235462666, "grad_norm": 1.0023946409738271, "learning_rate": 8.667450801085082e-06, "loss": 0.327, "step": 3870 }, { "epoch": 1.3060846114950277, "grad_norm": 1.5710337730488202, "learning_rate": 8.66373859367316e-06, "loss": 0.3315, "step": 3875 }, { "epoch": 1.3077700994437889, "grad_norm": 1.1486867599777093, "learning_rate": 8.660022020067878e-06, "loss": 0.311, "step": 3880 }, { "epoch": 1.3094555873925502, "grad_norm": 1.223888384412348, "learning_rate": 8.65630108469839e-06, "loss": 0.3192, "step": 3885 }, { "epoch": 1.3111410753413113, "grad_norm": 1.1228246106069757, "learning_rate": 8.652575791999056e-06, "loss": 0.3141, "step": 3890 }, { "epoch": 1.3128265632900724, "grad_norm": 1.2863994611203113, "learning_rate": 8.648846146409421e-06, "loss": 0.3361, "step": 3895 }, { "epoch": 1.3145120512388337, "grad_norm": 1.5357745404192742, "learning_rate": 8.645112152374226e-06, "loss": 0.345, "step": 3900 }, { "epoch": 1.3161975391875949, "grad_norm": 3.4188648748199384, "learning_rate": 8.64137381434339e-06, "loss": 0.3008, "step": 3905 }, { "epoch": 1.317883027136356, "grad_norm": 1.2124253146786814, "learning_rate": 8.637631136772006e-06, "loss": 0.3287, "step": 3910 }, { "epoch": 1.319568515085117, "grad_norm": 1.2409397938317357, "learning_rate": 8.633884124120342e-06, "loss": 0.3032, "step": 3915 }, { "epoch": 1.3212540030338782, "grad_norm": 1.2678938727280409, "learning_rate": 8.630132780853834e-06, "loss": 0.3302, "step": 3920 }, { "epoch": 1.3229394909826395, "grad_norm": 1.1040704379899782, "learning_rate": 8.626377111443074e-06, "loss": 0.3057, "step": 3925 }, { "epoch": 1.3246249789314006, "grad_norm": 1.254471107174073, "learning_rate": 8.622617120363815e-06, "loss": 0.3315, "step": 3930 }, { "epoch": 1.3263104668801617, "grad_norm": 1.1285769481318682, "learning_rate": 8.618852812096957e-06, "loss": 0.2866, "step": 3935 }, { "epoch": 1.327995954828923, "grad_norm": 1.4100019660199357, "learning_rate": 8.61508419112854e-06, "loss": 0.316, "step": 3940 }, { "epoch": 1.3296814427776842, "grad_norm": 2.244929713448234, "learning_rate": 8.611311261949757e-06, "loss": 0.2941, "step": 3945 }, { "epoch": 1.3313669307264453, "grad_norm": 2.7078187476516047, "learning_rate": 8.607534029056923e-06, "loss": 0.3258, "step": 3950 }, { "epoch": 1.3330524186752064, "grad_norm": 2.702068883155361, "learning_rate": 8.603752496951487e-06, "loss": 0.3296, "step": 3955 }, { "epoch": 1.3347379066239675, "grad_norm": 1.2889997978371053, "learning_rate": 8.599966670140019e-06, "loss": 0.2927, "step": 3960 }, { "epoch": 1.3364233945727289, "grad_norm": 2.059715520526292, "learning_rate": 8.59617655313421e-06, "loss": 0.3187, "step": 3965 }, { "epoch": 1.33810888252149, "grad_norm": 1.3868742437237402, "learning_rate": 8.592382150450865e-06, "loss": 0.2991, "step": 3970 }, { "epoch": 1.339794370470251, "grad_norm": 1.7144049714714598, "learning_rate": 8.588583466611888e-06, "loss": 0.349, "step": 3975 }, { "epoch": 1.3414798584190124, "grad_norm": 1.271758956176897, "learning_rate": 8.584780506144299e-06, "loss": 0.315, "step": 3980 }, { "epoch": 1.3431653463677735, "grad_norm": 1.2530317033244227, "learning_rate": 8.580973273580198e-06, "loss": 0.3335, "step": 3985 }, { "epoch": 1.3448508343165346, "grad_norm": 17.21746848664345, "learning_rate": 8.57716177345679e-06, "loss": 0.304, "step": 3990 }, { "epoch": 1.3465363222652957, "grad_norm": 1.3140104913474075, "learning_rate": 8.573346010316359e-06, "loss": 0.3119, "step": 3995 }, { "epoch": 1.3482218102140568, "grad_norm": 1.180402704495193, "learning_rate": 8.56952598870627e-06, "loss": 0.3206, "step": 4000 }, { "epoch": 1.3499072981628182, "grad_norm": 1.2447229094252958, "learning_rate": 8.565701713178966e-06, "loss": 0.327, "step": 4005 }, { "epoch": 1.3515927861115793, "grad_norm": 1.4874840559252875, "learning_rate": 8.561873188291956e-06, "loss": 0.3348, "step": 4010 }, { "epoch": 1.3532782740603404, "grad_norm": 1.45101813485042, "learning_rate": 8.558040418607814e-06, "loss": 0.3282, "step": 4015 }, { "epoch": 1.3549637620091017, "grad_norm": 1.3054109425443021, "learning_rate": 8.554203408694173e-06, "loss": 0.3232, "step": 4020 }, { "epoch": 1.3566492499578628, "grad_norm": 1.1488598328474027, "learning_rate": 8.55036216312372e-06, "loss": 0.306, "step": 4025 }, { "epoch": 1.358334737906624, "grad_norm": 1.2648932973941707, "learning_rate": 8.546516686474189e-06, "loss": 0.3141, "step": 4030 }, { "epoch": 1.360020225855385, "grad_norm": 1.5703652607250131, "learning_rate": 8.542666983328355e-06, "loss": 0.3296, "step": 4035 }, { "epoch": 1.3617057138041462, "grad_norm": 1.7291765115916724, "learning_rate": 8.538813058274033e-06, "loss": 0.2943, "step": 4040 }, { "epoch": 1.3633912017529075, "grad_norm": 1.3473249229094113, "learning_rate": 8.534954915904068e-06, "loss": 0.2942, "step": 4045 }, { "epoch": 1.3650766897016686, "grad_norm": 1.39226407986307, "learning_rate": 8.53109256081633e-06, "loss": 0.3235, "step": 4050 }, { "epoch": 1.3667621776504297, "grad_norm": 1.3600299455102163, "learning_rate": 8.527225997613708e-06, "loss": 0.3304, "step": 4055 }, { "epoch": 1.368447665599191, "grad_norm": 1.5678211829841187, "learning_rate": 8.52335523090411e-06, "loss": 0.2921, "step": 4060 }, { "epoch": 1.3701331535479522, "grad_norm": 1.1427387474695525, "learning_rate": 8.519480265300449e-06, "loss": 0.3099, "step": 4065 }, { "epoch": 1.3718186414967133, "grad_norm": 1.1127138274455053, "learning_rate": 8.515601105420646e-06, "loss": 0.278, "step": 4070 }, { "epoch": 1.3735041294454744, "grad_norm": 2.4216166473917045, "learning_rate": 8.51171775588762e-06, "loss": 0.3076, "step": 4075 }, { "epoch": 1.3751896173942355, "grad_norm": 1.1495301946401593, "learning_rate": 8.50783022132928e-06, "loss": 0.3238, "step": 4080 }, { "epoch": 1.3768751053429968, "grad_norm": 1.2296568358077848, "learning_rate": 8.503938506378524e-06, "loss": 0.3038, "step": 4085 }, { "epoch": 1.378560593291758, "grad_norm": 1.2903799368998459, "learning_rate": 8.500042615673231e-06, "loss": 0.3223, "step": 4090 }, { "epoch": 1.380246081240519, "grad_norm": 1.4625234888746514, "learning_rate": 8.496142553856262e-06, "loss": 0.3045, "step": 4095 }, { "epoch": 1.3819315691892804, "grad_norm": 1.2987917187104592, "learning_rate": 8.49223832557544e-06, "loss": 0.333, "step": 4100 }, { "epoch": 1.3836170571380415, "grad_norm": 1.227452803571335, "learning_rate": 8.488329935483557e-06, "loss": 0.348, "step": 4105 }, { "epoch": 1.3853025450868026, "grad_norm": 1.3702973529239448, "learning_rate": 8.484417388238366e-06, "loss": 0.3035, "step": 4110 }, { "epoch": 1.3869880330355637, "grad_norm": 1.479671680120301, "learning_rate": 8.480500688502577e-06, "loss": 0.2748, "step": 4115 }, { "epoch": 1.3886735209843248, "grad_norm": 1.3561745110078562, "learning_rate": 8.476579840943841e-06, "loss": 0.3112, "step": 4120 }, { "epoch": 1.3903590089330862, "grad_norm": 1.3105520780187123, "learning_rate": 8.472654850234759e-06, "loss": 0.2739, "step": 4125 }, { "epoch": 1.3920444968818473, "grad_norm": 1.2673327219983164, "learning_rate": 8.468725721052865e-06, "loss": 0.2959, "step": 4130 }, { "epoch": 1.3937299848306084, "grad_norm": 2.3825232686318008, "learning_rate": 8.46479245808063e-06, "loss": 0.2878, "step": 4135 }, { "epoch": 1.3954154727793697, "grad_norm": 1.486716855984244, "learning_rate": 8.46085506600544e-06, "loss": 0.323, "step": 4140 }, { "epoch": 1.3971009607281308, "grad_norm": 1.4761429312357983, "learning_rate": 8.456913549519619e-06, "loss": 0.2893, "step": 4145 }, { "epoch": 1.398786448676892, "grad_norm": 1.3937230582771776, "learning_rate": 8.452967913320392e-06, "loss": 0.3356, "step": 4150 }, { "epoch": 1.400471936625653, "grad_norm": 1.3727669058972003, "learning_rate": 8.449018162109901e-06, "loss": 0.2969, "step": 4155 }, { "epoch": 1.4021574245744142, "grad_norm": 1.1645151177750952, "learning_rate": 8.44506430059519e-06, "loss": 0.3483, "step": 4160 }, { "epoch": 1.4038429125231755, "grad_norm": 1.1682677364890428, "learning_rate": 8.441106333488197e-06, "loss": 0.3117, "step": 4165 }, { "epoch": 1.4055284004719366, "grad_norm": 1.3746259110400163, "learning_rate": 8.437144265505762e-06, "loss": 0.3046, "step": 4170 }, { "epoch": 1.4072138884206977, "grad_norm": 1.256888048959631, "learning_rate": 8.433178101369602e-06, "loss": 0.2916, "step": 4175 }, { "epoch": 1.408899376369459, "grad_norm": 1.207649777757582, "learning_rate": 8.429207845806325e-06, "loss": 0.3119, "step": 4180 }, { "epoch": 1.4105848643182202, "grad_norm": 1.3426705903067313, "learning_rate": 8.425233503547408e-06, "loss": 0.3284, "step": 4185 }, { "epoch": 1.4122703522669813, "grad_norm": 3.566432978592899, "learning_rate": 8.4212550793292e-06, "loss": 0.2875, "step": 4190 }, { "epoch": 1.4139558402157424, "grad_norm": 8.895221690453752, "learning_rate": 8.417272577892916e-06, "loss": 0.304, "step": 4195 }, { "epoch": 1.4156413281645035, "grad_norm": 1.4874260058273723, "learning_rate": 8.41328600398463e-06, "loss": 0.3117, "step": 4200 }, { "epoch": 1.4173268161132648, "grad_norm": 1.2216137451945772, "learning_rate": 8.409295362355268e-06, "loss": 0.2933, "step": 4205 }, { "epoch": 1.419012304062026, "grad_norm": 1.0927491924958632, "learning_rate": 8.405300657760605e-06, "loss": 0.2808, "step": 4210 }, { "epoch": 1.420697792010787, "grad_norm": 1.1514111797365336, "learning_rate": 8.401301894961253e-06, "loss": 0.295, "step": 4215 }, { "epoch": 1.4223832799595484, "grad_norm": 1.3419551267319916, "learning_rate": 8.397299078722668e-06, "loss": 0.3039, "step": 4220 }, { "epoch": 1.4240687679083095, "grad_norm": 1.1290326111150712, "learning_rate": 8.393292213815132e-06, "loss": 0.3085, "step": 4225 }, { "epoch": 1.4257542558570706, "grad_norm": 1.5750789112190187, "learning_rate": 8.389281305013755e-06, "loss": 0.2811, "step": 4230 }, { "epoch": 1.4274397438058317, "grad_norm": 1.1490987631446123, "learning_rate": 8.38526635709846e-06, "loss": 0.3183, "step": 4235 }, { "epoch": 1.4291252317545928, "grad_norm": 1.1535140415582725, "learning_rate": 8.38124737485399e-06, "loss": 0.3028, "step": 4240 }, { "epoch": 1.4308107197033542, "grad_norm": 1.5422149825340228, "learning_rate": 8.377224363069894e-06, "loss": 0.337, "step": 4245 }, { "epoch": 1.4324962076521153, "grad_norm": 1.0262056181844454, "learning_rate": 8.37319732654052e-06, "loss": 0.332, "step": 4250 }, { "epoch": 1.4341816956008764, "grad_norm": 1.3775005325975165, "learning_rate": 8.369166270065017e-06, "loss": 0.3243, "step": 4255 }, { "epoch": 1.4358671835496377, "grad_norm": 1.173509327540304, "learning_rate": 8.365131198447323e-06, "loss": 0.3141, "step": 4260 }, { "epoch": 1.4375526714983988, "grad_norm": 1.1596006980464375, "learning_rate": 8.361092116496161e-06, "loss": 0.3014, "step": 4265 }, { "epoch": 1.43923815944716, "grad_norm": 1.32764415589504, "learning_rate": 8.357049029025031e-06, "loss": 0.3112, "step": 4270 }, { "epoch": 1.440923647395921, "grad_norm": 1.5035233100088548, "learning_rate": 8.35300194085221e-06, "loss": 0.2991, "step": 4275 }, { "epoch": 1.4426091353446822, "grad_norm": 1.1478608842920697, "learning_rate": 8.348950856800742e-06, "loss": 0.2857, "step": 4280 }, { "epoch": 1.4442946232934435, "grad_norm": 1.29338444865639, "learning_rate": 8.34489578169843e-06, "loss": 0.3481, "step": 4285 }, { "epoch": 1.4459801112422046, "grad_norm": 1.0453150271805118, "learning_rate": 8.340836720377835e-06, "loss": 0.3193, "step": 4290 }, { "epoch": 1.4476655991909657, "grad_norm": 0.9896696042682885, "learning_rate": 8.336773677676272e-06, "loss": 0.3134, "step": 4295 }, { "epoch": 1.449351087139727, "grad_norm": 2.5935236624647873, "learning_rate": 8.332706658435797e-06, "loss": 0.2911, "step": 4300 }, { "epoch": 1.4510365750884882, "grad_norm": 1.1536095372386117, "learning_rate": 8.328635667503202e-06, "loss": 0.3095, "step": 4305 }, { "epoch": 1.4527220630372493, "grad_norm": 1.1073368534973436, "learning_rate": 8.32456070973002e-06, "loss": 0.3182, "step": 4310 }, { "epoch": 1.4544075509860104, "grad_norm": 1.462814949483858, "learning_rate": 8.320481789972507e-06, "loss": 0.2928, "step": 4315 }, { "epoch": 1.4560930389347715, "grad_norm": 3.6322373946041457, "learning_rate": 8.316398913091639e-06, "loss": 0.3179, "step": 4320 }, { "epoch": 1.4577785268835328, "grad_norm": 1.5194226649440357, "learning_rate": 8.312312083953111e-06, "loss": 0.3192, "step": 4325 }, { "epoch": 1.459464014832294, "grad_norm": 1.3442608838655814, "learning_rate": 8.308221307427327e-06, "loss": 0.3272, "step": 4330 }, { "epoch": 1.461149502781055, "grad_norm": 1.328468345788671, "learning_rate": 8.304126588389394e-06, "loss": 0.2645, "step": 4335 }, { "epoch": 1.4628349907298164, "grad_norm": 1.5432668150353683, "learning_rate": 8.300027931719119e-06, "loss": 0.2897, "step": 4340 }, { "epoch": 1.4645204786785775, "grad_norm": 1.2037066930144322, "learning_rate": 8.295925342301e-06, "loss": 0.306, "step": 4345 }, { "epoch": 1.4662059666273386, "grad_norm": 1.2263982324594778, "learning_rate": 8.291818825024224e-06, "loss": 0.2913, "step": 4350 }, { "epoch": 1.4678914545760997, "grad_norm": 1.8086134708791748, "learning_rate": 8.287708384782659e-06, "loss": 0.2824, "step": 4355 }, { "epoch": 1.4695769425248608, "grad_norm": 1.2635125137788807, "learning_rate": 8.283594026474841e-06, "loss": 0.3186, "step": 4360 }, { "epoch": 1.4712624304736222, "grad_norm": 2.0545826803258076, "learning_rate": 8.279475755003989e-06, "loss": 0.3224, "step": 4365 }, { "epoch": 1.4729479184223833, "grad_norm": 1.1489327919842214, "learning_rate": 8.275353575277973e-06, "loss": 0.311, "step": 4370 }, { "epoch": 1.4746334063711444, "grad_norm": 1.017024562259985, "learning_rate": 8.271227492209328e-06, "loss": 0.3084, "step": 4375 }, { "epoch": 1.4763188943199057, "grad_norm": 1.2031665589165061, "learning_rate": 8.267097510715233e-06, "loss": 0.3313, "step": 4380 }, { "epoch": 1.4780043822686668, "grad_norm": 2.673544314333556, "learning_rate": 8.262963635717523e-06, "loss": 0.2996, "step": 4385 }, { "epoch": 1.479689870217428, "grad_norm": 1.2207254874472724, "learning_rate": 8.258825872142664e-06, "loss": 0.3184, "step": 4390 }, { "epoch": 1.481375358166189, "grad_norm": 1.4001335668544308, "learning_rate": 8.254684224921764e-06, "loss": 0.3066, "step": 4395 }, { "epoch": 1.4830608461149501, "grad_norm": 1.012842174095787, "learning_rate": 8.25053869899055e-06, "loss": 0.3015, "step": 4400 }, { "epoch": 1.4847463340637115, "grad_norm": 1.3371573242587744, "learning_rate": 8.246389299289383e-06, "loss": 0.3046, "step": 4405 }, { "epoch": 1.4864318220124726, "grad_norm": 1.2286355952808135, "learning_rate": 8.24223603076323e-06, "loss": 0.3395, "step": 4410 }, { "epoch": 1.4881173099612337, "grad_norm": 1.246783816902124, "learning_rate": 8.23807889836167e-06, "loss": 0.2952, "step": 4415 }, { "epoch": 1.489802797909995, "grad_norm": 1.1677448910186377, "learning_rate": 8.233917907038895e-06, "loss": 0.3079, "step": 4420 }, { "epoch": 1.4914882858587561, "grad_norm": 1.272006781957519, "learning_rate": 8.229753061753688e-06, "loss": 0.3041, "step": 4425 }, { "epoch": 1.4931737738075173, "grad_norm": 1.1268454341228855, "learning_rate": 8.225584367469426e-06, "loss": 0.3101, "step": 4430 }, { "epoch": 1.4948592617562784, "grad_norm": 1.3493446211359892, "learning_rate": 8.221411829154076e-06, "loss": 0.2721, "step": 4435 }, { "epoch": 1.4965447497050395, "grad_norm": 1.1799280837923247, "learning_rate": 8.217235451780183e-06, "loss": 0.2994, "step": 4440 }, { "epoch": 1.4982302376538008, "grad_norm": 1.4177878540916098, "learning_rate": 8.213055240324868e-06, "loss": 0.3186, "step": 4445 }, { "epoch": 1.499915725602562, "grad_norm": 1.3044048596320261, "learning_rate": 8.208871199769823e-06, "loss": 0.3022, "step": 4450 }, { "epoch": 1.5016012135513233, "grad_norm": 1.2765856761072953, "learning_rate": 8.204683335101297e-06, "loss": 0.3144, "step": 4455 }, { "epoch": 1.5032867015000844, "grad_norm": 1.1136046489474536, "learning_rate": 8.200491651310107e-06, "loss": 0.3025, "step": 4460 }, { "epoch": 1.5049721894488455, "grad_norm": 1.2097049173994694, "learning_rate": 8.196296153391614e-06, "loss": 0.3324, "step": 4465 }, { "epoch": 1.5066576773976066, "grad_norm": 1.0633724872441128, "learning_rate": 8.192096846345722e-06, "loss": 0.2942, "step": 4470 }, { "epoch": 1.5083431653463677, "grad_norm": 1.0952722866375602, "learning_rate": 8.187893735176884e-06, "loss": 0.2871, "step": 4475 }, { "epoch": 1.5100286532951288, "grad_norm": 1.4591350217401189, "learning_rate": 8.183686824894075e-06, "loss": 0.2915, "step": 4480 }, { "epoch": 1.5117141412438901, "grad_norm": 1.2487660488320904, "learning_rate": 8.179476120510807e-06, "loss": 0.2961, "step": 4485 }, { "epoch": 1.5133996291926513, "grad_norm": 1.2986555984229826, "learning_rate": 8.17526162704511e-06, "loss": 0.3048, "step": 4490 }, { "epoch": 1.5150851171414126, "grad_norm": 1.197505715598598, "learning_rate": 8.171043349519527e-06, "loss": 0.3099, "step": 4495 }, { "epoch": 1.5167706050901737, "grad_norm": 1.4760600880092891, "learning_rate": 8.166821292961114e-06, "loss": 0.2877, "step": 4500 }, { "epoch": 1.5184560930389348, "grad_norm": 1.0946723400424974, "learning_rate": 8.16259546240143e-06, "loss": 0.2639, "step": 4505 }, { "epoch": 1.520141580987696, "grad_norm": 1.1784722534823753, "learning_rate": 8.15836586287653e-06, "loss": 0.2836, "step": 4510 }, { "epoch": 1.521827068936457, "grad_norm": 1.2002264645884866, "learning_rate": 8.154132499426963e-06, "loss": 0.2706, "step": 4515 }, { "epoch": 1.5235125568852181, "grad_norm": 1.331889955530758, "learning_rate": 8.149895377097763e-06, "loss": 0.3046, "step": 4520 }, { "epoch": 1.5251980448339795, "grad_norm": 1.0980910173352272, "learning_rate": 8.14565450093844e-06, "loss": 0.277, "step": 4525 }, { "epoch": 1.5268835327827406, "grad_norm": 1.192117036032157, "learning_rate": 8.141409876002986e-06, "loss": 0.3054, "step": 4530 }, { "epoch": 1.528569020731502, "grad_norm": 1.1720833863713713, "learning_rate": 8.13716150734985e-06, "loss": 0.3341, "step": 4535 }, { "epoch": 1.530254508680263, "grad_norm": 1.341175803099967, "learning_rate": 8.132909400041946e-06, "loss": 0.3023, "step": 4540 }, { "epoch": 1.5319399966290241, "grad_norm": 1.2797385921810815, "learning_rate": 8.12865355914665e-06, "loss": 0.3192, "step": 4545 }, { "epoch": 1.5336254845777852, "grad_norm": 1.4702793982358064, "learning_rate": 8.124393989735782e-06, "loss": 0.2833, "step": 4550 }, { "epoch": 1.5353109725265464, "grad_norm": 1.4575853397613314, "learning_rate": 8.120130696885603e-06, "loss": 0.2759, "step": 4555 }, { "epoch": 1.5369964604753075, "grad_norm": 1.2233305478539482, "learning_rate": 8.115863685676815e-06, "loss": 0.2948, "step": 4560 }, { "epoch": 1.5386819484240688, "grad_norm": 2.161647783994973, "learning_rate": 8.11159296119455e-06, "loss": 0.2764, "step": 4565 }, { "epoch": 1.54036743637283, "grad_norm": 1.2056930231950673, "learning_rate": 8.10731852852837e-06, "loss": 0.2867, "step": 4570 }, { "epoch": 1.5420529243215912, "grad_norm": 1.4206227206884128, "learning_rate": 8.103040392772245e-06, "loss": 0.315, "step": 4575 }, { "epoch": 1.5437384122703524, "grad_norm": 6.329456167588222, "learning_rate": 8.098758559024569e-06, "loss": 0.2887, "step": 4580 }, { "epoch": 1.5454239002191135, "grad_norm": 1.1488924467484576, "learning_rate": 8.094473032388137e-06, "loss": 0.2988, "step": 4585 }, { "epoch": 1.5471093881678746, "grad_norm": 1.230612016638428, "learning_rate": 8.09018381797015e-06, "loss": 0.3103, "step": 4590 }, { "epoch": 1.5487948761166357, "grad_norm": 1.12166613113956, "learning_rate": 8.0858909208822e-06, "loss": 0.2963, "step": 4595 }, { "epoch": 1.5504803640653968, "grad_norm": 1.848923325989595, "learning_rate": 8.081594346240266e-06, "loss": 0.3398, "step": 4600 }, { "epoch": 1.5521658520141581, "grad_norm": 1.47462317560063, "learning_rate": 8.077294099164714e-06, "loss": 0.295, "step": 4605 }, { "epoch": 1.5538513399629192, "grad_norm": 1.3011799325240747, "learning_rate": 8.072990184780281e-06, "loss": 0.2865, "step": 4610 }, { "epoch": 1.5555368279116806, "grad_norm": 1.7138001368822846, "learning_rate": 8.068682608216086e-06, "loss": 0.2976, "step": 4615 }, { "epoch": 1.5572223158604417, "grad_norm": 1.3081338244407001, "learning_rate": 8.064371374605595e-06, "loss": 0.2932, "step": 4620 }, { "epoch": 1.5589078038092028, "grad_norm": 1.3602376615238971, "learning_rate": 8.06005648908665e-06, "loss": 0.3333, "step": 4625 }, { "epoch": 1.560593291757964, "grad_norm": 1.2035691797270929, "learning_rate": 8.05573795680143e-06, "loss": 0.3063, "step": 4630 }, { "epoch": 1.562278779706725, "grad_norm": 1.106948889784737, "learning_rate": 8.051415782896473e-06, "loss": 0.3076, "step": 4635 }, { "epoch": 1.5639642676554861, "grad_norm": 1.639823888733204, "learning_rate": 8.047089972522646e-06, "loss": 0.3059, "step": 4640 }, { "epoch": 1.5656497556042475, "grad_norm": 1.118879469886603, "learning_rate": 8.042760530835158e-06, "loss": 0.3031, "step": 4645 }, { "epoch": 1.5673352435530086, "grad_norm": 1.802512942743598, "learning_rate": 8.038427462993536e-06, "loss": 0.283, "step": 4650 }, { "epoch": 1.56902073150177, "grad_norm": 1.4894578458392649, "learning_rate": 8.03409077416164e-06, "loss": 0.3001, "step": 4655 }, { "epoch": 1.570706219450531, "grad_norm": 1.257693654239823, "learning_rate": 8.029750469507637e-06, "loss": 0.2925, "step": 4660 }, { "epoch": 1.5723917073992921, "grad_norm": 1.6453473842466895, "learning_rate": 8.025406554204007e-06, "loss": 0.3115, "step": 4665 }, { "epoch": 1.5740771953480532, "grad_norm": 1.2605455880797907, "learning_rate": 8.02105903342753e-06, "loss": 0.3098, "step": 4670 }, { "epoch": 1.5757626832968143, "grad_norm": 1.1867392634727372, "learning_rate": 8.016707912359284e-06, "loss": 0.3432, "step": 4675 }, { "epoch": 1.5774481712455755, "grad_norm": 1.2784850432033157, "learning_rate": 8.01235319618464e-06, "loss": 0.2897, "step": 4680 }, { "epoch": 1.5791336591943368, "grad_norm": 1.7918357849402966, "learning_rate": 8.007994890093247e-06, "loss": 0.2791, "step": 4685 }, { "epoch": 1.580819147143098, "grad_norm": 1.199205983179323, "learning_rate": 8.00363299927904e-06, "loss": 0.2937, "step": 4690 }, { "epoch": 1.5825046350918592, "grad_norm": 1.2766814868215317, "learning_rate": 7.999267528940225e-06, "loss": 0.3145, "step": 4695 }, { "epoch": 1.5841901230406203, "grad_norm": 1.233786588574404, "learning_rate": 7.994898484279265e-06, "loss": 0.2929, "step": 4700 }, { "epoch": 1.5858756109893815, "grad_norm": 2.0911993189513085, "learning_rate": 7.990525870502893e-06, "loss": 0.3028, "step": 4705 }, { "epoch": 1.5875610989381426, "grad_norm": 1.1632752318696185, "learning_rate": 7.986149692822089e-06, "loss": 0.314, "step": 4710 }, { "epoch": 1.5892465868869037, "grad_norm": 1.2976478244391174, "learning_rate": 7.981769956452085e-06, "loss": 0.3112, "step": 4715 }, { "epoch": 1.5909320748356648, "grad_norm": 1.3494201754916226, "learning_rate": 7.97738666661235e-06, "loss": 0.2972, "step": 4720 }, { "epoch": 1.5926175627844261, "grad_norm": 1.307208309529107, "learning_rate": 7.97299982852659e-06, "loss": 0.2978, "step": 4725 }, { "epoch": 1.5943030507331872, "grad_norm": 1.3093524249606938, "learning_rate": 7.96860944742274e-06, "loss": 0.2823, "step": 4730 }, { "epoch": 1.5959885386819486, "grad_norm": 1.191591564059639, "learning_rate": 7.964215528532955e-06, "loss": 0.2814, "step": 4735 }, { "epoch": 1.5976740266307097, "grad_norm": 1.2549526964375906, "learning_rate": 7.959818077093605e-06, "loss": 0.3136, "step": 4740 }, { "epoch": 1.5993595145794708, "grad_norm": 1.321086131577103, "learning_rate": 7.955417098345277e-06, "loss": 0.2951, "step": 4745 }, { "epoch": 1.601045002528232, "grad_norm": 1.2550529012391063, "learning_rate": 7.951012597532755e-06, "loss": 0.3145, "step": 4750 }, { "epoch": 1.602730490476993, "grad_norm": 1.1809960009492033, "learning_rate": 7.94660457990502e-06, "loss": 0.2839, "step": 4755 }, { "epoch": 1.6044159784257541, "grad_norm": 1.106482264060698, "learning_rate": 7.942193050715248e-06, "loss": 0.2976, "step": 4760 }, { "epoch": 1.6061014663745155, "grad_norm": 1.072157217080442, "learning_rate": 7.937778015220798e-06, "loss": 0.2987, "step": 4765 }, { "epoch": 1.6077869543232766, "grad_norm": 1.1474457940023302, "learning_rate": 7.93335947868321e-06, "loss": 0.2596, "step": 4770 }, { "epoch": 1.609472442272038, "grad_norm": 2.0507615381539632, "learning_rate": 7.92893744636819e-06, "loss": 0.3031, "step": 4775 }, { "epoch": 1.611157930220799, "grad_norm": 1.3830108942662933, "learning_rate": 7.924511923545615e-06, "loss": 0.3239, "step": 4780 }, { "epoch": 1.6128434181695601, "grad_norm": 1.4290445792176358, "learning_rate": 7.920082915489521e-06, "loss": 0.293, "step": 4785 }, { "epoch": 1.6145289061183212, "grad_norm": 3.1474688819918732, "learning_rate": 7.9156504274781e-06, "loss": 0.3222, "step": 4790 }, { "epoch": 1.6162143940670823, "grad_norm": 1.7662624740688317, "learning_rate": 7.911214464793687e-06, "loss": 0.2554, "step": 4795 }, { "epoch": 1.6178998820158434, "grad_norm": 1.4862287784132144, "learning_rate": 7.906775032722755e-06, "loss": 0.2722, "step": 4800 }, { "epoch": 1.6195853699646048, "grad_norm": 8.026673170688301, "learning_rate": 7.90233213655592e-06, "loss": 0.3004, "step": 4805 }, { "epoch": 1.621270857913366, "grad_norm": 1.2758282965884276, "learning_rate": 7.897885781587924e-06, "loss": 0.2934, "step": 4810 }, { "epoch": 1.6229563458621272, "grad_norm": 1.5636493108431386, "learning_rate": 7.893435973117625e-06, "loss": 0.314, "step": 4815 }, { "epoch": 1.6246418338108883, "grad_norm": 1.4426404779447306, "learning_rate": 7.888982716448001e-06, "loss": 0.2966, "step": 4820 }, { "epoch": 1.6263273217596494, "grad_norm": 2.2562300197195286, "learning_rate": 7.884526016886142e-06, "loss": 0.3111, "step": 4825 }, { "epoch": 1.6280128097084106, "grad_norm": 1.145792464399853, "learning_rate": 7.880065879743236e-06, "loss": 0.2822, "step": 4830 }, { "epoch": 1.6296982976571717, "grad_norm": 1.209364419222678, "learning_rate": 7.875602310334571e-06, "loss": 0.2986, "step": 4835 }, { "epoch": 1.6313837856059328, "grad_norm": 1.158602003168086, "learning_rate": 7.87113531397952e-06, "loss": 0.3255, "step": 4840 }, { "epoch": 1.6330692735546941, "grad_norm": 5.218568637065671, "learning_rate": 7.86666489600155e-06, "loss": 0.3227, "step": 4845 }, { "epoch": 1.6347547615034552, "grad_norm": 0.9976104195555039, "learning_rate": 7.862191061728196e-06, "loss": 0.2865, "step": 4850 }, { "epoch": 1.6364402494522166, "grad_norm": 2.569454658042009, "learning_rate": 7.85771381649107e-06, "loss": 0.2835, "step": 4855 }, { "epoch": 1.6381257374009777, "grad_norm": 1.2081999103176555, "learning_rate": 7.853233165625846e-06, "loss": 0.3128, "step": 4860 }, { "epoch": 1.6398112253497388, "grad_norm": 1.1248594021068017, "learning_rate": 7.848749114472258e-06, "loss": 0.3151, "step": 4865 }, { "epoch": 1.6414967132984999, "grad_norm": 2.4270155350719578, "learning_rate": 7.84426166837409e-06, "loss": 0.3076, "step": 4870 }, { "epoch": 1.643182201247261, "grad_norm": 1.2266730638699697, "learning_rate": 7.839770832679176e-06, "loss": 0.2993, "step": 4875 }, { "epoch": 1.644867689196022, "grad_norm": 1.1803866832834962, "learning_rate": 7.835276612739386e-06, "loss": 0.2779, "step": 4880 }, { "epoch": 1.6465531771447834, "grad_norm": 1.4724741668831578, "learning_rate": 7.830779013910626e-06, "loss": 0.3018, "step": 4885 }, { "epoch": 1.6482386650935446, "grad_norm": 1.347302301416293, "learning_rate": 7.826278041552824e-06, "loss": 0.3168, "step": 4890 }, { "epoch": 1.6499241530423059, "grad_norm": 1.1208931792241335, "learning_rate": 7.821773701029934e-06, "loss": 0.3034, "step": 4895 }, { "epoch": 1.651609640991067, "grad_norm": 1.5426746365548927, "learning_rate": 7.81726599770992e-06, "loss": 0.2516, "step": 4900 }, { "epoch": 1.653295128939828, "grad_norm": 1.1142651926486773, "learning_rate": 7.812754936964758e-06, "loss": 0.2831, "step": 4905 }, { "epoch": 1.6549806168885892, "grad_norm": 1.259520493787399, "learning_rate": 7.808240524170414e-06, "loss": 0.2786, "step": 4910 }, { "epoch": 1.6566661048373503, "grad_norm": 1.4754317494045115, "learning_rate": 7.803722764706865e-06, "loss": 0.2958, "step": 4915 }, { "epoch": 1.6583515927861114, "grad_norm": 1.1171716700419523, "learning_rate": 7.799201663958066e-06, "loss": 0.2809, "step": 4920 }, { "epoch": 1.6600370807348728, "grad_norm": 1.2908279148621562, "learning_rate": 7.794677227311954e-06, "loss": 0.2991, "step": 4925 }, { "epoch": 1.6617225686836339, "grad_norm": 1.3433670764531915, "learning_rate": 7.790149460160445e-06, "loss": 0.2926, "step": 4930 }, { "epoch": 1.6634080566323952, "grad_norm": 1.2218216858309807, "learning_rate": 7.785618367899421e-06, "loss": 0.2917, "step": 4935 }, { "epoch": 1.6650935445811563, "grad_norm": 1.2990524842912665, "learning_rate": 7.781083955928732e-06, "loss": 0.2952, "step": 4940 }, { "epoch": 1.6667790325299174, "grad_norm": 1.6785603900156052, "learning_rate": 7.776546229652175e-06, "loss": 0.3159, "step": 4945 }, { "epoch": 1.6684645204786785, "grad_norm": 1.3172034286260808, "learning_rate": 7.772005194477506e-06, "loss": 0.2849, "step": 4950 }, { "epoch": 1.6701500084274397, "grad_norm": 1.7712583785514993, "learning_rate": 7.76746085581642e-06, "loss": 0.2966, "step": 4955 }, { "epoch": 1.6718354963762008, "grad_norm": 1.3680930568241578, "learning_rate": 7.762913219084549e-06, "loss": 0.3021, "step": 4960 }, { "epoch": 1.673520984324962, "grad_norm": 1.8731405239920322, "learning_rate": 7.758362289701456e-06, "loss": 0.2905, "step": 4965 }, { "epoch": 1.6752064722737232, "grad_norm": 1.2605432916849957, "learning_rate": 7.753808073090626e-06, "loss": 0.3222, "step": 4970 }, { "epoch": 1.6768919602224845, "grad_norm": 2.0933592871449385, "learning_rate": 7.749250574679466e-06, "loss": 0.3004, "step": 4975 }, { "epoch": 1.6785774481712457, "grad_norm": 1.2951973558763388, "learning_rate": 7.74468979989929e-06, "loss": 0.3356, "step": 4980 }, { "epoch": 1.6802629361200068, "grad_norm": 1.1121691174536121, "learning_rate": 7.740125754185316e-06, "loss": 0.2761, "step": 4985 }, { "epoch": 1.6819484240687679, "grad_norm": 1.247648286765324, "learning_rate": 7.735558442976665e-06, "loss": 0.3005, "step": 4990 }, { "epoch": 1.683633912017529, "grad_norm": 1.3107306413155002, "learning_rate": 7.730987871716343e-06, "loss": 0.296, "step": 4995 }, { "epoch": 1.68531939996629, "grad_norm": 1.6659054480184692, "learning_rate": 7.72641404585125e-06, "loss": 0.3117, "step": 5000 }, { "epoch": 1.6870048879150514, "grad_norm": 1.1094262335217973, "learning_rate": 7.721836970832154e-06, "loss": 0.2652, "step": 5005 }, { "epoch": 1.6886903758638125, "grad_norm": 1.9930822705353428, "learning_rate": 7.717256652113701e-06, "loss": 0.2807, "step": 5010 }, { "epoch": 1.6903758638125739, "grad_norm": 1.1554646427441158, "learning_rate": 7.712673095154403e-06, "loss": 0.3323, "step": 5015 }, { "epoch": 1.692061351761335, "grad_norm": 1.1397588826811031, "learning_rate": 7.708086305416633e-06, "loss": 0.271, "step": 5020 }, { "epoch": 1.693746839710096, "grad_norm": 1.5885004049421636, "learning_rate": 7.703496288366608e-06, "loss": 0.2656, "step": 5025 }, { "epoch": 1.6954323276588572, "grad_norm": 1.146638763639586, "learning_rate": 7.698903049474402e-06, "loss": 0.2941, "step": 5030 }, { "epoch": 1.6971178156076183, "grad_norm": 1.151903093184447, "learning_rate": 7.69430659421392e-06, "loss": 0.305, "step": 5035 }, { "epoch": 1.6988033035563794, "grad_norm": 1.21754582792958, "learning_rate": 7.6897069280629e-06, "loss": 0.3071, "step": 5040 }, { "epoch": 1.7004887915051408, "grad_norm": 1.6576276575339772, "learning_rate": 7.68510405650292e-06, "loss": 0.3179, "step": 5045 }, { "epoch": 1.7021742794539019, "grad_norm": 1.8043608824075692, "learning_rate": 7.68049798501936e-06, "loss": 0.2804, "step": 5050 }, { "epoch": 1.7038597674026632, "grad_norm": 1.5070951031781703, "learning_rate": 7.675888719101422e-06, "loss": 0.2933, "step": 5055 }, { "epoch": 1.7055452553514243, "grad_norm": 1.436825831743727, "learning_rate": 7.671276264242116e-06, "loss": 0.2736, "step": 5060 }, { "epoch": 1.7072307433001854, "grad_norm": 1.6490603638595753, "learning_rate": 7.666660625938252e-06, "loss": 0.3293, "step": 5065 }, { "epoch": 1.7089162312489465, "grad_norm": 1.555207873204316, "learning_rate": 7.662041809690428e-06, "loss": 0.2872, "step": 5070 }, { "epoch": 1.7106017191977076, "grad_norm": 1.4334715461537986, "learning_rate": 7.657419821003038e-06, "loss": 0.3013, "step": 5075 }, { "epoch": 1.7122872071464688, "grad_norm": 1.4272073998025994, "learning_rate": 7.652794665384249e-06, "loss": 0.3176, "step": 5080 }, { "epoch": 1.71397269509523, "grad_norm": 1.501271647310636, "learning_rate": 7.648166348346009e-06, "loss": 0.2593, "step": 5085 }, { "epoch": 1.7156581830439912, "grad_norm": 1.355921776107784, "learning_rate": 7.643534875404028e-06, "loss": 0.2972, "step": 5090 }, { "epoch": 1.7173436709927525, "grad_norm": 1.255616076107906, "learning_rate": 7.638900252077778e-06, "loss": 0.3307, "step": 5095 }, { "epoch": 1.7190291589415136, "grad_norm": 1.3006104131596286, "learning_rate": 7.634262483890487e-06, "loss": 0.2729, "step": 5100 }, { "epoch": 1.7207146468902748, "grad_norm": 6.283131471658594, "learning_rate": 7.629621576369132e-06, "loss": 0.2965, "step": 5105 }, { "epoch": 1.7224001348390359, "grad_norm": 13.6092368979776, "learning_rate": 7.624977535044429e-06, "loss": 0.2836, "step": 5110 }, { "epoch": 1.724085622787797, "grad_norm": 1.4047936498728297, "learning_rate": 7.620330365450828e-06, "loss": 0.2919, "step": 5115 }, { "epoch": 1.725771110736558, "grad_norm": 1.5546612805155546, "learning_rate": 7.6156800731265076e-06, "loss": 0.2993, "step": 5120 }, { "epoch": 1.7274565986853194, "grad_norm": 3.012391619364554, "learning_rate": 7.6110266636133725e-06, "loss": 0.2901, "step": 5125 }, { "epoch": 1.7291420866340805, "grad_norm": 1.4088814643004863, "learning_rate": 7.606370142457033e-06, "loss": 0.3075, "step": 5130 }, { "epoch": 1.7308275745828419, "grad_norm": 1.265940304384725, "learning_rate": 7.601710515206816e-06, "loss": 0.2977, "step": 5135 }, { "epoch": 1.732513062531603, "grad_norm": 11.535301658136971, "learning_rate": 7.597047787415746e-06, "loss": 0.3029, "step": 5140 }, { "epoch": 1.734198550480364, "grad_norm": 1.5396122749245849, "learning_rate": 7.592381964640545e-06, "loss": 0.2973, "step": 5145 }, { "epoch": 1.7358840384291252, "grad_norm": 4.016808949745366, "learning_rate": 7.587713052441621e-06, "loss": 0.3195, "step": 5150 }, { "epoch": 1.7375695263778863, "grad_norm": 1.530371697029571, "learning_rate": 7.583041056383063e-06, "loss": 0.2877, "step": 5155 }, { "epoch": 1.7392550143266474, "grad_norm": 1.7382367198655937, "learning_rate": 7.578365982032637e-06, "loss": 0.305, "step": 5160 }, { "epoch": 1.7409405022754088, "grad_norm": 2.2514394592699647, "learning_rate": 7.57368783496178e-06, "loss": 0.3182, "step": 5165 }, { "epoch": 1.7426259902241699, "grad_norm": 1.3482588616499345, "learning_rate": 7.569006620745586e-06, "loss": 0.3009, "step": 5170 }, { "epoch": 1.7443114781729312, "grad_norm": 1.42929400463226, "learning_rate": 7.5643223449628066e-06, "loss": 0.2929, "step": 5175 }, { "epoch": 1.7459969661216923, "grad_norm": 1.200068728553629, "learning_rate": 7.559635013195841e-06, "loss": 0.2762, "step": 5180 }, { "epoch": 1.7476824540704534, "grad_norm": 1.086195312675611, "learning_rate": 7.554944631030732e-06, "loss": 0.2714, "step": 5185 }, { "epoch": 1.7493679420192145, "grad_norm": 1.390530215876357, "learning_rate": 7.550251204057156e-06, "loss": 0.2981, "step": 5190 }, { "epoch": 1.7510534299679756, "grad_norm": 1.6887386714329724, "learning_rate": 7.545554737868419e-06, "loss": 0.2663, "step": 5195 }, { "epoch": 1.7527389179167367, "grad_norm": 1.3929222748367942, "learning_rate": 7.5408552380614486e-06, "loss": 0.2699, "step": 5200 }, { "epoch": 1.754424405865498, "grad_norm": 1.1704820389857176, "learning_rate": 7.536152710236787e-06, "loss": 0.2762, "step": 5205 }, { "epoch": 1.7561098938142592, "grad_norm": 1.3168149746230169, "learning_rate": 7.531447159998586e-06, "loss": 0.295, "step": 5210 }, { "epoch": 1.7577953817630205, "grad_norm": 1.1274704294658016, "learning_rate": 7.526738592954599e-06, "loss": 0.2718, "step": 5215 }, { "epoch": 1.7594808697117816, "grad_norm": 1.4865898178748014, "learning_rate": 7.522027014716176e-06, "loss": 0.3192, "step": 5220 }, { "epoch": 1.7611663576605427, "grad_norm": 1.351572443814194, "learning_rate": 7.517312430898252e-06, "loss": 0.2785, "step": 5225 }, { "epoch": 1.7628518456093039, "grad_norm": 1.4318309186592089, "learning_rate": 7.512594847119345e-06, "loss": 0.2647, "step": 5230 }, { "epoch": 1.764537333558065, "grad_norm": 2.6422620160818133, "learning_rate": 7.50787426900155e-06, "loss": 0.3183, "step": 5235 }, { "epoch": 1.766222821506826, "grad_norm": 1.9544232382145845, "learning_rate": 7.503150702170532e-06, "loss": 0.2868, "step": 5240 }, { "epoch": 1.7679083094555874, "grad_norm": 1.3910226812737208, "learning_rate": 7.498424152255512e-06, "loss": 0.2983, "step": 5245 }, { "epoch": 1.7695937974043485, "grad_norm": 1.4395515103799448, "learning_rate": 7.493694624889272e-06, "loss": 0.3066, "step": 5250 }, { "epoch": 1.7712792853531099, "grad_norm": 1.1222566371604252, "learning_rate": 7.488962125708137e-06, "loss": 0.281, "step": 5255 }, { "epoch": 1.772964773301871, "grad_norm": 1.546040473023318, "learning_rate": 7.484226660351979e-06, "loss": 0.2995, "step": 5260 }, { "epoch": 1.774650261250632, "grad_norm": 1.574776195213442, "learning_rate": 7.479488234464198e-06, "loss": 0.3102, "step": 5265 }, { "epoch": 1.7763357491993932, "grad_norm": 1.327888821430515, "learning_rate": 7.47474685369173e-06, "loss": 0.2739, "step": 5270 }, { "epoch": 1.7780212371481543, "grad_norm": 2.4448341717068507, "learning_rate": 7.470002523685027e-06, "loss": 0.2823, "step": 5275 }, { "epoch": 1.7797067250969154, "grad_norm": 1.546366526166282, "learning_rate": 7.465255250098059e-06, "loss": 0.2817, "step": 5280 }, { "epoch": 1.7813922130456767, "grad_norm": 1.072940090664181, "learning_rate": 7.460505038588299e-06, "loss": 0.2881, "step": 5285 }, { "epoch": 1.7830777009944379, "grad_norm": 1.156854830692132, "learning_rate": 7.4557518948167295e-06, "loss": 0.2658, "step": 5290 }, { "epoch": 1.7847631889431992, "grad_norm": 1.265013609236647, "learning_rate": 7.450995824447817e-06, "loss": 0.304, "step": 5295 }, { "epoch": 1.7864486768919603, "grad_norm": 1.2731846527562656, "learning_rate": 7.446236833149527e-06, "loss": 0.3056, "step": 5300 }, { "epoch": 1.7881341648407214, "grad_norm": 1.2328738628513, "learning_rate": 7.4414749265932955e-06, "loss": 0.291, "step": 5305 }, { "epoch": 1.7898196527894825, "grad_norm": 1.2645077337917259, "learning_rate": 7.436710110454039e-06, "loss": 0.2879, "step": 5310 }, { "epoch": 1.7915051407382436, "grad_norm": 11.884705169991397, "learning_rate": 7.431942390410141e-06, "loss": 0.2696, "step": 5315 }, { "epoch": 1.7931906286870047, "grad_norm": 1.4416266618735254, "learning_rate": 7.427171772143442e-06, "loss": 0.3085, "step": 5320 }, { "epoch": 1.794876116635766, "grad_norm": 1.3103910235201572, "learning_rate": 7.4223982613392424e-06, "loss": 0.2962, "step": 5325 }, { "epoch": 1.7965616045845272, "grad_norm": 1.297116465661714, "learning_rate": 7.417621863686283e-06, "loss": 0.2865, "step": 5330 }, { "epoch": 1.7982470925332885, "grad_norm": 1.394746946999585, "learning_rate": 7.412842584876749e-06, "loss": 0.2926, "step": 5335 }, { "epoch": 1.7999325804820496, "grad_norm": 1.0633688131220695, "learning_rate": 7.4080604306062605e-06, "loss": 0.2627, "step": 5340 }, { "epoch": 1.8016180684308107, "grad_norm": 1.4620745451668324, "learning_rate": 7.40327540657386e-06, "loss": 0.2914, "step": 5345 }, { "epoch": 1.8033035563795718, "grad_norm": 1.4085668488993912, "learning_rate": 7.398487518482013e-06, "loss": 0.2965, "step": 5350 }, { "epoch": 1.804989044328333, "grad_norm": 1.0751425446009926, "learning_rate": 7.393696772036598e-06, "loss": 0.2943, "step": 5355 }, { "epoch": 1.806674532277094, "grad_norm": 1.2355772902782536, "learning_rate": 7.388903172946897e-06, "loss": 0.2725, "step": 5360 }, { "epoch": 1.8083600202258554, "grad_norm": 1.46332548505368, "learning_rate": 7.384106726925597e-06, "loss": 0.271, "step": 5365 }, { "epoch": 1.8100455081746165, "grad_norm": 1.3327958403232665, "learning_rate": 7.3793074396887735e-06, "loss": 0.257, "step": 5370 }, { "epoch": 1.8117309961233778, "grad_norm": 1.27159069423012, "learning_rate": 7.374505316955889e-06, "loss": 0.2997, "step": 5375 }, { "epoch": 1.813416484072139, "grad_norm": 1.1136873534765386, "learning_rate": 7.369700364449783e-06, "loss": 0.2813, "step": 5380 }, { "epoch": 1.8151019720209, "grad_norm": 1.058980131231453, "learning_rate": 7.364892587896675e-06, "loss": 0.2851, "step": 5385 }, { "epoch": 1.8167874599696612, "grad_norm": 1.153665868223348, "learning_rate": 7.3600819930261406e-06, "loss": 0.2807, "step": 5390 }, { "epoch": 1.8184729479184223, "grad_norm": 1.094878820896215, "learning_rate": 7.355268585571119e-06, "loss": 0.2491, "step": 5395 }, { "epoch": 1.8201584358671834, "grad_norm": 3.1535980383592435, "learning_rate": 7.3504523712679e-06, "loss": 0.2735, "step": 5400 }, { "epoch": 1.8218439238159447, "grad_norm": 1.1158721579623263, "learning_rate": 7.34563335585612e-06, "loss": 0.2808, "step": 5405 }, { "epoch": 1.8235294117647058, "grad_norm": 1.1810109517856728, "learning_rate": 7.340811545078751e-06, "loss": 0.2849, "step": 5410 }, { "epoch": 1.8252148997134672, "grad_norm": 2.6921207195719474, "learning_rate": 7.3359869446820985e-06, "loss": 0.276, "step": 5415 }, { "epoch": 1.8269003876622283, "grad_norm": 1.3992211796492626, "learning_rate": 7.331159560415791e-06, "loss": 0.2739, "step": 5420 }, { "epoch": 1.8285858756109894, "grad_norm": 1.3720595724897235, "learning_rate": 7.3263293980327765e-06, "loss": 0.2668, "step": 5425 }, { "epoch": 1.8302713635597505, "grad_norm": 1.469896908485536, "learning_rate": 7.321496463289311e-06, "loss": 0.2625, "step": 5430 }, { "epoch": 1.8319568515085116, "grad_norm": 1.8485122111038519, "learning_rate": 7.316660761944957e-06, "loss": 0.291, "step": 5435 }, { "epoch": 1.8336423394572727, "grad_norm": 10.340467741077116, "learning_rate": 7.311822299762573e-06, "loss": 0.2861, "step": 5440 }, { "epoch": 1.835327827406034, "grad_norm": 1.7422431249626944, "learning_rate": 7.306981082508307e-06, "loss": 0.2871, "step": 5445 }, { "epoch": 1.8370133153547952, "grad_norm": 1.1306638443027448, "learning_rate": 7.3021371159515915e-06, "loss": 0.2932, "step": 5450 }, { "epoch": 1.8386988033035565, "grad_norm": 1.1140792597783495, "learning_rate": 7.297290405865136e-06, "loss": 0.2966, "step": 5455 }, { "epoch": 1.8403842912523176, "grad_norm": 1.212632304429544, "learning_rate": 7.292440958024916e-06, "loss": 0.2899, "step": 5460 }, { "epoch": 1.8420697792010787, "grad_norm": 1.3938375232153972, "learning_rate": 7.287588778210174e-06, "loss": 0.2977, "step": 5465 }, { "epoch": 1.8437552671498398, "grad_norm": 1.3339246920805004, "learning_rate": 7.282733872203405e-06, "loss": 0.3031, "step": 5470 }, { "epoch": 1.845440755098601, "grad_norm": 1.6499477818188544, "learning_rate": 7.277876245790357e-06, "loss": 0.2944, "step": 5475 }, { "epoch": 1.847126243047362, "grad_norm": 1.4968414796972271, "learning_rate": 7.273015904760014e-06, "loss": 0.2883, "step": 5480 }, { "epoch": 1.8488117309961234, "grad_norm": 1.6850501788811187, "learning_rate": 7.2681528549046e-06, "loss": 0.2729, "step": 5485 }, { "epoch": 1.8504972189448845, "grad_norm": 1.442047666067804, "learning_rate": 7.263287102019565e-06, "loss": 0.2266, "step": 5490 }, { "epoch": 1.8521827068936458, "grad_norm": 1.5323602110070451, "learning_rate": 7.2584186519035815e-06, "loss": 0.2939, "step": 5495 }, { "epoch": 1.853868194842407, "grad_norm": 1.392918605066643, "learning_rate": 7.253547510358536e-06, "loss": 0.29, "step": 5500 }, { "epoch": 1.855553682791168, "grad_norm": 2.317316796924633, "learning_rate": 7.248673683189522e-06, "loss": 0.3041, "step": 5505 }, { "epoch": 1.8572391707399292, "grad_norm": 1.1660872257371953, "learning_rate": 7.243797176204833e-06, "loss": 0.2885, "step": 5510 }, { "epoch": 1.8589246586886903, "grad_norm": 1.180652848669013, "learning_rate": 7.238917995215957e-06, "loss": 0.2967, "step": 5515 }, { "epoch": 1.8606101466374514, "grad_norm": 1.431117199613125, "learning_rate": 7.234036146037571e-06, "loss": 0.2825, "step": 5520 }, { "epoch": 1.8622956345862127, "grad_norm": 1.608766155824911, "learning_rate": 7.229151634487526e-06, "loss": 0.2869, "step": 5525 }, { "epoch": 1.8639811225349738, "grad_norm": 1.194041242448981, "learning_rate": 7.22426446638685e-06, "loss": 0.2763, "step": 5530 }, { "epoch": 1.8656666104837352, "grad_norm": 1.4822426907606354, "learning_rate": 7.219374647559737e-06, "loss": 0.2702, "step": 5535 }, { "epoch": 1.8673520984324963, "grad_norm": 1.5813532585445986, "learning_rate": 7.21448218383354e-06, "loss": 0.2575, "step": 5540 }, { "epoch": 1.8690375863812574, "grad_norm": 4.162789690078896, "learning_rate": 7.209587081038761e-06, "loss": 0.2831, "step": 5545 }, { "epoch": 1.8707230743300185, "grad_norm": 2.1941907386004, "learning_rate": 7.2046893450090485e-06, "loss": 0.302, "step": 5550 }, { "epoch": 1.8724085622787796, "grad_norm": 1.2844626457888235, "learning_rate": 7.199788981581191e-06, "loss": 0.2751, "step": 5555 }, { "epoch": 1.8740940502275407, "grad_norm": 2.1417066213451643, "learning_rate": 7.194885996595109e-06, "loss": 0.2634, "step": 5560 }, { "epoch": 1.875779538176302, "grad_norm": 1.567906672664597, "learning_rate": 7.189980395893841e-06, "loss": 0.2817, "step": 5565 }, { "epoch": 1.8774650261250632, "grad_norm": 1.4640132987361372, "learning_rate": 7.185072185323548e-06, "loss": 0.2739, "step": 5570 }, { "epoch": 1.8791505140738245, "grad_norm": 1.1831500326551032, "learning_rate": 7.1801613707335015e-06, "loss": 0.2785, "step": 5575 }, { "epoch": 1.8808360020225856, "grad_norm": 1.5229423836919356, "learning_rate": 7.175247957976075e-06, "loss": 0.2763, "step": 5580 }, { "epoch": 1.8825214899713467, "grad_norm": 1.4154336466192219, "learning_rate": 7.170331952906737e-06, "loss": 0.2969, "step": 5585 }, { "epoch": 1.8842069779201078, "grad_norm": 1.2176116045878531, "learning_rate": 7.165413361384046e-06, "loss": 0.2854, "step": 5590 }, { "epoch": 1.885892465868869, "grad_norm": 1.317744924086943, "learning_rate": 7.1604921892696434e-06, "loss": 0.3131, "step": 5595 }, { "epoch": 1.88757795381763, "grad_norm": 1.210784995396025, "learning_rate": 7.155568442428248e-06, "loss": 0.2886, "step": 5600 }, { "epoch": 1.8892634417663914, "grad_norm": 1.0466185632829665, "learning_rate": 7.150642126727642e-06, "loss": 0.3073, "step": 5605 }, { "epoch": 1.8909489297151525, "grad_norm": 1.1694318182213048, "learning_rate": 7.1457132480386745e-06, "loss": 0.2686, "step": 5610 }, { "epoch": 1.8926344176639138, "grad_norm": 1.0824703576581143, "learning_rate": 7.140781812235245e-06, "loss": 0.2691, "step": 5615 }, { "epoch": 1.894319905612675, "grad_norm": 1.1655191162602738, "learning_rate": 7.135847825194303e-06, "loss": 0.2904, "step": 5620 }, { "epoch": 1.896005393561436, "grad_norm": 1.4629293813227948, "learning_rate": 7.1309112927958345e-06, "loss": 0.2537, "step": 5625 }, { "epoch": 1.8976908815101972, "grad_norm": 1.1877387693585426, "learning_rate": 7.125972220922864e-06, "loss": 0.2917, "step": 5630 }, { "epoch": 1.8993763694589583, "grad_norm": 1.2354905506595124, "learning_rate": 7.1210306154614405e-06, "loss": 0.2866, "step": 5635 }, { "epoch": 1.9010618574077194, "grad_norm": 1.1924275755679479, "learning_rate": 7.116086482300629e-06, "loss": 0.308, "step": 5640 }, { "epoch": 1.9027473453564807, "grad_norm": 1.1118719052510926, "learning_rate": 7.111139827332511e-06, "loss": 0.2494, "step": 5645 }, { "epoch": 1.9044328333052418, "grad_norm": 1.263291013583244, "learning_rate": 7.106190656452173e-06, "loss": 0.2399, "step": 5650 }, { "epoch": 1.9061183212540032, "grad_norm": 1.663479763754197, "learning_rate": 7.1012389755576995e-06, "loss": 0.2979, "step": 5655 }, { "epoch": 1.9078038092027643, "grad_norm": 1.7105749348673616, "learning_rate": 7.096284790550161e-06, "loss": 0.2684, "step": 5660 }, { "epoch": 1.9094892971515254, "grad_norm": 1.1648848641153158, "learning_rate": 7.0913281073336215e-06, "loss": 0.2991, "step": 5665 }, { "epoch": 1.9111747851002865, "grad_norm": 4.468619414466493, "learning_rate": 7.0863689318151156e-06, "loss": 0.3059, "step": 5670 }, { "epoch": 1.9128602730490476, "grad_norm": 1.144369748736879, "learning_rate": 7.081407269904649e-06, "loss": 0.262, "step": 5675 }, { "epoch": 1.9145457609978087, "grad_norm": 1.7484725420755798, "learning_rate": 7.076443127515191e-06, "loss": 0.2877, "step": 5680 }, { "epoch": 1.91623124894657, "grad_norm": 1.574402282970591, "learning_rate": 7.071476510562672e-06, "loss": 0.2816, "step": 5685 }, { "epoch": 1.9179167368953312, "grad_norm": 1.2842659899773619, "learning_rate": 7.0665074249659605e-06, "loss": 0.2835, "step": 5690 }, { "epoch": 1.9196022248440925, "grad_norm": 1.674707294869782, "learning_rate": 7.0615358766468776e-06, "loss": 0.279, "step": 5695 }, { "epoch": 1.9212877127928536, "grad_norm": 1.2832555638357301, "learning_rate": 7.056561871530172e-06, "loss": 0.3106, "step": 5700 }, { "epoch": 1.9229732007416147, "grad_norm": 1.1343021557145014, "learning_rate": 7.051585415543527e-06, "loss": 0.2537, "step": 5705 }, { "epoch": 1.9246586886903758, "grad_norm": 1.0952067263564165, "learning_rate": 7.04660651461754e-06, "loss": 0.2709, "step": 5710 }, { "epoch": 1.926344176639137, "grad_norm": 1.2807376898437488, "learning_rate": 7.041625174685725e-06, "loss": 0.276, "step": 5715 }, { "epoch": 1.928029664587898, "grad_norm": 1.1925516930253768, "learning_rate": 7.036641401684502e-06, "loss": 0.272, "step": 5720 }, { "epoch": 1.9297151525366594, "grad_norm": 1.5060440497108794, "learning_rate": 7.031655201553195e-06, "loss": 0.2821, "step": 5725 }, { "epoch": 1.9314006404854205, "grad_norm": 1.2416504816889506, "learning_rate": 7.026666580234012e-06, "loss": 0.2823, "step": 5730 }, { "epoch": 1.9330861284341818, "grad_norm": 1.5616212828868108, "learning_rate": 7.021675543672054e-06, "loss": 0.2799, "step": 5735 }, { "epoch": 1.934771616382943, "grad_norm": 1.326752471131693, "learning_rate": 7.016682097815297e-06, "loss": 0.2847, "step": 5740 }, { "epoch": 1.936457104331704, "grad_norm": 1.338471926432003, "learning_rate": 7.011686248614588e-06, "loss": 0.2432, "step": 5745 }, { "epoch": 1.9381425922804651, "grad_norm": 1.388370315512864, "learning_rate": 7.006688002023639e-06, "loss": 0.2759, "step": 5750 }, { "epoch": 1.9398280802292263, "grad_norm": 1.0306100423733313, "learning_rate": 7.001687363999017e-06, "loss": 0.2507, "step": 5755 }, { "epoch": 1.9415135681779874, "grad_norm": 3.9810134001311814, "learning_rate": 6.996684340500145e-06, "loss": 0.285, "step": 5760 }, { "epoch": 1.9431990561267487, "grad_norm": 1.218555666408596, "learning_rate": 6.991678937489281e-06, "loss": 0.2757, "step": 5765 }, { "epoch": 1.9448845440755098, "grad_norm": 1.1798125383973037, "learning_rate": 6.986671160931523e-06, "loss": 0.2811, "step": 5770 }, { "epoch": 1.9465700320242711, "grad_norm": 1.3774453741493768, "learning_rate": 6.981661016794799e-06, "loss": 0.3021, "step": 5775 }, { "epoch": 1.9482555199730323, "grad_norm": 1.0889069778871825, "learning_rate": 6.9766485110498535e-06, "loss": 0.2511, "step": 5780 }, { "epoch": 1.9499410079217934, "grad_norm": 1.0894701060484306, "learning_rate": 6.971633649670251e-06, "loss": 0.2571, "step": 5785 }, { "epoch": 1.9516264958705545, "grad_norm": 1.050824463414813, "learning_rate": 6.96661643863236e-06, "loss": 0.265, "step": 5790 }, { "epoch": 1.9533119838193156, "grad_norm": 1.4427860977769666, "learning_rate": 6.961596883915347e-06, "loss": 0.3046, "step": 5795 }, { "epoch": 1.9549974717680767, "grad_norm": 1.1291478688288843, "learning_rate": 6.956574991501179e-06, "loss": 0.2721, "step": 5800 }, { "epoch": 1.956682959716838, "grad_norm": 1.8650217088722214, "learning_rate": 6.951550767374603e-06, "loss": 0.2753, "step": 5805 }, { "epoch": 1.9583684476655991, "grad_norm": 1.1728275649235014, "learning_rate": 6.946524217523145e-06, "loss": 0.2727, "step": 5810 }, { "epoch": 1.9600539356143605, "grad_norm": 1.1308807502678624, "learning_rate": 6.941495347937102e-06, "loss": 0.3103, "step": 5815 }, { "epoch": 1.9617394235631216, "grad_norm": 2.1260449540015487, "learning_rate": 6.936464164609541e-06, "loss": 0.2682, "step": 5820 }, { "epoch": 1.9634249115118827, "grad_norm": 1.1265791238670662, "learning_rate": 6.9314306735362795e-06, "loss": 0.2738, "step": 5825 }, { "epoch": 1.9651103994606438, "grad_norm": 1.4490476432572892, "learning_rate": 6.92639488071589e-06, "loss": 0.2607, "step": 5830 }, { "epoch": 1.966795887409405, "grad_norm": 2.711251059874199, "learning_rate": 6.921356792149686e-06, "loss": 0.2637, "step": 5835 }, { "epoch": 1.968481375358166, "grad_norm": 1.243318693040002, "learning_rate": 6.916316413841718e-06, "loss": 0.2924, "step": 5840 }, { "epoch": 1.9701668633069274, "grad_norm": 1.1683456924274003, "learning_rate": 6.9112737517987635e-06, "loss": 0.2634, "step": 5845 }, { "epoch": 1.9718523512556885, "grad_norm": 1.177735087455277, "learning_rate": 6.906228812030322e-06, "loss": 0.2815, "step": 5850 }, { "epoch": 1.9735378392044498, "grad_norm": 1.5404884111077783, "learning_rate": 6.901181600548609e-06, "loss": 0.3032, "step": 5855 }, { "epoch": 1.975223327153211, "grad_norm": 1.1985779199195747, "learning_rate": 6.896132123368547e-06, "loss": 0.2801, "step": 5860 }, { "epoch": 1.976908815101972, "grad_norm": 1.1736034194769236, "learning_rate": 6.891080386507757e-06, "loss": 0.2736, "step": 5865 }, { "epoch": 1.9785943030507331, "grad_norm": 1.1463302810284723, "learning_rate": 6.886026395986554e-06, "loss": 0.2919, "step": 5870 }, { "epoch": 1.9802797909994942, "grad_norm": 1.161903655011089, "learning_rate": 6.880970157827937e-06, "loss": 0.2879, "step": 5875 }, { "epoch": 1.9819652789482554, "grad_norm": 1.0923068614964753, "learning_rate": 6.8759116780575905e-06, "loss": 0.2533, "step": 5880 }, { "epoch": 1.9836507668970167, "grad_norm": 1.2413766194562732, "learning_rate": 6.8708509627038585e-06, "loss": 0.283, "step": 5885 }, { "epoch": 1.9853362548457778, "grad_norm": 1.142317082249267, "learning_rate": 6.865788017797761e-06, "loss": 0.286, "step": 5890 }, { "epoch": 1.9870217427945391, "grad_norm": 1.3953433251211684, "learning_rate": 6.860722849372967e-06, "loss": 0.2787, "step": 5895 }, { "epoch": 1.9887072307433002, "grad_norm": 1.1307682002997104, "learning_rate": 6.855655463465798e-06, "loss": 0.2875, "step": 5900 }, { "epoch": 1.9903927186920614, "grad_norm": 1.3477079634225693, "learning_rate": 6.8505858661152205e-06, "loss": 0.2915, "step": 5905 }, { "epoch": 1.9920782066408225, "grad_norm": 2.540209978415579, "learning_rate": 6.8455140633628315e-06, "loss": 0.2762, "step": 5910 }, { "epoch": 1.9937636945895836, "grad_norm": 1.1274546132992864, "learning_rate": 6.840440061252862e-06, "loss": 0.29, "step": 5915 }, { "epoch": 1.9954491825383447, "grad_norm": 1.2145097813660723, "learning_rate": 6.835363865832159e-06, "loss": 0.2928, "step": 5920 }, { "epoch": 1.997134670487106, "grad_norm": 1.31697819767494, "learning_rate": 6.830285483150186e-06, "loss": 0.2852, "step": 5925 }, { "epoch": 1.9988201584358671, "grad_norm": 1.262786765571154, "learning_rate": 6.825204919259013e-06, "loss": 0.2659, "step": 5930 }, { "epoch": 2.0003370975897523, "grad_norm": 1.081679471655034, "learning_rate": 6.820122180213309e-06, "loss": 0.2351, "step": 5935 }, { "epoch": 2.0020225855385134, "grad_norm": 1.0974569801233274, "learning_rate": 6.815037272070334e-06, "loss": 0.2388, "step": 5940 }, { "epoch": 2.0037080734872745, "grad_norm": 1.1121862244964253, "learning_rate": 6.809950200889934e-06, "loss": 0.2597, "step": 5945 }, { "epoch": 2.0053935614360356, "grad_norm": 1.303319950093496, "learning_rate": 6.804860972734535e-06, "loss": 0.2713, "step": 5950 }, { "epoch": 2.0070790493847968, "grad_norm": 1.1534870595682194, "learning_rate": 6.799769593669131e-06, "loss": 0.2565, "step": 5955 }, { "epoch": 2.008764537333558, "grad_norm": 1.3382215610294592, "learning_rate": 6.794676069761278e-06, "loss": 0.2808, "step": 5960 }, { "epoch": 2.0104500252823194, "grad_norm": 10.55053784365816, "learning_rate": 6.78958040708109e-06, "loss": 0.2601, "step": 5965 }, { "epoch": 2.0121355132310805, "grad_norm": 1.2387882851117746, "learning_rate": 6.784482611701231e-06, "loss": 0.2734, "step": 5970 }, { "epoch": 2.0138210011798416, "grad_norm": 1.2010925585751586, "learning_rate": 6.779382689696905e-06, "loss": 0.2495, "step": 5975 }, { "epoch": 2.0155064891286028, "grad_norm": 1.3563958907335463, "learning_rate": 6.77428064714585e-06, "loss": 0.2719, "step": 5980 }, { "epoch": 2.017191977077364, "grad_norm": 1.5986235511234503, "learning_rate": 6.769176490128333e-06, "loss": 0.2724, "step": 5985 }, { "epoch": 2.018877465026125, "grad_norm": 1.1796754043878754, "learning_rate": 6.764070224727137e-06, "loss": 0.2553, "step": 5990 }, { "epoch": 2.020562952974886, "grad_norm": 1.1498771592568715, "learning_rate": 6.758961857027564e-06, "loss": 0.2993, "step": 5995 }, { "epoch": 2.022248440923647, "grad_norm": 1.210072788515302, "learning_rate": 6.753851393117414e-06, "loss": 0.2048, "step": 6000 }, { "epoch": 2.0239339288724087, "grad_norm": 1.1455726254272833, "learning_rate": 6.748738839086992e-06, "loss": 0.2667, "step": 6005 }, { "epoch": 2.02561941682117, "grad_norm": 1.600119557458997, "learning_rate": 6.743624201029089e-06, "loss": 0.2686, "step": 6010 }, { "epoch": 2.027304904769931, "grad_norm": 1.3810363711281857, "learning_rate": 6.738507485038981e-06, "loss": 0.2682, "step": 6015 }, { "epoch": 2.028990392718692, "grad_norm": 1.266606651116861, "learning_rate": 6.733388697214419e-06, "loss": 0.2173, "step": 6020 }, { "epoch": 2.030675880667453, "grad_norm": 1.2614845305669782, "learning_rate": 6.728267843655628e-06, "loss": 0.2644, "step": 6025 }, { "epoch": 2.0323613686162143, "grad_norm": 1.3900921696692135, "learning_rate": 6.723144930465288e-06, "loss": 0.2416, "step": 6030 }, { "epoch": 2.0340468565649754, "grad_norm": 1.130968249461293, "learning_rate": 6.718019963748542e-06, "loss": 0.2495, "step": 6035 }, { "epoch": 2.0357323445137365, "grad_norm": 1.619017723239934, "learning_rate": 6.71289294961297e-06, "loss": 0.247, "step": 6040 }, { "epoch": 2.037417832462498, "grad_norm": 1.2928331465674285, "learning_rate": 6.7077638941685994e-06, "loss": 0.2366, "step": 6045 }, { "epoch": 2.039103320411259, "grad_norm": 1.2805082081937507, "learning_rate": 6.70263280352789e-06, "loss": 0.2482, "step": 6050 }, { "epoch": 2.0407888083600203, "grad_norm": 1.8872078181420409, "learning_rate": 6.69749968380572e-06, "loss": 0.2475, "step": 6055 }, { "epoch": 2.0424742963087814, "grad_norm": 1.0204046297193239, "learning_rate": 6.692364541119396e-06, "loss": 0.2252, "step": 6060 }, { "epoch": 2.0441597842575425, "grad_norm": 2.0257825357597796, "learning_rate": 6.687227381588627e-06, "loss": 0.2555, "step": 6065 }, { "epoch": 2.0458452722063036, "grad_norm": 1.2512342026367647, "learning_rate": 6.682088211335531e-06, "loss": 0.2573, "step": 6070 }, { "epoch": 2.0475307601550647, "grad_norm": 1.386188385979886, "learning_rate": 6.676947036484617e-06, "loss": 0.2644, "step": 6075 }, { "epoch": 2.049216248103826, "grad_norm": 1.0861165391839216, "learning_rate": 6.671803863162789e-06, "loss": 0.2388, "step": 6080 }, { "epoch": 2.0509017360525874, "grad_norm": 2.0772980119317106, "learning_rate": 6.666658697499329e-06, "loss": 0.2532, "step": 6085 }, { "epoch": 2.0525872240013485, "grad_norm": 1.3067634721841486, "learning_rate": 6.6615115456258925e-06, "loss": 0.2635, "step": 6090 }, { "epoch": 2.0542727119501096, "grad_norm": 1.6623088119304197, "learning_rate": 6.656362413676503e-06, "loss": 0.2502, "step": 6095 }, { "epoch": 2.0559581998988707, "grad_norm": 1.2518297326165573, "learning_rate": 6.651211307787549e-06, "loss": 0.2388, "step": 6100 }, { "epoch": 2.057643687847632, "grad_norm": 1.3434940303637946, "learning_rate": 6.64605823409776e-06, "loss": 0.2235, "step": 6105 }, { "epoch": 2.059329175796393, "grad_norm": 0.9897277403291037, "learning_rate": 6.640903198748222e-06, "loss": 0.2388, "step": 6110 }, { "epoch": 2.061014663745154, "grad_norm": 1.45510930813586, "learning_rate": 6.635746207882349e-06, "loss": 0.2697, "step": 6115 }, { "epoch": 2.062700151693915, "grad_norm": 1.3521089592228548, "learning_rate": 6.630587267645898e-06, "loss": 0.2622, "step": 6120 }, { "epoch": 2.0643856396426767, "grad_norm": 1.0692698834733692, "learning_rate": 6.625426384186935e-06, "loss": 0.2459, "step": 6125 }, { "epoch": 2.066071127591438, "grad_norm": 1.4825489235287137, "learning_rate": 6.620263563655851e-06, "loss": 0.2695, "step": 6130 }, { "epoch": 2.067756615540199, "grad_norm": 1.1681226122016348, "learning_rate": 6.615098812205342e-06, "loss": 0.2486, "step": 6135 }, { "epoch": 2.06944210348896, "grad_norm": 2.0777535621564756, "learning_rate": 6.609932135990407e-06, "loss": 0.252, "step": 6140 }, { "epoch": 2.071127591437721, "grad_norm": 1.1701961255204318, "learning_rate": 6.604763541168336e-06, "loss": 0.221, "step": 6145 }, { "epoch": 2.0728130793864823, "grad_norm": 1.3049019400716555, "learning_rate": 6.5995930338987095e-06, "loss": 0.2228, "step": 6150 }, { "epoch": 2.0744985673352434, "grad_norm": 1.4136741050753618, "learning_rate": 6.594420620343383e-06, "loss": 0.2553, "step": 6155 }, { "epoch": 2.0761840552840045, "grad_norm": 1.5809852448132287, "learning_rate": 6.589246306666486e-06, "loss": 0.2627, "step": 6160 }, { "epoch": 2.077869543232766, "grad_norm": 1.4880142228637652, "learning_rate": 6.584070099034412e-06, "loss": 0.2313, "step": 6165 }, { "epoch": 2.079555031181527, "grad_norm": 1.2393786050637563, "learning_rate": 6.578892003615812e-06, "loss": 0.2262, "step": 6170 }, { "epoch": 2.0812405191302883, "grad_norm": 1.3603080402046084, "learning_rate": 6.573712026581587e-06, "loss": 0.2494, "step": 6175 }, { "epoch": 2.0829260070790494, "grad_norm": 1.166870025568283, "learning_rate": 6.568530174104878e-06, "loss": 0.2427, "step": 6180 }, { "epoch": 2.0846114950278105, "grad_norm": 1.3565042842232793, "learning_rate": 6.563346452361064e-06, "loss": 0.2743, "step": 6185 }, { "epoch": 2.0862969829765716, "grad_norm": 1.264171481817194, "learning_rate": 6.5581608675277496e-06, "loss": 0.24, "step": 6190 }, { "epoch": 2.0879824709253327, "grad_norm": 1.3624315567935192, "learning_rate": 6.5529734257847636e-06, "loss": 0.2501, "step": 6195 }, { "epoch": 2.089667958874094, "grad_norm": 1.265549615564812, "learning_rate": 6.54778413331414e-06, "loss": 0.2535, "step": 6200 }, { "epoch": 2.0913534468228554, "grad_norm": 1.2127408077403912, "learning_rate": 6.542592996300125e-06, "loss": 0.2537, "step": 6205 }, { "epoch": 2.0930389347716165, "grad_norm": 1.1296303663606113, "learning_rate": 6.537400020929162e-06, "loss": 0.2611, "step": 6210 }, { "epoch": 2.0947244227203776, "grad_norm": 1.3157590868806934, "learning_rate": 6.532205213389885e-06, "loss": 0.2588, "step": 6215 }, { "epoch": 2.0964099106691387, "grad_norm": 1.6103286145082827, "learning_rate": 6.527008579873107e-06, "loss": 0.2485, "step": 6220 }, { "epoch": 2.0980953986179, "grad_norm": 1.1648434001401868, "learning_rate": 6.521810126571825e-06, "loss": 0.243, "step": 6225 }, { "epoch": 2.099780886566661, "grad_norm": 1.1147612768375883, "learning_rate": 6.516609859681198e-06, "loss": 0.2536, "step": 6230 }, { "epoch": 2.101466374515422, "grad_norm": 1.2655454397441157, "learning_rate": 6.511407785398549e-06, "loss": 0.233, "step": 6235 }, { "epoch": 2.103151862464183, "grad_norm": 1.536416415087582, "learning_rate": 6.506203909923357e-06, "loss": 0.2161, "step": 6240 }, { "epoch": 2.1048373504129447, "grad_norm": 1.1697953229322648, "learning_rate": 6.500998239457241e-06, "loss": 0.2304, "step": 6245 }, { "epoch": 2.106522838361706, "grad_norm": 1.2246499214653337, "learning_rate": 6.495790780203967e-06, "loss": 0.2485, "step": 6250 }, { "epoch": 2.108208326310467, "grad_norm": 0.9838021800101451, "learning_rate": 6.490581538369429e-06, "loss": 0.2532, "step": 6255 }, { "epoch": 2.109893814259228, "grad_norm": 1.6274303549369238, "learning_rate": 6.485370520161643e-06, "loss": 0.2431, "step": 6260 }, { "epoch": 2.111579302207989, "grad_norm": 1.174549942148735, "learning_rate": 6.480157731790747e-06, "loss": 0.2401, "step": 6265 }, { "epoch": 2.1132647901567503, "grad_norm": 1.298723556089621, "learning_rate": 6.474943179468986e-06, "loss": 0.273, "step": 6270 }, { "epoch": 2.1149502781055114, "grad_norm": 1.362565090959308, "learning_rate": 6.469726869410706e-06, "loss": 0.2425, "step": 6275 }, { "epoch": 2.1166357660542725, "grad_norm": 1.2707050963377002, "learning_rate": 6.464508807832348e-06, "loss": 0.2492, "step": 6280 }, { "epoch": 2.118321254003034, "grad_norm": 1.4398493183564267, "learning_rate": 6.4592890009524446e-06, "loss": 0.2208, "step": 6285 }, { "epoch": 2.120006741951795, "grad_norm": 2.2097074051300902, "learning_rate": 6.454067454991602e-06, "loss": 0.254, "step": 6290 }, { "epoch": 2.1216922299005563, "grad_norm": 1.0908970533139506, "learning_rate": 6.448844176172504e-06, "loss": 0.2512, "step": 6295 }, { "epoch": 2.1233777178493174, "grad_norm": 1.1939799800037705, "learning_rate": 6.443619170719896e-06, "loss": 0.253, "step": 6300 }, { "epoch": 2.1250632057980785, "grad_norm": 1.6320742143381952, "learning_rate": 6.438392444860584e-06, "loss": 0.2553, "step": 6305 }, { "epoch": 2.1267486937468396, "grad_norm": 1.2004140859484624, "learning_rate": 6.433164004823421e-06, "loss": 0.2279, "step": 6310 }, { "epoch": 2.1284341816956007, "grad_norm": 1.2971609743548953, "learning_rate": 6.427933856839305e-06, "loss": 0.2563, "step": 6315 }, { "epoch": 2.130119669644362, "grad_norm": 1.2299086008240054, "learning_rate": 6.4227020071411704e-06, "loss": 0.2605, "step": 6320 }, { "epoch": 2.1318051575931234, "grad_norm": 1.312129544332664, "learning_rate": 6.417468461963978e-06, "loss": 0.242, "step": 6325 }, { "epoch": 2.1334906455418845, "grad_norm": 1.2869067770450704, "learning_rate": 6.41223322754471e-06, "loss": 0.2234, "step": 6330 }, { "epoch": 2.1351761334906456, "grad_norm": 1.1860377014817758, "learning_rate": 6.4069963101223575e-06, "loss": 0.2431, "step": 6335 }, { "epoch": 2.1368616214394067, "grad_norm": 1.2427302070054334, "learning_rate": 6.401757715937924e-06, "loss": 0.2707, "step": 6340 }, { "epoch": 2.138547109388168, "grad_norm": 1.2421934361326865, "learning_rate": 6.3965174512344074e-06, "loss": 0.2499, "step": 6345 }, { "epoch": 2.140232597336929, "grad_norm": 1.33796945391822, "learning_rate": 6.391275522256799e-06, "loss": 0.2491, "step": 6350 }, { "epoch": 2.14191808528569, "grad_norm": 1.2661647720517677, "learning_rate": 6.386031935252068e-06, "loss": 0.2312, "step": 6355 }, { "epoch": 2.1436035732344516, "grad_norm": 3.878190628066829, "learning_rate": 6.380786696469168e-06, "loss": 0.2412, "step": 6360 }, { "epoch": 2.1452890611832127, "grad_norm": 1.1235382207507927, "learning_rate": 6.37553981215901e-06, "loss": 0.2546, "step": 6365 }, { "epoch": 2.146974549131974, "grad_norm": 1.131285056452738, "learning_rate": 6.370291288574479e-06, "loss": 0.236, "step": 6370 }, { "epoch": 2.148660037080735, "grad_norm": 1.148681698062413, "learning_rate": 6.365041131970401e-06, "loss": 0.2506, "step": 6375 }, { "epoch": 2.150345525029496, "grad_norm": 1.0017329344505117, "learning_rate": 6.359789348603559e-06, "loss": 0.2171, "step": 6380 }, { "epoch": 2.152031012978257, "grad_norm": 1.2961719987505604, "learning_rate": 6.354535944732665e-06, "loss": 0.2551, "step": 6385 }, { "epoch": 2.1537165009270183, "grad_norm": 1.7615416802946193, "learning_rate": 6.3492809266183705e-06, "loss": 0.2355, "step": 6390 }, { "epoch": 2.1554019888757794, "grad_norm": 1.4712566924642663, "learning_rate": 6.344024300523244e-06, "loss": 0.24, "step": 6395 }, { "epoch": 2.1570874768245405, "grad_norm": 2.5427161626675177, "learning_rate": 6.338766072711777e-06, "loss": 0.2329, "step": 6400 }, { "epoch": 2.158772964773302, "grad_norm": 1.0873714400857215, "learning_rate": 6.333506249450363e-06, "loss": 0.2342, "step": 6405 }, { "epoch": 2.160458452722063, "grad_norm": 2.356290105341313, "learning_rate": 6.328244837007302e-06, "loss": 0.2207, "step": 6410 }, { "epoch": 2.1621439406708243, "grad_norm": 1.0946280353668836, "learning_rate": 6.322981841652784e-06, "loss": 0.2415, "step": 6415 }, { "epoch": 2.1638294286195854, "grad_norm": 1.2226895675382223, "learning_rate": 6.317717269658889e-06, "loss": 0.2397, "step": 6420 }, { "epoch": 2.1655149165683465, "grad_norm": 1.1313438840146788, "learning_rate": 6.312451127299572e-06, "loss": 0.2209, "step": 6425 }, { "epoch": 2.1672004045171076, "grad_norm": 1.149977859921272, "learning_rate": 6.307183420850666e-06, "loss": 0.2377, "step": 6430 }, { "epoch": 2.1688858924658687, "grad_norm": 1.2413176937109252, "learning_rate": 6.30191415658986e-06, "loss": 0.2483, "step": 6435 }, { "epoch": 2.1705713804146303, "grad_norm": 1.0624062904086755, "learning_rate": 6.296643340796704e-06, "loss": 0.272, "step": 6440 }, { "epoch": 2.1722568683633914, "grad_norm": 1.2458917486919447, "learning_rate": 6.291370979752596e-06, "loss": 0.2345, "step": 6445 }, { "epoch": 2.1739423563121525, "grad_norm": 1.1092889538805586, "learning_rate": 6.286097079740776e-06, "loss": 0.2732, "step": 6450 }, { "epoch": 2.1756278442609136, "grad_norm": 1.2145308467589744, "learning_rate": 6.280821647046319e-06, "loss": 0.237, "step": 6455 }, { "epoch": 2.1773133322096747, "grad_norm": 1.1942778026429117, "learning_rate": 6.2755446879561235e-06, "loss": 0.252, "step": 6460 }, { "epoch": 2.178998820158436, "grad_norm": 1.1936836670649174, "learning_rate": 6.27026620875891e-06, "loss": 0.2527, "step": 6465 }, { "epoch": 2.180684308107197, "grad_norm": 1.6407158583958905, "learning_rate": 6.2649862157452075e-06, "loss": 0.2515, "step": 6470 }, { "epoch": 2.182369796055958, "grad_norm": 1.4042006113815277, "learning_rate": 6.2597047152073535e-06, "loss": 0.2365, "step": 6475 }, { "epoch": 2.184055284004719, "grad_norm": 1.264853171511084, "learning_rate": 6.254421713439478e-06, "loss": 0.2481, "step": 6480 }, { "epoch": 2.1857407719534807, "grad_norm": 1.1571421488286933, "learning_rate": 6.2491372167375035e-06, "loss": 0.2566, "step": 6485 }, { "epoch": 2.187426259902242, "grad_norm": 1.1344941892282723, "learning_rate": 6.243851231399127e-06, "loss": 0.2642, "step": 6490 }, { "epoch": 2.189111747851003, "grad_norm": 1.033522861212516, "learning_rate": 6.23856376372383e-06, "loss": 0.2354, "step": 6495 }, { "epoch": 2.190797235799764, "grad_norm": 1.3910548577470034, "learning_rate": 6.233274820012854e-06, "loss": 0.2417, "step": 6500 }, { "epoch": 2.192482723748525, "grad_norm": 1.1187488644271608, "learning_rate": 6.227984406569202e-06, "loss": 0.2451, "step": 6505 }, { "epoch": 2.1941682116972863, "grad_norm": 1.3938663799017454, "learning_rate": 6.2226925296976215e-06, "loss": 0.2482, "step": 6510 }, { "epoch": 2.1958536996460474, "grad_norm": 1.2143780836363995, "learning_rate": 6.217399195704618e-06, "loss": 0.2628, "step": 6515 }, { "epoch": 2.197539187594809, "grad_norm": 1.5102904432753659, "learning_rate": 6.212104410898419e-06, "loss": 0.2271, "step": 6520 }, { "epoch": 2.19922467554357, "grad_norm": 1.2594634974044248, "learning_rate": 6.206808181588991e-06, "loss": 0.2474, "step": 6525 }, { "epoch": 2.200910163492331, "grad_norm": 1.2072521354060317, "learning_rate": 6.201510514088015e-06, "loss": 0.2626, "step": 6530 }, { "epoch": 2.2025956514410923, "grad_norm": 1.1328336914246169, "learning_rate": 6.196211414708894e-06, "loss": 0.2193, "step": 6535 }, { "epoch": 2.2042811393898534, "grad_norm": 1.245626489237525, "learning_rate": 6.190910889766727e-06, "loss": 0.2426, "step": 6540 }, { "epoch": 2.2059666273386145, "grad_norm": 1.1860186636748626, "learning_rate": 6.1856089455783205e-06, "loss": 0.2277, "step": 6545 }, { "epoch": 2.2076521152873756, "grad_norm": 1.2039569888275217, "learning_rate": 6.18030558846217e-06, "loss": 0.2015, "step": 6550 }, { "epoch": 2.2093376032361367, "grad_norm": 1.3145544977631802, "learning_rate": 6.175000824738455e-06, "loss": 0.2499, "step": 6555 }, { "epoch": 2.211023091184898, "grad_norm": 1.193321888684212, "learning_rate": 6.169694660729026e-06, "loss": 0.2399, "step": 6560 }, { "epoch": 2.2127085791336594, "grad_norm": 1.616439088934144, "learning_rate": 6.164387102757411e-06, "loss": 0.2361, "step": 6565 }, { "epoch": 2.2143940670824205, "grad_norm": 1.4583808239398672, "learning_rate": 6.1590781571487935e-06, "loss": 0.2393, "step": 6570 }, { "epoch": 2.2160795550311816, "grad_norm": 2.2827973314642036, "learning_rate": 6.153767830230013e-06, "loss": 0.2541, "step": 6575 }, { "epoch": 2.2177650429799427, "grad_norm": 1.1064912696242977, "learning_rate": 6.148456128329553e-06, "loss": 0.2402, "step": 6580 }, { "epoch": 2.219450530928704, "grad_norm": 1.1891062258691032, "learning_rate": 6.143143057777537e-06, "loss": 0.2361, "step": 6585 }, { "epoch": 2.221136018877465, "grad_norm": 1.2681051786950261, "learning_rate": 6.137828624905722e-06, "loss": 0.2536, "step": 6590 }, { "epoch": 2.222821506826226, "grad_norm": 1.2534210276757634, "learning_rate": 6.132512836047482e-06, "loss": 0.2359, "step": 6595 }, { "epoch": 2.2245069947749876, "grad_norm": 1.0748329546371043, "learning_rate": 6.127195697537813e-06, "loss": 0.252, "step": 6600 }, { "epoch": 2.2261924827237487, "grad_norm": 1.1736147397405687, "learning_rate": 6.1218772157133185e-06, "loss": 0.2388, "step": 6605 }, { "epoch": 2.22787797067251, "grad_norm": 1.0568139309517195, "learning_rate": 6.116557396912202e-06, "loss": 0.2311, "step": 6610 }, { "epoch": 2.229563458621271, "grad_norm": 1.0798317282856567, "learning_rate": 6.111236247474257e-06, "loss": 0.2509, "step": 6615 }, { "epoch": 2.231248946570032, "grad_norm": 1.2481887998981374, "learning_rate": 6.105913773740868e-06, "loss": 0.2547, "step": 6620 }, { "epoch": 2.232934434518793, "grad_norm": 1.3505784539591155, "learning_rate": 6.100589982054996e-06, "loss": 0.2395, "step": 6625 }, { "epoch": 2.2346199224675543, "grad_norm": 1.5618324719320573, "learning_rate": 6.095264878761173e-06, "loss": 0.2466, "step": 6630 }, { "epoch": 2.2363054104163154, "grad_norm": 14.96700943268755, "learning_rate": 6.089938470205491e-06, "loss": 0.2352, "step": 6635 }, { "epoch": 2.2379908983650765, "grad_norm": 1.5468393100897242, "learning_rate": 6.0846107627356e-06, "loss": 0.2331, "step": 6640 }, { "epoch": 2.239676386313838, "grad_norm": 3.150900044580589, "learning_rate": 6.079281762700699e-06, "loss": 0.2291, "step": 6645 }, { "epoch": 2.241361874262599, "grad_norm": 1.613263136193192, "learning_rate": 6.073951476451527e-06, "loss": 0.2621, "step": 6650 }, { "epoch": 2.2430473622113603, "grad_norm": 1.7551864238089545, "learning_rate": 6.068619910340352e-06, "loss": 0.2395, "step": 6655 }, { "epoch": 2.2447328501601214, "grad_norm": 1.4465600175104945, "learning_rate": 6.063287070720973e-06, "loss": 0.255, "step": 6660 }, { "epoch": 2.2464183381088825, "grad_norm": 1.258228336774687, "learning_rate": 6.057952963948702e-06, "loss": 0.2481, "step": 6665 }, { "epoch": 2.2481038260576436, "grad_norm": 1.263143196546373, "learning_rate": 6.052617596380367e-06, "loss": 0.2531, "step": 6670 }, { "epoch": 2.2497893140064047, "grad_norm": 4.807974012440118, "learning_rate": 6.047280974374288e-06, "loss": 0.2444, "step": 6675 }, { "epoch": 2.2514748019551662, "grad_norm": 2.100061007257097, "learning_rate": 6.041943104290292e-06, "loss": 0.2398, "step": 6680 }, { "epoch": 2.2531602899039274, "grad_norm": 1.4269327638298077, "learning_rate": 6.036603992489686e-06, "loss": 0.2361, "step": 6685 }, { "epoch": 2.2548457778526885, "grad_norm": 1.360058455325419, "learning_rate": 6.031263645335259e-06, "loss": 0.2401, "step": 6690 }, { "epoch": 2.2565312658014496, "grad_norm": 1.1782327502374301, "learning_rate": 6.0259220691912716e-06, "loss": 0.2302, "step": 6695 }, { "epoch": 2.2582167537502107, "grad_norm": 1.164819799173591, "learning_rate": 6.020579270423449e-06, "loss": 0.2593, "step": 6700 }, { "epoch": 2.259902241698972, "grad_norm": 1.2387369444451408, "learning_rate": 6.015235255398974e-06, "loss": 0.2592, "step": 6705 }, { "epoch": 2.261587729647733, "grad_norm": 1.6254405001299357, "learning_rate": 6.009890030486479e-06, "loss": 0.2475, "step": 6710 }, { "epoch": 2.263273217596494, "grad_norm": 1.625103328435798, "learning_rate": 6.004543602056037e-06, "loss": 0.2305, "step": 6715 }, { "epoch": 2.264958705545255, "grad_norm": 1.1636770921013966, "learning_rate": 5.999195976479157e-06, "loss": 0.2473, "step": 6720 }, { "epoch": 2.2666441934940167, "grad_norm": 11.893545102865916, "learning_rate": 5.993847160128775e-06, "loss": 0.2357, "step": 6725 }, { "epoch": 2.268329681442778, "grad_norm": 1.1335921386467114, "learning_rate": 5.988497159379243e-06, "loss": 0.2193, "step": 6730 }, { "epoch": 2.270015169391539, "grad_norm": 1.0642553169784263, "learning_rate": 5.983145980606326e-06, "loss": 0.235, "step": 6735 }, { "epoch": 2.2717006573403, "grad_norm": 3.786792114465667, "learning_rate": 5.977793630187195e-06, "loss": 0.2493, "step": 6740 }, { "epoch": 2.273386145289061, "grad_norm": 1.1713137338435256, "learning_rate": 5.972440114500416e-06, "loss": 0.2555, "step": 6745 }, { "epoch": 2.2750716332378222, "grad_norm": 1.3623208121739148, "learning_rate": 5.967085439925939e-06, "loss": 0.2182, "step": 6750 }, { "epoch": 2.2767571211865834, "grad_norm": 1.344198239937484, "learning_rate": 5.961729612845106e-06, "loss": 0.223, "step": 6755 }, { "epoch": 2.278442609135345, "grad_norm": 1.383230976151544, "learning_rate": 5.956372639640619e-06, "loss": 0.2359, "step": 6760 }, { "epoch": 2.280128097084106, "grad_norm": 1.944799811793341, "learning_rate": 5.951014526696559e-06, "loss": 0.2442, "step": 6765 }, { "epoch": 2.281813585032867, "grad_norm": 1.0918926349432208, "learning_rate": 5.945655280398354e-06, "loss": 0.2138, "step": 6770 }, { "epoch": 2.2834990729816282, "grad_norm": 1.1231403105751203, "learning_rate": 5.940294907132791e-06, "loss": 0.2417, "step": 6775 }, { "epoch": 2.2851845609303894, "grad_norm": 1.3184852064982298, "learning_rate": 5.9349334132879934e-06, "loss": 0.2291, "step": 6780 }, { "epoch": 2.2868700488791505, "grad_norm": 2.2537021473360648, "learning_rate": 5.929570805253427e-06, "loss": 0.2581, "step": 6785 }, { "epoch": 2.2885555368279116, "grad_norm": 1.100721585922719, "learning_rate": 5.924207089419877e-06, "loss": 0.2191, "step": 6790 }, { "epoch": 2.2902410247766727, "grad_norm": 1.1670665658746653, "learning_rate": 5.918842272179459e-06, "loss": 0.2654, "step": 6795 }, { "epoch": 2.291926512725434, "grad_norm": 1.1589864389994455, "learning_rate": 5.9134763599255916e-06, "loss": 0.2471, "step": 6800 }, { "epoch": 2.2936120006741954, "grad_norm": 1.0481508446883854, "learning_rate": 5.908109359053005e-06, "loss": 0.2255, "step": 6805 }, { "epoch": 2.2952974886229565, "grad_norm": 1.047008069868799, "learning_rate": 5.902741275957721e-06, "loss": 0.2479, "step": 6810 }, { "epoch": 2.2969829765717176, "grad_norm": 1.2058473501896603, "learning_rate": 5.897372117037059e-06, "loss": 0.2612, "step": 6815 }, { "epoch": 2.2986684645204787, "grad_norm": 1.4300246968276056, "learning_rate": 5.892001888689612e-06, "loss": 0.2615, "step": 6820 }, { "epoch": 2.30035395246924, "grad_norm": 1.269627297541263, "learning_rate": 5.88663059731525e-06, "loss": 0.2318, "step": 6825 }, { "epoch": 2.302039440418001, "grad_norm": 1.1374461606786461, "learning_rate": 5.881258249315116e-06, "loss": 0.2229, "step": 6830 }, { "epoch": 2.303724928366762, "grad_norm": 1.7180410432225457, "learning_rate": 5.875884851091604e-06, "loss": 0.2442, "step": 6835 }, { "epoch": 2.3054104163155236, "grad_norm": 1.2143351896131052, "learning_rate": 5.870510409048365e-06, "loss": 0.2423, "step": 6840 }, { "epoch": 2.3070959042642847, "grad_norm": 1.1258442510338162, "learning_rate": 5.8651349295902896e-06, "loss": 0.2387, "step": 6845 }, { "epoch": 2.308781392213046, "grad_norm": 1.137679132016169, "learning_rate": 5.859758419123508e-06, "loss": 0.2403, "step": 6850 }, { "epoch": 2.310466880161807, "grad_norm": 1.112321254644545, "learning_rate": 5.854380884055377e-06, "loss": 0.262, "step": 6855 }, { "epoch": 2.312152368110568, "grad_norm": 1.3640879778961075, "learning_rate": 5.849002330794478e-06, "loss": 0.2576, "step": 6860 }, { "epoch": 2.313837856059329, "grad_norm": 1.0839730017891585, "learning_rate": 5.843622765750601e-06, "loss": 0.2697, "step": 6865 }, { "epoch": 2.3155233440080902, "grad_norm": 1.263259455972846, "learning_rate": 5.838242195334747e-06, "loss": 0.2596, "step": 6870 }, { "epoch": 2.3172088319568513, "grad_norm": 1.1893704341874667, "learning_rate": 5.832860625959108e-06, "loss": 0.223, "step": 6875 }, { "epoch": 2.3188943199056125, "grad_norm": 1.1540849636190489, "learning_rate": 5.8274780640370735e-06, "loss": 0.2375, "step": 6880 }, { "epoch": 2.320579807854374, "grad_norm": 1.2223960562080016, "learning_rate": 5.822094515983213e-06, "loss": 0.2381, "step": 6885 }, { "epoch": 2.322265295803135, "grad_norm": 1.487367547151045, "learning_rate": 5.816709988213272e-06, "loss": 0.2148, "step": 6890 }, { "epoch": 2.3239507837518962, "grad_norm": 1.2503138457234537, "learning_rate": 5.811324487144158e-06, "loss": 0.202, "step": 6895 }, { "epoch": 2.3256362717006573, "grad_norm": 1.0493433032609951, "learning_rate": 5.805938019193951e-06, "loss": 0.2186, "step": 6900 }, { "epoch": 2.3273217596494185, "grad_norm": 1.2202510868363294, "learning_rate": 5.800550590781868e-06, "loss": 0.2466, "step": 6905 }, { "epoch": 2.3290072475981796, "grad_norm": 1.2202610825295719, "learning_rate": 5.7951622083282855e-06, "loss": 0.2062, "step": 6910 }, { "epoch": 2.3306927355469407, "grad_norm": 1.8773743398521658, "learning_rate": 5.789772878254702e-06, "loss": 0.2512, "step": 6915 }, { "epoch": 2.3323782234957022, "grad_norm": 1.4613324066896567, "learning_rate": 5.784382606983758e-06, "loss": 0.2153, "step": 6920 }, { "epoch": 2.3340637114444633, "grad_norm": 2.7067987953485186, "learning_rate": 5.77899140093921e-06, "loss": 0.2469, "step": 6925 }, { "epoch": 2.3357491993932245, "grad_norm": 1.1325219005782825, "learning_rate": 5.773599266545929e-06, "loss": 0.2341, "step": 6930 }, { "epoch": 2.3374346873419856, "grad_norm": 1.4263889886016168, "learning_rate": 5.7682062102298885e-06, "loss": 0.2388, "step": 6935 }, { "epoch": 2.3391201752907467, "grad_norm": 4.1610607418431815, "learning_rate": 5.76281223841817e-06, "loss": 0.2476, "step": 6940 }, { "epoch": 2.340805663239508, "grad_norm": 1.5113178559529479, "learning_rate": 5.757417357538937e-06, "loss": 0.2155, "step": 6945 }, { "epoch": 2.342491151188269, "grad_norm": 1.0816194687734555, "learning_rate": 5.7520215740214425e-06, "loss": 0.2247, "step": 6950 }, { "epoch": 2.34417663913703, "grad_norm": 1.6654194399233935, "learning_rate": 5.746624894296011e-06, "loss": 0.2251, "step": 6955 }, { "epoch": 2.345862127085791, "grad_norm": 1.2577144581376345, "learning_rate": 5.741227324794036e-06, "loss": 0.2498, "step": 6960 }, { "epoch": 2.3475476150345527, "grad_norm": 1.3714996263539714, "learning_rate": 5.735828871947975e-06, "loss": 0.2566, "step": 6965 }, { "epoch": 2.349233102983314, "grad_norm": 1.3782756750931844, "learning_rate": 5.730429542191334e-06, "loss": 0.2025, "step": 6970 }, { "epoch": 2.350918590932075, "grad_norm": 1.265353171121104, "learning_rate": 5.725029341958663e-06, "loss": 0.2126, "step": 6975 }, { "epoch": 2.352604078880836, "grad_norm": 1.2328021537041824, "learning_rate": 5.719628277685554e-06, "loss": 0.2225, "step": 6980 }, { "epoch": 2.354289566829597, "grad_norm": 20.18868511446677, "learning_rate": 5.714226355808626e-06, "loss": 0.2244, "step": 6985 }, { "epoch": 2.3559750547783582, "grad_norm": 1.3438543339064466, "learning_rate": 5.708823582765522e-06, "loss": 0.2709, "step": 6990 }, { "epoch": 2.3576605427271193, "grad_norm": 1.124947070515268, "learning_rate": 5.703419964994895e-06, "loss": 0.2225, "step": 6995 }, { "epoch": 2.359346030675881, "grad_norm": 1.310836306015827, "learning_rate": 5.698015508936409e-06, "loss": 0.2285, "step": 7000 }, { "epoch": 2.361031518624642, "grad_norm": 1.584790783446456, "learning_rate": 5.692610221030725e-06, "loss": 0.2538, "step": 7005 }, { "epoch": 2.362717006573403, "grad_norm": 1.4685376379074737, "learning_rate": 5.687204107719497e-06, "loss": 0.2165, "step": 7010 }, { "epoch": 2.364402494522164, "grad_norm": 1.269877373502541, "learning_rate": 5.68179717544536e-06, "loss": 0.2023, "step": 7015 }, { "epoch": 2.3660879824709253, "grad_norm": 1.094382809454758, "learning_rate": 5.676389430651928e-06, "loss": 0.2177, "step": 7020 }, { "epoch": 2.3677734704196864, "grad_norm": 1.1387050143356454, "learning_rate": 5.670980879783781e-06, "loss": 0.2458, "step": 7025 }, { "epoch": 2.3694589583684476, "grad_norm": 1.1974323412230972, "learning_rate": 5.665571529286459e-06, "loss": 0.2084, "step": 7030 }, { "epoch": 2.3711444463172087, "grad_norm": 1.092310961345673, "learning_rate": 5.660161385606457e-06, "loss": 0.2047, "step": 7035 }, { "epoch": 2.3728299342659698, "grad_norm": 1.527084870640945, "learning_rate": 5.654750455191218e-06, "loss": 0.232, "step": 7040 }, { "epoch": 2.3745154222147313, "grad_norm": 1.0282336568760309, "learning_rate": 5.649338744489117e-06, "loss": 0.2158, "step": 7045 }, { "epoch": 2.3762009101634924, "grad_norm": 1.3155101904587596, "learning_rate": 5.643926259949457e-06, "loss": 0.213, "step": 7050 }, { "epoch": 2.3778863981122536, "grad_norm": 2.866191674783534, "learning_rate": 5.638513008022474e-06, "loss": 0.2209, "step": 7055 }, { "epoch": 2.3795718860610147, "grad_norm": 1.5414440506664546, "learning_rate": 5.633098995159309e-06, "loss": 0.2424, "step": 7060 }, { "epoch": 2.3812573740097758, "grad_norm": 1.112557460013863, "learning_rate": 5.627684227812013e-06, "loss": 0.201, "step": 7065 }, { "epoch": 2.382942861958537, "grad_norm": 2.0114792448001535, "learning_rate": 5.622268712433534e-06, "loss": 0.2334, "step": 7070 }, { "epoch": 2.384628349907298, "grad_norm": 1.2742428624111608, "learning_rate": 5.616852455477716e-06, "loss": 0.2351, "step": 7075 }, { "epoch": 2.3863138378560595, "grad_norm": 1.3671795536971996, "learning_rate": 5.611435463399281e-06, "loss": 0.2398, "step": 7080 }, { "epoch": 2.3879993258048207, "grad_norm": 1.1301205543521131, "learning_rate": 5.606017742653833e-06, "loss": 0.2235, "step": 7085 }, { "epoch": 2.3896848137535818, "grad_norm": 1.1526152359615127, "learning_rate": 5.600599299697839e-06, "loss": 0.2468, "step": 7090 }, { "epoch": 2.391370301702343, "grad_norm": 1.453507374800417, "learning_rate": 5.595180140988632e-06, "loss": 0.2254, "step": 7095 }, { "epoch": 2.393055789651104, "grad_norm": 1.3710186359545902, "learning_rate": 5.589760272984392e-06, "loss": 0.2351, "step": 7100 }, { "epoch": 2.394741277599865, "grad_norm": 1.754515083381334, "learning_rate": 5.584339702144152e-06, "loss": 0.2225, "step": 7105 }, { "epoch": 2.396426765548626, "grad_norm": 1.199318449021297, "learning_rate": 5.57891843492777e-06, "loss": 0.2079, "step": 7110 }, { "epoch": 2.3981122534973873, "grad_norm": 1.4269637236991999, "learning_rate": 5.573496477795951e-06, "loss": 0.2234, "step": 7115 }, { "epoch": 2.3997977414461484, "grad_norm": 1.1534253539542598, "learning_rate": 5.568073837210207e-06, "loss": 0.2386, "step": 7120 }, { "epoch": 2.40148322939491, "grad_norm": 1.355162400125902, "learning_rate": 5.562650519632873e-06, "loss": 0.2412, "step": 7125 }, { "epoch": 2.403168717343671, "grad_norm": 1.1517182571316948, "learning_rate": 5.557226531527088e-06, "loss": 0.2236, "step": 7130 }, { "epoch": 2.404854205292432, "grad_norm": 1.1364581590979208, "learning_rate": 5.551801879356789e-06, "loss": 0.2621, "step": 7135 }, { "epoch": 2.4065396932411933, "grad_norm": 1.1700374556331525, "learning_rate": 5.546376569586709e-06, "loss": 0.2538, "step": 7140 }, { "epoch": 2.4082251811899544, "grad_norm": 1.3523151286119313, "learning_rate": 5.540950608682359e-06, "loss": 0.2344, "step": 7145 }, { "epoch": 2.4099106691387155, "grad_norm": 1.1825564225868845, "learning_rate": 5.535524003110031e-06, "loss": 0.2382, "step": 7150 }, { "epoch": 2.4115961570874767, "grad_norm": 1.0797433745032434, "learning_rate": 5.530096759336779e-06, "loss": 0.2199, "step": 7155 }, { "epoch": 2.413281645036238, "grad_norm": 1.0143081902605937, "learning_rate": 5.5246688838304266e-06, "loss": 0.2247, "step": 7160 }, { "epoch": 2.4149671329849993, "grad_norm": 1.1118061266983945, "learning_rate": 5.519240383059537e-06, "loss": 0.2544, "step": 7165 }, { "epoch": 2.4166526209337604, "grad_norm": 1.2286450157516224, "learning_rate": 5.513811263493436e-06, "loss": 0.2397, "step": 7170 }, { "epoch": 2.4183381088825215, "grad_norm": 1.4927797502510405, "learning_rate": 5.508381531602171e-06, "loss": 0.2329, "step": 7175 }, { "epoch": 2.4200235968312827, "grad_norm": 1.1002295912499036, "learning_rate": 5.502951193856527e-06, "loss": 0.2374, "step": 7180 }, { "epoch": 2.4217090847800438, "grad_norm": 1.576560002392886, "learning_rate": 5.49752025672801e-06, "loss": 0.2239, "step": 7185 }, { "epoch": 2.423394572728805, "grad_norm": 1.1424670041991118, "learning_rate": 5.49208872668884e-06, "loss": 0.2029, "step": 7190 }, { "epoch": 2.425080060677566, "grad_norm": 1.2484868685933979, "learning_rate": 5.486656610211943e-06, "loss": 0.2285, "step": 7195 }, { "epoch": 2.426765548626327, "grad_norm": 1.1959207944488555, "learning_rate": 5.4812239137709465e-06, "loss": 0.2173, "step": 7200 }, { "epoch": 2.4284510365750887, "grad_norm": 1.3496514473746175, "learning_rate": 5.475790643840162e-06, "loss": 0.2303, "step": 7205 }, { "epoch": 2.4301365245238498, "grad_norm": 1.197822436196079, "learning_rate": 5.470356806894596e-06, "loss": 0.234, "step": 7210 }, { "epoch": 2.431822012472611, "grad_norm": 1.3377703610214131, "learning_rate": 5.464922409409918e-06, "loss": 0.1965, "step": 7215 }, { "epoch": 2.433507500421372, "grad_norm": 1.1773728278895432, "learning_rate": 5.459487457862473e-06, "loss": 0.2185, "step": 7220 }, { "epoch": 2.435192988370133, "grad_norm": 1.3169882505936392, "learning_rate": 5.454051958729269e-06, "loss": 0.2292, "step": 7225 }, { "epoch": 2.436878476318894, "grad_norm": 1.6848622821363404, "learning_rate": 5.44861591848796e-06, "loss": 0.2107, "step": 7230 }, { "epoch": 2.4385639642676553, "grad_norm": 1.1165133222451564, "learning_rate": 5.443179343616846e-06, "loss": 0.2379, "step": 7235 }, { "epoch": 2.440249452216417, "grad_norm": 1.1185248465942157, "learning_rate": 5.437742240594866e-06, "loss": 0.2064, "step": 7240 }, { "epoch": 2.441934940165178, "grad_norm": 1.1370278202403525, "learning_rate": 5.4323046159015895e-06, "loss": 0.2097, "step": 7245 }, { "epoch": 2.443620428113939, "grad_norm": 1.374722614390174, "learning_rate": 5.426866476017205e-06, "loss": 0.2306, "step": 7250 }, { "epoch": 2.4453059160627, "grad_norm": 1.132417679631243, "learning_rate": 5.421427827422517e-06, "loss": 0.2525, "step": 7255 }, { "epoch": 2.4469914040114613, "grad_norm": 1.141997524164393, "learning_rate": 5.415988676598933e-06, "loss": 0.2321, "step": 7260 }, { "epoch": 2.4486768919602224, "grad_norm": 1.017487900565632, "learning_rate": 5.410549030028463e-06, "loss": 0.2219, "step": 7265 }, { "epoch": 2.4503623799089835, "grad_norm": 1.1477724859913059, "learning_rate": 5.405108894193709e-06, "loss": 0.2266, "step": 7270 }, { "epoch": 2.4520478678577446, "grad_norm": 1.2500636663936444, "learning_rate": 5.399668275577849e-06, "loss": 0.2186, "step": 7275 }, { "epoch": 2.4537333558065058, "grad_norm": 1.250040587105613, "learning_rate": 5.39422718066464e-06, "loss": 0.2474, "step": 7280 }, { "epoch": 2.4554188437552673, "grad_norm": 1.1703132501739941, "learning_rate": 5.3887856159384125e-06, "loss": 0.2048, "step": 7285 }, { "epoch": 2.4571043317040284, "grad_norm": 1.2197533074398348, "learning_rate": 5.383343587884047e-06, "loss": 0.2323, "step": 7290 }, { "epoch": 2.4587898196527895, "grad_norm": 1.4973472049059546, "learning_rate": 5.377901102986982e-06, "loss": 0.2341, "step": 7295 }, { "epoch": 2.4604753076015506, "grad_norm": 1.1198616931261667, "learning_rate": 5.372458167733199e-06, "loss": 0.2332, "step": 7300 }, { "epoch": 2.4621607955503118, "grad_norm": 1.209762979996002, "learning_rate": 5.367014788609217e-06, "loss": 0.2009, "step": 7305 }, { "epoch": 2.463846283499073, "grad_norm": 2.565199374352355, "learning_rate": 5.361570972102083e-06, "loss": 0.2158, "step": 7310 }, { "epoch": 2.465531771447834, "grad_norm": 1.2584617419031974, "learning_rate": 5.356126724699366e-06, "loss": 0.218, "step": 7315 }, { "epoch": 2.4672172593965955, "grad_norm": 19.181266839822687, "learning_rate": 5.3506820528891466e-06, "loss": 0.2413, "step": 7320 }, { "epoch": 2.4689027473453566, "grad_norm": 1.266880506918883, "learning_rate": 5.345236963160017e-06, "loss": 0.2145, "step": 7325 }, { "epoch": 2.4705882352941178, "grad_norm": 1.5613710090495556, "learning_rate": 5.339791462001056e-06, "loss": 0.2164, "step": 7330 }, { "epoch": 2.472273723242879, "grad_norm": 1.3876495140019696, "learning_rate": 5.334345555901845e-06, "loss": 0.2321, "step": 7335 }, { "epoch": 2.47395921119164, "grad_norm": 1.195579075161368, "learning_rate": 5.328899251352443e-06, "loss": 0.2234, "step": 7340 }, { "epoch": 2.475644699140401, "grad_norm": 1.21658518877199, "learning_rate": 5.323452554843383e-06, "loss": 0.2334, "step": 7345 }, { "epoch": 2.477330187089162, "grad_norm": 1.341412480385651, "learning_rate": 5.3180054728656635e-06, "loss": 0.2103, "step": 7350 }, { "epoch": 2.4790156750379233, "grad_norm": 2.3581330846404054, "learning_rate": 5.312558011910747e-06, "loss": 0.2441, "step": 7355 }, { "epoch": 2.4807011629866844, "grad_norm": 1.281528543535254, "learning_rate": 5.3071101784705444e-06, "loss": 0.2495, "step": 7360 }, { "epoch": 2.482386650935446, "grad_norm": 1.2361244252686825, "learning_rate": 5.301661979037412e-06, "loss": 0.2333, "step": 7365 }, { "epoch": 2.484072138884207, "grad_norm": 1.19514127211079, "learning_rate": 5.296213420104141e-06, "loss": 0.2239, "step": 7370 }, { "epoch": 2.485757626832968, "grad_norm": 1.365035732612306, "learning_rate": 5.290764508163953e-06, "loss": 0.2272, "step": 7375 }, { "epoch": 2.4874431147817293, "grad_norm": 1.8123799483990297, "learning_rate": 5.285315249710488e-06, "loss": 0.2451, "step": 7380 }, { "epoch": 2.4891286027304904, "grad_norm": 1.1712964598933269, "learning_rate": 5.279865651237801e-06, "loss": 0.226, "step": 7385 }, { "epoch": 2.4908140906792515, "grad_norm": 1.3165948441055455, "learning_rate": 5.274415719240349e-06, "loss": 0.2172, "step": 7390 }, { "epoch": 2.4924995786280126, "grad_norm": 1.1848531526625858, "learning_rate": 5.268965460212989e-06, "loss": 0.2164, "step": 7395 }, { "epoch": 2.494185066576774, "grad_norm": 1.3803426336906315, "learning_rate": 5.26351488065097e-06, "loss": 0.2293, "step": 7400 }, { "epoch": 2.4958705545255353, "grad_norm": 1.1146173443586165, "learning_rate": 5.258063987049919e-06, "loss": 0.215, "step": 7405 }, { "epoch": 2.4975560424742964, "grad_norm": 1.3421795249765416, "learning_rate": 5.252612785905836e-06, "loss": 0.2215, "step": 7410 }, { "epoch": 2.4992415304230575, "grad_norm": 1.3343865702009974, "learning_rate": 5.247161283715093e-06, "loss": 0.2, "step": 7415 }, { "epoch": 2.5009270183718186, "grad_norm": 1.2680822744156943, "learning_rate": 5.241709486974419e-06, "loss": 0.2448, "step": 7420 }, { "epoch": 2.5026125063205797, "grad_norm": 1.2153448944194911, "learning_rate": 5.23625740218089e-06, "loss": 0.1976, "step": 7425 }, { "epoch": 2.504297994269341, "grad_norm": 1.4596828307055203, "learning_rate": 5.230805035831928e-06, "loss": 0.2121, "step": 7430 }, { "epoch": 2.5059834822181024, "grad_norm": 1.7781454267742418, "learning_rate": 5.225352394425293e-06, "loss": 0.2253, "step": 7435 }, { "epoch": 2.507668970166863, "grad_norm": 1.2962141597729508, "learning_rate": 5.2198994844590716e-06, "loss": 0.2392, "step": 7440 }, { "epoch": 2.5093544581156246, "grad_norm": 1.3549211808148687, "learning_rate": 5.214446312431664e-06, "loss": 0.2143, "step": 7445 }, { "epoch": 2.5110399460643857, "grad_norm": 1.1777314657407805, "learning_rate": 5.208992884841794e-06, "loss": 0.2036, "step": 7450 }, { "epoch": 2.512725434013147, "grad_norm": 1.1126328029588632, "learning_rate": 5.203539208188479e-06, "loss": 0.2215, "step": 7455 }, { "epoch": 2.514410921961908, "grad_norm": 1.2350579187532946, "learning_rate": 5.198085288971043e-06, "loss": 0.2393, "step": 7460 }, { "epoch": 2.516096409910669, "grad_norm": 1.1473437156684776, "learning_rate": 5.19263113368909e-06, "loss": 0.2234, "step": 7465 }, { "epoch": 2.51778189785943, "grad_norm": 1.2605124960412208, "learning_rate": 5.187176748842514e-06, "loss": 0.2118, "step": 7470 }, { "epoch": 2.5194673858081913, "grad_norm": 1.1996583064453163, "learning_rate": 5.1817221409314755e-06, "loss": 0.2229, "step": 7475 }, { "epoch": 2.521152873756953, "grad_norm": 1.4972450500593595, "learning_rate": 5.176267316456404e-06, "loss": 0.2315, "step": 7480 }, { "epoch": 2.522838361705714, "grad_norm": 1.2625856422283825, "learning_rate": 5.170812281917985e-06, "loss": 0.212, "step": 7485 }, { "epoch": 2.524523849654475, "grad_norm": 1.0757073846492482, "learning_rate": 5.16535704381716e-06, "loss": 0.2255, "step": 7490 }, { "epoch": 2.526209337603236, "grad_norm": 1.3284596906628738, "learning_rate": 5.159901608655105e-06, "loss": 0.2002, "step": 7495 }, { "epoch": 2.5278948255519973, "grad_norm": 1.197013365424821, "learning_rate": 5.154445982933238e-06, "loss": 0.2159, "step": 7500 }, { "epoch": 2.5295803135007584, "grad_norm": 1.1471487167123913, "learning_rate": 5.148990173153198e-06, "loss": 0.2442, "step": 7505 }, { "epoch": 2.5312658014495195, "grad_norm": 1.1133548990083035, "learning_rate": 5.1435341858168496e-06, "loss": 0.1944, "step": 7510 }, { "epoch": 2.532951289398281, "grad_norm": 2.8631588077757204, "learning_rate": 5.138078027426263e-06, "loss": 0.2119, "step": 7515 }, { "epoch": 2.5346367773470417, "grad_norm": 1.2632368897607371, "learning_rate": 5.132621704483718e-06, "loss": 0.2332, "step": 7520 }, { "epoch": 2.5363222652958033, "grad_norm": 1.2282471728320135, "learning_rate": 5.127165223491684e-06, "loss": 0.2447, "step": 7525 }, { "epoch": 2.5380077532445644, "grad_norm": 1.237329542255208, "learning_rate": 5.121708590952826e-06, "loss": 0.1999, "step": 7530 }, { "epoch": 2.5396932411933255, "grad_norm": 1.3866702828113873, "learning_rate": 5.116251813369982e-06, "loss": 0.2331, "step": 7535 }, { "epoch": 2.5413787291420866, "grad_norm": 1.221353394456828, "learning_rate": 5.1107948972461705e-06, "loss": 0.2275, "step": 7540 }, { "epoch": 2.5430642170908477, "grad_norm": 1.2602921223294226, "learning_rate": 5.10533784908457e-06, "loss": 0.2355, "step": 7545 }, { "epoch": 2.544749705039609, "grad_norm": 1.400793972666376, "learning_rate": 5.099880675388516e-06, "loss": 0.2108, "step": 7550 }, { "epoch": 2.54643519298837, "grad_norm": 1.3124538737996967, "learning_rate": 5.094423382661496e-06, "loss": 0.2109, "step": 7555 }, { "epoch": 2.5481206809371315, "grad_norm": 1.270861270880195, "learning_rate": 5.0889659774071396e-06, "loss": 0.2184, "step": 7560 }, { "epoch": 2.5498061688858926, "grad_norm": 1.1534545573853237, "learning_rate": 5.08350846612921e-06, "loss": 0.1968, "step": 7565 }, { "epoch": 2.5514916568346537, "grad_norm": 1.1686319456141216, "learning_rate": 5.078050855331595e-06, "loss": 0.2319, "step": 7570 }, { "epoch": 2.553177144783415, "grad_norm": 1.1816525144174514, "learning_rate": 5.0725931515183035e-06, "loss": 0.2312, "step": 7575 }, { "epoch": 2.554862632732176, "grad_norm": 1.2888614690099474, "learning_rate": 5.0671353611934505e-06, "loss": 0.2166, "step": 7580 }, { "epoch": 2.556548120680937, "grad_norm": 1.4569746507152874, "learning_rate": 5.061677490861263e-06, "loss": 0.2158, "step": 7585 }, { "epoch": 2.558233608629698, "grad_norm": 1.5332163875387035, "learning_rate": 5.056219547026055e-06, "loss": 0.2155, "step": 7590 }, { "epoch": 2.5599190965784597, "grad_norm": 1.1573902495188044, "learning_rate": 5.050761536192231e-06, "loss": 0.2012, "step": 7595 }, { "epoch": 2.5616045845272204, "grad_norm": 1.0696710718777616, "learning_rate": 5.0453034648642765e-06, "loss": 0.2019, "step": 7600 }, { "epoch": 2.563290072475982, "grad_norm": 1.9820669090703988, "learning_rate": 5.039845339546749e-06, "loss": 0.2183, "step": 7605 }, { "epoch": 2.564975560424743, "grad_norm": 1.161627647696652, "learning_rate": 5.034387166744266e-06, "loss": 0.2155, "step": 7610 }, { "epoch": 2.566661048373504, "grad_norm": 1.216790349189924, "learning_rate": 5.028928952961507e-06, "loss": 0.2159, "step": 7615 }, { "epoch": 2.5683465363222653, "grad_norm": 1.074176202440405, "learning_rate": 5.023470704703198e-06, "loss": 0.237, "step": 7620 }, { "epoch": 2.5700320242710264, "grad_norm": 1.1430658238276077, "learning_rate": 5.018012428474108e-06, "loss": 0.206, "step": 7625 }, { "epoch": 2.5717175122197875, "grad_norm": 3.035099123795803, "learning_rate": 5.012554130779035e-06, "loss": 0.2132, "step": 7630 }, { "epoch": 2.5734030001685486, "grad_norm": 1.164783026358899, "learning_rate": 5.007095818122807e-06, "loss": 0.196, "step": 7635 }, { "epoch": 2.57508848811731, "grad_norm": 1.2985221587075237, "learning_rate": 5.001637497010267e-06, "loss": 0.2204, "step": 7640 }, { "epoch": 2.5767739760660713, "grad_norm": 1.3168407527973596, "learning_rate": 4.996179173946271e-06, "loss": 0.2184, "step": 7645 }, { "epoch": 2.5784594640148324, "grad_norm": 1.2553672938566072, "learning_rate": 4.990720855435673e-06, "loss": 0.2124, "step": 7650 }, { "epoch": 2.5801449519635935, "grad_norm": 1.5232095565932093, "learning_rate": 4.9852625479833275e-06, "loss": 0.2119, "step": 7655 }, { "epoch": 2.5818304399123546, "grad_norm": 1.3738872841862542, "learning_rate": 4.97980425809407e-06, "loss": 0.2234, "step": 7660 }, { "epoch": 2.5835159278611157, "grad_norm": 1.193187891987461, "learning_rate": 4.974345992272718e-06, "loss": 0.2179, "step": 7665 }, { "epoch": 2.585201415809877, "grad_norm": 1.2178922042183746, "learning_rate": 4.9688877570240595e-06, "loss": 0.2166, "step": 7670 }, { "epoch": 2.5868869037586384, "grad_norm": 1.195714331610979, "learning_rate": 4.9634295588528475e-06, "loss": 0.2087, "step": 7675 }, { "epoch": 2.588572391707399, "grad_norm": 1.5491557664737279, "learning_rate": 4.957971404263787e-06, "loss": 0.2112, "step": 7680 }, { "epoch": 2.5902578796561606, "grad_norm": 1.4409633952231076, "learning_rate": 4.952513299761536e-06, "loss": 0.2252, "step": 7685 }, { "epoch": 2.5919433676049217, "grad_norm": 1.1824290929772407, "learning_rate": 4.947055251850692e-06, "loss": 0.2267, "step": 7690 }, { "epoch": 2.593628855553683, "grad_norm": 2.65766670226504, "learning_rate": 4.94159726703578e-06, "loss": 0.2183, "step": 7695 }, { "epoch": 2.595314343502444, "grad_norm": 1.330761056252785, "learning_rate": 4.936139351821257e-06, "loss": 0.2231, "step": 7700 }, { "epoch": 2.596999831451205, "grad_norm": 1.2768987847764608, "learning_rate": 4.930681512711491e-06, "loss": 0.2047, "step": 7705 }, { "epoch": 2.598685319399966, "grad_norm": 2.5788407095683668, "learning_rate": 4.925223756210762e-06, "loss": 0.2023, "step": 7710 }, { "epoch": 2.6003708073487273, "grad_norm": 1.6971820075392683, "learning_rate": 4.919766088823253e-06, "loss": 0.2338, "step": 7715 }, { "epoch": 2.602056295297489, "grad_norm": 1.2076456903760784, "learning_rate": 4.914308517053036e-06, "loss": 0.2109, "step": 7720 }, { "epoch": 2.60374178324625, "grad_norm": 1.3327065551630404, "learning_rate": 4.908851047404076e-06, "loss": 0.2316, "step": 7725 }, { "epoch": 2.605427271195011, "grad_norm": 1.2027785275920007, "learning_rate": 4.903393686380212e-06, "loss": 0.2025, "step": 7730 }, { "epoch": 2.607112759143772, "grad_norm": 1.4071669291229922, "learning_rate": 4.89793644048515e-06, "loss": 0.2089, "step": 7735 }, { "epoch": 2.6087982470925333, "grad_norm": 1.8369380712878773, "learning_rate": 4.892479316222467e-06, "loss": 0.2097, "step": 7740 }, { "epoch": 2.6104837350412944, "grad_norm": 1.243199599731519, "learning_rate": 4.88702232009559e-06, "loss": 0.237, "step": 7745 }, { "epoch": 2.6121692229900555, "grad_norm": 1.2769876086429437, "learning_rate": 4.881565458607793e-06, "loss": 0.2367, "step": 7750 }, { "epoch": 2.613854710938817, "grad_norm": 1.909431590495729, "learning_rate": 4.876108738262189e-06, "loss": 0.2087, "step": 7755 }, { "epoch": 2.6155401988875777, "grad_norm": 1.2717307641002418, "learning_rate": 4.870652165561731e-06, "loss": 0.2147, "step": 7760 }, { "epoch": 2.6172256868363393, "grad_norm": 1.137811095272054, "learning_rate": 4.865195747009183e-06, "loss": 0.2175, "step": 7765 }, { "epoch": 2.6189111747851004, "grad_norm": 1.2006060819624018, "learning_rate": 4.859739489107137e-06, "loss": 0.2096, "step": 7770 }, { "epoch": 2.6205966627338615, "grad_norm": 1.3185521440511523, "learning_rate": 4.854283398357983e-06, "loss": 0.2061, "step": 7775 }, { "epoch": 2.6222821506826226, "grad_norm": 1.1883514424084403, "learning_rate": 4.848827481263922e-06, "loss": 0.1983, "step": 7780 }, { "epoch": 2.6239676386313837, "grad_norm": 1.445064829296447, "learning_rate": 4.84337174432694e-06, "loss": 0.2381, "step": 7785 }, { "epoch": 2.625653126580145, "grad_norm": 1.164969110462535, "learning_rate": 4.837916194048814e-06, "loss": 0.1991, "step": 7790 }, { "epoch": 2.627338614528906, "grad_norm": 1.0917276156235423, "learning_rate": 4.832460836931093e-06, "loss": 0.2152, "step": 7795 }, { "epoch": 2.6290241024776675, "grad_norm": 1.1716749826219557, "learning_rate": 4.827005679475101e-06, "loss": 0.2064, "step": 7800 }, { "epoch": 2.6307095904264286, "grad_norm": 1.1835409077085433, "learning_rate": 4.821550728181924e-06, "loss": 0.2053, "step": 7805 }, { "epoch": 2.6323950783751897, "grad_norm": 1.0672841722354087, "learning_rate": 4.816095989552397e-06, "loss": 0.2069, "step": 7810 }, { "epoch": 2.634080566323951, "grad_norm": 1.1189104397633212, "learning_rate": 4.8106414700871055e-06, "loss": 0.2262, "step": 7815 }, { "epoch": 2.635766054272712, "grad_norm": 1.4845855781818307, "learning_rate": 4.805187176286375e-06, "loss": 0.2088, "step": 7820 }, { "epoch": 2.637451542221473, "grad_norm": 1.887706656080745, "learning_rate": 4.799733114650258e-06, "loss": 0.2265, "step": 7825 }, { "epoch": 2.639137030170234, "grad_norm": 1.4732247431555492, "learning_rate": 4.794279291678532e-06, "loss": 0.2106, "step": 7830 }, { "epoch": 2.6408225181189957, "grad_norm": 1.0853331771280081, "learning_rate": 4.788825713870694e-06, "loss": 0.2184, "step": 7835 }, { "epoch": 2.6425080060677564, "grad_norm": 1.2963549201991373, "learning_rate": 4.783372387725943e-06, "loss": 0.2398, "step": 7840 }, { "epoch": 2.644193494016518, "grad_norm": 1.0496304810572075, "learning_rate": 4.777919319743182e-06, "loss": 0.2107, "step": 7845 }, { "epoch": 2.645878981965279, "grad_norm": 1.2044158354126433, "learning_rate": 4.772466516421003e-06, "loss": 0.2124, "step": 7850 }, { "epoch": 2.64756446991404, "grad_norm": 1.2202219357417485, "learning_rate": 4.767013984257687e-06, "loss": 0.206, "step": 7855 }, { "epoch": 2.6492499578628013, "grad_norm": 1.2914769386909546, "learning_rate": 4.761561729751186e-06, "loss": 0.1762, "step": 7860 }, { "epoch": 2.6509354458115624, "grad_norm": 1.2146084373160662, "learning_rate": 4.756109759399127e-06, "loss": 0.2104, "step": 7865 }, { "epoch": 2.6526209337603235, "grad_norm": 1.469151091659385, "learning_rate": 4.750658079698793e-06, "loss": 0.2354, "step": 7870 }, { "epoch": 2.6543064217090846, "grad_norm": 1.2237814189742386, "learning_rate": 4.745206697147129e-06, "loss": 0.197, "step": 7875 }, { "epoch": 2.655991909657846, "grad_norm": 1.334011718394007, "learning_rate": 4.739755618240714e-06, "loss": 0.2027, "step": 7880 }, { "epoch": 2.6576773976066073, "grad_norm": 1.3450770693251022, "learning_rate": 4.7343048494757765e-06, "loss": 0.2219, "step": 7885 }, { "epoch": 2.6593628855553684, "grad_norm": 1.1713996219825322, "learning_rate": 4.728854397348166e-06, "loss": 0.1955, "step": 7890 }, { "epoch": 2.6610483735041295, "grad_norm": 1.473575977059867, "learning_rate": 4.723404268353363e-06, "loss": 0.2272, "step": 7895 }, { "epoch": 2.6627338614528906, "grad_norm": 1.239481438038185, "learning_rate": 4.717954468986456e-06, "loss": 0.2136, "step": 7900 }, { "epoch": 2.6644193494016517, "grad_norm": 1.2073338033281267, "learning_rate": 4.712505005742143e-06, "loss": 0.2061, "step": 7905 }, { "epoch": 2.666104837350413, "grad_norm": 1.3312178764641676, "learning_rate": 4.707055885114725e-06, "loss": 0.237, "step": 7910 }, { "epoch": 2.6677903252991744, "grad_norm": 1.897086284655153, "learning_rate": 4.7016071135980915e-06, "loss": 0.1846, "step": 7915 }, { "epoch": 2.669475813247935, "grad_norm": 1.2203593846498346, "learning_rate": 4.696158697685713e-06, "loss": 0.1925, "step": 7920 }, { "epoch": 2.6711613011966966, "grad_norm": 1.0575694355259007, "learning_rate": 4.690710643870643e-06, "loss": 0.1937, "step": 7925 }, { "epoch": 2.6728467891454577, "grad_norm": 1.1964370422235926, "learning_rate": 4.685262958645497e-06, "loss": 0.2243, "step": 7930 }, { "epoch": 2.674532277094219, "grad_norm": 1.3247158824635015, "learning_rate": 4.679815648502455e-06, "loss": 0.2049, "step": 7935 }, { "epoch": 2.67621776504298, "grad_norm": 1.417846174872059, "learning_rate": 4.67436871993325e-06, "loss": 0.2225, "step": 7940 }, { "epoch": 2.677903252991741, "grad_norm": 1.4640995421446863, "learning_rate": 4.668922179429156e-06, "loss": 0.224, "step": 7945 }, { "epoch": 2.679588740940502, "grad_norm": 1.1232583515450298, "learning_rate": 4.6634760334809945e-06, "loss": 0.2091, "step": 7950 }, { "epoch": 2.6812742288892633, "grad_norm": 1.2961341608661843, "learning_rate": 4.658030288579104e-06, "loss": 0.2191, "step": 7955 }, { "epoch": 2.682959716838025, "grad_norm": 1.128248479176604, "learning_rate": 4.652584951213354e-06, "loss": 0.2122, "step": 7960 }, { "epoch": 2.684645204786786, "grad_norm": 1.2685530371234115, "learning_rate": 4.6471400278731245e-06, "loss": 0.2486, "step": 7965 }, { "epoch": 2.686330692735547, "grad_norm": 1.2482213439540817, "learning_rate": 4.641695525047305e-06, "loss": 0.1816, "step": 7970 }, { "epoch": 2.688016180684308, "grad_norm": 1.1985374443223176, "learning_rate": 4.63625144922428e-06, "loss": 0.1891, "step": 7975 }, { "epoch": 2.6897016686330693, "grad_norm": 1.2409915480778366, "learning_rate": 4.630807806891927e-06, "loss": 0.1994, "step": 7980 }, { "epoch": 2.6913871565818304, "grad_norm": 1.299970662285356, "learning_rate": 4.625364604537607e-06, "loss": 0.2142, "step": 7985 }, { "epoch": 2.6930726445305915, "grad_norm": 1.2035482765561063, "learning_rate": 4.619921848648161e-06, "loss": 0.1858, "step": 7990 }, { "epoch": 2.694758132479353, "grad_norm": 1.0906507678198634, "learning_rate": 4.6144795457098876e-06, "loss": 0.2031, "step": 7995 }, { "epoch": 2.6964436204281137, "grad_norm": 1.2642173617342594, "learning_rate": 4.609037702208556e-06, "loss": 0.2018, "step": 8000 }, { "epoch": 2.6981291083768753, "grad_norm": 1.2095766816767846, "learning_rate": 4.60359632462938e-06, "loss": 0.2139, "step": 8005 }, { "epoch": 2.6998145963256364, "grad_norm": 1.2460924525904324, "learning_rate": 4.5981554194570256e-06, "loss": 0.2017, "step": 8010 }, { "epoch": 2.7015000842743975, "grad_norm": 1.1991951728812023, "learning_rate": 4.592714993175588e-06, "loss": 0.2038, "step": 8015 }, { "epoch": 2.7031855722231586, "grad_norm": 1.1542325631291357, "learning_rate": 4.587275052268596e-06, "loss": 0.2162, "step": 8020 }, { "epoch": 2.7048710601719197, "grad_norm": 1.3591661714420404, "learning_rate": 4.581835603219002e-06, "loss": 0.2031, "step": 8025 }, { "epoch": 2.706556548120681, "grad_norm": 1.2550882397550795, "learning_rate": 4.57639665250917e-06, "loss": 0.2173, "step": 8030 }, { "epoch": 2.708242036069442, "grad_norm": 1.2079248483458591, "learning_rate": 4.570958206620868e-06, "loss": 0.2142, "step": 8035 }, { "epoch": 2.7099275240182035, "grad_norm": 1.0562215002063613, "learning_rate": 4.565520272035265e-06, "loss": 0.1825, "step": 8040 }, { "epoch": 2.7116130119669646, "grad_norm": 1.0560766090254274, "learning_rate": 4.560082855232919e-06, "loss": 0.1951, "step": 8045 }, { "epoch": 2.7132984999157257, "grad_norm": 1.2036721170342266, "learning_rate": 4.554645962693773e-06, "loss": 0.2162, "step": 8050 }, { "epoch": 2.714983987864487, "grad_norm": 1.2550065545273137, "learning_rate": 4.549209600897142e-06, "loss": 0.2054, "step": 8055 }, { "epoch": 2.716669475813248, "grad_norm": 1.256922529684213, "learning_rate": 4.5437737763217135e-06, "loss": 0.2103, "step": 8060 }, { "epoch": 2.718354963762009, "grad_norm": 1.298482568255476, "learning_rate": 4.538338495445531e-06, "loss": 0.2225, "step": 8065 }, { "epoch": 2.72004045171077, "grad_norm": 1.208277249486905, "learning_rate": 4.532903764745991e-06, "loss": 0.1922, "step": 8070 }, { "epoch": 2.7217259396595317, "grad_norm": 1.287360635630622, "learning_rate": 4.5274695906998325e-06, "loss": 0.2132, "step": 8075 }, { "epoch": 2.7234114276082924, "grad_norm": 1.508020618879755, "learning_rate": 4.522035979783136e-06, "loss": 0.214, "step": 8080 }, { "epoch": 2.725096915557054, "grad_norm": 1.294481237052173, "learning_rate": 4.516602938471305e-06, "loss": 0.2101, "step": 8085 }, { "epoch": 2.726782403505815, "grad_norm": 1.2667584687823812, "learning_rate": 4.511170473239069e-06, "loss": 0.2015, "step": 8090 }, { "epoch": 2.728467891454576, "grad_norm": 1.232871861263674, "learning_rate": 4.505738590560466e-06, "loss": 0.2095, "step": 8095 }, { "epoch": 2.7301533794033372, "grad_norm": 0.9720525051487791, "learning_rate": 4.500307296908845e-06, "loss": 0.1847, "step": 8100 }, { "epoch": 2.7318388673520984, "grad_norm": 1.2182120125781937, "learning_rate": 4.494876598756852e-06, "loss": 0.1968, "step": 8105 }, { "epoch": 2.7335243553008595, "grad_norm": 1.2811058412221115, "learning_rate": 4.4894465025764196e-06, "loss": 0.1936, "step": 8110 }, { "epoch": 2.7352098432496206, "grad_norm": 1.203975144696989, "learning_rate": 4.484017014838767e-06, "loss": 0.2234, "step": 8115 }, { "epoch": 2.736895331198382, "grad_norm": 1.114820195816211, "learning_rate": 4.478588142014385e-06, "loss": 0.2011, "step": 8120 }, { "epoch": 2.7385808191471432, "grad_norm": 3.9301821811838704, "learning_rate": 4.473159890573034e-06, "loss": 0.22, "step": 8125 }, { "epoch": 2.7402663070959044, "grad_norm": 1.1651231107384559, "learning_rate": 4.4677322669837334e-06, "loss": 0.1952, "step": 8130 }, { "epoch": 2.7419517950446655, "grad_norm": 1.2026231387193682, "learning_rate": 4.462305277714756e-06, "loss": 0.2023, "step": 8135 }, { "epoch": 2.7436372829934266, "grad_norm": 1.7641807551102524, "learning_rate": 4.456878929233614e-06, "loss": 0.224, "step": 8140 }, { "epoch": 2.7453227709421877, "grad_norm": 1.3383941911155575, "learning_rate": 4.451453228007061e-06, "loss": 0.1947, "step": 8145 }, { "epoch": 2.747008258890949, "grad_norm": 1.2695203975579834, "learning_rate": 4.4460281805010755e-06, "loss": 0.2028, "step": 8150 }, { "epoch": 2.7486937468397104, "grad_norm": 1.4003249747645194, "learning_rate": 4.44060379318086e-06, "loss": 0.192, "step": 8155 }, { "epoch": 2.750379234788471, "grad_norm": 1.368960866876165, "learning_rate": 4.435180072510827e-06, "loss": 0.2332, "step": 8160 }, { "epoch": 2.7520647227372326, "grad_norm": 1.2414367283303092, "learning_rate": 4.429757024954599e-06, "loss": 0.2099, "step": 8165 }, { "epoch": 2.7537502106859937, "grad_norm": 1.2826947437153777, "learning_rate": 4.424334656974987e-06, "loss": 0.2271, "step": 8170 }, { "epoch": 2.755435698634755, "grad_norm": 1.1016378366259734, "learning_rate": 4.418912975034008e-06, "loss": 0.222, "step": 8175 }, { "epoch": 2.757121186583516, "grad_norm": 1.1405163621409955, "learning_rate": 4.413491985592846e-06, "loss": 0.1995, "step": 8180 }, { "epoch": 2.758806674532277, "grad_norm": 1.2888868278377614, "learning_rate": 4.408071695111868e-06, "loss": 0.2033, "step": 8185 }, { "epoch": 2.760492162481038, "grad_norm": 1.1411197132080275, "learning_rate": 4.402652110050605e-06, "loss": 0.199, "step": 8190 }, { "epoch": 2.7621776504297992, "grad_norm": 1.1580417718173754, "learning_rate": 4.3972332368677496e-06, "loss": 0.2121, "step": 8195 }, { "epoch": 2.763863138378561, "grad_norm": 1.1365678268528452, "learning_rate": 4.391815082021142e-06, "loss": 0.2036, "step": 8200 }, { "epoch": 2.765548626327322, "grad_norm": 1.4277318865557, "learning_rate": 4.3863976519677725e-06, "loss": 0.2077, "step": 8205 }, { "epoch": 2.767234114276083, "grad_norm": 1.1670065742226998, "learning_rate": 4.38098095316376e-06, "loss": 0.1999, "step": 8210 }, { "epoch": 2.768919602224844, "grad_norm": 1.1747133532537581, "learning_rate": 4.375564992064359e-06, "loss": 0.1808, "step": 8215 }, { "epoch": 2.7706050901736052, "grad_norm": 1.331536656284568, "learning_rate": 4.370149775123942e-06, "loss": 0.2272, "step": 8220 }, { "epoch": 2.7722905781223663, "grad_norm": 1.2305343169614085, "learning_rate": 4.364735308795995e-06, "loss": 0.2035, "step": 8225 }, { "epoch": 2.7739760660711275, "grad_norm": 1.2877016204789493, "learning_rate": 4.3593215995331065e-06, "loss": 0.1835, "step": 8230 }, { "epoch": 2.775661554019889, "grad_norm": 1.2191011990272045, "learning_rate": 4.353908653786968e-06, "loss": 0.2051, "step": 8235 }, { "epoch": 2.7773470419686497, "grad_norm": 1.2161324917089447, "learning_rate": 4.348496478008357e-06, "loss": 0.216, "step": 8240 }, { "epoch": 2.7790325299174112, "grad_norm": 1.0883390283573073, "learning_rate": 4.343085078647133e-06, "loss": 0.2135, "step": 8245 }, { "epoch": 2.7807180178661723, "grad_norm": 1.1927476554034464, "learning_rate": 4.337674462152236e-06, "loss": 0.2062, "step": 8250 }, { "epoch": 2.7824035058149335, "grad_norm": 1.3148578636143122, "learning_rate": 4.332264634971668e-06, "loss": 0.2079, "step": 8255 }, { "epoch": 2.7840889937636946, "grad_norm": 1.4712378506522028, "learning_rate": 4.326855603552491e-06, "loss": 0.1958, "step": 8260 }, { "epoch": 2.7857744817124557, "grad_norm": 1.4577330610313746, "learning_rate": 4.321447374340817e-06, "loss": 0.2019, "step": 8265 }, { "epoch": 2.787459969661217, "grad_norm": 1.096441776769599, "learning_rate": 4.316039953781809e-06, "loss": 0.1837, "step": 8270 }, { "epoch": 2.789145457609978, "grad_norm": 1.5145781993547742, "learning_rate": 4.310633348319655e-06, "loss": 0.1957, "step": 8275 }, { "epoch": 2.7908309455587395, "grad_norm": 1.2267818699569348, "learning_rate": 4.305227564397583e-06, "loss": 0.2132, "step": 8280 }, { "epoch": 2.7925164335075006, "grad_norm": 1.2254078190906166, "learning_rate": 4.299822608457835e-06, "loss": 0.204, "step": 8285 }, { "epoch": 2.7942019214562617, "grad_norm": 1.3473040118593569, "learning_rate": 4.29441848694167e-06, "loss": 0.197, "step": 8290 }, { "epoch": 2.795887409405023, "grad_norm": 1.3844537744939152, "learning_rate": 4.2890152062893475e-06, "loss": 0.2204, "step": 8295 }, { "epoch": 2.797572897353784, "grad_norm": 1.1641308923110405, "learning_rate": 4.283612772940132e-06, "loss": 0.2297, "step": 8300 }, { "epoch": 2.799258385302545, "grad_norm": 1.2817293621333818, "learning_rate": 4.278211193332273e-06, "loss": 0.2029, "step": 8305 }, { "epoch": 2.800943873251306, "grad_norm": 1.2037204289050882, "learning_rate": 4.272810473903003e-06, "loss": 0.1948, "step": 8310 }, { "epoch": 2.8026293612000677, "grad_norm": 1.4330754159672232, "learning_rate": 4.2674106210885305e-06, "loss": 0.2328, "step": 8315 }, { "epoch": 2.8043148491488283, "grad_norm": 1.2967773614836344, "learning_rate": 4.262011641324032e-06, "loss": 0.1938, "step": 8320 }, { "epoch": 2.80600033709759, "grad_norm": 1.3772603790369828, "learning_rate": 4.2566135410436425e-06, "loss": 0.1697, "step": 8325 }, { "epoch": 2.807685825046351, "grad_norm": 1.3138260462934688, "learning_rate": 4.2512163266804506e-06, "loss": 0.226, "step": 8330 }, { "epoch": 2.809371312995112, "grad_norm": 1.1462076660661356, "learning_rate": 4.245820004666486e-06, "loss": 0.1875, "step": 8335 }, { "epoch": 2.8110568009438732, "grad_norm": 1.2362333917746737, "learning_rate": 4.240424581432718e-06, "loss": 0.1796, "step": 8340 }, { "epoch": 2.8127422888926343, "grad_norm": 1.1409090711262735, "learning_rate": 4.235030063409041e-06, "loss": 0.1809, "step": 8345 }, { "epoch": 2.8144277768413954, "grad_norm": 1.3813250458920163, "learning_rate": 4.229636457024276e-06, "loss": 0.1788, "step": 8350 }, { "epoch": 2.8161132647901566, "grad_norm": 1.1293414456316424, "learning_rate": 4.224243768706151e-06, "loss": 0.2059, "step": 8355 }, { "epoch": 2.817798752738918, "grad_norm": 1.8168562582897307, "learning_rate": 4.218852004881305e-06, "loss": 0.1933, "step": 8360 }, { "epoch": 2.819484240687679, "grad_norm": 1.1886959699440642, "learning_rate": 4.213461171975277e-06, "loss": 0.2039, "step": 8365 }, { "epoch": 2.8211697286364403, "grad_norm": 1.3212647878701373, "learning_rate": 4.20807127641249e-06, "loss": 0.2083, "step": 8370 }, { "epoch": 2.8228552165852014, "grad_norm": 1.140767409050604, "learning_rate": 4.202682324616253e-06, "loss": 0.2046, "step": 8375 }, { "epoch": 2.8245407045339626, "grad_norm": 1.1568938021812396, "learning_rate": 4.1972943230087535e-06, "loss": 0.1874, "step": 8380 }, { "epoch": 2.8262261924827237, "grad_norm": 1.188451679119585, "learning_rate": 4.19190727801104e-06, "loss": 0.2167, "step": 8385 }, { "epoch": 2.8279116804314848, "grad_norm": 1.273013068407849, "learning_rate": 4.186521196043028e-06, "loss": 0.2013, "step": 8390 }, { "epoch": 2.8295971683802463, "grad_norm": 1.2511228585869532, "learning_rate": 4.18113608352348e-06, "loss": 0.2021, "step": 8395 }, { "epoch": 2.831282656329007, "grad_norm": 1.2595578190181103, "learning_rate": 4.175751946870005e-06, "loss": 0.1958, "step": 8400 }, { "epoch": 2.8329681442777686, "grad_norm": 1.2433040402270128, "learning_rate": 4.1703687924990525e-06, "loss": 0.2099, "step": 8405 }, { "epoch": 2.8346536322265297, "grad_norm": 1.110021214593105, "learning_rate": 4.164986626825894e-06, "loss": 0.2025, "step": 8410 }, { "epoch": 2.8363391201752908, "grad_norm": 1.064850840174882, "learning_rate": 4.1596054562646294e-06, "loss": 0.1933, "step": 8415 }, { "epoch": 2.838024608124052, "grad_norm": 1.4229645174562606, "learning_rate": 4.154225287228169e-06, "loss": 0.1959, "step": 8420 }, { "epoch": 2.839710096072813, "grad_norm": 1.2249032864371672, "learning_rate": 4.148846126128232e-06, "loss": 0.2047, "step": 8425 }, { "epoch": 2.841395584021574, "grad_norm": 1.1301944668252912, "learning_rate": 4.143467979375332e-06, "loss": 0.1996, "step": 8430 }, { "epoch": 2.843081071970335, "grad_norm": 1.2013755673175146, "learning_rate": 4.1380908533787796e-06, "loss": 0.2134, "step": 8435 }, { "epoch": 2.8447665599190968, "grad_norm": 1.2307863801871006, "learning_rate": 4.132714754546666e-06, "loss": 0.2027, "step": 8440 }, { "epoch": 2.846452047867858, "grad_norm": 1.4757993042778519, "learning_rate": 4.127339689285859e-06, "loss": 0.1973, "step": 8445 }, { "epoch": 2.848137535816619, "grad_norm": 1.2041741783160391, "learning_rate": 4.121965664001993e-06, "loss": 0.2024, "step": 8450 }, { "epoch": 2.84982302376538, "grad_norm": 1.6283152510947259, "learning_rate": 4.116592685099464e-06, "loss": 0.2068, "step": 8455 }, { "epoch": 2.851508511714141, "grad_norm": 1.339424989314122, "learning_rate": 4.111220758981422e-06, "loss": 0.2173, "step": 8460 }, { "epoch": 2.8531939996629023, "grad_norm": 1.227117541980443, "learning_rate": 4.105849892049762e-06, "loss": 0.2135, "step": 8465 }, { "epoch": 2.8548794876116634, "grad_norm": 1.1500834859952311, "learning_rate": 4.100480090705114e-06, "loss": 0.2234, "step": 8470 }, { "epoch": 2.856564975560425, "grad_norm": 1.2714701266672594, "learning_rate": 4.095111361346842e-06, "loss": 0.1971, "step": 8475 }, { "epoch": 2.8582504635091857, "grad_norm": 1.1309754101918799, "learning_rate": 4.089743710373031e-06, "loss": 0.1961, "step": 8480 }, { "epoch": 2.859935951457947, "grad_norm": 1.2029084595667585, "learning_rate": 4.084377144180483e-06, "loss": 0.2034, "step": 8485 }, { "epoch": 2.8616214394067083, "grad_norm": 1.3267100492980242, "learning_rate": 4.0790116691647e-06, "loss": 0.2068, "step": 8490 }, { "epoch": 2.8633069273554694, "grad_norm": 1.0543341592946291, "learning_rate": 4.0736472917198924e-06, "loss": 0.1918, "step": 8495 }, { "epoch": 2.8649924153042305, "grad_norm": 1.3968526262026753, "learning_rate": 4.068284018238957e-06, "loss": 0.2082, "step": 8500 }, { "epoch": 2.8666779032529917, "grad_norm": 1.2233220424688536, "learning_rate": 4.062921855113478e-06, "loss": 0.1928, "step": 8505 }, { "epoch": 2.8683633912017528, "grad_norm": 1.21374727836665, "learning_rate": 4.057560808733712e-06, "loss": 0.1785, "step": 8510 }, { "epoch": 2.870048879150514, "grad_norm": 1.2407628049003339, "learning_rate": 4.052200885488591e-06, "loss": 0.1953, "step": 8515 }, { "epoch": 2.8717343670992754, "grad_norm": 1.1712166484264521, "learning_rate": 4.046842091765706e-06, "loss": 0.1898, "step": 8520 }, { "epoch": 2.8734198550480365, "grad_norm": 1.351947766530135, "learning_rate": 4.041484433951299e-06, "loss": 0.227, "step": 8525 }, { "epoch": 2.8751053429967977, "grad_norm": 0.9961877131524264, "learning_rate": 4.036127918430262e-06, "loss": 0.19, "step": 8530 }, { "epoch": 2.8767908309455588, "grad_norm": 1.677230218101421, "learning_rate": 4.030772551586123e-06, "loss": 0.2115, "step": 8535 }, { "epoch": 2.87847631889432, "grad_norm": 1.2414444268399183, "learning_rate": 4.025418339801042e-06, "loss": 0.2211, "step": 8540 }, { "epoch": 2.880161806843081, "grad_norm": 1.1178112896661947, "learning_rate": 4.020065289455803e-06, "loss": 0.1955, "step": 8545 }, { "epoch": 2.881847294791842, "grad_norm": 1.6767137087933748, "learning_rate": 4.01471340692981e-06, "loss": 0.191, "step": 8550 }, { "epoch": 2.8835327827406037, "grad_norm": 1.2925204021902665, "learning_rate": 4.009362698601065e-06, "loss": 0.1837, "step": 8555 }, { "epoch": 2.8852182706893643, "grad_norm": 1.2507662452920694, "learning_rate": 4.00401317084618e-06, "loss": 0.1778, "step": 8560 }, { "epoch": 2.886903758638126, "grad_norm": 1.2101226725952177, "learning_rate": 3.998664830040355e-06, "loss": 0.1967, "step": 8565 }, { "epoch": 2.888589246586887, "grad_norm": 1.2380819962739664, "learning_rate": 3.99331768255738e-06, "loss": 0.1751, "step": 8570 }, { "epoch": 2.890274734535648, "grad_norm": 1.2024190079036536, "learning_rate": 3.987971734769615e-06, "loss": 0.1933, "step": 8575 }, { "epoch": 2.891960222484409, "grad_norm": 1.1166676847965284, "learning_rate": 3.982626993048001e-06, "loss": 0.1691, "step": 8580 }, { "epoch": 2.8936457104331703, "grad_norm": 1.2434254675121241, "learning_rate": 3.97728346376203e-06, "loss": 0.1866, "step": 8585 }, { "epoch": 2.8953311983819314, "grad_norm": 1.269478442274771, "learning_rate": 3.971941153279761e-06, "loss": 0.195, "step": 8590 }, { "epoch": 2.8970166863306925, "grad_norm": 1.217748685649073, "learning_rate": 3.9666000679677925e-06, "loss": 0.1996, "step": 8595 }, { "epoch": 2.898702174279454, "grad_norm": 1.2592626629594095, "learning_rate": 3.961260214191265e-06, "loss": 0.187, "step": 8600 }, { "epoch": 2.900387662228215, "grad_norm": 1.3085125127147617, "learning_rate": 3.9559215983138514e-06, "loss": 0.1975, "step": 8605 }, { "epoch": 2.9020731501769763, "grad_norm": 1.1199222258238601, "learning_rate": 3.950584226697749e-06, "loss": 0.1868, "step": 8610 }, { "epoch": 2.9037586381257374, "grad_norm": 1.4918499279087545, "learning_rate": 3.945248105703672e-06, "loss": 0.2301, "step": 8615 }, { "epoch": 2.9054441260744985, "grad_norm": 1.2430268090108927, "learning_rate": 3.939913241690846e-06, "loss": 0.1937, "step": 8620 }, { "epoch": 2.9071296140232596, "grad_norm": 1.1641142631885228, "learning_rate": 3.934579641016999e-06, "loss": 0.2004, "step": 8625 }, { "epoch": 2.9088151019720208, "grad_norm": 1.299321455172275, "learning_rate": 3.929247310038348e-06, "loss": 0.1977, "step": 8630 }, { "epoch": 2.9105005899207823, "grad_norm": 1.1702703265750152, "learning_rate": 3.9239162551096035e-06, "loss": 0.1912, "step": 8635 }, { "epoch": 2.912186077869543, "grad_norm": 1.3354950999957933, "learning_rate": 3.918586482583954e-06, "loss": 0.2121, "step": 8640 }, { "epoch": 2.9138715658183045, "grad_norm": 1.3093054335852983, "learning_rate": 3.913257998813055e-06, "loss": 0.1822, "step": 8645 }, { "epoch": 2.9155570537670656, "grad_norm": 1.057291195670878, "learning_rate": 3.9079308101470306e-06, "loss": 0.19, "step": 8650 }, { "epoch": 2.9172425417158268, "grad_norm": 1.4219857727997094, "learning_rate": 3.902604922934461e-06, "loss": 0.1821, "step": 8655 }, { "epoch": 2.918928029664588, "grad_norm": 1.209282061453467, "learning_rate": 3.897280343522372e-06, "loss": 0.1832, "step": 8660 }, { "epoch": 2.920613517613349, "grad_norm": 1.3582862854870226, "learning_rate": 3.891957078256239e-06, "loss": 0.1915, "step": 8665 }, { "epoch": 2.92229900556211, "grad_norm": 1.3441310517298874, "learning_rate": 3.88663513347996e-06, "loss": 0.1877, "step": 8670 }, { "epoch": 2.923984493510871, "grad_norm": 1.2848823431870975, "learning_rate": 3.881314515535871e-06, "loss": 0.2063, "step": 8675 }, { "epoch": 2.9256699814596328, "grad_norm": 1.231905261181969, "learning_rate": 3.875995230764715e-06, "loss": 0.1862, "step": 8680 }, { "epoch": 2.927355469408394, "grad_norm": 1.1868346313076998, "learning_rate": 3.870677285505657e-06, "loss": 0.1835, "step": 8685 }, { "epoch": 2.929040957357155, "grad_norm": 1.194436929729291, "learning_rate": 3.865360686096258e-06, "loss": 0.1728, "step": 8690 }, { "epoch": 2.930726445305916, "grad_norm": 1.3207718082413187, "learning_rate": 3.860045438872477e-06, "loss": 0.2142, "step": 8695 }, { "epoch": 2.932411933254677, "grad_norm": 1.3018343404256352, "learning_rate": 3.854731550168666e-06, "loss": 0.1856, "step": 8700 }, { "epoch": 2.9340974212034383, "grad_norm": 1.2405065418914671, "learning_rate": 3.8494190263175545e-06, "loss": 0.1875, "step": 8705 }, { "epoch": 2.9357829091521994, "grad_norm": 1.1474049715556274, "learning_rate": 3.844107873650242e-06, "loss": 0.1846, "step": 8710 }, { "epoch": 2.937468397100961, "grad_norm": 1.3848845959800287, "learning_rate": 3.838798098496201e-06, "loss": 0.1787, "step": 8715 }, { "epoch": 2.9391538850497216, "grad_norm": 1.2664715246312455, "learning_rate": 3.833489707183256e-06, "loss": 0.185, "step": 8720 }, { "epoch": 2.940839372998483, "grad_norm": 1.6243977252485549, "learning_rate": 3.828182706037588e-06, "loss": 0.198, "step": 8725 }, { "epoch": 2.9425248609472443, "grad_norm": 1.5130037637558595, "learning_rate": 3.8228771013837145e-06, "loss": 0.1918, "step": 8730 }, { "epoch": 2.9442103488960054, "grad_norm": 1.2889800657327404, "learning_rate": 3.817572899544494e-06, "loss": 0.1905, "step": 8735 }, { "epoch": 2.9458958368447665, "grad_norm": 1.3477371490792038, "learning_rate": 3.812270106841114e-06, "loss": 0.1774, "step": 8740 }, { "epoch": 2.9475813247935276, "grad_norm": 1.2295097696184567, "learning_rate": 3.8069687295930803e-06, "loss": 0.1796, "step": 8745 }, { "epoch": 2.9492668127422887, "grad_norm": 1.0474065290922339, "learning_rate": 3.80166877411821e-06, "loss": 0.1669, "step": 8750 }, { "epoch": 2.95095230069105, "grad_norm": 1.2037522765837032, "learning_rate": 3.79637024673263e-06, "loss": 0.2066, "step": 8755 }, { "epoch": 2.9526377886398114, "grad_norm": 1.2733518171976481, "learning_rate": 3.7910731537507616e-06, "loss": 0.1829, "step": 8760 }, { "epoch": 2.9543232765885725, "grad_norm": 1.1867266001542542, "learning_rate": 3.78577750148532e-06, "loss": 0.1667, "step": 8765 }, { "epoch": 2.9560087645373336, "grad_norm": 1.436934798961873, "learning_rate": 3.7804832962472985e-06, "loss": 0.173, "step": 8770 }, { "epoch": 2.9576942524860947, "grad_norm": 1.2156057391485697, "learning_rate": 3.775190544345973e-06, "loss": 0.2066, "step": 8775 }, { "epoch": 2.959379740434856, "grad_norm": 1.3481180264188397, "learning_rate": 3.7698992520888833e-06, "loss": 0.1987, "step": 8780 }, { "epoch": 2.961065228383617, "grad_norm": 1.119308881281576, "learning_rate": 3.7646094257818276e-06, "loss": 0.2078, "step": 8785 }, { "epoch": 2.962750716332378, "grad_norm": 1.2703548308382717, "learning_rate": 3.759321071728861e-06, "loss": 0.2038, "step": 8790 }, { "epoch": 2.9644362042811396, "grad_norm": 1.1091967158693012, "learning_rate": 3.7540341962322835e-06, "loss": 0.1812, "step": 8795 }, { "epoch": 2.9661216922299003, "grad_norm": 1.3040108337849374, "learning_rate": 3.74874880559263e-06, "loss": 0.2036, "step": 8800 }, { "epoch": 2.967807180178662, "grad_norm": 1.8342470499943189, "learning_rate": 3.7434649061086703e-06, "loss": 0.1728, "step": 8805 }, { "epoch": 2.969492668127423, "grad_norm": 1.327575741254867, "learning_rate": 3.738182504077392e-06, "loss": 0.1913, "step": 8810 }, { "epoch": 2.971178156076184, "grad_norm": 1.2447382092558106, "learning_rate": 3.732901605794003e-06, "loss": 0.1884, "step": 8815 }, { "epoch": 2.972863644024945, "grad_norm": 1.1298239317913796, "learning_rate": 3.727622217551918e-06, "loss": 0.1901, "step": 8820 }, { "epoch": 2.9745491319737063, "grad_norm": 0.9757532167636964, "learning_rate": 3.7223443456427482e-06, "loss": 0.1554, "step": 8825 }, { "epoch": 2.9762346199224674, "grad_norm": 1.2485801386614734, "learning_rate": 3.717067996356304e-06, "loss": 0.1849, "step": 8830 }, { "epoch": 2.9779201078712285, "grad_norm": 1.1862094486162922, "learning_rate": 3.711793175980576e-06, "loss": 0.1902, "step": 8835 }, { "epoch": 2.97960559581999, "grad_norm": 1.4458438812909886, "learning_rate": 3.7065198908017353e-06, "loss": 0.1956, "step": 8840 }, { "epoch": 2.981291083768751, "grad_norm": 1.2143320584081898, "learning_rate": 3.7012481471041208e-06, "loss": 0.1917, "step": 8845 }, { "epoch": 2.9829765717175123, "grad_norm": 1.1529784288846208, "learning_rate": 3.695977951170241e-06, "loss": 0.1837, "step": 8850 }, { "epoch": 2.9846620596662734, "grad_norm": 1.2195289935083835, "learning_rate": 3.6907093092807515e-06, "loss": 0.1765, "step": 8855 }, { "epoch": 2.9863475476150345, "grad_norm": 1.5852245782966363, "learning_rate": 3.685442227714463e-06, "loss": 0.1963, "step": 8860 }, { "epoch": 2.9880330355637956, "grad_norm": 1.2592376341505946, "learning_rate": 3.6801767127483207e-06, "loss": 0.2108, "step": 8865 }, { "epoch": 2.9897185235125567, "grad_norm": 1.2329518408245288, "learning_rate": 3.6749127706574074e-06, "loss": 0.1821, "step": 8870 }, { "epoch": 2.9914040114613183, "grad_norm": 1.1149355513725558, "learning_rate": 3.669650407714928e-06, "loss": 0.152, "step": 8875 }, { "epoch": 2.993089499410079, "grad_norm": 1.2436052599352312, "learning_rate": 3.664389630192209e-06, "loss": 0.1968, "step": 8880 }, { "epoch": 2.9947749873588405, "grad_norm": 1.2545273870907585, "learning_rate": 3.6591304443586826e-06, "loss": 0.192, "step": 8885 }, { "epoch": 2.9964604753076016, "grad_norm": 1.2005253337127357, "learning_rate": 3.6538728564818903e-06, "loss": 0.1879, "step": 8890 }, { "epoch": 2.9981459632563627, "grad_norm": 1.3896115577203842, "learning_rate": 3.6486168728274655e-06, "loss": 0.1753, "step": 8895 }, { "epoch": 2.999831451205124, "grad_norm": 1.2651520950003203, "learning_rate": 3.643362499659131e-06, "loss": 0.1824, "step": 8900 }, { "epoch": 3.001348390359009, "grad_norm": 1.0207205459257762, "learning_rate": 3.638109743238688e-06, "loss": 0.1554, "step": 8905 }, { "epoch": 3.00303387830777, "grad_norm": 1.2038156734208045, "learning_rate": 3.6328586098260143e-06, "loss": 0.1645, "step": 8910 }, { "epoch": 3.0047193662565315, "grad_norm": 1.08229609164426, "learning_rate": 3.6276091056790507e-06, "loss": 0.1794, "step": 8915 }, { "epoch": 3.0064048542052926, "grad_norm": 1.2777016525802405, "learning_rate": 3.6223612370537965e-06, "loss": 0.1575, "step": 8920 }, { "epoch": 3.0080903421540537, "grad_norm": 1.3440606799362391, "learning_rate": 3.6171150102043074e-06, "loss": 0.1899, "step": 8925 }, { "epoch": 3.009775830102815, "grad_norm": 1.341990062641125, "learning_rate": 3.6118704313826735e-06, "loss": 0.1791, "step": 8930 }, { "epoch": 3.011461318051576, "grad_norm": 1.150587983663152, "learning_rate": 3.60662750683903e-06, "loss": 0.1747, "step": 8935 }, { "epoch": 3.013146806000337, "grad_norm": 1.2060600777552093, "learning_rate": 3.601386242821532e-06, "loss": 0.1795, "step": 8940 }, { "epoch": 3.014832293949098, "grad_norm": 1.2085366981686378, "learning_rate": 3.5961466455763617e-06, "loss": 0.1782, "step": 8945 }, { "epoch": 3.0165177818978592, "grad_norm": 1.374761290347725, "learning_rate": 3.5909087213477134e-06, "loss": 0.1756, "step": 8950 }, { "epoch": 3.018203269846621, "grad_norm": 1.38360173008327, "learning_rate": 3.5856724763777868e-06, "loss": 0.1525, "step": 8955 }, { "epoch": 3.019888757795382, "grad_norm": 1.3231287738293254, "learning_rate": 3.58043791690678e-06, "loss": 0.173, "step": 8960 }, { "epoch": 3.021574245744143, "grad_norm": 1.2541823368229426, "learning_rate": 3.5752050491728863e-06, "loss": 0.1679, "step": 8965 }, { "epoch": 3.023259733692904, "grad_norm": 1.1878348160244694, "learning_rate": 3.5699738794122775e-06, "loss": 0.169, "step": 8970 }, { "epoch": 3.0249452216416652, "grad_norm": 1.283960050232123, "learning_rate": 3.5647444138591057e-06, "loss": 0.1765, "step": 8975 }, { "epoch": 3.0266307095904263, "grad_norm": 1.4704810541047644, "learning_rate": 3.5595166587454894e-06, "loss": 0.1865, "step": 8980 }, { "epoch": 3.0283161975391875, "grad_norm": 1.7976473900655823, "learning_rate": 3.5542906203015114e-06, "loss": 0.1714, "step": 8985 }, { "epoch": 3.0300016854879486, "grad_norm": 1.1686552991371704, "learning_rate": 3.5490663047552055e-06, "loss": 0.1563, "step": 8990 }, { "epoch": 3.03168717343671, "grad_norm": 1.124015045321442, "learning_rate": 3.5438437183325543e-06, "loss": 0.1593, "step": 8995 }, { "epoch": 3.0333726613854712, "grad_norm": 1.2155243699304927, "learning_rate": 3.5386228672574806e-06, "loss": 0.165, "step": 9000 }, { "epoch": 3.0350581493342323, "grad_norm": 1.143240939695264, "learning_rate": 3.53340375775184e-06, "loss": 0.1347, "step": 9005 }, { "epoch": 3.0367436372829935, "grad_norm": 1.4439635724868005, "learning_rate": 3.528186396035407e-06, "loss": 0.1729, "step": 9010 }, { "epoch": 3.0384291252317546, "grad_norm": 1.1233288549598444, "learning_rate": 3.522970788325879e-06, "loss": 0.1641, "step": 9015 }, { "epoch": 3.0401146131805157, "grad_norm": 1.2295149688056763, "learning_rate": 3.517756940838859e-06, "loss": 0.1804, "step": 9020 }, { "epoch": 3.041800101129277, "grad_norm": 1.464718560477114, "learning_rate": 3.5125448597878563e-06, "loss": 0.173, "step": 9025 }, { "epoch": 3.043485589078038, "grad_norm": 1.2826674637909234, "learning_rate": 3.5073345513842717e-06, "loss": 0.1721, "step": 9030 }, { "epoch": 3.0451710770267995, "grad_norm": 1.2067772736183733, "learning_rate": 3.5021260218373943e-06, "loss": 0.1674, "step": 9035 }, { "epoch": 3.0468565649755606, "grad_norm": 1.2387239718781364, "learning_rate": 3.4969192773543968e-06, "loss": 0.1375, "step": 9040 }, { "epoch": 3.0485420529243217, "grad_norm": 1.3775821528176602, "learning_rate": 3.4917143241403185e-06, "loss": 0.1546, "step": 9045 }, { "epoch": 3.050227540873083, "grad_norm": 1.3526829775589913, "learning_rate": 3.486511168398068e-06, "loss": 0.1629, "step": 9050 }, { "epoch": 3.051913028821844, "grad_norm": 1.3103433087396437, "learning_rate": 3.481309816328412e-06, "loss": 0.155, "step": 9055 }, { "epoch": 3.053598516770605, "grad_norm": 1.6886999726326517, "learning_rate": 3.4761102741299648e-06, "loss": 0.1448, "step": 9060 }, { "epoch": 3.055284004719366, "grad_norm": 1.222040977445654, "learning_rate": 3.4709125479991867e-06, "loss": 0.1438, "step": 9065 }, { "epoch": 3.0569694926681272, "grad_norm": 1.4398985720510666, "learning_rate": 3.4657166441303706e-06, "loss": 0.1554, "step": 9070 }, { "epoch": 3.058654980616889, "grad_norm": 1.6404960464778278, "learning_rate": 3.4605225687156423e-06, "loss": 0.1512, "step": 9075 }, { "epoch": 3.06034046856565, "grad_norm": 1.117768762825777, "learning_rate": 3.4553303279449463e-06, "loss": 0.1789, "step": 9080 }, { "epoch": 3.062025956514411, "grad_norm": 1.1939243780201165, "learning_rate": 3.4501399280060383e-06, "loss": 0.1503, "step": 9085 }, { "epoch": 3.063711444463172, "grad_norm": 1.1687677704075992, "learning_rate": 3.4449513750844843e-06, "loss": 0.17, "step": 9090 }, { "epoch": 3.0653969324119332, "grad_norm": 1.1975196012625673, "learning_rate": 3.4397646753636447e-06, "loss": 0.1627, "step": 9095 }, { "epoch": 3.0670824203606943, "grad_norm": 1.1203286258112777, "learning_rate": 3.434579835024676e-06, "loss": 0.1588, "step": 9100 }, { "epoch": 3.0687679083094554, "grad_norm": 1.264716385774105, "learning_rate": 3.4293968602465164e-06, "loss": 0.172, "step": 9105 }, { "epoch": 3.0704533962582166, "grad_norm": 1.3119060577741284, "learning_rate": 3.42421575720588e-06, "loss": 0.167, "step": 9110 }, { "epoch": 3.072138884206978, "grad_norm": 1.178899013183934, "learning_rate": 3.419036532077252e-06, "loss": 0.1643, "step": 9115 }, { "epoch": 3.0738243721557392, "grad_norm": 1.2551607239940359, "learning_rate": 3.4138591910328817e-06, "loss": 0.1641, "step": 9120 }, { "epoch": 3.0755098601045003, "grad_norm": 1.1797211896509134, "learning_rate": 3.4086837402427664e-06, "loss": 0.1802, "step": 9125 }, { "epoch": 3.0771953480532614, "grad_norm": 1.4830834279098966, "learning_rate": 3.4035101858746578e-06, "loss": 0.1721, "step": 9130 }, { "epoch": 3.0788808360020226, "grad_norm": 1.774760111704624, "learning_rate": 3.398338534094042e-06, "loss": 0.1745, "step": 9135 }, { "epoch": 3.0805663239507837, "grad_norm": 1.1693511782575596, "learning_rate": 3.393168791064143e-06, "loss": 0.1685, "step": 9140 }, { "epoch": 3.082251811899545, "grad_norm": 1.0183558877902805, "learning_rate": 3.3880009629459045e-06, "loss": 0.1424, "step": 9145 }, { "epoch": 3.083937299848306, "grad_norm": 1.4007350339055136, "learning_rate": 3.3828350558979924e-06, "loss": 0.1654, "step": 9150 }, { "epoch": 3.0856227877970674, "grad_norm": 1.2276635073812663, "learning_rate": 3.3776710760767822e-06, "loss": 0.175, "step": 9155 }, { "epoch": 3.0873082757458286, "grad_norm": 1.282045574865131, "learning_rate": 3.372509029636353e-06, "loss": 0.1846, "step": 9160 }, { "epoch": 3.0889937636945897, "grad_norm": 1.3442914045267893, "learning_rate": 3.3673489227284773e-06, "loss": 0.1494, "step": 9165 }, { "epoch": 3.0906792516433508, "grad_norm": 1.2678535007672018, "learning_rate": 3.3621907615026196e-06, "loss": 0.1843, "step": 9170 }, { "epoch": 3.092364739592112, "grad_norm": 1.287782306366842, "learning_rate": 3.3570345521059217e-06, "loss": 0.1732, "step": 9175 }, { "epoch": 3.094050227540873, "grad_norm": 1.37481541255487, "learning_rate": 3.3518803006832047e-06, "loss": 0.176, "step": 9180 }, { "epoch": 3.095735715489634, "grad_norm": 1.1787929412344784, "learning_rate": 3.34672801337695e-06, "loss": 0.1532, "step": 9185 }, { "epoch": 3.097421203438395, "grad_norm": 1.1753370420040756, "learning_rate": 3.341577696327304e-06, "loss": 0.1802, "step": 9190 }, { "epoch": 3.0991066913871568, "grad_norm": 1.3630837478662825, "learning_rate": 3.336429355672063e-06, "loss": 0.1819, "step": 9195 }, { "epoch": 3.100792179335918, "grad_norm": 1.361996292551038, "learning_rate": 3.331282997546666e-06, "loss": 0.1705, "step": 9200 }, { "epoch": 3.102477667284679, "grad_norm": 1.2526914617444551, "learning_rate": 3.326138628084192e-06, "loss": 0.1551, "step": 9205 }, { "epoch": 3.10416315523344, "grad_norm": 1.1758149636665614, "learning_rate": 3.3209962534153493e-06, "loss": 0.1287, "step": 9210 }, { "epoch": 3.105848643182201, "grad_norm": 1.415438669706238, "learning_rate": 3.3158558796684683e-06, "loss": 0.1814, "step": 9215 }, { "epoch": 3.1075341311309623, "grad_norm": 1.1432014158085264, "learning_rate": 3.310717512969494e-06, "loss": 0.1467, "step": 9220 }, { "epoch": 3.1092196190797234, "grad_norm": 1.1892849474522582, "learning_rate": 3.305581159441984e-06, "loss": 0.1772, "step": 9225 }, { "epoch": 3.1109051070284846, "grad_norm": 1.1475388545749203, "learning_rate": 3.3004468252070908e-06, "loss": 0.1656, "step": 9230 }, { "epoch": 3.112590594977246, "grad_norm": 1.2756817502779538, "learning_rate": 3.2953145163835655e-06, "loss": 0.1813, "step": 9235 }, { "epoch": 3.114276082926007, "grad_norm": 1.7405597519767244, "learning_rate": 3.2901842390877403e-06, "loss": 0.1392, "step": 9240 }, { "epoch": 3.1159615708747683, "grad_norm": 1.3434266543562863, "learning_rate": 3.285055999433531e-06, "loss": 0.1688, "step": 9245 }, { "epoch": 3.1176470588235294, "grad_norm": 1.2182793869948876, "learning_rate": 3.2799298035324224e-06, "loss": 0.1576, "step": 9250 }, { "epoch": 3.1193325467722905, "grad_norm": 1.30723048266272, "learning_rate": 3.274805657493466e-06, "loss": 0.1762, "step": 9255 }, { "epoch": 3.1210180347210517, "grad_norm": 1.1708361861411545, "learning_rate": 3.2696835674232653e-06, "loss": 0.1626, "step": 9260 }, { "epoch": 3.1227035226698128, "grad_norm": 1.200197195129053, "learning_rate": 3.2645635394259822e-06, "loss": 0.17, "step": 9265 }, { "epoch": 3.124389010618574, "grad_norm": 1.2379102143632188, "learning_rate": 3.2594455796033144e-06, "loss": 0.1664, "step": 9270 }, { "epoch": 3.1260744985673354, "grad_norm": 1.1989971064565192, "learning_rate": 3.2543296940544967e-06, "loss": 0.1853, "step": 9275 }, { "epoch": 3.1277599865160965, "grad_norm": 1.2455747680454687, "learning_rate": 3.2492158888762905e-06, "loss": 0.1616, "step": 9280 }, { "epoch": 3.1294454744648577, "grad_norm": 1.294602311430553, "learning_rate": 3.2441041701629815e-06, "loss": 0.173, "step": 9285 }, { "epoch": 3.1311309624136188, "grad_norm": 1.1918922901924984, "learning_rate": 3.2389945440063662e-06, "loss": 0.1566, "step": 9290 }, { "epoch": 3.13281645036238, "grad_norm": 1.0910472071920585, "learning_rate": 3.233887016495746e-06, "loss": 0.1464, "step": 9295 }, { "epoch": 3.134501938311141, "grad_norm": 1.516106917729951, "learning_rate": 3.2287815937179277e-06, "loss": 0.1717, "step": 9300 }, { "epoch": 3.136187426259902, "grad_norm": 1.3704725428024847, "learning_rate": 3.223678281757202e-06, "loss": 0.1726, "step": 9305 }, { "epoch": 3.137872914208663, "grad_norm": 1.2223479549027039, "learning_rate": 3.2185770866953476e-06, "loss": 0.1608, "step": 9310 }, { "epoch": 3.1395584021574248, "grad_norm": 1.2740998782271278, "learning_rate": 3.2134780146116225e-06, "loss": 0.1697, "step": 9315 }, { "epoch": 3.141243890106186, "grad_norm": 1.2558127468529559, "learning_rate": 3.2083810715827495e-06, "loss": 0.1751, "step": 9320 }, { "epoch": 3.142929378054947, "grad_norm": 1.1766271095842882, "learning_rate": 3.20328626368292e-06, "loss": 0.1597, "step": 9325 }, { "epoch": 3.144614866003708, "grad_norm": 1.3797584416470503, "learning_rate": 3.198193596983775e-06, "loss": 0.1542, "step": 9330 }, { "epoch": 3.146300353952469, "grad_norm": 1.2567222348584293, "learning_rate": 3.1931030775544076e-06, "loss": 0.1632, "step": 9335 }, { "epoch": 3.1479858419012303, "grad_norm": 1.1402089659536931, "learning_rate": 3.1880147114613533e-06, "loss": 0.1538, "step": 9340 }, { "epoch": 3.1496713298499914, "grad_norm": 1.3072830701919846, "learning_rate": 3.1829285047685775e-06, "loss": 0.1691, "step": 9345 }, { "epoch": 3.1513568177987525, "grad_norm": 1.2698855552382342, "learning_rate": 3.1778444635374745e-06, "loss": 0.172, "step": 9350 }, { "epoch": 3.153042305747514, "grad_norm": 1.260046757033209, "learning_rate": 3.172762593826856e-06, "loss": 0.1455, "step": 9355 }, { "epoch": 3.154727793696275, "grad_norm": 1.4774034629075194, "learning_rate": 3.167682901692948e-06, "loss": 0.1837, "step": 9360 }, { "epoch": 3.1564132816450363, "grad_norm": 1.2717571808725205, "learning_rate": 3.162605393189381e-06, "loss": 0.1829, "step": 9365 }, { "epoch": 3.1580987695937974, "grad_norm": 1.2880709192467314, "learning_rate": 3.1575300743671806e-06, "loss": 0.1653, "step": 9370 }, { "epoch": 3.1597842575425585, "grad_norm": 1.4437813142717841, "learning_rate": 3.1524569512747683e-06, "loss": 0.1807, "step": 9375 }, { "epoch": 3.1614697454913196, "grad_norm": 1.3701463733957, "learning_rate": 3.147386029957944e-06, "loss": 0.159, "step": 9380 }, { "epoch": 3.1631552334400808, "grad_norm": 1.324744375297103, "learning_rate": 3.142317316459885e-06, "loss": 0.1747, "step": 9385 }, { "epoch": 3.164840721388842, "grad_norm": 1.3506080024868086, "learning_rate": 3.137250816821139e-06, "loss": 0.1632, "step": 9390 }, { "epoch": 3.1665262093376034, "grad_norm": 1.3139701825343126, "learning_rate": 3.1321865370796122e-06, "loss": 0.1579, "step": 9395 }, { "epoch": 3.1682116972863645, "grad_norm": 1.229312512917331, "learning_rate": 3.12712448327057e-06, "loss": 0.187, "step": 9400 }, { "epoch": 3.1698971852351256, "grad_norm": 1.3010758922423349, "learning_rate": 3.1220646614266193e-06, "loss": 0.1694, "step": 9405 }, { "epoch": 3.1715826731838868, "grad_norm": 1.1240272769230117, "learning_rate": 3.11700707757771e-06, "loss": 0.1795, "step": 9410 }, { "epoch": 3.173268161132648, "grad_norm": 1.307436729770644, "learning_rate": 3.111951737751128e-06, "loss": 0.165, "step": 9415 }, { "epoch": 3.174953649081409, "grad_norm": 1.428189994611883, "learning_rate": 3.106898647971481e-06, "loss": 0.1683, "step": 9420 }, { "epoch": 3.17663913703017, "grad_norm": 1.1725458533946036, "learning_rate": 3.1018478142606944e-06, "loss": 0.1572, "step": 9425 }, { "epoch": 3.178324624978931, "grad_norm": 1.2700921188793206, "learning_rate": 3.096799242638009e-06, "loss": 0.1579, "step": 9430 }, { "epoch": 3.1800101129276928, "grad_norm": 1.2514185126726696, "learning_rate": 3.091752939119966e-06, "loss": 0.1718, "step": 9435 }, { "epoch": 3.181695600876454, "grad_norm": 1.4999203561023051, "learning_rate": 3.0867089097204062e-06, "loss": 0.143, "step": 9440 }, { "epoch": 3.183381088825215, "grad_norm": 1.1807717256720542, "learning_rate": 3.0816671604504567e-06, "loss": 0.1639, "step": 9445 }, { "epoch": 3.185066576773976, "grad_norm": 1.2211558604253152, "learning_rate": 3.0766276973185326e-06, "loss": 0.1778, "step": 9450 }, { "epoch": 3.186752064722737, "grad_norm": 2.377638369244552, "learning_rate": 3.0715905263303226e-06, "loss": 0.1607, "step": 9455 }, { "epoch": 3.1884375526714983, "grad_norm": 1.2117524539834534, "learning_rate": 3.0665556534887807e-06, "loss": 0.1721, "step": 9460 }, { "epoch": 3.1901230406202594, "grad_norm": 1.3975189495173967, "learning_rate": 3.0615230847941244e-06, "loss": 0.1712, "step": 9465 }, { "epoch": 3.1918085285690205, "grad_norm": 1.8109062696988218, "learning_rate": 3.0564928262438276e-06, "loss": 0.1644, "step": 9470 }, { "epoch": 3.193494016517782, "grad_norm": 1.209988593116305, "learning_rate": 3.0514648838326056e-06, "loss": 0.1683, "step": 9475 }, { "epoch": 3.195179504466543, "grad_norm": 1.323539775735678, "learning_rate": 3.046439263552419e-06, "loss": 0.1491, "step": 9480 }, { "epoch": 3.1968649924153043, "grad_norm": 1.2065094647522658, "learning_rate": 3.041415971392457e-06, "loss": 0.149, "step": 9485 }, { "epoch": 3.1985504803640654, "grad_norm": 1.2611655717609178, "learning_rate": 3.0363950133391375e-06, "loss": 0.1513, "step": 9490 }, { "epoch": 3.2002359683128265, "grad_norm": 1.4986209625618618, "learning_rate": 3.0313763953760957e-06, "loss": 0.1456, "step": 9495 }, { "epoch": 3.2019214562615876, "grad_norm": 1.1304760296305825, "learning_rate": 3.0263601234841757e-06, "loss": 0.1571, "step": 9500 }, { "epoch": 3.2036069442103488, "grad_norm": 1.5294498865070563, "learning_rate": 3.0213462036414294e-06, "loss": 0.1714, "step": 9505 }, { "epoch": 3.20529243215911, "grad_norm": 1.353017583083657, "learning_rate": 3.016334641823102e-06, "loss": 0.1717, "step": 9510 }, { "epoch": 3.2069779201078714, "grad_norm": 1.722681173934253, "learning_rate": 3.0113254440016325e-06, "loss": 0.1758, "step": 9515 }, { "epoch": 3.2086634080566325, "grad_norm": 1.211881767224924, "learning_rate": 3.0063186161466384e-06, "loss": 0.1426, "step": 9520 }, { "epoch": 3.2103488960053936, "grad_norm": 1.2367120052202052, "learning_rate": 3.0013141642249183e-06, "loss": 0.1735, "step": 9525 }, { "epoch": 3.2120343839541547, "grad_norm": 1.2769389133752633, "learning_rate": 2.996312094200434e-06, "loss": 0.18, "step": 9530 }, { "epoch": 3.213719871902916, "grad_norm": 1.3879869536741454, "learning_rate": 2.991312412034312e-06, "loss": 0.1462, "step": 9535 }, { "epoch": 3.215405359851677, "grad_norm": 1.2140405718148446, "learning_rate": 2.98631512368483e-06, "loss": 0.1712, "step": 9540 }, { "epoch": 3.217090847800438, "grad_norm": 1.3180230916425395, "learning_rate": 2.9813202351074165e-06, "loss": 0.1577, "step": 9545 }, { "epoch": 3.218776335749199, "grad_norm": 1.3600076963285992, "learning_rate": 2.9763277522546372e-06, "loss": 0.1657, "step": 9550 }, { "epoch": 3.2204618236979607, "grad_norm": 1.4671002267407103, "learning_rate": 2.971337681076194e-06, "loss": 0.1566, "step": 9555 }, { "epoch": 3.222147311646722, "grad_norm": 1.3709612532883515, "learning_rate": 2.9663500275189082e-06, "loss": 0.1449, "step": 9560 }, { "epoch": 3.223832799595483, "grad_norm": 1.4384780637244976, "learning_rate": 2.9613647975267323e-06, "loss": 0.1672, "step": 9565 }, { "epoch": 3.225518287544244, "grad_norm": 1.2647140869055336, "learning_rate": 2.956381997040717e-06, "loss": 0.1497, "step": 9570 }, { "epoch": 3.227203775493005, "grad_norm": 1.2698793575906924, "learning_rate": 2.9514016319990257e-06, "loss": 0.1652, "step": 9575 }, { "epoch": 3.2288892634417663, "grad_norm": 1.3023893804465498, "learning_rate": 2.9464237083369157e-06, "loss": 0.1668, "step": 9580 }, { "epoch": 3.2305747513905274, "grad_norm": 1.1616935191259887, "learning_rate": 2.941448231986739e-06, "loss": 0.1502, "step": 9585 }, { "epoch": 3.2322602393392885, "grad_norm": 1.2302765723083426, "learning_rate": 2.9364752088779247e-06, "loss": 0.1651, "step": 9590 }, { "epoch": 3.23394572728805, "grad_norm": 1.261595018073127, "learning_rate": 2.9315046449369854e-06, "loss": 0.1739, "step": 9595 }, { "epoch": 3.235631215236811, "grad_norm": 1.248918860893241, "learning_rate": 2.926536546087496e-06, "loss": 0.1765, "step": 9600 }, { "epoch": 3.2373167031855723, "grad_norm": 1.2379293176270614, "learning_rate": 2.9215709182501007e-06, "loss": 0.1413, "step": 9605 }, { "epoch": 3.2390021911343334, "grad_norm": 1.0823834025280892, "learning_rate": 2.916607767342494e-06, "loss": 0.1586, "step": 9610 }, { "epoch": 3.2406876790830945, "grad_norm": 1.4161088895383096, "learning_rate": 2.9116470992794223e-06, "loss": 0.1677, "step": 9615 }, { "epoch": 3.2423731670318556, "grad_norm": 1.1029906470930286, "learning_rate": 2.9066889199726685e-06, "loss": 0.1497, "step": 9620 }, { "epoch": 3.2440586549806167, "grad_norm": 1.67222310058739, "learning_rate": 2.90173323533105e-06, "loss": 0.1632, "step": 9625 }, { "epoch": 3.245744142929378, "grad_norm": 1.266332865437193, "learning_rate": 2.8967800512604183e-06, "loss": 0.1538, "step": 9630 }, { "epoch": 3.2474296308781394, "grad_norm": 1.2965984814533966, "learning_rate": 2.8918293736636348e-06, "loss": 0.1493, "step": 9635 }, { "epoch": 3.2491151188269005, "grad_norm": 1.2623155830617245, "learning_rate": 2.886881208440584e-06, "loss": 0.1859, "step": 9640 }, { "epoch": 3.2508006067756616, "grad_norm": 1.3169177658170732, "learning_rate": 2.8819355614881477e-06, "loss": 0.1636, "step": 9645 }, { "epoch": 3.2524860947244227, "grad_norm": 1.3065877659962761, "learning_rate": 2.876992438700209e-06, "loss": 0.1771, "step": 9650 }, { "epoch": 3.254171582673184, "grad_norm": 1.2879065086914747, "learning_rate": 2.8720518459676476e-06, "loss": 0.164, "step": 9655 }, { "epoch": 3.255857070621945, "grad_norm": 1.104712711465004, "learning_rate": 2.867113789178323e-06, "loss": 0.162, "step": 9660 }, { "epoch": 3.257542558570706, "grad_norm": 1.2515341755135794, "learning_rate": 2.862178274217073e-06, "loss": 0.1854, "step": 9665 }, { "epoch": 3.2592280465194676, "grad_norm": 1.2025075722868321, "learning_rate": 2.857245306965706e-06, "loss": 0.1555, "step": 9670 }, { "epoch": 3.2609135344682287, "grad_norm": 1.2352924301964021, "learning_rate": 2.8523148933029963e-06, "loss": 0.1615, "step": 9675 }, { "epoch": 3.26259902241699, "grad_norm": 1.1225049407440537, "learning_rate": 2.847387039104677e-06, "loss": 0.168, "step": 9680 }, { "epoch": 3.264284510365751, "grad_norm": 1.1843493670219003, "learning_rate": 2.842461750243426e-06, "loss": 0.1441, "step": 9685 }, { "epoch": 3.265969998314512, "grad_norm": 1.4223983699398892, "learning_rate": 2.837539032588864e-06, "loss": 0.1533, "step": 9690 }, { "epoch": 3.267655486263273, "grad_norm": 1.3668018963765802, "learning_rate": 2.8326188920075535e-06, "loss": 0.133, "step": 9695 }, { "epoch": 3.2693409742120343, "grad_norm": 1.3732328773508795, "learning_rate": 2.82770133436298e-06, "loss": 0.152, "step": 9700 }, { "epoch": 3.2710264621607954, "grad_norm": 1.3268811184973257, "learning_rate": 2.822786365515552e-06, "loss": 0.1628, "step": 9705 }, { "epoch": 3.2727119501095565, "grad_norm": 1.0306016554222965, "learning_rate": 2.817873991322593e-06, "loss": 0.14, "step": 9710 }, { "epoch": 3.274397438058318, "grad_norm": 1.3115593947766953, "learning_rate": 2.812964217638336e-06, "loss": 0.1605, "step": 9715 }, { "epoch": 3.276082926007079, "grad_norm": 1.3130610428454002, "learning_rate": 2.808057050313916e-06, "loss": 0.1698, "step": 9720 }, { "epoch": 3.2777684139558403, "grad_norm": 1.4356301555549609, "learning_rate": 2.8031524951973577e-06, "loss": 0.1362, "step": 9725 }, { "epoch": 3.2794539019046014, "grad_norm": 1.3386960730337292, "learning_rate": 2.798250558133574e-06, "loss": 0.1517, "step": 9730 }, { "epoch": 3.2811393898533625, "grad_norm": 1.4488180024774386, "learning_rate": 2.7933512449643605e-06, "loss": 0.1728, "step": 9735 }, { "epoch": 3.2828248778021236, "grad_norm": 1.359578650451188, "learning_rate": 2.7884545615283837e-06, "loss": 0.1537, "step": 9740 }, { "epoch": 3.2845103657508847, "grad_norm": 1.1897590570490932, "learning_rate": 2.7835605136611754e-06, "loss": 0.1435, "step": 9745 }, { "epoch": 3.2861958536996463, "grad_norm": 1.2038117092508442, "learning_rate": 2.778669107195126e-06, "loss": 0.163, "step": 9750 }, { "epoch": 3.2878813416484074, "grad_norm": 1.2761016367040738, "learning_rate": 2.7737803479594816e-06, "loss": 0.1829, "step": 9755 }, { "epoch": 3.2895668295971685, "grad_norm": 1.283230762590572, "learning_rate": 2.7688942417803334e-06, "loss": 0.1551, "step": 9760 }, { "epoch": 3.2912523175459296, "grad_norm": 1.159281231366604, "learning_rate": 2.764010794480606e-06, "loss": 0.1644, "step": 9765 }, { "epoch": 3.2929378054946907, "grad_norm": 1.2045208633888451, "learning_rate": 2.759130011880058e-06, "loss": 0.135, "step": 9770 }, { "epoch": 3.294623293443452, "grad_norm": 1.3690116658480913, "learning_rate": 2.7542518997952756e-06, "loss": 0.1499, "step": 9775 }, { "epoch": 3.296308781392213, "grad_norm": 1.234692808086299, "learning_rate": 2.7493764640396577e-06, "loss": 0.1682, "step": 9780 }, { "epoch": 3.297994269340974, "grad_norm": 1.4082026875072793, "learning_rate": 2.744503710423413e-06, "loss": 0.1646, "step": 9785 }, { "epoch": 3.299679757289735, "grad_norm": 1.4087003153021567, "learning_rate": 2.7396336447535617e-06, "loss": 0.1659, "step": 9790 }, { "epoch": 3.3013652452384967, "grad_norm": 1.3565180469630933, "learning_rate": 2.7347662728339095e-06, "loss": 0.1717, "step": 9795 }, { "epoch": 3.303050733187258, "grad_norm": 1.1499016521512857, "learning_rate": 2.729901600465064e-06, "loss": 0.1752, "step": 9800 }, { "epoch": 3.304736221136019, "grad_norm": 1.274611088950639, "learning_rate": 2.7250396334444063e-06, "loss": 0.1732, "step": 9805 }, { "epoch": 3.30642170908478, "grad_norm": 2.2597812181213213, "learning_rate": 2.7201803775660955e-06, "loss": 0.1686, "step": 9810 }, { "epoch": 3.308107197033541, "grad_norm": 1.2555303991856703, "learning_rate": 2.71532383862106e-06, "loss": 0.1699, "step": 9815 }, { "epoch": 3.3097926849823023, "grad_norm": 1.2720036046307266, "learning_rate": 2.710470022396996e-06, "loss": 0.1668, "step": 9820 }, { "epoch": 3.3114781729310634, "grad_norm": 1.2771764919302078, "learning_rate": 2.7056189346783436e-06, "loss": 0.1376, "step": 9825 }, { "epoch": 3.313163660879825, "grad_norm": 1.2195796633785456, "learning_rate": 2.7007705812463036e-06, "loss": 0.1495, "step": 9830 }, { "epoch": 3.314849148828586, "grad_norm": 1.3821972998294634, "learning_rate": 2.695924967878808e-06, "loss": 0.156, "step": 9835 }, { "epoch": 3.316534636777347, "grad_norm": 1.1986652251422123, "learning_rate": 2.6910821003505317e-06, "loss": 0.173, "step": 9840 }, { "epoch": 3.3182201247261083, "grad_norm": 1.2925895879187035, "learning_rate": 2.686241984432871e-06, "loss": 0.1576, "step": 9845 }, { "epoch": 3.3199056126748694, "grad_norm": 1.2402445420674666, "learning_rate": 2.6814046258939463e-06, "loss": 0.1825, "step": 9850 }, { "epoch": 3.3215911006236305, "grad_norm": 1.3769551053540399, "learning_rate": 2.6765700304985876e-06, "loss": 0.1636, "step": 9855 }, { "epoch": 3.3232765885723916, "grad_norm": 1.345576550638648, "learning_rate": 2.6717382040083393e-06, "loss": 0.1552, "step": 9860 }, { "epoch": 3.3249620765211527, "grad_norm": 1.2241688048131634, "learning_rate": 2.666909152181443e-06, "loss": 0.1401, "step": 9865 }, { "epoch": 3.326647564469914, "grad_norm": 1.2490989855449084, "learning_rate": 2.6620828807728304e-06, "loss": 0.172, "step": 9870 }, { "epoch": 3.3283330524186754, "grad_norm": 1.2430478967631053, "learning_rate": 2.65725939553412e-06, "loss": 0.1534, "step": 9875 }, { "epoch": 3.3300185403674365, "grad_norm": 1.140887083180735, "learning_rate": 2.6524387022136177e-06, "loss": 0.1398, "step": 9880 }, { "epoch": 3.3317040283161976, "grad_norm": 1.314994612439792, "learning_rate": 2.6476208065562924e-06, "loss": 0.1635, "step": 9885 }, { "epoch": 3.3333895162649587, "grad_norm": 1.114088503690688, "learning_rate": 2.6428057143037842e-06, "loss": 0.1582, "step": 9890 }, { "epoch": 3.33507500421372, "grad_norm": 1.332360206492176, "learning_rate": 2.637993431194389e-06, "loss": 0.1445, "step": 9895 }, { "epoch": 3.336760492162481, "grad_norm": 1.3748852428537113, "learning_rate": 2.6331839629630584e-06, "loss": 0.1485, "step": 9900 }, { "epoch": 3.338445980111242, "grad_norm": 1.0516598879651196, "learning_rate": 2.6283773153413912e-06, "loss": 0.1748, "step": 9905 }, { "epoch": 3.3401314680600036, "grad_norm": 1.3013265817650992, "learning_rate": 2.6235734940576185e-06, "loss": 0.1648, "step": 9910 }, { "epoch": 3.3418169560087647, "grad_norm": 1.1930122703681216, "learning_rate": 2.6187725048366064e-06, "loss": 0.152, "step": 9915 }, { "epoch": 3.343502443957526, "grad_norm": 1.016190066299787, "learning_rate": 2.613974353399845e-06, "loss": 0.1178, "step": 9920 }, { "epoch": 3.345187931906287, "grad_norm": 1.343488871051259, "learning_rate": 2.6091790454654463e-06, "loss": 0.1697, "step": 9925 }, { "epoch": 3.346873419855048, "grad_norm": 1.2718864711303226, "learning_rate": 2.604386586748129e-06, "loss": 0.1578, "step": 9930 }, { "epoch": 3.348558907803809, "grad_norm": 1.3144675193353847, "learning_rate": 2.599596982959216e-06, "loss": 0.1483, "step": 9935 }, { "epoch": 3.3502443957525703, "grad_norm": 1.2868730060576092, "learning_rate": 2.594810239806632e-06, "loss": 0.1495, "step": 9940 }, { "epoch": 3.3519298837013314, "grad_norm": 1.4535583194642787, "learning_rate": 2.5900263629948926e-06, "loss": 0.1645, "step": 9945 }, { "epoch": 3.3536153716500925, "grad_norm": 1.3060008542485482, "learning_rate": 2.5852453582250925e-06, "loss": 0.1513, "step": 9950 }, { "epoch": 3.355300859598854, "grad_norm": 1.210308369586654, "learning_rate": 2.5804672311949073e-06, "loss": 0.1612, "step": 9955 }, { "epoch": 3.356986347547615, "grad_norm": 1.2239350750221103, "learning_rate": 2.5756919875985813e-06, "loss": 0.1463, "step": 9960 }, { "epoch": 3.3586718354963763, "grad_norm": 1.2831887242887268, "learning_rate": 2.570919633126926e-06, "loss": 0.1721, "step": 9965 }, { "epoch": 3.3603573234451374, "grad_norm": 1.1974341824227415, "learning_rate": 2.566150173467306e-06, "loss": 0.1463, "step": 9970 }, { "epoch": 3.3620428113938985, "grad_norm": 1.7198354586353592, "learning_rate": 2.5613836143036357e-06, "loss": 0.14, "step": 9975 }, { "epoch": 3.3637282993426596, "grad_norm": 1.2042408241398865, "learning_rate": 2.5566199613163766e-06, "loss": 0.165, "step": 9980 }, { "epoch": 3.3654137872914207, "grad_norm": 1.2293708423374898, "learning_rate": 2.5518592201825267e-06, "loss": 0.1439, "step": 9985 }, { "epoch": 3.3670992752401823, "grad_norm": 1.4356486672717312, "learning_rate": 2.54710139657561e-06, "loss": 0.1882, "step": 9990 }, { "epoch": 3.3687847631889434, "grad_norm": 1.2734891709414193, "learning_rate": 2.5423464961656753e-06, "loss": 0.1613, "step": 9995 }, { "epoch": 3.3704702511377045, "grad_norm": 1.1945074047496578, "learning_rate": 2.5375945246192866e-06, "loss": 0.1344, "step": 10000 }, { "epoch": 3.3721557390864656, "grad_norm": 1.213512918584927, "learning_rate": 2.5328454875995236e-06, "loss": 0.1203, "step": 10005 }, { "epoch": 3.3738412270352267, "grad_norm": 1.2884866166581566, "learning_rate": 2.5280993907659597e-06, "loss": 0.1506, "step": 10010 }, { "epoch": 3.375526714983988, "grad_norm": 1.1274743652728205, "learning_rate": 2.523356239774672e-06, "loss": 0.1457, "step": 10015 }, { "epoch": 3.377212202932749, "grad_norm": 1.447649015328168, "learning_rate": 2.5186160402782224e-06, "loss": 0.177, "step": 10020 }, { "epoch": 3.37889769088151, "grad_norm": 3.5867383838328206, "learning_rate": 2.5138787979256552e-06, "loss": 0.1532, "step": 10025 }, { "epoch": 3.380583178830271, "grad_norm": 4.75720701305717, "learning_rate": 2.5091445183624955e-06, "loss": 0.1286, "step": 10030 }, { "epoch": 3.3822686667790327, "grad_norm": 1.3099119747432584, "learning_rate": 2.5044132072307337e-06, "loss": 0.1586, "step": 10035 }, { "epoch": 3.383954154727794, "grad_norm": 1.4026656590916609, "learning_rate": 2.499684870168819e-06, "loss": 0.1493, "step": 10040 }, { "epoch": 3.385639642676555, "grad_norm": 1.2735465304947546, "learning_rate": 2.494959512811666e-06, "loss": 0.1634, "step": 10045 }, { "epoch": 3.387325130625316, "grad_norm": 1.2405189512566897, "learning_rate": 2.4902371407906285e-06, "loss": 0.1515, "step": 10050 }, { "epoch": 3.389010618574077, "grad_norm": 1.3739165984313435, "learning_rate": 2.4855177597335105e-06, "loss": 0.1533, "step": 10055 }, { "epoch": 3.3906961065228383, "grad_norm": 1.379104118916509, "learning_rate": 2.4808013752645466e-06, "loss": 0.1681, "step": 10060 }, { "epoch": 3.3923815944715994, "grad_norm": 1.3603912344666569, "learning_rate": 2.476087993004399e-06, "loss": 0.1679, "step": 10065 }, { "epoch": 3.394067082420361, "grad_norm": 1.4006661475584379, "learning_rate": 2.4713776185701587e-06, "loss": 0.1472, "step": 10070 }, { "epoch": 3.395752570369122, "grad_norm": 1.2184416515683658, "learning_rate": 2.4666702575753264e-06, "loss": 0.1696, "step": 10075 }, { "epoch": 3.397438058317883, "grad_norm": 1.182412536303197, "learning_rate": 2.461965915629813e-06, "loss": 0.1446, "step": 10080 }, { "epoch": 3.3991235462666443, "grad_norm": 1.1729750586108794, "learning_rate": 2.457264598339929e-06, "loss": 0.1481, "step": 10085 }, { "epoch": 3.4008090342154054, "grad_norm": 1.1136834210810826, "learning_rate": 2.4525663113083898e-06, "loss": 0.1415, "step": 10090 }, { "epoch": 3.4024945221641665, "grad_norm": 1.4430532932826554, "learning_rate": 2.4478710601342894e-06, "loss": 0.1565, "step": 10095 }, { "epoch": 3.4041800101129276, "grad_norm": 1.2267652059194414, "learning_rate": 2.443178850413107e-06, "loss": 0.1463, "step": 10100 }, { "epoch": 3.4058654980616887, "grad_norm": 1.4949942070434277, "learning_rate": 2.4384896877366963e-06, "loss": 0.1544, "step": 10105 }, { "epoch": 3.40755098601045, "grad_norm": 1.2604223328760327, "learning_rate": 2.433803577693285e-06, "loss": 0.1541, "step": 10110 }, { "epoch": 3.4092364739592114, "grad_norm": 1.3060010613190922, "learning_rate": 2.429120525867456e-06, "loss": 0.1483, "step": 10115 }, { "epoch": 3.4109219619079725, "grad_norm": 1.2146774291782885, "learning_rate": 2.424440537840152e-06, "loss": 0.1842, "step": 10120 }, { "epoch": 3.4126074498567336, "grad_norm": 1.2621944558371296, "learning_rate": 2.4197636191886596e-06, "loss": 0.1313, "step": 10125 }, { "epoch": 3.4142929378054947, "grad_norm": 1.4618231091191864, "learning_rate": 2.415089775486614e-06, "loss": 0.1549, "step": 10130 }, { "epoch": 3.415978425754256, "grad_norm": 1.2494958728003562, "learning_rate": 2.4104190123039834e-06, "loss": 0.1504, "step": 10135 }, { "epoch": 3.417663913703017, "grad_norm": 1.0466219441983735, "learning_rate": 2.4057513352070636e-06, "loss": 0.1552, "step": 10140 }, { "epoch": 3.419349401651778, "grad_norm": 1.181381181227899, "learning_rate": 2.4010867497584717e-06, "loss": 0.1361, "step": 10145 }, { "epoch": 3.4210348896005396, "grad_norm": 1.4330681182373313, "learning_rate": 2.396425261517144e-06, "loss": 0.1577, "step": 10150 }, { "epoch": 3.4227203775493007, "grad_norm": 1.3772972060192594, "learning_rate": 2.3917668760383234e-06, "loss": 0.1639, "step": 10155 }, { "epoch": 3.424405865498062, "grad_norm": 1.2095551479791498, "learning_rate": 2.3871115988735535e-06, "loss": 0.1427, "step": 10160 }, { "epoch": 3.426091353446823, "grad_norm": 1.2021884887059764, "learning_rate": 2.3824594355706783e-06, "loss": 0.1538, "step": 10165 }, { "epoch": 3.427776841395584, "grad_norm": 1.1313169172852517, "learning_rate": 2.3778103916738253e-06, "loss": 0.1441, "step": 10170 }, { "epoch": 3.429462329344345, "grad_norm": 1.2177024573507838, "learning_rate": 2.373164472723411e-06, "loss": 0.1548, "step": 10175 }, { "epoch": 3.4311478172931063, "grad_norm": 1.246213663716035, "learning_rate": 2.368521684256122e-06, "loss": 0.1497, "step": 10180 }, { "epoch": 3.4328333052418674, "grad_norm": 1.2783660552054863, "learning_rate": 2.3638820318049147e-06, "loss": 0.1369, "step": 10185 }, { "epoch": 3.4345187931906285, "grad_norm": 1.3268102245928621, "learning_rate": 2.3592455208990147e-06, "loss": 0.1508, "step": 10190 }, { "epoch": 3.43620428113939, "grad_norm": 1.3666486878269484, "learning_rate": 2.3546121570638953e-06, "loss": 0.1464, "step": 10195 }, { "epoch": 3.437889769088151, "grad_norm": 1.2657649808599138, "learning_rate": 2.3499819458212823e-06, "loss": 0.1485, "step": 10200 }, { "epoch": 3.4395752570369122, "grad_norm": 1.3242067408581548, "learning_rate": 2.345354892689149e-06, "loss": 0.1532, "step": 10205 }, { "epoch": 3.4412607449856734, "grad_norm": 1.3114272086342003, "learning_rate": 2.3407310031816964e-06, "loss": 0.1537, "step": 10210 }, { "epoch": 3.4429462329344345, "grad_norm": 1.1146493999973337, "learning_rate": 2.3361102828093647e-06, "loss": 0.1409, "step": 10215 }, { "epoch": 3.4446317208831956, "grad_norm": 1.3527977652436147, "learning_rate": 2.3314927370788108e-06, "loss": 0.1477, "step": 10220 }, { "epoch": 3.4463172088319567, "grad_norm": 1.3906779215680816, "learning_rate": 2.3268783714929098e-06, "loss": 0.1306, "step": 10225 }, { "epoch": 3.4480026967807182, "grad_norm": 1.2281823355192947, "learning_rate": 2.3222671915507466e-06, "loss": 0.158, "step": 10230 }, { "epoch": 3.4496881847294794, "grad_norm": 1.1300870010146995, "learning_rate": 2.3176592027476115e-06, "loss": 0.1212, "step": 10235 }, { "epoch": 3.4513736726782405, "grad_norm": 1.4417167756613363, "learning_rate": 2.3130544105749917e-06, "loss": 0.158, "step": 10240 }, { "epoch": 3.4530591606270016, "grad_norm": 1.2018000207545478, "learning_rate": 2.3084528205205644e-06, "loss": 0.1592, "step": 10245 }, { "epoch": 3.4547446485757627, "grad_norm": 1.2891547151774303, "learning_rate": 2.303854438068186e-06, "loss": 0.116, "step": 10250 }, { "epoch": 3.456430136524524, "grad_norm": 1.2953658726878892, "learning_rate": 2.2992592686979e-06, "loss": 0.1559, "step": 10255 }, { "epoch": 3.458115624473285, "grad_norm": 1.2621481779059829, "learning_rate": 2.294667317885912e-06, "loss": 0.1526, "step": 10260 }, { "epoch": 3.459801112422046, "grad_norm": 1.2171418295041299, "learning_rate": 2.290078591104597e-06, "loss": 0.1425, "step": 10265 }, { "epoch": 3.461486600370807, "grad_norm": 1.2903836928936998, "learning_rate": 2.2854930938224828e-06, "loss": 0.1605, "step": 10270 }, { "epoch": 3.4631720883195687, "grad_norm": 1.3229012029215648, "learning_rate": 2.2809108315042544e-06, "loss": 0.1753, "step": 10275 }, { "epoch": 3.46485757626833, "grad_norm": 1.2216745043943342, "learning_rate": 2.27633180961074e-06, "loss": 0.1315, "step": 10280 }, { "epoch": 3.466543064217091, "grad_norm": 1.352358245982348, "learning_rate": 2.271756033598905e-06, "loss": 0.134, "step": 10285 }, { "epoch": 3.468228552165852, "grad_norm": 1.2957046008732787, "learning_rate": 2.2671835089218424e-06, "loss": 0.1504, "step": 10290 }, { "epoch": 3.469914040114613, "grad_norm": 1.2815829853788925, "learning_rate": 2.2626142410287805e-06, "loss": 0.1411, "step": 10295 }, { "epoch": 3.4715995280633742, "grad_norm": 1.6271233142109944, "learning_rate": 2.258048235365057e-06, "loss": 0.1449, "step": 10300 }, { "epoch": 3.4732850160121354, "grad_norm": 1.2974471537100711, "learning_rate": 2.2534854973721277e-06, "loss": 0.1723, "step": 10305 }, { "epoch": 3.474970503960897, "grad_norm": 1.5151258091900692, "learning_rate": 2.2489260324875485e-06, "loss": 0.1473, "step": 10310 }, { "epoch": 3.476655991909658, "grad_norm": 1.4934880449965409, "learning_rate": 2.2443698461449804e-06, "loss": 0.1642, "step": 10315 }, { "epoch": 3.478341479858419, "grad_norm": 1.3723996901532494, "learning_rate": 2.239816943774176e-06, "loss": 0.158, "step": 10320 }, { "epoch": 3.4800269678071802, "grad_norm": 1.443019268367747, "learning_rate": 2.2352673308009737e-06, "loss": 0.1542, "step": 10325 }, { "epoch": 3.4817124557559413, "grad_norm": 1.1284300395864866, "learning_rate": 2.2307210126472895e-06, "loss": 0.1414, "step": 10330 }, { "epoch": 3.4833979437047025, "grad_norm": 1.23855888555975, "learning_rate": 2.2261779947311135e-06, "loss": 0.1421, "step": 10335 }, { "epoch": 3.4850834316534636, "grad_norm": 1.4084019002786303, "learning_rate": 2.221638282466508e-06, "loss": 0.1455, "step": 10340 }, { "epoch": 3.4867689196022247, "grad_norm": 1.3362795161936447, "learning_rate": 2.2171018812635897e-06, "loss": 0.1381, "step": 10345 }, { "epoch": 3.488454407550986, "grad_norm": 1.259134734511089, "learning_rate": 2.2125687965285304e-06, "loss": 0.1489, "step": 10350 }, { "epoch": 3.4901398954997473, "grad_norm": 1.2212844823353102, "learning_rate": 2.2080390336635515e-06, "loss": 0.131, "step": 10355 }, { "epoch": 3.4918253834485085, "grad_norm": 1.3002947953988913, "learning_rate": 2.203512598066918e-06, "loss": 0.1261, "step": 10360 }, { "epoch": 3.4935108713972696, "grad_norm": 1.3574551635687806, "learning_rate": 2.198989495132925e-06, "loss": 0.1444, "step": 10365 }, { "epoch": 3.4951963593460307, "grad_norm": 1.4212234938341588, "learning_rate": 2.1944697302518957e-06, "loss": 0.1332, "step": 10370 }, { "epoch": 3.496881847294792, "grad_norm": 1.363582357147013, "learning_rate": 2.189953308810177e-06, "loss": 0.1432, "step": 10375 }, { "epoch": 3.498567335243553, "grad_norm": 1.8118963505278043, "learning_rate": 2.1854402361901345e-06, "loss": 0.1459, "step": 10380 }, { "epoch": 3.500252823192314, "grad_norm": 6.001511141669761, "learning_rate": 2.180930517770136e-06, "loss": 0.13, "step": 10385 }, { "epoch": 3.5019383111410756, "grad_norm": 1.1529911935264896, "learning_rate": 2.1764241589245604e-06, "loss": 0.1297, "step": 10390 }, { "epoch": 3.5036237990898362, "grad_norm": 1.3124330659110302, "learning_rate": 2.1719211650237736e-06, "loss": 0.1399, "step": 10395 }, { "epoch": 3.505309287038598, "grad_norm": 1.3060778220339841, "learning_rate": 2.16742154143414e-06, "loss": 0.1549, "step": 10400 }, { "epoch": 3.506994774987359, "grad_norm": 1.306653702069654, "learning_rate": 2.1629252935180015e-06, "loss": 0.1331, "step": 10405 }, { "epoch": 3.50868026293612, "grad_norm": 1.4771043963795132, "learning_rate": 2.1584324266336802e-06, "loss": 0.1522, "step": 10410 }, { "epoch": 3.510365750884881, "grad_norm": 1.3364185791466066, "learning_rate": 2.1539429461354655e-06, "loss": 0.1523, "step": 10415 }, { "epoch": 3.5120512388336422, "grad_norm": 1.192134034575673, "learning_rate": 2.149456857373617e-06, "loss": 0.1224, "step": 10420 }, { "epoch": 3.5137367267824033, "grad_norm": 1.3478448597892143, "learning_rate": 2.144974165694345e-06, "loss": 0.16, "step": 10425 }, { "epoch": 3.5154222147311645, "grad_norm": 1.1420976982959112, "learning_rate": 2.14049487643982e-06, "loss": 0.1218, "step": 10430 }, { "epoch": 3.517107702679926, "grad_norm": 1.2690054975999607, "learning_rate": 2.1360189949481497e-06, "loss": 0.1454, "step": 10435 }, { "epoch": 3.518793190628687, "grad_norm": 1.2762089343715004, "learning_rate": 2.131546526553383e-06, "loss": 0.1488, "step": 10440 }, { "epoch": 3.5204786785774482, "grad_norm": 1.2812913533625703, "learning_rate": 2.127077476585505e-06, "loss": 0.141, "step": 10445 }, { "epoch": 3.5221641665262093, "grad_norm": 1.2349437337118987, "learning_rate": 2.122611850370423e-06, "loss": 0.1486, "step": 10450 }, { "epoch": 3.5238496544749704, "grad_norm": 2.0885116018721095, "learning_rate": 2.118149653229963e-06, "loss": 0.1349, "step": 10455 }, { "epoch": 3.5255351424237316, "grad_norm": 1.1195939342097039, "learning_rate": 2.11369089048187e-06, "loss": 0.1481, "step": 10460 }, { "epoch": 3.5272206303724927, "grad_norm": 1.3557711996658708, "learning_rate": 2.109235567439792e-06, "loss": 0.1385, "step": 10465 }, { "epoch": 3.5289061183212542, "grad_norm": 1.1764486853350906, "learning_rate": 2.104783689413279e-06, "loss": 0.1315, "step": 10470 }, { "epoch": 3.530591606270015, "grad_norm": 1.1336262689502794, "learning_rate": 2.100335261707774e-06, "loss": 0.1493, "step": 10475 }, { "epoch": 3.5322770942187764, "grad_norm": 1.3801602380775975, "learning_rate": 2.095890289624608e-06, "loss": 0.1799, "step": 10480 }, { "epoch": 3.5339625821675376, "grad_norm": 1.3471750010356411, "learning_rate": 2.0914487784609982e-06, "loss": 0.1467, "step": 10485 }, { "epoch": 3.5356480701162987, "grad_norm": 1.4112952717155867, "learning_rate": 2.0870107335100324e-06, "loss": 0.1459, "step": 10490 }, { "epoch": 3.53733355806506, "grad_norm": 1.71665705731009, "learning_rate": 2.082576160060669e-06, "loss": 0.138, "step": 10495 }, { "epoch": 3.539019046013821, "grad_norm": 1.4191379470057897, "learning_rate": 2.078145063397729e-06, "loss": 0.1399, "step": 10500 }, { "epoch": 3.540704533962582, "grad_norm": 1.2610800597148517, "learning_rate": 2.073717448801894e-06, "loss": 0.1406, "step": 10505 }, { "epoch": 3.542390021911343, "grad_norm": 1.2012880101008747, "learning_rate": 2.0692933215496906e-06, "loss": 0.151, "step": 10510 }, { "epoch": 3.5440755098601047, "grad_norm": 1.4278325658758595, "learning_rate": 2.064872686913492e-06, "loss": 0.166, "step": 10515 }, { "epoch": 3.5457609978088658, "grad_norm": 1.472644933927288, "learning_rate": 2.060455550161506e-06, "loss": 0.1395, "step": 10520 }, { "epoch": 3.547446485757627, "grad_norm": 2.188490090397927, "learning_rate": 2.056041916557778e-06, "loss": 0.1343, "step": 10525 }, { "epoch": 3.549131973706388, "grad_norm": 1.3879426456272843, "learning_rate": 2.0516317913621724e-06, "loss": 0.1442, "step": 10530 }, { "epoch": 3.550817461655149, "grad_norm": 1.3428771458764217, "learning_rate": 2.0472251798303757e-06, "loss": 0.1447, "step": 10535 }, { "epoch": 3.55250294960391, "grad_norm": 1.3932142671509251, "learning_rate": 2.042822087213886e-06, "loss": 0.1618, "step": 10540 }, { "epoch": 3.5541884375526713, "grad_norm": 1.3509593971795395, "learning_rate": 2.0384225187600113e-06, "loss": 0.1533, "step": 10545 }, { "epoch": 3.555873925501433, "grad_norm": 1.556290992445429, "learning_rate": 2.034026479711855e-06, "loss": 0.1737, "step": 10550 }, { "epoch": 3.557559413450194, "grad_norm": 1.2644217179475776, "learning_rate": 2.029633975308315e-06, "loss": 0.1376, "step": 10555 }, { "epoch": 3.559244901398955, "grad_norm": 1.2423782672725796, "learning_rate": 2.0252450107840765e-06, "loss": 0.1418, "step": 10560 }, { "epoch": 3.560930389347716, "grad_norm": 1.4317162751893622, "learning_rate": 2.020859591369612e-06, "loss": 0.1459, "step": 10565 }, { "epoch": 3.5626158772964773, "grad_norm": 1.3862237555473154, "learning_rate": 2.016477722291163e-06, "loss": 0.1336, "step": 10570 }, { "epoch": 3.5643013652452384, "grad_norm": 1.4254922326020505, "learning_rate": 2.012099408770739e-06, "loss": 0.1491, "step": 10575 }, { "epoch": 3.5659868531939996, "grad_norm": 1.4712500075169244, "learning_rate": 2.0077246560261186e-06, "loss": 0.134, "step": 10580 }, { "epoch": 3.567672341142761, "grad_norm": 1.4780926112680588, "learning_rate": 2.0033534692708306e-06, "loss": 0.149, "step": 10585 }, { "epoch": 3.5693578290915218, "grad_norm": 1.1906661357195025, "learning_rate": 1.99898585371416e-06, "loss": 0.1505, "step": 10590 }, { "epoch": 3.5710433170402833, "grad_norm": 1.2955525444480056, "learning_rate": 1.9946218145611298e-06, "loss": 0.13, "step": 10595 }, { "epoch": 3.5727288049890444, "grad_norm": 1.4304391663027463, "learning_rate": 1.9902613570125028e-06, "loss": 0.1318, "step": 10600 }, { "epoch": 3.5744142929378055, "grad_norm": 1.3314494150846279, "learning_rate": 1.9859044862647786e-06, "loss": 0.1368, "step": 10605 }, { "epoch": 3.5760997808865667, "grad_norm": 1.1985082105430942, "learning_rate": 1.9815512075101734e-06, "loss": 0.1338, "step": 10610 }, { "epoch": 3.5777852688353278, "grad_norm": 1.246983396775462, "learning_rate": 1.977201525936632e-06, "loss": 0.1346, "step": 10615 }, { "epoch": 3.579470756784089, "grad_norm": 1.274464458650494, "learning_rate": 1.9728554467278043e-06, "loss": 0.1539, "step": 10620 }, { "epoch": 3.58115624473285, "grad_norm": 1.2443256218719754, "learning_rate": 1.9685129750630506e-06, "loss": 0.1455, "step": 10625 }, { "epoch": 3.5828417326816115, "grad_norm": 1.356276070081595, "learning_rate": 1.964174116117435e-06, "loss": 0.1606, "step": 10630 }, { "epoch": 3.5845272206303727, "grad_norm": 1.5256136219456349, "learning_rate": 1.959838875061711e-06, "loss": 0.1521, "step": 10635 }, { "epoch": 3.5862127085791338, "grad_norm": 1.3573621305917005, "learning_rate": 1.955507257062323e-06, "loss": 0.1355, "step": 10640 }, { "epoch": 3.587898196527895, "grad_norm": 1.2779022604044483, "learning_rate": 1.9511792672813957e-06, "loss": 0.1393, "step": 10645 }, { "epoch": 3.589583684476656, "grad_norm": 1.3243206119262712, "learning_rate": 1.946854910876734e-06, "loss": 0.1328, "step": 10650 }, { "epoch": 3.591269172425417, "grad_norm": 1.4054265088833848, "learning_rate": 1.9425341930018104e-06, "loss": 0.1392, "step": 10655 }, { "epoch": 3.592954660374178, "grad_norm": 1.2994062236654595, "learning_rate": 1.9382171188057612e-06, "loss": 0.1455, "step": 10660 }, { "epoch": 3.5946401483229398, "grad_norm": 1.3019479247019692, "learning_rate": 1.9339036934333785e-06, "loss": 0.1428, "step": 10665 }, { "epoch": 3.5963256362717004, "grad_norm": 1.2386064454120294, "learning_rate": 1.92959392202511e-06, "loss": 0.14, "step": 10670 }, { "epoch": 3.598011124220462, "grad_norm": 2.3205460412573427, "learning_rate": 1.925287809717048e-06, "loss": 0.1464, "step": 10675 }, { "epoch": 3.599696612169223, "grad_norm": 1.2592526547322387, "learning_rate": 1.9209853616409202e-06, "loss": 0.1271, "step": 10680 }, { "epoch": 3.601382100117984, "grad_norm": 1.2479378541792572, "learning_rate": 1.91668658292409e-06, "loss": 0.142, "step": 10685 }, { "epoch": 3.6030675880667453, "grad_norm": 1.2671475006269277, "learning_rate": 1.912391478689549e-06, "loss": 0.1342, "step": 10690 }, { "epoch": 3.6047530760155064, "grad_norm": 1.2538256473714446, "learning_rate": 1.9081000540559118e-06, "loss": 0.1371, "step": 10695 }, { "epoch": 3.6064385639642675, "grad_norm": 1.3419396368565297, "learning_rate": 1.9038123141374026e-06, "loss": 0.1549, "step": 10700 }, { "epoch": 3.6081240519130287, "grad_norm": 1.3731470411140996, "learning_rate": 1.8995282640438556e-06, "loss": 0.1351, "step": 10705 }, { "epoch": 3.60980953986179, "grad_norm": 1.4855548060217267, "learning_rate": 1.8952479088807125e-06, "loss": 0.1438, "step": 10710 }, { "epoch": 3.6114950278105513, "grad_norm": 4.153027925748704, "learning_rate": 1.890971253749006e-06, "loss": 0.1133, "step": 10715 }, { "epoch": 3.6131805157593124, "grad_norm": 1.1804058356152094, "learning_rate": 1.8866983037453618e-06, "loss": 0.1493, "step": 10720 }, { "epoch": 3.6148660037080735, "grad_norm": 1.2290498082921446, "learning_rate": 1.882429063961988e-06, "loss": 0.1369, "step": 10725 }, { "epoch": 3.6165514916568346, "grad_norm": 1.342012503195042, "learning_rate": 1.8781635394866743e-06, "loss": 0.1346, "step": 10730 }, { "epoch": 3.6182369796055958, "grad_norm": 1.2552312593496555, "learning_rate": 1.8739017354027839e-06, "loss": 0.1333, "step": 10735 }, { "epoch": 3.619922467554357, "grad_norm": 1.2101740541295785, "learning_rate": 1.8696436567892418e-06, "loss": 0.1337, "step": 10740 }, { "epoch": 3.6216079555031184, "grad_norm": 2.527413991315585, "learning_rate": 1.8653893087205349e-06, "loss": 0.128, "step": 10745 }, { "epoch": 3.623293443451879, "grad_norm": 1.5371710753339014, "learning_rate": 1.8611386962667028e-06, "loss": 0.1345, "step": 10750 }, { "epoch": 3.6249789314006406, "grad_norm": 1.4229550620159923, "learning_rate": 1.8568918244933386e-06, "loss": 0.1483, "step": 10755 }, { "epoch": 3.6266644193494018, "grad_norm": 1.1869181162925668, "learning_rate": 1.852648698461571e-06, "loss": 0.1296, "step": 10760 }, { "epoch": 3.628349907298163, "grad_norm": 1.2672176145150378, "learning_rate": 1.8484093232280704e-06, "loss": 0.1387, "step": 10765 }, { "epoch": 3.630035395246924, "grad_norm": 1.5353080778660577, "learning_rate": 1.8441737038450313e-06, "loss": 0.1473, "step": 10770 }, { "epoch": 3.631720883195685, "grad_norm": 1.1920347312767035, "learning_rate": 1.8399418453601798e-06, "loss": 0.1364, "step": 10775 }, { "epoch": 3.633406371144446, "grad_norm": 1.090443066617329, "learning_rate": 1.835713752816753e-06, "loss": 0.1212, "step": 10780 }, { "epoch": 3.6350918590932073, "grad_norm": 1.344111238189381, "learning_rate": 1.8314894312535026e-06, "loss": 0.1224, "step": 10785 }, { "epoch": 3.636777347041969, "grad_norm": 1.2110775527578468, "learning_rate": 1.827268885704686e-06, "loss": 0.1608, "step": 10790 }, { "epoch": 3.63846283499073, "grad_norm": 1.128471012097077, "learning_rate": 1.8230521212000635e-06, "loss": 0.1178, "step": 10795 }, { "epoch": 3.640148322939491, "grad_norm": 1.461130970109866, "learning_rate": 1.8188391427648832e-06, "loss": 0.1609, "step": 10800 }, { "epoch": 3.641833810888252, "grad_norm": 1.2857101622554097, "learning_rate": 1.8146299554198894e-06, "loss": 0.1395, "step": 10805 }, { "epoch": 3.6435192988370133, "grad_norm": 1.3471375373402812, "learning_rate": 1.8104245641813e-06, "loss": 0.1537, "step": 10810 }, { "epoch": 3.6452047867857744, "grad_norm": 1.3133766589746239, "learning_rate": 1.8062229740608166e-06, "loss": 0.1624, "step": 10815 }, { "epoch": 3.6468902747345355, "grad_norm": 1.2682272594750614, "learning_rate": 1.802025190065606e-06, "loss": 0.151, "step": 10820 }, { "epoch": 3.648575762683297, "grad_norm": 1.337186038866119, "learning_rate": 1.7978312171982993e-06, "loss": 0.1414, "step": 10825 }, { "epoch": 3.6502612506320578, "grad_norm": 1.225988906650177, "learning_rate": 1.7936410604569859e-06, "loss": 0.1461, "step": 10830 }, { "epoch": 3.6519467385808193, "grad_norm": 1.463406152544717, "learning_rate": 1.7894547248352101e-06, "loss": 0.1402, "step": 10835 }, { "epoch": 3.6536322265295804, "grad_norm": 1.179333130684187, "learning_rate": 1.785272215321962e-06, "loss": 0.1521, "step": 10840 }, { "epoch": 3.6553177144783415, "grad_norm": 1.274228070915282, "learning_rate": 1.7810935369016692e-06, "loss": 0.1295, "step": 10845 }, { "epoch": 3.6570032024271026, "grad_norm": 1.2766295202607583, "learning_rate": 1.7769186945541956e-06, "loss": 0.1349, "step": 10850 }, { "epoch": 3.6586886903758638, "grad_norm": 2.0165099013912973, "learning_rate": 1.7727476932548304e-06, "loss": 0.1525, "step": 10855 }, { "epoch": 3.660374178324625, "grad_norm": 2.9137908892716666, "learning_rate": 1.7685805379742921e-06, "loss": 0.1549, "step": 10860 }, { "epoch": 3.662059666273386, "grad_norm": 1.178746265790924, "learning_rate": 1.7644172336787096e-06, "loss": 0.1315, "step": 10865 }, { "epoch": 3.6637451542221475, "grad_norm": 1.4017366203879746, "learning_rate": 1.7602577853296237e-06, "loss": 0.1342, "step": 10870 }, { "epoch": 3.6654306421709086, "grad_norm": 1.1986093038186967, "learning_rate": 1.7561021978839814e-06, "loss": 0.1449, "step": 10875 }, { "epoch": 3.6671161301196697, "grad_norm": 1.3621368224019756, "learning_rate": 1.7519504762941303e-06, "loss": 0.1257, "step": 10880 }, { "epoch": 3.668801618068431, "grad_norm": 1.229922223662929, "learning_rate": 1.7478026255078067e-06, "loss": 0.1504, "step": 10885 }, { "epoch": 3.670487106017192, "grad_norm": 1.3218602290130936, "learning_rate": 1.7436586504681357e-06, "loss": 0.1293, "step": 10890 }, { "epoch": 3.672172593965953, "grad_norm": 1.2186336535728164, "learning_rate": 1.7395185561136219e-06, "loss": 0.1354, "step": 10895 }, { "epoch": 3.673858081914714, "grad_norm": 1.4301576145878616, "learning_rate": 1.7353823473781506e-06, "loss": 0.1412, "step": 10900 }, { "epoch": 3.6755435698634757, "grad_norm": 1.3124284024611363, "learning_rate": 1.7312500291909707e-06, "loss": 0.1439, "step": 10905 }, { "epoch": 3.6772290578122364, "grad_norm": 1.3193249001410443, "learning_rate": 1.7271216064766955e-06, "loss": 0.1226, "step": 10910 }, { "epoch": 3.678914545760998, "grad_norm": 1.4046165543654832, "learning_rate": 1.7229970841552985e-06, "loss": 0.1325, "step": 10915 }, { "epoch": 3.680600033709759, "grad_norm": 1.3992901351664337, "learning_rate": 1.7188764671421055e-06, "loss": 0.116, "step": 10920 }, { "epoch": 3.68228552165852, "grad_norm": 1.0904309846631886, "learning_rate": 1.7147597603477845e-06, "loss": 0.1559, "step": 10925 }, { "epoch": 3.6839710096072813, "grad_norm": 1.3359699437178203, "learning_rate": 1.710646968678345e-06, "loss": 0.1486, "step": 10930 }, { "epoch": 3.6856564975560424, "grad_norm": 1.2327647372204082, "learning_rate": 1.7065380970351304e-06, "loss": 0.1308, "step": 10935 }, { "epoch": 3.6873419855048035, "grad_norm": 1.4799913603094659, "learning_rate": 1.702433150314816e-06, "loss": 0.1334, "step": 10940 }, { "epoch": 3.6890274734535646, "grad_norm": 1.2621914955513351, "learning_rate": 1.6983321334093955e-06, "loss": 0.1263, "step": 10945 }, { "epoch": 3.690712961402326, "grad_norm": 1.4318181939663055, "learning_rate": 1.6942350512061788e-06, "loss": 0.1223, "step": 10950 }, { "epoch": 3.6923984493510873, "grad_norm": 1.252786341936707, "learning_rate": 1.6901419085877902e-06, "loss": 0.1268, "step": 10955 }, { "epoch": 3.6940839372998484, "grad_norm": 1.3156485619952978, "learning_rate": 1.6860527104321594e-06, "loss": 0.1349, "step": 10960 }, { "epoch": 3.6957694252486095, "grad_norm": 1.4287223642883522, "learning_rate": 1.6819674616125109e-06, "loss": 0.142, "step": 10965 }, { "epoch": 3.6974549131973706, "grad_norm": 1.4057183581891188, "learning_rate": 1.6778861669973661e-06, "loss": 0.1316, "step": 10970 }, { "epoch": 3.6991404011461317, "grad_norm": 1.2378528563096136, "learning_rate": 1.6738088314505312e-06, "loss": 0.1116, "step": 10975 }, { "epoch": 3.700825889094893, "grad_norm": 1.2196288849624572, "learning_rate": 1.6697354598310995e-06, "loss": 0.1373, "step": 10980 }, { "epoch": 3.7025113770436544, "grad_norm": 1.4958660574515723, "learning_rate": 1.6656660569934353e-06, "loss": 0.127, "step": 10985 }, { "epoch": 3.704196864992415, "grad_norm": 1.2609253493145949, "learning_rate": 1.6616006277871727e-06, "loss": 0.1317, "step": 10990 }, { "epoch": 3.7058823529411766, "grad_norm": 1.4129856363806284, "learning_rate": 1.6575391770572168e-06, "loss": 0.1329, "step": 10995 }, { "epoch": 3.7075678408899377, "grad_norm": 1.3389979570312858, "learning_rate": 1.6534817096437228e-06, "loss": 0.1259, "step": 11000 }, { "epoch": 3.709253328838699, "grad_norm": 1.2637090064308742, "learning_rate": 1.6494282303821075e-06, "loss": 0.1176, "step": 11005 }, { "epoch": 3.71093881678746, "grad_norm": 1.2416217473428255, "learning_rate": 1.6453787441030284e-06, "loss": 0.1419, "step": 11010 }, { "epoch": 3.712624304736221, "grad_norm": 1.2100826998838317, "learning_rate": 1.6413332556323847e-06, "loss": 0.1288, "step": 11015 }, { "epoch": 3.714309792684982, "grad_norm": 1.168320926301484, "learning_rate": 1.6372917697913165e-06, "loss": 0.1239, "step": 11020 }, { "epoch": 3.7159952806337433, "grad_norm": 1.2839813029539138, "learning_rate": 1.6332542913961874e-06, "loss": 0.1267, "step": 11025 }, { "epoch": 3.717680768582505, "grad_norm": 1.353670532278106, "learning_rate": 1.6292208252585922e-06, "loss": 0.1476, "step": 11030 }, { "epoch": 3.719366256531266, "grad_norm": 1.3220795371446468, "learning_rate": 1.6251913761853378e-06, "loss": 0.1398, "step": 11035 }, { "epoch": 3.721051744480027, "grad_norm": 1.1553512249234226, "learning_rate": 1.6211659489784448e-06, "loss": 0.1544, "step": 11040 }, { "epoch": 3.722737232428788, "grad_norm": 1.3130351186312237, "learning_rate": 1.6171445484351462e-06, "loss": 0.1414, "step": 11045 }, { "epoch": 3.7244227203775493, "grad_norm": 1.142910314927335, "learning_rate": 1.6131271793478704e-06, "loss": 0.1173, "step": 11050 }, { "epoch": 3.7261082083263104, "grad_norm": 1.2919554545792558, "learning_rate": 1.6091138465042434e-06, "loss": 0.1371, "step": 11055 }, { "epoch": 3.7277936962750715, "grad_norm": 1.7895386627119003, "learning_rate": 1.6051045546870791e-06, "loss": 0.1403, "step": 11060 }, { "epoch": 3.729479184223833, "grad_norm": 1.2380015640589026, "learning_rate": 1.6010993086743804e-06, "loss": 0.1387, "step": 11065 }, { "epoch": 3.7311646721725937, "grad_norm": 1.2733695615773877, "learning_rate": 1.5970981132393266e-06, "loss": 0.1429, "step": 11070 }, { "epoch": 3.7328501601213553, "grad_norm": 1.394081091787083, "learning_rate": 1.5931009731502673e-06, "loss": 0.1352, "step": 11075 }, { "epoch": 3.7345356480701164, "grad_norm": 1.2588665514022965, "learning_rate": 1.5891078931707194e-06, "loss": 0.126, "step": 11080 }, { "epoch": 3.7362211360188775, "grad_norm": 1.36710298121437, "learning_rate": 1.5851188780593668e-06, "loss": 0.1466, "step": 11085 }, { "epoch": 3.7379066239676386, "grad_norm": 1.3953484795375954, "learning_rate": 1.581133932570043e-06, "loss": 0.1349, "step": 11090 }, { "epoch": 3.7395921119163997, "grad_norm": 1.1442607060166285, "learning_rate": 1.5771530614517339e-06, "loss": 0.1474, "step": 11095 }, { "epoch": 3.741277599865161, "grad_norm": 1.3852545739441826, "learning_rate": 1.5731762694485681e-06, "loss": 0.127, "step": 11100 }, { "epoch": 3.742963087813922, "grad_norm": 1.344833226682109, "learning_rate": 1.5692035612998163e-06, "loss": 0.12, "step": 11105 }, { "epoch": 3.7446485757626835, "grad_norm": 1.4701799239754092, "learning_rate": 1.565234941739882e-06, "loss": 0.134, "step": 11110 }, { "epoch": 3.7463340637114446, "grad_norm": 1.3909412719624628, "learning_rate": 1.5612704154982937e-06, "loss": 0.129, "step": 11115 }, { "epoch": 3.7480195516602057, "grad_norm": 1.1780797760040174, "learning_rate": 1.557309987299701e-06, "loss": 0.1346, "step": 11120 }, { "epoch": 3.749705039608967, "grad_norm": 1.133305541282998, "learning_rate": 1.5533536618638755e-06, "loss": 0.1102, "step": 11125 }, { "epoch": 3.751390527557728, "grad_norm": 1.4051303257982724, "learning_rate": 1.5494014439056931e-06, "loss": 0.1342, "step": 11130 }, { "epoch": 3.753076015506489, "grad_norm": 2.3301186322425256, "learning_rate": 1.5454533381351378e-06, "loss": 0.1154, "step": 11135 }, { "epoch": 3.75476150345525, "grad_norm": 1.4838583399462095, "learning_rate": 1.5415093492572902e-06, "loss": 0.1198, "step": 11140 }, { "epoch": 3.7564469914040117, "grad_norm": 1.3862808847131185, "learning_rate": 1.5375694819723286e-06, "loss": 0.1336, "step": 11145 }, { "epoch": 3.7581324793527724, "grad_norm": 1.4877096838778645, "learning_rate": 1.5336337409755198e-06, "loss": 0.1634, "step": 11150 }, { "epoch": 3.759817967301534, "grad_norm": 1.205089520477097, "learning_rate": 1.5297021309572085e-06, "loss": 0.1457, "step": 11155 }, { "epoch": 3.761503455250295, "grad_norm": 1.2930805196199944, "learning_rate": 1.5257746566028198e-06, "loss": 0.1343, "step": 11160 }, { "epoch": 3.763188943199056, "grad_norm": 1.4997910724555106, "learning_rate": 1.5218513225928473e-06, "loss": 0.1419, "step": 11165 }, { "epoch": 3.7648744311478173, "grad_norm": 1.326257220590502, "learning_rate": 1.5179321336028557e-06, "loss": 0.135, "step": 11170 }, { "epoch": 3.7665599190965784, "grad_norm": 3.3805180320738004, "learning_rate": 1.5140170943034633e-06, "loss": 0.1551, "step": 11175 }, { "epoch": 3.7682454070453395, "grad_norm": 1.502125772594316, "learning_rate": 1.5101062093603502e-06, "loss": 0.1246, "step": 11180 }, { "epoch": 3.7699308949941006, "grad_norm": 1.3387209752283937, "learning_rate": 1.506199483434238e-06, "loss": 0.1531, "step": 11185 }, { "epoch": 3.771616382942862, "grad_norm": 1.2512897849676383, "learning_rate": 1.5022969211808997e-06, "loss": 0.1313, "step": 11190 }, { "epoch": 3.7733018708916233, "grad_norm": 1.3046445675910159, "learning_rate": 1.4983985272511404e-06, "loss": 0.1413, "step": 11195 }, { "epoch": 3.7749873588403844, "grad_norm": 1.3228199491773467, "learning_rate": 1.4945043062907993e-06, "loss": 0.145, "step": 11200 }, { "epoch": 3.7766728467891455, "grad_norm": 1.1277066014272603, "learning_rate": 1.4906142629407421e-06, "loss": 0.1301, "step": 11205 }, { "epoch": 3.7783583347379066, "grad_norm": 1.401023583324405, "learning_rate": 1.486728401836859e-06, "loss": 0.1458, "step": 11210 }, { "epoch": 3.7800438226866677, "grad_norm": 2.4623045803277965, "learning_rate": 1.4828467276100516e-06, "loss": 0.1193, "step": 11215 }, { "epoch": 3.781729310635429, "grad_norm": 1.2017613317670408, "learning_rate": 1.4789692448862364e-06, "loss": 0.133, "step": 11220 }, { "epoch": 3.7834147985841904, "grad_norm": 1.4369487121538838, "learning_rate": 1.47509595828633e-06, "loss": 0.1196, "step": 11225 }, { "epoch": 3.785100286532951, "grad_norm": 1.2493238247353895, "learning_rate": 1.4712268724262529e-06, "loss": 0.1216, "step": 11230 }, { "epoch": 3.7867857744817126, "grad_norm": 1.4063951576050708, "learning_rate": 1.4673619919169168e-06, "loss": 0.1291, "step": 11235 }, { "epoch": 3.7884712624304737, "grad_norm": 1.2906232810719316, "learning_rate": 1.4635013213642213e-06, "loss": 0.1164, "step": 11240 }, { "epoch": 3.790156750379235, "grad_norm": 1.3767491306273862, "learning_rate": 1.4596448653690493e-06, "loss": 0.1525, "step": 11245 }, { "epoch": 3.791842238327996, "grad_norm": 1.2489608029596313, "learning_rate": 1.4557926285272622e-06, "loss": 0.1228, "step": 11250 }, { "epoch": 3.793527726276757, "grad_norm": 1.6760503093343815, "learning_rate": 1.4519446154296951e-06, "loss": 0.1373, "step": 11255 }, { "epoch": 3.795213214225518, "grad_norm": 1.5353684247978776, "learning_rate": 1.4481008306621447e-06, "loss": 0.1596, "step": 11260 }, { "epoch": 3.7968987021742793, "grad_norm": 1.4731419789805278, "learning_rate": 1.44426127880537e-06, "loss": 0.1284, "step": 11265 }, { "epoch": 3.798584190123041, "grad_norm": 1.0635085858618885, "learning_rate": 1.4404259644350899e-06, "loss": 0.1425, "step": 11270 }, { "epoch": 3.800269678071802, "grad_norm": 1.3188467927039411, "learning_rate": 1.436594892121968e-06, "loss": 0.138, "step": 11275 }, { "epoch": 3.801955166020563, "grad_norm": 1.2440191333955246, "learning_rate": 1.4327680664316152e-06, "loss": 0.1305, "step": 11280 }, { "epoch": 3.803640653969324, "grad_norm": 1.2837662103817602, "learning_rate": 1.4289454919245788e-06, "loss": 0.1379, "step": 11285 }, { "epoch": 3.8053261419180853, "grad_norm": 1.1769279984930598, "learning_rate": 1.4251271731563438e-06, "loss": 0.125, "step": 11290 }, { "epoch": 3.8070116298668464, "grad_norm": 1.2766168881855045, "learning_rate": 1.4213131146773229e-06, "loss": 0.1381, "step": 11295 }, { "epoch": 3.8086971178156075, "grad_norm": 1.5157427588705457, "learning_rate": 1.4175033210328493e-06, "loss": 0.1276, "step": 11300 }, { "epoch": 3.810382605764369, "grad_norm": 1.2398980650711655, "learning_rate": 1.4136977967631743e-06, "loss": 0.1245, "step": 11305 }, { "epoch": 3.8120680937131297, "grad_norm": 1.3371425887454111, "learning_rate": 1.4098965464034609e-06, "loss": 0.1463, "step": 11310 }, { "epoch": 3.8137535816618913, "grad_norm": 1.3161589516680494, "learning_rate": 1.406099574483782e-06, "loss": 0.1242, "step": 11315 }, { "epoch": 3.8154390696106524, "grad_norm": 1.3572046560700748, "learning_rate": 1.4023068855291082e-06, "loss": 0.1351, "step": 11320 }, { "epoch": 3.8171245575594135, "grad_norm": 1.4045816580208925, "learning_rate": 1.3985184840593052e-06, "loss": 0.1121, "step": 11325 }, { "epoch": 3.8188100455081746, "grad_norm": 1.283108073289214, "learning_rate": 1.394734374589133e-06, "loss": 0.1325, "step": 11330 }, { "epoch": 3.8204955334569357, "grad_norm": 1.791579105416178, "learning_rate": 1.390954561628236e-06, "loss": 0.1288, "step": 11335 }, { "epoch": 3.822181021405697, "grad_norm": 1.3451170525968954, "learning_rate": 1.3871790496811356e-06, "loss": 0.1363, "step": 11340 }, { "epoch": 3.823866509354458, "grad_norm": 1.3587677057777976, "learning_rate": 1.3834078432472292e-06, "loss": 0.1272, "step": 11345 }, { "epoch": 3.8255519973032195, "grad_norm": 1.2621330786954483, "learning_rate": 1.379640946820781e-06, "loss": 0.1553, "step": 11350 }, { "epoch": 3.8272374852519806, "grad_norm": 1.2749397547430936, "learning_rate": 1.3758783648909246e-06, "loss": 0.1302, "step": 11355 }, { "epoch": 3.8289229732007417, "grad_norm": 1.360132376641049, "learning_rate": 1.3721201019416458e-06, "loss": 0.1134, "step": 11360 }, { "epoch": 3.830608461149503, "grad_norm": 1.3463797002311013, "learning_rate": 1.3683661624517847e-06, "loss": 0.1339, "step": 11365 }, { "epoch": 3.832293949098264, "grad_norm": 1.1656547531021166, "learning_rate": 1.364616550895031e-06, "loss": 0.1217, "step": 11370 }, { "epoch": 3.833979437047025, "grad_norm": 1.2493967090178022, "learning_rate": 1.3608712717399174e-06, "loss": 0.1273, "step": 11375 }, { "epoch": 3.835664924995786, "grad_norm": 1.3225994130273213, "learning_rate": 1.35713032944981e-06, "loss": 0.1312, "step": 11380 }, { "epoch": 3.8373504129445477, "grad_norm": 1.4418769066940484, "learning_rate": 1.353393728482909e-06, "loss": 0.1313, "step": 11385 }, { "epoch": 3.8390359008933084, "grad_norm": 1.4818666743645907, "learning_rate": 1.3496614732922375e-06, "loss": 0.1337, "step": 11390 }, { "epoch": 3.84072138884207, "grad_norm": 1.459798900231498, "learning_rate": 1.3459335683256457e-06, "loss": 0.1485, "step": 11395 }, { "epoch": 3.842406876790831, "grad_norm": 1.4538320839709264, "learning_rate": 1.3422100180257936e-06, "loss": 0.1604, "step": 11400 }, { "epoch": 3.844092364739592, "grad_norm": 1.7742149272289591, "learning_rate": 1.3384908268301566e-06, "loss": 0.1211, "step": 11405 }, { "epoch": 3.8457778526883533, "grad_norm": 1.1933454350246195, "learning_rate": 1.3347759991710109e-06, "loss": 0.1263, "step": 11410 }, { "epoch": 3.8474633406371144, "grad_norm": 1.4852947422766494, "learning_rate": 1.3310655394754335e-06, "loss": 0.1281, "step": 11415 }, { "epoch": 3.8491488285858755, "grad_norm": 1.4595205492510024, "learning_rate": 1.3273594521652994e-06, "loss": 0.138, "step": 11420 }, { "epoch": 3.8508343165346366, "grad_norm": 1.570597937583335, "learning_rate": 1.3236577416572682e-06, "loss": 0.1359, "step": 11425 }, { "epoch": 3.852519804483398, "grad_norm": 1.3041151431665867, "learning_rate": 1.319960412362785e-06, "loss": 0.1439, "step": 11430 }, { "epoch": 3.8542052924321593, "grad_norm": 1.1223750056339685, "learning_rate": 1.3162674686880778e-06, "loss": 0.1207, "step": 11435 }, { "epoch": 3.8558907803809204, "grad_norm": 1.6708025803319664, "learning_rate": 1.31257891503414e-06, "loss": 0.1439, "step": 11440 }, { "epoch": 3.8575762683296815, "grad_norm": 1.547027523067389, "learning_rate": 1.3088947557967412e-06, "loss": 0.135, "step": 11445 }, { "epoch": 3.8592617562784426, "grad_norm": 1.3025590668689755, "learning_rate": 1.3052149953664107e-06, "loss": 0.1371, "step": 11450 }, { "epoch": 3.8609472442272037, "grad_norm": 1.4324892259690334, "learning_rate": 1.3015396381284317e-06, "loss": 0.1335, "step": 11455 }, { "epoch": 3.862632732175965, "grad_norm": 1.3895613413035335, "learning_rate": 1.2978686884628489e-06, "loss": 0.121, "step": 11460 }, { "epoch": 3.8643182201247264, "grad_norm": 1.3093726468294993, "learning_rate": 1.2942021507444475e-06, "loss": 0.1215, "step": 11465 }, { "epoch": 3.866003708073487, "grad_norm": 1.450148805180268, "learning_rate": 1.2905400293427555e-06, "loss": 0.1363, "step": 11470 }, { "epoch": 3.8676891960222486, "grad_norm": 1.2912630339556102, "learning_rate": 1.2868823286220372e-06, "loss": 0.1119, "step": 11475 }, { "epoch": 3.8693746839710097, "grad_norm": 1.251171279718913, "learning_rate": 1.2832290529412954e-06, "loss": 0.1342, "step": 11480 }, { "epoch": 3.871060171919771, "grad_norm": 1.4914594893258193, "learning_rate": 1.2795802066542523e-06, "loss": 0.1179, "step": 11485 }, { "epoch": 3.872745659868532, "grad_norm": 1.4830649563474119, "learning_rate": 1.2759357941093536e-06, "loss": 0.1293, "step": 11490 }, { "epoch": 3.874431147817293, "grad_norm": 1.529046490796919, "learning_rate": 1.2722958196497599e-06, "loss": 0.1401, "step": 11495 }, { "epoch": 3.876116635766054, "grad_norm": 1.364320473286898, "learning_rate": 1.2686602876133457e-06, "loss": 0.137, "step": 11500 }, { "epoch": 3.8778021237148153, "grad_norm": 1.2666932819544312, "learning_rate": 1.26502920233269e-06, "loss": 0.1053, "step": 11505 }, { "epoch": 3.879487611663577, "grad_norm": 1.3644748845747552, "learning_rate": 1.2614025681350712e-06, "loss": 0.1279, "step": 11510 }, { "epoch": 3.881173099612338, "grad_norm": 1.3713293207300685, "learning_rate": 1.2577803893424628e-06, "loss": 0.1174, "step": 11515 }, { "epoch": 3.882858587561099, "grad_norm": 1.2780743363842897, "learning_rate": 1.2541626702715316e-06, "loss": 0.1244, "step": 11520 }, { "epoch": 3.88454407550986, "grad_norm": 1.9396492050763243, "learning_rate": 1.2505494152336294e-06, "loss": 0.1284, "step": 11525 }, { "epoch": 3.8862295634586213, "grad_norm": 1.274118183453942, "learning_rate": 1.2469406285347851e-06, "loss": 0.133, "step": 11530 }, { "epoch": 3.8879150514073824, "grad_norm": 1.6720765742990575, "learning_rate": 1.2433363144757037e-06, "loss": 0.1339, "step": 11535 }, { "epoch": 3.8896005393561435, "grad_norm": 1.3845866516122425, "learning_rate": 1.2397364773517618e-06, "loss": 0.157, "step": 11540 }, { "epoch": 3.891286027304905, "grad_norm": 1.331781088462336, "learning_rate": 1.2361411214529995e-06, "loss": 0.1496, "step": 11545 }, { "epoch": 3.8929715152536657, "grad_norm": 1.0935379891948103, "learning_rate": 1.2325502510641135e-06, "loss": 0.1165, "step": 11550 }, { "epoch": 3.8946570032024272, "grad_norm": 1.5071622175057107, "learning_rate": 1.2289638704644612e-06, "loss": 0.14, "step": 11555 }, { "epoch": 3.8963424911511884, "grad_norm": 1.328602243598805, "learning_rate": 1.2253819839280435e-06, "loss": 0.1403, "step": 11560 }, { "epoch": 3.8980279790999495, "grad_norm": 1.2820953430032531, "learning_rate": 1.221804595723511e-06, "loss": 0.1464, "step": 11565 }, { "epoch": 3.8997134670487106, "grad_norm": 1.4155052163367552, "learning_rate": 1.2182317101141477e-06, "loss": 0.1527, "step": 11570 }, { "epoch": 3.9013989549974717, "grad_norm": 1.380507169619867, "learning_rate": 1.2146633313578766e-06, "loss": 0.1175, "step": 11575 }, { "epoch": 3.903084442946233, "grad_norm": 1.14637493772561, "learning_rate": 1.2110994637072448e-06, "loss": 0.1168, "step": 11580 }, { "epoch": 3.904769930894994, "grad_norm": 1.2904859905703334, "learning_rate": 1.2075401114094303e-06, "loss": 0.1238, "step": 11585 }, { "epoch": 3.9064554188437555, "grad_norm": 1.379140301488519, "learning_rate": 1.2039852787062222e-06, "loss": 0.1157, "step": 11590 }, { "epoch": 3.9081409067925166, "grad_norm": 1.779540026975358, "learning_rate": 1.2004349698340307e-06, "loss": 0.1331, "step": 11595 }, { "epoch": 3.9098263947412777, "grad_norm": 1.2923324777083516, "learning_rate": 1.1968891890238681e-06, "loss": 0.1296, "step": 11600 }, { "epoch": 3.911511882690039, "grad_norm": 1.483539619591517, "learning_rate": 1.193347940501357e-06, "loss": 0.1292, "step": 11605 }, { "epoch": 3.9131973706388, "grad_norm": 1.27187117436411, "learning_rate": 1.1898112284867137e-06, "loss": 0.119, "step": 11610 }, { "epoch": 3.914882858587561, "grad_norm": 1.409407748050843, "learning_rate": 1.1862790571947502e-06, "loss": 0.134, "step": 11615 }, { "epoch": 3.916568346536322, "grad_norm": 1.3809295704352462, "learning_rate": 1.1827514308348652e-06, "loss": 0.1378, "step": 11620 }, { "epoch": 3.9182538344850837, "grad_norm": 1.1843572362723038, "learning_rate": 1.1792283536110444e-06, "loss": 0.1291, "step": 11625 }, { "epoch": 3.9199393224338444, "grad_norm": 1.417513753472395, "learning_rate": 1.1757098297218523e-06, "loss": 0.137, "step": 11630 }, { "epoch": 3.921624810382606, "grad_norm": 1.2719598733782485, "learning_rate": 1.172195863360423e-06, "loss": 0.1504, "step": 11635 }, { "epoch": 3.923310298331367, "grad_norm": 1.5040447813022308, "learning_rate": 1.1686864587144614e-06, "loss": 0.1341, "step": 11640 }, { "epoch": 3.924995786280128, "grad_norm": 1.64275717096149, "learning_rate": 1.165181619966238e-06, "loss": 0.1306, "step": 11645 }, { "epoch": 3.9266812742288892, "grad_norm": 1.0752213608872785, "learning_rate": 1.161681351292579e-06, "loss": 0.1216, "step": 11650 }, { "epoch": 3.9283667621776504, "grad_norm": 1.2738139268445927, "learning_rate": 1.1581856568648658e-06, "loss": 0.1273, "step": 11655 }, { "epoch": 3.9300522501264115, "grad_norm": 1.1415927007120943, "learning_rate": 1.1546945408490267e-06, "loss": 0.1439, "step": 11660 }, { "epoch": 3.9317377380751726, "grad_norm": 1.48432947909629, "learning_rate": 1.1512080074055365e-06, "loss": 0.1314, "step": 11665 }, { "epoch": 3.933423226023934, "grad_norm": 1.5177086660710022, "learning_rate": 1.1477260606894091e-06, "loss": 0.1317, "step": 11670 }, { "epoch": 3.9351087139726952, "grad_norm": 1.2480924669984619, "learning_rate": 1.1442487048501888e-06, "loss": 0.1179, "step": 11675 }, { "epoch": 3.9367942019214563, "grad_norm": 1.1673574109519713, "learning_rate": 1.1407759440319504e-06, "loss": 0.1116, "step": 11680 }, { "epoch": 3.9384796898702175, "grad_norm": 1.189692540307074, "learning_rate": 1.1373077823732948e-06, "loss": 0.1401, "step": 11685 }, { "epoch": 3.9401651778189786, "grad_norm": 1.2474132437518495, "learning_rate": 1.1338442240073395e-06, "loss": 0.1399, "step": 11690 }, { "epoch": 3.9418506657677397, "grad_norm": 1.3841753764772589, "learning_rate": 1.130385273061716e-06, "loss": 0.1153, "step": 11695 }, { "epoch": 3.943536153716501, "grad_norm": 1.3670410936056547, "learning_rate": 1.1269309336585648e-06, "loss": 0.121, "step": 11700 }, { "epoch": 3.9452216416652623, "grad_norm": 4.2566426506681525, "learning_rate": 1.123481209914533e-06, "loss": 0.1247, "step": 11705 }, { "epoch": 3.946907129614023, "grad_norm": 1.2549136289873755, "learning_rate": 1.1200361059407665e-06, "loss": 0.1192, "step": 11710 }, { "epoch": 3.9485926175627846, "grad_norm": 1.4272672440381446, "learning_rate": 1.116595625842905e-06, "loss": 0.15, "step": 11715 }, { "epoch": 3.9502781055115457, "grad_norm": 1.2059470758704138, "learning_rate": 1.1131597737210758e-06, "loss": 0.1389, "step": 11720 }, { "epoch": 3.951963593460307, "grad_norm": 1.4754281793219044, "learning_rate": 1.1097285536698922e-06, "loss": 0.1324, "step": 11725 }, { "epoch": 3.953649081409068, "grad_norm": 1.2141992362969727, "learning_rate": 1.106301969778451e-06, "loss": 0.111, "step": 11730 }, { "epoch": 3.955334569357829, "grad_norm": 1.2918810114814279, "learning_rate": 1.1028800261303186e-06, "loss": 0.1549, "step": 11735 }, { "epoch": 3.95702005730659, "grad_norm": 1.461639333770653, "learning_rate": 1.0994627268035324e-06, "loss": 0.1265, "step": 11740 }, { "epoch": 3.9587055452553512, "grad_norm": 1.7712147161462113, "learning_rate": 1.0960500758705983e-06, "loss": 0.1234, "step": 11745 }, { "epoch": 3.960391033204113, "grad_norm": 1.2394532177536923, "learning_rate": 1.0926420773984813e-06, "loss": 0.1277, "step": 11750 }, { "epoch": 3.962076521152874, "grad_norm": 1.3595241710627732, "learning_rate": 1.0892387354486006e-06, "loss": 0.1334, "step": 11755 }, { "epoch": 3.963762009101635, "grad_norm": 1.3596726580758596, "learning_rate": 1.0858400540768255e-06, "loss": 0.1183, "step": 11760 }, { "epoch": 3.965447497050396, "grad_norm": 1.3995731294824745, "learning_rate": 1.0824460373334716e-06, "loss": 0.1298, "step": 11765 }, { "epoch": 3.9671329849991572, "grad_norm": 1.585776588487857, "learning_rate": 1.0790566892632986e-06, "loss": 0.129, "step": 11770 }, { "epoch": 3.9688184729479183, "grad_norm": 1.3895099701140083, "learning_rate": 1.0756720139054976e-06, "loss": 0.1348, "step": 11775 }, { "epoch": 3.9705039608966795, "grad_norm": 1.500716624265993, "learning_rate": 1.072292015293696e-06, "loss": 0.1283, "step": 11780 }, { "epoch": 3.972189448845441, "grad_norm": 1.3111959478401685, "learning_rate": 1.0689166974559427e-06, "loss": 0.12, "step": 11785 }, { "epoch": 3.9738749367942017, "grad_norm": 1.3140798821113782, "learning_rate": 1.065546064414713e-06, "loss": 0.1284, "step": 11790 }, { "epoch": 3.9755604247429632, "grad_norm": 1.3794025142465844, "learning_rate": 1.062180120186897e-06, "loss": 0.127, "step": 11795 }, { "epoch": 3.9772459126917243, "grad_norm": 1.235243365041627, "learning_rate": 1.0588188687837953e-06, "loss": 0.1025, "step": 11800 }, { "epoch": 3.9789314006404854, "grad_norm": 1.3093075471981128, "learning_rate": 1.055462314211118e-06, "loss": 0.1162, "step": 11805 }, { "epoch": 3.9806168885892466, "grad_norm": 1.4024378987390371, "learning_rate": 1.0521104604689792e-06, "loss": 0.1495, "step": 11810 }, { "epoch": 3.9823023765380077, "grad_norm": 1.3548800011731634, "learning_rate": 1.0487633115518869e-06, "loss": 0.1196, "step": 11815 }, { "epoch": 3.983987864486769, "grad_norm": 1.3027882530451373, "learning_rate": 1.0454208714487475e-06, "loss": 0.1236, "step": 11820 }, { "epoch": 3.98567335243553, "grad_norm": 1.2656768758124717, "learning_rate": 1.0420831441428502e-06, "loss": 0.1268, "step": 11825 }, { "epoch": 3.9873588403842914, "grad_norm": 1.28518948121888, "learning_rate": 1.0387501336118712e-06, "loss": 0.1151, "step": 11830 }, { "epoch": 3.9890443283330526, "grad_norm": 1.2394477069103667, "learning_rate": 1.0354218438278652e-06, "loss": 0.1432, "step": 11835 }, { "epoch": 3.9907298162818137, "grad_norm": 1.2215048085667828, "learning_rate": 1.0320982787572603e-06, "loss": 0.1271, "step": 11840 }, { "epoch": 3.992415304230575, "grad_norm": 1.194697649390314, "learning_rate": 1.0287794423608532e-06, "loss": 0.12, "step": 11845 }, { "epoch": 3.994100792179336, "grad_norm": 1.5046819102871702, "learning_rate": 1.0254653385938074e-06, "loss": 0.1197, "step": 11850 }, { "epoch": 3.995786280128097, "grad_norm": 1.2649916827451366, "learning_rate": 1.0221559714056462e-06, "loss": 0.1251, "step": 11855 }, { "epoch": 3.997471768076858, "grad_norm": 1.5053061115636008, "learning_rate": 1.018851344740247e-06, "loss": 0.1294, "step": 11860 }, { "epoch": 3.9991572560256197, "grad_norm": 1.377029121737187, "learning_rate": 1.015551462535837e-06, "loss": 0.1253, "step": 11865 }, { "epoch": 4.000674195179505, "grad_norm": 1.2686397699680958, "learning_rate": 1.0122563287249903e-06, "loss": 0.1323, "step": 11870 }, { "epoch": 4.002359683128265, "grad_norm": 1.1816811217186254, "learning_rate": 1.0089659472346241e-06, "loss": 0.1159, "step": 11875 }, { "epoch": 4.004045171077027, "grad_norm": 1.200704112220642, "learning_rate": 1.005680321985989e-06, "loss": 0.0981, "step": 11880 }, { "epoch": 4.005730659025788, "grad_norm": 1.2283603724405283, "learning_rate": 1.0023994568946682e-06, "loss": 0.1105, "step": 11885 }, { "epoch": 4.007416146974549, "grad_norm": 1.2806739817525785, "learning_rate": 9.991233558705716e-07, "loss": 0.0968, "step": 11890 }, { "epoch": 4.009101634923311, "grad_norm": 1.3201740651821476, "learning_rate": 9.958520228179364e-07, "loss": 0.1114, "step": 11895 }, { "epoch": 4.010787122872071, "grad_norm": 1.2684535246540378, "learning_rate": 9.925854616353115e-07, "loss": 0.11, "step": 11900 }, { "epoch": 4.012472610820833, "grad_norm": 1.2094872874491596, "learning_rate": 9.893236762155611e-07, "loss": 0.1169, "step": 11905 }, { "epoch": 4.0141580987695935, "grad_norm": 1.1351603996712527, "learning_rate": 9.860666704458578e-07, "loss": 0.1142, "step": 11910 }, { "epoch": 4.015843586718355, "grad_norm": 1.3013391760812418, "learning_rate": 9.828144482076807e-07, "loss": 0.1059, "step": 11915 }, { "epoch": 4.017529074667116, "grad_norm": 1.211883041428162, "learning_rate": 9.795670133768047e-07, "loss": 0.1143, "step": 11920 }, { "epoch": 4.019214562615877, "grad_norm": 1.4530157299174902, "learning_rate": 9.763243698232994e-07, "loss": 0.1187, "step": 11925 }, { "epoch": 4.020900050564639, "grad_norm": 1.3619804571974272, "learning_rate": 9.730865214115288e-07, "loss": 0.1057, "step": 11930 }, { "epoch": 4.0225855385133995, "grad_norm": 1.517925247056964, "learning_rate": 9.698534720001362e-07, "loss": 0.1152, "step": 11935 }, { "epoch": 4.024271026462161, "grad_norm": 1.4364199955195414, "learning_rate": 9.666252254420526e-07, "loss": 0.1121, "step": 11940 }, { "epoch": 4.025956514410922, "grad_norm": 1.5904175933559828, "learning_rate": 9.634017855844796e-07, "loss": 0.1206, "step": 11945 }, { "epoch": 4.027642002359683, "grad_norm": 1.4386968806132958, "learning_rate": 9.60183156268892e-07, "loss": 0.117, "step": 11950 }, { "epoch": 4.029327490308444, "grad_norm": 1.0907356500333152, "learning_rate": 9.569693413310338e-07, "loss": 0.1214, "step": 11955 }, { "epoch": 4.0310129782572055, "grad_norm": 1.366651038847322, "learning_rate": 9.537603446009098e-07, "loss": 0.104, "step": 11960 }, { "epoch": 4.032698466205967, "grad_norm": 1.4222103737274834, "learning_rate": 9.505561699027816e-07, "loss": 0.0969, "step": 11965 }, { "epoch": 4.034383954154728, "grad_norm": 1.4179261951176476, "learning_rate": 9.473568210551681e-07, "loss": 0.1104, "step": 11970 }, { "epoch": 4.036069442103489, "grad_norm": 1.3820881706297394, "learning_rate": 9.441623018708318e-07, "loss": 0.1041, "step": 11975 }, { "epoch": 4.03775493005225, "grad_norm": 1.3535859735670328, "learning_rate": 9.409726161567856e-07, "loss": 0.1168, "step": 11980 }, { "epoch": 4.0394404180010115, "grad_norm": 1.216653765426924, "learning_rate": 9.377877677142777e-07, "loss": 0.1039, "step": 11985 }, { "epoch": 4.041125905949772, "grad_norm": 1.450819171303049, "learning_rate": 9.346077603387915e-07, "loss": 0.1088, "step": 11990 }, { "epoch": 4.042811393898534, "grad_norm": 1.3264255159063292, "learning_rate": 9.314325978200451e-07, "loss": 0.1129, "step": 11995 }, { "epoch": 4.044496881847294, "grad_norm": 1.247752590631787, "learning_rate": 9.282622839419775e-07, "loss": 0.1127, "step": 12000 }, { "epoch": 4.046182369796056, "grad_norm": 1.2718654651906907, "learning_rate": 9.250968224827544e-07, "loss": 0.1199, "step": 12005 }, { "epoch": 4.0478678577448175, "grad_norm": 1.2784470200033737, "learning_rate": 9.219362172147567e-07, "loss": 0.1164, "step": 12010 }, { "epoch": 4.049553345693578, "grad_norm": 1.275011129776298, "learning_rate": 9.187804719045751e-07, "loss": 0.1103, "step": 12015 }, { "epoch": 4.05123883364234, "grad_norm": 1.3483311814842633, "learning_rate": 9.156295903130141e-07, "loss": 0.1159, "step": 12020 }, { "epoch": 4.0529243215911, "grad_norm": 1.5365317528571418, "learning_rate": 9.124835761950784e-07, "loss": 0.1181, "step": 12025 }, { "epoch": 4.054609809539862, "grad_norm": 8.089273619285802, "learning_rate": 9.093424332999723e-07, "loss": 0.1051, "step": 12030 }, { "epoch": 4.056295297488623, "grad_norm": 1.412741400622911, "learning_rate": 9.06206165371094e-07, "loss": 0.0904, "step": 12035 }, { "epoch": 4.057980785437384, "grad_norm": 1.3451991617246908, "learning_rate": 9.030747761460351e-07, "loss": 0.123, "step": 12040 }, { "epoch": 4.059666273386146, "grad_norm": 1.3706355009912603, "learning_rate": 8.99948269356572e-07, "loss": 0.1072, "step": 12045 }, { "epoch": 4.061351761334906, "grad_norm": 1.2813393553915857, "learning_rate": 8.968266487286609e-07, "loss": 0.1042, "step": 12050 }, { "epoch": 4.063037249283668, "grad_norm": 1.393422851426227, "learning_rate": 8.937099179824343e-07, "loss": 0.1081, "step": 12055 }, { "epoch": 4.064722737232429, "grad_norm": 1.280841462377404, "learning_rate": 8.905980808322029e-07, "loss": 0.1203, "step": 12060 }, { "epoch": 4.06640822518119, "grad_norm": 1.8849739487073585, "learning_rate": 8.874911409864384e-07, "loss": 0.1233, "step": 12065 }, { "epoch": 4.068093713129951, "grad_norm": 1.0980679250876781, "learning_rate": 8.843891021477813e-07, "loss": 0.1058, "step": 12070 }, { "epoch": 4.069779201078712, "grad_norm": 2.0950751356984587, "learning_rate": 8.812919680130272e-07, "loss": 0.1375, "step": 12075 }, { "epoch": 4.071464689027473, "grad_norm": 1.2036312089080325, "learning_rate": 8.781997422731304e-07, "loss": 0.113, "step": 12080 }, { "epoch": 4.073150176976235, "grad_norm": 1.6339619258398572, "learning_rate": 8.751124286131957e-07, "loss": 0.1045, "step": 12085 }, { "epoch": 4.074835664924996, "grad_norm": 1.4342407357213185, "learning_rate": 8.720300307124712e-07, "loss": 0.1166, "step": 12090 }, { "epoch": 4.076521152873757, "grad_norm": 1.2787393231188653, "learning_rate": 8.689525522443471e-07, "loss": 0.1168, "step": 12095 }, { "epoch": 4.078206640822518, "grad_norm": 1.5026794288218963, "learning_rate": 8.658799968763548e-07, "loss": 0.1125, "step": 12100 }, { "epoch": 4.079892128771279, "grad_norm": 1.4636374697221322, "learning_rate": 8.628123682701533e-07, "loss": 0.1222, "step": 12105 }, { "epoch": 4.081577616720041, "grad_norm": 1.3854446033934944, "learning_rate": 8.597496700815344e-07, "loss": 0.1018, "step": 12110 }, { "epoch": 4.083263104668801, "grad_norm": 1.3314914384548686, "learning_rate": 8.566919059604106e-07, "loss": 0.1296, "step": 12115 }, { "epoch": 4.084948592617563, "grad_norm": 1.2830041524416362, "learning_rate": 8.536390795508176e-07, "loss": 0.1069, "step": 12120 }, { "epoch": 4.086634080566324, "grad_norm": 1.220959008225962, "learning_rate": 8.505911944909062e-07, "loss": 0.1003, "step": 12125 }, { "epoch": 4.088319568515085, "grad_norm": 1.1826856191408797, "learning_rate": 8.475482544129371e-07, "loss": 0.099, "step": 12130 }, { "epoch": 4.090005056463847, "grad_norm": 1.2736778287384793, "learning_rate": 8.445102629432778e-07, "loss": 0.0967, "step": 12135 }, { "epoch": 4.091690544412607, "grad_norm": 1.4482792394902526, "learning_rate": 8.414772237023982e-07, "loss": 0.0951, "step": 12140 }, { "epoch": 4.093376032361369, "grad_norm": 3.367883128370919, "learning_rate": 8.384491403048694e-07, "loss": 0.1244, "step": 12145 }, { "epoch": 4.0950615203101295, "grad_norm": 1.1690680983435247, "learning_rate": 8.354260163593519e-07, "loss": 0.1062, "step": 12150 }, { "epoch": 4.096747008258891, "grad_norm": 1.2947556372949949, "learning_rate": 8.32407855468601e-07, "loss": 0.1059, "step": 12155 }, { "epoch": 4.098432496207652, "grad_norm": 1.5373405572019554, "learning_rate": 8.293946612294523e-07, "loss": 0.1234, "step": 12160 }, { "epoch": 4.100117984156413, "grad_norm": 1.1500599400608853, "learning_rate": 8.263864372328268e-07, "loss": 0.114, "step": 12165 }, { "epoch": 4.101803472105175, "grad_norm": 1.3361032011907505, "learning_rate": 8.233831870637188e-07, "loss": 0.1142, "step": 12170 }, { "epoch": 4.1034889600539355, "grad_norm": 1.3763397653453309, "learning_rate": 8.203849143011977e-07, "loss": 0.1076, "step": 12175 }, { "epoch": 4.105174448002697, "grad_norm": 1.1943015358381728, "learning_rate": 8.173916225183987e-07, "loss": 0.108, "step": 12180 }, { "epoch": 4.106859935951458, "grad_norm": 1.4212257971050177, "learning_rate": 8.144033152825243e-07, "loss": 0.1163, "step": 12185 }, { "epoch": 4.108545423900219, "grad_norm": 1.2792428668113918, "learning_rate": 8.11419996154833e-07, "loss": 0.1192, "step": 12190 }, { "epoch": 4.11023091184898, "grad_norm": 1.414385709276504, "learning_rate": 8.084416686906426e-07, "loss": 0.097, "step": 12195 }, { "epoch": 4.1119163997977415, "grad_norm": 1.280707167684563, "learning_rate": 8.054683364393185e-07, "loss": 0.0976, "step": 12200 }, { "epoch": 4.113601887746503, "grad_norm": 1.3413973918750335, "learning_rate": 8.025000029442776e-07, "loss": 0.1035, "step": 12205 }, { "epoch": 4.115287375695264, "grad_norm": 1.1956122736138755, "learning_rate": 7.995366717429748e-07, "loss": 0.1009, "step": 12210 }, { "epoch": 4.116972863644025, "grad_norm": 1.351190451029454, "learning_rate": 7.965783463669063e-07, "loss": 0.1123, "step": 12215 }, { "epoch": 4.118658351592786, "grad_norm": 1.5044448299648419, "learning_rate": 7.936250303416009e-07, "loss": 0.1258, "step": 12220 }, { "epoch": 4.1203438395415475, "grad_norm": 1.2845839295722064, "learning_rate": 7.906767271866206e-07, "loss": 0.1124, "step": 12225 }, { "epoch": 4.122029327490308, "grad_norm": 1.225063006901401, "learning_rate": 7.877334404155518e-07, "loss": 0.0999, "step": 12230 }, { "epoch": 4.12371481543907, "grad_norm": 1.6545267890659516, "learning_rate": 7.847951735360021e-07, "loss": 0.12, "step": 12235 }, { "epoch": 4.12540030338783, "grad_norm": 1.2484653665854208, "learning_rate": 7.818619300495978e-07, "loss": 0.0978, "step": 12240 }, { "epoch": 4.127085791336592, "grad_norm": 1.1903596648167274, "learning_rate": 7.789337134519759e-07, "loss": 0.1039, "step": 12245 }, { "epoch": 4.1287712792853535, "grad_norm": 1.338620620795788, "learning_rate": 7.760105272327872e-07, "loss": 0.1177, "step": 12250 }, { "epoch": 4.130456767234114, "grad_norm": 1.2803492314820784, "learning_rate": 7.730923748756852e-07, "loss": 0.1113, "step": 12255 }, { "epoch": 4.132142255182876, "grad_norm": 1.4965664188299521, "learning_rate": 7.701792598583224e-07, "loss": 0.1019, "step": 12260 }, { "epoch": 4.133827743131636, "grad_norm": 1.2116510236214313, "learning_rate": 7.672711856523518e-07, "loss": 0.1174, "step": 12265 }, { "epoch": 4.135513231080398, "grad_norm": 1.2323286887036327, "learning_rate": 7.643681557234189e-07, "loss": 0.1191, "step": 12270 }, { "epoch": 4.137198719029159, "grad_norm": 1.2903856968737564, "learning_rate": 7.614701735311552e-07, "loss": 0.0945, "step": 12275 }, { "epoch": 4.13888420697792, "grad_norm": 10.433753242491871, "learning_rate": 7.585772425291776e-07, "loss": 0.0992, "step": 12280 }, { "epoch": 4.140569694926682, "grad_norm": 1.2233233500202267, "learning_rate": 7.556893661650827e-07, "loss": 0.1249, "step": 12285 }, { "epoch": 4.142255182875442, "grad_norm": 1.2130226964082333, "learning_rate": 7.528065478804463e-07, "loss": 0.0972, "step": 12290 }, { "epoch": 4.143940670824204, "grad_norm": 1.2063408932815303, "learning_rate": 7.499287911108132e-07, "loss": 0.1167, "step": 12295 }, { "epoch": 4.145626158772965, "grad_norm": 1.30741382337855, "learning_rate": 7.470560992856984e-07, "loss": 0.112, "step": 12300 }, { "epoch": 4.147311646721726, "grad_norm": 1.9204573661896698, "learning_rate": 7.441884758285756e-07, "loss": 0.1189, "step": 12305 }, { "epoch": 4.148997134670487, "grad_norm": 1.6452883785762646, "learning_rate": 7.413259241568887e-07, "loss": 0.1306, "step": 12310 }, { "epoch": 4.150682622619248, "grad_norm": 1.3835192185313165, "learning_rate": 7.384684476820281e-07, "loss": 0.1264, "step": 12315 }, { "epoch": 4.152368110568009, "grad_norm": 1.6241808130288877, "learning_rate": 7.35616049809339e-07, "loss": 0.114, "step": 12320 }, { "epoch": 4.154053598516771, "grad_norm": 1.3217625507298068, "learning_rate": 7.32768733938114e-07, "loss": 0.1104, "step": 12325 }, { "epoch": 4.155739086465532, "grad_norm": 1.1639592250004458, "learning_rate": 7.29926503461591e-07, "loss": 0.0985, "step": 12330 }, { "epoch": 4.157424574414293, "grad_norm": 1.3164239348969922, "learning_rate": 7.27089361766945e-07, "loss": 0.1089, "step": 12335 }, { "epoch": 4.159110062363054, "grad_norm": 1.2830066976673342, "learning_rate": 7.242573122352875e-07, "loss": 0.1108, "step": 12340 }, { "epoch": 4.160795550311815, "grad_norm": 1.4142314237515623, "learning_rate": 7.214303582416626e-07, "loss": 0.1172, "step": 12345 }, { "epoch": 4.162481038260577, "grad_norm": 1.3441874811916532, "learning_rate": 7.1860850315504e-07, "loss": 0.1071, "step": 12350 }, { "epoch": 4.164166526209337, "grad_norm": 1.2360749017856782, "learning_rate": 7.157917503383149e-07, "loss": 0.129, "step": 12355 }, { "epoch": 4.165852014158099, "grad_norm": 1.5174335899130909, "learning_rate": 7.129801031483008e-07, "loss": 0.1179, "step": 12360 }, { "epoch": 4.16753750210686, "grad_norm": 1.3359049180292741, "learning_rate": 7.101735649357244e-07, "loss": 0.1019, "step": 12365 }, { "epoch": 4.169222990055621, "grad_norm": 1.9062074251066823, "learning_rate": 7.073721390452298e-07, "loss": 0.1171, "step": 12370 }, { "epoch": 4.170908478004383, "grad_norm": 1.3360549920596059, "learning_rate": 7.045758288153631e-07, "loss": 0.0992, "step": 12375 }, { "epoch": 4.172593965953143, "grad_norm": 1.2666268233549498, "learning_rate": 7.017846375785742e-07, "loss": 0.1308, "step": 12380 }, { "epoch": 4.174279453901905, "grad_norm": 1.6136590144629126, "learning_rate": 6.989985686612177e-07, "loss": 0.1075, "step": 12385 }, { "epoch": 4.1759649418506655, "grad_norm": 1.4203178557637688, "learning_rate": 6.962176253835367e-07, "loss": 0.1004, "step": 12390 }, { "epoch": 4.177650429799427, "grad_norm": 1.5148294344863817, "learning_rate": 6.934418110596725e-07, "loss": 0.1307, "step": 12395 }, { "epoch": 4.179335917748188, "grad_norm": 1.6901766383164198, "learning_rate": 6.906711289976492e-07, "loss": 0.1245, "step": 12400 }, { "epoch": 4.181021405696949, "grad_norm": 1.3052650695023462, "learning_rate": 6.879055824993758e-07, "loss": 0.1147, "step": 12405 }, { "epoch": 4.182706893645711, "grad_norm": 1.4832200914996088, "learning_rate": 6.851451748606436e-07, "loss": 0.099, "step": 12410 }, { "epoch": 4.1843923815944715, "grad_norm": 1.300074399510205, "learning_rate": 6.823899093711161e-07, "loss": 0.1138, "step": 12415 }, { "epoch": 4.186077869543233, "grad_norm": 1.3801413673802827, "learning_rate": 6.79639789314332e-07, "loss": 0.1055, "step": 12420 }, { "epoch": 4.187763357491994, "grad_norm": 1.4405626274680872, "learning_rate": 6.768948179676959e-07, "loss": 0.1147, "step": 12425 }, { "epoch": 4.189448845440755, "grad_norm": 1.2885362808046412, "learning_rate": 6.741549986024759e-07, "loss": 0.102, "step": 12430 }, { "epoch": 4.191134333389516, "grad_norm": 1.2631602822200947, "learning_rate": 6.714203344838033e-07, "loss": 0.097, "step": 12435 }, { "epoch": 4.1928198213382775, "grad_norm": 1.322156569688669, "learning_rate": 6.686908288706639e-07, "loss": 0.1068, "step": 12440 }, { "epoch": 4.194505309287039, "grad_norm": 1.3443668712005117, "learning_rate": 6.659664850158948e-07, "loss": 0.1108, "step": 12445 }, { "epoch": 4.1961907972358, "grad_norm": 1.4522694124226216, "learning_rate": 6.632473061661831e-07, "loss": 0.1061, "step": 12450 }, { "epoch": 4.197876285184561, "grad_norm": 1.8908127442697675, "learning_rate": 6.605332955620603e-07, "loss": 0.1261, "step": 12455 }, { "epoch": 4.199561773133322, "grad_norm": 1.7695866843146077, "learning_rate": 6.578244564379005e-07, "loss": 0.1212, "step": 12460 }, { "epoch": 4.2012472610820835, "grad_norm": 1.5133609075771488, "learning_rate": 6.551207920219121e-07, "loss": 0.1162, "step": 12465 }, { "epoch": 4.202932749030844, "grad_norm": 1.3015009625368679, "learning_rate": 6.524223055361362e-07, "loss": 0.1092, "step": 12470 }, { "epoch": 4.204618236979606, "grad_norm": 1.4631592452147437, "learning_rate": 6.497290001964468e-07, "loss": 0.1347, "step": 12475 }, { "epoch": 4.206303724928366, "grad_norm": 1.3343524272579732, "learning_rate": 6.470408792125404e-07, "loss": 0.1179, "step": 12480 }, { "epoch": 4.207989212877128, "grad_norm": 1.3770180880579295, "learning_rate": 6.443579457879362e-07, "loss": 0.1039, "step": 12485 }, { "epoch": 4.2096747008258895, "grad_norm": 1.279648638951182, "learning_rate": 6.416802031199693e-07, "loss": 0.1087, "step": 12490 }, { "epoch": 4.21136018877465, "grad_norm": 1.1611456187276283, "learning_rate": 6.39007654399792e-07, "loss": 0.1053, "step": 12495 }, { "epoch": 4.213045676723412, "grad_norm": 1.3885249594275164, "learning_rate": 6.36340302812366e-07, "loss": 0.1244, "step": 12500 }, { "epoch": 4.214731164672172, "grad_norm": 1.4764836670737602, "learning_rate": 6.336781515364576e-07, "loss": 0.1185, "step": 12505 }, { "epoch": 4.216416652620934, "grad_norm": 1.2273309966155903, "learning_rate": 6.310212037446361e-07, "loss": 0.1075, "step": 12510 }, { "epoch": 4.218102140569695, "grad_norm": 1.1630542076146055, "learning_rate": 6.283694626032727e-07, "loss": 0.0988, "step": 12515 }, { "epoch": 4.219787628518456, "grad_norm": 1.5308452378094253, "learning_rate": 6.257229312725293e-07, "loss": 0.109, "step": 12520 }, { "epoch": 4.221473116467218, "grad_norm": 1.3807175156048481, "learning_rate": 6.230816129063621e-07, "loss": 0.1045, "step": 12525 }, { "epoch": 4.223158604415978, "grad_norm": 1.2463396384249752, "learning_rate": 6.204455106525126e-07, "loss": 0.0938, "step": 12530 }, { "epoch": 4.22484409236474, "grad_norm": 1.4154264595396142, "learning_rate": 6.178146276525082e-07, "loss": 0.125, "step": 12535 }, { "epoch": 4.226529580313501, "grad_norm": 1.2074553419962946, "learning_rate": 6.151889670416566e-07, "loss": 0.1112, "step": 12540 }, { "epoch": 4.228215068262262, "grad_norm": 1.3745228772374778, "learning_rate": 6.125685319490399e-07, "loss": 0.1076, "step": 12545 }, { "epoch": 4.229900556211023, "grad_norm": 1.3442914729059194, "learning_rate": 6.099533254975131e-07, "loss": 0.1038, "step": 12550 }, { "epoch": 4.231586044159784, "grad_norm": 1.4682437757162892, "learning_rate": 6.073433508037002e-07, "loss": 0.1189, "step": 12555 }, { "epoch": 4.233271532108545, "grad_norm": 1.3775983777635847, "learning_rate": 6.047386109779929e-07, "loss": 0.122, "step": 12560 }, { "epoch": 4.234957020057307, "grad_norm": 1.4011722199263275, "learning_rate": 6.021391091245394e-07, "loss": 0.1079, "step": 12565 }, { "epoch": 4.236642508006068, "grad_norm": 1.091308302361115, "learning_rate": 5.995448483412514e-07, "loss": 0.0984, "step": 12570 }, { "epoch": 4.238327995954829, "grad_norm": 1.404128023644069, "learning_rate": 5.969558317197882e-07, "loss": 0.1146, "step": 12575 }, { "epoch": 4.24001348390359, "grad_norm": 1.1485118539171184, "learning_rate": 5.943720623455667e-07, "loss": 0.1046, "step": 12580 }, { "epoch": 4.241698971852351, "grad_norm": 1.3958542990358158, "learning_rate": 5.917935432977445e-07, "loss": 0.0974, "step": 12585 }, { "epoch": 4.243384459801113, "grad_norm": 1.42446604239913, "learning_rate": 5.892202776492245e-07, "loss": 0.1127, "step": 12590 }, { "epoch": 4.245069947749873, "grad_norm": 1.3733725009041717, "learning_rate": 5.866522684666487e-07, "loss": 0.097, "step": 12595 }, { "epoch": 4.246755435698635, "grad_norm": 1.3037524065490758, "learning_rate": 5.840895188103963e-07, "loss": 0.103, "step": 12600 }, { "epoch": 4.248440923647396, "grad_norm": 1.1394240155820112, "learning_rate": 5.815320317345758e-07, "loss": 0.1179, "step": 12605 }, { "epoch": 4.250126411596157, "grad_norm": 1.3577679899286457, "learning_rate": 5.789798102870264e-07, "loss": 0.112, "step": 12610 }, { "epoch": 4.251811899544919, "grad_norm": 1.3351097891936905, "learning_rate": 5.764328575093109e-07, "loss": 0.1085, "step": 12615 }, { "epoch": 4.253497387493679, "grad_norm": 1.3149442488122238, "learning_rate": 5.738911764367144e-07, "loss": 0.0944, "step": 12620 }, { "epoch": 4.255182875442441, "grad_norm": 1.3423258175257915, "learning_rate": 5.713547700982385e-07, "loss": 0.0846, "step": 12625 }, { "epoch": 4.2568683633912014, "grad_norm": 1.2079081445483166, "learning_rate": 5.688236415165988e-07, "loss": 0.1101, "step": 12630 }, { "epoch": 4.258553851339963, "grad_norm": 1.3488689182947924, "learning_rate": 5.662977937082204e-07, "loss": 0.1016, "step": 12635 }, { "epoch": 4.260239339288724, "grad_norm": 1.6504673777600523, "learning_rate": 5.637772296832367e-07, "loss": 0.1271, "step": 12640 }, { "epoch": 4.261924827237485, "grad_norm": 1.3793465158474956, "learning_rate": 5.612619524454854e-07, "loss": 0.1313, "step": 12645 }, { "epoch": 4.263610315186247, "grad_norm": 1.185490237774572, "learning_rate": 5.587519649925005e-07, "loss": 0.0898, "step": 12650 }, { "epoch": 4.2652958031350074, "grad_norm": 1.5426151385369398, "learning_rate": 5.562472703155142e-07, "loss": 0.1153, "step": 12655 }, { "epoch": 4.266981291083769, "grad_norm": 1.2605141841919176, "learning_rate": 5.537478713994493e-07, "loss": 0.1182, "step": 12660 }, { "epoch": 4.26866677903253, "grad_norm": 1.5645076718578845, "learning_rate": 5.512537712229199e-07, "loss": 0.1321, "step": 12665 }, { "epoch": 4.270352266981291, "grad_norm": 1.3706046002481285, "learning_rate": 5.487649727582245e-07, "loss": 0.1149, "step": 12670 }, { "epoch": 4.272037754930052, "grad_norm": 1.3480901250235686, "learning_rate": 5.462814789713411e-07, "loss": 0.0937, "step": 12675 }, { "epoch": 4.2737232428788134, "grad_norm": 4.894753355003127, "learning_rate": 5.438032928219289e-07, "loss": 0.1032, "step": 12680 }, { "epoch": 4.275408730827575, "grad_norm": 1.364313335978621, "learning_rate": 5.413304172633227e-07, "loss": 0.1244, "step": 12685 }, { "epoch": 4.277094218776336, "grad_norm": 1.3873558144374436, "learning_rate": 5.388628552425251e-07, "loss": 0.1132, "step": 12690 }, { "epoch": 4.278779706725097, "grad_norm": 1.465437984120428, "learning_rate": 5.364006097002078e-07, "loss": 0.1133, "step": 12695 }, { "epoch": 4.280465194673858, "grad_norm": 1.3070673802862949, "learning_rate": 5.339436835707063e-07, "loss": 0.1072, "step": 12700 }, { "epoch": 4.282150682622619, "grad_norm": 1.5208917895461296, "learning_rate": 5.314920797820189e-07, "loss": 0.101, "step": 12705 }, { "epoch": 4.28383617057138, "grad_norm": 1.341952462702398, "learning_rate": 5.290458012557986e-07, "loss": 0.1182, "step": 12710 }, { "epoch": 4.285521658520142, "grad_norm": 1.2180380643292374, "learning_rate": 5.266048509073518e-07, "loss": 0.1021, "step": 12715 }, { "epoch": 4.287207146468903, "grad_norm": 1.1508451676006477, "learning_rate": 5.241692316456381e-07, "loss": 0.0968, "step": 12720 }, { "epoch": 4.288892634417664, "grad_norm": 1.124393382402687, "learning_rate": 5.217389463732625e-07, "loss": 0.0937, "step": 12725 }, { "epoch": 4.290578122366425, "grad_norm": 1.3368529587946882, "learning_rate": 5.193139979864726e-07, "loss": 0.1104, "step": 12730 }, { "epoch": 4.292263610315186, "grad_norm": 1.3097972966656428, "learning_rate": 5.168943893751549e-07, "loss": 0.1288, "step": 12735 }, { "epoch": 4.293949098263948, "grad_norm": 1.2753472082896762, "learning_rate": 5.144801234228342e-07, "loss": 0.1071, "step": 12740 }, { "epoch": 4.295634586212708, "grad_norm": 1.3107957633148926, "learning_rate": 5.120712030066688e-07, "loss": 0.0907, "step": 12745 }, { "epoch": 4.29732007416147, "grad_norm": 1.2875216625415367, "learning_rate": 5.096676309974447e-07, "loss": 0.1031, "step": 12750 }, { "epoch": 4.2990055621102305, "grad_norm": 1.4514402890633926, "learning_rate": 5.072694102595743e-07, "loss": 0.1259, "step": 12755 }, { "epoch": 4.300691050058992, "grad_norm": 1.3237308164459693, "learning_rate": 5.048765436510933e-07, "loss": 0.108, "step": 12760 }, { "epoch": 4.302376538007754, "grad_norm": 1.300150145012879, "learning_rate": 5.024890340236583e-07, "loss": 0.0981, "step": 12765 }, { "epoch": 4.304062025956514, "grad_norm": 1.3855062505625857, "learning_rate": 5.001068842225387e-07, "loss": 0.1075, "step": 12770 }, { "epoch": 4.305747513905276, "grad_norm": 1.499093249303719, "learning_rate": 4.977300970866184e-07, "loss": 0.105, "step": 12775 }, { "epoch": 4.3074330018540365, "grad_norm": 1.7111121992372054, "learning_rate": 4.953586754483891e-07, "loss": 0.0932, "step": 12780 }, { "epoch": 4.309118489802798, "grad_norm": 1.8555027883170325, "learning_rate": 4.929926221339504e-07, "loss": 0.1282, "step": 12785 }, { "epoch": 4.310803977751559, "grad_norm": 1.4812802235015243, "learning_rate": 4.906319399630011e-07, "loss": 0.0971, "step": 12790 }, { "epoch": 4.31248946570032, "grad_norm": 1.4355396413152655, "learning_rate": 4.882766317488435e-07, "loss": 0.1146, "step": 12795 }, { "epoch": 4.314174953649081, "grad_norm": 1.444068572952506, "learning_rate": 4.859267002983714e-07, "loss": 0.1251, "step": 12800 }, { "epoch": 4.3158604415978425, "grad_norm": 1.278171588280947, "learning_rate": 4.835821484120723e-07, "loss": 0.1161, "step": 12805 }, { "epoch": 4.317545929546604, "grad_norm": 1.4015777366435573, "learning_rate": 4.812429788840245e-07, "loss": 0.1037, "step": 12810 }, { "epoch": 4.319231417495365, "grad_norm": 1.267401729605222, "learning_rate": 4.789091945018892e-07, "loss": 0.105, "step": 12815 }, { "epoch": 4.320916905444126, "grad_norm": 1.0944872659705882, "learning_rate": 4.765807980469106e-07, "loss": 0.1151, "step": 12820 }, { "epoch": 4.322602393392887, "grad_norm": 1.3974443654687387, "learning_rate": 4.74257792293914e-07, "loss": 0.1065, "step": 12825 }, { "epoch": 4.3242878813416485, "grad_norm": 1.3294429319356331, "learning_rate": 4.719401800112977e-07, "loss": 0.1139, "step": 12830 }, { "epoch": 4.325973369290409, "grad_norm": 1.2919635572246846, "learning_rate": 4.6962796396103514e-07, "loss": 0.1005, "step": 12835 }, { "epoch": 4.327658857239171, "grad_norm": 1.387467405554503, "learning_rate": 4.6732114689866713e-07, "loss": 0.1079, "step": 12840 }, { "epoch": 4.329344345187932, "grad_norm": 1.2814156861095354, "learning_rate": 4.6501973157329847e-07, "loss": 0.1046, "step": 12845 }, { "epoch": 4.331029833136693, "grad_norm": 1.632353129530135, "learning_rate": 4.62723720727602e-07, "loss": 0.1008, "step": 12850 }, { "epoch": 4.3327153210854545, "grad_norm": 1.258261859989475, "learning_rate": 4.604331170978049e-07, "loss": 0.1004, "step": 12855 }, { "epoch": 4.334400809034215, "grad_norm": 1.7920244077170024, "learning_rate": 4.581479234136915e-07, "loss": 0.0945, "step": 12860 }, { "epoch": 4.336086296982977, "grad_norm": 1.360200317683404, "learning_rate": 4.558681423985989e-07, "loss": 0.104, "step": 12865 }, { "epoch": 4.337771784931737, "grad_norm": 1.3475770543917587, "learning_rate": 4.5359377676941764e-07, "loss": 0.0963, "step": 12870 }, { "epoch": 4.339457272880499, "grad_norm": 1.255107027167233, "learning_rate": 4.5132482923657904e-07, "loss": 0.11, "step": 12875 }, { "epoch": 4.3411427608292605, "grad_norm": 1.3398219651184247, "learning_rate": 4.4906130250406024e-07, "loss": 0.1127, "step": 12880 }, { "epoch": 4.342828248778021, "grad_norm": 1.282294836209136, "learning_rate": 4.4680319926937666e-07, "loss": 0.1019, "step": 12885 }, { "epoch": 4.344513736726783, "grad_norm": 1.1284975586864117, "learning_rate": 4.4455052222358354e-07, "loss": 0.0932, "step": 12890 }, { "epoch": 4.346199224675543, "grad_norm": 1.5180478849747012, "learning_rate": 4.4230327405126614e-07, "loss": 0.1168, "step": 12895 }, { "epoch": 4.347884712624305, "grad_norm": 1.269082280456362, "learning_rate": 4.4006145743054116e-07, "loss": 0.0948, "step": 12900 }, { "epoch": 4.349570200573066, "grad_norm": 1.2232591570968698, "learning_rate": 4.3782507503305263e-07, "loss": 0.1044, "step": 12905 }, { "epoch": 4.351255688521827, "grad_norm": 1.418462747935375, "learning_rate": 4.3559412952396796e-07, "loss": 0.1138, "step": 12910 }, { "epoch": 4.352941176470588, "grad_norm": 1.7745767467659825, "learning_rate": 4.333686235619772e-07, "loss": 0.1065, "step": 12915 }, { "epoch": 4.354626664419349, "grad_norm": 1.2900376660394284, "learning_rate": 4.311485597992854e-07, "loss": 0.0998, "step": 12920 }, { "epoch": 4.356312152368111, "grad_norm": 1.3765273059531757, "learning_rate": 4.2893394088161176e-07, "loss": 0.1156, "step": 12925 }, { "epoch": 4.357997640316872, "grad_norm": 1.2580320516333507, "learning_rate": 4.2672476944818963e-07, "loss": 0.0989, "step": 12930 }, { "epoch": 4.359683128265633, "grad_norm": 1.2994496505304025, "learning_rate": 4.2452104813175797e-07, "loss": 0.1011, "step": 12935 }, { "epoch": 4.361368616214394, "grad_norm": 1.3774522003838567, "learning_rate": 4.2232277955855995e-07, "loss": 0.1031, "step": 12940 }, { "epoch": 4.363054104163155, "grad_norm": 1.414295121192504, "learning_rate": 4.2012996634834434e-07, "loss": 0.1055, "step": 12945 }, { "epoch": 4.364739592111916, "grad_norm": 1.2195219009223808, "learning_rate": 4.179426111143536e-07, "loss": 0.0951, "step": 12950 }, { "epoch": 4.366425080060678, "grad_norm": 1.2565916088421782, "learning_rate": 4.1576071646333037e-07, "loss": 0.1023, "step": 12955 }, { "epoch": 4.368110568009438, "grad_norm": 1.802797545212119, "learning_rate": 4.1358428499550687e-07, "loss": 0.119, "step": 12960 }, { "epoch": 4.3697960559582, "grad_norm": 1.3805722332018802, "learning_rate": 4.1141331930460546e-07, "loss": 0.1107, "step": 12965 }, { "epoch": 4.371481543906961, "grad_norm": 1.3657834551175823, "learning_rate": 4.0924782197783344e-07, "loss": 0.1038, "step": 12970 }, { "epoch": 4.373167031855722, "grad_norm": 1.3726640016764247, "learning_rate": 4.0708779559588463e-07, "loss": 0.0976, "step": 12975 }, { "epoch": 4.374852519804484, "grad_norm": 1.4261444871092577, "learning_rate": 4.049332427329294e-07, "loss": 0.1156, "step": 12980 }, { "epoch": 4.376538007753244, "grad_norm": 1.6534352594521706, "learning_rate": 4.0278416595661787e-07, "loss": 0.1035, "step": 12985 }, { "epoch": 4.378223495702006, "grad_norm": 1.2911067985280702, "learning_rate": 4.006405678280717e-07, "loss": 0.1066, "step": 12990 }, { "epoch": 4.3799089836507665, "grad_norm": 1.3058495586023977, "learning_rate": 3.9850245090188587e-07, "loss": 0.0981, "step": 12995 }, { "epoch": 4.381594471599528, "grad_norm": 1.7470967650220217, "learning_rate": 3.963698177261216e-07, "loss": 0.1203, "step": 13000 }, { "epoch": 4.38327995954829, "grad_norm": 1.5335847116198777, "learning_rate": 3.9424267084230583e-07, "loss": 0.1147, "step": 13005 }, { "epoch": 4.38496544749705, "grad_norm": 1.4703422570854467, "learning_rate": 3.9212101278542524e-07, "loss": 0.112, "step": 13010 }, { "epoch": 4.386650935445812, "grad_norm": 1.2063754002001417, "learning_rate": 3.9000484608392786e-07, "loss": 0.1026, "step": 13015 }, { "epoch": 4.3883364233945725, "grad_norm": 1.3935790269241226, "learning_rate": 3.87894173259718e-07, "loss": 0.0885, "step": 13020 }, { "epoch": 4.390021911343334, "grad_norm": 1.3553381164698273, "learning_rate": 3.857889968281503e-07, "loss": 0.1066, "step": 13025 }, { "epoch": 4.391707399292095, "grad_norm": 1.2898295004031393, "learning_rate": 3.83689319298029e-07, "loss": 0.1043, "step": 13030 }, { "epoch": 4.393392887240856, "grad_norm": 1.6708450574654765, "learning_rate": 3.8159514317160807e-07, "loss": 0.1101, "step": 13035 }, { "epoch": 4.395078375189618, "grad_norm": 1.5281800814265354, "learning_rate": 3.795064709445834e-07, "loss": 0.1007, "step": 13040 }, { "epoch": 4.3967638631383785, "grad_norm": 1.3074495938444166, "learning_rate": 3.774233051060916e-07, "loss": 0.1101, "step": 13045 }, { "epoch": 4.39844935108714, "grad_norm": 1.3659165183320154, "learning_rate": 3.753456481387058e-07, "loss": 0.1115, "step": 13050 }, { "epoch": 4.400134839035901, "grad_norm": 1.3137223877241775, "learning_rate": 3.7327350251843695e-07, "loss": 0.1038, "step": 13055 }, { "epoch": 4.401820326984662, "grad_norm": 1.2527662849244792, "learning_rate": 3.712068707147282e-07, "loss": 0.1237, "step": 13060 }, { "epoch": 4.403505814933423, "grad_norm": 1.2192091494581734, "learning_rate": 3.6914575519044816e-07, "loss": 0.0897, "step": 13065 }, { "epoch": 4.4051913028821845, "grad_norm": 1.3891589894514313, "learning_rate": 3.6709015840189435e-07, "loss": 0.0926, "step": 13070 }, { "epoch": 4.406876790830945, "grad_norm": 2.168010607688832, "learning_rate": 3.6504008279878486e-07, "loss": 0.0947, "step": 13075 }, { "epoch": 4.408562278779707, "grad_norm": 1.717699719899326, "learning_rate": 3.6299553082426274e-07, "loss": 0.1155, "step": 13080 }, { "epoch": 4.410247766728468, "grad_norm": 1.3765641211070212, "learning_rate": 3.609565049148833e-07, "loss": 0.1049, "step": 13085 }, { "epoch": 4.411933254677229, "grad_norm": 1.4325235530009308, "learning_rate": 3.589230075006178e-07, "loss": 0.099, "step": 13090 }, { "epoch": 4.4136187426259905, "grad_norm": 1.6787616160478467, "learning_rate": 3.568950410048505e-07, "loss": 0.0961, "step": 13095 }, { "epoch": 4.415304230574751, "grad_norm": 1.2558170114537444, "learning_rate": 3.5487260784437483e-07, "loss": 0.0991, "step": 13100 }, { "epoch": 4.416989718523513, "grad_norm": 1.4730828629326196, "learning_rate": 3.5285571042938615e-07, "loss": 0.1233, "step": 13105 }, { "epoch": 4.418675206472273, "grad_norm": 1.406970977909432, "learning_rate": 3.5084435116348624e-07, "loss": 0.1154, "step": 13110 }, { "epoch": 4.420360694421035, "grad_norm": 1.5036343587118584, "learning_rate": 3.488385324436744e-07, "loss": 0.1089, "step": 13115 }, { "epoch": 4.422046182369796, "grad_norm": 1.493600176527841, "learning_rate": 3.468382566603501e-07, "loss": 0.104, "step": 13120 }, { "epoch": 4.423731670318557, "grad_norm": 1.2744043722365532, "learning_rate": 3.4484352619730435e-07, "loss": 0.0984, "step": 13125 }, { "epoch": 4.425417158267319, "grad_norm": 1.30950686330533, "learning_rate": 3.4285434343172e-07, "loss": 0.0934, "step": 13130 }, { "epoch": 4.427102646216079, "grad_norm": 5.954466591873907, "learning_rate": 3.4087071073416966e-07, "loss": 0.1017, "step": 13135 }, { "epoch": 4.428788134164841, "grad_norm": 1.111782857915883, "learning_rate": 3.38892630468613e-07, "loss": 0.11, "step": 13140 }, { "epoch": 4.430473622113602, "grad_norm": 1.1780736629033581, "learning_rate": 3.369201049923887e-07, "loss": 0.1032, "step": 13145 }, { "epoch": 4.432159110062363, "grad_norm": 1.4376665805994704, "learning_rate": 3.349531366562192e-07, "loss": 0.1003, "step": 13150 }, { "epoch": 4.433844598011124, "grad_norm": 1.0773282584288, "learning_rate": 3.3299172780420165e-07, "loss": 0.0975, "step": 13155 }, { "epoch": 4.435530085959885, "grad_norm": 1.5123653100906522, "learning_rate": 3.3103588077381067e-07, "loss": 0.119, "step": 13160 }, { "epoch": 4.437215573908647, "grad_norm": 1.292414589958653, "learning_rate": 3.2908559789588955e-07, "loss": 0.0989, "step": 13165 }, { "epoch": 4.438901061857408, "grad_norm": 1.1816991727295574, "learning_rate": 3.271408814946536e-07, "loss": 0.1126, "step": 13170 }, { "epoch": 4.440586549806169, "grad_norm": 1.351515000998513, "learning_rate": 3.252017338876817e-07, "loss": 0.1084, "step": 13175 }, { "epoch": 4.44227203775493, "grad_norm": 1.2347811208826616, "learning_rate": 3.232681573859192e-07, "loss": 0.0946, "step": 13180 }, { "epoch": 4.443957525703691, "grad_norm": 1.3385457566974366, "learning_rate": 3.2134015429366894e-07, "loss": 0.1054, "step": 13185 }, { "epoch": 4.445643013652452, "grad_norm": 1.2580278966688596, "learning_rate": 3.194177269085946e-07, "loss": 0.1207, "step": 13190 }, { "epoch": 4.447328501601214, "grad_norm": 1.3842959172953215, "learning_rate": 3.1750087752171145e-07, "loss": 0.1064, "step": 13195 }, { "epoch": 4.449013989549975, "grad_norm": 6.862406860552724, "learning_rate": 3.1558960841739263e-07, "loss": 0.1013, "step": 13200 }, { "epoch": 4.450699477498736, "grad_norm": 1.1997106922225558, "learning_rate": 3.1368392187335563e-07, "loss": 0.1035, "step": 13205 }, { "epoch": 4.452384965447497, "grad_norm": 1.3307420909333114, "learning_rate": 3.1178382016066933e-07, "loss": 0.1107, "step": 13210 }, { "epoch": 4.454070453396258, "grad_norm": 1.457990226696042, "learning_rate": 3.0988930554374406e-07, "loss": 0.1176, "step": 13215 }, { "epoch": 4.45575594134502, "grad_norm": 1.2801374667672991, "learning_rate": 3.0800038028033276e-07, "loss": 0.0986, "step": 13220 }, { "epoch": 4.45744142929378, "grad_norm": 1.3629737041739027, "learning_rate": 3.061170466215285e-07, "loss": 0.1071, "step": 13225 }, { "epoch": 4.459126917242542, "grad_norm": 1.44409171217283, "learning_rate": 3.0423930681175937e-07, "loss": 0.1056, "step": 13230 }, { "epoch": 4.4608124051913025, "grad_norm": 1.6979002115681163, "learning_rate": 3.023671630887859e-07, "loss": 0.1153, "step": 13235 }, { "epoch": 4.462497893140064, "grad_norm": 1.2247683478658518, "learning_rate": 3.0050061768370275e-07, "loss": 0.1033, "step": 13240 }, { "epoch": 4.464183381088826, "grad_norm": 1.2180326978307408, "learning_rate": 2.986396728209312e-07, "loss": 0.1036, "step": 13245 }, { "epoch": 4.465868869037586, "grad_norm": 1.3691193804229946, "learning_rate": 2.967843307182183e-07, "loss": 0.1066, "step": 13250 }, { "epoch": 4.467554356986348, "grad_norm": 1.2661150448335052, "learning_rate": 2.9493459358663325e-07, "loss": 0.0965, "step": 13255 }, { "epoch": 4.4692398449351085, "grad_norm": 1.3806974581332552, "learning_rate": 2.930904636305659e-07, "loss": 0.116, "step": 13260 }, { "epoch": 4.47092533288387, "grad_norm": 1.493484499947453, "learning_rate": 2.912519430477256e-07, "loss": 0.0936, "step": 13265 }, { "epoch": 4.472610820832631, "grad_norm": 1.3419874632039008, "learning_rate": 2.894190340291353e-07, "loss": 0.1209, "step": 13270 }, { "epoch": 4.474296308781392, "grad_norm": 1.207992872749056, "learning_rate": 2.8759173875913036e-07, "loss": 0.105, "step": 13275 }, { "epoch": 4.475981796730153, "grad_norm": 1.5128141751931952, "learning_rate": 2.8577005941535563e-07, "loss": 0.1077, "step": 13280 }, { "epoch": 4.4776672846789145, "grad_norm": 1.3212953983652505, "learning_rate": 2.839539981687661e-07, "loss": 0.1244, "step": 13285 }, { "epoch": 4.479352772627676, "grad_norm": 1.334640731661006, "learning_rate": 2.821435571836184e-07, "loss": 0.1055, "step": 13290 }, { "epoch": 4.481038260576437, "grad_norm": 1.5011148936411765, "learning_rate": 2.8033873861747273e-07, "loss": 0.1034, "step": 13295 }, { "epoch": 4.482723748525198, "grad_norm": 1.285105290546476, "learning_rate": 2.78539544621188e-07, "loss": 0.0941, "step": 13300 }, { "epoch": 4.484409236473959, "grad_norm": 1.0910101062939028, "learning_rate": 2.767459773389214e-07, "loss": 0.0961, "step": 13305 }, { "epoch": 4.4860947244227205, "grad_norm": 1.4428093730356306, "learning_rate": 2.74958038908123e-07, "loss": 0.0899, "step": 13310 }, { "epoch": 4.487780212371481, "grad_norm": 1.3092331473422458, "learning_rate": 2.731757314595362e-07, "loss": 0.1001, "step": 13315 }, { "epoch": 4.489465700320243, "grad_norm": 1.4362122682242904, "learning_rate": 2.713990571171937e-07, "loss": 0.1033, "step": 13320 }, { "epoch": 4.491151188269004, "grad_norm": 1.454945176553375, "learning_rate": 2.696280179984134e-07, "loss": 0.1263, "step": 13325 }, { "epoch": 4.492836676217765, "grad_norm": 1.614289102941099, "learning_rate": 2.678626162138004e-07, "loss": 0.1093, "step": 13330 }, { "epoch": 4.4945221641665265, "grad_norm": 1.2443091025966628, "learning_rate": 2.6610285386723887e-07, "loss": 0.1019, "step": 13335 }, { "epoch": 4.496207652115287, "grad_norm": 1.408390653946815, "learning_rate": 2.64348733055893e-07, "loss": 0.0921, "step": 13340 }, { "epoch": 4.497893140064049, "grad_norm": 1.8205675196145494, "learning_rate": 2.626002558702051e-07, "loss": 0.0964, "step": 13345 }, { "epoch": 4.499578628012809, "grad_norm": 1.2023903854946982, "learning_rate": 2.608574243938905e-07, "loss": 0.0948, "step": 13350 }, { "epoch": 4.501264115961571, "grad_norm": 1.3600282917815953, "learning_rate": 2.591202407039356e-07, "loss": 0.0917, "step": 13355 }, { "epoch": 4.5029496039103325, "grad_norm": 1.265455009740431, "learning_rate": 2.573887068705994e-07, "loss": 0.1029, "step": 13360 }, { "epoch": 4.504635091859093, "grad_norm": 1.2760492932336738, "learning_rate": 2.556628249574034e-07, "loss": 0.0996, "step": 13365 }, { "epoch": 4.506320579807855, "grad_norm": 1.335186382041331, "learning_rate": 2.5394259702113787e-07, "loss": 0.1269, "step": 13370 }, { "epoch": 4.508006067756615, "grad_norm": 1.3311672235205472, "learning_rate": 2.522280251118514e-07, "loss": 0.1054, "step": 13375 }, { "epoch": 4.509691555705377, "grad_norm": 1.4444715505739214, "learning_rate": 2.5051911127285446e-07, "loss": 0.0932, "step": 13380 }, { "epoch": 4.511377043654138, "grad_norm": 1.467697929063845, "learning_rate": 2.4881585754071236e-07, "loss": 0.1005, "step": 13385 }, { "epoch": 4.513062531602899, "grad_norm": 1.474115930327594, "learning_rate": 2.471182659452481e-07, "loss": 0.1174, "step": 13390 }, { "epoch": 4.51474801955166, "grad_norm": 1.4688121836243981, "learning_rate": 2.454263385095357e-07, "loss": 0.1044, "step": 13395 }, { "epoch": 4.516433507500421, "grad_norm": 1.295857311837337, "learning_rate": 2.437400772498977e-07, "loss": 0.0964, "step": 13400 }, { "epoch": 4.518118995449182, "grad_norm": 1.4508744717258912, "learning_rate": 2.42059484175905e-07, "loss": 0.1097, "step": 13405 }, { "epoch": 4.519804483397944, "grad_norm": 1.3250289810648217, "learning_rate": 2.403845612903738e-07, "loss": 0.1019, "step": 13410 }, { "epoch": 4.521489971346705, "grad_norm": 1.4877412560146837, "learning_rate": 2.387153105893636e-07, "loss": 0.0809, "step": 13415 }, { "epoch": 4.523175459295466, "grad_norm": 1.7429122138240438, "learning_rate": 2.3705173406217252e-07, "loss": 0.109, "step": 13420 }, { "epoch": 4.524860947244227, "grad_norm": 1.5071684021912692, "learning_rate": 2.3539383369133638e-07, "loss": 0.1163, "step": 13425 }, { "epoch": 4.526546435192988, "grad_norm": 1.3675636870248489, "learning_rate": 2.337416114526292e-07, "loss": 0.11, "step": 13430 }, { "epoch": 4.52823192314175, "grad_norm": 1.528495357739462, "learning_rate": 2.3209506931505698e-07, "loss": 0.1026, "step": 13435 }, { "epoch": 4.52991741109051, "grad_norm": 1.2744348436124975, "learning_rate": 2.304542092408546e-07, "loss": 0.0894, "step": 13440 }, { "epoch": 4.531602899039272, "grad_norm": 1.958814366529426, "learning_rate": 2.2881903318548782e-07, "loss": 0.1187, "step": 13445 }, { "epoch": 4.533288386988033, "grad_norm": 1.3051445388956697, "learning_rate": 2.271895430976473e-07, "loss": 0.0999, "step": 13450 }, { "epoch": 4.534973874936794, "grad_norm": 2.0114381479862375, "learning_rate": 2.255657409192491e-07, "loss": 0.1181, "step": 13455 }, { "epoch": 4.536659362885556, "grad_norm": 1.2902828398357575, "learning_rate": 2.239476285854286e-07, "loss": 0.1258, "step": 13460 }, { "epoch": 4.538344850834316, "grad_norm": 1.3801705209191595, "learning_rate": 2.223352080245411e-07, "loss": 0.1299, "step": 13465 }, { "epoch": 4.540030338783078, "grad_norm": 1.3151069475180843, "learning_rate": 2.207284811581606e-07, "loss": 0.0922, "step": 13470 }, { "epoch": 4.5417158267318385, "grad_norm": 1.5134407164527663, "learning_rate": 2.1912744990107427e-07, "loss": 0.1054, "step": 13475 }, { "epoch": 4.5434013146806, "grad_norm": 2.4600152060600027, "learning_rate": 2.1753211616128089e-07, "loss": 0.0979, "step": 13480 }, { "epoch": 4.545086802629362, "grad_norm": 2.8002583519383903, "learning_rate": 2.1594248183999023e-07, "loss": 0.1057, "step": 13485 }, { "epoch": 4.546772290578122, "grad_norm": 1.4688754097757288, "learning_rate": 2.1435854883162134e-07, "loss": 0.0991, "step": 13490 }, { "epoch": 4.548457778526884, "grad_norm": 1.9176154461189199, "learning_rate": 2.1278031902379649e-07, "loss": 0.0885, "step": 13495 }, { "epoch": 4.5501432664756445, "grad_norm": 1.4077288663541572, "learning_rate": 2.1120779429734172e-07, "loss": 0.1114, "step": 13500 }, { "epoch": 4.551828754424406, "grad_norm": 1.149103264917252, "learning_rate": 2.0964097652628413e-07, "loss": 0.0907, "step": 13505 }, { "epoch": 4.553514242373167, "grad_norm": 1.5061164931200244, "learning_rate": 2.0807986757785116e-07, "loss": 0.0974, "step": 13510 }, { "epoch": 4.555199730321928, "grad_norm": 1.410563741026411, "learning_rate": 2.0652446931246573e-07, "loss": 0.1165, "step": 13515 }, { "epoch": 4.55688521827069, "grad_norm": 1.5629610413914392, "learning_rate": 2.0497478358374567e-07, "loss": 0.1248, "step": 13520 }, { "epoch": 4.5585707062194505, "grad_norm": 1.2541546922911895, "learning_rate": 2.0343081223849925e-07, "loss": 0.0916, "step": 13525 }, { "epoch": 4.560256194168212, "grad_norm": 1.6012834176835438, "learning_rate": 2.0189255711672628e-07, "loss": 0.1293, "step": 13530 }, { "epoch": 4.561941682116973, "grad_norm": 1.4217877374746684, "learning_rate": 2.0036002005161538e-07, "loss": 0.0972, "step": 13535 }, { "epoch": 4.563627170065734, "grad_norm": 1.3252550147324835, "learning_rate": 1.9883320286953777e-07, "loss": 0.1002, "step": 13540 }, { "epoch": 4.565312658014495, "grad_norm": 1.552802696328646, "learning_rate": 1.9731210739005134e-07, "loss": 0.0971, "step": 13545 }, { "epoch": 4.5669981459632565, "grad_norm": 1.219037379507953, "learning_rate": 1.9579673542589273e-07, "loss": 0.0807, "step": 13550 }, { "epoch": 4.568683633912017, "grad_norm": 1.5542895792943663, "learning_rate": 1.9428708878298008e-07, "loss": 0.1126, "step": 13555 }, { "epoch": 4.570369121860779, "grad_norm": 1.323240352071046, "learning_rate": 1.9278316926040598e-07, "loss": 0.1058, "step": 13560 }, { "epoch": 4.572054609809539, "grad_norm": 1.4506849782108833, "learning_rate": 1.912849786504395e-07, "loss": 0.1071, "step": 13565 }, { "epoch": 4.573740097758301, "grad_norm": 1.2973205184715504, "learning_rate": 1.8979251873852023e-07, "loss": 0.1052, "step": 13570 }, { "epoch": 4.5754255857070625, "grad_norm": 1.1575245632553617, "learning_rate": 1.8830579130326265e-07, "loss": 0.0894, "step": 13575 }, { "epoch": 4.577111073655823, "grad_norm": 1.4657347968296255, "learning_rate": 1.868247981164445e-07, "loss": 0.1062, "step": 13580 }, { "epoch": 4.578796561604585, "grad_norm": 1.265978212372819, "learning_rate": 1.8534954094301449e-07, "loss": 0.0954, "step": 13585 }, { "epoch": 4.580482049553345, "grad_norm": 1.4106536434223835, "learning_rate": 1.838800215410813e-07, "loss": 0.1179, "step": 13590 }, { "epoch": 4.582167537502107, "grad_norm": 1.253712100573429, "learning_rate": 1.8241624166191963e-07, "loss": 0.0942, "step": 13595 }, { "epoch": 4.583853025450868, "grad_norm": 1.5665981816419539, "learning_rate": 1.809582030499607e-07, "loss": 0.0933, "step": 13600 }, { "epoch": 4.585538513399629, "grad_norm": 1.4463990990024052, "learning_rate": 1.7950590744279682e-07, "loss": 0.1136, "step": 13605 }, { "epoch": 4.587224001348391, "grad_norm": 1.5536933339722785, "learning_rate": 1.7805935657117246e-07, "loss": 0.0971, "step": 13610 }, { "epoch": 4.588909489297151, "grad_norm": 1.2453426235969909, "learning_rate": 1.7661855215899083e-07, "loss": 0.1192, "step": 13615 }, { "epoch": 4.590594977245913, "grad_norm": 1.2706847333011229, "learning_rate": 1.7518349592330176e-07, "loss": 0.105, "step": 13620 }, { "epoch": 4.592280465194674, "grad_norm": 1.3435038098907643, "learning_rate": 1.7375418957430944e-07, "loss": 0.1137, "step": 13625 }, { "epoch": 4.593965953143435, "grad_norm": 1.6570935440979908, "learning_rate": 1.723306348153625e-07, "loss": 0.1007, "step": 13630 }, { "epoch": 4.595651441092196, "grad_norm": 1.1470784971673305, "learning_rate": 1.709128333429555e-07, "loss": 0.106, "step": 13635 }, { "epoch": 4.597336929040957, "grad_norm": 1.5097808842881983, "learning_rate": 1.6950078684672854e-07, "loss": 0.1069, "step": 13640 }, { "epoch": 4.599022416989719, "grad_norm": 1.4006209172127413, "learning_rate": 1.6809449700946167e-07, "loss": 0.1013, "step": 13645 }, { "epoch": 4.60070790493848, "grad_norm": 1.3445071398100308, "learning_rate": 1.6669396550707485e-07, "loss": 0.1161, "step": 13650 }, { "epoch": 4.602393392887241, "grad_norm": 1.3638574619338524, "learning_rate": 1.652991940086257e-07, "loss": 0.1053, "step": 13655 }, { "epoch": 4.604078880836002, "grad_norm": 1.2097422183788538, "learning_rate": 1.6391018417630855e-07, "loss": 0.1107, "step": 13660 }, { "epoch": 4.605764368784763, "grad_norm": 1.546073374471397, "learning_rate": 1.6252693766545036e-07, "loss": 0.1216, "step": 13665 }, { "epoch": 4.607449856733524, "grad_norm": 1.4436844182108657, "learning_rate": 1.6114945612450915e-07, "loss": 0.1128, "step": 13670 }, { "epoch": 4.609135344682286, "grad_norm": 1.2781279461330373, "learning_rate": 1.5977774119507294e-07, "loss": 0.0912, "step": 13675 }, { "epoch": 4.610820832631047, "grad_norm": 1.4515965768020278, "learning_rate": 1.5841179451185907e-07, "loss": 0.1115, "step": 13680 }, { "epoch": 4.612506320579808, "grad_norm": 1.4502547215027042, "learning_rate": 1.570516177027087e-07, "loss": 0.113, "step": 13685 }, { "epoch": 4.614191808528569, "grad_norm": 1.0850439962631129, "learning_rate": 1.5569721238858748e-07, "loss": 0.097, "step": 13690 }, { "epoch": 4.61587729647733, "grad_norm": 1.6438707611644572, "learning_rate": 1.5434858018358257e-07, "loss": 0.119, "step": 13695 }, { "epoch": 4.617562784426092, "grad_norm": 1.4561568075903104, "learning_rate": 1.5300572269490388e-07, "loss": 0.0919, "step": 13700 }, { "epoch": 4.619248272374852, "grad_norm": 1.226244235817788, "learning_rate": 1.5166864152287574e-07, "loss": 0.0983, "step": 13705 }, { "epoch": 4.620933760323614, "grad_norm": 1.301093385520065, "learning_rate": 1.5033733826094077e-07, "loss": 0.1105, "step": 13710 }, { "epoch": 4.6226192482723745, "grad_norm": 1.1850246498855652, "learning_rate": 1.4901181449565372e-07, "loss": 0.0946, "step": 13715 }, { "epoch": 4.624304736221136, "grad_norm": 1.2985772344897857, "learning_rate": 1.4769207180668487e-07, "loss": 0.1026, "step": 13720 }, { "epoch": 4.625990224169897, "grad_norm": 1.7816307687608601, "learning_rate": 1.4637811176681283e-07, "loss": 0.1072, "step": 13725 }, { "epoch": 4.627675712118658, "grad_norm": 1.2727167962544899, "learning_rate": 1.4506993594192554e-07, "loss": 0.0973, "step": 13730 }, { "epoch": 4.62936120006742, "grad_norm": 1.2803686739478282, "learning_rate": 1.4376754589101705e-07, "loss": 0.0935, "step": 13735 }, { "epoch": 4.6310466880161805, "grad_norm": 1.4004762869722913, "learning_rate": 1.4247094316618748e-07, "loss": 0.0872, "step": 13740 }, { "epoch": 4.632732175964942, "grad_norm": 1.448946891965298, "learning_rate": 1.411801293126397e-07, "loss": 0.1068, "step": 13745 }, { "epoch": 4.634417663913703, "grad_norm": 1.510078583068379, "learning_rate": 1.398951058686765e-07, "loss": 0.1117, "step": 13750 }, { "epoch": 4.636103151862464, "grad_norm": 1.4684994382171623, "learning_rate": 1.3861587436570123e-07, "loss": 0.1101, "step": 13755 }, { "epoch": 4.637788639811225, "grad_norm": 1.4349037388555372, "learning_rate": 1.3734243632821498e-07, "loss": 0.089, "step": 13760 }, { "epoch": 4.6394741277599865, "grad_norm": 1.2204813917194925, "learning_rate": 1.360747932738149e-07, "loss": 0.082, "step": 13765 }, { "epoch": 4.641159615708748, "grad_norm": 1.5537345001374436, "learning_rate": 1.3481294671318924e-07, "loss": 0.0921, "step": 13770 }, { "epoch": 4.642845103657509, "grad_norm": 1.4973438628843838, "learning_rate": 1.3355689815012286e-07, "loss": 0.1067, "step": 13775 }, { "epoch": 4.64453059160627, "grad_norm": 1.4082753630005014, "learning_rate": 1.323066490814867e-07, "loss": 0.0992, "step": 13780 }, { "epoch": 4.646216079555031, "grad_norm": 1.439678451239427, "learning_rate": 1.31062200997244e-07, "loss": 0.102, "step": 13785 }, { "epoch": 4.6479015675037925, "grad_norm": 1.2850062094181645, "learning_rate": 1.2982355538044221e-07, "loss": 0.111, "step": 13790 }, { "epoch": 4.649587055452553, "grad_norm": 1.5424547304187732, "learning_rate": 1.28590713707214e-07, "loss": 0.112, "step": 13795 }, { "epoch": 4.651272543401315, "grad_norm": 1.4671621130843322, "learning_rate": 1.2736367744677626e-07, "loss": 0.0928, "step": 13800 }, { "epoch": 4.652958031350076, "grad_norm": 1.467450902677651, "learning_rate": 1.2614244806142651e-07, "loss": 0.1053, "step": 13805 }, { "epoch": 4.654643519298837, "grad_norm": 1.1730816201958458, "learning_rate": 1.2492702700654337e-07, "loss": 0.0904, "step": 13810 }, { "epoch": 4.6563290072475985, "grad_norm": 1.469229789590408, "learning_rate": 1.237174157305826e-07, "loss": 0.1102, "step": 13815 }, { "epoch": 4.658014495196359, "grad_norm": 1.188093502193414, "learning_rate": 1.2251361567507559e-07, "loss": 0.0915, "step": 13820 }, { "epoch": 4.659699983145121, "grad_norm": 2.2344150063316772, "learning_rate": 1.2131562827462973e-07, "loss": 0.1038, "step": 13825 }, { "epoch": 4.661385471093881, "grad_norm": 1.014425979160535, "learning_rate": 1.2012345495692356e-07, "loss": 0.0945, "step": 13830 }, { "epoch": 4.663070959042643, "grad_norm": 1.2416135504428898, "learning_rate": 1.1893709714270895e-07, "loss": 0.1178, "step": 13835 }, { "epoch": 4.6647564469914045, "grad_norm": 1.832968547848945, "learning_rate": 1.1775655624580496e-07, "loss": 0.1018, "step": 13840 }, { "epoch": 4.666441934940165, "grad_norm": 1.2650391940888095, "learning_rate": 1.165818336731006e-07, "loss": 0.1056, "step": 13845 }, { "epoch": 4.668127422888927, "grad_norm": 1.4636818432086411, "learning_rate": 1.1541293082454941e-07, "loss": 0.1131, "step": 13850 }, { "epoch": 4.669812910837687, "grad_norm": 1.5789184285509004, "learning_rate": 1.1424984909317038e-07, "loss": 0.1025, "step": 13855 }, { "epoch": 4.671498398786449, "grad_norm": 1.408174467537858, "learning_rate": 1.1309258986504424e-07, "loss": 0.1003, "step": 13860 }, { "epoch": 4.67318388673521, "grad_norm": 1.285766597093372, "learning_rate": 1.1194115451931386e-07, "loss": 0.0995, "step": 13865 }, { "epoch": 4.674869374683971, "grad_norm": 1.4648611518155075, "learning_rate": 1.1079554442818108e-07, "loss": 0.0975, "step": 13870 }, { "epoch": 4.676554862632732, "grad_norm": 1.180007859726227, "learning_rate": 1.0965576095690656e-07, "loss": 0.1053, "step": 13875 }, { "epoch": 4.678240350581493, "grad_norm": 1.2982904102149746, "learning_rate": 1.0852180546380486e-07, "loss": 0.11, "step": 13880 }, { "epoch": 4.679925838530254, "grad_norm": 1.1942878040297782, "learning_rate": 1.0739367930024724e-07, "loss": 0.1012, "step": 13885 }, { "epoch": 4.681611326479016, "grad_norm": 1.2812876636900898, "learning_rate": 1.0627138381065827e-07, "loss": 0.1021, "step": 13890 }, { "epoch": 4.683296814427777, "grad_norm": 1.216588415005487, "learning_rate": 1.0515492033251196e-07, "loss": 0.1077, "step": 13895 }, { "epoch": 4.684982302376538, "grad_norm": 1.4167751469266234, "learning_rate": 1.0404429019633344e-07, "loss": 0.097, "step": 13900 }, { "epoch": 4.686667790325299, "grad_norm": 1.4200775765023221, "learning_rate": 1.0293949472569676e-07, "loss": 0.1026, "step": 13905 }, { "epoch": 4.68835327827406, "grad_norm": 1.3074665236415486, "learning_rate": 1.0184053523722093e-07, "loss": 0.0975, "step": 13910 }, { "epoch": 4.690038766222822, "grad_norm": 1.1800306553238262, "learning_rate": 1.0074741304057056e-07, "loss": 0.0954, "step": 13915 }, { "epoch": 4.691724254171582, "grad_norm": 1.4886420207429685, "learning_rate": 9.966012943845361e-08, "loss": 0.1011, "step": 13920 }, { "epoch": 4.693409742120344, "grad_norm": 1.3144909160057303, "learning_rate": 9.857868572662133e-08, "loss": 0.112, "step": 13925 }, { "epoch": 4.695095230069105, "grad_norm": 1.262286624107206, "learning_rate": 9.750308319386503e-08, "loss": 0.0828, "step": 13930 }, { "epoch": 4.696780718017866, "grad_norm": 1.2707681070455816, "learning_rate": 9.64333231220127e-08, "loss": 0.0948, "step": 13935 }, { "epoch": 4.698466205966628, "grad_norm": 1.6918903015487379, "learning_rate": 9.536940678593232e-08, "loss": 0.1185, "step": 13940 }, { "epoch": 4.700151693915388, "grad_norm": 1.4283721193086336, "learning_rate": 9.431133545352634e-08, "loss": 0.0989, "step": 13945 }, { "epoch": 4.70183718186415, "grad_norm": 1.4124434276758755, "learning_rate": 9.32591103857322e-08, "loss": 0.1157, "step": 13950 }, { "epoch": 4.7035226698129105, "grad_norm": 1.4791650934256526, "learning_rate": 9.22127328365191e-08, "loss": 0.0888, "step": 13955 }, { "epoch": 4.705208157761672, "grad_norm": 1.1462947330726747, "learning_rate": 9.117220405288951e-08, "loss": 0.095, "step": 13960 }, { "epoch": 4.706893645710434, "grad_norm": 1.3097678867679194, "learning_rate": 9.013752527487374e-08, "loss": 0.0916, "step": 13965 }, { "epoch": 4.708579133659194, "grad_norm": 1.4634185123578487, "learning_rate": 8.910869773553155e-08, "loss": 0.0963, "step": 13970 }, { "epoch": 4.710264621607956, "grad_norm": 1.3727738856540987, "learning_rate": 8.808572266094939e-08, "loss": 0.111, "step": 13975 }, { "epoch": 4.7119501095567164, "grad_norm": 1.2320528176036325, "learning_rate": 8.706860127023875e-08, "loss": 0.1031, "step": 13980 }, { "epoch": 4.713635597505478, "grad_norm": 1.3724045713287618, "learning_rate": 8.605733477553502e-08, "loss": 0.12, "step": 13985 }, { "epoch": 4.715321085454239, "grad_norm": 1.3648950441040957, "learning_rate": 8.50519243819975e-08, "loss": 0.0939, "step": 13990 }, { "epoch": 4.717006573403, "grad_norm": 1.3058780044628497, "learning_rate": 8.405237128780497e-08, "loss": 0.0873, "step": 13995 }, { "epoch": 4.718692061351762, "grad_norm": 1.478244485874207, "learning_rate": 8.305867668415679e-08, "loss": 0.1075, "step": 14000 }, { "epoch": 4.7203775493005224, "grad_norm": 1.5088070557684299, "learning_rate": 8.207084175527014e-08, "loss": 0.1252, "step": 14005 }, { "epoch": 4.722063037249284, "grad_norm": 1.374032480063295, "learning_rate": 8.108886767837998e-08, "loss": 0.1001, "step": 14010 }, { "epoch": 4.723748525198045, "grad_norm": 2.177449526483698, "learning_rate": 8.011275562373466e-08, "loss": 0.1014, "step": 14015 }, { "epoch": 4.725434013146806, "grad_norm": 1.3136875220028357, "learning_rate": 7.914250675459867e-08, "loss": 0.1046, "step": 14020 }, { "epoch": 4.727119501095567, "grad_norm": 1.6446241508007466, "learning_rate": 7.817812222724763e-08, "loss": 0.1164, "step": 14025 }, { "epoch": 4.728804989044328, "grad_norm": 1.254716520691661, "learning_rate": 7.721960319097e-08, "loss": 0.0871, "step": 14030 }, { "epoch": 4.73049047699309, "grad_norm": 1.1338983059495962, "learning_rate": 7.626695078806312e-08, "loss": 0.0969, "step": 14035 }, { "epoch": 4.732175964941851, "grad_norm": 1.2403341420737148, "learning_rate": 7.53201661538322e-08, "loss": 0.0982, "step": 14040 }, { "epoch": 4.733861452890612, "grad_norm": 1.4179221828314692, "learning_rate": 7.437925041659189e-08, "loss": 0.0957, "step": 14045 }, { "epoch": 4.735546940839373, "grad_norm": 1.2719084829694884, "learning_rate": 7.344420469765967e-08, "loss": 0.1126, "step": 14050 }, { "epoch": 4.737232428788134, "grad_norm": 1.5907825455167337, "learning_rate": 7.25150301113603e-08, "loss": 0.0989, "step": 14055 }, { "epoch": 4.738917916736895, "grad_norm": 1.7800432677776008, "learning_rate": 7.159172776502077e-08, "loss": 0.104, "step": 14060 }, { "epoch": 4.740603404685657, "grad_norm": 1.3184865979313847, "learning_rate": 7.067429875896815e-08, "loss": 0.1225, "step": 14065 }, { "epoch": 4.742288892634417, "grad_norm": 1.5068797680268022, "learning_rate": 6.976274418653284e-08, "loss": 0.1109, "step": 14070 }, { "epoch": 4.743974380583179, "grad_norm": 1.2836301539188963, "learning_rate": 6.885706513404422e-08, "loss": 0.1141, "step": 14075 }, { "epoch": 4.7456598685319396, "grad_norm": 1.2487966488165214, "learning_rate": 6.79572626808267e-08, "loss": 0.1003, "step": 14080 }, { "epoch": 4.747345356480701, "grad_norm": 1.4740865889940853, "learning_rate": 6.706333789920527e-08, "loss": 0.0978, "step": 14085 }, { "epoch": 4.749030844429463, "grad_norm": 1.3887974610128597, "learning_rate": 6.617529185449668e-08, "loss": 0.0925, "step": 14090 }, { "epoch": 4.750716332378223, "grad_norm": 1.3904350765045153, "learning_rate": 6.529312560501433e-08, "loss": 0.0889, "step": 14095 }, { "epoch": 4.752401820326985, "grad_norm": 1.4133855143041645, "learning_rate": 6.441684020206452e-08, "loss": 0.1156, "step": 14100 }, { "epoch": 4.7540873082757455, "grad_norm": 1.302962150737466, "learning_rate": 6.354643668994243e-08, "loss": 0.1202, "step": 14105 }, { "epoch": 4.755772796224507, "grad_norm": 1.372419067721362, "learning_rate": 6.268191610593666e-08, "loss": 0.0878, "step": 14110 }, { "epoch": 4.757458284173268, "grad_norm": 3.454099385937723, "learning_rate": 6.182327948032474e-08, "loss": 0.1014, "step": 14115 }, { "epoch": 4.759143772122029, "grad_norm": 1.1870700419863325, "learning_rate": 6.097052783636925e-08, "loss": 0.0976, "step": 14120 }, { "epoch": 4.760829260070791, "grad_norm": 1.4750296611451297, "learning_rate": 6.012366219032284e-08, "loss": 0.1076, "step": 14125 }, { "epoch": 4.7625147480195515, "grad_norm": 1.4568738615338586, "learning_rate": 5.9282683551420974e-08, "loss": 0.105, "step": 14130 }, { "epoch": 4.764200235968313, "grad_norm": 1.4991782213657212, "learning_rate": 5.8447592921885286e-08, "loss": 0.1091, "step": 14135 }, { "epoch": 4.765885723917074, "grad_norm": 1.3102617851381169, "learning_rate": 5.7618391296919706e-08, "loss": 0.0947, "step": 14140 }, { "epoch": 4.767571211865835, "grad_norm": 1.618087149916084, "learning_rate": 5.679507966470932e-08, "loss": 0.0983, "step": 14145 }, { "epoch": 4.769256699814596, "grad_norm": 1.4792429006716927, "learning_rate": 5.597765900642149e-08, "loss": 0.1003, "step": 14150 }, { "epoch": 4.7709421877633575, "grad_norm": 1.4274182820738144, "learning_rate": 5.516613029620199e-08, "loss": 0.1139, "step": 14155 }, { "epoch": 4.772627675712119, "grad_norm": 1.20931340010249, "learning_rate": 5.4360494501176084e-08, "loss": 0.0837, "step": 14160 }, { "epoch": 4.77431316366088, "grad_norm": 1.1641391670837216, "learning_rate": 5.356075258144411e-08, "loss": 0.0993, "step": 14165 }, { "epoch": 4.775998651609641, "grad_norm": 1.2934022272539012, "learning_rate": 5.2766905490084784e-08, "loss": 0.1079, "step": 14170 }, { "epoch": 4.777684139558402, "grad_norm": 1.4872495758075495, "learning_rate": 5.197895417315024e-08, "loss": 0.1015, "step": 14175 }, { "epoch": 4.7793696275071635, "grad_norm": 1.538770319443185, "learning_rate": 5.119689956966767e-08, "loss": 0.1162, "step": 14180 }, { "epoch": 4.781055115455924, "grad_norm": 1.1982756767293192, "learning_rate": 5.042074261163599e-08, "loss": 0.1034, "step": 14185 }, { "epoch": 4.782740603404686, "grad_norm": 1.8762319699827408, "learning_rate": 4.965048422402641e-08, "loss": 0.1057, "step": 14190 }, { "epoch": 4.784426091353447, "grad_norm": 1.451763792493304, "learning_rate": 4.88861253247791e-08, "loss": 0.1135, "step": 14195 }, { "epoch": 4.786111579302208, "grad_norm": 1.48814623839987, "learning_rate": 4.812766682480596e-08, "loss": 0.101, "step": 14200 }, { "epoch": 4.7877970672509695, "grad_norm": 1.2203621932114868, "learning_rate": 4.737510962798564e-08, "loss": 0.0979, "step": 14205 }, { "epoch": 4.78948255519973, "grad_norm": 1.1162606094259975, "learning_rate": 4.662845463116461e-08, "loss": 0.0792, "step": 14210 }, { "epoch": 4.791168043148492, "grad_norm": 1.3276256419225485, "learning_rate": 4.5887702724154994e-08, "loss": 0.0925, "step": 14215 }, { "epoch": 4.792853531097252, "grad_norm": 1.464968815034206, "learning_rate": 4.515285478973397e-08, "loss": 0.1082, "step": 14220 }, { "epoch": 4.794539019046014, "grad_norm": 1.3048458940015526, "learning_rate": 4.44239117036438e-08, "loss": 0.1056, "step": 14225 }, { "epoch": 4.796224506994775, "grad_norm": 1.4920919238488923, "learning_rate": 4.370087433458903e-08, "loss": 0.104, "step": 14230 }, { "epoch": 4.797909994943536, "grad_norm": 1.4779132299592195, "learning_rate": 4.29837435442354e-08, "loss": 0.1238, "step": 14235 }, { "epoch": 4.799595482892297, "grad_norm": 1.348592374411331, "learning_rate": 4.227252018721151e-08, "loss": 0.108, "step": 14240 }, { "epoch": 4.801280970841058, "grad_norm": 1.4645067823705782, "learning_rate": 4.1567205111104346e-08, "loss": 0.1088, "step": 14245 }, { "epoch": 4.80296645878982, "grad_norm": 1.3262592748339201, "learning_rate": 4.086779915645989e-08, "loss": 0.1064, "step": 14250 }, { "epoch": 4.804651946738581, "grad_norm": 1.946159694795386, "learning_rate": 4.017430315678195e-08, "loss": 0.1009, "step": 14255 }, { "epoch": 4.806337434687342, "grad_norm": 1.4246586217648731, "learning_rate": 3.948671793853276e-08, "loss": 0.0802, "step": 14260 }, { "epoch": 4.808022922636103, "grad_norm": 1.4962720045677116, "learning_rate": 3.880504432112908e-08, "loss": 0.1254, "step": 14265 }, { "epoch": 4.809708410584864, "grad_norm": 1.393816825289662, "learning_rate": 3.812928311694275e-08, "loss": 0.0929, "step": 14270 }, { "epoch": 4.811393898533625, "grad_norm": 1.5256976576165577, "learning_rate": 3.745943513129957e-08, "loss": 0.0906, "step": 14275 }, { "epoch": 4.813079386482387, "grad_norm": 1.487951662028393, "learning_rate": 3.679550116247932e-08, "loss": 0.0899, "step": 14280 }, { "epoch": 4.814764874431148, "grad_norm": 1.4882273942351816, "learning_rate": 3.613748200171241e-08, "loss": 0.1042, "step": 14285 }, { "epoch": 4.816450362379909, "grad_norm": 1.1875714832766766, "learning_rate": 3.5485378433182117e-08, "loss": 0.092, "step": 14290 }, { "epoch": 4.81813585032867, "grad_norm": 1.4053327302056282, "learning_rate": 3.4839191234019576e-08, "loss": 0.111, "step": 14295 }, { "epoch": 4.819821338277431, "grad_norm": 1.2613569775368803, "learning_rate": 3.419892117430712e-08, "loss": 0.1188, "step": 14300 }, { "epoch": 4.821506826226193, "grad_norm": 1.4989874950538131, "learning_rate": 3.35645690170755e-08, "loss": 0.0966, "step": 14305 }, { "epoch": 4.823192314174953, "grad_norm": 1.3731881461629478, "learning_rate": 3.293613551830222e-08, "loss": 0.1122, "step": 14310 }, { "epoch": 4.824877802123715, "grad_norm": 1.2788135152005884, "learning_rate": 3.2313621426909855e-08, "loss": 0.0913, "step": 14315 }, { "epoch": 4.826563290072476, "grad_norm": 1.600960703496493, "learning_rate": 3.169702748476999e-08, "loss": 0.1088, "step": 14320 }, { "epoch": 4.828248778021237, "grad_norm": 1.466673992777664, "learning_rate": 3.10863544266965e-08, "loss": 0.0879, "step": 14325 }, { "epoch": 4.829934265969999, "grad_norm": 1.291084556225331, "learning_rate": 3.048160298044722e-08, "loss": 0.096, "step": 14330 }, { "epoch": 4.831619753918759, "grad_norm": 1.3753486674449276, "learning_rate": 2.988277386672456e-08, "loss": 0.1121, "step": 14335 }, { "epoch": 4.833305241867521, "grad_norm": 1.3627248642232634, "learning_rate": 2.928986779917098e-08, "loss": 0.1223, "step": 14340 }, { "epoch": 4.8349907298162815, "grad_norm": 1.134452585061997, "learning_rate": 2.8702885484372944e-08, "loss": 0.0951, "step": 14345 }, { "epoch": 4.836676217765043, "grad_norm": 1.3874592022831107, "learning_rate": 2.8121827621855336e-08, "loss": 0.0927, "step": 14350 }, { "epoch": 4.838361705713805, "grad_norm": 1.2983660687207284, "learning_rate": 2.7546694904082572e-08, "loss": 0.0928, "step": 14355 }, { "epoch": 4.840047193662565, "grad_norm": 1.364783484024969, "learning_rate": 2.697748801645861e-08, "loss": 0.0975, "step": 14360 }, { "epoch": 4.841732681611327, "grad_norm": 1.2189737120634667, "learning_rate": 2.6414207637325828e-08, "loss": 0.0889, "step": 14365 }, { "epoch": 4.8434181695600875, "grad_norm": 1.4757354692915603, "learning_rate": 2.5856854437963376e-08, "loss": 0.0899, "step": 14370 }, { "epoch": 4.845103657508849, "grad_norm": 1.2709892836999879, "learning_rate": 2.5305429082586597e-08, "loss": 0.0996, "step": 14375 }, { "epoch": 4.84678914545761, "grad_norm": 1.3345061704770749, "learning_rate": 2.475993222834594e-08, "loss": 0.1031, "step": 14380 }, { "epoch": 4.848474633406371, "grad_norm": 1.305377234298407, "learning_rate": 2.4220364525328055e-08, "loss": 0.0994, "step": 14385 }, { "epoch": 4.850160121355132, "grad_norm": 1.2231125689931832, "learning_rate": 2.3686726616553023e-08, "loss": 0.0981, "step": 14390 }, { "epoch": 4.8518456093038935, "grad_norm": 1.653085067087123, "learning_rate": 2.3159019137973248e-08, "loss": 0.1072, "step": 14395 }, { "epoch": 4.853531097252654, "grad_norm": 1.380812027604419, "learning_rate": 2.2637242718474562e-08, "loss": 0.1041, "step": 14400 }, { "epoch": 4.855216585201416, "grad_norm": 1.3017367505026471, "learning_rate": 2.2121397979875114e-08, "loss": 0.0942, "step": 14405 }, { "epoch": 4.856902073150177, "grad_norm": 1.4249052839160983, "learning_rate": 2.161148553692316e-08, "loss": 0.095, "step": 14410 }, { "epoch": 4.858587561098938, "grad_norm": 1.327283947326012, "learning_rate": 2.110750599729705e-08, "loss": 0.1032, "step": 14415 }, { "epoch": 4.8602730490476995, "grad_norm": 1.3251995054831043, "learning_rate": 2.0609459961605794e-08, "loss": 0.104, "step": 14420 }, { "epoch": 4.86195853699646, "grad_norm": 1.2263633559148792, "learning_rate": 2.0117348023386274e-08, "loss": 0.11, "step": 14425 }, { "epoch": 4.863644024945222, "grad_norm": 3.0158100580245644, "learning_rate": 1.9631170769103812e-08, "loss": 0.103, "step": 14430 }, { "epoch": 4.865329512893982, "grad_norm": 1.5023783123325296, "learning_rate": 1.9150928778151613e-08, "loss": 0.1216, "step": 14435 }, { "epoch": 4.867015000842744, "grad_norm": 1.4997010407474924, "learning_rate": 1.867662262284853e-08, "loss": 0.1106, "step": 14440 }, { "epoch": 4.8687004887915055, "grad_norm": 1.2278695574479974, "learning_rate": 1.8208252868441302e-08, "loss": 0.1011, "step": 14445 }, { "epoch": 4.870385976740266, "grad_norm": 1.2966558733227076, "learning_rate": 1.774582007310066e-08, "loss": 0.102, "step": 14450 }, { "epoch": 4.872071464689028, "grad_norm": 1.3403499888324188, "learning_rate": 1.728932478792189e-08, "loss": 0.0997, "step": 14455 }, { "epoch": 4.873756952637788, "grad_norm": 1.15835870765287, "learning_rate": 1.6838767556925372e-08, "loss": 0.0893, "step": 14460 }, { "epoch": 4.87544244058655, "grad_norm": 1.4940155219468743, "learning_rate": 1.639414891705382e-08, "loss": 0.1077, "step": 14465 }, { "epoch": 4.877127928535311, "grad_norm": 1.4463261334234456, "learning_rate": 1.595546939817394e-08, "loss": 0.0949, "step": 14470 }, { "epoch": 4.878813416484072, "grad_norm": 1.2964682055497974, "learning_rate": 1.552272952307421e-08, "loss": 0.1193, "step": 14475 }, { "epoch": 4.880498904432834, "grad_norm": 1.3741866763663682, "learning_rate": 1.5095929807463772e-08, "loss": 0.11, "step": 14480 }, { "epoch": 4.882184392381594, "grad_norm": 1.2238382777882655, "learning_rate": 1.4675070759974097e-08, "loss": 0.1011, "step": 14485 }, { "epoch": 4.883869880330356, "grad_norm": 1.3926257112820095, "learning_rate": 1.4260152882155654e-08, "loss": 0.1143, "step": 14490 }, { "epoch": 4.885555368279117, "grad_norm": 2.805845937973201, "learning_rate": 1.3851176668479571e-08, "loss": 0.1141, "step": 14495 }, { "epoch": 4.887240856227878, "grad_norm": 1.815060779592187, "learning_rate": 1.3448142606335424e-08, "loss": 0.094, "step": 14500 }, { "epoch": 4.888926344176639, "grad_norm": 1.368144997931563, "learning_rate": 1.3051051176032336e-08, "loss": 0.0957, "step": 14505 }, { "epoch": 4.8906118321254, "grad_norm": 1.4091654981611477, "learning_rate": 1.2659902850795659e-08, "loss": 0.0791, "step": 14510 }, { "epoch": 4.892297320074162, "grad_norm": 1.2977749319696585, "learning_rate": 1.2274698096770287e-08, "loss": 0.1156, "step": 14515 }, { "epoch": 4.893982808022923, "grad_norm": 2.031184872648758, "learning_rate": 1.1895437373016239e-08, "loss": 0.1002, "step": 14520 }, { "epoch": 4.895668295971684, "grad_norm": 1.2768824289027978, "learning_rate": 1.152212113151141e-08, "loss": 0.0904, "step": 14525 }, { "epoch": 4.897353783920445, "grad_norm": 1.1316564467207972, "learning_rate": 1.1154749817147147e-08, "loss": 0.0892, "step": 14530 }, { "epoch": 4.899039271869206, "grad_norm": 1.3727479458180365, "learning_rate": 1.0793323867733241e-08, "loss": 0.0965, "step": 14535 }, { "epoch": 4.900724759817967, "grad_norm": 1.150793789115667, "learning_rate": 1.0437843713991258e-08, "loss": 0.0932, "step": 14540 }, { "epoch": 4.902410247766729, "grad_norm": 1.368948355426084, "learning_rate": 1.008830977955788e-08, "loss": 0.0926, "step": 14545 }, { "epoch": 4.904095735715489, "grad_norm": 1.4709380810659298, "learning_rate": 9.744722480984903e-09, "loss": 0.0874, "step": 14550 }, { "epoch": 4.905781223664251, "grad_norm": 1.464568349442694, "learning_rate": 9.407082227735342e-09, "loss": 0.0983, "step": 14555 }, { "epoch": 4.9074667116130115, "grad_norm": 1.4937043474445795, "learning_rate": 9.07538942218622e-09, "loss": 0.1176, "step": 14560 }, { "epoch": 4.909152199561773, "grad_norm": 1.5056690904887626, "learning_rate": 8.749644459626338e-09, "loss": 0.1026, "step": 14565 }, { "epoch": 4.910837687510535, "grad_norm": 1.332865071391694, "learning_rate": 8.429847728255725e-09, "loss": 0.1052, "step": 14570 }, { "epoch": 4.912523175459295, "grad_norm": 1.2683786880575063, "learning_rate": 8.115999609187298e-09, "loss": 0.0883, "step": 14575 }, { "epoch": 4.914208663408057, "grad_norm": 1.574291409575643, "learning_rate": 7.808100476442982e-09, "loss": 0.113, "step": 14580 }, { "epoch": 4.9158941513568175, "grad_norm": 1.3436450372767457, "learning_rate": 7.506150696955927e-09, "loss": 0.0952, "step": 14585 }, { "epoch": 4.917579639305579, "grad_norm": 1.5593660881509166, "learning_rate": 7.2101506305699565e-09, "loss": 0.1065, "step": 14590 }, { "epoch": 4.91926512725434, "grad_norm": 1.515065980451858, "learning_rate": 6.920100630036786e-09, "loss": 0.1008, "step": 14595 }, { "epoch": 4.920950615203101, "grad_norm": 1.4768959884413175, "learning_rate": 6.636001041019357e-09, "loss": 0.1042, "step": 14600 }, { "epoch": 4.922636103151863, "grad_norm": 1.3566671047313197, "learning_rate": 6.357852202086845e-09, "loss": 0.1144, "step": 14605 }, { "epoch": 4.9243215911006235, "grad_norm": 1.7232660638299566, "learning_rate": 6.085654444719091e-09, "loss": 0.1156, "step": 14610 }, { "epoch": 4.926007079049385, "grad_norm": 1.245039092600688, "learning_rate": 5.81940809330217e-09, "loss": 0.0921, "step": 14615 }, { "epoch": 4.927692566998146, "grad_norm": 1.4185565225384973, "learning_rate": 5.559113465130051e-09, "loss": 0.1112, "step": 14620 }, { "epoch": 4.929378054946907, "grad_norm": 1.7545468415875707, "learning_rate": 5.304770870405152e-09, "loss": 0.0885, "step": 14625 }, { "epoch": 4.931063542895668, "grad_norm": 1.437941603395581, "learning_rate": 5.0563806122344575e-09, "loss": 0.1123, "step": 14630 }, { "epoch": 4.9327490308444295, "grad_norm": 1.46177106060379, "learning_rate": 4.813942986633402e-09, "loss": 0.1011, "step": 14635 }, { "epoch": 4.934434518793191, "grad_norm": 1.4721786423010996, "learning_rate": 4.5774582825219846e-09, "loss": 0.1184, "step": 14640 }, { "epoch": 4.936120006741952, "grad_norm": 1.3859301059254683, "learning_rate": 4.34692678172699e-09, "loss": 0.106, "step": 14645 }, { "epoch": 4.937805494690713, "grad_norm": 1.2671351567180096, "learning_rate": 4.122348758979766e-09, "loss": 0.1082, "step": 14650 }, { "epoch": 4.939490982639474, "grad_norm": 1.9554293400220635, "learning_rate": 3.903724481916782e-09, "loss": 0.1058, "step": 14655 }, { "epoch": 4.9411764705882355, "grad_norm": 1.3596853441968018, "learning_rate": 3.691054211080736e-09, "loss": 0.102, "step": 14660 }, { "epoch": 4.942861958536996, "grad_norm": 1.470148160698503, "learning_rate": 3.484338199916115e-09, "loss": 0.0967, "step": 14665 }, { "epoch": 4.944547446485758, "grad_norm": 1.4304674727429603, "learning_rate": 3.283576694773083e-09, "loss": 0.0953, "step": 14670 }, { "epoch": 4.946232934434519, "grad_norm": 1.5766864717031746, "learning_rate": 3.088769934906366e-09, "loss": 0.1193, "step": 14675 }, { "epoch": 4.94791842238328, "grad_norm": 1.3503109603634298, "learning_rate": 2.899918152473036e-09, "loss": 0.1046, "step": 14680 }, { "epoch": 4.9496039103320415, "grad_norm": 1.4917913508850917, "learning_rate": 2.7170215725336178e-09, "loss": 0.095, "step": 14685 }, { "epoch": 4.951289398280802, "grad_norm": 1.2445698559908476, "learning_rate": 2.5400804130515377e-09, "loss": 0.1046, "step": 14690 }, { "epoch": 4.952974886229564, "grad_norm": 1.3057329204886383, "learning_rate": 2.3690948848931195e-09, "loss": 0.1007, "step": 14695 }, { "epoch": 4.954660374178324, "grad_norm": 1.1783829580691276, "learning_rate": 2.204065191828142e-09, "loss": 0.0976, "step": 14700 }, { "epoch": 4.956345862127086, "grad_norm": 1.3909285323137688, "learning_rate": 2.044991530526508e-09, "loss": 0.0987, "step": 14705 }, { "epoch": 4.958031350075847, "grad_norm": 1.5336632708956925, "learning_rate": 1.891874090562129e-09, "loss": 0.109, "step": 14710 }, { "epoch": 4.959716838024608, "grad_norm": 1.5063911298248347, "learning_rate": 1.7447130544095969e-09, "loss": 0.1179, "step": 14715 }, { "epoch": 4.961402325973369, "grad_norm": 1.3840193028939571, "learning_rate": 1.6035085974452913e-09, "loss": 0.1108, "step": 14720 }, { "epoch": 4.96308781392213, "grad_norm": 1.5144172111749752, "learning_rate": 1.4682608879479366e-09, "loss": 0.1168, "step": 14725 }, { "epoch": 4.964773301870892, "grad_norm": 1.276551146771887, "learning_rate": 1.3389700870952705e-09, "loss": 0.1021, "step": 14730 }, { "epoch": 4.966458789819653, "grad_norm": 1.3137608821564721, "learning_rate": 1.21563634896793e-09, "loss": 0.1119, "step": 14735 }, { "epoch": 4.968144277768414, "grad_norm": 1.1871070566357258, "learning_rate": 1.0982598205461215e-09, "loss": 0.1086, "step": 14740 }, { "epoch": 4.969829765717175, "grad_norm": 1.1932765768001778, "learning_rate": 9.868406417118391e-10, "loss": 0.1001, "step": 14745 }, { "epoch": 4.971515253665936, "grad_norm": 1.2855010367496051, "learning_rate": 8.81378945246647e-10, "loss": 0.0962, "step": 14750 }, { "epoch": 4.973200741614697, "grad_norm": 1.3684311123014772, "learning_rate": 7.818748568322321e-10, "loss": 0.109, "step": 14755 }, { "epoch": 4.974886229563459, "grad_norm": 1.1802664955924085, "learning_rate": 6.883284950509606e-10, "loss": 0.0986, "step": 14760 }, { "epoch": 4.97657171751222, "grad_norm": 1.2218188299467154, "learning_rate": 6.007399713853224e-10, "loss": 0.0967, "step": 14765 }, { "epoch": 4.978257205460981, "grad_norm": 1.3842376768427533, "learning_rate": 5.191093902168209e-10, "loss": 0.1086, "step": 14770 }, { "epoch": 4.979942693409742, "grad_norm": 1.4366177776147933, "learning_rate": 4.434368488276386e-10, "loss": 0.1103, "step": 14775 }, { "epoch": 4.981628181358503, "grad_norm": 1.0757505610798177, "learning_rate": 3.737224373989712e-10, "loss": 0.1016, "step": 14780 }, { "epoch": 4.983313669307265, "grad_norm": 1.2893166566817769, "learning_rate": 3.099662390115832e-10, "loss": 0.1141, "step": 14785 }, { "epoch": 4.984999157256025, "grad_norm": 1.5323418018912287, "learning_rate": 2.5216832964580775e-10, "loss": 0.0948, "step": 14790 }, { "epoch": 4.986684645204787, "grad_norm": 1.3496620760339086, "learning_rate": 2.003287781809915e-10, "loss": 0.1029, "step": 14795 }, { "epoch": 4.988370133153548, "grad_norm": 1.3974048630503693, "learning_rate": 1.5444764639660492e-10, "loss": 0.1074, "step": 14800 }, { "epoch": 4.990055621102309, "grad_norm": 1.6672607871290943, "learning_rate": 1.1452498897057685e-10, "loss": 0.1034, "step": 14805 }, { "epoch": 4.991741109051071, "grad_norm": 1.2365419280318917, "learning_rate": 8.056085347929454e-11, "loss": 0.0827, "step": 14810 }, { "epoch": 4.993426596999831, "grad_norm": 1.427640809943521, "learning_rate": 5.255528039926905e-11, "loss": 0.1091, "step": 14815 }, { "epoch": 4.995112084948593, "grad_norm": 1.2181763677593134, "learning_rate": 3.050830310602493e-11, "loss": 0.0944, "step": 14820 }, { "epoch": 4.9967975728973535, "grad_norm": 1.342276220471211, "learning_rate": 1.441994787299006e-11, "loss": 0.0939, "step": 14825 }, { "epoch": 4.998483060846115, "grad_norm": 0.9877557358399789, "learning_rate": 4.2902338737160765e-12, "loss": 0.0938, "step": 14830 }, { "epoch": 5.0, "grad_norm": 2.056041710199563, "learning_rate": 1.1917317965792764e-13, "loss": 0.1204, "step": 14835 }, { "epoch": 5.0, "step": 14835, "total_flos": 2.8140820020854784e+16, "train_loss": 0.23571134310078853, "train_runtime": 105488.1838, "train_samples_per_second": 71.99, "train_steps_per_second": 0.141 } ], "logging_steps": 5, "max_steps": 14835, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8140820020854784e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }