{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 14505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017238407171177384, "grad_norm": 39.78016284529353, "learning_rate": 9.174311926605506e-08, "loss": 1.9587, "step": 5 }, { "epoch": 0.0034476814342354768, "grad_norm": 54.45887630252972, "learning_rate": 2.064220183486239e-07, "loss": 1.9468, "step": 10 }, { "epoch": 0.005171522151353215, "grad_norm": 49.12766068536532, "learning_rate": 3.211009174311927e-07, "loss": 1.8937, "step": 15 }, { "epoch": 0.0068953628684709535, "grad_norm": 41.25405363658383, "learning_rate": 4.357798165137615e-07, "loss": 1.8044, "step": 20 }, { "epoch": 0.008619203585588691, "grad_norm": 45.654320048769435, "learning_rate": 5.504587155963304e-07, "loss": 1.4952, "step": 25 }, { "epoch": 0.01034304430270643, "grad_norm": 17.90673392907594, "learning_rate": 6.651376146788992e-07, "loss": 1.1202, "step": 30 }, { "epoch": 0.012066885019824168, "grad_norm": 17.266787541205158, "learning_rate": 7.79816513761468e-07, "loss": 1.0723, "step": 35 }, { "epoch": 0.013790725736941907, "grad_norm": 10.826211444168319, "learning_rate": 8.944954128440368e-07, "loss": 0.9813, "step": 40 }, { "epoch": 0.015514566454059645, "grad_norm": 6.139222549734279, "learning_rate": 1.0091743119266057e-06, "loss": 0.8871, "step": 45 }, { "epoch": 0.017238407171177382, "grad_norm": 6.354966515955482, "learning_rate": 1.1238532110091744e-06, "loss": 0.8239, "step": 50 }, { "epoch": 0.018962247888295123, "grad_norm": 4.4768413900168476, "learning_rate": 1.2385321100917433e-06, "loss": 0.8044, "step": 55 }, { "epoch": 0.02068608860541286, "grad_norm": 3.8016061359265825, "learning_rate": 1.353211009174312e-06, "loss": 0.7791, "step": 60 }, { "epoch": 0.022409929322530598, "grad_norm": 4.668162150858383, "learning_rate": 1.467889908256881e-06, "loss": 0.7884, "step": 65 }, { "epoch": 0.024133770039648336, "grad_norm": 4.288983799389968, "learning_rate": 1.5825688073394496e-06, "loss": 0.7471, "step": 70 }, { "epoch": 0.025857610756766077, "grad_norm": 3.0974564753828004, "learning_rate": 1.6972477064220186e-06, "loss": 0.7531, "step": 75 }, { "epoch": 0.027581451473883814, "grad_norm": 3.1397534126911686, "learning_rate": 1.8119266055045873e-06, "loss": 0.7049, "step": 80 }, { "epoch": 0.02930529219100155, "grad_norm": 2.628798019332378, "learning_rate": 1.9266055045871564e-06, "loss": 0.6718, "step": 85 }, { "epoch": 0.03102913290811929, "grad_norm": 3.8325816308276, "learning_rate": 2.041284403669725e-06, "loss": 0.6207, "step": 90 }, { "epoch": 0.03275297362523703, "grad_norm": 3.4309823032403393, "learning_rate": 2.155963302752294e-06, "loss": 0.626, "step": 95 }, { "epoch": 0.034476814342354764, "grad_norm": 2.9722717648273616, "learning_rate": 2.2706422018348624e-06, "loss": 0.6616, "step": 100 }, { "epoch": 0.0362006550594725, "grad_norm": 3.63653575765165, "learning_rate": 2.3853211009174317e-06, "loss": 0.624, "step": 105 }, { "epoch": 0.037924495776590246, "grad_norm": 8.600676962941835, "learning_rate": 2.5e-06, "loss": 0.5929, "step": 110 }, { "epoch": 0.039648336493707984, "grad_norm": 2.987695328695734, "learning_rate": 2.6146788990825687e-06, "loss": 0.56, "step": 115 }, { "epoch": 0.04137217721082572, "grad_norm": 2.809016432052269, "learning_rate": 2.7293577981651376e-06, "loss": 0.5462, "step": 120 }, { "epoch": 0.04309601792794346, "grad_norm": 4.083569914949518, "learning_rate": 2.844036697247707e-06, "loss": 0.4953, "step": 125 }, { "epoch": 0.044819858645061196, "grad_norm": 2.657791948245135, "learning_rate": 2.9587155963302755e-06, "loss": 0.5255, "step": 130 }, { "epoch": 0.046543699362178934, "grad_norm": 2.4534732093259484, "learning_rate": 3.073394495412844e-06, "loss": 0.5137, "step": 135 }, { "epoch": 0.04826754007929667, "grad_norm": 2.5288108338905744, "learning_rate": 3.188073394495413e-06, "loss": 0.4995, "step": 140 }, { "epoch": 0.04999138079641441, "grad_norm": 3.050354376645196, "learning_rate": 3.3027522935779823e-06, "loss": 0.4656, "step": 145 }, { "epoch": 0.05171522151353215, "grad_norm": 3.8301326296872036, "learning_rate": 3.4174311926605508e-06, "loss": 0.4727, "step": 150 }, { "epoch": 0.05343906223064989, "grad_norm": 2.7466041439996354, "learning_rate": 3.5321100917431193e-06, "loss": 0.4448, "step": 155 }, { "epoch": 0.05516290294776763, "grad_norm": 2.7421197824426136, "learning_rate": 3.646788990825688e-06, "loss": 0.4521, "step": 160 }, { "epoch": 0.056886743664885366, "grad_norm": 2.1029134850923596, "learning_rate": 3.7614678899082575e-06, "loss": 0.4127, "step": 165 }, { "epoch": 0.0586105843820031, "grad_norm": 2.404231668222291, "learning_rate": 3.876146788990826e-06, "loss": 0.4382, "step": 170 }, { "epoch": 0.06033442509912084, "grad_norm": 2.920030378869354, "learning_rate": 3.9908256880733945e-06, "loss": 0.4046, "step": 175 }, { "epoch": 0.06205826581623858, "grad_norm": 2.1025930464202363, "learning_rate": 4.105504587155963e-06, "loss": 0.4054, "step": 180 }, { "epoch": 0.06378210653335632, "grad_norm": 2.366928935303905, "learning_rate": 4.220183486238532e-06, "loss": 0.4045, "step": 185 }, { "epoch": 0.06550594725047405, "grad_norm": 3.390075203841566, "learning_rate": 4.334862385321102e-06, "loss": 0.4073, "step": 190 }, { "epoch": 0.0672297879675918, "grad_norm": 2.891098407909226, "learning_rate": 4.44954128440367e-06, "loss": 0.3926, "step": 195 }, { "epoch": 0.06895362868470953, "grad_norm": 4.152303170049208, "learning_rate": 4.564220183486239e-06, "loss": 0.4134, "step": 200 }, { "epoch": 0.07067746940182727, "grad_norm": 4.021346589174643, "learning_rate": 4.678899082568808e-06, "loss": 0.4287, "step": 205 }, { "epoch": 0.072401310118945, "grad_norm": 2.285811756508236, "learning_rate": 4.793577981651377e-06, "loss": 0.4012, "step": 210 }, { "epoch": 0.07412515083606275, "grad_norm": 3.1588080152416946, "learning_rate": 4.908256880733945e-06, "loss": 0.3995, "step": 215 }, { "epoch": 0.07584899155318049, "grad_norm": 3.130168900049288, "learning_rate": 5.0229357798165144e-06, "loss": 0.4176, "step": 220 }, { "epoch": 0.07757283227029822, "grad_norm": 3.038295689456518, "learning_rate": 5.137614678899083e-06, "loss": 0.4017, "step": 225 }, { "epoch": 0.07929667298741597, "grad_norm": 3.103669836904637, "learning_rate": 5.252293577981652e-06, "loss": 0.4209, "step": 230 }, { "epoch": 0.0810205137045337, "grad_norm": 6.68545870580622, "learning_rate": 5.366972477064221e-06, "loss": 0.3958, "step": 235 }, { "epoch": 0.08274435442165144, "grad_norm": 2.322241896800378, "learning_rate": 5.481651376146789e-06, "loss": 0.4179, "step": 240 }, { "epoch": 0.08446819513876917, "grad_norm": 2.507028021110862, "learning_rate": 5.596330275229358e-06, "loss": 0.3977, "step": 245 }, { "epoch": 0.08619203585588692, "grad_norm": 2.753656567325975, "learning_rate": 5.711009174311926e-06, "loss": 0.4112, "step": 250 }, { "epoch": 0.08791587657300465, "grad_norm": 2.2863472704603147, "learning_rate": 5.825688073394496e-06, "loss": 0.3881, "step": 255 }, { "epoch": 0.08963971729012239, "grad_norm": 3.813935905396218, "learning_rate": 5.940366972477065e-06, "loss": 0.3893, "step": 260 }, { "epoch": 0.09136355800724014, "grad_norm": 2.402554331365669, "learning_rate": 6.0550458715596335e-06, "loss": 0.3785, "step": 265 }, { "epoch": 0.09308739872435787, "grad_norm": 2.1116348449486297, "learning_rate": 6.169724770642203e-06, "loss": 0.3971, "step": 270 }, { "epoch": 0.09481123944147561, "grad_norm": 2.5491224473263796, "learning_rate": 6.284403669724771e-06, "loss": 0.3926, "step": 275 }, { "epoch": 0.09653508015859334, "grad_norm": 2.355395092053141, "learning_rate": 6.39908256880734e-06, "loss": 0.3998, "step": 280 }, { "epoch": 0.09825892087571109, "grad_norm": 2.2613087445602833, "learning_rate": 6.513761467889908e-06, "loss": 0.3889, "step": 285 }, { "epoch": 0.09998276159282882, "grad_norm": 2.355674812454849, "learning_rate": 6.628440366972477e-06, "loss": 0.4069, "step": 290 }, { "epoch": 0.10170660230994656, "grad_norm": 4.185598154500696, "learning_rate": 6.743119266055046e-06, "loss": 0.404, "step": 295 }, { "epoch": 0.1034304430270643, "grad_norm": 2.527006058053336, "learning_rate": 6.8577981651376156e-06, "loss": 0.3947, "step": 300 }, { "epoch": 0.10515428374418204, "grad_norm": 3.3133527713511843, "learning_rate": 6.972477064220184e-06, "loss": 0.3852, "step": 305 }, { "epoch": 0.10687812446129978, "grad_norm": 3.540122413545443, "learning_rate": 7.087155963302753e-06, "loss": 0.3928, "step": 310 }, { "epoch": 0.10860196517841751, "grad_norm": 3.0843381032577333, "learning_rate": 7.201834862385322e-06, "loss": 0.4073, "step": 315 }, { "epoch": 0.11032580589553526, "grad_norm": 2.040518561707585, "learning_rate": 7.31651376146789e-06, "loss": 0.4004, "step": 320 }, { "epoch": 0.11204964661265299, "grad_norm": 1.7648309553670658, "learning_rate": 7.431192660550459e-06, "loss": 0.4205, "step": 325 }, { "epoch": 0.11377348732977073, "grad_norm": 2.243096280244523, "learning_rate": 7.545871559633028e-06, "loss": 0.3736, "step": 330 }, { "epoch": 0.11549732804688846, "grad_norm": 2.0265523548779254, "learning_rate": 7.660550458715596e-06, "loss": 0.3619, "step": 335 }, { "epoch": 0.1172211687640062, "grad_norm": 2.0501398174879952, "learning_rate": 7.775229357798164e-06, "loss": 0.4015, "step": 340 }, { "epoch": 0.11894500948112395, "grad_norm": 2.2353934598909158, "learning_rate": 7.889908256880735e-06, "loss": 0.3731, "step": 345 }, { "epoch": 0.12066885019824168, "grad_norm": 1.838277567752222, "learning_rate": 8.004587155963303e-06, "loss": 0.3908, "step": 350 }, { "epoch": 0.12239269091535943, "grad_norm": 2.398312584687634, "learning_rate": 8.119266055045872e-06, "loss": 0.384, "step": 355 }, { "epoch": 0.12411653163247716, "grad_norm": 2.313315321651735, "learning_rate": 8.233944954128442e-06, "loss": 0.3829, "step": 360 }, { "epoch": 0.1258403723495949, "grad_norm": 4.6373392398998705, "learning_rate": 8.34862385321101e-06, "loss": 0.369, "step": 365 }, { "epoch": 0.12756421306671265, "grad_norm": 3.7183533750191495, "learning_rate": 8.463302752293579e-06, "loss": 0.4126, "step": 370 }, { "epoch": 0.12928805378383038, "grad_norm": 2.406767268368641, "learning_rate": 8.577981651376147e-06, "loss": 0.3562, "step": 375 }, { "epoch": 0.1310118945009481, "grad_norm": 2.2747093864314074, "learning_rate": 8.692660550458716e-06, "loss": 0.4103, "step": 380 }, { "epoch": 0.13273573521806586, "grad_norm": 1.908977382401557, "learning_rate": 8.807339449541286e-06, "loss": 0.4133, "step": 385 }, { "epoch": 0.1344595759351836, "grad_norm": 1.9376598412867219, "learning_rate": 8.922018348623855e-06, "loss": 0.3801, "step": 390 }, { "epoch": 0.13618341665230133, "grad_norm": 2.2551638318770935, "learning_rate": 9.036697247706423e-06, "loss": 0.3888, "step": 395 }, { "epoch": 0.13790725736941906, "grad_norm": 2.104590761231902, "learning_rate": 9.151376146788992e-06, "loss": 0.4118, "step": 400 }, { "epoch": 0.13963109808653681, "grad_norm": 1.8231769972122067, "learning_rate": 9.26605504587156e-06, "loss": 0.3984, "step": 405 }, { "epoch": 0.14135493880365455, "grad_norm": 5.751866218247112, "learning_rate": 9.380733944954129e-06, "loss": 0.3847, "step": 410 }, { "epoch": 0.14307877952077228, "grad_norm": 1.4715301900214872, "learning_rate": 9.495412844036697e-06, "loss": 0.3673, "step": 415 }, { "epoch": 0.14480262023789, "grad_norm": 2.4838714998797067, "learning_rate": 9.610091743119267e-06, "loss": 0.3812, "step": 420 }, { "epoch": 0.14652646095500776, "grad_norm": 1.6705137293047283, "learning_rate": 9.724770642201836e-06, "loss": 0.3379, "step": 425 }, { "epoch": 0.1482503016721255, "grad_norm": 2.1731769348150065, "learning_rate": 9.839449541284404e-06, "loss": 0.398, "step": 430 }, { "epoch": 0.14997414238924323, "grad_norm": 3.8419027612384946, "learning_rate": 9.954128440366973e-06, "loss": 0.398, "step": 435 }, { "epoch": 0.15169798310636098, "grad_norm": 2.110073518173239, "learning_rate": 9.999998878095765e-06, "loss": 0.4154, "step": 440 }, { "epoch": 0.15342182382347871, "grad_norm": 1.5126990806154013, "learning_rate": 9.999992022016144e-06, "loss": 0.3414, "step": 445 }, { "epoch": 0.15514566454059645, "grad_norm": 1.43575853742032, "learning_rate": 9.999978933145567e-06, "loss": 0.3764, "step": 450 }, { "epoch": 0.15686950525771418, "grad_norm": 1.717920752962547, "learning_rate": 9.999959611500351e-06, "loss": 0.4114, "step": 455 }, { "epoch": 0.15859334597483193, "grad_norm": 1.540164844779612, "learning_rate": 9.999934057104585e-06, "loss": 0.3547, "step": 460 }, { "epoch": 0.16031718669194966, "grad_norm": 2.4286004694238694, "learning_rate": 9.99990226999012e-06, "loss": 0.3737, "step": 465 }, { "epoch": 0.1620410274090674, "grad_norm": 1.6364737615086615, "learning_rate": 9.999864250196582e-06, "loss": 0.3712, "step": 470 }, { "epoch": 0.16376486812618515, "grad_norm": 1.9489915262665032, "learning_rate": 9.999819997771365e-06, "loss": 0.3664, "step": 475 }, { "epoch": 0.16548870884330288, "grad_norm": 2.058773467524577, "learning_rate": 9.999769512769632e-06, "loss": 0.3838, "step": 480 }, { "epoch": 0.16721254956042061, "grad_norm": 19.681311592477975, "learning_rate": 9.999712795254318e-06, "loss": 0.3805, "step": 485 }, { "epoch": 0.16893639027753835, "grad_norm": 2.093532171948987, "learning_rate": 9.99964984529612e-06, "loss": 0.3848, "step": 490 }, { "epoch": 0.1706602309946561, "grad_norm": 1.6329941265934056, "learning_rate": 9.999580662973511e-06, "loss": 0.3926, "step": 495 }, { "epoch": 0.17238407171177383, "grad_norm": 1.5584880166519641, "learning_rate": 9.999505248372734e-06, "loss": 0.359, "step": 500 }, { "epoch": 0.17410791242889156, "grad_norm": 1.4956353781231042, "learning_rate": 9.999423601587794e-06, "loss": 0.3884, "step": 505 }, { "epoch": 0.1758317531460093, "grad_norm": 1.4756155207429988, "learning_rate": 9.999335722720471e-06, "loss": 0.3885, "step": 510 }, { "epoch": 0.17755559386312705, "grad_norm": 1.5953193620214319, "learning_rate": 9.999241611880309e-06, "loss": 0.4194, "step": 515 }, { "epoch": 0.17927943458024478, "grad_norm": 2.085260102903812, "learning_rate": 9.999141269184624e-06, "loss": 0.3763, "step": 520 }, { "epoch": 0.18100327529736251, "grad_norm": 1.71472471804224, "learning_rate": 9.9990346947585e-06, "loss": 0.3802, "step": 525 }, { "epoch": 0.18272711601448027, "grad_norm": 1.895808542216389, "learning_rate": 9.998921888734787e-06, "loss": 0.3648, "step": 530 }, { "epoch": 0.184450956731598, "grad_norm": 1.8454078954270925, "learning_rate": 9.998802851254106e-06, "loss": 0.3967, "step": 535 }, { "epoch": 0.18617479744871573, "grad_norm": 2.2793271469320375, "learning_rate": 9.998677582464842e-06, "loss": 0.3778, "step": 540 }, { "epoch": 0.18789863816583346, "grad_norm": 1.4775312147085078, "learning_rate": 9.998546082523154e-06, "loss": 0.4067, "step": 545 }, { "epoch": 0.18962247888295122, "grad_norm": 1.8126102204167893, "learning_rate": 9.99840835159296e-06, "loss": 0.3969, "step": 550 }, { "epoch": 0.19134631960006895, "grad_norm": 3.1302854559662494, "learning_rate": 9.998264389845954e-06, "loss": 0.3476, "step": 555 }, { "epoch": 0.19307016031718668, "grad_norm": 1.9485324674828295, "learning_rate": 9.99811419746159e-06, "loss": 0.3734, "step": 560 }, { "epoch": 0.19479400103430444, "grad_norm": 3.484220474881742, "learning_rate": 9.997957774627094e-06, "loss": 0.3812, "step": 565 }, { "epoch": 0.19651784175142217, "grad_norm": 1.5462072274250418, "learning_rate": 9.997795121537455e-06, "loss": 0.4151, "step": 570 }, { "epoch": 0.1982416824685399, "grad_norm": 2.3841039117699503, "learning_rate": 9.997626238395431e-06, "loss": 0.375, "step": 575 }, { "epoch": 0.19996552318565763, "grad_norm": 1.734462124188258, "learning_rate": 9.997451125411542e-06, "loss": 0.3534, "step": 580 }, { "epoch": 0.2016893639027754, "grad_norm": 1.542198455947852, "learning_rate": 9.99726978280408e-06, "loss": 0.3417, "step": 585 }, { "epoch": 0.20341320461989312, "grad_norm": 1.9091830991742968, "learning_rate": 9.997082210799101e-06, "loss": 0.3659, "step": 590 }, { "epoch": 0.20513704533701085, "grad_norm": 1.8345560789276327, "learning_rate": 9.99688840963042e-06, "loss": 0.393, "step": 595 }, { "epoch": 0.2068608860541286, "grad_norm": 2.1062288478262206, "learning_rate": 9.996688379539625e-06, "loss": 0.3836, "step": 600 }, { "epoch": 0.20858472677124634, "grad_norm": 2.339131539367282, "learning_rate": 9.996482120776065e-06, "loss": 0.3611, "step": 605 }, { "epoch": 0.21030856748836407, "grad_norm": 1.6370806618729674, "learning_rate": 9.996269633596853e-06, "loss": 0.3501, "step": 610 }, { "epoch": 0.2120324082054818, "grad_norm": 1.5630726784490585, "learning_rate": 9.99605091826687e-06, "loss": 0.3613, "step": 615 }, { "epoch": 0.21375624892259956, "grad_norm": 1.6159611373776788, "learning_rate": 9.995825975058754e-06, "loss": 0.3874, "step": 620 }, { "epoch": 0.2154800896397173, "grad_norm": 1.5359715084462024, "learning_rate": 9.995594804252913e-06, "loss": 0.3766, "step": 625 }, { "epoch": 0.21720393035683502, "grad_norm": 1.644239217826752, "learning_rate": 9.995357406137512e-06, "loss": 0.3973, "step": 630 }, { "epoch": 0.21892777107395275, "grad_norm": 1.8749655181666027, "learning_rate": 9.995113781008485e-06, "loss": 0.3724, "step": 635 }, { "epoch": 0.2206516117910705, "grad_norm": 1.5084495107797091, "learning_rate": 9.994863929169526e-06, "loss": 0.4027, "step": 640 }, { "epoch": 0.22237545250818824, "grad_norm": 1.6397265813828814, "learning_rate": 9.994607850932089e-06, "loss": 0.3854, "step": 645 }, { "epoch": 0.22409929322530597, "grad_norm": 1.6228257707157137, "learning_rate": 9.994345546615389e-06, "loss": 0.3774, "step": 650 }, { "epoch": 0.22582313394242373, "grad_norm": 1.6150673694783573, "learning_rate": 9.99407701654641e-06, "loss": 0.3736, "step": 655 }, { "epoch": 0.22754697465954146, "grad_norm": 1.5201668210444677, "learning_rate": 9.993802261059882e-06, "loss": 0.3509, "step": 660 }, { "epoch": 0.2292708153766592, "grad_norm": 1.5639526727728108, "learning_rate": 9.993521280498312e-06, "loss": 0.3457, "step": 665 }, { "epoch": 0.23099465609377692, "grad_norm": 2.4286137351423687, "learning_rate": 9.993234075211954e-06, "loss": 0.3823, "step": 670 }, { "epoch": 0.23271849681089468, "grad_norm": 3.171671081725359, "learning_rate": 9.992940645558832e-06, "loss": 0.3594, "step": 675 }, { "epoch": 0.2344423375280124, "grad_norm": 1.4223660407457652, "learning_rate": 9.99264099190472e-06, "loss": 0.3483, "step": 680 }, { "epoch": 0.23616617824513014, "grad_norm": 1.491078285294274, "learning_rate": 9.992335114623155e-06, "loss": 0.3649, "step": 685 }, { "epoch": 0.2378900189622479, "grad_norm": 1.5010894475480492, "learning_rate": 9.992023014095431e-06, "loss": 0.3857, "step": 690 }, { "epoch": 0.23961385967936563, "grad_norm": 1.9744878308027272, "learning_rate": 9.991704690710602e-06, "loss": 0.3799, "step": 695 }, { "epoch": 0.24133770039648336, "grad_norm": 1.6326915073540642, "learning_rate": 9.991380144865474e-06, "loss": 0.362, "step": 700 }, { "epoch": 0.2430615411136011, "grad_norm": 1.636689293283894, "learning_rate": 9.991049376964614e-06, "loss": 0.3521, "step": 705 }, { "epoch": 0.24478538183071885, "grad_norm": 1.9068915222690204, "learning_rate": 9.990712387420348e-06, "loss": 0.381, "step": 710 }, { "epoch": 0.24650922254783658, "grad_norm": 1.6023130627615407, "learning_rate": 9.990369176652748e-06, "loss": 0.3934, "step": 715 }, { "epoch": 0.2482330632649543, "grad_norm": 1.7323114108675037, "learning_rate": 9.99001974508965e-06, "loss": 0.3858, "step": 720 }, { "epoch": 0.24995690398207204, "grad_norm": 2.7566884308975435, "learning_rate": 9.989664093166641e-06, "loss": 0.4039, "step": 725 }, { "epoch": 0.2516807446991898, "grad_norm": 1.5382959451268354, "learning_rate": 9.989302221327065e-06, "loss": 0.3834, "step": 730 }, { "epoch": 0.25340458541630756, "grad_norm": 1.4581586166654468, "learning_rate": 9.988934130022012e-06, "loss": 0.3834, "step": 735 }, { "epoch": 0.2551284261334253, "grad_norm": 2.5874414821417817, "learning_rate": 9.988559819710333e-06, "loss": 0.3807, "step": 740 }, { "epoch": 0.256852266850543, "grad_norm": 1.4101523870158132, "learning_rate": 9.988179290858628e-06, "loss": 0.3856, "step": 745 }, { "epoch": 0.25857610756766075, "grad_norm": 1.5004119745253264, "learning_rate": 9.987792543941248e-06, "loss": 0.4238, "step": 750 }, { "epoch": 0.2602999482847785, "grad_norm": 1.5840716199539697, "learning_rate": 9.987399579440298e-06, "loss": 0.3352, "step": 755 }, { "epoch": 0.2620237890018962, "grad_norm": 1.950713807715483, "learning_rate": 9.987000397845632e-06, "loss": 0.365, "step": 760 }, { "epoch": 0.26374762971901394, "grad_norm": 3.5692407156235655, "learning_rate": 9.986594999654853e-06, "loss": 0.4, "step": 765 }, { "epoch": 0.26547147043613173, "grad_norm": 1.4602853381322065, "learning_rate": 9.986183385373314e-06, "loss": 0.3554, "step": 770 }, { "epoch": 0.26719531115324946, "grad_norm": 2.0408178483352413, "learning_rate": 9.985765555514115e-06, "loss": 0.3825, "step": 775 }, { "epoch": 0.2689191518703672, "grad_norm": 1.766355386032275, "learning_rate": 9.985341510598111e-06, "loss": 0.4305, "step": 780 }, { "epoch": 0.2706429925874849, "grad_norm": 1.8735701650719805, "learning_rate": 9.984911251153897e-06, "loss": 0.3854, "step": 785 }, { "epoch": 0.27236683330460265, "grad_norm": 1.8498771495958393, "learning_rate": 9.984474777717815e-06, "loss": 0.3598, "step": 790 }, { "epoch": 0.2740906740217204, "grad_norm": 2.031747943114794, "learning_rate": 9.984032090833959e-06, "loss": 0.3838, "step": 795 }, { "epoch": 0.2758145147388381, "grad_norm": 1.5129690322023275, "learning_rate": 9.983583191054162e-06, "loss": 0.371, "step": 800 }, { "epoch": 0.27753835545595584, "grad_norm": 1.5488707743982602, "learning_rate": 9.983128078938007e-06, "loss": 0.3921, "step": 805 }, { "epoch": 0.27926219617307363, "grad_norm": 1.6432895517230897, "learning_rate": 9.982666755052818e-06, "loss": 0.3606, "step": 810 }, { "epoch": 0.28098603689019136, "grad_norm": 1.620912544021688, "learning_rate": 9.982199219973662e-06, "loss": 0.3952, "step": 815 }, { "epoch": 0.2827098776073091, "grad_norm": 2.018270453251334, "learning_rate": 9.98172547428335e-06, "loss": 0.3806, "step": 820 }, { "epoch": 0.2844337183244268, "grad_norm": 2.0495975917720193, "learning_rate": 9.981245518572434e-06, "loss": 0.4053, "step": 825 }, { "epoch": 0.28615755904154455, "grad_norm": 1.3074616768100098, "learning_rate": 9.98075935343921e-06, "loss": 0.3738, "step": 830 }, { "epoch": 0.2878813997586623, "grad_norm": 1.451187504669899, "learning_rate": 9.98026697948971e-06, "loss": 0.3454, "step": 835 }, { "epoch": 0.28960524047578, "grad_norm": 1.6684435894984186, "learning_rate": 9.979768397337707e-06, "loss": 0.361, "step": 840 }, { "epoch": 0.2913290811928978, "grad_norm": 2.1691383415826038, "learning_rate": 9.979263607604717e-06, "loss": 0.3293, "step": 845 }, { "epoch": 0.29305292191001553, "grad_norm": 1.7361296307334113, "learning_rate": 9.978752610919986e-06, "loss": 0.3682, "step": 850 }, { "epoch": 0.29477676262713326, "grad_norm": 2.632542549563109, "learning_rate": 9.978235407920506e-06, "loss": 0.3699, "step": 855 }, { "epoch": 0.296500603344251, "grad_norm": 1.5991509266182393, "learning_rate": 9.977711999251001e-06, "loss": 0.3775, "step": 860 }, { "epoch": 0.2982244440613687, "grad_norm": 1.419117347034277, "learning_rate": 9.97718238556393e-06, "loss": 0.3873, "step": 865 }, { "epoch": 0.29994828477848645, "grad_norm": 1.7818350177376596, "learning_rate": 9.97664656751949e-06, "loss": 0.3247, "step": 870 }, { "epoch": 0.3016721254956042, "grad_norm": 1.6102128287351594, "learning_rate": 9.97610454578561e-06, "loss": 0.3663, "step": 875 }, { "epoch": 0.30339596621272197, "grad_norm": 1.4259541767846493, "learning_rate": 9.975556321037951e-06, "loss": 0.37, "step": 880 }, { "epoch": 0.3051198069298397, "grad_norm": 1.394367272133933, "learning_rate": 9.975001893959912e-06, "loss": 0.4106, "step": 885 }, { "epoch": 0.30684364764695743, "grad_norm": 1.550783532169553, "learning_rate": 9.974441265242614e-06, "loss": 0.397, "step": 890 }, { "epoch": 0.30856748836407516, "grad_norm": 1.61618942570839, "learning_rate": 9.97387443558492e-06, "loss": 0.402, "step": 895 }, { "epoch": 0.3102913290811929, "grad_norm": 2.158601560774135, "learning_rate": 9.973301405693414e-06, "loss": 0.4115, "step": 900 }, { "epoch": 0.3120151697983106, "grad_norm": 1.6864322676938845, "learning_rate": 9.972722176282412e-06, "loss": 0.3863, "step": 905 }, { "epoch": 0.31373901051542835, "grad_norm": 1.499513360028198, "learning_rate": 9.972136748073962e-06, "loss": 0.3898, "step": 910 }, { "epoch": 0.31546285123254614, "grad_norm": 2.9945860401099353, "learning_rate": 9.97154512179783e-06, "loss": 0.3368, "step": 915 }, { "epoch": 0.31718669194966387, "grad_norm": 2.2619940301196007, "learning_rate": 9.970947298191518e-06, "loss": 0.3681, "step": 920 }, { "epoch": 0.3189105326667816, "grad_norm": 1.9961612634247545, "learning_rate": 9.970343278000248e-06, "loss": 0.3656, "step": 925 }, { "epoch": 0.32063437338389933, "grad_norm": 1.2979879218516086, "learning_rate": 9.969733061976968e-06, "loss": 0.3587, "step": 930 }, { "epoch": 0.32235821410101706, "grad_norm": 1.6959871926114105, "learning_rate": 9.969116650882347e-06, "loss": 0.3741, "step": 935 }, { "epoch": 0.3240820548181348, "grad_norm": 1.646394829591119, "learning_rate": 9.968494045484781e-06, "loss": 0.3775, "step": 940 }, { "epoch": 0.3258058955352525, "grad_norm": 1.8953700126107131, "learning_rate": 9.967865246560384e-06, "loss": 0.3658, "step": 945 }, { "epoch": 0.3275297362523703, "grad_norm": 1.4124007971281993, "learning_rate": 9.96723025489299e-06, "loss": 0.3941, "step": 950 }, { "epoch": 0.32925357696948804, "grad_norm": 1.8263518324450987, "learning_rate": 9.966589071274157e-06, "loss": 0.3948, "step": 955 }, { "epoch": 0.33097741768660577, "grad_norm": 1.2845869518907578, "learning_rate": 9.965941696503159e-06, "loss": 0.3724, "step": 960 }, { "epoch": 0.3327012584037235, "grad_norm": 1.3653877556655445, "learning_rate": 9.965288131386985e-06, "loss": 0.3701, "step": 965 }, { "epoch": 0.33442509912084123, "grad_norm": 1.992962251816929, "learning_rate": 9.964628376740346e-06, "loss": 0.341, "step": 970 }, { "epoch": 0.33614893983795896, "grad_norm": 1.132570804670226, "learning_rate": 9.963962433385664e-06, "loss": 0.3861, "step": 975 }, { "epoch": 0.3378727805550767, "grad_norm": 1.4106486997441041, "learning_rate": 9.963290302153079e-06, "loss": 0.3537, "step": 980 }, { "epoch": 0.3395966212721945, "grad_norm": 1.7058732858287857, "learning_rate": 9.962611983880441e-06, "loss": 0.3609, "step": 985 }, { "epoch": 0.3413204619893122, "grad_norm": 1.425582117664892, "learning_rate": 9.961927479413315e-06, "loss": 0.3591, "step": 990 }, { "epoch": 0.34304430270642994, "grad_norm": 2.034087724933577, "learning_rate": 9.96123678960498e-06, "loss": 0.3526, "step": 995 }, { "epoch": 0.34476814342354767, "grad_norm": 1.4069646155605997, "learning_rate": 9.960539915316419e-06, "loss": 0.3729, "step": 1000 }, { "epoch": 0.3464919841406654, "grad_norm": 1.5670770963105607, "learning_rate": 9.95983685741633e-06, "loss": 0.3983, "step": 1005 }, { "epoch": 0.34821582485778313, "grad_norm": 1.670410438433764, "learning_rate": 9.959127616781115e-06, "loss": 0.3702, "step": 1010 }, { "epoch": 0.34993966557490086, "grad_norm": 1.2811693543664437, "learning_rate": 9.958412194294885e-06, "loss": 0.365, "step": 1015 }, { "epoch": 0.3516635062920186, "grad_norm": 1.3881901167938773, "learning_rate": 9.95769059084946e-06, "loss": 0.3763, "step": 1020 }, { "epoch": 0.3533873470091364, "grad_norm": 1.1822513717312828, "learning_rate": 9.956962807344359e-06, "loss": 0.3588, "step": 1025 }, { "epoch": 0.3551111877262541, "grad_norm": 1.5015058214845947, "learning_rate": 9.956228844686808e-06, "loss": 0.3719, "step": 1030 }, { "epoch": 0.35683502844337184, "grad_norm": 1.2850863986904908, "learning_rate": 9.95548870379174e-06, "loss": 0.3703, "step": 1035 }, { "epoch": 0.35855886916048957, "grad_norm": 1.9595751672101223, "learning_rate": 9.954742385581779e-06, "loss": 0.406, "step": 1040 }, { "epoch": 0.3602827098776073, "grad_norm": 1.189969196375302, "learning_rate": 9.95398989098726e-06, "loss": 0.3806, "step": 1045 }, { "epoch": 0.36200655059472503, "grad_norm": 3.2765302636646867, "learning_rate": 9.953231220946213e-06, "loss": 0.3804, "step": 1050 }, { "epoch": 0.36373039131184276, "grad_norm": 1.4117818326643434, "learning_rate": 9.95246637640436e-06, "loss": 0.3636, "step": 1055 }, { "epoch": 0.36545423202896055, "grad_norm": 1.1362293837316353, "learning_rate": 9.951695358315135e-06, "loss": 0.3604, "step": 1060 }, { "epoch": 0.3671780727460783, "grad_norm": 2.375998856783418, "learning_rate": 9.95091816763965e-06, "loss": 0.3598, "step": 1065 }, { "epoch": 0.368901913463196, "grad_norm": 1.4402987169718324, "learning_rate": 9.950134805346727e-06, "loss": 0.3543, "step": 1070 }, { "epoch": 0.37062575418031374, "grad_norm": 2.336170329724286, "learning_rate": 9.949345272412866e-06, "loss": 0.3649, "step": 1075 }, { "epoch": 0.37234959489743147, "grad_norm": 2.1173847250304143, "learning_rate": 9.948549569822276e-06, "loss": 0.3773, "step": 1080 }, { "epoch": 0.3740734356145492, "grad_norm": 5.115336943456691, "learning_rate": 9.947747698566842e-06, "loss": 0.3852, "step": 1085 }, { "epoch": 0.37579727633166693, "grad_norm": 1.7062503153846607, "learning_rate": 9.946939659646147e-06, "loss": 0.3939, "step": 1090 }, { "epoch": 0.3775211170487847, "grad_norm": 1.32448946576323, "learning_rate": 9.94612545406746e-06, "loss": 0.3812, "step": 1095 }, { "epoch": 0.37924495776590245, "grad_norm": 1.8972128591502049, "learning_rate": 9.945305082845738e-06, "loss": 0.3802, "step": 1100 }, { "epoch": 0.3809687984830202, "grad_norm": 1.2829640579295978, "learning_rate": 9.944478547003622e-06, "loss": 0.3634, "step": 1105 }, { "epoch": 0.3826926392001379, "grad_norm": 1.4149606943731012, "learning_rate": 9.943645847571439e-06, "loss": 0.3383, "step": 1110 }, { "epoch": 0.38441647991725564, "grad_norm": 1.2510105520686647, "learning_rate": 9.942806985587199e-06, "loss": 0.3848, "step": 1115 }, { "epoch": 0.38614032063437337, "grad_norm": 1.3127699282429592, "learning_rate": 9.941961962096595e-06, "loss": 0.3719, "step": 1120 }, { "epoch": 0.3878641613514911, "grad_norm": 1.2255416667018941, "learning_rate": 9.941110778152997e-06, "loss": 0.3662, "step": 1125 }, { "epoch": 0.3895880020686089, "grad_norm": 1.5072255633942293, "learning_rate": 9.94025343481746e-06, "loss": 0.3509, "step": 1130 }, { "epoch": 0.3913118427857266, "grad_norm": 1.5051483096888227, "learning_rate": 9.939389933158712e-06, "loss": 0.3689, "step": 1135 }, { "epoch": 0.39303568350284435, "grad_norm": 1.4244272154222093, "learning_rate": 9.93852027425316e-06, "loss": 0.3463, "step": 1140 }, { "epoch": 0.3947595242199621, "grad_norm": 1.547483122253386, "learning_rate": 9.937644459184887e-06, "loss": 0.3493, "step": 1145 }, { "epoch": 0.3964833649370798, "grad_norm": 1.332252429344222, "learning_rate": 9.936762489045648e-06, "loss": 0.356, "step": 1150 }, { "epoch": 0.39820720565419754, "grad_norm": 1.3004669243523812, "learning_rate": 9.935874364934875e-06, "loss": 0.3912, "step": 1155 }, { "epoch": 0.39993104637131527, "grad_norm": 1.6257996779485293, "learning_rate": 9.934980087959663e-06, "loss": 0.3479, "step": 1160 }, { "epoch": 0.40165488708843305, "grad_norm": 1.3921654223877478, "learning_rate": 9.934079659234787e-06, "loss": 0.3813, "step": 1165 }, { "epoch": 0.4033787278055508, "grad_norm": 1.8477133683088989, "learning_rate": 9.933173079882682e-06, "loss": 0.3463, "step": 1170 }, { "epoch": 0.4051025685226685, "grad_norm": 1.3347077947689194, "learning_rate": 9.932260351033456e-06, "loss": 0.356, "step": 1175 }, { "epoch": 0.40682640923978625, "grad_norm": 1.4338643697652664, "learning_rate": 9.931341473824879e-06, "loss": 0.3997, "step": 1180 }, { "epoch": 0.408550249956904, "grad_norm": 1.7317367152204977, "learning_rate": 9.930416449402388e-06, "loss": 0.3686, "step": 1185 }, { "epoch": 0.4102740906740217, "grad_norm": 1.507979353997919, "learning_rate": 9.92948527891908e-06, "loss": 0.3637, "step": 1190 }, { "epoch": 0.41199793139113944, "grad_norm": 1.403884182741799, "learning_rate": 9.928547963535717e-06, "loss": 0.3529, "step": 1195 }, { "epoch": 0.4137217721082572, "grad_norm": 1.3106169255528577, "learning_rate": 9.927604504420718e-06, "loss": 0.3494, "step": 1200 }, { "epoch": 0.41544561282537495, "grad_norm": 1.3396927515002877, "learning_rate": 9.926654902750163e-06, "loss": 0.3533, "step": 1205 }, { "epoch": 0.4171694535424927, "grad_norm": 1.183714203613263, "learning_rate": 9.925699159707784e-06, "loss": 0.3734, "step": 1210 }, { "epoch": 0.4188932942596104, "grad_norm": 1.389685074932271, "learning_rate": 9.924737276484974e-06, "loss": 0.3433, "step": 1215 }, { "epoch": 0.42061713497672815, "grad_norm": 1.1942490877101626, "learning_rate": 9.923769254280781e-06, "loss": 0.3503, "step": 1220 }, { "epoch": 0.4223409756938459, "grad_norm": 1.5953115471921129, "learning_rate": 9.9227950943019e-06, "loss": 0.3797, "step": 1225 }, { "epoch": 0.4240648164109636, "grad_norm": 1.3659510775769799, "learning_rate": 9.921814797762681e-06, "loss": 0.3805, "step": 1230 }, { "epoch": 0.42578865712808134, "grad_norm": 2.4060415220718827, "learning_rate": 9.920828365885121e-06, "loss": 0.3642, "step": 1235 }, { "epoch": 0.4275124978451991, "grad_norm": 1.1890557400582953, "learning_rate": 9.919835799898869e-06, "loss": 0.3565, "step": 1240 }, { "epoch": 0.42923633856231685, "grad_norm": 1.423047533670317, "learning_rate": 9.918837101041217e-06, "loss": 0.3504, "step": 1245 }, { "epoch": 0.4309601792794346, "grad_norm": 1.546104084922549, "learning_rate": 9.917832270557103e-06, "loss": 0.362, "step": 1250 }, { "epoch": 0.4326840199965523, "grad_norm": 1.30108724662532, "learning_rate": 9.916821309699112e-06, "loss": 0.4035, "step": 1255 }, { "epoch": 0.43440786071367005, "grad_norm": 1.6968915457556, "learning_rate": 9.915804219727463e-06, "loss": 0.3664, "step": 1260 }, { "epoch": 0.4361317014307878, "grad_norm": 1.7027916902035114, "learning_rate": 9.91478100191002e-06, "loss": 0.392, "step": 1265 }, { "epoch": 0.4378555421479055, "grad_norm": 1.4545975619385854, "learning_rate": 9.91375165752229e-06, "loss": 0.393, "step": 1270 }, { "epoch": 0.4395793828650233, "grad_norm": 1.6130421167609756, "learning_rate": 9.91271618784741e-06, "loss": 0.3748, "step": 1275 }, { "epoch": 0.441303223582141, "grad_norm": 1.3023988922113487, "learning_rate": 9.911674594176153e-06, "loss": 0.3361, "step": 1280 }, { "epoch": 0.44302706429925875, "grad_norm": 1.5374173378155058, "learning_rate": 9.91062687780693e-06, "loss": 0.3703, "step": 1285 }, { "epoch": 0.4447509050163765, "grad_norm": 1.5199725349390463, "learning_rate": 9.909573040045785e-06, "loss": 0.3607, "step": 1290 }, { "epoch": 0.4464747457334942, "grad_norm": 1.3413231991377519, "learning_rate": 9.908513082206386e-06, "loss": 0.347, "step": 1295 }, { "epoch": 0.44819858645061195, "grad_norm": 1.2462578869188095, "learning_rate": 9.907447005610038e-06, "loss": 0.3484, "step": 1300 }, { "epoch": 0.4499224271677297, "grad_norm": 1.4037202302710707, "learning_rate": 9.906374811585668e-06, "loss": 0.3712, "step": 1305 }, { "epoch": 0.45164626788484746, "grad_norm": 1.7753102784448989, "learning_rate": 9.90529650146983e-06, "loss": 0.3829, "step": 1310 }, { "epoch": 0.4533701086019652, "grad_norm": 1.2899182482456184, "learning_rate": 9.904212076606704e-06, "loss": 0.3747, "step": 1315 }, { "epoch": 0.4550939493190829, "grad_norm": 1.407583745226016, "learning_rate": 9.903121538348086e-06, "loss": 0.352, "step": 1320 }, { "epoch": 0.45681779003620065, "grad_norm": 1.577726964068593, "learning_rate": 9.902024888053404e-06, "loss": 0.3517, "step": 1325 }, { "epoch": 0.4585416307533184, "grad_norm": 1.2565158325938188, "learning_rate": 9.900922127089696e-06, "loss": 0.3717, "step": 1330 }, { "epoch": 0.4602654714704361, "grad_norm": 2.294699756680478, "learning_rate": 9.899813256831618e-06, "loss": 0.3658, "step": 1335 }, { "epoch": 0.46198931218755385, "grad_norm": 1.3078489945396463, "learning_rate": 9.898698278661448e-06, "loss": 0.3726, "step": 1340 }, { "epoch": 0.46371315290467163, "grad_norm": 1.1945961138468528, "learning_rate": 9.897577193969068e-06, "loss": 0.3856, "step": 1345 }, { "epoch": 0.46543699362178936, "grad_norm": 1.3912208144004374, "learning_rate": 9.89645000415198e-06, "loss": 0.3853, "step": 1350 }, { "epoch": 0.4671608343389071, "grad_norm": 1.4416719025269795, "learning_rate": 9.895316710615296e-06, "loss": 0.3748, "step": 1355 }, { "epoch": 0.4688846750560248, "grad_norm": 1.3417231467145718, "learning_rate": 9.89417731477173e-06, "loss": 0.3971, "step": 1360 }, { "epoch": 0.47060851577314256, "grad_norm": 1.2924588730347766, "learning_rate": 9.893031818041615e-06, "loss": 0.3857, "step": 1365 }, { "epoch": 0.4723323564902603, "grad_norm": 1.5799532568189445, "learning_rate": 9.891880221852872e-06, "loss": 0.3612, "step": 1370 }, { "epoch": 0.474056197207378, "grad_norm": 1.6455110984272108, "learning_rate": 9.890722527641041e-06, "loss": 0.3469, "step": 1375 }, { "epoch": 0.4757800379244958, "grad_norm": 1.2408173241480154, "learning_rate": 9.889558736849258e-06, "loss": 0.3341, "step": 1380 }, { "epoch": 0.47750387864161353, "grad_norm": 1.3054985860026322, "learning_rate": 9.888388850928254e-06, "loss": 0.3706, "step": 1385 }, { "epoch": 0.47922771935873126, "grad_norm": 1.6467829330469361, "learning_rate": 9.887212871336368e-06, "loss": 0.3367, "step": 1390 }, { "epoch": 0.480951560075849, "grad_norm": 1.2749999426414975, "learning_rate": 9.886030799539522e-06, "loss": 0.3429, "step": 1395 }, { "epoch": 0.4826754007929667, "grad_norm": 1.8119587682624123, "learning_rate": 9.884842637011245e-06, "loss": 0.4, "step": 1400 }, { "epoch": 0.48439924151008446, "grad_norm": 1.7911955620757787, "learning_rate": 9.883648385232654e-06, "loss": 0.3809, "step": 1405 }, { "epoch": 0.4861230822272022, "grad_norm": 1.310684798244541, "learning_rate": 9.88244804569245e-06, "loss": 0.3425, "step": 1410 }, { "epoch": 0.48784692294431997, "grad_norm": 1.5197330820828343, "learning_rate": 9.881241619886934e-06, "loss": 0.3677, "step": 1415 }, { "epoch": 0.4895707636614377, "grad_norm": 1.38254878981102, "learning_rate": 9.880029109319986e-06, "loss": 0.3774, "step": 1420 }, { "epoch": 0.49129460437855543, "grad_norm": 1.1583178223739745, "learning_rate": 9.878810515503074e-06, "loss": 0.3335, "step": 1425 }, { "epoch": 0.49301844509567316, "grad_norm": 1.2847768495296337, "learning_rate": 9.877585839955247e-06, "loss": 0.3414, "step": 1430 }, { "epoch": 0.4947422858127909, "grad_norm": 1.2512268003206457, "learning_rate": 9.87635508420314e-06, "loss": 0.3351, "step": 1435 }, { "epoch": 0.4964661265299086, "grad_norm": 1.6571534427005084, "learning_rate": 9.87511824978096e-06, "loss": 0.3747, "step": 1440 }, { "epoch": 0.49818996724702636, "grad_norm": 1.5612786668980918, "learning_rate": 9.873875338230499e-06, "loss": 0.3671, "step": 1445 }, { "epoch": 0.4999138079641441, "grad_norm": 1.3248757883806321, "learning_rate": 9.87262635110112e-06, "loss": 0.357, "step": 1450 }, { "epoch": 0.5016376486812618, "grad_norm": 1.1197006492173756, "learning_rate": 9.871371289949758e-06, "loss": 0.3599, "step": 1455 }, { "epoch": 0.5033614893983795, "grad_norm": 1.3032377507199844, "learning_rate": 9.870110156340928e-06, "loss": 0.3153, "step": 1460 }, { "epoch": 0.5050853301154973, "grad_norm": 1.2771120579191166, "learning_rate": 9.868842951846703e-06, "loss": 0.3573, "step": 1465 }, { "epoch": 0.5068091708326151, "grad_norm": 1.949617037013246, "learning_rate": 9.867569678046734e-06, "loss": 0.3853, "step": 1470 }, { "epoch": 0.5085330115497328, "grad_norm": 1.4482220126271506, "learning_rate": 9.86629033652823e-06, "loss": 0.3182, "step": 1475 }, { "epoch": 0.5102568522668506, "grad_norm": 1.341005156684862, "learning_rate": 9.865004928885968e-06, "loss": 0.3647, "step": 1480 }, { "epoch": 0.5119806929839683, "grad_norm": 1.4542069069853112, "learning_rate": 9.863713456722289e-06, "loss": 0.3657, "step": 1485 }, { "epoch": 0.513704533701086, "grad_norm": 5.6703414667784, "learning_rate": 9.862415921647087e-06, "loss": 0.3897, "step": 1490 }, { "epoch": 0.5154283744182038, "grad_norm": 3.2139034968460254, "learning_rate": 9.86111232527782e-06, "loss": 0.3229, "step": 1495 }, { "epoch": 0.5171522151353215, "grad_norm": 1.3612184719987817, "learning_rate": 9.859802669239497e-06, "loss": 0.3539, "step": 1500 }, { "epoch": 0.5188760558524392, "grad_norm": 1.7907244849071648, "learning_rate": 9.858486955164686e-06, "loss": 0.3847, "step": 1505 }, { "epoch": 0.520599896569557, "grad_norm": 1.387236181448054, "learning_rate": 9.857165184693502e-06, "loss": 0.3404, "step": 1510 }, { "epoch": 0.5223237372866747, "grad_norm": 2.7369191694718435, "learning_rate": 9.855837359473611e-06, "loss": 0.3377, "step": 1515 }, { "epoch": 0.5240475780037924, "grad_norm": 1.7846678078527498, "learning_rate": 9.854503481160229e-06, "loss": 0.3397, "step": 1520 }, { "epoch": 0.5257714187209102, "grad_norm": 1.1775736461224253, "learning_rate": 9.853163551416112e-06, "loss": 0.3181, "step": 1525 }, { "epoch": 0.5274952594380279, "grad_norm": 5.217423554831368, "learning_rate": 9.851817571911568e-06, "loss": 0.3786, "step": 1530 }, { "epoch": 0.5292191001551456, "grad_norm": 1.300451482508374, "learning_rate": 9.850465544324437e-06, "loss": 0.3424, "step": 1535 }, { "epoch": 0.5309429408722635, "grad_norm": 1.264550442244784, "learning_rate": 9.849107470340105e-06, "loss": 0.3489, "step": 1540 }, { "epoch": 0.5326667815893812, "grad_norm": 1.8979472196923746, "learning_rate": 9.847743351651493e-06, "loss": 0.4058, "step": 1545 }, { "epoch": 0.5343906223064989, "grad_norm": 1.3971555375184224, "learning_rate": 9.846373189959057e-06, "loss": 0.3803, "step": 1550 }, { "epoch": 0.5361144630236166, "grad_norm": 1.1945667113234897, "learning_rate": 9.844996986970785e-06, "loss": 0.351, "step": 1555 }, { "epoch": 0.5378383037407344, "grad_norm": 1.4265596933503348, "learning_rate": 9.843614744402199e-06, "loss": 0.3618, "step": 1560 }, { "epoch": 0.5395621444578521, "grad_norm": 1.6254590780571174, "learning_rate": 9.842226463976344e-06, "loss": 0.3651, "step": 1565 }, { "epoch": 0.5412859851749698, "grad_norm": 1.6247245781236594, "learning_rate": 9.840832147423797e-06, "loss": 0.3306, "step": 1570 }, { "epoch": 0.5430098258920876, "grad_norm": 1.220406406153, "learning_rate": 9.839431796482657e-06, "loss": 0.3223, "step": 1575 }, { "epoch": 0.5447336666092053, "grad_norm": 1.8630798820145742, "learning_rate": 9.83802541289855e-06, "loss": 0.3233, "step": 1580 }, { "epoch": 0.546457507326323, "grad_norm": 1.654861293681641, "learning_rate": 9.836612998424609e-06, "loss": 0.3422, "step": 1585 }, { "epoch": 0.5481813480434408, "grad_norm": 1.2926137649787477, "learning_rate": 9.8351945548215e-06, "loss": 0.3438, "step": 1590 }, { "epoch": 0.5499051887605585, "grad_norm": 1.4070463848515484, "learning_rate": 9.833770083857399e-06, "loss": 0.3429, "step": 1595 }, { "epoch": 0.5516290294776762, "grad_norm": 1.6643200904657351, "learning_rate": 9.832339587307993e-06, "loss": 0.3872, "step": 1600 }, { "epoch": 0.553352870194794, "grad_norm": 1.196628821556852, "learning_rate": 9.830903066956482e-06, "loss": 0.3584, "step": 1605 }, { "epoch": 0.5550767109119117, "grad_norm": 1.4221976338929014, "learning_rate": 9.829460524593573e-06, "loss": 0.3501, "step": 1610 }, { "epoch": 0.5568005516290295, "grad_norm": 1.231712612757513, "learning_rate": 9.828011962017483e-06, "loss": 0.3615, "step": 1615 }, { "epoch": 0.5585243923461473, "grad_norm": 3.782491737531238, "learning_rate": 9.826557381033935e-06, "loss": 0.3595, "step": 1620 }, { "epoch": 0.560248233063265, "grad_norm": 1.8192046148733767, "learning_rate": 9.82509678345615e-06, "loss": 0.3889, "step": 1625 }, { "epoch": 0.5619720737803827, "grad_norm": 1.1784062261575894, "learning_rate": 9.82363017110485e-06, "loss": 0.358, "step": 1630 }, { "epoch": 0.5636959144975004, "grad_norm": 1.1120097999467748, "learning_rate": 9.822157545808258e-06, "loss": 0.344, "step": 1635 }, { "epoch": 0.5654197552146182, "grad_norm": 1.231049378045983, "learning_rate": 9.820678909402086e-06, "loss": 0.3622, "step": 1640 }, { "epoch": 0.5671435959317359, "grad_norm": 1.4583871524324172, "learning_rate": 9.819194263729545e-06, "loss": 0.3324, "step": 1645 }, { "epoch": 0.5688674366488536, "grad_norm": 2.1448552999596324, "learning_rate": 9.817703610641338e-06, "loss": 0.3605, "step": 1650 }, { "epoch": 0.5705912773659714, "grad_norm": 1.4216658533014481, "learning_rate": 9.816206951995651e-06, "loss": 0.3583, "step": 1655 }, { "epoch": 0.5723151180830891, "grad_norm": 1.2683657570498639, "learning_rate": 9.81470428965816e-06, "loss": 0.3266, "step": 1660 }, { "epoch": 0.5740389588002068, "grad_norm": 1.429925418521831, "learning_rate": 9.813195625502023e-06, "loss": 0.3625, "step": 1665 }, { "epoch": 0.5757627995173246, "grad_norm": 1.3109369869244831, "learning_rate": 9.81168096140788e-06, "loss": 0.3496, "step": 1670 }, { "epoch": 0.5774866402344423, "grad_norm": 1.4097525415123802, "learning_rate": 9.810160299263854e-06, "loss": 0.3222, "step": 1675 }, { "epoch": 0.57921048095156, "grad_norm": 1.6167685051895064, "learning_rate": 9.808633640965538e-06, "loss": 0.3562, "step": 1680 }, { "epoch": 0.5809343216686779, "grad_norm": 1.4068806492951937, "learning_rate": 9.80710098841601e-06, "loss": 0.3241, "step": 1685 }, { "epoch": 0.5826581623857956, "grad_norm": 1.179635646405619, "learning_rate": 9.805562343525805e-06, "loss": 0.3857, "step": 1690 }, { "epoch": 0.5843820031029133, "grad_norm": 1.2173491024659624, "learning_rate": 9.804017708212942e-06, "loss": 0.3371, "step": 1695 }, { "epoch": 0.5861058438200311, "grad_norm": 1.3173546555259819, "learning_rate": 9.8024670844029e-06, "loss": 0.343, "step": 1700 }, { "epoch": 0.5878296845371488, "grad_norm": 1.4538435102753489, "learning_rate": 9.800910474028626e-06, "loss": 0.3582, "step": 1705 }, { "epoch": 0.5895535252542665, "grad_norm": 1.490191931421755, "learning_rate": 9.79934787903053e-06, "loss": 0.3407, "step": 1710 }, { "epoch": 0.5912773659713843, "grad_norm": 1.3278697952759766, "learning_rate": 9.797779301356476e-06, "loss": 0.3645, "step": 1715 }, { "epoch": 0.593001206688502, "grad_norm": 1.9231811456670627, "learning_rate": 9.796204742961794e-06, "loss": 0.3498, "step": 1720 }, { "epoch": 0.5947250474056197, "grad_norm": 3.4354823474910896, "learning_rate": 9.794624205809265e-06, "loss": 0.3389, "step": 1725 }, { "epoch": 0.5964488881227374, "grad_norm": 1.2929264142519374, "learning_rate": 9.793037691869122e-06, "loss": 0.3386, "step": 1730 }, { "epoch": 0.5981727288398552, "grad_norm": 1.3268190419109211, "learning_rate": 9.791445203119054e-06, "loss": 0.3607, "step": 1735 }, { "epoch": 0.5998965695569729, "grad_norm": 2.4156815305069195, "learning_rate": 9.789846741544189e-06, "loss": 0.3519, "step": 1740 }, { "epoch": 0.6016204102740906, "grad_norm": 2.105253860773333, "learning_rate": 9.78824230913711e-06, "loss": 0.3482, "step": 1745 }, { "epoch": 0.6033442509912084, "grad_norm": 1.4553672573498604, "learning_rate": 9.786631907897837e-06, "loss": 0.3444, "step": 1750 }, { "epoch": 0.6050680917083262, "grad_norm": 1.1683158328286432, "learning_rate": 9.785015539833833e-06, "loss": 0.328, "step": 1755 }, { "epoch": 0.6067919324254439, "grad_norm": 1.3640121639678129, "learning_rate": 9.783393206959994e-06, "loss": 0.3329, "step": 1760 }, { "epoch": 0.6085157731425617, "grad_norm": 1.3717937958436441, "learning_rate": 9.781764911298662e-06, "loss": 0.3608, "step": 1765 }, { "epoch": 0.6102396138596794, "grad_norm": 1.4079094810940234, "learning_rate": 9.780130654879598e-06, "loss": 0.352, "step": 1770 }, { "epoch": 0.6119634545767971, "grad_norm": 1.2300630026877328, "learning_rate": 9.778490439740008e-06, "loss": 0.3695, "step": 1775 }, { "epoch": 0.6136872952939149, "grad_norm": 1.4261147274189423, "learning_rate": 9.776844267924515e-06, "loss": 0.3089, "step": 1780 }, { "epoch": 0.6154111360110326, "grad_norm": 17.970470975419595, "learning_rate": 9.775192141485172e-06, "loss": 0.3452, "step": 1785 }, { "epoch": 0.6171349767281503, "grad_norm": 2.3325003922313328, "learning_rate": 9.773534062481455e-06, "loss": 0.3341, "step": 1790 }, { "epoch": 0.618858817445268, "grad_norm": 1.2644218427270757, "learning_rate": 9.771870032980258e-06, "loss": 0.3372, "step": 1795 }, { "epoch": 0.6205826581623858, "grad_norm": 2.267288591389249, "learning_rate": 9.770200055055895e-06, "loss": 0.368, "step": 1800 }, { "epoch": 0.6223064988795035, "grad_norm": 1.511457144800206, "learning_rate": 9.768524130790092e-06, "loss": 0.352, "step": 1805 }, { "epoch": 0.6240303395966212, "grad_norm": 1.2748023215607918, "learning_rate": 9.766842262271991e-06, "loss": 0.3449, "step": 1810 }, { "epoch": 0.625754180313739, "grad_norm": 1.6346116653311675, "learning_rate": 9.765154451598142e-06, "loss": 0.3413, "step": 1815 }, { "epoch": 0.6274780210308567, "grad_norm": 1.2129741803165814, "learning_rate": 9.763460700872504e-06, "loss": 0.3323, "step": 1820 }, { "epoch": 0.6292018617479744, "grad_norm": 1.5775136340309934, "learning_rate": 9.761761012206436e-06, "loss": 0.3464, "step": 1825 }, { "epoch": 0.6309257024650923, "grad_norm": 1.1506286707450157, "learning_rate": 9.760055387718705e-06, "loss": 0.3733, "step": 1830 }, { "epoch": 0.63264954318221, "grad_norm": 3.1483872499981613, "learning_rate": 9.758343829535475e-06, "loss": 0.3357, "step": 1835 }, { "epoch": 0.6343733838993277, "grad_norm": 6.66861345094595, "learning_rate": 9.756626339790304e-06, "loss": 0.35, "step": 1840 }, { "epoch": 0.6360972246164455, "grad_norm": 1.3407152188605196, "learning_rate": 9.754902920624148e-06, "loss": 0.362, "step": 1845 }, { "epoch": 0.6378210653335632, "grad_norm": 1.2053825635726498, "learning_rate": 9.75317357418535e-06, "loss": 0.3518, "step": 1850 }, { "epoch": 0.6395449060506809, "grad_norm": 1.1351690387521203, "learning_rate": 9.751438302629648e-06, "loss": 0.3375, "step": 1855 }, { "epoch": 0.6412687467677987, "grad_norm": 1.2654786468781802, "learning_rate": 9.74969710812016e-06, "loss": 0.3577, "step": 1860 }, { "epoch": 0.6429925874849164, "grad_norm": 1.320102261820997, "learning_rate": 9.74794999282739e-06, "loss": 0.3702, "step": 1865 }, { "epoch": 0.6447164282020341, "grad_norm": 1.2184163373499772, "learning_rate": 9.746196958929224e-06, "loss": 0.3565, "step": 1870 }, { "epoch": 0.6464402689191519, "grad_norm": 1.3086320859106735, "learning_rate": 9.744438008610923e-06, "loss": 0.3415, "step": 1875 }, { "epoch": 0.6481641096362696, "grad_norm": 1.260451122577548, "learning_rate": 9.742673144065124e-06, "loss": 0.3591, "step": 1880 }, { "epoch": 0.6498879503533873, "grad_norm": 2.2130311744776687, "learning_rate": 9.740902367491838e-06, "loss": 0.3321, "step": 1885 }, { "epoch": 0.651611791070505, "grad_norm": 1.3269825421431771, "learning_rate": 9.739125681098445e-06, "loss": 0.3881, "step": 1890 }, { "epoch": 0.6533356317876228, "grad_norm": 1.4558089848240794, "learning_rate": 9.737343087099688e-06, "loss": 0.3234, "step": 1895 }, { "epoch": 0.6550594725047406, "grad_norm": 1.2647154181590219, "learning_rate": 9.735554587717683e-06, "loss": 0.3452, "step": 1900 }, { "epoch": 0.6567833132218583, "grad_norm": 1.177397083432282, "learning_rate": 9.733760185181898e-06, "loss": 0.3363, "step": 1905 }, { "epoch": 0.6585071539389761, "grad_norm": 1.5200540976797097, "learning_rate": 9.731959881729166e-06, "loss": 0.3406, "step": 1910 }, { "epoch": 0.6602309946560938, "grad_norm": 1.0930012809935852, "learning_rate": 9.730153679603672e-06, "loss": 0.337, "step": 1915 }, { "epoch": 0.6619548353732115, "grad_norm": 1.4255344620994401, "learning_rate": 9.728341581056955e-06, "loss": 0.3449, "step": 1920 }, { "epoch": 0.6636786760903293, "grad_norm": 1.2281310053060395, "learning_rate": 9.726523588347906e-06, "loss": 0.3604, "step": 1925 }, { "epoch": 0.665402516807447, "grad_norm": 1.387743088432441, "learning_rate": 9.724699703742763e-06, "loss": 0.3445, "step": 1930 }, { "epoch": 0.6671263575245647, "grad_norm": 1.2543456914925757, "learning_rate": 9.72286992951511e-06, "loss": 0.3875, "step": 1935 }, { "epoch": 0.6688501982416825, "grad_norm": 1.3470232445654011, "learning_rate": 9.721034267945866e-06, "loss": 0.3692, "step": 1940 }, { "epoch": 0.6705740389588002, "grad_norm": 1.0384993936345281, "learning_rate": 9.719192721323297e-06, "loss": 0.3316, "step": 1945 }, { "epoch": 0.6722978796759179, "grad_norm": 2.3474011989080723, "learning_rate": 9.717345291943e-06, "loss": 0.376, "step": 1950 }, { "epoch": 0.6740217203930357, "grad_norm": 1.2785175572492598, "learning_rate": 9.715491982107905e-06, "loss": 0.3354, "step": 1955 }, { "epoch": 0.6757455611101534, "grad_norm": 1.4315614912377117, "learning_rate": 9.71363279412828e-06, "loss": 0.3648, "step": 1960 }, { "epoch": 0.6774694018272711, "grad_norm": 1.5159264140521431, "learning_rate": 9.71176773032171e-06, "loss": 0.3632, "step": 1965 }, { "epoch": 0.679193242544389, "grad_norm": 1.936391805248331, "learning_rate": 9.70989679301311e-06, "loss": 0.3251, "step": 1970 }, { "epoch": 0.6809170832615067, "grad_norm": 1.1908396682983318, "learning_rate": 9.708019984534717e-06, "loss": 0.3385, "step": 1975 }, { "epoch": 0.6826409239786244, "grad_norm": 1.194331525773947, "learning_rate": 9.706137307226085e-06, "loss": 0.3766, "step": 1980 }, { "epoch": 0.6843647646957421, "grad_norm": 1.384193293040237, "learning_rate": 9.704248763434086e-06, "loss": 0.3091, "step": 1985 }, { "epoch": 0.6860886054128599, "grad_norm": 1.3014083426984635, "learning_rate": 9.702354355512899e-06, "loss": 0.3361, "step": 1990 }, { "epoch": 0.6878124461299776, "grad_norm": 2.130779427145665, "learning_rate": 9.700454085824025e-06, "loss": 0.3382, "step": 1995 }, { "epoch": 0.6895362868470953, "grad_norm": 1.3918957544735562, "learning_rate": 9.698547956736257e-06, "loss": 0.3356, "step": 2000 }, { "epoch": 0.6912601275642131, "grad_norm": 1.1048646806456652, "learning_rate": 9.696635970625705e-06, "loss": 0.3796, "step": 2005 }, { "epoch": 0.6929839682813308, "grad_norm": 1.255839215498237, "learning_rate": 9.694718129875772e-06, "loss": 0.3704, "step": 2010 }, { "epoch": 0.6947078089984485, "grad_norm": 1.155480250834214, "learning_rate": 9.692794436877161e-06, "loss": 0.3115, "step": 2015 }, { "epoch": 0.6964316497155663, "grad_norm": 1.2630790059617156, "learning_rate": 9.690864894027876e-06, "loss": 0.3611, "step": 2020 }, { "epoch": 0.698155490432684, "grad_norm": 1.2491606966680813, "learning_rate": 9.688929503733202e-06, "loss": 0.3678, "step": 2025 }, { "epoch": 0.6998793311498017, "grad_norm": 1.317226937620061, "learning_rate": 9.686988268405725e-06, "loss": 0.3434, "step": 2030 }, { "epoch": 0.7016031718669195, "grad_norm": 2.9635268070050635, "learning_rate": 9.685041190465306e-06, "loss": 0.3567, "step": 2035 }, { "epoch": 0.7033270125840372, "grad_norm": 1.1111167588783066, "learning_rate": 9.683088272339098e-06, "loss": 0.3311, "step": 2040 }, { "epoch": 0.705050853301155, "grad_norm": 1.1727225408316133, "learning_rate": 9.681129516461533e-06, "loss": 0.3644, "step": 2045 }, { "epoch": 0.7067746940182728, "grad_norm": 1.2849192048716935, "learning_rate": 9.679164925274316e-06, "loss": 0.3292, "step": 2050 }, { "epoch": 0.7084985347353905, "grad_norm": 1.856294010427372, "learning_rate": 9.677194501226427e-06, "loss": 0.3598, "step": 2055 }, { "epoch": 0.7102223754525082, "grad_norm": 1.4344730481492045, "learning_rate": 9.675218246774119e-06, "loss": 0.4037, "step": 2060 }, { "epoch": 0.711946216169626, "grad_norm": 1.2596112394096157, "learning_rate": 9.673236164380912e-06, "loss": 0.3594, "step": 2065 }, { "epoch": 0.7136700568867437, "grad_norm": 1.0430720211348907, "learning_rate": 9.671248256517593e-06, "loss": 0.3473, "step": 2070 }, { "epoch": 0.7153938976038614, "grad_norm": 2.2414599858134174, "learning_rate": 9.669254525662206e-06, "loss": 0.3449, "step": 2075 }, { "epoch": 0.7171177383209791, "grad_norm": 1.2595526917390543, "learning_rate": 9.667254974300058e-06, "loss": 0.3339, "step": 2080 }, { "epoch": 0.7188415790380969, "grad_norm": 1.5145459258516831, "learning_rate": 9.66524960492371e-06, "loss": 0.3652, "step": 2085 }, { "epoch": 0.7205654197552146, "grad_norm": 1.3622039449929448, "learning_rate": 9.663238420032974e-06, "loss": 0.3482, "step": 2090 }, { "epoch": 0.7222892604723323, "grad_norm": 4.393090680903247, "learning_rate": 9.661221422134916e-06, "loss": 0.3369, "step": 2095 }, { "epoch": 0.7240131011894501, "grad_norm": 1.3436475260291876, "learning_rate": 9.659198613743843e-06, "loss": 0.346, "step": 2100 }, { "epoch": 0.7257369419065678, "grad_norm": 1.1735302491637403, "learning_rate": 9.657169997381309e-06, "loss": 0.3287, "step": 2105 }, { "epoch": 0.7274607826236855, "grad_norm": 1.6624901063970146, "learning_rate": 9.655135575576104e-06, "loss": 0.3296, "step": 2110 }, { "epoch": 0.7291846233408034, "grad_norm": 1.1642361211785006, "learning_rate": 9.653095350864258e-06, "loss": 0.3646, "step": 2115 }, { "epoch": 0.7309084640579211, "grad_norm": 1.2379510076499574, "learning_rate": 9.651049325789035e-06, "loss": 0.3384, "step": 2120 }, { "epoch": 0.7326323047750388, "grad_norm": 2.350647701258199, "learning_rate": 9.648997502900927e-06, "loss": 0.339, "step": 2125 }, { "epoch": 0.7343561454921566, "grad_norm": 1.2552644816504759, "learning_rate": 9.646939884757658e-06, "loss": 0.3339, "step": 2130 }, { "epoch": 0.7360799862092743, "grad_norm": 1.3038930976731238, "learning_rate": 9.644876473924169e-06, "loss": 0.3607, "step": 2135 }, { "epoch": 0.737803826926392, "grad_norm": 1.1710383911554159, "learning_rate": 9.642807272972628e-06, "loss": 0.3267, "step": 2140 }, { "epoch": 0.7395276676435097, "grad_norm": 1.1416737908534371, "learning_rate": 9.640732284482415e-06, "loss": 0.3512, "step": 2145 }, { "epoch": 0.7412515083606275, "grad_norm": 1.1621384764475449, "learning_rate": 9.638651511040133e-06, "loss": 0.3516, "step": 2150 }, { "epoch": 0.7429753490777452, "grad_norm": 1.1264664231349113, "learning_rate": 9.636564955239589e-06, "loss": 0.357, "step": 2155 }, { "epoch": 0.7446991897948629, "grad_norm": 1.4580010683693982, "learning_rate": 9.6344726196818e-06, "loss": 0.3437, "step": 2160 }, { "epoch": 0.7464230305119807, "grad_norm": 1.444505088947196, "learning_rate": 9.632374506974989e-06, "loss": 0.3521, "step": 2165 }, { "epoch": 0.7481468712290984, "grad_norm": 1.3267725643785786, "learning_rate": 9.63027061973458e-06, "loss": 0.3367, "step": 2170 }, { "epoch": 0.7498707119462161, "grad_norm": 1.1604147870475694, "learning_rate": 9.628160960583193e-06, "loss": 0.3767, "step": 2175 }, { "epoch": 0.7515945526633339, "grad_norm": 2.337394597573418, "learning_rate": 9.626045532150645e-06, "loss": 0.3725, "step": 2180 }, { "epoch": 0.7533183933804517, "grad_norm": 1.2134445156430822, "learning_rate": 9.62392433707395e-06, "loss": 0.3569, "step": 2185 }, { "epoch": 0.7550422340975694, "grad_norm": 1.6532910321870786, "learning_rate": 9.6217973779973e-06, "loss": 0.3532, "step": 2190 }, { "epoch": 0.7567660748146872, "grad_norm": 1.3308694918317898, "learning_rate": 9.619664657572077e-06, "loss": 0.3364, "step": 2195 }, { "epoch": 0.7584899155318049, "grad_norm": 1.4402124304669235, "learning_rate": 9.61752617845685e-06, "loss": 0.3678, "step": 2200 }, { "epoch": 0.7602137562489226, "grad_norm": 1.407378477530841, "learning_rate": 9.615381943317358e-06, "loss": 0.3388, "step": 2205 }, { "epoch": 0.7619375969660404, "grad_norm": 1.3100326233019954, "learning_rate": 9.613231954826522e-06, "loss": 0.3434, "step": 2210 }, { "epoch": 0.7636614376831581, "grad_norm": 1.3152288801879068, "learning_rate": 9.61107621566443e-06, "loss": 0.3761, "step": 2215 }, { "epoch": 0.7653852784002758, "grad_norm": 1.8018198787615154, "learning_rate": 9.608914728518342e-06, "loss": 0.3421, "step": 2220 }, { "epoch": 0.7671091191173935, "grad_norm": 1.3715552770372919, "learning_rate": 9.60674749608268e-06, "loss": 0.3842, "step": 2225 }, { "epoch": 0.7688329598345113, "grad_norm": 1.2502019396132484, "learning_rate": 9.604574521059031e-06, "loss": 0.3527, "step": 2230 }, { "epoch": 0.770556800551629, "grad_norm": 1.292033578517676, "learning_rate": 9.602395806156138e-06, "loss": 0.3373, "step": 2235 }, { "epoch": 0.7722806412687467, "grad_norm": 1.2513714620754817, "learning_rate": 9.600211354089903e-06, "loss": 0.3696, "step": 2240 }, { "epoch": 0.7740044819858645, "grad_norm": 1.2447118649170084, "learning_rate": 9.598021167583374e-06, "loss": 0.339, "step": 2245 }, { "epoch": 0.7757283227029822, "grad_norm": 1.4852008354533497, "learning_rate": 9.595825249366751e-06, "loss": 0.3278, "step": 2250 }, { "epoch": 0.7774521634200999, "grad_norm": 1.3741570480902097, "learning_rate": 9.593623602177378e-06, "loss": 0.3537, "step": 2255 }, { "epoch": 0.7791760041372178, "grad_norm": 1.7110318822891042, "learning_rate": 9.59141622875974e-06, "loss": 0.3133, "step": 2260 }, { "epoch": 0.7808998448543355, "grad_norm": 1.0798645745221636, "learning_rate": 9.589203131865464e-06, "loss": 0.3188, "step": 2265 }, { "epoch": 0.7826236855714532, "grad_norm": 1.4075076569973053, "learning_rate": 9.586984314253307e-06, "loss": 0.3378, "step": 2270 }, { "epoch": 0.784347526288571, "grad_norm": 1.2503754359181298, "learning_rate": 9.584759778689157e-06, "loss": 0.3364, "step": 2275 }, { "epoch": 0.7860713670056887, "grad_norm": 1.1529449498678912, "learning_rate": 9.582529527946032e-06, "loss": 0.333, "step": 2280 }, { "epoch": 0.7877952077228064, "grad_norm": 1.333286467678276, "learning_rate": 9.580293564804074e-06, "loss": 0.3512, "step": 2285 }, { "epoch": 0.7895190484399242, "grad_norm": 3.0938545533335153, "learning_rate": 9.578051892050548e-06, "loss": 0.3487, "step": 2290 }, { "epoch": 0.7912428891570419, "grad_norm": 1.232161292795314, "learning_rate": 9.57580451247983e-06, "loss": 0.3503, "step": 2295 }, { "epoch": 0.7929667298741596, "grad_norm": 1.2968956904022089, "learning_rate": 9.573551428893419e-06, "loss": 0.3635, "step": 2300 }, { "epoch": 0.7946905705912773, "grad_norm": 1.2207877754566046, "learning_rate": 9.571292644099914e-06, "loss": 0.3435, "step": 2305 }, { "epoch": 0.7964144113083951, "grad_norm": 1.485339530667337, "learning_rate": 9.569028160915028e-06, "loss": 0.3545, "step": 2310 }, { "epoch": 0.7981382520255128, "grad_norm": 1.5574049401540915, "learning_rate": 9.566757982161576e-06, "loss": 0.3382, "step": 2315 }, { "epoch": 0.7998620927426305, "grad_norm": 1.1584070857731443, "learning_rate": 9.564482110669473e-06, "loss": 0.3396, "step": 2320 }, { "epoch": 0.8015859334597483, "grad_norm": 1.0932314469914213, "learning_rate": 9.56220054927573e-06, "loss": 0.3654, "step": 2325 }, { "epoch": 0.8033097741768661, "grad_norm": 1.7106605016330145, "learning_rate": 9.559913300824448e-06, "loss": 0.3235, "step": 2330 }, { "epoch": 0.8050336148939838, "grad_norm": 1.332995683769837, "learning_rate": 9.55762036816682e-06, "loss": 0.3302, "step": 2335 }, { "epoch": 0.8067574556111016, "grad_norm": 2.651225259868728, "learning_rate": 9.555321754161128e-06, "loss": 0.333, "step": 2340 }, { "epoch": 0.8084812963282193, "grad_norm": 1.0688478928234517, "learning_rate": 9.553017461672731e-06, "loss": 0.3239, "step": 2345 }, { "epoch": 0.810205137045337, "grad_norm": 1.8617172914803979, "learning_rate": 9.550707493574068e-06, "loss": 0.3424, "step": 2350 }, { "epoch": 0.8119289777624548, "grad_norm": 1.1028777782836379, "learning_rate": 9.548391852744653e-06, "loss": 0.3154, "step": 2355 }, { "epoch": 0.8136528184795725, "grad_norm": 1.3623541183125842, "learning_rate": 9.546070542071072e-06, "loss": 0.3488, "step": 2360 }, { "epoch": 0.8153766591966902, "grad_norm": 1.2406464456818898, "learning_rate": 9.543743564446978e-06, "loss": 0.3329, "step": 2365 }, { "epoch": 0.817100499913808, "grad_norm": 1.536245772157553, "learning_rate": 9.541410922773089e-06, "loss": 0.3423, "step": 2370 }, { "epoch": 0.8188243406309257, "grad_norm": 1.5397276731246494, "learning_rate": 9.539072619957183e-06, "loss": 0.3247, "step": 2375 }, { "epoch": 0.8205481813480434, "grad_norm": 1.8060539523506391, "learning_rate": 9.536728658914097e-06, "loss": 0.3568, "step": 2380 }, { "epoch": 0.8222720220651611, "grad_norm": 1.4592439115566356, "learning_rate": 9.534379042565717e-06, "loss": 0.3336, "step": 2385 }, { "epoch": 0.8239958627822789, "grad_norm": 1.3185535134406328, "learning_rate": 9.532023773840982e-06, "loss": 0.3392, "step": 2390 }, { "epoch": 0.8257197034993966, "grad_norm": 1.3342775023387434, "learning_rate": 9.529662855675876e-06, "loss": 0.315, "step": 2395 }, { "epoch": 0.8274435442165144, "grad_norm": 1.3718867632280016, "learning_rate": 9.527296291013426e-06, "loss": 0.3243, "step": 2400 }, { "epoch": 0.8291673849336322, "grad_norm": 1.3647005703897526, "learning_rate": 9.524924082803698e-06, "loss": 0.3333, "step": 2405 }, { "epoch": 0.8308912256507499, "grad_norm": 1.5042258411470462, "learning_rate": 9.522546234003788e-06, "loss": 0.336, "step": 2410 }, { "epoch": 0.8326150663678676, "grad_norm": 1.3461680568437342, "learning_rate": 9.520162747577835e-06, "loss": 0.3365, "step": 2415 }, { "epoch": 0.8343389070849854, "grad_norm": 1.1933386344717056, "learning_rate": 9.517773626496993e-06, "loss": 0.3504, "step": 2420 }, { "epoch": 0.8360627478021031, "grad_norm": 1.1962471023938015, "learning_rate": 9.515378873739446e-06, "loss": 0.3557, "step": 2425 }, { "epoch": 0.8377865885192208, "grad_norm": 1.2334697182629366, "learning_rate": 9.512978492290399e-06, "loss": 0.3505, "step": 2430 }, { "epoch": 0.8395104292363386, "grad_norm": 1.2404195025607767, "learning_rate": 9.51057248514207e-06, "loss": 0.3286, "step": 2435 }, { "epoch": 0.8412342699534563, "grad_norm": 1.4456394535074233, "learning_rate": 9.508160855293692e-06, "loss": 0.3703, "step": 2440 }, { "epoch": 0.842958110670574, "grad_norm": 1.211630844825725, "learning_rate": 9.505743605751508e-06, "loss": 0.3327, "step": 2445 }, { "epoch": 0.8446819513876918, "grad_norm": 1.2508066585995028, "learning_rate": 9.503320739528765e-06, "loss": 0.3198, "step": 2450 }, { "epoch": 0.8464057921048095, "grad_norm": 1.5023037643225363, "learning_rate": 9.500892259645711e-06, "loss": 0.3555, "step": 2455 }, { "epoch": 0.8481296328219272, "grad_norm": 1.3347230510275558, "learning_rate": 9.498458169129592e-06, "loss": 0.353, "step": 2460 }, { "epoch": 0.849853473539045, "grad_norm": 2.7517703869349965, "learning_rate": 9.496018471014647e-06, "loss": 0.3576, "step": 2465 }, { "epoch": 0.8515773142561627, "grad_norm": 1.4814688915816552, "learning_rate": 9.493573168342109e-06, "loss": 0.3301, "step": 2470 }, { "epoch": 0.8533011549732805, "grad_norm": 1.2196033889361055, "learning_rate": 9.491122264160196e-06, "loss": 0.297, "step": 2475 }, { "epoch": 0.8550249956903982, "grad_norm": 1.275401994882182, "learning_rate": 9.488665761524103e-06, "loss": 0.3234, "step": 2480 }, { "epoch": 0.856748836407516, "grad_norm": 3.4667876017122747, "learning_rate": 9.486203663496013e-06, "loss": 0.354, "step": 2485 }, { "epoch": 0.8584726771246337, "grad_norm": 1.2188819142562994, "learning_rate": 9.483735973145073e-06, "loss": 0.3304, "step": 2490 }, { "epoch": 0.8601965178417514, "grad_norm": 1.2053209562676213, "learning_rate": 9.481262693547416e-06, "loss": 0.3272, "step": 2495 }, { "epoch": 0.8619203585588692, "grad_norm": 1.157798675578704, "learning_rate": 9.47878382778613e-06, "loss": 0.3283, "step": 2500 }, { "epoch": 0.8636441992759869, "grad_norm": 1.277783662247749, "learning_rate": 9.476299378951267e-06, "loss": 0.3269, "step": 2505 }, { "epoch": 0.8653680399931046, "grad_norm": 1.2785783526501449, "learning_rate": 9.473809350139846e-06, "loss": 0.3482, "step": 2510 }, { "epoch": 0.8670918807102224, "grad_norm": 1.36486848600861, "learning_rate": 9.471313744455839e-06, "loss": 0.363, "step": 2515 }, { "epoch": 0.8688157214273401, "grad_norm": 1.1617192231119264, "learning_rate": 9.468812565010164e-06, "loss": 0.3517, "step": 2520 }, { "epoch": 0.8705395621444578, "grad_norm": 1.5440363899586036, "learning_rate": 9.466305814920695e-06, "loss": 0.3313, "step": 2525 }, { "epoch": 0.8722634028615756, "grad_norm": 1.0159466150889511, "learning_rate": 9.463793497312246e-06, "loss": 0.3326, "step": 2530 }, { "epoch": 0.8739872435786933, "grad_norm": 1.2885891218483585, "learning_rate": 9.461275615316571e-06, "loss": 0.3579, "step": 2535 }, { "epoch": 0.875711084295811, "grad_norm": 2.4592342977104957, "learning_rate": 9.458752172072363e-06, "loss": 0.3556, "step": 2540 }, { "epoch": 0.8774349250129289, "grad_norm": 1.472629523629724, "learning_rate": 9.456223170725244e-06, "loss": 0.3517, "step": 2545 }, { "epoch": 0.8791587657300466, "grad_norm": 1.4734918044829217, "learning_rate": 9.453688614427772e-06, "loss": 0.3058, "step": 2550 }, { "epoch": 0.8808826064471643, "grad_norm": 1.4448743076562125, "learning_rate": 9.451148506339416e-06, "loss": 0.357, "step": 2555 }, { "epoch": 0.882606447164282, "grad_norm": 4.491847049166671, "learning_rate": 9.44860284962658e-06, "loss": 0.3279, "step": 2560 }, { "epoch": 0.8843302878813998, "grad_norm": 1.3416304743321867, "learning_rate": 9.446051647462573e-06, "loss": 0.3478, "step": 2565 }, { "epoch": 0.8860541285985175, "grad_norm": 1.1288996950397197, "learning_rate": 9.443494903027626e-06, "loss": 0.3319, "step": 2570 }, { "epoch": 0.8877779693156352, "grad_norm": 0.901498432790985, "learning_rate": 9.440932619508873e-06, "loss": 0.3426, "step": 2575 }, { "epoch": 0.889501810032753, "grad_norm": 1.1035942636790148, "learning_rate": 9.438364800100355e-06, "loss": 0.3306, "step": 2580 }, { "epoch": 0.8912256507498707, "grad_norm": 1.0523924262373272, "learning_rate": 9.435791448003013e-06, "loss": 0.3198, "step": 2585 }, { "epoch": 0.8929494914669884, "grad_norm": 1.4009671647525208, "learning_rate": 9.433212566424687e-06, "loss": 0.3303, "step": 2590 }, { "epoch": 0.8946733321841062, "grad_norm": 1.1826714745257199, "learning_rate": 9.430628158580106e-06, "loss": 0.3471, "step": 2595 }, { "epoch": 0.8963971729012239, "grad_norm": 1.4810150161689448, "learning_rate": 9.42803822769089e-06, "loss": 0.3574, "step": 2600 }, { "epoch": 0.8981210136183416, "grad_norm": 1.441077514610335, "learning_rate": 9.425442776985545e-06, "loss": 0.3435, "step": 2605 }, { "epoch": 0.8998448543354594, "grad_norm": 1.1357452889666666, "learning_rate": 9.422841809699456e-06, "loss": 0.3241, "step": 2610 }, { "epoch": 0.9015686950525772, "grad_norm": 2.0085711068152, "learning_rate": 9.420235329074884e-06, "loss": 0.3234, "step": 2615 }, { "epoch": 0.9032925357696949, "grad_norm": 0.9489424603080546, "learning_rate": 9.417623338360969e-06, "loss": 0.3154, "step": 2620 }, { "epoch": 0.9050163764868127, "grad_norm": 1.4232674265899794, "learning_rate": 9.415005840813707e-06, "loss": 0.3303, "step": 2625 }, { "epoch": 0.9067402172039304, "grad_norm": 1.3382898651573043, "learning_rate": 9.41238283969597e-06, "loss": 0.3301, "step": 2630 }, { "epoch": 0.9084640579210481, "grad_norm": 1.1231685188301026, "learning_rate": 9.409754338277488e-06, "loss": 0.3085, "step": 2635 }, { "epoch": 0.9101878986381658, "grad_norm": 1.5592137300364184, "learning_rate": 9.407120339834844e-06, "loss": 0.3581, "step": 2640 }, { "epoch": 0.9119117393552836, "grad_norm": 1.4910951835642579, "learning_rate": 9.404480847651478e-06, "loss": 0.335, "step": 2645 }, { "epoch": 0.9136355800724013, "grad_norm": 1.3912407376667717, "learning_rate": 9.401835865017672e-06, "loss": 0.3016, "step": 2650 }, { "epoch": 0.915359420789519, "grad_norm": 1.253916795107278, "learning_rate": 9.399185395230561e-06, "loss": 0.3157, "step": 2655 }, { "epoch": 0.9170832615066368, "grad_norm": 2.2440199096218425, "learning_rate": 9.396529441594108e-06, "loss": 0.3496, "step": 2660 }, { "epoch": 0.9188071022237545, "grad_norm": 1.419891140198786, "learning_rate": 9.393868007419128e-06, "loss": 0.3507, "step": 2665 }, { "epoch": 0.9205309429408722, "grad_norm": 1.2073706223295508, "learning_rate": 9.391201096023253e-06, "loss": 0.3083, "step": 2670 }, { "epoch": 0.92225478365799, "grad_norm": 1.2615155330650807, "learning_rate": 9.388528710730948e-06, "loss": 0.321, "step": 2675 }, { "epoch": 0.9239786243751077, "grad_norm": 1.339076863408327, "learning_rate": 9.385850854873507e-06, "loss": 0.3353, "step": 2680 }, { "epoch": 0.9257024650922254, "grad_norm": 1.2825836435340867, "learning_rate": 9.383167531789034e-06, "loss": 0.3511, "step": 2685 }, { "epoch": 0.9274263058093433, "grad_norm": 1.7019430585409971, "learning_rate": 9.380478744822455e-06, "loss": 0.3074, "step": 2690 }, { "epoch": 0.929150146526461, "grad_norm": 1.233264044715385, "learning_rate": 9.377784497325501e-06, "loss": 0.3282, "step": 2695 }, { "epoch": 0.9308739872435787, "grad_norm": 1.2831370149893337, "learning_rate": 9.37508479265672e-06, "loss": 0.317, "step": 2700 }, { "epoch": 0.9325978279606965, "grad_norm": 1.2661466151020366, "learning_rate": 9.372379634181451e-06, "loss": 0.3164, "step": 2705 }, { "epoch": 0.9343216686778142, "grad_norm": 1.3461679472854156, "learning_rate": 9.36966902527184e-06, "loss": 0.3859, "step": 2710 }, { "epoch": 0.9360455093949319, "grad_norm": 1.3258623040461295, "learning_rate": 9.366952969306821e-06, "loss": 0.343, "step": 2715 }, { "epoch": 0.9377693501120496, "grad_norm": 1.2837719220100288, "learning_rate": 9.364231469672125e-06, "loss": 0.31, "step": 2720 }, { "epoch": 0.9394931908291674, "grad_norm": 1.1383293285293314, "learning_rate": 9.361504529760261e-06, "loss": 0.3466, "step": 2725 }, { "epoch": 0.9412170315462851, "grad_norm": 1.2941441350697112, "learning_rate": 9.358772152970528e-06, "loss": 0.3444, "step": 2730 }, { "epoch": 0.9429408722634028, "grad_norm": 1.69131432027105, "learning_rate": 9.356034342708995e-06, "loss": 0.3672, "step": 2735 }, { "epoch": 0.9446647129805206, "grad_norm": 0.9926487053723504, "learning_rate": 9.353291102388509e-06, "loss": 0.3131, "step": 2740 }, { "epoch": 0.9463885536976383, "grad_norm": 1.3410566345399482, "learning_rate": 9.350542435428682e-06, "loss": 0.3183, "step": 2745 }, { "epoch": 0.948112394414756, "grad_norm": 1.3214435277868297, "learning_rate": 9.347788345255895e-06, "loss": 0.3237, "step": 2750 }, { "epoch": 0.9498362351318738, "grad_norm": 1.0277835304235838, "learning_rate": 9.345028835303287e-06, "loss": 0.3319, "step": 2755 }, { "epoch": 0.9515600758489916, "grad_norm": 4.14593407556766, "learning_rate": 9.342263909010752e-06, "loss": 0.3116, "step": 2760 }, { "epoch": 0.9532839165661093, "grad_norm": 1.6385378541366884, "learning_rate": 9.339493569824937e-06, "loss": 0.3329, "step": 2765 }, { "epoch": 0.9550077572832271, "grad_norm": 1.3821378989058908, "learning_rate": 9.336717821199237e-06, "loss": 0.3046, "step": 2770 }, { "epoch": 0.9567315980003448, "grad_norm": 1.4381237457331928, "learning_rate": 9.33393666659379e-06, "loss": 0.3285, "step": 2775 }, { "epoch": 0.9584554387174625, "grad_norm": 1.216809366937311, "learning_rate": 9.331150109475473e-06, "loss": 0.3125, "step": 2780 }, { "epoch": 0.9601792794345803, "grad_norm": 1.3041047185014534, "learning_rate": 9.328358153317895e-06, "loss": 0.3826, "step": 2785 }, { "epoch": 0.961903120151698, "grad_norm": 2.2222305774162647, "learning_rate": 9.3255608016014e-06, "loss": 0.3297, "step": 2790 }, { "epoch": 0.9636269608688157, "grad_norm": 1.1654097857044674, "learning_rate": 9.322758057813053e-06, "loss": 0.3307, "step": 2795 }, { "epoch": 0.9653508015859334, "grad_norm": 1.6721298722949953, "learning_rate": 9.319949925446646e-06, "loss": 0.3361, "step": 2800 }, { "epoch": 0.9670746423030512, "grad_norm": 1.752818697182659, "learning_rate": 9.31713640800268e-06, "loss": 0.3317, "step": 2805 }, { "epoch": 0.9687984830201689, "grad_norm": 1.5385925523268744, "learning_rate": 9.31431750898838e-06, "loss": 0.353, "step": 2810 }, { "epoch": 0.9705223237372866, "grad_norm": 2.029079104356329, "learning_rate": 9.311493231917668e-06, "loss": 0.3441, "step": 2815 }, { "epoch": 0.9722461644544044, "grad_norm": 2.3846356000705025, "learning_rate": 9.308663580311176e-06, "loss": 0.3238, "step": 2820 }, { "epoch": 0.9739700051715221, "grad_norm": 1.1577758429063203, "learning_rate": 9.30582855769624e-06, "loss": 0.3658, "step": 2825 }, { "epoch": 0.9756938458886399, "grad_norm": 2.1131532793323284, "learning_rate": 9.30298816760688e-06, "loss": 0.289, "step": 2830 }, { "epoch": 0.9774176866057577, "grad_norm": 1.3175010394166544, "learning_rate": 9.300142413583815e-06, "loss": 0.3084, "step": 2835 }, { "epoch": 0.9791415273228754, "grad_norm": 1.3865265635249566, "learning_rate": 9.297291299174451e-06, "loss": 0.3108, "step": 2840 }, { "epoch": 0.9808653680399931, "grad_norm": 1.2854712886862314, "learning_rate": 9.294434827932873e-06, "loss": 0.3238, "step": 2845 }, { "epoch": 0.9825892087571109, "grad_norm": 1.3196575952522713, "learning_rate": 9.29157300341984e-06, "loss": 0.3476, "step": 2850 }, { "epoch": 0.9843130494742286, "grad_norm": 1.166391706467773, "learning_rate": 9.288705829202795e-06, "loss": 0.3467, "step": 2855 }, { "epoch": 0.9860368901913463, "grad_norm": 2.1517267968283766, "learning_rate": 9.28583330885584e-06, "loss": 0.3206, "step": 2860 }, { "epoch": 0.9877607309084641, "grad_norm": 1.4302470939931653, "learning_rate": 9.282955445959742e-06, "loss": 0.2991, "step": 2865 }, { "epoch": 0.9894845716255818, "grad_norm": 1.2982846886282287, "learning_rate": 9.280072244101935e-06, "loss": 0.3439, "step": 2870 }, { "epoch": 0.9912084123426995, "grad_norm": 1.4171862473277814, "learning_rate": 9.277183706876503e-06, "loss": 0.33, "step": 2875 }, { "epoch": 0.9929322530598172, "grad_norm": 2.0529464439224743, "learning_rate": 9.274289837884177e-06, "loss": 0.3273, "step": 2880 }, { "epoch": 0.994656093776935, "grad_norm": 1.4179302230128747, "learning_rate": 9.271390640732344e-06, "loss": 0.3249, "step": 2885 }, { "epoch": 0.9963799344940527, "grad_norm": 1.1838232344069644, "learning_rate": 9.268486119035024e-06, "loss": 0.3261, "step": 2890 }, { "epoch": 0.9981037752111704, "grad_norm": 1.0809177117913324, "learning_rate": 9.26557627641288e-06, "loss": 0.3367, "step": 2895 }, { "epoch": 0.9998276159282882, "grad_norm": 1.3718377055896473, "learning_rate": 9.262661116493206e-06, "loss": 0.3793, "step": 2900 }, { "epoch": 1.0013790725736942, "grad_norm": 1.4469424397242794, "learning_rate": 9.259740642909925e-06, "loss": 0.3396, "step": 2905 }, { "epoch": 1.003102913290812, "grad_norm": 1.2088296961564604, "learning_rate": 9.25681485930358e-06, "loss": 0.2943, "step": 2910 }, { "epoch": 1.0048267540079296, "grad_norm": 1.0212872692451622, "learning_rate": 9.253883769321338e-06, "loss": 0.3255, "step": 2915 }, { "epoch": 1.0065505947250475, "grad_norm": 1.2957301045173608, "learning_rate": 9.250947376616981e-06, "loss": 0.3406, "step": 2920 }, { "epoch": 1.008274435442165, "grad_norm": 1.200707887021714, "learning_rate": 9.248005684850899e-06, "loss": 0.3247, "step": 2925 }, { "epoch": 1.009998276159283, "grad_norm": 1.9336721553938447, "learning_rate": 9.245058697690082e-06, "loss": 0.33, "step": 2930 }, { "epoch": 1.0117221168764006, "grad_norm": 1.2363403152433825, "learning_rate": 9.242106418808135e-06, "loss": 0.2935, "step": 2935 }, { "epoch": 1.0134459575935184, "grad_norm": 1.0602086781702837, "learning_rate": 9.239148851885246e-06, "loss": 0.333, "step": 2940 }, { "epoch": 1.015169798310636, "grad_norm": 1.0500617420268046, "learning_rate": 9.236186000608202e-06, "loss": 0.2984, "step": 2945 }, { "epoch": 1.0168936390277539, "grad_norm": 1.4653937545800964, "learning_rate": 9.233217868670375e-06, "loss": 0.3551, "step": 2950 }, { "epoch": 1.0186174797448715, "grad_norm": 1.1579878357723785, "learning_rate": 9.23024445977172e-06, "loss": 0.323, "step": 2955 }, { "epoch": 1.0203413204619893, "grad_norm": 1.3960967621772689, "learning_rate": 9.22726577761877e-06, "loss": 0.328, "step": 2960 }, { "epoch": 1.022065161179107, "grad_norm": 1.6683684640391248, "learning_rate": 9.224281825924633e-06, "loss": 0.3151, "step": 2965 }, { "epoch": 1.0237890018962248, "grad_norm": 1.3594375481318157, "learning_rate": 9.221292608408981e-06, "loss": 0.3328, "step": 2970 }, { "epoch": 1.0255128426133426, "grad_norm": 1.1961076479551558, "learning_rate": 9.218298128798057e-06, "loss": 0.2866, "step": 2975 }, { "epoch": 1.0272366833304603, "grad_norm": 1.3264006343525812, "learning_rate": 9.21529839082466e-06, "loss": 0.3634, "step": 2980 }, { "epoch": 1.028960524047578, "grad_norm": 1.4380114758343852, "learning_rate": 9.212293398228143e-06, "loss": 0.326, "step": 2985 }, { "epoch": 1.0306843647646957, "grad_norm": 1.2633287466496788, "learning_rate": 9.209283154754407e-06, "loss": 0.3243, "step": 2990 }, { "epoch": 1.0324082054818136, "grad_norm": 1.2068503164627076, "learning_rate": 9.206267664155906e-06, "loss": 0.3284, "step": 2995 }, { "epoch": 1.0341320461989312, "grad_norm": 0.9972841581038544, "learning_rate": 9.20324693019163e-06, "loss": 0.3141, "step": 3000 }, { "epoch": 1.035855886916049, "grad_norm": 1.3456009009348888, "learning_rate": 9.200220956627103e-06, "loss": 0.3067, "step": 3005 }, { "epoch": 1.0375797276331666, "grad_norm": 1.4124353691852705, "learning_rate": 9.197189747234386e-06, "loss": 0.3139, "step": 3010 }, { "epoch": 1.0393035683502845, "grad_norm": 1.2134329536513566, "learning_rate": 9.194153305792063e-06, "loss": 0.303, "step": 3015 }, { "epoch": 1.041027409067402, "grad_norm": 1.225486462924102, "learning_rate": 9.191111636085239e-06, "loss": 0.3456, "step": 3020 }, { "epoch": 1.04275124978452, "grad_norm": 1.200538272769869, "learning_rate": 9.188064741905541e-06, "loss": 0.3398, "step": 3025 }, { "epoch": 1.0444750905016376, "grad_norm": 1.1788390122132568, "learning_rate": 9.185012627051104e-06, "loss": 0.308, "step": 3030 }, { "epoch": 1.0461989312187554, "grad_norm": 1.1923896675301378, "learning_rate": 9.181955295326577e-06, "loss": 0.2828, "step": 3035 }, { "epoch": 1.047922771935873, "grad_norm": 1.2818716741324634, "learning_rate": 9.178892750543102e-06, "loss": 0.3064, "step": 3040 }, { "epoch": 1.0496466126529909, "grad_norm": 1.754963405197714, "learning_rate": 9.175824996518328e-06, "loss": 0.317, "step": 3045 }, { "epoch": 1.0513704533701087, "grad_norm": 1.4595740948374027, "learning_rate": 9.172752037076397e-06, "loss": 0.3331, "step": 3050 }, { "epoch": 1.0530942940872263, "grad_norm": 1.4914539720002655, "learning_rate": 9.169673876047935e-06, "loss": 0.3124, "step": 3055 }, { "epoch": 1.0548181348043442, "grad_norm": 1.3450930772698597, "learning_rate": 9.166590517270057e-06, "loss": 0.3342, "step": 3060 }, { "epoch": 1.0565419755214618, "grad_norm": 1.5353690769757897, "learning_rate": 9.163501964586352e-06, "loss": 0.3258, "step": 3065 }, { "epoch": 1.0582658162385796, "grad_norm": 1.31240648943605, "learning_rate": 9.160408221846892e-06, "loss": 0.279, "step": 3070 }, { "epoch": 1.0599896569556972, "grad_norm": 1.3020899417469012, "learning_rate": 9.157309292908209e-06, "loss": 0.3284, "step": 3075 }, { "epoch": 1.061713497672815, "grad_norm": 1.123033745362766, "learning_rate": 9.154205181633307e-06, "loss": 0.3202, "step": 3080 }, { "epoch": 1.0634373383899327, "grad_norm": 1.3700619980016504, "learning_rate": 9.151095891891645e-06, "loss": 0.3505, "step": 3085 }, { "epoch": 1.0651611791070505, "grad_norm": 1.2780386112530913, "learning_rate": 9.147981427559143e-06, "loss": 0.2824, "step": 3090 }, { "epoch": 1.0668850198241682, "grad_norm": 1.101535842869913, "learning_rate": 9.144861792518165e-06, "loss": 0.3002, "step": 3095 }, { "epoch": 1.068608860541286, "grad_norm": 2.1390900051858357, "learning_rate": 9.141736990657525e-06, "loss": 0.3183, "step": 3100 }, { "epoch": 1.0703327012584036, "grad_norm": 1.7802300522958283, "learning_rate": 9.138607025872479e-06, "loss": 0.319, "step": 3105 }, { "epoch": 1.0720565419755215, "grad_norm": 2.541754822505671, "learning_rate": 9.135471902064715e-06, "loss": 0.3219, "step": 3110 }, { "epoch": 1.073780382692639, "grad_norm": 1.1246883363907023, "learning_rate": 9.13233162314235e-06, "loss": 0.2884, "step": 3115 }, { "epoch": 1.075504223409757, "grad_norm": 1.1851281288816107, "learning_rate": 9.129186193019936e-06, "loss": 0.3292, "step": 3120 }, { "epoch": 1.0772280641268748, "grad_norm": 1.317805691787828, "learning_rate": 9.126035615618436e-06, "loss": 0.3393, "step": 3125 }, { "epoch": 1.0789519048439924, "grad_norm": 0.9821849032061132, "learning_rate": 9.12287989486524e-06, "loss": 0.3064, "step": 3130 }, { "epoch": 1.0806757455611102, "grad_norm": 3.1902851908124754, "learning_rate": 9.119719034694138e-06, "loss": 0.3136, "step": 3135 }, { "epoch": 1.0823995862782279, "grad_norm": 1.057752385226104, "learning_rate": 9.116553039045335e-06, "loss": 0.2912, "step": 3140 }, { "epoch": 1.0841234269953457, "grad_norm": 1.082838286388527, "learning_rate": 9.113381911865438e-06, "loss": 0.3005, "step": 3145 }, { "epoch": 1.0858472677124633, "grad_norm": 1.207253429285479, "learning_rate": 9.110205657107442e-06, "loss": 0.3317, "step": 3150 }, { "epoch": 1.0875711084295812, "grad_norm": 2.1789368895253, "learning_rate": 9.107024278730745e-06, "loss": 0.3206, "step": 3155 }, { "epoch": 1.0892949491466988, "grad_norm": 1.0505887404596654, "learning_rate": 9.103837780701123e-06, "loss": 0.3246, "step": 3160 }, { "epoch": 1.0910187898638166, "grad_norm": 1.0985766251187308, "learning_rate": 9.10064616699074e-06, "loss": 0.318, "step": 3165 }, { "epoch": 1.0927426305809342, "grad_norm": 1.6052042628818102, "learning_rate": 9.097449441578133e-06, "loss": 0.3143, "step": 3170 }, { "epoch": 1.094466471298052, "grad_norm": 1.1837835031734694, "learning_rate": 9.094247608448212e-06, "loss": 0.3072, "step": 3175 }, { "epoch": 1.0961903120151697, "grad_norm": 1.2079701069646844, "learning_rate": 9.091040671592255e-06, "loss": 0.3203, "step": 3180 }, { "epoch": 1.0979141527322875, "grad_norm": 1.2820392494380455, "learning_rate": 9.087828635007905e-06, "loss": 0.3057, "step": 3185 }, { "epoch": 1.0996379934494054, "grad_norm": 1.0910305561507039, "learning_rate": 9.084611502699156e-06, "loss": 0.2925, "step": 3190 }, { "epoch": 1.101361834166523, "grad_norm": 1.1750103231464422, "learning_rate": 9.081389278676356e-06, "loss": 0.3023, "step": 3195 }, { "epoch": 1.1030856748836408, "grad_norm": 1.0523181897701774, "learning_rate": 9.078161966956205e-06, "loss": 0.3073, "step": 3200 }, { "epoch": 1.1048095156007585, "grad_norm": 1.1144835013330965, "learning_rate": 9.074929571561737e-06, "loss": 0.3103, "step": 3205 }, { "epoch": 1.1065333563178763, "grad_norm": 1.2826801652393287, "learning_rate": 9.071692096522331e-06, "loss": 0.3234, "step": 3210 }, { "epoch": 1.108257197034994, "grad_norm": 1.2110538700727251, "learning_rate": 9.068449545873692e-06, "loss": 0.3247, "step": 3215 }, { "epoch": 1.1099810377521118, "grad_norm": 1.1195375320247638, "learning_rate": 9.065201923657854e-06, "loss": 0.3239, "step": 3220 }, { "epoch": 1.1117048784692294, "grad_norm": 1.1149731135135896, "learning_rate": 9.061949233923176e-06, "loss": 0.3068, "step": 3225 }, { "epoch": 1.1134287191863472, "grad_norm": 1.095240329352778, "learning_rate": 9.058691480724329e-06, "loss": 0.3092, "step": 3230 }, { "epoch": 1.1151525599034648, "grad_norm": 1.2147303093492796, "learning_rate": 9.055428668122302e-06, "loss": 0.3157, "step": 3235 }, { "epoch": 1.1168764006205827, "grad_norm": 1.299461369702117, "learning_rate": 9.052160800184383e-06, "loss": 0.319, "step": 3240 }, { "epoch": 1.1186002413377003, "grad_norm": 1.0488502649200553, "learning_rate": 9.04888788098417e-06, "loss": 0.2859, "step": 3245 }, { "epoch": 1.1203240820548181, "grad_norm": 1.6973882557434135, "learning_rate": 9.04560991460155e-06, "loss": 0.314, "step": 3250 }, { "epoch": 1.122047922771936, "grad_norm": 1.4241709959155497, "learning_rate": 9.042326905122708e-06, "loss": 0.3169, "step": 3255 }, { "epoch": 1.1237717634890536, "grad_norm": 1.3947509943465393, "learning_rate": 9.039038856640112e-06, "loss": 0.3052, "step": 3260 }, { "epoch": 1.1254956042061712, "grad_norm": 5.946235156364782, "learning_rate": 9.035745773252512e-06, "loss": 0.3199, "step": 3265 }, { "epoch": 1.127219444923289, "grad_norm": 1.2064611015080702, "learning_rate": 9.032447659064936e-06, "loss": 0.287, "step": 3270 }, { "epoch": 1.128943285640407, "grad_norm": 1.1525822962827261, "learning_rate": 9.029144518188679e-06, "loss": 0.3187, "step": 3275 }, { "epoch": 1.1306671263575245, "grad_norm": 1.0183538734942175, "learning_rate": 9.02583635474131e-06, "loss": 0.3084, "step": 3280 }, { "epoch": 1.1323909670746424, "grad_norm": 1.0409103846604317, "learning_rate": 9.022523172846646e-06, "loss": 0.3147, "step": 3285 }, { "epoch": 1.13411480779176, "grad_norm": 1.1631051294259775, "learning_rate": 9.019204976634774e-06, "loss": 0.3399, "step": 3290 }, { "epoch": 1.1358386485088778, "grad_norm": 1.3342095603729827, "learning_rate": 9.015881770242024e-06, "loss": 0.3262, "step": 3295 }, { "epoch": 1.1375624892259955, "grad_norm": 1.0923642457198397, "learning_rate": 9.012553557810973e-06, "loss": 0.3059, "step": 3300 }, { "epoch": 1.1392863299431133, "grad_norm": 1.135252373838748, "learning_rate": 9.009220343490435e-06, "loss": 0.3011, "step": 3305 }, { "epoch": 1.141010170660231, "grad_norm": 1.3095446317894224, "learning_rate": 9.005882131435465e-06, "loss": 0.2911, "step": 3310 }, { "epoch": 1.1427340113773488, "grad_norm": 2.2004334678146464, "learning_rate": 9.002538925807345e-06, "loss": 0.3129, "step": 3315 }, { "epoch": 1.1444578520944664, "grad_norm": 1.1777030293123387, "learning_rate": 8.999190730773582e-06, "loss": 0.3194, "step": 3320 }, { "epoch": 1.1461816928115842, "grad_norm": 1.955546480394541, "learning_rate": 8.995837550507903e-06, "loss": 0.2928, "step": 3325 }, { "epoch": 1.1479055335287018, "grad_norm": 1.108216299144341, "learning_rate": 8.992479389190247e-06, "loss": 0.3273, "step": 3330 }, { "epoch": 1.1496293742458197, "grad_norm": 1.5101398201775058, "learning_rate": 8.989116251006766e-06, "loss": 0.2962, "step": 3335 }, { "epoch": 1.1513532149629375, "grad_norm": 0.9838854320360326, "learning_rate": 8.985748140149813e-06, "loss": 0.3044, "step": 3340 }, { "epoch": 1.1530770556800551, "grad_norm": 1.7167124539287208, "learning_rate": 8.982375060817942e-06, "loss": 0.318, "step": 3345 }, { "epoch": 1.154800896397173, "grad_norm": 1.1990845752020232, "learning_rate": 8.978997017215897e-06, "loss": 0.2834, "step": 3350 }, { "epoch": 1.1565247371142906, "grad_norm": 1.2563787199271776, "learning_rate": 8.975614013554619e-06, "loss": 0.3079, "step": 3355 }, { "epoch": 1.1582485778314084, "grad_norm": 1.2228714968733068, "learning_rate": 8.972226054051217e-06, "loss": 0.3214, "step": 3360 }, { "epoch": 1.159972418548526, "grad_norm": 1.1188032594839905, "learning_rate": 8.968833142928992e-06, "loss": 0.3212, "step": 3365 }, { "epoch": 1.161696259265644, "grad_norm": 1.1579918395516766, "learning_rate": 8.96543528441741e-06, "loss": 0.3457, "step": 3370 }, { "epoch": 1.1634200999827615, "grad_norm": 1.270614333694507, "learning_rate": 8.962032482752107e-06, "loss": 0.3016, "step": 3375 }, { "epoch": 1.1651439406998794, "grad_norm": 1.029638728499931, "learning_rate": 8.958624742174881e-06, "loss": 0.3206, "step": 3380 }, { "epoch": 1.166867781416997, "grad_norm": 1.0422434350804568, "learning_rate": 8.955212066933683e-06, "loss": 0.3261, "step": 3385 }, { "epoch": 1.1685916221341148, "grad_norm": 1.166885288746339, "learning_rate": 8.95179446128262e-06, "loss": 0.3082, "step": 3390 }, { "epoch": 1.1703154628512324, "grad_norm": 1.1527357707503587, "learning_rate": 8.948371929481941e-06, "loss": 0.2944, "step": 3395 }, { "epoch": 1.1720393035683503, "grad_norm": 1.3451173935652678, "learning_rate": 8.94494447579804e-06, "loss": 0.3336, "step": 3400 }, { "epoch": 1.1737631442854681, "grad_norm": 1.248928659098544, "learning_rate": 8.941512104503444e-06, "loss": 0.3211, "step": 3405 }, { "epoch": 1.1754869850025857, "grad_norm": 1.3621113086681973, "learning_rate": 8.938074819876809e-06, "loss": 0.3385, "step": 3410 }, { "epoch": 1.1772108257197036, "grad_norm": 1.3886706920077845, "learning_rate": 8.934632626202922e-06, "loss": 0.3017, "step": 3415 }, { "epoch": 1.1789346664368212, "grad_norm": 1.4960290800079372, "learning_rate": 8.931185527772676e-06, "loss": 0.2949, "step": 3420 }, { "epoch": 1.180658507153939, "grad_norm": 2.142670664410332, "learning_rate": 8.927733528883094e-06, "loss": 0.3264, "step": 3425 }, { "epoch": 1.1823823478710567, "grad_norm": 1.168035816519859, "learning_rate": 8.924276633837297e-06, "loss": 0.2848, "step": 3430 }, { "epoch": 1.1841061885881745, "grad_norm": 1.078767182728306, "learning_rate": 8.920814846944513e-06, "loss": 0.3182, "step": 3435 }, { "epoch": 1.1858300293052921, "grad_norm": 1.1575427200338022, "learning_rate": 8.917348172520069e-06, "loss": 0.3453, "step": 3440 }, { "epoch": 1.18755387002241, "grad_norm": 1.1751267417332532, "learning_rate": 8.913876614885381e-06, "loss": 0.3262, "step": 3445 }, { "epoch": 1.1892777107395276, "grad_norm": 2.714565615270745, "learning_rate": 8.910400178367958e-06, "loss": 0.3157, "step": 3450 }, { "epoch": 1.1910015514566454, "grad_norm": 1.1841621039542385, "learning_rate": 8.906918867301384e-06, "loss": 0.2714, "step": 3455 }, { "epoch": 1.192725392173763, "grad_norm": 1.2687659930527437, "learning_rate": 8.903432686025326e-06, "loss": 0.3344, "step": 3460 }, { "epoch": 1.194449232890881, "grad_norm": 1.177974001863437, "learning_rate": 8.899941638885513e-06, "loss": 0.3098, "step": 3465 }, { "epoch": 1.1961730736079987, "grad_norm": 1.1834203084489565, "learning_rate": 8.896445730233753e-06, "loss": 0.2964, "step": 3470 }, { "epoch": 1.1978969143251164, "grad_norm": 1.120645544236912, "learning_rate": 8.892944964427902e-06, "loss": 0.306, "step": 3475 }, { "epoch": 1.199620755042234, "grad_norm": 1.5043334332333145, "learning_rate": 8.889439345831873e-06, "loss": 0.3282, "step": 3480 }, { "epoch": 1.2013445957593518, "grad_norm": 1.2627887443563284, "learning_rate": 8.885928878815635e-06, "loss": 0.3183, "step": 3485 }, { "epoch": 1.2030684364764697, "grad_norm": 1.3312654827479626, "learning_rate": 8.882413567755196e-06, "loss": 0.3118, "step": 3490 }, { "epoch": 1.2047922771935873, "grad_norm": 1.1999457191173653, "learning_rate": 8.8788934170326e-06, "loss": 0.3009, "step": 3495 }, { "epoch": 1.2065161179107051, "grad_norm": 1.2149297405117525, "learning_rate": 8.87536843103593e-06, "loss": 0.302, "step": 3500 }, { "epoch": 1.2082399586278227, "grad_norm": 1.5737309802337405, "learning_rate": 8.87183861415929e-06, "loss": 0.3134, "step": 3505 }, { "epoch": 1.2099637993449406, "grad_norm": 1.2583340120837145, "learning_rate": 8.868303970802812e-06, "loss": 0.2971, "step": 3510 }, { "epoch": 1.2116876400620582, "grad_norm": 1.2160266975293088, "learning_rate": 8.864764505372638e-06, "loss": 0.336, "step": 3515 }, { "epoch": 1.213411480779176, "grad_norm": 1.1282174036075983, "learning_rate": 8.86122022228093e-06, "loss": 0.3225, "step": 3520 }, { "epoch": 1.2151353214962937, "grad_norm": 1.0651429482619807, "learning_rate": 8.857671125945846e-06, "loss": 0.3159, "step": 3525 }, { "epoch": 1.2168591622134115, "grad_norm": 1.1616952116761117, "learning_rate": 8.854117220791549e-06, "loss": 0.3183, "step": 3530 }, { "epoch": 1.2185830029305291, "grad_norm": 1.1444499355129203, "learning_rate": 8.850558511248195e-06, "loss": 0.2943, "step": 3535 }, { "epoch": 1.220306843647647, "grad_norm": 2.159298770877469, "learning_rate": 8.846995001751932e-06, "loss": 0.2877, "step": 3540 }, { "epoch": 1.2220306843647646, "grad_norm": 1.1699216183514227, "learning_rate": 8.843426696744888e-06, "loss": 0.3311, "step": 3545 }, { "epoch": 1.2237545250818824, "grad_norm": 1.5762911225905711, "learning_rate": 8.83985360067517e-06, "loss": 0.2807, "step": 3550 }, { "epoch": 1.2254783657990003, "grad_norm": 1.1775995024929828, "learning_rate": 8.836275717996853e-06, "loss": 0.3127, "step": 3555 }, { "epoch": 1.2272022065161179, "grad_norm": 1.2924795578245347, "learning_rate": 8.832693053169991e-06, "loss": 0.2866, "step": 3560 }, { "epoch": 1.2289260472332357, "grad_norm": 1.0826637187224928, "learning_rate": 8.829105610660587e-06, "loss": 0.334, "step": 3565 }, { "epoch": 1.2306498879503533, "grad_norm": 1.0878241010781369, "learning_rate": 8.825513394940604e-06, "loss": 0.3189, "step": 3570 }, { "epoch": 1.2323737286674712, "grad_norm": 1.6919986779201839, "learning_rate": 8.821916410487955e-06, "loss": 0.2935, "step": 3575 }, { "epoch": 1.2340975693845888, "grad_norm": 1.196708144055184, "learning_rate": 8.818314661786496e-06, "loss": 0.3207, "step": 3580 }, { "epoch": 1.2358214101017067, "grad_norm": 1.2977363127019375, "learning_rate": 8.814708153326025e-06, "loss": 0.3441, "step": 3585 }, { "epoch": 1.2375452508188243, "grad_norm": 5.186676555016642, "learning_rate": 8.811096889602275e-06, "loss": 0.3031, "step": 3590 }, { "epoch": 1.2392690915359421, "grad_norm": 1.273320277381821, "learning_rate": 8.807480875116901e-06, "loss": 0.3225, "step": 3595 }, { "epoch": 1.2409929322530597, "grad_norm": 1.457400171271567, "learning_rate": 8.80386011437748e-06, "loss": 0.3329, "step": 3600 }, { "epoch": 1.2427167729701776, "grad_norm": 1.1685070126812422, "learning_rate": 8.800234611897513e-06, "loss": 0.3049, "step": 3605 }, { "epoch": 1.2444406136872952, "grad_norm": 1.222717746578087, "learning_rate": 8.796604372196401e-06, "loss": 0.3019, "step": 3610 }, { "epoch": 1.246164454404413, "grad_norm": 1.187029207324435, "learning_rate": 8.792969399799464e-06, "loss": 0.2978, "step": 3615 }, { "epoch": 1.2478882951215309, "grad_norm": 2.264623865148118, "learning_rate": 8.789329699237907e-06, "loss": 0.3225, "step": 3620 }, { "epoch": 1.2496121358386485, "grad_norm": 1.3352759557893117, "learning_rate": 8.78568527504884e-06, "loss": 0.3437, "step": 3625 }, { "epoch": 1.2513359765557661, "grad_norm": 1.1603793017125967, "learning_rate": 8.782036131775255e-06, "loss": 0.3106, "step": 3630 }, { "epoch": 1.253059817272884, "grad_norm": 1.0931930826137783, "learning_rate": 8.77838227396603e-06, "loss": 0.3369, "step": 3635 }, { "epoch": 1.2547836579900018, "grad_norm": 1.101995328424067, "learning_rate": 8.774723706175919e-06, "loss": 0.3173, "step": 3640 }, { "epoch": 1.2565074987071194, "grad_norm": 1.301541638576734, "learning_rate": 8.771060432965543e-06, "loss": 0.2696, "step": 3645 }, { "epoch": 1.2582313394242373, "grad_norm": 1.7329598603808272, "learning_rate": 8.767392458901395e-06, "loss": 0.3086, "step": 3650 }, { "epoch": 1.2599551801413549, "grad_norm": 1.143101022541705, "learning_rate": 8.76371978855583e-06, "loss": 0.2676, "step": 3655 }, { "epoch": 1.2616790208584727, "grad_norm": 1.358724583745211, "learning_rate": 8.760042426507044e-06, "loss": 0.2685, "step": 3660 }, { "epoch": 1.2634028615755903, "grad_norm": 1.2314537967737824, "learning_rate": 8.756360377339097e-06, "loss": 0.2943, "step": 3665 }, { "epoch": 1.2651267022927082, "grad_norm": 1.2763524736651122, "learning_rate": 8.752673645641882e-06, "loss": 0.2538, "step": 3670 }, { "epoch": 1.2668505430098258, "grad_norm": 1.3379171604640265, "learning_rate": 8.748982236011132e-06, "loss": 0.319, "step": 3675 }, { "epoch": 1.2685743837269436, "grad_norm": 1.1703348158657632, "learning_rate": 8.74528615304841e-06, "loss": 0.33, "step": 3680 }, { "epoch": 1.2702982244440615, "grad_norm": 2.208958053875747, "learning_rate": 8.74158540136111e-06, "loss": 0.3147, "step": 3685 }, { "epoch": 1.272022065161179, "grad_norm": 1.0576540493714783, "learning_rate": 8.737879985562437e-06, "loss": 0.3166, "step": 3690 }, { "epoch": 1.2737459058782967, "grad_norm": 1.3831866611618326, "learning_rate": 8.734169910271418e-06, "loss": 0.3221, "step": 3695 }, { "epoch": 1.2754697465954146, "grad_norm": 1.266831655506686, "learning_rate": 8.730455180112885e-06, "loss": 0.311, "step": 3700 }, { "epoch": 1.2771935873125324, "grad_norm": 1.1691148138632521, "learning_rate": 8.72673579971747e-06, "loss": 0.3119, "step": 3705 }, { "epoch": 1.27891742802965, "grad_norm": 1.1632580575334266, "learning_rate": 8.723011773721606e-06, "loss": 0.3021, "step": 3710 }, { "epoch": 1.2806412687467679, "grad_norm": 1.3312894813926992, "learning_rate": 8.719283106767515e-06, "loss": 0.3322, "step": 3715 }, { "epoch": 1.2823651094638855, "grad_norm": 1.2394797838694025, "learning_rate": 8.715549803503206e-06, "loss": 0.3199, "step": 3720 }, { "epoch": 1.2840889501810033, "grad_norm": 1.1460264439012757, "learning_rate": 8.711811868582469e-06, "loss": 0.3094, "step": 3725 }, { "epoch": 1.285812790898121, "grad_norm": 1.8253868322485098, "learning_rate": 8.708069306664857e-06, "loss": 0.3091, "step": 3730 }, { "epoch": 1.2875366316152388, "grad_norm": 1.174995699726055, "learning_rate": 8.704322122415705e-06, "loss": 0.3072, "step": 3735 }, { "epoch": 1.2892604723323564, "grad_norm": 1.2405692201112009, "learning_rate": 8.7005703205061e-06, "loss": 0.2784, "step": 3740 }, { "epoch": 1.2909843130494743, "grad_norm": 1.2139381271740364, "learning_rate": 8.696813905612894e-06, "loss": 0.315, "step": 3745 }, { "epoch": 1.292708153766592, "grad_norm": 1.1218970511407682, "learning_rate": 8.693052882418679e-06, "loss": 0.275, "step": 3750 }, { "epoch": 1.2944319944837097, "grad_norm": 1.315075351816208, "learning_rate": 8.689287255611798e-06, "loss": 0.2777, "step": 3755 }, { "epoch": 1.2961558352008273, "grad_norm": 1.2052547476295588, "learning_rate": 8.685517029886333e-06, "loss": 0.3089, "step": 3760 }, { "epoch": 1.2978796759179452, "grad_norm": 1.1711089575310607, "learning_rate": 8.681742209942097e-06, "loss": 0.3102, "step": 3765 }, { "epoch": 1.299603516635063, "grad_norm": 2.5907011576274526, "learning_rate": 8.677962800484628e-06, "loss": 0.3102, "step": 3770 }, { "epoch": 1.3013273573521806, "grad_norm": 1.2269836411875583, "learning_rate": 8.674178806225189e-06, "loss": 0.311, "step": 3775 }, { "epoch": 1.3030511980692985, "grad_norm": 10.195710693149598, "learning_rate": 8.670390231880757e-06, "loss": 0.2927, "step": 3780 }, { "epoch": 1.304775038786416, "grad_norm": 4.387608525503374, "learning_rate": 8.666597082174018e-06, "loss": 0.3526, "step": 3785 }, { "epoch": 1.306498879503534, "grad_norm": 2.4775231725757676, "learning_rate": 8.662799361833358e-06, "loss": 0.3559, "step": 3790 }, { "epoch": 1.3082227202206516, "grad_norm": 1.7873721070217716, "learning_rate": 8.65899707559287e-06, "loss": 0.297, "step": 3795 }, { "epoch": 1.3099465609377694, "grad_norm": 1.209675958623188, "learning_rate": 8.655190228192327e-06, "loss": 0.2945, "step": 3800 }, { "epoch": 1.311670401654887, "grad_norm": 1.2066551121091627, "learning_rate": 8.651378824377197e-06, "loss": 0.3098, "step": 3805 }, { "epoch": 1.3133942423720049, "grad_norm": 1.1850448243433895, "learning_rate": 8.647562868898623e-06, "loss": 0.2962, "step": 3810 }, { "epoch": 1.3151180830891227, "grad_norm": 1.1464089305962142, "learning_rate": 8.643742366513421e-06, "loss": 0.3074, "step": 3815 }, { "epoch": 1.3168419238062403, "grad_norm": 1.3818831810491352, "learning_rate": 8.639917321984081e-06, "loss": 0.3102, "step": 3820 }, { "epoch": 1.318565764523358, "grad_norm": 1.6127204863145952, "learning_rate": 8.636087740078749e-06, "loss": 0.3085, "step": 3825 }, { "epoch": 1.3202896052404758, "grad_norm": 1.1506016824908816, "learning_rate": 8.63225362557123e-06, "loss": 0.3119, "step": 3830 }, { "epoch": 1.3220134459575936, "grad_norm": 1.1695078929991647, "learning_rate": 8.628414983240978e-06, "loss": 0.2929, "step": 3835 }, { "epoch": 1.3237372866747112, "grad_norm": 1.3850274658923796, "learning_rate": 8.62457181787309e-06, "loss": 0.2963, "step": 3840 }, { "epoch": 1.3254611273918289, "grad_norm": 1.4119811784754794, "learning_rate": 8.620724134258308e-06, "loss": 0.2986, "step": 3845 }, { "epoch": 1.3271849681089467, "grad_norm": 1.1063803465154691, "learning_rate": 8.616871937192995e-06, "loss": 0.3162, "step": 3850 }, { "epoch": 1.3289088088260645, "grad_norm": 1.247495661891309, "learning_rate": 8.61301523147915e-06, "loss": 0.2842, "step": 3855 }, { "epoch": 1.3306326495431822, "grad_norm": 1.1480440865690702, "learning_rate": 8.60915402192439e-06, "loss": 0.2823, "step": 3860 }, { "epoch": 1.3323564902603, "grad_norm": 1.2707825018347805, "learning_rate": 8.605288313341942e-06, "loss": 0.286, "step": 3865 }, { "epoch": 1.3340803309774176, "grad_norm": 1.2614854491642877, "learning_rate": 8.601418110550645e-06, "loss": 0.3255, "step": 3870 }, { "epoch": 1.3358041716945355, "grad_norm": 1.591165196055561, "learning_rate": 8.597543418374943e-06, "loss": 0.3394, "step": 3875 }, { "epoch": 1.337528012411653, "grad_norm": 1.103027484329325, "learning_rate": 8.593664241644868e-06, "loss": 0.3165, "step": 3880 }, { "epoch": 1.339251853128771, "grad_norm": 1.20833888216885, "learning_rate": 8.58978058519605e-06, "loss": 0.2591, "step": 3885 }, { "epoch": 1.3409756938458886, "grad_norm": 1.3884345182667164, "learning_rate": 8.5858924538697e-06, "loss": 0.2952, "step": 3890 }, { "epoch": 1.3426995345630064, "grad_norm": 1.1228948582398766, "learning_rate": 8.581999852512606e-06, "loss": 0.3159, "step": 3895 }, { "epoch": 1.3444233752801242, "grad_norm": 1.4477570266687043, "learning_rate": 8.578102785977134e-06, "loss": 0.3326, "step": 3900 }, { "epoch": 1.3461472159972419, "grad_norm": 1.1223799822191507, "learning_rate": 8.574201259121208e-06, "loss": 0.2989, "step": 3905 }, { "epoch": 1.3478710567143595, "grad_norm": 1.2260172584010405, "learning_rate": 8.570295276808319e-06, "loss": 0.313, "step": 3910 }, { "epoch": 1.3495948974314773, "grad_norm": 1.1391685796338593, "learning_rate": 8.566384843907505e-06, "loss": 0.2791, "step": 3915 }, { "epoch": 1.3513187381485952, "grad_norm": 1.137208901102679, "learning_rate": 8.562469965293361e-06, "loss": 0.2956, "step": 3920 }, { "epoch": 1.3530425788657128, "grad_norm": 1.0734438310141086, "learning_rate": 8.558550645846015e-06, "loss": 0.2922, "step": 3925 }, { "epoch": 1.3547664195828306, "grad_norm": 1.5072972449316966, "learning_rate": 8.554626890451137e-06, "loss": 0.2889, "step": 3930 }, { "epoch": 1.3564902602999482, "grad_norm": 1.111309813125269, "learning_rate": 8.550698703999922e-06, "loss": 0.2855, "step": 3935 }, { "epoch": 1.358214101017066, "grad_norm": 1.3056420462285931, "learning_rate": 8.546766091389091e-06, "loss": 0.3283, "step": 3940 }, { "epoch": 1.3599379417341837, "grad_norm": 1.199834987424094, "learning_rate": 8.542829057520884e-06, "loss": 0.28, "step": 3945 }, { "epoch": 1.3616617824513015, "grad_norm": 1.3191225232525428, "learning_rate": 8.538887607303052e-06, "loss": 0.274, "step": 3950 }, { "epoch": 1.3633856231684192, "grad_norm": 1.1068722563144409, "learning_rate": 8.534941745648845e-06, "loss": 0.275, "step": 3955 }, { "epoch": 1.365109463885537, "grad_norm": 1.078920667366664, "learning_rate": 8.53099147747702e-06, "loss": 0.3191, "step": 3960 }, { "epoch": 1.3668333046026548, "grad_norm": 1.1455257685642934, "learning_rate": 8.527036807711825e-06, "loss": 0.295, "step": 3965 }, { "epoch": 1.3685571453197725, "grad_norm": 1.305899035365651, "learning_rate": 8.523077741282991e-06, "loss": 0.2896, "step": 3970 }, { "epoch": 1.37028098603689, "grad_norm": 1.1545883100319863, "learning_rate": 8.519114283125736e-06, "loss": 0.3105, "step": 3975 }, { "epoch": 1.372004826754008, "grad_norm": 1.2535401547804015, "learning_rate": 8.515146438180745e-06, "loss": 0.3319, "step": 3980 }, { "epoch": 1.3737286674711258, "grad_norm": 1.186121490544396, "learning_rate": 8.511174211394178e-06, "loss": 0.3168, "step": 3985 }, { "epoch": 1.3754525081882434, "grad_norm": 1.036125970260004, "learning_rate": 8.507197607717656e-06, "loss": 0.3019, "step": 3990 }, { "epoch": 1.3771763489053612, "grad_norm": 1.8719458982246997, "learning_rate": 8.503216632108253e-06, "loss": 0.3021, "step": 3995 }, { "epoch": 1.3789001896224788, "grad_norm": 1.1309458795923768, "learning_rate": 8.499231289528495e-06, "loss": 0.2791, "step": 4000 }, { "epoch": 1.3806240303395967, "grad_norm": 1.1247825532724618, "learning_rate": 8.49524158494635e-06, "loss": 0.3131, "step": 4005 }, { "epoch": 1.3823478710567143, "grad_norm": 1.1592309305579953, "learning_rate": 8.491247523335227e-06, "loss": 0.2321, "step": 4010 }, { "epoch": 1.3840717117738321, "grad_norm": 1.0672390832017877, "learning_rate": 8.487249109673963e-06, "loss": 0.308, "step": 4015 }, { "epoch": 1.3857955524909498, "grad_norm": 1.020286154089578, "learning_rate": 8.483246348946823e-06, "loss": 0.3069, "step": 4020 }, { "epoch": 1.3875193932080676, "grad_norm": 1.5815639886180672, "learning_rate": 8.479239246143487e-06, "loss": 0.3255, "step": 4025 }, { "epoch": 1.3892432339251855, "grad_norm": 1.224480348243422, "learning_rate": 8.47522780625905e-06, "loss": 0.279, "step": 4030 }, { "epoch": 1.390967074642303, "grad_norm": 1.0510017728348102, "learning_rate": 8.471212034294013e-06, "loss": 0.3068, "step": 4035 }, { "epoch": 1.3926909153594207, "grad_norm": 1.1979579443327188, "learning_rate": 8.46719193525428e-06, "loss": 0.295, "step": 4040 }, { "epoch": 1.3944147560765385, "grad_norm": 1.1436237072016693, "learning_rate": 8.463167514151142e-06, "loss": 0.2952, "step": 4045 }, { "epoch": 1.3961385967936564, "grad_norm": 2.372645789701561, "learning_rate": 8.459138776001287e-06, "loss": 0.2759, "step": 4050 }, { "epoch": 1.397862437510774, "grad_norm": 1.5358038033861106, "learning_rate": 8.455105725826776e-06, "loss": 0.3102, "step": 4055 }, { "epoch": 1.3995862782278916, "grad_norm": 1.0942800078744155, "learning_rate": 8.451068368655051e-06, "loss": 0.298, "step": 4060 }, { "epoch": 1.4013101189450095, "grad_norm": 1.2498191988881162, "learning_rate": 8.447026709518917e-06, "loss": 0.2881, "step": 4065 }, { "epoch": 1.4030339596621273, "grad_norm": 1.103501423611643, "learning_rate": 8.44298075345655e-06, "loss": 0.2708, "step": 4070 }, { "epoch": 1.404757800379245, "grad_norm": 1.140581080023868, "learning_rate": 8.438930505511476e-06, "loss": 0.3046, "step": 4075 }, { "epoch": 1.4064816410963628, "grad_norm": 1.112135439548782, "learning_rate": 8.434875970732573e-06, "loss": 0.2974, "step": 4080 }, { "epoch": 1.4082054818134804, "grad_norm": 1.2195968452445054, "learning_rate": 8.430817154174061e-06, "loss": 0.2945, "step": 4085 }, { "epoch": 1.4099293225305982, "grad_norm": 1.1857667376896035, "learning_rate": 8.426754060895499e-06, "loss": 0.3017, "step": 4090 }, { "epoch": 1.4116531632477158, "grad_norm": 1.224210577295516, "learning_rate": 8.42268669596178e-06, "loss": 0.2883, "step": 4095 }, { "epoch": 1.4133770039648337, "grad_norm": 1.3165556809012575, "learning_rate": 8.418615064443116e-06, "loss": 0.2662, "step": 4100 }, { "epoch": 1.4151008446819513, "grad_norm": 1.239487565939614, "learning_rate": 8.414539171415044e-06, "loss": 0.2883, "step": 4105 }, { "epoch": 1.4168246853990691, "grad_norm": 1.1353859309595264, "learning_rate": 8.410459021958407e-06, "loss": 0.2592, "step": 4110 }, { "epoch": 1.418548526116187, "grad_norm": 1.2239330978333636, "learning_rate": 8.40637462115936e-06, "loss": 0.2978, "step": 4115 }, { "epoch": 1.4202723668333046, "grad_norm": 1.1931010142867433, "learning_rate": 8.402285974109351e-06, "loss": 0.2961, "step": 4120 }, { "epoch": 1.4219962075504222, "grad_norm": 1.1543151925906574, "learning_rate": 8.398193085905129e-06, "loss": 0.2951, "step": 4125 }, { "epoch": 1.42372004826754, "grad_norm": 1.0186883594497838, "learning_rate": 8.394095961648719e-06, "loss": 0.2943, "step": 4130 }, { "epoch": 1.425443888984658, "grad_norm": 1.6396849506994637, "learning_rate": 8.389994606447438e-06, "loss": 0.2922, "step": 4135 }, { "epoch": 1.4271677297017755, "grad_norm": 1.055836727069913, "learning_rate": 8.38588902541387e-06, "loss": 0.2993, "step": 4140 }, { "epoch": 1.4288915704188934, "grad_norm": 5.517045291697795, "learning_rate": 8.381779223665871e-06, "loss": 0.3013, "step": 4145 }, { "epoch": 1.430615411136011, "grad_norm": 1.505632142957612, "learning_rate": 8.377665206326554e-06, "loss": 0.2799, "step": 4150 }, { "epoch": 1.4323392518531288, "grad_norm": 1.1203779436713657, "learning_rate": 8.373546978524288e-06, "loss": 0.3005, "step": 4155 }, { "epoch": 1.4340630925702464, "grad_norm": 1.2089066705710734, "learning_rate": 8.369424545392694e-06, "loss": 0.295, "step": 4160 }, { "epoch": 1.4357869332873643, "grad_norm": 1.0271157769609536, "learning_rate": 8.365297912070635e-06, "loss": 0.2915, "step": 4165 }, { "epoch": 1.437510774004482, "grad_norm": 1.9142291281162835, "learning_rate": 8.361167083702204e-06, "loss": 0.2905, "step": 4170 }, { "epoch": 1.4392346147215997, "grad_norm": 1.2842701676122075, "learning_rate": 8.357032065436728e-06, "loss": 0.3133, "step": 4175 }, { "epoch": 1.4409584554387176, "grad_norm": 1.168564641232761, "learning_rate": 8.35289286242876e-06, "loss": 0.2974, "step": 4180 }, { "epoch": 1.4426822961558352, "grad_norm": 1.2654640122040623, "learning_rate": 8.348749479838057e-06, "loss": 0.2644, "step": 4185 }, { "epoch": 1.4444061368729528, "grad_norm": 1.279875810813251, "learning_rate": 8.344601922829603e-06, "loss": 0.3063, "step": 4190 }, { "epoch": 1.4461299775900707, "grad_norm": 1.1989896556917319, "learning_rate": 8.340450196573574e-06, "loss": 0.2793, "step": 4195 }, { "epoch": 1.4478538183071885, "grad_norm": 1.1684699927310496, "learning_rate": 8.336294306245347e-06, "loss": 0.2764, "step": 4200 }, { "epoch": 1.4495776590243061, "grad_norm": 1.671482671844399, "learning_rate": 8.332134257025491e-06, "loss": 0.2918, "step": 4205 }, { "epoch": 1.451301499741424, "grad_norm": 0.9660048161905248, "learning_rate": 8.327970054099754e-06, "loss": 0.2989, "step": 4210 }, { "epoch": 1.4530253404585416, "grad_norm": 1.220434293069677, "learning_rate": 8.323801702659069e-06, "loss": 0.2944, "step": 4215 }, { "epoch": 1.4547491811756594, "grad_norm": 1.5616478369713984, "learning_rate": 8.319629207899536e-06, "loss": 0.2762, "step": 4220 }, { "epoch": 1.456473021892777, "grad_norm": 1.0774039544402243, "learning_rate": 8.315452575022418e-06, "loss": 0.2769, "step": 4225 }, { "epoch": 1.458196862609895, "grad_norm": 1.1145790915355014, "learning_rate": 8.311271809234145e-06, "loss": 0.3287, "step": 4230 }, { "epoch": 1.4599207033270125, "grad_norm": 1.1681505383079978, "learning_rate": 8.307086915746288e-06, "loss": 0.2917, "step": 4235 }, { "epoch": 1.4616445440441304, "grad_norm": 1.1253000125643369, "learning_rate": 8.302897899775571e-06, "loss": 0.2845, "step": 4240 }, { "epoch": 1.4633683847612482, "grad_norm": 2.6969917963864947, "learning_rate": 8.298704766543853e-06, "loss": 0.3195, "step": 4245 }, { "epoch": 1.4650922254783658, "grad_norm": 2.071204538351495, "learning_rate": 8.294507521278127e-06, "loss": 0.2963, "step": 4250 }, { "epoch": 1.4668160661954834, "grad_norm": 1.1722313659463146, "learning_rate": 8.290306169210516e-06, "loss": 0.2927, "step": 4255 }, { "epoch": 1.4685399069126013, "grad_norm": 1.1691652127028054, "learning_rate": 8.286100715578254e-06, "loss": 0.2743, "step": 4260 }, { "epoch": 1.4702637476297191, "grad_norm": 1.1798942644502195, "learning_rate": 8.281891165623693e-06, "loss": 0.2761, "step": 4265 }, { "epoch": 1.4719875883468367, "grad_norm": 1.1121187262116956, "learning_rate": 8.277677524594288e-06, "loss": 0.3109, "step": 4270 }, { "epoch": 1.4737114290639544, "grad_norm": 1.4202418924078146, "learning_rate": 8.2734597977426e-06, "loss": 0.2891, "step": 4275 }, { "epoch": 1.4754352697810722, "grad_norm": 1.112394836998521, "learning_rate": 8.269237990326278e-06, "loss": 0.3077, "step": 4280 }, { "epoch": 1.47715911049819, "grad_norm": 1.0986260704553183, "learning_rate": 8.265012107608057e-06, "loss": 0.2732, "step": 4285 }, { "epoch": 1.4788829512153077, "grad_norm": 1.1397736528472096, "learning_rate": 8.260782154855757e-06, "loss": 0.2938, "step": 4290 }, { "epoch": 1.4806067919324255, "grad_norm": 3.0310989331718274, "learning_rate": 8.256548137342268e-06, "loss": 0.2997, "step": 4295 }, { "epoch": 1.4823306326495431, "grad_norm": 1.414759567180519, "learning_rate": 8.252310060345546e-06, "loss": 0.302, "step": 4300 }, { "epoch": 1.484054473366661, "grad_norm": 1.1273109488248374, "learning_rate": 8.248067929148612e-06, "loss": 0.2747, "step": 4305 }, { "epoch": 1.4857783140837786, "grad_norm": 1.0138568921712012, "learning_rate": 8.243821749039534e-06, "loss": 0.2731, "step": 4310 }, { "epoch": 1.4875021548008964, "grad_norm": 1.0829130740936297, "learning_rate": 8.239571525311433e-06, "loss": 0.2902, "step": 4315 }, { "epoch": 1.489225995518014, "grad_norm": 1.1966939995575037, "learning_rate": 8.23531726326247e-06, "loss": 0.3393, "step": 4320 }, { "epoch": 1.4909498362351319, "grad_norm": 0.9337724122434297, "learning_rate": 8.231058968195838e-06, "loss": 0.2982, "step": 4325 }, { "epoch": 1.4926736769522497, "grad_norm": 1.0707804685307505, "learning_rate": 8.226796645419758e-06, "loss": 0.3136, "step": 4330 }, { "epoch": 1.4943975176693673, "grad_norm": 1.141925415485157, "learning_rate": 8.222530300247467e-06, "loss": 0.3067, "step": 4335 }, { "epoch": 1.496121358386485, "grad_norm": 1.2168700457279291, "learning_rate": 8.218259937997228e-06, "loss": 0.2901, "step": 4340 }, { "epoch": 1.4978451991036028, "grad_norm": 1.0017932593769117, "learning_rate": 8.213985563992302e-06, "loss": 0.2752, "step": 4345 }, { "epoch": 1.4995690398207207, "grad_norm": 1.1784743251912764, "learning_rate": 8.209707183560953e-06, "loss": 0.3186, "step": 4350 }, { "epoch": 1.5012928805378383, "grad_norm": 1.072869339665256, "learning_rate": 8.20542480203644e-06, "loss": 0.2826, "step": 4355 }, { "epoch": 1.503016721254956, "grad_norm": 1.1017133539730921, "learning_rate": 8.201138424757008e-06, "loss": 0.2905, "step": 4360 }, { "epoch": 1.5047405619720737, "grad_norm": 1.0922456832653578, "learning_rate": 8.196848057065887e-06, "loss": 0.2839, "step": 4365 }, { "epoch": 1.5064644026891916, "grad_norm": 1.1740104467786454, "learning_rate": 8.192553704311277e-06, "loss": 0.3002, "step": 4370 }, { "epoch": 1.5081882434063094, "grad_norm": 4.204163727357052, "learning_rate": 8.188255371846347e-06, "loss": 0.3062, "step": 4375 }, { "epoch": 1.509912084123427, "grad_norm": 1.2772967129642088, "learning_rate": 8.183953065029226e-06, "loss": 0.2975, "step": 4380 }, { "epoch": 1.5116359248405447, "grad_norm": 1.5168363584089413, "learning_rate": 8.179646789223e-06, "loss": 0.3037, "step": 4385 }, { "epoch": 1.5133597655576625, "grad_norm": 1.4788250289482368, "learning_rate": 8.175336549795701e-06, "loss": 0.3435, "step": 4390 }, { "epoch": 1.5150836062747803, "grad_norm": 1.1003016752913906, "learning_rate": 8.1710223521203e-06, "loss": 0.281, "step": 4395 }, { "epoch": 1.516807446991898, "grad_norm": 1.0711099126215438, "learning_rate": 8.166704201574707e-06, "loss": 0.2731, "step": 4400 }, { "epoch": 1.5185312877090156, "grad_norm": 1.249983606211235, "learning_rate": 8.162382103541755e-06, "loss": 0.289, "step": 4405 }, { "epoch": 1.5202551284261334, "grad_norm": 1.107332976953771, "learning_rate": 8.158056063409198e-06, "loss": 0.2864, "step": 4410 }, { "epoch": 1.5219789691432513, "grad_norm": 1.2623515697126584, "learning_rate": 8.153726086569707e-06, "loss": 0.3027, "step": 4415 }, { "epoch": 1.5237028098603689, "grad_norm": 1.1052823756210386, "learning_rate": 8.149392178420858e-06, "loss": 0.2944, "step": 4420 }, { "epoch": 1.5254266505774865, "grad_norm": 1.0476709242070792, "learning_rate": 8.14505434436513e-06, "loss": 0.2987, "step": 4425 }, { "epoch": 1.5271504912946043, "grad_norm": 1.1317688824371155, "learning_rate": 8.140712589809891e-06, "loss": 0.2663, "step": 4430 }, { "epoch": 1.5288743320117222, "grad_norm": 1.1905840008315134, "learning_rate": 8.136366920167403e-06, "loss": 0.2643, "step": 4435 }, { "epoch": 1.5305981727288398, "grad_norm": 1.137180132429372, "learning_rate": 8.1320173408548e-06, "loss": 0.2906, "step": 4440 }, { "epoch": 1.5323220134459576, "grad_norm": 1.0753074044669657, "learning_rate": 8.1276638572941e-06, "loss": 0.3385, "step": 4445 }, { "epoch": 1.5340458541630753, "grad_norm": 1.167513808635872, "learning_rate": 8.123306474912178e-06, "loss": 0.2944, "step": 4450 }, { "epoch": 1.535769694880193, "grad_norm": 1.3897430885477142, "learning_rate": 8.118945199140774e-06, "loss": 0.2965, "step": 4455 }, { "epoch": 1.537493535597311, "grad_norm": 1.0462935277782224, "learning_rate": 8.114580035416484e-06, "loss": 0.2813, "step": 4460 }, { "epoch": 1.5392173763144286, "grad_norm": 0.9442234706910525, "learning_rate": 8.110210989180742e-06, "loss": 0.2856, "step": 4465 }, { "epoch": 1.5409412170315462, "grad_norm": 1.2880096731566573, "learning_rate": 8.105838065879832e-06, "loss": 0.2972, "step": 4470 }, { "epoch": 1.542665057748664, "grad_norm": 1.1817976972804822, "learning_rate": 8.101461270964863e-06, "loss": 0.3071, "step": 4475 }, { "epoch": 1.5443888984657819, "grad_norm": 1.19871914468833, "learning_rate": 8.097080609891775e-06, "loss": 0.2986, "step": 4480 }, { "epoch": 1.5461127391828995, "grad_norm": 1.1368965902954358, "learning_rate": 8.092696088121324e-06, "loss": 0.2795, "step": 4485 }, { "epoch": 1.547836579900017, "grad_norm": 1.1499305851560142, "learning_rate": 8.088307711119082e-06, "loss": 0.2588, "step": 4490 }, { "epoch": 1.549560420617135, "grad_norm": 1.3872810664498054, "learning_rate": 8.083915484355423e-06, "loss": 0.284, "step": 4495 }, { "epoch": 1.5512842613342528, "grad_norm": 1.1268491020621787, "learning_rate": 8.079519413305523e-06, "loss": 0.2628, "step": 4500 }, { "epoch": 1.5530081020513704, "grad_norm": 1.203114196866655, "learning_rate": 8.075119503449352e-06, "loss": 0.3032, "step": 4505 }, { "epoch": 1.554731942768488, "grad_norm": 1.3394123876449031, "learning_rate": 8.070715760271657e-06, "loss": 0.2967, "step": 4510 }, { "epoch": 1.5564557834856059, "grad_norm": 1.7377691152609942, "learning_rate": 8.066308189261971e-06, "loss": 0.264, "step": 4515 }, { "epoch": 1.5581796242027237, "grad_norm": 0.9846889598064859, "learning_rate": 8.0618967959146e-06, "loss": 0.2927, "step": 4520 }, { "epoch": 1.5599034649198416, "grad_norm": 1.216221746125267, "learning_rate": 8.057481585728604e-06, "loss": 0.2745, "step": 4525 }, { "epoch": 1.5616273056369592, "grad_norm": 1.0672477634076372, "learning_rate": 8.053062564207816e-06, "loss": 0.273, "step": 4530 }, { "epoch": 1.5633511463540768, "grad_norm": 1.0776198555958603, "learning_rate": 8.048639736860808e-06, "loss": 0.2815, "step": 4535 }, { "epoch": 1.5650749870711946, "grad_norm": 1.5085017509837733, "learning_rate": 8.044213109200901e-06, "loss": 0.2756, "step": 4540 }, { "epoch": 1.5667988277883125, "grad_norm": 1.3085717154879373, "learning_rate": 8.039782686746153e-06, "loss": 0.2987, "step": 4545 }, { "epoch": 1.56852266850543, "grad_norm": 1.0162772612935684, "learning_rate": 8.035348475019352e-06, "loss": 0.2946, "step": 4550 }, { "epoch": 1.5702465092225477, "grad_norm": 1.3775938951629474, "learning_rate": 8.03091047954801e-06, "loss": 0.2859, "step": 4555 }, { "epoch": 1.5719703499396656, "grad_norm": 1.1994169217393993, "learning_rate": 8.026468705864357e-06, "loss": 0.2855, "step": 4560 }, { "epoch": 1.5736941906567834, "grad_norm": 1.0925332218455293, "learning_rate": 8.022023159505328e-06, "loss": 0.2778, "step": 4565 }, { "epoch": 1.575418031373901, "grad_norm": 2.577322170757021, "learning_rate": 8.017573846012564e-06, "loss": 0.2862, "step": 4570 }, { "epoch": 1.5771418720910186, "grad_norm": 3.505143617383019, "learning_rate": 8.013120770932406e-06, "loss": 0.3156, "step": 4575 }, { "epoch": 1.5788657128081365, "grad_norm": 2.818090342674019, "learning_rate": 8.008663939815878e-06, "loss": 0.3043, "step": 4580 }, { "epoch": 1.5805895535252543, "grad_norm": 1.0380932701893224, "learning_rate": 8.004203358218687e-06, "loss": 0.301, "step": 4585 }, { "epoch": 1.5823133942423722, "grad_norm": 1.343618018699047, "learning_rate": 7.999739031701218e-06, "loss": 0.3, "step": 4590 }, { "epoch": 1.5840372349594898, "grad_norm": 1.3860425519912696, "learning_rate": 7.995270965828523e-06, "loss": 0.3106, "step": 4595 }, { "epoch": 1.5857610756766074, "grad_norm": 1.2227838784955878, "learning_rate": 7.990799166170312e-06, "loss": 0.2824, "step": 4600 }, { "epoch": 1.5874849163937252, "grad_norm": 1.2291192243757418, "learning_rate": 7.986323638300957e-06, "loss": 0.2647, "step": 4605 }, { "epoch": 1.589208757110843, "grad_norm": 1.1924245691491968, "learning_rate": 7.981844387799468e-06, "loss": 0.2886, "step": 4610 }, { "epoch": 1.5909325978279607, "grad_norm": 1.027590649560104, "learning_rate": 7.977361420249504e-06, "loss": 0.2658, "step": 4615 }, { "epoch": 1.5926564385450783, "grad_norm": 1.1668511583869174, "learning_rate": 7.972874741239352e-06, "loss": 0.2715, "step": 4620 }, { "epoch": 1.5943802792621962, "grad_norm": 1.1530835346302435, "learning_rate": 7.968384356361927e-06, "loss": 0.3023, "step": 4625 }, { "epoch": 1.596104119979314, "grad_norm": 1.1219218944959122, "learning_rate": 7.963890271214765e-06, "loss": 0.2519, "step": 4630 }, { "epoch": 1.5978279606964316, "grad_norm": 1.0597119841876788, "learning_rate": 7.959392491400015e-06, "loss": 0.2758, "step": 4635 }, { "epoch": 1.5995518014135492, "grad_norm": 1.0706986098744253, "learning_rate": 7.954891022524427e-06, "loss": 0.2616, "step": 4640 }, { "epoch": 1.601275642130667, "grad_norm": 1.0788317237951472, "learning_rate": 7.950385870199356e-06, "loss": 0.2695, "step": 4645 }, { "epoch": 1.602999482847785, "grad_norm": 1.1209830452940586, "learning_rate": 7.945877040040742e-06, "loss": 0.2619, "step": 4650 }, { "epoch": 1.6047233235649025, "grad_norm": 1.040694549076333, "learning_rate": 7.941364537669117e-06, "loss": 0.2951, "step": 4655 }, { "epoch": 1.6064471642820204, "grad_norm": 1.11191039361645, "learning_rate": 7.936848368709582e-06, "loss": 0.3003, "step": 4660 }, { "epoch": 1.608171004999138, "grad_norm": 1.2050444474037043, "learning_rate": 7.932328538791818e-06, "loss": 0.2871, "step": 4665 }, { "epoch": 1.6098948457162559, "grad_norm": 0.9903198963124957, "learning_rate": 7.927805053550064e-06, "loss": 0.255, "step": 4670 }, { "epoch": 1.6116186864333737, "grad_norm": 1.090871684646597, "learning_rate": 7.923277918623116e-06, "loss": 0.264, "step": 4675 }, { "epoch": 1.6133425271504913, "grad_norm": 1.080006435375285, "learning_rate": 7.918747139654318e-06, "loss": 0.2793, "step": 4680 }, { "epoch": 1.615066367867609, "grad_norm": 1.2223590352235285, "learning_rate": 7.914212722291561e-06, "loss": 0.3058, "step": 4685 }, { "epoch": 1.6167902085847268, "grad_norm": 1.290101087149561, "learning_rate": 7.909674672187268e-06, "loss": 0.321, "step": 4690 }, { "epoch": 1.6185140493018446, "grad_norm": 1.2857378909440365, "learning_rate": 7.905132994998394e-06, "loss": 0.3114, "step": 4695 }, { "epoch": 1.6202378900189622, "grad_norm": 1.104863427511264, "learning_rate": 7.900587696386413e-06, "loss": 0.2589, "step": 4700 }, { "epoch": 1.6219617307360799, "grad_norm": 1.0742409705734057, "learning_rate": 7.896038782017308e-06, "loss": 0.3179, "step": 4705 }, { "epoch": 1.6236855714531977, "grad_norm": 1.2678367311665033, "learning_rate": 7.89148625756158e-06, "loss": 0.284, "step": 4710 }, { "epoch": 1.6254094121703155, "grad_norm": 1.1759575583917148, "learning_rate": 7.886930128694221e-06, "loss": 0.2962, "step": 4715 }, { "epoch": 1.6271332528874332, "grad_norm": 1.1014959512898448, "learning_rate": 7.882370401094723e-06, "loss": 0.2874, "step": 4720 }, { "epoch": 1.6288570936045508, "grad_norm": 2.50511505608037, "learning_rate": 7.877807080447058e-06, "loss": 0.3029, "step": 4725 }, { "epoch": 1.6305809343216686, "grad_norm": 1.185916183994825, "learning_rate": 7.873240172439683e-06, "loss": 0.2845, "step": 4730 }, { "epoch": 1.6323047750387865, "grad_norm": 1.6176622608656361, "learning_rate": 7.86866968276552e-06, "loss": 0.2795, "step": 4735 }, { "epoch": 1.6340286157559043, "grad_norm": 1.250930863184501, "learning_rate": 7.86409561712196e-06, "loss": 0.2915, "step": 4740 }, { "epoch": 1.635752456473022, "grad_norm": 1.9008168568839663, "learning_rate": 7.859517981210855e-06, "loss": 0.2922, "step": 4745 }, { "epoch": 1.6374762971901395, "grad_norm": 2.3733977130541852, "learning_rate": 7.854936780738501e-06, "loss": 0.2875, "step": 4750 }, { "epoch": 1.6392001379072574, "grad_norm": 1.0197756084994443, "learning_rate": 7.85035202141564e-06, "loss": 0.2663, "step": 4755 }, { "epoch": 1.6409239786243752, "grad_norm": 1.1569440849863617, "learning_rate": 7.845763708957448e-06, "loss": 0.3178, "step": 4760 }, { "epoch": 1.6426478193414928, "grad_norm": 1.1135247757913942, "learning_rate": 7.841171849083537e-06, "loss": 0.2965, "step": 4765 }, { "epoch": 1.6443716600586105, "grad_norm": 1.203979917867108, "learning_rate": 7.836576447517935e-06, "loss": 0.277, "step": 4770 }, { "epoch": 1.6460955007757283, "grad_norm": 1.0915322798339258, "learning_rate": 7.831977509989086e-06, "loss": 0.3196, "step": 4775 }, { "epoch": 1.6478193414928461, "grad_norm": 1.9043654726869557, "learning_rate": 7.827375042229843e-06, "loss": 0.2809, "step": 4780 }, { "epoch": 1.6495431822099638, "grad_norm": 1.231698886575644, "learning_rate": 7.822769049977459e-06, "loss": 0.2559, "step": 4785 }, { "epoch": 1.6512670229270814, "grad_norm": 0.9756875575310217, "learning_rate": 7.81815953897358e-06, "loss": 0.2604, "step": 4790 }, { "epoch": 1.6529908636441992, "grad_norm": 1.2033951497698867, "learning_rate": 7.81354651496424e-06, "loss": 0.2562, "step": 4795 }, { "epoch": 1.654714704361317, "grad_norm": 1.2585683573733146, "learning_rate": 7.808929983699848e-06, "loss": 0.2604, "step": 4800 }, { "epoch": 1.656438545078435, "grad_norm": 1.0835966787970026, "learning_rate": 7.804309950935191e-06, "loss": 0.2807, "step": 4805 }, { "epoch": 1.6581623857955525, "grad_norm": 1.0539996832826386, "learning_rate": 7.799686422429418e-06, "loss": 0.2641, "step": 4810 }, { "epoch": 1.6598862265126701, "grad_norm": 1.1510984990439581, "learning_rate": 7.795059403946034e-06, "loss": 0.2862, "step": 4815 }, { "epoch": 1.661610067229788, "grad_norm": 1.099104953394071, "learning_rate": 7.790428901252897e-06, "loss": 0.2852, "step": 4820 }, { "epoch": 1.6633339079469058, "grad_norm": 1.1682885829402445, "learning_rate": 7.785794920122207e-06, "loss": 0.2859, "step": 4825 }, { "epoch": 1.6650577486640235, "grad_norm": 1.1721610560414317, "learning_rate": 7.7811574663305e-06, "loss": 0.2921, "step": 4830 }, { "epoch": 1.666781589381141, "grad_norm": 1.168726191344149, "learning_rate": 7.776516545658641e-06, "loss": 0.3008, "step": 4835 }, { "epoch": 1.668505430098259, "grad_norm": 1.3478403912271937, "learning_rate": 7.771872163891818e-06, "loss": 0.2945, "step": 4840 }, { "epoch": 1.6702292708153768, "grad_norm": 0.999876928881415, "learning_rate": 7.767224326819533e-06, "loss": 0.2512, "step": 4845 }, { "epoch": 1.6719531115324944, "grad_norm": 1.0653778046015696, "learning_rate": 7.762573040235592e-06, "loss": 0.2673, "step": 4850 }, { "epoch": 1.673676952249612, "grad_norm": 1.2411457041849874, "learning_rate": 7.757918309938107e-06, "loss": 0.2791, "step": 4855 }, { "epoch": 1.6754007929667298, "grad_norm": 1.130982998169481, "learning_rate": 7.753260141729474e-06, "loss": 0.2985, "step": 4860 }, { "epoch": 1.6771246336838477, "grad_norm": 1.172612356497052, "learning_rate": 7.748598541416386e-06, "loss": 0.2845, "step": 4865 }, { "epoch": 1.6788484744009653, "grad_norm": 1.4309093735314082, "learning_rate": 7.743933514809806e-06, "loss": 0.2534, "step": 4870 }, { "epoch": 1.6805723151180831, "grad_norm": 1.0583343878157196, "learning_rate": 7.739265067724966e-06, "loss": 0.268, "step": 4875 }, { "epoch": 1.6822961558352008, "grad_norm": 1.064639332799556, "learning_rate": 7.734593205981375e-06, "loss": 0.2915, "step": 4880 }, { "epoch": 1.6840199965523186, "grad_norm": 1.2481077971908499, "learning_rate": 7.729917935402783e-06, "loss": 0.3059, "step": 4885 }, { "epoch": 1.6857438372694364, "grad_norm": 1.0723438589361638, "learning_rate": 7.725239261817201e-06, "loss": 0.292, "step": 4890 }, { "epoch": 1.687467677986554, "grad_norm": 1.2069060312922704, "learning_rate": 7.720557191056873e-06, "loss": 0.245, "step": 4895 }, { "epoch": 1.6891915187036717, "grad_norm": 1.1900799630831256, "learning_rate": 7.715871728958285e-06, "loss": 0.284, "step": 4900 }, { "epoch": 1.6909153594207895, "grad_norm": 1.0808543445599839, "learning_rate": 7.711182881362143e-06, "loss": 0.2794, "step": 4905 }, { "epoch": 1.6926392001379074, "grad_norm": 1.1770038792609376, "learning_rate": 7.706490654113383e-06, "loss": 0.2829, "step": 4910 }, { "epoch": 1.694363040855025, "grad_norm": 1.3712361776514597, "learning_rate": 7.701795053061145e-06, "loss": 0.2916, "step": 4915 }, { "epoch": 1.6960868815721426, "grad_norm": 1.1007601160863054, "learning_rate": 7.697096084058781e-06, "loss": 0.2601, "step": 4920 }, { "epoch": 1.6978107222892604, "grad_norm": 1.2528337178686804, "learning_rate": 7.692393752963837e-06, "loss": 0.2879, "step": 4925 }, { "epoch": 1.6995345630063783, "grad_norm": 1.2087277967671552, "learning_rate": 7.687688065638052e-06, "loss": 0.2805, "step": 4930 }, { "epoch": 1.701258403723496, "grad_norm": 1.3431894201965882, "learning_rate": 7.682979027947349e-06, "loss": 0.3023, "step": 4935 }, { "epoch": 1.7029822444406135, "grad_norm": 1.1536128646000634, "learning_rate": 7.678266645761823e-06, "loss": 0.2813, "step": 4940 }, { "epoch": 1.7047060851577314, "grad_norm": 1.0822875992827867, "learning_rate": 7.673550924955749e-06, "loss": 0.2621, "step": 4945 }, { "epoch": 1.7064299258748492, "grad_norm": 0.9758588784653546, "learning_rate": 7.668831871407552e-06, "loss": 0.2896, "step": 4950 }, { "epoch": 1.708153766591967, "grad_norm": 5.060149915372297, "learning_rate": 7.664109490999819e-06, "loss": 0.2672, "step": 4955 }, { "epoch": 1.7098776073090847, "grad_norm": 0.9785982692848444, "learning_rate": 7.659383789619277e-06, "loss": 0.2677, "step": 4960 }, { "epoch": 1.7116014480262023, "grad_norm": 1.1456111445632373, "learning_rate": 7.6546547731568e-06, "loss": 0.296, "step": 4965 }, { "epoch": 1.7133252887433201, "grad_norm": 1.2787557535098648, "learning_rate": 7.649922447507392e-06, "loss": 0.2738, "step": 4970 }, { "epoch": 1.715049129460438, "grad_norm": 1.164003982929867, "learning_rate": 7.645186818570183e-06, "loss": 0.2813, "step": 4975 }, { "epoch": 1.7167729701775556, "grad_norm": 1.2434044369597195, "learning_rate": 7.640447892248416e-06, "loss": 0.2503, "step": 4980 }, { "epoch": 1.7184968108946732, "grad_norm": 1.0898227360866577, "learning_rate": 7.635705674449448e-06, "loss": 0.2924, "step": 4985 }, { "epoch": 1.720220651611791, "grad_norm": 1.1109599536674057, "learning_rate": 7.630960171084742e-06, "loss": 0.2814, "step": 4990 }, { "epoch": 1.721944492328909, "grad_norm": 0.9936293276920007, "learning_rate": 7.626211388069853e-06, "loss": 0.271, "step": 4995 }, { "epoch": 1.7236683330460265, "grad_norm": 0.9991754561676714, "learning_rate": 7.621459331324421e-06, "loss": 0.2692, "step": 5000 }, { "epoch": 1.7253921737631441, "grad_norm": 1.3655918924152755, "learning_rate": 7.616704006772175e-06, "loss": 0.2649, "step": 5005 }, { "epoch": 1.727116014480262, "grad_norm": 1.5048623255301028, "learning_rate": 7.611945420340913e-06, "loss": 0.2336, "step": 5010 }, { "epoch": 1.7288398551973798, "grad_norm": 1.1367584713498837, "learning_rate": 7.607183577962496e-06, "loss": 0.2633, "step": 5015 }, { "epoch": 1.7305636959144977, "grad_norm": 1.4103184520900731, "learning_rate": 7.602418485572849e-06, "loss": 0.2665, "step": 5020 }, { "epoch": 1.7322875366316153, "grad_norm": 1.458893334785239, "learning_rate": 7.597650149111948e-06, "loss": 0.2781, "step": 5025 }, { "epoch": 1.734011377348733, "grad_norm": 1.0043127824753695, "learning_rate": 7.592878574523809e-06, "loss": 0.3137, "step": 5030 }, { "epoch": 1.7357352180658507, "grad_norm": 1.0507992776405541, "learning_rate": 7.5881037677564886e-06, "loss": 0.2497, "step": 5035 }, { "epoch": 1.7374590587829686, "grad_norm": 2.8059775995775946, "learning_rate": 7.583325734762068e-06, "loss": 0.2982, "step": 5040 }, { "epoch": 1.7391828995000862, "grad_norm": 1.1436290478445026, "learning_rate": 7.578544481496657e-06, "loss": 0.2895, "step": 5045 }, { "epoch": 1.7409067402172038, "grad_norm": 1.3224859651054273, "learning_rate": 7.5737600139203715e-06, "loss": 0.2535, "step": 5050 }, { "epoch": 1.7426305809343217, "grad_norm": 1.1752876663370677, "learning_rate": 7.5689723379973404e-06, "loss": 0.2779, "step": 5055 }, { "epoch": 1.7443544216514395, "grad_norm": 1.1909209268360834, "learning_rate": 7.564181459695692e-06, "loss": 0.2952, "step": 5060 }, { "epoch": 1.7460782623685571, "grad_norm": 1.1057420527030353, "learning_rate": 7.559387384987538e-06, "loss": 0.2913, "step": 5065 }, { "epoch": 1.7478021030856747, "grad_norm": 1.0788985188134605, "learning_rate": 7.554590119848988e-06, "loss": 0.2732, "step": 5070 }, { "epoch": 1.7495259438027926, "grad_norm": 1.2506844184121586, "learning_rate": 7.549789670260117e-06, "loss": 0.2852, "step": 5075 }, { "epoch": 1.7512497845199104, "grad_norm": 1.0751159126128347, "learning_rate": 7.544986042204977e-06, "loss": 0.2869, "step": 5080 }, { "epoch": 1.752973625237028, "grad_norm": 1.0913872117333538, "learning_rate": 7.540179241671578e-06, "loss": 0.2878, "step": 5085 }, { "epoch": 1.7546974659541459, "grad_norm": 1.1394484243614582, "learning_rate": 7.535369274651887e-06, "loss": 0.2816, "step": 5090 }, { "epoch": 1.7564213066712635, "grad_norm": 1.0311513805472574, "learning_rate": 7.530556147141817e-06, "loss": 0.2654, "step": 5095 }, { "epoch": 1.7581451473883813, "grad_norm": 0.9650362254355321, "learning_rate": 7.525739865141221e-06, "loss": 0.2816, "step": 5100 }, { "epoch": 1.7598689881054992, "grad_norm": 1.146197497667083, "learning_rate": 7.5209204346538845e-06, "loss": 0.2876, "step": 5105 }, { "epoch": 1.7615928288226168, "grad_norm": 1.2823570613469755, "learning_rate": 7.516097861687517e-06, "loss": 0.2549, "step": 5110 }, { "epoch": 1.7633166695397344, "grad_norm": 1.0979690128514943, "learning_rate": 7.511272152253746e-06, "loss": 0.2931, "step": 5115 }, { "epoch": 1.7650405102568523, "grad_norm": 1.0190614992710245, "learning_rate": 7.506443312368111e-06, "loss": 0.2693, "step": 5120 }, { "epoch": 1.76676435097397, "grad_norm": 1.053828883724785, "learning_rate": 7.5016113480500465e-06, "loss": 0.2744, "step": 5125 }, { "epoch": 1.7684881916910877, "grad_norm": 1.210477140134907, "learning_rate": 7.496776265322893e-06, "loss": 0.2855, "step": 5130 }, { "epoch": 1.7702120324082053, "grad_norm": 1.6091285360924075, "learning_rate": 7.491938070213868e-06, "loss": 0.2852, "step": 5135 }, { "epoch": 1.7719358731253232, "grad_norm": 1.1356765125738666, "learning_rate": 7.4870967687540745e-06, "loss": 0.2623, "step": 5140 }, { "epoch": 1.773659713842441, "grad_norm": 1.3691503007676011, "learning_rate": 7.482252366978484e-06, "loss": 0.2929, "step": 5145 }, { "epoch": 1.7753835545595587, "grad_norm": 1.2769107336632355, "learning_rate": 7.477404870925937e-06, "loss": 0.2703, "step": 5150 }, { "epoch": 1.7771073952766763, "grad_norm": 1.185701202768985, "learning_rate": 7.47255428663913e-06, "loss": 0.2893, "step": 5155 }, { "epoch": 1.7788312359937941, "grad_norm": 1.0168763853501037, "learning_rate": 7.467700620164606e-06, "loss": 0.2836, "step": 5160 }, { "epoch": 1.780555076710912, "grad_norm": 1.056250760663425, "learning_rate": 7.462843877552752e-06, "loss": 0.2822, "step": 5165 }, { "epoch": 1.7822789174280298, "grad_norm": 1.2340473863412658, "learning_rate": 7.457984064857791e-06, "loss": 0.2798, "step": 5170 }, { "epoch": 1.7840027581451474, "grad_norm": 0.9952934398082832, "learning_rate": 7.453121188137773e-06, "loss": 0.2863, "step": 5175 }, { "epoch": 1.785726598862265, "grad_norm": 1.096546664691232, "learning_rate": 7.448255253454566e-06, "loss": 0.267, "step": 5180 }, { "epoch": 1.7874504395793829, "grad_norm": 1.1137598402828075, "learning_rate": 7.443386266873849e-06, "loss": 0.2696, "step": 5185 }, { "epoch": 1.7891742802965007, "grad_norm": 1.265124395170217, "learning_rate": 7.438514234465108e-06, "loss": 0.2743, "step": 5190 }, { "epoch": 1.7908981210136183, "grad_norm": 1.0639163200289241, "learning_rate": 7.433639162301623e-06, "loss": 0.2547, "step": 5195 }, { "epoch": 1.792621961730736, "grad_norm": 1.1278138783766827, "learning_rate": 7.4287610564604675e-06, "loss": 0.266, "step": 5200 }, { "epoch": 1.7943458024478538, "grad_norm": 3.6967756772489144, "learning_rate": 7.4238799230224924e-06, "loss": 0.2958, "step": 5205 }, { "epoch": 1.7960696431649716, "grad_norm": 1.0996972379476142, "learning_rate": 7.418995768072323e-06, "loss": 0.2867, "step": 5210 }, { "epoch": 1.7977934838820893, "grad_norm": 1.0706722278463459, "learning_rate": 7.414108597698357e-06, "loss": 0.2651, "step": 5215 }, { "epoch": 1.7995173245992069, "grad_norm": 1.0382780131139366, "learning_rate": 7.409218417992741e-06, "loss": 0.2519, "step": 5220 }, { "epoch": 1.8012411653163247, "grad_norm": 1.0992035780495972, "learning_rate": 7.404325235051381e-06, "loss": 0.252, "step": 5225 }, { "epoch": 1.8029650060334426, "grad_norm": 1.126768049409712, "learning_rate": 7.399429054973923e-06, "loss": 0.278, "step": 5230 }, { "epoch": 1.8046888467505604, "grad_norm": 1.185737350601416, "learning_rate": 7.39452988386375e-06, "loss": 0.3009, "step": 5235 }, { "epoch": 1.806412687467678, "grad_norm": 1.0451308376522186, "learning_rate": 7.389627727827977e-06, "loss": 0.2598, "step": 5240 }, { "epoch": 1.8081365281847956, "grad_norm": 1.2583174124221492, "learning_rate": 7.3847225929774316e-06, "loss": 0.2732, "step": 5245 }, { "epoch": 1.8098603689019135, "grad_norm": 1.0331050969455584, "learning_rate": 7.3798144854266615e-06, "loss": 0.267, "step": 5250 }, { "epoch": 1.8115842096190313, "grad_norm": 1.1062804250909486, "learning_rate": 7.374903411293919e-06, "loss": 0.262, "step": 5255 }, { "epoch": 1.813308050336149, "grad_norm": 1.329738516827646, "learning_rate": 7.369989376701153e-06, "loss": 0.2819, "step": 5260 }, { "epoch": 1.8150318910532666, "grad_norm": 1.2271682423367378, "learning_rate": 7.365072387774004e-06, "loss": 0.255, "step": 5265 }, { "epoch": 1.8167557317703844, "grad_norm": 1.3129017608959763, "learning_rate": 7.360152450641792e-06, "loss": 0.2671, "step": 5270 }, { "epoch": 1.8184795724875022, "grad_norm": 1.2955446038826546, "learning_rate": 7.355229571437519e-06, "loss": 0.2462, "step": 5275 }, { "epoch": 1.8202034132046199, "grad_norm": 1.3071733693941237, "learning_rate": 7.350303756297845e-06, "loss": 0.2763, "step": 5280 }, { "epoch": 1.8219272539217375, "grad_norm": 1.1198453515777207, "learning_rate": 7.3453750113631e-06, "loss": 0.2772, "step": 5285 }, { "epoch": 1.8236510946388553, "grad_norm": 1.092114500883187, "learning_rate": 7.340443342777258e-06, "loss": 0.2451, "step": 5290 }, { "epoch": 1.8253749353559732, "grad_norm": 1.0470019744280976, "learning_rate": 7.335508756687941e-06, "loss": 0.2614, "step": 5295 }, { "epoch": 1.8270987760730908, "grad_norm": 1.0594618623401897, "learning_rate": 7.330571259246411e-06, "loss": 0.2746, "step": 5300 }, { "epoch": 1.8288226167902086, "grad_norm": 1.0982499449870642, "learning_rate": 7.32563085660755e-06, "loss": 0.2631, "step": 5305 }, { "epoch": 1.8305464575073263, "grad_norm": 0.9117916047220774, "learning_rate": 7.320687554929871e-06, "loss": 0.2713, "step": 5310 }, { "epoch": 1.832270298224444, "grad_norm": 1.2399561251897282, "learning_rate": 7.315741360375497e-06, "loss": 0.2674, "step": 5315 }, { "epoch": 1.833994138941562, "grad_norm": 1.08628856500942, "learning_rate": 7.310792279110155e-06, "loss": 0.2665, "step": 5320 }, { "epoch": 1.8357179796586796, "grad_norm": 1.112817692082712, "learning_rate": 7.305840317303174e-06, "loss": 0.264, "step": 5325 }, { "epoch": 1.8374418203757972, "grad_norm": 1.1554707394297785, "learning_rate": 7.300885481127472e-06, "loss": 0.2643, "step": 5330 }, { "epoch": 1.839165661092915, "grad_norm": 1.1065351039547966, "learning_rate": 7.295927776759551e-06, "loss": 0.2626, "step": 5335 }, { "epoch": 1.8408895018100329, "grad_norm": 1.1428672706404264, "learning_rate": 7.290967210379489e-06, "loss": 0.2947, "step": 5340 }, { "epoch": 1.8426133425271505, "grad_norm": 0.9919106094816719, "learning_rate": 7.286003788170928e-06, "loss": 0.2694, "step": 5345 }, { "epoch": 1.844337183244268, "grad_norm": 1.1143963164241666, "learning_rate": 7.281037516321073e-06, "loss": 0.2974, "step": 5350 }, { "epoch": 1.846061023961386, "grad_norm": 1.204079287254847, "learning_rate": 7.276068401020682e-06, "loss": 0.2675, "step": 5355 }, { "epoch": 1.8477848646785038, "grad_norm": 1.1051609652606675, "learning_rate": 7.271096448464057e-06, "loss": 0.2587, "step": 5360 }, { "epoch": 1.8495087053956214, "grad_norm": 1.231084386734048, "learning_rate": 7.266121664849033e-06, "loss": 0.2924, "step": 5365 }, { "epoch": 1.851232546112739, "grad_norm": 1.1424913542294668, "learning_rate": 7.261144056376978e-06, "loss": 0.2394, "step": 5370 }, { "epoch": 1.8529563868298569, "grad_norm": 1.1819390379008323, "learning_rate": 7.256163629252784e-06, "loss": 0.2624, "step": 5375 }, { "epoch": 1.8546802275469747, "grad_norm": 1.0898053611089158, "learning_rate": 7.251180389684849e-06, "loss": 0.2606, "step": 5380 }, { "epoch": 1.8564040682640925, "grad_norm": 1.056902359024124, "learning_rate": 7.246194343885082e-06, "loss": 0.2613, "step": 5385 }, { "epoch": 1.8581279089812102, "grad_norm": 1.0845746443904702, "learning_rate": 7.2412054980688905e-06, "loss": 0.2891, "step": 5390 }, { "epoch": 1.8598517496983278, "grad_norm": 1.2084838861539404, "learning_rate": 7.23621385845517e-06, "loss": 0.2816, "step": 5395 }, { "epoch": 1.8615755904154456, "grad_norm": 1.2217850486847586, "learning_rate": 7.2312194312663e-06, "loss": 0.2839, "step": 5400 }, { "epoch": 1.8632994311325635, "grad_norm": 1.1310132370248838, "learning_rate": 7.226222222728134e-06, "loss": 0.2485, "step": 5405 }, { "epoch": 1.865023271849681, "grad_norm": 1.1136582290738202, "learning_rate": 7.221222239069994e-06, "loss": 0.2721, "step": 5410 }, { "epoch": 1.8667471125667987, "grad_norm": 1.1600893472584881, "learning_rate": 7.216219486524659e-06, "loss": 0.2496, "step": 5415 }, { "epoch": 1.8684709532839165, "grad_norm": 1.0477615037842176, "learning_rate": 7.211213971328364e-06, "loss": 0.262, "step": 5420 }, { "epoch": 1.8701947940010344, "grad_norm": 1.1123140931236568, "learning_rate": 7.206205699720782e-06, "loss": 0.2707, "step": 5425 }, { "epoch": 1.871918634718152, "grad_norm": 1.2651961367995357, "learning_rate": 7.201194677945027e-06, "loss": 0.2932, "step": 5430 }, { "epoch": 1.8736424754352696, "grad_norm": 1.0153308064883255, "learning_rate": 7.196180912247637e-06, "loss": 0.2912, "step": 5435 }, { "epoch": 1.8753663161523875, "grad_norm": 1.0469023717406651, "learning_rate": 7.191164408878575e-06, "loss": 0.2661, "step": 5440 }, { "epoch": 1.8770901568695053, "grad_norm": 1.15204943776014, "learning_rate": 7.186145174091214e-06, "loss": 0.2774, "step": 5445 }, { "epoch": 1.8788139975866232, "grad_norm": 1.294374183716143, "learning_rate": 7.181123214142331e-06, "loss": 0.2909, "step": 5450 }, { "epoch": 1.8805378383037408, "grad_norm": 1.2237541265828948, "learning_rate": 7.176098535292101e-06, "loss": 0.2701, "step": 5455 }, { "epoch": 1.8822616790208584, "grad_norm": 1.1134729489128907, "learning_rate": 7.171071143804089e-06, "loss": 0.2591, "step": 5460 }, { "epoch": 1.8839855197379762, "grad_norm": 0.9771169461106862, "learning_rate": 7.166041045945242e-06, "loss": 0.2776, "step": 5465 }, { "epoch": 1.885709360455094, "grad_norm": 1.164004332509995, "learning_rate": 7.161008247985881e-06, "loss": 0.2524, "step": 5470 }, { "epoch": 1.8874332011722117, "grad_norm": 1.036062945872068, "learning_rate": 7.155972756199688e-06, "loss": 0.2781, "step": 5475 }, { "epoch": 1.8891570418893293, "grad_norm": 1.2103960421924118, "learning_rate": 7.150934576863708e-06, "loss": 0.2834, "step": 5480 }, { "epoch": 1.8908808826064472, "grad_norm": 1.24293340973761, "learning_rate": 7.145893716258335e-06, "loss": 0.2778, "step": 5485 }, { "epoch": 1.892604723323565, "grad_norm": 0.9832689197280934, "learning_rate": 7.140850180667306e-06, "loss": 0.2282, "step": 5490 }, { "epoch": 1.8943285640406826, "grad_norm": 1.0331895067046448, "learning_rate": 7.13580397637769e-06, "loss": 0.2768, "step": 5495 }, { "epoch": 1.8960524047578002, "grad_norm": 1.1923177256654112, "learning_rate": 7.1307551096798855e-06, "loss": 0.2557, "step": 5500 }, { "epoch": 1.897776245474918, "grad_norm": 1.5997663160143192, "learning_rate": 7.1257035868676085e-06, "loss": 0.2711, "step": 5505 }, { "epoch": 1.899500086192036, "grad_norm": 1.194254895138194, "learning_rate": 7.120649414237885e-06, "loss": 0.2847, "step": 5510 }, { "epoch": 1.9012239269091535, "grad_norm": 1.0880952414876823, "learning_rate": 7.115592598091046e-06, "loss": 0.2535, "step": 5515 }, { "epoch": 1.9029477676262714, "grad_norm": 1.0888244601473853, "learning_rate": 7.110533144730718e-06, "loss": 0.2422, "step": 5520 }, { "epoch": 1.904671608343389, "grad_norm": 1.0598652883511703, "learning_rate": 7.105471060463814e-06, "loss": 0.2874, "step": 5525 }, { "epoch": 1.9063954490605068, "grad_norm": 1.291465991800003, "learning_rate": 7.1004063516005265e-06, "loss": 0.2776, "step": 5530 }, { "epoch": 1.9081192897776247, "grad_norm": 0.9859693890889243, "learning_rate": 7.095339024454316e-06, "loss": 0.2691, "step": 5535 }, { "epoch": 1.9098431304947423, "grad_norm": 1.0642240673720773, "learning_rate": 7.0902690853419185e-06, "loss": 0.2611, "step": 5540 }, { "epoch": 1.91156697121186, "grad_norm": 1.1138711581684404, "learning_rate": 7.085196540583312e-06, "loss": 0.259, "step": 5545 }, { "epoch": 1.9132908119289778, "grad_norm": 1.4743476326248401, "learning_rate": 7.080121396501733e-06, "loss": 0.2917, "step": 5550 }, { "epoch": 1.9150146526460956, "grad_norm": 1.7890008769900765, "learning_rate": 7.075043659423648e-06, "loss": 0.2713, "step": 5555 }, { "epoch": 1.9167384933632132, "grad_norm": 1.1299933829421482, "learning_rate": 7.069963335678767e-06, "loss": 0.2611, "step": 5560 }, { "epoch": 1.9184623340803308, "grad_norm": 1.168157969901466, "learning_rate": 7.06488043160002e-06, "loss": 0.2611, "step": 5565 }, { "epoch": 1.9201861747974487, "grad_norm": 1.653223887377282, "learning_rate": 7.059794953523549e-06, "loss": 0.2576, "step": 5570 }, { "epoch": 1.9219100155145665, "grad_norm": 1.0833739041565649, "learning_rate": 7.054706907788711e-06, "loss": 0.2644, "step": 5575 }, { "epoch": 1.9236338562316841, "grad_norm": 1.059492405661376, "learning_rate": 7.049616300738059e-06, "loss": 0.261, "step": 5580 }, { "epoch": 1.9253576969488018, "grad_norm": 1.5814448412443767, "learning_rate": 7.044523138717344e-06, "loss": 0.2698, "step": 5585 }, { "epoch": 1.9270815376659196, "grad_norm": 1.1358794117729327, "learning_rate": 7.0394274280754984e-06, "loss": 0.2666, "step": 5590 }, { "epoch": 1.9288053783830374, "grad_norm": 1.2958086531655535, "learning_rate": 7.0343291751646295e-06, "loss": 0.2941, "step": 5595 }, { "epoch": 1.9305292191001553, "grad_norm": 1.1203931328301828, "learning_rate": 7.029228386340017e-06, "loss": 0.2757, "step": 5600 }, { "epoch": 1.932253059817273, "grad_norm": 8.170377614516136, "learning_rate": 7.024125067960104e-06, "loss": 0.2606, "step": 5605 }, { "epoch": 1.9339769005343905, "grad_norm": 1.3417798966601795, "learning_rate": 7.019019226386482e-06, "loss": 0.2777, "step": 5610 }, { "epoch": 1.9357007412515084, "grad_norm": 1.3041374667829848, "learning_rate": 7.0139108679838885e-06, "loss": 0.2555, "step": 5615 }, { "epoch": 1.9374245819686262, "grad_norm": 1.0226608905689267, "learning_rate": 7.008799999120203e-06, "loss": 0.2752, "step": 5620 }, { "epoch": 1.9391484226857438, "grad_norm": 1.1926622366516715, "learning_rate": 7.003686626166429e-06, "loss": 0.2569, "step": 5625 }, { "epoch": 1.9408722634028615, "grad_norm": 1.7781872868378437, "learning_rate": 6.998570755496694e-06, "loss": 0.2475, "step": 5630 }, { "epoch": 1.9425961041199793, "grad_norm": 0.9988247278701624, "learning_rate": 6.993452393488238e-06, "loss": 0.2794, "step": 5635 }, { "epoch": 1.9443199448370971, "grad_norm": 1.1284938143658345, "learning_rate": 6.988331546521408e-06, "loss": 0.2683, "step": 5640 }, { "epoch": 1.9460437855542148, "grad_norm": 1.2099722282233312, "learning_rate": 6.983208220979647e-06, "loss": 0.2572, "step": 5645 }, { "epoch": 1.9477676262713324, "grad_norm": 1.249136336362036, "learning_rate": 6.978082423249491e-06, "loss": 0.2646, "step": 5650 }, { "epoch": 1.9494914669884502, "grad_norm": 1.1307235999444802, "learning_rate": 6.972954159720552e-06, "loss": 0.2664, "step": 5655 }, { "epoch": 1.951215307705568, "grad_norm": 1.114170233199106, "learning_rate": 6.967823436785521e-06, "loss": 0.2636, "step": 5660 }, { "epoch": 1.952939148422686, "grad_norm": 1.1602703323436812, "learning_rate": 6.962690260840153e-06, "loss": 0.2666, "step": 5665 }, { "epoch": 1.9546629891398035, "grad_norm": 1.022057578742568, "learning_rate": 6.9575546382832615e-06, "loss": 0.2911, "step": 5670 }, { "epoch": 1.9563868298569211, "grad_norm": 1.1162004610083973, "learning_rate": 6.952416575516707e-06, "loss": 0.2702, "step": 5675 }, { "epoch": 1.958110670574039, "grad_norm": 1.1672309636467573, "learning_rate": 6.947276078945393e-06, "loss": 0.2787, "step": 5680 }, { "epoch": 1.9598345112911568, "grad_norm": 1.083713088049749, "learning_rate": 6.942133154977263e-06, "loss": 0.25, "step": 5685 }, { "epoch": 1.9615583520082744, "grad_norm": 1.034910496878224, "learning_rate": 6.936987810023277e-06, "loss": 0.2648, "step": 5690 }, { "epoch": 1.963282192725392, "grad_norm": 1.047401038681741, "learning_rate": 6.931840050497417e-06, "loss": 0.2591, "step": 5695 }, { "epoch": 1.96500603344251, "grad_norm": 1.0375177459093605, "learning_rate": 6.9266898828166774e-06, "loss": 0.2393, "step": 5700 }, { "epoch": 1.9667298741596277, "grad_norm": 1.4160738858000166, "learning_rate": 6.92153731340105e-06, "loss": 0.2775, "step": 5705 }, { "epoch": 1.9684537148767454, "grad_norm": 1.0380061179428337, "learning_rate": 6.9163823486735245e-06, "loss": 0.2672, "step": 5710 }, { "epoch": 1.970177555593863, "grad_norm": 1.0168733689431326, "learning_rate": 6.9112249950600726e-06, "loss": 0.2774, "step": 5715 }, { "epoch": 1.9719013963109808, "grad_norm": 1.2152116140898908, "learning_rate": 6.9060652589896485e-06, "loss": 0.243, "step": 5720 }, { "epoch": 1.9736252370280987, "grad_norm": 1.6474829672831846, "learning_rate": 6.900903146894171e-06, "loss": 0.2531, "step": 5725 }, { "epoch": 1.9753490777452165, "grad_norm": 1.2142546628291522, "learning_rate": 6.895738665208526e-06, "loss": 0.247, "step": 5730 }, { "epoch": 1.9770729184623341, "grad_norm": 1.0326790077881896, "learning_rate": 6.8905718203705485e-06, "loss": 0.2627, "step": 5735 }, { "epoch": 1.9787967591794517, "grad_norm": 1.4913206458086243, "learning_rate": 6.885402618821022e-06, "loss": 0.2749, "step": 5740 }, { "epoch": 1.9805205998965696, "grad_norm": 1.392133050519287, "learning_rate": 6.88023106700367e-06, "loss": 0.2774, "step": 5745 }, { "epoch": 1.9822444406136874, "grad_norm": 1.0796781573181105, "learning_rate": 6.875057171365139e-06, "loss": 0.2745, "step": 5750 }, { "epoch": 1.983968281330805, "grad_norm": 1.1962868504050188, "learning_rate": 6.869880938355004e-06, "loss": 0.2701, "step": 5755 }, { "epoch": 1.9856921220479227, "grad_norm": 1.2042347082839586, "learning_rate": 6.864702374425749e-06, "loss": 0.2794, "step": 5760 }, { "epoch": 1.9874159627650405, "grad_norm": 1.3125219172293308, "learning_rate": 6.859521486032768e-06, "loss": 0.2599, "step": 5765 }, { "epoch": 1.9891398034821584, "grad_norm": 1.1429032279231959, "learning_rate": 6.854338279634349e-06, "loss": 0.2537, "step": 5770 }, { "epoch": 1.990863644199276, "grad_norm": 1.487277914324487, "learning_rate": 6.849152761691671e-06, "loss": 0.2572, "step": 5775 }, { "epoch": 1.9925874849163936, "grad_norm": 1.0745202819769721, "learning_rate": 6.843964938668792e-06, "loss": 0.2939, "step": 5780 }, { "epoch": 1.9943113256335114, "grad_norm": 1.0428930706243056, "learning_rate": 6.838774817032648e-06, "loss": 0.2883, "step": 5785 }, { "epoch": 1.9960351663506293, "grad_norm": 1.6922282787293423, "learning_rate": 6.833582403253038e-06, "loss": 0.2332, "step": 5790 }, { "epoch": 1.997759007067747, "grad_norm": 1.278542198963824, "learning_rate": 6.8283877038026185e-06, "loss": 0.2842, "step": 5795 }, { "epoch": 1.9994828477848645, "grad_norm": 1.1741569666338092, "learning_rate": 6.823190725156892e-06, "loss": 0.234, "step": 5800 }, { "epoch": 2.0010343044302705, "grad_norm": 1.069771793210654, "learning_rate": 6.817991473794207e-06, "loss": 0.2566, "step": 5805 }, { "epoch": 2.0027581451473884, "grad_norm": 1.1022591623453037, "learning_rate": 6.812789956195745e-06, "loss": 0.2127, "step": 5810 }, { "epoch": 2.004481985864506, "grad_norm": 1.2427733300346624, "learning_rate": 6.807586178845509e-06, "loss": 0.227, "step": 5815 }, { "epoch": 2.006205826581624, "grad_norm": 1.1609130023882328, "learning_rate": 6.80238014823032e-06, "loss": 0.2329, "step": 5820 }, { "epoch": 2.0079296672987414, "grad_norm": 1.2194015635674407, "learning_rate": 6.797171870839809e-06, "loss": 0.2338, "step": 5825 }, { "epoch": 2.0096535080158593, "grad_norm": 1.1713733585642396, "learning_rate": 6.791961353166408e-06, "loss": 0.2127, "step": 5830 }, { "epoch": 2.011377348732977, "grad_norm": 1.1607859835299417, "learning_rate": 6.786748601705341e-06, "loss": 0.2632, "step": 5835 }, { "epoch": 2.013101189450095, "grad_norm": 1.144078788264028, "learning_rate": 6.781533622954615e-06, "loss": 0.2149, "step": 5840 }, { "epoch": 2.0148250301672124, "grad_norm": 1.0973643308018433, "learning_rate": 6.776316423415015e-06, "loss": 0.2475, "step": 5845 }, { "epoch": 2.01654887088433, "grad_norm": 1.13899084035407, "learning_rate": 6.7710970095900956e-06, "loss": 0.2337, "step": 5850 }, { "epoch": 2.018272711601448, "grad_norm": 1.8061306377516608, "learning_rate": 6.76587538798617e-06, "loss": 0.2657, "step": 5855 }, { "epoch": 2.019996552318566, "grad_norm": 1.0832860811873404, "learning_rate": 6.7606515651123e-06, "loss": 0.2378, "step": 5860 }, { "epoch": 2.0217203930356833, "grad_norm": 1.0390851457035901, "learning_rate": 6.755425547480301e-06, "loss": 0.2438, "step": 5865 }, { "epoch": 2.023444233752801, "grad_norm": 1.105889528965176, "learning_rate": 6.750197341604714e-06, "loss": 0.2285, "step": 5870 }, { "epoch": 2.025168074469919, "grad_norm": 1.219541978578336, "learning_rate": 6.744966954002816e-06, "loss": 0.221, "step": 5875 }, { "epoch": 2.026891915187037, "grad_norm": 1.3359271211676251, "learning_rate": 6.7397343911945965e-06, "loss": 0.2187, "step": 5880 }, { "epoch": 2.0286157559041547, "grad_norm": 1.1947466098000317, "learning_rate": 6.734499659702761e-06, "loss": 0.2302, "step": 5885 }, { "epoch": 2.030339596621272, "grad_norm": 1.1116329595278311, "learning_rate": 6.72926276605272e-06, "loss": 0.2407, "step": 5890 }, { "epoch": 2.03206343733839, "grad_norm": 1.1124699826421875, "learning_rate": 6.724023716772573e-06, "loss": 0.2289, "step": 5895 }, { "epoch": 2.0337872780555077, "grad_norm": 1.0757617090304594, "learning_rate": 6.718782518393111e-06, "loss": 0.2197, "step": 5900 }, { "epoch": 2.0355111187726256, "grad_norm": 1.2394833449760483, "learning_rate": 6.713539177447805e-06, "loss": 0.2228, "step": 5905 }, { "epoch": 2.037234959489743, "grad_norm": 1.0921680519718104, "learning_rate": 6.708293700472792e-06, "loss": 0.2539, "step": 5910 }, { "epoch": 2.038958800206861, "grad_norm": 1.1967162319205045, "learning_rate": 6.703046094006878e-06, "loss": 0.2396, "step": 5915 }, { "epoch": 2.0406826409239787, "grad_norm": 1.108450216113543, "learning_rate": 6.697796364591517e-06, "loss": 0.2349, "step": 5920 }, { "epoch": 2.0424064816410965, "grad_norm": 1.0881400098795595, "learning_rate": 6.692544518770816e-06, "loss": 0.2092, "step": 5925 }, { "epoch": 2.044130322358214, "grad_norm": 1.1642194846324272, "learning_rate": 6.687290563091515e-06, "loss": 0.2511, "step": 5930 }, { "epoch": 2.0458541630753317, "grad_norm": 1.0733848179823602, "learning_rate": 6.682034504102987e-06, "loss": 0.2416, "step": 5935 }, { "epoch": 2.0475780037924496, "grad_norm": 1.279626520702259, "learning_rate": 6.676776348357224e-06, "loss": 0.2035, "step": 5940 }, { "epoch": 2.0493018445095674, "grad_norm": 1.1210734611683955, "learning_rate": 6.671516102408833e-06, "loss": 0.2464, "step": 5945 }, { "epoch": 2.0510256852266853, "grad_norm": 1.0689961412139548, "learning_rate": 6.66625377281503e-06, "loss": 0.2009, "step": 5950 }, { "epoch": 2.0527495259438027, "grad_norm": 1.1387837258799702, "learning_rate": 6.660989366135624e-06, "loss": 0.2417, "step": 5955 }, { "epoch": 2.0544733666609205, "grad_norm": 1.272758419732411, "learning_rate": 6.655722888933016e-06, "loss": 0.2581, "step": 5960 }, { "epoch": 2.0561972073780383, "grad_norm": 1.2140821035581335, "learning_rate": 6.650454347772184e-06, "loss": 0.2139, "step": 5965 }, { "epoch": 2.057921048095156, "grad_norm": 1.084407306601656, "learning_rate": 6.645183749220685e-06, "loss": 0.2662, "step": 5970 }, { "epoch": 2.0596448888122736, "grad_norm": 1.0838704341161596, "learning_rate": 6.639911099848636e-06, "loss": 0.24, "step": 5975 }, { "epoch": 2.0613687295293914, "grad_norm": 1.1914678111301553, "learning_rate": 6.634636406228711e-06, "loss": 0.2549, "step": 5980 }, { "epoch": 2.0630925702465093, "grad_norm": 1.1044635016861235, "learning_rate": 6.629359674936132e-06, "loss": 0.2451, "step": 5985 }, { "epoch": 2.064816410963627, "grad_norm": 1.1536147284580867, "learning_rate": 6.624080912548665e-06, "loss": 0.2476, "step": 5990 }, { "epoch": 2.0665402516807445, "grad_norm": 1.1421925526144936, "learning_rate": 6.6188001256466025e-06, "loss": 0.241, "step": 5995 }, { "epoch": 2.0682640923978624, "grad_norm": 2.1379547217703285, "learning_rate": 6.613517320812766e-06, "loss": 0.2164, "step": 6000 }, { "epoch": 2.06998793311498, "grad_norm": 1.1122295896891858, "learning_rate": 6.608232504632486e-06, "loss": 0.2376, "step": 6005 }, { "epoch": 2.071711773832098, "grad_norm": 1.039917351999052, "learning_rate": 6.602945683693605e-06, "loss": 0.2443, "step": 6010 }, { "epoch": 2.073435614549216, "grad_norm": 1.1600517769636751, "learning_rate": 6.597656864586466e-06, "loss": 0.2436, "step": 6015 }, { "epoch": 2.0751594552663333, "grad_norm": 1.1065203329203004, "learning_rate": 6.5923660539038995e-06, "loss": 0.2369, "step": 6020 }, { "epoch": 2.076883295983451, "grad_norm": 1.258710551388086, "learning_rate": 6.587073258241215e-06, "loss": 0.2524, "step": 6025 }, { "epoch": 2.078607136700569, "grad_norm": 1.1089161354480546, "learning_rate": 6.581778484196206e-06, "loss": 0.216, "step": 6030 }, { "epoch": 2.080330977417687, "grad_norm": 1.286041663946239, "learning_rate": 6.576481738369126e-06, "loss": 0.2483, "step": 6035 }, { "epoch": 2.082054818134804, "grad_norm": 1.0615172291224826, "learning_rate": 6.571183027362686e-06, "loss": 0.2267, "step": 6040 }, { "epoch": 2.083778658851922, "grad_norm": 1.144691418124422, "learning_rate": 6.565882357782048e-06, "loss": 0.2214, "step": 6045 }, { "epoch": 2.08550249956904, "grad_norm": 1.1203900966572184, "learning_rate": 6.5605797362348175e-06, "loss": 0.2148, "step": 6050 }, { "epoch": 2.0872263402861577, "grad_norm": 1.164264894518462, "learning_rate": 6.555275169331031e-06, "loss": 0.2353, "step": 6055 }, { "epoch": 2.088950181003275, "grad_norm": 1.111396306734735, "learning_rate": 6.5499686636831485e-06, "loss": 0.2371, "step": 6060 }, { "epoch": 2.090674021720393, "grad_norm": 1.484226472860949, "learning_rate": 6.54466022590605e-06, "loss": 0.2284, "step": 6065 }, { "epoch": 2.092397862437511, "grad_norm": 1.2784600960385353, "learning_rate": 6.539349862617023e-06, "loss": 0.2637, "step": 6070 }, { "epoch": 2.0941217031546286, "grad_norm": 2.6398567990968984, "learning_rate": 6.534037580435753e-06, "loss": 0.2326, "step": 6075 }, { "epoch": 2.095845543871746, "grad_norm": 1.0054398119210661, "learning_rate": 6.528723385984322e-06, "loss": 0.2433, "step": 6080 }, { "epoch": 2.097569384588864, "grad_norm": 1.177616826803118, "learning_rate": 6.523407285887192e-06, "loss": 0.2122, "step": 6085 }, { "epoch": 2.0992932253059817, "grad_norm": 1.2851218049211262, "learning_rate": 6.5180892867711996e-06, "loss": 0.238, "step": 6090 }, { "epoch": 2.1010170660230996, "grad_norm": 1.1495087647733981, "learning_rate": 6.512769395265556e-06, "loss": 0.2006, "step": 6095 }, { "epoch": 2.1027409067402174, "grad_norm": 1.0225974867997085, "learning_rate": 6.507447618001821e-06, "loss": 0.2254, "step": 6100 }, { "epoch": 2.104464747457335, "grad_norm": 1.137638390854424, "learning_rate": 6.502123961613912e-06, "loss": 0.2343, "step": 6105 }, { "epoch": 2.1061885881744526, "grad_norm": 1.1809201545717292, "learning_rate": 6.496798432738087e-06, "loss": 0.2232, "step": 6110 }, { "epoch": 2.1079124288915705, "grad_norm": 1.2408473497171375, "learning_rate": 6.491471038012941e-06, "loss": 0.2427, "step": 6115 }, { "epoch": 2.1096362696086883, "grad_norm": 1.2093251008140802, "learning_rate": 6.486141784079387e-06, "loss": 0.2147, "step": 6120 }, { "epoch": 2.1113601103258057, "grad_norm": 1.1284539751014209, "learning_rate": 6.480810677580664e-06, "loss": 0.2219, "step": 6125 }, { "epoch": 2.1130839510429236, "grad_norm": 0.9947179313905126, "learning_rate": 6.4754777251623166e-06, "loss": 0.2403, "step": 6130 }, { "epoch": 2.1148077917600414, "grad_norm": 1.207016487014047, "learning_rate": 6.470142933472191e-06, "loss": 0.2505, "step": 6135 }, { "epoch": 2.1165316324771593, "grad_norm": 1.109965366005483, "learning_rate": 6.464806309160427e-06, "loss": 0.2289, "step": 6140 }, { "epoch": 2.1182554731942767, "grad_norm": 1.2296801343295378, "learning_rate": 6.4594678588794445e-06, "loss": 0.249, "step": 6145 }, { "epoch": 2.1199793139113945, "grad_norm": 1.1709946649721397, "learning_rate": 6.454127589283945e-06, "loss": 0.2609, "step": 6150 }, { "epoch": 2.1217031546285123, "grad_norm": 1.0256537521065312, "learning_rate": 6.448785507030898e-06, "loss": 0.2485, "step": 6155 }, { "epoch": 2.12342699534563, "grad_norm": 1.2294744683344365, "learning_rate": 6.443441618779528e-06, "loss": 0.2316, "step": 6160 }, { "epoch": 2.1251508360627476, "grad_norm": 1.1751304786398409, "learning_rate": 6.438095931191315e-06, "loss": 0.2315, "step": 6165 }, { "epoch": 2.1268746767798654, "grad_norm": 1.0694001684270091, "learning_rate": 6.432748450929977e-06, "loss": 0.256, "step": 6170 }, { "epoch": 2.1285985174969833, "grad_norm": 1.5918972890666852, "learning_rate": 6.4273991846614735e-06, "loss": 0.234, "step": 6175 }, { "epoch": 2.130322358214101, "grad_norm": 2.1061165243228013, "learning_rate": 6.422048139053987e-06, "loss": 0.2231, "step": 6180 }, { "epoch": 2.132046198931219, "grad_norm": 1.2980001891437871, "learning_rate": 6.416695320777915e-06, "loss": 0.2355, "step": 6185 }, { "epoch": 2.1337700396483363, "grad_norm": 1.2013658421140967, "learning_rate": 6.411340736505869e-06, "loss": 0.2312, "step": 6190 }, { "epoch": 2.135493880365454, "grad_norm": 1.4480279744210514, "learning_rate": 6.4059843929126605e-06, "loss": 0.2721, "step": 6195 }, { "epoch": 2.137217721082572, "grad_norm": 0.9910186749108644, "learning_rate": 6.400626296675296e-06, "loss": 0.2449, "step": 6200 }, { "epoch": 2.13894156179969, "grad_norm": 1.4704006413199056, "learning_rate": 6.395266454472963e-06, "loss": 0.2501, "step": 6205 }, { "epoch": 2.1406654025168073, "grad_norm": 1.210407783203279, "learning_rate": 6.389904872987025e-06, "loss": 0.235, "step": 6210 }, { "epoch": 2.142389243233925, "grad_norm": 1.266712887450536, "learning_rate": 6.384541558901021e-06, "loss": 0.274, "step": 6215 }, { "epoch": 2.144113083951043, "grad_norm": 1.0758792414175562, "learning_rate": 6.37917651890064e-06, "loss": 0.2246, "step": 6220 }, { "epoch": 2.145836924668161, "grad_norm": 1.148378596934074, "learning_rate": 6.373809759673733e-06, "loss": 0.195, "step": 6225 }, { "epoch": 2.147560765385278, "grad_norm": 1.2369163372033345, "learning_rate": 6.368441287910281e-06, "loss": 0.2368, "step": 6230 }, { "epoch": 2.149284606102396, "grad_norm": 1.1605685315411505, "learning_rate": 6.3630711103024125e-06, "loss": 0.2299, "step": 6235 }, { "epoch": 2.151008446819514, "grad_norm": 1.153738101931673, "learning_rate": 6.3576992335443764e-06, "loss": 0.229, "step": 6240 }, { "epoch": 2.1527322875366317, "grad_norm": 1.061980878109314, "learning_rate": 6.352325664332539e-06, "loss": 0.2296, "step": 6245 }, { "epoch": 2.1544561282537495, "grad_norm": 0.970850560174058, "learning_rate": 6.346950409365377e-06, "loss": 0.2237, "step": 6250 }, { "epoch": 2.156179968970867, "grad_norm": 1.1016916099458374, "learning_rate": 6.3415734753434736e-06, "loss": 0.218, "step": 6255 }, { "epoch": 2.157903809687985, "grad_norm": 1.193448165025695, "learning_rate": 6.336194868969495e-06, "loss": 0.2224, "step": 6260 }, { "epoch": 2.1596276504051026, "grad_norm": 1.0219575670537784, "learning_rate": 6.3308145969482005e-06, "loss": 0.236, "step": 6265 }, { "epoch": 2.1613514911222205, "grad_norm": 1.272655310419002, "learning_rate": 6.325432665986423e-06, "loss": 0.2487, "step": 6270 }, { "epoch": 2.163075331839338, "grad_norm": 1.8152147982265001, "learning_rate": 6.320049082793063e-06, "loss": 0.2259, "step": 6275 }, { "epoch": 2.1647991725564557, "grad_norm": 1.0786103760624521, "learning_rate": 6.314663854079081e-06, "loss": 0.2337, "step": 6280 }, { "epoch": 2.1665230132735736, "grad_norm": 1.0904656874530174, "learning_rate": 6.309276986557489e-06, "loss": 0.23, "step": 6285 }, { "epoch": 2.1682468539906914, "grad_norm": 1.131407405814106, "learning_rate": 6.30388848694334e-06, "loss": 0.2118, "step": 6290 }, { "epoch": 2.169970694707809, "grad_norm": 2.220220145462081, "learning_rate": 6.298498361953723e-06, "loss": 0.2185, "step": 6295 }, { "epoch": 2.1716945354249266, "grad_norm": 1.1171046668981954, "learning_rate": 6.293106618307757e-06, "loss": 0.2689, "step": 6300 }, { "epoch": 2.1734183761420445, "grad_norm": 1.3104517671771276, "learning_rate": 6.287713262726571e-06, "loss": 0.2286, "step": 6305 }, { "epoch": 2.1751422168591623, "grad_norm": 1.2071183923627506, "learning_rate": 6.2823183019333085e-06, "loss": 0.2403, "step": 6310 }, { "epoch": 2.17686605757628, "grad_norm": 1.5815496517793666, "learning_rate": 6.276921742653113e-06, "loss": 0.2189, "step": 6315 }, { "epoch": 2.1785898982933976, "grad_norm": 1.1528626189674833, "learning_rate": 6.271523591613121e-06, "loss": 0.2688, "step": 6320 }, { "epoch": 2.1803137390105154, "grad_norm": 1.154833714772765, "learning_rate": 6.266123855542452e-06, "loss": 0.2511, "step": 6325 }, { "epoch": 2.1820375797276332, "grad_norm": 1.1271706581188135, "learning_rate": 6.2607225411722005e-06, "loss": 0.2356, "step": 6330 }, { "epoch": 2.183761420444751, "grad_norm": 1.465635437555139, "learning_rate": 6.255319655235432e-06, "loss": 0.2405, "step": 6335 }, { "epoch": 2.1854852611618685, "grad_norm": 1.0110337846358128, "learning_rate": 6.249915204467168e-06, "loss": 0.2353, "step": 6340 }, { "epoch": 2.1872091018789863, "grad_norm": 1.264188343176243, "learning_rate": 6.244509195604383e-06, "loss": 0.2179, "step": 6345 }, { "epoch": 2.188932942596104, "grad_norm": 1.1855417116942413, "learning_rate": 6.2391016353859914e-06, "loss": 0.2331, "step": 6350 }, { "epoch": 2.190656783313222, "grad_norm": 1.121771969018659, "learning_rate": 6.23369253055284e-06, "loss": 0.2319, "step": 6355 }, { "epoch": 2.1923806240303394, "grad_norm": 2.979733543546792, "learning_rate": 6.228281887847708e-06, "loss": 0.2363, "step": 6360 }, { "epoch": 2.1941044647474572, "grad_norm": 0.945998582169751, "learning_rate": 6.222869714015284e-06, "loss": 0.2006, "step": 6365 }, { "epoch": 2.195828305464575, "grad_norm": 1.1385376045750768, "learning_rate": 6.21745601580217e-06, "loss": 0.2387, "step": 6370 }, { "epoch": 2.197552146181693, "grad_norm": 1.1470900066181131, "learning_rate": 6.212040799956865e-06, "loss": 0.2217, "step": 6375 }, { "epoch": 2.1992759868988108, "grad_norm": 1.0910786770760603, "learning_rate": 6.206624073229763e-06, "loss": 0.2384, "step": 6380 }, { "epoch": 2.200999827615928, "grad_norm": 1.21836785295153, "learning_rate": 6.201205842373139e-06, "loss": 0.2408, "step": 6385 }, { "epoch": 2.202723668333046, "grad_norm": 1.1550357487539422, "learning_rate": 6.195786114141145e-06, "loss": 0.2151, "step": 6390 }, { "epoch": 2.204447509050164, "grad_norm": 2.0126094991818735, "learning_rate": 6.190364895289796e-06, "loss": 0.2258, "step": 6395 }, { "epoch": 2.2061713497672817, "grad_norm": 1.1759885295782768, "learning_rate": 6.18494219257697e-06, "loss": 0.2282, "step": 6400 }, { "epoch": 2.207895190484399, "grad_norm": 1.156765171706817, "learning_rate": 6.179518012762391e-06, "loss": 0.2221, "step": 6405 }, { "epoch": 2.209619031201517, "grad_norm": 1.2152068175854134, "learning_rate": 6.174092362607627e-06, "loss": 0.2365, "step": 6410 }, { "epoch": 2.2113428719186348, "grad_norm": 1.1065954471480959, "learning_rate": 6.1686652488760735e-06, "loss": 0.2422, "step": 6415 }, { "epoch": 2.2130667126357526, "grad_norm": 1.1496019358592222, "learning_rate": 6.163236678332959e-06, "loss": 0.2206, "step": 6420 }, { "epoch": 2.21479055335287, "grad_norm": 1.1821890228767273, "learning_rate": 6.157806657745321e-06, "loss": 0.2339, "step": 6425 }, { "epoch": 2.216514394069988, "grad_norm": 1.2580278526002249, "learning_rate": 6.1523751938820085e-06, "loss": 0.2358, "step": 6430 }, { "epoch": 2.2182382347871057, "grad_norm": 1.299244265008732, "learning_rate": 6.146942293513665e-06, "loss": 0.2529, "step": 6435 }, { "epoch": 2.2199620755042235, "grad_norm": 1.2456429273924774, "learning_rate": 6.141507963412732e-06, "loss": 0.2391, "step": 6440 }, { "epoch": 2.2216859162213414, "grad_norm": 1.0876578922412434, "learning_rate": 6.1360722103534255e-06, "loss": 0.2241, "step": 6445 }, { "epoch": 2.2234097569384588, "grad_norm": 1.163992161362365, "learning_rate": 6.130635041111741e-06, "loss": 0.2337, "step": 6450 }, { "epoch": 2.2251335976555766, "grad_norm": 1.305738602201025, "learning_rate": 6.125196462465435e-06, "loss": 0.2314, "step": 6455 }, { "epoch": 2.2268574383726945, "grad_norm": 1.224758584645192, "learning_rate": 6.119756481194025e-06, "loss": 0.2251, "step": 6460 }, { "epoch": 2.2285812790898123, "grad_norm": 1.2406266375311112, "learning_rate": 6.1143151040787755e-06, "loss": 0.2398, "step": 6465 }, { "epoch": 2.2303051198069297, "grad_norm": 1.3967656523094172, "learning_rate": 6.108872337902688e-06, "loss": 0.2192, "step": 6470 }, { "epoch": 2.2320289605240475, "grad_norm": 1.1804813959831304, "learning_rate": 6.1034281894505e-06, "loss": 0.2133, "step": 6475 }, { "epoch": 2.2337528012411654, "grad_norm": 1.270472137454462, "learning_rate": 6.0979826655086695e-06, "loss": 0.2446, "step": 6480 }, { "epoch": 2.235476641958283, "grad_norm": 1.0585835578808256, "learning_rate": 6.09253577286537e-06, "loss": 0.222, "step": 6485 }, { "epoch": 2.2372004826754006, "grad_norm": 1.2609989086875364, "learning_rate": 6.087087518310482e-06, "loss": 0.2413, "step": 6490 }, { "epoch": 2.2389243233925185, "grad_norm": 1.333218721228253, "learning_rate": 6.081637908635581e-06, "loss": 0.228, "step": 6495 }, { "epoch": 2.2406481641096363, "grad_norm": 1.0670271069980255, "learning_rate": 6.076186950633932e-06, "loss": 0.2056, "step": 6500 }, { "epoch": 2.242372004826754, "grad_norm": 1.455046424465367, "learning_rate": 6.070734651100486e-06, "loss": 0.2441, "step": 6505 }, { "epoch": 2.244095845543872, "grad_norm": 1.2331558263115812, "learning_rate": 6.065281016831861e-06, "loss": 0.2075, "step": 6510 }, { "epoch": 2.2458196862609894, "grad_norm": 1.2117871822689017, "learning_rate": 6.059826054626338e-06, "loss": 0.2464, "step": 6515 }, { "epoch": 2.247543526978107, "grad_norm": 1.1156968508704466, "learning_rate": 6.054369771283861e-06, "loss": 0.2264, "step": 6520 }, { "epoch": 2.249267367695225, "grad_norm": 1.4028013307117346, "learning_rate": 6.04891217360601e-06, "loss": 0.2165, "step": 6525 }, { "epoch": 2.2509912084123425, "grad_norm": 1.0582071748324666, "learning_rate": 6.0434532683960134e-06, "loss": 0.2026, "step": 6530 }, { "epoch": 2.2527150491294603, "grad_norm": 1.1495646956895642, "learning_rate": 6.03799306245872e-06, "loss": 0.2301, "step": 6535 }, { "epoch": 2.254438889846578, "grad_norm": 1.1465172093749207, "learning_rate": 6.03253156260061e-06, "loss": 0.2042, "step": 6540 }, { "epoch": 2.256162730563696, "grad_norm": 1.124781134057256, "learning_rate": 6.027068775629768e-06, "loss": 0.241, "step": 6545 }, { "epoch": 2.257886571280814, "grad_norm": 1.3024720247117805, "learning_rate": 6.02160470835589e-06, "loss": 0.228, "step": 6550 }, { "epoch": 2.2596104119979312, "grad_norm": 1.3853558749547408, "learning_rate": 6.016139367590263e-06, "loss": 0.2256, "step": 6555 }, { "epoch": 2.261334252715049, "grad_norm": 1.2302090593193726, "learning_rate": 6.010672760145762e-06, "loss": 0.247, "step": 6560 }, { "epoch": 2.263058093432167, "grad_norm": 1.2159809112138422, "learning_rate": 6.005204892836843e-06, "loss": 0.2096, "step": 6565 }, { "epoch": 2.2647819341492847, "grad_norm": 1.1668358935339171, "learning_rate": 5.9997357724795325e-06, "loss": 0.2445, "step": 6570 }, { "epoch": 2.2665057748664026, "grad_norm": 1.0030758336245051, "learning_rate": 5.9942654058914184e-06, "loss": 0.2268, "step": 6575 }, { "epoch": 2.26822961558352, "grad_norm": 1.220413035318449, "learning_rate": 5.988793799891639e-06, "loss": 0.2362, "step": 6580 }, { "epoch": 2.269953456300638, "grad_norm": 1.2409356225757247, "learning_rate": 5.983320961300886e-06, "loss": 0.2218, "step": 6585 }, { "epoch": 2.2716772970177557, "grad_norm": 1.1814271142656403, "learning_rate": 5.977846896941376e-06, "loss": 0.2321, "step": 6590 }, { "epoch": 2.273401137734873, "grad_norm": 1.1844509666036434, "learning_rate": 5.972371613636863e-06, "loss": 0.2197, "step": 6595 }, { "epoch": 2.275124978451991, "grad_norm": 1.3058894691253469, "learning_rate": 5.966895118212615e-06, "loss": 0.2438, "step": 6600 }, { "epoch": 2.2768488191691088, "grad_norm": 1.2269709989697146, "learning_rate": 5.961417417495416e-06, "loss": 0.236, "step": 6605 }, { "epoch": 2.2785726598862266, "grad_norm": 1.2241053358319587, "learning_rate": 5.955938518313549e-06, "loss": 0.2181, "step": 6610 }, { "epoch": 2.2802965006033444, "grad_norm": 0.9522467273279619, "learning_rate": 5.950458427496789e-06, "loss": 0.235, "step": 6615 }, { "epoch": 2.282020341320462, "grad_norm": 1.266499686373801, "learning_rate": 5.944977151876402e-06, "loss": 0.2462, "step": 6620 }, { "epoch": 2.2837441820375797, "grad_norm": 1.2230910063692286, "learning_rate": 5.939494698285125e-06, "loss": 0.2204, "step": 6625 }, { "epoch": 2.2854680227546975, "grad_norm": 1.1978976267327752, "learning_rate": 5.934011073557169e-06, "loss": 0.2208, "step": 6630 }, { "epoch": 2.2871918634718154, "grad_norm": 1.0298786417340702, "learning_rate": 5.928526284528202e-06, "loss": 0.215, "step": 6635 }, { "epoch": 2.2889157041889328, "grad_norm": 1.1331157635471427, "learning_rate": 5.923040338035339e-06, "loss": 0.2241, "step": 6640 }, { "epoch": 2.2906395449060506, "grad_norm": 1.0687985229605128, "learning_rate": 5.917553240917151e-06, "loss": 0.221, "step": 6645 }, { "epoch": 2.2923633856231684, "grad_norm": 1.1676284370384098, "learning_rate": 5.912065000013627e-06, "loss": 0.2264, "step": 6650 }, { "epoch": 2.2940872263402863, "grad_norm": 1.1272516773874, "learning_rate": 5.906575622166193e-06, "loss": 0.2151, "step": 6655 }, { "epoch": 2.2958110670574037, "grad_norm": 1.2006669040848492, "learning_rate": 5.9010851142176884e-06, "loss": 0.1966, "step": 6660 }, { "epoch": 2.2975349077745215, "grad_norm": 1.0535753392295237, "learning_rate": 5.895593483012362e-06, "loss": 0.1946, "step": 6665 }, { "epoch": 2.2992587484916394, "grad_norm": 1.4714542006763698, "learning_rate": 5.890100735395864e-06, "loss": 0.2463, "step": 6670 }, { "epoch": 2.300982589208757, "grad_norm": 1.0413354232245149, "learning_rate": 5.884606878215231e-06, "loss": 0.2246, "step": 6675 }, { "epoch": 2.302706429925875, "grad_norm": 1.186952337907024, "learning_rate": 5.87911191831889e-06, "loss": 0.2285, "step": 6680 }, { "epoch": 2.3044302706429924, "grad_norm": 1.1518388534893138, "learning_rate": 5.873615862556636e-06, "loss": 0.2093, "step": 6685 }, { "epoch": 2.3061541113601103, "grad_norm": 0.925470444073306, "learning_rate": 5.868118717779636e-06, "loss": 0.183, "step": 6690 }, { "epoch": 2.307877952077228, "grad_norm": 1.1658438303375276, "learning_rate": 5.8626204908404125e-06, "loss": 0.235, "step": 6695 }, { "epoch": 2.309601792794346, "grad_norm": 1.2464114844329484, "learning_rate": 5.857121188592834e-06, "loss": 0.2476, "step": 6700 }, { "epoch": 2.3113256335114634, "grad_norm": 1.2086437497264788, "learning_rate": 5.851620817892112e-06, "loss": 0.2385, "step": 6705 }, { "epoch": 2.313049474228581, "grad_norm": 1.2879945468527676, "learning_rate": 5.846119385594789e-06, "loss": 0.2325, "step": 6710 }, { "epoch": 2.314773314945699, "grad_norm": 1.2630188960941966, "learning_rate": 5.840616898558734e-06, "loss": 0.2393, "step": 6715 }, { "epoch": 2.316497155662817, "grad_norm": 1.2771883079158053, "learning_rate": 5.835113363643126e-06, "loss": 0.2041, "step": 6720 }, { "epoch": 2.3182209963799343, "grad_norm": 1.2023618651995445, "learning_rate": 5.829608787708454e-06, "loss": 0.2291, "step": 6725 }, { "epoch": 2.319944837097052, "grad_norm": 1.0785269405748918, "learning_rate": 5.8241031776165035e-06, "loss": 0.253, "step": 6730 }, { "epoch": 2.32166867781417, "grad_norm": 1.151900270329759, "learning_rate": 5.818596540230346e-06, "loss": 0.2173, "step": 6735 }, { "epoch": 2.323392518531288, "grad_norm": 1.161962981954948, "learning_rate": 5.8130888824143384e-06, "loss": 0.2003, "step": 6740 }, { "epoch": 2.3251163592484057, "grad_norm": 1.505530224772604, "learning_rate": 5.807580211034106e-06, "loss": 0.2142, "step": 6745 }, { "epoch": 2.326840199965523, "grad_norm": 1.2105429985143288, "learning_rate": 5.802070532956542e-06, "loss": 0.2103, "step": 6750 }, { "epoch": 2.328564040682641, "grad_norm": 1.306197584762026, "learning_rate": 5.796559855049791e-06, "loss": 0.2245, "step": 6755 }, { "epoch": 2.3302878813997587, "grad_norm": 1.5574206245981117, "learning_rate": 5.7910481841832424e-06, "loss": 0.22, "step": 6760 }, { "epoch": 2.3320117221168766, "grad_norm": 1.1763708628501528, "learning_rate": 5.785535527227527e-06, "loss": 0.2179, "step": 6765 }, { "epoch": 2.333735562833994, "grad_norm": 1.1856196939443528, "learning_rate": 5.780021891054504e-06, "loss": 0.2186, "step": 6770 }, { "epoch": 2.335459403551112, "grad_norm": 1.1408446330456214, "learning_rate": 5.774507282537251e-06, "loss": 0.2172, "step": 6775 }, { "epoch": 2.3371832442682297, "grad_norm": 1.151214016085517, "learning_rate": 5.7689917085500625e-06, "loss": 0.2345, "step": 6780 }, { "epoch": 2.3389070849853475, "grad_norm": 1.0477149333534155, "learning_rate": 5.763475175968429e-06, "loss": 0.2131, "step": 6785 }, { "epoch": 2.340630925702465, "grad_norm": 1.0639308549219988, "learning_rate": 5.7579576916690465e-06, "loss": 0.2146, "step": 6790 }, { "epoch": 2.3423547664195827, "grad_norm": 1.1565149305617368, "learning_rate": 5.752439262529784e-06, "loss": 0.1999, "step": 6795 }, { "epoch": 2.3440786071367006, "grad_norm": 1.219503174438488, "learning_rate": 5.7469198954297005e-06, "loss": 0.2057, "step": 6800 }, { "epoch": 2.3458024478538184, "grad_norm": 1.1980585499435352, "learning_rate": 5.7413995972490174e-06, "loss": 0.2265, "step": 6805 }, { "epoch": 2.3475262885709363, "grad_norm": 1.3185695988317838, "learning_rate": 5.7358783748691194e-06, "loss": 0.2498, "step": 6810 }, { "epoch": 2.3492501292880537, "grad_norm": 1.1976799909402038, "learning_rate": 5.730356235172543e-06, "loss": 0.2132, "step": 6815 }, { "epoch": 2.3509739700051715, "grad_norm": 1.1704062573965308, "learning_rate": 5.724833185042965e-06, "loss": 0.2334, "step": 6820 }, { "epoch": 2.3526978107222893, "grad_norm": 1.1114083975412845, "learning_rate": 5.719309231365202e-06, "loss": 0.2091, "step": 6825 }, { "epoch": 2.354421651439407, "grad_norm": 1.1393640520800679, "learning_rate": 5.713784381025194e-06, "loss": 0.2236, "step": 6830 }, { "epoch": 2.3561454921565246, "grad_norm": 1.134556435275592, "learning_rate": 5.7082586409100005e-06, "loss": 0.2056, "step": 6835 }, { "epoch": 2.3578693328736424, "grad_norm": 1.087144351801372, "learning_rate": 5.702732017907788e-06, "loss": 0.2081, "step": 6840 }, { "epoch": 2.3595931735907603, "grad_norm": 1.540543465229459, "learning_rate": 5.697204518907823e-06, "loss": 0.228, "step": 6845 }, { "epoch": 2.361317014307878, "grad_norm": 1.14562385187818, "learning_rate": 5.69167615080047e-06, "loss": 0.2152, "step": 6850 }, { "epoch": 2.3630408550249955, "grad_norm": 1.2752884151374904, "learning_rate": 5.686146920477169e-06, "loss": 0.1976, "step": 6855 }, { "epoch": 2.3647646957421133, "grad_norm": 1.1263082918927592, "learning_rate": 5.680616834830439e-06, "loss": 0.226, "step": 6860 }, { "epoch": 2.366488536459231, "grad_norm": 1.2588818960989305, "learning_rate": 5.675085900753865e-06, "loss": 0.2336, "step": 6865 }, { "epoch": 2.368212377176349, "grad_norm": 1.0271877557731728, "learning_rate": 5.669554125142089e-06, "loss": 0.207, "step": 6870 }, { "epoch": 2.369936217893467, "grad_norm": 1.9174249095640592, "learning_rate": 5.664021514890804e-06, "loss": 0.2111, "step": 6875 }, { "epoch": 2.3716600586105843, "grad_norm": 1.150945943954569, "learning_rate": 5.658488076896739e-06, "loss": 0.2381, "step": 6880 }, { "epoch": 2.373383899327702, "grad_norm": 1.1053485694021632, "learning_rate": 5.6529538180576574e-06, "loss": 0.2238, "step": 6885 }, { "epoch": 2.37510774004482, "grad_norm": 1.114827238948087, "learning_rate": 5.647418745272347e-06, "loss": 0.2272, "step": 6890 }, { "epoch": 2.376831580761938, "grad_norm": 1.211894614332284, "learning_rate": 5.64188286544061e-06, "loss": 0.2582, "step": 6895 }, { "epoch": 2.378555421479055, "grad_norm": 1.0331736006740053, "learning_rate": 5.636346185463254e-06, "loss": 0.227, "step": 6900 }, { "epoch": 2.380279262196173, "grad_norm": 1.2115995995308106, "learning_rate": 5.630808712242081e-06, "loss": 0.2308, "step": 6905 }, { "epoch": 2.382003102913291, "grad_norm": 1.249508815570148, "learning_rate": 5.6252704526798855e-06, "loss": 0.2356, "step": 6910 }, { "epoch": 2.3837269436304087, "grad_norm": 1.2668841587373507, "learning_rate": 5.619731413680443e-06, "loss": 0.2175, "step": 6915 }, { "epoch": 2.385450784347526, "grad_norm": 1.004589133549024, "learning_rate": 5.614191602148498e-06, "loss": 0.193, "step": 6920 }, { "epoch": 2.387174625064644, "grad_norm": 1.124115544035202, "learning_rate": 5.6086510249897576e-06, "loss": 0.2597, "step": 6925 }, { "epoch": 2.388898465781762, "grad_norm": 1.0615474999083314, "learning_rate": 5.603109689110887e-06, "loss": 0.2226, "step": 6930 }, { "epoch": 2.3906223064988796, "grad_norm": 1.2306413672173773, "learning_rate": 5.597567601419496e-06, "loss": 0.2213, "step": 6935 }, { "epoch": 2.3923461472159975, "grad_norm": 1.173560341722914, "learning_rate": 5.592024768824126e-06, "loss": 0.225, "step": 6940 }, { "epoch": 2.394069987933115, "grad_norm": 1.2633080823318978, "learning_rate": 5.586481198234253e-06, "loss": 0.2289, "step": 6945 }, { "epoch": 2.3957938286502327, "grad_norm": 1.1173136748171157, "learning_rate": 5.580936896560273e-06, "loss": 0.2071, "step": 6950 }, { "epoch": 2.3975176693673506, "grad_norm": 1.105475600922252, "learning_rate": 5.57539187071349e-06, "loss": 0.2146, "step": 6955 }, { "epoch": 2.399241510084468, "grad_norm": 1.147403948431547, "learning_rate": 5.569846127606115e-06, "loss": 0.2115, "step": 6960 }, { "epoch": 2.400965350801586, "grad_norm": 1.0777348019729607, "learning_rate": 5.564299674151248e-06, "loss": 0.1989, "step": 6965 }, { "epoch": 2.4026891915187036, "grad_norm": 5.556592377891615, "learning_rate": 5.558752517262877e-06, "loss": 0.211, "step": 6970 }, { "epoch": 2.4044130322358215, "grad_norm": 1.125771209945684, "learning_rate": 5.553204663855868e-06, "loss": 0.2231, "step": 6975 }, { "epoch": 2.4061368729529393, "grad_norm": 1.1591474019986616, "learning_rate": 5.547656120845953e-06, "loss": 0.218, "step": 6980 }, { "epoch": 2.4078607136700567, "grad_norm": 1.1585412151784338, "learning_rate": 5.542106895149727e-06, "loss": 0.2238, "step": 6985 }, { "epoch": 2.4095845543871746, "grad_norm": 1.2253255427195415, "learning_rate": 5.5365569936846294e-06, "loss": 0.2375, "step": 6990 }, { "epoch": 2.4113083951042924, "grad_norm": 1.1242386635647112, "learning_rate": 5.531006423368953e-06, "loss": 0.2164, "step": 6995 }, { "epoch": 2.4130322358214102, "grad_norm": 1.3095471221364696, "learning_rate": 5.5254551911218114e-06, "loss": 0.2319, "step": 7000 }, { "epoch": 2.414756076538528, "grad_norm": 1.0918409076710234, "learning_rate": 5.519903303863153e-06, "loss": 0.2326, "step": 7005 }, { "epoch": 2.4164799172556455, "grad_norm": 1.3028504273451234, "learning_rate": 5.514350768513738e-06, "loss": 0.2142, "step": 7010 }, { "epoch": 2.4182037579727633, "grad_norm": 1.0861765578487275, "learning_rate": 5.5087975919951374e-06, "loss": 0.1967, "step": 7015 }, { "epoch": 2.419927598689881, "grad_norm": 1.1199745344635728, "learning_rate": 5.503243781229719e-06, "loss": 0.2099, "step": 7020 }, { "epoch": 2.4216514394069986, "grad_norm": 1.2399659497777336, "learning_rate": 5.497689343140642e-06, "loss": 0.2293, "step": 7025 }, { "epoch": 2.4233752801241164, "grad_norm": 1.1536479623138571, "learning_rate": 5.4921342846518475e-06, "loss": 0.2198, "step": 7030 }, { "epoch": 2.4250991208412342, "grad_norm": 2.9483122443726, "learning_rate": 5.486578612688051e-06, "loss": 0.2122, "step": 7035 }, { "epoch": 2.426822961558352, "grad_norm": 1.0502328215848433, "learning_rate": 5.4810223341747315e-06, "loss": 0.213, "step": 7040 }, { "epoch": 2.42854680227547, "grad_norm": 1.2517019386272146, "learning_rate": 5.4754654560381245e-06, "loss": 0.2308, "step": 7045 }, { "epoch": 2.4302706429925873, "grad_norm": 1.058335724534813, "learning_rate": 5.469907985205212e-06, "loss": 0.2048, "step": 7050 }, { "epoch": 2.431994483709705, "grad_norm": 1.1595293528986441, "learning_rate": 5.4643499286037195e-06, "loss": 0.2176, "step": 7055 }, { "epoch": 2.433718324426823, "grad_norm": 1.1693707538220317, "learning_rate": 5.458791293162095e-06, "loss": 0.208, "step": 7060 }, { "epoch": 2.435442165143941, "grad_norm": 1.2031321093299394, "learning_rate": 5.453232085809514e-06, "loss": 0.2293, "step": 7065 }, { "epoch": 2.4371660058610582, "grad_norm": 1.1133781109654826, "learning_rate": 5.44767231347586e-06, "loss": 0.2017, "step": 7070 }, { "epoch": 2.438889846578176, "grad_norm": 1.0936315572392878, "learning_rate": 5.442111983091729e-06, "loss": 0.2327, "step": 7075 }, { "epoch": 2.440613687295294, "grad_norm": 1.0291507351238356, "learning_rate": 5.436551101588405e-06, "loss": 0.2031, "step": 7080 }, { "epoch": 2.4423375280124118, "grad_norm": 1.3108572256084867, "learning_rate": 5.430989675897861e-06, "loss": 0.2098, "step": 7085 }, { "epoch": 2.444061368729529, "grad_norm": 1.1755260710981985, "learning_rate": 5.425427712952748e-06, "loss": 0.2159, "step": 7090 }, { "epoch": 2.445785209446647, "grad_norm": 1.1144893583804825, "learning_rate": 5.419865219686389e-06, "loss": 0.2236, "step": 7095 }, { "epoch": 2.447509050163765, "grad_norm": 1.1173215912035803, "learning_rate": 5.414302203032766e-06, "loss": 0.2341, "step": 7100 }, { "epoch": 2.4492328908808827, "grad_norm": 1.287175299503757, "learning_rate": 5.408738669926517e-06, "loss": 0.1916, "step": 7105 }, { "epoch": 2.4509567315980005, "grad_norm": 1.0611457253042027, "learning_rate": 5.403174627302915e-06, "loss": 0.2125, "step": 7110 }, { "epoch": 2.452680572315118, "grad_norm": 1.1456019986733763, "learning_rate": 5.397610082097879e-06, "loss": 0.2121, "step": 7115 }, { "epoch": 2.4544044130322358, "grad_norm": 1.236233787814864, "learning_rate": 5.392045041247946e-06, "loss": 0.2456, "step": 7120 }, { "epoch": 2.4561282537493536, "grad_norm": 1.1703211485963458, "learning_rate": 5.386479511690276e-06, "loss": 0.2051, "step": 7125 }, { "epoch": 2.4578520944664715, "grad_norm": 1.2366522779193592, "learning_rate": 5.380913500362637e-06, "loss": 0.2297, "step": 7130 }, { "epoch": 2.459575935183589, "grad_norm": 1.2263911601430404, "learning_rate": 5.375347014203395e-06, "loss": 0.2327, "step": 7135 }, { "epoch": 2.4612997759007067, "grad_norm": 2.226172866100081, "learning_rate": 5.369780060151514e-06, "loss": 0.2347, "step": 7140 }, { "epoch": 2.4630236166178245, "grad_norm": 1.117299355633407, "learning_rate": 5.364212645146533e-06, "loss": 0.2153, "step": 7145 }, { "epoch": 2.4647474573349424, "grad_norm": 1.214567637730859, "learning_rate": 5.3586447761285724e-06, "loss": 0.2327, "step": 7150 }, { "epoch": 2.46647129805206, "grad_norm": 1.1954676598235126, "learning_rate": 5.353076460038315e-06, "loss": 0.2241, "step": 7155 }, { "epoch": 2.4681951387691776, "grad_norm": 1.1572220939526712, "learning_rate": 5.347507703817001e-06, "loss": 0.2305, "step": 7160 }, { "epoch": 2.4699189794862955, "grad_norm": 1.3110562065182316, "learning_rate": 5.341938514406423e-06, "loss": 0.2382, "step": 7165 }, { "epoch": 2.4716428202034133, "grad_norm": 1.1170022862963467, "learning_rate": 5.3363688987489075e-06, "loss": 0.2342, "step": 7170 }, { "epoch": 2.473366660920531, "grad_norm": 1.216312857430115, "learning_rate": 5.330798863787318e-06, "loss": 0.2215, "step": 7175 }, { "epoch": 2.4750905016376485, "grad_norm": 1.207424663510382, "learning_rate": 5.3252284164650355e-06, "loss": 0.2248, "step": 7180 }, { "epoch": 2.4768143423547664, "grad_norm": 1.2043950960105698, "learning_rate": 5.319657563725962e-06, "loss": 0.1857, "step": 7185 }, { "epoch": 2.4785381830718842, "grad_norm": 1.2265575159938535, "learning_rate": 5.314086312514498e-06, "loss": 0.1999, "step": 7190 }, { "epoch": 2.480262023789002, "grad_norm": 1.069220922010664, "learning_rate": 5.3085146697755415e-06, "loss": 0.1926, "step": 7195 }, { "epoch": 2.4819858645061195, "grad_norm": 1.060777107033285, "learning_rate": 5.3029426424544865e-06, "loss": 0.191, "step": 7200 }, { "epoch": 2.4837097052232373, "grad_norm": 1.3349880458159542, "learning_rate": 5.297370237497194e-06, "loss": 0.2219, "step": 7205 }, { "epoch": 2.485433545940355, "grad_norm": 1.2248208490554113, "learning_rate": 5.291797461850004e-06, "loss": 0.2205, "step": 7210 }, { "epoch": 2.487157386657473, "grad_norm": 1.2652552640765649, "learning_rate": 5.28622432245972e-06, "loss": 0.2382, "step": 7215 }, { "epoch": 2.4888812273745904, "grad_norm": 0.9161267701940665, "learning_rate": 5.280650826273591e-06, "loss": 0.2138, "step": 7220 }, { "epoch": 2.4906050680917082, "grad_norm": 1.0591088864957827, "learning_rate": 5.2750769802393195e-06, "loss": 0.1925, "step": 7225 }, { "epoch": 2.492328908808826, "grad_norm": 0.9870998756199474, "learning_rate": 5.269502791305037e-06, "loss": 0.1954, "step": 7230 }, { "epoch": 2.494052749525944, "grad_norm": 1.1463081762145328, "learning_rate": 5.263928266419306e-06, "loss": 0.206, "step": 7235 }, { "epoch": 2.4957765902430618, "grad_norm": 1.2207867491896367, "learning_rate": 5.258353412531109e-06, "loss": 0.2104, "step": 7240 }, { "epoch": 2.497500430960179, "grad_norm": 1.201890758904529, "learning_rate": 5.252778236589834e-06, "loss": 0.2071, "step": 7245 }, { "epoch": 2.499224271677297, "grad_norm": 1.2846657038195175, "learning_rate": 5.247202745545277e-06, "loss": 0.1908, "step": 7250 }, { "epoch": 2.500948112394415, "grad_norm": 1.229337153227972, "learning_rate": 5.241626946347617e-06, "loss": 0.2227, "step": 7255 }, { "epoch": 2.5026719531115322, "grad_norm": 1.2120176108004994, "learning_rate": 5.236050845947433e-06, "loss": 0.1957, "step": 7260 }, { "epoch": 2.50439579382865, "grad_norm": 1.0973018698702013, "learning_rate": 5.230474451295659e-06, "loss": 0.217, "step": 7265 }, { "epoch": 2.506119634545768, "grad_norm": 1.140347075779373, "learning_rate": 5.2248977693436154e-06, "loss": 0.2328, "step": 7270 }, { "epoch": 2.5078434752628858, "grad_norm": 1.1809774800075683, "learning_rate": 5.219320807042965e-06, "loss": 0.1994, "step": 7275 }, { "epoch": 2.5095673159800036, "grad_norm": 1.2874965605523885, "learning_rate": 5.21374357134573e-06, "loss": 0.2145, "step": 7280 }, { "epoch": 2.511291156697121, "grad_norm": 1.1577850174966156, "learning_rate": 5.208166069204274e-06, "loss": 0.2127, "step": 7285 }, { "epoch": 2.513014997414239, "grad_norm": 1.7529340656558474, "learning_rate": 5.202588307571282e-06, "loss": 0.2123, "step": 7290 }, { "epoch": 2.5147388381313567, "grad_norm": 1.4061356963194789, "learning_rate": 5.197010293399774e-06, "loss": 0.2089, "step": 7295 }, { "epoch": 2.5164626788484745, "grad_norm": 1.1149404139701187, "learning_rate": 5.191432033643078e-06, "loss": 0.1989, "step": 7300 }, { "epoch": 2.5181865195655924, "grad_norm": 1.2917225279499973, "learning_rate": 5.185853535254832e-06, "loss": 0.2468, "step": 7305 }, { "epoch": 2.5199103602827098, "grad_norm": 1.1374273101044257, "learning_rate": 5.1802748051889715e-06, "loss": 0.1891, "step": 7310 }, { "epoch": 2.5216342009998276, "grad_norm": 1.102091147672205, "learning_rate": 5.1746958503997154e-06, "loss": 0.2135, "step": 7315 }, { "epoch": 2.5233580417169454, "grad_norm": 1.2681242359822413, "learning_rate": 5.16911667784157e-06, "loss": 0.2383, "step": 7320 }, { "epoch": 2.525081882434063, "grad_norm": 1.171266354531671, "learning_rate": 5.163537294469308e-06, "loss": 0.1761, "step": 7325 }, { "epoch": 2.5268057231511807, "grad_norm": 1.056180065049999, "learning_rate": 5.1579577072379676e-06, "loss": 0.2077, "step": 7330 }, { "epoch": 2.5285295638682985, "grad_norm": 1.1330512349452793, "learning_rate": 5.152377923102836e-06, "loss": 0.218, "step": 7335 }, { "epoch": 2.5302534045854164, "grad_norm": 1.1441240829036081, "learning_rate": 5.146797949019455e-06, "loss": 0.2062, "step": 7340 }, { "epoch": 2.531977245302534, "grad_norm": 1.3887357240844742, "learning_rate": 5.141217791943597e-06, "loss": 0.2208, "step": 7345 }, { "epoch": 2.5337010860196516, "grad_norm": 1.1307112493676725, "learning_rate": 5.135637458831262e-06, "loss": 0.2074, "step": 7350 }, { "epoch": 2.5354249267367694, "grad_norm": 1.2282684352585873, "learning_rate": 5.1300569566386725e-06, "loss": 0.2079, "step": 7355 }, { "epoch": 2.5371487674538873, "grad_norm": 1.1569630718124229, "learning_rate": 5.124476292322259e-06, "loss": 0.2168, "step": 7360 }, { "epoch": 2.538872608171005, "grad_norm": 1.0466275191099756, "learning_rate": 5.1188954728386565e-06, "loss": 0.2302, "step": 7365 }, { "epoch": 2.540596448888123, "grad_norm": 3.0032131582027364, "learning_rate": 5.113314505144693e-06, "loss": 0.23, "step": 7370 }, { "epoch": 2.5423202896052404, "grad_norm": 0.9610460318468603, "learning_rate": 5.107733396197379e-06, "loss": 0.2173, "step": 7375 }, { "epoch": 2.544044130322358, "grad_norm": 1.2169358707697866, "learning_rate": 5.102152152953903e-06, "loss": 0.2125, "step": 7380 }, { "epoch": 2.545767971039476, "grad_norm": 1.2078229706325732, "learning_rate": 5.09657078237162e-06, "loss": 0.223, "step": 7385 }, { "epoch": 2.5474918117565934, "grad_norm": 1.266319393380179, "learning_rate": 5.090989291408047e-06, "loss": 0.2116, "step": 7390 }, { "epoch": 2.5492156524737113, "grad_norm": 1.1638609253026335, "learning_rate": 5.0854076870208456e-06, "loss": 0.1952, "step": 7395 }, { "epoch": 2.550939493190829, "grad_norm": 1.1923136092049034, "learning_rate": 5.079825976167821e-06, "loss": 0.209, "step": 7400 }, { "epoch": 2.552663333907947, "grad_norm": 1.2901463135572986, "learning_rate": 5.074244165806915e-06, "loss": 0.2333, "step": 7405 }, { "epoch": 2.554387174625065, "grad_norm": 1.3062271974022313, "learning_rate": 5.068662262896189e-06, "loss": 0.2201, "step": 7410 }, { "epoch": 2.556111015342182, "grad_norm": 1.1748899193790712, "learning_rate": 5.063080274393818e-06, "loss": 0.2201, "step": 7415 }, { "epoch": 2.5578348560593, "grad_norm": 1.251431560253386, "learning_rate": 5.05749820725809e-06, "loss": 0.2183, "step": 7420 }, { "epoch": 2.559558696776418, "grad_norm": 1.3622389217298134, "learning_rate": 5.051916068447387e-06, "loss": 0.2192, "step": 7425 }, { "epoch": 2.5612825374935357, "grad_norm": 1.3864641540898042, "learning_rate": 5.04633386492018e-06, "loss": 0.2296, "step": 7430 }, { "epoch": 2.5630063782106536, "grad_norm": 1.2952115427774917, "learning_rate": 5.040751603635021e-06, "loss": 0.2069, "step": 7435 }, { "epoch": 2.564730218927771, "grad_norm": 1.3404304069860504, "learning_rate": 5.035169291550537e-06, "loss": 0.1924, "step": 7440 }, { "epoch": 2.566454059644889, "grad_norm": 1.1326785798351073, "learning_rate": 5.029586935625413e-06, "loss": 0.1858, "step": 7445 }, { "epoch": 2.5681779003620067, "grad_norm": 1.290488194981013, "learning_rate": 5.024004542818396e-06, "loss": 0.2089, "step": 7450 }, { "epoch": 2.569901741079124, "grad_norm": 1.36755491149153, "learning_rate": 5.01842212008827e-06, "loss": 0.2172, "step": 7455 }, { "epoch": 2.571625581796242, "grad_norm": 1.1642713513439564, "learning_rate": 5.012839674393861e-06, "loss": 0.2017, "step": 7460 }, { "epoch": 2.5733494225133597, "grad_norm": 1.0664883925426987, "learning_rate": 5.007257212694028e-06, "loss": 0.2187, "step": 7465 }, { "epoch": 2.5750732632304776, "grad_norm": 1.1524668732702383, "learning_rate": 5.001674741947641e-06, "loss": 0.2161, "step": 7470 }, { "epoch": 2.5767971039475954, "grad_norm": 1.045590352929241, "learning_rate": 4.996092269113589e-06, "loss": 0.188, "step": 7475 }, { "epoch": 2.578520944664713, "grad_norm": 1.353761575390307, "learning_rate": 4.990509801150758e-06, "loss": 0.2193, "step": 7480 }, { "epoch": 2.5802447853818307, "grad_norm": 1.1343409334190961, "learning_rate": 4.984927345018028e-06, "loss": 0.1934, "step": 7485 }, { "epoch": 2.5819686260989485, "grad_norm": 1.1630259837276047, "learning_rate": 4.979344907674273e-06, "loss": 0.2324, "step": 7490 }, { "epoch": 2.5836924668160663, "grad_norm": 1.2007037888943761, "learning_rate": 4.973762496078333e-06, "loss": 0.2041, "step": 7495 }, { "epoch": 2.585416307533184, "grad_norm": 1.1891610793015506, "learning_rate": 4.9681801171890195e-06, "loss": 0.2206, "step": 7500 }, { "epoch": 2.5871401482503016, "grad_norm": 1.0931116205911549, "learning_rate": 4.9625977779651055e-06, "loss": 0.2195, "step": 7505 }, { "epoch": 2.5888639889674194, "grad_norm": 1.2641333552254657, "learning_rate": 4.957015485365314e-06, "loss": 0.2576, "step": 7510 }, { "epoch": 2.5905878296845373, "grad_norm": 1.5751221013193646, "learning_rate": 4.951433246348304e-06, "loss": 0.1911, "step": 7515 }, { "epoch": 2.5923116704016547, "grad_norm": 1.2284005377309903, "learning_rate": 4.945851067872677e-06, "loss": 0.2138, "step": 7520 }, { "epoch": 2.5940355111187725, "grad_norm": 1.2138876792673146, "learning_rate": 4.9402689568969516e-06, "loss": 0.222, "step": 7525 }, { "epoch": 2.5957593518358903, "grad_norm": 1.2531347514382467, "learning_rate": 4.934686920379567e-06, "loss": 0.2277, "step": 7530 }, { "epoch": 2.597483192553008, "grad_norm": 1.105439810964691, "learning_rate": 4.9291049652788645e-06, "loss": 0.2203, "step": 7535 }, { "epoch": 2.599207033270126, "grad_norm": 1.0691720932694833, "learning_rate": 4.923523098553091e-06, "loss": 0.2161, "step": 7540 }, { "epoch": 2.6009308739872434, "grad_norm": 1.1784358418650132, "learning_rate": 4.917941327160377e-06, "loss": 0.1953, "step": 7545 }, { "epoch": 2.6026547147043613, "grad_norm": 1.2263614236990952, "learning_rate": 4.912359658058736e-06, "loss": 0.1932, "step": 7550 }, { "epoch": 2.604378555421479, "grad_norm": 1.2296935329591632, "learning_rate": 4.906778098206058e-06, "loss": 0.2365, "step": 7555 }, { "epoch": 2.606102396138597, "grad_norm": 1.4035538705631023, "learning_rate": 4.901196654560088e-06, "loss": 0.1871, "step": 7560 }, { "epoch": 2.607826236855715, "grad_norm": 1.1686204347595222, "learning_rate": 4.895615334078437e-06, "loss": 0.2086, "step": 7565 }, { "epoch": 2.609550077572832, "grad_norm": 0.9935612979912067, "learning_rate": 4.89003414371855e-06, "loss": 0.2008, "step": 7570 }, { "epoch": 2.61127391828995, "grad_norm": 1.300964389093522, "learning_rate": 4.884453090437725e-06, "loss": 0.2027, "step": 7575 }, { "epoch": 2.612997759007068, "grad_norm": 1.1480951233356864, "learning_rate": 4.878872181193073e-06, "loss": 0.2249, "step": 7580 }, { "epoch": 2.6147215997241853, "grad_norm": 0.9923665704981476, "learning_rate": 4.873291422941536e-06, "loss": 0.1909, "step": 7585 }, { "epoch": 2.616445440441303, "grad_norm": 2.214346676798211, "learning_rate": 4.867710822639869e-06, "loss": 0.2231, "step": 7590 }, { "epoch": 2.618169281158421, "grad_norm": 1.2512831008438763, "learning_rate": 4.862130387244622e-06, "loss": 0.2164, "step": 7595 }, { "epoch": 2.619893121875539, "grad_norm": 1.3200386659107375, "learning_rate": 4.856550123712142e-06, "loss": 0.2102, "step": 7600 }, { "epoch": 2.6216169625926566, "grad_norm": 1.291299203564834, "learning_rate": 4.850970038998567e-06, "loss": 0.2006, "step": 7605 }, { "epoch": 2.623340803309774, "grad_norm": 1.3428618013758173, "learning_rate": 4.845390140059808e-06, "loss": 0.219, "step": 7610 }, { "epoch": 2.625064644026892, "grad_norm": 1.1621956627140024, "learning_rate": 4.839810433851543e-06, "loss": 0.1992, "step": 7615 }, { "epoch": 2.6267884847440097, "grad_norm": 1.0179151578072643, "learning_rate": 4.8342309273292115e-06, "loss": 0.1711, "step": 7620 }, { "epoch": 2.628512325461127, "grad_norm": 1.2793489697631724, "learning_rate": 4.828651627448006e-06, "loss": 0.2246, "step": 7625 }, { "epoch": 2.6302361661782454, "grad_norm": 1.2069027265954009, "learning_rate": 4.823072541162859e-06, "loss": 0.1937, "step": 7630 }, { "epoch": 2.631960006895363, "grad_norm": 1.1784427374902435, "learning_rate": 4.817493675428434e-06, "loss": 0.1983, "step": 7635 }, { "epoch": 2.6336838476124806, "grad_norm": 1.3110717127620901, "learning_rate": 4.81191503719913e-06, "loss": 0.2159, "step": 7640 }, { "epoch": 2.6354076883295985, "grad_norm": 1.1204792070203156, "learning_rate": 4.806336633429049e-06, "loss": 0.225, "step": 7645 }, { "epoch": 2.637131529046716, "grad_norm": 1.1322464392337706, "learning_rate": 4.800758471072009e-06, "loss": 0.2303, "step": 7650 }, { "epoch": 2.6388553697638337, "grad_norm": 1.418815982511749, "learning_rate": 4.795180557081524e-06, "loss": 0.1983, "step": 7655 }, { "epoch": 2.6405792104809516, "grad_norm": 1.2350238653831294, "learning_rate": 4.789602898410803e-06, "loss": 0.2217, "step": 7660 }, { "epoch": 2.6423030511980694, "grad_norm": 1.1627749751680085, "learning_rate": 4.78402550201273e-06, "loss": 0.1805, "step": 7665 }, { "epoch": 2.6440268919151872, "grad_norm": 1.3143049772616533, "learning_rate": 4.778448374839864e-06, "loss": 0.1871, "step": 7670 }, { "epoch": 2.6457507326323046, "grad_norm": 1.180059872412795, "learning_rate": 4.772871523844435e-06, "loss": 0.2064, "step": 7675 }, { "epoch": 2.6474745733494225, "grad_norm": 1.281356440545652, "learning_rate": 4.767294955978319e-06, "loss": 0.2069, "step": 7680 }, { "epoch": 2.6491984140665403, "grad_norm": 1.2451275420988035, "learning_rate": 4.761718678193044e-06, "loss": 0.2335, "step": 7685 }, { "epoch": 2.6509222547836577, "grad_norm": 1.1724257983704, "learning_rate": 4.756142697439775e-06, "loss": 0.1996, "step": 7690 }, { "epoch": 2.6526460955007756, "grad_norm": 1.0916232424849017, "learning_rate": 4.750567020669312e-06, "loss": 0.2075, "step": 7695 }, { "epoch": 2.6543699362178934, "grad_norm": 1.2713824407428382, "learning_rate": 4.744991654832067e-06, "loss": 0.216, "step": 7700 }, { "epoch": 2.6560937769350113, "grad_norm": 1.2605858193030177, "learning_rate": 4.739416606878069e-06, "loss": 0.1942, "step": 7705 }, { "epoch": 2.657817617652129, "grad_norm": 1.2291735786863593, "learning_rate": 4.733841883756954e-06, "loss": 0.1926, "step": 7710 }, { "epoch": 2.6595414583692465, "grad_norm": 1.2206656544882426, "learning_rate": 4.728267492417949e-06, "loss": 0.2106, "step": 7715 }, { "epoch": 2.6612652990863643, "grad_norm": 1.1937226387173594, "learning_rate": 4.722693439809866e-06, "loss": 0.2049, "step": 7720 }, { "epoch": 2.662989139803482, "grad_norm": 1.3494350997502214, "learning_rate": 4.717119732881099e-06, "loss": 0.2073, "step": 7725 }, { "epoch": 2.6647129805206, "grad_norm": 1.293557692804613, "learning_rate": 4.71154637857961e-06, "loss": 0.2338, "step": 7730 }, { "epoch": 2.666436821237718, "grad_norm": 1.1304063702876845, "learning_rate": 4.705973383852919e-06, "loss": 0.1987, "step": 7735 }, { "epoch": 2.6681606619548353, "grad_norm": 1.1252780123618584, "learning_rate": 4.700400755648098e-06, "loss": 0.2137, "step": 7740 }, { "epoch": 2.669884502671953, "grad_norm": 1.3044113305907792, "learning_rate": 4.694828500911766e-06, "loss": 0.1879, "step": 7745 }, { "epoch": 2.671608343389071, "grad_norm": 1.2407585884032672, "learning_rate": 4.689256626590073e-06, "loss": 0.2128, "step": 7750 }, { "epoch": 2.6733321841061883, "grad_norm": 1.1849482277456749, "learning_rate": 4.683685139628693e-06, "loss": 0.2129, "step": 7755 }, { "epoch": 2.675056024823306, "grad_norm": 1.1959407941225024, "learning_rate": 4.6781140469728255e-06, "loss": 0.1978, "step": 7760 }, { "epoch": 2.676779865540424, "grad_norm": 1.1666921527164538, "learning_rate": 4.672543355567168e-06, "loss": 0.2095, "step": 7765 }, { "epoch": 2.678503706257542, "grad_norm": 1.1131245840833146, "learning_rate": 4.666973072355925e-06, "loss": 0.2072, "step": 7770 }, { "epoch": 2.6802275469746597, "grad_norm": 1.207693667437074, "learning_rate": 4.661403204282786e-06, "loss": 0.2106, "step": 7775 }, { "epoch": 2.681951387691777, "grad_norm": 1.246143342849133, "learning_rate": 4.655833758290933e-06, "loss": 0.1956, "step": 7780 }, { "epoch": 2.683675228408895, "grad_norm": 1.1898634230084935, "learning_rate": 4.650264741323011e-06, "loss": 0.1918, "step": 7785 }, { "epoch": 2.685399069126013, "grad_norm": 1.2151672915313287, "learning_rate": 4.644696160321134e-06, "loss": 0.1881, "step": 7790 }, { "epoch": 2.6871229098431306, "grad_norm": 1.1383887903753034, "learning_rate": 4.639128022226879e-06, "loss": 0.2044, "step": 7795 }, { "epoch": 2.6888467505602485, "grad_norm": 1.1989490163151273, "learning_rate": 4.63356033398126e-06, "loss": 0.1903, "step": 7800 }, { "epoch": 2.690570591277366, "grad_norm": 1.2971416104279978, "learning_rate": 4.627993102524736e-06, "loss": 0.2024, "step": 7805 }, { "epoch": 2.6922944319944837, "grad_norm": 1.2950861054118248, "learning_rate": 4.622426334797196e-06, "loss": 0.1931, "step": 7810 }, { "epoch": 2.6940182727116015, "grad_norm": 1.4651119211485237, "learning_rate": 4.616860037737955e-06, "loss": 0.2175, "step": 7815 }, { "epoch": 2.695742113428719, "grad_norm": 1.0050843856974014, "learning_rate": 4.611294218285734e-06, "loss": 0.1945, "step": 7820 }, { "epoch": 2.697465954145837, "grad_norm": 1.2372108121867913, "learning_rate": 4.60572888337866e-06, "loss": 0.1995, "step": 7825 }, { "epoch": 2.6991897948629546, "grad_norm": 1.498612857700723, "learning_rate": 4.600164039954261e-06, "loss": 0.1956, "step": 7830 }, { "epoch": 2.7009136355800725, "grad_norm": 1.2330939883872856, "learning_rate": 4.5945996949494485e-06, "loss": 0.185, "step": 7835 }, { "epoch": 2.7026374762971903, "grad_norm": 1.2385919948447828, "learning_rate": 4.589035855300512e-06, "loss": 0.2134, "step": 7840 }, { "epoch": 2.7043613170143077, "grad_norm": 1.109916619555715, "learning_rate": 4.5834725279431155e-06, "loss": 0.2111, "step": 7845 }, { "epoch": 2.7060851577314256, "grad_norm": 1.2428499396738826, "learning_rate": 4.577909719812279e-06, "loss": 0.2094, "step": 7850 }, { "epoch": 2.7078089984485434, "grad_norm": 1.1963865432617258, "learning_rate": 4.572347437842379e-06, "loss": 0.1959, "step": 7855 }, { "epoch": 2.7095328391656612, "grad_norm": 1.319182663807647, "learning_rate": 4.566785688967131e-06, "loss": 0.1931, "step": 7860 }, { "epoch": 2.711256679882779, "grad_norm": 1.7260294326255932, "learning_rate": 4.561224480119595e-06, "loss": 0.1895, "step": 7865 }, { "epoch": 2.7129805205998965, "grad_norm": 1.071017979385581, "learning_rate": 4.555663818232149e-06, "loss": 0.1919, "step": 7870 }, { "epoch": 2.7147043613170143, "grad_norm": 1.021294790808218, "learning_rate": 4.550103710236492e-06, "loss": 0.1906, "step": 7875 }, { "epoch": 2.716428202034132, "grad_norm": 1.324442795469158, "learning_rate": 4.544544163063638e-06, "loss": 0.2148, "step": 7880 }, { "epoch": 2.7181520427512496, "grad_norm": 1.1920062038819825, "learning_rate": 4.5389851836438935e-06, "loss": 0.2109, "step": 7885 }, { "epoch": 2.7198758834683674, "grad_norm": 1.5379124195735483, "learning_rate": 4.533426778906861e-06, "loss": 0.1736, "step": 7890 }, { "epoch": 2.7215997241854852, "grad_norm": 1.2455543439708339, "learning_rate": 4.527868955781424e-06, "loss": 0.1904, "step": 7895 }, { "epoch": 2.723323564902603, "grad_norm": 1.4861350187627516, "learning_rate": 4.5223117211957505e-06, "loss": 0.1822, "step": 7900 }, { "epoch": 2.725047405619721, "grad_norm": 1.1171194416553463, "learning_rate": 4.516755082077261e-06, "loss": 0.1876, "step": 7905 }, { "epoch": 2.7267712463368383, "grad_norm": 1.3116132145694126, "learning_rate": 4.511199045352645e-06, "loss": 0.1948, "step": 7910 }, { "epoch": 2.728495087053956, "grad_norm": 1.2106137602824627, "learning_rate": 4.505643617947834e-06, "loss": 0.1872, "step": 7915 }, { "epoch": 2.730218927771074, "grad_norm": 1.4372986204495664, "learning_rate": 4.500088806788005e-06, "loss": 0.2101, "step": 7920 }, { "epoch": 2.731942768488192, "grad_norm": 1.1809474895261303, "learning_rate": 4.494534618797561e-06, "loss": 0.1994, "step": 7925 }, { "epoch": 2.7336666092053097, "grad_norm": 1.1966511234707295, "learning_rate": 4.4889810609001335e-06, "loss": 0.1792, "step": 7930 }, { "epoch": 2.735390449922427, "grad_norm": 1.1748529313376745, "learning_rate": 4.483428140018569e-06, "loss": 0.195, "step": 7935 }, { "epoch": 2.737114290639545, "grad_norm": 1.2912063021415345, "learning_rate": 4.477875863074914e-06, "loss": 0.2001, "step": 7940 }, { "epoch": 2.7388381313566628, "grad_norm": 1.0622864482871097, "learning_rate": 4.472324236990416e-06, "loss": 0.2066, "step": 7945 }, { "epoch": 2.74056197207378, "grad_norm": 1.1875791261114388, "learning_rate": 4.466773268685512e-06, "loss": 0.1968, "step": 7950 }, { "epoch": 2.742285812790898, "grad_norm": 1.2236668959815376, "learning_rate": 4.46122296507982e-06, "loss": 0.1754, "step": 7955 }, { "epoch": 2.744009653508016, "grad_norm": 1.406285223793381, "learning_rate": 4.455673333092123e-06, "loss": 0.1912, "step": 7960 }, { "epoch": 2.7457334942251337, "grad_norm": 1.1683515068454808, "learning_rate": 4.450124379640377e-06, "loss": 0.2101, "step": 7965 }, { "epoch": 2.7474573349422515, "grad_norm": 1.170486381157583, "learning_rate": 4.444576111641681e-06, "loss": 0.1931, "step": 7970 }, { "epoch": 2.749181175659369, "grad_norm": 1.2060903792039241, "learning_rate": 4.439028536012288e-06, "loss": 0.2111, "step": 7975 }, { "epoch": 2.7509050163764868, "grad_norm": 1.2409100345817432, "learning_rate": 4.433481659667583e-06, "loss": 0.1856, "step": 7980 }, { "epoch": 2.7526288570936046, "grad_norm": 1.1925647359048128, "learning_rate": 4.427935489522084e-06, "loss": 0.1781, "step": 7985 }, { "epoch": 2.7543526978107225, "grad_norm": 1.1619717711225186, "learning_rate": 4.422390032489423e-06, "loss": 0.2078, "step": 7990 }, { "epoch": 2.7560765385278403, "grad_norm": 1.0429201092240183, "learning_rate": 4.416845295482346e-06, "loss": 0.1796, "step": 7995 }, { "epoch": 2.7578003792449577, "grad_norm": 1.3352573254648352, "learning_rate": 4.411301285412703e-06, "loss": 0.2017, "step": 8000 }, { "epoch": 2.7595242199620755, "grad_norm": 1.1965571839421498, "learning_rate": 4.405758009191438e-06, "loss": 0.1895, "step": 8005 }, { "epoch": 2.7612480606791934, "grad_norm": 1.3542719573902489, "learning_rate": 4.400215473728573e-06, "loss": 0.1938, "step": 8010 }, { "epoch": 2.7629719013963108, "grad_norm": 1.112444871723316, "learning_rate": 4.394673685933215e-06, "loss": 0.1822, "step": 8015 }, { "epoch": 2.7646957421134286, "grad_norm": 1.2391166863423548, "learning_rate": 4.3891326527135375e-06, "loss": 0.2051, "step": 8020 }, { "epoch": 2.7664195828305465, "grad_norm": 1.2595252454074388, "learning_rate": 4.38359238097677e-06, "loss": 0.1984, "step": 8025 }, { "epoch": 2.7681434235476643, "grad_norm": 1.0939898221166287, "learning_rate": 4.3780528776291936e-06, "loss": 0.199, "step": 8030 }, { "epoch": 2.769867264264782, "grad_norm": 3.0232658688429344, "learning_rate": 4.3725141495761345e-06, "loss": 0.2269, "step": 8035 }, { "epoch": 2.7715911049818995, "grad_norm": 1.3255994971097103, "learning_rate": 4.366976203721952e-06, "loss": 0.2197, "step": 8040 }, { "epoch": 2.7733149456990174, "grad_norm": 1.466300530156325, "learning_rate": 4.361439046970024e-06, "loss": 0.2, "step": 8045 }, { "epoch": 2.775038786416135, "grad_norm": 1.14521302211061, "learning_rate": 4.3559026862227534e-06, "loss": 0.1732, "step": 8050 }, { "epoch": 2.7767626271332526, "grad_norm": 1.31163814279667, "learning_rate": 4.350367128381547e-06, "loss": 0.1872, "step": 8055 }, { "epoch": 2.778486467850371, "grad_norm": 1.383108648090669, "learning_rate": 4.3448323803468105e-06, "loss": 0.19, "step": 8060 }, { "epoch": 2.7802103085674883, "grad_norm": 1.2867511929277209, "learning_rate": 4.339298449017937e-06, "loss": 0.2079, "step": 8065 }, { "epoch": 2.781934149284606, "grad_norm": 1.1623867399276215, "learning_rate": 4.33376534129331e-06, "loss": 0.1961, "step": 8070 }, { "epoch": 2.783657990001724, "grad_norm": 1.2088762149212156, "learning_rate": 4.328233064070278e-06, "loss": 0.1903, "step": 8075 }, { "epoch": 2.7853818307188414, "grad_norm": 1.223823741388553, "learning_rate": 4.322701624245158e-06, "loss": 0.1595, "step": 8080 }, { "epoch": 2.787105671435959, "grad_norm": 1.1002836332974475, "learning_rate": 4.317171028713225e-06, "loss": 0.1929, "step": 8085 }, { "epoch": 2.788829512153077, "grad_norm": 1.0505110228506256, "learning_rate": 4.311641284368696e-06, "loss": 0.1847, "step": 8090 }, { "epoch": 2.790553352870195, "grad_norm": 1.034877504895325, "learning_rate": 4.306112398104732e-06, "loss": 0.1823, "step": 8095 }, { "epoch": 2.7922771935873127, "grad_norm": 1.0807104967684351, "learning_rate": 4.30058437681342e-06, "loss": 0.1894, "step": 8100 }, { "epoch": 2.79400103430443, "grad_norm": 1.139431942812383, "learning_rate": 4.295057227385776e-06, "loss": 0.2312, "step": 8105 }, { "epoch": 2.795724875021548, "grad_norm": 1.1277278495674539, "learning_rate": 4.28953095671172e-06, "loss": 0.1955, "step": 8110 }, { "epoch": 2.797448715738666, "grad_norm": 1.2303686781202476, "learning_rate": 4.284005571680081e-06, "loss": 0.2071, "step": 8115 }, { "epoch": 2.7991725564557832, "grad_norm": 1.2432317012431924, "learning_rate": 4.278481079178587e-06, "loss": 0.1909, "step": 8120 }, { "epoch": 2.800896397172901, "grad_norm": 1.2687097494661927, "learning_rate": 4.2729574860938484e-06, "loss": 0.2143, "step": 8125 }, { "epoch": 2.802620237890019, "grad_norm": 1.1667186420107287, "learning_rate": 4.267434799311357e-06, "loss": 0.1938, "step": 8130 }, { "epoch": 2.8043440786071367, "grad_norm": 1.4209068220987036, "learning_rate": 4.2619130257154726e-06, "loss": 0.1999, "step": 8135 }, { "epoch": 2.8060679193242546, "grad_norm": 1.1970763612085271, "learning_rate": 4.2563921721894216e-06, "loss": 0.2143, "step": 8140 }, { "epoch": 2.807791760041372, "grad_norm": 1.2960720448024432, "learning_rate": 4.250872245615278e-06, "loss": 0.2072, "step": 8145 }, { "epoch": 2.80951560075849, "grad_norm": 1.134691339864308, "learning_rate": 4.24535325287396e-06, "loss": 0.1699, "step": 8150 }, { "epoch": 2.8112394414756077, "grad_norm": 1.1327331828629736, "learning_rate": 4.239835200845229e-06, "loss": 0.1772, "step": 8155 }, { "epoch": 2.8129632821927255, "grad_norm": 1.2964504069699636, "learning_rate": 4.2343180964076675e-06, "loss": 0.2143, "step": 8160 }, { "epoch": 2.8146871229098434, "grad_norm": 1.3190192771614466, "learning_rate": 4.228801946438675e-06, "loss": 0.2055, "step": 8165 }, { "epoch": 2.8164109636269608, "grad_norm": 0.9720368175650838, "learning_rate": 4.22328675781447e-06, "loss": 0.1756, "step": 8170 }, { "epoch": 2.8181348043440786, "grad_norm": 1.2692431794174732, "learning_rate": 4.217772537410061e-06, "loss": 0.1987, "step": 8175 }, { "epoch": 2.8198586450611964, "grad_norm": 1.3730188514086452, "learning_rate": 4.212259292099261e-06, "loss": 0.2077, "step": 8180 }, { "epoch": 2.821582485778314, "grad_norm": 1.2171736835900917, "learning_rate": 4.206747028754656e-06, "loss": 0.1873, "step": 8185 }, { "epoch": 2.8233063264954317, "grad_norm": 1.171133534075351, "learning_rate": 4.201235754247621e-06, "loss": 0.1938, "step": 8190 }, { "epoch": 2.8250301672125495, "grad_norm": 1.2682037480479451, "learning_rate": 4.195725475448287e-06, "loss": 0.182, "step": 8195 }, { "epoch": 2.8267540079296674, "grad_norm": 1.4120655758507916, "learning_rate": 4.190216199225547e-06, "loss": 0.204, "step": 8200 }, { "epoch": 2.828477848646785, "grad_norm": 1.075863806220456, "learning_rate": 4.18470793244705e-06, "loss": 0.183, "step": 8205 }, { "epoch": 2.8302016893639026, "grad_norm": 1.1319415182795967, "learning_rate": 4.179200681979179e-06, "loss": 0.2123, "step": 8210 }, { "epoch": 2.8319255300810204, "grad_norm": 1.3054193049581715, "learning_rate": 4.173694454687053e-06, "loss": 0.206, "step": 8215 }, { "epoch": 2.8336493707981383, "grad_norm": 1.1157542815944936, "learning_rate": 4.168189257434515e-06, "loss": 0.164, "step": 8220 }, { "epoch": 2.835373211515256, "grad_norm": 1.1371507714261992, "learning_rate": 4.162685097084127e-06, "loss": 0.1931, "step": 8225 }, { "epoch": 2.837097052232374, "grad_norm": 1.1676006693921204, "learning_rate": 4.157181980497156e-06, "loss": 0.1971, "step": 8230 }, { "epoch": 2.8388208929494914, "grad_norm": 1.415367403210831, "learning_rate": 4.151679914533565e-06, "loss": 0.2069, "step": 8235 }, { "epoch": 2.840544733666609, "grad_norm": 1.3266681302230978, "learning_rate": 4.146178906052013e-06, "loss": 0.178, "step": 8240 }, { "epoch": 2.842268574383727, "grad_norm": 1.286262464013329, "learning_rate": 4.140678961909838e-06, "loss": 0.208, "step": 8245 }, { "epoch": 2.8439924151008444, "grad_norm": 1.7348548171576503, "learning_rate": 4.1351800889630515e-06, "loss": 0.1719, "step": 8250 }, { "epoch": 2.8457162558179623, "grad_norm": 1.185900473511365, "learning_rate": 4.129682294066327e-06, "loss": 0.1814, "step": 8255 }, { "epoch": 2.84744009653508, "grad_norm": 1.262898454175873, "learning_rate": 4.124185584072999e-06, "loss": 0.1939, "step": 8260 }, { "epoch": 2.849163937252198, "grad_norm": 1.270852337544755, "learning_rate": 4.118689965835048e-06, "loss": 0.2088, "step": 8265 }, { "epoch": 2.850887777969316, "grad_norm": 1.222777687393257, "learning_rate": 4.11319544620309e-06, "loss": 0.1866, "step": 8270 }, { "epoch": 2.852611618686433, "grad_norm": 1.250304059645102, "learning_rate": 4.107702032026378e-06, "loss": 0.1842, "step": 8275 }, { "epoch": 2.854335459403551, "grad_norm": 1.276360860707752, "learning_rate": 4.10220973015278e-06, "loss": 0.1919, "step": 8280 }, { "epoch": 2.856059300120669, "grad_norm": 1.21363515890293, "learning_rate": 4.096718547428781e-06, "loss": 0.1933, "step": 8285 }, { "epoch": 2.8577831408377867, "grad_norm": 1.3488690642459205, "learning_rate": 4.091228490699474e-06, "loss": 0.1848, "step": 8290 }, { "epoch": 2.8595069815549046, "grad_norm": 1.3131588721368845, "learning_rate": 4.085739566808545e-06, "loss": 0.1888, "step": 8295 }, { "epoch": 2.861230822272022, "grad_norm": 1.9331377583476947, "learning_rate": 4.080251782598263e-06, "loss": 0.1741, "step": 8300 }, { "epoch": 2.86295466298914, "grad_norm": 1.2358601570039363, "learning_rate": 4.074765144909485e-06, "loss": 0.1894, "step": 8305 }, { "epoch": 2.8646785037062577, "grad_norm": 1.017128065952739, "learning_rate": 4.069279660581635e-06, "loss": 0.1964, "step": 8310 }, { "epoch": 2.866402344423375, "grad_norm": 1.1091890388739514, "learning_rate": 4.0637953364526984e-06, "loss": 0.1839, "step": 8315 }, { "epoch": 2.868126185140493, "grad_norm": 1.2830530709685048, "learning_rate": 4.058312179359215e-06, "loss": 0.2149, "step": 8320 }, { "epoch": 2.8698500258576107, "grad_norm": 1.2324446018727235, "learning_rate": 4.052830196136272e-06, "loss": 0.1577, "step": 8325 }, { "epoch": 2.8715738665747286, "grad_norm": 1.0157886508807732, "learning_rate": 4.04734939361749e-06, "loss": 0.1857, "step": 8330 }, { "epoch": 2.8732977072918464, "grad_norm": 1.1557010837729673, "learning_rate": 4.041869778635018e-06, "loss": 0.2001, "step": 8335 }, { "epoch": 2.875021548008964, "grad_norm": 1.0784480225030633, "learning_rate": 4.036391358019526e-06, "loss": 0.1778, "step": 8340 }, { "epoch": 2.8767453887260817, "grad_norm": 1.2873100676424474, "learning_rate": 4.030914138600199e-06, "loss": 0.2083, "step": 8345 }, { "epoch": 2.8784692294431995, "grad_norm": 1.2389284327803995, "learning_rate": 4.025438127204717e-06, "loss": 0.1901, "step": 8350 }, { "epoch": 2.8801930701603173, "grad_norm": 1.2163933470381572, "learning_rate": 4.019963330659257e-06, "loss": 0.1967, "step": 8355 }, { "epoch": 2.881916910877435, "grad_norm": 1.2515375249766985, "learning_rate": 4.014489755788484e-06, "loss": 0.1855, "step": 8360 }, { "epoch": 2.8836407515945526, "grad_norm": 1.1211494136197395, "learning_rate": 4.00901740941554e-06, "loss": 0.1896, "step": 8365 }, { "epoch": 2.8853645923116704, "grad_norm": 1.1465954337843312, "learning_rate": 4.003546298362032e-06, "loss": 0.1614, "step": 8370 }, { "epoch": 2.8870884330287883, "grad_norm": 1.152720453143601, "learning_rate": 3.998076429448028e-06, "loss": 0.1992, "step": 8375 }, { "epoch": 2.8888122737459057, "grad_norm": 1.1167651038566924, "learning_rate": 3.992607809492051e-06, "loss": 0.1835, "step": 8380 }, { "epoch": 2.8905361144630235, "grad_norm": 1.189863677438235, "learning_rate": 3.987140445311065e-06, "loss": 0.1939, "step": 8385 }, { "epoch": 2.8922599551801413, "grad_norm": 1.170508660773413, "learning_rate": 3.981674343720466e-06, "loss": 0.1777, "step": 8390 }, { "epoch": 2.893983795897259, "grad_norm": 1.1915528504055501, "learning_rate": 3.976209511534083e-06, "loss": 0.196, "step": 8395 }, { "epoch": 2.895707636614377, "grad_norm": 1.301325060283411, "learning_rate": 3.9707459555641535e-06, "loss": 0.2084, "step": 8400 }, { "epoch": 2.8974314773314944, "grad_norm": 1.2661027616303373, "learning_rate": 3.965283682621329e-06, "loss": 0.1883, "step": 8405 }, { "epoch": 2.8991553180486123, "grad_norm": 1.219275075233274, "learning_rate": 3.959822699514667e-06, "loss": 0.1773, "step": 8410 }, { "epoch": 2.90087915876573, "grad_norm": 1.0979884941554234, "learning_rate": 3.9543630130516065e-06, "loss": 0.1853, "step": 8415 }, { "epoch": 2.902602999482848, "grad_norm": 1.5017483872419184, "learning_rate": 3.948904630037976e-06, "loss": 0.1555, "step": 8420 }, { "epoch": 2.904326840199966, "grad_norm": 1.2372063293423552, "learning_rate": 3.943447557277978e-06, "loss": 0.2129, "step": 8425 }, { "epoch": 2.906050680917083, "grad_norm": 1.050827856052223, "learning_rate": 3.937991801574185e-06, "loss": 0.1852, "step": 8430 }, { "epoch": 2.907774521634201, "grad_norm": 1.2096691108510387, "learning_rate": 3.932537369727523e-06, "loss": 0.177, "step": 8435 }, { "epoch": 2.909498362351319, "grad_norm": 1.181342471659955, "learning_rate": 3.927084268537266e-06, "loss": 0.2017, "step": 8440 }, { "epoch": 2.9112222030684363, "grad_norm": 1.1402574108423542, "learning_rate": 3.92163250480104e-06, "loss": 0.1905, "step": 8445 }, { "epoch": 2.912946043785554, "grad_norm": 1.3165671771647727, "learning_rate": 3.916182085314791e-06, "loss": 0.2013, "step": 8450 }, { "epoch": 2.914669884502672, "grad_norm": 1.029576640286608, "learning_rate": 3.910733016872799e-06, "loss": 0.2004, "step": 8455 }, { "epoch": 2.91639372521979, "grad_norm": 1.0665272745829613, "learning_rate": 3.90528530626765e-06, "loss": 0.1671, "step": 8460 }, { "epoch": 2.9181175659369076, "grad_norm": 1.0233155093361852, "learning_rate": 3.899838960290248e-06, "loss": 0.1871, "step": 8465 }, { "epoch": 2.919841406654025, "grad_norm": 1.1827437485749752, "learning_rate": 3.89439398572979e-06, "loss": 0.1881, "step": 8470 }, { "epoch": 2.921565247371143, "grad_norm": 1.1658387291535435, "learning_rate": 3.8889503893737625e-06, "loss": 0.1973, "step": 8475 }, { "epoch": 2.9232890880882607, "grad_norm": 1.2430907347834113, "learning_rate": 3.883508178007939e-06, "loss": 0.1719, "step": 8480 }, { "epoch": 2.925012928805378, "grad_norm": 1.1373376572837053, "learning_rate": 3.878067358416361e-06, "loss": 0.1787, "step": 8485 }, { "epoch": 2.9267367695224964, "grad_norm": 1.3601094211432447, "learning_rate": 3.872627937381338e-06, "loss": 0.1932, "step": 8490 }, { "epoch": 2.928460610239614, "grad_norm": 1.322640625687631, "learning_rate": 3.867189921683439e-06, "loss": 0.1981, "step": 8495 }, { "epoch": 2.9301844509567316, "grad_norm": 1.2719584425802242, "learning_rate": 3.861753318101473e-06, "loss": 0.1904, "step": 8500 }, { "epoch": 2.9319082916738495, "grad_norm": 1.4712530394725756, "learning_rate": 3.856318133412495e-06, "loss": 0.1706, "step": 8505 }, { "epoch": 2.933632132390967, "grad_norm": 1.3682269449776316, "learning_rate": 3.850884374391791e-06, "loss": 0.1615, "step": 8510 }, { "epoch": 2.9353559731080847, "grad_norm": 1.2792338243916719, "learning_rate": 3.845452047812868e-06, "loss": 0.1795, "step": 8515 }, { "epoch": 2.9370798138252026, "grad_norm": 1.256542313984073, "learning_rate": 3.840021160447448e-06, "loss": 0.201, "step": 8520 }, { "epoch": 2.9388036545423204, "grad_norm": 1.1712058086383184, "learning_rate": 3.8345917190654585e-06, "loss": 0.1713, "step": 8525 }, { "epoch": 2.9405274952594382, "grad_norm": 1.2398605762854835, "learning_rate": 3.829163730435025e-06, "loss": 0.1767, "step": 8530 }, { "epoch": 2.9422513359765556, "grad_norm": 1.1848463789189871, "learning_rate": 3.823737201322465e-06, "loss": 0.199, "step": 8535 }, { "epoch": 2.9439751766936735, "grad_norm": 1.303223292226999, "learning_rate": 3.818312138492268e-06, "loss": 0.1825, "step": 8540 }, { "epoch": 2.9456990174107913, "grad_norm": 1.5157044205233943, "learning_rate": 3.812888548707104e-06, "loss": 0.2029, "step": 8545 }, { "epoch": 2.9474228581279087, "grad_norm": 1.131704794695496, "learning_rate": 3.807466438727806e-06, "loss": 0.2079, "step": 8550 }, { "epoch": 2.9491466988450266, "grad_norm": 1.37739915064286, "learning_rate": 3.8020458153133586e-06, "loss": 0.1887, "step": 8555 }, { "epoch": 2.9508705395621444, "grad_norm": 1.2024852771849421, "learning_rate": 3.7966266852208934e-06, "loss": 0.1844, "step": 8560 }, { "epoch": 2.9525943802792622, "grad_norm": 1.2390752925242332, "learning_rate": 3.7912090552056847e-06, "loss": 0.1723, "step": 8565 }, { "epoch": 2.95431822099638, "grad_norm": 1.1572358198879844, "learning_rate": 3.7857929320211343e-06, "loss": 0.2092, "step": 8570 }, { "epoch": 2.9560420617134975, "grad_norm": 1.245744561744381, "learning_rate": 3.7803783224187657e-06, "loss": 0.1836, "step": 8575 }, { "epoch": 2.9577659024306153, "grad_norm": 1.245313112244567, "learning_rate": 3.7749652331482124e-06, "loss": 0.1824, "step": 8580 }, { "epoch": 2.959489743147733, "grad_norm": 1.2199876730603632, "learning_rate": 3.7695536709572194e-06, "loss": 0.1741, "step": 8585 }, { "epoch": 2.961213583864851, "grad_norm": 1.1106009111986281, "learning_rate": 3.764143642591625e-06, "loss": 0.1868, "step": 8590 }, { "epoch": 2.962937424581969, "grad_norm": 1.2264086772639844, "learning_rate": 3.7587351547953516e-06, "loss": 0.1709, "step": 8595 }, { "epoch": 2.9646612652990862, "grad_norm": 1.1811997158930236, "learning_rate": 3.753328214310409e-06, "loss": 0.1935, "step": 8600 }, { "epoch": 2.966385106016204, "grad_norm": 1.252239759339723, "learning_rate": 3.74792282787687e-06, "loss": 0.1679, "step": 8605 }, { "epoch": 2.968108946733322, "grad_norm": 1.1660064316973588, "learning_rate": 3.7425190022328763e-06, "loss": 0.2019, "step": 8610 }, { "epoch": 2.9698327874504393, "grad_norm": 1.1306830827070342, "learning_rate": 3.737116744114622e-06, "loss": 0.1773, "step": 8615 }, { "epoch": 2.971556628167557, "grad_norm": 1.1179096381911475, "learning_rate": 3.7317160602563473e-06, "loss": 0.1662, "step": 8620 }, { "epoch": 2.973280468884675, "grad_norm": 1.2131794282445627, "learning_rate": 3.7263169573903274e-06, "loss": 0.1724, "step": 8625 }, { "epoch": 2.975004309601793, "grad_norm": 1.5168740661484055, "learning_rate": 3.7209194422468684e-06, "loss": 0.194, "step": 8630 }, { "epoch": 2.9767281503189107, "grad_norm": 1.3771843753035093, "learning_rate": 3.715523521554303e-06, "loss": 0.198, "step": 8635 }, { "epoch": 2.978451991036028, "grad_norm": 1.1799172525576889, "learning_rate": 3.7101292020389666e-06, "loss": 0.1722, "step": 8640 }, { "epoch": 2.980175831753146, "grad_norm": 1.1540682340883583, "learning_rate": 3.7047364904252024e-06, "loss": 0.1728, "step": 8645 }, { "epoch": 2.9818996724702638, "grad_norm": 1.22785878372636, "learning_rate": 3.699345393435353e-06, "loss": 0.1819, "step": 8650 }, { "epoch": 2.9836235131873816, "grad_norm": 1.2329073959728665, "learning_rate": 3.6939559177897445e-06, "loss": 0.158, "step": 8655 }, { "epoch": 2.9853473539044995, "grad_norm": 1.292106741036609, "learning_rate": 3.688568070206682e-06, "loss": 0.1821, "step": 8660 }, { "epoch": 2.987071194621617, "grad_norm": 2.5148027142226557, "learning_rate": 3.6831818574024405e-06, "loss": 0.1739, "step": 8665 }, { "epoch": 2.9887950353387347, "grad_norm": 1.3617487795094294, "learning_rate": 3.6777972860912596e-06, "loss": 0.2088, "step": 8670 }, { "epoch": 2.9905188760558525, "grad_norm": 1.0917490198305588, "learning_rate": 3.6724143629853335e-06, "loss": 0.1868, "step": 8675 }, { "epoch": 2.99224271677297, "grad_norm": 1.3553058987589877, "learning_rate": 3.6670330947947953e-06, "loss": 0.1861, "step": 8680 }, { "epoch": 2.9939665574900878, "grad_norm": 1.3961356864483492, "learning_rate": 3.6616534882277242e-06, "loss": 0.2018, "step": 8685 }, { "epoch": 2.9956903982072056, "grad_norm": 1.1404126956620368, "learning_rate": 3.6562755499901207e-06, "loss": 0.2002, "step": 8690 }, { "epoch": 2.9974142389243235, "grad_norm": 1.3783271072832564, "learning_rate": 3.6508992867859104e-06, "loss": 0.1682, "step": 8695 }, { "epoch": 2.9991380796414413, "grad_norm": 1.188646039032073, "learning_rate": 3.645524705316926e-06, "loss": 0.1904, "step": 8700 }, { "epoch": 3.0006895362868473, "grad_norm": 1.0970171613103625, "learning_rate": 3.6401518122829103e-06, "loss": 0.1536, "step": 8705 }, { "epoch": 3.0024133770039647, "grad_norm": 1.092466946917964, "learning_rate": 3.6347806143814957e-06, "loss": 0.1731, "step": 8710 }, { "epoch": 3.0041372177210826, "grad_norm": 1.2313941069165162, "learning_rate": 3.629411118308202e-06, "loss": 0.1708, "step": 8715 }, { "epoch": 3.0058610584382004, "grad_norm": 1.3960767229370925, "learning_rate": 3.6240433307564337e-06, "loss": 0.1493, "step": 8720 }, { "epoch": 3.0075848991553182, "grad_norm": 1.3093850658962947, "learning_rate": 3.6186772584174577e-06, "loss": 0.1589, "step": 8725 }, { "epoch": 3.0093087398724356, "grad_norm": 1.2456216474500965, "learning_rate": 3.6133129079804064e-06, "loss": 0.1647, "step": 8730 }, { "epoch": 3.0110325805895535, "grad_norm": 1.4066648330317426, "learning_rate": 3.607950286132266e-06, "loss": 0.1783, "step": 8735 }, { "epoch": 3.0127564213066713, "grad_norm": 1.189881531728998, "learning_rate": 3.602589399557869e-06, "loss": 0.1724, "step": 8740 }, { "epoch": 3.014480262023789, "grad_norm": 1.3455052616634957, "learning_rate": 3.5972302549398795e-06, "loss": 0.1694, "step": 8745 }, { "epoch": 3.0162041027409066, "grad_norm": 1.229601294528069, "learning_rate": 3.591872858958796e-06, "loss": 0.1655, "step": 8750 }, { "epoch": 3.0179279434580244, "grad_norm": 1.3345614679194509, "learning_rate": 3.586517218292935e-06, "loss": 0.1806, "step": 8755 }, { "epoch": 3.0196517841751422, "grad_norm": 1.2245161134824287, "learning_rate": 3.5811633396184266e-06, "loss": 0.166, "step": 8760 }, { "epoch": 3.02137562489226, "grad_norm": 1.2149303448028828, "learning_rate": 3.5758112296091972e-06, "loss": 0.154, "step": 8765 }, { "epoch": 3.0230994656093775, "grad_norm": 1.2547650395188585, "learning_rate": 3.570460894936979e-06, "loss": 0.1623, "step": 8770 }, { "epoch": 3.0248233063264953, "grad_norm": 1.2832556765244343, "learning_rate": 3.5651123422712865e-06, "loss": 0.1753, "step": 8775 }, { "epoch": 3.026547147043613, "grad_norm": 1.1774052352792375, "learning_rate": 3.5597655782794096e-06, "loss": 0.1416, "step": 8780 }, { "epoch": 3.028270987760731, "grad_norm": 1.2228252426072648, "learning_rate": 3.5544206096264113e-06, "loss": 0.182, "step": 8785 }, { "epoch": 3.029994828477849, "grad_norm": 1.4863199015956217, "learning_rate": 3.5490774429751185e-06, "loss": 0.157, "step": 8790 }, { "epoch": 3.0317186691949662, "grad_norm": 1.1777303610532432, "learning_rate": 3.5437360849861103e-06, "loss": 0.1802, "step": 8795 }, { "epoch": 3.033442509912084, "grad_norm": 1.254018043291895, "learning_rate": 3.538396542317708e-06, "loss": 0.1702, "step": 8800 }, { "epoch": 3.035166350629202, "grad_norm": 1.122608652504421, "learning_rate": 3.533058821625977e-06, "loss": 0.1819, "step": 8805 }, { "epoch": 3.0368901913463198, "grad_norm": 1.4025133008197308, "learning_rate": 3.5277229295647043e-06, "loss": 0.1493, "step": 8810 }, { "epoch": 3.038614032063437, "grad_norm": 1.183473272076663, "learning_rate": 3.5223888727854018e-06, "loss": 0.1699, "step": 8815 }, { "epoch": 3.040337872780555, "grad_norm": 1.2998775767633, "learning_rate": 3.51705665793729e-06, "loss": 0.1604, "step": 8820 }, { "epoch": 3.042061713497673, "grad_norm": 1.136440256216976, "learning_rate": 3.5117262916673e-06, "loss": 0.1557, "step": 8825 }, { "epoch": 3.0437855542147907, "grad_norm": 1.3040492358993647, "learning_rate": 3.50639778062005e-06, "loss": 0.157, "step": 8830 }, { "epoch": 3.045509394931908, "grad_norm": 1.094331077347454, "learning_rate": 3.5010711314378498e-06, "loss": 0.1747, "step": 8835 }, { "epoch": 3.047233235649026, "grad_norm": 1.167773248184639, "learning_rate": 3.4957463507606924e-06, "loss": 0.1513, "step": 8840 }, { "epoch": 3.0489570763661438, "grad_norm": 1.243233315669377, "learning_rate": 3.4904234452262348e-06, "loss": 0.1457, "step": 8845 }, { "epoch": 3.0506809170832616, "grad_norm": 1.02399958288721, "learning_rate": 3.485102421469796e-06, "loss": 0.1412, "step": 8850 }, { "epoch": 3.0524047578003795, "grad_norm": 1.4348227706918033, "learning_rate": 3.479783286124357e-06, "loss": 0.1615, "step": 8855 }, { "epoch": 3.054128598517497, "grad_norm": 1.2206811934506423, "learning_rate": 3.4744660458205385e-06, "loss": 0.1622, "step": 8860 }, { "epoch": 3.0558524392346147, "grad_norm": 1.2443157593748708, "learning_rate": 3.469150707186601e-06, "loss": 0.1709, "step": 8865 }, { "epoch": 3.0575762799517325, "grad_norm": 1.3006298528543088, "learning_rate": 3.463837276848431e-06, "loss": 0.1814, "step": 8870 }, { "epoch": 3.0593001206688504, "grad_norm": 1.2221187416551949, "learning_rate": 3.4585257614295424e-06, "loss": 0.1586, "step": 8875 }, { "epoch": 3.0610239613859678, "grad_norm": 1.1420431171276149, "learning_rate": 3.453216167551059e-06, "loss": 0.1402, "step": 8880 }, { "epoch": 3.0627478021030856, "grad_norm": 1.282806457905716, "learning_rate": 3.447908501831706e-06, "loss": 0.1817, "step": 8885 }, { "epoch": 3.0644716428202035, "grad_norm": 1.1825323383956727, "learning_rate": 3.4426027708878125e-06, "loss": 0.1568, "step": 8890 }, { "epoch": 3.0661954835373213, "grad_norm": 1.234712663003976, "learning_rate": 3.437298981333288e-06, "loss": 0.1617, "step": 8895 }, { "epoch": 3.0679193242544387, "grad_norm": 0.9940273551718051, "learning_rate": 3.431997139779627e-06, "loss": 0.1434, "step": 8900 }, { "epoch": 3.0696431649715565, "grad_norm": 1.1911605035483424, "learning_rate": 3.426697252835891e-06, "loss": 0.1526, "step": 8905 }, { "epoch": 3.0713670056886744, "grad_norm": 1.2512560419389025, "learning_rate": 3.421399327108714e-06, "loss": 0.164, "step": 8910 }, { "epoch": 3.073090846405792, "grad_norm": 1.2924444069760832, "learning_rate": 3.4161033692022736e-06, "loss": 0.1451, "step": 8915 }, { "epoch": 3.0748146871229096, "grad_norm": 1.4258413720044054, "learning_rate": 3.410809385718301e-06, "loss": 0.1731, "step": 8920 }, { "epoch": 3.0765385278400275, "grad_norm": 1.2127107118447744, "learning_rate": 3.4055173832560694e-06, "loss": 0.1599, "step": 8925 }, { "epoch": 3.0782623685571453, "grad_norm": 1.4795382920841373, "learning_rate": 3.400227368412373e-06, "loss": 0.1773, "step": 8930 }, { "epoch": 3.079986209274263, "grad_norm": 1.4073976625245204, "learning_rate": 3.3949393477815374e-06, "loss": 0.1643, "step": 8935 }, { "epoch": 3.081710049991381, "grad_norm": 1.2603618596811135, "learning_rate": 3.3896533279553965e-06, "loss": 0.1594, "step": 8940 }, { "epoch": 3.0834338907084984, "grad_norm": 1.1700826310162284, "learning_rate": 3.384369315523294e-06, "loss": 0.1579, "step": 8945 }, { "epoch": 3.0851577314256162, "grad_norm": 1.2331113214960312, "learning_rate": 3.379087317072067e-06, "loss": 0.1624, "step": 8950 }, { "epoch": 3.086881572142734, "grad_norm": 1.1795130742565376, "learning_rate": 3.3738073391860443e-06, "loss": 0.1461, "step": 8955 }, { "epoch": 3.088605412859852, "grad_norm": 1.3371068033276374, "learning_rate": 3.3685293884470393e-06, "loss": 0.1612, "step": 8960 }, { "epoch": 3.0903292535769693, "grad_norm": 1.3044981159173494, "learning_rate": 3.363253471434334e-06, "loss": 0.1693, "step": 8965 }, { "epoch": 3.092053094294087, "grad_norm": 1.7031418448770361, "learning_rate": 3.357979594724673e-06, "loss": 0.1657, "step": 8970 }, { "epoch": 3.093776935011205, "grad_norm": 1.2422130712239046, "learning_rate": 3.3527077648922657e-06, "loss": 0.1548, "step": 8975 }, { "epoch": 3.095500775728323, "grad_norm": 1.2153967115568292, "learning_rate": 3.3474379885087636e-06, "loss": 0.1586, "step": 8980 }, { "epoch": 3.0972246164454402, "grad_norm": 1.2763439005942754, "learning_rate": 3.3421702721432596e-06, "loss": 0.1699, "step": 8985 }, { "epoch": 3.098948457162558, "grad_norm": 2.0238039028924923, "learning_rate": 3.336904622362278e-06, "loss": 0.1485, "step": 8990 }, { "epoch": 3.100672297879676, "grad_norm": 1.2516738258150344, "learning_rate": 3.33164104572977e-06, "loss": 0.1578, "step": 8995 }, { "epoch": 3.1023961385967938, "grad_norm": 1.2289166101128928, "learning_rate": 3.3263795488071017e-06, "loss": 0.1729, "step": 9000 }, { "epoch": 3.1041199793139116, "grad_norm": 1.3271495653163117, "learning_rate": 3.3211201381530413e-06, "loss": 0.1725, "step": 9005 }, { "epoch": 3.105843820031029, "grad_norm": 1.2248850728907936, "learning_rate": 3.3158628203237658e-06, "loss": 0.1538, "step": 9010 }, { "epoch": 3.107567660748147, "grad_norm": 1.2856247893709944, "learning_rate": 3.310607601872835e-06, "loss": 0.1614, "step": 9015 }, { "epoch": 3.1092915014652647, "grad_norm": 1.1875485025104728, "learning_rate": 3.305354489351197e-06, "loss": 0.1572, "step": 9020 }, { "epoch": 3.1110153421823825, "grad_norm": 1.3912249833335995, "learning_rate": 3.300103489307169e-06, "loss": 0.1606, "step": 9025 }, { "epoch": 3.1127391828995, "grad_norm": 1.2759584216954767, "learning_rate": 3.294854608286444e-06, "loss": 0.1735, "step": 9030 }, { "epoch": 3.1144630236166178, "grad_norm": 1.3218498639618694, "learning_rate": 3.289607852832064e-06, "loss": 0.1356, "step": 9035 }, { "epoch": 3.1161868643337356, "grad_norm": 1.1182668375520324, "learning_rate": 3.284363229484425e-06, "loss": 0.1593, "step": 9040 }, { "epoch": 3.1179107050508534, "grad_norm": 1.203530908352527, "learning_rate": 3.27912074478127e-06, "loss": 0.1523, "step": 9045 }, { "epoch": 3.119634545767971, "grad_norm": 1.1908506583990222, "learning_rate": 3.2738804052576683e-06, "loss": 0.1619, "step": 9050 }, { "epoch": 3.1213583864850887, "grad_norm": 1.3869705790615299, "learning_rate": 3.2686422174460176e-06, "loss": 0.1604, "step": 9055 }, { "epoch": 3.1230822272022065, "grad_norm": 1.164863663922533, "learning_rate": 3.2634061878760363e-06, "loss": 0.1652, "step": 9060 }, { "epoch": 3.1248060679193244, "grad_norm": 1.451389863606094, "learning_rate": 3.2581723230747507e-06, "loss": 0.1695, "step": 9065 }, { "epoch": 3.126529908636442, "grad_norm": 1.2556715058830605, "learning_rate": 3.2529406295664886e-06, "loss": 0.1753, "step": 9070 }, { "epoch": 3.1282537493535596, "grad_norm": 1.265263393868935, "learning_rate": 3.247711113872866e-06, "loss": 0.1754, "step": 9075 }, { "epoch": 3.1299775900706774, "grad_norm": 1.276515319127673, "learning_rate": 3.2424837825127943e-06, "loss": 0.1583, "step": 9080 }, { "epoch": 3.1317014307877953, "grad_norm": 1.4239641358944504, "learning_rate": 3.237258642002456e-06, "loss": 0.1487, "step": 9085 }, { "epoch": 3.133425271504913, "grad_norm": 1.2309278287313907, "learning_rate": 3.2320356988552994e-06, "loss": 0.1503, "step": 9090 }, { "epoch": 3.1351491122220305, "grad_norm": 1.1973779391393573, "learning_rate": 3.2268149595820424e-06, "loss": 0.1617, "step": 9095 }, { "epoch": 3.1368729529391484, "grad_norm": 1.4077947592991213, "learning_rate": 3.221596430690647e-06, "loss": 0.1922, "step": 9100 }, { "epoch": 3.138596793656266, "grad_norm": 1.0978739727298412, "learning_rate": 3.2163801186863266e-06, "loss": 0.1504, "step": 9105 }, { "epoch": 3.140320634373384, "grad_norm": 1.031431957954541, "learning_rate": 3.2111660300715235e-06, "loss": 0.1533, "step": 9110 }, { "epoch": 3.1420444750905014, "grad_norm": 1.4791723213322414, "learning_rate": 3.205954171345918e-06, "loss": 0.15, "step": 9115 }, { "epoch": 3.1437683158076193, "grad_norm": 1.185407698657408, "learning_rate": 3.2007445490064026e-06, "loss": 0.1519, "step": 9120 }, { "epoch": 3.145492156524737, "grad_norm": 1.8387350913492626, "learning_rate": 3.1955371695470844e-06, "loss": 0.1531, "step": 9125 }, { "epoch": 3.147215997241855, "grad_norm": 1.3424838946762745, "learning_rate": 3.1903320394592787e-06, "loss": 0.1405, "step": 9130 }, { "epoch": 3.148939837958973, "grad_norm": 1.4261443580634592, "learning_rate": 3.1851291652314907e-06, "loss": 0.1727, "step": 9135 }, { "epoch": 3.15066367867609, "grad_norm": 1.5221580856953272, "learning_rate": 3.179928553349418e-06, "loss": 0.1703, "step": 9140 }, { "epoch": 3.152387519393208, "grad_norm": 1.200721193330485, "learning_rate": 3.1747302102959334e-06, "loss": 0.1639, "step": 9145 }, { "epoch": 3.154111360110326, "grad_norm": 1.2843819585587974, "learning_rate": 3.169534142551087e-06, "loss": 0.1659, "step": 9150 }, { "epoch": 3.1558352008274437, "grad_norm": 1.3225029865479827, "learning_rate": 3.1643403565920894e-06, "loss": 0.1615, "step": 9155 }, { "epoch": 3.157559041544561, "grad_norm": 1.154229045189436, "learning_rate": 3.159148858893305e-06, "loss": 0.154, "step": 9160 }, { "epoch": 3.159282882261679, "grad_norm": 1.328864367745576, "learning_rate": 3.153959655926253e-06, "loss": 0.1594, "step": 9165 }, { "epoch": 3.161006722978797, "grad_norm": 1.2216420251964037, "learning_rate": 3.1487727541595847e-06, "loss": 0.1498, "step": 9170 }, { "epoch": 3.1627305636959147, "grad_norm": 1.2155244685570445, "learning_rate": 3.1435881600590823e-06, "loss": 0.1761, "step": 9175 }, { "epoch": 3.164454404413032, "grad_norm": 1.1016787276536528, "learning_rate": 3.138405880087658e-06, "loss": 0.142, "step": 9180 }, { "epoch": 3.16617824513015, "grad_norm": 1.285676065299186, "learning_rate": 3.1332259207053357e-06, "loss": 0.1531, "step": 9185 }, { "epoch": 3.1679020858472677, "grad_norm": 1.3820370607901018, "learning_rate": 3.128048288369245e-06, "loss": 0.1477, "step": 9190 }, { "epoch": 3.1696259265643856, "grad_norm": 1.276935464160731, "learning_rate": 3.122872989533616e-06, "loss": 0.1677, "step": 9195 }, { "epoch": 3.1713497672815034, "grad_norm": 1.2197500412036166, "learning_rate": 3.1177000306497705e-06, "loss": 0.1938, "step": 9200 }, { "epoch": 3.173073607998621, "grad_norm": 1.2295343099830522, "learning_rate": 3.1125294181661155e-06, "loss": 0.1633, "step": 9205 }, { "epoch": 3.1747974487157387, "grad_norm": 1.2348206207636296, "learning_rate": 3.107361158528126e-06, "loss": 0.1488, "step": 9210 }, { "epoch": 3.1765212894328565, "grad_norm": 1.2378880053981638, "learning_rate": 3.102195258178353e-06, "loss": 0.1393, "step": 9215 }, { "epoch": 3.1782451301499743, "grad_norm": 1.2305434710767567, "learning_rate": 3.0970317235563996e-06, "loss": 0.1679, "step": 9220 }, { "epoch": 3.1799689708670917, "grad_norm": 1.1849848594490677, "learning_rate": 3.0918705610989235e-06, "loss": 0.1467, "step": 9225 }, { "epoch": 3.1816928115842096, "grad_norm": 1.8960401239668225, "learning_rate": 3.0867117772396225e-06, "loss": 0.1426, "step": 9230 }, { "epoch": 3.1834166523013274, "grad_norm": 1.0708933155701055, "learning_rate": 3.0815553784092346e-06, "loss": 0.1622, "step": 9235 }, { "epoch": 3.1851404930184453, "grad_norm": 1.0740171225605655, "learning_rate": 3.0764013710355185e-06, "loss": 0.1733, "step": 9240 }, { "epoch": 3.1868643337355627, "grad_norm": 1.2588239522178575, "learning_rate": 3.0712497615432542e-06, "loss": 0.181, "step": 9245 }, { "epoch": 3.1885881744526805, "grad_norm": 1.1380999782943795, "learning_rate": 3.0661005563542356e-06, "loss": 0.154, "step": 9250 }, { "epoch": 3.1903120151697983, "grad_norm": 1.1114175392323502, "learning_rate": 3.060953761887256e-06, "loss": 0.1703, "step": 9255 }, { "epoch": 3.192035855886916, "grad_norm": 1.32524618177933, "learning_rate": 3.055809384558102e-06, "loss": 0.1495, "step": 9260 }, { "epoch": 3.1937596966040336, "grad_norm": 1.3807948928628861, "learning_rate": 3.0506674307795516e-06, "loss": 0.1738, "step": 9265 }, { "epoch": 3.1954835373211514, "grad_norm": 1.0145175130409587, "learning_rate": 3.0455279069613596e-06, "loss": 0.1388, "step": 9270 }, { "epoch": 3.1972073780382693, "grad_norm": 1.1246000437481154, "learning_rate": 3.0403908195102526e-06, "loss": 0.1545, "step": 9275 }, { "epoch": 3.198931218755387, "grad_norm": 1.2497428737835505, "learning_rate": 3.0352561748299157e-06, "loss": 0.1575, "step": 9280 }, { "epoch": 3.200655059472505, "grad_norm": 1.4730540337890046, "learning_rate": 3.0301239793209965e-06, "loss": 0.1576, "step": 9285 }, { "epoch": 3.2023789001896223, "grad_norm": 1.2264899431695482, "learning_rate": 3.0249942393810846e-06, "loss": 0.1621, "step": 9290 }, { "epoch": 3.20410274090674, "grad_norm": 1.339302228891098, "learning_rate": 3.0198669614047064e-06, "loss": 0.1443, "step": 9295 }, { "epoch": 3.205826581623858, "grad_norm": 1.2431049928184017, "learning_rate": 3.0147421517833274e-06, "loss": 0.1596, "step": 9300 }, { "epoch": 3.207550422340976, "grad_norm": 1.31662825026558, "learning_rate": 3.009619816905328e-06, "loss": 0.1514, "step": 9305 }, { "epoch": 3.2092742630580933, "grad_norm": 1.1377991158718492, "learning_rate": 3.0044999631560084e-06, "loss": 0.1405, "step": 9310 }, { "epoch": 3.210998103775211, "grad_norm": 1.2786348510357934, "learning_rate": 2.9993825969175717e-06, "loss": 0.1402, "step": 9315 }, { "epoch": 3.212721944492329, "grad_norm": 1.355896912109576, "learning_rate": 2.9942677245691266e-06, "loss": 0.1655, "step": 9320 }, { "epoch": 3.214445785209447, "grad_norm": 1.2767357811079234, "learning_rate": 2.989155352486667e-06, "loss": 0.1629, "step": 9325 }, { "epoch": 3.216169625926564, "grad_norm": 1.1945654397950884, "learning_rate": 2.9840454870430713e-06, "loss": 0.1468, "step": 9330 }, { "epoch": 3.217893466643682, "grad_norm": 1.2177112201203022, "learning_rate": 2.9789381346080985e-06, "loss": 0.1552, "step": 9335 }, { "epoch": 3.2196173073608, "grad_norm": 1.06642046128774, "learning_rate": 2.973833301548367e-06, "loss": 0.1298, "step": 9340 }, { "epoch": 3.2213411480779177, "grad_norm": 1.385445885995133, "learning_rate": 2.9687309942273606e-06, "loss": 0.1529, "step": 9345 }, { "epoch": 3.223064988795035, "grad_norm": 1.1548857080268446, "learning_rate": 2.9636312190054093e-06, "loss": 0.1416, "step": 9350 }, { "epoch": 3.224788829512153, "grad_norm": 1.2496366251161442, "learning_rate": 2.958533982239694e-06, "loss": 0.1603, "step": 9355 }, { "epoch": 3.226512670229271, "grad_norm": 1.198113134559372, "learning_rate": 2.953439290284224e-06, "loss": 0.1626, "step": 9360 }, { "epoch": 3.2282365109463886, "grad_norm": 1.3945918076744428, "learning_rate": 2.9483471494898396e-06, "loss": 0.1668, "step": 9365 }, { "epoch": 3.2299603516635065, "grad_norm": 1.271661001106605, "learning_rate": 2.943257566204203e-06, "loss": 0.1607, "step": 9370 }, { "epoch": 3.231684192380624, "grad_norm": 1.24702467300753, "learning_rate": 2.938170546771785e-06, "loss": 0.1297, "step": 9375 }, { "epoch": 3.2334080330977417, "grad_norm": 1.2246876912963742, "learning_rate": 2.9330860975338592e-06, "loss": 0.1597, "step": 9380 }, { "epoch": 3.2351318738148596, "grad_norm": 1.1898325900629065, "learning_rate": 2.9280042248285e-06, "loss": 0.1438, "step": 9385 }, { "epoch": 3.2368557145319774, "grad_norm": 1.1258341844437951, "learning_rate": 2.9229249349905686e-06, "loss": 0.1535, "step": 9390 }, { "epoch": 3.238579555249095, "grad_norm": 1.3075808292199873, "learning_rate": 2.917848234351702e-06, "loss": 0.1467, "step": 9395 }, { "epoch": 3.2403033959662126, "grad_norm": 1.4888915252167954, "learning_rate": 2.912774129240315e-06, "loss": 0.1236, "step": 9400 }, { "epoch": 3.2420272366833305, "grad_norm": 1.3130581874151135, "learning_rate": 2.9077026259815865e-06, "loss": 0.1512, "step": 9405 }, { "epoch": 3.2437510774004483, "grad_norm": 1.2233651203860043, "learning_rate": 2.9026337308974485e-06, "loss": 0.1514, "step": 9410 }, { "epoch": 3.2454749181175657, "grad_norm": 1.1621007724666295, "learning_rate": 2.8975674503065826e-06, "loss": 0.1422, "step": 9415 }, { "epoch": 3.2471987588346836, "grad_norm": 1.2324134122107149, "learning_rate": 2.8925037905244157e-06, "loss": 0.1526, "step": 9420 }, { "epoch": 3.2489225995518014, "grad_norm": 1.3375408126424149, "learning_rate": 2.887442757863103e-06, "loss": 0.161, "step": 9425 }, { "epoch": 3.2506464402689192, "grad_norm": 1.3044008937263296, "learning_rate": 2.8823843586315236e-06, "loss": 0.16, "step": 9430 }, { "epoch": 3.252370280986037, "grad_norm": 1.2832049055162995, "learning_rate": 2.877328599135282e-06, "loss": 0.1715, "step": 9435 }, { "epoch": 3.2540941217031545, "grad_norm": 1.3544162858132127, "learning_rate": 2.872275485676681e-06, "loss": 0.15, "step": 9440 }, { "epoch": 3.2558179624202723, "grad_norm": 1.2628491376935866, "learning_rate": 2.867225024554735e-06, "loss": 0.1595, "step": 9445 }, { "epoch": 3.25754180313739, "grad_norm": 1.0156045919606889, "learning_rate": 2.8621772220651445e-06, "loss": 0.1424, "step": 9450 }, { "epoch": 3.259265643854508, "grad_norm": 1.1795663228801603, "learning_rate": 2.8571320845003026e-06, "loss": 0.1398, "step": 9455 }, { "epoch": 3.2609894845716254, "grad_norm": 1.291572373617909, "learning_rate": 2.852089618149275e-06, "loss": 0.1449, "step": 9460 }, { "epoch": 3.2627133252887432, "grad_norm": 1.1917792856944078, "learning_rate": 2.847049829297799e-06, "loss": 0.175, "step": 9465 }, { "epoch": 3.264437166005861, "grad_norm": 1.1902174190693788, "learning_rate": 2.842012724228273e-06, "loss": 0.1517, "step": 9470 }, { "epoch": 3.266161006722979, "grad_norm": 1.1391828957317889, "learning_rate": 2.836978309219754e-06, "loss": 0.1561, "step": 9475 }, { "epoch": 3.2678848474400963, "grad_norm": 1.1556784576811463, "learning_rate": 2.831946590547945e-06, "loss": 0.1406, "step": 9480 }, { "epoch": 3.269608688157214, "grad_norm": 1.2708959197240675, "learning_rate": 2.8269175744851817e-06, "loss": 0.1441, "step": 9485 }, { "epoch": 3.271332528874332, "grad_norm": 1.0924787427958744, "learning_rate": 2.8218912673004394e-06, "loss": 0.1312, "step": 9490 }, { "epoch": 3.27305636959145, "grad_norm": 1.2769029353004335, "learning_rate": 2.8168676752593118e-06, "loss": 0.1594, "step": 9495 }, { "epoch": 3.2747802103085677, "grad_norm": 1.3795662728959874, "learning_rate": 2.8118468046240044e-06, "loss": 0.1597, "step": 9500 }, { "epoch": 3.276504051025685, "grad_norm": 1.2887238937193444, "learning_rate": 2.8068286616533403e-06, "loss": 0.1459, "step": 9505 }, { "epoch": 3.278227891742803, "grad_norm": 1.1808402704797365, "learning_rate": 2.801813252602734e-06, "loss": 0.1487, "step": 9510 }, { "epoch": 3.2799517324599208, "grad_norm": 1.5317936743745084, "learning_rate": 2.7968005837241934e-06, "loss": 0.1559, "step": 9515 }, { "epoch": 3.2816755731770386, "grad_norm": 1.2944915064473905, "learning_rate": 2.791790661266313e-06, "loss": 0.1471, "step": 9520 }, { "epoch": 3.283399413894156, "grad_norm": 1.4435572098572045, "learning_rate": 2.7867834914742653e-06, "loss": 0.1515, "step": 9525 }, { "epoch": 3.285123254611274, "grad_norm": 1.2507729186644416, "learning_rate": 2.781779080589787e-06, "loss": 0.1337, "step": 9530 }, { "epoch": 3.2868470953283917, "grad_norm": 1.2948469878334319, "learning_rate": 2.7767774348511744e-06, "loss": 0.1474, "step": 9535 }, { "epoch": 3.2885709360455095, "grad_norm": 1.1271553054927308, "learning_rate": 2.7717785604932845e-06, "loss": 0.1553, "step": 9540 }, { "epoch": 3.290294776762627, "grad_norm": 1.1780439193780539, "learning_rate": 2.7667824637475137e-06, "loss": 0.1308, "step": 9545 }, { "epoch": 3.292018617479745, "grad_norm": 1.364462227019429, "learning_rate": 2.761789150841796e-06, "loss": 0.1678, "step": 9550 }, { "epoch": 3.2937424581968626, "grad_norm": 1.2408534193952319, "learning_rate": 2.7567986280005956e-06, "loss": 0.1672, "step": 9555 }, { "epoch": 3.2954662989139805, "grad_norm": 1.3574064237829038, "learning_rate": 2.7518109014449004e-06, "loss": 0.1358, "step": 9560 }, { "epoch": 3.2971901396310983, "grad_norm": 1.2879046454538696, "learning_rate": 2.746825977392214e-06, "loss": 0.1617, "step": 9565 }, { "epoch": 3.2989139803482157, "grad_norm": 1.21458954561244, "learning_rate": 2.7418438620565405e-06, "loss": 0.1643, "step": 9570 }, { "epoch": 3.3006378210653335, "grad_norm": 1.1591378646019665, "learning_rate": 2.736864561648391e-06, "loss": 0.146, "step": 9575 }, { "epoch": 3.3023616617824514, "grad_norm": 1.3420686126024113, "learning_rate": 2.7318880823747606e-06, "loss": 0.1621, "step": 9580 }, { "epoch": 3.3040855024995692, "grad_norm": 5.195289363418977, "learning_rate": 2.7269144304391304e-06, "loss": 0.1648, "step": 9585 }, { "epoch": 3.3058093432166866, "grad_norm": 1.4312003943267158, "learning_rate": 2.7219436120414546e-06, "loss": 0.1626, "step": 9590 }, { "epoch": 3.3075331839338045, "grad_norm": 1.1891300449792883, "learning_rate": 2.7169756333781613e-06, "loss": 0.1469, "step": 9595 }, { "epoch": 3.3092570246509223, "grad_norm": 1.2300564203284647, "learning_rate": 2.712010500642131e-06, "loss": 0.1627, "step": 9600 }, { "epoch": 3.31098086536804, "grad_norm": 1.207605291762137, "learning_rate": 2.7070482200227027e-06, "loss": 0.1419, "step": 9605 }, { "epoch": 3.3127047060851575, "grad_norm": 1.4119647051471251, "learning_rate": 2.7020887977056596e-06, "loss": 0.1466, "step": 9610 }, { "epoch": 3.3144285468022754, "grad_norm": 1.3414050244450053, "learning_rate": 2.697132239873218e-06, "loss": 0.1531, "step": 9615 }, { "epoch": 3.3161523875193932, "grad_norm": 1.2794188181412658, "learning_rate": 2.6921785527040245e-06, "loss": 0.1517, "step": 9620 }, { "epoch": 3.317876228236511, "grad_norm": 1.1383487525517915, "learning_rate": 2.687227742373151e-06, "loss": 0.1339, "step": 9625 }, { "epoch": 3.319600068953629, "grad_norm": 1.3706684361185986, "learning_rate": 2.6822798150520784e-06, "loss": 0.1458, "step": 9630 }, { "epoch": 3.3213239096707463, "grad_norm": 1.0840779345482268, "learning_rate": 2.6773347769086954e-06, "loss": 0.138, "step": 9635 }, { "epoch": 3.323047750387864, "grad_norm": 1.1299680926739593, "learning_rate": 2.672392634107292e-06, "loss": 0.1378, "step": 9640 }, { "epoch": 3.324771591104982, "grad_norm": 1.2968179813302492, "learning_rate": 2.667453392808543e-06, "loss": 0.1536, "step": 9645 }, { "epoch": 3.3264954318220994, "grad_norm": 1.113095759791961, "learning_rate": 2.6625170591695147e-06, "loss": 0.1537, "step": 9650 }, { "epoch": 3.3282192725392172, "grad_norm": 1.4863009178344746, "learning_rate": 2.6575836393436407e-06, "loss": 0.1629, "step": 9655 }, { "epoch": 3.329943113256335, "grad_norm": 1.1218714497049078, "learning_rate": 2.652653139480727e-06, "loss": 0.1381, "step": 9660 }, { "epoch": 3.331666953973453, "grad_norm": 1.2767619153443948, "learning_rate": 2.6477255657269385e-06, "loss": 0.1469, "step": 9665 }, { "epoch": 3.3333907946905708, "grad_norm": 1.2679028466844597, "learning_rate": 2.6428009242247923e-06, "loss": 0.1484, "step": 9670 }, { "epoch": 3.335114635407688, "grad_norm": 1.4103582032061193, "learning_rate": 2.637879221113147e-06, "loss": 0.1363, "step": 9675 }, { "epoch": 3.336838476124806, "grad_norm": 1.079104577316675, "learning_rate": 2.6329604625272056e-06, "loss": 0.1637, "step": 9680 }, { "epoch": 3.338562316841924, "grad_norm": 1.3996568137256045, "learning_rate": 2.628044654598497e-06, "loss": 0.1494, "step": 9685 }, { "epoch": 3.3402861575590417, "grad_norm": 1.2866884448424594, "learning_rate": 2.623131803454869e-06, "loss": 0.1424, "step": 9690 }, { "epoch": 3.3420099982761595, "grad_norm": 1.3298720378968754, "learning_rate": 2.6182219152204896e-06, "loss": 0.1491, "step": 9695 }, { "epoch": 3.343733838993277, "grad_norm": 1.1645690462893232, "learning_rate": 2.613314996015828e-06, "loss": 0.1449, "step": 9700 }, { "epoch": 3.3454576797103948, "grad_norm": 1.3657935717403653, "learning_rate": 2.6084110519576544e-06, "loss": 0.1551, "step": 9705 }, { "epoch": 3.3471815204275126, "grad_norm": 1.3394892472574793, "learning_rate": 2.6035100891590277e-06, "loss": 0.1373, "step": 9710 }, { "epoch": 3.34890536114463, "grad_norm": 1.271435935332484, "learning_rate": 2.5986121137292973e-06, "loss": 0.1616, "step": 9715 }, { "epoch": 3.350629201861748, "grad_norm": 1.2957387994542078, "learning_rate": 2.5937171317740808e-06, "loss": 0.1555, "step": 9720 }, { "epoch": 3.3523530425788657, "grad_norm": 1.3265098912489344, "learning_rate": 2.588825149395269e-06, "loss": 0.1654, "step": 9725 }, { "epoch": 3.3540768832959835, "grad_norm": 1.4816628313362459, "learning_rate": 2.583936172691015e-06, "loss": 0.1487, "step": 9730 }, { "epoch": 3.3558007240131014, "grad_norm": 1.3025356011399114, "learning_rate": 2.5790502077557193e-06, "loss": 0.1458, "step": 9735 }, { "epoch": 3.3575245647302188, "grad_norm": 3.289135084806184, "learning_rate": 2.574167260680031e-06, "loss": 0.1303, "step": 9740 }, { "epoch": 3.3592484054473366, "grad_norm": 1.1993669977774641, "learning_rate": 2.5692873375508397e-06, "loss": 0.1335, "step": 9745 }, { "epoch": 3.3609722461644544, "grad_norm": 1.4260518652936927, "learning_rate": 2.564410444451263e-06, "loss": 0.1516, "step": 9750 }, { "epoch": 3.3626960868815723, "grad_norm": 1.1741266972894133, "learning_rate": 2.5595365874606403e-06, "loss": 0.1488, "step": 9755 }, { "epoch": 3.36441992759869, "grad_norm": 1.5130574213876373, "learning_rate": 2.5546657726545267e-06, "loss": 0.1332, "step": 9760 }, { "epoch": 3.3661437683158075, "grad_norm": 1.4131270060140593, "learning_rate": 2.549798006104687e-06, "loss": 0.1398, "step": 9765 }, { "epoch": 3.3678676090329254, "grad_norm": 1.1117451529424736, "learning_rate": 2.544933293879087e-06, "loss": 0.1372, "step": 9770 }, { "epoch": 3.369591449750043, "grad_norm": 1.2688281117458118, "learning_rate": 2.540071642041881e-06, "loss": 0.1487, "step": 9775 }, { "epoch": 3.3713152904671606, "grad_norm": 1.2238693061663448, "learning_rate": 2.535213056653412e-06, "loss": 0.1385, "step": 9780 }, { "epoch": 3.3730391311842784, "grad_norm": 1.312250632914668, "learning_rate": 2.5303575437701992e-06, "loss": 0.1478, "step": 9785 }, { "epoch": 3.3747629719013963, "grad_norm": 1.5068816910492788, "learning_rate": 2.525505109444931e-06, "loss": 0.1309, "step": 9790 }, { "epoch": 3.376486812618514, "grad_norm": 1.4449125973083312, "learning_rate": 2.5206557597264565e-06, "loss": 0.1649, "step": 9795 }, { "epoch": 3.378210653335632, "grad_norm": 1.1630657360993657, "learning_rate": 2.515809500659786e-06, "loss": 0.1472, "step": 9800 }, { "epoch": 3.3799344940527494, "grad_norm": 3.8103986592465118, "learning_rate": 2.5109663382860695e-06, "loss": 0.1365, "step": 9805 }, { "epoch": 3.381658334769867, "grad_norm": 2.244838279442172, "learning_rate": 2.506126278642602e-06, "loss": 0.1393, "step": 9810 }, { "epoch": 3.383382175486985, "grad_norm": 1.8591193422511196, "learning_rate": 2.5012893277628104e-06, "loss": 0.1556, "step": 9815 }, { "epoch": 3.385106016204103, "grad_norm": 1.249234027724954, "learning_rate": 2.4964554916762446e-06, "loss": 0.1501, "step": 9820 }, { "epoch": 3.3868298569212203, "grad_norm": 1.3286169728192727, "learning_rate": 2.4916247764085694e-06, "loss": 0.125, "step": 9825 }, { "epoch": 3.388553697638338, "grad_norm": 1.2390694186350926, "learning_rate": 2.4867971879815656e-06, "loss": 0.1585, "step": 9830 }, { "epoch": 3.390277538355456, "grad_norm": 1.322906955054297, "learning_rate": 2.4819727324131114e-06, "loss": 0.1502, "step": 9835 }, { "epoch": 3.392001379072574, "grad_norm": 1.2656397505407209, "learning_rate": 2.4771514157171796e-06, "loss": 0.1615, "step": 9840 }, { "epoch": 3.393725219789691, "grad_norm": 1.1797504014624975, "learning_rate": 2.4723332439038337e-06, "loss": 0.1475, "step": 9845 }, { "epoch": 3.395449060506809, "grad_norm": 1.1838104403710576, "learning_rate": 2.4675182229792128e-06, "loss": 0.1226, "step": 9850 }, { "epoch": 3.397172901223927, "grad_norm": 1.1493895536343182, "learning_rate": 2.462706358945533e-06, "loss": 0.1355, "step": 9855 }, { "epoch": 3.3988967419410447, "grad_norm": 1.1756756014850063, "learning_rate": 2.4578976578010688e-06, "loss": 0.13, "step": 9860 }, { "epoch": 3.4006205826581626, "grad_norm": 1.2006110410571582, "learning_rate": 2.4530921255401597e-06, "loss": 0.1438, "step": 9865 }, { "epoch": 3.40234442337528, "grad_norm": 1.4174755741714786, "learning_rate": 2.4482897681531885e-06, "loss": 0.1672, "step": 9870 }, { "epoch": 3.404068264092398, "grad_norm": 1.2397634742682426, "learning_rate": 2.4434905916265827e-06, "loss": 0.1466, "step": 9875 }, { "epoch": 3.4057921048095157, "grad_norm": 1.408305791146792, "learning_rate": 2.438694601942803e-06, "loss": 0.1466, "step": 9880 }, { "epoch": 3.4075159455266335, "grad_norm": 1.331945928683061, "learning_rate": 2.4339018050803413e-06, "loss": 0.1436, "step": 9885 }, { "epoch": 3.409239786243751, "grad_norm": 1.0673855870985687, "learning_rate": 2.429112207013709e-06, "loss": 0.1278, "step": 9890 }, { "epoch": 3.4109636269608687, "grad_norm": 1.3167135709994189, "learning_rate": 2.4243258137134247e-06, "loss": 0.1592, "step": 9895 }, { "epoch": 3.4126874676779866, "grad_norm": 1.0382253330637887, "learning_rate": 2.4195426311460184e-06, "loss": 0.1369, "step": 9900 }, { "epoch": 3.4144113083951044, "grad_norm": 1.347839718990197, "learning_rate": 2.414762665274015e-06, "loss": 0.1356, "step": 9905 }, { "epoch": 3.416135149112222, "grad_norm": 1.7525700544410359, "learning_rate": 2.4099859220559272e-06, "loss": 0.1598, "step": 9910 }, { "epoch": 3.4178589898293397, "grad_norm": 1.2228510478538737, "learning_rate": 2.4052124074462535e-06, "loss": 0.1464, "step": 9915 }, { "epoch": 3.4195828305464575, "grad_norm": 1.3166200373784038, "learning_rate": 2.40044212739547e-06, "loss": 0.1492, "step": 9920 }, { "epoch": 3.4213066712635753, "grad_norm": 1.2029074828603865, "learning_rate": 2.395675087850013e-06, "loss": 0.1423, "step": 9925 }, { "epoch": 3.423030511980693, "grad_norm": 1.6016952521237071, "learning_rate": 2.390911294752287e-06, "loss": 0.1492, "step": 9930 }, { "epoch": 3.4247543526978106, "grad_norm": 1.1537569382242616, "learning_rate": 2.386150754040649e-06, "loss": 0.1527, "step": 9935 }, { "epoch": 3.4264781934149284, "grad_norm": 1.4708685951800293, "learning_rate": 2.3813934716493976e-06, "loss": 0.1547, "step": 9940 }, { "epoch": 3.4282020341320463, "grad_norm": 1.2702368885272195, "learning_rate": 2.3766394535087688e-06, "loss": 0.1753, "step": 9945 }, { "epoch": 3.429925874849164, "grad_norm": 1.186006739617199, "learning_rate": 2.3718887055449362e-06, "loss": 0.1192, "step": 9950 }, { "epoch": 3.4316497155662815, "grad_norm": 1.1776757786872523, "learning_rate": 2.367141233679992e-06, "loss": 0.1351, "step": 9955 }, { "epoch": 3.4333735562833994, "grad_norm": 1.2317207596419153, "learning_rate": 2.3623970438319456e-06, "loss": 0.1457, "step": 9960 }, { "epoch": 3.435097397000517, "grad_norm": 1.2744775172337612, "learning_rate": 2.357656141914712e-06, "loss": 0.1249, "step": 9965 }, { "epoch": 3.436821237717635, "grad_norm": 1.1027102832973195, "learning_rate": 2.352918533838114e-06, "loss": 0.1288, "step": 9970 }, { "epoch": 3.4385450784347524, "grad_norm": 1.2735620938476777, "learning_rate": 2.3481842255078662e-06, "loss": 0.1238, "step": 9975 }, { "epoch": 3.4402689191518703, "grad_norm": 1.3083824846662153, "learning_rate": 2.3434532228255653e-06, "loss": 0.123, "step": 9980 }, { "epoch": 3.441992759868988, "grad_norm": 1.1261896661897102, "learning_rate": 2.3387255316886947e-06, "loss": 0.1287, "step": 9985 }, { "epoch": 3.443716600586106, "grad_norm": 1.1577807763124193, "learning_rate": 2.334001157990604e-06, "loss": 0.153, "step": 9990 }, { "epoch": 3.445440441303224, "grad_norm": 1.7469172819956151, "learning_rate": 2.3292801076205095e-06, "loss": 0.1164, "step": 9995 }, { "epoch": 3.447164282020341, "grad_norm": 1.285500279625073, "learning_rate": 2.3245623864634823e-06, "loss": 0.153, "step": 10000 }, { "epoch": 3.448888122737459, "grad_norm": 1.1373202633733297, "learning_rate": 2.3198480004004503e-06, "loss": 0.1359, "step": 10005 }, { "epoch": 3.450611963454577, "grad_norm": 1.1995880423557541, "learning_rate": 2.3151369553081747e-06, "loss": 0.152, "step": 10010 }, { "epoch": 3.4523358041716947, "grad_norm": 1.3190939820004475, "learning_rate": 2.310429257059259e-06, "loss": 0.1636, "step": 10015 }, { "epoch": 3.454059644888812, "grad_norm": 1.4851426400529595, "learning_rate": 2.305724911522134e-06, "loss": 0.1408, "step": 10020 }, { "epoch": 3.45578348560593, "grad_norm": 1.2107923919088128, "learning_rate": 2.301023924561049e-06, "loss": 0.1335, "step": 10025 }, { "epoch": 3.457507326323048, "grad_norm": 1.4514363246494875, "learning_rate": 2.296326302036065e-06, "loss": 0.1515, "step": 10030 }, { "epoch": 3.4592311670401656, "grad_norm": 1.1785208273644654, "learning_rate": 2.2916320498030507e-06, "loss": 0.132, "step": 10035 }, { "epoch": 3.460955007757283, "grad_norm": 1.3728957597883333, "learning_rate": 2.2869411737136776e-06, "loss": 0.1191, "step": 10040 }, { "epoch": 3.462678848474401, "grad_norm": 1.1398710835306203, "learning_rate": 2.282253679615401e-06, "loss": 0.128, "step": 10045 }, { "epoch": 3.4644026891915187, "grad_norm": 1.2776111551003537, "learning_rate": 2.277569573351468e-06, "loss": 0.1157, "step": 10050 }, { "epoch": 3.4661265299086366, "grad_norm": 1.4871118678155257, "learning_rate": 2.272888860760896e-06, "loss": 0.1489, "step": 10055 }, { "epoch": 3.4678503706257544, "grad_norm": 1.1816851101719668, "learning_rate": 2.268211547678478e-06, "loss": 0.1361, "step": 10060 }, { "epoch": 3.469574211342872, "grad_norm": 1.326715133116185, "learning_rate": 2.2635376399347625e-06, "loss": 0.1476, "step": 10065 }, { "epoch": 3.4712980520599896, "grad_norm": 1.201080970912036, "learning_rate": 2.2588671433560605e-06, "loss": 0.1337, "step": 10070 }, { "epoch": 3.4730218927771075, "grad_norm": 1.2095397506766368, "learning_rate": 2.2542000637644255e-06, "loss": 0.116, "step": 10075 }, { "epoch": 3.474745733494225, "grad_norm": 1.2361475967277016, "learning_rate": 2.249536406977653e-06, "loss": 0.1415, "step": 10080 }, { "epoch": 3.4764695742113427, "grad_norm": 1.1930575497042437, "learning_rate": 2.2448761788092698e-06, "loss": 0.1394, "step": 10085 }, { "epoch": 3.4781934149284606, "grad_norm": 1.8463098668073976, "learning_rate": 2.2402193850685327e-06, "loss": 0.1404, "step": 10090 }, { "epoch": 3.4799172556455784, "grad_norm": 1.5571786413481146, "learning_rate": 2.2355660315604173e-06, "loss": 0.1625, "step": 10095 }, { "epoch": 3.4816410963626963, "grad_norm": 1.3547933147833346, "learning_rate": 2.2309161240856047e-06, "loss": 0.1415, "step": 10100 }, { "epoch": 3.4833649370798137, "grad_norm": 1.3412873791216466, "learning_rate": 2.2262696684404887e-06, "loss": 0.1386, "step": 10105 }, { "epoch": 3.4850887777969315, "grad_norm": 1.3022777262880085, "learning_rate": 2.221626670417154e-06, "loss": 0.1412, "step": 10110 }, { "epoch": 3.4868126185140493, "grad_norm": 1.2544141695607791, "learning_rate": 2.216987135803376e-06, "loss": 0.1491, "step": 10115 }, { "epoch": 3.488536459231167, "grad_norm": 1.3151231958344867, "learning_rate": 2.2123510703826136e-06, "loss": 0.1422, "step": 10120 }, { "epoch": 3.490260299948285, "grad_norm": 1.2748072937124686, "learning_rate": 2.2077184799340036e-06, "loss": 0.1332, "step": 10125 }, { "epoch": 3.4919841406654024, "grad_norm": 1.187476731348306, "learning_rate": 2.2030893702323457e-06, "loss": 0.1504, "step": 10130 }, { "epoch": 3.4937079813825203, "grad_norm": 1.4262527187843115, "learning_rate": 2.1984637470481056e-06, "loss": 0.1299, "step": 10135 }, { "epoch": 3.495431822099638, "grad_norm": 1.4159713330193895, "learning_rate": 2.193841616147403e-06, "loss": 0.1492, "step": 10140 }, { "epoch": 3.4971556628167555, "grad_norm": 1.8420571283229705, "learning_rate": 2.189222983292e-06, "loss": 0.13, "step": 10145 }, { "epoch": 3.4988795035338733, "grad_norm": 1.3015205686974982, "learning_rate": 2.1846078542393005e-06, "loss": 0.1557, "step": 10150 }, { "epoch": 3.500603344250991, "grad_norm": 1.288144370705917, "learning_rate": 2.179996234742339e-06, "loss": 0.151, "step": 10155 }, { "epoch": 3.502327184968109, "grad_norm": 1.2368914366983854, "learning_rate": 2.1753881305497798e-06, "loss": 0.1266, "step": 10160 }, { "epoch": 3.504051025685227, "grad_norm": 1.2793429978132784, "learning_rate": 2.170783547405901e-06, "loss": 0.1727, "step": 10165 }, { "epoch": 3.5057748664023443, "grad_norm": 1.0476834155793038, "learning_rate": 2.16618249105059e-06, "loss": 0.1294, "step": 10170 }, { "epoch": 3.507498707119462, "grad_norm": 1.2685273294129786, "learning_rate": 2.161584967219343e-06, "loss": 0.1364, "step": 10175 }, { "epoch": 3.50922254783658, "grad_norm": 1.2165289853454064, "learning_rate": 2.1569909816432517e-06, "loss": 0.1283, "step": 10180 }, { "epoch": 3.510946388553698, "grad_norm": 1.2182163113863704, "learning_rate": 2.1524005400489917e-06, "loss": 0.1305, "step": 10185 }, { "epoch": 3.5126702292708156, "grad_norm": 1.1567298789673996, "learning_rate": 2.1478136481588284e-06, "loss": 0.1425, "step": 10190 }, { "epoch": 3.514394069987933, "grad_norm": 1.304619567517035, "learning_rate": 2.1432303116905974e-06, "loss": 0.1467, "step": 10195 }, { "epoch": 3.516117910705051, "grad_norm": 1.1718439677464705, "learning_rate": 2.1386505363577025e-06, "loss": 0.1472, "step": 10200 }, { "epoch": 3.5178417514221687, "grad_norm": 1.1920756528298804, "learning_rate": 2.1340743278691077e-06, "loss": 0.1417, "step": 10205 }, { "epoch": 3.519565592139286, "grad_norm": 1.2508333889573606, "learning_rate": 2.1295016919293366e-06, "loss": 0.1375, "step": 10210 }, { "epoch": 3.521289432856404, "grad_norm": 1.1786696781792316, "learning_rate": 2.1249326342384506e-06, "loss": 0.1428, "step": 10215 }, { "epoch": 3.523013273573522, "grad_norm": 1.2954028572764982, "learning_rate": 2.1203671604920575e-06, "loss": 0.1422, "step": 10220 }, { "epoch": 3.5247371142906396, "grad_norm": 1.2549155800354854, "learning_rate": 2.1158052763812963e-06, "loss": 0.1284, "step": 10225 }, { "epoch": 3.5264609550077575, "grad_norm": 1.3034212695095693, "learning_rate": 2.1112469875928287e-06, "loss": 0.1234, "step": 10230 }, { "epoch": 3.528184795724875, "grad_norm": 2.8605010507460733, "learning_rate": 2.1066922998088358e-06, "loss": 0.1447, "step": 10235 }, { "epoch": 3.5299086364419927, "grad_norm": 1.1506103931619351, "learning_rate": 2.1021412187070078e-06, "loss": 0.1358, "step": 10240 }, { "epoch": 3.5316324771591106, "grad_norm": 1.301886880815796, "learning_rate": 2.097593749960546e-06, "loss": 0.1393, "step": 10245 }, { "epoch": 3.5333563178762284, "grad_norm": 1.2929314975727006, "learning_rate": 2.0930498992381395e-06, "loss": 0.1327, "step": 10250 }, { "epoch": 3.5350801585933462, "grad_norm": 1.1327159869706134, "learning_rate": 2.088509672203973e-06, "loss": 0.1271, "step": 10255 }, { "epoch": 3.5368039993104636, "grad_norm": 1.4404330210849468, "learning_rate": 2.083973074517715e-06, "loss": 0.1294, "step": 10260 }, { "epoch": 3.5385278400275815, "grad_norm": 1.463516046286985, "learning_rate": 2.0794401118345065e-06, "loss": 0.1615, "step": 10265 }, { "epoch": 3.5402516807446993, "grad_norm": 1.2009394418196053, "learning_rate": 2.074910789804955e-06, "loss": 0.1212, "step": 10270 }, { "epoch": 3.5419755214618167, "grad_norm": 1.39931538405594, "learning_rate": 2.0703851140751374e-06, "loss": 0.1485, "step": 10275 }, { "epoch": 3.5436993621789346, "grad_norm": 1.3285491672518441, "learning_rate": 2.0658630902865793e-06, "loss": 0.1406, "step": 10280 }, { "epoch": 3.5454232028960524, "grad_norm": 1.27157866509823, "learning_rate": 2.061344724076255e-06, "loss": 0.1679, "step": 10285 }, { "epoch": 3.5471470436131702, "grad_norm": 1.1340291313938298, "learning_rate": 2.056830021076578e-06, "loss": 0.1342, "step": 10290 }, { "epoch": 3.548870884330288, "grad_norm": 1.4711914742437875, "learning_rate": 2.0523189869154e-06, "loss": 0.1367, "step": 10295 }, { "epoch": 3.5505947250474055, "grad_norm": 1.1018443142071328, "learning_rate": 2.047811627215997e-06, "loss": 0.1274, "step": 10300 }, { "epoch": 3.5523185657645233, "grad_norm": 1.276916706292044, "learning_rate": 2.0433079475970614e-06, "loss": 0.1509, "step": 10305 }, { "epoch": 3.554042406481641, "grad_norm": 1.2631568006321003, "learning_rate": 2.038807953672704e-06, "loss": 0.1465, "step": 10310 }, { "epoch": 3.555766247198759, "grad_norm": 1.2858206578244311, "learning_rate": 2.034311651052437e-06, "loss": 0.1237, "step": 10315 }, { "epoch": 3.557490087915877, "grad_norm": 1.3166707370113864, "learning_rate": 2.0298190453411713e-06, "loss": 0.1459, "step": 10320 }, { "epoch": 3.5592139286329942, "grad_norm": 1.329663428551299, "learning_rate": 2.025330142139209e-06, "loss": 0.1292, "step": 10325 }, { "epoch": 3.560937769350112, "grad_norm": 1.3061670357384707, "learning_rate": 2.020844947042242e-06, "loss": 0.1572, "step": 10330 }, { "epoch": 3.56266161006723, "grad_norm": 1.4902355455081342, "learning_rate": 2.0163634656413316e-06, "loss": 0.1309, "step": 10335 }, { "epoch": 3.5643854507843473, "grad_norm": 1.2660175158905984, "learning_rate": 2.0118857035229163e-06, "loss": 0.1358, "step": 10340 }, { "epoch": 3.566109291501465, "grad_norm": 1.2702700696118927, "learning_rate": 2.0074116662687972e-06, "loss": 0.1644, "step": 10345 }, { "epoch": 3.567833132218583, "grad_norm": 1.2349927622288783, "learning_rate": 2.0029413594561303e-06, "loss": 0.1253, "step": 10350 }, { "epoch": 3.569556972935701, "grad_norm": 1.2613607181969857, "learning_rate": 1.998474788657421e-06, "loss": 0.1391, "step": 10355 }, { "epoch": 3.5712808136528187, "grad_norm": 1.172492727380651, "learning_rate": 1.994011959440517e-06, "loss": 0.1376, "step": 10360 }, { "epoch": 3.573004654369936, "grad_norm": 1.3442806355321946, "learning_rate": 1.989552877368608e-06, "loss": 0.1342, "step": 10365 }, { "epoch": 3.574728495087054, "grad_norm": 1.131544245484566, "learning_rate": 1.9850975480002057e-06, "loss": 0.1184, "step": 10370 }, { "epoch": 3.5764523358041718, "grad_norm": 1.0926149570186006, "learning_rate": 1.980645976889144e-06, "loss": 0.1239, "step": 10375 }, { "epoch": 3.578176176521289, "grad_norm": 1.3719965590893122, "learning_rate": 1.9761981695845767e-06, "loss": 0.1359, "step": 10380 }, { "epoch": 3.579900017238407, "grad_norm": 1.1643149725825608, "learning_rate": 1.9717541316309647e-06, "loss": 0.1326, "step": 10385 }, { "epoch": 3.581623857955525, "grad_norm": 1.8561466802008566, "learning_rate": 1.9673138685680653e-06, "loss": 0.1404, "step": 10390 }, { "epoch": 3.5833476986726427, "grad_norm": 1.1709644797992391, "learning_rate": 1.9628773859309374e-06, "loss": 0.1219, "step": 10395 }, { "epoch": 3.5850715393897605, "grad_norm": 1.373461957118018, "learning_rate": 1.9584446892499213e-06, "loss": 0.1459, "step": 10400 }, { "epoch": 3.586795380106878, "grad_norm": 1.3315445881372823, "learning_rate": 1.9540157840506406e-06, "loss": 0.1585, "step": 10405 }, { "epoch": 3.5885192208239958, "grad_norm": 1.3190982832334845, "learning_rate": 1.9495906758539906e-06, "loss": 0.1463, "step": 10410 }, { "epoch": 3.5902430615411136, "grad_norm": 1.8810774817123894, "learning_rate": 1.9451693701761376e-06, "loss": 0.1467, "step": 10415 }, { "epoch": 3.5919669022582315, "grad_norm": 1.0999186764013664, "learning_rate": 1.9407518725285024e-06, "loss": 0.1307, "step": 10420 }, { "epoch": 3.5936907429753493, "grad_norm": 1.1333744249409976, "learning_rate": 1.9363381884177635e-06, "loss": 0.1188, "step": 10425 }, { "epoch": 3.5954145836924667, "grad_norm": 1.2983964703441595, "learning_rate": 1.9319283233458453e-06, "loss": 0.1342, "step": 10430 }, { "epoch": 3.5971384244095845, "grad_norm": 1.1977946441537601, "learning_rate": 1.927522282809908e-06, "loss": 0.1218, "step": 10435 }, { "epoch": 3.5988622651267024, "grad_norm": 1.1241380011426576, "learning_rate": 1.923120072302346e-06, "loss": 0.1304, "step": 10440 }, { "epoch": 3.6005861058438198, "grad_norm": 1.384808286108473, "learning_rate": 1.918721697310779e-06, "loss": 0.1665, "step": 10445 }, { "epoch": 3.6023099465609376, "grad_norm": 1.271228113332117, "learning_rate": 1.9143271633180494e-06, "loss": 0.1258, "step": 10450 }, { "epoch": 3.6040337872780555, "grad_norm": 1.1310942200204361, "learning_rate": 1.9099364758022037e-06, "loss": 0.1293, "step": 10455 }, { "epoch": 3.6057576279951733, "grad_norm": 1.4577887134382665, "learning_rate": 1.9055496402365004e-06, "loss": 0.1268, "step": 10460 }, { "epoch": 3.607481468712291, "grad_norm": 1.2354278117066597, "learning_rate": 1.9011666620893966e-06, "loss": 0.1341, "step": 10465 }, { "epoch": 3.6092053094294085, "grad_norm": 1.3624270768772266, "learning_rate": 1.8967875468245357e-06, "loss": 0.1364, "step": 10470 }, { "epoch": 3.6109291501465264, "grad_norm": 1.518599366796241, "learning_rate": 1.8924122999007483e-06, "loss": 0.1358, "step": 10475 }, { "epoch": 3.612652990863644, "grad_norm": 1.241562848439346, "learning_rate": 1.8880409267720417e-06, "loss": 0.1292, "step": 10480 }, { "epoch": 3.614376831580762, "grad_norm": 1.4017676664890422, "learning_rate": 1.8836734328875989e-06, "loss": 0.116, "step": 10485 }, { "epoch": 3.61610067229788, "grad_norm": 1.1914278899598685, "learning_rate": 1.8793098236917624e-06, "loss": 0.1221, "step": 10490 }, { "epoch": 3.6178245130149973, "grad_norm": 1.3611171604951202, "learning_rate": 1.8749501046240309e-06, "loss": 0.137, "step": 10495 }, { "epoch": 3.619548353732115, "grad_norm": 1.1245813067761146, "learning_rate": 1.8705942811190596e-06, "loss": 0.1195, "step": 10500 }, { "epoch": 3.621272194449233, "grad_norm": 1.4638796937027179, "learning_rate": 1.8662423586066464e-06, "loss": 0.1335, "step": 10505 }, { "epoch": 3.6229960351663504, "grad_norm": 1.277229079172751, "learning_rate": 1.8618943425117198e-06, "loss": 0.1382, "step": 10510 }, { "epoch": 3.6247198758834682, "grad_norm": 1.5670591940980676, "learning_rate": 1.857550238254348e-06, "loss": 0.133, "step": 10515 }, { "epoch": 3.626443716600586, "grad_norm": 1.369205222016917, "learning_rate": 1.853210051249717e-06, "loss": 0.135, "step": 10520 }, { "epoch": 3.628167557317704, "grad_norm": 1.3182500817127756, "learning_rate": 1.8488737869081303e-06, "loss": 0.136, "step": 10525 }, { "epoch": 3.6298913980348217, "grad_norm": 1.385080097191345, "learning_rate": 1.8445414506350002e-06, "loss": 0.1309, "step": 10530 }, { "epoch": 3.631615238751939, "grad_norm": 1.245724613530749, "learning_rate": 1.8402130478308495e-06, "loss": 0.1556, "step": 10535 }, { "epoch": 3.633339079469057, "grad_norm": 1.335406621758715, "learning_rate": 1.8358885838912881e-06, "loss": 0.1389, "step": 10540 }, { "epoch": 3.635062920186175, "grad_norm": 1.329232887143972, "learning_rate": 1.8315680642070226e-06, "loss": 0.1266, "step": 10545 }, { "epoch": 3.6367867609032927, "grad_norm": 1.4605559129841814, "learning_rate": 1.8272514941638431e-06, "loss": 0.1386, "step": 10550 }, { "epoch": 3.6385106016204105, "grad_norm": 1.4477857409364705, "learning_rate": 1.8229388791426116e-06, "loss": 0.1352, "step": 10555 }, { "epoch": 3.640234442337528, "grad_norm": 1.3043594278556248, "learning_rate": 1.818630224519262e-06, "loss": 0.1503, "step": 10560 }, { "epoch": 3.6419582830546458, "grad_norm": 1.3361476287320897, "learning_rate": 1.8143255356647903e-06, "loss": 0.1402, "step": 10565 }, { "epoch": 3.6436821237717636, "grad_norm": 1.1482931785874433, "learning_rate": 1.810024817945254e-06, "loss": 0.1287, "step": 10570 }, { "epoch": 3.645405964488881, "grad_norm": 1.1679709169601555, "learning_rate": 1.8057280767217544e-06, "loss": 0.1391, "step": 10575 }, { "epoch": 3.647129805205999, "grad_norm": 1.2385544212852486, "learning_rate": 1.8014353173504363e-06, "loss": 0.1311, "step": 10580 }, { "epoch": 3.6488536459231167, "grad_norm": 1.07210735004757, "learning_rate": 1.7971465451824842e-06, "loss": 0.114, "step": 10585 }, { "epoch": 3.6505774866402345, "grad_norm": 1.350006746341956, "learning_rate": 1.7928617655641122e-06, "loss": 0.1336, "step": 10590 }, { "epoch": 3.6523013273573524, "grad_norm": 1.3609570099878197, "learning_rate": 1.7885809838365552e-06, "loss": 0.134, "step": 10595 }, { "epoch": 3.6540251680744698, "grad_norm": 1.4535370751201406, "learning_rate": 1.7843042053360626e-06, "loss": 0.1403, "step": 10600 }, { "epoch": 3.6557490087915876, "grad_norm": 1.331424181308694, "learning_rate": 1.7800314353939003e-06, "loss": 0.1232, "step": 10605 }, { "epoch": 3.6574728495087054, "grad_norm": 1.1209992275175193, "learning_rate": 1.7757626793363308e-06, "loss": 0.126, "step": 10610 }, { "epoch": 3.6591966902258233, "grad_norm": 1.2137726012580736, "learning_rate": 1.771497942484614e-06, "loss": 0.1484, "step": 10615 }, { "epoch": 3.660920530942941, "grad_norm": 1.3747442482775356, "learning_rate": 1.7672372301550044e-06, "loss": 0.1536, "step": 10620 }, { "epoch": 3.6626443716600585, "grad_norm": 1.040662719999315, "learning_rate": 1.762980547658733e-06, "loss": 0.1338, "step": 10625 }, { "epoch": 3.6643682123771764, "grad_norm": 1.3360808889696956, "learning_rate": 1.7587279003020125e-06, "loss": 0.121, "step": 10630 }, { "epoch": 3.666092053094294, "grad_norm": 1.4687925238284372, "learning_rate": 1.7544792933860256e-06, "loss": 0.1348, "step": 10635 }, { "epoch": 3.6678158938114116, "grad_norm": 1.2033196101927577, "learning_rate": 1.750234732206914e-06, "loss": 0.1346, "step": 10640 }, { "epoch": 3.6695397345285294, "grad_norm": 1.2504807907262168, "learning_rate": 1.7459942220557791e-06, "loss": 0.1254, "step": 10645 }, { "epoch": 3.6712635752456473, "grad_norm": 1.3642377624001467, "learning_rate": 1.741757768218671e-06, "loss": 0.1191, "step": 10650 }, { "epoch": 3.672987415962765, "grad_norm": 1.3836460333016505, "learning_rate": 1.7375253759765863e-06, "loss": 0.1257, "step": 10655 }, { "epoch": 3.674711256679883, "grad_norm": 1.5930833080042952, "learning_rate": 1.7332970506054548e-06, "loss": 0.1271, "step": 10660 }, { "epoch": 3.6764350973970004, "grad_norm": 1.3318259266400878, "learning_rate": 1.729072797376139e-06, "loss": 0.1302, "step": 10665 }, { "epoch": 3.678158938114118, "grad_norm": 1.4027360006651397, "learning_rate": 1.724852621554427e-06, "loss": 0.1401, "step": 10670 }, { "epoch": 3.679882778831236, "grad_norm": 1.3567103655800863, "learning_rate": 1.7206365284010206e-06, "loss": 0.1336, "step": 10675 }, { "epoch": 3.681606619548354, "grad_norm": 1.3227808689531386, "learning_rate": 1.7164245231715325e-06, "loss": 0.1394, "step": 10680 }, { "epoch": 3.6833304602654717, "grad_norm": 1.2139258418406307, "learning_rate": 1.7122166111164807e-06, "loss": 0.132, "step": 10685 }, { "epoch": 3.685054300982589, "grad_norm": 1.2801010660869936, "learning_rate": 1.7080127974812828e-06, "loss": 0.1184, "step": 10690 }, { "epoch": 3.686778141699707, "grad_norm": 2.342104147156023, "learning_rate": 1.7038130875062437e-06, "loss": 0.1356, "step": 10695 }, { "epoch": 3.688501982416825, "grad_norm": 1.1868197294481282, "learning_rate": 1.699617486426554e-06, "loss": 0.1399, "step": 10700 }, { "epoch": 3.690225823133942, "grad_norm": 1.2895401450171413, "learning_rate": 1.6954259994722838e-06, "loss": 0.1419, "step": 10705 }, { "epoch": 3.69194966385106, "grad_norm": 1.3094544008032438, "learning_rate": 1.691238631868376e-06, "loss": 0.1371, "step": 10710 }, { "epoch": 3.693673504568178, "grad_norm": 1.24079050631587, "learning_rate": 1.6870553888346325e-06, "loss": 0.1063, "step": 10715 }, { "epoch": 3.6953973452852957, "grad_norm": 1.2174923355322707, "learning_rate": 1.6828762755857214e-06, "loss": 0.1276, "step": 10720 }, { "epoch": 3.6971211860024136, "grad_norm": 1.2499719819707977, "learning_rate": 1.6787012973311567e-06, "loss": 0.1374, "step": 10725 }, { "epoch": 3.698845026719531, "grad_norm": 1.4334182789428982, "learning_rate": 1.6745304592753004e-06, "loss": 0.1354, "step": 10730 }, { "epoch": 3.700568867436649, "grad_norm": 1.278202095404078, "learning_rate": 1.670363766617351e-06, "loss": 0.126, "step": 10735 }, { "epoch": 3.7022927081537667, "grad_norm": 1.3399633301297333, "learning_rate": 1.6662012245513454e-06, "loss": 0.1526, "step": 10740 }, { "epoch": 3.7040165488708845, "grad_norm": 1.3435297851993446, "learning_rate": 1.6620428382661391e-06, "loss": 0.1283, "step": 10745 }, { "epoch": 3.7057403895880023, "grad_norm": 1.237787900089323, "learning_rate": 1.657888612945413e-06, "loss": 0.1291, "step": 10750 }, { "epoch": 3.7074642303051197, "grad_norm": 1.231154186186958, "learning_rate": 1.6537385537676604e-06, "loss": 0.1247, "step": 10755 }, { "epoch": 3.7091880710222376, "grad_norm": 1.2283581519236173, "learning_rate": 1.6495926659061779e-06, "loss": 0.139, "step": 10760 }, { "epoch": 3.7109119117393554, "grad_norm": 1.3180432600272551, "learning_rate": 1.6454509545290647e-06, "loss": 0.1158, "step": 10765 }, { "epoch": 3.712635752456473, "grad_norm": 1.4453948312356553, "learning_rate": 1.6413134247992112e-06, "loss": 0.1218, "step": 10770 }, { "epoch": 3.7143595931735907, "grad_norm": 1.2105396668085264, "learning_rate": 1.6371800818743004e-06, "loss": 0.1239, "step": 10775 }, { "epoch": 3.7160834338907085, "grad_norm": 1.3962426884992203, "learning_rate": 1.6330509309067921e-06, "loss": 0.1273, "step": 10780 }, { "epoch": 3.7178072746078263, "grad_norm": 1.3272134789736372, "learning_rate": 1.6289259770439192e-06, "loss": 0.1283, "step": 10785 }, { "epoch": 3.719531115324944, "grad_norm": 1.4582300277803126, "learning_rate": 1.624805225427687e-06, "loss": 0.1383, "step": 10790 }, { "epoch": 3.7212549560420616, "grad_norm": 1.4787466443055255, "learning_rate": 1.6206886811948613e-06, "loss": 0.1236, "step": 10795 }, { "epoch": 3.7229787967591794, "grad_norm": 1.2562573715672862, "learning_rate": 1.616576349476961e-06, "loss": 0.135, "step": 10800 }, { "epoch": 3.7247026374762973, "grad_norm": 1.1323987446276198, "learning_rate": 1.6124682354002534e-06, "loss": 0.1406, "step": 10805 }, { "epoch": 3.7264264781934147, "grad_norm": 1.2233775897652701, "learning_rate": 1.6083643440857538e-06, "loss": 0.125, "step": 10810 }, { "epoch": 3.7281503189105325, "grad_norm": 1.4222384020217527, "learning_rate": 1.6042646806492074e-06, "loss": 0.1466, "step": 10815 }, { "epoch": 3.7298741596276503, "grad_norm": 1.3658861942644898, "learning_rate": 1.6001692502010896e-06, "loss": 0.108, "step": 10820 }, { "epoch": 3.731598000344768, "grad_norm": 1.3315481859182554, "learning_rate": 1.5960780578466045e-06, "loss": 0.1394, "step": 10825 }, { "epoch": 3.733321841061886, "grad_norm": 1.215603151961821, "learning_rate": 1.591991108685666e-06, "loss": 0.1308, "step": 10830 }, { "epoch": 3.7350456817790034, "grad_norm": 1.2194561850927335, "learning_rate": 1.5879084078129043e-06, "loss": 0.1257, "step": 10835 }, { "epoch": 3.7367695224961213, "grad_norm": 1.4393103623707952, "learning_rate": 1.5838299603176533e-06, "loss": 0.1361, "step": 10840 }, { "epoch": 3.738493363213239, "grad_norm": 1.2893582131665366, "learning_rate": 1.5797557712839412e-06, "loss": 0.1204, "step": 10845 }, { "epoch": 3.740217203930357, "grad_norm": 1.220522667960852, "learning_rate": 1.57568584579049e-06, "loss": 0.1242, "step": 10850 }, { "epoch": 3.741941044647475, "grad_norm": 1.1576652642506275, "learning_rate": 1.5716201889107051e-06, "loss": 0.1043, "step": 10855 }, { "epoch": 3.743664885364592, "grad_norm": 1.4260677520633134, "learning_rate": 1.5675588057126762e-06, "loss": 0.1413, "step": 10860 }, { "epoch": 3.74538872608171, "grad_norm": 1.361725118807402, "learning_rate": 1.5635017012591585e-06, "loss": 0.139, "step": 10865 }, { "epoch": 3.747112566798828, "grad_norm": 1.296965961384951, "learning_rate": 1.5594488806075775e-06, "loss": 0.142, "step": 10870 }, { "epoch": 3.7488364075159453, "grad_norm": 1.5503628680491506, "learning_rate": 1.5554003488100205e-06, "loss": 0.1538, "step": 10875 }, { "epoch": 3.750560248233063, "grad_norm": 1.337619682985209, "learning_rate": 1.5513561109132247e-06, "loss": 0.132, "step": 10880 }, { "epoch": 3.752284088950181, "grad_norm": 1.5941157201208773, "learning_rate": 1.5473161719585754e-06, "loss": 0.1238, "step": 10885 }, { "epoch": 3.754007929667299, "grad_norm": 1.2320881707975944, "learning_rate": 1.543280536982098e-06, "loss": 0.1408, "step": 10890 }, { "epoch": 3.7557317703844166, "grad_norm": 2.04090156034139, "learning_rate": 1.539249211014458e-06, "loss": 0.1227, "step": 10895 }, { "epoch": 3.757455611101534, "grad_norm": 1.247734741558676, "learning_rate": 1.535222199080944e-06, "loss": 0.1437, "step": 10900 }, { "epoch": 3.759179451818652, "grad_norm": 1.3943424707463135, "learning_rate": 1.5311995062014674e-06, "loss": 0.1182, "step": 10905 }, { "epoch": 3.7609032925357697, "grad_norm": 1.3327693645392733, "learning_rate": 1.5271811373905583e-06, "loss": 0.1264, "step": 10910 }, { "epoch": 3.7626271332528876, "grad_norm": 1.3495899789263786, "learning_rate": 1.5231670976573565e-06, "loss": 0.1232, "step": 10915 }, { "epoch": 3.7643509739700054, "grad_norm": 1.2950239065978737, "learning_rate": 1.5191573920056025e-06, "loss": 0.1302, "step": 10920 }, { "epoch": 3.766074814687123, "grad_norm": 1.2862976776110107, "learning_rate": 1.515152025433635e-06, "loss": 0.129, "step": 10925 }, { "epoch": 3.7677986554042406, "grad_norm": 1.1501424533467042, "learning_rate": 1.5111510029343868e-06, "loss": 0.1337, "step": 10930 }, { "epoch": 3.7695224961213585, "grad_norm": 1.2436287570238818, "learning_rate": 1.5071543294953722e-06, "loss": 0.1313, "step": 10935 }, { "epoch": 3.771246336838476, "grad_norm": 1.453319868604849, "learning_rate": 1.5031620100986833e-06, "loss": 0.1316, "step": 10940 }, { "epoch": 3.7729701775555937, "grad_norm": 1.2621068718110429, "learning_rate": 1.4991740497209895e-06, "loss": 0.1348, "step": 10945 }, { "epoch": 3.7746940182727116, "grad_norm": 1.271308202827298, "learning_rate": 1.4951904533335204e-06, "loss": 0.1278, "step": 10950 }, { "epoch": 3.7764178589898294, "grad_norm": 1.133861444720207, "learning_rate": 1.4912112259020706e-06, "loss": 0.1393, "step": 10955 }, { "epoch": 3.7781416997069472, "grad_norm": 1.1465500843261776, "learning_rate": 1.487236372386987e-06, "loss": 0.123, "step": 10960 }, { "epoch": 3.7798655404240646, "grad_norm": 1.4475200808592783, "learning_rate": 1.4832658977431635e-06, "loss": 0.1256, "step": 10965 }, { "epoch": 3.7815893811411825, "grad_norm": 1.4875096236385752, "learning_rate": 1.4792998069200348e-06, "loss": 0.1357, "step": 10970 }, { "epoch": 3.7833132218583003, "grad_norm": 1.1171697163963556, "learning_rate": 1.4753381048615706e-06, "loss": 0.119, "step": 10975 }, { "epoch": 3.785037062575418, "grad_norm": 1.2259841311742092, "learning_rate": 1.4713807965062744e-06, "loss": 0.1285, "step": 10980 }, { "epoch": 3.786760903292536, "grad_norm": 1.4564214641351185, "learning_rate": 1.4674278867871666e-06, "loss": 0.1171, "step": 10985 }, { "epoch": 3.7884847440096534, "grad_norm": 1.3354716528623003, "learning_rate": 1.463479380631786e-06, "loss": 0.1214, "step": 10990 }, { "epoch": 3.7902085847267712, "grad_norm": 1.1938806946647718, "learning_rate": 1.4595352829621856e-06, "loss": 0.1354, "step": 10995 }, { "epoch": 3.791932425443889, "grad_norm": 1.3794445650650593, "learning_rate": 1.4555955986949204e-06, "loss": 0.1292, "step": 11000 }, { "epoch": 3.7936562661610065, "grad_norm": 1.2104736072519433, "learning_rate": 1.4516603327410438e-06, "loss": 0.1294, "step": 11005 }, { "epoch": 3.7953801068781243, "grad_norm": 1.2556223572321454, "learning_rate": 1.4477294900060994e-06, "loss": 0.1192, "step": 11010 }, { "epoch": 3.797103947595242, "grad_norm": 4.240080770042343, "learning_rate": 1.4438030753901223e-06, "loss": 0.1392, "step": 11015 }, { "epoch": 3.79882778831236, "grad_norm": 1.4255185379359292, "learning_rate": 1.4398810937876234e-06, "loss": 0.1357, "step": 11020 }, { "epoch": 3.800551629029478, "grad_norm": 1.430969451647483, "learning_rate": 1.4359635500875868e-06, "loss": 0.1318, "step": 11025 }, { "epoch": 3.8022754697465952, "grad_norm": 1.4035212866383076, "learning_rate": 1.43205044917347e-06, "loss": 0.1137, "step": 11030 }, { "epoch": 3.803999310463713, "grad_norm": 1.268086122543932, "learning_rate": 1.4281417959231853e-06, "loss": 0.1076, "step": 11035 }, { "epoch": 3.805723151180831, "grad_norm": 1.453769281043299, "learning_rate": 1.424237595209108e-06, "loss": 0.1187, "step": 11040 }, { "epoch": 3.8074469918979488, "grad_norm": 1.2930421143796562, "learning_rate": 1.4203378518980554e-06, "loss": 0.1279, "step": 11045 }, { "epoch": 3.8091708326150666, "grad_norm": 1.085263627593601, "learning_rate": 1.4164425708512952e-06, "loss": 0.1227, "step": 11050 }, { "epoch": 3.810894673332184, "grad_norm": 1.144411584916602, "learning_rate": 1.412551756924529e-06, "loss": 0.1257, "step": 11055 }, { "epoch": 3.812618514049302, "grad_norm": 1.195013482913271, "learning_rate": 1.408665414967888e-06, "loss": 0.1416, "step": 11060 }, { "epoch": 3.8143423547664197, "grad_norm": 1.4447775880354259, "learning_rate": 1.4047835498259349e-06, "loss": 0.1242, "step": 11065 }, { "epoch": 3.816066195483537, "grad_norm": 1.3492475896084266, "learning_rate": 1.4009061663376455e-06, "loss": 0.1358, "step": 11070 }, { "epoch": 3.817790036200655, "grad_norm": 1.1707568635202883, "learning_rate": 1.3970332693364125e-06, "loss": 0.1343, "step": 11075 }, { "epoch": 3.8195138769177728, "grad_norm": 1.2811601038550247, "learning_rate": 1.3931648636500372e-06, "loss": 0.1136, "step": 11080 }, { "epoch": 3.8212377176348906, "grad_norm": 1.2746899216199157, "learning_rate": 1.389300954100718e-06, "loss": 0.1337, "step": 11085 }, { "epoch": 3.8229615583520085, "grad_norm": 1.1457523483560896, "learning_rate": 1.3854415455050507e-06, "loss": 0.1301, "step": 11090 }, { "epoch": 3.824685399069126, "grad_norm": 1.2964019904296888, "learning_rate": 1.3815866426740193e-06, "loss": 0.1216, "step": 11095 }, { "epoch": 3.8264092397862437, "grad_norm": 1.457836130755616, "learning_rate": 1.3777362504129948e-06, "loss": 0.1291, "step": 11100 }, { "epoch": 3.8281330805033615, "grad_norm": 1.276492445624028, "learning_rate": 1.373890373521722e-06, "loss": 0.1173, "step": 11105 }, { "epoch": 3.8298569212204794, "grad_norm": 1.3377137684512943, "learning_rate": 1.3700490167943153e-06, "loss": 0.1303, "step": 11110 }, { "epoch": 3.8315807619375972, "grad_norm": 0.9776925487844316, "learning_rate": 1.3662121850192594e-06, "loss": 0.1061, "step": 11115 }, { "epoch": 3.8333046026547146, "grad_norm": 1.2442278933032103, "learning_rate": 1.3623798829793972e-06, "loss": 0.131, "step": 11120 }, { "epoch": 3.8350284433718325, "grad_norm": 1.647143889860017, "learning_rate": 1.3585521154519226e-06, "loss": 0.119, "step": 11125 }, { "epoch": 3.8367522840889503, "grad_norm": 1.1130395591433442, "learning_rate": 1.3547288872083765e-06, "loss": 0.1208, "step": 11130 }, { "epoch": 3.8384761248060677, "grad_norm": 1.245456149628818, "learning_rate": 1.350910203014646e-06, "loss": 0.1423, "step": 11135 }, { "epoch": 3.8401999655231855, "grad_norm": 1.3163255724349603, "learning_rate": 1.3470960676309491e-06, "loss": 0.1285, "step": 11140 }, { "epoch": 3.8419238062403034, "grad_norm": 1.1961848803512916, "learning_rate": 1.3432864858118333e-06, "loss": 0.117, "step": 11145 }, { "epoch": 3.8436476469574212, "grad_norm": 1.190522361714228, "learning_rate": 1.3394814623061752e-06, "loss": 0.1282, "step": 11150 }, { "epoch": 3.845371487674539, "grad_norm": 1.463827119485804, "learning_rate": 1.3356810018571626e-06, "loss": 0.1211, "step": 11155 }, { "epoch": 3.8470953283916565, "grad_norm": 1.3327547679233624, "learning_rate": 1.3318851092022994e-06, "loss": 0.1312, "step": 11160 }, { "epoch": 3.8488191691087743, "grad_norm": 1.4066396304155349, "learning_rate": 1.3280937890733959e-06, "loss": 0.1289, "step": 11165 }, { "epoch": 3.850543009825892, "grad_norm": 1.3354027398687491, "learning_rate": 1.324307046196559e-06, "loss": 0.1287, "step": 11170 }, { "epoch": 3.85226685054301, "grad_norm": 1.37528379829199, "learning_rate": 1.3205248852921915e-06, "loss": 0.1087, "step": 11175 }, { "epoch": 3.853990691260128, "grad_norm": 1.4046439494981537, "learning_rate": 1.316747311074984e-06, "loss": 0.1437, "step": 11180 }, { "epoch": 3.8557145319772452, "grad_norm": 1.2903397860103851, "learning_rate": 1.3129743282539121e-06, "loss": 0.1283, "step": 11185 }, { "epoch": 3.857438372694363, "grad_norm": 1.3505503586265961, "learning_rate": 1.3092059415322244e-06, "loss": 0.1195, "step": 11190 }, { "epoch": 3.859162213411481, "grad_norm": 1.2530372314439693, "learning_rate": 1.305442155607441e-06, "loss": 0.1153, "step": 11195 }, { "epoch": 3.8608860541285983, "grad_norm": 1.5124853496926782, "learning_rate": 1.3016829751713483e-06, "loss": 0.1182, "step": 11200 }, { "epoch": 3.862609894845716, "grad_norm": 1.1688470516223795, "learning_rate": 1.2979284049099933e-06, "loss": 0.1139, "step": 11205 }, { "epoch": 3.864333735562834, "grad_norm": 1.1531170510402515, "learning_rate": 1.2941784495036713e-06, "loss": 0.1203, "step": 11210 }, { "epoch": 3.866057576279952, "grad_norm": 1.3078188811256894, "learning_rate": 1.2904331136269267e-06, "loss": 0.1086, "step": 11215 }, { "epoch": 3.8677814169970697, "grad_norm": 1.511012410167427, "learning_rate": 1.2866924019485488e-06, "loss": 0.1303, "step": 11220 }, { "epoch": 3.869505257714187, "grad_norm": 1.4950061574931324, "learning_rate": 1.282956319131558e-06, "loss": 0.1164, "step": 11225 }, { "epoch": 3.871229098431305, "grad_norm": 1.2534647224767244, "learning_rate": 1.279224869833205e-06, "loss": 0.141, "step": 11230 }, { "epoch": 3.8729529391484228, "grad_norm": 1.4769203165496234, "learning_rate": 1.2754980587049693e-06, "loss": 0.1193, "step": 11235 }, { "epoch": 3.87467677986554, "grad_norm": 1.1552930698672923, "learning_rate": 1.271775890392542e-06, "loss": 0.1183, "step": 11240 }, { "epoch": 3.876400620582658, "grad_norm": 1.1571956169173494, "learning_rate": 1.2680583695358329e-06, "loss": 0.1164, "step": 11245 }, { "epoch": 3.878124461299776, "grad_norm": 1.3191622152880862, "learning_rate": 1.2643455007689526e-06, "loss": 0.1414, "step": 11250 }, { "epoch": 3.8798483020168937, "grad_norm": 1.2903336535394538, "learning_rate": 1.260637288720218e-06, "loss": 0.1271, "step": 11255 }, { "epoch": 3.8815721427340115, "grad_norm": 1.3086834300730585, "learning_rate": 1.2569337380121371e-06, "loss": 0.1132, "step": 11260 }, { "epoch": 3.883295983451129, "grad_norm": 1.4192245593261936, "learning_rate": 1.253234853261408e-06, "loss": 0.1426, "step": 11265 }, { "epoch": 3.8850198241682468, "grad_norm": 1.210821345352799, "learning_rate": 1.2495406390789155e-06, "loss": 0.1261, "step": 11270 }, { "epoch": 3.8867436648853646, "grad_norm": 1.2492008174085303, "learning_rate": 1.245851100069717e-06, "loss": 0.1117, "step": 11275 }, { "epoch": 3.8884675056024824, "grad_norm": 1.3417342393956402, "learning_rate": 1.242166240833047e-06, "loss": 0.0998, "step": 11280 }, { "epoch": 3.8901913463196003, "grad_norm": 1.4186445506236978, "learning_rate": 1.2384860659623044e-06, "loss": 0.1347, "step": 11285 }, { "epoch": 3.8919151870367177, "grad_norm": 1.2949279297232827, "learning_rate": 1.2348105800450489e-06, "loss": 0.1227, "step": 11290 }, { "epoch": 3.8936390277538355, "grad_norm": 1.4243304940922359, "learning_rate": 1.2311397876629932e-06, "loss": 0.1318, "step": 11295 }, { "epoch": 3.8953628684709534, "grad_norm": 1.2905485068936577, "learning_rate": 1.2274736933920006e-06, "loss": 0.1398, "step": 11300 }, { "epoch": 3.8970867091880708, "grad_norm": 1.3236274887074178, "learning_rate": 1.2238123018020808e-06, "loss": 0.1081, "step": 11305 }, { "epoch": 3.8988105499051886, "grad_norm": 1.2931686422207205, "learning_rate": 1.2201556174573775e-06, "loss": 0.1236, "step": 11310 }, { "epoch": 3.9005343906223064, "grad_norm": 1.5918640685585563, "learning_rate": 1.216503644916166e-06, "loss": 0.1336, "step": 11315 }, { "epoch": 3.9022582313394243, "grad_norm": 4.094605028147942, "learning_rate": 1.2128563887308514e-06, "loss": 0.1309, "step": 11320 }, { "epoch": 3.903982072056542, "grad_norm": 1.3958336497684238, "learning_rate": 1.2092138534479593e-06, "loss": 0.1282, "step": 11325 }, { "epoch": 3.9057059127736595, "grad_norm": 1.3143188624337778, "learning_rate": 1.2055760436081281e-06, "loss": 0.1139, "step": 11330 }, { "epoch": 3.9074297534907774, "grad_norm": 1.0715275139956435, "learning_rate": 1.201942963746105e-06, "loss": 0.1108, "step": 11335 }, { "epoch": 3.909153594207895, "grad_norm": 1.171293519452673, "learning_rate": 1.1983146183907457e-06, "loss": 0.1257, "step": 11340 }, { "epoch": 3.910877434925013, "grad_norm": 1.3862634054133804, "learning_rate": 1.1946910120649996e-06, "loss": 0.1293, "step": 11345 }, { "epoch": 3.912601275642131, "grad_norm": 1.2786235687218053, "learning_rate": 1.1910721492859083e-06, "loss": 0.1145, "step": 11350 }, { "epoch": 3.9143251163592483, "grad_norm": 1.4781329272979127, "learning_rate": 1.1874580345646054e-06, "loss": 0.1165, "step": 11355 }, { "epoch": 3.916048957076366, "grad_norm": 1.3180672481231375, "learning_rate": 1.1838486724062992e-06, "loss": 0.1303, "step": 11360 }, { "epoch": 3.917772797793484, "grad_norm": 1.3869240003211991, "learning_rate": 1.18024406731028e-06, "loss": 0.1331, "step": 11365 }, { "epoch": 3.9194966385106014, "grad_norm": 1.3405785174041558, "learning_rate": 1.1766442237699016e-06, "loss": 0.1226, "step": 11370 }, { "epoch": 3.921220479227719, "grad_norm": 1.2456815469065652, "learning_rate": 1.173049146272589e-06, "loss": 0.1141, "step": 11375 }, { "epoch": 3.922944319944837, "grad_norm": 1.314857528219527, "learning_rate": 1.1694588392998207e-06, "loss": 0.1337, "step": 11380 }, { "epoch": 3.924668160661955, "grad_norm": 1.3116783961627352, "learning_rate": 1.1658733073271294e-06, "loss": 0.1178, "step": 11385 }, { "epoch": 3.9263920013790727, "grad_norm": 1.4187715000423577, "learning_rate": 1.1622925548240993e-06, "loss": 0.1138, "step": 11390 }, { "epoch": 3.92811584209619, "grad_norm": 1.3111507735045609, "learning_rate": 1.158716586254352e-06, "loss": 0.1331, "step": 11395 }, { "epoch": 3.929839682813308, "grad_norm": 1.3881387781864114, "learning_rate": 1.1551454060755468e-06, "loss": 0.1257, "step": 11400 }, { "epoch": 3.931563523530426, "grad_norm": 1.381550436277335, "learning_rate": 1.1515790187393761e-06, "loss": 0.1155, "step": 11405 }, { "epoch": 3.9332873642475437, "grad_norm": 1.307666267073457, "learning_rate": 1.1480174286915568e-06, "loss": 0.1298, "step": 11410 }, { "epoch": 3.9350112049646615, "grad_norm": 1.2572515169693843, "learning_rate": 1.144460640371825e-06, "loss": 0.1342, "step": 11415 }, { "epoch": 3.936735045681779, "grad_norm": 1.3827825621819523, "learning_rate": 1.140908658213929e-06, "loss": 0.1108, "step": 11420 }, { "epoch": 3.9384588863988967, "grad_norm": 1.4326374483997706, "learning_rate": 1.1373614866456318e-06, "loss": 0.115, "step": 11425 }, { "epoch": 3.9401827271160146, "grad_norm": 1.2819893498360015, "learning_rate": 1.1338191300886947e-06, "loss": 0.1136, "step": 11430 }, { "epoch": 3.941906567833132, "grad_norm": 1.3726546493538927, "learning_rate": 1.1302815929588768e-06, "loss": 0.1251, "step": 11435 }, { "epoch": 3.94363040855025, "grad_norm": 1.3668066551952909, "learning_rate": 1.1267488796659332e-06, "loss": 0.118, "step": 11440 }, { "epoch": 3.9453542492673677, "grad_norm": 1.2906622087235946, "learning_rate": 1.123220994613602e-06, "loss": 0.1125, "step": 11445 }, { "epoch": 3.9470780899844855, "grad_norm": 1.4589184065164795, "learning_rate": 1.119697942199607e-06, "loss": 0.1252, "step": 11450 }, { "epoch": 3.9488019307016033, "grad_norm": 1.2592519865439942, "learning_rate": 1.116179726815641e-06, "loss": 0.1202, "step": 11455 }, { "epoch": 3.9505257714187207, "grad_norm": 1.1442893223089237, "learning_rate": 1.1126663528473746e-06, "loss": 0.1315, "step": 11460 }, { "epoch": 3.9522496121358386, "grad_norm": 1.3414462655063484, "learning_rate": 1.109157824674439e-06, "loss": 0.1264, "step": 11465 }, { "epoch": 3.9539734528529564, "grad_norm": 1.447593237017541, "learning_rate": 1.105654146670424e-06, "loss": 0.1184, "step": 11470 }, { "epoch": 3.9556972935700743, "grad_norm": 1.318433941420654, "learning_rate": 1.1021553232028776e-06, "loss": 0.1258, "step": 11475 }, { "epoch": 3.957421134287192, "grad_norm": 1.439497552831529, "learning_rate": 1.0986613586332918e-06, "loss": 0.1283, "step": 11480 }, { "epoch": 3.9591449750043095, "grad_norm": 1.5088135652965549, "learning_rate": 1.0951722573171054e-06, "loss": 0.1243, "step": 11485 }, { "epoch": 3.9608688157214273, "grad_norm": 1.4644340894445993, "learning_rate": 1.091688023603691e-06, "loss": 0.1356, "step": 11490 }, { "epoch": 3.962592656438545, "grad_norm": 1.2528418995261748, "learning_rate": 1.088208661836358e-06, "loss": 0.1129, "step": 11495 }, { "epoch": 3.9643164971556626, "grad_norm": 1.2066685547187292, "learning_rate": 1.0847341763523395e-06, "loss": 0.1194, "step": 11500 }, { "epoch": 3.9660403378727804, "grad_norm": 1.4827307883548029, "learning_rate": 1.0812645714827891e-06, "loss": 0.1175, "step": 11505 }, { "epoch": 3.9677641785898983, "grad_norm": 1.3134738206153762, "learning_rate": 1.0777998515527803e-06, "loss": 0.1124, "step": 11510 }, { "epoch": 3.969488019307016, "grad_norm": 1.1625180293338828, "learning_rate": 1.0743400208812943e-06, "loss": 0.1115, "step": 11515 }, { "epoch": 3.971211860024134, "grad_norm": 1.3896292456470711, "learning_rate": 1.0708850837812168e-06, "loss": 0.1218, "step": 11520 }, { "epoch": 3.9729357007412514, "grad_norm": 1.3254372568094368, "learning_rate": 1.0674350445593357e-06, "loss": 0.1236, "step": 11525 }, { "epoch": 3.974659541458369, "grad_norm": 1.293089181896352, "learning_rate": 1.063989907516334e-06, "loss": 0.1102, "step": 11530 }, { "epoch": 3.976383382175487, "grad_norm": 1.4370992489184637, "learning_rate": 1.0605496769467815e-06, "loss": 0.1459, "step": 11535 }, { "epoch": 3.978107222892605, "grad_norm": 1.2210879436739124, "learning_rate": 1.0571143571391312e-06, "loss": 0.13, "step": 11540 }, { "epoch": 3.9798310636097227, "grad_norm": 1.46020719541028, "learning_rate": 1.0536839523757182e-06, "loss": 0.1285, "step": 11545 }, { "epoch": 3.98155490432684, "grad_norm": 1.2939461867294593, "learning_rate": 1.0502584669327476e-06, "loss": 0.1078, "step": 11550 }, { "epoch": 3.983278745043958, "grad_norm": 1.1821278440593554, "learning_rate": 1.0468379050802914e-06, "loss": 0.1198, "step": 11555 }, { "epoch": 3.985002585761076, "grad_norm": 1.442776995049047, "learning_rate": 1.0434222710822882e-06, "loss": 0.1121, "step": 11560 }, { "epoch": 3.986726426478193, "grad_norm": 1.3873685783529446, "learning_rate": 1.0400115691965296e-06, "loss": 0.1262, "step": 11565 }, { "epoch": 3.988450267195311, "grad_norm": 1.3681284056301366, "learning_rate": 1.036605803674663e-06, "loss": 0.1364, "step": 11570 }, { "epoch": 3.990174107912429, "grad_norm": 1.404052842629089, "learning_rate": 1.0332049787621767e-06, "loss": 0.1247, "step": 11575 }, { "epoch": 3.9918979486295467, "grad_norm": 1.2907010539025423, "learning_rate": 1.0298090986984077e-06, "loss": 0.1322, "step": 11580 }, { "epoch": 3.9936217893466646, "grad_norm": 1.2243692650404463, "learning_rate": 1.0264181677165225e-06, "loss": 0.1127, "step": 11585 }, { "epoch": 3.995345630063782, "grad_norm": 1.2521298308335556, "learning_rate": 1.0230321900435191e-06, "loss": 0.1073, "step": 11590 }, { "epoch": 3.9970694707809, "grad_norm": 1.3277773009406997, "learning_rate": 1.019651169900226e-06, "loss": 0.1065, "step": 11595 }, { "epoch": 3.9987933114980176, "grad_norm": 1.9013747081077799, "learning_rate": 1.0162751115012865e-06, "loss": 0.113, "step": 11600 }, { "epoch": 4.000344768143424, "grad_norm": 1.192967741884643, "learning_rate": 1.0129040190551591e-06, "loss": 0.1058, "step": 11605 }, { "epoch": 4.002068608860541, "grad_norm": 1.1246415912642835, "learning_rate": 1.009537896764115e-06, "loss": 0.1085, "step": 11610 }, { "epoch": 4.003792449577659, "grad_norm": 1.2283374992652234, "learning_rate": 1.0061767488242297e-06, "loss": 0.1203, "step": 11615 }, { "epoch": 4.005516290294777, "grad_norm": 1.4975157666010093, "learning_rate": 1.0028205794253748e-06, "loss": 0.1015, "step": 11620 }, { "epoch": 4.007240131011894, "grad_norm": 1.3421887596802118, "learning_rate": 9.994693927512156e-07, "loss": 0.0983, "step": 11625 }, { "epoch": 4.008963971729012, "grad_norm": 1.111806480271007, "learning_rate": 9.961231929792115e-07, "loss": 0.0997, "step": 11630 }, { "epoch": 4.01068781244613, "grad_norm": 1.3154458261628559, "learning_rate": 9.927819842805997e-07, "loss": 0.1111, "step": 11635 }, { "epoch": 4.012411653163248, "grad_norm": 1.2893096837104474, "learning_rate": 9.894457708203976e-07, "loss": 0.0981, "step": 11640 }, { "epoch": 4.0141354938803655, "grad_norm": 1.3579685588749366, "learning_rate": 9.861145567573976e-07, "loss": 0.1009, "step": 11645 }, { "epoch": 4.015859334597483, "grad_norm": 1.7631269267504865, "learning_rate": 9.827883462441568e-07, "loss": 0.1221, "step": 11650 }, { "epoch": 4.017583175314601, "grad_norm": 1.80720466145997, "learning_rate": 9.794671434269987e-07, "loss": 0.1041, "step": 11655 }, { "epoch": 4.019307016031719, "grad_norm": 1.1128224846323527, "learning_rate": 9.76150952446e-07, "loss": 0.0921, "step": 11660 }, { "epoch": 4.021030856748836, "grad_norm": 1.1635046904275557, "learning_rate": 9.728397774349957e-07, "loss": 0.0987, "step": 11665 }, { "epoch": 4.022754697465954, "grad_norm": 1.4913858189939422, "learning_rate": 9.695336225215624e-07, "loss": 0.1244, "step": 11670 }, { "epoch": 4.024478538183072, "grad_norm": 1.3770127707009217, "learning_rate": 9.662324918270205e-07, "loss": 0.1181, "step": 11675 }, { "epoch": 4.02620237890019, "grad_norm": 1.1304720331491682, "learning_rate": 9.6293638946643e-07, "loss": 0.1049, "step": 11680 }, { "epoch": 4.027926219617307, "grad_norm": 1.4254678750933454, "learning_rate": 9.596453195485795e-07, "loss": 0.0927, "step": 11685 }, { "epoch": 4.029650060334425, "grad_norm": 1.3083179637487954, "learning_rate": 9.563592861759867e-07, "loss": 0.1054, "step": 11690 }, { "epoch": 4.031373901051543, "grad_norm": 1.186237793678765, "learning_rate": 9.53078293444889e-07, "loss": 0.1036, "step": 11695 }, { "epoch": 4.03309774176866, "grad_norm": 1.9498169472536708, "learning_rate": 9.498023454452426e-07, "loss": 0.0969, "step": 11700 }, { "epoch": 4.034821582485779, "grad_norm": 1.3328954105824022, "learning_rate": 9.465314462607128e-07, "loss": 0.1089, "step": 11705 }, { "epoch": 4.036545423202896, "grad_norm": 3.2855682666504435, "learning_rate": 9.432655999686713e-07, "loss": 0.0891, "step": 11710 }, { "epoch": 4.0382692639200135, "grad_norm": 1.2580259239639295, "learning_rate": 9.400048106401949e-07, "loss": 0.0987, "step": 11715 }, { "epoch": 4.039993104637132, "grad_norm": 1.1296615390957263, "learning_rate": 9.367490823400516e-07, "loss": 0.1112, "step": 11720 }, { "epoch": 4.041716945354249, "grad_norm": 1.7251392205965974, "learning_rate": 9.334984191267022e-07, "loss": 0.1165, "step": 11725 }, { "epoch": 4.043440786071367, "grad_norm": 1.3810324511738785, "learning_rate": 9.302528250522946e-07, "loss": 0.1138, "step": 11730 }, { "epoch": 4.045164626788485, "grad_norm": 1.1635481284634455, "learning_rate": 9.270123041626588e-07, "loss": 0.0961, "step": 11735 }, { "epoch": 4.046888467505602, "grad_norm": 1.390536186188031, "learning_rate": 9.237768604972975e-07, "loss": 0.1039, "step": 11740 }, { "epoch": 4.048612308222721, "grad_norm": 1.5211267859585615, "learning_rate": 9.205464980893852e-07, "loss": 0.1034, "step": 11745 }, { "epoch": 4.050336148939838, "grad_norm": 1.5343426141391991, "learning_rate": 9.17321220965765e-07, "loss": 0.1197, "step": 11750 }, { "epoch": 4.052059989656955, "grad_norm": 1.5225286593304013, "learning_rate": 9.141010331469385e-07, "loss": 0.1111, "step": 11755 }, { "epoch": 4.053783830374074, "grad_norm": 1.5468434745235593, "learning_rate": 9.108859386470614e-07, "loss": 0.0948, "step": 11760 }, { "epoch": 4.055507671091191, "grad_norm": 1.1097092908953101, "learning_rate": 9.076759414739455e-07, "loss": 0.0932, "step": 11765 }, { "epoch": 4.057231511808309, "grad_norm": 1.3743748526969284, "learning_rate": 9.044710456290429e-07, "loss": 0.1101, "step": 11770 }, { "epoch": 4.058955352525427, "grad_norm": 1.6347840548319619, "learning_rate": 9.012712551074515e-07, "loss": 0.1118, "step": 11775 }, { "epoch": 4.060679193242544, "grad_norm": 1.3727861526025247, "learning_rate": 8.980765738979003e-07, "loss": 0.1056, "step": 11780 }, { "epoch": 4.062403033959662, "grad_norm": 1.3808153758538497, "learning_rate": 8.948870059827547e-07, "loss": 0.1137, "step": 11785 }, { "epoch": 4.06412687467678, "grad_norm": 1.2235266519823085, "learning_rate": 8.917025553380005e-07, "loss": 0.1047, "step": 11790 }, { "epoch": 4.065850715393897, "grad_norm": 1.4477415851054352, "learning_rate": 8.885232259332472e-07, "loss": 0.0935, "step": 11795 }, { "epoch": 4.0675745561110155, "grad_norm": 1.4397584466110982, "learning_rate": 8.853490217317223e-07, "loss": 0.1004, "step": 11800 }, { "epoch": 4.069298396828133, "grad_norm": 1.4921000596319516, "learning_rate": 8.821799466902603e-07, "loss": 0.102, "step": 11805 }, { "epoch": 4.071022237545251, "grad_norm": 1.0393458179033408, "learning_rate": 8.790160047593038e-07, "loss": 0.1139, "step": 11810 }, { "epoch": 4.072746078262369, "grad_norm": 1.037696317340174, "learning_rate": 8.758571998828979e-07, "loss": 0.1012, "step": 11815 }, { "epoch": 4.074469918979486, "grad_norm": 1.7236463427496, "learning_rate": 8.727035359986841e-07, "loss": 0.1029, "step": 11820 }, { "epoch": 4.076193759696604, "grad_norm": 1.358343552358338, "learning_rate": 8.695550170378924e-07, "loss": 0.1127, "step": 11825 }, { "epoch": 4.077917600413722, "grad_norm": 1.1162062249181175, "learning_rate": 8.664116469253403e-07, "loss": 0.112, "step": 11830 }, { "epoch": 4.07964144113084, "grad_norm": 1.1491529791012212, "learning_rate": 8.632734295794309e-07, "loss": 0.0937, "step": 11835 }, { "epoch": 4.081365281847957, "grad_norm": 1.4900000940003695, "learning_rate": 8.60140368912139e-07, "loss": 0.1171, "step": 11840 }, { "epoch": 4.083089122565075, "grad_norm": 1.3452224444177885, "learning_rate": 8.570124688290121e-07, "loss": 0.1068, "step": 11845 }, { "epoch": 4.084812963282193, "grad_norm": 1.4799116085086237, "learning_rate": 8.538897332291685e-07, "loss": 0.1011, "step": 11850 }, { "epoch": 4.08653680399931, "grad_norm": 1.1445816428457665, "learning_rate": 8.507721660052837e-07, "loss": 0.1177, "step": 11855 }, { "epoch": 4.088260644716428, "grad_norm": 1.206107159018617, "learning_rate": 8.476597710435952e-07, "loss": 0.0927, "step": 11860 }, { "epoch": 4.089984485433546, "grad_norm": 1.2100060217624893, "learning_rate": 8.445525522238879e-07, "loss": 0.106, "step": 11865 }, { "epoch": 4.0917083261506635, "grad_norm": 1.4458123099958744, "learning_rate": 8.414505134195e-07, "loss": 0.1146, "step": 11870 }, { "epoch": 4.093432166867782, "grad_norm": 1.3991243941204066, "learning_rate": 8.383536584973084e-07, "loss": 0.1121, "step": 11875 }, { "epoch": 4.095156007584899, "grad_norm": 1.1932901120313424, "learning_rate": 8.352619913177273e-07, "loss": 0.1047, "step": 11880 }, { "epoch": 4.096879848302017, "grad_norm": 1.5002107075033384, "learning_rate": 8.321755157347089e-07, "loss": 0.0987, "step": 11885 }, { "epoch": 4.098603689019135, "grad_norm": 1.3690348433153468, "learning_rate": 8.290942355957277e-07, "loss": 0.1146, "step": 11890 }, { "epoch": 4.100327529736252, "grad_norm": 1.078835584076048, "learning_rate": 8.260181547417878e-07, "loss": 0.1078, "step": 11895 }, { "epoch": 4.1020513704533705, "grad_norm": 1.48676742848251, "learning_rate": 8.229472770074065e-07, "loss": 0.1132, "step": 11900 }, { "epoch": 4.103775211170488, "grad_norm": 1.4042825662154081, "learning_rate": 8.1988160622062e-07, "loss": 0.0956, "step": 11905 }, { "epoch": 4.105499051887605, "grad_norm": 1.387523325741912, "learning_rate": 8.168211462029707e-07, "loss": 0.1088, "step": 11910 }, { "epoch": 4.107222892604724, "grad_norm": 1.312285694460311, "learning_rate": 8.137659007695043e-07, "loss": 0.112, "step": 11915 }, { "epoch": 4.108946733321841, "grad_norm": 1.2004785098009025, "learning_rate": 8.107158737287707e-07, "loss": 0.1062, "step": 11920 }, { "epoch": 4.110670574038958, "grad_norm": 1.242550070156778, "learning_rate": 8.076710688828115e-07, "loss": 0.0925, "step": 11925 }, { "epoch": 4.112394414756077, "grad_norm": 1.4076478021887935, "learning_rate": 8.046314900271573e-07, "loss": 0.0982, "step": 11930 }, { "epoch": 4.114118255473194, "grad_norm": 1.399082084580976, "learning_rate": 8.015971409508277e-07, "loss": 0.1022, "step": 11935 }, { "epoch": 4.115842096190312, "grad_norm": 1.4885978958615196, "learning_rate": 7.985680254363226e-07, "loss": 0.1139, "step": 11940 }, { "epoch": 4.11756593690743, "grad_norm": 1.2138767824937315, "learning_rate": 7.955441472596154e-07, "loss": 0.0982, "step": 11945 }, { "epoch": 4.119289777624547, "grad_norm": 1.1093407827318338, "learning_rate": 7.925255101901508e-07, "loss": 0.0796, "step": 11950 }, { "epoch": 4.1210136183416655, "grad_norm": 1.1675549711136834, "learning_rate": 7.895121179908444e-07, "loss": 0.1077, "step": 11955 }, { "epoch": 4.122737459058783, "grad_norm": 1.4988544671297996, "learning_rate": 7.865039744180691e-07, "loss": 0.0982, "step": 11960 }, { "epoch": 4.1244612997759, "grad_norm": 1.5360699817741974, "learning_rate": 7.835010832216567e-07, "loss": 0.1106, "step": 11965 }, { "epoch": 4.1261851404930185, "grad_norm": 1.289086933199439, "learning_rate": 7.805034481448937e-07, "loss": 0.1015, "step": 11970 }, { "epoch": 4.127908981210136, "grad_norm": 1.4227311446805668, "learning_rate": 7.775110729245095e-07, "loss": 0.0944, "step": 11975 }, { "epoch": 4.129632821927254, "grad_norm": 1.231740254467238, "learning_rate": 7.745239612906835e-07, "loss": 0.1083, "step": 11980 }, { "epoch": 4.131356662644372, "grad_norm": 1.3202500427322454, "learning_rate": 7.715421169670273e-07, "loss": 0.0985, "step": 11985 }, { "epoch": 4.133080503361489, "grad_norm": 1.24611953602762, "learning_rate": 7.685655436705913e-07, "loss": 0.1113, "step": 11990 }, { "epoch": 4.134804344078607, "grad_norm": 1.3017889243267873, "learning_rate": 7.65594245111852e-07, "loss": 0.1012, "step": 11995 }, { "epoch": 4.136528184795725, "grad_norm": 1.2667733889442665, "learning_rate": 7.626282249947115e-07, "loss": 0.0988, "step": 12000 }, { "epoch": 4.138252025512843, "grad_norm": 1.2624752669186925, "learning_rate": 7.596674870164939e-07, "loss": 0.1151, "step": 12005 }, { "epoch": 4.13997586622996, "grad_norm": 1.2812322088667052, "learning_rate": 7.567120348679369e-07, "loss": 0.0921, "step": 12010 }, { "epoch": 4.141699706947078, "grad_norm": 1.3834748467633617, "learning_rate": 7.537618722331874e-07, "loss": 0.1054, "step": 12015 }, { "epoch": 4.143423547664196, "grad_norm": 1.2230869497311545, "learning_rate": 7.50817002789802e-07, "loss": 0.1001, "step": 12020 }, { "epoch": 4.1451473883813135, "grad_norm": 1.6538512619424104, "learning_rate": 7.478774302087394e-07, "loss": 0.1172, "step": 12025 }, { "epoch": 4.146871229098432, "grad_norm": 1.376358870710376, "learning_rate": 7.449431581543526e-07, "loss": 0.118, "step": 12030 }, { "epoch": 4.148595069815549, "grad_norm": 1.212496811634599, "learning_rate": 7.420141902843864e-07, "loss": 0.1052, "step": 12035 }, { "epoch": 4.1503189105326665, "grad_norm": 1.2727108505858757, "learning_rate": 7.39090530249978e-07, "loss": 0.0962, "step": 12040 }, { "epoch": 4.152042751249785, "grad_norm": 1.3758968413519976, "learning_rate": 7.361721816956447e-07, "loss": 0.0987, "step": 12045 }, { "epoch": 4.153766591966902, "grad_norm": 1.2697425996431435, "learning_rate": 7.332591482592827e-07, "loss": 0.1068, "step": 12050 }, { "epoch": 4.15549043268402, "grad_norm": 1.2921233256220757, "learning_rate": 7.303514335721651e-07, "loss": 0.1126, "step": 12055 }, { "epoch": 4.157214273401138, "grad_norm": 1.2587095157496573, "learning_rate": 7.274490412589319e-07, "loss": 0.0932, "step": 12060 }, { "epoch": 4.158938114118255, "grad_norm": 1.1740100893674308, "learning_rate": 7.245519749375907e-07, "loss": 0.0908, "step": 12065 }, { "epoch": 4.160661954835374, "grad_norm": 1.301879362854531, "learning_rate": 7.216602382195081e-07, "loss": 0.1091, "step": 12070 }, { "epoch": 4.162385795552491, "grad_norm": 1.2075119152246632, "learning_rate": 7.187738347094097e-07, "loss": 0.1121, "step": 12075 }, { "epoch": 4.164109636269608, "grad_norm": 1.5191436087557528, "learning_rate": 7.158927680053696e-07, "loss": 0.1132, "step": 12080 }, { "epoch": 4.165833476986727, "grad_norm": 1.5201548964636065, "learning_rate": 7.130170416988102e-07, "loss": 0.1211, "step": 12085 }, { "epoch": 4.167557317703844, "grad_norm": 1.2720001429559575, "learning_rate": 7.101466593744999e-07, "loss": 0.102, "step": 12090 }, { "epoch": 4.1692811584209615, "grad_norm": 1.3372610104815759, "learning_rate": 7.072816246105402e-07, "loss": 0.0978, "step": 12095 }, { "epoch": 4.17100499913808, "grad_norm": 1.3131973260898593, "learning_rate": 7.044219409783715e-07, "loss": 0.0862, "step": 12100 }, { "epoch": 4.172728839855197, "grad_norm": 1.3462693147357956, "learning_rate": 7.015676120427595e-07, "loss": 0.1075, "step": 12105 }, { "epoch": 4.174452680572315, "grad_norm": 1.4543187413051926, "learning_rate": 6.987186413617997e-07, "loss": 0.1045, "step": 12110 }, { "epoch": 4.176176521289433, "grad_norm": 0.943878434492626, "learning_rate": 6.95875032486904e-07, "loss": 0.0831, "step": 12115 }, { "epoch": 4.17790036200655, "grad_norm": 1.4191410311195434, "learning_rate": 6.930367889628009e-07, "loss": 0.0962, "step": 12120 }, { "epoch": 4.1796242027236685, "grad_norm": 1.4224130732823812, "learning_rate": 6.902039143275341e-07, "loss": 0.1021, "step": 12125 }, { "epoch": 4.181348043440786, "grad_norm": 1.6031111083930405, "learning_rate": 6.87376412112451e-07, "loss": 0.1133, "step": 12130 }, { "epoch": 4.183071884157904, "grad_norm": 1.254260518232728, "learning_rate": 6.845542858422016e-07, "loss": 0.107, "step": 12135 }, { "epoch": 4.184795724875022, "grad_norm": 1.3031521917920925, "learning_rate": 6.817375390347386e-07, "loss": 0.115, "step": 12140 }, { "epoch": 4.186519565592139, "grad_norm": 1.3213404812386826, "learning_rate": 6.789261752013065e-07, "loss": 0.0972, "step": 12145 }, { "epoch": 4.188243406309257, "grad_norm": 0.9890280381696214, "learning_rate": 6.761201978464388e-07, "loss": 0.0991, "step": 12150 }, { "epoch": 4.189967247026375, "grad_norm": 1.2558941440702447, "learning_rate": 6.73319610467954e-07, "loss": 0.1115, "step": 12155 }, { "epoch": 4.191691087743492, "grad_norm": 1.1800450862754848, "learning_rate": 6.705244165569547e-07, "loss": 0.0952, "step": 12160 }, { "epoch": 4.19341492846061, "grad_norm": 1.3170812073211093, "learning_rate": 6.677346195978179e-07, "loss": 0.1067, "step": 12165 }, { "epoch": 4.195138769177728, "grad_norm": 1.4500544743842672, "learning_rate": 6.649502230681915e-07, "loss": 0.1093, "step": 12170 }, { "epoch": 4.196862609894846, "grad_norm": 1.8897829410967788, "learning_rate": 6.62171230438996e-07, "loss": 0.1159, "step": 12175 }, { "epoch": 4.1985864506119634, "grad_norm": 1.3445827852411167, "learning_rate": 6.593976451744106e-07, "loss": 0.1198, "step": 12180 }, { "epoch": 4.200310291329081, "grad_norm": 1.28542876479375, "learning_rate": 6.566294707318782e-07, "loss": 0.1103, "step": 12185 }, { "epoch": 4.202034132046199, "grad_norm": 1.3030784104291313, "learning_rate": 6.538667105620932e-07, "loss": 0.0949, "step": 12190 }, { "epoch": 4.2037579727633165, "grad_norm": 1.419157885553054, "learning_rate": 6.511093681090047e-07, "loss": 0.1117, "step": 12195 }, { "epoch": 4.205481813480435, "grad_norm": 1.350194958418878, "learning_rate": 6.483574468098042e-07, "loss": 0.1124, "step": 12200 }, { "epoch": 4.207205654197552, "grad_norm": 1.2871976907166678, "learning_rate": 6.456109500949265e-07, "loss": 0.1222, "step": 12205 }, { "epoch": 4.20892949491467, "grad_norm": 1.3613161194122034, "learning_rate": 6.428698813880469e-07, "loss": 0.102, "step": 12210 }, { "epoch": 4.210653335631788, "grad_norm": 1.4164947956837435, "learning_rate": 6.401342441060721e-07, "loss": 0.0938, "step": 12215 }, { "epoch": 4.212377176348905, "grad_norm": 1.5150162861272307, "learning_rate": 6.374040416591371e-07, "loss": 0.1169, "step": 12220 }, { "epoch": 4.214101017066023, "grad_norm": 1.0162984955966008, "learning_rate": 6.346792774506044e-07, "loss": 0.0915, "step": 12225 }, { "epoch": 4.215824857783141, "grad_norm": 1.3178507104745893, "learning_rate": 6.319599548770578e-07, "loss": 0.1067, "step": 12230 }, { "epoch": 4.217548698500258, "grad_norm": 1.2553539257056758, "learning_rate": 6.29246077328296e-07, "loss": 0.0923, "step": 12235 }, { "epoch": 4.219272539217377, "grad_norm": 2.3583820774370325, "learning_rate": 6.265376481873287e-07, "loss": 0.1122, "step": 12240 }, { "epoch": 4.220996379934494, "grad_norm": 1.2221506546192613, "learning_rate": 6.238346708303783e-07, "loss": 0.108, "step": 12245 }, { "epoch": 4.2227202206516115, "grad_norm": 1.3815144195748101, "learning_rate": 6.211371486268686e-07, "loss": 0.1485, "step": 12250 }, { "epoch": 4.22444406136873, "grad_norm": 1.3803058732470492, "learning_rate": 6.184450849394208e-07, "loss": 0.0963, "step": 12255 }, { "epoch": 4.226167902085847, "grad_norm": 1.4336304501411858, "learning_rate": 6.157584831238572e-07, "loss": 0.1014, "step": 12260 }, { "epoch": 4.227891742802965, "grad_norm": 1.455715308871864, "learning_rate": 6.130773465291867e-07, "loss": 0.1076, "step": 12265 }, { "epoch": 4.229615583520083, "grad_norm": 1.5972679775048577, "learning_rate": 6.104016784976092e-07, "loss": 0.1009, "step": 12270 }, { "epoch": 4.2313394242372, "grad_norm": 1.396676664172261, "learning_rate": 6.077314823645037e-07, "loss": 0.1071, "step": 12275 }, { "epoch": 4.2330632649543185, "grad_norm": 1.3346924550314543, "learning_rate": 6.050667614584327e-07, "loss": 0.1065, "step": 12280 }, { "epoch": 4.234787105671436, "grad_norm": 1.1729357327241883, "learning_rate": 6.024075191011297e-07, "loss": 0.1016, "step": 12285 }, { "epoch": 4.236510946388553, "grad_norm": 1.364809418073652, "learning_rate": 5.997537586075003e-07, "loss": 0.1077, "step": 12290 }, { "epoch": 4.238234787105672, "grad_norm": 1.340325804932455, "learning_rate": 5.971054832856177e-07, "loss": 0.1092, "step": 12295 }, { "epoch": 4.239958627822789, "grad_norm": 1.3306328836841919, "learning_rate": 5.94462696436715e-07, "loss": 0.1097, "step": 12300 }, { "epoch": 4.241682468539907, "grad_norm": 1.5652138591990172, "learning_rate": 5.918254013551867e-07, "loss": 0.1051, "step": 12305 }, { "epoch": 4.243406309257025, "grad_norm": 1.4301464183226353, "learning_rate": 5.891936013285781e-07, "loss": 0.0971, "step": 12310 }, { "epoch": 4.245130149974142, "grad_norm": 1.4010698047294279, "learning_rate": 5.865672996375882e-07, "loss": 0.118, "step": 12315 }, { "epoch": 4.24685399069126, "grad_norm": 1.4346661372391631, "learning_rate": 5.83946499556059e-07, "loss": 0.0995, "step": 12320 }, { "epoch": 4.248577831408378, "grad_norm": 1.5022228254855505, "learning_rate": 5.81331204350975e-07, "loss": 0.1176, "step": 12325 }, { "epoch": 4.250301672125495, "grad_norm": 1.2678766067832812, "learning_rate": 5.787214172824606e-07, "loss": 0.1221, "step": 12330 }, { "epoch": 4.252025512842613, "grad_norm": 1.3410254869010216, "learning_rate": 5.761171416037714e-07, "loss": 0.0934, "step": 12335 }, { "epoch": 4.253749353559731, "grad_norm": 1.5464218900432887, "learning_rate": 5.735183805612931e-07, "loss": 0.1002, "step": 12340 }, { "epoch": 4.255473194276849, "grad_norm": 1.3829402583270551, "learning_rate": 5.709251373945379e-07, "loss": 0.1054, "step": 12345 }, { "epoch": 4.2571970349939665, "grad_norm": 1.2054391203469281, "learning_rate": 5.683374153361421e-07, "loss": 0.1029, "step": 12350 }, { "epoch": 4.258920875711084, "grad_norm": 1.2472665271613321, "learning_rate": 5.657552176118542e-07, "loss": 0.0894, "step": 12355 }, { "epoch": 4.260644716428202, "grad_norm": 1.4089057223846146, "learning_rate": 5.631785474405394e-07, "loss": 0.1114, "step": 12360 }, { "epoch": 4.26236855714532, "grad_norm": 1.569769553643092, "learning_rate": 5.606074080341734e-07, "loss": 0.1094, "step": 12365 }, { "epoch": 4.264092397862438, "grad_norm": 1.3772431096303317, "learning_rate": 5.580418025978351e-07, "loss": 0.0976, "step": 12370 }, { "epoch": 4.265816238579555, "grad_norm": 1.3766000638773417, "learning_rate": 5.554817343297064e-07, "loss": 0.0983, "step": 12375 }, { "epoch": 4.267540079296673, "grad_norm": 1.4859420970994945, "learning_rate": 5.529272064210655e-07, "loss": 0.1153, "step": 12380 }, { "epoch": 4.269263920013791, "grad_norm": 1.3403472666691587, "learning_rate": 5.503782220562859e-07, "loss": 0.1054, "step": 12385 }, { "epoch": 4.270987760730908, "grad_norm": 1.4706091562778003, "learning_rate": 5.478347844128317e-07, "loss": 0.1047, "step": 12390 }, { "epoch": 4.272711601448027, "grad_norm": 1.4923912229247072, "learning_rate": 5.452968966612482e-07, "loss": 0.1016, "step": 12395 }, { "epoch": 4.274435442165144, "grad_norm": 1.3326109737245777, "learning_rate": 5.427645619651673e-07, "loss": 0.0998, "step": 12400 }, { "epoch": 4.276159282882261, "grad_norm": 1.396060359742583, "learning_rate": 5.402377834812961e-07, "loss": 0.0987, "step": 12405 }, { "epoch": 4.27788312359938, "grad_norm": 1.4662761612938902, "learning_rate": 5.377165643594145e-07, "loss": 0.0964, "step": 12410 }, { "epoch": 4.279606964316497, "grad_norm": 1.1628681323041505, "learning_rate": 5.352009077423759e-07, "loss": 0.09, "step": 12415 }, { "epoch": 4.2813308050336145, "grad_norm": 1.2393800355859474, "learning_rate": 5.326908167660971e-07, "loss": 0.0997, "step": 12420 }, { "epoch": 4.283054645750733, "grad_norm": 1.266289804169401, "learning_rate": 5.301862945595565e-07, "loss": 0.1003, "step": 12425 }, { "epoch": 4.28477848646785, "grad_norm": 1.4621785644531997, "learning_rate": 5.276873442447922e-07, "loss": 0.1014, "step": 12430 }, { "epoch": 4.2865023271849685, "grad_norm": 1.3353129648864126, "learning_rate": 5.251939689368973e-07, "loss": 0.0907, "step": 12435 }, { "epoch": 4.288226167902086, "grad_norm": 2.04250269508673, "learning_rate": 5.227061717440141e-07, "loss": 0.1077, "step": 12440 }, { "epoch": 4.289950008619203, "grad_norm": 1.2548661722067267, "learning_rate": 5.202239557673295e-07, "loss": 0.0802, "step": 12445 }, { "epoch": 4.291673849336322, "grad_norm": 1.1900181280239783, "learning_rate": 5.177473241010772e-07, "loss": 0.0911, "step": 12450 }, { "epoch": 4.293397690053439, "grad_norm": 1.3571581299873972, "learning_rate": 5.152762798325267e-07, "loss": 0.0942, "step": 12455 }, { "epoch": 4.295121530770556, "grad_norm": 1.3288448591516742, "learning_rate": 5.128108260419828e-07, "loss": 0.1135, "step": 12460 }, { "epoch": 4.296845371487675, "grad_norm": 1.2233712060586364, "learning_rate": 5.103509658027828e-07, "loss": 0.1004, "step": 12465 }, { "epoch": 4.298569212204792, "grad_norm": 1.2173291940674655, "learning_rate": 5.078967021812914e-07, "loss": 0.101, "step": 12470 }, { "epoch": 4.30029305292191, "grad_norm": 1.3655430164910245, "learning_rate": 5.054480382368948e-07, "loss": 0.1106, "step": 12475 }, { "epoch": 4.302016893639028, "grad_norm": 1.3490310044295044, "learning_rate": 5.030049770219991e-07, "loss": 0.093, "step": 12480 }, { "epoch": 4.303740734356145, "grad_norm": 2.7658729263934987, "learning_rate": 5.005675215820294e-07, "loss": 0.1039, "step": 12485 }, { "epoch": 4.305464575073263, "grad_norm": 1.5291197745764706, "learning_rate": 4.981356749554189e-07, "loss": 0.1019, "step": 12490 }, { "epoch": 4.307188415790381, "grad_norm": 1.3587881192478735, "learning_rate": 4.957094401736101e-07, "loss": 0.1035, "step": 12495 }, { "epoch": 4.308912256507499, "grad_norm": 1.417384199119948, "learning_rate": 4.932888202610531e-07, "loss": 0.1072, "step": 12500 }, { "epoch": 4.3106360972246165, "grad_norm": 1.2442251101795154, "learning_rate": 4.908738182351941e-07, "loss": 0.101, "step": 12505 }, { "epoch": 4.312359937941734, "grad_norm": 1.43864313135744, "learning_rate": 4.884644371064801e-07, "loss": 0.1061, "step": 12510 }, { "epoch": 4.314083778658852, "grad_norm": 1.3610182737377219, "learning_rate": 4.860606798783479e-07, "loss": 0.105, "step": 12515 }, { "epoch": 4.31580761937597, "grad_norm": 1.362514085323076, "learning_rate": 4.836625495472274e-07, "loss": 0.0865, "step": 12520 }, { "epoch": 4.317531460093088, "grad_norm": 1.3079875797638718, "learning_rate": 4.812700491025318e-07, "loss": 0.0939, "step": 12525 }, { "epoch": 4.319255300810205, "grad_norm": 1.4818211035752797, "learning_rate": 4.788831815266554e-07, "loss": 0.1241, "step": 12530 }, { "epoch": 4.320979141527323, "grad_norm": 1.3672898407379324, "learning_rate": 4.7650194979497466e-07, "loss": 0.1004, "step": 12535 }, { "epoch": 4.322702982244441, "grad_norm": 1.4318534960326164, "learning_rate": 4.74126356875837e-07, "loss": 0.1111, "step": 12540 }, { "epoch": 4.324426822961558, "grad_norm": 1.370126841495434, "learning_rate": 4.717564057305607e-07, "loss": 0.114, "step": 12545 }, { "epoch": 4.326150663678676, "grad_norm": 1.5715046699340225, "learning_rate": 4.693920993134343e-07, "loss": 0.1066, "step": 12550 }, { "epoch": 4.327874504395794, "grad_norm": 1.3465254964642521, "learning_rate": 4.6703344057170807e-07, "loss": 0.0941, "step": 12555 }, { "epoch": 4.329598345112911, "grad_norm": 1.486207141933927, "learning_rate": 4.6468043244559167e-07, "loss": 0.1065, "step": 12560 }, { "epoch": 4.33132218583003, "grad_norm": 1.4194290234443416, "learning_rate": 4.6233307786825e-07, "loss": 0.1035, "step": 12565 }, { "epoch": 4.333046026547147, "grad_norm": 2.1555815012813704, "learning_rate": 4.5999137976580456e-07, "loss": 0.1046, "step": 12570 }, { "epoch": 4.3347698672642645, "grad_norm": 1.4582126311186856, "learning_rate": 4.576553410573209e-07, "loss": 0.1065, "step": 12575 }, { "epoch": 4.336493707981383, "grad_norm": 1.284759161569176, "learning_rate": 4.5532496465481246e-07, "loss": 0.1043, "step": 12580 }, { "epoch": 4.3382175486985, "grad_norm": 1.3575841090237788, "learning_rate": 4.5300025346323217e-07, "loss": 0.1074, "step": 12585 }, { "epoch": 4.339941389415618, "grad_norm": 1.2729313115551153, "learning_rate": 4.506812103804742e-07, "loss": 0.1014, "step": 12590 }, { "epoch": 4.341665230132736, "grad_norm": 1.3921280110786385, "learning_rate": 4.483678382973661e-07, "loss": 0.1049, "step": 12595 }, { "epoch": 4.343389070849853, "grad_norm": 1.4120792014341572, "learning_rate": 4.460601400976633e-07, "loss": 0.1082, "step": 12600 }, { "epoch": 4.3451129115669715, "grad_norm": 1.2266977720156238, "learning_rate": 4.4375811865805196e-07, "loss": 0.087, "step": 12605 }, { "epoch": 4.346836752284089, "grad_norm": 1.221441799906539, "learning_rate": 4.4146177684813993e-07, "loss": 0.102, "step": 12610 }, { "epoch": 4.348560593001206, "grad_norm": 1.4977195702203998, "learning_rate": 4.391711175304542e-07, "loss": 0.0958, "step": 12615 }, { "epoch": 4.350284433718325, "grad_norm": 1.3647702918679094, "learning_rate": 4.3688614356044155e-07, "loss": 0.1095, "step": 12620 }, { "epoch": 4.352008274435442, "grad_norm": 1.2978731041023601, "learning_rate": 4.3460685778645874e-07, "loss": 0.0941, "step": 12625 }, { "epoch": 4.35373211515256, "grad_norm": 1.3074376377403427, "learning_rate": 4.3233326304977175e-07, "loss": 0.083, "step": 12630 }, { "epoch": 4.355455955869678, "grad_norm": 1.3636889328659345, "learning_rate": 4.3006536218455355e-07, "loss": 0.0957, "step": 12635 }, { "epoch": 4.357179796586795, "grad_norm": 1.4090294409315518, "learning_rate": 4.278031580178804e-07, "loss": 0.0958, "step": 12640 }, { "epoch": 4.358903637303913, "grad_norm": 1.2745871352097167, "learning_rate": 4.2554665336972557e-07, "loss": 0.0992, "step": 12645 }, { "epoch": 4.360627478021031, "grad_norm": 1.3487303505526345, "learning_rate": 4.232958510529561e-07, "loss": 0.104, "step": 12650 }, { "epoch": 4.362351318738148, "grad_norm": 1.057505664160843, "learning_rate": 4.210507538733344e-07, "loss": 0.1073, "step": 12655 }, { "epoch": 4.3640751594552665, "grad_norm": 1.355665425463141, "learning_rate": 4.188113646295089e-07, "loss": 0.1096, "step": 12660 }, { "epoch": 4.365799000172384, "grad_norm": 1.2972887628731364, "learning_rate": 4.165776861130116e-07, "loss": 0.0861, "step": 12665 }, { "epoch": 4.367522840889502, "grad_norm": 1.3587682846022855, "learning_rate": 4.1434972110825864e-07, "loss": 0.0972, "step": 12670 }, { "epoch": 4.3692466816066196, "grad_norm": 1.4116821509583934, "learning_rate": 4.121274723925428e-07, "loss": 0.0954, "step": 12675 }, { "epoch": 4.370970522323737, "grad_norm": 1.4358790735628058, "learning_rate": 4.0991094273603036e-07, "loss": 0.0955, "step": 12680 }, { "epoch": 4.372694363040855, "grad_norm": 1.7436771542495768, "learning_rate": 4.077001349017579e-07, "loss": 0.1041, "step": 12685 }, { "epoch": 4.374418203757973, "grad_norm": 1.5809645807030175, "learning_rate": 4.054950516456324e-07, "loss": 0.1054, "step": 12690 }, { "epoch": 4.37614204447509, "grad_norm": 1.6113582839177205, "learning_rate": 4.0329569571642133e-07, "loss": 0.1119, "step": 12695 }, { "epoch": 4.377865885192208, "grad_norm": 1.3810798928916548, "learning_rate": 4.0110206985575495e-07, "loss": 0.1015, "step": 12700 }, { "epoch": 4.379589725909326, "grad_norm": 1.1764847209935048, "learning_rate": 3.989141767981186e-07, "loss": 0.1045, "step": 12705 }, { "epoch": 4.381313566626444, "grad_norm": 1.3729825902566304, "learning_rate": 3.967320192708535e-07, "loss": 0.1107, "step": 12710 }, { "epoch": 4.383037407343561, "grad_norm": 1.4615264832490804, "learning_rate": 3.945555999941514e-07, "loss": 0.0888, "step": 12715 }, { "epoch": 4.384761248060679, "grad_norm": 1.4273211790569875, "learning_rate": 3.9238492168104825e-07, "loss": 0.1132, "step": 12720 }, { "epoch": 4.386485088777797, "grad_norm": 1.3636642689942804, "learning_rate": 3.902199870374268e-07, "loss": 0.1168, "step": 12725 }, { "epoch": 4.3882089294949145, "grad_norm": 1.3843737512650844, "learning_rate": 3.880607987620072e-07, "loss": 0.0931, "step": 12730 }, { "epoch": 4.389932770212033, "grad_norm": 1.4950554554819624, "learning_rate": 3.8590735954634694e-07, "loss": 0.0935, "step": 12735 }, { "epoch": 4.39165661092915, "grad_norm": 1.4774430566556316, "learning_rate": 3.837596720748399e-07, "loss": 0.0926, "step": 12740 }, { "epoch": 4.393380451646268, "grad_norm": 1.3825430194966124, "learning_rate": 3.816177390247061e-07, "loss": 0.0957, "step": 12745 }, { "epoch": 4.395104292363386, "grad_norm": 1.3038503938953758, "learning_rate": 3.794815630659937e-07, "loss": 0.0958, "step": 12750 }, { "epoch": 4.396828133080503, "grad_norm": 1.3365803641344325, "learning_rate": 3.773511468615748e-07, "loss": 0.1042, "step": 12755 }, { "epoch": 4.3985519737976215, "grad_norm": 1.2221751365503517, "learning_rate": 3.7522649306714233e-07, "loss": 0.102, "step": 12760 }, { "epoch": 4.400275814514739, "grad_norm": 1.4462721087322152, "learning_rate": 3.731076043312054e-07, "loss": 0.1001, "step": 12765 }, { "epoch": 4.401999655231856, "grad_norm": 1.2748365997202482, "learning_rate": 3.70994483295084e-07, "loss": 0.0936, "step": 12770 }, { "epoch": 4.403723495948975, "grad_norm": 1.7392654589861656, "learning_rate": 3.688871325929128e-07, "loss": 0.1098, "step": 12775 }, { "epoch": 4.405447336666092, "grad_norm": 1.4703587444432689, "learning_rate": 3.6678555485163137e-07, "loss": 0.0943, "step": 12780 }, { "epoch": 4.407171177383209, "grad_norm": 1.342522738440642, "learning_rate": 3.646897526909815e-07, "loss": 0.1063, "step": 12785 }, { "epoch": 4.408895018100328, "grad_norm": 1.3022584845829768, "learning_rate": 3.625997287235067e-07, "loss": 0.1087, "step": 12790 }, { "epoch": 4.410618858817445, "grad_norm": 1.206791508217318, "learning_rate": 3.6051548555454785e-07, "loss": 0.0956, "step": 12795 }, { "epoch": 4.412342699534563, "grad_norm": 1.1279835274287382, "learning_rate": 3.5843702578224115e-07, "loss": 0.1004, "step": 12800 }, { "epoch": 4.414066540251681, "grad_norm": 1.561160217448479, "learning_rate": 3.563643519975091e-07, "loss": 0.0871, "step": 12805 }, { "epoch": 4.415790380968798, "grad_norm": 1.4484456113372957, "learning_rate": 3.5429746678406707e-07, "loss": 0.0924, "step": 12810 }, { "epoch": 4.4175142216859165, "grad_norm": 1.22779791194021, "learning_rate": 3.5223637271841026e-07, "loss": 0.0952, "step": 12815 }, { "epoch": 4.419238062403034, "grad_norm": 1.1526436057893366, "learning_rate": 3.501810723698168e-07, "loss": 0.0894, "step": 12820 }, { "epoch": 4.420961903120151, "grad_norm": 1.082238673798404, "learning_rate": 3.481315683003411e-07, "loss": 0.0788, "step": 12825 }, { "epoch": 4.4226857438372695, "grad_norm": 1.6270339011649144, "learning_rate": 3.460878630648157e-07, "loss": 0.1149, "step": 12830 }, { "epoch": 4.424409584554387, "grad_norm": 1.437972544443947, "learning_rate": 3.440499592108393e-07, "loss": 0.0859, "step": 12835 }, { "epoch": 4.426133425271505, "grad_norm": 1.4124833409613646, "learning_rate": 3.4201785927878375e-07, "loss": 0.1151, "step": 12840 }, { "epoch": 4.427857265988623, "grad_norm": 1.348271869233934, "learning_rate": 3.3999156580178384e-07, "loss": 0.0875, "step": 12845 }, { "epoch": 4.42958110670574, "grad_norm": 1.3166518724443603, "learning_rate": 3.379710813057363e-07, "loss": 0.1094, "step": 12850 }, { "epoch": 4.431304947422858, "grad_norm": 1.3729400568280912, "learning_rate": 3.3595640830929534e-07, "loss": 0.1004, "step": 12855 }, { "epoch": 4.433028788139976, "grad_norm": 1.526974997179749, "learning_rate": 3.3394754932387363e-07, "loss": 0.1012, "step": 12860 }, { "epoch": 4.434752628857094, "grad_norm": 1.3652631357819194, "learning_rate": 3.3194450685363364e-07, "loss": 0.108, "step": 12865 }, { "epoch": 4.436476469574211, "grad_norm": 1.4890953606733544, "learning_rate": 3.2994728339548863e-07, "loss": 0.0805, "step": 12870 }, { "epoch": 4.438200310291329, "grad_norm": 1.2502987339362712, "learning_rate": 3.279558814390982e-07, "loss": 0.091, "step": 12875 }, { "epoch": 4.439924151008447, "grad_norm": 1.3490773986660833, "learning_rate": 3.2597030346686544e-07, "loss": 0.1004, "step": 12880 }, { "epoch": 4.4416479917255645, "grad_norm": 1.567697673737467, "learning_rate": 3.239905519539316e-07, "loss": 0.1107, "step": 12885 }, { "epoch": 4.443371832442683, "grad_norm": 1.0639027461941597, "learning_rate": 3.2201662936817533e-07, "loss": 0.0951, "step": 12890 }, { "epoch": 4.4450956731598, "grad_norm": 1.2286238065647186, "learning_rate": 3.2004853817021233e-07, "loss": 0.102, "step": 12895 }, { "epoch": 4.4468195138769175, "grad_norm": 1.3921497125598463, "learning_rate": 3.18086280813385e-07, "loss": 0.1, "step": 12900 }, { "epoch": 4.448543354594036, "grad_norm": 1.2175773126722704, "learning_rate": 3.1612985974376563e-07, "loss": 0.0847, "step": 12905 }, { "epoch": 4.450267195311153, "grad_norm": 1.285169869770806, "learning_rate": 3.1417927740015064e-07, "loss": 0.103, "step": 12910 }, { "epoch": 4.451991036028271, "grad_norm": 1.463906692471606, "learning_rate": 3.1223453621405775e-07, "loss": 0.1014, "step": 12915 }, { "epoch": 4.453714876745389, "grad_norm": 1.2869230792060926, "learning_rate": 3.102956386097256e-07, "loss": 0.0885, "step": 12920 }, { "epoch": 4.455438717462506, "grad_norm": 1.5530548797185606, "learning_rate": 3.0836258700410515e-07, "loss": 0.0848, "step": 12925 }, { "epoch": 4.457162558179625, "grad_norm": 1.2862694034718802, "learning_rate": 3.064353838068629e-07, "loss": 0.1033, "step": 12930 }, { "epoch": 4.458886398896742, "grad_norm": 1.4721761441567474, "learning_rate": 3.0451403142037263e-07, "loss": 0.0851, "step": 12935 }, { "epoch": 4.460610239613859, "grad_norm": 1.4669459599969508, "learning_rate": 3.0259853223971513e-07, "loss": 0.1067, "step": 12940 }, { "epoch": 4.462334080330978, "grad_norm": 1.37748309479494, "learning_rate": 3.0068888865267707e-07, "loss": 0.0942, "step": 12945 }, { "epoch": 4.464057921048095, "grad_norm": 1.6375777308007735, "learning_rate": 2.9878510303974375e-07, "loss": 0.1137, "step": 12950 }, { "epoch": 4.4657817617652125, "grad_norm": 1.1204002889094036, "learning_rate": 2.968871777740967e-07, "loss": 0.0971, "step": 12955 }, { "epoch": 4.467505602482331, "grad_norm": 1.0897762511129812, "learning_rate": 2.9499511522161516e-07, "loss": 0.0856, "step": 12960 }, { "epoch": 4.469229443199448, "grad_norm": 1.2686325796294473, "learning_rate": 2.9310891774087023e-07, "loss": 0.0963, "step": 12965 }, { "epoch": 4.470953283916566, "grad_norm": 1.3901134913359297, "learning_rate": 2.912285876831195e-07, "loss": 0.111, "step": 12970 }, { "epoch": 4.472677124633684, "grad_norm": 1.13042242981474, "learning_rate": 2.893541273923067e-07, "loss": 0.1015, "step": 12975 }, { "epoch": 4.474400965350801, "grad_norm": 1.5101556464880321, "learning_rate": 2.874855392050607e-07, "loss": 0.1179, "step": 12980 }, { "epoch": 4.4761248060679195, "grad_norm": 1.5865788619169257, "learning_rate": 2.856228254506888e-07, "loss": 0.1214, "step": 12985 }, { "epoch": 4.477848646785037, "grad_norm": 1.321921555506973, "learning_rate": 2.8376598845117566e-07, "loss": 0.0976, "step": 12990 }, { "epoch": 4.479572487502155, "grad_norm": 1.3855200764691942, "learning_rate": 2.819150305211793e-07, "loss": 0.105, "step": 12995 }, { "epoch": 4.481296328219273, "grad_norm": 1.5457327572351867, "learning_rate": 2.8006995396803127e-07, "loss": 0.113, "step": 13000 }, { "epoch": 4.48302016893639, "grad_norm": 0.8593743094516547, "learning_rate": 2.782307610917312e-07, "loss": 0.0854, "step": 13005 }, { "epoch": 4.484744009653508, "grad_norm": 1.3952963557799567, "learning_rate": 2.7639745418494233e-07, "loss": 0.0969, "step": 13010 }, { "epoch": 4.486467850370626, "grad_norm": 1.3731902241157263, "learning_rate": 2.7457003553299275e-07, "loss": 0.0804, "step": 13015 }, { "epoch": 4.488191691087744, "grad_norm": 1.3022314660354217, "learning_rate": 2.727485074138703e-07, "loss": 0.1048, "step": 13020 }, { "epoch": 4.489915531804861, "grad_norm": 1.227026767751595, "learning_rate": 2.709328720982185e-07, "loss": 0.1066, "step": 13025 }, { "epoch": 4.491639372521979, "grad_norm": 1.1417604541849078, "learning_rate": 2.691231318493354e-07, "loss": 0.0859, "step": 13030 }, { "epoch": 4.493363213239097, "grad_norm": 1.4900371489784046, "learning_rate": 2.6731928892317295e-07, "loss": 0.1037, "step": 13035 }, { "epoch": 4.495087053956214, "grad_norm": 1.2151600207334885, "learning_rate": 2.6552134556832863e-07, "loss": 0.0986, "step": 13040 }, { "epoch": 4.496810894673332, "grad_norm": 1.8363902524651001, "learning_rate": 2.637293040260469e-07, "loss": 0.0923, "step": 13045 }, { "epoch": 4.49853473539045, "grad_norm": 1.4983308920248348, "learning_rate": 2.6194316653021634e-07, "loss": 0.1064, "step": 13050 }, { "epoch": 4.5002585761075675, "grad_norm": 1.2872110404664545, "learning_rate": 2.6016293530736483e-07, "loss": 0.0919, "step": 13055 }, { "epoch": 4.501982416824685, "grad_norm": 1.5124529378726344, "learning_rate": 2.583886125766566e-07, "loss": 0.1209, "step": 13060 }, { "epoch": 4.503706257541803, "grad_norm": 1.287728451640388, "learning_rate": 2.56620200549893e-07, "loss": 0.0908, "step": 13065 }, { "epoch": 4.505430098258921, "grad_norm": 1.4652171994556562, "learning_rate": 2.548577014315051e-07, "loss": 0.0917, "step": 13070 }, { "epoch": 4.507153938976039, "grad_norm": 1.7160599080486028, "learning_rate": 2.531011174185544e-07, "loss": 0.0944, "step": 13075 }, { "epoch": 4.508877779693156, "grad_norm": 1.738633885349048, "learning_rate": 2.5135045070072805e-07, "loss": 0.1083, "step": 13080 }, { "epoch": 4.510601620410274, "grad_norm": 1.5220304294741687, "learning_rate": 2.4960570346033885e-07, "loss": 0.0982, "step": 13085 }, { "epoch": 4.512325461127392, "grad_norm": 1.386549851427352, "learning_rate": 2.478668778723181e-07, "loss": 0.1111, "step": 13090 }, { "epoch": 4.514049301844509, "grad_norm": 1.366751422364679, "learning_rate": 2.4613397610421694e-07, "loss": 0.0901, "step": 13095 }, { "epoch": 4.515773142561628, "grad_norm": 1.2455467002386247, "learning_rate": 2.444070003162019e-07, "loss": 0.0997, "step": 13100 }, { "epoch": 4.517496983278745, "grad_norm": 1.4699538249465196, "learning_rate": 2.4268595266105145e-07, "loss": 0.1123, "step": 13105 }, { "epoch": 4.5192208239958624, "grad_norm": 1.1079905013317122, "learning_rate": 2.409708352841561e-07, "loss": 0.0929, "step": 13110 }, { "epoch": 4.520944664712981, "grad_norm": 1.3514740009692399, "learning_rate": 2.392616503235118e-07, "loss": 0.1114, "step": 13115 }, { "epoch": 4.522668505430098, "grad_norm": 1.2706570686597356, "learning_rate": 2.3755839990972086e-07, "loss": 0.102, "step": 13120 }, { "epoch": 4.524392346147216, "grad_norm": 1.2544292470805234, "learning_rate": 2.3586108616598825e-07, "loss": 0.1045, "step": 13125 }, { "epoch": 4.526116186864334, "grad_norm": 1.4872975760870342, "learning_rate": 2.3416971120811594e-07, "loss": 0.1217, "step": 13130 }, { "epoch": 4.527840027581451, "grad_norm": 1.454057940553262, "learning_rate": 2.3248427714450684e-07, "loss": 0.087, "step": 13135 }, { "epoch": 4.5295638682985695, "grad_norm": 1.563757832532968, "learning_rate": 2.3080478607615475e-07, "loss": 0.119, "step": 13140 }, { "epoch": 4.531287709015687, "grad_norm": 1.374597331759327, "learning_rate": 2.291312400966461e-07, "loss": 0.0905, "step": 13145 }, { "epoch": 4.533011549732805, "grad_norm": 1.4757684836095026, "learning_rate": 2.274636412921566e-07, "loss": 0.1004, "step": 13150 }, { "epoch": 4.534735390449923, "grad_norm": 1.5931262829105004, "learning_rate": 2.2580199174144946e-07, "loss": 0.0983, "step": 13155 }, { "epoch": 4.53645923116704, "grad_norm": 1.426226613506302, "learning_rate": 2.2414629351586946e-07, "loss": 0.0787, "step": 13160 }, { "epoch": 4.538183071884158, "grad_norm": 1.4154589419608299, "learning_rate": 2.224965486793451e-07, "loss": 0.0979, "step": 13165 }, { "epoch": 4.539906912601276, "grad_norm": 1.3557580241854177, "learning_rate": 2.2085275928838245e-07, "loss": 0.1036, "step": 13170 }, { "epoch": 4.541630753318393, "grad_norm": 1.492145085562156, "learning_rate": 2.1921492739206463e-07, "loss": 0.0893, "step": 13175 }, { "epoch": 4.543354594035511, "grad_norm": 1.2949241272414553, "learning_rate": 2.1758305503204568e-07, "loss": 0.0967, "step": 13180 }, { "epoch": 4.545078434752629, "grad_norm": 1.2385526324923133, "learning_rate": 2.1595714424255453e-07, "loss": 0.0954, "step": 13185 }, { "epoch": 4.546802275469746, "grad_norm": 1.690001400330076, "learning_rate": 2.1433719705038602e-07, "loss": 0.1107, "step": 13190 }, { "epoch": 4.548526116186864, "grad_norm": 1.2457330913666358, "learning_rate": 2.127232154749026e-07, "loss": 0.0998, "step": 13195 }, { "epoch": 4.550249956903982, "grad_norm": 1.3930447286730825, "learning_rate": 2.1111520152802767e-07, "loss": 0.085, "step": 13200 }, { "epoch": 4.5519737976211, "grad_norm": 1.0526105841258329, "learning_rate": 2.0951315721424893e-07, "loss": 0.1005, "step": 13205 }, { "epoch": 4.5536976383382175, "grad_norm": 1.5401083244814908, "learning_rate": 2.0791708453061054e-07, "loss": 0.1141, "step": 13210 }, { "epoch": 4.555421479055335, "grad_norm": 1.3508795386173233, "learning_rate": 2.0632698546671327e-07, "loss": 0.0975, "step": 13215 }, { "epoch": 4.557145319772453, "grad_norm": 1.4921763974196698, "learning_rate": 2.0474286200471149e-07, "loss": 0.1068, "step": 13220 }, { "epoch": 4.558869160489571, "grad_norm": 1.4736816750777688, "learning_rate": 2.0316471611931066e-07, "loss": 0.1236, "step": 13225 }, { "epoch": 4.560593001206689, "grad_norm": 1.6234184304688664, "learning_rate": 2.0159254977776376e-07, "loss": 0.0953, "step": 13230 }, { "epoch": 4.562316841923806, "grad_norm": 1.3390965225224296, "learning_rate": 2.0002636493987037e-07, "loss": 0.0923, "step": 13235 }, { "epoch": 4.564040682640924, "grad_norm": 1.2978219433611065, "learning_rate": 1.98466163557976e-07, "loss": 0.0985, "step": 13240 }, { "epoch": 4.565764523358042, "grad_norm": 1.7456280452183062, "learning_rate": 1.969119475769632e-07, "loss": 0.094, "step": 13245 }, { "epoch": 4.567488364075159, "grad_norm": 1.4884220943358613, "learning_rate": 1.9536371893425776e-07, "loss": 0.1006, "step": 13250 }, { "epoch": 4.569212204792278, "grad_norm": 1.3322884018508931, "learning_rate": 1.9382147955981923e-07, "loss": 0.0946, "step": 13255 }, { "epoch": 4.570936045509395, "grad_norm": 1.3238506807068622, "learning_rate": 1.922852313761414e-07, "loss": 0.1093, "step": 13260 }, { "epoch": 4.572659886226512, "grad_norm": 1.2103606889951604, "learning_rate": 1.907549762982508e-07, "loss": 0.0954, "step": 13265 }, { "epoch": 4.574383726943631, "grad_norm": 1.5514619596585275, "learning_rate": 1.8923071623370093e-07, "loss": 0.0957, "step": 13270 }, { "epoch": 4.576107567660748, "grad_norm": 1.319009896706714, "learning_rate": 1.877124530825758e-07, "loss": 0.0891, "step": 13275 }, { "epoch": 4.5778314083778655, "grad_norm": 1.12910226251704, "learning_rate": 1.862001887374798e-07, "loss": 0.0917, "step": 13280 }, { "epoch": 4.579555249094984, "grad_norm": 1.2729579102654363, "learning_rate": 1.8469392508354277e-07, "loss": 0.1078, "step": 13285 }, { "epoch": 4.581279089812101, "grad_norm": 1.3691912041824974, "learning_rate": 1.8319366399841331e-07, "loss": 0.1023, "step": 13290 }, { "epoch": 4.5830029305292195, "grad_norm": 1.4430583312306229, "learning_rate": 1.81699407352256e-07, "loss": 0.1051, "step": 13295 }, { "epoch": 4.584726771246337, "grad_norm": 1.4521508718047706, "learning_rate": 1.8021115700775193e-07, "loss": 0.104, "step": 13300 }, { "epoch": 4.586450611963454, "grad_norm": 1.4938106721352669, "learning_rate": 1.7872891482009546e-07, "loss": 0.0914, "step": 13305 }, { "epoch": 4.588174452680573, "grad_norm": 1.2759287382605968, "learning_rate": 1.772526826369897e-07, "loss": 0.0878, "step": 13310 }, { "epoch": 4.58989829339769, "grad_norm": 1.2838268078011246, "learning_rate": 1.7578246229864816e-07, "loss": 0.0973, "step": 13315 }, { "epoch": 4.591622134114807, "grad_norm": 1.4327722570452122, "learning_rate": 1.7431825563778705e-07, "loss": 0.0902, "step": 13320 }, { "epoch": 4.593345974831926, "grad_norm": 1.603054932211892, "learning_rate": 1.7286006447962912e-07, "loss": 0.1157, "step": 13325 }, { "epoch": 4.595069815549043, "grad_norm": 1.5177612887963574, "learning_rate": 1.714078906418981e-07, "loss": 0.0929, "step": 13330 }, { "epoch": 4.596793656266161, "grad_norm": 1.6769181249663976, "learning_rate": 1.6996173593481546e-07, "loss": 0.1066, "step": 13335 }, { "epoch": 4.598517496983279, "grad_norm": 1.1958153335576733, "learning_rate": 1.6852160216110026e-07, "loss": 0.0943, "step": 13340 }, { "epoch": 4.600241337700396, "grad_norm": 1.3155997170309006, "learning_rate": 1.6708749111596535e-07, "loss": 0.093, "step": 13345 }, { "epoch": 4.601965178417514, "grad_norm": 1.4886335786550116, "learning_rate": 1.656594045871174e-07, "loss": 0.104, "step": 13350 }, { "epoch": 4.603689019134632, "grad_norm": 1.2695988845935569, "learning_rate": 1.642373443547507e-07, "loss": 0.0971, "step": 13355 }, { "epoch": 4.60541285985175, "grad_norm": 1.5233611092597934, "learning_rate": 1.6282131219155062e-07, "loss": 0.0902, "step": 13360 }, { "epoch": 4.6071367005688675, "grad_norm": 1.5912394263506853, "learning_rate": 1.6141130986268516e-07, "loss": 0.1179, "step": 13365 }, { "epoch": 4.608860541285985, "grad_norm": 1.1955424578211602, "learning_rate": 1.600073391258078e-07, "loss": 0.0924, "step": 13370 }, { "epoch": 4.610584382003103, "grad_norm": 1.5460175407099928, "learning_rate": 1.5860940173105244e-07, "loss": 0.1013, "step": 13375 }, { "epoch": 4.612308222720221, "grad_norm": 1.2221212450599088, "learning_rate": 1.5721749942103237e-07, "loss": 0.0811, "step": 13380 }, { "epoch": 4.614032063437339, "grad_norm": 1.2464206154528004, "learning_rate": 1.5583163393083689e-07, "loss": 0.0982, "step": 13385 }, { "epoch": 4.615755904154456, "grad_norm": 1.4499475052958257, "learning_rate": 1.544518069880313e-07, "loss": 0.0837, "step": 13390 }, { "epoch": 4.617479744871574, "grad_norm": 1.226797858106584, "learning_rate": 1.5307802031265305e-07, "loss": 0.0927, "step": 13395 }, { "epoch": 4.619203585588692, "grad_norm": 1.4748541756460058, "learning_rate": 1.5171027561720953e-07, "loss": 0.1069, "step": 13400 }, { "epoch": 4.620927426305809, "grad_norm": 1.3988443218251998, "learning_rate": 1.503485746066763e-07, "loss": 0.1, "step": 13405 }, { "epoch": 4.622651267022927, "grad_norm": 1.2656574957575772, "learning_rate": 1.4899291897849665e-07, "loss": 0.1054, "step": 13410 }, { "epoch": 4.624375107740045, "grad_norm": 1.449296239770174, "learning_rate": 1.4764331042257662e-07, "loss": 0.0973, "step": 13415 }, { "epoch": 4.626098948457162, "grad_norm": 1.1166203335379397, "learning_rate": 1.4629975062128432e-07, "loss": 0.0951, "step": 13420 }, { "epoch": 4.62782278917428, "grad_norm": 1.296216895911148, "learning_rate": 1.449622412494478e-07, "loss": 0.0983, "step": 13425 }, { "epoch": 4.629546629891398, "grad_norm": 1.7368534817398664, "learning_rate": 1.4363078397435336e-07, "loss": 0.1077, "step": 13430 }, { "epoch": 4.6312704706085155, "grad_norm": 1.1978076419655144, "learning_rate": 1.4230538045574283e-07, "loss": 0.0942, "step": 13435 }, { "epoch": 4.632994311325634, "grad_norm": 1.243838839475416, "learning_rate": 1.409860323458101e-07, "loss": 0.0785, "step": 13440 }, { "epoch": 4.634718152042751, "grad_norm": 1.3067558831150292, "learning_rate": 1.396727412892035e-07, "loss": 0.0794, "step": 13445 }, { "epoch": 4.636441992759869, "grad_norm": 1.4041963387833243, "learning_rate": 1.3836550892301792e-07, "loss": 0.1103, "step": 13450 }, { "epoch": 4.638165833476987, "grad_norm": 1.2034301664062652, "learning_rate": 1.370643368767982e-07, "loss": 0.0922, "step": 13455 }, { "epoch": 4.639889674194104, "grad_norm": 1.5013055926060335, "learning_rate": 1.3576922677253413e-07, "loss": 0.1065, "step": 13460 }, { "epoch": 4.6416135149112225, "grad_norm": 1.1145947340545772, "learning_rate": 1.3448018022465758e-07, "loss": 0.0966, "step": 13465 }, { "epoch": 4.64333735562834, "grad_norm": 1.2193378482255657, "learning_rate": 1.3319719884004268e-07, "loss": 0.0977, "step": 13470 }, { "epoch": 4.645061196345457, "grad_norm": 1.2820035565815853, "learning_rate": 1.3192028421800286e-07, "loss": 0.0826, "step": 13475 }, { "epoch": 4.646785037062576, "grad_norm": 1.1676360477766345, "learning_rate": 1.3064943795028927e-07, "loss": 0.0865, "step": 13480 }, { "epoch": 4.648508877779693, "grad_norm": 1.268140172438007, "learning_rate": 1.2938466162108755e-07, "loss": 0.0961, "step": 13485 }, { "epoch": 4.650232718496811, "grad_norm": 1.2316793005853521, "learning_rate": 1.2812595680701868e-07, "loss": 0.0899, "step": 13490 }, { "epoch": 4.651956559213929, "grad_norm": 1.3560309380214235, "learning_rate": 1.2687332507713367e-07, "loss": 0.0919, "step": 13495 }, { "epoch": 4.653680399931046, "grad_norm": 1.3211197526343696, "learning_rate": 1.2562676799291295e-07, "loss": 0.0936, "step": 13500 }, { "epoch": 4.655404240648164, "grad_norm": 1.7175327864120091, "learning_rate": 1.2438628710826462e-07, "loss": 0.0963, "step": 13505 }, { "epoch": 4.657128081365282, "grad_norm": 1.3462874472570279, "learning_rate": 1.2315188396952393e-07, "loss": 0.0942, "step": 13510 }, { "epoch": 4.6588519220824, "grad_norm": 1.3704760307183306, "learning_rate": 1.219235601154478e-07, "loss": 0.0994, "step": 13515 }, { "epoch": 4.6605757627995175, "grad_norm": 1.527220264908924, "learning_rate": 1.2070131707721645e-07, "loss": 0.0887, "step": 13520 }, { "epoch": 4.662299603516635, "grad_norm": 1.38286071365882, "learning_rate": 1.1948515637842772e-07, "loss": 0.109, "step": 13525 }, { "epoch": 4.664023444233753, "grad_norm": 1.3332598116491754, "learning_rate": 1.1827507953510065e-07, "loss": 0.1045, "step": 13530 }, { "epoch": 4.6657472849508705, "grad_norm": 1.4427499746222403, "learning_rate": 1.1707108805566914e-07, "loss": 0.1141, "step": 13535 }, { "epoch": 4.667471125667988, "grad_norm": 1.137998617122497, "learning_rate": 1.1587318344097987e-07, "loss": 0.083, "step": 13540 }, { "epoch": 4.669194966385106, "grad_norm": 1.600272132407465, "learning_rate": 1.146813671842939e-07, "loss": 0.0856, "step": 13545 }, { "epoch": 4.670918807102224, "grad_norm": 1.6046131715497098, "learning_rate": 1.1349564077128172e-07, "loss": 0.1054, "step": 13550 }, { "epoch": 4.672642647819341, "grad_norm": 1.1104256373747428, "learning_rate": 1.1231600568002266e-07, "loss": 0.0899, "step": 13555 }, { "epoch": 4.674366488536459, "grad_norm": 1.352860699849487, "learning_rate": 1.1114246338100209e-07, "loss": 0.0748, "step": 13560 }, { "epoch": 4.676090329253577, "grad_norm": 1.4976105797333303, "learning_rate": 1.0997501533711263e-07, "loss": 0.1103, "step": 13565 }, { "epoch": 4.677814169970695, "grad_norm": 1.3136807231096304, "learning_rate": 1.0881366300364681e-07, "loss": 0.0818, "step": 13570 }, { "epoch": 4.679538010687812, "grad_norm": 1.4188593207640947, "learning_rate": 1.0765840782830106e-07, "loss": 0.0924, "step": 13575 }, { "epoch": 4.68126185140493, "grad_norm": 1.5397827779204702, "learning_rate": 1.0650925125117062e-07, "loss": 0.0937, "step": 13580 }, { "epoch": 4.682985692122048, "grad_norm": 1.120202347782216, "learning_rate": 1.0536619470474852e-07, "loss": 0.0867, "step": 13585 }, { "epoch": 4.6847095328391655, "grad_norm": 1.820853812092607, "learning_rate": 1.0422923961392328e-07, "loss": 0.0957, "step": 13590 }, { "epoch": 4.686433373556284, "grad_norm": 1.4286969067738933, "learning_rate": 1.0309838739597677e-07, "loss": 0.1019, "step": 13595 }, { "epoch": 4.688157214273401, "grad_norm": 1.2270942167491161, "learning_rate": 1.0197363946058637e-07, "loss": 0.0867, "step": 13600 }, { "epoch": 4.6898810549905185, "grad_norm": 2.019887324948578, "learning_rate": 1.0085499720981661e-07, "loss": 0.1058, "step": 13605 }, { "epoch": 4.691604895707637, "grad_norm": 1.4971597940304116, "learning_rate": 9.97424620381221e-08, "loss": 0.0966, "step": 13610 }, { "epoch": 4.693328736424754, "grad_norm": 1.3518552066019132, "learning_rate": 9.863603533234622e-08, "loss": 0.1033, "step": 13615 }, { "epoch": 4.6950525771418725, "grad_norm": 1.3860843666652813, "learning_rate": 9.753571847171572e-08, "loss": 0.0901, "step": 13620 }, { "epoch": 4.69677641785899, "grad_norm": 1.3014880507475362, "learning_rate": 9.644151282784119e-08, "loss": 0.0965, "step": 13625 }, { "epoch": 4.698500258576107, "grad_norm": 1.4008498586606288, "learning_rate": 9.535341976471713e-08, "loss": 0.0999, "step": 13630 }, { "epoch": 4.700224099293226, "grad_norm": 1.322094982701535, "learning_rate": 9.427144063871629e-08, "loss": 0.0812, "step": 13635 }, { "epoch": 4.701947940010343, "grad_norm": 1.3876031917007183, "learning_rate": 9.319557679859093e-08, "loss": 0.0933, "step": 13640 }, { "epoch": 4.70367178072746, "grad_norm": 1.110067386556814, "learning_rate": 9.212582958546989e-08, "loss": 0.0985, "step": 13645 }, { "epoch": 4.705395621444579, "grad_norm": 1.6536004853182418, "learning_rate": 9.106220033285762e-08, "loss": 0.0976, "step": 13650 }, { "epoch": 4.707119462161696, "grad_norm": 1.3877762732500114, "learning_rate": 9.000469036663128e-08, "loss": 0.0922, "step": 13655 }, { "epoch": 4.708843302878814, "grad_norm": 1.4471504297036866, "learning_rate": 8.89533010050414e-08, "loss": 0.0933, "step": 13660 }, { "epoch": 4.710567143595932, "grad_norm": 1.345184410865251, "learning_rate": 8.790803355870847e-08, "loss": 0.1045, "step": 13665 }, { "epoch": 4.712290984313049, "grad_norm": 1.2747865374695397, "learning_rate": 8.686888933062076e-08, "loss": 0.1097, "step": 13670 }, { "epoch": 4.714014825030167, "grad_norm": 1.2778218298625545, "learning_rate": 8.583586961613432e-08, "loss": 0.1021, "step": 13675 }, { "epoch": 4.715738665747285, "grad_norm": 1.2033123973842108, "learning_rate": 8.480897570296964e-08, "loss": 0.0995, "step": 13680 }, { "epoch": 4.717462506464402, "grad_norm": 1.1922744353834456, "learning_rate": 8.378820887121276e-08, "loss": 0.0794, "step": 13685 }, { "epoch": 4.7191863471815205, "grad_norm": 1.2173613841871695, "learning_rate": 8.277357039330969e-08, "loss": 0.094, "step": 13690 }, { "epoch": 4.720910187898638, "grad_norm": 1.4153786129778525, "learning_rate": 8.176506153406983e-08, "loss": 0.0825, "step": 13695 }, { "epoch": 4.722634028615756, "grad_norm": 1.2756006556141246, "learning_rate": 8.07626835506592e-08, "loss": 0.0915, "step": 13700 }, { "epoch": 4.724357869332874, "grad_norm": 1.3386533645764536, "learning_rate": 7.976643769260329e-08, "loss": 0.0966, "step": 13705 }, { "epoch": 4.726081710049991, "grad_norm": 1.2853753966838655, "learning_rate": 7.877632520178146e-08, "loss": 0.0975, "step": 13710 }, { "epoch": 4.727805550767109, "grad_norm": 1.200683434493203, "learning_rate": 7.779234731242869e-08, "loss": 0.1097, "step": 13715 }, { "epoch": 4.729529391484227, "grad_norm": 1.2438111752310064, "learning_rate": 7.68145052511332e-08, "loss": 0.0969, "step": 13720 }, { "epoch": 4.731253232201345, "grad_norm": 1.4857904886894677, "learning_rate": 7.584280023683333e-08, "loss": 0.0981, "step": 13725 }, { "epoch": 4.732977072918462, "grad_norm": 1.2569915355430077, "learning_rate": 7.487723348081788e-08, "loss": 0.0946, "step": 13730 }, { "epoch": 4.73470091363558, "grad_norm": 1.3006579967061103, "learning_rate": 7.391780618672461e-08, "loss": 0.0979, "step": 13735 }, { "epoch": 4.736424754352698, "grad_norm": 1.2060927022007693, "learning_rate": 7.296451955053685e-08, "loss": 0.0971, "step": 13740 }, { "epoch": 4.7381485950698154, "grad_norm": 1.333980082942855, "learning_rate": 7.201737476058346e-08, "loss": 0.098, "step": 13745 }, { "epoch": 4.739872435786934, "grad_norm": 1.528677809213225, "learning_rate": 7.107637299753833e-08, "loss": 0.0991, "step": 13750 }, { "epoch": 4.741596276504051, "grad_norm": 1.5331933283271482, "learning_rate": 7.01415154344165e-08, "loss": 0.0996, "step": 13755 }, { "epoch": 4.7433201172211685, "grad_norm": 1.5113460268661256, "learning_rate": 6.921280323657354e-08, "loss": 0.0921, "step": 13760 }, { "epoch": 4.745043957938287, "grad_norm": 1.5194760667157945, "learning_rate": 6.829023756170505e-08, "loss": 0.101, "step": 13765 }, { "epoch": 4.746767798655404, "grad_norm": 6.4406094901708775, "learning_rate": 6.737381955984556e-08, "loss": 0.0842, "step": 13770 }, { "epoch": 4.748491639372522, "grad_norm": 1.3920559725715298, "learning_rate": 6.646355037336461e-08, "loss": 0.102, "step": 13775 }, { "epoch": 4.75021548008964, "grad_norm": 1.5441176486271477, "learning_rate": 6.555943113696783e-08, "loss": 0.091, "step": 13780 }, { "epoch": 4.751939320806757, "grad_norm": 1.6977755494926987, "learning_rate": 6.466146297769427e-08, "loss": 0.112, "step": 13785 }, { "epoch": 4.753663161523876, "grad_norm": 1.1084874788367438, "learning_rate": 6.376964701491518e-08, "loss": 0.096, "step": 13790 }, { "epoch": 4.755387002240993, "grad_norm": 1.3672625263770586, "learning_rate": 6.288398436033294e-08, "loss": 0.1067, "step": 13795 }, { "epoch": 4.75711084295811, "grad_norm": 1.4085136982008044, "learning_rate": 6.200447611797889e-08, "loss": 0.0996, "step": 13800 }, { "epoch": 4.758834683675229, "grad_norm": 1.315774980855411, "learning_rate": 6.113112338421379e-08, "loss": 0.0854, "step": 13805 }, { "epoch": 4.760558524392346, "grad_norm": 1.3449961450823873, "learning_rate": 6.026392724772346e-08, "loss": 0.1049, "step": 13810 }, { "epoch": 4.7622823651094635, "grad_norm": 1.3416353730437305, "learning_rate": 5.9402888789520386e-08, "loss": 0.1074, "step": 13815 }, { "epoch": 4.764006205826582, "grad_norm": 1.6156281298912567, "learning_rate": 5.8548009082941005e-08, "loss": 0.1059, "step": 13820 }, { "epoch": 4.765730046543699, "grad_norm": 1.906243084579866, "learning_rate": 5.769928919364454e-08, "loss": 0.1118, "step": 13825 }, { "epoch": 4.767453887260817, "grad_norm": 1.2151040203494308, "learning_rate": 5.68567301796108e-08, "loss": 0.1082, "step": 13830 }, { "epoch": 4.769177727977935, "grad_norm": 1.4233744317358104, "learning_rate": 5.6020333091140743e-08, "loss": 0.1141, "step": 13835 }, { "epoch": 4.770901568695052, "grad_norm": 1.2188441261815888, "learning_rate": 5.51900989708537e-08, "loss": 0.0909, "step": 13840 }, { "epoch": 4.7726254094121705, "grad_norm": 3.5705776344629556, "learning_rate": 5.43660288536868e-08, "loss": 0.0996, "step": 13845 }, { "epoch": 4.774349250129288, "grad_norm": 1.5089928203842367, "learning_rate": 5.3548123766891666e-08, "loss": 0.0836, "step": 13850 }, { "epoch": 4.776073090846406, "grad_norm": 1.4416933213557006, "learning_rate": 5.2736384730037726e-08, "loss": 0.0901, "step": 13855 }, { "epoch": 4.777796931563524, "grad_norm": 1.4536660462965365, "learning_rate": 5.1930812755005554e-08, "loss": 0.1095, "step": 13860 }, { "epoch": 4.779520772280641, "grad_norm": 1.394197227843644, "learning_rate": 5.1131408845989106e-08, "loss": 0.1024, "step": 13865 }, { "epoch": 4.781244612997759, "grad_norm": 1.4539150414557434, "learning_rate": 5.0338173999494586e-08, "loss": 0.1045, "step": 13870 }, { "epoch": 4.782968453714877, "grad_norm": 1.2009776217172086, "learning_rate": 4.9551109204336034e-08, "loss": 0.0786, "step": 13875 }, { "epoch": 4.784692294431995, "grad_norm": 1.3029531931726013, "learning_rate": 4.877021544163696e-08, "loss": 0.1031, "step": 13880 }, { "epoch": 4.786416135149112, "grad_norm": 1.4638350709115346, "learning_rate": 4.79954936848287e-08, "loss": 0.1285, "step": 13885 }, { "epoch": 4.78813997586623, "grad_norm": 1.325904788671911, "learning_rate": 4.7226944899649296e-08, "loss": 0.0951, "step": 13890 }, { "epoch": 4.789863816583348, "grad_norm": 1.5621696558731313, "learning_rate": 4.646457004413962e-08, "loss": 0.0955, "step": 13895 }, { "epoch": 4.791587657300465, "grad_norm": 1.6548210222936395, "learning_rate": 4.5708370068646144e-08, "loss": 0.0971, "step": 13900 }, { "epoch": 4.793311498017583, "grad_norm": 1.3102386537006776, "learning_rate": 4.495834591581871e-08, "loss": 0.0956, "step": 13905 }, { "epoch": 4.795035338734701, "grad_norm": 1.1558147781651293, "learning_rate": 4.4214498520607216e-08, "loss": 0.0844, "step": 13910 }, { "epoch": 4.7967591794518185, "grad_norm": 1.3034762756808058, "learning_rate": 4.3476828810261054e-08, "loss": 0.0976, "step": 13915 }, { "epoch": 4.798483020168936, "grad_norm": 1.0778541595176203, "learning_rate": 4.2745337704331316e-08, "loss": 0.0869, "step": 13920 }, { "epoch": 4.800206860886054, "grad_norm": 1.5648959055945622, "learning_rate": 4.202002611466471e-08, "loss": 0.1046, "step": 13925 }, { "epoch": 4.801930701603172, "grad_norm": 1.4808155428582115, "learning_rate": 4.130089494540635e-08, "loss": 0.0988, "step": 13930 }, { "epoch": 4.80365454232029, "grad_norm": 1.4236243594177849, "learning_rate": 4.058794509299635e-08, "loss": 0.0989, "step": 13935 }, { "epoch": 4.805378383037407, "grad_norm": 1.2427124493604509, "learning_rate": 3.9881177446169376e-08, "loss": 0.1012, "step": 13940 }, { "epoch": 4.807102223754525, "grad_norm": 1.9298724366113007, "learning_rate": 3.918059288595399e-08, "loss": 0.1038, "step": 13945 }, { "epoch": 4.808826064471643, "grad_norm": 1.2998729388360812, "learning_rate": 3.848619228567107e-08, "loss": 0.1094, "step": 13950 }, { "epoch": 4.81054990518876, "grad_norm": 1.3651422935501445, "learning_rate": 3.7797976510933196e-08, "loss": 0.0985, "step": 13955 }, { "epoch": 4.812273745905879, "grad_norm": 1.3622378507771178, "learning_rate": 3.711594641964189e-08, "loss": 0.1002, "step": 13960 }, { "epoch": 4.813997586622996, "grad_norm": 1.1948443111747744, "learning_rate": 3.644010286198929e-08, "loss": 0.0919, "step": 13965 }, { "epoch": 4.815721427340113, "grad_norm": 1.3791338577576255, "learning_rate": 3.577044668045482e-08, "loss": 0.1104, "step": 13970 }, { "epoch": 4.817445268057232, "grad_norm": 1.422126479577189, "learning_rate": 3.5106978709805726e-08, "loss": 0.1, "step": 13975 }, { "epoch": 4.819169108774349, "grad_norm": 1.3106926505898486, "learning_rate": 3.4449699777093226e-08, "loss": 0.094, "step": 13980 }, { "epoch": 4.820892949491467, "grad_norm": 1.1769287100489754, "learning_rate": 3.3798610701656906e-08, "loss": 0.0864, "step": 13985 }, { "epoch": 4.822616790208585, "grad_norm": 1.2869442181883943, "learning_rate": 3.315371229511754e-08, "loss": 0.0943, "step": 13990 }, { "epoch": 4.824340630925702, "grad_norm": 1.3513580116611834, "learning_rate": 3.2515005361380415e-08, "loss": 0.0961, "step": 13995 }, { "epoch": 4.8260644716428205, "grad_norm": 1.4141085437286893, "learning_rate": 3.1882490696631406e-08, "loss": 0.1028, "step": 14000 }, { "epoch": 4.827788312359938, "grad_norm": 1.2240986527661692, "learning_rate": 3.125616908933815e-08, "loss": 0.0889, "step": 14005 }, { "epoch": 4.829512153077056, "grad_norm": 1.3401514969185462, "learning_rate": 3.063604132024889e-08, "loss": 0.0958, "step": 14010 }, { "epoch": 4.831235993794174, "grad_norm": 1.236269890014706, "learning_rate": 3.0022108162389706e-08, "loss": 0.0974, "step": 14015 }, { "epoch": 4.832959834511291, "grad_norm": 1.3310133105230573, "learning_rate": 2.9414370381065095e-08, "loss": 0.0998, "step": 14020 }, { "epoch": 4.834683675228409, "grad_norm": 1.4760525563365918, "learning_rate": 2.8812828733856825e-08, "loss": 0.0988, "step": 14025 }, { "epoch": 4.836407515945527, "grad_norm": 1.3013321767236874, "learning_rate": 2.8217483970623404e-08, "loss": 0.0972, "step": 14030 }, { "epoch": 4.838131356662644, "grad_norm": 1.4514264775266061, "learning_rate": 2.762833683349786e-08, "loss": 0.0879, "step": 14035 }, { "epoch": 4.839855197379762, "grad_norm": 1.3986986620265374, "learning_rate": 2.7045388056886613e-08, "loss": 0.0973, "step": 14040 }, { "epoch": 4.84157903809688, "grad_norm": 1.3973586120737507, "learning_rate": 2.6468638367471156e-08, "loss": 0.1161, "step": 14045 }, { "epoch": 4.843302878813997, "grad_norm": 1.4211471904477977, "learning_rate": 2.5898088484204164e-08, "loss": 0.1161, "step": 14050 }, { "epoch": 4.845026719531115, "grad_norm": 1.1241958618848094, "learning_rate": 2.5333739118310607e-08, "loss": 0.0933, "step": 14055 }, { "epoch": 4.846750560248233, "grad_norm": 1.093403189910431, "learning_rate": 2.4775590973286634e-08, "loss": 0.0972, "step": 14060 }, { "epoch": 4.848474400965351, "grad_norm": 1.3846975251008296, "learning_rate": 2.4223644744896247e-08, "loss": 0.0936, "step": 14065 }, { "epoch": 4.8501982416824685, "grad_norm": 1.1625662697774155, "learning_rate": 2.3677901121174628e-08, "loss": 0.1267, "step": 14070 }, { "epoch": 4.851922082399586, "grad_norm": 1.6069938340444492, "learning_rate": 2.3138360782423707e-08, "loss": 0.1051, "step": 14075 }, { "epoch": 4.853645923116704, "grad_norm": 1.3886022004222287, "learning_rate": 2.2605024401212704e-08, "loss": 0.0958, "step": 14080 }, { "epoch": 4.855369763833822, "grad_norm": 1.256109373027148, "learning_rate": 2.207789264237814e-08, "loss": 0.0951, "step": 14085 }, { "epoch": 4.85709360455094, "grad_norm": 1.4532026966920928, "learning_rate": 2.1556966163021054e-08, "loss": 0.0957, "step": 14090 }, { "epoch": 4.858817445268057, "grad_norm": 1.3896721377892596, "learning_rate": 2.1042245612507563e-08, "loss": 0.1107, "step": 14095 }, { "epoch": 4.860541285985175, "grad_norm": 1.161239952998629, "learning_rate": 2.0533731632468302e-08, "loss": 0.1014, "step": 14100 }, { "epoch": 4.862265126702293, "grad_norm": 1.4405078393378496, "learning_rate": 2.0031424856795656e-08, "loss": 0.0964, "step": 14105 }, { "epoch": 4.86398896741941, "grad_norm": 1.0542170598870872, "learning_rate": 1.9535325911645974e-08, "loss": 0.1002, "step": 14110 }, { "epoch": 4.865712808136529, "grad_norm": 1.211915362473615, "learning_rate": 1.9045435415436798e-08, "loss": 0.0883, "step": 14115 }, { "epoch": 4.867436648853646, "grad_norm": 1.4877414514791176, "learning_rate": 1.856175397884519e-08, "loss": 0.1021, "step": 14120 }, { "epoch": 4.869160489570763, "grad_norm": 1.257286910642509, "learning_rate": 1.80842822048094e-08, "loss": 0.1031, "step": 14125 }, { "epoch": 4.870884330287882, "grad_norm": 1.3420926910135274, "learning_rate": 1.7613020688527215e-08, "loss": 0.1156, "step": 14130 }, { "epoch": 4.872608171004999, "grad_norm": 1.3855615335288138, "learning_rate": 1.7147970017454275e-08, "loss": 0.1131, "step": 14135 }, { "epoch": 4.8743320117221165, "grad_norm": 1.3496689615543849, "learning_rate": 1.6689130771304076e-08, "loss": 0.0972, "step": 14140 }, { "epoch": 4.876055852439235, "grad_norm": 1.2361116160781513, "learning_rate": 1.6236503522046865e-08, "loss": 0.0925, "step": 14145 }, { "epoch": 4.877779693156352, "grad_norm": 1.2949134418825732, "learning_rate": 1.5790088833910755e-08, "loss": 0.0882, "step": 14150 }, { "epoch": 4.8795035338734705, "grad_norm": 1.4423232706755098, "learning_rate": 1.5349887263377826e-08, "loss": 0.096, "step": 14155 }, { "epoch": 4.881227374590588, "grad_norm": 1.5242583230047713, "learning_rate": 1.491589935918636e-08, "loss": 0.1077, "step": 14160 }, { "epoch": 4.882951215307705, "grad_norm": 1.2823536392269483, "learning_rate": 1.448812566232749e-08, "loss": 0.0918, "step": 14165 }, { "epoch": 4.8846750560248235, "grad_norm": 1.3272510700353644, "learning_rate": 1.4066566706048001e-08, "loss": 0.115, "step": 14170 }, { "epoch": 4.886398896741941, "grad_norm": 1.566051023444431, "learning_rate": 1.3651223015845871e-08, "loss": 0.0994, "step": 14175 }, { "epoch": 4.888122737459058, "grad_norm": 1.3707859433477998, "learning_rate": 1.3242095109471942e-08, "loss": 0.1098, "step": 14180 }, { "epoch": 4.889846578176177, "grad_norm": 1.4204996960180363, "learning_rate": 1.2839183496928808e-08, "loss": 0.1047, "step": 14185 }, { "epoch": 4.891570418893294, "grad_norm": 1.0365809080242143, "learning_rate": 1.2442488680470266e-08, "loss": 0.0917, "step": 14190 }, { "epoch": 4.893294259610412, "grad_norm": 1.1823149883840962, "learning_rate": 1.2052011154600197e-08, "loss": 0.0911, "step": 14195 }, { "epoch": 4.89501810032753, "grad_norm": 1.424281172971629, "learning_rate": 1.1667751406072569e-08, "loss": 0.0897, "step": 14200 }, { "epoch": 4.896741941044647, "grad_norm": 1.3321713768345178, "learning_rate": 1.128970991388978e-08, "loss": 0.0951, "step": 14205 }, { "epoch": 4.898465781761765, "grad_norm": 1.539849141369638, "learning_rate": 1.0917887149303196e-08, "loss": 0.0972, "step": 14210 }, { "epoch": 4.900189622478883, "grad_norm": 1.3932031260084274, "learning_rate": 1.0552283575813171e-08, "loss": 0.0952, "step": 14215 }, { "epoch": 4.901913463196001, "grad_norm": 1.453166419944204, "learning_rate": 1.01928996491657e-08, "loss": 0.1088, "step": 14220 }, { "epoch": 4.9036373039131185, "grad_norm": 1.334424660734862, "learning_rate": 9.8397358173552e-09, "loss": 0.0879, "step": 14225 }, { "epoch": 4.905361144630236, "grad_norm": 1.322735552987269, "learning_rate": 9.492792520620631e-09, "loss": 0.0917, "step": 14230 }, { "epoch": 4.907084985347354, "grad_norm": 1.4973921778425843, "learning_rate": 9.152070191448814e-09, "loss": 0.0957, "step": 14235 }, { "epoch": 4.9088088260644716, "grad_norm": 1.431169667547247, "learning_rate": 8.817569254569447e-09, "loss": 0.0979, "step": 14240 }, { "epoch": 4.91053266678159, "grad_norm": 1.2710799194841491, "learning_rate": 8.489290126959537e-09, "loss": 0.0961, "step": 14245 }, { "epoch": 4.912256507498707, "grad_norm": 1.3185640551155087, "learning_rate": 8.1672332178373e-09, "loss": 0.0795, "step": 14250 }, { "epoch": 4.913980348215825, "grad_norm": 1.6115725732360635, "learning_rate": 7.851398928667154e-09, "loss": 0.105, "step": 14255 }, { "epoch": 4.915704188932943, "grad_norm": 1.5133238027410296, "learning_rate": 7.54178765315472e-09, "loss": 0.0924, "step": 14260 }, { "epoch": 4.91742802965006, "grad_norm": 1.1314040169361865, "learning_rate": 7.238399777249605e-09, "loss": 0.0992, "step": 14265 }, { "epoch": 4.919151870367178, "grad_norm": 1.250245971151011, "learning_rate": 6.941235679143177e-09, "loss": 0.0979, "step": 14270 }, { "epoch": 4.920875711084296, "grad_norm": 1.5807535933663104, "learning_rate": 6.650295729268008e-09, "loss": 0.0957, "step": 14275 }, { "epoch": 4.922599551801413, "grad_norm": 1.2646164150834145, "learning_rate": 6.3655802902984345e-09, "loss": 0.0979, "step": 14280 }, { "epoch": 4.924323392518531, "grad_norm": 1.1472513451831219, "learning_rate": 6.087089717148887e-09, "loss": 0.0745, "step": 14285 }, { "epoch": 4.926047233235649, "grad_norm": 1.3787643736402495, "learning_rate": 5.814824356975557e-09, "loss": 0.092, "step": 14290 }, { "epoch": 4.9277710739527665, "grad_norm": 1.2047071648300773, "learning_rate": 5.54878454917307e-09, "loss": 0.093, "step": 14295 }, { "epoch": 4.929494914669885, "grad_norm": 1.48549431951532, "learning_rate": 5.288970625376144e-09, "loss": 0.1051, "step": 14300 }, { "epoch": 4.931218755387002, "grad_norm": 1.3761400542258648, "learning_rate": 5.035382909457931e-09, "loss": 0.1197, "step": 14305 }, { "epoch": 4.93294259610412, "grad_norm": 1.2031446727362487, "learning_rate": 4.788021717531677e-09, "loss": 0.1019, "step": 14310 }, { "epoch": 4.934666436821238, "grad_norm": 1.0569127634349142, "learning_rate": 4.546887357947394e-09, "loss": 0.0869, "step": 14315 }, { "epoch": 4.936390277538355, "grad_norm": 1.3663366213890815, "learning_rate": 4.31198013129408e-09, "loss": 0.0917, "step": 14320 }, { "epoch": 4.9381141182554735, "grad_norm": 1.4437475336622627, "learning_rate": 4.083300330396944e-09, "loss": 0.097, "step": 14325 }, { "epoch": 4.939837958972591, "grad_norm": 1.384616177252969, "learning_rate": 3.8608482403196255e-09, "loss": 0.0977, "step": 14330 }, { "epoch": 4.941561799689708, "grad_norm": 1.4573513735461567, "learning_rate": 3.644624138362529e-09, "loss": 0.0998, "step": 14335 }, { "epoch": 4.943285640406827, "grad_norm": 1.4257908131537387, "learning_rate": 3.4346282940611596e-09, "loss": 0.0843, "step": 14340 }, { "epoch": 4.945009481123944, "grad_norm": 1.4472104168885904, "learning_rate": 3.2308609691877878e-09, "loss": 0.112, "step": 14345 }, { "epoch": 4.946733321841062, "grad_norm": 1.3596620459225746, "learning_rate": 3.033322417752005e-09, "loss": 0.0944, "step": 14350 }, { "epoch": 4.94845716255818, "grad_norm": 1.4080719556223817, "learning_rate": 2.8420128859962813e-09, "loss": 0.1058, "step": 14355 }, { "epoch": 4.950181003275297, "grad_norm": 1.929399587520485, "learning_rate": 2.656932612399854e-09, "loss": 0.1022, "step": 14360 }, { "epoch": 4.951904843992415, "grad_norm": 1.1843384274221604, "learning_rate": 2.478081827676504e-09, "loss": 0.1107, "step": 14365 }, { "epoch": 4.953628684709533, "grad_norm": 1.2150546344422188, "learning_rate": 2.305460754774003e-09, "loss": 0.0929, "step": 14370 }, { "epoch": 4.955352525426651, "grad_norm": 1.2852605711580363, "learning_rate": 2.1390696088757766e-09, "loss": 0.1072, "step": 14375 }, { "epoch": 4.9570763661437685, "grad_norm": 1.5649879790510222, "learning_rate": 1.9789085973975774e-09, "loss": 0.1147, "step": 14380 }, { "epoch": 4.958800206860886, "grad_norm": 1.4806792798022161, "learning_rate": 1.824977919990256e-09, "loss": 0.119, "step": 14385 }, { "epoch": 4.960524047578004, "grad_norm": 1.1713177695747807, "learning_rate": 1.677277768537544e-09, "loss": 0.088, "step": 14390 }, { "epoch": 4.9622478882951215, "grad_norm": 1.3199356091483336, "learning_rate": 1.535808327156052e-09, "loss": 0.0917, "step": 14395 }, { "epoch": 4.963971729012239, "grad_norm": 1.240851690916721, "learning_rate": 1.4005697721969357e-09, "loss": 0.1025, "step": 14400 }, { "epoch": 4.965695569729357, "grad_norm": 1.502781587155829, "learning_rate": 1.2715622722425657e-09, "loss": 0.1032, "step": 14405 }, { "epoch": 4.967419410446475, "grad_norm": 1.1159646005854584, "learning_rate": 1.1487859881087471e-09, "loss": 0.094, "step": 14410 }, { "epoch": 4.969143251163592, "grad_norm": 1.2331025159144084, "learning_rate": 1.0322410728436095e-09, "loss": 0.0909, "step": 14415 }, { "epoch": 4.97086709188071, "grad_norm": 1.287020195683251, "learning_rate": 9.21927671727052e-10, "loss": 0.1006, "step": 14420 }, { "epoch": 4.972590932597828, "grad_norm": 1.452627575615611, "learning_rate": 8.178459222712986e-10, "loss": 0.1066, "step": 14425 }, { "epoch": 4.974314773314946, "grad_norm": 1.81159188089844, "learning_rate": 7.199959542208979e-10, "loss": 0.0979, "step": 14430 }, { "epoch": 4.976038614032063, "grad_norm": 1.1596458413062876, "learning_rate": 6.283778895516123e-10, "loss": 0.0944, "step": 14435 }, { "epoch": 4.977762454749181, "grad_norm": 2.465260462403574, "learning_rate": 5.429918424709745e-10, "loss": 0.0965, "step": 14440 }, { "epoch": 4.979486295466299, "grad_norm": 1.3471705488906647, "learning_rate": 4.6383791941773114e-10, "loss": 0.103, "step": 14445 }, { "epoch": 4.9812101361834165, "grad_norm": 1.3025984657839627, "learning_rate": 3.909162190618432e-10, "loss": 0.1021, "step": 14450 }, { "epoch": 4.982933976900535, "grad_norm": 1.2119348971608377, "learning_rate": 3.2422683230448617e-10, "loss": 0.087, "step": 14455 }, { "epoch": 4.984657817617652, "grad_norm": 1.245040472055958, "learning_rate": 2.6376984227860504e-10, "loss": 0.0952, "step": 14460 }, { "epoch": 4.9863816583347695, "grad_norm": 1.3841399141396942, "learning_rate": 2.0954532434669384e-10, "loss": 0.0771, "step": 14465 }, { "epoch": 4.988105499051888, "grad_norm": 1.8376152995847748, "learning_rate": 1.6155334610357121e-10, "loss": 0.0832, "step": 14470 }, { "epoch": 4.989829339769005, "grad_norm": 1.352306386898328, "learning_rate": 1.1979396737415993e-10, "loss": 0.1128, "step": 14475 }, { "epoch": 4.9915531804861235, "grad_norm": 1.7225670797640587, "learning_rate": 8.426724021348697e-11, "loss": 0.1111, "step": 14480 }, { "epoch": 4.993277021203241, "grad_norm": 1.3740699473559133, "learning_rate": 5.4973208907793676e-11, "loss": 0.0885, "step": 14485 }, { "epoch": 4.995000861920358, "grad_norm": 1.1852501848701082, "learning_rate": 3.191190997398064e-11, "loss": 0.0863, "step": 14490 }, { "epoch": 4.996724702637477, "grad_norm": 1.4127853484068187, "learning_rate": 1.5083372159607756e-11, "loss": 0.0862, "step": 14495 }, { "epoch": 4.998448543354594, "grad_norm": 1.4421464293245208, "learning_rate": 4.487616442339082e-12, "loss": 0.102, "step": 14500 }, { "epoch": 5.0, "grad_norm": 2.4497902949004597, "learning_rate": 1.2465603049793828e-13, "loss": 0.1157, "step": 14505 }, { "epoch": 5.0, "step": 14505, "total_flos": 2.7534079795134464e+16, "train_loss": 0.2264123897661796, "train_runtime": 100851.2158, "train_samples_per_second": 73.624, "train_steps_per_second": 0.144 } ], "logging_steps": 5, "max_steps": 14505, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7534079795134464e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }