diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,7 +11,7 @@ "log_history": [ { "epoch": 0.007462686567164179, - "grad_norm": 11.359202571623696, + "grad_norm": 11.35859680736035, "learning_rate": 0.0, "loss": 1.047095537185669, "num_tokens": 940173.0, @@ -19,4953 +19,4953 @@ }, { "epoch": 0.014925373134328358, - "grad_norm": 11.310353486645111, - "learning_rate": 5.263157894736842e-06, + "grad_norm": 11.310520487616877, + "learning_rate": 5.263157894736843e-07, "loss": 1.0946075916290283, "num_tokens": 1940908.0, "step": 2 }, { "epoch": 0.022388059701492536, - "grad_norm": 9.96037451672879, - "learning_rate": 1.0526315789473684e-05, - "loss": 0.9771832823753357, + "grad_norm": 11.106569322922516, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.0278105735778809, "num_tokens": 2857302.0, "step": 3 }, { "epoch": 0.029850746268656716, - "grad_norm": 3.7090095611764284, - "learning_rate": 1.5789473684210526e-05, - "loss": 0.7594946622848511, + "grad_norm": 10.881054443812134, + "learning_rate": 1.5789473684210526e-06, + "loss": 1.0398736000061035, "num_tokens": 3696299.0, "step": 4 }, { "epoch": 0.03731343283582089, - "grad_norm": 2.241346035452221, - "learning_rate": 2.105263157894737e-05, - "loss": 0.7155371904373169, + "grad_norm": 10.448295115598174, + "learning_rate": 2.105263157894737e-06, + "loss": 1.0615425109863281, "num_tokens": 4528104.0, "step": 5 }, { "epoch": 0.04477611940298507, - "grad_norm": 2.3841061008258717, - "learning_rate": 2.6315789473684212e-05, - "loss": 0.7044811248779297, + "grad_norm": 10.151241780828355, + "learning_rate": 2.631578947368421e-06, + "loss": 1.0268486738204956, "num_tokens": 5554518.0, "step": 6 }, { "epoch": 0.05223880597014925, - "grad_norm": 1.5294256152242782, - "learning_rate": 3.157894736842105e-05, - "loss": 0.6691706776618958, + "grad_norm": 8.119312484055971, + "learning_rate": 3.157894736842105e-06, + "loss": 0.9329569935798645, "num_tokens": 6422948.0, "step": 7 }, { "epoch": 0.05970149253731343, - "grad_norm": 1.0399800122804008, - "learning_rate": 3.6842105263157895e-05, - "loss": 0.6105144023895264, + "grad_norm": 7.409758964343402, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.8917287588119507, "num_tokens": 7201431.0, "step": 8 }, { "epoch": 0.06716417910447761, - "grad_norm": 0.8146568675374244, - "learning_rate": 4.210526315789474e-05, - "loss": 0.5531951189041138, + "grad_norm": 5.971479536888058, + "learning_rate": 4.210526315789474e-06, + "loss": 0.8006043434143066, "num_tokens": 8128474.0, "step": 9 }, { "epoch": 0.07462686567164178, - "grad_norm": 0.8460724626410775, - "learning_rate": 4.736842105263158e-05, - "loss": 0.6119080781936646, + "grad_norm": 3.4445244902185927, + "learning_rate": 4.736842105263158e-06, + "loss": 0.7708431482315063, "num_tokens": 9073762.0, "step": 10 }, { "epoch": 0.08208955223880597, - "grad_norm": 0.7430740893147656, - "learning_rate": 5.2631578947368424e-05, - "loss": 0.5387494564056396, + "grad_norm": 2.227913040407572, + "learning_rate": 5.263157894736842e-06, + "loss": 0.689713716506958, "num_tokens": 9950348.0, "step": 11 }, { "epoch": 0.08955223880597014, - "grad_norm": 0.9849352813180456, - "learning_rate": 5.789473684210527e-05, - "loss": 0.5732500553131104, + "grad_norm": 1.8665254369252244, + "learning_rate": 5.789473684210527e-06, + "loss": 0.7132350206375122, "num_tokens": 10884740.0, "step": 12 }, { "epoch": 0.09701492537313433, - "grad_norm": 0.9347811348927701, - "learning_rate": 6.31578947368421e-05, - "loss": 0.5503213405609131, + "grad_norm": 2.952404437976229, + "learning_rate": 6.31578947368421e-06, + "loss": 0.713362455368042, "num_tokens": 11697616.0, "step": 13 }, { "epoch": 0.1044776119402985, - "grad_norm": 0.679297078137132, - "learning_rate": 6.842105263157895e-05, - "loss": 0.5338681936264038, + "grad_norm": 2.826605099421276, + "learning_rate": 6.842105263157896e-06, + "loss": 0.6958507895469666, "num_tokens": 12632232.0, "step": 14 }, { "epoch": 0.11194029850746269, - "grad_norm": 0.9985364588497343, - "learning_rate": 7.368421052631579e-05, - "loss": 0.5242782831192017, + "grad_norm": 2.4454572403082926, + "learning_rate": 7.368421052631579e-06, + "loss": 0.6733378171920776, "num_tokens": 13568493.0, "step": 15 }, { "epoch": 0.11940298507462686, - "grad_norm": 0.7301827286143185, - "learning_rate": 7.894736842105263e-05, - "loss": 0.5389354228973389, + "grad_norm": 2.0537063830263924, + "learning_rate": 7.894736842105265e-06, + "loss": 0.6741904020309448, "num_tokens": 14533820.0, "step": 16 }, { "epoch": 0.12686567164179105, - "grad_norm": 0.7647522073170916, - "learning_rate": 8.421052631578948e-05, - "loss": 0.5333565473556519, + "grad_norm": 1.4727507656008452, + "learning_rate": 8.421052631578948e-06, + "loss": 0.6536232829093933, "num_tokens": 15435498.0, "step": 17 }, { "epoch": 0.13432835820895522, - "grad_norm": 0.8376018946079997, - "learning_rate": 8.947368421052632e-05, - "loss": 0.5089090466499329, + "grad_norm": 1.054376608380898, + "learning_rate": 8.947368421052632e-06, + "loss": 0.6000441312789917, "num_tokens": 16351791.0, "step": 18 }, { "epoch": 0.1417910447761194, - "grad_norm": 0.798595794479579, - "learning_rate": 9.473684210526316e-05, - "loss": 0.5181593894958496, + "grad_norm": 0.9835940111044099, + "learning_rate": 9.473684210526315e-06, + "loss": 0.6027337312698364, "num_tokens": 17276920.0, "step": 19 }, { "epoch": 0.14925373134328357, - "grad_norm": 0.9992963165251837, - "learning_rate": 0.0001, - "loss": 0.5558217167854309, + "grad_norm": 0.916308840098788, + "learning_rate": 1e-05, + "loss": 0.6199864149093628, "num_tokens": 18270172.0, "step": 20 }, { "epoch": 0.15671641791044777, - "grad_norm": 0.7635641019383523, - "learning_rate": 9.999938520216342e-05, - "loss": 0.5210483074188232, + "grad_norm": 0.6212633844448718, + "learning_rate": 9.999938520216343e-06, + "loss": 0.5760895609855652, "num_tokens": 19308005.0, "step": 21 }, { "epoch": 0.16417910447761194, - "grad_norm": 0.675810956701417, - "learning_rate": 9.99975408254526e-05, - "loss": 0.49068427085876465, + "grad_norm": 0.5315615385439493, + "learning_rate": 9.999754082545261e-06, + "loss": 0.5423388481140137, "num_tokens": 20162217.0, "step": 22 }, { "epoch": 0.17164179104477612, - "grad_norm": 0.6617026572622022, - "learning_rate": 9.999446692026396e-05, - "loss": 0.5179756283760071, + "grad_norm": 0.5852277738108399, + "learning_rate": 9.999446692026396e-06, + "loss": 0.5618520975112915, "num_tokens": 20980497.0, "step": 23 }, { "epoch": 0.1791044776119403, - "grad_norm": 0.6394176163707052, - "learning_rate": 9.999016357058996e-05, - "loss": 0.49992918968200684, + "grad_norm": 0.5256536336611786, + "learning_rate": 9.999016357058996e-06, + "loss": 0.5482994914054871, "num_tokens": 21857362.0, "step": 24 }, { "epoch": 0.1865671641791045, - "grad_norm": 0.6499208699738325, - "learning_rate": 9.998463089401678e-05, - "loss": 0.46909385919570923, + "grad_norm": 0.436253543862231, + "learning_rate": 9.99846308940168e-06, + "loss": 0.5038638710975647, "num_tokens": 22792620.0, "step": 25 }, { "epoch": 0.19402985074626866, - "grad_norm": 0.634359434455067, - "learning_rate": 9.997786904172126e-05, - "loss": 0.5462605357170105, + "grad_norm": 0.47872306271108794, + "learning_rate": 9.997786904172126e-06, + "loss": 0.5729074478149414, "num_tokens": 23723110.0, "step": 26 }, { "epoch": 0.20149253731343283, - "grad_norm": 0.6619051844869543, - "learning_rate": 9.996987819846656e-05, - "loss": 0.4991229772567749, + "grad_norm": 0.3887165593913177, + "learning_rate": 9.996987819846656e-06, + "loss": 0.5251473188400269, "num_tokens": 24725024.0, "step": 27 }, { "epoch": 0.208955223880597, - "grad_norm": 0.64944190384501, - "learning_rate": 9.996065858259728e-05, - "loss": 0.5392301678657532, + "grad_norm": 0.4864210479565411, + "learning_rate": 9.996065858259729e-06, + "loss": 0.560759425163269, "num_tokens": 25729987.0, "step": 28 }, { "epoch": 0.21641791044776118, - "grad_norm": 0.5771934238929258, - "learning_rate": 9.995021044603343e-05, - "loss": 0.504263162612915, + "grad_norm": 0.4545327828204722, + "learning_rate": 9.995021044603343e-06, + "loss": 0.5304505825042725, "num_tokens": 26557013.0, "step": 29 }, { "epoch": 0.22388059701492538, - "grad_norm": 0.5765419316904252, - "learning_rate": 9.993853407426354e-05, - "loss": 0.4927641749382019, + "grad_norm": 0.369912070212526, + "learning_rate": 9.993853407426353e-06, + "loss": 0.5103640556335449, "num_tokens": 27503464.0, "step": 30 }, { "epoch": 0.23134328358208955, - "grad_norm": 0.5678117542126839, - "learning_rate": 9.99256297863368e-05, - "loss": 0.48838478326797485, + "grad_norm": 0.32843421942348455, + "learning_rate": 9.99256297863368e-06, + "loss": 0.5005761384963989, "num_tokens": 28533732.0, "step": 31 }, { "epoch": 0.23880597014925373, - "grad_norm": 0.6532948801301113, - "learning_rate": 9.991149793485452e-05, - "loss": 0.5210269689559937, + "grad_norm": 0.36571377121484666, + "learning_rate": 9.991149793485453e-06, + "loss": 0.5339782238006592, "num_tokens": 29340667.0, "step": 32 }, { "epoch": 0.2462686567164179, - "grad_norm": 0.6482024497542078, - "learning_rate": 9.989613890596034e-05, - "loss": 0.523944079875946, + "grad_norm": 0.3706600251055638, + "learning_rate": 9.989613890596034e-06, + "loss": 0.5353128910064697, "num_tokens": 30210961.0, "step": 33 }, { "epoch": 0.2537313432835821, - "grad_norm": 0.6002473795850972, - "learning_rate": 9.987955311932969e-05, - "loss": 0.5042929649353027, + "grad_norm": 0.3689913973205178, + "learning_rate": 9.987955311932968e-06, + "loss": 0.5166599750518799, "num_tokens": 31101886.0, "step": 34 }, { "epoch": 0.26119402985074625, - "grad_norm": 0.537046524840533, - "learning_rate": 9.986174102815838e-05, - "loss": 0.48542821407318115, + "grad_norm": 0.33967789101967927, + "learning_rate": 9.986174102815837e-06, + "loss": 0.5018597841262817, "num_tokens": 31897310.0, "step": 35 }, { "epoch": 0.26865671641791045, - "grad_norm": 0.5554507372568854, - "learning_rate": 9.984270311915018e-05, - "loss": 0.46907031536102295, + "grad_norm": 0.34077171626781105, + "learning_rate": 9.984270311915019e-06, + "loss": 0.48667871952056885, "num_tokens": 32540943.0, "step": 36 }, { "epoch": 0.27611940298507465, - "grad_norm": 0.5922322616128651, - "learning_rate": 9.982243991250358e-05, - "loss": 0.5076830387115479, + "grad_norm": 0.3621091474207233, + "learning_rate": 9.982243991250359e-06, + "loss": 0.5088210105895996, "num_tokens": 33542067.0, "step": 37 }, { "epoch": 0.2835820895522388, - "grad_norm": 0.6618026653836381, - "learning_rate": 9.980095196189748e-05, - "loss": 0.4880174994468689, + "grad_norm": 0.3534080682731624, + "learning_rate": 9.980095196189748e-06, + "loss": 0.4913540482521057, "num_tokens": 34504224.0, "step": 38 }, { "epoch": 0.291044776119403, - "grad_norm": 0.4565970790354147, - "learning_rate": 9.977823985447613e-05, - "loss": 0.5264553427696228, + "grad_norm": 0.34385148887540573, + "learning_rate": 9.977823985447613e-06, + "loss": 0.5291423797607422, "num_tokens": 35410799.0, "step": 39 }, { "epoch": 0.29850746268656714, - "grad_norm": 0.634877120490829, - "learning_rate": 9.975430421083306e-05, - "loss": 0.5251759886741638, + "grad_norm": 0.3614616882970318, + "learning_rate": 9.975430421083307e-06, + "loss": 0.5238292217254639, "num_tokens": 36306291.0, "step": 40 }, { "epoch": 0.30597014925373134, - "grad_norm": 0.5072313550134231, - "learning_rate": 9.972914568499413e-05, - "loss": 0.4956824481487274, + "grad_norm": 0.34380854428467267, + "learning_rate": 9.972914568499412e-06, + "loss": 0.49555328488349915, "num_tokens": 37195796.0, "step": 41 }, { "epoch": 0.31343283582089554, - "grad_norm": 0.9181703800038942, - "learning_rate": 9.970276496439967e-05, - "loss": 0.4824793338775635, + "grad_norm": 0.32872739996760125, + "learning_rate": 9.970276496439967e-06, + "loss": 0.48128455877304077, "num_tokens": 38111088.0, "step": 42 }, { "epoch": 0.3208955223880597, - "grad_norm": 0.5630115218790266, - "learning_rate": 9.967516276988567e-05, - "loss": 0.4805726110935211, + "grad_norm": 0.32224419409640415, + "learning_rate": 9.967516276988569e-06, + "loss": 0.47381213307380676, "num_tokens": 38854783.0, "step": 43 }, { "epoch": 0.3283582089552239, - "grad_norm": 0.5615583807856466, - "learning_rate": 9.964633985566412e-05, - "loss": 0.5002149343490601, + "grad_norm": 0.313605152437139, + "learning_rate": 9.964633985566412e-06, + "loss": 0.4922352433204651, "num_tokens": 39832057.0, "step": 44 }, { "epoch": 0.3358208955223881, - "grad_norm": 0.5537107517169808, - "learning_rate": 9.961629700930236e-05, - "loss": 0.5166409015655518, + "grad_norm": 0.3221801938329887, + "learning_rate": 9.961629700930236e-06, + "loss": 0.5065716505050659, "num_tokens": 40758959.0, "step": 45 }, { "epoch": 0.34328358208955223, - "grad_norm": 0.5158367649381085, - "learning_rate": 9.958503505170158e-05, - "loss": 0.5032739639282227, + "grad_norm": 0.34336243037288433, + "learning_rate": 9.958503505170158e-06, + "loss": 0.4985169470310211, "num_tokens": 41744543.0, "step": 46 }, { "epoch": 0.35074626865671643, - "grad_norm": 0.527914417038312, - "learning_rate": 9.95525548370744e-05, - "loss": 0.48891395330429077, + "grad_norm": 0.323405267106758, + "learning_rate": 9.95525548370744e-06, + "loss": 0.4811803996562958, "num_tokens": 42685398.0, "step": 47 }, { "epoch": 0.3582089552238806, - "grad_norm": 0.6087097910910242, - "learning_rate": 9.951885725292151e-05, - "loss": 0.5083061456680298, + "grad_norm": 0.3472754733495145, + "learning_rate": 9.951885725292152e-06, + "loss": 0.4971832036972046, "num_tokens": 43509328.0, "step": 48 }, { "epoch": 0.3656716417910448, - "grad_norm": 0.500754939767484, - "learning_rate": 9.948394322000745e-05, - "loss": 0.4750032424926758, + "grad_norm": 0.30314939517994505, + "learning_rate": 9.948394322000747e-06, + "loss": 0.4676430821418762, "num_tokens": 44360961.0, "step": 49 }, { "epoch": 0.373134328358209, - "grad_norm": 0.5083993878307389, - "learning_rate": 9.944781369233543e-05, - "loss": 0.4557783901691437, + "grad_norm": 0.3115400700181878, + "learning_rate": 9.944781369233544e-06, + "loss": 0.4450893700122833, "num_tokens": 45215408.0, "step": 50 }, { "epoch": 0.3805970149253731, - "grad_norm": 0.5282545010029183, - "learning_rate": 9.941046965712123e-05, - "loss": 0.4741901457309723, + "grad_norm": 0.3274967224701377, + "learning_rate": 9.941046965712124e-06, + "loss": 0.4661027491092682, "num_tokens": 46008801.0, "step": 51 }, { "epoch": 0.3880597014925373, - "grad_norm": 0.5225616899404959, - "learning_rate": 9.937191213476627e-05, - "loss": 0.47363656759262085, + "grad_norm": 0.3185260501598265, + "learning_rate": 9.937191213476627e-06, + "loss": 0.45998284220695496, "num_tokens": 46857304.0, "step": 52 }, { "epoch": 0.39552238805970147, - "grad_norm": 0.48713605294963314, - "learning_rate": 9.933214217882972e-05, - "loss": 0.5170989036560059, + "grad_norm": 0.3187630499897143, + "learning_rate": 9.933214217882973e-06, + "loss": 0.49932676553726196, "num_tokens": 47835515.0, "step": 53 }, { "epoch": 0.40298507462686567, - "grad_norm": 0.47876130351667573, - "learning_rate": 9.929116087599972e-05, - "loss": 0.5105682015419006, + "grad_norm": 0.3126440220395918, + "learning_rate": 9.929116087599973e-06, + "loss": 0.49588972330093384, "num_tokens": 48834826.0, "step": 54 }, { "epoch": 0.41044776119402987, - "grad_norm": 0.5421102018520207, - "learning_rate": 9.924896934606365e-05, - "loss": 0.5093623399734497, + "grad_norm": 0.31909099806625735, + "learning_rate": 9.924896934606365e-06, + "loss": 0.49547284841537476, "num_tokens": 49858718.0, "step": 55 }, { "epoch": 0.417910447761194, - "grad_norm": 0.48625086632837894, - "learning_rate": 9.920556874187758e-05, - "loss": 0.47073519229888916, + "grad_norm": 0.2999327415505548, + "learning_rate": 9.920556874187757e-06, + "loss": 0.45831602811813354, "num_tokens": 50784650.0, "step": 56 }, { "epoch": 0.4253731343283582, - "grad_norm": 0.40903675062605, - "learning_rate": 9.916096024933471e-05, - "loss": 0.4558908939361572, + "grad_norm": 0.33478138187870804, + "learning_rate": 9.91609602493347e-06, + "loss": 0.44470953941345215, "num_tokens": 51788903.0, "step": 57 }, { "epoch": 0.43283582089552236, - "grad_norm": 0.4177728108060706, - "learning_rate": 9.911514508733307e-05, - "loss": 0.49773937463760376, + "grad_norm": 0.3098385124963181, + "learning_rate": 9.911514508733307e-06, + "loss": 0.48413345217704773, "num_tokens": 52740886.0, "step": 58 }, { "epoch": 0.44029850746268656, - "grad_norm": 0.44550558853659633, - "learning_rate": 9.906812450774207e-05, - "loss": 0.5167676210403442, + "grad_norm": 0.31570000266376347, + "learning_rate": 9.906812450774207e-06, + "loss": 0.5016104578971863, "num_tokens": 53671576.0, "step": 59 }, { "epoch": 0.44776119402985076, - "grad_norm": 1.5553374357622476, - "learning_rate": 9.90198997953684e-05, - "loss": 0.4443127512931824, + "grad_norm": 0.3184241179650494, + "learning_rate": 9.901989979536841e-06, + "loss": 0.4333784580230713, "num_tokens": 54565325.0, "step": 60 }, { "epoch": 0.4552238805970149, - "grad_norm": 0.6825650632419499, - "learning_rate": 9.897047226792092e-05, - "loss": 0.5048718452453613, + "grad_norm": 0.3257766657124954, + "learning_rate": 9.897047226792093e-06, + "loss": 0.47651222348213196, "num_tokens": 55458901.0, "step": 61 }, { "epoch": 0.4626865671641791, - "grad_norm": 0.5208100743654329, - "learning_rate": 9.891984327597461e-05, - "loss": 0.49431973695755005, + "grad_norm": 0.2817242291155619, + "learning_rate": 9.891984327597462e-06, + "loss": 0.4714818000793457, "num_tokens": 56519373.0, "step": 62 }, { "epoch": 0.4701492537313433, - "grad_norm": 0.49508396229205565, - "learning_rate": 9.886801420293365e-05, - "loss": 0.48846566677093506, + "grad_norm": 0.32585513855646564, + "learning_rate": 9.886801420293365e-06, + "loss": 0.4708700180053711, "num_tokens": 57420562.0, "step": 63 }, { "epoch": 0.47761194029850745, - "grad_norm": 0.4270971887578861, - "learning_rate": 9.88149864649937e-05, - "loss": 0.516663134098053, + "grad_norm": 0.32958409535328365, + "learning_rate": 9.88149864649937e-06, + "loss": 0.49606209993362427, "num_tokens": 58259052.0, "step": 64 }, { "epoch": 0.48507462686567165, - "grad_norm": 0.5482435682139535, - "learning_rate": 9.876076151110313e-05, - "loss": 0.5038647055625916, + "grad_norm": 0.31230811419608556, + "learning_rate": 9.876076151110313e-06, + "loss": 0.4840630888938904, "num_tokens": 59121922.0, "step": 65 }, { "epoch": 0.4925373134328358, - "grad_norm": 0.4723435961702705, - "learning_rate": 9.870534082292348e-05, - "loss": 0.47693532705307007, + "grad_norm": 0.31050271225919246, + "learning_rate": 9.870534082292349e-06, + "loss": 0.4600119888782501, "num_tokens": 60031785.0, "step": 66 }, { "epoch": 0.5, - "grad_norm": 0.4457257450195141, - "learning_rate": 9.864872591478894e-05, - "loss": 0.4574812650680542, + "grad_norm": 0.2885380845506061, + "learning_rate": 9.864872591478895e-06, + "loss": 0.44136810302734375, "num_tokens": 60972704.0, "step": 67 }, { "epoch": 0.5074626865671642, - "grad_norm": 0.47913092356031883, - "learning_rate": 9.859091833366497e-05, - "loss": 0.4792841374874115, + "grad_norm": 0.28887203572406756, + "learning_rate": 9.859091833366498e-06, + "loss": 0.4619043469429016, "num_tokens": 61912202.0, "step": 68 }, { "epoch": 0.5149253731343284, - "grad_norm": 0.4651988125797131, - "learning_rate": 9.853191965910605e-05, - "loss": 0.5062246322631836, + "grad_norm": 0.297913211640831, + "learning_rate": 9.853191965910606e-06, + "loss": 0.48681432008743286, "num_tokens": 62799081.0, "step": 69 }, { "epoch": 0.5223880597014925, - "grad_norm": 0.42221817783928267, - "learning_rate": 9.847173150321252e-05, - "loss": 0.4899851083755493, + "grad_norm": 0.2978081791490928, + "learning_rate": 9.847173150321252e-06, + "loss": 0.4710129499435425, "num_tokens": 63821360.0, "step": 70 }, { "epoch": 0.5298507462686567, - "grad_norm": 0.47108246395234693, - "learning_rate": 9.841035551058649e-05, - "loss": 0.4778142273426056, + "grad_norm": 0.33901428896502994, + "learning_rate": 9.84103555105865e-06, + "loss": 0.46070268750190735, "num_tokens": 64698236.0, "step": 71 }, { "epoch": 0.5373134328358209, - "grad_norm": 0.43352416560818235, - "learning_rate": 9.834779335828698e-05, - "loss": 0.4470871388912201, + "grad_norm": 0.2863724536535567, + "learning_rate": 9.8347793358287e-06, + "loss": 0.43551623821258545, "num_tokens": 65531533.0, "step": 72 }, { "epoch": 0.5447761194029851, - "grad_norm": 1.0181667962886616, - "learning_rate": 9.828404675578404e-05, - "loss": 0.4521074891090393, + "grad_norm": 0.30884498358581325, + "learning_rate": 9.828404675578405e-06, + "loss": 0.43174412846565247, "num_tokens": 66409682.0, "step": 73 }, { "epoch": 0.5522388059701493, - "grad_norm": 0.8780069573767034, - "learning_rate": 9.821911744491202e-05, - "loss": 0.499891459941864, + "grad_norm": 0.39653106497260543, + "learning_rate": 9.821911744491203e-06, + "loss": 0.47224926948547363, "num_tokens": 67201739.0, "step": 74 }, { "epoch": 0.5597014925373134, - "grad_norm": 0.4005993467136292, - "learning_rate": 9.815300719982203e-05, - "loss": 0.4863917827606201, + "grad_norm": 0.34427781009373076, + "learning_rate": 9.815300719982204e-06, + "loss": 0.46234217286109924, "num_tokens": 68054610.0, "step": 75 }, { "epoch": 0.5671641791044776, - "grad_norm": 0.43570944130704614, - "learning_rate": 9.808571782693344e-05, - "loss": 0.4663717448711395, + "grad_norm": 0.28593313207513976, + "learning_rate": 9.808571782693345e-06, + "loss": 0.4445508122444153, "num_tokens": 68905436.0, "step": 76 }, { "epoch": 0.5746268656716418, - "grad_norm": 0.42173553329029456, - "learning_rate": 9.80172511648845e-05, - "loss": 0.4742090404033661, + "grad_norm": 0.27754253103287374, + "learning_rate": 9.80172511648845e-06, + "loss": 0.4535985291004181, "num_tokens": 69815159.0, "step": 77 }, { "epoch": 0.582089552238806, - "grad_norm": 0.4142487648951103, - "learning_rate": 9.794760908448214e-05, - "loss": 0.5009994506835938, + "grad_norm": 0.2751626726169941, + "learning_rate": 9.794760908448215e-06, + "loss": 0.4778493642807007, "num_tokens": 70800960.0, "step": 78 }, { "epoch": 0.5895522388059702, - "grad_norm": 0.4275998035702217, - "learning_rate": 9.787679348865081e-05, - "loss": 0.45662373304367065, + "grad_norm": 0.2878195146653705, + "learning_rate": 9.787679348865082e-06, + "loss": 0.43559134006500244, "num_tokens": 71706284.0, "step": 79 }, { "epoch": 0.5970149253731343, - "grad_norm": 0.439320366628121, - "learning_rate": 9.780480631238051e-05, - "loss": 0.4795362949371338, + "grad_norm": 0.3046702186252135, + "learning_rate": 9.780480631238052e-06, + "loss": 0.45745372772216797, "num_tokens": 72585611.0, "step": 80 }, { "epoch": 0.6044776119402985, - "grad_norm": 0.4223763433435324, - "learning_rate": 9.773164952267393e-05, - "loss": 0.46257442235946655, + "grad_norm": 0.2580161347993156, + "learning_rate": 9.773164952267394e-06, + "loss": 0.44172853231430054, "num_tokens": 73603712.0, "step": 81 }, { "epoch": 0.6119402985074627, - "grad_norm": 0.39085499073204694, - "learning_rate": 9.765732511849268e-05, - "loss": 0.47663742303848267, + "grad_norm": 0.31823458045045494, + "learning_rate": 9.765732511849269e-06, + "loss": 0.4543741047382355, "num_tokens": 74510353.0, "step": 82 }, { "epoch": 0.6194029850746269, - "grad_norm": 0.42009411306133043, - "learning_rate": 9.758183513070266e-05, - "loss": 0.5020485520362854, + "grad_norm": 0.3262276808903542, + "learning_rate": 9.758183513070266e-06, + "loss": 0.48102468252182007, "num_tokens": 75426311.0, "step": 83 }, { "epoch": 0.6268656716417911, - "grad_norm": 0.3649622853337193, - "learning_rate": 9.750518162201857e-05, - "loss": 0.47370535135269165, + "grad_norm": 0.298246592306743, + "learning_rate": 9.750518162201858e-06, + "loss": 0.45155635476112366, "num_tokens": 76290512.0, "step": 84 }, { "epoch": 0.6343283582089553, - "grad_norm": 0.43670805360339937, - "learning_rate": 9.74273666869476e-05, - "loss": 0.4599141478538513, + "grad_norm": 0.30840978846450423, + "learning_rate": 9.74273666869476e-06, + "loss": 0.4398882985115051, "num_tokens": 77207410.0, "step": 85 }, { "epoch": 0.6417910447761194, - "grad_norm": 0.4076752849384704, - "learning_rate": 9.734839245173213e-05, - "loss": 0.4592893719673157, + "grad_norm": 0.2986447882814022, + "learning_rate": 9.734839245173213e-06, + "loss": 0.43722379207611084, "num_tokens": 78061170.0, "step": 86 }, { "epoch": 0.6492537313432836, - "grad_norm": 0.38024925768834367, - "learning_rate": 9.726826107429168e-05, - "loss": 0.4705815315246582, + "grad_norm": 0.3213308600234638, + "learning_rate": 9.726826107429168e-06, + "loss": 0.44796180725097656, "num_tokens": 78868118.0, "step": 87 }, { "epoch": 0.6567164179104478, - "grad_norm": 0.4456392309796699, - "learning_rate": 9.71869747441639e-05, - "loss": 0.4715425968170166, + "grad_norm": 0.3249532753373927, + "learning_rate": 9.71869747441639e-06, + "loss": 0.4503297805786133, "num_tokens": 79869363.0, "step": 88 }, { "epoch": 0.664179104477612, - "grad_norm": 0.5781305752627371, - "learning_rate": 9.71045356824448e-05, - "loss": 0.4719395637512207, + "grad_norm": 0.5892356895414527, + "learning_rate": 9.71045356824448e-06, + "loss": 0.4414302110671997, "num_tokens": 80709876.0, "step": 89 }, { "epoch": 0.6716417910447762, - "grad_norm": 0.37949041744244744, - "learning_rate": 9.702094614172801e-05, - "loss": 0.44268590211868286, + "grad_norm": 0.32884534307528746, + "learning_rate": 9.7020946141728e-06, + "loss": 0.42054399847984314, "num_tokens": 81535856.0, "step": 90 }, { "epoch": 0.6791044776119403, - "grad_norm": 0.5396784871128463, - "learning_rate": 9.693620840604324e-05, - "loss": 0.460915744304657, + "grad_norm": 0.2754517512669749, + "learning_rate": 9.693620840604326e-06, + "loss": 0.4349040985107422, "num_tokens": 82583455.0, "step": 91 }, { "epoch": 0.6865671641791045, - "grad_norm": 0.4006598992175905, - "learning_rate": 9.685032479079393e-05, - "loss": 0.46493151783943176, + "grad_norm": 0.3190387165435769, + "learning_rate": 9.685032479079394e-06, + "loss": 0.44351187348365784, "num_tokens": 83425036.0, "step": 92 }, { "epoch": 0.6940298507462687, - "grad_norm": 0.49978467835830587, - "learning_rate": 9.676329764269385e-05, - "loss": 0.4866185784339905, + "grad_norm": 0.29203678336341016, + "learning_rate": 9.676329764269385e-06, + "loss": 0.4587559103965759, "num_tokens": 84446952.0, "step": 93 }, { "epoch": 0.7014925373134329, - "grad_norm": 0.43010122447956384, - "learning_rate": 9.667512933970314e-05, - "loss": 0.4521125555038452, + "grad_norm": 0.2977218953461726, + "learning_rate": 9.667512933970315e-06, + "loss": 0.429887980222702, "num_tokens": 85254048.0, "step": 94 }, { "epoch": 0.7089552238805971, - "grad_norm": 0.4288239349494501, - "learning_rate": 9.65858222909632e-05, - "loss": 0.48363029956817627, + "grad_norm": 0.319328445980617, + "learning_rate": 9.65858222909632e-06, + "loss": 0.4590649902820587, "num_tokens": 86163467.0, "step": 95 }, { "epoch": 0.7164179104477612, - "grad_norm": 0.4810875898406697, - "learning_rate": 9.649537893673096e-05, - "loss": 0.482228547334671, + "grad_norm": 0.5444784762173913, + "learning_rate": 9.649537893673096e-06, + "loss": 0.4472053647041321, "num_tokens": 86980140.0, "step": 96 }, { "epoch": 0.7238805970149254, - "grad_norm": 0.41103096702587355, - "learning_rate": 9.64038017483121e-05, - "loss": 0.4684882164001465, + "grad_norm": 0.33070572527793457, + "learning_rate": 9.640380174831209e-06, + "loss": 0.44589415192604065, "num_tokens": 87928454.0, "step": 97 }, { "epoch": 0.7313432835820896, - "grad_norm": 0.5008519200324506, - "learning_rate": 9.631109322799362e-05, - "loss": 0.47999823093414307, + "grad_norm": 0.31480720093895037, + "learning_rate": 9.631109322799362e-06, + "loss": 0.45890533924102783, "num_tokens": 88687125.0, "step": 98 }, { "epoch": 0.7388059701492538, - "grad_norm": 0.40871942406005346, - "learning_rate": 9.621725590897544e-05, - "loss": 0.47064870595932007, + "grad_norm": 0.3045515849614143, + "learning_rate": 9.621725590897544e-06, + "loss": 0.4472447633743286, "num_tokens": 89545040.0, "step": 99 }, { "epoch": 0.746268656716418, - "grad_norm": 0.5533629877971527, - "learning_rate": 9.61222923553011e-05, - "loss": 0.47418609261512756, + "grad_norm": 0.31053505819411625, + "learning_rate": 9.61222923553011e-06, + "loss": 0.44827064871788025, "num_tokens": 90294885.0, "step": 100 }, { "epoch": 0.753731343283582, - "grad_norm": 0.4096019553219438, - "learning_rate": 9.602620516178789e-05, - "loss": 0.46458983421325684, + "grad_norm": 0.3029175634429252, + "learning_rate": 9.60262051617879e-06, + "loss": 0.4412766695022583, "num_tokens": 91184198.0, "step": 101 }, { "epoch": 0.7611940298507462, - "grad_norm": 0.5417882134091164, - "learning_rate": 9.592899695395569e-05, - "loss": 0.4749392569065094, + "grad_norm": 0.31643279761949383, + "learning_rate": 9.592899695395569e-06, + "loss": 0.4483514428138733, "num_tokens": 91984545.0, "step": 102 }, { "epoch": 0.7686567164179104, - "grad_norm": 0.46912848663434387, - "learning_rate": 9.583067038795546e-05, - "loss": 0.5165929794311523, + "grad_norm": 0.29772953486777926, + "learning_rate": 9.583067038795547e-06, + "loss": 0.48575955629348755, "num_tokens": 92895986.0, "step": 103 }, { "epoch": 0.7761194029850746, - "grad_norm": 0.45605747269903074, - "learning_rate": 9.57312281504965e-05, - "loss": 0.4692317843437195, + "grad_norm": 0.3103900650504769, + "learning_rate": 9.57312281504965e-06, + "loss": 0.4450864791870117, "num_tokens": 93788383.0, "step": 104 }, { "epoch": 0.7835820895522388, - "grad_norm": 0.413703940483972, - "learning_rate": 9.563067295877318e-05, - "loss": 0.4407064914703369, + "grad_norm": 0.2842262724404981, + "learning_rate": 9.563067295877319e-06, + "loss": 0.4178208112716675, "num_tokens": 94636525.0, "step": 105 }, { "epoch": 0.7910447761194029, - "grad_norm": 0.5027685222504175, - "learning_rate": 9.552900756039056e-05, - "loss": 0.5160893201828003, + "grad_norm": 0.318233292303752, + "learning_rate": 9.552900756039057e-06, + "loss": 0.48816001415252686, "num_tokens": 95397416.0, "step": 106 }, { "epoch": 0.7985074626865671, - "grad_norm": 0.4239135998720147, - "learning_rate": 9.54262347332894e-05, - "loss": 0.4940451979637146, + "grad_norm": 0.3031459599411157, + "learning_rate": 9.54262347332894e-06, + "loss": 0.4687079191207886, "num_tokens": 96224288.0, "step": 107 }, { "epoch": 0.8059701492537313, - "grad_norm": 0.406157440303162, - "learning_rate": 9.532235728567024e-05, - "loss": 0.4534427225589752, + "grad_norm": 0.3044834471531261, + "learning_rate": 9.532235728567025e-06, + "loss": 0.4333556890487671, "num_tokens": 97053744.0, "step": 108 }, { "epoch": 0.8134328358208955, - "grad_norm": 0.4554739910581079, - "learning_rate": 9.521737805591662e-05, - "loss": 0.4805285930633545, + "grad_norm": 0.382174488436462, + "learning_rate": 9.521737805591662e-06, + "loss": 0.45386844873428345, "num_tokens": 97941243.0, "step": 109 }, { "epoch": 0.8208955223880597, - "grad_norm": 0.33650727385185253, - "learning_rate": 9.511129991251755e-05, - "loss": 0.4370970129966736, + "grad_norm": 0.29853935870773984, + "learning_rate": 9.511129991251755e-06, + "loss": 0.4180367588996887, "num_tokens": 98814023.0, "step": 110 }, { "epoch": 0.8283582089552238, - "grad_norm": 0.44443049353094594, - "learning_rate": 9.500412575398923e-05, - "loss": 0.4835229516029358, + "grad_norm": 0.3152812743712433, + "learning_rate": 9.500412575398923e-06, + "loss": 0.45900076627731323, "num_tokens": 99770911.0, "step": 111 }, { "epoch": 0.835820895522388, - "grad_norm": 0.3979273390265327, - "learning_rate": 9.489585850879565e-05, - "loss": 0.4832872748374939, + "grad_norm": 0.2798327916645599, + "learning_rate": 9.489585850879565e-06, + "loss": 0.4589983820915222, "num_tokens": 100802886.0, "step": 112 }, { "epoch": 0.8432835820895522, - "grad_norm": 0.3730826950162529, - "learning_rate": 9.478650113526875e-05, - "loss": 0.47132688760757446, + "grad_norm": 0.3302819245429099, + "learning_rate": 9.478650113526875e-06, + "loss": 0.44858676195144653, "num_tokens": 101744970.0, "step": 113 }, { "epoch": 0.8507462686567164, - "grad_norm": 0.45565992574946057, - "learning_rate": 9.467605662152745e-05, - "loss": 0.498141884803772, + "grad_norm": 0.29962088349132515, + "learning_rate": 9.467605662152746e-06, + "loss": 0.4746031165122986, "num_tokens": 102730722.0, "step": 114 }, { "epoch": 0.8582089552238806, - "grad_norm": 0.3264677346294157, - "learning_rate": 9.456452798539616e-05, - "loss": 0.4391544759273529, + "grad_norm": 0.2939144591705004, + "learning_rate": 9.456452798539617e-06, + "loss": 0.4174093008041382, "num_tokens": 103574949.0, "step": 115 }, { "epoch": 0.8656716417910447, - "grad_norm": 0.3806908047667308, - "learning_rate": 9.445191827432216e-05, - "loss": 0.46042460203170776, + "grad_norm": 0.3825239836099086, + "learning_rate": 9.445191827432216e-06, + "loss": 0.439868301153183, "num_tokens": 104504791.0, "step": 116 }, { "epoch": 0.8731343283582089, - "grad_norm": 0.3671505571888292, - "learning_rate": 9.433823056529242e-05, - "loss": 0.4969770312309265, + "grad_norm": 0.30386076772048964, + "learning_rate": 9.433823056529241e-06, + "loss": 0.47291260957717896, "num_tokens": 105479834.0, "step": 117 }, { "epoch": 0.8805970149253731, - "grad_norm": 0.3656172087288628, - "learning_rate": 9.42234679647495e-05, - "loss": 0.4630345106124878, + "grad_norm": 0.2762720558491326, + "learning_rate": 9.42234679647495e-06, + "loss": 0.4426780045032501, "num_tokens": 106438084.0, "step": 118 }, { "epoch": 0.8880597014925373, - "grad_norm": 0.4385983070574321, - "learning_rate": 9.410763360850666e-05, - "loss": 0.484518826007843, + "grad_norm": 0.3057508592926945, + "learning_rate": 9.410763360850666e-06, + "loss": 0.4623616933822632, "num_tokens": 107262750.0, "step": 119 }, { "epoch": 0.8955223880597015, - "grad_norm": 0.414564689428505, - "learning_rate": 9.399073066166217e-05, - "loss": 0.4803532063961029, + "grad_norm": 0.3127855621446368, + "learning_rate": 9.399073066166218e-06, + "loss": 0.4572855234146118, "num_tokens": 108143548.0, "step": 120 }, { "epoch": 0.9029850746268657, - "grad_norm": 0.37895213274310274, - "learning_rate": 9.387276231851292e-05, - "loss": 0.4863133132457733, + "grad_norm": 0.32166755849704814, + "learning_rate": 9.387276231851292e-06, + "loss": 0.4610549211502075, "num_tokens": 109031239.0, "step": 121 }, { "epoch": 0.9104477611940298, - "grad_norm": 0.37517338570063247, - "learning_rate": 9.375373180246697e-05, - "loss": 0.48885974287986755, + "grad_norm": 0.308391680528446, + "learning_rate": 9.375373180246698e-06, + "loss": 0.4695647358894348, "num_tokens": 109986382.0, "step": 122 }, { "epoch": 0.917910447761194, - "grad_norm": 0.365362492493383, - "learning_rate": 9.363364236595561e-05, - "loss": 0.5022351145744324, + "grad_norm": 0.2975657588114746, + "learning_rate": 9.363364236595561e-06, + "loss": 0.47796621918678284, "num_tokens": 110966120.0, "step": 123 }, { "epoch": 0.9253731343283582, - "grad_norm": 0.3758172981032564, - "learning_rate": 9.351249729034441e-05, - "loss": 0.48538535833358765, + "grad_norm": 0.31052979583373397, + "learning_rate": 9.351249729034441e-06, + "loss": 0.46253445744514465, "num_tokens": 111841748.0, "step": 124 }, { "epoch": 0.9328358208955224, - "grad_norm": 0.3406462221158264, - "learning_rate": 9.339029988584364e-05, - "loss": 0.4734869599342346, + "grad_norm": 0.30804176635348807, + "learning_rate": 9.339029988584364e-06, + "loss": 0.45033249258995056, "num_tokens": 112797621.0, "step": 125 }, { "epoch": 0.9402985074626866, - "grad_norm": 0.3822848911084842, - "learning_rate": 9.326705349141772e-05, - "loss": 0.494500070810318, + "grad_norm": 0.2896323126815727, + "learning_rate": 9.326705349141772e-06, + "loss": 0.46928197145462036, "num_tokens": 113854322.0, "step": 126 }, { "epoch": 0.9477611940298507, - "grad_norm": 0.35359771043589155, - "learning_rate": 9.314276147469409e-05, - "loss": 0.46158310770988464, + "grad_norm": 0.2863377703738466, + "learning_rate": 9.31427614746941e-06, + "loss": 0.44036608934402466, "num_tokens": 114797592.0, "step": 127 }, { "epoch": 0.9552238805970149, - "grad_norm": 0.35988780320611496, - "learning_rate": 9.301742723187105e-05, - "loss": 0.4702184498310089, + "grad_norm": 0.3136460841921916, + "learning_rate": 9.301742723187106e-06, + "loss": 0.4462299644947052, "num_tokens": 115756574.0, "step": 128 }, { "epoch": 0.9626865671641791, - "grad_norm": 0.3663787565082927, - "learning_rate": 9.289105418762512e-05, - "loss": 0.48986703157424927, + "grad_norm": 0.30712216569223755, + "learning_rate": 9.289105418762512e-06, + "loss": 0.46634775400161743, "num_tokens": 116620827.0, "step": 129 }, { "epoch": 0.9701492537313433, - "grad_norm": 0.3299804378891389, - "learning_rate": 9.276364579501742e-05, - "loss": 0.47562772035598755, + "grad_norm": 0.30150157073298506, + "learning_rate": 9.276364579501743e-06, + "loss": 0.4525374174118042, "num_tokens": 117496028.0, "step": 130 }, { "epoch": 0.9776119402985075, - "grad_norm": 0.35906803087148503, - "learning_rate": 9.263520553539919e-05, - "loss": 0.45369184017181396, + "grad_norm": 0.2863498319159055, + "learning_rate": 9.263520553539919e-06, + "loss": 0.43308988213539124, "num_tokens": 118326101.0, "step": 131 }, { "epoch": 0.9850746268656716, - "grad_norm": 0.3500162508097388, - "learning_rate": 9.250573691831688e-05, - "loss": 0.4840993881225586, + "grad_norm": 0.31739713823558746, + "learning_rate": 9.250573691831688e-06, + "loss": 0.4591742753982544, "num_tokens": 119217901.0, "step": 132 }, { "epoch": 0.9925373134328358, - "grad_norm": 7.330378953880745, - "learning_rate": 9.2375243481416e-05, - "loss": 0.8011837005615234, + "grad_norm": 0.3107389978804748, + "learning_rate": 9.2375243481416e-06, + "loss": 0.4491395056247711, "num_tokens": 120120192.0, "step": 133 }, { "epoch": 1.0, - "grad_norm": 0.6525925757760507, - "learning_rate": 9.224372879034471e-05, - "loss": 0.4851951003074646, + "grad_norm": 0.29934735002842794, + "learning_rate": 9.224372879034471e-06, + "loss": 0.44749873876571655, "num_tokens": 121051485.0, "step": 134 }, { "epoch": 1.007462686567164, - "grad_norm": 0.5378029376327099, - "learning_rate": 9.211119643865626e-05, - "loss": 0.41316893696784973, + "grad_norm": 0.33488387869414854, + "learning_rate": 9.211119643865626e-06, + "loss": 0.4307776689529419, "num_tokens": 121991896.0, "step": 135 }, { "epoch": 1.0149253731343284, - "grad_norm": 0.5313587364112035, - "learning_rate": 9.197765004771075e-05, - "loss": 0.3996645510196686, + "grad_norm": 0.32499655410029626, + "learning_rate": 9.197765004771074e-06, + "loss": 0.4204443097114563, "num_tokens": 122819690.0, "step": 136 }, { "epoch": 1.0223880597014925, - "grad_norm": 0.48571568078584254, - "learning_rate": 9.184309326657626e-05, - "loss": 0.3900325894355774, + "grad_norm": 0.34181089478733623, + "learning_rate": 9.184309326657627e-06, + "loss": 0.41079288721084595, "num_tokens": 123657032.0, "step": 137 }, { "epoch": 1.0298507462686568, - "grad_norm": 0.4791867765688365, - "learning_rate": 9.17075297719292e-05, - "loss": 0.3834831714630127, + "grad_norm": 0.5825488788426431, + "learning_rate": 9.17075297719292e-06, + "loss": 0.4082901179790497, "num_tokens": 124550556.0, "step": 138 }, { "epoch": 1.037313432835821, - "grad_norm": 0.5620164711451672, - "learning_rate": 9.157096326795368e-05, - "loss": 0.3994726240634918, + "grad_norm": 1.1799244713672623, + "learning_rate": 9.157096326795369e-06, + "loss": 0.42325854301452637, "num_tokens": 125328617.0, "step": 139 }, { "epoch": 1.044776119402985, - "grad_norm": 0.44663203860833633, - "learning_rate": 9.143339748624043e-05, - "loss": 0.3828105926513672, + "grad_norm": 0.3981431547057968, + "learning_rate": 9.143339748624044e-06, + "loss": 0.40712812542915344, "num_tokens": 126306594.0, "step": 140 }, { "epoch": 1.0522388059701493, - "grad_norm": 0.4005701326294082, - "learning_rate": 9.129483618568478e-05, - "loss": 0.38758426904678345, + "grad_norm": 0.32884099051410826, + "learning_rate": 9.129483618568478e-06, + "loss": 0.4147931933403015, "num_tokens": 127215038.0, "step": 141 }, { "epoch": 1.0597014925373134, - "grad_norm": 0.45636911740134045, - "learning_rate": 9.115528315238396e-05, - "loss": 0.39247897267341614, + "grad_norm": 0.3071551975535917, + "learning_rate": 9.115528315238396e-06, + "loss": 0.4247783422470093, "num_tokens": 128054129.0, "step": 142 }, { "epoch": 1.0671641791044777, - "grad_norm": 0.3920833897414762, - "learning_rate": 9.101474219953366e-05, - "loss": 0.3844168484210968, + "grad_norm": 0.3132240777032372, + "learning_rate": 9.101474219953367e-06, + "loss": 0.4133056104183197, "num_tokens": 128952014.0, "step": 143 }, { "epoch": 1.0746268656716418, - "grad_norm": 0.450948426505947, - "learning_rate": 9.087321716732383e-05, - "loss": 0.3821527361869812, + "grad_norm": 0.31895939410654406, + "learning_rate": 9.087321716732384e-06, + "loss": 0.4213321805000305, "num_tokens": 129774041.0, "step": 144 }, { "epoch": 1.0820895522388059, - "grad_norm": 0.3715852663434054, - "learning_rate": 9.073071192283375e-05, - "loss": 0.38106459379196167, + "grad_norm": 0.32304487832880724, + "learning_rate": 9.073071192283374e-06, + "loss": 0.4195047616958618, "num_tokens": 130656187.0, "step": 145 }, { "epoch": 1.0895522388059702, - "grad_norm": 0.45549767464806623, - "learning_rate": 9.058723035992632e-05, - "loss": 0.39967742562294006, + "grad_norm": 0.31668877560620456, + "learning_rate": 9.058723035992632e-06, + "loss": 0.4216320514678955, "num_tokens": 131546421.0, "step": 146 }, { "epoch": 1.0970149253731343, - "grad_norm": 0.3648384954251166, - "learning_rate": 9.044277639914177e-05, - "loss": 0.3912694454193115, + "grad_norm": 0.30109857359574926, + "learning_rate": 9.044277639914177e-06, + "loss": 0.4255885183811188, "num_tokens": 132482644.0, "step": 147 }, { "epoch": 1.1044776119402986, - "grad_norm": 0.4152357326418446, - "learning_rate": 9.029735398759044e-05, - "loss": 0.3676859438419342, + "grad_norm": 0.28611352244816046, + "learning_rate": 9.029735398759044e-06, + "loss": 0.4004859924316406, "num_tokens": 133363098.0, "step": 148 }, { "epoch": 1.1119402985074627, - "grad_norm": 0.3963222277152024, - "learning_rate": 9.015096709884493e-05, - "loss": 0.3925454020500183, + "grad_norm": 0.3246541214309705, + "learning_rate": 9.015096709884493e-06, + "loss": 0.41801226139068604, "num_tokens": 134281169.0, "step": 149 }, { "epoch": 1.1194029850746268, - "grad_norm": 0.4064960888262719, - "learning_rate": 9.00036197328316e-05, - "loss": 0.35971492528915405, + "grad_norm": 0.39523810160114464, + "learning_rate": 9.00036197328316e-06, + "loss": 0.39403271675109863, "num_tokens": 135132326.0, "step": 150 }, { "epoch": 1.126865671641791, - "grad_norm": 0.4071754971946692, - "learning_rate": 8.985531591572117e-05, - "loss": 0.3754289746284485, + "grad_norm": 0.3372219635650443, + "learning_rate": 8.985531591572117e-06, + "loss": 0.40995997190475464, "num_tokens": 136009199.0, "step": 151 }, { "epoch": 1.1343283582089552, - "grad_norm": 0.36870186585431025, - "learning_rate": 8.97060596998188e-05, - "loss": 0.4028257727622986, + "grad_norm": 0.2880187226242739, + "learning_rate": 8.97060596998188e-06, + "loss": 0.44250696897506714, "num_tokens": 136974761.0, "step": 152 }, { "epoch": 1.1417910447761195, - "grad_norm": 0.4344228020561186, - "learning_rate": 8.955585516345333e-05, - "loss": 0.3753546178340912, + "grad_norm": 0.2840439662929065, + "learning_rate": 8.955585516345333e-06, + "loss": 0.41125112771987915, "num_tokens": 137953131.0, "step": 153 }, { "epoch": 1.1492537313432836, - "grad_norm": 0.36266607541139245, - "learning_rate": 8.940470641086582e-05, - "loss": 0.37533092498779297, + "grad_norm": 0.30854018310336556, + "learning_rate": 8.940470641086583e-06, + "loss": 0.41466018557548523, "num_tokens": 138890202.0, "step": 154 }, { "epoch": 1.1567164179104479, - "grad_norm": 0.47846108831358514, - "learning_rate": 8.925261757209744e-05, - "loss": 0.4062751531600952, + "grad_norm": 0.2861522107018775, + "learning_rate": 8.925261757209744e-06, + "loss": 0.4421645998954773, "num_tokens": 139921851.0, "step": 155 }, { "epoch": 1.164179104477612, - "grad_norm": 0.39212783793080364, - "learning_rate": 8.909959280287657e-05, - "loss": 0.3796807527542114, + "grad_norm": 0.30184466401361404, + "learning_rate": 8.909959280287657e-06, + "loss": 0.41726770997047424, "num_tokens": 140840212.0, "step": 156 }, { "epoch": 1.171641791044776, - "grad_norm": 0.4159448802180959, - "learning_rate": 8.894563628450533e-05, - "loss": 0.3769783079624176, + "grad_norm": 0.29786414496705443, + "learning_rate": 8.894563628450534e-06, + "loss": 0.4137997627258301, "num_tokens": 141681181.0, "step": 157 }, { "epoch": 1.1791044776119404, - "grad_norm": 0.790798611601307, - "learning_rate": 8.879075222374522e-05, - "loss": 0.3683978319168091, + "grad_norm": 0.27612956474353256, + "learning_rate": 8.879075222374522e-06, + "loss": 0.3967845141887665, "num_tokens": 142603331.0, "step": 158 }, { "epoch": 1.1865671641791045, - "grad_norm": 0.49304134063646404, - "learning_rate": 8.863494485270227e-05, - "loss": 0.36335837841033936, + "grad_norm": 0.2936198747641151, + "learning_rate": 8.863494485270228e-06, + "loss": 0.3882240355014801, "num_tokens": 143438386.0, "step": 159 }, { "epoch": 1.1940298507462686, - "grad_norm": 0.3800436969990742, - "learning_rate": 8.847821842871137e-05, - "loss": 0.38635337352752686, + "grad_norm": 0.28750782577222145, + "learning_rate": 8.847821842871137e-06, + "loss": 0.42263633012771606, "num_tokens": 144352522.0, "step": 160 }, { "epoch": 1.2014925373134329, - "grad_norm": 0.40575188138718515, - "learning_rate": 8.832057723421989e-05, - "loss": 0.3920779824256897, + "grad_norm": 0.32255178451364774, + "learning_rate": 8.832057723421989e-06, + "loss": 0.42398497462272644, "num_tokens": 145160558.0, "step": 161 }, { "epoch": 1.208955223880597, - "grad_norm": 0.3605573732327459, - "learning_rate": 8.816202557667076e-05, - "loss": 0.37881582975387573, + "grad_norm": 0.32016607068719616, + "learning_rate": 8.816202557667076e-06, + "loss": 0.40889400243759155, "num_tokens": 145970221.0, "step": 162 }, { "epoch": 1.2164179104477613, - "grad_norm": 0.3517475546963777, - "learning_rate": 8.800256778838468e-05, - "loss": 0.3680334687232971, + "grad_norm": 0.30212941397274007, + "learning_rate": 8.800256778838468e-06, + "loss": 0.3960338234901428, "num_tokens": 146893310.0, "step": 163 }, { "epoch": 1.2238805970149254, - "grad_norm": 0.40078029663425446, - "learning_rate": 8.784220822644179e-05, - "loss": 0.42311668395996094, + "grad_norm": 0.31197303744834676, + "learning_rate": 8.78422082264418e-06, + "loss": 0.44305476546287537, "num_tokens": 147701963.0, "step": 164 }, { "epoch": 1.2313432835820897, - "grad_norm": 0.3311618467192894, - "learning_rate": 8.768095127256262e-05, - "loss": 0.3540314733982086, + "grad_norm": 0.2823293130053843, + "learning_rate": 8.768095127256263e-06, + "loss": 0.3833114206790924, "num_tokens": 148634179.0, "step": 165 }, { "epoch": 1.2388059701492538, - "grad_norm": 0.44011257398208514, - "learning_rate": 8.751880133298834e-05, - "loss": 0.38583946228027344, + "grad_norm": 0.2811151003410808, + "learning_rate": 8.751880133298834e-06, + "loss": 0.4171923100948334, "num_tokens": 149594443.0, "step": 166 }, { "epoch": 1.2462686567164178, - "grad_norm": 0.35797806577728, - "learning_rate": 8.735576283836038e-05, - "loss": 0.39779961109161377, + "grad_norm": 0.31565679619489956, + "learning_rate": 8.735576283836039e-06, + "loss": 0.43264657258987427, "num_tokens": 150495465.0, "step": 167 }, { "epoch": 1.2537313432835822, - "grad_norm": 0.438751709144054, - "learning_rate": 8.719184024359935e-05, - "loss": 0.3823118805885315, + "grad_norm": 0.3023001398731657, + "learning_rate": 8.719184024359935e-06, + "loss": 0.4185860753059387, "num_tokens": 151402535.0, "step": 168 }, { "epoch": 1.2611940298507462, - "grad_norm": 0.373713033078772, - "learning_rate": 8.702703802778332e-05, - "loss": 0.3971496820449829, + "grad_norm": 0.3114367097991156, + "learning_rate": 8.702703802778332e-06, + "loss": 0.444894403219223, "num_tokens": 152354215.0, "step": 169 }, { "epoch": 1.2686567164179103, - "grad_norm": 0.4106846577179372, - "learning_rate": 8.686136069402542e-05, - "loss": 0.3557892143726349, + "grad_norm": 0.3130958107073367, + "learning_rate": 8.686136069402542e-06, + "loss": 0.3862420916557312, "num_tokens": 153135819.0, "step": 170 }, { "epoch": 1.2761194029850746, - "grad_norm": 1.440626502788841, - "learning_rate": 8.669481276935084e-05, - "loss": 0.4085765480995178, + "grad_norm": 0.32026467648986173, + "learning_rate": 8.669481276935085e-06, + "loss": 0.43771523237228394, "num_tokens": 154060950.0, "step": 171 }, { "epoch": 1.2835820895522387, - "grad_norm": 0.5258818618948825, - "learning_rate": 8.652739880457309e-05, - "loss": 0.4162169396877289, + "grad_norm": 0.33753040760769915, + "learning_rate": 8.652739880457309e-06, + "loss": 0.4314393401145935, "num_tokens": 154999582.0, "step": 172 }, { "epoch": 1.291044776119403, - "grad_norm": 0.35167964676856805, - "learning_rate": 8.635912337416962e-05, - "loss": 0.39445915818214417, + "grad_norm": 0.31404977555481944, + "learning_rate": 8.635912337416963e-06, + "loss": 0.4238457679748535, "num_tokens": 155889540.0, "step": 173 }, { "epoch": 1.2985074626865671, - "grad_norm": 0.3659684336632256, - "learning_rate": 8.618999107615694e-05, - "loss": 0.380463182926178, + "grad_norm": 0.2917828706410469, + "learning_rate": 8.618999107615694e-06, + "loss": 0.4157620072364807, "num_tokens": 156887223.0, "step": 174 }, { "epoch": 1.3059701492537314, - "grad_norm": 0.382228785857524, - "learning_rate": 8.602000653196484e-05, - "loss": 0.38270196318626404, + "grad_norm": 0.2929002597150211, + "learning_rate": 8.602000653196484e-06, + "loss": 0.4093779921531677, "num_tokens": 157776705.0, "step": 175 }, { "epoch": 1.3134328358208955, - "grad_norm": 0.36608008577274276, - "learning_rate": 8.584917438631021e-05, - "loss": 0.39126056432724, + "grad_norm": 0.2981368517552101, + "learning_rate": 8.584917438631022e-06, + "loss": 0.4151228070259094, "num_tokens": 158724790.0, "step": 176 }, { "epoch": 1.3208955223880596, - "grad_norm": 0.3993251647024534, - "learning_rate": 8.567749930707012e-05, - "loss": 0.3943611979484558, + "grad_norm": 0.307459834676784, + "learning_rate": 8.567749930707012e-06, + "loss": 0.42905163764953613, "num_tokens": 159719326.0, "step": 177 }, { "epoch": 1.328358208955224, - "grad_norm": 0.3507544047377843, - "learning_rate": 8.550498598515421e-05, - "loss": 0.41549068689346313, + "grad_norm": 0.3174851983597954, + "learning_rate": 8.55049859851542e-06, + "loss": 0.44639986753463745, "num_tokens": 160650411.0, "step": 178 }, { "epoch": 1.335820895522388, - "grad_norm": 0.41277833860040686, - "learning_rate": 8.533163913437657e-05, - "loss": 0.3995250463485718, + "grad_norm": 0.37310729673210785, + "learning_rate": 8.533163913437657e-06, + "loss": 0.4070381820201874, "num_tokens": 161685151.0, "step": 179 }, { "epoch": 1.3432835820895521, - "grad_norm": 0.30725618067056165, - "learning_rate": 8.515746349132693e-05, - "loss": 0.3731691837310791, + "grad_norm": 0.34243880652688075, + "learning_rate": 8.515746349132693e-06, + "loss": 0.40524742007255554, "num_tokens": 162668291.0, "step": 180 }, { "epoch": 1.3507462686567164, - "grad_norm": 0.3539602450879266, - "learning_rate": 8.498246381524123e-05, - "loss": 0.3635512590408325, + "grad_norm": 0.3314697629279733, + "learning_rate": 8.498246381524123e-06, + "loss": 0.39374542236328125, "num_tokens": 163602019.0, "step": 181 }, { "epoch": 1.3582089552238805, - "grad_norm": 0.3271307622787571, - "learning_rate": 8.480664488787157e-05, - "loss": 0.38671034574508667, + "grad_norm": 0.39714424307879675, + "learning_rate": 8.480664488787157e-06, + "loss": 0.41536325216293335, "num_tokens": 164374987.0, "step": 182 }, { "epoch": 1.3656716417910448, - "grad_norm": 0.3976363659582512, - "learning_rate": 8.463001151335556e-05, - "loss": 0.38379645347595215, + "grad_norm": 0.30470654817019394, + "learning_rate": 8.463001151335556e-06, + "loss": 0.420206755399704, "num_tokens": 165277351.0, "step": 183 }, { "epoch": 1.373134328358209, - "grad_norm": 0.3142952756607059, - "learning_rate": 8.445256851808504e-05, - "loss": 0.37027812004089355, + "grad_norm": 0.30147269826178413, + "learning_rate": 8.445256851808504e-06, + "loss": 0.40577423572540283, "num_tokens": 166179864.0, "step": 184 }, { "epoch": 1.3805970149253732, - "grad_norm": 0.4062824863807851, - "learning_rate": 8.427432075057423e-05, - "loss": 0.37924981117248535, + "grad_norm": 0.3160553991473881, + "learning_rate": 8.427432075057422e-06, + "loss": 0.3979928195476532, "num_tokens": 167127067.0, "step": 185 }, { "epoch": 1.3880597014925373, - "grad_norm": 0.41964498318259574, - "learning_rate": 8.409527308132718e-05, - "loss": 0.41143038868904114, + "grad_norm": 0.31665903933128287, + "learning_rate": 8.409527308132717e-06, + "loss": 0.4436604976654053, "num_tokens": 168100947.0, "step": 186 }, { "epoch": 1.3955223880597014, - "grad_norm": 0.37787793618980803, - "learning_rate": 8.391543040270478e-05, - "loss": 0.38744503259658813, + "grad_norm": 0.296181555140025, + "learning_rate": 8.391543040270477e-06, + "loss": 0.42373591661453247, "num_tokens": 168977100.0, "step": 187 }, { "epoch": 1.4029850746268657, - "grad_norm": 0.36928157132387057, - "learning_rate": 8.373479762879103e-05, - "loss": 0.3925457000732422, + "grad_norm": 0.340781706854354, + "learning_rate": 8.373479762879104e-06, + "loss": 0.4242423474788666, "num_tokens": 169809036.0, "step": 188 }, { "epoch": 1.4104477611940298, - "grad_norm": 0.324399804805819, - "learning_rate": 8.355337969525875e-05, - "loss": 0.3671782612800598, + "grad_norm": 0.2912347476979519, + "learning_rate": 8.355337969525876e-06, + "loss": 0.3881043791770935, "num_tokens": 170799001.0, "step": 189 }, { "epoch": 1.417910447761194, - "grad_norm": 0.39962644570980893, - "learning_rate": 8.337118155923474e-05, - "loss": 0.39616724848747253, + "grad_norm": 0.3167891630018227, + "learning_rate": 8.337118155923474e-06, + "loss": 0.417064368724823, "num_tokens": 171563636.0, "step": 190 }, { "epoch": 1.4253731343283582, - "grad_norm": 0.3492419569641068, - "learning_rate": 8.318820819916433e-05, - "loss": 0.3766665756702423, + "grad_norm": 0.32116936347486175, + "learning_rate": 8.318820819916433e-06, + "loss": 0.40856266021728516, "num_tokens": 172297711.0, "step": 191 }, { "epoch": 1.4328358208955223, - "grad_norm": 0.36796234487192975, - "learning_rate": 8.300446461467533e-05, - "loss": 0.42141246795654297, + "grad_norm": 0.3019887016574649, + "learning_rate": 8.300446461467533e-06, + "loss": 0.4446168541908264, "num_tokens": 173246434.0, "step": 192 }, { "epoch": 1.4402985074626866, - "grad_norm": 1.8563826752962438, - "learning_rate": 8.281995582644145e-05, - "loss": 0.4875096082687378, + "grad_norm": 0.3138769818399579, + "learning_rate": 8.281995582644145e-06, + "loss": 0.4181920289993286, "num_tokens": 174149904.0, "step": 193 }, { "epoch": 1.4477611940298507, - "grad_norm": 0.5629917997104381, - "learning_rate": 8.263468687604509e-05, - "loss": 0.41550225019454956, + "grad_norm": 0.313975344503838, + "learning_rate": 8.263468687604508e-06, + "loss": 0.4371890425682068, "num_tokens": 174963687.0, "step": 194 }, { "epoch": 1.455223880597015, - "grad_norm": 0.33085937583376096, - "learning_rate": 8.244866282583956e-05, - "loss": 0.4141860008239746, + "grad_norm": 0.29628794439446526, + "learning_rate": 8.244866282583957e-06, + "loss": 0.43816518783569336, "num_tokens": 175988598.0, "step": 195 }, { "epoch": 1.462686567164179, - "grad_norm": 0.4257798837248504, - "learning_rate": 8.226188875881081e-05, - "loss": 0.3922231197357178, + "grad_norm": 0.2963583065242463, + "learning_rate": 8.226188875881082e-06, + "loss": 0.41185736656188965, "num_tokens": 176960311.0, "step": 196 }, { "epoch": 1.4701492537313432, - "grad_norm": 0.4402132133601934, - "learning_rate": 8.20743697784385e-05, - "loss": 0.43575319647789, + "grad_norm": 0.2991189293307387, + "learning_rate": 8.20743697784385e-06, + "loss": 0.46473461389541626, "num_tokens": 177889691.0, "step": 197 }, { "epoch": 1.4776119402985075, - "grad_norm": 0.35794861202796946, - "learning_rate": 8.188611100855655e-05, - "loss": 0.3664058744907379, + "grad_norm": 0.26573849496019714, + "learning_rate": 8.188611100855656e-06, + "loss": 0.3865639567375183, "num_tokens": 178835508.0, "step": 198 }, { "epoch": 1.4850746268656716, - "grad_norm": 0.43264022580218503, - "learning_rate": 8.169711759321318e-05, - "loss": 0.4006292223930359, + "grad_norm": 0.28471866573069565, + "learning_rate": 8.169711759321318e-06, + "loss": 0.4254840612411499, "num_tokens": 179780829.0, "step": 199 }, { "epoch": 1.4925373134328357, - "grad_norm": 0.3339118816374283, - "learning_rate": 8.150739469653026e-05, - "loss": 0.3559767007827759, + "grad_norm": 0.27591064975620333, + "learning_rate": 8.150739469653026e-06, + "loss": 0.3821393847465515, "num_tokens": 180675259.0, "step": 200 }, { "epoch": 1.5, - "grad_norm": 0.38409562939014874, - "learning_rate": 8.131694750256234e-05, - "loss": 0.40150728821754456, + "grad_norm": 0.2912891463065521, + "learning_rate": 8.131694750256234e-06, + "loss": 0.4260258972644806, "num_tokens": 181593083.0, "step": 201 }, { "epoch": 1.5074626865671643, - "grad_norm": 0.3685144407348577, - "learning_rate": 8.112578121515486e-05, - "loss": 0.3964880108833313, + "grad_norm": 0.3470505245514532, + "learning_rate": 8.112578121515485e-06, + "loss": 0.42295166850090027, "num_tokens": 182453649.0, "step": 202 }, { "epoch": 1.5149253731343284, - "grad_norm": 0.3782677487633485, - "learning_rate": 8.093390105780201e-05, - "loss": 0.38671010732650757, + "grad_norm": 0.333624297966994, + "learning_rate": 8.0933901057802e-06, + "loss": 0.4165676534175873, "num_tokens": 183252908.0, "step": 203 }, { "epoch": 1.5223880597014925, - "grad_norm": 0.361547018908814, - "learning_rate": 8.074131227350408e-05, - "loss": 0.39490896463394165, + "grad_norm": 0.2999450247966616, + "learning_rate": 8.074131227350408e-06, + "loss": 0.42348137497901917, "num_tokens": 184218061.0, "step": 204 }, { "epoch": 1.5298507462686568, - "grad_norm": 0.39623365745043226, - "learning_rate": 8.05480201246241e-05, - "loss": 0.4241552948951721, + "grad_norm": 0.33075885588759496, + "learning_rate": 8.05480201246241e-06, + "loss": 0.4413604140281677, "num_tokens": 185123701.0, "step": 205 }, { "epoch": 1.537313432835821, - "grad_norm": 0.384337326931672, - "learning_rate": 8.035402989274402e-05, - "loss": 0.40156304836273193, + "grad_norm": 0.3236918821990334, + "learning_rate": 8.035402989274402e-06, + "loss": 0.4267103970050812, "num_tokens": 186020054.0, "step": 206 }, { "epoch": 1.544776119402985, - "grad_norm": 0.3147488283187526, - "learning_rate": 8.015934687852053e-05, - "loss": 0.3800262212753296, + "grad_norm": 0.28545115313146596, + "learning_rate": 8.015934687852053e-06, + "loss": 0.4010322690010071, "num_tokens": 186957926.0, "step": 207 }, { "epoch": 1.5522388059701493, - "grad_norm": 0.3753384108449989, - "learning_rate": 7.996397640154011e-05, - "loss": 0.40007251501083374, + "grad_norm": 0.33525388932605726, + "learning_rate": 7.996397640154012e-06, + "loss": 0.43479830026626587, "num_tokens": 187967937.0, "step": 208 }, { "epoch": 1.5597014925373134, - "grad_norm": 0.33847652188449584, - "learning_rate": 7.976792380017372e-05, - "loss": 0.35767635703086853, + "grad_norm": 0.2852110581692416, + "learning_rate": 7.976792380017374e-06, + "loss": 0.3835904002189636, "num_tokens": 188699883.0, "step": 209 }, { "epoch": 1.5671641791044775, - "grad_norm": 0.38090950631145293, - "learning_rate": 7.957119443143094e-05, - "loss": 0.39926737546920776, + "grad_norm": 0.38746256380732114, + "learning_rate": 7.957119443143093e-06, + "loss": 0.43473392724990845, "num_tokens": 189533459.0, "step": 210 }, { "epoch": 1.5746268656716418, - "grad_norm": 0.3450528620578906, - "learning_rate": 7.937379367081355e-05, - "loss": 0.3803778290748596, + "grad_norm": 0.30040372660742176, + "learning_rate": 7.937379367081356e-06, + "loss": 0.4094908535480499, "num_tokens": 190331401.0, "step": 211 }, { "epoch": 1.582089552238806, - "grad_norm": 0.3420127027284222, - "learning_rate": 7.917572691216867e-05, - "loss": 0.4145554006099701, + "grad_norm": 0.35097170028371405, + "learning_rate": 7.917572691216868e-06, + "loss": 0.44787487387657166, "num_tokens": 191163315.0, "step": 212 }, { "epoch": 1.5895522388059702, - "grad_norm": 0.3346895784687655, - "learning_rate": 7.897699956754142e-05, - "loss": 0.3825417160987854, + "grad_norm": 0.29035162522974023, + "learning_rate": 7.897699956754142e-06, + "loss": 0.41564756631851196, "num_tokens": 192105809.0, "step": 213 }, { "epoch": 1.5970149253731343, - "grad_norm": 0.3029553628970468, - "learning_rate": 7.877761706702697e-05, - "loss": 0.39795738458633423, + "grad_norm": 0.3234055460991543, + "learning_rate": 7.877761706702698e-06, + "loss": 0.42737478017807007, "num_tokens": 193098168.0, "step": 214 }, { "epoch": 1.6044776119402986, - "grad_norm": 0.31475834316855134, - "learning_rate": 7.85775848586222e-05, - "loss": 0.3966895341873169, + "grad_norm": 0.3181366599415042, + "learning_rate": 7.85775848586222e-06, + "loss": 0.4263436794281006, "num_tokens": 193975959.0, "step": 215 }, { "epoch": 1.6119402985074627, - "grad_norm": 0.33289818446973524, - "learning_rate": 7.837690840807687e-05, - "loss": 0.40487581491470337, + "grad_norm": 0.3047597849777916, + "learning_rate": 7.837690840807688e-06, + "loss": 0.4356343150138855, "num_tokens": 194828963.0, "step": 216 }, { "epoch": 1.6194029850746268, - "grad_norm": 0.29122335175536246, - "learning_rate": 7.817559319874418e-05, - "loss": 0.3636058568954468, + "grad_norm": 0.2953366209904587, + "learning_rate": 7.817559319874417e-06, + "loss": 0.39498403668403625, "num_tokens": 195757337.0, "step": 217 }, { "epoch": 1.626865671641791, - "grad_norm": 0.3209769832982439, - "learning_rate": 7.797364473143104e-05, - "loss": 0.3857102692127228, + "grad_norm": 0.2936401683412748, + "learning_rate": 7.797364473143105e-06, + "loss": 0.4154474139213562, "num_tokens": 196731181.0, "step": 218 }, { "epoch": 1.6343283582089554, - "grad_norm": 0.3288943639298265, - "learning_rate": 7.777106852424769e-05, - "loss": 0.3973213732242584, + "grad_norm": 0.2898185408597091, + "learning_rate": 7.77710685242477e-06, + "loss": 0.42473846673965454, "num_tokens": 197621017.0, "step": 219 }, { "epoch": 1.6417910447761193, - "grad_norm": 0.32622435912358894, - "learning_rate": 7.756787011245699e-05, - "loss": 0.41372135281562805, + "grad_norm": 0.29114088952907274, + "learning_rate": 7.7567870112457e-06, + "loss": 0.4433613419532776, "num_tokens": 198631859.0, "step": 220 }, { "epoch": 1.6492537313432836, - "grad_norm": 0.3306632028941975, - "learning_rate": 7.736405504832313e-05, - "loss": 0.4025194048881531, + "grad_norm": 0.31287064287880717, + "learning_rate": 7.736405504832314e-06, + "loss": 0.4322376549243927, "num_tokens": 199557498.0, "step": 221 }, { "epoch": 1.6567164179104479, - "grad_norm": 0.30915788874904887, - "learning_rate": 7.715962890095988e-05, - "loss": 0.39195138216018677, + "grad_norm": 0.3031132335175992, + "learning_rate": 7.715962890095988e-06, + "loss": 0.41872939467430115, "num_tokens": 200455519.0, "step": 222 }, { "epoch": 1.664179104477612, - "grad_norm": 0.30454874489918177, - "learning_rate": 7.695459725617851e-05, - "loss": 0.41680413484573364, + "grad_norm": 0.5127084447985639, + "learning_rate": 7.695459725617851e-06, + "loss": 0.4426816999912262, "num_tokens": 201364168.0, "step": 223 }, { "epoch": 1.671641791044776, - "grad_norm": 0.28818711729152613, - "learning_rate": 7.674896571633507e-05, - "loss": 0.36696305871009827, + "grad_norm": 0.36355358662257686, + "learning_rate": 7.674896571633507e-06, + "loss": 0.3920941650867462, "num_tokens": 202272665.0, "step": 224 }, { "epoch": 1.6791044776119404, - "grad_norm": 0.2742058975686566, - "learning_rate": 7.654273990017741e-05, - "loss": 0.36067163944244385, + "grad_norm": 0.2918543179655489, + "learning_rate": 7.654273990017742e-06, + "loss": 0.3865686058998108, "num_tokens": 203236852.0, "step": 225 }, { "epoch": 1.6865671641791045, - "grad_norm": 0.2929359682597656, - "learning_rate": 7.633592544269152e-05, - "loss": 0.38387706875801086, + "grad_norm": 0.29443958475831755, + "learning_rate": 7.633592544269152e-06, + "loss": 0.41160887479782104, "num_tokens": 204144409.0, "step": 226 }, { "epoch": 1.6940298507462686, - "grad_norm": 0.3072610406229327, - "learning_rate": 7.612852799494769e-05, - "loss": 0.39547377824783325, + "grad_norm": 0.29368087510062574, + "learning_rate": 7.61285279949477e-06, + "loss": 0.41996899247169495, "num_tokens": 205087641.0, "step": 227 }, { "epoch": 1.7014925373134329, - "grad_norm": 0.3046081229840581, - "learning_rate": 7.592055322394603e-05, - "loss": 0.4030948281288147, + "grad_norm": 0.2981876720268518, + "learning_rate": 7.592055322394602e-06, + "loss": 0.4322773814201355, "num_tokens": 205964269.0, "step": 228 }, { "epoch": 1.7089552238805972, - "grad_norm": 0.3500714972638898, - "learning_rate": 7.571200681246159e-05, - "loss": 0.40775904059410095, + "grad_norm": 0.3032205060654827, + "learning_rate": 7.5712006812461595e-06, + "loss": 0.4357481002807617, "num_tokens": 206853325.0, "step": 229 }, { "epoch": 1.716417910447761, - "grad_norm": 0.29254873485785454, - "learning_rate": 7.550289445888915e-05, - "loss": 0.3864845037460327, + "grad_norm": 0.30382769873452287, + "learning_rate": 7.5502894458889154e-06, + "loss": 0.42187392711639404, "num_tokens": 207780456.0, "step": 230 }, { "epoch": 1.7238805970149254, - "grad_norm": 0.3542537132949729, - "learning_rate": 7.529322187708752e-05, - "loss": 0.41177991032600403, + "grad_norm": 0.28458753280851, + "learning_rate": 7.529322187708752e-06, + "loss": 0.4417547583580017, "num_tokens": 208692271.0, "step": 231 }, { "epoch": 1.7313432835820897, - "grad_norm": 0.30924906129782137, - "learning_rate": 7.508299479622335e-05, - "loss": 0.37576648592948914, + "grad_norm": 0.28678480761878283, + "learning_rate": 7.5082994796223355e-06, + "loss": 0.4000692367553711, "num_tokens": 209542301.0, "step": 232 }, { "epoch": 1.7388059701492538, - "grad_norm": 0.4098176766583682, - "learning_rate": 7.487221896061458e-05, - "loss": 0.40126246213912964, + "grad_norm": 0.3105804034516556, + "learning_rate": 7.487221896061458e-06, + "loss": 0.43237993121147156, "num_tokens": 210462903.0, "step": 233 }, { "epoch": 1.7462686567164178, - "grad_norm": 0.3755459892153449, - "learning_rate": 7.466090012957361e-05, - "loss": 0.40743741393089294, + "grad_norm": 0.3069476203994755, + "learning_rate": 7.466090012957361e-06, + "loss": 0.4426308274269104, "num_tokens": 211451379.0, "step": 234 }, { "epoch": 1.7537313432835822, - "grad_norm": 0.2742607115279538, - "learning_rate": 7.444904407724974e-05, - "loss": 0.3960835635662079, + "grad_norm": 0.29187302592713965, + "learning_rate": 7.444904407724973e-06, + "loss": 0.4144989252090454, "num_tokens": 212341336.0, "step": 235 }, { "epoch": 1.7611940298507462, - "grad_norm": 0.2880514272386276, - "learning_rate": 7.423665659247153e-05, - "loss": 0.39030489325523376, + "grad_norm": 0.2715020106858522, + "learning_rate": 7.423665659247154e-06, + "loss": 0.4140280485153198, "num_tokens": 213184565.0, "step": 236 }, { "epoch": 1.7686567164179103, - "grad_norm": 0.30761746168132653, - "learning_rate": 7.402374347858861e-05, - "loss": 0.3909149765968323, + "grad_norm": 0.3042751492929567, + "learning_rate": 7.402374347858862e-06, + "loss": 0.4220738708972931, "num_tokens": 214162910.0, "step": 237 }, { "epoch": 1.7761194029850746, - "grad_norm": 0.3163616587429075, - "learning_rate": 7.381031055331306e-05, - "loss": 0.4014688730239868, + "grad_norm": 0.283596579410495, + "learning_rate": 7.381031055331306e-06, + "loss": 0.43350133299827576, "num_tokens": 215182240.0, "step": 238 }, { "epoch": 1.783582089552239, - "grad_norm": 0.3161196535831943, - "learning_rate": 7.359636364856043e-05, - "loss": 0.41104453802108765, + "grad_norm": 0.29114085647177373, + "learning_rate": 7.3596363648560445e-06, + "loss": 0.4327085316181183, "num_tokens": 216074554.0, "step": 239 }, { "epoch": 1.7910447761194028, - "grad_norm": 0.31321297658507724, - "learning_rate": 7.338190861029052e-05, - "loss": 0.39404720067977905, + "grad_norm": 0.28379283338161987, + "learning_rate": 7.338190861029052e-06, + "loss": 0.4293884038925171, "num_tokens": 216989156.0, "step": 240 }, { "epoch": 1.7985074626865671, - "grad_norm": 0.3006200698860379, - "learning_rate": 7.316695129834744e-05, - "loss": 0.3874233663082123, + "grad_norm": 0.31407525298001004, + "learning_rate": 7.316695129834744e-06, + "loss": 0.4033690392971039, "num_tokens": 217859754.0, "step": 241 }, { "epoch": 1.8059701492537314, - "grad_norm": 0.3057480054891478, - "learning_rate": 7.295149758629966e-05, - "loss": 0.38956218957901, + "grad_norm": 0.3013707320804031, + "learning_rate": 7.2951497586299665e-06, + "loss": 0.415780246257782, "num_tokens": 218674048.0, "step": 242 }, { "epoch": 1.8134328358208955, - "grad_norm": 0.3012677824183665, - "learning_rate": 7.273555336127947e-05, - "loss": 0.4058060050010681, + "grad_norm": 0.3130414485143585, + "learning_rate": 7.273555336127948e-06, + "loss": 0.4289485216140747, "num_tokens": 219544627.0, "step": 243 }, { "epoch": 1.8208955223880596, - "grad_norm": 0.2744785969316298, - "learning_rate": 7.251912452382205e-05, - "loss": 0.3862513303756714, + "grad_norm": 0.271886252549519, + "learning_rate": 7.251912452382206e-06, + "loss": 0.4117184579372406, "num_tokens": 220510777.0, "step": 244 }, { "epoch": 1.828358208955224, - "grad_norm": 0.3191786669737911, - "learning_rate": 7.23022169877044e-05, - "loss": 0.3802780508995056, + "grad_norm": 0.3095984364408915, + "learning_rate": 7.2302216987704395e-06, + "loss": 0.40528762340545654, "num_tokens": 221358648.0, "step": 245 }, { "epoch": 1.835820895522388, - "grad_norm": 0.250350565874878, - "learning_rate": 7.20848366797835e-05, - "loss": 0.35144785046577454, + "grad_norm": 0.28537942146166506, + "learning_rate": 7.208483667978351e-06, + "loss": 0.37842410802841187, "num_tokens": 222227328.0, "step": 246 }, { "epoch": 1.8432835820895521, - "grad_norm": 0.3303217795126071, - "learning_rate": 7.186698953983467e-05, - "loss": 0.4225725531578064, + "grad_norm": 0.3285002711937223, + "learning_rate": 7.186698953983466e-06, + "loss": 0.4463423192501068, "num_tokens": 223216379.0, "step": 247 }, { "epoch": 1.8507462686567164, - "grad_norm": 0.29218892444941164, - "learning_rate": 7.164868152038898e-05, - "loss": 0.4028915762901306, + "grad_norm": 0.29900827070350944, + "learning_rate": 7.164868152038899e-06, + "loss": 0.42675986886024475, "num_tokens": 224109870.0, "step": 248 }, { "epoch": 1.8582089552238807, - "grad_norm": 0.31258287004390606, - "learning_rate": 7.14299185865708e-05, - "loss": 0.4056345522403717, + "grad_norm": 0.27490080435841, + "learning_rate": 7.1429918586570815e-06, + "loss": 0.4331856667995453, "num_tokens": 225101205.0, "step": 249 }, { "epoch": 1.8656716417910446, - "grad_norm": 0.2874654314669858, - "learning_rate": 7.121070671593477e-05, - "loss": 0.4106283187866211, + "grad_norm": 0.2935787072389711, + "learning_rate": 7.121070671593477e-06, + "loss": 0.4262286424636841, "num_tokens": 226119986.0, "step": 250 }, { "epoch": 1.873134328358209, - "grad_norm": 3.04810806206641, - "learning_rate": 7.099105189830236e-05, - "loss": 0.4010149836540222, + "grad_norm": 0.3045861994484339, + "learning_rate": 7.099105189830235e-06, + "loss": 0.4218306541442871, "num_tokens": 226995732.0, "step": 251 }, { "epoch": 1.8805970149253732, - "grad_norm": 0.4872339332207392, - "learning_rate": 7.077096013559831e-05, - "loss": 0.3999147415161133, + "grad_norm": 0.27595409032706397, + "learning_rate": 7.077096013559831e-06, + "loss": 0.4189199209213257, "num_tokens": 227872634.0, "step": 252 }, { "epoch": 1.8880597014925373, - "grad_norm": 0.30747273177766893, - "learning_rate": 7.055043744168657e-05, - "loss": 0.42227038741111755, + "grad_norm": 0.289326233334052, + "learning_rate": 7.055043744168658e-06, + "loss": 0.44568511843681335, "num_tokens": 228843256.0, "step": 253 }, { "epoch": 1.8955223880597014, - "grad_norm": 0.36857239850672174, - "learning_rate": 7.032948984220611e-05, - "loss": 0.3819285035133362, + "grad_norm": 0.3108178596802667, + "learning_rate": 7.032948984220611e-06, + "loss": 0.39977630972862244, "num_tokens": 229749232.0, "step": 254 }, { "epoch": 1.9029850746268657, - "grad_norm": 0.37039898242213654, - "learning_rate": 7.010812337440604e-05, - "loss": 0.39614227414131165, + "grad_norm": 0.3029945133044889, + "learning_rate": 7.0108123374406046e-06, + "loss": 0.41192835569381714, "num_tokens": 230524739.0, "step": 255 }, { "epoch": 1.9104477611940298, - "grad_norm": 0.3246787644219324, - "learning_rate": 6.988634408698082e-05, - "loss": 0.360940158367157, + "grad_norm": 0.25289759257512634, + "learning_rate": 6.988634408698083e-06, + "loss": 0.38565781712532043, "num_tokens": 231455850.0, "step": 256 }, { "epoch": 1.917910447761194, - "grad_norm": 0.3928833877613726, - "learning_rate": 6.9664158039905e-05, - "loss": 0.41075122356414795, + "grad_norm": 0.298108417839461, + "learning_rate": 6.966415803990501e-06, + "loss": 0.4397220015525818, "num_tokens": 232349234.0, "step": 257 }, { "epoch": 1.9253731343283582, - "grad_norm": 0.3484930032728619, - "learning_rate": 6.944157130426745e-05, - "loss": 0.4119340479373932, + "grad_norm": 0.30576254773905986, + "learning_rate": 6.944157130426745e-06, + "loss": 0.43654486536979675, "num_tokens": 233187315.0, "step": 258 }, { "epoch": 1.9328358208955225, - "grad_norm": 0.33925317332303434, - "learning_rate": 6.92185899621057e-05, - "loss": 0.37901782989501953, + "grad_norm": 0.28668295683966216, + "learning_rate": 6.9218589962105695e-06, + "loss": 0.40597644448280334, "num_tokens": 234091956.0, "step": 259 }, { "epoch": 1.9402985074626866, - "grad_norm": 0.31187860973538933, - "learning_rate": 6.899522010623959e-05, - "loss": 0.4058675765991211, + "grad_norm": 0.2807573548073224, + "learning_rate": 6.899522010623959e-06, + "loss": 0.42698317766189575, "num_tokens": 235133005.0, "step": 260 }, { "epoch": 1.9477611940298507, - "grad_norm": 0.3337092759689744, - "learning_rate": 6.877146784010487e-05, - "loss": 0.38064438104629517, + "grad_norm": 0.2676937710994811, + "learning_rate": 6.877146784010486e-06, + "loss": 0.4118936061859131, "num_tokens": 235967243.0, "step": 261 }, { "epoch": 1.955223880597015, - "grad_norm": 0.31041663486995874, - "learning_rate": 6.854733927758636e-05, - "loss": 0.39881548285484314, + "grad_norm": 0.29199333652094117, + "learning_rate": 6.854733927758636e-06, + "loss": 0.42816537618637085, "num_tokens": 236876001.0, "step": 262 }, { "epoch": 1.962686567164179, - "grad_norm": 0.32940247343160245, - "learning_rate": 6.832284054285101e-05, - "loss": 0.4105050563812256, + "grad_norm": 0.3572922506463511, + "learning_rate": 6.832284054285101e-06, + "loss": 0.43847325444221497, "num_tokens": 237876952.0, "step": 263 }, { "epoch": 1.9701492537313432, - "grad_norm": 0.32370232925335807, - "learning_rate": 6.809797777018041e-05, - "loss": 0.4055755138397217, + "grad_norm": 0.2960985809182997, + "learning_rate": 6.809797777018041e-06, + "loss": 0.43155139684677124, "num_tokens": 238704164.0, "step": 264 }, { "epoch": 1.9776119402985075, - "grad_norm": 0.2967458731510359, - "learning_rate": 6.787275710380329e-05, - "loss": 0.40312010049819946, + "grad_norm": 0.3169980642916318, + "learning_rate": 6.78727571038033e-06, + "loss": 0.4308193027973175, "num_tokens": 239595870.0, "step": 265 }, { "epoch": 1.9850746268656716, - "grad_norm": 0.2853107757324463, - "learning_rate": 6.764718469772758e-05, - "loss": 0.38676345348358154, + "grad_norm": 0.3191747061655072, + "learning_rate": 6.764718469772759e-06, + "loss": 0.4188956022262573, "num_tokens": 240337386.0, "step": 266 }, { "epoch": 1.9925373134328357, - "grad_norm": 0.2770557077906754, - "learning_rate": 6.742126671557228e-05, - "loss": 0.37234237790107727, + "grad_norm": 0.28286588606011187, + "learning_rate": 6.7421266715572275e-06, + "loss": 0.40036123991012573, "num_tokens": 241215348.0, "step": 267 }, { "epoch": 2.0, - "grad_norm": 0.3043774937022287, - "learning_rate": 6.719500933039898e-05, - "loss": 0.37937837839126587, + "grad_norm": 0.2981753233991589, + "learning_rate": 6.719500933039898e-06, + "loss": 0.41549932956695557, "num_tokens": 242121111.0, "step": 268 }, { "epoch": 2.0074626865671643, - "grad_norm": 0.5189620563039749, - "learning_rate": 6.696841872454332e-05, - "loss": 0.31283748149871826, + "grad_norm": 0.33640737374184443, + "learning_rate": 6.696841872454332e-06, + "loss": 0.4132290482521057, "num_tokens": 243025320.0, "step": 269 }, { "epoch": 2.014925373134328, - "grad_norm": 0.3756598719050017, - "learning_rate": 6.674150108944593e-05, - "loss": 0.28343072533607483, + "grad_norm": 0.2822051764181089, + "learning_rate": 6.674150108944593e-06, + "loss": 0.37781068682670593, "num_tokens": 243793916.0, "step": 270 }, { "epoch": 2.0223880597014925, - "grad_norm": 0.7951817278664628, - "learning_rate": 6.651426262548326e-05, - "loss": 0.304007887840271, + "grad_norm": 0.38987929902231017, + "learning_rate": 6.651426262548326e-06, + "loss": 0.40918004512786865, "num_tokens": 244799351.0, "step": 271 }, { "epoch": 2.029850746268657, - "grad_norm": 0.4285085251944697, - "learning_rate": 6.62867095417983e-05, - "loss": 0.2986816167831421, + "grad_norm": 0.348061447310908, + "learning_rate": 6.62867095417983e-06, + "loss": 0.3939589858055115, "num_tokens": 245795313.0, "step": 272 }, { "epoch": 2.0373134328358207, - "grad_norm": 0.32434142021468265, - "learning_rate": 6.605884805613073e-05, - "loss": 0.27925539016723633, + "grad_norm": 0.3046732710135438, + "learning_rate": 6.605884805613073e-06, + "loss": 0.36584192514419556, "num_tokens": 246732184.0, "step": 273 }, { "epoch": 2.044776119402985, - "grad_norm": 0.3682089928755959, - "learning_rate": 6.583068439464715e-05, - "loss": 0.3119613826274872, + "grad_norm": 0.3664198494618375, + "learning_rate": 6.583068439464716e-06, + "loss": 0.4081302881240845, "num_tokens": 247606091.0, "step": 274 }, { "epoch": 2.0522388059701493, - "grad_norm": 0.38930617250730126, - "learning_rate": 6.560222479177094e-05, - "loss": 0.3010408282279968, + "grad_norm": 0.3112614984470978, + "learning_rate": 6.560222479177095e-06, + "loss": 0.3947848081588745, "num_tokens": 248474307.0, "step": 275 }, { "epoch": 2.0597014925373136, - "grad_norm": 0.3190517327957313, - "learning_rate": 6.537347549001184e-05, - "loss": 0.3006717562675476, + "grad_norm": 0.3268123714386943, + "learning_rate": 6.537347549001184e-06, + "loss": 0.39627498388290405, "num_tokens": 249293743.0, "step": 276 }, { "epoch": 2.0671641791044775, - "grad_norm": 0.38743371161670787, - "learning_rate": 6.514444273979544e-05, - "loss": 0.30118152499198914, + "grad_norm": 0.30038025917744793, + "learning_rate": 6.514444273979544e-06, + "loss": 0.3961779773235321, "num_tokens": 250164041.0, "step": 277 }, { "epoch": 2.074626865671642, - "grad_norm": 0.3065135061353304, - "learning_rate": 6.491513279929238e-05, - "loss": 0.27479201555252075, + "grad_norm": 0.30941665860783496, + "learning_rate": 6.491513279929238e-06, + "loss": 0.3704898953437805, "num_tokens": 251063865.0, "step": 278 }, { "epoch": 2.082089552238806, - "grad_norm": 0.3402807901021368, - "learning_rate": 6.468555193424735e-05, - "loss": 0.2858580946922302, + "grad_norm": 0.2822311579038674, + "learning_rate": 6.468555193424736e-06, + "loss": 0.3888505697250366, "num_tokens": 251954121.0, "step": 279 }, { "epoch": 2.08955223880597, - "grad_norm": 0.29650159883256816, - "learning_rate": 6.445570641780785e-05, - "loss": 0.2734259366989136, + "grad_norm": 0.2838966427637005, + "learning_rate": 6.445570641780786e-06, + "loss": 0.3732953667640686, "num_tokens": 252767775.0, "step": 280 }, { "epoch": 2.0970149253731343, - "grad_norm": 0.365960849536292, - "learning_rate": 6.422560253035287e-05, - "loss": 0.30103248357772827, + "grad_norm": 0.30198287700287857, + "learning_rate": 6.422560253035287e-06, + "loss": 0.3989664614200592, "num_tokens": 253671573.0, "step": 281 }, { "epoch": 2.1044776119402986, - "grad_norm": 0.32252087293233694, - "learning_rate": 6.399524655932111e-05, - "loss": 0.2971936762332916, + "grad_norm": 0.3143195160978541, + "learning_rate": 6.399524655932111e-06, + "loss": 0.4071004390716553, "num_tokens": 254540226.0, "step": 282 }, { "epoch": 2.111940298507463, - "grad_norm": 0.3345472822539113, - "learning_rate": 6.376464479903938e-05, - "loss": 0.26327937841415405, + "grad_norm": 0.29633039155095714, + "learning_rate": 6.376464479903938e-06, + "loss": 0.3590371012687683, "num_tokens": 255292355.0, "step": 283 }, { "epoch": 2.1194029850746268, - "grad_norm": 0.36256733909977773, - "learning_rate": 6.353380355055051e-05, - "loss": 0.28485092520713806, + "grad_norm": 0.2746728490799242, + "learning_rate": 6.353380355055051e-06, + "loss": 0.38884416222572327, "num_tokens": 256176530.0, "step": 284 }, { "epoch": 2.126865671641791, - "grad_norm": 0.35138096145006326, - "learning_rate": 6.330272912144116e-05, - "loss": 0.31443923711776733, + "grad_norm": 0.2951568696719758, + "learning_rate": 6.330272912144116e-06, + "loss": 0.42871013283729553, "num_tokens": 257090645.0, "step": 285 }, { "epoch": 2.1343283582089554, - "grad_norm": 0.3632097571150449, - "learning_rate": 6.307142782566951e-05, - "loss": 0.29116642475128174, + "grad_norm": 0.2902093873074645, + "learning_rate": 6.307142782566952e-06, + "loss": 0.3986203670501709, "num_tokens": 258131119.0, "step": 286 }, { "epoch": 2.1417910447761193, - "grad_norm": 0.33365929847004017, - "learning_rate": 6.283990598339274e-05, - "loss": 0.2801674008369446, + "grad_norm": 0.3900114303550773, + "learning_rate": 6.283990598339274e-06, + "loss": 0.390123188495636, "num_tokens": 258880552.0, "step": 287 }, { "epoch": 2.1492537313432836, - "grad_norm": 0.3309848941885377, - "learning_rate": 6.260816992079431e-05, - "loss": 0.270031601190567, + "grad_norm": 0.2806374479908933, + "learning_rate": 6.2608169920794314e-06, + "loss": 0.36130136251449585, "num_tokens": 259758999.0, "step": 288 }, { "epoch": 2.156716417910448, - "grad_norm": 0.28684743215164366, - "learning_rate": 6.237622596991106e-05, - "loss": 0.295134574174881, + "grad_norm": 0.2942927245657638, + "learning_rate": 6.237622596991106e-06, + "loss": 0.40030941367149353, "num_tokens": 260602559.0, "step": 289 }, { "epoch": 2.1641791044776117, - "grad_norm": 0.3883473542011954, - "learning_rate": 6.214408046846034e-05, - "loss": 0.29493197798728943, + "grad_norm": 0.3214957885578966, + "learning_rate": 6.214408046846034e-06, + "loss": 0.39499810338020325, "num_tokens": 261439646.0, "step": 290 }, { "epoch": 2.171641791044776, - "grad_norm": 0.2957992461202963, - "learning_rate": 6.191173975966669e-05, - "loss": 0.2839927673339844, + "grad_norm": 0.27240683635483437, + "learning_rate": 6.191173975966669e-06, + "loss": 0.3880019783973694, "num_tokens": 262474020.0, "step": 291 }, { "epoch": 2.1791044776119404, - "grad_norm": 0.346980078432492, - "learning_rate": 6.167921019208851e-05, - "loss": 0.29989829659461975, + "grad_norm": 0.34023027676143563, + "learning_rate": 6.167921019208851e-06, + "loss": 0.42268985509872437, "num_tokens": 263528820.0, "step": 292 }, { "epoch": 2.1865671641791047, - "grad_norm": 0.3337390269180698, - "learning_rate": 6.144649811944473e-05, - "loss": 0.2857828736305237, + "grad_norm": 0.287848829860692, + "learning_rate": 6.144649811944474e-06, + "loss": 0.3913387656211853, "num_tokens": 264372315.0, "step": 293 }, { "epoch": 2.1940298507462686, - "grad_norm": 0.41367859573349175, - "learning_rate": 6.121360990044108e-05, - "loss": 0.30010879039764404, + "grad_norm": 0.29220713499868917, + "learning_rate": 6.121360990044107e-06, + "loss": 0.40157270431518555, "num_tokens": 265188957.0, "step": 294 }, { "epoch": 2.201492537313433, - "grad_norm": 0.3349518076090135, - "learning_rate": 6.098055189859634e-05, - "loss": 0.2934808135032654, + "grad_norm": 0.286455151799939, + "learning_rate": 6.098055189859634e-06, + "loss": 0.3945062756538391, "num_tokens": 266184697.0, "step": 295 }, { "epoch": 2.208955223880597, - "grad_norm": 0.3046794410316698, - "learning_rate": 6.074733048206852e-05, - "loss": 0.3011645972728729, + "grad_norm": 0.289286738435993, + "learning_rate": 6.074733048206852e-06, + "loss": 0.3945891559123993, "num_tokens": 267190971.0, "step": 296 }, { "epoch": 2.216417910447761, - "grad_norm": 0.29027277539441865, - "learning_rate": 6.051395202348089e-05, - "loss": 0.29139044880867004, + "grad_norm": 0.27448176767847715, + "learning_rate": 6.051395202348089e-06, + "loss": 0.3953642249107361, "num_tokens": 268121281.0, "step": 297 }, { "epoch": 2.2238805970149254, - "grad_norm": 0.3164910594866622, - "learning_rate": 6.028042289974768e-05, - "loss": 0.28695452213287354, + "grad_norm": 0.297149102735408, + "learning_rate": 6.028042289974768e-06, + "loss": 0.3815913796424866, "num_tokens": 269026334.0, "step": 298 }, { "epoch": 2.2313432835820897, - "grad_norm": 0.2941793928933697, - "learning_rate": 6.0046749491900035e-05, - "loss": 0.2720807194709778, + "grad_norm": 0.29135459719595014, + "learning_rate": 6.004674949190004e-06, + "loss": 0.3744094967842102, "num_tokens": 269848673.0, "step": 299 }, { "epoch": 2.2388059701492535, - "grad_norm": 0.2924275734250093, - "learning_rate": 5.981293818491153e-05, - "loss": 0.29901331663131714, + "grad_norm": 0.3163386130777747, + "learning_rate": 5.981293818491153e-06, + "loss": 0.411973237991333, "num_tokens": 270729219.0, "step": 300 }, { "epoch": 2.246268656716418, - "grad_norm": 0.32682138311647513, - "learning_rate": 5.9578995367523726e-05, - "loss": 0.3086387515068054, + "grad_norm": 0.2996160649578529, + "learning_rate": 5.957899536752373e-06, + "loss": 0.4180707335472107, "num_tokens": 271647605.0, "step": 301 }, { "epoch": 2.253731343283582, - "grad_norm": 0.30568173595564085, - "learning_rate": 5.9344927432071685e-05, - "loss": 0.2727169394493103, + "grad_norm": 0.2744717376139136, + "learning_rate": 5.934492743207168e-06, + "loss": 0.36764925718307495, "num_tokens": 272444857.0, "step": 302 }, { "epoch": 2.2611940298507465, - "grad_norm": 0.3032747787421, - "learning_rate": 5.911074077430917e-05, - "loss": 0.2883470058441162, + "grad_norm": 0.3051287913390687, + "learning_rate": 5.911074077430917e-06, + "loss": 0.3950934410095215, "num_tokens": 273313831.0, "step": 303 }, { "epoch": 2.2686567164179103, - "grad_norm": 0.2800903879341409, - "learning_rate": 5.8876441793234025e-05, - "loss": 0.27465224266052246, + "grad_norm": 0.2740805047822694, + "learning_rate": 5.887644179323403e-06, + "loss": 0.38602137565612793, "num_tokens": 274151817.0, "step": 304 }, { "epoch": 2.2761194029850746, - "grad_norm": 0.3001171467532811, - "learning_rate": 5.8642036890913154e-05, - "loss": 0.2994191646575928, + "grad_norm": 0.2811027592780593, + "learning_rate": 5.864203689091316e-06, + "loss": 0.40490180253982544, "num_tokens": 275023603.0, "step": 305 }, { "epoch": 2.283582089552239, - "grad_norm": 0.29388999935102494, - "learning_rate": 5.8407532472307815e-05, - "loss": 0.2954255938529968, + "grad_norm": 0.37103511230501807, + "learning_rate": 5.840753247230781e-06, + "loss": 0.39756178855895996, "num_tokens": 275922951.0, "step": 306 }, { "epoch": 2.291044776119403, - "grad_norm": 0.29009038408879717, - "learning_rate": 5.8172934945098356e-05, - "loss": 0.26768869161605835, + "grad_norm": 0.260165834106451, + "learning_rate": 5.817293494509836e-06, + "loss": 0.3657914996147156, "num_tokens": 276733073.0, "step": 307 }, { "epoch": 2.298507462686567, - "grad_norm": 0.279721051678857, - "learning_rate": 5.7938250719509356e-05, - "loss": 0.2858269512653351, + "grad_norm": 0.2676322746611296, + "learning_rate": 5.793825071950936e-06, + "loss": 0.3826783299446106, "num_tokens": 277699551.0, "step": 308 }, { "epoch": 2.3059701492537314, - "grad_norm": 0.2798681741569151, - "learning_rate": 5.770348620813433e-05, - "loss": 0.27539974451065063, + "grad_norm": 0.3171630796152734, + "learning_rate": 5.770348620813433e-06, + "loss": 0.38245660066604614, "num_tokens": 278695133.0, "step": 309 }, { "epoch": 2.3134328358208958, - "grad_norm": 0.31200492809883906, - "learning_rate": 5.746864782576054e-05, - "loss": 0.28673839569091797, + "grad_norm": 0.2749216503608562, + "learning_rate": 5.746864782576054e-06, + "loss": 0.38771188259124756, "num_tokens": 279483451.0, "step": 310 }, { "epoch": 2.3208955223880596, - "grad_norm": 0.4069363575723954, - "learning_rate": 5.723374198919376e-05, - "loss": 0.28754723072052, + "grad_norm": 0.34619757766961257, + "learning_rate": 5.723374198919376e-06, + "loss": 0.40358829498291016, "num_tokens": 280316518.0, "step": 311 }, { "epoch": 2.328358208955224, - "grad_norm": 0.33217539398911666, - "learning_rate": 5.699877511708285e-05, - "loss": 0.2837458848953247, + "grad_norm": 0.2628421365077709, + "learning_rate": 5.699877511708285e-06, + "loss": 0.37161552906036377, "num_tokens": 281300113.0, "step": 312 }, { "epoch": 2.3358208955223883, - "grad_norm": 0.26860853798159157, - "learning_rate": 5.67637536297445e-05, - "loss": 0.28143924474716187, + "grad_norm": 0.2865924626367908, + "learning_rate": 5.67637536297445e-06, + "loss": 0.3707822561264038, "num_tokens": 282213553.0, "step": 313 }, { "epoch": 2.343283582089552, - "grad_norm": 0.30196368267077084, - "learning_rate": 5.652868394898766e-05, - "loss": 0.289265513420105, + "grad_norm": 0.2782360921000711, + "learning_rate": 5.652868394898766e-06, + "loss": 0.38021302223205566, "num_tokens": 283069634.0, "step": 314 }, { "epoch": 2.3507462686567164, - "grad_norm": 0.3161512050074206, - "learning_rate": 5.629357249793816e-05, - "loss": 0.2994301915168762, + "grad_norm": 0.274968159536365, + "learning_rate": 5.6293572497938165e-06, + "loss": 0.4070481061935425, "num_tokens": 284055909.0, "step": 315 }, { "epoch": 2.3582089552238807, - "grad_norm": 0.29483765485140945, - "learning_rate": 5.60584257008632e-05, - "loss": 0.29892903566360474, + "grad_norm": 0.25137582516547385, + "learning_rate": 5.605842570086321e-06, + "loss": 0.38819169998168945, "num_tokens": 285072190.0, "step": 316 }, { "epoch": 2.3656716417910446, - "grad_norm": 0.28587485226161186, - "learning_rate": 5.582324998299573e-05, - "loss": 0.29343095421791077, + "grad_norm": 0.27416935469654424, + "learning_rate": 5.582324998299573e-06, + "loss": 0.3976019620895386, "num_tokens": 285997942.0, "step": 317 }, { "epoch": 2.373134328358209, - "grad_norm": 0.28465625884439294, - "learning_rate": 5.558805177035902e-05, - "loss": 0.2936056852340698, + "grad_norm": 0.28976153755834105, + "learning_rate": 5.558805177035902e-06, + "loss": 0.39910900592803955, "num_tokens": 286957228.0, "step": 318 }, { "epoch": 2.3805970149253732, - "grad_norm": 0.27302674365853485, - "learning_rate": 5.53528374895909e-05, - "loss": 0.2934105694293976, + "grad_norm": 0.3526174425898886, + "learning_rate": 5.53528374895909e-06, + "loss": 0.37735995650291443, "num_tokens": 287834123.0, "step": 319 }, { "epoch": 2.388059701492537, - "grad_norm": 0.2937846955147667, - "learning_rate": 5.511761356776833e-05, - "loss": 0.3050351142883301, + "grad_norm": 0.2753135236966283, + "learning_rate": 5.511761356776834e-06, + "loss": 0.3974205553531647, "num_tokens": 288755581.0, "step": 320 }, { "epoch": 2.3955223880597014, - "grad_norm": 0.3082335026079815, - "learning_rate": 5.488238643223167e-05, - "loss": 0.3008425533771515, + "grad_norm": 0.2836500955971764, + "learning_rate": 5.488238643223167e-06, + "loss": 0.4040617346763611, "num_tokens": 289616887.0, "step": 321 }, { "epoch": 2.4029850746268657, - "grad_norm": 0.3337100112980536, - "learning_rate": 5.464716251040911e-05, - "loss": 0.2965007722377777, + "grad_norm": 0.3001483066578534, + "learning_rate": 5.464716251040911e-06, + "loss": 0.39118584990501404, "num_tokens": 290466034.0, "step": 322 }, { "epoch": 2.41044776119403, - "grad_norm": 0.29134047608711217, - "learning_rate": 5.4411948229641e-05, - "loss": 0.3054417371749878, + "grad_norm": 0.29609458212755346, + "learning_rate": 5.4411948229641e-06, + "loss": 0.4012300372123718, "num_tokens": 291327531.0, "step": 323 }, { "epoch": 2.417910447761194, - "grad_norm": 0.28372662232596085, - "learning_rate": 5.417675001700428e-05, - "loss": 0.28944918513298035, + "grad_norm": 0.282307409973888, + "learning_rate": 5.417675001700428e-06, + "loss": 0.39297211170196533, "num_tokens": 292249211.0, "step": 324 }, { "epoch": 2.425373134328358, - "grad_norm": 0.32814274778818836, - "learning_rate": 5.394157429913681e-05, - "loss": 0.32307708263397217, + "grad_norm": 0.31947796875203593, + "learning_rate": 5.394157429913681e-06, + "loss": 0.43389707803726196, "num_tokens": 293154262.0, "step": 325 }, { "epoch": 2.4328358208955225, - "grad_norm": 0.30046974134928794, - "learning_rate": 5.370642750206184e-05, - "loss": 0.30698198080062866, + "grad_norm": 0.2806921837500959, + "learning_rate": 5.370642750206184e-06, + "loss": 0.4193563461303711, "num_tokens": 294190925.0, "step": 326 }, { "epoch": 2.4402985074626864, - "grad_norm": 0.3317996530347103, - "learning_rate": 5.3471316051012364e-05, - "loss": 0.31206008791923523, + "grad_norm": 0.28217215862589007, + "learning_rate": 5.347131605101237e-06, + "loss": 0.42073380947113037, "num_tokens": 295155201.0, "step": 327 }, { "epoch": 2.4477611940298507, - "grad_norm": 0.3055787118754774, - "learning_rate": 5.323624637025552e-05, - "loss": 0.2846985161304474, + "grad_norm": 0.2595127351145338, + "learning_rate": 5.323624637025552e-06, + "loss": 0.38413190841674805, "num_tokens": 296039941.0, "step": 328 }, { "epoch": 2.455223880597015, - "grad_norm": 0.3387028331539513, - "learning_rate": 5.300122488291717e-05, - "loss": 0.28388965129852295, + "grad_norm": 0.27537880701127315, + "learning_rate": 5.300122488291717e-06, + "loss": 0.3896210193634033, "num_tokens": 296897125.0, "step": 329 }, { "epoch": 2.4626865671641793, - "grad_norm": 0.28963764058204255, - "learning_rate": 5.276625801080626e-05, - "loss": 0.3084145784378052, + "grad_norm": 0.2806456708250513, + "learning_rate": 5.276625801080626e-06, + "loss": 0.40547412633895874, "num_tokens": 297829206.0, "step": 330 }, { "epoch": 2.470149253731343, - "grad_norm": 0.29709058722466575, - "learning_rate": 5.253135217423948e-05, - "loss": 0.30127590894699097, + "grad_norm": 0.3233513262930407, + "learning_rate": 5.253135217423948e-06, + "loss": 0.3998452425003052, "num_tokens": 298813976.0, "step": 331 }, { "epoch": 2.4776119402985075, - "grad_norm": 0.2893854367052054, - "learning_rate": 5.229651379186568e-05, - "loss": 0.30449870228767395, + "grad_norm": 0.2870679405386201, + "learning_rate": 5.229651379186569e-06, + "loss": 0.41445013880729675, "num_tokens": 299755392.0, "step": 332 }, { "epoch": 2.485074626865672, - "grad_norm": 0.3234612092273635, - "learning_rate": 5.2061749280490655e-05, - "loss": 0.28941231966018677, + "grad_norm": 0.2623639243435129, + "learning_rate": 5.206174928049066e-06, + "loss": 0.3996489644050598, "num_tokens": 300745461.0, "step": 333 }, { "epoch": 2.4925373134328357, - "grad_norm": 0.3183066644776224, - "learning_rate": 5.1827065054901655e-05, - "loss": 0.2948898673057556, + "grad_norm": 0.2657883700801823, + "learning_rate": 5.182706505490166e-06, + "loss": 0.3919597864151001, "num_tokens": 301635785.0, "step": 334 }, { "epoch": 2.5, - "grad_norm": 0.31999878845891067, - "learning_rate": 5.15924675276922e-05, - "loss": 0.2840469479560852, + "grad_norm": 0.2730887704012263, + "learning_rate": 5.15924675276922e-06, + "loss": 0.37381941080093384, "num_tokens": 302529314.0, "step": 335 }, { "epoch": 2.5074626865671643, - "grad_norm": 0.29571538652801577, - "learning_rate": 5.135796310908685e-05, - "loss": 0.2923676371574402, + "grad_norm": 0.27926647905507407, + "learning_rate": 5.135796310908685e-06, + "loss": 0.4020169675350189, "num_tokens": 303325140.0, "step": 336 }, { "epoch": 2.5149253731343286, - "grad_norm": 0.2977832012728553, - "learning_rate": 5.1123558206766e-05, - "loss": 0.29460111260414124, + "grad_norm": 0.2573449307599577, + "learning_rate": 5.1123558206766e-06, + "loss": 0.3959069848060608, "num_tokens": 304291697.0, "step": 337 }, { "epoch": 2.5223880597014925, - "grad_norm": 0.28849092801766835, - "learning_rate": 5.088925922569083e-05, - "loss": 0.30242517590522766, + "grad_norm": 0.2713627052957801, + "learning_rate": 5.088925922569084e-06, + "loss": 0.4036637246608734, "num_tokens": 305167326.0, "step": 338 }, { "epoch": 2.529850746268657, - "grad_norm": 0.38563431794432057, - "learning_rate": 5.065507256792833e-05, - "loss": 0.34651827812194824, + "grad_norm": 0.29137688284390684, + "learning_rate": 5.065507256792833e-06, + "loss": 0.40749210119247437, "num_tokens": 306083413.0, "step": 339 }, { "epoch": 2.5373134328358207, - "grad_norm": 0.28016153774131963, - "learning_rate": 5.04210046324763e-05, - "loss": 0.30077484250068665, + "grad_norm": 0.27645786153124524, + "learning_rate": 5.04210046324763e-06, + "loss": 0.3960036039352417, "num_tokens": 306930925.0, "step": 340 }, { "epoch": 2.544776119402985, - "grad_norm": 0.29398604257120425, - "learning_rate": 5.018706181508851e-05, - "loss": 0.2981261610984802, + "grad_norm": 0.2959257579408876, + "learning_rate": 5.018706181508851e-06, + "loss": 0.40943804383277893, "num_tokens": 307667223.0, "step": 341 }, { "epoch": 2.5522388059701493, - "grad_norm": 0.3388867575808703, - "learning_rate": 4.995325050809999e-05, - "loss": 0.3186606764793396, + "grad_norm": 0.2941768147406628, + "learning_rate": 4.995325050809999e-06, + "loss": 0.42352843284606934, "num_tokens": 308548843.0, "step": 342 }, { "epoch": 2.5597014925373136, - "grad_norm": 0.298212819329933, - "learning_rate": 4.971957710025235e-05, - "loss": 0.3071972727775574, + "grad_norm": 0.3093404075043933, + "learning_rate": 4.971957710025235e-06, + "loss": 0.4167254567146301, "num_tokens": 309456869.0, "step": 343 }, { "epoch": 2.5671641791044775, - "grad_norm": 0.2912705230636927, - "learning_rate": 4.9486047976519134e-05, - "loss": 0.3108614385128021, + "grad_norm": 0.285830294036988, + "learning_rate": 4.948604797651914e-06, + "loss": 0.41970574855804443, "num_tokens": 310374426.0, "step": 344 }, { "epoch": 2.574626865671642, - "grad_norm": 0.28329377416107365, - "learning_rate": 4.9252669517931495e-05, - "loss": 0.2985752820968628, + "grad_norm": 0.2822303940696211, + "learning_rate": 4.925266951793149e-06, + "loss": 0.39743444323539734, "num_tokens": 311185331.0, "step": 345 }, { "epoch": 2.582089552238806, - "grad_norm": 0.3073811124508015, - "learning_rate": 4.90194481014037e-05, - "loss": 0.3060144782066345, + "grad_norm": 0.2722209732746419, + "learning_rate": 4.90194481014037e-06, + "loss": 0.4093334674835205, "num_tokens": 312287344.0, "step": 346 }, { "epoch": 2.58955223880597, - "grad_norm": 0.27167822402306446, - "learning_rate": 4.8786390099558955e-05, - "loss": 0.2966720759868622, + "grad_norm": 0.3685744506907742, + "learning_rate": 4.878639009955896e-06, + "loss": 0.3837957978248596, "num_tokens": 313203808.0, "step": 347 }, { "epoch": 2.5970149253731343, - "grad_norm": 0.2785871229017667, - "learning_rate": 4.8553501880555284e-05, - "loss": 0.2806853652000427, + "grad_norm": 0.26210814461472964, + "learning_rate": 4.855350188055528e-06, + "loss": 0.374228835105896, "num_tokens": 314127724.0, "step": 348 }, { "epoch": 2.6044776119402986, - "grad_norm": 0.2802725958199747, - "learning_rate": 4.832078980791151e-05, - "loss": 0.2847418785095215, + "grad_norm": 0.26577422679986124, + "learning_rate": 4.83207898079115e-06, + "loss": 0.3950842022895813, "num_tokens": 315094649.0, "step": 349 }, { "epoch": 2.611940298507463, - "grad_norm": 0.3170073910723373, - "learning_rate": 4.8088260240333336e-05, - "loss": 0.2858068645000458, + "grad_norm": 0.2694330124125045, + "learning_rate": 4.808826024033334e-06, + "loss": 0.3894980251789093, "num_tokens": 315902867.0, "step": 350 }, { "epoch": 2.6194029850746268, - "grad_norm": 0.3142814639951889, - "learning_rate": 4.785591953153966e-05, - "loss": 0.29544034600257874, + "grad_norm": 0.30012143917049156, + "learning_rate": 4.785591953153966e-06, + "loss": 0.3923467695713043, "num_tokens": 316809248.0, "step": 351 }, { "epoch": 2.626865671641791, - "grad_norm": 0.29274017471773517, - "learning_rate": 4.762377403008895e-05, - "loss": 0.29404592514038086, + "grad_norm": 0.27202743774586025, + "learning_rate": 4.762377403008895e-06, + "loss": 0.40671366453170776, "num_tokens": 317806785.0, "step": 352 }, { "epoch": 2.6343283582089554, - "grad_norm": 0.28562293817861956, - "learning_rate": 4.739183007920571e-05, - "loss": 0.28895992040634155, + "grad_norm": 0.2663498979647159, + "learning_rate": 4.739183007920572e-06, + "loss": 0.40148887038230896, "num_tokens": 318773135.0, "step": 353 }, { "epoch": 2.6417910447761193, - "grad_norm": 0.26346655385684103, - "learning_rate": 4.716009401660728e-05, - "loss": 0.2693272531032562, + "grad_norm": 0.26964724456667694, + "learning_rate": 4.716009401660728e-06, + "loss": 0.36810237169265747, "num_tokens": 319712540.0, "step": 354 }, { "epoch": 2.6492537313432836, - "grad_norm": 0.2977115917126976, - "learning_rate": 4.69285721743305e-05, - "loss": 0.285382479429245, + "grad_norm": 0.2745583940218022, + "learning_rate": 4.69285721743305e-06, + "loss": 0.3969258666038513, "num_tokens": 320623524.0, "step": 355 }, { "epoch": 2.656716417910448, - "grad_norm": 0.3082450013402585, - "learning_rate": 4.6697270878558865e-05, - "loss": 0.3006575107574463, + "grad_norm": 0.2691069702675602, + "learning_rate": 4.669727087855886e-06, + "loss": 0.39531204104423523, "num_tokens": 321558026.0, "step": 356 }, { "epoch": 2.664179104477612, - "grad_norm": 0.2666551687578358, - "learning_rate": 4.646619644944951e-05, - "loss": 0.2760135233402252, + "grad_norm": 0.2790488198361277, + "learning_rate": 4.646619644944951e-06, + "loss": 0.3691323399543762, "num_tokens": 322457137.0, "step": 357 }, { "epoch": 2.671641791044776, - "grad_norm": 0.2754231908973977, - "learning_rate": 4.623535520096063e-05, - "loss": 0.280956894159317, + "grad_norm": 0.25676092193729705, + "learning_rate": 4.623535520096063e-06, + "loss": 0.3830498456954956, "num_tokens": 323406835.0, "step": 358 }, { "epoch": 2.6791044776119404, - "grad_norm": 0.2717468008146951, - "learning_rate": 4.6004753440678894e-05, - "loss": 0.29374197125434875, + "grad_norm": 0.27765790893840286, + "learning_rate": 4.6004753440678894e-06, + "loss": 0.38582926988601685, "num_tokens": 324270762.0, "step": 359 }, { "epoch": 2.6865671641791042, - "grad_norm": 0.26590156475062093, - "learning_rate": 4.577439746964715e-05, - "loss": 0.29904693365097046, + "grad_norm": 0.2578194748970744, + "learning_rate": 4.577439746964715e-06, + "loss": 0.39646175503730774, "num_tokens": 325172716.0, "step": 360 }, { "epoch": 2.6940298507462686, - "grad_norm": 0.2504736725047571, - "learning_rate": 4.554429358219213e-05, - "loss": 0.2904181480407715, + "grad_norm": 0.26611474982215905, + "learning_rate": 4.554429358219214e-06, + "loss": 0.38044852018356323, "num_tokens": 326161663.0, "step": 361 }, { "epoch": 2.701492537313433, - "grad_norm": 0.27321456065695293, - "learning_rate": 4.531444806575266e-05, - "loss": 0.29827404022216797, + "grad_norm": 0.2670566328628317, + "learning_rate": 4.531444806575266e-06, + "loss": 0.40564393997192383, "num_tokens": 327106201.0, "step": 362 }, { "epoch": 2.708955223880597, - "grad_norm": 0.29105282865769083, - "learning_rate": 4.508486720070762e-05, - "loss": 0.3007173240184784, + "grad_norm": 0.274772662861299, + "learning_rate": 4.508486720070761e-06, + "loss": 0.39564812183380127, "num_tokens": 328050673.0, "step": 363 }, { "epoch": 2.716417910447761, - "grad_norm": 0.2692099238841509, - "learning_rate": 4.485555726020455e-05, - "loss": 0.28588560223579407, + "grad_norm": 0.3094439511198801, + "learning_rate": 4.485555726020455e-06, + "loss": 0.3800423741340637, "num_tokens": 328859100.0, "step": 364 }, { "epoch": 2.7238805970149254, - "grad_norm": 0.28004787549353405, - "learning_rate": 4.462652450998815e-05, - "loss": 0.3075996935367584, + "grad_norm": 0.2875993414193674, + "learning_rate": 4.462652450998816e-06, + "loss": 0.4001840353012085, "num_tokens": 329666962.0, "step": 365 }, { "epoch": 2.7313432835820897, - "grad_norm": 0.2844646197206539, - "learning_rate": 4.439777520822905e-05, - "loss": 0.2913839817047119, + "grad_norm": 0.27308262203119327, + "learning_rate": 4.439777520822905e-06, + "loss": 0.39083579182624817, "num_tokens": 330477732.0, "step": 366 }, { "epoch": 2.7388059701492535, - "grad_norm": 0.2861885883974154, - "learning_rate": 4.416931560535284e-05, - "loss": 0.28724342584609985, + "grad_norm": 0.2708315720399402, + "learning_rate": 4.416931560535284e-06, + "loss": 0.39352381229400635, "num_tokens": 331330359.0, "step": 367 }, { "epoch": 2.746268656716418, - "grad_norm": 0.2888905886113955, - "learning_rate": 4.3941151943869275e-05, - "loss": 0.28904759883880615, + "grad_norm": 0.2678850422820554, + "learning_rate": 4.394115194386928e-06, + "loss": 0.38045477867126465, "num_tokens": 332347647.0, "step": 368 }, { "epoch": 2.753731343283582, - "grad_norm": 0.26424166198705124, - "learning_rate": 4.3713290458201714e-05, - "loss": 0.2919449806213379, + "grad_norm": 0.2753212357157175, + "learning_rate": 4.371329045820172e-06, + "loss": 0.3969570994377136, "num_tokens": 333284873.0, "step": 369 }, { "epoch": 2.7611940298507465, - "grad_norm": 0.28376300844826147, - "learning_rate": 4.3485737374516747e-05, - "loss": 0.30219364166259766, + "grad_norm": 0.28683339254512785, + "learning_rate": 4.3485737374516745e-06, + "loss": 0.4235033392906189, "num_tokens": 334098107.0, "step": 370 }, { "epoch": 2.7686567164179103, - "grad_norm": 0.2722134002236396, - "learning_rate": 4.325849891055409e-05, - "loss": 0.2913355827331543, + "grad_norm": 0.2698726522878529, + "learning_rate": 4.3258498910554095e-06, + "loss": 0.38629546761512756, "num_tokens": 334979408.0, "step": 371 }, { "epoch": 2.7761194029850746, - "grad_norm": 0.25620790331466675, - "learning_rate": 4.303158127545669e-05, - "loss": 0.28720664978027344, + "grad_norm": 0.2615554761622241, + "learning_rate": 4.303158127545669e-06, + "loss": 0.3924221694469452, "num_tokens": 335891381.0, "step": 372 }, { "epoch": 2.783582089552239, - "grad_norm": 0.2666880634783621, - "learning_rate": 4.280499066960102e-05, - "loss": 0.28534752130508423, + "grad_norm": 0.26064429917011145, + "learning_rate": 4.280499066960102e-06, + "loss": 0.3906182050704956, "num_tokens": 336949128.0, "step": 373 }, { "epoch": 2.791044776119403, - "grad_norm": 0.2649638446326527, - "learning_rate": 4.257873328442774e-05, - "loss": 0.28511884808540344, + "grad_norm": 0.27127505364411514, + "learning_rate": 4.257873328442774e-06, + "loss": 0.3783274292945862, "num_tokens": 337776659.0, "step": 374 }, { "epoch": 2.798507462686567, - "grad_norm": 0.25217823765831393, - "learning_rate": 4.235281530227242e-05, - "loss": 0.29397153854370117, + "grad_norm": 0.27410164043945023, + "learning_rate": 4.2352815302272425e-06, + "loss": 0.3829938471317291, "num_tokens": 338685204.0, "step": 375 }, { "epoch": 2.8059701492537314, - "grad_norm": 0.2666138268600747, - "learning_rate": 4.212724289619672e-05, - "loss": 0.2805843651294708, + "grad_norm": 0.2706332327188829, + "learning_rate": 4.212724289619672e-06, + "loss": 0.37140512466430664, "num_tokens": 339492119.0, "step": 376 }, { "epoch": 2.8134328358208958, - "grad_norm": 0.2783792603374849, - "learning_rate": 4.1902022229819595e-05, - "loss": 0.3070914149284363, + "grad_norm": 0.29552966231342986, + "learning_rate": 4.190202222981959e-06, + "loss": 0.41518405079841614, "num_tokens": 340414044.0, "step": 377 }, { "epoch": 2.8208955223880596, - "grad_norm": 0.25915987993422046, - "learning_rate": 4.1677159457149004e-05, - "loss": 0.2714030146598816, + "grad_norm": 0.4384124363415056, + "learning_rate": 4.1677159457149005e-06, + "loss": 0.3670823574066162, "num_tokens": 341275739.0, "step": 378 }, { "epoch": 2.828358208955224, - "grad_norm": 0.2778044002766085, - "learning_rate": 4.145266072241365e-05, - "loss": 0.2845655381679535, + "grad_norm": 0.2818008385366561, + "learning_rate": 4.145266072241365e-06, + "loss": 0.38579511642456055, "num_tokens": 342203284.0, "step": 379 }, { "epoch": 2.835820895522388, - "grad_norm": 0.27642645731371285, - "learning_rate": 4.122853215989515e-05, - "loss": 0.3030461072921753, + "grad_norm": 0.26814078006971265, + "learning_rate": 4.122853215989515e-06, + "loss": 0.4062846899032593, "num_tokens": 343206534.0, "step": 380 }, { "epoch": 2.843283582089552, - "grad_norm": 0.27576660900927236, - "learning_rate": 4.100477989376043e-05, - "loss": 0.2963576912879944, + "grad_norm": 0.27452179515826414, + "learning_rate": 4.1004779893760424e-06, + "loss": 0.397432416677475, "num_tokens": 344154341.0, "step": 381 }, { "epoch": 2.8507462686567164, - "grad_norm": 0.27836419120198885, - "learning_rate": 4.0781410037894305e-05, - "loss": 0.2856680750846863, + "grad_norm": 0.27288188181425943, + "learning_rate": 4.078141003789431e-06, + "loss": 0.391731858253479, "num_tokens": 345024971.0, "step": 382 }, { "epoch": 2.8582089552238807, - "grad_norm": 0.28443550108381266, - "learning_rate": 4.0558428695732566e-05, - "loss": 0.302541583776474, + "grad_norm": 0.2967872715212152, + "learning_rate": 4.055842869573256e-06, + "loss": 0.400160551071167, "num_tokens": 345812228.0, "step": 383 }, { "epoch": 2.8656716417910446, - "grad_norm": 0.28925469814468263, - "learning_rate": 4.033584196009502e-05, - "loss": 0.2979735732078552, + "grad_norm": 0.27985989099065167, + "learning_rate": 4.0335841960095025e-06, + "loss": 0.3944920599460602, "num_tokens": 346769134.0, "step": 384 }, { "epoch": 2.873134328358209, - "grad_norm": 0.26270894384601023, - "learning_rate": 4.011365591301918e-05, - "loss": 0.29743897914886475, + "grad_norm": 0.2548795141867926, + "learning_rate": 4.011365591301918e-06, + "loss": 0.404415488243103, "num_tokens": 347740543.0, "step": 385 }, { "epoch": 2.8805970149253732, - "grad_norm": 0.2558465514475358, - "learning_rate": 3.989187662559397e-05, - "loss": 0.29701703786849976, + "grad_norm": 0.2353554630176529, + "learning_rate": 3.989187662559397e-06, + "loss": 0.3925011157989502, "num_tokens": 348799551.0, "step": 386 }, { "epoch": 2.888059701492537, - "grad_norm": 0.2800088128917189, - "learning_rate": 3.9670510157793896e-05, - "loss": 0.29111772775650024, + "grad_norm": 0.4371240438139863, + "learning_rate": 3.967051015779389e-06, + "loss": 0.394489049911499, "num_tokens": 349833256.0, "step": 387 }, { "epoch": 2.8955223880597014, - "grad_norm": 0.26928352792622695, - "learning_rate": 3.9449562558313424e-05, - "loss": 0.2905674874782562, + "grad_norm": 0.492017294414543, + "learning_rate": 3.944956255831342e-06, + "loss": 0.3901214003562927, "num_tokens": 350675901.0, "step": 388 }, { "epoch": 2.9029850746268657, - "grad_norm": 0.2398120940925839, - "learning_rate": 3.9229039864401703e-05, - "loss": 0.2857617735862732, + "grad_norm": 0.28604462735158265, + "learning_rate": 3.922903986440171e-06, + "loss": 0.3956416845321655, "num_tokens": 351593161.0, "step": 389 }, { "epoch": 2.91044776119403, - "grad_norm": 0.3592858551496307, - "learning_rate": 3.900894810169766e-05, - "loss": 0.3003401756286621, + "grad_norm": 0.3019009320890686, + "learning_rate": 3.900894810169766e-06, + "loss": 0.4037666618824005, "num_tokens": 352556035.0, "step": 390 }, { "epoch": 2.917910447761194, - "grad_norm": 0.29394748689330036, - "learning_rate": 3.8789293284065244e-05, - "loss": 0.2894614636898041, + "grad_norm": 0.2929989612906795, + "learning_rate": 3.878929328406524e-06, + "loss": 0.38326603174209595, "num_tokens": 353175046.0, "step": 391 }, { "epoch": 2.925373134328358, - "grad_norm": 0.24914268451504032, - "learning_rate": 3.8570081413429206e-05, - "loss": 0.2951936423778534, + "grad_norm": 0.2811533155158446, + "learning_rate": 3.857008141342921e-06, + "loss": 0.3970789909362793, "num_tokens": 354040412.0, "step": 392 }, { "epoch": 2.9328358208955225, - "grad_norm": 0.260179179816783, - "learning_rate": 3.835131847961104e-05, - "loss": 0.29720625281333923, + "grad_norm": 0.2642763742866724, + "learning_rate": 3.8351318479611045e-06, + "loss": 0.40754109621047974, "num_tokens": 354957977.0, "step": 393 }, { "epoch": 2.9402985074626864, - "grad_norm": 0.2569400200318878, - "learning_rate": 3.813301046016536e-05, - "loss": 0.2905476689338684, + "grad_norm": 0.2553969638436942, + "learning_rate": 3.8133010460165364e-06, + "loss": 0.3917849361896515, "num_tokens": 355897000.0, "step": 394 }, { "epoch": 2.9477611940298507, - "grad_norm": 0.2527496513377819, - "learning_rate": 3.791516332021651e-05, - "loss": 0.2816222906112671, + "grad_norm": 0.3227768986284808, + "learning_rate": 3.791516332021651e-06, + "loss": 0.38059675693511963, "num_tokens": 356775946.0, "step": 395 }, { "epoch": 2.955223880597015, - "grad_norm": 0.2672686721120884, - "learning_rate": 3.7697783012295615e-05, - "loss": 0.28156110644340515, + "grad_norm": 0.26373506539724473, + "learning_rate": 3.769778301229562e-06, + "loss": 0.392505407333374, "num_tokens": 357732570.0, "step": 396 }, { "epoch": 2.9626865671641793, - "grad_norm": 0.2724529554297958, - "learning_rate": 3.748087547617795e-05, - "loss": 0.2832649350166321, + "grad_norm": 0.27141559638214446, + "learning_rate": 3.748087547617795e-06, + "loss": 0.38036075234413147, "num_tokens": 358510667.0, "step": 397 }, { "epoch": 2.970149253731343, - "grad_norm": 0.24778083532209783, - "learning_rate": 3.726444663872054e-05, - "loss": 0.27781930565834045, + "grad_norm": 0.24786828522735252, + "learning_rate": 3.7264446638720542e-06, + "loss": 0.37426790595054626, "num_tokens": 359444745.0, "step": 398 }, { "epoch": 2.9776119402985075, - "grad_norm": 0.27757968216126894, - "learning_rate": 3.7048502413700346e-05, - "loss": 0.2812555432319641, + "grad_norm": 0.25219066519802286, + "learning_rate": 3.704850241370035e-06, + "loss": 0.3932304382324219, "num_tokens": 360351403.0, "step": 399 }, { "epoch": 2.9850746268656714, - "grad_norm": 0.2606532676991081, - "learning_rate": 3.683304870165257e-05, - "loss": 0.2908860743045807, + "grad_norm": 0.2314040595153558, + "learning_rate": 3.6833048701652574e-06, + "loss": 0.3921104669570923, "num_tokens": 361414260.0, "step": 400 }, { "epoch": 2.9925373134328357, - "grad_norm": 0.25655766841589706, - "learning_rate": 3.661809138970951e-05, - "loss": 0.29159975051879883, + "grad_norm": 0.24531758323496658, + "learning_rate": 3.661809138970951e-06, + "loss": 0.39479339122772217, "num_tokens": 362313539.0, "step": 401 }, { "epoch": 3.0, - "grad_norm": 0.2747306591851949, - "learning_rate": 3.640363635143958e-05, - "loss": 0.28497114777565, + "grad_norm": 0.269225436814872, + "learning_rate": 3.6403636351439577e-06, + "loss": 0.39549848437309265, "num_tokens": 363114852.0, "step": 402 }, { "epoch": 3.0074626865671643, - "grad_norm": 0.49194599201431255, - "learning_rate": 3.618968944668696e-05, - "loss": 0.2066115438938141, + "grad_norm": 0.28662511975668975, + "learning_rate": 3.618968944668696e-06, + "loss": 0.35942816734313965, "num_tokens": 363883703.0, "step": 403 }, { "epoch": 3.014925373134328, - "grad_norm": 0.3954618807606076, - "learning_rate": 3.59762565214114e-05, - "loss": 0.21091030538082123, + "grad_norm": 0.2897343949926782, + "learning_rate": 3.5976256521411402e-06, + "loss": 0.37709563970565796, "num_tokens": 364726957.0, "step": 404 }, { "epoch": 3.0223880597014925, - "grad_norm": 0.38865512072128744, - "learning_rate": 3.576334340752847e-05, - "loss": 0.19817262887954712, + "grad_norm": 0.25819303755354905, + "learning_rate": 3.576334340752847e-06, + "loss": 0.3720802664756775, "num_tokens": 365712205.0, "step": 405 }, { "epoch": 3.029850746268657, - "grad_norm": 0.6033668823356518, - "learning_rate": 3.5550955922750275e-05, - "loss": 0.21703442931175232, + "grad_norm": 0.28009429409591957, + "learning_rate": 3.5550955922750275e-06, + "loss": 0.3992989659309387, "num_tokens": 366502371.0, "step": 406 }, { "epoch": 3.0373134328358207, - "grad_norm": 0.3916988331142159, - "learning_rate": 3.533909987042642e-05, - "loss": 0.2033926248550415, + "grad_norm": 0.2764674226920931, + "learning_rate": 3.533909987042642e-06, + "loss": 0.39246252179145813, "num_tokens": 367405016.0, "step": 407 }, { "epoch": 3.044776119402985, - "grad_norm": 0.33140751800605023, - "learning_rate": 3.512778103938542e-05, - "loss": 0.21774126589298248, + "grad_norm": 0.30985373317019865, + "learning_rate": 3.512778103938542e-06, + "loss": 0.4023834466934204, "num_tokens": 368186973.0, "step": 408 }, { "epoch": 3.0522388059701493, - "grad_norm": 0.35959691841158536, - "learning_rate": 3.491700520377667e-05, - "loss": 0.19529391825199127, + "grad_norm": 0.28547534212425507, + "learning_rate": 3.491700520377667e-06, + "loss": 0.38294538855552673, "num_tokens": 369054384.0, "step": 409 }, { "epoch": 3.0597014925373136, - "grad_norm": 0.299923016782714, - "learning_rate": 3.470677812291248e-05, - "loss": 0.1944769024848938, + "grad_norm": 0.2749822220227637, + "learning_rate": 3.470677812291248e-06, + "loss": 0.3690488636493683, "num_tokens": 370021137.0, "step": 410 }, { "epoch": 3.0671641791044775, - "grad_norm": 0.29108602181635046, - "learning_rate": 3.449710554111085e-05, - "loss": 0.20074082911014557, + "grad_norm": 0.2617585883370724, + "learning_rate": 3.4497105541110847e-06, + "loss": 0.39320921897888184, "num_tokens": 370954131.0, "step": 411 }, { "epoch": 3.074626865671642, - "grad_norm": 0.2852549785594396, - "learning_rate": 3.428799318753844e-05, - "loss": 0.17918552458286285, + "grad_norm": 0.276121676089303, + "learning_rate": 3.4287993187538445e-06, + "loss": 0.3605678975582123, "num_tokens": 371779138.0, "step": 412 }, { "epoch": 3.082089552238806, - "grad_norm": 0.3048374464108842, - "learning_rate": 3.407944677605399e-05, - "loss": 0.2050935924053192, + "grad_norm": 0.3190227559580631, + "learning_rate": 3.407944677605399e-06, + "loss": 0.408037006855011, "num_tokens": 372652437.0, "step": 413 }, { "epoch": 3.08955223880597, - "grad_norm": 0.3442717098436445, - "learning_rate": 3.387147200505232e-05, - "loss": 0.18611590564250946, + "grad_norm": 0.3764832484269211, + "learning_rate": 3.387147200505232e-06, + "loss": 0.38565126061439514, "num_tokens": 373477902.0, "step": 414 }, { "epoch": 3.0970149253731343, - "grad_norm": 0.3048102891542879, - "learning_rate": 3.366407455730849e-05, - "loss": 0.20156922936439514, + "grad_norm": 0.28107007973769577, + "learning_rate": 3.366407455730849e-06, + "loss": 0.414955735206604, "num_tokens": 374298186.0, "step": 415 }, { "epoch": 3.1044776119402986, - "grad_norm": 0.2585777051914589, - "learning_rate": 3.3457260099822616e-05, - "loss": 0.1968289315700531, + "grad_norm": 0.2538068604333711, + "learning_rate": 3.345726009982262e-06, + "loss": 0.3739873766899109, "num_tokens": 375232722.0, "step": 416 }, { "epoch": 3.111940298507463, - "grad_norm": 0.28390573471476355, - "learning_rate": 3.325103428366495e-05, - "loss": 0.19271767139434814, + "grad_norm": 0.25345140165817104, + "learning_rate": 3.3251034283664945e-06, + "loss": 0.39425763487815857, "num_tokens": 376192544.0, "step": 417 }, { "epoch": 3.1194029850746268, - "grad_norm": 0.27684222641941364, - "learning_rate": 3.304540274382151e-05, - "loss": 0.18728995323181152, + "grad_norm": 0.26126693334804235, + "learning_rate": 3.304540274382151e-06, + "loss": 0.3673323094844818, "num_tokens": 377142524.0, "step": 418 }, { "epoch": 3.126865671641791, - "grad_norm": 0.26470944881530967, - "learning_rate": 3.284037109904013e-05, - "loss": 0.19758391380310059, + "grad_norm": 0.2718425837582604, + "learning_rate": 3.284037109904013e-06, + "loss": 0.38800495862960815, "num_tokens": 378076354.0, "step": 419 }, { "epoch": 3.1343283582089554, - "grad_norm": 0.26428184472593585, - "learning_rate": 3.263594495167688e-05, - "loss": 0.19001808762550354, + "grad_norm": 0.24762599606026042, + "learning_rate": 3.263594495167688e-06, + "loss": 0.3551333248615265, "num_tokens": 378966330.0, "step": 420 }, { "epoch": 3.1417910447761193, - "grad_norm": 0.2892996090271266, - "learning_rate": 3.243212988754302e-05, - "loss": 0.19018301367759705, + "grad_norm": 0.3979931015660995, + "learning_rate": 3.2432129887543026e-06, + "loss": 0.3955429196357727, "num_tokens": 379888904.0, "step": 421 }, { "epoch": 3.1492537313432836, - "grad_norm": 0.255854754497512, - "learning_rate": 3.222893147575232e-05, - "loss": 0.18163231015205383, + "grad_norm": 0.27409522127657593, + "learning_rate": 3.2228931475752323e-06, + "loss": 0.35347574949264526, "num_tokens": 380738966.0, "step": 422 }, { "epoch": 3.156716417910448, - "grad_norm": 0.2688589521434324, - "learning_rate": 3.2026355268568986e-05, - "loss": 0.18542851507663727, + "grad_norm": 0.26157991571638095, + "learning_rate": 3.2026355268568987e-06, + "loss": 0.35351991653442383, "num_tokens": 381614529.0, "step": 423 }, { "epoch": 3.1641791044776117, - "grad_norm": 0.2615236026088854, - "learning_rate": 3.1824406801255834e-05, - "loss": 0.1898452788591385, + "grad_norm": 0.253961852327095, + "learning_rate": 3.1824406801255836e-06, + "loss": 0.36370548605918884, "num_tokens": 382513458.0, "step": 424 }, { "epoch": 3.171641791044776, - "grad_norm": 0.25278096660878835, - "learning_rate": 3.162309159192316e-05, - "loss": 0.1837225705385208, + "grad_norm": 0.24868042189319053, + "learning_rate": 3.162309159192316e-06, + "loss": 0.3607192635536194, "num_tokens": 383449861.0, "step": 425 }, { "epoch": 3.1791044776119404, - "grad_norm": 0.2833105070791237, - "learning_rate": 3.1422415141377815e-05, - "loss": 0.18363988399505615, + "grad_norm": 0.26485700184898936, + "learning_rate": 3.1422415141377815e-06, + "loss": 0.3481111228466034, "num_tokens": 384253017.0, "step": 426 }, { "epoch": 3.1865671641791047, - "grad_norm": 0.3670785881303276, - "learning_rate": 3.122238293297305e-05, - "loss": 0.20621977746486664, + "grad_norm": 0.28281284316278155, + "learning_rate": 3.122238293297305e-06, + "loss": 0.3816152811050415, "num_tokens": 385257443.0, "step": 427 }, { "epoch": 3.1940298507462686, - "grad_norm": 0.28512298225388877, - "learning_rate": 3.10230004324586e-05, - "loss": 0.18900805711746216, + "grad_norm": 0.2628707804556158, + "learning_rate": 3.10230004324586e-06, + "loss": 0.349966824054718, "num_tokens": 386017753.0, "step": 428 }, { "epoch": 3.201492537313433, - "grad_norm": 0.2652115282852538, - "learning_rate": 3.0824273087831335e-05, - "loss": 0.19230526685714722, + "grad_norm": 0.2606711695382564, + "learning_rate": 3.0824273087831335e-06, + "loss": 0.38945478200912476, "num_tokens": 386978912.0, "step": 429 }, { "epoch": 3.208955223880597, - "grad_norm": 0.2970652367849119, - "learning_rate": 3.062620632918648e-05, - "loss": 0.1821131855249405, + "grad_norm": 0.2747230623623624, + "learning_rate": 3.062620632918648e-06, + "loss": 0.3638556897640228, "num_tokens": 387852467.0, "step": 430 }, { "epoch": 3.216417910447761, - "grad_norm": 0.27654968831381693, - "learning_rate": 3.0428805568569073e-05, - "loss": 0.19513992965221405, + "grad_norm": 0.2803007110615389, + "learning_rate": 3.0428805568569076e-06, + "loss": 0.38482367992401123, "num_tokens": 388658923.0, "step": 431 }, { "epoch": 3.2238805970149254, - "grad_norm": 0.2853960291095656, - "learning_rate": 3.0232076199826286e-05, - "loss": 0.18861499428749084, + "grad_norm": 0.2645967994643593, + "learning_rate": 3.023207619982629e-06, + "loss": 0.36384740471839905, "num_tokens": 389508858.0, "step": 432 }, { "epoch": 3.2313432835820897, - "grad_norm": 0.2713401245343682, - "learning_rate": 3.0036023598459895e-05, - "loss": 0.1987762451171875, + "grad_norm": 0.27202749711662244, + "learning_rate": 3.0036023598459895e-06, + "loss": 0.39492571353912354, "num_tokens": 390450838.0, "step": 433 }, { "epoch": 3.2388059701492535, - "grad_norm": 0.2947263011000826, - "learning_rate": 2.984065312147948e-05, - "loss": 0.1858668029308319, + "grad_norm": 0.2858842639475798, + "learning_rate": 2.9840653121479478e-06, + "loss": 0.3738439679145813, "num_tokens": 391283207.0, "step": 434 }, { "epoch": 3.246268656716418, - "grad_norm": 0.2502967308814167, - "learning_rate": 2.9645970107255993e-05, - "loss": 0.16813813149929047, + "grad_norm": 0.24793258551891303, + "learning_rate": 2.9645970107255997e-06, + "loss": 0.35694074630737305, "num_tokens": 392285292.0, "step": 435 }, { "epoch": 3.253731343283582, - "grad_norm": 0.270078888609482, - "learning_rate": 2.945197987537591e-05, - "loss": 0.18862999975681305, + "grad_norm": 0.2819278079547717, + "learning_rate": 2.9451979875375913e-06, + "loss": 0.3710547387599945, "num_tokens": 393145041.0, "step": 436 }, { "epoch": 3.2611940298507465, - "grad_norm": 0.3309597497687191, - "learning_rate": 2.925868772649591e-05, - "loss": 0.20458480715751648, + "grad_norm": 0.2631494530375275, + "learning_rate": 2.925868772649591e-06, + "loss": 0.3825373351573944, "num_tokens": 394022264.0, "step": 437 }, { "epoch": 3.2686567164179103, - "grad_norm": 0.2708462414734987, - "learning_rate": 2.9066098942197994e-05, - "loss": 0.1816583275794983, + "grad_norm": 0.2555768738323888, + "learning_rate": 2.9066098942197995e-06, + "loss": 0.36353516578674316, "num_tokens": 394892104.0, "step": 438 }, { "epoch": 3.2761194029850746, - "grad_norm": 0.269011360757111, - "learning_rate": 2.887421878484516e-05, - "loss": 0.18728917837142944, + "grad_norm": 0.252531665467931, + "learning_rate": 2.887421878484516e-06, + "loss": 0.38284653425216675, "num_tokens": 395835092.0, "step": 439 }, { "epoch": 3.283582089552239, - "grad_norm": 0.2915840174125746, - "learning_rate": 2.868305249743766e-05, - "loss": 0.19079147279262543, + "grad_norm": 0.2845282411268666, + "learning_rate": 2.8683052497437665e-06, + "loss": 0.3927590548992157, "num_tokens": 396725722.0, "step": 440 }, { "epoch": 3.291044776119403, - "grad_norm": 0.28130459485338055, - "learning_rate": 2.8492605303469732e-05, - "loss": 0.18476402759552002, + "grad_norm": 0.26812964504110554, + "learning_rate": 2.8492605303469732e-06, + "loss": 0.37616321444511414, "num_tokens": 397618546.0, "step": 441 }, { "epoch": 3.298507462686567, - "grad_norm": 0.270845090677631, - "learning_rate": 2.8302882406786818e-05, - "loss": 0.19156426191329956, + "grad_norm": 0.25144632819615587, + "learning_rate": 2.8302882406786817e-06, + "loss": 0.382343053817749, "num_tokens": 398571441.0, "step": 442 }, { "epoch": 3.3059701492537314, - "grad_norm": 0.282891687455983, - "learning_rate": 2.8113888991443448e-05, - "loss": 0.1864050030708313, + "grad_norm": 0.29981470486255846, + "learning_rate": 2.811388899144345e-06, + "loss": 0.3775964379310608, "num_tokens": 399409770.0, "step": 443 }, { "epoch": 3.3134328358208958, - "grad_norm": 0.27331622423574914, - "learning_rate": 2.7925630221561506e-05, - "loss": 0.1832800805568695, + "grad_norm": 0.37890241833609745, + "learning_rate": 2.7925630221561506e-06, + "loss": 0.37770912051200867, "num_tokens": 400392695.0, "step": 444 }, { "epoch": 3.3208955223880596, - "grad_norm": 0.24716151608713713, - "learning_rate": 2.7738111241189184e-05, - "loss": 0.19344063103199005, + "grad_norm": 0.2755196059695862, + "learning_rate": 2.7738111241189185e-06, + "loss": 0.3694460690021515, "num_tokens": 401345623.0, "step": 445 }, { "epoch": 3.328358208955224, - "grad_norm": 0.25735363916697457, - "learning_rate": 2.755133717416043e-05, - "loss": 0.19481462240219116, + "grad_norm": 0.2680514510877795, + "learning_rate": 2.755133717416043e-06, + "loss": 0.3776453137397766, "num_tokens": 402260500.0, "step": 446 }, { "epoch": 3.3358208955223883, - "grad_norm": 0.2761609749517773, - "learning_rate": 2.7365313123954916e-05, - "loss": 0.1954980194568634, + "grad_norm": 0.24171155783588386, + "learning_rate": 2.7365313123954916e-06, + "loss": 0.3985833525657654, "num_tokens": 403276687.0, "step": 447 }, { "epoch": 3.343283582089552, - "grad_norm": 0.25184599201070623, - "learning_rate": 2.718004417355855e-05, - "loss": 0.18150857090950012, + "grad_norm": 0.2569574542175994, + "learning_rate": 2.718004417355855e-06, + "loss": 0.3654242157936096, "num_tokens": 404190134.0, "step": 448 }, { "epoch": 3.3507462686567164, - "grad_norm": 0.25862762271317496, - "learning_rate": 2.699553538532467e-05, - "loss": 0.19782021641731262, + "grad_norm": 0.2493193792220214, + "learning_rate": 2.699553538532467e-06, + "loss": 0.3807545006275177, "num_tokens": 405215802.0, "step": 449 }, { "epoch": 3.3582089552238807, - "grad_norm": 0.2516848613392581, - "learning_rate": 2.6811791800835684e-05, - "loss": 0.18828004598617554, + "grad_norm": 0.35218345161133224, + "learning_rate": 2.6811791800835684e-06, + "loss": 0.37028026580810547, "num_tokens": 406189028.0, "step": 450 }, { "epoch": 3.3656716417910446, - "grad_norm": 0.25322426542539367, - "learning_rate": 2.6628818440765267e-05, - "loss": 0.18876372277736664, + "grad_norm": 0.2630947673101325, + "learning_rate": 2.662881844076527e-06, + "loss": 0.3866269886493683, "num_tokens": 407112961.0, "step": 451 }, { "epoch": 3.373134328358209, - "grad_norm": 0.2635905275878099, - "learning_rate": 2.6446620304741265e-05, - "loss": 0.17961478233337402, + "grad_norm": 0.24502727833486168, + "learning_rate": 2.6446620304741267e-06, + "loss": 0.3389516770839691, "num_tokens": 407955720.0, "step": 452 }, { "epoch": 3.3805970149253732, - "grad_norm": 0.27627860842036966, - "learning_rate": 2.6265202371208987e-05, - "loss": 0.1828433871269226, + "grad_norm": 0.29642792873153473, + "learning_rate": 2.6265202371208985e-06, + "loss": 0.3727038502693176, "num_tokens": 408861534.0, "step": 453 }, { "epoch": 3.388059701492537, - "grad_norm": 0.2695654586634986, - "learning_rate": 2.6084569597295226e-05, - "loss": 0.19203506410121918, + "grad_norm": 0.2729055281837691, + "learning_rate": 2.6084569597295227e-06, + "loss": 0.37226539850234985, "num_tokens": 409769033.0, "step": 454 }, { "epoch": 3.3955223880597014, - "grad_norm": 0.2618590322813307, - "learning_rate": 2.590472691867284e-05, - "loss": 0.18993832170963287, + "grad_norm": 0.2591963730270879, + "learning_rate": 2.590472691867284e-06, + "loss": 0.3665540814399719, "num_tokens": 410734429.0, "step": 455 }, { "epoch": 3.4029850746268657, - "grad_norm": 0.24786628104163766, - "learning_rate": 2.5725679249425798e-05, - "loss": 0.18233329057693481, + "grad_norm": 0.24603379464247438, + "learning_rate": 2.57256792494258e-06, + "loss": 0.3557394742965698, "num_tokens": 411668760.0, "step": 456 }, { "epoch": 3.41044776119403, - "grad_norm": 0.25848185000539076, - "learning_rate": 2.5547431481914974e-05, - "loss": 0.18337702751159668, + "grad_norm": 0.26710941082613454, + "learning_rate": 2.5547431481914973e-06, + "loss": 0.3810808062553406, "num_tokens": 412593612.0, "step": 457 }, { "epoch": 3.417910447761194, - "grad_norm": 0.24215114432171708, - "learning_rate": 2.5369988486644447e-05, - "loss": 0.1807183027267456, + "grad_norm": 0.24517969588523647, + "learning_rate": 2.536998848664445e-06, + "loss": 0.36506032943725586, "num_tokens": 413566574.0, "step": 458 }, { "epoch": 3.425373134328358, - "grad_norm": 0.45717352480476836, - "learning_rate": 2.5193355112128435e-05, - "loss": 0.22161290049552917, + "grad_norm": 0.26080308677293107, + "learning_rate": 2.5193355112128436e-06, + "loss": 0.375240683555603, "num_tokens": 414490201.0, "step": 459 }, { "epoch": 3.4328358208955225, - "grad_norm": 0.2703247004607519, - "learning_rate": 2.501753618475877e-05, - "loss": 0.18385817110538483, + "grad_norm": 0.2507828857248926, + "learning_rate": 2.501753618475877e-06, + "loss": 0.3682469129562378, "num_tokens": 415392501.0, "step": 460 }, { "epoch": 3.4402985074626864, - "grad_norm": 0.25083542831124495, - "learning_rate": 2.4842536508673087e-05, - "loss": 0.19022910296916962, + "grad_norm": 0.2692051104680508, + "learning_rate": 2.4842536508673087e-06, + "loss": 0.37688201665878296, "num_tokens": 416317197.0, "step": 461 }, { "epoch": 3.4477611940298507, - "grad_norm": 0.26603445939791553, - "learning_rate": 2.4668360865623447e-05, - "loss": 0.17970548570156097, + "grad_norm": 0.2549135365443059, + "learning_rate": 2.466836086562345e-06, + "loss": 0.36603114008903503, "num_tokens": 417156988.0, "step": 462 }, { "epoch": 3.455223880597015, - "grad_norm": 0.2599107413692842, - "learning_rate": 2.4495014014845808e-05, - "loss": 0.1887662708759308, + "grad_norm": 0.2453940193702825, + "learning_rate": 2.4495014014845807e-06, + "loss": 0.3681268095970154, "num_tokens": 418076187.0, "step": 463 }, { "epoch": 3.4626865671641793, - "grad_norm": 0.2893481653417703, - "learning_rate": 2.4322500692929888e-05, - "loss": 0.18953284621238708, + "grad_norm": 0.2725566155237126, + "learning_rate": 2.432250069292989e-06, + "loss": 0.37921467423439026, "num_tokens": 418901462.0, "step": 464 }, { "epoch": 3.470149253731343, - "grad_norm": 0.27076076153659034, - "learning_rate": 2.415082561368979e-05, - "loss": 0.20783261954784393, + "grad_norm": 0.25907951334448015, + "learning_rate": 2.415082561368979e-06, + "loss": 0.39200738072395325, "num_tokens": 419804291.0, "step": 465 }, { "epoch": 3.4776119402985075, - "grad_norm": 0.28288806684949797, - "learning_rate": 2.397999346803518e-05, - "loss": 0.19998973608016968, + "grad_norm": 0.26406315997541896, + "learning_rate": 2.397999346803518e-06, + "loss": 0.39208582043647766, "num_tokens": 420712064.0, "step": 466 }, { "epoch": 3.485074626865672, - "grad_norm": 0.2444888141754306, - "learning_rate": 2.3810008923843077e-05, - "loss": 0.18368248641490936, + "grad_norm": 0.23773901962622077, + "learning_rate": 2.3810008923843077e-06, + "loss": 0.37207821011543274, "num_tokens": 421699792.0, "step": 467 }, { "epoch": 3.4925373134328357, - "grad_norm": 0.2651038836667961, - "learning_rate": 2.364087662583038e-05, - "loss": 0.19216938316822052, + "grad_norm": 0.2479152678036227, + "learning_rate": 2.3640876625830385e-06, + "loss": 0.37208831310272217, "num_tokens": 422643169.0, "step": 468 }, { "epoch": 3.5, - "grad_norm": 0.24638484210621042, - "learning_rate": 2.3472601195426922e-05, - "loss": 0.19870220124721527, + "grad_norm": 0.2563637058550244, + "learning_rate": 2.347260119542692e-06, + "loss": 0.378294974565506, "num_tokens": 423633161.0, "step": 469 }, { "epoch": 3.5074626865671643, - "grad_norm": 0.26276449691806386, - "learning_rate": 2.3305187230649175e-05, - "loss": 0.18932682275772095, + "grad_norm": 0.2606090648417702, + "learning_rate": 2.3305187230649177e-06, + "loss": 0.3819723129272461, "num_tokens": 424556814.0, "step": 470 }, { "epoch": 3.5149253731343286, - "grad_norm": 0.28527926693849653, - "learning_rate": 2.3138639305974595e-05, - "loss": 0.19162672758102417, + "grad_norm": 0.251352839735243, + "learning_rate": 2.3138639305974596e-06, + "loss": 0.37940090894699097, "num_tokens": 425479906.0, "step": 471 }, { "epoch": 3.5223880597014925, - "grad_norm": 0.2579810851682769, - "learning_rate": 2.2972961972216704e-05, - "loss": 0.19014887511730194, + "grad_norm": 0.24680617583096287, + "learning_rate": 2.2972961972216703e-06, + "loss": 0.3712913393974304, "num_tokens": 426446651.0, "step": 472 }, { "epoch": 3.529850746268657, - "grad_norm": 0.2674640295334296, - "learning_rate": 2.2808159756400667e-05, - "loss": 0.18109123408794403, + "grad_norm": 0.25068376553010957, + "learning_rate": 2.2808159756400667e-06, + "loss": 0.36781617999076843, "num_tokens": 427310770.0, "step": 473 }, { "epoch": 3.5373134328358207, - "grad_norm": 0.25851031138301145, - "learning_rate": 2.2644237161639623e-05, - "loss": 0.2057286500930786, + "grad_norm": 0.2575081517211329, + "learning_rate": 2.264423716163962e-06, + "loss": 0.38692015409469604, "num_tokens": 428270355.0, "step": 474 }, { "epoch": 3.544776119402985, - "grad_norm": 0.2976854169257475, - "learning_rate": 2.2481198667011675e-05, - "loss": 0.20134267210960388, + "grad_norm": 0.26353888813152593, + "learning_rate": 2.2481198667011675e-06, + "loss": 0.4076312184333801, "num_tokens": 429240026.0, "step": 475 }, { "epoch": 3.5522388059701493, - "grad_norm": 0.2633992734824458, - "learning_rate": 2.2319048727437393e-05, - "loss": 0.19365693628787994, + "grad_norm": 0.24599771689956054, + "learning_rate": 2.231904872743739e-06, + "loss": 0.3803725838661194, "num_tokens": 430167582.0, "step": 476 }, { "epoch": 3.5597014925373136, - "grad_norm": 0.2754160005706429, - "learning_rate": 2.215779177355822e-05, - "loss": 0.1842573583126068, + "grad_norm": 0.24639104277677626, + "learning_rate": 2.2157791773558222e-06, + "loss": 0.3705400228500366, "num_tokens": 431118645.0, "step": 477 }, { "epoch": 3.5671641791044775, - "grad_norm": 0.26618343606259465, - "learning_rate": 2.1997432211615327e-05, - "loss": 0.19872742891311646, + "grad_norm": 0.25169470907126656, + "learning_rate": 2.199743221161533e-06, + "loss": 0.40112996101379395, "num_tokens": 432105903.0, "step": 478 }, { "epoch": 3.574626865671642, - "grad_norm": 0.26240668007974227, - "learning_rate": 2.1837974423329254e-05, - "loss": 0.1893463134765625, + "grad_norm": 0.2494015959735466, + "learning_rate": 2.1837974423329254e-06, + "loss": 0.37427645921707153, "num_tokens": 432968989.0, "step": 479 }, { "epoch": 3.582089552238806, - "grad_norm": 0.2529375591425096, - "learning_rate": 2.1679422765780117e-05, - "loss": 0.19187003374099731, + "grad_norm": 0.2510312087393284, + "learning_rate": 2.1679422765780115e-06, + "loss": 0.3761802613735199, "num_tokens": 433879607.0, "step": 480 }, { "epoch": 3.58955223880597, - "grad_norm": 0.25776468115716156, - "learning_rate": 2.1521781571288645e-05, - "loss": 0.17648732662200928, + "grad_norm": 0.2472662036793444, + "learning_rate": 2.152178157128865e-06, + "loss": 0.37739771604537964, "num_tokens": 434793981.0, "step": 481 }, { "epoch": 3.5970149253731343, - "grad_norm": 0.2610126529440518, - "learning_rate": 2.1365055147297742e-05, - "loss": 0.1914067268371582, + "grad_norm": 0.2526868205714577, + "learning_rate": 2.136505514729774e-06, + "loss": 0.3701442778110504, "num_tokens": 435697283.0, "step": 482 }, { "epoch": 3.6044776119402986, - "grad_norm": 0.2716259340753213, - "learning_rate": 2.120924777625479e-05, - "loss": 0.1937374621629715, + "grad_norm": 0.2516013371512894, + "learning_rate": 2.1209247776254795e-06, + "loss": 0.3924868106842041, "num_tokens": 436627533.0, "step": 483 }, { "epoch": 3.611940298507463, - "grad_norm": 0.24608157610433407, - "learning_rate": 2.1054363715494695e-05, - "loss": 0.17636939883232117, + "grad_norm": 0.24502502103917492, + "learning_rate": 2.1054363715494695e-06, + "loss": 0.34178441762924194, "num_tokens": 437481939.0, "step": 484 }, { "epoch": 3.6194029850746268, - "grad_norm": 0.2550443185484906, - "learning_rate": 2.0900407197123442e-05, - "loss": 0.2042848765850067, + "grad_norm": 0.26412516983013123, + "learning_rate": 2.0900407197123444e-06, + "loss": 0.3800678253173828, "num_tokens": 438276274.0, "step": 485 }, { "epoch": 3.626865671641791, - "grad_norm": 0.26902077204052094, - "learning_rate": 2.0747382427902574e-05, - "loss": 0.20606115460395813, + "grad_norm": 0.2650046240456923, + "learning_rate": 2.0747382427902574e-06, + "loss": 0.4031677544116974, "num_tokens": 439089480.0, "step": 486 }, { "epoch": 3.6343283582089554, - "grad_norm": 0.24890244318581478, - "learning_rate": 2.059529358913418e-05, - "loss": 0.19623306393623352, + "grad_norm": 0.2587379164469128, + "learning_rate": 2.059529358913418e-06, + "loss": 0.37271153926849365, "num_tokens": 439983559.0, "step": 487 }, { "epoch": 3.6417910447761193, - "grad_norm": 0.26554045857025543, - "learning_rate": 2.0444144836546683e-05, - "loss": 0.20421941578388214, + "grad_norm": 0.2540913543502109, + "learning_rate": 2.0444144836546684e-06, + "loss": 0.3822531998157501, "num_tokens": 440850324.0, "step": 488 }, { "epoch": 3.6492537313432836, - "grad_norm": 0.2707279081604383, - "learning_rate": 2.0293940300181214e-05, - "loss": 0.18913155794143677, + "grad_norm": 0.27783327558446214, + "learning_rate": 2.0293940300181216e-06, + "loss": 0.3831808269023895, "num_tokens": 441605590.0, "step": 489 }, { "epoch": 3.656716417910448, - "grad_norm": 0.2625732494668558, - "learning_rate": 2.0144684084278847e-05, - "loss": 0.18777087330818176, + "grad_norm": 0.2796967269697153, + "learning_rate": 2.0144684084278847e-06, + "loss": 0.3709692060947418, "num_tokens": 442348946.0, "step": 490 }, { "epoch": 3.664179104477612, - "grad_norm": 0.25050288014725813, - "learning_rate": 1.9996380267168417e-05, - "loss": 0.18106262385845184, + "grad_norm": 0.2465314987769435, + "learning_rate": 1.999638026716842e-06, + "loss": 0.35937702655792236, "num_tokens": 443300971.0, "step": 491 }, { "epoch": 3.671641791044776, - "grad_norm": 0.2514008093801213, - "learning_rate": 1.9849032901155073e-05, - "loss": 0.19742050766944885, + "grad_norm": 0.24683809772269052, + "learning_rate": 1.9849032901155075e-06, + "loss": 0.39329999685287476, "num_tokens": 444301774.0, "step": 492 }, { "epoch": 3.6791044776119404, - "grad_norm": 0.2515623509222219, - "learning_rate": 1.9702646012409577e-05, - "loss": 0.1979285627603531, + "grad_norm": 0.23859031932692393, + "learning_rate": 1.970264601240958e-06, + "loss": 0.3722185492515564, "num_tokens": 445224414.0, "step": 493 }, { "epoch": 3.6865671641791042, - "grad_norm": 0.26046195759469204, - "learning_rate": 1.9557223600858236e-05, - "loss": 0.18502528965473175, + "grad_norm": 0.2707873095537451, + "learning_rate": 1.955722360085824e-06, + "loss": 0.38121020793914795, "num_tokens": 446138719.0, "step": 494 }, { "epoch": 3.6940298507462686, - "grad_norm": 0.27995421490943506, - "learning_rate": 1.9412769640073687e-05, - "loss": 0.2060692310333252, + "grad_norm": 0.27134162465622047, + "learning_rate": 1.941276964007369e-06, + "loss": 0.41704389452934265, "num_tokens": 447027595.0, "step": 495 }, { "epoch": 3.701492537313433, - "grad_norm": 0.28282541375070885, - "learning_rate": 1.9269288077166266e-05, - "loss": 0.20646807551383972, + "grad_norm": 0.27102779980384334, + "learning_rate": 1.9269288077166264e-06, + "loss": 0.41601014137268066, "num_tokens": 447918016.0, "step": 496 }, { "epoch": 3.708955223880597, - "grad_norm": 0.2787771149505012, - "learning_rate": 1.9126782832676175e-05, - "loss": 0.20575478672981262, + "grad_norm": 0.2795343486481597, + "learning_rate": 1.9126782832676175e-06, + "loss": 0.37963247299194336, "num_tokens": 448782123.0, "step": 497 }, { "epoch": 3.716417910447761, - "grad_norm": 0.2402391786525068, - "learning_rate": 1.898525780046635e-05, - "loss": 0.19058045744895935, + "grad_norm": 0.24533152172023073, + "learning_rate": 1.898525780046635e-06, + "loss": 0.37255096435546875, "num_tokens": 449735295.0, "step": 498 }, { "epoch": 3.7238805970149254, - "grad_norm": 0.2467356385183413, - "learning_rate": 1.8844716847616055e-05, - "loss": 0.19737517833709717, + "grad_norm": 0.25068064600084505, + "learning_rate": 1.8844716847616053e-06, + "loss": 0.3953704237937927, "num_tokens": 450703519.0, "step": 499 }, { "epoch": 3.7313432835820897, - "grad_norm": 0.26127243762783636, - "learning_rate": 1.870516381431523e-05, - "loss": 0.18012723326683044, + "grad_norm": 0.27826952511668485, + "learning_rate": 1.870516381431523e-06, + "loss": 0.37893447279930115, "num_tokens": 451523722.0, "step": 500 }, { "epoch": 3.7388059701492535, - "grad_norm": 0.22574013899828166, - "learning_rate": 1.8566602513759572e-05, - "loss": 0.18358758091926575, + "grad_norm": 0.2470705912133386, + "learning_rate": 1.8566602513759573e-06, + "loss": 0.36960500478744507, "num_tokens": 452496914.0, "step": 501 }, { "epoch": 3.746268656716418, - "grad_norm": 0.22326512102091203, - "learning_rate": 1.842903673204633e-05, - "loss": 0.18325470387935638, + "grad_norm": 0.2380353729045607, + "learning_rate": 1.8429036732046328e-06, + "loss": 0.3598456084728241, "num_tokens": 453486873.0, "step": 502 }, { "epoch": 3.753731343283582, - "grad_norm": 0.23703853520129506, - "learning_rate": 1.8292470228070807e-05, - "loss": 0.19234976172447205, + "grad_norm": 0.24753875738528466, + "learning_rate": 1.8292470228070808e-06, + "loss": 0.3775923550128937, "num_tokens": 454415514.0, "step": 503 }, { "epoch": 3.7611940298507465, - "grad_norm": 0.23624172141507785, - "learning_rate": 1.815690673342374e-05, - "loss": 0.1858876347541809, + "grad_norm": 0.24852622318044526, + "learning_rate": 1.815690673342374e-06, + "loss": 0.377275288105011, "num_tokens": 455330400.0, "step": 504 }, { "epoch": 3.7686567164179103, - "grad_norm": 0.23035613004401564, - "learning_rate": 1.8022349952289275e-05, - "loss": 0.1752624809741974, + "grad_norm": 0.24830439594342327, + "learning_rate": 1.8022349952289275e-06, + "loss": 0.3592768907546997, "num_tokens": 456232858.0, "step": 505 }, { "epoch": 3.7761194029850746, - "grad_norm": 0.2802304582091012, - "learning_rate": 1.7888803561343752e-05, - "loss": 0.19568130373954773, + "grad_norm": 0.2661718758635726, + "learning_rate": 1.7888803561343755e-06, + "loss": 0.3917810320854187, "num_tokens": 457091321.0, "step": 506 }, { "epoch": 3.783582089552239, - "grad_norm": 0.26371130962086026, - "learning_rate": 1.7756271209655297e-05, - "loss": 0.20146141946315765, + "grad_norm": 0.2652414658319871, + "learning_rate": 1.7756271209655296e-06, + "loss": 0.41377222537994385, "num_tokens": 457990573.0, "step": 507 }, { "epoch": 3.791044776119403, - "grad_norm": 0.24585345653286125, - "learning_rate": 1.7624756518584017e-05, - "loss": 0.19103452563285828, + "grad_norm": 0.260047413567863, + "learning_rate": 1.7624756518584015e-06, + "loss": 0.3786197304725647, "num_tokens": 458827375.0, "step": 508 }, { "epoch": 3.798507462686567, - "grad_norm": 0.24329678481973108, - "learning_rate": 1.7494263081683133e-05, - "loss": 0.18317708373069763, + "grad_norm": 0.24921975710502509, + "learning_rate": 1.7494263081683134e-06, + "loss": 0.36924827098846436, "num_tokens": 459694321.0, "step": 509 }, { "epoch": 3.8059701492537314, - "grad_norm": 0.2227757517865289, - "learning_rate": 1.7364794464600808e-05, - "loss": 0.18795830011367798, + "grad_norm": 0.24376539520051552, + "learning_rate": 1.736479446460081e-06, + "loss": 0.3597017526626587, "num_tokens": 460616396.0, "step": 510 }, { "epoch": 3.8134328358208958, - "grad_norm": 0.24471648408229413, - "learning_rate": 1.7236354204982587e-05, - "loss": 0.19895689189434052, + "grad_norm": 0.24365917664342365, + "learning_rate": 1.723635420498259e-06, + "loss": 0.36935943365097046, "num_tokens": 461530829.0, "step": 511 }, { "epoch": 3.8208955223880596, - "grad_norm": 0.2533303879854254, - "learning_rate": 1.7108945812374874e-05, - "loss": 0.18436945974826813, + "grad_norm": 0.23932370443954964, + "learning_rate": 1.7108945812374874e-06, + "loss": 0.387093722820282, "num_tokens": 462464505.0, "step": 512 }, { "epoch": 3.828358208955224, - "grad_norm": 0.24762224254094353, - "learning_rate": 1.698257276812896e-05, - "loss": 0.19100706279277802, + "grad_norm": 0.257078997056124, + "learning_rate": 1.6982572768128964e-06, + "loss": 0.38530057668685913, "num_tokens": 463398691.0, "step": 513 }, { "epoch": 3.835820895522388, - "grad_norm": 0.25460476636963814, - "learning_rate": 1.6857238525305923e-05, - "loss": 0.18965837359428406, + "grad_norm": 0.24718882255890465, + "learning_rate": 1.6857238525305924e-06, + "loss": 0.3774847388267517, "num_tokens": 464295344.0, "step": 514 }, { "epoch": 3.843283582089552, - "grad_norm": 0.2387670789620089, - "learning_rate": 1.6732946508582288e-05, - "loss": 0.17684714496135712, + "grad_norm": 0.23460337795500458, + "learning_rate": 1.6732946508582288e-06, + "loss": 0.3643302619457245, "num_tokens": 465251963.0, "step": 515 }, { "epoch": 3.8507462686567164, - "grad_norm": 0.2290717178777657, - "learning_rate": 1.6609700114156366e-05, - "loss": 0.1839907169342041, + "grad_norm": 0.23769889055431628, + "learning_rate": 1.6609700114156368e-06, + "loss": 0.3710617423057556, "num_tokens": 466213047.0, "step": 516 }, { "epoch": 3.8582089552238807, - "grad_norm": 0.23149468446326116, - "learning_rate": 1.648750270965559e-05, - "loss": 0.19479894638061523, + "grad_norm": 0.23396843344896867, + "learning_rate": 1.6487502709655591e-06, + "loss": 0.382940411567688, "num_tokens": 467245768.0, "step": 517 }, { "epoch": 3.8656716417910446, - "grad_norm": 0.2438741985485795, - "learning_rate": 1.6366357634044404e-05, - "loss": 0.18519151210784912, + "grad_norm": 0.23909686285484746, + "learning_rate": 1.6366357634044406e-06, + "loss": 0.3723403215408325, "num_tokens": 468129089.0, "step": 518 }, { "epoch": 3.873134328358209, - "grad_norm": 0.25738567298690823, - "learning_rate": 1.6246268197533047e-05, - "loss": 0.19129495322704315, + "grad_norm": 0.2688981703218909, + "learning_rate": 1.6246268197533046e-06, + "loss": 0.3829047381877899, "num_tokens": 468938058.0, "step": 519 }, { "epoch": 3.8805970149253732, - "grad_norm": 0.2483364167025758, - "learning_rate": 1.6127237681487095e-05, - "loss": 0.18930166959762573, + "grad_norm": 0.25519957310879615, + "learning_rate": 1.6127237681487096e-06, + "loss": 0.39446866512298584, "num_tokens": 469847619.0, "step": 520 }, { "epoch": 3.888059701492537, - "grad_norm": 0.2552692862700431, - "learning_rate": 1.6009269338337832e-05, - "loss": 0.1927138864994049, + "grad_norm": 0.2486366229197261, + "learning_rate": 1.6009269338337832e-06, + "loss": 0.3983200788497925, "num_tokens": 470791148.0, "step": 521 }, { "epoch": 3.8955223880597014, - "grad_norm": 0.2379188866988146, - "learning_rate": 1.5892366391493364e-05, - "loss": 0.18503820896148682, + "grad_norm": 0.24830658428540756, + "learning_rate": 1.5892366391493363e-06, + "loss": 0.38877153396606445, "num_tokens": 471735636.0, "step": 522 }, { "epoch": 3.9029850746268657, - "grad_norm": 0.23950245735548287, - "learning_rate": 1.5776532035250514e-05, - "loss": 0.1889442503452301, + "grad_norm": 0.24923977507707654, + "learning_rate": 1.5776532035250513e-06, + "loss": 0.37799936532974243, "num_tokens": 472685312.0, "step": 523 }, { "epoch": 3.91044776119403, - "grad_norm": 0.2350552957040022, - "learning_rate": 1.5661769434707586e-05, - "loss": 0.17926713824272156, + "grad_norm": 0.23192614084158372, + "learning_rate": 1.5661769434707585e-06, + "loss": 0.36345481872558594, "num_tokens": 473551908.0, "step": 524 }, { "epoch": 3.917910447761194, - "grad_norm": 0.26205947886822, - "learning_rate": 1.5548081725677845e-05, - "loss": 0.18637153506278992, + "grad_norm": 0.2552136498693883, + "learning_rate": 1.5548081725677843e-06, + "loss": 0.38905611634254456, "num_tokens": 474411763.0, "step": 525 }, { "epoch": 3.925373134328358, - "grad_norm": 0.24890887636257084, - "learning_rate": 1.543547201460384e-05, - "loss": 0.19547396898269653, + "grad_norm": 0.24222236291728852, + "learning_rate": 1.543547201460384e-06, + "loss": 0.39437806606292725, "num_tokens": 475386853.0, "step": 526 }, { "epoch": 3.9328358208955225, - "grad_norm": 0.2380826834993642, - "learning_rate": 1.5323943378472547e-05, - "loss": 0.19739708304405212, + "grad_norm": 0.24678201558159044, + "learning_rate": 1.5323943378472547e-06, + "loss": 0.38338255882263184, "num_tokens": 476308351.0, "step": 527 }, { "epoch": 3.9402985074626864, - "grad_norm": 0.2304221543576119, - "learning_rate": 1.5213498864731266e-05, - "loss": 0.17367491126060486, + "grad_norm": 0.24156858852006619, + "learning_rate": 1.5213498864731266e-06, + "loss": 0.3475341796875, "num_tokens": 477113932.0, "step": 528 }, { "epoch": 3.9477611940298507, - "grad_norm": 0.23093918457549975, - "learning_rate": 1.510414149120436e-05, - "loss": 0.18326500058174133, + "grad_norm": 0.2450649252841632, + "learning_rate": 1.510414149120436e-06, + "loss": 0.3621699810028076, "num_tokens": 477978986.0, "step": 529 }, { "epoch": 3.955223880597015, - "grad_norm": 0.2532037397417475, - "learning_rate": 1.4995874246010777e-05, - "loss": 0.19176387786865234, + "grad_norm": 0.2615934671849586, + "learning_rate": 1.4995874246010778e-06, + "loss": 0.39790230989456177, "num_tokens": 478804801.0, "step": 530 }, { "epoch": 3.9626865671641793, - "grad_norm": 0.2298468712178015, - "learning_rate": 1.4888700087482446e-05, - "loss": 0.18720808625221252, + "grad_norm": 0.23841246285008993, + "learning_rate": 1.4888700087482447e-06, + "loss": 0.36489465832710266, "num_tokens": 479744154.0, "step": 531 }, { "epoch": 3.970149253731343, - "grad_norm": 0.24534352455009426, - "learning_rate": 1.4782621944083394e-05, - "loss": 0.17973420023918152, + "grad_norm": 0.23884234571084306, + "learning_rate": 1.4782621944083395e-06, + "loss": 0.3676777482032776, "num_tokens": 480672910.0, "step": 532 }, { "epoch": 3.9776119402985075, - "grad_norm": 0.2514973246268207, - "learning_rate": 1.4677642714329772e-05, - "loss": 0.19309595227241516, + "grad_norm": 0.24521642019046497, + "learning_rate": 1.4677642714329772e-06, + "loss": 0.36571812629699707, "num_tokens": 481542586.0, "step": 533 }, { "epoch": 3.9850746268656714, - "grad_norm": 0.24348123185095713, - "learning_rate": 1.4573765266710599e-05, - "loss": 0.1893158257007599, + "grad_norm": 0.2490357512875355, + "learning_rate": 1.45737652667106e-06, + "loss": 0.3776237964630127, "num_tokens": 482388483.0, "step": 534 }, { "epoch": 3.9925373134328357, - "grad_norm": 0.24797237346224157, - "learning_rate": 1.4470992439609446e-05, - "loss": 0.19356316328048706, + "grad_norm": 0.26895614724288625, + "learning_rate": 1.4470992439609447e-06, + "loss": 0.36370331048965454, "num_tokens": 483130281.0, "step": 535 }, { "epoch": 4.0, - "grad_norm": 0.23301214593504804, - "learning_rate": 1.4369327041226832e-05, - "loss": 0.17787128686904907, + "grad_norm": 0.23598448132329167, + "learning_rate": 1.4369327041226832e-06, + "loss": 0.3770376443862915, "num_tokens": 484157211.0, "step": 536 }, { "epoch": 4.007462686567164, - "grad_norm": 0.42915638694943464, - "learning_rate": 1.4268771849503507e-05, - "loss": 0.13233494758605957, + "grad_norm": 0.2696832935027054, + "learning_rate": 1.4268771849503507e-06, + "loss": 0.3495013117790222, "num_tokens": 484950425.0, "step": 537 }, { "epoch": 4.014925373134329, - "grad_norm": 0.3587597188444135, - "learning_rate": 1.4169329612044569e-05, - "loss": 0.12657418847084045, + "grad_norm": 0.2523061504872546, + "learning_rate": 1.416932961204457e-06, + "loss": 0.35033246874809265, "num_tokens": 485897373.0, "step": 538 }, { "epoch": 4.022388059701493, - "grad_norm": 0.2982202024191653, - "learning_rate": 1.4071003046044323e-05, - "loss": 0.1200299859046936, + "grad_norm": 0.24871017609979634, + "learning_rate": 1.4071003046044324e-06, + "loss": 0.3654225468635559, "num_tokens": 486751466.0, "step": 539 }, { "epoch": 4.029850746268656, - "grad_norm": 0.25553137768445994, - "learning_rate": 1.3973794838212123e-05, - "loss": 0.12038041651248932, + "grad_norm": 0.23941923046022578, + "learning_rate": 1.3973794838212124e-06, + "loss": 0.36163097620010376, "num_tokens": 487741373.0, "step": 540 }, { "epoch": 4.037313432835821, - "grad_norm": 0.292279136529883, - "learning_rate": 1.3877707644698896e-05, - "loss": 0.1095159649848938, + "grad_norm": 0.2662894021736037, + "learning_rate": 1.3877707644698895e-06, + "loss": 0.3875274062156677, "num_tokens": 488582397.0, "step": 541 }, { "epoch": 4.044776119402985, - "grad_norm": 0.36390078027223877, - "learning_rate": 1.3782744091024586e-05, - "loss": 0.12627287209033966, + "grad_norm": 0.2747015512526315, + "learning_rate": 1.3782744091024586e-06, + "loss": 0.3777075409889221, "num_tokens": 489319854.0, "step": 542 }, { "epoch": 4.052238805970149, - "grad_norm": 0.42551419155959364, - "learning_rate": 1.3688906772006394e-05, - "loss": 0.1351567804813385, + "grad_norm": 0.2525490812172139, + "learning_rate": 1.3688906772006393e-06, + "loss": 0.36404550075531006, "num_tokens": 490257709.0, "step": 543 }, { "epoch": 4.059701492537314, - "grad_norm": 0.3002078250520629, - "learning_rate": 1.3596198251687919e-05, - "loss": 0.11268725991249084, + "grad_norm": 0.24847635532681875, + "learning_rate": 1.359619825168792e-06, + "loss": 0.36995524168014526, "num_tokens": 491153491.0, "step": 544 }, { "epoch": 4.067164179104478, - "grad_norm": 0.2582279362027906, - "learning_rate": 1.3504621063269057e-05, - "loss": 0.11838126182556152, + "grad_norm": 0.23960840738937653, + "learning_rate": 1.3504621063269058e-06, + "loss": 0.36562579870224, "num_tokens": 492103168.0, "step": 545 }, { "epoch": 4.074626865671641, - "grad_norm": 0.23993415101933344, - "learning_rate": 1.34141777090368e-05, - "loss": 0.11621074378490448, + "grad_norm": 0.25453231573026114, + "learning_rate": 1.3414177709036802e-06, + "loss": 0.36385661363601685, "num_tokens": 493050344.0, "step": 546 }, { "epoch": 4.082089552238806, - "grad_norm": 0.23923879241495014, - "learning_rate": 1.3324870660296868e-05, - "loss": 0.12101944535970688, + "grad_norm": 0.2409618977265192, + "learning_rate": 1.3324870660296869e-06, + "loss": 0.34029990434646606, "num_tokens": 493993937.0, "step": 547 }, { "epoch": 4.08955223880597, - "grad_norm": 0.2489240391612763, - "learning_rate": 1.3236702357306158e-05, - "loss": 0.11438573896884918, + "grad_norm": 0.23732030399559195, + "learning_rate": 1.3236702357306157e-06, + "loss": 0.37044817209243774, "num_tokens": 494995752.0, "step": 548 }, { "epoch": 4.097014925373134, - "grad_norm": 0.2556440336814001, - "learning_rate": 1.3149675209206086e-05, - "loss": 0.11992098391056061, + "grad_norm": 0.27145957936067777, + "learning_rate": 1.3149675209206086e-06, + "loss": 0.36308181285858154, "num_tokens": 495757177.0, "step": 549 }, { "epoch": 4.104477611940299, - "grad_norm": 0.2310462700463846, - "learning_rate": 1.3063791593956756e-05, - "loss": 0.1260540932416916, + "grad_norm": 0.2833034141091316, + "learning_rate": 1.3063791593956758e-06, + "loss": 0.37331539392471313, "num_tokens": 496689668.0, "step": 550 }, { "epoch": 4.111940298507463, - "grad_norm": 0.23120186886900895, - "learning_rate": 1.2979053858271994e-05, - "loss": 0.11217569559812546, + "grad_norm": 0.240823460931463, + "learning_rate": 1.2979053858271995e-06, + "loss": 0.36020007729530334, "num_tokens": 497565858.0, "step": 551 }, { "epoch": 4.119402985074627, - "grad_norm": 0.2517756646545421, - "learning_rate": 1.2895464317555205e-05, - "loss": 0.1176270991563797, + "grad_norm": 0.2594604561644764, + "learning_rate": 1.2895464317555206e-06, + "loss": 0.3884323239326477, "num_tokens": 498385563.0, "step": 552 }, { "epoch": 4.126865671641791, - "grad_norm": 0.23442399198868952, - "learning_rate": 1.2813025255836103e-05, - "loss": 0.12030480057001114, + "grad_norm": 0.23073517132438157, + "learning_rate": 1.2813025255836104e-06, + "loss": 0.349163293838501, "num_tokens": 499323100.0, "step": 553 }, { "epoch": 4.134328358208955, - "grad_norm": 0.2495506361273293, - "learning_rate": 1.2731738925708328e-05, - "loss": 0.11524371802806854, + "grad_norm": 0.2603679958630556, + "learning_rate": 1.2731738925708328e-06, + "loss": 0.36741840839385986, "num_tokens": 500196622.0, "step": 554 }, { "epoch": 4.141791044776119, - "grad_norm": 0.2480774676431196, - "learning_rate": 1.265160754826787e-05, - "loss": 0.12456649541854858, + "grad_norm": 0.24326119145979633, + "learning_rate": 1.2651607548267873e-06, + "loss": 0.3810882568359375, "num_tokens": 501224710.0, "step": 555 }, { "epoch": 4.149253731343284, - "grad_norm": 0.21737264449964872, - "learning_rate": 1.2572633313052409e-05, - "loss": 0.1139940693974495, + "grad_norm": 0.22934377798425087, + "learning_rate": 1.257263331305241e-06, + "loss": 0.37762486934661865, "num_tokens": 502305655.0, "step": 556 }, { "epoch": 4.156716417910448, - "grad_norm": 0.2279335397273682, - "learning_rate": 1.249481837798144e-05, - "loss": 0.11345663666725159, + "grad_norm": 0.2399419262393838, + "learning_rate": 1.249481837798144e-06, + "loss": 0.360861212015152, "num_tokens": 503186087.0, "step": 557 }, { "epoch": 4.164179104477612, - "grad_norm": 0.23126173607707085, - "learning_rate": 1.2418164869297353e-05, - "loss": 0.12189182639122009, + "grad_norm": 0.2356017748084062, + "learning_rate": 1.2418164869297353e-06, + "loss": 0.36369866132736206, "num_tokens": 504097376.0, "step": 558 }, { "epoch": 4.1716417910447765, - "grad_norm": 0.2725670959153199, - "learning_rate": 1.2342674881507326e-05, - "loss": 0.12913620471954346, + "grad_norm": 0.239368624704367, + "learning_rate": 1.2342674881507327e-06, + "loss": 0.36475175619125366, "num_tokens": 505048926.0, "step": 559 }, { "epoch": 4.17910447761194, - "grad_norm": 0.2305936983676463, - "learning_rate": 1.2268350477326072e-05, - "loss": 0.11692139506340027, + "grad_norm": 0.24555194813944806, + "learning_rate": 1.2268350477326073e-06, + "loss": 0.3852774500846863, "num_tokens": 505967694.0, "step": 560 }, { "epoch": 4.186567164179104, - "grad_norm": 0.22986654664649733, - "learning_rate": 1.2195193687619506e-05, - "loss": 0.1214386522769928, + "grad_norm": 0.24385261062576613, + "learning_rate": 1.2195193687619505e-06, + "loss": 0.3750133812427521, "num_tokens": 506924348.0, "step": 561 }, { "epoch": 4.1940298507462686, - "grad_norm": 0.2293799242086639, - "learning_rate": 1.212320651134921e-05, - "loss": 0.11999252438545227, + "grad_norm": 0.24733441550806298, + "learning_rate": 1.2123206511349212e-06, + "loss": 0.36548683047294617, "num_tokens": 507837247.0, "step": 562 }, { "epoch": 4.201492537313433, - "grad_norm": 0.23255107536233421, - "learning_rate": 1.2052390915517881e-05, - "loss": 0.12108086049556732, + "grad_norm": 0.2626516276894915, + "learning_rate": 1.2052390915517881e-06, + "loss": 0.36941125988960266, "num_tokens": 508615951.0, "step": 563 }, { "epoch": 4.208955223880597, - "grad_norm": 0.21591340577390958, - "learning_rate": 1.1982748835115511e-05, - "loss": 0.11796995997428894, + "grad_norm": 0.24609691004441409, + "learning_rate": 1.1982748835115512e-06, + "loss": 0.3862428665161133, "num_tokens": 509598473.0, "step": 564 }, { "epoch": 4.2164179104477615, - "grad_norm": 0.24594742469421593, - "learning_rate": 1.1914282173066573e-05, - "loss": 0.11675625294446945, + "grad_norm": 0.24842515895556683, + "learning_rate": 1.1914282173066574e-06, + "loss": 0.38270822167396545, "num_tokens": 510499495.0, "step": 565 }, { "epoch": 4.223880597014926, - "grad_norm": 0.22031553256262087, - "learning_rate": 1.1846992800177978e-05, - "loss": 0.11913596093654633, + "grad_norm": 0.2407337171765148, + "learning_rate": 1.1846992800177979e-06, + "loss": 0.3664012551307678, "num_tokens": 511393216.0, "step": 566 }, { "epoch": 4.231343283582089, - "grad_norm": 0.2242173039348798, - "learning_rate": 1.1780882555087989e-05, - "loss": 0.11440055072307587, + "grad_norm": 0.2442416141258047, + "learning_rate": 1.1780882555087988e-06, + "loss": 0.3886314034461975, "num_tokens": 512343363.0, "step": 567 }, { "epoch": 4.2388059701492535, - "grad_norm": 0.21820837515990274, - "learning_rate": 1.1715953244215962e-05, - "loss": 0.12009980529546738, + "grad_norm": 0.2577619381883818, + "learning_rate": 1.1715953244215964e-06, + "loss": 0.3437773585319519, "num_tokens": 513127609.0, "step": 568 }, { "epoch": 4.246268656716418, - "grad_norm": 0.26437734736617013, - "learning_rate": 1.1652206641713018e-05, - "loss": 0.11999930441379547, + "grad_norm": 0.25087871697950354, + "learning_rate": 1.165220664171302e-06, + "loss": 0.3734786808490753, "num_tokens": 514033936.0, "step": 569 }, { "epoch": 4.253731343283582, - "grad_norm": 0.20815917043815502, - "learning_rate": 1.1589644489413516e-05, - "loss": 0.12217661738395691, + "grad_norm": 0.2392856334846873, + "learning_rate": 1.1589644489413516e-06, + "loss": 0.35015231370925903, "num_tokens": 514934044.0, "step": 570 }, { "epoch": 4.2611940298507465, - "grad_norm": 0.21636853602995781, - "learning_rate": 1.1528268496787497e-05, - "loss": 0.11369621753692627, + "grad_norm": 0.23533059380991045, + "learning_rate": 1.1528268496787498e-06, + "loss": 0.3818935453891754, "num_tokens": 515909265.0, "step": 571 }, { "epoch": 4.268656716417911, - "grad_norm": 0.21796486589109093, - "learning_rate": 1.1468080340893958e-05, - "loss": 0.11670064181089401, + "grad_norm": 0.28002873497751246, + "learning_rate": 1.1468080340893958e-06, + "loss": 0.3613874316215515, "num_tokens": 516712628.0, "step": 572 }, { "epoch": 4.276119402985074, - "grad_norm": 0.23320554603512064, - "learning_rate": 1.1409081666335033e-05, - "loss": 0.11871325969696045, + "grad_norm": 0.26573428139291055, + "learning_rate": 1.1409081666335035e-06, + "loss": 0.40466490387916565, "num_tokens": 517664539.0, "step": 573 }, { "epoch": 4.2835820895522385, - "grad_norm": 0.23045852022908989, - "learning_rate": 1.1351274085211067e-05, - "loss": 0.1191893219947815, + "grad_norm": 0.2622221544713941, + "learning_rate": 1.1351274085211068e-06, + "loss": 0.36875689029693604, "num_tokens": 518492097.0, "step": 574 }, { "epoch": 4.291044776119403, - "grad_norm": 0.2111625492945011, - "learning_rate": 1.1294659177076524e-05, - "loss": 0.11383013427257538, + "grad_norm": 0.8295997519231081, + "learning_rate": 1.1294659177076523e-06, + "loss": 0.343036413192749, "num_tokens": 519432536.0, "step": 575 }, { "epoch": 4.298507462686567, - "grad_norm": 0.23508025195949467, - "learning_rate": 1.1239238488896875e-05, - "loss": 0.11598856002092361, + "grad_norm": 0.26477934459538893, + "learning_rate": 1.1239238488896875e-06, + "loss": 0.39276033639907837, "num_tokens": 520276253.0, "step": 576 }, { "epoch": 4.3059701492537314, - "grad_norm": 0.2236213022600048, - "learning_rate": 1.118501353500631e-05, - "loss": 0.11648327112197876, + "grad_norm": 0.2751291575165678, + "learning_rate": 1.118501353500631e-06, + "loss": 0.36554020643234253, "num_tokens": 521085557.0, "step": 577 }, { "epoch": 4.313432835820896, - "grad_norm": 0.23486828556301872, - "learning_rate": 1.1131985797066364e-05, - "loss": 0.11901795864105225, + "grad_norm": 0.26704770077542006, + "learning_rate": 1.1131985797066364e-06, + "loss": 0.39840590953826904, "num_tokens": 521915761.0, "step": 578 }, { "epoch": 4.32089552238806, - "grad_norm": 0.22083295735614855, - "learning_rate": 1.1080156724025409e-05, - "loss": 0.1150529682636261, + "grad_norm": 0.267325084112826, + "learning_rate": 1.1080156724025409e-06, + "loss": 0.3594783842563629, "num_tokens": 522783342.0, "step": 579 }, { "epoch": 4.3283582089552235, - "grad_norm": 0.21394272061559325, - "learning_rate": 1.1029527732079084e-05, - "loss": 0.11960695683956146, + "grad_norm": 0.23810536176661679, + "learning_rate": 1.1029527732079084e-06, + "loss": 0.37440672516822815, "num_tokens": 523807264.0, "step": 580 }, { "epoch": 4.335820895522388, - "grad_norm": 0.24773713221878682, - "learning_rate": 1.0980100204631603e-05, - "loss": 0.12570662796497345, + "grad_norm": 0.27369911060242186, + "learning_rate": 1.0980100204631604e-06, + "loss": 0.40351587533950806, "num_tokens": 524601938.0, "step": 581 }, { "epoch": 4.343283582089552, - "grad_norm": 0.2042311495060902, - "learning_rate": 1.0931875492257945e-05, - "loss": 0.10974724590778351, + "grad_norm": 0.23536111609123755, + "learning_rate": 1.0931875492257946e-06, + "loss": 0.33745962381362915, "num_tokens": 525537212.0, "step": 582 }, { "epoch": 4.350746268656716, - "grad_norm": 0.23408226448062722, - "learning_rate": 1.088485491266694e-05, - "loss": 0.12000362575054169, + "grad_norm": 0.2600131581237491, + "learning_rate": 1.088485491266694e-06, + "loss": 0.38494178652763367, "num_tokens": 526347121.0, "step": 583 }, { "epoch": 4.358208955223881, - "grad_norm": 0.20959016829036062, - "learning_rate": 1.0839039750665291e-05, - "loss": 0.11346212029457092, + "grad_norm": 0.23219951832538527, + "learning_rate": 1.0839039750665292e-06, + "loss": 0.35427361726760864, "num_tokens": 527281437.0, "step": 584 }, { "epoch": 4.365671641791045, - "grad_norm": 0.21434028206354175, - "learning_rate": 1.079443125812243e-05, - "loss": 0.11478964984416962, + "grad_norm": 0.2489391057817072, + "learning_rate": 1.079443125812243e-06, + "loss": 0.3624609708786011, "num_tokens": 528208071.0, "step": 585 }, { "epoch": 4.373134328358209, - "grad_norm": 0.22613838919816373, - "learning_rate": 1.0751030653936355e-05, - "loss": 0.11110389977693558, + "grad_norm": 0.2539695897127002, + "learning_rate": 1.0751030653936356e-06, + "loss": 0.3747778534889221, "num_tokens": 529032089.0, "step": 586 }, { "epoch": 4.380597014925373, - "grad_norm": 0.24090680199846415, - "learning_rate": 1.0708839124000287e-05, - "loss": 0.11479231715202332, + "grad_norm": 0.2499880144186626, + "learning_rate": 1.0708839124000287e-06, + "loss": 0.38273054361343384, "num_tokens": 529947287.0, "step": 587 }, { "epoch": 4.388059701492537, - "grad_norm": 0.23564703204755827, - "learning_rate": 1.0667857821170281e-05, - "loss": 0.1156826838850975, + "grad_norm": 0.2506974248310357, + "learning_rate": 1.0667857821170282e-06, + "loss": 0.3470362424850464, "num_tokens": 530728896.0, "step": 588 }, { "epoch": 4.395522388059701, - "grad_norm": 0.22663485963837518, - "learning_rate": 1.0628087865233737e-05, - "loss": 0.12018388509750366, + "grad_norm": 0.24506418459436066, + "learning_rate": 1.0628087865233737e-06, + "loss": 0.35882338881492615, "num_tokens": 531620091.0, "step": 589 }, { "epoch": 4.402985074626866, - "grad_norm": 0.22960516148137106, - "learning_rate": 1.058953034287877e-05, - "loss": 0.11687445640563965, + "grad_norm": 0.24329483114740325, + "learning_rate": 1.058953034287877e-06, + "loss": 0.37174564599990845, "num_tokens": 532460579.0, "step": 590 }, { "epoch": 4.41044776119403, - "grad_norm": 0.2199967101376024, - "learning_rate": 1.0552186307664566e-05, - "loss": 0.11978814750909805, + "grad_norm": 0.23831984993388738, + "learning_rate": 1.0552186307664567e-06, + "loss": 0.363148033618927, "num_tokens": 533351390.0, "step": 591 }, { "epoch": 4.417910447761194, - "grad_norm": 0.2241944523856631, - "learning_rate": 1.0516056779992541e-05, - "loss": 0.12565642595291138, + "grad_norm": 0.26162136426743393, + "learning_rate": 1.0516056779992543e-06, + "loss": 0.38013726472854614, "num_tokens": 534195605.0, "step": 592 }, { "epoch": 4.425373134328359, - "grad_norm": 0.2342432039292704, - "learning_rate": 1.0481142747078492e-05, - "loss": 0.12581472098827362, + "grad_norm": 0.2635745464481523, + "learning_rate": 1.0481142747078494e-06, + "loss": 0.3700369596481323, "num_tokens": 535033541.0, "step": 593 }, { "epoch": 4.432835820895522, - "grad_norm": 0.2340294633523255, - "learning_rate": 1.0447445162925614e-05, - "loss": 0.12428691983222961, + "grad_norm": 0.25007207032778783, + "learning_rate": 1.0447445162925614e-06, + "loss": 0.3790166974067688, "num_tokens": 535964895.0, "step": 594 }, { "epoch": 4.440298507462686, - "grad_norm": 0.20754762744692337, - "learning_rate": 1.0414964948298436e-05, - "loss": 0.10510142147541046, + "grad_norm": 0.22799545701890034, + "learning_rate": 1.0414964948298436e-06, + "loss": 0.36508986353874207, "num_tokens": 536941184.0, "step": 595 }, { "epoch": 4.447761194029851, - "grad_norm": 0.2072380012556761, - "learning_rate": 1.0383702990697657e-05, - "loss": 0.10804590582847595, + "grad_norm": 0.23265306394886567, + "learning_rate": 1.0383702990697657e-06, + "loss": 0.3546326160430908, "num_tokens": 537896596.0, "step": 596 }, { "epoch": 4.455223880597015, - "grad_norm": 0.23080220736747944, - "learning_rate": 1.035366014433589e-05, - "loss": 0.12573307752609253, + "grad_norm": 0.2452826212608677, + "learning_rate": 1.0353660144335892e-06, + "loss": 0.3647281229496002, "num_tokens": 538748931.0, "step": 597 }, { "epoch": 4.462686567164179, - "grad_norm": 0.23348253833992477, - "learning_rate": 1.0324837230114331e-05, - "loss": 0.11393703520298004, + "grad_norm": 0.24623855227956742, + "learning_rate": 1.0324837230114332e-06, + "loss": 0.3664322793483734, "num_tokens": 539622406.0, "step": 598 }, { "epoch": 4.470149253731344, - "grad_norm": 0.21391684191860763, - "learning_rate": 1.0297235035600337e-05, - "loss": 0.11284124851226807, + "grad_norm": 0.24476867667376634, + "learning_rate": 1.0297235035600337e-06, + "loss": 0.35626494884490967, "num_tokens": 540561688.0, "step": 599 }, { "epoch": 4.477611940298507, - "grad_norm": 0.283019229465025, - "learning_rate": 1.0270854315005874e-05, - "loss": 0.14957240223884583, + "grad_norm": 0.22411638197357536, + "learning_rate": 1.0270854315005874e-06, + "loss": 0.3493247628211975, "num_tokens": 541498885.0, "step": 600 }, { "epoch": 4.485074626865671, - "grad_norm": 0.26455169811726015, - "learning_rate": 1.024569578916695e-05, - "loss": 0.11603298783302307, + "grad_norm": 0.23854702147816884, + "learning_rate": 1.024569578916695e-06, + "loss": 0.36460673809051514, "num_tokens": 542468798.0, "step": 601 }, { "epoch": 4.492537313432836, - "grad_norm": 0.22753900975610558, - "learning_rate": 1.0221760145523875e-05, - "loss": 0.12100145220756531, + "grad_norm": 0.24473776240066009, + "learning_rate": 1.0221760145523876e-06, + "loss": 0.3664558529853821, "num_tokens": 543354992.0, "step": 602 }, { "epoch": 4.5, - "grad_norm": 0.2184715841715033, - "learning_rate": 1.0199048038102527e-05, - "loss": 0.12140538543462753, + "grad_norm": 0.3484100772975978, + "learning_rate": 1.0199048038102528e-06, + "loss": 0.3781493902206421, "num_tokens": 544264190.0, "step": 603 }, { "epoch": 4.507462686567164, - "grad_norm": 0.2257828336109026, - "learning_rate": 1.0177560087496424e-05, - "loss": 0.11271989345550537, + "grad_norm": 0.23041088788536823, + "learning_rate": 1.0177560087496425e-06, + "loss": 0.36557939648628235, "num_tokens": 545199765.0, "step": 604 }, { "epoch": 4.514925373134329, - "grad_norm": 0.2336661358177161, - "learning_rate": 1.0157296880849824e-05, - "loss": 0.12647847831249237, + "grad_norm": 0.26397201636028744, + "learning_rate": 1.0157296880849826e-06, + "loss": 0.39719897508621216, "num_tokens": 546061065.0, "step": 605 }, { "epoch": 4.522388059701493, - "grad_norm": 0.2192801278976015, - "learning_rate": 1.0138258971841642e-05, - "loss": 0.11463098227977753, + "grad_norm": 0.2510378043077616, + "learning_rate": 1.0138258971841642e-06, + "loss": 0.3602595925331116, "num_tokens": 546928816.0, "step": 606 }, { "epoch": 4.529850746268656, - "grad_norm": 0.22181034220038406, - "learning_rate": 1.0120446880670326e-05, - "loss": 0.11594921350479126, + "grad_norm": 0.25217406420558186, + "learning_rate": 1.0120446880670326e-06, + "loss": 0.3766353130340576, "num_tokens": 547847934.0, "step": 607 }, { "epoch": 4.537313432835821, - "grad_norm": 0.21785476565894413, - "learning_rate": 1.0103861094039668e-05, - "loss": 0.11253038793802261, + "grad_norm": 0.23959568238841403, + "learning_rate": 1.010386109403967e-06, + "loss": 0.3650025725364685, "num_tokens": 548766636.0, "step": 608 }, { "epoch": 4.544776119402985, - "grad_norm": 0.22639997385947627, - "learning_rate": 1.008850206514547e-05, - "loss": 0.1259216070175171, + "grad_norm": 0.2377901920772251, + "learning_rate": 1.008850206514547e-06, + "loss": 0.3625343143939972, "num_tokens": 549661389.0, "step": 609 }, { "epoch": 4.552238805970149, - "grad_norm": 0.22933859110991026, - "learning_rate": 1.0074370213663201e-05, - "loss": 0.12494678795337677, + "grad_norm": 0.26122470845807755, + "learning_rate": 1.0074370213663202e-06, + "loss": 0.3682940602302551, "num_tokens": 550430887.0, "step": 610 }, { "epoch": 4.559701492537314, - "grad_norm": 0.2225421344643175, - "learning_rate": 1.0061465925736478e-05, - "loss": 0.11442428827285767, + "grad_norm": 0.2481365703161649, + "learning_rate": 1.0061465925736478e-06, + "loss": 0.36531317234039307, "num_tokens": 551293916.0, "step": 611 }, { "epoch": 4.567164179104478, - "grad_norm": 0.2135844726330891, - "learning_rate": 1.004978955396657e-05, - "loss": 0.11501826345920563, + "grad_norm": 0.23719670021949013, + "learning_rate": 1.004978955396657e-06, + "loss": 0.3669975996017456, "num_tokens": 552281926.0, "step": 612 }, { "epoch": 4.574626865671641, - "grad_norm": 0.2201629763635742, - "learning_rate": 1.0039341417402716e-05, - "loss": 0.12574587762355804, + "grad_norm": 0.25803252973725255, + "learning_rate": 1.0039341417402715e-06, + "loss": 0.37066352367401123, "num_tokens": 553148975.0, "step": 613 }, { "epoch": 4.582089552238806, - "grad_norm": 0.2244132640996229, - "learning_rate": 1.0030121801533442e-05, - "loss": 0.11866112798452377, + "grad_norm": 0.2476936983459798, + "learning_rate": 1.0030121801533442e-06, + "loss": 0.3824441134929657, "num_tokens": 554068576.0, "step": 614 }, { "epoch": 4.58955223880597, - "grad_norm": 0.23614003346429446, - "learning_rate": 1.0022130958278752e-05, - "loss": 0.12326011061668396, + "grad_norm": 0.2489594826146839, + "learning_rate": 1.002213095827875e-06, + "loss": 0.3596557378768921, "num_tokens": 554855138.0, "step": 615 }, { "epoch": 4.597014925373134, - "grad_norm": 0.195860056540846, - "learning_rate": 1.0015369105983218e-05, - "loss": 0.1145499050617218, + "grad_norm": 0.2550266059020853, + "learning_rate": 1.0015369105983218e-06, + "loss": 0.34850555658340454, "num_tokens": 555783649.0, "step": 616 }, { "epoch": 4.604477611940299, - "grad_norm": 0.2174740282366908, - "learning_rate": 1.0009836429410053e-05, - "loss": 0.11517459154129028, + "grad_norm": 0.28933444541800885, + "learning_rate": 1.0009836429410053e-06, + "loss": 0.3593859076499939, "num_tokens": 556756059.0, "step": 617 }, { "epoch": 4.611940298507463, - "grad_norm": 0.20842268914711495, - "learning_rate": 1.0005533079736037e-05, - "loss": 0.11445511877536774, + "grad_norm": 0.24100103005251267, + "learning_rate": 1.0005533079736037e-06, + "loss": 0.34157663583755493, "num_tokens": 557624997.0, "step": 618 }, { "epoch": 4.619402985074627, - "grad_norm": 0.21664224784633956, - "learning_rate": 1.0002459174547399e-05, - "loss": 0.12096623331308365, + "grad_norm": 0.2434497947580223, + "learning_rate": 1.00024591745474e-06, + "loss": 0.35940393805503845, "num_tokens": 558551462.0, "step": 619 }, { "epoch": 4.6268656716417915, - "grad_norm": 0.20902897303516801, - "learning_rate": 1.0000614797836586e-05, - "loss": 0.11375143378973007, + "grad_norm": 0.2334659825308566, + "learning_rate": 1.0000614797836587e-06, + "loss": 0.3954239785671234, "num_tokens": 559571713.0, "step": 620 }, @@ -4973,9 +4973,9 @@ "epoch": 4.6268656716417915, "step": 620, "total_flos": 829937030004736.0, - "train_loss": 0.3161669834246559, - "train_runtime": 18599.2522, - "train_samples_per_second": 1.067, + "train_loss": 0.4202386662844689, + "train_runtime": 18585.0074, + "train_samples_per_second": 1.068, "train_steps_per_second": 0.033 } ],