{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 38439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007804573480059315, "grad_norm": 70.57565307617188, "learning_rate": 2.3413111342351717e-07, "loss": 0.8495, "step": 10 }, { "epoch": 0.001560914696011863, "grad_norm": 60.91403579711914, "learning_rate": 4.94276795005203e-07, "loss": 0.9127, "step": 20 }, { "epoch": 0.0023413720440177946, "grad_norm": 35.46073532104492, "learning_rate": 7.544224765868887e-07, "loss": 0.7408, "step": 30 }, { "epoch": 0.003121829392023726, "grad_norm": 28.7984619140625, "learning_rate": 1.0145681581685745e-06, "loss": 0.7778, "step": 40 }, { "epoch": 0.0039022867400296574, "grad_norm": 24.812997817993164, "learning_rate": 1.2747138397502602e-06, "loss": 0.6852, "step": 50 }, { "epoch": 0.004682744088035589, "grad_norm": 39.767250061035156, "learning_rate": 1.534859521331946e-06, "loss": 0.7863, "step": 60 }, { "epoch": 0.005463201436041521, "grad_norm": 37.76068878173828, "learning_rate": 1.7950052029136317e-06, "loss": 0.8252, "step": 70 }, { "epoch": 0.006243658784047452, "grad_norm": 63.68617248535156, "learning_rate": 2.0551508844953173e-06, "loss": 0.7234, "step": 80 }, { "epoch": 0.007024116132053383, "grad_norm": 23.260103225708008, "learning_rate": 2.315296566077003e-06, "loss": 0.811, "step": 90 }, { "epoch": 0.007804573480059315, "grad_norm": 26.783702850341797, "learning_rate": 2.575442247658689e-06, "loss": 0.5367, "step": 100 }, { "epoch": 0.008585030828065247, "grad_norm": 64.16069030761719, "learning_rate": 2.835587929240375e-06, "loss": 0.6239, "step": 110 }, { "epoch": 0.009365488176071178, "grad_norm": 23.482328414916992, "learning_rate": 3.0957336108220607e-06, "loss": 0.6538, "step": 120 }, { "epoch": 0.01014594552407711, "grad_norm": 47.013214111328125, "learning_rate": 3.3558792924037465e-06, "loss": 0.5528, "step": 130 }, { "epoch": 0.010926402872083041, "grad_norm": 57.85599136352539, "learning_rate": 3.616024973985432e-06, "loss": 1.022, "step": 140 }, { "epoch": 0.011706860220088973, "grad_norm": 36.2352294921875, "learning_rate": 3.8761706555671176e-06, "loss": 0.5808, "step": 150 }, { "epoch": 0.012487317568094904, "grad_norm": 55.68172836303711, "learning_rate": 4.136316337148804e-06, "loss": 0.8512, "step": 160 }, { "epoch": 0.013267774916100835, "grad_norm": 56.305442810058594, "learning_rate": 4.396462018730489e-06, "loss": 0.9563, "step": 170 }, { "epoch": 0.014048232264106767, "grad_norm": 69.95011901855469, "learning_rate": 4.656607700312175e-06, "loss": 1.0437, "step": 180 }, { "epoch": 0.014828689612112698, "grad_norm": 86.20785522460938, "learning_rate": 4.916753381893861e-06, "loss": 1.1461, "step": 190 }, { "epoch": 0.01560914696011863, "grad_norm": 61.00738525390625, "learning_rate": 5.176899063475546e-06, "loss": 0.6641, "step": 200 }, { "epoch": 0.016389604308124563, "grad_norm": 27.987585067749023, "learning_rate": 5.437044745057232e-06, "loss": 1.0178, "step": 210 }, { "epoch": 0.017170061656130494, "grad_norm": 50.186649322509766, "learning_rate": 5.697190426638918e-06, "loss": 0.9153, "step": 220 }, { "epoch": 0.017950519004136425, "grad_norm": 44.61861038208008, "learning_rate": 5.957336108220604e-06, "loss": 0.7384, "step": 230 }, { "epoch": 0.018730976352142357, "grad_norm": 44.65412902832031, "learning_rate": 6.217481789802289e-06, "loss": 0.9101, "step": 240 }, { "epoch": 0.019511433700148288, "grad_norm": 43.09900665283203, "learning_rate": 6.477627471383975e-06, "loss": 0.815, "step": 250 }, { "epoch": 0.02029189104815422, "grad_norm": 26.3706111907959, "learning_rate": 6.737773152965661e-06, "loss": 0.8089, "step": 260 }, { "epoch": 0.02107234839616015, "grad_norm": 23.912540435791016, "learning_rate": 6.997918834547347e-06, "loss": 0.5597, "step": 270 }, { "epoch": 0.021852805744166082, "grad_norm": 43.356571197509766, "learning_rate": 7.258064516129033e-06, "loss": 0.776, "step": 280 }, { "epoch": 0.022633263092172014, "grad_norm": 47.17486572265625, "learning_rate": 7.518210197710718e-06, "loss": 0.6612, "step": 290 }, { "epoch": 0.023413720440177945, "grad_norm": 27.746665954589844, "learning_rate": 7.778355879292404e-06, "loss": 0.9178, "step": 300 }, { "epoch": 0.024194177788183877, "grad_norm": 37.433265686035156, "learning_rate": 8.03850156087409e-06, "loss": 0.7673, "step": 310 }, { "epoch": 0.024974635136189808, "grad_norm": 71.587646484375, "learning_rate": 8.298647242455775e-06, "loss": 1.0383, "step": 320 }, { "epoch": 0.02575509248419574, "grad_norm": 56.9047737121582, "learning_rate": 8.558792924037463e-06, "loss": 0.8289, "step": 330 }, { "epoch": 0.02653554983220167, "grad_norm": 98.57324981689453, "learning_rate": 8.818938605619147e-06, "loss": 1.0829, "step": 340 }, { "epoch": 0.027316007180207602, "grad_norm": 57.22220230102539, "learning_rate": 9.079084287200833e-06, "loss": 1.0618, "step": 350 }, { "epoch": 0.028096464528213533, "grad_norm": 37.49354934692383, "learning_rate": 9.339229968782518e-06, "loss": 0.8915, "step": 360 }, { "epoch": 0.028876921876219465, "grad_norm": 29.789419174194336, "learning_rate": 9.599375650364204e-06, "loss": 0.6783, "step": 370 }, { "epoch": 0.029657379224225396, "grad_norm": 75.5431137084961, "learning_rate": 9.85952133194589e-06, "loss": 1.0771, "step": 380 }, { "epoch": 0.030437836572231328, "grad_norm": 46.971492767333984, "learning_rate": 1.0119667013527576e-05, "loss": 0.9261, "step": 390 }, { "epoch": 0.03121829392023726, "grad_norm": 28.467092514038086, "learning_rate": 1.0379812695109261e-05, "loss": 0.6424, "step": 400 }, { "epoch": 0.03199875126824319, "grad_norm": 70.29894256591797, "learning_rate": 1.0639958376690947e-05, "loss": 0.7372, "step": 410 }, { "epoch": 0.032779208616249125, "grad_norm": 47.83068084716797, "learning_rate": 1.0900104058272635e-05, "loss": 0.8506, "step": 420 }, { "epoch": 0.03355966596425505, "grad_norm": 114.6404037475586, "learning_rate": 1.1160249739854319e-05, "loss": 0.754, "step": 430 }, { "epoch": 0.03434012331226099, "grad_norm": 37.66133499145508, "learning_rate": 1.1420395421436004e-05, "loss": 0.7524, "step": 440 }, { "epoch": 0.035120580660266916, "grad_norm": 69.27783203125, "learning_rate": 1.168054110301769e-05, "loss": 0.7973, "step": 450 }, { "epoch": 0.03590103800827285, "grad_norm": 40.7580451965332, "learning_rate": 1.1940686784599376e-05, "loss": 1.1207, "step": 460 }, { "epoch": 0.03668149535627878, "grad_norm": 69.8377685546875, "learning_rate": 1.2200832466181062e-05, "loss": 0.7528, "step": 470 }, { "epoch": 0.037461952704284714, "grad_norm": 61.14483642578125, "learning_rate": 1.2460978147762747e-05, "loss": 0.5989, "step": 480 }, { "epoch": 0.03824241005229064, "grad_norm": 42.87925338745117, "learning_rate": 1.2721123829344433e-05, "loss": 1.0347, "step": 490 }, { "epoch": 0.039022867400296576, "grad_norm": 82.56037902832031, "learning_rate": 1.2981269510926119e-05, "loss": 0.7157, "step": 500 }, { "epoch": 0.039803324748302504, "grad_norm": 67.13080596923828, "learning_rate": 1.3241415192507806e-05, "loss": 0.5716, "step": 510 }, { "epoch": 0.04058378209630844, "grad_norm": 26.124019622802734, "learning_rate": 1.3501560874089492e-05, "loss": 0.5971, "step": 520 }, { "epoch": 0.04136423944431437, "grad_norm": 43.60024642944336, "learning_rate": 1.3761706555671178e-05, "loss": 0.6868, "step": 530 }, { "epoch": 0.0421446967923203, "grad_norm": 72.22933959960938, "learning_rate": 1.402185223725286e-05, "loss": 0.8075, "step": 540 }, { "epoch": 0.04292515414032623, "grad_norm": 51.11296081542969, "learning_rate": 1.4281997918834548e-05, "loss": 0.5868, "step": 550 }, { "epoch": 0.043705611488332165, "grad_norm": 59.321163177490234, "learning_rate": 1.4542143600416234e-05, "loss": 0.9031, "step": 560 }, { "epoch": 0.04448606883633809, "grad_norm": 14.339578628540039, "learning_rate": 1.480228928199792e-05, "loss": 0.5629, "step": 570 }, { "epoch": 0.04526652618434403, "grad_norm": 14.040125846862793, "learning_rate": 1.5062434963579605e-05, "loss": 0.8403, "step": 580 }, { "epoch": 0.046046983532349955, "grad_norm": 43.124237060546875, "learning_rate": 1.5322580645161292e-05, "loss": 0.7625, "step": 590 }, { "epoch": 0.04682744088035589, "grad_norm": 109.8456802368164, "learning_rate": 1.5582726326742977e-05, "loss": 1.2096, "step": 600 }, { "epoch": 0.04760789822836182, "grad_norm": 28.653745651245117, "learning_rate": 1.5842872008324664e-05, "loss": 1.6295, "step": 610 }, { "epoch": 0.04838835557636775, "grad_norm": 31.663606643676758, "learning_rate": 1.6103017689906348e-05, "loss": 0.7588, "step": 620 }, { "epoch": 0.04916881292437368, "grad_norm": 40.83611297607422, "learning_rate": 1.6363163371488032e-05, "loss": 0.707, "step": 630 }, { "epoch": 0.049949270272379616, "grad_norm": 16.77260398864746, "learning_rate": 1.662330905306972e-05, "loss": 1.0579, "step": 640 }, { "epoch": 0.050729727620385544, "grad_norm": 16.65311622619629, "learning_rate": 1.6883454734651404e-05, "loss": 0.309, "step": 650 }, { "epoch": 0.05151018496839148, "grad_norm": 83.49839782714844, "learning_rate": 1.714360041623309e-05, "loss": 1.103, "step": 660 }, { "epoch": 0.05229064231639741, "grad_norm": 21.528764724731445, "learning_rate": 1.740374609781478e-05, "loss": 0.651, "step": 670 }, { "epoch": 0.05307109966440334, "grad_norm": 4.0382866859436035, "learning_rate": 1.7663891779396463e-05, "loss": 0.905, "step": 680 }, { "epoch": 0.05385155701240927, "grad_norm": 108.43582153320312, "learning_rate": 1.792403746097815e-05, "loss": 1.2111, "step": 690 }, { "epoch": 0.054632014360415204, "grad_norm": 19.11322784423828, "learning_rate": 1.8184183142559834e-05, "loss": 1.1405, "step": 700 }, { "epoch": 0.05541247170842113, "grad_norm": 5.063117504119873, "learning_rate": 1.844432882414152e-05, "loss": 0.779, "step": 710 }, { "epoch": 0.05619292905642707, "grad_norm": 32.06193923950195, "learning_rate": 1.8704474505723206e-05, "loss": 0.9897, "step": 720 }, { "epoch": 0.056973386404432995, "grad_norm": 8.792379379272461, "learning_rate": 1.896462018730489e-05, "loss": 1.2692, "step": 730 }, { "epoch": 0.05775384375243893, "grad_norm": 47.35828399658203, "learning_rate": 1.9224765868886577e-05, "loss": 1.0116, "step": 740 }, { "epoch": 0.05853430110044486, "grad_norm": 100.82672882080078, "learning_rate": 1.948491155046826e-05, "loss": 1.0753, "step": 750 }, { "epoch": 0.05931475844845079, "grad_norm": 13.072049140930176, "learning_rate": 1.974505723204995e-05, "loss": 0.966, "step": 760 }, { "epoch": 0.06009521579645672, "grad_norm": 11.36054801940918, "learning_rate": 2.0005202913631636e-05, "loss": 1.8183, "step": 770 }, { "epoch": 0.060875673144462655, "grad_norm": 164.9627227783203, "learning_rate": 2.026534859521332e-05, "loss": 1.2052, "step": 780 }, { "epoch": 0.06165613049246858, "grad_norm": 1.3306998014450073, "learning_rate": 2.0525494276795008e-05, "loss": 1.7305, "step": 790 }, { "epoch": 0.06243658784047452, "grad_norm": 0.005146509036421776, "learning_rate": 2.078563995837669e-05, "loss": 0.3162, "step": 800 }, { "epoch": 0.06321704518848045, "grad_norm": 192.14456176757812, "learning_rate": 2.1045785639958376e-05, "loss": 1.4213, "step": 810 }, { "epoch": 0.06399750253648638, "grad_norm": 52.385902404785156, "learning_rate": 2.1305931321540063e-05, "loss": 0.3254, "step": 820 }, { "epoch": 0.06477795988449231, "grad_norm": 116.30766296386719, "learning_rate": 2.1566077003121747e-05, "loss": 1.6886, "step": 830 }, { "epoch": 0.06555841723249825, "grad_norm": 154.6099853515625, "learning_rate": 2.1826222684703435e-05, "loss": 2.9836, "step": 840 }, { "epoch": 0.06633887458050418, "grad_norm": 118.08147430419922, "learning_rate": 2.2086368366285122e-05, "loss": 0.353, "step": 850 }, { "epoch": 0.0671193319285101, "grad_norm": 1.24558687210083, "learning_rate": 2.2346514047866806e-05, "loss": 0.3155, "step": 860 }, { "epoch": 0.06789978927651603, "grad_norm": 4.226469993591309, "learning_rate": 2.2606659729448494e-05, "loss": 0.5152, "step": 870 }, { "epoch": 0.06868024662452198, "grad_norm": 17.129283905029297, "learning_rate": 2.2866805411030178e-05, "loss": 3.3197, "step": 880 }, { "epoch": 0.0694607039725279, "grad_norm": 204.71778869628906, "learning_rate": 2.3126951092611865e-05, "loss": 0.6891, "step": 890 }, { "epoch": 0.07024116132053383, "grad_norm": 16.1604061126709, "learning_rate": 2.338709677419355e-05, "loss": 0.9631, "step": 900 }, { "epoch": 0.07102161866853976, "grad_norm": 33.664344787597656, "learning_rate": 2.3647242455775233e-05, "loss": 1.199, "step": 910 }, { "epoch": 0.0718020760165457, "grad_norm": 122.5814437866211, "learning_rate": 2.390738813735692e-05, "loss": 1.1581, "step": 920 }, { "epoch": 0.07258253336455163, "grad_norm": 4.358405902848972e-08, "learning_rate": 2.4167533818938605e-05, "loss": 0.6411, "step": 930 }, { "epoch": 0.07336299071255756, "grad_norm": 103.39940643310547, "learning_rate": 2.4427679500520292e-05, "loss": 2.3409, "step": 940 }, { "epoch": 0.07414344806056349, "grad_norm": 184.74298095703125, "learning_rate": 2.468782518210198e-05, "loss": 1.114, "step": 950 }, { "epoch": 0.07492390540856943, "grad_norm": 4.424814224243164, "learning_rate": 2.4947970863683664e-05, "loss": 0.4251, "step": 960 }, { "epoch": 0.07570436275657536, "grad_norm": 228.1300048828125, "learning_rate": 2.5208116545265348e-05, "loss": 2.1234, "step": 970 }, { "epoch": 0.07648482010458128, "grad_norm": 173.34849548339844, "learning_rate": 2.5468262226847035e-05, "loss": 2.6774, "step": 980 }, { "epoch": 0.07726527745258721, "grad_norm": 25.359329223632812, "learning_rate": 2.572840790842872e-05, "loss": 1.4619, "step": 990 }, { "epoch": 0.07804573480059315, "grad_norm": 93.51551055908203, "learning_rate": 2.5988553590010407e-05, "loss": 1.7448, "step": 1000 }, { "epoch": 0.07882619214859908, "grad_norm": 5.242687225341797, "learning_rate": 2.624869927159209e-05, "loss": 0.8575, "step": 1010 }, { "epoch": 0.07960664949660501, "grad_norm": 137.5213165283203, "learning_rate": 2.6508844953173778e-05, "loss": 0.849, "step": 1020 }, { "epoch": 0.08038710684461094, "grad_norm": 85.96156311035156, "learning_rate": 2.6768990634755466e-05, "loss": 0.5037, "step": 1030 }, { "epoch": 0.08116756419261688, "grad_norm": 28.6999454498291, "learning_rate": 2.702913631633715e-05, "loss": 2.0381, "step": 1040 }, { "epoch": 0.0819480215406228, "grad_norm": 100.8990478515625, "learning_rate": 2.7289281997918837e-05, "loss": 1.587, "step": 1050 }, { "epoch": 0.08272847888862873, "grad_norm": 90.79500579833984, "learning_rate": 2.754942767950052e-05, "loss": 0.9584, "step": 1060 }, { "epoch": 0.08350893623663466, "grad_norm": 5.2520341873168945, "learning_rate": 2.780957336108221e-05, "loss": 0.8497, "step": 1070 }, { "epoch": 0.0842893935846406, "grad_norm": 20.94938850402832, "learning_rate": 2.8069719042663896e-05, "loss": 2.2675, "step": 1080 }, { "epoch": 0.08506985093264653, "grad_norm": 142.2401123046875, "learning_rate": 2.832986472424558e-05, "loss": 1.5153, "step": 1090 }, { "epoch": 0.08585030828065246, "grad_norm": 77.06143188476562, "learning_rate": 2.8590010405827268e-05, "loss": 1.6392, "step": 1100 }, { "epoch": 0.08663076562865839, "grad_norm": 63.13711929321289, "learning_rate": 2.8850156087408952e-05, "loss": 1.902, "step": 1110 }, { "epoch": 0.08741122297666433, "grad_norm": 34.4248161315918, "learning_rate": 2.911030176899064e-05, "loss": 1.384, "step": 1120 }, { "epoch": 0.08819168032467026, "grad_norm": 0.0009433151572011411, "learning_rate": 2.937044745057232e-05, "loss": 1.1749, "step": 1130 }, { "epoch": 0.08897213767267619, "grad_norm": 1.5084794759750366, "learning_rate": 2.9630593132154004e-05, "loss": 1.5541, "step": 1140 }, { "epoch": 0.08975259502068211, "grad_norm": 108.27482604980469, "learning_rate": 2.989073881373569e-05, "loss": 1.8344, "step": 1150 }, { "epoch": 0.09053305236868806, "grad_norm": 0.0024440279230475426, "learning_rate": 3.015088449531738e-05, "loss": 2.5862, "step": 1160 }, { "epoch": 0.09131350971669398, "grad_norm": 0.9278786778450012, "learning_rate": 3.0411030176899063e-05, "loss": 1.0581, "step": 1170 }, { "epoch": 0.09209396706469991, "grad_norm": 26.764270782470703, "learning_rate": 3.067117585848075e-05, "loss": 1.2931, "step": 1180 }, { "epoch": 0.09287442441270584, "grad_norm": 0.8647301197052002, "learning_rate": 3.093132154006244e-05, "loss": 0.4697, "step": 1190 }, { "epoch": 0.09365488176071178, "grad_norm": 127.77442169189453, "learning_rate": 3.119146722164412e-05, "loss": 1.0145, "step": 1200 }, { "epoch": 0.09443533910871771, "grad_norm": 106.81623840332031, "learning_rate": 3.1451612903225806e-05, "loss": 3.1067, "step": 1210 }, { "epoch": 0.09521579645672364, "grad_norm": 1.4174572229385376, "learning_rate": 3.171175858480749e-05, "loss": 0.9611, "step": 1220 }, { "epoch": 0.09599625380472958, "grad_norm": 28.78997802734375, "learning_rate": 3.197190426638918e-05, "loss": 1.1636, "step": 1230 }, { "epoch": 0.0967767111527355, "grad_norm": 37.20909881591797, "learning_rate": 3.223204994797087e-05, "loss": 1.9076, "step": 1240 }, { "epoch": 0.09755716850074143, "grad_norm": 150.9630584716797, "learning_rate": 3.249219562955255e-05, "loss": 1.7947, "step": 1250 }, { "epoch": 0.09833762584874736, "grad_norm": 33.25954818725586, "learning_rate": 3.2752341311134236e-05, "loss": 0.9458, "step": 1260 }, { "epoch": 0.0991180831967533, "grad_norm": 56.61296844482422, "learning_rate": 3.3012486992715924e-05, "loss": 1.1717, "step": 1270 }, { "epoch": 0.09989854054475923, "grad_norm": 0.33454135060310364, "learning_rate": 3.327263267429761e-05, "loss": 0.2624, "step": 1280 }, { "epoch": 0.10067899789276516, "grad_norm": 195.7353057861328, "learning_rate": 3.35327783558793e-05, "loss": 3.4511, "step": 1290 }, { "epoch": 0.10145945524077109, "grad_norm": 34.02952194213867, "learning_rate": 3.379292403746098e-05, "loss": 1.033, "step": 1300 }, { "epoch": 0.10223991258877703, "grad_norm": 11.683356285095215, "learning_rate": 3.405306971904266e-05, "loss": 0.9921, "step": 1310 }, { "epoch": 0.10302036993678296, "grad_norm": 5.753941059112549, "learning_rate": 3.431321540062435e-05, "loss": 0.5025, "step": 1320 }, { "epoch": 0.10380082728478889, "grad_norm": 154.21438598632812, "learning_rate": 3.4573361082206035e-05, "loss": 1.7834, "step": 1330 }, { "epoch": 0.10458128463279481, "grad_norm": 3.0168781250949905e-08, "learning_rate": 3.483350676378772e-05, "loss": 0.3439, "step": 1340 }, { "epoch": 0.10536174198080075, "grad_norm": 187.32237243652344, "learning_rate": 3.509365244536941e-05, "loss": 1.1506, "step": 1350 }, { "epoch": 0.10614219932880668, "grad_norm": 0.014487487263977528, "learning_rate": 3.535379812695109e-05, "loss": 1.7747, "step": 1360 }, { "epoch": 0.10692265667681261, "grad_norm": 1.2514485661085928e-06, "learning_rate": 3.561394380853278e-05, "loss": 1.5893, "step": 1370 }, { "epoch": 0.10770311402481854, "grad_norm": 0.7034757137298584, "learning_rate": 3.5874089490114465e-05, "loss": 1.6552, "step": 1380 }, { "epoch": 0.10848357137282448, "grad_norm": 22.506364822387695, "learning_rate": 3.613423517169615e-05, "loss": 3.3548, "step": 1390 }, { "epoch": 0.10926402872083041, "grad_norm": 0.44950780272483826, "learning_rate": 3.639438085327784e-05, "loss": 1.3068, "step": 1400 }, { "epoch": 0.11004448606883634, "grad_norm": 0.26861312985420227, "learning_rate": 3.665452653485952e-05, "loss": 1.4875, "step": 1410 }, { "epoch": 0.11082494341684226, "grad_norm": 10.148048400878906, "learning_rate": 3.691467221644121e-05, "loss": 0.6517, "step": 1420 }, { "epoch": 0.1116054007648482, "grad_norm": 117.42066955566406, "learning_rate": 3.7174817898022896e-05, "loss": 2.4652, "step": 1430 }, { "epoch": 0.11238585811285413, "grad_norm": 0.7029063701629639, "learning_rate": 3.743496357960458e-05, "loss": 0.4546, "step": 1440 }, { "epoch": 0.11316631546086006, "grad_norm": 0.00036122280289418995, "learning_rate": 3.769510926118627e-05, "loss": 0.6733, "step": 1450 }, { "epoch": 0.11394677280886599, "grad_norm": 2.137751579284668, "learning_rate": 3.795525494276795e-05, "loss": 0.6666, "step": 1460 }, { "epoch": 0.11472723015687193, "grad_norm": 0.006048910319805145, "learning_rate": 3.821540062434964e-05, "loss": 1.7245, "step": 1470 }, { "epoch": 0.11550768750487786, "grad_norm": 3.093451976776123, "learning_rate": 3.8475546305931326e-05, "loss": 0.8739, "step": 1480 }, { "epoch": 0.11628814485288379, "grad_norm": 0.0005918557289987803, "learning_rate": 3.873569198751301e-05, "loss": 1.8093, "step": 1490 }, { "epoch": 0.11706860220088972, "grad_norm": 116.87903594970703, "learning_rate": 3.8995837669094694e-05, "loss": 2.0154, "step": 1500 }, { "epoch": 0.11784905954889566, "grad_norm": 56.92986297607422, "learning_rate": 3.925598335067638e-05, "loss": 1.693, "step": 1510 }, { "epoch": 0.11862951689690159, "grad_norm": 113.77027893066406, "learning_rate": 3.951612903225806e-05, "loss": 1.5754, "step": 1520 }, { "epoch": 0.11940997424490751, "grad_norm": 0.0008633786346763372, "learning_rate": 3.977627471383975e-05, "loss": 1.9881, "step": 1530 }, { "epoch": 0.12019043159291344, "grad_norm": 0.01747491955757141, "learning_rate": 4.003642039542144e-05, "loss": 1.3658, "step": 1540 }, { "epoch": 0.12097088894091938, "grad_norm": 0.017468832433223724, "learning_rate": 4.0296566077003125e-05, "loss": 0.4905, "step": 1550 }, { "epoch": 0.12175134628892531, "grad_norm": 6.877013683319092, "learning_rate": 4.0556711758584806e-05, "loss": 1.3686, "step": 1560 }, { "epoch": 0.12253180363693124, "grad_norm": 61.5954704284668, "learning_rate": 4.081685744016649e-05, "loss": 0.9125, "step": 1570 }, { "epoch": 0.12331226098493717, "grad_norm": 35.28593826293945, "learning_rate": 4.107700312174818e-05, "loss": 0.8816, "step": 1580 }, { "epoch": 0.12409271833294311, "grad_norm": 23.813762664794922, "learning_rate": 4.133714880332987e-05, "loss": 2.0649, "step": 1590 }, { "epoch": 0.12487317568094904, "grad_norm": 1.0868879556655884, "learning_rate": 4.1597294484911555e-05, "loss": 0.9881, "step": 1600 }, { "epoch": 0.12565363302895496, "grad_norm": 89.6454086303711, "learning_rate": 4.1857440166493236e-05, "loss": 1.8194, "step": 1610 }, { "epoch": 0.1264340903769609, "grad_norm": 101.64916229248047, "learning_rate": 4.2117585848074924e-05, "loss": 0.8102, "step": 1620 }, { "epoch": 0.12721454772496682, "grad_norm": 69.2875747680664, "learning_rate": 4.237773152965661e-05, "loss": 1.708, "step": 1630 }, { "epoch": 0.12799500507297276, "grad_norm": 29.091567993164062, "learning_rate": 4.26378772112383e-05, "loss": 0.1439, "step": 1640 }, { "epoch": 0.1287754624209787, "grad_norm": 128.8373565673828, "learning_rate": 4.2898022892819986e-05, "loss": 1.2202, "step": 1650 }, { "epoch": 0.12955591976898462, "grad_norm": 0.0037375022657215595, "learning_rate": 4.3158168574401667e-05, "loss": 1.6469, "step": 1660 }, { "epoch": 0.13033637711699056, "grad_norm": 150.75579833984375, "learning_rate": 4.341831425598335e-05, "loss": 1.3376, "step": 1670 }, { "epoch": 0.1311168344649965, "grad_norm": 32.13846206665039, "learning_rate": 4.3678459937565035e-05, "loss": 1.0743, "step": 1680 }, { "epoch": 0.13189729181300242, "grad_norm": 25.405044555664062, "learning_rate": 4.393860561914672e-05, "loss": 2.3368, "step": 1690 }, { "epoch": 0.13267774916100836, "grad_norm": 33.23969650268555, "learning_rate": 4.419875130072841e-05, "loss": 1.2335, "step": 1700 }, { "epoch": 0.13345820650901427, "grad_norm": 9.90313583315583e-06, "learning_rate": 4.44588969823101e-05, "loss": 2.5255, "step": 1710 }, { "epoch": 0.1342386638570202, "grad_norm": 3.5996785163879395, "learning_rate": 4.471904266389178e-05, "loss": 0.5967, "step": 1720 }, { "epoch": 0.13501912120502615, "grad_norm": 80.54622650146484, "learning_rate": 4.4979188345473465e-05, "loss": 1.665, "step": 1730 }, { "epoch": 0.13579957855303207, "grad_norm": 77.36883544921875, "learning_rate": 4.523933402705515e-05, "loss": 0.7881, "step": 1740 }, { "epoch": 0.136580035901038, "grad_norm": 4.339534759521484, "learning_rate": 4.549947970863684e-05, "loss": 1.2602, "step": 1750 }, { "epoch": 0.13736049324904395, "grad_norm": 43.28116226196289, "learning_rate": 4.575962539021853e-05, "loss": 0.2827, "step": 1760 }, { "epoch": 0.13814095059704987, "grad_norm": 95.82295227050781, "learning_rate": 4.601977107180021e-05, "loss": 0.8773, "step": 1770 }, { "epoch": 0.1389214079450558, "grad_norm": 6.8486552238464355, "learning_rate": 4.6279916753381896e-05, "loss": 1.5294, "step": 1780 }, { "epoch": 0.13970186529306172, "grad_norm": 1.7467870065956959e-06, "learning_rate": 4.654006243496358e-05, "loss": 1.06, "step": 1790 }, { "epoch": 0.14048232264106766, "grad_norm": 9.074289321899414, "learning_rate": 4.680020811654527e-05, "loss": 1.2706, "step": 1800 }, { "epoch": 0.1412627799890736, "grad_norm": 0.0031192891765385866, "learning_rate": 4.706035379812696e-05, "loss": 0.835, "step": 1810 }, { "epoch": 0.14204323733707952, "grad_norm": 0.06087380275130272, "learning_rate": 4.732049947970864e-05, "loss": 3.8462, "step": 1820 }, { "epoch": 0.14282369468508546, "grad_norm": 86.36807250976562, "learning_rate": 4.7580645161290326e-05, "loss": 1.0386, "step": 1830 }, { "epoch": 0.1436041520330914, "grad_norm": 45.92657470703125, "learning_rate": 4.7840790842872014e-05, "loss": 0.6736, "step": 1840 }, { "epoch": 0.14438460938109732, "grad_norm": 78.54730987548828, "learning_rate": 4.81009365244537e-05, "loss": 1.8937, "step": 1850 }, { "epoch": 0.14516506672910326, "grad_norm": 83.15911865234375, "learning_rate": 4.836108220603538e-05, "loss": 0.7178, "step": 1860 }, { "epoch": 0.14594552407710917, "grad_norm": 35.2256965637207, "learning_rate": 4.862122788761707e-05, "loss": 0.5309, "step": 1870 }, { "epoch": 0.14672598142511512, "grad_norm": 138.44583129882812, "learning_rate": 4.888137356919875e-05, "loss": 1.6066, "step": 1880 }, { "epoch": 0.14750643877312106, "grad_norm": 8.984199666883796e-06, "learning_rate": 4.914151925078044e-05, "loss": 0.1384, "step": 1890 }, { "epoch": 0.14828689612112697, "grad_norm": 2.7456895423938477e-08, "learning_rate": 4.9401664932362125e-05, "loss": 2.8467, "step": 1900 }, { "epoch": 0.1490673534691329, "grad_norm": 7.1267924308776855, "learning_rate": 4.966181061394381e-05, "loss": 1.0931, "step": 1910 }, { "epoch": 0.14984781081713885, "grad_norm": 14.842751502990723, "learning_rate": 4.992195629552549e-05, "loss": 0.5779, "step": 1920 }, { "epoch": 0.15062826816514477, "grad_norm": 48.106014251708984, "learning_rate": 5.018210197710719e-05, "loss": 1.4842, "step": 1930 }, { "epoch": 0.1514087255131507, "grad_norm": 0.34851303696632385, "learning_rate": 5.044224765868887e-05, "loss": 0.4474, "step": 1940 }, { "epoch": 0.15218918286115662, "grad_norm": 3.1243269443511963, "learning_rate": 5.070239334027055e-05, "loss": 1.0802, "step": 1950 }, { "epoch": 0.15296964020916257, "grad_norm": 20.244169235229492, "learning_rate": 5.096253902185224e-05, "loss": 0.81, "step": 1960 }, { "epoch": 0.1537500975571685, "grad_norm": 12.780139923095703, "learning_rate": 5.122268470343392e-05, "loss": 1.698, "step": 1970 }, { "epoch": 0.15453055490517442, "grad_norm": 0.014045203104615211, "learning_rate": 5.148283038501561e-05, "loss": 0.3384, "step": 1980 }, { "epoch": 0.15531101225318036, "grad_norm": 129.87998962402344, "learning_rate": 5.174297606659729e-05, "loss": 1.9301, "step": 1990 }, { "epoch": 0.1560914696011863, "grad_norm": 38.74739456176758, "learning_rate": 5.2003121748178986e-05, "loss": 1.2539, "step": 2000 }, { "epoch": 0.15687192694919222, "grad_norm": 0.030707743018865585, "learning_rate": 5.2263267429760666e-05, "loss": 1.0761, "step": 2010 }, { "epoch": 0.15765238429719816, "grad_norm": 70.8248062133789, "learning_rate": 5.2523413111342354e-05, "loss": 2.2531, "step": 2020 }, { "epoch": 0.1584328416452041, "grad_norm": 20.452491760253906, "learning_rate": 5.2783558792924034e-05, "loss": 0.4756, "step": 2030 }, { "epoch": 0.15921329899321002, "grad_norm": 1.2021034955978394, "learning_rate": 5.304370447450573e-05, "loss": 0.714, "step": 2040 }, { "epoch": 0.15999375634121596, "grad_norm": 0.427171915769577, "learning_rate": 5.330385015608741e-05, "loss": 0.4299, "step": 2050 }, { "epoch": 0.16077421368922187, "grad_norm": 5.084196345706005e-06, "learning_rate": 5.3563995837669104e-05, "loss": 2.0605, "step": 2060 }, { "epoch": 0.16155467103722781, "grad_norm": 7.578249096695799e-06, "learning_rate": 5.3824141519250784e-05, "loss": 0.1577, "step": 2070 }, { "epoch": 0.16233512838523376, "grad_norm": 5.277594254948781e-07, "learning_rate": 5.408428720083247e-05, "loss": 0.663, "step": 2080 }, { "epoch": 0.16311558573323967, "grad_norm": 19.192306518554688, "learning_rate": 5.434443288241415e-05, "loss": 0.7749, "step": 2090 }, { "epoch": 0.1638960430812456, "grad_norm": 1.7029945809099445e-07, "learning_rate": 5.4604578563995847e-05, "loss": 0.6955, "step": 2100 }, { "epoch": 0.16467650042925155, "grad_norm": 1.3037423585160468e-08, "learning_rate": 5.486472424557753e-05, "loss": 1.6133, "step": 2110 }, { "epoch": 0.16545695777725747, "grad_norm": 0.6061166524887085, "learning_rate": 5.5124869927159215e-05, "loss": 1.9547, "step": 2120 }, { "epoch": 0.1662374151252634, "grad_norm": 122.31124114990234, "learning_rate": 5.5385015608740895e-05, "loss": 0.7001, "step": 2130 }, { "epoch": 0.16701787247326932, "grad_norm": 0.06084052473306656, "learning_rate": 5.5645161290322576e-05, "loss": 2.5693, "step": 2140 }, { "epoch": 0.16779832982127527, "grad_norm": 104.42826843261719, "learning_rate": 5.590530697190427e-05, "loss": 1.0454, "step": 2150 }, { "epoch": 0.1685787871692812, "grad_norm": 1.0883114337921143, "learning_rate": 5.616545265348595e-05, "loss": 0.872, "step": 2160 }, { "epoch": 0.16935924451728712, "grad_norm": 34.3475341796875, "learning_rate": 5.6425598335067645e-05, "loss": 1.844, "step": 2170 }, { "epoch": 0.17013970186529306, "grad_norm": 1.2302706241607666, "learning_rate": 5.6685744016649326e-05, "loss": 0.916, "step": 2180 }, { "epoch": 0.170920159213299, "grad_norm": 7.886373996734619, "learning_rate": 5.694588969823101e-05, "loss": 2.8442, "step": 2190 }, { "epoch": 0.17170061656130492, "grad_norm": 51.30202865600586, "learning_rate": 5.7206035379812694e-05, "loss": 2.1727, "step": 2200 }, { "epoch": 0.17248107390931086, "grad_norm": 0.004836680367588997, "learning_rate": 5.746618106139439e-05, "loss": 0.148, "step": 2210 }, { "epoch": 0.17326153125731678, "grad_norm": 10.632889747619629, "learning_rate": 5.772632674297607e-05, "loss": 1.1186, "step": 2220 }, { "epoch": 0.17404198860532272, "grad_norm": 1.2311549186706543, "learning_rate": 5.7986472424557756e-05, "loss": 0.7166, "step": 2230 }, { "epoch": 0.17482244595332866, "grad_norm": 0.0004135339695494622, "learning_rate": 5.824661810613944e-05, "loss": 0.5374, "step": 2240 }, { "epoch": 0.17560290330133457, "grad_norm": 109.13738250732422, "learning_rate": 5.850676378772113e-05, "loss": 0.7873, "step": 2250 }, { "epoch": 0.17638336064934051, "grad_norm": 45.3726806640625, "learning_rate": 5.876690946930281e-05, "loss": 2.329, "step": 2260 }, { "epoch": 0.17716381799734646, "grad_norm": 8.300317858811468e-05, "learning_rate": 5.9027055150884506e-05, "loss": 0.7245, "step": 2270 }, { "epoch": 0.17794427534535237, "grad_norm": 100.41148376464844, "learning_rate": 5.928720083246618e-05, "loss": 1.6251, "step": 2280 }, { "epoch": 0.1787247326933583, "grad_norm": 137.95272827148438, "learning_rate": 5.9547346514047874e-05, "loss": 2.1591, "step": 2290 }, { "epoch": 0.17950519004136423, "grad_norm": 111.72086334228516, "learning_rate": 5.9807492195629555e-05, "loss": 0.5804, "step": 2300 }, { "epoch": 0.18028564738937017, "grad_norm": 32.664913177490234, "learning_rate": 6.0067637877211236e-05, "loss": 0.3329, "step": 2310 }, { "epoch": 0.1810661047373761, "grad_norm": 44.84520721435547, "learning_rate": 6.032778355879293e-05, "loss": 1.1485, "step": 2320 }, { "epoch": 0.18184656208538202, "grad_norm": 0.08995208144187927, "learning_rate": 6.058792924037461e-05, "loss": 1.7519, "step": 2330 }, { "epoch": 0.18262701943338797, "grad_norm": 7.0876171776035335e-06, "learning_rate": 6.08480749219563e-05, "loss": 0.3008, "step": 2340 }, { "epoch": 0.1834074767813939, "grad_norm": 0.00017401509103365242, "learning_rate": 6.110822060353799e-05, "loss": 0.0506, "step": 2350 }, { "epoch": 0.18418793412939982, "grad_norm": 32.613162994384766, "learning_rate": 6.136836628511967e-05, "loss": 0.0262, "step": 2360 }, { "epoch": 0.18496839147740576, "grad_norm": 0.001569831627421081, "learning_rate": 6.162851196670135e-05, "loss": 3.8222, "step": 2370 }, { "epoch": 0.18574884882541168, "grad_norm": 135.337646484375, "learning_rate": 6.188865764828304e-05, "loss": 2.4808, "step": 2380 }, { "epoch": 0.18652930617341762, "grad_norm": 13.219198226928711, "learning_rate": 6.214880332986472e-05, "loss": 2.1569, "step": 2390 }, { "epoch": 0.18730976352142356, "grad_norm": 0.014933624304831028, "learning_rate": 6.240894901144642e-05, "loss": 0.7584, "step": 2400 }, { "epoch": 0.18809022086942948, "grad_norm": 104.06402587890625, "learning_rate": 6.26690946930281e-05, "loss": 1.2859, "step": 2410 }, { "epoch": 0.18887067821743542, "grad_norm": 0.00016139191575348377, "learning_rate": 6.292924037460979e-05, "loss": 0.6368, "step": 2420 }, { "epoch": 0.18965113556544136, "grad_norm": 0.02603684924542904, "learning_rate": 6.318938605619147e-05, "loss": 0.9785, "step": 2430 }, { "epoch": 0.19043159291344727, "grad_norm": 89.01065826416016, "learning_rate": 6.344953173777317e-05, "loss": 0.4253, "step": 2440 }, { "epoch": 0.19121205026145321, "grad_norm": 14.965557098388672, "learning_rate": 6.370967741935485e-05, "loss": 2.5722, "step": 2450 }, { "epoch": 0.19199250760945916, "grad_norm": 2.7472351575852372e-05, "learning_rate": 6.396982310093653e-05, "loss": 1.3484, "step": 2460 }, { "epoch": 0.19277296495746507, "grad_norm": 62.7291259765625, "learning_rate": 6.422996878251821e-05, "loss": 1.2803, "step": 2470 }, { "epoch": 0.193553422305471, "grad_norm": 23.414655685424805, "learning_rate": 6.44901144640999e-05, "loss": 0.7357, "step": 2480 }, { "epoch": 0.19433387965347693, "grad_norm": 1.1676721811682e-08, "learning_rate": 6.475026014568158e-05, "loss": 0.8226, "step": 2490 }, { "epoch": 0.19511433700148287, "grad_norm": 1.240859866142273, "learning_rate": 6.501040582726326e-05, "loss": 1.1128, "step": 2500 }, { "epoch": 0.1958947943494888, "grad_norm": 1.6697241067886353, "learning_rate": 6.527055150884496e-05, "loss": 2.9435, "step": 2510 }, { "epoch": 0.19667525169749472, "grad_norm": 33.49791717529297, "learning_rate": 6.553069719042664e-05, "loss": 0.71, "step": 2520 }, { "epoch": 0.19745570904550067, "grad_norm": 4.625143051147461, "learning_rate": 6.579084287200833e-05, "loss": 1.4883, "step": 2530 }, { "epoch": 0.1982361663935066, "grad_norm": 118.58780670166016, "learning_rate": 6.605098855359001e-05, "loss": 0.9855, "step": 2540 }, { "epoch": 0.19901662374151252, "grad_norm": 4.585960388183594, "learning_rate": 6.631113423517171e-05, "loss": 0.6574, "step": 2550 }, { "epoch": 0.19979708108951846, "grad_norm": 24.67189598083496, "learning_rate": 6.657127991675339e-05, "loss": 1.0292, "step": 2560 }, { "epoch": 0.20057753843752438, "grad_norm": 0.24398376047611237, "learning_rate": 6.683142559833507e-05, "loss": 1.203, "step": 2570 }, { "epoch": 0.20135799578553032, "grad_norm": 2.676201581954956, "learning_rate": 6.709157127991675e-05, "loss": 0.6513, "step": 2580 }, { "epoch": 0.20213845313353626, "grad_norm": 13.57225513458252, "learning_rate": 6.735171696149844e-05, "loss": 1.8385, "step": 2590 }, { "epoch": 0.20291891048154218, "grad_norm": 76.8286361694336, "learning_rate": 6.761186264308012e-05, "loss": 0.4815, "step": 2600 }, { "epoch": 0.20369936782954812, "grad_norm": 71.41983032226562, "learning_rate": 6.787200832466182e-05, "loss": 0.3473, "step": 2610 }, { "epoch": 0.20447982517755406, "grad_norm": 16.46891212463379, "learning_rate": 6.81321540062435e-05, "loss": 0.2729, "step": 2620 }, { "epoch": 0.20526028252555997, "grad_norm": 2.7754387855529785, "learning_rate": 6.839229968782519e-05, "loss": 2.3419, "step": 2630 }, { "epoch": 0.20604073987356591, "grad_norm": 92.1739501953125, "learning_rate": 6.865244536940687e-05, "loss": 0.2308, "step": 2640 }, { "epoch": 0.20682119722157183, "grad_norm": 137.1291046142578, "learning_rate": 6.891259105098857e-05, "loss": 3.2198, "step": 2650 }, { "epoch": 0.20760165456957777, "grad_norm": 3.064040422439575, "learning_rate": 6.917273673257025e-05, "loss": 2.4636, "step": 2660 }, { "epoch": 0.2083821119175837, "grad_norm": 68.23210144042969, "learning_rate": 6.943288241415193e-05, "loss": 0.4497, "step": 2670 }, { "epoch": 0.20916256926558963, "grad_norm": 8.724065992282704e-06, "learning_rate": 6.969302809573361e-05, "loss": 0.2542, "step": 2680 }, { "epoch": 0.20994302661359557, "grad_norm": 0.014424404129385948, "learning_rate": 6.995317377731529e-05, "loss": 0.565, "step": 2690 }, { "epoch": 0.2107234839616015, "grad_norm": 135.26565551757812, "learning_rate": 7.021331945889699e-05, "loss": 2.0194, "step": 2700 }, { "epoch": 0.21150394130960742, "grad_norm": 0.00900540966540575, "learning_rate": 7.047346514047867e-05, "loss": 1.9152, "step": 2710 }, { "epoch": 0.21228439865761337, "grad_norm": 39.1527214050293, "learning_rate": 7.073361082206036e-05, "loss": 0.5994, "step": 2720 }, { "epoch": 0.21306485600561928, "grad_norm": 0.008746455423533916, "learning_rate": 7.099375650364204e-05, "loss": 1.3674, "step": 2730 }, { "epoch": 0.21384531335362522, "grad_norm": 57.56302261352539, "learning_rate": 7.125390218522373e-05, "loss": 1.2222, "step": 2740 }, { "epoch": 0.21462577070163116, "grad_norm": 49.190887451171875, "learning_rate": 7.151404786680542e-05, "loss": 1.031, "step": 2750 }, { "epoch": 0.21540622804963708, "grad_norm": 124.0704116821289, "learning_rate": 7.177419354838711e-05, "loss": 1.7073, "step": 2760 }, { "epoch": 0.21618668539764302, "grad_norm": 0.058635517954826355, "learning_rate": 7.203433922996878e-05, "loss": 1.3333, "step": 2770 }, { "epoch": 0.21696714274564896, "grad_norm": 40.484222412109375, "learning_rate": 7.229448491155047e-05, "loss": 0.8355, "step": 2780 }, { "epoch": 0.21774760009365488, "grad_norm": 3.079050064086914, "learning_rate": 7.255463059313215e-05, "loss": 0.838, "step": 2790 }, { "epoch": 0.21852805744166082, "grad_norm": 2.005028247833252, "learning_rate": 7.281477627471385e-05, "loss": 0.7343, "step": 2800 }, { "epoch": 0.21930851478966673, "grad_norm": 56.680877685546875, "learning_rate": 7.307492195629553e-05, "loss": 0.3208, "step": 2810 }, { "epoch": 0.22008897213767267, "grad_norm": 1.1359798008925281e-05, "learning_rate": 7.333506763787722e-05, "loss": 2.3247, "step": 2820 }, { "epoch": 0.22086942948567861, "grad_norm": 123.48363494873047, "learning_rate": 7.35952133194589e-05, "loss": 1.2051, "step": 2830 }, { "epoch": 0.22164988683368453, "grad_norm": 0.8620667457580566, "learning_rate": 7.38553590010406e-05, "loss": 0.3456, "step": 2840 }, { "epoch": 0.22243034418169047, "grad_norm": 50.265743255615234, "learning_rate": 7.411550468262228e-05, "loss": 1.8603, "step": 2850 }, { "epoch": 0.2232108015296964, "grad_norm": 60.52621841430664, "learning_rate": 7.437565036420396e-05, "loss": 0.7977, "step": 2860 }, { "epoch": 0.22399125887770233, "grad_norm": 17.86060905456543, "learning_rate": 7.463579604578564e-05, "loss": 0.6533, "step": 2870 }, { "epoch": 0.22477171622570827, "grad_norm": 0.051353830844163895, "learning_rate": 7.489594172736732e-05, "loss": 0.4748, "step": 2880 }, { "epoch": 0.2255521735737142, "grad_norm": 0.003670419566333294, "learning_rate": 7.515608740894901e-05, "loss": 0.8188, "step": 2890 }, { "epoch": 0.22633263092172012, "grad_norm": 104.48554992675781, "learning_rate": 7.54162330905307e-05, "loss": 1.515, "step": 2900 }, { "epoch": 0.22711308826972607, "grad_norm": 0.0348089262843132, "learning_rate": 7.567637877211239e-05, "loss": 2.7286, "step": 2910 }, { "epoch": 0.22789354561773198, "grad_norm": 10.428803443908691, "learning_rate": 7.593652445369407e-05, "loss": 1.0998, "step": 2920 }, { "epoch": 0.22867400296573792, "grad_norm": 8.424872398376465, "learning_rate": 7.619667013527576e-05, "loss": 0.6017, "step": 2930 }, { "epoch": 0.22945446031374386, "grad_norm": 0.40675023198127747, "learning_rate": 7.645681581685744e-05, "loss": 0.7057, "step": 2940 }, { "epoch": 0.23023491766174978, "grad_norm": 0.02739277482032776, "learning_rate": 7.671696149843914e-05, "loss": 0.7381, "step": 2950 }, { "epoch": 0.23101537500975572, "grad_norm": 122.3631820678711, "learning_rate": 7.697710718002082e-05, "loss": 2.2416, "step": 2960 }, { "epoch": 0.23179583235776166, "grad_norm": 22.099197387695312, "learning_rate": 7.72372528616025e-05, "loss": 0.7604, "step": 2970 }, { "epoch": 0.23257628970576757, "grad_norm": 0.024255866184830666, "learning_rate": 7.749739854318418e-05, "loss": 0.7483, "step": 2980 }, { "epoch": 0.23335674705377352, "grad_norm": 0.019524026662111282, "learning_rate": 7.775754422476587e-05, "loss": 0.6842, "step": 2990 }, { "epoch": 0.23413720440177943, "grad_norm": 1.153704243250786e-09, "learning_rate": 7.801768990634755e-05, "loss": 0.7946, "step": 3000 }, { "epoch": 0.23491766174978537, "grad_norm": 0.002444772282615304, "learning_rate": 7.827783558792925e-05, "loss": 2.4327, "step": 3010 }, { "epoch": 0.23569811909779131, "grad_norm": 1.4735280275344849, "learning_rate": 7.853798126951093e-05, "loss": 1.2604, "step": 3020 }, { "epoch": 0.23647857644579723, "grad_norm": 5.004496870242292e-06, "learning_rate": 7.879812695109261e-05, "loss": 0.9982, "step": 3030 }, { "epoch": 0.23725903379380317, "grad_norm": 5.29398775100708, "learning_rate": 7.90582726326743e-05, "loss": 0.9924, "step": 3040 }, { "epoch": 0.2380394911418091, "grad_norm": 98.14347076416016, "learning_rate": 7.931841831425598e-05, "loss": 1.2078, "step": 3050 }, { "epoch": 0.23881994848981503, "grad_norm": 9.180928230285645, "learning_rate": 7.957856399583768e-05, "loss": 0.293, "step": 3060 }, { "epoch": 0.23960040583782097, "grad_norm": 96.04532623291016, "learning_rate": 7.983870967741936e-05, "loss": 0.7801, "step": 3070 }, { "epoch": 0.24038086318582688, "grad_norm": 43.307640075683594, "learning_rate": 8.009885535900104e-05, "loss": 1.1076, "step": 3080 }, { "epoch": 0.24116132053383282, "grad_norm": 5.743316173553467, "learning_rate": 8.035900104058272e-05, "loss": 1.1714, "step": 3090 }, { "epoch": 0.24194177788183877, "grad_norm": 0.0005356659530662, "learning_rate": 8.061914672216442e-05, "loss": 1.811, "step": 3100 }, { "epoch": 0.24272223522984468, "grad_norm": 0.5350730419158936, "learning_rate": 8.08792924037461e-05, "loss": 0.948, "step": 3110 }, { "epoch": 0.24350269257785062, "grad_norm": 0.001030560932122171, "learning_rate": 8.113943808532779e-05, "loss": 0.3189, "step": 3120 }, { "epoch": 0.24428314992585656, "grad_norm": 129.49615478515625, "learning_rate": 8.139958376690947e-05, "loss": 0.9723, "step": 3130 }, { "epoch": 0.24506360727386248, "grad_norm": 13.451752662658691, "learning_rate": 8.165972944849117e-05, "loss": 1.0916, "step": 3140 }, { "epoch": 0.24584406462186842, "grad_norm": 0.06887555867433548, "learning_rate": 8.191987513007285e-05, "loss": 2.2815, "step": 3150 }, { "epoch": 0.24662452196987433, "grad_norm": 18.077409744262695, "learning_rate": 8.218002081165454e-05, "loss": 2.1444, "step": 3160 }, { "epoch": 0.24740497931788027, "grad_norm": 120.06288146972656, "learning_rate": 8.244016649323622e-05, "loss": 1.5044, "step": 3170 }, { "epoch": 0.24818543666588622, "grad_norm": 8.321662902832031, "learning_rate": 8.27003121748179e-05, "loss": 0.3245, "step": 3180 }, { "epoch": 0.24896589401389213, "grad_norm": 3.4147167205810547, "learning_rate": 8.296045785639958e-05, "loss": 1.4218, "step": 3190 }, { "epoch": 0.24974635136189807, "grad_norm": 163.7802734375, "learning_rate": 8.322060353798128e-05, "loss": 1.3589, "step": 3200 }, { "epoch": 0.250526808709904, "grad_norm": 0.1835886836051941, "learning_rate": 8.348074921956296e-05, "loss": 2.1614, "step": 3210 }, { "epoch": 0.25130726605790993, "grad_norm": 0.03185463324189186, "learning_rate": 8.374089490114464e-05, "loss": 1.6368, "step": 3220 }, { "epoch": 0.25208772340591584, "grad_norm": 45.36469268798828, "learning_rate": 8.400104058272633e-05, "loss": 2.1333, "step": 3230 }, { "epoch": 0.2528681807539218, "grad_norm": 152.0260772705078, "learning_rate": 8.426118626430801e-05, "loss": 1.3231, "step": 3240 }, { "epoch": 0.2536486381019277, "grad_norm": 15.69242000579834, "learning_rate": 8.45213319458897e-05, "loss": 1.1895, "step": 3250 }, { "epoch": 0.25442909544993364, "grad_norm": 87.19802856445312, "learning_rate": 8.478147762747139e-05, "loss": 1.4486, "step": 3260 }, { "epoch": 0.2552095527979396, "grad_norm": 51.75240707397461, "learning_rate": 8.504162330905308e-05, "loss": 1.4757, "step": 3270 }, { "epoch": 0.2559900101459455, "grad_norm": 79.54395294189453, "learning_rate": 8.530176899063476e-05, "loss": 2.1736, "step": 3280 }, { "epoch": 0.25677046749395144, "grad_norm": 4.835404396057129, "learning_rate": 8.556191467221644e-05, "loss": 0.6074, "step": 3290 }, { "epoch": 0.2575509248419574, "grad_norm": 78.10124206542969, "learning_rate": 8.582206035379812e-05, "loss": 1.0351, "step": 3300 }, { "epoch": 0.2583313821899633, "grad_norm": 8.682250022888184, "learning_rate": 8.608220603537982e-05, "loss": 1.1131, "step": 3310 }, { "epoch": 0.25911183953796924, "grad_norm": 56.00176239013672, "learning_rate": 8.63423517169615e-05, "loss": 2.09, "step": 3320 }, { "epoch": 0.2598922968859752, "grad_norm": 0.021717999130487442, "learning_rate": 8.660249739854319e-05, "loss": 1.3968, "step": 3330 }, { "epoch": 0.2606727542339811, "grad_norm": 95.72789001464844, "learning_rate": 8.686264308012487e-05, "loss": 0.629, "step": 3340 }, { "epoch": 0.26145321158198703, "grad_norm": 82.1919937133789, "learning_rate": 8.712278876170657e-05, "loss": 0.6342, "step": 3350 }, { "epoch": 0.262233668929993, "grad_norm": 15.192002296447754, "learning_rate": 8.738293444328825e-05, "loss": 0.0908, "step": 3360 }, { "epoch": 0.2630141262779989, "grad_norm": 176.47926330566406, "learning_rate": 8.764308012486994e-05, "loss": 1.3793, "step": 3370 }, { "epoch": 0.26379458362600483, "grad_norm": 168.18618774414062, "learning_rate": 8.790322580645162e-05, "loss": 2.3707, "step": 3380 }, { "epoch": 0.26457504097401074, "grad_norm": 0.0006278129876591265, "learning_rate": 8.81633714880333e-05, "loss": 0.6653, "step": 3390 }, { "epoch": 0.2653554983220167, "grad_norm": 5.725263326894492e-05, "learning_rate": 8.842351716961498e-05, "loss": 1.7785, "step": 3400 }, { "epoch": 0.26613595567002263, "grad_norm": 9.961698532104492, "learning_rate": 8.868366285119667e-05, "loss": 2.1567, "step": 3410 }, { "epoch": 0.26691641301802854, "grad_norm": 15.461480140686035, "learning_rate": 8.894380853277836e-05, "loss": 1.008, "step": 3420 }, { "epoch": 0.2676968703660345, "grad_norm": 0.003817821154370904, "learning_rate": 8.920395421436004e-05, "loss": 0.3742, "step": 3430 }, { "epoch": 0.2684773277140404, "grad_norm": 4.069940567016602, "learning_rate": 8.946409989594173e-05, "loss": 0.3514, "step": 3440 }, { "epoch": 0.26925778506204634, "grad_norm": 0.027782615274190903, "learning_rate": 8.972424557752341e-05, "loss": 1.0349, "step": 3450 }, { "epoch": 0.2700382424100523, "grad_norm": 0.10339374095201492, "learning_rate": 8.998439125910511e-05, "loss": 2.5487, "step": 3460 }, { "epoch": 0.2708186997580582, "grad_norm": 0.7649679780006409, "learning_rate": 9.024453694068679e-05, "loss": 2.1772, "step": 3470 }, { "epoch": 0.27159915710606414, "grad_norm": 54.28131866455078, "learning_rate": 9.050468262226848e-05, "loss": 1.4012, "step": 3480 }, { "epoch": 0.2723796144540701, "grad_norm": 112.0675277709961, "learning_rate": 9.076482830385015e-05, "loss": 1.2856, "step": 3490 }, { "epoch": 0.273160071802076, "grad_norm": 0.7972148656845093, "learning_rate": 9.102497398543185e-05, "loss": 0.6709, "step": 3500 }, { "epoch": 0.27394052915008194, "grad_norm": 228.42440795898438, "learning_rate": 9.128511966701353e-05, "loss": 1.0519, "step": 3510 }, { "epoch": 0.2747209864980879, "grad_norm": 1.3910229199609603e-06, "learning_rate": 9.154526534859522e-05, "loss": 1.2227, "step": 3520 }, { "epoch": 0.2755014438460938, "grad_norm": 137.3799591064453, "learning_rate": 9.18054110301769e-05, "loss": 2.221, "step": 3530 }, { "epoch": 0.27628190119409973, "grad_norm": 52.212223052978516, "learning_rate": 9.20655567117586e-05, "loss": 0.5392, "step": 3540 }, { "epoch": 0.27706235854210565, "grad_norm": 12.489707946777344, "learning_rate": 9.232570239334028e-05, "loss": 1.2827, "step": 3550 }, { "epoch": 0.2778428158901116, "grad_norm": 3.4159752804185928e-09, "learning_rate": 9.258584807492197e-05, "loss": 0.4744, "step": 3560 }, { "epoch": 0.27862327323811753, "grad_norm": 6.787818908691406, "learning_rate": 9.284599375650365e-05, "loss": 1.175, "step": 3570 }, { "epoch": 0.27940373058612344, "grad_norm": 0.2829515039920807, "learning_rate": 9.310613943808533e-05, "loss": 1.7776, "step": 3580 }, { "epoch": 0.2801841879341294, "grad_norm": 0.000505531148519367, "learning_rate": 9.336628511966701e-05, "loss": 0.5682, "step": 3590 }, { "epoch": 0.28096464528213533, "grad_norm": 53.973140716552734, "learning_rate": 9.362643080124869e-05, "loss": 2.9599, "step": 3600 }, { "epoch": 0.28174510263014124, "grad_norm": 115.87065887451172, "learning_rate": 9.388657648283039e-05, "loss": 3.8834, "step": 3610 }, { "epoch": 0.2825255599781472, "grad_norm": 15.08074951171875, "learning_rate": 9.414672216441207e-05, "loss": 0.6171, "step": 3620 }, { "epoch": 0.2833060173261531, "grad_norm": 0.0419006384909153, "learning_rate": 9.440686784599376e-05, "loss": 2.639, "step": 3630 }, { "epoch": 0.28408647467415904, "grad_norm": 1.0769245818664785e-05, "learning_rate": 9.466701352757544e-05, "loss": 1.3622, "step": 3640 }, { "epoch": 0.284866932022165, "grad_norm": 45.47175979614258, "learning_rate": 9.492715920915714e-05, "loss": 0.6441, "step": 3650 }, { "epoch": 0.2856473893701709, "grad_norm": 0.3931857943534851, "learning_rate": 9.518730489073882e-05, "loss": 1.6816, "step": 3660 }, { "epoch": 0.28642784671817684, "grad_norm": 63.66070556640625, "learning_rate": 9.544745057232051e-05, "loss": 1.3548, "step": 3670 }, { "epoch": 0.2872083040661828, "grad_norm": 12.380095481872559, "learning_rate": 9.570759625390219e-05, "loss": 0.5438, "step": 3680 }, { "epoch": 0.2879887614141887, "grad_norm": 8.507214546203613, "learning_rate": 9.596774193548387e-05, "loss": 0.7701, "step": 3690 }, { "epoch": 0.28876921876219463, "grad_norm": 0.00235476135276258, "learning_rate": 9.622788761706555e-05, "loss": 0.0634, "step": 3700 }, { "epoch": 0.2895496761102006, "grad_norm": 0.45816951990127563, "learning_rate": 9.648803329864725e-05, "loss": 0.9807, "step": 3710 }, { "epoch": 0.2903301334582065, "grad_norm": 0.003578943433240056, "learning_rate": 9.674817898022893e-05, "loss": 1.536, "step": 3720 }, { "epoch": 0.29111059080621243, "grad_norm": 114.68425750732422, "learning_rate": 9.700832466181062e-05, "loss": 1.1759, "step": 3730 }, { "epoch": 0.29189104815421835, "grad_norm": 28.14389419555664, "learning_rate": 9.72684703433923e-05, "loss": 2.6994, "step": 3740 }, { "epoch": 0.2926715055022243, "grad_norm": 71.8919677734375, "learning_rate": 9.7528616024974e-05, "loss": 1.5178, "step": 3750 }, { "epoch": 0.29345196285023023, "grad_norm": 56.985450744628906, "learning_rate": 9.778876170655568e-05, "loss": 0.657, "step": 3760 }, { "epoch": 0.29423242019823614, "grad_norm": 49.4119758605957, "learning_rate": 9.804890738813736e-05, "loss": 0.7112, "step": 3770 }, { "epoch": 0.2950128775462421, "grad_norm": 23.014514923095703, "learning_rate": 9.830905306971905e-05, "loss": 0.857, "step": 3780 }, { "epoch": 0.29579333489424803, "grad_norm": 32.53450012207031, "learning_rate": 9.856919875130073e-05, "loss": 0.7348, "step": 3790 }, { "epoch": 0.29657379224225394, "grad_norm": 59.46930694580078, "learning_rate": 9.882934443288241e-05, "loss": 2.8634, "step": 3800 }, { "epoch": 0.2973542495902599, "grad_norm": 15.2013521194458, "learning_rate": 9.90894901144641e-05, "loss": 0.5304, "step": 3810 }, { "epoch": 0.2981347069382658, "grad_norm": 11.043535232543945, "learning_rate": 9.934963579604579e-05, "loss": 0.7362, "step": 3820 }, { "epoch": 0.29891516428627174, "grad_norm": 0.4539059102535248, "learning_rate": 9.960978147762747e-05, "loss": 1.841, "step": 3830 }, { "epoch": 0.2996956216342777, "grad_norm": 0.08967360109090805, "learning_rate": 9.986992715920916e-05, "loss": 0.451, "step": 3840 }, { "epoch": 0.3004760789822836, "grad_norm": 9.775715827941895, "learning_rate": 9.99999948458971e-05, "loss": 0.0336, "step": 3850 }, { "epoch": 0.30125653633028954, "grad_norm": 0.0003560681361705065, "learning_rate": 9.999995361308023e-05, "loss": 6.5222, "step": 3860 }, { "epoch": 0.3020369936782955, "grad_norm": 74.62921905517578, "learning_rate": 9.999987114748048e-05, "loss": 1.7191, "step": 3870 }, { "epoch": 0.3028174510263014, "grad_norm": 0.004861526656895876, "learning_rate": 9.999974744916586e-05, "loss": 0.661, "step": 3880 }, { "epoch": 0.30359790837430733, "grad_norm": 20.33612823486328, "learning_rate": 9.999958251823837e-05, "loss": 0.6154, "step": 3890 }, { "epoch": 0.30437836572231325, "grad_norm": 3.1527907848358154, "learning_rate": 9.999937635483406e-05, "loss": 1.4707, "step": 3900 }, { "epoch": 0.3051588230703192, "grad_norm": 70.3372802734375, "learning_rate": 9.999912895912291e-05, "loss": 1.3573, "step": 3910 }, { "epoch": 0.30593928041832513, "grad_norm": 0.2236901819705963, "learning_rate": 9.999884033130893e-05, "loss": 1.436, "step": 3920 }, { "epoch": 0.30671973776633105, "grad_norm": 0.030305564403533936, "learning_rate": 9.999851047163016e-05, "loss": 0.4924, "step": 3930 }, { "epoch": 0.307500195114337, "grad_norm": 5.503699753717228e-07, "learning_rate": 9.999813938035863e-05, "loss": 1.1959, "step": 3940 }, { "epoch": 0.30828065246234293, "grad_norm": 50.966861724853516, "learning_rate": 9.999772705780032e-05, "loss": 0.5705, "step": 3950 }, { "epoch": 0.30906110981034884, "grad_norm": 0.027110936120152473, "learning_rate": 9.999727350429529e-05, "loss": 0.9733, "step": 3960 }, { "epoch": 0.3098415671583548, "grad_norm": 6.961010456085205, "learning_rate": 9.999677872021755e-05, "loss": 0.5901, "step": 3970 }, { "epoch": 0.3106220245063607, "grad_norm": 13.56571102142334, "learning_rate": 9.999624270597515e-05, "loss": 0.1543, "step": 3980 }, { "epoch": 0.31140248185436664, "grad_norm": 44.97492980957031, "learning_rate": 9.999566546201008e-05, "loss": 1.1215, "step": 3990 }, { "epoch": 0.3121829392023726, "grad_norm": 20.070308685302734, "learning_rate": 9.999504698879838e-05, "loss": 1.4225, "step": 4000 }, { "epoch": 0.3129633965503785, "grad_norm": 1.784777283668518, "learning_rate": 9.999438728685011e-05, "loss": 0.4262, "step": 4010 }, { "epoch": 0.31374385389838444, "grad_norm": 6.901144504547119, "learning_rate": 9.999368635670928e-05, "loss": 0.6186, "step": 4020 }, { "epoch": 0.3145243112463904, "grad_norm": 4.161862373352051, "learning_rate": 9.999294419895389e-05, "loss": 1.6664, "step": 4030 }, { "epoch": 0.3153047685943963, "grad_norm": 0.14534419775009155, "learning_rate": 9.999216081419599e-05, "loss": 1.4358, "step": 4040 }, { "epoch": 0.31608522594240224, "grad_norm": 8.709707260131836, "learning_rate": 9.999133620308161e-05, "loss": 1.2329, "step": 4050 }, { "epoch": 0.3168656832904082, "grad_norm": 59.18502426147461, "learning_rate": 9.999047036629074e-05, "loss": 1.7152, "step": 4060 }, { "epoch": 0.3176461406384141, "grad_norm": 6.609310626983643, "learning_rate": 9.998956330453744e-05, "loss": 1.5284, "step": 4070 }, { "epoch": 0.31842659798642003, "grad_norm": 34.27482604980469, "learning_rate": 9.998861501856969e-05, "loss": 0.7867, "step": 4080 }, { "epoch": 0.31920705533442595, "grad_norm": 0.22041009366512299, "learning_rate": 9.99876255091695e-05, "loss": 1.6204, "step": 4090 }, { "epoch": 0.3199875126824319, "grad_norm": 71.2801742553711, "learning_rate": 9.99865947771529e-05, "loss": 1.2527, "step": 4100 }, { "epoch": 0.32076797003043783, "grad_norm": 29.67563819885254, "learning_rate": 9.998552282336988e-05, "loss": 1.1256, "step": 4110 }, { "epoch": 0.32154842737844375, "grad_norm": 39.03765869140625, "learning_rate": 9.998440964870443e-05, "loss": 1.2212, "step": 4120 }, { "epoch": 0.3223288847264497, "grad_norm": 75.92422485351562, "learning_rate": 9.998325525407453e-05, "loss": 1.7344, "step": 4130 }, { "epoch": 0.32310934207445563, "grad_norm": 11.299295425415039, "learning_rate": 9.998205964043217e-05, "loss": 1.7633, "step": 4140 }, { "epoch": 0.32388979942246154, "grad_norm": 0.8371900916099548, "learning_rate": 9.99808228087633e-05, "loss": 1.8775, "step": 4150 }, { "epoch": 0.3246702567704675, "grad_norm": 0.0006445566541515291, "learning_rate": 9.997954476008792e-05, "loss": 0.9025, "step": 4160 }, { "epoch": 0.3254507141184734, "grad_norm": 0.00038227959885261953, "learning_rate": 9.997822549545995e-05, "loss": 0.2794, "step": 4170 }, { "epoch": 0.32623117146647934, "grad_norm": 4.977309799869545e-06, "learning_rate": 9.997686501596735e-05, "loss": 1.2245, "step": 4180 }, { "epoch": 0.3270116288144853, "grad_norm": 2.4554672241210938, "learning_rate": 9.997546332273202e-05, "loss": 0.1281, "step": 4190 }, { "epoch": 0.3277920861624912, "grad_norm": 1.094784853050701e-11, "learning_rate": 9.99740204169099e-05, "loss": 0.9423, "step": 4200 }, { "epoch": 0.32857254351049714, "grad_norm": 59.58600616455078, "learning_rate": 9.997253629969089e-05, "loss": 6.1129, "step": 4210 }, { "epoch": 0.3293530008585031, "grad_norm": 41.99625778198242, "learning_rate": 9.997101097229886e-05, "loss": 0.6757, "step": 4220 }, { "epoch": 0.330133458206509, "grad_norm": 18.46398162841797, "learning_rate": 9.99694444359917e-05, "loss": 1.9334, "step": 4230 }, { "epoch": 0.33091391555451494, "grad_norm": 0.34535104036331177, "learning_rate": 9.996783669206124e-05, "loss": 0.0171, "step": 4240 }, { "epoch": 0.33169437290252085, "grad_norm": 70.5401840209961, "learning_rate": 9.996618774183335e-05, "loss": 1.538, "step": 4250 }, { "epoch": 0.3324748302505268, "grad_norm": 0.9419988989830017, "learning_rate": 9.996449758666784e-05, "loss": 1.0131, "step": 4260 }, { "epoch": 0.33325528759853273, "grad_norm": 44.18296813964844, "learning_rate": 9.996276622795847e-05, "loss": 1.2465, "step": 4270 }, { "epoch": 0.33403574494653865, "grad_norm": 48.11833190917969, "learning_rate": 9.996099366713306e-05, "loss": 1.1763, "step": 4280 }, { "epoch": 0.3348162022945446, "grad_norm": 2.812107563018799, "learning_rate": 9.995917990565333e-05, "loss": 1.1437, "step": 4290 }, { "epoch": 0.33559665964255053, "grad_norm": 14.357457160949707, "learning_rate": 9.995732494501505e-05, "loss": 1.9292, "step": 4300 }, { "epoch": 0.33637711699055645, "grad_norm": 25.811626434326172, "learning_rate": 9.995542878674789e-05, "loss": 0.8213, "step": 4310 }, { "epoch": 0.3371575743385624, "grad_norm": 55.16279602050781, "learning_rate": 9.995349143241552e-05, "loss": 1.6721, "step": 4320 }, { "epoch": 0.33793803168656833, "grad_norm": 0.0017165453173220158, "learning_rate": 9.995151288361564e-05, "loss": 0.5614, "step": 4330 }, { "epoch": 0.33871848903457424, "grad_norm": 0.8089917898178101, "learning_rate": 9.994949314197985e-05, "loss": 0.4937, "step": 4340 }, { "epoch": 0.3394989463825802, "grad_norm": 6.39279317855835, "learning_rate": 9.994743220917372e-05, "loss": 1.6623, "step": 4350 }, { "epoch": 0.3402794037305861, "grad_norm": 13.513890266418457, "learning_rate": 9.994533008689684e-05, "loss": 2.12, "step": 4360 }, { "epoch": 0.34105986107859204, "grad_norm": 2.0500733852386475, "learning_rate": 9.994318677688273e-05, "loss": 0.3837, "step": 4370 }, { "epoch": 0.341840318426598, "grad_norm": 40.21711349487305, "learning_rate": 9.994100228089888e-05, "loss": 1.4057, "step": 4380 }, { "epoch": 0.3426207757746039, "grad_norm": 59.82537078857422, "learning_rate": 9.993877660074675e-05, "loss": 0.922, "step": 4390 }, { "epoch": 0.34340123312260984, "grad_norm": 30.938413619995117, "learning_rate": 9.993650973826176e-05, "loss": 1.4275, "step": 4400 }, { "epoch": 0.34418169047061575, "grad_norm": 0.035656414926052094, "learning_rate": 9.99342016953133e-05, "loss": 0.4097, "step": 4410 }, { "epoch": 0.3449621478186217, "grad_norm": 9.82229232788086, "learning_rate": 9.993185247380471e-05, "loss": 0.7342, "step": 4420 }, { "epoch": 0.34574260516662764, "grad_norm": 8.707996368408203, "learning_rate": 9.992946207567328e-05, "loss": 0.6301, "step": 4430 }, { "epoch": 0.34652306251463355, "grad_norm": 6.594290733337402, "learning_rate": 9.99270305028903e-05, "loss": 0.9255, "step": 4440 }, { "epoch": 0.3473035198626395, "grad_norm": 55.64490509033203, "learning_rate": 9.992455775746092e-05, "loss": 2.7586, "step": 4450 }, { "epoch": 0.34808397721064543, "grad_norm": 18.06779670715332, "learning_rate": 9.992204384142438e-05, "loss": 0.9131, "step": 4460 }, { "epoch": 0.34886443455865135, "grad_norm": 5.652497291564941, "learning_rate": 9.991948875685375e-05, "loss": 0.7655, "step": 4470 }, { "epoch": 0.3496448919066573, "grad_norm": 48.18488693237305, "learning_rate": 9.99168925058561e-05, "loss": 0.9117, "step": 4480 }, { "epoch": 0.35042534925466323, "grad_norm": 25.798969268798828, "learning_rate": 9.991425509057245e-05, "loss": 1.0564, "step": 4490 }, { "epoch": 0.35120580660266915, "grad_norm": 1.1699587106704712, "learning_rate": 9.991157651317777e-05, "loss": 0.3099, "step": 4500 }, { "epoch": 0.3519862639506751, "grad_norm": 44.54468536376953, "learning_rate": 9.990885677588096e-05, "loss": 0.9814, "step": 4510 }, { "epoch": 0.35276672129868103, "grad_norm": 28.770235061645508, "learning_rate": 9.990609588092488e-05, "loss": 0.9127, "step": 4520 }, { "epoch": 0.35354717864668694, "grad_norm": 3.9841158390045166, "learning_rate": 9.990329383058628e-05, "loss": 2.0077, "step": 4530 }, { "epoch": 0.3543276359946929, "grad_norm": 32.356719970703125, "learning_rate": 9.990045062717594e-05, "loss": 1.0274, "step": 4540 }, { "epoch": 0.3551080933426988, "grad_norm": 1.2889198064804077, "learning_rate": 9.98975662730385e-05, "loss": 0.7582, "step": 4550 }, { "epoch": 0.35588855069070474, "grad_norm": 49.0152473449707, "learning_rate": 9.989464077055257e-05, "loss": 0.5338, "step": 4560 }, { "epoch": 0.3566690080387107, "grad_norm": 3.9910778999328613, "learning_rate": 9.989167412213067e-05, "loss": 1.8175, "step": 4570 }, { "epoch": 0.3574494653867166, "grad_norm": 11.524066925048828, "learning_rate": 9.988866633021928e-05, "loss": 0.5534, "step": 4580 }, { "epoch": 0.35822992273472254, "grad_norm": 76.08323669433594, "learning_rate": 9.988561739729878e-05, "loss": 0.8186, "step": 4590 }, { "epoch": 0.35901038008272845, "grad_norm": 15.860508918762207, "learning_rate": 9.988252732588351e-05, "loss": 0.3079, "step": 4600 }, { "epoch": 0.3597908374307344, "grad_norm": 0.6550878882408142, "learning_rate": 9.98793961185217e-05, "loss": 0.6749, "step": 4610 }, { "epoch": 0.36057129477874034, "grad_norm": 11.891127586364746, "learning_rate": 9.987622377779555e-05, "loss": 0.8806, "step": 4620 }, { "epoch": 0.36135175212674625, "grad_norm": 0.2516235113143921, "learning_rate": 9.98730103063211e-05, "loss": 1.2591, "step": 4630 }, { "epoch": 0.3621322094747522, "grad_norm": 0.00010328528878744692, "learning_rate": 9.986975570674842e-05, "loss": 0.9852, "step": 4640 }, { "epoch": 0.36291266682275813, "grad_norm": 2.1215724732570607e-09, "learning_rate": 9.986645998176139e-05, "loss": 0.4795, "step": 4650 }, { "epoch": 0.36369312417076405, "grad_norm": 17.579479217529297, "learning_rate": 9.986312313407787e-05, "loss": 0.6918, "step": 4660 }, { "epoch": 0.36447358151877, "grad_norm": 66.99678802490234, "learning_rate": 9.985974516644962e-05, "loss": 0.4536, "step": 4670 }, { "epoch": 0.36525403886677593, "grad_norm": 0.025410788133740425, "learning_rate": 9.985632608166228e-05, "loss": 0.5175, "step": 4680 }, { "epoch": 0.36603449621478185, "grad_norm": 49.9116325378418, "learning_rate": 9.985286588253544e-05, "loss": 1.2667, "step": 4690 }, { "epoch": 0.3668149535627878, "grad_norm": 1.777963638305664, "learning_rate": 9.984936457192256e-05, "loss": 0.7312, "step": 4700 }, { "epoch": 0.36759541091079373, "grad_norm": 0.00023074759519658983, "learning_rate": 9.984582215271103e-05, "loss": 0.1217, "step": 4710 }, { "epoch": 0.36837586825879964, "grad_norm": 0.7179341316223145, "learning_rate": 9.984223862782214e-05, "loss": 3.252, "step": 4720 }, { "epoch": 0.3691563256068056, "grad_norm": 2.1000545530114323e-05, "learning_rate": 9.983861400021104e-05, "loss": 0.9837, "step": 4730 }, { "epoch": 0.3699367829548115, "grad_norm": 0.0001699527056189254, "learning_rate": 9.983494827286681e-05, "loss": 0.2548, "step": 4740 }, { "epoch": 0.37071724030281744, "grad_norm": 64.94124603271484, "learning_rate": 9.983124144881244e-05, "loss": 1.004, "step": 4750 }, { "epoch": 0.37149769765082336, "grad_norm": 4.1760454177856445, "learning_rate": 9.982749353110474e-05, "loss": 0.4211, "step": 4760 }, { "epoch": 0.3722781549988293, "grad_norm": 1.8572329281596467e-05, "learning_rate": 9.98237045228345e-05, "loss": 1.5099, "step": 4770 }, { "epoch": 0.37305861234683524, "grad_norm": 3.898281875081011e-07, "learning_rate": 9.981987442712633e-05, "loss": 0.9033, "step": 4780 }, { "epoch": 0.37383906969484115, "grad_norm": 0.23063763976097107, "learning_rate": 9.981600324713873e-05, "loss": 1.7242, "step": 4790 }, { "epoch": 0.3746195270428471, "grad_norm": 0.015200862661004066, "learning_rate": 9.981209098606413e-05, "loss": 0.8823, "step": 4800 }, { "epoch": 0.37539998439085304, "grad_norm": 0.010344978421926498, "learning_rate": 9.980813764712878e-05, "loss": 2.0015, "step": 4810 }, { "epoch": 0.37618044173885895, "grad_norm": 73.27796173095703, "learning_rate": 9.98041432335928e-05, "loss": 0.8186, "step": 4820 }, { "epoch": 0.3769608990868649, "grad_norm": 0.0006082833278924227, "learning_rate": 9.980010774875024e-05, "loss": 0.3812, "step": 4830 }, { "epoch": 0.37774135643487083, "grad_norm": 3.770277908188291e-05, "learning_rate": 9.9796031195929e-05, "loss": 1.0389, "step": 4840 }, { "epoch": 0.37852181378287675, "grad_norm": 104.53764343261719, "learning_rate": 9.97919135784908e-05, "loss": 1.5568, "step": 4850 }, { "epoch": 0.3793022711308827, "grad_norm": 3.955908045583101e-09, "learning_rate": 9.97877548998313e-05, "loss": 0.7482, "step": 4860 }, { "epoch": 0.38008272847888863, "grad_norm": 1.5290652513504028, "learning_rate": 9.978355516337994e-05, "loss": 1.2799, "step": 4870 }, { "epoch": 0.38086318582689455, "grad_norm": 60.555572509765625, "learning_rate": 9.977931437260009e-05, "loss": 0.9653, "step": 4880 }, { "epoch": 0.3816436431749005, "grad_norm": 2.333822250366211, "learning_rate": 9.977503253098892e-05, "loss": 1.9994, "step": 4890 }, { "epoch": 0.38242410052290643, "grad_norm": 0.04746650159358978, "learning_rate": 9.97707096420775e-05, "loss": 0.0984, "step": 4900 }, { "epoch": 0.38320455787091234, "grad_norm": 0.16147121787071228, "learning_rate": 9.976634570943071e-05, "loss": 1.2237, "step": 4910 }, { "epoch": 0.3839850152189183, "grad_norm": 0.08274467289447784, "learning_rate": 9.976194073664732e-05, "loss": 2.1904, "step": 4920 }, { "epoch": 0.3847654725669242, "grad_norm": 82.22650146484375, "learning_rate": 9.97574947273599e-05, "loss": 0.466, "step": 4930 }, { "epoch": 0.38554592991493014, "grad_norm": 24.859355926513672, "learning_rate": 9.975300768523487e-05, "loss": 0.3493, "step": 4940 }, { "epoch": 0.38632638726293606, "grad_norm": 1.2780405282974243, "learning_rate": 9.974847961397253e-05, "loss": 2.8073, "step": 4950 }, { "epoch": 0.387106844610942, "grad_norm": 0.04860816150903702, "learning_rate": 9.974391051730696e-05, "loss": 0.5481, "step": 4960 }, { "epoch": 0.38788730195894794, "grad_norm": 3.8242519622144755e-06, "learning_rate": 9.97393003990061e-05, "loss": 1.5707, "step": 4970 }, { "epoch": 0.38866775930695385, "grad_norm": 2.876926898956299, "learning_rate": 9.973464926287173e-05, "loss": 0.6285, "step": 4980 }, { "epoch": 0.3894482166549598, "grad_norm": 86.59673309326172, "learning_rate": 9.97299571127394e-05, "loss": 1.2899, "step": 4990 }, { "epoch": 0.39022867400296574, "grad_norm": 2.2732436656951904, "learning_rate": 9.972522395247855e-05, "loss": 0.898, "step": 5000 }, { "epoch": 0.39100913135097165, "grad_norm": 0.33496013283729553, "learning_rate": 9.972044978599241e-05, "loss": 0.3816, "step": 5010 }, { "epoch": 0.3917895886989776, "grad_norm": 46.32811737060547, "learning_rate": 9.971563461721802e-05, "loss": 2.3764, "step": 5020 }, { "epoch": 0.39257004604698353, "grad_norm": 81.04834747314453, "learning_rate": 9.971077845012625e-05, "loss": 1.0033, "step": 5030 }, { "epoch": 0.39335050339498945, "grad_norm": 73.85846710205078, "learning_rate": 9.970588128872176e-05, "loss": 0.4376, "step": 5040 }, { "epoch": 0.3941309607429954, "grad_norm": 8.540753697161563e-06, "learning_rate": 9.970094313704303e-05, "loss": 3.2187, "step": 5050 }, { "epoch": 0.39491141809100133, "grad_norm": 2.9623231887817383, "learning_rate": 9.969596399916233e-05, "loss": 0.7573, "step": 5060 }, { "epoch": 0.39569187543900725, "grad_norm": 58.8137321472168, "learning_rate": 9.969094387918574e-05, "loss": 0.5387, "step": 5070 }, { "epoch": 0.3964723327870132, "grad_norm": 52.218223571777344, "learning_rate": 9.968588278125315e-05, "loss": 3.7089, "step": 5080 }, { "epoch": 0.39725279013501913, "grad_norm": 3.2659690380096436, "learning_rate": 9.968078070953822e-05, "loss": 0.3519, "step": 5090 }, { "epoch": 0.39803324748302504, "grad_norm": 0.0405220165848732, "learning_rate": 9.967563766824839e-05, "loss": 0.6423, "step": 5100 }, { "epoch": 0.39881370483103096, "grad_norm": 77.0758056640625, "learning_rate": 9.967045366162491e-05, "loss": 1.3186, "step": 5110 }, { "epoch": 0.3995941621790369, "grad_norm": 0.002084170002490282, "learning_rate": 9.966522869394281e-05, "loss": 1.0188, "step": 5120 }, { "epoch": 0.40037461952704284, "grad_norm": 20.08683204650879, "learning_rate": 9.96599627695109e-05, "loss": 2.0177, "step": 5130 }, { "epoch": 0.40115507687504876, "grad_norm": 4.911335418000817e-05, "learning_rate": 9.965465589267175e-05, "loss": 0.4571, "step": 5140 }, { "epoch": 0.4019355342230547, "grad_norm": 0.00017713586566969752, "learning_rate": 9.96493080678017e-05, "loss": 0.5754, "step": 5150 }, { "epoch": 0.40271599157106064, "grad_norm": 8.778138160705566, "learning_rate": 9.964391929931089e-05, "loss": 0.2, "step": 5160 }, { "epoch": 0.40349644891906655, "grad_norm": 7.931397205673818e-10, "learning_rate": 9.96384895916432e-05, "loss": 1.9417, "step": 5170 }, { "epoch": 0.4042769062670725, "grad_norm": 46.630882263183594, "learning_rate": 9.963301894927623e-05, "loss": 1.6278, "step": 5180 }, { "epoch": 0.40505736361507844, "grad_norm": 0.9577981233596802, "learning_rate": 9.962750737672143e-05, "loss": 1.0923, "step": 5190 }, { "epoch": 0.40583782096308435, "grad_norm": 10.732783317565918, "learning_rate": 9.962195487852395e-05, "loss": 0.7055, "step": 5200 }, { "epoch": 0.4066182783110903, "grad_norm": 9.7693775291674e-10, "learning_rate": 9.961636145926267e-05, "loss": 0.0712, "step": 5210 }, { "epoch": 0.40739873565909623, "grad_norm": 25.384599685668945, "learning_rate": 9.961072712355026e-05, "loss": 0.6719, "step": 5220 }, { "epoch": 0.40817919300710215, "grad_norm": 88.48016357421875, "learning_rate": 9.960505187603308e-05, "loss": 0.5012, "step": 5230 }, { "epoch": 0.4089596503551081, "grad_norm": 3.6306564421551e-09, "learning_rate": 9.959933572139131e-05, "loss": 4.2087, "step": 5240 }, { "epoch": 0.40974010770311403, "grad_norm": 2.399672031402588, "learning_rate": 9.959357866433876e-05, "loss": 0.4929, "step": 5250 }, { "epoch": 0.41052056505111995, "grad_norm": 14.656164169311523, "learning_rate": 9.958778070962307e-05, "loss": 1.2333, "step": 5260 }, { "epoch": 0.4113010223991259, "grad_norm": 2.2186574935913086, "learning_rate": 9.958194186202551e-05, "loss": 2.2942, "step": 5270 }, { "epoch": 0.41208147974713183, "grad_norm": 2.9925684928894043, "learning_rate": 9.957606212636117e-05, "loss": 0.7031, "step": 5280 }, { "epoch": 0.41286193709513774, "grad_norm": 81.13506317138672, "learning_rate": 9.95701415074788e-05, "loss": 1.2966, "step": 5290 }, { "epoch": 0.41364239444314366, "grad_norm": 0.000894436496309936, "learning_rate": 9.956418001026087e-05, "loss": 0.2619, "step": 5300 }, { "epoch": 0.4144228517911496, "grad_norm": 2.2047314643859863, "learning_rate": 9.955817763962356e-05, "loss": 0.7434, "step": 5310 }, { "epoch": 0.41520330913915554, "grad_norm": 0.00014062505215406418, "learning_rate": 9.955213440051676e-05, "loss": 0.0787, "step": 5320 }, { "epoch": 0.41598376648716145, "grad_norm": 7.172604000516003e-06, "learning_rate": 9.95460502979241e-05, "loss": 2.4076, "step": 5330 }, { "epoch": 0.4167642238351674, "grad_norm": 1.4012911319732666, "learning_rate": 9.953992533686282e-05, "loss": 0.7168, "step": 5340 }, { "epoch": 0.41754468118317334, "grad_norm": 6.324690818786621, "learning_rate": 9.953375952238398e-05, "loss": 2.9089, "step": 5350 }, { "epoch": 0.41832513853117925, "grad_norm": 10.851876258850098, "learning_rate": 9.952755285957218e-05, "loss": 0.1223, "step": 5360 }, { "epoch": 0.4191055958791852, "grad_norm": 61.75676345825195, "learning_rate": 9.952130535354585e-05, "loss": 1.1446, "step": 5370 }, { "epoch": 0.41988605322719114, "grad_norm": 6.387716293334961, "learning_rate": 9.9515017009457e-05, "loss": 4.3004, "step": 5380 }, { "epoch": 0.42066651057519705, "grad_norm": 3.521226644515991, "learning_rate": 9.950868783249136e-05, "loss": 0.4539, "step": 5390 }, { "epoch": 0.421446967923203, "grad_norm": 76.65003967285156, "learning_rate": 9.950231782786832e-05, "loss": 1.3463, "step": 5400 }, { "epoch": 0.42222742527120893, "grad_norm": 58.46784973144531, "learning_rate": 9.949590700084096e-05, "loss": 1.5774, "step": 5410 }, { "epoch": 0.42300788261921485, "grad_norm": 29.42442512512207, "learning_rate": 9.9489455356696e-05, "loss": 1.471, "step": 5420 }, { "epoch": 0.4237883399672208, "grad_norm": 49.83870315551758, "learning_rate": 9.948296290075385e-05, "loss": 0.7125, "step": 5430 }, { "epoch": 0.42456879731522673, "grad_norm": 8.928007125854492, "learning_rate": 9.947642963836852e-05, "loss": 0.4729, "step": 5440 }, { "epoch": 0.42534925466323265, "grad_norm": 0.006001488771289587, "learning_rate": 9.946985557492774e-05, "loss": 0.8174, "step": 5450 }, { "epoch": 0.42612971201123856, "grad_norm": 3.2906408309936523, "learning_rate": 9.946324071585283e-05, "loss": 0.7242, "step": 5460 }, { "epoch": 0.42691016935924453, "grad_norm": 5.074894428253174, "learning_rate": 9.945658506659878e-05, "loss": 1.5351, "step": 5470 }, { "epoch": 0.42769062670725044, "grad_norm": 0.013566683046519756, "learning_rate": 9.944988863265423e-05, "loss": 0.3799, "step": 5480 }, { "epoch": 0.42847108405525636, "grad_norm": 0.2592769265174866, "learning_rate": 9.944315141954142e-05, "loss": 0.3326, "step": 5490 }, { "epoch": 0.4292515414032623, "grad_norm": 56.55481719970703, "learning_rate": 9.943637343281623e-05, "loss": 0.5757, "step": 5500 }, { "epoch": 0.43003199875126824, "grad_norm": 52.10049819946289, "learning_rate": 9.94295546780682e-05, "loss": 0.3103, "step": 5510 }, { "epoch": 0.43081245609927415, "grad_norm": 0.00042787150596268475, "learning_rate": 9.942269516092043e-05, "loss": 0.3928, "step": 5520 }, { "epoch": 0.4315929134472801, "grad_norm": 3.384069771072973e-07, "learning_rate": 9.941579488702968e-05, "loss": 2.6701, "step": 5530 }, { "epoch": 0.43237337079528604, "grad_norm": 69.2872085571289, "learning_rate": 9.940885386208631e-05, "loss": 0.7537, "step": 5540 }, { "epoch": 0.43315382814329195, "grad_norm": 2.248446464538574, "learning_rate": 9.940187209181427e-05, "loss": 0.0384, "step": 5550 }, { "epoch": 0.4339342854912979, "grad_norm": 0.11496047675609589, "learning_rate": 9.939484958197112e-05, "loss": 0.8702, "step": 5560 }, { "epoch": 0.43471474283930384, "grad_norm": 83.21031188964844, "learning_rate": 9.938778633834802e-05, "loss": 2.4813, "step": 5570 }, { "epoch": 0.43549520018730975, "grad_norm": 40.44232177734375, "learning_rate": 9.938068236676974e-05, "loss": 1.668, "step": 5580 }, { "epoch": 0.4362756575353157, "grad_norm": 52.226715087890625, "learning_rate": 9.937353767309458e-05, "loss": 2.4203, "step": 5590 }, { "epoch": 0.43705611488332163, "grad_norm": 93.075439453125, "learning_rate": 9.936635226321447e-05, "loss": 0.6886, "step": 5600 }, { "epoch": 0.43783657223132755, "grad_norm": 0.0370655320584774, "learning_rate": 9.935912614305493e-05, "loss": 1.5385, "step": 5610 }, { "epoch": 0.43861702957933346, "grad_norm": 50.700130462646484, "learning_rate": 9.935185931857499e-05, "loss": 1.234, "step": 5620 }, { "epoch": 0.43939748692733943, "grad_norm": 16.66071891784668, "learning_rate": 9.93445517957673e-05, "loss": 0.2529, "step": 5630 }, { "epoch": 0.44017794427534535, "grad_norm": 0.025942066684365273, "learning_rate": 9.933720358065807e-05, "loss": 0.9281, "step": 5640 }, { "epoch": 0.44095840162335126, "grad_norm": 0.00847440306097269, "learning_rate": 9.932981467930702e-05, "loss": 1.5612, "step": 5650 }, { "epoch": 0.44173885897135723, "grad_norm": 1.0681140422821045, "learning_rate": 9.932238509780749e-05, "loss": 1.734, "step": 5660 }, { "epoch": 0.44251931631936314, "grad_norm": 56.31792449951172, "learning_rate": 9.93149148422863e-05, "loss": 2.0712, "step": 5670 }, { "epoch": 0.44329977366736906, "grad_norm": 1.9309101104736328, "learning_rate": 9.930740391890386e-05, "loss": 1.5979, "step": 5680 }, { "epoch": 0.444080231015375, "grad_norm": 61.22231674194336, "learning_rate": 9.92998523338541e-05, "loss": 1.4279, "step": 5690 }, { "epoch": 0.44486068836338094, "grad_norm": 86.64026641845703, "learning_rate": 9.92922600933645e-05, "loss": 0.7382, "step": 5700 }, { "epoch": 0.44564114571138685, "grad_norm": 4.9843316078186035, "learning_rate": 9.928462720369602e-05, "loss": 2.5206, "step": 5710 }, { "epoch": 0.4464216030593928, "grad_norm": 17.059606552124023, "learning_rate": 9.927695367114318e-05, "loss": 1.4367, "step": 5720 }, { "epoch": 0.44720206040739874, "grad_norm": 1.7653934955596924, "learning_rate": 9.926923950203402e-05, "loss": 0.8985, "step": 5730 }, { "epoch": 0.44798251775540465, "grad_norm": 32.64850997924805, "learning_rate": 9.926148470273007e-05, "loss": 0.8873, "step": 5740 }, { "epoch": 0.4487629751034106, "grad_norm": 56.86506271362305, "learning_rate": 9.925368927962637e-05, "loss": 2.0438, "step": 5750 }, { "epoch": 0.44954343245141654, "grad_norm": 53.06357192993164, "learning_rate": 9.924585323915148e-05, "loss": 0.9069, "step": 5760 }, { "epoch": 0.45032388979942245, "grad_norm": 0.1541537493467331, "learning_rate": 9.923797658776744e-05, "loss": 0.9701, "step": 5770 }, { "epoch": 0.4511043471474284, "grad_norm": 55.30256271362305, "learning_rate": 9.923005933196977e-05, "loss": 0.752, "step": 5780 }, { "epoch": 0.45188480449543433, "grad_norm": 12.553262710571289, "learning_rate": 9.922210147828749e-05, "loss": 0.5744, "step": 5790 }, { "epoch": 0.45266526184344025, "grad_norm": 56.211185455322266, "learning_rate": 9.92141030332831e-05, "loss": 2.1868, "step": 5800 }, { "epoch": 0.45344571919144616, "grad_norm": 44.826934814453125, "learning_rate": 9.920606400355255e-05, "loss": 0.6291, "step": 5810 }, { "epoch": 0.45422617653945213, "grad_norm": 0.347988098859787, "learning_rate": 9.91979843957253e-05, "loss": 1.1644, "step": 5820 }, { "epoch": 0.45500663388745805, "grad_norm": 41.838741302490234, "learning_rate": 9.918986421646425e-05, "loss": 1.897, "step": 5830 }, { "epoch": 0.45578709123546396, "grad_norm": 0.07808752357959747, "learning_rate": 9.918170347246574e-05, "loss": 0.6393, "step": 5840 }, { "epoch": 0.45656754858346993, "grad_norm": 24.800209045410156, "learning_rate": 9.91735021704596e-05, "loss": 0.6908, "step": 5850 }, { "epoch": 0.45734800593147584, "grad_norm": 28.077816009521484, "learning_rate": 9.916526031720908e-05, "loss": 0.6714, "step": 5860 }, { "epoch": 0.45812846327948176, "grad_norm": 64.07998657226562, "learning_rate": 9.915697791951086e-05, "loss": 0.8563, "step": 5870 }, { "epoch": 0.4589089206274877, "grad_norm": 0.026845192536711693, "learning_rate": 9.91486549841951e-05, "loss": 1.9563, "step": 5880 }, { "epoch": 0.45968937797549364, "grad_norm": 37.07597351074219, "learning_rate": 9.914029151812534e-05, "loss": 0.4418, "step": 5890 }, { "epoch": 0.46046983532349955, "grad_norm": 1.552006483078003, "learning_rate": 9.913188752819857e-05, "loss": 1.8555, "step": 5900 }, { "epoch": 0.4612502926715055, "grad_norm": 66.1205062866211, "learning_rate": 9.912344302134521e-05, "loss": 1.4313, "step": 5910 }, { "epoch": 0.46203075001951144, "grad_norm": 4.578004360198975, "learning_rate": 9.911495800452906e-05, "loss": 2.7846, "step": 5920 }, { "epoch": 0.46281120736751735, "grad_norm": 37.89592742919922, "learning_rate": 9.910643248474735e-05, "loss": 0.3734, "step": 5930 }, { "epoch": 0.4635916647155233, "grad_norm": 8.450699533568695e-05, "learning_rate": 9.909786646903072e-05, "loss": 0.4354, "step": 5940 }, { "epoch": 0.46437212206352924, "grad_norm": 4.293805977795273e-05, "learning_rate": 9.908925996444316e-05, "loss": 0.3882, "step": 5950 }, { "epoch": 0.46515257941153515, "grad_norm": 1.9179746857389546e-07, "learning_rate": 9.908061297808209e-05, "loss": 1.4029, "step": 5960 }, { "epoch": 0.46593303675954106, "grad_norm": 0.0844622254371643, "learning_rate": 9.907192551707831e-05, "loss": 2.5749, "step": 5970 }, { "epoch": 0.46671349410754703, "grad_norm": 17.458057403564453, "learning_rate": 9.9063197588596e-05, "loss": 1.4711, "step": 5980 }, { "epoch": 0.46749395145555295, "grad_norm": 2.408496856689453, "learning_rate": 9.905442919983266e-05, "loss": 0.2675, "step": 5990 }, { "epoch": 0.46827440880355886, "grad_norm": 38.14802932739258, "learning_rate": 9.904562035801924e-05, "loss": 0.9949, "step": 6000 }, { "epoch": 0.46905486615156483, "grad_norm": 0.0007477306644432247, "learning_rate": 9.903677107042e-05, "loss": 0.0908, "step": 6010 }, { "epoch": 0.46983532349957075, "grad_norm": 54.57261657714844, "learning_rate": 9.902788134433256e-05, "loss": 1.3682, "step": 6020 }, { "epoch": 0.47061578084757666, "grad_norm": 63.36696243286133, "learning_rate": 9.901895118708788e-05, "loss": 1.0471, "step": 6030 }, { "epoch": 0.47139623819558263, "grad_norm": 0.04630555212497711, "learning_rate": 9.900998060605027e-05, "loss": 0.111, "step": 6040 }, { "epoch": 0.47217669554358854, "grad_norm": 66.14763641357422, "learning_rate": 9.90009696086174e-05, "loss": 1.1918, "step": 6050 }, { "epoch": 0.47295715289159446, "grad_norm": 0.3311450779438019, "learning_rate": 9.899191820222022e-05, "loss": 0.7873, "step": 6060 }, { "epoch": 0.4737376102396004, "grad_norm": 52.740875244140625, "learning_rate": 9.898282639432306e-05, "loss": 1.5445, "step": 6070 }, { "epoch": 0.47451806758760634, "grad_norm": 28.408681869506836, "learning_rate": 9.89736941924235e-05, "loss": 1.5339, "step": 6080 }, { "epoch": 0.47529852493561225, "grad_norm": 0.33915793895721436, "learning_rate": 9.896452160405252e-05, "loss": 1.1844, "step": 6090 }, { "epoch": 0.4760789822836182, "grad_norm": 2.4481306076049805, "learning_rate": 9.895530863677429e-05, "loss": 0.3326, "step": 6100 }, { "epoch": 0.47685943963162414, "grad_norm": 9.80547046661377, "learning_rate": 9.89460552981864e-05, "loss": 0.6921, "step": 6110 }, { "epoch": 0.47763989697963005, "grad_norm": 3.3799116749833047e-07, "learning_rate": 9.893676159591963e-05, "loss": 0.5425, "step": 6120 }, { "epoch": 0.478420354327636, "grad_norm": 1.6407069836077426e-07, "learning_rate": 9.892742753763813e-05, "loss": 0.0921, "step": 6130 }, { "epoch": 0.47920081167564194, "grad_norm": 37.7635612487793, "learning_rate": 9.891805313103927e-05, "loss": 0.3376, "step": 6140 }, { "epoch": 0.47998126902364785, "grad_norm": 0.0007379294838756323, "learning_rate": 9.890863838385371e-05, "loss": 1.2239, "step": 6150 }, { "epoch": 0.48076172637165376, "grad_norm": 1.5424108505249023, "learning_rate": 9.88991833038454e-05, "loss": 0.969, "step": 6160 }, { "epoch": 0.48154218371965973, "grad_norm": 1.8169808413404098e-07, "learning_rate": 9.888968789881152e-05, "loss": 1.5421, "step": 6170 }, { "epoch": 0.48232264106766565, "grad_norm": 0.4752025604248047, "learning_rate": 9.888015217658251e-05, "loss": 0.2779, "step": 6180 }, { "epoch": 0.48310309841567156, "grad_norm": 0.13063034415245056, "learning_rate": 9.887057614502208e-05, "loss": 3.2541, "step": 6190 }, { "epoch": 0.48388355576367753, "grad_norm": 0.24971577525138855, "learning_rate": 9.886095981202715e-05, "loss": 1.0007, "step": 6200 }, { "epoch": 0.48466401311168344, "grad_norm": 0.011976394802331924, "learning_rate": 9.88513031855279e-05, "loss": 0.0139, "step": 6210 }, { "epoch": 0.48544447045968936, "grad_norm": 52.369041442871094, "learning_rate": 9.884160627348775e-05, "loss": 2.0091, "step": 6220 }, { "epoch": 0.48622492780769533, "grad_norm": 0.3506132662296295, "learning_rate": 9.883186908390329e-05, "loss": 0.4435, "step": 6230 }, { "epoch": 0.48700538515570124, "grad_norm": 4.650276184082031, "learning_rate": 9.882209162480437e-05, "loss": 1.6357, "step": 6240 }, { "epoch": 0.48778584250370716, "grad_norm": 56.31660461425781, "learning_rate": 9.881227390425403e-05, "loss": 0.9129, "step": 6250 }, { "epoch": 0.4885662998517131, "grad_norm": 2.010699987411499, "learning_rate": 9.88024159303485e-05, "loss": 0.9621, "step": 6260 }, { "epoch": 0.48934675719971904, "grad_norm": 0.18086794018745422, "learning_rate": 9.879251771121725e-05, "loss": 1.5871, "step": 6270 }, { "epoch": 0.49012721454772495, "grad_norm": 3.91213321685791, "learning_rate": 9.87825792550229e-05, "loss": 2.2983, "step": 6280 }, { "epoch": 0.4909076718957309, "grad_norm": 51.183929443359375, "learning_rate": 9.877260056996126e-05, "loss": 0.8454, "step": 6290 }, { "epoch": 0.49168812924373684, "grad_norm": 34.326629638671875, "learning_rate": 9.876258166426131e-05, "loss": 1.191, "step": 6300 }, { "epoch": 0.49246858659174275, "grad_norm": 1.0975525379180908, "learning_rate": 9.875252254618523e-05, "loss": 0.2663, "step": 6310 }, { "epoch": 0.49324904393974867, "grad_norm": 3.1184887886047363, "learning_rate": 9.87424232240283e-05, "loss": 1.3323, "step": 6320 }, { "epoch": 0.49402950128775464, "grad_norm": 8.74647331237793, "learning_rate": 9.873228370611903e-05, "loss": 0.4933, "step": 6330 }, { "epoch": 0.49480995863576055, "grad_norm": 51.158206939697266, "learning_rate": 9.872210400081898e-05, "loss": 1.4282, "step": 6340 }, { "epoch": 0.49559041598376646, "grad_norm": 10.619514465332031, "learning_rate": 9.871188411652296e-05, "loss": 0.8554, "step": 6350 }, { "epoch": 0.49637087333177243, "grad_norm": 34.76881790161133, "learning_rate": 9.870162406165888e-05, "loss": 0.8635, "step": 6360 }, { "epoch": 0.49715133067977835, "grad_norm": 0.014710797928273678, "learning_rate": 9.86913238446877e-05, "loss": 1.0073, "step": 6370 }, { "epoch": 0.49793178802778426, "grad_norm": 0.00262262555770576, "learning_rate": 9.86809834741036e-05, "loss": 0.1259, "step": 6380 }, { "epoch": 0.49871224537579023, "grad_norm": 0.0012736758217215538, "learning_rate": 9.867060295843382e-05, "loss": 0.9914, "step": 6390 }, { "epoch": 0.49949270272379614, "grad_norm": 35.19121551513672, "learning_rate": 9.866018230623873e-05, "loss": 1.8764, "step": 6400 }, { "epoch": 0.5002731600718021, "grad_norm": 0.935328483581543, "learning_rate": 9.864972152611178e-05, "loss": 1.3374, "step": 6410 }, { "epoch": 0.501053617419808, "grad_norm": 23.448575973510742, "learning_rate": 9.863922062667951e-05, "loss": 1.4024, "step": 6420 }, { "epoch": 0.5018340747678139, "grad_norm": 6.185776233673096, "learning_rate": 9.862867961660157e-05, "loss": 0.4412, "step": 6430 }, { "epoch": 0.5026145321158199, "grad_norm": 0.019975431263446808, "learning_rate": 9.861809850457065e-05, "loss": 0.2771, "step": 6440 }, { "epoch": 0.5033949894638258, "grad_norm": 3.049410820007324, "learning_rate": 9.860747729931257e-05, "loss": 0.9518, "step": 6450 }, { "epoch": 0.5041754468118317, "grad_norm": 28.222091674804688, "learning_rate": 9.859681600958614e-05, "loss": 1.3099, "step": 6460 }, { "epoch": 0.5049559041598377, "grad_norm": 0.0015314549673348665, "learning_rate": 9.858611464418327e-05, "loss": 0.3673, "step": 6470 }, { "epoch": 0.5057363615078436, "grad_norm": 0.2558833658695221, "learning_rate": 9.857537321192893e-05, "loss": 1.1245, "step": 6480 }, { "epoch": 0.5065168188558495, "grad_norm": 2.1790192127227783, "learning_rate": 9.856459172168108e-05, "loss": 0.5932, "step": 6490 }, { "epoch": 0.5072972762038555, "grad_norm": 51.02792739868164, "learning_rate": 9.855377018233076e-05, "loss": 0.6189, "step": 6500 }, { "epoch": 0.5080777335518614, "grad_norm": 54.68989181518555, "learning_rate": 9.854290860280203e-05, "loss": 2.7634, "step": 6510 }, { "epoch": 0.5088581908998673, "grad_norm": 3.412045478820801, "learning_rate": 9.853200699205194e-05, "loss": 0.7287, "step": 6520 }, { "epoch": 0.5096386482478733, "grad_norm": 83.18421173095703, "learning_rate": 9.85210653590706e-05, "loss": 1.2859, "step": 6530 }, { "epoch": 0.5104191055958792, "grad_norm": 1.2750853300094604, "learning_rate": 9.851008371288106e-05, "loss": 0.649, "step": 6540 }, { "epoch": 0.5111995629438851, "grad_norm": 2.0689053535461426, "learning_rate": 9.849906206253945e-05, "loss": 1.6541, "step": 6550 }, { "epoch": 0.511980020291891, "grad_norm": 58.269386291503906, "learning_rate": 9.848800041713482e-05, "loss": 1.5054, "step": 6560 }, { "epoch": 0.512760477639897, "grad_norm": 61.32803726196289, "learning_rate": 9.847689878578923e-05, "loss": 1.2377, "step": 6570 }, { "epoch": 0.5135409349879029, "grad_norm": 0.00047993057523854077, "learning_rate": 9.846575717765772e-05, "loss": 1.1732, "step": 6580 }, { "epoch": 0.5143213923359088, "grad_norm": 0.004535530228167772, "learning_rate": 9.845457560192827e-05, "loss": 1.0141, "step": 6590 }, { "epoch": 0.5151018496839148, "grad_norm": 0.3655521273612976, "learning_rate": 9.844335406782186e-05, "loss": 1.2912, "step": 6600 }, { "epoch": 0.5158823070319207, "grad_norm": 0.005310993175953627, "learning_rate": 9.843209258459237e-05, "loss": 0.2338, "step": 6610 }, { "epoch": 0.5166627643799266, "grad_norm": 1.9839240312576294, "learning_rate": 9.84207911615267e-05, "loss": 0.5871, "step": 6620 }, { "epoch": 0.5174432217279326, "grad_norm": 7.782805937495141e-07, "learning_rate": 9.84094498079446e-05, "loss": 0.4506, "step": 6630 }, { "epoch": 0.5182236790759385, "grad_norm": 54.680179595947266, "learning_rate": 9.83980685331988e-05, "loss": 1.7223, "step": 6640 }, { "epoch": 0.5190041364239444, "grad_norm": 43.05501937866211, "learning_rate": 9.838664734667495e-05, "loss": 0.7108, "step": 6650 }, { "epoch": 0.5197845937719504, "grad_norm": 0.022166762501001358, "learning_rate": 9.837518625779161e-05, "loss": 0.7489, "step": 6660 }, { "epoch": 0.5205650511199563, "grad_norm": 41.44973373413086, "learning_rate": 9.836368527600022e-05, "loss": 0.7476, "step": 6670 }, { "epoch": 0.5213455084679622, "grad_norm": 44.777591705322266, "learning_rate": 9.835214441078515e-05, "loss": 1.1241, "step": 6680 }, { "epoch": 0.5221259658159682, "grad_norm": 39.23445510864258, "learning_rate": 9.834056367166365e-05, "loss": 1.8415, "step": 6690 }, { "epoch": 0.5229064231639741, "grad_norm": 30.00687026977539, "learning_rate": 9.832894306818585e-05, "loss": 0.5706, "step": 6700 }, { "epoch": 0.52368688051198, "grad_norm": 68.04293823242188, "learning_rate": 9.831728260993476e-05, "loss": 1.0121, "step": 6710 }, { "epoch": 0.524467337859986, "grad_norm": 4.57463264465332, "learning_rate": 9.830558230652624e-05, "loss": 0.3254, "step": 6720 }, { "epoch": 0.5252477952079919, "grad_norm": 18.793424606323242, "learning_rate": 9.829384216760904e-05, "loss": 2.0302, "step": 6730 }, { "epoch": 0.5260282525559978, "grad_norm": 7.409657001495361, "learning_rate": 9.828206220286472e-05, "loss": 1.2441, "step": 6740 }, { "epoch": 0.5268087099040037, "grad_norm": 48.33645248413086, "learning_rate": 9.827024242200771e-05, "loss": 1.3994, "step": 6750 }, { "epoch": 0.5275891672520097, "grad_norm": 56.547359466552734, "learning_rate": 9.825838283478528e-05, "loss": 0.8465, "step": 6760 }, { "epoch": 0.5283696246000156, "grad_norm": 0.01121858786791563, "learning_rate": 9.824648345097749e-05, "loss": 1.0637, "step": 6770 }, { "epoch": 0.5291500819480215, "grad_norm": 8.223366737365723, "learning_rate": 9.823454428039726e-05, "loss": 0.7012, "step": 6780 }, { "epoch": 0.5299305392960275, "grad_norm": 0.00867557991296053, "learning_rate": 9.822256533289032e-05, "loss": 0.8317, "step": 6790 }, { "epoch": 0.5307109966440334, "grad_norm": 0.0009431852377019823, "learning_rate": 9.821054661833516e-05, "loss": 0.7048, "step": 6800 }, { "epoch": 0.5314914539920393, "grad_norm": 1.0806217193603516, "learning_rate": 9.819848814664307e-05, "loss": 1.2247, "step": 6810 }, { "epoch": 0.5322719113400453, "grad_norm": 5.017229080200195, "learning_rate": 9.81863899277582e-05, "loss": 0.4701, "step": 6820 }, { "epoch": 0.5330523686880512, "grad_norm": 13.995375633239746, "learning_rate": 9.817425197165739e-05, "loss": 2.9128, "step": 6830 }, { "epoch": 0.5338328260360571, "grad_norm": 82.41301727294922, "learning_rate": 9.816207428835029e-05, "loss": 2.4041, "step": 6840 }, { "epoch": 0.5346132833840631, "grad_norm": 66.8274917602539, "learning_rate": 9.814985688787932e-05, "loss": 3.669, "step": 6850 }, { "epoch": 0.535393740732069, "grad_norm": 3.1551668643951416, "learning_rate": 9.813759978031962e-05, "loss": 0.5636, "step": 6860 }, { "epoch": 0.5361741980800749, "grad_norm": 20.27729606628418, "learning_rate": 9.812530297577908e-05, "loss": 1.534, "step": 6870 }, { "epoch": 0.5369546554280809, "grad_norm": 0.1851717084646225, "learning_rate": 9.811296648439837e-05, "loss": 0.9975, "step": 6880 }, { "epoch": 0.5377351127760868, "grad_norm": 38.749061584472656, "learning_rate": 9.810059031635084e-05, "loss": 1.0313, "step": 6890 }, { "epoch": 0.5385155701240927, "grad_norm": 24.485456466674805, "learning_rate": 9.808817448184258e-05, "loss": 1.5311, "step": 6900 }, { "epoch": 0.5392960274720987, "grad_norm": 0.004445152822881937, "learning_rate": 9.807571899111237e-05, "loss": 0.1711, "step": 6910 }, { "epoch": 0.5400764848201046, "grad_norm": 1.9483662843704224, "learning_rate": 9.806322385443174e-05, "loss": 0.1513, "step": 6920 }, { "epoch": 0.5408569421681105, "grad_norm": 0.0006264004623517394, "learning_rate": 9.805068908210488e-05, "loss": 2.5281, "step": 6930 }, { "epoch": 0.5416373995161164, "grad_norm": 53.3282356262207, "learning_rate": 9.803811468446864e-05, "loss": 1.6079, "step": 6940 }, { "epoch": 0.5424178568641224, "grad_norm": 19.502546310424805, "learning_rate": 9.802550067189263e-05, "loss": 0.1501, "step": 6950 }, { "epoch": 0.5431983142121283, "grad_norm": 20.476640701293945, "learning_rate": 9.801284705477902e-05, "loss": 0.4741, "step": 6960 }, { "epoch": 0.5439787715601342, "grad_norm": 48.1911506652832, "learning_rate": 9.800015384356271e-05, "loss": 1.6906, "step": 6970 }, { "epoch": 0.5447592289081402, "grad_norm": 0.8714758157730103, "learning_rate": 9.798742104871129e-05, "loss": 1.0196, "step": 6980 }, { "epoch": 0.5455396862561461, "grad_norm": 1.112245868739592e-07, "learning_rate": 9.797464868072488e-05, "loss": 0.0536, "step": 6990 }, { "epoch": 0.546320143604152, "grad_norm": 0.03219883516430855, "learning_rate": 9.796183675013632e-05, "loss": 0.523, "step": 7000 }, { "epoch": 0.547100600952158, "grad_norm": 6.496858172971853e-13, "learning_rate": 9.794898526751104e-05, "loss": 1.7537, "step": 7010 }, { "epoch": 0.5478810583001639, "grad_norm": 1.2731820042688469e-09, "learning_rate": 9.793609424344712e-05, "loss": 0.3894, "step": 7020 }, { "epoch": 0.5486615156481698, "grad_norm": 0.006216781213879585, "learning_rate": 9.792316368857519e-05, "loss": 1.6057, "step": 7030 }, { "epoch": 0.5494419729961758, "grad_norm": 1.2926510528643576e-08, "learning_rate": 9.791019361355855e-05, "loss": 5.085, "step": 7040 }, { "epoch": 0.5502224303441817, "grad_norm": 3.7267448902130127, "learning_rate": 9.789718402909305e-05, "loss": 1.3625, "step": 7050 }, { "epoch": 0.5510028876921876, "grad_norm": 25.583145141601562, "learning_rate": 9.788413494590711e-05, "loss": 1.5236, "step": 7060 }, { "epoch": 0.5517833450401936, "grad_norm": 0.00015216317842714489, "learning_rate": 9.787104637476175e-05, "loss": 1.4341, "step": 7070 }, { "epoch": 0.5525638023881995, "grad_norm": 47.065372467041016, "learning_rate": 9.785791832645055e-05, "loss": 0.1853, "step": 7080 }, { "epoch": 0.5533442597362054, "grad_norm": 0.002304552122950554, "learning_rate": 9.784475081179962e-05, "loss": 1.5718, "step": 7090 }, { "epoch": 0.5541247170842113, "grad_norm": 21.46728515625, "learning_rate": 9.783154384166766e-05, "loss": 1.6335, "step": 7100 }, { "epoch": 0.5549051744322173, "grad_norm": 423.6686096191406, "learning_rate": 9.781829742694588e-05, "loss": 1.1247, "step": 7110 }, { "epoch": 0.5556856317802232, "grad_norm": 10.277878761291504, "learning_rate": 9.780501157855801e-05, "loss": 0.4371, "step": 7120 }, { "epoch": 0.5564660891282291, "grad_norm": 7.607505949636106e-07, "learning_rate": 9.77916863074603e-05, "loss": 0.0516, "step": 7130 }, { "epoch": 0.5572465464762351, "grad_norm": 3.1203581940530967e-09, "learning_rate": 9.777832162464154e-05, "loss": 2.3446, "step": 7140 }, { "epoch": 0.558027003824241, "grad_norm": 10.839029312133789, "learning_rate": 9.776491754112299e-05, "loss": 0.958, "step": 7150 }, { "epoch": 0.5588074611722469, "grad_norm": 0.4206421375274658, "learning_rate": 9.775147406795841e-05, "loss": 1.2235, "step": 7160 }, { "epoch": 0.5595879185202529, "grad_norm": 0.00017085533181671053, "learning_rate": 9.773799121623408e-05, "loss": 0.4796, "step": 7170 }, { "epoch": 0.5603683758682588, "grad_norm": 26.582189559936523, "learning_rate": 9.772446899706868e-05, "loss": 0.6587, "step": 7180 }, { "epoch": 0.5611488332162647, "grad_norm": 42.50886917114258, "learning_rate": 9.771090742161342e-05, "loss": 1.4833, "step": 7190 }, { "epoch": 0.5619292905642707, "grad_norm": 50.29939270019531, "learning_rate": 9.769730650105191e-05, "loss": 1.683, "step": 7200 }, { "epoch": 0.5627097479122766, "grad_norm": 10.882280349731445, "learning_rate": 9.768366624660028e-05, "loss": 0.7569, "step": 7210 }, { "epoch": 0.5634902052602825, "grad_norm": 60.33913040161133, "learning_rate": 9.766998666950702e-05, "loss": 1.2821, "step": 7220 }, { "epoch": 0.5642706626082885, "grad_norm": 41.544944763183594, "learning_rate": 9.765626778105308e-05, "loss": 1.24, "step": 7230 }, { "epoch": 0.5650511199562944, "grad_norm": 13.627266883850098, "learning_rate": 9.764250959255186e-05, "loss": 0.624, "step": 7240 }, { "epoch": 0.5658315773043003, "grad_norm": 30.541210174560547, "learning_rate": 9.76287121153491e-05, "loss": 0.7803, "step": 7250 }, { "epoch": 0.5666120346523063, "grad_norm": 47.805694580078125, "learning_rate": 9.761487536082302e-05, "loss": 0.4765, "step": 7260 }, { "epoch": 0.5673924920003122, "grad_norm": 59.42711639404297, "learning_rate": 9.760099934038415e-05, "loss": 0.8472, "step": 7270 }, { "epoch": 0.5681729493483181, "grad_norm": 19.21670913696289, "learning_rate": 9.758708406547546e-05, "loss": 0.8136, "step": 7280 }, { "epoch": 0.568953406696324, "grad_norm": 0.024506118148565292, "learning_rate": 9.757312954757228e-05, "loss": 0.4705, "step": 7290 }, { "epoch": 0.56973386404433, "grad_norm": 13.569670677185059, "learning_rate": 9.755913579818226e-05, "loss": 0.8712, "step": 7300 }, { "epoch": 0.5705143213923359, "grad_norm": 46.79026794433594, "learning_rate": 9.754510282884546e-05, "loss": 1.7591, "step": 7310 }, { "epoch": 0.5712947787403418, "grad_norm": 0.011232595890760422, "learning_rate": 9.753103065113424e-05, "loss": 0.3608, "step": 7320 }, { "epoch": 0.5720752360883478, "grad_norm": 3.212200403213501, "learning_rate": 9.751691927665334e-05, "loss": 0.3634, "step": 7330 }, { "epoch": 0.5728556934363537, "grad_norm": 0.02611798420548439, "learning_rate": 9.750276871703979e-05, "loss": 0.9882, "step": 7340 }, { "epoch": 0.5736361507843596, "grad_norm": 40.64421081542969, "learning_rate": 9.74885789839629e-05, "loss": 0.4115, "step": 7350 }, { "epoch": 0.5744166081323656, "grad_norm": 0.035366058349609375, "learning_rate": 9.747435008912438e-05, "loss": 1.0184, "step": 7360 }, { "epoch": 0.5751970654803715, "grad_norm": 1.079415202140808, "learning_rate": 9.746008204425814e-05, "loss": 1.518, "step": 7370 }, { "epoch": 0.5759775228283774, "grad_norm": 48.33514404296875, "learning_rate": 9.744577486113042e-05, "loss": 1.0792, "step": 7380 }, { "epoch": 0.5767579801763834, "grad_norm": 46.865501403808594, "learning_rate": 9.743142855153976e-05, "loss": 1.2278, "step": 7390 }, { "epoch": 0.5775384375243893, "grad_norm": 0.035304367542266846, "learning_rate": 9.741704312731691e-05, "loss": 2.4778, "step": 7400 }, { "epoch": 0.5783188948723952, "grad_norm": 1.251697301864624, "learning_rate": 9.740261860032491e-05, "loss": 0.6881, "step": 7410 }, { "epoch": 0.5790993522204012, "grad_norm": 38.47404861450195, "learning_rate": 9.738815498245902e-05, "loss": 0.8292, "step": 7420 }, { "epoch": 0.5798798095684071, "grad_norm": 40.813621520996094, "learning_rate": 9.737365228564679e-05, "loss": 1.4995, "step": 7430 }, { "epoch": 0.580660266916413, "grad_norm": 36.29606246948242, "learning_rate": 9.735911052184794e-05, "loss": 0.9977, "step": 7440 }, { "epoch": 0.581440724264419, "grad_norm": 0.03261225298047066, "learning_rate": 9.734452970305443e-05, "loss": 0.5158, "step": 7450 }, { "epoch": 0.5822211816124249, "grad_norm": 2.965651273727417, "learning_rate": 9.732990984129042e-05, "loss": 0.605, "step": 7460 }, { "epoch": 0.5830016389604308, "grad_norm": 3.623199701309204, "learning_rate": 9.73152509486123e-05, "loss": 0.3894, "step": 7470 }, { "epoch": 0.5837820963084367, "grad_norm": 84.90872955322266, "learning_rate": 9.73005530371086e-05, "loss": 2.7016, "step": 7480 }, { "epoch": 0.5845625536564427, "grad_norm": 0.00025274348445236683, "learning_rate": 9.728581611890004e-05, "loss": 0.3299, "step": 7490 }, { "epoch": 0.5853430110044486, "grad_norm": 0.36983373761177063, "learning_rate": 9.727104020613954e-05, "loss": 0.4919, "step": 7500 }, { "epoch": 0.5861234683524545, "grad_norm": 0.40055277943611145, "learning_rate": 9.725622531101211e-05, "loss": 1.3109, "step": 7510 }, { "epoch": 0.5869039257004605, "grad_norm": 14.323739051818848, "learning_rate": 9.724137144573497e-05, "loss": 0.492, "step": 7520 }, { "epoch": 0.5876843830484664, "grad_norm": 52.50056076049805, "learning_rate": 9.722647862255749e-05, "loss": 1.8018, "step": 7530 }, { "epoch": 0.5884648403964723, "grad_norm": 0.0008027786388993263, "learning_rate": 9.721154685376109e-05, "loss": 0.6453, "step": 7540 }, { "epoch": 0.5892452977444783, "grad_norm": 14.831884384155273, "learning_rate": 9.719657615165934e-05, "loss": 0.8703, "step": 7550 }, { "epoch": 0.5900257550924842, "grad_norm": 65.61073303222656, "learning_rate": 9.718156652859795e-05, "loss": 0.4422, "step": 7560 }, { "epoch": 0.5908062124404901, "grad_norm": 47.878257751464844, "learning_rate": 9.716651799695471e-05, "loss": 0.406, "step": 7570 }, { "epoch": 0.5915866697884961, "grad_norm": 0.26020410656929016, "learning_rate": 9.715143056913947e-05, "loss": 0.2718, "step": 7580 }, { "epoch": 0.592367127136502, "grad_norm": 57.03702163696289, "learning_rate": 9.713630425759419e-05, "loss": 2.6524, "step": 7590 }, { "epoch": 0.5931475844845079, "grad_norm": 0.009130135178565979, "learning_rate": 9.712113907479285e-05, "loss": 0.9718, "step": 7600 }, { "epoch": 0.5939280418325138, "grad_norm": 0.03151361271739006, "learning_rate": 9.710593503324155e-05, "loss": 2.1971, "step": 7610 }, { "epoch": 0.5947084991805198, "grad_norm": 2.31083083152771, "learning_rate": 9.709069214547839e-05, "loss": 0.1109, "step": 7620 }, { "epoch": 0.5954889565285257, "grad_norm": 71.40426635742188, "learning_rate": 9.70754104240735e-05, "loss": 1.5467, "step": 7630 }, { "epoch": 0.5962694138765317, "grad_norm": 20.671236038208008, "learning_rate": 9.706008988162907e-05, "loss": 2.6514, "step": 7640 }, { "epoch": 0.5970498712245376, "grad_norm": 17.801488876342773, "learning_rate": 9.704473053077928e-05, "loss": 1.1676, "step": 7650 }, { "epoch": 0.5978303285725435, "grad_norm": 0.09868277609348297, "learning_rate": 9.702933238419029e-05, "loss": 0.5239, "step": 7660 }, { "epoch": 0.5986107859205494, "grad_norm": 0.02296440117061138, "learning_rate": 9.70138954545603e-05, "loss": 0.5087, "step": 7670 }, { "epoch": 0.5993912432685554, "grad_norm": 0.036499544978141785, "learning_rate": 9.69984197546195e-05, "loss": 0.3802, "step": 7680 }, { "epoch": 0.6001717006165613, "grad_norm": 9.838994026184082, "learning_rate": 9.698290529712999e-05, "loss": 0.9191, "step": 7690 }, { "epoch": 0.6009521579645672, "grad_norm": 11.949810981750488, "learning_rate": 9.696735209488588e-05, "loss": 0.6987, "step": 7700 }, { "epoch": 0.6017326153125732, "grad_norm": 6.8122071752441116e-06, "learning_rate": 9.695176016071321e-05, "loss": 1.0909, "step": 7710 }, { "epoch": 0.6025130726605791, "grad_norm": 16.3994140625, "learning_rate": 9.693612950746997e-05, "loss": 0.7157, "step": 7720 }, { "epoch": 0.603293530008585, "grad_norm": 0.040160659700632095, "learning_rate": 9.69204601480461e-05, "loss": 0.5506, "step": 7730 }, { "epoch": 0.604073987356591, "grad_norm": 57.082244873046875, "learning_rate": 9.690475209536341e-05, "loss": 1.5083, "step": 7740 }, { "epoch": 0.6048544447045969, "grad_norm": 0.025508442893624306, "learning_rate": 9.688900536237566e-05, "loss": 0.1772, "step": 7750 }, { "epoch": 0.6056349020526028, "grad_norm": 9.786832379177213e-05, "learning_rate": 9.687321996206849e-05, "loss": 1.0359, "step": 7760 }, { "epoch": 0.6064153594006088, "grad_norm": 28.551362991333008, "learning_rate": 9.685739590745944e-05, "loss": 1.2453, "step": 7770 }, { "epoch": 0.6071958167486147, "grad_norm": 4.971829414367676, "learning_rate": 9.68415332115979e-05, "loss": 0.5187, "step": 7780 }, { "epoch": 0.6079762740966206, "grad_norm": 62.52102279663086, "learning_rate": 9.682563188756518e-05, "loss": 2.0489, "step": 7790 }, { "epoch": 0.6087567314446265, "grad_norm": 0.0007101665250957012, "learning_rate": 9.680969194847436e-05, "loss": 1.4431, "step": 7800 }, { "epoch": 0.6095371887926325, "grad_norm": 0.03687829524278641, "learning_rate": 9.679371340747045e-05, "loss": 0.8274, "step": 7810 }, { "epoch": 0.6103176461406384, "grad_norm": 0.7268645763397217, "learning_rate": 9.677769627773024e-05, "loss": 0.8634, "step": 7820 }, { "epoch": 0.6110981034886444, "grad_norm": 7.965996265411377, "learning_rate": 9.676164057246235e-05, "loss": 0.4535, "step": 7830 }, { "epoch": 0.6118785608366503, "grad_norm": 0.0013677034294232726, "learning_rate": 9.674554630490726e-05, "loss": 1.266, "step": 7840 }, { "epoch": 0.6126590181846562, "grad_norm": 8.891855239868164, "learning_rate": 9.672941348833717e-05, "loss": 1.3937, "step": 7850 }, { "epoch": 0.6134394755326621, "grad_norm": 0.060114502906799316, "learning_rate": 9.671324213605614e-05, "loss": 0.6587, "step": 7860 }, { "epoch": 0.6142199328806681, "grad_norm": 5.528909683227539, "learning_rate": 9.669703226139996e-05, "loss": 0.6251, "step": 7870 }, { "epoch": 0.615000390228674, "grad_norm": 21.050743103027344, "learning_rate": 9.66807838777362e-05, "loss": 0.3692, "step": 7880 }, { "epoch": 0.61578084757668, "grad_norm": 5.410530548033421e-07, "learning_rate": 9.666449699846423e-05, "loss": 0.7565, "step": 7890 }, { "epoch": 0.6165613049246859, "grad_norm": 0.0002509712358005345, "learning_rate": 9.664817163701508e-05, "loss": 0.656, "step": 7900 }, { "epoch": 0.6173417622726918, "grad_norm": 0.029291460290551186, "learning_rate": 9.663180780685162e-05, "loss": 1.2556, "step": 7910 }, { "epoch": 0.6181222196206977, "grad_norm": 9.711715698242188, "learning_rate": 9.661540552146833e-05, "loss": 1.4943, "step": 7920 }, { "epoch": 0.6189026769687037, "grad_norm": 20.323610305786133, "learning_rate": 9.65989647943915e-05, "loss": 0.7127, "step": 7930 }, { "epoch": 0.6196831343167096, "grad_norm": 4.817214488983154, "learning_rate": 9.658248563917906e-05, "loss": 1.8099, "step": 7940 }, { "epoch": 0.6204635916647155, "grad_norm": 41.11168670654297, "learning_rate": 9.656596806942068e-05, "loss": 0.8875, "step": 7950 }, { "epoch": 0.6212440490127215, "grad_norm": 56.30549621582031, "learning_rate": 9.654941209873765e-05, "loss": 1.2167, "step": 7960 }, { "epoch": 0.6220245063607274, "grad_norm": 39.02580642700195, "learning_rate": 9.653281774078297e-05, "loss": 0.2388, "step": 7970 }, { "epoch": 0.6228049637087333, "grad_norm": 1.3934792280197144, "learning_rate": 9.651618500924127e-05, "loss": 0.1799, "step": 7980 }, { "epoch": 0.6235854210567392, "grad_norm": 66.73686218261719, "learning_rate": 9.649951391782886e-05, "loss": 1.2296, "step": 7990 }, { "epoch": 0.6243658784047452, "grad_norm": 0.0006028424249961972, "learning_rate": 9.648280448029365e-05, "loss": 0.3993, "step": 8000 }, { "epoch": 0.6251463357527511, "grad_norm": 0.25307711958885193, "learning_rate": 9.64660567104152e-05, "loss": 1.1959, "step": 8010 }, { "epoch": 0.625926793100757, "grad_norm": 0.15425390005111694, "learning_rate": 9.644927062200463e-05, "loss": 2.079, "step": 8020 }, { "epoch": 0.626707250448763, "grad_norm": 0.0008805795223452151, "learning_rate": 9.643244622890475e-05, "loss": 1.327, "step": 8030 }, { "epoch": 0.6274877077967689, "grad_norm": 0.9369128942489624, "learning_rate": 9.641558354498987e-05, "loss": 0.8932, "step": 8040 }, { "epoch": 0.6282681651447748, "grad_norm": 22.965044021606445, "learning_rate": 9.63986825841659e-05, "loss": 1.1273, "step": 8050 }, { "epoch": 0.6290486224927808, "grad_norm": 9.68968391418457, "learning_rate": 9.638174336037034e-05, "loss": 0.0267, "step": 8060 }, { "epoch": 0.6298290798407867, "grad_norm": 0.0005601391312666237, "learning_rate": 9.636476588757224e-05, "loss": 2.1434, "step": 8070 }, { "epoch": 0.6306095371887926, "grad_norm": 43.502105712890625, "learning_rate": 9.634775017977216e-05, "loss": 0.606, "step": 8080 }, { "epoch": 0.6313899945367986, "grad_norm": 0.001144526293501258, "learning_rate": 9.633069625100224e-05, "loss": 0.5548, "step": 8090 }, { "epoch": 0.6321704518848045, "grad_norm": 6.474184036254883, "learning_rate": 9.631360411532608e-05, "loss": 0.5924, "step": 8100 }, { "epoch": 0.6329509092328104, "grad_norm": 1.9110515117645264, "learning_rate": 9.629647378683886e-05, "loss": 0.7659, "step": 8110 }, { "epoch": 0.6337313665808164, "grad_norm": 78.93585205078125, "learning_rate": 9.627930527966718e-05, "loss": 1.1818, "step": 8120 }, { "epoch": 0.6345118239288223, "grad_norm": 37.94493103027344, "learning_rate": 9.626209860796916e-05, "loss": 2.0675, "step": 8130 }, { "epoch": 0.6352922812768282, "grad_norm": 1.4972370862960815, "learning_rate": 9.62448537859344e-05, "loss": 0.356, "step": 8140 }, { "epoch": 0.6360727386248342, "grad_norm": 5.993861675262451, "learning_rate": 9.622757082778398e-05, "loss": 0.7325, "step": 8150 }, { "epoch": 0.6368531959728401, "grad_norm": 13.716777801513672, "learning_rate": 9.621024974777036e-05, "loss": 0.9638, "step": 8160 }, { "epoch": 0.637633653320846, "grad_norm": 3.911118745803833, "learning_rate": 9.619289056017751e-05, "loss": 1.6913, "step": 8170 }, { "epoch": 0.6384141106688519, "grad_norm": 62.19038772583008, "learning_rate": 9.617549327932078e-05, "loss": 1.3146, "step": 8180 }, { "epoch": 0.6391945680168579, "grad_norm": 64.44358825683594, "learning_rate": 9.615805791954695e-05, "loss": 2.7173, "step": 8190 }, { "epoch": 0.6399750253648638, "grad_norm": 60.38733673095703, "learning_rate": 9.61405844952342e-05, "loss": 2.2941, "step": 8200 }, { "epoch": 0.6407554827128698, "grad_norm": 0.036649566143751144, "learning_rate": 9.612307302079213e-05, "loss": 0.439, "step": 8210 }, { "epoch": 0.6415359400608757, "grad_norm": 72.93924713134766, "learning_rate": 9.610552351066165e-05, "loss": 0.5796, "step": 8220 }, { "epoch": 0.6423163974088816, "grad_norm": 13.401516914367676, "learning_rate": 9.60879359793151e-05, "loss": 0.3281, "step": 8230 }, { "epoch": 0.6430968547568875, "grad_norm": 14.01244068145752, "learning_rate": 9.607031044125614e-05, "loss": 0.5925, "step": 8240 }, { "epoch": 0.6438773121048935, "grad_norm": 0.3519912362098694, "learning_rate": 9.605264691101978e-05, "loss": 1.3033, "step": 8250 }, { "epoch": 0.6446577694528994, "grad_norm": 0.004155475180596113, "learning_rate": 9.603494540317238e-05, "loss": 0.6315, "step": 8260 }, { "epoch": 0.6454382268009053, "grad_norm": 51.62873458862305, "learning_rate": 9.601720593231158e-05, "loss": 1.3946, "step": 8270 }, { "epoch": 0.6462186841489113, "grad_norm": 50.64020919799805, "learning_rate": 9.599942851306638e-05, "loss": 0.8091, "step": 8280 }, { "epoch": 0.6469991414969172, "grad_norm": 6.914702892303467, "learning_rate": 9.598161316009701e-05, "loss": 0.6304, "step": 8290 }, { "epoch": 0.6477795988449231, "grad_norm": 7.955085277557373, "learning_rate": 9.596375988809505e-05, "loss": 0.5228, "step": 8300 }, { "epoch": 0.648560056192929, "grad_norm": 18.709684371948242, "learning_rate": 9.594586871178327e-05, "loss": 0.9406, "step": 8310 }, { "epoch": 0.649340513540935, "grad_norm": 0.03159976378083229, "learning_rate": 9.592793964591578e-05, "loss": 0.5873, "step": 8320 }, { "epoch": 0.6501209708889409, "grad_norm": 62.471961975097656, "learning_rate": 9.590997270527789e-05, "loss": 0.6104, "step": 8330 }, { "epoch": 0.6509014282369469, "grad_norm": 4.519401550292969, "learning_rate": 9.589196790468615e-05, "loss": 1.8798, "step": 8340 }, { "epoch": 0.6516818855849528, "grad_norm": 39.14466857910156, "learning_rate": 9.587392525898833e-05, "loss": 0.7252, "step": 8350 }, { "epoch": 0.6524623429329587, "grad_norm": 0.0004648494068533182, "learning_rate": 9.585584478306342e-05, "loss": 0.3632, "step": 8360 }, { "epoch": 0.6532428002809646, "grad_norm": 17.922636032104492, "learning_rate": 9.583772649182159e-05, "loss": 1.0404, "step": 8370 }, { "epoch": 0.6540232576289706, "grad_norm": 52.733394622802734, "learning_rate": 9.581957040020424e-05, "loss": 0.4739, "step": 8380 }, { "epoch": 0.6548037149769765, "grad_norm": 66.99308776855469, "learning_rate": 9.580137652318386e-05, "loss": 1.7214, "step": 8390 }, { "epoch": 0.6555841723249825, "grad_norm": 2.9420151710510254, "learning_rate": 9.578314487576418e-05, "loss": 0.3692, "step": 8400 }, { "epoch": 0.6563646296729884, "grad_norm": 30.528118133544922, "learning_rate": 9.576487547298003e-05, "loss": 0.6922, "step": 8410 }, { "epoch": 0.6571450870209943, "grad_norm": 0.8822042346000671, "learning_rate": 9.574656832989739e-05, "loss": 1.9788, "step": 8420 }, { "epoch": 0.6579255443690002, "grad_norm": 4.979971885681152, "learning_rate": 9.572822346161338e-05, "loss": 0.5478, "step": 8430 }, { "epoch": 0.6587060017170062, "grad_norm": 35.818092346191406, "learning_rate": 9.57098408832562e-05, "loss": 1.3641, "step": 8440 }, { "epoch": 0.6594864590650121, "grad_norm": 23.56801414489746, "learning_rate": 9.569142060998514e-05, "loss": 0.8958, "step": 8450 }, { "epoch": 0.660266916413018, "grad_norm": 0.06824395060539246, "learning_rate": 9.567296265699066e-05, "loss": 1.4302, "step": 8460 }, { "epoch": 0.661047373761024, "grad_norm": 0.00021735116024501622, "learning_rate": 9.565446703949417e-05, "loss": 2.3565, "step": 8470 }, { "epoch": 0.6618278311090299, "grad_norm": 0.0019540295470505953, "learning_rate": 9.563593377274821e-05, "loss": 0.435, "step": 8480 }, { "epoch": 0.6626082884570358, "grad_norm": 0.0047746263444423676, "learning_rate": 9.561736287203638e-05, "loss": 1.2002, "step": 8490 }, { "epoch": 0.6633887458050417, "grad_norm": 57.810672760009766, "learning_rate": 9.559875435267326e-05, "loss": 1.0457, "step": 8500 }, { "epoch": 0.6641692031530477, "grad_norm": 2.046326160430908, "learning_rate": 9.558010823000451e-05, "loss": 0.3933, "step": 8510 }, { "epoch": 0.6649496605010536, "grad_norm": 31.771995544433594, "learning_rate": 9.55614245194068e-05, "loss": 1.1063, "step": 8520 }, { "epoch": 0.6657301178490596, "grad_norm": 1.884420394897461, "learning_rate": 9.554270323628771e-05, "loss": 0.3216, "step": 8530 }, { "epoch": 0.6665105751970655, "grad_norm": 0.9028748273849487, "learning_rate": 9.55239443960859e-05, "loss": 1.1951, "step": 8540 }, { "epoch": 0.6672910325450714, "grad_norm": 2.9773038477287628e-05, "learning_rate": 9.550514801427098e-05, "loss": 0.948, "step": 8550 }, { "epoch": 0.6680714898930773, "grad_norm": 11.40433120727539, "learning_rate": 9.548631410634346e-05, "loss": 1.0299, "step": 8560 }, { "epoch": 0.6688519472410833, "grad_norm": 0.689365804195404, "learning_rate": 9.54674426878349e-05, "loss": 0.253, "step": 8570 }, { "epoch": 0.6696324045890892, "grad_norm": 4.528523921966553, "learning_rate": 9.544853377430771e-05, "loss": 0.7915, "step": 8580 }, { "epoch": 0.6704128619370952, "grad_norm": 2.6693929289223206e-09, "learning_rate": 9.542958738135526e-05, "loss": 0.3344, "step": 8590 }, { "epoch": 0.6711933192851011, "grad_norm": 90.78608703613281, "learning_rate": 9.541060352460178e-05, "loss": 1.3186, "step": 8600 }, { "epoch": 0.671973776633107, "grad_norm": 72.95679473876953, "learning_rate": 9.539158221970246e-05, "loss": 0.1651, "step": 8610 }, { "epoch": 0.6727542339811129, "grad_norm": 0.004029855597764254, "learning_rate": 9.537252348234334e-05, "loss": 2.5257, "step": 8620 }, { "epoch": 0.6735346913291189, "grad_norm": 126.72969055175781, "learning_rate": 9.535342732824132e-05, "loss": 3.0816, "step": 8630 }, { "epoch": 0.6743151486771248, "grad_norm": 39.24493408203125, "learning_rate": 9.533429377314416e-05, "loss": 3.278, "step": 8640 }, { "epoch": 0.6750956060251307, "grad_norm": 37.542564392089844, "learning_rate": 9.53151228328305e-05, "loss": 0.8647, "step": 8650 }, { "epoch": 0.6758760633731367, "grad_norm": 14.442136764526367, "learning_rate": 9.529591452310975e-05, "loss": 0.4998, "step": 8660 }, { "epoch": 0.6766565207211426, "grad_norm": 29.716121673583984, "learning_rate": 9.527666885982216e-05, "loss": 0.541, "step": 8670 }, { "epoch": 0.6774369780691485, "grad_norm": 42.91162109375, "learning_rate": 9.525738585883883e-05, "loss": 0.5751, "step": 8680 }, { "epoch": 0.6782174354171544, "grad_norm": 3.21102237701416, "learning_rate": 9.523806553606156e-05, "loss": 0.3043, "step": 8690 }, { "epoch": 0.6789978927651604, "grad_norm": 52.90839385986328, "learning_rate": 9.521870790742302e-05, "loss": 0.685, "step": 8700 }, { "epoch": 0.6797783501131663, "grad_norm": 0.0020777760073542595, "learning_rate": 9.519931298888658e-05, "loss": 0.5612, "step": 8710 }, { "epoch": 0.6805588074611723, "grad_norm": 46.25379180908203, "learning_rate": 9.51798807964464e-05, "loss": 0.7983, "step": 8720 }, { "epoch": 0.6813392648091782, "grad_norm": 0.19482280313968658, "learning_rate": 9.516041134612734e-05, "loss": 1.2481, "step": 8730 }, { "epoch": 0.6821197221571841, "grad_norm": 0.0002395164337940514, "learning_rate": 9.514090465398502e-05, "loss": 1.4959, "step": 8740 }, { "epoch": 0.68290017950519, "grad_norm": 0.16013354063034058, "learning_rate": 9.512136073610575e-05, "loss": 0.5128, "step": 8750 }, { "epoch": 0.683680636853196, "grad_norm": 17.525259017944336, "learning_rate": 9.510177960860658e-05, "loss": 0.5179, "step": 8760 }, { "epoch": 0.6844610942012019, "grad_norm": 0.0016455745790153742, "learning_rate": 9.508216128763518e-05, "loss": 0.4293, "step": 8770 }, { "epoch": 0.6852415515492079, "grad_norm": 2.1851837635040283, "learning_rate": 9.506250578936993e-05, "loss": 1.6695, "step": 8780 }, { "epoch": 0.6860220088972138, "grad_norm": 38.016029357910156, "learning_rate": 9.504281313001986e-05, "loss": 0.9156, "step": 8790 }, { "epoch": 0.6868024662452197, "grad_norm": 6.92643404006958, "learning_rate": 9.502308332582466e-05, "loss": 0.1688, "step": 8800 }, { "epoch": 0.6875829235932256, "grad_norm": 862.4967651367188, "learning_rate": 9.500331639305462e-05, "loss": 4.6745, "step": 8810 }, { "epoch": 0.6883633809412315, "grad_norm": 0.0006037608836777508, "learning_rate": 9.49835123480107e-05, "loss": 0.7949, "step": 8820 }, { "epoch": 0.6891438382892375, "grad_norm": 0.00010248275066260248, "learning_rate": 9.49636712070244e-05, "loss": 0.0901, "step": 8830 }, { "epoch": 0.6899242956372434, "grad_norm": 12.445732116699219, "learning_rate": 9.494379298645789e-05, "loss": 0.4997, "step": 8840 }, { "epoch": 0.6907047529852494, "grad_norm": 6.305129528045654, "learning_rate": 9.492387770270381e-05, "loss": 0.3734, "step": 8850 }, { "epoch": 0.6914852103332553, "grad_norm": 34.12310028076172, "learning_rate": 9.490392537218546e-05, "loss": 1.5739, "step": 8860 }, { "epoch": 0.6922656676812612, "grad_norm": 4.1644459997769445e-05, "learning_rate": 9.488393601135666e-05, "loss": 0.5979, "step": 8870 }, { "epoch": 0.6930461250292671, "grad_norm": 48.17484664916992, "learning_rate": 9.486390963670175e-05, "loss": 1.7442, "step": 8880 }, { "epoch": 0.6938265823772731, "grad_norm": 3.2540643957190696e-08, "learning_rate": 9.484384626473564e-05, "loss": 1.6239, "step": 8890 }, { "epoch": 0.694607039725279, "grad_norm": 0.01425440888851881, "learning_rate": 9.48237459120037e-05, "loss": 0.6439, "step": 8900 }, { "epoch": 0.695387497073285, "grad_norm": 2.2680177688598633, "learning_rate": 9.480360859508178e-05, "loss": 0.7415, "step": 8910 }, { "epoch": 0.6961679544212909, "grad_norm": 5.447899341583252, "learning_rate": 9.478343433057631e-05, "loss": 0.9192, "step": 8920 }, { "epoch": 0.6969484117692968, "grad_norm": 6.804657936096191, "learning_rate": 9.476322313512408e-05, "loss": 1.3799, "step": 8930 }, { "epoch": 0.6977288691173027, "grad_norm": 53.744571685791016, "learning_rate": 9.47429750253924e-05, "loss": 1.0447, "step": 8940 }, { "epoch": 0.6985093264653087, "grad_norm": 20.53790855407715, "learning_rate": 9.4722690018079e-05, "loss": 1.3624, "step": 8950 }, { "epoch": 0.6992897838133146, "grad_norm": 7.079352144501172e-07, "learning_rate": 9.470236812991205e-05, "loss": 1.7281, "step": 8960 }, { "epoch": 0.7000702411613206, "grad_norm": 1.2337582111358643, "learning_rate": 9.468200937765013e-05, "loss": 1.4859, "step": 8970 }, { "epoch": 0.7008506985093265, "grad_norm": 59.161590576171875, "learning_rate": 9.466161377808219e-05, "loss": 1.9196, "step": 8980 }, { "epoch": 0.7016311558573324, "grad_norm": 50.03242111206055, "learning_rate": 9.464118134802762e-05, "loss": 0.3488, "step": 8990 }, { "epoch": 0.7024116132053383, "grad_norm": 7.526350964326411e-05, "learning_rate": 9.462071210433612e-05, "loss": 0.4794, "step": 9000 }, { "epoch": 0.7031920705533442, "grad_norm": 50.92277526855469, "learning_rate": 9.460020606388782e-05, "loss": 1.2395, "step": 9010 }, { "epoch": 0.7039725279013502, "grad_norm": 0.0001332208194071427, "learning_rate": 9.457966324359315e-05, "loss": 0.4707, "step": 9020 }, { "epoch": 0.7047529852493561, "grad_norm": 0.033170610666275024, "learning_rate": 9.455908366039288e-05, "loss": 0.4093, "step": 9030 }, { "epoch": 0.7055334425973621, "grad_norm": 5.431732461147476e-06, "learning_rate": 9.453846733125806e-05, "loss": 0.8617, "step": 9040 }, { "epoch": 0.706313899945368, "grad_norm": 0.00780220702290535, "learning_rate": 9.451781427319012e-05, "loss": 1.345, "step": 9050 }, { "epoch": 0.7070943572933739, "grad_norm": 51.573158264160156, "learning_rate": 9.449712450322072e-05, "loss": 1.286, "step": 9060 }, { "epoch": 0.7078748146413798, "grad_norm": 34.31037521362305, "learning_rate": 9.447639803841182e-05, "loss": 2.3959, "step": 9070 }, { "epoch": 0.7086552719893858, "grad_norm": 65.48462677001953, "learning_rate": 9.445563489585563e-05, "loss": 0.5199, "step": 9080 }, { "epoch": 0.7094357293373917, "grad_norm": 51.40140914916992, "learning_rate": 9.443483509267459e-05, "loss": 0.7834, "step": 9090 }, { "epoch": 0.7102161866853977, "grad_norm": 12.657033920288086, "learning_rate": 9.441399864602143e-05, "loss": 0.4621, "step": 9100 }, { "epoch": 0.7109966440334036, "grad_norm": 0.18859697878360748, "learning_rate": 9.439312557307902e-05, "loss": 0.4602, "step": 9110 }, { "epoch": 0.7117771013814095, "grad_norm": 1.1662402153015137, "learning_rate": 9.437221589106049e-05, "loss": 0.9147, "step": 9120 }, { "epoch": 0.7125575587294154, "grad_norm": 38.23827362060547, "learning_rate": 9.435126961720915e-05, "loss": 1.2606, "step": 9130 }, { "epoch": 0.7133380160774214, "grad_norm": 0.7538944482803345, "learning_rate": 9.433028676879847e-05, "loss": 1.0351, "step": 9140 }, { "epoch": 0.7141184734254273, "grad_norm": 0.0005075185908935964, "learning_rate": 9.430926736313209e-05, "loss": 0.8804, "step": 9150 }, { "epoch": 0.7148989307734332, "grad_norm": 21.777206420898438, "learning_rate": 9.428821141754381e-05, "loss": 1.0078, "step": 9160 }, { "epoch": 0.7156793881214392, "grad_norm": 50.828529357910156, "learning_rate": 9.426711894939753e-05, "loss": 0.3999, "step": 9170 }, { "epoch": 0.7164598454694451, "grad_norm": 1.3094151020050049, "learning_rate": 9.424598997608732e-05, "loss": 0.6567, "step": 9180 }, { "epoch": 0.717240302817451, "grad_norm": 0.7577186822891235, "learning_rate": 9.422482451503728e-05, "loss": 0.4967, "step": 9190 }, { "epoch": 0.7180207601654569, "grad_norm": 9.299863101830397e-09, "learning_rate": 9.42036225837017e-05, "loss": 0.0471, "step": 9200 }, { "epoch": 0.7188012175134629, "grad_norm": 2.130914802250805e-13, "learning_rate": 9.418238419956484e-05, "loss": 1.7301, "step": 9210 }, { "epoch": 0.7195816748614688, "grad_norm": 26.992446899414062, "learning_rate": 9.416110938014109e-05, "loss": 0.1198, "step": 9220 }, { "epoch": 0.7203621322094748, "grad_norm": 5.468270683195442e-05, "learning_rate": 9.413979814297485e-05, "loss": 0.9217, "step": 9230 }, { "epoch": 0.7211425895574807, "grad_norm": 0.8918013572692871, "learning_rate": 9.41184505056406e-05, "loss": 1.265, "step": 9240 }, { "epoch": 0.7219230469054866, "grad_norm": 83.64592742919922, "learning_rate": 9.409706648574278e-05, "loss": 2.5767, "step": 9250 }, { "epoch": 0.7227035042534925, "grad_norm": 1.9623314142227173, "learning_rate": 9.407564610091587e-05, "loss": 1.7349, "step": 9260 }, { "epoch": 0.7234839616014985, "grad_norm": 39.234317779541016, "learning_rate": 9.405418936882434e-05, "loss": 0.592, "step": 9270 }, { "epoch": 0.7242644189495044, "grad_norm": 65.8040542602539, "learning_rate": 9.403269630716259e-05, "loss": 1.2316, "step": 9280 }, { "epoch": 0.7250448762975104, "grad_norm": 39.89567565917969, "learning_rate": 9.401116693365504e-05, "loss": 2.1814, "step": 9290 }, { "epoch": 0.7258253336455163, "grad_norm": 2.613337993621826, "learning_rate": 9.3989601266056e-05, "loss": 0.2446, "step": 9300 }, { "epoch": 0.7266057909935222, "grad_norm": 44.03250503540039, "learning_rate": 9.396799932214977e-05, "loss": 0.8227, "step": 9310 }, { "epoch": 0.7273862483415281, "grad_norm": 16.39767837524414, "learning_rate": 9.39463611197505e-05, "loss": 1.2959, "step": 9320 }, { "epoch": 0.7281667056895341, "grad_norm": 59.34518814086914, "learning_rate": 9.392468667670229e-05, "loss": 2.1178, "step": 9330 }, { "epoch": 0.72894716303754, "grad_norm": 4.03518533706665, "learning_rate": 9.39029760108791e-05, "loss": 0.777, "step": 9340 }, { "epoch": 0.729727620385546, "grad_norm": 0.06964946538209915, "learning_rate": 9.388122914018478e-05, "loss": 1.0955, "step": 9350 }, { "epoch": 0.7305080777335519, "grad_norm": 10.531116485595703, "learning_rate": 9.385944608255303e-05, "loss": 0.3112, "step": 9360 }, { "epoch": 0.7312885350815578, "grad_norm": 53.286312103271484, "learning_rate": 9.383762685594737e-05, "loss": 1.2833, "step": 9370 }, { "epoch": 0.7320689924295637, "grad_norm": 0.09685280174016953, "learning_rate": 9.381577147836118e-05, "loss": 0.569, "step": 9380 }, { "epoch": 0.7328494497775696, "grad_norm": 1.3586193858827755e-07, "learning_rate": 9.379387996781764e-05, "loss": 1.0987, "step": 9390 }, { "epoch": 0.7336299071255756, "grad_norm": 1.8557615280151367, "learning_rate": 9.377195234236969e-05, "loss": 2.1667, "step": 9400 }, { "epoch": 0.7344103644735815, "grad_norm": 54.7618522644043, "learning_rate": 9.374998862010013e-05, "loss": 1.502, "step": 9410 }, { "epoch": 0.7351908218215875, "grad_norm": 67.63577270507812, "learning_rate": 9.372798881912148e-05, "loss": 1.4237, "step": 9420 }, { "epoch": 0.7359712791695934, "grad_norm": 17.463558197021484, "learning_rate": 9.370595295757598e-05, "loss": 0.3832, "step": 9430 }, { "epoch": 0.7367517365175993, "grad_norm": 66.19950866699219, "learning_rate": 9.36838810536357e-05, "loss": 1.7184, "step": 9440 }, { "epoch": 0.7375321938656052, "grad_norm": 0.3658444285392761, "learning_rate": 9.366177312550232e-05, "loss": 1.0973, "step": 9450 }, { "epoch": 0.7383126512136112, "grad_norm": 0.022006291896104813, "learning_rate": 9.363962919140734e-05, "loss": 1.0796, "step": 9460 }, { "epoch": 0.7390931085616171, "grad_norm": 0.00027838730602525175, "learning_rate": 9.361744926961185e-05, "loss": 1.9034, "step": 9470 }, { "epoch": 0.739873565909623, "grad_norm": 0.3998589813709259, "learning_rate": 9.35952333784067e-05, "loss": 1.3968, "step": 9480 }, { "epoch": 0.740654023257629, "grad_norm": 32.08036804199219, "learning_rate": 9.357298153611236e-05, "loss": 1.4303, "step": 9490 }, { "epoch": 0.7414344806056349, "grad_norm": 5.425138473510742, "learning_rate": 9.355069376107892e-05, "loss": 0.1034, "step": 9500 }, { "epoch": 0.7422149379536408, "grad_norm": 30.735441207885742, "learning_rate": 9.352837007168618e-05, "loss": 1.1738, "step": 9510 }, { "epoch": 0.7429953953016467, "grad_norm": 11.689058303833008, "learning_rate": 9.35060104863435e-05, "loss": 1.0133, "step": 9520 }, { "epoch": 0.7437758526496527, "grad_norm": 12.790924072265625, "learning_rate": 9.348361502348986e-05, "loss": 1.4594, "step": 9530 }, { "epoch": 0.7445563099976586, "grad_norm": 32.837467193603516, "learning_rate": 9.34611837015938e-05, "loss": 0.4372, "step": 9540 }, { "epoch": 0.7453367673456646, "grad_norm": 47.088661193847656, "learning_rate": 9.343871653915349e-05, "loss": 1.886, "step": 9550 }, { "epoch": 0.7461172246936705, "grad_norm": 3.3653592254268005e-05, "learning_rate": 9.341621355469659e-05, "loss": 0.078, "step": 9560 }, { "epoch": 0.7468976820416764, "grad_norm": 4.299693584442139, "learning_rate": 9.339367476678034e-05, "loss": 0.6906, "step": 9570 }, { "epoch": 0.7476781393896823, "grad_norm": 18.840805053710938, "learning_rate": 9.337110019399149e-05, "loss": 1.133, "step": 9580 }, { "epoch": 0.7484585967376883, "grad_norm": 8.864206392900087e-06, "learning_rate": 9.334848985494632e-05, "loss": 1.3721, "step": 9590 }, { "epoch": 0.7492390540856942, "grad_norm": 0.5536551475524902, "learning_rate": 9.33258437682906e-05, "loss": 0.7809, "step": 9600 }, { "epoch": 0.7500195114337002, "grad_norm": 5.465764115797356e-05, "learning_rate": 9.330316195269953e-05, "loss": 0.2602, "step": 9610 }, { "epoch": 0.7507999687817061, "grad_norm": 50.65366744995117, "learning_rate": 9.328044442687784e-05, "loss": 0.8001, "step": 9620 }, { "epoch": 0.751580426129712, "grad_norm": 54.035240173339844, "learning_rate": 9.32576912095597e-05, "loss": 1.3913, "step": 9630 }, { "epoch": 0.7523608834777179, "grad_norm": 0.7376708984375, "learning_rate": 9.323490231950867e-05, "loss": 0.4636, "step": 9640 }, { "epoch": 0.7531413408257239, "grad_norm": 36.873104095458984, "learning_rate": 9.321207777551776e-05, "loss": 1.2622, "step": 9650 }, { "epoch": 0.7539217981737298, "grad_norm": 1.9300039639347233e-05, "learning_rate": 9.318921759640939e-05, "loss": 1.3158, "step": 9660 }, { "epoch": 0.7547022555217358, "grad_norm": 13.232084274291992, "learning_rate": 9.316632180103535e-05, "loss": 0.2827, "step": 9670 }, { "epoch": 0.7554827128697417, "grad_norm": 2.7236337661743164, "learning_rate": 9.314339040827679e-05, "loss": 1.6498, "step": 9680 }, { "epoch": 0.7562631702177476, "grad_norm": 9.229357965523377e-05, "learning_rate": 9.312042343704425e-05, "loss": 0.8143, "step": 9690 }, { "epoch": 0.7570436275657535, "grad_norm": 48.95867156982422, "learning_rate": 9.309742090627757e-05, "loss": 2.1144, "step": 9700 }, { "epoch": 0.7578240849137594, "grad_norm": 1.1643975973129272, "learning_rate": 9.307438283494595e-05, "loss": 0.7393, "step": 9710 }, { "epoch": 0.7586045422617654, "grad_norm": 2.9845916287740692e-05, "learning_rate": 9.305130924204788e-05, "loss": 0.7176, "step": 9720 }, { "epoch": 0.7593849996097713, "grad_norm": 1.4221373021428008e-05, "learning_rate": 9.302820014661115e-05, "loss": 0.6517, "step": 9730 }, { "epoch": 0.7601654569577773, "grad_norm": 61.808475494384766, "learning_rate": 9.300505556769282e-05, "loss": 0.785, "step": 9740 }, { "epoch": 0.7609459143057832, "grad_norm": 47.0339241027832, "learning_rate": 9.29818755243792e-05, "loss": 1.022, "step": 9750 }, { "epoch": 0.7617263716537891, "grad_norm": 0.0022127856500446796, "learning_rate": 9.295866003578589e-05, "loss": 2.0017, "step": 9760 }, { "epoch": 0.762506829001795, "grad_norm": 7.076489418977872e-05, "learning_rate": 9.293540912105767e-05, "loss": 0.3585, "step": 9770 }, { "epoch": 0.763287286349801, "grad_norm": 40.70147705078125, "learning_rate": 9.291212279936857e-05, "loss": 0.6532, "step": 9780 }, { "epoch": 0.764067743697807, "grad_norm": 2.6155385971069336, "learning_rate": 9.28888010899218e-05, "loss": 1.0556, "step": 9790 }, { "epoch": 0.7648482010458129, "grad_norm": 0.6977835297584534, "learning_rate": 9.286544401194975e-05, "loss": 2.7733, "step": 9800 }, { "epoch": 0.7656286583938188, "grad_norm": 3.6149048805236816, "learning_rate": 9.2842051584714e-05, "loss": 2.1253, "step": 9810 }, { "epoch": 0.7664091157418247, "grad_norm": 51.35221481323242, "learning_rate": 9.281862382750526e-05, "loss": 1.6301, "step": 9820 }, { "epoch": 0.7671895730898306, "grad_norm": 12.507816314697266, "learning_rate": 9.279516075964336e-05, "loss": 0.5273, "step": 9830 }, { "epoch": 0.7679700304378366, "grad_norm": 9.683573722839355, "learning_rate": 9.27716624004773e-05, "loss": 0.8224, "step": 9840 }, { "epoch": 0.7687504877858425, "grad_norm": 1.2265725135803223, "learning_rate": 9.274812876938514e-05, "loss": 1.0238, "step": 9850 }, { "epoch": 0.7695309451338485, "grad_norm": 0.18335406482219696, "learning_rate": 9.272455988577404e-05, "loss": 1.4574, "step": 9860 }, { "epoch": 0.7703114024818544, "grad_norm": 49.96540832519531, "learning_rate": 9.270095576908022e-05, "loss": 1.2212, "step": 9870 }, { "epoch": 0.7710918598298603, "grad_norm": 0.36174869537353516, "learning_rate": 9.267731643876898e-05, "loss": 2.0641, "step": 9880 }, { "epoch": 0.7718723171778662, "grad_norm": 51.78955078125, "learning_rate": 9.265364191433466e-05, "loss": 2.0582, "step": 9890 }, { "epoch": 0.7726527745258721, "grad_norm": 1.56452631472348e-06, "learning_rate": 9.262993221530057e-05, "loss": 0.2814, "step": 9900 }, { "epoch": 0.7734332318738781, "grad_norm": 0.06889957189559937, "learning_rate": 9.260618736121908e-05, "loss": 0.5443, "step": 9910 }, { "epoch": 0.774213689221884, "grad_norm": 0.00021075314725749195, "learning_rate": 9.258240737167157e-05, "loss": 1.0861, "step": 9920 }, { "epoch": 0.77499414656989, "grad_norm": 4.2871174812316895, "learning_rate": 9.255859226626833e-05, "loss": 2.9569, "step": 9930 }, { "epoch": 0.7757746039178959, "grad_norm": 0.16662397980690002, "learning_rate": 9.253474206464863e-05, "loss": 0.9966, "step": 9940 }, { "epoch": 0.7765550612659018, "grad_norm": 4.747070789337158, "learning_rate": 9.251085678648072e-05, "loss": 0.7685, "step": 9950 }, { "epoch": 0.7773355186139077, "grad_norm": 9.950766752808704e-07, "learning_rate": 9.248693645146171e-05, "loss": 0.8595, "step": 9960 }, { "epoch": 0.7781159759619137, "grad_norm": 3.301793098449707, "learning_rate": 9.24629810793177e-05, "loss": 0.7267, "step": 9970 }, { "epoch": 0.7788964333099196, "grad_norm": 34.93284606933594, "learning_rate": 9.243899068980363e-05, "loss": 1.1627, "step": 9980 }, { "epoch": 0.7796768906579256, "grad_norm": 14.096379280090332, "learning_rate": 9.24149653027033e-05, "loss": 1.5315, "step": 9990 }, { "epoch": 0.7804573480059315, "grad_norm": 51.46378707885742, "learning_rate": 9.239090493782945e-05, "loss": 0.8666, "step": 10000 }, { "epoch": 0.7812378053539374, "grad_norm": 0.4632790684700012, "learning_rate": 9.236680961502357e-05, "loss": 0.6835, "step": 10010 }, { "epoch": 0.7820182627019433, "grad_norm": 1.5781743059051223e-05, "learning_rate": 9.234267935415604e-05, "loss": 0.7809, "step": 10020 }, { "epoch": 0.7827987200499492, "grad_norm": 0.00013900962949264795, "learning_rate": 9.231851417512604e-05, "loss": 1.1447, "step": 10030 }, { "epoch": 0.7835791773979552, "grad_norm": 10.146800994873047, "learning_rate": 9.229431409786152e-05, "loss": 0.5029, "step": 10040 }, { "epoch": 0.7843596347459612, "grad_norm": 9.001547813415527, "learning_rate": 9.227007914231925e-05, "loss": 0.2138, "step": 10050 }, { "epoch": 0.7851400920939671, "grad_norm": 9.599566459655762, "learning_rate": 9.224580932848475e-05, "loss": 1.292, "step": 10060 }, { "epoch": 0.785920549441973, "grad_norm": 47.64023971557617, "learning_rate": 9.222150467637224e-05, "loss": 0.5444, "step": 10070 }, { "epoch": 0.7867010067899789, "grad_norm": 30.91171646118164, "learning_rate": 9.219716520602473e-05, "loss": 1.3808, "step": 10080 }, { "epoch": 0.7874814641379848, "grad_norm": 1.6111406087875366, "learning_rate": 9.217279093751394e-05, "loss": 0.1607, "step": 10090 }, { "epoch": 0.7882619214859908, "grad_norm": 1.8544057607650757, "learning_rate": 9.214838189094024e-05, "loss": 0.9206, "step": 10100 }, { "epoch": 0.7890423788339967, "grad_norm": 24.458349227905273, "learning_rate": 9.212393808643271e-05, "loss": 0.7696, "step": 10110 }, { "epoch": 0.7898228361820027, "grad_norm": 46.76490020751953, "learning_rate": 9.209945954414909e-05, "loss": 0.4999, "step": 10120 }, { "epoch": 0.7906032935300086, "grad_norm": 1.0910409688949585, "learning_rate": 9.207494628427578e-05, "loss": 1.4236, "step": 10130 }, { "epoch": 0.7913837508780145, "grad_norm": 77.94813537597656, "learning_rate": 9.205039832702779e-05, "loss": 1.9332, "step": 10140 }, { "epoch": 0.7921642082260204, "grad_norm": 0.00010009103425545618, "learning_rate": 9.202581569264875e-05, "loss": 0.7059, "step": 10150 }, { "epoch": 0.7929446655740264, "grad_norm": 1.2641462087631226, "learning_rate": 9.200119840141088e-05, "loss": 0.6298, "step": 10160 }, { "epoch": 0.7937251229220323, "grad_norm": 58.1982421875, "learning_rate": 9.1976546473615e-05, "loss": 1.4736, "step": 10170 }, { "epoch": 0.7945055802700383, "grad_norm": 19.780921936035156, "learning_rate": 9.195185992959048e-05, "loss": 0.2553, "step": 10180 }, { "epoch": 0.7952860376180442, "grad_norm": 9.129549980163574, "learning_rate": 9.192713878969522e-05, "loss": 0.4162, "step": 10190 }, { "epoch": 0.7960664949660501, "grad_norm": 2.2736198902130127, "learning_rate": 9.190238307431569e-05, "loss": 1.5475, "step": 10200 }, { "epoch": 0.796846952314056, "grad_norm": 12.91063117980957, "learning_rate": 9.187759280386683e-05, "loss": 1.3482, "step": 10210 }, { "epoch": 0.7976274096620619, "grad_norm": 2.047335624694824, "learning_rate": 9.185276799879211e-05, "loss": 0.7524, "step": 10220 }, { "epoch": 0.7984078670100679, "grad_norm": 85.41983795166016, "learning_rate": 9.182790867956345e-05, "loss": 1.066, "step": 10230 }, { "epoch": 0.7991883243580739, "grad_norm": 27.353626251220703, "learning_rate": 9.180301486668128e-05, "loss": 2.0177, "step": 10240 }, { "epoch": 0.7999687817060798, "grad_norm": 8.590062861912884e-06, "learning_rate": 9.17780865806744e-05, "loss": 0.3788, "step": 10250 }, { "epoch": 0.8007492390540857, "grad_norm": 2.5400359630584717, "learning_rate": 9.175312384210011e-05, "loss": 1.0827, "step": 10260 }, { "epoch": 0.8015296964020916, "grad_norm": 0.13091030716896057, "learning_rate": 9.172812667154406e-05, "loss": 0.4355, "step": 10270 }, { "epoch": 0.8023101537500975, "grad_norm": 0.00023703681654296815, "learning_rate": 9.170309508962038e-05, "loss": 1.3295, "step": 10280 }, { "epoch": 0.8030906110981035, "grad_norm": 4.602359294891357, "learning_rate": 9.167802911697147e-05, "loss": 0.2444, "step": 10290 }, { "epoch": 0.8038710684461094, "grad_norm": 0.378699392080307, "learning_rate": 9.16529287742682e-05, "loss": 1.6013, "step": 10300 }, { "epoch": 0.8046515257941154, "grad_norm": 0.025837572291493416, "learning_rate": 9.162779408220968e-05, "loss": 1.3339, "step": 10310 }, { "epoch": 0.8054319831421213, "grad_norm": 5.020955562591553, "learning_rate": 9.160262506152342e-05, "loss": 1.3104, "step": 10320 }, { "epoch": 0.8062124404901272, "grad_norm": 24.9432430267334, "learning_rate": 9.157742173296521e-05, "loss": 0.456, "step": 10330 }, { "epoch": 0.8069928978381331, "grad_norm": 57.66835403442383, "learning_rate": 9.155218411731912e-05, "loss": 0.9609, "step": 10340 }, { "epoch": 0.8077733551861391, "grad_norm": 58.548484802246094, "learning_rate": 9.152691223539755e-05, "loss": 0.6119, "step": 10350 }, { "epoch": 0.808553812534145, "grad_norm": 76.95970153808594, "learning_rate": 9.150160610804108e-05, "loss": 0.6974, "step": 10360 }, { "epoch": 0.809334269882151, "grad_norm": 4.002541231784562e-07, "learning_rate": 9.147626575611861e-05, "loss": 1.2283, "step": 10370 }, { "epoch": 0.8101147272301569, "grad_norm": 63.17544174194336, "learning_rate": 9.145089120052718e-05, "loss": 1.2178, "step": 10380 }, { "epoch": 0.8108951845781628, "grad_norm": 0.06589673459529877, "learning_rate": 9.142548246219212e-05, "loss": 1.199, "step": 10390 }, { "epoch": 0.8116756419261687, "grad_norm": 0.006386119872331619, "learning_rate": 9.140003956206688e-05, "loss": 1.1264, "step": 10400 }, { "epoch": 0.8124560992741746, "grad_norm": 0.06595534831285477, "learning_rate": 9.137456252113312e-05, "loss": 0.262, "step": 10410 }, { "epoch": 0.8132365566221806, "grad_norm": 3.372828960418701, "learning_rate": 9.134905136040064e-05, "loss": 2.7002, "step": 10420 }, { "epoch": 0.8140170139701866, "grad_norm": 0.0010774118127301335, "learning_rate": 9.13235061009074e-05, "loss": 0.9205, "step": 10430 }, { "epoch": 0.8147974713181925, "grad_norm": 23.298646926879883, "learning_rate": 9.129792676371947e-05, "loss": 1.9437, "step": 10440 }, { "epoch": 0.8155779286661984, "grad_norm": 0.0358101949095726, "learning_rate": 9.127231336993099e-05, "loss": 1.1757, "step": 10450 }, { "epoch": 0.8163583860142043, "grad_norm": 34.35308837890625, "learning_rate": 9.12466659406642e-05, "loss": 0.9185, "step": 10460 }, { "epoch": 0.8171388433622102, "grad_norm": 0.12688755989074707, "learning_rate": 9.122098449706944e-05, "loss": 0.7325, "step": 10470 }, { "epoch": 0.8179193007102162, "grad_norm": 0.5014522671699524, "learning_rate": 9.119526906032507e-05, "loss": 0.816, "step": 10480 }, { "epoch": 0.8186997580582221, "grad_norm": 49.12326431274414, "learning_rate": 9.11695196516375e-05, "loss": 0.6054, "step": 10490 }, { "epoch": 0.8194802154062281, "grad_norm": 9.400172233581543, "learning_rate": 9.114373629224113e-05, "loss": 0.7559, "step": 10500 }, { "epoch": 0.820260672754234, "grad_norm": 65.43171691894531, "learning_rate": 9.11179190033984e-05, "loss": 1.0216, "step": 10510 }, { "epoch": 0.8210411301022399, "grad_norm": 75.4448013305664, "learning_rate": 9.109206780639966e-05, "loss": 1.2247, "step": 10520 }, { "epoch": 0.8218215874502458, "grad_norm": 0.2027098685503006, "learning_rate": 9.106618272256329e-05, "loss": 0.5028, "step": 10530 }, { "epoch": 0.8226020447982518, "grad_norm": 2.619052565933089e-07, "learning_rate": 9.10402637732356e-05, "loss": 0.3584, "step": 10540 }, { "epoch": 0.8233825021462577, "grad_norm": 7.298355579376221, "learning_rate": 9.10143109797908e-05, "loss": 1.472, "step": 10550 }, { "epoch": 0.8241629594942637, "grad_norm": 82.12861633300781, "learning_rate": 9.098832436363102e-05, "loss": 2.129, "step": 10560 }, { "epoch": 0.8249434168422696, "grad_norm": 0.17317530512809753, "learning_rate": 9.096230394618634e-05, "loss": 2.7479, "step": 10570 }, { "epoch": 0.8257238741902755, "grad_norm": 1.2783308420694084e-06, "learning_rate": 9.093624974891461e-05, "loss": 0.4849, "step": 10580 }, { "epoch": 0.8265043315382814, "grad_norm": 6.741588731529191e-05, "learning_rate": 9.091016179330161e-05, "loss": 0.814, "step": 10590 }, { "epoch": 0.8272847888862873, "grad_norm": 0.0009110376122407615, "learning_rate": 9.088404010086095e-05, "loss": 0.918, "step": 10600 }, { "epoch": 0.8280652462342933, "grad_norm": 6.268826382438419e-06, "learning_rate": 9.085788469313403e-05, "loss": 0.3706, "step": 10610 }, { "epoch": 0.8288457035822993, "grad_norm": 47.73321533203125, "learning_rate": 9.08316955916901e-05, "loss": 2.3103, "step": 10620 }, { "epoch": 0.8296261609303052, "grad_norm": 72.35128784179688, "learning_rate": 9.080547281812613e-05, "loss": 1.4624, "step": 10630 }, { "epoch": 0.8304066182783111, "grad_norm": 9.136421203613281, "learning_rate": 9.077921639406695e-05, "loss": 0.1298, "step": 10640 }, { "epoch": 0.831187075626317, "grad_norm": 7.849686145782471, "learning_rate": 9.075292634116507e-05, "loss": 1.55, "step": 10650 }, { "epoch": 0.8319675329743229, "grad_norm": 41.03010940551758, "learning_rate": 9.072660268110074e-05, "loss": 0.6723, "step": 10660 }, { "epoch": 0.8327479903223289, "grad_norm": 32.15593338012695, "learning_rate": 9.070024543558193e-05, "loss": 0.5337, "step": 10670 }, { "epoch": 0.8335284476703348, "grad_norm": 0.2335835099220276, "learning_rate": 9.067385462634433e-05, "loss": 2.3867, "step": 10680 }, { "epoch": 0.8343089050183408, "grad_norm": 70.02042388916016, "learning_rate": 9.064743027515128e-05, "loss": 1.8435, "step": 10690 }, { "epoch": 0.8350893623663467, "grad_norm": 0.010791340842843056, "learning_rate": 9.062097240379381e-05, "loss": 1.3253, "step": 10700 }, { "epoch": 0.8358698197143526, "grad_norm": 42.142948150634766, "learning_rate": 9.059448103409054e-05, "loss": 1.3379, "step": 10710 }, { "epoch": 0.8366502770623585, "grad_norm": 18.970975875854492, "learning_rate": 9.056795618788778e-05, "loss": 0.5949, "step": 10720 }, { "epoch": 0.8374307344103644, "grad_norm": 31.398378372192383, "learning_rate": 9.05413978870594e-05, "loss": 0.8256, "step": 10730 }, { "epoch": 0.8382111917583704, "grad_norm": 38.225547790527344, "learning_rate": 9.051480615350687e-05, "loss": 0.4197, "step": 10740 }, { "epoch": 0.8389916491063764, "grad_norm": 0.0025365985929965973, "learning_rate": 9.048818100915923e-05, "loss": 1.5254, "step": 10750 }, { "epoch": 0.8397721064543823, "grad_norm": 12.11093521118164, "learning_rate": 9.046152247597309e-05, "loss": 1.0872, "step": 10760 }, { "epoch": 0.8405525638023882, "grad_norm": 43.06413650512695, "learning_rate": 9.043483057593258e-05, "loss": 1.2137, "step": 10770 }, { "epoch": 0.8413330211503941, "grad_norm": 19.772510528564453, "learning_rate": 9.040810533104934e-05, "loss": 0.8811, "step": 10780 }, { "epoch": 0.8421134784984, "grad_norm": 0.31982916593551636, "learning_rate": 9.038134676336252e-05, "loss": 1.9339, "step": 10790 }, { "epoch": 0.842893935846406, "grad_norm": 0.11078077554702759, "learning_rate": 9.035455489493874e-05, "loss": 0.6942, "step": 10800 }, { "epoch": 0.843674393194412, "grad_norm": 21.829547882080078, "learning_rate": 9.032772974787207e-05, "loss": 0.423, "step": 10810 }, { "epoch": 0.8444548505424179, "grad_norm": 26.680130004882812, "learning_rate": 9.030087134428408e-05, "loss": 0.7315, "step": 10820 }, { "epoch": 0.8452353078904238, "grad_norm": 0.0005266700172796845, "learning_rate": 9.02739797063237e-05, "loss": 0.2991, "step": 10830 }, { "epoch": 0.8460157652384297, "grad_norm": 0.07193426787853241, "learning_rate": 9.024705485616729e-05, "loss": 1.2723, "step": 10840 }, { "epoch": 0.8467962225864356, "grad_norm": 10.340862274169922, "learning_rate": 9.02200968160186e-05, "loss": 0.6524, "step": 10850 }, { "epoch": 0.8475766799344416, "grad_norm": 0.5865875482559204, "learning_rate": 9.019310560810876e-05, "loss": 0.9401, "step": 10860 }, { "epoch": 0.8483571372824475, "grad_norm": 3.370558988535777e-05, "learning_rate": 9.016608125469624e-05, "loss": 0.7333, "step": 10870 }, { "epoch": 0.8491375946304535, "grad_norm": 52.608585357666016, "learning_rate": 9.013902377806685e-05, "loss": 2.4574, "step": 10880 }, { "epoch": 0.8499180519784594, "grad_norm": 3.0554566383361816, "learning_rate": 9.01119332005337e-05, "loss": 0.7603, "step": 10890 }, { "epoch": 0.8506985093264653, "grad_norm": 50.016151428222656, "learning_rate": 9.008480954443721e-05, "loss": 1.0146, "step": 10900 }, { "epoch": 0.8514789666744712, "grad_norm": 5.849006652832031, "learning_rate": 9.00576528321451e-05, "loss": 0.3672, "step": 10910 }, { "epoch": 0.8522594240224771, "grad_norm": 15.98238468170166, "learning_rate": 9.00304630860523e-05, "loss": 0.5704, "step": 10920 }, { "epoch": 0.8530398813704831, "grad_norm": 0.1195988729596138, "learning_rate": 9.000324032858102e-05, "loss": 0.204, "step": 10930 }, { "epoch": 0.8538203387184891, "grad_norm": 1.65741475939285e-05, "learning_rate": 8.997598458218068e-05, "loss": 1.2947, "step": 10940 }, { "epoch": 0.854600796066495, "grad_norm": 0.12614373862743378, "learning_rate": 8.994869586932793e-05, "loss": 0.9826, "step": 10950 }, { "epoch": 0.8553812534145009, "grad_norm": 1.6010971069335938, "learning_rate": 8.992137421252653e-05, "loss": 0.7233, "step": 10960 }, { "epoch": 0.8561617107625068, "grad_norm": 1.2691033361988957e-06, "learning_rate": 8.98940196343075e-05, "loss": 1.8345, "step": 10970 }, { "epoch": 0.8569421681105127, "grad_norm": 2.088942050933838, "learning_rate": 8.986663215722896e-05, "loss": 2.192, "step": 10980 }, { "epoch": 0.8577226254585187, "grad_norm": 0.39988765120506287, "learning_rate": 8.983921180387617e-05, "loss": 0.6454, "step": 10990 }, { "epoch": 0.8585030828065247, "grad_norm": 2.1378994408127738e-10, "learning_rate": 8.98117585968615e-05, "loss": 0.2324, "step": 11000 }, { "epoch": 0.8592835401545306, "grad_norm": 0.02639651857316494, "learning_rate": 8.978427255882441e-05, "loss": 3.9428, "step": 11010 }, { "epoch": 0.8600639975025365, "grad_norm": 19.26889419555664, "learning_rate": 8.975675371243141e-05, "loss": 0.6036, "step": 11020 }, { "epoch": 0.8608444548505424, "grad_norm": 66.81867980957031, "learning_rate": 8.972920208037616e-05, "loss": 2.2432, "step": 11030 }, { "epoch": 0.8616249121985483, "grad_norm": 0.0035200975835323334, "learning_rate": 8.970161768537923e-05, "loss": 1.3233, "step": 11040 }, { "epoch": 0.8624053695465543, "grad_norm": 28.795499801635742, "learning_rate": 8.967400055018831e-05, "loss": 0.9746, "step": 11050 }, { "epoch": 0.8631858268945602, "grad_norm": 0.11977910250425339, "learning_rate": 8.964635069757802e-05, "loss": 0.6801, "step": 11060 }, { "epoch": 0.8639662842425662, "grad_norm": 0.9692222476005554, "learning_rate": 8.961866815035001e-05, "loss": 0.9246, "step": 11070 }, { "epoch": 0.8647467415905721, "grad_norm": 1.2255003452301025, "learning_rate": 8.959095293133283e-05, "loss": 2.5522, "step": 11080 }, { "epoch": 0.865527198938578, "grad_norm": 2.081770896911621, "learning_rate": 8.956320506338206e-05, "loss": 0.4645, "step": 11090 }, { "epoch": 0.8663076562865839, "grad_norm": 56.65788269042969, "learning_rate": 8.953542456938013e-05, "loss": 0.7179, "step": 11100 }, { "epoch": 0.8670881136345898, "grad_norm": 43.48613739013672, "learning_rate": 8.950761147223642e-05, "loss": 1.8452, "step": 11110 }, { "epoch": 0.8678685709825958, "grad_norm": 22.29058074951172, "learning_rate": 8.947976579488717e-05, "loss": 0.928, "step": 11120 }, { "epoch": 0.8686490283306018, "grad_norm": 0.007680551148951054, "learning_rate": 8.94518875602955e-05, "loss": 2.3963, "step": 11130 }, { "epoch": 0.8694294856786077, "grad_norm": 0.2508794069290161, "learning_rate": 8.942397679145135e-05, "loss": 0.2604, "step": 11140 }, { "epoch": 0.8702099430266136, "grad_norm": 0.05226520821452141, "learning_rate": 8.939603351137156e-05, "loss": 0.2393, "step": 11150 }, { "epoch": 0.8709904003746195, "grad_norm": 45.50146484375, "learning_rate": 8.93680577430997e-05, "loss": 0.6496, "step": 11160 }, { "epoch": 0.8717708577226254, "grad_norm": 13.582279205322266, "learning_rate": 8.93400495097062e-05, "loss": 0.5958, "step": 11170 }, { "epoch": 0.8725513150706314, "grad_norm": 57.39128112792969, "learning_rate": 8.93120088342882e-05, "loss": 0.4043, "step": 11180 }, { "epoch": 0.8733317724186374, "grad_norm": 0.5639352798461914, "learning_rate": 8.928393573996963e-05, "loss": 1.905, "step": 11190 }, { "epoch": 0.8741122297666433, "grad_norm": 2.6245787143707275, "learning_rate": 8.925583024990114e-05, "loss": 1.2335, "step": 11200 }, { "epoch": 0.8748926871146492, "grad_norm": 3.0179084205883555e-05, "learning_rate": 8.922769238726013e-05, "loss": 1.4729, "step": 11210 }, { "epoch": 0.8756731444626551, "grad_norm": 9.200638487527613e-06, "learning_rate": 8.919952217525065e-05, "loss": 0.2296, "step": 11220 }, { "epoch": 0.876453601810661, "grad_norm": 32.401824951171875, "learning_rate": 8.917131963710344e-05, "loss": 1.731, "step": 11230 }, { "epoch": 0.8772340591586669, "grad_norm": 0.001450688112527132, "learning_rate": 8.914308479607592e-05, "loss": 0.5757, "step": 11240 }, { "epoch": 0.878014516506673, "grad_norm": 22.435937881469727, "learning_rate": 8.911481767545211e-05, "loss": 2.6246, "step": 11250 }, { "epoch": 0.8787949738546789, "grad_norm": 30.424789428710938, "learning_rate": 8.908651829854271e-05, "loss": 0.7685, "step": 11260 }, { "epoch": 0.8795754312026848, "grad_norm": 2.2438132762908936, "learning_rate": 8.905818668868494e-05, "loss": 0.4759, "step": 11270 }, { "epoch": 0.8803558885506907, "grad_norm": 1.3357861042022705, "learning_rate": 8.902982286924268e-05, "loss": 0.5434, "step": 11280 }, { "epoch": 0.8811363458986966, "grad_norm": 47.41687774658203, "learning_rate": 8.900142686360631e-05, "loss": 1.4807, "step": 11290 }, { "epoch": 0.8819168032467025, "grad_norm": 37.302310943603516, "learning_rate": 8.89729986951928e-05, "loss": 1.3951, "step": 11300 }, { "epoch": 0.8826972605947085, "grad_norm": 34.153472900390625, "learning_rate": 8.894453838744561e-05, "loss": 0.6078, "step": 11310 }, { "epoch": 0.8834777179427145, "grad_norm": 34.647335052490234, "learning_rate": 8.891604596383472e-05, "loss": 1.1206, "step": 11320 }, { "epoch": 0.8842581752907204, "grad_norm": 0.013813276775181293, "learning_rate": 8.888752144785656e-05, "loss": 0.4337, "step": 11330 }, { "epoch": 0.8850386326387263, "grad_norm": 88.2090835571289, "learning_rate": 8.885896486303411e-05, "loss": 1.7786, "step": 11340 }, { "epoch": 0.8858190899867322, "grad_norm": 1.2261537313461304, "learning_rate": 8.88303762329167e-05, "loss": 0.2787, "step": 11350 }, { "epoch": 0.8865995473347381, "grad_norm": 85.28449249267578, "learning_rate": 8.880175558108014e-05, "loss": 2.0685, "step": 11360 }, { "epoch": 0.8873800046827441, "grad_norm": 70.23323822021484, "learning_rate": 8.877310293112663e-05, "loss": 0.6905, "step": 11370 }, { "epoch": 0.88816046203075, "grad_norm": 109.84261322021484, "learning_rate": 8.87444183066848e-05, "loss": 2.1192, "step": 11380 }, { "epoch": 0.888940919378756, "grad_norm": 2.6113624572753906, "learning_rate": 8.871570173140955e-05, "loss": 0.9877, "step": 11390 }, { "epoch": 0.8897213767267619, "grad_norm": 0.0027678406331688166, "learning_rate": 8.868695322898222e-05, "loss": 0.6999, "step": 11400 }, { "epoch": 0.8905018340747678, "grad_norm": 27.829450607299805, "learning_rate": 8.865817282311043e-05, "loss": 2.1734, "step": 11410 }, { "epoch": 0.8912822914227737, "grad_norm": 1.0777357816696167, "learning_rate": 8.862936053752814e-05, "loss": 1.1006, "step": 11420 }, { "epoch": 0.8920627487707796, "grad_norm": 9.996204376220703, "learning_rate": 8.86005163959956e-05, "loss": 1.8153, "step": 11430 }, { "epoch": 0.8928432061187856, "grad_norm": 11.059738159179688, "learning_rate": 8.85716404222993e-05, "loss": 1.3938, "step": 11440 }, { "epoch": 0.8936236634667916, "grad_norm": 0.06884201616048813, "learning_rate": 8.8542732640252e-05, "loss": 0.395, "step": 11450 }, { "epoch": 0.8944041208147975, "grad_norm": 0.38125911355018616, "learning_rate": 8.851379307369267e-05, "loss": 0.7953, "step": 11460 }, { "epoch": 0.8951845781628034, "grad_norm": 7.784693717956543, "learning_rate": 8.848482174648653e-05, "loss": 1.3387, "step": 11470 }, { "epoch": 0.8959650355108093, "grad_norm": 2.370662212371826, "learning_rate": 8.845581868252498e-05, "loss": 0.4055, "step": 11480 }, { "epoch": 0.8967454928588152, "grad_norm": 0.028177833184599876, "learning_rate": 8.842678390572557e-05, "loss": 1.538, "step": 11490 }, { "epoch": 0.8975259502068212, "grad_norm": 28.854820251464844, "learning_rate": 8.839771744003199e-05, "loss": 0.3277, "step": 11500 }, { "epoch": 0.8983064075548272, "grad_norm": 6.505209445953369, "learning_rate": 8.836861930941414e-05, "loss": 1.0238, "step": 11510 }, { "epoch": 0.8990868649028331, "grad_norm": 0.08821121603250504, "learning_rate": 8.833948953786792e-05, "loss": 0.9824, "step": 11520 }, { "epoch": 0.899867322250839, "grad_norm": 0.0032478803768754005, "learning_rate": 8.831032814941545e-05, "loss": 1.7226, "step": 11530 }, { "epoch": 0.9006477795988449, "grad_norm": 0.9303324818611145, "learning_rate": 8.828113516810481e-05, "loss": 0.5442, "step": 11540 }, { "epoch": 0.9014282369468508, "grad_norm": 0.5977065563201904, "learning_rate": 8.825191061801019e-05, "loss": 1.8397, "step": 11550 }, { "epoch": 0.9022086942948568, "grad_norm": 27.579898834228516, "learning_rate": 8.822265452323182e-05, "loss": 1.6211, "step": 11560 }, { "epoch": 0.9029891516428628, "grad_norm": 0.21320217847824097, "learning_rate": 8.81933669078959e-05, "loss": 1.362, "step": 11570 }, { "epoch": 0.9037696089908687, "grad_norm": 49.39560317993164, "learning_rate": 8.816404779615465e-05, "loss": 0.7101, "step": 11580 }, { "epoch": 0.9045500663388746, "grad_norm": 44.66103744506836, "learning_rate": 8.813469721218631e-05, "loss": 2.1111, "step": 11590 }, { "epoch": 0.9053305236868805, "grad_norm": 3.096705913543701, "learning_rate": 8.810531518019496e-05, "loss": 0.6784, "step": 11600 }, { "epoch": 0.9061109810348864, "grad_norm": 0.00022902319324202836, "learning_rate": 8.807590172441072e-05, "loss": 0.8203, "step": 11610 }, { "epoch": 0.9068914383828923, "grad_norm": 1.1045008897781372, "learning_rate": 8.804645686908959e-05, "loss": 0.9274, "step": 11620 }, { "epoch": 0.9076718957308983, "grad_norm": 44.20703887939453, "learning_rate": 8.801698063851345e-05, "loss": 0.7383, "step": 11630 }, { "epoch": 0.9084523530789043, "grad_norm": 0.0486653596162796, "learning_rate": 8.798747305699005e-05, "loss": 0.6478, "step": 11640 }, { "epoch": 0.9092328104269102, "grad_norm": 36.32588195800781, "learning_rate": 8.795793414885301e-05, "loss": 0.4524, "step": 11650 }, { "epoch": 0.9100132677749161, "grad_norm": 30.98180389404297, "learning_rate": 8.79283639384618e-05, "loss": 2.5093, "step": 11660 }, { "epoch": 0.910793725122922, "grad_norm": 51.505775451660156, "learning_rate": 8.789876245020168e-05, "loss": 0.973, "step": 11670 }, { "epoch": 0.9115741824709279, "grad_norm": 0.001640350092202425, "learning_rate": 8.78691297084837e-05, "loss": 0.1758, "step": 11680 }, { "epoch": 0.9123546398189339, "grad_norm": 9.590207810106222e-06, "learning_rate": 8.783946573774467e-05, "loss": 0.3, "step": 11690 }, { "epoch": 0.9131350971669399, "grad_norm": 39.28508758544922, "learning_rate": 8.780977056244721e-05, "loss": 0.8231, "step": 11700 }, { "epoch": 0.9139155545149458, "grad_norm": 3.148670196533203, "learning_rate": 8.778004420707961e-05, "loss": 1.8962, "step": 11710 }, { "epoch": 0.9146960118629517, "grad_norm": 40.00190353393555, "learning_rate": 8.775028669615593e-05, "loss": 0.4237, "step": 11720 }, { "epoch": 0.9154764692109576, "grad_norm": 0.02514764852821827, "learning_rate": 8.772049805421586e-05, "loss": 0.8909, "step": 11730 }, { "epoch": 0.9162569265589635, "grad_norm": 25.32301139831543, "learning_rate": 8.769067830582482e-05, "loss": 0.9899, "step": 11740 }, { "epoch": 0.9170373839069695, "grad_norm": 0.029089761897921562, "learning_rate": 8.766082747557383e-05, "loss": 1.1534, "step": 11750 }, { "epoch": 0.9178178412549755, "grad_norm": 0.00031210810993798077, "learning_rate": 8.763094558807961e-05, "loss": 1.7147, "step": 11760 }, { "epoch": 0.9185982986029814, "grad_norm": 37.92123031616211, "learning_rate": 8.76010326679844e-05, "loss": 0.4765, "step": 11770 }, { "epoch": 0.9193787559509873, "grad_norm": 2.677894599401043e-07, "learning_rate": 8.757108873995612e-05, "loss": 0.4737, "step": 11780 }, { "epoch": 0.9201592132989932, "grad_norm": 42.612613677978516, "learning_rate": 8.75411138286882e-05, "loss": 1.2752, "step": 11790 }, { "epoch": 0.9209396706469991, "grad_norm": 3.9161059856414795, "learning_rate": 8.751110795889966e-05, "loss": 2.0453, "step": 11800 }, { "epoch": 0.921720127995005, "grad_norm": 21.108699798583984, "learning_rate": 8.748107115533501e-05, "loss": 1.2515, "step": 11810 }, { "epoch": 0.922500585343011, "grad_norm": 39.70624923706055, "learning_rate": 8.745100344276433e-05, "loss": 2.1655, "step": 11820 }, { "epoch": 0.923281042691017, "grad_norm": 12.807904243469238, "learning_rate": 8.742090484598312e-05, "loss": 0.3978, "step": 11830 }, { "epoch": 0.9240615000390229, "grad_norm": 0.06400003284215927, "learning_rate": 8.739077538981239e-05, "loss": 0.7546, "step": 11840 }, { "epoch": 0.9248419573870288, "grad_norm": 42.75697708129883, "learning_rate": 8.73606150990986e-05, "loss": 1.3483, "step": 11850 }, { "epoch": 0.9256224147350347, "grad_norm": 39.99211120605469, "learning_rate": 8.733042399871361e-05, "loss": 0.7464, "step": 11860 }, { "epoch": 0.9264028720830406, "grad_norm": 23.466768264770508, "learning_rate": 8.730020211355471e-05, "loss": 0.9, "step": 11870 }, { "epoch": 0.9271833294310466, "grad_norm": 25.129772186279297, "learning_rate": 8.72699494685446e-05, "loss": 0.8635, "step": 11880 }, { "epoch": 0.9279637867790526, "grad_norm": 10.731846809387207, "learning_rate": 8.723966608863128e-05, "loss": 0.0681, "step": 11890 }, { "epoch": 0.9287442441270585, "grad_norm": 0.04492070898413658, "learning_rate": 8.720935199878815e-05, "loss": 0.916, "step": 11900 }, { "epoch": 0.9295247014750644, "grad_norm": 7.944001900739295e-09, "learning_rate": 8.717900722401391e-05, "loss": 0.739, "step": 11910 }, { "epoch": 0.9303051588230703, "grad_norm": 3.294824273325503e-05, "learning_rate": 8.714863178933257e-05, "loss": 1.0964, "step": 11920 }, { "epoch": 0.9310856161710762, "grad_norm": 0.0023006428964436054, "learning_rate": 8.711822571979346e-05, "loss": 0.5517, "step": 11930 }, { "epoch": 0.9318660735190821, "grad_norm": 0.00011115364759461954, "learning_rate": 8.70877890404711e-05, "loss": 1.5457, "step": 11940 }, { "epoch": 0.9326465308670882, "grad_norm": 0.007222015410661697, "learning_rate": 8.705732177646531e-05, "loss": 0.4518, "step": 11950 }, { "epoch": 0.9334269882150941, "grad_norm": 14.632031440734863, "learning_rate": 8.702682395290114e-05, "loss": 1.1325, "step": 11960 }, { "epoch": 0.9342074455631, "grad_norm": 0.00015641542267985642, "learning_rate": 8.699629559492876e-05, "loss": 1.5673, "step": 11970 }, { "epoch": 0.9349879029111059, "grad_norm": 2.1813745498657227, "learning_rate": 8.696573672772363e-05, "loss": 1.3276, "step": 11980 }, { "epoch": 0.9357683602591118, "grad_norm": 0.20304657518863678, "learning_rate": 8.693514737648628e-05, "loss": 0.8412, "step": 11990 }, { "epoch": 0.9365488176071177, "grad_norm": 0.0002769280690699816, "learning_rate": 8.690452756644243e-05, "loss": 0.4273, "step": 12000 }, { "epoch": 0.9373292749551237, "grad_norm": 180.66477966308594, "learning_rate": 8.687387732284291e-05, "loss": 0.5784, "step": 12010 }, { "epoch": 0.9381097323031297, "grad_norm": 143.87356567382812, "learning_rate": 8.684319667096364e-05, "loss": 1.2544, "step": 12020 }, { "epoch": 0.9388901896511356, "grad_norm": 94.39666748046875, "learning_rate": 8.681248563610558e-05, "loss": 0.521, "step": 12030 }, { "epoch": 0.9396706469991415, "grad_norm": 33.32435607910156, "learning_rate": 8.678174424359485e-05, "loss": 0.5367, "step": 12040 }, { "epoch": 0.9404511043471474, "grad_norm": 0.4227607846260071, "learning_rate": 8.675097251878246e-05, "loss": 1.8416, "step": 12050 }, { "epoch": 0.9412315616951533, "grad_norm": 10.019822120666504, "learning_rate": 8.672017048704458e-05, "loss": 2.5491, "step": 12060 }, { "epoch": 0.9420120190431593, "grad_norm": 26.867095947265625, "learning_rate": 8.668933817378224e-05, "loss": 0.6875, "step": 12070 }, { "epoch": 0.9427924763911653, "grad_norm": 10.189391136169434, "learning_rate": 8.665847560442156e-05, "loss": 0.8775, "step": 12080 }, { "epoch": 0.9435729337391712, "grad_norm": 3.7130205631256104, "learning_rate": 8.662758280441352e-05, "loss": 0.6623, "step": 12090 }, { "epoch": 0.9443533910871771, "grad_norm": 25.05450439453125, "learning_rate": 8.65966597992341e-05, "loss": 2.4015, "step": 12100 }, { "epoch": 0.945133848435183, "grad_norm": 5.740344590776658e-07, "learning_rate": 8.65657066143841e-05, "loss": 0.2352, "step": 12110 }, { "epoch": 0.9459143057831889, "grad_norm": 1.194147216665442e-06, "learning_rate": 8.653472327538932e-05, "loss": 0.9961, "step": 12120 }, { "epoch": 0.9466947631311948, "grad_norm": 23.505138397216797, "learning_rate": 8.650370980780035e-05, "loss": 1.3468, "step": 12130 }, { "epoch": 0.9474752204792009, "grad_norm": 0.6704900860786438, "learning_rate": 8.647266623719263e-05, "loss": 0.5786, "step": 12140 }, { "epoch": 0.9482556778272068, "grad_norm": 32.911720275878906, "learning_rate": 8.644159258916645e-05, "loss": 1.064, "step": 12150 }, { "epoch": 0.9490361351752127, "grad_norm": 127.1581039428711, "learning_rate": 8.641048888934691e-05, "loss": 2.8751, "step": 12160 }, { "epoch": 0.9498165925232186, "grad_norm": 38.89763259887695, "learning_rate": 8.637935516338384e-05, "loss": 2.585, "step": 12170 }, { "epoch": 0.9505970498712245, "grad_norm": 21.23341941833496, "learning_rate": 8.63481914369519e-05, "loss": 0.6433, "step": 12180 }, { "epoch": 0.9513775072192304, "grad_norm": 380.9919738769531, "learning_rate": 8.631699773575044e-05, "loss": 1.8712, "step": 12190 }, { "epoch": 0.9521579645672364, "grad_norm": 1.6460078954696655, "learning_rate": 8.628577408550353e-05, "loss": 1.1621, "step": 12200 }, { "epoch": 0.9529384219152424, "grad_norm": 0.005625886842608452, "learning_rate": 8.625452051196001e-05, "loss": 1.7173, "step": 12210 }, { "epoch": 0.9537188792632483, "grad_norm": 0.43974214792251587, "learning_rate": 8.622323704089328e-05, "loss": 1.4362, "step": 12220 }, { "epoch": 0.9544993366112542, "grad_norm": 0.0861930325627327, "learning_rate": 8.619192369810149e-05, "loss": 0.2172, "step": 12230 }, { "epoch": 0.9552797939592601, "grad_norm": 109.56983184814453, "learning_rate": 8.616058050940739e-05, "loss": 0.5874, "step": 12240 }, { "epoch": 0.956060251307266, "grad_norm": 31.9949951171875, "learning_rate": 8.612920750065831e-05, "loss": 1.9491, "step": 12250 }, { "epoch": 0.956840708655272, "grad_norm": 1.3033921718597412, "learning_rate": 8.609780469772623e-05, "loss": 0.7603, "step": 12260 }, { "epoch": 0.957621166003278, "grad_norm": 175.17483520507812, "learning_rate": 8.606637212650767e-05, "loss": 1.6961, "step": 12270 }, { "epoch": 0.9584016233512839, "grad_norm": 88.24026489257812, "learning_rate": 8.603490981292369e-05, "loss": 3.0626, "step": 12280 }, { "epoch": 0.9591820806992898, "grad_norm": 6.234077453613281, "learning_rate": 8.600341778291991e-05, "loss": 1.012, "step": 12290 }, { "epoch": 0.9599625380472957, "grad_norm": 1.844632625579834, "learning_rate": 8.597189606246641e-05, "loss": 0.6959, "step": 12300 }, { "epoch": 0.9607429953953016, "grad_norm": 0.0016822753241285682, "learning_rate": 8.594034467755781e-05, "loss": 1.416, "step": 12310 }, { "epoch": 0.9615234527433075, "grad_norm": 1.9487806558609009, "learning_rate": 8.590876365421313e-05, "loss": 2.5349, "step": 12320 }, { "epoch": 0.9623039100913136, "grad_norm": 56.0797233581543, "learning_rate": 8.587715301847588e-05, "loss": 1.0408, "step": 12330 }, { "epoch": 0.9630843674393195, "grad_norm": 0.0024899712298065424, "learning_rate": 8.584551279641398e-05, "loss": 0.5106, "step": 12340 }, { "epoch": 0.9638648247873254, "grad_norm": 60.09833526611328, "learning_rate": 8.58138430141197e-05, "loss": 1.2098, "step": 12350 }, { "epoch": 0.9646452821353313, "grad_norm": 88.48211669921875, "learning_rate": 8.57821436977098e-05, "loss": 0.7373, "step": 12360 }, { "epoch": 0.9654257394833372, "grad_norm": 0.0008333827718161047, "learning_rate": 8.575041487332527e-05, "loss": 1.7268, "step": 12370 }, { "epoch": 0.9662061968313431, "grad_norm": 63.89524841308594, "learning_rate": 8.571865656713152e-05, "loss": 0.7619, "step": 12380 }, { "epoch": 0.9669866541793491, "grad_norm": 67.05152130126953, "learning_rate": 8.568686880531822e-05, "loss": 1.5792, "step": 12390 }, { "epoch": 0.9677671115273551, "grad_norm": 11.015143394470215, "learning_rate": 8.565505161409937e-05, "loss": 0.577, "step": 12400 }, { "epoch": 0.968547568875361, "grad_norm": 0.0003411987272556871, "learning_rate": 8.562320501971319e-05, "loss": 0.6672, "step": 12410 }, { "epoch": 0.9693280262233669, "grad_norm": 0.3806394040584564, "learning_rate": 8.559132904842222e-05, "loss": 0.5932, "step": 12420 }, { "epoch": 0.9701084835713728, "grad_norm": 18.847597122192383, "learning_rate": 8.555942372651316e-05, "loss": 1.0309, "step": 12430 }, { "epoch": 0.9708889409193787, "grad_norm": 27.726774215698242, "learning_rate": 8.552748908029693e-05, "loss": 1.5833, "step": 12440 }, { "epoch": 0.9716693982673846, "grad_norm": 15.595227241516113, "learning_rate": 8.549552513610865e-05, "loss": 1.102, "step": 12450 }, { "epoch": 0.9724498556153907, "grad_norm": 2.3900837898254395, "learning_rate": 8.546353192030762e-05, "loss": 2.0294, "step": 12460 }, { "epoch": 0.9732303129633966, "grad_norm": 7.092124938964844, "learning_rate": 8.543150945927722e-05, "loss": 0.7207, "step": 12470 }, { "epoch": 0.9740107703114025, "grad_norm": 61.77547073364258, "learning_rate": 8.539945777942498e-05, "loss": 2.3424, "step": 12480 }, { "epoch": 0.9747912276594084, "grad_norm": 7.770050615363289e-06, "learning_rate": 8.536737690718252e-05, "loss": 1.0175, "step": 12490 }, { "epoch": 0.9755716850074143, "grad_norm": 63.13569641113281, "learning_rate": 8.533526686900555e-05, "loss": 1.5255, "step": 12500 }, { "epoch": 0.9763521423554202, "grad_norm": 0.00011756386084016412, "learning_rate": 8.530312769137382e-05, "loss": 1.0027, "step": 12510 }, { "epoch": 0.9771325997034263, "grad_norm": 89.27020263671875, "learning_rate": 8.52709594007911e-05, "loss": 0.8815, "step": 12520 }, { "epoch": 0.9779130570514322, "grad_norm": 2.415660858154297, "learning_rate": 8.523876202378518e-05, "loss": 1.2294, "step": 12530 }, { "epoch": 0.9786935143994381, "grad_norm": 0.9319227337837219, "learning_rate": 8.520653558690786e-05, "loss": 0.534, "step": 12540 }, { "epoch": 0.979473971747444, "grad_norm": 59.17282485961914, "learning_rate": 8.517428011673482e-05, "loss": 1.1959, "step": 12550 }, { "epoch": 0.9802544290954499, "grad_norm": 6.27915096282959, "learning_rate": 8.514199563986578e-05, "loss": 0.914, "step": 12560 }, { "epoch": 0.9810348864434558, "grad_norm": 32.939640045166016, "learning_rate": 8.510968218292434e-05, "loss": 0.7607, "step": 12570 }, { "epoch": 0.9818153437914618, "grad_norm": 8.092778205871582, "learning_rate": 8.5077339772558e-05, "loss": 1.4045, "step": 12580 }, { "epoch": 0.9825958011394678, "grad_norm": 30.04991912841797, "learning_rate": 8.504496843543813e-05, "loss": 0.8721, "step": 12590 }, { "epoch": 0.9833762584874737, "grad_norm": 15.851937294006348, "learning_rate": 8.501256819825996e-05, "loss": 0.5861, "step": 12600 }, { "epoch": 0.9841567158354796, "grad_norm": 0.01121127512305975, "learning_rate": 8.498013908774256e-05, "loss": 0.9448, "step": 12610 }, { "epoch": 0.9849371731834855, "grad_norm": 1.1823562383651733, "learning_rate": 8.494768113062879e-05, "loss": 0.5183, "step": 12620 }, { "epoch": 0.9857176305314914, "grad_norm": 0.07073729485273361, "learning_rate": 8.491519435368534e-05, "loss": 0.1142, "step": 12630 }, { "epoch": 0.9864980878794973, "grad_norm": 78.96869659423828, "learning_rate": 8.48826787837026e-05, "loss": 1.7843, "step": 12640 }, { "epoch": 0.9872785452275034, "grad_norm": 86.60768127441406, "learning_rate": 8.485013444749479e-05, "loss": 2.746, "step": 12650 }, { "epoch": 0.9880590025755093, "grad_norm": 7.178143277997151e-06, "learning_rate": 8.481756137189977e-05, "loss": 1.0557, "step": 12660 }, { "epoch": 0.9888394599235152, "grad_norm": 47.43173599243164, "learning_rate": 8.478495958377914e-05, "loss": 1.7503, "step": 12670 }, { "epoch": 0.9896199172715211, "grad_norm": 40.4387092590332, "learning_rate": 8.47523291100182e-05, "loss": 0.6436, "step": 12680 }, { "epoch": 0.990400374619527, "grad_norm": 11.480759620666504, "learning_rate": 8.471966997752585e-05, "loss": 0.2876, "step": 12690 }, { "epoch": 0.9911808319675329, "grad_norm": 0.0001139993328251876, "learning_rate": 8.468698221323468e-05, "loss": 0.2114, "step": 12700 }, { "epoch": 0.991961289315539, "grad_norm": 76.51677703857422, "learning_rate": 8.465426584410084e-05, "loss": 1.865, "step": 12710 }, { "epoch": 0.9927417466635449, "grad_norm": 3.665128231048584, "learning_rate": 8.462152089710408e-05, "loss": 1.9389, "step": 12720 }, { "epoch": 0.9935222040115508, "grad_norm": 50.49843978881836, "learning_rate": 8.458874739924778e-05, "loss": 2.4169, "step": 12730 }, { "epoch": 0.9943026613595567, "grad_norm": 15.426827430725098, "learning_rate": 8.455594537755878e-05, "loss": 0.2876, "step": 12740 }, { "epoch": 0.9950831187075626, "grad_norm": 8.526009764864284e-07, "learning_rate": 8.452311485908751e-05, "loss": 0.5408, "step": 12750 }, { "epoch": 0.9958635760555685, "grad_norm": 25.179977416992188, "learning_rate": 8.449025587090782e-05, "loss": 1.8653, "step": 12760 }, { "epoch": 0.9966440334035745, "grad_norm": 55.514373779296875, "learning_rate": 8.445736844011713e-05, "loss": 0.8341, "step": 12770 }, { "epoch": 0.9974244907515805, "grad_norm": 3.265545606613159, "learning_rate": 8.442445259383625e-05, "loss": 1.9346, "step": 12780 }, { "epoch": 0.9982049480995864, "grad_norm": 11.585598945617676, "learning_rate": 8.439150835920944e-05, "loss": 1.2786, "step": 12790 }, { "epoch": 0.9989854054475923, "grad_norm": 0.5894767642021179, "learning_rate": 8.435853576340438e-05, "loss": 0.0325, "step": 12800 }, { "epoch": 0.9997658627955982, "grad_norm": 0.6876865029335022, "learning_rate": 8.432553483361213e-05, "loss": 0.4318, "step": 12810 }, { "epoch": 1.0005463201436042, "grad_norm": 9.892061643768102e-05, "learning_rate": 8.429250559704714e-05, "loss": 1.2832, "step": 12820 }, { "epoch": 1.0013267774916101, "grad_norm": 0.00020257089636288583, "learning_rate": 8.425944808094715e-05, "loss": 1.1023, "step": 12830 }, { "epoch": 1.002107234839616, "grad_norm": 57.085105895996094, "learning_rate": 8.422636231257326e-05, "loss": 0.3823, "step": 12840 }, { "epoch": 1.002887692187622, "grad_norm": 0.9345187544822693, "learning_rate": 8.419324831920989e-05, "loss": 1.7061, "step": 12850 }, { "epoch": 1.0036681495356279, "grad_norm": 26.76588249206543, "learning_rate": 8.416010612816467e-05, "loss": 1.7186, "step": 12860 }, { "epoch": 1.0044486068836338, "grad_norm": 9.51021957397461, "learning_rate": 8.412693576676856e-05, "loss": 1.865, "step": 12870 }, { "epoch": 1.0052290642316397, "grad_norm": 13.387110710144043, "learning_rate": 8.409373726237567e-05, "loss": 0.2966, "step": 12880 }, { "epoch": 1.0060095215796456, "grad_norm": 6.979156017303467, "learning_rate": 8.406051064236337e-05, "loss": 1.7553, "step": 12890 }, { "epoch": 1.0067899789276515, "grad_norm": 11.995668411254883, "learning_rate": 8.402725593413225e-05, "loss": 0.6281, "step": 12900 }, { "epoch": 1.0075704362756575, "grad_norm": 27.00234603881836, "learning_rate": 8.399397316510596e-05, "loss": 0.4262, "step": 12910 }, { "epoch": 1.0083508936236634, "grad_norm": 71.79927062988281, "learning_rate": 8.396066236273137e-05, "loss": 1.3269, "step": 12920 }, { "epoch": 1.0091313509716695, "grad_norm": 51.39724349975586, "learning_rate": 8.392732355447844e-05, "loss": 1.0123, "step": 12930 }, { "epoch": 1.0099118083196754, "grad_norm": 52.78282928466797, "learning_rate": 8.389395676784025e-05, "loss": 1.4514, "step": 12940 }, { "epoch": 1.0106922656676813, "grad_norm": 40.91914367675781, "learning_rate": 8.386056203033295e-05, "loss": 0.9037, "step": 12950 }, { "epoch": 1.0114727230156872, "grad_norm": 50.55011749267578, "learning_rate": 8.382713936949566e-05, "loss": 0.9325, "step": 12960 }, { "epoch": 1.0122531803636932, "grad_norm": 0.46198034286499023, "learning_rate": 8.379368881289067e-05, "loss": 0.6073, "step": 12970 }, { "epoch": 1.013033637711699, "grad_norm": 37.872135162353516, "learning_rate": 8.376021038810315e-05, "loss": 1.9623, "step": 12980 }, { "epoch": 1.013814095059705, "grad_norm": 4.987172603607178, "learning_rate": 8.372670412274129e-05, "loss": 0.6192, "step": 12990 }, { "epoch": 1.014594552407711, "grad_norm": 4.3910542444791645e-05, "learning_rate": 8.369317004443628e-05, "loss": 0.584, "step": 13000 }, { "epoch": 1.0153750097557168, "grad_norm": 0.6088114976882935, "learning_rate": 8.36596081808422e-05, "loss": 1.1512, "step": 13010 }, { "epoch": 1.0161554671037227, "grad_norm": 4.626126289367676, "learning_rate": 8.362601855963605e-05, "loss": 0.6647, "step": 13020 }, { "epoch": 1.0169359244517286, "grad_norm": 0.0009490216034464538, "learning_rate": 8.359240120851772e-05, "loss": 0.6454, "step": 13030 }, { "epoch": 1.0177163817997346, "grad_norm": 1.2626225043277373e-07, "learning_rate": 8.355875615521e-05, "loss": 0.2324, "step": 13040 }, { "epoch": 1.0184968391477405, "grad_norm": 21.736064910888672, "learning_rate": 8.352508342745847e-05, "loss": 1.5845, "step": 13050 }, { "epoch": 1.0192772964957466, "grad_norm": 7.8866047859191895, "learning_rate": 8.349138305303159e-05, "loss": 1.2487, "step": 13060 }, { "epoch": 1.0200577538437525, "grad_norm": 49.58051300048828, "learning_rate": 8.345765505972056e-05, "loss": 0.5553, "step": 13070 }, { "epoch": 1.0208382111917584, "grad_norm": 0.45068633556365967, "learning_rate": 8.342389947533943e-05, "loss": 0.0828, "step": 13080 }, { "epoch": 1.0216186685397644, "grad_norm": 49.6279411315918, "learning_rate": 8.33901163277249e-05, "loss": 1.5898, "step": 13090 }, { "epoch": 1.0223991258877703, "grad_norm": 5.856717280039447e-07, "learning_rate": 8.335630564473652e-05, "loss": 1.2666, "step": 13100 }, { "epoch": 1.0231795832357762, "grad_norm": 0.04553453251719475, "learning_rate": 8.332246745425644e-05, "loss": 1.0408, "step": 13110 }, { "epoch": 1.023960040583782, "grad_norm": 0.0003157809842377901, "learning_rate": 8.328860178418958e-05, "loss": 0.0292, "step": 13120 }, { "epoch": 1.024740497931788, "grad_norm": 3.6817857562709833e-06, "learning_rate": 8.325470866246343e-05, "loss": 2.1363, "step": 13130 }, { "epoch": 1.025520955279794, "grad_norm": 0.0001370991230942309, "learning_rate": 8.322078811702823e-05, "loss": 0.0206, "step": 13140 }, { "epoch": 1.0263014126277998, "grad_norm": 1.1230233907699585, "learning_rate": 8.318684017585673e-05, "loss": 1.1947, "step": 13150 }, { "epoch": 1.0270818699758058, "grad_norm": 6.144953204056947e-07, "learning_rate": 8.315286486694434e-05, "loss": 1.9959, "step": 13160 }, { "epoch": 1.0278623273238117, "grad_norm": 0.0007690726779401302, "learning_rate": 8.311886221830902e-05, "loss": 0.2656, "step": 13170 }, { "epoch": 1.0286427846718176, "grad_norm": 11.274613380432129, "learning_rate": 8.308483225799126e-05, "loss": 1.606, "step": 13180 }, { "epoch": 1.0294232420198237, "grad_norm": 48.34074020385742, "learning_rate": 8.30507750140541e-05, "loss": 1.329, "step": 13190 }, { "epoch": 1.0302036993678296, "grad_norm": 3.7099595069885254, "learning_rate": 8.301669051458305e-05, "loss": 0.2159, "step": 13200 }, { "epoch": 1.0309841567158355, "grad_norm": 12.82052230834961, "learning_rate": 8.29825787876861e-05, "loss": 1.4214, "step": 13210 }, { "epoch": 1.0317646140638415, "grad_norm": 1.2749547958374023, "learning_rate": 8.294843986149374e-05, "loss": 0.7242, "step": 13220 }, { "epoch": 1.0325450714118474, "grad_norm": 13.616936683654785, "learning_rate": 8.291427376415882e-05, "loss": 0.7435, "step": 13230 }, { "epoch": 1.0333255287598533, "grad_norm": 0.00015055372205097228, "learning_rate": 8.288008052385666e-05, "loss": 0.3443, "step": 13240 }, { "epoch": 1.0341059861078592, "grad_norm": 1.2600555419921875, "learning_rate": 8.284586016878492e-05, "loss": 0.5666, "step": 13250 }, { "epoch": 1.0348864434558651, "grad_norm": 40.253013610839844, "learning_rate": 8.281161272716365e-05, "loss": 2.7526, "step": 13260 }, { "epoch": 1.035666900803871, "grad_norm": 6.156032759463415e-05, "learning_rate": 8.277733822723518e-05, "loss": 0.3435, "step": 13270 }, { "epoch": 1.036447358151877, "grad_norm": 0.009943058714270592, "learning_rate": 8.274303669726426e-05, "loss": 1.0844, "step": 13280 }, { "epoch": 1.0372278154998829, "grad_norm": 46.61821365356445, "learning_rate": 8.270870816553782e-05, "loss": 0.6919, "step": 13290 }, { "epoch": 1.0380082728478888, "grad_norm": 5.1633910480575196e-09, "learning_rate": 8.267435266036512e-05, "loss": 0.9052, "step": 13300 }, { "epoch": 1.0387887301958947, "grad_norm": 56.16582489013672, "learning_rate": 8.263997021007765e-05, "loss": 1.0907, "step": 13310 }, { "epoch": 1.0395691875439008, "grad_norm": 0.8746923804283142, "learning_rate": 8.260556084302911e-05, "loss": 1.2991, "step": 13320 }, { "epoch": 1.0403496448919067, "grad_norm": 39.039512634277344, "learning_rate": 8.257112458759541e-05, "loss": 1.4904, "step": 13330 }, { "epoch": 1.0411301022399126, "grad_norm": 2.8694819320662646e-07, "learning_rate": 8.253666147217464e-05, "loss": 1.7749, "step": 13340 }, { "epoch": 1.0419105595879186, "grad_norm": 75.42256164550781, "learning_rate": 8.250217152518702e-05, "loss": 2.2867, "step": 13350 }, { "epoch": 1.0426910169359245, "grad_norm": 47.55293273925781, "learning_rate": 8.24676547750749e-05, "loss": 0.5861, "step": 13360 }, { "epoch": 1.0434714742839304, "grad_norm": 66.87212371826172, "learning_rate": 8.243311125030274e-05, "loss": 1.0252, "step": 13370 }, { "epoch": 1.0442519316319363, "grad_norm": 2.618544101715088, "learning_rate": 8.239854097935709e-05, "loss": 1.612, "step": 13380 }, { "epoch": 1.0450323889799422, "grad_norm": 7.044678568490781e-06, "learning_rate": 8.236394399074654e-05, "loss": 0.6337, "step": 13390 }, { "epoch": 1.0458128463279481, "grad_norm": 0.528467059135437, "learning_rate": 8.232932031300171e-05, "loss": 0.4871, "step": 13400 }, { "epoch": 1.046593303675954, "grad_norm": 7.7571539878845215, "learning_rate": 8.229466997467527e-05, "loss": 0.878, "step": 13410 }, { "epoch": 1.04737376102396, "grad_norm": 78.52983856201172, "learning_rate": 8.225999300434181e-05, "loss": 2.2092, "step": 13420 }, { "epoch": 1.0481542183719659, "grad_norm": 0.11696358025074005, "learning_rate": 8.222528943059793e-05, "loss": 0.4794, "step": 13430 }, { "epoch": 1.048934675719972, "grad_norm": 1.6190725564956665, "learning_rate": 8.219055928206213e-05, "loss": 0.4143, "step": 13440 }, { "epoch": 1.049715133067978, "grad_norm": 38.03479766845703, "learning_rate": 8.215580258737493e-05, "loss": 0.6102, "step": 13450 }, { "epoch": 1.0504955904159838, "grad_norm": 4.810214042663574, "learning_rate": 8.212101937519854e-05, "loss": 0.2218, "step": 13460 }, { "epoch": 1.0512760477639898, "grad_norm": 65.26640319824219, "learning_rate": 8.208620967421728e-05, "loss": 2.3941, "step": 13470 }, { "epoch": 1.0520565051119957, "grad_norm": 0.008846109732985497, "learning_rate": 8.20513735131371e-05, "loss": 0.6208, "step": 13480 }, { "epoch": 1.0528369624600016, "grad_norm": 57.17930221557617, "learning_rate": 8.201651092068592e-05, "loss": 3.1685, "step": 13490 }, { "epoch": 1.0536174198080075, "grad_norm": 22.42201042175293, "learning_rate": 8.198162192561337e-05, "loss": 2.3165, "step": 13500 }, { "epoch": 1.0543978771560134, "grad_norm": 10.212949752807617, "learning_rate": 8.19467065566909e-05, "loss": 0.5393, "step": 13510 }, { "epoch": 1.0551783345040193, "grad_norm": 37.82101821899414, "learning_rate": 8.19117648427117e-05, "loss": 1.2392, "step": 13520 }, { "epoch": 1.0559587918520252, "grad_norm": 0.038047194480895996, "learning_rate": 8.187679681249065e-05, "loss": 0.7501, "step": 13530 }, { "epoch": 1.0567392492000311, "grad_norm": 0.0002589684445410967, "learning_rate": 8.184180249486439e-05, "loss": 0.1071, "step": 13540 }, { "epoch": 1.057519706548037, "grad_norm": 6.387953987996298e-08, "learning_rate": 8.18067819186912e-05, "loss": 0.755, "step": 13550 }, { "epoch": 1.058300163896043, "grad_norm": 0.07943969964981079, "learning_rate": 8.177173511285102e-05, "loss": 0.3546, "step": 13560 }, { "epoch": 1.0590806212440491, "grad_norm": 0.06328899413347244, "learning_rate": 8.173666210624542e-05, "loss": 0.4805, "step": 13570 }, { "epoch": 1.059861078592055, "grad_norm": 28.725759506225586, "learning_rate": 8.17015629277976e-05, "loss": 0.4756, "step": 13580 }, { "epoch": 1.060641535940061, "grad_norm": 64.62845611572266, "learning_rate": 8.16664376064523e-05, "loss": 3.3899, "step": 13590 }, { "epoch": 1.0614219932880669, "grad_norm": 46.945552825927734, "learning_rate": 8.163128617117583e-05, "loss": 0.6698, "step": 13600 }, { "epoch": 1.0622024506360728, "grad_norm": 31.494598388671875, "learning_rate": 8.159610865095608e-05, "loss": 1.8572, "step": 13610 }, { "epoch": 1.0629829079840787, "grad_norm": 2.593214503576746e-06, "learning_rate": 8.15609050748024e-05, "loss": 0.7823, "step": 13620 }, { "epoch": 1.0637633653320846, "grad_norm": 23.18966293334961, "learning_rate": 8.152567547174565e-05, "loss": 0.6276, "step": 13630 }, { "epoch": 1.0645438226800905, "grad_norm": 6.394977569580078, "learning_rate": 8.149041987083816e-05, "loss": 0.8798, "step": 13640 }, { "epoch": 1.0653242800280964, "grad_norm": 8.956060810305644e-06, "learning_rate": 8.145513830115366e-05, "loss": 0.2264, "step": 13650 }, { "epoch": 1.0661047373761023, "grad_norm": 0.010577269829809666, "learning_rate": 8.141983079178736e-05, "loss": 1.8401, "step": 13660 }, { "epoch": 1.0668851947241083, "grad_norm": 0.7367900609970093, "learning_rate": 8.138449737185578e-05, "loss": 0.9894, "step": 13670 }, { "epoch": 1.0676656520721142, "grad_norm": 0.0003608228580560535, "learning_rate": 8.134913807049689e-05, "loss": 0.5108, "step": 13680 }, { "epoch": 1.0684461094201203, "grad_norm": 48.65434646606445, "learning_rate": 8.131375291686995e-05, "loss": 1.2039, "step": 13690 }, { "epoch": 1.0692265667681262, "grad_norm": 0.004866276402026415, "learning_rate": 8.127834194015553e-05, "loss": 0.1723, "step": 13700 }, { "epoch": 1.0700070241161321, "grad_norm": 61.13690948486328, "learning_rate": 8.124290516955557e-05, "loss": 0.6815, "step": 13710 }, { "epoch": 1.070787481464138, "grad_norm": 61.9520149230957, "learning_rate": 8.120744263429319e-05, "loss": 1.9792, "step": 13720 }, { "epoch": 1.071567938812144, "grad_norm": 0.23533307015895844, "learning_rate": 8.117195436361281e-05, "loss": 1.9054, "step": 13730 }, { "epoch": 1.0723483961601499, "grad_norm": 13.175278663635254, "learning_rate": 8.113644038678008e-05, "loss": 0.7785, "step": 13740 }, { "epoch": 1.0731288535081558, "grad_norm": 2.3585263988934457e-05, "learning_rate": 8.110090073308178e-05, "loss": 2.0138, "step": 13750 }, { "epoch": 1.0739093108561617, "grad_norm": 0.43680545687675476, "learning_rate": 8.106533543182598e-05, "loss": 0.1986, "step": 13760 }, { "epoch": 1.0746897682041676, "grad_norm": 0.013349410146474838, "learning_rate": 8.102974451234178e-05, "loss": 0.3865, "step": 13770 }, { "epoch": 1.0754702255521735, "grad_norm": 22.574199676513672, "learning_rate": 8.099412800397948e-05, "loss": 0.5899, "step": 13780 }, { "epoch": 1.0762506829001794, "grad_norm": 19.712507247924805, "learning_rate": 8.095848593611044e-05, "loss": 1.1524, "step": 13790 }, { "epoch": 1.0770311402481854, "grad_norm": 2.589094877243042, "learning_rate": 8.092281833812716e-05, "loss": 1.6711, "step": 13800 }, { "epoch": 1.0778115975961913, "grad_norm": 38.977909088134766, "learning_rate": 8.088712523944314e-05, "loss": 1.2787, "step": 13810 }, { "epoch": 1.0785920549441972, "grad_norm": 34.47909164428711, "learning_rate": 8.085140666949291e-05, "loss": 1.0232, "step": 13820 }, { "epoch": 1.0793725122922033, "grad_norm": 0.8124696612358093, "learning_rate": 8.081566265773202e-05, "loss": 0.2438, "step": 13830 }, { "epoch": 1.0801529696402092, "grad_norm": 0.05350351333618164, "learning_rate": 8.077989323363702e-05, "loss": 0.4916, "step": 13840 }, { "epoch": 1.0809334269882152, "grad_norm": 8.868869372236077e-07, "learning_rate": 8.074409842670538e-05, "loss": 1.8634, "step": 13850 }, { "epoch": 1.081713884336221, "grad_norm": 38.613704681396484, "learning_rate": 8.07082782664555e-05, "loss": 1.8085, "step": 13860 }, { "epoch": 1.082494341684227, "grad_norm": 45.651336669921875, "learning_rate": 8.067243278242676e-05, "loss": 2.1611, "step": 13870 }, { "epoch": 1.083274799032233, "grad_norm": 0.4036369025707245, "learning_rate": 8.063656200417928e-05, "loss": 0.6555, "step": 13880 }, { "epoch": 1.0840552563802388, "grad_norm": 2.1979187749820994e-06, "learning_rate": 8.060066596129422e-05, "loss": 1.0594, "step": 13890 }, { "epoch": 1.0848357137282447, "grad_norm": 2.6379153728485107, "learning_rate": 8.056474468337343e-05, "loss": 0.4087, "step": 13900 }, { "epoch": 1.0856161710762506, "grad_norm": 4.4034193706465885e-05, "learning_rate": 8.052879820003962e-05, "loss": 0.3825, "step": 13910 }, { "epoch": 1.0863966284242565, "grad_norm": 1.2897426131530665e-05, "learning_rate": 8.049282654093631e-05, "loss": 0.4262, "step": 13920 }, { "epoch": 1.0871770857722625, "grad_norm": 2.5914298475981923e-06, "learning_rate": 8.045682973572777e-05, "loss": 0.6124, "step": 13930 }, { "epoch": 1.0879575431202684, "grad_norm": 46.8470344543457, "learning_rate": 8.042080781409896e-05, "loss": 1.1768, "step": 13940 }, { "epoch": 1.0887380004682745, "grad_norm": 1.629309058189392, "learning_rate": 8.038476080575562e-05, "loss": 1.8716, "step": 13950 }, { "epoch": 1.0895184578162804, "grad_norm": 0.21702909469604492, "learning_rate": 8.034868874042412e-05, "loss": 0.9467, "step": 13960 }, { "epoch": 1.0902989151642863, "grad_norm": 3.1508551501246984e-07, "learning_rate": 8.031259164785155e-05, "loss": 1.7675, "step": 13970 }, { "epoch": 1.0910793725122923, "grad_norm": 1.32136070728302, "learning_rate": 8.027646955780556e-05, "loss": 1.9713, "step": 13980 }, { "epoch": 1.0918598298602982, "grad_norm": 0.06869912147521973, "learning_rate": 8.024032250007454e-05, "loss": 0.4424, "step": 13990 }, { "epoch": 1.092640287208304, "grad_norm": 15.436603546142578, "learning_rate": 8.020415050446732e-05, "loss": 0.2352, "step": 14000 }, { "epoch": 1.09342074455631, "grad_norm": 1.4183063507080078, "learning_rate": 8.016795360081342e-05, "loss": 0.7249, "step": 14010 }, { "epoch": 1.094201201904316, "grad_norm": 9.759514808654785, "learning_rate": 8.013173181896283e-05, "loss": 0.5102, "step": 14020 }, { "epoch": 1.0949816592523218, "grad_norm": 4.5370564460754395, "learning_rate": 8.009548518878606e-05, "loss": 0.3963, "step": 14030 }, { "epoch": 1.0957621166003277, "grad_norm": 1.1387583072064444e-06, "learning_rate": 8.005921374017415e-05, "loss": 2.1259, "step": 14040 }, { "epoch": 1.0965425739483337, "grad_norm": 10.59189510345459, "learning_rate": 8.002291750303857e-05, "loss": 1.1702, "step": 14050 }, { "epoch": 1.0973230312963396, "grad_norm": 10.202428817749023, "learning_rate": 7.998659650731125e-05, "loss": 1.0879, "step": 14060 }, { "epoch": 1.0981034886443455, "grad_norm": 37.752140045166016, "learning_rate": 7.995025078294452e-05, "loss": 1.529, "step": 14070 }, { "epoch": 1.0988839459923516, "grad_norm": 4.652754306793213, "learning_rate": 7.991388035991114e-05, "loss": 0.3956, "step": 14080 }, { "epoch": 1.0996644033403575, "grad_norm": 0.00034467552904970944, "learning_rate": 7.98774852682042e-05, "loss": 0.4728, "step": 14090 }, { "epoch": 1.1004448606883634, "grad_norm": 0.8711071014404297, "learning_rate": 7.984106553783712e-05, "loss": 0.3728, "step": 14100 }, { "epoch": 1.1012253180363694, "grad_norm": 1.9005073308944702, "learning_rate": 7.98046211988437e-05, "loss": 0.4657, "step": 14110 }, { "epoch": 1.1020057753843753, "grad_norm": 42.511497497558594, "learning_rate": 7.976815228127801e-05, "loss": 1.4052, "step": 14120 }, { "epoch": 1.1027862327323812, "grad_norm": 34.27906799316406, "learning_rate": 7.973165881521434e-05, "loss": 1.217, "step": 14130 }, { "epoch": 1.103566690080387, "grad_norm": 0.008204025216400623, "learning_rate": 7.969514083074727e-05, "loss": 2.7314, "step": 14140 }, { "epoch": 1.104347147428393, "grad_norm": 0.2557177245616913, "learning_rate": 7.965859835799162e-05, "loss": 0.9439, "step": 14150 }, { "epoch": 1.105127604776399, "grad_norm": 39.277400970458984, "learning_rate": 7.962203142708231e-05, "loss": 0.5969, "step": 14160 }, { "epoch": 1.1059080621244048, "grad_norm": 0.045487336814403534, "learning_rate": 7.958544006817456e-05, "loss": 0.869, "step": 14170 }, { "epoch": 1.1066885194724108, "grad_norm": 21.060131072998047, "learning_rate": 7.954882431144364e-05, "loss": 0.2066, "step": 14180 }, { "epoch": 1.1074689768204167, "grad_norm": 0.9694964289665222, "learning_rate": 7.951218418708497e-05, "loss": 1.0377, "step": 14190 }, { "epoch": 1.1082494341684228, "grad_norm": 7.772018909454346, "learning_rate": 7.947551972531409e-05, "loss": 0.0204, "step": 14200 }, { "epoch": 1.1090298915164287, "grad_norm": 93.92030334472656, "learning_rate": 7.943883095636652e-05, "loss": 2.4873, "step": 14210 }, { "epoch": 1.1098103488644346, "grad_norm": 72.38433837890625, "learning_rate": 7.940211791049796e-05, "loss": 1.1869, "step": 14220 }, { "epoch": 1.1105908062124406, "grad_norm": 59.901004791259766, "learning_rate": 7.936538061798403e-05, "loss": 5.1337, "step": 14230 }, { "epoch": 1.1113712635604465, "grad_norm": 0.010661332868039608, "learning_rate": 7.932861910912036e-05, "loss": 0.5729, "step": 14240 }, { "epoch": 1.1121517209084524, "grad_norm": 0.6803077459335327, "learning_rate": 7.92918334142226e-05, "loss": 1.349, "step": 14250 }, { "epoch": 1.1129321782564583, "grad_norm": 5.760603427886963, "learning_rate": 7.925502356362627e-05, "loss": 0.5931, "step": 14260 }, { "epoch": 1.1137126356044642, "grad_norm": 29.1840763092041, "learning_rate": 7.921818958768686e-05, "loss": 0.7213, "step": 14270 }, { "epoch": 1.1144930929524701, "grad_norm": 33.425418853759766, "learning_rate": 7.918133151677977e-05, "loss": 1.2351, "step": 14280 }, { "epoch": 1.115273550300476, "grad_norm": 34.22904968261719, "learning_rate": 7.914444938130021e-05, "loss": 0.8819, "step": 14290 }, { "epoch": 1.116054007648482, "grad_norm": 10.303384780883789, "learning_rate": 7.910754321166329e-05, "loss": 0.8157, "step": 14300 }, { "epoch": 1.1168344649964879, "grad_norm": 26.004243850708008, "learning_rate": 7.907061303830392e-05, "loss": 0.7456, "step": 14310 }, { "epoch": 1.1176149223444938, "grad_norm": 11.272562980651855, "learning_rate": 7.903365889167682e-05, "loss": 1.2884, "step": 14320 }, { "epoch": 1.1183953796924997, "grad_norm": 11.90373420715332, "learning_rate": 7.899668080225642e-05, "loss": 0.2785, "step": 14330 }, { "epoch": 1.1191758370405058, "grad_norm": 3.1206201356326346e-07, "learning_rate": 7.895967880053697e-05, "loss": 0.5356, "step": 14340 }, { "epoch": 1.1199562943885117, "grad_norm": 11.254258155822754, "learning_rate": 7.89226529170324e-05, "loss": 1.1941, "step": 14350 }, { "epoch": 1.1207367517365177, "grad_norm": 13.978715896606445, "learning_rate": 7.888560318227636e-05, "loss": 0.2811, "step": 14360 }, { "epoch": 1.1215172090845236, "grad_norm": 2.0998995751142502e-05, "learning_rate": 7.884852962682212e-05, "loss": 0.5678, "step": 14370 }, { "epoch": 1.1222976664325295, "grad_norm": 26.62897300720215, "learning_rate": 7.881143228124266e-05, "loss": 0.3172, "step": 14380 }, { "epoch": 1.1230781237805354, "grad_norm": 3.174713114617589e-08, "learning_rate": 7.87743111761305e-05, "loss": 1.003, "step": 14390 }, { "epoch": 1.1238585811285413, "grad_norm": 1.8061119318008423, "learning_rate": 7.873716634209784e-05, "loss": 2.5562, "step": 14400 }, { "epoch": 1.1246390384765472, "grad_norm": 31.073867797851562, "learning_rate": 7.869999780977641e-05, "loss": 0.6857, "step": 14410 }, { "epoch": 1.1254194958245531, "grad_norm": 51.120540618896484, "learning_rate": 7.866280560981745e-05, "loss": 2.2526, "step": 14420 }, { "epoch": 1.126199953172559, "grad_norm": 0.10467230528593063, "learning_rate": 7.862558977289174e-05, "loss": 0.7654, "step": 14430 }, { "epoch": 1.126980410520565, "grad_norm": 0.00010914414451690391, "learning_rate": 7.858835032968959e-05, "loss": 1.0868, "step": 14440 }, { "epoch": 1.127760867868571, "grad_norm": 43.196014404296875, "learning_rate": 7.855108731092073e-05, "loss": 0.9148, "step": 14450 }, { "epoch": 1.128541325216577, "grad_norm": 0.3297148644924164, "learning_rate": 7.851380074731433e-05, "loss": 0.58, "step": 14460 }, { "epoch": 1.129321782564583, "grad_norm": 13.183833122253418, "learning_rate": 7.847649066961904e-05, "loss": 0.161, "step": 14470 }, { "epoch": 1.1301022399125888, "grad_norm": 41.99236297607422, "learning_rate": 7.84391571086028e-05, "loss": 0.8755, "step": 14480 }, { "epoch": 1.1308826972605948, "grad_norm": 8.9720373352975e-07, "learning_rate": 7.840180009505303e-05, "loss": 0.098, "step": 14490 }, { "epoch": 1.1316631546086007, "grad_norm": 35.03562545776367, "learning_rate": 7.83644196597764e-05, "loss": 0.2565, "step": 14500 }, { "epoch": 1.1324436119566066, "grad_norm": 4.2327664573349466e-07, "learning_rate": 7.83270158335989e-05, "loss": 0.9135, "step": 14510 }, { "epoch": 1.1332240693046125, "grad_norm": 0.0076245772652328014, "learning_rate": 7.828958864736587e-05, "loss": 0.2073, "step": 14520 }, { "epoch": 1.1340045266526184, "grad_norm": 4.616389936984433e-09, "learning_rate": 7.825213813194187e-05, "loss": 0.096, "step": 14530 }, { "epoch": 1.1347849840006243, "grad_norm": 25.069517135620117, "learning_rate": 7.821466431821072e-05, "loss": 0.9064, "step": 14540 }, { "epoch": 1.1355654413486302, "grad_norm": 6.9510321617126465, "learning_rate": 7.817716723707545e-05, "loss": 1.6894, "step": 14550 }, { "epoch": 1.1363458986966362, "grad_norm": 43.50651550292969, "learning_rate": 7.813964691945821e-05, "loss": 2.6459, "step": 14560 }, { "epoch": 1.137126356044642, "grad_norm": 35.28486633300781, "learning_rate": 7.810210339630042e-05, "loss": 1.4862, "step": 14570 }, { "epoch": 1.137906813392648, "grad_norm": 5.5574896578036714e-06, "learning_rate": 7.806453669856259e-05, "loss": 0.907, "step": 14580 }, { "epoch": 1.138687270740654, "grad_norm": 38.928470611572266, "learning_rate": 7.802694685722431e-05, "loss": 0.6483, "step": 14590 }, { "epoch": 1.13946772808866, "grad_norm": 1.095226526260376, "learning_rate": 7.798933390328431e-05, "loss": 0.4901, "step": 14600 }, { "epoch": 1.140248185436666, "grad_norm": 0.0013487986288964748, "learning_rate": 7.795169786776035e-05, "loss": 0.9132, "step": 14610 }, { "epoch": 1.1410286427846719, "grad_norm": 15.737239837646484, "learning_rate": 7.791403878168922e-05, "loss": 1.3128, "step": 14620 }, { "epoch": 1.1418091001326778, "grad_norm": 0.00030404122662730515, "learning_rate": 7.787635667612674e-05, "loss": 0.4789, "step": 14630 }, { "epoch": 1.1425895574806837, "grad_norm": 0.6297647953033447, "learning_rate": 7.783865158214768e-05, "loss": 0.0293, "step": 14640 }, { "epoch": 1.1433700148286896, "grad_norm": 46.85776138305664, "learning_rate": 7.780092353084579e-05, "loss": 2.0859, "step": 14650 }, { "epoch": 1.1441504721766955, "grad_norm": 0.00015316448116209358, "learning_rate": 7.776317255333376e-05, "loss": 1.1816, "step": 14660 }, { "epoch": 1.1449309295247014, "grad_norm": 6.184297561645508, "learning_rate": 7.772539868074318e-05, "loss": 1.1873, "step": 14670 }, { "epoch": 1.1457113868727073, "grad_norm": 47.606781005859375, "learning_rate": 7.76876019442245e-05, "loss": 1.8268, "step": 14680 }, { "epoch": 1.1464918442207133, "grad_norm": 0.004689674358814955, "learning_rate": 7.764978237494707e-05, "loss": 0.6779, "step": 14690 }, { "epoch": 1.1472723015687192, "grad_norm": 41.07505416870117, "learning_rate": 7.761194000409901e-05, "loss": 1.2738, "step": 14700 }, { "epoch": 1.1480527589167253, "grad_norm": 0.14501197636127472, "learning_rate": 7.75740748628873e-05, "loss": 1.1459, "step": 14710 }, { "epoch": 1.1488332162647312, "grad_norm": 0.05744193121790886, "learning_rate": 7.753618698253765e-05, "loss": 1.2282, "step": 14720 }, { "epoch": 1.1496136736127371, "grad_norm": 8.063496589660645, "learning_rate": 7.749827639429456e-05, "loss": 0.6149, "step": 14730 }, { "epoch": 1.150394130960743, "grad_norm": 0.5605101585388184, "learning_rate": 7.746034312942123e-05, "loss": 0.3887, "step": 14740 }, { "epoch": 1.151174588308749, "grad_norm": 20.27739143371582, "learning_rate": 7.742238721919957e-05, "loss": 1.3573, "step": 14750 }, { "epoch": 1.1519550456567549, "grad_norm": 2.61468768119812, "learning_rate": 7.738440869493018e-05, "loss": 0.6847, "step": 14760 }, { "epoch": 1.1527355030047608, "grad_norm": 0.26672032475471497, "learning_rate": 7.734640758793225e-05, "loss": 0.9144, "step": 14770 }, { "epoch": 1.1535159603527667, "grad_norm": 20.54535484313965, "learning_rate": 7.73083839295437e-05, "loss": 0.2751, "step": 14780 }, { "epoch": 1.1542964177007726, "grad_norm": 1.1103615760803223, "learning_rate": 7.727033775112096e-05, "loss": 0.947, "step": 14790 }, { "epoch": 1.1550768750487785, "grad_norm": 8.565208435058594, "learning_rate": 7.723226908403902e-05, "loss": 1.2774, "step": 14800 }, { "epoch": 1.1558573323967845, "grad_norm": 0.001084254472516477, "learning_rate": 7.71941779596915e-05, "loss": 0.5417, "step": 14810 }, { "epoch": 1.1566377897447904, "grad_norm": 0.0005097747780382633, "learning_rate": 7.715606440949045e-05, "loss": 0.9245, "step": 14820 }, { "epoch": 1.1574182470927963, "grad_norm": 0.0013373733963817358, "learning_rate": 7.71179284648665e-05, "loss": 1.8957, "step": 14830 }, { "epoch": 1.1581987044408022, "grad_norm": 3.3863561153411865, "learning_rate": 7.707977015726869e-05, "loss": 0.3075, "step": 14840 }, { "epoch": 1.1589791617888083, "grad_norm": 41.45360565185547, "learning_rate": 7.704158951816446e-05, "loss": 1.0029, "step": 14850 }, { "epoch": 1.1597596191368142, "grad_norm": 28.2884578704834, "learning_rate": 7.700338657903977e-05, "loss": 0.4945, "step": 14860 }, { "epoch": 1.1605400764848202, "grad_norm": 5.369661331176758, "learning_rate": 7.69651613713989e-05, "loss": 0.1512, "step": 14870 }, { "epoch": 1.161320533832826, "grad_norm": 52.72662353515625, "learning_rate": 7.692691392676454e-05, "loss": 1.0431, "step": 14880 }, { "epoch": 1.162100991180832, "grad_norm": 64.16173553466797, "learning_rate": 7.688864427667768e-05, "loss": 1.918, "step": 14890 }, { "epoch": 1.162881448528838, "grad_norm": 0.009893955662846565, "learning_rate": 7.68503524526976e-05, "loss": 1.4701, "step": 14900 }, { "epoch": 1.1636619058768438, "grad_norm": 29.174976348876953, "learning_rate": 7.681203848640193e-05, "loss": 2.4439, "step": 14910 }, { "epoch": 1.1644423632248497, "grad_norm": 0.05682931840419769, "learning_rate": 7.677370240938653e-05, "loss": 1.2603, "step": 14920 }, { "epoch": 1.1652228205728556, "grad_norm": 8.036317825317383, "learning_rate": 7.673534425326548e-05, "loss": 1.681, "step": 14930 }, { "epoch": 1.1660032779208616, "grad_norm": 48.152549743652344, "learning_rate": 7.669696404967106e-05, "loss": 0.8372, "step": 14940 }, { "epoch": 1.1667837352688675, "grad_norm": 0.025480227544903755, "learning_rate": 7.66585618302538e-05, "loss": 0.4711, "step": 14950 }, { "epoch": 1.1675641926168736, "grad_norm": 0.004622321110218763, "learning_rate": 7.66201376266823e-05, "loss": 0.3303, "step": 14960 }, { "epoch": 1.1683446499648795, "grad_norm": 21.413469314575195, "learning_rate": 7.658169147064333e-05, "loss": 1.4434, "step": 14970 }, { "epoch": 1.1691251073128854, "grad_norm": 42.26973342895508, "learning_rate": 7.654322339384178e-05, "loss": 0.4439, "step": 14980 }, { "epoch": 1.1699055646608914, "grad_norm": 56.95500946044922, "learning_rate": 7.65047334280006e-05, "loss": 1.851, "step": 14990 }, { "epoch": 1.1706860220088973, "grad_norm": 0.0011174381943419576, "learning_rate": 7.646622160486075e-05, "loss": 1.7686, "step": 15000 }, { "epoch": 1.1714664793569032, "grad_norm": 0.0004623864952009171, "learning_rate": 7.642768795618129e-05, "loss": 0.0741, "step": 15010 }, { "epoch": 1.172246936704909, "grad_norm": 1.7365893654641695e-05, "learning_rate": 7.63891325137392e-05, "loss": 1.4602, "step": 15020 }, { "epoch": 1.173027394052915, "grad_norm": 10.92381477355957, "learning_rate": 7.635055530932951e-05, "loss": 0.2436, "step": 15030 }, { "epoch": 1.173807851400921, "grad_norm": 0.46861162781715393, "learning_rate": 7.631195637476516e-05, "loss": 1.6745, "step": 15040 }, { "epoch": 1.1745883087489268, "grad_norm": 0.00027051439974457026, "learning_rate": 7.627333574187697e-05, "loss": 0.6537, "step": 15050 }, { "epoch": 1.1753687660969327, "grad_norm": 2.5070583820343018, "learning_rate": 7.623469344251373e-05, "loss": 0.7209, "step": 15060 }, { "epoch": 1.1761492234449387, "grad_norm": 16.99152946472168, "learning_rate": 7.619602950854205e-05, "loss": 0.4085, "step": 15070 }, { "epoch": 1.1769296807929446, "grad_norm": 0.03525177016854286, "learning_rate": 7.61573439718464e-05, "loss": 0.1498, "step": 15080 }, { "epoch": 1.1777101381409505, "grad_norm": 43.2532958984375, "learning_rate": 7.611863686432903e-05, "loss": 2.3887, "step": 15090 }, { "epoch": 1.1784905954889564, "grad_norm": 1.2691663187069935e-07, "learning_rate": 7.607990821791005e-05, "loss": 0.6657, "step": 15100 }, { "epoch": 1.1792710528369625, "grad_norm": 46.22968292236328, "learning_rate": 7.604115806452723e-05, "loss": 1.4542, "step": 15110 }, { "epoch": 1.1800515101849685, "grad_norm": 8.150375651894137e-05, "learning_rate": 7.600238643613618e-05, "loss": 0.601, "step": 15120 }, { "epoch": 1.1808319675329744, "grad_norm": 1.4502999782562256, "learning_rate": 7.596359336471015e-05, "loss": 0.3452, "step": 15130 }, { "epoch": 1.1816124248809803, "grad_norm": 19.779787063598633, "learning_rate": 7.59247788822401e-05, "loss": 1.2247, "step": 15140 }, { "epoch": 1.1823928822289862, "grad_norm": 31.30126190185547, "learning_rate": 7.588594302073464e-05, "loss": 0.3812, "step": 15150 }, { "epoch": 1.1831733395769921, "grad_norm": 4.278034210205078, "learning_rate": 7.584708581222002e-05, "loss": 1.2073, "step": 15160 }, { "epoch": 1.183953796924998, "grad_norm": 3.023759290954331e-06, "learning_rate": 7.580820728874008e-05, "loss": 2.7868, "step": 15170 }, { "epoch": 1.184734254273004, "grad_norm": 0.6000422835350037, "learning_rate": 7.576930748235624e-05, "loss": 0.5173, "step": 15180 }, { "epoch": 1.1855147116210099, "grad_norm": 47.56028747558594, "learning_rate": 7.573038642514748e-05, "loss": 2.4751, "step": 15190 }, { "epoch": 1.1862951689690158, "grad_norm": 23.9864559173584, "learning_rate": 7.569144414921031e-05, "loss": 0.9667, "step": 15200 }, { "epoch": 1.1870756263170217, "grad_norm": 13.402782440185547, "learning_rate": 7.565248068665872e-05, "loss": 0.934, "step": 15210 }, { "epoch": 1.1878560836650278, "grad_norm": 14.633162498474121, "learning_rate": 7.561349606962416e-05, "loss": 0.6039, "step": 15220 }, { "epoch": 1.1886365410130337, "grad_norm": 10.086926460266113, "learning_rate": 7.557449033025558e-05, "loss": 1.0448, "step": 15230 }, { "epoch": 1.1894169983610396, "grad_norm": 12.034710884094238, "learning_rate": 7.553546350071928e-05, "loss": 0.4767, "step": 15240 }, { "epoch": 1.1901974557090456, "grad_norm": 7.936996553326026e-06, "learning_rate": 7.549641561319902e-05, "loss": 0.0658, "step": 15250 }, { "epoch": 1.1909779130570515, "grad_norm": 5.2773613929748535, "learning_rate": 7.545734669989586e-05, "loss": 1.2455, "step": 15260 }, { "epoch": 1.1917583704050574, "grad_norm": 10.09801959991455, "learning_rate": 7.541825679302825e-05, "loss": 1.4523, "step": 15270 }, { "epoch": 1.1925388277530633, "grad_norm": 0.3290392756462097, "learning_rate": 7.537914592483194e-05, "loss": 0.4414, "step": 15280 }, { "epoch": 1.1933192851010692, "grad_norm": 0.00040467019425705075, "learning_rate": 7.534001412755991e-05, "loss": 0.831, "step": 15290 }, { "epoch": 1.1940997424490751, "grad_norm": 4.611071586608887, "learning_rate": 7.53008614334825e-05, "loss": 1.8377, "step": 15300 }, { "epoch": 1.194880199797081, "grad_norm": 0.007054849527776241, "learning_rate": 7.526168787488721e-05, "loss": 1.1124, "step": 15310 }, { "epoch": 1.195660657145087, "grad_norm": 19.00647735595703, "learning_rate": 7.522249348407879e-05, "loss": 0.4197, "step": 15320 }, { "epoch": 1.1964411144930929, "grad_norm": 0.24617056548595428, "learning_rate": 7.518327829337912e-05, "loss": 0.3874, "step": 15330 }, { "epoch": 1.1972215718410988, "grad_norm": 43.505096435546875, "learning_rate": 7.514404233512725e-05, "loss": 1.059, "step": 15340 }, { "epoch": 1.1980020291891047, "grad_norm": 1.2489773035049438, "learning_rate": 7.51047856416794e-05, "loss": 0.3069, "step": 15350 }, { "epoch": 1.1987824865371108, "grad_norm": 50.59429931640625, "learning_rate": 7.506550824540881e-05, "loss": 0.1905, "step": 15360 }, { "epoch": 1.1995629438851168, "grad_norm": 49.17162322998047, "learning_rate": 7.502621017870588e-05, "loss": 1.3437, "step": 15370 }, { "epoch": 1.2003434012331227, "grad_norm": 17.885541915893555, "learning_rate": 7.498689147397799e-05, "loss": 1.0513, "step": 15380 }, { "epoch": 1.2011238585811286, "grad_norm": 2.4361978034903586e-07, "learning_rate": 7.494755216364956e-05, "loss": 0.3555, "step": 15390 }, { "epoch": 1.2019043159291345, "grad_norm": 60.8936653137207, "learning_rate": 7.490819228016202e-05, "loss": 1.7618, "step": 15400 }, { "epoch": 1.2026847732771404, "grad_norm": 0.6997115612030029, "learning_rate": 7.486881185597373e-05, "loss": 0.5708, "step": 15410 }, { "epoch": 1.2034652306251463, "grad_norm": 0.29672977328300476, "learning_rate": 7.482941092356004e-05, "loss": 0.4767, "step": 15420 }, { "epoch": 1.2042456879731522, "grad_norm": 0.3411640226840973, "learning_rate": 7.478998951541316e-05, "loss": 2.2572, "step": 15430 }, { "epoch": 1.2050261453211581, "grad_norm": 3.5091769695281982, "learning_rate": 7.475054766404221e-05, "loss": 2.2157, "step": 15440 }, { "epoch": 1.205806602669164, "grad_norm": 0.06144566833972931, "learning_rate": 7.471108540197316e-05, "loss": 1.5861, "step": 15450 }, { "epoch": 1.20658706001717, "grad_norm": 2.090874671936035, "learning_rate": 7.467160276174884e-05, "loss": 0.5432, "step": 15460 }, { "epoch": 1.2073675173651761, "grad_norm": 0.16866926848888397, "learning_rate": 7.463209977592884e-05, "loss": 0.3952, "step": 15470 }, { "epoch": 1.208147974713182, "grad_norm": 46.683372497558594, "learning_rate": 7.459257647708955e-05, "loss": 1.6898, "step": 15480 }, { "epoch": 1.208928432061188, "grad_norm": 3.992119312286377, "learning_rate": 7.455303289782414e-05, "loss": 0.6744, "step": 15490 }, { "epoch": 1.2097088894091939, "grad_norm": 43.23140335083008, "learning_rate": 7.451346907074245e-05, "loss": 1.7335, "step": 15500 }, { "epoch": 1.2104893467571998, "grad_norm": 44.27477264404297, "learning_rate": 7.447388502847106e-05, "loss": 0.7387, "step": 15510 }, { "epoch": 1.2112698041052057, "grad_norm": 0.1679820567369461, "learning_rate": 7.443428080365318e-05, "loss": 1.7443, "step": 15520 }, { "epoch": 1.2120502614532116, "grad_norm": 7.058843766571954e-05, "learning_rate": 7.439465642894872e-05, "loss": 0.9857, "step": 15530 }, { "epoch": 1.2128307188012175, "grad_norm": 0.11891741305589676, "learning_rate": 7.435501193703415e-05, "loss": 0.2799, "step": 15540 }, { "epoch": 1.2136111761492234, "grad_norm": 20.137723922729492, "learning_rate": 7.431534736060257e-05, "loss": 1.0802, "step": 15550 }, { "epoch": 1.2143916334972293, "grad_norm": 4.478515148162842, "learning_rate": 7.427566273236363e-05, "loss": 1.1262, "step": 15560 }, { "epoch": 1.2151720908452353, "grad_norm": 37.74848175048828, "learning_rate": 7.42359580850435e-05, "loss": 1.6794, "step": 15570 }, { "epoch": 1.2159525481932412, "grad_norm": 0.0001180763283628039, "learning_rate": 7.419623345138488e-05, "loss": 1.0961, "step": 15580 }, { "epoch": 1.216733005541247, "grad_norm": 6.324615242192522e-05, "learning_rate": 7.415648886414694e-05, "loss": 0.0529, "step": 15590 }, { "epoch": 1.217513462889253, "grad_norm": 5.168887615203857, "learning_rate": 7.411672435610531e-05, "loss": 0.17, "step": 15600 }, { "epoch": 1.218293920237259, "grad_norm": 0.005032880697399378, "learning_rate": 7.407693996005207e-05, "loss": 0.7068, "step": 15610 }, { "epoch": 1.219074377585265, "grad_norm": 0.0009420686401426792, "learning_rate": 7.403713570879565e-05, "loss": 0.9126, "step": 15620 }, { "epoch": 1.219854834933271, "grad_norm": 8.980721473693848, "learning_rate": 7.399731163516088e-05, "loss": 0.2624, "step": 15630 }, { "epoch": 1.2206352922812769, "grad_norm": 23.076135635375977, "learning_rate": 7.395746777198895e-05, "loss": 1.3006, "step": 15640 }, { "epoch": 1.2214157496292828, "grad_norm": 0.07298509776592255, "learning_rate": 7.391760415213735e-05, "loss": 0.6209, "step": 15650 }, { "epoch": 1.2221962069772887, "grad_norm": 2.6981508653989295e-06, "learning_rate": 7.387772080847988e-05, "loss": 0.787, "step": 15660 }, { "epoch": 1.2229766643252946, "grad_norm": 0.2646944522857666, "learning_rate": 7.383781777390658e-05, "loss": 1.1545, "step": 15670 }, { "epoch": 1.2237571216733005, "grad_norm": 0.16488932073116302, "learning_rate": 7.379789508132377e-05, "loss": 0.5825, "step": 15680 }, { "epoch": 1.2245375790213064, "grad_norm": 0.34593504667282104, "learning_rate": 7.375795276365392e-05, "loss": 1.3832, "step": 15690 }, { "epoch": 1.2253180363693124, "grad_norm": 0.3061760365962982, "learning_rate": 7.371799085383575e-05, "loss": 1.0564, "step": 15700 }, { "epoch": 1.2260984937173183, "grad_norm": 0.015735339373350143, "learning_rate": 7.367800938482409e-05, "loss": 1.1702, "step": 15710 }, { "epoch": 1.2268789510653242, "grad_norm": 19.672164916992188, "learning_rate": 7.363800838958991e-05, "loss": 1.6207, "step": 15720 }, { "epoch": 1.2276594084133303, "grad_norm": 0.003373056184500456, "learning_rate": 7.359798790112033e-05, "loss": 0.4638, "step": 15730 }, { "epoch": 1.2284398657613362, "grad_norm": 0.04260651394724846, "learning_rate": 7.355794795241844e-05, "loss": 1.188, "step": 15740 }, { "epoch": 1.2292203231093422, "grad_norm": 36.1136360168457, "learning_rate": 7.351788857650345e-05, "loss": 0.5226, "step": 15750 }, { "epoch": 1.230000780457348, "grad_norm": 9.62464714050293, "learning_rate": 7.347780980641064e-05, "loss": 0.5578, "step": 15760 }, { "epoch": 1.230781237805354, "grad_norm": 26.886932373046875, "learning_rate": 7.343771167519117e-05, "loss": 1.4843, "step": 15770 }, { "epoch": 1.23156169515336, "grad_norm": 1.5477793567697518e-05, "learning_rate": 7.339759421591224e-05, "loss": 0.2607, "step": 15780 }, { "epoch": 1.2323421525013658, "grad_norm": 20.806283950805664, "learning_rate": 7.335745746165696e-05, "loss": 1.0338, "step": 15790 }, { "epoch": 1.2331226098493717, "grad_norm": 0.18499815464019775, "learning_rate": 7.331730144552437e-05, "loss": 0.5777, "step": 15800 }, { "epoch": 1.2339030671973776, "grad_norm": 15.987350463867188, "learning_rate": 7.32771262006294e-05, "loss": 0.1239, "step": 15810 }, { "epoch": 1.2346835245453835, "grad_norm": 51.00580978393555, "learning_rate": 7.32369317601028e-05, "loss": 0.6431, "step": 15820 }, { "epoch": 1.2354639818933895, "grad_norm": 28.59132194519043, "learning_rate": 7.319671815709119e-05, "loss": 1.099, "step": 15830 }, { "epoch": 1.2362444392413954, "grad_norm": 1.3742252588272095, "learning_rate": 7.315648542475698e-05, "loss": 0.1693, "step": 15840 }, { "epoch": 1.2370248965894013, "grad_norm": 0.00036608395748771727, "learning_rate": 7.311623359627833e-05, "loss": 0.1572, "step": 15850 }, { "epoch": 1.2378053539374072, "grad_norm": 12.284563064575195, "learning_rate": 7.307596270484918e-05, "loss": 0.277, "step": 15860 }, { "epoch": 1.2385858112854133, "grad_norm": 0.00085182033944875, "learning_rate": 7.303567278367917e-05, "loss": 1.9326, "step": 15870 }, { "epoch": 1.2393662686334193, "grad_norm": 0.0015476574189960957, "learning_rate": 7.299536386599367e-05, "loss": 0.3339, "step": 15880 }, { "epoch": 1.2401467259814252, "grad_norm": 11.180502891540527, "learning_rate": 7.295503598503366e-05, "loss": 1.0722, "step": 15890 }, { "epoch": 1.240927183329431, "grad_norm": 1.789854832168203e-05, "learning_rate": 7.29146891740558e-05, "loss": 1.0567, "step": 15900 }, { "epoch": 1.241707640677437, "grad_norm": 0.014964636415243149, "learning_rate": 7.287432346633233e-05, "loss": 0.629, "step": 15910 }, { "epoch": 1.242488098025443, "grad_norm": 1.1818044640676817e-06, "learning_rate": 7.283393889515112e-05, "loss": 0.0488, "step": 15920 }, { "epoch": 1.2432685553734488, "grad_norm": 45.90778350830078, "learning_rate": 7.279353549381554e-05, "loss": 2.102, "step": 15930 }, { "epoch": 1.2440490127214547, "grad_norm": 0.10997821390628815, "learning_rate": 7.275311329564453e-05, "loss": 4.0926, "step": 15940 }, { "epoch": 1.2448294700694607, "grad_norm": 4.090299606323242, "learning_rate": 7.27126723339725e-05, "loss": 0.3856, "step": 15950 }, { "epoch": 1.2456099274174666, "grad_norm": 24.23040008544922, "learning_rate": 7.267221264214936e-05, "loss": 0.1491, "step": 15960 }, { "epoch": 1.2463903847654725, "grad_norm": 65.79607391357422, "learning_rate": 7.263173425354045e-05, "loss": 1.4866, "step": 15970 }, { "epoch": 1.2471708421134786, "grad_norm": 1.5714404582977295, "learning_rate": 7.259123720152652e-05, "loss": 1.1359, "step": 15980 }, { "epoch": 1.2479512994614845, "grad_norm": 50.30306625366211, "learning_rate": 7.255072151950376e-05, "loss": 1.654, "step": 15990 }, { "epoch": 1.2487317568094904, "grad_norm": 5.232126568444073e-05, "learning_rate": 7.251018724088367e-05, "loss": 1.5325, "step": 16000 }, { "epoch": 1.2495122141574964, "grad_norm": 28.464221954345703, "learning_rate": 7.246963439909309e-05, "loss": 1.8179, "step": 16010 }, { "epoch": 1.2502926715055023, "grad_norm": 0.07274913042783737, "learning_rate": 7.24290630275742e-05, "loss": 0.0345, "step": 16020 }, { "epoch": 1.2510731288535082, "grad_norm": 0.0002677069860510528, "learning_rate": 7.238847315978442e-05, "loss": 0.5961, "step": 16030 }, { "epoch": 1.251853586201514, "grad_norm": 10.164164543151855, "learning_rate": 7.234786482919646e-05, "loss": 0.55, "step": 16040 }, { "epoch": 1.25263404354952, "grad_norm": 45.208030700683594, "learning_rate": 7.230723806929824e-05, "loss": 0.8216, "step": 16050 }, { "epoch": 1.253414500897526, "grad_norm": 0.022487549111247063, "learning_rate": 7.226659291359287e-05, "loss": 1.2431, "step": 16060 }, { "epoch": 1.2541949582455318, "grad_norm": 15.880640029907227, "learning_rate": 7.222592939559867e-05, "loss": 0.0487, "step": 16070 }, { "epoch": 1.2549754155935378, "grad_norm": 2.3655835320823826e-05, "learning_rate": 7.218524754884903e-05, "loss": 0.1528, "step": 16080 }, { "epoch": 1.2557558729415437, "grad_norm": 0.33056506514549255, "learning_rate": 7.214454740689251e-05, "loss": 1.1647, "step": 16090 }, { "epoch": 1.2565363302895496, "grad_norm": 55.99821090698242, "learning_rate": 7.210382900329275e-05, "loss": 1.3796, "step": 16100 }, { "epoch": 1.2573167876375555, "grad_norm": 5.380942457122728e-05, "learning_rate": 7.206309237162844e-05, "loss": 1.2422, "step": 16110 }, { "epoch": 1.2580972449855614, "grad_norm": 28.779945373535156, "learning_rate": 7.202233754549333e-05, "loss": 0.417, "step": 16120 }, { "epoch": 1.2588777023335675, "grad_norm": 0.0022038770839571953, "learning_rate": 7.198156455849609e-05, "loss": 0.9051, "step": 16130 }, { "epoch": 1.2596581596815735, "grad_norm": 0.0036659492179751396, "learning_rate": 7.194077344426048e-05, "loss": 1.3457, "step": 16140 }, { "epoch": 1.2604386170295794, "grad_norm": 36.93067932128906, "learning_rate": 7.189996423642513e-05, "loss": 1.5451, "step": 16150 }, { "epoch": 1.2612190743775853, "grad_norm": 0.04987495392560959, "learning_rate": 7.185913696864361e-05, "loss": 1.13, "step": 16160 }, { "epoch": 1.2619995317255912, "grad_norm": 0.07304591685533524, "learning_rate": 7.181829167458441e-05, "loss": 1.3668, "step": 16170 }, { "epoch": 1.2627799890735971, "grad_norm": 1.3673689365386963, "learning_rate": 7.177742838793083e-05, "loss": 1.069, "step": 16180 }, { "epoch": 1.263560446421603, "grad_norm": 16.000823974609375, "learning_rate": 7.173654714238109e-05, "loss": 0.9112, "step": 16190 }, { "epoch": 1.264340903769609, "grad_norm": 1.9647473096847534, "learning_rate": 7.169564797164814e-05, "loss": 0.0715, "step": 16200 }, { "epoch": 1.2651213611176149, "grad_norm": 1.1932563781738281, "learning_rate": 7.165473090945975e-05, "loss": 0.2619, "step": 16210 }, { "epoch": 1.2659018184656208, "grad_norm": 2.668316602706909, "learning_rate": 7.161379598955843e-05, "loss": 0.5574, "step": 16220 }, { "epoch": 1.266682275813627, "grad_norm": 1.0716549425637822e-08, "learning_rate": 7.157284324570144e-05, "loss": 0.6693, "step": 16230 }, { "epoch": 1.2674627331616328, "grad_norm": 9.179781773127615e-05, "learning_rate": 7.153187271166071e-05, "loss": 1.299, "step": 16240 }, { "epoch": 1.2682431905096387, "grad_norm": 59.244930267333984, "learning_rate": 7.149088442122284e-05, "loss": 1.2305, "step": 16250 }, { "epoch": 1.2690236478576447, "grad_norm": 18.719036102294922, "learning_rate": 7.144987840818914e-05, "loss": 0.2246, "step": 16260 }, { "epoch": 1.2698041052056506, "grad_norm": 0.21231059730052948, "learning_rate": 7.140885470637542e-05, "loss": 1.4923, "step": 16270 }, { "epoch": 1.2705845625536565, "grad_norm": 3.356634579176898e-07, "learning_rate": 7.136781334961219e-05, "loss": 0.5428, "step": 16280 }, { "epoch": 1.2713650199016624, "grad_norm": 18.25281524658203, "learning_rate": 7.132675437174443e-05, "loss": 0.8539, "step": 16290 }, { "epoch": 1.2721454772496683, "grad_norm": 3.0588083177462977e-07, "learning_rate": 7.128567780663171e-05, "loss": 0.1291, "step": 16300 }, { "epoch": 1.2729259345976742, "grad_norm": 32.88163757324219, "learning_rate": 7.124458368814809e-05, "loss": 2.0664, "step": 16310 }, { "epoch": 1.2737063919456801, "grad_norm": 3.1073005199432373, "learning_rate": 7.120347205018208e-05, "loss": 1.4382, "step": 16320 }, { "epoch": 1.274486849293686, "grad_norm": 48.93598175048828, "learning_rate": 7.116234292663667e-05, "loss": 1.4327, "step": 16330 }, { "epoch": 1.275267306641692, "grad_norm": 4.686527554920161e-11, "learning_rate": 7.112119635142923e-05, "loss": 0.0149, "step": 16340 }, { "epoch": 1.2760477639896979, "grad_norm": 2.506943702697754, "learning_rate": 7.108003235849158e-05, "loss": 0.9217, "step": 16350 }, { "epoch": 1.2768282213377038, "grad_norm": 0.007850771769881248, "learning_rate": 7.103885098176987e-05, "loss": 1.2819, "step": 16360 }, { "epoch": 1.2776086786857097, "grad_norm": 1.201541543006897, "learning_rate": 7.099765225522456e-05, "loss": 0.5083, "step": 16370 }, { "epoch": 1.2783891360337156, "grad_norm": 1.7927274703979492, "learning_rate": 7.095643621283045e-05, "loss": 0.7671, "step": 16380 }, { "epoch": 1.2791695933817218, "grad_norm": 49.63917541503906, "learning_rate": 7.091520288857665e-05, "loss": 2.2272, "step": 16390 }, { "epoch": 1.2799500507297277, "grad_norm": 42.62212371826172, "learning_rate": 7.087395231646645e-05, "loss": 1.0197, "step": 16400 }, { "epoch": 1.2807305080777336, "grad_norm": 16.70367431640625, "learning_rate": 7.08326845305174e-05, "loss": 0.5912, "step": 16410 }, { "epoch": 1.2815109654257395, "grad_norm": 0.021338213235139847, "learning_rate": 7.079139956476126e-05, "loss": 0.1044, "step": 16420 }, { "epoch": 1.2822914227737454, "grad_norm": 0.017756953835487366, "learning_rate": 7.075009745324395e-05, "loss": 1.3735, "step": 16430 }, { "epoch": 1.2830718801217513, "grad_norm": 7.095729351043701, "learning_rate": 7.070877823002547e-05, "loss": 0.3988, "step": 16440 }, { "epoch": 1.2838523374697572, "grad_norm": 38.567604064941406, "learning_rate": 7.066744192918005e-05, "loss": 1.1289, "step": 16450 }, { "epoch": 1.2846327948177632, "grad_norm": 0.00015506644558627158, "learning_rate": 7.06260885847959e-05, "loss": 0.3998, "step": 16460 }, { "epoch": 1.285413252165769, "grad_norm": 16.752622604370117, "learning_rate": 7.058471823097533e-05, "loss": 0.2616, "step": 16470 }, { "epoch": 1.2861937095137752, "grad_norm": 2.3494606018066406, "learning_rate": 7.054333090183465e-05, "loss": 4.6909, "step": 16480 }, { "epoch": 1.2869741668617811, "grad_norm": 46.26172637939453, "learning_rate": 7.050192663150422e-05, "loss": 1.0096, "step": 16490 }, { "epoch": 1.287754624209787, "grad_norm": 1.201383352279663, "learning_rate": 7.046050545412831e-05, "loss": 1.0108, "step": 16500 }, { "epoch": 1.288535081557793, "grad_norm": 5.515385055332445e-07, "learning_rate": 7.041906740386518e-05, "loss": 1.6396, "step": 16510 }, { "epoch": 1.2893155389057989, "grad_norm": 7.240560531616211, "learning_rate": 7.037761251488696e-05, "loss": 0.1567, "step": 16520 }, { "epoch": 1.2900959962538048, "grad_norm": 1.1850215196609497, "learning_rate": 7.03361408213797e-05, "loss": 0.5309, "step": 16530 }, { "epoch": 1.2908764536018107, "grad_norm": 0.10555583238601685, "learning_rate": 7.029465235754331e-05, "loss": 0.4628, "step": 16540 }, { "epoch": 1.2916569109498166, "grad_norm": 46.775550842285156, "learning_rate": 7.025314715759153e-05, "loss": 1.839, "step": 16550 }, { "epoch": 1.2924373682978225, "grad_norm": 8.276897430419922, "learning_rate": 7.021162525575183e-05, "loss": 0.0247, "step": 16560 }, { "epoch": 1.2932178256458284, "grad_norm": 62.0821533203125, "learning_rate": 7.017008668626557e-05, "loss": 1.0257, "step": 16570 }, { "epoch": 1.2939982829938343, "grad_norm": 61.38323211669922, "learning_rate": 7.01285314833878e-05, "loss": 0.9119, "step": 16580 }, { "epoch": 1.2947787403418403, "grad_norm": 24.953426361083984, "learning_rate": 7.008695968138725e-05, "loss": 1.2479, "step": 16590 }, { "epoch": 1.2955591976898462, "grad_norm": 0.4902319014072418, "learning_rate": 7.004537131454638e-05, "loss": 0.7, "step": 16600 }, { "epoch": 1.296339655037852, "grad_norm": 0.020014774054288864, "learning_rate": 7.000376641716133e-05, "loss": 0.0599, "step": 16610 }, { "epoch": 1.297120112385858, "grad_norm": 10.10654067993164, "learning_rate": 6.996214502354183e-05, "loss": 0.853, "step": 16620 }, { "epoch": 1.297900569733864, "grad_norm": 0.034189675003290176, "learning_rate": 6.992050716801122e-05, "loss": 2.4615, "step": 16630 }, { "epoch": 1.29868102708187, "grad_norm": 17.056989669799805, "learning_rate": 6.987885288490643e-05, "loss": 1.7232, "step": 16640 }, { "epoch": 1.299461484429876, "grad_norm": 3.8021726608276367, "learning_rate": 6.983718220857795e-05, "loss": 0.9082, "step": 16650 }, { "epoch": 1.3002419417778819, "grad_norm": 0.0016640127869322896, "learning_rate": 6.979549517338976e-05, "loss": 0.7894, "step": 16660 }, { "epoch": 1.3010223991258878, "grad_norm": 0.0018107325304299593, "learning_rate": 6.975379181371932e-05, "loss": 0.4747, "step": 16670 }, { "epoch": 1.3018028564738937, "grad_norm": 26.44985008239746, "learning_rate": 6.97120721639576e-05, "loss": 1.3115, "step": 16680 }, { "epoch": 1.3025833138218996, "grad_norm": 0.4625835716724396, "learning_rate": 6.967033625850897e-05, "loss": 0.9148, "step": 16690 }, { "epoch": 1.3033637711699055, "grad_norm": 4.2758065887937846e-07, "learning_rate": 6.962858413179121e-05, "loss": 1.4819, "step": 16700 }, { "epoch": 1.3041442285179115, "grad_norm": 1.5021567344665527, "learning_rate": 6.958681581823547e-05, "loss": 0.4725, "step": 16710 }, { "epoch": 1.3049246858659174, "grad_norm": 0.45929327607154846, "learning_rate": 6.95450313522863e-05, "loss": 0.0265, "step": 16720 }, { "epoch": 1.3057051432139233, "grad_norm": 63.05253601074219, "learning_rate": 6.950323076840147e-05, "loss": 0.4704, "step": 16730 }, { "epoch": 1.3064856005619294, "grad_norm": 53.459842681884766, "learning_rate": 6.946141410105213e-05, "loss": 0.6937, "step": 16740 }, { "epoch": 1.3072660579099353, "grad_norm": 0.03195926547050476, "learning_rate": 6.941958138472267e-05, "loss": 0.1157, "step": 16750 }, { "epoch": 1.3080465152579412, "grad_norm": 57.97876739501953, "learning_rate": 6.937773265391068e-05, "loss": 1.4795, "step": 16760 }, { "epoch": 1.3088269726059472, "grad_norm": 52.485504150390625, "learning_rate": 6.933586794312702e-05, "loss": 1.568, "step": 16770 }, { "epoch": 1.309607429953953, "grad_norm": 1.3821322917938232, "learning_rate": 6.929398728689567e-05, "loss": 0.6191, "step": 16780 }, { "epoch": 1.310387887301959, "grad_norm": 3.0984549522399902, "learning_rate": 6.925209071975379e-05, "loss": 0.9458, "step": 16790 }, { "epoch": 1.311168344649965, "grad_norm": 51.80403518676758, "learning_rate": 6.921017827625164e-05, "loss": 0.7153, "step": 16800 }, { "epoch": 1.3119488019979708, "grad_norm": 6.354888319037855e-05, "learning_rate": 6.916824999095262e-05, "loss": 0.8756, "step": 16810 }, { "epoch": 1.3127292593459767, "grad_norm": 23.010360717773438, "learning_rate": 6.912630589843312e-05, "loss": 1.3956, "step": 16820 }, { "epoch": 1.3135097166939826, "grad_norm": 2.183323860168457, "learning_rate": 6.908434603328263e-05, "loss": 1.2873, "step": 16830 }, { "epoch": 1.3142901740419886, "grad_norm": 0.3114471435546875, "learning_rate": 6.90423704301036e-05, "loss": 2.5646, "step": 16840 }, { "epoch": 1.3150706313899945, "grad_norm": 4.872425506619038e-06, "learning_rate": 6.90003791235115e-05, "loss": 1.9407, "step": 16850 }, { "epoch": 1.3158510887380004, "grad_norm": 0.31955668330192566, "learning_rate": 6.895837214813474e-05, "loss": 0.4642, "step": 16860 }, { "epoch": 1.3166315460860063, "grad_norm": 3.809271447607898e-06, "learning_rate": 6.89163495386146e-05, "loss": 0.579, "step": 16870 }, { "epoch": 1.3174120034340122, "grad_norm": 7.399552032438805e-06, "learning_rate": 6.887431132960533e-05, "loss": 0.638, "step": 16880 }, { "epoch": 1.3181924607820181, "grad_norm": 6.543385825352743e-05, "learning_rate": 6.883225755577401e-05, "loss": 0.665, "step": 16890 }, { "epoch": 1.3189729181300243, "grad_norm": 29.476024627685547, "learning_rate": 6.879018825180055e-05, "loss": 1.4586, "step": 16900 }, { "epoch": 1.3197533754780302, "grad_norm": 6.078284059185535e-06, "learning_rate": 6.874810345237766e-05, "loss": 1.0899, "step": 16910 }, { "epoch": 1.320533832826036, "grad_norm": 31.434192657470703, "learning_rate": 6.870600319221085e-05, "loss": 0.5962, "step": 16920 }, { "epoch": 1.321314290174042, "grad_norm": 31.558059692382812, "learning_rate": 6.866388750601837e-05, "loss": 0.7745, "step": 16930 }, { "epoch": 1.322094747522048, "grad_norm": 32.943885803222656, "learning_rate": 6.862175642853119e-05, "loss": 0.7585, "step": 16940 }, { "epoch": 1.3228752048700538, "grad_norm": 9.586359977722168, "learning_rate": 6.857960999449297e-05, "loss": 0.6813, "step": 16950 }, { "epoch": 1.3236556622180597, "grad_norm": 19.564783096313477, "learning_rate": 6.853744823866004e-05, "loss": 1.1954, "step": 16960 }, { "epoch": 1.3244361195660657, "grad_norm": 0.0008135417592711747, "learning_rate": 6.849527119580136e-05, "loss": 0.4992, "step": 16970 }, { "epoch": 1.3252165769140716, "grad_norm": 0.0034319348633289337, "learning_rate": 6.84530789006985e-05, "loss": 1.8232, "step": 16980 }, { "epoch": 1.3259970342620777, "grad_norm": 5.529355049133301, "learning_rate": 6.841087138814562e-05, "loss": 1.1755, "step": 16990 }, { "epoch": 1.3267774916100836, "grad_norm": 1.479116439819336, "learning_rate": 6.836864869294939e-05, "loss": 0.5419, "step": 17000 }, { "epoch": 1.3275579489580895, "grad_norm": 14.388936996459961, "learning_rate": 6.832641084992907e-05, "loss": 1.7516, "step": 17010 }, { "epoch": 1.3283384063060955, "grad_norm": 6.888252258300781, "learning_rate": 6.828415789391631e-05, "loss": 0.8862, "step": 17020 }, { "epoch": 1.3291188636541014, "grad_norm": 0.2259782999753952, "learning_rate": 6.824188985975533e-05, "loss": 0.7888, "step": 17030 }, { "epoch": 1.3298993210021073, "grad_norm": 0.20432275533676147, "learning_rate": 6.819960678230271e-05, "loss": 0.8379, "step": 17040 }, { "epoch": 1.3306797783501132, "grad_norm": 12.537790298461914, "learning_rate": 6.815730869642747e-05, "loss": 0.807, "step": 17050 }, { "epoch": 1.331460235698119, "grad_norm": 0.02838488109409809, "learning_rate": 6.8114995637011e-05, "loss": 0.4556, "step": 17060 }, { "epoch": 1.332240693046125, "grad_norm": 0.22770993411540985, "learning_rate": 6.807266763894702e-05, "loss": 0.4645, "step": 17070 }, { "epoch": 1.333021150394131, "grad_norm": 13.060320854187012, "learning_rate": 6.803032473714162e-05, "loss": 0.9662, "step": 17080 }, { "epoch": 1.3338016077421369, "grad_norm": 42.66017532348633, "learning_rate": 6.798796696651313e-05, "loss": 1.1246, "step": 17090 }, { "epoch": 1.3345820650901428, "grad_norm": 0.00010493778245290741, "learning_rate": 6.794559436199213e-05, "loss": 1.2048, "step": 17100 }, { "epoch": 1.3353625224381487, "grad_norm": 0.5220862030982971, "learning_rate": 6.79032069585215e-05, "loss": 1.1721, "step": 17110 }, { "epoch": 1.3361429797861546, "grad_norm": 0.18259654939174652, "learning_rate": 6.786080479105627e-05, "loss": 0.8042, "step": 17120 }, { "epoch": 1.3369234371341605, "grad_norm": 0.060175586491823196, "learning_rate": 6.781838789456364e-05, "loss": 0.7491, "step": 17130 }, { "epoch": 1.3377038944821664, "grad_norm": 55.724971771240234, "learning_rate": 6.7775956304023e-05, "loss": 0.8063, "step": 17140 }, { "epoch": 1.3384843518301726, "grad_norm": 19.680768966674805, "learning_rate": 6.773351005442583e-05, "loss": 0.9294, "step": 17150 }, { "epoch": 1.3392648091781785, "grad_norm": 8.618398666381836, "learning_rate": 6.769104918077572e-05, "loss": 0.6359, "step": 17160 }, { "epoch": 1.3400452665261844, "grad_norm": 1.873159405363367e-08, "learning_rate": 6.764857371808826e-05, "loss": 2.5235, "step": 17170 }, { "epoch": 1.3408257238741903, "grad_norm": 52.72881317138672, "learning_rate": 6.760608370139112e-05, "loss": 1.2153, "step": 17180 }, { "epoch": 1.3416061812221962, "grad_norm": 1.9692984819412231, "learning_rate": 6.7563579165724e-05, "loss": 0.5641, "step": 17190 }, { "epoch": 1.3423866385702021, "grad_norm": 9.65872168308124e-05, "learning_rate": 6.752106014613852e-05, "loss": 2.7393, "step": 17200 }, { "epoch": 1.343167095918208, "grad_norm": 0.12596841156482697, "learning_rate": 6.747852667769827e-05, "loss": 0.6117, "step": 17210 }, { "epoch": 1.343947553266214, "grad_norm": 0.0009383680881001055, "learning_rate": 6.743597879547872e-05, "loss": 0.2199, "step": 17220 }, { "epoch": 1.3447280106142199, "grad_norm": 0.27956098318099976, "learning_rate": 6.739341653456728e-05, "loss": 0.9869, "step": 17230 }, { "epoch": 1.3455084679622258, "grad_norm": 11.534627914428711, "learning_rate": 6.735083993006319e-05, "loss": 0.9009, "step": 17240 }, { "epoch": 1.346288925310232, "grad_norm": 7.020449638366699, "learning_rate": 6.73082490170775e-05, "loss": 0.3904, "step": 17250 }, { "epoch": 1.3470693826582378, "grad_norm": 0.4477950930595398, "learning_rate": 6.72656438307331e-05, "loss": 0.6724, "step": 17260 }, { "epoch": 1.3478498400062437, "grad_norm": 0.014391960576176643, "learning_rate": 6.722302440616463e-05, "loss": 0.2966, "step": 17270 }, { "epoch": 1.3486302973542497, "grad_norm": 55.42076110839844, "learning_rate": 6.718039077851848e-05, "loss": 0.9194, "step": 17280 }, { "epoch": 1.3494107547022556, "grad_norm": 56.1174430847168, "learning_rate": 6.713774298295273e-05, "loss": 1.6298, "step": 17290 }, { "epoch": 1.3501912120502615, "grad_norm": 29.674327850341797, "learning_rate": 6.709508105463716e-05, "loss": 1.4175, "step": 17300 }, { "epoch": 1.3509716693982674, "grad_norm": 37.25205612182617, "learning_rate": 6.705240502875318e-05, "loss": 0.7216, "step": 17310 }, { "epoch": 1.3517521267462733, "grad_norm": 0.0031614259351044893, "learning_rate": 6.70097149404939e-05, "loss": 1.0236, "step": 17320 }, { "epoch": 1.3525325840942792, "grad_norm": 58.900535583496094, "learning_rate": 6.696701082506395e-05, "loss": 0.8832, "step": 17330 }, { "epoch": 1.3533130414422851, "grad_norm": 4.175566673278809, "learning_rate": 6.692429271767953e-05, "loss": 0.2608, "step": 17340 }, { "epoch": 1.354093498790291, "grad_norm": 39.607295989990234, "learning_rate": 6.688156065356844e-05, "loss": 1.051, "step": 17350 }, { "epoch": 1.354873956138297, "grad_norm": 5.6130111261154525e-06, "learning_rate": 6.683881466796994e-05, "loss": 0.3924, "step": 17360 }, { "epoch": 1.355654413486303, "grad_norm": 46.783145904541016, "learning_rate": 6.679605479613477e-05, "loss": 3.4201, "step": 17370 }, { "epoch": 1.3564348708343088, "grad_norm": 0.22098881006240845, "learning_rate": 6.675328107332513e-05, "loss": 0.9662, "step": 17380 }, { "epoch": 1.3572153281823147, "grad_norm": 0.0011627674102783203, "learning_rate": 6.671049353481466e-05, "loss": 0.7548, "step": 17390 }, { "epoch": 1.3579957855303206, "grad_norm": 38.597557067871094, "learning_rate": 6.666769221588839e-05, "loss": 1.0412, "step": 17400 }, { "epoch": 1.3587762428783268, "grad_norm": 11.341829299926758, "learning_rate": 6.662487715184266e-05, "loss": 0.7493, "step": 17410 }, { "epoch": 1.3595567002263327, "grad_norm": 0.001878074835985899, "learning_rate": 6.658204837798523e-05, "loss": 0.7333, "step": 17420 }, { "epoch": 1.3603371575743386, "grad_norm": 0.40023699402809143, "learning_rate": 6.65392059296351e-05, "loss": 0.2432, "step": 17430 }, { "epoch": 1.3611176149223445, "grad_norm": 0.11613233387470245, "learning_rate": 6.649634984212258e-05, "loss": 0.5323, "step": 17440 }, { "epoch": 1.3618980722703504, "grad_norm": 2.2078163623809814, "learning_rate": 6.64534801507892e-05, "loss": 1.3115, "step": 17450 }, { "epoch": 1.3626785296183563, "grad_norm": 3.0959393978118896, "learning_rate": 6.641059689098775e-05, "loss": 0.7336, "step": 17460 }, { "epoch": 1.3634589869663623, "grad_norm": 8.56583309173584, "learning_rate": 6.636770009808219e-05, "loss": 0.5928, "step": 17470 }, { "epoch": 1.3642394443143682, "grad_norm": 20.7721004486084, "learning_rate": 6.63247898074476e-05, "loss": 1.0434, "step": 17480 }, { "epoch": 1.365019901662374, "grad_norm": 0.17920184135437012, "learning_rate": 6.628186605447027e-05, "loss": 0.7184, "step": 17490 }, { "epoch": 1.3658003590103802, "grad_norm": 0.18073244392871857, "learning_rate": 6.623892887454752e-05, "loss": 1.1548, "step": 17500 }, { "epoch": 1.3665808163583861, "grad_norm": 0.0002529042540118098, "learning_rate": 6.619597830308776e-05, "loss": 1.0218, "step": 17510 }, { "epoch": 1.367361273706392, "grad_norm": 4.4073667027078045e-07, "learning_rate": 6.615301437551051e-05, "loss": 0.4125, "step": 17520 }, { "epoch": 1.368141731054398, "grad_norm": 0.0018284658435732126, "learning_rate": 6.611003712724617e-05, "loss": 0.2231, "step": 17530 }, { "epoch": 1.3689221884024039, "grad_norm": 8.416059494018555, "learning_rate": 6.606704659373628e-05, "loss": 0.3617, "step": 17540 }, { "epoch": 1.3697026457504098, "grad_norm": 113.29763793945312, "learning_rate": 6.602404281043322e-05, "loss": 0.9754, "step": 17550 }, { "epoch": 1.3704831030984157, "grad_norm": 56.287540435791016, "learning_rate": 6.598102581280032e-05, "loss": 2.2883, "step": 17560 }, { "epoch": 1.3712635604464216, "grad_norm": 3.09262752532959, "learning_rate": 6.593799563631186e-05, "loss": 0.6381, "step": 17570 }, { "epoch": 1.3720440177944275, "grad_norm": 5.676676273345947, "learning_rate": 6.589495231645293e-05, "loss": 0.9866, "step": 17580 }, { "epoch": 1.3728244751424334, "grad_norm": 0.01620888151228428, "learning_rate": 6.585189588871952e-05, "loss": 0.3731, "step": 17590 }, { "epoch": 1.3736049324904394, "grad_norm": 0.41776251792907715, "learning_rate": 6.580882638861832e-05, "loss": 0.9153, "step": 17600 }, { "epoch": 1.3743853898384453, "grad_norm": 1.9870280084433034e-05, "learning_rate": 6.57657438516669e-05, "loss": 0.5748, "step": 17610 }, { "epoch": 1.3751658471864512, "grad_norm": 0.0008792297448962927, "learning_rate": 6.572264831339358e-05, "loss": 0.1678, "step": 17620 }, { "epoch": 1.375946304534457, "grad_norm": 6.01303113967333e-08, "learning_rate": 6.567953980933735e-05, "loss": 2.9064, "step": 17630 }, { "epoch": 1.376726761882463, "grad_norm": 0.37628453969955444, "learning_rate": 6.563641837504791e-05, "loss": 0.0403, "step": 17640 }, { "epoch": 1.377507219230469, "grad_norm": 0.26884397864341736, "learning_rate": 6.55932840460856e-05, "loss": 0.8042, "step": 17650 }, { "epoch": 1.378287676578475, "grad_norm": 0.0005718205939047039, "learning_rate": 6.555013685802148e-05, "loss": 0.7636, "step": 17660 }, { "epoch": 1.379068133926481, "grad_norm": 5.187681198120117, "learning_rate": 6.550697684643715e-05, "loss": 1.6329, "step": 17670 }, { "epoch": 1.379848591274487, "grad_norm": 0.003713503247126937, "learning_rate": 6.546380404692474e-05, "loss": 1.7662, "step": 17680 }, { "epoch": 1.3806290486224928, "grad_norm": 67.9324951171875, "learning_rate": 6.542061849508701e-05, "loss": 3.2811, "step": 17690 }, { "epoch": 1.3814095059704987, "grad_norm": 4.829217433929443, "learning_rate": 6.537742022653721e-05, "loss": 0.6113, "step": 17700 }, { "epoch": 1.3821899633185046, "grad_norm": 4.9284699343843386e-05, "learning_rate": 6.533420927689905e-05, "loss": 2.4698, "step": 17710 }, { "epoch": 1.3829704206665105, "grad_norm": 36.11857604980469, "learning_rate": 6.529098568180672e-05, "loss": 3.1013, "step": 17720 }, { "epoch": 1.3837508780145165, "grad_norm": 35.37229919433594, "learning_rate": 6.524774947690483e-05, "loss": 0.8092, "step": 17730 }, { "epoch": 1.3845313353625224, "grad_norm": 3.6355788707733154, "learning_rate": 6.520450069784844e-05, "loss": 0.1253, "step": 17740 }, { "epoch": 1.3853117927105283, "grad_norm": 0.01998009905219078, "learning_rate": 6.516123938030287e-05, "loss": 0.8166, "step": 17750 }, { "epoch": 1.3860922500585344, "grad_norm": 1.4495409727096558, "learning_rate": 6.511796555994388e-05, "loss": 0.7859, "step": 17760 }, { "epoch": 1.3868727074065403, "grad_norm": 31.480539321899414, "learning_rate": 6.50746792724575e-05, "loss": 0.98, "step": 17770 }, { "epoch": 1.3876531647545463, "grad_norm": 38.0267219543457, "learning_rate": 6.503138055354005e-05, "loss": 0.9297, "step": 17780 }, { "epoch": 1.3884336221025522, "grad_norm": 0.6976431608200073, "learning_rate": 6.498806943889808e-05, "loss": 0.8869, "step": 17790 }, { "epoch": 1.389214079450558, "grad_norm": 0.3048703074455261, "learning_rate": 6.494474596424837e-05, "loss": 0.3903, "step": 17800 }, { "epoch": 1.389994536798564, "grad_norm": 0.23949263989925385, "learning_rate": 6.490141016531794e-05, "loss": 1.4834, "step": 17810 }, { "epoch": 1.39077499414657, "grad_norm": 42.83589172363281, "learning_rate": 6.485806207784389e-05, "loss": 1.3046, "step": 17820 }, { "epoch": 1.3915554514945758, "grad_norm": 2.6972645628120517e-06, "learning_rate": 6.481470173757353e-05, "loss": 0.6909, "step": 17830 }, { "epoch": 1.3923359088425817, "grad_norm": 39.857086181640625, "learning_rate": 6.477132918026424e-05, "loss": 1.4446, "step": 17840 }, { "epoch": 1.3931163661905877, "grad_norm": 0.00047229923075065017, "learning_rate": 6.472794444168346e-05, "loss": 0.0383, "step": 17850 }, { "epoch": 1.3938968235385936, "grad_norm": 10.687626838684082, "learning_rate": 6.468454755760872e-05, "loss": 0.1296, "step": 17860 }, { "epoch": 1.3946772808865995, "grad_norm": 0.04510192945599556, "learning_rate": 6.464113856382752e-05, "loss": 1.2684, "step": 17870 }, { "epoch": 1.3954577382346054, "grad_norm": 1.947009442559988e-09, "learning_rate": 6.459771749613738e-05, "loss": 1.0827, "step": 17880 }, { "epoch": 1.3962381955826113, "grad_norm": 0.4970480799674988, "learning_rate": 6.455428439034574e-05, "loss": 0.9025, "step": 17890 }, { "epoch": 1.3970186529306172, "grad_norm": 4.4791399034238566e-08, "learning_rate": 6.451083928227e-05, "loss": 0.8598, "step": 17900 }, { "epoch": 1.3977991102786231, "grad_norm": 68.11105346679688, "learning_rate": 6.446738220773744e-05, "loss": 2.0488, "step": 17910 }, { "epoch": 1.3985795676266293, "grad_norm": 0.050787825137376785, "learning_rate": 6.442391320258525e-05, "loss": 1.1975, "step": 17920 }, { "epoch": 1.3993600249746352, "grad_norm": 0.7949303984642029, "learning_rate": 6.43804323026604e-05, "loss": 0.9232, "step": 17930 }, { "epoch": 1.400140482322641, "grad_norm": 0.28324851393699646, "learning_rate": 6.433693954381967e-05, "loss": 1.2801, "step": 17940 }, { "epoch": 1.400920939670647, "grad_norm": 1.937788724899292, "learning_rate": 6.429343496192969e-05, "loss": 1.3638, "step": 17950 }, { "epoch": 1.401701397018653, "grad_norm": 1.130745530128479, "learning_rate": 6.424991859286674e-05, "loss": 0.603, "step": 17960 }, { "epoch": 1.4024818543666588, "grad_norm": 0.07360313087701797, "learning_rate": 6.420639047251692e-05, "loss": 0.0272, "step": 17970 }, { "epoch": 1.4032623117146648, "grad_norm": 0.01609094999730587, "learning_rate": 6.416285063677597e-05, "loss": 1.3349, "step": 17980 }, { "epoch": 1.4040427690626707, "grad_norm": 40.28419494628906, "learning_rate": 6.411929912154925e-05, "loss": 1.044, "step": 17990 }, { "epoch": 1.4048232264106766, "grad_norm": 72.37548828125, "learning_rate": 6.407573596275185e-05, "loss": 1.9599, "step": 18000 }, { "epoch": 1.4056036837586827, "grad_norm": 0.15986989438533783, "learning_rate": 6.403216119630838e-05, "loss": 0.9964, "step": 18010 }, { "epoch": 1.4063841411066886, "grad_norm": 0.09070328623056412, "learning_rate": 6.398857485815306e-05, "loss": 0.6331, "step": 18020 }, { "epoch": 1.4071645984546945, "grad_norm": 10.145639419555664, "learning_rate": 6.394497698422964e-05, "loss": 0.761, "step": 18030 }, { "epoch": 1.4079450558027005, "grad_norm": 1.3403260707855225, "learning_rate": 6.390136761049137e-05, "loss": 0.6732, "step": 18040 }, { "epoch": 1.4087255131507064, "grad_norm": 4.114490032196045, "learning_rate": 6.385774677290104e-05, "loss": 0.4664, "step": 18050 }, { "epoch": 1.4095059704987123, "grad_norm": 2.115405559539795, "learning_rate": 6.381411450743084e-05, "loss": 0.3374, "step": 18060 }, { "epoch": 1.4102864278467182, "grad_norm": 7.303625106811523, "learning_rate": 6.377047085006236e-05, "loss": 3.1412, "step": 18070 }, { "epoch": 1.4110668851947241, "grad_norm": 10.102337837219238, "learning_rate": 6.372681583678668e-05, "loss": 0.8387, "step": 18080 }, { "epoch": 1.41184734254273, "grad_norm": 0.28348350524902344, "learning_rate": 6.368314950360415e-05, "loss": 1.0219, "step": 18090 }, { "epoch": 1.412627799890736, "grad_norm": 0.0021556804422289133, "learning_rate": 6.36394718865245e-05, "loss": 0.2351, "step": 18100 }, { "epoch": 1.4134082572387419, "grad_norm": 0.0003699703374877572, "learning_rate": 6.359578302156675e-05, "loss": 0.0308, "step": 18110 }, { "epoch": 1.4141887145867478, "grad_norm": 55.59209442138672, "learning_rate": 6.355208294475923e-05, "loss": 0.669, "step": 18120 }, { "epoch": 1.4149691719347537, "grad_norm": 1.8222777843475342, "learning_rate": 6.350837169213946e-05, "loss": 1.5468, "step": 18130 }, { "epoch": 1.4157496292827596, "grad_norm": 38.04872131347656, "learning_rate": 6.346464929975422e-05, "loss": 0.9933, "step": 18140 }, { "epoch": 1.4165300866307655, "grad_norm": 4.993416786193848, "learning_rate": 6.342091580365946e-05, "loss": 1.25, "step": 18150 }, { "epoch": 1.4173105439787714, "grad_norm": 3.710566997528076, "learning_rate": 6.337717123992027e-05, "loss": 0.7554, "step": 18160 }, { "epoch": 1.4180910013267776, "grad_norm": 1.022260308265686, "learning_rate": 6.333341564461092e-05, "loss": 1.3403, "step": 18170 }, { "epoch": 1.4188714586747835, "grad_norm": 43.93464279174805, "learning_rate": 6.328964905381472e-05, "loss": 2.0506, "step": 18180 }, { "epoch": 1.4196519160227894, "grad_norm": 4.084843158721924, "learning_rate": 6.324587150362408e-05, "loss": 0.8518, "step": 18190 }, { "epoch": 1.4204323733707953, "grad_norm": 0.01947934739291668, "learning_rate": 6.320208303014043e-05, "loss": 0.301, "step": 18200 }, { "epoch": 1.4212128307188012, "grad_norm": 34.18440246582031, "learning_rate": 6.31582836694742e-05, "loss": 0.4866, "step": 18210 }, { "epoch": 1.4219932880668071, "grad_norm": 4.792870044708252, "learning_rate": 6.311447345774486e-05, "loss": 0.3177, "step": 18220 }, { "epoch": 1.422773745414813, "grad_norm": 1.2343689377303235e-06, "learning_rate": 6.307065243108072e-05, "loss": 1.2798, "step": 18230 }, { "epoch": 1.423554202762819, "grad_norm": 1.0866369009017944, "learning_rate": 6.302682062561914e-05, "loss": 0.1932, "step": 18240 }, { "epoch": 1.4243346601108249, "grad_norm": 0.0021116959396749735, "learning_rate": 6.298297807750626e-05, "loss": 0.8838, "step": 18250 }, { "epoch": 1.4251151174588308, "grad_norm": 10.430094718933105, "learning_rate": 6.293912482289712e-05, "loss": 1.1055, "step": 18260 }, { "epoch": 1.425895574806837, "grad_norm": 48.94272994995117, "learning_rate": 6.289526089795558e-05, "loss": 1.1596, "step": 18270 }, { "epoch": 1.4266760321548428, "grad_norm": 1.1787877082824707, "learning_rate": 6.285138633885434e-05, "loss": 1.5406, "step": 18280 }, { "epoch": 1.4274564895028488, "grad_norm": 19.861085891723633, "learning_rate": 6.28075011817748e-05, "loss": 0.1486, "step": 18290 }, { "epoch": 1.4282369468508547, "grad_norm": 49.54465866088867, "learning_rate": 6.276360546290717e-05, "loss": 1.8698, "step": 18300 }, { "epoch": 1.4290174041988606, "grad_norm": 5.11936616897583, "learning_rate": 6.271969921845032e-05, "loss": 0.6967, "step": 18310 }, { "epoch": 1.4297978615468665, "grad_norm": 2.9285593032836914, "learning_rate": 6.26757824846118e-05, "loss": 0.2176, "step": 18320 }, { "epoch": 1.4305783188948724, "grad_norm": 5.905811309814453, "learning_rate": 6.263185529760786e-05, "loss": 0.7961, "step": 18330 }, { "epoch": 1.4313587762428783, "grad_norm": 0.5823084115982056, "learning_rate": 6.258791769366332e-05, "loss": 0.7978, "step": 18340 }, { "epoch": 1.4321392335908842, "grad_norm": 5.589509237324819e-05, "learning_rate": 6.254396970901159e-05, "loss": 0.0566, "step": 18350 }, { "epoch": 1.4329196909388902, "grad_norm": 0.0657353326678276, "learning_rate": 6.25000113798947e-05, "loss": 1.8269, "step": 18360 }, { "epoch": 1.433700148286896, "grad_norm": 42.802703857421875, "learning_rate": 6.245604274256314e-05, "loss": 0.2936, "step": 18370 }, { "epoch": 1.434480605634902, "grad_norm": 1.737991452217102, "learning_rate": 6.241206383327592e-05, "loss": 0.8018, "step": 18380 }, { "epoch": 1.435261062982908, "grad_norm": 25.346847534179688, "learning_rate": 6.236807468830054e-05, "loss": 1.9919, "step": 18390 }, { "epoch": 1.4360415203309138, "grad_norm": 16.954374313354492, "learning_rate": 6.232407534391295e-05, "loss": 0.3725, "step": 18400 }, { "epoch": 1.4368219776789197, "grad_norm": 0.9959704875946045, "learning_rate": 6.228006583639747e-05, "loss": 0.9978, "step": 18410 }, { "epoch": 1.4376024350269256, "grad_norm": 4.357720627012895e-06, "learning_rate": 6.223604620204682e-05, "loss": 2.0566, "step": 18420 }, { "epoch": 1.4383828923749318, "grad_norm": 4.830660327570513e-05, "learning_rate": 6.219201647716209e-05, "loss": 0.2457, "step": 18430 }, { "epoch": 1.4391633497229377, "grad_norm": 0.011094238609075546, "learning_rate": 6.214797669805266e-05, "loss": 0.2389, "step": 18440 }, { "epoch": 1.4399438070709436, "grad_norm": 8.750463166506961e-06, "learning_rate": 6.210392690103624e-05, "loss": 1.6997, "step": 18450 }, { "epoch": 1.4407242644189495, "grad_norm": 6.744166851043701, "learning_rate": 6.205986712243875e-05, "loss": 0.2439, "step": 18460 }, { "epoch": 1.4415047217669554, "grad_norm": 56.61109924316406, "learning_rate": 6.201579739859441e-05, "loss": 1.0354, "step": 18470 }, { "epoch": 1.4422851791149613, "grad_norm": 0.030948661267757416, "learning_rate": 6.197171776584555e-05, "loss": 0.418, "step": 18480 }, { "epoch": 1.4430656364629673, "grad_norm": 5.321959179127589e-06, "learning_rate": 6.192762826054274e-05, "loss": 0.731, "step": 18490 }, { "epoch": 1.4438460938109732, "grad_norm": 5.769700273106082e-10, "learning_rate": 6.18835289190447e-05, "loss": 0.9904, "step": 18500 }, { "epoch": 1.444626551158979, "grad_norm": 0.5561239123344421, "learning_rate": 6.18394197777182e-05, "loss": 1.3277, "step": 18510 }, { "epoch": 1.4454070085069852, "grad_norm": 51.21713638305664, "learning_rate": 6.179530087293815e-05, "loss": 1.4944, "step": 18520 }, { "epoch": 1.4461874658549911, "grad_norm": 55.569190979003906, "learning_rate": 6.175117224108748e-05, "loss": 3.3253, "step": 18530 }, { "epoch": 1.446967923202997, "grad_norm": 4.12024450302124, "learning_rate": 6.170703391855713e-05, "loss": 0.9532, "step": 18540 }, { "epoch": 1.447748380551003, "grad_norm": 1.0935174226760864, "learning_rate": 6.166288594174608e-05, "loss": 0.9796, "step": 18550 }, { "epoch": 1.4485288378990089, "grad_norm": 0.008399920538067818, "learning_rate": 6.161872834706124e-05, "loss": 1.2911, "step": 18560 }, { "epoch": 1.4493092952470148, "grad_norm": 1.9544976949691772, "learning_rate": 6.157456117091745e-05, "loss": 0.4771, "step": 18570 }, { "epoch": 1.4500897525950207, "grad_norm": 6.754466448910534e-05, "learning_rate": 6.153038444973744e-05, "loss": 0.7429, "step": 18580 }, { "epoch": 1.4508702099430266, "grad_norm": 2.543808932387037e-06, "learning_rate": 6.148619821995185e-05, "loss": 0.617, "step": 18590 }, { "epoch": 1.4516506672910325, "grad_norm": 0.10267318785190582, "learning_rate": 6.144200251799913e-05, "loss": 0.8692, "step": 18600 }, { "epoch": 1.4524311246390385, "grad_norm": 0.766514241695404, "learning_rate": 6.139779738032553e-05, "loss": 0.0055, "step": 18610 }, { "epoch": 1.4532115819870444, "grad_norm": 32.09298324584961, "learning_rate": 6.135358284338512e-05, "loss": 1.7193, "step": 18620 }, { "epoch": 1.4539920393350503, "grad_norm": 0.3428858816623688, "learning_rate": 6.130935894363972e-05, "loss": 0.4487, "step": 18630 }, { "epoch": 1.4547724966830562, "grad_norm": 0.00805765762925148, "learning_rate": 6.126512571755883e-05, "loss": 3.0999, "step": 18640 }, { "epoch": 1.455552954031062, "grad_norm": 18.52367401123047, "learning_rate": 6.122088320161964e-05, "loss": 0.4253, "step": 18650 }, { "epoch": 1.456333411379068, "grad_norm": 0.0071366941556334496, "learning_rate": 6.117663143230707e-05, "loss": 0.7395, "step": 18660 }, { "epoch": 1.457113868727074, "grad_norm": 13.743526458740234, "learning_rate": 6.113237044611361e-05, "loss": 0.1165, "step": 18670 }, { "epoch": 1.45789432607508, "grad_norm": 0.0024282445665448904, "learning_rate": 6.108810027953937e-05, "loss": 0.9431, "step": 18680 }, { "epoch": 1.458674783423086, "grad_norm": 3.3176936540257884e-06, "learning_rate": 6.1043820969092e-05, "loss": 0.8886, "step": 18690 }, { "epoch": 1.459455240771092, "grad_norm": 17.197681427001953, "learning_rate": 6.099953255128675e-05, "loss": 1.7766, "step": 18700 }, { "epoch": 1.4602356981190978, "grad_norm": 2.220248222351074, "learning_rate": 6.095523506264633e-05, "loss": 2.0037, "step": 18710 }, { "epoch": 1.4610161554671037, "grad_norm": 27.847068786621094, "learning_rate": 6.091092853970097e-05, "loss": 1.4636, "step": 18720 }, { "epoch": 1.4617966128151096, "grad_norm": 6.860899925231934, "learning_rate": 6.0866613018988297e-05, "loss": 0.5809, "step": 18730 }, { "epoch": 1.4625770701631156, "grad_norm": 0.10107483714818954, "learning_rate": 6.0822288537053416e-05, "loss": 0.4771, "step": 18740 }, { "epoch": 1.4633575275111215, "grad_norm": 0.000141332478960976, "learning_rate": 6.077795513044878e-05, "loss": 0.0297, "step": 18750 }, { "epoch": 1.4641379848591274, "grad_norm": 5.3891448974609375, "learning_rate": 6.073361283573423e-05, "loss": 0.1366, "step": 18760 }, { "epoch": 1.4649184422071333, "grad_norm": 42.594932556152344, "learning_rate": 6.0689261689476927e-05, "loss": 2.1961, "step": 18770 }, { "epoch": 1.4656988995551394, "grad_norm": 0.11372056603431702, "learning_rate": 6.06449017282513e-05, "loss": 0.6067, "step": 18780 }, { "epoch": 1.4664793569031453, "grad_norm": 0.001790837850421667, "learning_rate": 6.0600532988639114e-05, "loss": 0.8782, "step": 18790 }, { "epoch": 1.4672598142511513, "grad_norm": 6.262240886688232, "learning_rate": 6.055615550722931e-05, "loss": 0.6527, "step": 18800 }, { "epoch": 1.4680402715991572, "grad_norm": 0.0001394073769915849, "learning_rate": 6.051176932061806e-05, "loss": 0.609, "step": 18810 }, { "epoch": 1.468820728947163, "grad_norm": 16.28183364868164, "learning_rate": 6.046737446540873e-05, "loss": 2.3972, "step": 18820 }, { "epoch": 1.469601186295169, "grad_norm": 6.912973403930664, "learning_rate": 6.0422970978211834e-05, "loss": 1.1081, "step": 18830 }, { "epoch": 1.470381643643175, "grad_norm": 0.0012439063284546137, "learning_rate": 6.037855889564498e-05, "loss": 1.1357, "step": 18840 }, { "epoch": 1.4711621009911808, "grad_norm": 56.19667434692383, "learning_rate": 6.033413825433285e-05, "loss": 2.354, "step": 18850 }, { "epoch": 1.4719425583391867, "grad_norm": 0.0037663921248167753, "learning_rate": 6.028970909090725e-05, "loss": 0.2803, "step": 18860 }, { "epoch": 1.4727230156871927, "grad_norm": 9.612967915018089e-06, "learning_rate": 6.024527144200699e-05, "loss": 0.6668, "step": 18870 }, { "epoch": 1.4735034730351986, "grad_norm": 28.315608978271484, "learning_rate": 6.02008253442778e-05, "loss": 2.0943, "step": 18880 }, { "epoch": 1.4742839303832045, "grad_norm": 0.7584246397018433, "learning_rate": 6.015637083437249e-05, "loss": 0.3253, "step": 18890 }, { "epoch": 1.4750643877312104, "grad_norm": 3.4292781352996826, "learning_rate": 6.011190794895074e-05, "loss": 1.1821, "step": 18900 }, { "epoch": 1.4758448450792163, "grad_norm": 12.621614456176758, "learning_rate": 6.006743672467915e-05, "loss": 0.6592, "step": 18910 }, { "epoch": 1.4766253024272222, "grad_norm": 0.8569507598876953, "learning_rate": 6.00229571982312e-05, "loss": 0.1478, "step": 18920 }, { "epoch": 1.4774057597752284, "grad_norm": 0.0002535637468099594, "learning_rate": 5.997846940628724e-05, "loss": 1.1781, "step": 18930 }, { "epoch": 1.4781862171232343, "grad_norm": 7.55508290239959e-06, "learning_rate": 5.993397338553439e-05, "loss": 0.5306, "step": 18940 }, { "epoch": 1.4789666744712402, "grad_norm": 0.12348146736621857, "learning_rate": 5.988946917266659e-05, "loss": 0.1923, "step": 18950 }, { "epoch": 1.479747131819246, "grad_norm": 0.00036039817496202886, "learning_rate": 5.984495680438452e-05, "loss": 0.2572, "step": 18960 }, { "epoch": 1.480527589167252, "grad_norm": 2.984592981647438e-07, "learning_rate": 5.9800436317395594e-05, "loss": 1.1437, "step": 18970 }, { "epoch": 1.481308046515258, "grad_norm": 0.0528603233397007, "learning_rate": 5.975590774841392e-05, "loss": 1.8279, "step": 18980 }, { "epoch": 1.4820885038632639, "grad_norm": 0.0007880390621721745, "learning_rate": 5.9711371134160254e-05, "loss": 1.2552, "step": 18990 }, { "epoch": 1.4828689612112698, "grad_norm": 0.008361490443348885, "learning_rate": 5.9666826511362004e-05, "loss": 1.413, "step": 19000 }, { "epoch": 1.4836494185592757, "grad_norm": 0.05358072742819786, "learning_rate": 5.962227391675319e-05, "loss": 0.8824, "step": 19010 }, { "epoch": 1.4844298759072816, "grad_norm": 0.00043389989878050983, "learning_rate": 5.9577713387074405e-05, "loss": 0.3058, "step": 19020 }, { "epoch": 1.4852103332552877, "grad_norm": 60.52457046508789, "learning_rate": 5.953314495907275e-05, "loss": 0.6162, "step": 19030 }, { "epoch": 1.4859907906032936, "grad_norm": 0.36423781514167786, "learning_rate": 5.948856866950188e-05, "loss": 0.2633, "step": 19040 }, { "epoch": 1.4867712479512996, "grad_norm": 1.0531165571592283e-05, "learning_rate": 5.944398455512191e-05, "loss": 0.4567, "step": 19050 }, { "epoch": 1.4875517052993055, "grad_norm": 47.43577575683594, "learning_rate": 5.939939265269945e-05, "loss": 2.0081, "step": 19060 }, { "epoch": 1.4883321626473114, "grad_norm": 38.81036376953125, "learning_rate": 5.935479299900744e-05, "loss": 0.6646, "step": 19070 }, { "epoch": 1.4891126199953173, "grad_norm": 1.6050429344177246, "learning_rate": 5.931018563082531e-05, "loss": 0.5204, "step": 19080 }, { "epoch": 1.4898930773433232, "grad_norm": 0.078951396048069, "learning_rate": 5.926557058493881e-05, "loss": 0.4197, "step": 19090 }, { "epoch": 1.4906735346913291, "grad_norm": 2.977109432220459, "learning_rate": 5.9220947898140025e-05, "loss": 1.0123, "step": 19100 }, { "epoch": 1.491453992039335, "grad_norm": 3.1784112453460693, "learning_rate": 5.917631760722732e-05, "loss": 1.6444, "step": 19110 }, { "epoch": 1.492234449387341, "grad_norm": 0.03160808980464935, "learning_rate": 5.913167974900536e-05, "loss": 0.9252, "step": 19120 }, { "epoch": 1.4930149067353469, "grad_norm": 0.002941631944850087, "learning_rate": 5.908703436028506e-05, "loss": 1.3656, "step": 19130 }, { "epoch": 1.4937953640833528, "grad_norm": 0.00814164336770773, "learning_rate": 5.904238147788351e-05, "loss": 1.6489, "step": 19140 }, { "epoch": 1.4945758214313587, "grad_norm": 6.922767639160156, "learning_rate": 5.8997721138624006e-05, "loss": 0.8766, "step": 19150 }, { "epoch": 1.4953562787793646, "grad_norm": 5.123744010925293, "learning_rate": 5.895305337933597e-05, "loss": 0.9134, "step": 19160 }, { "epoch": 1.4961367361273705, "grad_norm": 31.880294799804688, "learning_rate": 5.890837823685497e-05, "loss": 0.8887, "step": 19170 }, { "epoch": 1.4969171934753764, "grad_norm": 4.0199822137765295e-07, "learning_rate": 5.886369574802263e-05, "loss": 0.9717, "step": 19180 }, { "epoch": 1.4976976508233826, "grad_norm": 2.6747709398478037e-06, "learning_rate": 5.881900594968667e-05, "loss": 1.5533, "step": 19190 }, { "epoch": 1.4984781081713885, "grad_norm": 0.00020924198906868696, "learning_rate": 5.877430887870081e-05, "loss": 0.487, "step": 19200 }, { "epoch": 1.4992585655193944, "grad_norm": 0.012734944000840187, "learning_rate": 5.8729604571924776e-05, "loss": 0.2319, "step": 19210 }, { "epoch": 1.5000390228674003, "grad_norm": 0.2888699769973755, "learning_rate": 5.868489306622429e-05, "loss": 1.1856, "step": 19220 }, { "epoch": 1.5008194802154062, "grad_norm": 8.60429573059082, "learning_rate": 5.8640174398470926e-05, "loss": 0.1638, "step": 19230 }, { "epoch": 1.5015999375634121, "grad_norm": 2.983750164275989e-05, "learning_rate": 5.859544860554227e-05, "loss": 3.0502, "step": 19240 }, { "epoch": 1.502380394911418, "grad_norm": 0.0243430994451046, "learning_rate": 5.8550715724321715e-05, "loss": 0.7847, "step": 19250 }, { "epoch": 1.503160852259424, "grad_norm": 0.001944607705809176, "learning_rate": 5.850597579169853e-05, "loss": 1.7657, "step": 19260 }, { "epoch": 1.50394130960743, "grad_norm": 0.00044312793761491776, "learning_rate": 5.846122884456776e-05, "loss": 0.3973, "step": 19270 }, { "epoch": 1.504721766955436, "grad_norm": 0.02026950940489769, "learning_rate": 5.84164749198303e-05, "loss": 0.2291, "step": 19280 }, { "epoch": 1.505502224303442, "grad_norm": 0.42607226967811584, "learning_rate": 5.837171405439272e-05, "loss": 1.0548, "step": 19290 }, { "epoch": 1.5062826816514479, "grad_norm": 5.227291584014893, "learning_rate": 5.83269462851674e-05, "loss": 0.2234, "step": 19300 }, { "epoch": 1.5070631389994538, "grad_norm": 1.4641375541687012, "learning_rate": 5.8282171649072325e-05, "loss": 0.8951, "step": 19310 }, { "epoch": 1.5078435963474597, "grad_norm": 5.018607680540299e-06, "learning_rate": 5.823739018303123e-05, "loss": 0.5481, "step": 19320 }, { "epoch": 1.5086240536954656, "grad_norm": 60.11586380004883, "learning_rate": 5.81926019239734e-05, "loss": 2.3614, "step": 19330 }, { "epoch": 1.5094045110434715, "grad_norm": 6.57179862173507e-06, "learning_rate": 5.814780690883378e-05, "loss": 0.5306, "step": 19340 }, { "epoch": 1.5101849683914774, "grad_norm": 8.855113264871761e-05, "learning_rate": 5.8103005174552846e-05, "loss": 0.6372, "step": 19350 }, { "epoch": 1.5109654257394833, "grad_norm": 50.526031494140625, "learning_rate": 5.805819675807669e-05, "loss": 1.4548, "step": 19360 }, { "epoch": 1.5117458830874893, "grad_norm": 1.4122130870819092, "learning_rate": 5.801338169635681e-05, "loss": 0.7567, "step": 19370 }, { "epoch": 1.5125263404354952, "grad_norm": 35.40172576904297, "learning_rate": 5.7968560026350236e-05, "loss": 0.4017, "step": 19380 }, { "epoch": 1.513306797783501, "grad_norm": 6.476219596152077e-07, "learning_rate": 5.792373178501945e-05, "loss": 1.0184, "step": 19390 }, { "epoch": 1.514087255131507, "grad_norm": 3.57866468903012e-07, "learning_rate": 5.7878897009332376e-05, "loss": 0.2702, "step": 19400 }, { "epoch": 1.514867712479513, "grad_norm": 34.72517013549805, "learning_rate": 5.783405573626228e-05, "loss": 1.5825, "step": 19410 }, { "epoch": 1.5156481698275188, "grad_norm": 35.87035369873047, "learning_rate": 5.7789208002787796e-05, "loss": 1.2819, "step": 19420 }, { "epoch": 1.5164286271755247, "grad_norm": 3.8606178760528564, "learning_rate": 5.774435384589291e-05, "loss": 0.8184, "step": 19430 }, { "epoch": 1.5172090845235306, "grad_norm": 1.1874459981918335, "learning_rate": 5.769949330256689e-05, "loss": 0.363, "step": 19440 }, { "epoch": 1.5179895418715366, "grad_norm": 57.610191345214844, "learning_rate": 5.765462640980428e-05, "loss": 1.1065, "step": 19450 }, { "epoch": 1.5187699992195427, "grad_norm": 22.041465759277344, "learning_rate": 5.760975320460482e-05, "loss": 1.0598, "step": 19460 }, { "epoch": 1.5195504565675486, "grad_norm": 19.411191940307617, "learning_rate": 5.756487372397351e-05, "loss": 1.3748, "step": 19470 }, { "epoch": 1.5203309139155545, "grad_norm": 0.10800797492265701, "learning_rate": 5.75199880049205e-05, "loss": 1.3595, "step": 19480 }, { "epoch": 1.5211113712635604, "grad_norm": 0.026111967861652374, "learning_rate": 5.747509608446109e-05, "loss": 0.7284, "step": 19490 }, { "epoch": 1.5218918286115664, "grad_norm": 0.5697845816612244, "learning_rate": 5.743019799961566e-05, "loss": 2.3566, "step": 19500 }, { "epoch": 1.5226722859595723, "grad_norm": 0.0015555124264210463, "learning_rate": 5.738529378740976e-05, "loss": 0.6254, "step": 19510 }, { "epoch": 1.5234527433075784, "grad_norm": 52.46384048461914, "learning_rate": 5.734038348487389e-05, "loss": 0.6338, "step": 19520 }, { "epoch": 1.5242332006555843, "grad_norm": 0.001647842931561172, "learning_rate": 5.7295467129043644e-05, "loss": 0.5647, "step": 19530 }, { "epoch": 1.5250136580035902, "grad_norm": 1.7748408317565918, "learning_rate": 5.7250544756959576e-05, "loss": 0.4189, "step": 19540 }, { "epoch": 1.5257941153515961, "grad_norm": 12.625972747802734, "learning_rate": 5.72056164056672e-05, "loss": 0.5406, "step": 19550 }, { "epoch": 1.526574572699602, "grad_norm": 26.848318099975586, "learning_rate": 5.716068211221698e-05, "loss": 1.4614, "step": 19560 }, { "epoch": 1.527355030047608, "grad_norm": 5.812100410461426, "learning_rate": 5.7115741913664264e-05, "loss": 0.0883, "step": 19570 }, { "epoch": 1.528135487395614, "grad_norm": 44.151092529296875, "learning_rate": 5.707079584706927e-05, "loss": 1.5955, "step": 19580 }, { "epoch": 1.5289159447436198, "grad_norm": 1.2502694129943848, "learning_rate": 5.702584394949708e-05, "loss": 0.5244, "step": 19590 }, { "epoch": 1.5296964020916257, "grad_norm": 32.96466064453125, "learning_rate": 5.698088625801756e-05, "loss": 0.4484, "step": 19600 }, { "epoch": 1.5304768594396316, "grad_norm": 5.660415425268184e-08, "learning_rate": 5.693592280970534e-05, "loss": 1.1164, "step": 19610 }, { "epoch": 1.5312573167876375, "grad_norm": 19.453800201416016, "learning_rate": 5.6890953641639824e-05, "loss": 0.8106, "step": 19620 }, { "epoch": 1.5320377741356435, "grad_norm": 0.04822535812854767, "learning_rate": 5.684597879090514e-05, "loss": 0.604, "step": 19630 }, { "epoch": 1.5328182314836494, "grad_norm": 0.00011620177974691615, "learning_rate": 5.680099829459008e-05, "loss": 1.3639, "step": 19640 }, { "epoch": 1.5335986888316553, "grad_norm": 0.08724360913038254, "learning_rate": 5.675601218978809e-05, "loss": 0.5596, "step": 19650 }, { "epoch": 1.5343791461796612, "grad_norm": 0.37494370341300964, "learning_rate": 5.671102051359727e-05, "loss": 0.5558, "step": 19660 }, { "epoch": 1.5351596035276671, "grad_norm": 3.1753580570220947, "learning_rate": 5.6666023303120266e-05, "loss": 0.9573, "step": 19670 }, { "epoch": 1.535940060875673, "grad_norm": 0.12596221268177032, "learning_rate": 5.662102059546434e-05, "loss": 1.151, "step": 19680 }, { "epoch": 1.536720518223679, "grad_norm": 27.574541091918945, "learning_rate": 5.657601242774125e-05, "loss": 1.5459, "step": 19690 }, { "epoch": 1.5375009755716849, "grad_norm": 5.653417110443115, "learning_rate": 5.653099883706727e-05, "loss": 2.5211, "step": 19700 }, { "epoch": 1.538281432919691, "grad_norm": 0.7937735915184021, "learning_rate": 5.648597986056318e-05, "loss": 0.3426, "step": 19710 }, { "epoch": 1.539061890267697, "grad_norm": 2.5061755180358887, "learning_rate": 5.6440955535354124e-05, "loss": 0.7989, "step": 19720 }, { "epoch": 1.5398423476157028, "grad_norm": 27.361835479736328, "learning_rate": 5.639592589856973e-05, "loss": 1.1338, "step": 19730 }, { "epoch": 1.5406228049637087, "grad_norm": 1.5489771612919867e-05, "learning_rate": 5.6350890987343944e-05, "loss": 0.142, "step": 19740 }, { "epoch": 1.5414032623117147, "grad_norm": 1.1988106052740477e-05, "learning_rate": 5.630585083881513e-05, "loss": 1.1921, "step": 19750 }, { "epoch": 1.5421837196597206, "grad_norm": 36.36326599121094, "learning_rate": 5.626080549012592e-05, "loss": 0.7187, "step": 19760 }, { "epoch": 1.5429641770077265, "grad_norm": 30.49346160888672, "learning_rate": 5.6215754978423254e-05, "loss": 1.4105, "step": 19770 }, { "epoch": 1.5437446343557326, "grad_norm": 0.00679404940456152, "learning_rate": 5.617069934085832e-05, "loss": 0.1904, "step": 19780 }, { "epoch": 1.5445250917037385, "grad_norm": 15.944265365600586, "learning_rate": 5.612563861458655e-05, "loss": 0.8945, "step": 19790 }, { "epoch": 1.5453055490517444, "grad_norm": 24.04916763305664, "learning_rate": 5.6080572836767555e-05, "loss": 0.208, "step": 19800 }, { "epoch": 1.5460860063997504, "grad_norm": 34.59318542480469, "learning_rate": 5.60355020445651e-05, "loss": 1.1715, "step": 19810 }, { "epoch": 1.5468664637477563, "grad_norm": 5.566281318664551, "learning_rate": 5.5990426275147136e-05, "loss": 0.7822, "step": 19820 }, { "epoch": 1.5476469210957622, "grad_norm": 27.87342643737793, "learning_rate": 5.594534556568567e-05, "loss": 0.6418, "step": 19830 }, { "epoch": 1.548427378443768, "grad_norm": 0.00011106702004326507, "learning_rate": 5.5900259953356804e-05, "loss": 0.2046, "step": 19840 }, { "epoch": 1.549207835791774, "grad_norm": 88.22048950195312, "learning_rate": 5.585516947534066e-05, "loss": 1.8431, "step": 19850 }, { "epoch": 1.54998829313978, "grad_norm": 3.4051570310111856e-06, "learning_rate": 5.5810074168821424e-05, "loss": 1.2887, "step": 19860 }, { "epoch": 1.5507687504877858, "grad_norm": 1.627982726404298e-08, "learning_rate": 5.5764974070987196e-05, "loss": 0.4555, "step": 19870 }, { "epoch": 1.5515492078357918, "grad_norm": 58.66267395019531, "learning_rate": 5.571986921903007e-05, "loss": 1.8405, "step": 19880 }, { "epoch": 1.5523296651837977, "grad_norm": 1.2103909341476538e-07, "learning_rate": 5.567475965014606e-05, "loss": 1.071, "step": 19890 }, { "epoch": 1.5531101225318036, "grad_norm": 29.37429428100586, "learning_rate": 5.562964540153506e-05, "loss": 1.3935, "step": 19900 }, { "epoch": 1.5538905798798095, "grad_norm": 0.00016841200704220682, "learning_rate": 5.558452651040081e-05, "loss": 0.8621, "step": 19910 }, { "epoch": 1.5546710372278154, "grad_norm": 82.55367279052734, "learning_rate": 5.553940301395093e-05, "loss": 0.669, "step": 19920 }, { "epoch": 1.5554514945758213, "grad_norm": 26.137784957885742, "learning_rate": 5.5494274949396754e-05, "loss": 1.3306, "step": 19930 }, { "epoch": 1.5562319519238272, "grad_norm": 0.6996078491210938, "learning_rate": 5.5449142353953465e-05, "loss": 0.8355, "step": 19940 }, { "epoch": 1.5570124092718332, "grad_norm": 31.849136352539062, "learning_rate": 5.5404005264839956e-05, "loss": 1.5454, "step": 19950 }, { "epoch": 1.557792866619839, "grad_norm": 1.0991023373208009e-05, "learning_rate": 5.5358863719278764e-05, "loss": 0.8768, "step": 19960 }, { "epoch": 1.5585733239678452, "grad_norm": 31.20032501220703, "learning_rate": 5.531371775449621e-05, "loss": 1.9345, "step": 19970 }, { "epoch": 1.5593537813158511, "grad_norm": 0.3876931965351105, "learning_rate": 5.526856740772218e-05, "loss": 0.9811, "step": 19980 }, { "epoch": 1.560134238663857, "grad_norm": 0.008426464162766933, "learning_rate": 5.522341271619019e-05, "loss": 0.8697, "step": 19990 }, { "epoch": 1.560914696011863, "grad_norm": 6.8745598793029785, "learning_rate": 5.517825371713737e-05, "loss": 0.0568, "step": 20000 }, { "epoch": 1.5616951533598689, "grad_norm": 4.607158184051514, "learning_rate": 5.5133090447804346e-05, "loss": 1.1868, "step": 20010 }, { "epoch": 1.5624756107078748, "grad_norm": 31.82735824584961, "learning_rate": 5.508792294543533e-05, "loss": 1.0846, "step": 20020 }, { "epoch": 1.563256068055881, "grad_norm": 3.365826368331909, "learning_rate": 5.504275124727798e-05, "loss": 1.5055, "step": 20030 }, { "epoch": 1.5640365254038868, "grad_norm": 17.653242111206055, "learning_rate": 5.499757539058342e-05, "loss": 0.488, "step": 20040 }, { "epoch": 1.5648169827518927, "grad_norm": 0.3879726827144623, "learning_rate": 5.495239541260623e-05, "loss": 0.931, "step": 20050 }, { "epoch": 1.5655974400998987, "grad_norm": 39.06393814086914, "learning_rate": 5.490721135060435e-05, "loss": 0.5254, "step": 20060 }, { "epoch": 1.5663778974479046, "grad_norm": 1.1639221906661987, "learning_rate": 5.486202324183911e-05, "loss": 1.1717, "step": 20070 }, { "epoch": 1.5671583547959105, "grad_norm": 42.985511779785156, "learning_rate": 5.481683112357517e-05, "loss": 0.8609, "step": 20080 }, { "epoch": 1.5679388121439164, "grad_norm": 9.094670531339943e-06, "learning_rate": 5.477163503308052e-05, "loss": 0.1233, "step": 20090 }, { "epoch": 1.5687192694919223, "grad_norm": 37.52073287963867, "learning_rate": 5.472643500762639e-05, "loss": 0.6381, "step": 20100 }, { "epoch": 1.5694997268399282, "grad_norm": 0.0003045310149900615, "learning_rate": 5.468123108448727e-05, "loss": 0.2886, "step": 20110 }, { "epoch": 1.5702801841879341, "grad_norm": 0.2237444669008255, "learning_rate": 5.4636023300940865e-05, "loss": 1.0482, "step": 20120 }, { "epoch": 1.57106064153594, "grad_norm": 16.167692184448242, "learning_rate": 5.4590811694268085e-05, "loss": 0.4846, "step": 20130 }, { "epoch": 1.571841098883946, "grad_norm": 47.023067474365234, "learning_rate": 5.4545596301752964e-05, "loss": 1.4037, "step": 20140 }, { "epoch": 1.5726215562319519, "grad_norm": 48.14585876464844, "learning_rate": 5.4500377160682646e-05, "loss": 0.8417, "step": 20150 }, { "epoch": 1.5734020135799578, "grad_norm": 1.0672800954125705e-06, "learning_rate": 5.4455154308347404e-05, "loss": 1.1042, "step": 20160 }, { "epoch": 1.5741824709279637, "grad_norm": 0.0014652134850621223, "learning_rate": 5.440992778204054e-05, "loss": 1.655, "step": 20170 }, { "epoch": 1.5749629282759696, "grad_norm": 33.9235725402832, "learning_rate": 5.436469761905841e-05, "loss": 1.4122, "step": 20180 }, { "epoch": 1.5757433856239755, "grad_norm": 25.62478256225586, "learning_rate": 5.431946385670036e-05, "loss": 1.2307, "step": 20190 }, { "epoch": 1.5765238429719814, "grad_norm": 52.47317123413086, "learning_rate": 5.427422653226868e-05, "loss": 0.6712, "step": 20200 }, { "epoch": 1.5773043003199874, "grad_norm": 58.45132827758789, "learning_rate": 5.4228985683068664e-05, "loss": 1.6876, "step": 20210 }, { "epoch": 1.5780847576679935, "grad_norm": 1.3851629495620728, "learning_rate": 5.4183741346408435e-05, "loss": 0.8356, "step": 20220 }, { "epoch": 1.5788652150159994, "grad_norm": 0.010332033038139343, "learning_rate": 5.4138493559599036e-05, "loss": 0.9256, "step": 20230 }, { "epoch": 1.5796456723640053, "grad_norm": 46.7064323425293, "learning_rate": 5.409324235995434e-05, "loss": 1.0566, "step": 20240 }, { "epoch": 1.5804261297120112, "grad_norm": 0.012174281291663647, "learning_rate": 5.404798778479104e-05, "loss": 2.1908, "step": 20250 }, { "epoch": 1.5812065870600172, "grad_norm": 0.055964235216379166, "learning_rate": 5.4002729871428624e-05, "loss": 0.8299, "step": 20260 }, { "epoch": 1.581987044408023, "grad_norm": 0.04050574079155922, "learning_rate": 5.3957468657189315e-05, "loss": 1.8807, "step": 20270 }, { "epoch": 1.582767501756029, "grad_norm": 44.31588363647461, "learning_rate": 5.391220417939804e-05, "loss": 1.4118, "step": 20280 }, { "epoch": 1.5835479591040351, "grad_norm": 60.45882034301758, "learning_rate": 5.386693647538248e-05, "loss": 0.3896, "step": 20290 }, { "epoch": 1.584328416452041, "grad_norm": 3.0239250659942627, "learning_rate": 5.382166558247291e-05, "loss": 2.1834, "step": 20300 }, { "epoch": 1.585108873800047, "grad_norm": 2.8246383666992188, "learning_rate": 5.377639153800229e-05, "loss": 0.5203, "step": 20310 }, { "epoch": 1.5858893311480529, "grad_norm": 0.015088041312992573, "learning_rate": 5.3731114379306114e-05, "loss": 0.1914, "step": 20320 }, { "epoch": 1.5866697884960588, "grad_norm": 0.00013219547690823674, "learning_rate": 5.368583414372251e-05, "loss": 0.7982, "step": 20330 }, { "epoch": 1.5874502458440647, "grad_norm": 0.0006202549557201564, "learning_rate": 5.3640550868592124e-05, "loss": 0.7759, "step": 20340 }, { "epoch": 1.5882307031920706, "grad_norm": 0.30399084091186523, "learning_rate": 5.3595264591258054e-05, "loss": 0.9756, "step": 20350 }, { "epoch": 1.5890111605400765, "grad_norm": 27.899831771850586, "learning_rate": 5.354997534906596e-05, "loss": 1.9642, "step": 20360 }, { "epoch": 1.5897916178880824, "grad_norm": 2.0328943729400635, "learning_rate": 5.35046831793639e-05, "loss": 0.3076, "step": 20370 }, { "epoch": 1.5905720752360883, "grad_norm": 99.052490234375, "learning_rate": 5.345938811950234e-05, "loss": 2.0924, "step": 20380 }, { "epoch": 1.5913525325840943, "grad_norm": 3.535752296447754, "learning_rate": 5.341409020683414e-05, "loss": 0.319, "step": 20390 }, { "epoch": 1.5921329899321002, "grad_norm": 5.720830813515931e-07, "learning_rate": 5.336878947871454e-05, "loss": 0.3174, "step": 20400 }, { "epoch": 1.592913447280106, "grad_norm": 71.32350158691406, "learning_rate": 5.3323485972501055e-05, "loss": 1.6845, "step": 20410 }, { "epoch": 1.593693904628112, "grad_norm": 2.370527982711792, "learning_rate": 5.3278179725553525e-05, "loss": 0.5922, "step": 20420 }, { "epoch": 1.594474361976118, "grad_norm": 0.013675123453140259, "learning_rate": 5.3232870775234024e-05, "loss": 0.5931, "step": 20430 }, { "epoch": 1.5952548193241238, "grad_norm": 3.333381215497866e-08, "learning_rate": 5.318755915890688e-05, "loss": 0.5146, "step": 20440 }, { "epoch": 1.5960352766721297, "grad_norm": 13.518216133117676, "learning_rate": 5.314224491393859e-05, "loss": 1.4546, "step": 20450 }, { "epoch": 1.5968157340201357, "grad_norm": 2.5204646587371826, "learning_rate": 5.309692807769786e-05, "loss": 2.4162, "step": 20460 }, { "epoch": 1.5975961913681416, "grad_norm": 9.781959306565113e-06, "learning_rate": 5.305160868755549e-05, "loss": 1.3862, "step": 20470 }, { "epoch": 1.5983766487161477, "grad_norm": 0.0047400938346982, "learning_rate": 5.300628678088443e-05, "loss": 2.0894, "step": 20480 }, { "epoch": 1.5991571060641536, "grad_norm": 1.9240214824676514, "learning_rate": 5.296096239505966e-05, "loss": 1.8343, "step": 20490 }, { "epoch": 1.5999375634121595, "grad_norm": 2.6736748218536377, "learning_rate": 5.2915635567458245e-05, "loss": 0.1457, "step": 20500 }, { "epoch": 1.6007180207601654, "grad_norm": 1.4274554252624512, "learning_rate": 5.287030633545922e-05, "loss": 0.2772, "step": 20510 }, { "epoch": 1.6014984781081714, "grad_norm": 55.156532287597656, "learning_rate": 5.282497473644365e-05, "loss": 0.6489, "step": 20520 }, { "epoch": 1.6022789354561773, "grad_norm": 5.8431965044292156e-06, "learning_rate": 5.277964080779453e-05, "loss": 0.4494, "step": 20530 }, { "epoch": 1.6030593928041834, "grad_norm": 12.031957626342773, "learning_rate": 5.273430458689674e-05, "loss": 2.4088, "step": 20540 }, { "epoch": 1.6038398501521893, "grad_norm": 27.250608444213867, "learning_rate": 5.268896611113713e-05, "loss": 1.2248, "step": 20550 }, { "epoch": 1.6046203075001952, "grad_norm": 1.2413510084152222, "learning_rate": 5.264362541790434e-05, "loss": 1.4055, "step": 20560 }, { "epoch": 1.6054007648482012, "grad_norm": 0.7440712451934814, "learning_rate": 5.2598282544588874e-05, "loss": 0.4348, "step": 20570 }, { "epoch": 1.606181222196207, "grad_norm": 0.732166588306427, "learning_rate": 5.2552937528583014e-05, "loss": 3.105, "step": 20580 }, { "epoch": 1.606961679544213, "grad_norm": 0.03779369965195656, "learning_rate": 5.2507590407280815e-05, "loss": 0.8086, "step": 20590 }, { "epoch": 1.607742136892219, "grad_norm": 0.03578812628984451, "learning_rate": 5.24622412180781e-05, "loss": 1.2931, "step": 20600 }, { "epoch": 1.6085225942402248, "grad_norm": 27.9847354888916, "learning_rate": 5.2416889998372344e-05, "loss": 0.2665, "step": 20610 }, { "epoch": 1.6093030515882307, "grad_norm": 0.38156062364578247, "learning_rate": 5.237153678556273e-05, "loss": 0.6004, "step": 20620 }, { "epoch": 1.6100835089362366, "grad_norm": 5.8809146139537916e-06, "learning_rate": 5.232618161705009e-05, "loss": 0.3352, "step": 20630 }, { "epoch": 1.6108639662842426, "grad_norm": 0.05808285251259804, "learning_rate": 5.228082453023682e-05, "loss": 0.4951, "step": 20640 }, { "epoch": 1.6116444236322485, "grad_norm": 0.0736168846487999, "learning_rate": 5.2235465562526956e-05, "loss": 0.0526, "step": 20650 }, { "epoch": 1.6124248809802544, "grad_norm": 0.005917796399444342, "learning_rate": 5.219010475132604e-05, "loss": 0.1153, "step": 20660 }, { "epoch": 1.6132053383282603, "grad_norm": 39.21625900268555, "learning_rate": 5.21447421340412e-05, "loss": 0.5997, "step": 20670 }, { "epoch": 1.6139857956762662, "grad_norm": 40.29851531982422, "learning_rate": 5.209937774808098e-05, "loss": 5.3299, "step": 20680 }, { "epoch": 1.6147662530242721, "grad_norm": 4.9903657782124355e-05, "learning_rate": 5.205401163085542e-05, "loss": 0.1729, "step": 20690 }, { "epoch": 1.615546710372278, "grad_norm": 0.03153657168149948, "learning_rate": 5.200864381977596e-05, "loss": 1.1679, "step": 20700 }, { "epoch": 1.616327167720284, "grad_norm": 0.011658350005745888, "learning_rate": 5.196327435225548e-05, "loss": 0.1952, "step": 20710 }, { "epoch": 1.6171076250682899, "grad_norm": 5.017102466808865e-07, "learning_rate": 5.191790326570821e-05, "loss": 0.8395, "step": 20720 }, { "epoch": 1.617888082416296, "grad_norm": 0.6353729963302612, "learning_rate": 5.1872530597549696e-05, "loss": 1.0601, "step": 20730 }, { "epoch": 1.618668539764302, "grad_norm": 22.19393539428711, "learning_rate": 5.1827156385196775e-05, "loss": 1.74, "step": 20740 }, { "epoch": 1.6194489971123078, "grad_norm": 1.667719857323391e-06, "learning_rate": 5.178178066606762e-05, "loss": 0.3663, "step": 20750 }, { "epoch": 1.6202294544603137, "grad_norm": 47.38582992553711, "learning_rate": 5.1736403477581594e-05, "loss": 0.8091, "step": 20760 }, { "epoch": 1.6210099118083197, "grad_norm": 0.21062630414962769, "learning_rate": 5.1691024857159297e-05, "loss": 0.2815, "step": 20770 }, { "epoch": 1.6217903691563256, "grad_norm": 1.026199460029602, "learning_rate": 5.164564484222247e-05, "loss": 1.9512, "step": 20780 }, { "epoch": 1.6225708265043315, "grad_norm": 30.83462142944336, "learning_rate": 5.160026347019407e-05, "loss": 0.5864, "step": 20790 }, { "epoch": 1.6233512838523376, "grad_norm": 0.025723319500684738, "learning_rate": 5.155488077849812e-05, "loss": 1.3208, "step": 20800 }, { "epoch": 1.6241317412003435, "grad_norm": 1.1020880918977127e-07, "learning_rate": 5.150949680455974e-05, "loss": 0.2486, "step": 20810 }, { "epoch": 1.6249121985483495, "grad_norm": 2.570460796356201, "learning_rate": 5.146411158580513e-05, "loss": 1.6481, "step": 20820 }, { "epoch": 1.6256926558963554, "grad_norm": 47.203922271728516, "learning_rate": 5.141872515966152e-05, "loss": 0.3489, "step": 20830 }, { "epoch": 1.6264731132443613, "grad_norm": 7.59179162979126, "learning_rate": 5.137333756355707e-05, "loss": 0.8454, "step": 20840 }, { "epoch": 1.6272535705923672, "grad_norm": 0.002626607194542885, "learning_rate": 5.132794883492099e-05, "loss": 1.9901, "step": 20850 }, { "epoch": 1.628034027940373, "grad_norm": 0.28491634130477905, "learning_rate": 5.128255901118335e-05, "loss": 0.7532, "step": 20860 }, { "epoch": 1.628814485288379, "grad_norm": 0.0006308460724540055, "learning_rate": 5.1237168129775216e-05, "loss": 0.1481, "step": 20870 }, { "epoch": 1.629594942636385, "grad_norm": 11.376788139343262, "learning_rate": 5.119177622812842e-05, "loss": 1.0631, "step": 20880 }, { "epoch": 1.6303753999843908, "grad_norm": 16.143569946289062, "learning_rate": 5.1146383343675706e-05, "loss": 0.7264, "step": 20890 }, { "epoch": 1.6311558573323968, "grad_norm": 57.7200813293457, "learning_rate": 5.110098951385061e-05, "loss": 0.8139, "step": 20900 }, { "epoch": 1.6319363146804027, "grad_norm": 1.1819399333035108e-05, "learning_rate": 5.1055594776087436e-05, "loss": 1.5962, "step": 20910 }, { "epoch": 1.6327167720284086, "grad_norm": 1.585782527923584, "learning_rate": 5.101019916782125e-05, "loss": 2.1014, "step": 20920 }, { "epoch": 1.6334972293764145, "grad_norm": 0.6489249467849731, "learning_rate": 5.0964802726487835e-05, "loss": 1.3317, "step": 20930 }, { "epoch": 1.6342776867244204, "grad_norm": 0.0019743037410080433, "learning_rate": 5.091940548952365e-05, "loss": 0.0838, "step": 20940 }, { "epoch": 1.6350581440724263, "grad_norm": 0.3787420392036438, "learning_rate": 5.0874007494365826e-05, "loss": 0.4802, "step": 20950 }, { "epoch": 1.6358386014204322, "grad_norm": 0.025786397978663445, "learning_rate": 5.082860877845212e-05, "loss": 0.9618, "step": 20960 }, { "epoch": 1.6366190587684382, "grad_norm": 0.0014336465392261744, "learning_rate": 5.078320937922084e-05, "loss": 2.8194, "step": 20970 }, { "epoch": 1.637399516116444, "grad_norm": 65.5517807006836, "learning_rate": 5.073780933411093e-05, "loss": 0.916, "step": 20980 }, { "epoch": 1.6381799734644502, "grad_norm": 0.2608548104763031, "learning_rate": 5.0692408680561806e-05, "loss": 0.6485, "step": 20990 }, { "epoch": 1.6389604308124561, "grad_norm": 1.4473145008087158, "learning_rate": 5.064700745601343e-05, "loss": 0.5851, "step": 21000 }, { "epoch": 1.639740888160462, "grad_norm": 0.11096695065498352, "learning_rate": 5.06016056979062e-05, "loss": 0.2983, "step": 21010 }, { "epoch": 1.640521345508468, "grad_norm": 0.15635938942432404, "learning_rate": 5.055620344368095e-05, "loss": 0.4633, "step": 21020 }, { "epoch": 1.6413018028564739, "grad_norm": 90.75917053222656, "learning_rate": 5.0510800730778974e-05, "loss": 1.5644, "step": 21030 }, { "epoch": 1.6420822602044798, "grad_norm": 2.372506990866441e-08, "learning_rate": 5.046539759664188e-05, "loss": 1.1676, "step": 21040 }, { "epoch": 1.642862717552486, "grad_norm": 70.3769760131836, "learning_rate": 5.0419994078711674e-05, "loss": 1.0743, "step": 21050 }, { "epoch": 1.6436431749004918, "grad_norm": 1.29304838180542, "learning_rate": 5.037459021443066e-05, "loss": 0.4095, "step": 21060 }, { "epoch": 1.6444236322484977, "grad_norm": 0.04536210000514984, "learning_rate": 5.032918604124142e-05, "loss": 0.2607, "step": 21070 }, { "epoch": 1.6452040895965037, "grad_norm": 8.825725555419922, "learning_rate": 5.02837815965868e-05, "loss": 1.2542, "step": 21080 }, { "epoch": 1.6459845469445096, "grad_norm": 12.517806053161621, "learning_rate": 5.023837691790984e-05, "loss": 3.7509, "step": 21090 }, { "epoch": 1.6467650042925155, "grad_norm": 0.4820645749568939, "learning_rate": 5.0192972042653844e-05, "loss": 2.1886, "step": 21100 }, { "epoch": 1.6475454616405214, "grad_norm": 40.499412536621094, "learning_rate": 5.014756700826221e-05, "loss": 0.9779, "step": 21110 }, { "epoch": 1.6483259189885273, "grad_norm": 0.00010101215593749657, "learning_rate": 5.01021618521785e-05, "loss": 0.2086, "step": 21120 }, { "epoch": 1.6491063763365332, "grad_norm": 4.254552364349365, "learning_rate": 5.005675661184635e-05, "loss": 0.312, "step": 21130 }, { "epoch": 1.6498868336845391, "grad_norm": 2.831031560897827, "learning_rate": 5.001135132470951e-05, "loss": 0.087, "step": 21140 }, { "epoch": 1.650667291032545, "grad_norm": 49.950191497802734, "learning_rate": 4.9965946028211714e-05, "loss": 4.2688, "step": 21150 }, { "epoch": 1.651447748380551, "grad_norm": 2.2463319301605225, "learning_rate": 4.992054075979676e-05, "loss": 0.2042, "step": 21160 }, { "epoch": 1.6522282057285569, "grad_norm": 53.4621467590332, "learning_rate": 4.9875135556908376e-05, "loss": 2.1503, "step": 21170 }, { "epoch": 1.6530086630765628, "grad_norm": 2.5740723609924316, "learning_rate": 4.9829730456990244e-05, "loss": 0.1176, "step": 21180 }, { "epoch": 1.6537891204245687, "grad_norm": 0.07874448597431183, "learning_rate": 4.9784325497486e-05, "loss": 0.2987, "step": 21190 }, { "epoch": 1.6545695777725746, "grad_norm": 1.9069774150848389, "learning_rate": 4.9738920715839105e-05, "loss": 1.505, "step": 21200 }, { "epoch": 1.6553500351205805, "grad_norm": 3.814951014646795e-06, "learning_rate": 4.9693516149492924e-05, "loss": 0.4965, "step": 21210 }, { "epoch": 1.6561304924685865, "grad_norm": 4.037242889404297, "learning_rate": 4.964811183589061e-05, "loss": 0.1444, "step": 21220 }, { "epoch": 1.6569109498165924, "grad_norm": 0.0023153889924287796, "learning_rate": 4.960270781247515e-05, "loss": 0.0273, "step": 21230 }, { "epoch": 1.6576914071645985, "grad_norm": 3.0313501611090032e-06, "learning_rate": 4.955730411668922e-05, "loss": 0.5516, "step": 21240 }, { "epoch": 1.6584718645126044, "grad_norm": 0.18385180830955505, "learning_rate": 4.951190078597531e-05, "loss": 0.7793, "step": 21250 }, { "epoch": 1.6592523218606103, "grad_norm": 0.6903563737869263, "learning_rate": 4.9466497857775544e-05, "loss": 1.9214, "step": 21260 }, { "epoch": 1.6600327792086162, "grad_norm": 0.040249597281217575, "learning_rate": 4.9421095369531764e-05, "loss": 2.2853, "step": 21270 }, { "epoch": 1.6608132365566222, "grad_norm": 35.857269287109375, "learning_rate": 4.9375693358685395e-05, "loss": 2.6226, "step": 21280 }, { "epoch": 1.661593693904628, "grad_norm": 11.268913269042969, "learning_rate": 4.933029186267749e-05, "loss": 0.6422, "step": 21290 }, { "epoch": 1.662374151252634, "grad_norm": 2.7771138775278814e-05, "learning_rate": 4.9284890918948734e-05, "loss": 0.169, "step": 21300 }, { "epoch": 1.6631546086006401, "grad_norm": 82.51126098632812, "learning_rate": 4.9239490564939244e-05, "loss": 1.7565, "step": 21310 }, { "epoch": 1.663935065948646, "grad_norm": 18.174606323242188, "learning_rate": 4.919409083808876e-05, "loss": 0.4783, "step": 21320 }, { "epoch": 1.664715523296652, "grad_norm": 16.663576126098633, "learning_rate": 4.914869177583643e-05, "loss": 0.6331, "step": 21330 }, { "epoch": 1.6654959806446579, "grad_norm": 0.04770435020327568, "learning_rate": 4.9103293415620916e-05, "loss": 1.0681, "step": 21340 }, { "epoch": 1.6662764379926638, "grad_norm": 7.188361167907715, "learning_rate": 4.9057895794880224e-05, "loss": 2.0846, "step": 21350 }, { "epoch": 1.6670568953406697, "grad_norm": 48.30089569091797, "learning_rate": 4.9012498951051825e-05, "loss": 0.5371, "step": 21360 }, { "epoch": 1.6678373526886756, "grad_norm": 0.17572489380836487, "learning_rate": 4.8967102921572524e-05, "loss": 0.6895, "step": 21370 }, { "epoch": 1.6686178100366815, "grad_norm": 0.0517539419233799, "learning_rate": 4.8921707743878404e-05, "loss": 0.7653, "step": 21380 }, { "epoch": 1.6693982673846874, "grad_norm": 32.768985748291016, "learning_rate": 4.8876313455404934e-05, "loss": 0.6787, "step": 21390 }, { "epoch": 1.6701787247326934, "grad_norm": 6.695905176457018e-05, "learning_rate": 4.883092009358678e-05, "loss": 1.0841, "step": 21400 }, { "epoch": 1.6709591820806993, "grad_norm": 0.32399168610572815, "learning_rate": 4.87855276958579e-05, "loss": 1.3301, "step": 21410 }, { "epoch": 1.6717396394287052, "grad_norm": 0.19651849567890167, "learning_rate": 4.874013629965138e-05, "loss": 0.7622, "step": 21420 }, { "epoch": 1.672520096776711, "grad_norm": 0.5481512546539307, "learning_rate": 4.869474594239958e-05, "loss": 0.3891, "step": 21430 }, { "epoch": 1.673300554124717, "grad_norm": 7.097708225250244, "learning_rate": 4.864935666153388e-05, "loss": 0.1903, "step": 21440 }, { "epoch": 1.674081011472723, "grad_norm": 0.6818715929985046, "learning_rate": 4.860396849448492e-05, "loss": 0.2203, "step": 21450 }, { "epoch": 1.6748614688207288, "grad_norm": 41.85393142700195, "learning_rate": 4.8558581478682294e-05, "loss": 0.4238, "step": 21460 }, { "epoch": 1.6756419261687348, "grad_norm": 53.905677795410156, "learning_rate": 4.851319565155472e-05, "loss": 1.0562, "step": 21470 }, { "epoch": 1.6764223835167407, "grad_norm": 39.14933776855469, "learning_rate": 4.846781105052989e-05, "loss": 0.8403, "step": 21480 }, { "epoch": 1.6772028408647466, "grad_norm": 1.2465424537658691, "learning_rate": 4.842242771303451e-05, "loss": 1.5033, "step": 21490 }, { "epoch": 1.6779832982127527, "grad_norm": 8.150337219238281, "learning_rate": 4.837704567649428e-05, "loss": 2.6654, "step": 21500 }, { "epoch": 1.6787637555607586, "grad_norm": 0.18484830856323242, "learning_rate": 4.833166497833372e-05, "loss": 1.4964, "step": 21510 }, { "epoch": 1.6795442129087645, "grad_norm": 0.00015833714860491455, "learning_rate": 4.828628565597636e-05, "loss": 0.5441, "step": 21520 }, { "epoch": 1.6803246702567705, "grad_norm": 50.95884323120117, "learning_rate": 4.824090774684454e-05, "loss": 1.7497, "step": 21530 }, { "epoch": 1.6811051276047764, "grad_norm": 48.3619384765625, "learning_rate": 4.8195531288359466e-05, "loss": 1.1273, "step": 21540 }, { "epoch": 1.6818855849527823, "grad_norm": 0.0017549977637827396, "learning_rate": 4.815015631794108e-05, "loss": 1.4486, "step": 21550 }, { "epoch": 1.6826660423007884, "grad_norm": 0.11870098859071732, "learning_rate": 4.810478287300817e-05, "loss": 0.1412, "step": 21560 }, { "epoch": 1.6834464996487943, "grad_norm": 0.0002780222857836634, "learning_rate": 4.805941099097826e-05, "loss": 1.6231, "step": 21570 }, { "epoch": 1.6842269569968003, "grad_norm": 55.342716217041016, "learning_rate": 4.801404070926751e-05, "loss": 1.7901, "step": 21580 }, { "epoch": 1.6850074143448062, "grad_norm": 3.412632465362549, "learning_rate": 4.796867206529086e-05, "loss": 0.0696, "step": 21590 }, { "epoch": 1.685787871692812, "grad_norm": 6.7516374588012695, "learning_rate": 4.792330509646182e-05, "loss": 1.5025, "step": 21600 }, { "epoch": 1.686568329040818, "grad_norm": 1.281043114431668e-05, "learning_rate": 4.78779398401926e-05, "loss": 0.4093, "step": 21610 }, { "epoch": 1.687348786388824, "grad_norm": 37.101993560791016, "learning_rate": 4.783257633389389e-05, "loss": 1.4379, "step": 21620 }, { "epoch": 1.6881292437368298, "grad_norm": 56.41515350341797, "learning_rate": 4.778721461497504e-05, "loss": 1.4588, "step": 21630 }, { "epoch": 1.6889097010848357, "grad_norm": 5.122697984916158e-06, "learning_rate": 4.774185472084386e-05, "loss": 0.6948, "step": 21640 }, { "epoch": 1.6896901584328416, "grad_norm": 63.40077209472656, "learning_rate": 4.7696496688906704e-05, "loss": 2.4883, "step": 21650 }, { "epoch": 1.6904706157808476, "grad_norm": 4.130136403546203e-06, "learning_rate": 4.765114055656834e-05, "loss": 0.3206, "step": 21660 }, { "epoch": 1.6912510731288535, "grad_norm": 0.13058346509933472, "learning_rate": 4.7605786361232e-05, "loss": 1.351, "step": 21670 }, { "epoch": 1.6920315304768594, "grad_norm": 47.44347381591797, "learning_rate": 4.756043414029932e-05, "loss": 0.9191, "step": 21680 }, { "epoch": 1.6928119878248653, "grad_norm": 0.0047844285145401955, "learning_rate": 4.7515083931170284e-05, "loss": 0.1568, "step": 21690 }, { "epoch": 1.6935924451728712, "grad_norm": 43.56236267089844, "learning_rate": 4.746973577124325e-05, "loss": 1.2394, "step": 21700 }, { "epoch": 1.6943729025208771, "grad_norm": 1.7203421592712402, "learning_rate": 4.742438969791485e-05, "loss": 0.6306, "step": 21710 }, { "epoch": 1.695153359868883, "grad_norm": 0.0004018844338133931, "learning_rate": 4.7379045748580056e-05, "loss": 0.5336, "step": 21720 }, { "epoch": 1.695933817216889, "grad_norm": 40.43912124633789, "learning_rate": 4.733370396063199e-05, "loss": 1.0372, "step": 21730 }, { "epoch": 1.6967142745648949, "grad_norm": 12.819672584533691, "learning_rate": 4.72883643714621e-05, "loss": 0.8208, "step": 21740 }, { "epoch": 1.697494731912901, "grad_norm": 3.561326957424171e-05, "learning_rate": 4.7243027018459926e-05, "loss": 0.7186, "step": 21750 }, { "epoch": 1.698275189260907, "grad_norm": 41.82414245605469, "learning_rate": 4.7197691939013243e-05, "loss": 0.9432, "step": 21760 }, { "epoch": 1.6990556466089128, "grad_norm": 0.019969845190644264, "learning_rate": 4.715235917050791e-05, "loss": 1.8267, "step": 21770 }, { "epoch": 1.6998361039569188, "grad_norm": 0.06363899260759354, "learning_rate": 4.710702875032785e-05, "loss": 0.715, "step": 21780 }, { "epoch": 1.7006165613049247, "grad_norm": 4.866072654724121, "learning_rate": 4.706170071585512e-05, "loss": 0.5587, "step": 21790 }, { "epoch": 1.7013970186529306, "grad_norm": 0.034537654370069504, "learning_rate": 4.701637510446976e-05, "loss": 0.6576, "step": 21800 }, { "epoch": 1.7021774760009365, "grad_norm": 13.53211498260498, "learning_rate": 4.6971051953549855e-05, "loss": 0.7478, "step": 21810 }, { "epoch": 1.7029579333489426, "grad_norm": 0.01826702244579792, "learning_rate": 4.692573130047139e-05, "loss": 0.6309, "step": 21820 }, { "epoch": 1.7037383906969485, "grad_norm": 33.52978515625, "learning_rate": 4.688041318260836e-05, "loss": 0.5877, "step": 21830 }, { "epoch": 1.7045188480449545, "grad_norm": 0.7539353966712952, "learning_rate": 4.683509763733263e-05, "loss": 1.5259, "step": 21840 }, { "epoch": 1.7052993053929604, "grad_norm": 100.28569793701172, "learning_rate": 4.6789784702013975e-05, "loss": 1.7804, "step": 21850 }, { "epoch": 1.7060797627409663, "grad_norm": 4.1331446709591546e-07, "learning_rate": 4.674447441401997e-05, "loss": 1.0625, "step": 21860 }, { "epoch": 1.7068602200889722, "grad_norm": 47.859371185302734, "learning_rate": 4.669916681071604e-05, "loss": 0.847, "step": 21870 }, { "epoch": 1.7076406774369781, "grad_norm": 0.08291533589363098, "learning_rate": 4.665386192946542e-05, "loss": 0.652, "step": 21880 }, { "epoch": 1.708421134784984, "grad_norm": 2.0455920696258545, "learning_rate": 4.660855980762903e-05, "loss": 1.7044, "step": 21890 }, { "epoch": 1.70920159213299, "grad_norm": 0.001365151721984148, "learning_rate": 4.6563260482565586e-05, "loss": 1.3111, "step": 21900 }, { "epoch": 1.7099820494809959, "grad_norm": 54.70901870727539, "learning_rate": 4.651796399163145e-05, "loss": 1.5678, "step": 21910 }, { "epoch": 1.7107625068290018, "grad_norm": 39.61702346801758, "learning_rate": 4.647267037218069e-05, "loss": 0.67, "step": 21920 }, { "epoch": 1.7115429641770077, "grad_norm": 0.0010940422071143985, "learning_rate": 4.642737966156494e-05, "loss": 0.415, "step": 21930 }, { "epoch": 1.7123234215250136, "grad_norm": 0.5741721391677856, "learning_rate": 4.638209189713351e-05, "loss": 0.5323, "step": 21940 }, { "epoch": 1.7131038788730195, "grad_norm": 0.0008567293407395482, "learning_rate": 4.633680711623322e-05, "loss": 0.9283, "step": 21950 }, { "epoch": 1.7138843362210254, "grad_norm": 0.11211058497428894, "learning_rate": 4.6291525356208495e-05, "loss": 1.5876, "step": 21960 }, { "epoch": 1.7146647935690313, "grad_norm": 1.0090612704516388e-05, "learning_rate": 4.624624665440119e-05, "loss": 0.0254, "step": 21970 }, { "epoch": 1.7154452509170373, "grad_norm": 0.07422367483377457, "learning_rate": 4.620097104815067e-05, "loss": 1.1101, "step": 21980 }, { "epoch": 1.7162257082650432, "grad_norm": 0.00016244892321992666, "learning_rate": 4.615569857479382e-05, "loss": 0.9351, "step": 21990 }, { "epoch": 1.717006165613049, "grad_norm": 26.694711685180664, "learning_rate": 4.611042927166479e-05, "loss": 1.2162, "step": 22000 }, { "epoch": 1.7177866229610552, "grad_norm": 0.0031326529569923878, "learning_rate": 4.6065163176095256e-05, "loss": 1.9415, "step": 22010 }, { "epoch": 1.7185670803090611, "grad_norm": 0.0018533789552748203, "learning_rate": 4.6019900325414164e-05, "loss": 0.0851, "step": 22020 }, { "epoch": 1.719347537657067, "grad_norm": 0.5745460391044617, "learning_rate": 4.5974640756947864e-05, "loss": 0.1837, "step": 22030 }, { "epoch": 1.720127995005073, "grad_norm": 0.08341173827648163, "learning_rate": 4.592938450801989e-05, "loss": 0.0196, "step": 22040 }, { "epoch": 1.7209084523530789, "grad_norm": 5.3973726608091965e-06, "learning_rate": 4.588413161595114e-05, "loss": 0.5012, "step": 22050 }, { "epoch": 1.7216889097010848, "grad_norm": 1.1654116605086529e-07, "learning_rate": 4.583888211805969e-05, "loss": 0.0606, "step": 22060 }, { "epoch": 1.722469367049091, "grad_norm": 0.05267452821135521, "learning_rate": 4.5793636051660804e-05, "loss": 0.0854, "step": 22070 }, { "epoch": 1.7232498243970968, "grad_norm": 0.0005213894182816148, "learning_rate": 4.574839345406699e-05, "loss": 2.4604, "step": 22080 }, { "epoch": 1.7240302817451028, "grad_norm": 0.0006774759967811406, "learning_rate": 4.5703154362587795e-05, "loss": 0.5281, "step": 22090 }, { "epoch": 1.7248107390931087, "grad_norm": 0.256835401058197, "learning_rate": 4.565791881452997e-05, "loss": 0.548, "step": 22100 }, { "epoch": 1.7255911964411146, "grad_norm": 75.47777557373047, "learning_rate": 4.5612686847197264e-05, "loss": 1.321, "step": 22110 }, { "epoch": 1.7263716537891205, "grad_norm": 79.86677551269531, "learning_rate": 4.5567458497890546e-05, "loss": 1.6857, "step": 22120 }, { "epoch": 1.7271521111371264, "grad_norm": 0.022913841530680656, "learning_rate": 4.552223380390763e-05, "loss": 2.9717, "step": 22130 }, { "epoch": 1.7279325684851323, "grad_norm": 2.466359853744507, "learning_rate": 4.547701280254338e-05, "loss": 0.159, "step": 22140 }, { "epoch": 1.7287130258331382, "grad_norm": 60.59958267211914, "learning_rate": 4.543179553108958e-05, "loss": 0.7301, "step": 22150 }, { "epoch": 1.7294934831811442, "grad_norm": 3.465244313716198e-09, "learning_rate": 4.5386582026834906e-05, "loss": 0.2713, "step": 22160 }, { "epoch": 1.73027394052915, "grad_norm": 2.2539663314819336, "learning_rate": 4.534137232706501e-05, "loss": 0.571, "step": 22170 }, { "epoch": 1.731054397877156, "grad_norm": 0.3366568982601166, "learning_rate": 4.529616646906233e-05, "loss": 0.2032, "step": 22180 }, { "epoch": 1.731834855225162, "grad_norm": 0.0013072154251858592, "learning_rate": 4.525096449010621e-05, "loss": 2.0557, "step": 22190 }, { "epoch": 1.7326153125731678, "grad_norm": 6.300201416015625, "learning_rate": 4.52057664274727e-05, "loss": 1.4872, "step": 22200 }, { "epoch": 1.7333957699211737, "grad_norm": 78.94749450683594, "learning_rate": 4.516057231843471e-05, "loss": 0.5405, "step": 22210 }, { "epoch": 1.7341762272691796, "grad_norm": 1.786816561377691e-08, "learning_rate": 4.511538220026182e-05, "loss": 0.0601, "step": 22220 }, { "epoch": 1.7349566846171856, "grad_norm": 0.12275684624910355, "learning_rate": 4.5070196110220396e-05, "loss": 0.9074, "step": 22230 }, { "epoch": 1.7357371419651915, "grad_norm": 0.0003189127310179174, "learning_rate": 4.502501408557339e-05, "loss": 0.419, "step": 22240 }, { "epoch": 1.7365175993131974, "grad_norm": 45.07834243774414, "learning_rate": 4.497983616358048e-05, "loss": 1.2748, "step": 22250 }, { "epoch": 1.7372980566612035, "grad_norm": 6.688942448818125e-07, "learning_rate": 4.493466238149793e-05, "loss": 1.6876, "step": 22260 }, { "epoch": 1.7380785140092094, "grad_norm": 0.005018897820264101, "learning_rate": 4.4889492776578565e-05, "loss": 1.7529, "step": 22270 }, { "epoch": 1.7388589713572153, "grad_norm": 8.813771614768484e-07, "learning_rate": 4.4844327386071804e-05, "loss": 1.2264, "step": 22280 }, { "epoch": 1.7396394287052213, "grad_norm": 0.0008363195811398327, "learning_rate": 4.479916624722357e-05, "loss": 0.9457, "step": 22290 }, { "epoch": 1.7404198860532272, "grad_norm": 0.005100402981042862, "learning_rate": 4.475400939727632e-05, "loss": 0.3116, "step": 22300 }, { "epoch": 1.741200343401233, "grad_norm": 1.4255502223968506, "learning_rate": 4.470885687346889e-05, "loss": 0.4434, "step": 22310 }, { "epoch": 1.741980800749239, "grad_norm": 45.88622283935547, "learning_rate": 4.466370871303664e-05, "loss": 1.1204, "step": 22320 }, { "epoch": 1.7427612580972451, "grad_norm": 0.23392795026302338, "learning_rate": 4.461856495321124e-05, "loss": 0.3264, "step": 22330 }, { "epoch": 1.743541715445251, "grad_norm": 0.0455491878092289, "learning_rate": 4.457342563122084e-05, "loss": 0.4507, "step": 22340 }, { "epoch": 1.744322172793257, "grad_norm": 0.003446984337642789, "learning_rate": 4.4528290784289854e-05, "loss": 0.9984, "step": 22350 }, { "epoch": 1.7451026301412629, "grad_norm": 0.003921607509255409, "learning_rate": 4.4483160449638995e-05, "loss": 0.4333, "step": 22360 }, { "epoch": 1.7458830874892688, "grad_norm": 49.32126998901367, "learning_rate": 4.443803466448531e-05, "loss": 2.0987, "step": 22370 }, { "epoch": 1.7466635448372747, "grad_norm": 3.526056104874442e-07, "learning_rate": 4.439291346604205e-05, "loss": 0.4222, "step": 22380 }, { "epoch": 1.7474440021852806, "grad_norm": 43.01560974121094, "learning_rate": 4.4347796891518736e-05, "loss": 1.6041, "step": 22390 }, { "epoch": 1.7482244595332865, "grad_norm": 9.610936164855957, "learning_rate": 4.430268497812099e-05, "loss": 0.8194, "step": 22400 }, { "epoch": 1.7490049168812924, "grad_norm": 10.678997039794922, "learning_rate": 4.425757776305068e-05, "loss": 0.561, "step": 22410 }, { "epoch": 1.7497853742292984, "grad_norm": 8.100939750671387, "learning_rate": 4.421247528350574e-05, "loss": 0.5386, "step": 22420 }, { "epoch": 1.7505658315773043, "grad_norm": 42.0233268737793, "learning_rate": 4.416737757668025e-05, "loss": 2.2196, "step": 22430 }, { "epoch": 1.7513462889253102, "grad_norm": 83.78080749511719, "learning_rate": 4.412228467976428e-05, "loss": 1.5664, "step": 22440 }, { "epoch": 1.752126746273316, "grad_norm": 17.094125747680664, "learning_rate": 4.407719662994401e-05, "loss": 0.5809, "step": 22450 }, { "epoch": 1.752907203621322, "grad_norm": 11.183182716369629, "learning_rate": 4.4032113464401585e-05, "loss": 0.6073, "step": 22460 }, { "epoch": 1.753687660969328, "grad_norm": 9.12381710804766e-06, "learning_rate": 4.39870352203151e-05, "loss": 0.9461, "step": 22470 }, { "epoch": 1.7544681183173338, "grad_norm": 0.0001830037945182994, "learning_rate": 4.394196193485865e-05, "loss": 0.5364, "step": 22480 }, { "epoch": 1.7552485756653398, "grad_norm": 1.9644840955734253, "learning_rate": 4.389689364520219e-05, "loss": 1.0993, "step": 22490 }, { "epoch": 1.7560290330133457, "grad_norm": 3.125764851574786e-05, "learning_rate": 4.3851830388511605e-05, "loss": 0.8421, "step": 22500 }, { "epoch": 1.7568094903613516, "grad_norm": 3.3445630833739415e-05, "learning_rate": 4.380677220194855e-05, "loss": 0.3314, "step": 22510 }, { "epoch": 1.7575899477093577, "grad_norm": 0.02401496283710003, "learning_rate": 4.3761719122670595e-05, "loss": 0.5202, "step": 22520 }, { "epoch": 1.7583704050573636, "grad_norm": 32.83649826049805, "learning_rate": 4.371667118783101e-05, "loss": 1.02, "step": 22530 }, { "epoch": 1.7591508624053696, "grad_norm": 47.032676696777344, "learning_rate": 4.367162843457891e-05, "loss": 0.8202, "step": 22540 }, { "epoch": 1.7599313197533755, "grad_norm": 34.51866912841797, "learning_rate": 4.362659090005905e-05, "loss": 0.5898, "step": 22550 }, { "epoch": 1.7607117771013814, "grad_norm": 0.08327773213386536, "learning_rate": 4.358155862141194e-05, "loss": 1.7651, "step": 22560 }, { "epoch": 1.7614922344493873, "grad_norm": 1.6261988878250122, "learning_rate": 4.3536531635773745e-05, "loss": 1.7785, "step": 22570 }, { "epoch": 1.7622726917973934, "grad_norm": 1.250341534614563, "learning_rate": 4.349150998027624e-05, "loss": 1.6029, "step": 22580 }, { "epoch": 1.7630531491453993, "grad_norm": 11.970406532287598, "learning_rate": 4.344649369204683e-05, "loss": 0.5072, "step": 22590 }, { "epoch": 1.7638336064934053, "grad_norm": 1.0202245712280273, "learning_rate": 4.340148280820848e-05, "loss": 0.9217, "step": 22600 }, { "epoch": 1.7646140638414112, "grad_norm": 34.642059326171875, "learning_rate": 4.3356477365879725e-05, "loss": 1.5028, "step": 22610 }, { "epoch": 1.765394521189417, "grad_norm": 54.67207336425781, "learning_rate": 4.331147740217457e-05, "loss": 1.26, "step": 22620 }, { "epoch": 1.766174978537423, "grad_norm": 30.081071853637695, "learning_rate": 4.3266482954202545e-05, "loss": 0.3095, "step": 22630 }, { "epoch": 1.766955435885429, "grad_norm": 0.38360029458999634, "learning_rate": 4.322149405906859e-05, "loss": 0.93, "step": 22640 }, { "epoch": 1.7677358932334348, "grad_norm": 0.012800214812159538, "learning_rate": 4.3176510753873075e-05, "loss": 0.9898, "step": 22650 }, { "epoch": 1.7685163505814407, "grad_norm": 6.709954323014244e-05, "learning_rate": 4.3131533075711825e-05, "loss": 0.1898, "step": 22660 }, { "epoch": 1.7692968079294467, "grad_norm": 0.01472716685384512, "learning_rate": 4.308656106167591e-05, "loss": 0.2401, "step": 22670 }, { "epoch": 1.7700772652774526, "grad_norm": 2.2141734007163905e-05, "learning_rate": 4.3041594748851835e-05, "loss": 0.6846, "step": 22680 }, { "epoch": 1.7708577226254585, "grad_norm": 65.04307556152344, "learning_rate": 4.299663417432132e-05, "loss": 2.621, "step": 22690 }, { "epoch": 1.7716381799734644, "grad_norm": 0.0004803487390745431, "learning_rate": 4.295167937516144e-05, "loss": 0.4884, "step": 22700 }, { "epoch": 1.7724186373214703, "grad_norm": 0.43179354071617126, "learning_rate": 4.2906730388444406e-05, "loss": 0.6171, "step": 22710 }, { "epoch": 1.7731990946694762, "grad_norm": 0.8222143650054932, "learning_rate": 4.2861787251237725e-05, "loss": 0.7261, "step": 22720 }, { "epoch": 1.7739795520174821, "grad_norm": 2.3041009100666088e-08, "learning_rate": 4.2816850000604026e-05, "loss": 0.0063, "step": 22730 }, { "epoch": 1.774760009365488, "grad_norm": 0.0003917988215107471, "learning_rate": 4.277191867360113e-05, "loss": 0.5468, "step": 22740 }, { "epoch": 1.775540466713494, "grad_norm": 0.06409639865159988, "learning_rate": 4.27269933072819e-05, "loss": 2.1791, "step": 22750 }, { "epoch": 1.7763209240614999, "grad_norm": 0.0009544471395201981, "learning_rate": 4.2682073938694355e-05, "loss": 0.8525, "step": 22760 }, { "epoch": 1.777101381409506, "grad_norm": 45.30816650390625, "learning_rate": 4.263716060488155e-05, "loss": 1.2142, "step": 22770 }, { "epoch": 1.777881838757512, "grad_norm": 0.10559545457363129, "learning_rate": 4.259225334288152e-05, "loss": 0.6182, "step": 22780 }, { "epoch": 1.7786622961055178, "grad_norm": 0.8708995580673218, "learning_rate": 4.254735218972736e-05, "loss": 1.1154, "step": 22790 }, { "epoch": 1.7794427534535238, "grad_norm": 45.40227508544922, "learning_rate": 4.2502457182447066e-05, "loss": 1.7978, "step": 22800 }, { "epoch": 1.7802232108015297, "grad_norm": 0.0006610558484680951, "learning_rate": 4.245756835806363e-05, "loss": 1.1483, "step": 22810 }, { "epoch": 1.7810036681495356, "grad_norm": 0.0016300451243296266, "learning_rate": 4.241268575359487e-05, "loss": 1.3185, "step": 22820 }, { "epoch": 1.7817841254975415, "grad_norm": 0.1046861782670021, "learning_rate": 4.236780940605355e-05, "loss": 0.4392, "step": 22830 }, { "epoch": 1.7825645828455476, "grad_norm": 0.0039477222599089146, "learning_rate": 4.232293935244722e-05, "loss": 0.8003, "step": 22840 }, { "epoch": 1.7833450401935536, "grad_norm": 0.0004634255019482225, "learning_rate": 4.227807562977825e-05, "loss": 0.0647, "step": 22850 }, { "epoch": 1.7841254975415595, "grad_norm": 0.8950148224830627, "learning_rate": 4.2233218275043805e-05, "loss": 1.0977, "step": 22860 }, { "epoch": 1.7849059548895654, "grad_norm": 28.8367977142334, "learning_rate": 4.218836732523579e-05, "loss": 0.7182, "step": 22870 }, { "epoch": 1.7856864122375713, "grad_norm": 41.56075668334961, "learning_rate": 4.214352281734085e-05, "loss": 1.1103, "step": 22880 }, { "epoch": 1.7864668695855772, "grad_norm": 46.85104751586914, "learning_rate": 4.209868478834025e-05, "loss": 1.8579, "step": 22890 }, { "epoch": 1.7872473269335831, "grad_norm": 48.383506774902344, "learning_rate": 4.2053853275210016e-05, "loss": 0.811, "step": 22900 }, { "epoch": 1.788027784281589, "grad_norm": 0.27532026171684265, "learning_rate": 4.200902831492067e-05, "loss": 0.1309, "step": 22910 }, { "epoch": 1.788808241629595, "grad_norm": 0.017690371721982956, "learning_rate": 4.196420994443749e-05, "loss": 0.4197, "step": 22920 }, { "epoch": 1.7895886989776009, "grad_norm": 1.1601506471633911, "learning_rate": 4.191939820072016e-05, "loss": 1.0285, "step": 22930 }, { "epoch": 1.7903691563256068, "grad_norm": 0.032384805381298065, "learning_rate": 4.1874593120723033e-05, "loss": 0.5403, "step": 22940 }, { "epoch": 1.7911496136736127, "grad_norm": 12.99577522277832, "learning_rate": 4.1829794741394855e-05, "loss": 0.6629, "step": 22950 }, { "epoch": 1.7919300710216186, "grad_norm": 3.649952873274742e-08, "learning_rate": 4.178500309967891e-05, "loss": 0.9047, "step": 22960 }, { "epoch": 1.7927105283696245, "grad_norm": 0.054159872233867645, "learning_rate": 4.174021823251294e-05, "loss": 0.772, "step": 22970 }, { "epoch": 1.7934909857176304, "grad_norm": 1.9959516066592187e-06, "learning_rate": 4.169544017682903e-05, "loss": 1.3254, "step": 22980 }, { "epoch": 1.7942714430656364, "grad_norm": 30.54090690612793, "learning_rate": 4.1650668969553725e-05, "loss": 2.8308, "step": 22990 }, { "epoch": 1.7950519004136423, "grad_norm": 5.61015510559082, "learning_rate": 4.160590464760787e-05, "loss": 0.2375, "step": 23000 }, { "epoch": 1.7958323577616482, "grad_norm": 26.482603073120117, "learning_rate": 4.156114724790668e-05, "loss": 0.2325, "step": 23010 }, { "epoch": 1.796612815109654, "grad_norm": 0.11949822306632996, "learning_rate": 4.151639680735959e-05, "loss": 0.6167, "step": 23020 }, { "epoch": 1.7973932724576602, "grad_norm": 29.800121307373047, "learning_rate": 4.147165336287036e-05, "loss": 0.2472, "step": 23030 }, { "epoch": 1.7981737298056661, "grad_norm": 1.9849360342050204e-06, "learning_rate": 4.142691695133698e-05, "loss": 1.2709, "step": 23040 }, { "epoch": 1.798954187153672, "grad_norm": 41.16581344604492, "learning_rate": 4.138218760965157e-05, "loss": 0.481, "step": 23050 }, { "epoch": 1.799734644501678, "grad_norm": 1.2925261749785477e-08, "learning_rate": 4.1337465374700514e-05, "loss": 0.5889, "step": 23060 }, { "epoch": 1.8005151018496839, "grad_norm": 0.0002364539832342416, "learning_rate": 4.129275028336425e-05, "loss": 0.6695, "step": 23070 }, { "epoch": 1.8012955591976898, "grad_norm": 36.79536437988281, "learning_rate": 4.1248042372517416e-05, "loss": 1.4779, "step": 23080 }, { "epoch": 1.802076016545696, "grad_norm": 0.17343249917030334, "learning_rate": 4.120334167902863e-05, "loss": 0.6679, "step": 23090 }, { "epoch": 1.8028564738937018, "grad_norm": 20.67839813232422, "learning_rate": 4.1158648239760625e-05, "loss": 1.2699, "step": 23100 }, { "epoch": 1.8036369312417078, "grad_norm": 1.2648146707761043e-07, "learning_rate": 4.111396209157013e-05, "loss": 0.3416, "step": 23110 }, { "epoch": 1.8044173885897137, "grad_norm": 2.1918528148034966e-07, "learning_rate": 4.106928327130789e-05, "loss": 1.1927, "step": 23120 }, { "epoch": 1.8051978459377196, "grad_norm": 9.444477081298828, "learning_rate": 4.102461181581854e-05, "loss": 1.6943, "step": 23130 }, { "epoch": 1.8059783032857255, "grad_norm": 38.784950256347656, "learning_rate": 4.0979947761940694e-05, "loss": 1.1751, "step": 23140 }, { "epoch": 1.8067587606337314, "grad_norm": 2.0578191595177486e-08, "learning_rate": 4.093529114650688e-05, "loss": 0.0777, "step": 23150 }, { "epoch": 1.8075392179817373, "grad_norm": 0.9909342527389526, "learning_rate": 4.089064200634343e-05, "loss": 0.3616, "step": 23160 }, { "epoch": 1.8083196753297432, "grad_norm": 16.76162338256836, "learning_rate": 4.084600037827055e-05, "loss": 0.6793, "step": 23170 }, { "epoch": 1.8091001326777492, "grad_norm": 1.2102850632800255e-05, "learning_rate": 4.080136629910224e-05, "loss": 0.3691, "step": 23180 }, { "epoch": 1.809880590025755, "grad_norm": 6.797701835632324, "learning_rate": 4.075673980564632e-05, "loss": 0.2886, "step": 23190 }, { "epoch": 1.810661047373761, "grad_norm": 2.2534488053338464e-08, "learning_rate": 4.071212093470426e-05, "loss": 0.1335, "step": 23200 }, { "epoch": 1.811441504721767, "grad_norm": 0.004243421368300915, "learning_rate": 4.066750972307134e-05, "loss": 0.2907, "step": 23210 }, { "epoch": 1.8122219620697728, "grad_norm": 0.2348150759935379, "learning_rate": 4.0622906207536445e-05, "loss": 3.2972, "step": 23220 }, { "epoch": 1.8130024194177787, "grad_norm": 1.3275203855300788e-05, "learning_rate": 4.0578310424882173e-05, "loss": 0.5777, "step": 23230 }, { "epoch": 1.8137828767657846, "grad_norm": 33.510658264160156, "learning_rate": 4.053372241188475e-05, "loss": 0.8417, "step": 23240 }, { "epoch": 1.8145633341137906, "grad_norm": 37.20769500732422, "learning_rate": 4.04891422053139e-05, "loss": 1.7718, "step": 23250 }, { "epoch": 1.8153437914617965, "grad_norm": 1.933334715431556e-05, "learning_rate": 4.0444569841933034e-05, "loss": 0.7678, "step": 23260 }, { "epoch": 1.8161242488098024, "grad_norm": 37.909122467041016, "learning_rate": 4.0400005358499e-05, "loss": 0.6111, "step": 23270 }, { "epoch": 1.8169047061578085, "grad_norm": 8.029224395751953, "learning_rate": 4.035544879176223e-05, "loss": 0.9214, "step": 23280 }, { "epoch": 1.8176851635058144, "grad_norm": 6.63645076751709, "learning_rate": 4.031090017846653e-05, "loss": 1.1639, "step": 23290 }, { "epoch": 1.8184656208538204, "grad_norm": 3.859848737716675, "learning_rate": 4.026635955534924e-05, "loss": 0.2613, "step": 23300 }, { "epoch": 1.8192460782018263, "grad_norm": 8.740327835083008, "learning_rate": 4.022182695914105e-05, "loss": 0.4547, "step": 23310 }, { "epoch": 1.8200265355498322, "grad_norm": 0.1261139214038849, "learning_rate": 4.0177302426566075e-05, "loss": 1.1221, "step": 23320 }, { "epoch": 1.820806992897838, "grad_norm": 0.04213743656873703, "learning_rate": 4.013278599434173e-05, "loss": 0.2915, "step": 23330 }, { "epoch": 1.8215874502458442, "grad_norm": 0.00043451451347209513, "learning_rate": 4.0088277699178777e-05, "loss": 0.5183, "step": 23340 }, { "epoch": 1.8223679075938501, "grad_norm": 0.0166937205940485, "learning_rate": 4.004377757778131e-05, "loss": 0.0948, "step": 23350 }, { "epoch": 1.823148364941856, "grad_norm": 3.59621000289917, "learning_rate": 3.999928566684657e-05, "loss": 0.9385, "step": 23360 }, { "epoch": 1.823928822289862, "grad_norm": 3.543958015939097e-08, "learning_rate": 3.995480200306515e-05, "loss": 0.9834, "step": 23370 }, { "epoch": 1.8247092796378679, "grad_norm": 7.353454113006592, "learning_rate": 3.9910326623120764e-05, "loss": 1.4397, "step": 23380 }, { "epoch": 1.8254897369858738, "grad_norm": 55.0924186706543, "learning_rate": 3.986585956369036e-05, "loss": 1.323, "step": 23390 }, { "epoch": 1.8262701943338797, "grad_norm": 0.015754669904708862, "learning_rate": 3.9821400861443916e-05, "loss": 1.3286, "step": 23400 }, { "epoch": 1.8270506516818856, "grad_norm": 0.4562488794326782, "learning_rate": 3.977695055304464e-05, "loss": 0.7128, "step": 23410 }, { "epoch": 1.8278311090298915, "grad_norm": 43.183048248291016, "learning_rate": 3.973250867514874e-05, "loss": 0.9965, "step": 23420 }, { "epoch": 1.8286115663778975, "grad_norm": 4.6577596890529094e-07, "learning_rate": 3.9688075264405524e-05, "loss": 0.2827, "step": 23430 }, { "epoch": 1.8293920237259034, "grad_norm": 0.00020056143694091588, "learning_rate": 3.964365035745726e-05, "loss": 0.4973, "step": 23440 }, { "epoch": 1.8301724810739093, "grad_norm": 31.505395889282227, "learning_rate": 3.959923399093923e-05, "loss": 1.1041, "step": 23450 }, { "epoch": 1.8309529384219152, "grad_norm": 0.45826637744903564, "learning_rate": 3.9554826201479716e-05, "loss": 0.0045, "step": 23460 }, { "epoch": 1.8317333957699211, "grad_norm": 46.24296188354492, "learning_rate": 3.9510427025699834e-05, "loss": 1.0564, "step": 23470 }, { "epoch": 1.832513853117927, "grad_norm": 40.72001647949219, "learning_rate": 3.94660365002137e-05, "loss": 1.8935, "step": 23480 }, { "epoch": 1.833294310465933, "grad_norm": 0.08623958379030228, "learning_rate": 3.9421654661628185e-05, "loss": 0.5077, "step": 23490 }, { "epoch": 1.8340747678139389, "grad_norm": 0.00014190695947036147, "learning_rate": 3.937728154654312e-05, "loss": 0.6978, "step": 23500 }, { "epoch": 1.8348552251619448, "grad_norm": 20.867637634277344, "learning_rate": 3.9332917191551037e-05, "loss": 0.49, "step": 23510 }, { "epoch": 1.8356356825099507, "grad_norm": 0.35819879174232483, "learning_rate": 3.928856163323733e-05, "loss": 0.1014, "step": 23520 }, { "epoch": 1.8364161398579566, "grad_norm": 70.75408935546875, "learning_rate": 3.924421490818004e-05, "loss": 3.2741, "step": 23530 }, { "epoch": 1.8371965972059627, "grad_norm": 1.0380865234083103e-07, "learning_rate": 3.919987705295001e-05, "loss": 0.4643, "step": 23540 }, { "epoch": 1.8379770545539686, "grad_norm": 0.1350480318069458, "learning_rate": 3.915554810411074e-05, "loss": 1.303, "step": 23550 }, { "epoch": 1.8387575119019746, "grad_norm": 16.59437370300293, "learning_rate": 3.911122809821836e-05, "loss": 2.1433, "step": 23560 }, { "epoch": 1.8395379692499805, "grad_norm": 53.31951141357422, "learning_rate": 3.9066917071821675e-05, "loss": 0.6518, "step": 23570 }, { "epoch": 1.8403184265979864, "grad_norm": 0.13815993070602417, "learning_rate": 3.9022615061462034e-05, "loss": 1.1043, "step": 23580 }, { "epoch": 1.8410988839459923, "grad_norm": 1.3820558786392212, "learning_rate": 3.8978322103673397e-05, "loss": 0.2971, "step": 23590 }, { "epoch": 1.8418793412939984, "grad_norm": 0.158604234457016, "learning_rate": 3.8934038234982214e-05, "loss": 1.0221, "step": 23600 }, { "epoch": 1.8426597986420044, "grad_norm": 0.006823586765676737, "learning_rate": 3.888976349190748e-05, "loss": 1.0428, "step": 23610 }, { "epoch": 1.8434402559900103, "grad_norm": 0.2200150042772293, "learning_rate": 3.884549791096062e-05, "loss": 1.8464, "step": 23620 }, { "epoch": 1.8442207133380162, "grad_norm": 2.2968795747146942e-05, "learning_rate": 3.880124152864558e-05, "loss": 1.5282, "step": 23630 }, { "epoch": 1.845001170686022, "grad_norm": 18.445926666259766, "learning_rate": 3.875699438145862e-05, "loss": 0.2527, "step": 23640 }, { "epoch": 1.845781628034028, "grad_norm": 49.831668853759766, "learning_rate": 3.871275650588844e-05, "loss": 1.8936, "step": 23650 }, { "epoch": 1.846562085382034, "grad_norm": 0.025264471769332886, "learning_rate": 3.8668527938416125e-05, "loss": 0.463, "step": 23660 }, { "epoch": 1.8473425427300398, "grad_norm": 7.791200914653018e-05, "learning_rate": 3.8624308715515e-05, "loss": 0.9327, "step": 23670 }, { "epoch": 1.8481230000780458, "grad_norm": 15.435553550720215, "learning_rate": 3.858009887365077e-05, "loss": 1.1377, "step": 23680 }, { "epoch": 1.8489034574260517, "grad_norm": 36.36300277709961, "learning_rate": 3.8535898449281325e-05, "loss": 1.2851, "step": 23690 }, { "epoch": 1.8496839147740576, "grad_norm": 4.985684394836426, "learning_rate": 3.8491707478856885e-05, "loss": 0.0899, "step": 23700 }, { "epoch": 1.8504643721220635, "grad_norm": 4.401831392897293e-05, "learning_rate": 3.844752599881976e-05, "loss": 1.7606, "step": 23710 }, { "epoch": 1.8512448294700694, "grad_norm": 0.06194838881492615, "learning_rate": 3.840335404560453e-05, "loss": 0.4333, "step": 23720 }, { "epoch": 1.8520252868180753, "grad_norm": 0.012446342036128044, "learning_rate": 3.835919165563787e-05, "loss": 0.1386, "step": 23730 }, { "epoch": 1.8528057441660812, "grad_norm": 78.01200866699219, "learning_rate": 3.8315038865338555e-05, "loss": 1.6024, "step": 23740 }, { "epoch": 1.8535862015140872, "grad_norm": 4.170412540435791, "learning_rate": 3.82708957111175e-05, "loss": 1.1088, "step": 23750 }, { "epoch": 1.854366658862093, "grad_norm": 1.04580731203896e-05, "learning_rate": 3.8226762229377614e-05, "loss": 1.2849, "step": 23760 }, { "epoch": 1.855147116210099, "grad_norm": 57.779754638671875, "learning_rate": 3.818263845651389e-05, "loss": 0.8573, "step": 23770 }, { "epoch": 1.855927573558105, "grad_norm": 25.989601135253906, "learning_rate": 3.813852442891324e-05, "loss": 0.6765, "step": 23780 }, { "epoch": 1.856708030906111, "grad_norm": 0.014243219047784805, "learning_rate": 3.8094420182954624e-05, "loss": 1.549, "step": 23790 }, { "epoch": 1.857488488254117, "grad_norm": 0.006569798570126295, "learning_rate": 3.805032575500885e-05, "loss": 0.2146, "step": 23800 }, { "epoch": 1.8582689456021229, "grad_norm": 0.029686694964766502, "learning_rate": 3.800624118143869e-05, "loss": 0.4949, "step": 23810 }, { "epoch": 1.8590494029501288, "grad_norm": 6.546103000640869, "learning_rate": 3.796216649859878e-05, "loss": 2.2488, "step": 23820 }, { "epoch": 1.8598298602981347, "grad_norm": 0.12819623947143555, "learning_rate": 3.791810174283557e-05, "loss": 0.9862, "step": 23830 }, { "epoch": 1.8606103176461406, "grad_norm": 1.798675775527954, "learning_rate": 3.7874046950487365e-05, "loss": 0.4003, "step": 23840 }, { "epoch": 1.8613907749941467, "grad_norm": 45.85932922363281, "learning_rate": 3.78300021578842e-05, "loss": 0.6263, "step": 23850 }, { "epoch": 1.8621712323421526, "grad_norm": 7.008039474487305, "learning_rate": 3.7785967401347944e-05, "loss": 0.3043, "step": 23860 }, { "epoch": 1.8629516896901586, "grad_norm": 21.64405059814453, "learning_rate": 3.7741942717192094e-05, "loss": 0.4736, "step": 23870 }, { "epoch": 1.8637321470381645, "grad_norm": 2.6776916683957097e-08, "learning_rate": 3.769792814172192e-05, "loss": 0.4083, "step": 23880 }, { "epoch": 1.8645126043861704, "grad_norm": 59.70551681518555, "learning_rate": 3.7653923711234306e-05, "loss": 1.3214, "step": 23890 }, { "epoch": 1.8652930617341763, "grad_norm": 1.2317818800511304e-06, "learning_rate": 3.7609929462017814e-05, "loss": 1.1039, "step": 23900 }, { "epoch": 1.8660735190821822, "grad_norm": 0.8701543211936951, "learning_rate": 3.7565945430352547e-05, "loss": 1.3019, "step": 23910 }, { "epoch": 1.8668539764301881, "grad_norm": 0.3090846538543701, "learning_rate": 3.752197165251025e-05, "loss": 0.399, "step": 23920 }, { "epoch": 1.867634433778194, "grad_norm": 27.304723739624023, "learning_rate": 3.747800816475418e-05, "loss": 0.537, "step": 23930 }, { "epoch": 1.8684148911262, "grad_norm": 0.8099300265312195, "learning_rate": 3.743405500333908e-05, "loss": 0.825, "step": 23940 }, { "epoch": 1.8691953484742059, "grad_norm": 43.0421142578125, "learning_rate": 3.739011220451124e-05, "loss": 1.1039, "step": 23950 }, { "epoch": 1.8699758058222118, "grad_norm": 46.806846618652344, "learning_rate": 3.7346179804508343e-05, "loss": 0.7547, "step": 23960 }, { "epoch": 1.8707562631702177, "grad_norm": 0.00016689456242602319, "learning_rate": 3.730225783955956e-05, "loss": 4.0015, "step": 23970 }, { "epoch": 1.8715367205182236, "grad_norm": 0.00046824358287267387, "learning_rate": 3.7258346345885383e-05, "loss": 0.9711, "step": 23980 }, { "epoch": 1.8723171778662295, "grad_norm": 13.834807395935059, "learning_rate": 3.7214445359697735e-05, "loss": 0.7928, "step": 23990 }, { "epoch": 1.8730976352142354, "grad_norm": 0.018621161580085754, "learning_rate": 3.717055491719982e-05, "loss": 1.8895, "step": 24000 }, { "epoch": 1.8738780925622414, "grad_norm": 0.5802863836288452, "learning_rate": 3.712667505458622e-05, "loss": 0.2756, "step": 24010 }, { "epoch": 1.8746585499102473, "grad_norm": 46.72166061401367, "learning_rate": 3.7082805808042696e-05, "loss": 1.032, "step": 24020 }, { "epoch": 1.8754390072582532, "grad_norm": 48.039520263671875, "learning_rate": 3.703894721374632e-05, "loss": 1.0127, "step": 24030 }, { "epoch": 1.876219464606259, "grad_norm": 0.05087466910481453, "learning_rate": 3.699509930786539e-05, "loss": 1.2346, "step": 24040 }, { "epoch": 1.8769999219542652, "grad_norm": 31.790119171142578, "learning_rate": 3.6951262126559315e-05, "loss": 0.2938, "step": 24050 }, { "epoch": 1.8777803793022712, "grad_norm": 5.346080303192139, "learning_rate": 3.690743570597874e-05, "loss": 1.0873, "step": 24060 }, { "epoch": 1.878560836650277, "grad_norm": 7.171860829657817e-07, "learning_rate": 3.686362008226539e-05, "loss": 0.0249, "step": 24070 }, { "epoch": 1.879341293998283, "grad_norm": 0.0009441017173230648, "learning_rate": 3.681981529155213e-05, "loss": 1.2795, "step": 24080 }, { "epoch": 1.880121751346289, "grad_norm": 7.874264156271238e-07, "learning_rate": 3.677602136996282e-05, "loss": 0.7461, "step": 24090 }, { "epoch": 1.8809022086942948, "grad_norm": 1.7541738748550415, "learning_rate": 3.673223835361244e-05, "loss": 1.8159, "step": 24100 }, { "epoch": 1.881682666042301, "grad_norm": 0.14342258870601654, "learning_rate": 3.668846627860689e-05, "loss": 0.9154, "step": 24110 }, { "epoch": 1.8824631233903069, "grad_norm": 2.541520416343701e-07, "learning_rate": 3.664470518104314e-05, "loss": 2.0994, "step": 24120 }, { "epoch": 1.8832435807383128, "grad_norm": 1.3055483577772975e-05, "learning_rate": 3.6600955097009024e-05, "loss": 1.2072, "step": 24130 }, { "epoch": 1.8840240380863187, "grad_norm": 23.39139175415039, "learning_rate": 3.655721606258334e-05, "loss": 0.6577, "step": 24140 }, { "epoch": 1.8848044954343246, "grad_norm": 6.286259651184082, "learning_rate": 3.651348811383577e-05, "loss": 0.2997, "step": 24150 }, { "epoch": 1.8855849527823305, "grad_norm": 3.474562644958496, "learning_rate": 3.646977128682684e-05, "loss": 0.7834, "step": 24160 }, { "epoch": 1.8863654101303364, "grad_norm": 16.204011917114258, "learning_rate": 3.642606561760793e-05, "loss": 1.7633, "step": 24170 }, { "epoch": 1.8871458674783423, "grad_norm": 44.72784423828125, "learning_rate": 3.6382371142221175e-05, "loss": 0.5443, "step": 24180 }, { "epoch": 1.8879263248263483, "grad_norm": 1.67126614769586e-07, "learning_rate": 3.633868789669952e-05, "loss": 1.0933, "step": 24190 }, { "epoch": 1.8887067821743542, "grad_norm": 9.963812885871448e-09, "learning_rate": 3.629501591706662e-05, "loss": 0.6429, "step": 24200 }, { "epoch": 1.88948723952236, "grad_norm": 2.0460259914398193, "learning_rate": 3.625135523933689e-05, "loss": 0.1334, "step": 24210 }, { "epoch": 1.890267696870366, "grad_norm": 0.08299527317285538, "learning_rate": 3.6207705899515355e-05, "loss": 0.5164, "step": 24220 }, { "epoch": 1.891048154218372, "grad_norm": 49.47283935546875, "learning_rate": 3.6164067933597713e-05, "loss": 0.6176, "step": 24230 }, { "epoch": 1.8918286115663778, "grad_norm": 9.358882904052734, "learning_rate": 3.6120441377570336e-05, "loss": 0.5464, "step": 24240 }, { "epoch": 1.8926090689143837, "grad_norm": 2.0653233528137207, "learning_rate": 3.6076826267410095e-05, "loss": 1.2293, "step": 24250 }, { "epoch": 1.8933895262623897, "grad_norm": 0.00041112504550255835, "learning_rate": 3.603322263908451e-05, "loss": 0.4034, "step": 24260 }, { "epoch": 1.8941699836103956, "grad_norm": 70.98471069335938, "learning_rate": 3.598963052855157e-05, "loss": 2.0693, "step": 24270 }, { "epoch": 1.8949504409584015, "grad_norm": 1.5118367671966553, "learning_rate": 3.594604997175981e-05, "loss": 1.5463, "step": 24280 }, { "epoch": 1.8957308983064074, "grad_norm": 0.04422280192375183, "learning_rate": 3.590248100464818e-05, "loss": 0.6958, "step": 24290 }, { "epoch": 1.8965113556544135, "grad_norm": 6.168531399453059e-06, "learning_rate": 3.5858923663146146e-05, "loss": 0.7424, "step": 24300 }, { "epoch": 1.8972918130024194, "grad_norm": 0.00010713309893617406, "learning_rate": 3.5815377983173544e-05, "loss": 1.1302, "step": 24310 }, { "epoch": 1.8980722703504254, "grad_norm": 46.48796463012695, "learning_rate": 3.577184400064057e-05, "loss": 0.9124, "step": 24320 }, { "epoch": 1.8988527276984313, "grad_norm": 15.14918041229248, "learning_rate": 3.572832175144783e-05, "loss": 1.08, "step": 24330 }, { "epoch": 1.8996331850464372, "grad_norm": 4.831262049265206e-05, "learning_rate": 3.568481127148621e-05, "loss": 0.0379, "step": 24340 }, { "epoch": 1.900413642394443, "grad_norm": 71.39578247070312, "learning_rate": 3.5641312596636937e-05, "loss": 1.7415, "step": 24350 }, { "epoch": 1.9011940997424492, "grad_norm": 1.3533830642700195, "learning_rate": 3.559782576277142e-05, "loss": 0.5327, "step": 24360 }, { "epoch": 1.9019745570904552, "grad_norm": 0.0841241180896759, "learning_rate": 3.555435080575141e-05, "loss": 1.65, "step": 24370 }, { "epoch": 1.902755014438461, "grad_norm": 8.758081436157227, "learning_rate": 3.5510887761428765e-05, "loss": 2.5704, "step": 24380 }, { "epoch": 1.903535471786467, "grad_norm": 10.82071304321289, "learning_rate": 3.54674366656456e-05, "loss": 1.8161, "step": 24390 }, { "epoch": 1.904315929134473, "grad_norm": 16.516754150390625, "learning_rate": 3.542399755423412e-05, "loss": 1.1733, "step": 24400 }, { "epoch": 1.9050963864824788, "grad_norm": 0.3717861473560333, "learning_rate": 3.5380570463016686e-05, "loss": 0.4364, "step": 24410 }, { "epoch": 1.9058768438304847, "grad_norm": 2.7536627385416068e-05, "learning_rate": 3.5337155427805694e-05, "loss": 0.9228, "step": 24420 }, { "epoch": 1.9066573011784906, "grad_norm": 21.14723777770996, "learning_rate": 3.529375248440365e-05, "loss": 0.1477, "step": 24430 }, { "epoch": 1.9074377585264966, "grad_norm": 1.0672245025634766, "learning_rate": 3.525036166860309e-05, "loss": 1.4234, "step": 24440 }, { "epoch": 1.9082182158745025, "grad_norm": 31.768104553222656, "learning_rate": 3.520698301618649e-05, "loss": 0.4301, "step": 24450 }, { "epoch": 1.9089986732225084, "grad_norm": 0.653933048248291, "learning_rate": 3.516361656292636e-05, "loss": 0.687, "step": 24460 }, { "epoch": 1.9097791305705143, "grad_norm": 34.76579284667969, "learning_rate": 3.512026234458511e-05, "loss": 1.8729, "step": 24470 }, { "epoch": 1.9105595879185202, "grad_norm": 0.0032537386287003756, "learning_rate": 3.50769203969151e-05, "loss": 1.5925, "step": 24480 }, { "epoch": 1.9113400452665261, "grad_norm": 2.1661665439605713, "learning_rate": 3.503359075565852e-05, "loss": 0.2118, "step": 24490 }, { "epoch": 1.912120502614532, "grad_norm": 0.00010317000123905018, "learning_rate": 3.499027345654745e-05, "loss": 0.8827, "step": 24500 }, { "epoch": 1.912900959962538, "grad_norm": 0.04974118992686272, "learning_rate": 3.4946968535303784e-05, "loss": 1.744, "step": 24510 }, { "epoch": 1.9136814173105439, "grad_norm": 0.6756658554077148, "learning_rate": 3.490367602763916e-05, "loss": 0.7952, "step": 24520 }, { "epoch": 1.9144618746585498, "grad_norm": 30.926450729370117, "learning_rate": 3.486039596925509e-05, "loss": 1.3325, "step": 24530 }, { "epoch": 1.9152423320065557, "grad_norm": 9.425595635548234e-05, "learning_rate": 3.481712839584269e-05, "loss": 1.4034, "step": 24540 }, { "epoch": 1.9160227893545618, "grad_norm": 26.642850875854492, "learning_rate": 3.4773873343082905e-05, "loss": 1.6034, "step": 24550 }, { "epoch": 1.9168032467025677, "grad_norm": 0.25802430510520935, "learning_rate": 3.473063084664623e-05, "loss": 0.5995, "step": 24560 }, { "epoch": 1.9175837040505737, "grad_norm": 0.4528713822364807, "learning_rate": 3.468740094219291e-05, "loss": 0.1909, "step": 24570 }, { "epoch": 1.9183641613985796, "grad_norm": 15.111522674560547, "learning_rate": 3.464418366537273e-05, "loss": 1.5801, "step": 24580 }, { "epoch": 1.9191446187465855, "grad_norm": 1.4172719717025757, "learning_rate": 3.4600979051825134e-05, "loss": 0.3448, "step": 24590 }, { "epoch": 1.9199250760945914, "grad_norm": 36.149879455566406, "learning_rate": 3.455778713717905e-05, "loss": 0.9534, "step": 24600 }, { "epoch": 1.9207055334425973, "grad_norm": 0.8631346821784973, "learning_rate": 3.451460795705299e-05, "loss": 0.472, "step": 24610 }, { "epoch": 1.9214859907906034, "grad_norm": 37.74773406982422, "learning_rate": 3.447144154705494e-05, "loss": 0.6436, "step": 24620 }, { "epoch": 1.9222664481386094, "grad_norm": 1.151184278569417e-06, "learning_rate": 3.442828794278233e-05, "loss": 0.7656, "step": 24630 }, { "epoch": 1.9230469054866153, "grad_norm": 0.5503702163696289, "learning_rate": 3.43851471798221e-05, "loss": 1.6578, "step": 24640 }, { "epoch": 1.9238273628346212, "grad_norm": 0.8215304017066956, "learning_rate": 3.434201929375051e-05, "loss": 1.3437, "step": 24650 }, { "epoch": 1.924607820182627, "grad_norm": 2.416992629150627e-06, "learning_rate": 3.42989043201333e-05, "loss": 0.6802, "step": 24660 }, { "epoch": 1.925388277530633, "grad_norm": 26.039220809936523, "learning_rate": 3.425580229452546e-05, "loss": 0.4974, "step": 24670 }, { "epoch": 1.926168734878639, "grad_norm": 0.001985641196370125, "learning_rate": 3.42127132524714e-05, "loss": 1.6364, "step": 24680 }, { "epoch": 1.9269491922266448, "grad_norm": 2.4921052954596234e-07, "learning_rate": 3.416963722950472e-05, "loss": 0.524, "step": 24690 }, { "epoch": 1.9277296495746508, "grad_norm": 0.6401686072349548, "learning_rate": 3.412657426114839e-05, "loss": 1.4678, "step": 24700 }, { "epoch": 1.9285101069226567, "grad_norm": 3.618363618850708, "learning_rate": 3.408352438291456e-05, "loss": 0.7004, "step": 24710 }, { "epoch": 1.9292905642706626, "grad_norm": 0.29783499240875244, "learning_rate": 3.4040487630304536e-05, "loss": 0.757, "step": 24720 }, { "epoch": 1.9300710216186685, "grad_norm": 0.0320902019739151, "learning_rate": 3.3997464038808904e-05, "loss": 0.8225, "step": 24730 }, { "epoch": 1.9308514789666744, "grad_norm": 48.40365219116211, "learning_rate": 3.395445364390732e-05, "loss": 1.3788, "step": 24740 }, { "epoch": 1.9316319363146803, "grad_norm": 0.07659325748682022, "learning_rate": 3.391145648106861e-05, "loss": 1.8757, "step": 24750 }, { "epoch": 1.9324123936626862, "grad_norm": 38.49648666381836, "learning_rate": 3.3868472585750625e-05, "loss": 0.7217, "step": 24760 }, { "epoch": 1.9331928510106922, "grad_norm": 2.974672270283918e-06, "learning_rate": 3.3825501993400335e-05, "loss": 0.3299, "step": 24770 }, { "epoch": 1.933973308358698, "grad_norm": 0.9465207457542419, "learning_rate": 3.378254473945369e-05, "loss": 0.9885, "step": 24780 }, { "epoch": 1.934753765706704, "grad_norm": 0.0180289875715971, "learning_rate": 3.373960085933571e-05, "loss": 0.486, "step": 24790 }, { "epoch": 1.93553422305471, "grad_norm": 0.6527422070503235, "learning_rate": 3.3696670388460304e-05, "loss": 0.4105, "step": 24800 }, { "epoch": 1.936314680402716, "grad_norm": 3.048807382583618, "learning_rate": 3.3653753362230365e-05, "loss": 0.2045, "step": 24810 }, { "epoch": 1.937095137750722, "grad_norm": 0.7973394393920898, "learning_rate": 3.3610849816037715e-05, "loss": 0.7687, "step": 24820 }, { "epoch": 1.9378755950987279, "grad_norm": 30.129440307617188, "learning_rate": 3.3567959785263004e-05, "loss": 0.5779, "step": 24830 }, { "epoch": 1.9386560524467338, "grad_norm": 7.424580417136895e-08, "learning_rate": 3.3525083305275806e-05, "loss": 1.9567, "step": 24840 }, { "epoch": 1.9394365097947397, "grad_norm": 50.41664123535156, "learning_rate": 3.3482220411434454e-05, "loss": 1.6986, "step": 24850 }, { "epoch": 1.9402169671427456, "grad_norm": 4.4609785079956055, "learning_rate": 3.343937113908615e-05, "loss": 1.3763, "step": 24860 }, { "epoch": 1.9409974244907517, "grad_norm": 0.0026027297135442495, "learning_rate": 3.339653552356677e-05, "loss": 1.1527, "step": 24870 }, { "epoch": 1.9417778818387577, "grad_norm": 9.384146324009635e-06, "learning_rate": 3.335371360020102e-05, "loss": 1.4681, "step": 24880 }, { "epoch": 1.9425583391867636, "grad_norm": 0.36783191561698914, "learning_rate": 3.3310905404302256e-05, "loss": 1.0031, "step": 24890 }, { "epoch": 1.9433387965347695, "grad_norm": 1.8857052326202393, "learning_rate": 3.326811097117255e-05, "loss": 0.7547, "step": 24900 }, { "epoch": 1.9441192538827754, "grad_norm": 0.0002518291294109076, "learning_rate": 3.322533033610259e-05, "loss": 0.503, "step": 24910 }, { "epoch": 1.9448997112307813, "grad_norm": 0.11181085556745529, "learning_rate": 3.318256353437169e-05, "loss": 0.5882, "step": 24920 }, { "epoch": 1.9456801685787872, "grad_norm": 0.29577258229255676, "learning_rate": 3.31398106012478e-05, "loss": 0.5357, "step": 24930 }, { "epoch": 1.9464606259267931, "grad_norm": 49.33049774169922, "learning_rate": 3.309707157198737e-05, "loss": 1.7088, "step": 24940 }, { "epoch": 1.947241083274799, "grad_norm": 3.9758615493774414, "learning_rate": 3.305434648183544e-05, "loss": 0.3997, "step": 24950 }, { "epoch": 1.948021540622805, "grad_norm": 0.04083220288157463, "learning_rate": 3.3011635366025484e-05, "loss": 0.0144, "step": 24960 }, { "epoch": 1.9488019979708109, "grad_norm": 7.539558410644531, "learning_rate": 3.296893825977957e-05, "loss": 0.7937, "step": 24970 }, { "epoch": 1.9495824553188168, "grad_norm": 7.05432341874257e-07, "learning_rate": 3.292625519830808e-05, "loss": 1.4518, "step": 24980 }, { "epoch": 1.9503629126668227, "grad_norm": 0.09120603650808334, "learning_rate": 3.288358621680992e-05, "loss": 0.6368, "step": 24990 }, { "epoch": 1.9511433700148286, "grad_norm": 0.2118591070175171, "learning_rate": 3.284093135047231e-05, "loss": 0.1397, "step": 25000 }, { "epoch": 1.9519238273628345, "grad_norm": 0.033967263996601105, "learning_rate": 3.279829063447084e-05, "loss": 1.0633, "step": 25010 }, { "epoch": 1.9527042847108405, "grad_norm": 1.3979352712631226, "learning_rate": 3.275566410396951e-05, "loss": 0.2829, "step": 25020 }, { "epoch": 1.9534847420588464, "grad_norm": 36.74382019042969, "learning_rate": 3.271305179412051e-05, "loss": 0.7056, "step": 25030 }, { "epoch": 1.9542651994068523, "grad_norm": 78.06299591064453, "learning_rate": 3.267045374006438e-05, "loss": 1.8314, "step": 25040 }, { "epoch": 1.9550456567548582, "grad_norm": 8.80843734741211, "learning_rate": 3.2627869976929856e-05, "loss": 1.2586, "step": 25050 }, { "epoch": 1.9558261141028643, "grad_norm": 33.692222595214844, "learning_rate": 3.258530053983396e-05, "loss": 1.4325, "step": 25060 }, { "epoch": 1.9566065714508702, "grad_norm": 51.98230743408203, "learning_rate": 3.2542745463881794e-05, "loss": 1.252, "step": 25070 }, { "epoch": 1.9573870287988762, "grad_norm": 1.1896955966949463, "learning_rate": 3.250020478416671e-05, "loss": 3.1767, "step": 25080 }, { "epoch": 1.958167486146882, "grad_norm": 0.004445586819201708, "learning_rate": 3.245767853577013e-05, "loss": 0.2778, "step": 25090 }, { "epoch": 1.958947943494888, "grad_norm": 0.015711573883891106, "learning_rate": 3.241516675376164e-05, "loss": 0.5651, "step": 25100 }, { "epoch": 1.959728400842894, "grad_norm": 42.32749557495117, "learning_rate": 3.2372669473198816e-05, "loss": 0.5504, "step": 25110 }, { "epoch": 1.9605088581908998, "grad_norm": 0.5061352849006653, "learning_rate": 3.233018672912731e-05, "loss": 1.652, "step": 25120 }, { "epoch": 1.961289315538906, "grad_norm": 1.3850916624069214, "learning_rate": 3.228771855658082e-05, "loss": 0.4327, "step": 25130 }, { "epoch": 1.9620697728869119, "grad_norm": 0.775698721408844, "learning_rate": 3.224526499058096e-05, "loss": 0.7714, "step": 25140 }, { "epoch": 1.9628502302349178, "grad_norm": 46.00149154663086, "learning_rate": 3.220282606613737e-05, "loss": 0.8217, "step": 25150 }, { "epoch": 1.9636306875829237, "grad_norm": 15.187111854553223, "learning_rate": 3.2160401818247556e-05, "loss": 1.13, "step": 25160 }, { "epoch": 1.9644111449309296, "grad_norm": 27.419458389282227, "learning_rate": 3.211799228189697e-05, "loss": 1.1076, "step": 25170 }, { "epoch": 1.9651916022789355, "grad_norm": 7.90415215305984e-05, "learning_rate": 3.207559749205888e-05, "loss": 0.0562, "step": 25180 }, { "epoch": 1.9659720596269414, "grad_norm": 41.95665740966797, "learning_rate": 3.2033217483694455e-05, "loss": 0.5071, "step": 25190 }, { "epoch": 1.9667525169749474, "grad_norm": 0.020275263115763664, "learning_rate": 3.199085229175263e-05, "loss": 0.9272, "step": 25200 }, { "epoch": 1.9675329743229533, "grad_norm": 0.17847752571105957, "learning_rate": 3.194850195117011e-05, "loss": 0.6496, "step": 25210 }, { "epoch": 1.9683134316709592, "grad_norm": 0.0055974675342440605, "learning_rate": 3.19061664968714e-05, "loss": 0.8146, "step": 25220 }, { "epoch": 1.969093889018965, "grad_norm": 15.684666633605957, "learning_rate": 3.1863845963768685e-05, "loss": 0.8318, "step": 25230 }, { "epoch": 1.969874346366971, "grad_norm": 60.0120964050293, "learning_rate": 3.1821540386761894e-05, "loss": 1.1431, "step": 25240 }, { "epoch": 1.970654803714977, "grad_norm": 22.573179244995117, "learning_rate": 3.177924980073856e-05, "loss": 1.4687, "step": 25250 }, { "epoch": 1.9714352610629828, "grad_norm": 12.676541328430176, "learning_rate": 3.173697424057391e-05, "loss": 0.347, "step": 25260 }, { "epoch": 1.9722157184109887, "grad_norm": 3.967153787612915, "learning_rate": 3.1694713741130744e-05, "loss": 0.12, "step": 25270 }, { "epoch": 1.9729961757589947, "grad_norm": 0.0006802030839025974, "learning_rate": 3.165246833725946e-05, "loss": 0.3, "step": 25280 }, { "epoch": 1.9737766331070006, "grad_norm": 45.15895080566406, "learning_rate": 3.1610238063798e-05, "loss": 0.2775, "step": 25290 }, { "epoch": 1.9745570904550065, "grad_norm": 50.831687927246094, "learning_rate": 3.1568022955571824e-05, "loss": 0.9303, "step": 25300 }, { "epoch": 1.9753375478030124, "grad_norm": 0.08076174557209015, "learning_rate": 3.15258230473939e-05, "loss": 1.0626, "step": 25310 }, { "epoch": 1.9761180051510185, "grad_norm": 1.8216647079682957e-09, "learning_rate": 3.148363837406464e-05, "loss": 0.2984, "step": 25320 }, { "epoch": 1.9768984624990245, "grad_norm": 30.55402374267578, "learning_rate": 3.144146897037194e-05, "loss": 1.9792, "step": 25330 }, { "epoch": 1.9776789198470304, "grad_norm": 52.432830810546875, "learning_rate": 3.139931487109102e-05, "loss": 1.3619, "step": 25340 }, { "epoch": 1.9784593771950363, "grad_norm": 0.5994627475738525, "learning_rate": 3.135717611098458e-05, "loss": 0.7943, "step": 25350 }, { "epoch": 1.9792398345430422, "grad_norm": 0.807468593120575, "learning_rate": 3.1315052724802566e-05, "loss": 1.75, "step": 25360 }, { "epoch": 1.9800202918910481, "grad_norm": 0.4832046627998352, "learning_rate": 3.127294474728236e-05, "loss": 1.0956, "step": 25370 }, { "epoch": 1.9808007492390542, "grad_norm": 7.688863754272461, "learning_rate": 3.123085221314852e-05, "loss": 0.3429, "step": 25380 }, { "epoch": 1.9815812065870602, "grad_norm": 2.0159497580607422e-05, "learning_rate": 3.118877515711295e-05, "loss": 0.4776, "step": 25390 }, { "epoch": 1.982361663935066, "grad_norm": 0.0029121001716703176, "learning_rate": 3.1146713613874775e-05, "loss": 0.0429, "step": 25400 }, { "epoch": 1.983142121283072, "grad_norm": 70.2876205444336, "learning_rate": 3.110466761812029e-05, "loss": 0.2696, "step": 25410 }, { "epoch": 1.983922578631078, "grad_norm": 5.711496275928596e-10, "learning_rate": 3.1062637204523014e-05, "loss": 0.8996, "step": 25420 }, { "epoch": 1.9847030359790838, "grad_norm": 0.5862593054771423, "learning_rate": 3.102062240774359e-05, "loss": 1.1331, "step": 25430 }, { "epoch": 1.9854834933270897, "grad_norm": 85.54405212402344, "learning_rate": 3.0978623262429806e-05, "loss": 1.5893, "step": 25440 }, { "epoch": 1.9862639506750956, "grad_norm": 6.912586286489386e-06, "learning_rate": 3.093663980321649e-05, "loss": 1.6659, "step": 25450 }, { "epoch": 1.9870444080231016, "grad_norm": 1.353969931602478, "learning_rate": 3.0894672064725614e-05, "loss": 0.3165, "step": 25460 }, { "epoch": 1.9878248653711075, "grad_norm": 1.1379843734005135e-09, "learning_rate": 3.085272008156611e-05, "loss": 0.721, "step": 25470 }, { "epoch": 1.9886053227191134, "grad_norm": 0.2630331218242645, "learning_rate": 3.0810783888333996e-05, "loss": 0.518, "step": 25480 }, { "epoch": 1.9893857800671193, "grad_norm": 0.31837162375450134, "learning_rate": 3.0768863519612167e-05, "loss": 0.9379, "step": 25490 }, { "epoch": 1.9901662374151252, "grad_norm": 5.576870441436768, "learning_rate": 3.072695900997055e-05, "loss": 0.031, "step": 25500 }, { "epoch": 1.9909466947631311, "grad_norm": 2.1766440868377686, "learning_rate": 3.0685070393965974e-05, "loss": 2.7766, "step": 25510 }, { "epoch": 1.991727152111137, "grad_norm": 0.016400795429944992, "learning_rate": 3.064319770614213e-05, "loss": 0.931, "step": 25520 }, { "epoch": 1.992507609459143, "grad_norm": 12.725517272949219, "learning_rate": 3.060134098102965e-05, "loss": 2.5748, "step": 25530 }, { "epoch": 1.9932880668071489, "grad_norm": 0.0008636490674689412, "learning_rate": 3.055950025314588e-05, "loss": 0.8295, "step": 25540 }, { "epoch": 1.9940685241551548, "grad_norm": 0.12384522706270218, "learning_rate": 3.0517675556995116e-05, "loss": 0.0936, "step": 25550 }, { "epoch": 1.9948489815031607, "grad_norm": 61.26963806152344, "learning_rate": 3.047586692706831e-05, "loss": 0.9006, "step": 25560 }, { "epoch": 1.9956294388511668, "grad_norm": 2.5371127421180972e-08, "learning_rate": 3.043407439784325e-05, "loss": 0.2862, "step": 25570 }, { "epoch": 1.9964098961991728, "grad_norm": 36.4479866027832, "learning_rate": 3.0392298003784382e-05, "loss": 1.0143, "step": 25580 }, { "epoch": 1.9971903535471787, "grad_norm": 0.007001777645200491, "learning_rate": 3.0350537779342914e-05, "loss": 0.9285, "step": 25590 }, { "epoch": 1.9979708108951846, "grad_norm": 42.04073715209961, "learning_rate": 3.0308793758956667e-05, "loss": 1.0238, "step": 25600 }, { "epoch": 1.9987512682431905, "grad_norm": 0.0021264858078211546, "learning_rate": 3.02670659770501e-05, "loss": 1.5027, "step": 25610 }, { "epoch": 1.9995317255911964, "grad_norm": 0.19171792268753052, "learning_rate": 3.0225354468034317e-05, "loss": 1.6025, "step": 25620 }, { "epoch": 2.0003121829392025, "grad_norm": 98.07879638671875, "learning_rate": 3.0183659266306964e-05, "loss": 1.0799, "step": 25630 }, { "epoch": 2.0010926402872085, "grad_norm": 0.38005921244621277, "learning_rate": 3.014198040625229e-05, "loss": 0.8573, "step": 25640 }, { "epoch": 2.0018730976352144, "grad_norm": 5.383617877960205, "learning_rate": 3.0100317922240996e-05, "loss": 0.6595, "step": 25650 }, { "epoch": 2.0026535549832203, "grad_norm": 0.22013996541500092, "learning_rate": 3.0058671848630338e-05, "loss": 0.0558, "step": 25660 }, { "epoch": 2.003434012331226, "grad_norm": 3.6882798326587363e-07, "learning_rate": 3.0017042219764e-05, "loss": 0.108, "step": 25670 }, { "epoch": 2.004214469679232, "grad_norm": 0.01585361920297146, "learning_rate": 2.9975429069972166e-05, "loss": 1.2641, "step": 25680 }, { "epoch": 2.004994927027238, "grad_norm": 0.001181729487143457, "learning_rate": 2.993383243357134e-05, "loss": 0.2464, "step": 25690 }, { "epoch": 2.005775384375244, "grad_norm": 0.9797963500022888, "learning_rate": 2.9892252344864467e-05, "loss": 0.3328, "step": 25700 }, { "epoch": 2.00655584172325, "grad_norm": 8.205447556974832e-06, "learning_rate": 2.9850688838140862e-05, "loss": 0.8833, "step": 25710 }, { "epoch": 2.0073362990712558, "grad_norm": 0.13399428129196167, "learning_rate": 2.98091419476761e-05, "loss": 0.0313, "step": 25720 }, { "epoch": 2.0081167564192617, "grad_norm": 4.621370508495204e-10, "learning_rate": 2.9767611707732107e-05, "loss": 0.1964, "step": 25730 }, { "epoch": 2.0088972137672676, "grad_norm": 7.719918926341052e-07, "learning_rate": 2.972609815255706e-05, "loss": 0.2324, "step": 25740 }, { "epoch": 2.0096776711152735, "grad_norm": 0.004049698356539011, "learning_rate": 2.9684601316385396e-05, "loss": 0.6326, "step": 25750 }, { "epoch": 2.0104581284632794, "grad_norm": 3.357577323913574, "learning_rate": 2.9643121233437715e-05, "loss": 1.0279, "step": 25760 }, { "epoch": 2.0112385858112853, "grad_norm": 3.8555106129933847e-07, "learning_rate": 2.9601657937920857e-05, "loss": 2.5202, "step": 25770 }, { "epoch": 2.0120190431592913, "grad_norm": 1.3730878833939641e-07, "learning_rate": 2.956021146402781e-05, "loss": 0.034, "step": 25780 }, { "epoch": 2.012799500507297, "grad_norm": 1.7080418501791428e-07, "learning_rate": 2.9518781845937626e-05, "loss": 1.3658, "step": 25790 }, { "epoch": 2.013579957855303, "grad_norm": 0.004128089174628258, "learning_rate": 2.9477369117815546e-05, "loss": 0.0758, "step": 25800 }, { "epoch": 2.014360415203309, "grad_norm": 4.3436452301648387e-07, "learning_rate": 2.9435973313812815e-05, "loss": 0.1982, "step": 25810 }, { "epoch": 2.015140872551315, "grad_norm": 37.810096740722656, "learning_rate": 2.939459446806679e-05, "loss": 0.0917, "step": 25820 }, { "epoch": 2.015921329899321, "grad_norm": 0.26041847467422485, "learning_rate": 2.9353232614700754e-05, "loss": 0.5103, "step": 25830 }, { "epoch": 2.0167017872473267, "grad_norm": 0.0001422716595698148, "learning_rate": 2.9311887787824068e-05, "loss": 0.1755, "step": 25840 }, { "epoch": 2.0174822445953327, "grad_norm": 0.05642390996217728, "learning_rate": 2.927056002153196e-05, "loss": 0.0007, "step": 25850 }, { "epoch": 2.018262701943339, "grad_norm": 7.756619240135987e-13, "learning_rate": 2.9229249349905684e-05, "loss": 0.0191, "step": 25860 }, { "epoch": 2.019043159291345, "grad_norm": 103.0235595703125, "learning_rate": 2.918795580701233e-05, "loss": 1.3986, "step": 25870 }, { "epoch": 2.019823616639351, "grad_norm": 3.3781661987304688, "learning_rate": 2.9146679426904876e-05, "loss": 0.7521, "step": 25880 }, { "epoch": 2.0206040739873568, "grad_norm": 0.009529034607112408, "learning_rate": 2.9105420243622182e-05, "loss": 0.5684, "step": 25890 }, { "epoch": 2.0213845313353627, "grad_norm": 9.872029860247267e-10, "learning_rate": 2.906417829118886e-05, "loss": 0.0163, "step": 25900 }, { "epoch": 2.0221649886833686, "grad_norm": 0.00018539708980824798, "learning_rate": 2.9022953603615395e-05, "loss": 0.0254, "step": 25910 }, { "epoch": 2.0229454460313745, "grad_norm": 0.01795944571495056, "learning_rate": 2.8981746214897944e-05, "loss": 0.114, "step": 25920 }, { "epoch": 2.0237259033793804, "grad_norm": 0.056700363755226135, "learning_rate": 2.8940556159018484e-05, "loss": 0.3087, "step": 25930 }, { "epoch": 2.0245063607273863, "grad_norm": 0.00224350206553936, "learning_rate": 2.889938346994463e-05, "loss": 0.0146, "step": 25940 }, { "epoch": 2.0252868180753922, "grad_norm": 9.900408744812012, "learning_rate": 2.885822818162971e-05, "loss": 0.1125, "step": 25950 }, { "epoch": 2.026067275423398, "grad_norm": 4.4600043296813965, "learning_rate": 2.8817090328012704e-05, "loss": 0.1713, "step": 25960 }, { "epoch": 2.026847732771404, "grad_norm": 0.3353830575942993, "learning_rate": 2.877596994301823e-05, "loss": 1.3635, "step": 25970 }, { "epoch": 2.02762819011941, "grad_norm": 108.07823944091797, "learning_rate": 2.8734867060556447e-05, "loss": 1.5941, "step": 25980 }, { "epoch": 2.028408647467416, "grad_norm": 134.56443786621094, "learning_rate": 2.8693781714523104e-05, "loss": 0.3383, "step": 25990 }, { "epoch": 2.029189104815422, "grad_norm": 0.419847309589386, "learning_rate": 2.865271393879953e-05, "loss": 0.0753, "step": 26000 }, { "epoch": 2.0299695621634277, "grad_norm": 0.0037621513474732637, "learning_rate": 2.8611663767252494e-05, "loss": 1.4775, "step": 26010 }, { "epoch": 2.0307500195114336, "grad_norm": 0.00020689735538326204, "learning_rate": 2.8570631233734292e-05, "loss": 0.2159, "step": 26020 }, { "epoch": 2.0315304768594395, "grad_norm": 8.361428626812994e-05, "learning_rate": 2.852961637208268e-05, "loss": 0.4753, "step": 26030 }, { "epoch": 2.0323109342074455, "grad_norm": 9.77057768025702e-10, "learning_rate": 2.848861921612083e-05, "loss": 0.1254, "step": 26040 }, { "epoch": 2.0330913915554514, "grad_norm": 7.082812526586213e-13, "learning_rate": 2.844763979965729e-05, "loss": 0.6117, "step": 26050 }, { "epoch": 2.0338718489034573, "grad_norm": 13.338913917541504, "learning_rate": 2.8406678156486034e-05, "loss": 0.0186, "step": 26060 }, { "epoch": 2.034652306251463, "grad_norm": 2.314552068710327, "learning_rate": 2.836573432038629e-05, "loss": 1.9713, "step": 26070 }, { "epoch": 2.035432763599469, "grad_norm": 111.98283386230469, "learning_rate": 2.832480832512271e-05, "loss": 0.8219, "step": 26080 }, { "epoch": 2.036213220947475, "grad_norm": 37.844078063964844, "learning_rate": 2.8283900204445136e-05, "loss": 0.4319, "step": 26090 }, { "epoch": 2.036993678295481, "grad_norm": 14.029542922973633, "learning_rate": 2.824300999208872e-05, "loss": 0.0097, "step": 26100 }, { "epoch": 2.0377741356434873, "grad_norm": 0.00010180075332755223, "learning_rate": 2.820213772177388e-05, "loss": 0.0203, "step": 26110 }, { "epoch": 2.038554592991493, "grad_norm": 1.2391136806400027e-05, "learning_rate": 2.8161283427206143e-05, "loss": 0.9182, "step": 26120 }, { "epoch": 2.039335050339499, "grad_norm": 9.387850877828896e-05, "learning_rate": 2.81204471420763e-05, "loss": 0.0724, "step": 26130 }, { "epoch": 2.040115507687505, "grad_norm": 0.15147614479064941, "learning_rate": 2.8079628900060233e-05, "loss": 0.3784, "step": 26140 }, { "epoch": 2.040895965035511, "grad_norm": 129.17037963867188, "learning_rate": 2.8038828734818995e-05, "loss": 0.9068, "step": 26150 }, { "epoch": 2.041676422383517, "grad_norm": 0.06553638726472855, "learning_rate": 2.7998046679998668e-05, "loss": 0.3397, "step": 26160 }, { "epoch": 2.042456879731523, "grad_norm": 0.05100405961275101, "learning_rate": 2.795728276923047e-05, "loss": 0.0717, "step": 26170 }, { "epoch": 2.0432373370795287, "grad_norm": 1.2694729889517475e-07, "learning_rate": 2.7916537036130584e-05, "loss": 0.5827, "step": 26180 }, { "epoch": 2.0440177944275346, "grad_norm": 1.5002992768131662e-06, "learning_rate": 2.7875809514300272e-05, "loss": 0.4723, "step": 26190 }, { "epoch": 2.0447982517755405, "grad_norm": 3.0192535632522777e-05, "learning_rate": 2.783510023732575e-05, "loss": 0.0862, "step": 26200 }, { "epoch": 2.0455787091235464, "grad_norm": 0.007100996095687151, "learning_rate": 2.7794409238778158e-05, "loss": 0.0, "step": 26210 }, { "epoch": 2.0463591664715524, "grad_norm": 0.0008602620218880475, "learning_rate": 2.7753736552213616e-05, "loss": 0.9352, "step": 26220 }, { "epoch": 2.0471396238195583, "grad_norm": 3.4370423174223674e-11, "learning_rate": 2.771308221117309e-05, "loss": 0.3896, "step": 26230 }, { "epoch": 2.047920081167564, "grad_norm": 5.0753031246131286e-05, "learning_rate": 2.7672446249182472e-05, "loss": 0.1046, "step": 26240 }, { "epoch": 2.04870053851557, "grad_norm": 0.2621128261089325, "learning_rate": 2.763182869975244e-05, "loss": 1.336, "step": 26250 }, { "epoch": 2.049480995863576, "grad_norm": 0.0025993799790740013, "learning_rate": 2.7591229596378528e-05, "loss": 0.5698, "step": 26260 }, { "epoch": 2.050261453211582, "grad_norm": 0.036587078124284744, "learning_rate": 2.755064897254105e-05, "loss": 0.0195, "step": 26270 }, { "epoch": 2.051041910559588, "grad_norm": 3.440267093424154e-08, "learning_rate": 2.7510086861705094e-05, "loss": 0.0057, "step": 26280 }, { "epoch": 2.0518223679075938, "grad_norm": 2.941135608125478e-05, "learning_rate": 2.7469543297320456e-05, "loss": 0.4777, "step": 26290 }, { "epoch": 2.0526028252555997, "grad_norm": 0.0801917314529419, "learning_rate": 2.7429018312821614e-05, "loss": 0.0012, "step": 26300 }, { "epoch": 2.0533832826036056, "grad_norm": 1.0764252920125728e-06, "learning_rate": 2.7388511941627805e-05, "loss": 0.2024, "step": 26310 }, { "epoch": 2.0541637399516115, "grad_norm": 42.37961959838867, "learning_rate": 2.7348024217142827e-05, "loss": 2.9864, "step": 26320 }, { "epoch": 2.0549441972996174, "grad_norm": 3.61101206181047e-06, "learning_rate": 2.730755517275516e-05, "loss": 0.0003, "step": 26330 }, { "epoch": 2.0557246546476233, "grad_norm": 0.036643143743276596, "learning_rate": 2.7267104841837863e-05, "loss": 0.0987, "step": 26340 }, { "epoch": 2.0565051119956292, "grad_norm": 0.000123582300147973, "learning_rate": 2.722667325774858e-05, "loss": 0.1032, "step": 26350 }, { "epoch": 2.057285569343635, "grad_norm": 8.651711141283158e-06, "learning_rate": 2.7186260453829443e-05, "loss": 0.1192, "step": 26360 }, { "epoch": 2.0580660266916415, "grad_norm": 0.04431761056184769, "learning_rate": 2.714586646340716e-05, "loss": 0.0038, "step": 26370 }, { "epoch": 2.0588464840396474, "grad_norm": 2.8218696910414787e-12, "learning_rate": 2.710549131979288e-05, "loss": 0.0032, "step": 26380 }, { "epoch": 2.0596269413876533, "grad_norm": 0.004620996303856373, "learning_rate": 2.7065135056282204e-05, "loss": 0.0001, "step": 26390 }, { "epoch": 2.0604073987356593, "grad_norm": 0.0009973037522286177, "learning_rate": 2.7024797706155204e-05, "loss": 0.1086, "step": 26400 }, { "epoch": 2.061187856083665, "grad_norm": 0.00033386031282134354, "learning_rate": 2.6984479302676336e-05, "loss": 0.1944, "step": 26410 }, { "epoch": 2.061968313431671, "grad_norm": 3.535060599801909e-08, "learning_rate": 2.6944179879094443e-05, "loss": 0.191, "step": 26420 }, { "epoch": 2.062748770779677, "grad_norm": 0.010547350160777569, "learning_rate": 2.6903899468642668e-05, "loss": 0.0016, "step": 26430 }, { "epoch": 2.063529228127683, "grad_norm": 3.213576555252075, "learning_rate": 2.686363810453856e-05, "loss": 1.5271, "step": 26440 }, { "epoch": 2.064309685475689, "grad_norm": 69.19461059570312, "learning_rate": 2.682339581998386e-05, "loss": 1.4639, "step": 26450 }, { "epoch": 2.0650901428236947, "grad_norm": 1.0486480306326484e-07, "learning_rate": 2.6783172648164666e-05, "loss": 0.0147, "step": 26460 }, { "epoch": 2.0658706001717007, "grad_norm": 13.421584129333496, "learning_rate": 2.6742968622251264e-05, "loss": 1.7384, "step": 26470 }, { "epoch": 2.0666510575197066, "grad_norm": 0.00023604616580996662, "learning_rate": 2.6702783775398132e-05, "loss": 0.0, "step": 26480 }, { "epoch": 2.0674315148677125, "grad_norm": 4.015465043805122e-12, "learning_rate": 2.6662618140743988e-05, "loss": 0.692, "step": 26490 }, { "epoch": 2.0682119722157184, "grad_norm": 2.1036534195379222e-14, "learning_rate": 2.6622471751411678e-05, "loss": 0.0897, "step": 26500 }, { "epoch": 2.0689924295637243, "grad_norm": 1.178400907519972e-05, "learning_rate": 2.65823446405082e-05, "loss": 0.0004, "step": 26510 }, { "epoch": 2.0697728869117302, "grad_norm": 0.055531445890665054, "learning_rate": 2.6542236841124594e-05, "loss": 0.012, "step": 26520 }, { "epoch": 2.070553344259736, "grad_norm": 7.828985530977661e-07, "learning_rate": 2.6502148386336057e-05, "loss": 1.0484, "step": 26530 }, { "epoch": 2.071333801607742, "grad_norm": 1.7514663568363176e-06, "learning_rate": 2.646207930920175e-05, "loss": 0.1406, "step": 26540 }, { "epoch": 2.072114258955748, "grad_norm": 0.8897789120674133, "learning_rate": 2.6422029642764933e-05, "loss": 0.1969, "step": 26550 }, { "epoch": 2.072894716303754, "grad_norm": 0.18779289722442627, "learning_rate": 2.6381999420052783e-05, "loss": 0.1714, "step": 26560 }, { "epoch": 2.07367517365176, "grad_norm": 118.19866180419922, "learning_rate": 2.6341988674076497e-05, "loss": 2.3051, "step": 26570 }, { "epoch": 2.0744556309997657, "grad_norm": 7.315911293029785, "learning_rate": 2.6301997437831217e-05, "loss": 0.0031, "step": 26580 }, { "epoch": 2.0752360883477716, "grad_norm": 1.4385477697942406e-06, "learning_rate": 2.6262025744295927e-05, "loss": 0.6723, "step": 26590 }, { "epoch": 2.0760165456957775, "grad_norm": 0.00010235334048047662, "learning_rate": 2.6222073626433584e-05, "loss": 0.524, "step": 26600 }, { "epoch": 2.0767970030437835, "grad_norm": 0.0751703754067421, "learning_rate": 2.6182141117190918e-05, "loss": 0.0055, "step": 26610 }, { "epoch": 2.0775774603917894, "grad_norm": 0.9944970607757568, "learning_rate": 2.6142228249498578e-05, "loss": 0.8321, "step": 26620 }, { "epoch": 2.0783579177397957, "grad_norm": 1.3824878931045532, "learning_rate": 2.610233505627091e-05, "loss": 0.0864, "step": 26630 }, { "epoch": 2.0791383750878016, "grad_norm": 4.768913128977426e-13, "learning_rate": 2.606246157040613e-05, "loss": 0.0231, "step": 26640 }, { "epoch": 2.0799188324358076, "grad_norm": 2.846596847128069e-10, "learning_rate": 2.602260782478615e-05, "loss": 0.0967, "step": 26650 }, { "epoch": 2.0806992897838135, "grad_norm": 5.813966708956286e-06, "learning_rate": 2.5982773852276644e-05, "loss": 0.0023, "step": 26660 }, { "epoch": 2.0814797471318194, "grad_norm": 1.337128741063509e-09, "learning_rate": 2.594295968572693e-05, "loss": 0.0004, "step": 26670 }, { "epoch": 2.0822602044798253, "grad_norm": 3.281140470434707e-09, "learning_rate": 2.5903165357970005e-05, "loss": 0.1601, "step": 26680 }, { "epoch": 2.083040661827831, "grad_norm": 7.578184158774093e-05, "learning_rate": 2.586339090182254e-05, "loss": 0.1502, "step": 26690 }, { "epoch": 2.083821119175837, "grad_norm": 0.00915143545717001, "learning_rate": 2.5823636350084775e-05, "loss": 1.3793, "step": 26700 }, { "epoch": 2.084601576523843, "grad_norm": 0.01805698312819004, "learning_rate": 2.5783901735540584e-05, "loss": 0.0301, "step": 26710 }, { "epoch": 2.085382033871849, "grad_norm": 1.9149654084671397e-10, "learning_rate": 2.5744187090957316e-05, "loss": 0.6883, "step": 26720 }, { "epoch": 2.086162491219855, "grad_norm": 1.6224537375819637e-06, "learning_rate": 2.570449244908598e-05, "loss": 0.7574, "step": 26730 }, { "epoch": 2.086942948567861, "grad_norm": 212.9560546875, "learning_rate": 2.566481784266097e-05, "loss": 1.8576, "step": 26740 }, { "epoch": 2.0877234059158667, "grad_norm": 0.00015921909653116018, "learning_rate": 2.5625163304400245e-05, "loss": 1.7303, "step": 26750 }, { "epoch": 2.0885038632638726, "grad_norm": 0.013452545739710331, "learning_rate": 2.558552886700512e-05, "loss": 0.0011, "step": 26760 }, { "epoch": 2.0892843206118785, "grad_norm": 8.070929652603809e-06, "learning_rate": 2.5545914563160443e-05, "loss": 1.574, "step": 26770 }, { "epoch": 2.0900647779598844, "grad_norm": 0.004110492300242186, "learning_rate": 2.5506320425534375e-05, "loss": 0.5442, "step": 26780 }, { "epoch": 2.0908452353078903, "grad_norm": 8.283843611067709e-10, "learning_rate": 2.5466746486778458e-05, "loss": 0.3234, "step": 26790 }, { "epoch": 2.0916256926558963, "grad_norm": 6.580946454448622e-09, "learning_rate": 2.542719277952762e-05, "loss": 0.0004, "step": 26800 }, { "epoch": 2.092406150003902, "grad_norm": 0.0010235244408249855, "learning_rate": 2.5387659336400072e-05, "loss": 0.0153, "step": 26810 }, { "epoch": 2.093186607351908, "grad_norm": 0.24290083348751068, "learning_rate": 2.5348146189997345e-05, "loss": 1.247, "step": 26820 }, { "epoch": 2.093967064699914, "grad_norm": 4.409335451782681e-05, "learning_rate": 2.530865337290418e-05, "loss": 0.1006, "step": 26830 }, { "epoch": 2.09474752204792, "grad_norm": 9.038549423217773, "learning_rate": 2.52691809176886e-05, "loss": 0.144, "step": 26840 }, { "epoch": 2.095527979395926, "grad_norm": 1.1245354413986206, "learning_rate": 2.5229728856901796e-05, "loss": 0.3469, "step": 26850 }, { "epoch": 2.0963084367439317, "grad_norm": 2.387321673680276e-09, "learning_rate": 2.5190297223078195e-05, "loss": 0.8331, "step": 26860 }, { "epoch": 2.0970888940919377, "grad_norm": 2.5276014614661335e-10, "learning_rate": 2.5150886048735313e-05, "loss": 0.0001, "step": 26870 }, { "epoch": 2.097869351439944, "grad_norm": 0.02269677072763443, "learning_rate": 2.5111495366373843e-05, "loss": 0.2311, "step": 26880 }, { "epoch": 2.09864980878795, "grad_norm": 1.765695003541623e-07, "learning_rate": 2.507212520847759e-05, "loss": 0.0448, "step": 26890 }, { "epoch": 2.099430266135956, "grad_norm": 203.69290161132812, "learning_rate": 2.5032775607513358e-05, "loss": 1.3521, "step": 26900 }, { "epoch": 2.1002107234839618, "grad_norm": 3.4434066037647426e-05, "learning_rate": 2.4993446595931097e-05, "loss": 1.4841, "step": 26910 }, { "epoch": 2.1009911808319677, "grad_norm": 0.0002487737510818988, "learning_rate": 2.4954138206163685e-05, "loss": 0.0001, "step": 26920 }, { "epoch": 2.1017716381799736, "grad_norm": 2.313194751739502, "learning_rate": 2.4914850470627078e-05, "loss": 0.5814, "step": 26930 }, { "epoch": 2.1025520955279795, "grad_norm": 4.538455300462374e-08, "learning_rate": 2.4875583421720123e-05, "loss": 0.5102, "step": 26940 }, { "epoch": 2.1033325528759854, "grad_norm": 14.016276359558105, "learning_rate": 2.483633709182466e-05, "loss": 0.074, "step": 26950 }, { "epoch": 2.1041130102239913, "grad_norm": 4.9571683563565117e-11, "learning_rate": 2.479711151330545e-05, "loss": 0.0003, "step": 26960 }, { "epoch": 2.1048934675719972, "grad_norm": 0.28219160437583923, "learning_rate": 2.4757906718510072e-05, "loss": 0.1947, "step": 26970 }, { "epoch": 2.105673924920003, "grad_norm": 2.000363826751709, "learning_rate": 2.4718722739769057e-05, "loss": 0.548, "step": 26980 }, { "epoch": 2.106454382268009, "grad_norm": 3.634791079232258e-10, "learning_rate": 2.467955960939569e-05, "loss": 1.4461, "step": 26990 }, { "epoch": 2.107234839616015, "grad_norm": 0.00023928057635203004, "learning_rate": 2.464041735968613e-05, "loss": 0.0638, "step": 27000 }, { "epoch": 2.108015296964021, "grad_norm": 1.9216714330916318e-10, "learning_rate": 2.4601296022919245e-05, "loss": 0.1088, "step": 27010 }, { "epoch": 2.108795754312027, "grad_norm": 159.6993865966797, "learning_rate": 2.456219563135674e-05, "loss": 2.9701, "step": 27020 }, { "epoch": 2.1095762116600327, "grad_norm": 184.36904907226562, "learning_rate": 2.4523116217242958e-05, "loss": 1.8129, "step": 27030 }, { "epoch": 2.1103566690080386, "grad_norm": 1.1307780883526575e-12, "learning_rate": 2.448405781280502e-05, "loss": 0.4697, "step": 27040 }, { "epoch": 2.1111371263560446, "grad_norm": 10.915018081665039, "learning_rate": 2.444502045025268e-05, "loss": 0.0177, "step": 27050 }, { "epoch": 2.1119175837040505, "grad_norm": 43.65808868408203, "learning_rate": 2.4406004161778374e-05, "loss": 0.1361, "step": 27060 }, { "epoch": 2.1126980410520564, "grad_norm": 3.3598575592041016, "learning_rate": 2.4367008979557114e-05, "loss": 0.3633, "step": 27070 }, { "epoch": 2.1134784984000623, "grad_norm": 1.2000231031095154e-08, "learning_rate": 2.4328034935746507e-05, "loss": 0.3738, "step": 27080 }, { "epoch": 2.114258955748068, "grad_norm": 190.1020965576172, "learning_rate": 2.428908206248679e-05, "loss": 0.9442, "step": 27090 }, { "epoch": 2.115039413096074, "grad_norm": 8.433188438415527, "learning_rate": 2.425015039190066e-05, "loss": 0.1653, "step": 27100 }, { "epoch": 2.11581987044408, "grad_norm": 141.552001953125, "learning_rate": 2.421123995609339e-05, "loss": 0.3785, "step": 27110 }, { "epoch": 2.116600327792086, "grad_norm": 10.547119140625, "learning_rate": 2.4172350787152714e-05, "loss": 0.3827, "step": 27120 }, { "epoch": 2.1173807851400923, "grad_norm": 9.30328369140625, "learning_rate": 2.4133482917148864e-05, "loss": 0.0089, "step": 27130 }, { "epoch": 2.1181612424880982, "grad_norm": 0.009139536879956722, "learning_rate": 2.4094636378134434e-05, "loss": 0.0024, "step": 27140 }, { "epoch": 2.118941699836104, "grad_norm": 0.15115059912204742, "learning_rate": 2.4055811202144505e-05, "loss": 0.0129, "step": 27150 }, { "epoch": 2.11972215718411, "grad_norm": 162.1376190185547, "learning_rate": 2.4017007421196508e-05, "loss": 2.3596, "step": 27160 }, { "epoch": 2.120502614532116, "grad_norm": 0.002663856605067849, "learning_rate": 2.397822506729019e-05, "loss": 3.0959, "step": 27170 }, { "epoch": 2.121283071880122, "grad_norm": 5.7644479056762066e-06, "learning_rate": 2.39394641724077e-05, "loss": 0.0004, "step": 27180 }, { "epoch": 2.122063529228128, "grad_norm": 4.785835335496813e-05, "learning_rate": 2.390072476851345e-05, "loss": 0.0668, "step": 27190 }, { "epoch": 2.1228439865761337, "grad_norm": 1.4409127970793634e-06, "learning_rate": 2.3862006887554166e-05, "loss": 0.3021, "step": 27200 }, { "epoch": 2.1236244439241396, "grad_norm": 2.1471103082149057e-06, "learning_rate": 2.382331056145875e-05, "loss": 0.7031, "step": 27210 }, { "epoch": 2.1244049012721455, "grad_norm": 4.70070488610419e-14, "learning_rate": 2.3784635822138424e-05, "loss": 0.4572, "step": 27220 }, { "epoch": 2.1251853586201515, "grad_norm": 178.70465087890625, "learning_rate": 2.3745982701486514e-05, "loss": 1.5983, "step": 27230 }, { "epoch": 2.1259658159681574, "grad_norm": 8.373917381732099e-08, "learning_rate": 2.3707351231378612e-05, "loss": 0.0597, "step": 27240 }, { "epoch": 2.1267462733161633, "grad_norm": 0.4737749397754669, "learning_rate": 2.3668741443672356e-05, "loss": 1.2344, "step": 27250 }, { "epoch": 2.127526730664169, "grad_norm": 9.960779425455257e-05, "learning_rate": 2.3630153370207582e-05, "loss": 0.6203, "step": 27260 }, { "epoch": 2.128307188012175, "grad_norm": 1.7590502920938889e-06, "learning_rate": 2.3591587042806213e-05, "loss": 0.0193, "step": 27270 }, { "epoch": 2.129087645360181, "grad_norm": 1.4878227122538945e-11, "learning_rate": 2.3553042493272176e-05, "loss": 0.667, "step": 27280 }, { "epoch": 2.129868102708187, "grad_norm": 0.38585662841796875, "learning_rate": 2.351451975339153e-05, "loss": 0.121, "step": 27290 }, { "epoch": 2.130648560056193, "grad_norm": 2.600064911637001e-12, "learning_rate": 2.3476018854932247e-05, "loss": 1.1299, "step": 27300 }, { "epoch": 2.1314290174041988, "grad_norm": 12.171748161315918, "learning_rate": 2.3437539829644385e-05, "loss": 0.0135, "step": 27310 }, { "epoch": 2.1322094747522047, "grad_norm": 0.0016079798806458712, "learning_rate": 2.3399082709259886e-05, "loss": 0.0188, "step": 27320 }, { "epoch": 2.1329899321002106, "grad_norm": 0.00035869970452040434, "learning_rate": 2.336064752549269e-05, "loss": 0.001, "step": 27330 }, { "epoch": 2.1337703894482165, "grad_norm": 7.7371763992778995e-16, "learning_rate": 2.3322234310038587e-05, "loss": 0.8011, "step": 27340 }, { "epoch": 2.1345508467962224, "grad_norm": 5.361710282159038e-05, "learning_rate": 2.3283843094575298e-05, "loss": 0.4662, "step": 27350 }, { "epoch": 2.1353313041442283, "grad_norm": 0.021017905324697495, "learning_rate": 2.3245473910762404e-05, "loss": 0.1137, "step": 27360 }, { "epoch": 2.1361117614922343, "grad_norm": 1.9953435526076646e-07, "learning_rate": 2.320712679024127e-05, "loss": 0.0039, "step": 27370 }, { "epoch": 2.1368922188402406, "grad_norm": 0.027051346376538277, "learning_rate": 2.3168801764635117e-05, "loss": 0.0001, "step": 27380 }, { "epoch": 2.137672676188246, "grad_norm": 2.500294936369496e-11, "learning_rate": 2.31304988655489e-05, "loss": 0.1205, "step": 27390 }, { "epoch": 2.1384531335362524, "grad_norm": 4.070713988113539e-09, "learning_rate": 2.3092218124569376e-05, "loss": 0.2147, "step": 27400 }, { "epoch": 2.1392335908842584, "grad_norm": 0.02425590716302395, "learning_rate": 2.3053959573264978e-05, "loss": 0.0, "step": 27410 }, { "epoch": 2.1400140482322643, "grad_norm": 5.908094846684975e-11, "learning_rate": 2.3015723243185877e-05, "loss": 0.0009, "step": 27420 }, { "epoch": 2.14079450558027, "grad_norm": 0.1725311577320099, "learning_rate": 2.2977509165863907e-05, "loss": 1.8829, "step": 27430 }, { "epoch": 2.141574962928276, "grad_norm": 8.780289499554783e-05, "learning_rate": 2.293931737281258e-05, "loss": 0.0569, "step": 27440 }, { "epoch": 2.142355420276282, "grad_norm": 3.0101862648734823e-05, "learning_rate": 2.290114789552698e-05, "loss": 1.0659, "step": 27450 }, { "epoch": 2.143135877624288, "grad_norm": 180.4490966796875, "learning_rate": 2.2863000765483788e-05, "loss": 0.3342, "step": 27460 }, { "epoch": 2.143916334972294, "grad_norm": 63.52663040161133, "learning_rate": 2.2824876014141327e-05, "loss": 0.0746, "step": 27470 }, { "epoch": 2.1446967923202997, "grad_norm": 177.1395263671875, "learning_rate": 2.2786773672939372e-05, "loss": 0.4897, "step": 27480 }, { "epoch": 2.1454772496683057, "grad_norm": 0.003967598080635071, "learning_rate": 2.2748693773299284e-05, "loss": 0.0958, "step": 27490 }, { "epoch": 2.1462577070163116, "grad_norm": 4.518438527304574e-18, "learning_rate": 2.2710636346623898e-05, "loss": 0.0022, "step": 27500 }, { "epoch": 2.1470381643643175, "grad_norm": 6.903857752149634e-08, "learning_rate": 2.2672601424297536e-05, "loss": 0.0008, "step": 27510 }, { "epoch": 2.1478186217123234, "grad_norm": 1.3903465878595256e-10, "learning_rate": 2.26345890376859e-05, "loss": 0.7075, "step": 27520 }, { "epoch": 2.1485990790603293, "grad_norm": 8.995040972378625e-15, "learning_rate": 2.259659921813619e-05, "loss": 0.007, "step": 27530 }, { "epoch": 2.1493795364083352, "grad_norm": 5.556034011533484e-05, "learning_rate": 2.2558631996976914e-05, "loss": 0.0384, "step": 27540 }, { "epoch": 2.150159993756341, "grad_norm": 5.721829893445829e-06, "learning_rate": 2.2520687405518027e-05, "loss": 0.564, "step": 27550 }, { "epoch": 2.150940451104347, "grad_norm": 0.055675067007541656, "learning_rate": 2.2482765475050733e-05, "loss": 0.0001, "step": 27560 }, { "epoch": 2.151720908452353, "grad_norm": 1.6937552572926506e-05, "learning_rate": 2.2444866236847623e-05, "loss": 0.0028, "step": 27570 }, { "epoch": 2.152501365800359, "grad_norm": 5.930499202833062e-09, "learning_rate": 2.2406989722162558e-05, "loss": 2.7458, "step": 27580 }, { "epoch": 2.153281823148365, "grad_norm": 6.177411364660657e-07, "learning_rate": 2.2369135962230626e-05, "loss": 0.4403, "step": 27590 }, { "epoch": 2.1540622804963707, "grad_norm": 0.04021603614091873, "learning_rate": 2.233130498826819e-05, "loss": 1.3678, "step": 27600 }, { "epoch": 2.1548427378443766, "grad_norm": 0.00041691059595905244, "learning_rate": 2.2293496831472788e-05, "loss": 1.461, "step": 27610 }, { "epoch": 2.1556231951923825, "grad_norm": 27.920442581176758, "learning_rate": 2.2255711523023188e-05, "loss": 0.0124, "step": 27620 }, { "epoch": 2.1564036525403885, "grad_norm": 1.32997045717208e-19, "learning_rate": 2.221794909407925e-05, "loss": 0.0023, "step": 27630 }, { "epoch": 2.1571841098883944, "grad_norm": 2.735917405516375e-06, "learning_rate": 2.2180209575782036e-05, "loss": 0.1073, "step": 27640 }, { "epoch": 2.1579645672364007, "grad_norm": 8.972650178407093e-09, "learning_rate": 2.2142492999253657e-05, "loss": 0.0854, "step": 27650 }, { "epoch": 2.1587450245844066, "grad_norm": 1.0043543170468183e-06, "learning_rate": 2.2104799395597335e-05, "loss": 0.7073, "step": 27660 }, { "epoch": 2.1595254819324126, "grad_norm": 87.87557983398438, "learning_rate": 2.206712879589737e-05, "loss": 0.0313, "step": 27670 }, { "epoch": 2.1603059392804185, "grad_norm": 8.924705505371094, "learning_rate": 2.2029481231219023e-05, "loss": 0.3171, "step": 27680 }, { "epoch": 2.1610863966284244, "grad_norm": 200.0184326171875, "learning_rate": 2.1991856732608647e-05, "loss": 1.6237, "step": 27690 }, { "epoch": 2.1618668539764303, "grad_norm": 0.008014976046979427, "learning_rate": 2.195425533109347e-05, "loss": 0.3416, "step": 27700 }, { "epoch": 2.162647311324436, "grad_norm": 9.913729392962978e-09, "learning_rate": 2.1916677057681785e-05, "loss": 0.0, "step": 27710 }, { "epoch": 2.163427768672442, "grad_norm": 0.0008486440638080239, "learning_rate": 2.1879121943362707e-05, "loss": 0.5153, "step": 27720 }, { "epoch": 2.164208226020448, "grad_norm": 0.0002521092537790537, "learning_rate": 2.184159001910633e-05, "loss": 0.8563, "step": 27730 }, { "epoch": 2.164988683368454, "grad_norm": 0.7091737389564514, "learning_rate": 2.1804081315863585e-05, "loss": 0.4124, "step": 27740 }, { "epoch": 2.16576914071646, "grad_norm": 227.946044921875, "learning_rate": 2.176659586456629e-05, "loss": 1.8093, "step": 27750 }, { "epoch": 2.166549598064466, "grad_norm": 2.0538752778520575e-06, "learning_rate": 2.1729133696127054e-05, "loss": 0.0179, "step": 27760 }, { "epoch": 2.1673300554124717, "grad_norm": 0.028842711821198463, "learning_rate": 2.1691694841439265e-05, "loss": 0.3815, "step": 27770 }, { "epoch": 2.1681105127604776, "grad_norm": 3.98197114170884e-10, "learning_rate": 2.1654279331377147e-05, "loss": 0.4047, "step": 27780 }, { "epoch": 2.1688909701084835, "grad_norm": 2.1827180106395245e-16, "learning_rate": 2.1616887196795614e-05, "loss": 0.0048, "step": 27790 }, { "epoch": 2.1696714274564894, "grad_norm": 0.011947333812713623, "learning_rate": 2.157951846853035e-05, "loss": 0.0019, "step": 27800 }, { "epoch": 2.1704518848044954, "grad_norm": 7.423368231229688e-08, "learning_rate": 2.1542173177397713e-05, "loss": 0.3171, "step": 27810 }, { "epoch": 2.1712323421525013, "grad_norm": 1.2332695753514145e-08, "learning_rate": 2.150485135419475e-05, "loss": 0.2155, "step": 27820 }, { "epoch": 2.172012799500507, "grad_norm": 1.0378048420633945e-10, "learning_rate": 2.1467553029699112e-05, "loss": 0.0055, "step": 27830 }, { "epoch": 2.172793256848513, "grad_norm": 8.545025867090012e-17, "learning_rate": 2.1430278234669133e-05, "loss": 1.5454, "step": 27840 }, { "epoch": 2.173573714196519, "grad_norm": 0.3371439576148987, "learning_rate": 2.1393026999843703e-05, "loss": 0.5029, "step": 27850 }, { "epoch": 2.174354171544525, "grad_norm": 1.8403001149636111e-06, "learning_rate": 2.1355799355942262e-05, "loss": 0.6609, "step": 27860 }, { "epoch": 2.175134628892531, "grad_norm": 0.5155243277549744, "learning_rate": 2.1318595333664854e-05, "loss": 0.8387, "step": 27870 }, { "epoch": 2.1759150862405368, "grad_norm": 184.76950073242188, "learning_rate": 2.1281414963692014e-05, "loss": 0.1438, "step": 27880 }, { "epoch": 2.1766955435885427, "grad_norm": 5.0829814426833764e-05, "learning_rate": 2.1244258276684787e-05, "loss": 1.4594, "step": 27890 }, { "epoch": 2.177476000936549, "grad_norm": 4.234400677316508e-11, "learning_rate": 2.1207125303284637e-05, "loss": 0.0001, "step": 27900 }, { "epoch": 2.178256458284555, "grad_norm": 1.1056568854916904e-10, "learning_rate": 2.1170016074113542e-05, "loss": 1.014, "step": 27910 }, { "epoch": 2.179036915632561, "grad_norm": 4.591631075356872e-09, "learning_rate": 2.113293061977384e-05, "loss": 0.3403, "step": 27920 }, { "epoch": 2.1798173729805668, "grad_norm": 6.53014765200112e-14, "learning_rate": 2.1095868970848322e-05, "loss": 0.008, "step": 27930 }, { "epoch": 2.1805978303285727, "grad_norm": 0.0019696371164172888, "learning_rate": 2.1058831157900088e-05, "loss": 0.0308, "step": 27940 }, { "epoch": 2.1813782876765786, "grad_norm": 8.0718174115188e-15, "learning_rate": 2.10218172114726e-05, "loss": 0.5434, "step": 27950 }, { "epoch": 2.1821587450245845, "grad_norm": 0.26627057790756226, "learning_rate": 2.0984827162089665e-05, "loss": 0.0001, "step": 27960 }, { "epoch": 2.1829392023725904, "grad_norm": 0.04139802232384682, "learning_rate": 2.0947861040255356e-05, "loss": 0.001, "step": 27970 }, { "epoch": 2.1837196597205963, "grad_norm": 4.1397134964427096e-07, "learning_rate": 2.091091887645405e-05, "loss": 0.0052, "step": 27980 }, { "epoch": 2.1845001170686023, "grad_norm": 0.00010960004874505103, "learning_rate": 2.08740007011503e-05, "loss": 0.002, "step": 27990 }, { "epoch": 2.185280574416608, "grad_norm": 0.00015495822299271822, "learning_rate": 2.0837106544788953e-05, "loss": 0.6876, "step": 28000 }, { "epoch": 2.186061031764614, "grad_norm": 7.378990267170593e-05, "learning_rate": 2.0800236437794972e-05, "loss": 0.0006, "step": 28010 }, { "epoch": 2.18684148911262, "grad_norm": 0.0001176554651465267, "learning_rate": 2.0763390410573567e-05, "loss": 0.3669, "step": 28020 }, { "epoch": 2.187621946460626, "grad_norm": 4.130102482235998e-09, "learning_rate": 2.072656849351002e-05, "loss": 0.0089, "step": 28030 }, { "epoch": 2.188402403808632, "grad_norm": 1.4616011512202931e-08, "learning_rate": 2.068977071696977e-05, "loss": 0.1802, "step": 28040 }, { "epoch": 2.1891828611566377, "grad_norm": 1.43246558278326e-12, "learning_rate": 2.0652997111298363e-05, "loss": 0.0135, "step": 28050 }, { "epoch": 2.1899633185046437, "grad_norm": 365.1261901855469, "learning_rate": 2.0616247706821347e-05, "loss": 0.6377, "step": 28060 }, { "epoch": 2.1907437758526496, "grad_norm": 12.422369956970215, "learning_rate": 2.0579522533844397e-05, "loss": 0.0047, "step": 28070 }, { "epoch": 2.1915242332006555, "grad_norm": 2.1104255750061625e-15, "learning_rate": 2.054282162265313e-05, "loss": 1.9192, "step": 28080 }, { "epoch": 2.1923046905486614, "grad_norm": 3.5538564278903007e-13, "learning_rate": 2.0506145003513215e-05, "loss": 0.0014, "step": 28090 }, { "epoch": 2.1930851478966673, "grad_norm": 0.00013382562610786408, "learning_rate": 2.0469492706670236e-05, "loss": 0.2018, "step": 28100 }, { "epoch": 2.1938656052446732, "grad_norm": 3.3238044125027955e-05, "learning_rate": 2.043286476234975e-05, "loss": 0.0, "step": 28110 }, { "epoch": 2.194646062592679, "grad_norm": 6.489884071925189e-06, "learning_rate": 2.0396261200757237e-05, "loss": 1.2673, "step": 28120 }, { "epoch": 2.195426519940685, "grad_norm": 4.15155426480851e-07, "learning_rate": 2.035968205207807e-05, "loss": 0.0223, "step": 28130 }, { "epoch": 2.196206977288691, "grad_norm": 6.578194734174758e-05, "learning_rate": 2.032312734647747e-05, "loss": 1.6357, "step": 28140 }, { "epoch": 2.1969874346366973, "grad_norm": 5.8557468616853464e-21, "learning_rate": 2.028659711410048e-05, "loss": 0.7133, "step": 28150 }, { "epoch": 2.1977678919847032, "grad_norm": 5.440007953438908e-06, "learning_rate": 2.025009138507204e-05, "loss": 0.016, "step": 28160 }, { "epoch": 2.198548349332709, "grad_norm": 0.00112254754640162, "learning_rate": 2.0213610189496784e-05, "loss": 0.0001, "step": 28170 }, { "epoch": 2.199328806680715, "grad_norm": 2.7225906862327065e-08, "learning_rate": 2.0177153557459212e-05, "loss": 0.0051, "step": 28180 }, { "epoch": 2.200109264028721, "grad_norm": 0.006148796062916517, "learning_rate": 2.014072151902346e-05, "loss": 1.3892, "step": 28190 }, { "epoch": 2.200889721376727, "grad_norm": 1.1964095830917358, "learning_rate": 2.010431410423351e-05, "loss": 0.0007, "step": 28200 }, { "epoch": 2.201670178724733, "grad_norm": 0.0008482402190566063, "learning_rate": 2.0067931343112928e-05, "loss": 0.0279, "step": 28210 }, { "epoch": 2.2024506360727387, "grad_norm": 203.20982360839844, "learning_rate": 2.0031573265665022e-05, "loss": 0.8307, "step": 28220 }, { "epoch": 2.2032310934207446, "grad_norm": 7.246639486396811e-12, "learning_rate": 1.999523990187267e-05, "loss": 0.0001, "step": 28230 }, { "epoch": 2.2040115507687505, "grad_norm": 0.022905228659510612, "learning_rate": 1.9958931281698463e-05, "loss": 0.0331, "step": 28240 }, { "epoch": 2.2047920081167565, "grad_norm": 8.755494125664478e-15, "learning_rate": 1.9922647435084514e-05, "loss": 0.0435, "step": 28250 }, { "epoch": 2.2055724654647624, "grad_norm": 302.1018981933594, "learning_rate": 1.988638839195251e-05, "loss": 2.5054, "step": 28260 }, { "epoch": 2.2063529228127683, "grad_norm": 6.365735316649079e-06, "learning_rate": 1.9850154182203722e-05, "loss": 0.0001, "step": 28270 }, { "epoch": 2.207133380160774, "grad_norm": 210.52674865722656, "learning_rate": 1.9813944835718936e-05, "loss": 0.6641, "step": 28280 }, { "epoch": 2.20791383750878, "grad_norm": 4.210061160847545e-05, "learning_rate": 1.9777760382358417e-05, "loss": 0.0003, "step": 28290 }, { "epoch": 2.208694294856786, "grad_norm": 0.001491089817136526, "learning_rate": 1.974160085196189e-05, "loss": 2.045, "step": 28300 }, { "epoch": 2.209474752204792, "grad_norm": 1.0782598110381514e-05, "learning_rate": 1.970546627434857e-05, "loss": 0.0367, "step": 28310 }, { "epoch": 2.210255209552798, "grad_norm": 229.82244873046875, "learning_rate": 1.966935667931704e-05, "loss": 1.2602, "step": 28320 }, { "epoch": 2.2110356669008038, "grad_norm": 5.791094537244135e-08, "learning_rate": 1.9633272096645345e-05, "loss": 0.0015, "step": 28330 }, { "epoch": 2.2118161242488097, "grad_norm": 0.8963795304298401, "learning_rate": 1.9597212556090827e-05, "loss": 0.0004, "step": 28340 }, { "epoch": 2.2125965815968156, "grad_norm": 1.6252406567218713e-05, "learning_rate": 1.956117808739023e-05, "loss": 0.0005, "step": 28350 }, { "epoch": 2.2133770389448215, "grad_norm": 7.480576238236682e-16, "learning_rate": 1.9525168720259647e-05, "loss": 0.0043, "step": 28360 }, { "epoch": 2.2141574962928274, "grad_norm": 0.00015660261851735413, "learning_rate": 1.9489184484394374e-05, "loss": 0.3501, "step": 28370 }, { "epoch": 2.2149379536408333, "grad_norm": 204.61285400390625, "learning_rate": 1.9453225409469093e-05, "loss": 1.9355, "step": 28380 }, { "epoch": 2.2157184109888393, "grad_norm": 1.1758524465221853e-07, "learning_rate": 1.9417291525137656e-05, "loss": 0.4495, "step": 28390 }, { "epoch": 2.2164988683368456, "grad_norm": 0.0005095585947856307, "learning_rate": 1.9381382861033194e-05, "loss": 0.8172, "step": 28400 }, { "epoch": 2.217279325684851, "grad_norm": 0.00011332322173984721, "learning_rate": 1.9345499446767982e-05, "loss": 0.1318, "step": 28410 }, { "epoch": 2.2180597830328574, "grad_norm": 5.318884568894511e-11, "learning_rate": 1.9309641311933535e-05, "loss": 1.5683, "step": 28420 }, { "epoch": 2.2188402403808634, "grad_norm": 8.58475141285453e-06, "learning_rate": 1.927380848610051e-05, "loss": 0.0013, "step": 28430 }, { "epoch": 2.2196206977288693, "grad_norm": 6.644186214543879e-06, "learning_rate": 1.9238000998818634e-05, "loss": 0.0339, "step": 28440 }, { "epoch": 2.220401155076875, "grad_norm": 0.00013304341700859368, "learning_rate": 1.9202218879616824e-05, "loss": 0.0302, "step": 28450 }, { "epoch": 2.221181612424881, "grad_norm": 1.0497045057773069e-16, "learning_rate": 1.9166462158002995e-05, "loss": 0.0025, "step": 28460 }, { "epoch": 2.221962069772887, "grad_norm": 2.2750330875361165e-14, "learning_rate": 1.9130730863464196e-05, "loss": 0.5506, "step": 28470 }, { "epoch": 2.222742527120893, "grad_norm": 1.969460150519353e-09, "learning_rate": 1.909502502546643e-05, "loss": 0.0003, "step": 28480 }, { "epoch": 2.223522984468899, "grad_norm": 7.709286364843138e-08, "learning_rate": 1.905934467345478e-05, "loss": 1.6219, "step": 28490 }, { "epoch": 2.2243034418169048, "grad_norm": 3.405430531140739e-15, "learning_rate": 1.9023689836853253e-05, "loss": 0.5878, "step": 28500 }, { "epoch": 2.2250838991649107, "grad_norm": 0.00013919989578425884, "learning_rate": 1.898806054506484e-05, "loss": 0.1291, "step": 28510 }, { "epoch": 2.2258643565129166, "grad_norm": 1.1194244535462015e-18, "learning_rate": 1.8952456827471475e-05, "loss": 0.7777, "step": 28520 }, { "epoch": 2.2266448138609225, "grad_norm": 4.966762290337101e-08, "learning_rate": 1.891687871343401e-05, "loss": 0.865, "step": 28530 }, { "epoch": 2.2274252712089284, "grad_norm": 6.397501639554548e-09, "learning_rate": 1.888132623229215e-05, "loss": 0.0003, "step": 28540 }, { "epoch": 2.2282057285569343, "grad_norm": 1.4362217370944563e-06, "learning_rate": 1.884579941336445e-05, "loss": 1.4745, "step": 28550 }, { "epoch": 2.2289861859049402, "grad_norm": 0.010631516575813293, "learning_rate": 1.881029828594837e-05, "loss": 0.0259, "step": 28560 }, { "epoch": 2.229766643252946, "grad_norm": 2.6044972400995903e-07, "learning_rate": 1.8774822879320107e-05, "loss": 0.0, "step": 28570 }, { "epoch": 2.230547100600952, "grad_norm": 9.406716117155156e-07, "learning_rate": 1.8739373222734708e-05, "loss": 1.3937, "step": 28580 }, { "epoch": 2.231327557948958, "grad_norm": 0.0014336185995489359, "learning_rate": 1.8703949345425948e-05, "loss": 0.1551, "step": 28590 }, { "epoch": 2.232108015296964, "grad_norm": 1.689692794570874e-06, "learning_rate": 1.8668551276606377e-05, "loss": 1.1188, "step": 28600 }, { "epoch": 2.23288847264497, "grad_norm": 2.9793050289154053, "learning_rate": 1.8633179045467203e-05, "loss": 0.0023, "step": 28610 }, { "epoch": 2.2336689299929757, "grad_norm": 3.615961425684072e-13, "learning_rate": 1.8597832681178405e-05, "loss": 1.1973, "step": 28620 }, { "epoch": 2.2344493873409816, "grad_norm": 0.01192985288798809, "learning_rate": 1.8562512212888565e-05, "loss": 0.003, "step": 28630 }, { "epoch": 2.2352298446889876, "grad_norm": 215.98329162597656, "learning_rate": 1.8527217669724924e-05, "loss": 2.44, "step": 28640 }, { "epoch": 2.236010302036994, "grad_norm": 0.008464322425425053, "learning_rate": 1.849194908079336e-05, "loss": 0.3656, "step": 28650 }, { "epoch": 2.2367907593849994, "grad_norm": 0.8279557228088379, "learning_rate": 1.8456706475178347e-05, "loss": 1.2165, "step": 28660 }, { "epoch": 2.2375712167330057, "grad_norm": 3.254080729675479e-05, "learning_rate": 1.842148988194295e-05, "loss": 0.0004, "step": 28670 }, { "epoch": 2.2383516740810117, "grad_norm": 1.2258682957622113e-10, "learning_rate": 1.8386299330128714e-05, "loss": 1.4324, "step": 28680 }, { "epoch": 2.2391321314290176, "grad_norm": 0.004264687187969685, "learning_rate": 1.8351134848755796e-05, "loss": 1.745, "step": 28690 }, { "epoch": 2.2399125887770235, "grad_norm": 0.00042669291724450886, "learning_rate": 1.8315996466822772e-05, "loss": 0.0516, "step": 28700 }, { "epoch": 2.2406930461250294, "grad_norm": 0.00012339148088358343, "learning_rate": 1.828088421330677e-05, "loss": 0.651, "step": 28710 }, { "epoch": 2.2414735034730353, "grad_norm": 2.1883664585303775e-12, "learning_rate": 1.82457981171633e-05, "loss": 0.5319, "step": 28720 }, { "epoch": 2.2422539608210412, "grad_norm": 2.0608095785412672e-10, "learning_rate": 1.8210738207326356e-05, "loss": 0.9683, "step": 28730 }, { "epoch": 2.243034418169047, "grad_norm": 4.095708573004231e-05, "learning_rate": 1.8175704512708335e-05, "loss": 0.0071, "step": 28740 }, { "epoch": 2.243814875517053, "grad_norm": 1.1333201307891236e-15, "learning_rate": 1.8140697062199963e-05, "loss": 0.0018, "step": 28750 }, { "epoch": 2.244595332865059, "grad_norm": 0.00019960488134529442, "learning_rate": 1.8105715884670388e-05, "loss": 0.7299, "step": 28760 }, { "epoch": 2.245375790213065, "grad_norm": 0.051362793892621994, "learning_rate": 1.8070761008967036e-05, "loss": 0.0008, "step": 28770 }, { "epoch": 2.246156247561071, "grad_norm": 1.3369414773478638e-05, "learning_rate": 1.8035832463915702e-05, "loss": 1.4077, "step": 28780 }, { "epoch": 2.2469367049090767, "grad_norm": 6.260481313802302e-06, "learning_rate": 1.8000930278320398e-05, "loss": 0.001, "step": 28790 }, { "epoch": 2.2477171622570826, "grad_norm": 2.0338929718481842e-14, "learning_rate": 1.796605448096348e-05, "loss": 0.0582, "step": 28800 }, { "epoch": 2.2484976196050885, "grad_norm": 2.200808495445017e-07, "learning_rate": 1.7931205100605446e-05, "loss": 0.9008, "step": 28810 }, { "epoch": 2.2492780769530945, "grad_norm": 5.862556551328169e-10, "learning_rate": 1.7896382165985092e-05, "loss": 0.2649, "step": 28820 }, { "epoch": 2.2500585343011004, "grad_norm": 0.005376019049435854, "learning_rate": 1.786158570581939e-05, "loss": 0.0015, "step": 28830 }, { "epoch": 2.2508389916491063, "grad_norm": 17.125396728515625, "learning_rate": 1.782681574880343e-05, "loss": 0.5649, "step": 28840 }, { "epoch": 2.251619448997112, "grad_norm": 1.2211214303970337, "learning_rate": 1.7792072323610508e-05, "loss": 1.5577, "step": 28850 }, { "epoch": 2.252399906345118, "grad_norm": 3.984374462306928e-12, "learning_rate": 1.7757355458891982e-05, "loss": 0.3288, "step": 28860 }, { "epoch": 2.253180363693124, "grad_norm": 1.6309401051906036e-10, "learning_rate": 1.772266518327738e-05, "loss": 0.0083, "step": 28870 }, { "epoch": 2.25396082104113, "grad_norm": 0.00034212172613479197, "learning_rate": 1.7688001525374215e-05, "loss": 0.6162, "step": 28880 }, { "epoch": 2.254741278389136, "grad_norm": 1.2708798635685525e-07, "learning_rate": 1.7653364513768116e-05, "loss": 0.0385, "step": 28890 }, { "epoch": 2.255521735737142, "grad_norm": 0.03911673277616501, "learning_rate": 1.7618754177022722e-05, "loss": 0.8912, "step": 28900 }, { "epoch": 2.2563021930851477, "grad_norm": 0.017582669854164124, "learning_rate": 1.758417054367968e-05, "loss": 0.4513, "step": 28910 }, { "epoch": 2.257082650433154, "grad_norm": 2.5487133291455383e-14, "learning_rate": 1.7549613642258573e-05, "loss": 0.0602, "step": 28920 }, { "epoch": 2.25786310778116, "grad_norm": 1.2419158679222164e-07, "learning_rate": 1.7515083501257006e-05, "loss": 0.0005, "step": 28930 }, { "epoch": 2.258643565129166, "grad_norm": 0.010186822153627872, "learning_rate": 1.7480580149150466e-05, "loss": 0.0001, "step": 28940 }, { "epoch": 2.259424022477172, "grad_norm": 2.926939123426564e-05, "learning_rate": 1.7446103614392343e-05, "loss": 0.0044, "step": 28950 }, { "epoch": 2.2602044798251777, "grad_norm": 0.6418260931968689, "learning_rate": 1.7411653925413957e-05, "loss": 1.549, "step": 28960 }, { "epoch": 2.2609849371731836, "grad_norm": 1.1723219586201594e-06, "learning_rate": 1.737723111062446e-05, "loss": 0.6384, "step": 28970 }, { "epoch": 2.2617653945211895, "grad_norm": 0.04600205644965172, "learning_rate": 1.7342835198410863e-05, "loss": 0.0717, "step": 28980 }, { "epoch": 2.2625458518691954, "grad_norm": 199.13775634765625, "learning_rate": 1.730846621713795e-05, "loss": 0.3244, "step": 28990 }, { "epoch": 2.2633263092172013, "grad_norm": 3.316258727336959e-17, "learning_rate": 1.7274124195148338e-05, "loss": 0.0252, "step": 29000 }, { "epoch": 2.2641067665652073, "grad_norm": 1.9493307945595006e-07, "learning_rate": 1.7239809160762383e-05, "loss": 0.5724, "step": 29010 }, { "epoch": 2.264887223913213, "grad_norm": 0.06668946892023087, "learning_rate": 1.7205521142278225e-05, "loss": 0.9245, "step": 29020 }, { "epoch": 2.265667681261219, "grad_norm": 4.2764901185153234e-14, "learning_rate": 1.7171260167971658e-05, "loss": 0.1911, "step": 29030 }, { "epoch": 2.266448138609225, "grad_norm": 1.0847069025039673, "learning_rate": 1.713702626609624e-05, "loss": 0.1063, "step": 29040 }, { "epoch": 2.267228595957231, "grad_norm": 0.001475973636843264, "learning_rate": 1.710281946488319e-05, "loss": 0.4168, "step": 29050 }, { "epoch": 2.268009053305237, "grad_norm": 5.03236151416786e-07, "learning_rate": 1.7068639792541337e-05, "loss": 1.7054, "step": 29060 }, { "epoch": 2.2687895106532427, "grad_norm": 5.004310854594962e-10, "learning_rate": 1.7034487277257193e-05, "loss": 0.0003, "step": 29070 }, { "epoch": 2.2695699680012487, "grad_norm": 1.9284122743101761e-07, "learning_rate": 1.7000361947194828e-05, "loss": 0.0043, "step": 29080 }, { "epoch": 2.2703504253492546, "grad_norm": 1.50415091686904e-16, "learning_rate": 1.6966263830495936e-05, "loss": 0.0086, "step": 29090 }, { "epoch": 2.2711308826972605, "grad_norm": 163.6029052734375, "learning_rate": 1.6932192955279724e-05, "loss": 0.338, "step": 29100 }, { "epoch": 2.2719113400452664, "grad_norm": 4.9290561118956947e-17, "learning_rate": 1.6898149349642984e-05, "loss": 0.1245, "step": 29110 }, { "epoch": 2.2726917973932723, "grad_norm": 0.01845482736825943, "learning_rate": 1.6864133041659964e-05, "loss": 0.7721, "step": 29120 }, { "epoch": 2.2734722547412782, "grad_norm": 0.42253369092941284, "learning_rate": 1.6830144059382448e-05, "loss": 0.0258, "step": 29130 }, { "epoch": 2.274252712089284, "grad_norm": 0.0007411639671772718, "learning_rate": 1.679618243083968e-05, "loss": 0.7881, "step": 29140 }, { "epoch": 2.27503316943729, "grad_norm": 2.0271454559406266e-05, "learning_rate": 1.676224818403831e-05, "loss": 0.4049, "step": 29150 }, { "epoch": 2.275813626785296, "grad_norm": 0.1324540078639984, "learning_rate": 1.6728341346962462e-05, "loss": 0.0122, "step": 29160 }, { "epoch": 2.2765940841333023, "grad_norm": 2.3629531491209388e-12, "learning_rate": 1.669446194757359e-05, "loss": 0.1406, "step": 29170 }, { "epoch": 2.277374541481308, "grad_norm": 2.2875452643233984e-08, "learning_rate": 1.6660610013810603e-05, "loss": 0.0, "step": 29180 }, { "epoch": 2.278154998829314, "grad_norm": 8.568787279727985e-08, "learning_rate": 1.6626785573589665e-05, "loss": 1.0792, "step": 29190 }, { "epoch": 2.27893545617732, "grad_norm": 1.937781780725345e-05, "learning_rate": 1.659298865480435e-05, "loss": 0.6439, "step": 29200 }, { "epoch": 2.279715913525326, "grad_norm": 0.010542660020291805, "learning_rate": 1.6559219285325495e-05, "loss": 0.1753, "step": 29210 }, { "epoch": 2.280496370873332, "grad_norm": 28.777854919433594, "learning_rate": 1.6525477493001253e-05, "loss": 0.0113, "step": 29220 }, { "epoch": 2.281276828221338, "grad_norm": 0.0008937482489272952, "learning_rate": 1.649176330565698e-05, "loss": 0.1313, "step": 29230 }, { "epoch": 2.2820572855693437, "grad_norm": 3.146440940327011e-05, "learning_rate": 1.645807675109529e-05, "loss": 0.0, "step": 29240 }, { "epoch": 2.2828377429173496, "grad_norm": 3.4924204328490305e-07, "learning_rate": 1.6424417857096052e-05, "loss": 0.0139, "step": 29250 }, { "epoch": 2.2836182002653556, "grad_norm": 1.5560833332983748e-07, "learning_rate": 1.6390786651416245e-05, "loss": 0.4481, "step": 29260 }, { "epoch": 2.2843986576133615, "grad_norm": 0.00024010951165109873, "learning_rate": 1.6357183161790086e-05, "loss": 0.0023, "step": 29270 }, { "epoch": 2.2851791149613674, "grad_norm": 2.9010316371369527e-09, "learning_rate": 1.6323607415928905e-05, "loss": 0.0002, "step": 29280 }, { "epoch": 2.2859595723093733, "grad_norm": 1.5927986169117503e-05, "learning_rate": 1.629005944152117e-05, "loss": 0.0212, "step": 29290 }, { "epoch": 2.286740029657379, "grad_norm": 8.826788578797239e-22, "learning_rate": 1.6256539266232406e-05, "loss": 0.0007, "step": 29300 }, { "epoch": 2.287520487005385, "grad_norm": 1.8401525494482485e-06, "learning_rate": 1.622304691770527e-05, "loss": 2.0744, "step": 29310 }, { "epoch": 2.288300944353391, "grad_norm": 3.847404514090158e-06, "learning_rate": 1.6189582423559424e-05, "loss": 0.0424, "step": 29320 }, { "epoch": 2.289081401701397, "grad_norm": 0.34409666061401367, "learning_rate": 1.6156145811391565e-05, "loss": 0.2845, "step": 29330 }, { "epoch": 2.289861859049403, "grad_norm": 180.2010498046875, "learning_rate": 1.6122737108775444e-05, "loss": 0.2992, "step": 29340 }, { "epoch": 2.290642316397409, "grad_norm": 9.646710513733225e-11, "learning_rate": 1.6089356343261706e-05, "loss": 1.9876, "step": 29350 }, { "epoch": 2.2914227737454147, "grad_norm": 6.982622813289652e-10, "learning_rate": 1.6056003542378088e-05, "loss": 0.0046, "step": 29360 }, { "epoch": 2.2922032310934206, "grad_norm": 0.0001232082722708583, "learning_rate": 1.6022678733629142e-05, "loss": 0.0002, "step": 29370 }, { "epoch": 2.2929836884414265, "grad_norm": 294.8337707519531, "learning_rate": 1.598938194449641e-05, "loss": 1.363, "step": 29380 }, { "epoch": 2.2937641457894324, "grad_norm": 0.3886575400829315, "learning_rate": 1.595611320243828e-05, "loss": 0.8926, "step": 29390 }, { "epoch": 2.2945446031374384, "grad_norm": 0.0002822733367793262, "learning_rate": 1.5922872534890054e-05, "loss": 0.0, "step": 29400 }, { "epoch": 2.2953250604854443, "grad_norm": 79.61001586914062, "learning_rate": 1.5889659969263843e-05, "loss": 0.3346, "step": 29410 }, { "epoch": 2.2961055178334506, "grad_norm": 3.2513478021023445e-10, "learning_rate": 1.585647553294863e-05, "loss": 0.0014, "step": 29420 }, { "epoch": 2.296885975181456, "grad_norm": 6.914351313724865e-10, "learning_rate": 1.5823319253310133e-05, "loss": 3.8508, "step": 29430 }, { "epoch": 2.2976664325294625, "grad_norm": 6.965128208349589e-14, "learning_rate": 1.579019115769092e-05, "loss": 0.0199, "step": 29440 }, { "epoch": 2.2984468898774684, "grad_norm": 2.478855662402246e-12, "learning_rate": 1.5757091273410296e-05, "loss": 0.3049, "step": 29450 }, { "epoch": 2.2992273472254743, "grad_norm": 5.027878614782821e-06, "learning_rate": 1.5724019627764265e-05, "loss": 0.4531, "step": 29460 }, { "epoch": 2.30000780457348, "grad_norm": 111.20915985107422, "learning_rate": 1.5690976248025603e-05, "loss": 2.1627, "step": 29470 }, { "epoch": 2.300788261921486, "grad_norm": 62.71500015258789, "learning_rate": 1.565796116144371e-05, "loss": 0.0257, "step": 29480 }, { "epoch": 2.301568719269492, "grad_norm": 9.235794351525328e-08, "learning_rate": 1.5624974395244722e-05, "loss": 0.2352, "step": 29490 }, { "epoch": 2.302349176617498, "grad_norm": 6.738876492419499e-13, "learning_rate": 1.5592015976631363e-05, "loss": 0.0007, "step": 29500 }, { "epoch": 2.303129633965504, "grad_norm": 1.0343747314443312e-16, "learning_rate": 1.5559085932783013e-05, "loss": 0.0698, "step": 29510 }, { "epoch": 2.3039100913135098, "grad_norm": 0.0002653372648637742, "learning_rate": 1.552618429085566e-05, "loss": 0.003, "step": 29520 }, { "epoch": 2.3046905486615157, "grad_norm": 3.0795322888965693e-12, "learning_rate": 1.5493311077981827e-05, "loss": 0.0039, "step": 29530 }, { "epoch": 2.3054710060095216, "grad_norm": 0.0023853040765970945, "learning_rate": 1.546046632127065e-05, "loss": 2.4906, "step": 29540 }, { "epoch": 2.3062514633575275, "grad_norm": 12.284415245056152, "learning_rate": 1.5427650047807733e-05, "loss": 0.6692, "step": 29550 }, { "epoch": 2.3070319207055334, "grad_norm": 5.2024464736177833e-08, "learning_rate": 1.5394862284655264e-05, "loss": 0.0006, "step": 29560 }, { "epoch": 2.3078123780535393, "grad_norm": 7.312227694455942e-07, "learning_rate": 1.5362103058851847e-05, "loss": 0.6756, "step": 29570 }, { "epoch": 2.3085928354015453, "grad_norm": 7.574797271613494e-18, "learning_rate": 1.5329372397412605e-05, "loss": 0.0, "step": 29580 }, { "epoch": 2.309373292749551, "grad_norm": 0.0, "learning_rate": 1.529667032732909e-05, "loss": 0.97, "step": 29590 }, { "epoch": 2.310153750097557, "grad_norm": 5.293955920339377e-23, "learning_rate": 1.5263996875569288e-05, "loss": 0.0633, "step": 29600 }, { "epoch": 2.310934207445563, "grad_norm": 0.1192730963230133, "learning_rate": 1.5231352069077553e-05, "loss": 0.1783, "step": 29610 }, { "epoch": 2.311714664793569, "grad_norm": 3.9510272472398356e-05, "learning_rate": 1.5198735934774627e-05, "loss": 0.0019, "step": 29620 }, { "epoch": 2.312495122141575, "grad_norm": 4.5667154546834635e-14, "learning_rate": 1.5166148499557637e-05, "loss": 0.1592, "step": 29630 }, { "epoch": 2.3132755794895807, "grad_norm": 151.92227172851562, "learning_rate": 1.513358979029999e-05, "loss": 0.7626, "step": 29640 }, { "epoch": 2.3140560368375866, "grad_norm": 1.377452372253174e-05, "learning_rate": 1.5101059833851473e-05, "loss": 1.341, "step": 29650 }, { "epoch": 2.3148364941855926, "grad_norm": 18.119611740112305, "learning_rate": 1.5068558657038084e-05, "loss": 0.0049, "step": 29660 }, { "epoch": 2.315616951533599, "grad_norm": 3.9845085098022537e-07, "learning_rate": 1.5036086286662148e-05, "loss": 1.8925, "step": 29670 }, { "epoch": 2.3163974088816044, "grad_norm": 9.201725481838147e-16, "learning_rate": 1.500364274950221e-05, "loss": 1.578, "step": 29680 }, { "epoch": 2.3171778662296107, "grad_norm": 1.5374733266071416e-05, "learning_rate": 1.497122807231306e-05, "loss": 0.0, "step": 29690 }, { "epoch": 2.3179583235776167, "grad_norm": 1.1048708984162658e-05, "learning_rate": 1.4938842281825637e-05, "loss": 0.1567, "step": 29700 }, { "epoch": 2.3187387809256226, "grad_norm": 5.3418560241880186e-08, "learning_rate": 1.4906485404747127e-05, "loss": 0.0, "step": 29710 }, { "epoch": 2.3195192382736285, "grad_norm": 1.4238677841695176e-15, "learning_rate": 1.4874157467760812e-05, "loss": 1.4317, "step": 29720 }, { "epoch": 2.3202996956216344, "grad_norm": 0.0007119226502254605, "learning_rate": 1.484185849752613e-05, "loss": 1.1788, "step": 29730 }, { "epoch": 2.3210801529696403, "grad_norm": 4.933306172461016e-07, "learning_rate": 1.4809588520678636e-05, "loss": 0.0012, "step": 29740 }, { "epoch": 2.3218606103176462, "grad_norm": 0.19965912401676178, "learning_rate": 1.477734756382998e-05, "loss": 0.0255, "step": 29750 }, { "epoch": 2.322641067665652, "grad_norm": 0.8561561107635498, "learning_rate": 1.4745135653567887e-05, "loss": 0.3782, "step": 29760 }, { "epoch": 2.323421525013658, "grad_norm": 0.00013684302393812686, "learning_rate": 1.4712952816456093e-05, "loss": 0.8732, "step": 29770 }, { "epoch": 2.324201982361664, "grad_norm": 1.5134020259210956e-07, "learning_rate": 1.4680799079034402e-05, "loss": 0.9348, "step": 29780 }, { "epoch": 2.32498243970967, "grad_norm": 2.4426712388891936e-11, "learning_rate": 1.4648674467818573e-05, "loss": 0.0, "step": 29790 }, { "epoch": 2.325762897057676, "grad_norm": 0.0035370292607694864, "learning_rate": 1.4616579009300407e-05, "loss": 0.0001, "step": 29800 }, { "epoch": 2.3265433544056817, "grad_norm": 1.096762188785628e-15, "learning_rate": 1.4584512729947597e-05, "loss": 0.0002, "step": 29810 }, { "epoch": 2.3273238117536876, "grad_norm": 1.2029965912319085e-09, "learning_rate": 1.4552475656203817e-05, "loss": 0.0779, "step": 29820 }, { "epoch": 2.3281042691016935, "grad_norm": 120.34854125976562, "learning_rate": 1.452046781448867e-05, "loss": 0.1021, "step": 29830 }, { "epoch": 2.3288847264496995, "grad_norm": 3.3695923207233136e-07, "learning_rate": 1.4488489231197589e-05, "loss": 0.0978, "step": 29840 }, { "epoch": 2.3296651837977054, "grad_norm": 7.707518670940772e-05, "learning_rate": 1.4456539932701957e-05, "loss": 0.9351, "step": 29850 }, { "epoch": 2.3304456411457113, "grad_norm": 0.002746081445366144, "learning_rate": 1.4424619945348927e-05, "loss": 0.0, "step": 29860 }, { "epoch": 2.331226098493717, "grad_norm": 0.0011035347124561667, "learning_rate": 1.439272929546156e-05, "loss": 0.0, "step": 29870 }, { "epoch": 2.332006555841723, "grad_norm": 1.0748327783360878e-09, "learning_rate": 1.4360868009338658e-05, "loss": 0.0, "step": 29880 }, { "epoch": 2.332787013189729, "grad_norm": 6.85275438172539e-07, "learning_rate": 1.432903611325484e-05, "loss": 0.0, "step": 29890 }, { "epoch": 2.333567470537735, "grad_norm": 3.086071274083224e-06, "learning_rate": 1.4297233633460489e-05, "loss": 1.504, "step": 29900 }, { "epoch": 2.334347927885741, "grad_norm": 0.00041684453026391566, "learning_rate": 1.426546059618174e-05, "loss": 0.0, "step": 29910 }, { "epoch": 2.335128385233747, "grad_norm": 187.2286834716797, "learning_rate": 1.4233717027620413e-05, "loss": 2.9529, "step": 29920 }, { "epoch": 2.3359088425817527, "grad_norm": 6.371737981680781e-07, "learning_rate": 1.4202002953954041e-05, "loss": 0.3322, "step": 29930 }, { "epoch": 2.336689299929759, "grad_norm": 0.9140729904174805, "learning_rate": 1.4170318401335859e-05, "loss": 0.1837, "step": 29940 }, { "epoch": 2.337469757277765, "grad_norm": 9.586511851011892e-08, "learning_rate": 1.4138663395894702e-05, "loss": 0.0001, "step": 29950 }, { "epoch": 2.338250214625771, "grad_norm": 166.947998046875, "learning_rate": 1.410703796373512e-05, "loss": 0.1164, "step": 29960 }, { "epoch": 2.339030671973777, "grad_norm": 0.01821865886449814, "learning_rate": 1.4075442130937183e-05, "loss": 0.2303, "step": 29970 }, { "epoch": 2.3398111293217827, "grad_norm": 0.018942847847938538, "learning_rate": 1.4043875923556627e-05, "loss": 0.0001, "step": 29980 }, { "epoch": 2.3405915866697886, "grad_norm": 0.03353448212146759, "learning_rate": 1.4012339367624711e-05, "loss": 0.7039, "step": 29990 }, { "epoch": 2.3413720440177945, "grad_norm": 1.6034233851769386e-07, "learning_rate": 1.3980832489148287e-05, "loss": 0.1224, "step": 30000 }, { "epoch": 2.3421525013658004, "grad_norm": 194.84609985351562, "learning_rate": 1.3949355314109686e-05, "loss": 0.7069, "step": 30010 }, { "epoch": 2.3429329587138064, "grad_norm": 0.00033877615351229906, "learning_rate": 1.3917907868466745e-05, "loss": 0.0678, "step": 30020 }, { "epoch": 2.3437134160618123, "grad_norm": 0.2966441512107849, "learning_rate": 1.3886490178152834e-05, "loss": 0.0029, "step": 30030 }, { "epoch": 2.344493873409818, "grad_norm": 3.915573643098469e-07, "learning_rate": 1.385510226907672e-05, "loss": 0.3636, "step": 30040 }, { "epoch": 2.345274330757824, "grad_norm": 1.262197732925415, "learning_rate": 1.3823744167122665e-05, "loss": 0.2405, "step": 30050 }, { "epoch": 2.34605478810583, "grad_norm": 2.086556503569506e-19, "learning_rate": 1.3792415898150323e-05, "loss": 2.1505, "step": 30060 }, { "epoch": 2.346835245453836, "grad_norm": 5.639322807837743e-06, "learning_rate": 1.3761117487994767e-05, "loss": 0.0, "step": 30070 }, { "epoch": 2.347615702801842, "grad_norm": 0.0005181823507882655, "learning_rate": 1.3729848962466407e-05, "loss": 0.0275, "step": 30080 }, { "epoch": 2.3483961601498478, "grad_norm": 5.675804004567908e-07, "learning_rate": 1.3698610347351065e-05, "loss": 0.0289, "step": 30090 }, { "epoch": 2.3491766174978537, "grad_norm": 2.3446404817661914e-09, "learning_rate": 1.366740166840984e-05, "loss": 0.0001, "step": 30100 }, { "epoch": 2.3499570748458596, "grad_norm": 7.366129928243481e-09, "learning_rate": 1.3636222951379168e-05, "loss": 0.1921, "step": 30110 }, { "epoch": 2.3507375321938655, "grad_norm": 6.541293259942904e-06, "learning_rate": 1.3605074221970781e-05, "loss": 0.3531, "step": 30120 }, { "epoch": 2.3515179895418714, "grad_norm": 7.940363033753783e-09, "learning_rate": 1.3573955505871689e-05, "loss": 0.0002, "step": 30130 }, { "epoch": 2.3522984468898773, "grad_norm": 2.120935965134194e-18, "learning_rate": 1.3542866828744145e-05, "loss": 0.1597, "step": 30140 }, { "epoch": 2.3530789042378832, "grad_norm": 2.8953287250360615e-12, "learning_rate": 1.3511808216225607e-05, "loss": 0.0, "step": 30150 }, { "epoch": 2.353859361585889, "grad_norm": 426.9054260253906, "learning_rate": 1.348077969392878e-05, "loss": 0.7626, "step": 30160 }, { "epoch": 2.3546398189338955, "grad_norm": 0.006858580280095339, "learning_rate": 1.3449781287441516e-05, "loss": 0.121, "step": 30170 }, { "epoch": 2.355420276281901, "grad_norm": 0.0003124236536677927, "learning_rate": 1.341881302232687e-05, "loss": 1.0282, "step": 30180 }, { "epoch": 2.3562007336299073, "grad_norm": 0.06409773230552673, "learning_rate": 1.338787492412299e-05, "loss": 0.8941, "step": 30190 }, { "epoch": 2.356981190977913, "grad_norm": 9.21978660084477e-10, "learning_rate": 1.3356967018343202e-05, "loss": 0.0274, "step": 30200 }, { "epoch": 2.357761648325919, "grad_norm": 272.5448303222656, "learning_rate": 1.3326089330475916e-05, "loss": 0.3298, "step": 30210 }, { "epoch": 2.358542105673925, "grad_norm": 4.685094972955994e-05, "learning_rate": 1.3295241885984582e-05, "loss": 0.1805, "step": 30220 }, { "epoch": 2.359322563021931, "grad_norm": 8.916493324161243e-18, "learning_rate": 1.3264424710307788e-05, "loss": 0.0028, "step": 30230 }, { "epoch": 2.360103020369937, "grad_norm": 10.937233924865723, "learning_rate": 1.3233637828859074e-05, "loss": 0.3216, "step": 30240 }, { "epoch": 2.360883477717943, "grad_norm": 1.9286907121755917e-16, "learning_rate": 1.3202881267027067e-05, "loss": 0.0558, "step": 30250 }, { "epoch": 2.3616639350659487, "grad_norm": 1.1528916274983203e-06, "learning_rate": 1.3172155050175344e-05, "loss": 0.001, "step": 30260 }, { "epoch": 2.3624443924139547, "grad_norm": 9.056205385604699e-08, "learning_rate": 1.3141459203642503e-05, "loss": 0.0201, "step": 30270 }, { "epoch": 2.3632248497619606, "grad_norm": 72.918701171875, "learning_rate": 1.3110793752742035e-05, "loss": 0.7724, "step": 30280 }, { "epoch": 2.3640053071099665, "grad_norm": 3.27559462220961e-07, "learning_rate": 1.308015872276242e-05, "loss": 0.099, "step": 30290 }, { "epoch": 2.3647857644579724, "grad_norm": 2.3372873329208232e-05, "learning_rate": 1.3049554138967051e-05, "loss": 0.0002, "step": 30300 }, { "epoch": 2.3655662218059783, "grad_norm": 177.09768676757812, "learning_rate": 1.3018980026594164e-05, "loss": 0.5471, "step": 30310 }, { "epoch": 2.3663466791539842, "grad_norm": 1.786733446351718e-05, "learning_rate": 1.2988436410856918e-05, "loss": 2.1845, "step": 30320 }, { "epoch": 2.36712713650199, "grad_norm": 1.1945124576973054e-11, "learning_rate": 1.2957923316943283e-05, "loss": 0.7127, "step": 30330 }, { "epoch": 2.367907593849996, "grad_norm": 11.638202667236328, "learning_rate": 1.2927440770016102e-05, "loss": 0.2237, "step": 30340 }, { "epoch": 2.368688051198002, "grad_norm": 4.211646000840119e-07, "learning_rate": 1.2896988795212977e-05, "loss": 0.0001, "step": 30350 }, { "epoch": 2.369468508546008, "grad_norm": 251.97036743164062, "learning_rate": 1.286656741764633e-05, "loss": 1.2411, "step": 30360 }, { "epoch": 2.370248965894014, "grad_norm": 0.6915257573127747, "learning_rate": 1.283617666240336e-05, "loss": 0.0002, "step": 30370 }, { "epoch": 2.3710294232420197, "grad_norm": 7.871942943893373e-05, "learning_rate": 1.2805816554546002e-05, "loss": 0.0, "step": 30380 }, { "epoch": 2.3718098805900256, "grad_norm": 7.493450482343178e-08, "learning_rate": 1.277548711911089e-05, "loss": 0.2484, "step": 30390 }, { "epoch": 2.3725903379380315, "grad_norm": 1.512717366218567, "learning_rate": 1.2745188381109413e-05, "loss": 0.2046, "step": 30400 }, { "epoch": 2.3733707952860374, "grad_norm": 0.9041994214057922, "learning_rate": 1.2714920365527611e-05, "loss": 0.0852, "step": 30410 }, { "epoch": 2.3741512526340434, "grad_norm": 0.08699410408735275, "learning_rate": 1.2684683097326177e-05, "loss": 1.9877, "step": 30420 }, { "epoch": 2.3749317099820493, "grad_norm": 6.432399459299631e-06, "learning_rate": 1.2654476601440484e-05, "loss": 1.4962, "step": 30430 }, { "epoch": 2.3757121673300556, "grad_norm": 0.00024829444009810686, "learning_rate": 1.2624300902780518e-05, "loss": 0.0001, "step": 30440 }, { "epoch": 2.376492624678061, "grad_norm": 0.0683436170220375, "learning_rate": 1.2594156026230869e-05, "loss": 1.5125, "step": 30450 }, { "epoch": 2.3772730820260675, "grad_norm": 4.083558796933486e-22, "learning_rate": 1.2564041996650678e-05, "loss": 0.1911, "step": 30460 }, { "epoch": 2.3780535393740734, "grad_norm": 5.6656685046618804e-05, "learning_rate": 1.2533958838873705e-05, "loss": 0.0, "step": 30470 }, { "epoch": 2.3788339967220793, "grad_norm": 6.898962601553649e-05, "learning_rate": 1.2503906577708185e-05, "loss": 0.0, "step": 30480 }, { "epoch": 2.379614454070085, "grad_norm": 2.0142909909481865e-11, "learning_rate": 1.2473885237936944e-05, "loss": 0.3101, "step": 30490 }, { "epoch": 2.380394911418091, "grad_norm": 0.00017895206110551953, "learning_rate": 1.244389484431724e-05, "loss": 0.0003, "step": 30500 }, { "epoch": 2.381175368766097, "grad_norm": 6.29999067314202e-06, "learning_rate": 1.2413935421580851e-05, "loss": 0.0, "step": 30510 }, { "epoch": 2.381955826114103, "grad_norm": 4.521431833381939e-07, "learning_rate": 1.2384006994434032e-05, "loss": 0.0, "step": 30520 }, { "epoch": 2.382736283462109, "grad_norm": 1.842394885898102e-05, "learning_rate": 1.2354109587557422e-05, "loss": 0.0227, "step": 30530 }, { "epoch": 2.3835167408101148, "grad_norm": 2.835592427530287e-09, "learning_rate": 1.2324243225606135e-05, "loss": 0.453, "step": 30540 }, { "epoch": 2.3842971981581207, "grad_norm": 0.2322986125946045, "learning_rate": 1.2294407933209633e-05, "loss": 0.0115, "step": 30550 }, { "epoch": 2.3850776555061266, "grad_norm": 5.22947977239438e-14, "learning_rate": 1.2264603734971807e-05, "loss": 0.0274, "step": 30560 }, { "epoch": 2.3858581128541325, "grad_norm": 2.995477596097972e-15, "learning_rate": 1.2234830655470853e-05, "loss": 1.1651, "step": 30570 }, { "epoch": 2.3866385702021384, "grad_norm": 0.00010099189967149869, "learning_rate": 1.220508871925936e-05, "loss": 0.0, "step": 30580 }, { "epoch": 2.3874190275501443, "grad_norm": 309.7481384277344, "learning_rate": 1.2175377950864181e-05, "loss": 4.395, "step": 30590 }, { "epoch": 2.3881994848981503, "grad_norm": 7.486784133018432e-23, "learning_rate": 1.2145698374786501e-05, "loss": 0.001, "step": 30600 }, { "epoch": 2.388979942246156, "grad_norm": 0.02301146648824215, "learning_rate": 1.2116050015501795e-05, "loss": 0.0006, "step": 30610 }, { "epoch": 2.389760399594162, "grad_norm": 4.1090452361944806e-16, "learning_rate": 1.2086432897459737e-05, "loss": 0.4331, "step": 30620 }, { "epoch": 2.390540856942168, "grad_norm": 284.6919250488281, "learning_rate": 1.2056847045084308e-05, "loss": 1.8292, "step": 30630 }, { "epoch": 2.391321314290174, "grad_norm": 0.37592825293540955, "learning_rate": 1.2027292482773639e-05, "loss": 0.0128, "step": 30640 }, { "epoch": 2.39210177163818, "grad_norm": 0.01579117402434349, "learning_rate": 1.1997769234900113e-05, "loss": 0.8022, "step": 30650 }, { "epoch": 2.3928822289861857, "grad_norm": 2.1695347951424145e-14, "learning_rate": 1.1968277325810251e-05, "loss": 0.0391, "step": 30660 }, { "epoch": 2.3936626863341917, "grad_norm": 285.8024597167969, "learning_rate": 1.1938816779824753e-05, "loss": 0.6952, "step": 30670 }, { "epoch": 2.3944431436821976, "grad_norm": 2.7052454948425293, "learning_rate": 1.1909387621238439e-05, "loss": 0.1416, "step": 30680 }, { "epoch": 2.395223601030204, "grad_norm": 0.0008680449100211263, "learning_rate": 1.187998987432028e-05, "loss": 1.172, "step": 30690 }, { "epoch": 2.3960040583782094, "grad_norm": 1.2633448762694607e-15, "learning_rate": 1.1850623563313296e-05, "loss": 0.0001, "step": 30700 }, { "epoch": 2.3967845157262158, "grad_norm": 1.0665467467731114e-17, "learning_rate": 1.1821288712434591e-05, "loss": 0.1835, "step": 30710 }, { "epoch": 2.3975649730742217, "grad_norm": 0.00472455145791173, "learning_rate": 1.1791985345875373e-05, "loss": 0.0006, "step": 30720 }, { "epoch": 2.3983454304222276, "grad_norm": 2.081298653048929e-15, "learning_rate": 1.1762713487800813e-05, "loss": 0.1604, "step": 30730 }, { "epoch": 2.3991258877702335, "grad_norm": 1.492285264248494e-06, "learning_rate": 1.1733473162350156e-05, "loss": 2.225, "step": 30740 }, { "epoch": 2.3999063451182394, "grad_norm": 1.9213453583688533e-07, "learning_rate": 1.1704264393636621e-05, "loss": 0.3886, "step": 30750 }, { "epoch": 2.4006868024662453, "grad_norm": 2.1827824920814254e-13, "learning_rate": 1.1675087205747426e-05, "loss": 0.0, "step": 30760 }, { "epoch": 2.4014672598142512, "grad_norm": 0.2885519862174988, "learning_rate": 1.1645941622743695e-05, "loss": 0.1745, "step": 30770 }, { "epoch": 2.402247717162257, "grad_norm": 0.001634718501009047, "learning_rate": 1.1616827668660545e-05, "loss": 0.0, "step": 30780 }, { "epoch": 2.403028174510263, "grad_norm": 9.618447692437133e-16, "learning_rate": 1.158774536750698e-05, "loss": 0.2085, "step": 30790 }, { "epoch": 2.403808631858269, "grad_norm": 1.8258782500845587e-16, "learning_rate": 1.1558694743265885e-05, "loss": 0.0008, "step": 30800 }, { "epoch": 2.404589089206275, "grad_norm": 1.2440332852747815e-07, "learning_rate": 1.1529675819894075e-05, "loss": 0.489, "step": 30810 }, { "epoch": 2.405369546554281, "grad_norm": 5.6702891015447676e-05, "learning_rate": 1.1500688621322147e-05, "loss": 0.5853, "step": 30820 }, { "epoch": 2.4061500039022867, "grad_norm": 0.010418553836643696, "learning_rate": 1.147173317145464e-05, "loss": 1.2328, "step": 30830 }, { "epoch": 2.4069304612502926, "grad_norm": 0.0015043625608086586, "learning_rate": 1.1442809494169804e-05, "loss": 0.5443, "step": 30840 }, { "epoch": 2.4077109185982986, "grad_norm": 0.38860613107681274, "learning_rate": 1.1413917613319775e-05, "loss": 1.2227, "step": 30850 }, { "epoch": 2.4084913759463045, "grad_norm": 3.5585859251656774e-21, "learning_rate": 1.1385057552730382e-05, "loss": 0.1658, "step": 30860 }, { "epoch": 2.4092718332943104, "grad_norm": 4.808939912948063e-10, "learning_rate": 1.1356229336201308e-05, "loss": 0.1557, "step": 30870 }, { "epoch": 2.4100522906423163, "grad_norm": 7.308386784643517e-09, "learning_rate": 1.132743298750588e-05, "loss": 1.6504, "step": 30880 }, { "epoch": 2.410832747990322, "grad_norm": 6.105677075041172e-10, "learning_rate": 1.1298668530391232e-05, "loss": 0.0, "step": 30890 }, { "epoch": 2.411613205338328, "grad_norm": 1.2822355677233332e-10, "learning_rate": 1.1269935988578129e-05, "loss": 0.0142, "step": 30900 }, { "epoch": 2.412393662686334, "grad_norm": 1.0691497600440192e-12, "learning_rate": 1.1241235385761057e-05, "loss": 0.2239, "step": 30910 }, { "epoch": 2.41317412003434, "grad_norm": 5.364148591979756e-07, "learning_rate": 1.121256674560816e-05, "loss": 0.1139, "step": 30920 }, { "epoch": 2.413954577382346, "grad_norm": 284.3153381347656, "learning_rate": 1.1183930091761202e-05, "loss": 0.4734, "step": 30930 }, { "epoch": 2.4147350347303522, "grad_norm": 1.574044894425697e-08, "learning_rate": 1.1155325447835608e-05, "loss": 1.6763, "step": 30940 }, { "epoch": 2.4155154920783577, "grad_norm": 1.6060916174484063e-19, "learning_rate": 1.1126752837420345e-05, "loss": 0.9619, "step": 30950 }, { "epoch": 2.416295949426364, "grad_norm": 9.772094927029684e-05, "learning_rate": 1.1098212284078036e-05, "loss": 0.0005, "step": 30960 }, { "epoch": 2.41707640677437, "grad_norm": 0.005812688730657101, "learning_rate": 1.10697038113448e-05, "loss": 1.1127, "step": 30970 }, { "epoch": 2.417856864122376, "grad_norm": 0.0004505214747041464, "learning_rate": 1.1041227442730344e-05, "loss": 1.5875, "step": 30980 }, { "epoch": 2.418637321470382, "grad_norm": 7.922157863049506e-08, "learning_rate": 1.1012783201717907e-05, "loss": 0.0011, "step": 30990 }, { "epoch": 2.4194177788183877, "grad_norm": 5.424059779102208e-08, "learning_rate": 1.0984371111764185e-05, "loss": 0.0, "step": 31000 }, { "epoch": 2.4201982361663936, "grad_norm": 335.3708801269531, "learning_rate": 1.095599119629942e-05, "loss": 0.4164, "step": 31010 }, { "epoch": 2.4209786935143995, "grad_norm": 0.00026503499248065054, "learning_rate": 1.0927643478727256e-05, "loss": 0.3014, "step": 31020 }, { "epoch": 2.4217591508624055, "grad_norm": 4.594489665023932e-14, "learning_rate": 1.0899327982424862e-05, "loss": 0.0228, "step": 31030 }, { "epoch": 2.4225396082104114, "grad_norm": 0.009176124818623066, "learning_rate": 1.0871044730742752e-05, "loss": 0.9399, "step": 31040 }, { "epoch": 2.4233200655584173, "grad_norm": 248.33827209472656, "learning_rate": 1.084279374700492e-05, "loss": 1.7967, "step": 31050 }, { "epoch": 2.424100522906423, "grad_norm": 233.01034545898438, "learning_rate": 1.0814575054508703e-05, "loss": 1.3471, "step": 31060 }, { "epoch": 2.424880980254429, "grad_norm": 5.455961945699528e-05, "learning_rate": 1.0786388676524855e-05, "loss": 0.043, "step": 31070 }, { "epoch": 2.425661437602435, "grad_norm": 266.8015441894531, "learning_rate": 1.0758234636297438e-05, "loss": 1.7403, "step": 31080 }, { "epoch": 2.426441894950441, "grad_norm": 2.230910922662588e-08, "learning_rate": 1.0730112957043842e-05, "loss": 1.6638, "step": 31090 }, { "epoch": 2.427222352298447, "grad_norm": 5.843081574141706e-09, "learning_rate": 1.0702023661954825e-05, "loss": 0.4814, "step": 31100 }, { "epoch": 2.4280028096464528, "grad_norm": 43.01310348510742, "learning_rate": 1.0673966774194367e-05, "loss": 0.0597, "step": 31110 }, { "epoch": 2.4287832669944587, "grad_norm": 3.3520866082881184e-08, "learning_rate": 1.0645942316899799e-05, "loss": 0.4181, "step": 31120 }, { "epoch": 2.4295637243424646, "grad_norm": 2.7300829970045015e-06, "learning_rate": 1.061795031318164e-05, "loss": 0.0, "step": 31130 }, { "epoch": 2.4303441816904705, "grad_norm": 1.2723034160444513e-05, "learning_rate": 1.0589990786123682e-05, "loss": 0.2359, "step": 31140 }, { "epoch": 2.4311246390384764, "grad_norm": 0.5951895117759705, "learning_rate": 1.0562063758782941e-05, "loss": 0.0005, "step": 31150 }, { "epoch": 2.4319050963864823, "grad_norm": 1.0225551960729717e-07, "learning_rate": 1.0534169254189619e-05, "loss": 0.0964, "step": 31160 }, { "epoch": 2.4326855537344882, "grad_norm": 4.678537830586638e-13, "learning_rate": 1.0506307295347084e-05, "loss": 0.0045, "step": 31170 }, { "epoch": 2.433466011082494, "grad_norm": 1.8254380229620892e-09, "learning_rate": 1.0478477905231898e-05, "loss": 0.023, "step": 31180 }, { "epoch": 2.4342464684305005, "grad_norm": 7.81493781687459e-06, "learning_rate": 1.0450681106793736e-05, "loss": 0.3291, "step": 31190 }, { "epoch": 2.435026925778506, "grad_norm": 0.00012367757153697312, "learning_rate": 1.0422916922955389e-05, "loss": 0.0085, "step": 31200 }, { "epoch": 2.4358073831265123, "grad_norm": 8.182890930523303e-19, "learning_rate": 1.0395185376612792e-05, "loss": 0.0001, "step": 31210 }, { "epoch": 2.436587840474518, "grad_norm": 1.1366774882448638e-10, "learning_rate": 1.0367486490634931e-05, "loss": 0.0005, "step": 31220 }, { "epoch": 2.437368297822524, "grad_norm": 1.5081602679155293e-15, "learning_rate": 1.0339820287863893e-05, "loss": 0.0181, "step": 31230 }, { "epoch": 2.43814875517053, "grad_norm": 0.0, "learning_rate": 1.0312186791114759e-05, "loss": 0.0005, "step": 31240 }, { "epoch": 2.438929212518536, "grad_norm": 9.464818140259013e-05, "learning_rate": 1.0284586023175702e-05, "loss": 0.0002, "step": 31250 }, { "epoch": 2.439709669866542, "grad_norm": 1.1910776720469585e-06, "learning_rate": 1.0257018006807834e-05, "loss": 0.0178, "step": 31260 }, { "epoch": 2.440490127214548, "grad_norm": 249.18057250976562, "learning_rate": 1.0229482764745335e-05, "loss": 1.9, "step": 31270 }, { "epoch": 2.4412705845625537, "grad_norm": 406.0775146484375, "learning_rate": 1.0201980319695281e-05, "loss": 1.9826, "step": 31280 }, { "epoch": 2.4420510419105597, "grad_norm": 1.7052905633673779e-15, "learning_rate": 1.0174510694337769e-05, "loss": 0.0, "step": 31290 }, { "epoch": 2.4428314992585656, "grad_norm": 4.269708063349512e-16, "learning_rate": 1.01470739113258e-05, "loss": 0.0, "step": 31300 }, { "epoch": 2.4436119566065715, "grad_norm": 11.617895126342773, "learning_rate": 1.0119669993285275e-05, "loss": 0.0043, "step": 31310 }, { "epoch": 2.4443924139545774, "grad_norm": 0.05410260707139969, "learning_rate": 1.0092298962815039e-05, "loss": 0.4511, "step": 31320 }, { "epoch": 2.4451728713025833, "grad_norm": 1.7712960243225098, "learning_rate": 1.0064960842486754e-05, "loss": 0.0008, "step": 31330 }, { "epoch": 2.4459533286505892, "grad_norm": 0.020819485187530518, "learning_rate": 1.0037655654845008e-05, "loss": 0.0003, "step": 31340 }, { "epoch": 2.446733785998595, "grad_norm": 185.15899658203125, "learning_rate": 1.001038342240717e-05, "loss": 0.6657, "step": 31350 }, { "epoch": 2.447514243346601, "grad_norm": 0.0012188347754999995, "learning_rate": 9.983144167663483e-06, "loss": 0.0004, "step": 31360 }, { "epoch": 2.448294700694607, "grad_norm": 4.041361535200849e-06, "learning_rate": 9.955937913076962e-06, "loss": 0.0002, "step": 31370 }, { "epoch": 2.449075158042613, "grad_norm": 3.6336111897128376e-10, "learning_rate": 9.928764681083442e-06, "loss": 0.0032, "step": 31380 }, { "epoch": 2.449855615390619, "grad_norm": 0.0001976362691493705, "learning_rate": 9.901624494091483e-06, "loss": 0.3962, "step": 31390 }, { "epoch": 2.4506360727386247, "grad_norm": 1.16826257001712e-10, "learning_rate": 9.874517374482405e-06, "loss": 0.0, "step": 31400 }, { "epoch": 2.4514165300866306, "grad_norm": 0.0007720371941104531, "learning_rate": 9.847443344610297e-06, "loss": 1.6812, "step": 31410 }, { "epoch": 2.4521969874346365, "grad_norm": 176.58828735351562, "learning_rate": 9.8204024268019e-06, "loss": 0.9513, "step": 31420 }, { "epoch": 2.4529774447826425, "grad_norm": 8.960714693745186e-22, "learning_rate": 9.793394643356706e-06, "loss": 0.1805, "step": 31430 }, { "epoch": 2.4537579021306484, "grad_norm": 3.0730760158803605e-08, "learning_rate": 9.76642001654684e-06, "loss": 0.1001, "step": 31440 }, { "epoch": 2.4545383594786543, "grad_norm": 4.813366300551783e-17, "learning_rate": 9.739478568617106e-06, "loss": 0.0689, "step": 31450 }, { "epoch": 2.4553188168266606, "grad_norm": 0.0002698384050745517, "learning_rate": 9.712570321784947e-06, "loss": 0.0106, "step": 31460 }, { "epoch": 2.456099274174666, "grad_norm": 2.8609814961555458e-08, "learning_rate": 9.685695298240432e-06, "loss": 0.0, "step": 31470 }, { "epoch": 2.4568797315226725, "grad_norm": 3.8634730969988595e-08, "learning_rate": 9.658853520146205e-06, "loss": 0.0678, "step": 31480 }, { "epoch": 2.4576601888706784, "grad_norm": 3.986447438819596e-07, "learning_rate": 9.632045009637502e-06, "loss": 2.0, "step": 31490 }, { "epoch": 2.4584406462186843, "grad_norm": 7.760418156976812e-06, "learning_rate": 9.605269788822163e-06, "loss": 1.8596, "step": 31500 }, { "epoch": 2.45922110356669, "grad_norm": 2.0599917605057527e-11, "learning_rate": 9.578527879780514e-06, "loss": 0.0004, "step": 31510 }, { "epoch": 2.460001560914696, "grad_norm": 1.1984980031792738e-14, "learning_rate": 9.551819304565457e-06, "loss": 0.838, "step": 31520 }, { "epoch": 2.460782018262702, "grad_norm": 5.388801946537569e-05, "learning_rate": 9.525144085202387e-06, "loss": 0.2957, "step": 31530 }, { "epoch": 2.461562475610708, "grad_norm": 3.7882412584622216e-07, "learning_rate": 9.498502243689217e-06, "loss": 0.0093, "step": 31540 }, { "epoch": 2.462342932958714, "grad_norm": 5.491552443901218e-14, "learning_rate": 9.471893801996274e-06, "loss": 0.3542, "step": 31550 }, { "epoch": 2.46312339030672, "grad_norm": 4.296512543078279e-06, "learning_rate": 9.445318782066415e-06, "loss": 0.0958, "step": 31560 }, { "epoch": 2.4639038476547257, "grad_norm": 1.0587911840678754e-22, "learning_rate": 9.418777205814877e-06, "loss": 1.1193, "step": 31570 }, { "epoch": 2.4646843050027316, "grad_norm": 3.066175979782315e-09, "learning_rate": 9.392269095129357e-06, "loss": 0.2969, "step": 31580 }, { "epoch": 2.4654647623507375, "grad_norm": 0.6269698143005371, "learning_rate": 9.365794471869922e-06, "loss": 0.0051, "step": 31590 }, { "epoch": 2.4662452196987434, "grad_norm": 0.03895184397697449, "learning_rate": 9.339353357869052e-06, "loss": 0.9741, "step": 31600 }, { "epoch": 2.4670256770467494, "grad_norm": 0.0293387733399868, "learning_rate": 9.312945774931587e-06, "loss": 0.0065, "step": 31610 }, { "epoch": 2.4678061343947553, "grad_norm": 0.0004621434782166034, "learning_rate": 9.286571744834693e-06, "loss": 0.0901, "step": 31620 }, { "epoch": 2.468586591742761, "grad_norm": 2.129108174486749e-11, "learning_rate": 9.260231289327908e-06, "loss": 0.0, "step": 31630 }, { "epoch": 2.469367049090767, "grad_norm": 0.00019757759582716972, "learning_rate": 9.233924430133023e-06, "loss": 0.0, "step": 31640 }, { "epoch": 2.470147506438773, "grad_norm": 0.03482899069786072, "learning_rate": 9.207651188944195e-06, "loss": 0.0107, "step": 31650 }, { "epoch": 2.470927963786779, "grad_norm": 1.669070548437901e-09, "learning_rate": 9.181411587427791e-06, "loss": 0.0, "step": 31660 }, { "epoch": 2.471708421134785, "grad_norm": 0.3998055160045624, "learning_rate": 9.155205647222482e-06, "loss": 5.0276, "step": 31670 }, { "epoch": 2.4724888784827908, "grad_norm": 0.0005485009751282632, "learning_rate": 9.129033389939168e-06, "loss": 0.054, "step": 31680 }, { "epoch": 2.4732693358307967, "grad_norm": 0.23071005940437317, "learning_rate": 9.102894837160959e-06, "loss": 0.0005, "step": 31690 }, { "epoch": 2.4740497931788026, "grad_norm": 2.5303051687448497e-18, "learning_rate": 9.076790010443193e-06, "loss": 0.925, "step": 31700 }, { "epoch": 2.474830250526809, "grad_norm": 2.3178323149686264e-10, "learning_rate": 9.050718931313363e-06, "loss": 0.0, "step": 31710 }, { "epoch": 2.4756107078748144, "grad_norm": 184.19573974609375, "learning_rate": 9.024681621271175e-06, "loss": 0.7901, "step": 31720 }, { "epoch": 2.4763911652228208, "grad_norm": 7.937559030196661e-13, "learning_rate": 8.998678101788443e-06, "loss": 1.1143, "step": 31730 }, { "epoch": 2.4771716225708267, "grad_norm": 9.148439860251758e-21, "learning_rate": 8.97270839430916e-06, "loss": 0.787, "step": 31740 }, { "epoch": 2.4779520799188326, "grad_norm": 5.923636147570077e-12, "learning_rate": 8.946772520249385e-06, "loss": 0.1406, "step": 31750 }, { "epoch": 2.4787325372668385, "grad_norm": 228.17822265625, "learning_rate": 8.92087050099732e-06, "loss": 0.5815, "step": 31760 }, { "epoch": 2.4795129946148444, "grad_norm": 6.485000118286108e-14, "learning_rate": 8.895002357913234e-06, "loss": 0.011, "step": 31770 }, { "epoch": 2.4802934519628503, "grad_norm": 11.712849617004395, "learning_rate": 8.869168112329441e-06, "loss": 0.2275, "step": 31780 }, { "epoch": 2.4810739093108563, "grad_norm": 0.0017456557834520936, "learning_rate": 8.843367785550345e-06, "loss": 0.0016, "step": 31790 }, { "epoch": 2.481854366658862, "grad_norm": 9.026371117215604e-06, "learning_rate": 8.817601398852315e-06, "loss": 0.0231, "step": 31800 }, { "epoch": 2.482634824006868, "grad_norm": 1.0750035528417357e-08, "learning_rate": 8.791868973483803e-06, "loss": 0.0095, "step": 31810 }, { "epoch": 2.483415281354874, "grad_norm": 0.197080597281456, "learning_rate": 8.766170530665185e-06, "loss": 0.0644, "step": 31820 }, { "epoch": 2.48419573870288, "grad_norm": 0.05906852334737778, "learning_rate": 8.740506091588868e-06, "loss": 0.0, "step": 31830 }, { "epoch": 2.484976196050886, "grad_norm": 6.384251491908799e-07, "learning_rate": 8.714875677419192e-06, "loss": 0.1455, "step": 31840 }, { "epoch": 2.4857566533988917, "grad_norm": 6.302929966145454e-12, "learning_rate": 8.689279309292459e-06, "loss": 1.0475, "step": 31850 }, { "epoch": 2.4865371107468976, "grad_norm": 3.5742025135193104e-20, "learning_rate": 8.663717008316847e-06, "loss": 0.6127, "step": 31860 }, { "epoch": 2.4873175680949036, "grad_norm": 5.618056475760469e-13, "learning_rate": 8.6381887955725e-06, "loss": 0.0023, "step": 31870 }, { "epoch": 2.4880980254429095, "grad_norm": 2.1841951536316628e-07, "learning_rate": 8.612694692111412e-06, "loss": 2.6341, "step": 31880 }, { "epoch": 2.4888784827909154, "grad_norm": 2.8184949929510594e-09, "learning_rate": 8.587234718957443e-06, "loss": 0.6065, "step": 31890 }, { "epoch": 2.4896589401389213, "grad_norm": 8.832481384277344, "learning_rate": 8.561808897106339e-06, "loss": 0.0034, "step": 31900 }, { "epoch": 2.490439397486927, "grad_norm": 3.9514170566690154e-06, "learning_rate": 8.536417247525663e-06, "loss": 0.6379, "step": 31910 }, { "epoch": 2.491219854834933, "grad_norm": 3.576242191272172e-09, "learning_rate": 8.511059791154819e-06, "loss": 0.0021, "step": 31920 }, { "epoch": 2.492000312182939, "grad_norm": 4.6747011595016374e-08, "learning_rate": 8.485736548904966e-06, "loss": 0.2805, "step": 31930 }, { "epoch": 2.492780769530945, "grad_norm": 7.914739398984239e-06, "learning_rate": 8.460447541659111e-06, "loss": 0.1509, "step": 31940 }, { "epoch": 2.493561226878951, "grad_norm": 7.899691978656152e-17, "learning_rate": 8.435192790271967e-06, "loss": 0.5303, "step": 31950 }, { "epoch": 2.4943416842269572, "grad_norm": 10.499479293823242, "learning_rate": 8.409972315570047e-06, "loss": 0.0055, "step": 31960 }, { "epoch": 2.4951221415749627, "grad_norm": 9.64181890594773e-06, "learning_rate": 8.384786138351563e-06, "loss": 0.1455, "step": 31970 }, { "epoch": 2.495902598922969, "grad_norm": 8.813726992684678e-08, "learning_rate": 8.359634279386464e-06, "loss": 0.0, "step": 31980 }, { "epoch": 2.496683056270975, "grad_norm": 1.7394331530695695e-09, "learning_rate": 8.334516759416405e-06, "loss": 0.852, "step": 31990 }, { "epoch": 2.497463513618981, "grad_norm": 0.025341708213090897, "learning_rate": 8.309433599154682e-06, "loss": 1.4413, "step": 32000 }, { "epoch": 2.498243970966987, "grad_norm": 2.502560505490692e-07, "learning_rate": 8.284384819286317e-06, "loss": 0.0, "step": 32010 }, { "epoch": 2.4990244283149927, "grad_norm": 278.2901306152344, "learning_rate": 8.259370440467918e-06, "loss": 0.8205, "step": 32020 }, { "epoch": 2.4998048856629986, "grad_norm": 5.790270662942021e-08, "learning_rate": 8.234390483327775e-06, "loss": 0.9642, "step": 32030 }, { "epoch": 2.5005853430110045, "grad_norm": 3.1117013410408845e-09, "learning_rate": 8.209444968465745e-06, "loss": 0.8302, "step": 32040 }, { "epoch": 2.5013658003590105, "grad_norm": 1.0056027191507333e-12, "learning_rate": 8.184533916453341e-06, "loss": 1.7611, "step": 32050 }, { "epoch": 2.5021462577070164, "grad_norm": 1.1364232705091126e-05, "learning_rate": 8.159657347833588e-06, "loss": 0.0, "step": 32060 }, { "epoch": 2.5029267150550223, "grad_norm": 0.0008929189061746001, "learning_rate": 8.134815283121128e-06, "loss": 0.0043, "step": 32070 }, { "epoch": 2.503707172403028, "grad_norm": 1.3651838508879536e-21, "learning_rate": 8.110007742802134e-06, "loss": 0.009, "step": 32080 }, { "epoch": 2.504487629751034, "grad_norm": 11.406990051269531, "learning_rate": 8.08523474733428e-06, "loss": 0.09, "step": 32090 }, { "epoch": 2.50526808709904, "grad_norm": 25.157150268554688, "learning_rate": 8.060496317146809e-06, "loss": 0.0139, "step": 32100 }, { "epoch": 2.506048544447046, "grad_norm": 1.541529672977049e-05, "learning_rate": 8.035792472640392e-06, "loss": 0.9274, "step": 32110 }, { "epoch": 2.506829001795052, "grad_norm": 3.0823123324807966e-06, "learning_rate": 8.01112323418724e-06, "loss": 0.0001, "step": 32120 }, { "epoch": 2.5076094591430578, "grad_norm": 0.08010544627904892, "learning_rate": 7.986488622130973e-06, "loss": 0.0092, "step": 32130 }, { "epoch": 2.5083899164910637, "grad_norm": 1.5729060010016838e-07, "learning_rate": 7.961888656786697e-06, "loss": 0.5046, "step": 32140 }, { "epoch": 2.5091703738390696, "grad_norm": 4.572706836356748e-16, "learning_rate": 7.937323358440935e-06, "loss": 0.2813, "step": 32150 }, { "epoch": 2.5099508311870755, "grad_norm": 9.056230293675097e-14, "learning_rate": 7.912792747351628e-06, "loss": 0.4358, "step": 32160 }, { "epoch": 2.5107312885350814, "grad_norm": 2.8794074763416333e-11, "learning_rate": 7.888296843748083e-06, "loss": 0.0975, "step": 32170 }, { "epoch": 2.5115117458830873, "grad_norm": 2.362916262654835e-08, "learning_rate": 7.863835667831e-06, "loss": 0.26, "step": 32180 }, { "epoch": 2.5122922032310933, "grad_norm": 3.7175784399514656e-19, "learning_rate": 7.83940923977246e-06, "loss": 0.5833, "step": 32190 }, { "epoch": 2.513072660579099, "grad_norm": 278.1390075683594, "learning_rate": 7.815017579715861e-06, "loss": 0.2599, "step": 32200 }, { "epoch": 2.5138531179271055, "grad_norm": 1.0799265792967944e-18, "learning_rate": 7.790660707775949e-06, "loss": 1.35, "step": 32210 }, { "epoch": 2.514633575275111, "grad_norm": 0.2948494851589203, "learning_rate": 7.766338644038773e-06, "loss": 0.2754, "step": 32220 }, { "epoch": 2.5154140326231174, "grad_norm": 0.0003002735029440373, "learning_rate": 7.742051408561696e-06, "loss": 0.1201, "step": 32230 }, { "epoch": 2.516194489971123, "grad_norm": 4.7347893996629864e-05, "learning_rate": 7.717799021373313e-06, "loss": 0.0146, "step": 32240 }, { "epoch": 2.516974947319129, "grad_norm": 21.485971450805664, "learning_rate": 7.693581502473541e-06, "loss": 0.0367, "step": 32250 }, { "epoch": 2.517755404667135, "grad_norm": 252.40081787109375, "learning_rate": 7.669398871833494e-06, "loss": 1.1277, "step": 32260 }, { "epoch": 2.518535862015141, "grad_norm": 38.04701614379883, "learning_rate": 7.645251149395521e-06, "loss": 0.3047, "step": 32270 }, { "epoch": 2.519316319363147, "grad_norm": 0.007141559850424528, "learning_rate": 7.621138355073232e-06, "loss": 0.8679, "step": 32280 }, { "epoch": 2.520096776711153, "grad_norm": 1.3814762667152536e-07, "learning_rate": 7.597060508751347e-06, "loss": 0.0007, "step": 32290 }, { "epoch": 2.5208772340591588, "grad_norm": 0.06472454220056534, "learning_rate": 7.573017630285873e-06, "loss": 0.0, "step": 32300 }, { "epoch": 2.5216576914071647, "grad_norm": 0.00029342470224946737, "learning_rate": 7.549009739503887e-06, "loss": 0.0267, "step": 32310 }, { "epoch": 2.5224381487551706, "grad_norm": 3.442346496740356e-07, "learning_rate": 7.5250368562036765e-06, "loss": 1.2629, "step": 32320 }, { "epoch": 2.5232186061031765, "grad_norm": 2.6451106760374665e-15, "learning_rate": 7.5010990001546045e-06, "loss": 0.7777, "step": 32330 }, { "epoch": 2.5239990634511824, "grad_norm": 0.0022019429598003626, "learning_rate": 7.477196191097208e-06, "loss": 0.1655, "step": 32340 }, { "epoch": 2.5247795207991883, "grad_norm": 0.019295230507850647, "learning_rate": 7.453328448743058e-06, "loss": 0.9781, "step": 32350 }, { "epoch": 2.5255599781471942, "grad_norm": 179.39305114746094, "learning_rate": 7.4294957927748765e-06, "loss": 0.366, "step": 32360 }, { "epoch": 2.5263404354952, "grad_norm": 2.07561828835523e-10, "learning_rate": 7.405698242846387e-06, "loss": 0.0592, "step": 32370 }, { "epoch": 2.527120892843206, "grad_norm": 1.138383150100708, "learning_rate": 7.381935818582403e-06, "loss": 0.1496, "step": 32380 }, { "epoch": 2.527901350191212, "grad_norm": 1.7265396223820062e-08, "learning_rate": 7.358208539578771e-06, "loss": 0.0397, "step": 32390 }, { "epoch": 2.528681807539218, "grad_norm": 1.3316818581188272e-07, "learning_rate": 7.334516425402327e-06, "loss": 0.1137, "step": 32400 }, { "epoch": 2.529462264887224, "grad_norm": 63.99897766113281, "learning_rate": 7.3108594955909434e-06, "loss": 0.0305, "step": 32410 }, { "epoch": 2.5302427222352297, "grad_norm": 2.53629514190834e-05, "learning_rate": 7.287237769653438e-06, "loss": 0.0018, "step": 32420 }, { "epoch": 2.5310231795832356, "grad_norm": 0.06212920323014259, "learning_rate": 7.263651267069643e-06, "loss": 0.005, "step": 32430 }, { "epoch": 2.5318036369312416, "grad_norm": 7.825429548802276e-08, "learning_rate": 7.24010000729029e-06, "loss": 0.6747, "step": 32440 }, { "epoch": 2.5325840942792475, "grad_norm": 43.8482666015625, "learning_rate": 7.216584009737093e-06, "loss": 0.1808, "step": 32450 }, { "epoch": 2.533364551627254, "grad_norm": 2.1205712741334537e-08, "learning_rate": 7.193103293802683e-06, "loss": 0.3964, "step": 32460 }, { "epoch": 2.5341450089752593, "grad_norm": 0.0007450974080711603, "learning_rate": 7.169657878850561e-06, "loss": 0.0, "step": 32470 }, { "epoch": 2.5349254663232657, "grad_norm": 0.0016309081111103296, "learning_rate": 7.146247784215154e-06, "loss": 0.016, "step": 32480 }, { "epoch": 2.535705923671271, "grad_norm": 0.005112816579639912, "learning_rate": 7.122873029201732e-06, "loss": 0.0002, "step": 32490 }, { "epoch": 2.5364863810192775, "grad_norm": 8.899023669073358e-07, "learning_rate": 7.099533633086458e-06, "loss": 0.0007, "step": 32500 }, { "epoch": 2.537266838367283, "grad_norm": 15.795042037963867, "learning_rate": 7.076229615116292e-06, "loss": 0.4537, "step": 32510 }, { "epoch": 2.5380472957152893, "grad_norm": 0.6787014603614807, "learning_rate": 7.052960994509056e-06, "loss": 1.7144, "step": 32520 }, { "epoch": 2.5388277530632952, "grad_norm": 1.6081843057236256e-07, "learning_rate": 7.029727790453356e-06, "loss": 1.2472, "step": 32530 }, { "epoch": 2.539608210411301, "grad_norm": 0.22756808996200562, "learning_rate": 7.0065300221086316e-06, "loss": 0.329, "step": 32540 }, { "epoch": 2.540388667759307, "grad_norm": 0.0002664399507921189, "learning_rate": 6.983367708605032e-06, "loss": 0.6846, "step": 32550 }, { "epoch": 2.541169125107313, "grad_norm": 5.893477705853911e-08, "learning_rate": 6.960240869043544e-06, "loss": 0.4668, "step": 32560 }, { "epoch": 2.541949582455319, "grad_norm": 1.531070239479959e-07, "learning_rate": 6.9371495224958445e-06, "loss": 0.1799, "step": 32570 }, { "epoch": 2.542730039803325, "grad_norm": 2.5454155450077565e-10, "learning_rate": 6.9140936880043525e-06, "loss": 0.0, "step": 32580 }, { "epoch": 2.5435104971513307, "grad_norm": 279.1732177734375, "learning_rate": 6.891073384582231e-06, "loss": 1.7055, "step": 32590 }, { "epoch": 2.5442909544993366, "grad_norm": 2.79231595993042, "learning_rate": 6.8680886312132985e-06, "loss": 0.0008, "step": 32600 }, { "epoch": 2.5450714118473425, "grad_norm": 8.424194675171748e-05, "learning_rate": 6.845139446852095e-06, "loss": 0.0344, "step": 32610 }, { "epoch": 2.5458518691953484, "grad_norm": 1.6954788861767156e-06, "learning_rate": 6.822225850423808e-06, "loss": 1.2532, "step": 32620 }, { "epoch": 2.5466323265433544, "grad_norm": 5.65844038646901e-06, "learning_rate": 6.799347860824301e-06, "loss": 1.1134, "step": 32630 }, { "epoch": 2.5474127838913603, "grad_norm": 8.109286083347184e-14, "learning_rate": 6.776505496920022e-06, "loss": 0.0154, "step": 32640 }, { "epoch": 2.548193241239366, "grad_norm": 243.0125732421875, "learning_rate": 6.753698777548095e-06, "loss": 0.5607, "step": 32650 }, { "epoch": 2.548973698587372, "grad_norm": 5.170684858057939e-07, "learning_rate": 6.730927721516228e-06, "loss": 0.6427, "step": 32660 }, { "epoch": 2.549754155935378, "grad_norm": 0.2602443993091583, "learning_rate": 6.7081923476026985e-06, "loss": 0.464, "step": 32670 }, { "epoch": 2.550534613283384, "grad_norm": 3.0131985113257542e-05, "learning_rate": 6.685492674556393e-06, "loss": 0.0022, "step": 32680 }, { "epoch": 2.55131507063139, "grad_norm": 4.961176469642226e-10, "learning_rate": 6.6628287210967425e-06, "loss": 0.2164, "step": 32690 }, { "epoch": 2.5520955279793958, "grad_norm": 244.85946655273438, "learning_rate": 6.640200505913735e-06, "loss": 0.4519, "step": 32700 }, { "epoch": 2.552875985327402, "grad_norm": 5.325290203472832e-06, "learning_rate": 6.617608047667845e-06, "loss": 0.0001, "step": 32710 }, { "epoch": 2.5536564426754076, "grad_norm": 0.0005364320240914822, "learning_rate": 6.595051364990113e-06, "loss": 1.861, "step": 32720 }, { "epoch": 2.554436900023414, "grad_norm": 0.003387443022802472, "learning_rate": 6.5725304764820305e-06, "loss": 0.6986, "step": 32730 }, { "epoch": 2.5552173573714194, "grad_norm": 1.974759911149704e-08, "learning_rate": 6.550045400715621e-06, "loss": 0.0, "step": 32740 }, { "epoch": 2.5559978147194258, "grad_norm": 0.0022972319275140762, "learning_rate": 6.527596156233312e-06, "loss": 0.0008, "step": 32750 }, { "epoch": 2.5567782720674312, "grad_norm": 0.0009155732113867998, "learning_rate": 6.50518276154804e-06, "loss": 0.0948, "step": 32760 }, { "epoch": 2.5575587294154376, "grad_norm": 1.2502309409967438e-08, "learning_rate": 6.4828052351431634e-06, "loss": 3.3757, "step": 32770 }, { "epoch": 2.5583391867634435, "grad_norm": 0.009155502542853355, "learning_rate": 6.460463595472427e-06, "loss": 1.0454, "step": 32780 }, { "epoch": 2.5591196441114494, "grad_norm": 2.0515193455139524e-07, "learning_rate": 6.438157860960026e-06, "loss": 0.0043, "step": 32790 }, { "epoch": 2.5599001014594553, "grad_norm": 2.145350492810394e-07, "learning_rate": 6.415888050000518e-06, "loss": 0.0004, "step": 32800 }, { "epoch": 2.5606805588074613, "grad_norm": 5.182963036531874e-07, "learning_rate": 6.393654180958858e-06, "loss": 2.2282, "step": 32810 }, { "epoch": 2.561461016155467, "grad_norm": 0.0, "learning_rate": 6.371456272170329e-06, "loss": 0.2464, "step": 32820 }, { "epoch": 2.562241473503473, "grad_norm": 2.9713309590562176e-09, "learning_rate": 6.349294341940593e-06, "loss": 0.0006, "step": 32830 }, { "epoch": 2.563021930851479, "grad_norm": 9.95053581090248e-14, "learning_rate": 6.32716840854562e-06, "loss": 0.0, "step": 32840 }, { "epoch": 2.563802388199485, "grad_norm": 1.3272781290348272e-16, "learning_rate": 6.305078490231725e-06, "loss": 2.0757, "step": 32850 }, { "epoch": 2.564582845547491, "grad_norm": 1.2425418915995579e-08, "learning_rate": 6.283024605215482e-06, "loss": 0.0013, "step": 32860 }, { "epoch": 2.5653633028954967, "grad_norm": 8.812655466483577e-12, "learning_rate": 6.261006771683764e-06, "loss": 0.0145, "step": 32870 }, { "epoch": 2.5661437602435027, "grad_norm": 0.012916338630020618, "learning_rate": 6.2390250077937305e-06, "loss": 0.4857, "step": 32880 }, { "epoch": 2.5669242175915086, "grad_norm": 106.12604522705078, "learning_rate": 6.217079331672776e-06, "loss": 0.0524, "step": 32890 }, { "epoch": 2.5677046749395145, "grad_norm": 0.001981984591111541, "learning_rate": 6.1951697614185566e-06, "loss": 2.1622, "step": 32900 }, { "epoch": 2.5684851322875204, "grad_norm": 3.305194388580207e-14, "learning_rate": 6.17329631509892e-06, "loss": 0.1398, "step": 32910 }, { "epoch": 2.5692655896355263, "grad_norm": 0.002218644367530942, "learning_rate": 6.151459010751953e-06, "loss": 1.2993, "step": 32920 }, { "epoch": 2.5700460469835322, "grad_norm": 0.00015026754408609122, "learning_rate": 6.129657866385935e-06, "loss": 0.0168, "step": 32930 }, { "epoch": 2.570826504331538, "grad_norm": 0.0012799123069271445, "learning_rate": 6.107892899979323e-06, "loss": 0.9698, "step": 32940 }, { "epoch": 2.571606961679544, "grad_norm": 1.7073870800261154e-13, "learning_rate": 6.086164129480731e-06, "loss": 0.5255, "step": 32950 }, { "epoch": 2.5723874190275504, "grad_norm": 0.008531242609024048, "learning_rate": 6.064471572808916e-06, "loss": 0.0001, "step": 32960 }, { "epoch": 2.573167876375556, "grad_norm": 4.434143647813471e-06, "learning_rate": 6.042815247852812e-06, "loss": 0.0191, "step": 32970 }, { "epoch": 2.5739483337235622, "grad_norm": 2.1988906860351562, "learning_rate": 6.021195172471411e-06, "loss": 0.3835, "step": 32980 }, { "epoch": 2.5747287910715677, "grad_norm": 1.283716052125892e-21, "learning_rate": 5.999611364493868e-06, "loss": 0.0012, "step": 32990 }, { "epoch": 2.575509248419574, "grad_norm": 3.2552881240844727, "learning_rate": 5.978063841719411e-06, "loss": 0.0014, "step": 33000 }, { "epoch": 2.5762897057675795, "grad_norm": 6.644073498703528e-17, "learning_rate": 5.956552621917344e-06, "loss": 0.6411, "step": 33010 }, { "epoch": 2.577070163115586, "grad_norm": 3.554277181625366, "learning_rate": 5.9350777228270205e-06, "loss": 0.0032, "step": 33020 }, { "epoch": 2.577850620463592, "grad_norm": 0.00010075556201627478, "learning_rate": 5.913639162157869e-06, "loss": 0.0, "step": 33030 }, { "epoch": 2.5786310778115977, "grad_norm": 0.03173457086086273, "learning_rate": 5.89223695758932e-06, "loss": 0.0478, "step": 33040 }, { "epoch": 2.5794115351596036, "grad_norm": 2.581108341382504e-13, "learning_rate": 5.870871126770855e-06, "loss": 0.9232, "step": 33050 }, { "epoch": 2.5801919925076096, "grad_norm": 1.1653195315375342e-06, "learning_rate": 5.84954168732193e-06, "loss": 1.0711, "step": 33060 }, { "epoch": 2.5809724498556155, "grad_norm": 6.896935658473965e-20, "learning_rate": 5.828248656832003e-06, "loss": 0.1499, "step": 33070 }, { "epoch": 2.5817529072036214, "grad_norm": 1.7919608354568481, "learning_rate": 5.806992052860533e-06, "loss": 0.6013, "step": 33080 }, { "epoch": 2.5825333645516273, "grad_norm": 0.0004548436845652759, "learning_rate": 5.785771892936881e-06, "loss": 0.4141, "step": 33090 }, { "epoch": 2.583313821899633, "grad_norm": 0.0, "learning_rate": 5.7645881945604165e-06, "loss": 0.0, "step": 33100 }, { "epoch": 2.584094279247639, "grad_norm": 0.13170254230499268, "learning_rate": 5.743440975200393e-06, "loss": 0.0024, "step": 33110 }, { "epoch": 2.584874736595645, "grad_norm": 1.1884622721169101e-14, "learning_rate": 5.7223302522960154e-06, "loss": 0.7057, "step": 33120 }, { "epoch": 2.585655193943651, "grad_norm": 1.8453439460091453e-13, "learning_rate": 5.701256043256359e-06, "loss": 0.0, "step": 33130 }, { "epoch": 2.586435651291657, "grad_norm": 1.1516529049848678e-11, "learning_rate": 5.680218365460416e-06, "loss": 0.3896, "step": 33140 }, { "epoch": 2.587216108639663, "grad_norm": 0.0056893001310527325, "learning_rate": 5.659217236257059e-06, "loss": 0.6519, "step": 33150 }, { "epoch": 2.5879965659876687, "grad_norm": 1.4390453770829481e-06, "learning_rate": 5.638252672964972e-06, "loss": 0.042, "step": 33160 }, { "epoch": 2.5887770233356746, "grad_norm": 1.7544034562888555e-05, "learning_rate": 5.617324692872744e-06, "loss": 0.0018, "step": 33170 }, { "epoch": 2.5895574806836805, "grad_norm": 1.6007531881332397, "learning_rate": 5.596433313238747e-06, "loss": 0.5041, "step": 33180 }, { "epoch": 2.5903379380316864, "grad_norm": 6.1260161388076995e-09, "learning_rate": 5.575578551291211e-06, "loss": 0.0, "step": 33190 }, { "epoch": 2.5911183953796924, "grad_norm": 2.927170917011307e-18, "learning_rate": 5.554760424228128e-06, "loss": 0.0231, "step": 33200 }, { "epoch": 2.5918988527276983, "grad_norm": 2.057391405105591, "learning_rate": 5.533978949217322e-06, "loss": 0.0007, "step": 33210 }, { "epoch": 2.592679310075704, "grad_norm": 0.004864888731390238, "learning_rate": 5.513234143396351e-06, "loss": 0.4181, "step": 33220 }, { "epoch": 2.5934597674237105, "grad_norm": 1.0975718760502379e-13, "learning_rate": 5.492526023872563e-06, "loss": 0.0695, "step": 33230 }, { "epoch": 2.594240224771716, "grad_norm": 3.326022692817787e-07, "learning_rate": 5.471854607723048e-06, "loss": 0.0276, "step": 33240 }, { "epoch": 2.5950206821197224, "grad_norm": 4.2327101823502744e-07, "learning_rate": 5.451219911994604e-06, "loss": 0.0015, "step": 33250 }, { "epoch": 2.595801139467728, "grad_norm": 1.8893464584834874e-05, "learning_rate": 5.430621953703785e-06, "loss": 1.1529, "step": 33260 }, { "epoch": 2.596581596815734, "grad_norm": 8.914184945751913e-06, "learning_rate": 5.410060749836809e-06, "loss": 0.118, "step": 33270 }, { "epoch": 2.59736205416374, "grad_norm": 0.07775567471981049, "learning_rate": 5.389536317349625e-06, "loss": 0.6268, "step": 33280 }, { "epoch": 2.598142511511746, "grad_norm": 9.568450032304554e-09, "learning_rate": 5.3690486731678205e-06, "loss": 0.5825, "step": 33290 }, { "epoch": 2.598922968859752, "grad_norm": 261.511474609375, "learning_rate": 5.3485978341866694e-06, "loss": 2.9511, "step": 33300 }, { "epoch": 2.599703426207758, "grad_norm": 7.09142057075951e-07, "learning_rate": 5.328183817271093e-06, "loss": 0.0, "step": 33310 }, { "epoch": 2.6004838835557638, "grad_norm": 238.4812469482422, "learning_rate": 5.307806639255642e-06, "loss": 0.6745, "step": 33320 }, { "epoch": 2.6012643409037697, "grad_norm": 2.5221299624256146e-15, "learning_rate": 5.287466316944473e-06, "loss": 0.7283, "step": 33330 }, { "epoch": 2.6020447982517756, "grad_norm": 0.31384918093681335, "learning_rate": 5.267162867111386e-06, "loss": 0.0121, "step": 33340 }, { "epoch": 2.6028252555997815, "grad_norm": 7.826183718862012e-07, "learning_rate": 5.246896306499738e-06, "loss": 0.5504, "step": 33350 }, { "epoch": 2.6036057129477874, "grad_norm": 5.406313174916022e-09, "learning_rate": 5.22666665182247e-06, "loss": 0.9671, "step": 33360 }, { "epoch": 2.6043861702957933, "grad_norm": 9.942142042973501e-08, "learning_rate": 5.206473919762106e-06, "loss": 0.0348, "step": 33370 }, { "epoch": 2.6051666276437992, "grad_norm": 6.00198575284594e-08, "learning_rate": 5.186318126970707e-06, "loss": 0.0, "step": 33380 }, { "epoch": 2.605947084991805, "grad_norm": 2.772690181328507e-18, "learning_rate": 5.166199290069895e-06, "loss": 0.7578, "step": 33390 }, { "epoch": 2.606727542339811, "grad_norm": 3.210815906524658, "learning_rate": 5.14611742565077e-06, "loss": 1.0131, "step": 33400 }, { "epoch": 2.607507999687817, "grad_norm": 0.0006296706851571798, "learning_rate": 5.126072550274003e-06, "loss": 0.0885, "step": 33410 }, { "epoch": 2.608288457035823, "grad_norm": 2.116186692546762e-08, "learning_rate": 5.106064680469697e-06, "loss": 0.5514, "step": 33420 }, { "epoch": 2.609068914383829, "grad_norm": 1.1567913134058472e-06, "learning_rate": 5.086093832737493e-06, "loss": 0.0002, "step": 33430 }, { "epoch": 2.6098493717318347, "grad_norm": 1.472561239612702e-10, "learning_rate": 5.066160023546474e-06, "loss": 0.2182, "step": 33440 }, { "epoch": 2.6106298290798406, "grad_norm": 0.006320985499769449, "learning_rate": 5.046263269335156e-06, "loss": 0.085, "step": 33450 }, { "epoch": 2.6114102864278466, "grad_norm": 5.413837655553039e-19, "learning_rate": 5.026403586511575e-06, "loss": 0.6318, "step": 33460 }, { "epoch": 2.6121907437758525, "grad_norm": 7.841154392451699e-10, "learning_rate": 5.006580991453108e-06, "loss": 0.9386, "step": 33470 }, { "epoch": 2.612971201123859, "grad_norm": 1.0349990873706862e-15, "learning_rate": 4.986795500506602e-06, "loss": 0.3775, "step": 33480 }, { "epoch": 2.6137516584718643, "grad_norm": 2.750820075636127e-22, "learning_rate": 4.9670471299882835e-06, "loss": 0.0016, "step": 33490 }, { "epoch": 2.6145321158198707, "grad_norm": 2.6430812795297243e-05, "learning_rate": 4.94733589618378e-06, "loss": 0.0143, "step": 33500 }, { "epoch": 2.615312573167876, "grad_norm": 4.1617440160734986e-08, "learning_rate": 4.9276618153480724e-06, "loss": 0.0, "step": 33510 }, { "epoch": 2.6160930305158825, "grad_norm": 2.1422061763587408e-05, "learning_rate": 4.908024903705538e-06, "loss": 0.0283, "step": 33520 }, { "epoch": 2.6168734878638884, "grad_norm": 1.1938641364395153e-07, "learning_rate": 4.8884251774498565e-06, "loss": 0.0, "step": 33530 }, { "epoch": 2.6176539452118943, "grad_norm": 0.09271605312824249, "learning_rate": 4.8688626527440885e-06, "loss": 0.0059, "step": 33540 }, { "epoch": 2.6184344025599002, "grad_norm": 5.075870035398111e-07, "learning_rate": 4.8493373457206e-06, "loss": 0.0001, "step": 33550 }, { "epoch": 2.619214859907906, "grad_norm": 23.929956436157227, "learning_rate": 4.829849272481035e-06, "loss": 2.0308, "step": 33560 }, { "epoch": 2.619995317255912, "grad_norm": 300.56817626953125, "learning_rate": 4.810398449096387e-06, "loss": 0.5585, "step": 33570 }, { "epoch": 2.620775774603918, "grad_norm": 7.055079719216206e-14, "learning_rate": 4.790984891606881e-06, "loss": 1.517, "step": 33580 }, { "epoch": 2.621556231951924, "grad_norm": 0.0070375483483076096, "learning_rate": 4.77160861602205e-06, "loss": 0.0011, "step": 33590 }, { "epoch": 2.62233668929993, "grad_norm": 2.0225567709530878e-07, "learning_rate": 4.752269638320639e-06, "loss": 0.9377, "step": 33600 }, { "epoch": 2.6231171466479357, "grad_norm": 1.4261396472647903e-06, "learning_rate": 4.73296797445068e-06, "loss": 0.016, "step": 33610 }, { "epoch": 2.6238976039959416, "grad_norm": 4.438880176435776e-21, "learning_rate": 4.7137036403294085e-06, "loss": 0.0541, "step": 33620 }, { "epoch": 2.6246780613439475, "grad_norm": 44.056949615478516, "learning_rate": 4.694476651843294e-06, "loss": 0.629, "step": 33630 }, { "epoch": 2.6254585186919535, "grad_norm": 3.850926805171184e-05, "learning_rate": 4.675287024847979e-06, "loss": 0.0844, "step": 33640 }, { "epoch": 2.6262389760399594, "grad_norm": 1.0066686684240267e-07, "learning_rate": 4.656134775168314e-06, "loss": 1.3893, "step": 33650 }, { "epoch": 2.6270194333879653, "grad_norm": 1.6732981319989015e-11, "learning_rate": 4.637019918598334e-06, "loss": 0.7188, "step": 33660 }, { "epoch": 2.627799890735971, "grad_norm": 4.760467345477082e-05, "learning_rate": 4.617942470901221e-06, "loss": 0.062, "step": 33670 }, { "epoch": 2.628580348083977, "grad_norm": 1.1190292534832627e-13, "learning_rate": 4.598902447809311e-06, "loss": 0.0, "step": 33680 }, { "epoch": 2.629360805431983, "grad_norm": 2.247318963595344e-16, "learning_rate": 4.579899865024084e-06, "loss": 0.0001, "step": 33690 }, { "epoch": 2.630141262779989, "grad_norm": 1.2163898333028555e-08, "learning_rate": 4.5609347382161605e-06, "loss": 0.7094, "step": 33700 }, { "epoch": 2.630921720127995, "grad_norm": 2.6269600305272277e-15, "learning_rate": 4.542007083025224e-06, "loss": 0.012, "step": 33710 }, { "epoch": 2.6317021774760008, "grad_norm": 2.9815462767146528e-05, "learning_rate": 4.523116915060116e-06, "loss": 0.283, "step": 33720 }, { "epoch": 2.632482634824007, "grad_norm": 3.7082540416122356e-07, "learning_rate": 4.504264249898715e-06, "loss": 0.2372, "step": 33730 }, { "epoch": 2.6332630921720126, "grad_norm": 273.8210144042969, "learning_rate": 4.485449103087997e-06, "loss": 0.3322, "step": 33740 }, { "epoch": 2.634043549520019, "grad_norm": 2.527123797335662e-06, "learning_rate": 4.466671490143998e-06, "loss": 3.9085, "step": 33750 }, { "epoch": 2.6348240068680244, "grad_norm": 0.002832346362993121, "learning_rate": 4.447931426551782e-06, "loss": 0.0, "step": 33760 }, { "epoch": 2.635604464216031, "grad_norm": 22.766206741333008, "learning_rate": 4.4292289277654775e-06, "loss": 0.0105, "step": 33770 }, { "epoch": 2.6363849215640363, "grad_norm": 1.594238521578084e-13, "learning_rate": 4.410564009208218e-06, "loss": 0.0, "step": 33780 }, { "epoch": 2.6371653789120426, "grad_norm": 0.015271048061549664, "learning_rate": 4.391936686272152e-06, "loss": 0.011, "step": 33790 }, { "epoch": 2.6379458362600485, "grad_norm": 1.151661853366501e-19, "learning_rate": 4.3733469743184095e-06, "loss": 0.0001, "step": 33800 }, { "epoch": 2.6387262936080544, "grad_norm": 0.002187962643802166, "learning_rate": 4.354794888677139e-06, "loss": 0.0036, "step": 33810 }, { "epoch": 2.6395067509560604, "grad_norm": 1.8937040294986218e-05, "learning_rate": 4.336280444647406e-06, "loss": 0.0004, "step": 33820 }, { "epoch": 2.6402872083040663, "grad_norm": 16.37470245361328, "learning_rate": 4.317803657497288e-06, "loss": 0.0074, "step": 33830 }, { "epoch": 2.641067665652072, "grad_norm": 0.0005464993300847709, "learning_rate": 4.2993645424637685e-06, "loss": 0.8549, "step": 33840 }, { "epoch": 2.641848123000078, "grad_norm": 272.48162841796875, "learning_rate": 4.2809631147528005e-06, "loss": 0.7294, "step": 33850 }, { "epoch": 2.642628580348084, "grad_norm": 4.591046945279231e-06, "learning_rate": 4.262599389539235e-06, "loss": 0.012, "step": 33860 }, { "epoch": 2.64340903769609, "grad_norm": 1.0221311693985075e-20, "learning_rate": 4.244273381966824e-06, "loss": 0.0161, "step": 33870 }, { "epoch": 2.644189495044096, "grad_norm": 274.3841247558594, "learning_rate": 4.2259851071482445e-06, "loss": 0.6576, "step": 33880 }, { "epoch": 2.6449699523921018, "grad_norm": 5.816648702433675e-17, "learning_rate": 4.207734580165023e-06, "loss": 0.1478, "step": 33890 }, { "epoch": 2.6457504097401077, "grad_norm": 4.0705185710976366e-07, "learning_rate": 4.189521816067593e-06, "loss": 0.0001, "step": 33900 }, { "epoch": 2.6465308670881136, "grad_norm": 6.483745598763743e-23, "learning_rate": 4.1713468298752e-06, "loss": 1.4205, "step": 33910 }, { "epoch": 2.6473113244361195, "grad_norm": 2.014824218576905e-07, "learning_rate": 4.1532096365759885e-06, "loss": 0.0004, "step": 33920 }, { "epoch": 2.6480917817841254, "grad_norm": 3.743392066509216e-23, "learning_rate": 4.1351102511269055e-06, "loss": 0.0, "step": 33930 }, { "epoch": 2.6488722391321313, "grad_norm": 2.345401071579545e-06, "learning_rate": 4.117048688453718e-06, "loss": 0.0017, "step": 33940 }, { "epoch": 2.6496526964801372, "grad_norm": 5.603112185781356e-06, "learning_rate": 4.099024963451015e-06, "loss": 0.0012, "step": 33950 }, { "epoch": 2.650433153828143, "grad_norm": 5.601246356964111, "learning_rate": 4.081039090982175e-06, "loss": 1.0046, "step": 33960 }, { "epoch": 2.651213611176149, "grad_norm": 2.029733404091597e-21, "learning_rate": 4.063091085879367e-06, "loss": 0.9069, "step": 33970 }, { "epoch": 2.6519940685241554, "grad_norm": 1.1794318197644316e-05, "learning_rate": 4.0451809629435236e-06, "loss": 0.0013, "step": 33980 }, { "epoch": 2.652774525872161, "grad_norm": 3.118545066627121e-08, "learning_rate": 4.027308736944341e-06, "loss": 0.0935, "step": 33990 }, { "epoch": 2.6535549832201673, "grad_norm": 7.507606483159179e-07, "learning_rate": 4.009474422620269e-06, "loss": 0.2441, "step": 34000 }, { "epoch": 2.6543354405681727, "grad_norm": 5.389193979965512e-09, "learning_rate": 3.991678034678503e-06, "loss": 0.5167, "step": 34010 }, { "epoch": 2.655115897916179, "grad_norm": 1.0987394154227115e-11, "learning_rate": 3.973919587794922e-06, "loss": 0.0096, "step": 34020 }, { "epoch": 2.6558963552641845, "grad_norm": 6.000165891950499e-20, "learning_rate": 3.956199096614166e-06, "loss": 1.2193, "step": 34030 }, { "epoch": 2.656676812612191, "grad_norm": 8.778447765623004e-16, "learning_rate": 3.9385165757495405e-06, "loss": 0.0059, "step": 34040 }, { "epoch": 2.657457269960197, "grad_norm": 2.4371112766763403e-11, "learning_rate": 3.920872039783047e-06, "loss": 0.0028, "step": 34050 }, { "epoch": 2.6582377273082027, "grad_norm": 274.0638122558594, "learning_rate": 3.903265503265374e-06, "loss": 1.025, "step": 34060 }, { "epoch": 2.6590181846562086, "grad_norm": 4.381836049560661e-07, "learning_rate": 3.8856969807158584e-06, "loss": 0.0001, "step": 34070 }, { "epoch": 2.6597986420042146, "grad_norm": 246.39788818359375, "learning_rate": 3.868166486622493e-06, "loss": 0.4099, "step": 34080 }, { "epoch": 2.6605790993522205, "grad_norm": 1.3650200642799067e-18, "learning_rate": 3.85067403544192e-06, "loss": 0.3284, "step": 34090 }, { "epoch": 2.6613595567002264, "grad_norm": 242.4805908203125, "learning_rate": 3.833219641599406e-06, "loss": 2.4331, "step": 34100 }, { "epoch": 2.6621400140482323, "grad_norm": 1.4766764877549576e-07, "learning_rate": 3.815803319488814e-06, "loss": 1.9021, "step": 34110 }, { "epoch": 2.662920471396238, "grad_norm": 45.691322326660156, "learning_rate": 3.798425083472645e-06, "loss": 2.0504, "step": 34120 }, { "epoch": 2.663700928744244, "grad_norm": 0.0011006060522049665, "learning_rate": 3.7810849478819576e-06, "loss": 0.6354, "step": 34130 }, { "epoch": 2.66448138609225, "grad_norm": 189.8499298095703, "learning_rate": 3.7637829270164014e-06, "loss": 0.643, "step": 34140 }, { "epoch": 2.665261843440256, "grad_norm": 1.676145853718252e-16, "learning_rate": 3.7465190351442135e-06, "loss": 0.162, "step": 34150 }, { "epoch": 2.666042300788262, "grad_norm": 131.11514282226562, "learning_rate": 3.7292932865021614e-06, "loss": 0.6568, "step": 34160 }, { "epoch": 2.666822758136268, "grad_norm": 0.018124770373106003, "learning_rate": 3.712105695295587e-06, "loss": 1.0613, "step": 34170 }, { "epoch": 2.6676032154842737, "grad_norm": 9.850300575209303e-10, "learning_rate": 3.6949562756983245e-06, "loss": 0.0017, "step": 34180 }, { "epoch": 2.6683836728322796, "grad_norm": 235.30149841308594, "learning_rate": 3.6778450418527733e-06, "loss": 1.0566, "step": 34190 }, { "epoch": 2.6691641301802855, "grad_norm": 9.505000342358016e-09, "learning_rate": 3.660772007869806e-06, "loss": 0.0015, "step": 34200 }, { "epoch": 2.6699445875282914, "grad_norm": 0.002341636922210455, "learning_rate": 3.6437371878288275e-06, "loss": 1.3121, "step": 34210 }, { "epoch": 2.6707250448762974, "grad_norm": 1.260479791653779e-07, "learning_rate": 3.6267405957776947e-06, "loss": 0.6651, "step": 34220 }, { "epoch": 2.6715055022243037, "grad_norm": 9.471100570479507e-10, "learning_rate": 3.6097822457327568e-06, "loss": 0.4223, "step": 34230 }, { "epoch": 2.672285959572309, "grad_norm": 0.00010060240310849622, "learning_rate": 3.592862151678844e-06, "loss": 0.7502, "step": 34240 }, { "epoch": 2.6730664169203155, "grad_norm": 9.415930253453553e-05, "learning_rate": 3.5759803275692007e-06, "loss": 0.0755, "step": 34250 }, { "epoch": 2.673846874268321, "grad_norm": 0.00018394121434539557, "learning_rate": 3.5591367873255453e-06, "loss": 0.6165, "step": 34260 }, { "epoch": 2.6746273316163274, "grad_norm": 2.5903491973876953, "learning_rate": 3.542331544838001e-06, "loss": 0.1155, "step": 34270 }, { "epoch": 2.675407788964333, "grad_norm": 6.003338421578519e-05, "learning_rate": 3.5255646139651322e-06, "loss": 0.0016, "step": 34280 }, { "epoch": 2.676188246312339, "grad_norm": 1.627708731688493e-11, "learning_rate": 3.508836008533878e-06, "loss": 1.027, "step": 34290 }, { "epoch": 2.676968703660345, "grad_norm": 273.7149353027344, "learning_rate": 3.4921457423395976e-06, "loss": 0.4918, "step": 34300 }, { "epoch": 2.677749161008351, "grad_norm": 1.9501188858184193e-11, "learning_rate": 3.4754938291460314e-06, "loss": 0.0377, "step": 34310 }, { "epoch": 2.678529618356357, "grad_norm": 5.9736852563219145e-05, "learning_rate": 3.4588802826852893e-06, "loss": 0.0222, "step": 34320 }, { "epoch": 2.679310075704363, "grad_norm": 312.1669921875, "learning_rate": 3.44230511665784e-06, "loss": 3.0204, "step": 34330 }, { "epoch": 2.6800905330523688, "grad_norm": 392.56744384765625, "learning_rate": 3.425768344732483e-06, "loss": 0.6596, "step": 34340 }, { "epoch": 2.6808709904003747, "grad_norm": 3.923849634969445e-14, "learning_rate": 3.4092699805463867e-06, "loss": 0.0026, "step": 34350 }, { "epoch": 2.6816514477483806, "grad_norm": 147.90354919433594, "learning_rate": 3.39281003770503e-06, "loss": 0.8152, "step": 34360 }, { "epoch": 2.6824319050963865, "grad_norm": 0.00016789170331321657, "learning_rate": 3.376388529782215e-06, "loss": 0.2015, "step": 34370 }, { "epoch": 2.6832123624443924, "grad_norm": 1.1009997251676396e-06, "learning_rate": 3.360005470320038e-06, "loss": 0.0062, "step": 34380 }, { "epoch": 2.6839928197923983, "grad_norm": 189.47874450683594, "learning_rate": 3.343660872828891e-06, "loss": 0.1313, "step": 34390 }, { "epoch": 2.6847732771404043, "grad_norm": 6.394549927790649e-06, "learning_rate": 3.327354750787459e-06, "loss": 0.7632, "step": 34400 }, { "epoch": 2.68555373448841, "grad_norm": 8.24633161755628e-08, "learning_rate": 3.3110871176426974e-06, "loss": 0.0001, "step": 34410 }, { "epoch": 2.686334191836416, "grad_norm": 1.4998055261802426e-18, "learning_rate": 3.294857986809802e-06, "loss": 0.0239, "step": 34420 }, { "epoch": 2.687114649184422, "grad_norm": 2.3177593178047573e-08, "learning_rate": 3.2786673716722227e-06, "loss": 0.0, "step": 34430 }, { "epoch": 2.687895106532428, "grad_norm": 0.0012782815610989928, "learning_rate": 3.26251528558168e-06, "loss": 3.0813, "step": 34440 }, { "epoch": 2.688675563880434, "grad_norm": 0.00913954060524702, "learning_rate": 3.246401741858063e-06, "loss": 0.5146, "step": 34450 }, { "epoch": 2.6894560212284397, "grad_norm": 5.803015401539824e-09, "learning_rate": 3.230326753789531e-06, "loss": 0.0, "step": 34460 }, { "epoch": 2.6902364785764457, "grad_norm": 0.2496180385351181, "learning_rate": 3.2142903346324195e-06, "loss": 0.0004, "step": 34470 }, { "epoch": 2.6910169359244516, "grad_norm": 7.848896530049387e-06, "learning_rate": 3.198292497611277e-06, "loss": 0.0222, "step": 34480 }, { "epoch": 2.6917973932724575, "grad_norm": 2.4973063913569064e-15, "learning_rate": 3.1823332559188012e-06, "loss": 1.0162, "step": 34490 }, { "epoch": 2.692577850620464, "grad_norm": 118.88430786132812, "learning_rate": 3.1664126227158976e-06, "loss": 0.053, "step": 34500 }, { "epoch": 2.6933583079684693, "grad_norm": 3.7308425362425623e-06, "learning_rate": 3.150530611131608e-06, "loss": 0.0001, "step": 34510 }, { "epoch": 2.6941387653164757, "grad_norm": 7.105061285983538e-07, "learning_rate": 3.134687234263156e-06, "loss": 1.0315, "step": 34520 }, { "epoch": 2.694919222664481, "grad_norm": 9.881821055879314e-12, "learning_rate": 3.1188825051758575e-06, "loss": 0.293, "step": 34530 }, { "epoch": 2.6956996800124875, "grad_norm": 168.21942138671875, "learning_rate": 3.1031164369031915e-06, "loss": 1.9331, "step": 34540 }, { "epoch": 2.6964801373604934, "grad_norm": 208.61349487304688, "learning_rate": 3.0873890424467698e-06, "loss": 1.0772, "step": 34550 }, { "epoch": 2.6972605947084993, "grad_norm": 2.124788807122968e-06, "learning_rate": 3.0717003347762565e-06, "loss": 0.1359, "step": 34560 }, { "epoch": 2.6980410520565052, "grad_norm": 2.2703235125209886e-10, "learning_rate": 3.0560503268294683e-06, "loss": 1.0912, "step": 34570 }, { "epoch": 2.698821509404511, "grad_norm": 286.3675537109375, "learning_rate": 3.040439031512271e-06, "loss": 1.3075, "step": 34580 }, { "epoch": 2.699601966752517, "grad_norm": 0.09111825376749039, "learning_rate": 3.0248664616986334e-06, "loss": 0.5007, "step": 34590 }, { "epoch": 2.700382424100523, "grad_norm": 0.000960067380219698, "learning_rate": 3.0093326302305604e-06, "loss": 0.0, "step": 34600 }, { "epoch": 2.701162881448529, "grad_norm": 0.0028044397477060556, "learning_rate": 2.9938375499181327e-06, "loss": 0.0031, "step": 34610 }, { "epoch": 2.701943338796535, "grad_norm": 2.1090053776006283e-20, "learning_rate": 2.9783812335394733e-06, "loss": 0.0, "step": 34620 }, { "epoch": 2.7027237961445407, "grad_norm": 1.833546640526377e-14, "learning_rate": 2.96296369384072e-06, "loss": 0.5248, "step": 34630 }, { "epoch": 2.7035042534925466, "grad_norm": 0.0002606491034384817, "learning_rate": 2.947584943536058e-06, "loss": 0.0043, "step": 34640 }, { "epoch": 2.7042847108405526, "grad_norm": 8.347085522775188e-11, "learning_rate": 2.9322449953076647e-06, "loss": 2.5233, "step": 34650 }, { "epoch": 2.7050651681885585, "grad_norm": 260.7015686035156, "learning_rate": 2.9169438618057377e-06, "loss": 1.1463, "step": 34660 }, { "epoch": 2.7058456255365644, "grad_norm": 0.000424844678491354, "learning_rate": 2.901681555648439e-06, "loss": 0.0, "step": 34670 }, { "epoch": 2.7066260828845703, "grad_norm": 1.760547545422153e-13, "learning_rate": 2.8864580894219394e-06, "loss": 0.0237, "step": 34680 }, { "epoch": 2.707406540232576, "grad_norm": 0.004844082985073328, "learning_rate": 2.8712734756803584e-06, "loss": 0.1702, "step": 34690 }, { "epoch": 2.708186997580582, "grad_norm": 0.027812352403998375, "learning_rate": 2.85612772694579e-06, "loss": 0.0471, "step": 34700 }, { "epoch": 2.708967454928588, "grad_norm": 3.4556917327677183e-09, "learning_rate": 2.841020855708276e-06, "loss": 0.3409, "step": 34710 }, { "epoch": 2.709747912276594, "grad_norm": 9.396440738126607e-14, "learning_rate": 2.8259528744258023e-06, "loss": 0.0052, "step": 34720 }, { "epoch": 2.7105283696246, "grad_norm": 6.341630935668945, "learning_rate": 2.810923795524262e-06, "loss": 0.1053, "step": 34730 }, { "epoch": 2.711308826972606, "grad_norm": 0.005546544678509235, "learning_rate": 2.795933631397485e-06, "loss": 1.1266, "step": 34740 }, { "epoch": 2.712089284320612, "grad_norm": 1.9116978833277898e-10, "learning_rate": 2.780982394407211e-06, "loss": 0.0004, "step": 34750 }, { "epoch": 2.7128697416686176, "grad_norm": 280.81304931640625, "learning_rate": 2.7660700968830666e-06, "loss": 0.5277, "step": 34760 }, { "epoch": 2.713650199016624, "grad_norm": 3.132147823765596e-20, "learning_rate": 2.7511967511225756e-06, "loss": 0.2108, "step": 34770 }, { "epoch": 2.7144306563646294, "grad_norm": 9.376853267895058e-05, "learning_rate": 2.7363623693911432e-06, "loss": 0.0007, "step": 34780 }, { "epoch": 2.715211113712636, "grad_norm": 4.393995922669092e-08, "learning_rate": 2.72156696392204e-06, "loss": 0.0452, "step": 34790 }, { "epoch": 2.7159915710606413, "grad_norm": 0.0003435779071878642, "learning_rate": 2.706810546916383e-06, "loss": 0.0086, "step": 34800 }, { "epoch": 2.7167720284086476, "grad_norm": 8.209787368774414, "learning_rate": 2.6920931305431496e-06, "loss": 0.0087, "step": 34810 }, { "epoch": 2.7175524857566535, "grad_norm": 7.450088346091447e-16, "learning_rate": 2.677414726939159e-06, "loss": 0.5131, "step": 34820 }, { "epoch": 2.7183329431046594, "grad_norm": 0.0019514625892043114, "learning_rate": 2.6627753482090347e-06, "loss": 0.7882, "step": 34830 }, { "epoch": 2.7191134004526654, "grad_norm": 7.329802610911429e-05, "learning_rate": 2.648175006425246e-06, "loss": 0.0634, "step": 34840 }, { "epoch": 2.7198938578006713, "grad_norm": 1.3475154638290405, "learning_rate": 2.6336137136280526e-06, "loss": 0.0131, "step": 34850 }, { "epoch": 2.720674315148677, "grad_norm": 0.001319439266808331, "learning_rate": 2.619091481825531e-06, "loss": 0.0597, "step": 34860 }, { "epoch": 2.721454772496683, "grad_norm": 0.009119241498410702, "learning_rate": 2.604608322993518e-06, "loss": 0.0006, "step": 34870 }, { "epoch": 2.722235229844689, "grad_norm": 6.017874511599075e-06, "learning_rate": 2.590164249075655e-06, "loss": 1.9251, "step": 34880 }, { "epoch": 2.723015687192695, "grad_norm": 0.0004034480080008507, "learning_rate": 2.57575927198333e-06, "loss": 0.5727, "step": 34890 }, { "epoch": 2.723796144540701, "grad_norm": 4.727761790945806e-07, "learning_rate": 2.5613934035957132e-06, "loss": 0.0172, "step": 34900 }, { "epoch": 2.7245766018887068, "grad_norm": 1.950404616845415e-18, "learning_rate": 2.5470666557596945e-06, "loss": 0.017, "step": 34910 }, { "epoch": 2.7253570592367127, "grad_norm": 2.9835364818573, "learning_rate": 2.5327790402899177e-06, "loss": 0.0016, "step": 34920 }, { "epoch": 2.7261375165847186, "grad_norm": 1.925894999033062e-10, "learning_rate": 2.51853056896878e-06, "loss": 1.1155, "step": 34930 }, { "epoch": 2.7269179739327245, "grad_norm": 0.009026541374623775, "learning_rate": 2.5043212535463488e-06, "loss": 1.1879, "step": 34940 }, { "epoch": 2.7276984312807304, "grad_norm": 1.3697922346698476e-17, "learning_rate": 2.490151105740446e-06, "loss": 0.0281, "step": 34950 }, { "epoch": 2.7284788886287363, "grad_norm": 9.61051628109999e-05, "learning_rate": 2.4760201372365566e-06, "loss": 0.0002, "step": 34960 }, { "epoch": 2.7292593459767422, "grad_norm": 2.5067332654060692e-09, "learning_rate": 2.4619283596878985e-06, "loss": 0.0021, "step": 34970 }, { "epoch": 2.730039803324748, "grad_norm": 68.98681640625, "learning_rate": 2.44787578471532e-06, "loss": 0.6315, "step": 34980 }, { "epoch": 2.730820260672754, "grad_norm": 2.5030828965100227e-08, "learning_rate": 2.4338624239073903e-06, "loss": 1.2717, "step": 34990 }, { "epoch": 2.7316007180207604, "grad_norm": 242.74081420898438, "learning_rate": 2.419888288820299e-06, "loss": 1.3068, "step": 35000 }, { "epoch": 2.732381175368766, "grad_norm": 9.598036740499083e-06, "learning_rate": 2.4059533909779165e-06, "loss": 0.0011, "step": 35010 }, { "epoch": 2.7331616327167723, "grad_norm": 3.2457666065965896e-07, "learning_rate": 2.3920577418717505e-06, "loss": 1.1327, "step": 35020 }, { "epoch": 2.7339420900647777, "grad_norm": 0.007707437966018915, "learning_rate": 2.378201352960924e-06, "loss": 0.0002, "step": 35030 }, { "epoch": 2.734722547412784, "grad_norm": 3.7804625034332275, "learning_rate": 2.3643842356722124e-06, "loss": 0.9504, "step": 35040 }, { "epoch": 2.7355030047607896, "grad_norm": 1.9210584512041606e-11, "learning_rate": 2.350606401399974e-06, "loss": 1.5375, "step": 35050 }, { "epoch": 2.736283462108796, "grad_norm": 1.1549177396550862e-18, "learning_rate": 2.336867861506203e-06, "loss": 0.0014, "step": 35060 }, { "epoch": 2.737063919456802, "grad_norm": 0.04194324463605881, "learning_rate": 2.32316862732046e-06, "loss": 0.0127, "step": 35070 }, { "epoch": 2.7378443768048077, "grad_norm": 115.06947326660156, "learning_rate": 2.3095087101399126e-06, "loss": 0.12, "step": 35080 }, { "epoch": 2.7386248341528137, "grad_norm": 2.2268917265755306e-10, "learning_rate": 2.2958881212293006e-06, "loss": 0.0128, "step": 35090 }, { "epoch": 2.7394052915008196, "grad_norm": 51.513614654541016, "learning_rate": 2.282306871820938e-06, "loss": 0.666, "step": 35100 }, { "epoch": 2.7401857488488255, "grad_norm": 8.180680274963379, "learning_rate": 2.268764973114684e-06, "loss": 0.097, "step": 35110 }, { "epoch": 2.7409662061968314, "grad_norm": 5.291911353566547e-09, "learning_rate": 2.255262436277944e-06, "loss": 0.1661, "step": 35120 }, { "epoch": 2.7417466635448373, "grad_norm": 1.2681041489749473e-20, "learning_rate": 2.2417992724456827e-06, "loss": 0.1017, "step": 35130 }, { "epoch": 2.7425271208928432, "grad_norm": 4.007921264800628e-18, "learning_rate": 2.228375492720369e-06, "loss": 0.3182, "step": 35140 }, { "epoch": 2.743307578240849, "grad_norm": 2.2716215575258047e-09, "learning_rate": 2.2149911081720264e-06, "loss": 2.6478, "step": 35150 }, { "epoch": 2.744088035588855, "grad_norm": 0.0053436425514519215, "learning_rate": 2.2016461298381588e-06, "loss": 0.0315, "step": 35160 }, { "epoch": 2.744868492936861, "grad_norm": 2.0400614791221683e-12, "learning_rate": 2.1883405687238067e-06, "loss": 1.6938, "step": 35170 }, { "epoch": 2.745648950284867, "grad_norm": 1.920632939800271e-06, "learning_rate": 2.1750744358014717e-06, "loss": 0.0001, "step": 35180 }, { "epoch": 2.746429407632873, "grad_norm": 2.3888586042630777e-07, "learning_rate": 2.161847742011164e-06, "loss": 0.0, "step": 35190 }, { "epoch": 2.7472098649808787, "grad_norm": 0.08937634527683258, "learning_rate": 2.148660498260352e-06, "loss": 1.8679, "step": 35200 }, { "epoch": 2.7479903223288846, "grad_norm": 4.1917643102351576e-05, "learning_rate": 2.1355127154239927e-06, "loss": 0.8796, "step": 35210 }, { "epoch": 2.7487707796768905, "grad_norm": 0.0017686202190816402, "learning_rate": 2.122404404344486e-06, "loss": 0.0002, "step": 35220 }, { "epoch": 2.7495512370248965, "grad_norm": 5.840608992002672e-07, "learning_rate": 2.1093355758316723e-06, "loss": 0.0002, "step": 35230 }, { "epoch": 2.7503316943729024, "grad_norm": 1.9371828795394235e-21, "learning_rate": 2.09630624066286e-06, "loss": 0.2608, "step": 35240 }, { "epoch": 2.7511121517209087, "grad_norm": 0.0011315630981698632, "learning_rate": 2.083316409582764e-06, "loss": 0.0001, "step": 35250 }, { "epoch": 2.751892609068914, "grad_norm": 279.3219909667969, "learning_rate": 2.070366093303544e-06, "loss": 0.3636, "step": 35260 }, { "epoch": 2.7526730664169206, "grad_norm": 4.184293516118487e-07, "learning_rate": 2.057455302504746e-06, "loss": 0.1676, "step": 35270 }, { "epoch": 2.753453523764926, "grad_norm": 0.03150082752108574, "learning_rate": 2.044584047833359e-06, "loss": 1.2382, "step": 35280 }, { "epoch": 2.7542339811129324, "grad_norm": 21.65041732788086, "learning_rate": 2.0317523399037195e-06, "loss": 0.0081, "step": 35290 }, { "epoch": 2.755014438460938, "grad_norm": 5.577290135261835e-15, "learning_rate": 2.0189601892975974e-06, "loss": 1.3071, "step": 35300 }, { "epoch": 2.755794895808944, "grad_norm": 0.056859709322452545, "learning_rate": 2.006207606564109e-06, "loss": 0.0181, "step": 35310 }, { "epoch": 2.75657535315695, "grad_norm": 1.200806442821027e-21, "learning_rate": 1.9934946022197586e-06, "loss": 0.1702, "step": 35320 }, { "epoch": 2.757355810504956, "grad_norm": 2.7878815600731964e-17, "learning_rate": 1.9808211867484105e-06, "loss": 0.402, "step": 35330 }, { "epoch": 2.758136267852962, "grad_norm": 5.401259422302246, "learning_rate": 1.968187370601271e-06, "loss": 0.0813, "step": 35340 }, { "epoch": 2.758916725200968, "grad_norm": 0.5862646102905273, "learning_rate": 1.9555931641969116e-06, "loss": 0.0017, "step": 35350 }, { "epoch": 2.759697182548974, "grad_norm": 8.405132393818349e-05, "learning_rate": 1.943038577921208e-06, "loss": 0.0025, "step": 35360 }, { "epoch": 2.7604776398969797, "grad_norm": 0.005214150529354811, "learning_rate": 1.9305236221273894e-06, "loss": 0.7878, "step": 35370 }, { "epoch": 2.7612580972449856, "grad_norm": 0.00010408924572402611, "learning_rate": 1.9180483071359946e-06, "loss": 0.1641, "step": 35380 }, { "epoch": 2.7620385545929915, "grad_norm": 6.571749811001837e-09, "learning_rate": 1.9056126432348664e-06, "loss": 0.2695, "step": 35390 }, { "epoch": 2.7628190119409974, "grad_norm": 4.0899040658889474e-19, "learning_rate": 1.893216640679163e-06, "loss": 0.0102, "step": 35400 }, { "epoch": 2.7635994692890034, "grad_norm": 0.0037415786646306515, "learning_rate": 1.8808603096913235e-06, "loss": 0.0203, "step": 35410 }, { "epoch": 2.7643799266370093, "grad_norm": 0.0011977603426203132, "learning_rate": 1.8685436604610807e-06, "loss": 0.0143, "step": 35420 }, { "epoch": 2.765160383985015, "grad_norm": 32.34883117675781, "learning_rate": 1.8562667031454262e-06, "loss": 0.5085, "step": 35430 }, { "epoch": 2.765940841333021, "grad_norm": 5.38933093707783e-08, "learning_rate": 1.8440294478686448e-06, "loss": 0.002, "step": 35440 }, { "epoch": 2.766721298681027, "grad_norm": 290.1611328125, "learning_rate": 1.8318319047222532e-06, "loss": 2.7862, "step": 35450 }, { "epoch": 2.767501756029033, "grad_norm": 9.790755939320661e-08, "learning_rate": 1.8196740837650495e-06, "loss": 0.1858, "step": 35460 }, { "epoch": 2.768282213377039, "grad_norm": 1.5828281902940944e-05, "learning_rate": 1.807555995023047e-06, "loss": 0.1846, "step": 35470 }, { "epoch": 2.7690626707250448, "grad_norm": 8.833049626985384e-16, "learning_rate": 1.7954776484895186e-06, "loss": 0.0407, "step": 35480 }, { "epoch": 2.7698431280730507, "grad_norm": 0.0, "learning_rate": 1.7834390541249358e-06, "loss": 0.9468, "step": 35490 }, { "epoch": 2.7706235854210566, "grad_norm": 127.19840240478516, "learning_rate": 1.7714402218570125e-06, "loss": 0.1476, "step": 35500 }, { "epoch": 2.7714040427690625, "grad_norm": 3.28693672280167e-09, "learning_rate": 1.7594811615806616e-06, "loss": 0.2407, "step": 35510 }, { "epoch": 2.772184500117069, "grad_norm": 8.966098903329112e-06, "learning_rate": 1.7475618831579942e-06, "loss": 0.5673, "step": 35520 }, { "epoch": 2.7729649574650743, "grad_norm": 0.01867363229393959, "learning_rate": 1.7356823964183255e-06, "loss": 0.0314, "step": 35530 }, { "epoch": 2.7737454148130807, "grad_norm": 1.3436168546038588e-14, "learning_rate": 1.7238427111581412e-06, "loss": 1.5335, "step": 35540 }, { "epoch": 2.774525872161086, "grad_norm": 9.506376266479492, "learning_rate": 1.7120428371411146e-06, "loss": 0.0026, "step": 35550 }, { "epoch": 2.7753063295090925, "grad_norm": 5.515518296306254e-06, "learning_rate": 1.7002827840981007e-06, "loss": 0.0003, "step": 35560 }, { "epoch": 2.7760867868570984, "grad_norm": 0.0, "learning_rate": 1.6885625617270972e-06, "loss": 0.0489, "step": 35570 }, { "epoch": 2.7768672442051043, "grad_norm": 1.0065394917546655e-06, "learning_rate": 1.6768821796932565e-06, "loss": 0.2155, "step": 35580 }, { "epoch": 2.7776477015531102, "grad_norm": 1.872426310001174e-06, "learning_rate": 1.6652416476288846e-06, "loss": 0.0216, "step": 35590 }, { "epoch": 2.778428158901116, "grad_norm": 0.001323552569374442, "learning_rate": 1.6536409751334191e-06, "loss": 0.0067, "step": 35600 }, { "epoch": 2.779208616249122, "grad_norm": 6.04273395765631e-08, "learning_rate": 1.6420801717734246e-06, "loss": 0.0008, "step": 35610 }, { "epoch": 2.779989073597128, "grad_norm": 0.00045711363782174885, "learning_rate": 1.630559247082597e-06, "loss": 0.0002, "step": 35620 }, { "epoch": 2.780769530945134, "grad_norm": 1.8189606908230104e-15, "learning_rate": 1.6190782105617364e-06, "loss": 0.7252, "step": 35630 }, { "epoch": 2.78154998829314, "grad_norm": 8.045684012358834e-07, "learning_rate": 1.6076370716787692e-06, "loss": 0.0043, "step": 35640 }, { "epoch": 2.7823304456411457, "grad_norm": 5.913908334159864e-16, "learning_rate": 1.5962358398686816e-06, "loss": 0.0176, "step": 35650 }, { "epoch": 2.7831109029891516, "grad_norm": 0.8195492625236511, "learning_rate": 1.5848745245335916e-06, "loss": 0.0003, "step": 35660 }, { "epoch": 2.7838913603371576, "grad_norm": 0.07716309279203415, "learning_rate": 1.5735531350426657e-06, "loss": 0.0038, "step": 35670 }, { "epoch": 2.7846718176851635, "grad_norm": 3.6004589674121235e-06, "learning_rate": 1.5622716807321692e-06, "loss": 0.0014, "step": 35680 }, { "epoch": 2.7854522750331694, "grad_norm": 2.6100972263520816e-06, "learning_rate": 1.5510301709054209e-06, "loss": 1.281, "step": 35690 }, { "epoch": 2.7862327323811753, "grad_norm": 0.5086005330085754, "learning_rate": 1.5398286148328056e-06, "loss": 2.1155, "step": 35700 }, { "epoch": 2.787013189729181, "grad_norm": 268.41796875, "learning_rate": 1.5286670217517673e-06, "loss": 0.3591, "step": 35710 }, { "epoch": 2.787793647077187, "grad_norm": 1.9338531274115667e-05, "learning_rate": 1.5175454008667712e-06, "loss": 0.0009, "step": 35720 }, { "epoch": 2.788574104425193, "grad_norm": 180.71678161621094, "learning_rate": 1.5064637613493471e-06, "loss": 0.2561, "step": 35730 }, { "epoch": 2.789354561773199, "grad_norm": 2.9723254413305256e-10, "learning_rate": 1.495422112338024e-06, "loss": 0.0281, "step": 35740 }, { "epoch": 2.790135019121205, "grad_norm": 1.7081179678156422e-13, "learning_rate": 1.484420462938385e-06, "loss": 0.3611, "step": 35750 }, { "epoch": 2.790915476469211, "grad_norm": 3.5201527099128085e-18, "learning_rate": 1.4734588222230006e-06, "loss": 0.0, "step": 35760 }, { "epoch": 2.791695933817217, "grad_norm": 0.011384816840291023, "learning_rate": 1.4625371992314618e-06, "loss": 0.56, "step": 35770 }, { "epoch": 2.7924763911652226, "grad_norm": 5.476379556057509e-06, "learning_rate": 1.4516556029703532e-06, "loss": 0.2762, "step": 35780 }, { "epoch": 2.793256848513229, "grad_norm": 0.014588144607841969, "learning_rate": 1.4408140424132578e-06, "loss": 0.2594, "step": 35790 }, { "epoch": 2.7940373058612344, "grad_norm": 6.3460006458626594e-06, "learning_rate": 1.4300125265007347e-06, "loss": 0.4624, "step": 35800 }, { "epoch": 2.794817763209241, "grad_norm": 0.00012951460666954517, "learning_rate": 1.41925106414032e-06, "loss": 0.1455, "step": 35810 }, { "epoch": 2.7955982205572463, "grad_norm": 6.552781234292482e-11, "learning_rate": 1.4085296642065316e-06, "loss": 1.6117, "step": 35820 }, { "epoch": 2.7963786779052526, "grad_norm": 70.52780151367188, "learning_rate": 1.39784833554083e-06, "loss": 0.0387, "step": 35830 }, { "epoch": 2.7971591352532585, "grad_norm": 1.323123655083691e-07, "learning_rate": 1.3872070869516529e-06, "loss": 0.003, "step": 35840 }, { "epoch": 2.7979395926012645, "grad_norm": 250.273193359375, "learning_rate": 1.3766059272143638e-06, "loss": 0.2595, "step": 35850 }, { "epoch": 2.7987200499492704, "grad_norm": 3.7702760913305156e-09, "learning_rate": 1.366044865071281e-06, "loss": 0.5753, "step": 35860 }, { "epoch": 2.7995005072972763, "grad_norm": 383.3858642578125, "learning_rate": 1.355523909231654e-06, "loss": 1.8688, "step": 35870 }, { "epoch": 2.800280964645282, "grad_norm": 2.6719694687926676e-06, "learning_rate": 1.3450430683716596e-06, "loss": 0.0001, "step": 35880 }, { "epoch": 2.801061421993288, "grad_norm": 198.89208984375, "learning_rate": 1.3346023511343842e-06, "loss": 0.5731, "step": 35890 }, { "epoch": 2.801841879341294, "grad_norm": 0.2880525588989258, "learning_rate": 1.3242017661298345e-06, "loss": 0.0024, "step": 35900 }, { "epoch": 2.8026223366893, "grad_norm": 1.8106071248422465e-14, "learning_rate": 1.3138413219349165e-06, "loss": 0.8769, "step": 35910 }, { "epoch": 2.803402794037306, "grad_norm": 1.7158454561697067e-12, "learning_rate": 1.3035210270934462e-06, "loss": 0.0018, "step": 35920 }, { "epoch": 2.8041832513853118, "grad_norm": 0.22820571064949036, "learning_rate": 1.2932408901161096e-06, "loss": 0.0376, "step": 35930 }, { "epoch": 2.8049637087333177, "grad_norm": 0.018509892746806145, "learning_rate": 1.2830009194804927e-06, "loss": 1.6518, "step": 35940 }, { "epoch": 2.8057441660813236, "grad_norm": 7.552736759185791, "learning_rate": 1.2728011236310566e-06, "loss": 0.4229, "step": 35950 }, { "epoch": 2.8065246234293295, "grad_norm": 0.0012847146717831492, "learning_rate": 1.2626415109791235e-06, "loss": 1.4565, "step": 35960 }, { "epoch": 2.8073050807773354, "grad_norm": 9.853501348189297e-10, "learning_rate": 1.252522089902891e-06, "loss": 1.1181, "step": 35970 }, { "epoch": 2.8080855381253413, "grad_norm": 8.04511088858817e-08, "learning_rate": 1.2424428687473954e-06, "loss": 0.4782, "step": 35980 }, { "epoch": 2.8088659954733473, "grad_norm": 0.0016354135004803538, "learning_rate": 1.2324038558245376e-06, "loss": 0.1163, "step": 35990 }, { "epoch": 2.809646452821353, "grad_norm": 219.00985717773438, "learning_rate": 1.2224050594130454e-06, "loss": 0.2557, "step": 36000 }, { "epoch": 2.810426910169359, "grad_norm": 3.0811437385855345e-13, "learning_rate": 1.2124464877584951e-06, "loss": 0.0113, "step": 36010 }, { "epoch": 2.8112073675173654, "grad_norm": 0.0, "learning_rate": 1.2025281490732953e-06, "loss": 0.1054, "step": 36020 }, { "epoch": 2.811987824865371, "grad_norm": 5.375960568094855e-15, "learning_rate": 1.192650051536659e-06, "loss": 1.0442, "step": 36030 }, { "epoch": 2.8127682822133773, "grad_norm": 0.00020814120944123715, "learning_rate": 1.1828122032946254e-06, "loss": 0.2798, "step": 36040 }, { "epoch": 2.8135487395613827, "grad_norm": 5.018740125706245e-07, "learning_rate": 1.1730146124600273e-06, "loss": 0.8126, "step": 36050 }, { "epoch": 2.814329196909389, "grad_norm": 0.0027081905864179134, "learning_rate": 1.1632572871125346e-06, "loss": 0.0516, "step": 36060 }, { "epoch": 2.8151096542573946, "grad_norm": 21.75045394897461, "learning_rate": 1.1535402352985668e-06, "loss": 1.5453, "step": 36070 }, { "epoch": 2.815890111605401, "grad_norm": 4.614805584424175e-05, "learning_rate": 1.143863465031364e-06, "loss": 0.0003, "step": 36080 }, { "epoch": 2.816670568953407, "grad_norm": 0.002450536238029599, "learning_rate": 1.134226984290937e-06, "loss": 0.8069, "step": 36090 }, { "epoch": 2.8174510263014128, "grad_norm": 0.24124084413051605, "learning_rate": 1.1246308010240625e-06, "loss": 0.7377, "step": 36100 }, { "epoch": 2.8182314836494187, "grad_norm": 8.658199262661639e-16, "learning_rate": 1.1150749231443102e-06, "loss": 0.7351, "step": 36110 }, { "epoch": 2.8190119409974246, "grad_norm": 0.6915903687477112, "learning_rate": 1.1055593585319824e-06, "loss": 0.0231, "step": 36120 }, { "epoch": 2.8197923983454305, "grad_norm": 297.76934814453125, "learning_rate": 1.0960841150341573e-06, "loss": 0.4391, "step": 36130 }, { "epoch": 2.8205728556934364, "grad_norm": 1.6281728743017254e-10, "learning_rate": 1.0866492004646455e-06, "loss": 0.7291, "step": 36140 }, { "epoch": 2.8213533130414423, "grad_norm": 253.92724609375, "learning_rate": 1.0772546226040291e-06, "loss": 1.4697, "step": 36150 }, { "epoch": 2.8221337703894482, "grad_norm": 0.0, "learning_rate": 1.0679003891995887e-06, "loss": 0.0234, "step": 36160 }, { "epoch": 2.822914227737454, "grad_norm": 34.89122009277344, "learning_rate": 1.058586507965359e-06, "loss": 0.2094, "step": 36170 }, { "epoch": 2.82369468508546, "grad_norm": 269.281982421875, "learning_rate": 1.049312986582096e-06, "loss": 0.4945, "step": 36180 }, { "epoch": 2.824475142433466, "grad_norm": 1.9220181703567505, "learning_rate": 1.040079832697266e-06, "loss": 0.001, "step": 36190 }, { "epoch": 2.825255599781472, "grad_norm": 242.4416046142578, "learning_rate": 1.0308870539250503e-06, "loss": 0.8693, "step": 36200 }, { "epoch": 2.826036057129478, "grad_norm": 7.059574604034424, "learning_rate": 1.021734657846324e-06, "loss": 0.003, "step": 36210 }, { "epoch": 2.8268165144774837, "grad_norm": 8.370619773864746, "learning_rate": 1.0126226520086824e-06, "loss": 0.8058, "step": 36220 }, { "epoch": 2.8275969718254896, "grad_norm": 3.950748542638045e-16, "learning_rate": 1.003551043926393e-06, "loss": 0.6948, "step": 36230 }, { "epoch": 2.8283774291734955, "grad_norm": 71.38811492919922, "learning_rate": 9.945198410804102e-07, "loss": 0.0513, "step": 36240 }, { "epoch": 2.8291578865215015, "grad_norm": 0.11276044696569443, "learning_rate": 9.855290509183824e-07, "loss": 0.0315, "step": 36250 }, { "epoch": 2.8299383438695074, "grad_norm": 133.40269470214844, "learning_rate": 9.765786808546228e-07, "loss": 0.9867, "step": 36260 }, { "epoch": 2.8307188012175137, "grad_norm": 316.03729248046875, "learning_rate": 9.676687382701054e-07, "loss": 1.9482, "step": 36270 }, { "epoch": 2.831499258565519, "grad_norm": 2.645971820553622e-13, "learning_rate": 9.587992305124748e-07, "loss": 0.2695, "step": 36280 }, { "epoch": 2.8322797159135256, "grad_norm": 2.3823126902178943e-14, "learning_rate": 9.499701648960302e-07, "loss": 0.2852, "step": 36290 }, { "epoch": 2.833060173261531, "grad_norm": 6.899634197310434e-09, "learning_rate": 9.411815487017084e-07, "loss": 1.3206, "step": 36300 }, { "epoch": 2.8338406306095374, "grad_norm": 13.426802635192871, "learning_rate": 9.324333891771064e-07, "loss": 0.0057, "step": 36310 }, { "epoch": 2.834621087957543, "grad_norm": 32.85948181152344, "learning_rate": 9.237256935364425e-07, "loss": 0.0127, "step": 36320 }, { "epoch": 2.835401545305549, "grad_norm": 0.058131031692028046, "learning_rate": 9.15058468960589e-07, "loss": 0.0143, "step": 36330 }, { "epoch": 2.836182002653555, "grad_norm": 256.7023620605469, "learning_rate": 9.06431722597012e-07, "loss": 1.7361, "step": 36340 }, { "epoch": 2.836962460001561, "grad_norm": 3.593015662772814e-07, "learning_rate": 8.978454615598209e-07, "loss": 0.5695, "step": 36350 }, { "epoch": 2.837742917349567, "grad_norm": 1.4453445384976504e-11, "learning_rate": 8.892996929297292e-07, "loss": 0.0975, "step": 36360 }, { "epoch": 2.838523374697573, "grad_norm": 0.0018407270545139909, "learning_rate": 8.807944237540666e-07, "loss": 0.0378, "step": 36370 }, { "epoch": 2.839303832045579, "grad_norm": 1.47364867704447e-13, "learning_rate": 8.723296610467446e-07, "loss": 0.0, "step": 36380 }, { "epoch": 2.8400842893935847, "grad_norm": 4.2558838397877e-12, "learning_rate": 8.639054117882905e-07, "loss": 0.1764, "step": 36390 }, { "epoch": 2.8408647467415906, "grad_norm": 3.332346343309922e-10, "learning_rate": 8.555216829258195e-07, "loss": 0.0023, "step": 36400 }, { "epoch": 2.8416452040895965, "grad_norm": 1.7551040373646465e-13, "learning_rate": 8.471784813730232e-07, "loss": 1.1252, "step": 36410 }, { "epoch": 2.8424256614376024, "grad_norm": 9.194217974055003e-19, "learning_rate": 8.388758140101815e-07, "loss": 0.538, "step": 36420 }, { "epoch": 2.8432061187856084, "grad_norm": 26.544153213500977, "learning_rate": 8.306136876841286e-07, "loss": 0.0204, "step": 36430 }, { "epoch": 2.8439865761336143, "grad_norm": 35.749874114990234, "learning_rate": 8.223921092083031e-07, "loss": 0.6544, "step": 36440 }, { "epoch": 2.84476703348162, "grad_norm": 1.4025773452885915e-05, "learning_rate": 8.142110853626594e-07, "loss": 0.0, "step": 36450 }, { "epoch": 2.845547490829626, "grad_norm": 1.284917203747682e-07, "learning_rate": 8.060706228937454e-07, "loss": 0.8626, "step": 36460 }, { "epoch": 2.846327948177632, "grad_norm": 6.08206844329834, "learning_rate": 7.979707285146353e-07, "loss": 0.0023, "step": 36470 }, { "epoch": 2.847108405525638, "grad_norm": 0.37371331453323364, "learning_rate": 7.899114089049697e-07, "loss": 0.5559, "step": 36480 }, { "epoch": 2.847888862873644, "grad_norm": 299.5367736816406, "learning_rate": 7.818926707109154e-07, "loss": 0.1753, "step": 36490 }, { "epoch": 2.8486693202216498, "grad_norm": 1.221839374920819e-05, "learning_rate": 7.739145205451714e-07, "loss": 0.9911, "step": 36500 }, { "epoch": 2.8494497775696557, "grad_norm": 1.6280899217235856e-05, "learning_rate": 7.659769649869752e-07, "loss": 0.3591, "step": 36510 }, { "epoch": 2.8502302349176616, "grad_norm": 278.3134460449219, "learning_rate": 7.580800105820796e-07, "loss": 0.6678, "step": 36520 }, { "epoch": 2.8510106922656675, "grad_norm": 0.000931643764488399, "learning_rate": 7.502236638427695e-07, "loss": 0.0144, "step": 36530 }, { "epoch": 2.851791149613674, "grad_norm": 339.99200439453125, "learning_rate": 7.424079312478127e-07, "loss": 0.8403, "step": 36540 }, { "epoch": 2.8525716069616793, "grad_norm": 4.815967763471818e-11, "learning_rate": 7.346328192425145e-07, "loss": 0.1756, "step": 36550 }, { "epoch": 2.8533520643096857, "grad_norm": 0.000435869034845382, "learning_rate": 7.268983342386737e-07, "loss": 0.0307, "step": 36560 }, { "epoch": 2.854132521657691, "grad_norm": 5.227929705142742e-06, "learning_rate": 7.192044826145771e-07, "loss": 1.2073, "step": 36570 }, { "epoch": 2.8549129790056975, "grad_norm": 1.5421983690089291e-09, "learning_rate": 7.115512707150162e-07, "loss": 0.1502, "step": 36580 }, { "epoch": 2.8556934363537034, "grad_norm": 2.3623229594704753e-07, "learning_rate": 7.03938704851248e-07, "loss": 0.2127, "step": 36590 }, { "epoch": 2.8564738937017093, "grad_norm": 0.0002894347708206624, "learning_rate": 6.963667913010397e-07, "loss": 0.3019, "step": 36600 }, { "epoch": 2.8572543510497153, "grad_norm": 5.101186957290338e-07, "learning_rate": 6.888355363086019e-07, "loss": 0.1228, "step": 36610 }, { "epoch": 2.858034808397721, "grad_norm": 154.7194061279297, "learning_rate": 6.813449460846444e-07, "loss": 0.1678, "step": 36620 }, { "epoch": 2.858815265745727, "grad_norm": 327.861328125, "learning_rate": 6.738950268063315e-07, "loss": 0.6057, "step": 36630 }, { "epoch": 2.859595723093733, "grad_norm": 0.0005815518088638783, "learning_rate": 6.664857846172822e-07, "loss": 0.4766, "step": 36640 }, { "epoch": 2.860376180441739, "grad_norm": 0.01614786870777607, "learning_rate": 6.591172256275702e-07, "loss": 0.6633, "step": 36650 }, { "epoch": 2.861156637789745, "grad_norm": 2.4461796283721924, "learning_rate": 6.5178935591374e-07, "loss": 0.1324, "step": 36660 }, { "epoch": 2.8619370951377507, "grad_norm": 4.134871958716424e-11, "learning_rate": 6.445021815187524e-07, "loss": 2.0777, "step": 36670 }, { "epoch": 2.8627175524857567, "grad_norm": 0.0005343449302017689, "learning_rate": 6.372557084520281e-07, "loss": 0.0769, "step": 36680 }, { "epoch": 2.8634980098337626, "grad_norm": 0.3777041435241699, "learning_rate": 6.300499426894202e-07, "loss": 1.8982, "step": 36690 }, { "epoch": 2.8642784671817685, "grad_norm": 1.9186600919152141e-10, "learning_rate": 6.228848901732031e-07, "loss": 0.3288, "step": 36700 }, { "epoch": 2.8650589245297744, "grad_norm": 3.368470788700506e-05, "learning_rate": 6.157605568120839e-07, "loss": 4.5806, "step": 36710 }, { "epoch": 2.8658393818777803, "grad_norm": 0.30120527744293213, "learning_rate": 6.08676948481196e-07, "loss": 0.0001, "step": 36720 }, { "epoch": 2.8666198392257862, "grad_norm": 0.01890019327402115, "learning_rate": 6.016340710220835e-07, "loss": 0.9956, "step": 36730 }, { "epoch": 2.867400296573792, "grad_norm": 8.265578799182549e-06, "learning_rate": 5.94631930242695e-07, "loss": 0.2812, "step": 36740 }, { "epoch": 2.868180753921798, "grad_norm": 302.0165710449219, "learning_rate": 5.876705319173892e-07, "loss": 3.4252, "step": 36750 }, { "epoch": 2.868961211269804, "grad_norm": 0.029607435688376427, "learning_rate": 5.80749881786935e-07, "loss": 0.0, "step": 36760 }, { "epoch": 2.86974166861781, "grad_norm": 1.4220939981157699e-09, "learning_rate": 5.73869985558484e-07, "loss": 0.7625, "step": 36770 }, { "epoch": 2.870522125965816, "grad_norm": 2.4834584166910645e-08, "learning_rate": 5.67030848905592e-07, "loss": 0.9126, "step": 36780 }, { "epoch": 2.871302583313822, "grad_norm": 0.07383527606725693, "learning_rate": 5.602324774681922e-07, "loss": 0.9126, "step": 36790 }, { "epoch": 2.8720830406618276, "grad_norm": 2.807852615660522e-05, "learning_rate": 5.53474876852611e-07, "loss": 0.0002, "step": 36800 }, { "epoch": 2.872863498009834, "grad_norm": 0.7243090867996216, "learning_rate": 5.467580526315408e-07, "loss": 1.1119, "step": 36810 }, { "epoch": 2.8736439553578395, "grad_norm": 9.3350399974065e-16, "learning_rate": 5.400820103440618e-07, "loss": 1.944, "step": 36820 }, { "epoch": 2.874424412705846, "grad_norm": 115.54177856445312, "learning_rate": 5.334467554956091e-07, "loss": 0.0732, "step": 36830 }, { "epoch": 2.8752048700538513, "grad_norm": 1.0314905011910014e-05, "learning_rate": 5.268522935579889e-07, "loss": 0.2186, "step": 36840 }, { "epoch": 2.8759853274018576, "grad_norm": 7.815126323862387e-10, "learning_rate": 5.202986299693679e-07, "loss": 0.0001, "step": 36850 }, { "epoch": 2.8767657847498636, "grad_norm": 5.589182497073235e-20, "learning_rate": 5.137857701342619e-07, "loss": 0.0, "step": 36860 }, { "epoch": 2.8775462420978695, "grad_norm": 9.388667621124114e-08, "learning_rate": 5.073137194235523e-07, "loss": 0.125, "step": 36870 }, { "epoch": 2.8783266994458754, "grad_norm": 1.190881082635542e-08, "learning_rate": 5.008824831744474e-07, "loss": 0.0003, "step": 36880 }, { "epoch": 2.8791071567938813, "grad_norm": 3.307443628985104e-17, "learning_rate": 4.944920666905162e-07, "loss": 0.0466, "step": 36890 }, { "epoch": 2.879887614141887, "grad_norm": 6.361972615387401e-10, "learning_rate": 4.881424752416541e-07, "loss": 0.0082, "step": 36900 }, { "epoch": 2.880668071489893, "grad_norm": 0.12347617745399475, "learning_rate": 4.818337140640894e-07, "loss": 0.0001, "step": 36910 }, { "epoch": 2.881448528837899, "grad_norm": 0.29002267122268677, "learning_rate": 4.755657883603826e-07, "loss": 0.0005, "step": 36920 }, { "epoch": 2.882228986185905, "grad_norm": 0.01090227346867323, "learning_rate": 4.6933870329941566e-07, "loss": 0.0094, "step": 36930 }, { "epoch": 2.883009443533911, "grad_norm": 0.003904677229002118, "learning_rate": 4.631524640164031e-07, "loss": 0.196, "step": 36940 }, { "epoch": 2.883789900881917, "grad_norm": 2.1362526416778564, "learning_rate": 4.5700707561286414e-07, "loss": 0.4271, "step": 36950 }, { "epoch": 2.8845703582299227, "grad_norm": 1.6145381472737474e-12, "learning_rate": 4.5090254315662826e-07, "loss": 3.4132, "step": 36960 }, { "epoch": 2.8853508155779286, "grad_norm": 1.6603729591224692e-07, "learning_rate": 4.4483887168184637e-07, "loss": 0.9031, "step": 36970 }, { "epoch": 2.8861312729259345, "grad_norm": 1.5513957676889246e-16, "learning_rate": 4.388160661889573e-07, "loss": 0.0732, "step": 36980 }, { "epoch": 2.8869117302739404, "grad_norm": 7.504744048425493e-11, "learning_rate": 4.328341316446993e-07, "loss": 0.0176, "step": 36990 }, { "epoch": 2.8876921876219463, "grad_norm": 0.0, "learning_rate": 4.268930729821319e-07, "loss": 1.3189, "step": 37000 }, { "epoch": 2.8884726449699523, "grad_norm": 0.6716959476470947, "learning_rate": 4.2099289510056926e-07, "loss": 1.6209, "step": 37010 }, { "epoch": 2.889253102317958, "grad_norm": 359.7210693359375, "learning_rate": 4.151336028656361e-07, "loss": 1.725, "step": 37020 }, { "epoch": 2.890033559665964, "grad_norm": 367.9685363769531, "learning_rate": 4.093152011092394e-07, "loss": 0.5489, "step": 37030 }, { "epoch": 2.8908140170139704, "grad_norm": 3.560751215821877e-21, "learning_rate": 4.0353769462956325e-07, "loss": 0.686, "step": 37040 }, { "epoch": 2.891594474361976, "grad_norm": 0.00012073075777152553, "learning_rate": 3.9780108819105766e-07, "loss": 0.0, "step": 37050 }, { "epoch": 2.8923749317099823, "grad_norm": 1.461513876914978, "learning_rate": 3.9210538652445503e-07, "loss": 0.0046, "step": 37060 }, { "epoch": 2.8931553890579877, "grad_norm": 6.231614112854004, "learning_rate": 3.864505943267593e-07, "loss": 1.5488, "step": 37070 }, { "epoch": 2.893935846405994, "grad_norm": 0.04269225895404816, "learning_rate": 3.8083671626122365e-07, "loss": 0.0004, "step": 37080 }, { "epoch": 2.8947163037539996, "grad_norm": 4.1955885535571724e-05, "learning_rate": 3.7526375695736694e-07, "loss": 1.2249, "step": 37090 }, { "epoch": 2.895496761102006, "grad_norm": 348.7253723144531, "learning_rate": 3.6973172101096856e-07, "loss": 0.1455, "step": 37100 }, { "epoch": 2.896277218450012, "grad_norm": 0.0001276261027669534, "learning_rate": 3.6424061298405697e-07, "loss": 0.1337, "step": 37110 }, { "epoch": 2.8970576757980178, "grad_norm": 168.937255859375, "learning_rate": 3.5879043740491557e-07, "loss": 0.2812, "step": 37120 }, { "epoch": 2.8978381331460237, "grad_norm": 2.2684753275825642e-05, "learning_rate": 3.533811987680602e-07, "loss": 0.2242, "step": 37130 }, { "epoch": 2.8986185904940296, "grad_norm": 0.010522971861064434, "learning_rate": 3.480129015342559e-07, "loss": 0.2176, "step": 37140 }, { "epoch": 2.8993990478420355, "grad_norm": 8.882963697942614e-07, "learning_rate": 3.42685550130506e-07, "loss": 1.5899, "step": 37150 }, { "epoch": 2.9001795051900414, "grad_norm": 0.0, "learning_rate": 3.37399148950035e-07, "loss": 0.2396, "step": 37160 }, { "epoch": 2.9009599625380473, "grad_norm": 0.022511860355734825, "learning_rate": 3.3215370235232246e-07, "loss": 0.0, "step": 37170 }, { "epoch": 2.9017404198860532, "grad_norm": 3.4341320991516113, "learning_rate": 3.269492146630471e-07, "loss": 0.4312, "step": 37180 }, { "epoch": 2.902520877234059, "grad_norm": 1.03269021565211e-05, "learning_rate": 3.217856901741312e-07, "loss": 0.005, "step": 37190 }, { "epoch": 2.903301334582065, "grad_norm": 37.59806442260742, "learning_rate": 3.1666313314370757e-07, "loss": 0.3451, "step": 37200 }, { "epoch": 2.904081791930071, "grad_norm": 0.0004427414678502828, "learning_rate": 3.1158154779611927e-07, "loss": 0.0069, "step": 37210 }, { "epoch": 2.904862249278077, "grad_norm": 1.5279600620269775, "learning_rate": 3.065409383219364e-07, "loss": 0.0609, "step": 37220 }, { "epoch": 2.905642706626083, "grad_norm": 0.007380710914731026, "learning_rate": 3.015413088779229e-07, "loss": 0.3202, "step": 37230 }, { "epoch": 2.9064231639740887, "grad_norm": 6.796607954174735e-14, "learning_rate": 2.965826635870583e-07, "loss": 0.2579, "step": 37240 }, { "epoch": 2.9072036213220946, "grad_norm": 6.750393338940684e-17, "learning_rate": 2.916650065385218e-07, "loss": 0.0001, "step": 37250 }, { "epoch": 2.9079840786701006, "grad_norm": 0.0006772010819986463, "learning_rate": 2.867883417876971e-07, "loss": 0.0192, "step": 37260 }, { "epoch": 2.9087645360181065, "grad_norm": 2.4011129252953824e-14, "learning_rate": 2.819526733561451e-07, "loss": 0.0001, "step": 37270 }, { "epoch": 2.9095449933661124, "grad_norm": 0.01254895981401205, "learning_rate": 2.771580052316369e-07, "loss": 0.003, "step": 37280 }, { "epoch": 2.9103254507141187, "grad_norm": 4.720600799146268e-09, "learning_rate": 2.7240434136812656e-07, "loss": 0.9636, "step": 37290 }, { "epoch": 2.911105908062124, "grad_norm": 1.525995321571827e-05, "learning_rate": 2.676916856857503e-07, "loss": 0.0058, "step": 37300 }, { "epoch": 2.9118863654101306, "grad_norm": 0.05982762202620506, "learning_rate": 2.6302004207083284e-07, "loss": 0.6551, "step": 37310 }, { "epoch": 2.912666822758136, "grad_norm": 0.000589911243878305, "learning_rate": 2.583894143758758e-07, "loss": 0.0857, "step": 37320 }, { "epoch": 2.9134472801061424, "grad_norm": 0.11605760455131531, "learning_rate": 2.537998064195579e-07, "loss": 0.0133, "step": 37330 }, { "epoch": 2.914227737454148, "grad_norm": 0.3421231508255005, "learning_rate": 2.4925122198671823e-07, "loss": 0.6504, "step": 37340 }, { "epoch": 2.9150081948021542, "grad_norm": 302.4140319824219, "learning_rate": 2.447436648283896e-07, "loss": 0.4466, "step": 37350 }, { "epoch": 2.91578865215016, "grad_norm": 0.0016285435995087028, "learning_rate": 2.4027713866175415e-07, "loss": 0.2798, "step": 37360 }, { "epoch": 2.916569109498166, "grad_norm": 5.190363694396183e-10, "learning_rate": 2.358516471701544e-07, "loss": 0.0, "step": 37370 }, { "epoch": 2.917349566846172, "grad_norm": 2.233219901664949e-17, "learning_rate": 2.314671940031099e-07, "loss": 0.0486, "step": 37380 }, { "epoch": 2.918130024194178, "grad_norm": 2.173655033743671e-09, "learning_rate": 2.2712378277627843e-07, "loss": 0.4348, "step": 37390 }, { "epoch": 2.918910481542184, "grad_norm": 336.09649658203125, "learning_rate": 2.228214170714893e-07, "loss": 0.5809, "step": 37400 }, { "epoch": 2.9196909388901897, "grad_norm": 147.56405639648438, "learning_rate": 2.1856010043671548e-07, "loss": 0.0556, "step": 37410 }, { "epoch": 2.9204713962381956, "grad_norm": 1.872330201990735e-08, "learning_rate": 2.143398363860738e-07, "loss": 0.0394, "step": 37420 }, { "epoch": 2.9212518535862015, "grad_norm": 5.727499683416681e-06, "learning_rate": 2.1016062839983587e-07, "loss": 0.0003, "step": 37430 }, { "epoch": 2.9220323109342075, "grad_norm": 68.21370697021484, "learning_rate": 2.0602247992441704e-07, "loss": 0.0325, "step": 37440 }, { "epoch": 2.9228127682822134, "grad_norm": 79.4469985961914, "learning_rate": 2.019253943723598e-07, "loss": 0.0433, "step": 37450 }, { "epoch": 2.9235932256302193, "grad_norm": 3.062634368333761e-09, "learning_rate": 1.9786937512235591e-07, "loss": 0.9574, "step": 37460 }, { "epoch": 2.924373682978225, "grad_norm": 0.0030439510010182858, "learning_rate": 1.9385442551922428e-07, "loss": 0.2112, "step": 37470 }, { "epoch": 2.925154140326231, "grad_norm": 3.254121012366204e-15, "learning_rate": 1.8988054887392747e-07, "loss": 0.2216, "step": 37480 }, { "epoch": 2.925934597674237, "grad_norm": 1.1417990890549845e-06, "learning_rate": 1.8594774846353857e-07, "loss": 0.7698, "step": 37490 }, { "epoch": 2.926715055022243, "grad_norm": 4.754112346307647e-14, "learning_rate": 1.8205602753127438e-07, "loss": 0.0678, "step": 37500 }, { "epoch": 2.927495512370249, "grad_norm": 1.8571523696664372e-07, "learning_rate": 1.7820538928646213e-07, "loss": 0.5082, "step": 37510 }, { "epoch": 2.9282759697182548, "grad_norm": 0.005671486724168062, "learning_rate": 1.7439583690455618e-07, "loss": 1.6796, "step": 37520 }, { "epoch": 2.9290564270662607, "grad_norm": 8.371974224985479e-10, "learning_rate": 1.7062737352713244e-07, "loss": 0.0002, "step": 37530 }, { "epoch": 2.9298368844142666, "grad_norm": 343.91131591796875, "learning_rate": 1.6690000226187164e-07, "loss": 3.0723, "step": 37540 }, { "epoch": 2.9306173417622725, "grad_norm": 0.007303249090909958, "learning_rate": 1.6321372618257613e-07, "loss": 0.8656, "step": 37550 }, { "epoch": 2.931397799110279, "grad_norm": 274.3439025878906, "learning_rate": 1.5956854832916425e-07, "loss": 0.3517, "step": 37560 }, { "epoch": 2.9321782564582843, "grad_norm": 5.482262355942322e-15, "learning_rate": 1.5596447170764806e-07, "loss": 0.2141, "step": 37570 }, { "epoch": 2.9329587138062907, "grad_norm": 3.0375571849483585e-13, "learning_rate": 1.5240149929016122e-07, "loss": 0.0001, "step": 37580 }, { "epoch": 2.933739171154296, "grad_norm": 0.00010239938274025917, "learning_rate": 1.488796340149201e-07, "loss": 1.1344, "step": 37590 }, { "epoch": 2.9345196285023025, "grad_norm": 1.0532202168178628e-06, "learning_rate": 1.45398878786257e-07, "loss": 2.5855, "step": 37600 }, { "epoch": 2.9353000858503084, "grad_norm": 0.02686687372624874, "learning_rate": 1.4195923647460364e-07, "loss": 0.0826, "step": 37610 }, { "epoch": 2.9360805431983144, "grad_norm": 6.217739610292483e-06, "learning_rate": 1.3856070991647985e-07, "loss": 0.1367, "step": 37620 }, { "epoch": 2.9368610005463203, "grad_norm": 313.4095153808594, "learning_rate": 1.3520330191449936e-07, "loss": 0.9538, "step": 37630 }, { "epoch": 2.937641457894326, "grad_norm": 2.7152030759392334e-15, "learning_rate": 1.318870152373808e-07, "loss": 0.1295, "step": 37640 }, { "epoch": 2.938421915242332, "grad_norm": 3.337225393806875e-07, "learning_rate": 1.2861185261990872e-07, "loss": 0.0, "step": 37650 }, { "epoch": 2.939202372590338, "grad_norm": 282.7817687988281, "learning_rate": 1.2537781676297268e-07, "loss": 3.676, "step": 37660 }, { "epoch": 2.939982829938344, "grad_norm": 8.363390406884719e-06, "learning_rate": 1.2218491033354484e-07, "loss": 0.0, "step": 37670 }, { "epoch": 2.94076328728635, "grad_norm": 0.0, "learning_rate": 1.1903313596466903e-07, "loss": 0.0003, "step": 37680 }, { "epoch": 2.9415437446343558, "grad_norm": 0.005267214495688677, "learning_rate": 1.1592249625548279e-07, "loss": 1.2254, "step": 37690 }, { "epoch": 2.9423242019823617, "grad_norm": 7.535186918872228e-10, "learning_rate": 1.1285299377118974e-07, "loss": 0.7656, "step": 37700 }, { "epoch": 2.9431046593303676, "grad_norm": 59.69273376464844, "learning_rate": 1.0982463104307617e-07, "loss": 0.158, "step": 37710 }, { "epoch": 2.9438851166783735, "grad_norm": 7.175589416874573e-05, "learning_rate": 1.0683741056849994e-07, "loss": 0.6752, "step": 37720 }, { "epoch": 2.9446655740263794, "grad_norm": 334.2007141113281, "learning_rate": 1.0389133481089608e-07, "loss": 1.0146, "step": 37730 }, { "epoch": 2.9454460313743853, "grad_norm": 2.3582126118526503e-07, "learning_rate": 1.0098640619976007e-07, "loss": 0.8188, "step": 37740 }, { "epoch": 2.9462264887223912, "grad_norm": 0.0169204231351614, "learning_rate": 9.812262713066455e-08, "loss": 0.005, "step": 37750 }, { "epoch": 2.947006946070397, "grad_norm": 52.56903076171875, "learning_rate": 9.529999996524264e-08, "loss": 0.4055, "step": 37760 }, { "epoch": 2.947787403418403, "grad_norm": 2.238950173705234e-06, "learning_rate": 9.251852703118235e-08, "loss": 0.0862, "step": 37770 }, { "epoch": 2.948567860766409, "grad_norm": 0.10402812063694, "learning_rate": 8.977821062225444e-08, "loss": 1.1434, "step": 37780 }, { "epoch": 2.949348318114415, "grad_norm": 7.077328205108643, "learning_rate": 8.707905299827346e-08, "loss": 0.0021, "step": 37790 }, { "epoch": 2.950128775462421, "grad_norm": 0.0004994513583369553, "learning_rate": 8.442105638511998e-08, "loss": 0.5704, "step": 37800 }, { "epoch": 2.950909232810427, "grad_norm": 34.23684310913086, "learning_rate": 8.180422297472956e-08, "loss": 1.4182, "step": 37810 }, { "epoch": 2.9516896901584326, "grad_norm": 0.00012013528612442315, "learning_rate": 7.922855492508153e-08, "loss": 0.0009, "step": 37820 }, { "epoch": 2.952470147506439, "grad_norm": 8.701053072490694e-11, "learning_rate": 7.66940543602268e-08, "loss": 0.0001, "step": 37830 }, { "epoch": 2.9532506048544445, "grad_norm": 3.51215390104187e-09, "learning_rate": 7.420072337025464e-08, "loss": 0.0023, "step": 37840 }, { "epoch": 2.954031062202451, "grad_norm": 3.1763735522036263e-22, "learning_rate": 7.174856401129804e-08, "loss": 0.0, "step": 37850 }, { "epoch": 2.9548115195504567, "grad_norm": 4.604586711920433e-10, "learning_rate": 6.933757830556164e-08, "loss": 0.3049, "step": 37860 }, { "epoch": 2.9555919768984626, "grad_norm": 0.0019449929241091013, "learning_rate": 6.696776824127171e-08, "loss": 0.0023, "step": 37870 }, { "epoch": 2.9563724342464686, "grad_norm": 168.966552734375, "learning_rate": 6.463913577270386e-08, "loss": 0.0901, "step": 37880 }, { "epoch": 2.9571528915944745, "grad_norm": 2.1862453181142882e-08, "learning_rate": 6.235168282018866e-08, "loss": 0.0011, "step": 37890 }, { "epoch": 2.9579333489424804, "grad_norm": 5.355248049454531e-07, "learning_rate": 6.01054112700783e-08, "loss": 0.0021, "step": 37900 }, { "epoch": 2.9587138062904863, "grad_norm": 4.50494515348847e-15, "learning_rate": 5.7900322974785427e-08, "loss": 0.6629, "step": 37910 }, { "epoch": 2.959494263638492, "grad_norm": 504.38427734375, "learning_rate": 5.57364197527388e-08, "loss": 0.7332, "step": 37920 }, { "epoch": 2.960274720986498, "grad_norm": 1.0105886960598554e-10, "learning_rate": 5.361370338842764e-08, "loss": 0.0002, "step": 37930 }, { "epoch": 2.961055178334504, "grad_norm": 5.368253255255695e-07, "learning_rate": 5.153217563235724e-08, "loss": 0.1828, "step": 37940 }, { "epoch": 2.96183563568251, "grad_norm": 3.649755736349902e-13, "learning_rate": 4.9491838201076727e-08, "loss": 0.0, "step": 37950 }, { "epoch": 2.962616093030516, "grad_norm": 0.0007102579111233354, "learning_rate": 4.749269277715685e-08, "loss": 0.0084, "step": 37960 }, { "epoch": 2.963396550378522, "grad_norm": 71.00413513183594, "learning_rate": 4.553474100920663e-08, "loss": 0.7264, "step": 37970 }, { "epoch": 2.9641770077265277, "grad_norm": 8.966297855295124e-07, "learning_rate": 4.361798451186783e-08, "loss": 0.6338, "step": 37980 }, { "epoch": 2.9649574650745336, "grad_norm": 0.019383694976568222, "learning_rate": 4.174242486580382e-08, "loss": 0.0, "step": 37990 }, { "epoch": 2.9657379224225395, "grad_norm": 6.672384188277647e-05, "learning_rate": 3.9908063617710714e-08, "loss": 0.3395, "step": 38000 }, { "epoch": 2.9665183797705454, "grad_norm": 229.11349487304688, "learning_rate": 3.811490228030068e-08, "loss": 0.2167, "step": 38010 }, { "epoch": 2.9672988371185514, "grad_norm": 0.004529143683612347, "learning_rate": 3.6362942332313075e-08, "loss": 0.0106, "step": 38020 }, { "epoch": 2.9680792944665573, "grad_norm": 0.0003937554429285228, "learning_rate": 3.4652185218525535e-08, "loss": 0.0688, "step": 38030 }, { "epoch": 2.968859751814563, "grad_norm": 213.34629821777344, "learning_rate": 3.298263234970955e-08, "loss": 0.2664, "step": 38040 }, { "epoch": 2.969640209162569, "grad_norm": 3.023458219897357e-13, "learning_rate": 3.135428510268601e-08, "loss": 0.0103, "step": 38050 }, { "epoch": 2.9704206665105755, "grad_norm": 0.4616576135158539, "learning_rate": 2.9767144820275207e-08, "loss": 0.2242, "step": 38060 }, { "epoch": 2.971201123858581, "grad_norm": 0.009038741700351238, "learning_rate": 2.8221212811324615e-08, "loss": 0.177, "step": 38070 }, { "epoch": 2.9719815812065873, "grad_norm": 0.0001392298872815445, "learning_rate": 2.6716490350692237e-08, "loss": 0.4559, "step": 38080 }, { "epoch": 2.9727620385545928, "grad_norm": 2.2928576253616623e-15, "learning_rate": 2.525297867926324e-08, "loss": 0.0002, "step": 38090 }, { "epoch": 2.973542495902599, "grad_norm": 5.54908638150664e-07, "learning_rate": 2.3830679003927768e-08, "loss": 1.0038, "step": 38100 }, { "epoch": 2.9743229532506046, "grad_norm": 4.5243123136806673e-10, "learning_rate": 2.2449592497597593e-08, "loss": 2.0914, "step": 38110 }, { "epoch": 2.975103410598611, "grad_norm": 3.340051625855267e-05, "learning_rate": 2.110972029918945e-08, "loss": 0.2649, "step": 38120 }, { "epoch": 2.975883867946617, "grad_norm": 1.426492843847882e-07, "learning_rate": 1.9811063513647256e-08, "loss": 0.3439, "step": 38130 }, { "epoch": 2.9766643252946228, "grad_norm": 2.1971930890240512e-15, "learning_rate": 1.8553623211903236e-08, "loss": 0.0, "step": 38140 }, { "epoch": 2.9774447826426287, "grad_norm": 5.232332900284575e-10, "learning_rate": 1.733740043092791e-08, "loss": 1.475, "step": 38150 }, { "epoch": 2.9782252399906346, "grad_norm": 0.12338301539421082, "learning_rate": 1.6162396173674544e-08, "loss": 0.0095, "step": 38160 }, { "epoch": 2.9790056973386405, "grad_norm": 4.5922202843939885e-05, "learning_rate": 1.5028611409129146e-08, "loss": 1.14, "step": 38170 }, { "epoch": 2.9797861546866464, "grad_norm": 0.004267883487045765, "learning_rate": 1.3936047072266034e-08, "loss": 0.0202, "step": 38180 }, { "epoch": 2.9805666120346523, "grad_norm": 3.284040417383949e-07, "learning_rate": 1.2884704064075603e-08, "loss": 0.0, "step": 38190 }, { "epoch": 2.9813470693826583, "grad_norm": 0.00014708541857544333, "learning_rate": 1.187458325156432e-08, "loss": 1.3839, "step": 38200 }, { "epoch": 2.982127526730664, "grad_norm": 1.2784296359313885e-06, "learning_rate": 1.0905685467721417e-08, "loss": 1.0751, "step": 38210 }, { "epoch": 2.98290798407867, "grad_norm": 7.825324535369873, "learning_rate": 9.9780115115633e-09, "loss": 0.7728, "step": 38220 }, { "epoch": 2.983688441426676, "grad_norm": 2.2648984909778846e-08, "learning_rate": 9.091562148100252e-09, "loss": 0.6783, "step": 38230 }, { "epoch": 2.984468898774682, "grad_norm": 0.1770647168159485, "learning_rate": 8.246338108347518e-09, "loss": 1.2771, "step": 38240 }, { "epoch": 2.985249356122688, "grad_norm": 0.0, "learning_rate": 7.442340089330868e-09, "loss": 0.6814, "step": 38250 }, { "epoch": 2.9860298134706937, "grad_norm": 2.466651380927942e-07, "learning_rate": 6.67956875405884e-09, "loss": 0.2075, "step": 38260 }, { "epoch": 2.9868102708186997, "grad_norm": 6.2170888486738456e-15, "learning_rate": 5.958024731567147e-09, "loss": 1.4441, "step": 38270 }, { "epoch": 2.9875907281667056, "grad_norm": 2.349893390984903e-09, "learning_rate": 5.277708616879817e-09, "loss": 0.0633, "step": 38280 }, { "epoch": 2.9883711855147115, "grad_norm": 4.472295131563442e-06, "learning_rate": 4.638620971020302e-09, "loss": 1.0314, "step": 38290 }, { "epoch": 2.9891516428627174, "grad_norm": 2.1543214254056445e-21, "learning_rate": 4.0407623210225735e-09, "loss": 0.8742, "step": 38300 }, { "epoch": 2.9899321002107238, "grad_norm": 2.3384396446269684e-15, "learning_rate": 3.484133159903369e-09, "loss": 0.016, "step": 38310 }, { "epoch": 2.9907125575587292, "grad_norm": 176.8972930908203, "learning_rate": 2.9687339467010523e-09, "loss": 0.156, "step": 38320 }, { "epoch": 2.9914930149067356, "grad_norm": 403.177734375, "learning_rate": 2.4945651064423036e-09, "loss": 0.9814, "step": 38330 }, { "epoch": 2.992273472254741, "grad_norm": 0.0004511200822889805, "learning_rate": 2.0616270301476727e-09, "loss": 0.2183, "step": 38340 }, { "epoch": 2.9930539296027474, "grad_norm": 0.0015130466781556606, "learning_rate": 1.6699200748482301e-09, "loss": 0.0, "step": 38350 }, { "epoch": 2.993834386950753, "grad_norm": 31.965559005737305, "learning_rate": 1.3194445635578144e-09, "loss": 0.5585, "step": 38360 }, { "epoch": 2.9946148442987592, "grad_norm": 3.413600999091493e-11, "learning_rate": 1.0102007853118878e-09, "loss": 0.2753, "step": 38370 }, { "epoch": 2.995395301646765, "grad_norm": 7.039468571539655e-21, "learning_rate": 7.421889951231276e-10, "loss": 0.0101, "step": 38380 }, { "epoch": 2.996175758994771, "grad_norm": 0.17176811397075653, "learning_rate": 5.154094140036314e-10, "loss": 1.0048, "step": 38390 }, { "epoch": 2.996956216342777, "grad_norm": 3.7634078076109745e-09, "learning_rate": 3.2986222898157005e-10, "loss": 0.857, "step": 38400 }, { "epoch": 2.997736673690783, "grad_norm": 9.749450691742823e-05, "learning_rate": 1.855475930567785e-10, "loss": 0.0004, "step": 38410 }, { "epoch": 2.998517131038789, "grad_norm": 0.10597077757120132, "learning_rate": 8.24656252507161e-11, "loss": 1.1732, "step": 38420 }, { "epoch": 2.9992975883867947, "grad_norm": 4.119867882040667e-15, "learning_rate": 2.0616410562057653e-11, "loss": 0.1238, "step": 38430 } ], "logging_steps": 10, "max_steps": 38439, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }