diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26935 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 38439, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007804573480059315, + "grad_norm": 70.57565307617188, + "learning_rate": 2.3413111342351717e-07, + "loss": 0.8495, + "step": 10 + }, + { + "epoch": 0.001560914696011863, + "grad_norm": 60.91403579711914, + "learning_rate": 4.94276795005203e-07, + "loss": 0.9127, + "step": 20 + }, + { + "epoch": 0.0023413720440177946, + "grad_norm": 35.46073532104492, + "learning_rate": 7.544224765868887e-07, + "loss": 0.7408, + "step": 30 + }, + { + "epoch": 0.003121829392023726, + "grad_norm": 28.7984619140625, + "learning_rate": 1.0145681581685745e-06, + "loss": 0.7778, + "step": 40 + }, + { + "epoch": 0.0039022867400296574, + "grad_norm": 24.812997817993164, + "learning_rate": 1.2747138397502602e-06, + "loss": 0.6852, + "step": 50 + }, + { + "epoch": 0.004682744088035589, + "grad_norm": 39.767250061035156, + "learning_rate": 1.534859521331946e-06, + "loss": 0.7863, + "step": 60 + }, + { + "epoch": 0.005463201436041521, + "grad_norm": 37.76068878173828, + "learning_rate": 1.7950052029136317e-06, + "loss": 0.8252, + "step": 70 + }, + { + "epoch": 0.006243658784047452, + "grad_norm": 63.68617248535156, + "learning_rate": 2.0551508844953173e-06, + "loss": 0.7234, + "step": 80 + }, + { + "epoch": 0.007024116132053383, + "grad_norm": 23.260103225708008, + "learning_rate": 2.315296566077003e-06, + "loss": 0.811, + "step": 90 + }, + { + "epoch": 0.007804573480059315, + "grad_norm": 26.783702850341797, + "learning_rate": 2.575442247658689e-06, + "loss": 0.5367, + "step": 100 + }, + { + "epoch": 0.008585030828065247, + "grad_norm": 64.16069030761719, + "learning_rate": 2.835587929240375e-06, + "loss": 0.6239, + "step": 110 + }, + { + "epoch": 0.009365488176071178, + "grad_norm": 23.482328414916992, + "learning_rate": 3.0957336108220607e-06, + "loss": 0.6538, + "step": 120 + }, + { + "epoch": 0.01014594552407711, + "grad_norm": 47.013214111328125, + "learning_rate": 3.3558792924037465e-06, + "loss": 0.5528, + "step": 130 + }, + { + "epoch": 0.010926402872083041, + "grad_norm": 57.85599136352539, + "learning_rate": 3.616024973985432e-06, + "loss": 1.022, + "step": 140 + }, + { + "epoch": 0.011706860220088973, + "grad_norm": 36.2352294921875, + "learning_rate": 3.8761706555671176e-06, + "loss": 0.5808, + "step": 150 + }, + { + "epoch": 0.012487317568094904, + "grad_norm": 55.68172836303711, + "learning_rate": 4.136316337148804e-06, + "loss": 0.8512, + "step": 160 + }, + { + "epoch": 0.013267774916100835, + "grad_norm": 56.305442810058594, + "learning_rate": 4.396462018730489e-06, + "loss": 0.9563, + "step": 170 + }, + { + "epoch": 0.014048232264106767, + "grad_norm": 69.95011901855469, + "learning_rate": 4.656607700312175e-06, + "loss": 1.0437, + "step": 180 + }, + { + "epoch": 0.014828689612112698, + "grad_norm": 86.20785522460938, + "learning_rate": 4.916753381893861e-06, + "loss": 1.1461, + "step": 190 + }, + { + "epoch": 0.01560914696011863, + "grad_norm": 61.00738525390625, + "learning_rate": 5.176899063475546e-06, + "loss": 0.6641, + "step": 200 + }, + { + "epoch": 0.016389604308124563, + "grad_norm": 27.987585067749023, + "learning_rate": 5.437044745057232e-06, + "loss": 1.0178, + "step": 210 + }, + { + "epoch": 0.017170061656130494, + "grad_norm": 50.186649322509766, + "learning_rate": 5.697190426638918e-06, + "loss": 0.9153, + "step": 220 + }, + { + "epoch": 0.017950519004136425, + "grad_norm": 44.61861038208008, + "learning_rate": 5.957336108220604e-06, + "loss": 0.7384, + "step": 230 + }, + { + "epoch": 0.018730976352142357, + "grad_norm": 44.65412902832031, + "learning_rate": 6.217481789802289e-06, + "loss": 0.9101, + "step": 240 + }, + { + "epoch": 0.019511433700148288, + "grad_norm": 43.09900665283203, + "learning_rate": 6.477627471383975e-06, + "loss": 0.815, + "step": 250 + }, + { + "epoch": 0.02029189104815422, + "grad_norm": 26.3706111907959, + "learning_rate": 6.737773152965661e-06, + "loss": 0.8089, + "step": 260 + }, + { + "epoch": 0.02107234839616015, + "grad_norm": 23.912540435791016, + "learning_rate": 6.997918834547347e-06, + "loss": 0.5597, + "step": 270 + }, + { + "epoch": 0.021852805744166082, + "grad_norm": 43.356571197509766, + "learning_rate": 7.258064516129033e-06, + "loss": 0.776, + "step": 280 + }, + { + "epoch": 0.022633263092172014, + "grad_norm": 47.17486572265625, + "learning_rate": 7.518210197710718e-06, + "loss": 0.6612, + "step": 290 + }, + { + "epoch": 0.023413720440177945, + "grad_norm": 27.746665954589844, + "learning_rate": 7.778355879292404e-06, + "loss": 0.9178, + "step": 300 + }, + { + "epoch": 0.024194177788183877, + "grad_norm": 37.433265686035156, + "learning_rate": 8.03850156087409e-06, + "loss": 0.7673, + "step": 310 + }, + { + "epoch": 0.024974635136189808, + "grad_norm": 71.587646484375, + "learning_rate": 8.298647242455775e-06, + "loss": 1.0383, + "step": 320 + }, + { + "epoch": 0.02575509248419574, + "grad_norm": 56.9047737121582, + "learning_rate": 8.558792924037463e-06, + "loss": 0.8289, + "step": 330 + }, + { + "epoch": 0.02653554983220167, + "grad_norm": 98.57324981689453, + "learning_rate": 8.818938605619147e-06, + "loss": 1.0829, + "step": 340 + }, + { + "epoch": 0.027316007180207602, + "grad_norm": 57.22220230102539, + "learning_rate": 9.079084287200833e-06, + "loss": 1.0618, + "step": 350 + }, + { + "epoch": 0.028096464528213533, + "grad_norm": 37.49354934692383, + "learning_rate": 9.339229968782518e-06, + "loss": 0.8915, + "step": 360 + }, + { + "epoch": 0.028876921876219465, + "grad_norm": 29.789419174194336, + "learning_rate": 9.599375650364204e-06, + "loss": 0.6783, + "step": 370 + }, + { + "epoch": 0.029657379224225396, + "grad_norm": 75.5431137084961, + "learning_rate": 9.85952133194589e-06, + "loss": 1.0771, + "step": 380 + }, + { + "epoch": 0.030437836572231328, + "grad_norm": 46.971492767333984, + "learning_rate": 1.0119667013527576e-05, + "loss": 0.9261, + "step": 390 + }, + { + "epoch": 0.03121829392023726, + "grad_norm": 28.467092514038086, + "learning_rate": 1.0379812695109261e-05, + "loss": 0.6424, + "step": 400 + }, + { + "epoch": 0.03199875126824319, + "grad_norm": 70.29894256591797, + "learning_rate": 1.0639958376690947e-05, + "loss": 0.7372, + "step": 410 + }, + { + "epoch": 0.032779208616249125, + "grad_norm": 47.83068084716797, + "learning_rate": 1.0900104058272635e-05, + "loss": 0.8506, + "step": 420 + }, + { + "epoch": 0.03355966596425505, + "grad_norm": 114.6404037475586, + "learning_rate": 1.1160249739854319e-05, + "loss": 0.754, + "step": 430 + }, + { + "epoch": 0.03434012331226099, + "grad_norm": 37.66133499145508, + "learning_rate": 1.1420395421436004e-05, + "loss": 0.7524, + "step": 440 + }, + { + "epoch": 0.035120580660266916, + "grad_norm": 69.27783203125, + "learning_rate": 1.168054110301769e-05, + "loss": 0.7973, + "step": 450 + }, + { + "epoch": 0.03590103800827285, + "grad_norm": 40.7580451965332, + "learning_rate": 1.1940686784599376e-05, + "loss": 1.1207, + "step": 460 + }, + { + "epoch": 0.03668149535627878, + "grad_norm": 69.8377685546875, + "learning_rate": 1.2200832466181062e-05, + "loss": 0.7528, + "step": 470 + }, + { + "epoch": 0.037461952704284714, + "grad_norm": 61.14483642578125, + "learning_rate": 1.2460978147762747e-05, + "loss": 0.5989, + "step": 480 + }, + { + "epoch": 0.03824241005229064, + "grad_norm": 42.87925338745117, + "learning_rate": 1.2721123829344433e-05, + "loss": 1.0347, + "step": 490 + }, + { + "epoch": 0.039022867400296576, + "grad_norm": 82.56037902832031, + "learning_rate": 1.2981269510926119e-05, + "loss": 0.7157, + "step": 500 + }, + { + "epoch": 0.039803324748302504, + "grad_norm": 67.13080596923828, + "learning_rate": 1.3241415192507806e-05, + "loss": 0.5716, + "step": 510 + }, + { + "epoch": 0.04058378209630844, + "grad_norm": 26.124019622802734, + "learning_rate": 1.3501560874089492e-05, + "loss": 0.5971, + "step": 520 + }, + { + "epoch": 0.04136423944431437, + "grad_norm": 43.60024642944336, + "learning_rate": 1.3761706555671178e-05, + "loss": 0.6868, + "step": 530 + }, + { + "epoch": 0.0421446967923203, + "grad_norm": 72.22933959960938, + "learning_rate": 1.402185223725286e-05, + "loss": 0.8075, + "step": 540 + }, + { + "epoch": 0.04292515414032623, + "grad_norm": 51.11296081542969, + "learning_rate": 1.4281997918834548e-05, + "loss": 0.5868, + "step": 550 + }, + { + "epoch": 0.043705611488332165, + "grad_norm": 59.321163177490234, + "learning_rate": 1.4542143600416234e-05, + "loss": 0.9031, + "step": 560 + }, + { + "epoch": 0.04448606883633809, + "grad_norm": 14.339578628540039, + "learning_rate": 1.480228928199792e-05, + "loss": 0.5629, + "step": 570 + }, + { + "epoch": 0.04526652618434403, + "grad_norm": 14.040125846862793, + "learning_rate": 1.5062434963579605e-05, + "loss": 0.8403, + "step": 580 + }, + { + "epoch": 0.046046983532349955, + "grad_norm": 43.124237060546875, + "learning_rate": 1.5322580645161292e-05, + "loss": 0.7625, + "step": 590 + }, + { + "epoch": 0.04682744088035589, + "grad_norm": 109.8456802368164, + "learning_rate": 1.5582726326742977e-05, + "loss": 1.2096, + "step": 600 + }, + { + "epoch": 0.04760789822836182, + "grad_norm": 28.653745651245117, + "learning_rate": 1.5842872008324664e-05, + "loss": 1.6295, + "step": 610 + }, + { + "epoch": 0.04838835557636775, + "grad_norm": 31.663606643676758, + "learning_rate": 1.6103017689906348e-05, + "loss": 0.7588, + "step": 620 + }, + { + "epoch": 0.04916881292437368, + "grad_norm": 40.83611297607422, + "learning_rate": 1.6363163371488032e-05, + "loss": 0.707, + "step": 630 + }, + { + "epoch": 0.049949270272379616, + "grad_norm": 16.77260398864746, + "learning_rate": 1.662330905306972e-05, + "loss": 1.0579, + "step": 640 + }, + { + "epoch": 0.050729727620385544, + "grad_norm": 16.65311622619629, + "learning_rate": 1.6883454734651404e-05, + "loss": 0.309, + "step": 650 + }, + { + "epoch": 0.05151018496839148, + "grad_norm": 83.49839782714844, + "learning_rate": 1.714360041623309e-05, + "loss": 1.103, + "step": 660 + }, + { + "epoch": 0.05229064231639741, + "grad_norm": 21.528764724731445, + "learning_rate": 1.740374609781478e-05, + "loss": 0.651, + "step": 670 + }, + { + "epoch": 0.05307109966440334, + "grad_norm": 4.0382866859436035, + "learning_rate": 1.7663891779396463e-05, + "loss": 0.905, + "step": 680 + }, + { + "epoch": 0.05385155701240927, + "grad_norm": 108.43582153320312, + "learning_rate": 1.792403746097815e-05, + "loss": 1.2111, + "step": 690 + }, + { + "epoch": 0.054632014360415204, + "grad_norm": 19.11322784423828, + "learning_rate": 1.8184183142559834e-05, + "loss": 1.1405, + "step": 700 + }, + { + "epoch": 0.05541247170842113, + "grad_norm": 5.063117504119873, + "learning_rate": 1.844432882414152e-05, + "loss": 0.779, + "step": 710 + }, + { + "epoch": 0.05619292905642707, + "grad_norm": 32.06193923950195, + "learning_rate": 1.8704474505723206e-05, + "loss": 0.9897, + "step": 720 + }, + { + "epoch": 0.056973386404432995, + "grad_norm": 8.792379379272461, + "learning_rate": 1.896462018730489e-05, + "loss": 1.2692, + "step": 730 + }, + { + "epoch": 0.05775384375243893, + "grad_norm": 47.35828399658203, + "learning_rate": 1.9224765868886577e-05, + "loss": 1.0116, + "step": 740 + }, + { + "epoch": 0.05853430110044486, + "grad_norm": 100.82672882080078, + "learning_rate": 1.948491155046826e-05, + "loss": 1.0753, + "step": 750 + }, + { + "epoch": 0.05931475844845079, + "grad_norm": 13.072049140930176, + "learning_rate": 1.974505723204995e-05, + "loss": 0.966, + "step": 760 + }, + { + "epoch": 0.06009521579645672, + "grad_norm": 11.36054801940918, + "learning_rate": 2.0005202913631636e-05, + "loss": 1.8183, + "step": 770 + }, + { + "epoch": 0.060875673144462655, + "grad_norm": 164.9627227783203, + "learning_rate": 2.026534859521332e-05, + "loss": 1.2052, + "step": 780 + }, + { + "epoch": 0.06165613049246858, + "grad_norm": 1.3306998014450073, + "learning_rate": 2.0525494276795008e-05, + "loss": 1.7305, + "step": 790 + }, + { + "epoch": 0.06243658784047452, + "grad_norm": 0.005146509036421776, + "learning_rate": 2.078563995837669e-05, + "loss": 0.3162, + "step": 800 + }, + { + "epoch": 0.06321704518848045, + "grad_norm": 192.14456176757812, + "learning_rate": 2.1045785639958376e-05, + "loss": 1.4213, + "step": 810 + }, + { + "epoch": 0.06399750253648638, + "grad_norm": 52.385902404785156, + "learning_rate": 2.1305931321540063e-05, + "loss": 0.3254, + "step": 820 + }, + { + "epoch": 0.06477795988449231, + "grad_norm": 116.30766296386719, + "learning_rate": 2.1566077003121747e-05, + "loss": 1.6886, + "step": 830 + }, + { + "epoch": 0.06555841723249825, + "grad_norm": 154.6099853515625, + "learning_rate": 2.1826222684703435e-05, + "loss": 2.9836, + "step": 840 + }, + { + "epoch": 0.06633887458050418, + "grad_norm": 118.08147430419922, + "learning_rate": 2.2086368366285122e-05, + "loss": 0.353, + "step": 850 + }, + { + "epoch": 0.0671193319285101, + "grad_norm": 1.24558687210083, + "learning_rate": 2.2346514047866806e-05, + "loss": 0.3155, + "step": 860 + }, + { + "epoch": 0.06789978927651603, + "grad_norm": 4.226469993591309, + "learning_rate": 2.2606659729448494e-05, + "loss": 0.5152, + "step": 870 + }, + { + "epoch": 0.06868024662452198, + "grad_norm": 17.129283905029297, + "learning_rate": 2.2866805411030178e-05, + "loss": 3.3197, + "step": 880 + }, + { + "epoch": 0.0694607039725279, + "grad_norm": 204.71778869628906, + "learning_rate": 2.3126951092611865e-05, + "loss": 0.6891, + "step": 890 + }, + { + "epoch": 0.07024116132053383, + "grad_norm": 16.1604061126709, + "learning_rate": 2.338709677419355e-05, + "loss": 0.9631, + "step": 900 + }, + { + "epoch": 0.07102161866853976, + "grad_norm": 33.664344787597656, + "learning_rate": 2.3647242455775233e-05, + "loss": 1.199, + "step": 910 + }, + { + "epoch": 0.0718020760165457, + "grad_norm": 122.5814437866211, + "learning_rate": 2.390738813735692e-05, + "loss": 1.1581, + "step": 920 + }, + { + "epoch": 0.07258253336455163, + "grad_norm": 4.358405902848972e-08, + "learning_rate": 2.4167533818938605e-05, + "loss": 0.6411, + "step": 930 + }, + { + "epoch": 0.07336299071255756, + "grad_norm": 103.39940643310547, + "learning_rate": 2.4427679500520292e-05, + "loss": 2.3409, + "step": 940 + }, + { + "epoch": 0.07414344806056349, + "grad_norm": 184.74298095703125, + "learning_rate": 2.468782518210198e-05, + "loss": 1.114, + "step": 950 + }, + { + "epoch": 0.07492390540856943, + "grad_norm": 4.424814224243164, + "learning_rate": 2.4947970863683664e-05, + "loss": 0.4251, + "step": 960 + }, + { + "epoch": 0.07570436275657536, + "grad_norm": 228.1300048828125, + "learning_rate": 2.5208116545265348e-05, + "loss": 2.1234, + "step": 970 + }, + { + "epoch": 0.07648482010458128, + "grad_norm": 173.34849548339844, + "learning_rate": 2.5468262226847035e-05, + "loss": 2.6774, + "step": 980 + }, + { + "epoch": 0.07726527745258721, + "grad_norm": 25.359329223632812, + "learning_rate": 2.572840790842872e-05, + "loss": 1.4619, + "step": 990 + }, + { + "epoch": 0.07804573480059315, + "grad_norm": 93.51551055908203, + "learning_rate": 2.5988553590010407e-05, + "loss": 1.7448, + "step": 1000 + }, + { + "epoch": 0.07882619214859908, + "grad_norm": 5.242687225341797, + "learning_rate": 2.624869927159209e-05, + "loss": 0.8575, + "step": 1010 + }, + { + "epoch": 0.07960664949660501, + "grad_norm": 137.5213165283203, + "learning_rate": 2.6508844953173778e-05, + "loss": 0.849, + "step": 1020 + }, + { + "epoch": 0.08038710684461094, + "grad_norm": 85.96156311035156, + "learning_rate": 2.6768990634755466e-05, + "loss": 0.5037, + "step": 1030 + }, + { + "epoch": 0.08116756419261688, + "grad_norm": 28.6999454498291, + "learning_rate": 2.702913631633715e-05, + "loss": 2.0381, + "step": 1040 + }, + { + "epoch": 0.0819480215406228, + "grad_norm": 100.8990478515625, + "learning_rate": 2.7289281997918837e-05, + "loss": 1.587, + "step": 1050 + }, + { + "epoch": 0.08272847888862873, + "grad_norm": 90.79500579833984, + "learning_rate": 2.754942767950052e-05, + "loss": 0.9584, + "step": 1060 + }, + { + "epoch": 0.08350893623663466, + "grad_norm": 5.2520341873168945, + "learning_rate": 2.780957336108221e-05, + "loss": 0.8497, + "step": 1070 + }, + { + "epoch": 0.0842893935846406, + "grad_norm": 20.94938850402832, + "learning_rate": 2.8069719042663896e-05, + "loss": 2.2675, + "step": 1080 + }, + { + "epoch": 0.08506985093264653, + "grad_norm": 142.2401123046875, + "learning_rate": 2.832986472424558e-05, + "loss": 1.5153, + "step": 1090 + }, + { + "epoch": 0.08585030828065246, + "grad_norm": 77.06143188476562, + "learning_rate": 2.8590010405827268e-05, + "loss": 1.6392, + "step": 1100 + }, + { + "epoch": 0.08663076562865839, + "grad_norm": 63.13711929321289, + "learning_rate": 2.8850156087408952e-05, + "loss": 1.902, + "step": 1110 + }, + { + "epoch": 0.08741122297666433, + "grad_norm": 34.4248161315918, + "learning_rate": 2.911030176899064e-05, + "loss": 1.384, + "step": 1120 + }, + { + "epoch": 0.08819168032467026, + "grad_norm": 0.0009433151572011411, + "learning_rate": 2.937044745057232e-05, + "loss": 1.1749, + "step": 1130 + }, + { + "epoch": 0.08897213767267619, + "grad_norm": 1.5084794759750366, + "learning_rate": 2.9630593132154004e-05, + "loss": 1.5541, + "step": 1140 + }, + { + "epoch": 0.08975259502068211, + "grad_norm": 108.27482604980469, + "learning_rate": 2.989073881373569e-05, + "loss": 1.8344, + "step": 1150 + }, + { + "epoch": 0.09053305236868806, + "grad_norm": 0.0024440279230475426, + "learning_rate": 3.015088449531738e-05, + "loss": 2.5862, + "step": 1160 + }, + { + "epoch": 0.09131350971669398, + "grad_norm": 0.9278786778450012, + "learning_rate": 3.0411030176899063e-05, + "loss": 1.0581, + "step": 1170 + }, + { + "epoch": 0.09209396706469991, + "grad_norm": 26.764270782470703, + "learning_rate": 3.067117585848075e-05, + "loss": 1.2931, + "step": 1180 + }, + { + "epoch": 0.09287442441270584, + "grad_norm": 0.8647301197052002, + "learning_rate": 3.093132154006244e-05, + "loss": 0.4697, + "step": 1190 + }, + { + "epoch": 0.09365488176071178, + "grad_norm": 127.77442169189453, + "learning_rate": 3.119146722164412e-05, + "loss": 1.0145, + "step": 1200 + }, + { + "epoch": 0.09443533910871771, + "grad_norm": 106.81623840332031, + "learning_rate": 3.1451612903225806e-05, + "loss": 3.1067, + "step": 1210 + }, + { + "epoch": 0.09521579645672364, + "grad_norm": 1.4174572229385376, + "learning_rate": 3.171175858480749e-05, + "loss": 0.9611, + "step": 1220 + }, + { + "epoch": 0.09599625380472958, + "grad_norm": 28.78997802734375, + "learning_rate": 3.197190426638918e-05, + "loss": 1.1636, + "step": 1230 + }, + { + "epoch": 0.0967767111527355, + "grad_norm": 37.20909881591797, + "learning_rate": 3.223204994797087e-05, + "loss": 1.9076, + "step": 1240 + }, + { + "epoch": 0.09755716850074143, + "grad_norm": 150.9630584716797, + "learning_rate": 3.249219562955255e-05, + "loss": 1.7947, + "step": 1250 + }, + { + "epoch": 0.09833762584874736, + "grad_norm": 33.25954818725586, + "learning_rate": 3.2752341311134236e-05, + "loss": 0.9458, + "step": 1260 + }, + { + "epoch": 0.0991180831967533, + "grad_norm": 56.61296844482422, + "learning_rate": 3.3012486992715924e-05, + "loss": 1.1717, + "step": 1270 + }, + { + "epoch": 0.09989854054475923, + "grad_norm": 0.33454135060310364, + "learning_rate": 3.327263267429761e-05, + "loss": 0.2624, + "step": 1280 + }, + { + "epoch": 0.10067899789276516, + "grad_norm": 195.7353057861328, + "learning_rate": 3.35327783558793e-05, + "loss": 3.4511, + "step": 1290 + }, + { + "epoch": 0.10145945524077109, + "grad_norm": 34.02952194213867, + "learning_rate": 3.379292403746098e-05, + "loss": 1.033, + "step": 1300 + }, + { + "epoch": 0.10223991258877703, + "grad_norm": 11.683356285095215, + "learning_rate": 3.405306971904266e-05, + "loss": 0.9921, + "step": 1310 + }, + { + "epoch": 0.10302036993678296, + "grad_norm": 5.753941059112549, + "learning_rate": 3.431321540062435e-05, + "loss": 0.5025, + "step": 1320 + }, + { + "epoch": 0.10380082728478889, + "grad_norm": 154.21438598632812, + "learning_rate": 3.4573361082206035e-05, + "loss": 1.7834, + "step": 1330 + }, + { + "epoch": 0.10458128463279481, + "grad_norm": 3.0168781250949905e-08, + "learning_rate": 3.483350676378772e-05, + "loss": 0.3439, + "step": 1340 + }, + { + "epoch": 0.10536174198080075, + "grad_norm": 187.32237243652344, + "learning_rate": 3.509365244536941e-05, + "loss": 1.1506, + "step": 1350 + }, + { + "epoch": 0.10614219932880668, + "grad_norm": 0.014487487263977528, + "learning_rate": 3.535379812695109e-05, + "loss": 1.7747, + "step": 1360 + }, + { + "epoch": 0.10692265667681261, + "grad_norm": 1.2514485661085928e-06, + "learning_rate": 3.561394380853278e-05, + "loss": 1.5893, + "step": 1370 + }, + { + "epoch": 0.10770311402481854, + "grad_norm": 0.7034757137298584, + "learning_rate": 3.5874089490114465e-05, + "loss": 1.6552, + "step": 1380 + }, + { + "epoch": 0.10848357137282448, + "grad_norm": 22.506364822387695, + "learning_rate": 3.613423517169615e-05, + "loss": 3.3548, + "step": 1390 + }, + { + "epoch": 0.10926402872083041, + "grad_norm": 0.44950780272483826, + "learning_rate": 3.639438085327784e-05, + "loss": 1.3068, + "step": 1400 + }, + { + "epoch": 0.11004448606883634, + "grad_norm": 0.26861312985420227, + "learning_rate": 3.665452653485952e-05, + "loss": 1.4875, + "step": 1410 + }, + { + "epoch": 0.11082494341684226, + "grad_norm": 10.148048400878906, + "learning_rate": 3.691467221644121e-05, + "loss": 0.6517, + "step": 1420 + }, + { + "epoch": 0.1116054007648482, + "grad_norm": 117.42066955566406, + "learning_rate": 3.7174817898022896e-05, + "loss": 2.4652, + "step": 1430 + }, + { + "epoch": 0.11238585811285413, + "grad_norm": 0.7029063701629639, + "learning_rate": 3.743496357960458e-05, + "loss": 0.4546, + "step": 1440 + }, + { + "epoch": 0.11316631546086006, + "grad_norm": 0.00036122280289418995, + "learning_rate": 3.769510926118627e-05, + "loss": 0.6733, + "step": 1450 + }, + { + "epoch": 0.11394677280886599, + "grad_norm": 2.137751579284668, + "learning_rate": 3.795525494276795e-05, + "loss": 0.6666, + "step": 1460 + }, + { + "epoch": 0.11472723015687193, + "grad_norm": 0.006048910319805145, + "learning_rate": 3.821540062434964e-05, + "loss": 1.7245, + "step": 1470 + }, + { + "epoch": 0.11550768750487786, + "grad_norm": 3.093451976776123, + "learning_rate": 3.8475546305931326e-05, + "loss": 0.8739, + "step": 1480 + }, + { + "epoch": 0.11628814485288379, + "grad_norm": 0.0005918557289987803, + "learning_rate": 3.873569198751301e-05, + "loss": 1.8093, + "step": 1490 + }, + { + "epoch": 0.11706860220088972, + "grad_norm": 116.87903594970703, + "learning_rate": 3.8995837669094694e-05, + "loss": 2.0154, + "step": 1500 + }, + { + "epoch": 0.11784905954889566, + "grad_norm": 56.92986297607422, + "learning_rate": 3.925598335067638e-05, + "loss": 1.693, + "step": 1510 + }, + { + "epoch": 0.11862951689690159, + "grad_norm": 113.77027893066406, + "learning_rate": 3.951612903225806e-05, + "loss": 1.5754, + "step": 1520 + }, + { + "epoch": 0.11940997424490751, + "grad_norm": 0.0008633786346763372, + "learning_rate": 3.977627471383975e-05, + "loss": 1.9881, + "step": 1530 + }, + { + "epoch": 0.12019043159291344, + "grad_norm": 0.01747491955757141, + "learning_rate": 4.003642039542144e-05, + "loss": 1.3658, + "step": 1540 + }, + { + "epoch": 0.12097088894091938, + "grad_norm": 0.017468832433223724, + "learning_rate": 4.0296566077003125e-05, + "loss": 0.4905, + "step": 1550 + }, + { + "epoch": 0.12175134628892531, + "grad_norm": 6.877013683319092, + "learning_rate": 4.0556711758584806e-05, + "loss": 1.3686, + "step": 1560 + }, + { + "epoch": 0.12253180363693124, + "grad_norm": 61.5954704284668, + "learning_rate": 4.081685744016649e-05, + "loss": 0.9125, + "step": 1570 + }, + { + "epoch": 0.12331226098493717, + "grad_norm": 35.28593826293945, + "learning_rate": 4.107700312174818e-05, + "loss": 0.8816, + "step": 1580 + }, + { + "epoch": 0.12409271833294311, + "grad_norm": 23.813762664794922, + "learning_rate": 4.133714880332987e-05, + "loss": 2.0649, + "step": 1590 + }, + { + "epoch": 0.12487317568094904, + "grad_norm": 1.0868879556655884, + "learning_rate": 4.1597294484911555e-05, + "loss": 0.9881, + "step": 1600 + }, + { + "epoch": 0.12565363302895496, + "grad_norm": 89.6454086303711, + "learning_rate": 4.1857440166493236e-05, + "loss": 1.8194, + "step": 1610 + }, + { + "epoch": 0.1264340903769609, + "grad_norm": 101.64916229248047, + "learning_rate": 4.2117585848074924e-05, + "loss": 0.8102, + "step": 1620 + }, + { + "epoch": 0.12721454772496682, + "grad_norm": 69.2875747680664, + "learning_rate": 4.237773152965661e-05, + "loss": 1.708, + "step": 1630 + }, + { + "epoch": 0.12799500507297276, + "grad_norm": 29.091567993164062, + "learning_rate": 4.26378772112383e-05, + "loss": 0.1439, + "step": 1640 + }, + { + "epoch": 0.1287754624209787, + "grad_norm": 128.8373565673828, + "learning_rate": 4.2898022892819986e-05, + "loss": 1.2202, + "step": 1650 + }, + { + "epoch": 0.12955591976898462, + "grad_norm": 0.0037375022657215595, + "learning_rate": 4.3158168574401667e-05, + "loss": 1.6469, + "step": 1660 + }, + { + "epoch": 0.13033637711699056, + "grad_norm": 150.75579833984375, + "learning_rate": 4.341831425598335e-05, + "loss": 1.3376, + "step": 1670 + }, + { + "epoch": 0.1311168344649965, + "grad_norm": 32.13846206665039, + "learning_rate": 4.3678459937565035e-05, + "loss": 1.0743, + "step": 1680 + }, + { + "epoch": 0.13189729181300242, + "grad_norm": 25.405044555664062, + "learning_rate": 4.393860561914672e-05, + "loss": 2.3368, + "step": 1690 + }, + { + "epoch": 0.13267774916100836, + "grad_norm": 33.23969650268555, + "learning_rate": 4.419875130072841e-05, + "loss": 1.2335, + "step": 1700 + }, + { + "epoch": 0.13345820650901427, + "grad_norm": 9.90313583315583e-06, + "learning_rate": 4.44588969823101e-05, + "loss": 2.5255, + "step": 1710 + }, + { + "epoch": 0.1342386638570202, + "grad_norm": 3.5996785163879395, + "learning_rate": 4.471904266389178e-05, + "loss": 0.5967, + "step": 1720 + }, + { + "epoch": 0.13501912120502615, + "grad_norm": 80.54622650146484, + "learning_rate": 4.4979188345473465e-05, + "loss": 1.665, + "step": 1730 + }, + { + "epoch": 0.13579957855303207, + "grad_norm": 77.36883544921875, + "learning_rate": 4.523933402705515e-05, + "loss": 0.7881, + "step": 1740 + }, + { + "epoch": 0.136580035901038, + "grad_norm": 4.339534759521484, + "learning_rate": 4.549947970863684e-05, + "loss": 1.2602, + "step": 1750 + }, + { + "epoch": 0.13736049324904395, + "grad_norm": 43.28116226196289, + "learning_rate": 4.575962539021853e-05, + "loss": 0.2827, + "step": 1760 + }, + { + "epoch": 0.13814095059704987, + "grad_norm": 95.82295227050781, + "learning_rate": 4.601977107180021e-05, + "loss": 0.8773, + "step": 1770 + }, + { + "epoch": 0.1389214079450558, + "grad_norm": 6.8486552238464355, + "learning_rate": 4.6279916753381896e-05, + "loss": 1.5294, + "step": 1780 + }, + { + "epoch": 0.13970186529306172, + "grad_norm": 1.7467870065956959e-06, + "learning_rate": 4.654006243496358e-05, + "loss": 1.06, + "step": 1790 + }, + { + "epoch": 0.14048232264106766, + "grad_norm": 9.074289321899414, + "learning_rate": 4.680020811654527e-05, + "loss": 1.2706, + "step": 1800 + }, + { + "epoch": 0.1412627799890736, + "grad_norm": 0.0031192891765385866, + "learning_rate": 4.706035379812696e-05, + "loss": 0.835, + "step": 1810 + }, + { + "epoch": 0.14204323733707952, + "grad_norm": 0.06087380275130272, + "learning_rate": 4.732049947970864e-05, + "loss": 3.8462, + "step": 1820 + }, + { + "epoch": 0.14282369468508546, + "grad_norm": 86.36807250976562, + "learning_rate": 4.7580645161290326e-05, + "loss": 1.0386, + "step": 1830 + }, + { + "epoch": 0.1436041520330914, + "grad_norm": 45.92657470703125, + "learning_rate": 4.7840790842872014e-05, + "loss": 0.6736, + "step": 1840 + }, + { + "epoch": 0.14438460938109732, + "grad_norm": 78.54730987548828, + "learning_rate": 4.81009365244537e-05, + "loss": 1.8937, + "step": 1850 + }, + { + "epoch": 0.14516506672910326, + "grad_norm": 83.15911865234375, + "learning_rate": 4.836108220603538e-05, + "loss": 0.7178, + "step": 1860 + }, + { + "epoch": 0.14594552407710917, + "grad_norm": 35.2256965637207, + "learning_rate": 4.862122788761707e-05, + "loss": 0.5309, + "step": 1870 + }, + { + "epoch": 0.14672598142511512, + "grad_norm": 138.44583129882812, + "learning_rate": 4.888137356919875e-05, + "loss": 1.6066, + "step": 1880 + }, + { + "epoch": 0.14750643877312106, + "grad_norm": 8.984199666883796e-06, + "learning_rate": 4.914151925078044e-05, + "loss": 0.1384, + "step": 1890 + }, + { + "epoch": 0.14828689612112697, + "grad_norm": 2.7456895423938477e-08, + "learning_rate": 4.9401664932362125e-05, + "loss": 2.8467, + "step": 1900 + }, + { + "epoch": 0.1490673534691329, + "grad_norm": 7.1267924308776855, + "learning_rate": 4.966181061394381e-05, + "loss": 1.0931, + "step": 1910 + }, + { + "epoch": 0.14984781081713885, + "grad_norm": 14.842751502990723, + "learning_rate": 4.992195629552549e-05, + "loss": 0.5779, + "step": 1920 + }, + { + "epoch": 0.15062826816514477, + "grad_norm": 48.106014251708984, + "learning_rate": 5.018210197710719e-05, + "loss": 1.4842, + "step": 1930 + }, + { + "epoch": 0.1514087255131507, + "grad_norm": 0.34851303696632385, + "learning_rate": 5.044224765868887e-05, + "loss": 0.4474, + "step": 1940 + }, + { + "epoch": 0.15218918286115662, + "grad_norm": 3.1243269443511963, + "learning_rate": 5.070239334027055e-05, + "loss": 1.0802, + "step": 1950 + }, + { + "epoch": 0.15296964020916257, + "grad_norm": 20.244169235229492, + "learning_rate": 5.096253902185224e-05, + "loss": 0.81, + "step": 1960 + }, + { + "epoch": 0.1537500975571685, + "grad_norm": 12.780139923095703, + "learning_rate": 5.122268470343392e-05, + "loss": 1.698, + "step": 1970 + }, + { + "epoch": 0.15453055490517442, + "grad_norm": 0.014045203104615211, + "learning_rate": 5.148283038501561e-05, + "loss": 0.3384, + "step": 1980 + }, + { + "epoch": 0.15531101225318036, + "grad_norm": 129.87998962402344, + "learning_rate": 5.174297606659729e-05, + "loss": 1.9301, + "step": 1990 + }, + { + "epoch": 0.1560914696011863, + "grad_norm": 38.74739456176758, + "learning_rate": 5.2003121748178986e-05, + "loss": 1.2539, + "step": 2000 + }, + { + "epoch": 0.15687192694919222, + "grad_norm": 0.030707743018865585, + "learning_rate": 5.2263267429760666e-05, + "loss": 1.0761, + "step": 2010 + }, + { + "epoch": 0.15765238429719816, + "grad_norm": 70.8248062133789, + "learning_rate": 5.2523413111342354e-05, + "loss": 2.2531, + "step": 2020 + }, + { + "epoch": 0.1584328416452041, + "grad_norm": 20.452491760253906, + "learning_rate": 5.2783558792924034e-05, + "loss": 0.4756, + "step": 2030 + }, + { + "epoch": 0.15921329899321002, + "grad_norm": 1.2021034955978394, + "learning_rate": 5.304370447450573e-05, + "loss": 0.714, + "step": 2040 + }, + { + "epoch": 0.15999375634121596, + "grad_norm": 0.427171915769577, + "learning_rate": 5.330385015608741e-05, + "loss": 0.4299, + "step": 2050 + }, + { + "epoch": 0.16077421368922187, + "grad_norm": 5.084196345706005e-06, + "learning_rate": 5.3563995837669104e-05, + "loss": 2.0605, + "step": 2060 + }, + { + "epoch": 0.16155467103722781, + "grad_norm": 7.578249096695799e-06, + "learning_rate": 5.3824141519250784e-05, + "loss": 0.1577, + "step": 2070 + }, + { + "epoch": 0.16233512838523376, + "grad_norm": 5.277594254948781e-07, + "learning_rate": 5.408428720083247e-05, + "loss": 0.663, + "step": 2080 + }, + { + "epoch": 0.16311558573323967, + "grad_norm": 19.192306518554688, + "learning_rate": 5.434443288241415e-05, + "loss": 0.7749, + "step": 2090 + }, + { + "epoch": 0.1638960430812456, + "grad_norm": 1.7029945809099445e-07, + "learning_rate": 5.4604578563995847e-05, + "loss": 0.6955, + "step": 2100 + }, + { + "epoch": 0.16467650042925155, + "grad_norm": 1.3037423585160468e-08, + "learning_rate": 5.486472424557753e-05, + "loss": 1.6133, + "step": 2110 + }, + { + "epoch": 0.16545695777725747, + "grad_norm": 0.6061166524887085, + "learning_rate": 5.5124869927159215e-05, + "loss": 1.9547, + "step": 2120 + }, + { + "epoch": 0.1662374151252634, + "grad_norm": 122.31124114990234, + "learning_rate": 5.5385015608740895e-05, + "loss": 0.7001, + "step": 2130 + }, + { + "epoch": 0.16701787247326932, + "grad_norm": 0.06084052473306656, + "learning_rate": 5.5645161290322576e-05, + "loss": 2.5693, + "step": 2140 + }, + { + "epoch": 0.16779832982127527, + "grad_norm": 104.42826843261719, + "learning_rate": 5.590530697190427e-05, + "loss": 1.0454, + "step": 2150 + }, + { + "epoch": 0.1685787871692812, + "grad_norm": 1.0883114337921143, + "learning_rate": 5.616545265348595e-05, + "loss": 0.872, + "step": 2160 + }, + { + "epoch": 0.16935924451728712, + "grad_norm": 34.3475341796875, + "learning_rate": 5.6425598335067645e-05, + "loss": 1.844, + "step": 2170 + }, + { + "epoch": 0.17013970186529306, + "grad_norm": 1.2302706241607666, + "learning_rate": 5.6685744016649326e-05, + "loss": 0.916, + "step": 2180 + }, + { + "epoch": 0.170920159213299, + "grad_norm": 7.886373996734619, + "learning_rate": 5.694588969823101e-05, + "loss": 2.8442, + "step": 2190 + }, + { + "epoch": 0.17170061656130492, + "grad_norm": 51.30202865600586, + "learning_rate": 5.7206035379812694e-05, + "loss": 2.1727, + "step": 2200 + }, + { + "epoch": 0.17248107390931086, + "grad_norm": 0.004836680367588997, + "learning_rate": 5.746618106139439e-05, + "loss": 0.148, + "step": 2210 + }, + { + "epoch": 0.17326153125731678, + "grad_norm": 10.632889747619629, + "learning_rate": 5.772632674297607e-05, + "loss": 1.1186, + "step": 2220 + }, + { + "epoch": 0.17404198860532272, + "grad_norm": 1.2311549186706543, + "learning_rate": 5.7986472424557756e-05, + "loss": 0.7166, + "step": 2230 + }, + { + "epoch": 0.17482244595332866, + "grad_norm": 0.0004135339695494622, + "learning_rate": 5.824661810613944e-05, + "loss": 0.5374, + "step": 2240 + }, + { + "epoch": 0.17560290330133457, + "grad_norm": 109.13738250732422, + "learning_rate": 5.850676378772113e-05, + "loss": 0.7873, + "step": 2250 + }, + { + "epoch": 0.17638336064934051, + "grad_norm": 45.3726806640625, + "learning_rate": 5.876690946930281e-05, + "loss": 2.329, + "step": 2260 + }, + { + "epoch": 0.17716381799734646, + "grad_norm": 8.300317858811468e-05, + "learning_rate": 5.9027055150884506e-05, + "loss": 0.7245, + "step": 2270 + }, + { + "epoch": 0.17794427534535237, + "grad_norm": 100.41148376464844, + "learning_rate": 5.928720083246618e-05, + "loss": 1.6251, + "step": 2280 + }, + { + "epoch": 0.1787247326933583, + "grad_norm": 137.95272827148438, + "learning_rate": 5.9547346514047874e-05, + "loss": 2.1591, + "step": 2290 + }, + { + "epoch": 0.17950519004136423, + "grad_norm": 111.72086334228516, + "learning_rate": 5.9807492195629555e-05, + "loss": 0.5804, + "step": 2300 + }, + { + "epoch": 0.18028564738937017, + "grad_norm": 32.664913177490234, + "learning_rate": 6.0067637877211236e-05, + "loss": 0.3329, + "step": 2310 + }, + { + "epoch": 0.1810661047373761, + "grad_norm": 44.84520721435547, + "learning_rate": 6.032778355879293e-05, + "loss": 1.1485, + "step": 2320 + }, + { + "epoch": 0.18184656208538202, + "grad_norm": 0.08995208144187927, + "learning_rate": 6.058792924037461e-05, + "loss": 1.7519, + "step": 2330 + }, + { + "epoch": 0.18262701943338797, + "grad_norm": 7.0876171776035335e-06, + "learning_rate": 6.08480749219563e-05, + "loss": 0.3008, + "step": 2340 + }, + { + "epoch": 0.1834074767813939, + "grad_norm": 0.00017401509103365242, + "learning_rate": 6.110822060353799e-05, + "loss": 0.0506, + "step": 2350 + }, + { + "epoch": 0.18418793412939982, + "grad_norm": 32.613162994384766, + "learning_rate": 6.136836628511967e-05, + "loss": 0.0262, + "step": 2360 + }, + { + "epoch": 0.18496839147740576, + "grad_norm": 0.001569831627421081, + "learning_rate": 6.162851196670135e-05, + "loss": 3.8222, + "step": 2370 + }, + { + "epoch": 0.18574884882541168, + "grad_norm": 135.337646484375, + "learning_rate": 6.188865764828304e-05, + "loss": 2.4808, + "step": 2380 + }, + { + "epoch": 0.18652930617341762, + "grad_norm": 13.219198226928711, + "learning_rate": 6.214880332986472e-05, + "loss": 2.1569, + "step": 2390 + }, + { + "epoch": 0.18730976352142356, + "grad_norm": 0.014933624304831028, + "learning_rate": 6.240894901144642e-05, + "loss": 0.7584, + "step": 2400 + }, + { + "epoch": 0.18809022086942948, + "grad_norm": 104.06402587890625, + "learning_rate": 6.26690946930281e-05, + "loss": 1.2859, + "step": 2410 + }, + { + "epoch": 0.18887067821743542, + "grad_norm": 0.00016139191575348377, + "learning_rate": 6.292924037460979e-05, + "loss": 0.6368, + "step": 2420 + }, + { + "epoch": 0.18965113556544136, + "grad_norm": 0.02603684924542904, + "learning_rate": 6.318938605619147e-05, + "loss": 0.9785, + "step": 2430 + }, + { + "epoch": 0.19043159291344727, + "grad_norm": 89.01065826416016, + "learning_rate": 6.344953173777317e-05, + "loss": 0.4253, + "step": 2440 + }, + { + "epoch": 0.19121205026145321, + "grad_norm": 14.965557098388672, + "learning_rate": 6.370967741935485e-05, + "loss": 2.5722, + "step": 2450 + }, + { + "epoch": 0.19199250760945916, + "grad_norm": 2.7472351575852372e-05, + "learning_rate": 6.396982310093653e-05, + "loss": 1.3484, + "step": 2460 + }, + { + "epoch": 0.19277296495746507, + "grad_norm": 62.7291259765625, + "learning_rate": 6.422996878251821e-05, + "loss": 1.2803, + "step": 2470 + }, + { + "epoch": 0.193553422305471, + "grad_norm": 23.414655685424805, + "learning_rate": 6.44901144640999e-05, + "loss": 0.7357, + "step": 2480 + }, + { + "epoch": 0.19433387965347693, + "grad_norm": 1.1676721811682e-08, + "learning_rate": 6.475026014568158e-05, + "loss": 0.8226, + "step": 2490 + }, + { + "epoch": 0.19511433700148287, + "grad_norm": 1.240859866142273, + "learning_rate": 6.501040582726326e-05, + "loss": 1.1128, + "step": 2500 + }, + { + "epoch": 0.1958947943494888, + "grad_norm": 1.6697241067886353, + "learning_rate": 6.527055150884496e-05, + "loss": 2.9435, + "step": 2510 + }, + { + "epoch": 0.19667525169749472, + "grad_norm": 33.49791717529297, + "learning_rate": 6.553069719042664e-05, + "loss": 0.71, + "step": 2520 + }, + { + "epoch": 0.19745570904550067, + "grad_norm": 4.625143051147461, + "learning_rate": 6.579084287200833e-05, + "loss": 1.4883, + "step": 2530 + }, + { + "epoch": 0.1982361663935066, + "grad_norm": 118.58780670166016, + "learning_rate": 6.605098855359001e-05, + "loss": 0.9855, + "step": 2540 + }, + { + "epoch": 0.19901662374151252, + "grad_norm": 4.585960388183594, + "learning_rate": 6.631113423517171e-05, + "loss": 0.6574, + "step": 2550 + }, + { + "epoch": 0.19979708108951846, + "grad_norm": 24.67189598083496, + "learning_rate": 6.657127991675339e-05, + "loss": 1.0292, + "step": 2560 + }, + { + "epoch": 0.20057753843752438, + "grad_norm": 0.24398376047611237, + "learning_rate": 6.683142559833507e-05, + "loss": 1.203, + "step": 2570 + }, + { + "epoch": 0.20135799578553032, + "grad_norm": 2.676201581954956, + "learning_rate": 6.709157127991675e-05, + "loss": 0.6513, + "step": 2580 + }, + { + "epoch": 0.20213845313353626, + "grad_norm": 13.57225513458252, + "learning_rate": 6.735171696149844e-05, + "loss": 1.8385, + "step": 2590 + }, + { + "epoch": 0.20291891048154218, + "grad_norm": 76.8286361694336, + "learning_rate": 6.761186264308012e-05, + "loss": 0.4815, + "step": 2600 + }, + { + "epoch": 0.20369936782954812, + "grad_norm": 71.41983032226562, + "learning_rate": 6.787200832466182e-05, + "loss": 0.3473, + "step": 2610 + }, + { + "epoch": 0.20447982517755406, + "grad_norm": 16.46891212463379, + "learning_rate": 6.81321540062435e-05, + "loss": 0.2729, + "step": 2620 + }, + { + "epoch": 0.20526028252555997, + "grad_norm": 2.7754387855529785, + "learning_rate": 6.839229968782519e-05, + "loss": 2.3419, + "step": 2630 + }, + { + "epoch": 0.20604073987356591, + "grad_norm": 92.1739501953125, + "learning_rate": 6.865244536940687e-05, + "loss": 0.2308, + "step": 2640 + }, + { + "epoch": 0.20682119722157183, + "grad_norm": 137.1291046142578, + "learning_rate": 6.891259105098857e-05, + "loss": 3.2198, + "step": 2650 + }, + { + "epoch": 0.20760165456957777, + "grad_norm": 3.064040422439575, + "learning_rate": 6.917273673257025e-05, + "loss": 2.4636, + "step": 2660 + }, + { + "epoch": 0.2083821119175837, + "grad_norm": 68.23210144042969, + "learning_rate": 6.943288241415193e-05, + "loss": 0.4497, + "step": 2670 + }, + { + "epoch": 0.20916256926558963, + "grad_norm": 8.724065992282704e-06, + "learning_rate": 6.969302809573361e-05, + "loss": 0.2542, + "step": 2680 + }, + { + "epoch": 0.20994302661359557, + "grad_norm": 0.014424404129385948, + "learning_rate": 6.995317377731529e-05, + "loss": 0.565, + "step": 2690 + }, + { + "epoch": 0.2107234839616015, + "grad_norm": 135.26565551757812, + "learning_rate": 7.021331945889699e-05, + "loss": 2.0194, + "step": 2700 + }, + { + "epoch": 0.21150394130960742, + "grad_norm": 0.00900540966540575, + "learning_rate": 7.047346514047867e-05, + "loss": 1.9152, + "step": 2710 + }, + { + "epoch": 0.21228439865761337, + "grad_norm": 39.1527214050293, + "learning_rate": 7.073361082206036e-05, + "loss": 0.5994, + "step": 2720 + }, + { + "epoch": 0.21306485600561928, + "grad_norm": 0.008746455423533916, + "learning_rate": 7.099375650364204e-05, + "loss": 1.3674, + "step": 2730 + }, + { + "epoch": 0.21384531335362522, + "grad_norm": 57.56302261352539, + "learning_rate": 7.125390218522373e-05, + "loss": 1.2222, + "step": 2740 + }, + { + "epoch": 0.21462577070163116, + "grad_norm": 49.190887451171875, + "learning_rate": 7.151404786680542e-05, + "loss": 1.031, + "step": 2750 + }, + { + "epoch": 0.21540622804963708, + "grad_norm": 124.0704116821289, + "learning_rate": 7.177419354838711e-05, + "loss": 1.7073, + "step": 2760 + }, + { + "epoch": 0.21618668539764302, + "grad_norm": 0.058635517954826355, + "learning_rate": 7.203433922996878e-05, + "loss": 1.3333, + "step": 2770 + }, + { + "epoch": 0.21696714274564896, + "grad_norm": 40.484222412109375, + "learning_rate": 7.229448491155047e-05, + "loss": 0.8355, + "step": 2780 + }, + { + "epoch": 0.21774760009365488, + "grad_norm": 3.079050064086914, + "learning_rate": 7.255463059313215e-05, + "loss": 0.838, + "step": 2790 + }, + { + "epoch": 0.21852805744166082, + "grad_norm": 2.005028247833252, + "learning_rate": 7.281477627471385e-05, + "loss": 0.7343, + "step": 2800 + }, + { + "epoch": 0.21930851478966673, + "grad_norm": 56.680877685546875, + "learning_rate": 7.307492195629553e-05, + "loss": 0.3208, + "step": 2810 + }, + { + "epoch": 0.22008897213767267, + "grad_norm": 1.1359798008925281e-05, + "learning_rate": 7.333506763787722e-05, + "loss": 2.3247, + "step": 2820 + }, + { + "epoch": 0.22086942948567861, + "grad_norm": 123.48363494873047, + "learning_rate": 7.35952133194589e-05, + "loss": 1.2051, + "step": 2830 + }, + { + "epoch": 0.22164988683368453, + "grad_norm": 0.8620667457580566, + "learning_rate": 7.38553590010406e-05, + "loss": 0.3456, + "step": 2840 + }, + { + "epoch": 0.22243034418169047, + "grad_norm": 50.265743255615234, + "learning_rate": 7.411550468262228e-05, + "loss": 1.8603, + "step": 2850 + }, + { + "epoch": 0.2232108015296964, + "grad_norm": 60.52621841430664, + "learning_rate": 7.437565036420396e-05, + "loss": 0.7977, + "step": 2860 + }, + { + "epoch": 0.22399125887770233, + "grad_norm": 17.86060905456543, + "learning_rate": 7.463579604578564e-05, + "loss": 0.6533, + "step": 2870 + }, + { + "epoch": 0.22477171622570827, + "grad_norm": 0.051353830844163895, + "learning_rate": 7.489594172736732e-05, + "loss": 0.4748, + "step": 2880 + }, + { + "epoch": 0.2255521735737142, + "grad_norm": 0.003670419566333294, + "learning_rate": 7.515608740894901e-05, + "loss": 0.8188, + "step": 2890 + }, + { + "epoch": 0.22633263092172012, + "grad_norm": 104.48554992675781, + "learning_rate": 7.54162330905307e-05, + "loss": 1.515, + "step": 2900 + }, + { + "epoch": 0.22711308826972607, + "grad_norm": 0.0348089262843132, + "learning_rate": 7.567637877211239e-05, + "loss": 2.7286, + "step": 2910 + }, + { + "epoch": 0.22789354561773198, + "grad_norm": 10.428803443908691, + "learning_rate": 7.593652445369407e-05, + "loss": 1.0998, + "step": 2920 + }, + { + "epoch": 0.22867400296573792, + "grad_norm": 8.424872398376465, + "learning_rate": 7.619667013527576e-05, + "loss": 0.6017, + "step": 2930 + }, + { + "epoch": 0.22945446031374386, + "grad_norm": 0.40675023198127747, + "learning_rate": 7.645681581685744e-05, + "loss": 0.7057, + "step": 2940 + }, + { + "epoch": 0.23023491766174978, + "grad_norm": 0.02739277482032776, + "learning_rate": 7.671696149843914e-05, + "loss": 0.7381, + "step": 2950 + }, + { + "epoch": 0.23101537500975572, + "grad_norm": 122.3631820678711, + "learning_rate": 7.697710718002082e-05, + "loss": 2.2416, + "step": 2960 + }, + { + "epoch": 0.23179583235776166, + "grad_norm": 22.099197387695312, + "learning_rate": 7.72372528616025e-05, + "loss": 0.7604, + "step": 2970 + }, + { + "epoch": 0.23257628970576757, + "grad_norm": 0.024255866184830666, + "learning_rate": 7.749739854318418e-05, + "loss": 0.7483, + "step": 2980 + }, + { + "epoch": 0.23335674705377352, + "grad_norm": 0.019524026662111282, + "learning_rate": 7.775754422476587e-05, + "loss": 0.6842, + "step": 2990 + }, + { + "epoch": 0.23413720440177943, + "grad_norm": 1.153704243250786e-09, + "learning_rate": 7.801768990634755e-05, + "loss": 0.7946, + "step": 3000 + }, + { + "epoch": 0.23491766174978537, + "grad_norm": 0.002444772282615304, + "learning_rate": 7.827783558792925e-05, + "loss": 2.4327, + "step": 3010 + }, + { + "epoch": 0.23569811909779131, + "grad_norm": 1.4735280275344849, + "learning_rate": 7.853798126951093e-05, + "loss": 1.2604, + "step": 3020 + }, + { + "epoch": 0.23647857644579723, + "grad_norm": 5.004496870242292e-06, + "learning_rate": 7.879812695109261e-05, + "loss": 0.9982, + "step": 3030 + }, + { + "epoch": 0.23725903379380317, + "grad_norm": 5.29398775100708, + "learning_rate": 7.90582726326743e-05, + "loss": 0.9924, + "step": 3040 + }, + { + "epoch": 0.2380394911418091, + "grad_norm": 98.14347076416016, + "learning_rate": 7.931841831425598e-05, + "loss": 1.2078, + "step": 3050 + }, + { + "epoch": 0.23881994848981503, + "grad_norm": 9.180928230285645, + "learning_rate": 7.957856399583768e-05, + "loss": 0.293, + "step": 3060 + }, + { + "epoch": 0.23960040583782097, + "grad_norm": 96.04532623291016, + "learning_rate": 7.983870967741936e-05, + "loss": 0.7801, + "step": 3070 + }, + { + "epoch": 0.24038086318582688, + "grad_norm": 43.307640075683594, + "learning_rate": 8.009885535900104e-05, + "loss": 1.1076, + "step": 3080 + }, + { + "epoch": 0.24116132053383282, + "grad_norm": 5.743316173553467, + "learning_rate": 8.035900104058272e-05, + "loss": 1.1714, + "step": 3090 + }, + { + "epoch": 0.24194177788183877, + "grad_norm": 0.0005356659530662, + "learning_rate": 8.061914672216442e-05, + "loss": 1.811, + "step": 3100 + }, + { + "epoch": 0.24272223522984468, + "grad_norm": 0.5350730419158936, + "learning_rate": 8.08792924037461e-05, + "loss": 0.948, + "step": 3110 + }, + { + "epoch": 0.24350269257785062, + "grad_norm": 0.001030560932122171, + "learning_rate": 8.113943808532779e-05, + "loss": 0.3189, + "step": 3120 + }, + { + "epoch": 0.24428314992585656, + "grad_norm": 129.49615478515625, + "learning_rate": 8.139958376690947e-05, + "loss": 0.9723, + "step": 3130 + }, + { + "epoch": 0.24506360727386248, + "grad_norm": 13.451752662658691, + "learning_rate": 8.165972944849117e-05, + "loss": 1.0916, + "step": 3140 + }, + { + "epoch": 0.24584406462186842, + "grad_norm": 0.06887555867433548, + "learning_rate": 8.191987513007285e-05, + "loss": 2.2815, + "step": 3150 + }, + { + "epoch": 0.24662452196987433, + "grad_norm": 18.077409744262695, + "learning_rate": 8.218002081165454e-05, + "loss": 2.1444, + "step": 3160 + }, + { + "epoch": 0.24740497931788027, + "grad_norm": 120.06288146972656, + "learning_rate": 8.244016649323622e-05, + "loss": 1.5044, + "step": 3170 + }, + { + "epoch": 0.24818543666588622, + "grad_norm": 8.321662902832031, + "learning_rate": 8.27003121748179e-05, + "loss": 0.3245, + "step": 3180 + }, + { + "epoch": 0.24896589401389213, + "grad_norm": 3.4147167205810547, + "learning_rate": 8.296045785639958e-05, + "loss": 1.4218, + "step": 3190 + }, + { + "epoch": 0.24974635136189807, + "grad_norm": 163.7802734375, + "learning_rate": 8.322060353798128e-05, + "loss": 1.3589, + "step": 3200 + }, + { + "epoch": 0.250526808709904, + "grad_norm": 0.1835886836051941, + "learning_rate": 8.348074921956296e-05, + "loss": 2.1614, + "step": 3210 + }, + { + "epoch": 0.25130726605790993, + "grad_norm": 0.03185463324189186, + "learning_rate": 8.374089490114464e-05, + "loss": 1.6368, + "step": 3220 + }, + { + "epoch": 0.25208772340591584, + "grad_norm": 45.36469268798828, + "learning_rate": 8.400104058272633e-05, + "loss": 2.1333, + "step": 3230 + }, + { + "epoch": 0.2528681807539218, + "grad_norm": 152.0260772705078, + "learning_rate": 8.426118626430801e-05, + "loss": 1.3231, + "step": 3240 + }, + { + "epoch": 0.2536486381019277, + "grad_norm": 15.69242000579834, + "learning_rate": 8.45213319458897e-05, + "loss": 1.1895, + "step": 3250 + }, + { + "epoch": 0.25442909544993364, + "grad_norm": 87.19802856445312, + "learning_rate": 8.478147762747139e-05, + "loss": 1.4486, + "step": 3260 + }, + { + "epoch": 0.2552095527979396, + "grad_norm": 51.75240707397461, + "learning_rate": 8.504162330905308e-05, + "loss": 1.4757, + "step": 3270 + }, + { + "epoch": 0.2559900101459455, + "grad_norm": 79.54395294189453, + "learning_rate": 8.530176899063476e-05, + "loss": 2.1736, + "step": 3280 + }, + { + "epoch": 0.25677046749395144, + "grad_norm": 4.835404396057129, + "learning_rate": 8.556191467221644e-05, + "loss": 0.6074, + "step": 3290 + }, + { + "epoch": 0.2575509248419574, + "grad_norm": 78.10124206542969, + "learning_rate": 8.582206035379812e-05, + "loss": 1.0351, + "step": 3300 + }, + { + "epoch": 0.2583313821899633, + "grad_norm": 8.682250022888184, + "learning_rate": 8.608220603537982e-05, + "loss": 1.1131, + "step": 3310 + }, + { + "epoch": 0.25911183953796924, + "grad_norm": 56.00176239013672, + "learning_rate": 8.63423517169615e-05, + "loss": 2.09, + "step": 3320 + }, + { + "epoch": 0.2598922968859752, + "grad_norm": 0.021717999130487442, + "learning_rate": 8.660249739854319e-05, + "loss": 1.3968, + "step": 3330 + }, + { + "epoch": 0.2606727542339811, + "grad_norm": 95.72789001464844, + "learning_rate": 8.686264308012487e-05, + "loss": 0.629, + "step": 3340 + }, + { + "epoch": 0.26145321158198703, + "grad_norm": 82.1919937133789, + "learning_rate": 8.712278876170657e-05, + "loss": 0.6342, + "step": 3350 + }, + { + "epoch": 0.262233668929993, + "grad_norm": 15.192002296447754, + "learning_rate": 8.738293444328825e-05, + "loss": 0.0908, + "step": 3360 + }, + { + "epoch": 0.2630141262779989, + "grad_norm": 176.47926330566406, + "learning_rate": 8.764308012486994e-05, + "loss": 1.3793, + "step": 3370 + }, + { + "epoch": 0.26379458362600483, + "grad_norm": 168.18618774414062, + "learning_rate": 8.790322580645162e-05, + "loss": 2.3707, + "step": 3380 + }, + { + "epoch": 0.26457504097401074, + "grad_norm": 0.0006278129876591265, + "learning_rate": 8.81633714880333e-05, + "loss": 0.6653, + "step": 3390 + }, + { + "epoch": 0.2653554983220167, + "grad_norm": 5.725263326894492e-05, + "learning_rate": 8.842351716961498e-05, + "loss": 1.7785, + "step": 3400 + }, + { + "epoch": 0.26613595567002263, + "grad_norm": 9.961698532104492, + "learning_rate": 8.868366285119667e-05, + "loss": 2.1567, + "step": 3410 + }, + { + "epoch": 0.26691641301802854, + "grad_norm": 15.461480140686035, + "learning_rate": 8.894380853277836e-05, + "loss": 1.008, + "step": 3420 + }, + { + "epoch": 0.2676968703660345, + "grad_norm": 0.003817821154370904, + "learning_rate": 8.920395421436004e-05, + "loss": 0.3742, + "step": 3430 + }, + { + "epoch": 0.2684773277140404, + "grad_norm": 4.069940567016602, + "learning_rate": 8.946409989594173e-05, + "loss": 0.3514, + "step": 3440 + }, + { + "epoch": 0.26925778506204634, + "grad_norm": 0.027782615274190903, + "learning_rate": 8.972424557752341e-05, + "loss": 1.0349, + "step": 3450 + }, + { + "epoch": 0.2700382424100523, + "grad_norm": 0.10339374095201492, + "learning_rate": 8.998439125910511e-05, + "loss": 2.5487, + "step": 3460 + }, + { + "epoch": 0.2708186997580582, + "grad_norm": 0.7649679780006409, + "learning_rate": 9.024453694068679e-05, + "loss": 2.1772, + "step": 3470 + }, + { + "epoch": 0.27159915710606414, + "grad_norm": 54.28131866455078, + "learning_rate": 9.050468262226848e-05, + "loss": 1.4012, + "step": 3480 + }, + { + "epoch": 0.2723796144540701, + "grad_norm": 112.0675277709961, + "learning_rate": 9.076482830385015e-05, + "loss": 1.2856, + "step": 3490 + }, + { + "epoch": 0.273160071802076, + "grad_norm": 0.7972148656845093, + "learning_rate": 9.102497398543185e-05, + "loss": 0.6709, + "step": 3500 + }, + { + "epoch": 0.27394052915008194, + "grad_norm": 228.42440795898438, + "learning_rate": 9.128511966701353e-05, + "loss": 1.0519, + "step": 3510 + }, + { + "epoch": 0.2747209864980879, + "grad_norm": 1.3910229199609603e-06, + "learning_rate": 9.154526534859522e-05, + "loss": 1.2227, + "step": 3520 + }, + { + "epoch": 0.2755014438460938, + "grad_norm": 137.3799591064453, + "learning_rate": 9.18054110301769e-05, + "loss": 2.221, + "step": 3530 + }, + { + "epoch": 0.27628190119409973, + "grad_norm": 52.212223052978516, + "learning_rate": 9.20655567117586e-05, + "loss": 0.5392, + "step": 3540 + }, + { + "epoch": 0.27706235854210565, + "grad_norm": 12.489707946777344, + "learning_rate": 9.232570239334028e-05, + "loss": 1.2827, + "step": 3550 + }, + { + "epoch": 0.2778428158901116, + "grad_norm": 3.4159752804185928e-09, + "learning_rate": 9.258584807492197e-05, + "loss": 0.4744, + "step": 3560 + }, + { + "epoch": 0.27862327323811753, + "grad_norm": 6.787818908691406, + "learning_rate": 9.284599375650365e-05, + "loss": 1.175, + "step": 3570 + }, + { + "epoch": 0.27940373058612344, + "grad_norm": 0.2829515039920807, + "learning_rate": 9.310613943808533e-05, + "loss": 1.7776, + "step": 3580 + }, + { + "epoch": 0.2801841879341294, + "grad_norm": 0.000505531148519367, + "learning_rate": 9.336628511966701e-05, + "loss": 0.5682, + "step": 3590 + }, + { + "epoch": 0.28096464528213533, + "grad_norm": 53.973140716552734, + "learning_rate": 9.362643080124869e-05, + "loss": 2.9599, + "step": 3600 + }, + { + "epoch": 0.28174510263014124, + "grad_norm": 115.87065887451172, + "learning_rate": 9.388657648283039e-05, + "loss": 3.8834, + "step": 3610 + }, + { + "epoch": 0.2825255599781472, + "grad_norm": 15.08074951171875, + "learning_rate": 9.414672216441207e-05, + "loss": 0.6171, + "step": 3620 + }, + { + "epoch": 0.2833060173261531, + "grad_norm": 0.0419006384909153, + "learning_rate": 9.440686784599376e-05, + "loss": 2.639, + "step": 3630 + }, + { + "epoch": 0.28408647467415904, + "grad_norm": 1.0769245818664785e-05, + "learning_rate": 9.466701352757544e-05, + "loss": 1.3622, + "step": 3640 + }, + { + "epoch": 0.284866932022165, + "grad_norm": 45.47175979614258, + "learning_rate": 9.492715920915714e-05, + "loss": 0.6441, + "step": 3650 + }, + { + "epoch": 0.2856473893701709, + "grad_norm": 0.3931857943534851, + "learning_rate": 9.518730489073882e-05, + "loss": 1.6816, + "step": 3660 + }, + { + "epoch": 0.28642784671817684, + "grad_norm": 63.66070556640625, + "learning_rate": 9.544745057232051e-05, + "loss": 1.3548, + "step": 3670 + }, + { + "epoch": 0.2872083040661828, + "grad_norm": 12.380095481872559, + "learning_rate": 9.570759625390219e-05, + "loss": 0.5438, + "step": 3680 + }, + { + "epoch": 0.2879887614141887, + "grad_norm": 8.507214546203613, + "learning_rate": 9.596774193548387e-05, + "loss": 0.7701, + "step": 3690 + }, + { + "epoch": 0.28876921876219463, + "grad_norm": 0.00235476135276258, + "learning_rate": 9.622788761706555e-05, + "loss": 0.0634, + "step": 3700 + }, + { + "epoch": 0.2895496761102006, + "grad_norm": 0.45816951990127563, + "learning_rate": 9.648803329864725e-05, + "loss": 0.9807, + "step": 3710 + }, + { + "epoch": 0.2903301334582065, + "grad_norm": 0.003578943433240056, + "learning_rate": 9.674817898022893e-05, + "loss": 1.536, + "step": 3720 + }, + { + "epoch": 0.29111059080621243, + "grad_norm": 114.68425750732422, + "learning_rate": 9.700832466181062e-05, + "loss": 1.1759, + "step": 3730 + }, + { + "epoch": 0.29189104815421835, + "grad_norm": 28.14389419555664, + "learning_rate": 9.72684703433923e-05, + "loss": 2.6994, + "step": 3740 + }, + { + "epoch": 0.2926715055022243, + "grad_norm": 71.8919677734375, + "learning_rate": 9.7528616024974e-05, + "loss": 1.5178, + "step": 3750 + }, + { + "epoch": 0.29345196285023023, + "grad_norm": 56.985450744628906, + "learning_rate": 9.778876170655568e-05, + "loss": 0.657, + "step": 3760 + }, + { + "epoch": 0.29423242019823614, + "grad_norm": 49.4119758605957, + "learning_rate": 9.804890738813736e-05, + "loss": 0.7112, + "step": 3770 + }, + { + "epoch": 0.2950128775462421, + "grad_norm": 23.014514923095703, + "learning_rate": 9.830905306971905e-05, + "loss": 0.857, + "step": 3780 + }, + { + "epoch": 0.29579333489424803, + "grad_norm": 32.53450012207031, + "learning_rate": 9.856919875130073e-05, + "loss": 0.7348, + "step": 3790 + }, + { + "epoch": 0.29657379224225394, + "grad_norm": 59.46930694580078, + "learning_rate": 9.882934443288241e-05, + "loss": 2.8634, + "step": 3800 + }, + { + "epoch": 0.2973542495902599, + "grad_norm": 15.2013521194458, + "learning_rate": 9.90894901144641e-05, + "loss": 0.5304, + "step": 3810 + }, + { + "epoch": 0.2981347069382658, + "grad_norm": 11.043535232543945, + "learning_rate": 9.934963579604579e-05, + "loss": 0.7362, + "step": 3820 + }, + { + "epoch": 0.29891516428627174, + "grad_norm": 0.4539059102535248, + "learning_rate": 9.960978147762747e-05, + "loss": 1.841, + "step": 3830 + }, + { + "epoch": 0.2996956216342777, + "grad_norm": 0.08967360109090805, + "learning_rate": 9.986992715920916e-05, + "loss": 0.451, + "step": 3840 + }, + { + "epoch": 0.3004760789822836, + "grad_norm": 9.775715827941895, + "learning_rate": 9.99999948458971e-05, + "loss": 0.0336, + "step": 3850 + }, + { + "epoch": 0.30125653633028954, + "grad_norm": 0.0003560681361705065, + "learning_rate": 9.999995361308023e-05, + "loss": 6.5222, + "step": 3860 + }, + { + "epoch": 0.3020369936782955, + "grad_norm": 74.62921905517578, + "learning_rate": 9.999987114748048e-05, + "loss": 1.7191, + "step": 3870 + }, + { + "epoch": 0.3028174510263014, + "grad_norm": 0.004861526656895876, + "learning_rate": 9.999974744916586e-05, + "loss": 0.661, + "step": 3880 + }, + { + "epoch": 0.30359790837430733, + "grad_norm": 20.33612823486328, + "learning_rate": 9.999958251823837e-05, + "loss": 0.6154, + "step": 3890 + }, + { + "epoch": 0.30437836572231325, + "grad_norm": 3.1527907848358154, + "learning_rate": 9.999937635483406e-05, + "loss": 1.4707, + "step": 3900 + }, + { + "epoch": 0.3051588230703192, + "grad_norm": 70.3372802734375, + "learning_rate": 9.999912895912291e-05, + "loss": 1.3573, + "step": 3910 + }, + { + "epoch": 0.30593928041832513, + "grad_norm": 0.2236901819705963, + "learning_rate": 9.999884033130893e-05, + "loss": 1.436, + "step": 3920 + }, + { + "epoch": 0.30671973776633105, + "grad_norm": 0.030305564403533936, + "learning_rate": 9.999851047163016e-05, + "loss": 0.4924, + "step": 3930 + }, + { + "epoch": 0.307500195114337, + "grad_norm": 5.503699753717228e-07, + "learning_rate": 9.999813938035863e-05, + "loss": 1.1959, + "step": 3940 + }, + { + "epoch": 0.30828065246234293, + "grad_norm": 50.966861724853516, + "learning_rate": 9.999772705780032e-05, + "loss": 0.5705, + "step": 3950 + }, + { + "epoch": 0.30906110981034884, + "grad_norm": 0.027110936120152473, + "learning_rate": 9.999727350429529e-05, + "loss": 0.9733, + "step": 3960 + }, + { + "epoch": 0.3098415671583548, + "grad_norm": 6.961010456085205, + "learning_rate": 9.999677872021755e-05, + "loss": 0.5901, + "step": 3970 + }, + { + "epoch": 0.3106220245063607, + "grad_norm": 13.56571102142334, + "learning_rate": 9.999624270597515e-05, + "loss": 0.1543, + "step": 3980 + }, + { + "epoch": 0.31140248185436664, + "grad_norm": 44.97492980957031, + "learning_rate": 9.999566546201008e-05, + "loss": 1.1215, + "step": 3990 + }, + { + "epoch": 0.3121829392023726, + "grad_norm": 20.070308685302734, + "learning_rate": 9.999504698879838e-05, + "loss": 1.4225, + "step": 4000 + }, + { + "epoch": 0.3129633965503785, + "grad_norm": 1.784777283668518, + "learning_rate": 9.999438728685011e-05, + "loss": 0.4262, + "step": 4010 + }, + { + "epoch": 0.31374385389838444, + "grad_norm": 6.901144504547119, + "learning_rate": 9.999368635670928e-05, + "loss": 0.6186, + "step": 4020 + }, + { + "epoch": 0.3145243112463904, + "grad_norm": 4.161862373352051, + "learning_rate": 9.999294419895389e-05, + "loss": 1.6664, + "step": 4030 + }, + { + "epoch": 0.3153047685943963, + "grad_norm": 0.14534419775009155, + "learning_rate": 9.999216081419599e-05, + "loss": 1.4358, + "step": 4040 + }, + { + "epoch": 0.31608522594240224, + "grad_norm": 8.709707260131836, + "learning_rate": 9.999133620308161e-05, + "loss": 1.2329, + "step": 4050 + }, + { + "epoch": 0.3168656832904082, + "grad_norm": 59.18502426147461, + "learning_rate": 9.999047036629074e-05, + "loss": 1.7152, + "step": 4060 + }, + { + "epoch": 0.3176461406384141, + "grad_norm": 6.609310626983643, + "learning_rate": 9.998956330453744e-05, + "loss": 1.5284, + "step": 4070 + }, + { + "epoch": 0.31842659798642003, + "grad_norm": 34.27482604980469, + "learning_rate": 9.998861501856969e-05, + "loss": 0.7867, + "step": 4080 + }, + { + "epoch": 0.31920705533442595, + "grad_norm": 0.22041009366512299, + "learning_rate": 9.99876255091695e-05, + "loss": 1.6204, + "step": 4090 + }, + { + "epoch": 0.3199875126824319, + "grad_norm": 71.2801742553711, + "learning_rate": 9.99865947771529e-05, + "loss": 1.2527, + "step": 4100 + }, + { + "epoch": 0.32076797003043783, + "grad_norm": 29.67563819885254, + "learning_rate": 9.998552282336988e-05, + "loss": 1.1256, + "step": 4110 + }, + { + "epoch": 0.32154842737844375, + "grad_norm": 39.03765869140625, + "learning_rate": 9.998440964870443e-05, + "loss": 1.2212, + "step": 4120 + }, + { + "epoch": 0.3223288847264497, + "grad_norm": 75.92422485351562, + "learning_rate": 9.998325525407453e-05, + "loss": 1.7344, + "step": 4130 + }, + { + "epoch": 0.32310934207445563, + "grad_norm": 11.299295425415039, + "learning_rate": 9.998205964043217e-05, + "loss": 1.7633, + "step": 4140 + }, + { + "epoch": 0.32388979942246154, + "grad_norm": 0.8371900916099548, + "learning_rate": 9.99808228087633e-05, + "loss": 1.8775, + "step": 4150 + }, + { + "epoch": 0.3246702567704675, + "grad_norm": 0.0006445566541515291, + "learning_rate": 9.997954476008792e-05, + "loss": 0.9025, + "step": 4160 + }, + { + "epoch": 0.3254507141184734, + "grad_norm": 0.00038227959885261953, + "learning_rate": 9.997822549545995e-05, + "loss": 0.2794, + "step": 4170 + }, + { + "epoch": 0.32623117146647934, + "grad_norm": 4.977309799869545e-06, + "learning_rate": 9.997686501596735e-05, + "loss": 1.2245, + "step": 4180 + }, + { + "epoch": 0.3270116288144853, + "grad_norm": 2.4554672241210938, + "learning_rate": 9.997546332273202e-05, + "loss": 0.1281, + "step": 4190 + }, + { + "epoch": 0.3277920861624912, + "grad_norm": 1.094784853050701e-11, + "learning_rate": 9.99740204169099e-05, + "loss": 0.9423, + "step": 4200 + }, + { + "epoch": 0.32857254351049714, + "grad_norm": 59.58600616455078, + "learning_rate": 9.997253629969089e-05, + "loss": 6.1129, + "step": 4210 + }, + { + "epoch": 0.3293530008585031, + "grad_norm": 41.99625778198242, + "learning_rate": 9.997101097229886e-05, + "loss": 0.6757, + "step": 4220 + }, + { + "epoch": 0.330133458206509, + "grad_norm": 18.46398162841797, + "learning_rate": 9.99694444359917e-05, + "loss": 1.9334, + "step": 4230 + }, + { + "epoch": 0.33091391555451494, + "grad_norm": 0.34535104036331177, + "learning_rate": 9.996783669206124e-05, + "loss": 0.0171, + "step": 4240 + }, + { + "epoch": 0.33169437290252085, + "grad_norm": 70.5401840209961, + "learning_rate": 9.996618774183335e-05, + "loss": 1.538, + "step": 4250 + }, + { + "epoch": 0.3324748302505268, + "grad_norm": 0.9419988989830017, + "learning_rate": 9.996449758666784e-05, + "loss": 1.0131, + "step": 4260 + }, + { + "epoch": 0.33325528759853273, + "grad_norm": 44.18296813964844, + "learning_rate": 9.996276622795847e-05, + "loss": 1.2465, + "step": 4270 + }, + { + "epoch": 0.33403574494653865, + "grad_norm": 48.11833190917969, + "learning_rate": 9.996099366713306e-05, + "loss": 1.1763, + "step": 4280 + }, + { + "epoch": 0.3348162022945446, + "grad_norm": 2.812107563018799, + "learning_rate": 9.995917990565333e-05, + "loss": 1.1437, + "step": 4290 + }, + { + "epoch": 0.33559665964255053, + "grad_norm": 14.357457160949707, + "learning_rate": 9.995732494501505e-05, + "loss": 1.9292, + "step": 4300 + }, + { + "epoch": 0.33637711699055645, + "grad_norm": 25.811626434326172, + "learning_rate": 9.995542878674789e-05, + "loss": 0.8213, + "step": 4310 + }, + { + "epoch": 0.3371575743385624, + "grad_norm": 55.16279602050781, + "learning_rate": 9.995349143241552e-05, + "loss": 1.6721, + "step": 4320 + }, + { + "epoch": 0.33793803168656833, + "grad_norm": 0.0017165453173220158, + "learning_rate": 9.995151288361564e-05, + "loss": 0.5614, + "step": 4330 + }, + { + "epoch": 0.33871848903457424, + "grad_norm": 0.8089917898178101, + "learning_rate": 9.994949314197985e-05, + "loss": 0.4937, + "step": 4340 + }, + { + "epoch": 0.3394989463825802, + "grad_norm": 6.39279317855835, + "learning_rate": 9.994743220917372e-05, + "loss": 1.6623, + "step": 4350 + }, + { + "epoch": 0.3402794037305861, + "grad_norm": 13.513890266418457, + "learning_rate": 9.994533008689684e-05, + "loss": 2.12, + "step": 4360 + }, + { + "epoch": 0.34105986107859204, + "grad_norm": 2.0500733852386475, + "learning_rate": 9.994318677688273e-05, + "loss": 0.3837, + "step": 4370 + }, + { + "epoch": 0.341840318426598, + "grad_norm": 40.21711349487305, + "learning_rate": 9.994100228089888e-05, + "loss": 1.4057, + "step": 4380 + }, + { + "epoch": 0.3426207757746039, + "grad_norm": 59.82537078857422, + "learning_rate": 9.993877660074675e-05, + "loss": 0.922, + "step": 4390 + }, + { + "epoch": 0.34340123312260984, + "grad_norm": 30.938413619995117, + "learning_rate": 9.993650973826176e-05, + "loss": 1.4275, + "step": 4400 + }, + { + "epoch": 0.34418169047061575, + "grad_norm": 0.035656414926052094, + "learning_rate": 9.99342016953133e-05, + "loss": 0.4097, + "step": 4410 + }, + { + "epoch": 0.3449621478186217, + "grad_norm": 9.82229232788086, + "learning_rate": 9.993185247380471e-05, + "loss": 0.7342, + "step": 4420 + }, + { + "epoch": 0.34574260516662764, + "grad_norm": 8.707996368408203, + "learning_rate": 9.992946207567328e-05, + "loss": 0.6301, + "step": 4430 + }, + { + "epoch": 0.34652306251463355, + "grad_norm": 6.594290733337402, + "learning_rate": 9.99270305028903e-05, + "loss": 0.9255, + "step": 4440 + }, + { + "epoch": 0.3473035198626395, + "grad_norm": 55.64490509033203, + "learning_rate": 9.992455775746092e-05, + "loss": 2.7586, + "step": 4450 + }, + { + "epoch": 0.34808397721064543, + "grad_norm": 18.06779670715332, + "learning_rate": 9.992204384142438e-05, + "loss": 0.9131, + "step": 4460 + }, + { + "epoch": 0.34886443455865135, + "grad_norm": 5.652497291564941, + "learning_rate": 9.991948875685375e-05, + "loss": 0.7655, + "step": 4470 + }, + { + "epoch": 0.3496448919066573, + "grad_norm": 48.18488693237305, + "learning_rate": 9.99168925058561e-05, + "loss": 0.9117, + "step": 4480 + }, + { + "epoch": 0.35042534925466323, + "grad_norm": 25.798969268798828, + "learning_rate": 9.991425509057245e-05, + "loss": 1.0564, + "step": 4490 + }, + { + "epoch": 0.35120580660266915, + "grad_norm": 1.1699587106704712, + "learning_rate": 9.991157651317777e-05, + "loss": 0.3099, + "step": 4500 + }, + { + "epoch": 0.3519862639506751, + "grad_norm": 44.54468536376953, + "learning_rate": 9.990885677588096e-05, + "loss": 0.9814, + "step": 4510 + }, + { + "epoch": 0.35276672129868103, + "grad_norm": 28.770235061645508, + "learning_rate": 9.990609588092488e-05, + "loss": 0.9127, + "step": 4520 + }, + { + "epoch": 0.35354717864668694, + "grad_norm": 3.9841158390045166, + "learning_rate": 9.990329383058628e-05, + "loss": 2.0077, + "step": 4530 + }, + { + "epoch": 0.3543276359946929, + "grad_norm": 32.356719970703125, + "learning_rate": 9.990045062717594e-05, + "loss": 1.0274, + "step": 4540 + }, + { + "epoch": 0.3551080933426988, + "grad_norm": 1.2889198064804077, + "learning_rate": 9.98975662730385e-05, + "loss": 0.7582, + "step": 4550 + }, + { + "epoch": 0.35588855069070474, + "grad_norm": 49.0152473449707, + "learning_rate": 9.989464077055257e-05, + "loss": 0.5338, + "step": 4560 + }, + { + "epoch": 0.3566690080387107, + "grad_norm": 3.9910778999328613, + "learning_rate": 9.989167412213067e-05, + "loss": 1.8175, + "step": 4570 + }, + { + "epoch": 0.3574494653867166, + "grad_norm": 11.524066925048828, + "learning_rate": 9.988866633021928e-05, + "loss": 0.5534, + "step": 4580 + }, + { + "epoch": 0.35822992273472254, + "grad_norm": 76.08323669433594, + "learning_rate": 9.988561739729878e-05, + "loss": 0.8186, + "step": 4590 + }, + { + "epoch": 0.35901038008272845, + "grad_norm": 15.860508918762207, + "learning_rate": 9.988252732588351e-05, + "loss": 0.3079, + "step": 4600 + }, + { + "epoch": 0.3597908374307344, + "grad_norm": 0.6550878882408142, + "learning_rate": 9.98793961185217e-05, + "loss": 0.6749, + "step": 4610 + }, + { + "epoch": 0.36057129477874034, + "grad_norm": 11.891127586364746, + "learning_rate": 9.987622377779555e-05, + "loss": 0.8806, + "step": 4620 + }, + { + "epoch": 0.36135175212674625, + "grad_norm": 0.2516235113143921, + "learning_rate": 9.98730103063211e-05, + "loss": 1.2591, + "step": 4630 + }, + { + "epoch": 0.3621322094747522, + "grad_norm": 0.00010328528878744692, + "learning_rate": 9.986975570674842e-05, + "loss": 0.9852, + "step": 4640 + }, + { + "epoch": 0.36291266682275813, + "grad_norm": 2.1215724732570607e-09, + "learning_rate": 9.986645998176139e-05, + "loss": 0.4795, + "step": 4650 + }, + { + "epoch": 0.36369312417076405, + "grad_norm": 17.579479217529297, + "learning_rate": 9.986312313407787e-05, + "loss": 0.6918, + "step": 4660 + }, + { + "epoch": 0.36447358151877, + "grad_norm": 66.99678802490234, + "learning_rate": 9.985974516644962e-05, + "loss": 0.4536, + "step": 4670 + }, + { + "epoch": 0.36525403886677593, + "grad_norm": 0.025410788133740425, + "learning_rate": 9.985632608166228e-05, + "loss": 0.5175, + "step": 4680 + }, + { + "epoch": 0.36603449621478185, + "grad_norm": 49.9116325378418, + "learning_rate": 9.985286588253544e-05, + "loss": 1.2667, + "step": 4690 + }, + { + "epoch": 0.3668149535627878, + "grad_norm": 1.777963638305664, + "learning_rate": 9.984936457192256e-05, + "loss": 0.7312, + "step": 4700 + }, + { + "epoch": 0.36759541091079373, + "grad_norm": 0.00023074759519658983, + "learning_rate": 9.984582215271103e-05, + "loss": 0.1217, + "step": 4710 + }, + { + "epoch": 0.36837586825879964, + "grad_norm": 0.7179341316223145, + "learning_rate": 9.984223862782214e-05, + "loss": 3.252, + "step": 4720 + }, + { + "epoch": 0.3691563256068056, + "grad_norm": 2.1000545530114323e-05, + "learning_rate": 9.983861400021104e-05, + "loss": 0.9837, + "step": 4730 + }, + { + "epoch": 0.3699367829548115, + "grad_norm": 0.0001699527056189254, + "learning_rate": 9.983494827286681e-05, + "loss": 0.2548, + "step": 4740 + }, + { + "epoch": 0.37071724030281744, + "grad_norm": 64.94124603271484, + "learning_rate": 9.983124144881244e-05, + "loss": 1.004, + "step": 4750 + }, + { + "epoch": 0.37149769765082336, + "grad_norm": 4.1760454177856445, + "learning_rate": 9.982749353110474e-05, + "loss": 0.4211, + "step": 4760 + }, + { + "epoch": 0.3722781549988293, + "grad_norm": 1.8572329281596467e-05, + "learning_rate": 9.98237045228345e-05, + "loss": 1.5099, + "step": 4770 + }, + { + "epoch": 0.37305861234683524, + "grad_norm": 3.898281875081011e-07, + "learning_rate": 9.981987442712633e-05, + "loss": 0.9033, + "step": 4780 + }, + { + "epoch": 0.37383906969484115, + "grad_norm": 0.23063763976097107, + "learning_rate": 9.981600324713873e-05, + "loss": 1.7242, + "step": 4790 + }, + { + "epoch": 0.3746195270428471, + "grad_norm": 0.015200862661004066, + "learning_rate": 9.981209098606413e-05, + "loss": 0.8823, + "step": 4800 + }, + { + "epoch": 0.37539998439085304, + "grad_norm": 0.010344978421926498, + "learning_rate": 9.980813764712878e-05, + "loss": 2.0015, + "step": 4810 + }, + { + "epoch": 0.37618044173885895, + "grad_norm": 73.27796173095703, + "learning_rate": 9.98041432335928e-05, + "loss": 0.8186, + "step": 4820 + }, + { + "epoch": 0.3769608990868649, + "grad_norm": 0.0006082833278924227, + "learning_rate": 9.980010774875024e-05, + "loss": 0.3812, + "step": 4830 + }, + { + "epoch": 0.37774135643487083, + "grad_norm": 3.770277908188291e-05, + "learning_rate": 9.9796031195929e-05, + "loss": 1.0389, + "step": 4840 + }, + { + "epoch": 0.37852181378287675, + "grad_norm": 104.53764343261719, + "learning_rate": 9.97919135784908e-05, + "loss": 1.5568, + "step": 4850 + }, + { + "epoch": 0.3793022711308827, + "grad_norm": 3.955908045583101e-09, + "learning_rate": 9.97877548998313e-05, + "loss": 0.7482, + "step": 4860 + }, + { + "epoch": 0.38008272847888863, + "grad_norm": 1.5290652513504028, + "learning_rate": 9.978355516337994e-05, + "loss": 1.2799, + "step": 4870 + }, + { + "epoch": 0.38086318582689455, + "grad_norm": 60.555572509765625, + "learning_rate": 9.977931437260009e-05, + "loss": 0.9653, + "step": 4880 + }, + { + "epoch": 0.3816436431749005, + "grad_norm": 2.333822250366211, + "learning_rate": 9.977503253098892e-05, + "loss": 1.9994, + "step": 4890 + }, + { + "epoch": 0.38242410052290643, + "grad_norm": 0.04746650159358978, + "learning_rate": 9.97707096420775e-05, + "loss": 0.0984, + "step": 4900 + }, + { + "epoch": 0.38320455787091234, + "grad_norm": 0.16147121787071228, + "learning_rate": 9.976634570943071e-05, + "loss": 1.2237, + "step": 4910 + }, + { + "epoch": 0.3839850152189183, + "grad_norm": 0.08274467289447784, + "learning_rate": 9.976194073664732e-05, + "loss": 2.1904, + "step": 4920 + }, + { + "epoch": 0.3847654725669242, + "grad_norm": 82.22650146484375, + "learning_rate": 9.97574947273599e-05, + "loss": 0.466, + "step": 4930 + }, + { + "epoch": 0.38554592991493014, + "grad_norm": 24.859355926513672, + "learning_rate": 9.975300768523487e-05, + "loss": 0.3493, + "step": 4940 + }, + { + "epoch": 0.38632638726293606, + "grad_norm": 1.2780405282974243, + "learning_rate": 9.974847961397253e-05, + "loss": 2.8073, + "step": 4950 + }, + { + "epoch": 0.387106844610942, + "grad_norm": 0.04860816150903702, + "learning_rate": 9.974391051730696e-05, + "loss": 0.5481, + "step": 4960 + }, + { + "epoch": 0.38788730195894794, + "grad_norm": 3.8242519622144755e-06, + "learning_rate": 9.97393003990061e-05, + "loss": 1.5707, + "step": 4970 + }, + { + "epoch": 0.38866775930695385, + "grad_norm": 2.876926898956299, + "learning_rate": 9.973464926287173e-05, + "loss": 0.6285, + "step": 4980 + }, + { + "epoch": 0.3894482166549598, + "grad_norm": 86.59673309326172, + "learning_rate": 9.97299571127394e-05, + "loss": 1.2899, + "step": 4990 + }, + { + "epoch": 0.39022867400296574, + "grad_norm": 2.2732436656951904, + "learning_rate": 9.972522395247855e-05, + "loss": 0.898, + "step": 5000 + }, + { + "epoch": 0.39100913135097165, + "grad_norm": 0.33496013283729553, + "learning_rate": 9.972044978599241e-05, + "loss": 0.3816, + "step": 5010 + }, + { + "epoch": 0.3917895886989776, + "grad_norm": 46.32811737060547, + "learning_rate": 9.971563461721802e-05, + "loss": 2.3764, + "step": 5020 + }, + { + "epoch": 0.39257004604698353, + "grad_norm": 81.04834747314453, + "learning_rate": 9.971077845012625e-05, + "loss": 1.0033, + "step": 5030 + }, + { + "epoch": 0.39335050339498945, + "grad_norm": 73.85846710205078, + "learning_rate": 9.970588128872176e-05, + "loss": 0.4376, + "step": 5040 + }, + { + "epoch": 0.3941309607429954, + "grad_norm": 8.540753697161563e-06, + "learning_rate": 9.970094313704303e-05, + "loss": 3.2187, + "step": 5050 + }, + { + "epoch": 0.39491141809100133, + "grad_norm": 2.9623231887817383, + "learning_rate": 9.969596399916233e-05, + "loss": 0.7573, + "step": 5060 + }, + { + "epoch": 0.39569187543900725, + "grad_norm": 58.8137321472168, + "learning_rate": 9.969094387918574e-05, + "loss": 0.5387, + "step": 5070 + }, + { + "epoch": 0.3964723327870132, + "grad_norm": 52.218223571777344, + "learning_rate": 9.968588278125315e-05, + "loss": 3.7089, + "step": 5080 + }, + { + "epoch": 0.39725279013501913, + "grad_norm": 3.2659690380096436, + "learning_rate": 9.968078070953822e-05, + "loss": 0.3519, + "step": 5090 + }, + { + "epoch": 0.39803324748302504, + "grad_norm": 0.0405220165848732, + "learning_rate": 9.967563766824839e-05, + "loss": 0.6423, + "step": 5100 + }, + { + "epoch": 0.39881370483103096, + "grad_norm": 77.0758056640625, + "learning_rate": 9.967045366162491e-05, + "loss": 1.3186, + "step": 5110 + }, + { + "epoch": 0.3995941621790369, + "grad_norm": 0.002084170002490282, + "learning_rate": 9.966522869394281e-05, + "loss": 1.0188, + "step": 5120 + }, + { + "epoch": 0.40037461952704284, + "grad_norm": 20.08683204650879, + "learning_rate": 9.96599627695109e-05, + "loss": 2.0177, + "step": 5130 + }, + { + "epoch": 0.40115507687504876, + "grad_norm": 4.911335418000817e-05, + "learning_rate": 9.965465589267175e-05, + "loss": 0.4571, + "step": 5140 + }, + { + "epoch": 0.4019355342230547, + "grad_norm": 0.00017713586566969752, + "learning_rate": 9.96493080678017e-05, + "loss": 0.5754, + "step": 5150 + }, + { + "epoch": 0.40271599157106064, + "grad_norm": 8.778138160705566, + "learning_rate": 9.964391929931089e-05, + "loss": 0.2, + "step": 5160 + }, + { + "epoch": 0.40349644891906655, + "grad_norm": 7.931397205673818e-10, + "learning_rate": 9.96384895916432e-05, + "loss": 1.9417, + "step": 5170 + }, + { + "epoch": 0.4042769062670725, + "grad_norm": 46.630882263183594, + "learning_rate": 9.963301894927623e-05, + "loss": 1.6278, + "step": 5180 + }, + { + "epoch": 0.40505736361507844, + "grad_norm": 0.9577981233596802, + "learning_rate": 9.962750737672143e-05, + "loss": 1.0923, + "step": 5190 + }, + { + "epoch": 0.40583782096308435, + "grad_norm": 10.732783317565918, + "learning_rate": 9.962195487852395e-05, + "loss": 0.7055, + "step": 5200 + }, + { + "epoch": 0.4066182783110903, + "grad_norm": 9.7693775291674e-10, + "learning_rate": 9.961636145926267e-05, + "loss": 0.0712, + "step": 5210 + }, + { + "epoch": 0.40739873565909623, + "grad_norm": 25.384599685668945, + "learning_rate": 9.961072712355026e-05, + "loss": 0.6719, + "step": 5220 + }, + { + "epoch": 0.40817919300710215, + "grad_norm": 88.48016357421875, + "learning_rate": 9.960505187603308e-05, + "loss": 0.5012, + "step": 5230 + }, + { + "epoch": 0.4089596503551081, + "grad_norm": 3.6306564421551e-09, + "learning_rate": 9.959933572139131e-05, + "loss": 4.2087, + "step": 5240 + }, + { + "epoch": 0.40974010770311403, + "grad_norm": 2.399672031402588, + "learning_rate": 9.959357866433876e-05, + "loss": 0.4929, + "step": 5250 + }, + { + "epoch": 0.41052056505111995, + "grad_norm": 14.656164169311523, + "learning_rate": 9.958778070962307e-05, + "loss": 1.2333, + "step": 5260 + }, + { + "epoch": 0.4113010223991259, + "grad_norm": 2.2186574935913086, + "learning_rate": 9.958194186202551e-05, + "loss": 2.2942, + "step": 5270 + }, + { + "epoch": 0.41208147974713183, + "grad_norm": 2.9925684928894043, + "learning_rate": 9.957606212636117e-05, + "loss": 0.7031, + "step": 5280 + }, + { + "epoch": 0.41286193709513774, + "grad_norm": 81.13506317138672, + "learning_rate": 9.95701415074788e-05, + "loss": 1.2966, + "step": 5290 + }, + { + "epoch": 0.41364239444314366, + "grad_norm": 0.000894436496309936, + "learning_rate": 9.956418001026087e-05, + "loss": 0.2619, + "step": 5300 + }, + { + "epoch": 0.4144228517911496, + "grad_norm": 2.2047314643859863, + "learning_rate": 9.955817763962356e-05, + "loss": 0.7434, + "step": 5310 + }, + { + "epoch": 0.41520330913915554, + "grad_norm": 0.00014062505215406418, + "learning_rate": 9.955213440051676e-05, + "loss": 0.0787, + "step": 5320 + }, + { + "epoch": 0.41598376648716145, + "grad_norm": 7.172604000516003e-06, + "learning_rate": 9.95460502979241e-05, + "loss": 2.4076, + "step": 5330 + }, + { + "epoch": 0.4167642238351674, + "grad_norm": 1.4012911319732666, + "learning_rate": 9.953992533686282e-05, + "loss": 0.7168, + "step": 5340 + }, + { + "epoch": 0.41754468118317334, + "grad_norm": 6.324690818786621, + "learning_rate": 9.953375952238398e-05, + "loss": 2.9089, + "step": 5350 + }, + { + "epoch": 0.41832513853117925, + "grad_norm": 10.851876258850098, + "learning_rate": 9.952755285957218e-05, + "loss": 0.1223, + "step": 5360 + }, + { + "epoch": 0.4191055958791852, + "grad_norm": 61.75676345825195, + "learning_rate": 9.952130535354585e-05, + "loss": 1.1446, + "step": 5370 + }, + { + "epoch": 0.41988605322719114, + "grad_norm": 6.387716293334961, + "learning_rate": 9.9515017009457e-05, + "loss": 4.3004, + "step": 5380 + }, + { + "epoch": 0.42066651057519705, + "grad_norm": 3.521226644515991, + "learning_rate": 9.950868783249136e-05, + "loss": 0.4539, + "step": 5390 + }, + { + "epoch": 0.421446967923203, + "grad_norm": 76.65003967285156, + "learning_rate": 9.950231782786832e-05, + "loss": 1.3463, + "step": 5400 + }, + { + "epoch": 0.42222742527120893, + "grad_norm": 58.46784973144531, + "learning_rate": 9.949590700084096e-05, + "loss": 1.5774, + "step": 5410 + }, + { + "epoch": 0.42300788261921485, + "grad_norm": 29.42442512512207, + "learning_rate": 9.9489455356696e-05, + "loss": 1.471, + "step": 5420 + }, + { + "epoch": 0.4237883399672208, + "grad_norm": 49.83870315551758, + "learning_rate": 9.948296290075385e-05, + "loss": 0.7125, + "step": 5430 + }, + { + "epoch": 0.42456879731522673, + "grad_norm": 8.928007125854492, + "learning_rate": 9.947642963836852e-05, + "loss": 0.4729, + "step": 5440 + }, + { + "epoch": 0.42534925466323265, + "grad_norm": 0.006001488771289587, + "learning_rate": 9.946985557492774e-05, + "loss": 0.8174, + "step": 5450 + }, + { + "epoch": 0.42612971201123856, + "grad_norm": 3.2906408309936523, + "learning_rate": 9.946324071585283e-05, + "loss": 0.7242, + "step": 5460 + }, + { + "epoch": 0.42691016935924453, + "grad_norm": 5.074894428253174, + "learning_rate": 9.945658506659878e-05, + "loss": 1.5351, + "step": 5470 + }, + { + "epoch": 0.42769062670725044, + "grad_norm": 0.013566683046519756, + "learning_rate": 9.944988863265423e-05, + "loss": 0.3799, + "step": 5480 + }, + { + "epoch": 0.42847108405525636, + "grad_norm": 0.2592769265174866, + "learning_rate": 9.944315141954142e-05, + "loss": 0.3326, + "step": 5490 + }, + { + "epoch": 0.4292515414032623, + "grad_norm": 56.55481719970703, + "learning_rate": 9.943637343281623e-05, + "loss": 0.5757, + "step": 5500 + }, + { + "epoch": 0.43003199875126824, + "grad_norm": 52.10049819946289, + "learning_rate": 9.94295546780682e-05, + "loss": 0.3103, + "step": 5510 + }, + { + "epoch": 0.43081245609927415, + "grad_norm": 0.00042787150596268475, + "learning_rate": 9.942269516092043e-05, + "loss": 0.3928, + "step": 5520 + }, + { + "epoch": 0.4315929134472801, + "grad_norm": 3.384069771072973e-07, + "learning_rate": 9.941579488702968e-05, + "loss": 2.6701, + "step": 5530 + }, + { + "epoch": 0.43237337079528604, + "grad_norm": 69.2872085571289, + "learning_rate": 9.940885386208631e-05, + "loss": 0.7537, + "step": 5540 + }, + { + "epoch": 0.43315382814329195, + "grad_norm": 2.248446464538574, + "learning_rate": 9.940187209181427e-05, + "loss": 0.0384, + "step": 5550 + }, + { + "epoch": 0.4339342854912979, + "grad_norm": 0.11496047675609589, + "learning_rate": 9.939484958197112e-05, + "loss": 0.8702, + "step": 5560 + }, + { + "epoch": 0.43471474283930384, + "grad_norm": 83.21031188964844, + "learning_rate": 9.938778633834802e-05, + "loss": 2.4813, + "step": 5570 + }, + { + "epoch": 0.43549520018730975, + "grad_norm": 40.44232177734375, + "learning_rate": 9.938068236676974e-05, + "loss": 1.668, + "step": 5580 + }, + { + "epoch": 0.4362756575353157, + "grad_norm": 52.226715087890625, + "learning_rate": 9.937353767309458e-05, + "loss": 2.4203, + "step": 5590 + }, + { + "epoch": 0.43705611488332163, + "grad_norm": 93.075439453125, + "learning_rate": 9.936635226321447e-05, + "loss": 0.6886, + "step": 5600 + }, + { + "epoch": 0.43783657223132755, + "grad_norm": 0.0370655320584774, + "learning_rate": 9.935912614305493e-05, + "loss": 1.5385, + "step": 5610 + }, + { + "epoch": 0.43861702957933346, + "grad_norm": 50.700130462646484, + "learning_rate": 9.935185931857499e-05, + "loss": 1.234, + "step": 5620 + }, + { + "epoch": 0.43939748692733943, + "grad_norm": 16.66071891784668, + "learning_rate": 9.93445517957673e-05, + "loss": 0.2529, + "step": 5630 + }, + { + "epoch": 0.44017794427534535, + "grad_norm": 0.025942066684365273, + "learning_rate": 9.933720358065807e-05, + "loss": 0.9281, + "step": 5640 + }, + { + "epoch": 0.44095840162335126, + "grad_norm": 0.00847440306097269, + "learning_rate": 9.932981467930702e-05, + "loss": 1.5612, + "step": 5650 + }, + { + "epoch": 0.44173885897135723, + "grad_norm": 1.0681140422821045, + "learning_rate": 9.932238509780749e-05, + "loss": 1.734, + "step": 5660 + }, + { + "epoch": 0.44251931631936314, + "grad_norm": 56.31792449951172, + "learning_rate": 9.93149148422863e-05, + "loss": 2.0712, + "step": 5670 + }, + { + "epoch": 0.44329977366736906, + "grad_norm": 1.9309101104736328, + "learning_rate": 9.930740391890386e-05, + "loss": 1.5979, + "step": 5680 + }, + { + "epoch": 0.444080231015375, + "grad_norm": 61.22231674194336, + "learning_rate": 9.92998523338541e-05, + "loss": 1.4279, + "step": 5690 + }, + { + "epoch": 0.44486068836338094, + "grad_norm": 86.64026641845703, + "learning_rate": 9.92922600933645e-05, + "loss": 0.7382, + "step": 5700 + }, + { + "epoch": 0.44564114571138685, + "grad_norm": 4.9843316078186035, + "learning_rate": 9.928462720369602e-05, + "loss": 2.5206, + "step": 5710 + }, + { + "epoch": 0.4464216030593928, + "grad_norm": 17.059606552124023, + "learning_rate": 9.927695367114318e-05, + "loss": 1.4367, + "step": 5720 + }, + { + "epoch": 0.44720206040739874, + "grad_norm": 1.7653934955596924, + "learning_rate": 9.926923950203402e-05, + "loss": 0.8985, + "step": 5730 + }, + { + "epoch": 0.44798251775540465, + "grad_norm": 32.64850997924805, + "learning_rate": 9.926148470273007e-05, + "loss": 0.8873, + "step": 5740 + }, + { + "epoch": 0.4487629751034106, + "grad_norm": 56.86506271362305, + "learning_rate": 9.925368927962637e-05, + "loss": 2.0438, + "step": 5750 + }, + { + "epoch": 0.44954343245141654, + "grad_norm": 53.06357192993164, + "learning_rate": 9.924585323915148e-05, + "loss": 0.9069, + "step": 5760 + }, + { + "epoch": 0.45032388979942245, + "grad_norm": 0.1541537493467331, + "learning_rate": 9.923797658776744e-05, + "loss": 0.9701, + "step": 5770 + }, + { + "epoch": 0.4511043471474284, + "grad_norm": 55.30256271362305, + "learning_rate": 9.923005933196977e-05, + "loss": 0.752, + "step": 5780 + }, + { + "epoch": 0.45188480449543433, + "grad_norm": 12.553262710571289, + "learning_rate": 9.922210147828749e-05, + "loss": 0.5744, + "step": 5790 + }, + { + "epoch": 0.45266526184344025, + "grad_norm": 56.211185455322266, + "learning_rate": 9.92141030332831e-05, + "loss": 2.1868, + "step": 5800 + }, + { + "epoch": 0.45344571919144616, + "grad_norm": 44.826934814453125, + "learning_rate": 9.920606400355255e-05, + "loss": 0.6291, + "step": 5810 + }, + { + "epoch": 0.45422617653945213, + "grad_norm": 0.347988098859787, + "learning_rate": 9.91979843957253e-05, + "loss": 1.1644, + "step": 5820 + }, + { + "epoch": 0.45500663388745805, + "grad_norm": 41.838741302490234, + "learning_rate": 9.918986421646425e-05, + "loss": 1.897, + "step": 5830 + }, + { + "epoch": 0.45578709123546396, + "grad_norm": 0.07808752357959747, + "learning_rate": 9.918170347246574e-05, + "loss": 0.6393, + "step": 5840 + }, + { + "epoch": 0.45656754858346993, + "grad_norm": 24.800209045410156, + "learning_rate": 9.91735021704596e-05, + "loss": 0.6908, + "step": 5850 + }, + { + "epoch": 0.45734800593147584, + "grad_norm": 28.077816009521484, + "learning_rate": 9.916526031720908e-05, + "loss": 0.6714, + "step": 5860 + }, + { + "epoch": 0.45812846327948176, + "grad_norm": 64.07998657226562, + "learning_rate": 9.915697791951086e-05, + "loss": 0.8563, + "step": 5870 + }, + { + "epoch": 0.4589089206274877, + "grad_norm": 0.026845192536711693, + "learning_rate": 9.91486549841951e-05, + "loss": 1.9563, + "step": 5880 + }, + { + "epoch": 0.45968937797549364, + "grad_norm": 37.07597351074219, + "learning_rate": 9.914029151812534e-05, + "loss": 0.4418, + "step": 5890 + }, + { + "epoch": 0.46046983532349955, + "grad_norm": 1.552006483078003, + "learning_rate": 9.913188752819857e-05, + "loss": 1.8555, + "step": 5900 + }, + { + "epoch": 0.4612502926715055, + "grad_norm": 66.1205062866211, + "learning_rate": 9.912344302134521e-05, + "loss": 1.4313, + "step": 5910 + }, + { + "epoch": 0.46203075001951144, + "grad_norm": 4.578004360198975, + "learning_rate": 9.911495800452906e-05, + "loss": 2.7846, + "step": 5920 + }, + { + "epoch": 0.46281120736751735, + "grad_norm": 37.89592742919922, + "learning_rate": 9.910643248474735e-05, + "loss": 0.3734, + "step": 5930 + }, + { + "epoch": 0.4635916647155233, + "grad_norm": 8.450699533568695e-05, + "learning_rate": 9.909786646903072e-05, + "loss": 0.4354, + "step": 5940 + }, + { + "epoch": 0.46437212206352924, + "grad_norm": 4.293805977795273e-05, + "learning_rate": 9.908925996444316e-05, + "loss": 0.3882, + "step": 5950 + }, + { + "epoch": 0.46515257941153515, + "grad_norm": 1.9179746857389546e-07, + "learning_rate": 9.908061297808209e-05, + "loss": 1.4029, + "step": 5960 + }, + { + "epoch": 0.46593303675954106, + "grad_norm": 0.0844622254371643, + "learning_rate": 9.907192551707831e-05, + "loss": 2.5749, + "step": 5970 + }, + { + "epoch": 0.46671349410754703, + "grad_norm": 17.458057403564453, + "learning_rate": 9.9063197588596e-05, + "loss": 1.4711, + "step": 5980 + }, + { + "epoch": 0.46749395145555295, + "grad_norm": 2.408496856689453, + "learning_rate": 9.905442919983266e-05, + "loss": 0.2675, + "step": 5990 + }, + { + "epoch": 0.46827440880355886, + "grad_norm": 38.14802932739258, + "learning_rate": 9.904562035801924e-05, + "loss": 0.9949, + "step": 6000 + }, + { + "epoch": 0.46905486615156483, + "grad_norm": 0.0007477306644432247, + "learning_rate": 9.903677107042e-05, + "loss": 0.0908, + "step": 6010 + }, + { + "epoch": 0.46983532349957075, + "grad_norm": 54.57261657714844, + "learning_rate": 9.902788134433256e-05, + "loss": 1.3682, + "step": 6020 + }, + { + "epoch": 0.47061578084757666, + "grad_norm": 63.36696243286133, + "learning_rate": 9.901895118708788e-05, + "loss": 1.0471, + "step": 6030 + }, + { + "epoch": 0.47139623819558263, + "grad_norm": 0.04630555212497711, + "learning_rate": 9.900998060605027e-05, + "loss": 0.111, + "step": 6040 + }, + { + "epoch": 0.47217669554358854, + "grad_norm": 66.14763641357422, + "learning_rate": 9.90009696086174e-05, + "loss": 1.1918, + "step": 6050 + }, + { + "epoch": 0.47295715289159446, + "grad_norm": 0.3311450779438019, + "learning_rate": 9.899191820222022e-05, + "loss": 0.7873, + "step": 6060 + }, + { + "epoch": 0.4737376102396004, + "grad_norm": 52.740875244140625, + "learning_rate": 9.898282639432306e-05, + "loss": 1.5445, + "step": 6070 + }, + { + "epoch": 0.47451806758760634, + "grad_norm": 28.408681869506836, + "learning_rate": 9.89736941924235e-05, + "loss": 1.5339, + "step": 6080 + }, + { + "epoch": 0.47529852493561225, + "grad_norm": 0.33915793895721436, + "learning_rate": 9.896452160405252e-05, + "loss": 1.1844, + "step": 6090 + }, + { + "epoch": 0.4760789822836182, + "grad_norm": 2.4481306076049805, + "learning_rate": 9.895530863677429e-05, + "loss": 0.3326, + "step": 6100 + }, + { + "epoch": 0.47685943963162414, + "grad_norm": 9.80547046661377, + "learning_rate": 9.89460552981864e-05, + "loss": 0.6921, + "step": 6110 + }, + { + "epoch": 0.47763989697963005, + "grad_norm": 3.3799116749833047e-07, + "learning_rate": 9.893676159591963e-05, + "loss": 0.5425, + "step": 6120 + }, + { + "epoch": 0.478420354327636, + "grad_norm": 1.6407069836077426e-07, + "learning_rate": 9.892742753763813e-05, + "loss": 0.0921, + "step": 6130 + }, + { + "epoch": 0.47920081167564194, + "grad_norm": 37.7635612487793, + "learning_rate": 9.891805313103927e-05, + "loss": 0.3376, + "step": 6140 + }, + { + "epoch": 0.47998126902364785, + "grad_norm": 0.0007379294838756323, + "learning_rate": 9.890863838385371e-05, + "loss": 1.2239, + "step": 6150 + }, + { + "epoch": 0.48076172637165376, + "grad_norm": 1.5424108505249023, + "learning_rate": 9.88991833038454e-05, + "loss": 0.969, + "step": 6160 + }, + { + "epoch": 0.48154218371965973, + "grad_norm": 1.8169808413404098e-07, + "learning_rate": 9.888968789881152e-05, + "loss": 1.5421, + "step": 6170 + }, + { + "epoch": 0.48232264106766565, + "grad_norm": 0.4752025604248047, + "learning_rate": 9.888015217658251e-05, + "loss": 0.2779, + "step": 6180 + }, + { + "epoch": 0.48310309841567156, + "grad_norm": 0.13063034415245056, + "learning_rate": 9.887057614502208e-05, + "loss": 3.2541, + "step": 6190 + }, + { + "epoch": 0.48388355576367753, + "grad_norm": 0.24971577525138855, + "learning_rate": 9.886095981202715e-05, + "loss": 1.0007, + "step": 6200 + }, + { + "epoch": 0.48466401311168344, + "grad_norm": 0.011976394802331924, + "learning_rate": 9.88513031855279e-05, + "loss": 0.0139, + "step": 6210 + }, + { + "epoch": 0.48544447045968936, + "grad_norm": 52.369041442871094, + "learning_rate": 9.884160627348775e-05, + "loss": 2.0091, + "step": 6220 + }, + { + "epoch": 0.48622492780769533, + "grad_norm": 0.3506132662296295, + "learning_rate": 9.883186908390329e-05, + "loss": 0.4435, + "step": 6230 + }, + { + "epoch": 0.48700538515570124, + "grad_norm": 4.650276184082031, + "learning_rate": 9.882209162480437e-05, + "loss": 1.6357, + "step": 6240 + }, + { + "epoch": 0.48778584250370716, + "grad_norm": 56.31660461425781, + "learning_rate": 9.881227390425403e-05, + "loss": 0.9129, + "step": 6250 + }, + { + "epoch": 0.4885662998517131, + "grad_norm": 2.010699987411499, + "learning_rate": 9.88024159303485e-05, + "loss": 0.9621, + "step": 6260 + }, + { + "epoch": 0.48934675719971904, + "grad_norm": 0.18086794018745422, + "learning_rate": 9.879251771121725e-05, + "loss": 1.5871, + "step": 6270 + }, + { + "epoch": 0.49012721454772495, + "grad_norm": 3.91213321685791, + "learning_rate": 9.87825792550229e-05, + "loss": 2.2983, + "step": 6280 + }, + { + "epoch": 0.4909076718957309, + "grad_norm": 51.183929443359375, + "learning_rate": 9.877260056996126e-05, + "loss": 0.8454, + "step": 6290 + }, + { + "epoch": 0.49168812924373684, + "grad_norm": 34.326629638671875, + "learning_rate": 9.876258166426131e-05, + "loss": 1.191, + "step": 6300 + }, + { + "epoch": 0.49246858659174275, + "grad_norm": 1.0975525379180908, + "learning_rate": 9.875252254618523e-05, + "loss": 0.2663, + "step": 6310 + }, + { + "epoch": 0.49324904393974867, + "grad_norm": 3.1184887886047363, + "learning_rate": 9.87424232240283e-05, + "loss": 1.3323, + "step": 6320 + }, + { + "epoch": 0.49402950128775464, + "grad_norm": 8.74647331237793, + "learning_rate": 9.873228370611903e-05, + "loss": 0.4933, + "step": 6330 + }, + { + "epoch": 0.49480995863576055, + "grad_norm": 51.158206939697266, + "learning_rate": 9.872210400081898e-05, + "loss": 1.4282, + "step": 6340 + }, + { + "epoch": 0.49559041598376646, + "grad_norm": 10.619514465332031, + "learning_rate": 9.871188411652296e-05, + "loss": 0.8554, + "step": 6350 + }, + { + "epoch": 0.49637087333177243, + "grad_norm": 34.76881790161133, + "learning_rate": 9.870162406165888e-05, + "loss": 0.8635, + "step": 6360 + }, + { + "epoch": 0.49715133067977835, + "grad_norm": 0.014710797928273678, + "learning_rate": 9.86913238446877e-05, + "loss": 1.0073, + "step": 6370 + }, + { + "epoch": 0.49793178802778426, + "grad_norm": 0.00262262555770576, + "learning_rate": 9.86809834741036e-05, + "loss": 0.1259, + "step": 6380 + }, + { + "epoch": 0.49871224537579023, + "grad_norm": 0.0012736758217215538, + "learning_rate": 9.867060295843382e-05, + "loss": 0.9914, + "step": 6390 + }, + { + "epoch": 0.49949270272379614, + "grad_norm": 35.19121551513672, + "learning_rate": 9.866018230623873e-05, + "loss": 1.8764, + "step": 6400 + }, + { + "epoch": 0.5002731600718021, + "grad_norm": 0.935328483581543, + "learning_rate": 9.864972152611178e-05, + "loss": 1.3374, + "step": 6410 + }, + { + "epoch": 0.501053617419808, + "grad_norm": 23.448575973510742, + "learning_rate": 9.863922062667951e-05, + "loss": 1.4024, + "step": 6420 + }, + { + "epoch": 0.5018340747678139, + "grad_norm": 6.185776233673096, + "learning_rate": 9.862867961660157e-05, + "loss": 0.4412, + "step": 6430 + }, + { + "epoch": 0.5026145321158199, + "grad_norm": 0.019975431263446808, + "learning_rate": 9.861809850457065e-05, + "loss": 0.2771, + "step": 6440 + }, + { + "epoch": 0.5033949894638258, + "grad_norm": 3.049410820007324, + "learning_rate": 9.860747729931257e-05, + "loss": 0.9518, + "step": 6450 + }, + { + "epoch": 0.5041754468118317, + "grad_norm": 28.222091674804688, + "learning_rate": 9.859681600958614e-05, + "loss": 1.3099, + "step": 6460 + }, + { + "epoch": 0.5049559041598377, + "grad_norm": 0.0015314549673348665, + "learning_rate": 9.858611464418327e-05, + "loss": 0.3673, + "step": 6470 + }, + { + "epoch": 0.5057363615078436, + "grad_norm": 0.2558833658695221, + "learning_rate": 9.857537321192893e-05, + "loss": 1.1245, + "step": 6480 + }, + { + "epoch": 0.5065168188558495, + "grad_norm": 2.1790192127227783, + "learning_rate": 9.856459172168108e-05, + "loss": 0.5932, + "step": 6490 + }, + { + "epoch": 0.5072972762038555, + "grad_norm": 51.02792739868164, + "learning_rate": 9.855377018233076e-05, + "loss": 0.6189, + "step": 6500 + }, + { + "epoch": 0.5080777335518614, + "grad_norm": 54.68989181518555, + "learning_rate": 9.854290860280203e-05, + "loss": 2.7634, + "step": 6510 + }, + { + "epoch": 0.5088581908998673, + "grad_norm": 3.412045478820801, + "learning_rate": 9.853200699205194e-05, + "loss": 0.7287, + "step": 6520 + }, + { + "epoch": 0.5096386482478733, + "grad_norm": 83.18421173095703, + "learning_rate": 9.85210653590706e-05, + "loss": 1.2859, + "step": 6530 + }, + { + "epoch": 0.5104191055958792, + "grad_norm": 1.2750853300094604, + "learning_rate": 9.851008371288106e-05, + "loss": 0.649, + "step": 6540 + }, + { + "epoch": 0.5111995629438851, + "grad_norm": 2.0689053535461426, + "learning_rate": 9.849906206253945e-05, + "loss": 1.6541, + "step": 6550 + }, + { + "epoch": 0.511980020291891, + "grad_norm": 58.269386291503906, + "learning_rate": 9.848800041713482e-05, + "loss": 1.5054, + "step": 6560 + }, + { + "epoch": 0.512760477639897, + "grad_norm": 61.32803726196289, + "learning_rate": 9.847689878578923e-05, + "loss": 1.2377, + "step": 6570 + }, + { + "epoch": 0.5135409349879029, + "grad_norm": 0.00047993057523854077, + "learning_rate": 9.846575717765772e-05, + "loss": 1.1732, + "step": 6580 + }, + { + "epoch": 0.5143213923359088, + "grad_norm": 0.004535530228167772, + "learning_rate": 9.845457560192827e-05, + "loss": 1.0141, + "step": 6590 + }, + { + "epoch": 0.5151018496839148, + "grad_norm": 0.3655521273612976, + "learning_rate": 9.844335406782186e-05, + "loss": 1.2912, + "step": 6600 + }, + { + "epoch": 0.5158823070319207, + "grad_norm": 0.005310993175953627, + "learning_rate": 9.843209258459237e-05, + "loss": 0.2338, + "step": 6610 + }, + { + "epoch": 0.5166627643799266, + "grad_norm": 1.9839240312576294, + "learning_rate": 9.84207911615267e-05, + "loss": 0.5871, + "step": 6620 + }, + { + "epoch": 0.5174432217279326, + "grad_norm": 7.782805937495141e-07, + "learning_rate": 9.84094498079446e-05, + "loss": 0.4506, + "step": 6630 + }, + { + "epoch": 0.5182236790759385, + "grad_norm": 54.680179595947266, + "learning_rate": 9.83980685331988e-05, + "loss": 1.7223, + "step": 6640 + }, + { + "epoch": 0.5190041364239444, + "grad_norm": 43.05501937866211, + "learning_rate": 9.838664734667495e-05, + "loss": 0.7108, + "step": 6650 + }, + { + "epoch": 0.5197845937719504, + "grad_norm": 0.022166762501001358, + "learning_rate": 9.837518625779161e-05, + "loss": 0.7489, + "step": 6660 + }, + { + "epoch": 0.5205650511199563, + "grad_norm": 41.44973373413086, + "learning_rate": 9.836368527600022e-05, + "loss": 0.7476, + "step": 6670 + }, + { + "epoch": 0.5213455084679622, + "grad_norm": 44.777591705322266, + "learning_rate": 9.835214441078515e-05, + "loss": 1.1241, + "step": 6680 + }, + { + "epoch": 0.5221259658159682, + "grad_norm": 39.23445510864258, + "learning_rate": 9.834056367166365e-05, + "loss": 1.8415, + "step": 6690 + }, + { + "epoch": 0.5229064231639741, + "grad_norm": 30.00687026977539, + "learning_rate": 9.832894306818585e-05, + "loss": 0.5706, + "step": 6700 + }, + { + "epoch": 0.52368688051198, + "grad_norm": 68.04293823242188, + "learning_rate": 9.831728260993476e-05, + "loss": 1.0121, + "step": 6710 + }, + { + "epoch": 0.524467337859986, + "grad_norm": 4.57463264465332, + "learning_rate": 9.830558230652624e-05, + "loss": 0.3254, + "step": 6720 + }, + { + "epoch": 0.5252477952079919, + "grad_norm": 18.793424606323242, + "learning_rate": 9.829384216760904e-05, + "loss": 2.0302, + "step": 6730 + }, + { + "epoch": 0.5260282525559978, + "grad_norm": 7.409657001495361, + "learning_rate": 9.828206220286472e-05, + "loss": 1.2441, + "step": 6740 + }, + { + "epoch": 0.5268087099040037, + "grad_norm": 48.33645248413086, + "learning_rate": 9.827024242200771e-05, + "loss": 1.3994, + "step": 6750 + }, + { + "epoch": 0.5275891672520097, + "grad_norm": 56.547359466552734, + "learning_rate": 9.825838283478528e-05, + "loss": 0.8465, + "step": 6760 + }, + { + "epoch": 0.5283696246000156, + "grad_norm": 0.01121858786791563, + "learning_rate": 9.824648345097749e-05, + "loss": 1.0637, + "step": 6770 + }, + { + "epoch": 0.5291500819480215, + "grad_norm": 8.223366737365723, + "learning_rate": 9.823454428039726e-05, + "loss": 0.7012, + "step": 6780 + }, + { + "epoch": 0.5299305392960275, + "grad_norm": 0.00867557991296053, + "learning_rate": 9.822256533289032e-05, + "loss": 0.8317, + "step": 6790 + }, + { + "epoch": 0.5307109966440334, + "grad_norm": 0.0009431852377019823, + "learning_rate": 9.821054661833516e-05, + "loss": 0.7048, + "step": 6800 + }, + { + "epoch": 0.5314914539920393, + "grad_norm": 1.0806217193603516, + "learning_rate": 9.819848814664307e-05, + "loss": 1.2247, + "step": 6810 + }, + { + "epoch": 0.5322719113400453, + "grad_norm": 5.017229080200195, + "learning_rate": 9.81863899277582e-05, + "loss": 0.4701, + "step": 6820 + }, + { + "epoch": 0.5330523686880512, + "grad_norm": 13.995375633239746, + "learning_rate": 9.817425197165739e-05, + "loss": 2.9128, + "step": 6830 + }, + { + "epoch": 0.5338328260360571, + "grad_norm": 82.41301727294922, + "learning_rate": 9.816207428835029e-05, + "loss": 2.4041, + "step": 6840 + }, + { + "epoch": 0.5346132833840631, + "grad_norm": 66.8274917602539, + "learning_rate": 9.814985688787932e-05, + "loss": 3.669, + "step": 6850 + }, + { + "epoch": 0.535393740732069, + "grad_norm": 3.1551668643951416, + "learning_rate": 9.813759978031962e-05, + "loss": 0.5636, + "step": 6860 + }, + { + "epoch": 0.5361741980800749, + "grad_norm": 20.27729606628418, + "learning_rate": 9.812530297577908e-05, + "loss": 1.534, + "step": 6870 + }, + { + "epoch": 0.5369546554280809, + "grad_norm": 0.1851717084646225, + "learning_rate": 9.811296648439837e-05, + "loss": 0.9975, + "step": 6880 + }, + { + "epoch": 0.5377351127760868, + "grad_norm": 38.749061584472656, + "learning_rate": 9.810059031635084e-05, + "loss": 1.0313, + "step": 6890 + }, + { + "epoch": 0.5385155701240927, + "grad_norm": 24.485456466674805, + "learning_rate": 9.808817448184258e-05, + "loss": 1.5311, + "step": 6900 + }, + { + "epoch": 0.5392960274720987, + "grad_norm": 0.004445152822881937, + "learning_rate": 9.807571899111237e-05, + "loss": 0.1711, + "step": 6910 + }, + { + "epoch": 0.5400764848201046, + "grad_norm": 1.9483662843704224, + "learning_rate": 9.806322385443174e-05, + "loss": 0.1513, + "step": 6920 + }, + { + "epoch": 0.5408569421681105, + "grad_norm": 0.0006264004623517394, + "learning_rate": 9.805068908210488e-05, + "loss": 2.5281, + "step": 6930 + }, + { + "epoch": 0.5416373995161164, + "grad_norm": 53.3282356262207, + "learning_rate": 9.803811468446864e-05, + "loss": 1.6079, + "step": 6940 + }, + { + "epoch": 0.5424178568641224, + "grad_norm": 19.502546310424805, + "learning_rate": 9.802550067189263e-05, + "loss": 0.1501, + "step": 6950 + }, + { + "epoch": 0.5431983142121283, + "grad_norm": 20.476640701293945, + "learning_rate": 9.801284705477902e-05, + "loss": 0.4741, + "step": 6960 + }, + { + "epoch": 0.5439787715601342, + "grad_norm": 48.1911506652832, + "learning_rate": 9.800015384356271e-05, + "loss": 1.6906, + "step": 6970 + }, + { + "epoch": 0.5447592289081402, + "grad_norm": 0.8714758157730103, + "learning_rate": 9.798742104871129e-05, + "loss": 1.0196, + "step": 6980 + }, + { + "epoch": 0.5455396862561461, + "grad_norm": 1.112245868739592e-07, + "learning_rate": 9.797464868072488e-05, + "loss": 0.0536, + "step": 6990 + }, + { + "epoch": 0.546320143604152, + "grad_norm": 0.03219883516430855, + "learning_rate": 9.796183675013632e-05, + "loss": 0.523, + "step": 7000 + }, + { + "epoch": 0.547100600952158, + "grad_norm": 6.496858172971853e-13, + "learning_rate": 9.794898526751104e-05, + "loss": 1.7537, + "step": 7010 + }, + { + "epoch": 0.5478810583001639, + "grad_norm": 1.2731820042688469e-09, + "learning_rate": 9.793609424344712e-05, + "loss": 0.3894, + "step": 7020 + }, + { + "epoch": 0.5486615156481698, + "grad_norm": 0.006216781213879585, + "learning_rate": 9.792316368857519e-05, + "loss": 1.6057, + "step": 7030 + }, + { + "epoch": 0.5494419729961758, + "grad_norm": 1.2926510528643576e-08, + "learning_rate": 9.791019361355855e-05, + "loss": 5.085, + "step": 7040 + }, + { + "epoch": 0.5502224303441817, + "grad_norm": 3.7267448902130127, + "learning_rate": 9.789718402909305e-05, + "loss": 1.3625, + "step": 7050 + }, + { + "epoch": 0.5510028876921876, + "grad_norm": 25.583145141601562, + "learning_rate": 9.788413494590711e-05, + "loss": 1.5236, + "step": 7060 + }, + { + "epoch": 0.5517833450401936, + "grad_norm": 0.00015216317842714489, + "learning_rate": 9.787104637476175e-05, + "loss": 1.4341, + "step": 7070 + }, + { + "epoch": 0.5525638023881995, + "grad_norm": 47.065372467041016, + "learning_rate": 9.785791832645055e-05, + "loss": 0.1853, + "step": 7080 + }, + { + "epoch": 0.5533442597362054, + "grad_norm": 0.002304552122950554, + "learning_rate": 9.784475081179962e-05, + "loss": 1.5718, + "step": 7090 + }, + { + "epoch": 0.5541247170842113, + "grad_norm": 21.46728515625, + "learning_rate": 9.783154384166766e-05, + "loss": 1.6335, + "step": 7100 + }, + { + "epoch": 0.5549051744322173, + "grad_norm": 423.6686096191406, + "learning_rate": 9.781829742694588e-05, + "loss": 1.1247, + "step": 7110 + }, + { + "epoch": 0.5556856317802232, + "grad_norm": 10.277878761291504, + "learning_rate": 9.780501157855801e-05, + "loss": 0.4371, + "step": 7120 + }, + { + "epoch": 0.5564660891282291, + "grad_norm": 7.607505949636106e-07, + "learning_rate": 9.77916863074603e-05, + "loss": 0.0516, + "step": 7130 + }, + { + "epoch": 0.5572465464762351, + "grad_norm": 3.1203581940530967e-09, + "learning_rate": 9.777832162464154e-05, + "loss": 2.3446, + "step": 7140 + }, + { + "epoch": 0.558027003824241, + "grad_norm": 10.839029312133789, + "learning_rate": 9.776491754112299e-05, + "loss": 0.958, + "step": 7150 + }, + { + "epoch": 0.5588074611722469, + "grad_norm": 0.4206421375274658, + "learning_rate": 9.775147406795841e-05, + "loss": 1.2235, + "step": 7160 + }, + { + "epoch": 0.5595879185202529, + "grad_norm": 0.00017085533181671053, + "learning_rate": 9.773799121623408e-05, + "loss": 0.4796, + "step": 7170 + }, + { + "epoch": 0.5603683758682588, + "grad_norm": 26.582189559936523, + "learning_rate": 9.772446899706868e-05, + "loss": 0.6587, + "step": 7180 + }, + { + "epoch": 0.5611488332162647, + "grad_norm": 42.50886917114258, + "learning_rate": 9.771090742161342e-05, + "loss": 1.4833, + "step": 7190 + }, + { + "epoch": 0.5619292905642707, + "grad_norm": 50.29939270019531, + "learning_rate": 9.769730650105191e-05, + "loss": 1.683, + "step": 7200 + }, + { + "epoch": 0.5627097479122766, + "grad_norm": 10.882280349731445, + "learning_rate": 9.768366624660028e-05, + "loss": 0.7569, + "step": 7210 + }, + { + "epoch": 0.5634902052602825, + "grad_norm": 60.33913040161133, + "learning_rate": 9.766998666950702e-05, + "loss": 1.2821, + "step": 7220 + }, + { + "epoch": 0.5642706626082885, + "grad_norm": 41.544944763183594, + "learning_rate": 9.765626778105308e-05, + "loss": 1.24, + "step": 7230 + }, + { + "epoch": 0.5650511199562944, + "grad_norm": 13.627266883850098, + "learning_rate": 9.764250959255186e-05, + "loss": 0.624, + "step": 7240 + }, + { + "epoch": 0.5658315773043003, + "grad_norm": 30.541210174560547, + "learning_rate": 9.76287121153491e-05, + "loss": 0.7803, + "step": 7250 + }, + { + "epoch": 0.5666120346523063, + "grad_norm": 47.805694580078125, + "learning_rate": 9.761487536082302e-05, + "loss": 0.4765, + "step": 7260 + }, + { + "epoch": 0.5673924920003122, + "grad_norm": 59.42711639404297, + "learning_rate": 9.760099934038415e-05, + "loss": 0.8472, + "step": 7270 + }, + { + "epoch": 0.5681729493483181, + "grad_norm": 19.21670913696289, + "learning_rate": 9.758708406547546e-05, + "loss": 0.8136, + "step": 7280 + }, + { + "epoch": 0.568953406696324, + "grad_norm": 0.024506118148565292, + "learning_rate": 9.757312954757228e-05, + "loss": 0.4705, + "step": 7290 + }, + { + "epoch": 0.56973386404433, + "grad_norm": 13.569670677185059, + "learning_rate": 9.755913579818226e-05, + "loss": 0.8712, + "step": 7300 + }, + { + "epoch": 0.5705143213923359, + "grad_norm": 46.79026794433594, + "learning_rate": 9.754510282884546e-05, + "loss": 1.7591, + "step": 7310 + }, + { + "epoch": 0.5712947787403418, + "grad_norm": 0.011232595890760422, + "learning_rate": 9.753103065113424e-05, + "loss": 0.3608, + "step": 7320 + }, + { + "epoch": 0.5720752360883478, + "grad_norm": 3.212200403213501, + "learning_rate": 9.751691927665334e-05, + "loss": 0.3634, + "step": 7330 + }, + { + "epoch": 0.5728556934363537, + "grad_norm": 0.02611798420548439, + "learning_rate": 9.750276871703979e-05, + "loss": 0.9882, + "step": 7340 + }, + { + "epoch": 0.5736361507843596, + "grad_norm": 40.64421081542969, + "learning_rate": 9.74885789839629e-05, + "loss": 0.4115, + "step": 7350 + }, + { + "epoch": 0.5744166081323656, + "grad_norm": 0.035366058349609375, + "learning_rate": 9.747435008912438e-05, + "loss": 1.0184, + "step": 7360 + }, + { + "epoch": 0.5751970654803715, + "grad_norm": 1.079415202140808, + "learning_rate": 9.746008204425814e-05, + "loss": 1.518, + "step": 7370 + }, + { + "epoch": 0.5759775228283774, + "grad_norm": 48.33514404296875, + "learning_rate": 9.744577486113042e-05, + "loss": 1.0792, + "step": 7380 + }, + { + "epoch": 0.5767579801763834, + "grad_norm": 46.865501403808594, + "learning_rate": 9.743142855153976e-05, + "loss": 1.2278, + "step": 7390 + }, + { + "epoch": 0.5775384375243893, + "grad_norm": 0.035304367542266846, + "learning_rate": 9.741704312731691e-05, + "loss": 2.4778, + "step": 7400 + }, + { + "epoch": 0.5783188948723952, + "grad_norm": 1.251697301864624, + "learning_rate": 9.740261860032491e-05, + "loss": 0.6881, + "step": 7410 + }, + { + "epoch": 0.5790993522204012, + "grad_norm": 38.47404861450195, + "learning_rate": 9.738815498245902e-05, + "loss": 0.8292, + "step": 7420 + }, + { + "epoch": 0.5798798095684071, + "grad_norm": 40.813621520996094, + "learning_rate": 9.737365228564679e-05, + "loss": 1.4995, + "step": 7430 + }, + { + "epoch": 0.580660266916413, + "grad_norm": 36.29606246948242, + "learning_rate": 9.735911052184794e-05, + "loss": 0.9977, + "step": 7440 + }, + { + "epoch": 0.581440724264419, + "grad_norm": 0.03261225298047066, + "learning_rate": 9.734452970305443e-05, + "loss": 0.5158, + "step": 7450 + }, + { + "epoch": 0.5822211816124249, + "grad_norm": 2.965651273727417, + "learning_rate": 9.732990984129042e-05, + "loss": 0.605, + "step": 7460 + }, + { + "epoch": 0.5830016389604308, + "grad_norm": 3.623199701309204, + "learning_rate": 9.73152509486123e-05, + "loss": 0.3894, + "step": 7470 + }, + { + "epoch": 0.5837820963084367, + "grad_norm": 84.90872955322266, + "learning_rate": 9.73005530371086e-05, + "loss": 2.7016, + "step": 7480 + }, + { + "epoch": 0.5845625536564427, + "grad_norm": 0.00025274348445236683, + "learning_rate": 9.728581611890004e-05, + "loss": 0.3299, + "step": 7490 + }, + { + "epoch": 0.5853430110044486, + "grad_norm": 0.36983373761177063, + "learning_rate": 9.727104020613954e-05, + "loss": 0.4919, + "step": 7500 + }, + { + "epoch": 0.5861234683524545, + "grad_norm": 0.40055277943611145, + "learning_rate": 9.725622531101211e-05, + "loss": 1.3109, + "step": 7510 + }, + { + "epoch": 0.5869039257004605, + "grad_norm": 14.323739051818848, + "learning_rate": 9.724137144573497e-05, + "loss": 0.492, + "step": 7520 + }, + { + "epoch": 0.5876843830484664, + "grad_norm": 52.50056076049805, + "learning_rate": 9.722647862255749e-05, + "loss": 1.8018, + "step": 7530 + }, + { + "epoch": 0.5884648403964723, + "grad_norm": 0.0008027786388993263, + "learning_rate": 9.721154685376109e-05, + "loss": 0.6453, + "step": 7540 + }, + { + "epoch": 0.5892452977444783, + "grad_norm": 14.831884384155273, + "learning_rate": 9.719657615165934e-05, + "loss": 0.8703, + "step": 7550 + }, + { + "epoch": 0.5900257550924842, + "grad_norm": 65.61073303222656, + "learning_rate": 9.718156652859795e-05, + "loss": 0.4422, + "step": 7560 + }, + { + "epoch": 0.5908062124404901, + "grad_norm": 47.878257751464844, + "learning_rate": 9.716651799695471e-05, + "loss": 0.406, + "step": 7570 + }, + { + "epoch": 0.5915866697884961, + "grad_norm": 0.26020410656929016, + "learning_rate": 9.715143056913947e-05, + "loss": 0.2718, + "step": 7580 + }, + { + "epoch": 0.592367127136502, + "grad_norm": 57.03702163696289, + "learning_rate": 9.713630425759419e-05, + "loss": 2.6524, + "step": 7590 + }, + { + "epoch": 0.5931475844845079, + "grad_norm": 0.009130135178565979, + "learning_rate": 9.712113907479285e-05, + "loss": 0.9718, + "step": 7600 + }, + { + "epoch": 0.5939280418325138, + "grad_norm": 0.03151361271739006, + "learning_rate": 9.710593503324155e-05, + "loss": 2.1971, + "step": 7610 + }, + { + "epoch": 0.5947084991805198, + "grad_norm": 2.31083083152771, + "learning_rate": 9.709069214547839e-05, + "loss": 0.1109, + "step": 7620 + }, + { + "epoch": 0.5954889565285257, + "grad_norm": 71.40426635742188, + "learning_rate": 9.70754104240735e-05, + "loss": 1.5467, + "step": 7630 + }, + { + "epoch": 0.5962694138765317, + "grad_norm": 20.671236038208008, + "learning_rate": 9.706008988162907e-05, + "loss": 2.6514, + "step": 7640 + }, + { + "epoch": 0.5970498712245376, + "grad_norm": 17.801488876342773, + "learning_rate": 9.704473053077928e-05, + "loss": 1.1676, + "step": 7650 + }, + { + "epoch": 0.5978303285725435, + "grad_norm": 0.09868277609348297, + "learning_rate": 9.702933238419029e-05, + "loss": 0.5239, + "step": 7660 + }, + { + "epoch": 0.5986107859205494, + "grad_norm": 0.02296440117061138, + "learning_rate": 9.70138954545603e-05, + "loss": 0.5087, + "step": 7670 + }, + { + "epoch": 0.5993912432685554, + "grad_norm": 0.036499544978141785, + "learning_rate": 9.69984197546195e-05, + "loss": 0.3802, + "step": 7680 + }, + { + "epoch": 0.6001717006165613, + "grad_norm": 9.838994026184082, + "learning_rate": 9.698290529712999e-05, + "loss": 0.9191, + "step": 7690 + }, + { + "epoch": 0.6009521579645672, + "grad_norm": 11.949810981750488, + "learning_rate": 9.696735209488588e-05, + "loss": 0.6987, + "step": 7700 + }, + { + "epoch": 0.6017326153125732, + "grad_norm": 6.8122071752441116e-06, + "learning_rate": 9.695176016071321e-05, + "loss": 1.0909, + "step": 7710 + }, + { + "epoch": 0.6025130726605791, + "grad_norm": 16.3994140625, + "learning_rate": 9.693612950746997e-05, + "loss": 0.7157, + "step": 7720 + }, + { + "epoch": 0.603293530008585, + "grad_norm": 0.040160659700632095, + "learning_rate": 9.69204601480461e-05, + "loss": 0.5506, + "step": 7730 + }, + { + "epoch": 0.604073987356591, + "grad_norm": 57.082244873046875, + "learning_rate": 9.690475209536341e-05, + "loss": 1.5083, + "step": 7740 + }, + { + "epoch": 0.6048544447045969, + "grad_norm": 0.025508442893624306, + "learning_rate": 9.688900536237566e-05, + "loss": 0.1772, + "step": 7750 + }, + { + "epoch": 0.6056349020526028, + "grad_norm": 9.786832379177213e-05, + "learning_rate": 9.687321996206849e-05, + "loss": 1.0359, + "step": 7760 + }, + { + "epoch": 0.6064153594006088, + "grad_norm": 28.551362991333008, + "learning_rate": 9.685739590745944e-05, + "loss": 1.2453, + "step": 7770 + }, + { + "epoch": 0.6071958167486147, + "grad_norm": 4.971829414367676, + "learning_rate": 9.68415332115979e-05, + "loss": 0.5187, + "step": 7780 + }, + { + "epoch": 0.6079762740966206, + "grad_norm": 62.52102279663086, + "learning_rate": 9.682563188756518e-05, + "loss": 2.0489, + "step": 7790 + }, + { + "epoch": 0.6087567314446265, + "grad_norm": 0.0007101665250957012, + "learning_rate": 9.680969194847436e-05, + "loss": 1.4431, + "step": 7800 + }, + { + "epoch": 0.6095371887926325, + "grad_norm": 0.03687829524278641, + "learning_rate": 9.679371340747045e-05, + "loss": 0.8274, + "step": 7810 + }, + { + "epoch": 0.6103176461406384, + "grad_norm": 0.7268645763397217, + "learning_rate": 9.677769627773024e-05, + "loss": 0.8634, + "step": 7820 + }, + { + "epoch": 0.6110981034886444, + "grad_norm": 7.965996265411377, + "learning_rate": 9.676164057246235e-05, + "loss": 0.4535, + "step": 7830 + }, + { + "epoch": 0.6118785608366503, + "grad_norm": 0.0013677034294232726, + "learning_rate": 9.674554630490726e-05, + "loss": 1.266, + "step": 7840 + }, + { + "epoch": 0.6126590181846562, + "grad_norm": 8.891855239868164, + "learning_rate": 9.672941348833717e-05, + "loss": 1.3937, + "step": 7850 + }, + { + "epoch": 0.6134394755326621, + "grad_norm": 0.060114502906799316, + "learning_rate": 9.671324213605614e-05, + "loss": 0.6587, + "step": 7860 + }, + { + "epoch": 0.6142199328806681, + "grad_norm": 5.528909683227539, + "learning_rate": 9.669703226139996e-05, + "loss": 0.6251, + "step": 7870 + }, + { + "epoch": 0.615000390228674, + "grad_norm": 21.050743103027344, + "learning_rate": 9.66807838777362e-05, + "loss": 0.3692, + "step": 7880 + }, + { + "epoch": 0.61578084757668, + "grad_norm": 5.410530548033421e-07, + "learning_rate": 9.666449699846423e-05, + "loss": 0.7565, + "step": 7890 + }, + { + "epoch": 0.6165613049246859, + "grad_norm": 0.0002509712358005345, + "learning_rate": 9.664817163701508e-05, + "loss": 0.656, + "step": 7900 + }, + { + "epoch": 0.6173417622726918, + "grad_norm": 0.029291460290551186, + "learning_rate": 9.663180780685162e-05, + "loss": 1.2556, + "step": 7910 + }, + { + "epoch": 0.6181222196206977, + "grad_norm": 9.711715698242188, + "learning_rate": 9.661540552146833e-05, + "loss": 1.4943, + "step": 7920 + }, + { + "epoch": 0.6189026769687037, + "grad_norm": 20.323610305786133, + "learning_rate": 9.65989647943915e-05, + "loss": 0.7127, + "step": 7930 + }, + { + "epoch": 0.6196831343167096, + "grad_norm": 4.817214488983154, + "learning_rate": 9.658248563917906e-05, + "loss": 1.8099, + "step": 7940 + }, + { + "epoch": 0.6204635916647155, + "grad_norm": 41.11168670654297, + "learning_rate": 9.656596806942068e-05, + "loss": 0.8875, + "step": 7950 + }, + { + "epoch": 0.6212440490127215, + "grad_norm": 56.30549621582031, + "learning_rate": 9.654941209873765e-05, + "loss": 1.2167, + "step": 7960 + }, + { + "epoch": 0.6220245063607274, + "grad_norm": 39.02580642700195, + "learning_rate": 9.653281774078297e-05, + "loss": 0.2388, + "step": 7970 + }, + { + "epoch": 0.6228049637087333, + "grad_norm": 1.3934792280197144, + "learning_rate": 9.651618500924127e-05, + "loss": 0.1799, + "step": 7980 + }, + { + "epoch": 0.6235854210567392, + "grad_norm": 66.73686218261719, + "learning_rate": 9.649951391782886e-05, + "loss": 1.2296, + "step": 7990 + }, + { + "epoch": 0.6243658784047452, + "grad_norm": 0.0006028424249961972, + "learning_rate": 9.648280448029365e-05, + "loss": 0.3993, + "step": 8000 + }, + { + "epoch": 0.6251463357527511, + "grad_norm": 0.25307711958885193, + "learning_rate": 9.64660567104152e-05, + "loss": 1.1959, + "step": 8010 + }, + { + "epoch": 0.625926793100757, + "grad_norm": 0.15425390005111694, + "learning_rate": 9.644927062200463e-05, + "loss": 2.079, + "step": 8020 + }, + { + "epoch": 0.626707250448763, + "grad_norm": 0.0008805795223452151, + "learning_rate": 9.643244622890475e-05, + "loss": 1.327, + "step": 8030 + }, + { + "epoch": 0.6274877077967689, + "grad_norm": 0.9369128942489624, + "learning_rate": 9.641558354498987e-05, + "loss": 0.8932, + "step": 8040 + }, + { + "epoch": 0.6282681651447748, + "grad_norm": 22.965044021606445, + "learning_rate": 9.63986825841659e-05, + "loss": 1.1273, + "step": 8050 + }, + { + "epoch": 0.6290486224927808, + "grad_norm": 9.68968391418457, + "learning_rate": 9.638174336037034e-05, + "loss": 0.0267, + "step": 8060 + }, + { + "epoch": 0.6298290798407867, + "grad_norm": 0.0005601391312666237, + "learning_rate": 9.636476588757224e-05, + "loss": 2.1434, + "step": 8070 + }, + { + "epoch": 0.6306095371887926, + "grad_norm": 43.502105712890625, + "learning_rate": 9.634775017977216e-05, + "loss": 0.606, + "step": 8080 + }, + { + "epoch": 0.6313899945367986, + "grad_norm": 0.001144526293501258, + "learning_rate": 9.633069625100224e-05, + "loss": 0.5548, + "step": 8090 + }, + { + "epoch": 0.6321704518848045, + "grad_norm": 6.474184036254883, + "learning_rate": 9.631360411532608e-05, + "loss": 0.5924, + "step": 8100 + }, + { + "epoch": 0.6329509092328104, + "grad_norm": 1.9110515117645264, + "learning_rate": 9.629647378683886e-05, + "loss": 0.7659, + "step": 8110 + }, + { + "epoch": 0.6337313665808164, + "grad_norm": 78.93585205078125, + "learning_rate": 9.627930527966718e-05, + "loss": 1.1818, + "step": 8120 + }, + { + "epoch": 0.6345118239288223, + "grad_norm": 37.94493103027344, + "learning_rate": 9.626209860796916e-05, + "loss": 2.0675, + "step": 8130 + }, + { + "epoch": 0.6352922812768282, + "grad_norm": 1.4972370862960815, + "learning_rate": 9.62448537859344e-05, + "loss": 0.356, + "step": 8140 + }, + { + "epoch": 0.6360727386248342, + "grad_norm": 5.993861675262451, + "learning_rate": 9.622757082778398e-05, + "loss": 0.7325, + "step": 8150 + }, + { + "epoch": 0.6368531959728401, + "grad_norm": 13.716777801513672, + "learning_rate": 9.621024974777036e-05, + "loss": 0.9638, + "step": 8160 + }, + { + "epoch": 0.637633653320846, + "grad_norm": 3.911118745803833, + "learning_rate": 9.619289056017751e-05, + "loss": 1.6913, + "step": 8170 + }, + { + "epoch": 0.6384141106688519, + "grad_norm": 62.19038772583008, + "learning_rate": 9.617549327932078e-05, + "loss": 1.3146, + "step": 8180 + }, + { + "epoch": 0.6391945680168579, + "grad_norm": 64.44358825683594, + "learning_rate": 9.615805791954695e-05, + "loss": 2.7173, + "step": 8190 + }, + { + "epoch": 0.6399750253648638, + "grad_norm": 60.38733673095703, + "learning_rate": 9.61405844952342e-05, + "loss": 2.2941, + "step": 8200 + }, + { + "epoch": 0.6407554827128698, + "grad_norm": 0.036649566143751144, + "learning_rate": 9.612307302079213e-05, + "loss": 0.439, + "step": 8210 + }, + { + "epoch": 0.6415359400608757, + "grad_norm": 72.93924713134766, + "learning_rate": 9.610552351066165e-05, + "loss": 0.5796, + "step": 8220 + }, + { + "epoch": 0.6423163974088816, + "grad_norm": 13.401516914367676, + "learning_rate": 9.60879359793151e-05, + "loss": 0.3281, + "step": 8230 + }, + { + "epoch": 0.6430968547568875, + "grad_norm": 14.01244068145752, + "learning_rate": 9.607031044125614e-05, + "loss": 0.5925, + "step": 8240 + }, + { + "epoch": 0.6438773121048935, + "grad_norm": 0.3519912362098694, + "learning_rate": 9.605264691101978e-05, + "loss": 1.3033, + "step": 8250 + }, + { + "epoch": 0.6446577694528994, + "grad_norm": 0.004155475180596113, + "learning_rate": 9.603494540317238e-05, + "loss": 0.6315, + "step": 8260 + }, + { + "epoch": 0.6454382268009053, + "grad_norm": 51.62873458862305, + "learning_rate": 9.601720593231158e-05, + "loss": 1.3946, + "step": 8270 + }, + { + "epoch": 0.6462186841489113, + "grad_norm": 50.64020919799805, + "learning_rate": 9.599942851306638e-05, + "loss": 0.8091, + "step": 8280 + }, + { + "epoch": 0.6469991414969172, + "grad_norm": 6.914702892303467, + "learning_rate": 9.598161316009701e-05, + "loss": 0.6304, + "step": 8290 + }, + { + "epoch": 0.6477795988449231, + "grad_norm": 7.955085277557373, + "learning_rate": 9.596375988809505e-05, + "loss": 0.5228, + "step": 8300 + }, + { + "epoch": 0.648560056192929, + "grad_norm": 18.709684371948242, + "learning_rate": 9.594586871178327e-05, + "loss": 0.9406, + "step": 8310 + }, + { + "epoch": 0.649340513540935, + "grad_norm": 0.03159976378083229, + "learning_rate": 9.592793964591578e-05, + "loss": 0.5873, + "step": 8320 + }, + { + "epoch": 0.6501209708889409, + "grad_norm": 62.471961975097656, + "learning_rate": 9.590997270527789e-05, + "loss": 0.6104, + "step": 8330 + }, + { + "epoch": 0.6509014282369469, + "grad_norm": 4.519401550292969, + "learning_rate": 9.589196790468615e-05, + "loss": 1.8798, + "step": 8340 + }, + { + "epoch": 0.6516818855849528, + "grad_norm": 39.14466857910156, + "learning_rate": 9.587392525898833e-05, + "loss": 0.7252, + "step": 8350 + }, + { + "epoch": 0.6524623429329587, + "grad_norm": 0.0004648494068533182, + "learning_rate": 9.585584478306342e-05, + "loss": 0.3632, + "step": 8360 + }, + { + "epoch": 0.6532428002809646, + "grad_norm": 17.922636032104492, + "learning_rate": 9.583772649182159e-05, + "loss": 1.0404, + "step": 8370 + }, + { + "epoch": 0.6540232576289706, + "grad_norm": 52.733394622802734, + "learning_rate": 9.581957040020424e-05, + "loss": 0.4739, + "step": 8380 + }, + { + "epoch": 0.6548037149769765, + "grad_norm": 66.99308776855469, + "learning_rate": 9.580137652318386e-05, + "loss": 1.7214, + "step": 8390 + }, + { + "epoch": 0.6555841723249825, + "grad_norm": 2.9420151710510254, + "learning_rate": 9.578314487576418e-05, + "loss": 0.3692, + "step": 8400 + }, + { + "epoch": 0.6563646296729884, + "grad_norm": 30.528118133544922, + "learning_rate": 9.576487547298003e-05, + "loss": 0.6922, + "step": 8410 + }, + { + "epoch": 0.6571450870209943, + "grad_norm": 0.8822042346000671, + "learning_rate": 9.574656832989739e-05, + "loss": 1.9788, + "step": 8420 + }, + { + "epoch": 0.6579255443690002, + "grad_norm": 4.979971885681152, + "learning_rate": 9.572822346161338e-05, + "loss": 0.5478, + "step": 8430 + }, + { + "epoch": 0.6587060017170062, + "grad_norm": 35.818092346191406, + "learning_rate": 9.57098408832562e-05, + "loss": 1.3641, + "step": 8440 + }, + { + "epoch": 0.6594864590650121, + "grad_norm": 23.56801414489746, + "learning_rate": 9.569142060998514e-05, + "loss": 0.8958, + "step": 8450 + }, + { + "epoch": 0.660266916413018, + "grad_norm": 0.06824395060539246, + "learning_rate": 9.567296265699066e-05, + "loss": 1.4302, + "step": 8460 + }, + { + "epoch": 0.661047373761024, + "grad_norm": 0.00021735116024501622, + "learning_rate": 9.565446703949417e-05, + "loss": 2.3565, + "step": 8470 + }, + { + "epoch": 0.6618278311090299, + "grad_norm": 0.0019540295470505953, + "learning_rate": 9.563593377274821e-05, + "loss": 0.435, + "step": 8480 + }, + { + "epoch": 0.6626082884570358, + "grad_norm": 0.0047746263444423676, + "learning_rate": 9.561736287203638e-05, + "loss": 1.2002, + "step": 8490 + }, + { + "epoch": 0.6633887458050417, + "grad_norm": 57.810672760009766, + "learning_rate": 9.559875435267326e-05, + "loss": 1.0457, + "step": 8500 + }, + { + "epoch": 0.6641692031530477, + "grad_norm": 2.046326160430908, + "learning_rate": 9.558010823000451e-05, + "loss": 0.3933, + "step": 8510 + }, + { + "epoch": 0.6649496605010536, + "grad_norm": 31.771995544433594, + "learning_rate": 9.55614245194068e-05, + "loss": 1.1063, + "step": 8520 + }, + { + "epoch": 0.6657301178490596, + "grad_norm": 1.884420394897461, + "learning_rate": 9.554270323628771e-05, + "loss": 0.3216, + "step": 8530 + }, + { + "epoch": 0.6665105751970655, + "grad_norm": 0.9028748273849487, + "learning_rate": 9.55239443960859e-05, + "loss": 1.1951, + "step": 8540 + }, + { + "epoch": 0.6672910325450714, + "grad_norm": 2.9773038477287628e-05, + "learning_rate": 9.550514801427098e-05, + "loss": 0.948, + "step": 8550 + }, + { + "epoch": 0.6680714898930773, + "grad_norm": 11.40433120727539, + "learning_rate": 9.548631410634346e-05, + "loss": 1.0299, + "step": 8560 + }, + { + "epoch": 0.6688519472410833, + "grad_norm": 0.689365804195404, + "learning_rate": 9.54674426878349e-05, + "loss": 0.253, + "step": 8570 + }, + { + "epoch": 0.6696324045890892, + "grad_norm": 4.528523921966553, + "learning_rate": 9.544853377430771e-05, + "loss": 0.7915, + "step": 8580 + }, + { + "epoch": 0.6704128619370952, + "grad_norm": 2.6693929289223206e-09, + "learning_rate": 9.542958738135526e-05, + "loss": 0.3344, + "step": 8590 + }, + { + "epoch": 0.6711933192851011, + "grad_norm": 90.78608703613281, + "learning_rate": 9.541060352460178e-05, + "loss": 1.3186, + "step": 8600 + }, + { + "epoch": 0.671973776633107, + "grad_norm": 72.95679473876953, + "learning_rate": 9.539158221970246e-05, + "loss": 0.1651, + "step": 8610 + }, + { + "epoch": 0.6727542339811129, + "grad_norm": 0.004029855597764254, + "learning_rate": 9.537252348234334e-05, + "loss": 2.5257, + "step": 8620 + }, + { + "epoch": 0.6735346913291189, + "grad_norm": 126.72969055175781, + "learning_rate": 9.535342732824132e-05, + "loss": 3.0816, + "step": 8630 + }, + { + "epoch": 0.6743151486771248, + "grad_norm": 39.24493408203125, + "learning_rate": 9.533429377314416e-05, + "loss": 3.278, + "step": 8640 + }, + { + "epoch": 0.6750956060251307, + "grad_norm": 37.542564392089844, + "learning_rate": 9.53151228328305e-05, + "loss": 0.8647, + "step": 8650 + }, + { + "epoch": 0.6758760633731367, + "grad_norm": 14.442136764526367, + "learning_rate": 9.529591452310975e-05, + "loss": 0.4998, + "step": 8660 + }, + { + "epoch": 0.6766565207211426, + "grad_norm": 29.716121673583984, + "learning_rate": 9.527666885982216e-05, + "loss": 0.541, + "step": 8670 + }, + { + "epoch": 0.6774369780691485, + "grad_norm": 42.91162109375, + "learning_rate": 9.525738585883883e-05, + "loss": 0.5751, + "step": 8680 + }, + { + "epoch": 0.6782174354171544, + "grad_norm": 3.21102237701416, + "learning_rate": 9.523806553606156e-05, + "loss": 0.3043, + "step": 8690 + }, + { + "epoch": 0.6789978927651604, + "grad_norm": 52.90839385986328, + "learning_rate": 9.521870790742302e-05, + "loss": 0.685, + "step": 8700 + }, + { + "epoch": 0.6797783501131663, + "grad_norm": 0.0020777760073542595, + "learning_rate": 9.519931298888658e-05, + "loss": 0.5612, + "step": 8710 + }, + { + "epoch": 0.6805588074611723, + "grad_norm": 46.25379180908203, + "learning_rate": 9.51798807964464e-05, + "loss": 0.7983, + "step": 8720 + }, + { + "epoch": 0.6813392648091782, + "grad_norm": 0.19482280313968658, + "learning_rate": 9.516041134612734e-05, + "loss": 1.2481, + "step": 8730 + }, + { + "epoch": 0.6821197221571841, + "grad_norm": 0.0002395164337940514, + "learning_rate": 9.514090465398502e-05, + "loss": 1.4959, + "step": 8740 + }, + { + "epoch": 0.68290017950519, + "grad_norm": 0.16013354063034058, + "learning_rate": 9.512136073610575e-05, + "loss": 0.5128, + "step": 8750 + }, + { + "epoch": 0.683680636853196, + "grad_norm": 17.525259017944336, + "learning_rate": 9.510177960860658e-05, + "loss": 0.5179, + "step": 8760 + }, + { + "epoch": 0.6844610942012019, + "grad_norm": 0.0016455745790153742, + "learning_rate": 9.508216128763518e-05, + "loss": 0.4293, + "step": 8770 + }, + { + "epoch": 0.6852415515492079, + "grad_norm": 2.1851837635040283, + "learning_rate": 9.506250578936993e-05, + "loss": 1.6695, + "step": 8780 + }, + { + "epoch": 0.6860220088972138, + "grad_norm": 38.016029357910156, + "learning_rate": 9.504281313001986e-05, + "loss": 0.9156, + "step": 8790 + }, + { + "epoch": 0.6868024662452197, + "grad_norm": 6.92643404006958, + "learning_rate": 9.502308332582466e-05, + "loss": 0.1688, + "step": 8800 + }, + { + "epoch": 0.6875829235932256, + "grad_norm": 862.4967651367188, + "learning_rate": 9.500331639305462e-05, + "loss": 4.6745, + "step": 8810 + }, + { + "epoch": 0.6883633809412315, + "grad_norm": 0.0006037608836777508, + "learning_rate": 9.49835123480107e-05, + "loss": 0.7949, + "step": 8820 + }, + { + "epoch": 0.6891438382892375, + "grad_norm": 0.00010248275066260248, + "learning_rate": 9.49636712070244e-05, + "loss": 0.0901, + "step": 8830 + }, + { + "epoch": 0.6899242956372434, + "grad_norm": 12.445732116699219, + "learning_rate": 9.494379298645789e-05, + "loss": 0.4997, + "step": 8840 + }, + { + "epoch": 0.6907047529852494, + "grad_norm": 6.305129528045654, + "learning_rate": 9.492387770270381e-05, + "loss": 0.3734, + "step": 8850 + }, + { + "epoch": 0.6914852103332553, + "grad_norm": 34.12310028076172, + "learning_rate": 9.490392537218546e-05, + "loss": 1.5739, + "step": 8860 + }, + { + "epoch": 0.6922656676812612, + "grad_norm": 4.1644459997769445e-05, + "learning_rate": 9.488393601135666e-05, + "loss": 0.5979, + "step": 8870 + }, + { + "epoch": 0.6930461250292671, + "grad_norm": 48.17484664916992, + "learning_rate": 9.486390963670175e-05, + "loss": 1.7442, + "step": 8880 + }, + { + "epoch": 0.6938265823772731, + "grad_norm": 3.2540643957190696e-08, + "learning_rate": 9.484384626473564e-05, + "loss": 1.6239, + "step": 8890 + }, + { + "epoch": 0.694607039725279, + "grad_norm": 0.01425440888851881, + "learning_rate": 9.48237459120037e-05, + "loss": 0.6439, + "step": 8900 + }, + { + "epoch": 0.695387497073285, + "grad_norm": 2.2680177688598633, + "learning_rate": 9.480360859508178e-05, + "loss": 0.7415, + "step": 8910 + }, + { + "epoch": 0.6961679544212909, + "grad_norm": 5.447899341583252, + "learning_rate": 9.478343433057631e-05, + "loss": 0.9192, + "step": 8920 + }, + { + "epoch": 0.6969484117692968, + "grad_norm": 6.804657936096191, + "learning_rate": 9.476322313512408e-05, + "loss": 1.3799, + "step": 8930 + }, + { + "epoch": 0.6977288691173027, + "grad_norm": 53.744571685791016, + "learning_rate": 9.47429750253924e-05, + "loss": 1.0447, + "step": 8940 + }, + { + "epoch": 0.6985093264653087, + "grad_norm": 20.53790855407715, + "learning_rate": 9.4722690018079e-05, + "loss": 1.3624, + "step": 8950 + }, + { + "epoch": 0.6992897838133146, + "grad_norm": 7.079352144501172e-07, + "learning_rate": 9.470236812991205e-05, + "loss": 1.7281, + "step": 8960 + }, + { + "epoch": 0.7000702411613206, + "grad_norm": 1.2337582111358643, + "learning_rate": 9.468200937765013e-05, + "loss": 1.4859, + "step": 8970 + }, + { + "epoch": 0.7008506985093265, + "grad_norm": 59.161590576171875, + "learning_rate": 9.466161377808219e-05, + "loss": 1.9196, + "step": 8980 + }, + { + "epoch": 0.7016311558573324, + "grad_norm": 50.03242111206055, + "learning_rate": 9.464118134802762e-05, + "loss": 0.3488, + "step": 8990 + }, + { + "epoch": 0.7024116132053383, + "grad_norm": 7.526350964326411e-05, + "learning_rate": 9.462071210433612e-05, + "loss": 0.4794, + "step": 9000 + }, + { + "epoch": 0.7031920705533442, + "grad_norm": 50.92277526855469, + "learning_rate": 9.460020606388782e-05, + "loss": 1.2395, + "step": 9010 + }, + { + "epoch": 0.7039725279013502, + "grad_norm": 0.0001332208194071427, + "learning_rate": 9.457966324359315e-05, + "loss": 0.4707, + "step": 9020 + }, + { + "epoch": 0.7047529852493561, + "grad_norm": 0.033170610666275024, + "learning_rate": 9.455908366039288e-05, + "loss": 0.4093, + "step": 9030 + }, + { + "epoch": 0.7055334425973621, + "grad_norm": 5.431732461147476e-06, + "learning_rate": 9.453846733125806e-05, + "loss": 0.8617, + "step": 9040 + }, + { + "epoch": 0.706313899945368, + "grad_norm": 0.00780220702290535, + "learning_rate": 9.451781427319012e-05, + "loss": 1.345, + "step": 9050 + }, + { + "epoch": 0.7070943572933739, + "grad_norm": 51.573158264160156, + "learning_rate": 9.449712450322072e-05, + "loss": 1.286, + "step": 9060 + }, + { + "epoch": 0.7078748146413798, + "grad_norm": 34.31037521362305, + "learning_rate": 9.447639803841182e-05, + "loss": 2.3959, + "step": 9070 + }, + { + "epoch": 0.7086552719893858, + "grad_norm": 65.48462677001953, + "learning_rate": 9.445563489585563e-05, + "loss": 0.5199, + "step": 9080 + }, + { + "epoch": 0.7094357293373917, + "grad_norm": 51.40140914916992, + "learning_rate": 9.443483509267459e-05, + "loss": 0.7834, + "step": 9090 + }, + { + "epoch": 0.7102161866853977, + "grad_norm": 12.657033920288086, + "learning_rate": 9.441399864602143e-05, + "loss": 0.4621, + "step": 9100 + }, + { + "epoch": 0.7109966440334036, + "grad_norm": 0.18859697878360748, + "learning_rate": 9.439312557307902e-05, + "loss": 0.4602, + "step": 9110 + }, + { + "epoch": 0.7117771013814095, + "grad_norm": 1.1662402153015137, + "learning_rate": 9.437221589106049e-05, + "loss": 0.9147, + "step": 9120 + }, + { + "epoch": 0.7125575587294154, + "grad_norm": 38.23827362060547, + "learning_rate": 9.435126961720915e-05, + "loss": 1.2606, + "step": 9130 + }, + { + "epoch": 0.7133380160774214, + "grad_norm": 0.7538944482803345, + "learning_rate": 9.433028676879847e-05, + "loss": 1.0351, + "step": 9140 + }, + { + "epoch": 0.7141184734254273, + "grad_norm": 0.0005075185908935964, + "learning_rate": 9.430926736313209e-05, + "loss": 0.8804, + "step": 9150 + }, + { + "epoch": 0.7148989307734332, + "grad_norm": 21.777206420898438, + "learning_rate": 9.428821141754381e-05, + "loss": 1.0078, + "step": 9160 + }, + { + "epoch": 0.7156793881214392, + "grad_norm": 50.828529357910156, + "learning_rate": 9.426711894939753e-05, + "loss": 0.3999, + "step": 9170 + }, + { + "epoch": 0.7164598454694451, + "grad_norm": 1.3094151020050049, + "learning_rate": 9.424598997608732e-05, + "loss": 0.6567, + "step": 9180 + }, + { + "epoch": 0.717240302817451, + "grad_norm": 0.7577186822891235, + "learning_rate": 9.422482451503728e-05, + "loss": 0.4967, + "step": 9190 + }, + { + "epoch": 0.7180207601654569, + "grad_norm": 9.299863101830397e-09, + "learning_rate": 9.42036225837017e-05, + "loss": 0.0471, + "step": 9200 + }, + { + "epoch": 0.7188012175134629, + "grad_norm": 2.130914802250805e-13, + "learning_rate": 9.418238419956484e-05, + "loss": 1.7301, + "step": 9210 + }, + { + "epoch": 0.7195816748614688, + "grad_norm": 26.992446899414062, + "learning_rate": 9.416110938014109e-05, + "loss": 0.1198, + "step": 9220 + }, + { + "epoch": 0.7203621322094748, + "grad_norm": 5.468270683195442e-05, + "learning_rate": 9.413979814297485e-05, + "loss": 0.9217, + "step": 9230 + }, + { + "epoch": 0.7211425895574807, + "grad_norm": 0.8918013572692871, + "learning_rate": 9.41184505056406e-05, + "loss": 1.265, + "step": 9240 + }, + { + "epoch": 0.7219230469054866, + "grad_norm": 83.64592742919922, + "learning_rate": 9.409706648574278e-05, + "loss": 2.5767, + "step": 9250 + }, + { + "epoch": 0.7227035042534925, + "grad_norm": 1.9623314142227173, + "learning_rate": 9.407564610091587e-05, + "loss": 1.7349, + "step": 9260 + }, + { + "epoch": 0.7234839616014985, + "grad_norm": 39.234317779541016, + "learning_rate": 9.405418936882434e-05, + "loss": 0.592, + "step": 9270 + }, + { + "epoch": 0.7242644189495044, + "grad_norm": 65.8040542602539, + "learning_rate": 9.403269630716259e-05, + "loss": 1.2316, + "step": 9280 + }, + { + "epoch": 0.7250448762975104, + "grad_norm": 39.89567565917969, + "learning_rate": 9.401116693365504e-05, + "loss": 2.1814, + "step": 9290 + }, + { + "epoch": 0.7258253336455163, + "grad_norm": 2.613337993621826, + "learning_rate": 9.3989601266056e-05, + "loss": 0.2446, + "step": 9300 + }, + { + "epoch": 0.7266057909935222, + "grad_norm": 44.03250503540039, + "learning_rate": 9.396799932214977e-05, + "loss": 0.8227, + "step": 9310 + }, + { + "epoch": 0.7273862483415281, + "grad_norm": 16.39767837524414, + "learning_rate": 9.39463611197505e-05, + "loss": 1.2959, + "step": 9320 + }, + { + "epoch": 0.7281667056895341, + "grad_norm": 59.34518814086914, + "learning_rate": 9.392468667670229e-05, + "loss": 2.1178, + "step": 9330 + }, + { + "epoch": 0.72894716303754, + "grad_norm": 4.03518533706665, + "learning_rate": 9.39029760108791e-05, + "loss": 0.777, + "step": 9340 + }, + { + "epoch": 0.729727620385546, + "grad_norm": 0.06964946538209915, + "learning_rate": 9.388122914018478e-05, + "loss": 1.0955, + "step": 9350 + }, + { + "epoch": 0.7305080777335519, + "grad_norm": 10.531116485595703, + "learning_rate": 9.385944608255303e-05, + "loss": 0.3112, + "step": 9360 + }, + { + "epoch": 0.7312885350815578, + "grad_norm": 53.286312103271484, + "learning_rate": 9.383762685594737e-05, + "loss": 1.2833, + "step": 9370 + }, + { + "epoch": 0.7320689924295637, + "grad_norm": 0.09685280174016953, + "learning_rate": 9.381577147836118e-05, + "loss": 0.569, + "step": 9380 + }, + { + "epoch": 0.7328494497775696, + "grad_norm": 1.3586193858827755e-07, + "learning_rate": 9.379387996781764e-05, + "loss": 1.0987, + "step": 9390 + }, + { + "epoch": 0.7336299071255756, + "grad_norm": 1.8557615280151367, + "learning_rate": 9.377195234236969e-05, + "loss": 2.1667, + "step": 9400 + }, + { + "epoch": 0.7344103644735815, + "grad_norm": 54.7618522644043, + "learning_rate": 9.374998862010013e-05, + "loss": 1.502, + "step": 9410 + }, + { + "epoch": 0.7351908218215875, + "grad_norm": 67.63577270507812, + "learning_rate": 9.372798881912148e-05, + "loss": 1.4237, + "step": 9420 + }, + { + "epoch": 0.7359712791695934, + "grad_norm": 17.463558197021484, + "learning_rate": 9.370595295757598e-05, + "loss": 0.3832, + "step": 9430 + }, + { + "epoch": 0.7367517365175993, + "grad_norm": 66.19950866699219, + "learning_rate": 9.36838810536357e-05, + "loss": 1.7184, + "step": 9440 + }, + { + "epoch": 0.7375321938656052, + "grad_norm": 0.3658444285392761, + "learning_rate": 9.366177312550232e-05, + "loss": 1.0973, + "step": 9450 + }, + { + "epoch": 0.7383126512136112, + "grad_norm": 0.022006291896104813, + "learning_rate": 9.363962919140734e-05, + "loss": 1.0796, + "step": 9460 + }, + { + "epoch": 0.7390931085616171, + "grad_norm": 0.00027838730602525175, + "learning_rate": 9.361744926961185e-05, + "loss": 1.9034, + "step": 9470 + }, + { + "epoch": 0.739873565909623, + "grad_norm": 0.3998589813709259, + "learning_rate": 9.35952333784067e-05, + "loss": 1.3968, + "step": 9480 + }, + { + "epoch": 0.740654023257629, + "grad_norm": 32.08036804199219, + "learning_rate": 9.357298153611236e-05, + "loss": 1.4303, + "step": 9490 + }, + { + "epoch": 0.7414344806056349, + "grad_norm": 5.425138473510742, + "learning_rate": 9.355069376107892e-05, + "loss": 0.1034, + "step": 9500 + }, + { + "epoch": 0.7422149379536408, + "grad_norm": 30.735441207885742, + "learning_rate": 9.352837007168618e-05, + "loss": 1.1738, + "step": 9510 + }, + { + "epoch": 0.7429953953016467, + "grad_norm": 11.689058303833008, + "learning_rate": 9.35060104863435e-05, + "loss": 1.0133, + "step": 9520 + }, + { + "epoch": 0.7437758526496527, + "grad_norm": 12.790924072265625, + "learning_rate": 9.348361502348986e-05, + "loss": 1.4594, + "step": 9530 + }, + { + "epoch": 0.7445563099976586, + "grad_norm": 32.837467193603516, + "learning_rate": 9.34611837015938e-05, + "loss": 0.4372, + "step": 9540 + }, + { + "epoch": 0.7453367673456646, + "grad_norm": 47.088661193847656, + "learning_rate": 9.343871653915349e-05, + "loss": 1.886, + "step": 9550 + }, + { + "epoch": 0.7461172246936705, + "grad_norm": 3.3653592254268005e-05, + "learning_rate": 9.341621355469659e-05, + "loss": 0.078, + "step": 9560 + }, + { + "epoch": 0.7468976820416764, + "grad_norm": 4.299693584442139, + "learning_rate": 9.339367476678034e-05, + "loss": 0.6906, + "step": 9570 + }, + { + "epoch": 0.7476781393896823, + "grad_norm": 18.840805053710938, + "learning_rate": 9.337110019399149e-05, + "loss": 1.133, + "step": 9580 + }, + { + "epoch": 0.7484585967376883, + "grad_norm": 8.864206392900087e-06, + "learning_rate": 9.334848985494632e-05, + "loss": 1.3721, + "step": 9590 + }, + { + "epoch": 0.7492390540856942, + "grad_norm": 0.5536551475524902, + "learning_rate": 9.33258437682906e-05, + "loss": 0.7809, + "step": 9600 + }, + { + "epoch": 0.7500195114337002, + "grad_norm": 5.465764115797356e-05, + "learning_rate": 9.330316195269953e-05, + "loss": 0.2602, + "step": 9610 + }, + { + "epoch": 0.7507999687817061, + "grad_norm": 50.65366744995117, + "learning_rate": 9.328044442687784e-05, + "loss": 0.8001, + "step": 9620 + }, + { + "epoch": 0.751580426129712, + "grad_norm": 54.035240173339844, + "learning_rate": 9.32576912095597e-05, + "loss": 1.3913, + "step": 9630 + }, + { + "epoch": 0.7523608834777179, + "grad_norm": 0.7376708984375, + "learning_rate": 9.323490231950867e-05, + "loss": 0.4636, + "step": 9640 + }, + { + "epoch": 0.7531413408257239, + "grad_norm": 36.873104095458984, + "learning_rate": 9.321207777551776e-05, + "loss": 1.2622, + "step": 9650 + }, + { + "epoch": 0.7539217981737298, + "grad_norm": 1.9300039639347233e-05, + "learning_rate": 9.318921759640939e-05, + "loss": 1.3158, + "step": 9660 + }, + { + "epoch": 0.7547022555217358, + "grad_norm": 13.232084274291992, + "learning_rate": 9.316632180103535e-05, + "loss": 0.2827, + "step": 9670 + }, + { + "epoch": 0.7554827128697417, + "grad_norm": 2.7236337661743164, + "learning_rate": 9.314339040827679e-05, + "loss": 1.6498, + "step": 9680 + }, + { + "epoch": 0.7562631702177476, + "grad_norm": 9.229357965523377e-05, + "learning_rate": 9.312042343704425e-05, + "loss": 0.8143, + "step": 9690 + }, + { + "epoch": 0.7570436275657535, + "grad_norm": 48.95867156982422, + "learning_rate": 9.309742090627757e-05, + "loss": 2.1144, + "step": 9700 + }, + { + "epoch": 0.7578240849137594, + "grad_norm": 1.1643975973129272, + "learning_rate": 9.307438283494595e-05, + "loss": 0.7393, + "step": 9710 + }, + { + "epoch": 0.7586045422617654, + "grad_norm": 2.9845916287740692e-05, + "learning_rate": 9.305130924204788e-05, + "loss": 0.7176, + "step": 9720 + }, + { + "epoch": 0.7593849996097713, + "grad_norm": 1.4221373021428008e-05, + "learning_rate": 9.302820014661115e-05, + "loss": 0.6517, + "step": 9730 + }, + { + "epoch": 0.7601654569577773, + "grad_norm": 61.808475494384766, + "learning_rate": 9.300505556769282e-05, + "loss": 0.785, + "step": 9740 + }, + { + "epoch": 0.7609459143057832, + "grad_norm": 47.0339241027832, + "learning_rate": 9.29818755243792e-05, + "loss": 1.022, + "step": 9750 + }, + { + "epoch": 0.7617263716537891, + "grad_norm": 0.0022127856500446796, + "learning_rate": 9.295866003578589e-05, + "loss": 2.0017, + "step": 9760 + }, + { + "epoch": 0.762506829001795, + "grad_norm": 7.076489418977872e-05, + "learning_rate": 9.293540912105767e-05, + "loss": 0.3585, + "step": 9770 + }, + { + "epoch": 0.763287286349801, + "grad_norm": 40.70147705078125, + "learning_rate": 9.291212279936857e-05, + "loss": 0.6532, + "step": 9780 + }, + { + "epoch": 0.764067743697807, + "grad_norm": 2.6155385971069336, + "learning_rate": 9.28888010899218e-05, + "loss": 1.0556, + "step": 9790 + }, + { + "epoch": 0.7648482010458129, + "grad_norm": 0.6977835297584534, + "learning_rate": 9.286544401194975e-05, + "loss": 2.7733, + "step": 9800 + }, + { + "epoch": 0.7656286583938188, + "grad_norm": 3.6149048805236816, + "learning_rate": 9.2842051584714e-05, + "loss": 2.1253, + "step": 9810 + }, + { + "epoch": 0.7664091157418247, + "grad_norm": 51.35221481323242, + "learning_rate": 9.281862382750526e-05, + "loss": 1.6301, + "step": 9820 + }, + { + "epoch": 0.7671895730898306, + "grad_norm": 12.507816314697266, + "learning_rate": 9.279516075964336e-05, + "loss": 0.5273, + "step": 9830 + }, + { + "epoch": 0.7679700304378366, + "grad_norm": 9.683573722839355, + "learning_rate": 9.27716624004773e-05, + "loss": 0.8224, + "step": 9840 + }, + { + "epoch": 0.7687504877858425, + "grad_norm": 1.2265725135803223, + "learning_rate": 9.274812876938514e-05, + "loss": 1.0238, + "step": 9850 + }, + { + "epoch": 0.7695309451338485, + "grad_norm": 0.18335406482219696, + "learning_rate": 9.272455988577404e-05, + "loss": 1.4574, + "step": 9860 + }, + { + "epoch": 0.7703114024818544, + "grad_norm": 49.96540832519531, + "learning_rate": 9.270095576908022e-05, + "loss": 1.2212, + "step": 9870 + }, + { + "epoch": 0.7710918598298603, + "grad_norm": 0.36174869537353516, + "learning_rate": 9.267731643876898e-05, + "loss": 2.0641, + "step": 9880 + }, + { + "epoch": 0.7718723171778662, + "grad_norm": 51.78955078125, + "learning_rate": 9.265364191433466e-05, + "loss": 2.0582, + "step": 9890 + }, + { + "epoch": 0.7726527745258721, + "grad_norm": 1.56452631472348e-06, + "learning_rate": 9.262993221530057e-05, + "loss": 0.2814, + "step": 9900 + }, + { + "epoch": 0.7734332318738781, + "grad_norm": 0.06889957189559937, + "learning_rate": 9.260618736121908e-05, + "loss": 0.5443, + "step": 9910 + }, + { + "epoch": 0.774213689221884, + "grad_norm": 0.00021075314725749195, + "learning_rate": 9.258240737167157e-05, + "loss": 1.0861, + "step": 9920 + }, + { + "epoch": 0.77499414656989, + "grad_norm": 4.2871174812316895, + "learning_rate": 9.255859226626833e-05, + "loss": 2.9569, + "step": 9930 + }, + { + "epoch": 0.7757746039178959, + "grad_norm": 0.16662397980690002, + "learning_rate": 9.253474206464863e-05, + "loss": 0.9966, + "step": 9940 + }, + { + "epoch": 0.7765550612659018, + "grad_norm": 4.747070789337158, + "learning_rate": 9.251085678648072e-05, + "loss": 0.7685, + "step": 9950 + }, + { + "epoch": 0.7773355186139077, + "grad_norm": 9.950766752808704e-07, + "learning_rate": 9.248693645146171e-05, + "loss": 0.8595, + "step": 9960 + }, + { + "epoch": 0.7781159759619137, + "grad_norm": 3.301793098449707, + "learning_rate": 9.24629810793177e-05, + "loss": 0.7267, + "step": 9970 + }, + { + "epoch": 0.7788964333099196, + "grad_norm": 34.93284606933594, + "learning_rate": 9.243899068980363e-05, + "loss": 1.1627, + "step": 9980 + }, + { + "epoch": 0.7796768906579256, + "grad_norm": 14.096379280090332, + "learning_rate": 9.24149653027033e-05, + "loss": 1.5315, + "step": 9990 + }, + { + "epoch": 0.7804573480059315, + "grad_norm": 51.46378707885742, + "learning_rate": 9.239090493782945e-05, + "loss": 0.8666, + "step": 10000 + }, + { + "epoch": 0.7812378053539374, + "grad_norm": 0.4632790684700012, + "learning_rate": 9.236680961502357e-05, + "loss": 0.6835, + "step": 10010 + }, + { + "epoch": 0.7820182627019433, + "grad_norm": 1.5781743059051223e-05, + "learning_rate": 9.234267935415604e-05, + "loss": 0.7809, + "step": 10020 + }, + { + "epoch": 0.7827987200499492, + "grad_norm": 0.00013900962949264795, + "learning_rate": 9.231851417512604e-05, + "loss": 1.1447, + "step": 10030 + }, + { + "epoch": 0.7835791773979552, + "grad_norm": 10.146800994873047, + "learning_rate": 9.229431409786152e-05, + "loss": 0.5029, + "step": 10040 + }, + { + "epoch": 0.7843596347459612, + "grad_norm": 9.001547813415527, + "learning_rate": 9.227007914231925e-05, + "loss": 0.2138, + "step": 10050 + }, + { + "epoch": 0.7851400920939671, + "grad_norm": 9.599566459655762, + "learning_rate": 9.224580932848475e-05, + "loss": 1.292, + "step": 10060 + }, + { + "epoch": 0.785920549441973, + "grad_norm": 47.64023971557617, + "learning_rate": 9.222150467637224e-05, + "loss": 0.5444, + "step": 10070 + }, + { + "epoch": 0.7867010067899789, + "grad_norm": 30.91171646118164, + "learning_rate": 9.219716520602473e-05, + "loss": 1.3808, + "step": 10080 + }, + { + "epoch": 0.7874814641379848, + "grad_norm": 1.6111406087875366, + "learning_rate": 9.217279093751394e-05, + "loss": 0.1607, + "step": 10090 + }, + { + "epoch": 0.7882619214859908, + "grad_norm": 1.8544057607650757, + "learning_rate": 9.214838189094024e-05, + "loss": 0.9206, + "step": 10100 + }, + { + "epoch": 0.7890423788339967, + "grad_norm": 24.458349227905273, + "learning_rate": 9.212393808643271e-05, + "loss": 0.7696, + "step": 10110 + }, + { + "epoch": 0.7898228361820027, + "grad_norm": 46.76490020751953, + "learning_rate": 9.209945954414909e-05, + "loss": 0.4999, + "step": 10120 + }, + { + "epoch": 0.7906032935300086, + "grad_norm": 1.0910409688949585, + "learning_rate": 9.207494628427578e-05, + "loss": 1.4236, + "step": 10130 + }, + { + "epoch": 0.7913837508780145, + "grad_norm": 77.94813537597656, + "learning_rate": 9.205039832702779e-05, + "loss": 1.9332, + "step": 10140 + }, + { + "epoch": 0.7921642082260204, + "grad_norm": 0.00010009103425545618, + "learning_rate": 9.202581569264875e-05, + "loss": 0.7059, + "step": 10150 + }, + { + "epoch": 0.7929446655740264, + "grad_norm": 1.2641462087631226, + "learning_rate": 9.200119840141088e-05, + "loss": 0.6298, + "step": 10160 + }, + { + "epoch": 0.7937251229220323, + "grad_norm": 58.1982421875, + "learning_rate": 9.1976546473615e-05, + "loss": 1.4736, + "step": 10170 + }, + { + "epoch": 0.7945055802700383, + "grad_norm": 19.780921936035156, + "learning_rate": 9.195185992959048e-05, + "loss": 0.2553, + "step": 10180 + }, + { + "epoch": 0.7952860376180442, + "grad_norm": 9.129549980163574, + "learning_rate": 9.192713878969522e-05, + "loss": 0.4162, + "step": 10190 + }, + { + "epoch": 0.7960664949660501, + "grad_norm": 2.2736198902130127, + "learning_rate": 9.190238307431569e-05, + "loss": 1.5475, + "step": 10200 + }, + { + "epoch": 0.796846952314056, + "grad_norm": 12.91063117980957, + "learning_rate": 9.187759280386683e-05, + "loss": 1.3482, + "step": 10210 + }, + { + "epoch": 0.7976274096620619, + "grad_norm": 2.047335624694824, + "learning_rate": 9.185276799879211e-05, + "loss": 0.7524, + "step": 10220 + }, + { + "epoch": 0.7984078670100679, + "grad_norm": 85.41983795166016, + "learning_rate": 9.182790867956345e-05, + "loss": 1.066, + "step": 10230 + }, + { + "epoch": 0.7991883243580739, + "grad_norm": 27.353626251220703, + "learning_rate": 9.180301486668128e-05, + "loss": 2.0177, + "step": 10240 + }, + { + "epoch": 0.7999687817060798, + "grad_norm": 8.590062861912884e-06, + "learning_rate": 9.17780865806744e-05, + "loss": 0.3788, + "step": 10250 + }, + { + "epoch": 0.8007492390540857, + "grad_norm": 2.5400359630584717, + "learning_rate": 9.175312384210011e-05, + "loss": 1.0827, + "step": 10260 + }, + { + "epoch": 0.8015296964020916, + "grad_norm": 0.13091030716896057, + "learning_rate": 9.172812667154406e-05, + "loss": 0.4355, + "step": 10270 + }, + { + "epoch": 0.8023101537500975, + "grad_norm": 0.00023703681654296815, + "learning_rate": 9.170309508962038e-05, + "loss": 1.3295, + "step": 10280 + }, + { + "epoch": 0.8030906110981035, + "grad_norm": 4.602359294891357, + "learning_rate": 9.167802911697147e-05, + "loss": 0.2444, + "step": 10290 + }, + { + "epoch": 0.8038710684461094, + "grad_norm": 0.378699392080307, + "learning_rate": 9.16529287742682e-05, + "loss": 1.6013, + "step": 10300 + }, + { + "epoch": 0.8046515257941154, + "grad_norm": 0.025837572291493416, + "learning_rate": 9.162779408220968e-05, + "loss": 1.3339, + "step": 10310 + }, + { + "epoch": 0.8054319831421213, + "grad_norm": 5.020955562591553, + "learning_rate": 9.160262506152342e-05, + "loss": 1.3104, + "step": 10320 + }, + { + "epoch": 0.8062124404901272, + "grad_norm": 24.9432430267334, + "learning_rate": 9.157742173296521e-05, + "loss": 0.456, + "step": 10330 + }, + { + "epoch": 0.8069928978381331, + "grad_norm": 57.66835403442383, + "learning_rate": 9.155218411731912e-05, + "loss": 0.9609, + "step": 10340 + }, + { + "epoch": 0.8077733551861391, + "grad_norm": 58.548484802246094, + "learning_rate": 9.152691223539755e-05, + "loss": 0.6119, + "step": 10350 + }, + { + "epoch": 0.808553812534145, + "grad_norm": 76.95970153808594, + "learning_rate": 9.150160610804108e-05, + "loss": 0.6974, + "step": 10360 + }, + { + "epoch": 0.809334269882151, + "grad_norm": 4.002541231784562e-07, + "learning_rate": 9.147626575611861e-05, + "loss": 1.2283, + "step": 10370 + }, + { + "epoch": 0.8101147272301569, + "grad_norm": 63.17544174194336, + "learning_rate": 9.145089120052718e-05, + "loss": 1.2178, + "step": 10380 + }, + { + "epoch": 0.8108951845781628, + "grad_norm": 0.06589673459529877, + "learning_rate": 9.142548246219212e-05, + "loss": 1.199, + "step": 10390 + }, + { + "epoch": 0.8116756419261687, + "grad_norm": 0.006386119872331619, + "learning_rate": 9.140003956206688e-05, + "loss": 1.1264, + "step": 10400 + }, + { + "epoch": 0.8124560992741746, + "grad_norm": 0.06595534831285477, + "learning_rate": 9.137456252113312e-05, + "loss": 0.262, + "step": 10410 + }, + { + "epoch": 0.8132365566221806, + "grad_norm": 3.372828960418701, + "learning_rate": 9.134905136040064e-05, + "loss": 2.7002, + "step": 10420 + }, + { + "epoch": 0.8140170139701866, + "grad_norm": 0.0010774118127301335, + "learning_rate": 9.13235061009074e-05, + "loss": 0.9205, + "step": 10430 + }, + { + "epoch": 0.8147974713181925, + "grad_norm": 23.298646926879883, + "learning_rate": 9.129792676371947e-05, + "loss": 1.9437, + "step": 10440 + }, + { + "epoch": 0.8155779286661984, + "grad_norm": 0.0358101949095726, + "learning_rate": 9.127231336993099e-05, + "loss": 1.1757, + "step": 10450 + }, + { + "epoch": 0.8163583860142043, + "grad_norm": 34.35308837890625, + "learning_rate": 9.12466659406642e-05, + "loss": 0.9185, + "step": 10460 + }, + { + "epoch": 0.8171388433622102, + "grad_norm": 0.12688755989074707, + "learning_rate": 9.122098449706944e-05, + "loss": 0.7325, + "step": 10470 + }, + { + "epoch": 0.8179193007102162, + "grad_norm": 0.5014522671699524, + "learning_rate": 9.119526906032507e-05, + "loss": 0.816, + "step": 10480 + }, + { + "epoch": 0.8186997580582221, + "grad_norm": 49.12326431274414, + "learning_rate": 9.11695196516375e-05, + "loss": 0.6054, + "step": 10490 + }, + { + "epoch": 0.8194802154062281, + "grad_norm": 9.400172233581543, + "learning_rate": 9.114373629224113e-05, + "loss": 0.7559, + "step": 10500 + }, + { + "epoch": 0.820260672754234, + "grad_norm": 65.43171691894531, + "learning_rate": 9.11179190033984e-05, + "loss": 1.0216, + "step": 10510 + }, + { + "epoch": 0.8210411301022399, + "grad_norm": 75.4448013305664, + "learning_rate": 9.109206780639966e-05, + "loss": 1.2247, + "step": 10520 + }, + { + "epoch": 0.8218215874502458, + "grad_norm": 0.2027098685503006, + "learning_rate": 9.106618272256329e-05, + "loss": 0.5028, + "step": 10530 + }, + { + "epoch": 0.8226020447982518, + "grad_norm": 2.619052565933089e-07, + "learning_rate": 9.10402637732356e-05, + "loss": 0.3584, + "step": 10540 + }, + { + "epoch": 0.8233825021462577, + "grad_norm": 7.298355579376221, + "learning_rate": 9.10143109797908e-05, + "loss": 1.472, + "step": 10550 + }, + { + "epoch": 0.8241629594942637, + "grad_norm": 82.12861633300781, + "learning_rate": 9.098832436363102e-05, + "loss": 2.129, + "step": 10560 + }, + { + "epoch": 0.8249434168422696, + "grad_norm": 0.17317530512809753, + "learning_rate": 9.096230394618634e-05, + "loss": 2.7479, + "step": 10570 + }, + { + "epoch": 0.8257238741902755, + "grad_norm": 1.2783308420694084e-06, + "learning_rate": 9.093624974891461e-05, + "loss": 0.4849, + "step": 10580 + }, + { + "epoch": 0.8265043315382814, + "grad_norm": 6.741588731529191e-05, + "learning_rate": 9.091016179330161e-05, + "loss": 0.814, + "step": 10590 + }, + { + "epoch": 0.8272847888862873, + "grad_norm": 0.0009110376122407615, + "learning_rate": 9.088404010086095e-05, + "loss": 0.918, + "step": 10600 + }, + { + "epoch": 0.8280652462342933, + "grad_norm": 6.268826382438419e-06, + "learning_rate": 9.085788469313403e-05, + "loss": 0.3706, + "step": 10610 + }, + { + "epoch": 0.8288457035822993, + "grad_norm": 47.73321533203125, + "learning_rate": 9.08316955916901e-05, + "loss": 2.3103, + "step": 10620 + }, + { + "epoch": 0.8296261609303052, + "grad_norm": 72.35128784179688, + "learning_rate": 9.080547281812613e-05, + "loss": 1.4624, + "step": 10630 + }, + { + "epoch": 0.8304066182783111, + "grad_norm": 9.136421203613281, + "learning_rate": 9.077921639406695e-05, + "loss": 0.1298, + "step": 10640 + }, + { + "epoch": 0.831187075626317, + "grad_norm": 7.849686145782471, + "learning_rate": 9.075292634116507e-05, + "loss": 1.55, + "step": 10650 + }, + { + "epoch": 0.8319675329743229, + "grad_norm": 41.03010940551758, + "learning_rate": 9.072660268110074e-05, + "loss": 0.6723, + "step": 10660 + }, + { + "epoch": 0.8327479903223289, + "grad_norm": 32.15593338012695, + "learning_rate": 9.070024543558193e-05, + "loss": 0.5337, + "step": 10670 + }, + { + "epoch": 0.8335284476703348, + "grad_norm": 0.2335835099220276, + "learning_rate": 9.067385462634433e-05, + "loss": 2.3867, + "step": 10680 + }, + { + "epoch": 0.8343089050183408, + "grad_norm": 70.02042388916016, + "learning_rate": 9.064743027515128e-05, + "loss": 1.8435, + "step": 10690 + }, + { + "epoch": 0.8350893623663467, + "grad_norm": 0.010791340842843056, + "learning_rate": 9.062097240379381e-05, + "loss": 1.3253, + "step": 10700 + }, + { + "epoch": 0.8358698197143526, + "grad_norm": 42.142948150634766, + "learning_rate": 9.059448103409054e-05, + "loss": 1.3379, + "step": 10710 + }, + { + "epoch": 0.8366502770623585, + "grad_norm": 18.970975875854492, + "learning_rate": 9.056795618788778e-05, + "loss": 0.5949, + "step": 10720 + }, + { + "epoch": 0.8374307344103644, + "grad_norm": 31.398378372192383, + "learning_rate": 9.05413978870594e-05, + "loss": 0.8256, + "step": 10730 + }, + { + "epoch": 0.8382111917583704, + "grad_norm": 38.225547790527344, + "learning_rate": 9.051480615350687e-05, + "loss": 0.4197, + "step": 10740 + }, + { + "epoch": 0.8389916491063764, + "grad_norm": 0.0025365985929965973, + "learning_rate": 9.048818100915923e-05, + "loss": 1.5254, + "step": 10750 + }, + { + "epoch": 0.8397721064543823, + "grad_norm": 12.11093521118164, + "learning_rate": 9.046152247597309e-05, + "loss": 1.0872, + "step": 10760 + }, + { + "epoch": 0.8405525638023882, + "grad_norm": 43.06413650512695, + "learning_rate": 9.043483057593258e-05, + "loss": 1.2137, + "step": 10770 + }, + { + "epoch": 0.8413330211503941, + "grad_norm": 19.772510528564453, + "learning_rate": 9.040810533104934e-05, + "loss": 0.8811, + "step": 10780 + }, + { + "epoch": 0.8421134784984, + "grad_norm": 0.31982916593551636, + "learning_rate": 9.038134676336252e-05, + "loss": 1.9339, + "step": 10790 + }, + { + "epoch": 0.842893935846406, + "grad_norm": 0.11078077554702759, + "learning_rate": 9.035455489493874e-05, + "loss": 0.6942, + "step": 10800 + }, + { + "epoch": 0.843674393194412, + "grad_norm": 21.829547882080078, + "learning_rate": 9.032772974787207e-05, + "loss": 0.423, + "step": 10810 + }, + { + "epoch": 0.8444548505424179, + "grad_norm": 26.680130004882812, + "learning_rate": 9.030087134428408e-05, + "loss": 0.7315, + "step": 10820 + }, + { + "epoch": 0.8452353078904238, + "grad_norm": 0.0005266700172796845, + "learning_rate": 9.02739797063237e-05, + "loss": 0.2991, + "step": 10830 + }, + { + "epoch": 0.8460157652384297, + "grad_norm": 0.07193426787853241, + "learning_rate": 9.024705485616729e-05, + "loss": 1.2723, + "step": 10840 + }, + { + "epoch": 0.8467962225864356, + "grad_norm": 10.340862274169922, + "learning_rate": 9.02200968160186e-05, + "loss": 0.6524, + "step": 10850 + }, + { + "epoch": 0.8475766799344416, + "grad_norm": 0.5865875482559204, + "learning_rate": 9.019310560810876e-05, + "loss": 0.9401, + "step": 10860 + }, + { + "epoch": 0.8483571372824475, + "grad_norm": 3.370558988535777e-05, + "learning_rate": 9.016608125469624e-05, + "loss": 0.7333, + "step": 10870 + }, + { + "epoch": 0.8491375946304535, + "grad_norm": 52.608585357666016, + "learning_rate": 9.013902377806685e-05, + "loss": 2.4574, + "step": 10880 + }, + { + "epoch": 0.8499180519784594, + "grad_norm": 3.0554566383361816, + "learning_rate": 9.01119332005337e-05, + "loss": 0.7603, + "step": 10890 + }, + { + "epoch": 0.8506985093264653, + "grad_norm": 50.016151428222656, + "learning_rate": 9.008480954443721e-05, + "loss": 1.0146, + "step": 10900 + }, + { + "epoch": 0.8514789666744712, + "grad_norm": 5.849006652832031, + "learning_rate": 9.00576528321451e-05, + "loss": 0.3672, + "step": 10910 + }, + { + "epoch": 0.8522594240224771, + "grad_norm": 15.98238468170166, + "learning_rate": 9.00304630860523e-05, + "loss": 0.5704, + "step": 10920 + }, + { + "epoch": 0.8530398813704831, + "grad_norm": 0.1195988729596138, + "learning_rate": 9.000324032858102e-05, + "loss": 0.204, + "step": 10930 + }, + { + "epoch": 0.8538203387184891, + "grad_norm": 1.65741475939285e-05, + "learning_rate": 8.997598458218068e-05, + "loss": 1.2947, + "step": 10940 + }, + { + "epoch": 0.854600796066495, + "grad_norm": 0.12614373862743378, + "learning_rate": 8.994869586932793e-05, + "loss": 0.9826, + "step": 10950 + }, + { + "epoch": 0.8553812534145009, + "grad_norm": 1.6010971069335938, + "learning_rate": 8.992137421252653e-05, + "loss": 0.7233, + "step": 10960 + }, + { + "epoch": 0.8561617107625068, + "grad_norm": 1.2691033361988957e-06, + "learning_rate": 8.98940196343075e-05, + "loss": 1.8345, + "step": 10970 + }, + { + "epoch": 0.8569421681105127, + "grad_norm": 2.088942050933838, + "learning_rate": 8.986663215722896e-05, + "loss": 2.192, + "step": 10980 + }, + { + "epoch": 0.8577226254585187, + "grad_norm": 0.39988765120506287, + "learning_rate": 8.983921180387617e-05, + "loss": 0.6454, + "step": 10990 + }, + { + "epoch": 0.8585030828065247, + "grad_norm": 2.1378994408127738e-10, + "learning_rate": 8.98117585968615e-05, + "loss": 0.2324, + "step": 11000 + }, + { + "epoch": 0.8592835401545306, + "grad_norm": 0.02639651857316494, + "learning_rate": 8.978427255882441e-05, + "loss": 3.9428, + "step": 11010 + }, + { + "epoch": 0.8600639975025365, + "grad_norm": 19.26889419555664, + "learning_rate": 8.975675371243141e-05, + "loss": 0.6036, + "step": 11020 + }, + { + "epoch": 0.8608444548505424, + "grad_norm": 66.81867980957031, + "learning_rate": 8.972920208037616e-05, + "loss": 2.2432, + "step": 11030 + }, + { + "epoch": 0.8616249121985483, + "grad_norm": 0.0035200975835323334, + "learning_rate": 8.970161768537923e-05, + "loss": 1.3233, + "step": 11040 + }, + { + "epoch": 0.8624053695465543, + "grad_norm": 28.795499801635742, + "learning_rate": 8.967400055018831e-05, + "loss": 0.9746, + "step": 11050 + }, + { + "epoch": 0.8631858268945602, + "grad_norm": 0.11977910250425339, + "learning_rate": 8.964635069757802e-05, + "loss": 0.6801, + "step": 11060 + }, + { + "epoch": 0.8639662842425662, + "grad_norm": 0.9692222476005554, + "learning_rate": 8.961866815035001e-05, + "loss": 0.9246, + "step": 11070 + }, + { + "epoch": 0.8647467415905721, + "grad_norm": 1.2255003452301025, + "learning_rate": 8.959095293133283e-05, + "loss": 2.5522, + "step": 11080 + }, + { + "epoch": 0.865527198938578, + "grad_norm": 2.081770896911621, + "learning_rate": 8.956320506338206e-05, + "loss": 0.4645, + "step": 11090 + }, + { + "epoch": 0.8663076562865839, + "grad_norm": 56.65788269042969, + "learning_rate": 8.953542456938013e-05, + "loss": 0.7179, + "step": 11100 + }, + { + "epoch": 0.8670881136345898, + "grad_norm": 43.48613739013672, + "learning_rate": 8.950761147223642e-05, + "loss": 1.8452, + "step": 11110 + }, + { + "epoch": 0.8678685709825958, + "grad_norm": 22.29058074951172, + "learning_rate": 8.947976579488717e-05, + "loss": 0.928, + "step": 11120 + }, + { + "epoch": 0.8686490283306018, + "grad_norm": 0.007680551148951054, + "learning_rate": 8.94518875602955e-05, + "loss": 2.3963, + "step": 11130 + }, + { + "epoch": 0.8694294856786077, + "grad_norm": 0.2508794069290161, + "learning_rate": 8.942397679145135e-05, + "loss": 0.2604, + "step": 11140 + }, + { + "epoch": 0.8702099430266136, + "grad_norm": 0.05226520821452141, + "learning_rate": 8.939603351137156e-05, + "loss": 0.2393, + "step": 11150 + }, + { + "epoch": 0.8709904003746195, + "grad_norm": 45.50146484375, + "learning_rate": 8.93680577430997e-05, + "loss": 0.6496, + "step": 11160 + }, + { + "epoch": 0.8717708577226254, + "grad_norm": 13.582279205322266, + "learning_rate": 8.93400495097062e-05, + "loss": 0.5958, + "step": 11170 + }, + { + "epoch": 0.8725513150706314, + "grad_norm": 57.39128112792969, + "learning_rate": 8.93120088342882e-05, + "loss": 0.4043, + "step": 11180 + }, + { + "epoch": 0.8733317724186374, + "grad_norm": 0.5639352798461914, + "learning_rate": 8.928393573996963e-05, + "loss": 1.905, + "step": 11190 + }, + { + "epoch": 0.8741122297666433, + "grad_norm": 2.6245787143707275, + "learning_rate": 8.925583024990114e-05, + "loss": 1.2335, + "step": 11200 + }, + { + "epoch": 0.8748926871146492, + "grad_norm": 3.0179084205883555e-05, + "learning_rate": 8.922769238726013e-05, + "loss": 1.4729, + "step": 11210 + }, + { + "epoch": 0.8756731444626551, + "grad_norm": 9.200638487527613e-06, + "learning_rate": 8.919952217525065e-05, + "loss": 0.2296, + "step": 11220 + }, + { + "epoch": 0.876453601810661, + "grad_norm": 32.401824951171875, + "learning_rate": 8.917131963710344e-05, + "loss": 1.731, + "step": 11230 + }, + { + "epoch": 0.8772340591586669, + "grad_norm": 0.001450688112527132, + "learning_rate": 8.914308479607592e-05, + "loss": 0.5757, + "step": 11240 + }, + { + "epoch": 0.878014516506673, + "grad_norm": 22.435937881469727, + "learning_rate": 8.911481767545211e-05, + "loss": 2.6246, + "step": 11250 + }, + { + "epoch": 0.8787949738546789, + "grad_norm": 30.424789428710938, + "learning_rate": 8.908651829854271e-05, + "loss": 0.7685, + "step": 11260 + }, + { + "epoch": 0.8795754312026848, + "grad_norm": 2.2438132762908936, + "learning_rate": 8.905818668868494e-05, + "loss": 0.4759, + "step": 11270 + }, + { + "epoch": 0.8803558885506907, + "grad_norm": 1.3357861042022705, + "learning_rate": 8.902982286924268e-05, + "loss": 0.5434, + "step": 11280 + }, + { + "epoch": 0.8811363458986966, + "grad_norm": 47.41687774658203, + "learning_rate": 8.900142686360631e-05, + "loss": 1.4807, + "step": 11290 + }, + { + "epoch": 0.8819168032467025, + "grad_norm": 37.302310943603516, + "learning_rate": 8.89729986951928e-05, + "loss": 1.3951, + "step": 11300 + }, + { + "epoch": 0.8826972605947085, + "grad_norm": 34.153472900390625, + "learning_rate": 8.894453838744561e-05, + "loss": 0.6078, + "step": 11310 + }, + { + "epoch": 0.8834777179427145, + "grad_norm": 34.647335052490234, + "learning_rate": 8.891604596383472e-05, + "loss": 1.1206, + "step": 11320 + }, + { + "epoch": 0.8842581752907204, + "grad_norm": 0.013813276775181293, + "learning_rate": 8.888752144785656e-05, + "loss": 0.4337, + "step": 11330 + }, + { + "epoch": 0.8850386326387263, + "grad_norm": 88.2090835571289, + "learning_rate": 8.885896486303411e-05, + "loss": 1.7786, + "step": 11340 + }, + { + "epoch": 0.8858190899867322, + "grad_norm": 1.2261537313461304, + "learning_rate": 8.88303762329167e-05, + "loss": 0.2787, + "step": 11350 + }, + { + "epoch": 0.8865995473347381, + "grad_norm": 85.28449249267578, + "learning_rate": 8.880175558108014e-05, + "loss": 2.0685, + "step": 11360 + }, + { + "epoch": 0.8873800046827441, + "grad_norm": 70.23323822021484, + "learning_rate": 8.877310293112663e-05, + "loss": 0.6905, + "step": 11370 + }, + { + "epoch": 0.88816046203075, + "grad_norm": 109.84261322021484, + "learning_rate": 8.87444183066848e-05, + "loss": 2.1192, + "step": 11380 + }, + { + "epoch": 0.888940919378756, + "grad_norm": 2.6113624572753906, + "learning_rate": 8.871570173140955e-05, + "loss": 0.9877, + "step": 11390 + }, + { + "epoch": 0.8897213767267619, + "grad_norm": 0.0027678406331688166, + "learning_rate": 8.868695322898222e-05, + "loss": 0.6999, + "step": 11400 + }, + { + "epoch": 0.8905018340747678, + "grad_norm": 27.829450607299805, + "learning_rate": 8.865817282311043e-05, + "loss": 2.1734, + "step": 11410 + }, + { + "epoch": 0.8912822914227737, + "grad_norm": 1.0777357816696167, + "learning_rate": 8.862936053752814e-05, + "loss": 1.1006, + "step": 11420 + }, + { + "epoch": 0.8920627487707796, + "grad_norm": 9.996204376220703, + "learning_rate": 8.86005163959956e-05, + "loss": 1.8153, + "step": 11430 + }, + { + "epoch": 0.8928432061187856, + "grad_norm": 11.059738159179688, + "learning_rate": 8.85716404222993e-05, + "loss": 1.3938, + "step": 11440 + }, + { + "epoch": 0.8936236634667916, + "grad_norm": 0.06884201616048813, + "learning_rate": 8.8542732640252e-05, + "loss": 0.395, + "step": 11450 + }, + { + "epoch": 0.8944041208147975, + "grad_norm": 0.38125911355018616, + "learning_rate": 8.851379307369267e-05, + "loss": 0.7953, + "step": 11460 + }, + { + "epoch": 0.8951845781628034, + "grad_norm": 7.784693717956543, + "learning_rate": 8.848482174648653e-05, + "loss": 1.3387, + "step": 11470 + }, + { + "epoch": 0.8959650355108093, + "grad_norm": 2.370662212371826, + "learning_rate": 8.845581868252498e-05, + "loss": 0.4055, + "step": 11480 + }, + { + "epoch": 0.8967454928588152, + "grad_norm": 0.028177833184599876, + "learning_rate": 8.842678390572557e-05, + "loss": 1.538, + "step": 11490 + }, + { + "epoch": 0.8975259502068212, + "grad_norm": 28.854820251464844, + "learning_rate": 8.839771744003199e-05, + "loss": 0.3277, + "step": 11500 + }, + { + "epoch": 0.8983064075548272, + "grad_norm": 6.505209445953369, + "learning_rate": 8.836861930941414e-05, + "loss": 1.0238, + "step": 11510 + }, + { + "epoch": 0.8990868649028331, + "grad_norm": 0.08821121603250504, + "learning_rate": 8.833948953786792e-05, + "loss": 0.9824, + "step": 11520 + }, + { + "epoch": 0.899867322250839, + "grad_norm": 0.0032478803768754005, + "learning_rate": 8.831032814941545e-05, + "loss": 1.7226, + "step": 11530 + }, + { + "epoch": 0.9006477795988449, + "grad_norm": 0.9303324818611145, + "learning_rate": 8.828113516810481e-05, + "loss": 0.5442, + "step": 11540 + }, + { + "epoch": 0.9014282369468508, + "grad_norm": 0.5977065563201904, + "learning_rate": 8.825191061801019e-05, + "loss": 1.8397, + "step": 11550 + }, + { + "epoch": 0.9022086942948568, + "grad_norm": 27.579898834228516, + "learning_rate": 8.822265452323182e-05, + "loss": 1.6211, + "step": 11560 + }, + { + "epoch": 0.9029891516428628, + "grad_norm": 0.21320217847824097, + "learning_rate": 8.81933669078959e-05, + "loss": 1.362, + "step": 11570 + }, + { + "epoch": 0.9037696089908687, + "grad_norm": 49.39560317993164, + "learning_rate": 8.816404779615465e-05, + "loss": 0.7101, + "step": 11580 + }, + { + "epoch": 0.9045500663388746, + "grad_norm": 44.66103744506836, + "learning_rate": 8.813469721218631e-05, + "loss": 2.1111, + "step": 11590 + }, + { + "epoch": 0.9053305236868805, + "grad_norm": 3.096705913543701, + "learning_rate": 8.810531518019496e-05, + "loss": 0.6784, + "step": 11600 + }, + { + "epoch": 0.9061109810348864, + "grad_norm": 0.00022902319324202836, + "learning_rate": 8.807590172441072e-05, + "loss": 0.8203, + "step": 11610 + }, + { + "epoch": 0.9068914383828923, + "grad_norm": 1.1045008897781372, + "learning_rate": 8.804645686908959e-05, + "loss": 0.9274, + "step": 11620 + }, + { + "epoch": 0.9076718957308983, + "grad_norm": 44.20703887939453, + "learning_rate": 8.801698063851345e-05, + "loss": 0.7383, + "step": 11630 + }, + { + "epoch": 0.9084523530789043, + "grad_norm": 0.0486653596162796, + "learning_rate": 8.798747305699005e-05, + "loss": 0.6478, + "step": 11640 + }, + { + "epoch": 0.9092328104269102, + "grad_norm": 36.32588195800781, + "learning_rate": 8.795793414885301e-05, + "loss": 0.4524, + "step": 11650 + }, + { + "epoch": 0.9100132677749161, + "grad_norm": 30.98180389404297, + "learning_rate": 8.79283639384618e-05, + "loss": 2.5093, + "step": 11660 + }, + { + "epoch": 0.910793725122922, + "grad_norm": 51.505775451660156, + "learning_rate": 8.789876245020168e-05, + "loss": 0.973, + "step": 11670 + }, + { + "epoch": 0.9115741824709279, + "grad_norm": 0.001640350092202425, + "learning_rate": 8.78691297084837e-05, + "loss": 0.1758, + "step": 11680 + }, + { + "epoch": 0.9123546398189339, + "grad_norm": 9.590207810106222e-06, + "learning_rate": 8.783946573774467e-05, + "loss": 0.3, + "step": 11690 + }, + { + "epoch": 0.9131350971669399, + "grad_norm": 39.28508758544922, + "learning_rate": 8.780977056244721e-05, + "loss": 0.8231, + "step": 11700 + }, + { + "epoch": 0.9139155545149458, + "grad_norm": 3.148670196533203, + "learning_rate": 8.778004420707961e-05, + "loss": 1.8962, + "step": 11710 + }, + { + "epoch": 0.9146960118629517, + "grad_norm": 40.00190353393555, + "learning_rate": 8.775028669615593e-05, + "loss": 0.4237, + "step": 11720 + }, + { + "epoch": 0.9154764692109576, + "grad_norm": 0.02514764852821827, + "learning_rate": 8.772049805421586e-05, + "loss": 0.8909, + "step": 11730 + }, + { + "epoch": 0.9162569265589635, + "grad_norm": 25.32301139831543, + "learning_rate": 8.769067830582482e-05, + "loss": 0.9899, + "step": 11740 + }, + { + "epoch": 0.9170373839069695, + "grad_norm": 0.029089761897921562, + "learning_rate": 8.766082747557383e-05, + "loss": 1.1534, + "step": 11750 + }, + { + "epoch": 0.9178178412549755, + "grad_norm": 0.00031210810993798077, + "learning_rate": 8.763094558807961e-05, + "loss": 1.7147, + "step": 11760 + }, + { + "epoch": 0.9185982986029814, + "grad_norm": 37.92123031616211, + "learning_rate": 8.76010326679844e-05, + "loss": 0.4765, + "step": 11770 + }, + { + "epoch": 0.9193787559509873, + "grad_norm": 2.677894599401043e-07, + "learning_rate": 8.757108873995612e-05, + "loss": 0.4737, + "step": 11780 + }, + { + "epoch": 0.9201592132989932, + "grad_norm": 42.612613677978516, + "learning_rate": 8.75411138286882e-05, + "loss": 1.2752, + "step": 11790 + }, + { + "epoch": 0.9209396706469991, + "grad_norm": 3.9161059856414795, + "learning_rate": 8.751110795889966e-05, + "loss": 2.0453, + "step": 11800 + }, + { + "epoch": 0.921720127995005, + "grad_norm": 21.108699798583984, + "learning_rate": 8.748107115533501e-05, + "loss": 1.2515, + "step": 11810 + }, + { + "epoch": 0.922500585343011, + "grad_norm": 39.70624923706055, + "learning_rate": 8.745100344276433e-05, + "loss": 2.1655, + "step": 11820 + }, + { + "epoch": 0.923281042691017, + "grad_norm": 12.807904243469238, + "learning_rate": 8.742090484598312e-05, + "loss": 0.3978, + "step": 11830 + }, + { + "epoch": 0.9240615000390229, + "grad_norm": 0.06400003284215927, + "learning_rate": 8.739077538981239e-05, + "loss": 0.7546, + "step": 11840 + }, + { + "epoch": 0.9248419573870288, + "grad_norm": 42.75697708129883, + "learning_rate": 8.73606150990986e-05, + "loss": 1.3483, + "step": 11850 + }, + { + "epoch": 0.9256224147350347, + "grad_norm": 39.99211120605469, + "learning_rate": 8.733042399871361e-05, + "loss": 0.7464, + "step": 11860 + }, + { + "epoch": 0.9264028720830406, + "grad_norm": 23.466768264770508, + "learning_rate": 8.730020211355471e-05, + "loss": 0.9, + "step": 11870 + }, + { + "epoch": 0.9271833294310466, + "grad_norm": 25.129772186279297, + "learning_rate": 8.72699494685446e-05, + "loss": 0.8635, + "step": 11880 + }, + { + "epoch": 0.9279637867790526, + "grad_norm": 10.731846809387207, + "learning_rate": 8.723966608863128e-05, + "loss": 0.0681, + "step": 11890 + }, + { + "epoch": 0.9287442441270585, + "grad_norm": 0.04492070898413658, + "learning_rate": 8.720935199878815e-05, + "loss": 0.916, + "step": 11900 + }, + { + "epoch": 0.9295247014750644, + "grad_norm": 7.944001900739295e-09, + "learning_rate": 8.717900722401391e-05, + "loss": 0.739, + "step": 11910 + }, + { + "epoch": 0.9303051588230703, + "grad_norm": 3.294824273325503e-05, + "learning_rate": 8.714863178933257e-05, + "loss": 1.0964, + "step": 11920 + }, + { + "epoch": 0.9310856161710762, + "grad_norm": 0.0023006428964436054, + "learning_rate": 8.711822571979346e-05, + "loss": 0.5517, + "step": 11930 + }, + { + "epoch": 0.9318660735190821, + "grad_norm": 0.00011115364759461954, + "learning_rate": 8.70877890404711e-05, + "loss": 1.5457, + "step": 11940 + }, + { + "epoch": 0.9326465308670882, + "grad_norm": 0.007222015410661697, + "learning_rate": 8.705732177646531e-05, + "loss": 0.4518, + "step": 11950 + }, + { + "epoch": 0.9334269882150941, + "grad_norm": 14.632031440734863, + "learning_rate": 8.702682395290114e-05, + "loss": 1.1325, + "step": 11960 + }, + { + "epoch": 0.9342074455631, + "grad_norm": 0.00015641542267985642, + "learning_rate": 8.699629559492876e-05, + "loss": 1.5673, + "step": 11970 + }, + { + "epoch": 0.9349879029111059, + "grad_norm": 2.1813745498657227, + "learning_rate": 8.696573672772363e-05, + "loss": 1.3276, + "step": 11980 + }, + { + "epoch": 0.9357683602591118, + "grad_norm": 0.20304657518863678, + "learning_rate": 8.693514737648628e-05, + "loss": 0.8412, + "step": 11990 + }, + { + "epoch": 0.9365488176071177, + "grad_norm": 0.0002769280690699816, + "learning_rate": 8.690452756644243e-05, + "loss": 0.4273, + "step": 12000 + }, + { + "epoch": 0.9373292749551237, + "grad_norm": 180.66477966308594, + "learning_rate": 8.687387732284291e-05, + "loss": 0.5784, + "step": 12010 + }, + { + "epoch": 0.9381097323031297, + "grad_norm": 143.87356567382812, + "learning_rate": 8.684319667096364e-05, + "loss": 1.2544, + "step": 12020 + }, + { + "epoch": 0.9388901896511356, + "grad_norm": 94.39666748046875, + "learning_rate": 8.681248563610558e-05, + "loss": 0.521, + "step": 12030 + }, + { + "epoch": 0.9396706469991415, + "grad_norm": 33.32435607910156, + "learning_rate": 8.678174424359485e-05, + "loss": 0.5367, + "step": 12040 + }, + { + "epoch": 0.9404511043471474, + "grad_norm": 0.4227607846260071, + "learning_rate": 8.675097251878246e-05, + "loss": 1.8416, + "step": 12050 + }, + { + "epoch": 0.9412315616951533, + "grad_norm": 10.019822120666504, + "learning_rate": 8.672017048704458e-05, + "loss": 2.5491, + "step": 12060 + }, + { + "epoch": 0.9420120190431593, + "grad_norm": 26.867095947265625, + "learning_rate": 8.668933817378224e-05, + "loss": 0.6875, + "step": 12070 + }, + { + "epoch": 0.9427924763911653, + "grad_norm": 10.189391136169434, + "learning_rate": 8.665847560442156e-05, + "loss": 0.8775, + "step": 12080 + }, + { + "epoch": 0.9435729337391712, + "grad_norm": 3.7130205631256104, + "learning_rate": 8.662758280441352e-05, + "loss": 0.6623, + "step": 12090 + }, + { + "epoch": 0.9443533910871771, + "grad_norm": 25.05450439453125, + "learning_rate": 8.65966597992341e-05, + "loss": 2.4015, + "step": 12100 + }, + { + "epoch": 0.945133848435183, + "grad_norm": 5.740344590776658e-07, + "learning_rate": 8.65657066143841e-05, + "loss": 0.2352, + "step": 12110 + }, + { + "epoch": 0.9459143057831889, + "grad_norm": 1.194147216665442e-06, + "learning_rate": 8.653472327538932e-05, + "loss": 0.9961, + "step": 12120 + }, + { + "epoch": 0.9466947631311948, + "grad_norm": 23.505138397216797, + "learning_rate": 8.650370980780035e-05, + "loss": 1.3468, + "step": 12130 + }, + { + "epoch": 0.9474752204792009, + "grad_norm": 0.6704900860786438, + "learning_rate": 8.647266623719263e-05, + "loss": 0.5786, + "step": 12140 + }, + { + "epoch": 0.9482556778272068, + "grad_norm": 32.911720275878906, + "learning_rate": 8.644159258916645e-05, + "loss": 1.064, + "step": 12150 + }, + { + "epoch": 0.9490361351752127, + "grad_norm": 127.1581039428711, + "learning_rate": 8.641048888934691e-05, + "loss": 2.8751, + "step": 12160 + }, + { + "epoch": 0.9498165925232186, + "grad_norm": 38.89763259887695, + "learning_rate": 8.637935516338384e-05, + "loss": 2.585, + "step": 12170 + }, + { + "epoch": 0.9505970498712245, + "grad_norm": 21.23341941833496, + "learning_rate": 8.63481914369519e-05, + "loss": 0.6433, + "step": 12180 + }, + { + "epoch": 0.9513775072192304, + "grad_norm": 380.9919738769531, + "learning_rate": 8.631699773575044e-05, + "loss": 1.8712, + "step": 12190 + }, + { + "epoch": 0.9521579645672364, + "grad_norm": 1.6460078954696655, + "learning_rate": 8.628577408550353e-05, + "loss": 1.1621, + "step": 12200 + }, + { + "epoch": 0.9529384219152424, + "grad_norm": 0.005625886842608452, + "learning_rate": 8.625452051196001e-05, + "loss": 1.7173, + "step": 12210 + }, + { + "epoch": 0.9537188792632483, + "grad_norm": 0.43974214792251587, + "learning_rate": 8.622323704089328e-05, + "loss": 1.4362, + "step": 12220 + }, + { + "epoch": 0.9544993366112542, + "grad_norm": 0.0861930325627327, + "learning_rate": 8.619192369810149e-05, + "loss": 0.2172, + "step": 12230 + }, + { + "epoch": 0.9552797939592601, + "grad_norm": 109.56983184814453, + "learning_rate": 8.616058050940739e-05, + "loss": 0.5874, + "step": 12240 + }, + { + "epoch": 0.956060251307266, + "grad_norm": 31.9949951171875, + "learning_rate": 8.612920750065831e-05, + "loss": 1.9491, + "step": 12250 + }, + { + "epoch": 0.956840708655272, + "grad_norm": 1.3033921718597412, + "learning_rate": 8.609780469772623e-05, + "loss": 0.7603, + "step": 12260 + }, + { + "epoch": 0.957621166003278, + "grad_norm": 175.17483520507812, + "learning_rate": 8.606637212650767e-05, + "loss": 1.6961, + "step": 12270 + }, + { + "epoch": 0.9584016233512839, + "grad_norm": 88.24026489257812, + "learning_rate": 8.603490981292369e-05, + "loss": 3.0626, + "step": 12280 + }, + { + "epoch": 0.9591820806992898, + "grad_norm": 6.234077453613281, + "learning_rate": 8.600341778291991e-05, + "loss": 1.012, + "step": 12290 + }, + { + "epoch": 0.9599625380472957, + "grad_norm": 1.844632625579834, + "learning_rate": 8.597189606246641e-05, + "loss": 0.6959, + "step": 12300 + }, + { + "epoch": 0.9607429953953016, + "grad_norm": 0.0016822753241285682, + "learning_rate": 8.594034467755781e-05, + "loss": 1.416, + "step": 12310 + }, + { + "epoch": 0.9615234527433075, + "grad_norm": 1.9487806558609009, + "learning_rate": 8.590876365421313e-05, + "loss": 2.5349, + "step": 12320 + }, + { + "epoch": 0.9623039100913136, + "grad_norm": 56.0797233581543, + "learning_rate": 8.587715301847588e-05, + "loss": 1.0408, + "step": 12330 + }, + { + "epoch": 0.9630843674393195, + "grad_norm": 0.0024899712298065424, + "learning_rate": 8.584551279641398e-05, + "loss": 0.5106, + "step": 12340 + }, + { + "epoch": 0.9638648247873254, + "grad_norm": 60.09833526611328, + "learning_rate": 8.58138430141197e-05, + "loss": 1.2098, + "step": 12350 + }, + { + "epoch": 0.9646452821353313, + "grad_norm": 88.48211669921875, + "learning_rate": 8.57821436977098e-05, + "loss": 0.7373, + "step": 12360 + }, + { + "epoch": 0.9654257394833372, + "grad_norm": 0.0008333827718161047, + "learning_rate": 8.575041487332527e-05, + "loss": 1.7268, + "step": 12370 + }, + { + "epoch": 0.9662061968313431, + "grad_norm": 63.89524841308594, + "learning_rate": 8.571865656713152e-05, + "loss": 0.7619, + "step": 12380 + }, + { + "epoch": 0.9669866541793491, + "grad_norm": 67.05152130126953, + "learning_rate": 8.568686880531822e-05, + "loss": 1.5792, + "step": 12390 + }, + { + "epoch": 0.9677671115273551, + "grad_norm": 11.015143394470215, + "learning_rate": 8.565505161409937e-05, + "loss": 0.577, + "step": 12400 + }, + { + "epoch": 0.968547568875361, + "grad_norm": 0.0003411987272556871, + "learning_rate": 8.562320501971319e-05, + "loss": 0.6672, + "step": 12410 + }, + { + "epoch": 0.9693280262233669, + "grad_norm": 0.3806394040584564, + "learning_rate": 8.559132904842222e-05, + "loss": 0.5932, + "step": 12420 + }, + { + "epoch": 0.9701084835713728, + "grad_norm": 18.847597122192383, + "learning_rate": 8.555942372651316e-05, + "loss": 1.0309, + "step": 12430 + }, + { + "epoch": 0.9708889409193787, + "grad_norm": 27.726774215698242, + "learning_rate": 8.552748908029693e-05, + "loss": 1.5833, + "step": 12440 + }, + { + "epoch": 0.9716693982673846, + "grad_norm": 15.595227241516113, + "learning_rate": 8.549552513610865e-05, + "loss": 1.102, + "step": 12450 + }, + { + "epoch": 0.9724498556153907, + "grad_norm": 2.3900837898254395, + "learning_rate": 8.546353192030762e-05, + "loss": 2.0294, + "step": 12460 + }, + { + "epoch": 0.9732303129633966, + "grad_norm": 7.092124938964844, + "learning_rate": 8.543150945927722e-05, + "loss": 0.7207, + "step": 12470 + }, + { + "epoch": 0.9740107703114025, + "grad_norm": 61.77547073364258, + "learning_rate": 8.539945777942498e-05, + "loss": 2.3424, + "step": 12480 + }, + { + "epoch": 0.9747912276594084, + "grad_norm": 7.770050615363289e-06, + "learning_rate": 8.536737690718252e-05, + "loss": 1.0175, + "step": 12490 + }, + { + "epoch": 0.9755716850074143, + "grad_norm": 63.13569641113281, + "learning_rate": 8.533526686900555e-05, + "loss": 1.5255, + "step": 12500 + }, + { + "epoch": 0.9763521423554202, + "grad_norm": 0.00011756386084016412, + "learning_rate": 8.530312769137382e-05, + "loss": 1.0027, + "step": 12510 + }, + { + "epoch": 0.9771325997034263, + "grad_norm": 89.27020263671875, + "learning_rate": 8.52709594007911e-05, + "loss": 0.8815, + "step": 12520 + }, + { + "epoch": 0.9779130570514322, + "grad_norm": 2.415660858154297, + "learning_rate": 8.523876202378518e-05, + "loss": 1.2294, + "step": 12530 + }, + { + "epoch": 0.9786935143994381, + "grad_norm": 0.9319227337837219, + "learning_rate": 8.520653558690786e-05, + "loss": 0.534, + "step": 12540 + }, + { + "epoch": 0.979473971747444, + "grad_norm": 59.17282485961914, + "learning_rate": 8.517428011673482e-05, + "loss": 1.1959, + "step": 12550 + }, + { + "epoch": 0.9802544290954499, + "grad_norm": 6.27915096282959, + "learning_rate": 8.514199563986578e-05, + "loss": 0.914, + "step": 12560 + }, + { + "epoch": 0.9810348864434558, + "grad_norm": 32.939640045166016, + "learning_rate": 8.510968218292434e-05, + "loss": 0.7607, + "step": 12570 + }, + { + "epoch": 0.9818153437914618, + "grad_norm": 8.092778205871582, + "learning_rate": 8.5077339772558e-05, + "loss": 1.4045, + "step": 12580 + }, + { + "epoch": 0.9825958011394678, + "grad_norm": 30.04991912841797, + "learning_rate": 8.504496843543813e-05, + "loss": 0.8721, + "step": 12590 + }, + { + "epoch": 0.9833762584874737, + "grad_norm": 15.851937294006348, + "learning_rate": 8.501256819825996e-05, + "loss": 0.5861, + "step": 12600 + }, + { + "epoch": 0.9841567158354796, + "grad_norm": 0.01121127512305975, + "learning_rate": 8.498013908774256e-05, + "loss": 0.9448, + "step": 12610 + }, + { + "epoch": 0.9849371731834855, + "grad_norm": 1.1823562383651733, + "learning_rate": 8.494768113062879e-05, + "loss": 0.5183, + "step": 12620 + }, + { + "epoch": 0.9857176305314914, + "grad_norm": 0.07073729485273361, + "learning_rate": 8.491519435368534e-05, + "loss": 0.1142, + "step": 12630 + }, + { + "epoch": 0.9864980878794973, + "grad_norm": 78.96869659423828, + "learning_rate": 8.48826787837026e-05, + "loss": 1.7843, + "step": 12640 + }, + { + "epoch": 0.9872785452275034, + "grad_norm": 86.60768127441406, + "learning_rate": 8.485013444749479e-05, + "loss": 2.746, + "step": 12650 + }, + { + "epoch": 0.9880590025755093, + "grad_norm": 7.178143277997151e-06, + "learning_rate": 8.481756137189977e-05, + "loss": 1.0557, + "step": 12660 + }, + { + "epoch": 0.9888394599235152, + "grad_norm": 47.43173599243164, + "learning_rate": 8.478495958377914e-05, + "loss": 1.7503, + "step": 12670 + }, + { + "epoch": 0.9896199172715211, + "grad_norm": 40.4387092590332, + "learning_rate": 8.47523291100182e-05, + "loss": 0.6436, + "step": 12680 + }, + { + "epoch": 0.990400374619527, + "grad_norm": 11.480759620666504, + "learning_rate": 8.471966997752585e-05, + "loss": 0.2876, + "step": 12690 + }, + { + "epoch": 0.9911808319675329, + "grad_norm": 0.0001139993328251876, + "learning_rate": 8.468698221323468e-05, + "loss": 0.2114, + "step": 12700 + }, + { + "epoch": 0.991961289315539, + "grad_norm": 76.51677703857422, + "learning_rate": 8.465426584410084e-05, + "loss": 1.865, + "step": 12710 + }, + { + "epoch": 0.9927417466635449, + "grad_norm": 3.665128231048584, + "learning_rate": 8.462152089710408e-05, + "loss": 1.9389, + "step": 12720 + }, + { + "epoch": 0.9935222040115508, + "grad_norm": 50.49843978881836, + "learning_rate": 8.458874739924778e-05, + "loss": 2.4169, + "step": 12730 + }, + { + "epoch": 0.9943026613595567, + "grad_norm": 15.426827430725098, + "learning_rate": 8.455594537755878e-05, + "loss": 0.2876, + "step": 12740 + }, + { + "epoch": 0.9950831187075626, + "grad_norm": 8.526009764864284e-07, + "learning_rate": 8.452311485908751e-05, + "loss": 0.5408, + "step": 12750 + }, + { + "epoch": 0.9958635760555685, + "grad_norm": 25.179977416992188, + "learning_rate": 8.449025587090782e-05, + "loss": 1.8653, + "step": 12760 + }, + { + "epoch": 0.9966440334035745, + "grad_norm": 55.514373779296875, + "learning_rate": 8.445736844011713e-05, + "loss": 0.8341, + "step": 12770 + }, + { + "epoch": 0.9974244907515805, + "grad_norm": 3.265545606613159, + "learning_rate": 8.442445259383625e-05, + "loss": 1.9346, + "step": 12780 + }, + { + "epoch": 0.9982049480995864, + "grad_norm": 11.585598945617676, + "learning_rate": 8.439150835920944e-05, + "loss": 1.2786, + "step": 12790 + }, + { + "epoch": 0.9989854054475923, + "grad_norm": 0.5894767642021179, + "learning_rate": 8.435853576340438e-05, + "loss": 0.0325, + "step": 12800 + }, + { + "epoch": 0.9997658627955982, + "grad_norm": 0.6876865029335022, + "learning_rate": 8.432553483361213e-05, + "loss": 0.4318, + "step": 12810 + }, + { + "epoch": 1.0005463201436042, + "grad_norm": 9.892061643768102e-05, + "learning_rate": 8.429250559704714e-05, + "loss": 1.2832, + "step": 12820 + }, + { + "epoch": 1.0013267774916101, + "grad_norm": 0.00020257089636288583, + "learning_rate": 8.425944808094715e-05, + "loss": 1.1023, + "step": 12830 + }, + { + "epoch": 1.002107234839616, + "grad_norm": 57.085105895996094, + "learning_rate": 8.422636231257326e-05, + "loss": 0.3823, + "step": 12840 + }, + { + "epoch": 1.002887692187622, + "grad_norm": 0.9345187544822693, + "learning_rate": 8.419324831920989e-05, + "loss": 1.7061, + "step": 12850 + }, + { + "epoch": 1.0036681495356279, + "grad_norm": 26.76588249206543, + "learning_rate": 8.416010612816467e-05, + "loss": 1.7186, + "step": 12860 + }, + { + "epoch": 1.0044486068836338, + "grad_norm": 9.51021957397461, + "learning_rate": 8.412693576676856e-05, + "loss": 1.865, + "step": 12870 + }, + { + "epoch": 1.0052290642316397, + "grad_norm": 13.387110710144043, + "learning_rate": 8.409373726237567e-05, + "loss": 0.2966, + "step": 12880 + }, + { + "epoch": 1.0060095215796456, + "grad_norm": 6.979156017303467, + "learning_rate": 8.406051064236337e-05, + "loss": 1.7553, + "step": 12890 + }, + { + "epoch": 1.0067899789276515, + "grad_norm": 11.995668411254883, + "learning_rate": 8.402725593413225e-05, + "loss": 0.6281, + "step": 12900 + }, + { + "epoch": 1.0075704362756575, + "grad_norm": 27.00234603881836, + "learning_rate": 8.399397316510596e-05, + "loss": 0.4262, + "step": 12910 + }, + { + "epoch": 1.0083508936236634, + "grad_norm": 71.79927062988281, + "learning_rate": 8.396066236273137e-05, + "loss": 1.3269, + "step": 12920 + }, + { + "epoch": 1.0091313509716695, + "grad_norm": 51.39724349975586, + "learning_rate": 8.392732355447844e-05, + "loss": 1.0123, + "step": 12930 + }, + { + "epoch": 1.0099118083196754, + "grad_norm": 52.78282928466797, + "learning_rate": 8.389395676784025e-05, + "loss": 1.4514, + "step": 12940 + }, + { + "epoch": 1.0106922656676813, + "grad_norm": 40.91914367675781, + "learning_rate": 8.386056203033295e-05, + "loss": 0.9037, + "step": 12950 + }, + { + "epoch": 1.0114727230156872, + "grad_norm": 50.55011749267578, + "learning_rate": 8.382713936949566e-05, + "loss": 0.9325, + "step": 12960 + }, + { + "epoch": 1.0122531803636932, + "grad_norm": 0.46198034286499023, + "learning_rate": 8.379368881289067e-05, + "loss": 0.6073, + "step": 12970 + }, + { + "epoch": 1.013033637711699, + "grad_norm": 37.872135162353516, + "learning_rate": 8.376021038810315e-05, + "loss": 1.9623, + "step": 12980 + }, + { + "epoch": 1.013814095059705, + "grad_norm": 4.987172603607178, + "learning_rate": 8.372670412274129e-05, + "loss": 0.6192, + "step": 12990 + }, + { + "epoch": 1.014594552407711, + "grad_norm": 4.3910542444791645e-05, + "learning_rate": 8.369317004443628e-05, + "loss": 0.584, + "step": 13000 + }, + { + "epoch": 1.0153750097557168, + "grad_norm": 0.6088114976882935, + "learning_rate": 8.36596081808422e-05, + "loss": 1.1512, + "step": 13010 + }, + { + "epoch": 1.0161554671037227, + "grad_norm": 4.626126289367676, + "learning_rate": 8.362601855963605e-05, + "loss": 0.6647, + "step": 13020 + }, + { + "epoch": 1.0169359244517286, + "grad_norm": 0.0009490216034464538, + "learning_rate": 8.359240120851772e-05, + "loss": 0.6454, + "step": 13030 + }, + { + "epoch": 1.0177163817997346, + "grad_norm": 1.2626225043277373e-07, + "learning_rate": 8.355875615521e-05, + "loss": 0.2324, + "step": 13040 + }, + { + "epoch": 1.0184968391477405, + "grad_norm": 21.736064910888672, + "learning_rate": 8.352508342745847e-05, + "loss": 1.5845, + "step": 13050 + }, + { + "epoch": 1.0192772964957466, + "grad_norm": 7.8866047859191895, + "learning_rate": 8.349138305303159e-05, + "loss": 1.2487, + "step": 13060 + }, + { + "epoch": 1.0200577538437525, + "grad_norm": 49.58051300048828, + "learning_rate": 8.345765505972056e-05, + "loss": 0.5553, + "step": 13070 + }, + { + "epoch": 1.0208382111917584, + "grad_norm": 0.45068633556365967, + "learning_rate": 8.342389947533943e-05, + "loss": 0.0828, + "step": 13080 + }, + { + "epoch": 1.0216186685397644, + "grad_norm": 49.6279411315918, + "learning_rate": 8.33901163277249e-05, + "loss": 1.5898, + "step": 13090 + }, + { + "epoch": 1.0223991258877703, + "grad_norm": 5.856717280039447e-07, + "learning_rate": 8.335630564473652e-05, + "loss": 1.2666, + "step": 13100 + }, + { + "epoch": 1.0231795832357762, + "grad_norm": 0.04553453251719475, + "learning_rate": 8.332246745425644e-05, + "loss": 1.0408, + "step": 13110 + }, + { + "epoch": 1.023960040583782, + "grad_norm": 0.0003157809842377901, + "learning_rate": 8.328860178418958e-05, + "loss": 0.0292, + "step": 13120 + }, + { + "epoch": 1.024740497931788, + "grad_norm": 3.6817857562709833e-06, + "learning_rate": 8.325470866246343e-05, + "loss": 2.1363, + "step": 13130 + }, + { + "epoch": 1.025520955279794, + "grad_norm": 0.0001370991230942309, + "learning_rate": 8.322078811702823e-05, + "loss": 0.0206, + "step": 13140 + }, + { + "epoch": 1.0263014126277998, + "grad_norm": 1.1230233907699585, + "learning_rate": 8.318684017585673e-05, + "loss": 1.1947, + "step": 13150 + }, + { + "epoch": 1.0270818699758058, + "grad_norm": 6.144953204056947e-07, + "learning_rate": 8.315286486694434e-05, + "loss": 1.9959, + "step": 13160 + }, + { + "epoch": 1.0278623273238117, + "grad_norm": 0.0007690726779401302, + "learning_rate": 8.311886221830902e-05, + "loss": 0.2656, + "step": 13170 + }, + { + "epoch": 1.0286427846718176, + "grad_norm": 11.274613380432129, + "learning_rate": 8.308483225799126e-05, + "loss": 1.606, + "step": 13180 + }, + { + "epoch": 1.0294232420198237, + "grad_norm": 48.34074020385742, + "learning_rate": 8.30507750140541e-05, + "loss": 1.329, + "step": 13190 + }, + { + "epoch": 1.0302036993678296, + "grad_norm": 3.7099595069885254, + "learning_rate": 8.301669051458305e-05, + "loss": 0.2159, + "step": 13200 + }, + { + "epoch": 1.0309841567158355, + "grad_norm": 12.82052230834961, + "learning_rate": 8.29825787876861e-05, + "loss": 1.4214, + "step": 13210 + }, + { + "epoch": 1.0317646140638415, + "grad_norm": 1.2749547958374023, + "learning_rate": 8.294843986149374e-05, + "loss": 0.7242, + "step": 13220 + }, + { + "epoch": 1.0325450714118474, + "grad_norm": 13.616936683654785, + "learning_rate": 8.291427376415882e-05, + "loss": 0.7435, + "step": 13230 + }, + { + "epoch": 1.0333255287598533, + "grad_norm": 0.00015055372205097228, + "learning_rate": 8.288008052385666e-05, + "loss": 0.3443, + "step": 13240 + }, + { + "epoch": 1.0341059861078592, + "grad_norm": 1.2600555419921875, + "learning_rate": 8.284586016878492e-05, + "loss": 0.5666, + "step": 13250 + }, + { + "epoch": 1.0348864434558651, + "grad_norm": 40.253013610839844, + "learning_rate": 8.281161272716365e-05, + "loss": 2.7526, + "step": 13260 + }, + { + "epoch": 1.035666900803871, + "grad_norm": 6.156032759463415e-05, + "learning_rate": 8.277733822723518e-05, + "loss": 0.3435, + "step": 13270 + }, + { + "epoch": 1.036447358151877, + "grad_norm": 0.009943058714270592, + "learning_rate": 8.274303669726426e-05, + "loss": 1.0844, + "step": 13280 + }, + { + "epoch": 1.0372278154998829, + "grad_norm": 46.61821365356445, + "learning_rate": 8.270870816553782e-05, + "loss": 0.6919, + "step": 13290 + }, + { + "epoch": 1.0380082728478888, + "grad_norm": 5.1633910480575196e-09, + "learning_rate": 8.267435266036512e-05, + "loss": 0.9052, + "step": 13300 + }, + { + "epoch": 1.0387887301958947, + "grad_norm": 56.16582489013672, + "learning_rate": 8.263997021007765e-05, + "loss": 1.0907, + "step": 13310 + }, + { + "epoch": 1.0395691875439008, + "grad_norm": 0.8746923804283142, + "learning_rate": 8.260556084302911e-05, + "loss": 1.2991, + "step": 13320 + }, + { + "epoch": 1.0403496448919067, + "grad_norm": 39.039512634277344, + "learning_rate": 8.257112458759541e-05, + "loss": 1.4904, + "step": 13330 + }, + { + "epoch": 1.0411301022399126, + "grad_norm": 2.8694819320662646e-07, + "learning_rate": 8.253666147217464e-05, + "loss": 1.7749, + "step": 13340 + }, + { + "epoch": 1.0419105595879186, + "grad_norm": 75.42256164550781, + "learning_rate": 8.250217152518702e-05, + "loss": 2.2867, + "step": 13350 + }, + { + "epoch": 1.0426910169359245, + "grad_norm": 47.55293273925781, + "learning_rate": 8.24676547750749e-05, + "loss": 0.5861, + "step": 13360 + }, + { + "epoch": 1.0434714742839304, + "grad_norm": 66.87212371826172, + "learning_rate": 8.243311125030274e-05, + "loss": 1.0252, + "step": 13370 + }, + { + "epoch": 1.0442519316319363, + "grad_norm": 2.618544101715088, + "learning_rate": 8.239854097935709e-05, + "loss": 1.612, + "step": 13380 + }, + { + "epoch": 1.0450323889799422, + "grad_norm": 7.044678568490781e-06, + "learning_rate": 8.236394399074654e-05, + "loss": 0.6337, + "step": 13390 + }, + { + "epoch": 1.0458128463279481, + "grad_norm": 0.528467059135437, + "learning_rate": 8.232932031300171e-05, + "loss": 0.4871, + "step": 13400 + }, + { + "epoch": 1.046593303675954, + "grad_norm": 7.7571539878845215, + "learning_rate": 8.229466997467527e-05, + "loss": 0.878, + "step": 13410 + }, + { + "epoch": 1.04737376102396, + "grad_norm": 78.52983856201172, + "learning_rate": 8.225999300434181e-05, + "loss": 2.2092, + "step": 13420 + }, + { + "epoch": 1.0481542183719659, + "grad_norm": 0.11696358025074005, + "learning_rate": 8.222528943059793e-05, + "loss": 0.4794, + "step": 13430 + }, + { + "epoch": 1.048934675719972, + "grad_norm": 1.6190725564956665, + "learning_rate": 8.219055928206213e-05, + "loss": 0.4143, + "step": 13440 + }, + { + "epoch": 1.049715133067978, + "grad_norm": 38.03479766845703, + "learning_rate": 8.215580258737493e-05, + "loss": 0.6102, + "step": 13450 + }, + { + "epoch": 1.0504955904159838, + "grad_norm": 4.810214042663574, + "learning_rate": 8.212101937519854e-05, + "loss": 0.2218, + "step": 13460 + }, + { + "epoch": 1.0512760477639898, + "grad_norm": 65.26640319824219, + "learning_rate": 8.208620967421728e-05, + "loss": 2.3941, + "step": 13470 + }, + { + "epoch": 1.0520565051119957, + "grad_norm": 0.008846109732985497, + "learning_rate": 8.20513735131371e-05, + "loss": 0.6208, + "step": 13480 + }, + { + "epoch": 1.0528369624600016, + "grad_norm": 57.17930221557617, + "learning_rate": 8.201651092068592e-05, + "loss": 3.1685, + "step": 13490 + }, + { + "epoch": 1.0536174198080075, + "grad_norm": 22.42201042175293, + "learning_rate": 8.198162192561337e-05, + "loss": 2.3165, + "step": 13500 + }, + { + "epoch": 1.0543978771560134, + "grad_norm": 10.212949752807617, + "learning_rate": 8.19467065566909e-05, + "loss": 0.5393, + "step": 13510 + }, + { + "epoch": 1.0551783345040193, + "grad_norm": 37.82101821899414, + "learning_rate": 8.19117648427117e-05, + "loss": 1.2392, + "step": 13520 + }, + { + "epoch": 1.0559587918520252, + "grad_norm": 0.038047194480895996, + "learning_rate": 8.187679681249065e-05, + "loss": 0.7501, + "step": 13530 + }, + { + "epoch": 1.0567392492000311, + "grad_norm": 0.0002589684445410967, + "learning_rate": 8.184180249486439e-05, + "loss": 0.1071, + "step": 13540 + }, + { + "epoch": 1.057519706548037, + "grad_norm": 6.387953987996298e-08, + "learning_rate": 8.18067819186912e-05, + "loss": 0.755, + "step": 13550 + }, + { + "epoch": 1.058300163896043, + "grad_norm": 0.07943969964981079, + "learning_rate": 8.177173511285102e-05, + "loss": 0.3546, + "step": 13560 + }, + { + "epoch": 1.0590806212440491, + "grad_norm": 0.06328899413347244, + "learning_rate": 8.173666210624542e-05, + "loss": 0.4805, + "step": 13570 + }, + { + "epoch": 1.059861078592055, + "grad_norm": 28.725759506225586, + "learning_rate": 8.17015629277976e-05, + "loss": 0.4756, + "step": 13580 + }, + { + "epoch": 1.060641535940061, + "grad_norm": 64.62845611572266, + "learning_rate": 8.16664376064523e-05, + "loss": 3.3899, + "step": 13590 + }, + { + "epoch": 1.0614219932880669, + "grad_norm": 46.945552825927734, + "learning_rate": 8.163128617117583e-05, + "loss": 0.6698, + "step": 13600 + }, + { + "epoch": 1.0622024506360728, + "grad_norm": 31.494598388671875, + "learning_rate": 8.159610865095608e-05, + "loss": 1.8572, + "step": 13610 + }, + { + "epoch": 1.0629829079840787, + "grad_norm": 2.593214503576746e-06, + "learning_rate": 8.15609050748024e-05, + "loss": 0.7823, + "step": 13620 + }, + { + "epoch": 1.0637633653320846, + "grad_norm": 23.18966293334961, + "learning_rate": 8.152567547174565e-05, + "loss": 0.6276, + "step": 13630 + }, + { + "epoch": 1.0645438226800905, + "grad_norm": 6.394977569580078, + "learning_rate": 8.149041987083816e-05, + "loss": 0.8798, + "step": 13640 + }, + { + "epoch": 1.0653242800280964, + "grad_norm": 8.956060810305644e-06, + "learning_rate": 8.145513830115366e-05, + "loss": 0.2264, + "step": 13650 + }, + { + "epoch": 1.0661047373761023, + "grad_norm": 0.010577269829809666, + "learning_rate": 8.141983079178736e-05, + "loss": 1.8401, + "step": 13660 + }, + { + "epoch": 1.0668851947241083, + "grad_norm": 0.7367900609970093, + "learning_rate": 8.138449737185578e-05, + "loss": 0.9894, + "step": 13670 + }, + { + "epoch": 1.0676656520721142, + "grad_norm": 0.0003608228580560535, + "learning_rate": 8.134913807049689e-05, + "loss": 0.5108, + "step": 13680 + }, + { + "epoch": 1.0684461094201203, + "grad_norm": 48.65434646606445, + "learning_rate": 8.131375291686995e-05, + "loss": 1.2039, + "step": 13690 + }, + { + "epoch": 1.0692265667681262, + "grad_norm": 0.004866276402026415, + "learning_rate": 8.127834194015553e-05, + "loss": 0.1723, + "step": 13700 + }, + { + "epoch": 1.0700070241161321, + "grad_norm": 61.13690948486328, + "learning_rate": 8.124290516955557e-05, + "loss": 0.6815, + "step": 13710 + }, + { + "epoch": 1.070787481464138, + "grad_norm": 61.9520149230957, + "learning_rate": 8.120744263429319e-05, + "loss": 1.9792, + "step": 13720 + }, + { + "epoch": 1.071567938812144, + "grad_norm": 0.23533307015895844, + "learning_rate": 8.117195436361281e-05, + "loss": 1.9054, + "step": 13730 + }, + { + "epoch": 1.0723483961601499, + "grad_norm": 13.175278663635254, + "learning_rate": 8.113644038678008e-05, + "loss": 0.7785, + "step": 13740 + }, + { + "epoch": 1.0731288535081558, + "grad_norm": 2.3585263988934457e-05, + "learning_rate": 8.110090073308178e-05, + "loss": 2.0138, + "step": 13750 + }, + { + "epoch": 1.0739093108561617, + "grad_norm": 0.43680545687675476, + "learning_rate": 8.106533543182598e-05, + "loss": 0.1986, + "step": 13760 + }, + { + "epoch": 1.0746897682041676, + "grad_norm": 0.013349410146474838, + "learning_rate": 8.102974451234178e-05, + "loss": 0.3865, + "step": 13770 + }, + { + "epoch": 1.0754702255521735, + "grad_norm": 22.574199676513672, + "learning_rate": 8.099412800397948e-05, + "loss": 0.5899, + "step": 13780 + }, + { + "epoch": 1.0762506829001794, + "grad_norm": 19.712507247924805, + "learning_rate": 8.095848593611044e-05, + "loss": 1.1524, + "step": 13790 + }, + { + "epoch": 1.0770311402481854, + "grad_norm": 2.589094877243042, + "learning_rate": 8.092281833812716e-05, + "loss": 1.6711, + "step": 13800 + }, + { + "epoch": 1.0778115975961913, + "grad_norm": 38.977909088134766, + "learning_rate": 8.088712523944314e-05, + "loss": 1.2787, + "step": 13810 + }, + { + "epoch": 1.0785920549441972, + "grad_norm": 34.47909164428711, + "learning_rate": 8.085140666949291e-05, + "loss": 1.0232, + "step": 13820 + }, + { + "epoch": 1.0793725122922033, + "grad_norm": 0.8124696612358093, + "learning_rate": 8.081566265773202e-05, + "loss": 0.2438, + "step": 13830 + }, + { + "epoch": 1.0801529696402092, + "grad_norm": 0.05350351333618164, + "learning_rate": 8.077989323363702e-05, + "loss": 0.4916, + "step": 13840 + }, + { + "epoch": 1.0809334269882152, + "grad_norm": 8.868869372236077e-07, + "learning_rate": 8.074409842670538e-05, + "loss": 1.8634, + "step": 13850 + }, + { + "epoch": 1.081713884336221, + "grad_norm": 38.613704681396484, + "learning_rate": 8.07082782664555e-05, + "loss": 1.8085, + "step": 13860 + }, + { + "epoch": 1.082494341684227, + "grad_norm": 45.651336669921875, + "learning_rate": 8.067243278242676e-05, + "loss": 2.1611, + "step": 13870 + }, + { + "epoch": 1.083274799032233, + "grad_norm": 0.4036369025707245, + "learning_rate": 8.063656200417928e-05, + "loss": 0.6555, + "step": 13880 + }, + { + "epoch": 1.0840552563802388, + "grad_norm": 2.1979187749820994e-06, + "learning_rate": 8.060066596129422e-05, + "loss": 1.0594, + "step": 13890 + }, + { + "epoch": 1.0848357137282447, + "grad_norm": 2.6379153728485107, + "learning_rate": 8.056474468337343e-05, + "loss": 0.4087, + "step": 13900 + }, + { + "epoch": 1.0856161710762506, + "grad_norm": 4.4034193706465885e-05, + "learning_rate": 8.052879820003962e-05, + "loss": 0.3825, + "step": 13910 + }, + { + "epoch": 1.0863966284242565, + "grad_norm": 1.2897426131530665e-05, + "learning_rate": 8.049282654093631e-05, + "loss": 0.4262, + "step": 13920 + }, + { + "epoch": 1.0871770857722625, + "grad_norm": 2.5914298475981923e-06, + "learning_rate": 8.045682973572777e-05, + "loss": 0.6124, + "step": 13930 + }, + { + "epoch": 1.0879575431202684, + "grad_norm": 46.8470344543457, + "learning_rate": 8.042080781409896e-05, + "loss": 1.1768, + "step": 13940 + }, + { + "epoch": 1.0887380004682745, + "grad_norm": 1.629309058189392, + "learning_rate": 8.038476080575562e-05, + "loss": 1.8716, + "step": 13950 + }, + { + "epoch": 1.0895184578162804, + "grad_norm": 0.21702909469604492, + "learning_rate": 8.034868874042412e-05, + "loss": 0.9467, + "step": 13960 + }, + { + "epoch": 1.0902989151642863, + "grad_norm": 3.1508551501246984e-07, + "learning_rate": 8.031259164785155e-05, + "loss": 1.7675, + "step": 13970 + }, + { + "epoch": 1.0910793725122923, + "grad_norm": 1.32136070728302, + "learning_rate": 8.027646955780556e-05, + "loss": 1.9713, + "step": 13980 + }, + { + "epoch": 1.0918598298602982, + "grad_norm": 0.06869912147521973, + "learning_rate": 8.024032250007454e-05, + "loss": 0.4424, + "step": 13990 + }, + { + "epoch": 1.092640287208304, + "grad_norm": 15.436603546142578, + "learning_rate": 8.020415050446732e-05, + "loss": 0.2352, + "step": 14000 + }, + { + "epoch": 1.09342074455631, + "grad_norm": 1.4183063507080078, + "learning_rate": 8.016795360081342e-05, + "loss": 0.7249, + "step": 14010 + }, + { + "epoch": 1.094201201904316, + "grad_norm": 9.759514808654785, + "learning_rate": 8.013173181896283e-05, + "loss": 0.5102, + "step": 14020 + }, + { + "epoch": 1.0949816592523218, + "grad_norm": 4.5370564460754395, + "learning_rate": 8.009548518878606e-05, + "loss": 0.3963, + "step": 14030 + }, + { + "epoch": 1.0957621166003277, + "grad_norm": 1.1387583072064444e-06, + "learning_rate": 8.005921374017415e-05, + "loss": 2.1259, + "step": 14040 + }, + { + "epoch": 1.0965425739483337, + "grad_norm": 10.59189510345459, + "learning_rate": 8.002291750303857e-05, + "loss": 1.1702, + "step": 14050 + }, + { + "epoch": 1.0973230312963396, + "grad_norm": 10.202428817749023, + "learning_rate": 7.998659650731125e-05, + "loss": 1.0879, + "step": 14060 + }, + { + "epoch": 1.0981034886443455, + "grad_norm": 37.752140045166016, + "learning_rate": 7.995025078294452e-05, + "loss": 1.529, + "step": 14070 + }, + { + "epoch": 1.0988839459923516, + "grad_norm": 4.652754306793213, + "learning_rate": 7.991388035991114e-05, + "loss": 0.3956, + "step": 14080 + }, + { + "epoch": 1.0996644033403575, + "grad_norm": 0.00034467552904970944, + "learning_rate": 7.98774852682042e-05, + "loss": 0.4728, + "step": 14090 + }, + { + "epoch": 1.1004448606883634, + "grad_norm": 0.8711071014404297, + "learning_rate": 7.984106553783712e-05, + "loss": 0.3728, + "step": 14100 + }, + { + "epoch": 1.1012253180363694, + "grad_norm": 1.9005073308944702, + "learning_rate": 7.98046211988437e-05, + "loss": 0.4657, + "step": 14110 + }, + { + "epoch": 1.1020057753843753, + "grad_norm": 42.511497497558594, + "learning_rate": 7.976815228127801e-05, + "loss": 1.4052, + "step": 14120 + }, + { + "epoch": 1.1027862327323812, + "grad_norm": 34.27906799316406, + "learning_rate": 7.973165881521434e-05, + "loss": 1.217, + "step": 14130 + }, + { + "epoch": 1.103566690080387, + "grad_norm": 0.008204025216400623, + "learning_rate": 7.969514083074727e-05, + "loss": 2.7314, + "step": 14140 + }, + { + "epoch": 1.104347147428393, + "grad_norm": 0.2557177245616913, + "learning_rate": 7.965859835799162e-05, + "loss": 0.9439, + "step": 14150 + }, + { + "epoch": 1.105127604776399, + "grad_norm": 39.277400970458984, + "learning_rate": 7.962203142708231e-05, + "loss": 0.5969, + "step": 14160 + }, + { + "epoch": 1.1059080621244048, + "grad_norm": 0.045487336814403534, + "learning_rate": 7.958544006817456e-05, + "loss": 0.869, + "step": 14170 + }, + { + "epoch": 1.1066885194724108, + "grad_norm": 21.060131072998047, + "learning_rate": 7.954882431144364e-05, + "loss": 0.2066, + "step": 14180 + }, + { + "epoch": 1.1074689768204167, + "grad_norm": 0.9694964289665222, + "learning_rate": 7.951218418708497e-05, + "loss": 1.0377, + "step": 14190 + }, + { + "epoch": 1.1082494341684228, + "grad_norm": 7.772018909454346, + "learning_rate": 7.947551972531409e-05, + "loss": 0.0204, + "step": 14200 + }, + { + "epoch": 1.1090298915164287, + "grad_norm": 93.92030334472656, + "learning_rate": 7.943883095636652e-05, + "loss": 2.4873, + "step": 14210 + }, + { + "epoch": 1.1098103488644346, + "grad_norm": 72.38433837890625, + "learning_rate": 7.940211791049796e-05, + "loss": 1.1869, + "step": 14220 + }, + { + "epoch": 1.1105908062124406, + "grad_norm": 59.901004791259766, + "learning_rate": 7.936538061798403e-05, + "loss": 5.1337, + "step": 14230 + }, + { + "epoch": 1.1113712635604465, + "grad_norm": 0.010661332868039608, + "learning_rate": 7.932861910912036e-05, + "loss": 0.5729, + "step": 14240 + }, + { + "epoch": 1.1121517209084524, + "grad_norm": 0.6803077459335327, + "learning_rate": 7.92918334142226e-05, + "loss": 1.349, + "step": 14250 + }, + { + "epoch": 1.1129321782564583, + "grad_norm": 5.760603427886963, + "learning_rate": 7.925502356362627e-05, + "loss": 0.5931, + "step": 14260 + }, + { + "epoch": 1.1137126356044642, + "grad_norm": 29.1840763092041, + "learning_rate": 7.921818958768686e-05, + "loss": 0.7213, + "step": 14270 + }, + { + "epoch": 1.1144930929524701, + "grad_norm": 33.425418853759766, + "learning_rate": 7.918133151677977e-05, + "loss": 1.2351, + "step": 14280 + }, + { + "epoch": 1.115273550300476, + "grad_norm": 34.22904968261719, + "learning_rate": 7.914444938130021e-05, + "loss": 0.8819, + "step": 14290 + }, + { + "epoch": 1.116054007648482, + "grad_norm": 10.303384780883789, + "learning_rate": 7.910754321166329e-05, + "loss": 0.8157, + "step": 14300 + }, + { + "epoch": 1.1168344649964879, + "grad_norm": 26.004243850708008, + "learning_rate": 7.907061303830392e-05, + "loss": 0.7456, + "step": 14310 + }, + { + "epoch": 1.1176149223444938, + "grad_norm": 11.272562980651855, + "learning_rate": 7.903365889167682e-05, + "loss": 1.2884, + "step": 14320 + }, + { + "epoch": 1.1183953796924997, + "grad_norm": 11.90373420715332, + "learning_rate": 7.899668080225642e-05, + "loss": 0.2785, + "step": 14330 + }, + { + "epoch": 1.1191758370405058, + "grad_norm": 3.1206201356326346e-07, + "learning_rate": 7.895967880053697e-05, + "loss": 0.5356, + "step": 14340 + }, + { + "epoch": 1.1199562943885117, + "grad_norm": 11.254258155822754, + "learning_rate": 7.89226529170324e-05, + "loss": 1.1941, + "step": 14350 + }, + { + "epoch": 1.1207367517365177, + "grad_norm": 13.978715896606445, + "learning_rate": 7.888560318227636e-05, + "loss": 0.2811, + "step": 14360 + }, + { + "epoch": 1.1215172090845236, + "grad_norm": 2.0998995751142502e-05, + "learning_rate": 7.884852962682212e-05, + "loss": 0.5678, + "step": 14370 + }, + { + "epoch": 1.1222976664325295, + "grad_norm": 26.62897300720215, + "learning_rate": 7.881143228124266e-05, + "loss": 0.3172, + "step": 14380 + }, + { + "epoch": 1.1230781237805354, + "grad_norm": 3.174713114617589e-08, + "learning_rate": 7.87743111761305e-05, + "loss": 1.003, + "step": 14390 + }, + { + "epoch": 1.1238585811285413, + "grad_norm": 1.8061119318008423, + "learning_rate": 7.873716634209784e-05, + "loss": 2.5562, + "step": 14400 + }, + { + "epoch": 1.1246390384765472, + "grad_norm": 31.073867797851562, + "learning_rate": 7.869999780977641e-05, + "loss": 0.6857, + "step": 14410 + }, + { + "epoch": 1.1254194958245531, + "grad_norm": 51.120540618896484, + "learning_rate": 7.866280560981745e-05, + "loss": 2.2526, + "step": 14420 + }, + { + "epoch": 1.126199953172559, + "grad_norm": 0.10467230528593063, + "learning_rate": 7.862558977289174e-05, + "loss": 0.7654, + "step": 14430 + }, + { + "epoch": 1.126980410520565, + "grad_norm": 0.00010914414451690391, + "learning_rate": 7.858835032968959e-05, + "loss": 1.0868, + "step": 14440 + }, + { + "epoch": 1.127760867868571, + "grad_norm": 43.196014404296875, + "learning_rate": 7.855108731092073e-05, + "loss": 0.9148, + "step": 14450 + }, + { + "epoch": 1.128541325216577, + "grad_norm": 0.3297148644924164, + "learning_rate": 7.851380074731433e-05, + "loss": 0.58, + "step": 14460 + }, + { + "epoch": 1.129321782564583, + "grad_norm": 13.183833122253418, + "learning_rate": 7.847649066961904e-05, + "loss": 0.161, + "step": 14470 + }, + { + "epoch": 1.1301022399125888, + "grad_norm": 41.99236297607422, + "learning_rate": 7.84391571086028e-05, + "loss": 0.8755, + "step": 14480 + }, + { + "epoch": 1.1308826972605948, + "grad_norm": 8.9720373352975e-07, + "learning_rate": 7.840180009505303e-05, + "loss": 0.098, + "step": 14490 + }, + { + "epoch": 1.1316631546086007, + "grad_norm": 35.03562545776367, + "learning_rate": 7.83644196597764e-05, + "loss": 0.2565, + "step": 14500 + }, + { + "epoch": 1.1324436119566066, + "grad_norm": 4.2327664573349466e-07, + "learning_rate": 7.83270158335989e-05, + "loss": 0.9135, + "step": 14510 + }, + { + "epoch": 1.1332240693046125, + "grad_norm": 0.0076245772652328014, + "learning_rate": 7.828958864736587e-05, + "loss": 0.2073, + "step": 14520 + }, + { + "epoch": 1.1340045266526184, + "grad_norm": 4.616389936984433e-09, + "learning_rate": 7.825213813194187e-05, + "loss": 0.096, + "step": 14530 + }, + { + "epoch": 1.1347849840006243, + "grad_norm": 25.069517135620117, + "learning_rate": 7.821466431821072e-05, + "loss": 0.9064, + "step": 14540 + }, + { + "epoch": 1.1355654413486302, + "grad_norm": 6.9510321617126465, + "learning_rate": 7.817716723707545e-05, + "loss": 1.6894, + "step": 14550 + }, + { + "epoch": 1.1363458986966362, + "grad_norm": 43.50651550292969, + "learning_rate": 7.813964691945821e-05, + "loss": 2.6459, + "step": 14560 + }, + { + "epoch": 1.137126356044642, + "grad_norm": 35.28486633300781, + "learning_rate": 7.810210339630042e-05, + "loss": 1.4862, + "step": 14570 + }, + { + "epoch": 1.137906813392648, + "grad_norm": 5.5574896578036714e-06, + "learning_rate": 7.806453669856259e-05, + "loss": 0.907, + "step": 14580 + }, + { + "epoch": 1.138687270740654, + "grad_norm": 38.928470611572266, + "learning_rate": 7.802694685722431e-05, + "loss": 0.6483, + "step": 14590 + }, + { + "epoch": 1.13946772808866, + "grad_norm": 1.095226526260376, + "learning_rate": 7.798933390328431e-05, + "loss": 0.4901, + "step": 14600 + }, + { + "epoch": 1.140248185436666, + "grad_norm": 0.0013487986288964748, + "learning_rate": 7.795169786776035e-05, + "loss": 0.9132, + "step": 14610 + }, + { + "epoch": 1.1410286427846719, + "grad_norm": 15.737239837646484, + "learning_rate": 7.791403878168922e-05, + "loss": 1.3128, + "step": 14620 + }, + { + "epoch": 1.1418091001326778, + "grad_norm": 0.00030404122662730515, + "learning_rate": 7.787635667612674e-05, + "loss": 0.4789, + "step": 14630 + }, + { + "epoch": 1.1425895574806837, + "grad_norm": 0.6297647953033447, + "learning_rate": 7.783865158214768e-05, + "loss": 0.0293, + "step": 14640 + }, + { + "epoch": 1.1433700148286896, + "grad_norm": 46.85776138305664, + "learning_rate": 7.780092353084579e-05, + "loss": 2.0859, + "step": 14650 + }, + { + "epoch": 1.1441504721766955, + "grad_norm": 0.00015316448116209358, + "learning_rate": 7.776317255333376e-05, + "loss": 1.1816, + "step": 14660 + }, + { + "epoch": 1.1449309295247014, + "grad_norm": 6.184297561645508, + "learning_rate": 7.772539868074318e-05, + "loss": 1.1873, + "step": 14670 + }, + { + "epoch": 1.1457113868727073, + "grad_norm": 47.606781005859375, + "learning_rate": 7.76876019442245e-05, + "loss": 1.8268, + "step": 14680 + }, + { + "epoch": 1.1464918442207133, + "grad_norm": 0.004689674358814955, + "learning_rate": 7.764978237494707e-05, + "loss": 0.6779, + "step": 14690 + }, + { + "epoch": 1.1472723015687192, + "grad_norm": 41.07505416870117, + "learning_rate": 7.761194000409901e-05, + "loss": 1.2738, + "step": 14700 + }, + { + "epoch": 1.1480527589167253, + "grad_norm": 0.14501197636127472, + "learning_rate": 7.75740748628873e-05, + "loss": 1.1459, + "step": 14710 + }, + { + "epoch": 1.1488332162647312, + "grad_norm": 0.05744193121790886, + "learning_rate": 7.753618698253765e-05, + "loss": 1.2282, + "step": 14720 + }, + { + "epoch": 1.1496136736127371, + "grad_norm": 8.063496589660645, + "learning_rate": 7.749827639429456e-05, + "loss": 0.6149, + "step": 14730 + }, + { + "epoch": 1.150394130960743, + "grad_norm": 0.5605101585388184, + "learning_rate": 7.746034312942123e-05, + "loss": 0.3887, + "step": 14740 + }, + { + "epoch": 1.151174588308749, + "grad_norm": 20.27739143371582, + "learning_rate": 7.742238721919957e-05, + "loss": 1.3573, + "step": 14750 + }, + { + "epoch": 1.1519550456567549, + "grad_norm": 2.61468768119812, + "learning_rate": 7.738440869493018e-05, + "loss": 0.6847, + "step": 14760 + }, + { + "epoch": 1.1527355030047608, + "grad_norm": 0.26672032475471497, + "learning_rate": 7.734640758793225e-05, + "loss": 0.9144, + "step": 14770 + }, + { + "epoch": 1.1535159603527667, + "grad_norm": 20.54535484313965, + "learning_rate": 7.73083839295437e-05, + "loss": 0.2751, + "step": 14780 + }, + { + "epoch": 1.1542964177007726, + "grad_norm": 1.1103615760803223, + "learning_rate": 7.727033775112096e-05, + "loss": 0.947, + "step": 14790 + }, + { + "epoch": 1.1550768750487785, + "grad_norm": 8.565208435058594, + "learning_rate": 7.723226908403902e-05, + "loss": 1.2774, + "step": 14800 + }, + { + "epoch": 1.1558573323967845, + "grad_norm": 0.001084254472516477, + "learning_rate": 7.71941779596915e-05, + "loss": 0.5417, + "step": 14810 + }, + { + "epoch": 1.1566377897447904, + "grad_norm": 0.0005097747780382633, + "learning_rate": 7.715606440949045e-05, + "loss": 0.9245, + "step": 14820 + }, + { + "epoch": 1.1574182470927963, + "grad_norm": 0.0013373733963817358, + "learning_rate": 7.71179284648665e-05, + "loss": 1.8957, + "step": 14830 + }, + { + "epoch": 1.1581987044408022, + "grad_norm": 3.3863561153411865, + "learning_rate": 7.707977015726869e-05, + "loss": 0.3075, + "step": 14840 + }, + { + "epoch": 1.1589791617888083, + "grad_norm": 41.45360565185547, + "learning_rate": 7.704158951816446e-05, + "loss": 1.0029, + "step": 14850 + }, + { + "epoch": 1.1597596191368142, + "grad_norm": 28.2884578704834, + "learning_rate": 7.700338657903977e-05, + "loss": 0.4945, + "step": 14860 + }, + { + "epoch": 1.1605400764848202, + "grad_norm": 5.369661331176758, + "learning_rate": 7.69651613713989e-05, + "loss": 0.1512, + "step": 14870 + }, + { + "epoch": 1.161320533832826, + "grad_norm": 52.72662353515625, + "learning_rate": 7.692691392676454e-05, + "loss": 1.0431, + "step": 14880 + }, + { + "epoch": 1.162100991180832, + "grad_norm": 64.16173553466797, + "learning_rate": 7.688864427667768e-05, + "loss": 1.918, + "step": 14890 + }, + { + "epoch": 1.162881448528838, + "grad_norm": 0.009893955662846565, + "learning_rate": 7.68503524526976e-05, + "loss": 1.4701, + "step": 14900 + }, + { + "epoch": 1.1636619058768438, + "grad_norm": 29.174976348876953, + "learning_rate": 7.681203848640193e-05, + "loss": 2.4439, + "step": 14910 + }, + { + "epoch": 1.1644423632248497, + "grad_norm": 0.05682931840419769, + "learning_rate": 7.677370240938653e-05, + "loss": 1.2603, + "step": 14920 + }, + { + "epoch": 1.1652228205728556, + "grad_norm": 8.036317825317383, + "learning_rate": 7.673534425326548e-05, + "loss": 1.681, + "step": 14930 + }, + { + "epoch": 1.1660032779208616, + "grad_norm": 48.152549743652344, + "learning_rate": 7.669696404967106e-05, + "loss": 0.8372, + "step": 14940 + }, + { + "epoch": 1.1667837352688675, + "grad_norm": 0.025480227544903755, + "learning_rate": 7.66585618302538e-05, + "loss": 0.4711, + "step": 14950 + }, + { + "epoch": 1.1675641926168736, + "grad_norm": 0.004622321110218763, + "learning_rate": 7.66201376266823e-05, + "loss": 0.3303, + "step": 14960 + }, + { + "epoch": 1.1683446499648795, + "grad_norm": 21.413469314575195, + "learning_rate": 7.658169147064333e-05, + "loss": 1.4434, + "step": 14970 + }, + { + "epoch": 1.1691251073128854, + "grad_norm": 42.26973342895508, + "learning_rate": 7.654322339384178e-05, + "loss": 0.4439, + "step": 14980 + }, + { + "epoch": 1.1699055646608914, + "grad_norm": 56.95500946044922, + "learning_rate": 7.65047334280006e-05, + "loss": 1.851, + "step": 14990 + }, + { + "epoch": 1.1706860220088973, + "grad_norm": 0.0011174381943419576, + "learning_rate": 7.646622160486075e-05, + "loss": 1.7686, + "step": 15000 + }, + { + "epoch": 1.1714664793569032, + "grad_norm": 0.0004623864952009171, + "learning_rate": 7.642768795618129e-05, + "loss": 0.0741, + "step": 15010 + }, + { + "epoch": 1.172246936704909, + "grad_norm": 1.7365893654641695e-05, + "learning_rate": 7.63891325137392e-05, + "loss": 1.4602, + "step": 15020 + }, + { + "epoch": 1.173027394052915, + "grad_norm": 10.92381477355957, + "learning_rate": 7.635055530932951e-05, + "loss": 0.2436, + "step": 15030 + }, + { + "epoch": 1.173807851400921, + "grad_norm": 0.46861162781715393, + "learning_rate": 7.631195637476516e-05, + "loss": 1.6745, + "step": 15040 + }, + { + "epoch": 1.1745883087489268, + "grad_norm": 0.00027051439974457026, + "learning_rate": 7.627333574187697e-05, + "loss": 0.6537, + "step": 15050 + }, + { + "epoch": 1.1753687660969327, + "grad_norm": 2.5070583820343018, + "learning_rate": 7.623469344251373e-05, + "loss": 0.7209, + "step": 15060 + }, + { + "epoch": 1.1761492234449387, + "grad_norm": 16.99152946472168, + "learning_rate": 7.619602950854205e-05, + "loss": 0.4085, + "step": 15070 + }, + { + "epoch": 1.1769296807929446, + "grad_norm": 0.03525177016854286, + "learning_rate": 7.61573439718464e-05, + "loss": 0.1498, + "step": 15080 + }, + { + "epoch": 1.1777101381409505, + "grad_norm": 43.2532958984375, + "learning_rate": 7.611863686432903e-05, + "loss": 2.3887, + "step": 15090 + }, + { + "epoch": 1.1784905954889564, + "grad_norm": 1.2691663187069935e-07, + "learning_rate": 7.607990821791005e-05, + "loss": 0.6657, + "step": 15100 + }, + { + "epoch": 1.1792710528369625, + "grad_norm": 46.22968292236328, + "learning_rate": 7.604115806452723e-05, + "loss": 1.4542, + "step": 15110 + }, + { + "epoch": 1.1800515101849685, + "grad_norm": 8.150375651894137e-05, + "learning_rate": 7.600238643613618e-05, + "loss": 0.601, + "step": 15120 + }, + { + "epoch": 1.1808319675329744, + "grad_norm": 1.4502999782562256, + "learning_rate": 7.596359336471015e-05, + "loss": 0.3452, + "step": 15130 + }, + { + "epoch": 1.1816124248809803, + "grad_norm": 19.779787063598633, + "learning_rate": 7.59247788822401e-05, + "loss": 1.2247, + "step": 15140 + }, + { + "epoch": 1.1823928822289862, + "grad_norm": 31.30126190185547, + "learning_rate": 7.588594302073464e-05, + "loss": 0.3812, + "step": 15150 + }, + { + "epoch": 1.1831733395769921, + "grad_norm": 4.278034210205078, + "learning_rate": 7.584708581222002e-05, + "loss": 1.2073, + "step": 15160 + }, + { + "epoch": 1.183953796924998, + "grad_norm": 3.023759290954331e-06, + "learning_rate": 7.580820728874008e-05, + "loss": 2.7868, + "step": 15170 + }, + { + "epoch": 1.184734254273004, + "grad_norm": 0.6000422835350037, + "learning_rate": 7.576930748235624e-05, + "loss": 0.5173, + "step": 15180 + }, + { + "epoch": 1.1855147116210099, + "grad_norm": 47.56028747558594, + "learning_rate": 7.573038642514748e-05, + "loss": 2.4751, + "step": 15190 + }, + { + "epoch": 1.1862951689690158, + "grad_norm": 23.9864559173584, + "learning_rate": 7.569144414921031e-05, + "loss": 0.9667, + "step": 15200 + }, + { + "epoch": 1.1870756263170217, + "grad_norm": 13.402782440185547, + "learning_rate": 7.565248068665872e-05, + "loss": 0.934, + "step": 15210 + }, + { + "epoch": 1.1878560836650278, + "grad_norm": 14.633162498474121, + "learning_rate": 7.561349606962416e-05, + "loss": 0.6039, + "step": 15220 + }, + { + "epoch": 1.1886365410130337, + "grad_norm": 10.086926460266113, + "learning_rate": 7.557449033025558e-05, + "loss": 1.0448, + "step": 15230 + }, + { + "epoch": 1.1894169983610396, + "grad_norm": 12.034710884094238, + "learning_rate": 7.553546350071928e-05, + "loss": 0.4767, + "step": 15240 + }, + { + "epoch": 1.1901974557090456, + "grad_norm": 7.936996553326026e-06, + "learning_rate": 7.549641561319902e-05, + "loss": 0.0658, + "step": 15250 + }, + { + "epoch": 1.1909779130570515, + "grad_norm": 5.2773613929748535, + "learning_rate": 7.545734669989586e-05, + "loss": 1.2455, + "step": 15260 + }, + { + "epoch": 1.1917583704050574, + "grad_norm": 10.09801959991455, + "learning_rate": 7.541825679302825e-05, + "loss": 1.4523, + "step": 15270 + }, + { + "epoch": 1.1925388277530633, + "grad_norm": 0.3290392756462097, + "learning_rate": 7.537914592483194e-05, + "loss": 0.4414, + "step": 15280 + }, + { + "epoch": 1.1933192851010692, + "grad_norm": 0.00040467019425705075, + "learning_rate": 7.534001412755991e-05, + "loss": 0.831, + "step": 15290 + }, + { + "epoch": 1.1940997424490751, + "grad_norm": 4.611071586608887, + "learning_rate": 7.53008614334825e-05, + "loss": 1.8377, + "step": 15300 + }, + { + "epoch": 1.194880199797081, + "grad_norm": 0.007054849527776241, + "learning_rate": 7.526168787488721e-05, + "loss": 1.1124, + "step": 15310 + }, + { + "epoch": 1.195660657145087, + "grad_norm": 19.00647735595703, + "learning_rate": 7.522249348407879e-05, + "loss": 0.4197, + "step": 15320 + }, + { + "epoch": 1.1964411144930929, + "grad_norm": 0.24617056548595428, + "learning_rate": 7.518327829337912e-05, + "loss": 0.3874, + "step": 15330 + }, + { + "epoch": 1.1972215718410988, + "grad_norm": 43.505096435546875, + "learning_rate": 7.514404233512725e-05, + "loss": 1.059, + "step": 15340 + }, + { + "epoch": 1.1980020291891047, + "grad_norm": 1.2489773035049438, + "learning_rate": 7.51047856416794e-05, + "loss": 0.3069, + "step": 15350 + }, + { + "epoch": 1.1987824865371108, + "grad_norm": 50.59429931640625, + "learning_rate": 7.506550824540881e-05, + "loss": 0.1905, + "step": 15360 + }, + { + "epoch": 1.1995629438851168, + "grad_norm": 49.17162322998047, + "learning_rate": 7.502621017870588e-05, + "loss": 1.3437, + "step": 15370 + }, + { + "epoch": 1.2003434012331227, + "grad_norm": 17.885541915893555, + "learning_rate": 7.498689147397799e-05, + "loss": 1.0513, + "step": 15380 + }, + { + "epoch": 1.2011238585811286, + "grad_norm": 2.4361978034903586e-07, + "learning_rate": 7.494755216364956e-05, + "loss": 0.3555, + "step": 15390 + }, + { + "epoch": 1.2019043159291345, + "grad_norm": 60.8936653137207, + "learning_rate": 7.490819228016202e-05, + "loss": 1.7618, + "step": 15400 + }, + { + "epoch": 1.2026847732771404, + "grad_norm": 0.6997115612030029, + "learning_rate": 7.486881185597373e-05, + "loss": 0.5708, + "step": 15410 + }, + { + "epoch": 1.2034652306251463, + "grad_norm": 0.29672977328300476, + "learning_rate": 7.482941092356004e-05, + "loss": 0.4767, + "step": 15420 + }, + { + "epoch": 1.2042456879731522, + "grad_norm": 0.3411640226840973, + "learning_rate": 7.478998951541316e-05, + "loss": 2.2572, + "step": 15430 + }, + { + "epoch": 1.2050261453211581, + "grad_norm": 3.5091769695281982, + "learning_rate": 7.475054766404221e-05, + "loss": 2.2157, + "step": 15440 + }, + { + "epoch": 1.205806602669164, + "grad_norm": 0.06144566833972931, + "learning_rate": 7.471108540197316e-05, + "loss": 1.5861, + "step": 15450 + }, + { + "epoch": 1.20658706001717, + "grad_norm": 2.090874671936035, + "learning_rate": 7.467160276174884e-05, + "loss": 0.5432, + "step": 15460 + }, + { + "epoch": 1.2073675173651761, + "grad_norm": 0.16866926848888397, + "learning_rate": 7.463209977592884e-05, + "loss": 0.3952, + "step": 15470 + }, + { + "epoch": 1.208147974713182, + "grad_norm": 46.683372497558594, + "learning_rate": 7.459257647708955e-05, + "loss": 1.6898, + "step": 15480 + }, + { + "epoch": 1.208928432061188, + "grad_norm": 3.992119312286377, + "learning_rate": 7.455303289782414e-05, + "loss": 0.6744, + "step": 15490 + }, + { + "epoch": 1.2097088894091939, + "grad_norm": 43.23140335083008, + "learning_rate": 7.451346907074245e-05, + "loss": 1.7335, + "step": 15500 + }, + { + "epoch": 1.2104893467571998, + "grad_norm": 44.27477264404297, + "learning_rate": 7.447388502847106e-05, + "loss": 0.7387, + "step": 15510 + }, + { + "epoch": 1.2112698041052057, + "grad_norm": 0.1679820567369461, + "learning_rate": 7.443428080365318e-05, + "loss": 1.7443, + "step": 15520 + }, + { + "epoch": 1.2120502614532116, + "grad_norm": 7.058843766571954e-05, + "learning_rate": 7.439465642894872e-05, + "loss": 0.9857, + "step": 15530 + }, + { + "epoch": 1.2128307188012175, + "grad_norm": 0.11891741305589676, + "learning_rate": 7.435501193703415e-05, + "loss": 0.2799, + "step": 15540 + }, + { + "epoch": 1.2136111761492234, + "grad_norm": 20.137723922729492, + "learning_rate": 7.431534736060257e-05, + "loss": 1.0802, + "step": 15550 + }, + { + "epoch": 1.2143916334972293, + "grad_norm": 4.478515148162842, + "learning_rate": 7.427566273236363e-05, + "loss": 1.1262, + "step": 15560 + }, + { + "epoch": 1.2151720908452353, + "grad_norm": 37.74848175048828, + "learning_rate": 7.42359580850435e-05, + "loss": 1.6794, + "step": 15570 + }, + { + "epoch": 1.2159525481932412, + "grad_norm": 0.0001180763283628039, + "learning_rate": 7.419623345138488e-05, + "loss": 1.0961, + "step": 15580 + }, + { + "epoch": 1.216733005541247, + "grad_norm": 6.324615242192522e-05, + "learning_rate": 7.415648886414694e-05, + "loss": 0.0529, + "step": 15590 + }, + { + "epoch": 1.217513462889253, + "grad_norm": 5.168887615203857, + "learning_rate": 7.411672435610531e-05, + "loss": 0.17, + "step": 15600 + }, + { + "epoch": 1.218293920237259, + "grad_norm": 0.005032880697399378, + "learning_rate": 7.407693996005207e-05, + "loss": 0.7068, + "step": 15610 + }, + { + "epoch": 1.219074377585265, + "grad_norm": 0.0009420686401426792, + "learning_rate": 7.403713570879565e-05, + "loss": 0.9126, + "step": 15620 + }, + { + "epoch": 1.219854834933271, + "grad_norm": 8.980721473693848, + "learning_rate": 7.399731163516088e-05, + "loss": 0.2624, + "step": 15630 + }, + { + "epoch": 1.2206352922812769, + "grad_norm": 23.076135635375977, + "learning_rate": 7.395746777198895e-05, + "loss": 1.3006, + "step": 15640 + }, + { + "epoch": 1.2214157496292828, + "grad_norm": 0.07298509776592255, + "learning_rate": 7.391760415213735e-05, + "loss": 0.6209, + "step": 15650 + }, + { + "epoch": 1.2221962069772887, + "grad_norm": 2.6981508653989295e-06, + "learning_rate": 7.387772080847988e-05, + "loss": 0.787, + "step": 15660 + }, + { + "epoch": 1.2229766643252946, + "grad_norm": 0.2646944522857666, + "learning_rate": 7.383781777390658e-05, + "loss": 1.1545, + "step": 15670 + }, + { + "epoch": 1.2237571216733005, + "grad_norm": 0.16488932073116302, + "learning_rate": 7.379789508132377e-05, + "loss": 0.5825, + "step": 15680 + }, + { + "epoch": 1.2245375790213064, + "grad_norm": 0.34593504667282104, + "learning_rate": 7.375795276365392e-05, + "loss": 1.3832, + "step": 15690 + }, + { + "epoch": 1.2253180363693124, + "grad_norm": 0.3061760365962982, + "learning_rate": 7.371799085383575e-05, + "loss": 1.0564, + "step": 15700 + }, + { + "epoch": 1.2260984937173183, + "grad_norm": 0.015735339373350143, + "learning_rate": 7.367800938482409e-05, + "loss": 1.1702, + "step": 15710 + }, + { + "epoch": 1.2268789510653242, + "grad_norm": 19.672164916992188, + "learning_rate": 7.363800838958991e-05, + "loss": 1.6207, + "step": 15720 + }, + { + "epoch": 1.2276594084133303, + "grad_norm": 0.003373056184500456, + "learning_rate": 7.359798790112033e-05, + "loss": 0.4638, + "step": 15730 + }, + { + "epoch": 1.2284398657613362, + "grad_norm": 0.04260651394724846, + "learning_rate": 7.355794795241844e-05, + "loss": 1.188, + "step": 15740 + }, + { + "epoch": 1.2292203231093422, + "grad_norm": 36.1136360168457, + "learning_rate": 7.351788857650345e-05, + "loss": 0.5226, + "step": 15750 + }, + { + "epoch": 1.230000780457348, + "grad_norm": 9.62464714050293, + "learning_rate": 7.347780980641064e-05, + "loss": 0.5578, + "step": 15760 + }, + { + "epoch": 1.230781237805354, + "grad_norm": 26.886932373046875, + "learning_rate": 7.343771167519117e-05, + "loss": 1.4843, + "step": 15770 + }, + { + "epoch": 1.23156169515336, + "grad_norm": 1.5477793567697518e-05, + "learning_rate": 7.339759421591224e-05, + "loss": 0.2607, + "step": 15780 + }, + { + "epoch": 1.2323421525013658, + "grad_norm": 20.806283950805664, + "learning_rate": 7.335745746165696e-05, + "loss": 1.0338, + "step": 15790 + }, + { + "epoch": 1.2331226098493717, + "grad_norm": 0.18499815464019775, + "learning_rate": 7.331730144552437e-05, + "loss": 0.5777, + "step": 15800 + }, + { + "epoch": 1.2339030671973776, + "grad_norm": 15.987350463867188, + "learning_rate": 7.32771262006294e-05, + "loss": 0.1239, + "step": 15810 + }, + { + "epoch": 1.2346835245453835, + "grad_norm": 51.00580978393555, + "learning_rate": 7.32369317601028e-05, + "loss": 0.6431, + "step": 15820 + }, + { + "epoch": 1.2354639818933895, + "grad_norm": 28.59132194519043, + "learning_rate": 7.319671815709119e-05, + "loss": 1.099, + "step": 15830 + }, + { + "epoch": 1.2362444392413954, + "grad_norm": 1.3742252588272095, + "learning_rate": 7.315648542475698e-05, + "loss": 0.1693, + "step": 15840 + }, + { + "epoch": 1.2370248965894013, + "grad_norm": 0.00036608395748771727, + "learning_rate": 7.311623359627833e-05, + "loss": 0.1572, + "step": 15850 + }, + { + "epoch": 1.2378053539374072, + "grad_norm": 12.284563064575195, + "learning_rate": 7.307596270484918e-05, + "loss": 0.277, + "step": 15860 + }, + { + "epoch": 1.2385858112854133, + "grad_norm": 0.00085182033944875, + "learning_rate": 7.303567278367917e-05, + "loss": 1.9326, + "step": 15870 + }, + { + "epoch": 1.2393662686334193, + "grad_norm": 0.0015476574189960957, + "learning_rate": 7.299536386599367e-05, + "loss": 0.3339, + "step": 15880 + }, + { + "epoch": 1.2401467259814252, + "grad_norm": 11.180502891540527, + "learning_rate": 7.295503598503366e-05, + "loss": 1.0722, + "step": 15890 + }, + { + "epoch": 1.240927183329431, + "grad_norm": 1.789854832168203e-05, + "learning_rate": 7.29146891740558e-05, + "loss": 1.0567, + "step": 15900 + }, + { + "epoch": 1.241707640677437, + "grad_norm": 0.014964636415243149, + "learning_rate": 7.287432346633233e-05, + "loss": 0.629, + "step": 15910 + }, + { + "epoch": 1.242488098025443, + "grad_norm": 1.1818044640676817e-06, + "learning_rate": 7.283393889515112e-05, + "loss": 0.0488, + "step": 15920 + }, + { + "epoch": 1.2432685553734488, + "grad_norm": 45.90778350830078, + "learning_rate": 7.279353549381554e-05, + "loss": 2.102, + "step": 15930 + }, + { + "epoch": 1.2440490127214547, + "grad_norm": 0.10997821390628815, + "learning_rate": 7.275311329564453e-05, + "loss": 4.0926, + "step": 15940 + }, + { + "epoch": 1.2448294700694607, + "grad_norm": 4.090299606323242, + "learning_rate": 7.27126723339725e-05, + "loss": 0.3856, + "step": 15950 + }, + { + "epoch": 1.2456099274174666, + "grad_norm": 24.23040008544922, + "learning_rate": 7.267221264214936e-05, + "loss": 0.1491, + "step": 15960 + }, + { + "epoch": 1.2463903847654725, + "grad_norm": 65.79607391357422, + "learning_rate": 7.263173425354045e-05, + "loss": 1.4866, + "step": 15970 + }, + { + "epoch": 1.2471708421134786, + "grad_norm": 1.5714404582977295, + "learning_rate": 7.259123720152652e-05, + "loss": 1.1359, + "step": 15980 + }, + { + "epoch": 1.2479512994614845, + "grad_norm": 50.30306625366211, + "learning_rate": 7.255072151950376e-05, + "loss": 1.654, + "step": 15990 + }, + { + "epoch": 1.2487317568094904, + "grad_norm": 5.232126568444073e-05, + "learning_rate": 7.251018724088367e-05, + "loss": 1.5325, + "step": 16000 + }, + { + "epoch": 1.2495122141574964, + "grad_norm": 28.464221954345703, + "learning_rate": 7.246963439909309e-05, + "loss": 1.8179, + "step": 16010 + }, + { + "epoch": 1.2502926715055023, + "grad_norm": 0.07274913042783737, + "learning_rate": 7.24290630275742e-05, + "loss": 0.0345, + "step": 16020 + }, + { + "epoch": 1.2510731288535082, + "grad_norm": 0.0002677069860510528, + "learning_rate": 7.238847315978442e-05, + "loss": 0.5961, + "step": 16030 + }, + { + "epoch": 1.251853586201514, + "grad_norm": 10.164164543151855, + "learning_rate": 7.234786482919646e-05, + "loss": 0.55, + "step": 16040 + }, + { + "epoch": 1.25263404354952, + "grad_norm": 45.208030700683594, + "learning_rate": 7.230723806929824e-05, + "loss": 0.8216, + "step": 16050 + }, + { + "epoch": 1.253414500897526, + "grad_norm": 0.022487549111247063, + "learning_rate": 7.226659291359287e-05, + "loss": 1.2431, + "step": 16060 + }, + { + "epoch": 1.2541949582455318, + "grad_norm": 15.880640029907227, + "learning_rate": 7.222592939559867e-05, + "loss": 0.0487, + "step": 16070 + }, + { + "epoch": 1.2549754155935378, + "grad_norm": 2.3655835320823826e-05, + "learning_rate": 7.218524754884903e-05, + "loss": 0.1528, + "step": 16080 + }, + { + "epoch": 1.2557558729415437, + "grad_norm": 0.33056506514549255, + "learning_rate": 7.214454740689251e-05, + "loss": 1.1647, + "step": 16090 + }, + { + "epoch": 1.2565363302895496, + "grad_norm": 55.99821090698242, + "learning_rate": 7.210382900329275e-05, + "loss": 1.3796, + "step": 16100 + }, + { + "epoch": 1.2573167876375555, + "grad_norm": 5.380942457122728e-05, + "learning_rate": 7.206309237162844e-05, + "loss": 1.2422, + "step": 16110 + }, + { + "epoch": 1.2580972449855614, + "grad_norm": 28.779945373535156, + "learning_rate": 7.202233754549333e-05, + "loss": 0.417, + "step": 16120 + }, + { + "epoch": 1.2588777023335675, + "grad_norm": 0.0022038770839571953, + "learning_rate": 7.198156455849609e-05, + "loss": 0.9051, + "step": 16130 + }, + { + "epoch": 1.2596581596815735, + "grad_norm": 0.0036659492179751396, + "learning_rate": 7.194077344426048e-05, + "loss": 1.3457, + "step": 16140 + }, + { + "epoch": 1.2604386170295794, + "grad_norm": 36.93067932128906, + "learning_rate": 7.189996423642513e-05, + "loss": 1.5451, + "step": 16150 + }, + { + "epoch": 1.2612190743775853, + "grad_norm": 0.04987495392560959, + "learning_rate": 7.185913696864361e-05, + "loss": 1.13, + "step": 16160 + }, + { + "epoch": 1.2619995317255912, + "grad_norm": 0.07304591685533524, + "learning_rate": 7.181829167458441e-05, + "loss": 1.3668, + "step": 16170 + }, + { + "epoch": 1.2627799890735971, + "grad_norm": 1.3673689365386963, + "learning_rate": 7.177742838793083e-05, + "loss": 1.069, + "step": 16180 + }, + { + "epoch": 1.263560446421603, + "grad_norm": 16.000823974609375, + "learning_rate": 7.173654714238109e-05, + "loss": 0.9112, + "step": 16190 + }, + { + "epoch": 1.264340903769609, + "grad_norm": 1.9647473096847534, + "learning_rate": 7.169564797164814e-05, + "loss": 0.0715, + "step": 16200 + }, + { + "epoch": 1.2651213611176149, + "grad_norm": 1.1932563781738281, + "learning_rate": 7.165473090945975e-05, + "loss": 0.2619, + "step": 16210 + }, + { + "epoch": 1.2659018184656208, + "grad_norm": 2.668316602706909, + "learning_rate": 7.161379598955843e-05, + "loss": 0.5574, + "step": 16220 + }, + { + "epoch": 1.266682275813627, + "grad_norm": 1.0716549425637822e-08, + "learning_rate": 7.157284324570144e-05, + "loss": 0.6693, + "step": 16230 + }, + { + "epoch": 1.2674627331616328, + "grad_norm": 9.179781773127615e-05, + "learning_rate": 7.153187271166071e-05, + "loss": 1.299, + "step": 16240 + }, + { + "epoch": 1.2682431905096387, + "grad_norm": 59.244930267333984, + "learning_rate": 7.149088442122284e-05, + "loss": 1.2305, + "step": 16250 + }, + { + "epoch": 1.2690236478576447, + "grad_norm": 18.719036102294922, + "learning_rate": 7.144987840818914e-05, + "loss": 0.2246, + "step": 16260 + }, + { + "epoch": 1.2698041052056506, + "grad_norm": 0.21231059730052948, + "learning_rate": 7.140885470637542e-05, + "loss": 1.4923, + "step": 16270 + }, + { + "epoch": 1.2705845625536565, + "grad_norm": 3.356634579176898e-07, + "learning_rate": 7.136781334961219e-05, + "loss": 0.5428, + "step": 16280 + }, + { + "epoch": 1.2713650199016624, + "grad_norm": 18.25281524658203, + "learning_rate": 7.132675437174443e-05, + "loss": 0.8539, + "step": 16290 + }, + { + "epoch": 1.2721454772496683, + "grad_norm": 3.0588083177462977e-07, + "learning_rate": 7.128567780663171e-05, + "loss": 0.1291, + "step": 16300 + }, + { + "epoch": 1.2729259345976742, + "grad_norm": 32.88163757324219, + "learning_rate": 7.124458368814809e-05, + "loss": 2.0664, + "step": 16310 + }, + { + "epoch": 1.2737063919456801, + "grad_norm": 3.1073005199432373, + "learning_rate": 7.120347205018208e-05, + "loss": 1.4382, + "step": 16320 + }, + { + "epoch": 1.274486849293686, + "grad_norm": 48.93598175048828, + "learning_rate": 7.116234292663667e-05, + "loss": 1.4327, + "step": 16330 + }, + { + "epoch": 1.275267306641692, + "grad_norm": 4.686527554920161e-11, + "learning_rate": 7.112119635142923e-05, + "loss": 0.0149, + "step": 16340 + }, + { + "epoch": 1.2760477639896979, + "grad_norm": 2.506943702697754, + "learning_rate": 7.108003235849158e-05, + "loss": 0.9217, + "step": 16350 + }, + { + "epoch": 1.2768282213377038, + "grad_norm": 0.007850771769881248, + "learning_rate": 7.103885098176987e-05, + "loss": 1.2819, + "step": 16360 + }, + { + "epoch": 1.2776086786857097, + "grad_norm": 1.201541543006897, + "learning_rate": 7.099765225522456e-05, + "loss": 0.5083, + "step": 16370 + }, + { + "epoch": 1.2783891360337156, + "grad_norm": 1.7927274703979492, + "learning_rate": 7.095643621283045e-05, + "loss": 0.7671, + "step": 16380 + }, + { + "epoch": 1.2791695933817218, + "grad_norm": 49.63917541503906, + "learning_rate": 7.091520288857665e-05, + "loss": 2.2272, + "step": 16390 + }, + { + "epoch": 1.2799500507297277, + "grad_norm": 42.62212371826172, + "learning_rate": 7.087395231646645e-05, + "loss": 1.0197, + "step": 16400 + }, + { + "epoch": 1.2807305080777336, + "grad_norm": 16.70367431640625, + "learning_rate": 7.08326845305174e-05, + "loss": 0.5912, + "step": 16410 + }, + { + "epoch": 1.2815109654257395, + "grad_norm": 0.021338213235139847, + "learning_rate": 7.079139956476126e-05, + "loss": 0.1044, + "step": 16420 + }, + { + "epoch": 1.2822914227737454, + "grad_norm": 0.017756953835487366, + "learning_rate": 7.075009745324395e-05, + "loss": 1.3735, + "step": 16430 + }, + { + "epoch": 1.2830718801217513, + "grad_norm": 7.095729351043701, + "learning_rate": 7.070877823002547e-05, + "loss": 0.3988, + "step": 16440 + }, + { + "epoch": 1.2838523374697572, + "grad_norm": 38.567604064941406, + "learning_rate": 7.066744192918005e-05, + "loss": 1.1289, + "step": 16450 + }, + { + "epoch": 1.2846327948177632, + "grad_norm": 0.00015506644558627158, + "learning_rate": 7.06260885847959e-05, + "loss": 0.3998, + "step": 16460 + }, + { + "epoch": 1.285413252165769, + "grad_norm": 16.752622604370117, + "learning_rate": 7.058471823097533e-05, + "loss": 0.2616, + "step": 16470 + }, + { + "epoch": 1.2861937095137752, + "grad_norm": 2.3494606018066406, + "learning_rate": 7.054333090183465e-05, + "loss": 4.6909, + "step": 16480 + }, + { + "epoch": 1.2869741668617811, + "grad_norm": 46.26172637939453, + "learning_rate": 7.050192663150422e-05, + "loss": 1.0096, + "step": 16490 + }, + { + "epoch": 1.287754624209787, + "grad_norm": 1.201383352279663, + "learning_rate": 7.046050545412831e-05, + "loss": 1.0108, + "step": 16500 + }, + { + "epoch": 1.288535081557793, + "grad_norm": 5.515385055332445e-07, + "learning_rate": 7.041906740386518e-05, + "loss": 1.6396, + "step": 16510 + }, + { + "epoch": 1.2893155389057989, + "grad_norm": 7.240560531616211, + "learning_rate": 7.037761251488696e-05, + "loss": 0.1567, + "step": 16520 + }, + { + "epoch": 1.2900959962538048, + "grad_norm": 1.1850215196609497, + "learning_rate": 7.03361408213797e-05, + "loss": 0.5309, + "step": 16530 + }, + { + "epoch": 1.2908764536018107, + "grad_norm": 0.10555583238601685, + "learning_rate": 7.029465235754331e-05, + "loss": 0.4628, + "step": 16540 + }, + { + "epoch": 1.2916569109498166, + "grad_norm": 46.775550842285156, + "learning_rate": 7.025314715759153e-05, + "loss": 1.839, + "step": 16550 + }, + { + "epoch": 1.2924373682978225, + "grad_norm": 8.276897430419922, + "learning_rate": 7.021162525575183e-05, + "loss": 0.0247, + "step": 16560 + }, + { + "epoch": 1.2932178256458284, + "grad_norm": 62.0821533203125, + "learning_rate": 7.017008668626557e-05, + "loss": 1.0257, + "step": 16570 + }, + { + "epoch": 1.2939982829938343, + "grad_norm": 61.38323211669922, + "learning_rate": 7.01285314833878e-05, + "loss": 0.9119, + "step": 16580 + }, + { + "epoch": 1.2947787403418403, + "grad_norm": 24.953426361083984, + "learning_rate": 7.008695968138725e-05, + "loss": 1.2479, + "step": 16590 + }, + { + "epoch": 1.2955591976898462, + "grad_norm": 0.4902319014072418, + "learning_rate": 7.004537131454638e-05, + "loss": 0.7, + "step": 16600 + }, + { + "epoch": 1.296339655037852, + "grad_norm": 0.020014774054288864, + "learning_rate": 7.000376641716133e-05, + "loss": 0.0599, + "step": 16610 + }, + { + "epoch": 1.297120112385858, + "grad_norm": 10.10654067993164, + "learning_rate": 6.996214502354183e-05, + "loss": 0.853, + "step": 16620 + }, + { + "epoch": 1.297900569733864, + "grad_norm": 0.034189675003290176, + "learning_rate": 6.992050716801122e-05, + "loss": 2.4615, + "step": 16630 + }, + { + "epoch": 1.29868102708187, + "grad_norm": 17.056989669799805, + "learning_rate": 6.987885288490643e-05, + "loss": 1.7232, + "step": 16640 + }, + { + "epoch": 1.299461484429876, + "grad_norm": 3.8021726608276367, + "learning_rate": 6.983718220857795e-05, + "loss": 0.9082, + "step": 16650 + }, + { + "epoch": 1.3002419417778819, + "grad_norm": 0.0016640127869322896, + "learning_rate": 6.979549517338976e-05, + "loss": 0.7894, + "step": 16660 + }, + { + "epoch": 1.3010223991258878, + "grad_norm": 0.0018107325304299593, + "learning_rate": 6.975379181371932e-05, + "loss": 0.4747, + "step": 16670 + }, + { + "epoch": 1.3018028564738937, + "grad_norm": 26.44985008239746, + "learning_rate": 6.97120721639576e-05, + "loss": 1.3115, + "step": 16680 + }, + { + "epoch": 1.3025833138218996, + "grad_norm": 0.4625835716724396, + "learning_rate": 6.967033625850897e-05, + "loss": 0.9148, + "step": 16690 + }, + { + "epoch": 1.3033637711699055, + "grad_norm": 4.2758065887937846e-07, + "learning_rate": 6.962858413179121e-05, + "loss": 1.4819, + "step": 16700 + }, + { + "epoch": 1.3041442285179115, + "grad_norm": 1.5021567344665527, + "learning_rate": 6.958681581823547e-05, + "loss": 0.4725, + "step": 16710 + }, + { + "epoch": 1.3049246858659174, + "grad_norm": 0.45929327607154846, + "learning_rate": 6.95450313522863e-05, + "loss": 0.0265, + "step": 16720 + }, + { + "epoch": 1.3057051432139233, + "grad_norm": 63.05253601074219, + "learning_rate": 6.950323076840147e-05, + "loss": 0.4704, + "step": 16730 + }, + { + "epoch": 1.3064856005619294, + "grad_norm": 53.459842681884766, + "learning_rate": 6.946141410105213e-05, + "loss": 0.6937, + "step": 16740 + }, + { + "epoch": 1.3072660579099353, + "grad_norm": 0.03195926547050476, + "learning_rate": 6.941958138472267e-05, + "loss": 0.1157, + "step": 16750 + }, + { + "epoch": 1.3080465152579412, + "grad_norm": 57.97876739501953, + "learning_rate": 6.937773265391068e-05, + "loss": 1.4795, + "step": 16760 + }, + { + "epoch": 1.3088269726059472, + "grad_norm": 52.485504150390625, + "learning_rate": 6.933586794312702e-05, + "loss": 1.568, + "step": 16770 + }, + { + "epoch": 1.309607429953953, + "grad_norm": 1.3821322917938232, + "learning_rate": 6.929398728689567e-05, + "loss": 0.6191, + "step": 16780 + }, + { + "epoch": 1.310387887301959, + "grad_norm": 3.0984549522399902, + "learning_rate": 6.925209071975379e-05, + "loss": 0.9458, + "step": 16790 + }, + { + "epoch": 1.311168344649965, + "grad_norm": 51.80403518676758, + "learning_rate": 6.921017827625164e-05, + "loss": 0.7153, + "step": 16800 + }, + { + "epoch": 1.3119488019979708, + "grad_norm": 6.354888319037855e-05, + "learning_rate": 6.916824999095262e-05, + "loss": 0.8756, + "step": 16810 + }, + { + "epoch": 1.3127292593459767, + "grad_norm": 23.010360717773438, + "learning_rate": 6.912630589843312e-05, + "loss": 1.3956, + "step": 16820 + }, + { + "epoch": 1.3135097166939826, + "grad_norm": 2.183323860168457, + "learning_rate": 6.908434603328263e-05, + "loss": 1.2873, + "step": 16830 + }, + { + "epoch": 1.3142901740419886, + "grad_norm": 0.3114471435546875, + "learning_rate": 6.90423704301036e-05, + "loss": 2.5646, + "step": 16840 + }, + { + "epoch": 1.3150706313899945, + "grad_norm": 4.872425506619038e-06, + "learning_rate": 6.90003791235115e-05, + "loss": 1.9407, + "step": 16850 + }, + { + "epoch": 1.3158510887380004, + "grad_norm": 0.31955668330192566, + "learning_rate": 6.895837214813474e-05, + "loss": 0.4642, + "step": 16860 + }, + { + "epoch": 1.3166315460860063, + "grad_norm": 3.809271447607898e-06, + "learning_rate": 6.89163495386146e-05, + "loss": 0.579, + "step": 16870 + }, + { + "epoch": 1.3174120034340122, + "grad_norm": 7.399552032438805e-06, + "learning_rate": 6.887431132960533e-05, + "loss": 0.638, + "step": 16880 + }, + { + "epoch": 1.3181924607820181, + "grad_norm": 6.543385825352743e-05, + "learning_rate": 6.883225755577401e-05, + "loss": 0.665, + "step": 16890 + }, + { + "epoch": 1.3189729181300243, + "grad_norm": 29.476024627685547, + "learning_rate": 6.879018825180055e-05, + "loss": 1.4586, + "step": 16900 + }, + { + "epoch": 1.3197533754780302, + "grad_norm": 6.078284059185535e-06, + "learning_rate": 6.874810345237766e-05, + "loss": 1.0899, + "step": 16910 + }, + { + "epoch": 1.320533832826036, + "grad_norm": 31.434192657470703, + "learning_rate": 6.870600319221085e-05, + "loss": 0.5962, + "step": 16920 + }, + { + "epoch": 1.321314290174042, + "grad_norm": 31.558059692382812, + "learning_rate": 6.866388750601837e-05, + "loss": 0.7745, + "step": 16930 + }, + { + "epoch": 1.322094747522048, + "grad_norm": 32.943885803222656, + "learning_rate": 6.862175642853119e-05, + "loss": 0.7585, + "step": 16940 + }, + { + "epoch": 1.3228752048700538, + "grad_norm": 9.586359977722168, + "learning_rate": 6.857960999449297e-05, + "loss": 0.6813, + "step": 16950 + }, + { + "epoch": 1.3236556622180597, + "grad_norm": 19.564783096313477, + "learning_rate": 6.853744823866004e-05, + "loss": 1.1954, + "step": 16960 + }, + { + "epoch": 1.3244361195660657, + "grad_norm": 0.0008135417592711747, + "learning_rate": 6.849527119580136e-05, + "loss": 0.4992, + "step": 16970 + }, + { + "epoch": 1.3252165769140716, + "grad_norm": 0.0034319348633289337, + "learning_rate": 6.84530789006985e-05, + "loss": 1.8232, + "step": 16980 + }, + { + "epoch": 1.3259970342620777, + "grad_norm": 5.529355049133301, + "learning_rate": 6.841087138814562e-05, + "loss": 1.1755, + "step": 16990 + }, + { + "epoch": 1.3267774916100836, + "grad_norm": 1.479116439819336, + "learning_rate": 6.836864869294939e-05, + "loss": 0.5419, + "step": 17000 + }, + { + "epoch": 1.3275579489580895, + "grad_norm": 14.388936996459961, + "learning_rate": 6.832641084992907e-05, + "loss": 1.7516, + "step": 17010 + }, + { + "epoch": 1.3283384063060955, + "grad_norm": 6.888252258300781, + "learning_rate": 6.828415789391631e-05, + "loss": 0.8862, + "step": 17020 + }, + { + "epoch": 1.3291188636541014, + "grad_norm": 0.2259782999753952, + "learning_rate": 6.824188985975533e-05, + "loss": 0.7888, + "step": 17030 + }, + { + "epoch": 1.3298993210021073, + "grad_norm": 0.20432275533676147, + "learning_rate": 6.819960678230271e-05, + "loss": 0.8379, + "step": 17040 + }, + { + "epoch": 1.3306797783501132, + "grad_norm": 12.537790298461914, + "learning_rate": 6.815730869642747e-05, + "loss": 0.807, + "step": 17050 + }, + { + "epoch": 1.331460235698119, + "grad_norm": 0.02838488109409809, + "learning_rate": 6.8114995637011e-05, + "loss": 0.4556, + "step": 17060 + }, + { + "epoch": 1.332240693046125, + "grad_norm": 0.22770993411540985, + "learning_rate": 6.807266763894702e-05, + "loss": 0.4645, + "step": 17070 + }, + { + "epoch": 1.333021150394131, + "grad_norm": 13.060320854187012, + "learning_rate": 6.803032473714162e-05, + "loss": 0.9662, + "step": 17080 + }, + { + "epoch": 1.3338016077421369, + "grad_norm": 42.66017532348633, + "learning_rate": 6.798796696651313e-05, + "loss": 1.1246, + "step": 17090 + }, + { + "epoch": 1.3345820650901428, + "grad_norm": 0.00010493778245290741, + "learning_rate": 6.794559436199213e-05, + "loss": 1.2048, + "step": 17100 + }, + { + "epoch": 1.3353625224381487, + "grad_norm": 0.5220862030982971, + "learning_rate": 6.79032069585215e-05, + "loss": 1.1721, + "step": 17110 + }, + { + "epoch": 1.3361429797861546, + "grad_norm": 0.18259654939174652, + "learning_rate": 6.786080479105627e-05, + "loss": 0.8042, + "step": 17120 + }, + { + "epoch": 1.3369234371341605, + "grad_norm": 0.060175586491823196, + "learning_rate": 6.781838789456364e-05, + "loss": 0.7491, + "step": 17130 + }, + { + "epoch": 1.3377038944821664, + "grad_norm": 55.724971771240234, + "learning_rate": 6.7775956304023e-05, + "loss": 0.8063, + "step": 17140 + }, + { + "epoch": 1.3384843518301726, + "grad_norm": 19.680768966674805, + "learning_rate": 6.773351005442583e-05, + "loss": 0.9294, + "step": 17150 + }, + { + "epoch": 1.3392648091781785, + "grad_norm": 8.618398666381836, + "learning_rate": 6.769104918077572e-05, + "loss": 0.6359, + "step": 17160 + }, + { + "epoch": 1.3400452665261844, + "grad_norm": 1.873159405363367e-08, + "learning_rate": 6.764857371808826e-05, + "loss": 2.5235, + "step": 17170 + }, + { + "epoch": 1.3408257238741903, + "grad_norm": 52.72881317138672, + "learning_rate": 6.760608370139112e-05, + "loss": 1.2153, + "step": 17180 + }, + { + "epoch": 1.3416061812221962, + "grad_norm": 1.9692984819412231, + "learning_rate": 6.7563579165724e-05, + "loss": 0.5641, + "step": 17190 + }, + { + "epoch": 1.3423866385702021, + "grad_norm": 9.65872168308124e-05, + "learning_rate": 6.752106014613852e-05, + "loss": 2.7393, + "step": 17200 + }, + { + "epoch": 1.343167095918208, + "grad_norm": 0.12596841156482697, + "learning_rate": 6.747852667769827e-05, + "loss": 0.6117, + "step": 17210 + }, + { + "epoch": 1.343947553266214, + "grad_norm": 0.0009383680881001055, + "learning_rate": 6.743597879547872e-05, + "loss": 0.2199, + "step": 17220 + }, + { + "epoch": 1.3447280106142199, + "grad_norm": 0.27956098318099976, + "learning_rate": 6.739341653456728e-05, + "loss": 0.9869, + "step": 17230 + }, + { + "epoch": 1.3455084679622258, + "grad_norm": 11.534627914428711, + "learning_rate": 6.735083993006319e-05, + "loss": 0.9009, + "step": 17240 + }, + { + "epoch": 1.346288925310232, + "grad_norm": 7.020449638366699, + "learning_rate": 6.73082490170775e-05, + "loss": 0.3904, + "step": 17250 + }, + { + "epoch": 1.3470693826582378, + "grad_norm": 0.4477950930595398, + "learning_rate": 6.72656438307331e-05, + "loss": 0.6724, + "step": 17260 + }, + { + "epoch": 1.3478498400062437, + "grad_norm": 0.014391960576176643, + "learning_rate": 6.722302440616463e-05, + "loss": 0.2966, + "step": 17270 + }, + { + "epoch": 1.3486302973542497, + "grad_norm": 55.42076110839844, + "learning_rate": 6.718039077851848e-05, + "loss": 0.9194, + "step": 17280 + }, + { + "epoch": 1.3494107547022556, + "grad_norm": 56.1174430847168, + "learning_rate": 6.713774298295273e-05, + "loss": 1.6298, + "step": 17290 + }, + { + "epoch": 1.3501912120502615, + "grad_norm": 29.674327850341797, + "learning_rate": 6.709508105463716e-05, + "loss": 1.4175, + "step": 17300 + }, + { + "epoch": 1.3509716693982674, + "grad_norm": 37.25205612182617, + "learning_rate": 6.705240502875318e-05, + "loss": 0.7216, + "step": 17310 + }, + { + "epoch": 1.3517521267462733, + "grad_norm": 0.0031614259351044893, + "learning_rate": 6.70097149404939e-05, + "loss": 1.0236, + "step": 17320 + }, + { + "epoch": 1.3525325840942792, + "grad_norm": 58.900535583496094, + "learning_rate": 6.696701082506395e-05, + "loss": 0.8832, + "step": 17330 + }, + { + "epoch": 1.3533130414422851, + "grad_norm": 4.175566673278809, + "learning_rate": 6.692429271767953e-05, + "loss": 0.2608, + "step": 17340 + }, + { + "epoch": 1.354093498790291, + "grad_norm": 39.607295989990234, + "learning_rate": 6.688156065356844e-05, + "loss": 1.051, + "step": 17350 + }, + { + "epoch": 1.354873956138297, + "grad_norm": 5.6130111261154525e-06, + "learning_rate": 6.683881466796994e-05, + "loss": 0.3924, + "step": 17360 + }, + { + "epoch": 1.355654413486303, + "grad_norm": 46.783145904541016, + "learning_rate": 6.679605479613477e-05, + "loss": 3.4201, + "step": 17370 + }, + { + "epoch": 1.3564348708343088, + "grad_norm": 0.22098881006240845, + "learning_rate": 6.675328107332513e-05, + "loss": 0.9662, + "step": 17380 + }, + { + "epoch": 1.3572153281823147, + "grad_norm": 0.0011627674102783203, + "learning_rate": 6.671049353481466e-05, + "loss": 0.7548, + "step": 17390 + }, + { + "epoch": 1.3579957855303206, + "grad_norm": 38.597557067871094, + "learning_rate": 6.666769221588839e-05, + "loss": 1.0412, + "step": 17400 + }, + { + "epoch": 1.3587762428783268, + "grad_norm": 11.341829299926758, + "learning_rate": 6.662487715184266e-05, + "loss": 0.7493, + "step": 17410 + }, + { + "epoch": 1.3595567002263327, + "grad_norm": 0.001878074835985899, + "learning_rate": 6.658204837798523e-05, + "loss": 0.7333, + "step": 17420 + }, + { + "epoch": 1.3603371575743386, + "grad_norm": 0.40023699402809143, + "learning_rate": 6.65392059296351e-05, + "loss": 0.2432, + "step": 17430 + }, + { + "epoch": 1.3611176149223445, + "grad_norm": 0.11613233387470245, + "learning_rate": 6.649634984212258e-05, + "loss": 0.5323, + "step": 17440 + }, + { + "epoch": 1.3618980722703504, + "grad_norm": 2.2078163623809814, + "learning_rate": 6.64534801507892e-05, + "loss": 1.3115, + "step": 17450 + }, + { + "epoch": 1.3626785296183563, + "grad_norm": 3.0959393978118896, + "learning_rate": 6.641059689098775e-05, + "loss": 0.7336, + "step": 17460 + }, + { + "epoch": 1.3634589869663623, + "grad_norm": 8.56583309173584, + "learning_rate": 6.636770009808219e-05, + "loss": 0.5928, + "step": 17470 + }, + { + "epoch": 1.3642394443143682, + "grad_norm": 20.7721004486084, + "learning_rate": 6.63247898074476e-05, + "loss": 1.0434, + "step": 17480 + }, + { + "epoch": 1.365019901662374, + "grad_norm": 0.17920184135437012, + "learning_rate": 6.628186605447027e-05, + "loss": 0.7184, + "step": 17490 + }, + { + "epoch": 1.3658003590103802, + "grad_norm": 0.18073244392871857, + "learning_rate": 6.623892887454752e-05, + "loss": 1.1548, + "step": 17500 + }, + { + "epoch": 1.3665808163583861, + "grad_norm": 0.0002529042540118098, + "learning_rate": 6.619597830308776e-05, + "loss": 1.0218, + "step": 17510 + }, + { + "epoch": 1.367361273706392, + "grad_norm": 4.4073667027078045e-07, + "learning_rate": 6.615301437551051e-05, + "loss": 0.4125, + "step": 17520 + }, + { + "epoch": 1.368141731054398, + "grad_norm": 0.0018284658435732126, + "learning_rate": 6.611003712724617e-05, + "loss": 0.2231, + "step": 17530 + }, + { + "epoch": 1.3689221884024039, + "grad_norm": 8.416059494018555, + "learning_rate": 6.606704659373628e-05, + "loss": 0.3617, + "step": 17540 + }, + { + "epoch": 1.3697026457504098, + "grad_norm": 113.29763793945312, + "learning_rate": 6.602404281043322e-05, + "loss": 0.9754, + "step": 17550 + }, + { + "epoch": 1.3704831030984157, + "grad_norm": 56.287540435791016, + "learning_rate": 6.598102581280032e-05, + "loss": 2.2883, + "step": 17560 + }, + { + "epoch": 1.3712635604464216, + "grad_norm": 3.09262752532959, + "learning_rate": 6.593799563631186e-05, + "loss": 0.6381, + "step": 17570 + }, + { + "epoch": 1.3720440177944275, + "grad_norm": 5.676676273345947, + "learning_rate": 6.589495231645293e-05, + "loss": 0.9866, + "step": 17580 + }, + { + "epoch": 1.3728244751424334, + "grad_norm": 0.01620888151228428, + "learning_rate": 6.585189588871952e-05, + "loss": 0.3731, + "step": 17590 + }, + { + "epoch": 1.3736049324904394, + "grad_norm": 0.41776251792907715, + "learning_rate": 6.580882638861832e-05, + "loss": 0.9153, + "step": 17600 + }, + { + "epoch": 1.3743853898384453, + "grad_norm": 1.9870280084433034e-05, + "learning_rate": 6.57657438516669e-05, + "loss": 0.5748, + "step": 17610 + }, + { + "epoch": 1.3751658471864512, + "grad_norm": 0.0008792297448962927, + "learning_rate": 6.572264831339358e-05, + "loss": 0.1678, + "step": 17620 + }, + { + "epoch": 1.375946304534457, + "grad_norm": 6.01303113967333e-08, + "learning_rate": 6.567953980933735e-05, + "loss": 2.9064, + "step": 17630 + }, + { + "epoch": 1.376726761882463, + "grad_norm": 0.37628453969955444, + "learning_rate": 6.563641837504791e-05, + "loss": 0.0403, + "step": 17640 + }, + { + "epoch": 1.377507219230469, + "grad_norm": 0.26884397864341736, + "learning_rate": 6.55932840460856e-05, + "loss": 0.8042, + "step": 17650 + }, + { + "epoch": 1.378287676578475, + "grad_norm": 0.0005718205939047039, + "learning_rate": 6.555013685802148e-05, + "loss": 0.7636, + "step": 17660 + }, + { + "epoch": 1.379068133926481, + "grad_norm": 5.187681198120117, + "learning_rate": 6.550697684643715e-05, + "loss": 1.6329, + "step": 17670 + }, + { + "epoch": 1.379848591274487, + "grad_norm": 0.003713503247126937, + "learning_rate": 6.546380404692474e-05, + "loss": 1.7662, + "step": 17680 + }, + { + "epoch": 1.3806290486224928, + "grad_norm": 67.9324951171875, + "learning_rate": 6.542061849508701e-05, + "loss": 3.2811, + "step": 17690 + }, + { + "epoch": 1.3814095059704987, + "grad_norm": 4.829217433929443, + "learning_rate": 6.537742022653721e-05, + "loss": 0.6113, + "step": 17700 + }, + { + "epoch": 1.3821899633185046, + "grad_norm": 4.9284699343843386e-05, + "learning_rate": 6.533420927689905e-05, + "loss": 2.4698, + "step": 17710 + }, + { + "epoch": 1.3829704206665105, + "grad_norm": 36.11857604980469, + "learning_rate": 6.529098568180672e-05, + "loss": 3.1013, + "step": 17720 + }, + { + "epoch": 1.3837508780145165, + "grad_norm": 35.37229919433594, + "learning_rate": 6.524774947690483e-05, + "loss": 0.8092, + "step": 17730 + }, + { + "epoch": 1.3845313353625224, + "grad_norm": 3.6355788707733154, + "learning_rate": 6.520450069784844e-05, + "loss": 0.1253, + "step": 17740 + }, + { + "epoch": 1.3853117927105283, + "grad_norm": 0.01998009905219078, + "learning_rate": 6.516123938030287e-05, + "loss": 0.8166, + "step": 17750 + }, + { + "epoch": 1.3860922500585344, + "grad_norm": 1.4495409727096558, + "learning_rate": 6.511796555994388e-05, + "loss": 0.7859, + "step": 17760 + }, + { + "epoch": 1.3868727074065403, + "grad_norm": 31.480539321899414, + "learning_rate": 6.50746792724575e-05, + "loss": 0.98, + "step": 17770 + }, + { + "epoch": 1.3876531647545463, + "grad_norm": 38.0267219543457, + "learning_rate": 6.503138055354005e-05, + "loss": 0.9297, + "step": 17780 + }, + { + "epoch": 1.3884336221025522, + "grad_norm": 0.6976431608200073, + "learning_rate": 6.498806943889808e-05, + "loss": 0.8869, + "step": 17790 + }, + { + "epoch": 1.389214079450558, + "grad_norm": 0.3048703074455261, + "learning_rate": 6.494474596424837e-05, + "loss": 0.3903, + "step": 17800 + }, + { + "epoch": 1.389994536798564, + "grad_norm": 0.23949263989925385, + "learning_rate": 6.490141016531794e-05, + "loss": 1.4834, + "step": 17810 + }, + { + "epoch": 1.39077499414657, + "grad_norm": 42.83589172363281, + "learning_rate": 6.485806207784389e-05, + "loss": 1.3046, + "step": 17820 + }, + { + "epoch": 1.3915554514945758, + "grad_norm": 2.6972645628120517e-06, + "learning_rate": 6.481470173757353e-05, + "loss": 0.6909, + "step": 17830 + }, + { + "epoch": 1.3923359088425817, + "grad_norm": 39.857086181640625, + "learning_rate": 6.477132918026424e-05, + "loss": 1.4446, + "step": 17840 + }, + { + "epoch": 1.3931163661905877, + "grad_norm": 0.00047229923075065017, + "learning_rate": 6.472794444168346e-05, + "loss": 0.0383, + "step": 17850 + }, + { + "epoch": 1.3938968235385936, + "grad_norm": 10.687626838684082, + "learning_rate": 6.468454755760872e-05, + "loss": 0.1296, + "step": 17860 + }, + { + "epoch": 1.3946772808865995, + "grad_norm": 0.04510192945599556, + "learning_rate": 6.464113856382752e-05, + "loss": 1.2684, + "step": 17870 + }, + { + "epoch": 1.3954577382346054, + "grad_norm": 1.947009442559988e-09, + "learning_rate": 6.459771749613738e-05, + "loss": 1.0827, + "step": 17880 + }, + { + "epoch": 1.3962381955826113, + "grad_norm": 0.4970480799674988, + "learning_rate": 6.455428439034574e-05, + "loss": 0.9025, + "step": 17890 + }, + { + "epoch": 1.3970186529306172, + "grad_norm": 4.4791399034238566e-08, + "learning_rate": 6.451083928227e-05, + "loss": 0.8598, + "step": 17900 + }, + { + "epoch": 1.3977991102786231, + "grad_norm": 68.11105346679688, + "learning_rate": 6.446738220773744e-05, + "loss": 2.0488, + "step": 17910 + }, + { + "epoch": 1.3985795676266293, + "grad_norm": 0.050787825137376785, + "learning_rate": 6.442391320258525e-05, + "loss": 1.1975, + "step": 17920 + }, + { + "epoch": 1.3993600249746352, + "grad_norm": 0.7949303984642029, + "learning_rate": 6.43804323026604e-05, + "loss": 0.9232, + "step": 17930 + }, + { + "epoch": 1.400140482322641, + "grad_norm": 0.28324851393699646, + "learning_rate": 6.433693954381967e-05, + "loss": 1.2801, + "step": 17940 + }, + { + "epoch": 1.400920939670647, + "grad_norm": 1.937788724899292, + "learning_rate": 6.429343496192969e-05, + "loss": 1.3638, + "step": 17950 + }, + { + "epoch": 1.401701397018653, + "grad_norm": 1.130745530128479, + "learning_rate": 6.424991859286674e-05, + "loss": 0.603, + "step": 17960 + }, + { + "epoch": 1.4024818543666588, + "grad_norm": 0.07360313087701797, + "learning_rate": 6.420639047251692e-05, + "loss": 0.0272, + "step": 17970 + }, + { + "epoch": 1.4032623117146648, + "grad_norm": 0.01609094999730587, + "learning_rate": 6.416285063677597e-05, + "loss": 1.3349, + "step": 17980 + }, + { + "epoch": 1.4040427690626707, + "grad_norm": 40.28419494628906, + "learning_rate": 6.411929912154925e-05, + "loss": 1.044, + "step": 17990 + }, + { + "epoch": 1.4048232264106766, + "grad_norm": 72.37548828125, + "learning_rate": 6.407573596275185e-05, + "loss": 1.9599, + "step": 18000 + }, + { + "epoch": 1.4056036837586827, + "grad_norm": 0.15986989438533783, + "learning_rate": 6.403216119630838e-05, + "loss": 0.9964, + "step": 18010 + }, + { + "epoch": 1.4063841411066886, + "grad_norm": 0.09070328623056412, + "learning_rate": 6.398857485815306e-05, + "loss": 0.6331, + "step": 18020 + }, + { + "epoch": 1.4071645984546945, + "grad_norm": 10.145639419555664, + "learning_rate": 6.394497698422964e-05, + "loss": 0.761, + "step": 18030 + }, + { + "epoch": 1.4079450558027005, + "grad_norm": 1.3403260707855225, + "learning_rate": 6.390136761049137e-05, + "loss": 0.6732, + "step": 18040 + }, + { + "epoch": 1.4087255131507064, + "grad_norm": 4.114490032196045, + "learning_rate": 6.385774677290104e-05, + "loss": 0.4664, + "step": 18050 + }, + { + "epoch": 1.4095059704987123, + "grad_norm": 2.115405559539795, + "learning_rate": 6.381411450743084e-05, + "loss": 0.3374, + "step": 18060 + }, + { + "epoch": 1.4102864278467182, + "grad_norm": 7.303625106811523, + "learning_rate": 6.377047085006236e-05, + "loss": 3.1412, + "step": 18070 + }, + { + "epoch": 1.4110668851947241, + "grad_norm": 10.102337837219238, + "learning_rate": 6.372681583678668e-05, + "loss": 0.8387, + "step": 18080 + }, + { + "epoch": 1.41184734254273, + "grad_norm": 0.28348350524902344, + "learning_rate": 6.368314950360415e-05, + "loss": 1.0219, + "step": 18090 + }, + { + "epoch": 1.412627799890736, + "grad_norm": 0.0021556804422289133, + "learning_rate": 6.36394718865245e-05, + "loss": 0.2351, + "step": 18100 + }, + { + "epoch": 1.4134082572387419, + "grad_norm": 0.0003699703374877572, + "learning_rate": 6.359578302156675e-05, + "loss": 0.0308, + "step": 18110 + }, + { + "epoch": 1.4141887145867478, + "grad_norm": 55.59209442138672, + "learning_rate": 6.355208294475923e-05, + "loss": 0.669, + "step": 18120 + }, + { + "epoch": 1.4149691719347537, + "grad_norm": 1.8222777843475342, + "learning_rate": 6.350837169213946e-05, + "loss": 1.5468, + "step": 18130 + }, + { + "epoch": 1.4157496292827596, + "grad_norm": 38.04872131347656, + "learning_rate": 6.346464929975422e-05, + "loss": 0.9933, + "step": 18140 + }, + { + "epoch": 1.4165300866307655, + "grad_norm": 4.993416786193848, + "learning_rate": 6.342091580365946e-05, + "loss": 1.25, + "step": 18150 + }, + { + "epoch": 1.4173105439787714, + "grad_norm": 3.710566997528076, + "learning_rate": 6.337717123992027e-05, + "loss": 0.7554, + "step": 18160 + }, + { + "epoch": 1.4180910013267776, + "grad_norm": 1.022260308265686, + "learning_rate": 6.333341564461092e-05, + "loss": 1.3403, + "step": 18170 + }, + { + "epoch": 1.4188714586747835, + "grad_norm": 43.93464279174805, + "learning_rate": 6.328964905381472e-05, + "loss": 2.0506, + "step": 18180 + }, + { + "epoch": 1.4196519160227894, + "grad_norm": 4.084843158721924, + "learning_rate": 6.324587150362408e-05, + "loss": 0.8518, + "step": 18190 + }, + { + "epoch": 1.4204323733707953, + "grad_norm": 0.01947934739291668, + "learning_rate": 6.320208303014043e-05, + "loss": 0.301, + "step": 18200 + }, + { + "epoch": 1.4212128307188012, + "grad_norm": 34.18440246582031, + "learning_rate": 6.31582836694742e-05, + "loss": 0.4866, + "step": 18210 + }, + { + "epoch": 1.4219932880668071, + "grad_norm": 4.792870044708252, + "learning_rate": 6.311447345774486e-05, + "loss": 0.3177, + "step": 18220 + }, + { + "epoch": 1.422773745414813, + "grad_norm": 1.2343689377303235e-06, + "learning_rate": 6.307065243108072e-05, + "loss": 1.2798, + "step": 18230 + }, + { + "epoch": 1.423554202762819, + "grad_norm": 1.0866369009017944, + "learning_rate": 6.302682062561914e-05, + "loss": 0.1932, + "step": 18240 + }, + { + "epoch": 1.4243346601108249, + "grad_norm": 0.0021116959396749735, + "learning_rate": 6.298297807750626e-05, + "loss": 0.8838, + "step": 18250 + }, + { + "epoch": 1.4251151174588308, + "grad_norm": 10.430094718933105, + "learning_rate": 6.293912482289712e-05, + "loss": 1.1055, + "step": 18260 + }, + { + "epoch": 1.425895574806837, + "grad_norm": 48.94272994995117, + "learning_rate": 6.289526089795558e-05, + "loss": 1.1596, + "step": 18270 + }, + { + "epoch": 1.4266760321548428, + "grad_norm": 1.1787877082824707, + "learning_rate": 6.285138633885434e-05, + "loss": 1.5406, + "step": 18280 + }, + { + "epoch": 1.4274564895028488, + "grad_norm": 19.861085891723633, + "learning_rate": 6.28075011817748e-05, + "loss": 0.1486, + "step": 18290 + }, + { + "epoch": 1.4282369468508547, + "grad_norm": 49.54465866088867, + "learning_rate": 6.276360546290717e-05, + "loss": 1.8698, + "step": 18300 + }, + { + "epoch": 1.4290174041988606, + "grad_norm": 5.11936616897583, + "learning_rate": 6.271969921845032e-05, + "loss": 0.6967, + "step": 18310 + }, + { + "epoch": 1.4297978615468665, + "grad_norm": 2.9285593032836914, + "learning_rate": 6.26757824846118e-05, + "loss": 0.2176, + "step": 18320 + }, + { + "epoch": 1.4305783188948724, + "grad_norm": 5.905811309814453, + "learning_rate": 6.263185529760786e-05, + "loss": 0.7961, + "step": 18330 + }, + { + "epoch": 1.4313587762428783, + "grad_norm": 0.5823084115982056, + "learning_rate": 6.258791769366332e-05, + "loss": 0.7978, + "step": 18340 + }, + { + "epoch": 1.4321392335908842, + "grad_norm": 5.589509237324819e-05, + "learning_rate": 6.254396970901159e-05, + "loss": 0.0566, + "step": 18350 + }, + { + "epoch": 1.4329196909388902, + "grad_norm": 0.0657353326678276, + "learning_rate": 6.25000113798947e-05, + "loss": 1.8269, + "step": 18360 + }, + { + "epoch": 1.433700148286896, + "grad_norm": 42.802703857421875, + "learning_rate": 6.245604274256314e-05, + "loss": 0.2936, + "step": 18370 + }, + { + "epoch": 1.434480605634902, + "grad_norm": 1.737991452217102, + "learning_rate": 6.241206383327592e-05, + "loss": 0.8018, + "step": 18380 + }, + { + "epoch": 1.435261062982908, + "grad_norm": 25.346847534179688, + "learning_rate": 6.236807468830054e-05, + "loss": 1.9919, + "step": 18390 + }, + { + "epoch": 1.4360415203309138, + "grad_norm": 16.954374313354492, + "learning_rate": 6.232407534391295e-05, + "loss": 0.3725, + "step": 18400 + }, + { + "epoch": 1.4368219776789197, + "grad_norm": 0.9959704875946045, + "learning_rate": 6.228006583639747e-05, + "loss": 0.9978, + "step": 18410 + }, + { + "epoch": 1.4376024350269256, + "grad_norm": 4.357720627012895e-06, + "learning_rate": 6.223604620204682e-05, + "loss": 2.0566, + "step": 18420 + }, + { + "epoch": 1.4383828923749318, + "grad_norm": 4.830660327570513e-05, + "learning_rate": 6.219201647716209e-05, + "loss": 0.2457, + "step": 18430 + }, + { + "epoch": 1.4391633497229377, + "grad_norm": 0.011094238609075546, + "learning_rate": 6.214797669805266e-05, + "loss": 0.2389, + "step": 18440 + }, + { + "epoch": 1.4399438070709436, + "grad_norm": 8.750463166506961e-06, + "learning_rate": 6.210392690103624e-05, + "loss": 1.6997, + "step": 18450 + }, + { + "epoch": 1.4407242644189495, + "grad_norm": 6.744166851043701, + "learning_rate": 6.205986712243875e-05, + "loss": 0.2439, + "step": 18460 + }, + { + "epoch": 1.4415047217669554, + "grad_norm": 56.61109924316406, + "learning_rate": 6.201579739859441e-05, + "loss": 1.0354, + "step": 18470 + }, + { + "epoch": 1.4422851791149613, + "grad_norm": 0.030948661267757416, + "learning_rate": 6.197171776584555e-05, + "loss": 0.418, + "step": 18480 + }, + { + "epoch": 1.4430656364629673, + "grad_norm": 5.321959179127589e-06, + "learning_rate": 6.192762826054274e-05, + "loss": 0.731, + "step": 18490 + }, + { + "epoch": 1.4438460938109732, + "grad_norm": 5.769700273106082e-10, + "learning_rate": 6.18835289190447e-05, + "loss": 0.9904, + "step": 18500 + }, + { + "epoch": 1.444626551158979, + "grad_norm": 0.5561239123344421, + "learning_rate": 6.18394197777182e-05, + "loss": 1.3277, + "step": 18510 + }, + { + "epoch": 1.4454070085069852, + "grad_norm": 51.21713638305664, + "learning_rate": 6.179530087293815e-05, + "loss": 1.4944, + "step": 18520 + }, + { + "epoch": 1.4461874658549911, + "grad_norm": 55.569190979003906, + "learning_rate": 6.175117224108748e-05, + "loss": 3.3253, + "step": 18530 + }, + { + "epoch": 1.446967923202997, + "grad_norm": 4.12024450302124, + "learning_rate": 6.170703391855713e-05, + "loss": 0.9532, + "step": 18540 + }, + { + "epoch": 1.447748380551003, + "grad_norm": 1.0935174226760864, + "learning_rate": 6.166288594174608e-05, + "loss": 0.9796, + "step": 18550 + }, + { + "epoch": 1.4485288378990089, + "grad_norm": 0.008399920538067818, + "learning_rate": 6.161872834706124e-05, + "loss": 1.2911, + "step": 18560 + }, + { + "epoch": 1.4493092952470148, + "grad_norm": 1.9544976949691772, + "learning_rate": 6.157456117091745e-05, + "loss": 0.4771, + "step": 18570 + }, + { + "epoch": 1.4500897525950207, + "grad_norm": 6.754466448910534e-05, + "learning_rate": 6.153038444973744e-05, + "loss": 0.7429, + "step": 18580 + }, + { + "epoch": 1.4508702099430266, + "grad_norm": 2.543808932387037e-06, + "learning_rate": 6.148619821995185e-05, + "loss": 0.617, + "step": 18590 + }, + { + "epoch": 1.4516506672910325, + "grad_norm": 0.10267318785190582, + "learning_rate": 6.144200251799913e-05, + "loss": 0.8692, + "step": 18600 + }, + { + "epoch": 1.4524311246390385, + "grad_norm": 0.766514241695404, + "learning_rate": 6.139779738032553e-05, + "loss": 0.0055, + "step": 18610 + }, + { + "epoch": 1.4532115819870444, + "grad_norm": 32.09298324584961, + "learning_rate": 6.135358284338512e-05, + "loss": 1.7193, + "step": 18620 + }, + { + "epoch": 1.4539920393350503, + "grad_norm": 0.3428858816623688, + "learning_rate": 6.130935894363972e-05, + "loss": 0.4487, + "step": 18630 + }, + { + "epoch": 1.4547724966830562, + "grad_norm": 0.00805765762925148, + "learning_rate": 6.126512571755883e-05, + "loss": 3.0999, + "step": 18640 + }, + { + "epoch": 1.455552954031062, + "grad_norm": 18.52367401123047, + "learning_rate": 6.122088320161964e-05, + "loss": 0.4253, + "step": 18650 + }, + { + "epoch": 1.456333411379068, + "grad_norm": 0.0071366941556334496, + "learning_rate": 6.117663143230707e-05, + "loss": 0.7395, + "step": 18660 + }, + { + "epoch": 1.457113868727074, + "grad_norm": 13.743526458740234, + "learning_rate": 6.113237044611361e-05, + "loss": 0.1165, + "step": 18670 + }, + { + "epoch": 1.45789432607508, + "grad_norm": 0.0024282445665448904, + "learning_rate": 6.108810027953937e-05, + "loss": 0.9431, + "step": 18680 + }, + { + "epoch": 1.458674783423086, + "grad_norm": 3.3176936540257884e-06, + "learning_rate": 6.1043820969092e-05, + "loss": 0.8886, + "step": 18690 + }, + { + "epoch": 1.459455240771092, + "grad_norm": 17.197681427001953, + "learning_rate": 6.099953255128675e-05, + "loss": 1.7766, + "step": 18700 + }, + { + "epoch": 1.4602356981190978, + "grad_norm": 2.220248222351074, + "learning_rate": 6.095523506264633e-05, + "loss": 2.0037, + "step": 18710 + }, + { + "epoch": 1.4610161554671037, + "grad_norm": 27.847068786621094, + "learning_rate": 6.091092853970097e-05, + "loss": 1.4636, + "step": 18720 + }, + { + "epoch": 1.4617966128151096, + "grad_norm": 6.860899925231934, + "learning_rate": 6.0866613018988297e-05, + "loss": 0.5809, + "step": 18730 + }, + { + "epoch": 1.4625770701631156, + "grad_norm": 0.10107483714818954, + "learning_rate": 6.0822288537053416e-05, + "loss": 0.4771, + "step": 18740 + }, + { + "epoch": 1.4633575275111215, + "grad_norm": 0.000141332478960976, + "learning_rate": 6.077795513044878e-05, + "loss": 0.0297, + "step": 18750 + }, + { + "epoch": 1.4641379848591274, + "grad_norm": 5.3891448974609375, + "learning_rate": 6.073361283573423e-05, + "loss": 0.1366, + "step": 18760 + }, + { + "epoch": 1.4649184422071333, + "grad_norm": 42.594932556152344, + "learning_rate": 6.0689261689476927e-05, + "loss": 2.1961, + "step": 18770 + }, + { + "epoch": 1.4656988995551394, + "grad_norm": 0.11372056603431702, + "learning_rate": 6.06449017282513e-05, + "loss": 0.6067, + "step": 18780 + }, + { + "epoch": 1.4664793569031453, + "grad_norm": 0.001790837850421667, + "learning_rate": 6.0600532988639114e-05, + "loss": 0.8782, + "step": 18790 + }, + { + "epoch": 1.4672598142511513, + "grad_norm": 6.262240886688232, + "learning_rate": 6.055615550722931e-05, + "loss": 0.6527, + "step": 18800 + }, + { + "epoch": 1.4680402715991572, + "grad_norm": 0.0001394073769915849, + "learning_rate": 6.051176932061806e-05, + "loss": 0.609, + "step": 18810 + }, + { + "epoch": 1.468820728947163, + "grad_norm": 16.28183364868164, + "learning_rate": 6.046737446540873e-05, + "loss": 2.3972, + "step": 18820 + }, + { + "epoch": 1.469601186295169, + "grad_norm": 6.912973403930664, + "learning_rate": 6.0422970978211834e-05, + "loss": 1.1081, + "step": 18830 + }, + { + "epoch": 1.470381643643175, + "grad_norm": 0.0012439063284546137, + "learning_rate": 6.037855889564498e-05, + "loss": 1.1357, + "step": 18840 + }, + { + "epoch": 1.4711621009911808, + "grad_norm": 56.19667434692383, + "learning_rate": 6.033413825433285e-05, + "loss": 2.354, + "step": 18850 + }, + { + "epoch": 1.4719425583391867, + "grad_norm": 0.0037663921248167753, + "learning_rate": 6.028970909090725e-05, + "loss": 0.2803, + "step": 18860 + }, + { + "epoch": 1.4727230156871927, + "grad_norm": 9.612967915018089e-06, + "learning_rate": 6.024527144200699e-05, + "loss": 0.6668, + "step": 18870 + }, + { + "epoch": 1.4735034730351986, + "grad_norm": 28.315608978271484, + "learning_rate": 6.02008253442778e-05, + "loss": 2.0943, + "step": 18880 + }, + { + "epoch": 1.4742839303832045, + "grad_norm": 0.7584246397018433, + "learning_rate": 6.015637083437249e-05, + "loss": 0.3253, + "step": 18890 + }, + { + "epoch": 1.4750643877312104, + "grad_norm": 3.4292781352996826, + "learning_rate": 6.011190794895074e-05, + "loss": 1.1821, + "step": 18900 + }, + { + "epoch": 1.4758448450792163, + "grad_norm": 12.621614456176758, + "learning_rate": 6.006743672467915e-05, + "loss": 0.6592, + "step": 18910 + }, + { + "epoch": 1.4766253024272222, + "grad_norm": 0.8569507598876953, + "learning_rate": 6.00229571982312e-05, + "loss": 0.1478, + "step": 18920 + }, + { + "epoch": 1.4774057597752284, + "grad_norm": 0.0002535637468099594, + "learning_rate": 5.997846940628724e-05, + "loss": 1.1781, + "step": 18930 + }, + { + "epoch": 1.4781862171232343, + "grad_norm": 7.55508290239959e-06, + "learning_rate": 5.993397338553439e-05, + "loss": 0.5306, + "step": 18940 + }, + { + "epoch": 1.4789666744712402, + "grad_norm": 0.12348146736621857, + "learning_rate": 5.988946917266659e-05, + "loss": 0.1923, + "step": 18950 + }, + { + "epoch": 1.479747131819246, + "grad_norm": 0.00036039817496202886, + "learning_rate": 5.984495680438452e-05, + "loss": 0.2572, + "step": 18960 + }, + { + "epoch": 1.480527589167252, + "grad_norm": 2.984592981647438e-07, + "learning_rate": 5.9800436317395594e-05, + "loss": 1.1437, + "step": 18970 + }, + { + "epoch": 1.481308046515258, + "grad_norm": 0.0528603233397007, + "learning_rate": 5.975590774841392e-05, + "loss": 1.8279, + "step": 18980 + }, + { + "epoch": 1.4820885038632639, + "grad_norm": 0.0007880390621721745, + "learning_rate": 5.9711371134160254e-05, + "loss": 1.2552, + "step": 18990 + }, + { + "epoch": 1.4828689612112698, + "grad_norm": 0.008361490443348885, + "learning_rate": 5.9666826511362004e-05, + "loss": 1.413, + "step": 19000 + }, + { + "epoch": 1.4836494185592757, + "grad_norm": 0.05358072742819786, + "learning_rate": 5.962227391675319e-05, + "loss": 0.8824, + "step": 19010 + }, + { + "epoch": 1.4844298759072816, + "grad_norm": 0.00043389989878050983, + "learning_rate": 5.9577713387074405e-05, + "loss": 0.3058, + "step": 19020 + }, + { + "epoch": 1.4852103332552877, + "grad_norm": 60.52457046508789, + "learning_rate": 5.953314495907275e-05, + "loss": 0.6162, + "step": 19030 + }, + { + "epoch": 1.4859907906032936, + "grad_norm": 0.36423781514167786, + "learning_rate": 5.948856866950188e-05, + "loss": 0.2633, + "step": 19040 + }, + { + "epoch": 1.4867712479512996, + "grad_norm": 1.0531165571592283e-05, + "learning_rate": 5.944398455512191e-05, + "loss": 0.4567, + "step": 19050 + }, + { + "epoch": 1.4875517052993055, + "grad_norm": 47.43577575683594, + "learning_rate": 5.939939265269945e-05, + "loss": 2.0081, + "step": 19060 + }, + { + "epoch": 1.4883321626473114, + "grad_norm": 38.81036376953125, + "learning_rate": 5.935479299900744e-05, + "loss": 0.6646, + "step": 19070 + }, + { + "epoch": 1.4891126199953173, + "grad_norm": 1.6050429344177246, + "learning_rate": 5.931018563082531e-05, + "loss": 0.5204, + "step": 19080 + }, + { + "epoch": 1.4898930773433232, + "grad_norm": 0.078951396048069, + "learning_rate": 5.926557058493881e-05, + "loss": 0.4197, + "step": 19090 + }, + { + "epoch": 1.4906735346913291, + "grad_norm": 2.977109432220459, + "learning_rate": 5.9220947898140025e-05, + "loss": 1.0123, + "step": 19100 + }, + { + "epoch": 1.491453992039335, + "grad_norm": 3.1784112453460693, + "learning_rate": 5.917631760722732e-05, + "loss": 1.6444, + "step": 19110 + }, + { + "epoch": 1.492234449387341, + "grad_norm": 0.03160808980464935, + "learning_rate": 5.913167974900536e-05, + "loss": 0.9252, + "step": 19120 + }, + { + "epoch": 1.4930149067353469, + "grad_norm": 0.002941631944850087, + "learning_rate": 5.908703436028506e-05, + "loss": 1.3656, + "step": 19130 + }, + { + "epoch": 1.4937953640833528, + "grad_norm": 0.00814164336770773, + "learning_rate": 5.904238147788351e-05, + "loss": 1.6489, + "step": 19140 + }, + { + "epoch": 1.4945758214313587, + "grad_norm": 6.922767639160156, + "learning_rate": 5.8997721138624006e-05, + "loss": 0.8766, + "step": 19150 + }, + { + "epoch": 1.4953562787793646, + "grad_norm": 5.123744010925293, + "learning_rate": 5.895305337933597e-05, + "loss": 0.9134, + "step": 19160 + }, + { + "epoch": 1.4961367361273705, + "grad_norm": 31.880294799804688, + "learning_rate": 5.890837823685497e-05, + "loss": 0.8887, + "step": 19170 + }, + { + "epoch": 1.4969171934753764, + "grad_norm": 4.0199822137765295e-07, + "learning_rate": 5.886369574802263e-05, + "loss": 0.9717, + "step": 19180 + }, + { + "epoch": 1.4976976508233826, + "grad_norm": 2.6747709398478037e-06, + "learning_rate": 5.881900594968667e-05, + "loss": 1.5533, + "step": 19190 + }, + { + "epoch": 1.4984781081713885, + "grad_norm": 0.00020924198906868696, + "learning_rate": 5.877430887870081e-05, + "loss": 0.487, + "step": 19200 + }, + { + "epoch": 1.4992585655193944, + "grad_norm": 0.012734944000840187, + "learning_rate": 5.8729604571924776e-05, + "loss": 0.2319, + "step": 19210 + }, + { + "epoch": 1.5000390228674003, + "grad_norm": 0.2888699769973755, + "learning_rate": 5.868489306622429e-05, + "loss": 1.1856, + "step": 19220 + }, + { + "epoch": 1.5008194802154062, + "grad_norm": 8.60429573059082, + "learning_rate": 5.8640174398470926e-05, + "loss": 0.1638, + "step": 19230 + }, + { + "epoch": 1.5015999375634121, + "grad_norm": 2.983750164275989e-05, + "learning_rate": 5.859544860554227e-05, + "loss": 3.0502, + "step": 19240 + }, + { + "epoch": 1.502380394911418, + "grad_norm": 0.0243430994451046, + "learning_rate": 5.8550715724321715e-05, + "loss": 0.7847, + "step": 19250 + }, + { + "epoch": 1.503160852259424, + "grad_norm": 0.001944607705809176, + "learning_rate": 5.850597579169853e-05, + "loss": 1.7657, + "step": 19260 + }, + { + "epoch": 1.50394130960743, + "grad_norm": 0.00044312793761491776, + "learning_rate": 5.846122884456776e-05, + "loss": 0.3973, + "step": 19270 + }, + { + "epoch": 1.504721766955436, + "grad_norm": 0.02026950940489769, + "learning_rate": 5.84164749198303e-05, + "loss": 0.2291, + "step": 19280 + }, + { + "epoch": 1.505502224303442, + "grad_norm": 0.42607226967811584, + "learning_rate": 5.837171405439272e-05, + "loss": 1.0548, + "step": 19290 + }, + { + "epoch": 1.5062826816514479, + "grad_norm": 5.227291584014893, + "learning_rate": 5.83269462851674e-05, + "loss": 0.2234, + "step": 19300 + }, + { + "epoch": 1.5070631389994538, + "grad_norm": 1.4641375541687012, + "learning_rate": 5.8282171649072325e-05, + "loss": 0.8951, + "step": 19310 + }, + { + "epoch": 1.5078435963474597, + "grad_norm": 5.018607680540299e-06, + "learning_rate": 5.823739018303123e-05, + "loss": 0.5481, + "step": 19320 + }, + { + "epoch": 1.5086240536954656, + "grad_norm": 60.11586380004883, + "learning_rate": 5.81926019239734e-05, + "loss": 2.3614, + "step": 19330 + }, + { + "epoch": 1.5094045110434715, + "grad_norm": 6.57179862173507e-06, + "learning_rate": 5.814780690883378e-05, + "loss": 0.5306, + "step": 19340 + }, + { + "epoch": 1.5101849683914774, + "grad_norm": 8.855113264871761e-05, + "learning_rate": 5.8103005174552846e-05, + "loss": 0.6372, + "step": 19350 + }, + { + "epoch": 1.5109654257394833, + "grad_norm": 50.526031494140625, + "learning_rate": 5.805819675807669e-05, + "loss": 1.4548, + "step": 19360 + }, + { + "epoch": 1.5117458830874893, + "grad_norm": 1.4122130870819092, + "learning_rate": 5.801338169635681e-05, + "loss": 0.7567, + "step": 19370 + }, + { + "epoch": 1.5125263404354952, + "grad_norm": 35.40172576904297, + "learning_rate": 5.7968560026350236e-05, + "loss": 0.4017, + "step": 19380 + }, + { + "epoch": 1.513306797783501, + "grad_norm": 6.476219596152077e-07, + "learning_rate": 5.792373178501945e-05, + "loss": 1.0184, + "step": 19390 + }, + { + "epoch": 1.514087255131507, + "grad_norm": 3.57866468903012e-07, + "learning_rate": 5.7878897009332376e-05, + "loss": 0.2702, + "step": 19400 + }, + { + "epoch": 1.514867712479513, + "grad_norm": 34.72517013549805, + "learning_rate": 5.783405573626228e-05, + "loss": 1.5825, + "step": 19410 + }, + { + "epoch": 1.5156481698275188, + "grad_norm": 35.87035369873047, + "learning_rate": 5.7789208002787796e-05, + "loss": 1.2819, + "step": 19420 + }, + { + "epoch": 1.5164286271755247, + "grad_norm": 3.8606178760528564, + "learning_rate": 5.774435384589291e-05, + "loss": 0.8184, + "step": 19430 + }, + { + "epoch": 1.5172090845235306, + "grad_norm": 1.1874459981918335, + "learning_rate": 5.769949330256689e-05, + "loss": 0.363, + "step": 19440 + }, + { + "epoch": 1.5179895418715366, + "grad_norm": 57.610191345214844, + "learning_rate": 5.765462640980428e-05, + "loss": 1.1065, + "step": 19450 + }, + { + "epoch": 1.5187699992195427, + "grad_norm": 22.041465759277344, + "learning_rate": 5.760975320460482e-05, + "loss": 1.0598, + "step": 19460 + }, + { + "epoch": 1.5195504565675486, + "grad_norm": 19.411191940307617, + "learning_rate": 5.756487372397351e-05, + "loss": 1.3748, + "step": 19470 + }, + { + "epoch": 1.5203309139155545, + "grad_norm": 0.10800797492265701, + "learning_rate": 5.75199880049205e-05, + "loss": 1.3595, + "step": 19480 + }, + { + "epoch": 1.5211113712635604, + "grad_norm": 0.026111967861652374, + "learning_rate": 5.747509608446109e-05, + "loss": 0.7284, + "step": 19490 + }, + { + "epoch": 1.5218918286115664, + "grad_norm": 0.5697845816612244, + "learning_rate": 5.743019799961566e-05, + "loss": 2.3566, + "step": 19500 + }, + { + "epoch": 1.5226722859595723, + "grad_norm": 0.0015555124264210463, + "learning_rate": 5.738529378740976e-05, + "loss": 0.6254, + "step": 19510 + }, + { + "epoch": 1.5234527433075784, + "grad_norm": 52.46384048461914, + "learning_rate": 5.734038348487389e-05, + "loss": 0.6338, + "step": 19520 + }, + { + "epoch": 1.5242332006555843, + "grad_norm": 0.001647842931561172, + "learning_rate": 5.7295467129043644e-05, + "loss": 0.5647, + "step": 19530 + }, + { + "epoch": 1.5250136580035902, + "grad_norm": 1.7748408317565918, + "learning_rate": 5.7250544756959576e-05, + "loss": 0.4189, + "step": 19540 + }, + { + "epoch": 1.5257941153515961, + "grad_norm": 12.625972747802734, + "learning_rate": 5.72056164056672e-05, + "loss": 0.5406, + "step": 19550 + }, + { + "epoch": 1.526574572699602, + "grad_norm": 26.848318099975586, + "learning_rate": 5.716068211221698e-05, + "loss": 1.4614, + "step": 19560 + }, + { + "epoch": 1.527355030047608, + "grad_norm": 5.812100410461426, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.0883, + "step": 19570 + }, + { + "epoch": 1.528135487395614, + "grad_norm": 44.151092529296875, + "learning_rate": 5.707079584706927e-05, + "loss": 1.5955, + "step": 19580 + }, + { + "epoch": 1.5289159447436198, + "grad_norm": 1.2502694129943848, + "learning_rate": 5.702584394949708e-05, + "loss": 0.5244, + "step": 19590 + }, + { + "epoch": 1.5296964020916257, + "grad_norm": 32.96466064453125, + "learning_rate": 5.698088625801756e-05, + "loss": 0.4484, + "step": 19600 + }, + { + "epoch": 1.5304768594396316, + "grad_norm": 5.660415425268184e-08, + "learning_rate": 5.693592280970534e-05, + "loss": 1.1164, + "step": 19610 + }, + { + "epoch": 1.5312573167876375, + "grad_norm": 19.453800201416016, + "learning_rate": 5.6890953641639824e-05, + "loss": 0.8106, + "step": 19620 + }, + { + "epoch": 1.5320377741356435, + "grad_norm": 0.04822535812854767, + "learning_rate": 5.684597879090514e-05, + "loss": 0.604, + "step": 19630 + }, + { + "epoch": 1.5328182314836494, + "grad_norm": 0.00011620177974691615, + "learning_rate": 5.680099829459008e-05, + "loss": 1.3639, + "step": 19640 + }, + { + "epoch": 1.5335986888316553, + "grad_norm": 0.08724360913038254, + "learning_rate": 5.675601218978809e-05, + "loss": 0.5596, + "step": 19650 + }, + { + "epoch": 1.5343791461796612, + "grad_norm": 0.37494370341300964, + "learning_rate": 5.671102051359727e-05, + "loss": 0.5558, + "step": 19660 + }, + { + "epoch": 1.5351596035276671, + "grad_norm": 3.1753580570220947, + "learning_rate": 5.6666023303120266e-05, + "loss": 0.9573, + "step": 19670 + }, + { + "epoch": 1.535940060875673, + "grad_norm": 0.12596221268177032, + "learning_rate": 5.662102059546434e-05, + "loss": 1.151, + "step": 19680 + }, + { + "epoch": 1.536720518223679, + "grad_norm": 27.574541091918945, + "learning_rate": 5.657601242774125e-05, + "loss": 1.5459, + "step": 19690 + }, + { + "epoch": 1.5375009755716849, + "grad_norm": 5.653417110443115, + "learning_rate": 5.653099883706727e-05, + "loss": 2.5211, + "step": 19700 + }, + { + "epoch": 1.538281432919691, + "grad_norm": 0.7937735915184021, + "learning_rate": 5.648597986056318e-05, + "loss": 0.3426, + "step": 19710 + }, + { + "epoch": 1.539061890267697, + "grad_norm": 2.5061755180358887, + "learning_rate": 5.6440955535354124e-05, + "loss": 0.7989, + "step": 19720 + }, + { + "epoch": 1.5398423476157028, + "grad_norm": 27.361835479736328, + "learning_rate": 5.639592589856973e-05, + "loss": 1.1338, + "step": 19730 + }, + { + "epoch": 1.5406228049637087, + "grad_norm": 1.5489771612919867e-05, + "learning_rate": 5.6350890987343944e-05, + "loss": 0.142, + "step": 19740 + }, + { + "epoch": 1.5414032623117147, + "grad_norm": 1.1988106052740477e-05, + "learning_rate": 5.630585083881513e-05, + "loss": 1.1921, + "step": 19750 + }, + { + "epoch": 1.5421837196597206, + "grad_norm": 36.36326599121094, + "learning_rate": 5.626080549012592e-05, + "loss": 0.7187, + "step": 19760 + }, + { + "epoch": 1.5429641770077265, + "grad_norm": 30.49346160888672, + "learning_rate": 5.6215754978423254e-05, + "loss": 1.4105, + "step": 19770 + }, + { + "epoch": 1.5437446343557326, + "grad_norm": 0.00679404940456152, + "learning_rate": 5.617069934085832e-05, + "loss": 0.1904, + "step": 19780 + }, + { + "epoch": 1.5445250917037385, + "grad_norm": 15.944265365600586, + "learning_rate": 5.612563861458655e-05, + "loss": 0.8945, + "step": 19790 + }, + { + "epoch": 1.5453055490517444, + "grad_norm": 24.04916763305664, + "learning_rate": 5.6080572836767555e-05, + "loss": 0.208, + "step": 19800 + }, + { + "epoch": 1.5460860063997504, + "grad_norm": 34.59318542480469, + "learning_rate": 5.60355020445651e-05, + "loss": 1.1715, + "step": 19810 + }, + { + "epoch": 1.5468664637477563, + "grad_norm": 5.566281318664551, + "learning_rate": 5.5990426275147136e-05, + "loss": 0.7822, + "step": 19820 + }, + { + "epoch": 1.5476469210957622, + "grad_norm": 27.87342643737793, + "learning_rate": 5.594534556568567e-05, + "loss": 0.6418, + "step": 19830 + }, + { + "epoch": 1.548427378443768, + "grad_norm": 0.00011106702004326507, + "learning_rate": 5.5900259953356804e-05, + "loss": 0.2046, + "step": 19840 + }, + { + "epoch": 1.549207835791774, + "grad_norm": 88.22048950195312, + "learning_rate": 5.585516947534066e-05, + "loss": 1.8431, + "step": 19850 + }, + { + "epoch": 1.54998829313978, + "grad_norm": 3.4051570310111856e-06, + "learning_rate": 5.5810074168821424e-05, + "loss": 1.2887, + "step": 19860 + }, + { + "epoch": 1.5507687504877858, + "grad_norm": 1.627982726404298e-08, + "learning_rate": 5.5764974070987196e-05, + "loss": 0.4555, + "step": 19870 + }, + { + "epoch": 1.5515492078357918, + "grad_norm": 58.66267395019531, + "learning_rate": 5.571986921903007e-05, + "loss": 1.8405, + "step": 19880 + }, + { + "epoch": 1.5523296651837977, + "grad_norm": 1.2103909341476538e-07, + "learning_rate": 5.567475965014606e-05, + "loss": 1.071, + "step": 19890 + }, + { + "epoch": 1.5531101225318036, + "grad_norm": 29.37429428100586, + "learning_rate": 5.562964540153506e-05, + "loss": 1.3935, + "step": 19900 + }, + { + "epoch": 1.5538905798798095, + "grad_norm": 0.00016841200704220682, + "learning_rate": 5.558452651040081e-05, + "loss": 0.8621, + "step": 19910 + }, + { + "epoch": 1.5546710372278154, + "grad_norm": 82.55367279052734, + "learning_rate": 5.553940301395093e-05, + "loss": 0.669, + "step": 19920 + }, + { + "epoch": 1.5554514945758213, + "grad_norm": 26.137784957885742, + "learning_rate": 5.5494274949396754e-05, + "loss": 1.3306, + "step": 19930 + }, + { + "epoch": 1.5562319519238272, + "grad_norm": 0.6996078491210938, + "learning_rate": 5.5449142353953465e-05, + "loss": 0.8355, + "step": 19940 + }, + { + "epoch": 1.5570124092718332, + "grad_norm": 31.849136352539062, + "learning_rate": 5.5404005264839956e-05, + "loss": 1.5454, + "step": 19950 + }, + { + "epoch": 1.557792866619839, + "grad_norm": 1.0991023373208009e-05, + "learning_rate": 5.5358863719278764e-05, + "loss": 0.8768, + "step": 19960 + }, + { + "epoch": 1.5585733239678452, + "grad_norm": 31.20032501220703, + "learning_rate": 5.531371775449621e-05, + "loss": 1.9345, + "step": 19970 + }, + { + "epoch": 1.5593537813158511, + "grad_norm": 0.3876931965351105, + "learning_rate": 5.526856740772218e-05, + "loss": 0.9811, + "step": 19980 + }, + { + "epoch": 1.560134238663857, + "grad_norm": 0.008426464162766933, + "learning_rate": 5.522341271619019e-05, + "loss": 0.8697, + "step": 19990 + }, + { + "epoch": 1.560914696011863, + "grad_norm": 6.8745598793029785, + "learning_rate": 5.517825371713737e-05, + "loss": 0.0568, + "step": 20000 + }, + { + "epoch": 1.5616951533598689, + "grad_norm": 4.607158184051514, + "learning_rate": 5.5133090447804346e-05, + "loss": 1.1868, + "step": 20010 + }, + { + "epoch": 1.5624756107078748, + "grad_norm": 31.82735824584961, + "learning_rate": 5.508792294543533e-05, + "loss": 1.0846, + "step": 20020 + }, + { + "epoch": 1.563256068055881, + "grad_norm": 3.365826368331909, + "learning_rate": 5.504275124727798e-05, + "loss": 1.5055, + "step": 20030 + }, + { + "epoch": 1.5640365254038868, + "grad_norm": 17.653242111206055, + "learning_rate": 5.499757539058342e-05, + "loss": 0.488, + "step": 20040 + }, + { + "epoch": 1.5648169827518927, + "grad_norm": 0.3879726827144623, + "learning_rate": 5.495239541260623e-05, + "loss": 0.931, + "step": 20050 + }, + { + "epoch": 1.5655974400998987, + "grad_norm": 39.06393814086914, + "learning_rate": 5.490721135060435e-05, + "loss": 0.5254, + "step": 20060 + }, + { + "epoch": 1.5663778974479046, + "grad_norm": 1.1639221906661987, + "learning_rate": 5.486202324183911e-05, + "loss": 1.1717, + "step": 20070 + }, + { + "epoch": 1.5671583547959105, + "grad_norm": 42.985511779785156, + "learning_rate": 5.481683112357517e-05, + "loss": 0.8609, + "step": 20080 + }, + { + "epoch": 1.5679388121439164, + "grad_norm": 9.094670531339943e-06, + "learning_rate": 5.477163503308052e-05, + "loss": 0.1233, + "step": 20090 + }, + { + "epoch": 1.5687192694919223, + "grad_norm": 37.52073287963867, + "learning_rate": 5.472643500762639e-05, + "loss": 0.6381, + "step": 20100 + }, + { + "epoch": 1.5694997268399282, + "grad_norm": 0.0003045310149900615, + "learning_rate": 5.468123108448727e-05, + "loss": 0.2886, + "step": 20110 + }, + { + "epoch": 1.5702801841879341, + "grad_norm": 0.2237444669008255, + "learning_rate": 5.4636023300940865e-05, + "loss": 1.0482, + "step": 20120 + }, + { + "epoch": 1.57106064153594, + "grad_norm": 16.167692184448242, + "learning_rate": 5.4590811694268085e-05, + "loss": 0.4846, + "step": 20130 + }, + { + "epoch": 1.571841098883946, + "grad_norm": 47.023067474365234, + "learning_rate": 5.4545596301752964e-05, + "loss": 1.4037, + "step": 20140 + }, + { + "epoch": 1.5726215562319519, + "grad_norm": 48.14585876464844, + "learning_rate": 5.4500377160682646e-05, + "loss": 0.8417, + "step": 20150 + }, + { + "epoch": 1.5734020135799578, + "grad_norm": 1.0672800954125705e-06, + "learning_rate": 5.4455154308347404e-05, + "loss": 1.1042, + "step": 20160 + }, + { + "epoch": 1.5741824709279637, + "grad_norm": 0.0014652134850621223, + "learning_rate": 5.440992778204054e-05, + "loss": 1.655, + "step": 20170 + }, + { + "epoch": 1.5749629282759696, + "grad_norm": 33.9235725402832, + "learning_rate": 5.436469761905841e-05, + "loss": 1.4122, + "step": 20180 + }, + { + "epoch": 1.5757433856239755, + "grad_norm": 25.62478256225586, + "learning_rate": 5.431946385670036e-05, + "loss": 1.2307, + "step": 20190 + }, + { + "epoch": 1.5765238429719814, + "grad_norm": 52.47317123413086, + "learning_rate": 5.427422653226868e-05, + "loss": 0.6712, + "step": 20200 + }, + { + "epoch": 1.5773043003199874, + "grad_norm": 58.45132827758789, + "learning_rate": 5.4228985683068664e-05, + "loss": 1.6876, + "step": 20210 + }, + { + "epoch": 1.5780847576679935, + "grad_norm": 1.3851629495620728, + "learning_rate": 5.4183741346408435e-05, + "loss": 0.8356, + "step": 20220 + }, + { + "epoch": 1.5788652150159994, + "grad_norm": 0.010332033038139343, + "learning_rate": 5.4138493559599036e-05, + "loss": 0.9256, + "step": 20230 + }, + { + "epoch": 1.5796456723640053, + "grad_norm": 46.7064323425293, + "learning_rate": 5.409324235995434e-05, + "loss": 1.0566, + "step": 20240 + }, + { + "epoch": 1.5804261297120112, + "grad_norm": 0.012174281291663647, + "learning_rate": 5.404798778479104e-05, + "loss": 2.1908, + "step": 20250 + }, + { + "epoch": 1.5812065870600172, + "grad_norm": 0.055964235216379166, + "learning_rate": 5.4002729871428624e-05, + "loss": 0.8299, + "step": 20260 + }, + { + "epoch": 1.581987044408023, + "grad_norm": 0.04050574079155922, + "learning_rate": 5.3957468657189315e-05, + "loss": 1.8807, + "step": 20270 + }, + { + "epoch": 1.582767501756029, + "grad_norm": 44.31588363647461, + "learning_rate": 5.391220417939804e-05, + "loss": 1.4118, + "step": 20280 + }, + { + "epoch": 1.5835479591040351, + "grad_norm": 60.45882034301758, + "learning_rate": 5.386693647538248e-05, + "loss": 0.3896, + "step": 20290 + }, + { + "epoch": 1.584328416452041, + "grad_norm": 3.0239250659942627, + "learning_rate": 5.382166558247291e-05, + "loss": 2.1834, + "step": 20300 + }, + { + "epoch": 1.585108873800047, + "grad_norm": 2.8246383666992188, + "learning_rate": 5.377639153800229e-05, + "loss": 0.5203, + "step": 20310 + }, + { + "epoch": 1.5858893311480529, + "grad_norm": 0.015088041312992573, + "learning_rate": 5.3731114379306114e-05, + "loss": 0.1914, + "step": 20320 + }, + { + "epoch": 1.5866697884960588, + "grad_norm": 0.00013219547690823674, + "learning_rate": 5.368583414372251e-05, + "loss": 0.7982, + "step": 20330 + }, + { + "epoch": 1.5874502458440647, + "grad_norm": 0.0006202549557201564, + "learning_rate": 5.3640550868592124e-05, + "loss": 0.7759, + "step": 20340 + }, + { + "epoch": 1.5882307031920706, + "grad_norm": 0.30399084091186523, + "learning_rate": 5.3595264591258054e-05, + "loss": 0.9756, + "step": 20350 + }, + { + "epoch": 1.5890111605400765, + "grad_norm": 27.899831771850586, + "learning_rate": 5.354997534906596e-05, + "loss": 1.9642, + "step": 20360 + }, + { + "epoch": 1.5897916178880824, + "grad_norm": 2.0328943729400635, + "learning_rate": 5.35046831793639e-05, + "loss": 0.3076, + "step": 20370 + }, + { + "epoch": 1.5905720752360883, + "grad_norm": 99.052490234375, + "learning_rate": 5.345938811950234e-05, + "loss": 2.0924, + "step": 20380 + }, + { + "epoch": 1.5913525325840943, + "grad_norm": 3.535752296447754, + "learning_rate": 5.341409020683414e-05, + "loss": 0.319, + "step": 20390 + }, + { + "epoch": 1.5921329899321002, + "grad_norm": 5.720830813515931e-07, + "learning_rate": 5.336878947871454e-05, + "loss": 0.3174, + "step": 20400 + }, + { + "epoch": 1.592913447280106, + "grad_norm": 71.32350158691406, + "learning_rate": 5.3323485972501055e-05, + "loss": 1.6845, + "step": 20410 + }, + { + "epoch": 1.593693904628112, + "grad_norm": 2.370527982711792, + "learning_rate": 5.3278179725553525e-05, + "loss": 0.5922, + "step": 20420 + }, + { + "epoch": 1.594474361976118, + "grad_norm": 0.013675123453140259, + "learning_rate": 5.3232870775234024e-05, + "loss": 0.5931, + "step": 20430 + }, + { + "epoch": 1.5952548193241238, + "grad_norm": 3.333381215497866e-08, + "learning_rate": 5.318755915890688e-05, + "loss": 0.5146, + "step": 20440 + }, + { + "epoch": 1.5960352766721297, + "grad_norm": 13.518216133117676, + "learning_rate": 5.314224491393859e-05, + "loss": 1.4546, + "step": 20450 + }, + { + "epoch": 1.5968157340201357, + "grad_norm": 2.5204646587371826, + "learning_rate": 5.309692807769786e-05, + "loss": 2.4162, + "step": 20460 + }, + { + "epoch": 1.5975961913681416, + "grad_norm": 9.781959306565113e-06, + "learning_rate": 5.305160868755549e-05, + "loss": 1.3862, + "step": 20470 + }, + { + "epoch": 1.5983766487161477, + "grad_norm": 0.0047400938346982, + "learning_rate": 5.300628678088443e-05, + "loss": 2.0894, + "step": 20480 + }, + { + "epoch": 1.5991571060641536, + "grad_norm": 1.9240214824676514, + "learning_rate": 5.296096239505966e-05, + "loss": 1.8343, + "step": 20490 + }, + { + "epoch": 1.5999375634121595, + "grad_norm": 2.6736748218536377, + "learning_rate": 5.2915635567458245e-05, + "loss": 0.1457, + "step": 20500 + }, + { + "epoch": 1.6007180207601654, + "grad_norm": 1.4274554252624512, + "learning_rate": 5.287030633545922e-05, + "loss": 0.2772, + "step": 20510 + }, + { + "epoch": 1.6014984781081714, + "grad_norm": 55.156532287597656, + "learning_rate": 5.282497473644365e-05, + "loss": 0.6489, + "step": 20520 + }, + { + "epoch": 1.6022789354561773, + "grad_norm": 5.8431965044292156e-06, + "learning_rate": 5.277964080779453e-05, + "loss": 0.4494, + "step": 20530 + }, + { + "epoch": 1.6030593928041834, + "grad_norm": 12.031957626342773, + "learning_rate": 5.273430458689674e-05, + "loss": 2.4088, + "step": 20540 + }, + { + "epoch": 1.6038398501521893, + "grad_norm": 27.250608444213867, + "learning_rate": 5.268896611113713e-05, + "loss": 1.2248, + "step": 20550 + }, + { + "epoch": 1.6046203075001952, + "grad_norm": 1.2413510084152222, + "learning_rate": 5.264362541790434e-05, + "loss": 1.4055, + "step": 20560 + }, + { + "epoch": 1.6054007648482012, + "grad_norm": 0.7440712451934814, + "learning_rate": 5.2598282544588874e-05, + "loss": 0.4348, + "step": 20570 + }, + { + "epoch": 1.606181222196207, + "grad_norm": 0.732166588306427, + "learning_rate": 5.2552937528583014e-05, + "loss": 3.105, + "step": 20580 + }, + { + "epoch": 1.606961679544213, + "grad_norm": 0.03779369965195656, + "learning_rate": 5.2507590407280815e-05, + "loss": 0.8086, + "step": 20590 + }, + { + "epoch": 1.607742136892219, + "grad_norm": 0.03578812628984451, + "learning_rate": 5.24622412180781e-05, + "loss": 1.2931, + "step": 20600 + }, + { + "epoch": 1.6085225942402248, + "grad_norm": 27.9847354888916, + "learning_rate": 5.2416889998372344e-05, + "loss": 0.2665, + "step": 20610 + }, + { + "epoch": 1.6093030515882307, + "grad_norm": 0.38156062364578247, + "learning_rate": 5.237153678556273e-05, + "loss": 0.6004, + "step": 20620 + }, + { + "epoch": 1.6100835089362366, + "grad_norm": 5.8809146139537916e-06, + "learning_rate": 5.232618161705009e-05, + "loss": 0.3352, + "step": 20630 + }, + { + "epoch": 1.6108639662842426, + "grad_norm": 0.05808285251259804, + "learning_rate": 5.228082453023682e-05, + "loss": 0.4951, + "step": 20640 + }, + { + "epoch": 1.6116444236322485, + "grad_norm": 0.0736168846487999, + "learning_rate": 5.2235465562526956e-05, + "loss": 0.0526, + "step": 20650 + }, + { + "epoch": 1.6124248809802544, + "grad_norm": 0.005917796399444342, + "learning_rate": 5.219010475132604e-05, + "loss": 0.1153, + "step": 20660 + }, + { + "epoch": 1.6132053383282603, + "grad_norm": 39.21625900268555, + "learning_rate": 5.21447421340412e-05, + "loss": 0.5997, + "step": 20670 + }, + { + "epoch": 1.6139857956762662, + "grad_norm": 40.29851531982422, + "learning_rate": 5.209937774808098e-05, + "loss": 5.3299, + "step": 20680 + }, + { + "epoch": 1.6147662530242721, + "grad_norm": 4.9903657782124355e-05, + "learning_rate": 5.205401163085542e-05, + "loss": 0.1729, + "step": 20690 + }, + { + "epoch": 1.615546710372278, + "grad_norm": 0.03153657168149948, + "learning_rate": 5.200864381977596e-05, + "loss": 1.1679, + "step": 20700 + }, + { + "epoch": 1.616327167720284, + "grad_norm": 0.011658350005745888, + "learning_rate": 5.196327435225548e-05, + "loss": 0.1952, + "step": 20710 + }, + { + "epoch": 1.6171076250682899, + "grad_norm": 5.017102466808865e-07, + "learning_rate": 5.191790326570821e-05, + "loss": 0.8395, + "step": 20720 + }, + { + "epoch": 1.617888082416296, + "grad_norm": 0.6353729963302612, + "learning_rate": 5.1872530597549696e-05, + "loss": 1.0601, + "step": 20730 + }, + { + "epoch": 1.618668539764302, + "grad_norm": 22.19393539428711, + "learning_rate": 5.1827156385196775e-05, + "loss": 1.74, + "step": 20740 + }, + { + "epoch": 1.6194489971123078, + "grad_norm": 1.667719857323391e-06, + "learning_rate": 5.178178066606762e-05, + "loss": 0.3663, + "step": 20750 + }, + { + "epoch": 1.6202294544603137, + "grad_norm": 47.38582992553711, + "learning_rate": 5.1736403477581594e-05, + "loss": 0.8091, + "step": 20760 + }, + { + "epoch": 1.6210099118083197, + "grad_norm": 0.21062630414962769, + "learning_rate": 5.1691024857159297e-05, + "loss": 0.2815, + "step": 20770 + }, + { + "epoch": 1.6217903691563256, + "grad_norm": 1.026199460029602, + "learning_rate": 5.164564484222247e-05, + "loss": 1.9512, + "step": 20780 + }, + { + "epoch": 1.6225708265043315, + "grad_norm": 30.83462142944336, + "learning_rate": 5.160026347019407e-05, + "loss": 0.5864, + "step": 20790 + }, + { + "epoch": 1.6233512838523376, + "grad_norm": 0.025723319500684738, + "learning_rate": 5.155488077849812e-05, + "loss": 1.3208, + "step": 20800 + }, + { + "epoch": 1.6241317412003435, + "grad_norm": 1.1020880918977127e-07, + "learning_rate": 5.150949680455974e-05, + "loss": 0.2486, + "step": 20810 + }, + { + "epoch": 1.6249121985483495, + "grad_norm": 2.570460796356201, + "learning_rate": 5.146411158580513e-05, + "loss": 1.6481, + "step": 20820 + }, + { + "epoch": 1.6256926558963554, + "grad_norm": 47.203922271728516, + "learning_rate": 5.141872515966152e-05, + "loss": 0.3489, + "step": 20830 + }, + { + "epoch": 1.6264731132443613, + "grad_norm": 7.59179162979126, + "learning_rate": 5.137333756355707e-05, + "loss": 0.8454, + "step": 20840 + }, + { + "epoch": 1.6272535705923672, + "grad_norm": 0.002626607194542885, + "learning_rate": 5.132794883492099e-05, + "loss": 1.9901, + "step": 20850 + }, + { + "epoch": 1.628034027940373, + "grad_norm": 0.28491634130477905, + "learning_rate": 5.128255901118335e-05, + "loss": 0.7532, + "step": 20860 + }, + { + "epoch": 1.628814485288379, + "grad_norm": 0.0006308460724540055, + "learning_rate": 5.1237168129775216e-05, + "loss": 0.1481, + "step": 20870 + }, + { + "epoch": 1.629594942636385, + "grad_norm": 11.376788139343262, + "learning_rate": 5.119177622812842e-05, + "loss": 1.0631, + "step": 20880 + }, + { + "epoch": 1.6303753999843908, + "grad_norm": 16.143569946289062, + "learning_rate": 5.1146383343675706e-05, + "loss": 0.7264, + "step": 20890 + }, + { + "epoch": 1.6311558573323968, + "grad_norm": 57.7200813293457, + "learning_rate": 5.110098951385061e-05, + "loss": 0.8139, + "step": 20900 + }, + { + "epoch": 1.6319363146804027, + "grad_norm": 1.1819399333035108e-05, + "learning_rate": 5.1055594776087436e-05, + "loss": 1.5962, + "step": 20910 + }, + { + "epoch": 1.6327167720284086, + "grad_norm": 1.585782527923584, + "learning_rate": 5.101019916782125e-05, + "loss": 2.1014, + "step": 20920 + }, + { + "epoch": 1.6334972293764145, + "grad_norm": 0.6489249467849731, + "learning_rate": 5.0964802726487835e-05, + "loss": 1.3317, + "step": 20930 + }, + { + "epoch": 1.6342776867244204, + "grad_norm": 0.0019743037410080433, + "learning_rate": 5.091940548952365e-05, + "loss": 0.0838, + "step": 20940 + }, + { + "epoch": 1.6350581440724263, + "grad_norm": 0.3787420392036438, + "learning_rate": 5.0874007494365826e-05, + "loss": 0.4802, + "step": 20950 + }, + { + "epoch": 1.6358386014204322, + "grad_norm": 0.025786397978663445, + "learning_rate": 5.082860877845212e-05, + "loss": 0.9618, + "step": 20960 + }, + { + "epoch": 1.6366190587684382, + "grad_norm": 0.0014336465392261744, + "learning_rate": 5.078320937922084e-05, + "loss": 2.8194, + "step": 20970 + }, + { + "epoch": 1.637399516116444, + "grad_norm": 65.5517807006836, + "learning_rate": 5.073780933411093e-05, + "loss": 0.916, + "step": 20980 + }, + { + "epoch": 1.6381799734644502, + "grad_norm": 0.2608548104763031, + "learning_rate": 5.0692408680561806e-05, + "loss": 0.6485, + "step": 20990 + }, + { + "epoch": 1.6389604308124561, + "grad_norm": 1.4473145008087158, + "learning_rate": 5.064700745601343e-05, + "loss": 0.5851, + "step": 21000 + }, + { + "epoch": 1.639740888160462, + "grad_norm": 0.11096695065498352, + "learning_rate": 5.06016056979062e-05, + "loss": 0.2983, + "step": 21010 + }, + { + "epoch": 1.640521345508468, + "grad_norm": 0.15635938942432404, + "learning_rate": 5.055620344368095e-05, + "loss": 0.4633, + "step": 21020 + }, + { + "epoch": 1.6413018028564739, + "grad_norm": 90.75917053222656, + "learning_rate": 5.0510800730778974e-05, + "loss": 1.5644, + "step": 21030 + }, + { + "epoch": 1.6420822602044798, + "grad_norm": 2.372506990866441e-08, + "learning_rate": 5.046539759664188e-05, + "loss": 1.1676, + "step": 21040 + }, + { + "epoch": 1.642862717552486, + "grad_norm": 70.3769760131836, + "learning_rate": 5.0419994078711674e-05, + "loss": 1.0743, + "step": 21050 + }, + { + "epoch": 1.6436431749004918, + "grad_norm": 1.29304838180542, + "learning_rate": 5.037459021443066e-05, + "loss": 0.4095, + "step": 21060 + }, + { + "epoch": 1.6444236322484977, + "grad_norm": 0.04536210000514984, + "learning_rate": 5.032918604124142e-05, + "loss": 0.2607, + "step": 21070 + }, + { + "epoch": 1.6452040895965037, + "grad_norm": 8.825725555419922, + "learning_rate": 5.02837815965868e-05, + "loss": 1.2542, + "step": 21080 + }, + { + "epoch": 1.6459845469445096, + "grad_norm": 12.517806053161621, + "learning_rate": 5.023837691790984e-05, + "loss": 3.7509, + "step": 21090 + }, + { + "epoch": 1.6467650042925155, + "grad_norm": 0.4820645749568939, + "learning_rate": 5.0192972042653844e-05, + "loss": 2.1886, + "step": 21100 + }, + { + "epoch": 1.6475454616405214, + "grad_norm": 40.499412536621094, + "learning_rate": 5.014756700826221e-05, + "loss": 0.9779, + "step": 21110 + }, + { + "epoch": 1.6483259189885273, + "grad_norm": 0.00010101215593749657, + "learning_rate": 5.01021618521785e-05, + "loss": 0.2086, + "step": 21120 + }, + { + "epoch": 1.6491063763365332, + "grad_norm": 4.254552364349365, + "learning_rate": 5.005675661184635e-05, + "loss": 0.312, + "step": 21130 + }, + { + "epoch": 1.6498868336845391, + "grad_norm": 2.831031560897827, + "learning_rate": 5.001135132470951e-05, + "loss": 0.087, + "step": 21140 + }, + { + "epoch": 1.650667291032545, + "grad_norm": 49.950191497802734, + "learning_rate": 4.9965946028211714e-05, + "loss": 4.2688, + "step": 21150 + }, + { + "epoch": 1.651447748380551, + "grad_norm": 2.2463319301605225, + "learning_rate": 4.992054075979676e-05, + "loss": 0.2042, + "step": 21160 + }, + { + "epoch": 1.6522282057285569, + "grad_norm": 53.4621467590332, + "learning_rate": 4.9875135556908376e-05, + "loss": 2.1503, + "step": 21170 + }, + { + "epoch": 1.6530086630765628, + "grad_norm": 2.5740723609924316, + "learning_rate": 4.9829730456990244e-05, + "loss": 0.1176, + "step": 21180 + }, + { + "epoch": 1.6537891204245687, + "grad_norm": 0.07874448597431183, + "learning_rate": 4.9784325497486e-05, + "loss": 0.2987, + "step": 21190 + }, + { + "epoch": 1.6545695777725746, + "grad_norm": 1.9069774150848389, + "learning_rate": 4.9738920715839105e-05, + "loss": 1.505, + "step": 21200 + }, + { + "epoch": 1.6553500351205805, + "grad_norm": 3.814951014646795e-06, + "learning_rate": 4.9693516149492924e-05, + "loss": 0.4965, + "step": 21210 + }, + { + "epoch": 1.6561304924685865, + "grad_norm": 4.037242889404297, + "learning_rate": 4.964811183589061e-05, + "loss": 0.1444, + "step": 21220 + }, + { + "epoch": 1.6569109498165924, + "grad_norm": 0.0023153889924287796, + "learning_rate": 4.960270781247515e-05, + "loss": 0.0273, + "step": 21230 + }, + { + "epoch": 1.6576914071645985, + "grad_norm": 3.0313501611090032e-06, + "learning_rate": 4.955730411668922e-05, + "loss": 0.5516, + "step": 21240 + }, + { + "epoch": 1.6584718645126044, + "grad_norm": 0.18385180830955505, + "learning_rate": 4.951190078597531e-05, + "loss": 0.7793, + "step": 21250 + }, + { + "epoch": 1.6592523218606103, + "grad_norm": 0.6903563737869263, + "learning_rate": 4.9466497857775544e-05, + "loss": 1.9214, + "step": 21260 + }, + { + "epoch": 1.6600327792086162, + "grad_norm": 0.040249597281217575, + "learning_rate": 4.9421095369531764e-05, + "loss": 2.2853, + "step": 21270 + }, + { + "epoch": 1.6608132365566222, + "grad_norm": 35.857269287109375, + "learning_rate": 4.9375693358685395e-05, + "loss": 2.6226, + "step": 21280 + }, + { + "epoch": 1.661593693904628, + "grad_norm": 11.268913269042969, + "learning_rate": 4.933029186267749e-05, + "loss": 0.6422, + "step": 21290 + }, + { + "epoch": 1.662374151252634, + "grad_norm": 2.7771138775278814e-05, + "learning_rate": 4.9284890918948734e-05, + "loss": 0.169, + "step": 21300 + }, + { + "epoch": 1.6631546086006401, + "grad_norm": 82.51126098632812, + "learning_rate": 4.9239490564939244e-05, + "loss": 1.7565, + "step": 21310 + }, + { + "epoch": 1.663935065948646, + "grad_norm": 18.174606323242188, + "learning_rate": 4.919409083808876e-05, + "loss": 0.4783, + "step": 21320 + }, + { + "epoch": 1.664715523296652, + "grad_norm": 16.663576126098633, + "learning_rate": 4.914869177583643e-05, + "loss": 0.6331, + "step": 21330 + }, + { + "epoch": 1.6654959806446579, + "grad_norm": 0.04770435020327568, + "learning_rate": 4.9103293415620916e-05, + "loss": 1.0681, + "step": 21340 + }, + { + "epoch": 1.6662764379926638, + "grad_norm": 7.188361167907715, + "learning_rate": 4.9057895794880224e-05, + "loss": 2.0846, + "step": 21350 + }, + { + "epoch": 1.6670568953406697, + "grad_norm": 48.30089569091797, + "learning_rate": 4.9012498951051825e-05, + "loss": 0.5371, + "step": 21360 + }, + { + "epoch": 1.6678373526886756, + "grad_norm": 0.17572489380836487, + "learning_rate": 4.8967102921572524e-05, + "loss": 0.6895, + "step": 21370 + }, + { + "epoch": 1.6686178100366815, + "grad_norm": 0.0517539419233799, + "learning_rate": 4.8921707743878404e-05, + "loss": 0.7653, + "step": 21380 + }, + { + "epoch": 1.6693982673846874, + "grad_norm": 32.768985748291016, + "learning_rate": 4.8876313455404934e-05, + "loss": 0.6787, + "step": 21390 + }, + { + "epoch": 1.6701787247326934, + "grad_norm": 6.695905176457018e-05, + "learning_rate": 4.883092009358678e-05, + "loss": 1.0841, + "step": 21400 + }, + { + "epoch": 1.6709591820806993, + "grad_norm": 0.32399168610572815, + "learning_rate": 4.87855276958579e-05, + "loss": 1.3301, + "step": 21410 + }, + { + "epoch": 1.6717396394287052, + "grad_norm": 0.19651849567890167, + "learning_rate": 4.874013629965138e-05, + "loss": 0.7622, + "step": 21420 + }, + { + "epoch": 1.672520096776711, + "grad_norm": 0.5481512546539307, + "learning_rate": 4.869474594239958e-05, + "loss": 0.3891, + "step": 21430 + }, + { + "epoch": 1.673300554124717, + "grad_norm": 7.097708225250244, + "learning_rate": 4.864935666153388e-05, + "loss": 0.1903, + "step": 21440 + }, + { + "epoch": 1.674081011472723, + "grad_norm": 0.6818715929985046, + "learning_rate": 4.860396849448492e-05, + "loss": 0.2203, + "step": 21450 + }, + { + "epoch": 1.6748614688207288, + "grad_norm": 41.85393142700195, + "learning_rate": 4.8558581478682294e-05, + "loss": 0.4238, + "step": 21460 + }, + { + "epoch": 1.6756419261687348, + "grad_norm": 53.905677795410156, + "learning_rate": 4.851319565155472e-05, + "loss": 1.0562, + "step": 21470 + }, + { + "epoch": 1.6764223835167407, + "grad_norm": 39.14933776855469, + "learning_rate": 4.846781105052989e-05, + "loss": 0.8403, + "step": 21480 + }, + { + "epoch": 1.6772028408647466, + "grad_norm": 1.2465424537658691, + "learning_rate": 4.842242771303451e-05, + "loss": 1.5033, + "step": 21490 + }, + { + "epoch": 1.6779832982127527, + "grad_norm": 8.150337219238281, + "learning_rate": 4.837704567649428e-05, + "loss": 2.6654, + "step": 21500 + }, + { + "epoch": 1.6787637555607586, + "grad_norm": 0.18484830856323242, + "learning_rate": 4.833166497833372e-05, + "loss": 1.4964, + "step": 21510 + }, + { + "epoch": 1.6795442129087645, + "grad_norm": 0.00015833714860491455, + "learning_rate": 4.828628565597636e-05, + "loss": 0.5441, + "step": 21520 + }, + { + "epoch": 1.6803246702567705, + "grad_norm": 50.95884323120117, + "learning_rate": 4.824090774684454e-05, + "loss": 1.7497, + "step": 21530 + }, + { + "epoch": 1.6811051276047764, + "grad_norm": 48.3619384765625, + "learning_rate": 4.8195531288359466e-05, + "loss": 1.1273, + "step": 21540 + }, + { + "epoch": 1.6818855849527823, + "grad_norm": 0.0017549977637827396, + "learning_rate": 4.815015631794108e-05, + "loss": 1.4486, + "step": 21550 + }, + { + "epoch": 1.6826660423007884, + "grad_norm": 0.11870098859071732, + "learning_rate": 4.810478287300817e-05, + "loss": 0.1412, + "step": 21560 + }, + { + "epoch": 1.6834464996487943, + "grad_norm": 0.0002780222857836634, + "learning_rate": 4.805941099097826e-05, + "loss": 1.6231, + "step": 21570 + }, + { + "epoch": 1.6842269569968003, + "grad_norm": 55.342716217041016, + "learning_rate": 4.801404070926751e-05, + "loss": 1.7901, + "step": 21580 + }, + { + "epoch": 1.6850074143448062, + "grad_norm": 3.412632465362549, + "learning_rate": 4.796867206529086e-05, + "loss": 0.0696, + "step": 21590 + }, + { + "epoch": 1.685787871692812, + "grad_norm": 6.7516374588012695, + "learning_rate": 4.792330509646182e-05, + "loss": 1.5025, + "step": 21600 + }, + { + "epoch": 1.686568329040818, + "grad_norm": 1.281043114431668e-05, + "learning_rate": 4.78779398401926e-05, + "loss": 0.4093, + "step": 21610 + }, + { + "epoch": 1.687348786388824, + "grad_norm": 37.101993560791016, + "learning_rate": 4.783257633389389e-05, + "loss": 1.4379, + "step": 21620 + }, + { + "epoch": 1.6881292437368298, + "grad_norm": 56.41515350341797, + "learning_rate": 4.778721461497504e-05, + "loss": 1.4588, + "step": 21630 + }, + { + "epoch": 1.6889097010848357, + "grad_norm": 5.122697984916158e-06, + "learning_rate": 4.774185472084386e-05, + "loss": 0.6948, + "step": 21640 + }, + { + "epoch": 1.6896901584328416, + "grad_norm": 63.40077209472656, + "learning_rate": 4.7696496688906704e-05, + "loss": 2.4883, + "step": 21650 + }, + { + "epoch": 1.6904706157808476, + "grad_norm": 4.130136403546203e-06, + "learning_rate": 4.765114055656834e-05, + "loss": 0.3206, + "step": 21660 + }, + { + "epoch": 1.6912510731288535, + "grad_norm": 0.13058346509933472, + "learning_rate": 4.7605786361232e-05, + "loss": 1.351, + "step": 21670 + }, + { + "epoch": 1.6920315304768594, + "grad_norm": 47.44347381591797, + "learning_rate": 4.756043414029932e-05, + "loss": 0.9191, + "step": 21680 + }, + { + "epoch": 1.6928119878248653, + "grad_norm": 0.0047844285145401955, + "learning_rate": 4.7515083931170284e-05, + "loss": 0.1568, + "step": 21690 + }, + { + "epoch": 1.6935924451728712, + "grad_norm": 43.56236267089844, + "learning_rate": 4.746973577124325e-05, + "loss": 1.2394, + "step": 21700 + }, + { + "epoch": 1.6943729025208771, + "grad_norm": 1.7203421592712402, + "learning_rate": 4.742438969791485e-05, + "loss": 0.6306, + "step": 21710 + }, + { + "epoch": 1.695153359868883, + "grad_norm": 0.0004018844338133931, + "learning_rate": 4.7379045748580056e-05, + "loss": 0.5336, + "step": 21720 + }, + { + "epoch": 1.695933817216889, + "grad_norm": 40.43912124633789, + "learning_rate": 4.733370396063199e-05, + "loss": 1.0372, + "step": 21730 + }, + { + "epoch": 1.6967142745648949, + "grad_norm": 12.819672584533691, + "learning_rate": 4.72883643714621e-05, + "loss": 0.8208, + "step": 21740 + }, + { + "epoch": 1.697494731912901, + "grad_norm": 3.561326957424171e-05, + "learning_rate": 4.7243027018459926e-05, + "loss": 0.7186, + "step": 21750 + }, + { + "epoch": 1.698275189260907, + "grad_norm": 41.82414245605469, + "learning_rate": 4.7197691939013243e-05, + "loss": 0.9432, + "step": 21760 + }, + { + "epoch": 1.6990556466089128, + "grad_norm": 0.019969845190644264, + "learning_rate": 4.715235917050791e-05, + "loss": 1.8267, + "step": 21770 + }, + { + "epoch": 1.6998361039569188, + "grad_norm": 0.06363899260759354, + "learning_rate": 4.710702875032785e-05, + "loss": 0.715, + "step": 21780 + }, + { + "epoch": 1.7006165613049247, + "grad_norm": 4.866072654724121, + "learning_rate": 4.706170071585512e-05, + "loss": 0.5587, + "step": 21790 + }, + { + "epoch": 1.7013970186529306, + "grad_norm": 0.034537654370069504, + "learning_rate": 4.701637510446976e-05, + "loss": 0.6576, + "step": 21800 + }, + { + "epoch": 1.7021774760009365, + "grad_norm": 13.53211498260498, + "learning_rate": 4.6971051953549855e-05, + "loss": 0.7478, + "step": 21810 + }, + { + "epoch": 1.7029579333489426, + "grad_norm": 0.01826702244579792, + "learning_rate": 4.692573130047139e-05, + "loss": 0.6309, + "step": 21820 + }, + { + "epoch": 1.7037383906969485, + "grad_norm": 33.52978515625, + "learning_rate": 4.688041318260836e-05, + "loss": 0.5877, + "step": 21830 + }, + { + "epoch": 1.7045188480449545, + "grad_norm": 0.7539353966712952, + "learning_rate": 4.683509763733263e-05, + "loss": 1.5259, + "step": 21840 + }, + { + "epoch": 1.7052993053929604, + "grad_norm": 100.28569793701172, + "learning_rate": 4.6789784702013975e-05, + "loss": 1.7804, + "step": 21850 + }, + { + "epoch": 1.7060797627409663, + "grad_norm": 4.1331446709591546e-07, + "learning_rate": 4.674447441401997e-05, + "loss": 1.0625, + "step": 21860 + }, + { + "epoch": 1.7068602200889722, + "grad_norm": 47.859371185302734, + "learning_rate": 4.669916681071604e-05, + "loss": 0.847, + "step": 21870 + }, + { + "epoch": 1.7076406774369781, + "grad_norm": 0.08291533589363098, + "learning_rate": 4.665386192946542e-05, + "loss": 0.652, + "step": 21880 + }, + { + "epoch": 1.708421134784984, + "grad_norm": 2.0455920696258545, + "learning_rate": 4.660855980762903e-05, + "loss": 1.7044, + "step": 21890 + }, + { + "epoch": 1.70920159213299, + "grad_norm": 0.001365151721984148, + "learning_rate": 4.6563260482565586e-05, + "loss": 1.3111, + "step": 21900 + }, + { + "epoch": 1.7099820494809959, + "grad_norm": 54.70901870727539, + "learning_rate": 4.651796399163145e-05, + "loss": 1.5678, + "step": 21910 + }, + { + "epoch": 1.7107625068290018, + "grad_norm": 39.61702346801758, + "learning_rate": 4.647267037218069e-05, + "loss": 0.67, + "step": 21920 + }, + { + "epoch": 1.7115429641770077, + "grad_norm": 0.0010940422071143985, + "learning_rate": 4.642737966156494e-05, + "loss": 0.415, + "step": 21930 + }, + { + "epoch": 1.7123234215250136, + "grad_norm": 0.5741721391677856, + "learning_rate": 4.638209189713351e-05, + "loss": 0.5323, + "step": 21940 + }, + { + "epoch": 1.7131038788730195, + "grad_norm": 0.0008567293407395482, + "learning_rate": 4.633680711623322e-05, + "loss": 0.9283, + "step": 21950 + }, + { + "epoch": 1.7138843362210254, + "grad_norm": 0.11211058497428894, + "learning_rate": 4.6291525356208495e-05, + "loss": 1.5876, + "step": 21960 + }, + { + "epoch": 1.7146647935690313, + "grad_norm": 1.0090612704516388e-05, + "learning_rate": 4.624624665440119e-05, + "loss": 0.0254, + "step": 21970 + }, + { + "epoch": 1.7154452509170373, + "grad_norm": 0.07422367483377457, + "learning_rate": 4.620097104815067e-05, + "loss": 1.1101, + "step": 21980 + }, + { + "epoch": 1.7162257082650432, + "grad_norm": 0.00016244892321992666, + "learning_rate": 4.615569857479382e-05, + "loss": 0.9351, + "step": 21990 + }, + { + "epoch": 1.717006165613049, + "grad_norm": 26.694711685180664, + "learning_rate": 4.611042927166479e-05, + "loss": 1.2162, + "step": 22000 + }, + { + "epoch": 1.7177866229610552, + "grad_norm": 0.0031326529569923878, + "learning_rate": 4.6065163176095256e-05, + "loss": 1.9415, + "step": 22010 + }, + { + "epoch": 1.7185670803090611, + "grad_norm": 0.0018533789552748203, + "learning_rate": 4.6019900325414164e-05, + "loss": 0.0851, + "step": 22020 + }, + { + "epoch": 1.719347537657067, + "grad_norm": 0.5745460391044617, + "learning_rate": 4.5974640756947864e-05, + "loss": 0.1837, + "step": 22030 + }, + { + "epoch": 1.720127995005073, + "grad_norm": 0.08341173827648163, + "learning_rate": 4.592938450801989e-05, + "loss": 0.0196, + "step": 22040 + }, + { + "epoch": 1.7209084523530789, + "grad_norm": 5.3973726608091965e-06, + "learning_rate": 4.588413161595114e-05, + "loss": 0.5012, + "step": 22050 + }, + { + "epoch": 1.7216889097010848, + "grad_norm": 1.1654116605086529e-07, + "learning_rate": 4.583888211805969e-05, + "loss": 0.0606, + "step": 22060 + }, + { + "epoch": 1.722469367049091, + "grad_norm": 0.05267452821135521, + "learning_rate": 4.5793636051660804e-05, + "loss": 0.0854, + "step": 22070 + }, + { + "epoch": 1.7232498243970968, + "grad_norm": 0.0005213894182816148, + "learning_rate": 4.574839345406699e-05, + "loss": 2.4604, + "step": 22080 + }, + { + "epoch": 1.7240302817451028, + "grad_norm": 0.0006774759967811406, + "learning_rate": 4.5703154362587795e-05, + "loss": 0.5281, + "step": 22090 + }, + { + "epoch": 1.7248107390931087, + "grad_norm": 0.256835401058197, + "learning_rate": 4.565791881452997e-05, + "loss": 0.548, + "step": 22100 + }, + { + "epoch": 1.7255911964411146, + "grad_norm": 75.47777557373047, + "learning_rate": 4.5612686847197264e-05, + "loss": 1.321, + "step": 22110 + }, + { + "epoch": 1.7263716537891205, + "grad_norm": 79.86677551269531, + "learning_rate": 4.5567458497890546e-05, + "loss": 1.6857, + "step": 22120 + }, + { + "epoch": 1.7271521111371264, + "grad_norm": 0.022913841530680656, + "learning_rate": 4.552223380390763e-05, + "loss": 2.9717, + "step": 22130 + }, + { + "epoch": 1.7279325684851323, + "grad_norm": 2.466359853744507, + "learning_rate": 4.547701280254338e-05, + "loss": 0.159, + "step": 22140 + }, + { + "epoch": 1.7287130258331382, + "grad_norm": 60.59958267211914, + "learning_rate": 4.543179553108958e-05, + "loss": 0.7301, + "step": 22150 + }, + { + "epoch": 1.7294934831811442, + "grad_norm": 3.465244313716198e-09, + "learning_rate": 4.5386582026834906e-05, + "loss": 0.2713, + "step": 22160 + }, + { + "epoch": 1.73027394052915, + "grad_norm": 2.2539663314819336, + "learning_rate": 4.534137232706501e-05, + "loss": 0.571, + "step": 22170 + }, + { + "epoch": 1.731054397877156, + "grad_norm": 0.3366568982601166, + "learning_rate": 4.529616646906233e-05, + "loss": 0.2032, + "step": 22180 + }, + { + "epoch": 1.731834855225162, + "grad_norm": 0.0013072154251858592, + "learning_rate": 4.525096449010621e-05, + "loss": 2.0557, + "step": 22190 + }, + { + "epoch": 1.7326153125731678, + "grad_norm": 6.300201416015625, + "learning_rate": 4.52057664274727e-05, + "loss": 1.4872, + "step": 22200 + }, + { + "epoch": 1.7333957699211737, + "grad_norm": 78.94749450683594, + "learning_rate": 4.516057231843471e-05, + "loss": 0.5405, + "step": 22210 + }, + { + "epoch": 1.7341762272691796, + "grad_norm": 1.786816561377691e-08, + "learning_rate": 4.511538220026182e-05, + "loss": 0.0601, + "step": 22220 + }, + { + "epoch": 1.7349566846171856, + "grad_norm": 0.12275684624910355, + "learning_rate": 4.5070196110220396e-05, + "loss": 0.9074, + "step": 22230 + }, + { + "epoch": 1.7357371419651915, + "grad_norm": 0.0003189127310179174, + "learning_rate": 4.502501408557339e-05, + "loss": 0.419, + "step": 22240 + }, + { + "epoch": 1.7365175993131974, + "grad_norm": 45.07834243774414, + "learning_rate": 4.497983616358048e-05, + "loss": 1.2748, + "step": 22250 + }, + { + "epoch": 1.7372980566612035, + "grad_norm": 6.688942448818125e-07, + "learning_rate": 4.493466238149793e-05, + "loss": 1.6876, + "step": 22260 + }, + { + "epoch": 1.7380785140092094, + "grad_norm": 0.005018897820264101, + "learning_rate": 4.4889492776578565e-05, + "loss": 1.7529, + "step": 22270 + }, + { + "epoch": 1.7388589713572153, + "grad_norm": 8.813771614768484e-07, + "learning_rate": 4.4844327386071804e-05, + "loss": 1.2264, + "step": 22280 + }, + { + "epoch": 1.7396394287052213, + "grad_norm": 0.0008363195811398327, + "learning_rate": 4.479916624722357e-05, + "loss": 0.9457, + "step": 22290 + }, + { + "epoch": 1.7404198860532272, + "grad_norm": 0.005100402981042862, + "learning_rate": 4.475400939727632e-05, + "loss": 0.3116, + "step": 22300 + }, + { + "epoch": 1.741200343401233, + "grad_norm": 1.4255502223968506, + "learning_rate": 4.470885687346889e-05, + "loss": 0.4434, + "step": 22310 + }, + { + "epoch": 1.741980800749239, + "grad_norm": 45.88622283935547, + "learning_rate": 4.466370871303664e-05, + "loss": 1.1204, + "step": 22320 + }, + { + "epoch": 1.7427612580972451, + "grad_norm": 0.23392795026302338, + "learning_rate": 4.461856495321124e-05, + "loss": 0.3264, + "step": 22330 + }, + { + "epoch": 1.743541715445251, + "grad_norm": 0.0455491878092289, + "learning_rate": 4.457342563122084e-05, + "loss": 0.4507, + "step": 22340 + }, + { + "epoch": 1.744322172793257, + "grad_norm": 0.003446984337642789, + "learning_rate": 4.4528290784289854e-05, + "loss": 0.9984, + "step": 22350 + }, + { + "epoch": 1.7451026301412629, + "grad_norm": 0.003921607509255409, + "learning_rate": 4.4483160449638995e-05, + "loss": 0.4333, + "step": 22360 + }, + { + "epoch": 1.7458830874892688, + "grad_norm": 49.32126998901367, + "learning_rate": 4.443803466448531e-05, + "loss": 2.0987, + "step": 22370 + }, + { + "epoch": 1.7466635448372747, + "grad_norm": 3.526056104874442e-07, + "learning_rate": 4.439291346604205e-05, + "loss": 0.4222, + "step": 22380 + }, + { + "epoch": 1.7474440021852806, + "grad_norm": 43.01560974121094, + "learning_rate": 4.4347796891518736e-05, + "loss": 1.6041, + "step": 22390 + }, + { + "epoch": 1.7482244595332865, + "grad_norm": 9.610936164855957, + "learning_rate": 4.430268497812099e-05, + "loss": 0.8194, + "step": 22400 + }, + { + "epoch": 1.7490049168812924, + "grad_norm": 10.678997039794922, + "learning_rate": 4.425757776305068e-05, + "loss": 0.561, + "step": 22410 + }, + { + "epoch": 1.7497853742292984, + "grad_norm": 8.100939750671387, + "learning_rate": 4.421247528350574e-05, + "loss": 0.5386, + "step": 22420 + }, + { + "epoch": 1.7505658315773043, + "grad_norm": 42.0233268737793, + "learning_rate": 4.416737757668025e-05, + "loss": 2.2196, + "step": 22430 + }, + { + "epoch": 1.7513462889253102, + "grad_norm": 83.78080749511719, + "learning_rate": 4.412228467976428e-05, + "loss": 1.5664, + "step": 22440 + }, + { + "epoch": 1.752126746273316, + "grad_norm": 17.094125747680664, + "learning_rate": 4.407719662994401e-05, + "loss": 0.5809, + "step": 22450 + }, + { + "epoch": 1.752907203621322, + "grad_norm": 11.183182716369629, + "learning_rate": 4.4032113464401585e-05, + "loss": 0.6073, + "step": 22460 + }, + { + "epoch": 1.753687660969328, + "grad_norm": 9.12381710804766e-06, + "learning_rate": 4.39870352203151e-05, + "loss": 0.9461, + "step": 22470 + }, + { + "epoch": 1.7544681183173338, + "grad_norm": 0.0001830037945182994, + "learning_rate": 4.394196193485865e-05, + "loss": 0.5364, + "step": 22480 + }, + { + "epoch": 1.7552485756653398, + "grad_norm": 1.9644840955734253, + "learning_rate": 4.389689364520219e-05, + "loss": 1.0993, + "step": 22490 + }, + { + "epoch": 1.7560290330133457, + "grad_norm": 3.125764851574786e-05, + "learning_rate": 4.3851830388511605e-05, + "loss": 0.8421, + "step": 22500 + }, + { + "epoch": 1.7568094903613516, + "grad_norm": 3.3445630833739415e-05, + "learning_rate": 4.380677220194855e-05, + "loss": 0.3314, + "step": 22510 + }, + { + "epoch": 1.7575899477093577, + "grad_norm": 0.02401496283710003, + "learning_rate": 4.3761719122670595e-05, + "loss": 0.5202, + "step": 22520 + }, + { + "epoch": 1.7583704050573636, + "grad_norm": 32.83649826049805, + "learning_rate": 4.371667118783101e-05, + "loss": 1.02, + "step": 22530 + }, + { + "epoch": 1.7591508624053696, + "grad_norm": 47.032676696777344, + "learning_rate": 4.367162843457891e-05, + "loss": 0.8202, + "step": 22540 + }, + { + "epoch": 1.7599313197533755, + "grad_norm": 34.51866912841797, + "learning_rate": 4.362659090005905e-05, + "loss": 0.5898, + "step": 22550 + }, + { + "epoch": 1.7607117771013814, + "grad_norm": 0.08327773213386536, + "learning_rate": 4.358155862141194e-05, + "loss": 1.7651, + "step": 22560 + }, + { + "epoch": 1.7614922344493873, + "grad_norm": 1.6261988878250122, + "learning_rate": 4.3536531635773745e-05, + "loss": 1.7785, + "step": 22570 + }, + { + "epoch": 1.7622726917973934, + "grad_norm": 1.250341534614563, + "learning_rate": 4.349150998027624e-05, + "loss": 1.6029, + "step": 22580 + }, + { + "epoch": 1.7630531491453993, + "grad_norm": 11.970406532287598, + "learning_rate": 4.344649369204683e-05, + "loss": 0.5072, + "step": 22590 + }, + { + "epoch": 1.7638336064934053, + "grad_norm": 1.0202245712280273, + "learning_rate": 4.340148280820848e-05, + "loss": 0.9217, + "step": 22600 + }, + { + "epoch": 1.7646140638414112, + "grad_norm": 34.642059326171875, + "learning_rate": 4.3356477365879725e-05, + "loss": 1.5028, + "step": 22610 + }, + { + "epoch": 1.765394521189417, + "grad_norm": 54.67207336425781, + "learning_rate": 4.331147740217457e-05, + "loss": 1.26, + "step": 22620 + }, + { + "epoch": 1.766174978537423, + "grad_norm": 30.081071853637695, + "learning_rate": 4.3266482954202545e-05, + "loss": 0.3095, + "step": 22630 + }, + { + "epoch": 1.766955435885429, + "grad_norm": 0.38360029458999634, + "learning_rate": 4.322149405906859e-05, + "loss": 0.93, + "step": 22640 + }, + { + "epoch": 1.7677358932334348, + "grad_norm": 0.012800214812159538, + "learning_rate": 4.3176510753873075e-05, + "loss": 0.9898, + "step": 22650 + }, + { + "epoch": 1.7685163505814407, + "grad_norm": 6.709954323014244e-05, + "learning_rate": 4.3131533075711825e-05, + "loss": 0.1898, + "step": 22660 + }, + { + "epoch": 1.7692968079294467, + "grad_norm": 0.01472716685384512, + "learning_rate": 4.308656106167591e-05, + "loss": 0.2401, + "step": 22670 + }, + { + "epoch": 1.7700772652774526, + "grad_norm": 2.2141734007163905e-05, + "learning_rate": 4.3041594748851835e-05, + "loss": 0.6846, + "step": 22680 + }, + { + "epoch": 1.7708577226254585, + "grad_norm": 65.04307556152344, + "learning_rate": 4.299663417432132e-05, + "loss": 2.621, + "step": 22690 + }, + { + "epoch": 1.7716381799734644, + "grad_norm": 0.0004803487390745431, + "learning_rate": 4.295167937516144e-05, + "loss": 0.4884, + "step": 22700 + }, + { + "epoch": 1.7724186373214703, + "grad_norm": 0.43179354071617126, + "learning_rate": 4.2906730388444406e-05, + "loss": 0.6171, + "step": 22710 + }, + { + "epoch": 1.7731990946694762, + "grad_norm": 0.8222143650054932, + "learning_rate": 4.2861787251237725e-05, + "loss": 0.7261, + "step": 22720 + }, + { + "epoch": 1.7739795520174821, + "grad_norm": 2.3041009100666088e-08, + "learning_rate": 4.2816850000604026e-05, + "loss": 0.0063, + "step": 22730 + }, + { + "epoch": 1.774760009365488, + "grad_norm": 0.0003917988215107471, + "learning_rate": 4.277191867360113e-05, + "loss": 0.5468, + "step": 22740 + }, + { + "epoch": 1.775540466713494, + "grad_norm": 0.06409639865159988, + "learning_rate": 4.27269933072819e-05, + "loss": 2.1791, + "step": 22750 + }, + { + "epoch": 1.7763209240614999, + "grad_norm": 0.0009544471395201981, + "learning_rate": 4.2682073938694355e-05, + "loss": 0.8525, + "step": 22760 + }, + { + "epoch": 1.777101381409506, + "grad_norm": 45.30816650390625, + "learning_rate": 4.263716060488155e-05, + "loss": 1.2142, + "step": 22770 + }, + { + "epoch": 1.777881838757512, + "grad_norm": 0.10559545457363129, + "learning_rate": 4.259225334288152e-05, + "loss": 0.6182, + "step": 22780 + }, + { + "epoch": 1.7786622961055178, + "grad_norm": 0.8708995580673218, + "learning_rate": 4.254735218972736e-05, + "loss": 1.1154, + "step": 22790 + }, + { + "epoch": 1.7794427534535238, + "grad_norm": 45.40227508544922, + "learning_rate": 4.2502457182447066e-05, + "loss": 1.7978, + "step": 22800 + }, + { + "epoch": 1.7802232108015297, + "grad_norm": 0.0006610558484680951, + "learning_rate": 4.245756835806363e-05, + "loss": 1.1483, + "step": 22810 + }, + { + "epoch": 1.7810036681495356, + "grad_norm": 0.0016300451243296266, + "learning_rate": 4.241268575359487e-05, + "loss": 1.3185, + "step": 22820 + }, + { + "epoch": 1.7817841254975415, + "grad_norm": 0.1046861782670021, + "learning_rate": 4.236780940605355e-05, + "loss": 0.4392, + "step": 22830 + }, + { + "epoch": 1.7825645828455476, + "grad_norm": 0.0039477222599089146, + "learning_rate": 4.232293935244722e-05, + "loss": 0.8003, + "step": 22840 + }, + { + "epoch": 1.7833450401935536, + "grad_norm": 0.0004634255019482225, + "learning_rate": 4.227807562977825e-05, + "loss": 0.0647, + "step": 22850 + }, + { + "epoch": 1.7841254975415595, + "grad_norm": 0.8950148224830627, + "learning_rate": 4.2233218275043805e-05, + "loss": 1.0977, + "step": 22860 + }, + { + "epoch": 1.7849059548895654, + "grad_norm": 28.8367977142334, + "learning_rate": 4.218836732523579e-05, + "loss": 0.7182, + "step": 22870 + }, + { + "epoch": 1.7856864122375713, + "grad_norm": 41.56075668334961, + "learning_rate": 4.214352281734085e-05, + "loss": 1.1103, + "step": 22880 + }, + { + "epoch": 1.7864668695855772, + "grad_norm": 46.85104751586914, + "learning_rate": 4.209868478834025e-05, + "loss": 1.8579, + "step": 22890 + }, + { + "epoch": 1.7872473269335831, + "grad_norm": 48.383506774902344, + "learning_rate": 4.2053853275210016e-05, + "loss": 0.811, + "step": 22900 + }, + { + "epoch": 1.788027784281589, + "grad_norm": 0.27532026171684265, + "learning_rate": 4.200902831492067e-05, + "loss": 0.1309, + "step": 22910 + }, + { + "epoch": 1.788808241629595, + "grad_norm": 0.017690371721982956, + "learning_rate": 4.196420994443749e-05, + "loss": 0.4197, + "step": 22920 + }, + { + "epoch": 1.7895886989776009, + "grad_norm": 1.1601506471633911, + "learning_rate": 4.191939820072016e-05, + "loss": 1.0285, + "step": 22930 + }, + { + "epoch": 1.7903691563256068, + "grad_norm": 0.032384805381298065, + "learning_rate": 4.1874593120723033e-05, + "loss": 0.5403, + "step": 22940 + }, + { + "epoch": 1.7911496136736127, + "grad_norm": 12.99577522277832, + "learning_rate": 4.1829794741394855e-05, + "loss": 0.6629, + "step": 22950 + }, + { + "epoch": 1.7919300710216186, + "grad_norm": 3.649952873274742e-08, + "learning_rate": 4.178500309967891e-05, + "loss": 0.9047, + "step": 22960 + }, + { + "epoch": 1.7927105283696245, + "grad_norm": 0.054159872233867645, + "learning_rate": 4.174021823251294e-05, + "loss": 0.772, + "step": 22970 + }, + { + "epoch": 1.7934909857176304, + "grad_norm": 1.9959516066592187e-06, + "learning_rate": 4.169544017682903e-05, + "loss": 1.3254, + "step": 22980 + }, + { + "epoch": 1.7942714430656364, + "grad_norm": 30.54090690612793, + "learning_rate": 4.1650668969553725e-05, + "loss": 2.8308, + "step": 22990 + }, + { + "epoch": 1.7950519004136423, + "grad_norm": 5.61015510559082, + "learning_rate": 4.160590464760787e-05, + "loss": 0.2375, + "step": 23000 + }, + { + "epoch": 1.7958323577616482, + "grad_norm": 26.482603073120117, + "learning_rate": 4.156114724790668e-05, + "loss": 0.2325, + "step": 23010 + }, + { + "epoch": 1.796612815109654, + "grad_norm": 0.11949822306632996, + "learning_rate": 4.151639680735959e-05, + "loss": 0.6167, + "step": 23020 + }, + { + "epoch": 1.7973932724576602, + "grad_norm": 29.800121307373047, + "learning_rate": 4.147165336287036e-05, + "loss": 0.2472, + "step": 23030 + }, + { + "epoch": 1.7981737298056661, + "grad_norm": 1.9849360342050204e-06, + "learning_rate": 4.142691695133698e-05, + "loss": 1.2709, + "step": 23040 + }, + { + "epoch": 1.798954187153672, + "grad_norm": 41.16581344604492, + "learning_rate": 4.138218760965157e-05, + "loss": 0.481, + "step": 23050 + }, + { + "epoch": 1.799734644501678, + "grad_norm": 1.2925261749785477e-08, + "learning_rate": 4.1337465374700514e-05, + "loss": 0.5889, + "step": 23060 + }, + { + "epoch": 1.8005151018496839, + "grad_norm": 0.0002364539832342416, + "learning_rate": 4.129275028336425e-05, + "loss": 0.6695, + "step": 23070 + }, + { + "epoch": 1.8012955591976898, + "grad_norm": 36.79536437988281, + "learning_rate": 4.1248042372517416e-05, + "loss": 1.4779, + "step": 23080 + }, + { + "epoch": 1.802076016545696, + "grad_norm": 0.17343249917030334, + "learning_rate": 4.120334167902863e-05, + "loss": 0.6679, + "step": 23090 + }, + { + "epoch": 1.8028564738937018, + "grad_norm": 20.67839813232422, + "learning_rate": 4.1158648239760625e-05, + "loss": 1.2699, + "step": 23100 + }, + { + "epoch": 1.8036369312417078, + "grad_norm": 1.2648146707761043e-07, + "learning_rate": 4.111396209157013e-05, + "loss": 0.3416, + "step": 23110 + }, + { + "epoch": 1.8044173885897137, + "grad_norm": 2.1918528148034966e-07, + "learning_rate": 4.106928327130789e-05, + "loss": 1.1927, + "step": 23120 + }, + { + "epoch": 1.8051978459377196, + "grad_norm": 9.444477081298828, + "learning_rate": 4.102461181581854e-05, + "loss": 1.6943, + "step": 23130 + }, + { + "epoch": 1.8059783032857255, + "grad_norm": 38.784950256347656, + "learning_rate": 4.0979947761940694e-05, + "loss": 1.1751, + "step": 23140 + }, + { + "epoch": 1.8067587606337314, + "grad_norm": 2.0578191595177486e-08, + "learning_rate": 4.093529114650688e-05, + "loss": 0.0777, + "step": 23150 + }, + { + "epoch": 1.8075392179817373, + "grad_norm": 0.9909342527389526, + "learning_rate": 4.089064200634343e-05, + "loss": 0.3616, + "step": 23160 + }, + { + "epoch": 1.8083196753297432, + "grad_norm": 16.76162338256836, + "learning_rate": 4.084600037827055e-05, + "loss": 0.6793, + "step": 23170 + }, + { + "epoch": 1.8091001326777492, + "grad_norm": 1.2102850632800255e-05, + "learning_rate": 4.080136629910224e-05, + "loss": 0.3691, + "step": 23180 + }, + { + "epoch": 1.809880590025755, + "grad_norm": 6.797701835632324, + "learning_rate": 4.075673980564632e-05, + "loss": 0.2886, + "step": 23190 + }, + { + "epoch": 1.810661047373761, + "grad_norm": 2.2534488053338464e-08, + "learning_rate": 4.071212093470426e-05, + "loss": 0.1335, + "step": 23200 + }, + { + "epoch": 1.811441504721767, + "grad_norm": 0.004243421368300915, + "learning_rate": 4.066750972307134e-05, + "loss": 0.2907, + "step": 23210 + }, + { + "epoch": 1.8122219620697728, + "grad_norm": 0.2348150759935379, + "learning_rate": 4.0622906207536445e-05, + "loss": 3.2972, + "step": 23220 + }, + { + "epoch": 1.8130024194177787, + "grad_norm": 1.3275203855300788e-05, + "learning_rate": 4.0578310424882173e-05, + "loss": 0.5777, + "step": 23230 + }, + { + "epoch": 1.8137828767657846, + "grad_norm": 33.510658264160156, + "learning_rate": 4.053372241188475e-05, + "loss": 0.8417, + "step": 23240 + }, + { + "epoch": 1.8145633341137906, + "grad_norm": 37.20769500732422, + "learning_rate": 4.04891422053139e-05, + "loss": 1.7718, + "step": 23250 + }, + { + "epoch": 1.8153437914617965, + "grad_norm": 1.933334715431556e-05, + "learning_rate": 4.0444569841933034e-05, + "loss": 0.7678, + "step": 23260 + }, + { + "epoch": 1.8161242488098024, + "grad_norm": 37.909122467041016, + "learning_rate": 4.0400005358499e-05, + "loss": 0.6111, + "step": 23270 + }, + { + "epoch": 1.8169047061578085, + "grad_norm": 8.029224395751953, + "learning_rate": 4.035544879176223e-05, + "loss": 0.9214, + "step": 23280 + }, + { + "epoch": 1.8176851635058144, + "grad_norm": 6.63645076751709, + "learning_rate": 4.031090017846653e-05, + "loss": 1.1639, + "step": 23290 + }, + { + "epoch": 1.8184656208538204, + "grad_norm": 3.859848737716675, + "learning_rate": 4.026635955534924e-05, + "loss": 0.2613, + "step": 23300 + }, + { + "epoch": 1.8192460782018263, + "grad_norm": 8.740327835083008, + "learning_rate": 4.022182695914105e-05, + "loss": 0.4547, + "step": 23310 + }, + { + "epoch": 1.8200265355498322, + "grad_norm": 0.1261139214038849, + "learning_rate": 4.0177302426566075e-05, + "loss": 1.1221, + "step": 23320 + }, + { + "epoch": 1.820806992897838, + "grad_norm": 0.04213743656873703, + "learning_rate": 4.013278599434173e-05, + "loss": 0.2915, + "step": 23330 + }, + { + "epoch": 1.8215874502458442, + "grad_norm": 0.00043451451347209513, + "learning_rate": 4.0088277699178777e-05, + "loss": 0.5183, + "step": 23340 + }, + { + "epoch": 1.8223679075938501, + "grad_norm": 0.0166937205940485, + "learning_rate": 4.004377757778131e-05, + "loss": 0.0948, + "step": 23350 + }, + { + "epoch": 1.823148364941856, + "grad_norm": 3.59621000289917, + "learning_rate": 3.999928566684657e-05, + "loss": 0.9385, + "step": 23360 + }, + { + "epoch": 1.823928822289862, + "grad_norm": 3.543958015939097e-08, + "learning_rate": 3.995480200306515e-05, + "loss": 0.9834, + "step": 23370 + }, + { + "epoch": 1.8247092796378679, + "grad_norm": 7.353454113006592, + "learning_rate": 3.9910326623120764e-05, + "loss": 1.4397, + "step": 23380 + }, + { + "epoch": 1.8254897369858738, + "grad_norm": 55.0924186706543, + "learning_rate": 3.986585956369036e-05, + "loss": 1.323, + "step": 23390 + }, + { + "epoch": 1.8262701943338797, + "grad_norm": 0.015754669904708862, + "learning_rate": 3.9821400861443916e-05, + "loss": 1.3286, + "step": 23400 + }, + { + "epoch": 1.8270506516818856, + "grad_norm": 0.4562488794326782, + "learning_rate": 3.977695055304464e-05, + "loss": 0.7128, + "step": 23410 + }, + { + "epoch": 1.8278311090298915, + "grad_norm": 43.183048248291016, + "learning_rate": 3.973250867514874e-05, + "loss": 0.9965, + "step": 23420 + }, + { + "epoch": 1.8286115663778975, + "grad_norm": 4.6577596890529094e-07, + "learning_rate": 3.9688075264405524e-05, + "loss": 0.2827, + "step": 23430 + }, + { + "epoch": 1.8293920237259034, + "grad_norm": 0.00020056143694091588, + "learning_rate": 3.964365035745726e-05, + "loss": 0.4973, + "step": 23440 + }, + { + "epoch": 1.8301724810739093, + "grad_norm": 31.505395889282227, + "learning_rate": 3.959923399093923e-05, + "loss": 1.1041, + "step": 23450 + }, + { + "epoch": 1.8309529384219152, + "grad_norm": 0.45826637744903564, + "learning_rate": 3.9554826201479716e-05, + "loss": 0.0045, + "step": 23460 + }, + { + "epoch": 1.8317333957699211, + "grad_norm": 46.24296188354492, + "learning_rate": 3.9510427025699834e-05, + "loss": 1.0564, + "step": 23470 + }, + { + "epoch": 1.832513853117927, + "grad_norm": 40.72001647949219, + "learning_rate": 3.94660365002137e-05, + "loss": 1.8935, + "step": 23480 + }, + { + "epoch": 1.833294310465933, + "grad_norm": 0.08623958379030228, + "learning_rate": 3.9421654661628185e-05, + "loss": 0.5077, + "step": 23490 + }, + { + "epoch": 1.8340747678139389, + "grad_norm": 0.00014190695947036147, + "learning_rate": 3.937728154654312e-05, + "loss": 0.6978, + "step": 23500 + }, + { + "epoch": 1.8348552251619448, + "grad_norm": 20.867637634277344, + "learning_rate": 3.9332917191551037e-05, + "loss": 0.49, + "step": 23510 + }, + { + "epoch": 1.8356356825099507, + "grad_norm": 0.35819879174232483, + "learning_rate": 3.928856163323733e-05, + "loss": 0.1014, + "step": 23520 + }, + { + "epoch": 1.8364161398579566, + "grad_norm": 70.75408935546875, + "learning_rate": 3.924421490818004e-05, + "loss": 3.2741, + "step": 23530 + }, + { + "epoch": 1.8371965972059627, + "grad_norm": 1.0380865234083103e-07, + "learning_rate": 3.919987705295001e-05, + "loss": 0.4643, + "step": 23540 + }, + { + "epoch": 1.8379770545539686, + "grad_norm": 0.1350480318069458, + "learning_rate": 3.915554810411074e-05, + "loss": 1.303, + "step": 23550 + }, + { + "epoch": 1.8387575119019746, + "grad_norm": 16.59437370300293, + "learning_rate": 3.911122809821836e-05, + "loss": 2.1433, + "step": 23560 + }, + { + "epoch": 1.8395379692499805, + "grad_norm": 53.31951141357422, + "learning_rate": 3.9066917071821675e-05, + "loss": 0.6518, + "step": 23570 + }, + { + "epoch": 1.8403184265979864, + "grad_norm": 0.13815993070602417, + "learning_rate": 3.9022615061462034e-05, + "loss": 1.1043, + "step": 23580 + }, + { + "epoch": 1.8410988839459923, + "grad_norm": 1.3820558786392212, + "learning_rate": 3.8978322103673397e-05, + "loss": 0.2971, + "step": 23590 + }, + { + "epoch": 1.8418793412939984, + "grad_norm": 0.158604234457016, + "learning_rate": 3.8934038234982214e-05, + "loss": 1.0221, + "step": 23600 + }, + { + "epoch": 1.8426597986420044, + "grad_norm": 0.006823586765676737, + "learning_rate": 3.888976349190748e-05, + "loss": 1.0428, + "step": 23610 + }, + { + "epoch": 1.8434402559900103, + "grad_norm": 0.2200150042772293, + "learning_rate": 3.884549791096062e-05, + "loss": 1.8464, + "step": 23620 + }, + { + "epoch": 1.8442207133380162, + "grad_norm": 2.2968795747146942e-05, + "learning_rate": 3.880124152864558e-05, + "loss": 1.5282, + "step": 23630 + }, + { + "epoch": 1.845001170686022, + "grad_norm": 18.445926666259766, + "learning_rate": 3.875699438145862e-05, + "loss": 0.2527, + "step": 23640 + }, + { + "epoch": 1.845781628034028, + "grad_norm": 49.831668853759766, + "learning_rate": 3.871275650588844e-05, + "loss": 1.8936, + "step": 23650 + }, + { + "epoch": 1.846562085382034, + "grad_norm": 0.025264471769332886, + "learning_rate": 3.8668527938416125e-05, + "loss": 0.463, + "step": 23660 + }, + { + "epoch": 1.8473425427300398, + "grad_norm": 7.791200914653018e-05, + "learning_rate": 3.8624308715515e-05, + "loss": 0.9327, + "step": 23670 + }, + { + "epoch": 1.8481230000780458, + "grad_norm": 15.435553550720215, + "learning_rate": 3.858009887365077e-05, + "loss": 1.1377, + "step": 23680 + }, + { + "epoch": 1.8489034574260517, + "grad_norm": 36.36300277709961, + "learning_rate": 3.8535898449281325e-05, + "loss": 1.2851, + "step": 23690 + }, + { + "epoch": 1.8496839147740576, + "grad_norm": 4.985684394836426, + "learning_rate": 3.8491707478856885e-05, + "loss": 0.0899, + "step": 23700 + }, + { + "epoch": 1.8504643721220635, + "grad_norm": 4.401831392897293e-05, + "learning_rate": 3.844752599881976e-05, + "loss": 1.7606, + "step": 23710 + }, + { + "epoch": 1.8512448294700694, + "grad_norm": 0.06194838881492615, + "learning_rate": 3.840335404560453e-05, + "loss": 0.4333, + "step": 23720 + }, + { + "epoch": 1.8520252868180753, + "grad_norm": 0.012446342036128044, + "learning_rate": 3.835919165563787e-05, + "loss": 0.1386, + "step": 23730 + }, + { + "epoch": 1.8528057441660812, + "grad_norm": 78.01200866699219, + "learning_rate": 3.8315038865338555e-05, + "loss": 1.6024, + "step": 23740 + }, + { + "epoch": 1.8535862015140872, + "grad_norm": 4.170412540435791, + "learning_rate": 3.82708957111175e-05, + "loss": 1.1088, + "step": 23750 + }, + { + "epoch": 1.854366658862093, + "grad_norm": 1.04580731203896e-05, + "learning_rate": 3.8226762229377614e-05, + "loss": 1.2849, + "step": 23760 + }, + { + "epoch": 1.855147116210099, + "grad_norm": 57.779754638671875, + "learning_rate": 3.818263845651389e-05, + "loss": 0.8573, + "step": 23770 + }, + { + "epoch": 1.855927573558105, + "grad_norm": 25.989601135253906, + "learning_rate": 3.813852442891324e-05, + "loss": 0.6765, + "step": 23780 + }, + { + "epoch": 1.856708030906111, + "grad_norm": 0.014243219047784805, + "learning_rate": 3.8094420182954624e-05, + "loss": 1.549, + "step": 23790 + }, + { + "epoch": 1.857488488254117, + "grad_norm": 0.006569798570126295, + "learning_rate": 3.805032575500885e-05, + "loss": 0.2146, + "step": 23800 + }, + { + "epoch": 1.8582689456021229, + "grad_norm": 0.029686694964766502, + "learning_rate": 3.800624118143869e-05, + "loss": 0.4949, + "step": 23810 + }, + { + "epoch": 1.8590494029501288, + "grad_norm": 6.546103000640869, + "learning_rate": 3.796216649859878e-05, + "loss": 2.2488, + "step": 23820 + }, + { + "epoch": 1.8598298602981347, + "grad_norm": 0.12819623947143555, + "learning_rate": 3.791810174283557e-05, + "loss": 0.9862, + "step": 23830 + }, + { + "epoch": 1.8606103176461406, + "grad_norm": 1.798675775527954, + "learning_rate": 3.7874046950487365e-05, + "loss": 0.4003, + "step": 23840 + }, + { + "epoch": 1.8613907749941467, + "grad_norm": 45.85932922363281, + "learning_rate": 3.78300021578842e-05, + "loss": 0.6263, + "step": 23850 + }, + { + "epoch": 1.8621712323421526, + "grad_norm": 7.008039474487305, + "learning_rate": 3.7785967401347944e-05, + "loss": 0.3043, + "step": 23860 + }, + { + "epoch": 1.8629516896901586, + "grad_norm": 21.64405059814453, + "learning_rate": 3.7741942717192094e-05, + "loss": 0.4736, + "step": 23870 + }, + { + "epoch": 1.8637321470381645, + "grad_norm": 2.6776916683957097e-08, + "learning_rate": 3.769792814172192e-05, + "loss": 0.4083, + "step": 23880 + }, + { + "epoch": 1.8645126043861704, + "grad_norm": 59.70551681518555, + "learning_rate": 3.7653923711234306e-05, + "loss": 1.3214, + "step": 23890 + }, + { + "epoch": 1.8652930617341763, + "grad_norm": 1.2317818800511304e-06, + "learning_rate": 3.7609929462017814e-05, + "loss": 1.1039, + "step": 23900 + }, + { + "epoch": 1.8660735190821822, + "grad_norm": 0.8701543211936951, + "learning_rate": 3.7565945430352547e-05, + "loss": 1.3019, + "step": 23910 + }, + { + "epoch": 1.8668539764301881, + "grad_norm": 0.3090846538543701, + "learning_rate": 3.752197165251025e-05, + "loss": 0.399, + "step": 23920 + }, + { + "epoch": 1.867634433778194, + "grad_norm": 27.304723739624023, + "learning_rate": 3.747800816475418e-05, + "loss": 0.537, + "step": 23930 + }, + { + "epoch": 1.8684148911262, + "grad_norm": 0.8099300265312195, + "learning_rate": 3.743405500333908e-05, + "loss": 0.825, + "step": 23940 + }, + { + "epoch": 1.8691953484742059, + "grad_norm": 43.0421142578125, + "learning_rate": 3.739011220451124e-05, + "loss": 1.1039, + "step": 23950 + }, + { + "epoch": 1.8699758058222118, + "grad_norm": 46.806846618652344, + "learning_rate": 3.7346179804508343e-05, + "loss": 0.7547, + "step": 23960 + }, + { + "epoch": 1.8707562631702177, + "grad_norm": 0.00016689456242602319, + "learning_rate": 3.730225783955956e-05, + "loss": 4.0015, + "step": 23970 + }, + { + "epoch": 1.8715367205182236, + "grad_norm": 0.00046824358287267387, + "learning_rate": 3.7258346345885383e-05, + "loss": 0.9711, + "step": 23980 + }, + { + "epoch": 1.8723171778662295, + "grad_norm": 13.834807395935059, + "learning_rate": 3.7214445359697735e-05, + "loss": 0.7928, + "step": 23990 + }, + { + "epoch": 1.8730976352142354, + "grad_norm": 0.018621161580085754, + "learning_rate": 3.717055491719982e-05, + "loss": 1.8895, + "step": 24000 + }, + { + "epoch": 1.8738780925622414, + "grad_norm": 0.5802863836288452, + "learning_rate": 3.712667505458622e-05, + "loss": 0.2756, + "step": 24010 + }, + { + "epoch": 1.8746585499102473, + "grad_norm": 46.72166061401367, + "learning_rate": 3.7082805808042696e-05, + "loss": 1.032, + "step": 24020 + }, + { + "epoch": 1.8754390072582532, + "grad_norm": 48.039520263671875, + "learning_rate": 3.703894721374632e-05, + "loss": 1.0127, + "step": 24030 + }, + { + "epoch": 1.876219464606259, + "grad_norm": 0.05087466910481453, + "learning_rate": 3.699509930786539e-05, + "loss": 1.2346, + "step": 24040 + }, + { + "epoch": 1.8769999219542652, + "grad_norm": 31.790119171142578, + "learning_rate": 3.6951262126559315e-05, + "loss": 0.2938, + "step": 24050 + }, + { + "epoch": 1.8777803793022712, + "grad_norm": 5.346080303192139, + "learning_rate": 3.690743570597874e-05, + "loss": 1.0873, + "step": 24060 + }, + { + "epoch": 1.878560836650277, + "grad_norm": 7.171860829657817e-07, + "learning_rate": 3.686362008226539e-05, + "loss": 0.0249, + "step": 24070 + }, + { + "epoch": 1.879341293998283, + "grad_norm": 0.0009441017173230648, + "learning_rate": 3.681981529155213e-05, + "loss": 1.2795, + "step": 24080 + }, + { + "epoch": 1.880121751346289, + "grad_norm": 7.874264156271238e-07, + "learning_rate": 3.677602136996282e-05, + "loss": 0.7461, + "step": 24090 + }, + { + "epoch": 1.8809022086942948, + "grad_norm": 1.7541738748550415, + "learning_rate": 3.673223835361244e-05, + "loss": 1.8159, + "step": 24100 + }, + { + "epoch": 1.881682666042301, + "grad_norm": 0.14342258870601654, + "learning_rate": 3.668846627860689e-05, + "loss": 0.9154, + "step": 24110 + }, + { + "epoch": 1.8824631233903069, + "grad_norm": 2.541520416343701e-07, + "learning_rate": 3.664470518104314e-05, + "loss": 2.0994, + "step": 24120 + }, + { + "epoch": 1.8832435807383128, + "grad_norm": 1.3055483577772975e-05, + "learning_rate": 3.6600955097009024e-05, + "loss": 1.2072, + "step": 24130 + }, + { + "epoch": 1.8840240380863187, + "grad_norm": 23.39139175415039, + "learning_rate": 3.655721606258334e-05, + "loss": 0.6577, + "step": 24140 + }, + { + "epoch": 1.8848044954343246, + "grad_norm": 6.286259651184082, + "learning_rate": 3.651348811383577e-05, + "loss": 0.2997, + "step": 24150 + }, + { + "epoch": 1.8855849527823305, + "grad_norm": 3.474562644958496, + "learning_rate": 3.646977128682684e-05, + "loss": 0.7834, + "step": 24160 + }, + { + "epoch": 1.8863654101303364, + "grad_norm": 16.204011917114258, + "learning_rate": 3.642606561760793e-05, + "loss": 1.7633, + "step": 24170 + }, + { + "epoch": 1.8871458674783423, + "grad_norm": 44.72784423828125, + "learning_rate": 3.6382371142221175e-05, + "loss": 0.5443, + "step": 24180 + }, + { + "epoch": 1.8879263248263483, + "grad_norm": 1.67126614769586e-07, + "learning_rate": 3.633868789669952e-05, + "loss": 1.0933, + "step": 24190 + }, + { + "epoch": 1.8887067821743542, + "grad_norm": 9.963812885871448e-09, + "learning_rate": 3.629501591706662e-05, + "loss": 0.6429, + "step": 24200 + }, + { + "epoch": 1.88948723952236, + "grad_norm": 2.0460259914398193, + "learning_rate": 3.625135523933689e-05, + "loss": 0.1334, + "step": 24210 + }, + { + "epoch": 1.890267696870366, + "grad_norm": 0.08299527317285538, + "learning_rate": 3.6207705899515355e-05, + "loss": 0.5164, + "step": 24220 + }, + { + "epoch": 1.891048154218372, + "grad_norm": 49.47283935546875, + "learning_rate": 3.6164067933597713e-05, + "loss": 0.6176, + "step": 24230 + }, + { + "epoch": 1.8918286115663778, + "grad_norm": 9.358882904052734, + "learning_rate": 3.6120441377570336e-05, + "loss": 0.5464, + "step": 24240 + }, + { + "epoch": 1.8926090689143837, + "grad_norm": 2.0653233528137207, + "learning_rate": 3.6076826267410095e-05, + "loss": 1.2293, + "step": 24250 + }, + { + "epoch": 1.8933895262623897, + "grad_norm": 0.00041112504550255835, + "learning_rate": 3.603322263908451e-05, + "loss": 0.4034, + "step": 24260 + }, + { + "epoch": 1.8941699836103956, + "grad_norm": 70.98471069335938, + "learning_rate": 3.598963052855157e-05, + "loss": 2.0693, + "step": 24270 + }, + { + "epoch": 1.8949504409584015, + "grad_norm": 1.5118367671966553, + "learning_rate": 3.594604997175981e-05, + "loss": 1.5463, + "step": 24280 + }, + { + "epoch": 1.8957308983064074, + "grad_norm": 0.04422280192375183, + "learning_rate": 3.590248100464818e-05, + "loss": 0.6958, + "step": 24290 + }, + { + "epoch": 1.8965113556544135, + "grad_norm": 6.168531399453059e-06, + "learning_rate": 3.5858923663146146e-05, + "loss": 0.7424, + "step": 24300 + }, + { + "epoch": 1.8972918130024194, + "grad_norm": 0.00010713309893617406, + "learning_rate": 3.5815377983173544e-05, + "loss": 1.1302, + "step": 24310 + }, + { + "epoch": 1.8980722703504254, + "grad_norm": 46.48796463012695, + "learning_rate": 3.577184400064057e-05, + "loss": 0.9124, + "step": 24320 + }, + { + "epoch": 1.8988527276984313, + "grad_norm": 15.14918041229248, + "learning_rate": 3.572832175144783e-05, + "loss": 1.08, + "step": 24330 + }, + { + "epoch": 1.8996331850464372, + "grad_norm": 4.831262049265206e-05, + "learning_rate": 3.568481127148621e-05, + "loss": 0.0379, + "step": 24340 + }, + { + "epoch": 1.900413642394443, + "grad_norm": 71.39578247070312, + "learning_rate": 3.5641312596636937e-05, + "loss": 1.7415, + "step": 24350 + }, + { + "epoch": 1.9011940997424492, + "grad_norm": 1.3533830642700195, + "learning_rate": 3.559782576277142e-05, + "loss": 0.5327, + "step": 24360 + }, + { + "epoch": 1.9019745570904552, + "grad_norm": 0.0841241180896759, + "learning_rate": 3.555435080575141e-05, + "loss": 1.65, + "step": 24370 + }, + { + "epoch": 1.902755014438461, + "grad_norm": 8.758081436157227, + "learning_rate": 3.5510887761428765e-05, + "loss": 2.5704, + "step": 24380 + }, + { + "epoch": 1.903535471786467, + "grad_norm": 10.82071304321289, + "learning_rate": 3.54674366656456e-05, + "loss": 1.8161, + "step": 24390 + }, + { + "epoch": 1.904315929134473, + "grad_norm": 16.516754150390625, + "learning_rate": 3.542399755423412e-05, + "loss": 1.1733, + "step": 24400 + }, + { + "epoch": 1.9050963864824788, + "grad_norm": 0.3717861473560333, + "learning_rate": 3.5380570463016686e-05, + "loss": 0.4364, + "step": 24410 + }, + { + "epoch": 1.9058768438304847, + "grad_norm": 2.7536627385416068e-05, + "learning_rate": 3.5337155427805694e-05, + "loss": 0.9228, + "step": 24420 + }, + { + "epoch": 1.9066573011784906, + "grad_norm": 21.14723777770996, + "learning_rate": 3.529375248440365e-05, + "loss": 0.1477, + "step": 24430 + }, + { + "epoch": 1.9074377585264966, + "grad_norm": 1.0672245025634766, + "learning_rate": 3.525036166860309e-05, + "loss": 1.4234, + "step": 24440 + }, + { + "epoch": 1.9082182158745025, + "grad_norm": 31.768104553222656, + "learning_rate": 3.520698301618649e-05, + "loss": 0.4301, + "step": 24450 + }, + { + "epoch": 1.9089986732225084, + "grad_norm": 0.653933048248291, + "learning_rate": 3.516361656292636e-05, + "loss": 0.687, + "step": 24460 + }, + { + "epoch": 1.9097791305705143, + "grad_norm": 34.76579284667969, + "learning_rate": 3.512026234458511e-05, + "loss": 1.8729, + "step": 24470 + }, + { + "epoch": 1.9105595879185202, + "grad_norm": 0.0032537386287003756, + "learning_rate": 3.50769203969151e-05, + "loss": 1.5925, + "step": 24480 + }, + { + "epoch": 1.9113400452665261, + "grad_norm": 2.1661665439605713, + "learning_rate": 3.503359075565852e-05, + "loss": 0.2118, + "step": 24490 + }, + { + "epoch": 1.912120502614532, + "grad_norm": 0.00010317000123905018, + "learning_rate": 3.499027345654745e-05, + "loss": 0.8827, + "step": 24500 + }, + { + "epoch": 1.912900959962538, + "grad_norm": 0.04974118992686272, + "learning_rate": 3.4946968535303784e-05, + "loss": 1.744, + "step": 24510 + }, + { + "epoch": 1.9136814173105439, + "grad_norm": 0.6756658554077148, + "learning_rate": 3.490367602763916e-05, + "loss": 0.7952, + "step": 24520 + }, + { + "epoch": 1.9144618746585498, + "grad_norm": 30.926450729370117, + "learning_rate": 3.486039596925509e-05, + "loss": 1.3325, + "step": 24530 + }, + { + "epoch": 1.9152423320065557, + "grad_norm": 9.425595635548234e-05, + "learning_rate": 3.481712839584269e-05, + "loss": 1.4034, + "step": 24540 + }, + { + "epoch": 1.9160227893545618, + "grad_norm": 26.642850875854492, + "learning_rate": 3.4773873343082905e-05, + "loss": 1.6034, + "step": 24550 + }, + { + "epoch": 1.9168032467025677, + "grad_norm": 0.25802430510520935, + "learning_rate": 3.473063084664623e-05, + "loss": 0.5995, + "step": 24560 + }, + { + "epoch": 1.9175837040505737, + "grad_norm": 0.4528713822364807, + "learning_rate": 3.468740094219291e-05, + "loss": 0.1909, + "step": 24570 + }, + { + "epoch": 1.9183641613985796, + "grad_norm": 15.111522674560547, + "learning_rate": 3.464418366537273e-05, + "loss": 1.5801, + "step": 24580 + }, + { + "epoch": 1.9191446187465855, + "grad_norm": 1.4172719717025757, + "learning_rate": 3.4600979051825134e-05, + "loss": 0.3448, + "step": 24590 + }, + { + "epoch": 1.9199250760945914, + "grad_norm": 36.149879455566406, + "learning_rate": 3.455778713717905e-05, + "loss": 0.9534, + "step": 24600 + }, + { + "epoch": 1.9207055334425973, + "grad_norm": 0.8631346821784973, + "learning_rate": 3.451460795705299e-05, + "loss": 0.472, + "step": 24610 + }, + { + "epoch": 1.9214859907906034, + "grad_norm": 37.74773406982422, + "learning_rate": 3.447144154705494e-05, + "loss": 0.6436, + "step": 24620 + }, + { + "epoch": 1.9222664481386094, + "grad_norm": 1.151184278569417e-06, + "learning_rate": 3.442828794278233e-05, + "loss": 0.7656, + "step": 24630 + }, + { + "epoch": 1.9230469054866153, + "grad_norm": 0.5503702163696289, + "learning_rate": 3.43851471798221e-05, + "loss": 1.6578, + "step": 24640 + }, + { + "epoch": 1.9238273628346212, + "grad_norm": 0.8215304017066956, + "learning_rate": 3.434201929375051e-05, + "loss": 1.3437, + "step": 24650 + }, + { + "epoch": 1.924607820182627, + "grad_norm": 2.416992629150627e-06, + "learning_rate": 3.42989043201333e-05, + "loss": 0.6802, + "step": 24660 + }, + { + "epoch": 1.925388277530633, + "grad_norm": 26.039220809936523, + "learning_rate": 3.425580229452546e-05, + "loss": 0.4974, + "step": 24670 + }, + { + "epoch": 1.926168734878639, + "grad_norm": 0.001985641196370125, + "learning_rate": 3.42127132524714e-05, + "loss": 1.6364, + "step": 24680 + }, + { + "epoch": 1.9269491922266448, + "grad_norm": 2.4921052954596234e-07, + "learning_rate": 3.416963722950472e-05, + "loss": 0.524, + "step": 24690 + }, + { + "epoch": 1.9277296495746508, + "grad_norm": 0.6401686072349548, + "learning_rate": 3.412657426114839e-05, + "loss": 1.4678, + "step": 24700 + }, + { + "epoch": 1.9285101069226567, + "grad_norm": 3.618363618850708, + "learning_rate": 3.408352438291456e-05, + "loss": 0.7004, + "step": 24710 + }, + { + "epoch": 1.9292905642706626, + "grad_norm": 0.29783499240875244, + "learning_rate": 3.4040487630304536e-05, + "loss": 0.757, + "step": 24720 + }, + { + "epoch": 1.9300710216186685, + "grad_norm": 0.0320902019739151, + "learning_rate": 3.3997464038808904e-05, + "loss": 0.8225, + "step": 24730 + }, + { + "epoch": 1.9308514789666744, + "grad_norm": 48.40365219116211, + "learning_rate": 3.395445364390732e-05, + "loss": 1.3788, + "step": 24740 + }, + { + "epoch": 1.9316319363146803, + "grad_norm": 0.07659325748682022, + "learning_rate": 3.391145648106861e-05, + "loss": 1.8757, + "step": 24750 + }, + { + "epoch": 1.9324123936626862, + "grad_norm": 38.49648666381836, + "learning_rate": 3.3868472585750625e-05, + "loss": 0.7217, + "step": 24760 + }, + { + "epoch": 1.9331928510106922, + "grad_norm": 2.974672270283918e-06, + "learning_rate": 3.3825501993400335e-05, + "loss": 0.3299, + "step": 24770 + }, + { + "epoch": 1.933973308358698, + "grad_norm": 0.9465207457542419, + "learning_rate": 3.378254473945369e-05, + "loss": 0.9885, + "step": 24780 + }, + { + "epoch": 1.934753765706704, + "grad_norm": 0.0180289875715971, + "learning_rate": 3.373960085933571e-05, + "loss": 0.486, + "step": 24790 + }, + { + "epoch": 1.93553422305471, + "grad_norm": 0.6527422070503235, + "learning_rate": 3.3696670388460304e-05, + "loss": 0.4105, + "step": 24800 + }, + { + "epoch": 1.936314680402716, + "grad_norm": 3.048807382583618, + "learning_rate": 3.3653753362230365e-05, + "loss": 0.2045, + "step": 24810 + }, + { + "epoch": 1.937095137750722, + "grad_norm": 0.7973394393920898, + "learning_rate": 3.3610849816037715e-05, + "loss": 0.7687, + "step": 24820 + }, + { + "epoch": 1.9378755950987279, + "grad_norm": 30.129440307617188, + "learning_rate": 3.3567959785263004e-05, + "loss": 0.5779, + "step": 24830 + }, + { + "epoch": 1.9386560524467338, + "grad_norm": 7.424580417136895e-08, + "learning_rate": 3.3525083305275806e-05, + "loss": 1.9567, + "step": 24840 + }, + { + "epoch": 1.9394365097947397, + "grad_norm": 50.41664123535156, + "learning_rate": 3.3482220411434454e-05, + "loss": 1.6986, + "step": 24850 + }, + { + "epoch": 1.9402169671427456, + "grad_norm": 4.4609785079956055, + "learning_rate": 3.343937113908615e-05, + "loss": 1.3763, + "step": 24860 + }, + { + "epoch": 1.9409974244907517, + "grad_norm": 0.0026027297135442495, + "learning_rate": 3.339653552356677e-05, + "loss": 1.1527, + "step": 24870 + }, + { + "epoch": 1.9417778818387577, + "grad_norm": 9.384146324009635e-06, + "learning_rate": 3.335371360020102e-05, + "loss": 1.4681, + "step": 24880 + }, + { + "epoch": 1.9425583391867636, + "grad_norm": 0.36783191561698914, + "learning_rate": 3.3310905404302256e-05, + "loss": 1.0031, + "step": 24890 + }, + { + "epoch": 1.9433387965347695, + "grad_norm": 1.8857052326202393, + "learning_rate": 3.326811097117255e-05, + "loss": 0.7547, + "step": 24900 + }, + { + "epoch": 1.9441192538827754, + "grad_norm": 0.0002518291294109076, + "learning_rate": 3.322533033610259e-05, + "loss": 0.503, + "step": 24910 + }, + { + "epoch": 1.9448997112307813, + "grad_norm": 0.11181085556745529, + "learning_rate": 3.318256353437169e-05, + "loss": 0.5882, + "step": 24920 + }, + { + "epoch": 1.9456801685787872, + "grad_norm": 0.29577258229255676, + "learning_rate": 3.31398106012478e-05, + "loss": 0.5357, + "step": 24930 + }, + { + "epoch": 1.9464606259267931, + "grad_norm": 49.33049774169922, + "learning_rate": 3.309707157198737e-05, + "loss": 1.7088, + "step": 24940 + }, + { + "epoch": 1.947241083274799, + "grad_norm": 3.9758615493774414, + "learning_rate": 3.305434648183544e-05, + "loss": 0.3997, + "step": 24950 + }, + { + "epoch": 1.948021540622805, + "grad_norm": 0.04083220288157463, + "learning_rate": 3.3011635366025484e-05, + "loss": 0.0144, + "step": 24960 + }, + { + "epoch": 1.9488019979708109, + "grad_norm": 7.539558410644531, + "learning_rate": 3.296893825977957e-05, + "loss": 0.7937, + "step": 24970 + }, + { + "epoch": 1.9495824553188168, + "grad_norm": 7.05432341874257e-07, + "learning_rate": 3.292625519830808e-05, + "loss": 1.4518, + "step": 24980 + }, + { + "epoch": 1.9503629126668227, + "grad_norm": 0.09120603650808334, + "learning_rate": 3.288358621680992e-05, + "loss": 0.6368, + "step": 24990 + }, + { + "epoch": 1.9511433700148286, + "grad_norm": 0.2118591070175171, + "learning_rate": 3.284093135047231e-05, + "loss": 0.1397, + "step": 25000 + }, + { + "epoch": 1.9519238273628345, + "grad_norm": 0.033967263996601105, + "learning_rate": 3.279829063447084e-05, + "loss": 1.0633, + "step": 25010 + }, + { + "epoch": 1.9527042847108405, + "grad_norm": 1.3979352712631226, + "learning_rate": 3.275566410396951e-05, + "loss": 0.2829, + "step": 25020 + }, + { + "epoch": 1.9534847420588464, + "grad_norm": 36.74382019042969, + "learning_rate": 3.271305179412051e-05, + "loss": 0.7056, + "step": 25030 + }, + { + "epoch": 1.9542651994068523, + "grad_norm": 78.06299591064453, + "learning_rate": 3.267045374006438e-05, + "loss": 1.8314, + "step": 25040 + }, + { + "epoch": 1.9550456567548582, + "grad_norm": 8.80843734741211, + "learning_rate": 3.2627869976929856e-05, + "loss": 1.2586, + "step": 25050 + }, + { + "epoch": 1.9558261141028643, + "grad_norm": 33.692222595214844, + "learning_rate": 3.258530053983396e-05, + "loss": 1.4325, + "step": 25060 + }, + { + "epoch": 1.9566065714508702, + "grad_norm": 51.98230743408203, + "learning_rate": 3.2542745463881794e-05, + "loss": 1.252, + "step": 25070 + }, + { + "epoch": 1.9573870287988762, + "grad_norm": 1.1896955966949463, + "learning_rate": 3.250020478416671e-05, + "loss": 3.1767, + "step": 25080 + }, + { + "epoch": 1.958167486146882, + "grad_norm": 0.004445586819201708, + "learning_rate": 3.245767853577013e-05, + "loss": 0.2778, + "step": 25090 + }, + { + "epoch": 1.958947943494888, + "grad_norm": 0.015711573883891106, + "learning_rate": 3.241516675376164e-05, + "loss": 0.5651, + "step": 25100 + }, + { + "epoch": 1.959728400842894, + "grad_norm": 42.32749557495117, + "learning_rate": 3.2372669473198816e-05, + "loss": 0.5504, + "step": 25110 + }, + { + "epoch": 1.9605088581908998, + "grad_norm": 0.5061352849006653, + "learning_rate": 3.233018672912731e-05, + "loss": 1.652, + "step": 25120 + }, + { + "epoch": 1.961289315538906, + "grad_norm": 1.3850916624069214, + "learning_rate": 3.228771855658082e-05, + "loss": 0.4327, + "step": 25130 + }, + { + "epoch": 1.9620697728869119, + "grad_norm": 0.775698721408844, + "learning_rate": 3.224526499058096e-05, + "loss": 0.7714, + "step": 25140 + }, + { + "epoch": 1.9628502302349178, + "grad_norm": 46.00149154663086, + "learning_rate": 3.220282606613737e-05, + "loss": 0.8217, + "step": 25150 + }, + { + "epoch": 1.9636306875829237, + "grad_norm": 15.187111854553223, + "learning_rate": 3.2160401818247556e-05, + "loss": 1.13, + "step": 25160 + }, + { + "epoch": 1.9644111449309296, + "grad_norm": 27.419458389282227, + "learning_rate": 3.211799228189697e-05, + "loss": 1.1076, + "step": 25170 + }, + { + "epoch": 1.9651916022789355, + "grad_norm": 7.90415215305984e-05, + "learning_rate": 3.207559749205888e-05, + "loss": 0.0562, + "step": 25180 + }, + { + "epoch": 1.9659720596269414, + "grad_norm": 41.95665740966797, + "learning_rate": 3.2033217483694455e-05, + "loss": 0.5071, + "step": 25190 + }, + { + "epoch": 1.9667525169749474, + "grad_norm": 0.020275263115763664, + "learning_rate": 3.199085229175263e-05, + "loss": 0.9272, + "step": 25200 + }, + { + "epoch": 1.9675329743229533, + "grad_norm": 0.17847752571105957, + "learning_rate": 3.194850195117011e-05, + "loss": 0.6496, + "step": 25210 + }, + { + "epoch": 1.9683134316709592, + "grad_norm": 0.0055974675342440605, + "learning_rate": 3.19061664968714e-05, + "loss": 0.8146, + "step": 25220 + }, + { + "epoch": 1.969093889018965, + "grad_norm": 15.684666633605957, + "learning_rate": 3.1863845963768685e-05, + "loss": 0.8318, + "step": 25230 + }, + { + "epoch": 1.969874346366971, + "grad_norm": 60.0120964050293, + "learning_rate": 3.1821540386761894e-05, + "loss": 1.1431, + "step": 25240 + }, + { + "epoch": 1.970654803714977, + "grad_norm": 22.573179244995117, + "learning_rate": 3.177924980073856e-05, + "loss": 1.4687, + "step": 25250 + }, + { + "epoch": 1.9714352610629828, + "grad_norm": 12.676541328430176, + "learning_rate": 3.173697424057391e-05, + "loss": 0.347, + "step": 25260 + }, + { + "epoch": 1.9722157184109887, + "grad_norm": 3.967153787612915, + "learning_rate": 3.1694713741130744e-05, + "loss": 0.12, + "step": 25270 + }, + { + "epoch": 1.9729961757589947, + "grad_norm": 0.0006802030839025974, + "learning_rate": 3.165246833725946e-05, + "loss": 0.3, + "step": 25280 + }, + { + "epoch": 1.9737766331070006, + "grad_norm": 45.15895080566406, + "learning_rate": 3.1610238063798e-05, + "loss": 0.2775, + "step": 25290 + }, + { + "epoch": 1.9745570904550065, + "grad_norm": 50.831687927246094, + "learning_rate": 3.1568022955571824e-05, + "loss": 0.9303, + "step": 25300 + }, + { + "epoch": 1.9753375478030124, + "grad_norm": 0.08076174557209015, + "learning_rate": 3.15258230473939e-05, + "loss": 1.0626, + "step": 25310 + }, + { + "epoch": 1.9761180051510185, + "grad_norm": 1.8216647079682957e-09, + "learning_rate": 3.148363837406464e-05, + "loss": 0.2984, + "step": 25320 + }, + { + "epoch": 1.9768984624990245, + "grad_norm": 30.55402374267578, + "learning_rate": 3.144146897037194e-05, + "loss": 1.9792, + "step": 25330 + }, + { + "epoch": 1.9776789198470304, + "grad_norm": 52.432830810546875, + "learning_rate": 3.139931487109102e-05, + "loss": 1.3619, + "step": 25340 + }, + { + "epoch": 1.9784593771950363, + "grad_norm": 0.5994627475738525, + "learning_rate": 3.135717611098458e-05, + "loss": 0.7943, + "step": 25350 + }, + { + "epoch": 1.9792398345430422, + "grad_norm": 0.807468593120575, + "learning_rate": 3.1315052724802566e-05, + "loss": 1.75, + "step": 25360 + }, + { + "epoch": 1.9800202918910481, + "grad_norm": 0.4832046627998352, + "learning_rate": 3.127294474728236e-05, + "loss": 1.0956, + "step": 25370 + }, + { + "epoch": 1.9808007492390542, + "grad_norm": 7.688863754272461, + "learning_rate": 3.123085221314852e-05, + "loss": 0.3429, + "step": 25380 + }, + { + "epoch": 1.9815812065870602, + "grad_norm": 2.0159497580607422e-05, + "learning_rate": 3.118877515711295e-05, + "loss": 0.4776, + "step": 25390 + }, + { + "epoch": 1.982361663935066, + "grad_norm": 0.0029121001716703176, + "learning_rate": 3.1146713613874775e-05, + "loss": 0.0429, + "step": 25400 + }, + { + "epoch": 1.983142121283072, + "grad_norm": 70.2876205444336, + "learning_rate": 3.110466761812029e-05, + "loss": 0.2696, + "step": 25410 + }, + { + "epoch": 1.983922578631078, + "grad_norm": 5.711496275928596e-10, + "learning_rate": 3.1062637204523014e-05, + "loss": 0.8996, + "step": 25420 + }, + { + "epoch": 1.9847030359790838, + "grad_norm": 0.5862593054771423, + "learning_rate": 3.102062240774359e-05, + "loss": 1.1331, + "step": 25430 + }, + { + "epoch": 1.9854834933270897, + "grad_norm": 85.54405212402344, + "learning_rate": 3.0978623262429806e-05, + "loss": 1.5893, + "step": 25440 + }, + { + "epoch": 1.9862639506750956, + "grad_norm": 6.912586286489386e-06, + "learning_rate": 3.093663980321649e-05, + "loss": 1.6659, + "step": 25450 + }, + { + "epoch": 1.9870444080231016, + "grad_norm": 1.353969931602478, + "learning_rate": 3.0894672064725614e-05, + "loss": 0.3165, + "step": 25460 + }, + { + "epoch": 1.9878248653711075, + "grad_norm": 1.1379843734005135e-09, + "learning_rate": 3.085272008156611e-05, + "loss": 0.721, + "step": 25470 + }, + { + "epoch": 1.9886053227191134, + "grad_norm": 0.2630331218242645, + "learning_rate": 3.0810783888333996e-05, + "loss": 0.518, + "step": 25480 + }, + { + "epoch": 1.9893857800671193, + "grad_norm": 0.31837162375450134, + "learning_rate": 3.0768863519612167e-05, + "loss": 0.9379, + "step": 25490 + }, + { + "epoch": 1.9901662374151252, + "grad_norm": 5.576870441436768, + "learning_rate": 3.072695900997055e-05, + "loss": 0.031, + "step": 25500 + }, + { + "epoch": 1.9909466947631311, + "grad_norm": 2.1766440868377686, + "learning_rate": 3.0685070393965974e-05, + "loss": 2.7766, + "step": 25510 + }, + { + "epoch": 1.991727152111137, + "grad_norm": 0.016400795429944992, + "learning_rate": 3.064319770614213e-05, + "loss": 0.931, + "step": 25520 + }, + { + "epoch": 1.992507609459143, + "grad_norm": 12.725517272949219, + "learning_rate": 3.060134098102965e-05, + "loss": 2.5748, + "step": 25530 + }, + { + "epoch": 1.9932880668071489, + "grad_norm": 0.0008636490674689412, + "learning_rate": 3.055950025314588e-05, + "loss": 0.8295, + "step": 25540 + }, + { + "epoch": 1.9940685241551548, + "grad_norm": 0.12384522706270218, + "learning_rate": 3.0517675556995116e-05, + "loss": 0.0936, + "step": 25550 + }, + { + "epoch": 1.9948489815031607, + "grad_norm": 61.26963806152344, + "learning_rate": 3.047586692706831e-05, + "loss": 0.9006, + "step": 25560 + }, + { + "epoch": 1.9956294388511668, + "grad_norm": 2.5371127421180972e-08, + "learning_rate": 3.043407439784325e-05, + "loss": 0.2862, + "step": 25570 + }, + { + "epoch": 1.9964098961991728, + "grad_norm": 36.4479866027832, + "learning_rate": 3.0392298003784382e-05, + "loss": 1.0143, + "step": 25580 + }, + { + "epoch": 1.9971903535471787, + "grad_norm": 0.007001777645200491, + "learning_rate": 3.0350537779342914e-05, + "loss": 0.9285, + "step": 25590 + }, + { + "epoch": 1.9979708108951846, + "grad_norm": 42.04073715209961, + "learning_rate": 3.0308793758956667e-05, + "loss": 1.0238, + "step": 25600 + }, + { + "epoch": 1.9987512682431905, + "grad_norm": 0.0021264858078211546, + "learning_rate": 3.02670659770501e-05, + "loss": 1.5027, + "step": 25610 + }, + { + "epoch": 1.9995317255911964, + "grad_norm": 0.19171792268753052, + "learning_rate": 3.0225354468034317e-05, + "loss": 1.6025, + "step": 25620 + }, + { + "epoch": 2.0003121829392025, + "grad_norm": 98.07879638671875, + "learning_rate": 3.0183659266306964e-05, + "loss": 1.0799, + "step": 25630 + }, + { + "epoch": 2.0010926402872085, + "grad_norm": 0.38005921244621277, + "learning_rate": 3.014198040625229e-05, + "loss": 0.8573, + "step": 25640 + }, + { + "epoch": 2.0018730976352144, + "grad_norm": 5.383617877960205, + "learning_rate": 3.0100317922240996e-05, + "loss": 0.6595, + "step": 25650 + }, + { + "epoch": 2.0026535549832203, + "grad_norm": 0.22013996541500092, + "learning_rate": 3.0058671848630338e-05, + "loss": 0.0558, + "step": 25660 + }, + { + "epoch": 2.003434012331226, + "grad_norm": 3.6882798326587363e-07, + "learning_rate": 3.0017042219764e-05, + "loss": 0.108, + "step": 25670 + }, + { + "epoch": 2.004214469679232, + "grad_norm": 0.01585361920297146, + "learning_rate": 2.9975429069972166e-05, + "loss": 1.2641, + "step": 25680 + }, + { + "epoch": 2.004994927027238, + "grad_norm": 0.001181729487143457, + "learning_rate": 2.993383243357134e-05, + "loss": 0.2464, + "step": 25690 + }, + { + "epoch": 2.005775384375244, + "grad_norm": 0.9797963500022888, + "learning_rate": 2.9892252344864467e-05, + "loss": 0.3328, + "step": 25700 + }, + { + "epoch": 2.00655584172325, + "grad_norm": 8.205447556974832e-06, + "learning_rate": 2.9850688838140862e-05, + "loss": 0.8833, + "step": 25710 + }, + { + "epoch": 2.0073362990712558, + "grad_norm": 0.13399428129196167, + "learning_rate": 2.98091419476761e-05, + "loss": 0.0313, + "step": 25720 + }, + { + "epoch": 2.0081167564192617, + "grad_norm": 4.621370508495204e-10, + "learning_rate": 2.9767611707732107e-05, + "loss": 0.1964, + "step": 25730 + }, + { + "epoch": 2.0088972137672676, + "grad_norm": 7.719918926341052e-07, + "learning_rate": 2.972609815255706e-05, + "loss": 0.2324, + "step": 25740 + }, + { + "epoch": 2.0096776711152735, + "grad_norm": 0.004049698356539011, + "learning_rate": 2.9684601316385396e-05, + "loss": 0.6326, + "step": 25750 + }, + { + "epoch": 2.0104581284632794, + "grad_norm": 3.357577323913574, + "learning_rate": 2.9643121233437715e-05, + "loss": 1.0279, + "step": 25760 + }, + { + "epoch": 2.0112385858112853, + "grad_norm": 3.8555106129933847e-07, + "learning_rate": 2.9601657937920857e-05, + "loss": 2.5202, + "step": 25770 + }, + { + "epoch": 2.0120190431592913, + "grad_norm": 1.3730878833939641e-07, + "learning_rate": 2.956021146402781e-05, + "loss": 0.034, + "step": 25780 + }, + { + "epoch": 2.012799500507297, + "grad_norm": 1.7080418501791428e-07, + "learning_rate": 2.9518781845937626e-05, + "loss": 1.3658, + "step": 25790 + }, + { + "epoch": 2.013579957855303, + "grad_norm": 0.004128089174628258, + "learning_rate": 2.9477369117815546e-05, + "loss": 0.0758, + "step": 25800 + }, + { + "epoch": 2.014360415203309, + "grad_norm": 4.3436452301648387e-07, + "learning_rate": 2.9435973313812815e-05, + "loss": 0.1982, + "step": 25810 + }, + { + "epoch": 2.015140872551315, + "grad_norm": 37.810096740722656, + "learning_rate": 2.939459446806679e-05, + "loss": 0.0917, + "step": 25820 + }, + { + "epoch": 2.015921329899321, + "grad_norm": 0.26041847467422485, + "learning_rate": 2.9353232614700754e-05, + "loss": 0.5103, + "step": 25830 + }, + { + "epoch": 2.0167017872473267, + "grad_norm": 0.0001422716595698148, + "learning_rate": 2.9311887787824068e-05, + "loss": 0.1755, + "step": 25840 + }, + { + "epoch": 2.0174822445953327, + "grad_norm": 0.05642390996217728, + "learning_rate": 2.927056002153196e-05, + "loss": 0.0007, + "step": 25850 + }, + { + "epoch": 2.018262701943339, + "grad_norm": 7.756619240135987e-13, + "learning_rate": 2.9229249349905684e-05, + "loss": 0.0191, + "step": 25860 + }, + { + "epoch": 2.019043159291345, + "grad_norm": 103.0235595703125, + "learning_rate": 2.918795580701233e-05, + "loss": 1.3986, + "step": 25870 + }, + { + "epoch": 2.019823616639351, + "grad_norm": 3.3781661987304688, + "learning_rate": 2.9146679426904876e-05, + "loss": 0.7521, + "step": 25880 + }, + { + "epoch": 2.0206040739873568, + "grad_norm": 0.009529034607112408, + "learning_rate": 2.9105420243622182e-05, + "loss": 0.5684, + "step": 25890 + }, + { + "epoch": 2.0213845313353627, + "grad_norm": 9.872029860247267e-10, + "learning_rate": 2.906417829118886e-05, + "loss": 0.0163, + "step": 25900 + }, + { + "epoch": 2.0221649886833686, + "grad_norm": 0.00018539708980824798, + "learning_rate": 2.9022953603615395e-05, + "loss": 0.0254, + "step": 25910 + }, + { + "epoch": 2.0229454460313745, + "grad_norm": 0.01795944571495056, + "learning_rate": 2.8981746214897944e-05, + "loss": 0.114, + "step": 25920 + }, + { + "epoch": 2.0237259033793804, + "grad_norm": 0.056700363755226135, + "learning_rate": 2.8940556159018484e-05, + "loss": 0.3087, + "step": 25930 + }, + { + "epoch": 2.0245063607273863, + "grad_norm": 0.00224350206553936, + "learning_rate": 2.889938346994463e-05, + "loss": 0.0146, + "step": 25940 + }, + { + "epoch": 2.0252868180753922, + "grad_norm": 9.900408744812012, + "learning_rate": 2.885822818162971e-05, + "loss": 0.1125, + "step": 25950 + }, + { + "epoch": 2.026067275423398, + "grad_norm": 4.4600043296813965, + "learning_rate": 2.8817090328012704e-05, + "loss": 0.1713, + "step": 25960 + }, + { + "epoch": 2.026847732771404, + "grad_norm": 0.3353830575942993, + "learning_rate": 2.877596994301823e-05, + "loss": 1.3635, + "step": 25970 + }, + { + "epoch": 2.02762819011941, + "grad_norm": 108.07823944091797, + "learning_rate": 2.8734867060556447e-05, + "loss": 1.5941, + "step": 25980 + }, + { + "epoch": 2.028408647467416, + "grad_norm": 134.56443786621094, + "learning_rate": 2.8693781714523104e-05, + "loss": 0.3383, + "step": 25990 + }, + { + "epoch": 2.029189104815422, + "grad_norm": 0.419847309589386, + "learning_rate": 2.865271393879953e-05, + "loss": 0.0753, + "step": 26000 + }, + { + "epoch": 2.0299695621634277, + "grad_norm": 0.0037621513474732637, + "learning_rate": 2.8611663767252494e-05, + "loss": 1.4775, + "step": 26010 + }, + { + "epoch": 2.0307500195114336, + "grad_norm": 0.00020689735538326204, + "learning_rate": 2.8570631233734292e-05, + "loss": 0.2159, + "step": 26020 + }, + { + "epoch": 2.0315304768594395, + "grad_norm": 8.361428626812994e-05, + "learning_rate": 2.852961637208268e-05, + "loss": 0.4753, + "step": 26030 + }, + { + "epoch": 2.0323109342074455, + "grad_norm": 9.77057768025702e-10, + "learning_rate": 2.848861921612083e-05, + "loss": 0.1254, + "step": 26040 + }, + { + "epoch": 2.0330913915554514, + "grad_norm": 7.082812526586213e-13, + "learning_rate": 2.844763979965729e-05, + "loss": 0.6117, + "step": 26050 + }, + { + "epoch": 2.0338718489034573, + "grad_norm": 13.338913917541504, + "learning_rate": 2.8406678156486034e-05, + "loss": 0.0186, + "step": 26060 + }, + { + "epoch": 2.034652306251463, + "grad_norm": 2.314552068710327, + "learning_rate": 2.836573432038629e-05, + "loss": 1.9713, + "step": 26070 + }, + { + "epoch": 2.035432763599469, + "grad_norm": 111.98283386230469, + "learning_rate": 2.832480832512271e-05, + "loss": 0.8219, + "step": 26080 + }, + { + "epoch": 2.036213220947475, + "grad_norm": 37.844078063964844, + "learning_rate": 2.8283900204445136e-05, + "loss": 0.4319, + "step": 26090 + }, + { + "epoch": 2.036993678295481, + "grad_norm": 14.029542922973633, + "learning_rate": 2.824300999208872e-05, + "loss": 0.0097, + "step": 26100 + }, + { + "epoch": 2.0377741356434873, + "grad_norm": 0.00010180075332755223, + "learning_rate": 2.820213772177388e-05, + "loss": 0.0203, + "step": 26110 + }, + { + "epoch": 2.038554592991493, + "grad_norm": 1.2391136806400027e-05, + "learning_rate": 2.8161283427206143e-05, + "loss": 0.9182, + "step": 26120 + }, + { + "epoch": 2.039335050339499, + "grad_norm": 9.387850877828896e-05, + "learning_rate": 2.81204471420763e-05, + "loss": 0.0724, + "step": 26130 + }, + { + "epoch": 2.040115507687505, + "grad_norm": 0.15147614479064941, + "learning_rate": 2.8079628900060233e-05, + "loss": 0.3784, + "step": 26140 + }, + { + "epoch": 2.040895965035511, + "grad_norm": 129.17037963867188, + "learning_rate": 2.8038828734818995e-05, + "loss": 0.9068, + "step": 26150 + }, + { + "epoch": 2.041676422383517, + "grad_norm": 0.06553638726472855, + "learning_rate": 2.7998046679998668e-05, + "loss": 0.3397, + "step": 26160 + }, + { + "epoch": 2.042456879731523, + "grad_norm": 0.05100405961275101, + "learning_rate": 2.795728276923047e-05, + "loss": 0.0717, + "step": 26170 + }, + { + "epoch": 2.0432373370795287, + "grad_norm": 1.2694729889517475e-07, + "learning_rate": 2.7916537036130584e-05, + "loss": 0.5827, + "step": 26180 + }, + { + "epoch": 2.0440177944275346, + "grad_norm": 1.5002992768131662e-06, + "learning_rate": 2.7875809514300272e-05, + "loss": 0.4723, + "step": 26190 + }, + { + "epoch": 2.0447982517755405, + "grad_norm": 3.0192535632522777e-05, + "learning_rate": 2.783510023732575e-05, + "loss": 0.0862, + "step": 26200 + }, + { + "epoch": 2.0455787091235464, + "grad_norm": 0.007100996095687151, + "learning_rate": 2.7794409238778158e-05, + "loss": 0.0, + "step": 26210 + }, + { + "epoch": 2.0463591664715524, + "grad_norm": 0.0008602620218880475, + "learning_rate": 2.7753736552213616e-05, + "loss": 0.9352, + "step": 26220 + }, + { + "epoch": 2.0471396238195583, + "grad_norm": 3.4370423174223674e-11, + "learning_rate": 2.771308221117309e-05, + "loss": 0.3896, + "step": 26230 + }, + { + "epoch": 2.047920081167564, + "grad_norm": 5.0753031246131286e-05, + "learning_rate": 2.7672446249182472e-05, + "loss": 0.1046, + "step": 26240 + }, + { + "epoch": 2.04870053851557, + "grad_norm": 0.2621128261089325, + "learning_rate": 2.763182869975244e-05, + "loss": 1.336, + "step": 26250 + }, + { + "epoch": 2.049480995863576, + "grad_norm": 0.0025993799790740013, + "learning_rate": 2.7591229596378528e-05, + "loss": 0.5698, + "step": 26260 + }, + { + "epoch": 2.050261453211582, + "grad_norm": 0.036587078124284744, + "learning_rate": 2.755064897254105e-05, + "loss": 0.0195, + "step": 26270 + }, + { + "epoch": 2.051041910559588, + "grad_norm": 3.440267093424154e-08, + "learning_rate": 2.7510086861705094e-05, + "loss": 0.0057, + "step": 26280 + }, + { + "epoch": 2.0518223679075938, + "grad_norm": 2.941135608125478e-05, + "learning_rate": 2.7469543297320456e-05, + "loss": 0.4777, + "step": 26290 + }, + { + "epoch": 2.0526028252555997, + "grad_norm": 0.0801917314529419, + "learning_rate": 2.7429018312821614e-05, + "loss": 0.0012, + "step": 26300 + }, + { + "epoch": 2.0533832826036056, + "grad_norm": 1.0764252920125728e-06, + "learning_rate": 2.7388511941627805e-05, + "loss": 0.2024, + "step": 26310 + }, + { + "epoch": 2.0541637399516115, + "grad_norm": 42.37961959838867, + "learning_rate": 2.7348024217142827e-05, + "loss": 2.9864, + "step": 26320 + }, + { + "epoch": 2.0549441972996174, + "grad_norm": 3.61101206181047e-06, + "learning_rate": 2.730755517275516e-05, + "loss": 0.0003, + "step": 26330 + }, + { + "epoch": 2.0557246546476233, + "grad_norm": 0.036643143743276596, + "learning_rate": 2.7267104841837863e-05, + "loss": 0.0987, + "step": 26340 + }, + { + "epoch": 2.0565051119956292, + "grad_norm": 0.000123582300147973, + "learning_rate": 2.722667325774858e-05, + "loss": 0.1032, + "step": 26350 + }, + { + "epoch": 2.057285569343635, + "grad_norm": 8.651711141283158e-06, + "learning_rate": 2.7186260453829443e-05, + "loss": 0.1192, + "step": 26360 + }, + { + "epoch": 2.0580660266916415, + "grad_norm": 0.04431761056184769, + "learning_rate": 2.714586646340716e-05, + "loss": 0.0038, + "step": 26370 + }, + { + "epoch": 2.0588464840396474, + "grad_norm": 2.8218696910414787e-12, + "learning_rate": 2.710549131979288e-05, + "loss": 0.0032, + "step": 26380 + }, + { + "epoch": 2.0596269413876533, + "grad_norm": 0.004620996303856373, + "learning_rate": 2.7065135056282204e-05, + "loss": 0.0001, + "step": 26390 + }, + { + "epoch": 2.0604073987356593, + "grad_norm": 0.0009973037522286177, + "learning_rate": 2.7024797706155204e-05, + "loss": 0.1086, + "step": 26400 + }, + { + "epoch": 2.061187856083665, + "grad_norm": 0.00033386031282134354, + "learning_rate": 2.6984479302676336e-05, + "loss": 0.1944, + "step": 26410 + }, + { + "epoch": 2.061968313431671, + "grad_norm": 3.535060599801909e-08, + "learning_rate": 2.6944179879094443e-05, + "loss": 0.191, + "step": 26420 + }, + { + "epoch": 2.062748770779677, + "grad_norm": 0.010547350160777569, + "learning_rate": 2.6903899468642668e-05, + "loss": 0.0016, + "step": 26430 + }, + { + "epoch": 2.063529228127683, + "grad_norm": 3.213576555252075, + "learning_rate": 2.686363810453856e-05, + "loss": 1.5271, + "step": 26440 + }, + { + "epoch": 2.064309685475689, + "grad_norm": 69.19461059570312, + "learning_rate": 2.682339581998386e-05, + "loss": 1.4639, + "step": 26450 + }, + { + "epoch": 2.0650901428236947, + "grad_norm": 1.0486480306326484e-07, + "learning_rate": 2.6783172648164666e-05, + "loss": 0.0147, + "step": 26460 + }, + { + "epoch": 2.0658706001717007, + "grad_norm": 13.421584129333496, + "learning_rate": 2.6742968622251264e-05, + "loss": 1.7384, + "step": 26470 + }, + { + "epoch": 2.0666510575197066, + "grad_norm": 0.00023604616580996662, + "learning_rate": 2.6702783775398132e-05, + "loss": 0.0, + "step": 26480 + }, + { + "epoch": 2.0674315148677125, + "grad_norm": 4.015465043805122e-12, + "learning_rate": 2.6662618140743988e-05, + "loss": 0.692, + "step": 26490 + }, + { + "epoch": 2.0682119722157184, + "grad_norm": 2.1036534195379222e-14, + "learning_rate": 2.6622471751411678e-05, + "loss": 0.0897, + "step": 26500 + }, + { + "epoch": 2.0689924295637243, + "grad_norm": 1.178400907519972e-05, + "learning_rate": 2.65823446405082e-05, + "loss": 0.0004, + "step": 26510 + }, + { + "epoch": 2.0697728869117302, + "grad_norm": 0.055531445890665054, + "learning_rate": 2.6542236841124594e-05, + "loss": 0.012, + "step": 26520 + }, + { + "epoch": 2.070553344259736, + "grad_norm": 7.828985530977661e-07, + "learning_rate": 2.6502148386336057e-05, + "loss": 1.0484, + "step": 26530 + }, + { + "epoch": 2.071333801607742, + "grad_norm": 1.7514663568363176e-06, + "learning_rate": 2.646207930920175e-05, + "loss": 0.1406, + "step": 26540 + }, + { + "epoch": 2.072114258955748, + "grad_norm": 0.8897789120674133, + "learning_rate": 2.6422029642764933e-05, + "loss": 0.1969, + "step": 26550 + }, + { + "epoch": 2.072894716303754, + "grad_norm": 0.18779289722442627, + "learning_rate": 2.6381999420052783e-05, + "loss": 0.1714, + "step": 26560 + }, + { + "epoch": 2.07367517365176, + "grad_norm": 118.19866180419922, + "learning_rate": 2.6341988674076497e-05, + "loss": 2.3051, + "step": 26570 + }, + { + "epoch": 2.0744556309997657, + "grad_norm": 7.315911293029785, + "learning_rate": 2.6301997437831217e-05, + "loss": 0.0031, + "step": 26580 + }, + { + "epoch": 2.0752360883477716, + "grad_norm": 1.4385477697942406e-06, + "learning_rate": 2.6262025744295927e-05, + "loss": 0.6723, + "step": 26590 + }, + { + "epoch": 2.0760165456957775, + "grad_norm": 0.00010235334048047662, + "learning_rate": 2.6222073626433584e-05, + "loss": 0.524, + "step": 26600 + }, + { + "epoch": 2.0767970030437835, + "grad_norm": 0.0751703754067421, + "learning_rate": 2.6182141117190918e-05, + "loss": 0.0055, + "step": 26610 + }, + { + "epoch": 2.0775774603917894, + "grad_norm": 0.9944970607757568, + "learning_rate": 2.6142228249498578e-05, + "loss": 0.8321, + "step": 26620 + }, + { + "epoch": 2.0783579177397957, + "grad_norm": 1.3824878931045532, + "learning_rate": 2.610233505627091e-05, + "loss": 0.0864, + "step": 26630 + }, + { + "epoch": 2.0791383750878016, + "grad_norm": 4.768913128977426e-13, + "learning_rate": 2.606246157040613e-05, + "loss": 0.0231, + "step": 26640 + }, + { + "epoch": 2.0799188324358076, + "grad_norm": 2.846596847128069e-10, + "learning_rate": 2.602260782478615e-05, + "loss": 0.0967, + "step": 26650 + }, + { + "epoch": 2.0806992897838135, + "grad_norm": 5.813966708956286e-06, + "learning_rate": 2.5982773852276644e-05, + "loss": 0.0023, + "step": 26660 + }, + { + "epoch": 2.0814797471318194, + "grad_norm": 1.337128741063509e-09, + "learning_rate": 2.594295968572693e-05, + "loss": 0.0004, + "step": 26670 + }, + { + "epoch": 2.0822602044798253, + "grad_norm": 3.281140470434707e-09, + "learning_rate": 2.5903165357970005e-05, + "loss": 0.1601, + "step": 26680 + }, + { + "epoch": 2.083040661827831, + "grad_norm": 7.578184158774093e-05, + "learning_rate": 2.586339090182254e-05, + "loss": 0.1502, + "step": 26690 + }, + { + "epoch": 2.083821119175837, + "grad_norm": 0.00915143545717001, + "learning_rate": 2.5823636350084775e-05, + "loss": 1.3793, + "step": 26700 + }, + { + "epoch": 2.084601576523843, + "grad_norm": 0.01805698312819004, + "learning_rate": 2.5783901735540584e-05, + "loss": 0.0301, + "step": 26710 + }, + { + "epoch": 2.085382033871849, + "grad_norm": 1.9149654084671397e-10, + "learning_rate": 2.5744187090957316e-05, + "loss": 0.6883, + "step": 26720 + }, + { + "epoch": 2.086162491219855, + "grad_norm": 1.6224537375819637e-06, + "learning_rate": 2.570449244908598e-05, + "loss": 0.7574, + "step": 26730 + }, + { + "epoch": 2.086942948567861, + "grad_norm": 212.9560546875, + "learning_rate": 2.566481784266097e-05, + "loss": 1.8576, + "step": 26740 + }, + { + "epoch": 2.0877234059158667, + "grad_norm": 0.00015921909653116018, + "learning_rate": 2.5625163304400245e-05, + "loss": 1.7303, + "step": 26750 + }, + { + "epoch": 2.0885038632638726, + "grad_norm": 0.013452545739710331, + "learning_rate": 2.558552886700512e-05, + "loss": 0.0011, + "step": 26760 + }, + { + "epoch": 2.0892843206118785, + "grad_norm": 8.070929652603809e-06, + "learning_rate": 2.5545914563160443e-05, + "loss": 1.574, + "step": 26770 + }, + { + "epoch": 2.0900647779598844, + "grad_norm": 0.004110492300242186, + "learning_rate": 2.5506320425534375e-05, + "loss": 0.5442, + "step": 26780 + }, + { + "epoch": 2.0908452353078903, + "grad_norm": 8.283843611067709e-10, + "learning_rate": 2.5466746486778458e-05, + "loss": 0.3234, + "step": 26790 + }, + { + "epoch": 2.0916256926558963, + "grad_norm": 6.580946454448622e-09, + "learning_rate": 2.542719277952762e-05, + "loss": 0.0004, + "step": 26800 + }, + { + "epoch": 2.092406150003902, + "grad_norm": 0.0010235244408249855, + "learning_rate": 2.5387659336400072e-05, + "loss": 0.0153, + "step": 26810 + }, + { + "epoch": 2.093186607351908, + "grad_norm": 0.24290083348751068, + "learning_rate": 2.5348146189997345e-05, + "loss": 1.247, + "step": 26820 + }, + { + "epoch": 2.093967064699914, + "grad_norm": 4.409335451782681e-05, + "learning_rate": 2.530865337290418e-05, + "loss": 0.1006, + "step": 26830 + }, + { + "epoch": 2.09474752204792, + "grad_norm": 9.038549423217773, + "learning_rate": 2.52691809176886e-05, + "loss": 0.144, + "step": 26840 + }, + { + "epoch": 2.095527979395926, + "grad_norm": 1.1245354413986206, + "learning_rate": 2.5229728856901796e-05, + "loss": 0.3469, + "step": 26850 + }, + { + "epoch": 2.0963084367439317, + "grad_norm": 2.387321673680276e-09, + "learning_rate": 2.5190297223078195e-05, + "loss": 0.8331, + "step": 26860 + }, + { + "epoch": 2.0970888940919377, + "grad_norm": 2.5276014614661335e-10, + "learning_rate": 2.5150886048735313e-05, + "loss": 0.0001, + "step": 26870 + }, + { + "epoch": 2.097869351439944, + "grad_norm": 0.02269677072763443, + "learning_rate": 2.5111495366373843e-05, + "loss": 0.2311, + "step": 26880 + }, + { + "epoch": 2.09864980878795, + "grad_norm": 1.765695003541623e-07, + "learning_rate": 2.507212520847759e-05, + "loss": 0.0448, + "step": 26890 + }, + { + "epoch": 2.099430266135956, + "grad_norm": 203.69290161132812, + "learning_rate": 2.5032775607513358e-05, + "loss": 1.3521, + "step": 26900 + }, + { + "epoch": 2.1002107234839618, + "grad_norm": 3.4434066037647426e-05, + "learning_rate": 2.4993446595931097e-05, + "loss": 1.4841, + "step": 26910 + }, + { + "epoch": 2.1009911808319677, + "grad_norm": 0.0002487737510818988, + "learning_rate": 2.4954138206163685e-05, + "loss": 0.0001, + "step": 26920 + }, + { + "epoch": 2.1017716381799736, + "grad_norm": 2.313194751739502, + "learning_rate": 2.4914850470627078e-05, + "loss": 0.5814, + "step": 26930 + }, + { + "epoch": 2.1025520955279795, + "grad_norm": 4.538455300462374e-08, + "learning_rate": 2.4875583421720123e-05, + "loss": 0.5102, + "step": 26940 + }, + { + "epoch": 2.1033325528759854, + "grad_norm": 14.016276359558105, + "learning_rate": 2.483633709182466e-05, + "loss": 0.074, + "step": 26950 + }, + { + "epoch": 2.1041130102239913, + "grad_norm": 4.9571683563565117e-11, + "learning_rate": 2.479711151330545e-05, + "loss": 0.0003, + "step": 26960 + }, + { + "epoch": 2.1048934675719972, + "grad_norm": 0.28219160437583923, + "learning_rate": 2.4757906718510072e-05, + "loss": 0.1947, + "step": 26970 + }, + { + "epoch": 2.105673924920003, + "grad_norm": 2.000363826751709, + "learning_rate": 2.4718722739769057e-05, + "loss": 0.548, + "step": 26980 + }, + { + "epoch": 2.106454382268009, + "grad_norm": 3.634791079232258e-10, + "learning_rate": 2.467955960939569e-05, + "loss": 1.4461, + "step": 26990 + }, + { + "epoch": 2.107234839616015, + "grad_norm": 0.00023928057635203004, + "learning_rate": 2.464041735968613e-05, + "loss": 0.0638, + "step": 27000 + }, + { + "epoch": 2.108015296964021, + "grad_norm": 1.9216714330916318e-10, + "learning_rate": 2.4601296022919245e-05, + "loss": 0.1088, + "step": 27010 + }, + { + "epoch": 2.108795754312027, + "grad_norm": 159.6993865966797, + "learning_rate": 2.456219563135674e-05, + "loss": 2.9701, + "step": 27020 + }, + { + "epoch": 2.1095762116600327, + "grad_norm": 184.36904907226562, + "learning_rate": 2.4523116217242958e-05, + "loss": 1.8129, + "step": 27030 + }, + { + "epoch": 2.1103566690080386, + "grad_norm": 1.1307780883526575e-12, + "learning_rate": 2.448405781280502e-05, + "loss": 0.4697, + "step": 27040 + }, + { + "epoch": 2.1111371263560446, + "grad_norm": 10.915018081665039, + "learning_rate": 2.444502045025268e-05, + "loss": 0.0177, + "step": 27050 + }, + { + "epoch": 2.1119175837040505, + "grad_norm": 43.65808868408203, + "learning_rate": 2.4406004161778374e-05, + "loss": 0.1361, + "step": 27060 + }, + { + "epoch": 2.1126980410520564, + "grad_norm": 3.3598575592041016, + "learning_rate": 2.4367008979557114e-05, + "loss": 0.3633, + "step": 27070 + }, + { + "epoch": 2.1134784984000623, + "grad_norm": 1.2000231031095154e-08, + "learning_rate": 2.4328034935746507e-05, + "loss": 0.3738, + "step": 27080 + }, + { + "epoch": 2.114258955748068, + "grad_norm": 190.1020965576172, + "learning_rate": 2.428908206248679e-05, + "loss": 0.9442, + "step": 27090 + }, + { + "epoch": 2.115039413096074, + "grad_norm": 8.433188438415527, + "learning_rate": 2.425015039190066e-05, + "loss": 0.1653, + "step": 27100 + }, + { + "epoch": 2.11581987044408, + "grad_norm": 141.552001953125, + "learning_rate": 2.421123995609339e-05, + "loss": 0.3785, + "step": 27110 + }, + { + "epoch": 2.116600327792086, + "grad_norm": 10.547119140625, + "learning_rate": 2.4172350787152714e-05, + "loss": 0.3827, + "step": 27120 + }, + { + "epoch": 2.1173807851400923, + "grad_norm": 9.30328369140625, + "learning_rate": 2.4133482917148864e-05, + "loss": 0.0089, + "step": 27130 + }, + { + "epoch": 2.1181612424880982, + "grad_norm": 0.009139536879956722, + "learning_rate": 2.4094636378134434e-05, + "loss": 0.0024, + "step": 27140 + }, + { + "epoch": 2.118941699836104, + "grad_norm": 0.15115059912204742, + "learning_rate": 2.4055811202144505e-05, + "loss": 0.0129, + "step": 27150 + }, + { + "epoch": 2.11972215718411, + "grad_norm": 162.1376190185547, + "learning_rate": 2.4017007421196508e-05, + "loss": 2.3596, + "step": 27160 + }, + { + "epoch": 2.120502614532116, + "grad_norm": 0.002663856605067849, + "learning_rate": 2.397822506729019e-05, + "loss": 3.0959, + "step": 27170 + }, + { + "epoch": 2.121283071880122, + "grad_norm": 5.7644479056762066e-06, + "learning_rate": 2.39394641724077e-05, + "loss": 0.0004, + "step": 27180 + }, + { + "epoch": 2.122063529228128, + "grad_norm": 4.785835335496813e-05, + "learning_rate": 2.390072476851345e-05, + "loss": 0.0668, + "step": 27190 + }, + { + "epoch": 2.1228439865761337, + "grad_norm": 1.4409127970793634e-06, + "learning_rate": 2.3862006887554166e-05, + "loss": 0.3021, + "step": 27200 + }, + { + "epoch": 2.1236244439241396, + "grad_norm": 2.1471103082149057e-06, + "learning_rate": 2.382331056145875e-05, + "loss": 0.7031, + "step": 27210 + }, + { + "epoch": 2.1244049012721455, + "grad_norm": 4.70070488610419e-14, + "learning_rate": 2.3784635822138424e-05, + "loss": 0.4572, + "step": 27220 + }, + { + "epoch": 2.1251853586201515, + "grad_norm": 178.70465087890625, + "learning_rate": 2.3745982701486514e-05, + "loss": 1.5983, + "step": 27230 + }, + { + "epoch": 2.1259658159681574, + "grad_norm": 8.373917381732099e-08, + "learning_rate": 2.3707351231378612e-05, + "loss": 0.0597, + "step": 27240 + }, + { + "epoch": 2.1267462733161633, + "grad_norm": 0.4737749397754669, + "learning_rate": 2.3668741443672356e-05, + "loss": 1.2344, + "step": 27250 + }, + { + "epoch": 2.127526730664169, + "grad_norm": 9.960779425455257e-05, + "learning_rate": 2.3630153370207582e-05, + "loss": 0.6203, + "step": 27260 + }, + { + "epoch": 2.128307188012175, + "grad_norm": 1.7590502920938889e-06, + "learning_rate": 2.3591587042806213e-05, + "loss": 0.0193, + "step": 27270 + }, + { + "epoch": 2.129087645360181, + "grad_norm": 1.4878227122538945e-11, + "learning_rate": 2.3553042493272176e-05, + "loss": 0.667, + "step": 27280 + }, + { + "epoch": 2.129868102708187, + "grad_norm": 0.38585662841796875, + "learning_rate": 2.351451975339153e-05, + "loss": 0.121, + "step": 27290 + }, + { + "epoch": 2.130648560056193, + "grad_norm": 2.600064911637001e-12, + "learning_rate": 2.3476018854932247e-05, + "loss": 1.1299, + "step": 27300 + }, + { + "epoch": 2.1314290174041988, + "grad_norm": 12.171748161315918, + "learning_rate": 2.3437539829644385e-05, + "loss": 0.0135, + "step": 27310 + }, + { + "epoch": 2.1322094747522047, + "grad_norm": 0.0016079798806458712, + "learning_rate": 2.3399082709259886e-05, + "loss": 0.0188, + "step": 27320 + }, + { + "epoch": 2.1329899321002106, + "grad_norm": 0.00035869970452040434, + "learning_rate": 2.336064752549269e-05, + "loss": 0.001, + "step": 27330 + }, + { + "epoch": 2.1337703894482165, + "grad_norm": 7.7371763992778995e-16, + "learning_rate": 2.3322234310038587e-05, + "loss": 0.8011, + "step": 27340 + }, + { + "epoch": 2.1345508467962224, + "grad_norm": 5.361710282159038e-05, + "learning_rate": 2.3283843094575298e-05, + "loss": 0.4662, + "step": 27350 + }, + { + "epoch": 2.1353313041442283, + "grad_norm": 0.021017905324697495, + "learning_rate": 2.3245473910762404e-05, + "loss": 0.1137, + "step": 27360 + }, + { + "epoch": 2.1361117614922343, + "grad_norm": 1.9953435526076646e-07, + "learning_rate": 2.320712679024127e-05, + "loss": 0.0039, + "step": 27370 + }, + { + "epoch": 2.1368922188402406, + "grad_norm": 0.027051346376538277, + "learning_rate": 2.3168801764635117e-05, + "loss": 0.0001, + "step": 27380 + }, + { + "epoch": 2.137672676188246, + "grad_norm": 2.500294936369496e-11, + "learning_rate": 2.31304988655489e-05, + "loss": 0.1205, + "step": 27390 + }, + { + "epoch": 2.1384531335362524, + "grad_norm": 4.070713988113539e-09, + "learning_rate": 2.3092218124569376e-05, + "loss": 0.2147, + "step": 27400 + }, + { + "epoch": 2.1392335908842584, + "grad_norm": 0.02425590716302395, + "learning_rate": 2.3053959573264978e-05, + "loss": 0.0, + "step": 27410 + }, + { + "epoch": 2.1400140482322643, + "grad_norm": 5.908094846684975e-11, + "learning_rate": 2.3015723243185877e-05, + "loss": 0.0009, + "step": 27420 + }, + { + "epoch": 2.14079450558027, + "grad_norm": 0.1725311577320099, + "learning_rate": 2.2977509165863907e-05, + "loss": 1.8829, + "step": 27430 + }, + { + "epoch": 2.141574962928276, + "grad_norm": 8.780289499554783e-05, + "learning_rate": 2.293931737281258e-05, + "loss": 0.0569, + "step": 27440 + }, + { + "epoch": 2.142355420276282, + "grad_norm": 3.0101862648734823e-05, + "learning_rate": 2.290114789552698e-05, + "loss": 1.0659, + "step": 27450 + }, + { + "epoch": 2.143135877624288, + "grad_norm": 180.4490966796875, + "learning_rate": 2.2863000765483788e-05, + "loss": 0.3342, + "step": 27460 + }, + { + "epoch": 2.143916334972294, + "grad_norm": 63.52663040161133, + "learning_rate": 2.2824876014141327e-05, + "loss": 0.0746, + "step": 27470 + }, + { + "epoch": 2.1446967923202997, + "grad_norm": 177.1395263671875, + "learning_rate": 2.2786773672939372e-05, + "loss": 0.4897, + "step": 27480 + }, + { + "epoch": 2.1454772496683057, + "grad_norm": 0.003967598080635071, + "learning_rate": 2.2748693773299284e-05, + "loss": 0.0958, + "step": 27490 + }, + { + "epoch": 2.1462577070163116, + "grad_norm": 4.518438527304574e-18, + "learning_rate": 2.2710636346623898e-05, + "loss": 0.0022, + "step": 27500 + }, + { + "epoch": 2.1470381643643175, + "grad_norm": 6.903857752149634e-08, + "learning_rate": 2.2672601424297536e-05, + "loss": 0.0008, + "step": 27510 + }, + { + "epoch": 2.1478186217123234, + "grad_norm": 1.3903465878595256e-10, + "learning_rate": 2.26345890376859e-05, + "loss": 0.7075, + "step": 27520 + }, + { + "epoch": 2.1485990790603293, + "grad_norm": 8.995040972378625e-15, + "learning_rate": 2.259659921813619e-05, + "loss": 0.007, + "step": 27530 + }, + { + "epoch": 2.1493795364083352, + "grad_norm": 5.556034011533484e-05, + "learning_rate": 2.2558631996976914e-05, + "loss": 0.0384, + "step": 27540 + }, + { + "epoch": 2.150159993756341, + "grad_norm": 5.721829893445829e-06, + "learning_rate": 2.2520687405518027e-05, + "loss": 0.564, + "step": 27550 + }, + { + "epoch": 2.150940451104347, + "grad_norm": 0.055675067007541656, + "learning_rate": 2.2482765475050733e-05, + "loss": 0.0001, + "step": 27560 + }, + { + "epoch": 2.151720908452353, + "grad_norm": 1.6937552572926506e-05, + "learning_rate": 2.2444866236847623e-05, + "loss": 0.0028, + "step": 27570 + }, + { + "epoch": 2.152501365800359, + "grad_norm": 5.930499202833062e-09, + "learning_rate": 2.2406989722162558e-05, + "loss": 2.7458, + "step": 27580 + }, + { + "epoch": 2.153281823148365, + "grad_norm": 6.177411364660657e-07, + "learning_rate": 2.2369135962230626e-05, + "loss": 0.4403, + "step": 27590 + }, + { + "epoch": 2.1540622804963707, + "grad_norm": 0.04021603614091873, + "learning_rate": 2.233130498826819e-05, + "loss": 1.3678, + "step": 27600 + }, + { + "epoch": 2.1548427378443766, + "grad_norm": 0.00041691059595905244, + "learning_rate": 2.2293496831472788e-05, + "loss": 1.461, + "step": 27610 + }, + { + "epoch": 2.1556231951923825, + "grad_norm": 27.920442581176758, + "learning_rate": 2.2255711523023188e-05, + "loss": 0.0124, + "step": 27620 + }, + { + "epoch": 2.1564036525403885, + "grad_norm": 1.32997045717208e-19, + "learning_rate": 2.221794909407925e-05, + "loss": 0.0023, + "step": 27630 + }, + { + "epoch": 2.1571841098883944, + "grad_norm": 2.735917405516375e-06, + "learning_rate": 2.2180209575782036e-05, + "loss": 0.1073, + "step": 27640 + }, + { + "epoch": 2.1579645672364007, + "grad_norm": 8.972650178407093e-09, + "learning_rate": 2.2142492999253657e-05, + "loss": 0.0854, + "step": 27650 + }, + { + "epoch": 2.1587450245844066, + "grad_norm": 1.0043543170468183e-06, + "learning_rate": 2.2104799395597335e-05, + "loss": 0.7073, + "step": 27660 + }, + { + "epoch": 2.1595254819324126, + "grad_norm": 87.87557983398438, + "learning_rate": 2.206712879589737e-05, + "loss": 0.0313, + "step": 27670 + }, + { + "epoch": 2.1603059392804185, + "grad_norm": 8.924705505371094, + "learning_rate": 2.2029481231219023e-05, + "loss": 0.3171, + "step": 27680 + }, + { + "epoch": 2.1610863966284244, + "grad_norm": 200.0184326171875, + "learning_rate": 2.1991856732608647e-05, + "loss": 1.6237, + "step": 27690 + }, + { + "epoch": 2.1618668539764303, + "grad_norm": 0.008014976046979427, + "learning_rate": 2.195425533109347e-05, + "loss": 0.3416, + "step": 27700 + }, + { + "epoch": 2.162647311324436, + "grad_norm": 9.913729392962978e-09, + "learning_rate": 2.1916677057681785e-05, + "loss": 0.0, + "step": 27710 + }, + { + "epoch": 2.163427768672442, + "grad_norm": 0.0008486440638080239, + "learning_rate": 2.1879121943362707e-05, + "loss": 0.5153, + "step": 27720 + }, + { + "epoch": 2.164208226020448, + "grad_norm": 0.0002521092537790537, + "learning_rate": 2.184159001910633e-05, + "loss": 0.8563, + "step": 27730 + }, + { + "epoch": 2.164988683368454, + "grad_norm": 0.7091737389564514, + "learning_rate": 2.1804081315863585e-05, + "loss": 0.4124, + "step": 27740 + }, + { + "epoch": 2.16576914071646, + "grad_norm": 227.946044921875, + "learning_rate": 2.176659586456629e-05, + "loss": 1.8093, + "step": 27750 + }, + { + "epoch": 2.166549598064466, + "grad_norm": 2.0538752778520575e-06, + "learning_rate": 2.1729133696127054e-05, + "loss": 0.0179, + "step": 27760 + }, + { + "epoch": 2.1673300554124717, + "grad_norm": 0.028842711821198463, + "learning_rate": 2.1691694841439265e-05, + "loss": 0.3815, + "step": 27770 + }, + { + "epoch": 2.1681105127604776, + "grad_norm": 3.98197114170884e-10, + "learning_rate": 2.1654279331377147e-05, + "loss": 0.4047, + "step": 27780 + }, + { + "epoch": 2.1688909701084835, + "grad_norm": 2.1827180106395245e-16, + "learning_rate": 2.1616887196795614e-05, + "loss": 0.0048, + "step": 27790 + }, + { + "epoch": 2.1696714274564894, + "grad_norm": 0.011947333812713623, + "learning_rate": 2.157951846853035e-05, + "loss": 0.0019, + "step": 27800 + }, + { + "epoch": 2.1704518848044954, + "grad_norm": 7.423368231229688e-08, + "learning_rate": 2.1542173177397713e-05, + "loss": 0.3171, + "step": 27810 + }, + { + "epoch": 2.1712323421525013, + "grad_norm": 1.2332695753514145e-08, + "learning_rate": 2.150485135419475e-05, + "loss": 0.2155, + "step": 27820 + }, + { + "epoch": 2.172012799500507, + "grad_norm": 1.0378048420633945e-10, + "learning_rate": 2.1467553029699112e-05, + "loss": 0.0055, + "step": 27830 + }, + { + "epoch": 2.172793256848513, + "grad_norm": 8.545025867090012e-17, + "learning_rate": 2.1430278234669133e-05, + "loss": 1.5454, + "step": 27840 + }, + { + "epoch": 2.173573714196519, + "grad_norm": 0.3371439576148987, + "learning_rate": 2.1393026999843703e-05, + "loss": 0.5029, + "step": 27850 + }, + { + "epoch": 2.174354171544525, + "grad_norm": 1.8403001149636111e-06, + "learning_rate": 2.1355799355942262e-05, + "loss": 0.6609, + "step": 27860 + }, + { + "epoch": 2.175134628892531, + "grad_norm": 0.5155243277549744, + "learning_rate": 2.1318595333664854e-05, + "loss": 0.8387, + "step": 27870 + }, + { + "epoch": 2.1759150862405368, + "grad_norm": 184.76950073242188, + "learning_rate": 2.1281414963692014e-05, + "loss": 0.1438, + "step": 27880 + }, + { + "epoch": 2.1766955435885427, + "grad_norm": 5.0829814426833764e-05, + "learning_rate": 2.1244258276684787e-05, + "loss": 1.4594, + "step": 27890 + }, + { + "epoch": 2.177476000936549, + "grad_norm": 4.234400677316508e-11, + "learning_rate": 2.1207125303284637e-05, + "loss": 0.0001, + "step": 27900 + }, + { + "epoch": 2.178256458284555, + "grad_norm": 1.1056568854916904e-10, + "learning_rate": 2.1170016074113542e-05, + "loss": 1.014, + "step": 27910 + }, + { + "epoch": 2.179036915632561, + "grad_norm": 4.591631075356872e-09, + "learning_rate": 2.113293061977384e-05, + "loss": 0.3403, + "step": 27920 + }, + { + "epoch": 2.1798173729805668, + "grad_norm": 6.53014765200112e-14, + "learning_rate": 2.1095868970848322e-05, + "loss": 0.008, + "step": 27930 + }, + { + "epoch": 2.1805978303285727, + "grad_norm": 0.0019696371164172888, + "learning_rate": 2.1058831157900088e-05, + "loss": 0.0308, + "step": 27940 + }, + { + "epoch": 2.1813782876765786, + "grad_norm": 8.0718174115188e-15, + "learning_rate": 2.10218172114726e-05, + "loss": 0.5434, + "step": 27950 + }, + { + "epoch": 2.1821587450245845, + "grad_norm": 0.26627057790756226, + "learning_rate": 2.0984827162089665e-05, + "loss": 0.0001, + "step": 27960 + }, + { + "epoch": 2.1829392023725904, + "grad_norm": 0.04139802232384682, + "learning_rate": 2.0947861040255356e-05, + "loss": 0.001, + "step": 27970 + }, + { + "epoch": 2.1837196597205963, + "grad_norm": 4.1397134964427096e-07, + "learning_rate": 2.091091887645405e-05, + "loss": 0.0052, + "step": 27980 + }, + { + "epoch": 2.1845001170686023, + "grad_norm": 0.00010960004874505103, + "learning_rate": 2.08740007011503e-05, + "loss": 0.002, + "step": 27990 + }, + { + "epoch": 2.185280574416608, + "grad_norm": 0.00015495822299271822, + "learning_rate": 2.0837106544788953e-05, + "loss": 0.6876, + "step": 28000 + }, + { + "epoch": 2.186061031764614, + "grad_norm": 7.378990267170593e-05, + "learning_rate": 2.0800236437794972e-05, + "loss": 0.0006, + "step": 28010 + }, + { + "epoch": 2.18684148911262, + "grad_norm": 0.0001176554651465267, + "learning_rate": 2.0763390410573567e-05, + "loss": 0.3669, + "step": 28020 + }, + { + "epoch": 2.187621946460626, + "grad_norm": 4.130102482235998e-09, + "learning_rate": 2.072656849351002e-05, + "loss": 0.0089, + "step": 28030 + }, + { + "epoch": 2.188402403808632, + "grad_norm": 1.4616011512202931e-08, + "learning_rate": 2.068977071696977e-05, + "loss": 0.1802, + "step": 28040 + }, + { + "epoch": 2.1891828611566377, + "grad_norm": 1.43246558278326e-12, + "learning_rate": 2.0652997111298363e-05, + "loss": 0.0135, + "step": 28050 + }, + { + "epoch": 2.1899633185046437, + "grad_norm": 365.1261901855469, + "learning_rate": 2.0616247706821347e-05, + "loss": 0.6377, + "step": 28060 + }, + { + "epoch": 2.1907437758526496, + "grad_norm": 12.422369956970215, + "learning_rate": 2.0579522533844397e-05, + "loss": 0.0047, + "step": 28070 + }, + { + "epoch": 2.1915242332006555, + "grad_norm": 2.1104255750061625e-15, + "learning_rate": 2.054282162265313e-05, + "loss": 1.9192, + "step": 28080 + }, + { + "epoch": 2.1923046905486614, + "grad_norm": 3.5538564278903007e-13, + "learning_rate": 2.0506145003513215e-05, + "loss": 0.0014, + "step": 28090 + }, + { + "epoch": 2.1930851478966673, + "grad_norm": 0.00013382562610786408, + "learning_rate": 2.0469492706670236e-05, + "loss": 0.2018, + "step": 28100 + }, + { + "epoch": 2.1938656052446732, + "grad_norm": 3.3238044125027955e-05, + "learning_rate": 2.043286476234975e-05, + "loss": 0.0, + "step": 28110 + }, + { + "epoch": 2.194646062592679, + "grad_norm": 6.489884071925189e-06, + "learning_rate": 2.0396261200757237e-05, + "loss": 1.2673, + "step": 28120 + }, + { + "epoch": 2.195426519940685, + "grad_norm": 4.15155426480851e-07, + "learning_rate": 2.035968205207807e-05, + "loss": 0.0223, + "step": 28130 + }, + { + "epoch": 2.196206977288691, + "grad_norm": 6.578194734174758e-05, + "learning_rate": 2.032312734647747e-05, + "loss": 1.6357, + "step": 28140 + }, + { + "epoch": 2.1969874346366973, + "grad_norm": 5.8557468616853464e-21, + "learning_rate": 2.028659711410048e-05, + "loss": 0.7133, + "step": 28150 + }, + { + "epoch": 2.1977678919847032, + "grad_norm": 5.440007953438908e-06, + "learning_rate": 2.025009138507204e-05, + "loss": 0.016, + "step": 28160 + }, + { + "epoch": 2.198548349332709, + "grad_norm": 0.00112254754640162, + "learning_rate": 2.0213610189496784e-05, + "loss": 0.0001, + "step": 28170 + }, + { + "epoch": 2.199328806680715, + "grad_norm": 2.7225906862327065e-08, + "learning_rate": 2.0177153557459212e-05, + "loss": 0.0051, + "step": 28180 + }, + { + "epoch": 2.200109264028721, + "grad_norm": 0.006148796062916517, + "learning_rate": 2.014072151902346e-05, + "loss": 1.3892, + "step": 28190 + }, + { + "epoch": 2.200889721376727, + "grad_norm": 1.1964095830917358, + "learning_rate": 2.010431410423351e-05, + "loss": 0.0007, + "step": 28200 + }, + { + "epoch": 2.201670178724733, + "grad_norm": 0.0008482402190566063, + "learning_rate": 2.0067931343112928e-05, + "loss": 0.0279, + "step": 28210 + }, + { + "epoch": 2.2024506360727387, + "grad_norm": 203.20982360839844, + "learning_rate": 2.0031573265665022e-05, + "loss": 0.8307, + "step": 28220 + }, + { + "epoch": 2.2032310934207446, + "grad_norm": 7.246639486396811e-12, + "learning_rate": 1.999523990187267e-05, + "loss": 0.0001, + "step": 28230 + }, + { + "epoch": 2.2040115507687505, + "grad_norm": 0.022905228659510612, + "learning_rate": 1.9958931281698463e-05, + "loss": 0.0331, + "step": 28240 + }, + { + "epoch": 2.2047920081167565, + "grad_norm": 8.755494125664478e-15, + "learning_rate": 1.9922647435084514e-05, + "loss": 0.0435, + "step": 28250 + }, + { + "epoch": 2.2055724654647624, + "grad_norm": 302.1018981933594, + "learning_rate": 1.988638839195251e-05, + "loss": 2.5054, + "step": 28260 + }, + { + "epoch": 2.2063529228127683, + "grad_norm": 6.365735316649079e-06, + "learning_rate": 1.9850154182203722e-05, + "loss": 0.0001, + "step": 28270 + }, + { + "epoch": 2.207133380160774, + "grad_norm": 210.52674865722656, + "learning_rate": 1.9813944835718936e-05, + "loss": 0.6641, + "step": 28280 + }, + { + "epoch": 2.20791383750878, + "grad_norm": 4.210061160847545e-05, + "learning_rate": 1.9777760382358417e-05, + "loss": 0.0003, + "step": 28290 + }, + { + "epoch": 2.208694294856786, + "grad_norm": 0.001491089817136526, + "learning_rate": 1.974160085196189e-05, + "loss": 2.045, + "step": 28300 + }, + { + "epoch": 2.209474752204792, + "grad_norm": 1.0782598110381514e-05, + "learning_rate": 1.970546627434857e-05, + "loss": 0.0367, + "step": 28310 + }, + { + "epoch": 2.210255209552798, + "grad_norm": 229.82244873046875, + "learning_rate": 1.966935667931704e-05, + "loss": 1.2602, + "step": 28320 + }, + { + "epoch": 2.2110356669008038, + "grad_norm": 5.791094537244135e-08, + "learning_rate": 1.9633272096645345e-05, + "loss": 0.0015, + "step": 28330 + }, + { + "epoch": 2.2118161242488097, + "grad_norm": 0.8963795304298401, + "learning_rate": 1.9597212556090827e-05, + "loss": 0.0004, + "step": 28340 + }, + { + "epoch": 2.2125965815968156, + "grad_norm": 1.6252406567218713e-05, + "learning_rate": 1.956117808739023e-05, + "loss": 0.0005, + "step": 28350 + }, + { + "epoch": 2.2133770389448215, + "grad_norm": 7.480576238236682e-16, + "learning_rate": 1.9525168720259647e-05, + "loss": 0.0043, + "step": 28360 + }, + { + "epoch": 2.2141574962928274, + "grad_norm": 0.00015660261851735413, + "learning_rate": 1.9489184484394374e-05, + "loss": 0.3501, + "step": 28370 + }, + { + "epoch": 2.2149379536408333, + "grad_norm": 204.61285400390625, + "learning_rate": 1.9453225409469093e-05, + "loss": 1.9355, + "step": 28380 + }, + { + "epoch": 2.2157184109888393, + "grad_norm": 1.1758524465221853e-07, + "learning_rate": 1.9417291525137656e-05, + "loss": 0.4495, + "step": 28390 + }, + { + "epoch": 2.2164988683368456, + "grad_norm": 0.0005095585947856307, + "learning_rate": 1.9381382861033194e-05, + "loss": 0.8172, + "step": 28400 + }, + { + "epoch": 2.217279325684851, + "grad_norm": 0.00011332322173984721, + "learning_rate": 1.9345499446767982e-05, + "loss": 0.1318, + "step": 28410 + }, + { + "epoch": 2.2180597830328574, + "grad_norm": 5.318884568894511e-11, + "learning_rate": 1.9309641311933535e-05, + "loss": 1.5683, + "step": 28420 + }, + { + "epoch": 2.2188402403808634, + "grad_norm": 8.58475141285453e-06, + "learning_rate": 1.927380848610051e-05, + "loss": 0.0013, + "step": 28430 + }, + { + "epoch": 2.2196206977288693, + "grad_norm": 6.644186214543879e-06, + "learning_rate": 1.9238000998818634e-05, + "loss": 0.0339, + "step": 28440 + }, + { + "epoch": 2.220401155076875, + "grad_norm": 0.00013304341700859368, + "learning_rate": 1.9202218879616824e-05, + "loss": 0.0302, + "step": 28450 + }, + { + "epoch": 2.221181612424881, + "grad_norm": 1.0497045057773069e-16, + "learning_rate": 1.9166462158002995e-05, + "loss": 0.0025, + "step": 28460 + }, + { + "epoch": 2.221962069772887, + "grad_norm": 2.2750330875361165e-14, + "learning_rate": 1.9130730863464196e-05, + "loss": 0.5506, + "step": 28470 + }, + { + "epoch": 2.222742527120893, + "grad_norm": 1.969460150519353e-09, + "learning_rate": 1.909502502546643e-05, + "loss": 0.0003, + "step": 28480 + }, + { + "epoch": 2.223522984468899, + "grad_norm": 7.709286364843138e-08, + "learning_rate": 1.905934467345478e-05, + "loss": 1.6219, + "step": 28490 + }, + { + "epoch": 2.2243034418169048, + "grad_norm": 3.405430531140739e-15, + "learning_rate": 1.9023689836853253e-05, + "loss": 0.5878, + "step": 28500 + }, + { + "epoch": 2.2250838991649107, + "grad_norm": 0.00013919989578425884, + "learning_rate": 1.898806054506484e-05, + "loss": 0.1291, + "step": 28510 + }, + { + "epoch": 2.2258643565129166, + "grad_norm": 1.1194244535462015e-18, + "learning_rate": 1.8952456827471475e-05, + "loss": 0.7777, + "step": 28520 + }, + { + "epoch": 2.2266448138609225, + "grad_norm": 4.966762290337101e-08, + "learning_rate": 1.891687871343401e-05, + "loss": 0.865, + "step": 28530 + }, + { + "epoch": 2.2274252712089284, + "grad_norm": 6.397501639554548e-09, + "learning_rate": 1.888132623229215e-05, + "loss": 0.0003, + "step": 28540 + }, + { + "epoch": 2.2282057285569343, + "grad_norm": 1.4362217370944563e-06, + "learning_rate": 1.884579941336445e-05, + "loss": 1.4745, + "step": 28550 + }, + { + "epoch": 2.2289861859049402, + "grad_norm": 0.010631516575813293, + "learning_rate": 1.881029828594837e-05, + "loss": 0.0259, + "step": 28560 + }, + { + "epoch": 2.229766643252946, + "grad_norm": 2.6044972400995903e-07, + "learning_rate": 1.8774822879320107e-05, + "loss": 0.0, + "step": 28570 + }, + { + "epoch": 2.230547100600952, + "grad_norm": 9.406716117155156e-07, + "learning_rate": 1.8739373222734708e-05, + "loss": 1.3937, + "step": 28580 + }, + { + "epoch": 2.231327557948958, + "grad_norm": 0.0014336185995489359, + "learning_rate": 1.8703949345425948e-05, + "loss": 0.1551, + "step": 28590 + }, + { + "epoch": 2.232108015296964, + "grad_norm": 1.689692794570874e-06, + "learning_rate": 1.8668551276606377e-05, + "loss": 1.1188, + "step": 28600 + }, + { + "epoch": 2.23288847264497, + "grad_norm": 2.9793050289154053, + "learning_rate": 1.8633179045467203e-05, + "loss": 0.0023, + "step": 28610 + }, + { + "epoch": 2.2336689299929757, + "grad_norm": 3.615961425684072e-13, + "learning_rate": 1.8597832681178405e-05, + "loss": 1.1973, + "step": 28620 + }, + { + "epoch": 2.2344493873409816, + "grad_norm": 0.01192985288798809, + "learning_rate": 1.8562512212888565e-05, + "loss": 0.003, + "step": 28630 + }, + { + "epoch": 2.2352298446889876, + "grad_norm": 215.98329162597656, + "learning_rate": 1.8527217669724924e-05, + "loss": 2.44, + "step": 28640 + }, + { + "epoch": 2.236010302036994, + "grad_norm": 0.008464322425425053, + "learning_rate": 1.849194908079336e-05, + "loss": 0.3656, + "step": 28650 + }, + { + "epoch": 2.2367907593849994, + "grad_norm": 0.8279557228088379, + "learning_rate": 1.8456706475178347e-05, + "loss": 1.2165, + "step": 28660 + }, + { + "epoch": 2.2375712167330057, + "grad_norm": 3.254080729675479e-05, + "learning_rate": 1.842148988194295e-05, + "loss": 0.0004, + "step": 28670 + }, + { + "epoch": 2.2383516740810117, + "grad_norm": 1.2258682957622113e-10, + "learning_rate": 1.8386299330128714e-05, + "loss": 1.4324, + "step": 28680 + }, + { + "epoch": 2.2391321314290176, + "grad_norm": 0.004264687187969685, + "learning_rate": 1.8351134848755796e-05, + "loss": 1.745, + "step": 28690 + }, + { + "epoch": 2.2399125887770235, + "grad_norm": 0.00042669291724450886, + "learning_rate": 1.8315996466822772e-05, + "loss": 0.0516, + "step": 28700 + }, + { + "epoch": 2.2406930461250294, + "grad_norm": 0.00012339148088358343, + "learning_rate": 1.828088421330677e-05, + "loss": 0.651, + "step": 28710 + }, + { + "epoch": 2.2414735034730353, + "grad_norm": 2.1883664585303775e-12, + "learning_rate": 1.82457981171633e-05, + "loss": 0.5319, + "step": 28720 + }, + { + "epoch": 2.2422539608210412, + "grad_norm": 2.0608095785412672e-10, + "learning_rate": 1.8210738207326356e-05, + "loss": 0.9683, + "step": 28730 + }, + { + "epoch": 2.243034418169047, + "grad_norm": 4.095708573004231e-05, + "learning_rate": 1.8175704512708335e-05, + "loss": 0.0071, + "step": 28740 + }, + { + "epoch": 2.243814875517053, + "grad_norm": 1.1333201307891236e-15, + "learning_rate": 1.8140697062199963e-05, + "loss": 0.0018, + "step": 28750 + }, + { + "epoch": 2.244595332865059, + "grad_norm": 0.00019960488134529442, + "learning_rate": 1.8105715884670388e-05, + "loss": 0.7299, + "step": 28760 + }, + { + "epoch": 2.245375790213065, + "grad_norm": 0.051362793892621994, + "learning_rate": 1.8070761008967036e-05, + "loss": 0.0008, + "step": 28770 + }, + { + "epoch": 2.246156247561071, + "grad_norm": 1.3369414773478638e-05, + "learning_rate": 1.8035832463915702e-05, + "loss": 1.4077, + "step": 28780 + }, + { + "epoch": 2.2469367049090767, + "grad_norm": 6.260481313802302e-06, + "learning_rate": 1.8000930278320398e-05, + "loss": 0.001, + "step": 28790 + }, + { + "epoch": 2.2477171622570826, + "grad_norm": 2.0338929718481842e-14, + "learning_rate": 1.796605448096348e-05, + "loss": 0.0582, + "step": 28800 + }, + { + "epoch": 2.2484976196050885, + "grad_norm": 2.200808495445017e-07, + "learning_rate": 1.7931205100605446e-05, + "loss": 0.9008, + "step": 28810 + }, + { + "epoch": 2.2492780769530945, + "grad_norm": 5.862556551328169e-10, + "learning_rate": 1.7896382165985092e-05, + "loss": 0.2649, + "step": 28820 + }, + { + "epoch": 2.2500585343011004, + "grad_norm": 0.005376019049435854, + "learning_rate": 1.786158570581939e-05, + "loss": 0.0015, + "step": 28830 + }, + { + "epoch": 2.2508389916491063, + "grad_norm": 17.125396728515625, + "learning_rate": 1.782681574880343e-05, + "loss": 0.5649, + "step": 28840 + }, + { + "epoch": 2.251619448997112, + "grad_norm": 1.2211214303970337, + "learning_rate": 1.7792072323610508e-05, + "loss": 1.5577, + "step": 28850 + }, + { + "epoch": 2.252399906345118, + "grad_norm": 3.984374462306928e-12, + "learning_rate": 1.7757355458891982e-05, + "loss": 0.3288, + "step": 28860 + }, + { + "epoch": 2.253180363693124, + "grad_norm": 1.6309401051906036e-10, + "learning_rate": 1.772266518327738e-05, + "loss": 0.0083, + "step": 28870 + }, + { + "epoch": 2.25396082104113, + "grad_norm": 0.00034212172613479197, + "learning_rate": 1.7688001525374215e-05, + "loss": 0.6162, + "step": 28880 + }, + { + "epoch": 2.254741278389136, + "grad_norm": 1.2708798635685525e-07, + "learning_rate": 1.7653364513768116e-05, + "loss": 0.0385, + "step": 28890 + }, + { + "epoch": 2.255521735737142, + "grad_norm": 0.03911673277616501, + "learning_rate": 1.7618754177022722e-05, + "loss": 0.8912, + "step": 28900 + }, + { + "epoch": 2.2563021930851477, + "grad_norm": 0.017582669854164124, + "learning_rate": 1.758417054367968e-05, + "loss": 0.4513, + "step": 28910 + }, + { + "epoch": 2.257082650433154, + "grad_norm": 2.5487133291455383e-14, + "learning_rate": 1.7549613642258573e-05, + "loss": 0.0602, + "step": 28920 + }, + { + "epoch": 2.25786310778116, + "grad_norm": 1.2419158679222164e-07, + "learning_rate": 1.7515083501257006e-05, + "loss": 0.0005, + "step": 28930 + }, + { + "epoch": 2.258643565129166, + "grad_norm": 0.010186822153627872, + "learning_rate": 1.7480580149150466e-05, + "loss": 0.0001, + "step": 28940 + }, + { + "epoch": 2.259424022477172, + "grad_norm": 2.926939123426564e-05, + "learning_rate": 1.7446103614392343e-05, + "loss": 0.0044, + "step": 28950 + }, + { + "epoch": 2.2602044798251777, + "grad_norm": 0.6418260931968689, + "learning_rate": 1.7411653925413957e-05, + "loss": 1.549, + "step": 28960 + }, + { + "epoch": 2.2609849371731836, + "grad_norm": 1.1723219586201594e-06, + "learning_rate": 1.737723111062446e-05, + "loss": 0.6384, + "step": 28970 + }, + { + "epoch": 2.2617653945211895, + "grad_norm": 0.04600205644965172, + "learning_rate": 1.7342835198410863e-05, + "loss": 0.0717, + "step": 28980 + }, + { + "epoch": 2.2625458518691954, + "grad_norm": 199.13775634765625, + "learning_rate": 1.730846621713795e-05, + "loss": 0.3244, + "step": 28990 + }, + { + "epoch": 2.2633263092172013, + "grad_norm": 3.316258727336959e-17, + "learning_rate": 1.7274124195148338e-05, + "loss": 0.0252, + "step": 29000 + }, + { + "epoch": 2.2641067665652073, + "grad_norm": 1.9493307945595006e-07, + "learning_rate": 1.7239809160762383e-05, + "loss": 0.5724, + "step": 29010 + }, + { + "epoch": 2.264887223913213, + "grad_norm": 0.06668946892023087, + "learning_rate": 1.7205521142278225e-05, + "loss": 0.9245, + "step": 29020 + }, + { + "epoch": 2.265667681261219, + "grad_norm": 4.2764901185153234e-14, + "learning_rate": 1.7171260167971658e-05, + "loss": 0.1911, + "step": 29030 + }, + { + "epoch": 2.266448138609225, + "grad_norm": 1.0847069025039673, + "learning_rate": 1.713702626609624e-05, + "loss": 0.1063, + "step": 29040 + }, + { + "epoch": 2.267228595957231, + "grad_norm": 0.001475973636843264, + "learning_rate": 1.710281946488319e-05, + "loss": 0.4168, + "step": 29050 + }, + { + "epoch": 2.268009053305237, + "grad_norm": 5.03236151416786e-07, + "learning_rate": 1.7068639792541337e-05, + "loss": 1.7054, + "step": 29060 + }, + { + "epoch": 2.2687895106532427, + "grad_norm": 5.004310854594962e-10, + "learning_rate": 1.7034487277257193e-05, + "loss": 0.0003, + "step": 29070 + }, + { + "epoch": 2.2695699680012487, + "grad_norm": 1.9284122743101761e-07, + "learning_rate": 1.7000361947194828e-05, + "loss": 0.0043, + "step": 29080 + }, + { + "epoch": 2.2703504253492546, + "grad_norm": 1.50415091686904e-16, + "learning_rate": 1.6966263830495936e-05, + "loss": 0.0086, + "step": 29090 + }, + { + "epoch": 2.2711308826972605, + "grad_norm": 163.6029052734375, + "learning_rate": 1.6932192955279724e-05, + "loss": 0.338, + "step": 29100 + }, + { + "epoch": 2.2719113400452664, + "grad_norm": 4.9290561118956947e-17, + "learning_rate": 1.6898149349642984e-05, + "loss": 0.1245, + "step": 29110 + }, + { + "epoch": 2.2726917973932723, + "grad_norm": 0.01845482736825943, + "learning_rate": 1.6864133041659964e-05, + "loss": 0.7721, + "step": 29120 + }, + { + "epoch": 2.2734722547412782, + "grad_norm": 0.42253369092941284, + "learning_rate": 1.6830144059382448e-05, + "loss": 0.0258, + "step": 29130 + }, + { + "epoch": 2.274252712089284, + "grad_norm": 0.0007411639671772718, + "learning_rate": 1.679618243083968e-05, + "loss": 0.7881, + "step": 29140 + }, + { + "epoch": 2.27503316943729, + "grad_norm": 2.0271454559406266e-05, + "learning_rate": 1.676224818403831e-05, + "loss": 0.4049, + "step": 29150 + }, + { + "epoch": 2.275813626785296, + "grad_norm": 0.1324540078639984, + "learning_rate": 1.6728341346962462e-05, + "loss": 0.0122, + "step": 29160 + }, + { + "epoch": 2.2765940841333023, + "grad_norm": 2.3629531491209388e-12, + "learning_rate": 1.669446194757359e-05, + "loss": 0.1406, + "step": 29170 + }, + { + "epoch": 2.277374541481308, + "grad_norm": 2.2875452643233984e-08, + "learning_rate": 1.6660610013810603e-05, + "loss": 0.0, + "step": 29180 + }, + { + "epoch": 2.278154998829314, + "grad_norm": 8.568787279727985e-08, + "learning_rate": 1.6626785573589665e-05, + "loss": 1.0792, + "step": 29190 + }, + { + "epoch": 2.27893545617732, + "grad_norm": 1.937781780725345e-05, + "learning_rate": 1.659298865480435e-05, + "loss": 0.6439, + "step": 29200 + }, + { + "epoch": 2.279715913525326, + "grad_norm": 0.010542660020291805, + "learning_rate": 1.6559219285325495e-05, + "loss": 0.1753, + "step": 29210 + }, + { + "epoch": 2.280496370873332, + "grad_norm": 28.777854919433594, + "learning_rate": 1.6525477493001253e-05, + "loss": 0.0113, + "step": 29220 + }, + { + "epoch": 2.281276828221338, + "grad_norm": 0.0008937482489272952, + "learning_rate": 1.649176330565698e-05, + "loss": 0.1313, + "step": 29230 + }, + { + "epoch": 2.2820572855693437, + "grad_norm": 3.146440940327011e-05, + "learning_rate": 1.645807675109529e-05, + "loss": 0.0, + "step": 29240 + }, + { + "epoch": 2.2828377429173496, + "grad_norm": 3.4924204328490305e-07, + "learning_rate": 1.6424417857096052e-05, + "loss": 0.0139, + "step": 29250 + }, + { + "epoch": 2.2836182002653556, + "grad_norm": 1.5560833332983748e-07, + "learning_rate": 1.6390786651416245e-05, + "loss": 0.4481, + "step": 29260 + }, + { + "epoch": 2.2843986576133615, + "grad_norm": 0.00024010951165109873, + "learning_rate": 1.6357183161790086e-05, + "loss": 0.0023, + "step": 29270 + }, + { + "epoch": 2.2851791149613674, + "grad_norm": 2.9010316371369527e-09, + "learning_rate": 1.6323607415928905e-05, + "loss": 0.0002, + "step": 29280 + }, + { + "epoch": 2.2859595723093733, + "grad_norm": 1.5927986169117503e-05, + "learning_rate": 1.629005944152117e-05, + "loss": 0.0212, + "step": 29290 + }, + { + "epoch": 2.286740029657379, + "grad_norm": 8.826788578797239e-22, + "learning_rate": 1.6256539266232406e-05, + "loss": 0.0007, + "step": 29300 + }, + { + "epoch": 2.287520487005385, + "grad_norm": 1.8401525494482485e-06, + "learning_rate": 1.622304691770527e-05, + "loss": 2.0744, + "step": 29310 + }, + { + "epoch": 2.288300944353391, + "grad_norm": 3.847404514090158e-06, + "learning_rate": 1.6189582423559424e-05, + "loss": 0.0424, + "step": 29320 + }, + { + "epoch": 2.289081401701397, + "grad_norm": 0.34409666061401367, + "learning_rate": 1.6156145811391565e-05, + "loss": 0.2845, + "step": 29330 + }, + { + "epoch": 2.289861859049403, + "grad_norm": 180.2010498046875, + "learning_rate": 1.6122737108775444e-05, + "loss": 0.2992, + "step": 29340 + }, + { + "epoch": 2.290642316397409, + "grad_norm": 9.646710513733225e-11, + "learning_rate": 1.6089356343261706e-05, + "loss": 1.9876, + "step": 29350 + }, + { + "epoch": 2.2914227737454147, + "grad_norm": 6.982622813289652e-10, + "learning_rate": 1.6056003542378088e-05, + "loss": 0.0046, + "step": 29360 + }, + { + "epoch": 2.2922032310934206, + "grad_norm": 0.0001232082722708583, + "learning_rate": 1.6022678733629142e-05, + "loss": 0.0002, + "step": 29370 + }, + { + "epoch": 2.2929836884414265, + "grad_norm": 294.8337707519531, + "learning_rate": 1.598938194449641e-05, + "loss": 1.363, + "step": 29380 + }, + { + "epoch": 2.2937641457894324, + "grad_norm": 0.3886575400829315, + "learning_rate": 1.595611320243828e-05, + "loss": 0.8926, + "step": 29390 + }, + { + "epoch": 2.2945446031374384, + "grad_norm": 0.0002822733367793262, + "learning_rate": 1.5922872534890054e-05, + "loss": 0.0, + "step": 29400 + }, + { + "epoch": 2.2953250604854443, + "grad_norm": 79.61001586914062, + "learning_rate": 1.5889659969263843e-05, + "loss": 0.3346, + "step": 29410 + }, + { + "epoch": 2.2961055178334506, + "grad_norm": 3.2513478021023445e-10, + "learning_rate": 1.585647553294863e-05, + "loss": 0.0014, + "step": 29420 + }, + { + "epoch": 2.296885975181456, + "grad_norm": 6.914351313724865e-10, + "learning_rate": 1.5823319253310133e-05, + "loss": 3.8508, + "step": 29430 + }, + { + "epoch": 2.2976664325294625, + "grad_norm": 6.965128208349589e-14, + "learning_rate": 1.579019115769092e-05, + "loss": 0.0199, + "step": 29440 + }, + { + "epoch": 2.2984468898774684, + "grad_norm": 2.478855662402246e-12, + "learning_rate": 1.5757091273410296e-05, + "loss": 0.3049, + "step": 29450 + }, + { + "epoch": 2.2992273472254743, + "grad_norm": 5.027878614782821e-06, + "learning_rate": 1.5724019627764265e-05, + "loss": 0.4531, + "step": 29460 + }, + { + "epoch": 2.30000780457348, + "grad_norm": 111.20915985107422, + "learning_rate": 1.5690976248025603e-05, + "loss": 2.1627, + "step": 29470 + }, + { + "epoch": 2.300788261921486, + "grad_norm": 62.71500015258789, + "learning_rate": 1.565796116144371e-05, + "loss": 0.0257, + "step": 29480 + }, + { + "epoch": 2.301568719269492, + "grad_norm": 9.235794351525328e-08, + "learning_rate": 1.5624974395244722e-05, + "loss": 0.2352, + "step": 29490 + }, + { + "epoch": 2.302349176617498, + "grad_norm": 6.738876492419499e-13, + "learning_rate": 1.5592015976631363e-05, + "loss": 0.0007, + "step": 29500 + }, + { + "epoch": 2.303129633965504, + "grad_norm": 1.0343747314443312e-16, + "learning_rate": 1.5559085932783013e-05, + "loss": 0.0698, + "step": 29510 + }, + { + "epoch": 2.3039100913135098, + "grad_norm": 0.0002653372648637742, + "learning_rate": 1.552618429085566e-05, + "loss": 0.003, + "step": 29520 + }, + { + "epoch": 2.3046905486615157, + "grad_norm": 3.0795322888965693e-12, + "learning_rate": 1.5493311077981827e-05, + "loss": 0.0039, + "step": 29530 + }, + { + "epoch": 2.3054710060095216, + "grad_norm": 0.0023853040765970945, + "learning_rate": 1.546046632127065e-05, + "loss": 2.4906, + "step": 29540 + }, + { + "epoch": 2.3062514633575275, + "grad_norm": 12.284415245056152, + "learning_rate": 1.5427650047807733e-05, + "loss": 0.6692, + "step": 29550 + }, + { + "epoch": 2.3070319207055334, + "grad_norm": 5.2024464736177833e-08, + "learning_rate": 1.5394862284655264e-05, + "loss": 0.0006, + "step": 29560 + }, + { + "epoch": 2.3078123780535393, + "grad_norm": 7.312227694455942e-07, + "learning_rate": 1.5362103058851847e-05, + "loss": 0.6756, + "step": 29570 + }, + { + "epoch": 2.3085928354015453, + "grad_norm": 7.574797271613494e-18, + "learning_rate": 1.5329372397412605e-05, + "loss": 0.0, + "step": 29580 + }, + { + "epoch": 2.309373292749551, + "grad_norm": 0.0, + "learning_rate": 1.529667032732909e-05, + "loss": 0.97, + "step": 29590 + }, + { + "epoch": 2.310153750097557, + "grad_norm": 5.293955920339377e-23, + "learning_rate": 1.5263996875569288e-05, + "loss": 0.0633, + "step": 29600 + }, + { + "epoch": 2.310934207445563, + "grad_norm": 0.1192730963230133, + "learning_rate": 1.5231352069077553e-05, + "loss": 0.1783, + "step": 29610 + }, + { + "epoch": 2.311714664793569, + "grad_norm": 3.9510272472398356e-05, + "learning_rate": 1.5198735934774627e-05, + "loss": 0.0019, + "step": 29620 + }, + { + "epoch": 2.312495122141575, + "grad_norm": 4.5667154546834635e-14, + "learning_rate": 1.5166148499557637e-05, + "loss": 0.1592, + "step": 29630 + }, + { + "epoch": 2.3132755794895807, + "grad_norm": 151.92227172851562, + "learning_rate": 1.513358979029999e-05, + "loss": 0.7626, + "step": 29640 + }, + { + "epoch": 2.3140560368375866, + "grad_norm": 1.377452372253174e-05, + "learning_rate": 1.5101059833851473e-05, + "loss": 1.341, + "step": 29650 + }, + { + "epoch": 2.3148364941855926, + "grad_norm": 18.119611740112305, + "learning_rate": 1.5068558657038084e-05, + "loss": 0.0049, + "step": 29660 + }, + { + "epoch": 2.315616951533599, + "grad_norm": 3.9845085098022537e-07, + "learning_rate": 1.5036086286662148e-05, + "loss": 1.8925, + "step": 29670 + }, + { + "epoch": 2.3163974088816044, + "grad_norm": 9.201725481838147e-16, + "learning_rate": 1.500364274950221e-05, + "loss": 1.578, + "step": 29680 + }, + { + "epoch": 2.3171778662296107, + "grad_norm": 1.5374733266071416e-05, + "learning_rate": 1.497122807231306e-05, + "loss": 0.0, + "step": 29690 + }, + { + "epoch": 2.3179583235776167, + "grad_norm": 1.1048708984162658e-05, + "learning_rate": 1.4938842281825637e-05, + "loss": 0.1567, + "step": 29700 + }, + { + "epoch": 2.3187387809256226, + "grad_norm": 5.3418560241880186e-08, + "learning_rate": 1.4906485404747127e-05, + "loss": 0.0, + "step": 29710 + }, + { + "epoch": 2.3195192382736285, + "grad_norm": 1.4238677841695176e-15, + "learning_rate": 1.4874157467760812e-05, + "loss": 1.4317, + "step": 29720 + }, + { + "epoch": 2.3202996956216344, + "grad_norm": 0.0007119226502254605, + "learning_rate": 1.484185849752613e-05, + "loss": 1.1788, + "step": 29730 + }, + { + "epoch": 2.3210801529696403, + "grad_norm": 4.933306172461016e-07, + "learning_rate": 1.4809588520678636e-05, + "loss": 0.0012, + "step": 29740 + }, + { + "epoch": 2.3218606103176462, + "grad_norm": 0.19965912401676178, + "learning_rate": 1.477734756382998e-05, + "loss": 0.0255, + "step": 29750 + }, + { + "epoch": 2.322641067665652, + "grad_norm": 0.8561561107635498, + "learning_rate": 1.4745135653567887e-05, + "loss": 0.3782, + "step": 29760 + }, + { + "epoch": 2.323421525013658, + "grad_norm": 0.00013684302393812686, + "learning_rate": 1.4712952816456093e-05, + "loss": 0.8732, + "step": 29770 + }, + { + "epoch": 2.324201982361664, + "grad_norm": 1.5134020259210956e-07, + "learning_rate": 1.4680799079034402e-05, + "loss": 0.9348, + "step": 29780 + }, + { + "epoch": 2.32498243970967, + "grad_norm": 2.4426712388891936e-11, + "learning_rate": 1.4648674467818573e-05, + "loss": 0.0, + "step": 29790 + }, + { + "epoch": 2.325762897057676, + "grad_norm": 0.0035370292607694864, + "learning_rate": 1.4616579009300407e-05, + "loss": 0.0001, + "step": 29800 + }, + { + "epoch": 2.3265433544056817, + "grad_norm": 1.096762188785628e-15, + "learning_rate": 1.4584512729947597e-05, + "loss": 0.0002, + "step": 29810 + }, + { + "epoch": 2.3273238117536876, + "grad_norm": 1.2029965912319085e-09, + "learning_rate": 1.4552475656203817e-05, + "loss": 0.0779, + "step": 29820 + }, + { + "epoch": 2.3281042691016935, + "grad_norm": 120.34854125976562, + "learning_rate": 1.452046781448867e-05, + "loss": 0.1021, + "step": 29830 + }, + { + "epoch": 2.3288847264496995, + "grad_norm": 3.3695923207233136e-07, + "learning_rate": 1.4488489231197589e-05, + "loss": 0.0978, + "step": 29840 + }, + { + "epoch": 2.3296651837977054, + "grad_norm": 7.707518670940772e-05, + "learning_rate": 1.4456539932701957e-05, + "loss": 0.9351, + "step": 29850 + }, + { + "epoch": 2.3304456411457113, + "grad_norm": 0.002746081445366144, + "learning_rate": 1.4424619945348927e-05, + "loss": 0.0, + "step": 29860 + }, + { + "epoch": 2.331226098493717, + "grad_norm": 0.0011035347124561667, + "learning_rate": 1.439272929546156e-05, + "loss": 0.0, + "step": 29870 + }, + { + "epoch": 2.332006555841723, + "grad_norm": 1.0748327783360878e-09, + "learning_rate": 1.4360868009338658e-05, + "loss": 0.0, + "step": 29880 + }, + { + "epoch": 2.332787013189729, + "grad_norm": 6.85275438172539e-07, + "learning_rate": 1.432903611325484e-05, + "loss": 0.0, + "step": 29890 + }, + { + "epoch": 2.333567470537735, + "grad_norm": 3.086071274083224e-06, + "learning_rate": 1.4297233633460489e-05, + "loss": 1.504, + "step": 29900 + }, + { + "epoch": 2.334347927885741, + "grad_norm": 0.00041684453026391566, + "learning_rate": 1.426546059618174e-05, + "loss": 0.0, + "step": 29910 + }, + { + "epoch": 2.335128385233747, + "grad_norm": 187.2286834716797, + "learning_rate": 1.4233717027620413e-05, + "loss": 2.9529, + "step": 29920 + }, + { + "epoch": 2.3359088425817527, + "grad_norm": 6.371737981680781e-07, + "learning_rate": 1.4202002953954041e-05, + "loss": 0.3322, + "step": 29930 + }, + { + "epoch": 2.336689299929759, + "grad_norm": 0.9140729904174805, + "learning_rate": 1.4170318401335859e-05, + "loss": 0.1837, + "step": 29940 + }, + { + "epoch": 2.337469757277765, + "grad_norm": 9.586511851011892e-08, + "learning_rate": 1.4138663395894702e-05, + "loss": 0.0001, + "step": 29950 + }, + { + "epoch": 2.338250214625771, + "grad_norm": 166.947998046875, + "learning_rate": 1.410703796373512e-05, + "loss": 0.1164, + "step": 29960 + }, + { + "epoch": 2.339030671973777, + "grad_norm": 0.01821865886449814, + "learning_rate": 1.4075442130937183e-05, + "loss": 0.2303, + "step": 29970 + }, + { + "epoch": 2.3398111293217827, + "grad_norm": 0.018942847847938538, + "learning_rate": 1.4043875923556627e-05, + "loss": 0.0001, + "step": 29980 + }, + { + "epoch": 2.3405915866697886, + "grad_norm": 0.03353448212146759, + "learning_rate": 1.4012339367624711e-05, + "loss": 0.7039, + "step": 29990 + }, + { + "epoch": 2.3413720440177945, + "grad_norm": 1.6034233851769386e-07, + "learning_rate": 1.3980832489148287e-05, + "loss": 0.1224, + "step": 30000 + }, + { + "epoch": 2.3421525013658004, + "grad_norm": 194.84609985351562, + "learning_rate": 1.3949355314109686e-05, + "loss": 0.7069, + "step": 30010 + }, + { + "epoch": 2.3429329587138064, + "grad_norm": 0.00033877615351229906, + "learning_rate": 1.3917907868466745e-05, + "loss": 0.0678, + "step": 30020 + }, + { + "epoch": 2.3437134160618123, + "grad_norm": 0.2966441512107849, + "learning_rate": 1.3886490178152834e-05, + "loss": 0.0029, + "step": 30030 + }, + { + "epoch": 2.344493873409818, + "grad_norm": 3.915573643098469e-07, + "learning_rate": 1.385510226907672e-05, + "loss": 0.3636, + "step": 30040 + }, + { + "epoch": 2.345274330757824, + "grad_norm": 1.262197732925415, + "learning_rate": 1.3823744167122665e-05, + "loss": 0.2405, + "step": 30050 + }, + { + "epoch": 2.34605478810583, + "grad_norm": 2.086556503569506e-19, + "learning_rate": 1.3792415898150323e-05, + "loss": 2.1505, + "step": 30060 + }, + { + "epoch": 2.346835245453836, + "grad_norm": 5.639322807837743e-06, + "learning_rate": 1.3761117487994767e-05, + "loss": 0.0, + "step": 30070 + }, + { + "epoch": 2.347615702801842, + "grad_norm": 0.0005181823507882655, + "learning_rate": 1.3729848962466407e-05, + "loss": 0.0275, + "step": 30080 + }, + { + "epoch": 2.3483961601498478, + "grad_norm": 5.675804004567908e-07, + "learning_rate": 1.3698610347351065e-05, + "loss": 0.0289, + "step": 30090 + }, + { + "epoch": 2.3491766174978537, + "grad_norm": 2.3446404817661914e-09, + "learning_rate": 1.366740166840984e-05, + "loss": 0.0001, + "step": 30100 + }, + { + "epoch": 2.3499570748458596, + "grad_norm": 7.366129928243481e-09, + "learning_rate": 1.3636222951379168e-05, + "loss": 0.1921, + "step": 30110 + }, + { + "epoch": 2.3507375321938655, + "grad_norm": 6.541293259942904e-06, + "learning_rate": 1.3605074221970781e-05, + "loss": 0.3531, + "step": 30120 + }, + { + "epoch": 2.3515179895418714, + "grad_norm": 7.940363033753783e-09, + "learning_rate": 1.3573955505871689e-05, + "loss": 0.0002, + "step": 30130 + }, + { + "epoch": 2.3522984468898773, + "grad_norm": 2.120935965134194e-18, + "learning_rate": 1.3542866828744145e-05, + "loss": 0.1597, + "step": 30140 + }, + { + "epoch": 2.3530789042378832, + "grad_norm": 2.8953287250360615e-12, + "learning_rate": 1.3511808216225607e-05, + "loss": 0.0, + "step": 30150 + }, + { + "epoch": 2.353859361585889, + "grad_norm": 426.9054260253906, + "learning_rate": 1.348077969392878e-05, + "loss": 0.7626, + "step": 30160 + }, + { + "epoch": 2.3546398189338955, + "grad_norm": 0.006858580280095339, + "learning_rate": 1.3449781287441516e-05, + "loss": 0.121, + "step": 30170 + }, + { + "epoch": 2.355420276281901, + "grad_norm": 0.0003124236536677927, + "learning_rate": 1.341881302232687e-05, + "loss": 1.0282, + "step": 30180 + }, + { + "epoch": 2.3562007336299073, + "grad_norm": 0.06409773230552673, + "learning_rate": 1.338787492412299e-05, + "loss": 0.8941, + "step": 30190 + }, + { + "epoch": 2.356981190977913, + "grad_norm": 9.21978660084477e-10, + "learning_rate": 1.3356967018343202e-05, + "loss": 0.0274, + "step": 30200 + }, + { + "epoch": 2.357761648325919, + "grad_norm": 272.5448303222656, + "learning_rate": 1.3326089330475916e-05, + "loss": 0.3298, + "step": 30210 + }, + { + "epoch": 2.358542105673925, + "grad_norm": 4.685094972955994e-05, + "learning_rate": 1.3295241885984582e-05, + "loss": 0.1805, + "step": 30220 + }, + { + "epoch": 2.359322563021931, + "grad_norm": 8.916493324161243e-18, + "learning_rate": 1.3264424710307788e-05, + "loss": 0.0028, + "step": 30230 + }, + { + "epoch": 2.360103020369937, + "grad_norm": 10.937233924865723, + "learning_rate": 1.3233637828859074e-05, + "loss": 0.3216, + "step": 30240 + }, + { + "epoch": 2.360883477717943, + "grad_norm": 1.9286907121755917e-16, + "learning_rate": 1.3202881267027067e-05, + "loss": 0.0558, + "step": 30250 + }, + { + "epoch": 2.3616639350659487, + "grad_norm": 1.1528916274983203e-06, + "learning_rate": 1.3172155050175344e-05, + "loss": 0.001, + "step": 30260 + }, + { + "epoch": 2.3624443924139547, + "grad_norm": 9.056205385604699e-08, + "learning_rate": 1.3141459203642503e-05, + "loss": 0.0201, + "step": 30270 + }, + { + "epoch": 2.3632248497619606, + "grad_norm": 72.918701171875, + "learning_rate": 1.3110793752742035e-05, + "loss": 0.7724, + "step": 30280 + }, + { + "epoch": 2.3640053071099665, + "grad_norm": 3.27559462220961e-07, + "learning_rate": 1.308015872276242e-05, + "loss": 0.099, + "step": 30290 + }, + { + "epoch": 2.3647857644579724, + "grad_norm": 2.3372873329208232e-05, + "learning_rate": 1.3049554138967051e-05, + "loss": 0.0002, + "step": 30300 + }, + { + "epoch": 2.3655662218059783, + "grad_norm": 177.09768676757812, + "learning_rate": 1.3018980026594164e-05, + "loss": 0.5471, + "step": 30310 + }, + { + "epoch": 2.3663466791539842, + "grad_norm": 1.786733446351718e-05, + "learning_rate": 1.2988436410856918e-05, + "loss": 2.1845, + "step": 30320 + }, + { + "epoch": 2.36712713650199, + "grad_norm": 1.1945124576973054e-11, + "learning_rate": 1.2957923316943283e-05, + "loss": 0.7127, + "step": 30330 + }, + { + "epoch": 2.367907593849996, + "grad_norm": 11.638202667236328, + "learning_rate": 1.2927440770016102e-05, + "loss": 0.2237, + "step": 30340 + }, + { + "epoch": 2.368688051198002, + "grad_norm": 4.211646000840119e-07, + "learning_rate": 1.2896988795212977e-05, + "loss": 0.0001, + "step": 30350 + }, + { + "epoch": 2.369468508546008, + "grad_norm": 251.97036743164062, + "learning_rate": 1.286656741764633e-05, + "loss": 1.2411, + "step": 30360 + }, + { + "epoch": 2.370248965894014, + "grad_norm": 0.6915257573127747, + "learning_rate": 1.283617666240336e-05, + "loss": 0.0002, + "step": 30370 + }, + { + "epoch": 2.3710294232420197, + "grad_norm": 7.871942943893373e-05, + "learning_rate": 1.2805816554546002e-05, + "loss": 0.0, + "step": 30380 + }, + { + "epoch": 2.3718098805900256, + "grad_norm": 7.493450482343178e-08, + "learning_rate": 1.277548711911089e-05, + "loss": 0.2484, + "step": 30390 + }, + { + "epoch": 2.3725903379380315, + "grad_norm": 1.512717366218567, + "learning_rate": 1.2745188381109413e-05, + "loss": 0.2046, + "step": 30400 + }, + { + "epoch": 2.3733707952860374, + "grad_norm": 0.9041994214057922, + "learning_rate": 1.2714920365527611e-05, + "loss": 0.0852, + "step": 30410 + }, + { + "epoch": 2.3741512526340434, + "grad_norm": 0.08699410408735275, + "learning_rate": 1.2684683097326177e-05, + "loss": 1.9877, + "step": 30420 + }, + { + "epoch": 2.3749317099820493, + "grad_norm": 6.432399459299631e-06, + "learning_rate": 1.2654476601440484e-05, + "loss": 1.4962, + "step": 30430 + }, + { + "epoch": 2.3757121673300556, + "grad_norm": 0.00024829444009810686, + "learning_rate": 1.2624300902780518e-05, + "loss": 0.0001, + "step": 30440 + }, + { + "epoch": 2.376492624678061, + "grad_norm": 0.0683436170220375, + "learning_rate": 1.2594156026230869e-05, + "loss": 1.5125, + "step": 30450 + }, + { + "epoch": 2.3772730820260675, + "grad_norm": 4.083558796933486e-22, + "learning_rate": 1.2564041996650678e-05, + "loss": 0.1911, + "step": 30460 + }, + { + "epoch": 2.3780535393740734, + "grad_norm": 5.6656685046618804e-05, + "learning_rate": 1.2533958838873705e-05, + "loss": 0.0, + "step": 30470 + }, + { + "epoch": 2.3788339967220793, + "grad_norm": 6.898962601553649e-05, + "learning_rate": 1.2503906577708185e-05, + "loss": 0.0, + "step": 30480 + }, + { + "epoch": 2.379614454070085, + "grad_norm": 2.0142909909481865e-11, + "learning_rate": 1.2473885237936944e-05, + "loss": 0.3101, + "step": 30490 + }, + { + "epoch": 2.380394911418091, + "grad_norm": 0.00017895206110551953, + "learning_rate": 1.244389484431724e-05, + "loss": 0.0003, + "step": 30500 + }, + { + "epoch": 2.381175368766097, + "grad_norm": 6.29999067314202e-06, + "learning_rate": 1.2413935421580851e-05, + "loss": 0.0, + "step": 30510 + }, + { + "epoch": 2.381955826114103, + "grad_norm": 4.521431833381939e-07, + "learning_rate": 1.2384006994434032e-05, + "loss": 0.0, + "step": 30520 + }, + { + "epoch": 2.382736283462109, + "grad_norm": 1.842394885898102e-05, + "learning_rate": 1.2354109587557422e-05, + "loss": 0.0227, + "step": 30530 + }, + { + "epoch": 2.3835167408101148, + "grad_norm": 2.835592427530287e-09, + "learning_rate": 1.2324243225606135e-05, + "loss": 0.453, + "step": 30540 + }, + { + "epoch": 2.3842971981581207, + "grad_norm": 0.2322986125946045, + "learning_rate": 1.2294407933209633e-05, + "loss": 0.0115, + "step": 30550 + }, + { + "epoch": 2.3850776555061266, + "grad_norm": 5.22947977239438e-14, + "learning_rate": 1.2264603734971807e-05, + "loss": 0.0274, + "step": 30560 + }, + { + "epoch": 2.3858581128541325, + "grad_norm": 2.995477596097972e-15, + "learning_rate": 1.2234830655470853e-05, + "loss": 1.1651, + "step": 30570 + }, + { + "epoch": 2.3866385702021384, + "grad_norm": 0.00010099189967149869, + "learning_rate": 1.220508871925936e-05, + "loss": 0.0, + "step": 30580 + }, + { + "epoch": 2.3874190275501443, + "grad_norm": 309.7481384277344, + "learning_rate": 1.2175377950864181e-05, + "loss": 4.395, + "step": 30590 + }, + { + "epoch": 2.3881994848981503, + "grad_norm": 7.486784133018432e-23, + "learning_rate": 1.2145698374786501e-05, + "loss": 0.001, + "step": 30600 + }, + { + "epoch": 2.388979942246156, + "grad_norm": 0.02301146648824215, + "learning_rate": 1.2116050015501795e-05, + "loss": 0.0006, + "step": 30610 + }, + { + "epoch": 2.389760399594162, + "grad_norm": 4.1090452361944806e-16, + "learning_rate": 1.2086432897459737e-05, + "loss": 0.4331, + "step": 30620 + }, + { + "epoch": 2.390540856942168, + "grad_norm": 284.6919250488281, + "learning_rate": 1.2056847045084308e-05, + "loss": 1.8292, + "step": 30630 + }, + { + "epoch": 2.391321314290174, + "grad_norm": 0.37592825293540955, + "learning_rate": 1.2027292482773639e-05, + "loss": 0.0128, + "step": 30640 + }, + { + "epoch": 2.39210177163818, + "grad_norm": 0.01579117402434349, + "learning_rate": 1.1997769234900113e-05, + "loss": 0.8022, + "step": 30650 + }, + { + "epoch": 2.3928822289861857, + "grad_norm": 2.1695347951424145e-14, + "learning_rate": 1.1968277325810251e-05, + "loss": 0.0391, + "step": 30660 + }, + { + "epoch": 2.3936626863341917, + "grad_norm": 285.8024597167969, + "learning_rate": 1.1938816779824753e-05, + "loss": 0.6952, + "step": 30670 + }, + { + "epoch": 2.3944431436821976, + "grad_norm": 2.7052454948425293, + "learning_rate": 1.1909387621238439e-05, + "loss": 0.1416, + "step": 30680 + }, + { + "epoch": 2.395223601030204, + "grad_norm": 0.0008680449100211263, + "learning_rate": 1.187998987432028e-05, + "loss": 1.172, + "step": 30690 + }, + { + "epoch": 2.3960040583782094, + "grad_norm": 1.2633448762694607e-15, + "learning_rate": 1.1850623563313296e-05, + "loss": 0.0001, + "step": 30700 + }, + { + "epoch": 2.3967845157262158, + "grad_norm": 1.0665467467731114e-17, + "learning_rate": 1.1821288712434591e-05, + "loss": 0.1835, + "step": 30710 + }, + { + "epoch": 2.3975649730742217, + "grad_norm": 0.00472455145791173, + "learning_rate": 1.1791985345875373e-05, + "loss": 0.0006, + "step": 30720 + }, + { + "epoch": 2.3983454304222276, + "grad_norm": 2.081298653048929e-15, + "learning_rate": 1.1762713487800813e-05, + "loss": 0.1604, + "step": 30730 + }, + { + "epoch": 2.3991258877702335, + "grad_norm": 1.492285264248494e-06, + "learning_rate": 1.1733473162350156e-05, + "loss": 2.225, + "step": 30740 + }, + { + "epoch": 2.3999063451182394, + "grad_norm": 1.9213453583688533e-07, + "learning_rate": 1.1704264393636621e-05, + "loss": 0.3886, + "step": 30750 + }, + { + "epoch": 2.4006868024662453, + "grad_norm": 2.1827824920814254e-13, + "learning_rate": 1.1675087205747426e-05, + "loss": 0.0, + "step": 30760 + }, + { + "epoch": 2.4014672598142512, + "grad_norm": 0.2885519862174988, + "learning_rate": 1.1645941622743695e-05, + "loss": 0.1745, + "step": 30770 + }, + { + "epoch": 2.402247717162257, + "grad_norm": 0.001634718501009047, + "learning_rate": 1.1616827668660545e-05, + "loss": 0.0, + "step": 30780 + }, + { + "epoch": 2.403028174510263, + "grad_norm": 9.618447692437133e-16, + "learning_rate": 1.158774536750698e-05, + "loss": 0.2085, + "step": 30790 + }, + { + "epoch": 2.403808631858269, + "grad_norm": 1.8258782500845587e-16, + "learning_rate": 1.1558694743265885e-05, + "loss": 0.0008, + "step": 30800 + }, + { + "epoch": 2.404589089206275, + "grad_norm": 1.2440332852747815e-07, + "learning_rate": 1.1529675819894075e-05, + "loss": 0.489, + "step": 30810 + }, + { + "epoch": 2.405369546554281, + "grad_norm": 5.6702891015447676e-05, + "learning_rate": 1.1500688621322147e-05, + "loss": 0.5853, + "step": 30820 + }, + { + "epoch": 2.4061500039022867, + "grad_norm": 0.010418553836643696, + "learning_rate": 1.147173317145464e-05, + "loss": 1.2328, + "step": 30830 + }, + { + "epoch": 2.4069304612502926, + "grad_norm": 0.0015043625608086586, + "learning_rate": 1.1442809494169804e-05, + "loss": 0.5443, + "step": 30840 + }, + { + "epoch": 2.4077109185982986, + "grad_norm": 0.38860613107681274, + "learning_rate": 1.1413917613319775e-05, + "loss": 1.2227, + "step": 30850 + }, + { + "epoch": 2.4084913759463045, + "grad_norm": 3.5585859251656774e-21, + "learning_rate": 1.1385057552730382e-05, + "loss": 0.1658, + "step": 30860 + }, + { + "epoch": 2.4092718332943104, + "grad_norm": 4.808939912948063e-10, + "learning_rate": 1.1356229336201308e-05, + "loss": 0.1557, + "step": 30870 + }, + { + "epoch": 2.4100522906423163, + "grad_norm": 7.308386784643517e-09, + "learning_rate": 1.132743298750588e-05, + "loss": 1.6504, + "step": 30880 + }, + { + "epoch": 2.410832747990322, + "grad_norm": 6.105677075041172e-10, + "learning_rate": 1.1298668530391232e-05, + "loss": 0.0, + "step": 30890 + }, + { + "epoch": 2.411613205338328, + "grad_norm": 1.2822355677233332e-10, + "learning_rate": 1.1269935988578129e-05, + "loss": 0.0142, + "step": 30900 + }, + { + "epoch": 2.412393662686334, + "grad_norm": 1.0691497600440192e-12, + "learning_rate": 1.1241235385761057e-05, + "loss": 0.2239, + "step": 30910 + }, + { + "epoch": 2.41317412003434, + "grad_norm": 5.364148591979756e-07, + "learning_rate": 1.121256674560816e-05, + "loss": 0.1139, + "step": 30920 + }, + { + "epoch": 2.413954577382346, + "grad_norm": 284.3153381347656, + "learning_rate": 1.1183930091761202e-05, + "loss": 0.4734, + "step": 30930 + }, + { + "epoch": 2.4147350347303522, + "grad_norm": 1.574044894425697e-08, + "learning_rate": 1.1155325447835608e-05, + "loss": 1.6763, + "step": 30940 + }, + { + "epoch": 2.4155154920783577, + "grad_norm": 1.6060916174484063e-19, + "learning_rate": 1.1126752837420345e-05, + "loss": 0.9619, + "step": 30950 + }, + { + "epoch": 2.416295949426364, + "grad_norm": 9.772094927029684e-05, + "learning_rate": 1.1098212284078036e-05, + "loss": 0.0005, + "step": 30960 + }, + { + "epoch": 2.41707640677437, + "grad_norm": 0.005812688730657101, + "learning_rate": 1.10697038113448e-05, + "loss": 1.1127, + "step": 30970 + }, + { + "epoch": 2.417856864122376, + "grad_norm": 0.0004505214747041464, + "learning_rate": 1.1041227442730344e-05, + "loss": 1.5875, + "step": 30980 + }, + { + "epoch": 2.418637321470382, + "grad_norm": 7.922157863049506e-08, + "learning_rate": 1.1012783201717907e-05, + "loss": 0.0011, + "step": 30990 + }, + { + "epoch": 2.4194177788183877, + "grad_norm": 5.424059779102208e-08, + "learning_rate": 1.0984371111764185e-05, + "loss": 0.0, + "step": 31000 + }, + { + "epoch": 2.4201982361663936, + "grad_norm": 335.3708801269531, + "learning_rate": 1.095599119629942e-05, + "loss": 0.4164, + "step": 31010 + }, + { + "epoch": 2.4209786935143995, + "grad_norm": 0.00026503499248065054, + "learning_rate": 1.0927643478727256e-05, + "loss": 0.3014, + "step": 31020 + }, + { + "epoch": 2.4217591508624055, + "grad_norm": 4.594489665023932e-14, + "learning_rate": 1.0899327982424862e-05, + "loss": 0.0228, + "step": 31030 + }, + { + "epoch": 2.4225396082104114, + "grad_norm": 0.009176124818623066, + "learning_rate": 1.0871044730742752e-05, + "loss": 0.9399, + "step": 31040 + }, + { + "epoch": 2.4233200655584173, + "grad_norm": 248.33827209472656, + "learning_rate": 1.084279374700492e-05, + "loss": 1.7967, + "step": 31050 + }, + { + "epoch": 2.424100522906423, + "grad_norm": 233.01034545898438, + "learning_rate": 1.0814575054508703e-05, + "loss": 1.3471, + "step": 31060 + }, + { + "epoch": 2.424880980254429, + "grad_norm": 5.455961945699528e-05, + "learning_rate": 1.0786388676524855e-05, + "loss": 0.043, + "step": 31070 + }, + { + "epoch": 2.425661437602435, + "grad_norm": 266.8015441894531, + "learning_rate": 1.0758234636297438e-05, + "loss": 1.7403, + "step": 31080 + }, + { + "epoch": 2.426441894950441, + "grad_norm": 2.230910922662588e-08, + "learning_rate": 1.0730112957043842e-05, + "loss": 1.6638, + "step": 31090 + }, + { + "epoch": 2.427222352298447, + "grad_norm": 5.843081574141706e-09, + "learning_rate": 1.0702023661954825e-05, + "loss": 0.4814, + "step": 31100 + }, + { + "epoch": 2.4280028096464528, + "grad_norm": 43.01310348510742, + "learning_rate": 1.0673966774194367e-05, + "loss": 0.0597, + "step": 31110 + }, + { + "epoch": 2.4287832669944587, + "grad_norm": 3.3520866082881184e-08, + "learning_rate": 1.0645942316899799e-05, + "loss": 0.4181, + "step": 31120 + }, + { + "epoch": 2.4295637243424646, + "grad_norm": 2.7300829970045015e-06, + "learning_rate": 1.061795031318164e-05, + "loss": 0.0, + "step": 31130 + }, + { + "epoch": 2.4303441816904705, + "grad_norm": 1.2723034160444513e-05, + "learning_rate": 1.0589990786123682e-05, + "loss": 0.2359, + "step": 31140 + }, + { + "epoch": 2.4311246390384764, + "grad_norm": 0.5951895117759705, + "learning_rate": 1.0562063758782941e-05, + "loss": 0.0005, + "step": 31150 + }, + { + "epoch": 2.4319050963864823, + "grad_norm": 1.0225551960729717e-07, + "learning_rate": 1.0534169254189619e-05, + "loss": 0.0964, + "step": 31160 + }, + { + "epoch": 2.4326855537344882, + "grad_norm": 4.678537830586638e-13, + "learning_rate": 1.0506307295347084e-05, + "loss": 0.0045, + "step": 31170 + }, + { + "epoch": 2.433466011082494, + "grad_norm": 1.8254380229620892e-09, + "learning_rate": 1.0478477905231898e-05, + "loss": 0.023, + "step": 31180 + }, + { + "epoch": 2.4342464684305005, + "grad_norm": 7.81493781687459e-06, + "learning_rate": 1.0450681106793736e-05, + "loss": 0.3291, + "step": 31190 + }, + { + "epoch": 2.435026925778506, + "grad_norm": 0.00012367757153697312, + "learning_rate": 1.0422916922955389e-05, + "loss": 0.0085, + "step": 31200 + }, + { + "epoch": 2.4358073831265123, + "grad_norm": 8.182890930523303e-19, + "learning_rate": 1.0395185376612792e-05, + "loss": 0.0001, + "step": 31210 + }, + { + "epoch": 2.436587840474518, + "grad_norm": 1.1366774882448638e-10, + "learning_rate": 1.0367486490634931e-05, + "loss": 0.0005, + "step": 31220 + }, + { + "epoch": 2.437368297822524, + "grad_norm": 1.5081602679155293e-15, + "learning_rate": 1.0339820287863893e-05, + "loss": 0.0181, + "step": 31230 + }, + { + "epoch": 2.43814875517053, + "grad_norm": 0.0, + "learning_rate": 1.0312186791114759e-05, + "loss": 0.0005, + "step": 31240 + }, + { + "epoch": 2.438929212518536, + "grad_norm": 9.464818140259013e-05, + "learning_rate": 1.0284586023175702e-05, + "loss": 0.0002, + "step": 31250 + }, + { + "epoch": 2.439709669866542, + "grad_norm": 1.1910776720469585e-06, + "learning_rate": 1.0257018006807834e-05, + "loss": 0.0178, + "step": 31260 + }, + { + "epoch": 2.440490127214548, + "grad_norm": 249.18057250976562, + "learning_rate": 1.0229482764745335e-05, + "loss": 1.9, + "step": 31270 + }, + { + "epoch": 2.4412705845625537, + "grad_norm": 406.0775146484375, + "learning_rate": 1.0201980319695281e-05, + "loss": 1.9826, + "step": 31280 + }, + { + "epoch": 2.4420510419105597, + "grad_norm": 1.7052905633673779e-15, + "learning_rate": 1.0174510694337769e-05, + "loss": 0.0, + "step": 31290 + }, + { + "epoch": 2.4428314992585656, + "grad_norm": 4.269708063349512e-16, + "learning_rate": 1.01470739113258e-05, + "loss": 0.0, + "step": 31300 + }, + { + "epoch": 2.4436119566065715, + "grad_norm": 11.617895126342773, + "learning_rate": 1.0119669993285275e-05, + "loss": 0.0043, + "step": 31310 + }, + { + "epoch": 2.4443924139545774, + "grad_norm": 0.05410260707139969, + "learning_rate": 1.0092298962815039e-05, + "loss": 0.4511, + "step": 31320 + }, + { + "epoch": 2.4451728713025833, + "grad_norm": 1.7712960243225098, + "learning_rate": 1.0064960842486754e-05, + "loss": 0.0008, + "step": 31330 + }, + { + "epoch": 2.4459533286505892, + "grad_norm": 0.020819485187530518, + "learning_rate": 1.0037655654845008e-05, + "loss": 0.0003, + "step": 31340 + }, + { + "epoch": 2.446733785998595, + "grad_norm": 185.15899658203125, + "learning_rate": 1.001038342240717e-05, + "loss": 0.6657, + "step": 31350 + }, + { + "epoch": 2.447514243346601, + "grad_norm": 0.0012188347754999995, + "learning_rate": 9.983144167663483e-06, + "loss": 0.0004, + "step": 31360 + }, + { + "epoch": 2.448294700694607, + "grad_norm": 4.041361535200849e-06, + "learning_rate": 9.955937913076962e-06, + "loss": 0.0002, + "step": 31370 + }, + { + "epoch": 2.449075158042613, + "grad_norm": 3.6336111897128376e-10, + "learning_rate": 9.928764681083442e-06, + "loss": 0.0032, + "step": 31380 + }, + { + "epoch": 2.449855615390619, + "grad_norm": 0.0001976362691493705, + "learning_rate": 9.901624494091483e-06, + "loss": 0.3962, + "step": 31390 + }, + { + "epoch": 2.4506360727386247, + "grad_norm": 1.16826257001712e-10, + "learning_rate": 9.874517374482405e-06, + "loss": 0.0, + "step": 31400 + }, + { + "epoch": 2.4514165300866306, + "grad_norm": 0.0007720371941104531, + "learning_rate": 9.847443344610297e-06, + "loss": 1.6812, + "step": 31410 + }, + { + "epoch": 2.4521969874346365, + "grad_norm": 176.58828735351562, + "learning_rate": 9.8204024268019e-06, + "loss": 0.9513, + "step": 31420 + }, + { + "epoch": 2.4529774447826425, + "grad_norm": 8.960714693745186e-22, + "learning_rate": 9.793394643356706e-06, + "loss": 0.1805, + "step": 31430 + }, + { + "epoch": 2.4537579021306484, + "grad_norm": 3.0730760158803605e-08, + "learning_rate": 9.76642001654684e-06, + "loss": 0.1001, + "step": 31440 + }, + { + "epoch": 2.4545383594786543, + "grad_norm": 4.813366300551783e-17, + "learning_rate": 9.739478568617106e-06, + "loss": 0.0689, + "step": 31450 + }, + { + "epoch": 2.4553188168266606, + "grad_norm": 0.0002698384050745517, + "learning_rate": 9.712570321784947e-06, + "loss": 0.0106, + "step": 31460 + }, + { + "epoch": 2.456099274174666, + "grad_norm": 2.8609814961555458e-08, + "learning_rate": 9.685695298240432e-06, + "loss": 0.0, + "step": 31470 + }, + { + "epoch": 2.4568797315226725, + "grad_norm": 3.8634730969988595e-08, + "learning_rate": 9.658853520146205e-06, + "loss": 0.0678, + "step": 31480 + }, + { + "epoch": 2.4576601888706784, + "grad_norm": 3.986447438819596e-07, + "learning_rate": 9.632045009637502e-06, + "loss": 2.0, + "step": 31490 + }, + { + "epoch": 2.4584406462186843, + "grad_norm": 7.760418156976812e-06, + "learning_rate": 9.605269788822163e-06, + "loss": 1.8596, + "step": 31500 + }, + { + "epoch": 2.45922110356669, + "grad_norm": 2.0599917605057527e-11, + "learning_rate": 9.578527879780514e-06, + "loss": 0.0004, + "step": 31510 + }, + { + "epoch": 2.460001560914696, + "grad_norm": 1.1984980031792738e-14, + "learning_rate": 9.551819304565457e-06, + "loss": 0.838, + "step": 31520 + }, + { + "epoch": 2.460782018262702, + "grad_norm": 5.388801946537569e-05, + "learning_rate": 9.525144085202387e-06, + "loss": 0.2957, + "step": 31530 + }, + { + "epoch": 2.461562475610708, + "grad_norm": 3.7882412584622216e-07, + "learning_rate": 9.498502243689217e-06, + "loss": 0.0093, + "step": 31540 + }, + { + "epoch": 2.462342932958714, + "grad_norm": 5.491552443901218e-14, + "learning_rate": 9.471893801996274e-06, + "loss": 0.3542, + "step": 31550 + }, + { + "epoch": 2.46312339030672, + "grad_norm": 4.296512543078279e-06, + "learning_rate": 9.445318782066415e-06, + "loss": 0.0958, + "step": 31560 + }, + { + "epoch": 2.4639038476547257, + "grad_norm": 1.0587911840678754e-22, + "learning_rate": 9.418777205814877e-06, + "loss": 1.1193, + "step": 31570 + }, + { + "epoch": 2.4646843050027316, + "grad_norm": 3.066175979782315e-09, + "learning_rate": 9.392269095129357e-06, + "loss": 0.2969, + "step": 31580 + }, + { + "epoch": 2.4654647623507375, + "grad_norm": 0.6269698143005371, + "learning_rate": 9.365794471869922e-06, + "loss": 0.0051, + "step": 31590 + }, + { + "epoch": 2.4662452196987434, + "grad_norm": 0.03895184397697449, + "learning_rate": 9.339353357869052e-06, + "loss": 0.9741, + "step": 31600 + }, + { + "epoch": 2.4670256770467494, + "grad_norm": 0.0293387733399868, + "learning_rate": 9.312945774931587e-06, + "loss": 0.0065, + "step": 31610 + }, + { + "epoch": 2.4678061343947553, + "grad_norm": 0.0004621434782166034, + "learning_rate": 9.286571744834693e-06, + "loss": 0.0901, + "step": 31620 + }, + { + "epoch": 2.468586591742761, + "grad_norm": 2.129108174486749e-11, + "learning_rate": 9.260231289327908e-06, + "loss": 0.0, + "step": 31630 + }, + { + "epoch": 2.469367049090767, + "grad_norm": 0.00019757759582716972, + "learning_rate": 9.233924430133023e-06, + "loss": 0.0, + "step": 31640 + }, + { + "epoch": 2.470147506438773, + "grad_norm": 0.03482899069786072, + "learning_rate": 9.207651188944195e-06, + "loss": 0.0107, + "step": 31650 + }, + { + "epoch": 2.470927963786779, + "grad_norm": 1.669070548437901e-09, + "learning_rate": 9.181411587427791e-06, + "loss": 0.0, + "step": 31660 + }, + { + "epoch": 2.471708421134785, + "grad_norm": 0.3998055160045624, + "learning_rate": 9.155205647222482e-06, + "loss": 5.0276, + "step": 31670 + }, + { + "epoch": 2.4724888784827908, + "grad_norm": 0.0005485009751282632, + "learning_rate": 9.129033389939168e-06, + "loss": 0.054, + "step": 31680 + }, + { + "epoch": 2.4732693358307967, + "grad_norm": 0.23071005940437317, + "learning_rate": 9.102894837160959e-06, + "loss": 0.0005, + "step": 31690 + }, + { + "epoch": 2.4740497931788026, + "grad_norm": 2.5303051687448497e-18, + "learning_rate": 9.076790010443193e-06, + "loss": 0.925, + "step": 31700 + }, + { + "epoch": 2.474830250526809, + "grad_norm": 2.3178323149686264e-10, + "learning_rate": 9.050718931313363e-06, + "loss": 0.0, + "step": 31710 + }, + { + "epoch": 2.4756107078748144, + "grad_norm": 184.19573974609375, + "learning_rate": 9.024681621271175e-06, + "loss": 0.7901, + "step": 31720 + }, + { + "epoch": 2.4763911652228208, + "grad_norm": 7.937559030196661e-13, + "learning_rate": 8.998678101788443e-06, + "loss": 1.1143, + "step": 31730 + }, + { + "epoch": 2.4771716225708267, + "grad_norm": 9.148439860251758e-21, + "learning_rate": 8.97270839430916e-06, + "loss": 0.787, + "step": 31740 + }, + { + "epoch": 2.4779520799188326, + "grad_norm": 5.923636147570077e-12, + "learning_rate": 8.946772520249385e-06, + "loss": 0.1406, + "step": 31750 + }, + { + "epoch": 2.4787325372668385, + "grad_norm": 228.17822265625, + "learning_rate": 8.92087050099732e-06, + "loss": 0.5815, + "step": 31760 + }, + { + "epoch": 2.4795129946148444, + "grad_norm": 6.485000118286108e-14, + "learning_rate": 8.895002357913234e-06, + "loss": 0.011, + "step": 31770 + }, + { + "epoch": 2.4802934519628503, + "grad_norm": 11.712849617004395, + "learning_rate": 8.869168112329441e-06, + "loss": 0.2275, + "step": 31780 + }, + { + "epoch": 2.4810739093108563, + "grad_norm": 0.0017456557834520936, + "learning_rate": 8.843367785550345e-06, + "loss": 0.0016, + "step": 31790 + }, + { + "epoch": 2.481854366658862, + "grad_norm": 9.026371117215604e-06, + "learning_rate": 8.817601398852315e-06, + "loss": 0.0231, + "step": 31800 + }, + { + "epoch": 2.482634824006868, + "grad_norm": 1.0750035528417357e-08, + "learning_rate": 8.791868973483803e-06, + "loss": 0.0095, + "step": 31810 + }, + { + "epoch": 2.483415281354874, + "grad_norm": 0.197080597281456, + "learning_rate": 8.766170530665185e-06, + "loss": 0.0644, + "step": 31820 + }, + { + "epoch": 2.48419573870288, + "grad_norm": 0.05906852334737778, + "learning_rate": 8.740506091588868e-06, + "loss": 0.0, + "step": 31830 + }, + { + "epoch": 2.484976196050886, + "grad_norm": 6.384251491908799e-07, + "learning_rate": 8.714875677419192e-06, + "loss": 0.1455, + "step": 31840 + }, + { + "epoch": 2.4857566533988917, + "grad_norm": 6.302929966145454e-12, + "learning_rate": 8.689279309292459e-06, + "loss": 1.0475, + "step": 31850 + }, + { + "epoch": 2.4865371107468976, + "grad_norm": 3.5742025135193104e-20, + "learning_rate": 8.663717008316847e-06, + "loss": 0.6127, + "step": 31860 + }, + { + "epoch": 2.4873175680949036, + "grad_norm": 5.618056475760469e-13, + "learning_rate": 8.6381887955725e-06, + "loss": 0.0023, + "step": 31870 + }, + { + "epoch": 2.4880980254429095, + "grad_norm": 2.1841951536316628e-07, + "learning_rate": 8.612694692111412e-06, + "loss": 2.6341, + "step": 31880 + }, + { + "epoch": 2.4888784827909154, + "grad_norm": 2.8184949929510594e-09, + "learning_rate": 8.587234718957443e-06, + "loss": 0.6065, + "step": 31890 + }, + { + "epoch": 2.4896589401389213, + "grad_norm": 8.832481384277344, + "learning_rate": 8.561808897106339e-06, + "loss": 0.0034, + "step": 31900 + }, + { + "epoch": 2.490439397486927, + "grad_norm": 3.9514170566690154e-06, + "learning_rate": 8.536417247525663e-06, + "loss": 0.6379, + "step": 31910 + }, + { + "epoch": 2.491219854834933, + "grad_norm": 3.576242191272172e-09, + "learning_rate": 8.511059791154819e-06, + "loss": 0.0021, + "step": 31920 + }, + { + "epoch": 2.492000312182939, + "grad_norm": 4.6747011595016374e-08, + "learning_rate": 8.485736548904966e-06, + "loss": 0.2805, + "step": 31930 + }, + { + "epoch": 2.492780769530945, + "grad_norm": 7.914739398984239e-06, + "learning_rate": 8.460447541659111e-06, + "loss": 0.1509, + "step": 31940 + }, + { + "epoch": 2.493561226878951, + "grad_norm": 7.899691978656152e-17, + "learning_rate": 8.435192790271967e-06, + "loss": 0.5303, + "step": 31950 + }, + { + "epoch": 2.4943416842269572, + "grad_norm": 10.499479293823242, + "learning_rate": 8.409972315570047e-06, + "loss": 0.0055, + "step": 31960 + }, + { + "epoch": 2.4951221415749627, + "grad_norm": 9.64181890594773e-06, + "learning_rate": 8.384786138351563e-06, + "loss": 0.1455, + "step": 31970 + }, + { + "epoch": 2.495902598922969, + "grad_norm": 8.813726992684678e-08, + "learning_rate": 8.359634279386464e-06, + "loss": 0.0, + "step": 31980 + }, + { + "epoch": 2.496683056270975, + "grad_norm": 1.7394331530695695e-09, + "learning_rate": 8.334516759416405e-06, + "loss": 0.852, + "step": 31990 + }, + { + "epoch": 2.497463513618981, + "grad_norm": 0.025341708213090897, + "learning_rate": 8.309433599154682e-06, + "loss": 1.4413, + "step": 32000 + }, + { + "epoch": 2.498243970966987, + "grad_norm": 2.502560505490692e-07, + "learning_rate": 8.284384819286317e-06, + "loss": 0.0, + "step": 32010 + }, + { + "epoch": 2.4990244283149927, + "grad_norm": 278.2901306152344, + "learning_rate": 8.259370440467918e-06, + "loss": 0.8205, + "step": 32020 + }, + { + "epoch": 2.4998048856629986, + "grad_norm": 5.790270662942021e-08, + "learning_rate": 8.234390483327775e-06, + "loss": 0.9642, + "step": 32030 + }, + { + "epoch": 2.5005853430110045, + "grad_norm": 3.1117013410408845e-09, + "learning_rate": 8.209444968465745e-06, + "loss": 0.8302, + "step": 32040 + }, + { + "epoch": 2.5013658003590105, + "grad_norm": 1.0056027191507333e-12, + "learning_rate": 8.184533916453341e-06, + "loss": 1.7611, + "step": 32050 + }, + { + "epoch": 2.5021462577070164, + "grad_norm": 1.1364232705091126e-05, + "learning_rate": 8.159657347833588e-06, + "loss": 0.0, + "step": 32060 + }, + { + "epoch": 2.5029267150550223, + "grad_norm": 0.0008929189061746001, + "learning_rate": 8.134815283121128e-06, + "loss": 0.0043, + "step": 32070 + }, + { + "epoch": 2.503707172403028, + "grad_norm": 1.3651838508879536e-21, + "learning_rate": 8.110007742802134e-06, + "loss": 0.009, + "step": 32080 + }, + { + "epoch": 2.504487629751034, + "grad_norm": 11.406990051269531, + "learning_rate": 8.08523474733428e-06, + "loss": 0.09, + "step": 32090 + }, + { + "epoch": 2.50526808709904, + "grad_norm": 25.157150268554688, + "learning_rate": 8.060496317146809e-06, + "loss": 0.0139, + "step": 32100 + }, + { + "epoch": 2.506048544447046, + "grad_norm": 1.541529672977049e-05, + "learning_rate": 8.035792472640392e-06, + "loss": 0.9274, + "step": 32110 + }, + { + "epoch": 2.506829001795052, + "grad_norm": 3.0823123324807966e-06, + "learning_rate": 8.01112323418724e-06, + "loss": 0.0001, + "step": 32120 + }, + { + "epoch": 2.5076094591430578, + "grad_norm": 0.08010544627904892, + "learning_rate": 7.986488622130973e-06, + "loss": 0.0092, + "step": 32130 + }, + { + "epoch": 2.5083899164910637, + "grad_norm": 1.5729060010016838e-07, + "learning_rate": 7.961888656786697e-06, + "loss": 0.5046, + "step": 32140 + }, + { + "epoch": 2.5091703738390696, + "grad_norm": 4.572706836356748e-16, + "learning_rate": 7.937323358440935e-06, + "loss": 0.2813, + "step": 32150 + }, + { + "epoch": 2.5099508311870755, + "grad_norm": 9.056230293675097e-14, + "learning_rate": 7.912792747351628e-06, + "loss": 0.4358, + "step": 32160 + }, + { + "epoch": 2.5107312885350814, + "grad_norm": 2.8794074763416333e-11, + "learning_rate": 7.888296843748083e-06, + "loss": 0.0975, + "step": 32170 + }, + { + "epoch": 2.5115117458830873, + "grad_norm": 2.362916262654835e-08, + "learning_rate": 7.863835667831e-06, + "loss": 0.26, + "step": 32180 + }, + { + "epoch": 2.5122922032310933, + "grad_norm": 3.7175784399514656e-19, + "learning_rate": 7.83940923977246e-06, + "loss": 0.5833, + "step": 32190 + }, + { + "epoch": 2.513072660579099, + "grad_norm": 278.1390075683594, + "learning_rate": 7.815017579715861e-06, + "loss": 0.2599, + "step": 32200 + }, + { + "epoch": 2.5138531179271055, + "grad_norm": 1.0799265792967944e-18, + "learning_rate": 7.790660707775949e-06, + "loss": 1.35, + "step": 32210 + }, + { + "epoch": 2.514633575275111, + "grad_norm": 0.2948494851589203, + "learning_rate": 7.766338644038773e-06, + "loss": 0.2754, + "step": 32220 + }, + { + "epoch": 2.5154140326231174, + "grad_norm": 0.0003002735029440373, + "learning_rate": 7.742051408561696e-06, + "loss": 0.1201, + "step": 32230 + }, + { + "epoch": 2.516194489971123, + "grad_norm": 4.7347893996629864e-05, + "learning_rate": 7.717799021373313e-06, + "loss": 0.0146, + "step": 32240 + }, + { + "epoch": 2.516974947319129, + "grad_norm": 21.485971450805664, + "learning_rate": 7.693581502473541e-06, + "loss": 0.0367, + "step": 32250 + }, + { + "epoch": 2.517755404667135, + "grad_norm": 252.40081787109375, + "learning_rate": 7.669398871833494e-06, + "loss": 1.1277, + "step": 32260 + }, + { + "epoch": 2.518535862015141, + "grad_norm": 38.04701614379883, + "learning_rate": 7.645251149395521e-06, + "loss": 0.3047, + "step": 32270 + }, + { + "epoch": 2.519316319363147, + "grad_norm": 0.007141559850424528, + "learning_rate": 7.621138355073232e-06, + "loss": 0.8679, + "step": 32280 + }, + { + "epoch": 2.520096776711153, + "grad_norm": 1.3814762667152536e-07, + "learning_rate": 7.597060508751347e-06, + "loss": 0.0007, + "step": 32290 + }, + { + "epoch": 2.5208772340591588, + "grad_norm": 0.06472454220056534, + "learning_rate": 7.573017630285873e-06, + "loss": 0.0, + "step": 32300 + }, + { + "epoch": 2.5216576914071647, + "grad_norm": 0.00029342470224946737, + "learning_rate": 7.549009739503887e-06, + "loss": 0.0267, + "step": 32310 + }, + { + "epoch": 2.5224381487551706, + "grad_norm": 3.442346496740356e-07, + "learning_rate": 7.5250368562036765e-06, + "loss": 1.2629, + "step": 32320 + }, + { + "epoch": 2.5232186061031765, + "grad_norm": 2.6451106760374665e-15, + "learning_rate": 7.5010990001546045e-06, + "loss": 0.7777, + "step": 32330 + }, + { + "epoch": 2.5239990634511824, + "grad_norm": 0.0022019429598003626, + "learning_rate": 7.477196191097208e-06, + "loss": 0.1655, + "step": 32340 + }, + { + "epoch": 2.5247795207991883, + "grad_norm": 0.019295230507850647, + "learning_rate": 7.453328448743058e-06, + "loss": 0.9781, + "step": 32350 + }, + { + "epoch": 2.5255599781471942, + "grad_norm": 179.39305114746094, + "learning_rate": 7.4294957927748765e-06, + "loss": 0.366, + "step": 32360 + }, + { + "epoch": 2.5263404354952, + "grad_norm": 2.07561828835523e-10, + "learning_rate": 7.405698242846387e-06, + "loss": 0.0592, + "step": 32370 + }, + { + "epoch": 2.527120892843206, + "grad_norm": 1.138383150100708, + "learning_rate": 7.381935818582403e-06, + "loss": 0.1496, + "step": 32380 + }, + { + "epoch": 2.527901350191212, + "grad_norm": 1.7265396223820062e-08, + "learning_rate": 7.358208539578771e-06, + "loss": 0.0397, + "step": 32390 + }, + { + "epoch": 2.528681807539218, + "grad_norm": 1.3316818581188272e-07, + "learning_rate": 7.334516425402327e-06, + "loss": 0.1137, + "step": 32400 + }, + { + "epoch": 2.529462264887224, + "grad_norm": 63.99897766113281, + "learning_rate": 7.3108594955909434e-06, + "loss": 0.0305, + "step": 32410 + }, + { + "epoch": 2.5302427222352297, + "grad_norm": 2.53629514190834e-05, + "learning_rate": 7.287237769653438e-06, + "loss": 0.0018, + "step": 32420 + }, + { + "epoch": 2.5310231795832356, + "grad_norm": 0.06212920323014259, + "learning_rate": 7.263651267069643e-06, + "loss": 0.005, + "step": 32430 + }, + { + "epoch": 2.5318036369312416, + "grad_norm": 7.825429548802276e-08, + "learning_rate": 7.24010000729029e-06, + "loss": 0.6747, + "step": 32440 + }, + { + "epoch": 2.5325840942792475, + "grad_norm": 43.8482666015625, + "learning_rate": 7.216584009737093e-06, + "loss": 0.1808, + "step": 32450 + }, + { + "epoch": 2.533364551627254, + "grad_norm": 2.1205712741334537e-08, + "learning_rate": 7.193103293802683e-06, + "loss": 0.3964, + "step": 32460 + }, + { + "epoch": 2.5341450089752593, + "grad_norm": 0.0007450974080711603, + "learning_rate": 7.169657878850561e-06, + "loss": 0.0, + "step": 32470 + }, + { + "epoch": 2.5349254663232657, + "grad_norm": 0.0016309081111103296, + "learning_rate": 7.146247784215154e-06, + "loss": 0.016, + "step": 32480 + }, + { + "epoch": 2.535705923671271, + "grad_norm": 0.005112816579639912, + "learning_rate": 7.122873029201732e-06, + "loss": 0.0002, + "step": 32490 + }, + { + "epoch": 2.5364863810192775, + "grad_norm": 8.899023669073358e-07, + "learning_rate": 7.099533633086458e-06, + "loss": 0.0007, + "step": 32500 + }, + { + "epoch": 2.537266838367283, + "grad_norm": 15.795042037963867, + "learning_rate": 7.076229615116292e-06, + "loss": 0.4537, + "step": 32510 + }, + { + "epoch": 2.5380472957152893, + "grad_norm": 0.6787014603614807, + "learning_rate": 7.052960994509056e-06, + "loss": 1.7144, + "step": 32520 + }, + { + "epoch": 2.5388277530632952, + "grad_norm": 1.6081843057236256e-07, + "learning_rate": 7.029727790453356e-06, + "loss": 1.2472, + "step": 32530 + }, + { + "epoch": 2.539608210411301, + "grad_norm": 0.22756808996200562, + "learning_rate": 7.0065300221086316e-06, + "loss": 0.329, + "step": 32540 + }, + { + "epoch": 2.540388667759307, + "grad_norm": 0.0002664399507921189, + "learning_rate": 6.983367708605032e-06, + "loss": 0.6846, + "step": 32550 + }, + { + "epoch": 2.541169125107313, + "grad_norm": 5.893477705853911e-08, + "learning_rate": 6.960240869043544e-06, + "loss": 0.4668, + "step": 32560 + }, + { + "epoch": 2.541949582455319, + "grad_norm": 1.531070239479959e-07, + "learning_rate": 6.9371495224958445e-06, + "loss": 0.1799, + "step": 32570 + }, + { + "epoch": 2.542730039803325, + "grad_norm": 2.5454155450077565e-10, + "learning_rate": 6.9140936880043525e-06, + "loss": 0.0, + "step": 32580 + }, + { + "epoch": 2.5435104971513307, + "grad_norm": 279.1732177734375, + "learning_rate": 6.891073384582231e-06, + "loss": 1.7055, + "step": 32590 + }, + { + "epoch": 2.5442909544993366, + "grad_norm": 2.79231595993042, + "learning_rate": 6.8680886312132985e-06, + "loss": 0.0008, + "step": 32600 + }, + { + "epoch": 2.5450714118473425, + "grad_norm": 8.424194675171748e-05, + "learning_rate": 6.845139446852095e-06, + "loss": 0.0344, + "step": 32610 + }, + { + "epoch": 2.5458518691953484, + "grad_norm": 1.6954788861767156e-06, + "learning_rate": 6.822225850423808e-06, + "loss": 1.2532, + "step": 32620 + }, + { + "epoch": 2.5466323265433544, + "grad_norm": 5.65844038646901e-06, + "learning_rate": 6.799347860824301e-06, + "loss": 1.1134, + "step": 32630 + }, + { + "epoch": 2.5474127838913603, + "grad_norm": 8.109286083347184e-14, + "learning_rate": 6.776505496920022e-06, + "loss": 0.0154, + "step": 32640 + }, + { + "epoch": 2.548193241239366, + "grad_norm": 243.0125732421875, + "learning_rate": 6.753698777548095e-06, + "loss": 0.5607, + "step": 32650 + }, + { + "epoch": 2.548973698587372, + "grad_norm": 5.170684858057939e-07, + "learning_rate": 6.730927721516228e-06, + "loss": 0.6427, + "step": 32660 + }, + { + "epoch": 2.549754155935378, + "grad_norm": 0.2602443993091583, + "learning_rate": 6.7081923476026985e-06, + "loss": 0.464, + "step": 32670 + }, + { + "epoch": 2.550534613283384, + "grad_norm": 3.0131985113257542e-05, + "learning_rate": 6.685492674556393e-06, + "loss": 0.0022, + "step": 32680 + }, + { + "epoch": 2.55131507063139, + "grad_norm": 4.961176469642226e-10, + "learning_rate": 6.6628287210967425e-06, + "loss": 0.2164, + "step": 32690 + }, + { + "epoch": 2.5520955279793958, + "grad_norm": 244.85946655273438, + "learning_rate": 6.640200505913735e-06, + "loss": 0.4519, + "step": 32700 + }, + { + "epoch": 2.552875985327402, + "grad_norm": 5.325290203472832e-06, + "learning_rate": 6.617608047667845e-06, + "loss": 0.0001, + "step": 32710 + }, + { + "epoch": 2.5536564426754076, + "grad_norm": 0.0005364320240914822, + "learning_rate": 6.595051364990113e-06, + "loss": 1.861, + "step": 32720 + }, + { + "epoch": 2.554436900023414, + "grad_norm": 0.003387443022802472, + "learning_rate": 6.5725304764820305e-06, + "loss": 0.6986, + "step": 32730 + }, + { + "epoch": 2.5552173573714194, + "grad_norm": 1.974759911149704e-08, + "learning_rate": 6.550045400715621e-06, + "loss": 0.0, + "step": 32740 + }, + { + "epoch": 2.5559978147194258, + "grad_norm": 0.0022972319275140762, + "learning_rate": 6.527596156233312e-06, + "loss": 0.0008, + "step": 32750 + }, + { + "epoch": 2.5567782720674312, + "grad_norm": 0.0009155732113867998, + "learning_rate": 6.50518276154804e-06, + "loss": 0.0948, + "step": 32760 + }, + { + "epoch": 2.5575587294154376, + "grad_norm": 1.2502309409967438e-08, + "learning_rate": 6.4828052351431634e-06, + "loss": 3.3757, + "step": 32770 + }, + { + "epoch": 2.5583391867634435, + "grad_norm": 0.009155502542853355, + "learning_rate": 6.460463595472427e-06, + "loss": 1.0454, + "step": 32780 + }, + { + "epoch": 2.5591196441114494, + "grad_norm": 2.0515193455139524e-07, + "learning_rate": 6.438157860960026e-06, + "loss": 0.0043, + "step": 32790 + }, + { + "epoch": 2.5599001014594553, + "grad_norm": 2.145350492810394e-07, + "learning_rate": 6.415888050000518e-06, + "loss": 0.0004, + "step": 32800 + }, + { + "epoch": 2.5606805588074613, + "grad_norm": 5.182963036531874e-07, + "learning_rate": 6.393654180958858e-06, + "loss": 2.2282, + "step": 32810 + }, + { + "epoch": 2.561461016155467, + "grad_norm": 0.0, + "learning_rate": 6.371456272170329e-06, + "loss": 0.2464, + "step": 32820 + }, + { + "epoch": 2.562241473503473, + "grad_norm": 2.9713309590562176e-09, + "learning_rate": 6.349294341940593e-06, + "loss": 0.0006, + "step": 32830 + }, + { + "epoch": 2.563021930851479, + "grad_norm": 9.95053581090248e-14, + "learning_rate": 6.32716840854562e-06, + "loss": 0.0, + "step": 32840 + }, + { + "epoch": 2.563802388199485, + "grad_norm": 1.3272781290348272e-16, + "learning_rate": 6.305078490231725e-06, + "loss": 2.0757, + "step": 32850 + }, + { + "epoch": 2.564582845547491, + "grad_norm": 1.2425418915995579e-08, + "learning_rate": 6.283024605215482e-06, + "loss": 0.0013, + "step": 32860 + }, + { + "epoch": 2.5653633028954967, + "grad_norm": 8.812655466483577e-12, + "learning_rate": 6.261006771683764e-06, + "loss": 0.0145, + "step": 32870 + }, + { + "epoch": 2.5661437602435027, + "grad_norm": 0.012916338630020618, + "learning_rate": 6.2390250077937305e-06, + "loss": 0.4857, + "step": 32880 + }, + { + "epoch": 2.5669242175915086, + "grad_norm": 106.12604522705078, + "learning_rate": 6.217079331672776e-06, + "loss": 0.0524, + "step": 32890 + }, + { + "epoch": 2.5677046749395145, + "grad_norm": 0.001981984591111541, + "learning_rate": 6.1951697614185566e-06, + "loss": 2.1622, + "step": 32900 + }, + { + "epoch": 2.5684851322875204, + "grad_norm": 3.305194388580207e-14, + "learning_rate": 6.17329631509892e-06, + "loss": 0.1398, + "step": 32910 + }, + { + "epoch": 2.5692655896355263, + "grad_norm": 0.002218644367530942, + "learning_rate": 6.151459010751953e-06, + "loss": 1.2993, + "step": 32920 + }, + { + "epoch": 2.5700460469835322, + "grad_norm": 0.00015026754408609122, + "learning_rate": 6.129657866385935e-06, + "loss": 0.0168, + "step": 32930 + }, + { + "epoch": 2.570826504331538, + "grad_norm": 0.0012799123069271445, + "learning_rate": 6.107892899979323e-06, + "loss": 0.9698, + "step": 32940 + }, + { + "epoch": 2.571606961679544, + "grad_norm": 1.7073870800261154e-13, + "learning_rate": 6.086164129480731e-06, + "loss": 0.5255, + "step": 32950 + }, + { + "epoch": 2.5723874190275504, + "grad_norm": 0.008531242609024048, + "learning_rate": 6.064471572808916e-06, + "loss": 0.0001, + "step": 32960 + }, + { + "epoch": 2.573167876375556, + "grad_norm": 4.434143647813471e-06, + "learning_rate": 6.042815247852812e-06, + "loss": 0.0191, + "step": 32970 + }, + { + "epoch": 2.5739483337235622, + "grad_norm": 2.1988906860351562, + "learning_rate": 6.021195172471411e-06, + "loss": 0.3835, + "step": 32980 + }, + { + "epoch": 2.5747287910715677, + "grad_norm": 1.283716052125892e-21, + "learning_rate": 5.999611364493868e-06, + "loss": 0.0012, + "step": 32990 + }, + { + "epoch": 2.575509248419574, + "grad_norm": 3.2552881240844727, + "learning_rate": 5.978063841719411e-06, + "loss": 0.0014, + "step": 33000 + }, + { + "epoch": 2.5762897057675795, + "grad_norm": 6.644073498703528e-17, + "learning_rate": 5.956552621917344e-06, + "loss": 0.6411, + "step": 33010 + }, + { + "epoch": 2.577070163115586, + "grad_norm": 3.554277181625366, + "learning_rate": 5.9350777228270205e-06, + "loss": 0.0032, + "step": 33020 + }, + { + "epoch": 2.577850620463592, + "grad_norm": 0.00010075556201627478, + "learning_rate": 5.913639162157869e-06, + "loss": 0.0, + "step": 33030 + }, + { + "epoch": 2.5786310778115977, + "grad_norm": 0.03173457086086273, + "learning_rate": 5.89223695758932e-06, + "loss": 0.0478, + "step": 33040 + }, + { + "epoch": 2.5794115351596036, + "grad_norm": 2.581108341382504e-13, + "learning_rate": 5.870871126770855e-06, + "loss": 0.9232, + "step": 33050 + }, + { + "epoch": 2.5801919925076096, + "grad_norm": 1.1653195315375342e-06, + "learning_rate": 5.84954168732193e-06, + "loss": 1.0711, + "step": 33060 + }, + { + "epoch": 2.5809724498556155, + "grad_norm": 6.896935658473965e-20, + "learning_rate": 5.828248656832003e-06, + "loss": 0.1499, + "step": 33070 + }, + { + "epoch": 2.5817529072036214, + "grad_norm": 1.7919608354568481, + "learning_rate": 5.806992052860533e-06, + "loss": 0.6013, + "step": 33080 + }, + { + "epoch": 2.5825333645516273, + "grad_norm": 0.0004548436845652759, + "learning_rate": 5.785771892936881e-06, + "loss": 0.4141, + "step": 33090 + }, + { + "epoch": 2.583313821899633, + "grad_norm": 0.0, + "learning_rate": 5.7645881945604165e-06, + "loss": 0.0, + "step": 33100 + }, + { + "epoch": 2.584094279247639, + "grad_norm": 0.13170254230499268, + "learning_rate": 5.743440975200393e-06, + "loss": 0.0024, + "step": 33110 + }, + { + "epoch": 2.584874736595645, + "grad_norm": 1.1884622721169101e-14, + "learning_rate": 5.7223302522960154e-06, + "loss": 0.7057, + "step": 33120 + }, + { + "epoch": 2.585655193943651, + "grad_norm": 1.8453439460091453e-13, + "learning_rate": 5.701256043256359e-06, + "loss": 0.0, + "step": 33130 + }, + { + "epoch": 2.586435651291657, + "grad_norm": 1.1516529049848678e-11, + "learning_rate": 5.680218365460416e-06, + "loss": 0.3896, + "step": 33140 + }, + { + "epoch": 2.587216108639663, + "grad_norm": 0.0056893001310527325, + "learning_rate": 5.659217236257059e-06, + "loss": 0.6519, + "step": 33150 + }, + { + "epoch": 2.5879965659876687, + "grad_norm": 1.4390453770829481e-06, + "learning_rate": 5.638252672964972e-06, + "loss": 0.042, + "step": 33160 + }, + { + "epoch": 2.5887770233356746, + "grad_norm": 1.7544034562888555e-05, + "learning_rate": 5.617324692872744e-06, + "loss": 0.0018, + "step": 33170 + }, + { + "epoch": 2.5895574806836805, + "grad_norm": 1.6007531881332397, + "learning_rate": 5.596433313238747e-06, + "loss": 0.5041, + "step": 33180 + }, + { + "epoch": 2.5903379380316864, + "grad_norm": 6.1260161388076995e-09, + "learning_rate": 5.575578551291211e-06, + "loss": 0.0, + "step": 33190 + }, + { + "epoch": 2.5911183953796924, + "grad_norm": 2.927170917011307e-18, + "learning_rate": 5.554760424228128e-06, + "loss": 0.0231, + "step": 33200 + }, + { + "epoch": 2.5918988527276983, + "grad_norm": 2.057391405105591, + "learning_rate": 5.533978949217322e-06, + "loss": 0.0007, + "step": 33210 + }, + { + "epoch": 2.592679310075704, + "grad_norm": 0.004864888731390238, + "learning_rate": 5.513234143396351e-06, + "loss": 0.4181, + "step": 33220 + }, + { + "epoch": 2.5934597674237105, + "grad_norm": 1.0975718760502379e-13, + "learning_rate": 5.492526023872563e-06, + "loss": 0.0695, + "step": 33230 + }, + { + "epoch": 2.594240224771716, + "grad_norm": 3.326022692817787e-07, + "learning_rate": 5.471854607723048e-06, + "loss": 0.0276, + "step": 33240 + }, + { + "epoch": 2.5950206821197224, + "grad_norm": 4.2327101823502744e-07, + "learning_rate": 5.451219911994604e-06, + "loss": 0.0015, + "step": 33250 + }, + { + "epoch": 2.595801139467728, + "grad_norm": 1.8893464584834874e-05, + "learning_rate": 5.430621953703785e-06, + "loss": 1.1529, + "step": 33260 + }, + { + "epoch": 2.596581596815734, + "grad_norm": 8.914184945751913e-06, + "learning_rate": 5.410060749836809e-06, + "loss": 0.118, + "step": 33270 + }, + { + "epoch": 2.59736205416374, + "grad_norm": 0.07775567471981049, + "learning_rate": 5.389536317349625e-06, + "loss": 0.6268, + "step": 33280 + }, + { + "epoch": 2.598142511511746, + "grad_norm": 9.568450032304554e-09, + "learning_rate": 5.3690486731678205e-06, + "loss": 0.5825, + "step": 33290 + }, + { + "epoch": 2.598922968859752, + "grad_norm": 261.511474609375, + "learning_rate": 5.3485978341866694e-06, + "loss": 2.9511, + "step": 33300 + }, + { + "epoch": 2.599703426207758, + "grad_norm": 7.09142057075951e-07, + "learning_rate": 5.328183817271093e-06, + "loss": 0.0, + "step": 33310 + }, + { + "epoch": 2.6004838835557638, + "grad_norm": 238.4812469482422, + "learning_rate": 5.307806639255642e-06, + "loss": 0.6745, + "step": 33320 + }, + { + "epoch": 2.6012643409037697, + "grad_norm": 2.5221299624256146e-15, + "learning_rate": 5.287466316944473e-06, + "loss": 0.7283, + "step": 33330 + }, + { + "epoch": 2.6020447982517756, + "grad_norm": 0.31384918093681335, + "learning_rate": 5.267162867111386e-06, + "loss": 0.0121, + "step": 33340 + }, + { + "epoch": 2.6028252555997815, + "grad_norm": 7.826183718862012e-07, + "learning_rate": 5.246896306499738e-06, + "loss": 0.5504, + "step": 33350 + }, + { + "epoch": 2.6036057129477874, + "grad_norm": 5.406313174916022e-09, + "learning_rate": 5.22666665182247e-06, + "loss": 0.9671, + "step": 33360 + }, + { + "epoch": 2.6043861702957933, + "grad_norm": 9.942142042973501e-08, + "learning_rate": 5.206473919762106e-06, + "loss": 0.0348, + "step": 33370 + }, + { + "epoch": 2.6051666276437992, + "grad_norm": 6.00198575284594e-08, + "learning_rate": 5.186318126970707e-06, + "loss": 0.0, + "step": 33380 + }, + { + "epoch": 2.605947084991805, + "grad_norm": 2.772690181328507e-18, + "learning_rate": 5.166199290069895e-06, + "loss": 0.7578, + "step": 33390 + }, + { + "epoch": 2.606727542339811, + "grad_norm": 3.210815906524658, + "learning_rate": 5.14611742565077e-06, + "loss": 1.0131, + "step": 33400 + }, + { + "epoch": 2.607507999687817, + "grad_norm": 0.0006296706851571798, + "learning_rate": 5.126072550274003e-06, + "loss": 0.0885, + "step": 33410 + }, + { + "epoch": 2.608288457035823, + "grad_norm": 2.116186692546762e-08, + "learning_rate": 5.106064680469697e-06, + "loss": 0.5514, + "step": 33420 + }, + { + "epoch": 2.609068914383829, + "grad_norm": 1.1567913134058472e-06, + "learning_rate": 5.086093832737493e-06, + "loss": 0.0002, + "step": 33430 + }, + { + "epoch": 2.6098493717318347, + "grad_norm": 1.472561239612702e-10, + "learning_rate": 5.066160023546474e-06, + "loss": 0.2182, + "step": 33440 + }, + { + "epoch": 2.6106298290798406, + "grad_norm": 0.006320985499769449, + "learning_rate": 5.046263269335156e-06, + "loss": 0.085, + "step": 33450 + }, + { + "epoch": 2.6114102864278466, + "grad_norm": 5.413837655553039e-19, + "learning_rate": 5.026403586511575e-06, + "loss": 0.6318, + "step": 33460 + }, + { + "epoch": 2.6121907437758525, + "grad_norm": 7.841154392451699e-10, + "learning_rate": 5.006580991453108e-06, + "loss": 0.9386, + "step": 33470 + }, + { + "epoch": 2.612971201123859, + "grad_norm": 1.0349990873706862e-15, + "learning_rate": 4.986795500506602e-06, + "loss": 0.3775, + "step": 33480 + }, + { + "epoch": 2.6137516584718643, + "grad_norm": 2.750820075636127e-22, + "learning_rate": 4.9670471299882835e-06, + "loss": 0.0016, + "step": 33490 + }, + { + "epoch": 2.6145321158198707, + "grad_norm": 2.6430812795297243e-05, + "learning_rate": 4.94733589618378e-06, + "loss": 0.0143, + "step": 33500 + }, + { + "epoch": 2.615312573167876, + "grad_norm": 4.1617440160734986e-08, + "learning_rate": 4.9276618153480724e-06, + "loss": 0.0, + "step": 33510 + }, + { + "epoch": 2.6160930305158825, + "grad_norm": 2.1422061763587408e-05, + "learning_rate": 4.908024903705538e-06, + "loss": 0.0283, + "step": 33520 + }, + { + "epoch": 2.6168734878638884, + "grad_norm": 1.1938641364395153e-07, + "learning_rate": 4.8884251774498565e-06, + "loss": 0.0, + "step": 33530 + }, + { + "epoch": 2.6176539452118943, + "grad_norm": 0.09271605312824249, + "learning_rate": 4.8688626527440885e-06, + "loss": 0.0059, + "step": 33540 + }, + { + "epoch": 2.6184344025599002, + "grad_norm": 5.075870035398111e-07, + "learning_rate": 4.8493373457206e-06, + "loss": 0.0001, + "step": 33550 + }, + { + "epoch": 2.619214859907906, + "grad_norm": 23.929956436157227, + "learning_rate": 4.829849272481035e-06, + "loss": 2.0308, + "step": 33560 + }, + { + "epoch": 2.619995317255912, + "grad_norm": 300.56817626953125, + "learning_rate": 4.810398449096387e-06, + "loss": 0.5585, + "step": 33570 + }, + { + "epoch": 2.620775774603918, + "grad_norm": 7.055079719216206e-14, + "learning_rate": 4.790984891606881e-06, + "loss": 1.517, + "step": 33580 + }, + { + "epoch": 2.621556231951924, + "grad_norm": 0.0070375483483076096, + "learning_rate": 4.77160861602205e-06, + "loss": 0.0011, + "step": 33590 + }, + { + "epoch": 2.62233668929993, + "grad_norm": 2.0225567709530878e-07, + "learning_rate": 4.752269638320639e-06, + "loss": 0.9377, + "step": 33600 + }, + { + "epoch": 2.6231171466479357, + "grad_norm": 1.4261396472647903e-06, + "learning_rate": 4.73296797445068e-06, + "loss": 0.016, + "step": 33610 + }, + { + "epoch": 2.6238976039959416, + "grad_norm": 4.438880176435776e-21, + "learning_rate": 4.7137036403294085e-06, + "loss": 0.0541, + "step": 33620 + }, + { + "epoch": 2.6246780613439475, + "grad_norm": 44.056949615478516, + "learning_rate": 4.694476651843294e-06, + "loss": 0.629, + "step": 33630 + }, + { + "epoch": 2.6254585186919535, + "grad_norm": 3.850926805171184e-05, + "learning_rate": 4.675287024847979e-06, + "loss": 0.0844, + "step": 33640 + }, + { + "epoch": 2.6262389760399594, + "grad_norm": 1.0066686684240267e-07, + "learning_rate": 4.656134775168314e-06, + "loss": 1.3893, + "step": 33650 + }, + { + "epoch": 2.6270194333879653, + "grad_norm": 1.6732981319989015e-11, + "learning_rate": 4.637019918598334e-06, + "loss": 0.7188, + "step": 33660 + }, + { + "epoch": 2.627799890735971, + "grad_norm": 4.760467345477082e-05, + "learning_rate": 4.617942470901221e-06, + "loss": 0.062, + "step": 33670 + }, + { + "epoch": 2.628580348083977, + "grad_norm": 1.1190292534832627e-13, + "learning_rate": 4.598902447809311e-06, + "loss": 0.0, + "step": 33680 + }, + { + "epoch": 2.629360805431983, + "grad_norm": 2.247318963595344e-16, + "learning_rate": 4.579899865024084e-06, + "loss": 0.0001, + "step": 33690 + }, + { + "epoch": 2.630141262779989, + "grad_norm": 1.2163898333028555e-08, + "learning_rate": 4.5609347382161605e-06, + "loss": 0.7094, + "step": 33700 + }, + { + "epoch": 2.630921720127995, + "grad_norm": 2.6269600305272277e-15, + "learning_rate": 4.542007083025224e-06, + "loss": 0.012, + "step": 33710 + }, + { + "epoch": 2.6317021774760008, + "grad_norm": 2.9815462767146528e-05, + "learning_rate": 4.523116915060116e-06, + "loss": 0.283, + "step": 33720 + }, + { + "epoch": 2.632482634824007, + "grad_norm": 3.7082540416122356e-07, + "learning_rate": 4.504264249898715e-06, + "loss": 0.2372, + "step": 33730 + }, + { + "epoch": 2.6332630921720126, + "grad_norm": 273.8210144042969, + "learning_rate": 4.485449103087997e-06, + "loss": 0.3322, + "step": 33740 + }, + { + "epoch": 2.634043549520019, + "grad_norm": 2.527123797335662e-06, + "learning_rate": 4.466671490143998e-06, + "loss": 3.9085, + "step": 33750 + }, + { + "epoch": 2.6348240068680244, + "grad_norm": 0.002832346362993121, + "learning_rate": 4.447931426551782e-06, + "loss": 0.0, + "step": 33760 + }, + { + "epoch": 2.635604464216031, + "grad_norm": 22.766206741333008, + "learning_rate": 4.4292289277654775e-06, + "loss": 0.0105, + "step": 33770 + }, + { + "epoch": 2.6363849215640363, + "grad_norm": 1.594238521578084e-13, + "learning_rate": 4.410564009208218e-06, + "loss": 0.0, + "step": 33780 + }, + { + "epoch": 2.6371653789120426, + "grad_norm": 0.015271048061549664, + "learning_rate": 4.391936686272152e-06, + "loss": 0.011, + "step": 33790 + }, + { + "epoch": 2.6379458362600485, + "grad_norm": 1.151661853366501e-19, + "learning_rate": 4.3733469743184095e-06, + "loss": 0.0001, + "step": 33800 + }, + { + "epoch": 2.6387262936080544, + "grad_norm": 0.002187962643802166, + "learning_rate": 4.354794888677139e-06, + "loss": 0.0036, + "step": 33810 + }, + { + "epoch": 2.6395067509560604, + "grad_norm": 1.8937040294986218e-05, + "learning_rate": 4.336280444647406e-06, + "loss": 0.0004, + "step": 33820 + }, + { + "epoch": 2.6402872083040663, + "grad_norm": 16.37470245361328, + "learning_rate": 4.317803657497288e-06, + "loss": 0.0074, + "step": 33830 + }, + { + "epoch": 2.641067665652072, + "grad_norm": 0.0005464993300847709, + "learning_rate": 4.2993645424637685e-06, + "loss": 0.8549, + "step": 33840 + }, + { + "epoch": 2.641848123000078, + "grad_norm": 272.48162841796875, + "learning_rate": 4.2809631147528005e-06, + "loss": 0.7294, + "step": 33850 + }, + { + "epoch": 2.642628580348084, + "grad_norm": 4.591046945279231e-06, + "learning_rate": 4.262599389539235e-06, + "loss": 0.012, + "step": 33860 + }, + { + "epoch": 2.64340903769609, + "grad_norm": 1.0221311693985075e-20, + "learning_rate": 4.244273381966824e-06, + "loss": 0.0161, + "step": 33870 + }, + { + "epoch": 2.644189495044096, + "grad_norm": 274.3841247558594, + "learning_rate": 4.2259851071482445e-06, + "loss": 0.6576, + "step": 33880 + }, + { + "epoch": 2.6449699523921018, + "grad_norm": 5.816648702433675e-17, + "learning_rate": 4.207734580165023e-06, + "loss": 0.1478, + "step": 33890 + }, + { + "epoch": 2.6457504097401077, + "grad_norm": 4.0705185710976366e-07, + "learning_rate": 4.189521816067593e-06, + "loss": 0.0001, + "step": 33900 + }, + { + "epoch": 2.6465308670881136, + "grad_norm": 6.483745598763743e-23, + "learning_rate": 4.1713468298752e-06, + "loss": 1.4205, + "step": 33910 + }, + { + "epoch": 2.6473113244361195, + "grad_norm": 2.014824218576905e-07, + "learning_rate": 4.1532096365759885e-06, + "loss": 0.0004, + "step": 33920 + }, + { + "epoch": 2.6480917817841254, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 4.1351102511269055e-06, + "loss": 0.0, + "step": 33930 + }, + { + "epoch": 2.6488722391321313, + "grad_norm": 2.345401071579545e-06, + "learning_rate": 4.117048688453718e-06, + "loss": 0.0017, + "step": 33940 + }, + { + "epoch": 2.6496526964801372, + "grad_norm": 5.603112185781356e-06, + "learning_rate": 4.099024963451015e-06, + "loss": 0.0012, + "step": 33950 + }, + { + "epoch": 2.650433153828143, + "grad_norm": 5.601246356964111, + "learning_rate": 4.081039090982175e-06, + "loss": 1.0046, + "step": 33960 + }, + { + "epoch": 2.651213611176149, + "grad_norm": 2.029733404091597e-21, + "learning_rate": 4.063091085879367e-06, + "loss": 0.9069, + "step": 33970 + }, + { + "epoch": 2.6519940685241554, + "grad_norm": 1.1794318197644316e-05, + "learning_rate": 4.0451809629435236e-06, + "loss": 0.0013, + "step": 33980 + }, + { + "epoch": 2.652774525872161, + "grad_norm": 3.118545066627121e-08, + "learning_rate": 4.027308736944341e-06, + "loss": 0.0935, + "step": 33990 + }, + { + "epoch": 2.6535549832201673, + "grad_norm": 7.507606483159179e-07, + "learning_rate": 4.009474422620269e-06, + "loss": 0.2441, + "step": 34000 + }, + { + "epoch": 2.6543354405681727, + "grad_norm": 5.389193979965512e-09, + "learning_rate": 3.991678034678503e-06, + "loss": 0.5167, + "step": 34010 + }, + { + "epoch": 2.655115897916179, + "grad_norm": 1.0987394154227115e-11, + "learning_rate": 3.973919587794922e-06, + "loss": 0.0096, + "step": 34020 + }, + { + "epoch": 2.6558963552641845, + "grad_norm": 6.000165891950499e-20, + "learning_rate": 3.956199096614166e-06, + "loss": 1.2193, + "step": 34030 + }, + { + "epoch": 2.656676812612191, + "grad_norm": 8.778447765623004e-16, + "learning_rate": 3.9385165757495405e-06, + "loss": 0.0059, + "step": 34040 + }, + { + "epoch": 2.657457269960197, + "grad_norm": 2.4371112766763403e-11, + "learning_rate": 3.920872039783047e-06, + "loss": 0.0028, + "step": 34050 + }, + { + "epoch": 2.6582377273082027, + "grad_norm": 274.0638122558594, + "learning_rate": 3.903265503265374e-06, + "loss": 1.025, + "step": 34060 + }, + { + "epoch": 2.6590181846562086, + "grad_norm": 4.381836049560661e-07, + "learning_rate": 3.8856969807158584e-06, + "loss": 0.0001, + "step": 34070 + }, + { + "epoch": 2.6597986420042146, + "grad_norm": 246.39788818359375, + "learning_rate": 3.868166486622493e-06, + "loss": 0.4099, + "step": 34080 + }, + { + "epoch": 2.6605790993522205, + "grad_norm": 1.3650200642799067e-18, + "learning_rate": 3.85067403544192e-06, + "loss": 0.3284, + "step": 34090 + }, + { + "epoch": 2.6613595567002264, + "grad_norm": 242.4805908203125, + "learning_rate": 3.833219641599406e-06, + "loss": 2.4331, + "step": 34100 + }, + { + "epoch": 2.6621400140482323, + "grad_norm": 1.4766764877549576e-07, + "learning_rate": 3.815803319488814e-06, + "loss": 1.9021, + "step": 34110 + }, + { + "epoch": 2.662920471396238, + "grad_norm": 45.691322326660156, + "learning_rate": 3.798425083472645e-06, + "loss": 2.0504, + "step": 34120 + }, + { + "epoch": 2.663700928744244, + "grad_norm": 0.0011006060522049665, + "learning_rate": 3.7810849478819576e-06, + "loss": 0.6354, + "step": 34130 + }, + { + "epoch": 2.66448138609225, + "grad_norm": 189.8499298095703, + "learning_rate": 3.7637829270164014e-06, + "loss": 0.643, + "step": 34140 + }, + { + "epoch": 2.665261843440256, + "grad_norm": 1.676145853718252e-16, + "learning_rate": 3.7465190351442135e-06, + "loss": 0.162, + "step": 34150 + }, + { + "epoch": 2.666042300788262, + "grad_norm": 131.11514282226562, + "learning_rate": 3.7292932865021614e-06, + "loss": 0.6568, + "step": 34160 + }, + { + "epoch": 2.666822758136268, + "grad_norm": 0.018124770373106003, + "learning_rate": 3.712105695295587e-06, + "loss": 1.0613, + "step": 34170 + }, + { + "epoch": 2.6676032154842737, + "grad_norm": 9.850300575209303e-10, + "learning_rate": 3.6949562756983245e-06, + "loss": 0.0017, + "step": 34180 + }, + { + "epoch": 2.6683836728322796, + "grad_norm": 235.30149841308594, + "learning_rate": 3.6778450418527733e-06, + "loss": 1.0566, + "step": 34190 + }, + { + "epoch": 2.6691641301802855, + "grad_norm": 9.505000342358016e-09, + "learning_rate": 3.660772007869806e-06, + "loss": 0.0015, + "step": 34200 + }, + { + "epoch": 2.6699445875282914, + "grad_norm": 0.002341636922210455, + "learning_rate": 3.6437371878288275e-06, + "loss": 1.3121, + "step": 34210 + }, + { + "epoch": 2.6707250448762974, + "grad_norm": 1.260479791653779e-07, + "learning_rate": 3.6267405957776947e-06, + "loss": 0.6651, + "step": 34220 + }, + { + "epoch": 2.6715055022243037, + "grad_norm": 9.471100570479507e-10, + "learning_rate": 3.6097822457327568e-06, + "loss": 0.4223, + "step": 34230 + }, + { + "epoch": 2.672285959572309, + "grad_norm": 0.00010060240310849622, + "learning_rate": 3.592862151678844e-06, + "loss": 0.7502, + "step": 34240 + }, + { + "epoch": 2.6730664169203155, + "grad_norm": 9.415930253453553e-05, + "learning_rate": 3.5759803275692007e-06, + "loss": 0.0755, + "step": 34250 + }, + { + "epoch": 2.673846874268321, + "grad_norm": 0.00018394121434539557, + "learning_rate": 3.5591367873255453e-06, + "loss": 0.6165, + "step": 34260 + }, + { + "epoch": 2.6746273316163274, + "grad_norm": 2.5903491973876953, + "learning_rate": 3.542331544838001e-06, + "loss": 0.1155, + "step": 34270 + }, + { + "epoch": 2.675407788964333, + "grad_norm": 6.003338421578519e-05, + "learning_rate": 3.5255646139651322e-06, + "loss": 0.0016, + "step": 34280 + }, + { + "epoch": 2.676188246312339, + "grad_norm": 1.627708731688493e-11, + "learning_rate": 3.508836008533878e-06, + "loss": 1.027, + "step": 34290 + }, + { + "epoch": 2.676968703660345, + "grad_norm": 273.7149353027344, + "learning_rate": 3.4921457423395976e-06, + "loss": 0.4918, + "step": 34300 + }, + { + "epoch": 2.677749161008351, + "grad_norm": 1.9501188858184193e-11, + "learning_rate": 3.4754938291460314e-06, + "loss": 0.0377, + "step": 34310 + }, + { + "epoch": 2.678529618356357, + "grad_norm": 5.9736852563219145e-05, + "learning_rate": 3.4588802826852893e-06, + "loss": 0.0222, + "step": 34320 + }, + { + "epoch": 2.679310075704363, + "grad_norm": 312.1669921875, + "learning_rate": 3.44230511665784e-06, + "loss": 3.0204, + "step": 34330 + }, + { + "epoch": 2.6800905330523688, + "grad_norm": 392.56744384765625, + "learning_rate": 3.425768344732483e-06, + "loss": 0.6596, + "step": 34340 + }, + { + "epoch": 2.6808709904003747, + "grad_norm": 3.923849634969445e-14, + "learning_rate": 3.4092699805463867e-06, + "loss": 0.0026, + "step": 34350 + }, + { + "epoch": 2.6816514477483806, + "grad_norm": 147.90354919433594, + "learning_rate": 3.39281003770503e-06, + "loss": 0.8152, + "step": 34360 + }, + { + "epoch": 2.6824319050963865, + "grad_norm": 0.00016789170331321657, + "learning_rate": 3.376388529782215e-06, + "loss": 0.2015, + "step": 34370 + }, + { + "epoch": 2.6832123624443924, + "grad_norm": 1.1009997251676396e-06, + "learning_rate": 3.360005470320038e-06, + "loss": 0.0062, + "step": 34380 + }, + { + "epoch": 2.6839928197923983, + "grad_norm": 189.47874450683594, + "learning_rate": 3.343660872828891e-06, + "loss": 0.1313, + "step": 34390 + }, + { + "epoch": 2.6847732771404043, + "grad_norm": 6.394549927790649e-06, + "learning_rate": 3.327354750787459e-06, + "loss": 0.7632, + "step": 34400 + }, + { + "epoch": 2.68555373448841, + "grad_norm": 8.24633161755628e-08, + "learning_rate": 3.3110871176426974e-06, + "loss": 0.0001, + "step": 34410 + }, + { + "epoch": 2.686334191836416, + "grad_norm": 1.4998055261802426e-18, + "learning_rate": 3.294857986809802e-06, + "loss": 0.0239, + "step": 34420 + }, + { + "epoch": 2.687114649184422, + "grad_norm": 2.3177593178047573e-08, + "learning_rate": 3.2786673716722227e-06, + "loss": 0.0, + "step": 34430 + }, + { + "epoch": 2.687895106532428, + "grad_norm": 0.0012782815610989928, + "learning_rate": 3.26251528558168e-06, + "loss": 3.0813, + "step": 34440 + }, + { + "epoch": 2.688675563880434, + "grad_norm": 0.00913954060524702, + "learning_rate": 3.246401741858063e-06, + "loss": 0.5146, + "step": 34450 + }, + { + "epoch": 2.6894560212284397, + "grad_norm": 5.803015401539824e-09, + "learning_rate": 3.230326753789531e-06, + "loss": 0.0, + "step": 34460 + }, + { + "epoch": 2.6902364785764457, + "grad_norm": 0.2496180385351181, + "learning_rate": 3.2142903346324195e-06, + "loss": 0.0004, + "step": 34470 + }, + { + "epoch": 2.6910169359244516, + "grad_norm": 7.848896530049387e-06, + "learning_rate": 3.198292497611277e-06, + "loss": 0.0222, + "step": 34480 + }, + { + "epoch": 2.6917973932724575, + "grad_norm": 2.4973063913569064e-15, + "learning_rate": 3.1823332559188012e-06, + "loss": 1.0162, + "step": 34490 + }, + { + "epoch": 2.692577850620464, + "grad_norm": 118.88430786132812, + "learning_rate": 3.1664126227158976e-06, + "loss": 0.053, + "step": 34500 + }, + { + "epoch": 2.6933583079684693, + "grad_norm": 3.7308425362425623e-06, + "learning_rate": 3.150530611131608e-06, + "loss": 0.0001, + "step": 34510 + }, + { + "epoch": 2.6941387653164757, + "grad_norm": 7.105061285983538e-07, + "learning_rate": 3.134687234263156e-06, + "loss": 1.0315, + "step": 34520 + }, + { + "epoch": 2.694919222664481, + "grad_norm": 9.881821055879314e-12, + "learning_rate": 3.1188825051758575e-06, + "loss": 0.293, + "step": 34530 + }, + { + "epoch": 2.6956996800124875, + "grad_norm": 168.21942138671875, + "learning_rate": 3.1031164369031915e-06, + "loss": 1.9331, + "step": 34540 + }, + { + "epoch": 2.6964801373604934, + "grad_norm": 208.61349487304688, + "learning_rate": 3.0873890424467698e-06, + "loss": 1.0772, + "step": 34550 + }, + { + "epoch": 2.6972605947084993, + "grad_norm": 2.124788807122968e-06, + "learning_rate": 3.0717003347762565e-06, + "loss": 0.1359, + "step": 34560 + }, + { + "epoch": 2.6980410520565052, + "grad_norm": 2.2703235125209886e-10, + "learning_rate": 3.0560503268294683e-06, + "loss": 1.0912, + "step": 34570 + }, + { + "epoch": 2.698821509404511, + "grad_norm": 286.3675537109375, + "learning_rate": 3.040439031512271e-06, + "loss": 1.3075, + "step": 34580 + }, + { + "epoch": 2.699601966752517, + "grad_norm": 0.09111825376749039, + "learning_rate": 3.0248664616986334e-06, + "loss": 0.5007, + "step": 34590 + }, + { + "epoch": 2.700382424100523, + "grad_norm": 0.000960067380219698, + "learning_rate": 3.0093326302305604e-06, + "loss": 0.0, + "step": 34600 + }, + { + "epoch": 2.701162881448529, + "grad_norm": 0.0028044397477060556, + "learning_rate": 2.9938375499181327e-06, + "loss": 0.0031, + "step": 34610 + }, + { + "epoch": 2.701943338796535, + "grad_norm": 2.1090053776006283e-20, + "learning_rate": 2.9783812335394733e-06, + "loss": 0.0, + "step": 34620 + }, + { + "epoch": 2.7027237961445407, + "grad_norm": 1.833546640526377e-14, + "learning_rate": 2.96296369384072e-06, + "loss": 0.5248, + "step": 34630 + }, + { + "epoch": 2.7035042534925466, + "grad_norm": 0.0002606491034384817, + "learning_rate": 2.947584943536058e-06, + "loss": 0.0043, + "step": 34640 + }, + { + "epoch": 2.7042847108405526, + "grad_norm": 8.347085522775188e-11, + "learning_rate": 2.9322449953076647e-06, + "loss": 2.5233, + "step": 34650 + }, + { + "epoch": 2.7050651681885585, + "grad_norm": 260.7015686035156, + "learning_rate": 2.9169438618057377e-06, + "loss": 1.1463, + "step": 34660 + }, + { + "epoch": 2.7058456255365644, + "grad_norm": 0.000424844678491354, + "learning_rate": 2.901681555648439e-06, + "loss": 0.0, + "step": 34670 + }, + { + "epoch": 2.7066260828845703, + "grad_norm": 1.760547545422153e-13, + "learning_rate": 2.8864580894219394e-06, + "loss": 0.0237, + "step": 34680 + }, + { + "epoch": 2.707406540232576, + "grad_norm": 0.004844082985073328, + "learning_rate": 2.8712734756803584e-06, + "loss": 0.1702, + "step": 34690 + }, + { + "epoch": 2.708186997580582, + "grad_norm": 0.027812352403998375, + "learning_rate": 2.85612772694579e-06, + "loss": 0.0471, + "step": 34700 + }, + { + "epoch": 2.708967454928588, + "grad_norm": 3.4556917327677183e-09, + "learning_rate": 2.841020855708276e-06, + "loss": 0.3409, + "step": 34710 + }, + { + "epoch": 2.709747912276594, + "grad_norm": 9.396440738126607e-14, + "learning_rate": 2.8259528744258023e-06, + "loss": 0.0052, + "step": 34720 + }, + { + "epoch": 2.7105283696246, + "grad_norm": 6.341630935668945, + "learning_rate": 2.810923795524262e-06, + "loss": 0.1053, + "step": 34730 + }, + { + "epoch": 2.711308826972606, + "grad_norm": 0.005546544678509235, + "learning_rate": 2.795933631397485e-06, + "loss": 1.1266, + "step": 34740 + }, + { + "epoch": 2.712089284320612, + "grad_norm": 1.9116978833277898e-10, + "learning_rate": 2.780982394407211e-06, + "loss": 0.0004, + "step": 34750 + }, + { + "epoch": 2.7128697416686176, + "grad_norm": 280.81304931640625, + "learning_rate": 2.7660700968830666e-06, + "loss": 0.5277, + "step": 34760 + }, + { + "epoch": 2.713650199016624, + "grad_norm": 3.132147823765596e-20, + "learning_rate": 2.7511967511225756e-06, + "loss": 0.2108, + "step": 34770 + }, + { + "epoch": 2.7144306563646294, + "grad_norm": 9.376853267895058e-05, + "learning_rate": 2.7363623693911432e-06, + "loss": 0.0007, + "step": 34780 + }, + { + "epoch": 2.715211113712636, + "grad_norm": 4.393995922669092e-08, + "learning_rate": 2.72156696392204e-06, + "loss": 0.0452, + "step": 34790 + }, + { + "epoch": 2.7159915710606413, + "grad_norm": 0.0003435779071878642, + "learning_rate": 2.706810546916383e-06, + "loss": 0.0086, + "step": 34800 + }, + { + "epoch": 2.7167720284086476, + "grad_norm": 8.209787368774414, + "learning_rate": 2.6920931305431496e-06, + "loss": 0.0087, + "step": 34810 + }, + { + "epoch": 2.7175524857566535, + "grad_norm": 7.450088346091447e-16, + "learning_rate": 2.677414726939159e-06, + "loss": 0.5131, + "step": 34820 + }, + { + "epoch": 2.7183329431046594, + "grad_norm": 0.0019514625892043114, + "learning_rate": 2.6627753482090347e-06, + "loss": 0.7882, + "step": 34830 + }, + { + "epoch": 2.7191134004526654, + "grad_norm": 7.329802610911429e-05, + "learning_rate": 2.648175006425246e-06, + "loss": 0.0634, + "step": 34840 + }, + { + "epoch": 2.7198938578006713, + "grad_norm": 1.3475154638290405, + "learning_rate": 2.6336137136280526e-06, + "loss": 0.0131, + "step": 34850 + }, + { + "epoch": 2.720674315148677, + "grad_norm": 0.001319439266808331, + "learning_rate": 2.619091481825531e-06, + "loss": 0.0597, + "step": 34860 + }, + { + "epoch": 2.721454772496683, + "grad_norm": 0.009119241498410702, + "learning_rate": 2.604608322993518e-06, + "loss": 0.0006, + "step": 34870 + }, + { + "epoch": 2.722235229844689, + "grad_norm": 6.017874511599075e-06, + "learning_rate": 2.590164249075655e-06, + "loss": 1.9251, + "step": 34880 + }, + { + "epoch": 2.723015687192695, + "grad_norm": 0.0004034480080008507, + "learning_rate": 2.57575927198333e-06, + "loss": 0.5727, + "step": 34890 + }, + { + "epoch": 2.723796144540701, + "grad_norm": 4.727761790945806e-07, + "learning_rate": 2.5613934035957132e-06, + "loss": 0.0172, + "step": 34900 + }, + { + "epoch": 2.7245766018887068, + "grad_norm": 1.950404616845415e-18, + "learning_rate": 2.5470666557596945e-06, + "loss": 0.017, + "step": 34910 + }, + { + "epoch": 2.7253570592367127, + "grad_norm": 2.9835364818573, + "learning_rate": 2.5327790402899177e-06, + "loss": 0.0016, + "step": 34920 + }, + { + "epoch": 2.7261375165847186, + "grad_norm": 1.925894999033062e-10, + "learning_rate": 2.51853056896878e-06, + "loss": 1.1155, + "step": 34930 + }, + { + "epoch": 2.7269179739327245, + "grad_norm": 0.009026541374623775, + "learning_rate": 2.5043212535463488e-06, + "loss": 1.1879, + "step": 34940 + }, + { + "epoch": 2.7276984312807304, + "grad_norm": 1.3697922346698476e-17, + "learning_rate": 2.490151105740446e-06, + "loss": 0.0281, + "step": 34950 + }, + { + "epoch": 2.7284788886287363, + "grad_norm": 9.61051628109999e-05, + "learning_rate": 2.4760201372365566e-06, + "loss": 0.0002, + "step": 34960 + }, + { + "epoch": 2.7292593459767422, + "grad_norm": 2.5067332654060692e-09, + "learning_rate": 2.4619283596878985e-06, + "loss": 0.0021, + "step": 34970 + }, + { + "epoch": 2.730039803324748, + "grad_norm": 68.98681640625, + "learning_rate": 2.44787578471532e-06, + "loss": 0.6315, + "step": 34980 + }, + { + "epoch": 2.730820260672754, + "grad_norm": 2.5030828965100227e-08, + "learning_rate": 2.4338624239073903e-06, + "loss": 1.2717, + "step": 34990 + }, + { + "epoch": 2.7316007180207604, + "grad_norm": 242.74081420898438, + "learning_rate": 2.419888288820299e-06, + "loss": 1.3068, + "step": 35000 + }, + { + "epoch": 2.732381175368766, + "grad_norm": 9.598036740499083e-06, + "learning_rate": 2.4059533909779165e-06, + "loss": 0.0011, + "step": 35010 + }, + { + "epoch": 2.7331616327167723, + "grad_norm": 3.2457666065965896e-07, + "learning_rate": 2.3920577418717505e-06, + "loss": 1.1327, + "step": 35020 + }, + { + "epoch": 2.7339420900647777, + "grad_norm": 0.007707437966018915, + "learning_rate": 2.378201352960924e-06, + "loss": 0.0002, + "step": 35030 + }, + { + "epoch": 2.734722547412784, + "grad_norm": 3.7804625034332275, + "learning_rate": 2.3643842356722124e-06, + "loss": 0.9504, + "step": 35040 + }, + { + "epoch": 2.7355030047607896, + "grad_norm": 1.9210584512041606e-11, + "learning_rate": 2.350606401399974e-06, + "loss": 1.5375, + "step": 35050 + }, + { + "epoch": 2.736283462108796, + "grad_norm": 1.1549177396550862e-18, + "learning_rate": 2.336867861506203e-06, + "loss": 0.0014, + "step": 35060 + }, + { + "epoch": 2.737063919456802, + "grad_norm": 0.04194324463605881, + "learning_rate": 2.32316862732046e-06, + "loss": 0.0127, + "step": 35070 + }, + { + "epoch": 2.7378443768048077, + "grad_norm": 115.06947326660156, + "learning_rate": 2.3095087101399126e-06, + "loss": 0.12, + "step": 35080 + }, + { + "epoch": 2.7386248341528137, + "grad_norm": 2.2268917265755306e-10, + "learning_rate": 2.2958881212293006e-06, + "loss": 0.0128, + "step": 35090 + }, + { + "epoch": 2.7394052915008196, + "grad_norm": 51.513614654541016, + "learning_rate": 2.282306871820938e-06, + "loss": 0.666, + "step": 35100 + }, + { + "epoch": 2.7401857488488255, + "grad_norm": 8.180680274963379, + "learning_rate": 2.268764973114684e-06, + "loss": 0.097, + "step": 35110 + }, + { + "epoch": 2.7409662061968314, + "grad_norm": 5.291911353566547e-09, + "learning_rate": 2.255262436277944e-06, + "loss": 0.1661, + "step": 35120 + }, + { + "epoch": 2.7417466635448373, + "grad_norm": 1.2681041489749473e-20, + "learning_rate": 2.2417992724456827e-06, + "loss": 0.1017, + "step": 35130 + }, + { + "epoch": 2.7425271208928432, + "grad_norm": 4.007921264800628e-18, + "learning_rate": 2.228375492720369e-06, + "loss": 0.3182, + "step": 35140 + }, + { + "epoch": 2.743307578240849, + "grad_norm": 2.2716215575258047e-09, + "learning_rate": 2.2149911081720264e-06, + "loss": 2.6478, + "step": 35150 + }, + { + "epoch": 2.744088035588855, + "grad_norm": 0.0053436425514519215, + "learning_rate": 2.2016461298381588e-06, + "loss": 0.0315, + "step": 35160 + }, + { + "epoch": 2.744868492936861, + "grad_norm": 2.0400614791221683e-12, + "learning_rate": 2.1883405687238067e-06, + "loss": 1.6938, + "step": 35170 + }, + { + "epoch": 2.745648950284867, + "grad_norm": 1.920632939800271e-06, + "learning_rate": 2.1750744358014717e-06, + "loss": 0.0001, + "step": 35180 + }, + { + "epoch": 2.746429407632873, + "grad_norm": 2.3888586042630777e-07, + "learning_rate": 2.161847742011164e-06, + "loss": 0.0, + "step": 35190 + }, + { + "epoch": 2.7472098649808787, + "grad_norm": 0.08937634527683258, + "learning_rate": 2.148660498260352e-06, + "loss": 1.8679, + "step": 35200 + }, + { + "epoch": 2.7479903223288846, + "grad_norm": 4.1917643102351576e-05, + "learning_rate": 2.1355127154239927e-06, + "loss": 0.8796, + "step": 35210 + }, + { + "epoch": 2.7487707796768905, + "grad_norm": 0.0017686202190816402, + "learning_rate": 2.122404404344486e-06, + "loss": 0.0002, + "step": 35220 + }, + { + "epoch": 2.7495512370248965, + "grad_norm": 5.840608992002672e-07, + "learning_rate": 2.1093355758316723e-06, + "loss": 0.0002, + "step": 35230 + }, + { + "epoch": 2.7503316943729024, + "grad_norm": 1.9371828795394235e-21, + "learning_rate": 2.09630624066286e-06, + "loss": 0.2608, + "step": 35240 + }, + { + "epoch": 2.7511121517209087, + "grad_norm": 0.0011315630981698632, + "learning_rate": 2.083316409582764e-06, + "loss": 0.0001, + "step": 35250 + }, + { + "epoch": 2.751892609068914, + "grad_norm": 279.3219909667969, + "learning_rate": 2.070366093303544e-06, + "loss": 0.3636, + "step": 35260 + }, + { + "epoch": 2.7526730664169206, + "grad_norm": 4.184293516118487e-07, + "learning_rate": 2.057455302504746e-06, + "loss": 0.1676, + "step": 35270 + }, + { + "epoch": 2.753453523764926, + "grad_norm": 0.03150082752108574, + "learning_rate": 2.044584047833359e-06, + "loss": 1.2382, + "step": 35280 + }, + { + "epoch": 2.7542339811129324, + "grad_norm": 21.65041732788086, + "learning_rate": 2.0317523399037195e-06, + "loss": 0.0081, + "step": 35290 + }, + { + "epoch": 2.755014438460938, + "grad_norm": 5.577290135261835e-15, + "learning_rate": 2.0189601892975974e-06, + "loss": 1.3071, + "step": 35300 + }, + { + "epoch": 2.755794895808944, + "grad_norm": 0.056859709322452545, + "learning_rate": 2.006207606564109e-06, + "loss": 0.0181, + "step": 35310 + }, + { + "epoch": 2.75657535315695, + "grad_norm": 1.200806442821027e-21, + "learning_rate": 1.9934946022197586e-06, + "loss": 0.1702, + "step": 35320 + }, + { + "epoch": 2.757355810504956, + "grad_norm": 2.7878815600731964e-17, + "learning_rate": 1.9808211867484105e-06, + "loss": 0.402, + "step": 35330 + }, + { + "epoch": 2.758136267852962, + "grad_norm": 5.401259422302246, + "learning_rate": 1.968187370601271e-06, + "loss": 0.0813, + "step": 35340 + }, + { + "epoch": 2.758916725200968, + "grad_norm": 0.5862646102905273, + "learning_rate": 1.9555931641969116e-06, + "loss": 0.0017, + "step": 35350 + }, + { + "epoch": 2.759697182548974, + "grad_norm": 8.405132393818349e-05, + "learning_rate": 1.943038577921208e-06, + "loss": 0.0025, + "step": 35360 + }, + { + "epoch": 2.7604776398969797, + "grad_norm": 0.005214150529354811, + "learning_rate": 1.9305236221273894e-06, + "loss": 0.7878, + "step": 35370 + }, + { + "epoch": 2.7612580972449856, + "grad_norm": 0.00010408924572402611, + "learning_rate": 1.9180483071359946e-06, + "loss": 0.1641, + "step": 35380 + }, + { + "epoch": 2.7620385545929915, + "grad_norm": 6.571749811001837e-09, + "learning_rate": 1.9056126432348664e-06, + "loss": 0.2695, + "step": 35390 + }, + { + "epoch": 2.7628190119409974, + "grad_norm": 4.0899040658889474e-19, + "learning_rate": 1.893216640679163e-06, + "loss": 0.0102, + "step": 35400 + }, + { + "epoch": 2.7635994692890034, + "grad_norm": 0.0037415786646306515, + "learning_rate": 1.8808603096913235e-06, + "loss": 0.0203, + "step": 35410 + }, + { + "epoch": 2.7643799266370093, + "grad_norm": 0.0011977603426203132, + "learning_rate": 1.8685436604610807e-06, + "loss": 0.0143, + "step": 35420 + }, + { + "epoch": 2.765160383985015, + "grad_norm": 32.34883117675781, + "learning_rate": 1.8562667031454262e-06, + "loss": 0.5085, + "step": 35430 + }, + { + "epoch": 2.765940841333021, + "grad_norm": 5.38933093707783e-08, + "learning_rate": 1.8440294478686448e-06, + "loss": 0.002, + "step": 35440 + }, + { + "epoch": 2.766721298681027, + "grad_norm": 290.1611328125, + "learning_rate": 1.8318319047222532e-06, + "loss": 2.7862, + "step": 35450 + }, + { + "epoch": 2.767501756029033, + "grad_norm": 9.790755939320661e-08, + "learning_rate": 1.8196740837650495e-06, + "loss": 0.1858, + "step": 35460 + }, + { + "epoch": 2.768282213377039, + "grad_norm": 1.5828281902940944e-05, + "learning_rate": 1.807555995023047e-06, + "loss": 0.1846, + "step": 35470 + }, + { + "epoch": 2.7690626707250448, + "grad_norm": 8.833049626985384e-16, + "learning_rate": 1.7954776484895186e-06, + "loss": 0.0407, + "step": 35480 + }, + { + "epoch": 2.7698431280730507, + "grad_norm": 0.0, + "learning_rate": 1.7834390541249358e-06, + "loss": 0.9468, + "step": 35490 + }, + { + "epoch": 2.7706235854210566, + "grad_norm": 127.19840240478516, + "learning_rate": 1.7714402218570125e-06, + "loss": 0.1476, + "step": 35500 + }, + { + "epoch": 2.7714040427690625, + "grad_norm": 3.28693672280167e-09, + "learning_rate": 1.7594811615806616e-06, + "loss": 0.2407, + "step": 35510 + }, + { + "epoch": 2.772184500117069, + "grad_norm": 8.966098903329112e-06, + "learning_rate": 1.7475618831579942e-06, + "loss": 0.5673, + "step": 35520 + }, + { + "epoch": 2.7729649574650743, + "grad_norm": 0.01867363229393959, + "learning_rate": 1.7356823964183255e-06, + "loss": 0.0314, + "step": 35530 + }, + { + "epoch": 2.7737454148130807, + "grad_norm": 1.3436168546038588e-14, + "learning_rate": 1.7238427111581412e-06, + "loss": 1.5335, + "step": 35540 + }, + { + "epoch": 2.774525872161086, + "grad_norm": 9.506376266479492, + "learning_rate": 1.7120428371411146e-06, + "loss": 0.0026, + "step": 35550 + }, + { + "epoch": 2.7753063295090925, + "grad_norm": 5.515518296306254e-06, + "learning_rate": 1.7002827840981007e-06, + "loss": 0.0003, + "step": 35560 + }, + { + "epoch": 2.7760867868570984, + "grad_norm": 0.0, + "learning_rate": 1.6885625617270972e-06, + "loss": 0.0489, + "step": 35570 + }, + { + "epoch": 2.7768672442051043, + "grad_norm": 1.0065394917546655e-06, + "learning_rate": 1.6768821796932565e-06, + "loss": 0.2155, + "step": 35580 + }, + { + "epoch": 2.7776477015531102, + "grad_norm": 1.872426310001174e-06, + "learning_rate": 1.6652416476288846e-06, + "loss": 0.0216, + "step": 35590 + }, + { + "epoch": 2.778428158901116, + "grad_norm": 0.001323552569374442, + "learning_rate": 1.6536409751334191e-06, + "loss": 0.0067, + "step": 35600 + }, + { + "epoch": 2.779208616249122, + "grad_norm": 6.04273395765631e-08, + "learning_rate": 1.6420801717734246e-06, + "loss": 0.0008, + "step": 35610 + }, + { + "epoch": 2.779989073597128, + "grad_norm": 0.00045711363782174885, + "learning_rate": 1.630559247082597e-06, + "loss": 0.0002, + "step": 35620 + }, + { + "epoch": 2.780769530945134, + "grad_norm": 1.8189606908230104e-15, + "learning_rate": 1.6190782105617364e-06, + "loss": 0.7252, + "step": 35630 + }, + { + "epoch": 2.78154998829314, + "grad_norm": 8.045684012358834e-07, + "learning_rate": 1.6076370716787692e-06, + "loss": 0.0043, + "step": 35640 + }, + { + "epoch": 2.7823304456411457, + "grad_norm": 5.913908334159864e-16, + "learning_rate": 1.5962358398686816e-06, + "loss": 0.0176, + "step": 35650 + }, + { + "epoch": 2.7831109029891516, + "grad_norm": 0.8195492625236511, + "learning_rate": 1.5848745245335916e-06, + "loss": 0.0003, + "step": 35660 + }, + { + "epoch": 2.7838913603371576, + "grad_norm": 0.07716309279203415, + "learning_rate": 1.5735531350426657e-06, + "loss": 0.0038, + "step": 35670 + }, + { + "epoch": 2.7846718176851635, + "grad_norm": 3.6004589674121235e-06, + "learning_rate": 1.5622716807321692e-06, + "loss": 0.0014, + "step": 35680 + }, + { + "epoch": 2.7854522750331694, + "grad_norm": 2.6100972263520816e-06, + "learning_rate": 1.5510301709054209e-06, + "loss": 1.281, + "step": 35690 + }, + { + "epoch": 2.7862327323811753, + "grad_norm": 0.5086005330085754, + "learning_rate": 1.5398286148328056e-06, + "loss": 2.1155, + "step": 35700 + }, + { + "epoch": 2.787013189729181, + "grad_norm": 268.41796875, + "learning_rate": 1.5286670217517673e-06, + "loss": 0.3591, + "step": 35710 + }, + { + "epoch": 2.787793647077187, + "grad_norm": 1.9338531274115667e-05, + "learning_rate": 1.5175454008667712e-06, + "loss": 0.0009, + "step": 35720 + }, + { + "epoch": 2.788574104425193, + "grad_norm": 180.71678161621094, + "learning_rate": 1.5064637613493471e-06, + "loss": 0.2561, + "step": 35730 + }, + { + "epoch": 2.789354561773199, + "grad_norm": 2.9723254413305256e-10, + "learning_rate": 1.495422112338024e-06, + "loss": 0.0281, + "step": 35740 + }, + { + "epoch": 2.790135019121205, + "grad_norm": 1.7081179678156422e-13, + "learning_rate": 1.484420462938385e-06, + "loss": 0.3611, + "step": 35750 + }, + { + "epoch": 2.790915476469211, + "grad_norm": 3.5201527099128085e-18, + "learning_rate": 1.4734588222230006e-06, + "loss": 0.0, + "step": 35760 + }, + { + "epoch": 2.791695933817217, + "grad_norm": 0.011384816840291023, + "learning_rate": 1.4625371992314618e-06, + "loss": 0.56, + "step": 35770 + }, + { + "epoch": 2.7924763911652226, + "grad_norm": 5.476379556057509e-06, + "learning_rate": 1.4516556029703532e-06, + "loss": 0.2762, + "step": 35780 + }, + { + "epoch": 2.793256848513229, + "grad_norm": 0.014588144607841969, + "learning_rate": 1.4408140424132578e-06, + "loss": 0.2594, + "step": 35790 + }, + { + "epoch": 2.7940373058612344, + "grad_norm": 6.3460006458626594e-06, + "learning_rate": 1.4300125265007347e-06, + "loss": 0.4624, + "step": 35800 + }, + { + "epoch": 2.794817763209241, + "grad_norm": 0.00012951460666954517, + "learning_rate": 1.41925106414032e-06, + "loss": 0.1455, + "step": 35810 + }, + { + "epoch": 2.7955982205572463, + "grad_norm": 6.552781234292482e-11, + "learning_rate": 1.4085296642065316e-06, + "loss": 1.6117, + "step": 35820 + }, + { + "epoch": 2.7963786779052526, + "grad_norm": 70.52780151367188, + "learning_rate": 1.39784833554083e-06, + "loss": 0.0387, + "step": 35830 + }, + { + "epoch": 2.7971591352532585, + "grad_norm": 1.323123655083691e-07, + "learning_rate": 1.3872070869516529e-06, + "loss": 0.003, + "step": 35840 + }, + { + "epoch": 2.7979395926012645, + "grad_norm": 250.273193359375, + "learning_rate": 1.3766059272143638e-06, + "loss": 0.2595, + "step": 35850 + }, + { + "epoch": 2.7987200499492704, + "grad_norm": 3.7702760913305156e-09, + "learning_rate": 1.366044865071281e-06, + "loss": 0.5753, + "step": 35860 + }, + { + "epoch": 2.7995005072972763, + "grad_norm": 383.3858642578125, + "learning_rate": 1.355523909231654e-06, + "loss": 1.8688, + "step": 35870 + }, + { + "epoch": 2.800280964645282, + "grad_norm": 2.6719694687926676e-06, + "learning_rate": 1.3450430683716596e-06, + "loss": 0.0001, + "step": 35880 + }, + { + "epoch": 2.801061421993288, + "grad_norm": 198.89208984375, + "learning_rate": 1.3346023511343842e-06, + "loss": 0.5731, + "step": 35890 + }, + { + "epoch": 2.801841879341294, + "grad_norm": 0.2880525588989258, + "learning_rate": 1.3242017661298345e-06, + "loss": 0.0024, + "step": 35900 + }, + { + "epoch": 2.8026223366893, + "grad_norm": 1.8106071248422465e-14, + "learning_rate": 1.3138413219349165e-06, + "loss": 0.8769, + "step": 35910 + }, + { + "epoch": 2.803402794037306, + "grad_norm": 1.7158454561697067e-12, + "learning_rate": 1.3035210270934462e-06, + "loss": 0.0018, + "step": 35920 + }, + { + "epoch": 2.8041832513853118, + "grad_norm": 0.22820571064949036, + "learning_rate": 1.2932408901161096e-06, + "loss": 0.0376, + "step": 35930 + }, + { + "epoch": 2.8049637087333177, + "grad_norm": 0.018509892746806145, + "learning_rate": 1.2830009194804927e-06, + "loss": 1.6518, + "step": 35940 + }, + { + "epoch": 2.8057441660813236, + "grad_norm": 7.552736759185791, + "learning_rate": 1.2728011236310566e-06, + "loss": 0.4229, + "step": 35950 + }, + { + "epoch": 2.8065246234293295, + "grad_norm": 0.0012847146717831492, + "learning_rate": 1.2626415109791235e-06, + "loss": 1.4565, + "step": 35960 + }, + { + "epoch": 2.8073050807773354, + "grad_norm": 9.853501348189297e-10, + "learning_rate": 1.252522089902891e-06, + "loss": 1.1181, + "step": 35970 + }, + { + "epoch": 2.8080855381253413, + "grad_norm": 8.04511088858817e-08, + "learning_rate": 1.2424428687473954e-06, + "loss": 0.4782, + "step": 35980 + }, + { + "epoch": 2.8088659954733473, + "grad_norm": 0.0016354135004803538, + "learning_rate": 1.2324038558245376e-06, + "loss": 0.1163, + "step": 35990 + }, + { + "epoch": 2.809646452821353, + "grad_norm": 219.00985717773438, + "learning_rate": 1.2224050594130454e-06, + "loss": 0.2557, + "step": 36000 + }, + { + "epoch": 2.810426910169359, + "grad_norm": 3.0811437385855345e-13, + "learning_rate": 1.2124464877584951e-06, + "loss": 0.0113, + "step": 36010 + }, + { + "epoch": 2.8112073675173654, + "grad_norm": 0.0, + "learning_rate": 1.2025281490732953e-06, + "loss": 0.1054, + "step": 36020 + }, + { + "epoch": 2.811987824865371, + "grad_norm": 5.375960568094855e-15, + "learning_rate": 1.192650051536659e-06, + "loss": 1.0442, + "step": 36030 + }, + { + "epoch": 2.8127682822133773, + "grad_norm": 0.00020814120944123715, + "learning_rate": 1.1828122032946254e-06, + "loss": 0.2798, + "step": 36040 + }, + { + "epoch": 2.8135487395613827, + "grad_norm": 5.018740125706245e-07, + "learning_rate": 1.1730146124600273e-06, + "loss": 0.8126, + "step": 36050 + }, + { + "epoch": 2.814329196909389, + "grad_norm": 0.0027081905864179134, + "learning_rate": 1.1632572871125346e-06, + "loss": 0.0516, + "step": 36060 + }, + { + "epoch": 2.8151096542573946, + "grad_norm": 21.75045394897461, + "learning_rate": 1.1535402352985668e-06, + "loss": 1.5453, + "step": 36070 + }, + { + "epoch": 2.815890111605401, + "grad_norm": 4.614805584424175e-05, + "learning_rate": 1.143863465031364e-06, + "loss": 0.0003, + "step": 36080 + }, + { + "epoch": 2.816670568953407, + "grad_norm": 0.002450536238029599, + "learning_rate": 1.134226984290937e-06, + "loss": 0.8069, + "step": 36090 + }, + { + "epoch": 2.8174510263014128, + "grad_norm": 0.24124084413051605, + "learning_rate": 1.1246308010240625e-06, + "loss": 0.7377, + "step": 36100 + }, + { + "epoch": 2.8182314836494187, + "grad_norm": 8.658199262661639e-16, + "learning_rate": 1.1150749231443102e-06, + "loss": 0.7351, + "step": 36110 + }, + { + "epoch": 2.8190119409974246, + "grad_norm": 0.6915903687477112, + "learning_rate": 1.1055593585319824e-06, + "loss": 0.0231, + "step": 36120 + }, + { + "epoch": 2.8197923983454305, + "grad_norm": 297.76934814453125, + "learning_rate": 1.0960841150341573e-06, + "loss": 0.4391, + "step": 36130 + }, + { + "epoch": 2.8205728556934364, + "grad_norm": 1.6281728743017254e-10, + "learning_rate": 1.0866492004646455e-06, + "loss": 0.7291, + "step": 36140 + }, + { + "epoch": 2.8213533130414423, + "grad_norm": 253.92724609375, + "learning_rate": 1.0772546226040291e-06, + "loss": 1.4697, + "step": 36150 + }, + { + "epoch": 2.8221337703894482, + "grad_norm": 0.0, + "learning_rate": 1.0679003891995887e-06, + "loss": 0.0234, + "step": 36160 + }, + { + "epoch": 2.822914227737454, + "grad_norm": 34.89122009277344, + "learning_rate": 1.058586507965359e-06, + "loss": 0.2094, + "step": 36170 + }, + { + "epoch": 2.82369468508546, + "grad_norm": 269.281982421875, + "learning_rate": 1.049312986582096e-06, + "loss": 0.4945, + "step": 36180 + }, + { + "epoch": 2.824475142433466, + "grad_norm": 1.9220181703567505, + "learning_rate": 1.040079832697266e-06, + "loss": 0.001, + "step": 36190 + }, + { + "epoch": 2.825255599781472, + "grad_norm": 242.4416046142578, + "learning_rate": 1.0308870539250503e-06, + "loss": 0.8693, + "step": 36200 + }, + { + "epoch": 2.826036057129478, + "grad_norm": 7.059574604034424, + "learning_rate": 1.021734657846324e-06, + "loss": 0.003, + "step": 36210 + }, + { + "epoch": 2.8268165144774837, + "grad_norm": 8.370619773864746, + "learning_rate": 1.0126226520086824e-06, + "loss": 0.8058, + "step": 36220 + }, + { + "epoch": 2.8275969718254896, + "grad_norm": 3.950748542638045e-16, + "learning_rate": 1.003551043926393e-06, + "loss": 0.6948, + "step": 36230 + }, + { + "epoch": 2.8283774291734955, + "grad_norm": 71.38811492919922, + "learning_rate": 9.945198410804102e-07, + "loss": 0.0513, + "step": 36240 + }, + { + "epoch": 2.8291578865215015, + "grad_norm": 0.11276044696569443, + "learning_rate": 9.855290509183824e-07, + "loss": 0.0315, + "step": 36250 + }, + { + "epoch": 2.8299383438695074, + "grad_norm": 133.40269470214844, + "learning_rate": 9.765786808546228e-07, + "loss": 0.9867, + "step": 36260 + }, + { + "epoch": 2.8307188012175137, + "grad_norm": 316.03729248046875, + "learning_rate": 9.676687382701054e-07, + "loss": 1.9482, + "step": 36270 + }, + { + "epoch": 2.831499258565519, + "grad_norm": 2.645971820553622e-13, + "learning_rate": 9.587992305124748e-07, + "loss": 0.2695, + "step": 36280 + }, + { + "epoch": 2.8322797159135256, + "grad_norm": 2.3823126902178943e-14, + "learning_rate": 9.499701648960302e-07, + "loss": 0.2852, + "step": 36290 + }, + { + "epoch": 2.833060173261531, + "grad_norm": 6.899634197310434e-09, + "learning_rate": 9.411815487017084e-07, + "loss": 1.3206, + "step": 36300 + }, + { + "epoch": 2.8338406306095374, + "grad_norm": 13.426802635192871, + "learning_rate": 9.324333891771064e-07, + "loss": 0.0057, + "step": 36310 + }, + { + "epoch": 2.834621087957543, + "grad_norm": 32.85948181152344, + "learning_rate": 9.237256935364425e-07, + "loss": 0.0127, + "step": 36320 + }, + { + "epoch": 2.835401545305549, + "grad_norm": 0.058131031692028046, + "learning_rate": 9.15058468960589e-07, + "loss": 0.0143, + "step": 36330 + }, + { + "epoch": 2.836182002653555, + "grad_norm": 256.7023620605469, + "learning_rate": 9.06431722597012e-07, + "loss": 1.7361, + "step": 36340 + }, + { + "epoch": 2.836962460001561, + "grad_norm": 3.593015662772814e-07, + "learning_rate": 8.978454615598209e-07, + "loss": 0.5695, + "step": 36350 + }, + { + "epoch": 2.837742917349567, + "grad_norm": 1.4453445384976504e-11, + "learning_rate": 8.892996929297292e-07, + "loss": 0.0975, + "step": 36360 + }, + { + "epoch": 2.838523374697573, + "grad_norm": 0.0018407270545139909, + "learning_rate": 8.807944237540666e-07, + "loss": 0.0378, + "step": 36370 + }, + { + "epoch": 2.839303832045579, + "grad_norm": 1.47364867704447e-13, + "learning_rate": 8.723296610467446e-07, + "loss": 0.0, + "step": 36380 + }, + { + "epoch": 2.8400842893935847, + "grad_norm": 4.2558838397877e-12, + "learning_rate": 8.639054117882905e-07, + "loss": 0.1764, + "step": 36390 + }, + { + "epoch": 2.8408647467415906, + "grad_norm": 3.332346343309922e-10, + "learning_rate": 8.555216829258195e-07, + "loss": 0.0023, + "step": 36400 + }, + { + "epoch": 2.8416452040895965, + "grad_norm": 1.7551040373646465e-13, + "learning_rate": 8.471784813730232e-07, + "loss": 1.1252, + "step": 36410 + }, + { + "epoch": 2.8424256614376024, + "grad_norm": 9.194217974055003e-19, + "learning_rate": 8.388758140101815e-07, + "loss": 0.538, + "step": 36420 + }, + { + "epoch": 2.8432061187856084, + "grad_norm": 26.544153213500977, + "learning_rate": 8.306136876841286e-07, + "loss": 0.0204, + "step": 36430 + }, + { + "epoch": 2.8439865761336143, + "grad_norm": 35.749874114990234, + "learning_rate": 8.223921092083031e-07, + "loss": 0.6544, + "step": 36440 + }, + { + "epoch": 2.84476703348162, + "grad_norm": 1.4025773452885915e-05, + "learning_rate": 8.142110853626594e-07, + "loss": 0.0, + "step": 36450 + }, + { + "epoch": 2.845547490829626, + "grad_norm": 1.284917203747682e-07, + "learning_rate": 8.060706228937454e-07, + "loss": 0.8626, + "step": 36460 + }, + { + "epoch": 2.846327948177632, + "grad_norm": 6.08206844329834, + "learning_rate": 7.979707285146353e-07, + "loss": 0.0023, + "step": 36470 + }, + { + "epoch": 2.847108405525638, + "grad_norm": 0.37371331453323364, + "learning_rate": 7.899114089049697e-07, + "loss": 0.5559, + "step": 36480 + }, + { + "epoch": 2.847888862873644, + "grad_norm": 299.5367736816406, + "learning_rate": 7.818926707109154e-07, + "loss": 0.1753, + "step": 36490 + }, + { + "epoch": 2.8486693202216498, + "grad_norm": 1.221839374920819e-05, + "learning_rate": 7.739145205451714e-07, + "loss": 0.9911, + "step": 36500 + }, + { + "epoch": 2.8494497775696557, + "grad_norm": 1.6280899217235856e-05, + "learning_rate": 7.659769649869752e-07, + "loss": 0.3591, + "step": 36510 + }, + { + "epoch": 2.8502302349176616, + "grad_norm": 278.3134460449219, + "learning_rate": 7.580800105820796e-07, + "loss": 0.6678, + "step": 36520 + }, + { + "epoch": 2.8510106922656675, + "grad_norm": 0.000931643764488399, + "learning_rate": 7.502236638427695e-07, + "loss": 0.0144, + "step": 36530 + }, + { + "epoch": 2.851791149613674, + "grad_norm": 339.99200439453125, + "learning_rate": 7.424079312478127e-07, + "loss": 0.8403, + "step": 36540 + }, + { + "epoch": 2.8525716069616793, + "grad_norm": 4.815967763471818e-11, + "learning_rate": 7.346328192425145e-07, + "loss": 0.1756, + "step": 36550 + }, + { + "epoch": 2.8533520643096857, + "grad_norm": 0.000435869034845382, + "learning_rate": 7.268983342386737e-07, + "loss": 0.0307, + "step": 36560 + }, + { + "epoch": 2.854132521657691, + "grad_norm": 5.227929705142742e-06, + "learning_rate": 7.192044826145771e-07, + "loss": 1.2073, + "step": 36570 + }, + { + "epoch": 2.8549129790056975, + "grad_norm": 1.5421983690089291e-09, + "learning_rate": 7.115512707150162e-07, + "loss": 0.1502, + "step": 36580 + }, + { + "epoch": 2.8556934363537034, + "grad_norm": 2.3623229594704753e-07, + "learning_rate": 7.03938704851248e-07, + "loss": 0.2127, + "step": 36590 + }, + { + "epoch": 2.8564738937017093, + "grad_norm": 0.0002894347708206624, + "learning_rate": 6.963667913010397e-07, + "loss": 0.3019, + "step": 36600 + }, + { + "epoch": 2.8572543510497153, + "grad_norm": 5.101186957290338e-07, + "learning_rate": 6.888355363086019e-07, + "loss": 0.1228, + "step": 36610 + }, + { + "epoch": 2.858034808397721, + "grad_norm": 154.7194061279297, + "learning_rate": 6.813449460846444e-07, + "loss": 0.1678, + "step": 36620 + }, + { + "epoch": 2.858815265745727, + "grad_norm": 327.861328125, + "learning_rate": 6.738950268063315e-07, + "loss": 0.6057, + "step": 36630 + }, + { + "epoch": 2.859595723093733, + "grad_norm": 0.0005815518088638783, + "learning_rate": 6.664857846172822e-07, + "loss": 0.4766, + "step": 36640 + }, + { + "epoch": 2.860376180441739, + "grad_norm": 0.01614786870777607, + "learning_rate": 6.591172256275702e-07, + "loss": 0.6633, + "step": 36650 + }, + { + "epoch": 2.861156637789745, + "grad_norm": 2.4461796283721924, + "learning_rate": 6.5178935591374e-07, + "loss": 0.1324, + "step": 36660 + }, + { + "epoch": 2.8619370951377507, + "grad_norm": 4.134871958716424e-11, + "learning_rate": 6.445021815187524e-07, + "loss": 2.0777, + "step": 36670 + }, + { + "epoch": 2.8627175524857567, + "grad_norm": 0.0005343449302017689, + "learning_rate": 6.372557084520281e-07, + "loss": 0.0769, + "step": 36680 + }, + { + "epoch": 2.8634980098337626, + "grad_norm": 0.3777041435241699, + "learning_rate": 6.300499426894202e-07, + "loss": 1.8982, + "step": 36690 + }, + { + "epoch": 2.8642784671817685, + "grad_norm": 1.9186600919152141e-10, + "learning_rate": 6.228848901732031e-07, + "loss": 0.3288, + "step": 36700 + }, + { + "epoch": 2.8650589245297744, + "grad_norm": 3.368470788700506e-05, + "learning_rate": 6.157605568120839e-07, + "loss": 4.5806, + "step": 36710 + }, + { + "epoch": 2.8658393818777803, + "grad_norm": 0.30120527744293213, + "learning_rate": 6.08676948481196e-07, + "loss": 0.0001, + "step": 36720 + }, + { + "epoch": 2.8666198392257862, + "grad_norm": 0.01890019327402115, + "learning_rate": 6.016340710220835e-07, + "loss": 0.9956, + "step": 36730 + }, + { + "epoch": 2.867400296573792, + "grad_norm": 8.265578799182549e-06, + "learning_rate": 5.94631930242695e-07, + "loss": 0.2812, + "step": 36740 + }, + { + "epoch": 2.868180753921798, + "grad_norm": 302.0165710449219, + "learning_rate": 5.876705319173892e-07, + "loss": 3.4252, + "step": 36750 + }, + { + "epoch": 2.868961211269804, + "grad_norm": 0.029607435688376427, + "learning_rate": 5.80749881786935e-07, + "loss": 0.0, + "step": 36760 + }, + { + "epoch": 2.86974166861781, + "grad_norm": 1.4220939981157699e-09, + "learning_rate": 5.73869985558484e-07, + "loss": 0.7625, + "step": 36770 + }, + { + "epoch": 2.870522125965816, + "grad_norm": 2.4834584166910645e-08, + "learning_rate": 5.67030848905592e-07, + "loss": 0.9126, + "step": 36780 + }, + { + "epoch": 2.871302583313822, + "grad_norm": 0.07383527606725693, + "learning_rate": 5.602324774681922e-07, + "loss": 0.9126, + "step": 36790 + }, + { + "epoch": 2.8720830406618276, + "grad_norm": 2.807852615660522e-05, + "learning_rate": 5.53474876852611e-07, + "loss": 0.0002, + "step": 36800 + }, + { + "epoch": 2.872863498009834, + "grad_norm": 0.7243090867996216, + "learning_rate": 5.467580526315408e-07, + "loss": 1.1119, + "step": 36810 + }, + { + "epoch": 2.8736439553578395, + "grad_norm": 9.3350399974065e-16, + "learning_rate": 5.400820103440618e-07, + "loss": 1.944, + "step": 36820 + }, + { + "epoch": 2.874424412705846, + "grad_norm": 115.54177856445312, + "learning_rate": 5.334467554956091e-07, + "loss": 0.0732, + "step": 36830 + }, + { + "epoch": 2.8752048700538513, + "grad_norm": 1.0314905011910014e-05, + "learning_rate": 5.268522935579889e-07, + "loss": 0.2186, + "step": 36840 + }, + { + "epoch": 2.8759853274018576, + "grad_norm": 7.815126323862387e-10, + "learning_rate": 5.202986299693679e-07, + "loss": 0.0001, + "step": 36850 + }, + { + "epoch": 2.8767657847498636, + "grad_norm": 5.589182497073235e-20, + "learning_rate": 5.137857701342619e-07, + "loss": 0.0, + "step": 36860 + }, + { + "epoch": 2.8775462420978695, + "grad_norm": 9.388667621124114e-08, + "learning_rate": 5.073137194235523e-07, + "loss": 0.125, + "step": 36870 + }, + { + "epoch": 2.8783266994458754, + "grad_norm": 1.190881082635542e-08, + "learning_rate": 5.008824831744474e-07, + "loss": 0.0003, + "step": 36880 + }, + { + "epoch": 2.8791071567938813, + "grad_norm": 3.307443628985104e-17, + "learning_rate": 4.944920666905162e-07, + "loss": 0.0466, + "step": 36890 + }, + { + "epoch": 2.879887614141887, + "grad_norm": 6.361972615387401e-10, + "learning_rate": 4.881424752416541e-07, + "loss": 0.0082, + "step": 36900 + }, + { + "epoch": 2.880668071489893, + "grad_norm": 0.12347617745399475, + "learning_rate": 4.818337140640894e-07, + "loss": 0.0001, + "step": 36910 + }, + { + "epoch": 2.881448528837899, + "grad_norm": 0.29002267122268677, + "learning_rate": 4.755657883603826e-07, + "loss": 0.0005, + "step": 36920 + }, + { + "epoch": 2.882228986185905, + "grad_norm": 0.01090227346867323, + "learning_rate": 4.6933870329941566e-07, + "loss": 0.0094, + "step": 36930 + }, + { + "epoch": 2.883009443533911, + "grad_norm": 0.003904677229002118, + "learning_rate": 4.631524640164031e-07, + "loss": 0.196, + "step": 36940 + }, + { + "epoch": 2.883789900881917, + "grad_norm": 2.1362526416778564, + "learning_rate": 4.5700707561286414e-07, + "loss": 0.4271, + "step": 36950 + }, + { + "epoch": 2.8845703582299227, + "grad_norm": 1.6145381472737474e-12, + "learning_rate": 4.5090254315662826e-07, + "loss": 3.4132, + "step": 36960 + }, + { + "epoch": 2.8853508155779286, + "grad_norm": 1.6603729591224692e-07, + "learning_rate": 4.4483887168184637e-07, + "loss": 0.9031, + "step": 36970 + }, + { + "epoch": 2.8861312729259345, + "grad_norm": 1.5513957676889246e-16, + "learning_rate": 4.388160661889573e-07, + "loss": 0.0732, + "step": 36980 + }, + { + "epoch": 2.8869117302739404, + "grad_norm": 7.504744048425493e-11, + "learning_rate": 4.328341316446993e-07, + "loss": 0.0176, + "step": 36990 + }, + { + "epoch": 2.8876921876219463, + "grad_norm": 0.0, + "learning_rate": 4.268930729821319e-07, + "loss": 1.3189, + "step": 37000 + }, + { + "epoch": 2.8884726449699523, + "grad_norm": 0.6716959476470947, + "learning_rate": 4.2099289510056926e-07, + "loss": 1.6209, + "step": 37010 + }, + { + "epoch": 2.889253102317958, + "grad_norm": 359.7210693359375, + "learning_rate": 4.151336028656361e-07, + "loss": 1.725, + "step": 37020 + }, + { + "epoch": 2.890033559665964, + "grad_norm": 367.9685363769531, + "learning_rate": 4.093152011092394e-07, + "loss": 0.5489, + "step": 37030 + }, + { + "epoch": 2.8908140170139704, + "grad_norm": 3.560751215821877e-21, + "learning_rate": 4.0353769462956325e-07, + "loss": 0.686, + "step": 37040 + }, + { + "epoch": 2.891594474361976, + "grad_norm": 0.00012073075777152553, + "learning_rate": 3.9780108819105766e-07, + "loss": 0.0, + "step": 37050 + }, + { + "epoch": 2.8923749317099823, + "grad_norm": 1.461513876914978, + "learning_rate": 3.9210538652445503e-07, + "loss": 0.0046, + "step": 37060 + }, + { + "epoch": 2.8931553890579877, + "grad_norm": 6.231614112854004, + "learning_rate": 3.864505943267593e-07, + "loss": 1.5488, + "step": 37070 + }, + { + "epoch": 2.893935846405994, + "grad_norm": 0.04269225895404816, + "learning_rate": 3.8083671626122365e-07, + "loss": 0.0004, + "step": 37080 + }, + { + "epoch": 2.8947163037539996, + "grad_norm": 4.1955885535571724e-05, + "learning_rate": 3.7526375695736694e-07, + "loss": 1.2249, + "step": 37090 + }, + { + "epoch": 2.895496761102006, + "grad_norm": 348.7253723144531, + "learning_rate": 3.6973172101096856e-07, + "loss": 0.1455, + "step": 37100 + }, + { + "epoch": 2.896277218450012, + "grad_norm": 0.0001276261027669534, + "learning_rate": 3.6424061298405697e-07, + "loss": 0.1337, + "step": 37110 + }, + { + "epoch": 2.8970576757980178, + "grad_norm": 168.937255859375, + "learning_rate": 3.5879043740491557e-07, + "loss": 0.2812, + "step": 37120 + }, + { + "epoch": 2.8978381331460237, + "grad_norm": 2.2684753275825642e-05, + "learning_rate": 3.533811987680602e-07, + "loss": 0.2242, + "step": 37130 + }, + { + "epoch": 2.8986185904940296, + "grad_norm": 0.010522971861064434, + "learning_rate": 3.480129015342559e-07, + "loss": 0.2176, + "step": 37140 + }, + { + "epoch": 2.8993990478420355, + "grad_norm": 8.882963697942614e-07, + "learning_rate": 3.42685550130506e-07, + "loss": 1.5899, + "step": 37150 + }, + { + "epoch": 2.9001795051900414, + "grad_norm": 0.0, + "learning_rate": 3.37399148950035e-07, + "loss": 0.2396, + "step": 37160 + }, + { + "epoch": 2.9009599625380473, + "grad_norm": 0.022511860355734825, + "learning_rate": 3.3215370235232246e-07, + "loss": 0.0, + "step": 37170 + }, + { + "epoch": 2.9017404198860532, + "grad_norm": 3.4341320991516113, + "learning_rate": 3.269492146630471e-07, + "loss": 0.4312, + "step": 37180 + }, + { + "epoch": 2.902520877234059, + "grad_norm": 1.03269021565211e-05, + "learning_rate": 3.217856901741312e-07, + "loss": 0.005, + "step": 37190 + }, + { + "epoch": 2.903301334582065, + "grad_norm": 37.59806442260742, + "learning_rate": 3.1666313314370757e-07, + "loss": 0.3451, + "step": 37200 + }, + { + "epoch": 2.904081791930071, + "grad_norm": 0.0004427414678502828, + "learning_rate": 3.1158154779611927e-07, + "loss": 0.0069, + "step": 37210 + }, + { + "epoch": 2.904862249278077, + "grad_norm": 1.5279600620269775, + "learning_rate": 3.065409383219364e-07, + "loss": 0.0609, + "step": 37220 + }, + { + "epoch": 2.905642706626083, + "grad_norm": 0.007380710914731026, + "learning_rate": 3.015413088779229e-07, + "loss": 0.3202, + "step": 37230 + }, + { + "epoch": 2.9064231639740887, + "grad_norm": 6.796607954174735e-14, + "learning_rate": 2.965826635870583e-07, + "loss": 0.2579, + "step": 37240 + }, + { + "epoch": 2.9072036213220946, + "grad_norm": 6.750393338940684e-17, + "learning_rate": 2.916650065385218e-07, + "loss": 0.0001, + "step": 37250 + }, + { + "epoch": 2.9079840786701006, + "grad_norm": 0.0006772010819986463, + "learning_rate": 2.867883417876971e-07, + "loss": 0.0192, + "step": 37260 + }, + { + "epoch": 2.9087645360181065, + "grad_norm": 2.4011129252953824e-14, + "learning_rate": 2.819526733561451e-07, + "loss": 0.0001, + "step": 37270 + }, + { + "epoch": 2.9095449933661124, + "grad_norm": 0.01254895981401205, + "learning_rate": 2.771580052316369e-07, + "loss": 0.003, + "step": 37280 + }, + { + "epoch": 2.9103254507141187, + "grad_norm": 4.720600799146268e-09, + "learning_rate": 2.7240434136812656e-07, + "loss": 0.9636, + "step": 37290 + }, + { + "epoch": 2.911105908062124, + "grad_norm": 1.525995321571827e-05, + "learning_rate": 2.676916856857503e-07, + "loss": 0.0058, + "step": 37300 + }, + { + "epoch": 2.9118863654101306, + "grad_norm": 0.05982762202620506, + "learning_rate": 2.6302004207083284e-07, + "loss": 0.6551, + "step": 37310 + }, + { + "epoch": 2.912666822758136, + "grad_norm": 0.000589911243878305, + "learning_rate": 2.583894143758758e-07, + "loss": 0.0857, + "step": 37320 + }, + { + "epoch": 2.9134472801061424, + "grad_norm": 0.11605760455131531, + "learning_rate": 2.537998064195579e-07, + "loss": 0.0133, + "step": 37330 + }, + { + "epoch": 2.914227737454148, + "grad_norm": 0.3421231508255005, + "learning_rate": 2.4925122198671823e-07, + "loss": 0.6504, + "step": 37340 + }, + { + "epoch": 2.9150081948021542, + "grad_norm": 302.4140319824219, + "learning_rate": 2.447436648283896e-07, + "loss": 0.4466, + "step": 37350 + }, + { + "epoch": 2.91578865215016, + "grad_norm": 0.0016285435995087028, + "learning_rate": 2.4027713866175415e-07, + "loss": 0.2798, + "step": 37360 + }, + { + "epoch": 2.916569109498166, + "grad_norm": 5.190363694396183e-10, + "learning_rate": 2.358516471701544e-07, + "loss": 0.0, + "step": 37370 + }, + { + "epoch": 2.917349566846172, + "grad_norm": 2.233219901664949e-17, + "learning_rate": 2.314671940031099e-07, + "loss": 0.0486, + "step": 37380 + }, + { + "epoch": 2.918130024194178, + "grad_norm": 2.173655033743671e-09, + "learning_rate": 2.2712378277627843e-07, + "loss": 0.4348, + "step": 37390 + }, + { + "epoch": 2.918910481542184, + "grad_norm": 336.09649658203125, + "learning_rate": 2.228214170714893e-07, + "loss": 0.5809, + "step": 37400 + }, + { + "epoch": 2.9196909388901897, + "grad_norm": 147.56405639648438, + "learning_rate": 2.1856010043671548e-07, + "loss": 0.0556, + "step": 37410 + }, + { + "epoch": 2.9204713962381956, + "grad_norm": 1.872330201990735e-08, + "learning_rate": 2.143398363860738e-07, + "loss": 0.0394, + "step": 37420 + }, + { + "epoch": 2.9212518535862015, + "grad_norm": 5.727499683416681e-06, + "learning_rate": 2.1016062839983587e-07, + "loss": 0.0003, + "step": 37430 + }, + { + "epoch": 2.9220323109342075, + "grad_norm": 68.21370697021484, + "learning_rate": 2.0602247992441704e-07, + "loss": 0.0325, + "step": 37440 + }, + { + "epoch": 2.9228127682822134, + "grad_norm": 79.4469985961914, + "learning_rate": 2.019253943723598e-07, + "loss": 0.0433, + "step": 37450 + }, + { + "epoch": 2.9235932256302193, + "grad_norm": 3.062634368333761e-09, + "learning_rate": 1.9786937512235591e-07, + "loss": 0.9574, + "step": 37460 + }, + { + "epoch": 2.924373682978225, + "grad_norm": 0.0030439510010182858, + "learning_rate": 1.9385442551922428e-07, + "loss": 0.2112, + "step": 37470 + }, + { + "epoch": 2.925154140326231, + "grad_norm": 3.254121012366204e-15, + "learning_rate": 1.8988054887392747e-07, + "loss": 0.2216, + "step": 37480 + }, + { + "epoch": 2.925934597674237, + "grad_norm": 1.1417990890549845e-06, + "learning_rate": 1.8594774846353857e-07, + "loss": 0.7698, + "step": 37490 + }, + { + "epoch": 2.926715055022243, + "grad_norm": 4.754112346307647e-14, + "learning_rate": 1.8205602753127438e-07, + "loss": 0.0678, + "step": 37500 + }, + { + "epoch": 2.927495512370249, + "grad_norm": 1.8571523696664372e-07, + "learning_rate": 1.7820538928646213e-07, + "loss": 0.5082, + "step": 37510 + }, + { + "epoch": 2.9282759697182548, + "grad_norm": 0.005671486724168062, + "learning_rate": 1.7439583690455618e-07, + "loss": 1.6796, + "step": 37520 + }, + { + "epoch": 2.9290564270662607, + "grad_norm": 8.371974224985479e-10, + "learning_rate": 1.7062737352713244e-07, + "loss": 0.0002, + "step": 37530 + }, + { + "epoch": 2.9298368844142666, + "grad_norm": 343.91131591796875, + "learning_rate": 1.6690000226187164e-07, + "loss": 3.0723, + "step": 37540 + }, + { + "epoch": 2.9306173417622725, + "grad_norm": 0.007303249090909958, + "learning_rate": 1.6321372618257613e-07, + "loss": 0.8656, + "step": 37550 + }, + { + "epoch": 2.931397799110279, + "grad_norm": 274.3439025878906, + "learning_rate": 1.5956854832916425e-07, + "loss": 0.3517, + "step": 37560 + }, + { + "epoch": 2.9321782564582843, + "grad_norm": 5.482262355942322e-15, + "learning_rate": 1.5596447170764806e-07, + "loss": 0.2141, + "step": 37570 + }, + { + "epoch": 2.9329587138062907, + "grad_norm": 3.0375571849483585e-13, + "learning_rate": 1.5240149929016122e-07, + "loss": 0.0001, + "step": 37580 + }, + { + "epoch": 2.933739171154296, + "grad_norm": 0.00010239938274025917, + "learning_rate": 1.488796340149201e-07, + "loss": 1.1344, + "step": 37590 + }, + { + "epoch": 2.9345196285023025, + "grad_norm": 1.0532202168178628e-06, + "learning_rate": 1.45398878786257e-07, + "loss": 2.5855, + "step": 37600 + }, + { + "epoch": 2.9353000858503084, + "grad_norm": 0.02686687372624874, + "learning_rate": 1.4195923647460364e-07, + "loss": 0.0826, + "step": 37610 + }, + { + "epoch": 2.9360805431983144, + "grad_norm": 6.217739610292483e-06, + "learning_rate": 1.3856070991647985e-07, + "loss": 0.1367, + "step": 37620 + }, + { + "epoch": 2.9368610005463203, + "grad_norm": 313.4095153808594, + "learning_rate": 1.3520330191449936e-07, + "loss": 0.9538, + "step": 37630 + }, + { + "epoch": 2.937641457894326, + "grad_norm": 2.7152030759392334e-15, + "learning_rate": 1.318870152373808e-07, + "loss": 0.1295, + "step": 37640 + }, + { + "epoch": 2.938421915242332, + "grad_norm": 3.337225393806875e-07, + "learning_rate": 1.2861185261990872e-07, + "loss": 0.0, + "step": 37650 + }, + { + "epoch": 2.939202372590338, + "grad_norm": 282.7817687988281, + "learning_rate": 1.2537781676297268e-07, + "loss": 3.676, + "step": 37660 + }, + { + "epoch": 2.939982829938344, + "grad_norm": 8.363390406884719e-06, + "learning_rate": 1.2218491033354484e-07, + "loss": 0.0, + "step": 37670 + }, + { + "epoch": 2.94076328728635, + "grad_norm": 0.0, + "learning_rate": 1.1903313596466903e-07, + "loss": 0.0003, + "step": 37680 + }, + { + "epoch": 2.9415437446343558, + "grad_norm": 0.005267214495688677, + "learning_rate": 1.1592249625548279e-07, + "loss": 1.2254, + "step": 37690 + }, + { + "epoch": 2.9423242019823617, + "grad_norm": 7.535186918872228e-10, + "learning_rate": 1.1285299377118974e-07, + "loss": 0.7656, + "step": 37700 + }, + { + "epoch": 2.9431046593303676, + "grad_norm": 59.69273376464844, + "learning_rate": 1.0982463104307617e-07, + "loss": 0.158, + "step": 37710 + }, + { + "epoch": 2.9438851166783735, + "grad_norm": 7.175589416874573e-05, + "learning_rate": 1.0683741056849994e-07, + "loss": 0.6752, + "step": 37720 + }, + { + "epoch": 2.9446655740263794, + "grad_norm": 334.2007141113281, + "learning_rate": 1.0389133481089608e-07, + "loss": 1.0146, + "step": 37730 + }, + { + "epoch": 2.9454460313743853, + "grad_norm": 2.3582126118526503e-07, + "learning_rate": 1.0098640619976007e-07, + "loss": 0.8188, + "step": 37740 + }, + { + "epoch": 2.9462264887223912, + "grad_norm": 0.0169204231351614, + "learning_rate": 9.812262713066455e-08, + "loss": 0.005, + "step": 37750 + }, + { + "epoch": 2.947006946070397, + "grad_norm": 52.56903076171875, + "learning_rate": 9.529999996524264e-08, + "loss": 0.4055, + "step": 37760 + }, + { + "epoch": 2.947787403418403, + "grad_norm": 2.238950173705234e-06, + "learning_rate": 9.251852703118235e-08, + "loss": 0.0862, + "step": 37770 + }, + { + "epoch": 2.948567860766409, + "grad_norm": 0.10402812063694, + "learning_rate": 8.977821062225444e-08, + "loss": 1.1434, + "step": 37780 + }, + { + "epoch": 2.949348318114415, + "grad_norm": 7.077328205108643, + "learning_rate": 8.707905299827346e-08, + "loss": 0.0021, + "step": 37790 + }, + { + "epoch": 2.950128775462421, + "grad_norm": 0.0004994513583369553, + "learning_rate": 8.442105638511998e-08, + "loss": 0.5704, + "step": 37800 + }, + { + "epoch": 2.950909232810427, + "grad_norm": 34.23684310913086, + "learning_rate": 8.180422297472956e-08, + "loss": 1.4182, + "step": 37810 + }, + { + "epoch": 2.9516896901584326, + "grad_norm": 0.00012013528612442315, + "learning_rate": 7.922855492508153e-08, + "loss": 0.0009, + "step": 37820 + }, + { + "epoch": 2.952470147506439, + "grad_norm": 8.701053072490694e-11, + "learning_rate": 7.66940543602268e-08, + "loss": 0.0001, + "step": 37830 + }, + { + "epoch": 2.9532506048544445, + "grad_norm": 3.51215390104187e-09, + "learning_rate": 7.420072337025464e-08, + "loss": 0.0023, + "step": 37840 + }, + { + "epoch": 2.954031062202451, + "grad_norm": 3.1763735522036263e-22, + "learning_rate": 7.174856401129804e-08, + "loss": 0.0, + "step": 37850 + }, + { + "epoch": 2.9548115195504567, + "grad_norm": 4.604586711920433e-10, + "learning_rate": 6.933757830556164e-08, + "loss": 0.3049, + "step": 37860 + }, + { + "epoch": 2.9555919768984626, + "grad_norm": 0.0019449929241091013, + "learning_rate": 6.696776824127171e-08, + "loss": 0.0023, + "step": 37870 + }, + { + "epoch": 2.9563724342464686, + "grad_norm": 168.966552734375, + "learning_rate": 6.463913577270386e-08, + "loss": 0.0901, + "step": 37880 + }, + { + "epoch": 2.9571528915944745, + "grad_norm": 2.1862453181142882e-08, + "learning_rate": 6.235168282018866e-08, + "loss": 0.0011, + "step": 37890 + }, + { + "epoch": 2.9579333489424804, + "grad_norm": 5.355248049454531e-07, + "learning_rate": 6.01054112700783e-08, + "loss": 0.0021, + "step": 37900 + }, + { + "epoch": 2.9587138062904863, + "grad_norm": 4.50494515348847e-15, + "learning_rate": 5.7900322974785427e-08, + "loss": 0.6629, + "step": 37910 + }, + { + "epoch": 2.959494263638492, + "grad_norm": 504.38427734375, + "learning_rate": 5.57364197527388e-08, + "loss": 0.7332, + "step": 37920 + }, + { + "epoch": 2.960274720986498, + "grad_norm": 1.0105886960598554e-10, + "learning_rate": 5.361370338842764e-08, + "loss": 0.0002, + "step": 37930 + }, + { + "epoch": 2.961055178334504, + "grad_norm": 5.368253255255695e-07, + "learning_rate": 5.153217563235724e-08, + "loss": 0.1828, + "step": 37940 + }, + { + "epoch": 2.96183563568251, + "grad_norm": 3.649755736349902e-13, + "learning_rate": 4.9491838201076727e-08, + "loss": 0.0, + "step": 37950 + }, + { + "epoch": 2.962616093030516, + "grad_norm": 0.0007102579111233354, + "learning_rate": 4.749269277715685e-08, + "loss": 0.0084, + "step": 37960 + }, + { + "epoch": 2.963396550378522, + "grad_norm": 71.00413513183594, + "learning_rate": 4.553474100920663e-08, + "loss": 0.7264, + "step": 37970 + }, + { + "epoch": 2.9641770077265277, + "grad_norm": 8.966297855295124e-07, + "learning_rate": 4.361798451186783e-08, + "loss": 0.6338, + "step": 37980 + }, + { + "epoch": 2.9649574650745336, + "grad_norm": 0.019383694976568222, + "learning_rate": 4.174242486580382e-08, + "loss": 0.0, + "step": 37990 + }, + { + "epoch": 2.9657379224225395, + "grad_norm": 6.672384188277647e-05, + "learning_rate": 3.9908063617710714e-08, + "loss": 0.3395, + "step": 38000 + }, + { + "epoch": 2.9665183797705454, + "grad_norm": 229.11349487304688, + "learning_rate": 3.811490228030068e-08, + "loss": 0.2167, + "step": 38010 + }, + { + "epoch": 2.9672988371185514, + "grad_norm": 0.004529143683612347, + "learning_rate": 3.6362942332313075e-08, + "loss": 0.0106, + "step": 38020 + }, + { + "epoch": 2.9680792944665573, + "grad_norm": 0.0003937554429285228, + "learning_rate": 3.4652185218525535e-08, + "loss": 0.0688, + "step": 38030 + }, + { + "epoch": 2.968859751814563, + "grad_norm": 213.34629821777344, + "learning_rate": 3.298263234970955e-08, + "loss": 0.2664, + "step": 38040 + }, + { + "epoch": 2.969640209162569, + "grad_norm": 3.023458219897357e-13, + "learning_rate": 3.135428510268601e-08, + "loss": 0.0103, + "step": 38050 + }, + { + "epoch": 2.9704206665105755, + "grad_norm": 0.4616576135158539, + "learning_rate": 2.9767144820275207e-08, + "loss": 0.2242, + "step": 38060 + }, + { + "epoch": 2.971201123858581, + "grad_norm": 0.009038741700351238, + "learning_rate": 2.8221212811324615e-08, + "loss": 0.177, + "step": 38070 + }, + { + "epoch": 2.9719815812065873, + "grad_norm": 0.0001392298872815445, + "learning_rate": 2.6716490350692237e-08, + "loss": 0.4559, + "step": 38080 + }, + { + "epoch": 2.9727620385545928, + "grad_norm": 2.2928576253616623e-15, + "learning_rate": 2.525297867926324e-08, + "loss": 0.0002, + "step": 38090 + }, + { + "epoch": 2.973542495902599, + "grad_norm": 5.54908638150664e-07, + "learning_rate": 2.3830679003927768e-08, + "loss": 1.0038, + "step": 38100 + }, + { + "epoch": 2.9743229532506046, + "grad_norm": 4.5243123136806673e-10, + "learning_rate": 2.2449592497597593e-08, + "loss": 2.0914, + "step": 38110 + }, + { + "epoch": 2.975103410598611, + "grad_norm": 3.340051625855267e-05, + "learning_rate": 2.110972029918945e-08, + "loss": 0.2649, + "step": 38120 + }, + { + "epoch": 2.975883867946617, + "grad_norm": 1.426492843847882e-07, + "learning_rate": 1.9811063513647256e-08, + "loss": 0.3439, + "step": 38130 + }, + { + "epoch": 2.9766643252946228, + "grad_norm": 2.1971930890240512e-15, + "learning_rate": 1.8553623211903236e-08, + "loss": 0.0, + "step": 38140 + }, + { + "epoch": 2.9774447826426287, + "grad_norm": 5.232332900284575e-10, + "learning_rate": 1.733740043092791e-08, + "loss": 1.475, + "step": 38150 + }, + { + "epoch": 2.9782252399906346, + "grad_norm": 0.12338301539421082, + "learning_rate": 1.6162396173674544e-08, + "loss": 0.0095, + "step": 38160 + }, + { + "epoch": 2.9790056973386405, + "grad_norm": 4.5922202843939885e-05, + "learning_rate": 1.5028611409129146e-08, + "loss": 1.14, + "step": 38170 + }, + { + "epoch": 2.9797861546866464, + "grad_norm": 0.004267883487045765, + "learning_rate": 1.3936047072266034e-08, + "loss": 0.0202, + "step": 38180 + }, + { + "epoch": 2.9805666120346523, + "grad_norm": 3.284040417383949e-07, + "learning_rate": 1.2884704064075603e-08, + "loss": 0.0, + "step": 38190 + }, + { + "epoch": 2.9813470693826583, + "grad_norm": 0.00014708541857544333, + "learning_rate": 1.187458325156432e-08, + "loss": 1.3839, + "step": 38200 + }, + { + "epoch": 2.982127526730664, + "grad_norm": 1.2784296359313885e-06, + "learning_rate": 1.0905685467721417e-08, + "loss": 1.0751, + "step": 38210 + }, + { + "epoch": 2.98290798407867, + "grad_norm": 7.825324535369873, + "learning_rate": 9.9780115115633e-09, + "loss": 0.7728, + "step": 38220 + }, + { + "epoch": 2.983688441426676, + "grad_norm": 2.2648984909778846e-08, + "learning_rate": 9.091562148100252e-09, + "loss": 0.6783, + "step": 38230 + }, + { + "epoch": 2.984468898774682, + "grad_norm": 0.1770647168159485, + "learning_rate": 8.246338108347518e-09, + "loss": 1.2771, + "step": 38240 + }, + { + "epoch": 2.985249356122688, + "grad_norm": 0.0, + "learning_rate": 7.442340089330868e-09, + "loss": 0.6814, + "step": 38250 + }, + { + "epoch": 2.9860298134706937, + "grad_norm": 2.466651380927942e-07, + "learning_rate": 6.67956875405884e-09, + "loss": 0.2075, + "step": 38260 + }, + { + "epoch": 2.9868102708186997, + "grad_norm": 6.2170888486738456e-15, + "learning_rate": 5.958024731567147e-09, + "loss": 1.4441, + "step": 38270 + }, + { + "epoch": 2.9875907281667056, + "grad_norm": 2.349893390984903e-09, + "learning_rate": 5.277708616879817e-09, + "loss": 0.0633, + "step": 38280 + }, + { + "epoch": 2.9883711855147115, + "grad_norm": 4.472295131563442e-06, + "learning_rate": 4.638620971020302e-09, + "loss": 1.0314, + "step": 38290 + }, + { + "epoch": 2.9891516428627174, + "grad_norm": 2.1543214254056445e-21, + "learning_rate": 4.0407623210225735e-09, + "loss": 0.8742, + "step": 38300 + }, + { + "epoch": 2.9899321002107238, + "grad_norm": 2.3384396446269684e-15, + "learning_rate": 3.484133159903369e-09, + "loss": 0.016, + "step": 38310 + }, + { + "epoch": 2.9907125575587292, + "grad_norm": 176.8972930908203, + "learning_rate": 2.9687339467010523e-09, + "loss": 0.156, + "step": 38320 + }, + { + "epoch": 2.9914930149067356, + "grad_norm": 403.177734375, + "learning_rate": 2.4945651064423036e-09, + "loss": 0.9814, + "step": 38330 + }, + { + "epoch": 2.992273472254741, + "grad_norm": 0.0004511200822889805, + "learning_rate": 2.0616270301476727e-09, + "loss": 0.2183, + "step": 38340 + }, + { + "epoch": 2.9930539296027474, + "grad_norm": 0.0015130466781556606, + "learning_rate": 1.6699200748482301e-09, + "loss": 0.0, + "step": 38350 + }, + { + "epoch": 2.993834386950753, + "grad_norm": 31.965559005737305, + "learning_rate": 1.3194445635578144e-09, + "loss": 0.5585, + "step": 38360 + }, + { + "epoch": 2.9946148442987592, + "grad_norm": 3.413600999091493e-11, + "learning_rate": 1.0102007853118878e-09, + "loss": 0.2753, + "step": 38370 + }, + { + "epoch": 2.995395301646765, + "grad_norm": 7.039468571539655e-21, + "learning_rate": 7.421889951231276e-10, + "loss": 0.0101, + "step": 38380 + }, + { + "epoch": 2.996175758994771, + "grad_norm": 0.17176811397075653, + "learning_rate": 5.154094140036314e-10, + "loss": 1.0048, + "step": 38390 + }, + { + "epoch": 2.996956216342777, + "grad_norm": 3.7634078076109745e-09, + "learning_rate": 3.2986222898157005e-10, + "loss": 0.857, + "step": 38400 + }, + { + "epoch": 2.997736673690783, + "grad_norm": 9.749450691742823e-05, + "learning_rate": 1.855475930567785e-10, + "loss": 0.0004, + "step": 38410 + }, + { + "epoch": 2.998517131038789, + "grad_norm": 0.10597077757120132, + "learning_rate": 8.24656252507161e-11, + "loss": 1.1732, + "step": 38420 + }, + { + "epoch": 2.9992975883867947, + "grad_norm": 4.119867882040667e-15, + "learning_rate": 2.0616410562057653e-11, + "loss": 0.1238, + "step": 38430 + } + ], + "logging_steps": 10, + "max_steps": 38439, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}