diff --git "a/artifacts/training/trainer_state.json" "b/artifacts/training/trainer_state.json" new file mode 100644--- /dev/null +++ "b/artifacts/training/trainer_state.json" @@ -0,0 +1,13182 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 18779, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000532509718302359, + "grad_norm": 23.296741485595703, + "learning_rate": 9.584664536741213e-07, + "loss": 7.8793, + "step": 10 + }, + { + "epoch": 0.001065019436604718, + "grad_norm": 19.85402488708496, + "learning_rate": 2.023429179978701e-06, + "loss": 7.7093, + "step": 20 + }, + { + "epoch": 0.001597529154907077, + "grad_norm": 10.07481861114502, + "learning_rate": 3.08839190628328e-06, + "loss": 7.0137, + "step": 30 + }, + { + "epoch": 0.002130038873209436, + "grad_norm": 5.072307109832764, + "learning_rate": 4.153354632587859e-06, + "loss": 6.1742, + "step": 40 + }, + { + "epoch": 0.002662548591511795, + "grad_norm": 3.5329959392547607, + "learning_rate": 5.218317358892439e-06, + "loss": 5.6068, + "step": 50 + }, + { + "epoch": 0.003195058309814154, + "grad_norm": 3.8453404903411865, + "learning_rate": 6.283280085197018e-06, + "loss": 5.1784, + "step": 60 + }, + { + "epoch": 0.003727568028116513, + "grad_norm": 6.184778213500977, + "learning_rate": 7.3482428115015974e-06, + "loss": 4.6031, + "step": 70 + }, + { + "epoch": 0.004260077746418872, + "grad_norm": 4.906091690063477, + "learning_rate": 8.413205537806178e-06, + "loss": 3.9631, + "step": 80 + }, + { + "epoch": 0.004792587464721231, + "grad_norm": 9.875988960266113, + "learning_rate": 9.478168264110757e-06, + "loss": 3.5113, + "step": 90 + }, + { + "epoch": 0.00532509718302359, + "grad_norm": 5.586822986602783, + "learning_rate": 1.0543130990415335e-05, + "loss": 3.1884, + "step": 100 + }, + { + "epoch": 0.005857606901325949, + "grad_norm": 9.180880546569824, + "learning_rate": 1.1608093716719916e-05, + "loss": 2.9137, + "step": 110 + }, + { + "epoch": 0.006390116619628308, + "grad_norm": 17.583784103393555, + "learning_rate": 1.2673056443024495e-05, + "loss": 2.7283, + "step": 120 + }, + { + "epoch": 0.006922626337930667, + "grad_norm": 16.766233444213867, + "learning_rate": 1.3738019169329076e-05, + "loss": 2.5699, + "step": 130 + }, + { + "epoch": 0.007455136056233026, + "grad_norm": 11.388614654541016, + "learning_rate": 1.4802981895633653e-05, + "loss": 2.4291, + "step": 140 + }, + { + "epoch": 0.007987645774535385, + "grad_norm": 18.473289489746094, + "learning_rate": 1.5867944621938232e-05, + "loss": 2.3001, + "step": 150 + }, + { + "epoch": 0.008520155492837744, + "grad_norm": 12.690078735351562, + "learning_rate": 1.693290734824281e-05, + "loss": 2.1744, + "step": 160 + }, + { + "epoch": 0.009052665211140103, + "grad_norm": 10.144042015075684, + "learning_rate": 1.799787007454739e-05, + "loss": 2.0552, + "step": 170 + }, + { + "epoch": 0.009585174929442462, + "grad_norm": 11.107041358947754, + "learning_rate": 1.906283280085197e-05, + "loss": 1.9585, + "step": 180 + }, + { + "epoch": 0.010117684647744821, + "grad_norm": 14.497051239013672, + "learning_rate": 2.0127795527156552e-05, + "loss": 1.8718, + "step": 190 + }, + { + "epoch": 0.01065019436604718, + "grad_norm": 10.508237838745117, + "learning_rate": 2.1192758253461128e-05, + "loss": 1.8153, + "step": 200 + }, + { + "epoch": 0.01118270408434954, + "grad_norm": 11.81551742553711, + "learning_rate": 2.2257720979765707e-05, + "loss": 1.7521, + "step": 210 + }, + { + "epoch": 0.011715213802651898, + "grad_norm": 7.003968238830566, + "learning_rate": 2.332268370607029e-05, + "loss": 1.7067, + "step": 220 + }, + { + "epoch": 0.012247723520954257, + "grad_norm": 9.637007713317871, + "learning_rate": 2.438764643237487e-05, + "loss": 1.658, + "step": 230 + }, + { + "epoch": 0.012780233239256616, + "grad_norm": 11.963647842407227, + "learning_rate": 2.5452609158679448e-05, + "loss": 1.6003, + "step": 240 + }, + { + "epoch": 0.013312742957558975, + "grad_norm": 15.572464942932129, + "learning_rate": 2.6517571884984027e-05, + "loss": 1.5396, + "step": 250 + }, + { + "epoch": 0.013845252675861335, + "grad_norm": 10.560100555419922, + "learning_rate": 2.7582534611288606e-05, + "loss": 1.4679, + "step": 260 + }, + { + "epoch": 0.014377762394163694, + "grad_norm": 14.625675201416016, + "learning_rate": 2.864749733759319e-05, + "loss": 1.3871, + "step": 270 + }, + { + "epoch": 0.014910272112466053, + "grad_norm": 15.250794410705566, + "learning_rate": 2.971246006389776e-05, + "loss": 1.2908, + "step": 280 + }, + { + "epoch": 0.015442781830768412, + "grad_norm": 10.370095252990723, + "learning_rate": 3.0777422790202344e-05, + "loss": 1.1773, + "step": 290 + }, + { + "epoch": 0.01597529154907077, + "grad_norm": 14.734580993652344, + "learning_rate": 3.1842385516506926e-05, + "loss": 1.0634, + "step": 300 + }, + { + "epoch": 0.01650780126737313, + "grad_norm": 11.359335899353027, + "learning_rate": 3.29073482428115e-05, + "loss": 0.9367, + "step": 310 + }, + { + "epoch": 0.01704031098567549, + "grad_norm": 15.065919876098633, + "learning_rate": 3.3972310969116084e-05, + "loss": 0.851, + "step": 320 + }, + { + "epoch": 0.017572820703977848, + "grad_norm": 11.290328025817871, + "learning_rate": 3.503727369542067e-05, + "loss": 0.7698, + "step": 330 + }, + { + "epoch": 0.018105330422280207, + "grad_norm": 9.410698890686035, + "learning_rate": 3.610223642172524e-05, + "loss": 0.6942, + "step": 340 + }, + { + "epoch": 0.018637840140582566, + "grad_norm": 7.125499725341797, + "learning_rate": 3.716719914802982e-05, + "loss": 0.6353, + "step": 350 + }, + { + "epoch": 0.019170349858884925, + "grad_norm": 11.152689933776855, + "learning_rate": 3.82321618743344e-05, + "loss": 0.6036, + "step": 360 + }, + { + "epoch": 0.019702859577187284, + "grad_norm": 7.263124465942383, + "learning_rate": 3.929712460063898e-05, + "loss": 0.5523, + "step": 370 + }, + { + "epoch": 0.020235369295489643, + "grad_norm": 6.285194396972656, + "learning_rate": 4.036208732694356e-05, + "loss": 0.5131, + "step": 380 + }, + { + "epoch": 0.020767879013792002, + "grad_norm": 3.8969569206237793, + "learning_rate": 4.142705005324814e-05, + "loss": 0.4857, + "step": 390 + }, + { + "epoch": 0.02130038873209436, + "grad_norm": 4.850637912750244, + "learning_rate": 4.249201277955272e-05, + "loss": 0.4606, + "step": 400 + }, + { + "epoch": 0.02183289845039672, + "grad_norm": 3.3862061500549316, + "learning_rate": 4.355697550585729e-05, + "loss": 0.4423, + "step": 410 + }, + { + "epoch": 0.02236540816869908, + "grad_norm": 3.709831714630127, + "learning_rate": 4.4621938232161876e-05, + "loss": 0.4269, + "step": 420 + }, + { + "epoch": 0.022897917887001438, + "grad_norm": 3.6177499294281006, + "learning_rate": 4.568690095846646e-05, + "loss": 0.4109, + "step": 430 + }, + { + "epoch": 0.023430427605303797, + "grad_norm": 3.2016589641571045, + "learning_rate": 4.6751863684771034e-05, + "loss": 0.4038, + "step": 440 + }, + { + "epoch": 0.023962937323606156, + "grad_norm": 3.1846377849578857, + "learning_rate": 4.781682641107562e-05, + "loss": 0.3922, + "step": 450 + }, + { + "epoch": 0.024495447041908515, + "grad_norm": 3.7085001468658447, + "learning_rate": 4.88817891373802e-05, + "loss": 0.3852, + "step": 460 + }, + { + "epoch": 0.025027956760210874, + "grad_norm": 1.9565516710281372, + "learning_rate": 4.994675186368477e-05, + "loss": 0.3727, + "step": 470 + }, + { + "epoch": 0.025560466478513233, + "grad_norm": 2.378927230834961, + "learning_rate": 5.101171458998936e-05, + "loss": 0.3654, + "step": 480 + }, + { + "epoch": 0.026092976196815592, + "grad_norm": 1.7363628149032593, + "learning_rate": 5.207667731629393e-05, + "loss": 0.3528, + "step": 490 + }, + { + "epoch": 0.02662548591511795, + "grad_norm": 2.0001909732818604, + "learning_rate": 5.314164004259851e-05, + "loss": 0.3472, + "step": 500 + }, + { + "epoch": 0.02715799563342031, + "grad_norm": 2.2477715015411377, + "learning_rate": 5.420660276890309e-05, + "loss": 0.3334, + "step": 510 + }, + { + "epoch": 0.02769050535172267, + "grad_norm": 1.9201889038085938, + "learning_rate": 5.527156549520767e-05, + "loss": 0.3183, + "step": 520 + }, + { + "epoch": 0.028223015070025028, + "grad_norm": 1.6685700416564941, + "learning_rate": 5.633652822151225e-05, + "loss": 0.3067, + "step": 530 + }, + { + "epoch": 0.028755524788327387, + "grad_norm": 1.8929866552352905, + "learning_rate": 5.7401490947816826e-05, + "loss": 0.293, + "step": 540 + }, + { + "epoch": 0.029288034506629746, + "grad_norm": 1.649090051651001, + "learning_rate": 5.8466453674121415e-05, + "loss": 0.2802, + "step": 550 + }, + { + "epoch": 0.029820544224932105, + "grad_norm": 0.9757211804389954, + "learning_rate": 5.953141640042599e-05, + "loss": 0.2706, + "step": 560 + }, + { + "epoch": 0.030353053943234464, + "grad_norm": 1.1477692127227783, + "learning_rate": 6.059637912673056e-05, + "loss": 0.2616, + "step": 570 + }, + { + "epoch": 0.030885563661536823, + "grad_norm": 1.2151044607162476, + "learning_rate": 6.166134185303514e-05, + "loss": 0.2488, + "step": 580 + }, + { + "epoch": 0.03141807337983918, + "grad_norm": 0.9159660339355469, + "learning_rate": 6.272630457933972e-05, + "loss": 0.2435, + "step": 590 + }, + { + "epoch": 0.03195058309814154, + "grad_norm": 0.9337270855903625, + "learning_rate": 6.379126730564431e-05, + "loss": 0.2341, + "step": 600 + }, + { + "epoch": 0.0324830928164439, + "grad_norm": 1.3477636575698853, + "learning_rate": 6.485623003194888e-05, + "loss": 0.2271, + "step": 610 + }, + { + "epoch": 0.03301560253474626, + "grad_norm": 0.9192898869514465, + "learning_rate": 6.592119275825347e-05, + "loss": 0.2222, + "step": 620 + }, + { + "epoch": 0.033548112253048615, + "grad_norm": 0.6390801668167114, + "learning_rate": 6.698615548455805e-05, + "loss": 0.2165, + "step": 630 + }, + { + "epoch": 0.03408062197135098, + "grad_norm": 0.6918138265609741, + "learning_rate": 6.805111821086262e-05, + "loss": 0.213, + "step": 640 + }, + { + "epoch": 0.03461313168965333, + "grad_norm": 0.6979911923408508, + "learning_rate": 6.91160809371672e-05, + "loss": 0.2126, + "step": 650 + }, + { + "epoch": 0.035145641407955695, + "grad_norm": 0.7399368286132812, + "learning_rate": 7.018104366347178e-05, + "loss": 0.2094, + "step": 660 + }, + { + "epoch": 0.03567815112625805, + "grad_norm": 0.7001500725746155, + "learning_rate": 7.124600638977636e-05, + "loss": 0.2083, + "step": 670 + }, + { + "epoch": 0.03621066084456041, + "grad_norm": 0.7533488273620605, + "learning_rate": 7.231096911608094e-05, + "loss": 0.207, + "step": 680 + }, + { + "epoch": 0.03674317056286277, + "grad_norm": 0.6705746054649353, + "learning_rate": 7.337593184238552e-05, + "loss": 0.2053, + "step": 690 + }, + { + "epoch": 0.03727568028116513, + "grad_norm": 0.4637382924556732, + "learning_rate": 7.44408945686901e-05, + "loss": 0.204, + "step": 700 + }, + { + "epoch": 0.03780818999946749, + "grad_norm": 0.40789374709129333, + "learning_rate": 7.550585729499468e-05, + "loss": 0.2019, + "step": 710 + }, + { + "epoch": 0.03834069971776985, + "grad_norm": 0.591678261756897, + "learning_rate": 7.657082002129926e-05, + "loss": 0.2011, + "step": 720 + }, + { + "epoch": 0.038873209436072205, + "grad_norm": 0.5219926834106445, + "learning_rate": 7.763578274760383e-05, + "loss": 0.1987, + "step": 730 + }, + { + "epoch": 0.03940571915437457, + "grad_norm": 0.47858574986457825, + "learning_rate": 7.870074547390842e-05, + "loss": 0.1976, + "step": 740 + }, + { + "epoch": 0.03993822887267692, + "grad_norm": 0.752047061920166, + "learning_rate": 7.9765708200213e-05, + "loss": 0.1987, + "step": 750 + }, + { + "epoch": 0.040470738590979285, + "grad_norm": 0.4126583933830261, + "learning_rate": 8.083067092651757e-05, + "loss": 0.1971, + "step": 760 + }, + { + "epoch": 0.04100324830928164, + "grad_norm": 0.8404585719108582, + "learning_rate": 8.189563365282216e-05, + "loss": 0.1948, + "step": 770 + }, + { + "epoch": 0.041535758027584004, + "grad_norm": 0.5083792209625244, + "learning_rate": 8.296059637912672e-05, + "loss": 0.194, + "step": 780 + }, + { + "epoch": 0.04206826774588636, + "grad_norm": 0.3890551030635834, + "learning_rate": 8.402555910543131e-05, + "loss": 0.1923, + "step": 790 + }, + { + "epoch": 0.04260077746418872, + "grad_norm": 0.7016918063163757, + "learning_rate": 8.509052183173589e-05, + "loss": 0.1912, + "step": 800 + }, + { + "epoch": 0.04313328718249108, + "grad_norm": 0.44527336955070496, + "learning_rate": 8.615548455804048e-05, + "loss": 0.1919, + "step": 810 + }, + { + "epoch": 0.04366579690079344, + "grad_norm": 0.3990408778190613, + "learning_rate": 8.722044728434506e-05, + "loss": 0.1905, + "step": 820 + }, + { + "epoch": 0.044198306619095795, + "grad_norm": 0.3964357078075409, + "learning_rate": 8.828541001064963e-05, + "loss": 0.1882, + "step": 830 + }, + { + "epoch": 0.04473081633739816, + "grad_norm": 0.6267169713973999, + "learning_rate": 8.93503727369542e-05, + "loss": 0.1894, + "step": 840 + }, + { + "epoch": 0.04526332605570051, + "grad_norm": 0.3614656329154968, + "learning_rate": 9.041533546325878e-05, + "loss": 0.1873, + "step": 850 + }, + { + "epoch": 0.045795835774002876, + "grad_norm": 0.3725983202457428, + "learning_rate": 9.148029818956337e-05, + "loss": 0.1854, + "step": 860 + }, + { + "epoch": 0.04632834549230523, + "grad_norm": 0.7198257446289062, + "learning_rate": 9.254526091586795e-05, + "loss": 0.1854, + "step": 870 + }, + { + "epoch": 0.046860855210607594, + "grad_norm": 0.5347720980644226, + "learning_rate": 9.361022364217252e-05, + "loss": 0.1856, + "step": 880 + }, + { + "epoch": 0.04739336492890995, + "grad_norm": 0.36126938462257385, + "learning_rate": 9.467518636847711e-05, + "loss": 0.1832, + "step": 890 + }, + { + "epoch": 0.04792587464721231, + "grad_norm": 0.5364170670509338, + "learning_rate": 9.574014909478169e-05, + "loss": 0.1837, + "step": 900 + }, + { + "epoch": 0.04845838436551467, + "grad_norm": 0.3289523422718048, + "learning_rate": 9.680511182108626e-05, + "loss": 0.1819, + "step": 910 + }, + { + "epoch": 0.04899089408381703, + "grad_norm": 0.3482621908187866, + "learning_rate": 9.787007454739084e-05, + "loss": 0.1825, + "step": 920 + }, + { + "epoch": 0.049523403802119385, + "grad_norm": 0.5768856406211853, + "learning_rate": 9.893503727369543e-05, + "loss": 0.1829, + "step": 930 + }, + { + "epoch": 0.05005591352042175, + "grad_norm": 0.337167352437973, + "learning_rate": 0.0001, + "loss": 0.1821, + "step": 940 + }, + { + "epoch": 0.0505884232387241, + "grad_norm": 0.2395765632390976, + "learning_rate": 9.99999457314795e-05, + "loss": 0.1807, + "step": 950 + }, + { + "epoch": 0.051120932957026466, + "grad_norm": 0.37906885147094727, + "learning_rate": 9.999978292608627e-05, + "loss": 0.1795, + "step": 960 + }, + { + "epoch": 0.05165344267532882, + "grad_norm": 0.4830165207386017, + "learning_rate": 9.999951158432521e-05, + "loss": 0.1801, + "step": 970 + }, + { + "epoch": 0.052185952393631184, + "grad_norm": 0.3381877839565277, + "learning_rate": 9.999913170703776e-05, + "loss": 0.1786, + "step": 980 + }, + { + "epoch": 0.05271846211193354, + "grad_norm": 0.4923486113548279, + "learning_rate": 9.999864329540193e-05, + "loss": 0.1792, + "step": 990 + }, + { + "epoch": 0.0532509718302359, + "grad_norm": 0.29676854610443115, + "learning_rate": 9.999804635093233e-05, + "loss": 0.1775, + "step": 1000 + }, + { + "epoch": 0.05378348154853826, + "grad_norm": 0.5925562977790833, + "learning_rate": 9.999734087548009e-05, + "loss": 0.1774, + "step": 1010 + }, + { + "epoch": 0.05431599126684062, + "grad_norm": 0.5015407800674438, + "learning_rate": 9.999652687123293e-05, + "loss": 0.1762, + "step": 1020 + }, + { + "epoch": 0.054848500985142976, + "grad_norm": 0.2539210915565491, + "learning_rate": 9.999560434071517e-05, + "loss": 0.1756, + "step": 1030 + }, + { + "epoch": 0.05538101070344534, + "grad_norm": 0.3802579343318939, + "learning_rate": 9.999457328678761e-05, + "loss": 0.1749, + "step": 1040 + }, + { + "epoch": 0.055913520421747694, + "grad_norm": 0.3460189998149872, + "learning_rate": 9.999343371264757e-05, + "loss": 0.1751, + "step": 1050 + }, + { + "epoch": 0.056446030140050056, + "grad_norm": 0.4987145960330963, + "learning_rate": 9.9992185621829e-05, + "loss": 0.1732, + "step": 1060 + }, + { + "epoch": 0.05697853985835241, + "grad_norm": 0.3508310616016388, + "learning_rate": 9.999082901820225e-05, + "loss": 0.173, + "step": 1070 + }, + { + "epoch": 0.057511049576654774, + "grad_norm": 0.6434867978096008, + "learning_rate": 9.998936390597424e-05, + "loss": 0.1733, + "step": 1080 + }, + { + "epoch": 0.05804355929495713, + "grad_norm": 0.25631335377693176, + "learning_rate": 9.998779028968839e-05, + "loss": 0.1727, + "step": 1090 + }, + { + "epoch": 0.05857606901325949, + "grad_norm": 0.7324220538139343, + "learning_rate": 9.998610817422456e-05, + "loss": 0.1721, + "step": 1100 + }, + { + "epoch": 0.05910857873156185, + "grad_norm": 0.2557462453842163, + "learning_rate": 9.998431756479907e-05, + "loss": 0.1718, + "step": 1110 + }, + { + "epoch": 0.05964108844986421, + "grad_norm": 0.23237043619155884, + "learning_rate": 9.998241846696474e-05, + "loss": 0.1708, + "step": 1120 + }, + { + "epoch": 0.060173598168166566, + "grad_norm": 0.7491874098777771, + "learning_rate": 9.998041088661076e-05, + "loss": 0.1709, + "step": 1130 + }, + { + "epoch": 0.06070610788646893, + "grad_norm": 0.3187454640865326, + "learning_rate": 9.997829482996277e-05, + "loss": 0.1706, + "step": 1140 + }, + { + "epoch": 0.061238617604771284, + "grad_norm": 0.28493937849998474, + "learning_rate": 9.997607030358276e-05, + "loss": 0.1696, + "step": 1150 + }, + { + "epoch": 0.061771127323073646, + "grad_norm": 0.6693065166473389, + "learning_rate": 9.997373731436915e-05, + "loss": 0.1693, + "step": 1160 + }, + { + "epoch": 0.062303637041376, + "grad_norm": 0.4238905608654022, + "learning_rate": 9.997129586955665e-05, + "loss": 0.1691, + "step": 1170 + }, + { + "epoch": 0.06283614675967836, + "grad_norm": 0.5617002248764038, + "learning_rate": 9.996874597671633e-05, + "loss": 0.1698, + "step": 1180 + }, + { + "epoch": 0.06336865647798072, + "grad_norm": 0.29317113757133484, + "learning_rate": 9.996608764375555e-05, + "loss": 0.1684, + "step": 1190 + }, + { + "epoch": 0.06390116619628308, + "grad_norm": 0.4380682706832886, + "learning_rate": 9.996332087891795e-05, + "loss": 0.1688, + "step": 1200 + }, + { + "epoch": 0.06443367591458544, + "grad_norm": 0.7158795595169067, + "learning_rate": 9.996044569078347e-05, + "loss": 0.1681, + "step": 1210 + }, + { + "epoch": 0.0649661856328878, + "grad_norm": 0.39761292934417725, + "learning_rate": 9.99574620882682e-05, + "loss": 0.1679, + "step": 1220 + }, + { + "epoch": 0.06549869535119016, + "grad_norm": 0.2829475700855255, + "learning_rate": 9.995437008062444e-05, + "loss": 0.1683, + "step": 1230 + }, + { + "epoch": 0.06603120506949252, + "grad_norm": 0.22211559116840363, + "learning_rate": 9.995116967744076e-05, + "loss": 0.1676, + "step": 1240 + }, + { + "epoch": 0.06656371478779488, + "grad_norm": 0.24345025420188904, + "learning_rate": 9.994786088864172e-05, + "loss": 0.1654, + "step": 1250 + }, + { + "epoch": 0.06709622450609723, + "grad_norm": 0.2510230243206024, + "learning_rate": 9.994444372448812e-05, + "loss": 0.1671, + "step": 1260 + }, + { + "epoch": 0.06762873422439959, + "grad_norm": 0.26244106888771057, + "learning_rate": 9.994091819557676e-05, + "loss": 0.1659, + "step": 1270 + }, + { + "epoch": 0.06816124394270195, + "grad_norm": 0.2892049252986908, + "learning_rate": 9.993728431284053e-05, + "loss": 0.1652, + "step": 1280 + }, + { + "epoch": 0.06869375366100432, + "grad_norm": 0.4036615788936615, + "learning_rate": 9.993354208754828e-05, + "loss": 0.165, + "step": 1290 + }, + { + "epoch": 0.06922626337930667, + "grad_norm": 0.32690417766571045, + "learning_rate": 9.992969153130491e-05, + "loss": 0.1646, + "step": 1300 + }, + { + "epoch": 0.06975877309760903, + "grad_norm": 0.9821091294288635, + "learning_rate": 9.992573265605119e-05, + "loss": 0.1651, + "step": 1310 + }, + { + "epoch": 0.07029128281591139, + "grad_norm": 0.23433181643486023, + "learning_rate": 9.992166547406383e-05, + "loss": 0.1659, + "step": 1320 + }, + { + "epoch": 0.07082379253421375, + "grad_norm": 0.3498155474662781, + "learning_rate": 9.99174899979554e-05, + "loss": 0.165, + "step": 1330 + }, + { + "epoch": 0.0713563022525161, + "grad_norm": 0.19588203728199005, + "learning_rate": 9.991320624067431e-05, + "loss": 0.1632, + "step": 1340 + }, + { + "epoch": 0.07188881197081846, + "grad_norm": 0.3548436462879181, + "learning_rate": 9.99088142155047e-05, + "loss": 0.1639, + "step": 1350 + }, + { + "epoch": 0.07242132168912083, + "grad_norm": 0.3274150788784027, + "learning_rate": 9.990431393606654e-05, + "loss": 0.1623, + "step": 1360 + }, + { + "epoch": 0.07295383140742319, + "grad_norm": 0.5695179104804993, + "learning_rate": 9.989970541631544e-05, + "loss": 0.1634, + "step": 1370 + }, + { + "epoch": 0.07348634112572554, + "grad_norm": 0.21706371009349823, + "learning_rate": 9.989498867054268e-05, + "loss": 0.1619, + "step": 1380 + }, + { + "epoch": 0.0740188508440279, + "grad_norm": 0.45233970880508423, + "learning_rate": 9.989016371337518e-05, + "loss": 0.1622, + "step": 1390 + }, + { + "epoch": 0.07455136056233026, + "grad_norm": 0.5718231797218323, + "learning_rate": 9.988523055977541e-05, + "loss": 0.1631, + "step": 1400 + }, + { + "epoch": 0.07508387028063263, + "grad_norm": 0.6669481992721558, + "learning_rate": 9.988018922504137e-05, + "loss": 0.1625, + "step": 1410 + }, + { + "epoch": 0.07561637999893497, + "grad_norm": 0.25058674812316895, + "learning_rate": 9.987503972480652e-05, + "loss": 0.162, + "step": 1420 + }, + { + "epoch": 0.07614888971723734, + "grad_norm": 0.2735210359096527, + "learning_rate": 9.986978207503977e-05, + "loss": 0.1617, + "step": 1430 + }, + { + "epoch": 0.0766813994355397, + "grad_norm": 0.2729678750038147, + "learning_rate": 9.98644162920454e-05, + "loss": 0.1607, + "step": 1440 + }, + { + "epoch": 0.07721390915384206, + "grad_norm": 0.20154890418052673, + "learning_rate": 9.985894239246298e-05, + "loss": 0.1612, + "step": 1450 + }, + { + "epoch": 0.07774641887214441, + "grad_norm": 0.37646111845970154, + "learning_rate": 9.985336039326747e-05, + "loss": 0.1602, + "step": 1460 + }, + { + "epoch": 0.07827892859044677, + "grad_norm": 0.29742431640625, + "learning_rate": 9.98476703117689e-05, + "loss": 0.1609, + "step": 1470 + }, + { + "epoch": 0.07881143830874913, + "grad_norm": 0.2643822729587555, + "learning_rate": 9.984187216561258e-05, + "loss": 0.1612, + "step": 1480 + }, + { + "epoch": 0.0793439480270515, + "grad_norm": 0.22973056137561798, + "learning_rate": 9.98359659727789e-05, + "loss": 0.1614, + "step": 1490 + }, + { + "epoch": 0.07987645774535385, + "grad_norm": 0.24614231288433075, + "learning_rate": 9.982995175158327e-05, + "loss": 0.1621, + "step": 1500 + }, + { + "epoch": 0.08040896746365621, + "grad_norm": 0.2537037134170532, + "learning_rate": 9.98238295206762e-05, + "loss": 0.1608, + "step": 1510 + }, + { + "epoch": 0.08094147718195857, + "grad_norm": 0.26126566529273987, + "learning_rate": 9.981759929904306e-05, + "loss": 0.1596, + "step": 1520 + }, + { + "epoch": 0.08147398690026093, + "grad_norm": 1.0934852361679077, + "learning_rate": 9.981126110600411e-05, + "loss": 0.1598, + "step": 1530 + }, + { + "epoch": 0.08200649661856328, + "grad_norm": 0.21107517182826996, + "learning_rate": 9.98048149612145e-05, + "loss": 0.1601, + "step": 1540 + }, + { + "epoch": 0.08253900633686564, + "grad_norm": 0.2115686535835266, + "learning_rate": 9.979826088466405e-05, + "loss": 0.1598, + "step": 1550 + }, + { + "epoch": 0.08307151605516801, + "grad_norm": 0.23121733963489532, + "learning_rate": 9.979159889667738e-05, + "loss": 0.1592, + "step": 1560 + }, + { + "epoch": 0.08360402577347037, + "grad_norm": 0.19117231667041779, + "learning_rate": 9.978482901791366e-05, + "loss": 0.1591, + "step": 1570 + }, + { + "epoch": 0.08413653549177272, + "grad_norm": 0.274919331073761, + "learning_rate": 9.977795126936671e-05, + "loss": 0.1593, + "step": 1580 + }, + { + "epoch": 0.08466904521007508, + "grad_norm": 0.17615851759910583, + "learning_rate": 9.977096567236481e-05, + "loss": 0.1586, + "step": 1590 + }, + { + "epoch": 0.08520155492837744, + "grad_norm": 0.3320156931877136, + "learning_rate": 9.976387224857071e-05, + "loss": 0.1575, + "step": 1600 + }, + { + "epoch": 0.0857340646466798, + "grad_norm": 0.56144779920578, + "learning_rate": 9.975667101998153e-05, + "loss": 0.1587, + "step": 1610 + }, + { + "epoch": 0.08626657436498215, + "grad_norm": 0.3506183326244354, + "learning_rate": 9.974936200892874e-05, + "loss": 0.1588, + "step": 1620 + }, + { + "epoch": 0.08679908408328452, + "grad_norm": 0.22701147198677063, + "learning_rate": 9.974194523807796e-05, + "loss": 0.1581, + "step": 1630 + }, + { + "epoch": 0.08733159380158688, + "grad_norm": 0.27218353748321533, + "learning_rate": 9.973442073042903e-05, + "loss": 0.1584, + "step": 1640 + }, + { + "epoch": 0.08786410351988924, + "grad_norm": 0.21976235508918762, + "learning_rate": 9.972678850931589e-05, + "loss": 0.1571, + "step": 1650 + }, + { + "epoch": 0.08839661323819159, + "grad_norm": 0.22460529208183289, + "learning_rate": 9.971904859840653e-05, + "loss": 0.157, + "step": 1660 + }, + { + "epoch": 0.08892912295649395, + "grad_norm": 0.16970294713974, + "learning_rate": 9.971120102170283e-05, + "loss": 0.1575, + "step": 1670 + }, + { + "epoch": 0.08946163267479632, + "grad_norm": 0.31050947308540344, + "learning_rate": 9.970324580354063e-05, + "loss": 0.1568, + "step": 1680 + }, + { + "epoch": 0.08999414239309868, + "grad_norm": 0.22615467011928558, + "learning_rate": 9.969518296858946e-05, + "loss": 0.1574, + "step": 1690 + }, + { + "epoch": 0.09052665211140103, + "grad_norm": 0.15380023419857025, + "learning_rate": 9.968701254185271e-05, + "loss": 0.1567, + "step": 1700 + }, + { + "epoch": 0.09105916182970339, + "grad_norm": 0.20212256908416748, + "learning_rate": 9.96787345486673e-05, + "loss": 0.157, + "step": 1710 + }, + { + "epoch": 0.09159167154800575, + "grad_norm": 0.24987904727458954, + "learning_rate": 9.967034901470377e-05, + "loss": 0.1574, + "step": 1720 + }, + { + "epoch": 0.09212418126630811, + "grad_norm": 0.30531537532806396, + "learning_rate": 9.966185596596618e-05, + "loss": 0.1557, + "step": 1730 + }, + { + "epoch": 0.09265669098461046, + "grad_norm": 0.23161855340003967, + "learning_rate": 9.965325542879196e-05, + "loss": 0.1568, + "step": 1740 + }, + { + "epoch": 0.09318920070291282, + "grad_norm": 0.5445181727409363, + "learning_rate": 9.964454742985188e-05, + "loss": 0.1557, + "step": 1750 + }, + { + "epoch": 0.09372171042121519, + "grad_norm": 0.28040483593940735, + "learning_rate": 9.963573199614992e-05, + "loss": 0.1573, + "step": 1760 + }, + { + "epoch": 0.09425422013951755, + "grad_norm": 0.16802328824996948, + "learning_rate": 9.962680915502331e-05, + "loss": 0.157, + "step": 1770 + }, + { + "epoch": 0.0947867298578199, + "grad_norm": 0.15088757872581482, + "learning_rate": 9.961777893414226e-05, + "loss": 0.1568, + "step": 1780 + }, + { + "epoch": 0.09531923957612226, + "grad_norm": 0.1732264906167984, + "learning_rate": 9.960864136151e-05, + "loss": 0.1562, + "step": 1790 + }, + { + "epoch": 0.09585174929442462, + "grad_norm": 0.5040917992591858, + "learning_rate": 9.959939646546272e-05, + "loss": 0.1558, + "step": 1800 + }, + { + "epoch": 0.09638425901272699, + "grad_norm": 0.19744379818439484, + "learning_rate": 9.959004427466935e-05, + "loss": 0.1559, + "step": 1810 + }, + { + "epoch": 0.09691676873102933, + "grad_norm": 0.18064717948436737, + "learning_rate": 9.958058481813158e-05, + "loss": 0.1552, + "step": 1820 + }, + { + "epoch": 0.0974492784493317, + "grad_norm": 0.1613135039806366, + "learning_rate": 9.957101812518377e-05, + "loss": 0.1556, + "step": 1830 + }, + { + "epoch": 0.09798178816763406, + "grad_norm": 0.20252278447151184, + "learning_rate": 9.956134422549275e-05, + "loss": 0.1551, + "step": 1840 + }, + { + "epoch": 0.09851429788593642, + "grad_norm": 0.3912264108657837, + "learning_rate": 9.955156314905785e-05, + "loss": 0.1549, + "step": 1850 + }, + { + "epoch": 0.09904680760423877, + "grad_norm": 0.19006063044071198, + "learning_rate": 9.954167492621079e-05, + "loss": 0.156, + "step": 1860 + }, + { + "epoch": 0.09957931732254113, + "grad_norm": 0.37275323271751404, + "learning_rate": 9.953167958761552e-05, + "loss": 0.1557, + "step": 1870 + }, + { + "epoch": 0.1001118270408435, + "grad_norm": 0.17530041933059692, + "learning_rate": 9.952157716426813e-05, + "loss": 0.1551, + "step": 1880 + }, + { + "epoch": 0.10064433675914586, + "grad_norm": 0.5232445597648621, + "learning_rate": 9.951136768749685e-05, + "loss": 0.1546, + "step": 1890 + }, + { + "epoch": 0.1011768464774482, + "grad_norm": 0.42586958408355713, + "learning_rate": 9.950105118896186e-05, + "loss": 0.1551, + "step": 1900 + }, + { + "epoch": 0.10170935619575057, + "grad_norm": 0.2536565065383911, + "learning_rate": 9.949062770065525e-05, + "loss": 0.155, + "step": 1910 + }, + { + "epoch": 0.10224186591405293, + "grad_norm": 0.15420402586460114, + "learning_rate": 9.948009725490082e-05, + "loss": 0.154, + "step": 1920 + }, + { + "epoch": 0.1027743756323553, + "grad_norm": 0.20187288522720337, + "learning_rate": 9.946945988435414e-05, + "loss": 0.1551, + "step": 1930 + }, + { + "epoch": 0.10330688535065764, + "grad_norm": 0.17882299423217773, + "learning_rate": 9.945871562200226e-05, + "loss": 0.1548, + "step": 1940 + }, + { + "epoch": 0.10383939506896, + "grad_norm": 0.18503925204277039, + "learning_rate": 9.944786450116384e-05, + "loss": 0.1544, + "step": 1950 + }, + { + "epoch": 0.10437190478726237, + "grad_norm": 0.21693278849124908, + "learning_rate": 9.943690655548876e-05, + "loss": 0.154, + "step": 1960 + }, + { + "epoch": 0.10490441450556473, + "grad_norm": 0.2536572515964508, + "learning_rate": 9.942584181895831e-05, + "loss": 0.154, + "step": 1970 + }, + { + "epoch": 0.10543692422386708, + "grad_norm": 0.20351417362689972, + "learning_rate": 9.941467032588483e-05, + "loss": 0.1531, + "step": 1980 + }, + { + "epoch": 0.10596943394216944, + "grad_norm": 0.14263711869716644, + "learning_rate": 9.940339211091182e-05, + "loss": 0.1541, + "step": 1990 + }, + { + "epoch": 0.1065019436604718, + "grad_norm": 0.21371303498744965, + "learning_rate": 9.939200720901367e-05, + "loss": 0.1544, + "step": 2000 + }, + { + "epoch": 0.10703445337877417, + "grad_norm": 0.18548338115215302, + "learning_rate": 9.93805156554956e-05, + "loss": 0.154, + "step": 2010 + }, + { + "epoch": 0.10756696309707652, + "grad_norm": 0.14423610270023346, + "learning_rate": 9.936891748599362e-05, + "loss": 0.1555, + "step": 2020 + }, + { + "epoch": 0.10809947281537888, + "grad_norm": 0.2253378927707672, + "learning_rate": 9.935721273647429e-05, + "loss": 0.1537, + "step": 2030 + }, + { + "epoch": 0.10863198253368124, + "grad_norm": 0.17178262770175934, + "learning_rate": 9.934540144323477e-05, + "loss": 0.1533, + "step": 2040 + }, + { + "epoch": 0.1091644922519836, + "grad_norm": 0.23457373678684235, + "learning_rate": 9.933348364290253e-05, + "loss": 0.1541, + "step": 2050 + }, + { + "epoch": 0.10969700197028595, + "grad_norm": 0.3339991867542267, + "learning_rate": 9.932145937243537e-05, + "loss": 0.1526, + "step": 2060 + }, + { + "epoch": 0.11022951168858831, + "grad_norm": 0.18367235362529755, + "learning_rate": 9.930932866912128e-05, + "loss": 0.1536, + "step": 2070 + }, + { + "epoch": 0.11076202140689068, + "grad_norm": 0.1779884397983551, + "learning_rate": 9.929709157057828e-05, + "loss": 0.1522, + "step": 2080 + }, + { + "epoch": 0.11129453112519304, + "grad_norm": 0.24092677235603333, + "learning_rate": 9.928474811475426e-05, + "loss": 0.1528, + "step": 2090 + }, + { + "epoch": 0.11182704084349539, + "grad_norm": 0.16710165143013, + "learning_rate": 9.927229833992706e-05, + "loss": 0.1525, + "step": 2100 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 0.3042786121368408, + "learning_rate": 9.925974228470415e-05, + "loss": 0.1545, + "step": 2110 + }, + { + "epoch": 0.11289206028010011, + "grad_norm": 0.14976242184638977, + "learning_rate": 9.924707998802259e-05, + "loss": 0.1531, + "step": 2120 + }, + { + "epoch": 0.11342456999840247, + "grad_norm": 0.1597498059272766, + "learning_rate": 9.923431148914885e-05, + "loss": 0.1523, + "step": 2130 + }, + { + "epoch": 0.11395707971670482, + "grad_norm": 0.2503865957260132, + "learning_rate": 9.922143682767886e-05, + "loss": 0.1527, + "step": 2140 + }, + { + "epoch": 0.11448958943500719, + "grad_norm": 0.240915447473526, + "learning_rate": 9.920845604353768e-05, + "loss": 0.1531, + "step": 2150 + }, + { + "epoch": 0.11502209915330955, + "grad_norm": 0.28919148445129395, + "learning_rate": 9.919536917697942e-05, + "loss": 0.1527, + "step": 2160 + }, + { + "epoch": 0.11555460887161191, + "grad_norm": 0.20423804223537445, + "learning_rate": 9.91821762685873e-05, + "loss": 0.153, + "step": 2170 + }, + { + "epoch": 0.11608711858991426, + "grad_norm": 0.18328100442886353, + "learning_rate": 9.916887735927326e-05, + "loss": 0.1515, + "step": 2180 + }, + { + "epoch": 0.11661962830821662, + "grad_norm": 0.22620701789855957, + "learning_rate": 9.915547249027795e-05, + "loss": 0.1514, + "step": 2190 + }, + { + "epoch": 0.11715213802651898, + "grad_norm": 0.23197805881500244, + "learning_rate": 9.914196170317074e-05, + "loss": 0.1526, + "step": 2200 + }, + { + "epoch": 0.11768464774482135, + "grad_norm": 0.320434033870697, + "learning_rate": 9.912834503984929e-05, + "loss": 0.1526, + "step": 2210 + }, + { + "epoch": 0.1182171574631237, + "grad_norm": 0.16544243693351746, + "learning_rate": 9.911462254253971e-05, + "loss": 0.1523, + "step": 2220 + }, + { + "epoch": 0.11874966718142606, + "grad_norm": 0.21110887825489044, + "learning_rate": 9.910079425379626e-05, + "loss": 0.1518, + "step": 2230 + }, + { + "epoch": 0.11928217689972842, + "grad_norm": 0.16100363433361053, + "learning_rate": 9.908686021650124e-05, + "loss": 0.152, + "step": 2240 + }, + { + "epoch": 0.11981468661803078, + "grad_norm": 0.14773668348789215, + "learning_rate": 9.907282047386497e-05, + "loss": 0.152, + "step": 2250 + }, + { + "epoch": 0.12034719633633313, + "grad_norm": 0.14374825358390808, + "learning_rate": 9.905867506942544e-05, + "loss": 0.1511, + "step": 2260 + }, + { + "epoch": 0.1208797060546355, + "grad_norm": 0.13494443893432617, + "learning_rate": 9.904442404704843e-05, + "loss": 0.1515, + "step": 2270 + }, + { + "epoch": 0.12141221577293786, + "grad_norm": 0.18906742334365845, + "learning_rate": 9.903006745092716e-05, + "loss": 0.1519, + "step": 2280 + }, + { + "epoch": 0.12194472549124022, + "grad_norm": 0.24630281329154968, + "learning_rate": 9.901560532558229e-05, + "loss": 0.1527, + "step": 2290 + }, + { + "epoch": 0.12247723520954257, + "grad_norm": 0.19350433349609375, + "learning_rate": 9.900103771586171e-05, + "loss": 0.1509, + "step": 2300 + }, + { + "epoch": 0.12300974492784493, + "grad_norm": 0.1677471250295639, + "learning_rate": 9.898636466694042e-05, + "loss": 0.1515, + "step": 2310 + }, + { + "epoch": 0.12354225464614729, + "grad_norm": 0.30285850167274475, + "learning_rate": 9.897158622432041e-05, + "loss": 0.1517, + "step": 2320 + }, + { + "epoch": 0.12407476436444966, + "grad_norm": 0.16860969364643097, + "learning_rate": 9.895670243383048e-05, + "loss": 0.1516, + "step": 2330 + }, + { + "epoch": 0.124607274082752, + "grad_norm": 0.10707177966833115, + "learning_rate": 9.894171334162614e-05, + "loss": 0.1517, + "step": 2340 + }, + { + "epoch": 0.12513978380105437, + "grad_norm": 0.16667041182518005, + "learning_rate": 9.892661899418945e-05, + "loss": 0.152, + "step": 2350 + }, + { + "epoch": 0.12567229351935671, + "grad_norm": 0.21757569909095764, + "learning_rate": 9.891141943832883e-05, + "loss": 0.1519, + "step": 2360 + }, + { + "epoch": 0.1262048032376591, + "grad_norm": 0.1555328369140625, + "learning_rate": 9.889611472117902e-05, + "loss": 0.1506, + "step": 2370 + }, + { + "epoch": 0.12673731295596144, + "grad_norm": 0.24122075736522675, + "learning_rate": 9.888070489020083e-05, + "loss": 0.1519, + "step": 2380 + }, + { + "epoch": 0.12726982267426382, + "grad_norm": 0.15034180879592896, + "learning_rate": 9.886518999318104e-05, + "loss": 0.1513, + "step": 2390 + }, + { + "epoch": 0.12780233239256616, + "grad_norm": 0.1593770682811737, + "learning_rate": 9.884957007823226e-05, + "loss": 0.1508, + "step": 2400 + }, + { + "epoch": 0.1283348421108685, + "grad_norm": 0.1536262482404709, + "learning_rate": 9.883384519379273e-05, + "loss": 0.1512, + "step": 2410 + }, + { + "epoch": 0.1288673518291709, + "grad_norm": 0.1585126668214798, + "learning_rate": 9.881801538862627e-05, + "loss": 0.1512, + "step": 2420 + }, + { + "epoch": 0.12939986154747324, + "grad_norm": 0.13629089295864105, + "learning_rate": 9.880208071182203e-05, + "loss": 0.1506, + "step": 2430 + }, + { + "epoch": 0.1299323712657756, + "grad_norm": 0.18671053647994995, + "learning_rate": 9.878604121279434e-05, + "loss": 0.1513, + "step": 2440 + }, + { + "epoch": 0.13046488098407796, + "grad_norm": 0.14679767191410065, + "learning_rate": 9.876989694128263e-05, + "loss": 0.1498, + "step": 2450 + }, + { + "epoch": 0.1309973907023803, + "grad_norm": 0.21141186356544495, + "learning_rate": 9.875364794735124e-05, + "loss": 0.1511, + "step": 2460 + }, + { + "epoch": 0.1315299004206827, + "grad_norm": 0.28956910967826843, + "learning_rate": 9.873729428138924e-05, + "loss": 0.1514, + "step": 2470 + }, + { + "epoch": 0.13206241013898504, + "grad_norm": 0.20682266354560852, + "learning_rate": 9.87208359941103e-05, + "loss": 0.1504, + "step": 2480 + }, + { + "epoch": 0.13259491985728739, + "grad_norm": 0.1642565280199051, + "learning_rate": 9.870427313655256e-05, + "loss": 0.1519, + "step": 2490 + }, + { + "epoch": 0.13312742957558976, + "grad_norm": 0.2124072164297104, + "learning_rate": 9.868760576007835e-05, + "loss": 0.1512, + "step": 2500 + }, + { + "epoch": 0.1336599392938921, + "grad_norm": 0.12310315668582916, + "learning_rate": 9.867083391637422e-05, + "loss": 0.1505, + "step": 2510 + }, + { + "epoch": 0.13419244901219446, + "grad_norm": 0.11877293884754181, + "learning_rate": 9.865395765745062e-05, + "loss": 0.1504, + "step": 2520 + }, + { + "epoch": 0.13472495873049684, + "grad_norm": 0.14998012781143188, + "learning_rate": 9.863697703564183e-05, + "loss": 0.1504, + "step": 2530 + }, + { + "epoch": 0.13525746844879918, + "grad_norm": 0.17364120483398438, + "learning_rate": 9.861989210360572e-05, + "loss": 0.1506, + "step": 2540 + }, + { + "epoch": 0.13578997816710156, + "grad_norm": 0.2578318119049072, + "learning_rate": 9.860270291432367e-05, + "loss": 0.15, + "step": 2550 + }, + { + "epoch": 0.1363224878854039, + "grad_norm": 0.19344595074653625, + "learning_rate": 9.858540952110036e-05, + "loss": 0.1506, + "step": 2560 + }, + { + "epoch": 0.13685499760370626, + "grad_norm": 0.12059523165225983, + "learning_rate": 9.856801197756362e-05, + "loss": 0.1504, + "step": 2570 + }, + { + "epoch": 0.13738750732200863, + "grad_norm": 0.14271683990955353, + "learning_rate": 9.855051033766424e-05, + "loss": 0.1503, + "step": 2580 + }, + { + "epoch": 0.13792001704031098, + "grad_norm": 0.1416383981704712, + "learning_rate": 9.853290465567582e-05, + "loss": 0.1499, + "step": 2590 + }, + { + "epoch": 0.13845252675861333, + "grad_norm": 0.2926510274410248, + "learning_rate": 9.851519498619462e-05, + "loss": 0.1505, + "step": 2600 + }, + { + "epoch": 0.1389850364769157, + "grad_norm": 0.169399231672287, + "learning_rate": 9.849738138413936e-05, + "loss": 0.1507, + "step": 2610 + }, + { + "epoch": 0.13951754619521806, + "grad_norm": 0.2096475064754486, + "learning_rate": 9.847946390475103e-05, + "loss": 0.1501, + "step": 2620 + }, + { + "epoch": 0.14005005591352043, + "grad_norm": 0.17753221094608307, + "learning_rate": 9.84614426035928e-05, + "loss": 0.1501, + "step": 2630 + }, + { + "epoch": 0.14058256563182278, + "grad_norm": 0.13106787204742432, + "learning_rate": 9.844331753654978e-05, + "loss": 0.1492, + "step": 2640 + }, + { + "epoch": 0.14111507535012513, + "grad_norm": 0.1869879961013794, + "learning_rate": 9.842508875982885e-05, + "loss": 0.1495, + "step": 2650 + }, + { + "epoch": 0.1416475850684275, + "grad_norm": 0.15248249471187592, + "learning_rate": 9.840675632995852e-05, + "loss": 0.1489, + "step": 2660 + }, + { + "epoch": 0.14218009478672985, + "grad_norm": 0.13266538083553314, + "learning_rate": 9.838832030378871e-05, + "loss": 0.1501, + "step": 2670 + }, + { + "epoch": 0.1427126045050322, + "grad_norm": 0.1557317078113556, + "learning_rate": 9.836978073849061e-05, + "loss": 0.1505, + "step": 2680 + }, + { + "epoch": 0.14324511422333458, + "grad_norm": 0.11426721513271332, + "learning_rate": 9.835113769155653e-05, + "loss": 0.1505, + "step": 2690 + }, + { + "epoch": 0.14377762394163693, + "grad_norm": 0.2386896014213562, + "learning_rate": 9.83323912207996e-05, + "loss": 0.1504, + "step": 2700 + }, + { + "epoch": 0.1443101336599393, + "grad_norm": 0.17772549390792847, + "learning_rate": 9.831354138435373e-05, + "loss": 0.1502, + "step": 2710 + }, + { + "epoch": 0.14484264337824165, + "grad_norm": 0.13859009742736816, + "learning_rate": 9.82945882406734e-05, + "loss": 0.1492, + "step": 2720 + }, + { + "epoch": 0.145375153096544, + "grad_norm": 0.16319668292999268, + "learning_rate": 9.827553184853333e-05, + "loss": 0.1495, + "step": 2730 + }, + { + "epoch": 0.14590766281484638, + "grad_norm": 0.17237436771392822, + "learning_rate": 9.82563722670286e-05, + "loss": 0.1497, + "step": 2740 + }, + { + "epoch": 0.14644017253314873, + "grad_norm": 0.22656778991222382, + "learning_rate": 9.823710955557413e-05, + "loss": 0.1494, + "step": 2750 + }, + { + "epoch": 0.14697268225145108, + "grad_norm": 0.12643253803253174, + "learning_rate": 9.821774377390474e-05, + "loss": 0.1497, + "step": 2760 + }, + { + "epoch": 0.14750519196975345, + "grad_norm": 0.1322176307439804, + "learning_rate": 9.819827498207481e-05, + "loss": 0.1494, + "step": 2770 + }, + { + "epoch": 0.1480377016880558, + "grad_norm": 0.13178198039531708, + "learning_rate": 9.817870324045824e-05, + "loss": 0.1497, + "step": 2780 + }, + { + "epoch": 0.14857021140635818, + "grad_norm": 0.16573889553546906, + "learning_rate": 9.815902860974812e-05, + "loss": 0.1484, + "step": 2790 + }, + { + "epoch": 0.14910272112466053, + "grad_norm": 0.17168866097927094, + "learning_rate": 9.813925115095663e-05, + "loss": 0.1498, + "step": 2800 + }, + { + "epoch": 0.14963523084296287, + "grad_norm": 0.11743076145648956, + "learning_rate": 9.811937092541483e-05, + "loss": 0.1489, + "step": 2810 + }, + { + "epoch": 0.15016774056126525, + "grad_norm": 0.09591732919216156, + "learning_rate": 9.809938799477247e-05, + "loss": 0.1492, + "step": 2820 + }, + { + "epoch": 0.1507002502795676, + "grad_norm": 0.2620985209941864, + "learning_rate": 9.807930242099777e-05, + "loss": 0.1484, + "step": 2830 + }, + { + "epoch": 0.15123275999786995, + "grad_norm": 0.1720651537179947, + "learning_rate": 9.805911426637723e-05, + "loss": 0.1489, + "step": 2840 + }, + { + "epoch": 0.15176526971617232, + "grad_norm": 0.12655815482139587, + "learning_rate": 9.803882359351556e-05, + "loss": 0.1489, + "step": 2850 + }, + { + "epoch": 0.15229777943447467, + "grad_norm": 0.1679336279630661, + "learning_rate": 9.801843046533527e-05, + "loss": 0.1474, + "step": 2860 + }, + { + "epoch": 0.15283028915277705, + "grad_norm": 0.16842088103294373, + "learning_rate": 9.799793494507667e-05, + "loss": 0.1487, + "step": 2870 + }, + { + "epoch": 0.1533627988710794, + "grad_norm": 0.15344814956188202, + "learning_rate": 9.797733709629755e-05, + "loss": 0.1491, + "step": 2880 + }, + { + "epoch": 0.15389530858938175, + "grad_norm": 0.14372903108596802, + "learning_rate": 9.795663698287305e-05, + "loss": 0.1486, + "step": 2890 + }, + { + "epoch": 0.15442781830768412, + "grad_norm": 0.14276905357837677, + "learning_rate": 9.793583466899541e-05, + "loss": 0.1483, + "step": 2900 + }, + { + "epoch": 0.15496032802598647, + "grad_norm": 0.21898868680000305, + "learning_rate": 9.791493021917384e-05, + "loss": 0.1473, + "step": 2910 + }, + { + "epoch": 0.15549283774428882, + "grad_norm": 0.21863441169261932, + "learning_rate": 9.789392369823423e-05, + "loss": 0.1472, + "step": 2920 + }, + { + "epoch": 0.1560253474625912, + "grad_norm": 0.14358623325824738, + "learning_rate": 9.787281517131905e-05, + "loss": 0.1479, + "step": 2930 + }, + { + "epoch": 0.15655785718089354, + "grad_norm": 0.12543822824954987, + "learning_rate": 9.785160470388706e-05, + "loss": 0.1479, + "step": 2940 + }, + { + "epoch": 0.15709036689919592, + "grad_norm": 0.14014865458011627, + "learning_rate": 9.783029236171317e-05, + "loss": 0.1474, + "step": 2950 + }, + { + "epoch": 0.15762287661749827, + "grad_norm": 0.15217439830303192, + "learning_rate": 9.78088782108882e-05, + "loss": 0.1478, + "step": 2960 + }, + { + "epoch": 0.15815538633580062, + "grad_norm": 0.10565731674432755, + "learning_rate": 9.778736231781864e-05, + "loss": 0.1472, + "step": 2970 + }, + { + "epoch": 0.158687896054103, + "grad_norm": 0.13025479018688202, + "learning_rate": 9.77657447492266e-05, + "loss": 0.147, + "step": 2980 + }, + { + "epoch": 0.15922040577240534, + "grad_norm": 0.26370614767074585, + "learning_rate": 9.774402557214934e-05, + "loss": 0.1468, + "step": 2990 + }, + { + "epoch": 0.1597529154907077, + "grad_norm": 0.11994566768407822, + "learning_rate": 9.772220485393935e-05, + "loss": 0.1468, + "step": 3000 + }, + { + "epoch": 0.16028542520901007, + "grad_norm": 0.13632826507091522, + "learning_rate": 9.770028266226392e-05, + "loss": 0.1465, + "step": 3010 + }, + { + "epoch": 0.16081793492731242, + "grad_norm": 0.18415699899196625, + "learning_rate": 9.767825906510508e-05, + "loss": 0.1461, + "step": 3020 + }, + { + "epoch": 0.1613504446456148, + "grad_norm": 0.1199774518609047, + "learning_rate": 9.765613413075925e-05, + "loss": 0.1462, + "step": 3030 + }, + { + "epoch": 0.16188295436391714, + "grad_norm": 0.12944312393665314, + "learning_rate": 9.763390792783718e-05, + "loss": 0.1456, + "step": 3040 + }, + { + "epoch": 0.1624154640822195, + "grad_norm": 0.12913690507411957, + "learning_rate": 9.761158052526357e-05, + "loss": 0.1461, + "step": 3050 + }, + { + "epoch": 0.16294797380052187, + "grad_norm": 0.13733190298080444, + "learning_rate": 9.758915199227704e-05, + "loss": 0.1454, + "step": 3060 + }, + { + "epoch": 0.16348048351882422, + "grad_norm": 0.13602448999881744, + "learning_rate": 9.756662239842977e-05, + "loss": 0.1453, + "step": 3070 + }, + { + "epoch": 0.16401299323712656, + "grad_norm": 0.23941437900066376, + "learning_rate": 9.754399181358735e-05, + "loss": 0.1447, + "step": 3080 + }, + { + "epoch": 0.16454550295542894, + "grad_norm": 0.17770028114318848, + "learning_rate": 9.752126030792852e-05, + "loss": 0.1448, + "step": 3090 + }, + { + "epoch": 0.1650780126737313, + "grad_norm": 0.12279467284679413, + "learning_rate": 9.749842795194502e-05, + "loss": 0.1445, + "step": 3100 + }, + { + "epoch": 0.16561052239203367, + "grad_norm": 0.16316959261894226, + "learning_rate": 9.747549481644132e-05, + "loss": 0.1436, + "step": 3110 + }, + { + "epoch": 0.16614303211033601, + "grad_norm": 0.1606248915195465, + "learning_rate": 9.74524609725344e-05, + "loss": 0.1447, + "step": 3120 + }, + { + "epoch": 0.16667554182863836, + "grad_norm": 0.14306576550006866, + "learning_rate": 9.742932649165357e-05, + "loss": 0.1448, + "step": 3130 + }, + { + "epoch": 0.16720805154694074, + "grad_norm": 0.16349278390407562, + "learning_rate": 9.740609144554018e-05, + "loss": 0.1443, + "step": 3140 + }, + { + "epoch": 0.1677405612652431, + "grad_norm": 0.26308995485305786, + "learning_rate": 9.738275590624748e-05, + "loss": 0.1436, + "step": 3150 + }, + { + "epoch": 0.16827307098354544, + "grad_norm": 0.13254424929618835, + "learning_rate": 9.735931994614034e-05, + "loss": 0.1436, + "step": 3160 + }, + { + "epoch": 0.1688055807018478, + "grad_norm": 0.14894609153270721, + "learning_rate": 9.733578363789504e-05, + "loss": 0.1428, + "step": 3170 + }, + { + "epoch": 0.16933809042015016, + "grad_norm": 0.2038808912038803, + "learning_rate": 9.731214705449902e-05, + "loss": 0.1427, + "step": 3180 + }, + { + "epoch": 0.16987060013845254, + "grad_norm": 0.1420915275812149, + "learning_rate": 9.728841026925072e-05, + "loss": 0.1431, + "step": 3190 + }, + { + "epoch": 0.1704031098567549, + "grad_norm": 0.12903986871242523, + "learning_rate": 9.726457335575931e-05, + "loss": 0.1426, + "step": 3200 + }, + { + "epoch": 0.17093561957505723, + "grad_norm": 0.15605418384075165, + "learning_rate": 9.724063638794445e-05, + "loss": 0.1435, + "step": 3210 + }, + { + "epoch": 0.1714681292933596, + "grad_norm": 0.179864302277565, + "learning_rate": 9.721659944003605e-05, + "loss": 0.1418, + "step": 3220 + }, + { + "epoch": 0.17200063901166196, + "grad_norm": 0.13642147183418274, + "learning_rate": 9.719246258657408e-05, + "loss": 0.1425, + "step": 3230 + }, + { + "epoch": 0.1725331487299643, + "grad_norm": 0.19007375836372375, + "learning_rate": 9.716822590240835e-05, + "loss": 0.1429, + "step": 3240 + }, + { + "epoch": 0.17306565844826668, + "grad_norm": 0.11586272716522217, + "learning_rate": 9.714388946269824e-05, + "loss": 0.1423, + "step": 3250 + }, + { + "epoch": 0.17359816816656903, + "grad_norm": 0.09750824421644211, + "learning_rate": 9.711945334291243e-05, + "loss": 0.1421, + "step": 3260 + }, + { + "epoch": 0.1741306778848714, + "grad_norm": 0.16159775853157043, + "learning_rate": 9.709491761882881e-05, + "loss": 0.1422, + "step": 3270 + }, + { + "epoch": 0.17466318760317376, + "grad_norm": 0.1439363956451416, + "learning_rate": 9.707028236653406e-05, + "loss": 0.1428, + "step": 3280 + }, + { + "epoch": 0.1751956973214761, + "grad_norm": 0.15214209258556366, + "learning_rate": 9.704554766242351e-05, + "loss": 0.1419, + "step": 3290 + }, + { + "epoch": 0.17572820703977848, + "grad_norm": 0.14261415600776672, + "learning_rate": 9.702071358320095e-05, + "loss": 0.142, + "step": 3300 + }, + { + "epoch": 0.17626071675808083, + "grad_norm": 0.17040428519248962, + "learning_rate": 9.69957802058783e-05, + "loss": 0.1426, + "step": 3310 + }, + { + "epoch": 0.17679322647638318, + "grad_norm": 0.11275117844343185, + "learning_rate": 9.697074760777542e-05, + "loss": 0.1423, + "step": 3320 + }, + { + "epoch": 0.17732573619468556, + "grad_norm": 0.11213172972202301, + "learning_rate": 9.694561586651985e-05, + "loss": 0.1416, + "step": 3330 + }, + { + "epoch": 0.1778582459129879, + "grad_norm": 0.08868248015642166, + "learning_rate": 9.692038506004659e-05, + "loss": 0.1414, + "step": 3340 + }, + { + "epoch": 0.17839075563129028, + "grad_norm": 0.09049142897129059, + "learning_rate": 9.689505526659783e-05, + "loss": 0.1409, + "step": 3350 + }, + { + "epoch": 0.17892326534959263, + "grad_norm": 0.21919691562652588, + "learning_rate": 9.686962656472278e-05, + "loss": 0.1423, + "step": 3360 + }, + { + "epoch": 0.17945577506789498, + "grad_norm": 0.11099066585302353, + "learning_rate": 9.684409903327728e-05, + "loss": 0.1417, + "step": 3370 + }, + { + "epoch": 0.17998828478619736, + "grad_norm": 0.10436002165079117, + "learning_rate": 9.681847275142371e-05, + "loss": 0.1413, + "step": 3380 + }, + { + "epoch": 0.1805207945044997, + "grad_norm": 0.11029750108718872, + "learning_rate": 9.679274779863065e-05, + "loss": 0.1407, + "step": 3390 + }, + { + "epoch": 0.18105330422280205, + "grad_norm": 0.15080855786800385, + "learning_rate": 9.67669242546727e-05, + "loss": 0.1414, + "step": 3400 + }, + { + "epoch": 0.18158581394110443, + "grad_norm": 0.11175508052110672, + "learning_rate": 9.674100219963018e-05, + "loss": 0.1407, + "step": 3410 + }, + { + "epoch": 0.18211832365940678, + "grad_norm": 0.10869117826223373, + "learning_rate": 9.671498171388889e-05, + "loss": 0.1401, + "step": 3420 + }, + { + "epoch": 0.18265083337770915, + "grad_norm": 0.14162185788154602, + "learning_rate": 9.668886287813985e-05, + "loss": 0.1406, + "step": 3430 + }, + { + "epoch": 0.1831833430960115, + "grad_norm": 0.2033168226480484, + "learning_rate": 9.666264577337908e-05, + "loss": 0.1407, + "step": 3440 + }, + { + "epoch": 0.18371585281431385, + "grad_norm": 3.439692497253418, + "learning_rate": 9.663633048090744e-05, + "loss": 0.1527, + "step": 3450 + }, + { + "epoch": 0.18424836253261623, + "grad_norm": 3.6431403160095215, + "learning_rate": 9.660991708233009e-05, + "loss": 0.7171, + "step": 3460 + }, + { + "epoch": 0.18478087225091858, + "grad_norm": 1.1392711400985718, + "learning_rate": 9.658340565955654e-05, + "loss": 0.3094, + "step": 3470 + }, + { + "epoch": 0.18531338196922092, + "grad_norm": 0.24986179172992706, + "learning_rate": 9.655679629480032e-05, + "loss": 0.2164, + "step": 3480 + }, + { + "epoch": 0.1858458916875233, + "grad_norm": 0.16256773471832275, + "learning_rate": 9.653008907057855e-05, + "loss": 0.1672, + "step": 3490 + }, + { + "epoch": 0.18637840140582565, + "grad_norm": 0.11707092821598053, + "learning_rate": 9.65032840697119e-05, + "loss": 0.1552, + "step": 3500 + }, + { + "epoch": 0.18691091112412803, + "grad_norm": 0.12210855633020401, + "learning_rate": 9.647638137532428e-05, + "loss": 0.1495, + "step": 3510 + }, + { + "epoch": 0.18744342084243037, + "grad_norm": 0.10740665346384048, + "learning_rate": 9.644938107084247e-05, + "loss": 0.1468, + "step": 3520 + }, + { + "epoch": 0.18797593056073272, + "grad_norm": 0.09589366614818573, + "learning_rate": 9.642228323999603e-05, + "loss": 0.1453, + "step": 3530 + }, + { + "epoch": 0.1885084402790351, + "grad_norm": 0.12820713222026825, + "learning_rate": 9.639508796681688e-05, + "loss": 0.1439, + "step": 3540 + }, + { + "epoch": 0.18904094999733745, + "grad_norm": 0.3186265528202057, + "learning_rate": 9.636779533563915e-05, + "loss": 0.1429, + "step": 3550 + }, + { + "epoch": 0.1895734597156398, + "grad_norm": 0.1107301339507103, + "learning_rate": 9.63404054310989e-05, + "loss": 0.1427, + "step": 3560 + }, + { + "epoch": 0.19010596943394217, + "grad_norm": 0.0950065553188324, + "learning_rate": 9.631291833813383e-05, + "loss": 0.1425, + "step": 3570 + }, + { + "epoch": 0.19063847915224452, + "grad_norm": 0.1297433227300644, + "learning_rate": 9.628533414198298e-05, + "loss": 0.1419, + "step": 3580 + }, + { + "epoch": 0.1911709888705469, + "grad_norm": 0.1097961962223053, + "learning_rate": 9.625765292818658e-05, + "loss": 0.1413, + "step": 3590 + }, + { + "epoch": 0.19170349858884925, + "grad_norm": 0.11060044914484024, + "learning_rate": 9.622987478258567e-05, + "loss": 0.1413, + "step": 3600 + }, + { + "epoch": 0.1922360083071516, + "grad_norm": 0.11111301183700562, + "learning_rate": 9.620199979132191e-05, + "loss": 0.1408, + "step": 3610 + }, + { + "epoch": 0.19276851802545397, + "grad_norm": 0.08389998227357864, + "learning_rate": 9.617402804083729e-05, + "loss": 0.141, + "step": 3620 + }, + { + "epoch": 0.19330102774375632, + "grad_norm": 0.1361977458000183, + "learning_rate": 9.61459596178738e-05, + "loss": 0.1409, + "step": 3630 + }, + { + "epoch": 0.19383353746205867, + "grad_norm": 0.12257982045412064, + "learning_rate": 9.61177946094733e-05, + "loss": 0.1405, + "step": 3640 + }, + { + "epoch": 0.19436604718036105, + "grad_norm": 0.09580480307340622, + "learning_rate": 9.608953310297708e-05, + "loss": 0.141, + "step": 3650 + }, + { + "epoch": 0.1948985568986634, + "grad_norm": 0.09593943506479263, + "learning_rate": 9.606117518602575e-05, + "loss": 0.1404, + "step": 3660 + }, + { + "epoch": 0.19543106661696577, + "grad_norm": 0.09266688674688339, + "learning_rate": 9.603272094655886e-05, + "loss": 0.1404, + "step": 3670 + }, + { + "epoch": 0.19596357633526812, + "grad_norm": 0.1070714071393013, + "learning_rate": 9.600417047281464e-05, + "loss": 0.1398, + "step": 3680 + }, + { + "epoch": 0.19649608605357047, + "grad_norm": 0.10331781953573227, + "learning_rate": 9.597552385332982e-05, + "loss": 0.1401, + "step": 3690 + }, + { + "epoch": 0.19702859577187284, + "grad_norm": 0.09512060880661011, + "learning_rate": 9.594678117693921e-05, + "loss": 0.1394, + "step": 3700 + }, + { + "epoch": 0.1975611054901752, + "grad_norm": 0.0843188613653183, + "learning_rate": 9.591794253277551e-05, + "loss": 0.14, + "step": 3710 + }, + { + "epoch": 0.19809361520847754, + "grad_norm": 0.0754111111164093, + "learning_rate": 9.588900801026907e-05, + "loss": 0.1406, + "step": 3720 + }, + { + "epoch": 0.19862612492677992, + "grad_norm": 0.09565232694149017, + "learning_rate": 9.585997769914752e-05, + "loss": 0.1399, + "step": 3730 + }, + { + "epoch": 0.19915863464508227, + "grad_norm": 0.10425586253404617, + "learning_rate": 9.583085168943555e-05, + "loss": 0.1391, + "step": 3740 + }, + { + "epoch": 0.19969114436338464, + "grad_norm": 0.1333099901676178, + "learning_rate": 9.580163007145459e-05, + "loss": 0.1402, + "step": 3750 + }, + { + "epoch": 0.200223654081687, + "grad_norm": 0.12000375241041183, + "learning_rate": 9.57723129358226e-05, + "loss": 0.1401, + "step": 3760 + }, + { + "epoch": 0.20075616379998934, + "grad_norm": 0.08403091132640839, + "learning_rate": 9.574290037345375e-05, + "loss": 0.1393, + "step": 3770 + }, + { + "epoch": 0.20128867351829172, + "grad_norm": 0.08062135428190231, + "learning_rate": 9.571339247555809e-05, + "loss": 0.1396, + "step": 3780 + }, + { + "epoch": 0.20182118323659407, + "grad_norm": 0.17963799834251404, + "learning_rate": 9.568378933364131e-05, + "loss": 0.14, + "step": 3790 + }, + { + "epoch": 0.2023536929548964, + "grad_norm": 0.09086289256811142, + "learning_rate": 9.565409103950451e-05, + "loss": 0.1397, + "step": 3800 + }, + { + "epoch": 0.2028862026731988, + "grad_norm": 0.10271194577217102, + "learning_rate": 9.562429768524381e-05, + "loss": 0.1394, + "step": 3810 + }, + { + "epoch": 0.20341871239150114, + "grad_norm": 0.076598159968853, + "learning_rate": 9.559440936325017e-05, + "loss": 0.1395, + "step": 3820 + }, + { + "epoch": 0.20395122210980351, + "grad_norm": 0.1196560189127922, + "learning_rate": 9.556442616620899e-05, + "loss": 0.1388, + "step": 3830 + }, + { + "epoch": 0.20448373182810586, + "grad_norm": 0.082634337246418, + "learning_rate": 9.553434818709992e-05, + "loss": 0.1394, + "step": 3840 + }, + { + "epoch": 0.2050162415464082, + "grad_norm": 0.13070203363895416, + "learning_rate": 9.550417551919655e-05, + "loss": 0.1393, + "step": 3850 + }, + { + "epoch": 0.2055487512647106, + "grad_norm": 0.21860548853874207, + "learning_rate": 9.547390825606606e-05, + "loss": 0.1387, + "step": 3860 + }, + { + "epoch": 0.20608126098301294, + "grad_norm": 0.08260785788297653, + "learning_rate": 9.544354649156899e-05, + "loss": 0.139, + "step": 3870 + }, + { + "epoch": 0.20661377070131529, + "grad_norm": 0.0788755938410759, + "learning_rate": 9.541309031985895e-05, + "loss": 0.1392, + "step": 3880 + }, + { + "epoch": 0.20714628041961766, + "grad_norm": 0.12903687357902527, + "learning_rate": 9.538253983538232e-05, + "loss": 0.1395, + "step": 3890 + }, + { + "epoch": 0.20767879013792, + "grad_norm": 0.19277387857437134, + "learning_rate": 9.535189513287792e-05, + "loss": 0.1389, + "step": 3900 + }, + { + "epoch": 0.2082112998562224, + "grad_norm": 0.1530824899673462, + "learning_rate": 9.532115630737674e-05, + "loss": 0.1388, + "step": 3910 + }, + { + "epoch": 0.20874380957452474, + "grad_norm": 0.0937756597995758, + "learning_rate": 9.52903234542017e-05, + "loss": 0.1398, + "step": 3920 + }, + { + "epoch": 0.20927631929282708, + "grad_norm": 0.12323369830846786, + "learning_rate": 9.52593966689673e-05, + "loss": 0.14, + "step": 3930 + }, + { + "epoch": 0.20980882901112946, + "grad_norm": 0.14708684384822845, + "learning_rate": 9.522837604757924e-05, + "loss": 0.1388, + "step": 3940 + }, + { + "epoch": 0.2103413387294318, + "grad_norm": 0.10080372542142868, + "learning_rate": 9.519726168623433e-05, + "loss": 0.1379, + "step": 3950 + }, + { + "epoch": 0.21087384844773416, + "grad_norm": 0.11739426851272583, + "learning_rate": 9.516605368141998e-05, + "loss": 0.1388, + "step": 3960 + }, + { + "epoch": 0.21140635816603653, + "grad_norm": 0.0870957151055336, + "learning_rate": 9.513475212991406e-05, + "loss": 0.1388, + "step": 3970 + }, + { + "epoch": 0.21193886788433888, + "grad_norm": 0.12498774379491806, + "learning_rate": 9.510335712878446e-05, + "loss": 0.139, + "step": 3980 + }, + { + "epoch": 0.21247137760264126, + "grad_norm": 0.13790611922740936, + "learning_rate": 9.507186877538899e-05, + "loss": 0.1391, + "step": 3990 + }, + { + "epoch": 0.2130038873209436, + "grad_norm": 0.09463178366422653, + "learning_rate": 9.504028716737481e-05, + "loss": 0.1387, + "step": 4000 + }, + { + "epoch": 0.21353639703924596, + "grad_norm": 0.0896778553724289, + "learning_rate": 9.500861240267836e-05, + "loss": 0.1384, + "step": 4010 + }, + { + "epoch": 0.21406890675754833, + "grad_norm": 0.09591860324144363, + "learning_rate": 9.49768445795249e-05, + "loss": 0.1379, + "step": 4020 + }, + { + "epoch": 0.21460141647585068, + "grad_norm": 0.12393760681152344, + "learning_rate": 9.49449837964283e-05, + "loss": 0.138, + "step": 4030 + }, + { + "epoch": 0.21513392619415303, + "grad_norm": 0.1211247369647026, + "learning_rate": 9.491303015219075e-05, + "loss": 0.1381, + "step": 4040 + }, + { + "epoch": 0.2156664359124554, + "grad_norm": 0.09435896575450897, + "learning_rate": 9.488098374590232e-05, + "loss": 0.1384, + "step": 4050 + }, + { + "epoch": 0.21619894563075776, + "grad_norm": 0.2162541151046753, + "learning_rate": 9.484884467694082e-05, + "loss": 0.1386, + "step": 4060 + }, + { + "epoch": 0.21673145534906013, + "grad_norm": 0.11500007659196854, + "learning_rate": 9.481661304497136e-05, + "loss": 0.139, + "step": 4070 + }, + { + "epoch": 0.21726396506736248, + "grad_norm": 0.12248394638299942, + "learning_rate": 9.478428894994612e-05, + "loss": 0.1387, + "step": 4080 + }, + { + "epoch": 0.21779647478566483, + "grad_norm": 0.08362865447998047, + "learning_rate": 9.475187249210396e-05, + "loss": 0.1386, + "step": 4090 + }, + { + "epoch": 0.2183289845039672, + "grad_norm": 0.09139638394117355, + "learning_rate": 9.471936377197025e-05, + "loss": 0.1382, + "step": 4100 + }, + { + "epoch": 0.21886149422226955, + "grad_norm": 0.08786854147911072, + "learning_rate": 9.468676289035643e-05, + "loss": 0.1377, + "step": 4110 + }, + { + "epoch": 0.2193940039405719, + "grad_norm": 0.09356456995010376, + "learning_rate": 9.465406994835972e-05, + "loss": 0.1377, + "step": 4120 + }, + { + "epoch": 0.21992651365887428, + "grad_norm": 0.07609741389751434, + "learning_rate": 9.462128504736286e-05, + "loss": 0.1382, + "step": 4130 + }, + { + "epoch": 0.22045902337717663, + "grad_norm": 0.12431398779153824, + "learning_rate": 9.458840828903368e-05, + "loss": 0.1382, + "step": 4140 + }, + { + "epoch": 0.220991533095479, + "grad_norm": 0.1060996949672699, + "learning_rate": 9.4555439775325e-05, + "loss": 0.1382, + "step": 4150 + }, + { + "epoch": 0.22152404281378135, + "grad_norm": 0.07396227866411209, + "learning_rate": 9.452237960847405e-05, + "loss": 0.1379, + "step": 4160 + }, + { + "epoch": 0.2220565525320837, + "grad_norm": 0.07973285764455795, + "learning_rate": 9.448922789100238e-05, + "loss": 0.1376, + "step": 4170 + }, + { + "epoch": 0.22258906225038608, + "grad_norm": 0.08671050518751144, + "learning_rate": 9.445598472571535e-05, + "loss": 0.1387, + "step": 4180 + }, + { + "epoch": 0.22312157196868843, + "grad_norm": 0.07978523522615433, + "learning_rate": 9.442265021570198e-05, + "loss": 0.1379, + "step": 4190 + }, + { + "epoch": 0.22365408168699077, + "grad_norm": 0.07077804952859879, + "learning_rate": 9.438922446433454e-05, + "loss": 0.1382, + "step": 4200 + }, + { + "epoch": 0.22418659140529315, + "grad_norm": 0.08330279588699341, + "learning_rate": 9.435570757526823e-05, + "loss": 0.1374, + "step": 4210 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.1032426580786705, + "learning_rate": 9.432209965244085e-05, + "loss": 0.1382, + "step": 4220 + }, + { + "epoch": 0.22525161084189788, + "grad_norm": 0.10053195804357529, + "learning_rate": 9.428840080007255e-05, + "loss": 0.1373, + "step": 4230 + }, + { + "epoch": 0.22578412056020022, + "grad_norm": 0.08895772695541382, + "learning_rate": 9.425461112266545e-05, + "loss": 0.1379, + "step": 4240 + }, + { + "epoch": 0.22631663027850257, + "grad_norm": 0.08644817024469376, + "learning_rate": 9.422073072500328e-05, + "loss": 0.1381, + "step": 4250 + }, + { + "epoch": 0.22684913999680495, + "grad_norm": 0.07521601766347885, + "learning_rate": 9.418675971215113e-05, + "loss": 0.1377, + "step": 4260 + }, + { + "epoch": 0.2273816497151073, + "grad_norm": 0.0699540451169014, + "learning_rate": 9.415269818945513e-05, + "loss": 0.1378, + "step": 4270 + }, + { + "epoch": 0.22791415943340965, + "grad_norm": 0.16751086711883545, + "learning_rate": 9.411854626254202e-05, + "loss": 0.1371, + "step": 4280 + }, + { + "epoch": 0.22844666915171202, + "grad_norm": 0.11966162919998169, + "learning_rate": 9.408430403731891e-05, + "loss": 0.1374, + "step": 4290 + }, + { + "epoch": 0.22897917887001437, + "grad_norm": 0.0889548733830452, + "learning_rate": 9.404997161997295e-05, + "loss": 0.1376, + "step": 4300 + }, + { + "epoch": 0.22951168858831675, + "grad_norm": 0.10110121965408325, + "learning_rate": 9.4015549116971e-05, + "loss": 0.1379, + "step": 4310 + }, + { + "epoch": 0.2300441983066191, + "grad_norm": 0.10038761049509048, + "learning_rate": 9.398103663505917e-05, + "loss": 0.1368, + "step": 4320 + }, + { + "epoch": 0.23057670802492145, + "grad_norm": 0.14014580845832825, + "learning_rate": 9.394643428126272e-05, + "loss": 0.1369, + "step": 4330 + }, + { + "epoch": 0.23110921774322382, + "grad_norm": 0.07071101665496826, + "learning_rate": 9.391174216288561e-05, + "loss": 0.1375, + "step": 4340 + }, + { + "epoch": 0.23164172746152617, + "grad_norm": 0.08393870294094086, + "learning_rate": 9.387696038751006e-05, + "loss": 0.1378, + "step": 4350 + }, + { + "epoch": 0.23217423717982852, + "grad_norm": 0.18702152371406555, + "learning_rate": 9.384208906299641e-05, + "loss": 0.1379, + "step": 4360 + }, + { + "epoch": 0.2327067468981309, + "grad_norm": 0.12593114376068115, + "learning_rate": 9.380712829748266e-05, + "loss": 0.1377, + "step": 4370 + }, + { + "epoch": 0.23323925661643324, + "grad_norm": 0.1111498549580574, + "learning_rate": 9.37720781993842e-05, + "loss": 0.138, + "step": 4380 + }, + { + "epoch": 0.23377176633473562, + "grad_norm": 0.11401405185461044, + "learning_rate": 9.37369388773934e-05, + "loss": 0.1373, + "step": 4390 + }, + { + "epoch": 0.23430427605303797, + "grad_norm": 0.07457905262708664, + "learning_rate": 9.370171044047937e-05, + "loss": 0.1376, + "step": 4400 + }, + { + "epoch": 0.23483678577134032, + "grad_norm": 0.07042038440704346, + "learning_rate": 9.366639299788758e-05, + "loss": 0.1379, + "step": 4410 + }, + { + "epoch": 0.2353692954896427, + "grad_norm": 0.06665973365306854, + "learning_rate": 9.363098665913941e-05, + "loss": 0.1368, + "step": 4420 + }, + { + "epoch": 0.23590180520794504, + "grad_norm": 0.06450683623552322, + "learning_rate": 9.3595491534032e-05, + "loss": 0.1376, + "step": 4430 + }, + { + "epoch": 0.2364343149262474, + "grad_norm": 0.0732714980840683, + "learning_rate": 9.355990773263782e-05, + "loss": 0.137, + "step": 4440 + }, + { + "epoch": 0.23696682464454977, + "grad_norm": 0.09271900355815887, + "learning_rate": 9.352423536530432e-05, + "loss": 0.1366, + "step": 4450 + }, + { + "epoch": 0.23749933436285212, + "grad_norm": 0.35426032543182373, + "learning_rate": 9.34884745426536e-05, + "loss": 0.1383, + "step": 4460 + }, + { + "epoch": 0.2380318440811545, + "grad_norm": 0.0786311998963356, + "learning_rate": 9.3452625375582e-05, + "loss": 0.1378, + "step": 4470 + }, + { + "epoch": 0.23856435379945684, + "grad_norm": 0.07759775966405869, + "learning_rate": 9.341668797525993e-05, + "loss": 0.1373, + "step": 4480 + }, + { + "epoch": 0.2390968635177592, + "grad_norm": 0.06604979932308197, + "learning_rate": 9.338066245313134e-05, + "loss": 0.1376, + "step": 4490 + }, + { + "epoch": 0.23962937323606157, + "grad_norm": 0.06817334145307541, + "learning_rate": 9.334454892091349e-05, + "loss": 0.1368, + "step": 4500 + }, + { + "epoch": 0.24016188295436391, + "grad_norm": 0.07450228929519653, + "learning_rate": 9.330834749059654e-05, + "loss": 0.1368, + "step": 4510 + }, + { + "epoch": 0.24069439267266626, + "grad_norm": 0.09656868129968643, + "learning_rate": 9.327205827444322e-05, + "loss": 0.1365, + "step": 4520 + }, + { + "epoch": 0.24122690239096864, + "grad_norm": 0.1612931787967682, + "learning_rate": 9.323568138498855e-05, + "loss": 0.1374, + "step": 4530 + }, + { + "epoch": 0.241759412109271, + "grad_norm": 0.1097157672047615, + "learning_rate": 9.319921693503935e-05, + "loss": 0.1374, + "step": 4540 + }, + { + "epoch": 0.24229192182757336, + "grad_norm": 0.07911382615566254, + "learning_rate": 9.316266503767402e-05, + "loss": 0.1375, + "step": 4550 + }, + { + "epoch": 0.2428244315458757, + "grad_norm": 0.07367183268070221, + "learning_rate": 9.31260258062421e-05, + "loss": 0.1372, + "step": 4560 + }, + { + "epoch": 0.24335694126417806, + "grad_norm": 0.0891132801771164, + "learning_rate": 9.308929935436404e-05, + "loss": 0.1362, + "step": 4570 + }, + { + "epoch": 0.24388945098248044, + "grad_norm": 0.10352007299661636, + "learning_rate": 9.305248579593064e-05, + "loss": 0.1378, + "step": 4580 + }, + { + "epoch": 0.2444219607007828, + "grad_norm": 0.10638166218996048, + "learning_rate": 9.301558524510293e-05, + "loss": 0.1367, + "step": 4590 + }, + { + "epoch": 0.24495447041908514, + "grad_norm": 0.07406862825155258, + "learning_rate": 9.297859781631166e-05, + "loss": 0.1365, + "step": 4600 + }, + { + "epoch": 0.2454869801373875, + "grad_norm": 0.10420636832714081, + "learning_rate": 9.294152362425701e-05, + "loss": 0.1372, + "step": 4610 + }, + { + "epoch": 0.24601948985568986, + "grad_norm": 0.0823531523346901, + "learning_rate": 9.290436278390821e-05, + "loss": 0.1367, + "step": 4620 + }, + { + "epoch": 0.24655199957399224, + "grad_norm": 0.08028628677129745, + "learning_rate": 9.286711541050322e-05, + "loss": 0.1371, + "step": 4630 + }, + { + "epoch": 0.24708450929229459, + "grad_norm": 0.13905274868011475, + "learning_rate": 9.282978161954825e-05, + "loss": 0.1368, + "step": 4640 + }, + { + "epoch": 0.24761701901059693, + "grad_norm": 0.07757926732301712, + "learning_rate": 9.279236152681763e-05, + "loss": 0.1369, + "step": 4650 + }, + { + "epoch": 0.2481495287288993, + "grad_norm": 0.08740050345659256, + "learning_rate": 9.275485524835319e-05, + "loss": 0.1371, + "step": 4660 + }, + { + "epoch": 0.24868203844720166, + "grad_norm": 0.07663418352603912, + "learning_rate": 9.271726290046413e-05, + "loss": 0.1375, + "step": 4670 + }, + { + "epoch": 0.249214548165504, + "grad_norm": 0.11915243417024612, + "learning_rate": 9.267958459972652e-05, + "loss": 0.1365, + "step": 4680 + }, + { + "epoch": 0.24974705788380638, + "grad_norm": 0.11448535323143005, + "learning_rate": 9.264182046298294e-05, + "loss": 0.137, + "step": 4690 + }, + { + "epoch": 0.25027956760210873, + "grad_norm": 0.09888239204883575, + "learning_rate": 9.260397060734219e-05, + "loss": 0.1373, + "step": 4700 + }, + { + "epoch": 0.2508120773204111, + "grad_norm": 0.12656264007091522, + "learning_rate": 9.256603515017885e-05, + "loss": 0.1364, + "step": 4710 + }, + { + "epoch": 0.25134458703871343, + "grad_norm": 0.1620924174785614, + "learning_rate": 9.252801420913304e-05, + "loss": 0.136, + "step": 4720 + }, + { + "epoch": 0.2518770967570158, + "grad_norm": 0.07459171861410141, + "learning_rate": 9.24899079021099e-05, + "loss": 0.1373, + "step": 4730 + }, + { + "epoch": 0.2524096064753182, + "grad_norm": 0.09508336335420609, + "learning_rate": 9.245171634727926e-05, + "loss": 0.1363, + "step": 4740 + }, + { + "epoch": 0.25294211619362056, + "grad_norm": 0.07613290101289749, + "learning_rate": 9.241343966307543e-05, + "loss": 0.1363, + "step": 4750 + }, + { + "epoch": 0.2534746259119229, + "grad_norm": 0.08999643474817276, + "learning_rate": 9.237507796819662e-05, + "loss": 0.1365, + "step": 4760 + }, + { + "epoch": 0.25400713563022526, + "grad_norm": 0.11467399448156357, + "learning_rate": 9.233663138160464e-05, + "loss": 0.1364, + "step": 4770 + }, + { + "epoch": 0.25453964534852763, + "grad_norm": 0.0825829803943634, + "learning_rate": 9.229810002252464e-05, + "loss": 0.1367, + "step": 4780 + }, + { + "epoch": 0.25507215506682995, + "grad_norm": 0.08100995421409607, + "learning_rate": 9.225948401044457e-05, + "loss": 0.137, + "step": 4790 + }, + { + "epoch": 0.25560466478513233, + "grad_norm": 0.08392170816659927, + "learning_rate": 9.222078346511502e-05, + "loss": 0.1366, + "step": 4800 + }, + { + "epoch": 0.2561371745034347, + "grad_norm": 0.061139799654483795, + "learning_rate": 9.218199850654854e-05, + "loss": 0.1368, + "step": 4810 + }, + { + "epoch": 0.256669684221737, + "grad_norm": 0.1355183869600296, + "learning_rate": 9.21431292550196e-05, + "loss": 0.1363, + "step": 4820 + }, + { + "epoch": 0.2572021939400394, + "grad_norm": 0.08287263661623001, + "learning_rate": 9.210417583106401e-05, + "loss": 0.1363, + "step": 4830 + }, + { + "epoch": 0.2577347036583418, + "grad_norm": 0.0793054848909378, + "learning_rate": 9.206513835547861e-05, + "loss": 0.1362, + "step": 4840 + }, + { + "epoch": 0.2582672133766441, + "grad_norm": 0.09595254063606262, + "learning_rate": 9.202601694932087e-05, + "loss": 0.136, + "step": 4850 + }, + { + "epoch": 0.2587997230949465, + "grad_norm": 0.07301712781190872, + "learning_rate": 9.198681173390858e-05, + "loss": 0.1371, + "step": 4860 + }, + { + "epoch": 0.25933223281324885, + "grad_norm": 0.11917870491743088, + "learning_rate": 9.194752283081937e-05, + "loss": 0.137, + "step": 4870 + }, + { + "epoch": 0.2598647425315512, + "grad_norm": 0.07802341878414154, + "learning_rate": 9.190815036189042e-05, + "loss": 0.1363, + "step": 4880 + }, + { + "epoch": 0.26039725224985355, + "grad_norm": 0.07218264043331146, + "learning_rate": 9.186869444921808e-05, + "loss": 0.136, + "step": 4890 + }, + { + "epoch": 0.2609297619681559, + "grad_norm": 0.07441945374011993, + "learning_rate": 9.182915521515745e-05, + "loss": 0.137, + "step": 4900 + }, + { + "epoch": 0.2614622716864583, + "grad_norm": 0.16663500666618347, + "learning_rate": 9.178953278232193e-05, + "loss": 0.1369, + "step": 4910 + }, + { + "epoch": 0.2619947814047606, + "grad_norm": 0.09314275532960892, + "learning_rate": 9.174982727358306e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.262527291123063, + "grad_norm": 0.07567309588193893, + "learning_rate": 9.171003881206992e-05, + "loss": 0.1364, + "step": 4930 + }, + { + "epoch": 0.2630598008413654, + "grad_norm": 0.10356537252664566, + "learning_rate": 9.167016752116883e-05, + "loss": 0.1367, + "step": 4940 + }, + { + "epoch": 0.2635923105596677, + "grad_norm": 0.07590640336275101, + "learning_rate": 9.163021352452302e-05, + "loss": 0.1359, + "step": 4950 + }, + { + "epoch": 0.2641248202779701, + "grad_norm": 0.09072890132665634, + "learning_rate": 9.159017694603214e-05, + "loss": 0.1367, + "step": 4960 + }, + { + "epoch": 0.26465732999627245, + "grad_norm": 0.10349312424659729, + "learning_rate": 9.155005790985197e-05, + "loss": 0.136, + "step": 4970 + }, + { + "epoch": 0.26518983971457477, + "grad_norm": 0.08810363709926605, + "learning_rate": 9.150985654039394e-05, + "loss": 0.1362, + "step": 4980 + }, + { + "epoch": 0.26572234943287715, + "grad_norm": 0.13400596380233765, + "learning_rate": 9.14695729623249e-05, + "loss": 0.1362, + "step": 4990 + }, + { + "epoch": 0.2662548591511795, + "grad_norm": 0.12602917850017548, + "learning_rate": 9.142920730056652e-05, + "loss": 0.1353, + "step": 5000 + }, + { + "epoch": 0.26678736886948184, + "grad_norm": 0.08865707367658615, + "learning_rate": 9.138875968029512e-05, + "loss": 0.1361, + "step": 5010 + }, + { + "epoch": 0.2673198785877842, + "grad_norm": 0.11873424053192139, + "learning_rate": 9.13482302269411e-05, + "loss": 0.1371, + "step": 5020 + }, + { + "epoch": 0.2678523883060866, + "grad_norm": 0.08005053550004959, + "learning_rate": 9.13076190661887e-05, + "loss": 0.1356, + "step": 5030 + }, + { + "epoch": 0.2683848980243889, + "grad_norm": 0.14600218832492828, + "learning_rate": 9.126692632397543e-05, + "loss": 0.1359, + "step": 5040 + }, + { + "epoch": 0.2689174077426913, + "grad_norm": 0.09237764775753021, + "learning_rate": 9.122615212649189e-05, + "loss": 0.136, + "step": 5050 + }, + { + "epoch": 0.26944991746099367, + "grad_norm": 0.06887295097112656, + "learning_rate": 9.118529660018125e-05, + "loss": 0.1371, + "step": 5060 + }, + { + "epoch": 0.26998242717929605, + "grad_norm": 0.07038972526788712, + "learning_rate": 9.114435987173886e-05, + "loss": 0.1365, + "step": 5070 + }, + { + "epoch": 0.27051493689759837, + "grad_norm": 0.06756497174501419, + "learning_rate": 9.110334206811195e-05, + "loss": 0.1359, + "step": 5080 + }, + { + "epoch": 0.27104744661590074, + "grad_norm": 0.07146366685628891, + "learning_rate": 9.106224331649906e-05, + "loss": 0.1362, + "step": 5090 + }, + { + "epoch": 0.2715799563342031, + "grad_norm": 0.07654134929180145, + "learning_rate": 9.102106374434984e-05, + "loss": 0.1357, + "step": 5100 + }, + { + "epoch": 0.27211246605250544, + "grad_norm": 0.11621958762407303, + "learning_rate": 9.097980347936457e-05, + "loss": 0.1362, + "step": 5110 + }, + { + "epoch": 0.2726449757708078, + "grad_norm": 0.08447077125310898, + "learning_rate": 9.093846264949368e-05, + "loss": 0.1354, + "step": 5120 + }, + { + "epoch": 0.2731774854891102, + "grad_norm": 0.06373301893472672, + "learning_rate": 9.089704138293756e-05, + "loss": 0.1357, + "step": 5130 + }, + { + "epoch": 0.2737099952074125, + "grad_norm": 0.07651172578334808, + "learning_rate": 9.085553980814592e-05, + "loss": 0.1361, + "step": 5140 + }, + { + "epoch": 0.2742425049257149, + "grad_norm": 0.08544403314590454, + "learning_rate": 9.081395805381761e-05, + "loss": 0.1356, + "step": 5150 + }, + { + "epoch": 0.27477501464401727, + "grad_norm": 0.1044570654630661, + "learning_rate": 9.077229624890002e-05, + "loss": 0.1355, + "step": 5160 + }, + { + "epoch": 0.2753075243623196, + "grad_norm": 0.09831110388040543, + "learning_rate": 9.073055452258889e-05, + "loss": 0.1364, + "step": 5170 + }, + { + "epoch": 0.27584003408062197, + "grad_norm": 0.07489628344774246, + "learning_rate": 9.068873300432772e-05, + "loss": 0.1369, + "step": 5180 + }, + { + "epoch": 0.27637254379892434, + "grad_norm": 0.06893607974052429, + "learning_rate": 9.064683182380749e-05, + "loss": 0.1359, + "step": 5190 + }, + { + "epoch": 0.27690505351722666, + "grad_norm": 0.08887787163257599, + "learning_rate": 9.060485111096617e-05, + "loss": 0.1347, + "step": 5200 + }, + { + "epoch": 0.27743756323552904, + "grad_norm": 0.074364572763443, + "learning_rate": 9.056279099598845e-05, + "loss": 0.1357, + "step": 5210 + }, + { + "epoch": 0.2779700729538314, + "grad_norm": 0.15046152472496033, + "learning_rate": 9.052065160930516e-05, + "loss": 0.1348, + "step": 5220 + }, + { + "epoch": 0.2785025826721338, + "grad_norm": 0.08475669473409653, + "learning_rate": 9.0478433081593e-05, + "loss": 0.1357, + "step": 5230 + }, + { + "epoch": 0.2790350923904361, + "grad_norm": 0.09118683636188507, + "learning_rate": 9.043613554377411e-05, + "loss": 0.1354, + "step": 5240 + }, + { + "epoch": 0.2795676021087385, + "grad_norm": 0.08327824622392654, + "learning_rate": 9.03937591270156e-05, + "loss": 0.135, + "step": 5250 + }, + { + "epoch": 0.28010011182704087, + "grad_norm": 0.12977443635463715, + "learning_rate": 9.035130396272922e-05, + "loss": 0.1353, + "step": 5260 + }, + { + "epoch": 0.2806326215453432, + "grad_norm": 0.07361641526222229, + "learning_rate": 9.030877018257091e-05, + "loss": 0.1363, + "step": 5270 + }, + { + "epoch": 0.28116513126364556, + "grad_norm": 0.06867006421089172, + "learning_rate": 9.02661579184404e-05, + "loss": 0.1353, + "step": 5280 + }, + { + "epoch": 0.28169764098194794, + "grad_norm": 0.09308381378650665, + "learning_rate": 9.022346730248079e-05, + "loss": 0.1357, + "step": 5290 + }, + { + "epoch": 0.28223015070025026, + "grad_norm": 0.07800911366939545, + "learning_rate": 9.01806984670782e-05, + "loss": 0.1348, + "step": 5300 + }, + { + "epoch": 0.28276266041855264, + "grad_norm": 0.06358273327350616, + "learning_rate": 9.013785154486127e-05, + "loss": 0.1352, + "step": 5310 + }, + { + "epoch": 0.283295170136855, + "grad_norm": 0.06616450846195221, + "learning_rate": 9.009492666870078e-05, + "loss": 0.135, + "step": 5320 + }, + { + "epoch": 0.28382767985515733, + "grad_norm": 0.11287859827280045, + "learning_rate": 9.005192397170932e-05, + "loss": 0.1363, + "step": 5330 + }, + { + "epoch": 0.2843601895734597, + "grad_norm": 0.06982850283384323, + "learning_rate": 9.000884358724073e-05, + "loss": 0.1358, + "step": 5340 + }, + { + "epoch": 0.2848926992917621, + "grad_norm": 0.0698726698756218, + "learning_rate": 8.996568564888978e-05, + "loss": 0.1349, + "step": 5350 + }, + { + "epoch": 0.2854252090100644, + "grad_norm": 0.0825994461774826, + "learning_rate": 8.99224502904918e-05, + "loss": 0.1351, + "step": 5360 + }, + { + "epoch": 0.2859577187283668, + "grad_norm": 0.10726054012775421, + "learning_rate": 8.987913764612212e-05, + "loss": 0.1359, + "step": 5370 + }, + { + "epoch": 0.28649022844666916, + "grad_norm": 0.06900358200073242, + "learning_rate": 8.983574785009578e-05, + "loss": 0.1358, + "step": 5380 + }, + { + "epoch": 0.28702273816497154, + "grad_norm": 0.062367282807826996, + "learning_rate": 8.979228103696709e-05, + "loss": 0.1359, + "step": 5390 + }, + { + "epoch": 0.28755524788327386, + "grad_norm": 0.11140688508749008, + "learning_rate": 8.974873734152915e-05, + "loss": 0.1349, + "step": 5400 + }, + { + "epoch": 0.28808775760157623, + "grad_norm": 0.07785354554653168, + "learning_rate": 8.970511689881351e-05, + "loss": 0.1357, + "step": 5410 + }, + { + "epoch": 0.2886202673198786, + "grad_norm": 0.09785955399274826, + "learning_rate": 8.96614198440897e-05, + "loss": 0.1349, + "step": 5420 + }, + { + "epoch": 0.28915277703818093, + "grad_norm": 0.07891997694969177, + "learning_rate": 8.961764631286487e-05, + "loss": 0.1357, + "step": 5430 + }, + { + "epoch": 0.2896852867564833, + "grad_norm": 0.08100765943527222, + "learning_rate": 8.957379644088325e-05, + "loss": 0.1356, + "step": 5440 + }, + { + "epoch": 0.2902177964747857, + "grad_norm": 0.08178524672985077, + "learning_rate": 8.952987036412584e-05, + "loss": 0.135, + "step": 5450 + }, + { + "epoch": 0.290750306193088, + "grad_norm": 0.11689390987157822, + "learning_rate": 8.948586821880997e-05, + "loss": 0.1349, + "step": 5460 + }, + { + "epoch": 0.2912828159113904, + "grad_norm": 0.06157712638378143, + "learning_rate": 8.944179014138891e-05, + "loss": 0.1352, + "step": 5470 + }, + { + "epoch": 0.29181532562969276, + "grad_norm": 0.09353891015052795, + "learning_rate": 8.939763626855129e-05, + "loss": 0.1347, + "step": 5480 + }, + { + "epoch": 0.2923478353479951, + "grad_norm": 0.09575408697128296, + "learning_rate": 8.93534067372209e-05, + "loss": 0.1351, + "step": 5490 + }, + { + "epoch": 0.29288034506629745, + "grad_norm": 0.05233992263674736, + "learning_rate": 8.930910168455603e-05, + "loss": 0.1354, + "step": 5500 + }, + { + "epoch": 0.29341285478459983, + "grad_norm": 0.1354876458644867, + "learning_rate": 8.926472124794931e-05, + "loss": 0.1348, + "step": 5510 + }, + { + "epoch": 0.29394536450290215, + "grad_norm": 0.10183115303516388, + "learning_rate": 8.922026556502699e-05, + "loss": 0.135, + "step": 5520 + }, + { + "epoch": 0.2944778742212045, + "grad_norm": 0.08267311006784439, + "learning_rate": 8.917573477364876e-05, + "loss": 0.1351, + "step": 5530 + }, + { + "epoch": 0.2950103839395069, + "grad_norm": 0.07382847368717194, + "learning_rate": 8.91311290119072e-05, + "loss": 0.1354, + "step": 5540 + }, + { + "epoch": 0.2955428936578093, + "grad_norm": 0.06079572066664696, + "learning_rate": 8.908644841812739e-05, + "loss": 0.1355, + "step": 5550 + }, + { + "epoch": 0.2960754033761116, + "grad_norm": 0.09829048812389374, + "learning_rate": 8.904169313086645e-05, + "loss": 0.1353, + "step": 5560 + }, + { + "epoch": 0.296607913094414, + "grad_norm": 0.09984841197729111, + "learning_rate": 8.899686328891315e-05, + "loss": 0.1348, + "step": 5570 + }, + { + "epoch": 0.29714042281271635, + "grad_norm": 0.1072811409831047, + "learning_rate": 8.895195903128739e-05, + "loss": 0.1356, + "step": 5580 + }, + { + "epoch": 0.2976729325310187, + "grad_norm": 0.0519433431327343, + "learning_rate": 8.890698049723995e-05, + "loss": 0.1345, + "step": 5590 + }, + { + "epoch": 0.29820544224932105, + "grad_norm": 0.057259172201156616, + "learning_rate": 8.886192782625189e-05, + "loss": 0.1338, + "step": 5600 + }, + { + "epoch": 0.2987379519676234, + "grad_norm": 0.102280393242836, + "learning_rate": 8.881680115803412e-05, + "loss": 0.1347, + "step": 5610 + }, + { + "epoch": 0.29927046168592575, + "grad_norm": 0.07144474983215332, + "learning_rate": 8.877160063252712e-05, + "loss": 0.1353, + "step": 5620 + }, + { + "epoch": 0.2998029714042281, + "grad_norm": 0.07198172062635422, + "learning_rate": 8.87263263899003e-05, + "loss": 0.1354, + "step": 5630 + }, + { + "epoch": 0.3003354811225305, + "grad_norm": 0.08418303728103638, + "learning_rate": 8.86809785705518e-05, + "loss": 0.1353, + "step": 5640 + }, + { + "epoch": 0.3008679908408328, + "grad_norm": 0.09237035363912582, + "learning_rate": 8.86355573151078e-05, + "loss": 0.1356, + "step": 5650 + }, + { + "epoch": 0.3014005005591352, + "grad_norm": 0.12669327855110168, + "learning_rate": 8.859006276442226e-05, + "loss": 0.1343, + "step": 5660 + }, + { + "epoch": 0.3019330102774376, + "grad_norm": 0.05533193424344063, + "learning_rate": 8.854449505957645e-05, + "loss": 0.1351, + "step": 5670 + }, + { + "epoch": 0.3024655199957399, + "grad_norm": 0.06650611758232117, + "learning_rate": 8.849885434187848e-05, + "loss": 0.1345, + "step": 5680 + }, + { + "epoch": 0.30299802971404227, + "grad_norm": 0.07978025078773499, + "learning_rate": 8.845314075286286e-05, + "loss": 0.1347, + "step": 5690 + }, + { + "epoch": 0.30353053943234465, + "grad_norm": 0.06983333081007004, + "learning_rate": 8.840735443429014e-05, + "loss": 0.1345, + "step": 5700 + }, + { + "epoch": 0.304063049150647, + "grad_norm": 0.08889699727296829, + "learning_rate": 8.836149552814632e-05, + "loss": 0.1341, + "step": 5710 + }, + { + "epoch": 0.30459555886894935, + "grad_norm": 0.17295877635478973, + "learning_rate": 8.831556417664255e-05, + "loss": 0.1349, + "step": 5720 + }, + { + "epoch": 0.3051280685872517, + "grad_norm": 0.08790881931781769, + "learning_rate": 8.826956052221464e-05, + "loss": 0.1356, + "step": 5730 + }, + { + "epoch": 0.3056605783055541, + "grad_norm": 0.11530311405658722, + "learning_rate": 8.822348470752263e-05, + "loss": 0.1341, + "step": 5740 + }, + { + "epoch": 0.3061930880238564, + "grad_norm": 0.07714807987213135, + "learning_rate": 8.817733687545024e-05, + "loss": 0.1349, + "step": 5750 + }, + { + "epoch": 0.3067255977421588, + "grad_norm": 0.07843048125505447, + "learning_rate": 8.813111716910463e-05, + "loss": 0.135, + "step": 5760 + }, + { + "epoch": 0.30725810746046117, + "grad_norm": 0.059752389788627625, + "learning_rate": 8.808482573181583e-05, + "loss": 0.134, + "step": 5770 + }, + { + "epoch": 0.3077906171787635, + "grad_norm": 0.07728555053472519, + "learning_rate": 8.803846270713622e-05, + "loss": 0.1347, + "step": 5780 + }, + { + "epoch": 0.30832312689706587, + "grad_norm": 0.11523959785699844, + "learning_rate": 8.79920282388403e-05, + "loss": 0.1345, + "step": 5790 + }, + { + "epoch": 0.30885563661536825, + "grad_norm": 0.0835232064127922, + "learning_rate": 8.794552247092404e-05, + "loss": 0.1342, + "step": 5800 + }, + { + "epoch": 0.30938814633367057, + "grad_norm": 0.08657065033912659, + "learning_rate": 8.789894554760456e-05, + "loss": 0.1351, + "step": 5810 + }, + { + "epoch": 0.30992065605197294, + "grad_norm": 0.17280389368534088, + "learning_rate": 8.78522976133196e-05, + "loss": 0.1345, + "step": 5820 + }, + { + "epoch": 0.3104531657702753, + "grad_norm": 0.07250665873289108, + "learning_rate": 8.780557881272711e-05, + "loss": 0.1339, + "step": 5830 + }, + { + "epoch": 0.31098567548857764, + "grad_norm": 0.135615274310112, + "learning_rate": 8.775878929070483e-05, + "loss": 0.1348, + "step": 5840 + }, + { + "epoch": 0.31151818520688, + "grad_norm": 0.09052561223506927, + "learning_rate": 8.77119291923498e-05, + "loss": 0.1342, + "step": 5850 + }, + { + "epoch": 0.3120506949251824, + "grad_norm": 0.06174413859844208, + "learning_rate": 8.766499866297791e-05, + "loss": 0.1338, + "step": 5860 + }, + { + "epoch": 0.31258320464348477, + "grad_norm": 0.08344202488660812, + "learning_rate": 8.761799784812348e-05, + "loss": 0.1343, + "step": 5870 + }, + { + "epoch": 0.3131157143617871, + "grad_norm": 0.11124816536903381, + "learning_rate": 8.757092689353876e-05, + "loss": 0.1348, + "step": 5880 + }, + { + "epoch": 0.31364822408008947, + "grad_norm": 0.19503851234912872, + "learning_rate": 8.752378594519355e-05, + "loss": 0.1351, + "step": 5890 + }, + { + "epoch": 0.31418073379839184, + "grad_norm": 0.0828915536403656, + "learning_rate": 8.747657514927463e-05, + "loss": 0.135, + "step": 5900 + }, + { + "epoch": 0.31471324351669416, + "grad_norm": 0.07251901179552078, + "learning_rate": 8.742929465218548e-05, + "loss": 0.1337, + "step": 5910 + }, + { + "epoch": 0.31524575323499654, + "grad_norm": 0.05215257406234741, + "learning_rate": 8.738194460054567e-05, + "loss": 0.1335, + "step": 5920 + }, + { + "epoch": 0.3157782629532989, + "grad_norm": 0.06828713417053223, + "learning_rate": 8.733452514119048e-05, + "loss": 0.1341, + "step": 5930 + }, + { + "epoch": 0.31631077267160124, + "grad_norm": 0.05530816689133644, + "learning_rate": 8.728703642117038e-05, + "loss": 0.1342, + "step": 5940 + }, + { + "epoch": 0.3168432823899036, + "grad_norm": 0.08321405947208405, + "learning_rate": 8.723947858775068e-05, + "loss": 0.1342, + "step": 5950 + }, + { + "epoch": 0.317375792108206, + "grad_norm": 0.08132331073284149, + "learning_rate": 8.7191851788411e-05, + "loss": 0.1344, + "step": 5960 + }, + { + "epoch": 0.3179083018265083, + "grad_norm": 0.062430258840322495, + "learning_rate": 8.714415617084484e-05, + "loss": 0.1346, + "step": 5970 + }, + { + "epoch": 0.3184408115448107, + "grad_norm": 0.09331026673316956, + "learning_rate": 8.709639188295906e-05, + "loss": 0.1346, + "step": 5980 + }, + { + "epoch": 0.31897332126311306, + "grad_norm": 0.05821016803383827, + "learning_rate": 8.70485590728735e-05, + "loss": 0.1347, + "step": 5990 + }, + { + "epoch": 0.3195058309814154, + "grad_norm": 0.059810176491737366, + "learning_rate": 8.700065788892053e-05, + "loss": 0.1341, + "step": 6000 + }, + { + "epoch": 0.32003834069971776, + "grad_norm": 0.07964300364255905, + "learning_rate": 8.695268847964449e-05, + "loss": 0.1348, + "step": 6010 + }, + { + "epoch": 0.32057085041802014, + "grad_norm": 0.08967084437608719, + "learning_rate": 8.690465099380131e-05, + "loss": 0.1343, + "step": 6020 + }, + { + "epoch": 0.3211033601363225, + "grad_norm": 0.05978870391845703, + "learning_rate": 8.685654558035803e-05, + "loss": 0.1335, + "step": 6030 + }, + { + "epoch": 0.32163586985462483, + "grad_norm": 0.05357252061367035, + "learning_rate": 8.680837238849237e-05, + "loss": 0.134, + "step": 6040 + }, + { + "epoch": 0.3221683795729272, + "grad_norm": 0.09251965582370758, + "learning_rate": 8.676013156759219e-05, + "loss": 0.1344, + "step": 6050 + }, + { + "epoch": 0.3227008892912296, + "grad_norm": 0.06674574315547943, + "learning_rate": 8.671182326725509e-05, + "loss": 0.1346, + "step": 6060 + }, + { + "epoch": 0.3232333990095319, + "grad_norm": 0.0678028017282486, + "learning_rate": 8.666344763728793e-05, + "loss": 0.1334, + "step": 6070 + }, + { + "epoch": 0.3237659087278343, + "grad_norm": 0.08303205668926239, + "learning_rate": 8.661500482770635e-05, + "loss": 0.1333, + "step": 6080 + }, + { + "epoch": 0.32429841844613666, + "grad_norm": 0.059111885726451874, + "learning_rate": 8.656649498873435e-05, + "loss": 0.1334, + "step": 6090 + }, + { + "epoch": 0.324830928164439, + "grad_norm": 0.0759367048740387, + "learning_rate": 8.651791827080373e-05, + "loss": 0.1334, + "step": 6100 + }, + { + "epoch": 0.32536343788274136, + "grad_norm": 0.05576184391975403, + "learning_rate": 8.646927482455375e-05, + "loss": 0.1347, + "step": 6110 + }, + { + "epoch": 0.32589594760104373, + "grad_norm": 0.12025826424360275, + "learning_rate": 8.642056480083058e-05, + "loss": 0.1341, + "step": 6120 + }, + { + "epoch": 0.32642845731934605, + "grad_norm": 0.055676814168691635, + "learning_rate": 8.637178835068685e-05, + "loss": 0.1342, + "step": 6130 + }, + { + "epoch": 0.32696096703764843, + "grad_norm": 0.06213228031992912, + "learning_rate": 8.632294562538114e-05, + "loss": 0.1336, + "step": 6140 + }, + { + "epoch": 0.3274934767559508, + "grad_norm": 0.05842900648713112, + "learning_rate": 8.627403677637762e-05, + "loss": 0.1339, + "step": 6150 + }, + { + "epoch": 0.32802598647425313, + "grad_norm": 0.07621738314628601, + "learning_rate": 8.62250619553455e-05, + "loss": 0.1339, + "step": 6160 + }, + { + "epoch": 0.3285584961925555, + "grad_norm": 0.06162210926413536, + "learning_rate": 8.61760213141585e-05, + "loss": 0.1345, + "step": 6170 + }, + { + "epoch": 0.3290910059108579, + "grad_norm": 0.0727054551243782, + "learning_rate": 8.612691500489453e-05, + "loss": 0.1342, + "step": 6180 + }, + { + "epoch": 0.32962351562916026, + "grad_norm": 0.05833178386092186, + "learning_rate": 8.607774317983515e-05, + "loss": 0.1337, + "step": 6190 + }, + { + "epoch": 0.3301560253474626, + "grad_norm": 0.11485815793275833, + "learning_rate": 8.602850599146502e-05, + "loss": 0.1334, + "step": 6200 + }, + { + "epoch": 0.33068853506576495, + "grad_norm": 0.0697018951177597, + "learning_rate": 8.597920359247156e-05, + "loss": 0.1335, + "step": 6210 + }, + { + "epoch": 0.33122104478406733, + "grad_norm": 0.05111430957913399, + "learning_rate": 8.592983613574435e-05, + "loss": 0.1332, + "step": 6220 + }, + { + "epoch": 0.33175355450236965, + "grad_norm": 0.06886550784111023, + "learning_rate": 8.588040377437479e-05, + "loss": 0.1338, + "step": 6230 + }, + { + "epoch": 0.33228606422067203, + "grad_norm": 0.04854755103588104, + "learning_rate": 8.58309066616555e-05, + "loss": 0.1336, + "step": 6240 + }, + { + "epoch": 0.3328185739389744, + "grad_norm": 0.0921018123626709, + "learning_rate": 8.57813449510799e-05, + "loss": 0.1338, + "step": 6250 + }, + { + "epoch": 0.3333510836572767, + "grad_norm": 0.09607180953025818, + "learning_rate": 8.573171879634177e-05, + "loss": 0.1341, + "step": 6260 + }, + { + "epoch": 0.3338835933755791, + "grad_norm": 0.16610988974571228, + "learning_rate": 8.568202835133468e-05, + "loss": 0.1343, + "step": 6270 + }, + { + "epoch": 0.3344161030938815, + "grad_norm": 0.07573292404413223, + "learning_rate": 8.563227377015162e-05, + "loss": 0.1336, + "step": 6280 + }, + { + "epoch": 0.3349486128121838, + "grad_norm": 0.10059863328933716, + "learning_rate": 8.558245520708444e-05, + "loss": 0.1338, + "step": 6290 + }, + { + "epoch": 0.3354811225304862, + "grad_norm": 0.0501171350479126, + "learning_rate": 8.553257281662342e-05, + "loss": 0.1331, + "step": 6300 + }, + { + "epoch": 0.33601363224878855, + "grad_norm": 0.07637584954500198, + "learning_rate": 8.548262675345673e-05, + "loss": 0.1336, + "step": 6310 + }, + { + "epoch": 0.3365461419670909, + "grad_norm": 0.08559510856866837, + "learning_rate": 8.543261717247006e-05, + "loss": 0.1345, + "step": 6320 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 0.07463840395212173, + "learning_rate": 8.5382544228746e-05, + "loss": 0.134, + "step": 6330 + }, + { + "epoch": 0.3376111614036956, + "grad_norm": 0.06291361898183823, + "learning_rate": 8.533240807756373e-05, + "loss": 0.134, + "step": 6340 + }, + { + "epoch": 0.338143671121998, + "grad_norm": 0.054589059203863144, + "learning_rate": 8.52822088743983e-05, + "loss": 0.1336, + "step": 6350 + }, + { + "epoch": 0.3386761808403003, + "grad_norm": 0.08773118257522583, + "learning_rate": 8.523194677492044e-05, + "loss": 0.1332, + "step": 6360 + }, + { + "epoch": 0.3392086905586027, + "grad_norm": 0.08630936592817307, + "learning_rate": 8.518162193499581e-05, + "loss": 0.1339, + "step": 6370 + }, + { + "epoch": 0.3397412002769051, + "grad_norm": 0.0654667541384697, + "learning_rate": 8.513123451068467e-05, + "loss": 0.1339, + "step": 6380 + }, + { + "epoch": 0.3402737099952074, + "grad_norm": 0.04769926890730858, + "learning_rate": 8.508078465824138e-05, + "loss": 0.1335, + "step": 6390 + }, + { + "epoch": 0.3408062197135098, + "grad_norm": 0.10144821554422379, + "learning_rate": 8.503027253411387e-05, + "loss": 0.1328, + "step": 6400 + }, + { + "epoch": 0.34133872943181215, + "grad_norm": 0.09670275449752808, + "learning_rate": 8.497969829494319e-05, + "loss": 0.1338, + "step": 6410 + }, + { + "epoch": 0.34187123915011447, + "grad_norm": 0.08334879577159882, + "learning_rate": 8.492906209756294e-05, + "loss": 0.1328, + "step": 6420 + }, + { + "epoch": 0.34240374886841685, + "grad_norm": 0.06717374920845032, + "learning_rate": 8.487836409899905e-05, + "loss": 0.134, + "step": 6430 + }, + { + "epoch": 0.3429362585867192, + "grad_norm": 0.1657373309135437, + "learning_rate": 8.482760445646885e-05, + "loss": 0.1333, + "step": 6440 + }, + { + "epoch": 0.34346876830502154, + "grad_norm": 0.07656820863485336, + "learning_rate": 8.477678332738102e-05, + "loss": 0.1331, + "step": 6450 + }, + { + "epoch": 0.3440012780233239, + "grad_norm": 0.06148603931069374, + "learning_rate": 8.472590086933479e-05, + "loss": 0.1338, + "step": 6460 + }, + { + "epoch": 0.3445337877416263, + "grad_norm": 0.0530422069132328, + "learning_rate": 8.467495724011967e-05, + "loss": 0.1335, + "step": 6470 + }, + { + "epoch": 0.3450662974599286, + "grad_norm": 1.325517177581787, + "learning_rate": 8.462395259771483e-05, + "loss": 0.1388, + "step": 6480 + }, + { + "epoch": 0.345598807178231, + "grad_norm": 4.959922790527344, + "learning_rate": 8.457288710028862e-05, + "loss": 0.2724, + "step": 6490 + }, + { + "epoch": 0.34613131689653337, + "grad_norm": 0.29457658529281616, + "learning_rate": 8.452176090619812e-05, + "loss": 0.1983, + "step": 6500 + }, + { + "epoch": 0.34666382661483575, + "grad_norm": 0.10155448317527771, + "learning_rate": 8.447057417398866e-05, + "loss": 0.1556, + "step": 6510 + }, + { + "epoch": 0.34719633633313807, + "grad_norm": 0.08506519347429276, + "learning_rate": 8.441932706239329e-05, + "loss": 0.1438, + "step": 6520 + }, + { + "epoch": 0.34772884605144044, + "grad_norm": 0.06791000813245773, + "learning_rate": 8.436801973033227e-05, + "loss": 0.1402, + "step": 6530 + }, + { + "epoch": 0.3482613557697428, + "grad_norm": 0.0665493905544281, + "learning_rate": 8.43166523369126e-05, + "loss": 0.1381, + "step": 6540 + }, + { + "epoch": 0.34879386548804514, + "grad_norm": 0.14150767028331757, + "learning_rate": 8.42652250414276e-05, + "loss": 0.136, + "step": 6550 + }, + { + "epoch": 0.3493263752063475, + "grad_norm": 0.07686637341976166, + "learning_rate": 8.421373800335632e-05, + "loss": 0.1352, + "step": 6560 + }, + { + "epoch": 0.3498588849246499, + "grad_norm": 0.06520914286375046, + "learning_rate": 8.416219138236308e-05, + "loss": 0.1349, + "step": 6570 + }, + { + "epoch": 0.3503913946429522, + "grad_norm": 0.1157696321606636, + "learning_rate": 8.411058533829688e-05, + "loss": 0.1346, + "step": 6580 + }, + { + "epoch": 0.3509239043612546, + "grad_norm": 0.10777822136878967, + "learning_rate": 8.405892003119115e-05, + "loss": 0.1354, + "step": 6590 + }, + { + "epoch": 0.35145641407955697, + "grad_norm": 0.07395236939191818, + "learning_rate": 8.4007195621263e-05, + "loss": 0.1335, + "step": 6600 + }, + { + "epoch": 0.3519889237978593, + "grad_norm": 0.0857616439461708, + "learning_rate": 8.395541226891283e-05, + "loss": 0.1339, + "step": 6610 + }, + { + "epoch": 0.35252143351616166, + "grad_norm": 0.055322933942079544, + "learning_rate": 8.390357013472386e-05, + "loss": 0.1339, + "step": 6620 + }, + { + "epoch": 0.35305394323446404, + "grad_norm": 0.06957754492759705, + "learning_rate": 8.385166937946154e-05, + "loss": 0.1337, + "step": 6630 + }, + { + "epoch": 0.35358645295276636, + "grad_norm": 0.06771986186504364, + "learning_rate": 8.379971016407313e-05, + "loss": 0.1344, + "step": 6640 + }, + { + "epoch": 0.35411896267106874, + "grad_norm": 0.08326587826013565, + "learning_rate": 8.374769264968722e-05, + "loss": 0.1333, + "step": 6650 + }, + { + "epoch": 0.3546514723893711, + "grad_norm": 0.0682111382484436, + "learning_rate": 8.369561699761317e-05, + "loss": 0.1347, + "step": 6660 + }, + { + "epoch": 0.3551839821076735, + "grad_norm": 0.10408024489879608, + "learning_rate": 8.364348336934056e-05, + "loss": 0.1335, + "step": 6670 + }, + { + "epoch": 0.3557164918259758, + "grad_norm": 0.07545497268438339, + "learning_rate": 8.359129192653883e-05, + "loss": 0.1329, + "step": 6680 + }, + { + "epoch": 0.3562490015442782, + "grad_norm": 0.06856414675712585, + "learning_rate": 8.353904283105671e-05, + "loss": 0.1333, + "step": 6690 + }, + { + "epoch": 0.35678151126258056, + "grad_norm": 0.12046464532613754, + "learning_rate": 8.34867362449217e-05, + "loss": 0.1341, + "step": 6700 + }, + { + "epoch": 0.3573140209808829, + "grad_norm": 0.08481092005968094, + "learning_rate": 8.343437233033952e-05, + "loss": 0.1331, + "step": 6710 + }, + { + "epoch": 0.35784653069918526, + "grad_norm": 0.0763193815946579, + "learning_rate": 8.338195124969377e-05, + "loss": 0.133, + "step": 6720 + }, + { + "epoch": 0.35837904041748764, + "grad_norm": 0.05080103129148483, + "learning_rate": 8.332947316554527e-05, + "loss": 0.1332, + "step": 6730 + }, + { + "epoch": 0.35891155013578996, + "grad_norm": 0.09795154631137848, + "learning_rate": 8.327693824063158e-05, + "loss": 0.1339, + "step": 6740 + }, + { + "epoch": 0.35944405985409233, + "grad_norm": 0.10069025307893753, + "learning_rate": 8.322434663786662e-05, + "loss": 0.1335, + "step": 6750 + }, + { + "epoch": 0.3599765695723947, + "grad_norm": 0.08091656118631363, + "learning_rate": 8.317169852034002e-05, + "loss": 0.1336, + "step": 6760 + }, + { + "epoch": 0.36050907929069703, + "grad_norm": 0.06075895577669144, + "learning_rate": 8.31189940513166e-05, + "loss": 0.1341, + "step": 6770 + }, + { + "epoch": 0.3610415890089994, + "grad_norm": 0.051195014268159866, + "learning_rate": 8.306623339423605e-05, + "loss": 0.1338, + "step": 6780 + }, + { + "epoch": 0.3615740987273018, + "grad_norm": 0.05651082843542099, + "learning_rate": 8.301341671271222e-05, + "loss": 0.1332, + "step": 6790 + }, + { + "epoch": 0.3621066084456041, + "grad_norm": 0.05757668614387512, + "learning_rate": 8.29605441705327e-05, + "loss": 0.133, + "step": 6800 + }, + { + "epoch": 0.3626391181639065, + "grad_norm": 0.05644191801548004, + "learning_rate": 8.290761593165836e-05, + "loss": 0.1333, + "step": 6810 + }, + { + "epoch": 0.36317162788220886, + "grad_norm": 0.07972195744514465, + "learning_rate": 8.285463216022276e-05, + "loss": 0.133, + "step": 6820 + }, + { + "epoch": 0.36370413760051123, + "grad_norm": 0.04617351293563843, + "learning_rate": 8.280159302053163e-05, + "loss": 0.1328, + "step": 6830 + }, + { + "epoch": 0.36423664731881356, + "grad_norm": 0.09602131694555283, + "learning_rate": 8.274849867706247e-05, + "loss": 0.1331, + "step": 6840 + }, + { + "epoch": 0.36476915703711593, + "grad_norm": 0.04789271950721741, + "learning_rate": 8.269534929446392e-05, + "loss": 0.133, + "step": 6850 + }, + { + "epoch": 0.3653016667554183, + "grad_norm": 0.09267139434814453, + "learning_rate": 8.26421450375553e-05, + "loss": 0.1325, + "step": 6860 + }, + { + "epoch": 0.36583417647372063, + "grad_norm": 0.058588556945323944, + "learning_rate": 8.258888607132614e-05, + "loss": 0.1336, + "step": 6870 + }, + { + "epoch": 0.366366686192023, + "grad_norm": 0.052210818976163864, + "learning_rate": 8.253557256093558e-05, + "loss": 0.1328, + "step": 6880 + }, + { + "epoch": 0.3668991959103254, + "grad_norm": 0.09430071711540222, + "learning_rate": 8.248220467171195e-05, + "loss": 0.1328, + "step": 6890 + }, + { + "epoch": 0.3674317056286277, + "grad_norm": 0.07742954045534134, + "learning_rate": 8.242878256915216e-05, + "loss": 0.1328, + "step": 6900 + }, + { + "epoch": 0.3679642153469301, + "grad_norm": 0.06042707711458206, + "learning_rate": 8.237530641892128e-05, + "loss": 0.133, + "step": 6910 + }, + { + "epoch": 0.36849672506523246, + "grad_norm": 0.06480567157268524, + "learning_rate": 8.232177638685194e-05, + "loss": 0.1328, + "step": 6920 + }, + { + "epoch": 0.3690292347835348, + "grad_norm": 0.047677043825387955, + "learning_rate": 8.226819263894395e-05, + "loss": 0.1331, + "step": 6930 + }, + { + "epoch": 0.36956174450183715, + "grad_norm": 0.051471047103405, + "learning_rate": 8.221455534136358e-05, + "loss": 0.1324, + "step": 6940 + }, + { + "epoch": 0.37009425422013953, + "grad_norm": 0.07004884630441666, + "learning_rate": 8.216086466044323e-05, + "loss": 0.1327, + "step": 6950 + }, + { + "epoch": 0.37062676393844185, + "grad_norm": 0.07678276300430298, + "learning_rate": 8.210712076268088e-05, + "loss": 0.1327, + "step": 6960 + }, + { + "epoch": 0.3711592736567442, + "grad_norm": 0.07195029407739639, + "learning_rate": 8.205332381473942e-05, + "loss": 0.1324, + "step": 6970 + }, + { + "epoch": 0.3716917833750466, + "grad_norm": 0.061837486922740936, + "learning_rate": 8.199947398344639e-05, + "loss": 0.1325, + "step": 6980 + }, + { + "epoch": 0.372224293093349, + "grad_norm": 0.1034204512834549, + "learning_rate": 8.19455714357932e-05, + "loss": 0.1326, + "step": 6990 + }, + { + "epoch": 0.3727568028116513, + "grad_norm": 0.10331778973340988, + "learning_rate": 8.189161633893481e-05, + "loss": 0.1328, + "step": 7000 + }, + { + "epoch": 0.3732893125299537, + "grad_norm": 0.06943188607692719, + "learning_rate": 8.183760886018914e-05, + "loss": 0.1323, + "step": 7010 + }, + { + "epoch": 0.37382182224825605, + "grad_norm": 0.050394218415021896, + "learning_rate": 8.178354916703654e-05, + "loss": 0.1324, + "step": 7020 + }, + { + "epoch": 0.3743543319665584, + "grad_norm": 0.06192854419350624, + "learning_rate": 8.172943742711923e-05, + "loss": 0.1323, + "step": 7030 + }, + { + "epoch": 0.37488684168486075, + "grad_norm": 0.12752105295658112, + "learning_rate": 8.16752738082409e-05, + "loss": 0.1329, + "step": 7040 + }, + { + "epoch": 0.3754193514031631, + "grad_norm": 0.07455851882696152, + "learning_rate": 8.162105847836605e-05, + "loss": 0.1331, + "step": 7050 + }, + { + "epoch": 0.37595186112146545, + "grad_norm": 0.07023312151432037, + "learning_rate": 8.156679160561963e-05, + "loss": 0.1326, + "step": 7060 + }, + { + "epoch": 0.3764843708397678, + "grad_norm": 0.057135872542858124, + "learning_rate": 8.151247335828638e-05, + "loss": 0.1334, + "step": 7070 + }, + { + "epoch": 0.3770168805580702, + "grad_norm": 0.10991890728473663, + "learning_rate": 8.145810390481033e-05, + "loss": 0.1328, + "step": 7080 + }, + { + "epoch": 0.3775493902763725, + "grad_norm": 0.06575486063957214, + "learning_rate": 8.140368341379431e-05, + "loss": 0.133, + "step": 7090 + }, + { + "epoch": 0.3780818999946749, + "grad_norm": 0.06990350782871246, + "learning_rate": 8.134921205399945e-05, + "loss": 0.1321, + "step": 7100 + }, + { + "epoch": 0.3786144097129773, + "grad_norm": 0.06953799724578857, + "learning_rate": 8.129468999434464e-05, + "loss": 0.132, + "step": 7110 + }, + { + "epoch": 0.3791469194312796, + "grad_norm": 0.09842592477798462, + "learning_rate": 8.124011740390591e-05, + "loss": 0.1323, + "step": 7120 + }, + { + "epoch": 0.37967942914958197, + "grad_norm": 0.08032160997390747, + "learning_rate": 8.118549445191613e-05, + "loss": 0.1324, + "step": 7130 + }, + { + "epoch": 0.38021193886788435, + "grad_norm": 0.07145192474126816, + "learning_rate": 8.113082130776417e-05, + "loss": 0.1315, + "step": 7140 + }, + { + "epoch": 0.3807444485861867, + "grad_norm": 0.05545572564005852, + "learning_rate": 8.107609814099466e-05, + "loss": 0.1327, + "step": 7150 + }, + { + "epoch": 0.38127695830448904, + "grad_norm": 0.06006612256169319, + "learning_rate": 8.102132512130738e-05, + "loss": 0.1316, + "step": 7160 + }, + { + "epoch": 0.3818094680227914, + "grad_norm": 0.08068816363811493, + "learning_rate": 8.096650241855661e-05, + "loss": 0.1319, + "step": 7170 + }, + { + "epoch": 0.3823419777410938, + "grad_norm": 0.08527512848377228, + "learning_rate": 8.091163020275077e-05, + "loss": 0.1324, + "step": 7180 + }, + { + "epoch": 0.3828744874593961, + "grad_norm": 0.06154448911547661, + "learning_rate": 8.085670864405179e-05, + "loss": 0.1327, + "step": 7190 + }, + { + "epoch": 0.3834069971776985, + "grad_norm": 0.05169384926557541, + "learning_rate": 8.080173791277463e-05, + "loss": 0.132, + "step": 7200 + }, + { + "epoch": 0.38393950689600087, + "grad_norm": 0.11166296899318695, + "learning_rate": 8.074671817938674e-05, + "loss": 0.1318, + "step": 7210 + }, + { + "epoch": 0.3844720166143032, + "grad_norm": 0.05975338816642761, + "learning_rate": 8.069164961450751e-05, + "loss": 0.1313, + "step": 7220 + }, + { + "epoch": 0.38500452633260557, + "grad_norm": 0.07280656695365906, + "learning_rate": 8.063653238890779e-05, + "loss": 0.1324, + "step": 7230 + }, + { + "epoch": 0.38553703605090794, + "grad_norm": 0.050891146063804626, + "learning_rate": 8.058136667350928e-05, + "loss": 0.132, + "step": 7240 + }, + { + "epoch": 0.38606954576921027, + "grad_norm": 0.10308956354856491, + "learning_rate": 8.05261526393841e-05, + "loss": 0.1323, + "step": 7250 + }, + { + "epoch": 0.38660205548751264, + "grad_norm": 0.08276902139186859, + "learning_rate": 8.04708904577542e-05, + "loss": 0.1324, + "step": 7260 + }, + { + "epoch": 0.387134565205815, + "grad_norm": 0.06150532513856888, + "learning_rate": 8.041558029999081e-05, + "loss": 0.1324, + "step": 7270 + }, + { + "epoch": 0.38766707492411734, + "grad_norm": 0.08963697403669357, + "learning_rate": 8.036022233761396e-05, + "loss": 0.1332, + "step": 7280 + }, + { + "epoch": 0.3881995846424197, + "grad_norm": 0.08556204289197922, + "learning_rate": 8.030481674229192e-05, + "loss": 0.1319, + "step": 7290 + }, + { + "epoch": 0.3887320943607221, + "grad_norm": 0.0741380900144577, + "learning_rate": 8.024936368584066e-05, + "loss": 0.132, + "step": 7300 + }, + { + "epoch": 0.38926460407902447, + "grad_norm": 0.060994237661361694, + "learning_rate": 8.019386334022336e-05, + "loss": 0.1328, + "step": 7310 + }, + { + "epoch": 0.3897971137973268, + "grad_norm": 0.053207580000162125, + "learning_rate": 8.013831587754984e-05, + "loss": 0.1321, + "step": 7320 + }, + { + "epoch": 0.39032962351562916, + "grad_norm": 0.08496523648500443, + "learning_rate": 8.008272147007597e-05, + "loss": 0.1317, + "step": 7330 + }, + { + "epoch": 0.39086213323393154, + "grad_norm": 0.06788633018732071, + "learning_rate": 8.002708029020329e-05, + "loss": 0.1323, + "step": 7340 + }, + { + "epoch": 0.39139464295223386, + "grad_norm": 0.05240168422460556, + "learning_rate": 7.997139251047835e-05, + "loss": 0.1323, + "step": 7350 + }, + { + "epoch": 0.39192715267053624, + "grad_norm": 0.08682172000408173, + "learning_rate": 7.991565830359218e-05, + "loss": 0.1321, + "step": 7360 + }, + { + "epoch": 0.3924596623888386, + "grad_norm": 0.05870863422751427, + "learning_rate": 7.985987784237981e-05, + "loss": 0.1317, + "step": 7370 + }, + { + "epoch": 0.39299217210714094, + "grad_norm": 0.053884461522102356, + "learning_rate": 7.980405129981971e-05, + "loss": 0.1322, + "step": 7380 + }, + { + "epoch": 0.3935246818254433, + "grad_norm": 0.051192574203014374, + "learning_rate": 7.974817884903325e-05, + "loss": 0.132, + "step": 7390 + }, + { + "epoch": 0.3940571915437457, + "grad_norm": 0.07789867371320724, + "learning_rate": 7.969226066328415e-05, + "loss": 0.1322, + "step": 7400 + }, + { + "epoch": 0.394589701262048, + "grad_norm": 0.12169856578111649, + "learning_rate": 7.963629691597794e-05, + "loss": 0.1331, + "step": 7410 + }, + { + "epoch": 0.3951222109803504, + "grad_norm": 0.05751097947359085, + "learning_rate": 7.95802877806615e-05, + "loss": 0.1317, + "step": 7420 + }, + { + "epoch": 0.39565472069865276, + "grad_norm": 0.0670279935002327, + "learning_rate": 7.952423343102242e-05, + "loss": 0.1321, + "step": 7430 + }, + { + "epoch": 0.3961872304169551, + "grad_norm": 0.12209637463092804, + "learning_rate": 7.946813404088849e-05, + "loss": 0.1318, + "step": 7440 + }, + { + "epoch": 0.39671974013525746, + "grad_norm": 0.06626468896865845, + "learning_rate": 7.94119897842272e-05, + "loss": 0.1318, + "step": 7450 + }, + { + "epoch": 0.39725224985355984, + "grad_norm": 0.04306609556078911, + "learning_rate": 7.935580083514516e-05, + "loss": 0.1318, + "step": 7460 + }, + { + "epoch": 0.3977847595718622, + "grad_norm": 0.07492338865995407, + "learning_rate": 7.929956736788759e-05, + "loss": 0.1318, + "step": 7470 + }, + { + "epoch": 0.39831726929016453, + "grad_norm": 0.051630035042762756, + "learning_rate": 7.924328955683774e-05, + "loss": 0.1314, + "step": 7480 + }, + { + "epoch": 0.3988497790084669, + "grad_norm": 0.06161106750369072, + "learning_rate": 7.918696757651637e-05, + "loss": 0.1319, + "step": 7490 + }, + { + "epoch": 0.3993822887267693, + "grad_norm": 0.048934947699308395, + "learning_rate": 7.913060160158125e-05, + "loss": 0.1318, + "step": 7500 + }, + { + "epoch": 0.3999147984450716, + "grad_norm": 0.08472836762666702, + "learning_rate": 7.907419180682656e-05, + "loss": 0.1324, + "step": 7510 + }, + { + "epoch": 0.400447308163374, + "grad_norm": 0.07017608731985092, + "learning_rate": 7.901773836718234e-05, + "loss": 0.1315, + "step": 7520 + }, + { + "epoch": 0.40097981788167636, + "grad_norm": 0.09098348021507263, + "learning_rate": 7.8961241457714e-05, + "loss": 0.1316, + "step": 7530 + }, + { + "epoch": 0.4015123275999787, + "grad_norm": 0.07034831494092941, + "learning_rate": 7.890470125362174e-05, + "loss": 0.132, + "step": 7540 + }, + { + "epoch": 0.40204483731828106, + "grad_norm": 0.08528514206409454, + "learning_rate": 7.884811793024009e-05, + "loss": 0.1317, + "step": 7550 + }, + { + "epoch": 0.40257734703658343, + "grad_norm": 0.10862760245800018, + "learning_rate": 7.879149166303719e-05, + "loss": 0.1315, + "step": 7560 + }, + { + "epoch": 0.40310985675488575, + "grad_norm": 0.04836263135075569, + "learning_rate": 7.873482262761438e-05, + "loss": 0.1317, + "step": 7570 + }, + { + "epoch": 0.40364236647318813, + "grad_norm": 0.051307760179042816, + "learning_rate": 7.867811099970568e-05, + "loss": 0.1328, + "step": 7580 + }, + { + "epoch": 0.4041748761914905, + "grad_norm": 0.05256601795554161, + "learning_rate": 7.862135695517712e-05, + "loss": 0.1321, + "step": 7590 + }, + { + "epoch": 0.4047073859097928, + "grad_norm": 0.05649365857243538, + "learning_rate": 7.856456067002633e-05, + "loss": 0.1314, + "step": 7600 + }, + { + "epoch": 0.4052398956280952, + "grad_norm": 0.04195050150156021, + "learning_rate": 7.85077223203819e-05, + "loss": 0.1327, + "step": 7610 + }, + { + "epoch": 0.4057724053463976, + "grad_norm": 0.07042062282562256, + "learning_rate": 7.845084208250286e-05, + "loss": 0.1319, + "step": 7620 + }, + { + "epoch": 0.40630491506469996, + "grad_norm": 0.048713624477386475, + "learning_rate": 7.839392013277814e-05, + "loss": 0.1315, + "step": 7630 + }, + { + "epoch": 0.4068374247830023, + "grad_norm": 0.05016913264989853, + "learning_rate": 7.833695664772605e-05, + "loss": 0.132, + "step": 7640 + }, + { + "epoch": 0.40736993450130465, + "grad_norm": 0.04809438809752464, + "learning_rate": 7.827995180399364e-05, + "loss": 0.1315, + "step": 7650 + }, + { + "epoch": 0.40790244421960703, + "grad_norm": 0.0424528494477272, + "learning_rate": 7.822290577835627e-05, + "loss": 0.1312, + "step": 7660 + }, + { + "epoch": 0.40843495393790935, + "grad_norm": 0.049090851098299026, + "learning_rate": 7.8165818747717e-05, + "loss": 0.1318, + "step": 7670 + }, + { + "epoch": 0.4089674636562117, + "grad_norm": 0.09739360958337784, + "learning_rate": 7.810869088910604e-05, + "loss": 0.1314, + "step": 7680 + }, + { + "epoch": 0.4094999733745141, + "grad_norm": 0.06400451064109802, + "learning_rate": 7.805152237968019e-05, + "loss": 0.1319, + "step": 7690 + }, + { + "epoch": 0.4100324830928164, + "grad_norm": 0.09439321607351303, + "learning_rate": 7.799431339672238e-05, + "loss": 0.1315, + "step": 7700 + }, + { + "epoch": 0.4105649928111188, + "grad_norm": 0.061424221843481064, + "learning_rate": 7.793706411764095e-05, + "loss": 0.132, + "step": 7710 + }, + { + "epoch": 0.4110975025294212, + "grad_norm": 0.06444218754768372, + "learning_rate": 7.787977471996928e-05, + "loss": 0.1313, + "step": 7720 + }, + { + "epoch": 0.4116300122477235, + "grad_norm": 0.052814047783613205, + "learning_rate": 7.782244538136513e-05, + "loss": 0.1316, + "step": 7730 + }, + { + "epoch": 0.4121625219660259, + "grad_norm": 0.06464862823486328, + "learning_rate": 7.776507627961012e-05, + "loss": 0.1313, + "step": 7740 + }, + { + "epoch": 0.41269503168432825, + "grad_norm": 0.05052724853157997, + "learning_rate": 7.770766759260918e-05, + "loss": 0.1317, + "step": 7750 + }, + { + "epoch": 0.41322754140263057, + "grad_norm": 0.10346025973558426, + "learning_rate": 7.765021949839e-05, + "loss": 0.1319, + "step": 7760 + }, + { + "epoch": 0.41376005112093295, + "grad_norm": 0.07890909165143967, + "learning_rate": 7.759273217510246e-05, + "loss": 0.1316, + "step": 7770 + }, + { + "epoch": 0.4142925608392353, + "grad_norm": 0.04561850428581238, + "learning_rate": 7.75352058010181e-05, + "loss": 0.1317, + "step": 7780 + }, + { + "epoch": 0.4148250705575377, + "grad_norm": 0.09326593577861786, + "learning_rate": 7.747764055452957e-05, + "loss": 0.1309, + "step": 7790 + }, + { + "epoch": 0.41535758027584, + "grad_norm": 0.06307931989431381, + "learning_rate": 7.742003661415007e-05, + "loss": 0.1307, + "step": 7800 + }, + { + "epoch": 0.4158900899941424, + "grad_norm": 0.07909877598285675, + "learning_rate": 7.736239415851274e-05, + "loss": 0.1312, + "step": 7810 + }, + { + "epoch": 0.4164225997124448, + "grad_norm": 0.05338076129555702, + "learning_rate": 7.730471336637024e-05, + "loss": 0.1309, + "step": 7820 + }, + { + "epoch": 0.4169551094307471, + "grad_norm": 0.08736453205347061, + "learning_rate": 7.724699441659404e-05, + "loss": 0.1321, + "step": 7830 + }, + { + "epoch": 0.41748761914904947, + "grad_norm": 0.062187109142541885, + "learning_rate": 7.718923748817397e-05, + "loss": 0.132, + "step": 7840 + }, + { + "epoch": 0.41802012886735185, + "grad_norm": 0.0855235755443573, + "learning_rate": 7.713144276021768e-05, + "loss": 0.1306, + "step": 7850 + }, + { + "epoch": 0.41855263858565417, + "grad_norm": 0.04441085830330849, + "learning_rate": 7.707361041194992e-05, + "loss": 0.1313, + "step": 7860 + }, + { + "epoch": 0.41908514830395655, + "grad_norm": 0.06373197585344315, + "learning_rate": 7.70157406227122e-05, + "loss": 0.1316, + "step": 7870 + }, + { + "epoch": 0.4196176580222589, + "grad_norm": 0.05832177773118019, + "learning_rate": 7.695783357196214e-05, + "loss": 0.1312, + "step": 7880 + }, + { + "epoch": 0.42015016774056124, + "grad_norm": 0.0553959384560585, + "learning_rate": 7.689988943927285e-05, + "loss": 0.1317, + "step": 7890 + }, + { + "epoch": 0.4206826774588636, + "grad_norm": 0.07334991544485092, + "learning_rate": 7.684190840433247e-05, + "loss": 0.1312, + "step": 7900 + }, + { + "epoch": 0.421215187177166, + "grad_norm": 0.08733541518449783, + "learning_rate": 7.67838906469436e-05, + "loss": 0.1317, + "step": 7910 + }, + { + "epoch": 0.4217476968954683, + "grad_norm": 0.07919137924909592, + "learning_rate": 7.672583634702262e-05, + "loss": 0.131, + "step": 7920 + }, + { + "epoch": 0.4222802066137707, + "grad_norm": 0.08723526448011398, + "learning_rate": 7.666774568459938e-05, + "loss": 0.1318, + "step": 7930 + }, + { + "epoch": 0.42281271633207307, + "grad_norm": 0.053012095391750336, + "learning_rate": 7.660961883981636e-05, + "loss": 0.1317, + "step": 7940 + }, + { + "epoch": 0.42334522605037544, + "grad_norm": 0.09206791967153549, + "learning_rate": 7.65514559929283e-05, + "loss": 0.1319, + "step": 7950 + }, + { + "epoch": 0.42387773576867777, + "grad_norm": 0.06498973816633224, + "learning_rate": 7.649325732430161e-05, + "loss": 0.1311, + "step": 7960 + }, + { + "epoch": 0.42441024548698014, + "grad_norm": 0.07660607993602753, + "learning_rate": 7.643502301441373e-05, + "loss": 0.1314, + "step": 7970 + }, + { + "epoch": 0.4249427552052825, + "grad_norm": 0.08989237993955612, + "learning_rate": 7.637675324385266e-05, + "loss": 0.1304, + "step": 7980 + }, + { + "epoch": 0.42547526492358484, + "grad_norm": 0.08027999103069305, + "learning_rate": 7.631844819331633e-05, + "loss": 0.1311, + "step": 7990 + }, + { + "epoch": 0.4260077746418872, + "grad_norm": 0.05923927202820778, + "learning_rate": 7.626010804361216e-05, + "loss": 0.1303, + "step": 8000 + }, + { + "epoch": 0.4265402843601896, + "grad_norm": 0.05849640443921089, + "learning_rate": 7.62017329756563e-05, + "loss": 0.1307, + "step": 8010 + }, + { + "epoch": 0.4270727940784919, + "grad_norm": 0.05768370255827904, + "learning_rate": 7.614332317047326e-05, + "loss": 0.1315, + "step": 8020 + }, + { + "epoch": 0.4276053037967943, + "grad_norm": 0.05652983486652374, + "learning_rate": 7.608487880919525e-05, + "loss": 0.1311, + "step": 8030 + }, + { + "epoch": 0.42813781351509667, + "grad_norm": 0.0556759238243103, + "learning_rate": 7.602640007306165e-05, + "loss": 0.1316, + "step": 8040 + }, + { + "epoch": 0.428670323233399, + "grad_norm": 0.04655342176556587, + "learning_rate": 7.596788714341843e-05, + "loss": 0.1313, + "step": 8050 + }, + { + "epoch": 0.42920283295170136, + "grad_norm": 0.048768457025289536, + "learning_rate": 7.590934020171758e-05, + "loss": 0.1308, + "step": 8060 + }, + { + "epoch": 0.42973534267000374, + "grad_norm": 0.05214981734752655, + "learning_rate": 7.58507594295166e-05, + "loss": 0.131, + "step": 8070 + }, + { + "epoch": 0.43026785238830606, + "grad_norm": 0.060043588280677795, + "learning_rate": 7.579214500847789e-05, + "loss": 0.1315, + "step": 8080 + }, + { + "epoch": 0.43080036210660844, + "grad_norm": 0.04958285391330719, + "learning_rate": 7.573349712036815e-05, + "loss": 0.1314, + "step": 8090 + }, + { + "epoch": 0.4313328718249108, + "grad_norm": 0.05665591359138489, + "learning_rate": 7.567481594705795e-05, + "loss": 0.1314, + "step": 8100 + }, + { + "epoch": 0.4318653815432132, + "grad_norm": 0.056042492389678955, + "learning_rate": 7.561610167052095e-05, + "loss": 0.131, + "step": 8110 + }, + { + "epoch": 0.4323978912615155, + "grad_norm": 0.05700002983212471, + "learning_rate": 7.555735447283364e-05, + "loss": 0.1313, + "step": 8120 + }, + { + "epoch": 0.4329304009798179, + "grad_norm": 0.05349269136786461, + "learning_rate": 7.549857453617446e-05, + "loss": 0.1313, + "step": 8130 + }, + { + "epoch": 0.43346291069812026, + "grad_norm": 0.05427918955683708, + "learning_rate": 7.543976204282342e-05, + "loss": 0.131, + "step": 8140 + }, + { + "epoch": 0.4339954204164226, + "grad_norm": 0.12502682209014893, + "learning_rate": 7.538091717516149e-05, + "loss": 0.1309, + "step": 8150 + }, + { + "epoch": 0.43452793013472496, + "grad_norm": 0.06011335179209709, + "learning_rate": 7.532204011567006e-05, + "loss": 0.1315, + "step": 8160 + }, + { + "epoch": 0.43506043985302734, + "grad_norm": 0.07122571021318436, + "learning_rate": 7.526313104693031e-05, + "loss": 0.1314, + "step": 8170 + }, + { + "epoch": 0.43559294957132966, + "grad_norm": 0.04538768157362938, + "learning_rate": 7.520419015162267e-05, + "loss": 0.1315, + "step": 8180 + }, + { + "epoch": 0.43612545928963203, + "grad_norm": 0.04720662534236908, + "learning_rate": 7.514521761252635e-05, + "loss": 0.131, + "step": 8190 + }, + { + "epoch": 0.4366579690079344, + "grad_norm": 0.07761963456869125, + "learning_rate": 7.508621361251858e-05, + "loss": 0.1316, + "step": 8200 + }, + { + "epoch": 0.43719047872623673, + "grad_norm": 0.08107470721006393, + "learning_rate": 7.502717833457424e-05, + "loss": 0.1308, + "step": 8210 + }, + { + "epoch": 0.4377229884445391, + "grad_norm": 0.08958134800195694, + "learning_rate": 7.496811196176513e-05, + "loss": 0.1314, + "step": 8220 + }, + { + "epoch": 0.4382554981628415, + "grad_norm": 0.04781255125999451, + "learning_rate": 7.490901467725957e-05, + "loss": 0.1306, + "step": 8230 + }, + { + "epoch": 0.4387880078811438, + "grad_norm": 0.06295894831418991, + "learning_rate": 7.484988666432165e-05, + "loss": 0.1311, + "step": 8240 + }, + { + "epoch": 0.4393205175994462, + "grad_norm": 0.06639114022254944, + "learning_rate": 7.479072810631078e-05, + "loss": 0.1311, + "step": 8250 + }, + { + "epoch": 0.43985302731774856, + "grad_norm": 0.06550955027341843, + "learning_rate": 7.473153918668112e-05, + "loss": 0.1307, + "step": 8260 + }, + { + "epoch": 0.44038553703605093, + "grad_norm": 0.06374099105596542, + "learning_rate": 7.467232008898098e-05, + "loss": 0.131, + "step": 8270 + }, + { + "epoch": 0.44091804675435325, + "grad_norm": 0.055466748774051666, + "learning_rate": 7.461307099685218e-05, + "loss": 0.1306, + "step": 8280 + }, + { + "epoch": 0.44145055647265563, + "grad_norm": 0.06467512249946594, + "learning_rate": 7.455379209402964e-05, + "loss": 0.1312, + "step": 8290 + }, + { + "epoch": 0.441983066190958, + "grad_norm": 0.10842160880565643, + "learning_rate": 7.44944835643407e-05, + "loss": 0.131, + "step": 8300 + }, + { + "epoch": 0.44251557590926033, + "grad_norm": 0.10142064094543457, + "learning_rate": 7.443514559170456e-05, + "loss": 0.1303, + "step": 8310 + }, + { + "epoch": 0.4430480856275627, + "grad_norm": 0.07040092349052429, + "learning_rate": 7.437577836013174e-05, + "loss": 0.1314, + "step": 8320 + }, + { + "epoch": 0.4435805953458651, + "grad_norm": 0.06632167845964432, + "learning_rate": 7.431638205372348e-05, + "loss": 0.1305, + "step": 8330 + }, + { + "epoch": 0.4441131050641674, + "grad_norm": 0.0742000862956047, + "learning_rate": 7.425695685667118e-05, + "loss": 0.1313, + "step": 8340 + }, + { + "epoch": 0.4446456147824698, + "grad_norm": 0.05238117650151253, + "learning_rate": 7.419750295325587e-05, + "loss": 0.131, + "step": 8350 + }, + { + "epoch": 0.44517812450077215, + "grad_norm": 0.06212290748953819, + "learning_rate": 7.413802052784756e-05, + "loss": 0.131, + "step": 8360 + }, + { + "epoch": 0.4457106342190745, + "grad_norm": 0.07771137356758118, + "learning_rate": 7.407850976490469e-05, + "loss": 0.1309, + "step": 8370 + }, + { + "epoch": 0.44624314393737685, + "grad_norm": 0.0551883801817894, + "learning_rate": 7.401897084897365e-05, + "loss": 0.1301, + "step": 8380 + }, + { + "epoch": 0.44677565365567923, + "grad_norm": 0.06460625678300858, + "learning_rate": 7.395940396468808e-05, + "loss": 0.1307, + "step": 8390 + }, + { + "epoch": 0.44730816337398155, + "grad_norm": 0.08054537326097488, + "learning_rate": 7.389980929676835e-05, + "loss": 0.1305, + "step": 8400 + }, + { + "epoch": 0.4478406730922839, + "grad_norm": 0.08456294983625412, + "learning_rate": 7.384018703002098e-05, + "loss": 0.1309, + "step": 8410 + }, + { + "epoch": 0.4483731828105863, + "grad_norm": 0.06319648027420044, + "learning_rate": 7.378053734933814e-05, + "loss": 0.1304, + "step": 8420 + }, + { + "epoch": 0.4489056925288887, + "grad_norm": 0.05323270335793495, + "learning_rate": 7.372086043969694e-05, + "loss": 0.1316, + "step": 8430 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.05555250123143196, + "learning_rate": 7.366115648615898e-05, + "loss": 0.1313, + "step": 8440 + }, + { + "epoch": 0.4499707119654934, + "grad_norm": 0.08050252497196198, + "learning_rate": 7.360142567386968e-05, + "loss": 0.1303, + "step": 8450 + }, + { + "epoch": 0.45050322168379575, + "grad_norm": 0.0804496631026268, + "learning_rate": 7.354166818805776e-05, + "loss": 0.1305, + "step": 8460 + }, + { + "epoch": 0.4510357314020981, + "grad_norm": 0.09748142957687378, + "learning_rate": 7.34818842140347e-05, + "loss": 0.1308, + "step": 8470 + }, + { + "epoch": 0.45156824112040045, + "grad_norm": 0.0667809545993805, + "learning_rate": 7.34220739371941e-05, + "loss": 0.1312, + "step": 8480 + }, + { + "epoch": 0.4521007508387028, + "grad_norm": 0.08125482499599457, + "learning_rate": 7.336223754301105e-05, + "loss": 0.1311, + "step": 8490 + }, + { + "epoch": 0.45263326055700515, + "grad_norm": 0.057649750262498856, + "learning_rate": 7.330237521704177e-05, + "loss": 0.1307, + "step": 8500 + }, + { + "epoch": 0.4531657702753075, + "grad_norm": 0.06427519768476486, + "learning_rate": 7.324248714492279e-05, + "loss": 0.131, + "step": 8510 + }, + { + "epoch": 0.4536982799936099, + "grad_norm": 0.05290444567799568, + "learning_rate": 7.31825735123705e-05, + "loss": 0.1304, + "step": 8520 + }, + { + "epoch": 0.4542307897119122, + "grad_norm": 0.056924887001514435, + "learning_rate": 7.312263450518061e-05, + "loss": 0.1296, + "step": 8530 + }, + { + "epoch": 0.4547632994302146, + "grad_norm": 0.06351561844348907, + "learning_rate": 7.306267030922745e-05, + "loss": 0.1306, + "step": 8540 + }, + { + "epoch": 0.455295809148517, + "grad_norm": 0.08165629208087921, + "learning_rate": 7.300268111046348e-05, + "loss": 0.1307, + "step": 8550 + }, + { + "epoch": 0.4558283188668193, + "grad_norm": 0.059766896069049835, + "learning_rate": 7.294266709491873e-05, + "loss": 0.1305, + "step": 8560 + }, + { + "epoch": 0.45636082858512167, + "grad_norm": 0.05260982736945152, + "learning_rate": 7.288262844870013e-05, + "loss": 0.1301, + "step": 8570 + }, + { + "epoch": 0.45689333830342405, + "grad_norm": 0.06455428898334503, + "learning_rate": 7.282256535799106e-05, + "loss": 0.1304, + "step": 8580 + }, + { + "epoch": 0.4574258480217264, + "grad_norm": 0.05693411827087402, + "learning_rate": 7.276247800905063e-05, + "loss": 0.1304, + "step": 8590 + }, + { + "epoch": 0.45795835774002874, + "grad_norm": 0.05784597992897034, + "learning_rate": 7.270236658821322e-05, + "loss": 0.1308, + "step": 8600 + }, + { + "epoch": 0.4584908674583311, + "grad_norm": 0.10032429546117783, + "learning_rate": 7.264223128188789e-05, + "loss": 0.1308, + "step": 8610 + }, + { + "epoch": 0.4590233771766335, + "grad_norm": 0.0799618735909462, + "learning_rate": 7.258207227655768e-05, + "loss": 0.1314, + "step": 8620 + }, + { + "epoch": 0.4595558868949358, + "grad_norm": 0.08555562049150467, + "learning_rate": 7.25218897587792e-05, + "loss": 0.1301, + "step": 8630 + }, + { + "epoch": 0.4600883966132382, + "grad_norm": 0.06158687174320221, + "learning_rate": 7.246168391518196e-05, + "loss": 0.1302, + "step": 8640 + }, + { + "epoch": 0.46062090633154057, + "grad_norm": 0.06019744649529457, + "learning_rate": 7.240145493246776e-05, + "loss": 0.1304, + "step": 8650 + }, + { + "epoch": 0.4611534160498429, + "grad_norm": 0.06112377345561981, + "learning_rate": 7.234120299741021e-05, + "loss": 0.1301, + "step": 8660 + }, + { + "epoch": 0.46168592576814527, + "grad_norm": 0.04358561709523201, + "learning_rate": 7.228092829685406e-05, + "loss": 0.1299, + "step": 8670 + }, + { + "epoch": 0.46221843548644764, + "grad_norm": 0.04648636281490326, + "learning_rate": 7.22206310177147e-05, + "loss": 0.1296, + "step": 8680 + }, + { + "epoch": 0.46275094520474996, + "grad_norm": 0.040558718144893646, + "learning_rate": 7.216031134697747e-05, + "loss": 0.1307, + "step": 8690 + }, + { + "epoch": 0.46328345492305234, + "grad_norm": 0.04816916212439537, + "learning_rate": 7.209996947169719e-05, + "loss": 0.1307, + "step": 8700 + }, + { + "epoch": 0.4638159646413547, + "grad_norm": 0.08434905111789703, + "learning_rate": 7.203960557899758e-05, + "loss": 0.1304, + "step": 8710 + }, + { + "epoch": 0.46434847435965704, + "grad_norm": 0.04249223694205284, + "learning_rate": 7.197921985607055e-05, + "loss": 0.1307, + "step": 8720 + }, + { + "epoch": 0.4648809840779594, + "grad_norm": 0.04334559664130211, + "learning_rate": 7.191881249017574e-05, + "loss": 0.1301, + "step": 8730 + }, + { + "epoch": 0.4654134937962618, + "grad_norm": 0.06121005490422249, + "learning_rate": 7.185838366863995e-05, + "loss": 0.1307, + "step": 8740 + }, + { + "epoch": 0.46594600351456417, + "grad_norm": 0.05099225789308548, + "learning_rate": 7.179793357885645e-05, + "loss": 0.1305, + "step": 8750 + }, + { + "epoch": 0.4664785132328665, + "grad_norm": 0.04724998399615288, + "learning_rate": 7.173746240828451e-05, + "loss": 0.1295, + "step": 8760 + }, + { + "epoch": 0.46701102295116886, + "grad_norm": 0.04000856354832649, + "learning_rate": 7.167697034444874e-05, + "loss": 0.13, + "step": 8770 + }, + { + "epoch": 0.46754353266947124, + "grad_norm": 0.09295206516981125, + "learning_rate": 7.161645757493858e-05, + "loss": 0.1301, + "step": 8780 + }, + { + "epoch": 0.46807604238777356, + "grad_norm": 0.05277612432837486, + "learning_rate": 7.155592428740765e-05, + "loss": 0.1303, + "step": 8790 + }, + { + "epoch": 0.46860855210607594, + "grad_norm": 0.05306980386376381, + "learning_rate": 7.14953706695732e-05, + "loss": 0.1297, + "step": 8800 + }, + { + "epoch": 0.4691410618243783, + "grad_norm": 0.06097976118326187, + "learning_rate": 7.14347969092155e-05, + "loss": 0.1308, + "step": 8810 + }, + { + "epoch": 0.46967357154268063, + "grad_norm": 0.059332527220249176, + "learning_rate": 7.137420319417738e-05, + "loss": 0.1296, + "step": 8820 + }, + { + "epoch": 0.470206081260983, + "grad_norm": 0.09293901175260544, + "learning_rate": 7.131358971236344e-05, + "loss": 0.1296, + "step": 8830 + }, + { + "epoch": 0.4707385909792854, + "grad_norm": 0.046720948070287704, + "learning_rate": 7.125295665173964e-05, + "loss": 0.1304, + "step": 8840 + }, + { + "epoch": 0.4712711006975877, + "grad_norm": 0.06865198165178299, + "learning_rate": 7.119230420033259e-05, + "loss": 0.1306, + "step": 8850 + }, + { + "epoch": 0.4718036104158901, + "grad_norm": 0.11196744441986084, + "learning_rate": 7.113163254622915e-05, + "loss": 0.1301, + "step": 8860 + }, + { + "epoch": 0.47233612013419246, + "grad_norm": 0.056259218603372574, + "learning_rate": 7.107094187757559e-05, + "loss": 0.1298, + "step": 8870 + }, + { + "epoch": 0.4728686298524948, + "grad_norm": 0.06268846988677979, + "learning_rate": 7.101023238257725e-05, + "loss": 0.1303, + "step": 8880 + }, + { + "epoch": 0.47340113957079716, + "grad_norm": 0.05749877542257309, + "learning_rate": 7.094950424949784e-05, + "loss": 0.1305, + "step": 8890 + }, + { + "epoch": 0.47393364928909953, + "grad_norm": 0.05980097874999046, + "learning_rate": 7.088875766665879e-05, + "loss": 0.1299, + "step": 8900 + }, + { + "epoch": 0.4744661590074019, + "grad_norm": 0.048347923904657364, + "learning_rate": 7.082799282243881e-05, + "loss": 0.1302, + "step": 8910 + }, + { + "epoch": 0.47499866872570423, + "grad_norm": 0.0524616502225399, + "learning_rate": 7.076720990527324e-05, + "loss": 0.1301, + "step": 8920 + }, + { + "epoch": 0.4755311784440066, + "grad_norm": 0.06477531045675278, + "learning_rate": 7.070640910365344e-05, + "loss": 0.1306, + "step": 8930 + }, + { + "epoch": 0.476063688162309, + "grad_norm": 0.05950429290533066, + "learning_rate": 7.064559060612625e-05, + "loss": 0.13, + "step": 8940 + }, + { + "epoch": 0.4765961978806113, + "grad_norm": 0.0458899661898613, + "learning_rate": 7.058475460129337e-05, + "loss": 0.1299, + "step": 8950 + }, + { + "epoch": 0.4771287075989137, + "grad_norm": 0.04977622628211975, + "learning_rate": 7.05239012778108e-05, + "loss": 0.1297, + "step": 8960 + }, + { + "epoch": 0.47766121731721606, + "grad_norm": 0.052012983709573746, + "learning_rate": 7.046303082438823e-05, + "loss": 0.1304, + "step": 8970 + }, + { + "epoch": 0.4781937270355184, + "grad_norm": 0.09166349470615387, + "learning_rate": 7.040214342978851e-05, + "loss": 0.1303, + "step": 8980 + }, + { + "epoch": 0.47872623675382076, + "grad_norm": 0.057922665029764175, + "learning_rate": 7.034123928282699e-05, + "loss": 0.1292, + "step": 8990 + }, + { + "epoch": 0.47925874647212313, + "grad_norm": 0.05284808203577995, + "learning_rate": 7.028031857237098e-05, + "loss": 0.1299, + "step": 9000 + }, + { + "epoch": 0.47979125619042545, + "grad_norm": 0.05781892314553261, + "learning_rate": 7.021938148733918e-05, + "loss": 0.1304, + "step": 9010 + }, + { + "epoch": 0.48032376590872783, + "grad_norm": 0.04390615597367287, + "learning_rate": 7.0158428216701e-05, + "loss": 0.1295, + "step": 9020 + }, + { + "epoch": 0.4808562756270302, + "grad_norm": 0.06015874817967415, + "learning_rate": 7.009745894947612e-05, + "loss": 0.1299, + "step": 9030 + }, + { + "epoch": 0.4813887853453325, + "grad_norm": 0.17922475934028625, + "learning_rate": 7.003647387473378e-05, + "loss": 0.1299, + "step": 9040 + }, + { + "epoch": 0.4819212950636349, + "grad_norm": 0.07132676243782043, + "learning_rate": 6.997547318159225e-05, + "loss": 0.1304, + "step": 9050 + }, + { + "epoch": 0.4824538047819373, + "grad_norm": 0.06266484409570694, + "learning_rate": 6.991445705921825e-05, + "loss": 0.1302, + "step": 9060 + }, + { + "epoch": 0.48298631450023966, + "grad_norm": 0.04912625625729561, + "learning_rate": 6.985342569682632e-05, + "loss": 0.1299, + "step": 9070 + }, + { + "epoch": 0.483518824218542, + "grad_norm": 0.05088292434811592, + "learning_rate": 6.979237928367827e-05, + "loss": 0.1298, + "step": 9080 + }, + { + "epoch": 0.48405133393684435, + "grad_norm": 0.09333918988704681, + "learning_rate": 6.973131800908262e-05, + "loss": 0.13, + "step": 9090 + }, + { + "epoch": 0.48458384365514673, + "grad_norm": 0.05258602276444435, + "learning_rate": 6.967024206239392e-05, + "loss": 0.1292, + "step": 9100 + }, + { + "epoch": 0.48511635337344905, + "grad_norm": 0.05117359384894371, + "learning_rate": 6.960915163301222e-05, + "loss": 0.1298, + "step": 9110 + }, + { + "epoch": 0.4856488630917514, + "grad_norm": 0.0650695338845253, + "learning_rate": 6.954804691038255e-05, + "loss": 0.1302, + "step": 9120 + }, + { + "epoch": 0.4861813728100538, + "grad_norm": 0.07531211525201797, + "learning_rate": 6.948692808399417e-05, + "loss": 0.129, + "step": 9130 + }, + { + "epoch": 0.4867138825283561, + "grad_norm": 0.0522490069270134, + "learning_rate": 6.942579534338018e-05, + "loss": 0.1302, + "step": 9140 + }, + { + "epoch": 0.4872463922466585, + "grad_norm": 0.0909682959318161, + "learning_rate": 6.93646488781167e-05, + "loss": 0.1288, + "step": 9150 + }, + { + "epoch": 0.4877789019649609, + "grad_norm": 0.0672360509634018, + "learning_rate": 6.930348887782257e-05, + "loss": 0.1298, + "step": 9160 + }, + { + "epoch": 0.4883114116832632, + "grad_norm": 0.050222091376781464, + "learning_rate": 6.924231553215845e-05, + "loss": 0.1291, + "step": 9170 + }, + { + "epoch": 0.4888439214015656, + "grad_norm": 0.0731450617313385, + "learning_rate": 6.918112903082648e-05, + "loss": 0.1295, + "step": 9180 + }, + { + "epoch": 0.48937643111986795, + "grad_norm": 0.044536300003528595, + "learning_rate": 6.911992956356958e-05, + "loss": 0.1296, + "step": 9190 + }, + { + "epoch": 0.48990894083817027, + "grad_norm": 0.10119880735874176, + "learning_rate": 6.905871732017083e-05, + "loss": 0.1297, + "step": 9200 + }, + { + "epoch": 0.49044145055647265, + "grad_norm": 0.04427400976419449, + "learning_rate": 6.8997492490453e-05, + "loss": 0.1296, + "step": 9210 + }, + { + "epoch": 0.490973960274775, + "grad_norm": 0.05631903558969498, + "learning_rate": 6.893625526427785e-05, + "loss": 0.1294, + "step": 9220 + }, + { + "epoch": 0.4915064699930774, + "grad_norm": 0.05250485986471176, + "learning_rate": 6.88750058315456e-05, + "loss": 0.1288, + "step": 9230 + }, + { + "epoch": 0.4920389797113797, + "grad_norm": 0.04813829064369202, + "learning_rate": 6.881374438219426e-05, + "loss": 0.1299, + "step": 9240 + }, + { + "epoch": 0.4925714894296821, + "grad_norm": 0.10428118705749512, + "learning_rate": 6.875247110619923e-05, + "loss": 0.1293, + "step": 9250 + }, + { + "epoch": 0.4931039991479845, + "grad_norm": 0.05188250541687012, + "learning_rate": 6.869118619357244e-05, + "loss": 0.1298, + "step": 9260 + }, + { + "epoch": 0.4936365088662868, + "grad_norm": 0.06389789283275604, + "learning_rate": 6.862988983436205e-05, + "loss": 0.1297, + "step": 9270 + }, + { + "epoch": 0.49416901858458917, + "grad_norm": 0.05871303752064705, + "learning_rate": 6.856858221865158e-05, + "loss": 0.1296, + "step": 9280 + }, + { + "epoch": 0.49470152830289155, + "grad_norm": 0.09698927402496338, + "learning_rate": 6.850726353655956e-05, + "loss": 0.13, + "step": 9290 + }, + { + "epoch": 0.49523403802119387, + "grad_norm": 0.0667075663805008, + "learning_rate": 6.844593397823881e-05, + "loss": 0.1294, + "step": 9300 + }, + { + "epoch": 0.49576654773949624, + "grad_norm": 0.05773301422595978, + "learning_rate": 6.838459373387583e-05, + "loss": 0.1294, + "step": 9310 + }, + { + "epoch": 0.4962990574577986, + "grad_norm": 0.06608272343873978, + "learning_rate": 6.83232429936903e-05, + "loss": 0.1293, + "step": 9320 + }, + { + "epoch": 0.49683156717610094, + "grad_norm": 0.057207848876714706, + "learning_rate": 6.826188194793447e-05, + "loss": 0.1291, + "step": 9330 + }, + { + "epoch": 0.4973640768944033, + "grad_norm": 0.03619164600968361, + "learning_rate": 6.82005107868925e-05, + "loss": 0.1294, + "step": 9340 + }, + { + "epoch": 0.4978965866127057, + "grad_norm": 0.046284269541502, + "learning_rate": 6.813912970087994e-05, + "loss": 0.1294, + "step": 9350 + }, + { + "epoch": 0.498429096331008, + "grad_norm": 0.0476924329996109, + "learning_rate": 6.807773888024314e-05, + "loss": 0.1288, + "step": 9360 + }, + { + "epoch": 0.4989616060493104, + "grad_norm": 0.06622269749641418, + "learning_rate": 6.801633851535857e-05, + "loss": 0.1288, + "step": 9370 + }, + { + "epoch": 0.49949411576761277, + "grad_norm": 0.042118556797504425, + "learning_rate": 6.795492879663237e-05, + "loss": 0.1285, + "step": 9380 + }, + { + "epoch": 0.5000266254859151, + "grad_norm": 0.044616151601076126, + "learning_rate": 6.789350991449966e-05, + "loss": 0.1282, + "step": 9390 + }, + { + "epoch": 0.5005591352042175, + "grad_norm": 0.053620822727680206, + "learning_rate": 6.783208205942399e-05, + "loss": 0.1288, + "step": 9400 + }, + { + "epoch": 0.5010916449225198, + "grad_norm": 0.05040338635444641, + "learning_rate": 6.777064542189668e-05, + "loss": 0.1294, + "step": 9410 + }, + { + "epoch": 0.5016241546408222, + "grad_norm": 0.07730654627084732, + "learning_rate": 6.770920019243636e-05, + "loss": 0.1291, + "step": 9420 + }, + { + "epoch": 0.5021566643591245, + "grad_norm": 0.04234246164560318, + "learning_rate": 6.764774656158825e-05, + "loss": 0.1291, + "step": 9430 + }, + { + "epoch": 0.5026891740774269, + "grad_norm": 0.04666012525558472, + "learning_rate": 6.758628471992365e-05, + "loss": 0.1286, + "step": 9440 + }, + { + "epoch": 0.5032216837957293, + "grad_norm": 0.043177202343940735, + "learning_rate": 6.752481485803933e-05, + "loss": 0.1287, + "step": 9450 + }, + { + "epoch": 0.5037541935140316, + "grad_norm": 0.05593249201774597, + "learning_rate": 6.746333716655691e-05, + "loss": 0.1296, + "step": 9460 + }, + { + "epoch": 0.504286703232334, + "grad_norm": 0.050741735845804214, + "learning_rate": 6.740185183612227e-05, + "loss": 0.1286, + "step": 9470 + }, + { + "epoch": 0.5048192129506364, + "grad_norm": 0.08293752372264862, + "learning_rate": 6.734035905740504e-05, + "loss": 0.1289, + "step": 9480 + }, + { + "epoch": 0.5053517226689387, + "grad_norm": 0.11741827428340912, + "learning_rate": 6.727885902109785e-05, + "loss": 0.1286, + "step": 9490 + }, + { + "epoch": 0.5058842323872411, + "grad_norm": 0.05878937989473343, + "learning_rate": 6.7217351917916e-05, + "loss": 0.1288, + "step": 9500 + }, + { + "epoch": 0.5064167421055434, + "grad_norm": 0.04729843512177467, + "learning_rate": 6.715583793859652e-05, + "loss": 0.1291, + "step": 9510 + }, + { + "epoch": 0.5069492518238458, + "grad_norm": 0.04623175784945488, + "learning_rate": 6.709431727389789e-05, + "loss": 0.1275, + "step": 9520 + }, + { + "epoch": 0.5074817615421482, + "grad_norm": 0.047292426228523254, + "learning_rate": 6.703279011459927e-05, + "loss": 0.1285, + "step": 9530 + }, + { + "epoch": 0.5080142712604505, + "grad_norm": 0.04683827981352806, + "learning_rate": 6.697125665149993e-05, + "loss": 0.1283, + "step": 9540 + }, + { + "epoch": 0.5085467809787528, + "grad_norm": 0.08465840667486191, + "learning_rate": 6.69097170754188e-05, + "loss": 0.1289, + "step": 9550 + }, + { + "epoch": 0.5090792906970553, + "grad_norm": 0.04499583691358566, + "learning_rate": 6.684817157719364e-05, + "loss": 0.1295, + "step": 9560 + }, + { + "epoch": 0.5096118004153576, + "grad_norm": 0.05609264224767685, + "learning_rate": 6.678662034768063e-05, + "loss": 0.1291, + "step": 9570 + }, + { + "epoch": 0.5101443101336599, + "grad_norm": 0.04982760548591614, + "learning_rate": 6.672506357775375e-05, + "loss": 0.1287, + "step": 9580 + }, + { + "epoch": 0.5106768198519623, + "grad_norm": 0.04551566392183304, + "learning_rate": 6.666350145830413e-05, + "loss": 0.1287, + "step": 9590 + }, + { + "epoch": 0.5112093295702647, + "grad_norm": 0.06523692607879639, + "learning_rate": 6.660193418023947e-05, + "loss": 0.1289, + "step": 9600 + }, + { + "epoch": 0.511741839288567, + "grad_norm": 0.09148914366960526, + "learning_rate": 6.654036193448349e-05, + "loss": 0.1285, + "step": 9610 + }, + { + "epoch": 0.5122743490068694, + "grad_norm": 0.040613338351249695, + "learning_rate": 6.647878491197535e-05, + "loss": 0.1281, + "step": 9620 + }, + { + "epoch": 0.5128068587251717, + "grad_norm": 0.06947502493858337, + "learning_rate": 6.641720330366894e-05, + "loss": 0.1281, + "step": 9630 + }, + { + "epoch": 0.513339368443474, + "grad_norm": 0.060511503368616104, + "learning_rate": 6.635561730053245e-05, + "loss": 0.1285, + "step": 9640 + }, + { + "epoch": 0.5138718781617765, + "grad_norm": 0.06579563021659851, + "learning_rate": 6.629402709354766e-05, + "loss": 0.1282, + "step": 9650 + }, + { + "epoch": 0.5144043878800788, + "grad_norm": 0.055754803121089935, + "learning_rate": 6.62324328737094e-05, + "loss": 0.1284, + "step": 9660 + }, + { + "epoch": 0.5149368975983811, + "grad_norm": 0.05002092942595482, + "learning_rate": 6.617083483202493e-05, + "loss": 0.1295, + "step": 9670 + }, + { + "epoch": 0.5154694073166836, + "grad_norm": 0.03860372677445412, + "learning_rate": 6.610923315951336e-05, + "loss": 0.1284, + "step": 9680 + }, + { + "epoch": 0.5160019170349859, + "grad_norm": 0.08359445631504059, + "learning_rate": 6.604762804720508e-05, + "loss": 0.1285, + "step": 9690 + }, + { + "epoch": 0.5165344267532882, + "grad_norm": 0.05430614575743675, + "learning_rate": 6.598601968614115e-05, + "loss": 0.1283, + "step": 9700 + }, + { + "epoch": 0.5170669364715906, + "grad_norm": 0.05833563208580017, + "learning_rate": 6.592440826737266e-05, + "loss": 0.1289, + "step": 9710 + }, + { + "epoch": 0.517599446189893, + "grad_norm": 0.05940975248813629, + "learning_rate": 6.586279398196023e-05, + "loss": 0.1284, + "step": 9720 + }, + { + "epoch": 0.5181319559081953, + "grad_norm": 0.0453868992626667, + "learning_rate": 6.580117702097332e-05, + "loss": 0.1288, + "step": 9730 + }, + { + "epoch": 0.5186644656264977, + "grad_norm": 0.042583536356687546, + "learning_rate": 6.57395575754898e-05, + "loss": 0.1293, + "step": 9740 + }, + { + "epoch": 0.5191969753448, + "grad_norm": 0.05306556820869446, + "learning_rate": 6.567793583659507e-05, + "loss": 0.128, + "step": 9750 + }, + { + "epoch": 0.5197294850631023, + "grad_norm": 0.04358596354722977, + "learning_rate": 6.561631199538179e-05, + "loss": 0.129, + "step": 9760 + }, + { + "epoch": 0.5202619947814048, + "grad_norm": 0.11662445962429047, + "learning_rate": 6.555468624294907e-05, + "loss": 0.1276, + "step": 9770 + }, + { + "epoch": 0.5207945044997071, + "grad_norm": 0.050507139414548874, + "learning_rate": 6.549305877040199e-05, + "loss": 0.1291, + "step": 9780 + }, + { + "epoch": 0.5213270142180095, + "grad_norm": 0.059976451098918915, + "learning_rate": 6.543142976885088e-05, + "loss": 0.1279, + "step": 9790 + }, + { + "epoch": 0.5218595239363119, + "grad_norm": 0.04601925238966942, + "learning_rate": 6.536979942941091e-05, + "loss": 0.1288, + "step": 9800 + }, + { + "epoch": 0.5223920336546142, + "grad_norm": 0.05751890689134598, + "learning_rate": 6.530816794320134e-05, + "loss": 0.1283, + "step": 9810 + }, + { + "epoch": 0.5229245433729166, + "grad_norm": 0.05591721832752228, + "learning_rate": 6.524653550134501e-05, + "loss": 0.1287, + "step": 9820 + }, + { + "epoch": 0.5234570530912189, + "grad_norm": 0.05766240507364273, + "learning_rate": 6.518490229496772e-05, + "loss": 0.1285, + "step": 9830 + }, + { + "epoch": 0.5239895628095212, + "grad_norm": 0.054135777056217194, + "learning_rate": 6.512326851519762e-05, + "loss": 0.1287, + "step": 9840 + }, + { + "epoch": 0.5245220725278237, + "grad_norm": 0.04491560161113739, + "learning_rate": 6.506163435316468e-05, + "loss": 0.1276, + "step": 9850 + }, + { + "epoch": 0.525054582246126, + "grad_norm": 0.044994186609983444, + "learning_rate": 6.5e-05, + "loss": 0.1286, + "step": 9860 + }, + { + "epoch": 0.5255870919644283, + "grad_norm": 0.0446479506790638, + "learning_rate": 6.493836564683533e-05, + "loss": 0.1286, + "step": 9870 + }, + { + "epoch": 0.5261196016827308, + "grad_norm": 0.06419171392917633, + "learning_rate": 6.48767314848024e-05, + "loss": 0.1283, + "step": 9880 + }, + { + "epoch": 0.5266521114010331, + "grad_norm": 0.041707735508680344, + "learning_rate": 6.481509770503229e-05, + "loss": 0.1275, + "step": 9890 + }, + { + "epoch": 0.5271846211193354, + "grad_norm": 0.08214934170246124, + "learning_rate": 6.475346449865499e-05, + "loss": 0.1287, + "step": 9900 + }, + { + "epoch": 0.5277171308376378, + "grad_norm": 0.09313659369945526, + "learning_rate": 6.469183205679865e-05, + "loss": 0.1277, + "step": 9910 + }, + { + "epoch": 0.5282496405559401, + "grad_norm": 0.05460633337497711, + "learning_rate": 6.46302005705891e-05, + "loss": 0.1287, + "step": 9920 + }, + { + "epoch": 0.5287821502742425, + "grad_norm": 0.0486149825155735, + "learning_rate": 6.456857023114913e-05, + "loss": 0.1276, + "step": 9930 + }, + { + "epoch": 0.5293146599925449, + "grad_norm": 0.04761586710810661, + "learning_rate": 6.450694122959801e-05, + "loss": 0.1287, + "step": 9940 + }, + { + "epoch": 0.5298471697108472, + "grad_norm": 0.04752049222588539, + "learning_rate": 6.444531375705092e-05, + "loss": 0.1285, + "step": 9950 + }, + { + "epoch": 0.5303796794291495, + "grad_norm": 0.06729278713464737, + "learning_rate": 6.438368800461821e-05, + "loss": 0.1286, + "step": 9960 + }, + { + "epoch": 0.530912189147452, + "grad_norm": 0.04480813071131706, + "learning_rate": 6.432206416340492e-05, + "loss": 0.1284, + "step": 9970 + }, + { + "epoch": 0.5314446988657543, + "grad_norm": 0.040219422429800034, + "learning_rate": 6.426044242451022e-05, + "loss": 0.1287, + "step": 9980 + }, + { + "epoch": 0.5319772085840566, + "grad_norm": 0.03565455228090286, + "learning_rate": 6.419882297902667e-05, + "loss": 0.1284, + "step": 9990 + }, + { + "epoch": 0.532509718302359, + "grad_norm": 0.05310383439064026, + "learning_rate": 6.413720601803979e-05, + "loss": 0.1285, + "step": 10000 + }, + { + "epoch": 0.5330422280206614, + "grad_norm": 0.07043947279453278, + "learning_rate": 6.407559173262735e-05, + "loss": 0.1283, + "step": 10010 + }, + { + "epoch": 0.5335747377389637, + "grad_norm": 0.04902435466647148, + "learning_rate": 6.401398031385886e-05, + "loss": 0.1276, + "step": 10020 + }, + { + "epoch": 0.5341072474572661, + "grad_norm": 0.05668781325221062, + "learning_rate": 6.395237195279491e-05, + "loss": 0.1283, + "step": 10030 + }, + { + "epoch": 0.5346397571755684, + "grad_norm": 0.05145740881562233, + "learning_rate": 6.389076684048664e-05, + "loss": 0.1264, + "step": 10040 + }, + { + "epoch": 0.5351722668938708, + "grad_norm": 0.1139606162905693, + "learning_rate": 6.382916516797508e-05, + "loss": 0.1283, + "step": 10050 + }, + { + "epoch": 0.5357047766121732, + "grad_norm": 0.05299168825149536, + "learning_rate": 6.376756712629059e-05, + "loss": 0.1281, + "step": 10060 + }, + { + "epoch": 0.5362372863304755, + "grad_norm": 0.06942315399646759, + "learning_rate": 6.370597290645234e-05, + "loss": 0.1281, + "step": 10070 + }, + { + "epoch": 0.5367697960487778, + "grad_norm": 0.07276537269353867, + "learning_rate": 6.364438269946755e-05, + "loss": 0.1284, + "step": 10080 + }, + { + "epoch": 0.5373023057670803, + "grad_norm": 0.043881241232156754, + "learning_rate": 6.358279669633106e-05, + "loss": 0.1275, + "step": 10090 + }, + { + "epoch": 0.5378348154853826, + "grad_norm": 0.047917068004608154, + "learning_rate": 6.352121508802467e-05, + "loss": 0.1282, + "step": 10100 + }, + { + "epoch": 0.538367325203685, + "grad_norm": 0.04417307674884796, + "learning_rate": 6.345963806551651e-05, + "loss": 0.1281, + "step": 10110 + }, + { + "epoch": 0.5388998349219873, + "grad_norm": 0.053708259016275406, + "learning_rate": 6.339806581976055e-05, + "loss": 0.1286, + "step": 10120 + }, + { + "epoch": 0.5394323446402897, + "grad_norm": 0.05327571928501129, + "learning_rate": 6.333649854169587e-05, + "loss": 0.1284, + "step": 10130 + }, + { + "epoch": 0.5399648543585921, + "grad_norm": 0.062333524227142334, + "learning_rate": 6.327493642224624e-05, + "loss": 0.1281, + "step": 10140 + }, + { + "epoch": 0.5404973640768944, + "grad_norm": 0.04436059668660164, + "learning_rate": 6.321337965231937e-05, + "loss": 0.1278, + "step": 10150 + }, + { + "epoch": 0.5410298737951967, + "grad_norm": 0.07489614933729172, + "learning_rate": 6.315182842280638e-05, + "loss": 0.1284, + "step": 10160 + }, + { + "epoch": 0.5415623835134992, + "grad_norm": 0.06262974441051483, + "learning_rate": 6.309028292458122e-05, + "loss": 0.1269, + "step": 10170 + }, + { + "epoch": 0.5420948932318015, + "grad_norm": 0.08940589427947998, + "learning_rate": 6.302874334850006e-05, + "loss": 0.128, + "step": 10180 + }, + { + "epoch": 0.5426274029501038, + "grad_norm": 0.039577096700668335, + "learning_rate": 6.296720988540075e-05, + "loss": 0.1273, + "step": 10190 + }, + { + "epoch": 0.5431599126684062, + "grad_norm": 0.05988942086696625, + "learning_rate": 6.290568272610211e-05, + "loss": 0.1276, + "step": 10200 + }, + { + "epoch": 0.5436924223867086, + "grad_norm": 0.047797802835702896, + "learning_rate": 6.284416206140348e-05, + "loss": 0.1278, + "step": 10210 + }, + { + "epoch": 0.5442249321050109, + "grad_norm": 0.05901528522372246, + "learning_rate": 6.278264808208402e-05, + "loss": 0.1283, + "step": 10220 + }, + { + "epoch": 0.5447574418233133, + "grad_norm": 0.10273321717977524, + "learning_rate": 6.272114097890213e-05, + "loss": 0.1279, + "step": 10230 + }, + { + "epoch": 0.5452899515416156, + "grad_norm": 0.07229287177324295, + "learning_rate": 6.265964094259498e-05, + "loss": 0.1283, + "step": 10240 + }, + { + "epoch": 0.545822461259918, + "grad_norm": 0.04700973257422447, + "learning_rate": 6.259814816387775e-05, + "loss": 0.1276, + "step": 10250 + }, + { + "epoch": 0.5463549709782204, + "grad_norm": 0.10428871214389801, + "learning_rate": 6.25366628334431e-05, + "loss": 0.1285, + "step": 10260 + }, + { + "epoch": 0.5468874806965227, + "grad_norm": 0.048143282532691956, + "learning_rate": 6.247518514196067e-05, + "loss": 0.1275, + "step": 10270 + }, + { + "epoch": 0.547419990414825, + "grad_norm": 0.054553814232349396, + "learning_rate": 6.241371528007634e-05, + "loss": 0.1281, + "step": 10280 + }, + { + "epoch": 0.5479525001331275, + "grad_norm": 0.07188910245895386, + "learning_rate": 6.235225343841174e-05, + "loss": 0.1276, + "step": 10290 + }, + { + "epoch": 0.5484850098514298, + "grad_norm": 0.05446217581629753, + "learning_rate": 6.229079980756365e-05, + "loss": 0.1283, + "step": 10300 + }, + { + "epoch": 0.5490175195697321, + "grad_norm": 0.041187744587659836, + "learning_rate": 6.222935457810333e-05, + "loss": 0.1277, + "step": 10310 + }, + { + "epoch": 0.5495500292880345, + "grad_norm": 0.05523503199219704, + "learning_rate": 6.216791794057601e-05, + "loss": 0.1276, + "step": 10320 + }, + { + "epoch": 0.5500825390063369, + "grad_norm": 0.044267792254686356, + "learning_rate": 6.210649008550033e-05, + "loss": 0.1279, + "step": 10330 + }, + { + "epoch": 0.5506150487246392, + "grad_norm": 0.04887842759490013, + "learning_rate": 6.204507120336764e-05, + "loss": 0.128, + "step": 10340 + }, + { + "epoch": 0.5511475584429416, + "grad_norm": 0.04334214702248573, + "learning_rate": 6.198366148464143e-05, + "loss": 0.1276, + "step": 10350 + }, + { + "epoch": 0.5516800681612439, + "grad_norm": 0.05289037153124809, + "learning_rate": 6.192226111975687e-05, + "loss": 0.1275, + "step": 10360 + }, + { + "epoch": 0.5522125778795463, + "grad_norm": 0.05000938102602959, + "learning_rate": 6.186087029912005e-05, + "loss": 0.1277, + "step": 10370 + }, + { + "epoch": 0.5527450875978487, + "grad_norm": 0.05687737837433815, + "learning_rate": 6.179948921310749e-05, + "loss": 0.1282, + "step": 10380 + }, + { + "epoch": 0.553277597316151, + "grad_norm": 0.040263786911964417, + "learning_rate": 6.173811805206553e-05, + "loss": 0.1275, + "step": 10390 + }, + { + "epoch": 0.5538101070344533, + "grad_norm": 0.056092556565999985, + "learning_rate": 6.16767570063097e-05, + "loss": 0.1272, + "step": 10400 + }, + { + "epoch": 0.5543426167527558, + "grad_norm": 0.04456920921802521, + "learning_rate": 6.161540626612419e-05, + "loss": 0.1271, + "step": 10410 + }, + { + "epoch": 0.5548751264710581, + "grad_norm": 0.05201718211174011, + "learning_rate": 6.15540660217612e-05, + "loss": 0.1273, + "step": 10420 + }, + { + "epoch": 0.5554076361893605, + "grad_norm": 0.045165225863456726, + "learning_rate": 6.149273646344044e-05, + "loss": 0.1271, + "step": 10430 + }, + { + "epoch": 0.5559401459076628, + "grad_norm": 0.07475852966308594, + "learning_rate": 6.14314177813484e-05, + "loss": 0.128, + "step": 10440 + }, + { + "epoch": 0.5564726556259652, + "grad_norm": 0.08229029923677444, + "learning_rate": 6.137011016563797e-05, + "loss": 0.1277, + "step": 10450 + }, + { + "epoch": 0.5570051653442676, + "grad_norm": 0.09118565171957016, + "learning_rate": 6.130881380642755e-05, + "loss": 0.1277, + "step": 10460 + }, + { + "epoch": 0.5575376750625699, + "grad_norm": 0.04762515053153038, + "learning_rate": 6.124752889380079e-05, + "loss": 0.1275, + "step": 10470 + }, + { + "epoch": 0.5580701847808722, + "grad_norm": 0.03888937830924988, + "learning_rate": 6.118625561780574e-05, + "loss": 0.1275, + "step": 10480 + }, + { + "epoch": 0.5586026944991747, + "grad_norm": 0.04357834532856941, + "learning_rate": 6.112499416845443e-05, + "loss": 0.1279, + "step": 10490 + }, + { + "epoch": 0.559135204217477, + "grad_norm": 0.06639399379491806, + "learning_rate": 6.106374473572216e-05, + "loss": 0.128, + "step": 10500 + }, + { + "epoch": 0.5596677139357793, + "grad_norm": 0.051041729748249054, + "learning_rate": 6.100250750954699e-05, + "loss": 0.128, + "step": 10510 + }, + { + "epoch": 0.5602002236540817, + "grad_norm": 0.08065960556268692, + "learning_rate": 6.094128267982916e-05, + "loss": 0.1275, + "step": 10520 + }, + { + "epoch": 0.560732733372384, + "grad_norm": 0.04977899789810181, + "learning_rate": 6.0880070436430424e-05, + "loss": 0.1283, + "step": 10530 + }, + { + "epoch": 0.5612652430906864, + "grad_norm": 0.04065399989485741, + "learning_rate": 6.081887096917351e-05, + "loss": 0.1272, + "step": 10540 + }, + { + "epoch": 0.5617977528089888, + "grad_norm": 0.05486559495329857, + "learning_rate": 6.075768446784154e-05, + "loss": 0.1273, + "step": 10550 + }, + { + "epoch": 0.5623302625272911, + "grad_norm": 0.053763121366500854, + "learning_rate": 6.0696511122177436e-05, + "loss": 0.1275, + "step": 10560 + }, + { + "epoch": 0.5628627722455934, + "grad_norm": 0.049751050770282745, + "learning_rate": 6.063535112188329e-05, + "loss": 0.1263, + "step": 10570 + }, + { + "epoch": 0.5633952819638959, + "grad_norm": 0.036564771085977554, + "learning_rate": 6.057420465661982e-05, + "loss": 0.128, + "step": 10580 + }, + { + "epoch": 0.5639277916821982, + "grad_norm": 0.05281112715601921, + "learning_rate": 6.051307191600581e-05, + "loss": 0.1269, + "step": 10590 + }, + { + "epoch": 0.5644603014005005, + "grad_norm": 0.053066980093717575, + "learning_rate": 6.045195308961746e-05, + "loss": 0.1269, + "step": 10600 + }, + { + "epoch": 0.564992811118803, + "grad_norm": 0.05914291366934776, + "learning_rate": 6.039084836698779e-05, + "loss": 0.1275, + "step": 10610 + }, + { + "epoch": 0.5655253208371053, + "grad_norm": 0.06061727926135063, + "learning_rate": 6.032975793760609e-05, + "loss": 0.1274, + "step": 10620 + }, + { + "epoch": 0.5660578305554076, + "grad_norm": 0.06171563267707825, + "learning_rate": 6.026868199091737e-05, + "loss": 0.1273, + "step": 10630 + }, + { + "epoch": 0.56659034027371, + "grad_norm": 0.07542983442544937, + "learning_rate": 6.020762071632172e-05, + "loss": 0.1276, + "step": 10640 + }, + { + "epoch": 0.5671228499920123, + "grad_norm": 0.04028952494263649, + "learning_rate": 6.014657430317368e-05, + "loss": 0.1267, + "step": 10650 + }, + { + "epoch": 0.5676553597103147, + "grad_norm": 0.05201804265379906, + "learning_rate": 6.0085542940781755e-05, + "loss": 0.1273, + "step": 10660 + }, + { + "epoch": 0.5681878694286171, + "grad_norm": 0.047102462500333786, + "learning_rate": 6.0024526818407745e-05, + "loss": 0.1267, + "step": 10670 + }, + { + "epoch": 0.5687203791469194, + "grad_norm": 0.06168229877948761, + "learning_rate": 5.996352612526623e-05, + "loss": 0.1272, + "step": 10680 + }, + { + "epoch": 0.5692528888652217, + "grad_norm": 0.05583483725786209, + "learning_rate": 5.9902541050523886e-05, + "loss": 0.1278, + "step": 10690 + }, + { + "epoch": 0.5697853985835242, + "grad_norm": 0.06403730064630508, + "learning_rate": 5.9841571783299e-05, + "loss": 0.1273, + "step": 10700 + }, + { + "epoch": 0.5703179083018265, + "grad_norm": 0.042866677045822144, + "learning_rate": 5.9780618512660834e-05, + "loss": 0.1275, + "step": 10710 + }, + { + "epoch": 0.5708504180201288, + "grad_norm": 0.05416185408830643, + "learning_rate": 5.971968142762903e-05, + "loss": 0.1276, + "step": 10720 + }, + { + "epoch": 0.5713829277384312, + "grad_norm": 0.04284673184156418, + "learning_rate": 5.9658760717173e-05, + "loss": 0.1276, + "step": 10730 + }, + { + "epoch": 0.5719154374567336, + "grad_norm": 0.050528384745121, + "learning_rate": 5.959785657021149e-05, + "loss": 0.1272, + "step": 10740 + }, + { + "epoch": 0.572447947175036, + "grad_norm": 0.0541527085006237, + "learning_rate": 5.953696917561178e-05, + "loss": 0.1272, + "step": 10750 + }, + { + "epoch": 0.5729804568933383, + "grad_norm": 0.04789347946643829, + "learning_rate": 5.947609872218922e-05, + "loss": 0.1276, + "step": 10760 + }, + { + "epoch": 0.5735129666116406, + "grad_norm": 0.04346901550889015, + "learning_rate": 5.9415245398706645e-05, + "loss": 0.1273, + "step": 10770 + }, + { + "epoch": 0.5740454763299431, + "grad_norm": 0.03955162316560745, + "learning_rate": 5.9354409393873756e-05, + "loss": 0.1271, + "step": 10780 + }, + { + "epoch": 0.5745779860482454, + "grad_norm": 0.05987564101815224, + "learning_rate": 5.929359089634657e-05, + "loss": 0.1269, + "step": 10790 + }, + { + "epoch": 0.5751104957665477, + "grad_norm": 0.0456010103225708, + "learning_rate": 5.923279009472678e-05, + "loss": 0.1269, + "step": 10800 + }, + { + "epoch": 0.5756430054848501, + "grad_norm": 0.062419842928647995, + "learning_rate": 5.9172007177561194e-05, + "loss": 0.1274, + "step": 10810 + }, + { + "epoch": 0.5761755152031525, + "grad_norm": 0.0384056381881237, + "learning_rate": 5.911124233334122e-05, + "loss": 0.1274, + "step": 10820 + }, + { + "epoch": 0.5767080249214548, + "grad_norm": 0.06525867432355881, + "learning_rate": 5.905049575050218e-05, + "loss": 0.1271, + "step": 10830 + }, + { + "epoch": 0.5772405346397572, + "grad_norm": 0.05695752054452896, + "learning_rate": 5.8989767617422744e-05, + "loss": 0.1279, + "step": 10840 + }, + { + "epoch": 0.5777730443580595, + "grad_norm": 0.05444275960326195, + "learning_rate": 5.8929058122424406e-05, + "loss": 0.1273, + "step": 10850 + }, + { + "epoch": 0.5783055540763619, + "grad_norm": 0.051563095301389694, + "learning_rate": 5.886836745377087e-05, + "loss": 0.1273, + "step": 10860 + }, + { + "epoch": 0.5788380637946643, + "grad_norm": 0.07469698041677475, + "learning_rate": 5.8807695799667416e-05, + "loss": 0.1272, + "step": 10870 + }, + { + "epoch": 0.5793705735129666, + "grad_norm": 0.049753960222005844, + "learning_rate": 5.874704334826038e-05, + "loss": 0.1276, + "step": 10880 + }, + { + "epoch": 0.5799030832312689, + "grad_norm": 0.04683075100183487, + "learning_rate": 5.8686410287636575e-05, + "loss": 0.1273, + "step": 10890 + }, + { + "epoch": 0.5804355929495714, + "grad_norm": 0.04198311269283295, + "learning_rate": 5.862579680582263e-05, + "loss": 0.1277, + "step": 10900 + }, + { + "epoch": 0.5809681026678737, + "grad_norm": 0.045857448130846024, + "learning_rate": 5.8565203090784484e-05, + "loss": 0.1271, + "step": 10910 + }, + { + "epoch": 0.581500612386176, + "grad_norm": 0.040586717426776886, + "learning_rate": 5.8504629330426816e-05, + "loss": 0.1274, + "step": 10920 + }, + { + "epoch": 0.5820331221044784, + "grad_norm": 0.050801508128643036, + "learning_rate": 5.844407571259235e-05, + "loss": 0.127, + "step": 10930 + }, + { + "epoch": 0.5825656318227808, + "grad_norm": 0.050907645374536514, + "learning_rate": 5.8383542425061424e-05, + "loss": 0.1272, + "step": 10940 + }, + { + "epoch": 0.5830981415410831, + "grad_norm": 0.08759643882513046, + "learning_rate": 5.8323029655551266e-05, + "loss": 0.1267, + "step": 10950 + }, + { + "epoch": 0.5836306512593855, + "grad_norm": 0.05688736215233803, + "learning_rate": 5.8262537591715493e-05, + "loss": 0.1275, + "step": 10960 + }, + { + "epoch": 0.5841631609776878, + "grad_norm": 0.05095606669783592, + "learning_rate": 5.820206642114355e-05, + "loss": 0.1271, + "step": 10970 + }, + { + "epoch": 0.5846956706959902, + "grad_norm": 0.07440601289272308, + "learning_rate": 5.814161633136006e-05, + "loss": 0.1275, + "step": 10980 + }, + { + "epoch": 0.5852281804142926, + "grad_norm": 0.07147916406393051, + "learning_rate": 5.808118750982427e-05, + "loss": 0.1276, + "step": 10990 + }, + { + "epoch": 0.5857606901325949, + "grad_norm": 0.06379908323287964, + "learning_rate": 5.802078014392946e-05, + "loss": 0.1269, + "step": 11000 + }, + { + "epoch": 0.5862931998508972, + "grad_norm": 0.09271499514579773, + "learning_rate": 5.796039442100243e-05, + "loss": 0.126, + "step": 11010 + }, + { + "epoch": 0.5868257095691997, + "grad_norm": 0.04959186539053917, + "learning_rate": 5.7900030528302804e-05, + "loss": 0.1277, + "step": 11020 + }, + { + "epoch": 0.587358219287502, + "grad_norm": 0.03943556919693947, + "learning_rate": 5.783968865302254e-05, + "loss": 0.1264, + "step": 11030 + }, + { + "epoch": 0.5878907290058043, + "grad_norm": 0.04361870139837265, + "learning_rate": 5.777936898228531e-05, + "loss": 0.1276, + "step": 11040 + }, + { + "epoch": 0.5884232387241067, + "grad_norm": 0.0441637746989727, + "learning_rate": 5.771907170314593e-05, + "loss": 0.1274, + "step": 11050 + }, + { + "epoch": 0.588955748442409, + "grad_norm": 0.048377875238657, + "learning_rate": 5.7658797002589796e-05, + "loss": 0.1271, + "step": 11060 + }, + { + "epoch": 0.5894882581607115, + "grad_norm": 0.04017746075987816, + "learning_rate": 5.759854506753224e-05, + "loss": 0.1271, + "step": 11070 + }, + { + "epoch": 0.5900207678790138, + "grad_norm": 0.08903171867132187, + "learning_rate": 5.753831608481803e-05, + "loss": 0.1265, + "step": 11080 + }, + { + "epoch": 0.5905532775973161, + "grad_norm": 0.05974121764302254, + "learning_rate": 5.747811024122081e-05, + "loss": 0.127, + "step": 11090 + }, + { + "epoch": 0.5910857873156186, + "grad_norm": 0.04399004206061363, + "learning_rate": 5.741792772344232e-05, + "loss": 0.127, + "step": 11100 + }, + { + "epoch": 0.5916182970339209, + "grad_norm": 0.042719513177871704, + "learning_rate": 5.7357768718112114e-05, + "loss": 0.1271, + "step": 11110 + }, + { + "epoch": 0.5921508067522232, + "grad_norm": 0.04649467021226883, + "learning_rate": 5.729763341178678e-05, + "loss": 0.1275, + "step": 11120 + }, + { + "epoch": 0.5926833164705256, + "grad_norm": 0.061461612582206726, + "learning_rate": 5.723752199094938e-05, + "loss": 0.127, + "step": 11130 + }, + { + "epoch": 0.593215826188828, + "grad_norm": 0.09266576170921326, + "learning_rate": 5.717743464200895e-05, + "loss": 0.1276, + "step": 11140 + }, + { + "epoch": 0.5937483359071303, + "grad_norm": 0.039458803832530975, + "learning_rate": 5.711737155129987e-05, + "loss": 0.1273, + "step": 11150 + }, + { + "epoch": 0.5942808456254327, + "grad_norm": 0.0349600687623024, + "learning_rate": 5.7057332905081275e-05, + "loss": 0.127, + "step": 11160 + }, + { + "epoch": 0.594813355343735, + "grad_norm": 0.07518605887889862, + "learning_rate": 5.699731888953653e-05, + "loss": 0.1257, + "step": 11170 + }, + { + "epoch": 0.5953458650620373, + "grad_norm": 0.056736767292022705, + "learning_rate": 5.6937329690772554e-05, + "loss": 0.1275, + "step": 11180 + }, + { + "epoch": 0.5958783747803398, + "grad_norm": 0.0569731779396534, + "learning_rate": 5.687736549481939e-05, + "loss": 0.1269, + "step": 11190 + }, + { + "epoch": 0.5964108844986421, + "grad_norm": 0.06685450673103333, + "learning_rate": 5.681742648762949e-05, + "loss": 0.1275, + "step": 11200 + }, + { + "epoch": 0.5969433942169444, + "grad_norm": 0.05476146563887596, + "learning_rate": 5.675751285507722e-05, + "loss": 0.1266, + "step": 11210 + }, + { + "epoch": 0.5974759039352469, + "grad_norm": 0.04533839598298073, + "learning_rate": 5.6697624782958235e-05, + "loss": 0.1273, + "step": 11220 + }, + { + "epoch": 0.5980084136535492, + "grad_norm": 0.04754569008946419, + "learning_rate": 5.6637762456988943e-05, + "loss": 0.126, + "step": 11230 + }, + { + "epoch": 0.5985409233718515, + "grad_norm": 0.06229964643716812, + "learning_rate": 5.657792606280592e-05, + "loss": 0.126, + "step": 11240 + }, + { + "epoch": 0.5990734330901539, + "grad_norm": 0.057759564369916916, + "learning_rate": 5.65181157859653e-05, + "loss": 0.1274, + "step": 11250 + }, + { + "epoch": 0.5996059428084562, + "grad_norm": 0.05225152522325516, + "learning_rate": 5.6458331811942235e-05, + "loss": 0.1264, + "step": 11260 + }, + { + "epoch": 0.6001384525267586, + "grad_norm": 0.0388801135122776, + "learning_rate": 5.639857432613034e-05, + "loss": 0.127, + "step": 11270 + }, + { + "epoch": 0.600670962245061, + "grad_norm": 0.044373005628585815, + "learning_rate": 5.633884351384103e-05, + "loss": 0.1274, + "step": 11280 + }, + { + "epoch": 0.6012034719633633, + "grad_norm": 0.07331310957670212, + "learning_rate": 5.627913956030306e-05, + "loss": 0.1272, + "step": 11290 + }, + { + "epoch": 0.6017359816816656, + "grad_norm": 0.05038286745548248, + "learning_rate": 5.621946265066187e-05, + "loss": 0.1273, + "step": 11300 + }, + { + "epoch": 0.6022684913999681, + "grad_norm": 0.058994751423597336, + "learning_rate": 5.615981296997901e-05, + "loss": 0.1273, + "step": 11310 + }, + { + "epoch": 0.6028010011182704, + "grad_norm": 0.036481309682130814, + "learning_rate": 5.610019070323167e-05, + "loss": 0.1271, + "step": 11320 + }, + { + "epoch": 0.6033335108365727, + "grad_norm": 0.03966144099831581, + "learning_rate": 5.604059603531193e-05, + "loss": 0.127, + "step": 11330 + }, + { + "epoch": 0.6038660205548751, + "grad_norm": 0.055649157613515854, + "learning_rate": 5.5981029151026345e-05, + "loss": 0.1277, + "step": 11340 + }, + { + "epoch": 0.6043985302731775, + "grad_norm": 0.05199277400970459, + "learning_rate": 5.59214902350953e-05, + "loss": 0.1267, + "step": 11350 + }, + { + "epoch": 0.6049310399914798, + "grad_norm": 0.04140447452664375, + "learning_rate": 5.586197947215246e-05, + "loss": 0.1263, + "step": 11360 + }, + { + "epoch": 0.6054635497097822, + "grad_norm": 0.03502384573221207, + "learning_rate": 5.580249704674414e-05, + "loss": 0.1268, + "step": 11370 + }, + { + "epoch": 0.6059960594280845, + "grad_norm": 0.04866914451122284, + "learning_rate": 5.574304314332881e-05, + "loss": 0.1258, + "step": 11380 + }, + { + "epoch": 0.606528569146387, + "grad_norm": 0.039416830986738205, + "learning_rate": 5.568361794627652e-05, + "loss": 0.1266, + "step": 11390 + }, + { + "epoch": 0.6070610788646893, + "grad_norm": 0.04279816523194313, + "learning_rate": 5.562422163986827e-05, + "loss": 0.1265, + "step": 11400 + }, + { + "epoch": 0.6075935885829916, + "grad_norm": 0.08062811195850372, + "learning_rate": 5.556485440829544e-05, + "loss": 0.1266, + "step": 11410 + }, + { + "epoch": 0.608126098301294, + "grad_norm": 0.042986951768398285, + "learning_rate": 5.550551643565931e-05, + "loss": 0.1269, + "step": 11420 + }, + { + "epoch": 0.6086586080195964, + "grad_norm": 0.056119028478860855, + "learning_rate": 5.544620790597037e-05, + "loss": 0.1273, + "step": 11430 + }, + { + "epoch": 0.6091911177378987, + "grad_norm": 0.0498378686606884, + "learning_rate": 5.5386929003147835e-05, + "loss": 0.1273, + "step": 11440 + }, + { + "epoch": 0.6097236274562011, + "grad_norm": 0.04148755222558975, + "learning_rate": 5.5327679911019034e-05, + "loss": 0.1265, + "step": 11450 + }, + { + "epoch": 0.6102561371745034, + "grad_norm": 0.04891781508922577, + "learning_rate": 5.5268460813318866e-05, + "loss": 0.127, + "step": 11460 + }, + { + "epoch": 0.6107886468928058, + "grad_norm": 0.07420381903648376, + "learning_rate": 5.520927189368923e-05, + "loss": 0.1268, + "step": 11470 + }, + { + "epoch": 0.6113211566111082, + "grad_norm": 0.12143438309431076, + "learning_rate": 5.5150113335678365e-05, + "loss": 0.1273, + "step": 11480 + }, + { + "epoch": 0.6118536663294105, + "grad_norm": 0.045520998537540436, + "learning_rate": 5.509098532274044e-05, + "loss": 0.127, + "step": 11490 + }, + { + "epoch": 0.6123861760477128, + "grad_norm": 0.05857592076063156, + "learning_rate": 5.503188803823487e-05, + "loss": 0.1269, + "step": 11500 + }, + { + "epoch": 0.6129186857660153, + "grad_norm": 0.04991764947772026, + "learning_rate": 5.497282166542579e-05, + "loss": 0.1267, + "step": 11510 + }, + { + "epoch": 0.6134511954843176, + "grad_norm": 0.059020016342401505, + "learning_rate": 5.4913786387481426e-05, + "loss": 0.127, + "step": 11520 + }, + { + "epoch": 0.6139837052026199, + "grad_norm": 0.06219782307744026, + "learning_rate": 5.485478238747367e-05, + "loss": 0.1268, + "step": 11530 + }, + { + "epoch": 0.6145162149209223, + "grad_norm": 0.060054097324609756, + "learning_rate": 5.4795809848377323e-05, + "loss": 0.1266, + "step": 11540 + }, + { + "epoch": 0.6150487246392247, + "grad_norm": 0.05419805273413658, + "learning_rate": 5.473686895306971e-05, + "loss": 0.1269, + "step": 11550 + }, + { + "epoch": 0.615581234357527, + "grad_norm": 0.07475223392248154, + "learning_rate": 5.4677959884329944e-05, + "loss": 0.1264, + "step": 11560 + }, + { + "epoch": 0.6161137440758294, + "grad_norm": 0.052985042333602905, + "learning_rate": 5.4619082824838506e-05, + "loss": 0.1266, + "step": 11570 + }, + { + "epoch": 0.6166462537941317, + "grad_norm": 0.062309183180332184, + "learning_rate": 5.45602379571766e-05, + "loss": 0.1276, + "step": 11580 + }, + { + "epoch": 0.6171787635124341, + "grad_norm": 0.0704023614525795, + "learning_rate": 5.450142546382555e-05, + "loss": 0.126, + "step": 11590 + }, + { + "epoch": 0.6177112732307365, + "grad_norm": 0.04271765798330307, + "learning_rate": 5.444264552716636e-05, + "loss": 0.1267, + "step": 11600 + }, + { + "epoch": 0.6182437829490388, + "grad_norm": 0.03997405245900154, + "learning_rate": 5.438389832947903e-05, + "loss": 0.1267, + "step": 11610 + }, + { + "epoch": 0.6187762926673411, + "grad_norm": 0.06841737031936646, + "learning_rate": 5.432518405294208e-05, + "loss": 0.1259, + "step": 11620 + }, + { + "epoch": 0.6193088023856436, + "grad_norm": 0.07736402004957199, + "learning_rate": 5.426650287963186e-05, + "loss": 0.1277, + "step": 11630 + }, + { + "epoch": 0.6198413121039459, + "grad_norm": 0.05138285458087921, + "learning_rate": 5.4207854991522125e-05, + "loss": 0.1268, + "step": 11640 + }, + { + "epoch": 0.6203738218222482, + "grad_norm": 0.055199526250362396, + "learning_rate": 5.4149240570483394e-05, + "loss": 0.1258, + "step": 11650 + }, + { + "epoch": 0.6209063315405506, + "grad_norm": 0.0480291023850441, + "learning_rate": 5.409065979828243e-05, + "loss": 0.1267, + "step": 11660 + }, + { + "epoch": 0.621438841258853, + "grad_norm": 0.0840907022356987, + "learning_rate": 5.403211285658158e-05, + "loss": 0.1265, + "step": 11670 + }, + { + "epoch": 0.6219713509771553, + "grad_norm": 0.042071383446455, + "learning_rate": 5.397359992693835e-05, + "loss": 0.1262, + "step": 11680 + }, + { + "epoch": 0.6225038606954577, + "grad_norm": 0.0721912607550621, + "learning_rate": 5.3915121190804755e-05, + "loss": 0.1268, + "step": 11690 + }, + { + "epoch": 0.62303637041376, + "grad_norm": 0.058796901255846024, + "learning_rate": 5.385667682952675e-05, + "loss": 0.1263, + "step": 11700 + }, + { + "epoch": 0.6235688801320625, + "grad_norm": 0.04958143085241318, + "learning_rate": 5.3798267024343706e-05, + "loss": 0.1266, + "step": 11710 + }, + { + "epoch": 0.6241013898503648, + "grad_norm": 0.07576627284288406, + "learning_rate": 5.373989195638785e-05, + "loss": 0.1268, + "step": 11720 + }, + { + "epoch": 0.6246338995686671, + "grad_norm": 0.050743598490953445, + "learning_rate": 5.368155180668366e-05, + "loss": 0.1262, + "step": 11730 + }, + { + "epoch": 0.6251664092869695, + "grad_norm": 0.046020250767469406, + "learning_rate": 5.3623246756147346e-05, + "loss": 0.1262, + "step": 11740 + }, + { + "epoch": 0.6256989190052719, + "grad_norm": 0.04735419899225235, + "learning_rate": 5.356497698558628e-05, + "loss": 0.1265, + "step": 11750 + }, + { + "epoch": 0.6262314287235742, + "grad_norm": 0.058625295758247375, + "learning_rate": 5.3506742675698384e-05, + "loss": 0.1267, + "step": 11760 + }, + { + "epoch": 0.6267639384418766, + "grad_norm": 0.05440155416727066, + "learning_rate": 5.34485440070717e-05, + "loss": 0.1265, + "step": 11770 + }, + { + "epoch": 0.6272964481601789, + "grad_norm": 0.06765516102313995, + "learning_rate": 5.3390381160183645e-05, + "loss": 0.1273, + "step": 11780 + }, + { + "epoch": 0.6278289578784813, + "grad_norm": 0.042375244200229645, + "learning_rate": 5.333225431540062e-05, + "loss": 0.1269, + "step": 11790 + }, + { + "epoch": 0.6283614675967837, + "grad_norm": 0.04143916070461273, + "learning_rate": 5.327416365297737e-05, + "loss": 0.1267, + "step": 11800 + }, + { + "epoch": 0.628893977315086, + "grad_norm": 0.03681569918990135, + "learning_rate": 5.321610935305643e-05, + "loss": 0.1265, + "step": 11810 + }, + { + "epoch": 0.6294264870333883, + "grad_norm": 0.04781223088502884, + "learning_rate": 5.315809159566753e-05, + "loss": 0.1262, + "step": 11820 + }, + { + "epoch": 0.6299589967516908, + "grad_norm": 0.05528895556926727, + "learning_rate": 5.3100110560727155e-05, + "loss": 0.1261, + "step": 11830 + }, + { + "epoch": 0.6304915064699931, + "grad_norm": 0.050820931792259216, + "learning_rate": 5.3042166428037867e-05, + "loss": 0.1265, + "step": 11840 + }, + { + "epoch": 0.6310240161882954, + "grad_norm": 0.049191396683454514, + "learning_rate": 5.2984259377287795e-05, + "loss": 0.1266, + "step": 11850 + }, + { + "epoch": 0.6315565259065978, + "grad_norm": 0.04525256156921387, + "learning_rate": 5.2926389588050095e-05, + "loss": 0.127, + "step": 11860 + }, + { + "epoch": 0.6320890356249002, + "grad_norm": 0.03086530603468418, + "learning_rate": 5.2868557239782335e-05, + "loss": 0.1268, + "step": 11870 + }, + { + "epoch": 0.6326215453432025, + "grad_norm": 0.06164710223674774, + "learning_rate": 5.2810762511826017e-05, + "loss": 0.1265, + "step": 11880 + }, + { + "epoch": 0.6331540550615049, + "grad_norm": 0.10141383111476898, + "learning_rate": 5.275300558340596e-05, + "loss": 0.1264, + "step": 11890 + }, + { + "epoch": 0.6336865647798072, + "grad_norm": 0.043847665190696716, + "learning_rate": 5.269528663362976e-05, + "loss": 0.1266, + "step": 11900 + }, + { + "epoch": 0.6342190744981095, + "grad_norm": 0.04844609647989273, + "learning_rate": 5.2637605841487246e-05, + "loss": 0.1264, + "step": 11910 + }, + { + "epoch": 0.634751584216412, + "grad_norm": 0.04017659276723862, + "learning_rate": 5.257996338584994e-05, + "loss": 0.1266, + "step": 11920 + }, + { + "epoch": 0.6352840939347143, + "grad_norm": 0.04086530581116676, + "learning_rate": 5.2522359445470434e-05, + "loss": 0.1264, + "step": 11930 + }, + { + "epoch": 0.6358166036530166, + "grad_norm": 0.0746808871626854, + "learning_rate": 5.246479419898191e-05, + "loss": 0.1265, + "step": 11940 + }, + { + "epoch": 0.636349113371319, + "grad_norm": 0.047690387815237045, + "learning_rate": 5.2407267824897556e-05, + "loss": 0.1261, + "step": 11950 + }, + { + "epoch": 0.6368816230896214, + "grad_norm": 0.07566772401332855, + "learning_rate": 5.234978050161002e-05, + "loss": 0.1263, + "step": 11960 + }, + { + "epoch": 0.6374141328079237, + "grad_norm": 0.07059159129858017, + "learning_rate": 5.229233240739082e-05, + "loss": 0.1263, + "step": 11970 + }, + { + "epoch": 0.6379466425262261, + "grad_norm": 0.06677578389644623, + "learning_rate": 5.223492372038989e-05, + "loss": 0.1273, + "step": 11980 + }, + { + "epoch": 0.6384791522445284, + "grad_norm": 0.09771794825792313, + "learning_rate": 5.217755461863487e-05, + "loss": 0.1268, + "step": 11990 + }, + { + "epoch": 0.6390116619628308, + "grad_norm": 0.0393114909529686, + "learning_rate": 5.212022528003072e-05, + "loss": 0.1264, + "step": 12000 + }, + { + "epoch": 0.6395441716811332, + "grad_norm": 0.05768098682165146, + "learning_rate": 5.2062935882359054e-05, + "loss": 0.1267, + "step": 12010 + }, + { + "epoch": 0.6400766813994355, + "grad_norm": 0.04626571014523506, + "learning_rate": 5.2005686603277625e-05, + "loss": 0.1272, + "step": 12020 + }, + { + "epoch": 0.640609191117738, + "grad_norm": 0.06112409383058548, + "learning_rate": 5.1948477620319805e-05, + "loss": 0.1265, + "step": 12030 + }, + { + "epoch": 0.6411417008360403, + "grad_norm": 0.04936950281262398, + "learning_rate": 5.1891309110893974e-05, + "loss": 0.1268, + "step": 12040 + }, + { + "epoch": 0.6416742105543426, + "grad_norm": 0.053032536059617996, + "learning_rate": 5.183418125228301e-05, + "loss": 0.1269, + "step": 12050 + }, + { + "epoch": 0.642206720272645, + "grad_norm": 0.053800616413354874, + "learning_rate": 5.177709422164374e-05, + "loss": 0.1269, + "step": 12060 + }, + { + "epoch": 0.6427392299909473, + "grad_norm": 0.03961695730686188, + "learning_rate": 5.1720048196006376e-05, + "loss": 0.1265, + "step": 12070 + }, + { + "epoch": 0.6432717397092497, + "grad_norm": 0.040204983204603195, + "learning_rate": 5.166304335227396e-05, + "loss": 0.1262, + "step": 12080 + }, + { + "epoch": 0.6438042494275521, + "grad_norm": 0.11371159553527832, + "learning_rate": 5.160607986722186e-05, + "loss": 0.1263, + "step": 12090 + }, + { + "epoch": 0.6443367591458544, + "grad_norm": 0.037854380905628204, + "learning_rate": 5.154915791749715e-05, + "loss": 0.1265, + "step": 12100 + }, + { + "epoch": 0.6448692688641567, + "grad_norm": 0.07416236400604248, + "learning_rate": 5.1492277679618104e-05, + "loss": 0.1261, + "step": 12110 + }, + { + "epoch": 0.6454017785824592, + "grad_norm": 0.0411413200199604, + "learning_rate": 5.1435439329973664e-05, + "loss": 0.1262, + "step": 12120 + }, + { + "epoch": 0.6459342883007615, + "grad_norm": 0.06878205388784409, + "learning_rate": 5.1378643044822884e-05, + "loss": 0.1268, + "step": 12130 + }, + { + "epoch": 0.6464667980190638, + "grad_norm": 0.05869507044553757, + "learning_rate": 5.132188900029433e-05, + "loss": 0.1258, + "step": 12140 + }, + { + "epoch": 0.6469993077373662, + "grad_norm": 0.049447815865278244, + "learning_rate": 5.126517737238563e-05, + "loss": 0.1266, + "step": 12150 + }, + { + "epoch": 0.6475318174556686, + "grad_norm": 0.09829236567020416, + "learning_rate": 5.120850833696282e-05, + "loss": 0.1262, + "step": 12160 + }, + { + "epoch": 0.6480643271739709, + "grad_norm": 0.03944886848330498, + "learning_rate": 5.115188206975992e-05, + "loss": 0.127, + "step": 12170 + }, + { + "epoch": 0.6485968368922733, + "grad_norm": 0.038028497248888016, + "learning_rate": 5.109529874637824e-05, + "loss": 0.1266, + "step": 12180 + }, + { + "epoch": 0.6491293466105756, + "grad_norm": 0.05182207375764847, + "learning_rate": 5.103875854228601e-05, + "loss": 0.1259, + "step": 12190 + }, + { + "epoch": 0.649661856328878, + "grad_norm": 0.04164310172200203, + "learning_rate": 5.098226163281767e-05, + "loss": 0.1256, + "step": 12200 + }, + { + "epoch": 0.6501943660471804, + "grad_norm": 0.04075628146529198, + "learning_rate": 5.0925808193173454e-05, + "loss": 0.1267, + "step": 12210 + }, + { + "epoch": 0.6507268757654827, + "grad_norm": 0.05309925228357315, + "learning_rate": 5.0869398398418744e-05, + "loss": 0.1257, + "step": 12220 + }, + { + "epoch": 0.651259385483785, + "grad_norm": 0.05489126220345497, + "learning_rate": 5.081303242348363e-05, + "loss": 0.1262, + "step": 12230 + }, + { + "epoch": 0.6517918952020875, + "grad_norm": 0.06438528001308441, + "learning_rate": 5.075671044316228e-05, + "loss": 0.1268, + "step": 12240 + }, + { + "epoch": 0.6523244049203898, + "grad_norm": 0.045175325125455856, + "learning_rate": 5.070043263211242e-05, + "loss": 0.1262, + "step": 12250 + }, + { + "epoch": 0.6528569146386921, + "grad_norm": 0.05378909409046173, + "learning_rate": 5.064419916485485e-05, + "loss": 0.1264, + "step": 12260 + }, + { + "epoch": 0.6533894243569945, + "grad_norm": 0.05675683543086052, + "learning_rate": 5.058801021577282e-05, + "loss": 0.1266, + "step": 12270 + }, + { + "epoch": 0.6539219340752969, + "grad_norm": 0.0350642092525959, + "learning_rate": 5.053186595911152e-05, + "loss": 0.1269, + "step": 12280 + }, + { + "epoch": 0.6544544437935992, + "grad_norm": 0.05776926130056381, + "learning_rate": 5.0475766568977586e-05, + "loss": 0.1264, + "step": 12290 + }, + { + "epoch": 0.6549869535119016, + "grad_norm": 0.053256552666425705, + "learning_rate": 5.041971221933851e-05, + "loss": 0.127, + "step": 12300 + }, + { + "epoch": 0.6555194632302039, + "grad_norm": 0.03956317901611328, + "learning_rate": 5.0363703084022065e-05, + "loss": 0.1265, + "step": 12310 + }, + { + "epoch": 0.6560519729485063, + "grad_norm": 0.033014725893735886, + "learning_rate": 5.0307739336715864e-05, + "loss": 0.1261, + "step": 12320 + }, + { + "epoch": 0.6565844826668087, + "grad_norm": 0.040162548422813416, + "learning_rate": 5.0251821150966746e-05, + "loss": 0.1266, + "step": 12330 + }, + { + "epoch": 0.657116992385111, + "grad_norm": 0.08668463677167892, + "learning_rate": 5.0195948700180294e-05, + "loss": 0.1256, + "step": 12340 + }, + { + "epoch": 0.6576495021034134, + "grad_norm": 0.05517444759607315, + "learning_rate": 5.0140122157620185e-05, + "loss": 0.1262, + "step": 12350 + }, + { + "epoch": 0.6581820118217158, + "grad_norm": 0.05813097953796387, + "learning_rate": 5.008434169640781e-05, + "loss": 0.1258, + "step": 12360 + }, + { + "epoch": 0.6587145215400181, + "grad_norm": 0.036499012261629105, + "learning_rate": 5.002860748952165e-05, + "loss": 0.1261, + "step": 12370 + }, + { + "epoch": 0.6592470312583205, + "grad_norm": 0.03472182899713516, + "learning_rate": 4.997291970979672e-05, + "loss": 0.1268, + "step": 12380 + }, + { + "epoch": 0.6597795409766228, + "grad_norm": 0.06570050120353699, + "learning_rate": 4.9917278529924036e-05, + "loss": 0.1265, + "step": 12390 + }, + { + "epoch": 0.6603120506949252, + "grad_norm": 0.0551844947040081, + "learning_rate": 4.9861684122450166e-05, + "loss": 0.1262, + "step": 12400 + }, + { + "epoch": 0.6608445604132276, + "grad_norm": 0.05402039363980293, + "learning_rate": 4.9806136659776625e-05, + "loss": 0.1264, + "step": 12410 + }, + { + "epoch": 0.6613770701315299, + "grad_norm": 0.05242108181118965, + "learning_rate": 4.975063631415934e-05, + "loss": 0.1262, + "step": 12420 + }, + { + "epoch": 0.6619095798498322, + "grad_norm": 0.04954907298088074, + "learning_rate": 4.96951832577081e-05, + "loss": 0.1259, + "step": 12430 + }, + { + "epoch": 0.6624420895681347, + "grad_norm": 0.031282830983400345, + "learning_rate": 4.963977766238604e-05, + "loss": 0.1259, + "step": 12440 + }, + { + "epoch": 0.662974599286437, + "grad_norm": 0.037177179008722305, + "learning_rate": 4.95844197000092e-05, + "loss": 0.1263, + "step": 12450 + }, + { + "epoch": 0.6635071090047393, + "grad_norm": 0.059044573456048965, + "learning_rate": 4.95291095422458e-05, + "loss": 0.1251, + "step": 12460 + }, + { + "epoch": 0.6640396187230417, + "grad_norm": 0.04457508400082588, + "learning_rate": 4.94738473606159e-05, + "loss": 0.127, + "step": 12470 + }, + { + "epoch": 0.6645721284413441, + "grad_norm": 0.04706263169646263, + "learning_rate": 4.941863332649072e-05, + "loss": 0.1256, + "step": 12480 + }, + { + "epoch": 0.6651046381596464, + "grad_norm": 0.04623222351074219, + "learning_rate": 4.936346761109223e-05, + "loss": 0.1256, + "step": 12490 + }, + { + "epoch": 0.6656371478779488, + "grad_norm": 0.04978486895561218, + "learning_rate": 4.9308350385492494e-05, + "loss": 0.126, + "step": 12500 + }, + { + "epoch": 0.6661696575962511, + "grad_norm": 0.055559538304805756, + "learning_rate": 4.925328182061326e-05, + "loss": 0.126, + "step": 12510 + }, + { + "epoch": 0.6667021673145535, + "grad_norm": 0.04878619685769081, + "learning_rate": 4.9198262087225375e-05, + "loss": 0.1262, + "step": 12520 + }, + { + "epoch": 0.6672346770328559, + "grad_norm": 0.045161280781030655, + "learning_rate": 4.9143291355948225e-05, + "loss": 0.1259, + "step": 12530 + }, + { + "epoch": 0.6677671867511582, + "grad_norm": 0.08460939675569534, + "learning_rate": 4.9088369797249234e-05, + "loss": 0.1261, + "step": 12540 + }, + { + "epoch": 0.6682996964694605, + "grad_norm": 0.05501072108745575, + "learning_rate": 4.903349758144339e-05, + "loss": 0.1265, + "step": 12550 + }, + { + "epoch": 0.668832206187763, + "grad_norm": 0.04381651058793068, + "learning_rate": 4.897867487869262e-05, + "loss": 0.1259, + "step": 12560 + }, + { + "epoch": 0.6693647159060653, + "grad_norm": 0.051561057567596436, + "learning_rate": 4.8923901859005335e-05, + "loss": 0.1261, + "step": 12570 + }, + { + "epoch": 0.6698972256243676, + "grad_norm": 0.03529192507266998, + "learning_rate": 4.886917869223585e-05, + "loss": 0.1261, + "step": 12580 + }, + { + "epoch": 0.67042973534267, + "grad_norm": 0.047314297407865524, + "learning_rate": 4.881450554808389e-05, + "loss": 0.1266, + "step": 12590 + }, + { + "epoch": 0.6709622450609724, + "grad_norm": 0.06020704656839371, + "learning_rate": 4.875988259609407e-05, + "loss": 0.1258, + "step": 12600 + }, + { + "epoch": 0.6714947547792747, + "grad_norm": 0.040109023451805115, + "learning_rate": 4.870531000565537e-05, + "loss": 0.1261, + "step": 12610 + }, + { + "epoch": 0.6720272644975771, + "grad_norm": 0.03262796998023987, + "learning_rate": 4.865078794600053e-05, + "loss": 0.1264, + "step": 12620 + }, + { + "epoch": 0.6725597742158794, + "grad_norm": 0.039773985743522644, + "learning_rate": 4.859631658620569e-05, + "loss": 0.1265, + "step": 12630 + }, + { + "epoch": 0.6730922839341817, + "grad_norm": 0.046503521502017975, + "learning_rate": 4.854189609518969e-05, + "loss": 0.1259, + "step": 12640 + }, + { + "epoch": 0.6736247936524842, + "grad_norm": 0.04692930728197098, + "learning_rate": 4.848752664171362e-05, + "loss": 0.1258, + "step": 12650 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.04684825614094734, + "learning_rate": 4.843320839438035e-05, + "loss": 0.1266, + "step": 12660 + }, + { + "epoch": 0.6746898130890889, + "grad_norm": 0.051075540482997894, + "learning_rate": 4.837894152163395e-05, + "loss": 0.1257, + "step": 12670 + }, + { + "epoch": 0.6752223228073913, + "grad_norm": 0.08315866440534592, + "learning_rate": 4.832472619175913e-05, + "loss": 0.1256, + "step": 12680 + }, + { + "epoch": 0.6757548325256936, + "grad_norm": 0.05411198362708092, + "learning_rate": 4.827056257288079e-05, + "loss": 0.1265, + "step": 12690 + }, + { + "epoch": 0.676287342243996, + "grad_norm": 0.045681897550821304, + "learning_rate": 4.821645083296347e-05, + "loss": 0.1264, + "step": 12700 + }, + { + "epoch": 0.6768198519622983, + "grad_norm": 0.03494982793927193, + "learning_rate": 4.8162391139810845e-05, + "loss": 0.1265, + "step": 12710 + }, + { + "epoch": 0.6773523616806006, + "grad_norm": 0.04865271970629692, + "learning_rate": 4.8108383661065185e-05, + "loss": 0.126, + "step": 12720 + }, + { + "epoch": 0.6778848713989031, + "grad_norm": 0.055251702666282654, + "learning_rate": 4.805442856420682e-05, + "loss": 0.1264, + "step": 12730 + }, + { + "epoch": 0.6784173811172054, + "grad_norm": 0.051663871854543686, + "learning_rate": 4.800052601655362e-05, + "loss": 0.1255, + "step": 12740 + }, + { + "epoch": 0.6789498908355077, + "grad_norm": 0.05289029702544212, + "learning_rate": 4.794667618526057e-05, + "loss": 0.126, + "step": 12750 + }, + { + "epoch": 0.6794824005538102, + "grad_norm": 0.05994449183344841, + "learning_rate": 4.7892879237319136e-05, + "loss": 0.1261, + "step": 12760 + }, + { + "epoch": 0.6800149102721125, + "grad_norm": 0.03531305119395256, + "learning_rate": 4.783913533955675e-05, + "loss": 0.1265, + "step": 12770 + }, + { + "epoch": 0.6805474199904148, + "grad_norm": 0.04480816796422005, + "learning_rate": 4.7785444658636427e-05, + "loss": 0.126, + "step": 12780 + }, + { + "epoch": 0.6810799297087172, + "grad_norm": 0.045987531542778015, + "learning_rate": 4.773180736105607e-05, + "loss": 0.1268, + "step": 12790 + }, + { + "epoch": 0.6816124394270195, + "grad_norm": 0.06958389282226562, + "learning_rate": 4.767822361314805e-05, + "loss": 0.1259, + "step": 12800 + }, + { + "epoch": 0.6821449491453219, + "grad_norm": 0.03225488215684891, + "learning_rate": 4.762469358107873e-05, + "loss": 0.1257, + "step": 12810 + }, + { + "epoch": 0.6826774588636243, + "grad_norm": 0.05594348534941673, + "learning_rate": 4.757121743084784e-05, + "loss": 0.1261, + "step": 12820 + }, + { + "epoch": 0.6832099685819266, + "grad_norm": 0.06453056633472443, + "learning_rate": 4.751779532828806e-05, + "loss": 0.1263, + "step": 12830 + }, + { + "epoch": 0.6837424783002289, + "grad_norm": 0.05000981315970421, + "learning_rate": 4.746442743906442e-05, + "loss": 0.1257, + "step": 12840 + }, + { + "epoch": 0.6842749880185314, + "grad_norm": 0.06121028959751129, + "learning_rate": 4.741111392867386e-05, + "loss": 0.1258, + "step": 12850 + }, + { + "epoch": 0.6848074977368337, + "grad_norm": 0.04783171787858009, + "learning_rate": 4.7357854962444686e-05, + "loss": 0.1259, + "step": 12860 + }, + { + "epoch": 0.685340007455136, + "grad_norm": 0.0399705208837986, + "learning_rate": 4.7304650705536084e-05, + "loss": 0.1261, + "step": 12870 + }, + { + "epoch": 0.6858725171734384, + "grad_norm": 0.03981216251850128, + "learning_rate": 4.7251501322937534e-05, + "loss": 0.126, + "step": 12880 + }, + { + "epoch": 0.6864050268917408, + "grad_norm": 0.034330256283283234, + "learning_rate": 4.7198406979468366e-05, + "loss": 0.126, + "step": 12890 + }, + { + "epoch": 0.6869375366100431, + "grad_norm": 0.06301886588335037, + "learning_rate": 4.7145367839777237e-05, + "loss": 0.1255, + "step": 12900 + }, + { + "epoch": 0.6874700463283455, + "grad_norm": 0.07015033811330795, + "learning_rate": 4.709238406834164e-05, + "loss": 0.1257, + "step": 12910 + }, + { + "epoch": 0.6880025560466478, + "grad_norm": 0.04845889285206795, + "learning_rate": 4.703945582946729e-05, + "loss": 0.1266, + "step": 12920 + }, + { + "epoch": 0.6885350657649502, + "grad_norm": 0.043622374534606934, + "learning_rate": 4.69865832872878e-05, + "loss": 0.1255, + "step": 12930 + }, + { + "epoch": 0.6890675754832526, + "grad_norm": 0.046708934009075165, + "learning_rate": 4.6933766605763955e-05, + "loss": 0.1262, + "step": 12940 + }, + { + "epoch": 0.6896000852015549, + "grad_norm": 0.09181608259677887, + "learning_rate": 4.688100594868341e-05, + "loss": 0.1262, + "step": 12950 + }, + { + "epoch": 0.6901325949198572, + "grad_norm": 0.0670885518193245, + "learning_rate": 4.682830147965999e-05, + "loss": 0.1262, + "step": 12960 + }, + { + "epoch": 0.6906651046381597, + "grad_norm": 0.05586402490735054, + "learning_rate": 4.6775653362133356e-05, + "loss": 0.126, + "step": 12970 + }, + { + "epoch": 0.691197614356462, + "grad_norm": 0.0520888976752758, + "learning_rate": 4.6723061759368405e-05, + "loss": 0.1256, + "step": 12980 + }, + { + "epoch": 0.6917301240747644, + "grad_norm": 0.04255915433168411, + "learning_rate": 4.667052683445474e-05, + "loss": 0.1256, + "step": 12990 + }, + { + "epoch": 0.6922626337930667, + "grad_norm": 0.04210617393255234, + "learning_rate": 4.661804875030623e-05, + "loss": 0.1259, + "step": 13000 + }, + { + "epoch": 0.6927951435113691, + "grad_norm": 0.049725860357284546, + "learning_rate": 4.656562766966047e-05, + "loss": 0.1259, + "step": 13010 + }, + { + "epoch": 0.6933276532296715, + "grad_norm": 0.04117880016565323, + "learning_rate": 4.6513263755078305e-05, + "loss": 0.1252, + "step": 13020 + }, + { + "epoch": 0.6938601629479738, + "grad_norm": 0.06293977797031403, + "learning_rate": 4.6460957168943286e-05, + "loss": 0.1257, + "step": 13030 + }, + { + "epoch": 0.6943926726662761, + "grad_norm": 0.08014130592346191, + "learning_rate": 4.640870807346116e-05, + "loss": 0.1259, + "step": 13040 + }, + { + "epoch": 0.6949251823845786, + "grad_norm": 0.05291053652763367, + "learning_rate": 4.6356516630659444e-05, + "loss": 0.1261, + "step": 13050 + }, + { + "epoch": 0.6954576921028809, + "grad_norm": 0.061003703624010086, + "learning_rate": 4.630438300238684e-05, + "loss": 0.1256, + "step": 13060 + }, + { + "epoch": 0.6959902018211832, + "grad_norm": 0.06733989715576172, + "learning_rate": 4.625230735031276e-05, + "loss": 0.1259, + "step": 13070 + }, + { + "epoch": 0.6965227115394856, + "grad_norm": 0.07360579818487167, + "learning_rate": 4.620028983592687e-05, + "loss": 0.1261, + "step": 13080 + }, + { + "epoch": 0.697055221257788, + "grad_norm": 0.0536913201212883, + "learning_rate": 4.6148330620538474e-05, + "loss": 0.1263, + "step": 13090 + }, + { + "epoch": 0.6975877309760903, + "grad_norm": 0.04993463680148125, + "learning_rate": 4.609642986527615e-05, + "loss": 0.1262, + "step": 13100 + }, + { + "epoch": 0.6981202406943927, + "grad_norm": 0.04666028916835785, + "learning_rate": 4.6044587731087155e-05, + "loss": 0.1258, + "step": 13110 + }, + { + "epoch": 0.698652750412695, + "grad_norm": 0.07630308717489243, + "learning_rate": 4.599280437873699e-05, + "loss": 0.1259, + "step": 13120 + }, + { + "epoch": 0.6991852601309974, + "grad_norm": 0.07336148619651794, + "learning_rate": 4.594107996880884e-05, + "loss": 0.1261, + "step": 13130 + }, + { + "epoch": 0.6997177698492998, + "grad_norm": 0.0583728589117527, + "learning_rate": 4.588941466170312e-05, + "loss": 0.1263, + "step": 13140 + }, + { + "epoch": 0.7002502795676021, + "grad_norm": 0.04657367989420891, + "learning_rate": 4.5837808617636935e-05, + "loss": 0.1254, + "step": 13150 + }, + { + "epoch": 0.7007827892859044, + "grad_norm": 0.03507756069302559, + "learning_rate": 4.5786261996643664e-05, + "loss": 0.1255, + "step": 13160 + }, + { + "epoch": 0.7013152990042069, + "grad_norm": 0.10152143239974976, + "learning_rate": 4.57347749585724e-05, + "loss": 0.1261, + "step": 13170 + }, + { + "epoch": 0.7018478087225092, + "grad_norm": 0.04690668731927872, + "learning_rate": 4.568334766308741e-05, + "loss": 0.126, + "step": 13180 + }, + { + "epoch": 0.7023803184408115, + "grad_norm": 0.050610288977622986, + "learning_rate": 4.563198026966776e-05, + "loss": 0.1259, + "step": 13190 + }, + { + "epoch": 0.7029128281591139, + "grad_norm": 0.07565128803253174, + "learning_rate": 4.558067293760672e-05, + "loss": 0.1253, + "step": 13200 + }, + { + "epoch": 0.7034453378774163, + "grad_norm": 0.03581630438566208, + "learning_rate": 4.552942582601134e-05, + "loss": 0.125, + "step": 13210 + }, + { + "epoch": 0.7039778475957186, + "grad_norm": 0.06287883222103119, + "learning_rate": 4.547823909380188e-05, + "loss": 0.1264, + "step": 13220 + }, + { + "epoch": 0.704510357314021, + "grad_norm": 0.03954106569290161, + "learning_rate": 4.542711289971139e-05, + "loss": 0.1258, + "step": 13230 + }, + { + "epoch": 0.7050428670323233, + "grad_norm": 0.07557252049446106, + "learning_rate": 4.537604740228517e-05, + "loss": 0.125, + "step": 13240 + }, + { + "epoch": 0.7055753767506256, + "grad_norm": 0.07294019311666489, + "learning_rate": 4.532504275988033e-05, + "loss": 0.1259, + "step": 13250 + }, + { + "epoch": 0.7061078864689281, + "grad_norm": 0.06293601542711258, + "learning_rate": 4.527409913066522e-05, + "loss": 0.1259, + "step": 13260 + }, + { + "epoch": 0.7066403961872304, + "grad_norm": 0.043173741549253464, + "learning_rate": 4.5223216672619e-05, + "loss": 0.1255, + "step": 13270 + }, + { + "epoch": 0.7071729059055327, + "grad_norm": 0.05080621689558029, + "learning_rate": 4.517239554353116e-05, + "loss": 0.1256, + "step": 13280 + }, + { + "epoch": 0.7077054156238352, + "grad_norm": 0.04947923868894577, + "learning_rate": 4.512163590100097e-05, + "loss": 0.1257, + "step": 13290 + }, + { + "epoch": 0.7082379253421375, + "grad_norm": 0.054971180856227875, + "learning_rate": 4.507093790243704e-05, + "loss": 0.1259, + "step": 13300 + }, + { + "epoch": 0.7087704350604399, + "grad_norm": 0.0491788424551487, + "learning_rate": 4.5020301705056825e-05, + "loss": 0.1251, + "step": 13310 + }, + { + "epoch": 0.7093029447787422, + "grad_norm": 0.06562227010726929, + "learning_rate": 4.496972746588614e-05, + "loss": 0.1256, + "step": 13320 + }, + { + "epoch": 0.7098354544970445, + "grad_norm": 0.05146334320306778, + "learning_rate": 4.4919215341758614e-05, + "loss": 0.1259, + "step": 13330 + }, + { + "epoch": 0.710367964215347, + "grad_norm": 0.04213017597794533, + "learning_rate": 4.486876548931533e-05, + "loss": 0.1258, + "step": 13340 + }, + { + "epoch": 0.7109004739336493, + "grad_norm": 0.07794417440891266, + "learning_rate": 4.481837806500419e-05, + "loss": 0.1255, + "step": 13350 + }, + { + "epoch": 0.7114329836519516, + "grad_norm": 0.06125866621732712, + "learning_rate": 4.4768053225079565e-05, + "loss": 0.1255, + "step": 13360 + }, + { + "epoch": 0.711965493370254, + "grad_norm": 0.056374140083789825, + "learning_rate": 4.471779112560168e-05, + "loss": 0.1262, + "step": 13370 + }, + { + "epoch": 0.7124980030885564, + "grad_norm": 0.05176498368382454, + "learning_rate": 4.466759192243627e-05, + "loss": 0.1264, + "step": 13380 + }, + { + "epoch": 0.7130305128068587, + "grad_norm": 0.1045917272567749, + "learning_rate": 4.461745577125399e-05, + "loss": 0.1253, + "step": 13390 + }, + { + "epoch": 0.7135630225251611, + "grad_norm": 0.03207787126302719, + "learning_rate": 4.456738282752996e-05, + "loss": 0.125, + "step": 13400 + }, + { + "epoch": 0.7140955322434634, + "grad_norm": 0.06704405695199966, + "learning_rate": 4.451737324654328e-05, + "loss": 0.1253, + "step": 13410 + }, + { + "epoch": 0.7146280419617658, + "grad_norm": 0.0693150982260704, + "learning_rate": 4.4467427183376596e-05, + "loss": 0.1257, + "step": 13420 + }, + { + "epoch": 0.7151605516800682, + "grad_norm": 0.04728610813617706, + "learning_rate": 4.441754479291557e-05, + "loss": 0.125, + "step": 13430 + }, + { + "epoch": 0.7156930613983705, + "grad_norm": 0.05448344349861145, + "learning_rate": 4.43677262298484e-05, + "loss": 0.1261, + "step": 13440 + }, + { + "epoch": 0.7162255711166728, + "grad_norm": 0.036701589822769165, + "learning_rate": 4.431797164866533e-05, + "loss": 0.1259, + "step": 13450 + }, + { + "epoch": 0.7167580808349753, + "grad_norm": 0.043069060891866684, + "learning_rate": 4.426828120365824e-05, + "loss": 0.1254, + "step": 13460 + }, + { + "epoch": 0.7172905905532776, + "grad_norm": 0.07068092375993729, + "learning_rate": 4.421865504892011e-05, + "loss": 0.1251, + "step": 13470 + }, + { + "epoch": 0.7178231002715799, + "grad_norm": 0.045086752623319626, + "learning_rate": 4.416909333834451e-05, + "loss": 0.1259, + "step": 13480 + }, + { + "epoch": 0.7183556099898823, + "grad_norm": 0.03668762743473053, + "learning_rate": 4.4119596225625216e-05, + "loss": 0.1254, + "step": 13490 + }, + { + "epoch": 0.7188881197081847, + "grad_norm": 0.05897703021764755, + "learning_rate": 4.4070163864255644e-05, + "loss": 0.1256, + "step": 13500 + }, + { + "epoch": 0.719420629426487, + "grad_norm": 0.047495052218437195, + "learning_rate": 4.4020796407528455e-05, + "loss": 0.126, + "step": 13510 + }, + { + "epoch": 0.7199531391447894, + "grad_norm": 0.06927572190761566, + "learning_rate": 4.397149400853498e-05, + "loss": 0.1256, + "step": 13520 + }, + { + "epoch": 0.7204856488630917, + "grad_norm": 0.03571341931819916, + "learning_rate": 4.3922256820164856e-05, + "loss": 0.1257, + "step": 13530 + }, + { + "epoch": 0.7210181585813941, + "grad_norm": 0.04303283616900444, + "learning_rate": 4.3873084995105475e-05, + "loss": 0.1255, + "step": 13540 + }, + { + "epoch": 0.7215506682996965, + "grad_norm": 0.05867360904812813, + "learning_rate": 4.382397868584151e-05, + "loss": 0.1257, + "step": 13550 + }, + { + "epoch": 0.7220831780179988, + "grad_norm": 0.05930043384432793, + "learning_rate": 4.377493804465452e-05, + "loss": 0.1254, + "step": 13560 + }, + { + "epoch": 0.7226156877363011, + "grad_norm": 0.04837455227971077, + "learning_rate": 4.372596322362237e-05, + "loss": 0.1256, + "step": 13570 + }, + { + "epoch": 0.7231481974546036, + "grad_norm": 0.04174095019698143, + "learning_rate": 4.3677054374618844e-05, + "loss": 0.1255, + "step": 13580 + }, + { + "epoch": 0.7236807071729059, + "grad_norm": 0.0439835861325264, + "learning_rate": 4.3628211649313164e-05, + "loss": 0.1256, + "step": 13590 + }, + { + "epoch": 0.7242132168912082, + "grad_norm": 0.06301723420619965, + "learning_rate": 4.357943519916942e-05, + "loss": 0.1263, + "step": 13600 + }, + { + "epoch": 0.7247457266095106, + "grad_norm": 0.06303390860557556, + "learning_rate": 4.353072517544624e-05, + "loss": 0.1254, + "step": 13610 + }, + { + "epoch": 0.725278236327813, + "grad_norm": 0.06305810809135437, + "learning_rate": 4.348208172919626e-05, + "loss": 0.1252, + "step": 13620 + }, + { + "epoch": 0.7258107460461154, + "grad_norm": 0.08300595730543137, + "learning_rate": 4.343350501126566e-05, + "loss": 0.1258, + "step": 13630 + }, + { + "epoch": 0.7263432557644177, + "grad_norm": 0.054570749402046204, + "learning_rate": 4.338499517229365e-05, + "loss": 0.1255, + "step": 13640 + }, + { + "epoch": 0.72687576548272, + "grad_norm": 0.06060722470283508, + "learning_rate": 4.333655236271207e-05, + "loss": 0.1259, + "step": 13650 + }, + { + "epoch": 0.7274082752010225, + "grad_norm": 0.04300956055521965, + "learning_rate": 4.328817673274491e-05, + "loss": 0.1263, + "step": 13660 + }, + { + "epoch": 0.7279407849193248, + "grad_norm": 0.07667776197195053, + "learning_rate": 4.3239868432407804e-05, + "loss": 0.1256, + "step": 13670 + }, + { + "epoch": 0.7284732946376271, + "grad_norm": 0.04549676924943924, + "learning_rate": 4.3191627611507625e-05, + "loss": 0.126, + "step": 13680 + }, + { + "epoch": 0.7290058043559295, + "grad_norm": 0.03738940879702568, + "learning_rate": 4.314345441964197e-05, + "loss": 0.1258, + "step": 13690 + }, + { + "epoch": 0.7295383140742319, + "grad_norm": 0.04415878280997276, + "learning_rate": 4.3095349006198704e-05, + "loss": 0.1253, + "step": 13700 + }, + { + "epoch": 0.7300708237925342, + "grad_norm": 0.04898412898182869, + "learning_rate": 4.304731152035552e-05, + "loss": 0.1254, + "step": 13710 + }, + { + "epoch": 0.7306033335108366, + "grad_norm": 0.06354010105133057, + "learning_rate": 4.299934211107947e-05, + "loss": 0.1259, + "step": 13720 + }, + { + "epoch": 0.7311358432291389, + "grad_norm": 0.055742815136909485, + "learning_rate": 4.295144092712648e-05, + "loss": 0.1253, + "step": 13730 + }, + { + "epoch": 0.7316683529474413, + "grad_norm": 0.06040511652827263, + "learning_rate": 4.290360811704094e-05, + "loss": 0.1254, + "step": 13740 + }, + { + "epoch": 0.7322008626657437, + "grad_norm": 0.04668520390987396, + "learning_rate": 4.2855843829155166e-05, + "loss": 0.1251, + "step": 13750 + }, + { + "epoch": 0.732733372384046, + "grad_norm": 0.045908767729997635, + "learning_rate": 4.280814821158899e-05, + "loss": 0.1257, + "step": 13760 + }, + { + "epoch": 0.7332658821023483, + "grad_norm": 0.06110945716500282, + "learning_rate": 4.276052141224931e-05, + "loss": 0.1253, + "step": 13770 + }, + { + "epoch": 0.7337983918206508, + "grad_norm": 0.05198313668370247, + "learning_rate": 4.271296357882962e-05, + "loss": 0.1254, + "step": 13780 + }, + { + "epoch": 0.7343309015389531, + "grad_norm": 0.08895553648471832, + "learning_rate": 4.266547485880954e-05, + "loss": 0.1245, + "step": 13790 + }, + { + "epoch": 0.7348634112572554, + "grad_norm": 0.0400872640311718, + "learning_rate": 4.261805539945433e-05, + "loss": 0.1258, + "step": 13800 + }, + { + "epoch": 0.7353959209755578, + "grad_norm": 0.05898161605000496, + "learning_rate": 4.257070534781452e-05, + "loss": 0.1257, + "step": 13810 + }, + { + "epoch": 0.7359284306938602, + "grad_norm": 0.05569084361195564, + "learning_rate": 4.2523424850725366e-05, + "loss": 0.1256, + "step": 13820 + }, + { + "epoch": 0.7364609404121625, + "grad_norm": 0.03815682604908943, + "learning_rate": 4.2476214054806464e-05, + "loss": 0.1258, + "step": 13830 + }, + { + "epoch": 0.7369934501304649, + "grad_norm": 0.05617569014430046, + "learning_rate": 4.242907310646124e-05, + "loss": 0.1256, + "step": 13840 + }, + { + "epoch": 0.7375259598487672, + "grad_norm": 0.036379266530275345, + "learning_rate": 4.238200215187653e-05, + "loss": 0.125, + "step": 13850 + }, + { + "epoch": 0.7380584695670696, + "grad_norm": 0.03899050131440163, + "learning_rate": 4.233500133702209e-05, + "loss": 0.1252, + "step": 13860 + }, + { + "epoch": 0.738590979285372, + "grad_norm": 0.08941038697957993, + "learning_rate": 4.2288070807650195e-05, + "loss": 0.1258, + "step": 13870 + }, + { + "epoch": 0.7391234890036743, + "grad_norm": 0.04613060876727104, + "learning_rate": 4.2241210709295157e-05, + "loss": 0.1257, + "step": 13880 + }, + { + "epoch": 0.7396559987219766, + "grad_norm": 0.05115320160984993, + "learning_rate": 4.219442118727289e-05, + "loss": 0.1253, + "step": 13890 + }, + { + "epoch": 0.7401885084402791, + "grad_norm": 0.050107911229133606, + "learning_rate": 4.214770238668041e-05, + "loss": 0.1249, + "step": 13900 + }, + { + "epoch": 0.7407210181585814, + "grad_norm": 0.05156391113996506, + "learning_rate": 4.210105445239544e-05, + "loss": 0.1247, + "step": 13910 + }, + { + "epoch": 0.7412535278768837, + "grad_norm": 0.03768635913729668, + "learning_rate": 4.205447752907594e-05, + "loss": 0.1255, + "step": 13920 + }, + { + "epoch": 0.7417860375951861, + "grad_norm": 0.0679803118109703, + "learning_rate": 4.20079717611597e-05, + "loss": 0.1251, + "step": 13930 + }, + { + "epoch": 0.7423185473134885, + "grad_norm": 0.04139627143740654, + "learning_rate": 4.196153729286377e-05, + "loss": 0.1254, + "step": 13940 + }, + { + "epoch": 0.7428510570317909, + "grad_norm": 0.07428357750177383, + "learning_rate": 4.191517426818419e-05, + "loss": 0.1261, + "step": 13950 + }, + { + "epoch": 0.7433835667500932, + "grad_norm": 0.04479588195681572, + "learning_rate": 4.186888283089537e-05, + "loss": 0.1251, + "step": 13960 + }, + { + "epoch": 0.7439160764683955, + "grad_norm": 0.06392911076545715, + "learning_rate": 4.182266312454977e-05, + "loss": 0.1256, + "step": 13970 + }, + { + "epoch": 0.744448586186698, + "grad_norm": 0.054266393184661865, + "learning_rate": 4.177651529247739e-05, + "loss": 0.125, + "step": 13980 + }, + { + "epoch": 0.7449810959050003, + "grad_norm": 0.06033441051840782, + "learning_rate": 4.173043947778536e-05, + "loss": 0.1253, + "step": 13990 + }, + { + "epoch": 0.7455136056233026, + "grad_norm": 0.04095017537474632, + "learning_rate": 4.1684435823357454e-05, + "loss": 0.1255, + "step": 14000 + }, + { + "epoch": 0.746046115341605, + "grad_norm": 0.037291690707206726, + "learning_rate": 4.163850447185369e-05, + "loss": 0.1245, + "step": 14010 + }, + { + "epoch": 0.7465786250599074, + "grad_norm": 0.06799422949552536, + "learning_rate": 4.159264556570986e-05, + "loss": 0.1255, + "step": 14020 + }, + { + "epoch": 0.7471111347782097, + "grad_norm": 0.044954586774110794, + "learning_rate": 4.1546859247137124e-05, + "loss": 0.1264, + "step": 14030 + }, + { + "epoch": 0.7476436444965121, + "grad_norm": 0.041422173380851746, + "learning_rate": 4.1501145658121525e-05, + "loss": 0.1254, + "step": 14040 + }, + { + "epoch": 0.7481761542148144, + "grad_norm": 0.09260525554418564, + "learning_rate": 4.145550494042356e-05, + "loss": 0.1244, + "step": 14050 + }, + { + "epoch": 0.7487086639331167, + "grad_norm": 0.03994472324848175, + "learning_rate": 4.140993723557775e-05, + "loss": 0.1262, + "step": 14060 + }, + { + "epoch": 0.7492411736514192, + "grad_norm": 0.04395360127091408, + "learning_rate": 4.136444268489221e-05, + "loss": 0.1263, + "step": 14070 + }, + { + "epoch": 0.7497736833697215, + "grad_norm": 0.0409519337117672, + "learning_rate": 4.1319021429448204e-05, + "loss": 0.126, + "step": 14080 + }, + { + "epoch": 0.7503061930880238, + "grad_norm": 0.048877742141485214, + "learning_rate": 4.1273673610099675e-05, + "loss": 0.1253, + "step": 14090 + }, + { + "epoch": 0.7508387028063263, + "grad_norm": 0.04159548133611679, + "learning_rate": 4.122839936747289e-05, + "loss": 0.1255, + "step": 14100 + }, + { + "epoch": 0.7513712125246286, + "grad_norm": 0.036307524889707565, + "learning_rate": 4.118319884196587e-05, + "loss": 0.1249, + "step": 14110 + }, + { + "epoch": 0.7519037222429309, + "grad_norm": 0.039279136806726456, + "learning_rate": 4.1138072173748116e-05, + "loss": 0.125, + "step": 14120 + }, + { + "epoch": 0.7524362319612333, + "grad_norm": 0.05518367886543274, + "learning_rate": 4.109301950276003e-05, + "loss": 0.1256, + "step": 14130 + }, + { + "epoch": 0.7529687416795356, + "grad_norm": 0.043891094624996185, + "learning_rate": 4.104804096871259e-05, + "loss": 0.1257, + "step": 14140 + }, + { + "epoch": 0.753501251397838, + "grad_norm": 0.045587554574012756, + "learning_rate": 4.1003136711086875e-05, + "loss": 0.1263, + "step": 14150 + }, + { + "epoch": 0.7540337611161404, + "grad_norm": 0.05378378927707672, + "learning_rate": 4.0958306869133555e-05, + "loss": 0.1253, + "step": 14160 + }, + { + "epoch": 0.7545662708344427, + "grad_norm": 0.05503176152706146, + "learning_rate": 4.091355158187261e-05, + "loss": 0.1258, + "step": 14170 + }, + { + "epoch": 0.755098780552745, + "grad_norm": 0.10129349678754807, + "learning_rate": 4.0868870988092795e-05, + "loss": 0.1244, + "step": 14180 + }, + { + "epoch": 0.7556312902710475, + "grad_norm": 0.05240345746278763, + "learning_rate": 4.082426522635125e-05, + "loss": 0.1257, + "step": 14190 + }, + { + "epoch": 0.7561637999893498, + "grad_norm": 0.05487096309661865, + "learning_rate": 4.077973443497303e-05, + "loss": 0.1258, + "step": 14200 + }, + { + "epoch": 0.7566963097076521, + "grad_norm": 0.04014230892062187, + "learning_rate": 4.073527875205071e-05, + "loss": 0.1246, + "step": 14210 + }, + { + "epoch": 0.7572288194259545, + "grad_norm": 0.04955144226551056, + "learning_rate": 4.0690898315443955e-05, + "loss": 0.1252, + "step": 14220 + }, + { + "epoch": 0.7577613291442569, + "grad_norm": 0.05915694311261177, + "learning_rate": 4.064659326277911e-05, + "loss": 0.1255, + "step": 14230 + }, + { + "epoch": 0.7582938388625592, + "grad_norm": 0.07433107495307922, + "learning_rate": 4.0602363731448696e-05, + "loss": 0.1247, + "step": 14240 + }, + { + "epoch": 0.7588263485808616, + "grad_norm": 0.041825130581855774, + "learning_rate": 4.0558209858611093e-05, + "loss": 0.1253, + "step": 14250 + }, + { + "epoch": 0.7593588582991639, + "grad_norm": 0.07135327905416489, + "learning_rate": 4.051413178119002e-05, + "loss": 0.1253, + "step": 14260 + }, + { + "epoch": 0.7598913680174664, + "grad_norm": 0.0779680609703064, + "learning_rate": 4.0470129635874176e-05, + "loss": 0.1242, + "step": 14270 + }, + { + "epoch": 0.7604238777357687, + "grad_norm": 0.03307312726974487, + "learning_rate": 4.042620355911677e-05, + "loss": 0.1255, + "step": 14280 + }, + { + "epoch": 0.760956387454071, + "grad_norm": 0.037016890943050385, + "learning_rate": 4.0382353687135136e-05, + "loss": 0.1249, + "step": 14290 + }, + { + "epoch": 0.7614888971723734, + "grad_norm": 0.08262995630502701, + "learning_rate": 4.0338580155910284e-05, + "loss": 0.1249, + "step": 14300 + }, + { + "epoch": 0.7620214068906758, + "grad_norm": 0.05580204352736473, + "learning_rate": 4.029488310118648e-05, + "loss": 0.1251, + "step": 14310 + }, + { + "epoch": 0.7625539166089781, + "grad_norm": 0.0543997660279274, + "learning_rate": 4.025126265847084e-05, + "loss": 0.1261, + "step": 14320 + }, + { + "epoch": 0.7630864263272805, + "grad_norm": 0.06115228682756424, + "learning_rate": 4.02077189630329e-05, + "loss": 0.1251, + "step": 14330 + }, + { + "epoch": 0.7636189360455828, + "grad_norm": 0.046210747212171555, + "learning_rate": 4.016425214990421e-05, + "loss": 0.1255, + "step": 14340 + }, + { + "epoch": 0.7641514457638852, + "grad_norm": 0.0543675497174263, + "learning_rate": 4.0120862353877884e-05, + "loss": 0.1258, + "step": 14350 + }, + { + "epoch": 0.7646839554821876, + "grad_norm": 0.06712432205677032, + "learning_rate": 4.007754970950821e-05, + "loss": 0.1256, + "step": 14360 + }, + { + "epoch": 0.7652164652004899, + "grad_norm": 0.050090424716472626, + "learning_rate": 4.0034314351110216e-05, + "loss": 0.1257, + "step": 14370 + }, + { + "epoch": 0.7657489749187922, + "grad_norm": 0.039436932653188705, + "learning_rate": 3.999115641275929e-05, + "loss": 0.1259, + "step": 14380 + }, + { + "epoch": 0.7662814846370947, + "grad_norm": 0.03885102644562721, + "learning_rate": 3.994807602829068e-05, + "loss": 0.125, + "step": 14390 + }, + { + "epoch": 0.766813994355397, + "grad_norm": 0.03700343519449234, + "learning_rate": 3.990507333129922e-05, + "loss": 0.1254, + "step": 14400 + }, + { + "epoch": 0.7673465040736993, + "grad_norm": 0.06743155419826508, + "learning_rate": 3.986214845513874e-05, + "loss": 0.1252, + "step": 14410 + }, + { + "epoch": 0.7678790137920017, + "grad_norm": 0.05197859928011894, + "learning_rate": 3.9819301532921807e-05, + "loss": 0.1244, + "step": 14420 + }, + { + "epoch": 0.7684115235103041, + "grad_norm": 0.09455039352178574, + "learning_rate": 3.9776532697519206e-05, + "loss": 0.1254, + "step": 14430 + }, + { + "epoch": 0.7689440332286064, + "grad_norm": 0.04639993980526924, + "learning_rate": 3.97338420815596e-05, + "loss": 0.1252, + "step": 14440 + }, + { + "epoch": 0.7694765429469088, + "grad_norm": 0.03305187448859215, + "learning_rate": 3.969122981742909e-05, + "loss": 0.1244, + "step": 14450 + }, + { + "epoch": 0.7700090526652111, + "grad_norm": 0.06983647495508194, + "learning_rate": 3.9648696037270786e-05, + "loss": 0.1259, + "step": 14460 + }, + { + "epoch": 0.7705415623835135, + "grad_norm": 0.07447967678308487, + "learning_rate": 3.960624087298439e-05, + "loss": 0.1251, + "step": 14470 + }, + { + "epoch": 0.7710740721018159, + "grad_norm": 0.03923282399773598, + "learning_rate": 3.956386445622589e-05, + "loss": 0.1254, + "step": 14480 + }, + { + "epoch": 0.7716065818201182, + "grad_norm": 0.05779058113694191, + "learning_rate": 3.9521566918406984e-05, + "loss": 0.1248, + "step": 14490 + }, + { + "epoch": 0.7721390915384205, + "grad_norm": 0.043516259640455246, + "learning_rate": 3.947934839069485e-05, + "loss": 0.1248, + "step": 14500 + }, + { + "epoch": 0.772671601256723, + "grad_norm": 0.05518548563122749, + "learning_rate": 3.943720900401157e-05, + "loss": 0.1261, + "step": 14510 + }, + { + "epoch": 0.7732041109750253, + "grad_norm": 0.045164406299591064, + "learning_rate": 3.939514888903383e-05, + "loss": 0.1251, + "step": 14520 + }, + { + "epoch": 0.7737366206933276, + "grad_norm": 0.042605891823768616, + "learning_rate": 3.935316817619252e-05, + "loss": 0.1251, + "step": 14530 + }, + { + "epoch": 0.77426913041163, + "grad_norm": 0.05655062943696976, + "learning_rate": 3.931126699567228e-05, + "loss": 0.1258, + "step": 14540 + }, + { + "epoch": 0.7748016401299324, + "grad_norm": 0.06695695966482162, + "learning_rate": 3.926944547741112e-05, + "loss": 0.1257, + "step": 14550 + }, + { + "epoch": 0.7753341498482347, + "grad_norm": 0.0719684287905693, + "learning_rate": 3.922770375109997e-05, + "loss": 0.1256, + "step": 14560 + }, + { + "epoch": 0.7758666595665371, + "grad_norm": 0.043789032846689224, + "learning_rate": 3.918604194618241e-05, + "loss": 0.1254, + "step": 14570 + }, + { + "epoch": 0.7763991692848394, + "grad_norm": 0.03638778626918793, + "learning_rate": 3.9144460191854075e-05, + "loss": 0.1247, + "step": 14580 + }, + { + "epoch": 0.7769316790031419, + "grad_norm": 0.0492616705596447, + "learning_rate": 3.910295861706244e-05, + "loss": 0.1248, + "step": 14590 + }, + { + "epoch": 0.7774641887214442, + "grad_norm": 0.051167041063308716, + "learning_rate": 3.906153735050632e-05, + "loss": 0.1255, + "step": 14600 + }, + { + "epoch": 0.7779966984397465, + "grad_norm": 0.09880778193473816, + "learning_rate": 3.9020196520635454e-05, + "loss": 0.1256, + "step": 14610 + }, + { + "epoch": 0.7785292081580489, + "grad_norm": 0.06614736467599869, + "learning_rate": 3.897893625565016e-05, + "loss": 0.1248, + "step": 14620 + }, + { + "epoch": 0.7790617178763513, + "grad_norm": 0.05390491709113121, + "learning_rate": 3.893775668350095e-05, + "loss": 0.125, + "step": 14630 + }, + { + "epoch": 0.7795942275946536, + "grad_norm": 0.05455655977129936, + "learning_rate": 3.8896657931888056e-05, + "loss": 0.1248, + "step": 14640 + }, + { + "epoch": 0.780126737312956, + "grad_norm": 0.06813376396894455, + "learning_rate": 3.8855640128261135e-05, + "loss": 0.1254, + "step": 14650 + }, + { + "epoch": 0.7806592470312583, + "grad_norm": 0.03827499598264694, + "learning_rate": 3.8814703399818756e-05, + "loss": 0.1255, + "step": 14660 + }, + { + "epoch": 0.7811917567495607, + "grad_norm": 0.07484028488397598, + "learning_rate": 3.877384787350812e-05, + "loss": 0.1258, + "step": 14670 + }, + { + "epoch": 0.7817242664678631, + "grad_norm": 0.04092638939619064, + "learning_rate": 3.873307367602458e-05, + "loss": 0.1256, + "step": 14680 + }, + { + "epoch": 0.7822567761861654, + "grad_norm": 0.044872015714645386, + "learning_rate": 3.869238093381131e-05, + "loss": 0.125, + "step": 14690 + }, + { + "epoch": 0.7827892859044677, + "grad_norm": 0.053619783371686935, + "learning_rate": 3.8651769773058894e-05, + "loss": 0.1244, + "step": 14700 + }, + { + "epoch": 0.7833217956227702, + "grad_norm": 0.053385183215141296, + "learning_rate": 3.861124031970487e-05, + "loss": 0.1252, + "step": 14710 + }, + { + "epoch": 0.7838543053410725, + "grad_norm": 0.039729390293359756, + "learning_rate": 3.857079269943348e-05, + "loss": 0.1252, + "step": 14720 + }, + { + "epoch": 0.7843868150593748, + "grad_norm": 0.055133990943431854, + "learning_rate": 3.853042703767511e-05, + "loss": 0.1255, + "step": 14730 + }, + { + "epoch": 0.7849193247776772, + "grad_norm": 0.0439545176923275, + "learning_rate": 3.849014345960605e-05, + "loss": 0.1254, + "step": 14740 + }, + { + "epoch": 0.7854518344959796, + "grad_norm": 0.04726070538163185, + "learning_rate": 3.844994209014805e-05, + "loss": 0.1243, + "step": 14750 + }, + { + "epoch": 0.7859843442142819, + "grad_norm": 0.11086293309926987, + "learning_rate": 3.840982305396787e-05, + "loss": 0.1254, + "step": 14760 + }, + { + "epoch": 0.7865168539325843, + "grad_norm": 0.05439605191349983, + "learning_rate": 3.8369786475476986e-05, + "loss": 0.1247, + "step": 14770 + }, + { + "epoch": 0.7870493636508866, + "grad_norm": 0.0793689638376236, + "learning_rate": 3.832983247883116e-05, + "loss": 0.125, + "step": 14780 + }, + { + "epoch": 0.787581873369189, + "grad_norm": 0.04240609332919121, + "learning_rate": 3.8289961187930076e-05, + "loss": 0.1255, + "step": 14790 + }, + { + "epoch": 0.7881143830874914, + "grad_norm": 0.04460853338241577, + "learning_rate": 3.825017272641693e-05, + "loss": 0.1258, + "step": 14800 + }, + { + "epoch": 0.7886468928057937, + "grad_norm": 0.04268253594636917, + "learning_rate": 3.821046721767806e-05, + "loss": 0.1256, + "step": 14810 + }, + { + "epoch": 0.789179402524096, + "grad_norm": 0.07220305502414703, + "learning_rate": 3.817084478484256e-05, + "loss": 0.1258, + "step": 14820 + }, + { + "epoch": 0.7897119122423985, + "grad_norm": 0.05979606509208679, + "learning_rate": 3.8131305550781906e-05, + "loss": 0.1262, + "step": 14830 + }, + { + "epoch": 0.7902444219607008, + "grad_norm": 0.06392871588468552, + "learning_rate": 3.8091849638109575e-05, + "loss": 0.1244, + "step": 14840 + }, + { + "epoch": 0.7907769316790031, + "grad_norm": 0.042751483619213104, + "learning_rate": 3.8052477169180634e-05, + "loss": 0.125, + "step": 14850 + }, + { + "epoch": 0.7913094413973055, + "grad_norm": 0.03440069034695625, + "learning_rate": 3.801318826609144e-05, + "loss": 0.1255, + "step": 14860 + }, + { + "epoch": 0.7918419511156078, + "grad_norm": 0.061454493552446365, + "learning_rate": 3.797398305067914e-05, + "loss": 0.1251, + "step": 14870 + }, + { + "epoch": 0.7923744608339102, + "grad_norm": 0.058559708297252655, + "learning_rate": 3.7934861644521405e-05, + "loss": 0.125, + "step": 14880 + }, + { + "epoch": 0.7929069705522126, + "grad_norm": 0.0407247468829155, + "learning_rate": 3.789582416893599e-05, + "loss": 0.1254, + "step": 14890 + }, + { + "epoch": 0.7934394802705149, + "grad_norm": 0.05672033876180649, + "learning_rate": 3.78568707449804e-05, + "loss": 0.1253, + "step": 14900 + }, + { + "epoch": 0.7939719899888174, + "grad_norm": 0.056891556829214096, + "learning_rate": 3.781800149345146e-05, + "loss": 0.1244, + "step": 14910 + }, + { + "epoch": 0.7945044997071197, + "grad_norm": 0.05665665119886398, + "learning_rate": 3.7779216534885e-05, + "loss": 0.1249, + "step": 14920 + }, + { + "epoch": 0.795037009425422, + "grad_norm": 0.03597261756658554, + "learning_rate": 3.774051598955541e-05, + "loss": 0.1239, + "step": 14930 + }, + { + "epoch": 0.7955695191437244, + "grad_norm": 0.06507623195648193, + "learning_rate": 3.770189997747536e-05, + "loss": 0.1246, + "step": 14940 + }, + { + "epoch": 0.7961020288620267, + "grad_norm": 0.05575447157025337, + "learning_rate": 3.7663368618395365e-05, + "loss": 0.1251, + "step": 14950 + }, + { + "epoch": 0.7966345385803291, + "grad_norm": 0.07422123104333878, + "learning_rate": 3.7624922031803403e-05, + "loss": 0.1248, + "step": 14960 + }, + { + "epoch": 0.7971670482986315, + "grad_norm": 0.03994056582450867, + "learning_rate": 3.758656033692457e-05, + "loss": 0.1254, + "step": 14970 + }, + { + "epoch": 0.7976995580169338, + "grad_norm": 0.06453961879014969, + "learning_rate": 3.754828365272072e-05, + "loss": 0.1248, + "step": 14980 + }, + { + "epoch": 0.7982320677352361, + "grad_norm": 0.03173014149069786, + "learning_rate": 3.751009209789011e-05, + "loss": 0.1246, + "step": 14990 + }, + { + "epoch": 0.7987645774535386, + "grad_norm": 0.05219841003417969, + "learning_rate": 3.747198579086695e-05, + "loss": 0.125, + "step": 15000 + }, + { + "epoch": 0.7992970871718409, + "grad_norm": 0.04533257335424423, + "learning_rate": 3.7433964849821145e-05, + "loss": 0.1247, + "step": 15010 + }, + { + "epoch": 0.7998295968901432, + "grad_norm": 0.05037694424390793, + "learning_rate": 3.7396029392657835e-05, + "loss": 0.1254, + "step": 15020 + }, + { + "epoch": 0.8003621066084456, + "grad_norm": 0.047749314457178116, + "learning_rate": 3.7358179537017066e-05, + "loss": 0.1251, + "step": 15030 + }, + { + "epoch": 0.800894616326748, + "grad_norm": 0.06332427263259888, + "learning_rate": 3.732041540027348e-05, + "loss": 0.1246, + "step": 15040 + }, + { + "epoch": 0.8014271260450503, + "grad_norm": 0.04189267009496689, + "learning_rate": 3.728273709953586e-05, + "loss": 0.1248, + "step": 15050 + }, + { + "epoch": 0.8019596357633527, + "grad_norm": 0.04982787370681763, + "learning_rate": 3.724514475164681e-05, + "loss": 0.1251, + "step": 15060 + }, + { + "epoch": 0.802492145481655, + "grad_norm": 0.032108910381793976, + "learning_rate": 3.720763847318239e-05, + "loss": 0.125, + "step": 15070 + }, + { + "epoch": 0.8030246551999574, + "grad_norm": 0.03504796326160431, + "learning_rate": 3.717021838045175e-05, + "loss": 0.1247, + "step": 15080 + }, + { + "epoch": 0.8035571649182598, + "grad_norm": 0.05086008459329605, + "learning_rate": 3.713288458949679e-05, + "loss": 0.1253, + "step": 15090 + }, + { + "epoch": 0.8040896746365621, + "grad_norm": 0.04598323255777359, + "learning_rate": 3.709563721609178e-05, + "loss": 0.1246, + "step": 15100 + }, + { + "epoch": 0.8046221843548644, + "grad_norm": 0.034302182495594025, + "learning_rate": 3.705847637574299e-05, + "loss": 0.1256, + "step": 15110 + }, + { + "epoch": 0.8051546940731669, + "grad_norm": 0.04984142258763313, + "learning_rate": 3.7021402183688334e-05, + "loss": 0.1249, + "step": 15120 + }, + { + "epoch": 0.8056872037914692, + "grad_norm": 0.040011048316955566, + "learning_rate": 3.698441475489707e-05, + "loss": 0.1245, + "step": 15130 + }, + { + "epoch": 0.8062197135097715, + "grad_norm": 0.0699247419834137, + "learning_rate": 3.694751420406937e-05, + "loss": 0.1244, + "step": 15140 + }, + { + "epoch": 0.8067522232280739, + "grad_norm": 0.035953816026449203, + "learning_rate": 3.6910700645635975e-05, + "loss": 0.1251, + "step": 15150 + }, + { + "epoch": 0.8072847329463763, + "grad_norm": 0.09570103138685226, + "learning_rate": 3.68739741937579e-05, + "loss": 0.1245, + "step": 15160 + }, + { + "epoch": 0.8078172426646786, + "grad_norm": 0.051727280020713806, + "learning_rate": 3.683733496232599e-05, + "loss": 0.1257, + "step": 15170 + }, + { + "epoch": 0.808349752382981, + "grad_norm": 0.05984990671277046, + "learning_rate": 3.680078306496066e-05, + "loss": 0.1251, + "step": 15180 + }, + { + "epoch": 0.8088822621012833, + "grad_norm": 0.050374679267406464, + "learning_rate": 3.676431861501146e-05, + "loss": 0.1245, + "step": 15190 + }, + { + "epoch": 0.8094147718195857, + "grad_norm": 0.08578687161207199, + "learning_rate": 3.672794172555677e-05, + "loss": 0.1253, + "step": 15200 + }, + { + "epoch": 0.8099472815378881, + "grad_norm": 0.0373394675552845, + "learning_rate": 3.6691652509403475e-05, + "loss": 0.1244, + "step": 15210 + }, + { + "epoch": 0.8104797912561904, + "grad_norm": 0.03546525537967682, + "learning_rate": 3.6655451079086525e-05, + "loss": 0.1244, + "step": 15220 + }, + { + "epoch": 0.8110123009744928, + "grad_norm": 0.04470152407884598, + "learning_rate": 3.661933754686867e-05, + "loss": 0.1251, + "step": 15230 + }, + { + "epoch": 0.8115448106927952, + "grad_norm": 0.05563315749168396, + "learning_rate": 3.6583312024740076e-05, + "loss": 0.1254, + "step": 15240 + }, + { + "epoch": 0.8120773204110975, + "grad_norm": 0.04946048930287361, + "learning_rate": 3.654737462441801e-05, + "loss": 0.1246, + "step": 15250 + }, + { + "epoch": 0.8126098301293999, + "grad_norm": 0.05127432197332382, + "learning_rate": 3.651152545734643e-05, + "loss": 0.1247, + "step": 15260 + }, + { + "epoch": 0.8131423398477022, + "grad_norm": 0.05603098124265671, + "learning_rate": 3.6475764634695674e-05, + "loss": 0.1246, + "step": 15270 + }, + { + "epoch": 0.8136748495660046, + "grad_norm": 0.047840967774391174, + "learning_rate": 3.644009226736217e-05, + "loss": 0.1248, + "step": 15280 + }, + { + "epoch": 0.814207359284307, + "grad_norm": 0.04910242184996605, + "learning_rate": 3.6404508465968e-05, + "loss": 0.1252, + "step": 15290 + }, + { + "epoch": 0.8147398690026093, + "grad_norm": 0.07945267856121063, + "learning_rate": 3.6369013340860606e-05, + "loss": 0.1249, + "step": 15300 + }, + { + "epoch": 0.8152723787209116, + "grad_norm": 0.054363641887903214, + "learning_rate": 3.633360700211243e-05, + "loss": 0.125, + "step": 15310 + }, + { + "epoch": 0.8158048884392141, + "grad_norm": 0.051253627985715866, + "learning_rate": 3.629828955952062e-05, + "loss": 0.1252, + "step": 15320 + }, + { + "epoch": 0.8163373981575164, + "grad_norm": 0.049010276794433594, + "learning_rate": 3.62630611226066e-05, + "loss": 0.124, + "step": 15330 + }, + { + "epoch": 0.8168699078758187, + "grad_norm": 0.05660669878125191, + "learning_rate": 3.62279218006158e-05, + "loss": 0.1245, + "step": 15340 + }, + { + "epoch": 0.8174024175941211, + "grad_norm": 0.038386616855859756, + "learning_rate": 3.619287170251734e-05, + "loss": 0.125, + "step": 15350 + }, + { + "epoch": 0.8179349273124235, + "grad_norm": 0.06046159192919731, + "learning_rate": 3.6157910937003597e-05, + "loss": 0.1245, + "step": 15360 + }, + { + "epoch": 0.8184674370307258, + "grad_norm": 0.06180752068758011, + "learning_rate": 3.612303961248995e-05, + "loss": 0.1246, + "step": 15370 + }, + { + "epoch": 0.8189999467490282, + "grad_norm": 0.041465550661087036, + "learning_rate": 3.60882578371144e-05, + "loss": 0.1247, + "step": 15380 + }, + { + "epoch": 0.8195324564673305, + "grad_norm": 0.0629926398396492, + "learning_rate": 3.6053565718737265e-05, + "loss": 0.1248, + "step": 15390 + }, + { + "epoch": 0.8200649661856328, + "grad_norm": 0.05619725584983826, + "learning_rate": 3.601896336494083e-05, + "loss": 0.1251, + "step": 15400 + }, + { + "epoch": 0.8205974759039353, + "grad_norm": 0.06445404887199402, + "learning_rate": 3.598445088302901e-05, + "loss": 0.1244, + "step": 15410 + }, + { + "epoch": 0.8211299856222376, + "grad_norm": 0.04841604083776474, + "learning_rate": 3.595002838002704e-05, + "loss": 0.1249, + "step": 15420 + }, + { + "epoch": 0.8216624953405399, + "grad_norm": 0.04060814529657364, + "learning_rate": 3.591569596268108e-05, + "loss": 0.1251, + "step": 15430 + }, + { + "epoch": 0.8221950050588424, + "grad_norm": 0.03796577826142311, + "learning_rate": 3.5881453737457984e-05, + "loss": 0.1246, + "step": 15440 + }, + { + "epoch": 0.8227275147771447, + "grad_norm": 0.033980198204517365, + "learning_rate": 3.5847301810544856e-05, + "loss": 0.1252, + "step": 15450 + }, + { + "epoch": 0.823260024495447, + "grad_norm": 0.05901845172047615, + "learning_rate": 3.581324028784886e-05, + "loss": 0.1244, + "step": 15460 + }, + { + "epoch": 0.8237925342137494, + "grad_norm": 0.05464969575405121, + "learning_rate": 3.577926927499673e-05, + "loss": 0.1249, + "step": 15470 + }, + { + "epoch": 0.8243250439320517, + "grad_norm": 0.05274730920791626, + "learning_rate": 3.574538887733456e-05, + "loss": 0.1253, + "step": 15480 + }, + { + "epoch": 0.8248575536503541, + "grad_norm": 0.0523492768406868, + "learning_rate": 3.5711599199927446e-05, + "loss": 0.1246, + "step": 15490 + }, + { + "epoch": 0.8253900633686565, + "grad_norm": 0.03017192892730236, + "learning_rate": 3.5677900347559146e-05, + "loss": 0.1246, + "step": 15500 + }, + { + "epoch": 0.8259225730869588, + "grad_norm": 0.058320943266153336, + "learning_rate": 3.564429242473178e-05, + "loss": 0.1253, + "step": 15510 + }, + { + "epoch": 0.8264550828052611, + "grad_norm": 0.05324307456612587, + "learning_rate": 3.5610775535665465e-05, + "loss": 0.1247, + "step": 15520 + }, + { + "epoch": 0.8269875925235636, + "grad_norm": 0.05818801745772362, + "learning_rate": 3.557734978429801e-05, + "loss": 0.125, + "step": 15530 + }, + { + "epoch": 0.8275201022418659, + "grad_norm": 0.06262166053056717, + "learning_rate": 3.554401527428465e-05, + "loss": 0.1246, + "step": 15540 + }, + { + "epoch": 0.8280526119601683, + "grad_norm": 0.04980841279029846, + "learning_rate": 3.551077210899763e-05, + "loss": 0.1244, + "step": 15550 + }, + { + "epoch": 0.8285851216784706, + "grad_norm": 0.03848971053957939, + "learning_rate": 3.547762039152594e-05, + "loss": 0.1247, + "step": 15560 + }, + { + "epoch": 0.829117631396773, + "grad_norm": 0.04948917403817177, + "learning_rate": 3.5444560224675e-05, + "loss": 0.126, + "step": 15570 + }, + { + "epoch": 0.8296501411150754, + "grad_norm": 0.04893777146935463, + "learning_rate": 3.541159171096631e-05, + "loss": 0.1252, + "step": 15580 + }, + { + "epoch": 0.8301826508333777, + "grad_norm": 0.0531187430024147, + "learning_rate": 3.537871495263716e-05, + "loss": 0.1242, + "step": 15590 + }, + { + "epoch": 0.83071516055168, + "grad_norm": 0.04317576438188553, + "learning_rate": 3.534593005164027e-05, + "loss": 0.1241, + "step": 15600 + }, + { + "epoch": 0.8312476702699825, + "grad_norm": 0.03508533909916878, + "learning_rate": 3.531323710964356e-05, + "loss": 0.1251, + "step": 15610 + }, + { + "epoch": 0.8317801799882848, + "grad_norm": 0.04336007684469223, + "learning_rate": 3.528063622802974e-05, + "loss": 0.1255, + "step": 15620 + }, + { + "epoch": 0.8323126897065871, + "grad_norm": 0.05976368486881256, + "learning_rate": 3.5248127507896045e-05, + "loss": 0.1243, + "step": 15630 + }, + { + "epoch": 0.8328451994248895, + "grad_norm": 0.03515305742621422, + "learning_rate": 3.52157110500539e-05, + "loss": 0.1256, + "step": 15640 + }, + { + "epoch": 0.8333777091431919, + "grad_norm": 0.08611953258514404, + "learning_rate": 3.518338695502864e-05, + "loss": 0.1243, + "step": 15650 + }, + { + "epoch": 0.8339102188614942, + "grad_norm": 0.041929975152015686, + "learning_rate": 3.515115532305918e-05, + "loss": 0.1246, + "step": 15660 + }, + { + "epoch": 0.8344427285797966, + "grad_norm": 0.03388476371765137, + "learning_rate": 3.511901625409768e-05, + "loss": 0.1246, + "step": 15670 + }, + { + "epoch": 0.8349752382980989, + "grad_norm": 0.04702109470963478, + "learning_rate": 3.5086969847809256e-05, + "loss": 0.1241, + "step": 15680 + }, + { + "epoch": 0.8355077480164013, + "grad_norm": 0.04313468933105469, + "learning_rate": 3.50550162035717e-05, + "loss": 0.125, + "step": 15690 + }, + { + "epoch": 0.8360402577347037, + "grad_norm": 0.04187025874853134, + "learning_rate": 3.502315542047512e-05, + "loss": 0.1244, + "step": 15700 + }, + { + "epoch": 0.836572767453006, + "grad_norm": 0.0742115005850792, + "learning_rate": 3.4991387597321654e-05, + "loss": 0.1247, + "step": 15710 + }, + { + "epoch": 0.8371052771713083, + "grad_norm": 0.047620195895433426, + "learning_rate": 3.495971283262519e-05, + "loss": 0.1247, + "step": 15720 + }, + { + "epoch": 0.8376377868896108, + "grad_norm": 0.06339036673307419, + "learning_rate": 3.492813122461101e-05, + "loss": 0.1249, + "step": 15730 + }, + { + "epoch": 0.8381702966079131, + "grad_norm": 0.03321847692131996, + "learning_rate": 3.489664287121553e-05, + "loss": 0.1249, + "step": 15740 + }, + { + "epoch": 0.8387028063262154, + "grad_norm": 0.06577350944280624, + "learning_rate": 3.486524787008595e-05, + "loss": 0.1241, + "step": 15750 + }, + { + "epoch": 0.8392353160445178, + "grad_norm": 0.04866393655538559, + "learning_rate": 3.4833946318580026e-05, + "loss": 0.1244, + "step": 15760 + }, + { + "epoch": 0.8397678257628202, + "grad_norm": 0.045106563717126846, + "learning_rate": 3.4802738313765685e-05, + "loss": 0.1246, + "step": 15770 + }, + { + "epoch": 0.8403003354811225, + "grad_norm": 0.06193890795111656, + "learning_rate": 3.477162395242076e-05, + "loss": 0.1251, + "step": 15780 + }, + { + "epoch": 0.8408328451994249, + "grad_norm": 0.06153490021824837, + "learning_rate": 3.4740603331032706e-05, + "loss": 0.125, + "step": 15790 + }, + { + "epoch": 0.8413653549177272, + "grad_norm": 0.05703847110271454, + "learning_rate": 3.470967654579828e-05, + "loss": 0.1251, + "step": 15800 + }, + { + "epoch": 0.8418978646360296, + "grad_norm": 0.03664189949631691, + "learning_rate": 3.467884369262325e-05, + "loss": 0.1249, + "step": 15810 + }, + { + "epoch": 0.842430374354332, + "grad_norm": 0.037624064832925797, + "learning_rate": 3.46481048671221e-05, + "loss": 0.1245, + "step": 15820 + }, + { + "epoch": 0.8429628840726343, + "grad_norm": 0.04117140918970108, + "learning_rate": 3.4617460164617684e-05, + "loss": 0.1252, + "step": 15830 + }, + { + "epoch": 0.8434953937909366, + "grad_norm": 0.03222690895199776, + "learning_rate": 3.4586909680141047e-05, + "loss": 0.1245, + "step": 15840 + }, + { + "epoch": 0.8440279035092391, + "grad_norm": 0.057400964200496674, + "learning_rate": 3.455645350843102e-05, + "loss": 0.1248, + "step": 15850 + }, + { + "epoch": 0.8445604132275414, + "grad_norm": 0.04511050879955292, + "learning_rate": 3.452609174393395e-05, + "loss": 0.1248, + "step": 15860 + }, + { + "epoch": 0.8450929229458438, + "grad_norm": 0.03972748667001724, + "learning_rate": 3.4495824480803455e-05, + "loss": 0.1247, + "step": 15870 + }, + { + "epoch": 0.8456254326641461, + "grad_norm": 0.08818963170051575, + "learning_rate": 3.446565181290007e-05, + "loss": 0.125, + "step": 15880 + }, + { + "epoch": 0.8461579423824485, + "grad_norm": 0.07608040422201157, + "learning_rate": 3.4435573833791016e-05, + "loss": 0.1246, + "step": 15890 + }, + { + "epoch": 0.8466904521007509, + "grad_norm": 0.055682647973299026, + "learning_rate": 3.4405590636749836e-05, + "loss": 0.1255, + "step": 15900 + }, + { + "epoch": 0.8472229618190532, + "grad_norm": 0.0459553599357605, + "learning_rate": 3.437570231475618e-05, + "loss": 0.1247, + "step": 15910 + }, + { + "epoch": 0.8477554715373555, + "grad_norm": 0.07165340334177017, + "learning_rate": 3.43459089604955e-05, + "loss": 0.1245, + "step": 15920 + }, + { + "epoch": 0.848287981255658, + "grad_norm": 0.04513763263821602, + "learning_rate": 3.43162106663587e-05, + "loss": 0.1249, + "step": 15930 + }, + { + "epoch": 0.8488204909739603, + "grad_norm": 0.07213608175516129, + "learning_rate": 3.428660752444193e-05, + "loss": 0.1242, + "step": 15940 + }, + { + "epoch": 0.8493530006922626, + "grad_norm": 0.030396446585655212, + "learning_rate": 3.425709962654625e-05, + "loss": 0.1252, + "step": 15950 + }, + { + "epoch": 0.849885510410565, + "grad_norm": 0.030090300366282463, + "learning_rate": 3.4227687064177385e-05, + "loss": 0.1247, + "step": 15960 + }, + { + "epoch": 0.8504180201288674, + "grad_norm": 0.05374327301979065, + "learning_rate": 3.419836992854541e-05, + "loss": 0.1243, + "step": 15970 + }, + { + "epoch": 0.8509505298471697, + "grad_norm": 0.04545629397034645, + "learning_rate": 3.416914831056446e-05, + "loss": 0.1245, + "step": 15980 + }, + { + "epoch": 0.8514830395654721, + "grad_norm": 0.055011678487062454, + "learning_rate": 3.414002230085248e-05, + "loss": 0.1249, + "step": 15990 + }, + { + "epoch": 0.8520155492837744, + "grad_norm": 0.05431196093559265, + "learning_rate": 3.411099198973092e-05, + "loss": 0.1241, + "step": 16000 + }, + { + "epoch": 0.8525480590020768, + "grad_norm": 0.05473232641816139, + "learning_rate": 3.4082057467224484e-05, + "loss": 0.1253, + "step": 16010 + }, + { + "epoch": 0.8530805687203792, + "grad_norm": 0.07439985126256943, + "learning_rate": 3.40532188230608e-05, + "loss": 0.1245, + "step": 16020 + }, + { + "epoch": 0.8536130784386815, + "grad_norm": 0.07038458436727524, + "learning_rate": 3.402447614667018e-05, + "loss": 0.125, + "step": 16030 + }, + { + "epoch": 0.8541455881569838, + "grad_norm": 0.061058055609464645, + "learning_rate": 3.3995829527185354e-05, + "loss": 0.1249, + "step": 16040 + }, + { + "epoch": 0.8546780978752863, + "grad_norm": 0.057675547897815704, + "learning_rate": 3.396727905344115e-05, + "loss": 0.124, + "step": 16050 + }, + { + "epoch": 0.8552106075935886, + "grad_norm": 0.038779694586992264, + "learning_rate": 3.3938824813974254e-05, + "loss": 0.1242, + "step": 16060 + }, + { + "epoch": 0.8557431173118909, + "grad_norm": 0.044711895287036896, + "learning_rate": 3.391046689702292e-05, + "loss": 0.1241, + "step": 16070 + }, + { + "epoch": 0.8562756270301933, + "grad_norm": 0.0402277447283268, + "learning_rate": 3.388220539052671e-05, + "loss": 0.1241, + "step": 16080 + }, + { + "epoch": 0.8568081367484957, + "grad_norm": 0.07318955659866333, + "learning_rate": 3.3854040382126196e-05, + "loss": 0.125, + "step": 16090 + }, + { + "epoch": 0.857340646466798, + "grad_norm": 0.05128632113337517, + "learning_rate": 3.382597195916271e-05, + "loss": 0.1252, + "step": 16100 + }, + { + "epoch": 0.8578731561851004, + "grad_norm": 0.04426991939544678, + "learning_rate": 3.379800020867808e-05, + "loss": 0.124, + "step": 16110 + }, + { + "epoch": 0.8584056659034027, + "grad_norm": 0.09766895323991776, + "learning_rate": 3.377012521741433e-05, + "loss": 0.1246, + "step": 16120 + }, + { + "epoch": 0.858938175621705, + "grad_norm": 0.04723978415131569, + "learning_rate": 3.3742347071813424e-05, + "loss": 0.1255, + "step": 16130 + }, + { + "epoch": 0.8594706853400075, + "grad_norm": 0.0812908411026001, + "learning_rate": 3.3714665858017015e-05, + "loss": 0.1239, + "step": 16140 + }, + { + "epoch": 0.8600031950583098, + "grad_norm": 0.0860326737165451, + "learning_rate": 3.3687081661866164e-05, + "loss": 0.1241, + "step": 16150 + }, + { + "epoch": 0.8605357047766121, + "grad_norm": 0.04998904466629028, + "learning_rate": 3.365959456890109e-05, + "loss": 0.1247, + "step": 16160 + }, + { + "epoch": 0.8610682144949146, + "grad_norm": 0.035985738039016724, + "learning_rate": 3.3632204664360836e-05, + "loss": 0.1238, + "step": 16170 + }, + { + "epoch": 0.8616007242132169, + "grad_norm": 0.09907463192939758, + "learning_rate": 3.3604912033183126e-05, + "loss": 0.1244, + "step": 16180 + }, + { + "epoch": 0.8621332339315193, + "grad_norm": 0.07233595103025436, + "learning_rate": 3.357771676000397e-05, + "loss": 0.1253, + "step": 16190 + }, + { + "epoch": 0.8626657436498216, + "grad_norm": 0.038175683468580246, + "learning_rate": 3.355061892915752e-05, + "loss": 0.1245, + "step": 16200 + }, + { + "epoch": 0.863198253368124, + "grad_norm": 0.03324522450566292, + "learning_rate": 3.352361862467572e-05, + "loss": 0.1242, + "step": 16210 + }, + { + "epoch": 0.8637307630864264, + "grad_norm": 0.03613545373082161, + "learning_rate": 3.349671593028809e-05, + "loss": 0.1244, + "step": 16220 + }, + { + "epoch": 0.8642632728047287, + "grad_norm": 0.10772500932216644, + "learning_rate": 3.346991092942146e-05, + "loss": 0.1247, + "step": 16230 + }, + { + "epoch": 0.864795782523031, + "grad_norm": 0.05393153801560402, + "learning_rate": 3.3443203705199686e-05, + "loss": 0.1247, + "step": 16240 + }, + { + "epoch": 0.8653282922413335, + "grad_norm": 0.04021570831537247, + "learning_rate": 3.3416594340443444e-05, + "loss": 0.1248, + "step": 16250 + }, + { + "epoch": 0.8658608019596358, + "grad_norm": 0.04227181524038315, + "learning_rate": 3.339008291766991e-05, + "loss": 0.1245, + "step": 16260 + }, + { + "epoch": 0.8663933116779381, + "grad_norm": 0.044721730053424835, + "learning_rate": 3.3363669519092563e-05, + "loss": 0.1247, + "step": 16270 + }, + { + "epoch": 0.8669258213962405, + "grad_norm": 0.1196049377322197, + "learning_rate": 3.33373542266209e-05, + "loss": 0.1251, + "step": 16280 + }, + { + "epoch": 0.8674583311145428, + "grad_norm": 0.03756421059370041, + "learning_rate": 3.331113712186016e-05, + "loss": 0.1242, + "step": 16290 + }, + { + "epoch": 0.8679908408328452, + "grad_norm": 0.03522124141454697, + "learning_rate": 3.328501828611112e-05, + "loss": 0.1245, + "step": 16300 + }, + { + "epoch": 0.8685233505511476, + "grad_norm": 0.04906485602259636, + "learning_rate": 3.325899780036982e-05, + "loss": 0.125, + "step": 16310 + }, + { + "epoch": 0.8690558602694499, + "grad_norm": 0.046862684190273285, + "learning_rate": 3.3233075745327286e-05, + "loss": 0.124, + "step": 16320 + }, + { + "epoch": 0.8695883699877522, + "grad_norm": 0.1074092835187912, + "learning_rate": 3.320725220136934e-05, + "loss": 0.1245, + "step": 16330 + }, + { + "epoch": 0.8701208797060547, + "grad_norm": 0.05268271267414093, + "learning_rate": 3.3181527248576294e-05, + "loss": 0.1243, + "step": 16340 + }, + { + "epoch": 0.870653389424357, + "grad_norm": 0.049087993800640106, + "learning_rate": 3.3155900966722727e-05, + "loss": 0.1242, + "step": 16350 + }, + { + "epoch": 0.8711858991426593, + "grad_norm": 0.060601964592933655, + "learning_rate": 3.313037343527722e-05, + "loss": 0.1247, + "step": 16360 + }, + { + "epoch": 0.8717184088609617, + "grad_norm": 0.05477839335799217, + "learning_rate": 3.310494473340215e-05, + "loss": 0.1254, + "step": 16370 + }, + { + "epoch": 0.8722509185792641, + "grad_norm": 0.055110715329647064, + "learning_rate": 3.3079614939953416e-05, + "loss": 0.1246, + "step": 16380 + }, + { + "epoch": 0.8727834282975664, + "grad_norm": 0.0602547712624073, + "learning_rate": 3.305438413348016e-05, + "loss": 0.125, + "step": 16390 + }, + { + "epoch": 0.8733159380158688, + "grad_norm": 0.05673711746931076, + "learning_rate": 3.3029252392224584e-05, + "loss": 0.1245, + "step": 16400 + }, + { + "epoch": 0.8738484477341711, + "grad_norm": 0.05631018802523613, + "learning_rate": 3.30042197941217e-05, + "loss": 0.125, + "step": 16410 + }, + { + "epoch": 0.8743809574524735, + "grad_norm": 0.047678008675575256, + "learning_rate": 3.297928641679906e-05, + "loss": 0.1242, + "step": 16420 + }, + { + "epoch": 0.8749134671707759, + "grad_norm": 0.05217251926660538, + "learning_rate": 3.2954452337576504e-05, + "loss": 0.1245, + "step": 16430 + }, + { + "epoch": 0.8754459768890782, + "grad_norm": 0.05652473866939545, + "learning_rate": 3.2929717633465954e-05, + "loss": 0.1243, + "step": 16440 + }, + { + "epoch": 0.8759784866073805, + "grad_norm": 0.03848657384514809, + "learning_rate": 3.2905082381171184e-05, + "loss": 0.1243, + "step": 16450 + }, + { + "epoch": 0.876510996325683, + "grad_norm": 0.047618966549634933, + "learning_rate": 3.2880546657087554e-05, + "loss": 0.1246, + "step": 16460 + }, + { + "epoch": 0.8770435060439853, + "grad_norm": 0.06333454698324203, + "learning_rate": 3.2856110537301756e-05, + "loss": 0.1244, + "step": 16470 + }, + { + "epoch": 0.8775760157622876, + "grad_norm": 0.04720817133784294, + "learning_rate": 3.283177409759164e-05, + "loss": 0.1239, + "step": 16480 + }, + { + "epoch": 0.87810852548059, + "grad_norm": 0.03655124083161354, + "learning_rate": 3.280753741342592e-05, + "loss": 0.1248, + "step": 16490 + }, + { + "epoch": 0.8786410351988924, + "grad_norm": 0.05196612700819969, + "learning_rate": 3.278340055996396e-05, + "loss": 0.1245, + "step": 16500 + }, + { + "epoch": 0.8791735449171948, + "grad_norm": 0.039216578006744385, + "learning_rate": 3.275936361205555e-05, + "loss": 0.1248, + "step": 16510 + }, + { + "epoch": 0.8797060546354971, + "grad_norm": 0.055273279547691345, + "learning_rate": 3.2735426644240665e-05, + "loss": 0.1248, + "step": 16520 + }, + { + "epoch": 0.8802385643537994, + "grad_norm": 0.05333053693175316, + "learning_rate": 3.2711589730749266e-05, + "loss": 0.1242, + "step": 16530 + }, + { + "epoch": 0.8807710740721019, + "grad_norm": 0.062082525342702866, + "learning_rate": 3.268785294550098e-05, + "loss": 0.1251, + "step": 16540 + }, + { + "epoch": 0.8813035837904042, + "grad_norm": 0.03454854339361191, + "learning_rate": 3.266421636210497e-05, + "loss": 0.1241, + "step": 16550 + }, + { + "epoch": 0.8818360935087065, + "grad_norm": 0.036358997225761414, + "learning_rate": 3.264068005385965e-05, + "loss": 0.1246, + "step": 16560 + }, + { + "epoch": 0.8823686032270089, + "grad_norm": 0.03956957161426544, + "learning_rate": 3.261724409375252e-05, + "loss": 0.1241, + "step": 16570 + }, + { + "epoch": 0.8829011129453113, + "grad_norm": 0.07209271937608719, + "learning_rate": 3.259390855445982e-05, + "loss": 0.125, + "step": 16580 + }, + { + "epoch": 0.8834336226636136, + "grad_norm": 0.06704261153936386, + "learning_rate": 3.257067350834644e-05, + "loss": 0.1247, + "step": 16590 + }, + { + "epoch": 0.883966132381916, + "grad_norm": 0.06499594449996948, + "learning_rate": 3.25475390274656e-05, + "loss": 0.1253, + "step": 16600 + }, + { + "epoch": 0.8844986421002183, + "grad_norm": 0.03783570975065231, + "learning_rate": 3.2524505183558684e-05, + "loss": 0.1246, + "step": 16610 + }, + { + "epoch": 0.8850311518185207, + "grad_norm": 0.04036329314112663, + "learning_rate": 3.250157204805498e-05, + "loss": 0.1249, + "step": 16620 + }, + { + "epoch": 0.8855636615368231, + "grad_norm": 0.04968998581171036, + "learning_rate": 3.247873969207148e-05, + "loss": 0.125, + "step": 16630 + }, + { + "epoch": 0.8860961712551254, + "grad_norm": 0.045320551842451096, + "learning_rate": 3.245600818641265e-05, + "loss": 0.1244, + "step": 16640 + }, + { + "epoch": 0.8866286809734277, + "grad_norm": 0.06106564775109291, + "learning_rate": 3.243337760157022e-05, + "loss": 0.1247, + "step": 16650 + }, + { + "epoch": 0.8871611906917302, + "grad_norm": 0.04613622650504112, + "learning_rate": 3.241084800772296e-05, + "loss": 0.1245, + "step": 16660 + }, + { + "epoch": 0.8876937004100325, + "grad_norm": 0.05316569283604622, + "learning_rate": 3.238841947473642e-05, + "loss": 0.1236, + "step": 16670 + }, + { + "epoch": 0.8882262101283348, + "grad_norm": 0.0546153299510479, + "learning_rate": 3.236609207216283e-05, + "loss": 0.1245, + "step": 16680 + }, + { + "epoch": 0.8887587198466372, + "grad_norm": 0.06547331809997559, + "learning_rate": 3.2343865869240746e-05, + "loss": 0.1243, + "step": 16690 + }, + { + "epoch": 0.8892912295649396, + "grad_norm": 0.055185478180646896, + "learning_rate": 3.2321740934894925e-05, + "loss": 0.1245, + "step": 16700 + }, + { + "epoch": 0.8898237392832419, + "grad_norm": 0.046210877597332, + "learning_rate": 3.2299717337736076e-05, + "loss": 0.1242, + "step": 16710 + }, + { + "epoch": 0.8903562490015443, + "grad_norm": 0.04753991216421127, + "learning_rate": 3.2277795146060645e-05, + "loss": 0.1246, + "step": 16720 + }, + { + "epoch": 0.8908887587198466, + "grad_norm": 0.05761198326945305, + "learning_rate": 3.2255974427850666e-05, + "loss": 0.124, + "step": 16730 + }, + { + "epoch": 0.891421268438149, + "grad_norm": 0.061156004667282104, + "learning_rate": 3.223425525077342e-05, + "loss": 0.1244, + "step": 16740 + }, + { + "epoch": 0.8919537781564514, + "grad_norm": 0.07980604469776154, + "learning_rate": 3.2212637682181354e-05, + "loss": 0.1244, + "step": 16750 + }, + { + "epoch": 0.8924862878747537, + "grad_norm": 0.04657996818423271, + "learning_rate": 3.219112178911181e-05, + "loss": 0.1248, + "step": 16760 + }, + { + "epoch": 0.893018797593056, + "grad_norm": 0.040127284824848175, + "learning_rate": 3.216970763828683e-05, + "loss": 0.1245, + "step": 16770 + }, + { + "epoch": 0.8935513073113585, + "grad_norm": 0.04287361726164818, + "learning_rate": 3.2148395296112945e-05, + "loss": 0.1248, + "step": 16780 + }, + { + "epoch": 0.8940838170296608, + "grad_norm": 0.0566687285900116, + "learning_rate": 3.212718482868096e-05, + "loss": 0.1241, + "step": 16790 + }, + { + "epoch": 0.8946163267479631, + "grad_norm": 0.036797747015953064, + "learning_rate": 3.210607630176578e-05, + "loss": 0.1252, + "step": 16800 + }, + { + "epoch": 0.8951488364662655, + "grad_norm": 0.049759261310100555, + "learning_rate": 3.208506978082617e-05, + "loss": 0.1245, + "step": 16810 + }, + { + "epoch": 0.8956813461845679, + "grad_norm": 0.061853665858507156, + "learning_rate": 3.2064165331004594e-05, + "loss": 0.1252, + "step": 16820 + }, + { + "epoch": 0.8962138559028703, + "grad_norm": 0.05073931813240051, + "learning_rate": 3.2043363017126956e-05, + "loss": 0.1251, + "step": 16830 + }, + { + "epoch": 0.8967463656211726, + "grad_norm": 0.06072097271680832, + "learning_rate": 3.202266290370245e-05, + "loss": 0.1239, + "step": 16840 + }, + { + "epoch": 0.8972788753394749, + "grad_norm": 0.05999981239438057, + "learning_rate": 3.2002065054923325e-05, + "loss": 0.1246, + "step": 16850 + }, + { + "epoch": 0.8978113850577774, + "grad_norm": 0.048532549291849136, + "learning_rate": 3.198156953466472e-05, + "loss": 0.1242, + "step": 16860 + }, + { + "epoch": 0.8983438947760797, + "grad_norm": 0.05031272768974304, + "learning_rate": 3.196117640648444e-05, + "loss": 0.1247, + "step": 16870 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.05845622345805168, + "learning_rate": 3.1940885733622754e-05, + "loss": 0.1239, + "step": 16880 + }, + { + "epoch": 0.8994089142126844, + "grad_norm": 0.05167698487639427, + "learning_rate": 3.192069757900224e-05, + "loss": 0.1245, + "step": 16890 + }, + { + "epoch": 0.8999414239309868, + "grad_norm": 0.04188617318868637, + "learning_rate": 3.190061200522753e-05, + "loss": 0.1246, + "step": 16900 + }, + { + "epoch": 0.9004739336492891, + "grad_norm": 0.0383358858525753, + "learning_rate": 3.188062907458516e-05, + "loss": 0.124, + "step": 16910 + }, + { + "epoch": 0.9010064433675915, + "grad_norm": 0.0524710975587368, + "learning_rate": 3.186074884904336e-05, + "loss": 0.1244, + "step": 16920 + }, + { + "epoch": 0.9015389530858938, + "grad_norm": 0.034921254962682724, + "learning_rate": 3.184097139025189e-05, + "loss": 0.1246, + "step": 16930 + }, + { + "epoch": 0.9020714628041961, + "grad_norm": 0.052057795226573944, + "learning_rate": 3.1821296759541764e-05, + "loss": 0.124, + "step": 16940 + }, + { + "epoch": 0.9026039725224986, + "grad_norm": 0.06420300155878067, + "learning_rate": 3.1801725017925195e-05, + "loss": 0.124, + "step": 16950 + }, + { + "epoch": 0.9031364822408009, + "grad_norm": 0.03623140975832939, + "learning_rate": 3.178225622609528e-05, + "loss": 0.1235, + "step": 16960 + }, + { + "epoch": 0.9036689919591032, + "grad_norm": 0.04458535462617874, + "learning_rate": 3.1762890444425875e-05, + "loss": 0.1249, + "step": 16970 + }, + { + "epoch": 0.9042015016774057, + "grad_norm": 0.03834957256913185, + "learning_rate": 3.174362773297141e-05, + "loss": 0.1246, + "step": 16980 + }, + { + "epoch": 0.904734011395708, + "grad_norm": 0.04682791605591774, + "learning_rate": 3.1724468151466665e-05, + "loss": 0.1245, + "step": 16990 + }, + { + "epoch": 0.9052665211140103, + "grad_norm": 0.0558556504547596, + "learning_rate": 3.170541175932662e-05, + "loss": 0.1244, + "step": 17000 + }, + { + "epoch": 0.9057990308323127, + "grad_norm": 0.046385906636714935, + "learning_rate": 3.168645861564627e-05, + "loss": 0.1241, + "step": 17010 + }, + { + "epoch": 0.906331540550615, + "grad_norm": 0.04825804755091667, + "learning_rate": 3.166760877920041e-05, + "loss": 0.1246, + "step": 17020 + }, + { + "epoch": 0.9068640502689174, + "grad_norm": 0.0882338434457779, + "learning_rate": 3.164886230844348e-05, + "loss": 0.125, + "step": 17030 + }, + { + "epoch": 0.9073965599872198, + "grad_norm": 0.08609329909086227, + "learning_rate": 3.163021926150939e-05, + "loss": 0.1242, + "step": 17040 + }, + { + "epoch": 0.9079290697055221, + "grad_norm": 0.05161284655332565, + "learning_rate": 3.1611679696211294e-05, + "loss": 0.125, + "step": 17050 + }, + { + "epoch": 0.9084615794238244, + "grad_norm": 0.04110497981309891, + "learning_rate": 3.159324367004148e-05, + "loss": 0.1253, + "step": 17060 + }, + { + "epoch": 0.9089940891421269, + "grad_norm": 0.07606612145900726, + "learning_rate": 3.157491124017115e-05, + "loss": 0.1236, + "step": 17070 + }, + { + "epoch": 0.9095265988604292, + "grad_norm": 0.04594139754772186, + "learning_rate": 3.1556682463450214e-05, + "loss": 0.1234, + "step": 17080 + }, + { + "epoch": 0.9100591085787315, + "grad_norm": 0.039515670388936996, + "learning_rate": 3.15385573964072e-05, + "loss": 0.1243, + "step": 17090 + }, + { + "epoch": 0.910591618297034, + "grad_norm": 0.04435297101736069, + "learning_rate": 3.152053609524897e-05, + "loss": 0.1245, + "step": 17100 + }, + { + "epoch": 0.9111241280153363, + "grad_norm": 0.03617672622203827, + "learning_rate": 3.150261861586065e-05, + "loss": 0.1243, + "step": 17110 + }, + { + "epoch": 0.9116566377336386, + "grad_norm": 0.053447507321834564, + "learning_rate": 3.148480501380538e-05, + "loss": 0.1251, + "step": 17120 + }, + { + "epoch": 0.912189147451941, + "grad_norm": 0.041530635207891464, + "learning_rate": 3.1467095344324174e-05, + "loss": 0.1242, + "step": 17130 + }, + { + "epoch": 0.9127216571702433, + "grad_norm": 0.1386973112821579, + "learning_rate": 3.144948966233577e-05, + "loss": 0.1244, + "step": 17140 + }, + { + "epoch": 0.9132541668885458, + "grad_norm": 0.06289440393447876, + "learning_rate": 3.143198802243638e-05, + "loss": 0.1246, + "step": 17150 + }, + { + "epoch": 0.9137866766068481, + "grad_norm": 0.03927746042609215, + "learning_rate": 3.141459047889964e-05, + "loss": 0.1242, + "step": 17160 + }, + { + "epoch": 0.9143191863251504, + "grad_norm": 0.03453196585178375, + "learning_rate": 3.1397297085676336e-05, + "loss": 0.1243, + "step": 17170 + }, + { + "epoch": 0.9148516960434528, + "grad_norm": 0.04992485046386719, + "learning_rate": 3.138010789639429e-05, + "loss": 0.1242, + "step": 17180 + }, + { + "epoch": 0.9153842057617552, + "grad_norm": 0.04788126423954964, + "learning_rate": 3.136302296435818e-05, + "loss": 0.1246, + "step": 17190 + }, + { + "epoch": 0.9159167154800575, + "grad_norm": 0.05929577723145485, + "learning_rate": 3.1346042342549376e-05, + "loss": 0.1245, + "step": 17200 + }, + { + "epoch": 0.9164492251983599, + "grad_norm": 0.08015090227127075, + "learning_rate": 3.132916608362578e-05, + "loss": 0.1244, + "step": 17210 + }, + { + "epoch": 0.9169817349166622, + "grad_norm": 0.06038287281990051, + "learning_rate": 3.131239423992165e-05, + "loss": 0.1245, + "step": 17220 + }, + { + "epoch": 0.9175142446349646, + "grad_norm": 0.04849204048514366, + "learning_rate": 3.129572686344745e-05, + "loss": 0.1244, + "step": 17230 + }, + { + "epoch": 0.918046754353267, + "grad_norm": 0.0919271856546402, + "learning_rate": 3.1279164005889696e-05, + "loss": 0.1249, + "step": 17240 + }, + { + "epoch": 0.9185792640715693, + "grad_norm": 0.039166927337646484, + "learning_rate": 3.126270571861076e-05, + "loss": 0.1246, + "step": 17250 + }, + { + "epoch": 0.9191117737898716, + "grad_norm": 0.05480289086699486, + "learning_rate": 3.1246352052648764e-05, + "loss": 0.124, + "step": 17260 + }, + { + "epoch": 0.9196442835081741, + "grad_norm": 0.0434199757874012, + "learning_rate": 3.1230103058717373e-05, + "loss": 0.1245, + "step": 17270 + }, + { + "epoch": 0.9201767932264764, + "grad_norm": 0.03532974794507027, + "learning_rate": 3.121395878720567e-05, + "loss": 0.1242, + "step": 17280 + }, + { + "epoch": 0.9207093029447787, + "grad_norm": 0.09806732088327408, + "learning_rate": 3.119791928817798e-05, + "loss": 0.1251, + "step": 17290 + }, + { + "epoch": 0.9212418126630811, + "grad_norm": 0.0625268891453743, + "learning_rate": 3.1181984611373735e-05, + "loss": 0.1239, + "step": 17300 + }, + { + "epoch": 0.9217743223813835, + "grad_norm": 0.04700905457139015, + "learning_rate": 3.116615480620727e-05, + "loss": 0.1247, + "step": 17310 + }, + { + "epoch": 0.9223068320996858, + "grad_norm": 0.0478329174220562, + "learning_rate": 3.1150429921767754e-05, + "loss": 0.1239, + "step": 17320 + }, + { + "epoch": 0.9228393418179882, + "grad_norm": 0.041897084563970566, + "learning_rate": 3.113481000681897e-05, + "loss": 0.1249, + "step": 17330 + }, + { + "epoch": 0.9233718515362905, + "grad_norm": 0.04760069027543068, + "learning_rate": 3.111929510979918e-05, + "loss": 0.124, + "step": 17340 + }, + { + "epoch": 0.9239043612545929, + "grad_norm": 0.06711754202842712, + "learning_rate": 3.110388527882099e-05, + "loss": 0.1239, + "step": 17350 + }, + { + "epoch": 0.9244368709728953, + "grad_norm": 0.034946054220199585, + "learning_rate": 3.108858056167117e-05, + "loss": 0.1246, + "step": 17360 + }, + { + "epoch": 0.9249693806911976, + "grad_norm": 0.03772689029574394, + "learning_rate": 3.107338100581056e-05, + "loss": 0.1245, + "step": 17370 + }, + { + "epoch": 0.9255018904094999, + "grad_norm": 0.04687857627868652, + "learning_rate": 3.105828665837386e-05, + "loss": 0.1249, + "step": 17380 + }, + { + "epoch": 0.9260344001278024, + "grad_norm": 0.04124782606959343, + "learning_rate": 3.104329756616952e-05, + "loss": 0.1243, + "step": 17390 + }, + { + "epoch": 0.9265669098461047, + "grad_norm": 0.052532244473695755, + "learning_rate": 3.1028413775679595e-05, + "loss": 0.1245, + "step": 17400 + }, + { + "epoch": 0.927099419564407, + "grad_norm": 0.0953177809715271, + "learning_rate": 3.101363533305958e-05, + "loss": 0.1241, + "step": 17410 + }, + { + "epoch": 0.9276319292827094, + "grad_norm": 0.05009876564145088, + "learning_rate": 3.099896228413829e-05, + "loss": 0.1238, + "step": 17420 + }, + { + "epoch": 0.9281644390010118, + "grad_norm": 0.08097761869430542, + "learning_rate": 3.098439467441771e-05, + "loss": 0.1249, + "step": 17430 + }, + { + "epoch": 0.9286969487193141, + "grad_norm": 0.047098558396101, + "learning_rate": 3.0969932549072835e-05, + "loss": 0.1233, + "step": 17440 + }, + { + "epoch": 0.9292294584376165, + "grad_norm": 0.047214169055223465, + "learning_rate": 3.0955575952951575e-05, + "loss": 0.1245, + "step": 17450 + }, + { + "epoch": 0.9297619681559188, + "grad_norm": 0.048827920109033585, + "learning_rate": 3.0941324930574554e-05, + "loss": 0.1241, + "step": 17460 + }, + { + "epoch": 0.9302944778742213, + "grad_norm": 0.07658734172582626, + "learning_rate": 3.0927179526135044e-05, + "loss": 0.1237, + "step": 17470 + }, + { + "epoch": 0.9308269875925236, + "grad_norm": 0.034373264759778976, + "learning_rate": 3.091313978349875e-05, + "loss": 0.1252, + "step": 17480 + }, + { + "epoch": 0.9313594973108259, + "grad_norm": 0.05511806905269623, + "learning_rate": 3.089920574620375e-05, + "loss": 0.1248, + "step": 17490 + }, + { + "epoch": 0.9318920070291283, + "grad_norm": 0.04354240372776985, + "learning_rate": 3.0885377457460294e-05, + "loss": 0.125, + "step": 17500 + }, + { + "epoch": 0.9324245167474307, + "grad_norm": 0.0334496833384037, + "learning_rate": 3.0871654960150706e-05, + "loss": 0.1239, + "step": 17510 + }, + { + "epoch": 0.932957026465733, + "grad_norm": 0.043775349855422974, + "learning_rate": 3.085803829682928e-05, + "loss": 0.124, + "step": 17520 + }, + { + "epoch": 0.9334895361840354, + "grad_norm": 0.03564087674021721, + "learning_rate": 3.0844527509722045e-05, + "loss": 0.1238, + "step": 17530 + }, + { + "epoch": 0.9340220459023377, + "grad_norm": 0.03102003037929535, + "learning_rate": 3.083112264072676e-05, + "loss": 0.1248, + "step": 17540 + }, + { + "epoch": 0.93455455562064, + "grad_norm": 0.03893466666340828, + "learning_rate": 3.0817823731412704e-05, + "loss": 0.1242, + "step": 17550 + }, + { + "epoch": 0.9350870653389425, + "grad_norm": 0.05906695872545242, + "learning_rate": 3.0804630823020575e-05, + "loss": 0.1235, + "step": 17560 + }, + { + "epoch": 0.9356195750572448, + "grad_norm": 0.07867705076932907, + "learning_rate": 3.079154395646233e-05, + "loss": 0.1246, + "step": 17570 + }, + { + "epoch": 0.9361520847755471, + "grad_norm": 0.047349728643894196, + "learning_rate": 3.077856317232114e-05, + "loss": 0.1238, + "step": 17580 + }, + { + "epoch": 0.9366845944938496, + "grad_norm": 0.04795532301068306, + "learning_rate": 3.0765688510851144e-05, + "loss": 0.1243, + "step": 17590 + }, + { + "epoch": 0.9372171042121519, + "grad_norm": 0.06184261292219162, + "learning_rate": 3.075292001197743e-05, + "loss": 0.1252, + "step": 17600 + }, + { + "epoch": 0.9377496139304542, + "grad_norm": 0.05014181509613991, + "learning_rate": 3.074025771529585e-05, + "loss": 0.1248, + "step": 17610 + }, + { + "epoch": 0.9382821236487566, + "grad_norm": 0.039317913353443146, + "learning_rate": 3.0727701660072925e-05, + "loss": 0.1239, + "step": 17620 + }, + { + "epoch": 0.938814633367059, + "grad_norm": 0.10161686688661575, + "learning_rate": 3.0715251885245734e-05, + "loss": 0.1232, + "step": 17630 + }, + { + "epoch": 0.9393471430853613, + "grad_norm": 0.0897764042019844, + "learning_rate": 3.070290842942173e-05, + "loss": 0.1244, + "step": 17640 + }, + { + "epoch": 0.9398796528036637, + "grad_norm": 0.056616149842739105, + "learning_rate": 3.0690671330878704e-05, + "loss": 0.1242, + "step": 17650 + }, + { + "epoch": 0.940412162521966, + "grad_norm": 0.04649018496274948, + "learning_rate": 3.0678540627564614e-05, + "loss": 0.1241, + "step": 17660 + }, + { + "epoch": 0.9409446722402683, + "grad_norm": 0.06587915867567062, + "learning_rate": 3.066651635709746e-05, + "loss": 0.1239, + "step": 17670 + }, + { + "epoch": 0.9414771819585708, + "grad_norm": 0.04945458844304085, + "learning_rate": 3.065459855676523e-05, + "loss": 0.124, + "step": 17680 + }, + { + "epoch": 0.9420096916768731, + "grad_norm": 0.0448901429772377, + "learning_rate": 3.06427872635257e-05, + "loss": 0.1234, + "step": 17690 + }, + { + "epoch": 0.9425422013951754, + "grad_norm": 0.04893770441412926, + "learning_rate": 3.063108251400638e-05, + "loss": 0.1249, + "step": 17700 + }, + { + "epoch": 0.9430747111134778, + "grad_norm": 0.050453029572963715, + "learning_rate": 3.06194843445044e-05, + "loss": 0.125, + "step": 17710 + }, + { + "epoch": 0.9436072208317802, + "grad_norm": 0.07537666708230972, + "learning_rate": 3.060799279098633e-05, + "loss": 0.1249, + "step": 17720 + }, + { + "epoch": 0.9441397305500825, + "grad_norm": 0.04989492520689964, + "learning_rate": 3.059660788908817e-05, + "loss": 0.1246, + "step": 17730 + }, + { + "epoch": 0.9446722402683849, + "grad_norm": 0.06518174707889557, + "learning_rate": 3.058532967411516e-05, + "loss": 0.1251, + "step": 17740 + }, + { + "epoch": 0.9452047499866872, + "grad_norm": 0.046202413737773895, + "learning_rate": 3.057415818104169e-05, + "loss": 0.1238, + "step": 17750 + }, + { + "epoch": 0.9457372597049896, + "grad_norm": 0.05071398615837097, + "learning_rate": 3.056309344451123e-05, + "loss": 0.1243, + "step": 17760 + }, + { + "epoch": 0.946269769423292, + "grad_norm": 0.08481655269861221, + "learning_rate": 3.0552135498836165e-05, + "loss": 0.1247, + "step": 17770 + }, + { + "epoch": 0.9468022791415943, + "grad_norm": 0.05741250142455101, + "learning_rate": 3.0541284377997724e-05, + "loss": 0.1251, + "step": 17780 + }, + { + "epoch": 0.9473347888598967, + "grad_norm": 0.060524262487888336, + "learning_rate": 3.053054011564587e-05, + "loss": 0.1239, + "step": 17790 + }, + { + "epoch": 0.9478672985781991, + "grad_norm": 0.056961771100759506, + "learning_rate": 3.051990274509917e-05, + "loss": 0.1249, + "step": 17800 + }, + { + "epoch": 0.9483998082965014, + "grad_norm": 0.053151581436395645, + "learning_rate": 3.050937229934475e-05, + "loss": 0.1247, + "step": 17810 + }, + { + "epoch": 0.9489323180148038, + "grad_norm": 0.04388433322310448, + "learning_rate": 3.049894881103813e-05, + "loss": 0.1241, + "step": 17820 + }, + { + "epoch": 0.9494648277331061, + "grad_norm": 0.05028518661856651, + "learning_rate": 3.0488632312503152e-05, + "loss": 0.1233, + "step": 17830 + }, + { + "epoch": 0.9499973374514085, + "grad_norm": 0.06016720086336136, + "learning_rate": 3.0478422835731874e-05, + "loss": 0.1246, + "step": 17840 + }, + { + "epoch": 0.9505298471697109, + "grad_norm": 0.05909838154911995, + "learning_rate": 3.0468320412384498e-05, + "loss": 0.1246, + "step": 17850 + }, + { + "epoch": 0.9510623568880132, + "grad_norm": 0.06923595815896988, + "learning_rate": 3.0458325073789212e-05, + "loss": 0.1242, + "step": 17860 + }, + { + "epoch": 0.9515948666063155, + "grad_norm": 0.041968394070863724, + "learning_rate": 3.0448436850942146e-05, + "loss": 0.1243, + "step": 17870 + }, + { + "epoch": 0.952127376324618, + "grad_norm": 0.038111716508865356, + "learning_rate": 3.0438655774507256e-05, + "loss": 0.1241, + "step": 17880 + }, + { + "epoch": 0.9526598860429203, + "grad_norm": 0.09092561900615692, + "learning_rate": 3.0428981874816235e-05, + "loss": 0.1237, + "step": 17890 + }, + { + "epoch": 0.9531923957612226, + "grad_norm": 0.031990304589271545, + "learning_rate": 3.0419415181868416e-05, + "loss": 0.1241, + "step": 17900 + }, + { + "epoch": 0.953724905479525, + "grad_norm": 0.05379229038953781, + "learning_rate": 3.0409955725330652e-05, + "loss": 0.1236, + "step": 17910 + }, + { + "epoch": 0.9542574151978274, + "grad_norm": 0.05789874866604805, + "learning_rate": 3.0400603534537282e-05, + "loss": 0.1236, + "step": 17920 + }, + { + "epoch": 0.9547899249161297, + "grad_norm": 0.045958537608385086, + "learning_rate": 3.0391358638489997e-05, + "loss": 0.1241, + "step": 17930 + }, + { + "epoch": 0.9553224346344321, + "grad_norm": 0.06445147842168808, + "learning_rate": 3.0382221065857753e-05, + "loss": 0.1239, + "step": 17940 + }, + { + "epoch": 0.9558549443527344, + "grad_norm": 0.04746498540043831, + "learning_rate": 3.0373190844976695e-05, + "loss": 0.1239, + "step": 17950 + }, + { + "epoch": 0.9563874540710368, + "grad_norm": 0.037877731025218964, + "learning_rate": 3.0364268003850065e-05, + "loss": 0.1242, + "step": 17960 + }, + { + "epoch": 0.9569199637893392, + "grad_norm": 0.07475633174180984, + "learning_rate": 3.0355452570148126e-05, + "loss": 0.1241, + "step": 17970 + }, + { + "epoch": 0.9574524735076415, + "grad_norm": 0.061180293560028076, + "learning_rate": 3.0346744571208034e-05, + "loss": 0.1241, + "step": 17980 + }, + { + "epoch": 0.9579849832259438, + "grad_norm": 0.05577493831515312, + "learning_rate": 3.033814403403381e-05, + "loss": 0.1236, + "step": 17990 + }, + { + "epoch": 0.9585174929442463, + "grad_norm": 0.04320796579122543, + "learning_rate": 3.0329650985296228e-05, + "loss": 0.1236, + "step": 18000 + }, + { + "epoch": 0.9590500026625486, + "grad_norm": 0.07489881664514542, + "learning_rate": 3.032126545133271e-05, + "loss": 0.1249, + "step": 18010 + }, + { + "epoch": 0.9595825123808509, + "grad_norm": 0.050032421946525574, + "learning_rate": 3.0312987458147298e-05, + "loss": 0.1245, + "step": 18020 + }, + { + "epoch": 0.9601150220991533, + "grad_norm": 0.035527851432561874, + "learning_rate": 3.030481703141053e-05, + "loss": 0.1239, + "step": 18030 + }, + { + "epoch": 0.9606475318174557, + "grad_norm": 0.0719723030924797, + "learning_rate": 3.0296754196459377e-05, + "loss": 0.1234, + "step": 18040 + }, + { + "epoch": 0.961180041535758, + "grad_norm": 0.07368568331003189, + "learning_rate": 3.028879897829716e-05, + "loss": 0.1244, + "step": 18050 + }, + { + "epoch": 0.9617125512540604, + "grad_norm": 0.08034256100654602, + "learning_rate": 3.028095140159347e-05, + "loss": 0.1249, + "step": 18060 + }, + { + "epoch": 0.9622450609723627, + "grad_norm": 0.06598822772502899, + "learning_rate": 3.0273211490684106e-05, + "loss": 0.1243, + "step": 18070 + }, + { + "epoch": 0.962777570690665, + "grad_norm": 0.048927973955869675, + "learning_rate": 3.0265579269570976e-05, + "loss": 0.125, + "step": 18080 + }, + { + "epoch": 0.9633100804089675, + "grad_norm": 0.035660270601511, + "learning_rate": 3.025805476192205e-05, + "loss": 0.1239, + "step": 18090 + }, + { + "epoch": 0.9638425901272698, + "grad_norm": 0.055542632937431335, + "learning_rate": 3.025063799107126e-05, + "loss": 0.1237, + "step": 18100 + }, + { + "epoch": 0.9643750998455722, + "grad_norm": 0.06694008409976959, + "learning_rate": 3.0243328980018447e-05, + "loss": 0.1232, + "step": 18110 + }, + { + "epoch": 0.9649076095638746, + "grad_norm": 0.04520373046398163, + "learning_rate": 3.0236127751429284e-05, + "loss": 0.1245, + "step": 18120 + }, + { + "epoch": 0.9654401192821769, + "grad_norm": 0.050599873065948486, + "learning_rate": 3.022903432763519e-05, + "loss": 0.1236, + "step": 18130 + }, + { + "epoch": 0.9659726290004793, + "grad_norm": 0.10253104567527771, + "learning_rate": 3.02220487306333e-05, + "loss": 0.1239, + "step": 18140 + }, + { + "epoch": 0.9665051387187816, + "grad_norm": 0.0527169369161129, + "learning_rate": 3.021517098208635e-05, + "loss": 0.1243, + "step": 18150 + }, + { + "epoch": 0.967037648437084, + "grad_norm": 0.06449782848358154, + "learning_rate": 3.0208401103322637e-05, + "loss": 0.1237, + "step": 18160 + }, + { + "epoch": 0.9675701581553864, + "grad_norm": 0.061875950545072556, + "learning_rate": 3.0201739115335952e-05, + "loss": 0.1238, + "step": 18170 + }, + { + "epoch": 0.9681026678736887, + "grad_norm": 0.037850238382816315, + "learning_rate": 3.0195185038785507e-05, + "loss": 0.1249, + "step": 18180 + }, + { + "epoch": 0.968635177591991, + "grad_norm": 0.04714973270893097, + "learning_rate": 3.0188738893995878e-05, + "loss": 0.1244, + "step": 18190 + }, + { + "epoch": 0.9691676873102935, + "grad_norm": 0.09865976870059967, + "learning_rate": 3.0182400700956943e-05, + "loss": 0.1248, + "step": 18200 + }, + { + "epoch": 0.9697001970285958, + "grad_norm": 0.03763122111558914, + "learning_rate": 3.0176170479323794e-05, + "loss": 0.1242, + "step": 18210 + }, + { + "epoch": 0.9702327067468981, + "grad_norm": 0.038522519171237946, + "learning_rate": 3.017004824841672e-05, + "loss": 0.1245, + "step": 18220 + }, + { + "epoch": 0.9707652164652005, + "grad_norm": 0.06458954513072968, + "learning_rate": 3.0164034027221112e-05, + "loss": 0.1235, + "step": 18230 + }, + { + "epoch": 0.9712977261835029, + "grad_norm": 0.04066864028573036, + "learning_rate": 3.015812783438743e-05, + "loss": 0.1247, + "step": 18240 + }, + { + "epoch": 0.9718302359018052, + "grad_norm": 0.0487934835255146, + "learning_rate": 3.0152329688231107e-05, + "loss": 0.125, + "step": 18250 + }, + { + "epoch": 0.9723627456201076, + "grad_norm": 0.0468660444021225, + "learning_rate": 3.014663960673254e-05, + "loss": 0.1238, + "step": 18260 + }, + { + "epoch": 0.9728952553384099, + "grad_norm": 0.05677594989538193, + "learning_rate": 3.014105760753701e-05, + "loss": 0.1236, + "step": 18270 + }, + { + "epoch": 0.9734277650567122, + "grad_norm": 0.042526934295892715, + "learning_rate": 3.0135583707954613e-05, + "loss": 0.1234, + "step": 18280 + }, + { + "epoch": 0.9739602747750147, + "grad_norm": 0.06067803502082825, + "learning_rate": 3.0130217924960234e-05, + "loss": 0.1248, + "step": 18290 + }, + { + "epoch": 0.974492784493317, + "grad_norm": 0.04792502894997597, + "learning_rate": 3.012496027519348e-05, + "loss": 0.1242, + "step": 18300 + }, + { + "epoch": 0.9750252942116193, + "grad_norm": 0.05716263875365257, + "learning_rate": 3.011981077495863e-05, + "loss": 0.1241, + "step": 18310 + }, + { + "epoch": 0.9755578039299218, + "grad_norm": 0.04997260496020317, + "learning_rate": 3.011476944022458e-05, + "loss": 0.124, + "step": 18320 + }, + { + "epoch": 0.9760903136482241, + "grad_norm": 0.09484563767910004, + "learning_rate": 3.010983628662481e-05, + "loss": 0.1242, + "step": 18330 + }, + { + "epoch": 0.9766228233665264, + "grad_norm": 0.0866529643535614, + "learning_rate": 3.010501132945731e-05, + "loss": 0.1242, + "step": 18340 + }, + { + "epoch": 0.9771553330848288, + "grad_norm": 0.0486617274582386, + "learning_rate": 3.0100294583684557e-05, + "loss": 0.1238, + "step": 18350 + }, + { + "epoch": 0.9776878428031311, + "grad_norm": 0.032813165336847305, + "learning_rate": 3.0095686063933453e-05, + "loss": 0.1243, + "step": 18360 + }, + { + "epoch": 0.9782203525214335, + "grad_norm": 0.07168902456760406, + "learning_rate": 3.009118578449529e-05, + "loss": 0.1249, + "step": 18370 + }, + { + "epoch": 0.9787528622397359, + "grad_norm": 0.06777796894311905, + "learning_rate": 3.0086793759325693e-05, + "loss": 0.1246, + "step": 18380 + }, + { + "epoch": 0.9792853719580382, + "grad_norm": 0.0522090420126915, + "learning_rate": 3.0082510002044588e-05, + "loss": 0.1238, + "step": 18390 + }, + { + "epoch": 0.9798178816763405, + "grad_norm": 0.07006611675024033, + "learning_rate": 3.0078334525936163e-05, + "loss": 0.1243, + "step": 18400 + }, + { + "epoch": 0.980350391394643, + "grad_norm": 0.06946975737810135, + "learning_rate": 3.0074267343948805e-05, + "loss": 0.1244, + "step": 18410 + }, + { + "epoch": 0.9808829011129453, + "grad_norm": 0.057238370180130005, + "learning_rate": 3.0070308468695084e-05, + "loss": 0.1234, + "step": 18420 + }, + { + "epoch": 0.9814154108312477, + "grad_norm": 0.06210003048181534, + "learning_rate": 3.0066457912451707e-05, + "loss": 0.1237, + "step": 18430 + }, + { + "epoch": 0.98194792054955, + "grad_norm": 0.0649491548538208, + "learning_rate": 3.006271568715947e-05, + "loss": 0.1233, + "step": 18440 + }, + { + "epoch": 0.9824804302678524, + "grad_norm": 0.038359202444553375, + "learning_rate": 3.0059081804423232e-05, + "loss": 0.1236, + "step": 18450 + }, + { + "epoch": 0.9830129399861548, + "grad_norm": 0.05497262626886368, + "learning_rate": 3.0055556275511883e-05, + "loss": 0.1239, + "step": 18460 + }, + { + "epoch": 0.9835454497044571, + "grad_norm": 0.038170114159584045, + "learning_rate": 3.005213911135828e-05, + "loss": 0.1242, + "step": 18470 + }, + { + "epoch": 0.9840779594227594, + "grad_norm": 0.04138299450278282, + "learning_rate": 3.004883032255925e-05, + "loss": 0.1246, + "step": 18480 + }, + { + "epoch": 0.9846104691410619, + "grad_norm": 0.0714152529835701, + "learning_rate": 3.004562991937555e-05, + "loss": 0.1239, + "step": 18490 + }, + { + "epoch": 0.9851429788593642, + "grad_norm": 0.08420536667108536, + "learning_rate": 3.0042537911731818e-05, + "loss": 0.1242, + "step": 18500 + }, + { + "epoch": 0.9856754885776665, + "grad_norm": 0.036863774061203, + "learning_rate": 3.0039554309216533e-05, + "loss": 0.1245, + "step": 18510 + }, + { + "epoch": 0.986207998295969, + "grad_norm": 0.059715207666158676, + "learning_rate": 3.003667912108204e-05, + "loss": 0.1249, + "step": 18520 + }, + { + "epoch": 0.9867405080142713, + "grad_norm": 0.039495982229709625, + "learning_rate": 3.0033912356244453e-05, + "loss": 0.1241, + "step": 18530 + }, + { + "epoch": 0.9872730177325736, + "grad_norm": 0.0337030254304409, + "learning_rate": 3.0031254023283678e-05, + "loss": 0.1244, + "step": 18540 + }, + { + "epoch": 0.987805527450876, + "grad_norm": 0.04630092531442642, + "learning_rate": 3.0028704130443352e-05, + "loss": 0.1237, + "step": 18550 + }, + { + "epoch": 0.9883380371691783, + "grad_norm": 0.03899266943335533, + "learning_rate": 3.0026262685630846e-05, + "loss": 0.124, + "step": 18560 + }, + { + "epoch": 0.9888705468874807, + "grad_norm": 0.04212348535656929, + "learning_rate": 3.002392969641723e-05, + "loss": 0.1234, + "step": 18570 + }, + { + "epoch": 0.9894030566057831, + "grad_norm": 0.03455604612827301, + "learning_rate": 3.0021705170037227e-05, + "loss": 0.1241, + "step": 18580 + }, + { + "epoch": 0.9899355663240854, + "grad_norm": 0.08309216052293777, + "learning_rate": 3.0019589113389234e-05, + "loss": 0.1238, + "step": 18590 + }, + { + "epoch": 0.9904680760423877, + "grad_norm": 0.09158316254615784, + "learning_rate": 3.0017581533035255e-05, + "loss": 0.1238, + "step": 18600 + }, + { + "epoch": 0.9910005857606902, + "grad_norm": 0.043327104300260544, + "learning_rate": 3.0015682435200926e-05, + "loss": 0.1249, + "step": 18610 + }, + { + "epoch": 0.9915330954789925, + "grad_norm": 0.05239805579185486, + "learning_rate": 3.001389182577545e-05, + "loss": 0.1243, + "step": 18620 + }, + { + "epoch": 0.9920656051972948, + "grad_norm": 0.07973194122314453, + "learning_rate": 3.0012209710311613e-05, + "loss": 0.1239, + "step": 18630 + }, + { + "epoch": 0.9925981149155972, + "grad_norm": 0.04951346665620804, + "learning_rate": 3.001063609402576e-05, + "loss": 0.1229, + "step": 18640 + }, + { + "epoch": 0.9931306246338996, + "grad_norm": 0.038230083882808685, + "learning_rate": 3.0009170981797758e-05, + "loss": 0.1236, + "step": 18650 + }, + { + "epoch": 0.9936631343522019, + "grad_norm": 0.05738453194499016, + "learning_rate": 3.0007814378171008e-05, + "loss": 0.1242, + "step": 18660 + }, + { + "epoch": 0.9941956440705043, + "grad_norm": 0.060486044734716415, + "learning_rate": 3.0006566287352423e-05, + "loss": 0.1242, + "step": 18670 + }, + { + "epoch": 0.9947281537888066, + "grad_norm": 0.05927567929029465, + "learning_rate": 3.0005426713212397e-05, + "loss": 0.1243, + "step": 18680 + }, + { + "epoch": 0.995260663507109, + "grad_norm": 0.10084035247564316, + "learning_rate": 3.000439565928482e-05, + "loss": 0.1243, + "step": 18690 + }, + { + "epoch": 0.9957931732254114, + "grad_norm": 0.04450371488928795, + "learning_rate": 3.0003473128767058e-05, + "loss": 0.1244, + "step": 18700 + }, + { + "epoch": 0.9963256829437137, + "grad_norm": 0.059074439108371735, + "learning_rate": 3.000265912451991e-05, + "loss": 0.1232, + "step": 18710 + }, + { + "epoch": 0.996858192662016, + "grad_norm": 0.04628562554717064, + "learning_rate": 3.0001953649067676e-05, + "loss": 0.1247, + "step": 18720 + }, + { + "epoch": 0.9973907023803185, + "grad_norm": 0.0613800473511219, + "learning_rate": 3.000135670459806e-05, + "loss": 0.1241, + "step": 18730 + }, + { + "epoch": 0.9979232120986208, + "grad_norm": 0.060382645577192307, + "learning_rate": 3.000086829296223e-05, + "loss": 0.1247, + "step": 18740 + }, + { + "epoch": 0.9984557218169232, + "grad_norm": 0.03309040144085884, + "learning_rate": 3.0000488415674777e-05, + "loss": 0.1242, + "step": 18750 + }, + { + "epoch": 0.9989882315352255, + "grad_norm": 0.047177914530038834, + "learning_rate": 3.0000217073913716e-05, + "loss": 0.124, + "step": 18760 + }, + { + "epoch": 0.9995207412535279, + "grad_norm": 0.03689567372202873, + "learning_rate": 3.00000542685205e-05, + "loss": 0.1245, + "step": 18770 + }, + { + "epoch": 1.0, + "step": 18779, + "total_flos": 5.994863411375112e+18, + "train_loss": 0.031714059101823636, + "train_runtime": 3465.0177, + "train_samples_per_second": 1387.365, + "train_steps_per_second": 5.42 + } + ], + "logging_steps": 10, + "max_steps": 18779, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.994863411375112e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}