{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 19.54397394136808, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03257328990228013, "grad_norm": 0.646706223487854, "learning_rate": 9e-07, "loss": 1.1814, "step": 10 }, { "epoch": 0.06514657980456026, "grad_norm": 0.4603618383407593, "learning_rate": 1.9e-06, "loss": 1.1738, "step": 20 }, { "epoch": 0.09771986970684039, "grad_norm": 0.4963187277317047, "learning_rate": 2.9e-06, "loss": 1.1665, "step": 30 }, { "epoch": 0.13029315960912052, "grad_norm": 0.4428938329219818, "learning_rate": 3.9e-06, "loss": 1.1512, "step": 40 }, { "epoch": 0.16286644951140064, "grad_norm": 0.36607691645622253, "learning_rate": 4.9000000000000005e-06, "loss": 1.1333, "step": 50 }, { "epoch": 0.19543973941368079, "grad_norm": 0.30768465995788574, "learning_rate": 5.9e-06, "loss": 1.1202, "step": 60 }, { "epoch": 0.2280130293159609, "grad_norm": 0.23890309035778046, "learning_rate": 6.900000000000001e-06, "loss": 1.1081, "step": 70 }, { "epoch": 0.26058631921824105, "grad_norm": 0.25988394021987915, "learning_rate": 7.9e-06, "loss": 1.0964, "step": 80 }, { "epoch": 0.2931596091205212, "grad_norm": 0.2948506772518158, "learning_rate": 8.9e-06, "loss": 1.0804, "step": 90 }, { "epoch": 0.3257328990228013, "grad_norm": 0.29228347539901733, "learning_rate": 9.900000000000002e-06, "loss": 1.0646, "step": 100 }, { "epoch": 0.3583061889250814, "grad_norm": 0.24975909292697906, "learning_rate": 1.09e-05, "loss": 1.0531, "step": 110 }, { "epoch": 0.39087947882736157, "grad_norm": 0.30400097370147705, "learning_rate": 1.19e-05, "loss": 1.0387, "step": 120 }, { "epoch": 0.4234527687296417, "grad_norm": 0.2392139881849289, "learning_rate": 1.29e-05, "loss": 1.0307, "step": 130 }, { "epoch": 0.4560260586319218, "grad_norm": 0.2712036371231079, "learning_rate": 1.3900000000000002e-05, "loss": 1.0198, "step": 140 }, { "epoch": 0.48859934853420195, "grad_norm": 0.23245830833911896, "learning_rate": 1.49e-05, "loss": 1.0154, "step": 150 }, { "epoch": 0.5211726384364821, "grad_norm": 0.2727172374725342, "learning_rate": 1.59e-05, "loss": 1.0088, "step": 160 }, { "epoch": 0.5537459283387622, "grad_norm": 0.30211466550827026, "learning_rate": 1.69e-05, "loss": 0.9998, "step": 170 }, { "epoch": 0.5863192182410424, "grad_norm": 0.36023062467575073, "learning_rate": 1.79e-05, "loss": 0.9862, "step": 180 }, { "epoch": 0.6188925081433225, "grad_norm": 0.6321305632591248, "learning_rate": 1.8900000000000002e-05, "loss": 0.9607, "step": 190 }, { "epoch": 0.6514657980456026, "grad_norm": 0.7605375647544861, "learning_rate": 1.9900000000000003e-05, "loss": 0.9141, "step": 200 }, { "epoch": 0.6840390879478827, "grad_norm": 1.6783995628356934, "learning_rate": 2.09e-05, "loss": 0.8218, "step": 210 }, { "epoch": 0.7166123778501629, "grad_norm": 1.3917943239212036, "learning_rate": 2.19e-05, "loss": 0.726, "step": 220 }, { "epoch": 0.749185667752443, "grad_norm": 1.6959741115570068, "learning_rate": 2.29e-05, "loss": 0.6331, "step": 230 }, { "epoch": 0.7817589576547231, "grad_norm": 1.6433072090148926, "learning_rate": 2.39e-05, "loss": 0.5575, "step": 240 }, { "epoch": 0.8143322475570033, "grad_norm": 2.0065226554870605, "learning_rate": 2.4900000000000002e-05, "loss": 0.4904, "step": 250 }, { "epoch": 0.8469055374592834, "grad_norm": 1.7093048095703125, "learning_rate": 2.5900000000000003e-05, "loss": 0.4331, "step": 260 }, { "epoch": 0.8794788273615635, "grad_norm": 2.0112454891204834, "learning_rate": 2.6900000000000003e-05, "loss": 0.385, "step": 270 }, { "epoch": 0.9120521172638436, "grad_norm": 2.178093910217285, "learning_rate": 2.7900000000000004e-05, "loss": 0.3391, "step": 280 }, { "epoch": 0.9446254071661238, "grad_norm": 2.12821888923645, "learning_rate": 2.8899999999999998e-05, "loss": 0.3148, "step": 290 }, { "epoch": 0.9771986970684039, "grad_norm": 2.353641986846924, "learning_rate": 2.9900000000000002e-05, "loss": 0.2889, "step": 300 }, { "epoch": 1.009771986970684, "grad_norm": 2.5022237300872803, "learning_rate": 3.09e-05, "loss": 0.2709, "step": 310 }, { "epoch": 1.0423452768729642, "grad_norm": 2.0082218647003174, "learning_rate": 3.19e-05, "loss": 0.2614, "step": 320 }, { "epoch": 1.0749185667752443, "grad_norm": 2.1790828704833984, "learning_rate": 3.29e-05, "loss": 0.2539, "step": 330 }, { "epoch": 1.1074918566775245, "grad_norm": 2.4001519680023193, "learning_rate": 3.3900000000000004e-05, "loss": 0.2386, "step": 340 }, { "epoch": 1.1400651465798046, "grad_norm": 2.5760984420776367, "learning_rate": 3.49e-05, "loss": 0.2327, "step": 350 }, { "epoch": 1.1726384364820848, "grad_norm": 2.5301952362060547, "learning_rate": 3.59e-05, "loss": 0.2292, "step": 360 }, { "epoch": 1.205211726384365, "grad_norm": 2.474808692932129, "learning_rate": 3.69e-05, "loss": 0.2293, "step": 370 }, { "epoch": 1.237785016286645, "grad_norm": 2.524728298187256, "learning_rate": 3.79e-05, "loss": 0.2253, "step": 380 }, { "epoch": 1.2703583061889252, "grad_norm": 2.349524974822998, "learning_rate": 3.8900000000000004e-05, "loss": 0.2259, "step": 390 }, { "epoch": 1.3029315960912053, "grad_norm": 1.8449956178665161, "learning_rate": 3.99e-05, "loss": 0.2201, "step": 400 }, { "epoch": 1.3355048859934853, "grad_norm": 2.82806134223938, "learning_rate": 4.09e-05, "loss": 0.2162, "step": 410 }, { "epoch": 1.3680781758957654, "grad_norm": 2.695847988128662, "learning_rate": 4.19e-05, "loss": 0.2181, "step": 420 }, { "epoch": 1.4006514657980456, "grad_norm": 2.953904628753662, "learning_rate": 4.29e-05, "loss": 0.2181, "step": 430 }, { "epoch": 1.4332247557003257, "grad_norm": 2.7019174098968506, "learning_rate": 4.39e-05, "loss": 0.2166, "step": 440 }, { "epoch": 1.4657980456026058, "grad_norm": 2.1432275772094727, "learning_rate": 4.49e-05, "loss": 0.2164, "step": 450 }, { "epoch": 1.498371335504886, "grad_norm": 3.0038726329803467, "learning_rate": 4.5900000000000004e-05, "loss": 0.2136, "step": 460 }, { "epoch": 1.5309446254071661, "grad_norm": 3.1258187294006348, "learning_rate": 4.69e-05, "loss": 0.2132, "step": 470 }, { "epoch": 1.5635179153094463, "grad_norm": 2.207747459411621, "learning_rate": 4.79e-05, "loss": 0.2112, "step": 480 }, { "epoch": 1.5960912052117264, "grad_norm": 2.581730604171753, "learning_rate": 4.89e-05, "loss": 0.2099, "step": 490 }, { "epoch": 1.6286644951140063, "grad_norm": 3.099320411682129, "learning_rate": 4.99e-05, "loss": 0.2097, "step": 500 }, { "epoch": 1.6612377850162865, "grad_norm": 2.6373846530914307, "learning_rate": 5.0900000000000004e-05, "loss": 0.2079, "step": 510 }, { "epoch": 1.6938110749185666, "grad_norm": 2.9202771186828613, "learning_rate": 5.19e-05, "loss": 0.2043, "step": 520 }, { "epoch": 1.7263843648208468, "grad_norm": 2.989102363586426, "learning_rate": 5.2900000000000005e-05, "loss": 0.2033, "step": 530 }, { "epoch": 1.758957654723127, "grad_norm": 2.375894784927368, "learning_rate": 5.390000000000001e-05, "loss": 0.2059, "step": 540 }, { "epoch": 1.791530944625407, "grad_norm": 2.6863505840301514, "learning_rate": 5.4900000000000006e-05, "loss": 0.2032, "step": 550 }, { "epoch": 1.8241042345276872, "grad_norm": 2.326268434524536, "learning_rate": 5.590000000000001e-05, "loss": 0.2012, "step": 560 }, { "epoch": 1.8566775244299674, "grad_norm": 2.779634952545166, "learning_rate": 5.69e-05, "loss": 0.2027, "step": 570 }, { "epoch": 1.8892508143322475, "grad_norm": 2.6916491985321045, "learning_rate": 5.79e-05, "loss": 0.1983, "step": 580 }, { "epoch": 1.9218241042345277, "grad_norm": 2.6536519527435303, "learning_rate": 5.89e-05, "loss": 0.2007, "step": 590 }, { "epoch": 1.9543973941368078, "grad_norm": 2.6414382457733154, "learning_rate": 5.99e-05, "loss": 0.1943, "step": 600 }, { "epoch": 1.986970684039088, "grad_norm": 3.05230712890625, "learning_rate": 6.09e-05, "loss": 0.1898, "step": 610 }, { "epoch": 2.019543973941368, "grad_norm": 3.180234909057617, "learning_rate": 6.19e-05, "loss": 0.1873, "step": 620 }, { "epoch": 2.0521172638436482, "grad_norm": 2.8422200679779053, "learning_rate": 6.29e-05, "loss": 0.1811, "step": 630 }, { "epoch": 2.0846905537459284, "grad_norm": 3.6337952613830566, "learning_rate": 6.390000000000001e-05, "loss": 0.182, "step": 640 }, { "epoch": 2.1172638436482085, "grad_norm": 5.311783313751221, "learning_rate": 6.49e-05, "loss": 0.1789, "step": 650 }, { "epoch": 2.1498371335504887, "grad_norm": 2.879094123840332, "learning_rate": 6.59e-05, "loss": 0.1757, "step": 660 }, { "epoch": 2.182410423452769, "grad_norm": 2.7306885719299316, "learning_rate": 6.690000000000001e-05, "loss": 0.1756, "step": 670 }, { "epoch": 2.214983713355049, "grad_norm": 2.2274434566497803, "learning_rate": 6.790000000000001e-05, "loss": 0.1696, "step": 680 }, { "epoch": 2.247557003257329, "grad_norm": 2.76662540435791, "learning_rate": 6.89e-05, "loss": 0.1633, "step": 690 }, { "epoch": 2.2801302931596092, "grad_norm": 2.6583666801452637, "learning_rate": 6.99e-05, "loss": 0.1603, "step": 700 }, { "epoch": 2.3127035830618894, "grad_norm": 2.7752110958099365, "learning_rate": 7.09e-05, "loss": 0.1555, "step": 710 }, { "epoch": 2.3452768729641695, "grad_norm": 2.919973611831665, "learning_rate": 7.19e-05, "loss": 0.1509, "step": 720 }, { "epoch": 2.3778501628664497, "grad_norm": 2.369131565093994, "learning_rate": 7.29e-05, "loss": 0.1453, "step": 730 }, { "epoch": 2.41042345276873, "grad_norm": 6.763181686401367, "learning_rate": 7.390000000000001e-05, "loss": 0.1469, "step": 740 }, { "epoch": 2.44299674267101, "grad_norm": 5.001865386962891, "learning_rate": 7.49e-05, "loss": 0.1568, "step": 750 }, { "epoch": 2.47557003257329, "grad_norm": 3.4782662391662598, "learning_rate": 7.59e-05, "loss": 0.1372, "step": 760 }, { "epoch": 2.5081433224755703, "grad_norm": 5.221865177154541, "learning_rate": 7.69e-05, "loss": 0.1373, "step": 770 }, { "epoch": 2.5407166123778504, "grad_norm": 3.3979134559631348, "learning_rate": 7.790000000000001e-05, "loss": 0.131, "step": 780 }, { "epoch": 2.5732899022801305, "grad_norm": 3.932870626449585, "learning_rate": 7.890000000000001e-05, "loss": 0.1176, "step": 790 }, { "epoch": 2.6058631921824107, "grad_norm": 4.280229091644287, "learning_rate": 7.99e-05, "loss": 0.1133, "step": 800 }, { "epoch": 2.6384364820846904, "grad_norm": 2.9509708881378174, "learning_rate": 8.090000000000001e-05, "loss": 0.1046, "step": 810 }, { "epoch": 2.6710097719869705, "grad_norm": 2.813812255859375, "learning_rate": 8.19e-05, "loss": 0.0959, "step": 820 }, { "epoch": 2.7035830618892507, "grad_norm": 3.498904228210449, "learning_rate": 8.29e-05, "loss": 0.0842, "step": 830 }, { "epoch": 2.736156351791531, "grad_norm": 2.793267250061035, "learning_rate": 8.39e-05, "loss": 0.0821, "step": 840 }, { "epoch": 2.768729641693811, "grad_norm": 3.1398348808288574, "learning_rate": 8.49e-05, "loss": 0.0783, "step": 850 }, { "epoch": 2.801302931596091, "grad_norm": 3.613935947418213, "learning_rate": 8.59e-05, "loss": 0.0679, "step": 860 }, { "epoch": 2.8338762214983713, "grad_norm": 4.147997856140137, "learning_rate": 8.69e-05, "loss": 0.0601, "step": 870 }, { "epoch": 2.8664495114006514, "grad_norm": 4.906704425811768, "learning_rate": 8.790000000000001e-05, "loss": 0.0589, "step": 880 }, { "epoch": 2.8990228013029316, "grad_norm": 4.920569896697998, "learning_rate": 8.89e-05, "loss": 0.0602, "step": 890 }, { "epoch": 2.9315960912052117, "grad_norm": 3.0054733753204346, "learning_rate": 8.99e-05, "loss": 0.0567, "step": 900 }, { "epoch": 2.964169381107492, "grad_norm": 3.0714895725250244, "learning_rate": 9.090000000000001e-05, "loss": 0.0478, "step": 910 }, { "epoch": 2.996742671009772, "grad_norm": 2.0885047912597656, "learning_rate": 9.190000000000001e-05, "loss": 0.0417, "step": 920 }, { "epoch": 3.029315960912052, "grad_norm": 3.2938196659088135, "learning_rate": 9.290000000000001e-05, "loss": 0.0504, "step": 930 }, { "epoch": 3.0618892508143323, "grad_norm": 3.084390640258789, "learning_rate": 9.39e-05, "loss": 0.0482, "step": 940 }, { "epoch": 3.0944625407166124, "grad_norm": 3.5595154762268066, "learning_rate": 9.49e-05, "loss": 0.0464, "step": 950 }, { "epoch": 3.1270358306188926, "grad_norm": 3.5651934146881104, "learning_rate": 9.59e-05, "loss": 0.0461, "step": 960 }, { "epoch": 3.1596091205211727, "grad_norm": 2.7966909408569336, "learning_rate": 9.69e-05, "loss": 0.0455, "step": 970 }, { "epoch": 3.192182410423453, "grad_norm": 2.4500396251678467, "learning_rate": 9.790000000000001e-05, "loss": 0.0395, "step": 980 }, { "epoch": 3.224755700325733, "grad_norm": 3.2736151218414307, "learning_rate": 9.89e-05, "loss": 0.0403, "step": 990 }, { "epoch": 3.257328990228013, "grad_norm": 2.5784664154052734, "learning_rate": 9.99e-05, "loss": 0.0382, "step": 1000 }, { "epoch": 3.2899022801302933, "grad_norm": 2.018731117248535, "learning_rate": 9.999994463727085e-05, "loss": 0.0376, "step": 1010 }, { "epoch": 3.3224755700325734, "grad_norm": 2.5587472915649414, "learning_rate": 9.999975326009292e-05, "loss": 0.0385, "step": 1020 }, { "epoch": 3.3550488599348536, "grad_norm": 2.2966997623443604, "learning_rate": 9.999942518549879e-05, "loss": 0.0364, "step": 1030 }, { "epoch": 3.3876221498371337, "grad_norm": 2.336106061935425, "learning_rate": 9.999896041438544e-05, "loss": 0.036, "step": 1040 }, { "epoch": 3.420195439739414, "grad_norm": 3.2040939331054688, "learning_rate": 9.999835894802353e-05, "loss": 0.0393, "step": 1050 }, { "epoch": 3.4527687296416936, "grad_norm": 2.786553382873535, "learning_rate": 9.999762078805743e-05, "loss": 0.0345, "step": 1060 }, { "epoch": 3.4853420195439737, "grad_norm": 1.6308751106262207, "learning_rate": 9.999674593650526e-05, "loss": 0.037, "step": 1070 }, { "epoch": 3.517915309446254, "grad_norm": 1.8265235424041748, "learning_rate": 9.99957343957588e-05, "loss": 0.0344, "step": 1080 }, { "epoch": 3.550488599348534, "grad_norm": 2.4011759757995605, "learning_rate": 9.99945861685836e-05, "loss": 0.0304, "step": 1090 }, { "epoch": 3.583061889250814, "grad_norm": 2.0200209617614746, "learning_rate": 9.999330125811884e-05, "loss": 0.0328, "step": 1100 }, { "epoch": 3.6156351791530943, "grad_norm": 2.044693946838379, "learning_rate": 9.999187966787744e-05, "loss": 0.0355, "step": 1110 }, { "epoch": 3.6482084690553744, "grad_norm": 2.1637792587280273, "learning_rate": 9.999032140174595e-05, "loss": 0.0342, "step": 1120 }, { "epoch": 3.6807817589576546, "grad_norm": 1.9781519174575806, "learning_rate": 9.998862646398464e-05, "loss": 0.0349, "step": 1130 }, { "epoch": 3.7133550488599347, "grad_norm": 2.601254940032959, "learning_rate": 9.998679485922739e-05, "loss": 0.0311, "step": 1140 }, { "epoch": 3.745928338762215, "grad_norm": 1.977401614189148, "learning_rate": 9.998482659248174e-05, "loss": 0.0317, "step": 1150 }, { "epoch": 3.778501628664495, "grad_norm": 1.9435837268829346, "learning_rate": 9.998272166912883e-05, "loss": 0.0325, "step": 1160 }, { "epoch": 3.811074918566775, "grad_norm": 2.054821014404297, "learning_rate": 9.998048009492347e-05, "loss": 0.0352, "step": 1170 }, { "epoch": 3.8436482084690553, "grad_norm": 1.950892448425293, "learning_rate": 9.997810187599403e-05, "loss": 0.0325, "step": 1180 }, { "epoch": 3.8762214983713354, "grad_norm": 2.4616150856018066, "learning_rate": 9.997558701884249e-05, "loss": 0.0316, "step": 1190 }, { "epoch": 3.9087947882736156, "grad_norm": 1.8379050493240356, "learning_rate": 9.997293553034433e-05, "loss": 0.0307, "step": 1200 }, { "epoch": 3.9413680781758957, "grad_norm": 1.9157487154006958, "learning_rate": 9.997014741774866e-05, "loss": 0.0319, "step": 1210 }, { "epoch": 3.973941368078176, "grad_norm": 1.8840038776397705, "learning_rate": 9.996722268867803e-05, "loss": 0.0315, "step": 1220 }, { "epoch": 4.006514657980456, "grad_norm": 1.825744867324829, "learning_rate": 9.996416135112858e-05, "loss": 0.0297, "step": 1230 }, { "epoch": 4.039087947882736, "grad_norm": 1.5622966289520264, "learning_rate": 9.996096341346988e-05, "loss": 0.0291, "step": 1240 }, { "epoch": 4.071661237785016, "grad_norm": 1.652907371520996, "learning_rate": 9.995762888444495e-05, "loss": 0.0298, "step": 1250 }, { "epoch": 4.1042345276872965, "grad_norm": 2.1362810134887695, "learning_rate": 9.995415777317027e-05, "loss": 0.0304, "step": 1260 }, { "epoch": 4.136807817589577, "grad_norm": 1.7935142517089844, "learning_rate": 9.995055008913574e-05, "loss": 0.028, "step": 1270 }, { "epoch": 4.169381107491857, "grad_norm": 2.094823122024536, "learning_rate": 9.994680584220463e-05, "loss": 0.0294, "step": 1280 }, { "epoch": 4.201954397394137, "grad_norm": 1.5280042886734009, "learning_rate": 9.994292504261355e-05, "loss": 0.0289, "step": 1290 }, { "epoch": 4.234527687296417, "grad_norm": 1.7017203569412231, "learning_rate": 9.993890770097247e-05, "loss": 0.0267, "step": 1300 }, { "epoch": 4.267100977198697, "grad_norm": 1.8569612503051758, "learning_rate": 9.993475382826467e-05, "loss": 0.0282, "step": 1310 }, { "epoch": 4.299674267100977, "grad_norm": 1.792932152748108, "learning_rate": 9.993046343584664e-05, "loss": 0.0278, "step": 1320 }, { "epoch": 4.3322475570032575, "grad_norm": 1.6156980991363525, "learning_rate": 9.992603653544816e-05, "loss": 0.0276, "step": 1330 }, { "epoch": 4.364820846905538, "grad_norm": 1.6737850904464722, "learning_rate": 9.992147313917222e-05, "loss": 0.0276, "step": 1340 }, { "epoch": 4.397394136807818, "grad_norm": 1.6165825128555298, "learning_rate": 9.991677325949497e-05, "loss": 0.0277, "step": 1350 }, { "epoch": 4.429967426710098, "grad_norm": 1.5214238166809082, "learning_rate": 9.991193690926568e-05, "loss": 0.0258, "step": 1360 }, { "epoch": 4.462540716612378, "grad_norm": 1.648809790611267, "learning_rate": 9.990696410170678e-05, "loss": 0.0274, "step": 1370 }, { "epoch": 4.495114006514658, "grad_norm": 1.5280410051345825, "learning_rate": 9.990185485041371e-05, "loss": 0.0247, "step": 1380 }, { "epoch": 4.527687296416938, "grad_norm": 1.4952828884124756, "learning_rate": 9.989660916935498e-05, "loss": 0.0251, "step": 1390 }, { "epoch": 4.5602605863192185, "grad_norm": 1.3932770490646362, "learning_rate": 9.989122707287208e-05, "loss": 0.0256, "step": 1400 }, { "epoch": 4.592833876221499, "grad_norm": 1.6718034744262695, "learning_rate": 9.988570857567945e-05, "loss": 0.0272, "step": 1410 }, { "epoch": 4.625407166123779, "grad_norm": 1.5743407011032104, "learning_rate": 9.988005369286446e-05, "loss": 0.0261, "step": 1420 }, { "epoch": 4.657980456026059, "grad_norm": 1.6346157789230347, "learning_rate": 9.987426243988734e-05, "loss": 0.0276, "step": 1430 }, { "epoch": 4.690553745928339, "grad_norm": 1.754012942314148, "learning_rate": 9.986833483258114e-05, "loss": 0.026, "step": 1440 }, { "epoch": 4.723127035830619, "grad_norm": 1.5971155166625977, "learning_rate": 9.986227088715173e-05, "loss": 0.0258, "step": 1450 }, { "epoch": 4.755700325732899, "grad_norm": 1.5895884037017822, "learning_rate": 9.98560706201777e-05, "loss": 0.0255, "step": 1460 }, { "epoch": 4.7882736156351795, "grad_norm": 1.799141764640808, "learning_rate": 9.984973404861036e-05, "loss": 0.028, "step": 1470 }, { "epoch": 4.82084690553746, "grad_norm": 1.5767462253570557, "learning_rate": 9.984326118977361e-05, "loss": 0.0277, "step": 1480 }, { "epoch": 4.85342019543974, "grad_norm": 1.547109842300415, "learning_rate": 9.983665206136406e-05, "loss": 0.0239, "step": 1490 }, { "epoch": 4.88599348534202, "grad_norm": 1.6011266708374023, "learning_rate": 9.982990668145075e-05, "loss": 0.023, "step": 1500 }, { "epoch": 4.918566775244299, "grad_norm": 1.5277559757232666, "learning_rate": 9.982302506847534e-05, "loss": 0.0232, "step": 1510 }, { "epoch": 4.95114006514658, "grad_norm": 1.3489938974380493, "learning_rate": 9.981600724125189e-05, "loss": 0.0231, "step": 1520 }, { "epoch": 4.9837133550488595, "grad_norm": 1.2097440958023071, "learning_rate": 9.980885321896685e-05, "loss": 0.0226, "step": 1530 }, { "epoch": 5.01628664495114, "grad_norm": 1.5271075963974, "learning_rate": 9.980156302117905e-05, "loss": 0.0217, "step": 1540 }, { "epoch": 5.04885993485342, "grad_norm": 1.4403456449508667, "learning_rate": 9.979413666781963e-05, "loss": 0.0259, "step": 1550 }, { "epoch": 5.0814332247557, "grad_norm": 1.3812097311019897, "learning_rate": 9.978657417919193e-05, "loss": 0.023, "step": 1560 }, { "epoch": 5.11400651465798, "grad_norm": 1.5333428382873535, "learning_rate": 9.977887557597153e-05, "loss": 0.0242, "step": 1570 }, { "epoch": 5.14657980456026, "grad_norm": 1.1747639179229736, "learning_rate": 9.97710408792061e-05, "loss": 0.0232, "step": 1580 }, { "epoch": 5.17915309446254, "grad_norm": 1.4856892824172974, "learning_rate": 9.976307011031542e-05, "loss": 0.0207, "step": 1590 }, { "epoch": 5.2117263843648205, "grad_norm": 1.1282317638397217, "learning_rate": 9.975496329109126e-05, "loss": 0.0233, "step": 1600 }, { "epoch": 5.244299674267101, "grad_norm": 1.4065935611724854, "learning_rate": 9.974672044369732e-05, "loss": 0.0218, "step": 1610 }, { "epoch": 5.276872964169381, "grad_norm": 1.1849043369293213, "learning_rate": 9.97383415906693e-05, "loss": 0.0221, "step": 1620 }, { "epoch": 5.309446254071661, "grad_norm": 1.325810194015503, "learning_rate": 9.97298267549146e-05, "loss": 0.0213, "step": 1630 }, { "epoch": 5.342019543973941, "grad_norm": 1.113312840461731, "learning_rate": 9.972117595971249e-05, "loss": 0.0228, "step": 1640 }, { "epoch": 5.374592833876221, "grad_norm": 1.230494499206543, "learning_rate": 9.971238922871391e-05, "loss": 0.0253, "step": 1650 }, { "epoch": 5.407166123778501, "grad_norm": 1.4597903490066528, "learning_rate": 9.970346658594142e-05, "loss": 0.0224, "step": 1660 }, { "epoch": 5.4397394136807815, "grad_norm": 1.2914201021194458, "learning_rate": 9.969440805578923e-05, "loss": 0.0224, "step": 1670 }, { "epoch": 5.472312703583062, "grad_norm": 1.2426233291625977, "learning_rate": 9.968521366302298e-05, "loss": 0.0233, "step": 1680 }, { "epoch": 5.504885993485342, "grad_norm": 1.2101874351501465, "learning_rate": 9.967588343277981e-05, "loss": 0.0233, "step": 1690 }, { "epoch": 5.537459283387622, "grad_norm": 1.576130747795105, "learning_rate": 9.966641739056818e-05, "loss": 0.0204, "step": 1700 }, { "epoch": 5.570032573289902, "grad_norm": 1.4965754747390747, "learning_rate": 9.965681556226793e-05, "loss": 0.021, "step": 1710 }, { "epoch": 5.602605863192182, "grad_norm": 1.2106349468231201, "learning_rate": 9.964707797413006e-05, "loss": 0.0217, "step": 1720 }, { "epoch": 5.635179153094462, "grad_norm": 1.2536280155181885, "learning_rate": 9.963720465277679e-05, "loss": 0.0229, "step": 1730 }, { "epoch": 5.6677524429967425, "grad_norm": 1.2137837409973145, "learning_rate": 9.96271956252014e-05, "loss": 0.0227, "step": 1740 }, { "epoch": 5.700325732899023, "grad_norm": 1.4352983236312866, "learning_rate": 9.961705091876816e-05, "loss": 0.0217, "step": 1750 }, { "epoch": 5.732899022801303, "grad_norm": 1.0887129306793213, "learning_rate": 9.960677056121235e-05, "loss": 0.02, "step": 1760 }, { "epoch": 5.765472312703583, "grad_norm": 1.2834426164627075, "learning_rate": 9.959635458064005e-05, "loss": 0.0232, "step": 1770 }, { "epoch": 5.798045602605863, "grad_norm": 1.3449937105178833, "learning_rate": 9.958580300552815e-05, "loss": 0.0205, "step": 1780 }, { "epoch": 5.830618892508143, "grad_norm": 1.375016689300537, "learning_rate": 9.957511586472426e-05, "loss": 0.0231, "step": 1790 }, { "epoch": 5.863192182410423, "grad_norm": 1.2283987998962402, "learning_rate": 9.956429318744662e-05, "loss": 0.0229, "step": 1800 }, { "epoch": 5.8957654723127035, "grad_norm": 1.226679801940918, "learning_rate": 9.955333500328404e-05, "loss": 0.0201, "step": 1810 }, { "epoch": 5.928338762214984, "grad_norm": 1.6469484567642212, "learning_rate": 9.95422413421957e-05, "loss": 0.0211, "step": 1820 }, { "epoch": 5.960912052117264, "grad_norm": 1.1678240299224854, "learning_rate": 9.953101223451133e-05, "loss": 0.0214, "step": 1830 }, { "epoch": 5.993485342019544, "grad_norm": 1.2131609916687012, "learning_rate": 9.951964771093085e-05, "loss": 0.0202, "step": 1840 }, { "epoch": 6.026058631921824, "grad_norm": 1.3158583641052246, "learning_rate": 9.950814780252442e-05, "loss": 0.0204, "step": 1850 }, { "epoch": 6.058631921824104, "grad_norm": 1.2186533212661743, "learning_rate": 9.949651254073236e-05, "loss": 0.0192, "step": 1860 }, { "epoch": 6.091205211726384, "grad_norm": 1.2744988203048706, "learning_rate": 9.948474195736504e-05, "loss": 0.0209, "step": 1870 }, { "epoch": 6.1237785016286646, "grad_norm": 1.1233289241790771, "learning_rate": 9.947283608460277e-05, "loss": 0.02, "step": 1880 }, { "epoch": 6.156351791530945, "grad_norm": 1.092411994934082, "learning_rate": 9.946079495499577e-05, "loss": 0.0202, "step": 1890 }, { "epoch": 6.188925081433225, "grad_norm": 1.3971725702285767, "learning_rate": 9.944861860146401e-05, "loss": 0.021, "step": 1900 }, { "epoch": 6.221498371335505, "grad_norm": 1.7205326557159424, "learning_rate": 9.943630705729719e-05, "loss": 0.0205, "step": 1910 }, { "epoch": 6.254071661237785, "grad_norm": 1.2655634880065918, "learning_rate": 9.942386035615459e-05, "loss": 0.0222, "step": 1920 }, { "epoch": 6.286644951140065, "grad_norm": 0.9892485737800598, "learning_rate": 9.941127853206503e-05, "loss": 0.0191, "step": 1930 }, { "epoch": 6.319218241042345, "grad_norm": 1.2320622205734253, "learning_rate": 9.939856161942673e-05, "loss": 0.0196, "step": 1940 }, { "epoch": 6.351791530944626, "grad_norm": 1.315510630607605, "learning_rate": 9.938570965300724e-05, "loss": 0.0184, "step": 1950 }, { "epoch": 6.384364820846906, "grad_norm": 0.9320159554481506, "learning_rate": 9.937272266794335e-05, "loss": 0.0231, "step": 1960 }, { "epoch": 6.416938110749186, "grad_norm": 0.9621773958206177, "learning_rate": 9.935960069974096e-05, "loss": 0.0178, "step": 1970 }, { "epoch": 6.449511400651466, "grad_norm": 1.2875672578811646, "learning_rate": 9.934634378427506e-05, "loss": 0.0204, "step": 1980 }, { "epoch": 6.482084690553746, "grad_norm": 1.1186437606811523, "learning_rate": 9.933295195778954e-05, "loss": 0.0211, "step": 1990 }, { "epoch": 6.514657980456026, "grad_norm": 1.0017237663269043, "learning_rate": 9.931942525689715e-05, "loss": 0.023, "step": 2000 }, { "epoch": 6.547231270358306, "grad_norm": 0.9961879253387451, "learning_rate": 9.930576371857936e-05, "loss": 0.0205, "step": 2010 }, { "epoch": 6.579804560260587, "grad_norm": 1.1414068937301636, "learning_rate": 9.929196738018629e-05, "loss": 0.0182, "step": 2020 }, { "epoch": 6.612377850162867, "grad_norm": 1.139420509338379, "learning_rate": 9.927803627943662e-05, "loss": 0.0175, "step": 2030 }, { "epoch": 6.644951140065147, "grad_norm": 1.1123589277267456, "learning_rate": 9.926397045441744e-05, "loss": 0.0195, "step": 2040 }, { "epoch": 6.677524429967427, "grad_norm": 1.2396354675292969, "learning_rate": 9.924976994358417e-05, "loss": 0.0199, "step": 2050 }, { "epoch": 6.710097719869707, "grad_norm": 1.2193408012390137, "learning_rate": 9.923543478576048e-05, "loss": 0.0197, "step": 2060 }, { "epoch": 6.742671009771987, "grad_norm": 1.1543775796890259, "learning_rate": 9.922096502013813e-05, "loss": 0.0192, "step": 2070 }, { "epoch": 6.7752442996742674, "grad_norm": 1.1310466527938843, "learning_rate": 9.92063606862769e-05, "loss": 0.0196, "step": 2080 }, { "epoch": 6.807817589576548, "grad_norm": 1.2145164012908936, "learning_rate": 9.919162182410453e-05, "loss": 0.0196, "step": 2090 }, { "epoch": 6.840390879478828, "grad_norm": 1.0499675273895264, "learning_rate": 9.917674847391645e-05, "loss": 0.0179, "step": 2100 }, { "epoch": 6.872964169381108, "grad_norm": 0.8699261546134949, "learning_rate": 9.916174067637584e-05, "loss": 0.0168, "step": 2110 }, { "epoch": 6.905537459283387, "grad_norm": 0.8845049142837524, "learning_rate": 9.914659847251348e-05, "loss": 0.0163, "step": 2120 }, { "epoch": 6.938110749185668, "grad_norm": 0.8768344521522522, "learning_rate": 9.913132190372753e-05, "loss": 0.0189, "step": 2130 }, { "epoch": 6.970684039087947, "grad_norm": 1.0072346925735474, "learning_rate": 9.911591101178359e-05, "loss": 0.017, "step": 2140 }, { "epoch": 7.003257328990228, "grad_norm": 0.9805561304092407, "learning_rate": 9.910036583881443e-05, "loss": 0.0168, "step": 2150 }, { "epoch": 7.035830618892508, "grad_norm": 0.9882418513298035, "learning_rate": 9.908468642731995e-05, "loss": 0.019, "step": 2160 }, { "epoch": 7.068403908794788, "grad_norm": 1.0311851501464844, "learning_rate": 9.906887282016707e-05, "loss": 0.018, "step": 2170 }, { "epoch": 7.100977198697068, "grad_norm": 1.3635246753692627, "learning_rate": 9.90529250605896e-05, "loss": 0.019, "step": 2180 }, { "epoch": 7.133550488599348, "grad_norm": 0.94489586353302, "learning_rate": 9.903684319218809e-05, "loss": 0.0186, "step": 2190 }, { "epoch": 7.166123778501628, "grad_norm": 1.1365646123886108, "learning_rate": 9.902062725892976e-05, "loss": 0.0195, "step": 2200 }, { "epoch": 7.198697068403908, "grad_norm": 1.1778250932693481, "learning_rate": 9.900427730514834e-05, "loss": 0.0196, "step": 2210 }, { "epoch": 7.231270358306189, "grad_norm": 1.0461801290512085, "learning_rate": 9.8987793375544e-05, "loss": 0.0183, "step": 2220 }, { "epoch": 7.263843648208469, "grad_norm": 1.068851351737976, "learning_rate": 9.897117551518318e-05, "loss": 0.0172, "step": 2230 }, { "epoch": 7.296416938110749, "grad_norm": 0.906645655632019, "learning_rate": 9.895442376949844e-05, "loss": 0.016, "step": 2240 }, { "epoch": 7.328990228013029, "grad_norm": 0.9077069759368896, "learning_rate": 9.893753818428845e-05, "loss": 0.0178, "step": 2250 }, { "epoch": 7.361563517915309, "grad_norm": 0.8832686543464661, "learning_rate": 9.892051880571773e-05, "loss": 0.017, "step": 2260 }, { "epoch": 7.394136807817589, "grad_norm": 0.9024226665496826, "learning_rate": 9.890336568031663e-05, "loss": 0.0154, "step": 2270 }, { "epoch": 7.4267100977198695, "grad_norm": 0.8265984058380127, "learning_rate": 9.888607885498113e-05, "loss": 0.0166, "step": 2280 }, { "epoch": 7.45928338762215, "grad_norm": 0.9777111411094666, "learning_rate": 9.886865837697275e-05, "loss": 0.0172, "step": 2290 }, { "epoch": 7.49185667752443, "grad_norm": 0.9763800501823425, "learning_rate": 9.88511042939184e-05, "loss": 0.0188, "step": 2300 }, { "epoch": 7.52442996742671, "grad_norm": 1.0408334732055664, "learning_rate": 9.883341665381028e-05, "loss": 0.0187, "step": 2310 }, { "epoch": 7.55700325732899, "grad_norm": 0.6989148259162903, "learning_rate": 9.881559550500575e-05, "loss": 0.0174, "step": 2320 }, { "epoch": 7.58957654723127, "grad_norm": 1.2441219091415405, "learning_rate": 9.879764089622712e-05, "loss": 0.0175, "step": 2330 }, { "epoch": 7.62214983713355, "grad_norm": 0.9775832891464233, "learning_rate": 9.87795528765616e-05, "loss": 0.0174, "step": 2340 }, { "epoch": 7.6547231270358305, "grad_norm": 1.1409343481063843, "learning_rate": 9.876133149546118e-05, "loss": 0.0198, "step": 2350 }, { "epoch": 7.687296416938111, "grad_norm": 0.9530286192893982, "learning_rate": 9.874297680274238e-05, "loss": 0.0174, "step": 2360 }, { "epoch": 7.719869706840391, "grad_norm": 0.9352869391441345, "learning_rate": 9.872448884858624e-05, "loss": 0.0175, "step": 2370 }, { "epoch": 7.752442996742671, "grad_norm": 0.9207677245140076, "learning_rate": 9.870586768353815e-05, "loss": 0.0183, "step": 2380 }, { "epoch": 7.785016286644951, "grad_norm": 0.8910165429115295, "learning_rate": 9.868711335850764e-05, "loss": 0.0189, "step": 2390 }, { "epoch": 7.817589576547231, "grad_norm": 1.199159026145935, "learning_rate": 9.866822592476833e-05, "loss": 0.0181, "step": 2400 }, { "epoch": 7.850162866449511, "grad_norm": 1.0165252685546875, "learning_rate": 9.86492054339577e-05, "loss": 0.0193, "step": 2410 }, { "epoch": 7.8827361563517915, "grad_norm": 0.8919486999511719, "learning_rate": 9.863005193807711e-05, "loss": 0.0173, "step": 2420 }, { "epoch": 7.915309446254072, "grad_norm": 0.7932745218276978, "learning_rate": 9.861076548949143e-05, "loss": 0.017, "step": 2430 }, { "epoch": 7.947882736156352, "grad_norm": 0.8434972763061523, "learning_rate": 9.859134614092912e-05, "loss": 0.0177, "step": 2440 }, { "epoch": 7.980456026058632, "grad_norm": 1.005811333656311, "learning_rate": 9.857179394548191e-05, "loss": 0.0156, "step": 2450 }, { "epoch": 8.013029315960912, "grad_norm": 0.9450655579566956, "learning_rate": 9.855210895660477e-05, "loss": 0.0179, "step": 2460 }, { "epoch": 8.045602605863191, "grad_norm": 1.0200849771499634, "learning_rate": 9.853229122811568e-05, "loss": 0.0182, "step": 2470 }, { "epoch": 8.078175895765472, "grad_norm": 0.7630968689918518, "learning_rate": 9.851234081419559e-05, "loss": 0.0175, "step": 2480 }, { "epoch": 8.110749185667752, "grad_norm": 0.7456977963447571, "learning_rate": 9.849225776938814e-05, "loss": 0.0182, "step": 2490 }, { "epoch": 8.143322475570033, "grad_norm": 0.7413007020950317, "learning_rate": 9.847204214859964e-05, "loss": 0.0169, "step": 2500 }, { "epoch": 8.175895765472312, "grad_norm": 0.704645037651062, "learning_rate": 9.845169400709879e-05, "loss": 0.0169, "step": 2510 }, { "epoch": 8.208469055374593, "grad_norm": 0.9419644474983215, "learning_rate": 9.843121340051664e-05, "loss": 0.0164, "step": 2520 }, { "epoch": 8.241042345276872, "grad_norm": 1.1566686630249023, "learning_rate": 9.841060038484641e-05, "loss": 0.0159, "step": 2530 }, { "epoch": 8.273615635179153, "grad_norm": 0.862409770488739, "learning_rate": 9.838985501644328e-05, "loss": 0.016, "step": 2540 }, { "epoch": 8.306188925081432, "grad_norm": 0.8754869103431702, "learning_rate": 9.83689773520243e-05, "loss": 0.016, "step": 2550 }, { "epoch": 8.338762214983714, "grad_norm": 1.02450692653656, "learning_rate": 9.834796744866819e-05, "loss": 0.0164, "step": 2560 }, { "epoch": 8.371335504885993, "grad_norm": 0.7608270645141602, "learning_rate": 9.832682536381525e-05, "loss": 0.0155, "step": 2570 }, { "epoch": 8.403908794788274, "grad_norm": 0.7060204744338989, "learning_rate": 9.830555115526711e-05, "loss": 0.0164, "step": 2580 }, { "epoch": 8.436482084690553, "grad_norm": 0.6277144551277161, "learning_rate": 9.828414488118667e-05, "loss": 0.0153, "step": 2590 }, { "epoch": 8.469055374592834, "grad_norm": 0.7759043574333191, "learning_rate": 9.826260660009785e-05, "loss": 0.0147, "step": 2600 }, { "epoch": 8.501628664495113, "grad_norm": 0.8001388311386108, "learning_rate": 9.824093637088547e-05, "loss": 0.0151, "step": 2610 }, { "epoch": 8.534201954397394, "grad_norm": 0.7903552055358887, "learning_rate": 9.821913425279514e-05, "loss": 0.016, "step": 2620 }, { "epoch": 8.566775244299674, "grad_norm": 0.840638279914856, "learning_rate": 9.8197200305433e-05, "loss": 0.0155, "step": 2630 }, { "epoch": 8.599348534201955, "grad_norm": 0.9356780648231506, "learning_rate": 9.817513458876564e-05, "loss": 0.0176, "step": 2640 }, { "epoch": 8.631921824104234, "grad_norm": 0.8237717151641846, "learning_rate": 9.815293716311987e-05, "loss": 0.0157, "step": 2650 }, { "epoch": 8.664495114006515, "grad_norm": 1.1324107646942139, "learning_rate": 9.813060808918262e-05, "loss": 0.0156, "step": 2660 }, { "epoch": 8.697068403908794, "grad_norm": 0.7818335294723511, "learning_rate": 9.810814742800069e-05, "loss": 0.0156, "step": 2670 }, { "epoch": 8.729641693811075, "grad_norm": 0.8096736669540405, "learning_rate": 9.808555524098074e-05, "loss": 0.0148, "step": 2680 }, { "epoch": 8.762214983713354, "grad_norm": 1.0488592386245728, "learning_rate": 9.806283158988887e-05, "loss": 0.0151, "step": 2690 }, { "epoch": 8.794788273615636, "grad_norm": 0.8972393870353699, "learning_rate": 9.803997653685072e-05, "loss": 0.0153, "step": 2700 }, { "epoch": 8.827361563517915, "grad_norm": 0.8795937299728394, "learning_rate": 9.801699014435112e-05, "loss": 0.0159, "step": 2710 }, { "epoch": 8.859934853420196, "grad_norm": 0.7514339685440063, "learning_rate": 9.799387247523398e-05, "loss": 0.0153, "step": 2720 }, { "epoch": 8.892508143322475, "grad_norm": 0.8946635127067566, "learning_rate": 9.797062359270215e-05, "loss": 0.0136, "step": 2730 }, { "epoch": 8.925081433224756, "grad_norm": 0.8136707544326782, "learning_rate": 9.794724356031715e-05, "loss": 0.0143, "step": 2740 }, { "epoch": 8.957654723127035, "grad_norm": 0.6647421717643738, "learning_rate": 9.792373244199913e-05, "loss": 0.0142, "step": 2750 }, { "epoch": 8.990228013029316, "grad_norm": 0.8160433769226074, "learning_rate": 9.790009030202658e-05, "loss": 0.0158, "step": 2760 }, { "epoch": 9.022801302931596, "grad_norm": 0.6846116185188293, "learning_rate": 9.78763172050362e-05, "loss": 0.0133, "step": 2770 }, { "epoch": 9.055374592833877, "grad_norm": 0.806671142578125, "learning_rate": 9.785241321602274e-05, "loss": 0.0163, "step": 2780 }, { "epoch": 9.087947882736156, "grad_norm": 0.7517706751823425, "learning_rate": 9.782837840033879e-05, "loss": 0.0163, "step": 2790 }, { "epoch": 9.120521172638437, "grad_norm": 0.866486668586731, "learning_rate": 9.780421282369461e-05, "loss": 0.017, "step": 2800 }, { "epoch": 9.153094462540716, "grad_norm": 0.7923924326896667, "learning_rate": 9.777991655215797e-05, "loss": 0.0187, "step": 2810 }, { "epoch": 9.185667752442997, "grad_norm": 0.8920378684997559, "learning_rate": 9.775548965215394e-05, "loss": 0.0161, "step": 2820 }, { "epoch": 9.218241042345277, "grad_norm": 0.7856971621513367, "learning_rate": 9.773093219046474e-05, "loss": 0.0154, "step": 2830 }, { "epoch": 9.250814332247558, "grad_norm": 1.1112109422683716, "learning_rate": 9.770624423422954e-05, "loss": 0.0164, "step": 2840 }, { "epoch": 9.283387622149837, "grad_norm": 0.8900606632232666, "learning_rate": 9.768142585094426e-05, "loss": 0.0148, "step": 2850 }, { "epoch": 9.315960912052118, "grad_norm": 0.808012068271637, "learning_rate": 9.765647710846142e-05, "loss": 0.015, "step": 2860 }, { "epoch": 9.348534201954397, "grad_norm": 0.7280744910240173, "learning_rate": 9.763139807498991e-05, "loss": 0.0168, "step": 2870 }, { "epoch": 9.381107491856678, "grad_norm": 0.7034526467323303, "learning_rate": 9.760618881909487e-05, "loss": 0.0142, "step": 2880 }, { "epoch": 9.413680781758957, "grad_norm": 0.6753115653991699, "learning_rate": 9.758084940969744e-05, "loss": 0.0147, "step": 2890 }, { "epoch": 9.446254071661238, "grad_norm": 0.7895611524581909, "learning_rate": 9.755537991607459e-05, "loss": 0.0136, "step": 2900 }, { "epoch": 9.478827361563518, "grad_norm": 0.7646326422691345, "learning_rate": 9.752978040785895e-05, "loss": 0.0153, "step": 2910 }, { "epoch": 9.511400651465799, "grad_norm": 0.784275233745575, "learning_rate": 9.750405095503859e-05, "loss": 0.0136, "step": 2920 }, { "epoch": 9.543973941368078, "grad_norm": 0.6947634816169739, "learning_rate": 9.747819162795686e-05, "loss": 0.0156, "step": 2930 }, { "epoch": 9.576547231270359, "grad_norm": 0.7826408743858337, "learning_rate": 9.745220249731217e-05, "loss": 0.0134, "step": 2940 }, { "epoch": 9.609120521172638, "grad_norm": 0.8155382871627808, "learning_rate": 9.742608363415781e-05, "loss": 0.0131, "step": 2950 }, { "epoch": 9.64169381107492, "grad_norm": 0.7903912663459778, "learning_rate": 9.739983510990176e-05, "loss": 0.0151, "step": 2960 }, { "epoch": 9.674267100977199, "grad_norm": 0.6831985712051392, "learning_rate": 9.737345699630647e-05, "loss": 0.0142, "step": 2970 }, { "epoch": 9.70684039087948, "grad_norm": 0.7938959002494812, "learning_rate": 9.734694936548869e-05, "loss": 0.0157, "step": 2980 }, { "epoch": 9.739413680781759, "grad_norm": 0.8478028774261475, "learning_rate": 9.732031228991932e-05, "loss": 0.0135, "step": 2990 }, { "epoch": 9.77198697068404, "grad_norm": 0.8881486058235168, "learning_rate": 9.729354584242302e-05, "loss": 0.0145, "step": 3000 }, { "epoch": 9.80456026058632, "grad_norm": 0.8107954859733582, "learning_rate": 9.726665009617832e-05, "loss": 0.0147, "step": 3010 }, { "epoch": 9.8371335504886, "grad_norm": 0.593552827835083, "learning_rate": 9.723962512471714e-05, "loss": 0.0151, "step": 3020 }, { "epoch": 9.86970684039088, "grad_norm": 0.7691318392753601, "learning_rate": 9.72124710019247e-05, "loss": 0.0133, "step": 3030 }, { "epoch": 9.90228013029316, "grad_norm": 0.8209340572357178, "learning_rate": 9.718518780203934e-05, "loss": 0.0143, "step": 3040 }, { "epoch": 9.93485342019544, "grad_norm": 0.786946713924408, "learning_rate": 9.715777559965228e-05, "loss": 0.016, "step": 3050 }, { "epoch": 9.967426710097719, "grad_norm": 0.833220899105072, "learning_rate": 9.713023446970746e-05, "loss": 0.0142, "step": 3060 }, { "epoch": 10.0, "grad_norm": 0.8491772413253784, "learning_rate": 9.710256448750126e-05, "loss": 0.0155, "step": 3070 }, { "epoch": 10.03257328990228, "grad_norm": 0.7305663228034973, "learning_rate": 9.707476572868235e-05, "loss": 0.0135, "step": 3080 }, { "epoch": 10.06514657980456, "grad_norm": 0.7415571808815002, "learning_rate": 9.704683826925149e-05, "loss": 0.0127, "step": 3090 }, { "epoch": 10.09771986970684, "grad_norm": 0.723970890045166, "learning_rate": 9.701878218556129e-05, "loss": 0.0139, "step": 3100 }, { "epoch": 10.13029315960912, "grad_norm": 0.7780591249465942, "learning_rate": 9.699059755431598e-05, "loss": 0.015, "step": 3110 }, { "epoch": 10.1628664495114, "grad_norm": 0.8553524017333984, "learning_rate": 9.696228445257132e-05, "loss": 0.0144, "step": 3120 }, { "epoch": 10.19543973941368, "grad_norm": 0.6987228393554688, "learning_rate": 9.693384295773419e-05, "loss": 0.0143, "step": 3130 }, { "epoch": 10.22801302931596, "grad_norm": 0.6953451633453369, "learning_rate": 9.690527314756259e-05, "loss": 0.0126, "step": 3140 }, { "epoch": 10.260586319218241, "grad_norm": 0.7900610566139221, "learning_rate": 9.687657510016527e-05, "loss": 0.0142, "step": 3150 }, { "epoch": 10.29315960912052, "grad_norm": 0.7502635717391968, "learning_rate": 9.684774889400161e-05, "loss": 0.0137, "step": 3160 }, { "epoch": 10.325732899022801, "grad_norm": 0.8514730334281921, "learning_rate": 9.681879460788135e-05, "loss": 0.0145, "step": 3170 }, { "epoch": 10.35830618892508, "grad_norm": 0.6699235439300537, "learning_rate": 9.67897123209644e-05, "loss": 0.016, "step": 3180 }, { "epoch": 10.390879478827362, "grad_norm": 0.9717312455177307, "learning_rate": 9.676050211276062e-05, "loss": 0.0139, "step": 3190 }, { "epoch": 10.423452768729641, "grad_norm": 0.8204063177108765, "learning_rate": 9.673116406312962e-05, "loss": 0.0149, "step": 3200 }, { "epoch": 10.456026058631922, "grad_norm": 0.6988447308540344, "learning_rate": 9.67016982522805e-05, "loss": 0.0169, "step": 3210 }, { "epoch": 10.488599348534201, "grad_norm": 0.7764069437980652, "learning_rate": 9.667210476077164e-05, "loss": 0.0134, "step": 3220 }, { "epoch": 10.521172638436482, "grad_norm": 0.8181342482566833, "learning_rate": 9.664238366951055e-05, "loss": 0.014, "step": 3230 }, { "epoch": 10.553745928338762, "grad_norm": 0.8816471099853516, "learning_rate": 9.661253505975355e-05, "loss": 0.0132, "step": 3240 }, { "epoch": 10.586319218241043, "grad_norm": 0.6358229517936707, "learning_rate": 9.658255901310557e-05, "loss": 0.0141, "step": 3250 }, { "epoch": 10.618892508143322, "grad_norm": 0.8587698340415955, "learning_rate": 9.655245561152e-05, "loss": 0.0147, "step": 3260 }, { "epoch": 10.651465798045603, "grad_norm": 0.7497543692588806, "learning_rate": 9.65222249372984e-05, "loss": 0.0138, "step": 3270 }, { "epoch": 10.684039087947882, "grad_norm": 0.7951953411102295, "learning_rate": 9.649186707309026e-05, "loss": 0.0124, "step": 3280 }, { "epoch": 10.716612377850163, "grad_norm": 0.7938198447227478, "learning_rate": 9.646138210189283e-05, "loss": 0.0144, "step": 3290 }, { "epoch": 10.749185667752442, "grad_norm": 0.7425335049629211, "learning_rate": 9.643077010705087e-05, "loss": 0.0124, "step": 3300 }, { "epoch": 10.781758957654723, "grad_norm": 0.6020338535308838, "learning_rate": 9.640003117225637e-05, "loss": 0.014, "step": 3310 }, { "epoch": 10.814332247557003, "grad_norm": 0.7467803359031677, "learning_rate": 9.636916538154846e-05, "loss": 0.016, "step": 3320 }, { "epoch": 10.846905537459284, "grad_norm": 0.7082899212837219, "learning_rate": 9.633817281931296e-05, "loss": 0.0168, "step": 3330 }, { "epoch": 10.879478827361563, "grad_norm": 0.7912280559539795, "learning_rate": 9.630705357028242e-05, "loss": 0.0136, "step": 3340 }, { "epoch": 10.912052117263844, "grad_norm": 0.6572038531303406, "learning_rate": 9.627580771953563e-05, "loss": 0.0129, "step": 3350 }, { "epoch": 10.944625407166123, "grad_norm": 0.8878430724143982, "learning_rate": 9.624443535249759e-05, "loss": 0.0134, "step": 3360 }, { "epoch": 10.977198697068404, "grad_norm": 0.6283133625984192, "learning_rate": 9.621293655493913e-05, "loss": 0.0159, "step": 3370 }, { "epoch": 11.009771986970684, "grad_norm": 0.7855213284492493, "learning_rate": 9.618131141297675e-05, "loss": 0.0151, "step": 3380 }, { "epoch": 11.042345276872965, "grad_norm": 0.6657193303108215, "learning_rate": 9.614956001307242e-05, "loss": 0.0143, "step": 3390 }, { "epoch": 11.074918566775244, "grad_norm": 0.99737948179245, "learning_rate": 9.611768244203321e-05, "loss": 0.0149, "step": 3400 }, { "epoch": 11.107491856677525, "grad_norm": 0.5985683798789978, "learning_rate": 9.60856787870112e-05, "loss": 0.0142, "step": 3410 }, { "epoch": 11.140065146579804, "grad_norm": 0.7139508724212646, "learning_rate": 9.605354913550318e-05, "loss": 0.0149, "step": 3420 }, { "epoch": 11.172638436482085, "grad_norm": 0.6530821919441223, "learning_rate": 9.602129357535037e-05, "loss": 0.0135, "step": 3430 }, { "epoch": 11.205211726384364, "grad_norm": 0.6087831854820251, "learning_rate": 9.598891219473825e-05, "loss": 0.0132, "step": 3440 }, { "epoch": 11.237785016286646, "grad_norm": 0.631524384021759, "learning_rate": 9.595640508219625e-05, "loss": 0.0136, "step": 3450 }, { "epoch": 11.270358306188925, "grad_norm": 0.7701088786125183, "learning_rate": 9.592377232659761e-05, "loss": 0.0146, "step": 3460 }, { "epoch": 11.302931596091206, "grad_norm": 0.6873641610145569, "learning_rate": 9.589101401715904e-05, "loss": 0.0139, "step": 3470 }, { "epoch": 11.335504885993485, "grad_norm": 0.7974376678466797, "learning_rate": 9.585813024344045e-05, "loss": 0.0128, "step": 3480 }, { "epoch": 11.368078175895766, "grad_norm": 0.7017616033554077, "learning_rate": 9.58251210953449e-05, "loss": 0.0138, "step": 3490 }, { "epoch": 11.400651465798045, "grad_norm": 0.7259907722473145, "learning_rate": 9.579198666311809e-05, "loss": 0.013, "step": 3500 }, { "epoch": 11.433224755700326, "grad_norm": 0.6088874340057373, "learning_rate": 9.575872703734832e-05, "loss": 0.0126, "step": 3510 }, { "epoch": 11.465798045602606, "grad_norm": 0.5169392824172974, "learning_rate": 9.572534230896611e-05, "loss": 0.0116, "step": 3520 }, { "epoch": 11.498371335504887, "grad_norm": 0.8139016628265381, "learning_rate": 9.569183256924403e-05, "loss": 0.0126, "step": 3530 }, { "epoch": 11.530944625407166, "grad_norm": 0.7374736070632935, "learning_rate": 9.565819790979646e-05, "loss": 0.0131, "step": 3540 }, { "epoch": 11.563517915309447, "grad_norm": 0.6468128561973572, "learning_rate": 9.562443842257925e-05, "loss": 0.0129, "step": 3550 }, { "epoch": 11.596091205211726, "grad_norm": 0.6643866896629333, "learning_rate": 9.559055419988956e-05, "loss": 0.0133, "step": 3560 }, { "epoch": 11.628664495114007, "grad_norm": 0.681177020072937, "learning_rate": 9.555654533436557e-05, "loss": 0.0135, "step": 3570 }, { "epoch": 11.661237785016286, "grad_norm": 0.7795754075050354, "learning_rate": 9.552241191898621e-05, "loss": 0.0114, "step": 3580 }, { "epoch": 11.693811074918568, "grad_norm": 0.6136661171913147, "learning_rate": 9.548815404707092e-05, "loss": 0.0139, "step": 3590 }, { "epoch": 11.726384364820847, "grad_norm": 0.6191911697387695, "learning_rate": 9.545377181227942e-05, "loss": 0.0147, "step": 3600 }, { "epoch": 11.758957654723128, "grad_norm": 0.67829829454422, "learning_rate": 9.541926530861145e-05, "loss": 0.013, "step": 3610 }, { "epoch": 11.791530944625407, "grad_norm": 0.7790681719779968, "learning_rate": 9.538463463040645e-05, "loss": 0.0131, "step": 3620 }, { "epoch": 11.824104234527688, "grad_norm": 0.694274365901947, "learning_rate": 9.534987987234337e-05, "loss": 0.0138, "step": 3630 }, { "epoch": 11.856677524429967, "grad_norm": 0.5702154636383057, "learning_rate": 9.53150011294404e-05, "loss": 0.0121, "step": 3640 }, { "epoch": 11.889250814332247, "grad_norm": 0.5818066000938416, "learning_rate": 9.527999849705471e-05, "loss": 0.0132, "step": 3650 }, { "epoch": 11.921824104234528, "grad_norm": 0.6952799558639526, "learning_rate": 9.524487207088213e-05, "loss": 0.0117, "step": 3660 }, { "epoch": 11.954397394136809, "grad_norm": 0.5978224277496338, "learning_rate": 9.520962194695698e-05, "loss": 0.0137, "step": 3670 }, { "epoch": 11.986970684039088, "grad_norm": 0.6350916028022766, "learning_rate": 9.517424822165175e-05, "loss": 0.0116, "step": 3680 }, { "epoch": 12.019543973941367, "grad_norm": 0.651490330696106, "learning_rate": 9.513875099167685e-05, "loss": 0.0164, "step": 3690 }, { "epoch": 12.052117263843648, "grad_norm": 0.7070964574813843, "learning_rate": 9.510313035408035e-05, "loss": 0.0142, "step": 3700 }, { "epoch": 12.084690553745927, "grad_norm": 0.8875077962875366, "learning_rate": 9.506738640624775e-05, "loss": 0.0121, "step": 3710 }, { "epoch": 12.117263843648209, "grad_norm": 0.8056143522262573, "learning_rate": 9.50315192459016e-05, "loss": 0.013, "step": 3720 }, { "epoch": 12.149837133550488, "grad_norm": 0.7540460228919983, "learning_rate": 9.499552897110136e-05, "loss": 0.0126, "step": 3730 }, { "epoch": 12.182410423452769, "grad_norm": 0.7277035713195801, "learning_rate": 9.495941568024304e-05, "loss": 0.014, "step": 3740 }, { "epoch": 12.214983713355048, "grad_norm": 0.7212410569190979, "learning_rate": 9.492317947205904e-05, "loss": 0.0116, "step": 3750 }, { "epoch": 12.247557003257329, "grad_norm": 0.6804754137992859, "learning_rate": 9.488682044561775e-05, "loss": 0.0124, "step": 3760 }, { "epoch": 12.280130293159608, "grad_norm": 0.7375156879425049, "learning_rate": 9.485033870032335e-05, "loss": 0.0132, "step": 3770 }, { "epoch": 12.31270358306189, "grad_norm": 0.8549667000770569, "learning_rate": 9.481373433591556e-05, "loss": 0.0123, "step": 3780 }, { "epoch": 12.345276872964169, "grad_norm": 0.6877515912055969, "learning_rate": 9.47770074524693e-05, "loss": 0.0149, "step": 3790 }, { "epoch": 12.37785016286645, "grad_norm": 0.7423529624938965, "learning_rate": 9.474015815039446e-05, "loss": 0.0115, "step": 3800 }, { "epoch": 12.410423452768729, "grad_norm": 0.7478250861167908, "learning_rate": 9.470318653043565e-05, "loss": 0.0125, "step": 3810 }, { "epoch": 12.44299674267101, "grad_norm": 0.6396955847740173, "learning_rate": 9.466609269367185e-05, "loss": 0.0128, "step": 3820 }, { "epoch": 12.47557003257329, "grad_norm": 0.6483830809593201, "learning_rate": 9.46288767415162e-05, "loss": 0.0137, "step": 3830 }, { "epoch": 12.50814332247557, "grad_norm": 0.7733943462371826, "learning_rate": 9.459153877571567e-05, "loss": 0.015, "step": 3840 }, { "epoch": 12.54071661237785, "grad_norm": 0.6195006370544434, "learning_rate": 9.455407889835087e-05, "loss": 0.012, "step": 3850 }, { "epoch": 12.57328990228013, "grad_norm": 0.6243380308151245, "learning_rate": 9.451649721183564e-05, "loss": 0.0149, "step": 3860 }, { "epoch": 12.60586319218241, "grad_norm": 0.6876091361045837, "learning_rate": 9.447879381891692e-05, "loss": 0.0118, "step": 3870 }, { "epoch": 12.63843648208469, "grad_norm": 0.7486156225204468, "learning_rate": 9.444096882267428e-05, "loss": 0.0125, "step": 3880 }, { "epoch": 12.67100977198697, "grad_norm": 0.6939566731452942, "learning_rate": 9.440302232651988e-05, "loss": 0.0136, "step": 3890 }, { "epoch": 12.703583061889251, "grad_norm": 0.6740759015083313, "learning_rate": 9.436495443419795e-05, "loss": 0.0121, "step": 3900 }, { "epoch": 12.73615635179153, "grad_norm": 0.7866451144218445, "learning_rate": 9.432676524978466e-05, "loss": 0.0121, "step": 3910 }, { "epoch": 12.768729641693811, "grad_norm": 0.5861152410507202, "learning_rate": 9.42884548776878e-05, "loss": 0.0111, "step": 3920 }, { "epoch": 12.80130293159609, "grad_norm": 0.5740252137184143, "learning_rate": 9.425002342264646e-05, "loss": 0.0119, "step": 3930 }, { "epoch": 12.833876221498372, "grad_norm": 0.6962918043136597, "learning_rate": 9.421147098973077e-05, "loss": 0.0118, "step": 3940 }, { "epoch": 12.866449511400651, "grad_norm": 0.4894341826438904, "learning_rate": 9.41727976843416e-05, "loss": 0.0123, "step": 3950 }, { "epoch": 12.899022801302932, "grad_norm": 0.6954609155654907, "learning_rate": 9.413400361221029e-05, "loss": 0.0131, "step": 3960 }, { "epoch": 12.931596091205211, "grad_norm": 0.576310932636261, "learning_rate": 9.409508887939835e-05, "loss": 0.0121, "step": 3970 }, { "epoch": 12.964169381107492, "grad_norm": 0.4614481031894684, "learning_rate": 9.40560535922972e-05, "loss": 0.0132, "step": 3980 }, { "epoch": 12.996742671009772, "grad_norm": 0.47341176867485046, "learning_rate": 9.40168978576278e-05, "loss": 0.0137, "step": 3990 }, { "epoch": 13.029315960912053, "grad_norm": 0.5697060227394104, "learning_rate": 9.397762178244043e-05, "loss": 0.0133, "step": 4000 }, { "epoch": 13.061889250814332, "grad_norm": 0.612964928150177, "learning_rate": 9.393822547411439e-05, "loss": 0.0109, "step": 4010 }, { "epoch": 13.094462540716613, "grad_norm": 0.6282061338424683, "learning_rate": 9.389870904035769e-05, "loss": 0.0116, "step": 4020 }, { "epoch": 13.127035830618892, "grad_norm": 0.5586996078491211, "learning_rate": 9.385907258920672e-05, "loss": 0.0123, "step": 4030 }, { "epoch": 13.159609120521173, "grad_norm": 0.5667091012001038, "learning_rate": 9.381931622902607e-05, "loss": 0.0119, "step": 4040 }, { "epoch": 13.192182410423452, "grad_norm": 0.6335421800613403, "learning_rate": 9.377944006850807e-05, "loss": 0.0133, "step": 4050 }, { "epoch": 13.224755700325733, "grad_norm": 0.8205068111419678, "learning_rate": 9.373944421667265e-05, "loss": 0.0122, "step": 4060 }, { "epoch": 13.257328990228013, "grad_norm": 0.676856279373169, "learning_rate": 9.369932878286691e-05, "loss": 0.0126, "step": 4070 }, { "epoch": 13.289902280130294, "grad_norm": 0.6334593296051025, "learning_rate": 9.365909387676494e-05, "loss": 0.0126, "step": 4080 }, { "epoch": 13.322475570032573, "grad_norm": 0.6614803671836853, "learning_rate": 9.361873960836744e-05, "loss": 0.0126, "step": 4090 }, { "epoch": 13.355048859934854, "grad_norm": 0.6303547620773315, "learning_rate": 9.357826608800142e-05, "loss": 0.0121, "step": 4100 }, { "epoch": 13.387622149837133, "grad_norm": 0.5380171537399292, "learning_rate": 9.353767342631994e-05, "loss": 0.0121, "step": 4110 }, { "epoch": 13.420195439739414, "grad_norm": 0.5387251377105713, "learning_rate": 9.34969617343018e-05, "loss": 0.0102, "step": 4120 }, { "epoch": 13.452768729641694, "grad_norm": 0.5303720235824585, "learning_rate": 9.345613112325122e-05, "loss": 0.0126, "step": 4130 }, { "epoch": 13.485342019543975, "grad_norm": 0.5876774787902832, "learning_rate": 9.34151817047975e-05, "loss": 0.0111, "step": 4140 }, { "epoch": 13.517915309446254, "grad_norm": 0.6997594237327576, "learning_rate": 9.33741135908948e-05, "loss": 0.0119, "step": 4150 }, { "epoch": 13.550488599348535, "grad_norm": 0.602103054523468, "learning_rate": 9.33329268938218e-05, "loss": 0.0118, "step": 4160 }, { "epoch": 13.583061889250814, "grad_norm": 0.6966025829315186, "learning_rate": 9.329162172618132e-05, "loss": 0.0127, "step": 4170 }, { "epoch": 13.615635179153095, "grad_norm": 0.5898999571800232, "learning_rate": 9.325019820090013e-05, "loss": 0.0113, "step": 4180 }, { "epoch": 13.648208469055374, "grad_norm": 0.6419972777366638, "learning_rate": 9.320865643122855e-05, "loss": 0.0122, "step": 4190 }, { "epoch": 13.680781758957655, "grad_norm": 0.48004379868507385, "learning_rate": 9.316699653074023e-05, "loss": 0.0115, "step": 4200 }, { "epoch": 13.713355048859935, "grad_norm": 0.6654482483863831, "learning_rate": 9.312521861333172e-05, "loss": 0.0128, "step": 4210 }, { "epoch": 13.745928338762216, "grad_norm": 0.5661275386810303, "learning_rate": 9.308332279322224e-05, "loss": 0.0114, "step": 4220 }, { "epoch": 13.778501628664495, "grad_norm": 0.5483682155609131, "learning_rate": 9.304130918495338e-05, "loss": 0.0133, "step": 4230 }, { "epoch": 13.811074918566776, "grad_norm": 0.7615593075752258, "learning_rate": 9.299917790338874e-05, "loss": 0.0119, "step": 4240 }, { "epoch": 13.843648208469055, "grad_norm": 0.8771687150001526, "learning_rate": 9.295692906371363e-05, "loss": 0.013, "step": 4250 }, { "epoch": 13.876221498371336, "grad_norm": 0.7540078163146973, "learning_rate": 9.291456278143476e-05, "loss": 0.0138, "step": 4260 }, { "epoch": 13.908794788273616, "grad_norm": 0.711463212966919, "learning_rate": 9.287207917237994e-05, "loss": 0.0115, "step": 4270 }, { "epoch": 13.941368078175895, "grad_norm": 0.6685953736305237, "learning_rate": 9.282947835269773e-05, "loss": 0.0128, "step": 4280 }, { "epoch": 13.973941368078176, "grad_norm": 0.6462761163711548, "learning_rate": 9.278676043885715e-05, "loss": 0.0142, "step": 4290 }, { "epoch": 14.006514657980455, "grad_norm": 0.577616274356842, "learning_rate": 9.274392554764733e-05, "loss": 0.0114, "step": 4300 }, { "epoch": 14.039087947882736, "grad_norm": 0.6425755620002747, "learning_rate": 9.270097379617723e-05, "loss": 0.0124, "step": 4310 }, { "epoch": 14.071661237785015, "grad_norm": 0.5634329319000244, "learning_rate": 9.26579053018753e-05, "loss": 0.0125, "step": 4320 }, { "epoch": 14.104234527687296, "grad_norm": 0.5592512488365173, "learning_rate": 9.261472018248918e-05, "loss": 0.0131, "step": 4330 }, { "epoch": 14.136807817589576, "grad_norm": 0.7388976216316223, "learning_rate": 9.25714185560853e-05, "loss": 0.013, "step": 4340 }, { "epoch": 14.169381107491857, "grad_norm": 0.5503047108650208, "learning_rate": 9.252800054104868e-05, "loss": 0.0122, "step": 4350 }, { "epoch": 14.201954397394136, "grad_norm": 0.4611617922782898, "learning_rate": 9.248446625608252e-05, "loss": 0.0111, "step": 4360 }, { "epoch": 14.234527687296417, "grad_norm": 0.5560048222541809, "learning_rate": 9.244081582020789e-05, "loss": 0.0124, "step": 4370 }, { "epoch": 14.267100977198696, "grad_norm": 0.6567198634147644, "learning_rate": 9.239704935276339e-05, "loss": 0.0119, "step": 4380 }, { "epoch": 14.299674267100977, "grad_norm": 0.5934193134307861, "learning_rate": 9.235316697340489e-05, "loss": 0.0108, "step": 4390 }, { "epoch": 14.332247557003257, "grad_norm": 0.7520450353622437, "learning_rate": 9.230916880210512e-05, "loss": 0.0141, "step": 4400 }, { "epoch": 14.364820846905538, "grad_norm": 0.6309568285942078, "learning_rate": 9.226505495915342e-05, "loss": 0.0106, "step": 4410 }, { "epoch": 14.397394136807817, "grad_norm": 0.5420092344284058, "learning_rate": 9.222082556515536e-05, "loss": 0.0115, "step": 4420 }, { "epoch": 14.429967426710098, "grad_norm": 0.5959463715553284, "learning_rate": 9.217648074103242e-05, "loss": 0.0129, "step": 4430 }, { "epoch": 14.462540716612377, "grad_norm": 0.5967207551002502, "learning_rate": 9.213202060802161e-05, "loss": 0.0127, "step": 4440 }, { "epoch": 14.495114006514658, "grad_norm": 0.4098617732524872, "learning_rate": 9.208744528767528e-05, "loss": 0.0119, "step": 4450 }, { "epoch": 14.527687296416937, "grad_norm": 0.6198092699050903, "learning_rate": 9.204275490186064e-05, "loss": 0.0125, "step": 4460 }, { "epoch": 14.560260586319218, "grad_norm": 0.5683520436286926, "learning_rate": 9.199794957275949e-05, "loss": 0.014, "step": 4470 }, { "epoch": 14.592833876221498, "grad_norm": 0.4835983216762543, "learning_rate": 9.19530294228679e-05, "loss": 0.011, "step": 4480 }, { "epoch": 14.625407166123779, "grad_norm": 0.5733904242515564, "learning_rate": 9.190799457499583e-05, "loss": 0.0131, "step": 4490 }, { "epoch": 14.657980456026058, "grad_norm": 0.4585092067718506, "learning_rate": 9.186284515226686e-05, "loss": 0.0128, "step": 4500 }, { "epoch": 14.690553745928339, "grad_norm": 0.4910542070865631, "learning_rate": 9.181758127811777e-05, "loss": 0.0135, "step": 4510 }, { "epoch": 14.723127035830618, "grad_norm": 0.5943559408187866, "learning_rate": 9.177220307629825e-05, "loss": 0.0128, "step": 4520 }, { "epoch": 14.7557003257329, "grad_norm": 0.6429430246353149, "learning_rate": 9.172671067087059e-05, "loss": 0.012, "step": 4530 }, { "epoch": 14.788273615635179, "grad_norm": 0.5697653889656067, "learning_rate": 9.16811041862093e-05, "loss": 0.0107, "step": 4540 }, { "epoch": 14.82084690553746, "grad_norm": 0.5715034008026123, "learning_rate": 9.163538374700076e-05, "loss": 0.0118, "step": 4550 }, { "epoch": 14.853420195439739, "grad_norm": 0.638119101524353, "learning_rate": 9.158954947824287e-05, "loss": 0.011, "step": 4560 }, { "epoch": 14.88599348534202, "grad_norm": 0.5536983609199524, "learning_rate": 9.154360150524482e-05, "loss": 0.013, "step": 4570 }, { "epoch": 14.9185667752443, "grad_norm": 0.6488440632820129, "learning_rate": 9.14975399536266e-05, "loss": 0.0108, "step": 4580 }, { "epoch": 14.95114006514658, "grad_norm": 0.5250594019889832, "learning_rate": 9.14513649493187e-05, "loss": 0.0112, "step": 4590 }, { "epoch": 14.98371335504886, "grad_norm": 0.53516685962677, "learning_rate": 9.140507661856187e-05, "loss": 0.0117, "step": 4600 }, { "epoch": 15.01628664495114, "grad_norm": 0.722775936126709, "learning_rate": 9.135867508790661e-05, "loss": 0.0133, "step": 4610 }, { "epoch": 15.04885993485342, "grad_norm": 0.5718010067939758, "learning_rate": 9.131216048421291e-05, "loss": 0.0121, "step": 4620 }, { "epoch": 15.0814332247557, "grad_norm": 0.705929696559906, "learning_rate": 9.126553293464998e-05, "loss": 0.0113, "step": 4630 }, { "epoch": 15.11400651465798, "grad_norm": 0.5129148960113525, "learning_rate": 9.121879256669572e-05, "loss": 0.013, "step": 4640 }, { "epoch": 15.146579804560261, "grad_norm": 0.5438552498817444, "learning_rate": 9.117193950813652e-05, "loss": 0.0137, "step": 4650 }, { "epoch": 15.17915309446254, "grad_norm": 0.6388025283813477, "learning_rate": 9.112497388706685e-05, "loss": 0.0127, "step": 4660 }, { "epoch": 15.211726384364821, "grad_norm": 0.6098414659500122, "learning_rate": 9.10778958318889e-05, "loss": 0.0101, "step": 4670 }, { "epoch": 15.2442996742671, "grad_norm": 0.6415327191352844, "learning_rate": 9.103070547131232e-05, "loss": 0.0142, "step": 4680 }, { "epoch": 15.276872964169382, "grad_norm": 0.6037464141845703, "learning_rate": 9.098340293435375e-05, "loss": 0.0117, "step": 4690 }, { "epoch": 15.309446254071661, "grad_norm": 0.5115535855293274, "learning_rate": 9.093598835033649e-05, "loss": 0.0105, "step": 4700 }, { "epoch": 15.342019543973942, "grad_norm": 0.5303134918212891, "learning_rate": 9.088846184889021e-05, "loss": 0.0118, "step": 4710 }, { "epoch": 15.374592833876221, "grad_norm": 0.5767205357551575, "learning_rate": 9.084082355995057e-05, "loss": 0.0105, "step": 4720 }, { "epoch": 15.407166123778502, "grad_norm": 0.5441441535949707, "learning_rate": 9.079307361375882e-05, "loss": 0.0105, "step": 4730 }, { "epoch": 15.439739413680782, "grad_norm": 0.5901165008544922, "learning_rate": 9.074521214086149e-05, "loss": 0.0123, "step": 4740 }, { "epoch": 15.472312703583063, "grad_norm": 0.4205402731895447, "learning_rate": 9.069723927211001e-05, "loss": 0.0103, "step": 4750 }, { "epoch": 15.504885993485342, "grad_norm": 0.6429307460784912, "learning_rate": 9.064915513866037e-05, "loss": 0.0093, "step": 4760 }, { "epoch": 15.537459283387623, "grad_norm": 0.47192737460136414, "learning_rate": 9.060095987197279e-05, "loss": 0.0109, "step": 4770 }, { "epoch": 15.570032573289902, "grad_norm": 0.5266391634941101, "learning_rate": 9.055265360381126e-05, "loss": 0.0097, "step": 4780 }, { "epoch": 15.602605863192183, "grad_norm": 0.485973984003067, "learning_rate": 9.050423646624326e-05, "loss": 0.0101, "step": 4790 }, { "epoch": 15.635179153094462, "grad_norm": 0.5286204218864441, "learning_rate": 9.045570859163943e-05, "loss": 0.0117, "step": 4800 }, { "epoch": 15.667752442996743, "grad_norm": 0.6801819205284119, "learning_rate": 9.04070701126731e-05, "loss": 0.0106, "step": 4810 }, { "epoch": 15.700325732899023, "grad_norm": 0.5797834396362305, "learning_rate": 9.035832116232001e-05, "loss": 0.0115, "step": 4820 }, { "epoch": 15.732899022801304, "grad_norm": 0.7416790723800659, "learning_rate": 9.030946187385796e-05, "loss": 0.0112, "step": 4830 }, { "epoch": 15.765472312703583, "grad_norm": 0.5395382046699524, "learning_rate": 9.026049238086635e-05, "loss": 0.0101, "step": 4840 }, { "epoch": 15.798045602605864, "grad_norm": 0.667809784412384, "learning_rate": 9.021141281722591e-05, "loss": 0.0112, "step": 4850 }, { "epoch": 15.830618892508143, "grad_norm": 0.543626606464386, "learning_rate": 9.01622233171183e-05, "loss": 0.0117, "step": 4860 }, { "epoch": 15.863192182410424, "grad_norm": 0.5138244032859802, "learning_rate": 9.011292401502574e-05, "loss": 0.0106, "step": 4870 }, { "epoch": 15.895765472312704, "grad_norm": 0.491041898727417, "learning_rate": 9.006351504573063e-05, "loss": 0.0126, "step": 4880 }, { "epoch": 15.928338762214985, "grad_norm": 0.3895576298236847, "learning_rate": 9.001399654431519e-05, "loss": 0.0108, "step": 4890 }, { "epoch": 15.960912052117264, "grad_norm": 0.6080408692359924, "learning_rate": 8.996436864616116e-05, "loss": 0.013, "step": 4900 }, { "epoch": 15.993485342019543, "grad_norm": 0.6155601143836975, "learning_rate": 8.991463148694925e-05, "loss": 0.0096, "step": 4910 }, { "epoch": 16.026058631921824, "grad_norm": 0.6588138341903687, "learning_rate": 8.986478520265902e-05, "loss": 0.0104, "step": 4920 }, { "epoch": 16.058631921824105, "grad_norm": 0.6062794327735901, "learning_rate": 8.981482992956827e-05, "loss": 0.0107, "step": 4930 }, { "epoch": 16.091205211726383, "grad_norm": 0.6590397357940674, "learning_rate": 8.976476580425282e-05, "loss": 0.0131, "step": 4940 }, { "epoch": 16.123778501628664, "grad_norm": 0.5601251125335693, "learning_rate": 8.971459296358606e-05, "loss": 0.0121, "step": 4950 }, { "epoch": 16.156351791530945, "grad_norm": 0.5221347212791443, "learning_rate": 8.966431154473864e-05, "loss": 0.0121, "step": 4960 }, { "epoch": 16.188925081433226, "grad_norm": 0.57920902967453, "learning_rate": 8.961392168517803e-05, "loss": 0.0124, "step": 4970 }, { "epoch": 16.221498371335503, "grad_norm": 0.5354865789413452, "learning_rate": 8.956342352266821e-05, "loss": 0.012, "step": 4980 }, { "epoch": 16.254071661237784, "grad_norm": 0.618729293346405, "learning_rate": 8.95128171952692e-05, "loss": 0.0104, "step": 4990 }, { "epoch": 16.286644951140065, "grad_norm": 0.42464056611061096, "learning_rate": 8.946210284133676e-05, "loss": 0.0114, "step": 5000 }, { "epoch": 16.319218241042346, "grad_norm": 0.4815506041049957, "learning_rate": 8.941128059952201e-05, "loss": 0.01, "step": 5010 }, { "epoch": 16.351791530944624, "grad_norm": 0.49649620056152344, "learning_rate": 8.936035060877102e-05, "loss": 0.0106, "step": 5020 }, { "epoch": 16.384364820846905, "grad_norm": 0.5027061700820923, "learning_rate": 8.930931300832443e-05, "loss": 0.0104, "step": 5030 }, { "epoch": 16.416938110749186, "grad_norm": 0.5375000834465027, "learning_rate": 8.925816793771711e-05, "loss": 0.0095, "step": 5040 }, { "epoch": 16.449511400651467, "grad_norm": 0.4966764450073242, "learning_rate": 8.92069155367777e-05, "loss": 0.0114, "step": 5050 }, { "epoch": 16.482084690553744, "grad_norm": 0.7109740376472473, "learning_rate": 8.915555594562834e-05, "loss": 0.0129, "step": 5060 }, { "epoch": 16.514657980456025, "grad_norm": 0.5642775297164917, "learning_rate": 8.910408930468416e-05, "loss": 0.0102, "step": 5070 }, { "epoch": 16.547231270358306, "grad_norm": 0.5686492323875427, "learning_rate": 8.905251575465303e-05, "loss": 0.0102, "step": 5080 }, { "epoch": 16.579804560260587, "grad_norm": 0.5111770033836365, "learning_rate": 8.900083543653502e-05, "loss": 0.0101, "step": 5090 }, { "epoch": 16.612377850162865, "grad_norm": 0.48311617970466614, "learning_rate": 8.894904849162218e-05, "loss": 0.0113, "step": 5100 }, { "epoch": 16.644951140065146, "grad_norm": 0.4612133502960205, "learning_rate": 8.889715506149802e-05, "loss": 0.0097, "step": 5110 }, { "epoch": 16.677524429967427, "grad_norm": 0.6265704035758972, "learning_rate": 8.884515528803722e-05, "loss": 0.0101, "step": 5120 }, { "epoch": 16.710097719869708, "grad_norm": 0.5671060681343079, "learning_rate": 8.879304931340517e-05, "loss": 0.0098, "step": 5130 }, { "epoch": 16.742671009771986, "grad_norm": 0.4591500461101532, "learning_rate": 8.874083728005759e-05, "loss": 0.0106, "step": 5140 }, { "epoch": 16.775244299674267, "grad_norm": 0.7012014389038086, "learning_rate": 8.868851933074021e-05, "loss": 0.0115, "step": 5150 }, { "epoch": 16.807817589576548, "grad_norm": 0.6138771176338196, "learning_rate": 8.863609560848829e-05, "loss": 0.0117, "step": 5160 }, { "epoch": 16.84039087947883, "grad_norm": 0.5631691813468933, "learning_rate": 8.85835662566263e-05, "loss": 0.0096, "step": 5170 }, { "epoch": 16.872964169381106, "grad_norm": 0.6066752076148987, "learning_rate": 8.853093141876747e-05, "loss": 0.0118, "step": 5180 }, { "epoch": 16.905537459283387, "grad_norm": 0.3970053791999817, "learning_rate": 8.847819123881343e-05, "loss": 0.0103, "step": 5190 }, { "epoch": 16.938110749185668, "grad_norm": 0.5872803330421448, "learning_rate": 8.842534586095383e-05, "loss": 0.0093, "step": 5200 }, { "epoch": 16.97068403908795, "grad_norm": 0.5278239846229553, "learning_rate": 8.837239542966593e-05, "loss": 0.0116, "step": 5210 }, { "epoch": 17.003257328990227, "grad_norm": 0.5052501559257507, "learning_rate": 8.831934008971417e-05, "loss": 0.0102, "step": 5220 }, { "epoch": 17.035830618892508, "grad_norm": 0.5707674026489258, "learning_rate": 8.826617998614982e-05, "loss": 0.0085, "step": 5230 }, { "epoch": 17.06840390879479, "grad_norm": 0.5154997706413269, "learning_rate": 8.821291526431056e-05, "loss": 0.0113, "step": 5240 }, { "epoch": 17.10097719869707, "grad_norm": 0.4334968328475952, "learning_rate": 8.815954606982015e-05, "loss": 0.0112, "step": 5250 }, { "epoch": 17.133550488599347, "grad_norm": 0.6048435568809509, "learning_rate": 8.810607254858789e-05, "loss": 0.0117, "step": 5260 }, { "epoch": 17.16612377850163, "grad_norm": 0.4900098145008087, "learning_rate": 8.805249484680838e-05, "loss": 0.0098, "step": 5270 }, { "epoch": 17.19869706840391, "grad_norm": 0.43989741802215576, "learning_rate": 8.799881311096096e-05, "loss": 0.0119, "step": 5280 }, { "epoch": 17.23127035830619, "grad_norm": 0.4381119906902313, "learning_rate": 8.794502748780949e-05, "loss": 0.0098, "step": 5290 }, { "epoch": 17.263843648208468, "grad_norm": 0.5834327340126038, "learning_rate": 8.78911381244018e-05, "loss": 0.0115, "step": 5300 }, { "epoch": 17.29641693811075, "grad_norm": 0.43369826674461365, "learning_rate": 8.783714516806933e-05, "loss": 0.0093, "step": 5310 }, { "epoch": 17.32899022801303, "grad_norm": 0.5850085020065308, "learning_rate": 8.77830487664268e-05, "loss": 0.0112, "step": 5320 }, { "epoch": 17.36156351791531, "grad_norm": 0.5535557270050049, "learning_rate": 8.772884906737167e-05, "loss": 0.0117, "step": 5330 }, { "epoch": 17.39413680781759, "grad_norm": 0.5345462560653687, "learning_rate": 8.767454621908387e-05, "loss": 0.0112, "step": 5340 }, { "epoch": 17.42671009771987, "grad_norm": 0.5913922190666199, "learning_rate": 8.76201403700253e-05, "loss": 0.0118, "step": 5350 }, { "epoch": 17.45928338762215, "grad_norm": 0.5394867658615112, "learning_rate": 8.756563166893949e-05, "loss": 0.0103, "step": 5360 }, { "epoch": 17.49185667752443, "grad_norm": 0.637417197227478, "learning_rate": 8.751102026485113e-05, "loss": 0.0121, "step": 5370 }, { "epoch": 17.52442996742671, "grad_norm": 0.5312429666519165, "learning_rate": 8.745630630706571e-05, "loss": 0.0112, "step": 5380 }, { "epoch": 17.55700325732899, "grad_norm": 0.5353013873100281, "learning_rate": 8.740148994516912e-05, "loss": 0.0104, "step": 5390 }, { "epoch": 17.58957654723127, "grad_norm": 0.5983322858810425, "learning_rate": 8.73465713290272e-05, "loss": 0.0091, "step": 5400 }, { "epoch": 17.622149837133552, "grad_norm": 0.6481793522834778, "learning_rate": 8.729155060878533e-05, "loss": 0.0109, "step": 5410 }, { "epoch": 17.65472312703583, "grad_norm": 0.51454097032547, "learning_rate": 8.723642793486809e-05, "loss": 0.0104, "step": 5420 }, { "epoch": 17.68729641693811, "grad_norm": 0.6376858949661255, "learning_rate": 8.718120345797873e-05, "loss": 0.01, "step": 5430 }, { "epoch": 17.71986970684039, "grad_norm": 0.5232966542243958, "learning_rate": 8.712587732909889e-05, "loss": 0.0104, "step": 5440 }, { "epoch": 17.752442996742673, "grad_norm": 0.40592941641807556, "learning_rate": 8.707044969948806e-05, "loss": 0.0103, "step": 5450 }, { "epoch": 17.78501628664495, "grad_norm": 0.5707107782363892, "learning_rate": 8.701492072068329e-05, "loss": 0.0097, "step": 5460 }, { "epoch": 17.81758957654723, "grad_norm": 0.5730787515640259, "learning_rate": 8.695929054449869e-05, "loss": 0.0109, "step": 5470 }, { "epoch": 17.850162866449512, "grad_norm": 0.5907419323921204, "learning_rate": 8.690355932302501e-05, "loss": 0.0106, "step": 5480 }, { "epoch": 17.88273615635179, "grad_norm": 0.5448195934295654, "learning_rate": 8.684772720862931e-05, "loss": 0.0102, "step": 5490 }, { "epoch": 17.91530944625407, "grad_norm": 0.5052299499511719, "learning_rate": 8.679179435395446e-05, "loss": 0.0102, "step": 5500 }, { "epoch": 17.94788273615635, "grad_norm": 0.57944256067276, "learning_rate": 8.673576091191874e-05, "loss": 0.0115, "step": 5510 }, { "epoch": 17.980456026058633, "grad_norm": 0.5408090949058533, "learning_rate": 8.667962703571541e-05, "loss": 0.0105, "step": 5520 }, { "epoch": 18.01302931596091, "grad_norm": 0.5815545320510864, "learning_rate": 8.662339287881238e-05, "loss": 0.0098, "step": 5530 }, { "epoch": 18.04560260586319, "grad_norm": 0.5216434001922607, "learning_rate": 8.656705859495169e-05, "loss": 0.0108, "step": 5540 }, { "epoch": 18.078175895765472, "grad_norm": 0.41256457567214966, "learning_rate": 8.651062433814912e-05, "loss": 0.0096, "step": 5550 }, { "epoch": 18.110749185667753, "grad_norm": 0.4612780809402466, "learning_rate": 8.645409026269375e-05, "loss": 0.0098, "step": 5560 }, { "epoch": 18.14332247557003, "grad_norm": 0.6036962866783142, "learning_rate": 8.639745652314759e-05, "loss": 0.0109, "step": 5570 }, { "epoch": 18.175895765472312, "grad_norm": 0.45757317543029785, "learning_rate": 8.634072327434515e-05, "loss": 0.0104, "step": 5580 }, { "epoch": 18.208469055374593, "grad_norm": 0.5633235573768616, "learning_rate": 8.628389067139294e-05, "loss": 0.0105, "step": 5590 }, { "epoch": 18.241042345276874, "grad_norm": 0.48329007625579834, "learning_rate": 8.622695886966911e-05, "loss": 0.01, "step": 5600 }, { "epoch": 18.27361563517915, "grad_norm": 0.4269546568393707, "learning_rate": 8.616992802482308e-05, "loss": 0.0123, "step": 5610 }, { "epoch": 18.306188925081432, "grad_norm": 0.4761641323566437, "learning_rate": 8.611279829277496e-05, "loss": 0.0112, "step": 5620 }, { "epoch": 18.338762214983714, "grad_norm": 0.5688640475273132, "learning_rate": 8.605556982971528e-05, "loss": 0.011, "step": 5630 }, { "epoch": 18.371335504885995, "grad_norm": 0.5268420577049255, "learning_rate": 8.599824279210447e-05, "loss": 0.0111, "step": 5640 }, { "epoch": 18.403908794788272, "grad_norm": 0.5499677658081055, "learning_rate": 8.594081733667243e-05, "loss": 0.0087, "step": 5650 }, { "epoch": 18.436482084690553, "grad_norm": 0.592589259147644, "learning_rate": 8.58832936204182e-05, "loss": 0.0095, "step": 5660 }, { "epoch": 18.469055374592834, "grad_norm": 0.4770977795124054, "learning_rate": 8.582567180060942e-05, "loss": 0.0099, "step": 5670 }, { "epoch": 18.501628664495115, "grad_norm": 0.5618078708648682, "learning_rate": 8.576795203478194e-05, "loss": 0.0107, "step": 5680 }, { "epoch": 18.534201954397393, "grad_norm": 0.46116217970848083, "learning_rate": 8.571013448073939e-05, "loss": 0.0114, "step": 5690 }, { "epoch": 18.566775244299674, "grad_norm": 0.4571772515773773, "learning_rate": 8.565221929655275e-05, "loss": 0.0102, "step": 5700 }, { "epoch": 18.599348534201955, "grad_norm": 0.4506760537624359, "learning_rate": 8.559420664055992e-05, "loss": 0.0114, "step": 5710 }, { "epoch": 18.631921824104236, "grad_norm": 0.487032413482666, "learning_rate": 8.553609667136532e-05, "loss": 0.0098, "step": 5720 }, { "epoch": 18.664495114006513, "grad_norm": 0.40095847845077515, "learning_rate": 8.547788954783936e-05, "loss": 0.0093, "step": 5730 }, { "epoch": 18.697068403908794, "grad_norm": 0.43920227885246277, "learning_rate": 8.541958542911808e-05, "loss": 0.0108, "step": 5740 }, { "epoch": 18.729641693811075, "grad_norm": 0.45471495389938354, "learning_rate": 8.536118447460275e-05, "loss": 0.0085, "step": 5750 }, { "epoch": 18.762214983713356, "grad_norm": 0.3742855191230774, "learning_rate": 8.530268684395932e-05, "loss": 0.0098, "step": 5760 }, { "epoch": 18.794788273615634, "grad_norm": 0.5401120781898499, "learning_rate": 8.524409269711807e-05, "loss": 0.011, "step": 5770 }, { "epoch": 18.827361563517915, "grad_norm": 0.4368399679660797, "learning_rate": 8.51854021942732e-05, "loss": 0.0117, "step": 5780 }, { "epoch": 18.859934853420196, "grad_norm": 0.5863309502601624, "learning_rate": 8.512661549588227e-05, "loss": 0.0108, "step": 5790 }, { "epoch": 18.892508143322477, "grad_norm": 0.4833745062351227, "learning_rate": 8.506773276266588e-05, "loss": 0.011, "step": 5800 }, { "epoch": 18.925081433224754, "grad_norm": 0.5938926339149475, "learning_rate": 8.500875415560721e-05, "loss": 0.0107, "step": 5810 }, { "epoch": 18.957654723127035, "grad_norm": 0.47496920824050903, "learning_rate": 8.494967983595144e-05, "loss": 0.0128, "step": 5820 }, { "epoch": 18.990228013029316, "grad_norm": 0.43583986163139343, "learning_rate": 8.489050996520558e-05, "loss": 0.0116, "step": 5830 }, { "epoch": 19.022801302931597, "grad_norm": 0.4981800317764282, "learning_rate": 8.483124470513775e-05, "loss": 0.0097, "step": 5840 }, { "epoch": 19.055374592833875, "grad_norm": 0.5075828433036804, "learning_rate": 8.477188421777692e-05, "loss": 0.0094, "step": 5850 }, { "epoch": 19.087947882736156, "grad_norm": 0.49366408586502075, "learning_rate": 8.47124286654124e-05, "loss": 0.0113, "step": 5860 }, { "epoch": 19.120521172638437, "grad_norm": 0.6152781844139099, "learning_rate": 8.465287821059341e-05, "loss": 0.01, "step": 5870 }, { "epoch": 19.153094462540718, "grad_norm": 0.5412830710411072, "learning_rate": 8.45932330161286e-05, "loss": 0.0107, "step": 5880 }, { "epoch": 19.185667752442995, "grad_norm": 0.46815627813339233, "learning_rate": 8.453349324508567e-05, "loss": 0.0104, "step": 5890 }, { "epoch": 19.218241042345277, "grad_norm": 0.43253058195114136, "learning_rate": 8.447365906079088e-05, "loss": 0.0093, "step": 5900 }, { "epoch": 19.250814332247558, "grad_norm": 0.40782538056373596, "learning_rate": 8.441373062682856e-05, "loss": 0.0104, "step": 5910 }, { "epoch": 19.28338762214984, "grad_norm": 0.4392983317375183, "learning_rate": 8.43537081070408e-05, "loss": 0.0103, "step": 5920 }, { "epoch": 19.315960912052116, "grad_norm": 0.47564175724983215, "learning_rate": 8.429359166552689e-05, "loss": 0.011, "step": 5930 }, { "epoch": 19.348534201954397, "grad_norm": 0.4908657670021057, "learning_rate": 8.423338146664284e-05, "loss": 0.0112, "step": 5940 }, { "epoch": 19.381107491856678, "grad_norm": 0.4445395767688751, "learning_rate": 8.417307767500107e-05, "loss": 0.0104, "step": 5950 }, { "epoch": 19.41368078175896, "grad_norm": 0.38990992307662964, "learning_rate": 8.411268045546983e-05, "loss": 0.0104, "step": 5960 }, { "epoch": 19.446254071661237, "grad_norm": 0.5295190811157227, "learning_rate": 8.405218997317281e-05, "loss": 0.0113, "step": 5970 }, { "epoch": 19.478827361563518, "grad_norm": 0.5957738757133484, "learning_rate": 8.399160639348869e-05, "loss": 0.0136, "step": 5980 }, { "epoch": 19.5114006514658, "grad_norm": 0.5141053199768066, "learning_rate": 8.393092988205065e-05, "loss": 0.0107, "step": 5990 }, { "epoch": 19.54397394136808, "grad_norm": 0.5684329271316528, "learning_rate": 8.387016060474597e-05, "loss": 0.0105, "step": 6000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 66, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }