{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00291970802919708, "grad_norm": 4.875, "learning_rate": 7.246376811594204e-08, "loss": 1.320786714553833, "step": 2 }, { "epoch": 0.00583941605839416, "grad_norm": 26.375, "learning_rate": 2.173913043478261e-07, "loss": 2.3353517055511475, "step": 4 }, { "epoch": 0.008759124087591242, "grad_norm": 5.125, "learning_rate": 3.623188405797102e-07, "loss": 1.9446890354156494, "step": 6 }, { "epoch": 0.01167883211678832, "grad_norm": 2.234375, "learning_rate": 5.072463768115942e-07, "loss": 1.6843594312667847, "step": 8 }, { "epoch": 0.014598540145985401, "grad_norm": 8.8125, "learning_rate": 6.521739130434783e-07, "loss": 1.8062303066253662, "step": 10 }, { "epoch": 0.017518248175182483, "grad_norm": 5.0, "learning_rate": 7.971014492753623e-07, "loss": 1.9280399084091187, "step": 12 }, { "epoch": 0.020437956204379562, "grad_norm": 3.015625, "learning_rate": 9.420289855072465e-07, "loss": 1.570988655090332, "step": 14 }, { "epoch": 0.02335766423357664, "grad_norm": 11.25, "learning_rate": 1.0869565217391306e-06, "loss": 1.7710015773773193, "step": 16 }, { "epoch": 0.026277372262773723, "grad_norm": 4.53125, "learning_rate": 1.2318840579710147e-06, "loss": 1.9166163206100464, "step": 18 }, { "epoch": 0.029197080291970802, "grad_norm": 23.5, "learning_rate": 1.3768115942028987e-06, "loss": 1.9079008102416992, "step": 20 }, { "epoch": 0.032116788321167884, "grad_norm": 6.15625, "learning_rate": 1.521739130434783e-06, "loss": 1.9891327619552612, "step": 22 }, { "epoch": 0.035036496350364967, "grad_norm": 8.6875, "learning_rate": 1.6666666666666667e-06, "loss": 1.8731980323791504, "step": 24 }, { "epoch": 0.03795620437956204, "grad_norm": 41.5, "learning_rate": 1.8115942028985508e-06, "loss": 1.996793508529663, "step": 26 }, { "epoch": 0.040875912408759124, "grad_norm": 16.125, "learning_rate": 1.956521739130435e-06, "loss": 2.4439406394958496, "step": 28 }, { "epoch": 0.043795620437956206, "grad_norm": 4.78125, "learning_rate": 2.101449275362319e-06, "loss": 1.4941191673278809, "step": 30 }, { "epoch": 0.04671532846715328, "grad_norm": 5.71875, "learning_rate": 2.246376811594203e-06, "loss": 1.9384567737579346, "step": 32 }, { "epoch": 0.049635036496350364, "grad_norm": 3.140625, "learning_rate": 2.391304347826087e-06, "loss": 2.106153964996338, "step": 34 }, { "epoch": 0.052554744525547446, "grad_norm": 25.875, "learning_rate": 2.5362318840579714e-06, "loss": 2.235496997833252, "step": 36 }, { "epoch": 0.05547445255474453, "grad_norm": 6.46875, "learning_rate": 2.6811594202898555e-06, "loss": 2.4106810092926025, "step": 38 }, { "epoch": 0.058394160583941604, "grad_norm": 4.375, "learning_rate": 2.8260869565217393e-06, "loss": 1.6466758251190186, "step": 40 }, { "epoch": 0.061313868613138686, "grad_norm": 95.5, "learning_rate": 2.9710144927536235e-06, "loss": 1.9993230104446411, "step": 42 }, { "epoch": 0.06423357664233577, "grad_norm": 3.953125, "learning_rate": 3.1159420289855073e-06, "loss": 1.7203528881072998, "step": 44 }, { "epoch": 0.06715328467153285, "grad_norm": 13.5625, "learning_rate": 3.2608695652173914e-06, "loss": 2.5018796920776367, "step": 46 }, { "epoch": 0.07007299270072993, "grad_norm": 12.6875, "learning_rate": 3.4057971014492756e-06, "loss": 1.935620903968811, "step": 48 }, { "epoch": 0.072992700729927, "grad_norm": 4.125, "learning_rate": 3.55072463768116e-06, "loss": 1.9458433389663696, "step": 50 }, { "epoch": 0.07591240875912408, "grad_norm": 2.171875, "learning_rate": 3.6956521739130436e-06, "loss": 1.321602702140808, "step": 52 }, { "epoch": 0.07883211678832117, "grad_norm": 3.578125, "learning_rate": 3.840579710144928e-06, "loss": 2.0101318359375, "step": 54 }, { "epoch": 0.08175182481751825, "grad_norm": 5.625, "learning_rate": 3.9855072463768115e-06, "loss": 2.0588250160217285, "step": 56 }, { "epoch": 0.08467153284671533, "grad_norm": 5.3125, "learning_rate": 4.130434782608696e-06, "loss": 1.860298752784729, "step": 58 }, { "epoch": 0.08759124087591241, "grad_norm": 5.9375, "learning_rate": 4.27536231884058e-06, "loss": 1.9684100151062012, "step": 60 }, { "epoch": 0.0905109489051095, "grad_norm": 9.375, "learning_rate": 4.4202898550724645e-06, "loss": 1.980459213256836, "step": 62 }, { "epoch": 0.09343065693430656, "grad_norm": 4.90625, "learning_rate": 4.565217391304348e-06, "loss": 1.8493075370788574, "step": 64 }, { "epoch": 0.09635036496350365, "grad_norm": 2.609375, "learning_rate": 4.710144927536232e-06, "loss": 1.5537524223327637, "step": 66 }, { "epoch": 0.09927007299270073, "grad_norm": 4.46875, "learning_rate": 4.855072463768117e-06, "loss": 1.8475682735443115, "step": 68 }, { "epoch": 0.10218978102189781, "grad_norm": 3.734375, "learning_rate": 5e-06, "loss": 1.7411353588104248, "step": 70 }, { "epoch": 0.10510948905109489, "grad_norm": 29.875, "learning_rate": 4.999973760423467e-06, "loss": 2.0845284461975098, "step": 72 }, { "epoch": 0.10802919708029197, "grad_norm": 6.21875, "learning_rate": 4.99989504230588e-06, "loss": 1.5018064975738525, "step": 74 }, { "epoch": 0.11094890510948906, "grad_norm": 2.21875, "learning_rate": 4.999763847483267e-06, "loss": 1.464540958404541, "step": 76 }, { "epoch": 0.11386861313868613, "grad_norm": 4.53125, "learning_rate": 4.999580179015625e-06, "loss": 1.8232789039611816, "step": 78 }, { "epoch": 0.11678832116788321, "grad_norm": 1.7578125, "learning_rate": 4.999344041186848e-06, "loss": 1.096325159072876, "step": 80 }, { "epoch": 0.11970802919708029, "grad_norm": 3.328125, "learning_rate": 4.999055439504633e-06, "loss": 1.8037409782409668, "step": 82 }, { "epoch": 0.12262773722627737, "grad_norm": 3.84375, "learning_rate": 4.998714380700345e-06, "loss": 1.5575973987579346, "step": 84 }, { "epoch": 0.12554744525547445, "grad_norm": 4.1875, "learning_rate": 4.998320872728862e-06, "loss": 1.8613684177398682, "step": 86 }, { "epoch": 0.12846715328467154, "grad_norm": 5.15625, "learning_rate": 4.9978749247683895e-06, "loss": 1.732508897781372, "step": 88 }, { "epoch": 0.13138686131386862, "grad_norm": 2.59375, "learning_rate": 4.99737654722025e-06, "loss": 1.3435773849487305, "step": 90 }, { "epoch": 0.1343065693430657, "grad_norm": 3.25, "learning_rate": 4.996825751708635e-06, "loss": 1.7478176355361938, "step": 92 }, { "epoch": 0.13722627737226278, "grad_norm": 2.03125, "learning_rate": 4.996222551080337e-06, "loss": 1.4358994960784912, "step": 94 }, { "epoch": 0.14014598540145987, "grad_norm": 5.4375, "learning_rate": 4.9955669594044466e-06, "loss": 1.870757818222046, "step": 96 }, { "epoch": 0.14306569343065692, "grad_norm": 3.671875, "learning_rate": 4.994858991972031e-06, "loss": 1.6408865451812744, "step": 98 }, { "epoch": 0.145985401459854, "grad_norm": 3.375, "learning_rate": 4.994098665295768e-06, "loss": 1.4728097915649414, "step": 100 }, { "epoch": 0.14890510948905109, "grad_norm": 7.4375, "learning_rate": 4.9932859971095705e-06, "loss": 1.7583755254745483, "step": 102 }, { "epoch": 0.15182481751824817, "grad_norm": 3.25, "learning_rate": 4.992421006368166e-06, "loss": 1.6836040019989014, "step": 104 }, { "epoch": 0.15474452554744525, "grad_norm": 26.25, "learning_rate": 4.991503713246659e-06, "loss": 1.9515830278396606, "step": 106 }, { "epoch": 0.15766423357664233, "grad_norm": 62.25, "learning_rate": 4.990534139140055e-06, "loss": 2.0257816314697266, "step": 108 }, { "epoch": 0.16058394160583941, "grad_norm": 2.640625, "learning_rate": 4.989512306662767e-06, "loss": 1.4182727336883545, "step": 110 }, { "epoch": 0.1635036496350365, "grad_norm": 6.6875, "learning_rate": 4.988438239648084e-06, "loss": 1.70530366897583, "step": 112 }, { "epoch": 0.16642335766423358, "grad_norm": 3.5625, "learning_rate": 4.98731196314762e-06, "loss": 1.5088133811950684, "step": 114 }, { "epoch": 0.16934306569343066, "grad_norm": 3.078125, "learning_rate": 4.986133503430724e-06, "loss": 1.6265062093734741, "step": 116 }, { "epoch": 0.17226277372262774, "grad_norm": 6.4375, "learning_rate": 4.98490288798387e-06, "loss": 1.402962327003479, "step": 118 }, { "epoch": 0.17518248175182483, "grad_norm": 4.125, "learning_rate": 4.983620145510017e-06, "loss": 1.8057794570922852, "step": 120 }, { "epoch": 0.1781021897810219, "grad_norm": 6.875, "learning_rate": 4.982285305927937e-06, "loss": 1.9605462551116943, "step": 122 }, { "epoch": 0.181021897810219, "grad_norm": 3.625, "learning_rate": 4.980898400371521e-06, "loss": 1.8519611358642578, "step": 124 }, { "epoch": 0.18394160583941604, "grad_norm": 10.0625, "learning_rate": 4.9794594611890465e-06, "loss": 1.6692755222320557, "step": 126 }, { "epoch": 0.18686131386861313, "grad_norm": 6.1875, "learning_rate": 4.977968521942429e-06, "loss": 1.8997008800506592, "step": 128 }, { "epoch": 0.1897810218978102, "grad_norm": 1.8515625, "learning_rate": 4.97642561740644e-06, "loss": 1.8168402910232544, "step": 130 }, { "epoch": 0.1927007299270073, "grad_norm": 16.375, "learning_rate": 4.974830783567886e-06, "loss": 1.4727129936218262, "step": 132 }, { "epoch": 0.19562043795620437, "grad_norm": 7.71875, "learning_rate": 4.973184057624781e-06, "loss": 1.6138420104980469, "step": 134 }, { "epoch": 0.19854014598540146, "grad_norm": 3.5, "learning_rate": 4.971485477985474e-06, "loss": 1.6893023252487183, "step": 136 }, { "epoch": 0.20145985401459854, "grad_norm": 1.421875, "learning_rate": 4.969735084267752e-06, "loss": 1.3670828342437744, "step": 138 }, { "epoch": 0.20437956204379562, "grad_norm": 8.4375, "learning_rate": 4.967932917297915e-06, "loss": 1.6938685178756714, "step": 140 }, { "epoch": 0.2072992700729927, "grad_norm": 4.0625, "learning_rate": 4.966079019109831e-06, "loss": 2.2959558963775635, "step": 142 }, { "epoch": 0.21021897810218979, "grad_norm": 3.328125, "learning_rate": 4.964173432943946e-06, "loss": 1.6218578815460205, "step": 144 }, { "epoch": 0.21313868613138687, "grad_norm": 9.0625, "learning_rate": 4.962216203246281e-06, "loss": 2.592639446258545, "step": 146 }, { "epoch": 0.21605839416058395, "grad_norm": 3.3125, "learning_rate": 4.960207375667396e-06, "loss": 1.5585392713546753, "step": 148 }, { "epoch": 0.21897810218978103, "grad_norm": 3.96875, "learning_rate": 4.958146997061319e-06, "loss": 1.6422696113586426, "step": 150 }, { "epoch": 0.22189781021897811, "grad_norm": 5.59375, "learning_rate": 4.956035115484465e-06, "loss": 1.7883186340332031, "step": 152 }, { "epoch": 0.22481751824817517, "grad_norm": 2.140625, "learning_rate": 4.953871780194501e-06, "loss": 1.657930612564087, "step": 154 }, { "epoch": 0.22773722627737225, "grad_norm": 24.125, "learning_rate": 4.951657041649206e-06, "loss": 1.7987116575241089, "step": 156 }, { "epoch": 0.23065693430656933, "grad_norm": 12.0, "learning_rate": 4.9493909515052944e-06, "loss": 2.016146659851074, "step": 158 }, { "epoch": 0.23357664233576642, "grad_norm": 7.90625, "learning_rate": 4.947073562617206e-06, "loss": 1.3612116575241089, "step": 160 }, { "epoch": 0.2364963503649635, "grad_norm": 3.8125, "learning_rate": 4.944704929035877e-06, "loss": 1.7367652654647827, "step": 162 }, { "epoch": 0.23941605839416058, "grad_norm": 2.875, "learning_rate": 4.942285106007477e-06, "loss": 1.3203725814819336, "step": 164 }, { "epoch": 0.24233576642335766, "grad_norm": 11.25, "learning_rate": 4.9398141499721246e-06, "loss": 1.7288057804107666, "step": 166 }, { "epoch": 0.24525547445255474, "grad_norm": 1.5625, "learning_rate": 4.937292118562566e-06, "loss": 1.383696436882019, "step": 168 }, { "epoch": 0.24817518248175183, "grad_norm": 12.5625, "learning_rate": 4.934719070602833e-06, "loss": 1.6433072090148926, "step": 170 }, { "epoch": 0.2510948905109489, "grad_norm": 3.109375, "learning_rate": 4.932095066106872e-06, "loss": 1.4025721549987793, "step": 172 }, { "epoch": 0.25401459854014596, "grad_norm": 4.1875, "learning_rate": 4.929420166277141e-06, "loss": 1.6988599300384521, "step": 174 }, { "epoch": 0.2569343065693431, "grad_norm": 3.65625, "learning_rate": 4.926694433503186e-06, "loss": 1.6042873859405518, "step": 176 }, { "epoch": 0.25985401459854013, "grad_norm": 1.6484375, "learning_rate": 4.923917931360185e-06, "loss": 1.2862474918365479, "step": 178 }, { "epoch": 0.26277372262773724, "grad_norm": 6.65625, "learning_rate": 4.9210907246074615e-06, "loss": 1.7310783863067627, "step": 180 }, { "epoch": 0.2656934306569343, "grad_norm": 4.5625, "learning_rate": 4.9182128791869796e-06, "loss": 1.5482988357543945, "step": 182 }, { "epoch": 0.2686131386861314, "grad_norm": 1.5078125, "learning_rate": 4.9152844622218e-06, "loss": 1.2439241409301758, "step": 184 }, { "epoch": 0.27153284671532846, "grad_norm": 4.3125, "learning_rate": 4.91230554201452e-06, "loss": 1.5766255855560303, "step": 186 }, { "epoch": 0.27445255474452557, "grad_norm": 3.90625, "learning_rate": 4.9092761880456764e-06, "loss": 1.311848759651184, "step": 188 }, { "epoch": 0.2773722627737226, "grad_norm": 39.75, "learning_rate": 4.906196470972128e-06, "loss": 1.5088813304901123, "step": 190 }, { "epoch": 0.28029197080291973, "grad_norm": 6.40625, "learning_rate": 4.903066462625405e-06, "loss": 1.6081913709640503, "step": 192 }, { "epoch": 0.2832116788321168, "grad_norm": 6.125, "learning_rate": 4.899886236010036e-06, "loss": 1.7471773624420166, "step": 194 }, { "epoch": 0.28613138686131384, "grad_norm": 4.09375, "learning_rate": 4.896655865301842e-06, "loss": 1.6127898693084717, "step": 196 }, { "epoch": 0.28905109489051095, "grad_norm": 3.1875, "learning_rate": 4.893375425846209e-06, "loss": 1.6075236797332764, "step": 198 }, { "epoch": 0.291970802919708, "grad_norm": 3.53125, "learning_rate": 4.890044994156331e-06, "loss": 1.712640643119812, "step": 200 }, { "epoch": 0.2948905109489051, "grad_norm": 3.84375, "learning_rate": 4.886664647911422e-06, "loss": 1.5669183731079102, "step": 202 }, { "epoch": 0.29781021897810217, "grad_norm": 5.6875, "learning_rate": 4.883234465954909e-06, "loss": 1.7576971054077148, "step": 204 }, { "epoch": 0.3007299270072993, "grad_norm": 2.515625, "learning_rate": 4.879754528292588e-06, "loss": 1.5543663501739502, "step": 206 }, { "epoch": 0.30364963503649633, "grad_norm": 2.921875, "learning_rate": 4.876224916090762e-06, "loss": 1.9160549640655518, "step": 208 }, { "epoch": 0.30656934306569344, "grad_norm": 4.34375, "learning_rate": 4.872645711674348e-06, "loss": 1.646159291267395, "step": 210 }, { "epoch": 0.3094890510948905, "grad_norm": 1.625, "learning_rate": 4.8690169985249516e-06, "loss": 1.1048507690429688, "step": 212 }, { "epoch": 0.3124087591240876, "grad_norm": 1.5625, "learning_rate": 4.865338861278925e-06, "loss": 1.0736052989959717, "step": 214 }, { "epoch": 0.31532846715328466, "grad_norm": 3.59375, "learning_rate": 4.8616113857253925e-06, "loss": 1.2035229206085205, "step": 216 }, { "epoch": 0.3182481751824818, "grad_norm": 23.625, "learning_rate": 4.857834658804247e-06, "loss": 1.137906789779663, "step": 218 }, { "epoch": 0.32116788321167883, "grad_norm": 4.5625, "learning_rate": 4.8540087686041234e-06, "loss": 1.7008376121520996, "step": 220 }, { "epoch": 0.32408759124087594, "grad_norm": 8.75, "learning_rate": 4.850133804360346e-06, "loss": 1.6337850093841553, "step": 222 }, { "epoch": 0.327007299270073, "grad_norm": 3.984375, "learning_rate": 4.8462098564528455e-06, "loss": 1.1808865070343018, "step": 224 }, { "epoch": 0.32992700729927005, "grad_norm": 3.59375, "learning_rate": 4.842237016404048e-06, "loss": 1.5622849464416504, "step": 226 }, { "epoch": 0.33284671532846716, "grad_norm": 1.1875, "learning_rate": 4.838215376876744e-06, "loss": 1.1768817901611328, "step": 228 }, { "epoch": 0.3357664233576642, "grad_norm": 6.0, "learning_rate": 4.834145031671931e-06, "loss": 1.3726277351379395, "step": 230 }, { "epoch": 0.3386861313868613, "grad_norm": 28.375, "learning_rate": 4.830026075726615e-06, "loss": 1.1469438076019287, "step": 232 }, { "epoch": 0.3416058394160584, "grad_norm": 3.421875, "learning_rate": 4.8258586051116045e-06, "loss": 1.5012977123260498, "step": 234 }, { "epoch": 0.3445255474452555, "grad_norm": 12.9375, "learning_rate": 4.821642717029269e-06, "loss": 1.6817822456359863, "step": 236 }, { "epoch": 0.34744525547445254, "grad_norm": 5.0625, "learning_rate": 4.8173785098112675e-06, "loss": 1.525681495666504, "step": 238 }, { "epoch": 0.35036496350364965, "grad_norm": 15.4375, "learning_rate": 4.81306608291626e-06, "loss": 2.0758631229400635, "step": 240 }, { "epoch": 0.3532846715328467, "grad_norm": 3.25, "learning_rate": 4.808705536927586e-06, "loss": 1.4310352802276611, "step": 242 }, { "epoch": 0.3562043795620438, "grad_norm": 3.28125, "learning_rate": 4.804296973550915e-06, "loss": 1.6908133029937744, "step": 244 }, { "epoch": 0.35912408759124087, "grad_norm": 3.15625, "learning_rate": 4.799840495611879e-06, "loss": 1.2480230331420898, "step": 246 }, { "epoch": 0.362043795620438, "grad_norm": 2.75, "learning_rate": 4.795336207053674e-06, "loss": 1.5943894386291504, "step": 248 }, { "epoch": 0.36496350364963503, "grad_norm": 3.953125, "learning_rate": 4.790784212934631e-06, "loss": 1.1932544708251953, "step": 250 }, { "epoch": 0.3678832116788321, "grad_norm": 5.53125, "learning_rate": 4.786184619425773e-06, "loss": 1.4538475275039673, "step": 252 }, { "epoch": 0.3708029197080292, "grad_norm": 6.1875, "learning_rate": 4.781537533808331e-06, "loss": 1.7138783931732178, "step": 254 }, { "epoch": 0.37372262773722625, "grad_norm": 1.609375, "learning_rate": 4.7768430644712435e-06, "loss": 1.37872314453125, "step": 256 }, { "epoch": 0.37664233576642336, "grad_norm": 6.25, "learning_rate": 4.772101320908636e-06, "loss": 1.4937684535980225, "step": 258 }, { "epoch": 0.3795620437956204, "grad_norm": 8.8125, "learning_rate": 4.767312413717256e-06, "loss": 1.4460338354110718, "step": 260 }, { "epoch": 0.38248175182481753, "grad_norm": 4.28125, "learning_rate": 4.7624764545939015e-06, "loss": 1.4206737279891968, "step": 262 }, { "epoch": 0.3854014598540146, "grad_norm": 2.671875, "learning_rate": 4.757593556332811e-06, "loss": 1.3555597066879272, "step": 264 }, { "epoch": 0.3883211678832117, "grad_norm": 3.1875, "learning_rate": 4.752663832823038e-06, "loss": 1.6055470705032349, "step": 266 }, { "epoch": 0.39124087591240875, "grad_norm": 4.09375, "learning_rate": 4.747687399045787e-06, "loss": 1.3127577304840088, "step": 268 }, { "epoch": 0.39416058394160586, "grad_norm": 5.40625, "learning_rate": 4.7426643710717386e-06, "loss": 1.6612601280212402, "step": 270 }, { "epoch": 0.3970802919708029, "grad_norm": 5.34375, "learning_rate": 4.737594866058339e-06, "loss": 1.2799599170684814, "step": 272 }, { "epoch": 0.4, "grad_norm": 6.71875, "learning_rate": 4.7324790022470675e-06, "loss": 1.9163275957107544, "step": 274 }, { "epoch": 0.4029197080291971, "grad_norm": 2.328125, "learning_rate": 4.727316898960681e-06, "loss": 1.4439561367034912, "step": 276 }, { "epoch": 0.4058394160583942, "grad_norm": 10.6875, "learning_rate": 4.722108676600427e-06, "loss": 1.2920876741409302, "step": 278 }, { "epoch": 0.40875912408759124, "grad_norm": 3.671875, "learning_rate": 4.7168544566432365e-06, "loss": 1.691207766532898, "step": 280 }, { "epoch": 0.4116788321167883, "grad_norm": 3.21875, "learning_rate": 4.711554361638896e-06, "loss": 1.527019739151001, "step": 282 }, { "epoch": 0.4145985401459854, "grad_norm": 3.1875, "learning_rate": 4.70620851520718e-06, "loss": 1.4309567213058472, "step": 284 }, { "epoch": 0.41751824817518246, "grad_norm": 2.390625, "learning_rate": 4.7008170420349746e-06, "loss": 1.2672343254089355, "step": 286 }, { "epoch": 0.42043795620437957, "grad_norm": 1.765625, "learning_rate": 4.695380067873368e-06, "loss": 1.3927721977233887, "step": 288 }, { "epoch": 0.4233576642335766, "grad_norm": 2.75, "learning_rate": 4.689897719534715e-06, "loss": 1.5347919464111328, "step": 290 }, { "epoch": 0.42627737226277373, "grad_norm": 4.5625, "learning_rate": 4.68437012488968e-06, "loss": 1.2839910984039307, "step": 292 }, { "epoch": 0.4291970802919708, "grad_norm": 48.25, "learning_rate": 4.678797412864258e-06, "loss": 1.3073639869689941, "step": 294 }, { "epoch": 0.4321167883211679, "grad_norm": 4.1875, "learning_rate": 4.673179713436762e-06, "loss": 1.5608128309249878, "step": 296 }, { "epoch": 0.43503649635036495, "grad_norm": 2.875, "learning_rate": 4.667517157634797e-06, "loss": 1.6924610137939453, "step": 298 }, { "epoch": 0.43795620437956206, "grad_norm": 3.515625, "learning_rate": 4.6618098775322e-06, "loss": 1.218139886856079, "step": 300 }, { "epoch": 0.4408759124087591, "grad_norm": 5.34375, "learning_rate": 4.656058006245959e-06, "loss": 1.4968738555908203, "step": 302 }, { "epoch": 0.44379562043795623, "grad_norm": 6.59375, "learning_rate": 4.650261677933111e-06, "loss": 1.522092580795288, "step": 304 }, { "epoch": 0.4467153284671533, "grad_norm": 3.109375, "learning_rate": 4.644421027787614e-06, "loss": 1.15757155418396, "step": 306 }, { "epoch": 0.44963503649635034, "grad_norm": 2.5, "learning_rate": 4.638536192037186e-06, "loss": 1.0606379508972168, "step": 308 }, { "epoch": 0.45255474452554745, "grad_norm": 10.375, "learning_rate": 4.63260730794014e-06, "loss": 1.674492597579956, "step": 310 }, { "epoch": 0.4554744525547445, "grad_norm": 3.421875, "learning_rate": 4.62663451378217e-06, "loss": 1.4489834308624268, "step": 312 }, { "epoch": 0.4583941605839416, "grad_norm": 1.6640625, "learning_rate": 4.620617948873133e-06, "loss": 1.4036529064178467, "step": 314 }, { "epoch": 0.46131386861313867, "grad_norm": 4.21875, "learning_rate": 4.6145577535438004e-06, "loss": 1.482384204864502, "step": 316 }, { "epoch": 0.4642335766423358, "grad_norm": 2.8125, "learning_rate": 4.608454069142578e-06, "loss": 1.4590518474578857, "step": 318 }, { "epoch": 0.46715328467153283, "grad_norm": 4.53125, "learning_rate": 4.602307038032216e-06, "loss": 1.7169837951660156, "step": 320 }, { "epoch": 0.47007299270072994, "grad_norm": 4.75, "learning_rate": 4.596116803586487e-06, "loss": 1.5060232877731323, "step": 322 }, { "epoch": 0.472992700729927, "grad_norm": 2.828125, "learning_rate": 4.5898835101868415e-06, "loss": 1.4886112213134766, "step": 324 }, { "epoch": 0.4759124087591241, "grad_norm": 1.7265625, "learning_rate": 4.583607303219037e-06, "loss": 1.4076815843582153, "step": 326 }, { "epoch": 0.47883211678832116, "grad_norm": 10.4375, "learning_rate": 4.577288329069753e-06, "loss": 1.5618150234222412, "step": 328 }, { "epoch": 0.48175182481751827, "grad_norm": 4.75, "learning_rate": 4.570926735123171e-06, "loss": 1.274332046508789, "step": 330 }, { "epoch": 0.4846715328467153, "grad_norm": 4.4375, "learning_rate": 4.564522669757543e-06, "loss": 1.4747687578201294, "step": 332 }, { "epoch": 0.48759124087591244, "grad_norm": 6.40625, "learning_rate": 4.558076282341723e-06, "loss": 1.653844952583313, "step": 334 }, { "epoch": 0.4905109489051095, "grad_norm": 39.5, "learning_rate": 4.551587723231692e-06, "loss": 1.0735116004943848, "step": 336 }, { "epoch": 0.49343065693430654, "grad_norm": 36.0, "learning_rate": 4.545057143767042e-06, "loss": 1.6714699268341064, "step": 338 }, { "epoch": 0.49635036496350365, "grad_norm": 4.15625, "learning_rate": 4.538484696267453e-06, "loss": 1.4629170894622803, "step": 340 }, { "epoch": 0.4992700729927007, "grad_norm": 10.3125, "learning_rate": 4.5318705340291394e-06, "loss": 1.5702762603759766, "step": 342 }, { "epoch": 0.5021897810218978, "grad_norm": 4.96875, "learning_rate": 4.525214811321269e-06, "loss": 1.5001425743103027, "step": 344 }, { "epoch": 0.5051094890510949, "grad_norm": 5.0625, "learning_rate": 4.518517683382373e-06, "loss": 1.4789342880249023, "step": 346 }, { "epoch": 0.5080291970802919, "grad_norm": 4.15625, "learning_rate": 4.511779306416716e-06, "loss": 1.4476077556610107, "step": 348 }, { "epoch": 0.5109489051094891, "grad_norm": 1.703125, "learning_rate": 4.504999837590665e-06, "loss": 1.1996196508407593, "step": 350 }, { "epoch": 0.5138686131386861, "grad_norm": 4.1875, "learning_rate": 4.49817943502901e-06, "loss": 1.532009482383728, "step": 352 }, { "epoch": 0.5167883211678832, "grad_norm": 1.65625, "learning_rate": 4.4913182578112815e-06, "loss": 1.2889015674591064, "step": 354 }, { "epoch": 0.5197080291970803, "grad_norm": 1.640625, "learning_rate": 4.484416465968049e-06, "loss": 1.3533192873001099, "step": 356 }, { "epoch": 0.5226277372262774, "grad_norm": 4.3125, "learning_rate": 4.477474220477172e-06, "loss": 1.4686871767044067, "step": 358 }, { "epoch": 0.5255474452554745, "grad_norm": 3.78125, "learning_rate": 4.470491683260056e-06, "loss": 1.4659610986709595, "step": 360 }, { "epoch": 0.5284671532846715, "grad_norm": 2.46875, "learning_rate": 4.463469017177876e-06, "loss": 1.487034797668457, "step": 362 }, { "epoch": 0.5313868613138686, "grad_norm": 3.3125, "learning_rate": 4.456406386027772e-06, "loss": 1.1844420433044434, "step": 364 }, { "epoch": 0.5343065693430656, "grad_norm": 7.34375, "learning_rate": 4.4493039545390345e-06, "loss": 1.5557405948638916, "step": 366 }, { "epoch": 0.5372262773722628, "grad_norm": 2.984375, "learning_rate": 4.442161888369258e-06, "loss": 1.3480842113494873, "step": 368 }, { "epoch": 0.5401459854014599, "grad_norm": 2.90625, "learning_rate": 4.43498035410048e-06, "loss": 1.2928515672683716, "step": 370 }, { "epoch": 0.5430656934306569, "grad_norm": 7.3125, "learning_rate": 4.427759519235294e-06, "loss": 1.7453609704971313, "step": 372 }, { "epoch": 0.545985401459854, "grad_norm": 2.640625, "learning_rate": 4.420499552192944e-06, "loss": 1.4482967853546143, "step": 374 }, { "epoch": 0.5489051094890511, "grad_norm": 2.0, "learning_rate": 4.413200622305395e-06, "loss": 1.6135839223861694, "step": 376 }, { "epoch": 0.5518248175182482, "grad_norm": 13.9375, "learning_rate": 4.405862899813384e-06, "loss": 1.570212483406067, "step": 378 }, { "epoch": 0.5547445255474452, "grad_norm": 1.3671875, "learning_rate": 4.398486555862451e-06, "loss": 1.298504114151001, "step": 380 }, { "epoch": 0.5576642335766423, "grad_norm": 7.8125, "learning_rate": 4.391071762498941e-06, "loss": 1.4520879983901978, "step": 382 }, { "epoch": 0.5605839416058395, "grad_norm": 14.8125, "learning_rate": 4.383618692666002e-06, "loss": 1.3408211469650269, "step": 384 }, { "epoch": 0.5635036496350365, "grad_norm": 3.375, "learning_rate": 4.376127520199541e-06, "loss": 1.4031929969787598, "step": 386 }, { "epoch": 0.5664233576642336, "grad_norm": 4.03125, "learning_rate": 4.3685984198241735e-06, "loss": 1.5412940979003906, "step": 388 }, { "epoch": 0.5693430656934306, "grad_norm": 6.78125, "learning_rate": 4.361031567149149e-06, "loss": 1.3730320930480957, "step": 390 }, { "epoch": 0.5722627737226277, "grad_norm": 7.28125, "learning_rate": 4.353427138664254e-06, "loss": 1.3442788124084473, "step": 392 }, { "epoch": 0.5751824817518248, "grad_norm": 6.90625, "learning_rate": 4.345785311735698e-06, "loss": 1.4140475988388062, "step": 394 }, { "epoch": 0.5781021897810219, "grad_norm": 6.25, "learning_rate": 4.3381062646019676e-06, "loss": 1.5376839637756348, "step": 396 }, { "epoch": 0.581021897810219, "grad_norm": 4.25, "learning_rate": 4.330390176369685e-06, "loss": 1.5938429832458496, "step": 398 }, { "epoch": 0.583941605839416, "grad_norm": 1.546875, "learning_rate": 4.322637227009414e-06, "loss": 1.1486091613769531, "step": 400 }, { "epoch": 0.5868613138686132, "grad_norm": 3.578125, "learning_rate": 4.314847597351475e-06, "loss": 1.452984094619751, "step": 402 }, { "epoch": 0.5897810218978102, "grad_norm": 3.953125, "learning_rate": 4.3070214690817195e-06, "loss": 1.4647376537322998, "step": 404 }, { "epoch": 0.5927007299270073, "grad_norm": 2.203125, "learning_rate": 4.299159024737295e-06, "loss": 1.2110595703125, "step": 406 }, { "epoch": 0.5956204379562043, "grad_norm": 4.1875, "learning_rate": 4.291260447702389e-06, "loss": 1.3485263586044312, "step": 408 }, { "epoch": 0.5985401459854015, "grad_norm": 5.25, "learning_rate": 4.283325922203949e-06, "loss": 1.3334099054336548, "step": 410 }, { "epoch": 0.6014598540145986, "grad_norm": 2.0625, "learning_rate": 4.2753556333073875e-06, "loss": 1.2992541790008545, "step": 412 }, { "epoch": 0.6043795620437956, "grad_norm": 8.3125, "learning_rate": 4.267349766912266e-06, "loss": 1.3331689834594727, "step": 414 }, { "epoch": 0.6072992700729927, "grad_norm": 3.71875, "learning_rate": 4.259308509747955e-06, "loss": 1.4391039609909058, "step": 416 }, { "epoch": 0.6102189781021898, "grad_norm": 9.6875, "learning_rate": 4.251232049369287e-06, "loss": 1.145450472831726, "step": 418 }, { "epoch": 0.6131386861313869, "grad_norm": 10.875, "learning_rate": 4.243120574152169e-06, "loss": 1.5916063785552979, "step": 420 }, { "epoch": 0.6160583941605839, "grad_norm": 4.75, "learning_rate": 4.234974273289204e-06, "loss": 1.619133710861206, "step": 422 }, { "epoch": 0.618978102189781, "grad_norm": 4.375, "learning_rate": 4.226793336785265e-06, "loss": 1.4133093357086182, "step": 424 }, { "epoch": 0.621897810218978, "grad_norm": 6.03125, "learning_rate": 4.218577955453074e-06, "loss": 1.253399133682251, "step": 426 }, { "epoch": 0.6248175182481752, "grad_norm": 4.6875, "learning_rate": 4.210328320908744e-06, "loss": 1.4635814428329468, "step": 428 }, { "epoch": 0.6277372262773723, "grad_norm": 2.875, "learning_rate": 4.20204462556731e-06, "loss": 1.3652441501617432, "step": 430 }, { "epoch": 0.6306569343065693, "grad_norm": 8.9375, "learning_rate": 4.193727062638247e-06, "loss": 1.5560953617095947, "step": 432 }, { "epoch": 0.6335766423357664, "grad_norm": 3.53125, "learning_rate": 4.18537582612096e-06, "loss": 1.4227533340454102, "step": 434 }, { "epoch": 0.6364963503649635, "grad_norm": 3.265625, "learning_rate": 4.176991110800256e-06, "loss": 1.2683900594711304, "step": 436 }, { "epoch": 0.6394160583941606, "grad_norm": 14.1875, "learning_rate": 4.168573112241805e-06, "loss": 1.2102452516555786, "step": 438 }, { "epoch": 0.6423357664233577, "grad_norm": 4.84375, "learning_rate": 4.16012202678758e-06, "loss": 1.2587625980377197, "step": 440 }, { "epoch": 0.6452554744525547, "grad_norm": 5.46875, "learning_rate": 4.1516380515512705e-06, "loss": 1.410897970199585, "step": 442 }, { "epoch": 0.6481751824817519, "grad_norm": 1.78125, "learning_rate": 4.143121384413695e-06, "loss": 1.4373693466186523, "step": 444 }, { "epoch": 0.6510948905109489, "grad_norm": 2.78125, "learning_rate": 4.134572224018176e-06, "loss": 1.4430195093154907, "step": 446 }, { "epoch": 0.654014598540146, "grad_norm": 7.90625, "learning_rate": 4.125990769765911e-06, "loss": 1.4238855838775635, "step": 448 }, { "epoch": 0.656934306569343, "grad_norm": 2.25, "learning_rate": 4.117377221811324e-06, "loss": 1.4734668731689453, "step": 450 }, { "epoch": 0.6598540145985401, "grad_norm": 2.734375, "learning_rate": 4.108731781057393e-06, "loss": 1.5210154056549072, "step": 452 }, { "epoch": 0.6627737226277373, "grad_norm": 1.25, "learning_rate": 4.100054649150967e-06, "loss": 1.237725019454956, "step": 454 }, { "epoch": 0.6656934306569343, "grad_norm": 3.953125, "learning_rate": 4.091346028478059e-06, "loss": 1.4640438556671143, "step": 456 }, { "epoch": 0.6686131386861314, "grad_norm": 9.0, "learning_rate": 4.0826061221591326e-06, "loss": 1.105014681816101, "step": 458 }, { "epoch": 0.6715328467153284, "grad_norm": 42.25, "learning_rate": 4.073835134044356e-06, "loss": 1.4338090419769287, "step": 460 }, { "epoch": 0.6744525547445256, "grad_norm": 5.90625, "learning_rate": 4.065033268708854e-06, "loss": 1.3917622566223145, "step": 462 }, { "epoch": 0.6773722627737226, "grad_norm": 3.359375, "learning_rate": 4.056200731447929e-06, "loss": 1.0591514110565186, "step": 464 }, { "epoch": 0.6802919708029197, "grad_norm": 4.625, "learning_rate": 4.0473377282722845e-06, "loss": 1.4084625244140625, "step": 466 }, { "epoch": 0.6832116788321168, "grad_norm": 3.734375, "learning_rate": 4.038444465903208e-06, "loss": 1.4596691131591797, "step": 468 }, { "epoch": 0.6861313868613139, "grad_norm": 11.125, "learning_rate": 4.029521151767757e-06, "loss": 1.2422056198120117, "step": 470 }, { "epoch": 0.689051094890511, "grad_norm": 4.4375, "learning_rate": 4.0205679939939164e-06, "loss": 1.33591628074646, "step": 472 }, { "epoch": 0.691970802919708, "grad_norm": 2.21875, "learning_rate": 4.011585201405747e-06, "loss": 1.2504942417144775, "step": 474 }, { "epoch": 0.6948905109489051, "grad_norm": 3.6875, "learning_rate": 4.002572983518515e-06, "loss": 1.2631410360336304, "step": 476 }, { "epoch": 0.6978102189781021, "grad_norm": 5.8125, "learning_rate": 3.993531550533804e-06, "loss": 1.3914625644683838, "step": 478 }, { "epoch": 0.7007299270072993, "grad_norm": 20.0, "learning_rate": 3.98446111333461e-06, "loss": 1.288975715637207, "step": 480 }, { "epoch": 0.7036496350364964, "grad_norm": 3.234375, "learning_rate": 3.9753618834804295e-06, "loss": 1.4152731895446777, "step": 482 }, { "epoch": 0.7065693430656934, "grad_norm": 5.71875, "learning_rate": 3.966234073202316e-06, "loss": 1.316530466079712, "step": 484 }, { "epoch": 0.7094890510948905, "grad_norm": 56.5, "learning_rate": 3.957077895397941e-06, "loss": 1.3749709129333496, "step": 486 }, { "epoch": 0.7124087591240876, "grad_norm": 1.734375, "learning_rate": 3.947893563626615e-06, "loss": 1.2120707035064697, "step": 488 }, { "epoch": 0.7153284671532847, "grad_norm": 3.546875, "learning_rate": 3.93868129210432e-06, "loss": 1.4016718864440918, "step": 490 }, { "epoch": 0.7182481751824817, "grad_norm": 8.8125, "learning_rate": 3.929441295698702e-06, "loss": 1.154693841934204, "step": 492 }, { "epoch": 0.7211678832116788, "grad_norm": 3.640625, "learning_rate": 3.920173789924065e-06, "loss": 1.334530234336853, "step": 494 }, { "epoch": 0.724087591240876, "grad_norm": 1.921875, "learning_rate": 3.910878990936346e-06, "loss": 1.3103371858596802, "step": 496 }, { "epoch": 0.727007299270073, "grad_norm": 2.84375, "learning_rate": 3.901557115528069e-06, "loss": 1.244321584701538, "step": 498 }, { "epoch": 0.7299270072992701, "grad_norm": 4.40625, "learning_rate": 3.892208381123289e-06, "loss": 1.4268873929977417, "step": 500 }, { "epoch": 0.7328467153284671, "grad_norm": 1.4765625, "learning_rate": 3.8828330057725225e-06, "loss": 1.3552806377410889, "step": 502 }, { "epoch": 0.7357664233576642, "grad_norm": 3.65625, "learning_rate": 3.873431208147664e-06, "loss": 1.6077991724014282, "step": 504 }, { "epoch": 0.7386861313868613, "grad_norm": 2.21875, "learning_rate": 3.864003207536879e-06, "loss": 1.2244906425476074, "step": 506 }, { "epoch": 0.7416058394160584, "grad_norm": 2.265625, "learning_rate": 3.854549223839497e-06, "loss": 1.0374276638031006, "step": 508 }, { "epoch": 0.7445255474452555, "grad_norm": 6.96875, "learning_rate": 3.845069477560876e-06, "loss": 1.547581434249878, "step": 510 }, { "epoch": 0.7474452554744525, "grad_norm": 2.203125, "learning_rate": 3.835564189807263e-06, "loss": 1.225568175315857, "step": 512 }, { "epoch": 0.7503649635036497, "grad_norm": 4.09375, "learning_rate": 3.826033582280635e-06, "loss": 1.2825735807418823, "step": 514 }, { "epoch": 0.7532846715328467, "grad_norm": 2.96875, "learning_rate": 3.816477877273533e-06, "loss": 1.430619716644287, "step": 516 }, { "epoch": 0.7562043795620438, "grad_norm": 10.9375, "learning_rate": 3.8068972976638703e-06, "loss": 1.489488124847412, "step": 518 }, { "epoch": 0.7591240875912408, "grad_norm": 4.3125, "learning_rate": 3.797292066909734e-06, "loss": 0.8555082082748413, "step": 520 }, { "epoch": 0.762043795620438, "grad_norm": 3.703125, "learning_rate": 3.787662409044184e-06, "loss": 1.3753139972686768, "step": 522 }, { "epoch": 0.7649635036496351, "grad_norm": 8.0, "learning_rate": 3.7780085486700126e-06, "loss": 1.6844412088394165, "step": 524 }, { "epoch": 0.7678832116788321, "grad_norm": 5.25, "learning_rate": 3.768330710954517e-06, "loss": 1.592594027519226, "step": 526 }, { "epoch": 0.7708029197080292, "grad_norm": 1.5, "learning_rate": 3.7586291216242433e-06, "loss": 1.2550559043884277, "step": 528 }, { "epoch": 0.7737226277372263, "grad_norm": 3.953125, "learning_rate": 3.748904006959719e-06, "loss": 1.1512435674667358, "step": 530 }, { "epoch": 0.7766423357664234, "grad_norm": 10.375, "learning_rate": 3.739155593790182e-06, "loss": 1.5256032943725586, "step": 532 }, { "epoch": 0.7795620437956204, "grad_norm": 10.75, "learning_rate": 3.729384109488282e-06, "loss": 1.6810424327850342, "step": 534 }, { "epoch": 0.7824817518248175, "grad_norm": 3.734375, "learning_rate": 3.719589781964787e-06, "loss": 1.4392688274383545, "step": 536 }, { "epoch": 0.7854014598540145, "grad_norm": 4.125, "learning_rate": 3.7097728396632555e-06, "loss": 1.4172781705856323, "step": 538 }, { "epoch": 0.7883211678832117, "grad_norm": 4.125, "learning_rate": 3.6999335115547185e-06, "loss": 1.401853322982788, "step": 540 }, { "epoch": 0.7912408759124088, "grad_norm": 6.375, "learning_rate": 3.690072027132335e-06, "loss": 1.534106731414795, "step": 542 }, { "epoch": 0.7941605839416058, "grad_norm": 5.0, "learning_rate": 3.680188616406037e-06, "loss": 1.629064679145813, "step": 544 }, { "epoch": 0.7970802919708029, "grad_norm": 3.5625, "learning_rate": 3.6702835098971706e-06, "loss": 1.5794017314910889, "step": 546 }, { "epoch": 0.8, "grad_norm": 7.90625, "learning_rate": 3.6603569386331122e-06, "loss": 1.556319236755371, "step": 548 }, { "epoch": 0.8029197080291971, "grad_norm": 5.125, "learning_rate": 3.6504091341418853e-06, "loss": 1.5984359979629517, "step": 550 }, { "epoch": 0.8058394160583942, "grad_norm": 4.5, "learning_rate": 3.640440328446759e-06, "loss": 1.5283421277999878, "step": 552 }, { "epoch": 0.8087591240875912, "grad_norm": 6.75, "learning_rate": 3.6304507540608357e-06, "loss": 1.383811116218567, "step": 554 }, { "epoch": 0.8116788321167884, "grad_norm": 3.640625, "learning_rate": 3.620440643981629e-06, "loss": 1.3146003484725952, "step": 556 }, { "epoch": 0.8145985401459854, "grad_norm": 4.125, "learning_rate": 3.6104102316856255e-06, "loss": 1.4131672382354736, "step": 558 }, { "epoch": 0.8175182481751825, "grad_norm": 13.25, "learning_rate": 3.600359751122845e-06, "loss": 1.549619197845459, "step": 560 }, { "epoch": 0.8204379562043795, "grad_norm": 2.796875, "learning_rate": 3.590289436711379e-06, "loss": 1.5269279479980469, "step": 562 }, { "epoch": 0.8233576642335766, "grad_norm": 3.046875, "learning_rate": 3.5801995233319265e-06, "loss": 1.3862372636795044, "step": 564 }, { "epoch": 0.8262773722627738, "grad_norm": 2.484375, "learning_rate": 3.5700902463223137e-06, "loss": 1.2330877780914307, "step": 566 }, { "epoch": 0.8291970802919708, "grad_norm": 7.125, "learning_rate": 3.559961841472005e-06, "loss": 1.4884552955627441, "step": 568 }, { "epoch": 0.8321167883211679, "grad_norm": 3.28125, "learning_rate": 3.5498145450166057e-06, "loss": 1.3787778615951538, "step": 570 }, { "epoch": 0.8350364963503649, "grad_norm": 3.609375, "learning_rate": 3.5396485936323456e-06, "loss": 1.3882396221160889, "step": 572 }, { "epoch": 0.8379562043795621, "grad_norm": 3.15625, "learning_rate": 3.529464224430568e-06, "loss": 1.3656411170959473, "step": 574 }, { "epoch": 0.8408759124087591, "grad_norm": 5.65625, "learning_rate": 3.5192616749521942e-06, "loss": 1.5140806436538696, "step": 576 }, { "epoch": 0.8437956204379562, "grad_norm": 4.5, "learning_rate": 3.5090411831621803e-06, "loss": 1.5188113451004028, "step": 578 }, { "epoch": 0.8467153284671532, "grad_norm": 2.671875, "learning_rate": 3.498802987443974e-06, "loss": 1.3665883541107178, "step": 580 }, { "epoch": 0.8496350364963504, "grad_norm": 5.25, "learning_rate": 3.4885473265939464e-06, "loss": 1.383296012878418, "step": 582 }, { "epoch": 0.8525547445255475, "grad_norm": 2.71875, "learning_rate": 3.478274439815831e-06, "loss": 1.2266430854797363, "step": 584 }, { "epoch": 0.8554744525547445, "grad_norm": 3.9375, "learning_rate": 3.467984566715137e-06, "loss": 1.5247292518615723, "step": 586 }, { "epoch": 0.8583941605839416, "grad_norm": 4.125, "learning_rate": 3.4576779472935644e-06, "loss": 1.4203873872756958, "step": 588 }, { "epoch": 0.8613138686131386, "grad_norm": 2.46875, "learning_rate": 3.447354821943407e-06, "loss": 1.222019076347351, "step": 590 }, { "epoch": 0.8642335766423358, "grad_norm": 4.8125, "learning_rate": 3.4370154314419395e-06, "loss": 1.2593979835510254, "step": 592 }, { "epoch": 0.8671532846715329, "grad_norm": 3.21875, "learning_rate": 3.4266600169458135e-06, "loss": 1.22776460647583, "step": 594 }, { "epoch": 0.8700729927007299, "grad_norm": 2.703125, "learning_rate": 3.4162888199854182e-06, "loss": 1.2717225551605225, "step": 596 }, { "epoch": 0.872992700729927, "grad_norm": 1.2890625, "learning_rate": 3.405902082459259e-06, "loss": 1.0713449716567993, "step": 598 }, { "epoch": 0.8759124087591241, "grad_norm": 3.453125, "learning_rate": 3.3955000466283073e-06, "loss": 1.2096487283706665, "step": 600 }, { "epoch": 0.8788321167883212, "grad_norm": 2.03125, "learning_rate": 3.385082955110355e-06, "loss": 1.2699155807495117, "step": 602 }, { "epoch": 0.8817518248175182, "grad_norm": 2.328125, "learning_rate": 3.3746510508743533e-06, "loss": 1.3786303997039795, "step": 604 }, { "epoch": 0.8846715328467153, "grad_norm": 5.53125, "learning_rate": 3.3642045772347453e-06, "loss": 1.3685808181762695, "step": 606 }, { "epoch": 0.8875912408759125, "grad_norm": 9.0625, "learning_rate": 3.353743777845795e-06, "loss": 1.178727626800537, "step": 608 }, { "epoch": 0.8905109489051095, "grad_norm": 4.1875, "learning_rate": 3.343268896695897e-06, "loss": 1.383094310760498, "step": 610 }, { "epoch": 0.8934306569343066, "grad_norm": 3.359375, "learning_rate": 3.3327801781018925e-06, "loss": 1.4056508541107178, "step": 612 }, { "epoch": 0.8963503649635036, "grad_norm": 4.65625, "learning_rate": 3.322277866703367e-06, "loss": 1.5974513292312622, "step": 614 }, { "epoch": 0.8992700729927007, "grad_norm": 1.1875, "learning_rate": 3.3117622074569476e-06, "loss": 1.1610685586929321, "step": 616 }, { "epoch": 0.9021897810218978, "grad_norm": 10.75, "learning_rate": 3.3012334456305846e-06, "loss": 0.901719331741333, "step": 618 }, { "epoch": 0.9051094890510949, "grad_norm": 8.3125, "learning_rate": 3.2906918267978355e-06, "loss": 1.2409268617630005, "step": 620 }, { "epoch": 0.908029197080292, "grad_norm": 3.453125, "learning_rate": 3.2801375968321355e-06, "loss": 1.4349682331085205, "step": 622 }, { "epoch": 0.910948905109489, "grad_norm": 6.875, "learning_rate": 3.269571001901061e-06, "loss": 1.3277549743652344, "step": 624 }, { "epoch": 0.9138686131386862, "grad_norm": 5.1875, "learning_rate": 3.2589922884605924e-06, "loss": 1.3614181280136108, "step": 626 }, { "epoch": 0.9167883211678832, "grad_norm": 9.125, "learning_rate": 3.2484017032493615e-06, "loss": 1.705947756767273, "step": 628 }, { "epoch": 0.9197080291970803, "grad_norm": 4.0, "learning_rate": 3.237799493282897e-06, "loss": 1.3996449708938599, "step": 630 }, { "epoch": 0.9226277372262773, "grad_norm": 2.75, "learning_rate": 3.2271859058478666e-06, "loss": 1.4013357162475586, "step": 632 }, { "epoch": 0.9255474452554745, "grad_norm": 3.46875, "learning_rate": 3.2165611884963055e-06, "loss": 1.2193137407302856, "step": 634 }, { "epoch": 0.9284671532846716, "grad_norm": 2.421875, "learning_rate": 3.2059255890398445e-06, "loss": 0.9855245351791382, "step": 636 }, { "epoch": 0.9313868613138686, "grad_norm": 3.59375, "learning_rate": 3.1952793555439276e-06, "loss": 1.4272806644439697, "step": 638 }, { "epoch": 0.9343065693430657, "grad_norm": 3.421875, "learning_rate": 3.18462273632203e-06, "loss": 1.1866121292114258, "step": 640 }, { "epoch": 0.9372262773722628, "grad_norm": 6.84375, "learning_rate": 3.173955979929863e-06, "loss": 1.385930061340332, "step": 642 }, { "epoch": 0.9401459854014599, "grad_norm": 1.8125, "learning_rate": 3.163279335159578e-06, "loss": 1.283376932144165, "step": 644 }, { "epoch": 0.9430656934306569, "grad_norm": 5.0625, "learning_rate": 3.152593051033966e-06, "loss": 1.368044376373291, "step": 646 }, { "epoch": 0.945985401459854, "grad_norm": 14.0625, "learning_rate": 3.1418973768006424e-06, "loss": 0.6849503517150879, "step": 648 }, { "epoch": 0.948905109489051, "grad_norm": 2.140625, "learning_rate": 3.1311925619262417e-06, "loss": 1.3481240272521973, "step": 650 }, { "epoch": 0.9518248175182482, "grad_norm": 3.234375, "learning_rate": 3.1204788560905935e-06, "loss": 1.390141248703003, "step": 652 }, { "epoch": 0.9547445255474453, "grad_norm": 8.8125, "learning_rate": 3.1097565091809033e-06, "loss": 1.3187050819396973, "step": 654 }, { "epoch": 0.9576642335766423, "grad_norm": 12.125, "learning_rate": 3.0990257712859184e-06, "loss": 1.3746651411056519, "step": 656 }, { "epoch": 0.9605839416058394, "grad_norm": 7.09375, "learning_rate": 3.0882868926901e-06, "loss": 1.2352771759033203, "step": 658 }, { "epoch": 0.9635036496350365, "grad_norm": 3.46875, "learning_rate": 3.077540123867783e-06, "loss": 1.328325629234314, "step": 660 }, { "epoch": 0.9664233576642336, "grad_norm": 3.46875, "learning_rate": 3.066785715477334e-06, "loss": 1.2275207042694092, "step": 662 }, { "epoch": 0.9693430656934306, "grad_norm": 2.4375, "learning_rate": 3.056023918355307e-06, "loss": 1.335202693939209, "step": 664 }, { "epoch": 0.9722627737226277, "grad_norm": 6.5, "learning_rate": 3.0452549835105895e-06, "loss": 1.4829626083374023, "step": 666 }, { "epoch": 0.9751824817518249, "grad_norm": 34.0, "learning_rate": 3.03447916211855e-06, "loss": 1.5850169658660889, "step": 668 }, { "epoch": 0.9781021897810219, "grad_norm": 6.5, "learning_rate": 3.0236967055151804e-06, "loss": 1.671141266822815, "step": 670 }, { "epoch": 0.981021897810219, "grad_norm": 23.125, "learning_rate": 3.0129078651912317e-06, "loss": 1.300727128982544, "step": 672 }, { "epoch": 0.983941605839416, "grad_norm": 8.875, "learning_rate": 3.00211289278635e-06, "loss": 1.4001004695892334, "step": 674 }, { "epoch": 0.9868613138686131, "grad_norm": 8.875, "learning_rate": 2.991312040083206e-06, "loss": 0.47176289558410645, "step": 676 }, { "epoch": 0.9897810218978103, "grad_norm": 2.875, "learning_rate": 2.9805055590016225e-06, "loss": 1.2891722917556763, "step": 678 }, { "epoch": 0.9927007299270073, "grad_norm": 4.1875, "learning_rate": 2.9696937015926995e-06, "loss": 1.365147352218628, "step": 680 }, { "epoch": 0.9956204379562044, "grad_norm": 1.8828125, "learning_rate": 2.9588767200329348e-06, "loss": 1.2809860706329346, "step": 682 }, { "epoch": 0.9985401459854014, "grad_norm": 8.25, "learning_rate": 2.9480548666183427e-06, "loss": 1.6904196739196777, "step": 684 }, { "epoch": 1.0014598540145985, "grad_norm": 2.21875, "learning_rate": 2.9372283937585675e-06, "loss": 1.3279258012771606, "step": 686 }, { "epoch": 1.0043795620437956, "grad_norm": 4.34375, "learning_rate": 2.926397553970999e-06, "loss": 1.277381181716919, "step": 688 }, { "epoch": 1.0072992700729928, "grad_norm": 5.84375, "learning_rate": 2.915562599874882e-06, "loss": 1.500443935394287, "step": 690 }, { "epoch": 1.0102189781021897, "grad_norm": 9.875, "learning_rate": 2.904723784185422e-06, "loss": 1.2994956970214844, "step": 692 }, { "epoch": 1.013138686131387, "grad_norm": 10.6875, "learning_rate": 2.893881359707894e-06, "loss": 1.227457046508789, "step": 694 }, { "epoch": 1.0160583941605839, "grad_norm": 2.984375, "learning_rate": 2.883035579331744e-06, "loss": 1.2923262119293213, "step": 696 }, { "epoch": 1.018978102189781, "grad_norm": 4.0, "learning_rate": 2.8721866960246912e-06, "loss": 1.445424199104309, "step": 698 }, { "epoch": 1.0218978102189782, "grad_norm": 2.1875, "learning_rate": 2.861334962826828e-06, "loss": 1.1312172412872314, "step": 700 }, { "epoch": 1.0248175182481751, "grad_norm": 3.734375, "learning_rate": 2.8504806328447177e-06, "loss": 1.4891958236694336, "step": 702 }, { "epoch": 1.0277372262773723, "grad_norm": 3.734375, "learning_rate": 2.8396239592454914e-06, "loss": 1.4066648483276367, "step": 704 }, { "epoch": 1.0306569343065692, "grad_norm": 4.21875, "learning_rate": 2.828765195250942e-06, "loss": 1.4027667045593262, "step": 706 }, { "epoch": 1.0335766423357664, "grad_norm": 3.828125, "learning_rate": 2.8179045941316214e-06, "loss": 1.3984425067901611, "step": 708 }, { "epoch": 1.0364963503649636, "grad_norm": 37.25, "learning_rate": 2.8070424092009264e-06, "loss": 1.5881340503692627, "step": 710 }, { "epoch": 1.0394160583941605, "grad_norm": 5.21875, "learning_rate": 2.7961788938091994e-06, "loss": 1.3652167320251465, "step": 712 }, { "epoch": 1.0423357664233577, "grad_norm": 9.0, "learning_rate": 2.785314301337811e-06, "loss": 1.4395644664764404, "step": 714 }, { "epoch": 1.0452554744525548, "grad_norm": 4.125, "learning_rate": 2.7744488851932568e-06, "loss": 1.3807083368301392, "step": 716 }, { "epoch": 1.0481751824817518, "grad_norm": 16.625, "learning_rate": 2.76358289880124e-06, "loss": 1.2562787532806396, "step": 718 }, { "epoch": 1.051094890510949, "grad_norm": 4.03125, "learning_rate": 2.752716595600768e-06, "loss": 1.2394318580627441, "step": 720 }, { "epoch": 1.054014598540146, "grad_norm": 8.625, "learning_rate": 2.7418502290382352e-06, "loss": 1.1047321557998657, "step": 722 }, { "epoch": 1.056934306569343, "grad_norm": 4.46875, "learning_rate": 2.7309840525615146e-06, "loss": 1.5514793395996094, "step": 724 }, { "epoch": 1.0598540145985402, "grad_norm": 3.234375, "learning_rate": 2.720118319614047e-06, "loss": 1.2009215354919434, "step": 726 }, { "epoch": 1.0627737226277372, "grad_norm": 2.65625, "learning_rate": 2.709253283628924e-06, "loss": 1.2573150396347046, "step": 728 }, { "epoch": 1.0656934306569343, "grad_norm": 8.9375, "learning_rate": 2.698389198022987e-06, "loss": 1.624213457107544, "step": 730 }, { "epoch": 1.0686131386861315, "grad_norm": 5.375, "learning_rate": 2.6875263161909054e-06, "loss": 1.3574187755584717, "step": 732 }, { "epoch": 1.0715328467153284, "grad_norm": 7.4375, "learning_rate": 2.676664891499275e-06, "loss": 1.2222844362258911, "step": 734 }, { "epoch": 1.0744525547445256, "grad_norm": 1.640625, "learning_rate": 2.6658051772807046e-06, "loss": 1.2617628574371338, "step": 736 }, { "epoch": 1.0773722627737226, "grad_norm": 8.0, "learning_rate": 2.6549474268279074e-06, "loss": 1.3748055696487427, "step": 738 }, { "epoch": 1.0802919708029197, "grad_norm": 8.5625, "learning_rate": 2.644091893387793e-06, "loss": 1.4741809368133545, "step": 740 }, { "epoch": 1.0832116788321169, "grad_norm": 7.1875, "learning_rate": 2.6332388301555615e-06, "loss": 1.3683550357818604, "step": 742 }, { "epoch": 1.0861313868613138, "grad_norm": 23.125, "learning_rate": 2.622388490268799e-06, "loss": 1.4302444458007812, "step": 744 }, { "epoch": 1.089051094890511, "grad_norm": 2.875, "learning_rate": 2.6115411268015716e-06, "loss": 1.3794375658035278, "step": 746 }, { "epoch": 1.091970802919708, "grad_norm": 3.5, "learning_rate": 2.6006969927585214e-06, "loss": 1.6521217823028564, "step": 748 }, { "epoch": 1.094890510948905, "grad_norm": 4.09375, "learning_rate": 2.589856341068969e-06, "loss": 1.380043625831604, "step": 750 }, { "epoch": 1.0978102189781023, "grad_norm": 2.84375, "learning_rate": 2.5790194245810125e-06, "loss": 1.2655432224273682, "step": 752 }, { "epoch": 1.1007299270072992, "grad_norm": 8.6875, "learning_rate": 2.568186496055628e-06, "loss": 1.4429633617401123, "step": 754 }, { "epoch": 1.1036496350364964, "grad_norm": 2.34375, "learning_rate": 2.5573578081607793e-06, "loss": 1.1212751865386963, "step": 756 }, { "epoch": 1.1065693430656935, "grad_norm": 2.71875, "learning_rate": 2.546533613465518e-06, "loss": 0.9118128418922424, "step": 758 }, { "epoch": 1.1094890510948905, "grad_norm": 2.9375, "learning_rate": 2.5357141644340966e-06, "loss": 1.3533203601837158, "step": 760 }, { "epoch": 1.1124087591240877, "grad_norm": 5.625, "learning_rate": 2.5248997134200833e-06, "loss": 1.2528855800628662, "step": 762 }, { "epoch": 1.1153284671532846, "grad_norm": 2.5, "learning_rate": 2.5140905126604677e-06, "loss": 1.244079351425171, "step": 764 }, { "epoch": 1.1182481751824818, "grad_norm": 5.71875, "learning_rate": 2.503286814269783e-06, "loss": 1.3053560256958008, "step": 766 }, { "epoch": 1.121167883211679, "grad_norm": 1.5546875, "learning_rate": 2.4924888702342266e-06, "loss": 1.2007651329040527, "step": 768 }, { "epoch": 1.1240875912408759, "grad_norm": 5.5625, "learning_rate": 2.481696932405779e-06, "loss": 1.3610585927963257, "step": 770 }, { "epoch": 1.127007299270073, "grad_norm": 2.59375, "learning_rate": 2.4709112524963326e-06, "loss": 1.3990166187286377, "step": 772 }, { "epoch": 1.12992700729927, "grad_norm": 3.484375, "learning_rate": 2.4601320820718196e-06, "loss": 1.3095015287399292, "step": 774 }, { "epoch": 1.1328467153284671, "grad_norm": 2.84375, "learning_rate": 2.4493596725463435e-06, "loss": 1.2231605052947998, "step": 776 }, { "epoch": 1.1357664233576643, "grad_norm": 5.875, "learning_rate": 2.438594275176318e-06, "loss": 1.3952467441558838, "step": 778 }, { "epoch": 1.1386861313868613, "grad_norm": 5.09375, "learning_rate": 2.4278361410546027e-06, "loss": 1.2288057804107666, "step": 780 }, { "epoch": 1.1416058394160584, "grad_norm": 7.15625, "learning_rate": 2.41708552110465e-06, "loss": 1.46846342086792, "step": 782 }, { "epoch": 1.1445255474452556, "grad_norm": 3.421875, "learning_rate": 2.4063426660746517e-06, "loss": 1.3782763481140137, "step": 784 }, { "epoch": 1.1474452554744525, "grad_norm": 9.375, "learning_rate": 2.3956078265316883e-06, "loss": 1.2458666563034058, "step": 786 }, { "epoch": 1.1503649635036497, "grad_norm": 3.59375, "learning_rate": 2.3848812528558887e-06, "loss": 1.2981244325637817, "step": 788 }, { "epoch": 1.1532846715328466, "grad_norm": 5.96875, "learning_rate": 2.374163195234586e-06, "loss": 1.3579144477844238, "step": 790 }, { "epoch": 1.1562043795620438, "grad_norm": 1.4765625, "learning_rate": 2.3634539036564853e-06, "loss": 1.2424495220184326, "step": 792 }, { "epoch": 1.159124087591241, "grad_norm": 3.78125, "learning_rate": 2.352753627905833e-06, "loss": 1.6642348766326904, "step": 794 }, { "epoch": 1.162043795620438, "grad_norm": 5.90625, "learning_rate": 2.3420626175565877e-06, "loss": 1.1931509971618652, "step": 796 }, { "epoch": 1.164963503649635, "grad_norm": 3.75, "learning_rate": 2.331381121966603e-06, "loss": 1.3377602100372314, "step": 798 }, { "epoch": 1.167883211678832, "grad_norm": 3.640625, "learning_rate": 2.3207093902718066e-06, "loss": 1.2145559787750244, "step": 800 }, { "epoch": 1.1708029197080292, "grad_norm": 2.078125, "learning_rate": 2.3100476713803967e-06, "loss": 1.1511560678482056, "step": 802 }, { "epoch": 1.1737226277372264, "grad_norm": 4.75, "learning_rate": 2.2993962139670292e-06, "loss": 1.5985954999923706, "step": 804 }, { "epoch": 1.1766423357664233, "grad_norm": 6.71875, "learning_rate": 2.288755266467022e-06, "loss": 1.4606941938400269, "step": 806 }, { "epoch": 1.1795620437956205, "grad_norm": 7.75, "learning_rate": 2.2781250770705575e-06, "loss": 1.5486199855804443, "step": 808 }, { "epoch": 1.1824817518248176, "grad_norm": 4.1875, "learning_rate": 2.267505893716898e-06, "loss": 1.3502545356750488, "step": 810 }, { "epoch": 1.1854014598540146, "grad_norm": 7.40625, "learning_rate": 2.2568979640885964e-06, "loss": 1.5650737285614014, "step": 812 }, { "epoch": 1.1883211678832117, "grad_norm": 7.96875, "learning_rate": 2.246301535605726e-06, "loss": 1.6433610916137695, "step": 814 }, { "epoch": 1.1912408759124087, "grad_norm": 3.78125, "learning_rate": 2.2357168554201066e-06, "loss": 1.0836632251739502, "step": 816 }, { "epoch": 1.1941605839416058, "grad_norm": 3.796875, "learning_rate": 2.225144170409537e-06, "loss": 1.1502854824066162, "step": 818 }, { "epoch": 1.197080291970803, "grad_norm": 3.015625, "learning_rate": 2.2145837271720433e-06, "loss": 1.6808114051818848, "step": 820 }, { "epoch": 1.2, "grad_norm": 3.296875, "learning_rate": 2.204035772020121e-06, "loss": 1.3705600500106812, "step": 822 }, { "epoch": 1.2029197080291971, "grad_norm": 2.78125, "learning_rate": 2.1935005509749933e-06, "loss": 1.1946570873260498, "step": 824 }, { "epoch": 1.205839416058394, "grad_norm": 17.75, "learning_rate": 2.182978309760874e-06, "loss": 1.5363470315933228, "step": 826 }, { "epoch": 1.2087591240875912, "grad_norm": 3.78125, "learning_rate": 2.1724692937992313e-06, "loss": 1.4042502641677856, "step": 828 }, { "epoch": 1.2116788321167884, "grad_norm": 17.25, "learning_rate": 2.16197374820307e-06, "loss": 1.2589643001556396, "step": 830 }, { "epoch": 1.2145985401459853, "grad_norm": 3.359375, "learning_rate": 2.1514919177712085e-06, "loss": 1.6056280136108398, "step": 832 }, { "epoch": 1.2175182481751825, "grad_norm": 4.3125, "learning_rate": 2.141024046982573e-06, "loss": 1.3564906120300293, "step": 834 }, { "epoch": 1.2204379562043797, "grad_norm": 11.625, "learning_rate": 2.1305703799904947e-06, "loss": 0.9380712509155273, "step": 836 }, { "epoch": 1.2233576642335766, "grad_norm": 8.75, "learning_rate": 2.120131160617013e-06, "loss": 1.0530650615692139, "step": 838 }, { "epoch": 1.2262773722627738, "grad_norm": 8.4375, "learning_rate": 2.1097066323471897e-06, "loss": 0.7292347550392151, "step": 840 }, { "epoch": 1.2291970802919707, "grad_norm": 8.125, "learning_rate": 2.0992970383234336e-06, "loss": 0.9691898226737976, "step": 842 }, { "epoch": 1.2321167883211679, "grad_norm": 1.796875, "learning_rate": 2.088902621339823e-06, "loss": 1.152883768081665, "step": 844 }, { "epoch": 1.235036496350365, "grad_norm": 6.3125, "learning_rate": 2.078523623836446e-06, "loss": 1.4850080013275146, "step": 846 }, { "epoch": 1.237956204379562, "grad_norm": 7.3125, "learning_rate": 2.0681602878937472e-06, "loss": 1.3769371509552002, "step": 848 }, { "epoch": 1.2408759124087592, "grad_norm": 3.53125, "learning_rate": 2.057812855226879e-06, "loss": 1.103143334388733, "step": 850 }, { "epoch": 1.243795620437956, "grad_norm": 3.578125, "learning_rate": 2.0474815671800644e-06, "loss": 1.4019992351531982, "step": 852 }, { "epoch": 1.2467153284671533, "grad_norm": 5.40625, "learning_rate": 2.0371666647209694e-06, "loss": 1.1963081359863281, "step": 854 }, { "epoch": 1.2496350364963504, "grad_norm": 3.0625, "learning_rate": 2.0268683884350803e-06, "loss": 1.1888788938522339, "step": 856 }, { "epoch": 1.2525547445255474, "grad_norm": 13.6875, "learning_rate": 2.0165869785200938e-06, "loss": 1.2623980045318604, "step": 858 }, { "epoch": 1.2554744525547445, "grad_norm": 6.4375, "learning_rate": 2.0063226747803143e-06, "loss": 1.2596468925476074, "step": 860 }, { "epoch": 1.2583941605839417, "grad_norm": 3.859375, "learning_rate": 1.9960757166210596e-06, "loss": 1.333680272102356, "step": 862 }, { "epoch": 1.2613138686131387, "grad_norm": 3.71875, "learning_rate": 1.9858463430430807e-06, "loss": 1.1413600444793701, "step": 864 }, { "epoch": 1.2642335766423358, "grad_norm": 5.5625, "learning_rate": 1.9756347926369813e-06, "loss": 1.3728548288345337, "step": 866 }, { "epoch": 1.2671532846715328, "grad_norm": 4.15625, "learning_rate": 1.9654413035776585e-06, "loss": 1.449355125427246, "step": 868 }, { "epoch": 1.27007299270073, "grad_norm": 4.09375, "learning_rate": 1.9552661136187444e-06, "loss": 1.1183695793151855, "step": 870 }, { "epoch": 1.2729927007299269, "grad_norm": 4.40625, "learning_rate": 1.945109460087061e-06, "loss": 1.1493186950683594, "step": 872 }, { "epoch": 1.275912408759124, "grad_norm": 2.640625, "learning_rate": 1.934971579877088e-06, "loss": 1.3397104740142822, "step": 874 }, { "epoch": 1.2788321167883212, "grad_norm": 4.3125, "learning_rate": 1.9248527094454316e-06, "loss": 1.3082889318466187, "step": 876 }, { "epoch": 1.2817518248175181, "grad_norm": 8.4375, "learning_rate": 1.9147530848053152e-06, "loss": 1.563565731048584, "step": 878 }, { "epoch": 1.2846715328467153, "grad_norm": 8.25, "learning_rate": 1.9046729415210686e-06, "loss": 1.4606716632843018, "step": 880 }, { "epoch": 1.2875912408759125, "grad_norm": 4.65625, "learning_rate": 1.8946125147026427e-06, "loss": 1.3690614700317383, "step": 882 }, { "epoch": 1.2905109489051094, "grad_norm": 7.8125, "learning_rate": 1.8845720390001154e-06, "loss": 1.6756688356399536, "step": 884 }, { "epoch": 1.2934306569343066, "grad_norm": 3.21875, "learning_rate": 1.874551748598226e-06, "loss": 1.2701613903045654, "step": 886 }, { "epoch": 1.2963503649635038, "grad_norm": 4.78125, "learning_rate": 1.8645518772109077e-06, "loss": 1.5865097045898438, "step": 888 }, { "epoch": 1.2992700729927007, "grad_norm": 3.921875, "learning_rate": 1.8545726580758428e-06, "loss": 1.401726484298706, "step": 890 }, { "epoch": 1.3021897810218979, "grad_norm": 7.78125, "learning_rate": 1.8446143239490168e-06, "loss": 1.6153247356414795, "step": 892 }, { "epoch": 1.305109489051095, "grad_norm": 6.125, "learning_rate": 1.8346771070992914e-06, "loss": 1.4763232469558716, "step": 894 }, { "epoch": 1.308029197080292, "grad_norm": 1.984375, "learning_rate": 1.82476123930299e-06, "loss": 1.2044928073883057, "step": 896 }, { "epoch": 1.310948905109489, "grad_norm": 1.4296875, "learning_rate": 1.8148669518384862e-06, "loss": 1.0226365327835083, "step": 898 }, { "epoch": 1.313868613138686, "grad_norm": 2.1875, "learning_rate": 1.804994475480815e-06, "loss": 1.0369101762771606, "step": 900 }, { "epoch": 1.3167883211678832, "grad_norm": 1.6875, "learning_rate": 1.7951440404962856e-06, "loss": 1.1433358192443848, "step": 902 }, { "epoch": 1.3197080291970802, "grad_norm": 5.3125, "learning_rate": 1.7853158766371143e-06, "loss": 1.1160844564437866, "step": 904 }, { "epoch": 1.3226277372262774, "grad_norm": 10.1875, "learning_rate": 1.7755102131360639e-06, "loss": 1.3365674018859863, "step": 906 }, { "epoch": 1.3255474452554745, "grad_norm": 2.21875, "learning_rate": 1.7657272787010967e-06, "loss": 1.3394170999526978, "step": 908 }, { "epoch": 1.3284671532846715, "grad_norm": 14.0625, "learning_rate": 1.7559673015100405e-06, "loss": 1.2542470693588257, "step": 910 }, { "epoch": 1.3313868613138686, "grad_norm": 1.9453125, "learning_rate": 1.7462305092052676e-06, "loss": 1.2083182334899902, "step": 912 }, { "epoch": 1.3343065693430658, "grad_norm": 2.234375, "learning_rate": 1.7365171288883841e-06, "loss": 1.0745160579681396, "step": 914 }, { "epoch": 1.3372262773722627, "grad_norm": 5.5, "learning_rate": 1.7268273871149335e-06, "loss": 1.4868173599243164, "step": 916 }, { "epoch": 1.34014598540146, "grad_norm": 5.96875, "learning_rate": 1.7171615098891117e-06, "loss": 0.7804101705551147, "step": 918 }, { "epoch": 1.343065693430657, "grad_norm": 3.65625, "learning_rate": 1.7075197226584969e-06, "loss": 1.3761916160583496, "step": 920 }, { "epoch": 1.345985401459854, "grad_norm": 2.640625, "learning_rate": 1.6979022503087905e-06, "loss": 1.413581132888794, "step": 922 }, { "epoch": 1.348905109489051, "grad_norm": 8.125, "learning_rate": 1.688309317158572e-06, "loss": 1.6476316452026367, "step": 924 }, { "epoch": 1.3518248175182481, "grad_norm": 5.4375, "learning_rate": 1.6787411469540677e-06, "loss": 1.5541059970855713, "step": 926 }, { "epoch": 1.3547445255474453, "grad_norm": 6.125, "learning_rate": 1.6691979628639281e-06, "loss": 1.5634403228759766, "step": 928 }, { "epoch": 1.3576642335766422, "grad_norm": 2.65625, "learning_rate": 1.6596799874740294e-06, "loss": 1.2540359497070312, "step": 930 }, { "epoch": 1.3605839416058394, "grad_norm": 5.59375, "learning_rate": 1.6501874427822767e-06, "loss": 1.4849543571472168, "step": 932 }, { "epoch": 1.3635036496350366, "grad_norm": 6.40625, "learning_rate": 1.6407205501934285e-06, "loss": 1.141026496887207, "step": 934 }, { "epoch": 1.3664233576642335, "grad_norm": 2.375, "learning_rate": 1.6312795305139328e-06, "loss": 0.9827671647071838, "step": 936 }, { "epoch": 1.3693430656934307, "grad_norm": 5.5, "learning_rate": 1.6218646039467725e-06, "loss": 1.4801573753356934, "step": 938 }, { "epoch": 1.3722627737226278, "grad_norm": 3.109375, "learning_rate": 1.6124759900863365e-06, "loss": 1.6479110717773438, "step": 940 }, { "epoch": 1.3751824817518248, "grad_norm": 7.25, "learning_rate": 1.6031139079132933e-06, "loss": 1.2483787536621094, "step": 942 }, { "epoch": 1.378102189781022, "grad_norm": 1.453125, "learning_rate": 1.593778575789484e-06, "loss": 1.2027292251586914, "step": 944 }, { "epoch": 1.3810218978102191, "grad_norm": 3.859375, "learning_rate": 1.5844702114528315e-06, "loss": 1.5109983682632446, "step": 946 }, { "epoch": 1.383941605839416, "grad_norm": 5.34375, "learning_rate": 1.5751890320122568e-06, "loss": 1.3143746852874756, "step": 948 }, { "epoch": 1.3868613138686132, "grad_norm": 8.25, "learning_rate": 1.5659352539426215e-06, "loss": 1.2749611139297485, "step": 950 }, { "epoch": 1.3897810218978102, "grad_norm": 2.125, "learning_rate": 1.5567090930796746e-06, "loss": 1.244338035583496, "step": 952 }, { "epoch": 1.3927007299270073, "grad_norm": 4.3125, "learning_rate": 1.5475107646150203e-06, "loss": 1.3380858898162842, "step": 954 }, { "epoch": 1.3956204379562043, "grad_norm": 1.15625, "learning_rate": 1.5383404830910981e-06, "loss": 1.4054020643234253, "step": 956 }, { "epoch": 1.3985401459854014, "grad_norm": 10.5625, "learning_rate": 1.529198462396175e-06, "loss": 1.4239089488983154, "step": 958 }, { "epoch": 1.4014598540145986, "grad_norm": 9.25, "learning_rate": 1.5200849157593666e-06, "loss": 1.610469102859497, "step": 960 }, { "epoch": 1.4043795620437955, "grad_norm": 1.71875, "learning_rate": 1.5110000557456542e-06, "loss": 1.1694961786270142, "step": 962 }, { "epoch": 1.4072992700729927, "grad_norm": 5.625, "learning_rate": 1.5019440942509312e-06, "loss": 1.5139713287353516, "step": 964 }, { "epoch": 1.4102189781021899, "grad_norm": 3.953125, "learning_rate": 1.4929172424970576e-06, "loss": 1.376784324645996, "step": 966 }, { "epoch": 1.4131386861313868, "grad_norm": 2.34375, "learning_rate": 1.483919711026939e-06, "loss": 1.3103041648864746, "step": 968 }, { "epoch": 1.416058394160584, "grad_norm": 1.328125, "learning_rate": 1.4749517096996116e-06, "loss": 1.2476757764816284, "step": 970 }, { "epoch": 1.4189781021897812, "grad_norm": 3.703125, "learning_rate": 1.4660134476853485e-06, "loss": 1.3406193256378174, "step": 972 }, { "epoch": 1.421897810218978, "grad_norm": 4.375, "learning_rate": 1.4571051334607813e-06, "loss": 1.2700021266937256, "step": 974 }, { "epoch": 1.4248175182481753, "grad_norm": 2.90625, "learning_rate": 1.4482269748040358e-06, "loss": 1.2266380786895752, "step": 976 }, { "epoch": 1.4277372262773722, "grad_norm": 3.3125, "learning_rate": 1.4393791787898896e-06, "loss": 1.189935564994812, "step": 978 }, { "epoch": 1.4306569343065694, "grad_norm": 4.8125, "learning_rate": 1.430561951784938e-06, "loss": 1.4163111448287964, "step": 980 }, { "epoch": 1.4335766423357663, "grad_norm": 7.125, "learning_rate": 1.4217754994427844e-06, "loss": 1.6390494108200073, "step": 982 }, { "epoch": 1.4364963503649635, "grad_norm": 1.6640625, "learning_rate": 1.4130200266992408e-06, "loss": 1.1357786655426025, "step": 984 }, { "epoch": 1.4394160583941606, "grad_norm": 3.5625, "learning_rate": 1.4042957377675484e-06, "loss": 1.2841823101043701, "step": 986 }, { "epoch": 1.4423357664233576, "grad_norm": 7.34375, "learning_rate": 1.395602836133616e-06, "loss": 1.3807730674743652, "step": 988 }, { "epoch": 1.4452554744525548, "grad_norm": 1.421875, "learning_rate": 1.386941524551273e-06, "loss": 1.135375738143921, "step": 990 }, { "epoch": 1.448175182481752, "grad_norm": 2.875, "learning_rate": 1.37831200503754e-06, "loss": 1.1764510869979858, "step": 992 }, { "epoch": 1.4510948905109489, "grad_norm": 8.9375, "learning_rate": 1.3697144788679174e-06, "loss": 1.2467272281646729, "step": 994 }, { "epoch": 1.454014598540146, "grad_norm": 5.90625, "learning_rate": 1.3611491465716898e-06, "loss": 1.4708714485168457, "step": 996 }, { "epoch": 1.4569343065693432, "grad_norm": 3.71875, "learning_rate": 1.3526162079272495e-06, "loss": 1.402409553527832, "step": 998 }, { "epoch": 1.4598540145985401, "grad_norm": 4.59375, "learning_rate": 1.34411586195744e-06, "loss": 1.2477829456329346, "step": 1000 }, { "epoch": 1.4627737226277373, "grad_norm": 4.0625, "learning_rate": 1.3356483069249088e-06, "loss": 1.3877084255218506, "step": 1002 }, { "epoch": 1.4656934306569342, "grad_norm": 7.875, "learning_rate": 1.3272137403274844e-06, "loss": 1.555393934249878, "step": 1004 }, { "epoch": 1.4686131386861314, "grad_norm": 3.671875, "learning_rate": 1.318812358893572e-06, "loss": 1.3621551990509033, "step": 1006 }, { "epoch": 1.4715328467153284, "grad_norm": 4.59375, "learning_rate": 1.3104443585775642e-06, "loss": 1.3545817136764526, "step": 1008 }, { "epoch": 1.4744525547445255, "grad_norm": 3.9375, "learning_rate": 1.3021099345552695e-06, "loss": 1.4017988443374634, "step": 1010 }, { "epoch": 1.4773722627737227, "grad_norm": 6.21875, "learning_rate": 1.2938092812193615e-06, "loss": 1.3940372467041016, "step": 1012 }, { "epoch": 1.4802919708029196, "grad_norm": 3.1875, "learning_rate": 1.285542592174842e-06, "loss": 1.1765646934509277, "step": 1014 }, { "epoch": 1.4832116788321168, "grad_norm": 6.0, "learning_rate": 1.277310060234529e-06, "loss": 1.385852336883545, "step": 1016 }, { "epoch": 1.486131386861314, "grad_norm": 4.8125, "learning_rate": 1.2691118774145577e-06, "loss": 1.395111322402954, "step": 1018 }, { "epoch": 1.489051094890511, "grad_norm": 1.640625, "learning_rate": 1.2609482349299021e-06, "loss": 1.325355052947998, "step": 1020 }, { "epoch": 1.491970802919708, "grad_norm": 3.515625, "learning_rate": 1.2528193231899156e-06, "loss": 1.2050141096115112, "step": 1022 }, { "epoch": 1.4948905109489052, "grad_norm": 4.03125, "learning_rate": 1.2447253317938871e-06, "loss": 1.6511290073394775, "step": 1024 }, { "epoch": 1.4978102189781022, "grad_norm": 3.609375, "learning_rate": 1.236666449526623e-06, "loss": 1.28155517578125, "step": 1026 }, { "epoch": 1.5007299270072991, "grad_norm": 3.734375, "learning_rate": 1.2286428643540418e-06, "loss": 1.4207556247711182, "step": 1028 }, { "epoch": 1.5036496350364965, "grad_norm": 3.359375, "learning_rate": 1.22065476341879e-06, "loss": 1.3519251346588135, "step": 1030 }, { "epoch": 1.5065693430656935, "grad_norm": 5.84375, "learning_rate": 1.2127023330358777e-06, "loss": 1.396289587020874, "step": 1032 }, { "epoch": 1.5094890510948904, "grad_norm": 2.65625, "learning_rate": 1.204785758688331e-06, "loss": 1.3400771617889404, "step": 1034 }, { "epoch": 1.5124087591240876, "grad_norm": 31.25, "learning_rate": 1.1969052250228683e-06, "loss": 1.1934255361557007, "step": 1036 }, { "epoch": 1.5153284671532847, "grad_norm": 4.90625, "learning_rate": 1.1890609158455949e-06, "loss": 1.4513096809387207, "step": 1038 }, { "epoch": 1.5182481751824817, "grad_norm": 2.625, "learning_rate": 1.181253014117711e-06, "loss": 1.1264418363571167, "step": 1040 }, { "epoch": 1.5211678832116788, "grad_norm": 1.65625, "learning_rate": 1.1734817019512465e-06, "loss": 1.1497807502746582, "step": 1042 }, { "epoch": 1.524087591240876, "grad_norm": 7.8125, "learning_rate": 1.1657471606048157e-06, "loss": 1.6058242321014404, "step": 1044 }, { "epoch": 1.527007299270073, "grad_norm": 22.25, "learning_rate": 1.1580495704793874e-06, "loss": 1.4766197204589844, "step": 1046 }, { "epoch": 1.5299270072992701, "grad_norm": 3.75, "learning_rate": 1.1503891111140767e-06, "loss": 1.2432148456573486, "step": 1048 }, { "epoch": 1.5328467153284673, "grad_norm": 28.25, "learning_rate": 1.1427659611819604e-06, "loss": 1.1451390981674194, "step": 1050 }, { "epoch": 1.5357664233576642, "grad_norm": 3.734375, "learning_rate": 1.1351802984859045e-06, "loss": 1.3471091985702515, "step": 1052 }, { "epoch": 1.5386861313868612, "grad_norm": 1.640625, "learning_rate": 1.127632299954423e-06, "loss": 1.1958954334259033, "step": 1054 }, { "epoch": 1.5416058394160586, "grad_norm": 10.8125, "learning_rate": 1.1201221416375456e-06, "loss": 1.3556766510009766, "step": 1056 }, { "epoch": 1.5445255474452555, "grad_norm": 4.75, "learning_rate": 1.1126499987027172e-06, "loss": 1.6111273765563965, "step": 1058 }, { "epoch": 1.5474452554744524, "grad_norm": 12.5, "learning_rate": 1.1052160454307085e-06, "loss": 1.5189365148544312, "step": 1060 }, { "epoch": 1.5503649635036496, "grad_norm": 3.96875, "learning_rate": 1.0978204552115493e-06, "loss": 1.3763346672058105, "step": 1062 }, { "epoch": 1.5532846715328468, "grad_norm": 4.375, "learning_rate": 1.0904634005404902e-06, "loss": 1.450345754623413, "step": 1064 }, { "epoch": 1.5562043795620437, "grad_norm": 4.09375, "learning_rate": 1.0831450530139747e-06, "loss": 1.2109770774841309, "step": 1066 }, { "epoch": 1.5591240875912409, "grad_norm": 7.0, "learning_rate": 1.0758655833256381e-06, "loss": 1.2681195735931396, "step": 1068 }, { "epoch": 1.562043795620438, "grad_norm": 2.640625, "learning_rate": 1.0686251612623277e-06, "loss": 1.2694846391677856, "step": 1070 }, { "epoch": 1.564963503649635, "grad_norm": 8.6875, "learning_rate": 1.0614239557001389e-06, "loss": 1.5101749897003174, "step": 1072 }, { "epoch": 1.5678832116788322, "grad_norm": 3.171875, "learning_rate": 1.0542621346004806e-06, "loss": 1.313795566558838, "step": 1074 }, { "epoch": 1.5708029197080293, "grad_norm": 9.0, "learning_rate": 1.047139865006155e-06, "loss": 1.1664808988571167, "step": 1076 }, { "epoch": 1.5737226277372263, "grad_norm": 2.03125, "learning_rate": 1.0400573130374641e-06, "loss": 1.203639030456543, "step": 1078 }, { "epoch": 1.5766423357664232, "grad_norm": 3.265625, "learning_rate": 1.0330146438883304e-06, "loss": 1.5285131931304932, "step": 1080 }, { "epoch": 1.5795620437956206, "grad_norm": 6.5625, "learning_rate": 1.0260120218224485e-06, "loss": 1.516188144683838, "step": 1082 }, { "epoch": 1.5824817518248175, "grad_norm": 6.9375, "learning_rate": 1.019049610169452e-06, "loss": 1.3165411949157715, "step": 1084 }, { "epoch": 1.5854014598540145, "grad_norm": 4.6875, "learning_rate": 1.012127571321104e-06, "loss": 1.1730577945709229, "step": 1086 }, { "epoch": 1.5883211678832116, "grad_norm": 4.46875, "learning_rate": 1.0052460667275102e-06, "loss": 1.3837532997131348, "step": 1088 }, { "epoch": 1.5912408759124088, "grad_norm": 4.71875, "learning_rate": 9.984052568933507e-07, "loss": 1.342604398727417, "step": 1090 }, { "epoch": 1.5941605839416058, "grad_norm": 1.8046875, "learning_rate": 9.916053013741396e-07, "loss": 1.0345500707626343, "step": 1092 }, { "epoch": 1.597080291970803, "grad_norm": 3.578125, "learning_rate": 9.848463587725024e-07, "loss": 1.3031237125396729, "step": 1094 }, { "epoch": 1.6, "grad_norm": 2.6875, "learning_rate": 9.78128586734476e-07, "loss": 1.4126646518707275, "step": 1096 }, { "epoch": 1.602919708029197, "grad_norm": 2.796875, "learning_rate": 9.714521419458333e-07, "loss": 1.2036532163619995, "step": 1098 }, { "epoch": 1.6058394160583942, "grad_norm": 5.34375, "learning_rate": 9.648171801284254e-07, "loss": 1.3445477485656738, "step": 1100 }, { "epoch": 1.6087591240875914, "grad_norm": 6.875, "learning_rate": 9.582238560365534e-07, "loss": 1.4824466705322266, "step": 1102 }, { "epoch": 1.6116788321167883, "grad_norm": 2.171875, "learning_rate": 9.516723234533573e-07, "loss": 0.6945338845252991, "step": 1104 }, { "epoch": 1.6145985401459853, "grad_norm": 4.375, "learning_rate": 9.451627351872289e-07, "loss": 1.691240906715393, "step": 1106 }, { "epoch": 1.6175182481751826, "grad_norm": 5.0625, "learning_rate": 9.386952430682478e-07, "loss": 1.6143536567687988, "step": 1108 }, { "epoch": 1.6204379562043796, "grad_norm": 4.90625, "learning_rate": 9.322699979446395e-07, "loss": 1.0810116529464722, "step": 1110 }, { "epoch": 1.6233576642335765, "grad_norm": 3.953125, "learning_rate": 9.25887149679259e-07, "loss": 1.3443822860717773, "step": 1112 }, { "epoch": 1.6262773722627737, "grad_norm": 2.5, "learning_rate": 9.19546847146093e-07, "loss": 1.392272710800171, "step": 1114 }, { "epoch": 1.6291970802919709, "grad_norm": 2.890625, "learning_rate": 9.132492382267895e-07, "loss": 1.2860863208770752, "step": 1116 }, { "epoch": 1.6321167883211678, "grad_norm": 6.03125, "learning_rate": 9.069944698072071e-07, "loss": 1.4681463241577148, "step": 1118 }, { "epoch": 1.635036496350365, "grad_norm": 1.828125, "learning_rate": 9.0078268777399e-07, "loss": 1.1984715461730957, "step": 1120 }, { "epoch": 1.6379562043795621, "grad_norm": 3.328125, "learning_rate": 8.946140370111651e-07, "loss": 1.3620171546936035, "step": 1122 }, { "epoch": 1.640875912408759, "grad_norm": 2.5625, "learning_rate": 8.884886613967625e-07, "loss": 1.0197124481201172, "step": 1124 }, { "epoch": 1.6437956204379562, "grad_norm": 2.96875, "learning_rate": 8.824067037994597e-07, "loss": 1.2507963180541992, "step": 1126 }, { "epoch": 1.6467153284671534, "grad_norm": 7.59375, "learning_rate": 8.763683060752492e-07, "loss": 1.5034403800964355, "step": 1128 }, { "epoch": 1.6496350364963503, "grad_norm": 3.703125, "learning_rate": 8.703736090641302e-07, "loss": 1.250478744506836, "step": 1130 }, { "epoch": 1.6525547445255473, "grad_norm": 2.921875, "learning_rate": 8.644227525868238e-07, "loss": 1.2682870626449585, "step": 1132 }, { "epoch": 1.6554744525547447, "grad_norm": 8.5, "learning_rate": 8.585158754415114e-07, "loss": 1.5448431968688965, "step": 1134 }, { "epoch": 1.6583941605839416, "grad_norm": 5.65625, "learning_rate": 8.52653115400598e-07, "loss": 1.3879718780517578, "step": 1136 }, { "epoch": 1.6613138686131386, "grad_norm": 3.3125, "learning_rate": 8.468346092074961e-07, "loss": 1.3755671977996826, "step": 1138 }, { "epoch": 1.6642335766423357, "grad_norm": 2.75, "learning_rate": 8.410604925734411e-07, "loss": 1.1513915061950684, "step": 1140 }, { "epoch": 1.667153284671533, "grad_norm": 14.1875, "learning_rate": 8.35330900174322e-07, "loss": 1.5474663972854614, "step": 1142 }, { "epoch": 1.6700729927007298, "grad_norm": 3.515625, "learning_rate": 8.296459656475413e-07, "loss": 0.8504141569137573, "step": 1144 }, { "epoch": 1.672992700729927, "grad_norm": 5.78125, "learning_rate": 8.240058215888998e-07, "loss": 1.3289515972137451, "step": 1146 }, { "epoch": 1.6759124087591242, "grad_norm": 6.9375, "learning_rate": 8.184105995494998e-07, "loss": 0.9470740556716919, "step": 1148 }, { "epoch": 1.6788321167883211, "grad_norm": 3.359375, "learning_rate": 8.128604300326812e-07, "loss": 1.352350115776062, "step": 1150 }, { "epoch": 1.6817518248175183, "grad_norm": 4.78125, "learning_rate": 8.073554424909755e-07, "loss": 1.3660526275634766, "step": 1152 }, { "epoch": 1.6846715328467154, "grad_norm": 2.46875, "learning_rate": 8.01895765323087e-07, "loss": 1.2722463607788086, "step": 1154 }, { "epoch": 1.6875912408759124, "grad_norm": 6.21875, "learning_rate": 7.964815258708971e-07, "loss": 1.13301420211792, "step": 1156 }, { "epoch": 1.6905109489051093, "grad_norm": 2.03125, "learning_rate": 7.911128504164947e-07, "loss": 1.3945411443710327, "step": 1158 }, { "epoch": 1.6934306569343067, "grad_norm": 1.7421875, "learning_rate": 7.857898641792322e-07, "loss": 1.1629891395568848, "step": 1160 }, { "epoch": 1.6963503649635037, "grad_norm": 2.09375, "learning_rate": 7.805126913128018e-07, "loss": 1.1993281841278076, "step": 1162 }, { "epoch": 1.6992700729927006, "grad_norm": 3.0625, "learning_rate": 7.752814549023437e-07, "loss": 1.4611374139785767, "step": 1164 }, { "epoch": 1.7021897810218978, "grad_norm": 4.625, "learning_rate": 7.700962769615704e-07, "loss": 1.1919968128204346, "step": 1166 }, { "epoch": 1.705109489051095, "grad_norm": 2.515625, "learning_rate": 7.649572784299255e-07, "loss": 1.2250781059265137, "step": 1168 }, { "epoch": 1.7080291970802919, "grad_norm": 8.1875, "learning_rate": 7.598645791697601e-07, "loss": 1.3479260206222534, "step": 1170 }, { "epoch": 1.710948905109489, "grad_norm": 4.25, "learning_rate": 7.548182979635389e-07, "loss": 1.3197946548461914, "step": 1172 }, { "epoch": 1.7138686131386862, "grad_norm": 8.6875, "learning_rate": 7.49818552511068e-07, "loss": 1.1691796779632568, "step": 1174 }, { "epoch": 1.7167883211678832, "grad_norm": 3.203125, "learning_rate": 7.448654594267496e-07, "loss": 1.2978925704956055, "step": 1176 }, { "epoch": 1.7197080291970803, "grad_norm": 2.96875, "learning_rate": 7.399591342368644e-07, "loss": 1.174210786819458, "step": 1178 }, { "epoch": 1.7226277372262775, "grad_norm": 4.625, "learning_rate": 7.350996913768743e-07, "loss": 1.2740840911865234, "step": 1180 }, { "epoch": 1.7255474452554744, "grad_norm": 8.0625, "learning_rate": 7.302872441887562e-07, "loss": 1.1019668579101562, "step": 1182 }, { "epoch": 1.7284671532846714, "grad_norm": 2.84375, "learning_rate": 7.255219049183552e-07, "loss": 1.3885023593902588, "step": 1184 }, { "epoch": 1.7313868613138688, "grad_norm": 5.625, "learning_rate": 7.208037847127683e-07, "loss": 1.5192725658416748, "step": 1186 }, { "epoch": 1.7343065693430657, "grad_norm": 6.625, "learning_rate": 7.161329936177522e-07, "loss": 1.3260494470596313, "step": 1188 }, { "epoch": 1.7372262773722627, "grad_norm": 3.375, "learning_rate": 7.115096405751567e-07, "loss": 1.3762927055358887, "step": 1190 }, { "epoch": 1.7401459854014598, "grad_norm": 1.8515625, "learning_rate": 7.069338334203818e-07, "loss": 1.0026099681854248, "step": 1192 }, { "epoch": 1.743065693430657, "grad_norm": 1.1015625, "learning_rate": 7.024056788798658e-07, "loss": 1.1264629364013672, "step": 1194 }, { "epoch": 1.745985401459854, "grad_norm": 16.75, "learning_rate": 6.979252825685927e-07, "loss": 1.5443601608276367, "step": 1196 }, { "epoch": 1.748905109489051, "grad_norm": 1.8671875, "learning_rate": 6.934927489876312e-07, "loss": 1.0794442892074585, "step": 1198 }, { "epoch": 1.7518248175182483, "grad_norm": 6.90625, "learning_rate": 6.891081815216958e-07, "loss": 1.348907470703125, "step": 1200 }, { "epoch": 1.7547445255474452, "grad_norm": 3.140625, "learning_rate": 6.847716824367369e-07, "loss": 1.3414909839630127, "step": 1202 }, { "epoch": 1.7576642335766424, "grad_norm": 4.59375, "learning_rate": 6.804833528775531e-07, "loss": 1.4073083400726318, "step": 1204 }, { "epoch": 1.7605839416058395, "grad_norm": 3.671875, "learning_rate": 6.762432928654358e-07, "loss": 0.8366962671279907, "step": 1206 }, { "epoch": 1.7635036496350365, "grad_norm": 5.53125, "learning_rate": 6.720516012958325e-07, "loss": 1.3547214269638062, "step": 1208 }, { "epoch": 1.7664233576642334, "grad_norm": 5.21875, "learning_rate": 6.679083759360433e-07, "loss": 1.6114599704742432, "step": 1210 }, { "epoch": 1.7693430656934308, "grad_norm": 4.5, "learning_rate": 6.638137134229375e-07, "loss": 1.5248315334320068, "step": 1212 }, { "epoch": 1.7722627737226277, "grad_norm": 3.6875, "learning_rate": 6.597677092607025e-07, "loss": 1.093032956123352, "step": 1214 }, { "epoch": 1.7751824817518247, "grad_norm": 4.5, "learning_rate": 6.557704578186146e-07, "loss": 1.408461093902588, "step": 1216 }, { "epoch": 1.7781021897810219, "grad_norm": 9.9375, "learning_rate": 6.518220523288382e-07, "loss": 1.3268358707427979, "step": 1218 }, { "epoch": 1.781021897810219, "grad_norm": 4.75, "learning_rate": 6.479225848842523e-07, "loss": 1.544386386871338, "step": 1220 }, { "epoch": 1.783941605839416, "grad_norm": 5.9375, "learning_rate": 6.440721464362998e-07, "loss": 1.4272065162658691, "step": 1222 }, { "epoch": 1.7868613138686131, "grad_norm": 3.515625, "learning_rate": 6.402708267928694e-07, "loss": 1.3150466680526733, "step": 1224 }, { "epoch": 1.7897810218978103, "grad_norm": 5.0, "learning_rate": 6.365187146161991e-07, "loss": 1.2979998588562012, "step": 1226 }, { "epoch": 1.7927007299270072, "grad_norm": 4.75, "learning_rate": 6.32815897420809e-07, "loss": 1.6841963529586792, "step": 1228 }, { "epoch": 1.7956204379562044, "grad_norm": 5.0, "learning_rate": 6.29162461571459e-07, "loss": 1.6227900981903076, "step": 1230 }, { "epoch": 1.7985401459854016, "grad_norm": 11.6875, "learning_rate": 6.25558492281135e-07, "loss": 1.4919426441192627, "step": 1232 }, { "epoch": 1.8014598540145985, "grad_norm": 4.8125, "learning_rate": 6.220040736090617e-07, "loss": 1.3797836303710938, "step": 1234 }, { "epoch": 1.8043795620437955, "grad_norm": 4.09375, "learning_rate": 6.18499288458743e-07, "loss": 1.6902371644973755, "step": 1236 }, { "epoch": 1.8072992700729928, "grad_norm": 2.453125, "learning_rate": 6.150442185760258e-07, "loss": 1.2298048734664917, "step": 1238 }, { "epoch": 1.8102189781021898, "grad_norm": 4.53125, "learning_rate": 6.116389445471948e-07, "loss": 1.3514063358306885, "step": 1240 }, { "epoch": 1.8131386861313867, "grad_norm": 3.828125, "learning_rate": 6.082835457970935e-07, "loss": 1.3649213314056396, "step": 1242 }, { "epoch": 1.816058394160584, "grad_norm": 4.15625, "learning_rate": 6.0497810058727e-07, "loss": 1.3873786926269531, "step": 1244 }, { "epoch": 1.818978102189781, "grad_norm": 5.21875, "learning_rate": 6.017226860141535e-07, "loss": 1.6073391437530518, "step": 1246 }, { "epoch": 1.821897810218978, "grad_norm": 2.90625, "learning_rate": 5.985173780072558e-07, "loss": 1.333566427230835, "step": 1248 }, { "epoch": 1.8248175182481752, "grad_norm": 3.0625, "learning_rate": 5.953622513273977e-07, "loss": 1.3585089445114136, "step": 1250 }, { "epoch": 1.8277372262773723, "grad_norm": 3.953125, "learning_rate": 5.92257379564969e-07, "loss": 1.195847749710083, "step": 1252 }, { "epoch": 1.8306569343065693, "grad_norm": 4.84375, "learning_rate": 5.892028351382101e-07, "loss": 1.4418195486068726, "step": 1254 }, { "epoch": 1.8335766423357664, "grad_norm": 4.09375, "learning_rate": 5.861986892915227e-07, "loss": 1.384018063545227, "step": 1256 }, { "epoch": 1.8364963503649636, "grad_norm": 9.4375, "learning_rate": 5.832450120938093e-07, "loss": 1.3380024433135986, "step": 1258 }, { "epoch": 1.8394160583941606, "grad_norm": 6.46875, "learning_rate": 5.803418724368373e-07, "loss": 1.3088436126708984, "step": 1260 }, { "epoch": 1.8423357664233575, "grad_norm": 9.9375, "learning_rate": 5.774893380336338e-07, "loss": 1.5858633518218994, "step": 1262 }, { "epoch": 1.845255474452555, "grad_norm": 6.375, "learning_rate": 5.746874754169053e-07, "loss": 1.5293078422546387, "step": 1264 }, { "epoch": 1.8481751824817518, "grad_norm": 2.921875, "learning_rate": 5.719363499374861e-07, "loss": 1.1518256664276123, "step": 1266 }, { "epoch": 1.8510948905109488, "grad_norm": 7.6875, "learning_rate": 5.692360257628144e-07, "loss": 1.3224802017211914, "step": 1268 }, { "epoch": 1.854014598540146, "grad_norm": 4.28125, "learning_rate": 5.665865658754341e-07, "loss": 1.2233679294586182, "step": 1270 }, { "epoch": 1.856934306569343, "grad_norm": 6.34375, "learning_rate": 5.639880320715284e-07, "loss": 1.4993672370910645, "step": 1272 }, { "epoch": 1.85985401459854, "grad_norm": 3.703125, "learning_rate": 5.614404849594762e-07, "loss": 1.3802194595336914, "step": 1274 }, { "epoch": 1.8627737226277372, "grad_norm": 2.5625, "learning_rate": 5.589439839584404e-07, "loss": 1.0489559173583984, "step": 1276 }, { "epoch": 1.8656934306569344, "grad_norm": 1.40625, "learning_rate": 5.564985872969791e-07, "loss": 1.2326107025146484, "step": 1278 }, { "epoch": 1.8686131386861313, "grad_norm": 5.4375, "learning_rate": 5.541043520116912e-07, "loss": 1.1945993900299072, "step": 1280 }, { "epoch": 1.8715328467153285, "grad_norm": 2.625, "learning_rate": 5.517613339458832e-07, "loss": 1.2813007831573486, "step": 1282 }, { "epoch": 1.8744525547445257, "grad_norm": 4.46875, "learning_rate": 5.494695877482676e-07, "loss": 1.1684314012527466, "step": 1284 }, { "epoch": 1.8773722627737226, "grad_norm": 3.71875, "learning_rate": 5.472291668716893e-07, "loss": 1.222388505935669, "step": 1286 }, { "epoch": 1.8802919708029195, "grad_norm": 2.984375, "learning_rate": 5.450401235718762e-07, "loss": 1.2156729698181152, "step": 1288 }, { "epoch": 1.883211678832117, "grad_norm": 5.96875, "learning_rate": 5.42902508906224e-07, "loss": 1.311574935913086, "step": 1290 }, { "epoch": 1.8861313868613139, "grad_norm": 7.96875, "learning_rate": 5.408163727326021e-07, "loss": 1.34036123752594, "step": 1292 }, { "epoch": 1.8890510948905108, "grad_norm": 3.640625, "learning_rate": 5.387817637081928e-07, "loss": 1.1132798194885254, "step": 1294 }, { "epoch": 1.891970802919708, "grad_norm": 3.359375, "learning_rate": 5.367987292883554e-07, "loss": 1.3646128177642822, "step": 1296 }, { "epoch": 1.8948905109489051, "grad_norm": 5.1875, "learning_rate": 5.348673157255195e-07, "loss": 1.4554338455200195, "step": 1298 }, { "epoch": 1.897810218978102, "grad_norm": 3.96875, "learning_rate": 5.329875680681065e-07, "loss": 1.4109296798706055, "step": 1300 }, { "epoch": 1.9007299270072993, "grad_norm": 4.875, "learning_rate": 5.311595301594783e-07, "loss": 1.1961219310760498, "step": 1302 }, { "epoch": 1.9036496350364964, "grad_norm": 2.921875, "learning_rate": 5.293832446369158e-07, "loss": 0.6657427549362183, "step": 1304 }, { "epoch": 1.9065693430656934, "grad_norm": 10.4375, "learning_rate": 5.276587529306236e-07, "loss": 1.397131323814392, "step": 1306 }, { "epoch": 1.9094890510948905, "grad_norm": 6.5, "learning_rate": 5.25986095262763e-07, "loss": 1.323398470878601, "step": 1308 }, { "epoch": 1.9124087591240877, "grad_norm": 3.203125, "learning_rate": 5.243653106465157e-07, "loss": 1.3060777187347412, "step": 1310 }, { "epoch": 1.9153284671532846, "grad_norm": 5.71875, "learning_rate": 5.227964368851721e-07, "loss": 1.5433318614959717, "step": 1312 }, { "epoch": 1.9182481751824818, "grad_norm": 3.359375, "learning_rate": 5.212795105712508e-07, "loss": 1.4788509607315063, "step": 1314 }, { "epoch": 1.921167883211679, "grad_norm": 4.8125, "learning_rate": 5.198145670856438e-07, "loss": 1.3976120948791504, "step": 1316 }, { "epoch": 1.924087591240876, "grad_norm": 2.0625, "learning_rate": 5.184016405967931e-07, "loss": 1.1872693300247192, "step": 1318 }, { "epoch": 1.9270072992700729, "grad_norm": 2.296875, "learning_rate": 5.170407640598921e-07, "loss": 1.1601970195770264, "step": 1320 }, { "epoch": 1.92992700729927, "grad_norm": 3.5625, "learning_rate": 5.157319692161178e-07, "loss": 1.205195426940918, "step": 1322 }, { "epoch": 1.9328467153284672, "grad_norm": 3.734375, "learning_rate": 5.144752865918901e-07, "loss": 1.1591906547546387, "step": 1324 }, { "epoch": 1.9357664233576641, "grad_norm": 3.421875, "learning_rate": 5.132707454981602e-07, "loss": 1.3498120307922363, "step": 1326 }, { "epoch": 1.9386861313868613, "grad_norm": 3.796875, "learning_rate": 5.121183740297261e-07, "loss": 1.3916034698486328, "step": 1328 }, { "epoch": 1.9416058394160585, "grad_norm": 17.375, "learning_rate": 5.110181990645788e-07, "loss": 1.2117153406143188, "step": 1330 }, { "epoch": 1.9445255474452554, "grad_norm": 1.734375, "learning_rate": 5.099702462632737e-07, "loss": 1.19834566116333, "step": 1332 }, { "epoch": 1.9474452554744526, "grad_norm": 10.0625, "learning_rate": 5.089745400683333e-07, "loss": 0.8368179798126221, "step": 1334 }, { "epoch": 1.9503649635036497, "grad_norm": 5.625, "learning_rate": 5.080311037036767e-07, "loss": 1.314239263534546, "step": 1336 }, { "epoch": 1.9532846715328467, "grad_norm": 1.65625, "learning_rate": 5.071399591740777e-07, "loss": 1.216627597808838, "step": 1338 }, { "epoch": 1.9562043795620438, "grad_norm": 6.375, "learning_rate": 5.063011272646521e-07, "loss": 1.2274556159973145, "step": 1340 }, { "epoch": 1.959124087591241, "grad_norm": 2.546875, "learning_rate": 5.055146275403725e-07, "loss": 1.4812201261520386, "step": 1342 }, { "epoch": 1.962043795620438, "grad_norm": 5.71875, "learning_rate": 5.047804783456117e-07, "loss": 1.215821623802185, "step": 1344 }, { "epoch": 1.964963503649635, "grad_norm": 4.71875, "learning_rate": 5.040986968037157e-07, "loss": 1.318119764328003, "step": 1346 }, { "epoch": 1.967883211678832, "grad_norm": 2.953125, "learning_rate": 5.034692988166033e-07, "loss": 1.2136964797973633, "step": 1348 }, { "epoch": 1.9708029197080292, "grad_norm": 4.125, "learning_rate": 5.028922990643963e-07, "loss": 1.3341786861419678, "step": 1350 }, { "epoch": 1.9737226277372262, "grad_norm": 3.75, "learning_rate": 5.023677110050759e-07, "loss": 1.4188188314437866, "step": 1352 }, { "epoch": 1.9766423357664233, "grad_norm": 3.421875, "learning_rate": 5.018955468741701e-07, "loss": 1.608628511428833, "step": 1354 }, { "epoch": 1.9795620437956205, "grad_norm": 3.359375, "learning_rate": 5.014758176844665e-07, "loss": 1.5936325788497925, "step": 1356 }, { "epoch": 1.9824817518248175, "grad_norm": 2.796875, "learning_rate": 5.011085332257579e-07, "loss": 1.178612232208252, "step": 1358 }, { "epoch": 1.9854014598540146, "grad_norm": 7.1875, "learning_rate": 5.007937020646117e-07, "loss": 1.1231637001037598, "step": 1360 }, { "epoch": 1.9883211678832118, "grad_norm": 1.90625, "learning_rate": 5.005313315441716e-07, "loss": 0.6363063454627991, "step": 1362 }, { "epoch": 1.9912408759124087, "grad_norm": 5.5, "learning_rate": 5.003214277839851e-07, "loss": 1.3855026960372925, "step": 1364 }, { "epoch": 1.994160583941606, "grad_norm": 5.6875, "learning_rate": 5.00163995679862e-07, "loss": 1.346792459487915, "step": 1366 }, { "epoch": 1.997080291970803, "grad_norm": 8.1875, "learning_rate": 5.000590389037593e-07, "loss": 1.3148702383041382, "step": 1368 }, { "epoch": 2.0, "grad_norm": 4.0625, "learning_rate": 5.00006559903696e-07, "loss": 1.6425683498382568, "step": 1370 }, { "epoch": 2.0, "step": 1370, "total_flos": 1.984544544032555e+18, "train_loss": 1.409229011779284, "train_runtime": 8212.4061, "train_samples_per_second": 2.669, "train_steps_per_second": 0.167 } ], "logging_steps": 2, "max_steps": 1370, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.984544544032555e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }