{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1410, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00425531914893617, "grad_norm": 0.9765625, "learning_rate": 1.6901408450704225e-07, "loss": 1.472063422203064, "step": 2 }, { "epoch": 0.00851063829787234, "grad_norm": 0.94140625, "learning_rate": 5.070422535211268e-07, "loss": 1.9255280494689941, "step": 4 }, { "epoch": 0.01276595744680851, "grad_norm": 1.9296875, "learning_rate": 8.450704225352114e-07, "loss": 1.9632502794265747, "step": 6 }, { "epoch": 0.01702127659574468, "grad_norm": 1.34375, "learning_rate": 1.1830985915492958e-06, "loss": 1.6374425888061523, "step": 8 }, { "epoch": 0.02127659574468085, "grad_norm": 1.1953125, "learning_rate": 1.5211267605633803e-06, "loss": 1.959162950515747, "step": 10 }, { "epoch": 0.02553191489361702, "grad_norm": 0.9140625, "learning_rate": 1.8591549295774647e-06, "loss": 1.4726247787475586, "step": 12 }, { "epoch": 0.029787234042553193, "grad_norm": 1.765625, "learning_rate": 2.1971830985915494e-06, "loss": 2.0769670009613037, "step": 14 }, { "epoch": 0.03404255319148936, "grad_norm": 0.953125, "learning_rate": 2.535211267605634e-06, "loss": 1.9118707180023193, "step": 16 }, { "epoch": 0.03829787234042553, "grad_norm": 1.171875, "learning_rate": 2.8732394366197183e-06, "loss": 1.7807828187942505, "step": 18 }, { "epoch": 0.0425531914893617, "grad_norm": 1.5390625, "learning_rate": 3.2112676056338028e-06, "loss": 1.9391090869903564, "step": 20 }, { "epoch": 0.04680851063829787, "grad_norm": 0.72265625, "learning_rate": 3.549295774647887e-06, "loss": 1.6522634029388428, "step": 22 }, { "epoch": 0.05106382978723404, "grad_norm": 1.1328125, "learning_rate": 3.887323943661972e-06, "loss": 1.692237138748169, "step": 24 }, { "epoch": 0.05531914893617021, "grad_norm": 1.1015625, "learning_rate": 4.225352112676056e-06, "loss": 1.443329930305481, "step": 26 }, { "epoch": 0.059574468085106386, "grad_norm": 1.0625, "learning_rate": 4.563380281690141e-06, "loss": 1.758739948272705, "step": 28 }, { "epoch": 0.06382978723404255, "grad_norm": 0.8515625, "learning_rate": 4.901408450704226e-06, "loss": 1.6877835988998413, "step": 30 }, { "epoch": 0.06808510638297872, "grad_norm": 1.5078125, "learning_rate": 5.2394366197183095e-06, "loss": 1.468690037727356, "step": 32 }, { "epoch": 0.07234042553191489, "grad_norm": 0.64453125, "learning_rate": 5.577464788732395e-06, "loss": 1.6828500032424927, "step": 34 }, { "epoch": 0.07659574468085106, "grad_norm": 1.0546875, "learning_rate": 5.915492957746479e-06, "loss": 1.6752516031265259, "step": 36 }, { "epoch": 0.08085106382978724, "grad_norm": 2.984375, "learning_rate": 6.253521126760563e-06, "loss": 1.9001795053482056, "step": 38 }, { "epoch": 0.0851063829787234, "grad_norm": 0.9296875, "learning_rate": 6.591549295774649e-06, "loss": 1.6414787769317627, "step": 40 }, { "epoch": 0.08936170212765958, "grad_norm": 0.8828125, "learning_rate": 6.929577464788733e-06, "loss": 1.3303271532058716, "step": 42 }, { "epoch": 0.09361702127659574, "grad_norm": 0.80078125, "learning_rate": 7.267605633802817e-06, "loss": 1.5457786321640015, "step": 44 }, { "epoch": 0.09787234042553192, "grad_norm": 0.54296875, "learning_rate": 7.605633802816902e-06, "loss": 1.4271644353866577, "step": 46 }, { "epoch": 0.10212765957446808, "grad_norm": 0.6484375, "learning_rate": 7.943661971830987e-06, "loss": 1.5979524850845337, "step": 48 }, { "epoch": 0.10638297872340426, "grad_norm": 1.0234375, "learning_rate": 8.28169014084507e-06, "loss": 1.6684672832489014, "step": 50 }, { "epoch": 0.11063829787234042, "grad_norm": 0.7109375, "learning_rate": 8.619718309859156e-06, "loss": 1.3746291399002075, "step": 52 }, { "epoch": 0.1148936170212766, "grad_norm": 0.5546875, "learning_rate": 8.95774647887324e-06, "loss": 1.4159908294677734, "step": 54 }, { "epoch": 0.11914893617021277, "grad_norm": 0.58984375, "learning_rate": 9.295774647887323e-06, "loss": 1.2559518814086914, "step": 56 }, { "epoch": 0.12340425531914893, "grad_norm": 0.6484375, "learning_rate": 9.63380281690141e-06, "loss": 1.4071341753005981, "step": 58 }, { "epoch": 0.1276595744680851, "grad_norm": 0.74609375, "learning_rate": 9.971830985915494e-06, "loss": 1.325224757194519, "step": 60 }, { "epoch": 0.13191489361702127, "grad_norm": 0.94921875, "learning_rate": 1.0309859154929577e-05, "loss": 1.2854632139205933, "step": 62 }, { "epoch": 0.13617021276595745, "grad_norm": 0.87890625, "learning_rate": 1.0647887323943662e-05, "loss": 1.0856443643569946, "step": 64 }, { "epoch": 0.14042553191489363, "grad_norm": 0.4296875, "learning_rate": 1.0985915492957746e-05, "loss": 1.4248228073120117, "step": 66 }, { "epoch": 0.14468085106382977, "grad_norm": 0.50390625, "learning_rate": 1.1323943661971831e-05, "loss": 1.3485311269760132, "step": 68 }, { "epoch": 0.14893617021276595, "grad_norm": 0.6328125, "learning_rate": 1.1661971830985917e-05, "loss": 1.3656905889511108, "step": 70 }, { "epoch": 0.15319148936170213, "grad_norm": 0.6171875, "learning_rate": 1.2e-05, "loss": 1.4325069189071655, "step": 72 }, { "epoch": 0.1574468085106383, "grad_norm": 0.69921875, "learning_rate": 1.1999947154376356e-05, "loss": 1.541415810585022, "step": 74 }, { "epoch": 0.16170212765957448, "grad_norm": 0.53515625, "learning_rate": 1.199978861866902e-05, "loss": 1.385392665863037, "step": 76 }, { "epoch": 0.16595744680851063, "grad_norm": 0.609375, "learning_rate": 1.19995243963688e-05, "loss": 1.2694331407546997, "step": 78 }, { "epoch": 0.1702127659574468, "grad_norm": 1.125, "learning_rate": 1.1999154493293607e-05, "loss": 1.5753132104873657, "step": 80 }, { "epoch": 0.17446808510638298, "grad_norm": 0.546875, "learning_rate": 1.1998678917588341e-05, "loss": 1.2333686351776123, "step": 82 }, { "epoch": 0.17872340425531916, "grad_norm": 0.63671875, "learning_rate": 1.1998097679724704e-05, "loss": 1.2822571992874146, "step": 84 }, { "epoch": 0.1829787234042553, "grad_norm": 0.5546875, "learning_rate": 1.1997410792500985e-05, "loss": 1.3188749551773071, "step": 86 }, { "epoch": 0.18723404255319148, "grad_norm": 0.71875, "learning_rate": 1.1996618271041757e-05, "loss": 1.3399384021759033, "step": 88 }, { "epoch": 0.19148936170212766, "grad_norm": 0.75390625, "learning_rate": 1.1995720132797555e-05, "loss": 1.3193027973175049, "step": 90 }, { "epoch": 0.19574468085106383, "grad_norm": 0.8515625, "learning_rate": 1.1994716397544498e-05, "loss": 1.30392324924469, "step": 92 }, { "epoch": 0.2, "grad_norm": 0.45703125, "learning_rate": 1.1993607087383841e-05, "loss": 1.1891350746154785, "step": 94 }, { "epoch": 0.20425531914893616, "grad_norm": 0.51171875, "learning_rate": 1.1992392226741494e-05, "loss": 1.2335644960403442, "step": 96 }, { "epoch": 0.20851063829787234, "grad_norm": 0.48046875, "learning_rate": 1.1991071842367492e-05, "loss": 1.3029327392578125, "step": 98 }, { "epoch": 0.2127659574468085, "grad_norm": 0.55859375, "learning_rate": 1.1989645963335381e-05, "loss": 1.2645999193191528, "step": 100 }, { "epoch": 0.2170212765957447, "grad_norm": 0.83203125, "learning_rate": 1.1988114621041614e-05, "loss": 1.2268767356872559, "step": 102 }, { "epoch": 0.22127659574468084, "grad_norm": 0.90234375, "learning_rate": 1.1986477849204828e-05, "loss": 1.1907193660736084, "step": 104 }, { "epoch": 0.225531914893617, "grad_norm": 0.64453125, "learning_rate": 1.1984735683865123e-05, "loss": 1.31586754322052, "step": 106 }, { "epoch": 0.2297872340425532, "grad_norm": 1.0703125, "learning_rate": 1.1982888163383247e-05, "loss": 1.299729347229004, "step": 108 }, { "epoch": 0.23404255319148937, "grad_norm": 1.84375, "learning_rate": 1.1980935328439775e-05, "loss": 1.708440899848938, "step": 110 }, { "epoch": 0.23829787234042554, "grad_norm": 0.73828125, "learning_rate": 1.1978877222034202e-05, "loss": 1.2829785346984863, "step": 112 }, { "epoch": 0.2425531914893617, "grad_norm": 0.68359375, "learning_rate": 1.197671388948399e-05, "loss": 1.272111415863037, "step": 114 }, { "epoch": 0.24680851063829787, "grad_norm": 0.421875, "learning_rate": 1.1974445378423578e-05, "loss": 1.3535809516906738, "step": 116 }, { "epoch": 0.251063829787234, "grad_norm": 0.85546875, "learning_rate": 1.1972071738803339e-05, "loss": 1.2550489902496338, "step": 118 }, { "epoch": 0.2553191489361702, "grad_norm": 1.0859375, "learning_rate": 1.1969593022888462e-05, "loss": 1.2029892206192017, "step": 120 }, { "epoch": 0.25957446808510637, "grad_norm": 0.80078125, "learning_rate": 1.1967009285257822e-05, "loss": 1.0597739219665527, "step": 122 }, { "epoch": 0.26382978723404255, "grad_norm": 0.5546875, "learning_rate": 1.1964320582802759e-05, "loss": 1.2965384721755981, "step": 124 }, { "epoch": 0.2680851063829787, "grad_norm": 0.3515625, "learning_rate": 1.196152697472584e-05, "loss": 1.3368679285049438, "step": 126 }, { "epoch": 0.2723404255319149, "grad_norm": 0.79296875, "learning_rate": 1.1958628522539549e-05, "loss": 1.3335758447647095, "step": 128 }, { "epoch": 0.2765957446808511, "grad_norm": 0.470703125, "learning_rate": 1.1955625290064935e-05, "loss": 1.3016529083251953, "step": 130 }, { "epoch": 0.28085106382978725, "grad_norm": 0.8671875, "learning_rate": 1.1952517343430199e-05, "loss": 1.253875494003296, "step": 132 }, { "epoch": 0.2851063829787234, "grad_norm": 0.478515625, "learning_rate": 1.1949304751069256e-05, "loss": 1.2634450197219849, "step": 134 }, { "epoch": 0.28936170212765955, "grad_norm": 1.2109375, "learning_rate": 1.1945987583720202e-05, "loss": 1.294474482536316, "step": 136 }, { "epoch": 0.2936170212765957, "grad_norm": 0.85546875, "learning_rate": 1.194256591442378e-05, "loss": 1.2694545984268188, "step": 138 }, { "epoch": 0.2978723404255319, "grad_norm": 0.484375, "learning_rate": 1.1939039818521758e-05, "loss": 1.4072679281234741, "step": 140 }, { "epoch": 0.3021276595744681, "grad_norm": 0.58203125, "learning_rate": 1.1935409373655282e-05, "loss": 1.3019527196884155, "step": 142 }, { "epoch": 0.30638297872340425, "grad_norm": 1.125, "learning_rate": 1.1931674659763148e-05, "loss": 1.4703279733657837, "step": 144 }, { "epoch": 0.31063829787234043, "grad_norm": 1.453125, "learning_rate": 1.1927835759080058e-05, "loss": 1.1757651567459106, "step": 146 }, { "epoch": 0.3148936170212766, "grad_norm": 0.58203125, "learning_rate": 1.1923892756134807e-05, "loss": 1.2418992519378662, "step": 148 }, { "epoch": 0.3191489361702128, "grad_norm": 0.55078125, "learning_rate": 1.1919845737748413e-05, "loss": 1.1974143981933594, "step": 150 }, { "epoch": 0.32340425531914896, "grad_norm": 0.8359375, "learning_rate": 1.1915694793032215e-05, "loss": 1.3293455839157104, "step": 152 }, { "epoch": 0.3276595744680851, "grad_norm": 0.66015625, "learning_rate": 1.1911440013385906e-05, "loss": 1.1985448598861694, "step": 154 }, { "epoch": 0.33191489361702126, "grad_norm": 0.46875, "learning_rate": 1.1907081492495521e-05, "loss": 1.2568351030349731, "step": 156 }, { "epoch": 0.33617021276595743, "grad_norm": 0.453125, "learning_rate": 1.1902619326331371e-05, "loss": 1.2663094997406006, "step": 158 }, { "epoch": 0.3404255319148936, "grad_norm": 0.384765625, "learning_rate": 1.1898053613145944e-05, "loss": 1.1971551179885864, "step": 160 }, { "epoch": 0.3446808510638298, "grad_norm": 0.75390625, "learning_rate": 1.1893384453471717e-05, "loss": 1.2108319997787476, "step": 162 }, { "epoch": 0.34893617021276596, "grad_norm": 0.423828125, "learning_rate": 1.1888611950118964e-05, "loss": 1.2176121473312378, "step": 164 }, { "epoch": 0.35319148936170214, "grad_norm": 0.546875, "learning_rate": 1.188373620817349e-05, "loss": 1.2852199077606201, "step": 166 }, { "epoch": 0.3574468085106383, "grad_norm": 0.90234375, "learning_rate": 1.1878757334994293e-05, "loss": 1.137981653213501, "step": 168 }, { "epoch": 0.3617021276595745, "grad_norm": 0.490234375, "learning_rate": 1.1873675440211238e-05, "loss": 1.2986195087432861, "step": 170 }, { "epoch": 0.3659574468085106, "grad_norm": 0.54296875, "learning_rate": 1.1868490635722617e-05, "loss": 1.2511855363845825, "step": 172 }, { "epoch": 0.3702127659574468, "grad_norm": 0.365234375, "learning_rate": 1.186320303569269e-05, "loss": 1.2008732557296753, "step": 174 }, { "epoch": 0.37446808510638296, "grad_norm": 0.58203125, "learning_rate": 1.185781275654917e-05, "loss": 1.3959091901779175, "step": 176 }, { "epoch": 0.37872340425531914, "grad_norm": 0.53125, "learning_rate": 1.1852319916980676e-05, "loss": 1.3956475257873535, "step": 178 }, { "epoch": 0.3829787234042553, "grad_norm": 0.63671875, "learning_rate": 1.1846724637934086e-05, "loss": 1.1432154178619385, "step": 180 }, { "epoch": 0.3872340425531915, "grad_norm": 0.4140625, "learning_rate": 1.184102704261191e-05, "loss": 1.198095679283142, "step": 182 }, { "epoch": 0.39148936170212767, "grad_norm": 0.52734375, "learning_rate": 1.1835227256469556e-05, "loss": 1.126910924911499, "step": 184 }, { "epoch": 0.39574468085106385, "grad_norm": 0.546875, "learning_rate": 1.1829325407212569e-05, "loss": 1.340002179145813, "step": 186 }, { "epoch": 0.4, "grad_norm": 0.56640625, "learning_rate": 1.1823321624793831e-05, "loss": 1.2044755220413208, "step": 188 }, { "epoch": 0.40425531914893614, "grad_norm": 0.42578125, "learning_rate": 1.1817216041410678e-05, "loss": 1.1999846696853638, "step": 190 }, { "epoch": 0.4085106382978723, "grad_norm": 0.94140625, "learning_rate": 1.181100879150202e-05, "loss": 1.2849934101104736, "step": 192 }, { "epoch": 0.4127659574468085, "grad_norm": 0.3671875, "learning_rate": 1.180470001174535e-05, "loss": 1.3638895750045776, "step": 194 }, { "epoch": 0.41702127659574467, "grad_norm": 0.5390625, "learning_rate": 1.179828984105375e-05, "loss": 1.2097505331039429, "step": 196 }, { "epoch": 0.42127659574468085, "grad_norm": 1.140625, "learning_rate": 1.1791778420572834e-05, "loss": 1.2969235181808472, "step": 198 }, { "epoch": 0.425531914893617, "grad_norm": 0.70703125, "learning_rate": 1.1785165893677632e-05, "loss": 1.3074672222137451, "step": 200 }, { "epoch": 0.4297872340425532, "grad_norm": 0.57421875, "learning_rate": 1.1778452405969437e-05, "loss": 1.2175475358963013, "step": 202 }, { "epoch": 0.4340425531914894, "grad_norm": 1.0859375, "learning_rate": 1.1771638105272605e-05, "loss": 1.1964837312698364, "step": 204 }, { "epoch": 0.43829787234042555, "grad_norm": 0.455078125, "learning_rate": 1.176472314163129e-05, "loss": 1.2108904123306274, "step": 206 }, { "epoch": 0.4425531914893617, "grad_norm": 0.4609375, "learning_rate": 1.1757707667306142e-05, "loss": 1.2564092874526978, "step": 208 }, { "epoch": 0.44680851063829785, "grad_norm": 0.625, "learning_rate": 1.1750591836770963e-05, "loss": 1.2397825717926025, "step": 210 }, { "epoch": 0.451063829787234, "grad_norm": 0.58984375, "learning_rate": 1.1743375806709292e-05, "loss": 1.141276478767395, "step": 212 }, { "epoch": 0.4553191489361702, "grad_norm": 0.43359375, "learning_rate": 1.1736059736010964e-05, "loss": 1.2472527027130127, "step": 214 }, { "epoch": 0.4595744680851064, "grad_norm": 0.4921875, "learning_rate": 1.1728643785768619e-05, "loss": 1.2373621463775635, "step": 216 }, { "epoch": 0.46382978723404256, "grad_norm": 0.578125, "learning_rate": 1.1721128119274132e-05, "loss": 1.3174031972885132, "step": 218 }, { "epoch": 0.46808510638297873, "grad_norm": 1.0390625, "learning_rate": 1.171351290201504e-05, "loss": 1.4028608798980713, "step": 220 }, { "epoch": 0.4723404255319149, "grad_norm": 0.75390625, "learning_rate": 1.170579830167089e-05, "loss": 1.2434858083724976, "step": 222 }, { "epoch": 0.4765957446808511, "grad_norm": 0.60546875, "learning_rate": 1.1697984488109536e-05, "loss": 1.2289927005767822, "step": 224 }, { "epoch": 0.4808510638297872, "grad_norm": 0.43359375, "learning_rate": 1.1690071633383422e-05, "loss": 1.1950970888137817, "step": 226 }, { "epoch": 0.4851063829787234, "grad_norm": 0.423828125, "learning_rate": 1.168205991172577e-05, "loss": 1.398798942565918, "step": 228 }, { "epoch": 0.48936170212765956, "grad_norm": 0.50390625, "learning_rate": 1.1673949499546763e-05, "loss": 1.2393437623977661, "step": 230 }, { "epoch": 0.49361702127659574, "grad_norm": 0.75, "learning_rate": 1.166574057542964e-05, "loss": 1.2385178804397583, "step": 232 }, { "epoch": 0.4978723404255319, "grad_norm": 0.6875, "learning_rate": 1.165743332012679e-05, "loss": 1.4011635780334473, "step": 234 }, { "epoch": 0.502127659574468, "grad_norm": 0.52734375, "learning_rate": 1.1649027916555742e-05, "loss": 1.2445231676101685, "step": 236 }, { "epoch": 0.5063829787234042, "grad_norm": 0.53515625, "learning_rate": 1.1640524549795163e-05, "loss": 1.2868069410324097, "step": 238 }, { "epoch": 0.5106382978723404, "grad_norm": 0.369140625, "learning_rate": 1.1631923407080772e-05, "loss": 1.3375487327575684, "step": 240 }, { "epoch": 0.5148936170212766, "grad_norm": 0.62109375, "learning_rate": 1.1623224677801212e-05, "loss": 1.109569787979126, "step": 242 }, { "epoch": 0.5191489361702127, "grad_norm": 2.34375, "learning_rate": 1.1614428553493886e-05, "loss": 1.1656110286712646, "step": 244 }, { "epoch": 0.5234042553191489, "grad_norm": 0.95703125, "learning_rate": 1.160553522784075e-05, "loss": 1.159610629081726, "step": 246 }, { "epoch": 0.5276595744680851, "grad_norm": 0.474609375, "learning_rate": 1.1596544896664021e-05, "loss": 1.24387788772583, "step": 248 }, { "epoch": 0.5319148936170213, "grad_norm": 0.408203125, "learning_rate": 1.1587457757921896e-05, "loss": 1.324474811553955, "step": 250 }, { "epoch": 0.5361702127659574, "grad_norm": 0.4296875, "learning_rate": 1.1578274011704169e-05, "loss": 1.4482465982437134, "step": 252 }, { "epoch": 0.5404255319148936, "grad_norm": 0.88671875, "learning_rate": 1.1568993860227838e-05, "loss": 1.425924301147461, "step": 254 }, { "epoch": 0.5446808510638298, "grad_norm": 1.6796875, "learning_rate": 1.155961750783265e-05, "loss": 1.6282589435577393, "step": 256 }, { "epoch": 0.548936170212766, "grad_norm": 0.68359375, "learning_rate": 1.1550145160976607e-05, "loss": 1.294546127319336, "step": 258 }, { "epoch": 0.5531914893617021, "grad_norm": 0.416015625, "learning_rate": 1.1540577028231398e-05, "loss": 1.2809118032455444, "step": 260 }, { "epoch": 0.5574468085106383, "grad_norm": 1.0546875, "learning_rate": 1.1530913320277837e-05, "loss": 1.2208646535873413, "step": 262 }, { "epoch": 0.5617021276595745, "grad_norm": 0.55859375, "learning_rate": 1.1521154249901204e-05, "loss": 1.2243047952651978, "step": 264 }, { "epoch": 0.5659574468085107, "grad_norm": 0.640625, "learning_rate": 1.1511300031986567e-05, "loss": 1.325520634651184, "step": 266 }, { "epoch": 0.5702127659574469, "grad_norm": 0.50390625, "learning_rate": 1.1501350883514048e-05, "loss": 1.1810495853424072, "step": 268 }, { "epoch": 0.574468085106383, "grad_norm": 0.9140625, "learning_rate": 1.149130702355404e-05, "loss": 1.360308289527893, "step": 270 }, { "epoch": 0.5787234042553191, "grad_norm": 0.5859375, "learning_rate": 1.14811686732624e-05, "loss": 1.2189104557037354, "step": 272 }, { "epoch": 0.5829787234042553, "grad_norm": 0.66015625, "learning_rate": 1.1470936055875562e-05, "loss": 1.3855215311050415, "step": 274 }, { "epoch": 0.5872340425531914, "grad_norm": 0.44140625, "learning_rate": 1.1460609396705629e-05, "loss": 1.239030361175537, "step": 276 }, { "epoch": 0.5914893617021276, "grad_norm": 0.51171875, "learning_rate": 1.1450188923135407e-05, "loss": 1.2763073444366455, "step": 278 }, { "epoch": 0.5957446808510638, "grad_norm": 1.6796875, "learning_rate": 1.1439674864613413e-05, "loss": 1.1475056409835815, "step": 280 }, { "epoch": 0.6, "grad_norm": 0.91015625, "learning_rate": 1.14290674526488e-05, "loss": 1.3000105619430542, "step": 282 }, { "epoch": 0.6042553191489362, "grad_norm": 0.59765625, "learning_rate": 1.1418366920806277e-05, "loss": 1.2847286462783813, "step": 284 }, { "epoch": 0.6085106382978723, "grad_norm": 0.328125, "learning_rate": 1.1407573504700965e-05, "loss": 1.2533907890319824, "step": 286 }, { "epoch": 0.6127659574468085, "grad_norm": 0.447265625, "learning_rate": 1.1396687441993191e-05, "loss": 1.092968463897705, "step": 288 }, { "epoch": 0.6170212765957447, "grad_norm": 0.3984375, "learning_rate": 1.1385708972383283e-05, "loss": 1.4811941385269165, "step": 290 }, { "epoch": 0.6212765957446809, "grad_norm": 0.5234375, "learning_rate": 1.1374638337606272e-05, "loss": 1.2241995334625244, "step": 292 }, { "epoch": 0.625531914893617, "grad_norm": 0.39453125, "learning_rate": 1.1363475781426572e-05, "loss": 1.273016095161438, "step": 294 }, { "epoch": 0.6297872340425532, "grad_norm": 0.486328125, "learning_rate": 1.1352221549632619e-05, "loss": 1.3111282587051392, "step": 296 }, { "epoch": 0.6340425531914894, "grad_norm": 0.369140625, "learning_rate": 1.134087589003145e-05, "loss": 1.2370787858963013, "step": 298 }, { "epoch": 0.6382978723404256, "grad_norm": 0.9609375, "learning_rate": 1.132943905244326e-05, "loss": 1.2171998023986816, "step": 300 }, { "epoch": 0.6425531914893617, "grad_norm": 0.61328125, "learning_rate": 1.1317911288695888e-05, "loss": 1.3028873205184937, "step": 302 }, { "epoch": 0.6468085106382979, "grad_norm": 0.388671875, "learning_rate": 1.1306292852619274e-05, "loss": 1.2210191488265991, "step": 304 }, { "epoch": 0.6510638297872341, "grad_norm": 0.51953125, "learning_rate": 1.129458400003988e-05, "loss": 1.2221373319625854, "step": 306 }, { "epoch": 0.6553191489361702, "grad_norm": 0.7109375, "learning_rate": 1.1282784988775045e-05, "loss": 1.236470341682434, "step": 308 }, { "epoch": 0.6595744680851063, "grad_norm": 0.390625, "learning_rate": 1.1270896078627315e-05, "loss": 1.0521761178970337, "step": 310 }, { "epoch": 0.6638297872340425, "grad_norm": 0.859375, "learning_rate": 1.125891753137872e-05, "loss": 1.1648889780044556, "step": 312 }, { "epoch": 0.6680851063829787, "grad_norm": 0.484375, "learning_rate": 1.1246849610785009e-05, "loss": 1.2399919033050537, "step": 314 }, { "epoch": 0.6723404255319149, "grad_norm": 0.69140625, "learning_rate": 1.1234692582569843e-05, "loss": 1.2077488899230957, "step": 316 }, { "epoch": 0.676595744680851, "grad_norm": 1.234375, "learning_rate": 1.1222446714418947e-05, "loss": 1.4379267692565918, "step": 318 }, { "epoch": 0.6808510638297872, "grad_norm": 0.349609375, "learning_rate": 1.1210112275974216e-05, "loss": 1.2180498838424683, "step": 320 }, { "epoch": 0.6851063829787234, "grad_norm": 0.49609375, "learning_rate": 1.1197689538827766e-05, "loss": 1.190024971961975, "step": 322 }, { "epoch": 0.6893617021276596, "grad_norm": 0.78515625, "learning_rate": 1.1185178776515973e-05, "loss": 1.2704949378967285, "step": 324 }, { "epoch": 0.6936170212765957, "grad_norm": 0.50390625, "learning_rate": 1.1172580264513435e-05, "loss": 1.2116349935531616, "step": 326 }, { "epoch": 0.6978723404255319, "grad_norm": 1.015625, "learning_rate": 1.1159894280226908e-05, "loss": 1.4247322082519531, "step": 328 }, { "epoch": 0.7021276595744681, "grad_norm": 0.76171875, "learning_rate": 1.114712110298921e-05, "loss": 1.222773551940918, "step": 330 }, { "epoch": 0.7063829787234043, "grad_norm": 0.443359375, "learning_rate": 1.1134261014053054e-05, "loss": 1.2406312227249146, "step": 332 }, { "epoch": 0.7106382978723405, "grad_norm": 0.41015625, "learning_rate": 1.1121314296584864e-05, "loss": 1.1038767099380493, "step": 334 }, { "epoch": 0.7148936170212766, "grad_norm": 0.96875, "learning_rate": 1.1108281235658543e-05, "loss": 1.2219905853271484, "step": 336 }, { "epoch": 0.7191489361702128, "grad_norm": 0.71484375, "learning_rate": 1.1095162118249182e-05, "loss": 1.2996376752853394, "step": 338 }, { "epoch": 0.723404255319149, "grad_norm": 0.69140625, "learning_rate": 1.1081957233226762e-05, "loss": 1.2108495235443115, "step": 340 }, { "epoch": 0.7276595744680852, "grad_norm": 4.40625, "learning_rate": 1.1068666871349777e-05, "loss": 1.1036784648895264, "step": 342 }, { "epoch": 0.7319148936170212, "grad_norm": 0.75, "learning_rate": 1.1055291325258833e-05, "loss": 1.1888855695724487, "step": 344 }, { "epoch": 0.7361702127659574, "grad_norm": 2.484375, "learning_rate": 1.1041830889470211e-05, "loss": 1.2789053916931152, "step": 346 }, { "epoch": 0.7404255319148936, "grad_norm": 0.66015625, "learning_rate": 1.1028285860369379e-05, "loss": 1.2360132932662964, "step": 348 }, { "epoch": 0.7446808510638298, "grad_norm": 0.6875, "learning_rate": 1.1014656536204471e-05, "loss": 1.271801233291626, "step": 350 }, { "epoch": 0.7489361702127659, "grad_norm": 0.333984375, "learning_rate": 1.1000943217079704e-05, "loss": 1.177423119544983, "step": 352 }, { "epoch": 0.7531914893617021, "grad_norm": 0.431640625, "learning_rate": 1.098714620494879e-05, "loss": 1.1379421949386597, "step": 354 }, { "epoch": 0.7574468085106383, "grad_norm": 0.462890625, "learning_rate": 1.0973265803608273e-05, "loss": 1.293025255203247, "step": 356 }, { "epoch": 0.7617021276595745, "grad_norm": 0.34765625, "learning_rate": 1.0959302318690851e-05, "loss": 1.1501177549362183, "step": 358 }, { "epoch": 0.7659574468085106, "grad_norm": 0.36328125, "learning_rate": 1.0945256057658632e-05, "loss": 1.1921217441558838, "step": 360 }, { "epoch": 0.7702127659574468, "grad_norm": 0.6171875, "learning_rate": 1.0931127329796376e-05, "loss": 1.219430923461914, "step": 362 }, { "epoch": 0.774468085106383, "grad_norm": 3.84375, "learning_rate": 1.0916916446204684e-05, "loss": 1.2632174491882324, "step": 364 }, { "epoch": 0.7787234042553192, "grad_norm": 1.2421875, "learning_rate": 1.090262371979314e-05, "loss": 1.1648533344268799, "step": 366 }, { "epoch": 0.7829787234042553, "grad_norm": 0.392578125, "learning_rate": 1.0888249465273429e-05, "loss": 1.1504024267196655, "step": 368 }, { "epoch": 0.7872340425531915, "grad_norm": 0.66796875, "learning_rate": 1.08737939991524e-05, "loss": 1.2344441413879395, "step": 370 }, { "epoch": 0.7914893617021277, "grad_norm": 1.234375, "learning_rate": 1.0859257639725105e-05, "loss": 1.1171855926513672, "step": 372 }, { "epoch": 0.7957446808510639, "grad_norm": 0.42578125, "learning_rate": 1.0844640707067789e-05, "loss": 1.0803868770599365, "step": 374 }, { "epoch": 0.8, "grad_norm": 0.7109375, "learning_rate": 1.0829943523030833e-05, "loss": 1.1519043445587158, "step": 376 }, { "epoch": 0.8042553191489362, "grad_norm": 0.478515625, "learning_rate": 1.0815166411231678e-05, "loss": 1.2066103219985962, "step": 378 }, { "epoch": 0.8085106382978723, "grad_norm": 0.55859375, "learning_rate": 1.0800309697047694e-05, "loss": 1.2266093492507935, "step": 380 }, { "epoch": 0.8127659574468085, "grad_norm": 0.5078125, "learning_rate": 1.0785373707609015e-05, "loss": 1.1117401123046875, "step": 382 }, { "epoch": 0.8170212765957446, "grad_norm": 1.4609375, "learning_rate": 1.0770358771791342e-05, "loss": 1.210506796836853, "step": 384 }, { "epoch": 0.8212765957446808, "grad_norm": 0.5, "learning_rate": 1.0755265220208694e-05, "loss": 1.0881282091140747, "step": 386 }, { "epoch": 0.825531914893617, "grad_norm": 0.380859375, "learning_rate": 1.0740093385206134e-05, "loss": 1.1627310514450073, "step": 388 }, { "epoch": 0.8297872340425532, "grad_norm": 0.60546875, "learning_rate": 1.0724843600852442e-05, "loss": 1.3014237880706787, "step": 390 }, { "epoch": 0.8340425531914893, "grad_norm": 0.388671875, "learning_rate": 1.0709516202932775e-05, "loss": 1.1474575996398926, "step": 392 }, { "epoch": 0.8382978723404255, "grad_norm": 0.578125, "learning_rate": 1.0694111528941255e-05, "loss": 1.0830378532409668, "step": 394 }, { "epoch": 0.8425531914893617, "grad_norm": 0.44140625, "learning_rate": 1.0678629918073552e-05, "loss": 1.3125864267349243, "step": 396 }, { "epoch": 0.8468085106382979, "grad_norm": 0.447265625, "learning_rate": 1.0663071711219407e-05, "loss": 1.2408422231674194, "step": 398 }, { "epoch": 0.851063829787234, "grad_norm": 0.6484375, "learning_rate": 1.0647437250955132e-05, "loss": 1.164583444595337, "step": 400 }, { "epoch": 0.8553191489361702, "grad_norm": 0.82421875, "learning_rate": 1.0631726881536062e-05, "loss": 1.215876579284668, "step": 402 }, { "epoch": 0.8595744680851064, "grad_norm": 0.66015625, "learning_rate": 1.0615940948888973e-05, "loss": 1.1813125610351562, "step": 404 }, { "epoch": 0.8638297872340426, "grad_norm": 0.59765625, "learning_rate": 1.0600079800604474e-05, "loss": 1.2217594385147095, "step": 406 }, { "epoch": 0.8680851063829788, "grad_norm": 2.921875, "learning_rate": 1.0584143785929342e-05, "loss": 1.2609615325927734, "step": 408 }, { "epoch": 0.8723404255319149, "grad_norm": 0.62890625, "learning_rate": 1.0568133255758849e-05, "loss": 1.143092393875122, "step": 410 }, { "epoch": 0.8765957446808511, "grad_norm": 0.62109375, "learning_rate": 1.0552048562629009e-05, "loss": 1.2375463247299194, "step": 412 }, { "epoch": 0.8808510638297873, "grad_norm": 0.74609375, "learning_rate": 1.0535890060708838e-05, "loss": 1.1186902523040771, "step": 414 }, { "epoch": 0.8851063829787233, "grad_norm": 0.439453125, "learning_rate": 1.0519658105792554e-05, "loss": 1.1387929916381836, "step": 416 }, { "epoch": 0.8893617021276595, "grad_norm": 1.421875, "learning_rate": 1.0503353055291729e-05, "loss": 1.181614875793457, "step": 418 }, { "epoch": 0.8936170212765957, "grad_norm": 1.03125, "learning_rate": 1.0486975268227431e-05, "loss": 1.308741807937622, "step": 420 }, { "epoch": 0.8978723404255319, "grad_norm": 1.21875, "learning_rate": 1.0470525105222318e-05, "loss": 1.0869234800338745, "step": 422 }, { "epoch": 0.902127659574468, "grad_norm": 0.443359375, "learning_rate": 1.0454002928492686e-05, "loss": 1.1498181819915771, "step": 424 }, { "epoch": 0.9063829787234042, "grad_norm": 0.6796875, "learning_rate": 1.0437409101840513e-05, "loss": 1.3278398513793945, "step": 426 }, { "epoch": 0.9106382978723404, "grad_norm": 0.44140625, "learning_rate": 1.0420743990645426e-05, "loss": 1.2144547700881958, "step": 428 }, { "epoch": 0.9148936170212766, "grad_norm": 0.486328125, "learning_rate": 1.0404007961856676e-05, "loss": 1.191633701324463, "step": 430 }, { "epoch": 0.9191489361702128, "grad_norm": 0.5078125, "learning_rate": 1.0387201383985043e-05, "loss": 1.2432807683944702, "step": 432 }, { "epoch": 0.9234042553191489, "grad_norm": 0.703125, "learning_rate": 1.0370324627094734e-05, "loss": 1.5649425983428955, "step": 434 }, { "epoch": 0.9276595744680851, "grad_norm": 0.55859375, "learning_rate": 1.0353378062795224e-05, "loss": 1.2039592266082764, "step": 436 }, { "epoch": 0.9319148936170213, "grad_norm": 0.49609375, "learning_rate": 1.033636206423308e-05, "loss": 1.1712656021118164, "step": 438 }, { "epoch": 0.9361702127659575, "grad_norm": 0.75390625, "learning_rate": 1.0319277006083738e-05, "loss": 1.030342936515808, "step": 440 }, { "epoch": 0.9404255319148936, "grad_norm": 0.74609375, "learning_rate": 1.0302123264543267e-05, "loss": 1.1908173561096191, "step": 442 }, { "epoch": 0.9446808510638298, "grad_norm": 1.5234375, "learning_rate": 1.028490121732007e-05, "loss": 1.174695611000061, "step": 444 }, { "epoch": 0.948936170212766, "grad_norm": 1.8203125, "learning_rate": 1.026761124362657e-05, "loss": 1.3273422718048096, "step": 446 }, { "epoch": 0.9531914893617022, "grad_norm": 0.3828125, "learning_rate": 1.0250253724170875e-05, "loss": 1.162235975265503, "step": 448 }, { "epoch": 0.9574468085106383, "grad_norm": 0.53515625, "learning_rate": 1.0232829041148372e-05, "loss": 1.1651887893676758, "step": 450 }, { "epoch": 0.9617021276595744, "grad_norm": 0.48828125, "learning_rate": 1.0215337578233328e-05, "loss": 1.1634246110916138, "step": 452 }, { "epoch": 0.9659574468085106, "grad_norm": 0.5625, "learning_rate": 1.019777972057044e-05, "loss": 1.0295268297195435, "step": 454 }, { "epoch": 0.9702127659574468, "grad_norm": 0.41796875, "learning_rate": 1.0180155854766348e-05, "loss": 1.178024411201477, "step": 456 }, { "epoch": 0.9744680851063829, "grad_norm": 0.62109375, "learning_rate": 1.0162466368881124e-05, "loss": 1.2120832204818726, "step": 458 }, { "epoch": 0.9787234042553191, "grad_norm": 0.50390625, "learning_rate": 1.0144711652419738e-05, "loss": 1.1555849313735962, "step": 460 }, { "epoch": 0.9829787234042553, "grad_norm": 4.0625, "learning_rate": 1.0126892096323463e-05, "loss": 1.2941299676895142, "step": 462 }, { "epoch": 0.9872340425531915, "grad_norm": 2.34375, "learning_rate": 1.0109008092961276e-05, "loss": 1.0498948097229004, "step": 464 }, { "epoch": 0.9914893617021276, "grad_norm": 0.71875, "learning_rate": 1.0091060036121233e-05, "loss": 1.2505208253860474, "step": 466 }, { "epoch": 0.9957446808510638, "grad_norm": 0.51953125, "learning_rate": 1.0073048321001766e-05, "loss": 1.1784660816192627, "step": 468 }, { "epoch": 1.0, "grad_norm": 0.6875, "learning_rate": 1.0054973344203011e-05, "loss": 1.2162238359451294, "step": 470 }, { "epoch": 1.004255319148936, "grad_norm": 0.33203125, "learning_rate": 1.003683550371806e-05, "loss": 0.902032196521759, "step": 472 }, { "epoch": 1.0085106382978724, "grad_norm": 0.84375, "learning_rate": 1.00186351989242e-05, "loss": 0.6829485893249512, "step": 474 }, { "epoch": 1.0127659574468084, "grad_norm": 0.380859375, "learning_rate": 1.0000372830574128e-05, "loss": 0.9958571195602417, "step": 476 }, { "epoch": 1.0170212765957447, "grad_norm": 0.341796875, "learning_rate": 9.982048800787103e-06, "loss": 0.8577584624290466, "step": 478 }, { "epoch": 1.0212765957446808, "grad_norm": 0.443359375, "learning_rate": 9.96366351304012e-06, "loss": 0.7623387575149536, "step": 480 }, { "epoch": 1.025531914893617, "grad_norm": 0.443359375, "learning_rate": 9.945217372159019e-06, "loss": 0.6408636569976807, "step": 482 }, { "epoch": 1.0297872340425531, "grad_norm": 0.40625, "learning_rate": 9.926710784309548e-06, "loss": 0.8527731895446777, "step": 484 }, { "epoch": 1.0340425531914894, "grad_norm": 0.71875, "learning_rate": 9.908144156988452e-06, "loss": 1.0902431011199951, "step": 486 }, { "epoch": 1.0382978723404255, "grad_norm": 0.5703125, "learning_rate": 9.88951789901448e-06, "loss": 0.9952311515808105, "step": 488 }, { "epoch": 1.0425531914893618, "grad_norm": 0.466796875, "learning_rate": 9.87083242051939e-06, "loss": 1.0575801134109497, "step": 490 }, { "epoch": 1.0468085106382978, "grad_norm": 0.8515625, "learning_rate": 9.852088132938916e-06, "loss": 0.8896694779396057, "step": 492 }, { "epoch": 1.0510638297872341, "grad_norm": 0.61328125, "learning_rate": 9.833285449003712e-06, "loss": 0.8272213935852051, "step": 494 }, { "epoch": 1.0553191489361702, "grad_norm": 0.490234375, "learning_rate": 9.814424782730261e-06, "loss": 0.897000789642334, "step": 496 }, { "epoch": 1.0595744680851065, "grad_norm": 0.6015625, "learning_rate": 9.79550654941176e-06, "loss": 0.7115342020988464, "step": 498 }, { "epoch": 1.0638297872340425, "grad_norm": 0.458984375, "learning_rate": 9.776531165608975e-06, "loss": 0.7840989232063293, "step": 500 }, { "epoch": 1.0680851063829788, "grad_norm": 0.400390625, "learning_rate": 9.757499049141065e-06, "loss": 0.8686625361442566, "step": 502 }, { "epoch": 1.0723404255319149, "grad_norm": 0.455078125, "learning_rate": 9.738410619076393e-06, "loss": 0.5279070138931274, "step": 504 }, { "epoch": 1.076595744680851, "grad_norm": 1.2421875, "learning_rate": 9.71926629572329e-06, "loss": 0.7969399094581604, "step": 506 }, { "epoch": 1.0808510638297872, "grad_norm": 1.3828125, "learning_rate": 9.7000665006208e-06, "loss": 0.9214133024215698, "step": 508 }, { "epoch": 1.0851063829787233, "grad_norm": 0.88671875, "learning_rate": 9.680811656529397e-06, "loss": 0.8827441930770874, "step": 510 }, { "epoch": 1.0893617021276596, "grad_norm": 1.1328125, "learning_rate": 9.661502187421687e-06, "loss": 0.7750219702720642, "step": 512 }, { "epoch": 1.0936170212765957, "grad_norm": 0.384765625, "learning_rate": 9.64213851847306e-06, "loss": 0.7688886523246765, "step": 514 }, { "epoch": 1.097872340425532, "grad_norm": 0.64453125, "learning_rate": 9.62272107605233e-06, "loss": 0.9912289977073669, "step": 516 }, { "epoch": 1.102127659574468, "grad_norm": 0.9765625, "learning_rate": 9.603250287712357e-06, "loss": 0.8116132020950317, "step": 518 }, { "epoch": 1.1063829787234043, "grad_norm": 0.6171875, "learning_rate": 9.583726582180619e-06, "loss": 0.5431628227233887, "step": 520 }, { "epoch": 1.1106382978723404, "grad_norm": 0.82421875, "learning_rate": 9.564150389349784e-06, "loss": 0.7063818573951721, "step": 522 }, { "epoch": 1.1148936170212767, "grad_norm": 0.46484375, "learning_rate": 9.544522140268226e-06, "loss": 0.8259474635124207, "step": 524 }, { "epoch": 1.1191489361702127, "grad_norm": 0.65625, "learning_rate": 9.524842267130567e-06, "loss": 0.8532420992851257, "step": 526 }, { "epoch": 1.123404255319149, "grad_norm": 0.65625, "learning_rate": 9.505111203268119e-06, "loss": 0.7610599398612976, "step": 528 }, { "epoch": 1.127659574468085, "grad_norm": 0.59375, "learning_rate": 9.48532938313937e-06, "loss": 0.8436508178710938, "step": 530 }, { "epoch": 1.1319148936170214, "grad_norm": 0.353515625, "learning_rate": 9.465497242320423e-06, "loss": 1.2464487552642822, "step": 532 }, { "epoch": 1.1361702127659574, "grad_norm": 0.60546875, "learning_rate": 9.445615217495373e-06, "loss": 0.7736493945121765, "step": 534 }, { "epoch": 1.1404255319148937, "grad_norm": 0.5546875, "learning_rate": 9.42568374644672e-06, "loss": 0.9255214333534241, "step": 536 }, { "epoch": 1.1446808510638298, "grad_norm": 0.52734375, "learning_rate": 9.40570326804573e-06, "loss": 0.7744427919387817, "step": 538 }, { "epoch": 1.148936170212766, "grad_norm": 0.251953125, "learning_rate": 9.385674222242742e-06, "loss": 0.6865782737731934, "step": 540 }, { "epoch": 1.1531914893617021, "grad_norm": 6.25, "learning_rate": 9.365597050057524e-06, "loss": 0.8758373260498047, "step": 542 }, { "epoch": 1.1574468085106382, "grad_norm": 0.48046875, "learning_rate": 9.345472193569518e-06, "loss": 0.8117732405662537, "step": 544 }, { "epoch": 1.1617021276595745, "grad_norm": 0.44140625, "learning_rate": 9.325300095908145e-06, "loss": 0.9483519196510315, "step": 546 }, { "epoch": 1.1659574468085105, "grad_norm": 0.4921875, "learning_rate": 9.305081201243022e-06, "loss": 0.660556972026825, "step": 548 }, { "epoch": 1.1702127659574468, "grad_norm": 0.828125, "learning_rate": 9.284815954774185e-06, "loss": 0.7756091952323914, "step": 550 }, { "epoch": 1.174468085106383, "grad_norm": 0.62109375, "learning_rate": 9.264504802722297e-06, "loss": 0.8955855369567871, "step": 552 }, { "epoch": 1.1787234042553192, "grad_norm": 0.58203125, "learning_rate": 9.244148192318819e-06, "loss": 0.8398646712303162, "step": 554 }, { "epoch": 1.1829787234042553, "grad_norm": 0.97265625, "learning_rate": 9.223746571796152e-06, "loss": 0.8468598127365112, "step": 556 }, { "epoch": 1.1872340425531915, "grad_norm": 0.490234375, "learning_rate": 9.203300390377784e-06, "loss": 0.6725097298622131, "step": 558 }, { "epoch": 1.1914893617021276, "grad_norm": 0.59765625, "learning_rate": 9.182810098268377e-06, "loss": 0.7907771468162537, "step": 560 }, { "epoch": 1.195744680851064, "grad_norm": 0.453125, "learning_rate": 9.162276146643881e-06, "loss": 0.8897430896759033, "step": 562 }, { "epoch": 1.2, "grad_norm": 1.515625, "learning_rate": 9.141698987641577e-06, "loss": 0.9244027137756348, "step": 564 }, { "epoch": 1.2042553191489362, "grad_norm": 1.2734375, "learning_rate": 9.121079074350135e-06, "loss": 0.8451488614082336, "step": 566 }, { "epoch": 1.2085106382978723, "grad_norm": 1.6015625, "learning_rate": 9.100416860799625e-06, "loss": 0.9149748682975769, "step": 568 }, { "epoch": 1.2127659574468086, "grad_norm": 0.443359375, "learning_rate": 9.079712801951533e-06, "loss": 0.8140401244163513, "step": 570 }, { "epoch": 1.2170212765957447, "grad_norm": 0.640625, "learning_rate": 9.058967353688733e-06, "loss": 0.8866817355155945, "step": 572 }, { "epoch": 1.2212765957446807, "grad_norm": 0.70703125, "learning_rate": 9.038180972805454e-06, "loss": 0.8173488974571228, "step": 574 }, { "epoch": 1.225531914893617, "grad_norm": 0.55859375, "learning_rate": 9.017354116997226e-06, "loss": 0.7841181755065918, "step": 576 }, { "epoch": 1.2297872340425533, "grad_norm": 0.48046875, "learning_rate": 8.99648724485079e-06, "loss": 0.5890490412712097, "step": 578 }, { "epoch": 1.2340425531914894, "grad_norm": 0.578125, "learning_rate": 8.975580815834008e-06, "loss": 0.5997076034545898, "step": 580 }, { "epoch": 1.2382978723404254, "grad_norm": 1.1328125, "learning_rate": 8.954635290285748e-06, "loss": 0.6937717199325562, "step": 582 }, { "epoch": 1.2425531914893617, "grad_norm": 0.73046875, "learning_rate": 8.933651129405741e-06, "loss": 0.7356208562850952, "step": 584 }, { "epoch": 1.2468085106382978, "grad_norm": 0.875, "learning_rate": 8.912628795244435e-06, "loss": 0.8549614548683167, "step": 586 }, { "epoch": 1.251063829787234, "grad_norm": 0.40234375, "learning_rate": 8.891568750692811e-06, "loss": 0.645767092704773, "step": 588 }, { "epoch": 1.2553191489361701, "grad_norm": 0.61328125, "learning_rate": 8.870471459472202e-06, "loss": 0.9579916596412659, "step": 590 }, { "epoch": 1.2595744680851064, "grad_norm": 0.498046875, "learning_rate": 8.849337386124065e-06, "loss": 0.6670525670051575, "step": 592 }, { "epoch": 1.2638297872340425, "grad_norm": 0.5234375, "learning_rate": 8.828166995999771e-06, "loss": 0.9148899912834167, "step": 594 }, { "epoch": 1.2680851063829788, "grad_norm": 0.6015625, "learning_rate": 8.806960755250352e-06, "loss": 0.9241386651992798, "step": 596 }, { "epoch": 1.2723404255319148, "grad_norm": 0.8046875, "learning_rate": 8.785719130816227e-06, "loss": 0.8401479721069336, "step": 598 }, { "epoch": 1.2765957446808511, "grad_norm": 0.59765625, "learning_rate": 8.76444259041694e-06, "loss": 0.9863938689231873, "step": 600 }, { "epoch": 1.2808510638297872, "grad_norm": 0.58984375, "learning_rate": 8.743131602540837e-06, "loss": 0.9384634494781494, "step": 602 }, { "epoch": 1.2851063829787235, "grad_norm": 0.6484375, "learning_rate": 8.721786636434773e-06, "loss": 0.7852924466133118, "step": 604 }, { "epoch": 1.2893617021276595, "grad_norm": 0.490234375, "learning_rate": 8.70040816209377e-06, "loss": 0.9877030849456787, "step": 606 }, { "epoch": 1.2936170212765958, "grad_norm": 0.494140625, "learning_rate": 8.67899665025066e-06, "loss": 0.7262607216835022, "step": 608 }, { "epoch": 1.297872340425532, "grad_norm": 1.578125, "learning_rate": 8.657552572365738e-06, "loss": 1.0153322219848633, "step": 610 }, { "epoch": 1.302127659574468, "grad_norm": 0.34375, "learning_rate": 8.636076400616361e-06, "loss": 0.8889206051826477, "step": 612 }, { "epoch": 1.3063829787234043, "grad_norm": 0.40234375, "learning_rate": 8.614568607886572e-06, "loss": 1.0539144277572632, "step": 614 }, { "epoch": 1.3106382978723405, "grad_norm": 0.392578125, "learning_rate": 8.593029667756665e-06, "loss": 0.9332261085510254, "step": 616 }, { "epoch": 1.3148936170212766, "grad_norm": 0.4609375, "learning_rate": 8.57146005449278e-06, "loss": 0.7537972331047058, "step": 618 }, { "epoch": 1.3191489361702127, "grad_norm": 0.361328125, "learning_rate": 8.549860243036443e-06, "loss": 0.8345380425453186, "step": 620 }, { "epoch": 1.323404255319149, "grad_norm": 0.70703125, "learning_rate": 8.528230708994113e-06, "loss": 0.8078710436820984, "step": 622 }, { "epoch": 1.327659574468085, "grad_norm": 1.5390625, "learning_rate": 8.506571928626716e-06, "loss": 0.6944683790206909, "step": 624 }, { "epoch": 1.3319148936170213, "grad_norm": 0.333984375, "learning_rate": 8.484884378839148e-06, "loss": 0.8724764585494995, "step": 626 }, { "epoch": 1.3361702127659574, "grad_norm": 0.7109375, "learning_rate": 8.463168537169782e-06, "loss": 0.9229905009269714, "step": 628 }, { "epoch": 1.3404255319148937, "grad_norm": 0.466796875, "learning_rate": 8.44142488177995e-06, "loss": 0.8973690271377563, "step": 630 }, { "epoch": 1.3446808510638297, "grad_norm": 0.5390625, "learning_rate": 8.419653891443415e-06, "loss": 0.8710704445838928, "step": 632 }, { "epoch": 1.348936170212766, "grad_norm": 1.890625, "learning_rate": 8.397856045535826e-06, "loss": 0.9143708348274231, "step": 634 }, { "epoch": 1.353191489361702, "grad_norm": 0.40234375, "learning_rate": 8.37603182402417e-06, "loss": 0.7919833660125732, "step": 636 }, { "epoch": 1.3574468085106384, "grad_norm": 0.341796875, "learning_rate": 8.354181707456192e-06, "loss": 0.7822130918502808, "step": 638 }, { "epoch": 1.3617021276595744, "grad_norm": 0.40625, "learning_rate": 8.332306176949824e-06, "loss": 0.635791003704071, "step": 640 }, { "epoch": 1.3659574468085105, "grad_norm": 0.341796875, "learning_rate": 8.310405714182593e-06, "loss": 0.765158474445343, "step": 642 }, { "epoch": 1.3702127659574468, "grad_norm": 0.56640625, "learning_rate": 8.288480801380998e-06, "loss": 0.526314914226532, "step": 644 }, { "epoch": 1.374468085106383, "grad_norm": 0.392578125, "learning_rate": 8.266531921309911e-06, "loss": 0.8815028071403503, "step": 646 }, { "epoch": 1.3787234042553191, "grad_norm": 0.59765625, "learning_rate": 8.244559557261944e-06, "loss": 0.8624444007873535, "step": 648 }, { "epoch": 1.3829787234042552, "grad_norm": 0.69921875, "learning_rate": 8.22256419304679e-06, "loss": 1.1067816019058228, "step": 650 }, { "epoch": 1.3872340425531915, "grad_norm": 0.408203125, "learning_rate": 8.200546312980595e-06, "loss": 0.8086753487586975, "step": 652 }, { "epoch": 1.3914893617021278, "grad_norm": 0.412109375, "learning_rate": 8.17850640187528e-06, "loss": 0.8894110321998596, "step": 654 }, { "epoch": 1.3957446808510638, "grad_norm": 0.38671875, "learning_rate": 8.156444945027855e-06, "loss": 0.9589279294013977, "step": 656 }, { "epoch": 1.4, "grad_norm": 0.82421875, "learning_rate": 8.134362428209765e-06, "loss": 0.8438636064529419, "step": 658 }, { "epoch": 1.4042553191489362, "grad_norm": 0.412109375, "learning_rate": 8.11225933765616e-06, "loss": 0.7788761258125305, "step": 660 }, { "epoch": 1.4085106382978723, "grad_norm": 0.361328125, "learning_rate": 8.090136160055213e-06, "loss": 0.8602153658866882, "step": 662 }, { "epoch": 1.4127659574468086, "grad_norm": 0.64453125, "learning_rate": 8.067993382537386e-06, "loss": 1.1651355028152466, "step": 664 }, { "epoch": 1.4170212765957446, "grad_norm": 0.376953125, "learning_rate": 8.045831492664716e-06, "loss": 0.8709754347801208, "step": 666 }, { "epoch": 1.421276595744681, "grad_norm": 0.36328125, "learning_rate": 8.023650978420076e-06, "loss": 0.8617551922798157, "step": 668 }, { "epoch": 1.425531914893617, "grad_norm": 0.322265625, "learning_rate": 8.001452328196425e-06, "loss": 0.7164908647537231, "step": 670 }, { "epoch": 1.4297872340425533, "grad_norm": 0.68359375, "learning_rate": 7.979236030786065e-06, "loss": 0.874544084072113, "step": 672 }, { "epoch": 1.4340425531914893, "grad_norm": 0.357421875, "learning_rate": 7.957002575369866e-06, "loss": 0.8772100806236267, "step": 674 }, { "epoch": 1.4382978723404256, "grad_norm": 0.82421875, "learning_rate": 7.934752451506499e-06, "loss": 0.8531442880630493, "step": 676 }, { "epoch": 1.4425531914893617, "grad_norm": 0.703125, "learning_rate": 7.912486149121662e-06, "loss": 0.8926745653152466, "step": 678 }, { "epoch": 1.4468085106382977, "grad_norm": 0.302734375, "learning_rate": 7.89020415849729e-06, "loss": 0.8355059623718262, "step": 680 }, { "epoch": 1.451063829787234, "grad_norm": 0.5234375, "learning_rate": 7.867906970260748e-06, "loss": 0.7553901076316833, "step": 682 }, { "epoch": 1.4553191489361703, "grad_norm": 0.427734375, "learning_rate": 7.845595075374053e-06, "loss": 0.7148939967155457, "step": 684 }, { "epoch": 1.4595744680851064, "grad_norm": 0.8203125, "learning_rate": 7.823268965123027e-06, "loss": 0.7749176621437073, "step": 686 }, { "epoch": 1.4638297872340424, "grad_norm": 1.6796875, "learning_rate": 7.800929131106519e-06, "loss": 1.0506820678710938, "step": 688 }, { "epoch": 1.4680851063829787, "grad_norm": 0.67578125, "learning_rate": 7.77857606522555e-06, "loss": 0.5485996603965759, "step": 690 }, { "epoch": 1.472340425531915, "grad_norm": 0.416015625, "learning_rate": 7.756210259672503e-06, "loss": 0.8781046271324158, "step": 692 }, { "epoch": 1.476595744680851, "grad_norm": 0.435546875, "learning_rate": 7.733832206920267e-06, "loss": 0.8102371692657471, "step": 694 }, { "epoch": 1.4808510638297872, "grad_norm": 0.4296875, "learning_rate": 7.711442399711406e-06, "loss": 0.8387575149536133, "step": 696 }, { "epoch": 1.4851063829787234, "grad_norm": 0.357421875, "learning_rate": 7.689041331047307e-06, "loss": 0.7191005945205688, "step": 698 }, { "epoch": 1.4893617021276595, "grad_norm": 0.77734375, "learning_rate": 7.66662949417732e-06, "loss": 0.560632586479187, "step": 700 }, { "epoch": 1.4936170212765958, "grad_norm": 0.8515625, "learning_rate": 7.644207382587906e-06, "loss": 0.8454610705375671, "step": 702 }, { "epoch": 1.4978723404255319, "grad_norm": 0.455078125, "learning_rate": 7.621775489991757e-06, "loss": 0.5917819738388062, "step": 704 }, { "epoch": 1.5021276595744681, "grad_norm": 0.69140625, "learning_rate": 7.599334310316937e-06, "loss": 0.8950475454330444, "step": 706 }, { "epoch": 1.5063829787234042, "grad_norm": 0.83203125, "learning_rate": 7.576884337696004e-06, "loss": 0.9987728595733643, "step": 708 }, { "epoch": 1.5106382978723403, "grad_norm": 0.5, "learning_rate": 7.554426066455125e-06, "loss": 0.8234822154045105, "step": 710 }, { "epoch": 1.5148936170212766, "grad_norm": 1.15625, "learning_rate": 7.5319599911031986e-06, "loss": 0.948941707611084, "step": 712 }, { "epoch": 1.5191489361702128, "grad_norm": 0.95703125, "learning_rate": 7.509486606320955e-06, "loss": 0.8466644883155823, "step": 714 }, { "epoch": 1.523404255319149, "grad_norm": 2.265625, "learning_rate": 7.487006406950077e-06, "loss": 0.7706676721572876, "step": 716 }, { "epoch": 1.527659574468085, "grad_norm": 0.42578125, "learning_rate": 7.464519887982301e-06, "loss": 0.8639274835586548, "step": 718 }, { "epoch": 1.5319148936170213, "grad_norm": 0.28515625, "learning_rate": 7.442027544548502e-06, "loss": 0.8100276589393616, "step": 720 }, { "epoch": 1.5361702127659576, "grad_norm": 0.71484375, "learning_rate": 7.419529871907815e-06, "loss": 0.8926405310630798, "step": 722 }, { "epoch": 1.5404255319148936, "grad_norm": 0.341796875, "learning_rate": 7.397027365436715e-06, "loss": 0.8414310216903687, "step": 724 }, { "epoch": 1.5446808510638297, "grad_norm": 1.1953125, "learning_rate": 7.374520520618113e-06, "loss": 0.8629379868507385, "step": 726 }, { "epoch": 1.548936170212766, "grad_norm": 0.37890625, "learning_rate": 7.352009833030451e-06, "loss": 0.8124608397483826, "step": 728 }, { "epoch": 1.5531914893617023, "grad_norm": 0.361328125, "learning_rate": 7.329495798336777e-06, "loss": 1.0221534967422485, "step": 730 }, { "epoch": 1.5574468085106383, "grad_norm": 0.369140625, "learning_rate": 7.306978912273843e-06, "loss": 0.6406850218772888, "step": 732 }, { "epoch": 1.5617021276595744, "grad_norm": 2.296875, "learning_rate": 7.284459670641185e-06, "loss": 0.6190369129180908, "step": 734 }, { "epoch": 1.5659574468085107, "grad_norm": 0.384765625, "learning_rate": 7.261938569290206e-06, "loss": 0.8675222396850586, "step": 736 }, { "epoch": 1.570212765957447, "grad_norm": 0.609375, "learning_rate": 7.239416104113262e-06, "loss": 0.8379670977592468, "step": 738 }, { "epoch": 1.574468085106383, "grad_norm": 0.376953125, "learning_rate": 7.216892771032732e-06, "loss": 0.7264598608016968, "step": 740 }, { "epoch": 1.578723404255319, "grad_norm": 0.54296875, "learning_rate": 7.1943690659901095e-06, "loss": 0.8947696685791016, "step": 742 }, { "epoch": 1.5829787234042554, "grad_norm": 0.7109375, "learning_rate": 7.17184548493508e-06, "loss": 0.7789361476898193, "step": 744 }, { "epoch": 1.5872340425531914, "grad_norm": 0.400390625, "learning_rate": 7.149322523814594e-06, "loss": 0.8117201328277588, "step": 746 }, { "epoch": 1.5914893617021275, "grad_norm": 0.455078125, "learning_rate": 7.1268006785619575e-06, "loss": 0.7403523921966553, "step": 748 }, { "epoch": 1.5957446808510638, "grad_norm": 0.58984375, "learning_rate": 7.104280445085897e-06, "loss": 0.8037891387939453, "step": 750 }, { "epoch": 1.6, "grad_norm": 0.435546875, "learning_rate": 7.081762319259662e-06, "loss": 0.8160814642906189, "step": 752 }, { "epoch": 1.6042553191489362, "grad_norm": 0.326171875, "learning_rate": 7.0592467969100836e-06, "loss": 0.7555669546127319, "step": 754 }, { "epoch": 1.6085106382978722, "grad_norm": 0.78515625, "learning_rate": 7.036734373806672e-06, "loss": 0.8494399785995483, "step": 756 }, { "epoch": 1.6127659574468085, "grad_norm": 0.41015625, "learning_rate": 7.01422554565069e-06, "loss": 1.0269806385040283, "step": 758 }, { "epoch": 1.6170212765957448, "grad_norm": 0.466796875, "learning_rate": 6.991720808064251e-06, "loss": 0.9812240600585938, "step": 760 }, { "epoch": 1.6212765957446809, "grad_norm": 1.671875, "learning_rate": 6.969220656579391e-06, "loss": 0.8393826484680176, "step": 762 }, { "epoch": 1.625531914893617, "grad_norm": 0.486328125, "learning_rate": 6.946725586627165e-06, "loss": 0.9660863876342773, "step": 764 }, { "epoch": 1.6297872340425532, "grad_norm": 0.310546875, "learning_rate": 6.924236093526747e-06, "loss": 1.0426111221313477, "step": 766 }, { "epoch": 1.6340425531914895, "grad_norm": 0.470703125, "learning_rate": 6.901752672474499e-06, "loss": 0.6731575727462769, "step": 768 }, { "epoch": 1.6382978723404256, "grad_norm": 0.458984375, "learning_rate": 6.879275818533095e-06, "loss": 0.9503965377807617, "step": 770 }, { "epoch": 1.6425531914893616, "grad_norm": 0.486328125, "learning_rate": 6.8568060266206056e-06, "loss": 1.0612298250198364, "step": 772 }, { "epoch": 1.646808510638298, "grad_norm": 0.6875, "learning_rate": 6.834343791499595e-06, "loss": 0.7399391531944275, "step": 774 }, { "epoch": 1.6510638297872342, "grad_norm": 0.64453125, "learning_rate": 6.811889607766242e-06, "loss": 0.6109141707420349, "step": 776 }, { "epoch": 1.65531914893617, "grad_norm": 0.3515625, "learning_rate": 6.789443969839441e-06, "loss": 0.8604304790496826, "step": 778 }, { "epoch": 1.6595744680851063, "grad_norm": 0.59375, "learning_rate": 6.767007371949911e-06, "loss": 0.864715576171875, "step": 780 }, { "epoch": 1.6638297872340426, "grad_norm": 0.3984375, "learning_rate": 6.744580308129327e-06, "loss": 0.8427615165710449, "step": 782 }, { "epoch": 1.6680851063829787, "grad_norm": 0.53125, "learning_rate": 6.722163272199424e-06, "loss": 0.9220309853553772, "step": 784 }, { "epoch": 1.6723404255319148, "grad_norm": 1.046875, "learning_rate": 6.69975675776114e-06, "loss": 0.8783171772956848, "step": 786 }, { "epoch": 1.676595744680851, "grad_norm": 0.30078125, "learning_rate": 6.677361258183735e-06, "loss": 0.6494432687759399, "step": 788 }, { "epoch": 1.6808510638297873, "grad_norm": 0.462890625, "learning_rate": 6.6549772665939346e-06, "loss": 0.8931559920310974, "step": 790 }, { "epoch": 1.6851063829787234, "grad_norm": 0.6796875, "learning_rate": 6.632605275865074e-06, "loss": 0.7723158597946167, "step": 792 }, { "epoch": 1.6893617021276595, "grad_norm": 0.474609375, "learning_rate": 6.610245778606232e-06, "loss": 0.9853664636611938, "step": 794 }, { "epoch": 1.6936170212765957, "grad_norm": 0.54296875, "learning_rate": 6.587899267151401e-06, "loss": 0.7868849635124207, "step": 796 }, { "epoch": 1.697872340425532, "grad_norm": 0.51953125, "learning_rate": 6.56556623354864e-06, "loss": 0.852700412273407, "step": 798 }, { "epoch": 1.702127659574468, "grad_norm": 0.443359375, "learning_rate": 6.543247169549232e-06, "loss": 0.8994773626327515, "step": 800 }, { "epoch": 1.7063829787234042, "grad_norm": 0.34765625, "learning_rate": 6.520942566596868e-06, "loss": 0.8999802470207214, "step": 802 }, { "epoch": 1.7106382978723405, "grad_norm": 0.5, "learning_rate": 6.4986529158168215e-06, "loss": 0.7869191765785217, "step": 804 }, { "epoch": 1.7148936170212767, "grad_norm": 0.64453125, "learning_rate": 6.476378708005135e-06, "loss": 0.8270288705825806, "step": 806 }, { "epoch": 1.7191489361702128, "grad_norm": 0.75, "learning_rate": 6.454120433617804e-06, "loss": 0.9229409694671631, "step": 808 }, { "epoch": 1.7234042553191489, "grad_norm": 0.59765625, "learning_rate": 6.431878582759994e-06, "loss": 0.7548995614051819, "step": 810 }, { "epoch": 1.7276595744680852, "grad_norm": 0.357421875, "learning_rate": 6.409653645175241e-06, "loss": 0.8321532607078552, "step": 812 }, { "epoch": 1.7319148936170212, "grad_norm": 0.62109375, "learning_rate": 6.387446110234658e-06, "loss": 0.6601775288581848, "step": 814 }, { "epoch": 1.7361702127659573, "grad_norm": 0.34765625, "learning_rate": 6.365256466926183e-06, "loss": 0.8633728623390198, "step": 816 }, { "epoch": 1.7404255319148936, "grad_norm": 0.447265625, "learning_rate": 6.343085203843786e-06, "loss": 0.9041755199432373, "step": 818 }, { "epoch": 1.7446808510638299, "grad_norm": 0.53515625, "learning_rate": 6.32093280917673e-06, "loss": 0.8834015727043152, "step": 820 }, { "epoch": 1.748936170212766, "grad_norm": 0.466796875, "learning_rate": 6.29879977069881e-06, "loss": 0.7971745133399963, "step": 822 }, { "epoch": 1.753191489361702, "grad_norm": 0.359375, "learning_rate": 6.2766865757576164e-06, "loss": 0.8187481164932251, "step": 824 }, { "epoch": 1.7574468085106383, "grad_norm": 0.58203125, "learning_rate": 6.254593711263813e-06, "loss": 0.8846163153648376, "step": 826 }, { "epoch": 1.7617021276595746, "grad_norm": 0.875, "learning_rate": 6.232521663680393e-06, "loss": 0.9830833077430725, "step": 828 }, { "epoch": 1.7659574468085106, "grad_norm": 0.953125, "learning_rate": 6.210470919011992e-06, "loss": 0.7482036352157593, "step": 830 }, { "epoch": 1.7702127659574467, "grad_norm": 0.50390625, "learning_rate": 6.188441962794179e-06, "loss": 0.8920266628265381, "step": 832 }, { "epoch": 1.774468085106383, "grad_norm": 0.3359375, "learning_rate": 6.166435280082749e-06, "loss": 0.8772265315055847, "step": 834 }, { "epoch": 1.7787234042553193, "grad_norm": 0.435546875, "learning_rate": 6.1444513554430745e-06, "loss": 0.8204891681671143, "step": 836 }, { "epoch": 1.7829787234042553, "grad_norm": 0.4765625, "learning_rate": 6.122490672939405e-06, "loss": 0.5873453617095947, "step": 838 }, { "epoch": 1.7872340425531914, "grad_norm": 0.44921875, "learning_rate": 6.100553716124224e-06, "loss": 0.8039622902870178, "step": 840 }, { "epoch": 1.7914893617021277, "grad_norm": 1.15625, "learning_rate": 6.078640968027598e-06, "loss": 0.6872312426567078, "step": 842 }, { "epoch": 1.795744680851064, "grad_norm": 0.78125, "learning_rate": 6.056752911146548e-06, "loss": 0.8578442931175232, "step": 844 }, { "epoch": 1.8, "grad_norm": 0.4921875, "learning_rate": 6.034890027434413e-06, "loss": 0.7564026117324829, "step": 846 }, { "epoch": 1.804255319148936, "grad_norm": 0.78515625, "learning_rate": 6.013052798290241e-06, "loss": 0.8832213878631592, "step": 848 }, { "epoch": 1.8085106382978724, "grad_norm": 0.62109375, "learning_rate": 5.9912417045482e-06, "loss": 0.8571723699569702, "step": 850 }, { "epoch": 1.8127659574468085, "grad_norm": 0.60546875, "learning_rate": 5.969457226466977e-06, "loss": 0.824770450592041, "step": 852 }, { "epoch": 1.8170212765957445, "grad_norm": 0.38671875, "learning_rate": 5.9476998437192066e-06, "loss": 0.8723496794700623, "step": 854 }, { "epoch": 1.8212765957446808, "grad_norm": 0.388671875, "learning_rate": 5.925970035380918e-06, "loss": 0.7535234093666077, "step": 856 }, { "epoch": 1.825531914893617, "grad_norm": 0.40625, "learning_rate": 5.904268279920973e-06, "loss": 0.9033308625221252, "step": 858 }, { "epoch": 1.8297872340425532, "grad_norm": 0.345703125, "learning_rate": 5.88259505519054e-06, "loss": 0.670329749584198, "step": 860 }, { "epoch": 1.8340425531914892, "grad_norm": 0.43359375, "learning_rate": 5.860950838412565e-06, "loss": 0.8669137358665466, "step": 862 }, { "epoch": 1.8382978723404255, "grad_norm": 0.357421875, "learning_rate": 5.839336106171274e-06, "loss": 0.8537063598632812, "step": 864 }, { "epoch": 1.8425531914893618, "grad_norm": 0.6640625, "learning_rate": 5.81775133440167e-06, "loss": 0.8618923425674438, "step": 866 }, { "epoch": 1.8468085106382979, "grad_norm": 0.400390625, "learning_rate": 5.79619699837905e-06, "loss": 0.7936420440673828, "step": 868 }, { "epoch": 1.851063829787234, "grad_norm": 0.46484375, "learning_rate": 5.774673572708554e-06, "loss": 0.7838106155395508, "step": 870 }, { "epoch": 1.8553191489361702, "grad_norm": 0.455078125, "learning_rate": 5.753181531314708e-06, "loss": 0.8583153486251831, "step": 872 }, { "epoch": 1.8595744680851065, "grad_norm": 0.43359375, "learning_rate": 5.7317213474309764e-06, "loss": 0.9282540678977966, "step": 874 }, { "epoch": 1.8638297872340426, "grad_norm": 0.75, "learning_rate": 5.710293493589363e-06, "loss": 0.6059424877166748, "step": 876 }, { "epoch": 1.8680851063829786, "grad_norm": 3.71875, "learning_rate": 5.688898441609994e-06, "loss": 0.9776955842971802, "step": 878 }, { "epoch": 1.872340425531915, "grad_norm": 0.72265625, "learning_rate": 5.6675366625907264e-06, "loss": 0.900459885597229, "step": 880 }, { "epoch": 1.8765957446808512, "grad_norm": 0.458984375, "learning_rate": 5.646208626896784e-06, "loss": 0.758176326751709, "step": 882 }, { "epoch": 1.8808510638297873, "grad_norm": 0.41796875, "learning_rate": 5.624914804150397e-06, "loss": 0.8674149513244629, "step": 884 }, { "epoch": 1.8851063829787233, "grad_norm": 0.6796875, "learning_rate": 5.6036556632204564e-06, "loss": 0.778677761554718, "step": 886 }, { "epoch": 1.8893617021276596, "grad_norm": 0.63671875, "learning_rate": 5.582431672212195e-06, "loss": 0.8965961933135986, "step": 888 }, { "epoch": 1.8936170212765957, "grad_norm": 0.408203125, "learning_rate": 5.5612432984568815e-06, "loss": 0.5581719279289246, "step": 890 }, { "epoch": 1.8978723404255318, "grad_norm": 1.609375, "learning_rate": 5.5400910085015275e-06, "loss": 0.8819167017936707, "step": 892 }, { "epoch": 1.902127659574468, "grad_norm": 0.490234375, "learning_rate": 5.518975268098611e-06, "loss": 0.9992945194244385, "step": 894 }, { "epoch": 1.9063829787234043, "grad_norm": 0.330078125, "learning_rate": 5.497896542195829e-06, "loss": 0.6863605976104736, "step": 896 }, { "epoch": 1.9106382978723404, "grad_norm": 0.447265625, "learning_rate": 5.476855294925857e-06, "loss": 0.7966746687889099, "step": 898 }, { "epoch": 1.9148936170212765, "grad_norm": 0.310546875, "learning_rate": 5.455851989596123e-06, "loss": 1.0022021532058716, "step": 900 }, { "epoch": 1.9191489361702128, "grad_norm": 0.3359375, "learning_rate": 5.434887088678614e-06, "loss": 0.7175713181495667, "step": 902 }, { "epoch": 1.923404255319149, "grad_norm": 0.43359375, "learning_rate": 5.413961053799693e-06, "loss": 0.787550687789917, "step": 904 }, { "epoch": 1.9276595744680851, "grad_norm": 0.53515625, "learning_rate": 5.393074345729926e-06, "loss": 0.9805369973182678, "step": 906 }, { "epoch": 1.9319148936170212, "grad_norm": 0.71875, "learning_rate": 5.372227424373942e-06, "loss": 0.90399169921875, "step": 908 }, { "epoch": 1.9361702127659575, "grad_norm": 0.49609375, "learning_rate": 5.351420748760311e-06, "loss": 0.8127355575561523, "step": 910 }, { "epoch": 1.9404255319148938, "grad_norm": 0.34765625, "learning_rate": 5.330654777031428e-06, "loss": 0.9437844157218933, "step": 912 }, { "epoch": 1.9446808510638298, "grad_norm": 0.412109375, "learning_rate": 5.309929966433428e-06, "loss": 1.0004428625106812, "step": 914 }, { "epoch": 1.9489361702127659, "grad_norm": 1.953125, "learning_rate": 5.289246773306118e-06, "loss": 0.8540473580360413, "step": 916 }, { "epoch": 1.9531914893617022, "grad_norm": 0.51171875, "learning_rate": 5.268605653072935e-06, "loss": 0.7977997660636902, "step": 918 }, { "epoch": 1.9574468085106385, "grad_norm": 0.494140625, "learning_rate": 5.248007060230907e-06, "loss": 0.9748218655586243, "step": 920 }, { "epoch": 1.9617021276595743, "grad_norm": 0.41796875, "learning_rate": 5.227451448340651e-06, "loss": 0.86171555519104, "step": 922 }, { "epoch": 1.9659574468085106, "grad_norm": 0.353515625, "learning_rate": 5.206939270016393e-06, "loss": 0.8841200470924377, "step": 924 }, { "epoch": 1.9702127659574469, "grad_norm": 0.50390625, "learning_rate": 5.186470976915983e-06, "loss": 0.9302433133125305, "step": 926 }, { "epoch": 1.974468085106383, "grad_norm": 0.484375, "learning_rate": 5.166047019730971e-06, "loss": 0.6985507011413574, "step": 928 }, { "epoch": 1.978723404255319, "grad_norm": 0.3984375, "learning_rate": 5.145667848176675e-06, "loss": 0.9847785830497742, "step": 930 }, { "epoch": 1.9829787234042553, "grad_norm": 0.4140625, "learning_rate": 5.1253339109822705e-06, "loss": 0.9930030703544617, "step": 932 }, { "epoch": 1.9872340425531916, "grad_norm": 0.5703125, "learning_rate": 5.10504565588092e-06, "loss": 0.7830001711845398, "step": 934 }, { "epoch": 1.9914893617021276, "grad_norm": 0.365234375, "learning_rate": 5.084803529599915e-06, "loss": 0.607052743434906, "step": 936 }, { "epoch": 1.9957446808510637, "grad_norm": 1.0078125, "learning_rate": 5.064607977850834e-06, "loss": 0.9631056785583496, "step": 938 }, { "epoch": 2.0, "grad_norm": 0.60546875, "learning_rate": 5.044459445319727e-06, "loss": 0.6884191036224365, "step": 940 }, { "epoch": 2.0042553191489363, "grad_norm": 0.4296875, "learning_rate": 5.024358375657334e-06, "loss": 0.5563607215881348, "step": 942 }, { "epoch": 2.008510638297872, "grad_norm": 0.412109375, "learning_rate": 5.004305211469303e-06, "loss": 0.5658197402954102, "step": 944 }, { "epoch": 2.0127659574468084, "grad_norm": 0.4453125, "learning_rate": 4.984300394306453e-06, "loss": 0.5938859581947327, "step": 946 }, { "epoch": 2.0170212765957447, "grad_norm": 0.416015625, "learning_rate": 4.964344364655053e-06, "loss": 0.5363519191741943, "step": 948 }, { "epoch": 2.021276595744681, "grad_norm": 0.66015625, "learning_rate": 4.944437561927118e-06, "loss": 0.6647061109542847, "step": 950 }, { "epoch": 2.025531914893617, "grad_norm": 0.357421875, "learning_rate": 4.92458042445073e-06, "loss": 0.557117223739624, "step": 952 }, { "epoch": 2.029787234042553, "grad_norm": 0.427734375, "learning_rate": 4.9047733894603946e-06, "loss": 0.3953529894351959, "step": 954 }, { "epoch": 2.0340425531914894, "grad_norm": 0.439453125, "learning_rate": 4.88501689308741e-06, "loss": 0.779535710811615, "step": 956 }, { "epoch": 2.0382978723404257, "grad_norm": 0.36328125, "learning_rate": 4.8653113703502695e-06, "loss": 0.5275522470474243, "step": 958 }, { "epoch": 2.0425531914893615, "grad_norm": 0.51953125, "learning_rate": 4.845657255145068e-06, "loss": 0.5947195291519165, "step": 960 }, { "epoch": 2.046808510638298, "grad_norm": 2.640625, "learning_rate": 4.8260549802359605e-06, "loss": 0.6270468235015869, "step": 962 }, { "epoch": 2.051063829787234, "grad_norm": 0.64453125, "learning_rate": 4.806504977245636e-06, "loss": 0.6905896067619324, "step": 964 }, { "epoch": 2.0553191489361704, "grad_norm": 0.51953125, "learning_rate": 4.7870076766457995e-06, "loss": 0.5533561110496521, "step": 966 }, { "epoch": 2.0595744680851062, "grad_norm": 0.609375, "learning_rate": 4.767563507747705e-06, "loss": 0.6810728311538696, "step": 968 }, { "epoch": 2.0638297872340425, "grad_norm": 0.337890625, "learning_rate": 4.748172898692704e-06, "loss": 0.3691895306110382, "step": 970 }, { "epoch": 2.068085106382979, "grad_norm": 0.6953125, "learning_rate": 4.728836276442803e-06, "loss": 0.5883108377456665, "step": 972 }, { "epoch": 2.072340425531915, "grad_norm": 0.498046875, "learning_rate": 4.7095540667712775e-06, "loss": 0.5326440334320068, "step": 974 }, { "epoch": 2.076595744680851, "grad_norm": 1.84375, "learning_rate": 4.690326694253294e-06, "loss": 0.41566312313079834, "step": 976 }, { "epoch": 2.0808510638297872, "grad_norm": 0.62109375, "learning_rate": 4.671154582256559e-06, "loss": 0.7029457688331604, "step": 978 }, { "epoch": 2.0851063829787235, "grad_norm": 0.2412109375, "learning_rate": 4.6520381529319954e-06, "loss": 0.4108755588531494, "step": 980 }, { "epoch": 2.0893617021276594, "grad_norm": 0.4609375, "learning_rate": 4.632977827204445e-06, "loss": 0.4902803599834442, "step": 982 }, { "epoch": 2.0936170212765957, "grad_norm": 0.98828125, "learning_rate": 4.613974024763411e-06, "loss": 0.5197535753250122, "step": 984 }, { "epoch": 2.097872340425532, "grad_norm": 0.341796875, "learning_rate": 4.595027164053805e-06, "loss": 0.4887603521347046, "step": 986 }, { "epoch": 2.1021276595744682, "grad_norm": 0.45703125, "learning_rate": 4.5761376622667406e-06, "loss": 0.276875376701355, "step": 988 }, { "epoch": 2.106382978723404, "grad_norm": 0.375, "learning_rate": 4.557305935330346e-06, "loss": 0.6325949430465698, "step": 990 }, { "epoch": 2.1106382978723404, "grad_norm": 0.451171875, "learning_rate": 4.538532397900599e-06, "loss": 0.6041569709777832, "step": 992 }, { "epoch": 2.1148936170212767, "grad_norm": 0.408203125, "learning_rate": 4.519817463352204e-06, "loss": 0.6599090099334717, "step": 994 }, { "epoch": 2.119148936170213, "grad_norm": 0.68359375, "learning_rate": 4.5011615437694915e-06, "loss": 0.5671730041503906, "step": 996 }, { "epoch": 2.123404255319149, "grad_norm": 0.51953125, "learning_rate": 4.48256504993734e-06, "loss": 0.5928320288658142, "step": 998 }, { "epoch": 2.127659574468085, "grad_norm": 0.396484375, "learning_rate": 4.464028391332129e-06, "loss": 0.20121610164642334, "step": 1000 }, { "epoch": 2.1319148936170214, "grad_norm": 0.4140625, "learning_rate": 4.445551976112725e-06, "loss": 0.7131472826004028, "step": 1002 }, { "epoch": 2.1361702127659576, "grad_norm": 0.52734375, "learning_rate": 4.4271362111115006e-06, "loss": 0.5065695643424988, "step": 1004 }, { "epoch": 2.1404255319148935, "grad_norm": 0.765625, "learning_rate": 4.408781501825362e-06, "loss": 0.733562707901001, "step": 1006 }, { "epoch": 2.1446808510638298, "grad_norm": 0.439453125, "learning_rate": 4.390488252406838e-06, "loss": 0.5062799453735352, "step": 1008 }, { "epoch": 2.148936170212766, "grad_norm": 0.310546875, "learning_rate": 4.372256865655169e-06, "loss": 0.39719632267951965, "step": 1010 }, { "epoch": 2.153191489361702, "grad_norm": 0.384765625, "learning_rate": 4.354087743007433e-06, "loss": 0.5480824112892151, "step": 1012 }, { "epoch": 2.157446808510638, "grad_norm": 0.640625, "learning_rate": 4.335981284529725e-06, "loss": 0.5634360909461975, "step": 1014 }, { "epoch": 2.1617021276595745, "grad_norm": 0.5390625, "learning_rate": 4.317937888908333e-06, "loss": 0.6165044903755188, "step": 1016 }, { "epoch": 2.1659574468085108, "grad_norm": 0.66796875, "learning_rate": 4.2999579534409626e-06, "loss": 0.3983045220375061, "step": 1018 }, { "epoch": 2.1702127659574466, "grad_norm": 0.73828125, "learning_rate": 4.282041874027989e-06, "loss": 0.41795188188552856, "step": 1020 }, { "epoch": 2.174468085106383, "grad_norm": 0.474609375, "learning_rate": 4.264190045163742e-06, "loss": 0.6024309396743774, "step": 1022 }, { "epoch": 2.178723404255319, "grad_norm": 1.015625, "learning_rate": 4.246402859927817e-06, "loss": 0.6394532918930054, "step": 1024 }, { "epoch": 2.1829787234042555, "grad_norm": 0.55078125, "learning_rate": 4.22868070997642e-06, "loss": 0.4610865116119385, "step": 1026 }, { "epoch": 2.1872340425531913, "grad_norm": 0.40234375, "learning_rate": 4.211023985533748e-06, "loss": 0.5758063197135925, "step": 1028 }, { "epoch": 2.1914893617021276, "grad_norm": 0.443359375, "learning_rate": 4.1934330753833885e-06, "loss": 0.6651563048362732, "step": 1030 }, { "epoch": 2.195744680851064, "grad_norm": 3.75, "learning_rate": 4.175908366859766e-06, "loss": 0.5991113185882568, "step": 1032 }, { "epoch": 2.2, "grad_norm": 0.470703125, "learning_rate": 4.158450245839608e-06, "loss": 0.6895382404327393, "step": 1034 }, { "epoch": 2.204255319148936, "grad_norm": 0.76953125, "learning_rate": 4.141059096733455e-06, "loss": 0.4550260305404663, "step": 1036 }, { "epoch": 2.2085106382978723, "grad_norm": 0.42578125, "learning_rate": 4.123735302477193e-06, "loss": 0.4480676054954529, "step": 1038 }, { "epoch": 2.2127659574468086, "grad_norm": 0.419921875, "learning_rate": 4.106479244523616e-06, "loss": 0.5520376563072205, "step": 1040 }, { "epoch": 2.217021276595745, "grad_norm": 0.470703125, "learning_rate": 4.0892913028340335e-06, "loss": 0.6519399285316467, "step": 1042 }, { "epoch": 2.2212765957446807, "grad_norm": 0.44921875, "learning_rate": 4.072171855869905e-06, "loss": 0.5653026700019836, "step": 1044 }, { "epoch": 2.225531914893617, "grad_norm": 0.44140625, "learning_rate": 4.055121280584499e-06, "loss": 0.5862460732460022, "step": 1046 }, { "epoch": 2.2297872340425533, "grad_norm": 1.1484375, "learning_rate": 4.038139952414603e-06, "loss": 0.8048577308654785, "step": 1048 }, { "epoch": 2.2340425531914896, "grad_norm": 0.396484375, "learning_rate": 4.02122824527225e-06, "loss": 0.530511736869812, "step": 1050 }, { "epoch": 2.2382978723404254, "grad_norm": 0.5234375, "learning_rate": 4.004386531536482e-06, "loss": 0.43314328789711, "step": 1052 }, { "epoch": 2.2425531914893617, "grad_norm": 0.57421875, "learning_rate": 3.987615182045163e-06, "loss": 0.5556919574737549, "step": 1054 }, { "epoch": 2.246808510638298, "grad_norm": 0.392578125, "learning_rate": 3.9709145660868015e-06, "loss": 0.6972762942314148, "step": 1056 }, { "epoch": 2.251063829787234, "grad_norm": 0.55859375, "learning_rate": 3.9542850513924275e-06, "loss": 0.31911152601242065, "step": 1058 }, { "epoch": 2.25531914893617, "grad_norm": 0.65234375, "learning_rate": 3.9377270041274875e-06, "loss": 0.6526750922203064, "step": 1060 }, { "epoch": 2.2595744680851064, "grad_norm": 0.76171875, "learning_rate": 3.921240788883785e-06, "loss": 0.5144931077957153, "step": 1062 }, { "epoch": 2.2638297872340427, "grad_norm": 0.69921875, "learning_rate": 3.904826768671458e-06, "loss": 0.7288011312484741, "step": 1064 }, { "epoch": 2.2680851063829786, "grad_norm": 0.578125, "learning_rate": 3.888485304910978e-06, "loss": 0.6101799607276917, "step": 1066 }, { "epoch": 2.272340425531915, "grad_norm": 0.6796875, "learning_rate": 3.8722167574252e-06, "loss": 0.5383592247962952, "step": 1068 }, { "epoch": 2.276595744680851, "grad_norm": 0.82421875, "learning_rate": 3.856021484431428e-06, "loss": 0.6244062185287476, "step": 1070 }, { "epoch": 2.2808510638297874, "grad_norm": 0.41796875, "learning_rate": 3.839899842533538e-06, "loss": 0.4686053991317749, "step": 1072 }, { "epoch": 2.2851063829787233, "grad_norm": 0.5078125, "learning_rate": 3.823852186714121e-06, "loss": 0.5087999105453491, "step": 1074 }, { "epoch": 2.2893617021276595, "grad_norm": 0.48828125, "learning_rate": 3.80787887032667e-06, "loss": 0.4900204837322235, "step": 1076 }, { "epoch": 2.293617021276596, "grad_norm": 0.70703125, "learning_rate": 3.7919802450877993e-06, "loss": 0.5593716502189636, "step": 1078 }, { "epoch": 2.297872340425532, "grad_norm": 0.859375, "learning_rate": 3.7761566610694882e-06, "loss": 0.3470194339752197, "step": 1080 }, { "epoch": 2.302127659574468, "grad_norm": 0.5546875, "learning_rate": 3.7604084666913924e-06, "loss": 0.28270450234413147, "step": 1082 }, { "epoch": 2.3063829787234043, "grad_norm": 0.4609375, "learning_rate": 3.74473600871316e-06, "loss": 0.6159269213676453, "step": 1084 }, { "epoch": 2.3106382978723405, "grad_norm": 0.38671875, "learning_rate": 3.729139632226795e-06, "loss": 0.46399620175361633, "step": 1086 }, { "epoch": 2.3148936170212764, "grad_norm": 1.3046875, "learning_rate": 3.713619680649067e-06, "loss": 0.39948195219039917, "step": 1088 }, { "epoch": 2.3191489361702127, "grad_norm": 1.890625, "learning_rate": 3.698176495713943e-06, "loss": 0.4936513602733612, "step": 1090 }, { "epoch": 2.323404255319149, "grad_norm": 0.453125, "learning_rate": 3.6828104174650614e-06, "loss": 0.6025733351707458, "step": 1092 }, { "epoch": 2.3276595744680852, "grad_norm": 0.42578125, "learning_rate": 3.667521784248253e-06, "loss": 0.5419857501983643, "step": 1094 }, { "epoch": 2.331914893617021, "grad_norm": 0.5625, "learning_rate": 3.652310932704083e-06, "loss": 0.5457516312599182, "step": 1096 }, { "epoch": 2.3361702127659574, "grad_norm": 0.6015625, "learning_rate": 3.637178197760443e-06, "loss": 0.5860179662704468, "step": 1098 }, { "epoch": 2.3404255319148937, "grad_norm": 0.36328125, "learning_rate": 3.6221239126251687e-06, "loss": 0.4711592197418213, "step": 1100 }, { "epoch": 2.34468085106383, "grad_norm": 0.64453125, "learning_rate": 3.6071484087787147e-06, "loss": 0.6296599507331848, "step": 1102 }, { "epoch": 2.348936170212766, "grad_norm": 1.96875, "learning_rate": 3.59225201596685e-06, "loss": 0.7384690046310425, "step": 1104 }, { "epoch": 2.353191489361702, "grad_norm": 0.63671875, "learning_rate": 3.577435062193391e-06, "loss": 0.5660156607627869, "step": 1106 }, { "epoch": 2.3574468085106384, "grad_norm": 1.875, "learning_rate": 3.562697873712993e-06, "loss": 0.5146188139915466, "step": 1108 }, { "epoch": 2.3617021276595747, "grad_norm": 1.046875, "learning_rate": 3.548040775023951e-06, "loss": 0.4210270643234253, "step": 1110 }, { "epoch": 2.3659574468085105, "grad_norm": 0.796875, "learning_rate": 3.5334640888610656e-06, "loss": 0.4388498365879059, "step": 1112 }, { "epoch": 2.370212765957447, "grad_norm": 1.90625, "learning_rate": 3.5189681361885336e-06, "loss": 0.3667604327201843, "step": 1114 }, { "epoch": 2.374468085106383, "grad_norm": 0.36328125, "learning_rate": 3.5045532361928817e-06, "loss": 0.4419676959514618, "step": 1116 }, { "epoch": 2.378723404255319, "grad_norm": 0.66015625, "learning_rate": 3.490219706275933e-06, "loss": 0.6218468546867371, "step": 1118 }, { "epoch": 2.382978723404255, "grad_norm": 0.435546875, "learning_rate": 3.4759678620478234e-06, "loss": 0.4756940007209778, "step": 1120 }, { "epoch": 2.3872340425531915, "grad_norm": 0.375, "learning_rate": 3.4617980173200518e-06, "loss": 0.6557533144950867, "step": 1122 }, { "epoch": 2.391489361702128, "grad_norm": 0.42578125, "learning_rate": 3.447710484098571e-06, "loss": 0.3975709080696106, "step": 1124 }, { "epoch": 2.395744680851064, "grad_norm": 0.58203125, "learning_rate": 3.43370557257691e-06, "loss": 0.5444962382316589, "step": 1126 }, { "epoch": 2.4, "grad_norm": 0.47265625, "learning_rate": 3.4197835911293578e-06, "loss": 0.48340773582458496, "step": 1128 }, { "epoch": 2.404255319148936, "grad_norm": 0.5625, "learning_rate": 3.4059448463041582e-06, "loss": 0.8209078311920166, "step": 1130 }, { "epoch": 2.4085106382978725, "grad_norm": 0.462890625, "learning_rate": 3.3921896428167704e-06, "loss": 0.6566969156265259, "step": 1132 }, { "epoch": 2.4127659574468083, "grad_norm": 0.439453125, "learning_rate": 3.378518283543155e-06, "loss": 0.7115936875343323, "step": 1134 }, { "epoch": 2.4170212765957446, "grad_norm": 0.6875, "learning_rate": 3.3649310695131094e-06, "loss": 0.48289287090301514, "step": 1136 }, { "epoch": 2.421276595744681, "grad_norm": 0.462890625, "learning_rate": 3.3514282999036305e-06, "loss": 0.3552096486091614, "step": 1138 }, { "epoch": 2.425531914893617, "grad_norm": 0.400390625, "learning_rate": 3.3380102720323343e-06, "loss": 0.635092556476593, "step": 1140 }, { "epoch": 2.429787234042553, "grad_norm": 0.66796875, "learning_rate": 3.324677281350911e-06, "loss": 0.4491591453552246, "step": 1142 }, { "epoch": 2.4340425531914893, "grad_norm": 0.859375, "learning_rate": 3.3114296214386135e-06, "loss": 0.5700670480728149, "step": 1144 }, { "epoch": 2.4382978723404256, "grad_norm": 1.2265625, "learning_rate": 3.2982675839957957e-06, "loss": 0.6150033473968506, "step": 1146 }, { "epoch": 2.4425531914893615, "grad_norm": 0.5703125, "learning_rate": 3.28519145883749e-06, "loss": 0.2981261909008026, "step": 1148 }, { "epoch": 2.4468085106382977, "grad_norm": 0.4140625, "learning_rate": 3.2722015338870253e-06, "loss": 0.43131235241889954, "step": 1150 }, { "epoch": 2.451063829787234, "grad_norm": 1.1640625, "learning_rate": 3.2592980951696847e-06, "loss": 0.5070037841796875, "step": 1152 }, { "epoch": 2.4553191489361703, "grad_norm": 1.078125, "learning_rate": 3.2464814268064147e-06, "loss": 0.4555862843990326, "step": 1154 }, { "epoch": 2.4595744680851066, "grad_norm": 0.49609375, "learning_rate": 3.2337518110075632e-06, "loss": 0.5812932252883911, "step": 1156 }, { "epoch": 2.4638297872340424, "grad_norm": 1.359375, "learning_rate": 3.221109528066664e-06, "loss": 0.5926228761672974, "step": 1158 }, { "epoch": 2.4680851063829787, "grad_norm": 0.828125, "learning_rate": 3.2085548563542688e-06, "loss": 0.6022335290908813, "step": 1160 }, { "epoch": 2.472340425531915, "grad_norm": 0.7890625, "learning_rate": 3.19608807231182e-06, "loss": 0.5389635562896729, "step": 1162 }, { "epoch": 2.476595744680851, "grad_norm": 0.50390625, "learning_rate": 3.1837094504455587e-06, "loss": 0.586044192314148, "step": 1164 }, { "epoch": 2.480851063829787, "grad_norm": 0.482421875, "learning_rate": 3.17141926332048e-06, "loss": 0.5692299604415894, "step": 1166 }, { "epoch": 2.4851063829787234, "grad_norm": 0.44140625, "learning_rate": 3.159217781554335e-06, "loss": 0.658069372177124, "step": 1168 }, { "epoch": 2.4893617021276597, "grad_norm": 0.416015625, "learning_rate": 3.1471052738116726e-06, "loss": 0.5921551585197449, "step": 1170 }, { "epoch": 2.4936170212765956, "grad_norm": 0.35546875, "learning_rate": 3.135082006797918e-06, "loss": 0.45771515369415283, "step": 1172 }, { "epoch": 2.497872340425532, "grad_norm": 0.490234375, "learning_rate": 3.123148245253508e-06, "loss": 0.3539358079433441, "step": 1174 }, { "epoch": 2.502127659574468, "grad_norm": 0.41796875, "learning_rate": 3.111304251948056e-06, "loss": 0.6486715078353882, "step": 1176 }, { "epoch": 2.506382978723404, "grad_norm": 0.75, "learning_rate": 3.0995502876745657e-06, "loss": 0.3491562008857727, "step": 1178 }, { "epoch": 2.5106382978723403, "grad_norm": 1.046875, "learning_rate": 3.087886611243692e-06, "loss": 0.554216742515564, "step": 1180 }, { "epoch": 2.5148936170212766, "grad_norm": 0.447265625, "learning_rate": 3.076313479478042e-06, "loss": 0.46358993649482727, "step": 1182 }, { "epoch": 2.519148936170213, "grad_norm": 2.625, "learning_rate": 3.064831147206519e-06, "loss": 0.7309602499008179, "step": 1184 }, { "epoch": 2.523404255319149, "grad_norm": 0.380859375, "learning_rate": 3.05343986725871e-06, "loss": 0.5900013446807861, "step": 1186 }, { "epoch": 2.527659574468085, "grad_norm": 1.734375, "learning_rate": 3.0421398904593186e-06, "loss": 0.8710350394248962, "step": 1188 }, { "epoch": 2.5319148936170213, "grad_norm": 0.34375, "learning_rate": 3.030931465622647e-06, "loss": 0.7665842175483704, "step": 1190 }, { "epoch": 2.5361702127659576, "grad_norm": 0.423828125, "learning_rate": 3.0198148395471105e-06, "loss": 0.5311375260353088, "step": 1192 }, { "epoch": 2.5404255319148934, "grad_norm": 0.41015625, "learning_rate": 3.00879025700981e-06, "loss": 0.2682938873767853, "step": 1194 }, { "epoch": 2.5446808510638297, "grad_norm": 0.9140625, "learning_rate": 2.997857960761137e-06, "loss": 0.5427325367927551, "step": 1196 }, { "epoch": 2.548936170212766, "grad_norm": 0.765625, "learning_rate": 2.98701819151943e-06, "loss": 0.49154531955718994, "step": 1198 }, { "epoch": 2.5531914893617023, "grad_norm": 0.39453125, "learning_rate": 2.976271187965673e-06, "loss": 0.5094670057296753, "step": 1200 }, { "epoch": 2.5574468085106385, "grad_norm": 0.71875, "learning_rate": 2.9656171867382446e-06, "loss": 0.4511142075061798, "step": 1202 }, { "epoch": 2.5617021276595744, "grad_norm": 0.5859375, "learning_rate": 2.955056422427704e-06, "loss": 0.5634865760803223, "step": 1204 }, { "epoch": 2.5659574468085107, "grad_norm": 0.51953125, "learning_rate": 2.9445891275716233e-06, "loss": 0.3763676583766937, "step": 1206 }, { "epoch": 2.570212765957447, "grad_norm": 0.703125, "learning_rate": 2.9342155326494704e-06, "loss": 0.5212900638580322, "step": 1208 }, { "epoch": 2.574468085106383, "grad_norm": 0.72265625, "learning_rate": 2.9239358660775357e-06, "loss": 0.4663785994052887, "step": 1210 }, { "epoch": 2.578723404255319, "grad_norm": 0.50390625, "learning_rate": 2.9137503542038966e-06, "loss": 0.5414974093437195, "step": 1212 }, { "epoch": 2.5829787234042554, "grad_norm": 0.470703125, "learning_rate": 2.903659221303441e-06, "loss": 0.6152816414833069, "step": 1214 }, { "epoch": 2.5872340425531917, "grad_norm": 0.455078125, "learning_rate": 2.893662689572925e-06, "loss": 0.42417243123054504, "step": 1216 }, { "epoch": 2.5914893617021275, "grad_norm": 0.419921875, "learning_rate": 2.883760979126076e-06, "loss": 0.6008761525154114, "step": 1218 }, { "epoch": 2.595744680851064, "grad_norm": 0.90234375, "learning_rate": 2.8739543079887554e-06, "loss": 0.749297022819519, "step": 1220 }, { "epoch": 2.6, "grad_norm": 0.94140625, "learning_rate": 2.8642428920941513e-06, "loss": 0.6406426429748535, "step": 1222 }, { "epoch": 2.604255319148936, "grad_norm": 0.63671875, "learning_rate": 2.8546269452780275e-06, "loss": 0.5915369391441345, "step": 1224 }, { "epoch": 2.608510638297872, "grad_norm": 2.078125, "learning_rate": 2.8451066792740108e-06, "loss": 0.7708158493041992, "step": 1226 }, { "epoch": 2.6127659574468085, "grad_norm": 0.796875, "learning_rate": 2.835682303708931e-06, "loss": 0.2944878339767456, "step": 1228 }, { "epoch": 2.617021276595745, "grad_norm": 0.56640625, "learning_rate": 2.826354026098208e-06, "loss": 0.4445026218891144, "step": 1230 }, { "epoch": 2.621276595744681, "grad_norm": 0.546875, "learning_rate": 2.817122051841277e-06, "loss": 0.5953022837638855, "step": 1232 }, { "epoch": 2.625531914893617, "grad_norm": 0.66015625, "learning_rate": 2.807986584217072e-06, "loss": 0.47725632786750793, "step": 1234 }, { "epoch": 2.629787234042553, "grad_norm": 3.375, "learning_rate": 2.7989478243795434e-06, "loss": 0.5917444229125977, "step": 1236 }, { "epoch": 2.6340425531914895, "grad_norm": 0.58984375, "learning_rate": 2.790005971353233e-06, "loss": 0.6352754831314087, "step": 1238 }, { "epoch": 2.6382978723404253, "grad_norm": 0.400390625, "learning_rate": 2.7811612220288905e-06, "loss": 0.5205258131027222, "step": 1240 }, { "epoch": 2.6425531914893616, "grad_norm": 0.53515625, "learning_rate": 2.77241377115914e-06, "loss": 0.716691255569458, "step": 1242 }, { "epoch": 2.646808510638298, "grad_norm": 0.74609375, "learning_rate": 2.7637638113541866e-06, "loss": 0.3764870762825012, "step": 1244 }, { "epoch": 2.651063829787234, "grad_norm": 0.95703125, "learning_rate": 2.755211533077581e-06, "loss": 0.5524653196334839, "step": 1246 }, { "epoch": 2.65531914893617, "grad_norm": 0.4375, "learning_rate": 2.746757124642024e-06, "loss": 0.5442506074905396, "step": 1248 }, { "epoch": 2.6595744680851063, "grad_norm": 0.53515625, "learning_rate": 2.7384007722052168e-06, "loss": 0.5800641775131226, "step": 1250 }, { "epoch": 2.6638297872340426, "grad_norm": 0.5546875, "learning_rate": 2.7301426597657662e-06, "loss": 0.5853485465049744, "step": 1252 }, { "epoch": 2.6680851063829785, "grad_norm": 0.53515625, "learning_rate": 2.721982969159132e-06, "loss": 0.38345175981521606, "step": 1254 }, { "epoch": 2.6723404255319148, "grad_norm": 0.421875, "learning_rate": 2.7139218800536224e-06, "loss": 0.6944982409477234, "step": 1256 }, { "epoch": 2.676595744680851, "grad_norm": 0.384765625, "learning_rate": 2.7059595699464363e-06, "loss": 0.5350843667984009, "step": 1258 }, { "epoch": 2.6808510638297873, "grad_norm": 0.380859375, "learning_rate": 2.6980962141597594e-06, "loss": 0.5438748598098755, "step": 1260 }, { "epoch": 2.6851063829787236, "grad_norm": 0.431640625, "learning_rate": 2.6903319858369005e-06, "loss": 0.7831379175186157, "step": 1262 }, { "epoch": 2.6893617021276595, "grad_norm": 0.83203125, "learning_rate": 2.6826670559384784e-06, "loss": 0.3888491094112396, "step": 1264 }, { "epoch": 2.6936170212765957, "grad_norm": 0.63671875, "learning_rate": 2.6751015932386615e-06, "loss": 0.4081690311431885, "step": 1266 }, { "epoch": 2.697872340425532, "grad_norm": 0.55859375, "learning_rate": 2.6676357643214467e-06, "loss": 0.757609486579895, "step": 1268 }, { "epoch": 2.702127659574468, "grad_norm": 0.359375, "learning_rate": 2.660269733576995e-06, "loss": 0.2168269008398056, "step": 1270 }, { "epoch": 2.706382978723404, "grad_norm": 0.4453125, "learning_rate": 2.6530036631980093e-06, "loss": 0.5121868848800659, "step": 1272 }, { "epoch": 2.7106382978723405, "grad_norm": 0.67578125, "learning_rate": 2.6458377131761655e-06, "loss": 0.588572084903717, "step": 1274 }, { "epoch": 2.7148936170212767, "grad_norm": 0.396484375, "learning_rate": 2.6387720412985873e-06, "loss": 0.5837306380271912, "step": 1276 }, { "epoch": 2.719148936170213, "grad_norm": 1.2734375, "learning_rate": 2.631806803144373e-06, "loss": 0.5779358148574829, "step": 1278 }, { "epoch": 2.723404255319149, "grad_norm": 1.1171875, "learning_rate": 2.624942152081171e-06, "loss": 0.42244261503219604, "step": 1280 }, { "epoch": 2.727659574468085, "grad_norm": 1.1796875, "learning_rate": 2.6181782392618002e-06, "loss": 0.5723677277565002, "step": 1282 }, { "epoch": 2.731914893617021, "grad_norm": 0.65625, "learning_rate": 2.611515213620924e-06, "loss": 0.6737433075904846, "step": 1284 }, { "epoch": 2.7361702127659573, "grad_norm": 0.953125, "learning_rate": 2.604953221871769e-06, "loss": 0.6697869300842285, "step": 1286 }, { "epoch": 2.7404255319148936, "grad_norm": 0.431640625, "learning_rate": 2.5984924085028968e-06, "loss": 0.41797778010368347, "step": 1288 }, { "epoch": 2.74468085106383, "grad_norm": 0.53515625, "learning_rate": 2.5921329157750205e-06, "loss": 0.6901787519454956, "step": 1290 }, { "epoch": 2.748936170212766, "grad_norm": 0.62109375, "learning_rate": 2.5858748837178724e-06, "loss": 0.48409298062324524, "step": 1292 }, { "epoch": 2.753191489361702, "grad_norm": 1.4765625, "learning_rate": 2.579718450127124e-06, "loss": 0.48840850591659546, "step": 1294 }, { "epoch": 2.7574468085106383, "grad_norm": 0.515625, "learning_rate": 2.5736637505613453e-06, "loss": 0.5451318621635437, "step": 1296 }, { "epoch": 2.7617021276595746, "grad_norm": 0.609375, "learning_rate": 2.5677109183390254e-06, "loss": 0.3569204807281494, "step": 1298 }, { "epoch": 2.7659574468085104, "grad_norm": 0.43359375, "learning_rate": 2.5618600845356374e-06, "loss": 0.6634436845779419, "step": 1300 }, { "epoch": 2.7702127659574467, "grad_norm": 0.416015625, "learning_rate": 2.5561113779807473e-06, "loss": 0.40077003836631775, "step": 1302 }, { "epoch": 2.774468085106383, "grad_norm": 2.5625, "learning_rate": 2.550464925255182e-06, "loss": 0.49653542041778564, "step": 1304 }, { "epoch": 2.7787234042553193, "grad_norm": 0.498046875, "learning_rate": 2.544920850688239e-06, "loss": 0.3718079626560211, "step": 1306 }, { "epoch": 2.7829787234042556, "grad_norm": 0.490234375, "learning_rate": 2.5394792763549506e-06, "loss": 0.6696460843086243, "step": 1308 }, { "epoch": 2.7872340425531914, "grad_norm": 0.494140625, "learning_rate": 2.534140322073397e-06, "loss": 0.4750995337963104, "step": 1310 }, { "epoch": 2.7914893617021277, "grad_norm": 0.412109375, "learning_rate": 2.5289041054020637e-06, "loss": 0.38971856236457825, "step": 1312 }, { "epoch": 2.795744680851064, "grad_norm": 0.44921875, "learning_rate": 2.523770741637259e-06, "loss": 0.5828387141227722, "step": 1314 }, { "epoch": 2.8, "grad_norm": 0.494140625, "learning_rate": 2.518740343810568e-06, "loss": 0.2812992334365845, "step": 1316 }, { "epoch": 2.804255319148936, "grad_norm": 0.69921875, "learning_rate": 2.513813022686371e-06, "loss": 0.6449145674705505, "step": 1318 }, { "epoch": 2.8085106382978724, "grad_norm": 0.59375, "learning_rate": 2.5089888867594004e-06, "loss": 0.42496779561042786, "step": 1320 }, { "epoch": 2.8127659574468087, "grad_norm": 0.90625, "learning_rate": 2.5042680422523538e-06, "loss": 0.6403509974479675, "step": 1322 }, { "epoch": 2.8170212765957445, "grad_norm": 0.65625, "learning_rate": 2.4996505931135513e-06, "loss": 0.5965058207511902, "step": 1324 }, { "epoch": 2.821276595744681, "grad_norm": 0.55078125, "learning_rate": 2.4951366410146506e-06, "loss": 0.38872432708740234, "step": 1326 }, { "epoch": 2.825531914893617, "grad_norm": 0.60546875, "learning_rate": 2.4907262853484093e-06, "loss": 0.47181040048599243, "step": 1328 }, { "epoch": 2.829787234042553, "grad_norm": 0.40234375, "learning_rate": 2.4864196232264913e-06, "loss": 0.5333115458488464, "step": 1330 }, { "epoch": 2.8340425531914892, "grad_norm": 0.6015625, "learning_rate": 2.4822167494773325e-06, "loss": 0.6577153205871582, "step": 1332 }, { "epoch": 2.8382978723404255, "grad_norm": 0.55078125, "learning_rate": 2.4781177566440513e-06, "loss": 0.544109046459198, "step": 1334 }, { "epoch": 2.842553191489362, "grad_norm": 0.96484375, "learning_rate": 2.474122734982411e-06, "loss": 0.26606178283691406, "step": 1336 }, { "epoch": 2.846808510638298, "grad_norm": 0.3984375, "learning_rate": 2.4702317724588332e-06, "loss": 0.486730694770813, "step": 1338 }, { "epoch": 2.851063829787234, "grad_norm": 0.58203125, "learning_rate": 2.4664449547484595e-06, "loss": 0.47592607140541077, "step": 1340 }, { "epoch": 2.8553191489361702, "grad_norm": 0.5625, "learning_rate": 2.462762365233268e-06, "loss": 0.4367084801197052, "step": 1342 }, { "epoch": 2.8595744680851065, "grad_norm": 0.4453125, "learning_rate": 2.459184085000232e-06, "loss": 0.3742711842060089, "step": 1344 }, { "epoch": 2.8638297872340424, "grad_norm": 0.6328125, "learning_rate": 2.455710192839539e-06, "loss": 0.5936036705970764, "step": 1346 }, { "epoch": 2.8680851063829786, "grad_norm": 0.404296875, "learning_rate": 2.452340765242855e-06, "loss": 0.6136466860771179, "step": 1348 }, { "epoch": 2.872340425531915, "grad_norm": 0.625, "learning_rate": 2.449075876401641e-06, "loss": 0.6735158562660217, "step": 1350 }, { "epoch": 2.876595744680851, "grad_norm": 0.62109375, "learning_rate": 2.4459155982055145e-06, "loss": 0.6614925861358643, "step": 1352 }, { "epoch": 2.8808510638297875, "grad_norm": 1.015625, "learning_rate": 2.4428600002406735e-06, "loss": 0.780015230178833, "step": 1354 }, { "epoch": 2.8851063829787233, "grad_norm": 0.51953125, "learning_rate": 2.4399091497883596e-06, "loss": 0.38140493631362915, "step": 1356 }, { "epoch": 2.8893617021276596, "grad_norm": 0.482421875, "learning_rate": 2.4370631118233766e-06, "loss": 0.38466039299964905, "step": 1358 }, { "epoch": 2.8936170212765955, "grad_norm": 0.5390625, "learning_rate": 2.4343219490126636e-06, "loss": 0.6831486821174622, "step": 1360 }, { "epoch": 2.8978723404255318, "grad_norm": 0.474609375, "learning_rate": 2.4316857217139125e-06, "loss": 0.5675507187843323, "step": 1362 }, { "epoch": 2.902127659574468, "grad_norm": 0.4765625, "learning_rate": 2.429154487974237e-06, "loss": 0.5387779474258423, "step": 1364 }, { "epoch": 2.9063829787234043, "grad_norm": 0.466796875, "learning_rate": 2.4267283035288974e-06, "loss": 0.5070762634277344, "step": 1366 }, { "epoch": 2.9106382978723406, "grad_norm": 0.404296875, "learning_rate": 2.4244072218000737e-06, "loss": 0.49968618154525757, "step": 1368 }, { "epoch": 2.9148936170212765, "grad_norm": 0.439453125, "learning_rate": 2.422191293895687e-06, "loss": 0.7925405502319336, "step": 1370 }, { "epoch": 2.9191489361702128, "grad_norm": 0.8203125, "learning_rate": 2.4200805686082757e-06, "loss": 0.4414962828159332, "step": 1372 }, { "epoch": 2.923404255319149, "grad_norm": 0.72265625, "learning_rate": 2.4180750924139205e-06, "loss": 0.5193897485733032, "step": 1374 }, { "epoch": 2.927659574468085, "grad_norm": 0.455078125, "learning_rate": 2.4161749094712216e-06, "loss": 0.5439836978912354, "step": 1376 }, { "epoch": 2.931914893617021, "grad_norm": 1.015625, "learning_rate": 2.414380061620327e-06, "loss": 0.5974451899528503, "step": 1378 }, { "epoch": 2.9361702127659575, "grad_norm": 0.482421875, "learning_rate": 2.4126905883820076e-06, "loss": 0.43398624658584595, "step": 1380 }, { "epoch": 2.9404255319148938, "grad_norm": 0.69921875, "learning_rate": 2.411106526956792e-06, "loss": 0.7541142702102661, "step": 1382 }, { "epoch": 2.94468085106383, "grad_norm": 0.498046875, "learning_rate": 2.4096279122241438e-06, "loss": 0.592811107635498, "step": 1384 }, { "epoch": 2.948936170212766, "grad_norm": 0.392578125, "learning_rate": 2.408254776741697e-06, "loss": 0.6341920495033264, "step": 1386 }, { "epoch": 2.953191489361702, "grad_norm": 0.76171875, "learning_rate": 2.4069871507445332e-06, "loss": 0.755580484867096, "step": 1388 }, { "epoch": 2.9574468085106385, "grad_norm": 0.455078125, "learning_rate": 2.4058250621445224e-06, "loss": 0.682244598865509, "step": 1390 }, { "epoch": 2.9617021276595743, "grad_norm": 0.484375, "learning_rate": 2.4047685365297056e-06, "loss": 0.5976744890213013, "step": 1392 }, { "epoch": 2.9659574468085106, "grad_norm": 0.4375, "learning_rate": 2.403817597163731e-06, "loss": 0.5079911351203918, "step": 1394 }, { "epoch": 2.970212765957447, "grad_norm": 1.4296875, "learning_rate": 2.402972264985341e-06, "loss": 0.4225712716579437, "step": 1396 }, { "epoch": 2.974468085106383, "grad_norm": 0.57421875, "learning_rate": 2.4022325586079132e-06, "loss": 0.6215579509735107, "step": 1398 }, { "epoch": 2.978723404255319, "grad_norm": 0.3828125, "learning_rate": 2.4015984943190496e-06, "loss": 0.455652117729187, "step": 1400 }, { "epoch": 2.9829787234042553, "grad_norm": 0.70703125, "learning_rate": 2.401070086080218e-06, "loss": 0.5189418792724609, "step": 1402 }, { "epoch": 2.9872340425531916, "grad_norm": 2.890625, "learning_rate": 2.400647345526445e-06, "loss": 0.5081955790519714, "step": 1404 }, { "epoch": 2.9914893617021274, "grad_norm": 0.443359375, "learning_rate": 2.400330281966059e-06, "loss": 0.5243685841560364, "step": 1406 }, { "epoch": 2.9957446808510637, "grad_norm": 0.6796875, "learning_rate": 2.400118902380485e-06, "loss": 0.6540034413337708, "step": 1408 }, { "epoch": 3.0, "grad_norm": 1.359375, "learning_rate": 2.400013211424094e-06, "loss": 0.3355269134044647, "step": 1410 }, { "epoch": 3.0, "step": 1410, "total_flos": 4.1743170019314893e+18, "train_loss": 0.8916917941037644, "train_runtime": 10519.9057, "train_samples_per_second": 4.289, "train_steps_per_second": 0.134 } ], "logging_steps": 2, "max_steps": 1410, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1743170019314893e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }