{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1370,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00291970802919708,
"grad_norm": 4.875,
"learning_rate": 7.246376811594204e-08,
"loss": 1.320786714553833,
"step": 2
},
{
"epoch": 0.00583941605839416,
"grad_norm": 26.375,
"learning_rate": 2.173913043478261e-07,
"loss": 2.3353517055511475,
"step": 4
},
{
"epoch": 0.008759124087591242,
"grad_norm": 5.125,
"learning_rate": 3.623188405797102e-07,
"loss": 1.9446890354156494,
"step": 6
},
{
"epoch": 0.01167883211678832,
"grad_norm": 2.234375,
"learning_rate": 5.072463768115942e-07,
"loss": 1.6843594312667847,
"step": 8
},
{
"epoch": 0.014598540145985401,
"grad_norm": 8.8125,
"learning_rate": 6.521739130434783e-07,
"loss": 1.8062303066253662,
"step": 10
},
{
"epoch": 0.017518248175182483,
"grad_norm": 5.0,
"learning_rate": 7.971014492753623e-07,
"loss": 1.9280399084091187,
"step": 12
},
{
"epoch": 0.020437956204379562,
"grad_norm": 3.015625,
"learning_rate": 9.420289855072465e-07,
"loss": 1.570988655090332,
"step": 14
},
{
"epoch": 0.02335766423357664,
"grad_norm": 11.25,
"learning_rate": 1.0869565217391306e-06,
"loss": 1.7710015773773193,
"step": 16
},
{
"epoch": 0.026277372262773723,
"grad_norm": 4.53125,
"learning_rate": 1.2318840579710147e-06,
"loss": 1.9166163206100464,
"step": 18
},
{
"epoch": 0.029197080291970802,
"grad_norm": 23.5,
"learning_rate": 1.3768115942028987e-06,
"loss": 1.9079008102416992,
"step": 20
},
{
"epoch": 0.032116788321167884,
"grad_norm": 6.15625,
"learning_rate": 1.521739130434783e-06,
"loss": 1.9891327619552612,
"step": 22
},
{
"epoch": 0.035036496350364967,
"grad_norm": 8.6875,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.8731980323791504,
"step": 24
},
{
"epoch": 0.03795620437956204,
"grad_norm": 41.5,
"learning_rate": 1.8115942028985508e-06,
"loss": 1.996793508529663,
"step": 26
},
{
"epoch": 0.040875912408759124,
"grad_norm": 16.125,
"learning_rate": 1.956521739130435e-06,
"loss": 2.4439406394958496,
"step": 28
},
{
"epoch": 0.043795620437956206,
"grad_norm": 4.78125,
"learning_rate": 2.101449275362319e-06,
"loss": 1.4941191673278809,
"step": 30
},
{
"epoch": 0.04671532846715328,
"grad_norm": 5.71875,
"learning_rate": 2.246376811594203e-06,
"loss": 1.9384567737579346,
"step": 32
},
{
"epoch": 0.049635036496350364,
"grad_norm": 3.140625,
"learning_rate": 2.391304347826087e-06,
"loss": 2.106153964996338,
"step": 34
},
{
"epoch": 0.052554744525547446,
"grad_norm": 25.875,
"learning_rate": 2.5362318840579714e-06,
"loss": 2.235496997833252,
"step": 36
},
{
"epoch": 0.05547445255474453,
"grad_norm": 6.46875,
"learning_rate": 2.6811594202898555e-06,
"loss": 2.4106810092926025,
"step": 38
},
{
"epoch": 0.058394160583941604,
"grad_norm": 4.375,
"learning_rate": 2.8260869565217393e-06,
"loss": 1.6466758251190186,
"step": 40
},
{
"epoch": 0.061313868613138686,
"grad_norm": 95.5,
"learning_rate": 2.9710144927536235e-06,
"loss": 1.9993230104446411,
"step": 42
},
{
"epoch": 0.06423357664233577,
"grad_norm": 3.953125,
"learning_rate": 3.1159420289855073e-06,
"loss": 1.7203528881072998,
"step": 44
},
{
"epoch": 0.06715328467153285,
"grad_norm": 13.5625,
"learning_rate": 3.2608695652173914e-06,
"loss": 2.5018796920776367,
"step": 46
},
{
"epoch": 0.07007299270072993,
"grad_norm": 12.6875,
"learning_rate": 3.4057971014492756e-06,
"loss": 1.935620903968811,
"step": 48
},
{
"epoch": 0.072992700729927,
"grad_norm": 4.125,
"learning_rate": 3.55072463768116e-06,
"loss": 1.9458433389663696,
"step": 50
},
{
"epoch": 0.07591240875912408,
"grad_norm": 2.171875,
"learning_rate": 3.6956521739130436e-06,
"loss": 1.321602702140808,
"step": 52
},
{
"epoch": 0.07883211678832117,
"grad_norm": 3.578125,
"learning_rate": 3.840579710144928e-06,
"loss": 2.0101318359375,
"step": 54
},
{
"epoch": 0.08175182481751825,
"grad_norm": 5.625,
"learning_rate": 3.9855072463768115e-06,
"loss": 2.0588250160217285,
"step": 56
},
{
"epoch": 0.08467153284671533,
"grad_norm": 5.3125,
"learning_rate": 4.130434782608696e-06,
"loss": 1.860298752784729,
"step": 58
},
{
"epoch": 0.08759124087591241,
"grad_norm": 5.9375,
"learning_rate": 4.27536231884058e-06,
"loss": 1.9684100151062012,
"step": 60
},
{
"epoch": 0.0905109489051095,
"grad_norm": 9.375,
"learning_rate": 4.4202898550724645e-06,
"loss": 1.980459213256836,
"step": 62
},
{
"epoch": 0.09343065693430656,
"grad_norm": 4.90625,
"learning_rate": 4.565217391304348e-06,
"loss": 1.8493075370788574,
"step": 64
},
{
"epoch": 0.09635036496350365,
"grad_norm": 2.609375,
"learning_rate": 4.710144927536232e-06,
"loss": 1.5537524223327637,
"step": 66
},
{
"epoch": 0.09927007299270073,
"grad_norm": 4.46875,
"learning_rate": 4.855072463768117e-06,
"loss": 1.8475682735443115,
"step": 68
},
{
"epoch": 0.10218978102189781,
"grad_norm": 3.734375,
"learning_rate": 5e-06,
"loss": 1.7411353588104248,
"step": 70
},
{
"epoch": 0.10510948905109489,
"grad_norm": 29.875,
"learning_rate": 4.999973760423467e-06,
"loss": 2.0845284461975098,
"step": 72
},
{
"epoch": 0.10802919708029197,
"grad_norm": 6.21875,
"learning_rate": 4.99989504230588e-06,
"loss": 1.5018064975738525,
"step": 74
},
{
"epoch": 0.11094890510948906,
"grad_norm": 2.21875,
"learning_rate": 4.999763847483267e-06,
"loss": 1.464540958404541,
"step": 76
},
{
"epoch": 0.11386861313868613,
"grad_norm": 4.53125,
"learning_rate": 4.999580179015625e-06,
"loss": 1.8232789039611816,
"step": 78
},
{
"epoch": 0.11678832116788321,
"grad_norm": 1.7578125,
"learning_rate": 4.999344041186848e-06,
"loss": 1.096325159072876,
"step": 80
},
{
"epoch": 0.11970802919708029,
"grad_norm": 3.328125,
"learning_rate": 4.999055439504633e-06,
"loss": 1.8037409782409668,
"step": 82
},
{
"epoch": 0.12262773722627737,
"grad_norm": 3.84375,
"learning_rate": 4.998714380700345e-06,
"loss": 1.5575973987579346,
"step": 84
},
{
"epoch": 0.12554744525547445,
"grad_norm": 4.1875,
"learning_rate": 4.998320872728862e-06,
"loss": 1.8613684177398682,
"step": 86
},
{
"epoch": 0.12846715328467154,
"grad_norm": 5.15625,
"learning_rate": 4.9978749247683895e-06,
"loss": 1.732508897781372,
"step": 88
},
{
"epoch": 0.13138686131386862,
"grad_norm": 2.59375,
"learning_rate": 4.99737654722025e-06,
"loss": 1.3435773849487305,
"step": 90
},
{
"epoch": 0.1343065693430657,
"grad_norm": 3.25,
"learning_rate": 4.996825751708635e-06,
"loss": 1.7478176355361938,
"step": 92
},
{
"epoch": 0.13722627737226278,
"grad_norm": 2.03125,
"learning_rate": 4.996222551080337e-06,
"loss": 1.4358994960784912,
"step": 94
},
{
"epoch": 0.14014598540145987,
"grad_norm": 5.4375,
"learning_rate": 4.9955669594044466e-06,
"loss": 1.870757818222046,
"step": 96
},
{
"epoch": 0.14306569343065692,
"grad_norm": 3.671875,
"learning_rate": 4.994858991972031e-06,
"loss": 1.6408865451812744,
"step": 98
},
{
"epoch": 0.145985401459854,
"grad_norm": 3.375,
"learning_rate": 4.994098665295768e-06,
"loss": 1.4728097915649414,
"step": 100
},
{
"epoch": 0.14890510948905109,
"grad_norm": 7.4375,
"learning_rate": 4.9932859971095705e-06,
"loss": 1.7583755254745483,
"step": 102
},
{
"epoch": 0.15182481751824817,
"grad_norm": 3.25,
"learning_rate": 4.992421006368166e-06,
"loss": 1.6836040019989014,
"step": 104
},
{
"epoch": 0.15474452554744525,
"grad_norm": 26.25,
"learning_rate": 4.991503713246659e-06,
"loss": 1.9515830278396606,
"step": 106
},
{
"epoch": 0.15766423357664233,
"grad_norm": 62.25,
"learning_rate": 4.990534139140055e-06,
"loss": 2.0257816314697266,
"step": 108
},
{
"epoch": 0.16058394160583941,
"grad_norm": 2.640625,
"learning_rate": 4.989512306662767e-06,
"loss": 1.4182727336883545,
"step": 110
},
{
"epoch": 0.1635036496350365,
"grad_norm": 6.6875,
"learning_rate": 4.988438239648084e-06,
"loss": 1.70530366897583,
"step": 112
},
{
"epoch": 0.16642335766423358,
"grad_norm": 3.5625,
"learning_rate": 4.98731196314762e-06,
"loss": 1.5088133811950684,
"step": 114
},
{
"epoch": 0.16934306569343066,
"grad_norm": 3.078125,
"learning_rate": 4.986133503430724e-06,
"loss": 1.6265062093734741,
"step": 116
},
{
"epoch": 0.17226277372262774,
"grad_norm": 6.4375,
"learning_rate": 4.98490288798387e-06,
"loss": 1.402962327003479,
"step": 118
},
{
"epoch": 0.17518248175182483,
"grad_norm": 4.125,
"learning_rate": 4.983620145510017e-06,
"loss": 1.8057794570922852,
"step": 120
},
{
"epoch": 0.1781021897810219,
"grad_norm": 6.875,
"learning_rate": 4.982285305927937e-06,
"loss": 1.9605462551116943,
"step": 122
},
{
"epoch": 0.181021897810219,
"grad_norm": 3.625,
"learning_rate": 4.980898400371521e-06,
"loss": 1.8519611358642578,
"step": 124
},
{
"epoch": 0.18394160583941604,
"grad_norm": 10.0625,
"learning_rate": 4.9794594611890465e-06,
"loss": 1.6692755222320557,
"step": 126
},
{
"epoch": 0.18686131386861313,
"grad_norm": 6.1875,
"learning_rate": 4.977968521942429e-06,
"loss": 1.8997008800506592,
"step": 128
},
{
"epoch": 0.1897810218978102,
"grad_norm": 1.8515625,
"learning_rate": 4.97642561740644e-06,
"loss": 1.8168402910232544,
"step": 130
},
{
"epoch": 0.1927007299270073,
"grad_norm": 16.375,
"learning_rate": 4.974830783567886e-06,
"loss": 1.4727129936218262,
"step": 132
},
{
"epoch": 0.19562043795620437,
"grad_norm": 7.71875,
"learning_rate": 4.973184057624781e-06,
"loss": 1.6138420104980469,
"step": 134
},
{
"epoch": 0.19854014598540146,
"grad_norm": 3.5,
"learning_rate": 4.971485477985474e-06,
"loss": 1.6893023252487183,
"step": 136
},
{
"epoch": 0.20145985401459854,
"grad_norm": 1.421875,
"learning_rate": 4.969735084267752e-06,
"loss": 1.3670828342437744,
"step": 138
},
{
"epoch": 0.20437956204379562,
"grad_norm": 8.4375,
"learning_rate": 4.967932917297915e-06,
"loss": 1.6938685178756714,
"step": 140
},
{
"epoch": 0.2072992700729927,
"grad_norm": 4.0625,
"learning_rate": 4.966079019109831e-06,
"loss": 2.2959558963775635,
"step": 142
},
{
"epoch": 0.21021897810218979,
"grad_norm": 3.328125,
"learning_rate": 4.964173432943946e-06,
"loss": 1.6218578815460205,
"step": 144
},
{
"epoch": 0.21313868613138687,
"grad_norm": 9.0625,
"learning_rate": 4.962216203246281e-06,
"loss": 2.592639446258545,
"step": 146
},
{
"epoch": 0.21605839416058395,
"grad_norm": 3.3125,
"learning_rate": 4.960207375667396e-06,
"loss": 1.5585392713546753,
"step": 148
},
{
"epoch": 0.21897810218978103,
"grad_norm": 3.96875,
"learning_rate": 4.958146997061319e-06,
"loss": 1.6422696113586426,
"step": 150
},
{
"epoch": 0.22189781021897811,
"grad_norm": 5.59375,
"learning_rate": 4.956035115484465e-06,
"loss": 1.7883186340332031,
"step": 152
},
{
"epoch": 0.22481751824817517,
"grad_norm": 2.140625,
"learning_rate": 4.953871780194501e-06,
"loss": 1.657930612564087,
"step": 154
},
{
"epoch": 0.22773722627737225,
"grad_norm": 24.125,
"learning_rate": 4.951657041649206e-06,
"loss": 1.7987116575241089,
"step": 156
},
{
"epoch": 0.23065693430656933,
"grad_norm": 12.0,
"learning_rate": 4.9493909515052944e-06,
"loss": 2.016146659851074,
"step": 158
},
{
"epoch": 0.23357664233576642,
"grad_norm": 7.90625,
"learning_rate": 4.947073562617206e-06,
"loss": 1.3612116575241089,
"step": 160
},
{
"epoch": 0.2364963503649635,
"grad_norm": 3.8125,
"learning_rate": 4.944704929035877e-06,
"loss": 1.7367652654647827,
"step": 162
},
{
"epoch": 0.23941605839416058,
"grad_norm": 2.875,
"learning_rate": 4.942285106007477e-06,
"loss": 1.3203725814819336,
"step": 164
},
{
"epoch": 0.24233576642335766,
"grad_norm": 11.25,
"learning_rate": 4.9398141499721246e-06,
"loss": 1.7288057804107666,
"step": 166
},
{
"epoch": 0.24525547445255474,
"grad_norm": 1.5625,
"learning_rate": 4.937292118562566e-06,
"loss": 1.383696436882019,
"step": 168
},
{
"epoch": 0.24817518248175183,
"grad_norm": 12.5625,
"learning_rate": 4.934719070602833e-06,
"loss": 1.6433072090148926,
"step": 170
},
{
"epoch": 0.2510948905109489,
"grad_norm": 3.109375,
"learning_rate": 4.932095066106872e-06,
"loss": 1.4025721549987793,
"step": 172
},
{
"epoch": 0.25401459854014596,
"grad_norm": 4.1875,
"learning_rate": 4.929420166277141e-06,
"loss": 1.6988599300384521,
"step": 174
},
{
"epoch": 0.2569343065693431,
"grad_norm": 3.65625,
"learning_rate": 4.926694433503186e-06,
"loss": 1.6042873859405518,
"step": 176
},
{
"epoch": 0.25985401459854013,
"grad_norm": 1.6484375,
"learning_rate": 4.923917931360185e-06,
"loss": 1.2862474918365479,
"step": 178
},
{
"epoch": 0.26277372262773724,
"grad_norm": 6.65625,
"learning_rate": 4.9210907246074615e-06,
"loss": 1.7310783863067627,
"step": 180
},
{
"epoch": 0.2656934306569343,
"grad_norm": 4.5625,
"learning_rate": 4.9182128791869796e-06,
"loss": 1.5482988357543945,
"step": 182
},
{
"epoch": 0.2686131386861314,
"grad_norm": 1.5078125,
"learning_rate": 4.9152844622218e-06,
"loss": 1.2439241409301758,
"step": 184
},
{
"epoch": 0.27153284671532846,
"grad_norm": 4.3125,
"learning_rate": 4.91230554201452e-06,
"loss": 1.5766255855560303,
"step": 186
},
{
"epoch": 0.27445255474452557,
"grad_norm": 3.90625,
"learning_rate": 4.9092761880456764e-06,
"loss": 1.311848759651184,
"step": 188
},
{
"epoch": 0.2773722627737226,
"grad_norm": 39.75,
"learning_rate": 4.906196470972128e-06,
"loss": 1.5088813304901123,
"step": 190
},
{
"epoch": 0.28029197080291973,
"grad_norm": 6.40625,
"learning_rate": 4.903066462625405e-06,
"loss": 1.6081913709640503,
"step": 192
},
{
"epoch": 0.2832116788321168,
"grad_norm": 6.125,
"learning_rate": 4.899886236010036e-06,
"loss": 1.7471773624420166,
"step": 194
},
{
"epoch": 0.28613138686131384,
"grad_norm": 4.09375,
"learning_rate": 4.896655865301842e-06,
"loss": 1.6127898693084717,
"step": 196
},
{
"epoch": 0.28905109489051095,
"grad_norm": 3.1875,
"learning_rate": 4.893375425846209e-06,
"loss": 1.6075236797332764,
"step": 198
},
{
"epoch": 0.291970802919708,
"grad_norm": 3.53125,
"learning_rate": 4.890044994156331e-06,
"loss": 1.712640643119812,
"step": 200
},
{
"epoch": 0.2948905109489051,
"grad_norm": 3.84375,
"learning_rate": 4.886664647911422e-06,
"loss": 1.5669183731079102,
"step": 202
},
{
"epoch": 0.29781021897810217,
"grad_norm": 5.6875,
"learning_rate": 4.883234465954909e-06,
"loss": 1.7576971054077148,
"step": 204
},
{
"epoch": 0.3007299270072993,
"grad_norm": 2.515625,
"learning_rate": 4.879754528292588e-06,
"loss": 1.5543663501739502,
"step": 206
},
{
"epoch": 0.30364963503649633,
"grad_norm": 2.921875,
"learning_rate": 4.876224916090762e-06,
"loss": 1.9160549640655518,
"step": 208
},
{
"epoch": 0.30656934306569344,
"grad_norm": 4.34375,
"learning_rate": 4.872645711674348e-06,
"loss": 1.646159291267395,
"step": 210
},
{
"epoch": 0.3094890510948905,
"grad_norm": 1.625,
"learning_rate": 4.8690169985249516e-06,
"loss": 1.1048507690429688,
"step": 212
},
{
"epoch": 0.3124087591240876,
"grad_norm": 1.5625,
"learning_rate": 4.865338861278925e-06,
"loss": 1.0736052989959717,
"step": 214
},
{
"epoch": 0.31532846715328466,
"grad_norm": 3.59375,
"learning_rate": 4.8616113857253925e-06,
"loss": 1.2035229206085205,
"step": 216
},
{
"epoch": 0.3182481751824818,
"grad_norm": 23.625,
"learning_rate": 4.857834658804247e-06,
"loss": 1.137906789779663,
"step": 218
},
{
"epoch": 0.32116788321167883,
"grad_norm": 4.5625,
"learning_rate": 4.8540087686041234e-06,
"loss": 1.7008376121520996,
"step": 220
},
{
"epoch": 0.32408759124087594,
"grad_norm": 8.75,
"learning_rate": 4.850133804360346e-06,
"loss": 1.6337850093841553,
"step": 222
},
{
"epoch": 0.327007299270073,
"grad_norm": 3.984375,
"learning_rate": 4.8462098564528455e-06,
"loss": 1.1808865070343018,
"step": 224
},
{
"epoch": 0.32992700729927005,
"grad_norm": 3.59375,
"learning_rate": 4.842237016404048e-06,
"loss": 1.5622849464416504,
"step": 226
},
{
"epoch": 0.33284671532846716,
"grad_norm": 1.1875,
"learning_rate": 4.838215376876744e-06,
"loss": 1.1768817901611328,
"step": 228
},
{
"epoch": 0.3357664233576642,
"grad_norm": 6.0,
"learning_rate": 4.834145031671931e-06,
"loss": 1.3726277351379395,
"step": 230
},
{
"epoch": 0.3386861313868613,
"grad_norm": 28.375,
"learning_rate": 4.830026075726615e-06,
"loss": 1.1469438076019287,
"step": 232
},
{
"epoch": 0.3416058394160584,
"grad_norm": 3.421875,
"learning_rate": 4.8258586051116045e-06,
"loss": 1.5012977123260498,
"step": 234
},
{
"epoch": 0.3445255474452555,
"grad_norm": 12.9375,
"learning_rate": 4.821642717029269e-06,
"loss": 1.6817822456359863,
"step": 236
},
{
"epoch": 0.34744525547445254,
"grad_norm": 5.0625,
"learning_rate": 4.8173785098112675e-06,
"loss": 1.525681495666504,
"step": 238
},
{
"epoch": 0.35036496350364965,
"grad_norm": 15.4375,
"learning_rate": 4.81306608291626e-06,
"loss": 2.0758631229400635,
"step": 240
},
{
"epoch": 0.3532846715328467,
"grad_norm": 3.25,
"learning_rate": 4.808705536927586e-06,
"loss": 1.4310352802276611,
"step": 242
},
{
"epoch": 0.3562043795620438,
"grad_norm": 3.28125,
"learning_rate": 4.804296973550915e-06,
"loss": 1.6908133029937744,
"step": 244
},
{
"epoch": 0.35912408759124087,
"grad_norm": 3.15625,
"learning_rate": 4.799840495611879e-06,
"loss": 1.2480230331420898,
"step": 246
},
{
"epoch": 0.362043795620438,
"grad_norm": 2.75,
"learning_rate": 4.795336207053674e-06,
"loss": 1.5943894386291504,
"step": 248
},
{
"epoch": 0.36496350364963503,
"grad_norm": 3.953125,
"learning_rate": 4.790784212934631e-06,
"loss": 1.1932544708251953,
"step": 250
},
{
"epoch": 0.3678832116788321,
"grad_norm": 5.53125,
"learning_rate": 4.786184619425773e-06,
"loss": 1.4538475275039673,
"step": 252
},
{
"epoch": 0.3708029197080292,
"grad_norm": 6.1875,
"learning_rate": 4.781537533808331e-06,
"loss": 1.7138783931732178,
"step": 254
},
{
"epoch": 0.37372262773722625,
"grad_norm": 1.609375,
"learning_rate": 4.7768430644712435e-06,
"loss": 1.37872314453125,
"step": 256
},
{
"epoch": 0.37664233576642336,
"grad_norm": 6.25,
"learning_rate": 4.772101320908636e-06,
"loss": 1.4937684535980225,
"step": 258
},
{
"epoch": 0.3795620437956204,
"grad_norm": 8.8125,
"learning_rate": 4.767312413717256e-06,
"loss": 1.4460338354110718,
"step": 260
},
{
"epoch": 0.38248175182481753,
"grad_norm": 4.28125,
"learning_rate": 4.7624764545939015e-06,
"loss": 1.4206737279891968,
"step": 262
},
{
"epoch": 0.3854014598540146,
"grad_norm": 2.671875,
"learning_rate": 4.757593556332811e-06,
"loss": 1.3555597066879272,
"step": 264
},
{
"epoch": 0.3883211678832117,
"grad_norm": 3.1875,
"learning_rate": 4.752663832823038e-06,
"loss": 1.6055470705032349,
"step": 266
},
{
"epoch": 0.39124087591240875,
"grad_norm": 4.09375,
"learning_rate": 4.747687399045787e-06,
"loss": 1.3127577304840088,
"step": 268
},
{
"epoch": 0.39416058394160586,
"grad_norm": 5.40625,
"learning_rate": 4.7426643710717386e-06,
"loss": 1.6612601280212402,
"step": 270
},
{
"epoch": 0.3970802919708029,
"grad_norm": 5.34375,
"learning_rate": 4.737594866058339e-06,
"loss": 1.2799599170684814,
"step": 272
},
{
"epoch": 0.4,
"grad_norm": 6.71875,
"learning_rate": 4.7324790022470675e-06,
"loss": 1.9163275957107544,
"step": 274
},
{
"epoch": 0.4029197080291971,
"grad_norm": 2.328125,
"learning_rate": 4.727316898960681e-06,
"loss": 1.4439561367034912,
"step": 276
},
{
"epoch": 0.4058394160583942,
"grad_norm": 10.6875,
"learning_rate": 4.722108676600427e-06,
"loss": 1.2920876741409302,
"step": 278
},
{
"epoch": 0.40875912408759124,
"grad_norm": 3.671875,
"learning_rate": 4.7168544566432365e-06,
"loss": 1.691207766532898,
"step": 280
},
{
"epoch": 0.4116788321167883,
"grad_norm": 3.21875,
"learning_rate": 4.711554361638896e-06,
"loss": 1.527019739151001,
"step": 282
},
{
"epoch": 0.4145985401459854,
"grad_norm": 3.1875,
"learning_rate": 4.70620851520718e-06,
"loss": 1.4309567213058472,
"step": 284
},
{
"epoch": 0.41751824817518246,
"grad_norm": 2.390625,
"learning_rate": 4.7008170420349746e-06,
"loss": 1.2672343254089355,
"step": 286
},
{
"epoch": 0.42043795620437957,
"grad_norm": 1.765625,
"learning_rate": 4.695380067873368e-06,
"loss": 1.3927721977233887,
"step": 288
},
{
"epoch": 0.4233576642335766,
"grad_norm": 2.75,
"learning_rate": 4.689897719534715e-06,
"loss": 1.5347919464111328,
"step": 290
},
{
"epoch": 0.42627737226277373,
"grad_norm": 4.5625,
"learning_rate": 4.68437012488968e-06,
"loss": 1.2839910984039307,
"step": 292
},
{
"epoch": 0.4291970802919708,
"grad_norm": 48.25,
"learning_rate": 4.678797412864258e-06,
"loss": 1.3073639869689941,
"step": 294
},
{
"epoch": 0.4321167883211679,
"grad_norm": 4.1875,
"learning_rate": 4.673179713436762e-06,
"loss": 1.5608128309249878,
"step": 296
},
{
"epoch": 0.43503649635036495,
"grad_norm": 2.875,
"learning_rate": 4.667517157634797e-06,
"loss": 1.6924610137939453,
"step": 298
},
{
"epoch": 0.43795620437956206,
"grad_norm": 3.515625,
"learning_rate": 4.6618098775322e-06,
"loss": 1.218139886856079,
"step": 300
},
{
"epoch": 0.4408759124087591,
"grad_norm": 5.34375,
"learning_rate": 4.656058006245959e-06,
"loss": 1.4968738555908203,
"step": 302
},
{
"epoch": 0.44379562043795623,
"grad_norm": 6.59375,
"learning_rate": 4.650261677933111e-06,
"loss": 1.522092580795288,
"step": 304
},
{
"epoch": 0.4467153284671533,
"grad_norm": 3.109375,
"learning_rate": 4.644421027787614e-06,
"loss": 1.15757155418396,
"step": 306
},
{
"epoch": 0.44963503649635034,
"grad_norm": 2.5,
"learning_rate": 4.638536192037186e-06,
"loss": 1.0606379508972168,
"step": 308
},
{
"epoch": 0.45255474452554745,
"grad_norm": 10.375,
"learning_rate": 4.63260730794014e-06,
"loss": 1.674492597579956,
"step": 310
},
{
"epoch": 0.4554744525547445,
"grad_norm": 3.421875,
"learning_rate": 4.62663451378217e-06,
"loss": 1.4489834308624268,
"step": 312
},
{
"epoch": 0.4583941605839416,
"grad_norm": 1.6640625,
"learning_rate": 4.620617948873133e-06,
"loss": 1.4036529064178467,
"step": 314
},
{
"epoch": 0.46131386861313867,
"grad_norm": 4.21875,
"learning_rate": 4.6145577535438004e-06,
"loss": 1.482384204864502,
"step": 316
},
{
"epoch": 0.4642335766423358,
"grad_norm": 2.8125,
"learning_rate": 4.608454069142578e-06,
"loss": 1.4590518474578857,
"step": 318
},
{
"epoch": 0.46715328467153283,
"grad_norm": 4.53125,
"learning_rate": 4.602307038032216e-06,
"loss": 1.7169837951660156,
"step": 320
},
{
"epoch": 0.47007299270072994,
"grad_norm": 4.75,
"learning_rate": 4.596116803586487e-06,
"loss": 1.5060232877731323,
"step": 322
},
{
"epoch": 0.472992700729927,
"grad_norm": 2.828125,
"learning_rate": 4.5898835101868415e-06,
"loss": 1.4886112213134766,
"step": 324
},
{
"epoch": 0.4759124087591241,
"grad_norm": 1.7265625,
"learning_rate": 4.583607303219037e-06,
"loss": 1.4076815843582153,
"step": 326
},
{
"epoch": 0.47883211678832116,
"grad_norm": 10.4375,
"learning_rate": 4.577288329069753e-06,
"loss": 1.5618150234222412,
"step": 328
},
{
"epoch": 0.48175182481751827,
"grad_norm": 4.75,
"learning_rate": 4.570926735123171e-06,
"loss": 1.274332046508789,
"step": 330
},
{
"epoch": 0.4846715328467153,
"grad_norm": 4.4375,
"learning_rate": 4.564522669757543e-06,
"loss": 1.4747687578201294,
"step": 332
},
{
"epoch": 0.48759124087591244,
"grad_norm": 6.40625,
"learning_rate": 4.558076282341723e-06,
"loss": 1.653844952583313,
"step": 334
},
{
"epoch": 0.4905109489051095,
"grad_norm": 39.5,
"learning_rate": 4.551587723231692e-06,
"loss": 1.0735116004943848,
"step": 336
},
{
"epoch": 0.49343065693430654,
"grad_norm": 36.0,
"learning_rate": 4.545057143767042e-06,
"loss": 1.6714699268341064,
"step": 338
},
{
"epoch": 0.49635036496350365,
"grad_norm": 4.15625,
"learning_rate": 4.538484696267453e-06,
"loss": 1.4629170894622803,
"step": 340
},
{
"epoch": 0.4992700729927007,
"grad_norm": 10.3125,
"learning_rate": 4.5318705340291394e-06,
"loss": 1.5702762603759766,
"step": 342
},
{
"epoch": 0.5021897810218978,
"grad_norm": 4.96875,
"learning_rate": 4.525214811321269e-06,
"loss": 1.5001425743103027,
"step": 344
},
{
"epoch": 0.5051094890510949,
"grad_norm": 5.0625,
"learning_rate": 4.518517683382373e-06,
"loss": 1.4789342880249023,
"step": 346
},
{
"epoch": 0.5080291970802919,
"grad_norm": 4.15625,
"learning_rate": 4.511779306416716e-06,
"loss": 1.4476077556610107,
"step": 348
},
{
"epoch": 0.5109489051094891,
"grad_norm": 1.703125,
"learning_rate": 4.504999837590665e-06,
"loss": 1.1996196508407593,
"step": 350
},
{
"epoch": 0.5138686131386861,
"grad_norm": 4.1875,
"learning_rate": 4.49817943502901e-06,
"loss": 1.532009482383728,
"step": 352
},
{
"epoch": 0.5167883211678832,
"grad_norm": 1.65625,
"learning_rate": 4.4913182578112815e-06,
"loss": 1.2889015674591064,
"step": 354
},
{
"epoch": 0.5197080291970803,
"grad_norm": 1.640625,
"learning_rate": 4.484416465968049e-06,
"loss": 1.3533192873001099,
"step": 356
},
{
"epoch": 0.5226277372262774,
"grad_norm": 4.3125,
"learning_rate": 4.477474220477172e-06,
"loss": 1.4686871767044067,
"step": 358
},
{
"epoch": 0.5255474452554745,
"grad_norm": 3.78125,
"learning_rate": 4.470491683260056e-06,
"loss": 1.4659610986709595,
"step": 360
},
{
"epoch": 0.5284671532846715,
"grad_norm": 2.46875,
"learning_rate": 4.463469017177876e-06,
"loss": 1.487034797668457,
"step": 362
},
{
"epoch": 0.5313868613138686,
"grad_norm": 3.3125,
"learning_rate": 4.456406386027772e-06,
"loss": 1.1844420433044434,
"step": 364
},
{
"epoch": 0.5343065693430656,
"grad_norm": 7.34375,
"learning_rate": 4.4493039545390345e-06,
"loss": 1.5557405948638916,
"step": 366
},
{
"epoch": 0.5372262773722628,
"grad_norm": 2.984375,
"learning_rate": 4.442161888369258e-06,
"loss": 1.3480842113494873,
"step": 368
},
{
"epoch": 0.5401459854014599,
"grad_norm": 2.90625,
"learning_rate": 4.43498035410048e-06,
"loss": 1.2928515672683716,
"step": 370
},
{
"epoch": 0.5430656934306569,
"grad_norm": 7.3125,
"learning_rate": 4.427759519235294e-06,
"loss": 1.7453609704971313,
"step": 372
},
{
"epoch": 0.545985401459854,
"grad_norm": 2.640625,
"learning_rate": 4.420499552192944e-06,
"loss": 1.4482967853546143,
"step": 374
},
{
"epoch": 0.5489051094890511,
"grad_norm": 2.0,
"learning_rate": 4.413200622305395e-06,
"loss": 1.6135839223861694,
"step": 376
},
{
"epoch": 0.5518248175182482,
"grad_norm": 13.9375,
"learning_rate": 4.405862899813384e-06,
"loss": 1.570212483406067,
"step": 378
},
{
"epoch": 0.5547445255474452,
"grad_norm": 1.3671875,
"learning_rate": 4.398486555862451e-06,
"loss": 1.298504114151001,
"step": 380
},
{
"epoch": 0.5576642335766423,
"grad_norm": 7.8125,
"learning_rate": 4.391071762498941e-06,
"loss": 1.4520879983901978,
"step": 382
},
{
"epoch": 0.5605839416058395,
"grad_norm": 14.8125,
"learning_rate": 4.383618692666002e-06,
"loss": 1.3408211469650269,
"step": 384
},
{
"epoch": 0.5635036496350365,
"grad_norm": 3.375,
"learning_rate": 4.376127520199541e-06,
"loss": 1.4031929969787598,
"step": 386
},
{
"epoch": 0.5664233576642336,
"grad_norm": 4.03125,
"learning_rate": 4.3685984198241735e-06,
"loss": 1.5412940979003906,
"step": 388
},
{
"epoch": 0.5693430656934306,
"grad_norm": 6.78125,
"learning_rate": 4.361031567149149e-06,
"loss": 1.3730320930480957,
"step": 390
},
{
"epoch": 0.5722627737226277,
"grad_norm": 7.28125,
"learning_rate": 4.353427138664254e-06,
"loss": 1.3442788124084473,
"step": 392
},
{
"epoch": 0.5751824817518248,
"grad_norm": 6.90625,
"learning_rate": 4.345785311735698e-06,
"loss": 1.4140475988388062,
"step": 394
},
{
"epoch": 0.5781021897810219,
"grad_norm": 6.25,
"learning_rate": 4.3381062646019676e-06,
"loss": 1.5376839637756348,
"step": 396
},
{
"epoch": 0.581021897810219,
"grad_norm": 4.25,
"learning_rate": 4.330390176369685e-06,
"loss": 1.5938429832458496,
"step": 398
},
{
"epoch": 0.583941605839416,
"grad_norm": 1.546875,
"learning_rate": 4.322637227009414e-06,
"loss": 1.1486091613769531,
"step": 400
},
{
"epoch": 0.5868613138686132,
"grad_norm": 3.578125,
"learning_rate": 4.314847597351475e-06,
"loss": 1.452984094619751,
"step": 402
},
{
"epoch": 0.5897810218978102,
"grad_norm": 3.953125,
"learning_rate": 4.3070214690817195e-06,
"loss": 1.4647376537322998,
"step": 404
},
{
"epoch": 0.5927007299270073,
"grad_norm": 2.203125,
"learning_rate": 4.299159024737295e-06,
"loss": 1.2110595703125,
"step": 406
},
{
"epoch": 0.5956204379562043,
"grad_norm": 4.1875,
"learning_rate": 4.291260447702389e-06,
"loss": 1.3485263586044312,
"step": 408
},
{
"epoch": 0.5985401459854015,
"grad_norm": 5.25,
"learning_rate": 4.283325922203949e-06,
"loss": 1.3334099054336548,
"step": 410
},
{
"epoch": 0.6014598540145986,
"grad_norm": 2.0625,
"learning_rate": 4.2753556333073875e-06,
"loss": 1.2992541790008545,
"step": 412
},
{
"epoch": 0.6043795620437956,
"grad_norm": 8.3125,
"learning_rate": 4.267349766912266e-06,
"loss": 1.3331689834594727,
"step": 414
},
{
"epoch": 0.6072992700729927,
"grad_norm": 3.71875,
"learning_rate": 4.259308509747955e-06,
"loss": 1.4391039609909058,
"step": 416
},
{
"epoch": 0.6102189781021898,
"grad_norm": 9.6875,
"learning_rate": 4.251232049369287e-06,
"loss": 1.145450472831726,
"step": 418
},
{
"epoch": 0.6131386861313869,
"grad_norm": 10.875,
"learning_rate": 4.243120574152169e-06,
"loss": 1.5916063785552979,
"step": 420
},
{
"epoch": 0.6160583941605839,
"grad_norm": 4.75,
"learning_rate": 4.234974273289204e-06,
"loss": 1.619133710861206,
"step": 422
},
{
"epoch": 0.618978102189781,
"grad_norm": 4.375,
"learning_rate": 4.226793336785265e-06,
"loss": 1.4133093357086182,
"step": 424
},
{
"epoch": 0.621897810218978,
"grad_norm": 6.03125,
"learning_rate": 4.218577955453074e-06,
"loss": 1.253399133682251,
"step": 426
},
{
"epoch": 0.6248175182481752,
"grad_norm": 4.6875,
"learning_rate": 4.210328320908744e-06,
"loss": 1.4635814428329468,
"step": 428
},
{
"epoch": 0.6277372262773723,
"grad_norm": 2.875,
"learning_rate": 4.20204462556731e-06,
"loss": 1.3652441501617432,
"step": 430
},
{
"epoch": 0.6306569343065693,
"grad_norm": 8.9375,
"learning_rate": 4.193727062638247e-06,
"loss": 1.5560953617095947,
"step": 432
},
{
"epoch": 0.6335766423357664,
"grad_norm": 3.53125,
"learning_rate": 4.18537582612096e-06,
"loss": 1.4227533340454102,
"step": 434
},
{
"epoch": 0.6364963503649635,
"grad_norm": 3.265625,
"learning_rate": 4.176991110800256e-06,
"loss": 1.2683900594711304,
"step": 436
},
{
"epoch": 0.6394160583941606,
"grad_norm": 14.1875,
"learning_rate": 4.168573112241805e-06,
"loss": 1.2102452516555786,
"step": 438
},
{
"epoch": 0.6423357664233577,
"grad_norm": 4.84375,
"learning_rate": 4.16012202678758e-06,
"loss": 1.2587625980377197,
"step": 440
},
{
"epoch": 0.6452554744525547,
"grad_norm": 5.46875,
"learning_rate": 4.1516380515512705e-06,
"loss": 1.410897970199585,
"step": 442
},
{
"epoch": 0.6481751824817519,
"grad_norm": 1.78125,
"learning_rate": 4.143121384413695e-06,
"loss": 1.4373693466186523,
"step": 444
},
{
"epoch": 0.6510948905109489,
"grad_norm": 2.78125,
"learning_rate": 4.134572224018176e-06,
"loss": 1.4430195093154907,
"step": 446
},
{
"epoch": 0.654014598540146,
"grad_norm": 7.90625,
"learning_rate": 4.125990769765911e-06,
"loss": 1.4238855838775635,
"step": 448
},
{
"epoch": 0.656934306569343,
"grad_norm": 2.25,
"learning_rate": 4.117377221811324e-06,
"loss": 1.4734668731689453,
"step": 450
},
{
"epoch": 0.6598540145985401,
"grad_norm": 2.734375,
"learning_rate": 4.108731781057393e-06,
"loss": 1.5210154056549072,
"step": 452
},
{
"epoch": 0.6627737226277373,
"grad_norm": 1.25,
"learning_rate": 4.100054649150967e-06,
"loss": 1.237725019454956,
"step": 454
},
{
"epoch": 0.6656934306569343,
"grad_norm": 3.953125,
"learning_rate": 4.091346028478059e-06,
"loss": 1.4640438556671143,
"step": 456
},
{
"epoch": 0.6686131386861314,
"grad_norm": 9.0,
"learning_rate": 4.0826061221591326e-06,
"loss": 1.105014681816101,
"step": 458
},
{
"epoch": 0.6715328467153284,
"grad_norm": 42.25,
"learning_rate": 4.073835134044356e-06,
"loss": 1.4338090419769287,
"step": 460
},
{
"epoch": 0.6744525547445256,
"grad_norm": 5.90625,
"learning_rate": 4.065033268708854e-06,
"loss": 1.3917622566223145,
"step": 462
},
{
"epoch": 0.6773722627737226,
"grad_norm": 3.359375,
"learning_rate": 4.056200731447929e-06,
"loss": 1.0591514110565186,
"step": 464
},
{
"epoch": 0.6802919708029197,
"grad_norm": 4.625,
"learning_rate": 4.0473377282722845e-06,
"loss": 1.4084625244140625,
"step": 466
},
{
"epoch": 0.6832116788321168,
"grad_norm": 3.734375,
"learning_rate": 4.038444465903208e-06,
"loss": 1.4596691131591797,
"step": 468
},
{
"epoch": 0.6861313868613139,
"grad_norm": 11.125,
"learning_rate": 4.029521151767757e-06,
"loss": 1.2422056198120117,
"step": 470
},
{
"epoch": 0.689051094890511,
"grad_norm": 4.4375,
"learning_rate": 4.0205679939939164e-06,
"loss": 1.33591628074646,
"step": 472
},
{
"epoch": 0.691970802919708,
"grad_norm": 2.21875,
"learning_rate": 4.011585201405747e-06,
"loss": 1.2504942417144775,
"step": 474
},
{
"epoch": 0.6948905109489051,
"grad_norm": 3.6875,
"learning_rate": 4.002572983518515e-06,
"loss": 1.2631410360336304,
"step": 476
},
{
"epoch": 0.6978102189781021,
"grad_norm": 5.8125,
"learning_rate": 3.993531550533804e-06,
"loss": 1.3914625644683838,
"step": 478
},
{
"epoch": 0.7007299270072993,
"grad_norm": 20.0,
"learning_rate": 3.98446111333461e-06,
"loss": 1.288975715637207,
"step": 480
},
{
"epoch": 0.7036496350364964,
"grad_norm": 3.234375,
"learning_rate": 3.9753618834804295e-06,
"loss": 1.4152731895446777,
"step": 482
},
{
"epoch": 0.7065693430656934,
"grad_norm": 5.71875,
"learning_rate": 3.966234073202316e-06,
"loss": 1.316530466079712,
"step": 484
},
{
"epoch": 0.7094890510948905,
"grad_norm": 56.5,
"learning_rate": 3.957077895397941e-06,
"loss": 1.3749709129333496,
"step": 486
},
{
"epoch": 0.7124087591240876,
"grad_norm": 1.734375,
"learning_rate": 3.947893563626615e-06,
"loss": 1.2120707035064697,
"step": 488
},
{
"epoch": 0.7153284671532847,
"grad_norm": 3.546875,
"learning_rate": 3.93868129210432e-06,
"loss": 1.4016718864440918,
"step": 490
},
{
"epoch": 0.7182481751824817,
"grad_norm": 8.8125,
"learning_rate": 3.929441295698702e-06,
"loss": 1.154693841934204,
"step": 492
},
{
"epoch": 0.7211678832116788,
"grad_norm": 3.640625,
"learning_rate": 3.920173789924065e-06,
"loss": 1.334530234336853,
"step": 494
},
{
"epoch": 0.724087591240876,
"grad_norm": 1.921875,
"learning_rate": 3.910878990936346e-06,
"loss": 1.3103371858596802,
"step": 496
},
{
"epoch": 0.727007299270073,
"grad_norm": 2.84375,
"learning_rate": 3.901557115528069e-06,
"loss": 1.244321584701538,
"step": 498
},
{
"epoch": 0.7299270072992701,
"grad_norm": 4.40625,
"learning_rate": 3.892208381123289e-06,
"loss": 1.4268873929977417,
"step": 500
},
{
"epoch": 0.7328467153284671,
"grad_norm": 1.4765625,
"learning_rate": 3.8828330057725225e-06,
"loss": 1.3552806377410889,
"step": 502
},
{
"epoch": 0.7357664233576642,
"grad_norm": 3.65625,
"learning_rate": 3.873431208147664e-06,
"loss": 1.6077991724014282,
"step": 504
},
{
"epoch": 0.7386861313868613,
"grad_norm": 2.21875,
"learning_rate": 3.864003207536879e-06,
"loss": 1.2244906425476074,
"step": 506
},
{
"epoch": 0.7416058394160584,
"grad_norm": 2.265625,
"learning_rate": 3.854549223839497e-06,
"loss": 1.0374276638031006,
"step": 508
},
{
"epoch": 0.7445255474452555,
"grad_norm": 6.96875,
"learning_rate": 3.845069477560876e-06,
"loss": 1.547581434249878,
"step": 510
},
{
"epoch": 0.7474452554744525,
"grad_norm": 2.203125,
"learning_rate": 3.835564189807263e-06,
"loss": 1.225568175315857,
"step": 512
},
{
"epoch": 0.7503649635036497,
"grad_norm": 4.09375,
"learning_rate": 3.826033582280635e-06,
"loss": 1.2825735807418823,
"step": 514
},
{
"epoch": 0.7532846715328467,
"grad_norm": 2.96875,
"learning_rate": 3.816477877273533e-06,
"loss": 1.430619716644287,
"step": 516
},
{
"epoch": 0.7562043795620438,
"grad_norm": 10.9375,
"learning_rate": 3.8068972976638703e-06,
"loss": 1.489488124847412,
"step": 518
},
{
"epoch": 0.7591240875912408,
"grad_norm": 4.3125,
"learning_rate": 3.797292066909734e-06,
"loss": 0.8555082082748413,
"step": 520
},
{
"epoch": 0.762043795620438,
"grad_norm": 3.703125,
"learning_rate": 3.787662409044184e-06,
"loss": 1.3753139972686768,
"step": 522
},
{
"epoch": 0.7649635036496351,
"grad_norm": 8.0,
"learning_rate": 3.7780085486700126e-06,
"loss": 1.6844412088394165,
"step": 524
},
{
"epoch": 0.7678832116788321,
"grad_norm": 5.25,
"learning_rate": 3.768330710954517e-06,
"loss": 1.592594027519226,
"step": 526
},
{
"epoch": 0.7708029197080292,
"grad_norm": 1.5,
"learning_rate": 3.7586291216242433e-06,
"loss": 1.2550559043884277,
"step": 528
},
{
"epoch": 0.7737226277372263,
"grad_norm": 3.953125,
"learning_rate": 3.748904006959719e-06,
"loss": 1.1512435674667358,
"step": 530
},
{
"epoch": 0.7766423357664234,
"grad_norm": 10.375,
"learning_rate": 3.739155593790182e-06,
"loss": 1.5256032943725586,
"step": 532
},
{
"epoch": 0.7795620437956204,
"grad_norm": 10.75,
"learning_rate": 3.729384109488282e-06,
"loss": 1.6810424327850342,
"step": 534
},
{
"epoch": 0.7824817518248175,
"grad_norm": 3.734375,
"learning_rate": 3.719589781964787e-06,
"loss": 1.4392688274383545,
"step": 536
},
{
"epoch": 0.7854014598540145,
"grad_norm": 4.125,
"learning_rate": 3.7097728396632555e-06,
"loss": 1.4172781705856323,
"step": 538
},
{
"epoch": 0.7883211678832117,
"grad_norm": 4.125,
"learning_rate": 3.6999335115547185e-06,
"loss": 1.401853322982788,
"step": 540
},
{
"epoch": 0.7912408759124088,
"grad_norm": 6.375,
"learning_rate": 3.690072027132335e-06,
"loss": 1.534106731414795,
"step": 542
},
{
"epoch": 0.7941605839416058,
"grad_norm": 5.0,
"learning_rate": 3.680188616406037e-06,
"loss": 1.629064679145813,
"step": 544
},
{
"epoch": 0.7970802919708029,
"grad_norm": 3.5625,
"learning_rate": 3.6702835098971706e-06,
"loss": 1.5794017314910889,
"step": 546
},
{
"epoch": 0.8,
"grad_norm": 7.90625,
"learning_rate": 3.6603569386331122e-06,
"loss": 1.556319236755371,
"step": 548
},
{
"epoch": 0.8029197080291971,
"grad_norm": 5.125,
"learning_rate": 3.6504091341418853e-06,
"loss": 1.5984359979629517,
"step": 550
},
{
"epoch": 0.8058394160583942,
"grad_norm": 4.5,
"learning_rate": 3.640440328446759e-06,
"loss": 1.5283421277999878,
"step": 552
},
{
"epoch": 0.8087591240875912,
"grad_norm": 6.75,
"learning_rate": 3.6304507540608357e-06,
"loss": 1.383811116218567,
"step": 554
},
{
"epoch": 0.8116788321167884,
"grad_norm": 3.640625,
"learning_rate": 3.620440643981629e-06,
"loss": 1.3146003484725952,
"step": 556
},
{
"epoch": 0.8145985401459854,
"grad_norm": 4.125,
"learning_rate": 3.6104102316856255e-06,
"loss": 1.4131672382354736,
"step": 558
},
{
"epoch": 0.8175182481751825,
"grad_norm": 13.25,
"learning_rate": 3.600359751122845e-06,
"loss": 1.549619197845459,
"step": 560
},
{
"epoch": 0.8204379562043795,
"grad_norm": 2.796875,
"learning_rate": 3.590289436711379e-06,
"loss": 1.5269279479980469,
"step": 562
},
{
"epoch": 0.8233576642335766,
"grad_norm": 3.046875,
"learning_rate": 3.5801995233319265e-06,
"loss": 1.3862372636795044,
"step": 564
},
{
"epoch": 0.8262773722627738,
"grad_norm": 2.484375,
"learning_rate": 3.5700902463223137e-06,
"loss": 1.2330877780914307,
"step": 566
},
{
"epoch": 0.8291970802919708,
"grad_norm": 7.125,
"learning_rate": 3.559961841472005e-06,
"loss": 1.4884552955627441,
"step": 568
},
{
"epoch": 0.8321167883211679,
"grad_norm": 3.28125,
"learning_rate": 3.5498145450166057e-06,
"loss": 1.3787778615951538,
"step": 570
},
{
"epoch": 0.8350364963503649,
"grad_norm": 3.609375,
"learning_rate": 3.5396485936323456e-06,
"loss": 1.3882396221160889,
"step": 572
},
{
"epoch": 0.8379562043795621,
"grad_norm": 3.15625,
"learning_rate": 3.529464224430568e-06,
"loss": 1.3656411170959473,
"step": 574
},
{
"epoch": 0.8408759124087591,
"grad_norm": 5.65625,
"learning_rate": 3.5192616749521942e-06,
"loss": 1.5140806436538696,
"step": 576
},
{
"epoch": 0.8437956204379562,
"grad_norm": 4.5,
"learning_rate": 3.5090411831621803e-06,
"loss": 1.5188113451004028,
"step": 578
},
{
"epoch": 0.8467153284671532,
"grad_norm": 2.671875,
"learning_rate": 3.498802987443974e-06,
"loss": 1.3665883541107178,
"step": 580
},
{
"epoch": 0.8496350364963504,
"grad_norm": 5.25,
"learning_rate": 3.4885473265939464e-06,
"loss": 1.383296012878418,
"step": 582
},
{
"epoch": 0.8525547445255475,
"grad_norm": 2.71875,
"learning_rate": 3.478274439815831e-06,
"loss": 1.2266430854797363,
"step": 584
},
{
"epoch": 0.8554744525547445,
"grad_norm": 3.9375,
"learning_rate": 3.467984566715137e-06,
"loss": 1.5247292518615723,
"step": 586
},
{
"epoch": 0.8583941605839416,
"grad_norm": 4.125,
"learning_rate": 3.4576779472935644e-06,
"loss": 1.4203873872756958,
"step": 588
},
{
"epoch": 0.8613138686131386,
"grad_norm": 2.46875,
"learning_rate": 3.447354821943407e-06,
"loss": 1.222019076347351,
"step": 590
},
{
"epoch": 0.8642335766423358,
"grad_norm": 4.8125,
"learning_rate": 3.4370154314419395e-06,
"loss": 1.2593979835510254,
"step": 592
},
{
"epoch": 0.8671532846715329,
"grad_norm": 3.21875,
"learning_rate": 3.4266600169458135e-06,
"loss": 1.22776460647583,
"step": 594
},
{
"epoch": 0.8700729927007299,
"grad_norm": 2.703125,
"learning_rate": 3.4162888199854182e-06,
"loss": 1.2717225551605225,
"step": 596
},
{
"epoch": 0.872992700729927,
"grad_norm": 1.2890625,
"learning_rate": 3.405902082459259e-06,
"loss": 1.0713449716567993,
"step": 598
},
{
"epoch": 0.8759124087591241,
"grad_norm": 3.453125,
"learning_rate": 3.3955000466283073e-06,
"loss": 1.2096487283706665,
"step": 600
},
{
"epoch": 0.8788321167883212,
"grad_norm": 2.03125,
"learning_rate": 3.385082955110355e-06,
"loss": 1.2699155807495117,
"step": 602
},
{
"epoch": 0.8817518248175182,
"grad_norm": 2.328125,
"learning_rate": 3.3746510508743533e-06,
"loss": 1.3786303997039795,
"step": 604
},
{
"epoch": 0.8846715328467153,
"grad_norm": 5.53125,
"learning_rate": 3.3642045772347453e-06,
"loss": 1.3685808181762695,
"step": 606
},
{
"epoch": 0.8875912408759125,
"grad_norm": 9.0625,
"learning_rate": 3.353743777845795e-06,
"loss": 1.178727626800537,
"step": 608
},
{
"epoch": 0.8905109489051095,
"grad_norm": 4.1875,
"learning_rate": 3.343268896695897e-06,
"loss": 1.383094310760498,
"step": 610
},
{
"epoch": 0.8934306569343066,
"grad_norm": 3.359375,
"learning_rate": 3.3327801781018925e-06,
"loss": 1.4056508541107178,
"step": 612
},
{
"epoch": 0.8963503649635036,
"grad_norm": 4.65625,
"learning_rate": 3.322277866703367e-06,
"loss": 1.5974513292312622,
"step": 614
},
{
"epoch": 0.8992700729927007,
"grad_norm": 1.1875,
"learning_rate": 3.3117622074569476e-06,
"loss": 1.1610685586929321,
"step": 616
},
{
"epoch": 0.9021897810218978,
"grad_norm": 10.75,
"learning_rate": 3.3012334456305846e-06,
"loss": 0.901719331741333,
"step": 618
},
{
"epoch": 0.9051094890510949,
"grad_norm": 8.3125,
"learning_rate": 3.2906918267978355e-06,
"loss": 1.2409268617630005,
"step": 620
},
{
"epoch": 0.908029197080292,
"grad_norm": 3.453125,
"learning_rate": 3.2801375968321355e-06,
"loss": 1.4349682331085205,
"step": 622
},
{
"epoch": 0.910948905109489,
"grad_norm": 6.875,
"learning_rate": 3.269571001901061e-06,
"loss": 1.3277549743652344,
"step": 624
},
{
"epoch": 0.9138686131386862,
"grad_norm": 5.1875,
"learning_rate": 3.2589922884605924e-06,
"loss": 1.3614181280136108,
"step": 626
},
{
"epoch": 0.9167883211678832,
"grad_norm": 9.125,
"learning_rate": 3.2484017032493615e-06,
"loss": 1.705947756767273,
"step": 628
},
{
"epoch": 0.9197080291970803,
"grad_norm": 4.0,
"learning_rate": 3.237799493282897e-06,
"loss": 1.3996449708938599,
"step": 630
},
{
"epoch": 0.9226277372262773,
"grad_norm": 2.75,
"learning_rate": 3.2271859058478666e-06,
"loss": 1.4013357162475586,
"step": 632
},
{
"epoch": 0.9255474452554745,
"grad_norm": 3.46875,
"learning_rate": 3.2165611884963055e-06,
"loss": 1.2193137407302856,
"step": 634
},
{
"epoch": 0.9284671532846716,
"grad_norm": 2.421875,
"learning_rate": 3.2059255890398445e-06,
"loss": 0.9855245351791382,
"step": 636
},
{
"epoch": 0.9313868613138686,
"grad_norm": 3.59375,
"learning_rate": 3.1952793555439276e-06,
"loss": 1.4272806644439697,
"step": 638
},
{
"epoch": 0.9343065693430657,
"grad_norm": 3.421875,
"learning_rate": 3.18462273632203e-06,
"loss": 1.1866121292114258,
"step": 640
},
{
"epoch": 0.9372262773722628,
"grad_norm": 6.84375,
"learning_rate": 3.173955979929863e-06,
"loss": 1.385930061340332,
"step": 642
},
{
"epoch": 0.9401459854014599,
"grad_norm": 1.8125,
"learning_rate": 3.163279335159578e-06,
"loss": 1.283376932144165,
"step": 644
},
{
"epoch": 0.9430656934306569,
"grad_norm": 5.0625,
"learning_rate": 3.152593051033966e-06,
"loss": 1.368044376373291,
"step": 646
},
{
"epoch": 0.945985401459854,
"grad_norm": 14.0625,
"learning_rate": 3.1418973768006424e-06,
"loss": 0.6849503517150879,
"step": 648
},
{
"epoch": 0.948905109489051,
"grad_norm": 2.140625,
"learning_rate": 3.1311925619262417e-06,
"loss": 1.3481240272521973,
"step": 650
},
{
"epoch": 0.9518248175182482,
"grad_norm": 3.234375,
"learning_rate": 3.1204788560905935e-06,
"loss": 1.390141248703003,
"step": 652
},
{
"epoch": 0.9547445255474453,
"grad_norm": 8.8125,
"learning_rate": 3.1097565091809033e-06,
"loss": 1.3187050819396973,
"step": 654
},
{
"epoch": 0.9576642335766423,
"grad_norm": 12.125,
"learning_rate": 3.0990257712859184e-06,
"loss": 1.3746651411056519,
"step": 656
},
{
"epoch": 0.9605839416058394,
"grad_norm": 7.09375,
"learning_rate": 3.0882868926901e-06,
"loss": 1.2352771759033203,
"step": 658
},
{
"epoch": 0.9635036496350365,
"grad_norm": 3.46875,
"learning_rate": 3.077540123867783e-06,
"loss": 1.328325629234314,
"step": 660
},
{
"epoch": 0.9664233576642336,
"grad_norm": 3.46875,
"learning_rate": 3.066785715477334e-06,
"loss": 1.2275207042694092,
"step": 662
},
{
"epoch": 0.9693430656934306,
"grad_norm": 2.4375,
"learning_rate": 3.056023918355307e-06,
"loss": 1.335202693939209,
"step": 664
},
{
"epoch": 0.9722627737226277,
"grad_norm": 6.5,
"learning_rate": 3.0452549835105895e-06,
"loss": 1.4829626083374023,
"step": 666
},
{
"epoch": 0.9751824817518249,
"grad_norm": 34.0,
"learning_rate": 3.03447916211855e-06,
"loss": 1.5850169658660889,
"step": 668
},
{
"epoch": 0.9781021897810219,
"grad_norm": 6.5,
"learning_rate": 3.0236967055151804e-06,
"loss": 1.671141266822815,
"step": 670
},
{
"epoch": 0.981021897810219,
"grad_norm": 23.125,
"learning_rate": 3.0129078651912317e-06,
"loss": 1.300727128982544,
"step": 672
},
{
"epoch": 0.983941605839416,
"grad_norm": 8.875,
"learning_rate": 3.00211289278635e-06,
"loss": 1.4001004695892334,
"step": 674
},
{
"epoch": 0.9868613138686131,
"grad_norm": 8.875,
"learning_rate": 2.991312040083206e-06,
"loss": 0.47176289558410645,
"step": 676
},
{
"epoch": 0.9897810218978103,
"grad_norm": 2.875,
"learning_rate": 2.9805055590016225e-06,
"loss": 1.2891722917556763,
"step": 678
},
{
"epoch": 0.9927007299270073,
"grad_norm": 4.1875,
"learning_rate": 2.9696937015926995e-06,
"loss": 1.365147352218628,
"step": 680
},
{
"epoch": 0.9956204379562044,
"grad_norm": 1.8828125,
"learning_rate": 2.9588767200329348e-06,
"loss": 1.2809860706329346,
"step": 682
},
{
"epoch": 0.9985401459854014,
"grad_norm": 8.25,
"learning_rate": 2.9480548666183427e-06,
"loss": 1.6904196739196777,
"step": 684
},
{
"epoch": 1.0014598540145985,
"grad_norm": 2.21875,
"learning_rate": 2.9372283937585675e-06,
"loss": 1.3279258012771606,
"step": 686
},
{
"epoch": 1.0043795620437956,
"grad_norm": 4.34375,
"learning_rate": 2.926397553970999e-06,
"loss": 1.277381181716919,
"step": 688
},
{
"epoch": 1.0072992700729928,
"grad_norm": 5.84375,
"learning_rate": 2.915562599874882e-06,
"loss": 1.500443935394287,
"step": 690
},
{
"epoch": 1.0102189781021897,
"grad_norm": 9.875,
"learning_rate": 2.904723784185422e-06,
"loss": 1.2994956970214844,
"step": 692
},
{
"epoch": 1.013138686131387,
"grad_norm": 10.6875,
"learning_rate": 2.893881359707894e-06,
"loss": 1.227457046508789,
"step": 694
},
{
"epoch": 1.0160583941605839,
"grad_norm": 2.984375,
"learning_rate": 2.883035579331744e-06,
"loss": 1.2923262119293213,
"step": 696
},
{
"epoch": 1.018978102189781,
"grad_norm": 4.0,
"learning_rate": 2.8721866960246912e-06,
"loss": 1.445424199104309,
"step": 698
},
{
"epoch": 1.0218978102189782,
"grad_norm": 2.1875,
"learning_rate": 2.861334962826828e-06,
"loss": 1.1312172412872314,
"step": 700
},
{
"epoch": 1.0248175182481751,
"grad_norm": 3.734375,
"learning_rate": 2.8504806328447177e-06,
"loss": 1.4891958236694336,
"step": 702
},
{
"epoch": 1.0277372262773723,
"grad_norm": 3.734375,
"learning_rate": 2.8396239592454914e-06,
"loss": 1.4066648483276367,
"step": 704
},
{
"epoch": 1.0306569343065692,
"grad_norm": 4.21875,
"learning_rate": 2.828765195250942e-06,
"loss": 1.4027667045593262,
"step": 706
},
{
"epoch": 1.0335766423357664,
"grad_norm": 3.828125,
"learning_rate": 2.8179045941316214e-06,
"loss": 1.3984425067901611,
"step": 708
},
{
"epoch": 1.0364963503649636,
"grad_norm": 37.25,
"learning_rate": 2.8070424092009264e-06,
"loss": 1.5881340503692627,
"step": 710
},
{
"epoch": 1.0394160583941605,
"grad_norm": 5.21875,
"learning_rate": 2.7961788938091994e-06,
"loss": 1.3652167320251465,
"step": 712
},
{
"epoch": 1.0423357664233577,
"grad_norm": 9.0,
"learning_rate": 2.785314301337811e-06,
"loss": 1.4395644664764404,
"step": 714
},
{
"epoch": 1.0452554744525548,
"grad_norm": 4.125,
"learning_rate": 2.7744488851932568e-06,
"loss": 1.3807083368301392,
"step": 716
},
{
"epoch": 1.0481751824817518,
"grad_norm": 16.625,
"learning_rate": 2.76358289880124e-06,
"loss": 1.2562787532806396,
"step": 718
},
{
"epoch": 1.051094890510949,
"grad_norm": 4.03125,
"learning_rate": 2.752716595600768e-06,
"loss": 1.2394318580627441,
"step": 720
},
{
"epoch": 1.054014598540146,
"grad_norm": 8.625,
"learning_rate": 2.7418502290382352e-06,
"loss": 1.1047321557998657,
"step": 722
},
{
"epoch": 1.056934306569343,
"grad_norm": 4.46875,
"learning_rate": 2.7309840525615146e-06,
"loss": 1.5514793395996094,
"step": 724
},
{
"epoch": 1.0598540145985402,
"grad_norm": 3.234375,
"learning_rate": 2.720118319614047e-06,
"loss": 1.2009215354919434,
"step": 726
},
{
"epoch": 1.0627737226277372,
"grad_norm": 2.65625,
"learning_rate": 2.709253283628924e-06,
"loss": 1.2573150396347046,
"step": 728
},
{
"epoch": 1.0656934306569343,
"grad_norm": 8.9375,
"learning_rate": 2.698389198022987e-06,
"loss": 1.624213457107544,
"step": 730
},
{
"epoch": 1.0686131386861315,
"grad_norm": 5.375,
"learning_rate": 2.6875263161909054e-06,
"loss": 1.3574187755584717,
"step": 732
},
{
"epoch": 1.0715328467153284,
"grad_norm": 7.4375,
"learning_rate": 2.676664891499275e-06,
"loss": 1.2222844362258911,
"step": 734
},
{
"epoch": 1.0744525547445256,
"grad_norm": 1.640625,
"learning_rate": 2.6658051772807046e-06,
"loss": 1.2617628574371338,
"step": 736
},
{
"epoch": 1.0773722627737226,
"grad_norm": 8.0,
"learning_rate": 2.6549474268279074e-06,
"loss": 1.3748055696487427,
"step": 738
},
{
"epoch": 1.0802919708029197,
"grad_norm": 8.5625,
"learning_rate": 2.644091893387793e-06,
"loss": 1.4741809368133545,
"step": 740
},
{
"epoch": 1.0832116788321169,
"grad_norm": 7.1875,
"learning_rate": 2.6332388301555615e-06,
"loss": 1.3683550357818604,
"step": 742
},
{
"epoch": 1.0861313868613138,
"grad_norm": 23.125,
"learning_rate": 2.622388490268799e-06,
"loss": 1.4302444458007812,
"step": 744
},
{
"epoch": 1.089051094890511,
"grad_norm": 2.875,
"learning_rate": 2.6115411268015716e-06,
"loss": 1.3794375658035278,
"step": 746
},
{
"epoch": 1.091970802919708,
"grad_norm": 3.5,
"learning_rate": 2.6006969927585214e-06,
"loss": 1.6521217823028564,
"step": 748
},
{
"epoch": 1.094890510948905,
"grad_norm": 4.09375,
"learning_rate": 2.589856341068969e-06,
"loss": 1.380043625831604,
"step": 750
},
{
"epoch": 1.0978102189781023,
"grad_norm": 2.84375,
"learning_rate": 2.5790194245810125e-06,
"loss": 1.2655432224273682,
"step": 752
},
{
"epoch": 1.1007299270072992,
"grad_norm": 8.6875,
"learning_rate": 2.568186496055628e-06,
"loss": 1.4429633617401123,
"step": 754
},
{
"epoch": 1.1036496350364964,
"grad_norm": 2.34375,
"learning_rate": 2.5573578081607793e-06,
"loss": 1.1212751865386963,
"step": 756
},
{
"epoch": 1.1065693430656935,
"grad_norm": 2.71875,
"learning_rate": 2.546533613465518e-06,
"loss": 0.9118128418922424,
"step": 758
},
{
"epoch": 1.1094890510948905,
"grad_norm": 2.9375,
"learning_rate": 2.5357141644340966e-06,
"loss": 1.3533203601837158,
"step": 760
},
{
"epoch": 1.1124087591240877,
"grad_norm": 5.625,
"learning_rate": 2.5248997134200833e-06,
"loss": 1.2528855800628662,
"step": 762
},
{
"epoch": 1.1153284671532846,
"grad_norm": 2.5,
"learning_rate": 2.5140905126604677e-06,
"loss": 1.244079351425171,
"step": 764
},
{
"epoch": 1.1182481751824818,
"grad_norm": 5.71875,
"learning_rate": 2.503286814269783e-06,
"loss": 1.3053560256958008,
"step": 766
},
{
"epoch": 1.121167883211679,
"grad_norm": 1.5546875,
"learning_rate": 2.4924888702342266e-06,
"loss": 1.2007651329040527,
"step": 768
},
{
"epoch": 1.1240875912408759,
"grad_norm": 5.5625,
"learning_rate": 2.481696932405779e-06,
"loss": 1.3610585927963257,
"step": 770
},
{
"epoch": 1.127007299270073,
"grad_norm": 2.59375,
"learning_rate": 2.4709112524963326e-06,
"loss": 1.3990166187286377,
"step": 772
},
{
"epoch": 1.12992700729927,
"grad_norm": 3.484375,
"learning_rate": 2.4601320820718196e-06,
"loss": 1.3095015287399292,
"step": 774
},
{
"epoch": 1.1328467153284671,
"grad_norm": 2.84375,
"learning_rate": 2.4493596725463435e-06,
"loss": 1.2231605052947998,
"step": 776
},
{
"epoch": 1.1357664233576643,
"grad_norm": 5.875,
"learning_rate": 2.438594275176318e-06,
"loss": 1.3952467441558838,
"step": 778
},
{
"epoch": 1.1386861313868613,
"grad_norm": 5.09375,
"learning_rate": 2.4278361410546027e-06,
"loss": 1.2288057804107666,
"step": 780
},
{
"epoch": 1.1416058394160584,
"grad_norm": 7.15625,
"learning_rate": 2.41708552110465e-06,
"loss": 1.46846342086792,
"step": 782
},
{
"epoch": 1.1445255474452556,
"grad_norm": 3.421875,
"learning_rate": 2.4063426660746517e-06,
"loss": 1.3782763481140137,
"step": 784
},
{
"epoch": 1.1474452554744525,
"grad_norm": 9.375,
"learning_rate": 2.3956078265316883e-06,
"loss": 1.2458666563034058,
"step": 786
},
{
"epoch": 1.1503649635036497,
"grad_norm": 3.59375,
"learning_rate": 2.3848812528558887e-06,
"loss": 1.2981244325637817,
"step": 788
},
{
"epoch": 1.1532846715328466,
"grad_norm": 5.96875,
"learning_rate": 2.374163195234586e-06,
"loss": 1.3579144477844238,
"step": 790
},
{
"epoch": 1.1562043795620438,
"grad_norm": 1.4765625,
"learning_rate": 2.3634539036564853e-06,
"loss": 1.2424495220184326,
"step": 792
},
{
"epoch": 1.159124087591241,
"grad_norm": 3.78125,
"learning_rate": 2.352753627905833e-06,
"loss": 1.6642348766326904,
"step": 794
},
{
"epoch": 1.162043795620438,
"grad_norm": 5.90625,
"learning_rate": 2.3420626175565877e-06,
"loss": 1.1931509971618652,
"step": 796
},
{
"epoch": 1.164963503649635,
"grad_norm": 3.75,
"learning_rate": 2.331381121966603e-06,
"loss": 1.3377602100372314,
"step": 798
},
{
"epoch": 1.167883211678832,
"grad_norm": 3.640625,
"learning_rate": 2.3207093902718066e-06,
"loss": 1.2145559787750244,
"step": 800
},
{
"epoch": 1.1708029197080292,
"grad_norm": 2.078125,
"learning_rate": 2.3100476713803967e-06,
"loss": 1.1511560678482056,
"step": 802
},
{
"epoch": 1.1737226277372264,
"grad_norm": 4.75,
"learning_rate": 2.2993962139670292e-06,
"loss": 1.5985954999923706,
"step": 804
},
{
"epoch": 1.1766423357664233,
"grad_norm": 6.71875,
"learning_rate": 2.288755266467022e-06,
"loss": 1.4606941938400269,
"step": 806
},
{
"epoch": 1.1795620437956205,
"grad_norm": 7.75,
"learning_rate": 2.2781250770705575e-06,
"loss": 1.5486199855804443,
"step": 808
},
{
"epoch": 1.1824817518248176,
"grad_norm": 4.1875,
"learning_rate": 2.267505893716898e-06,
"loss": 1.3502545356750488,
"step": 810
},
{
"epoch": 1.1854014598540146,
"grad_norm": 7.40625,
"learning_rate": 2.2568979640885964e-06,
"loss": 1.5650737285614014,
"step": 812
},
{
"epoch": 1.1883211678832117,
"grad_norm": 7.96875,
"learning_rate": 2.246301535605726e-06,
"loss": 1.6433610916137695,
"step": 814
},
{
"epoch": 1.1912408759124087,
"grad_norm": 3.78125,
"learning_rate": 2.2357168554201066e-06,
"loss": 1.0836632251739502,
"step": 816
},
{
"epoch": 1.1941605839416058,
"grad_norm": 3.796875,
"learning_rate": 2.225144170409537e-06,
"loss": 1.1502854824066162,
"step": 818
},
{
"epoch": 1.197080291970803,
"grad_norm": 3.015625,
"learning_rate": 2.2145837271720433e-06,
"loss": 1.6808114051818848,
"step": 820
},
{
"epoch": 1.2,
"grad_norm": 3.296875,
"learning_rate": 2.204035772020121e-06,
"loss": 1.3705600500106812,
"step": 822
},
{
"epoch": 1.2029197080291971,
"grad_norm": 2.78125,
"learning_rate": 2.1935005509749933e-06,
"loss": 1.1946570873260498,
"step": 824
},
{
"epoch": 1.205839416058394,
"grad_norm": 17.75,
"learning_rate": 2.182978309760874e-06,
"loss": 1.5363470315933228,
"step": 826
},
{
"epoch": 1.2087591240875912,
"grad_norm": 3.78125,
"learning_rate": 2.1724692937992313e-06,
"loss": 1.4042502641677856,
"step": 828
},
{
"epoch": 1.2116788321167884,
"grad_norm": 17.25,
"learning_rate": 2.16197374820307e-06,
"loss": 1.2589643001556396,
"step": 830
},
{
"epoch": 1.2145985401459853,
"grad_norm": 3.359375,
"learning_rate": 2.1514919177712085e-06,
"loss": 1.6056280136108398,
"step": 832
},
{
"epoch": 1.2175182481751825,
"grad_norm": 4.3125,
"learning_rate": 2.141024046982573e-06,
"loss": 1.3564906120300293,
"step": 834
},
{
"epoch": 1.2204379562043797,
"grad_norm": 11.625,
"learning_rate": 2.1305703799904947e-06,
"loss": 0.9380712509155273,
"step": 836
},
{
"epoch": 1.2233576642335766,
"grad_norm": 8.75,
"learning_rate": 2.120131160617013e-06,
"loss": 1.0530650615692139,
"step": 838
},
{
"epoch": 1.2262773722627738,
"grad_norm": 8.4375,
"learning_rate": 2.1097066323471897e-06,
"loss": 0.7292347550392151,
"step": 840
},
{
"epoch": 1.2291970802919707,
"grad_norm": 8.125,
"learning_rate": 2.0992970383234336e-06,
"loss": 0.9691898226737976,
"step": 842
},
{
"epoch": 1.2321167883211679,
"grad_norm": 1.796875,
"learning_rate": 2.088902621339823e-06,
"loss": 1.152883768081665,
"step": 844
},
{
"epoch": 1.235036496350365,
"grad_norm": 6.3125,
"learning_rate": 2.078523623836446e-06,
"loss": 1.4850080013275146,
"step": 846
},
{
"epoch": 1.237956204379562,
"grad_norm": 7.3125,
"learning_rate": 2.0681602878937472e-06,
"loss": 1.3769371509552002,
"step": 848
},
{
"epoch": 1.2408759124087592,
"grad_norm": 3.53125,
"learning_rate": 2.057812855226879e-06,
"loss": 1.103143334388733,
"step": 850
},
{
"epoch": 1.243795620437956,
"grad_norm": 3.578125,
"learning_rate": 2.0474815671800644e-06,
"loss": 1.4019992351531982,
"step": 852
},
{
"epoch": 1.2467153284671533,
"grad_norm": 5.40625,
"learning_rate": 2.0371666647209694e-06,
"loss": 1.1963081359863281,
"step": 854
},
{
"epoch": 1.2496350364963504,
"grad_norm": 3.0625,
"learning_rate": 2.0268683884350803e-06,
"loss": 1.1888788938522339,
"step": 856
},
{
"epoch": 1.2525547445255474,
"grad_norm": 13.6875,
"learning_rate": 2.0165869785200938e-06,
"loss": 1.2623980045318604,
"step": 858
},
{
"epoch": 1.2554744525547445,
"grad_norm": 6.4375,
"learning_rate": 2.0063226747803143e-06,
"loss": 1.2596468925476074,
"step": 860
},
{
"epoch": 1.2583941605839417,
"grad_norm": 3.859375,
"learning_rate": 1.9960757166210596e-06,
"loss": 1.333680272102356,
"step": 862
},
{
"epoch": 1.2613138686131387,
"grad_norm": 3.71875,
"learning_rate": 1.9858463430430807e-06,
"loss": 1.1413600444793701,
"step": 864
},
{
"epoch": 1.2642335766423358,
"grad_norm": 5.5625,
"learning_rate": 1.9756347926369813e-06,
"loss": 1.3728548288345337,
"step": 866
},
{
"epoch": 1.2671532846715328,
"grad_norm": 4.15625,
"learning_rate": 1.9654413035776585e-06,
"loss": 1.449355125427246,
"step": 868
},
{
"epoch": 1.27007299270073,
"grad_norm": 4.09375,
"learning_rate": 1.9552661136187444e-06,
"loss": 1.1183695793151855,
"step": 870
},
{
"epoch": 1.2729927007299269,
"grad_norm": 4.40625,
"learning_rate": 1.945109460087061e-06,
"loss": 1.1493186950683594,
"step": 872
},
{
"epoch": 1.275912408759124,
"grad_norm": 2.640625,
"learning_rate": 1.934971579877088e-06,
"loss": 1.3397104740142822,
"step": 874
},
{
"epoch": 1.2788321167883212,
"grad_norm": 4.3125,
"learning_rate": 1.9248527094454316e-06,
"loss": 1.3082889318466187,
"step": 876
},
{
"epoch": 1.2817518248175181,
"grad_norm": 8.4375,
"learning_rate": 1.9147530848053152e-06,
"loss": 1.563565731048584,
"step": 878
},
{
"epoch": 1.2846715328467153,
"grad_norm": 8.25,
"learning_rate": 1.9046729415210686e-06,
"loss": 1.4606716632843018,
"step": 880
},
{
"epoch": 1.2875912408759125,
"grad_norm": 4.65625,
"learning_rate": 1.8946125147026427e-06,
"loss": 1.3690614700317383,
"step": 882
},
{
"epoch": 1.2905109489051094,
"grad_norm": 7.8125,
"learning_rate": 1.8845720390001154e-06,
"loss": 1.6756688356399536,
"step": 884
},
{
"epoch": 1.2934306569343066,
"grad_norm": 3.21875,
"learning_rate": 1.874551748598226e-06,
"loss": 1.2701613903045654,
"step": 886
},
{
"epoch": 1.2963503649635038,
"grad_norm": 4.78125,
"learning_rate": 1.8645518772109077e-06,
"loss": 1.5865097045898438,
"step": 888
},
{
"epoch": 1.2992700729927007,
"grad_norm": 3.921875,
"learning_rate": 1.8545726580758428e-06,
"loss": 1.401726484298706,
"step": 890
},
{
"epoch": 1.3021897810218979,
"grad_norm": 7.78125,
"learning_rate": 1.8446143239490168e-06,
"loss": 1.6153247356414795,
"step": 892
},
{
"epoch": 1.305109489051095,
"grad_norm": 6.125,
"learning_rate": 1.8346771070992914e-06,
"loss": 1.4763232469558716,
"step": 894
},
{
"epoch": 1.308029197080292,
"grad_norm": 1.984375,
"learning_rate": 1.82476123930299e-06,
"loss": 1.2044928073883057,
"step": 896
},
{
"epoch": 1.310948905109489,
"grad_norm": 1.4296875,
"learning_rate": 1.8148669518384862e-06,
"loss": 1.0226365327835083,
"step": 898
},
{
"epoch": 1.313868613138686,
"grad_norm": 2.1875,
"learning_rate": 1.804994475480815e-06,
"loss": 1.0369101762771606,
"step": 900
},
{
"epoch": 1.3167883211678832,
"grad_norm": 1.6875,
"learning_rate": 1.7951440404962856e-06,
"loss": 1.1433358192443848,
"step": 902
},
{
"epoch": 1.3197080291970802,
"grad_norm": 5.3125,
"learning_rate": 1.7853158766371143e-06,
"loss": 1.1160844564437866,
"step": 904
},
{
"epoch": 1.3226277372262774,
"grad_norm": 10.1875,
"learning_rate": 1.7755102131360639e-06,
"loss": 1.3365674018859863,
"step": 906
},
{
"epoch": 1.3255474452554745,
"grad_norm": 2.21875,
"learning_rate": 1.7657272787010967e-06,
"loss": 1.3394170999526978,
"step": 908
},
{
"epoch": 1.3284671532846715,
"grad_norm": 14.0625,
"learning_rate": 1.7559673015100405e-06,
"loss": 1.2542470693588257,
"step": 910
},
{
"epoch": 1.3313868613138686,
"grad_norm": 1.9453125,
"learning_rate": 1.7462305092052676e-06,
"loss": 1.2083182334899902,
"step": 912
},
{
"epoch": 1.3343065693430658,
"grad_norm": 2.234375,
"learning_rate": 1.7365171288883841e-06,
"loss": 1.0745160579681396,
"step": 914
},
{
"epoch": 1.3372262773722627,
"grad_norm": 5.5,
"learning_rate": 1.7268273871149335e-06,
"loss": 1.4868173599243164,
"step": 916
},
{
"epoch": 1.34014598540146,
"grad_norm": 5.96875,
"learning_rate": 1.7171615098891117e-06,
"loss": 0.7804101705551147,
"step": 918
},
{
"epoch": 1.343065693430657,
"grad_norm": 3.65625,
"learning_rate": 1.7075197226584969e-06,
"loss": 1.3761916160583496,
"step": 920
},
{
"epoch": 1.345985401459854,
"grad_norm": 2.640625,
"learning_rate": 1.6979022503087905e-06,
"loss": 1.413581132888794,
"step": 922
},
{
"epoch": 1.348905109489051,
"grad_norm": 8.125,
"learning_rate": 1.688309317158572e-06,
"loss": 1.6476316452026367,
"step": 924
},
{
"epoch": 1.3518248175182481,
"grad_norm": 5.4375,
"learning_rate": 1.6787411469540677e-06,
"loss": 1.5541059970855713,
"step": 926
},
{
"epoch": 1.3547445255474453,
"grad_norm": 6.125,
"learning_rate": 1.6691979628639281e-06,
"loss": 1.5634403228759766,
"step": 928
},
{
"epoch": 1.3576642335766422,
"grad_norm": 2.65625,
"learning_rate": 1.6596799874740294e-06,
"loss": 1.2540359497070312,
"step": 930
},
{
"epoch": 1.3605839416058394,
"grad_norm": 5.59375,
"learning_rate": 1.6501874427822767e-06,
"loss": 1.4849543571472168,
"step": 932
},
{
"epoch": 1.3635036496350366,
"grad_norm": 6.40625,
"learning_rate": 1.6407205501934285e-06,
"loss": 1.141026496887207,
"step": 934
},
{
"epoch": 1.3664233576642335,
"grad_norm": 2.375,
"learning_rate": 1.6312795305139328e-06,
"loss": 0.9827671647071838,
"step": 936
},
{
"epoch": 1.3693430656934307,
"grad_norm": 5.5,
"learning_rate": 1.6218646039467725e-06,
"loss": 1.4801573753356934,
"step": 938
},
{
"epoch": 1.3722627737226278,
"grad_norm": 3.109375,
"learning_rate": 1.6124759900863365e-06,
"loss": 1.6479110717773438,
"step": 940
},
{
"epoch": 1.3751824817518248,
"grad_norm": 7.25,
"learning_rate": 1.6031139079132933e-06,
"loss": 1.2483787536621094,
"step": 942
},
{
"epoch": 1.378102189781022,
"grad_norm": 1.453125,
"learning_rate": 1.593778575789484e-06,
"loss": 1.2027292251586914,
"step": 944
},
{
"epoch": 1.3810218978102191,
"grad_norm": 3.859375,
"learning_rate": 1.5844702114528315e-06,
"loss": 1.5109983682632446,
"step": 946
},
{
"epoch": 1.383941605839416,
"grad_norm": 5.34375,
"learning_rate": 1.5751890320122568e-06,
"loss": 1.3143746852874756,
"step": 948
},
{
"epoch": 1.3868613138686132,
"grad_norm": 8.25,
"learning_rate": 1.5659352539426215e-06,
"loss": 1.2749611139297485,
"step": 950
},
{
"epoch": 1.3897810218978102,
"grad_norm": 2.125,
"learning_rate": 1.5567090930796746e-06,
"loss": 1.244338035583496,
"step": 952
},
{
"epoch": 1.3927007299270073,
"grad_norm": 4.3125,
"learning_rate": 1.5475107646150203e-06,
"loss": 1.3380858898162842,
"step": 954
},
{
"epoch": 1.3956204379562043,
"grad_norm": 1.15625,
"learning_rate": 1.5383404830910981e-06,
"loss": 1.4054020643234253,
"step": 956
},
{
"epoch": 1.3985401459854014,
"grad_norm": 10.5625,
"learning_rate": 1.529198462396175e-06,
"loss": 1.4239089488983154,
"step": 958
},
{
"epoch": 1.4014598540145986,
"grad_norm": 9.25,
"learning_rate": 1.5200849157593666e-06,
"loss": 1.610469102859497,
"step": 960
},
{
"epoch": 1.4043795620437955,
"grad_norm": 1.71875,
"learning_rate": 1.5110000557456542e-06,
"loss": 1.1694961786270142,
"step": 962
},
{
"epoch": 1.4072992700729927,
"grad_norm": 5.625,
"learning_rate": 1.5019440942509312e-06,
"loss": 1.5139713287353516,
"step": 964
},
{
"epoch": 1.4102189781021899,
"grad_norm": 3.953125,
"learning_rate": 1.4929172424970576e-06,
"loss": 1.376784324645996,
"step": 966
},
{
"epoch": 1.4131386861313868,
"grad_norm": 2.34375,
"learning_rate": 1.483919711026939e-06,
"loss": 1.3103041648864746,
"step": 968
},
{
"epoch": 1.416058394160584,
"grad_norm": 1.328125,
"learning_rate": 1.4749517096996116e-06,
"loss": 1.2476757764816284,
"step": 970
},
{
"epoch": 1.4189781021897812,
"grad_norm": 3.703125,
"learning_rate": 1.4660134476853485e-06,
"loss": 1.3406193256378174,
"step": 972
},
{
"epoch": 1.421897810218978,
"grad_norm": 4.375,
"learning_rate": 1.4571051334607813e-06,
"loss": 1.2700021266937256,
"step": 974
},
{
"epoch": 1.4248175182481753,
"grad_norm": 2.90625,
"learning_rate": 1.4482269748040358e-06,
"loss": 1.2266380786895752,
"step": 976
},
{
"epoch": 1.4277372262773722,
"grad_norm": 3.3125,
"learning_rate": 1.4393791787898896e-06,
"loss": 1.189935564994812,
"step": 978
},
{
"epoch": 1.4306569343065694,
"grad_norm": 4.8125,
"learning_rate": 1.430561951784938e-06,
"loss": 1.4163111448287964,
"step": 980
},
{
"epoch": 1.4335766423357663,
"grad_norm": 7.125,
"learning_rate": 1.4217754994427844e-06,
"loss": 1.6390494108200073,
"step": 982
},
{
"epoch": 1.4364963503649635,
"grad_norm": 1.6640625,
"learning_rate": 1.4130200266992408e-06,
"loss": 1.1357786655426025,
"step": 984
},
{
"epoch": 1.4394160583941606,
"grad_norm": 3.5625,
"learning_rate": 1.4042957377675484e-06,
"loss": 1.2841823101043701,
"step": 986
},
{
"epoch": 1.4423357664233576,
"grad_norm": 7.34375,
"learning_rate": 1.395602836133616e-06,
"loss": 1.3807730674743652,
"step": 988
},
{
"epoch": 1.4452554744525548,
"grad_norm": 1.421875,
"learning_rate": 1.386941524551273e-06,
"loss": 1.135375738143921,
"step": 990
},
{
"epoch": 1.448175182481752,
"grad_norm": 2.875,
"learning_rate": 1.37831200503754e-06,
"loss": 1.1764510869979858,
"step": 992
},
{
"epoch": 1.4510948905109489,
"grad_norm": 8.9375,
"learning_rate": 1.3697144788679174e-06,
"loss": 1.2467272281646729,
"step": 994
},
{
"epoch": 1.454014598540146,
"grad_norm": 5.90625,
"learning_rate": 1.3611491465716898e-06,
"loss": 1.4708714485168457,
"step": 996
},
{
"epoch": 1.4569343065693432,
"grad_norm": 3.71875,
"learning_rate": 1.3526162079272495e-06,
"loss": 1.402409553527832,
"step": 998
},
{
"epoch": 1.4598540145985401,
"grad_norm": 4.59375,
"learning_rate": 1.34411586195744e-06,
"loss": 1.2477829456329346,
"step": 1000
},
{
"epoch": 1.4627737226277373,
"grad_norm": 4.0625,
"learning_rate": 1.3356483069249088e-06,
"loss": 1.3877084255218506,
"step": 1002
},
{
"epoch": 1.4656934306569342,
"grad_norm": 7.875,
"learning_rate": 1.3272137403274844e-06,
"loss": 1.555393934249878,
"step": 1004
},
{
"epoch": 1.4686131386861314,
"grad_norm": 3.671875,
"learning_rate": 1.318812358893572e-06,
"loss": 1.3621551990509033,
"step": 1006
},
{
"epoch": 1.4715328467153284,
"grad_norm": 4.59375,
"learning_rate": 1.3104443585775642e-06,
"loss": 1.3545817136764526,
"step": 1008
},
{
"epoch": 1.4744525547445255,
"grad_norm": 3.9375,
"learning_rate": 1.3021099345552695e-06,
"loss": 1.4017988443374634,
"step": 1010
},
{
"epoch": 1.4773722627737227,
"grad_norm": 6.21875,
"learning_rate": 1.2938092812193615e-06,
"loss": 1.3940372467041016,
"step": 1012
},
{
"epoch": 1.4802919708029196,
"grad_norm": 3.1875,
"learning_rate": 1.285542592174842e-06,
"loss": 1.1765646934509277,
"step": 1014
},
{
"epoch": 1.4832116788321168,
"grad_norm": 6.0,
"learning_rate": 1.277310060234529e-06,
"loss": 1.385852336883545,
"step": 1016
},
{
"epoch": 1.486131386861314,
"grad_norm": 4.8125,
"learning_rate": 1.2691118774145577e-06,
"loss": 1.395111322402954,
"step": 1018
},
{
"epoch": 1.489051094890511,
"grad_norm": 1.640625,
"learning_rate": 1.2609482349299021e-06,
"loss": 1.325355052947998,
"step": 1020
},
{
"epoch": 1.491970802919708,
"grad_norm": 3.515625,
"learning_rate": 1.2528193231899156e-06,
"loss": 1.2050141096115112,
"step": 1022
},
{
"epoch": 1.4948905109489052,
"grad_norm": 4.03125,
"learning_rate": 1.2447253317938871e-06,
"loss": 1.6511290073394775,
"step": 1024
},
{
"epoch": 1.4978102189781022,
"grad_norm": 3.609375,
"learning_rate": 1.236666449526623e-06,
"loss": 1.28155517578125,
"step": 1026
},
{
"epoch": 1.5007299270072991,
"grad_norm": 3.734375,
"learning_rate": 1.2286428643540418e-06,
"loss": 1.4207556247711182,
"step": 1028
},
{
"epoch": 1.5036496350364965,
"grad_norm": 3.359375,
"learning_rate": 1.22065476341879e-06,
"loss": 1.3519251346588135,
"step": 1030
},
{
"epoch": 1.5065693430656935,
"grad_norm": 5.84375,
"learning_rate": 1.2127023330358777e-06,
"loss": 1.396289587020874,
"step": 1032
},
{
"epoch": 1.5094890510948904,
"grad_norm": 2.65625,
"learning_rate": 1.204785758688331e-06,
"loss": 1.3400771617889404,
"step": 1034
},
{
"epoch": 1.5124087591240876,
"grad_norm": 31.25,
"learning_rate": 1.1969052250228683e-06,
"loss": 1.1934255361557007,
"step": 1036
},
{
"epoch": 1.5153284671532847,
"grad_norm": 4.90625,
"learning_rate": 1.1890609158455949e-06,
"loss": 1.4513096809387207,
"step": 1038
},
{
"epoch": 1.5182481751824817,
"grad_norm": 2.625,
"learning_rate": 1.181253014117711e-06,
"loss": 1.1264418363571167,
"step": 1040
},
{
"epoch": 1.5211678832116788,
"grad_norm": 1.65625,
"learning_rate": 1.1734817019512465e-06,
"loss": 1.1497807502746582,
"step": 1042
},
{
"epoch": 1.524087591240876,
"grad_norm": 7.8125,
"learning_rate": 1.1657471606048157e-06,
"loss": 1.6058242321014404,
"step": 1044
},
{
"epoch": 1.527007299270073,
"grad_norm": 22.25,
"learning_rate": 1.1580495704793874e-06,
"loss": 1.4766197204589844,
"step": 1046
},
{
"epoch": 1.5299270072992701,
"grad_norm": 3.75,
"learning_rate": 1.1503891111140767e-06,
"loss": 1.2432148456573486,
"step": 1048
},
{
"epoch": 1.5328467153284673,
"grad_norm": 28.25,
"learning_rate": 1.1427659611819604e-06,
"loss": 1.1451390981674194,
"step": 1050
},
{
"epoch": 1.5357664233576642,
"grad_norm": 3.734375,
"learning_rate": 1.1351802984859045e-06,
"loss": 1.3471091985702515,
"step": 1052
},
{
"epoch": 1.5386861313868612,
"grad_norm": 1.640625,
"learning_rate": 1.127632299954423e-06,
"loss": 1.1958954334259033,
"step": 1054
},
{
"epoch": 1.5416058394160586,
"grad_norm": 10.8125,
"learning_rate": 1.1201221416375456e-06,
"loss": 1.3556766510009766,
"step": 1056
},
{
"epoch": 1.5445255474452555,
"grad_norm": 4.75,
"learning_rate": 1.1126499987027172e-06,
"loss": 1.6111273765563965,
"step": 1058
},
{
"epoch": 1.5474452554744524,
"grad_norm": 12.5,
"learning_rate": 1.1052160454307085e-06,
"loss": 1.5189365148544312,
"step": 1060
},
{
"epoch": 1.5503649635036496,
"grad_norm": 3.96875,
"learning_rate": 1.0978204552115493e-06,
"loss": 1.3763346672058105,
"step": 1062
},
{
"epoch": 1.5532846715328468,
"grad_norm": 4.375,
"learning_rate": 1.0904634005404902e-06,
"loss": 1.450345754623413,
"step": 1064
},
{
"epoch": 1.5562043795620437,
"grad_norm": 4.09375,
"learning_rate": 1.0831450530139747e-06,
"loss": 1.2109770774841309,
"step": 1066
},
{
"epoch": 1.5591240875912409,
"grad_norm": 7.0,
"learning_rate": 1.0758655833256381e-06,
"loss": 1.2681195735931396,
"step": 1068
},
{
"epoch": 1.562043795620438,
"grad_norm": 2.640625,
"learning_rate": 1.0686251612623277e-06,
"loss": 1.2694846391677856,
"step": 1070
},
{
"epoch": 1.564963503649635,
"grad_norm": 8.6875,
"learning_rate": 1.0614239557001389e-06,
"loss": 1.5101749897003174,
"step": 1072
},
{
"epoch": 1.5678832116788322,
"grad_norm": 3.171875,
"learning_rate": 1.0542621346004806e-06,
"loss": 1.313795566558838,
"step": 1074
},
{
"epoch": 1.5708029197080293,
"grad_norm": 9.0,
"learning_rate": 1.047139865006155e-06,
"loss": 1.1664808988571167,
"step": 1076
},
{
"epoch": 1.5737226277372263,
"grad_norm": 2.03125,
"learning_rate": 1.0400573130374641e-06,
"loss": 1.203639030456543,
"step": 1078
},
{
"epoch": 1.5766423357664232,
"grad_norm": 3.265625,
"learning_rate": 1.0330146438883304e-06,
"loss": 1.5285131931304932,
"step": 1080
},
{
"epoch": 1.5795620437956206,
"grad_norm": 6.5625,
"learning_rate": 1.0260120218224485e-06,
"loss": 1.516188144683838,
"step": 1082
},
{
"epoch": 1.5824817518248175,
"grad_norm": 6.9375,
"learning_rate": 1.019049610169452e-06,
"loss": 1.3165411949157715,
"step": 1084
},
{
"epoch": 1.5854014598540145,
"grad_norm": 4.6875,
"learning_rate": 1.012127571321104e-06,
"loss": 1.1730577945709229,
"step": 1086
},
{
"epoch": 1.5883211678832116,
"grad_norm": 4.46875,
"learning_rate": 1.0052460667275102e-06,
"loss": 1.3837532997131348,
"step": 1088
},
{
"epoch": 1.5912408759124088,
"grad_norm": 4.71875,
"learning_rate": 9.984052568933507e-07,
"loss": 1.342604398727417,
"step": 1090
},
{
"epoch": 1.5941605839416058,
"grad_norm": 1.8046875,
"learning_rate": 9.916053013741396e-07,
"loss": 1.0345500707626343,
"step": 1092
},
{
"epoch": 1.597080291970803,
"grad_norm": 3.578125,
"learning_rate": 9.848463587725024e-07,
"loss": 1.3031237125396729,
"step": 1094
},
{
"epoch": 1.6,
"grad_norm": 2.6875,
"learning_rate": 9.78128586734476e-07,
"loss": 1.4126646518707275,
"step": 1096
},
{
"epoch": 1.602919708029197,
"grad_norm": 2.796875,
"learning_rate": 9.714521419458333e-07,
"loss": 1.2036532163619995,
"step": 1098
},
{
"epoch": 1.6058394160583942,
"grad_norm": 5.34375,
"learning_rate": 9.648171801284254e-07,
"loss": 1.3445477485656738,
"step": 1100
},
{
"epoch": 1.6087591240875914,
"grad_norm": 6.875,
"learning_rate": 9.582238560365534e-07,
"loss": 1.4824466705322266,
"step": 1102
},
{
"epoch": 1.6116788321167883,
"grad_norm": 2.171875,
"learning_rate": 9.516723234533573e-07,
"loss": 0.6945338845252991,
"step": 1104
},
{
"epoch": 1.6145985401459853,
"grad_norm": 4.375,
"learning_rate": 9.451627351872289e-07,
"loss": 1.691240906715393,
"step": 1106
},
{
"epoch": 1.6175182481751826,
"grad_norm": 5.0625,
"learning_rate": 9.386952430682478e-07,
"loss": 1.6143536567687988,
"step": 1108
},
{
"epoch": 1.6204379562043796,
"grad_norm": 4.90625,
"learning_rate": 9.322699979446395e-07,
"loss": 1.0810116529464722,
"step": 1110
},
{
"epoch": 1.6233576642335765,
"grad_norm": 3.953125,
"learning_rate": 9.25887149679259e-07,
"loss": 1.3443822860717773,
"step": 1112
},
{
"epoch": 1.6262773722627737,
"grad_norm": 2.5,
"learning_rate": 9.19546847146093e-07,
"loss": 1.392272710800171,
"step": 1114
},
{
"epoch": 1.6291970802919709,
"grad_norm": 2.890625,
"learning_rate": 9.132492382267895e-07,
"loss": 1.2860863208770752,
"step": 1116
},
{
"epoch": 1.6321167883211678,
"grad_norm": 6.03125,
"learning_rate": 9.069944698072071e-07,
"loss": 1.4681463241577148,
"step": 1118
},
{
"epoch": 1.635036496350365,
"grad_norm": 1.828125,
"learning_rate": 9.0078268777399e-07,
"loss": 1.1984715461730957,
"step": 1120
},
{
"epoch": 1.6379562043795621,
"grad_norm": 3.328125,
"learning_rate": 8.946140370111651e-07,
"loss": 1.3620171546936035,
"step": 1122
},
{
"epoch": 1.640875912408759,
"grad_norm": 2.5625,
"learning_rate": 8.884886613967625e-07,
"loss": 1.0197124481201172,
"step": 1124
},
{
"epoch": 1.6437956204379562,
"grad_norm": 2.96875,
"learning_rate": 8.824067037994597e-07,
"loss": 1.2507963180541992,
"step": 1126
},
{
"epoch": 1.6467153284671534,
"grad_norm": 7.59375,
"learning_rate": 8.763683060752492e-07,
"loss": 1.5034403800964355,
"step": 1128
},
{
"epoch": 1.6496350364963503,
"grad_norm": 3.703125,
"learning_rate": 8.703736090641302e-07,
"loss": 1.250478744506836,
"step": 1130
},
{
"epoch": 1.6525547445255473,
"grad_norm": 2.921875,
"learning_rate": 8.644227525868238e-07,
"loss": 1.2682870626449585,
"step": 1132
},
{
"epoch": 1.6554744525547447,
"grad_norm": 8.5,
"learning_rate": 8.585158754415114e-07,
"loss": 1.5448431968688965,
"step": 1134
},
{
"epoch": 1.6583941605839416,
"grad_norm": 5.65625,
"learning_rate": 8.52653115400598e-07,
"loss": 1.3879718780517578,
"step": 1136
},
{
"epoch": 1.6613138686131386,
"grad_norm": 3.3125,
"learning_rate": 8.468346092074961e-07,
"loss": 1.3755671977996826,
"step": 1138
},
{
"epoch": 1.6642335766423357,
"grad_norm": 2.75,
"learning_rate": 8.410604925734411e-07,
"loss": 1.1513915061950684,
"step": 1140
},
{
"epoch": 1.667153284671533,
"grad_norm": 14.1875,
"learning_rate": 8.35330900174322e-07,
"loss": 1.5474663972854614,
"step": 1142
},
{
"epoch": 1.6700729927007298,
"grad_norm": 3.515625,
"learning_rate": 8.296459656475413e-07,
"loss": 0.8504141569137573,
"step": 1144
},
{
"epoch": 1.672992700729927,
"grad_norm": 5.78125,
"learning_rate": 8.240058215888998e-07,
"loss": 1.3289515972137451,
"step": 1146
},
{
"epoch": 1.6759124087591242,
"grad_norm": 6.9375,
"learning_rate": 8.184105995494998e-07,
"loss": 0.9470740556716919,
"step": 1148
},
{
"epoch": 1.6788321167883211,
"grad_norm": 3.359375,
"learning_rate": 8.128604300326812e-07,
"loss": 1.352350115776062,
"step": 1150
},
{
"epoch": 1.6817518248175183,
"grad_norm": 4.78125,
"learning_rate": 8.073554424909755e-07,
"loss": 1.3660526275634766,
"step": 1152
},
{
"epoch": 1.6846715328467154,
"grad_norm": 2.46875,
"learning_rate": 8.01895765323087e-07,
"loss": 1.2722463607788086,
"step": 1154
},
{
"epoch": 1.6875912408759124,
"grad_norm": 6.21875,
"learning_rate": 7.964815258708971e-07,
"loss": 1.13301420211792,
"step": 1156
},
{
"epoch": 1.6905109489051093,
"grad_norm": 2.03125,
"learning_rate": 7.911128504164947e-07,
"loss": 1.3945411443710327,
"step": 1158
},
{
"epoch": 1.6934306569343067,
"grad_norm": 1.7421875,
"learning_rate": 7.857898641792322e-07,
"loss": 1.1629891395568848,
"step": 1160
},
{
"epoch": 1.6963503649635037,
"grad_norm": 2.09375,
"learning_rate": 7.805126913128018e-07,
"loss": 1.1993281841278076,
"step": 1162
},
{
"epoch": 1.6992700729927006,
"grad_norm": 3.0625,
"learning_rate": 7.752814549023437e-07,
"loss": 1.4611374139785767,
"step": 1164
},
{
"epoch": 1.7021897810218978,
"grad_norm": 4.625,
"learning_rate": 7.700962769615704e-07,
"loss": 1.1919968128204346,
"step": 1166
},
{
"epoch": 1.705109489051095,
"grad_norm": 2.515625,
"learning_rate": 7.649572784299255e-07,
"loss": 1.2250781059265137,
"step": 1168
},
{
"epoch": 1.7080291970802919,
"grad_norm": 8.1875,
"learning_rate": 7.598645791697601e-07,
"loss": 1.3479260206222534,
"step": 1170
},
{
"epoch": 1.710948905109489,
"grad_norm": 4.25,
"learning_rate": 7.548182979635389e-07,
"loss": 1.3197946548461914,
"step": 1172
},
{
"epoch": 1.7138686131386862,
"grad_norm": 8.6875,
"learning_rate": 7.49818552511068e-07,
"loss": 1.1691796779632568,
"step": 1174
},
{
"epoch": 1.7167883211678832,
"grad_norm": 3.203125,
"learning_rate": 7.448654594267496e-07,
"loss": 1.2978925704956055,
"step": 1176
},
{
"epoch": 1.7197080291970803,
"grad_norm": 2.96875,
"learning_rate": 7.399591342368644e-07,
"loss": 1.174210786819458,
"step": 1178
},
{
"epoch": 1.7226277372262775,
"grad_norm": 4.625,
"learning_rate": 7.350996913768743e-07,
"loss": 1.2740840911865234,
"step": 1180
},
{
"epoch": 1.7255474452554744,
"grad_norm": 8.0625,
"learning_rate": 7.302872441887562e-07,
"loss": 1.1019668579101562,
"step": 1182
},
{
"epoch": 1.7284671532846714,
"grad_norm": 2.84375,
"learning_rate": 7.255219049183552e-07,
"loss": 1.3885023593902588,
"step": 1184
},
{
"epoch": 1.7313868613138688,
"grad_norm": 5.625,
"learning_rate": 7.208037847127683e-07,
"loss": 1.5192725658416748,
"step": 1186
},
{
"epoch": 1.7343065693430657,
"grad_norm": 6.625,
"learning_rate": 7.161329936177522e-07,
"loss": 1.3260494470596313,
"step": 1188
},
{
"epoch": 1.7372262773722627,
"grad_norm": 3.375,
"learning_rate": 7.115096405751567e-07,
"loss": 1.3762927055358887,
"step": 1190
},
{
"epoch": 1.7401459854014598,
"grad_norm": 1.8515625,
"learning_rate": 7.069338334203818e-07,
"loss": 1.0026099681854248,
"step": 1192
},
{
"epoch": 1.743065693430657,
"grad_norm": 1.1015625,
"learning_rate": 7.024056788798658e-07,
"loss": 1.1264629364013672,
"step": 1194
},
{
"epoch": 1.745985401459854,
"grad_norm": 16.75,
"learning_rate": 6.979252825685927e-07,
"loss": 1.5443601608276367,
"step": 1196
},
{
"epoch": 1.748905109489051,
"grad_norm": 1.8671875,
"learning_rate": 6.934927489876312e-07,
"loss": 1.0794442892074585,
"step": 1198
},
{
"epoch": 1.7518248175182483,
"grad_norm": 6.90625,
"learning_rate": 6.891081815216958e-07,
"loss": 1.348907470703125,
"step": 1200
},
{
"epoch": 1.7547445255474452,
"grad_norm": 3.140625,
"learning_rate": 6.847716824367369e-07,
"loss": 1.3414909839630127,
"step": 1202
},
{
"epoch": 1.7576642335766424,
"grad_norm": 4.59375,
"learning_rate": 6.804833528775531e-07,
"loss": 1.4073083400726318,
"step": 1204
},
{
"epoch": 1.7605839416058395,
"grad_norm": 3.671875,
"learning_rate": 6.762432928654358e-07,
"loss": 0.8366962671279907,
"step": 1206
},
{
"epoch": 1.7635036496350365,
"grad_norm": 5.53125,
"learning_rate": 6.720516012958325e-07,
"loss": 1.3547214269638062,
"step": 1208
},
{
"epoch": 1.7664233576642334,
"grad_norm": 5.21875,
"learning_rate": 6.679083759360433e-07,
"loss": 1.6114599704742432,
"step": 1210
},
{
"epoch": 1.7693430656934308,
"grad_norm": 4.5,
"learning_rate": 6.638137134229375e-07,
"loss": 1.5248315334320068,
"step": 1212
},
{
"epoch": 1.7722627737226277,
"grad_norm": 3.6875,
"learning_rate": 6.597677092607025e-07,
"loss": 1.093032956123352,
"step": 1214
},
{
"epoch": 1.7751824817518247,
"grad_norm": 4.5,
"learning_rate": 6.557704578186146e-07,
"loss": 1.408461093902588,
"step": 1216
},
{
"epoch": 1.7781021897810219,
"grad_norm": 9.9375,
"learning_rate": 6.518220523288382e-07,
"loss": 1.3268358707427979,
"step": 1218
},
{
"epoch": 1.781021897810219,
"grad_norm": 4.75,
"learning_rate": 6.479225848842523e-07,
"loss": 1.544386386871338,
"step": 1220
},
{
"epoch": 1.783941605839416,
"grad_norm": 5.9375,
"learning_rate": 6.440721464362998e-07,
"loss": 1.4272065162658691,
"step": 1222
},
{
"epoch": 1.7868613138686131,
"grad_norm": 3.515625,
"learning_rate": 6.402708267928694e-07,
"loss": 1.3150466680526733,
"step": 1224
},
{
"epoch": 1.7897810218978103,
"grad_norm": 5.0,
"learning_rate": 6.365187146161991e-07,
"loss": 1.2979998588562012,
"step": 1226
},
{
"epoch": 1.7927007299270072,
"grad_norm": 4.75,
"learning_rate": 6.32815897420809e-07,
"loss": 1.6841963529586792,
"step": 1228
},
{
"epoch": 1.7956204379562044,
"grad_norm": 5.0,
"learning_rate": 6.29162461571459e-07,
"loss": 1.6227900981903076,
"step": 1230
},
{
"epoch": 1.7985401459854016,
"grad_norm": 11.6875,
"learning_rate": 6.25558492281135e-07,
"loss": 1.4919426441192627,
"step": 1232
},
{
"epoch": 1.8014598540145985,
"grad_norm": 4.8125,
"learning_rate": 6.220040736090617e-07,
"loss": 1.3797836303710938,
"step": 1234
},
{
"epoch": 1.8043795620437955,
"grad_norm": 4.09375,
"learning_rate": 6.18499288458743e-07,
"loss": 1.6902371644973755,
"step": 1236
},
{
"epoch": 1.8072992700729928,
"grad_norm": 2.453125,
"learning_rate": 6.150442185760258e-07,
"loss": 1.2298048734664917,
"step": 1238
},
{
"epoch": 1.8102189781021898,
"grad_norm": 4.53125,
"learning_rate": 6.116389445471948e-07,
"loss": 1.3514063358306885,
"step": 1240
},
{
"epoch": 1.8131386861313867,
"grad_norm": 3.828125,
"learning_rate": 6.082835457970935e-07,
"loss": 1.3649213314056396,
"step": 1242
},
{
"epoch": 1.816058394160584,
"grad_norm": 4.15625,
"learning_rate": 6.0497810058727e-07,
"loss": 1.3873786926269531,
"step": 1244
},
{
"epoch": 1.818978102189781,
"grad_norm": 5.21875,
"learning_rate": 6.017226860141535e-07,
"loss": 1.6073391437530518,
"step": 1246
},
{
"epoch": 1.821897810218978,
"grad_norm": 2.90625,
"learning_rate": 5.985173780072558e-07,
"loss": 1.333566427230835,
"step": 1248
},
{
"epoch": 1.8248175182481752,
"grad_norm": 3.0625,
"learning_rate": 5.953622513273977e-07,
"loss": 1.3585089445114136,
"step": 1250
},
{
"epoch": 1.8277372262773723,
"grad_norm": 3.953125,
"learning_rate": 5.92257379564969e-07,
"loss": 1.195847749710083,
"step": 1252
},
{
"epoch": 1.8306569343065693,
"grad_norm": 4.84375,
"learning_rate": 5.892028351382101e-07,
"loss": 1.4418195486068726,
"step": 1254
},
{
"epoch": 1.8335766423357664,
"grad_norm": 4.09375,
"learning_rate": 5.861986892915227e-07,
"loss": 1.384018063545227,
"step": 1256
},
{
"epoch": 1.8364963503649636,
"grad_norm": 9.4375,
"learning_rate": 5.832450120938093e-07,
"loss": 1.3380024433135986,
"step": 1258
},
{
"epoch": 1.8394160583941606,
"grad_norm": 6.46875,
"learning_rate": 5.803418724368373e-07,
"loss": 1.3088436126708984,
"step": 1260
},
{
"epoch": 1.8423357664233575,
"grad_norm": 9.9375,
"learning_rate": 5.774893380336338e-07,
"loss": 1.5858633518218994,
"step": 1262
},
{
"epoch": 1.845255474452555,
"grad_norm": 6.375,
"learning_rate": 5.746874754169053e-07,
"loss": 1.5293078422546387,
"step": 1264
},
{
"epoch": 1.8481751824817518,
"grad_norm": 2.921875,
"learning_rate": 5.719363499374861e-07,
"loss": 1.1518256664276123,
"step": 1266
},
{
"epoch": 1.8510948905109488,
"grad_norm": 7.6875,
"learning_rate": 5.692360257628144e-07,
"loss": 1.3224802017211914,
"step": 1268
},
{
"epoch": 1.854014598540146,
"grad_norm": 4.28125,
"learning_rate": 5.665865658754341e-07,
"loss": 1.2233679294586182,
"step": 1270
},
{
"epoch": 1.856934306569343,
"grad_norm": 6.34375,
"learning_rate": 5.639880320715284e-07,
"loss": 1.4993672370910645,
"step": 1272
},
{
"epoch": 1.85985401459854,
"grad_norm": 3.703125,
"learning_rate": 5.614404849594762e-07,
"loss": 1.3802194595336914,
"step": 1274
},
{
"epoch": 1.8627737226277372,
"grad_norm": 2.5625,
"learning_rate": 5.589439839584404e-07,
"loss": 1.0489559173583984,
"step": 1276
},
{
"epoch": 1.8656934306569344,
"grad_norm": 1.40625,
"learning_rate": 5.564985872969791e-07,
"loss": 1.2326107025146484,
"step": 1278
},
{
"epoch": 1.8686131386861313,
"grad_norm": 5.4375,
"learning_rate": 5.541043520116912e-07,
"loss": 1.1945993900299072,
"step": 1280
},
{
"epoch": 1.8715328467153285,
"grad_norm": 2.625,
"learning_rate": 5.517613339458832e-07,
"loss": 1.2813007831573486,
"step": 1282
},
{
"epoch": 1.8744525547445257,
"grad_norm": 4.46875,
"learning_rate": 5.494695877482676e-07,
"loss": 1.1684314012527466,
"step": 1284
},
{
"epoch": 1.8773722627737226,
"grad_norm": 3.71875,
"learning_rate": 5.472291668716893e-07,
"loss": 1.222388505935669,
"step": 1286
},
{
"epoch": 1.8802919708029195,
"grad_norm": 2.984375,
"learning_rate": 5.450401235718762e-07,
"loss": 1.2156729698181152,
"step": 1288
},
{
"epoch": 1.883211678832117,
"grad_norm": 5.96875,
"learning_rate": 5.42902508906224e-07,
"loss": 1.311574935913086,
"step": 1290
},
{
"epoch": 1.8861313868613139,
"grad_norm": 7.96875,
"learning_rate": 5.408163727326021e-07,
"loss": 1.34036123752594,
"step": 1292
},
{
"epoch": 1.8890510948905108,
"grad_norm": 3.640625,
"learning_rate": 5.387817637081928e-07,
"loss": 1.1132798194885254,
"step": 1294
},
{
"epoch": 1.891970802919708,
"grad_norm": 3.359375,
"learning_rate": 5.367987292883554e-07,
"loss": 1.3646128177642822,
"step": 1296
},
{
"epoch": 1.8948905109489051,
"grad_norm": 5.1875,
"learning_rate": 5.348673157255195e-07,
"loss": 1.4554338455200195,
"step": 1298
},
{
"epoch": 1.897810218978102,
"grad_norm": 3.96875,
"learning_rate": 5.329875680681065e-07,
"loss": 1.4109296798706055,
"step": 1300
},
{
"epoch": 1.9007299270072993,
"grad_norm": 4.875,
"learning_rate": 5.311595301594783e-07,
"loss": 1.1961219310760498,
"step": 1302
},
{
"epoch": 1.9036496350364964,
"grad_norm": 2.921875,
"learning_rate": 5.293832446369158e-07,
"loss": 0.6657427549362183,
"step": 1304
},
{
"epoch": 1.9065693430656934,
"grad_norm": 10.4375,
"learning_rate": 5.276587529306236e-07,
"loss": 1.397131323814392,
"step": 1306
},
{
"epoch": 1.9094890510948905,
"grad_norm": 6.5,
"learning_rate": 5.25986095262763e-07,
"loss": 1.323398470878601,
"step": 1308
},
{
"epoch": 1.9124087591240877,
"grad_norm": 3.203125,
"learning_rate": 5.243653106465157e-07,
"loss": 1.3060777187347412,
"step": 1310
},
{
"epoch": 1.9153284671532846,
"grad_norm": 5.71875,
"learning_rate": 5.227964368851721e-07,
"loss": 1.5433318614959717,
"step": 1312
},
{
"epoch": 1.9182481751824818,
"grad_norm": 3.359375,
"learning_rate": 5.212795105712508e-07,
"loss": 1.4788509607315063,
"step": 1314
},
{
"epoch": 1.921167883211679,
"grad_norm": 4.8125,
"learning_rate": 5.198145670856438e-07,
"loss": 1.3976120948791504,
"step": 1316
},
{
"epoch": 1.924087591240876,
"grad_norm": 2.0625,
"learning_rate": 5.184016405967931e-07,
"loss": 1.1872693300247192,
"step": 1318
},
{
"epoch": 1.9270072992700729,
"grad_norm": 2.296875,
"learning_rate": 5.170407640598921e-07,
"loss": 1.1601970195770264,
"step": 1320
},
{
"epoch": 1.92992700729927,
"grad_norm": 3.5625,
"learning_rate": 5.157319692161178e-07,
"loss": 1.205195426940918,
"step": 1322
},
{
"epoch": 1.9328467153284672,
"grad_norm": 3.734375,
"learning_rate": 5.144752865918901e-07,
"loss": 1.1591906547546387,
"step": 1324
},
{
"epoch": 1.9357664233576641,
"grad_norm": 3.421875,
"learning_rate": 5.132707454981602e-07,
"loss": 1.3498120307922363,
"step": 1326
},
{
"epoch": 1.9386861313868613,
"grad_norm": 3.796875,
"learning_rate": 5.121183740297261e-07,
"loss": 1.3916034698486328,
"step": 1328
},
{
"epoch": 1.9416058394160585,
"grad_norm": 17.375,
"learning_rate": 5.110181990645788e-07,
"loss": 1.2117153406143188,
"step": 1330
},
{
"epoch": 1.9445255474452554,
"grad_norm": 1.734375,
"learning_rate": 5.099702462632737e-07,
"loss": 1.19834566116333,
"step": 1332
},
{
"epoch": 1.9474452554744526,
"grad_norm": 10.0625,
"learning_rate": 5.089745400683333e-07,
"loss": 0.8368179798126221,
"step": 1334
},
{
"epoch": 1.9503649635036497,
"grad_norm": 5.625,
"learning_rate": 5.080311037036767e-07,
"loss": 1.314239263534546,
"step": 1336
},
{
"epoch": 1.9532846715328467,
"grad_norm": 1.65625,
"learning_rate": 5.071399591740777e-07,
"loss": 1.216627597808838,
"step": 1338
},
{
"epoch": 1.9562043795620438,
"grad_norm": 6.375,
"learning_rate": 5.063011272646521e-07,
"loss": 1.2274556159973145,
"step": 1340
},
{
"epoch": 1.959124087591241,
"grad_norm": 2.546875,
"learning_rate": 5.055146275403725e-07,
"loss": 1.4812201261520386,
"step": 1342
},
{
"epoch": 1.962043795620438,
"grad_norm": 5.71875,
"learning_rate": 5.047804783456117e-07,
"loss": 1.215821623802185,
"step": 1344
},
{
"epoch": 1.964963503649635,
"grad_norm": 4.71875,
"learning_rate": 5.040986968037157e-07,
"loss": 1.318119764328003,
"step": 1346
},
{
"epoch": 1.967883211678832,
"grad_norm": 2.953125,
"learning_rate": 5.034692988166033e-07,
"loss": 1.2136964797973633,
"step": 1348
},
{
"epoch": 1.9708029197080292,
"grad_norm": 4.125,
"learning_rate": 5.028922990643963e-07,
"loss": 1.3341786861419678,
"step": 1350
},
{
"epoch": 1.9737226277372262,
"grad_norm": 3.75,
"learning_rate": 5.023677110050759e-07,
"loss": 1.4188188314437866,
"step": 1352
},
{
"epoch": 1.9766423357664233,
"grad_norm": 3.421875,
"learning_rate": 5.018955468741701e-07,
"loss": 1.608628511428833,
"step": 1354
},
{
"epoch": 1.9795620437956205,
"grad_norm": 3.359375,
"learning_rate": 5.014758176844665e-07,
"loss": 1.5936325788497925,
"step": 1356
},
{
"epoch": 1.9824817518248175,
"grad_norm": 2.796875,
"learning_rate": 5.011085332257579e-07,
"loss": 1.178612232208252,
"step": 1358
},
{
"epoch": 1.9854014598540146,
"grad_norm": 7.1875,
"learning_rate": 5.007937020646117e-07,
"loss": 1.1231637001037598,
"step": 1360
},
{
"epoch": 1.9883211678832118,
"grad_norm": 1.90625,
"learning_rate": 5.005313315441716e-07,
"loss": 0.6363063454627991,
"step": 1362
},
{
"epoch": 1.9912408759124087,
"grad_norm": 5.5,
"learning_rate": 5.003214277839851e-07,
"loss": 1.3855026960372925,
"step": 1364
},
{
"epoch": 1.994160583941606,
"grad_norm": 5.6875,
"learning_rate": 5.00163995679862e-07,
"loss": 1.346792459487915,
"step": 1366
},
{
"epoch": 1.997080291970803,
"grad_norm": 8.1875,
"learning_rate": 5.000590389037593e-07,
"loss": 1.3148702383041382,
"step": 1368
},
{
"epoch": 2.0,
"grad_norm": 4.0625,
"learning_rate": 5.00006559903696e-07,
"loss": 1.6425683498382568,
"step": 1370
},
{
"epoch": 2.0,
"step": 1370,
"total_flos": 1.984544544032555e+18,
"train_loss": 1.409229011779284,
"train_runtime": 8212.4061,
"train_samples_per_second": 2.669,
"train_steps_per_second": 0.167
}
],
"logging_steps": 2,
"max_steps": 1370,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 9999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.984544544032555e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}