{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1410,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004259850905218318,
"grad_norm": 0.80859375,
"learning_rate": 4.225352112676057e-07,
"loss": 1.8638979196548462,
"step": 2
},
{
"epoch": 0.008519701810436636,
"grad_norm": 0.69921875,
"learning_rate": 1.267605633802817e-06,
"loss": 1.9382712841033936,
"step": 4
},
{
"epoch": 0.012779552715654952,
"grad_norm": 1.09375,
"learning_rate": 2.1126760563380285e-06,
"loss": 1.8919719457626343,
"step": 6
},
{
"epoch": 0.01703940362087327,
"grad_norm": 0.6875,
"learning_rate": 2.957746478873239e-06,
"loss": 1.9754539728164673,
"step": 8
},
{
"epoch": 0.021299254526091587,
"grad_norm": 0.8046875,
"learning_rate": 3.8028169014084508e-06,
"loss": 1.9735431671142578,
"step": 10
},
{
"epoch": 0.025559105431309903,
"grad_norm": 0.490234375,
"learning_rate": 4.6478873239436615e-06,
"loss": 1.962188959121704,
"step": 12
},
{
"epoch": 0.029818956336528223,
"grad_norm": 0.796875,
"learning_rate": 5.492957746478874e-06,
"loss": 1.8216444253921509,
"step": 14
},
{
"epoch": 0.03407880724174654,
"grad_norm": 0.65234375,
"learning_rate": 6.338028169014085e-06,
"loss": 1.879197359085083,
"step": 16
},
{
"epoch": 0.038338658146964855,
"grad_norm": 0.58203125,
"learning_rate": 7.183098591549295e-06,
"loss": 1.9074592590332031,
"step": 18
},
{
"epoch": 0.042598509052183174,
"grad_norm": 0.5625,
"learning_rate": 8.028169014084507e-06,
"loss": 1.8535538911819458,
"step": 20
},
{
"epoch": 0.046858359957401494,
"grad_norm": 0.70703125,
"learning_rate": 8.873239436619718e-06,
"loss": 1.7652872800827026,
"step": 22
},
{
"epoch": 0.051118210862619806,
"grad_norm": 0.470703125,
"learning_rate": 9.71830985915493e-06,
"loss": 1.7537739276885986,
"step": 24
},
{
"epoch": 0.055378061767838126,
"grad_norm": 0.4375,
"learning_rate": 1.056338028169014e-05,
"loss": 1.6868540048599243,
"step": 26
},
{
"epoch": 0.059637912673056445,
"grad_norm": 0.419921875,
"learning_rate": 1.1408450704225351e-05,
"loss": 1.694838523864746,
"step": 28
},
{
"epoch": 0.06389776357827476,
"grad_norm": 0.6953125,
"learning_rate": 1.2253521126760564e-05,
"loss": 1.7118496894836426,
"step": 30
},
{
"epoch": 0.06815761448349308,
"grad_norm": 0.5,
"learning_rate": 1.3098591549295775e-05,
"loss": 1.7367674112319946,
"step": 32
},
{
"epoch": 0.0724174653887114,
"grad_norm": 0.625,
"learning_rate": 1.3943661971830985e-05,
"loss": 1.6931723356246948,
"step": 34
},
{
"epoch": 0.07667731629392971,
"grad_norm": 0.87109375,
"learning_rate": 1.4788732394366198e-05,
"loss": 1.8322988748550415,
"step": 36
},
{
"epoch": 0.08093716719914804,
"grad_norm": 0.71875,
"learning_rate": 1.563380281690141e-05,
"loss": 1.723473072052002,
"step": 38
},
{
"epoch": 0.08519701810436635,
"grad_norm": 0.765625,
"learning_rate": 1.6478873239436623e-05,
"loss": 1.584614872932434,
"step": 40
},
{
"epoch": 0.08945686900958466,
"grad_norm": 0.458984375,
"learning_rate": 1.7323943661971833e-05,
"loss": 1.757703423500061,
"step": 42
},
{
"epoch": 0.09371671991480299,
"grad_norm": 0.384765625,
"learning_rate": 1.816901408450704e-05,
"loss": 1.5781840085983276,
"step": 44
},
{
"epoch": 0.0979765708200213,
"grad_norm": 0.98828125,
"learning_rate": 1.9014084507042255e-05,
"loss": 1.6014955043792725,
"step": 46
},
{
"epoch": 0.10223642172523961,
"grad_norm": 0.73828125,
"learning_rate": 1.9859154929577465e-05,
"loss": 1.674490213394165,
"step": 48
},
{
"epoch": 0.10649627263045794,
"grad_norm": 0.61328125,
"learning_rate": 2.0704225352112676e-05,
"loss": 1.393676519393921,
"step": 50
},
{
"epoch": 0.11075612353567625,
"grad_norm": 0.69921875,
"learning_rate": 2.154929577464789e-05,
"loss": 1.495132327079773,
"step": 52
},
{
"epoch": 0.11501597444089456,
"grad_norm": 0.5703125,
"learning_rate": 2.23943661971831e-05,
"loss": 1.5442848205566406,
"step": 54
},
{
"epoch": 0.11927582534611289,
"grad_norm": 0.396484375,
"learning_rate": 2.3239436619718308e-05,
"loss": 1.459320068359375,
"step": 56
},
{
"epoch": 0.1235356762513312,
"grad_norm": 0.7578125,
"learning_rate": 2.4084507042253522e-05,
"loss": 1.4848005771636963,
"step": 58
},
{
"epoch": 0.12779552715654952,
"grad_norm": 0.310546875,
"learning_rate": 2.4929577464788733e-05,
"loss": 1.3594304323196411,
"step": 60
},
{
"epoch": 0.13205537806176784,
"grad_norm": 0.6640625,
"learning_rate": 2.5774647887323944e-05,
"loss": 1.4234025478363037,
"step": 62
},
{
"epoch": 0.13631522896698617,
"grad_norm": 0.27734375,
"learning_rate": 2.6619718309859158e-05,
"loss": 1.3102433681488037,
"step": 64
},
{
"epoch": 0.14057507987220447,
"grad_norm": 0.3046875,
"learning_rate": 2.746478873239437e-05,
"loss": 1.3784610033035278,
"step": 66
},
{
"epoch": 0.1448349307774228,
"grad_norm": 0.5625,
"learning_rate": 2.8309859154929576e-05,
"loss": 1.325601577758789,
"step": 68
},
{
"epoch": 0.14909478168264112,
"grad_norm": 0.36328125,
"learning_rate": 2.915492957746479e-05,
"loss": 1.3749815225601196,
"step": 70
},
{
"epoch": 0.15335463258785942,
"grad_norm": 0.2890625,
"learning_rate": 3e-05,
"loss": 1.4330412149429321,
"step": 72
},
{
"epoch": 0.15761448349307774,
"grad_norm": 1.609375,
"learning_rate": 2.9999867885940888e-05,
"loss": 1.4420013427734375,
"step": 74
},
{
"epoch": 0.16187433439829607,
"grad_norm": 0.345703125,
"learning_rate": 2.999947154667255e-05,
"loss": 1.3299309015274048,
"step": 76
},
{
"epoch": 0.16613418530351437,
"grad_norm": 0.3125,
"learning_rate": 2.9998810990921997e-05,
"loss": 1.2926124334335327,
"step": 78
},
{
"epoch": 0.1703940362087327,
"grad_norm": 0.34765625,
"learning_rate": 2.999788623323402e-05,
"loss": 1.4053008556365967,
"step": 80
},
{
"epoch": 0.17465388711395102,
"grad_norm": 0.341796875,
"learning_rate": 2.999669729397085e-05,
"loss": 1.3341435194015503,
"step": 82
},
{
"epoch": 0.17891373801916932,
"grad_norm": 1.1171875,
"learning_rate": 2.999524419931176e-05,
"loss": 1.3351154327392578,
"step": 84
},
{
"epoch": 0.18317358892438765,
"grad_norm": 0.275390625,
"learning_rate": 2.9993526981252465e-05,
"loss": 1.279821515083313,
"step": 86
},
{
"epoch": 0.18743343982960597,
"grad_norm": 0.29296875,
"learning_rate": 2.999154567760439e-05,
"loss": 1.2992417812347412,
"step": 88
},
{
"epoch": 0.19169329073482427,
"grad_norm": 0.43359375,
"learning_rate": 2.998930033199389e-05,
"loss": 1.2671334743499756,
"step": 90
},
{
"epoch": 0.1959531416400426,
"grad_norm": 0.396484375,
"learning_rate": 2.9986790993861245e-05,
"loss": 1.4086840152740479,
"step": 92
},
{
"epoch": 0.20021299254526093,
"grad_norm": 0.4765625,
"learning_rate": 2.9984017718459603e-05,
"loss": 1.260171890258789,
"step": 94
},
{
"epoch": 0.20447284345047922,
"grad_norm": 0.4453125,
"learning_rate": 2.998098056685374e-05,
"loss": 1.337839126586914,
"step": 96
},
{
"epoch": 0.20873269435569755,
"grad_norm": 0.44921875,
"learning_rate": 2.9977679605918732e-05,
"loss": 1.3128660917282104,
"step": 98
},
{
"epoch": 0.21299254526091588,
"grad_norm": 0.3515625,
"learning_rate": 2.9974114908338454e-05,
"loss": 1.3010621070861816,
"step": 100
},
{
"epoch": 0.21725239616613418,
"grad_norm": 0.2451171875,
"learning_rate": 2.9970286552604036e-05,
"loss": 1.3007886409759521,
"step": 102
},
{
"epoch": 0.2215122470713525,
"grad_norm": 0.251953125,
"learning_rate": 2.996619462301207e-05,
"loss": 1.3102291822433472,
"step": 104
},
{
"epoch": 0.22577209797657083,
"grad_norm": 0.515625,
"learning_rate": 2.9961839209662808e-05,
"loss": 1.4082542657852173,
"step": 106
},
{
"epoch": 0.23003194888178913,
"grad_norm": 0.447265625,
"learning_rate": 2.9957220408458118e-05,
"loss": 1.220694899559021,
"step": 108
},
{
"epoch": 0.23429179978700745,
"grad_norm": 0.494140625,
"learning_rate": 2.9952338321099435e-05,
"loss": 1.3067396879196167,
"step": 110
},
{
"epoch": 0.23855165069222578,
"grad_norm": 0.34765625,
"learning_rate": 2.9947193055085505e-05,
"loss": 1.283180594444275,
"step": 112
},
{
"epoch": 0.24281150159744408,
"grad_norm": 0.255859375,
"learning_rate": 2.9941784723709973e-05,
"loss": 1.273285150527954,
"step": 114
},
{
"epoch": 0.2470713525026624,
"grad_norm": 0.4453125,
"learning_rate": 2.993611344605895e-05,
"loss": 1.2464628219604492,
"step": 116
},
{
"epoch": 0.25133120340788073,
"grad_norm": 0.255859375,
"learning_rate": 2.9930179347008347e-05,
"loss": 1.2899105548858643,
"step": 118
},
{
"epoch": 0.25559105431309903,
"grad_norm": 0.71484375,
"learning_rate": 2.9923982557221154e-05,
"loss": 1.166140079498291,
"step": 120
},
{
"epoch": 0.2598509052183174,
"grad_norm": 0.359375,
"learning_rate": 2.9917523213144554e-05,
"loss": 1.3073238134384155,
"step": 122
},
{
"epoch": 0.2641107561235357,
"grad_norm": 0.251953125,
"learning_rate": 2.9910801457006897e-05,
"loss": 1.2734519243240356,
"step": 124
},
{
"epoch": 0.268370607028754,
"grad_norm": 0.2470703125,
"learning_rate": 2.9903817436814603e-05,
"loss": 1.190434455871582,
"step": 126
},
{
"epoch": 0.27263045793397234,
"grad_norm": 0.41796875,
"learning_rate": 2.9896571306348874e-05,
"loss": 1.2509433031082153,
"step": 128
},
{
"epoch": 0.27689030883919064,
"grad_norm": 0.48828125,
"learning_rate": 2.9889063225162337e-05,
"loss": 1.3253034353256226,
"step": 130
},
{
"epoch": 0.28115015974440893,
"grad_norm": 0.318359375,
"learning_rate": 2.98812933585755e-05,
"loss": 1.2004448175430298,
"step": 132
},
{
"epoch": 0.2854100106496273,
"grad_norm": 0.2890625,
"learning_rate": 2.9873261877673142e-05,
"loss": 1.1677073240280151,
"step": 134
},
{
"epoch": 0.2896698615548456,
"grad_norm": 0.48046875,
"learning_rate": 2.9864968959300505e-05,
"loss": 1.2530547380447388,
"step": 136
},
{
"epoch": 0.2939297124600639,
"grad_norm": 0.482421875,
"learning_rate": 2.985641478605945e-05,
"loss": 1.2573705911636353,
"step": 138
},
{
"epoch": 0.29818956336528224,
"grad_norm": 0.435546875,
"learning_rate": 2.9847599546304395e-05,
"loss": 1.3057535886764526,
"step": 140
},
{
"epoch": 0.30244941427050054,
"grad_norm": 0.2734375,
"learning_rate": 2.9838523434138204e-05,
"loss": 1.2737255096435547,
"step": 142
},
{
"epoch": 0.30670926517571884,
"grad_norm": 0.419921875,
"learning_rate": 2.982918664940787e-05,
"loss": 1.2411197423934937,
"step": 144
},
{
"epoch": 0.3109691160809372,
"grad_norm": 0.3203125,
"learning_rate": 2.9819589397700148e-05,
"loss": 1.2790652513504028,
"step": 146
},
{
"epoch": 0.3152289669861555,
"grad_norm": 0.408203125,
"learning_rate": 2.9809731890337017e-05,
"loss": 1.2759779691696167,
"step": 148
},
{
"epoch": 0.3194888178913738,
"grad_norm": 0.337890625,
"learning_rate": 2.979961434437103e-05,
"loss": 1.3018522262573242,
"step": 150
},
{
"epoch": 0.32374866879659214,
"grad_norm": 0.365234375,
"learning_rate": 2.9789236982580538e-05,
"loss": 1.3175352811813354,
"step": 152
},
{
"epoch": 0.32800851970181044,
"grad_norm": 0.32421875,
"learning_rate": 2.9778600033464767e-05,
"loss": 1.291448712348938,
"step": 154
},
{
"epoch": 0.33226837060702874,
"grad_norm": 0.275390625,
"learning_rate": 2.97677037312388e-05,
"loss": 1.2841699123382568,
"step": 156
},
{
"epoch": 0.3365282215122471,
"grad_norm": 0.33203125,
"learning_rate": 2.975654831582843e-05,
"loss": 1.3081012964248657,
"step": 158
},
{
"epoch": 0.3407880724174654,
"grad_norm": 0.2734375,
"learning_rate": 2.9745134032864862e-05,
"loss": 1.2524945735931396,
"step": 160
},
{
"epoch": 0.3450479233226837,
"grad_norm": 0.228515625,
"learning_rate": 2.973346113367929e-05,
"loss": 1.1932268142700195,
"step": 162
},
{
"epoch": 0.34930777422790205,
"grad_norm": 0.453125,
"learning_rate": 2.972152987529741e-05,
"loss": 1.2276166677474976,
"step": 164
},
{
"epoch": 0.35356762513312034,
"grad_norm": 0.2734375,
"learning_rate": 2.9709340520433722e-05,
"loss": 1.2343382835388184,
"step": 166
},
{
"epoch": 0.35782747603833864,
"grad_norm": 0.96875,
"learning_rate": 2.9696893337485734e-05,
"loss": 1.3475210666656494,
"step": 168
},
{
"epoch": 0.362087326943557,
"grad_norm": 0.69921875,
"learning_rate": 2.9684188600528098e-05,
"loss": 1.2921943664550781,
"step": 170
},
{
"epoch": 0.3663471778487753,
"grad_norm": 0.515625,
"learning_rate": 2.967122658930654e-05,
"loss": 1.1600617170333862,
"step": 172
},
{
"epoch": 0.3706070287539936,
"grad_norm": 0.341796875,
"learning_rate": 2.9658007589231723e-05,
"loss": 1.3178966045379639,
"step": 174
},
{
"epoch": 0.37486687965921195,
"grad_norm": 0.33203125,
"learning_rate": 2.9644531891372925e-05,
"loss": 1.3098689317703247,
"step": 176
},
{
"epoch": 0.37912673056443025,
"grad_norm": 0.234375,
"learning_rate": 2.9630799792451687e-05,
"loss": 1.1713343858718872,
"step": 178
},
{
"epoch": 0.38338658146964855,
"grad_norm": 0.2109375,
"learning_rate": 2.9616811594835214e-05,
"loss": 1.2428940534591675,
"step": 180
},
{
"epoch": 0.3876464323748669,
"grad_norm": 0.359375,
"learning_rate": 2.9602567606529776e-05,
"loss": 1.2774041891098022,
"step": 182
},
{
"epoch": 0.3919062832800852,
"grad_norm": 0.33984375,
"learning_rate": 2.9588068141173888e-05,
"loss": 1.2816710472106934,
"step": 184
},
{
"epoch": 0.3961661341853035,
"grad_norm": 0.349609375,
"learning_rate": 2.9573313518031424e-05,
"loss": 1.1415907144546509,
"step": 186
},
{
"epoch": 0.40042598509052185,
"grad_norm": 0.396484375,
"learning_rate": 2.955830406198458e-05,
"loss": 1.232388973236084,
"step": 188
},
{
"epoch": 0.40468583599574015,
"grad_norm": 0.3125,
"learning_rate": 2.95430401035267e-05,
"loss": 1.250954031944275,
"step": 190
},
{
"epoch": 0.40894568690095845,
"grad_norm": 0.310546875,
"learning_rate": 2.9527521978755053e-05,
"loss": 1.205154299736023,
"step": 192
},
{
"epoch": 0.4132055378061768,
"grad_norm": 0.26171875,
"learning_rate": 2.9511750029363377e-05,
"loss": 1.2991074323654175,
"step": 194
},
{
"epoch": 0.4174653887113951,
"grad_norm": 0.296875,
"learning_rate": 2.949572460263438e-05,
"loss": 1.2244809865951538,
"step": 196
},
{
"epoch": 0.4217252396166134,
"grad_norm": 0.41796875,
"learning_rate": 2.947944605143208e-05,
"loss": 1.2750012874603271,
"step": 198
},
{
"epoch": 0.42598509052183176,
"grad_norm": 0.376953125,
"learning_rate": 2.9462914734194078e-05,
"loss": 1.3423129320144653,
"step": 200
},
{
"epoch": 0.43024494142705005,
"grad_norm": 0.35546875,
"learning_rate": 2.9446131014923593e-05,
"loss": 1.280989646911621,
"step": 202
},
{
"epoch": 0.43450479233226835,
"grad_norm": 0.26171875,
"learning_rate": 2.9429095263181514e-05,
"loss": 1.2904020547866821,
"step": 204
},
{
"epoch": 0.4387646432374867,
"grad_norm": 0.427734375,
"learning_rate": 2.9411807854078226e-05,
"loss": 1.2392964363098145,
"step": 206
},
{
"epoch": 0.443024494142705,
"grad_norm": 0.3046875,
"learning_rate": 2.9394269168265358e-05,
"loss": 1.2662076950073242,
"step": 208
},
{
"epoch": 0.4472843450479233,
"grad_norm": 0.3046875,
"learning_rate": 2.9376479591927408e-05,
"loss": 1.2238541841506958,
"step": 210
},
{
"epoch": 0.45154419595314166,
"grad_norm": 0.263671875,
"learning_rate": 2.935843951677323e-05,
"loss": 1.2052886486053467,
"step": 212
},
{
"epoch": 0.45580404685835996,
"grad_norm": 0.3515625,
"learning_rate": 2.9340149340027412e-05,
"loss": 1.2680332660675049,
"step": 214
},
{
"epoch": 0.46006389776357826,
"grad_norm": 0.318359375,
"learning_rate": 2.9321609464421546e-05,
"loss": 1.233550786972046,
"step": 216
},
{
"epoch": 0.4643237486687966,
"grad_norm": 0.578125,
"learning_rate": 2.930282029818533e-05,
"loss": 1.2138795852661133,
"step": 218
},
{
"epoch": 0.4685835995740149,
"grad_norm": 0.2578125,
"learning_rate": 2.92837822550376e-05,
"loss": 1.1213371753692627,
"step": 220
},
{
"epoch": 0.4728434504792332,
"grad_norm": 0.404296875,
"learning_rate": 2.9264495754177225e-05,
"loss": 1.2127740383148193,
"step": 222
},
{
"epoch": 0.47710330138445156,
"grad_norm": 1.5078125,
"learning_rate": 2.924496122027384e-05,
"loss": 1.3384878635406494,
"step": 224
},
{
"epoch": 0.48136315228966986,
"grad_norm": 0.345703125,
"learning_rate": 2.9225179083458555e-05,
"loss": 1.1937229633331299,
"step": 226
},
{
"epoch": 0.48562300319488816,
"grad_norm": 0.52734375,
"learning_rate": 2.9205149779314425e-05,
"loss": 1.2608391046524048,
"step": 228
},
{
"epoch": 0.4898828541001065,
"grad_norm": 0.31640625,
"learning_rate": 2.918487374886691e-05,
"loss": 1.2325993776321411,
"step": 230
},
{
"epoch": 0.4941427050053248,
"grad_norm": 0.287109375,
"learning_rate": 2.91643514385741e-05,
"loss": 1.2050917148590088,
"step": 232
},
{
"epoch": 0.4984025559105431,
"grad_norm": 0.296875,
"learning_rate": 2.9143583300316975e-05,
"loss": 1.2299753427505493,
"step": 234
},
{
"epoch": 0.5026624068157615,
"grad_norm": 0.322265625,
"learning_rate": 2.9122569791389354e-05,
"loss": 1.2500553131103516,
"step": 236
},
{
"epoch": 0.5069222577209798,
"grad_norm": 0.46875,
"learning_rate": 2.9101311374487908e-05,
"loss": 1.3044551610946655,
"step": 238
},
{
"epoch": 0.5111821086261981,
"grad_norm": 0.28125,
"learning_rate": 2.907980851770193e-05,
"loss": 1.1923537254333496,
"step": 240
},
{
"epoch": 0.5154419595314164,
"grad_norm": 0.419921875,
"learning_rate": 2.905806169450303e-05,
"loss": 1.2567352056503296,
"step": 242
},
{
"epoch": 0.5197018104366348,
"grad_norm": 0.369140625,
"learning_rate": 2.9036071383734716e-05,
"loss": 1.2812081575393677,
"step": 244
},
{
"epoch": 0.5239616613418531,
"grad_norm": 0.4609375,
"learning_rate": 2.9013838069601874e-05,
"loss": 1.2612706422805786,
"step": 246
},
{
"epoch": 0.5282215122470714,
"grad_norm": 0.27734375,
"learning_rate": 2.8991362241660053e-05,
"loss": 1.2162076234817505,
"step": 248
},
{
"epoch": 0.5324813631522897,
"grad_norm": 0.267578125,
"learning_rate": 2.8968644394804736e-05,
"loss": 1.2357534170150757,
"step": 250
},
{
"epoch": 0.536741214057508,
"grad_norm": 0.255859375,
"learning_rate": 2.894568502926042e-05,
"loss": 1.144363284111023,
"step": 252
},
{
"epoch": 0.5410010649627263,
"grad_norm": 0.68359375,
"learning_rate": 2.8922484650569597e-05,
"loss": 1.1998339891433716,
"step": 254
},
{
"epoch": 0.5452609158679447,
"grad_norm": 0.435546875,
"learning_rate": 2.8899043769581627e-05,
"loss": 1.1842751502990723,
"step": 256
},
{
"epoch": 0.549520766773163,
"grad_norm": 0.78515625,
"learning_rate": 2.8875362902441517e-05,
"loss": 1.1901715993881226,
"step": 258
},
{
"epoch": 0.5537806176783813,
"grad_norm": 0.447265625,
"learning_rate": 2.885144257057849e-05,
"loss": 1.3038347959518433,
"step": 260
},
{
"epoch": 0.5580404685835996,
"grad_norm": 0.46484375,
"learning_rate": 2.8827283300694593e-05,
"loss": 1.2350062131881714,
"step": 262
},
{
"epoch": 0.5623003194888179,
"grad_norm": 0.443359375,
"learning_rate": 2.8802885624753013e-05,
"loss": 1.2469710111618042,
"step": 264
},
{
"epoch": 0.5665601703940362,
"grad_norm": 0.38671875,
"learning_rate": 2.8778250079966417e-05,
"loss": 1.2484819889068604,
"step": 266
},
{
"epoch": 0.5708200212992546,
"grad_norm": 0.48046875,
"learning_rate": 2.875337720878512e-05,
"loss": 1.213232159614563,
"step": 268
},
{
"epoch": 0.5750798722044729,
"grad_norm": 0.349609375,
"learning_rate": 2.8728267558885102e-05,
"loss": 1.1985093355178833,
"step": 270
},
{
"epoch": 0.5793397231096912,
"grad_norm": 0.28125,
"learning_rate": 2.8702921683156e-05,
"loss": 1.2459266185760498,
"step": 272
},
{
"epoch": 0.5835995740149095,
"grad_norm": 0.251953125,
"learning_rate": 2.867734013968891e-05,
"loss": 1.3075346946716309,
"step": 274
},
{
"epoch": 0.5878594249201278,
"grad_norm": 0.63671875,
"learning_rate": 2.8651523491764074e-05,
"loss": 1.254473090171814,
"step": 276
},
{
"epoch": 0.5921192758253461,
"grad_norm": 0.5703125,
"learning_rate": 2.8625472307838518e-05,
"loss": 1.2639200687408447,
"step": 278
},
{
"epoch": 0.5963791267305645,
"grad_norm": 0.703125,
"learning_rate": 2.8599187161533533e-05,
"loss": 1.23056161403656,
"step": 280
},
{
"epoch": 0.6006389776357828,
"grad_norm": 0.53125,
"learning_rate": 2.8572668631622e-05,
"loss": 1.2778501510620117,
"step": 282
},
{
"epoch": 0.6048988285410011,
"grad_norm": 0.376953125,
"learning_rate": 2.8545917302015693e-05,
"loss": 1.240308403968811,
"step": 284
},
{
"epoch": 0.6091586794462194,
"grad_norm": 0.2734375,
"learning_rate": 2.851893376175241e-05,
"loss": 1.3061432838439941,
"step": 286
},
{
"epoch": 0.6134185303514377,
"grad_norm": 0.326171875,
"learning_rate": 2.849171860498298e-05,
"loss": 1.1693536043167114,
"step": 288
},
{
"epoch": 0.617678381256656,
"grad_norm": 0.396484375,
"learning_rate": 2.8464272430958208e-05,
"loss": 1.3255276679992676,
"step": 290
},
{
"epoch": 0.6219382321618744,
"grad_norm": 0.4140625,
"learning_rate": 2.843659584401568e-05,
"loss": 1.1839312314987183,
"step": 292
},
{
"epoch": 0.6261980830670927,
"grad_norm": 0.400390625,
"learning_rate": 2.840868945356643e-05,
"loss": 1.2237545251846313,
"step": 294
},
{
"epoch": 0.630457933972311,
"grad_norm": 0.279296875,
"learning_rate": 2.8380553874081544e-05,
"loss": 1.219810962677002,
"step": 296
},
{
"epoch": 0.6347177848775293,
"grad_norm": 0.2099609375,
"learning_rate": 2.8352189725078623e-05,
"loss": 1.148103952407837,
"step": 298
},
{
"epoch": 0.6389776357827476,
"grad_norm": 0.70703125,
"learning_rate": 2.8323597631108148e-05,
"loss": 1.266182780265808,
"step": 300
},
{
"epoch": 0.6432374866879659,
"grad_norm": 0.9765625,
"learning_rate": 2.829477822173972e-05,
"loss": 1.1832197904586792,
"step": 302
},
{
"epoch": 0.6474973375931843,
"grad_norm": 0.322265625,
"learning_rate": 2.8265732131548185e-05,
"loss": 1.2743726968765259,
"step": 304
},
{
"epoch": 0.6517571884984026,
"grad_norm": 0.2216796875,
"learning_rate": 2.82364600000997e-05,
"loss": 1.2408907413482666,
"step": 306
},
{
"epoch": 0.6560170394036209,
"grad_norm": 0.46875,
"learning_rate": 2.8206962471937612e-05,
"loss": 1.2314817905426025,
"step": 308
},
{
"epoch": 0.6602768903088392,
"grad_norm": 0.279296875,
"learning_rate": 2.817724019656829e-05,
"loss": 1.0730669498443604,
"step": 310
},
{
"epoch": 0.6645367412140575,
"grad_norm": 0.3203125,
"learning_rate": 2.81472938284468e-05,
"loss": 1.250943660736084,
"step": 312
},
{
"epoch": 0.6687965921192758,
"grad_norm": 0.98046875,
"learning_rate": 2.811712402696252e-05,
"loss": 1.1586111783981323,
"step": 314
},
{
"epoch": 0.6730564430244942,
"grad_norm": 0.267578125,
"learning_rate": 2.808673145642461e-05,
"loss": 1.2091357707977295,
"step": 316
},
{
"epoch": 0.6773162939297125,
"grad_norm": 0.7890625,
"learning_rate": 2.805611678604737e-05,
"loss": 1.219393253326416,
"step": 318
},
{
"epoch": 0.6815761448349308,
"grad_norm": 0.302734375,
"learning_rate": 2.8025280689935538e-05,
"loss": 1.2416179180145264,
"step": 320
},
{
"epoch": 0.6858359957401491,
"grad_norm": 0.46875,
"learning_rate": 2.7994223847069417e-05,
"loss": 1.2236298322677612,
"step": 322
},
{
"epoch": 0.6900958466453674,
"grad_norm": 1.109375,
"learning_rate": 2.7962946941289932e-05,
"loss": 1.1898835897445679,
"step": 324
},
{
"epoch": 0.6943556975505857,
"grad_norm": 0.357421875,
"learning_rate": 2.7931450661283587e-05,
"loss": 1.1595722436904907,
"step": 326
},
{
"epoch": 0.6986155484558041,
"grad_norm": 0.53125,
"learning_rate": 2.7899735700567272e-05,
"loss": 1.221711277961731,
"step": 328
},
{
"epoch": 0.7028753993610224,
"grad_norm": 0.2275390625,
"learning_rate": 2.7867802757473023e-05,
"loss": 1.2105400562286377,
"step": 330
},
{
"epoch": 0.7071352502662407,
"grad_norm": 0.30078125,
"learning_rate": 2.7835652535132635e-05,
"loss": 1.2640867233276367,
"step": 332
},
{
"epoch": 0.711395101171459,
"grad_norm": 0.330078125,
"learning_rate": 2.780328574146216e-05,
"loss": 1.259413480758667,
"step": 334
},
{
"epoch": 0.7156549520766773,
"grad_norm": 0.4453125,
"learning_rate": 2.7770703089146355e-05,
"loss": 1.3237056732177734,
"step": 336
},
{
"epoch": 0.7199148029818956,
"grad_norm": 0.400390625,
"learning_rate": 2.7737905295622957e-05,
"loss": 1.2199316024780273,
"step": 338
},
{
"epoch": 0.724174653887114,
"grad_norm": 0.51953125,
"learning_rate": 2.7704893083066906e-05,
"loss": 1.1969261169433594,
"step": 340
},
{
"epoch": 0.7284345047923323,
"grad_norm": 0.466796875,
"learning_rate": 2.7671667178374443e-05,
"loss": 1.2693402767181396,
"step": 342
},
{
"epoch": 0.7326943556975506,
"grad_norm": 0.265625,
"learning_rate": 2.7638228313147083e-05,
"loss": 1.230875015258789,
"step": 344
},
{
"epoch": 0.7369542066027689,
"grad_norm": 0.375,
"learning_rate": 2.760457722367553e-05,
"loss": 1.1558018922805786,
"step": 346
},
{
"epoch": 0.7412140575079872,
"grad_norm": 2.40625,
"learning_rate": 2.7570714650923446e-05,
"loss": 1.3312543630599976,
"step": 348
},
{
"epoch": 0.7454739084132055,
"grad_norm": 0.283203125,
"learning_rate": 2.7536641340511177e-05,
"loss": 1.1423282623291016,
"step": 350
},
{
"epoch": 0.7497337593184239,
"grad_norm": 0.41796875,
"learning_rate": 2.7502358042699257e-05,
"loss": 1.1751903295516968,
"step": 352
},
{
"epoch": 0.7539936102236422,
"grad_norm": 0.384765625,
"learning_rate": 2.7467865512371974e-05,
"loss": 1.2713823318481445,
"step": 354
},
{
"epoch": 0.7582534611288605,
"grad_norm": 0.330078125,
"learning_rate": 2.7433164509020684e-05,
"loss": 1.2887362241744995,
"step": 356
},
{
"epoch": 0.7625133120340788,
"grad_norm": 0.341796875,
"learning_rate": 2.7398255796727127e-05,
"loss": 1.2369112968444824,
"step": 358
},
{
"epoch": 0.7667731629392971,
"grad_norm": 0.34375,
"learning_rate": 2.7363140144146578e-05,
"loss": 1.150454044342041,
"step": 360
},
{
"epoch": 0.7710330138445154,
"grad_norm": 0.275390625,
"learning_rate": 2.7327818324490938e-05,
"loss": 1.2185767889022827,
"step": 362
},
{
"epoch": 0.7752928647497338,
"grad_norm": 0.6953125,
"learning_rate": 2.729229111551171e-05,
"loss": 1.2292591333389282,
"step": 364
},
{
"epoch": 0.7795527156549521,
"grad_norm": 0.236328125,
"learning_rate": 2.725655929948285e-05,
"loss": 1.2185684442520142,
"step": 366
},
{
"epoch": 0.7838125665601704,
"grad_norm": 0.30078125,
"learning_rate": 2.722062366318357e-05,
"loss": 1.1981046199798584,
"step": 368
},
{
"epoch": 0.7880724174653887,
"grad_norm": 0.2197265625,
"learning_rate": 2.7184484997881e-05,
"loss": 1.1411432027816772,
"step": 370
},
{
"epoch": 0.792332268370607,
"grad_norm": 0.85546875,
"learning_rate": 2.7148144099312765e-05,
"loss": 1.2738561630249023,
"step": 372
},
{
"epoch": 0.7965921192758253,
"grad_norm": 0.275390625,
"learning_rate": 2.7111601767669473e-05,
"loss": 1.1942780017852783,
"step": 374
},
{
"epoch": 0.8008519701810437,
"grad_norm": 0.212890625,
"learning_rate": 2.7074858807577084e-05,
"loss": 1.1684967279434204,
"step": 376
},
{
"epoch": 0.805111821086262,
"grad_norm": 0.171875,
"learning_rate": 2.7037916028079198e-05,
"loss": 1.1836313009262085,
"step": 378
},
{
"epoch": 0.8093716719914803,
"grad_norm": 0.365234375,
"learning_rate": 2.7000774242619235e-05,
"loss": 1.2047457695007324,
"step": 380
},
{
"epoch": 0.8136315228966986,
"grad_norm": 0.2294921875,
"learning_rate": 2.696343426902254e-05,
"loss": 1.186992883682251,
"step": 382
},
{
"epoch": 0.8178913738019169,
"grad_norm": 0.376953125,
"learning_rate": 2.6925896929478355e-05,
"loss": 1.1887181997299194,
"step": 384
},
{
"epoch": 0.8221512247071352,
"grad_norm": 0.328125,
"learning_rate": 2.6888163050521734e-05,
"loss": 1.2181212902069092,
"step": 386
},
{
"epoch": 0.8264110756123536,
"grad_norm": 0.55078125,
"learning_rate": 2.6850233463015334e-05,
"loss": 1.1820951700210571,
"step": 388
},
{
"epoch": 0.8306709265175719,
"grad_norm": 0.267578125,
"learning_rate": 2.6812109002131106e-05,
"loss": 1.1575113534927368,
"step": 390
},
{
"epoch": 0.8349307774227902,
"grad_norm": 0.40625,
"learning_rate": 2.6773790507331936e-05,
"loss": 1.1017088890075684,
"step": 392
},
{
"epoch": 0.8391906283280085,
"grad_norm": 0.310546875,
"learning_rate": 2.673527882235314e-05,
"loss": 1.1889958381652832,
"step": 394
},
{
"epoch": 0.8434504792332268,
"grad_norm": 0.373046875,
"learning_rate": 2.6696574795183882e-05,
"loss": 1.1406269073486328,
"step": 396
},
{
"epoch": 0.8477103301384451,
"grad_norm": 0.453125,
"learning_rate": 2.665767927804852e-05,
"loss": 1.172967791557312,
"step": 398
},
{
"epoch": 0.8519701810436635,
"grad_norm": 0.23046875,
"learning_rate": 2.661859312738783e-05,
"loss": 1.2290892601013184,
"step": 400
},
{
"epoch": 0.8562300319488818,
"grad_norm": 0.255859375,
"learning_rate": 2.6579317203840154e-05,
"loss": 1.0655782222747803,
"step": 402
},
{
"epoch": 0.8604898828541001,
"grad_norm": 0.208984375,
"learning_rate": 2.6539852372222434e-05,
"loss": 1.1730587482452393,
"step": 404
},
{
"epoch": 0.8647497337593184,
"grad_norm": 0.283203125,
"learning_rate": 2.6500199501511184e-05,
"loss": 1.2667183876037598,
"step": 406
},
{
"epoch": 0.8690095846645367,
"grad_norm": 0.3125,
"learning_rate": 2.646035946482336e-05,
"loss": 1.2611602544784546,
"step": 408
},
{
"epoch": 0.873269435569755,
"grad_norm": 0.234375,
"learning_rate": 2.6420333139397122e-05,
"loss": 1.2684861421585083,
"step": 410
},
{
"epoch": 0.8775292864749734,
"grad_norm": 0.2333984375,
"learning_rate": 2.638012140657252e-05,
"loss": 1.2144488096237183,
"step": 412
},
{
"epoch": 0.8817891373801917,
"grad_norm": 0.44921875,
"learning_rate": 2.6339725151772095e-05,
"loss": 1.2024558782577515,
"step": 414
},
{
"epoch": 0.88604898828541,
"grad_norm": 0.478515625,
"learning_rate": 2.6299145264481386e-05,
"loss": 1.2472572326660156,
"step": 416
},
{
"epoch": 0.8903088391906283,
"grad_norm": 0.310546875,
"learning_rate": 2.625838263822932e-05,
"loss": 1.15989351272583,
"step": 418
},
{
"epoch": 0.8945686900958466,
"grad_norm": 0.236328125,
"learning_rate": 2.621743817056858e-05,
"loss": 1.2214092016220093,
"step": 420
},
{
"epoch": 0.898828541001065,
"grad_norm": 0.37109375,
"learning_rate": 2.6176312763055795e-05,
"loss": 1.1031744480133057,
"step": 422
},
{
"epoch": 0.9030883919062833,
"grad_norm": 0.251953125,
"learning_rate": 2.6135007321231715e-05,
"loss": 1.0990759134292603,
"step": 424
},
{
"epoch": 0.9073482428115016,
"grad_norm": 0.41015625,
"learning_rate": 2.6093522754601284e-05,
"loss": 1.180249810218811,
"step": 426
},
{
"epoch": 0.9116080937167199,
"grad_norm": 0.185546875,
"learning_rate": 2.6051859976613564e-05,
"loss": 1.1679967641830444,
"step": 428
},
{
"epoch": 0.9158679446219382,
"grad_norm": 0.294921875,
"learning_rate": 2.601001990464169e-05,
"loss": 1.1675636768341064,
"step": 430
},
{
"epoch": 0.9201277955271565,
"grad_norm": 0.93359375,
"learning_rate": 2.5968003459962608e-05,
"loss": 1.187214732170105,
"step": 432
},
{
"epoch": 0.9243876464323749,
"grad_norm": 0.23828125,
"learning_rate": 2.592581156773684e-05,
"loss": 1.1574485301971436,
"step": 434
},
{
"epoch": 0.9286474973375932,
"grad_norm": 0.33203125,
"learning_rate": 2.588344515698806e-05,
"loss": 1.207824468612671,
"step": 436
},
{
"epoch": 0.9329073482428115,
"grad_norm": 0.275390625,
"learning_rate": 2.58409051605827e-05,
"loss": 1.160508155822754,
"step": 438
},
{
"epoch": 0.9371671991480298,
"grad_norm": 0.3125,
"learning_rate": 2.5798192515209343e-05,
"loss": 1.1380846500396729,
"step": 440
},
{
"epoch": 0.9414270500532481,
"grad_norm": 0.1884765625,
"learning_rate": 2.5755308161358166e-05,
"loss": 1.1430374383926392,
"step": 442
},
{
"epoch": 0.9456869009584664,
"grad_norm": 1.0625,
"learning_rate": 2.5712253043300174e-05,
"loss": 1.1965644359588623,
"step": 444
},
{
"epoch": 0.9499467518636848,
"grad_norm": 0.302734375,
"learning_rate": 2.5669028109066426e-05,
"loss": 1.2050869464874268,
"step": 446
},
{
"epoch": 0.9542066027689031,
"grad_norm": 0.380859375,
"learning_rate": 2.5625634310427188e-05,
"loss": 1.1945817470550537,
"step": 448
},
{
"epoch": 0.9584664536741214,
"grad_norm": 0.275390625,
"learning_rate": 2.558207260287093e-05,
"loss": 1.1947966814041138,
"step": 450
},
{
"epoch": 0.9627263045793397,
"grad_norm": 0.279296875,
"learning_rate": 2.553834394558332e-05,
"loss": 1.134352445602417,
"step": 452
},
{
"epoch": 0.966986155484558,
"grad_norm": 1.0859375,
"learning_rate": 2.5494449301426102e-05,
"loss": 1.2251217365264893,
"step": 454
},
{
"epoch": 0.9712460063897763,
"grad_norm": 0.2177734375,
"learning_rate": 2.5450389636915867e-05,
"loss": 1.081860899925232,
"step": 456
},
{
"epoch": 0.9755058572949947,
"grad_norm": 0.1943359375,
"learning_rate": 2.540616592220281e-05,
"loss": 1.182367205619812,
"step": 458
},
{
"epoch": 0.979765708200213,
"grad_norm": 0.2451171875,
"learning_rate": 2.5361779131049344e-05,
"loss": 1.158174991607666,
"step": 460
},
{
"epoch": 0.9840255591054313,
"grad_norm": 0.2001953125,
"learning_rate": 2.5317230240808656e-05,
"loss": 1.1436811685562134,
"step": 462
},
{
"epoch": 0.9882854100106496,
"grad_norm": 0.31640625,
"learning_rate": 2.527252023240319e-05,
"loss": 1.1009982824325562,
"step": 464
},
{
"epoch": 0.9925452609158679,
"grad_norm": 0.39453125,
"learning_rate": 2.5227650090303083e-05,
"loss": 1.2242732048034668,
"step": 466
},
{
"epoch": 0.9968051118210862,
"grad_norm": 0.255859375,
"learning_rate": 2.5182620802504415e-05,
"loss": 1.1412031650543213,
"step": 468
},
{
"epoch": 1.0,
"grad_norm": 0.453125,
"learning_rate": 2.513743336050753e-05,
"loss": 1.3368866443634033,
"step": 470
},
{
"epoch": 1.0042598509052183,
"grad_norm": 0.23828125,
"learning_rate": 2.5092088759295147e-05,
"loss": 0.9358726739883423,
"step": 472
},
{
"epoch": 1.0085197018104366,
"grad_norm": 0.189453125,
"learning_rate": 2.5046587997310503e-05,
"loss": 0.9842238426208496,
"step": 474
},
{
"epoch": 1.012779552715655,
"grad_norm": 0.2314453125,
"learning_rate": 2.500093207643532e-05,
"loss": 0.909864068031311,
"step": 476
},
{
"epoch": 1.0170394036208732,
"grad_norm": 0.248046875,
"learning_rate": 2.4955122001967757e-05,
"loss": 0.8217376470565796,
"step": 478
},
{
"epoch": 1.0212992545260915,
"grad_norm": 0.2236328125,
"learning_rate": 2.4909158782600303e-05,
"loss": 0.9412868618965149,
"step": 480
},
{
"epoch": 1.0255591054313098,
"grad_norm": 0.423828125,
"learning_rate": 2.4863043430397546e-05,
"loss": 0.9232436418533325,
"step": 482
},
{
"epoch": 1.0298189563365283,
"grad_norm": 0.1708984375,
"learning_rate": 2.481677696077387e-05,
"loss": 0.9075867533683777,
"step": 484
},
{
"epoch": 1.0340788072417466,
"grad_norm": 0.3203125,
"learning_rate": 2.477036039247113e-05,
"loss": 0.9229554533958435,
"step": 486
},
{
"epoch": 1.038338658146965,
"grad_norm": 0.28515625,
"learning_rate": 2.4723794747536204e-05,
"loss": 0.8909753561019897,
"step": 488
},
{
"epoch": 1.0425985090521832,
"grad_norm": 0.2578125,
"learning_rate": 2.4677081051298473e-05,
"loss": 0.8516156077384949,
"step": 490
},
{
"epoch": 1.0468583599574015,
"grad_norm": 0.4375,
"learning_rate": 2.4630220332347293e-05,
"loss": 0.910189151763916,
"step": 492
},
{
"epoch": 1.0511182108626198,
"grad_norm": 0.2314453125,
"learning_rate": 2.458321362250928e-05,
"loss": 0.8809674978256226,
"step": 494
},
{
"epoch": 1.055378061767838,
"grad_norm": 0.337890625,
"learning_rate": 2.4536061956825653e-05,
"loss": 0.9545248746871948,
"step": 496
},
{
"epoch": 1.0596379126730564,
"grad_norm": 0.359375,
"learning_rate": 2.44887663735294e-05,
"loss": 0.8128166794776917,
"step": 498
},
{
"epoch": 1.0638977635782747,
"grad_norm": 0.271484375,
"learning_rate": 2.4441327914022435e-05,
"loss": 0.7933678030967712,
"step": 500
},
{
"epoch": 1.068157614483493,
"grad_norm": 0.5859375,
"learning_rate": 2.4393747622852666e-05,
"loss": 0.845329761505127,
"step": 502
},
{
"epoch": 1.0724174653887113,
"grad_norm": 0.291015625,
"learning_rate": 2.4346026547690983e-05,
"loss": 0.8825768232345581,
"step": 504
},
{
"epoch": 1.0766773162939298,
"grad_norm": 0.322265625,
"learning_rate": 2.4298165739308227e-05,
"loss": 0.9173828959465027,
"step": 506
},
{
"epoch": 1.0809371671991481,
"grad_norm": 0.625,
"learning_rate": 2.4250166251551998e-05,
"loss": 0.9571421146392822,
"step": 508
},
{
"epoch": 1.0851970181043664,
"grad_norm": 0.29296875,
"learning_rate": 2.4202029141323492e-05,
"loss": 0.8474833369255066,
"step": 510
},
{
"epoch": 1.0894568690095847,
"grad_norm": 0.34765625,
"learning_rate": 2.415375546855422e-05,
"loss": 0.8801344633102417,
"step": 512
},
{
"epoch": 1.093716719914803,
"grad_norm": 0.33984375,
"learning_rate": 2.4105346296182648e-05,
"loss": 0.8761341571807861,
"step": 514
},
{
"epoch": 1.0979765708200213,
"grad_norm": 0.921875,
"learning_rate": 2.4056802690130826e-05,
"loss": 0.8511140942573547,
"step": 516
},
{
"epoch": 1.1022364217252396,
"grad_norm": 0.2470703125,
"learning_rate": 2.4008125719280893e-05,
"loss": 0.8243319392204285,
"step": 518
},
{
"epoch": 1.106496272630458,
"grad_norm": 0.2353515625,
"learning_rate": 2.395931645545155e-05,
"loss": 0.9023821949958801,
"step": 520
},
{
"epoch": 1.1107561235356762,
"grad_norm": 0.5859375,
"learning_rate": 2.391037597337446e-05,
"loss": 0.8977804183959961,
"step": 522
},
{
"epoch": 1.1150159744408945,
"grad_norm": 0.271484375,
"learning_rate": 2.3861305350670564e-05,
"loss": 0.8644490242004395,
"step": 524
},
{
"epoch": 1.1192758253461128,
"grad_norm": 0.220703125,
"learning_rate": 2.381210566782642e-05,
"loss": 0.8652825951576233,
"step": 526
},
{
"epoch": 1.123535676251331,
"grad_norm": 0.48828125,
"learning_rate": 2.3762778008170296e-05,
"loss": 0.9315000176429749,
"step": 528
},
{
"epoch": 1.1277955271565494,
"grad_norm": 0.298828125,
"learning_rate": 2.3713323457848425e-05,
"loss": 0.8627546429634094,
"step": 530
},
{
"epoch": 1.132055378061768,
"grad_norm": 0.3515625,
"learning_rate": 2.366374310580106e-05,
"loss": 0.8466436266899109,
"step": 532
},
{
"epoch": 1.1363152289669862,
"grad_norm": 0.234375,
"learning_rate": 2.3614038043738432e-05,
"loss": 0.8433495163917542,
"step": 534
},
{
"epoch": 1.1405750798722045,
"grad_norm": 0.671875,
"learning_rate": 2.35642093661168e-05,
"loss": 0.9653686285018921,
"step": 536
},
{
"epoch": 1.1448349307774228,
"grad_norm": 0.46484375,
"learning_rate": 2.351425817011432e-05,
"loss": 0.9155454039573669,
"step": 538
},
{
"epoch": 1.1490947816826411,
"grad_norm": 0.2333984375,
"learning_rate": 2.3464185555606854e-05,
"loss": 0.8044310212135315,
"step": 540
},
{
"epoch": 1.1533546325878594,
"grad_norm": 0.2451171875,
"learning_rate": 2.3413992625143808e-05,
"loss": 0.8448784947395325,
"step": 542
},
{
"epoch": 1.1576144834930777,
"grad_norm": 0.51171875,
"learning_rate": 2.3363680483923794e-05,
"loss": 0.9145954251289368,
"step": 544
},
{
"epoch": 1.161874334398296,
"grad_norm": 0.251953125,
"learning_rate": 2.3313250239770364e-05,
"loss": 0.8059402108192444,
"step": 546
},
{
"epoch": 1.1661341853035143,
"grad_norm": 0.32421875,
"learning_rate": 2.326270300310756e-05,
"loss": 0.910370945930481,
"step": 548
},
{
"epoch": 1.1703940362087326,
"grad_norm": 0.2060546875,
"learning_rate": 2.3212039886935464e-05,
"loss": 0.8459041118621826,
"step": 550
},
{
"epoch": 1.174653887113951,
"grad_norm": 0.23046875,
"learning_rate": 2.3161262006805744e-05,
"loss": 0.8679651618003845,
"step": 552
},
{
"epoch": 1.1789137380191694,
"grad_norm": 0.353515625,
"learning_rate": 2.3110370480797046e-05,
"loss": 0.8998923897743225,
"step": 554
},
{
"epoch": 1.1831735889243877,
"grad_norm": 0.380859375,
"learning_rate": 2.3059366429490382e-05,
"loss": 0.9410486817359924,
"step": 556
},
{
"epoch": 1.187433439829606,
"grad_norm": 0.294921875,
"learning_rate": 2.3008250975944458e-05,
"loss": 0.8485605120658875,
"step": 558
},
{
"epoch": 1.1916932907348243,
"grad_norm": 0.240234375,
"learning_rate": 2.2957025245670945e-05,
"loss": 0.8777744770050049,
"step": 560
},
{
"epoch": 1.1959531416400426,
"grad_norm": 0.40234375,
"learning_rate": 2.2905690366609703e-05,
"loss": 0.9006752967834473,
"step": 562
},
{
"epoch": 1.200212992545261,
"grad_norm": 0.3828125,
"learning_rate": 2.2854247469103943e-05,
"loss": 0.8309807181358337,
"step": 564
},
{
"epoch": 1.2044728434504792,
"grad_norm": 0.26953125,
"learning_rate": 2.280269768587534e-05,
"loss": 0.9057250618934631,
"step": 566
},
{
"epoch": 1.2087326943556975,
"grad_norm": 0.2080078125,
"learning_rate": 2.2751042151999064e-05,
"loss": 0.829549252986908,
"step": 568
},
{
"epoch": 1.2129925452609158,
"grad_norm": 0.2734375,
"learning_rate": 2.2699282004878834e-05,
"loss": 0.9091805219650269,
"step": 570
},
{
"epoch": 1.2172523961661341,
"grad_norm": 0.2490234375,
"learning_rate": 2.264741838422183e-05,
"loss": 0.8178958296775818,
"step": 572
},
{
"epoch": 1.2215122470713524,
"grad_norm": 0.62109375,
"learning_rate": 2.2595452432013637e-05,
"loss": 0.9319694638252258,
"step": 574
},
{
"epoch": 1.225772097976571,
"grad_norm": 0.255859375,
"learning_rate": 2.2543385292493068e-05,
"loss": 0.8054318428039551,
"step": 576
},
{
"epoch": 1.230031948881789,
"grad_norm": 0.37890625,
"learning_rate": 2.2491218112126974e-05,
"loss": 0.8717759847640991,
"step": 578
},
{
"epoch": 1.2342917997870075,
"grad_norm": 0.28515625,
"learning_rate": 2.2438952039585023e-05,
"loss": 0.9084351062774658,
"step": 580
},
{
"epoch": 1.2385516506922258,
"grad_norm": 0.25390625,
"learning_rate": 2.238658822571437e-05,
"loss": 0.877246618270874,
"step": 582
},
{
"epoch": 1.2428115015974441,
"grad_norm": 0.380859375,
"learning_rate": 2.2334127823514353e-05,
"loss": 0.8917878866195679,
"step": 584
},
{
"epoch": 1.2470713525026624,
"grad_norm": 0.2392578125,
"learning_rate": 2.2281571988111087e-05,
"loss": 0.9018102884292603,
"step": 586
},
{
"epoch": 1.2513312034078807,
"grad_norm": 0.181640625,
"learning_rate": 2.222892187673203e-05,
"loss": 0.8929234147071838,
"step": 588
},
{
"epoch": 1.255591054313099,
"grad_norm": 0.291015625,
"learning_rate": 2.2176178648680504e-05,
"loss": 0.9248031973838806,
"step": 590
},
{
"epoch": 1.2598509052183173,
"grad_norm": 0.275390625,
"learning_rate": 2.2123343465310163e-05,
"loss": 0.9204663038253784,
"step": 592
},
{
"epoch": 1.2641107561235356,
"grad_norm": 0.2021484375,
"learning_rate": 2.2070417489999427e-05,
"loss": 0.8040061593055725,
"step": 594
},
{
"epoch": 1.268370607028754,
"grad_norm": 0.349609375,
"learning_rate": 2.201740188812588e-05,
"loss": 0.9146944880485535,
"step": 596
},
{
"epoch": 1.2726304579339724,
"grad_norm": 0.2578125,
"learning_rate": 2.196429782704057e-05,
"loss": 0.8526248931884766,
"step": 598
},
{
"epoch": 1.2768903088391905,
"grad_norm": 0.328125,
"learning_rate": 2.191110647604235e-05,
"loss": 0.8366101384162903,
"step": 600
},
{
"epoch": 1.281150159744409,
"grad_norm": 0.2333984375,
"learning_rate": 2.1857829006352092e-05,
"loss": 0.8716267347335815,
"step": 602
},
{
"epoch": 1.2854100106496273,
"grad_norm": 0.333984375,
"learning_rate": 2.180446659108693e-05,
"loss": 0.9040926694869995,
"step": 604
},
{
"epoch": 1.2896698615548456,
"grad_norm": 0.251953125,
"learning_rate": 2.1751020405234427e-05,
"loss": 0.8583382368087769,
"step": 606
},
{
"epoch": 1.293929712460064,
"grad_norm": 0.314453125,
"learning_rate": 2.1697491625626652e-05,
"loss": 0.8685941696166992,
"step": 608
},
{
"epoch": 1.2981895633652822,
"grad_norm": 0.21484375,
"learning_rate": 2.1643881430914343e-05,
"loss": 0.8654310703277588,
"step": 610
},
{
"epoch": 1.3024494142705005,
"grad_norm": 0.2265625,
"learning_rate": 2.1590191001540903e-05,
"loss": 0.8943390846252441,
"step": 612
},
{
"epoch": 1.3067092651757188,
"grad_norm": 0.26171875,
"learning_rate": 2.153642151971643e-05,
"loss": 0.8576252460479736,
"step": 614
},
{
"epoch": 1.3109691160809371,
"grad_norm": 0.4375,
"learning_rate": 2.1482574169391664e-05,
"loss": 0.8761968612670898,
"step": 616
},
{
"epoch": 1.3152289669861554,
"grad_norm": 0.232421875,
"learning_rate": 2.1428650136231948e-05,
"loss": 0.8207455277442932,
"step": 618
},
{
"epoch": 1.3194888178913737,
"grad_norm": 0.22265625,
"learning_rate": 2.1374650607591106e-05,
"loss": 0.8694437742233276,
"step": 620
},
{
"epoch": 1.323748668796592,
"grad_norm": 0.408203125,
"learning_rate": 2.1320576772485284e-05,
"loss": 0.872995138168335,
"step": 622
},
{
"epoch": 1.3280085197018106,
"grad_norm": 0.30859375,
"learning_rate": 2.126642982156679e-05,
"loss": 0.9666632413864136,
"step": 624
},
{
"epoch": 1.3322683706070286,
"grad_norm": 0.3828125,
"learning_rate": 2.1212210947097873e-05,
"loss": 0.8025370836257935,
"step": 626
},
{
"epoch": 1.3365282215122471,
"grad_norm": 0.349609375,
"learning_rate": 2.1157921342924457e-05,
"loss": 0.8531129956245422,
"step": 628
},
{
"epoch": 1.3407880724174654,
"grad_norm": 0.2734375,
"learning_rate": 2.1103562204449876e-05,
"loss": 0.8310921788215637,
"step": 630
},
{
"epoch": 1.3450479233226837,
"grad_norm": 0.291015625,
"learning_rate": 2.1049134728608537e-05,
"loss": 0.903289794921875,
"step": 632
},
{
"epoch": 1.349307774227902,
"grad_norm": 0.28125,
"learning_rate": 2.0994640113839568e-05,
"loss": 0.8707770705223083,
"step": 634
},
{
"epoch": 1.3535676251331203,
"grad_norm": 0.267578125,
"learning_rate": 2.0940079560060427e-05,
"loss": 0.8999609351158142,
"step": 636
},
{
"epoch": 1.3578274760383386,
"grad_norm": 0.2119140625,
"learning_rate": 2.088545426864048e-05,
"loss": 0.8670209646224976,
"step": 638
},
{
"epoch": 1.362087326943557,
"grad_norm": 0.349609375,
"learning_rate": 2.0830765442374563e-05,
"loss": 0.8102102279663086,
"step": 640
},
{
"epoch": 1.3663471778487752,
"grad_norm": 0.2041015625,
"learning_rate": 2.077601428545648e-05,
"loss": 0.8202542662620544,
"step": 642
},
{
"epoch": 1.3706070287539935,
"grad_norm": 0.234375,
"learning_rate": 2.0721202003452496e-05,
"loss": 0.8944796323776245,
"step": 644
},
{
"epoch": 1.374866879659212,
"grad_norm": 0.208984375,
"learning_rate": 2.066632980327478e-05,
"loss": 0.9467480778694153,
"step": 646
},
{
"epoch": 1.3791267305644301,
"grad_norm": 0.482421875,
"learning_rate": 2.061139889315486e-05,
"loss": 0.8729652762413025,
"step": 648
},
{
"epoch": 1.3833865814696487,
"grad_norm": 0.275390625,
"learning_rate": 2.0556410482616977e-05,
"loss": 0.8954660892486572,
"step": 650
},
{
"epoch": 1.387646432374867,
"grad_norm": 0.2734375,
"learning_rate": 2.050136578245149e-05,
"loss": 0.870725691318512,
"step": 652
},
{
"epoch": 1.3919062832800853,
"grad_norm": 0.251953125,
"learning_rate": 2.0446266004688197e-05,
"loss": 0.8651110529899597,
"step": 654
},
{
"epoch": 1.3961661341853036,
"grad_norm": 0.240234375,
"learning_rate": 2.039111236256964e-05,
"loss": 0.8937119841575623,
"step": 656
},
{
"epoch": 1.4004259850905219,
"grad_norm": 0.2333984375,
"learning_rate": 2.0335906070524416e-05,
"loss": 0.8803120851516724,
"step": 658
},
{
"epoch": 1.4046858359957402,
"grad_norm": 0.2236328125,
"learning_rate": 2.02806483441404e-05,
"loss": 0.8514755368232727,
"step": 660
},
{
"epoch": 1.4089456869009584,
"grad_norm": 0.1630859375,
"learning_rate": 2.0225340400138033e-05,
"loss": 0.8654860258102417,
"step": 662
},
{
"epoch": 1.4132055378061767,
"grad_norm": 0.1904296875,
"learning_rate": 2.0169983456343464e-05,
"loss": 0.861249566078186,
"step": 664
},
{
"epoch": 1.417465388711395,
"grad_norm": 0.48046875,
"learning_rate": 2.011457873166179e-05,
"loss": 0.8996407389640808,
"step": 666
},
{
"epoch": 1.4217252396166133,
"grad_norm": 0.390625,
"learning_rate": 2.005912744605019e-05,
"loss": 0.822201132774353,
"step": 668
},
{
"epoch": 1.4259850905218316,
"grad_norm": 0.294921875,
"learning_rate": 2.0003630820491066e-05,
"loss": 0.8432199358940125,
"step": 670
},
{
"epoch": 1.4302449414270502,
"grad_norm": 0.263671875,
"learning_rate": 1.9948090076965163e-05,
"loss": 0.8672274351119995,
"step": 672
},
{
"epoch": 1.4345047923322682,
"grad_norm": 0.25,
"learning_rate": 1.9892506438424666e-05,
"loss": 0.8486787676811218,
"step": 674
},
{
"epoch": 1.4387646432374868,
"grad_norm": 0.279296875,
"learning_rate": 1.9836881128766248e-05,
"loss": 0.8892148733139038,
"step": 676
},
{
"epoch": 1.443024494142705,
"grad_norm": 0.27734375,
"learning_rate": 1.9781215372804158e-05,
"loss": 0.8915472030639648,
"step": 678
},
{
"epoch": 1.4472843450479234,
"grad_norm": 0.3203125,
"learning_rate": 1.9725510396243226e-05,
"loss": 0.8767306804656982,
"step": 680
},
{
"epoch": 1.4515441959531417,
"grad_norm": 0.421875,
"learning_rate": 1.9669767425651873e-05,
"loss": 1.0251777172088623,
"step": 682
},
{
"epoch": 1.45580404685836,
"grad_norm": 0.32421875,
"learning_rate": 1.9613987688435132e-05,
"loss": 0.8821164965629578,
"step": 684
},
{
"epoch": 1.4600638977635783,
"grad_norm": 0.2099609375,
"learning_rate": 1.955817241280757e-05,
"loss": 0.8836470246315002,
"step": 686
},
{
"epoch": 1.4643237486687966,
"grad_norm": 0.173828125,
"learning_rate": 1.9502322827766297e-05,
"loss": 0.9067674279212952,
"step": 688
},
{
"epoch": 1.4685835995740149,
"grad_norm": 0.2294921875,
"learning_rate": 1.9446440163063875e-05,
"loss": 0.9052207469940186,
"step": 690
},
{
"epoch": 1.4728434504792332,
"grad_norm": 0.271484375,
"learning_rate": 1.939052564918126e-05,
"loss": 0.8458245396614075,
"step": 692
},
{
"epoch": 1.4771033013844517,
"grad_norm": 0.328125,
"learning_rate": 1.9334580517300668e-05,
"loss": 0.9541709423065186,
"step": 694
},
{
"epoch": 1.4813631522896697,
"grad_norm": 0.2255859375,
"learning_rate": 1.9278605999278513e-05,
"loss": 0.9391557574272156,
"step": 696
},
{
"epoch": 1.4856230031948883,
"grad_norm": 0.18359375,
"learning_rate": 1.922260332761827e-05,
"loss": 0.9119634628295898,
"step": 698
},
{
"epoch": 1.4898828541001066,
"grad_norm": 0.3671875,
"learning_rate": 1.9166573735443302e-05,
"loss": 0.872115433216095,
"step": 700
},
{
"epoch": 1.4941427050053249,
"grad_norm": 2.609375,
"learning_rate": 1.9110518456469764e-05,
"loss": 0.9491547346115112,
"step": 702
},
{
"epoch": 1.4984025559105432,
"grad_norm": 0.1826171875,
"learning_rate": 1.905443872497939e-05,
"loss": 0.8039662837982178,
"step": 704
},
{
"epoch": 1.5026624068157615,
"grad_norm": 0.26953125,
"learning_rate": 1.8998335775792343e-05,
"loss": 0.8376708030700684,
"step": 706
},
{
"epoch": 1.5069222577209798,
"grad_norm": 0.2275390625,
"learning_rate": 1.894221084424001e-05,
"loss": 0.8669439554214478,
"step": 708
},
{
"epoch": 1.511182108626198,
"grad_norm": 0.1962890625,
"learning_rate": 1.888606516613781e-05,
"loss": 0.8526804447174072,
"step": 710
},
{
"epoch": 1.5154419595314164,
"grad_norm": 0.2060546875,
"learning_rate": 1.8829899977757996e-05,
"loss": 0.838132381439209,
"step": 712
},
{
"epoch": 1.5197018104366347,
"grad_norm": 0.294921875,
"learning_rate": 1.8773716515802387e-05,
"loss": 0.9030261635780334,
"step": 714
},
{
"epoch": 1.5239616613418532,
"grad_norm": 0.298828125,
"learning_rate": 1.8717516017375192e-05,
"loss": 0.8684689998626709,
"step": 716
},
{
"epoch": 1.5282215122470713,
"grad_norm": 0.208984375,
"learning_rate": 1.866129971995575e-05,
"loss": 0.950151264667511,
"step": 718
},
{
"epoch": 1.5324813631522898,
"grad_norm": 0.4765625,
"learning_rate": 1.8605068861371255e-05,
"loss": 0.9864886403083801,
"step": 720
},
{
"epoch": 1.5367412140575079,
"grad_norm": 0.2421875,
"learning_rate": 1.8548824679769538e-05,
"loss": 0.9203893542289734,
"step": 722
},
{
"epoch": 1.5410010649627264,
"grad_norm": 0.2490234375,
"learning_rate": 1.8492568413591787e-05,
"loss": 0.8589147329330444,
"step": 724
},
{
"epoch": 1.5452609158679447,
"grad_norm": 0.2265625,
"learning_rate": 1.8436301301545282e-05,
"loss": 0.7150123119354248,
"step": 726
},
{
"epoch": 1.549520766773163,
"grad_norm": 0.1796875,
"learning_rate": 1.8380024582576128e-05,
"loss": 0.843291163444519,
"step": 728
},
{
"epoch": 1.5537806176783813,
"grad_norm": 0.267578125,
"learning_rate": 1.8323739495841943e-05,
"loss": 0.8748659491539001,
"step": 730
},
{
"epoch": 1.5580404685835996,
"grad_norm": 0.2314453125,
"learning_rate": 1.8267447280684607e-05,
"loss": 0.8816359043121338,
"step": 732
},
{
"epoch": 1.5623003194888179,
"grad_norm": 0.1611328125,
"learning_rate": 1.8211149176602964e-05,
"loss": 0.9086512923240662,
"step": 734
},
{
"epoch": 1.5665601703940362,
"grad_norm": 0.19140625,
"learning_rate": 1.8154846423225515e-05,
"loss": 0.9282605648040771,
"step": 736
},
{
"epoch": 1.5708200212992547,
"grad_norm": 0.486328125,
"learning_rate": 1.8098540260283158e-05,
"loss": 0.8508008122444153,
"step": 738
},
{
"epoch": 1.5750798722044728,
"grad_norm": 0.2021484375,
"learning_rate": 1.8042231927581833e-05,
"loss": 0.7999932169914246,
"step": 740
},
{
"epoch": 1.5793397231096913,
"grad_norm": 0.341796875,
"learning_rate": 1.7985922664975274e-05,
"loss": 0.9391716718673706,
"step": 742
},
{
"epoch": 1.5835995740149094,
"grad_norm": 0.201171875,
"learning_rate": 1.79296137123377e-05,
"loss": 0.8545106649398804,
"step": 744
},
{
"epoch": 1.5878594249201279,
"grad_norm": 0.197265625,
"learning_rate": 1.7873306309536485e-05,
"loss": 0.8491992950439453,
"step": 746
},
{
"epoch": 1.592119275825346,
"grad_norm": 0.306640625,
"learning_rate": 1.7817001696404894e-05,
"loss": 0.8515585064888,
"step": 748
},
{
"epoch": 1.5963791267305645,
"grad_norm": 0.212890625,
"learning_rate": 1.7760701112714742e-05,
"loss": 0.8558241128921509,
"step": 750
},
{
"epoch": 1.6006389776357828,
"grad_norm": 0.28125,
"learning_rate": 1.7704405798149154e-05,
"loss": 0.8748922944068909,
"step": 752
},
{
"epoch": 1.604898828541001,
"grad_norm": 0.45703125,
"learning_rate": 1.764811699227521e-05,
"loss": 0.881086528301239,
"step": 754
},
{
"epoch": 1.6091586794462194,
"grad_norm": 0.2373046875,
"learning_rate": 1.7591835934516677e-05,
"loss": 0.8601434230804443,
"step": 756
},
{
"epoch": 1.6134185303514377,
"grad_norm": 0.27734375,
"learning_rate": 1.7535563864126723e-05,
"loss": 0.925481915473938,
"step": 758
},
{
"epoch": 1.617678381256656,
"grad_norm": 0.224609375,
"learning_rate": 1.7479302020160627e-05,
"loss": 0.8856874108314514,
"step": 760
},
{
"epoch": 1.6219382321618743,
"grad_norm": 0.6875,
"learning_rate": 1.7423051641448478e-05,
"loss": 0.9088162779808044,
"step": 762
},
{
"epoch": 1.6261980830670928,
"grad_norm": 0.2734375,
"learning_rate": 1.7366813966567914e-05,
"loss": 0.7893877029418945,
"step": 764
},
{
"epoch": 1.6304579339723109,
"grad_norm": 0.3046875,
"learning_rate": 1.7310590233816868e-05,
"loss": 0.8651562929153442,
"step": 766
},
{
"epoch": 1.6347177848775294,
"grad_norm": 0.2470703125,
"learning_rate": 1.7254381681186248e-05,
"loss": 0.8518175482749939,
"step": 768
},
{
"epoch": 1.6389776357827475,
"grad_norm": 0.306640625,
"learning_rate": 1.7198189546332738e-05,
"loss": 0.8798878192901611,
"step": 770
},
{
"epoch": 1.643237486687966,
"grad_norm": 0.248046875,
"learning_rate": 1.7142015066551515e-05,
"loss": 0.815255343914032,
"step": 772
},
{
"epoch": 1.6474973375931843,
"grad_norm": 0.2373046875,
"learning_rate": 1.7085859478748988e-05,
"loss": 0.936029314994812,
"step": 774
},
{
"epoch": 1.6517571884984026,
"grad_norm": 0.2060546875,
"learning_rate": 1.7029724019415604e-05,
"loss": 0.9097844362258911,
"step": 776
},
{
"epoch": 1.6560170394036209,
"grad_norm": 0.29296875,
"learning_rate": 1.6973609924598605e-05,
"loss": 0.8360726833343506,
"step": 778
},
{
"epoch": 1.6602768903088392,
"grad_norm": 0.31640625,
"learning_rate": 1.691751842987478e-05,
"loss": 0.7691276669502258,
"step": 780
},
{
"epoch": 1.6645367412140575,
"grad_norm": 0.412109375,
"learning_rate": 1.6861450770323317e-05,
"loss": 0.9032488465309143,
"step": 782
},
{
"epoch": 1.6687965921192758,
"grad_norm": 0.30859375,
"learning_rate": 1.680540818049856e-05,
"loss": 0.8317678570747375,
"step": 784
},
{
"epoch": 1.6730564430244943,
"grad_norm": 0.455078125,
"learning_rate": 1.674939189440285e-05,
"loss": 0.8583813905715942,
"step": 786
},
{
"epoch": 1.6773162939297124,
"grad_norm": 0.2314453125,
"learning_rate": 1.6693403145459335e-05,
"loss": 0.8612514138221741,
"step": 788
},
{
"epoch": 1.681576144834931,
"grad_norm": 0.19921875,
"learning_rate": 1.6637443166484836e-05,
"loss": 0.8975757360458374,
"step": 790
},
{
"epoch": 1.685835995740149,
"grad_norm": 0.2734375,
"learning_rate": 1.6581513189662684e-05,
"loss": 0.8868735432624817,
"step": 792
},
{
"epoch": 1.6900958466453675,
"grad_norm": 0.251953125,
"learning_rate": 1.652561444651558e-05,
"loss": 0.887550950050354,
"step": 794
},
{
"epoch": 1.6943556975505856,
"grad_norm": 0.25,
"learning_rate": 1.6469748167878502e-05,
"loss": 0.8832526803016663,
"step": 796
},
{
"epoch": 1.698615548455804,
"grad_norm": 0.203125,
"learning_rate": 1.64139155838716e-05,
"loss": 0.8911911845207214,
"step": 798
},
{
"epoch": 1.7028753993610224,
"grad_norm": 0.181640625,
"learning_rate": 1.635811792387308e-05,
"loss": 0.8105019927024841,
"step": 800
},
{
"epoch": 1.7071352502662407,
"grad_norm": 0.337890625,
"learning_rate": 1.630235641649217e-05,
"loss": 0.8116901516914368,
"step": 802
},
{
"epoch": 1.711395101171459,
"grad_norm": 0.2353515625,
"learning_rate": 1.6246632289542054e-05,
"loss": 0.936326801776886,
"step": 804
},
{
"epoch": 1.7156549520766773,
"grad_norm": 0.251953125,
"learning_rate": 1.6190946770012838e-05,
"loss": 0.7342237234115601,
"step": 806
},
{
"epoch": 1.7199148029818956,
"grad_norm": 0.314453125,
"learning_rate": 1.613530108404451e-05,
"loss": 0.8804312944412231,
"step": 808
},
{
"epoch": 1.7241746538871139,
"grad_norm": 0.255859375,
"learning_rate": 1.6079696456899987e-05,
"loss": 0.900128960609436,
"step": 810
},
{
"epoch": 1.7284345047923324,
"grad_norm": 0.2353515625,
"learning_rate": 1.6024134112938102e-05,
"loss": 0.9259054660797119,
"step": 812
},
{
"epoch": 1.7326943556975505,
"grad_norm": 0.353515625,
"learning_rate": 1.5968615275586648e-05,
"loss": 0.7679681777954102,
"step": 814
},
{
"epoch": 1.736954206602769,
"grad_norm": 0.267578125,
"learning_rate": 1.5913141167315455e-05,
"loss": 0.8207501173019409,
"step": 816
},
{
"epoch": 1.741214057507987,
"grad_norm": 0.69140625,
"learning_rate": 1.5857713009609468e-05,
"loss": 0.8840711116790771,
"step": 818
},
{
"epoch": 1.7454739084132056,
"grad_norm": 1.03125,
"learning_rate": 1.5802332022941827e-05,
"loss": 0.87161785364151,
"step": 820
},
{
"epoch": 1.749733759318424,
"grad_norm": 0.1552734375,
"learning_rate": 1.5746999426747028e-05,
"loss": 0.8653435111045837,
"step": 822
},
{
"epoch": 1.7539936102236422,
"grad_norm": 0.39453125,
"learning_rate": 1.5691716439394043e-05,
"loss": 0.8810278177261353,
"step": 824
},
{
"epoch": 1.7582534611288605,
"grad_norm": 0.29296875,
"learning_rate": 1.563648427815953e-05,
"loss": 0.8902249336242676,
"step": 826
},
{
"epoch": 1.7625133120340788,
"grad_norm": 0.32421875,
"learning_rate": 1.558130415920098e-05,
"loss": 0.8972048163414001,
"step": 828
},
{
"epoch": 1.766773162939297,
"grad_norm": 0.2412109375,
"learning_rate": 1.552617729752998e-05,
"loss": 0.8320347666740417,
"step": 830
},
{
"epoch": 1.7710330138445154,
"grad_norm": 0.1982421875,
"learning_rate": 1.5471104906985447e-05,
"loss": 0.8805668354034424,
"step": 832
},
{
"epoch": 1.775292864749734,
"grad_norm": 0.146484375,
"learning_rate": 1.5416088200206873e-05,
"loss": 0.8669639229774475,
"step": 834
},
{
"epoch": 1.779552715654952,
"grad_norm": 0.2412109375,
"learning_rate": 1.5361128388607685e-05,
"loss": 0.8641019463539124,
"step": 836
},
{
"epoch": 1.7838125665601705,
"grad_norm": 0.2080078125,
"learning_rate": 1.5306226682348513e-05,
"loss": 0.8257539868354797,
"step": 838
},
{
"epoch": 1.7880724174653886,
"grad_norm": 0.2353515625,
"learning_rate": 1.525138429031056e-05,
"loss": 0.8225594758987427,
"step": 840
},
{
"epoch": 1.792332268370607,
"grad_norm": 0.2734375,
"learning_rate": 1.5196602420068995e-05,
"loss": 0.8701678514480591,
"step": 842
},
{
"epoch": 1.7965921192758252,
"grad_norm": 0.28515625,
"learning_rate": 1.514188227786637e-05,
"loss": 0.8979432582855225,
"step": 844
},
{
"epoch": 1.8008519701810437,
"grad_norm": 0.26171875,
"learning_rate": 1.5087225068586032e-05,
"loss": 0.8577451109886169,
"step": 846
},
{
"epoch": 1.805111821086262,
"grad_norm": 0.375,
"learning_rate": 1.5032631995725602e-05,
"loss": 0.7677904367446899,
"step": 848
},
{
"epoch": 1.8093716719914803,
"grad_norm": 0.2138671875,
"learning_rate": 1.4978104261370499e-05,
"loss": 0.8740429878234863,
"step": 850
},
{
"epoch": 1.8136315228966986,
"grad_norm": 0.2080078125,
"learning_rate": 1.4923643066167442e-05,
"loss": 0.8772373795509338,
"step": 852
},
{
"epoch": 1.817891373801917,
"grad_norm": 0.1923828125,
"learning_rate": 1.4869249609298016e-05,
"loss": 0.8475224375724792,
"step": 854
},
{
"epoch": 1.8221512247071352,
"grad_norm": 0.2294921875,
"learning_rate": 1.4814925088452294e-05,
"loss": 0.8336386680603027,
"step": 856
},
{
"epoch": 1.8264110756123535,
"grad_norm": 0.26171875,
"learning_rate": 1.4760670699802433e-05,
"loss": 0.8594604730606079,
"step": 858
},
{
"epoch": 1.830670926517572,
"grad_norm": 0.345703125,
"learning_rate": 1.4706487637976349e-05,
"loss": 0.8947794437408447,
"step": 860
},
{
"epoch": 1.83493077742279,
"grad_norm": 0.181640625,
"learning_rate": 1.4652377096031413e-05,
"loss": 0.8090410828590393,
"step": 862
},
{
"epoch": 1.8391906283280086,
"grad_norm": 0.255859375,
"learning_rate": 1.4598340265428186e-05,
"loss": 0.8447999954223633,
"step": 864
},
{
"epoch": 1.8434504792332267,
"grad_norm": 0.1728515625,
"learning_rate": 1.4544378336004174e-05,
"loss": 0.8753990530967712,
"step": 866
},
{
"epoch": 1.8477103301384452,
"grad_norm": 0.291015625,
"learning_rate": 1.4490492495947626e-05,
"loss": 0.8337631225585938,
"step": 868
},
{
"epoch": 1.8519701810436635,
"grad_norm": 0.515625,
"learning_rate": 1.4436683931771386e-05,
"loss": 0.8855006098747253,
"step": 870
},
{
"epoch": 1.8562300319488818,
"grad_norm": 0.25,
"learning_rate": 1.4382953828286769e-05,
"loss": 0.8446431756019592,
"step": 872
},
{
"epoch": 1.8604898828541,
"grad_norm": 0.1845703125,
"learning_rate": 1.4329303368577442e-05,
"loss": 0.9195294976234436,
"step": 874
},
{
"epoch": 1.8647497337593184,
"grad_norm": 0.2197265625,
"learning_rate": 1.4275733733973408e-05,
"loss": 0.8846089243888855,
"step": 876
},
{
"epoch": 1.8690095846645367,
"grad_norm": 0.310546875,
"learning_rate": 1.4222246104024985e-05,
"loss": 0.8711283802986145,
"step": 878
},
{
"epoch": 1.873269435569755,
"grad_norm": 0.2421875,
"learning_rate": 1.4168841656476817e-05,
"loss": 0.8777478337287903,
"step": 880
},
{
"epoch": 1.8775292864749735,
"grad_norm": 0.2060546875,
"learning_rate": 1.411552156724196e-05,
"loss": 0.9211516976356506,
"step": 882
},
{
"epoch": 1.8817891373801916,
"grad_norm": 0.2060546875,
"learning_rate": 1.4062287010375991e-05,
"loss": 0.7991109490394592,
"step": 884
},
{
"epoch": 1.8860489882854101,
"grad_norm": 0.267578125,
"learning_rate": 1.4009139158051142e-05,
"loss": 0.7523772120475769,
"step": 886
},
{
"epoch": 1.8903088391906282,
"grad_norm": 0.306640625,
"learning_rate": 1.3956079180530488e-05,
"loss": 0.8029102087020874,
"step": 888
},
{
"epoch": 1.8945686900958467,
"grad_norm": 1.09375,
"learning_rate": 1.3903108246142204e-05,
"loss": 0.9185020923614502,
"step": 890
},
{
"epoch": 1.898828541001065,
"grad_norm": 0.185546875,
"learning_rate": 1.3850227521253819e-05,
"loss": 0.8490954041481018,
"step": 892
},
{
"epoch": 1.9030883919062833,
"grad_norm": 0.2490234375,
"learning_rate": 1.379743817024653e-05,
"loss": 0.9293335676193237,
"step": 894
},
{
"epoch": 1.9073482428115016,
"grad_norm": 0.26953125,
"learning_rate": 1.3744741355489573e-05,
"loss": 0.83982253074646,
"step": 896
},
{
"epoch": 1.91160809371672,
"grad_norm": 1.0703125,
"learning_rate": 1.3692138237314642e-05,
"loss": 0.8462101817131042,
"step": 898
},
{
"epoch": 1.9158679446219382,
"grad_norm": 0.296875,
"learning_rate": 1.3639629973990308e-05,
"loss": 0.8812525272369385,
"step": 900
},
{
"epoch": 1.9201277955271565,
"grad_norm": 0.2021484375,
"learning_rate": 1.3587217721696534e-05,
"loss": 0.8216854929924011,
"step": 902
},
{
"epoch": 1.924387646432375,
"grad_norm": 0.267578125,
"learning_rate": 1.3534902634499233e-05,
"loss": 0.8462478518486023,
"step": 904
},
{
"epoch": 1.928647497337593,
"grad_norm": 0.2099609375,
"learning_rate": 1.3482685864324816e-05,
"loss": 0.8769442439079285,
"step": 906
},
{
"epoch": 1.9329073482428116,
"grad_norm": 0.2890625,
"learning_rate": 1.3430568560934854e-05,
"loss": 0.8453910946846008,
"step": 908
},
{
"epoch": 1.9371671991480297,
"grad_norm": 0.236328125,
"learning_rate": 1.3378551871900778e-05,
"loss": 0.7549237012863159,
"step": 910
},
{
"epoch": 1.9414270500532482,
"grad_norm": 0.2734375,
"learning_rate": 1.332663694257857e-05,
"loss": 0.8484979867935181,
"step": 912
},
{
"epoch": 1.9456869009584663,
"grad_norm": 0.33203125,
"learning_rate": 1.3274824916083569e-05,
"loss": 0.8290879130363464,
"step": 914
},
{
"epoch": 1.9499467518636848,
"grad_norm": 0.314453125,
"learning_rate": 1.3223116933265295e-05,
"loss": 0.880619466304779,
"step": 916
},
{
"epoch": 1.9542066027689031,
"grad_norm": 0.205078125,
"learning_rate": 1.3171514132682338e-05,
"loss": 0.8705392479896545,
"step": 918
},
{
"epoch": 1.9584664536741214,
"grad_norm": 0.2333984375,
"learning_rate": 1.3120017650577267e-05,
"loss": 0.849368691444397,
"step": 920
},
{
"epoch": 1.9627263045793397,
"grad_norm": 1.046875,
"learning_rate": 1.3068628620851627e-05,
"loss": 0.8190315961837769,
"step": 922
},
{
"epoch": 1.966986155484558,
"grad_norm": 0.390625,
"learning_rate": 1.3017348175040983e-05,
"loss": 0.8338907361030579,
"step": 924
},
{
"epoch": 1.9712460063897763,
"grad_norm": 0.2294921875,
"learning_rate": 1.2966177442289958e-05,
"loss": 0.783728837966919,
"step": 926
},
{
"epoch": 1.9755058572949946,
"grad_norm": 0.16796875,
"learning_rate": 1.2915117549327428e-05,
"loss": 0.8934606313705444,
"step": 928
},
{
"epoch": 1.9797657082002131,
"grad_norm": 0.458984375,
"learning_rate": 1.2864169620441688e-05,
"loss": 0.8038821220397949,
"step": 930
},
{
"epoch": 1.9840255591054312,
"grad_norm": 0.349609375,
"learning_rate": 1.2813334777455677e-05,
"loss": 0.9299109578132629,
"step": 932
},
{
"epoch": 1.9882854100106497,
"grad_norm": 0.1748046875,
"learning_rate": 1.27626141397023e-05,
"loss": 0.7765668034553528,
"step": 934
},
{
"epoch": 1.9925452609158678,
"grad_norm": 0.224609375,
"learning_rate": 1.2712008823999787e-05,
"loss": 0.8893784284591675,
"step": 936
},
{
"epoch": 1.9968051118210863,
"grad_norm": 0.2275390625,
"learning_rate": 1.2661519944627085e-05,
"loss": 0.8529191017150879,
"step": 938
},
{
"epoch": 2.0,
"grad_norm": 0.365234375,
"learning_rate": 1.2611148613299316e-05,
"loss": 0.8112186789512634,
"step": 940
},
{
"epoch": 2.0042598509052185,
"grad_norm": 0.1748046875,
"learning_rate": 1.2560895939143335e-05,
"loss": 0.6377139687538147,
"step": 942
},
{
"epoch": 2.0085197018104366,
"grad_norm": 0.15625,
"learning_rate": 1.2510763028673259e-05,
"loss": 0.5881322026252747,
"step": 944
},
{
"epoch": 2.012779552715655,
"grad_norm": 0.16015625,
"learning_rate": 1.2460750985766133e-05,
"loss": 0.5497787594795227,
"step": 946
},
{
"epoch": 2.017039403620873,
"grad_norm": 0.27734375,
"learning_rate": 1.2410860911637633e-05,
"loss": 0.6513974070549011,
"step": 948
},
{
"epoch": 2.0212992545260917,
"grad_norm": 0.16796875,
"learning_rate": 1.2361093904817794e-05,
"loss": 0.6880634427070618,
"step": 950
},
{
"epoch": 2.02555910543131,
"grad_norm": 0.2470703125,
"learning_rate": 1.2311451061126825e-05,
"loss": 0.669802188873291,
"step": 952
},
{
"epoch": 2.0298189563365283,
"grad_norm": 0.2451171875,
"learning_rate": 1.2261933473650986e-05,
"loss": 0.6532925963401794,
"step": 954
},
{
"epoch": 2.0340788072417464,
"grad_norm": 0.216796875,
"learning_rate": 1.2212542232718526e-05,
"loss": 0.6424761414527893,
"step": 956
},
{
"epoch": 2.038338658146965,
"grad_norm": 0.3046875,
"learning_rate": 1.2163278425875673e-05,
"loss": 0.599922776222229,
"step": 958
},
{
"epoch": 2.042598509052183,
"grad_norm": 0.361328125,
"learning_rate": 1.211414313786267e-05,
"loss": 0.5999573469161987,
"step": 960
},
{
"epoch": 2.0468583599574015,
"grad_norm": 0.341796875,
"learning_rate": 1.2065137450589902e-05,
"loss": 0.5664547681808472,
"step": 962
},
{
"epoch": 2.0511182108626196,
"grad_norm": 0.27734375,
"learning_rate": 1.2016262443114092e-05,
"loss": 0.6771121025085449,
"step": 964
},
{
"epoch": 2.055378061767838,
"grad_norm": 0.251953125,
"learning_rate": 1.19675191916145e-05,
"loss": 0.6011976003646851,
"step": 966
},
{
"epoch": 2.0596379126730566,
"grad_norm": 0.45703125,
"learning_rate": 1.1918908769369263e-05,
"loss": 0.624125599861145,
"step": 968
},
{
"epoch": 2.0638977635782747,
"grad_norm": 0.52734375,
"learning_rate": 1.187043224673176e-05,
"loss": 0.5838209390640259,
"step": 970
},
{
"epoch": 2.0681576144834932,
"grad_norm": 0.4765625,
"learning_rate": 1.1822090691107007e-05,
"loss": 0.6163349151611328,
"step": 972
},
{
"epoch": 2.0724174653887113,
"grad_norm": 0.2578125,
"learning_rate": 1.1773885166928193e-05,
"loss": 0.6664748787879944,
"step": 974
},
{
"epoch": 2.07667731629393,
"grad_norm": 0.26171875,
"learning_rate": 1.1725816735633235e-05,
"loss": 0.6090631484985352,
"step": 976
},
{
"epoch": 2.080937167199148,
"grad_norm": 0.2734375,
"learning_rate": 1.1677886455641398e-05,
"loss": 0.6150251030921936,
"step": 978
},
{
"epoch": 2.0851970181043664,
"grad_norm": 0.59375,
"learning_rate": 1.1630095382329988e-05,
"loss": 0.6834192872047424,
"step": 980
},
{
"epoch": 2.0894568690095845,
"grad_norm": 0.263671875,
"learning_rate": 1.158244456801111e-05,
"loss": 0.5855680108070374,
"step": 982
},
{
"epoch": 2.093716719914803,
"grad_norm": 0.26171875,
"learning_rate": 1.1534935061908528e-05,
"loss": 0.6290924549102783,
"step": 984
},
{
"epoch": 2.097976570820021,
"grad_norm": 0.27734375,
"learning_rate": 1.1487567910134513e-05,
"loss": 0.5710505247116089,
"step": 986
},
{
"epoch": 2.1022364217252396,
"grad_norm": 0.3203125,
"learning_rate": 1.1440344155666851e-05,
"loss": 0.6610984802246094,
"step": 988
},
{
"epoch": 2.106496272630458,
"grad_norm": 0.2138671875,
"learning_rate": 1.1393264838325865e-05,
"loss": 0.6294957995414734,
"step": 990
},
{
"epoch": 2.110756123535676,
"grad_norm": 0.46484375,
"learning_rate": 1.1346330994751497e-05,
"loss": 0.6489307880401611,
"step": 992
},
{
"epoch": 2.1150159744408947,
"grad_norm": 0.287109375,
"learning_rate": 1.1299543658380509e-05,
"loss": 0.5717250108718872,
"step": 994
},
{
"epoch": 2.119275825346113,
"grad_norm": 0.236328125,
"learning_rate": 1.1252903859423728e-05,
"loss": 0.5853033065795898,
"step": 996
},
{
"epoch": 2.1235356762513313,
"grad_norm": 0.2216796875,
"learning_rate": 1.120641262484335e-05,
"loss": 0.608925461769104,
"step": 998
},
{
"epoch": 2.1277955271565494,
"grad_norm": 0.216796875,
"learning_rate": 1.1160070978330323e-05,
"loss": 0.6262862086296082,
"step": 1000
},
{
"epoch": 2.132055378061768,
"grad_norm": 0.30078125,
"learning_rate": 1.1113879940281813e-05,
"loss": 0.5531333088874817,
"step": 1002
},
{
"epoch": 2.136315228966986,
"grad_norm": 0.302734375,
"learning_rate": 1.1067840527778752e-05,
"loss": 0.6142609119415283,
"step": 1004
},
{
"epoch": 2.1405750798722045,
"grad_norm": 0.271484375,
"learning_rate": 1.1021953754563406e-05,
"loss": 0.6254585981369019,
"step": 1006
},
{
"epoch": 2.1448349307774226,
"grad_norm": 0.224609375,
"learning_rate": 1.0976220631017094e-05,
"loss": 0.648613691329956,
"step": 1008
},
{
"epoch": 2.149094781682641,
"grad_norm": 0.400390625,
"learning_rate": 1.0930642164137922e-05,
"loss": 0.4957270324230194,
"step": 1010
},
{
"epoch": 2.1533546325878596,
"grad_norm": 0.2353515625,
"learning_rate": 1.0885219357518583e-05,
"loss": 0.6625660061836243,
"step": 1012
},
{
"epoch": 2.1576144834930777,
"grad_norm": 0.21875,
"learning_rate": 1.0839953211324313e-05,
"loss": 0.6448312401771545,
"step": 1014
},
{
"epoch": 2.1618743343982962,
"grad_norm": 0.310546875,
"learning_rate": 1.0794844722270831e-05,
"loss": 0.6265139579772949,
"step": 1016
},
{
"epoch": 2.1661341853035143,
"grad_norm": 0.3515625,
"learning_rate": 1.0749894883602406e-05,
"loss": 0.58893221616745,
"step": 1018
},
{
"epoch": 2.170394036208733,
"grad_norm": 0.5234375,
"learning_rate": 1.0705104685069973e-05,
"loss": 0.524358332157135,
"step": 1020
},
{
"epoch": 2.174653887113951,
"grad_norm": 0.392578125,
"learning_rate": 1.0660475112909354e-05,
"loss": 0.6041074395179749,
"step": 1022
},
{
"epoch": 2.1789137380191694,
"grad_norm": 0.25,
"learning_rate": 1.0616007149819543e-05,
"loss": 0.6296215653419495,
"step": 1024
},
{
"epoch": 2.1831735889243875,
"grad_norm": 0.37890625,
"learning_rate": 1.057170177494105e-05,
"loss": 0.6504489779472351,
"step": 1026
},
{
"epoch": 2.187433439829606,
"grad_norm": 0.361328125,
"learning_rate": 1.052755996383437e-05,
"loss": 0.6803485155105591,
"step": 1028
},
{
"epoch": 2.191693290734824,
"grad_norm": 0.2333984375,
"learning_rate": 1.0483582688458472e-05,
"loss": 0.6579641699790955,
"step": 1030
},
{
"epoch": 2.1959531416400426,
"grad_norm": 0.5234375,
"learning_rate": 1.0439770917149414e-05,
"loss": 0.6605786085128784,
"step": 1032
},
{
"epoch": 2.2002129925452607,
"grad_norm": 0.189453125,
"learning_rate": 1.0396125614599018e-05,
"loss": 0.6570585370063782,
"step": 1034
},
{
"epoch": 2.2044728434504792,
"grad_norm": 0.337890625,
"learning_rate": 1.0352647741833637e-05,
"loss": 0.6363896131515503,
"step": 1036
},
{
"epoch": 2.2087326943556977,
"grad_norm": 0.296875,
"learning_rate": 1.0309338256192982e-05,
"loss": 0.6393426656723022,
"step": 1038
},
{
"epoch": 2.212992545260916,
"grad_norm": 0.345703125,
"learning_rate": 1.0266198111309041e-05,
"loss": 0.7091052532196045,
"step": 1040
},
{
"epoch": 2.2172523961661343,
"grad_norm": 0.75,
"learning_rate": 1.0223228257085083e-05,
"loss": 0.6515456438064575,
"step": 1042
},
{
"epoch": 2.2215122470713524,
"grad_norm": 0.244140625,
"learning_rate": 1.0180429639674761e-05,
"loss": 0.6235453486442566,
"step": 1044
},
{
"epoch": 2.225772097976571,
"grad_norm": 0.263671875,
"learning_rate": 1.0137803201461248e-05,
"loss": 0.5850796699523926,
"step": 1046
},
{
"epoch": 2.230031948881789,
"grad_norm": 0.443359375,
"learning_rate": 1.0095349881036508e-05,
"loss": 0.5203170776367188,
"step": 1048
},
{
"epoch": 2.2342917997870075,
"grad_norm": 0.72265625,
"learning_rate": 1.0053070613180625e-05,
"loss": 0.6159985065460205,
"step": 1050
},
{
"epoch": 2.2385516506922256,
"grad_norm": 0.203125,
"learning_rate": 1.0010966328841206e-05,
"loss": 0.6239602565765381,
"step": 1052
},
{
"epoch": 2.242811501597444,
"grad_norm": 0.265625,
"learning_rate": 9.969037955112908e-06,
"loss": 0.6027981042861938,
"step": 1054
},
{
"epoch": 2.247071352502662,
"grad_norm": 0.30078125,
"learning_rate": 9.927286415217005e-06,
"loss": 0.591469407081604,
"step": 1056
},
{
"epoch": 2.2513312034078807,
"grad_norm": 0.1982421875,
"learning_rate": 9.88571262848107e-06,
"loss": 0.5683766007423401,
"step": 1058
},
{
"epoch": 2.255591054313099,
"grad_norm": 0.35546875,
"learning_rate": 9.844317510318719e-06,
"loss": 0.6158217191696167,
"step": 1060
},
{
"epoch": 2.2598509052183173,
"grad_norm": 0.60546875,
"learning_rate": 9.803101972209462e-06,
"loss": 0.5769312381744385,
"step": 1062
},
{
"epoch": 2.264110756123536,
"grad_norm": 0.24609375,
"learning_rate": 9.762066921678647e-06,
"loss": 0.5810741186141968,
"step": 1064
},
{
"epoch": 2.268370607028754,
"grad_norm": 0.28125,
"learning_rate": 9.721213262277447e-06,
"loss": 0.5853366255760193,
"step": 1066
},
{
"epoch": 2.2726304579339724,
"grad_norm": 0.2314453125,
"learning_rate": 9.680541893563e-06,
"loss": 0.5754764676094055,
"step": 1068
},
{
"epoch": 2.2768903088391905,
"grad_norm": 0.376953125,
"learning_rate": 9.640053711078571e-06,
"loss": 0.6414265632629395,
"step": 1070
},
{
"epoch": 2.281150159744409,
"grad_norm": 0.703125,
"learning_rate": 9.599749606333844e-06,
"loss": 0.5730122327804565,
"step": 1072
},
{
"epoch": 2.285410010649627,
"grad_norm": 0.224609375,
"learning_rate": 9.559630466785301e-06,
"loss": 0.6548243761062622,
"step": 1074
},
{
"epoch": 2.2896698615548456,
"grad_norm": 0.359375,
"learning_rate": 9.519697175816675e-06,
"loss": 0.6757615804672241,
"step": 1076
},
{
"epoch": 2.2939297124600637,
"grad_norm": 0.271484375,
"learning_rate": 9.4799506127195e-06,
"loss": 0.6540831923484802,
"step": 1078
},
{
"epoch": 2.2981895633652822,
"grad_norm": 0.5390625,
"learning_rate": 9.44039165267372e-06,
"loss": 0.5985897779464722,
"step": 1080
},
{
"epoch": 2.3024494142705008,
"grad_norm": 0.318359375,
"learning_rate": 9.40102116672848e-06,
"loss": 0.6373129487037659,
"step": 1082
},
{
"epoch": 2.306709265175719,
"grad_norm": 0.32421875,
"learning_rate": 9.361840021782899e-06,
"loss": 0.5798696279525757,
"step": 1084
},
{
"epoch": 2.3109691160809374,
"grad_norm": 0.53125,
"learning_rate": 9.322849080566986e-06,
"loss": 0.6472339034080505,
"step": 1086
},
{
"epoch": 2.3152289669861554,
"grad_norm": 0.2236328125,
"learning_rate": 9.284049201622668e-06,
"loss": 0.5931280851364136,
"step": 1088
},
{
"epoch": 2.319488817891374,
"grad_norm": 0.26171875,
"learning_rate": 9.245441239284858e-06,
"loss": 0.6150895953178406,
"step": 1090
},
{
"epoch": 2.323748668796592,
"grad_norm": 0.37890625,
"learning_rate": 9.207026043662654e-06,
"loss": 0.5743486285209656,
"step": 1092
},
{
"epoch": 2.3280085197018106,
"grad_norm": 0.302734375,
"learning_rate": 9.168804460620634e-06,
"loss": 0.6586934328079224,
"step": 1094
},
{
"epoch": 2.3322683706070286,
"grad_norm": 0.333984375,
"learning_rate": 9.130777331760208e-06,
"loss": 0.581457793712616,
"step": 1096
},
{
"epoch": 2.336528221512247,
"grad_norm": 0.236328125,
"learning_rate": 9.092945494401107e-06,
"loss": 0.602104663848877,
"step": 1098
},
{
"epoch": 2.3407880724174652,
"grad_norm": 0.318359375,
"learning_rate": 9.055309781562922e-06,
"loss": 0.5987313985824585,
"step": 1100
},
{
"epoch": 2.3450479233226837,
"grad_norm": 0.212890625,
"learning_rate": 9.017871021946787e-06,
"loss": 0.5123194456100464,
"step": 1102
},
{
"epoch": 2.349307774227902,
"grad_norm": 0.494140625,
"learning_rate": 8.980630039917124e-06,
"loss": 0.5810441374778748,
"step": 1104
},
{
"epoch": 2.3535676251331203,
"grad_norm": 0.5390625,
"learning_rate": 8.943587655483478e-06,
"loss": 0.5871768593788147,
"step": 1106
},
{
"epoch": 2.357827476038339,
"grad_norm": 0.2412109375,
"learning_rate": 8.906744684282483e-06,
"loss": 0.6104775667190552,
"step": 1108
},
{
"epoch": 2.362087326943557,
"grad_norm": 0.296875,
"learning_rate": 8.870101937559877e-06,
"loss": 0.6351394653320312,
"step": 1110
},
{
"epoch": 2.3663471778487755,
"grad_norm": 0.52734375,
"learning_rate": 8.833660222152663e-06,
"loss": 0.6355900168418884,
"step": 1112
},
{
"epoch": 2.3706070287539935,
"grad_norm": 0.2294921875,
"learning_rate": 8.797420340471334e-06,
"loss": 0.5833765268325806,
"step": 1114
},
{
"epoch": 2.374866879659212,
"grad_norm": 0.2216796875,
"learning_rate": 8.761383090482205e-06,
"loss": 0.6019313931465149,
"step": 1116
},
{
"epoch": 2.37912673056443,
"grad_norm": 0.2265625,
"learning_rate": 8.725549265689833e-06,
"loss": 0.5999468564987183,
"step": 1118
},
{
"epoch": 2.3833865814696487,
"grad_norm": 0.484375,
"learning_rate": 8.689919655119559e-06,
"loss": 0.6521666646003723,
"step": 1120
},
{
"epoch": 2.3876464323748667,
"grad_norm": 0.224609375,
"learning_rate": 8.654495043300129e-06,
"loss": 0.612395703792572,
"step": 1122
},
{
"epoch": 2.3919062832800853,
"grad_norm": 0.466796875,
"learning_rate": 8.619276210246427e-06,
"loss": 0.5964239239692688,
"step": 1124
},
{
"epoch": 2.3961661341853033,
"grad_norm": 0.26171875,
"learning_rate": 8.584263931442275e-06,
"loss": 0.6384221911430359,
"step": 1126
},
{
"epoch": 2.400425985090522,
"grad_norm": 0.28125,
"learning_rate": 8.549458977823395e-06,
"loss": 0.6933798789978027,
"step": 1128
},
{
"epoch": 2.40468583599574,
"grad_norm": 0.1962890625,
"learning_rate": 8.514862115760396e-06,
"loss": 0.5889874696731567,
"step": 1130
},
{
"epoch": 2.4089456869009584,
"grad_norm": 0.30859375,
"learning_rate": 8.480474107041925e-06,
"loss": 0.6254542469978333,
"step": 1132
},
{
"epoch": 2.413205537806177,
"grad_norm": 0.314453125,
"learning_rate": 8.446295708857888e-06,
"loss": 0.6616327166557312,
"step": 1134
},
{
"epoch": 2.417465388711395,
"grad_norm": 0.375,
"learning_rate": 8.412327673782774e-06,
"loss": 0.6202198266983032,
"step": 1136
},
{
"epoch": 2.4217252396166136,
"grad_norm": 0.267578125,
"learning_rate": 8.378570749759076e-06,
"loss": 0.6176246404647827,
"step": 1138
},
{
"epoch": 2.4259850905218316,
"grad_norm": 0.369140625,
"learning_rate": 8.345025680080836e-06,
"loss": 0.5884604454040527,
"step": 1140
},
{
"epoch": 2.43024494142705,
"grad_norm": 0.4453125,
"learning_rate": 8.311693203377277e-06,
"loss": 0.5704495906829834,
"step": 1142
},
{
"epoch": 2.4345047923322682,
"grad_norm": 0.357421875,
"learning_rate": 8.278574053596534e-06,
"loss": 0.5104537606239319,
"step": 1144
},
{
"epoch": 2.4387646432374868,
"grad_norm": 0.375,
"learning_rate": 8.245668959989489e-06,
"loss": 0.6920484900474548,
"step": 1146
},
{
"epoch": 2.443024494142705,
"grad_norm": 0.3359375,
"learning_rate": 8.212978647093724e-06,
"loss": 0.605790376663208,
"step": 1148
},
{
"epoch": 2.4472843450479234,
"grad_norm": 0.431640625,
"learning_rate": 8.180503834717563e-06,
"loss": 0.6005589962005615,
"step": 1150
},
{
"epoch": 2.451544195953142,
"grad_norm": 0.30078125,
"learning_rate": 8.148245237924212e-06,
"loss": 0.6908122301101685,
"step": 1152
},
{
"epoch": 2.45580404685836,
"grad_norm": 0.392578125,
"learning_rate": 8.116203567016035e-06,
"loss": 0.5939027667045593,
"step": 1154
},
{
"epoch": 2.460063897763578,
"grad_norm": 0.357421875,
"learning_rate": 8.084379527518908e-06,
"loss": 0.6245042681694031,
"step": 1156
},
{
"epoch": 2.4643237486687966,
"grad_norm": 0.25390625,
"learning_rate": 8.05277382016666e-06,
"loss": 0.5638337731361389,
"step": 1158
},
{
"epoch": 2.468583599574015,
"grad_norm": 0.2890625,
"learning_rate": 8.021387140885672e-06,
"loss": 0.665945291519165,
"step": 1160
},
{
"epoch": 2.472843450479233,
"grad_norm": 0.2255859375,
"learning_rate": 7.99022018077955e-06,
"loss": 0.5603002309799194,
"step": 1162
},
{
"epoch": 2.4771033013844517,
"grad_norm": 0.287109375,
"learning_rate": 7.959273626113896e-06,
"loss": 0.5992410182952881,
"step": 1164
},
{
"epoch": 2.4813631522896697,
"grad_norm": 0.216796875,
"learning_rate": 7.9285481583012e-06,
"loss": 0.6628497242927551,
"step": 1166
},
{
"epoch": 2.4856230031948883,
"grad_norm": 0.984375,
"learning_rate": 7.898044453885837e-06,
"loss": 0.5260273218154907,
"step": 1168
},
{
"epoch": 2.4898828541001063,
"grad_norm": 0.265625,
"learning_rate": 7.867763184529182e-06,
"loss": 0.6244964599609375,
"step": 1170
},
{
"epoch": 2.494142705005325,
"grad_norm": 0.369140625,
"learning_rate": 7.837705016994796e-06,
"loss": 0.6657370328903198,
"step": 1172
},
{
"epoch": 2.498402555910543,
"grad_norm": 0.333984375,
"learning_rate": 7.80787061313377e-06,
"loss": 0.6410002708435059,
"step": 1174
},
{
"epoch": 2.5026624068157615,
"grad_norm": 0.1953125,
"learning_rate": 7.77826062987014e-06,
"loss": 0.5408449769020081,
"step": 1176
},
{
"epoch": 2.50692225772098,
"grad_norm": 0.35546875,
"learning_rate": 7.748875719186413e-06,
"loss": 0.5735031962394714,
"step": 1178
},
{
"epoch": 2.511182108626198,
"grad_norm": 0.322265625,
"learning_rate": 7.71971652810923e-06,
"loss": 0.6153873801231384,
"step": 1180
},
{
"epoch": 2.515441959531416,
"grad_norm": 0.33984375,
"learning_rate": 7.690783698695106e-06,
"loss": 0.5873544216156006,
"step": 1182
},
{
"epoch": 2.5197018104366347,
"grad_norm": 0.259765625,
"learning_rate": 7.662077868016297e-06,
"loss": 0.6717422604560852,
"step": 1184
},
{
"epoch": 2.523961661341853,
"grad_norm": 0.94921875,
"learning_rate": 7.633599668146775e-06,
"loss": 0.6083505153656006,
"step": 1186
},
{
"epoch": 2.5282215122470713,
"grad_norm": 0.30078125,
"learning_rate": 7.605349726148296e-06,
"loss": 0.6134154200553894,
"step": 1188
},
{
"epoch": 2.5324813631522898,
"grad_norm": 0.275390625,
"learning_rate": 7.577328664056617e-06,
"loss": 0.589963972568512,
"step": 1190
},
{
"epoch": 2.536741214057508,
"grad_norm": 0.51171875,
"learning_rate": 7.549537098867776e-06,
"loss": 0.5288025140762329,
"step": 1192
},
{
"epoch": 2.5410010649627264,
"grad_norm": 0.2451171875,
"learning_rate": 7.521975642524525e-06,
"loss": 0.616111159324646,
"step": 1194
},
{
"epoch": 2.545260915867945,
"grad_norm": 0.3046875,
"learning_rate": 7.494644901902843e-06,
"loss": 0.6015118360519409,
"step": 1196
},
{
"epoch": 2.549520766773163,
"grad_norm": 0.30078125,
"learning_rate": 7.467545478798574e-06,
"loss": 0.5770639777183533,
"step": 1198
},
{
"epoch": 2.553780617678381,
"grad_norm": 0.365234375,
"learning_rate": 7.440677969914182e-06,
"loss": 0.6590741872787476,
"step": 1200
},
{
"epoch": 2.5580404685835996,
"grad_norm": 0.2392578125,
"learning_rate": 7.4140429668456115e-06,
"loss": 0.47983720898628235,
"step": 1202
},
{
"epoch": 2.562300319488818,
"grad_norm": 0.31640625,
"learning_rate": 7.38764105606926e-06,
"loss": 0.549656093120575,
"step": 1204
},
{
"epoch": 2.566560170394036,
"grad_norm": 1.1640625,
"learning_rate": 7.361472818929058e-06,
"loss": 0.5793447494506836,
"step": 1206
},
{
"epoch": 2.5708200212992547,
"grad_norm": 0.251953125,
"learning_rate": 7.335538831623676e-06,
"loss": 0.637956976890564,
"step": 1208
},
{
"epoch": 2.5750798722044728,
"grad_norm": 0.5625,
"learning_rate": 7.309839665193839e-06,
"loss": 0.5784144401550293,
"step": 1210
},
{
"epoch": 2.5793397231096913,
"grad_norm": 0.2275390625,
"learning_rate": 7.284375885509741e-06,
"loss": 0.6299670338630676,
"step": 1212
},
{
"epoch": 2.5835995740149094,
"grad_norm": 0.322265625,
"learning_rate": 7.259148053258603e-06,
"loss": 0.674586296081543,
"step": 1214
},
{
"epoch": 2.587859424920128,
"grad_norm": 0.3671875,
"learning_rate": 7.234156723932312e-06,
"loss": 0.6188330054283142,
"step": 1216
},
{
"epoch": 2.592119275825346,
"grad_norm": 0.271484375,
"learning_rate": 7.20940244781519e-06,
"loss": 0.6208375096321106,
"step": 1218
},
{
"epoch": 2.5963791267305645,
"grad_norm": 0.388671875,
"learning_rate": 7.184885769971888e-06,
"loss": 0.6017476916313171,
"step": 1220
},
{
"epoch": 2.600638977635783,
"grad_norm": 0.333984375,
"learning_rate": 7.160607230235378e-06,
"loss": 0.6354559659957886,
"step": 1222
},
{
"epoch": 2.604898828541001,
"grad_norm": 0.31640625,
"learning_rate": 7.136567363195069e-06,
"loss": 0.6745753884315491,
"step": 1224
},
{
"epoch": 2.609158679446219,
"grad_norm": 0.427734375,
"learning_rate": 7.112766698185027e-06,
"loss": 0.5988171100616455,
"step": 1226
},
{
"epoch": 2.6134185303514377,
"grad_norm": 0.25390625,
"learning_rate": 7.089205759272327e-06,
"loss": 0.6004793643951416,
"step": 1228
},
{
"epoch": 2.617678381256656,
"grad_norm": 0.31640625,
"learning_rate": 7.06588506524552e-06,
"loss": 0.5850980877876282,
"step": 1230
},
{
"epoch": 2.6219382321618743,
"grad_norm": 0.33203125,
"learning_rate": 7.042805129603193e-06,
"loss": 0.5615159869194031,
"step": 1232
},
{
"epoch": 2.626198083067093,
"grad_norm": 0.333984375,
"learning_rate": 7.019966460542681e-06,
"loss": 0.6120025515556335,
"step": 1234
},
{
"epoch": 2.630457933972311,
"grad_norm": 0.35546875,
"learning_rate": 6.997369560948859e-06,
"loss": 0.6796953082084656,
"step": 1236
},
{
"epoch": 2.6347177848775294,
"grad_norm": 0.306640625,
"learning_rate": 6.975014928383083e-06,
"loss": 0.5794081091880798,
"step": 1238
},
{
"epoch": 2.6389776357827475,
"grad_norm": 0.28515625,
"learning_rate": 6.952903055072226e-06,
"loss": 0.5920906066894531,
"step": 1240
},
{
"epoch": 2.643237486687966,
"grad_norm": 0.2412109375,
"learning_rate": 6.9310344278978505e-06,
"loss": 0.5745714902877808,
"step": 1242
},
{
"epoch": 2.647497337593184,
"grad_norm": 0.2578125,
"learning_rate": 6.909409528385466e-06,
"loss": 0.5876143574714661,
"step": 1244
},
{
"epoch": 2.6517571884984026,
"grad_norm": 0.2734375,
"learning_rate": 6.888028832693953e-06,
"loss": 0.586786150932312,
"step": 1246
},
{
"epoch": 2.656017039403621,
"grad_norm": 0.380859375,
"learning_rate": 6.86689281160506e-06,
"loss": 0.5594542622566223,
"step": 1248
},
{
"epoch": 2.660276890308839,
"grad_norm": 0.23046875,
"learning_rate": 6.846001930513041e-06,
"loss": 0.6434107422828674,
"step": 1250
},
{
"epoch": 2.6645367412140573,
"grad_norm": 0.361328125,
"learning_rate": 6.825356649414415e-06,
"loss": 0.6385661959648132,
"step": 1252
},
{
"epoch": 2.668796592119276,
"grad_norm": 0.291015625,
"learning_rate": 6.80495742289783e-06,
"loss": 0.6039466261863708,
"step": 1254
},
{
"epoch": 2.6730564430244943,
"grad_norm": 0.298828125,
"learning_rate": 6.784804700134056e-06,
"loss": 0.6025973558425903,
"step": 1256
},
{
"epoch": 2.6773162939297124,
"grad_norm": 0.296875,
"learning_rate": 6.764898924866091e-06,
"loss": 0.6119323372840881,
"step": 1258
},
{
"epoch": 2.681576144834931,
"grad_norm": 0.32421875,
"learning_rate": 6.7452405353993985e-06,
"loss": 0.617369532585144,
"step": 1260
},
{
"epoch": 2.685835995740149,
"grad_norm": 0.330078125,
"learning_rate": 6.72582996459225e-06,
"loss": 0.6640692949295044,
"step": 1262
},
{
"epoch": 2.6900958466453675,
"grad_norm": 0.3359375,
"learning_rate": 6.706667639846196e-06,
"loss": 0.6609706282615662,
"step": 1264
},
{
"epoch": 2.6943556975505856,
"grad_norm": 0.2490234375,
"learning_rate": 6.687753983096654e-06,
"loss": 0.53211909532547,
"step": 1266
},
{
"epoch": 2.698615548455804,
"grad_norm": 0.455078125,
"learning_rate": 6.669089410803617e-06,
"loss": 0.667971134185791,
"step": 1268
},
{
"epoch": 2.702875399361022,
"grad_norm": 0.265625,
"learning_rate": 6.650674333942487e-06,
"loss": 0.5798393487930298,
"step": 1270
},
{
"epoch": 2.7071352502662407,
"grad_norm": 0.439453125,
"learning_rate": 6.632509157995023e-06,
"loss": 0.6258153915405273,
"step": 1272
},
{
"epoch": 2.711395101171459,
"grad_norm": 0.392578125,
"learning_rate": 6.614594282940414e-06,
"loss": 0.624832272529602,
"step": 1274
},
{
"epoch": 2.7156549520766773,
"grad_norm": 0.349609375,
"learning_rate": 6.596930103246468e-06,
"loss": 0.5772223472595215,
"step": 1276
},
{
"epoch": 2.7199148029818954,
"grad_norm": 0.61328125,
"learning_rate": 6.579517007860933e-06,
"loss": 0.5936267971992493,
"step": 1278
},
{
"epoch": 2.724174653887114,
"grad_norm": 0.2197265625,
"learning_rate": 6.562355380202927e-06,
"loss": 0.668041467666626,
"step": 1280
},
{
"epoch": 2.7284345047923324,
"grad_norm": 0.25,
"learning_rate": 6.5454455981545e-06,
"loss": 0.5487772226333618,
"step": 1282
},
{
"epoch": 2.7326943556975505,
"grad_norm": 0.205078125,
"learning_rate": 6.528788034052311e-06,
"loss": 0.6349499225616455,
"step": 1284
},
{
"epoch": 2.736954206602769,
"grad_norm": 0.23828125,
"learning_rate": 6.512383054679422e-06,
"loss": 0.5938593149185181,
"step": 1286
},
{
"epoch": 2.741214057507987,
"grad_norm": 0.263671875,
"learning_rate": 6.496231021257242e-06,
"loss": 0.6245843172073364,
"step": 1288
},
{
"epoch": 2.7454739084132056,
"grad_norm": 0.357421875,
"learning_rate": 6.480332289437552e-06,
"loss": 0.5823163390159607,
"step": 1290
},
{
"epoch": 2.749733759318424,
"grad_norm": 0.3359375,
"learning_rate": 6.464687209294682e-06,
"loss": 0.5846402049064636,
"step": 1292
},
{
"epoch": 2.753993610223642,
"grad_norm": 0.412109375,
"learning_rate": 6.44929612531781e-06,
"loss": 0.6277037262916565,
"step": 1294
},
{
"epoch": 2.7582534611288603,
"grad_norm": 0.20703125,
"learning_rate": 6.434159376403363e-06,
"loss": 0.6208704113960266,
"step": 1296
},
{
"epoch": 2.762513312034079,
"grad_norm": 0.32421875,
"learning_rate": 6.419277295847563e-06,
"loss": 0.5632691979408264,
"step": 1298
},
{
"epoch": 2.7667731629392973,
"grad_norm": 0.296875,
"learning_rate": 6.404650211339093e-06,
"loss": 0.6156328320503235,
"step": 1300
},
{
"epoch": 2.7710330138445154,
"grad_norm": 0.259765625,
"learning_rate": 6.390278444951868e-06,
"loss": 0.6689990758895874,
"step": 1302
},
{
"epoch": 2.775292864749734,
"grad_norm": 0.25,
"learning_rate": 6.376162313137955e-06,
"loss": 0.6374217867851257,
"step": 1304
},
{
"epoch": 2.779552715654952,
"grad_norm": 0.439453125,
"learning_rate": 6.3623021267205975e-06,
"loss": 0.6087695360183716,
"step": 1306
},
{
"epoch": 2.7838125665601705,
"grad_norm": 0.2734375,
"learning_rate": 6.348698190887377e-06,
"loss": 0.5766043066978455,
"step": 1308
},
{
"epoch": 2.7880724174653886,
"grad_norm": 0.359375,
"learning_rate": 6.3353508051834924e-06,
"loss": 0.6857935786247253,
"step": 1310
},
{
"epoch": 2.792332268370607,
"grad_norm": 0.259765625,
"learning_rate": 6.322260263505159e-06,
"loss": 0.6080771684646606,
"step": 1312
},
{
"epoch": 2.796592119275825,
"grad_norm": 0.232421875,
"learning_rate": 6.309426854093147e-06,
"loss": 0.5428948402404785,
"step": 1314
},
{
"epoch": 2.8008519701810437,
"grad_norm": 0.609375,
"learning_rate": 6.2968508595264195e-06,
"loss": 0.6500948667526245,
"step": 1316
},
{
"epoch": 2.8051118210862622,
"grad_norm": 0.306640625,
"learning_rate": 6.284532556715927e-06,
"loss": 0.6038864850997925,
"step": 1318
},
{
"epoch": 2.8093716719914803,
"grad_norm": 0.404296875,
"learning_rate": 6.272472216898501e-06,
"loss": 0.6369448304176331,
"step": 1320
},
{
"epoch": 2.8136315228966984,
"grad_norm": 0.32421875,
"learning_rate": 6.260670105630885e-06,
"loss": 0.6288717985153198,
"step": 1322
},
{
"epoch": 2.817891373801917,
"grad_norm": 0.3984375,
"learning_rate": 6.2491264827838775e-06,
"loss": 0.6535931825637817,
"step": 1324
},
{
"epoch": 2.8221512247071354,
"grad_norm": 0.28515625,
"learning_rate": 6.237841602536627e-06,
"loss": 0.6414341330528259,
"step": 1326
},
{
"epoch": 2.8264110756123535,
"grad_norm": 0.2578125,
"learning_rate": 6.226815713371023e-06,
"loss": 0.5740489959716797,
"step": 1328
},
{
"epoch": 2.830670926517572,
"grad_norm": 0.2890625,
"learning_rate": 6.216049058066229e-06,
"loss": 0.5453130602836609,
"step": 1330
},
{
"epoch": 2.83493077742279,
"grad_norm": 0.228515625,
"learning_rate": 6.205541873693331e-06,
"loss": 0.531428873538971,
"step": 1332
},
{
"epoch": 2.8391906283280086,
"grad_norm": 0.3046875,
"learning_rate": 6.195294391610128e-06,
"loss": 0.6185562014579773,
"step": 1334
},
{
"epoch": 2.8434504792332267,
"grad_norm": 0.3671875,
"learning_rate": 6.185306837456027e-06,
"loss": 0.6069992184638977,
"step": 1336
},
{
"epoch": 2.847710330138445,
"grad_norm": 0.26953125,
"learning_rate": 6.1755794311470824e-06,
"loss": 0.5699736475944519,
"step": 1338
},
{
"epoch": 2.8519701810436633,
"grad_norm": 0.875,
"learning_rate": 6.166112386871149e-06,
"loss": 0.5937331318855286,
"step": 1340
},
{
"epoch": 2.856230031948882,
"grad_norm": 0.30078125,
"learning_rate": 6.15690591308317e-06,
"loss": 0.5383535623550415,
"step": 1342
},
{
"epoch": 2.8604898828541003,
"grad_norm": 0.2119140625,
"learning_rate": 6.14796021250058e-06,
"loss": 0.5439456701278687,
"step": 1344
},
{
"epoch": 2.8647497337593184,
"grad_norm": 0.2578125,
"learning_rate": 6.139275482098847e-06,
"loss": 0.5950272083282471,
"step": 1346
},
{
"epoch": 2.8690095846645365,
"grad_norm": 0.255859375,
"learning_rate": 6.130851913107137e-06,
"loss": 0.5372447967529297,
"step": 1348
},
{
"epoch": 2.873269435569755,
"grad_norm": 0.2294921875,
"learning_rate": 6.122689691004103e-06,
"loss": 0.5755343437194824,
"step": 1350
},
{
"epoch": 2.8775292864749735,
"grad_norm": 0.28515625,
"learning_rate": 6.114788995513787e-06,
"loss": 0.6370142102241516,
"step": 1352
},
{
"epoch": 2.8817891373801916,
"grad_norm": 0.28125,
"learning_rate": 6.107150000601684e-06,
"loss": 0.5815765261650085,
"step": 1354
},
{
"epoch": 2.88604898828541,
"grad_norm": 0.2490234375,
"learning_rate": 6.099772874470899e-06,
"loss": 0.6185727715492249,
"step": 1356
},
{
"epoch": 2.890308839190628,
"grad_norm": 0.36328125,
"learning_rate": 6.092657779558442e-06,
"loss": 0.5713162422180176,
"step": 1358
},
{
"epoch": 2.8945686900958467,
"grad_norm": 0.283203125,
"learning_rate": 6.08580487253166e-06,
"loss": 0.6169173121452332,
"step": 1360
},
{
"epoch": 2.8988285410010652,
"grad_norm": 0.34375,
"learning_rate": 6.079214304284781e-06,
"loss": 0.5929686427116394,
"step": 1362
},
{
"epoch": 2.9030883919062833,
"grad_norm": 0.294921875,
"learning_rate": 6.072886219935593e-06,
"loss": 0.5761704444885254,
"step": 1364
},
{
"epoch": 2.9073482428115014,
"grad_norm": 0.25390625,
"learning_rate": 6.066820758822244e-06,
"loss": 0.5787940621376038,
"step": 1366
},
{
"epoch": 2.91160809371672,
"grad_norm": 0.26171875,
"learning_rate": 6.0610180545001845e-06,
"loss": 0.5501613020896912,
"step": 1368
},
{
"epoch": 2.9158679446219384,
"grad_norm": 0.1943359375,
"learning_rate": 6.055478234739217e-06,
"loss": 0.5612152218818665,
"step": 1370
},
{
"epoch": 2.9201277955271565,
"grad_norm": 0.255859375,
"learning_rate": 6.050201421520689e-06,
"loss": 0.6078463792800903,
"step": 1372
},
{
"epoch": 2.924387646432375,
"grad_norm": 0.1953125,
"learning_rate": 6.045187731034801e-06,
"loss": 0.5890936255455017,
"step": 1374
},
{
"epoch": 2.928647497337593,
"grad_norm": 0.302734375,
"learning_rate": 6.040437273678055e-06,
"loss": 0.6533024311065674,
"step": 1376
},
{
"epoch": 2.9329073482428116,
"grad_norm": 4.3125,
"learning_rate": 6.0359501540508174e-06,
"loss": 0.6827770471572876,
"step": 1378
},
{
"epoch": 2.9371671991480297,
"grad_norm": 0.3359375,
"learning_rate": 6.0317264709550185e-06,
"loss": 0.6418617963790894,
"step": 1380
},
{
"epoch": 2.9414270500532482,
"grad_norm": 0.2431640625,
"learning_rate": 6.02776631739198e-06,
"loss": 0.5774567127227783,
"step": 1382
},
{
"epoch": 2.9456869009584663,
"grad_norm": 0.38671875,
"learning_rate": 6.0240697805603594e-06,
"loss": 0.6014460921287537,
"step": 1384
},
{
"epoch": 2.949946751863685,
"grad_norm": 0.51171875,
"learning_rate": 6.020636941854242e-06,
"loss": 0.5642235279083252,
"step": 1386
},
{
"epoch": 2.9542066027689033,
"grad_norm": 0.333984375,
"learning_rate": 6.017467876861333e-06,
"loss": 0.5891353487968445,
"step": 1388
},
{
"epoch": 2.9584664536741214,
"grad_norm": 0.244140625,
"learning_rate": 6.014562655361307e-06,
"loss": 0.5744375586509705,
"step": 1390
},
{
"epoch": 2.9627263045793395,
"grad_norm": 0.255859375,
"learning_rate": 6.011921341324265e-06,
"loss": 0.5458447933197021,
"step": 1392
},
{
"epoch": 2.966986155484558,
"grad_norm": 0.23828125,
"learning_rate": 6.009543992909327e-06,
"loss": 0.6621728539466858,
"step": 1394
},
{
"epoch": 2.9712460063897765,
"grad_norm": 0.236328125,
"learning_rate": 6.007430662463352e-06,
"loss": 0.5778822898864746,
"step": 1396
},
{
"epoch": 2.9755058572949946,
"grad_norm": 0.5390625,
"learning_rate": 6.005581396519782e-06,
"loss": 0.5913535952568054,
"step": 1398
},
{
"epoch": 2.979765708200213,
"grad_norm": 1.1328125,
"learning_rate": 6.0039962357976234e-06,
"loss": 0.5911454558372498,
"step": 1400
},
{
"epoch": 2.984025559105431,
"grad_norm": 0.2197265625,
"learning_rate": 6.002675215200546e-06,
"loss": 0.5291861295700073,
"step": 1402
},
{
"epoch": 2.9882854100106497,
"grad_norm": 0.29296875,
"learning_rate": 6.001618363816112e-06,
"loss": 0.577559232711792,
"step": 1404
},
{
"epoch": 2.992545260915868,
"grad_norm": 0.28125,
"learning_rate": 6.000825704915147e-06,
"loss": 0.5995616912841797,
"step": 1406
},
{
"epoch": 2.9968051118210863,
"grad_norm": 0.251953125,
"learning_rate": 6.000297255951213e-06,
"loss": 0.5123644471168518,
"step": 1408
},
{
"epoch": 3.0,
"grad_norm": 0.2890625,
"learning_rate": 6.000033028560234e-06,
"loss": 0.584560215473175,
"step": 1410
},
{
"epoch": 3.0,
"step": 1410,
"total_flos": 3.6491913262740275e+18,
"train_loss": 0.9255026633857836,
"train_runtime": 18814.6678,
"train_samples_per_second": 2.396,
"train_steps_per_second": 0.075
}
],
"logging_steps": 2,
"max_steps": 1410,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6491913262740275e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}